Refactor character-functions with Herodotus - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 65785f699be45dd77bdcbfc1d3aded39151f3205
 (DIR) parent b13acfd6cd5114fcddbffaf9855664a95f966403
 (HTM) Author: Laslo Hunhold <dev@frign.de>
       Date:   Sat, 24 Sep 2022 11:45:20 +0200
       
       Refactor character-functions with Herodotus
       
       This also unifies the code-point and UTF-8 variants behind a shared
       static helper and drops a lot of complicated state handling.
       
       Signed-off-by: Laslo Hunhold <dev@frign.de>
       
       Diffstat:
         M src/character.c                     |      60 ++++++++++---------------------
         M src/util.c                          |       6 +++++-
       
       2 files changed, 24 insertions(+), 42 deletions(-)
       ---
 (DIR) diff --git a/src/character.c b/src/character.c
       @@ -175,61 +175,39 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA
                return !notbreak;
        }
        
       -size_t
       -grapheme_next_character_break(const uint_least32_t *str, size_t len)
       +static size_t
       +next_character_break(HERODOTUS_READER *r)
        {
                GRAPHEME_STATE state = { 0 };
       -        size_t off;
       -
       -        if (str == NULL || len == 0) {
       -                return 0;
       -        }
       +        uint_least32_t cp0 = 0, cp1 = 0;
        
       -        for (off = 1; off < len; off++) {
       -                if (grapheme_is_character_break(str[off - 1], str[off], &state)) {
       +        for (herodotus_read_codepoint(r, true, &cp0);
       +             herodotus_read_codepoint(r, false, &cp1) == HERODOTUS_STATUS_SUCCESS;
       +             herodotus_read_codepoint(r, true, &cp0)) {
       +                if (grapheme_is_character_break(cp0, cp1, &state)) {
                                break;
                        }
                }
        
       -        return off;
       +        return herodotus_reader_number_read(r);
        }
        
        size_t
       -grapheme_next_character_break_utf8(const char *str, size_t len)
       +grapheme_next_character_break(const uint_least32_t *str, size_t len)
        {
       -        GRAPHEME_STATE state = { 0 };
       -        uint_least32_t cp0 = 0, cp1 = 0;
       -        size_t off, ret;
       -
       -        if (str == NULL || len == 0) {
       -                return 0;
       -        }
       +        HERODOTUS_READER r;
        
       -        for (off = 0; (len == SIZE_MAX) || off < len; off += ret) {
       -                cp0 = cp1;
       -                ret = grapheme_decode_utf8(str + off, (len == SIZE_MAX) ?
       -                                           SIZE_MAX : len - off, &cp1);
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
        
       -                if (len != SIZE_MAX && ret > (len - off)) {
       -                        /* string ended abruptly, simply accept cropping */
       -                        ret = len - off;
       -                }
       +        return next_character_break(&r);
       +}
        
       -                if (len == SIZE_MAX && cp1 == 0) {
       -                        /* we hit a NUL-byte and are done */
       -                        break;
       -                }
       +size_t
       +grapheme_next_character_break_utf8(const char *str, size_t len)
       +{
       +        HERODOTUS_READER r;
        
       -                if (off == 0) {
       -                        /*
       -                         * we skip the first round, as we need both
       -                         * cp0 and cp1 to be initialized
       -                         */
       -                        continue;
       -                } else if (grapheme_is_character_break(cp0, cp1, &state)) {
       -                        break;
       -                }
       -        }
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
        
       -        return off;
       +        return next_character_break(&r);
        }
 (DIR) diff --git a/src/util.c b/src/util.c
       @@ -111,7 +111,11 @@ herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32_t *cp)
                }
        
                if (r->type == HERODOTUS_TYPE_CODEPOINT) {
       -                *cp = ((const uint_least32_t *)(r->src))[r->off++];
       +                *cp = ((const uint_least32_t *)(r->src))[r->off];
       +
       +                if (advance) {
       +                        r->off++;
       +                }
                } else { /* r->type == HERODOTUS_TYPE_UTF8 */
                        ret = grapheme_decode_utf8((const char *)r->src + r->off,
                                                   MIN(r->srclen, r->soft_limit[0]) -