Refactor word-functions with Proper (using Herodotus in the background) - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 52b0e29e02068d6a8123042ef901f73e37b2f38f
 (DIR) parent b899fd685c50cbc61999296ce1e0a03a45e74f52
 (HTM) Author: Laslo Hunhold <dev@frign.de>
       Date:   Sun,  2 Oct 2022 21:17:03 +0200
       
       Refactor word-functions with Proper (using Herodotus in the background)
       
       As promised, moving the word-break functions onto Proper heavily
       simplifies the code and separates its concerns. Additionally, this
       fixes some known quirks in the handling of NUL-terminated strings.
       
       Signed-off-by: Laslo Hunhold <dev@frign.de>
       
       Diffstat:
         M src/word.c                          |     367 ++++++++++++-------------------
       
       1 file changed, 142 insertions(+), 225 deletions(-)
       ---
 (DIR) diff --git a/src/word.c b/src/word.c
       @@ -6,328 +6,237 @@
        #include "../grapheme.h"
        #include "util.h"
        
       -static inline enum word_break_property
       -get_break_prop(uint_least32_t cp)
       +struct word_break_state
       +{
       +        bool ri_even;
       +};
       +
       +static inline uint_least8_t
       +get_word_break_prop(uint_least32_t cp)
        {
                if (likely(cp <= 0x10FFFF)) {
       -                return (enum word_break_property)
       +                return (uint_least8_t)
                               word_break_minor[word_break_major[cp >> 8] + (cp & 0xff)];
                } else {
                        return WORD_BREAK_PROP_OTHER;
                }
        }
        
       -static size_t
       -next_word_break(const void *str, size_t len, size_t (*get_codepoint)
       -                (const void *, size_t, size_t, uint_least32_t *))
       +static bool
       +is_skippable_word_prop(uint_least8_t prop)
        {
       -        struct {
       -                enum word_break_property a, b, c, d;
       -        } raw, skip;
       -        enum word_break_property res;
       -        uint_least32_t cp;
       -        size_t off, tmp, new_off;
       -        bool ri_even = true;
       -
       -        /* check degenerate cases */
       -        if (str == NULL || len == 0) {
       -                return 0;
       -        }
       -
       -        /*
       -         * Apply word breaking algorithm (UAX #29), see
       -         * https://unicode.org/reports/tr29/#Word_Boundary_Rules
       -         *
       -         * There are 4 slots (a, b, c, d) of "break" properties and
       -         * we check if there is a break in the middle between b and c.
       -         *
       -         * The position of this middle spot is determined by off,
       -         * which gives the offset of the first element on the right
       -         * hand side of said spot, or, in other words, gives the number
       -         * of elements on the left hand side.
       -         *
       -         * It is further complicated by the fact that the algorithm
       -         * expects you to skip certain characters for the second
       -         * half of the rules (after WB4). Thus, we do not only have
       -         * the "raw" properties as described above, but also the "skip"
       -         * properties, where the skip.a and skip.b, for instance,
       -         * give the two preceding character properties behind the
       -         * currently investigated breakpoint.
       -         *
       -         */
       +        return prop == WORD_BREAK_PROP_EXTEND ||
       +               prop == WORD_BREAK_PROP_FORMAT ||
       +               prop == WORD_BREAK_PROP_ZWJ;
       +}
        
       -        /*
       -         * Initialize the different properties such that we have
       -         * a good state after the state-update in the loop
       -         */
       -        raw.b = NUM_WORD_BREAK_PROPS;
       -        if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
       -                /*
       -                 * A line is at least one codepoint long, so we can
       -                 * safely return here
       -                 */
       -                return len;
       -        }
       -        raw.c = get_break_prop(cp);
       -        (void)get_codepoint(str, len, off, &cp);
       -        raw.d = get_break_prop(cp);
       -        skip.a = skip.b = NUM_WORD_BREAK_PROPS;
       +static void
       +word_skip_shift_callback(uint_least8_t prop, void *s)
       +{
       +        struct word_break_state *state = (struct word_break_state *)s;
        
       -        for (; off < len; off = new_off) {
       +        if (prop == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
                        /*
       -                 * Update left side (a and b) of the skip state by
       -                 * "shifting in" the raw.c property as long as it is
       -                 * not one of the "ignored" character properties.
       -                 * While at it, update the RI-counter.
       +                 * The property we just shifted in is
       +                 * a regional indicator, increasing the
       +                 * number of consecutive RIs on the left
       +                 * side of the breakpoint by one, changing
       +                 * the oddness.
                         *
                         */
       -                if (raw.c != WORD_BREAK_PROP_EXTEND &&
       -                    raw.c != WORD_BREAK_PROP_FORMAT &&
       -                    raw.c != WORD_BREAK_PROP_ZWJ) {
       -                            skip.a = skip.b;
       -                        skip.b = raw.c;
       -
       -                        if (skip.b == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
       -                                /*
       -                                 * The property we just shifted in is
       -                                 * a regional indicator, increasing the
       -                                 * number of consecutive RIs on the left
       -                                 * side of the breakpoint by one, changing
       -                                 * the oddness.
       -                                 *
       -                                 */
       -                                ri_even = !ri_even;
       -                        } else {
       -                                /*
       -                                 * We saw no regional indicator, so the
       -                                 * number of consecutive RIs on the left
       -                                 * side of the breakpoint is zero, which
       -                                 * is an even number.
       -                                 *
       -                                 */
       -                                ri_even = true;
       -                        }
       -                }
       -
       +                state->ri_even = !(state->ri_even);
       +        } else {
                        /*
       -                 * Update right side (b and c) of the skip state by
       -                 * starting at the breakpoint and detecting the two
       -                 * following non-ignored character classes
       +                 * We saw no regional indicator, so the
       +                 * number of consecutive RIs on the left
       +                 * side of the breakpoint is zero, which
       +                 * is an even number.
                         *
                         */
       -                skip.c = NUM_WORD_BREAK_PROPS;
       -                for (tmp = off; tmp < len; ) {
       -                        tmp += get_codepoint(str, len, tmp, &cp);
       -                        res = get_break_prop(cp);
       -
       -                        if (res != WORD_BREAK_PROP_EXTEND &&
       -                            res != WORD_BREAK_PROP_FORMAT &&
       -                            res != WORD_BREAK_PROP_ZWJ) {
       -                                skip.c = res;
       -                                break;
       -                        }
       -                }
       -                skip.d = NUM_WORD_BREAK_PROPS;
       -                for (; tmp < len; ) {
       -                        tmp += get_codepoint(str, len, tmp, &cp);
       -                        res = get_break_prop(cp);
       -
       -                        if (res != WORD_BREAK_PROP_EXTEND &&
       -                            res != WORD_BREAK_PROP_FORMAT &&
       -                            res != WORD_BREAK_PROP_ZWJ) {
       -                                skip.d = res;
       -                                break;
       -                        }
       -                }
       +                state->ri_even = true;
       +        }
       +}
        
       -                /*
       -                 * Update the raw state by simply shifting everything
       -                 * in and, if we still have data left, determining
       -                 * the character class of the next codepoint.
       -                 *
       -                 */
       -                raw.a = raw.b;
       -                raw.b = raw.c;
       -                raw.c = raw.d;
       -                if ((new_off = off + get_codepoint(str, len, off, &cp)) < len) {
       -                        get_codepoint(str, len, new_off, &cp);
       -                        raw.d = get_break_prop(cp);
       -                } else {
       -                        raw.d = NUM_WORD_BREAK_PROPS;
       -                }
       +static size_t
       +next_word_break(HERODOTUS_READER *r)
       +{
       +        struct proper p;
       +        struct word_break_state state = { .ri_even = true };
        
       +        /*
       +         * Apply word breaking algorithm (UAX #29), see
       +         * https://unicode.org/reports/tr29/#Word_Boundary_Rules
       +         */
       +        proper_init(r, &state, NUM_WORD_BREAK_PROPS, get_word_break_prop,
       +                    is_skippable_word_prop, word_skip_shift_callback, &p);
       +
       +        while (!proper_advance(&p)) {
                        /* WB3 */
       -                if (raw.b == WORD_BREAK_PROP_CR &&
       -                    raw.c == WORD_BREAK_PROP_LF) {
       +                if (p.raw.prev_prop[0] == WORD_BREAK_PROP_CR &&
       +                    p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
                                continue;
                        }
        
                        /* WB3a */
       -                if (raw.b == WORD_BREAK_PROP_NEWLINE ||
       -                    raw.b == WORD_BREAK_PROP_CR      ||
       -                    raw.b == WORD_BREAK_PROP_LF) {
       +                if (p.raw.prev_prop[0] == WORD_BREAK_PROP_NEWLINE ||
       +                    p.raw.prev_prop[0] == WORD_BREAK_PROP_CR      ||
       +                    p.raw.prev_prop[0] == WORD_BREAK_PROP_LF) {
                                break;
                        }
        
                        /* WB3b */
       -                if (raw.c == WORD_BREAK_PROP_NEWLINE ||
       -                    raw.c == WORD_BREAK_PROP_CR      ||
       -                    raw.c == WORD_BREAK_PROP_LF) {
       +                if (p.raw.next_prop[0] == WORD_BREAK_PROP_NEWLINE ||
       +                    p.raw.next_prop[0] == WORD_BREAK_PROP_CR      ||
       +                    p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
                                break;
                        }
        
                        /* WB3c */
       -                if (raw.b == WORD_BREAK_PROP_ZWJ &&
       -                    (raw.c == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
       -                     raw.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
       +                if (p.raw.prev_prop[0] == WORD_BREAK_PROP_ZWJ &&
       +                    (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
       +                     p.raw.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
                                continue;
                        }
        
                        /* WB3d */
       -                if (raw.b == WORD_BREAK_PROP_WSEGSPACE &&
       -                    raw.c == WORD_BREAK_PROP_WSEGSPACE) {
       +                if (p.raw.prev_prop[0] == WORD_BREAK_PROP_WSEGSPACE &&
       +                    p.raw.next_prop[0] == WORD_BREAK_PROP_WSEGSPACE) {
                                continue;
                        }
        
                        /* WB4 */
       -                if (raw.c == WORD_BREAK_PROP_EXTEND ||
       -                    raw.c == WORD_BREAK_PROP_FORMAT ||
       -                    raw.c == WORD_BREAK_PROP_ZWJ) {
       +                if (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTEND ||
       +                    p.raw.next_prop[0] == WORD_BREAK_PROP_FORMAT ||
       +                    p.raw.next_prop[0] == WORD_BREAK_PROP_ZWJ) {
                                continue;
                        }
        
                        /* WB5 */
       -                if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
       -                     skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       -                     skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
       -                    (skip.c == WORD_BREAK_PROP_ALETTER              ||
       -                     skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       -                     skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) {
       +                if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER              ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
       +                    (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER              ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
                                continue;
                        }
        
                        /* WB6 */
       -                if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
       -                     skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       -                     skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
       -                    (skip.c == WORD_BREAK_PROP_MIDLETTER    ||
       -                     skip.c == WORD_BREAK_PROP_MIDNUMLET    ||
       -                     skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) &&
       -                    len > 2 &&
       -                    (skip.d == WORD_BREAK_PROP_ALETTER              ||
       -                     skip.d == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       -                     skip.d == WORD_BREAK_PROP_HEBREW_LETTER)) {
       +                if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER              ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
       +                    (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDLETTER    ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET    ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
       +                    (p.skip.next_prop[1] == WORD_BREAK_PROP_ALETTER              ||
       +                     p.skip.next_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       +                     p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
                                continue;
                        }
        
                        /* WB7 */
       -                if ((skip.b == WORD_BREAK_PROP_MIDLETTER    ||
       -                     skip.b == WORD_BREAK_PROP_MIDNUMLET    ||
       -                     skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) &&
       -                    (skip.c == WORD_BREAK_PROP_ALETTER              ||
       -                     skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       -                     skip.c == WORD_BREAK_PROP_HEBREW_LETTER) &&
       -                    len > 2 &&
       -                    (skip.a == WORD_BREAK_PROP_ALETTER              ||
       -                     skip.a == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       -                     skip.a == WORD_BREAK_PROP_HEBREW_LETTER)) {
       +                if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDLETTER    ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET    ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
       +                    (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER              ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
       +                    (p.skip.prev_prop[1] == WORD_BREAK_PROP_ALETTER              ||
       +                     p.skip.prev_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       +                     p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
                                continue;
                        }
        
                        /* WB7a */
       -                if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER &&
       -                    skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) {
       +                if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
       +                    p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) {
                                continue;
                        }
        
                        /* WB7b */
       -                if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER &&
       -                    skip.c == WORD_BREAK_PROP_DOUBLE_QUOTE &&
       -                    len > 2 &&
       -                    skip.d == WORD_BREAK_PROP_HEBREW_LETTER) {
       +                if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
       +                    p.skip.next_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
       +                    p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
                                continue;
                        }
        
                        /* WB7c */
       -                if (skip.b == WORD_BREAK_PROP_DOUBLE_QUOTE &&
       -                    skip.c == WORD_BREAK_PROP_HEBREW_LETTER &&
       -                    off > 1 &&
       -                    skip.a == WORD_BREAK_PROP_HEBREW_LETTER) {
       +                if (p.skip.prev_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
       +                    p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
       +                    p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
                                continue;
                        }
        
                        /* WB8 */
       -                if (skip.b == WORD_BREAK_PROP_NUMERIC &&
       -                    skip.c == WORD_BREAK_PROP_NUMERIC) {
       +                if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
       +                    p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
                                continue;
                        }
        
                        /* WB9 */
       -                if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
       -                     skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       -                     skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
       -                    skip.c == WORD_BREAK_PROP_NUMERIC) {
       +                if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER              ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
       +                    p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
                                continue;
                        }
        
                        /* WB10 */
       -                if (skip.b == WORD_BREAK_PROP_NUMERIC &&
       -                    (skip.c == WORD_BREAK_PROP_ALETTER              ||
       -                     skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       -                     skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) {
       +                if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
       +                    (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER              ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
                                continue;
                        }
        
                        /* WB11 */
       -                if ((skip.b == WORD_BREAK_PROP_MIDNUM       ||
       -                     skip.b == WORD_BREAK_PROP_MIDNUMLET    ||
       -                     skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) &&
       -                    skip.c == WORD_BREAK_PROP_NUMERIC &&
       -                    off > 1 &&
       -                    skip.a == WORD_BREAK_PROP_NUMERIC) {
       +                if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUM       ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET    ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
       +                    p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC &&
       +                    p.skip.prev_prop[1] == WORD_BREAK_PROP_NUMERIC) {
                                continue;
                        }
        
                        /* WB12 */
       -                if (skip.b == WORD_BREAK_PROP_NUMERIC &&
       -                    (skip.c == WORD_BREAK_PROP_MIDNUM       ||
       -                     skip.c == WORD_BREAK_PROP_MIDNUMLET    ||
       -                     skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) &&
       -                    len > 2 &&
       -                    skip.d == WORD_BREAK_PROP_NUMERIC) {
       +                if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
       +                    (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUM       ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET    ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
       +                    p.skip.next_prop[1] == WORD_BREAK_PROP_NUMERIC) {
                                continue;
                        }
        
                        /* WB13 */
       -                if (skip.b == WORD_BREAK_PROP_KATAKANA &&
       -                    skip.c == WORD_BREAK_PROP_KATAKANA) {
       +                if (p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA &&
       +                    p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA) {
                                continue;
                        }
        
                        /* WB13a */
       -                if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
       -                     skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       -                     skip.b == WORD_BREAK_PROP_HEBREW_LETTER        ||
       -                     skip.b == WORD_BREAK_PROP_NUMERIC              ||
       -                     skip.b == WORD_BREAK_PROP_KATAKANA             ||
       -                     skip.b == WORD_BREAK_PROP_EXTENDNUMLET) &&
       -                    skip.c == WORD_BREAK_PROP_EXTENDNUMLET) {
       +                if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER              ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER        ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC              ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA             ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) &&
       +                    p.skip.next_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) {
                                continue;
                        }
        
                        /* WB13b */
       -                if (skip.b == WORD_BREAK_PROP_EXTENDNUMLET &&
       -                    (skip.c == WORD_BREAK_PROP_ALETTER              ||
       -                     skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       -                     skip.c == WORD_BREAK_PROP_HEBREW_LETTER        ||
       -                     skip.c == WORD_BREAK_PROP_NUMERIC              ||
       -                     skip.c == WORD_BREAK_PROP_KATAKANA)) {
       +                if (p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET &&
       +                    (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER              ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER        ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC              ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA)) {
                                continue;
                        }
        
                        /* WB15 and WB16 */
       -                if (!ri_even &&
       -                    skip.c == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
       +                if (!state.ri_even &&
       +                    p.skip.next_prop[0] == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
                                continue;
                        }
        
       @@ -335,17 +244,25 @@ next_word_break(const void *str, size_t len, size_t (*get_codepoint)
                        break;
                }
        
       -        return off;
       +        return herodotus_reader_number_read(&(p.mid_reader));
        }
        
        size_t
        grapheme_next_word_break(const uint_least32_t *str, size_t len)
        {
       -        return next_word_break(str, len, get_codepoint);
       +        HERODOTUS_READER r;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
       +
       +        return next_word_break(&r);
        }
        
        size_t
        grapheme_next_word_break_utf8(const char *str, size_t len)
        {
       -        return next_word_break(str, len, get_codepoint_utf8);
       +        HERODOTUS_READER r;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
       +
       +        return next_word_break(&r);
        }