Refactor sentence-functions with Proper (using Herodotus in the background) - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit a5b1b0c0c7bc1576b5893175b27585fa963f4433
 (DIR) parent 52b0e29e02068d6a8123042ef901f73e37b2f38f
 (HTM) Author: Laslo Hunhold <dev@frign.de>
       Date:   Sun,  2 Oct 2022 22:05:11 +0200
       
       Refactor sentence-functions with Proper (using Herodotus in the background)
       
       This refactor was a breeze and it passed all conformance tests on the
       first try. This, just like with the word-functions, leads to a massive
       simplification and separation of concerns in the code. And as with the
       word functions, this fixes some known quirks.
       
       Signed-off-by: Laslo Hunhold <dev@frign.de>
       
       Diffstat:
         M src/sentence.c                      |     426 +++++++++++++------------------
       
       1 file changed, 181 insertions(+), 245 deletions(-)
       ---
 (DIR) diff --git a/src/sentence.c b/src/sentence.c
       @@ -6,11 +6,17 @@
        #include "../grapheme.h"
        #include "util.h"
        
       -static inline enum sentence_break_property
       -get_break_prop(uint_least32_t cp)
       +struct sentence_break_state
       +{
       +        uint_least8_t aterm_close_sp_level;
       +        uint_least8_t saterm_close_sp_parasep_level;
       +};
       +
       +static inline uint_least8_t
       +get_sentence_break_prop(uint_least32_t cp)
        {
                if (likely(cp <= 0x10FFFF)) {
       -                return (enum sentence_break_property)
       +                return (uint_least8_t)
                               sentence_break_minor[sentence_break_major[cp >> 8] +
                               (cp & 0xff)];
                } else {
       @@ -18,243 +24,157 @@ get_break_prop(uint_least32_t cp)
                }
        }
        
       -static size_t
       -next_sentence_break(const void *str, size_t len, size_t (*get_codepoint)
       -                    (const void *, size_t, size_t, uint_least32_t *))
       +static bool
       +is_skippable_sentence_prop(uint_least8_t prop)
        {
       -        struct {
       -                enum sentence_break_property a, b, c, d;
       -        } raw, skip;
       -        enum sentence_break_property res;
       -        uint_least32_t cp;
       -        uint_least8_t aterm_close_sp_level = 0,
       -                      saterm_close_sp_parasep_level = 0;
       -        size_t off, tmp, new_off;
       +        return prop == SENTENCE_BREAK_PROP_EXTEND ||
       +               prop == SENTENCE_BREAK_PROP_FORMAT;
       +}
        
       -        /* check degenerate cases */
       -        if (str == NULL || len == 0) {
       -                return 0;
       -        }
       +static void
       +sentence_skip_shift_callback(uint_least8_t prop, void *s)
       +{
       +        struct sentence_break_state *state = (struct sentence_break_state *)s;
        
                /*
       -         * Apply sentence breaking algorithm (UAX #29), see
       -         * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
       +         * Here comes a bit of magic. The rules
       +         * SB8, SB8a, SB9 and SB10 have very complicated
       +         * left-hand-side-rules of the form
                 *
       -         * There are 4 slots (a, b, c, d) of "break" properties and
       -         * we check if there is a break in the middle between b and c.
       +         *  ATerm Close* Sp*
       +         *  SATerm Close*
       +         *  SATerm Close* Sp*
       +         *  SATerm Close* Sp* ParaSep?
                 *
       -         * The position of this middle spot is determined by off,
       -         * which gives the offset of the first element on the right
       -         * hand side of said spot, or, in other words, gives the number
       -         * of elements on the left hand side.
       +         * but instead of backtracking, we keep the
       +         * state as some kind of "power level" in
       +         * two state-variables
                 *
       -         * It is further complicated by the fact that the algorithm
       -         * expects you to skip certain characters for the second
       -         * half of the rules (after SB5). Thus, we do not only have
       -         * the "raw" properties as described above, but also the "skip"
       -         * properties, where the skip.a and skip.b, for instance,
       -         * give the two preceding character properties behind the
       -         * currently investigated breakpoint.
       +         *  aterm_close_sp_level
       +         *  saterm_close_sp_parasep_level
       +         *
       +         * that go from 0 to 3/4:
       +         *
       +         *  0: we are not in the sequence
       +         *  1: we have one ATerm/SATerm to the left of
       +         *     the middle spot
       +         *  2: we have one ATerm/SATerm and one or more
       +         *     Close to the left of the middle spot
       +         *  3: we have one ATerm/SATerm, zero or more
       +         *     Close and one or more Sp to the left of
       +         *     the middle spot.
       +         *  4: we have one SATerm, zero or more Close,
       +         *     zero or more Sp and one ParaSep to the
       +         *     left of the middle spot.
                 *
                 */
       -
       -        /*
       -         * Initialize the different properties such that we have
       -         * a good state after the state-update in the loop
       -         */
       -        raw.b = NUM_SENTENCE_BREAK_PROPS;
       -        if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
       -                /*
       -                 * A line is at least one codepoint long, so we can
       -                 * safely return here
       -                 */
       -                return len;
       +        if ((state->aterm_close_sp_level == 0 ||
       +             state->aterm_close_sp_level == 1) &&
       +            prop == SENTENCE_BREAK_PROP_ATERM) {
       +                /* sequence has begun */
       +                state->aterm_close_sp_level = 1;
       +        } else if ((state->aterm_close_sp_level == 1 ||
       +                    state->aterm_close_sp_level == 2) &&
       +                   prop == SENTENCE_BREAK_PROP_CLOSE) {
       +                /* close-sequence begins or continued */
       +                state->aterm_close_sp_level = 2;
       +        } else if ((state->aterm_close_sp_level == 1 ||
       +                    state->aterm_close_sp_level == 2 ||
       +                    state->aterm_close_sp_level == 3) &&
       +                   prop == SENTENCE_BREAK_PROP_SP) {
       +                /* sp-sequence begins or continued */
       +                state->aterm_close_sp_level = 3;
       +        } else {
       +                /* sequence broke */
       +                state->aterm_close_sp_level = 0;
                }
       -        raw.c = get_break_prop(cp);
       -        (void)get_codepoint(str, len, off, &cp);
       -        raw.d = get_break_prop(cp);
       -        skip.a = skip.b = NUM_SENTENCE_BREAK_PROPS;
       -
       -        for (; off < len; off = new_off) {
       -                /*
       -                 * Update left side (a and b) of the skip state by
       -                 * "shifting in" the raw.c property as long as it is
       -                 * not one of the "ignored" character properties.
       -                 * While at it, update the RI-counter.
       -                 *
       -                 */
       -                if (raw.c != SENTENCE_BREAK_PROP_EXTEND &&
       -                    raw.c != SENTENCE_BREAK_PROP_FORMAT) {
       -                            skip.a = skip.b;
       -                        skip.b = raw.c;
       -
       -                        /*
       -                         * Here comes a bit of magic. The rules
       -                         * SB8, SB8a, SB9 and SB10 have very complicated
       -                         * left-hand-side-rules of the form
       -                         *
       -                         *  ATerm Close* Sp*
       -                         *  SATerm Close*
       -                         *  SATerm Close* Sp*
       -                         *  SATerm Close* Sp* ParaSep?
       -                         * 
       -                         * but instead of backtracking, we keep the
       -                         * state as some kind of "power level" in
       -                         * two variables
       -                         *
       -                         *  aterm_close_sp_level
       -                         *  saterm_close_sp_parasep_level
       -                         * 
       -                         * that go from 0 to 3/4:
       -                         *
       -                         *  0: we are not in the sequence
       -                         *  1: we have one ATerm/SATerm to the left of
       -                         *     the middle spot
       -                         *  2: we have one ATerm/SATerm and one or more
       -                         *     Close to the left of the middle spot
       -                         *  3: we have one ATerm/SATerm, zero or more
       -                         *     Close and one or more Sp to the left of
       -                         *     the middle spot.
       -                         *  4: we have one SATerm, zero or more Close,
       -                         *     zero or more Sp and one ParaSep to the
       -                         *     left of the middle spot.
       -                         *
       -                         */
       -                        if ((aterm_close_sp_level == 0 ||
       -                             aterm_close_sp_level == 1) &&
       -                            skip.b == SENTENCE_BREAK_PROP_ATERM) {
       -                                    /* sequence has begun */
       -                                aterm_close_sp_level = 1;
       -                        } else if ((aterm_close_sp_level == 1 ||
       -                                    aterm_close_sp_level == 2) &&
       -                                   skip.b == SENTENCE_BREAK_PROP_CLOSE) {
       -                                /* close-sequence begins or continued */
       -                                aterm_close_sp_level = 2;
       -                        } else if ((aterm_close_sp_level == 1 ||
       -                                    aterm_close_sp_level == 2 ||
       -                                    aterm_close_sp_level == 3) &&
       -                                   skip.b == SENTENCE_BREAK_PROP_SP) {
       -                                /* sp-sequence begins or continued */
       -                                aterm_close_sp_level = 3;
       -                        } else {
       -                                /* sequence broke */
       -                                aterm_close_sp_level = 0;
       -                        }
        
       -                        if ((saterm_close_sp_parasep_level == 0 ||
       -                             saterm_close_sp_parasep_level == 1) &&
       -                            (skip.b == SENTENCE_BREAK_PROP_STERM ||
       -                             skip.b == SENTENCE_BREAK_PROP_ATERM)) {
       -                                    /* sequence has begun */
       -                                saterm_close_sp_parasep_level = 1;
       -                        } else if ((saterm_close_sp_parasep_level == 1 ||
       -                                    saterm_close_sp_parasep_level == 2) &&
       -                                   skip.b == SENTENCE_BREAK_PROP_CLOSE) {
       -                                /* close-sequence begins or continued */
       -                                saterm_close_sp_parasep_level = 2;
       -                        } else if ((saterm_close_sp_parasep_level == 1 ||
       -                                    saterm_close_sp_parasep_level == 2 ||
       -                                    saterm_close_sp_parasep_level == 3) &&
       -                                   skip.b == SENTENCE_BREAK_PROP_SP) {
       -                                /* sp-sequence begins or continued */
       -                                saterm_close_sp_parasep_level = 3;
       -                        } else if ((saterm_close_sp_parasep_level == 1 ||
       -                                    saterm_close_sp_parasep_level == 2 ||
       -                                    saterm_close_sp_parasep_level == 3) &&
       -                                   (skip.b == SENTENCE_BREAK_PROP_SEP ||
       -                                    skip.b == SENTENCE_BREAK_PROP_CR  ||
       -                                    skip.b == SENTENCE_BREAK_PROP_LF)) {
       -                                /* ParaSep at the end of the sequence */
       -                                saterm_close_sp_parasep_level = 4;
       -                        } else {
       -                                /* sequence broke */
       -                                saterm_close_sp_parasep_level = 0;
       -                        }
       -                }
       -
       -                /*
       -                 * Update right side (b and c) of the skip state by
       -                 * starting at the breakpoint and detecting the two
       -                 * following non-ignored character classes
       -                 *
       -                 */
       -                skip.c = NUM_SENTENCE_BREAK_PROPS;
       -                for (tmp = off; tmp < len; ) {
       -                        tmp += get_codepoint(str, len, tmp, &cp);
       -                        res = get_break_prop(cp);
       -
       -                        if (res != SENTENCE_BREAK_PROP_EXTEND &&
       -                            res != SENTENCE_BREAK_PROP_FORMAT) {
       -                                skip.c = res;
       -                                break;
       -                        }
       -                }
       -                skip.d = NUM_SENTENCE_BREAK_PROPS;
       -                for (; tmp < len; ) {
       -                        tmp += get_codepoint(str, len, tmp, &cp);
       -                        res = get_break_prop(cp);
       +        if ((state->saterm_close_sp_parasep_level == 0 ||
       +             state->saterm_close_sp_parasep_level == 1) &&
       +            (prop == SENTENCE_BREAK_PROP_STERM ||
       +             prop == SENTENCE_BREAK_PROP_ATERM)) {
       +                /* sequence has begun */
       +                state->saterm_close_sp_parasep_level = 1;
       +        } else if ((state->saterm_close_sp_parasep_level == 1 ||
       +                    state->saterm_close_sp_parasep_level == 2) &&
       +                   prop == SENTENCE_BREAK_PROP_CLOSE) {
       +                /* close-sequence begins or continued */
       +                state->saterm_close_sp_parasep_level = 2;
       +        } else if ((state->saterm_close_sp_parasep_level == 1 ||
       +                    state->saterm_close_sp_parasep_level == 2 ||
       +                    state->saterm_close_sp_parasep_level == 3) &&
       +                   prop == SENTENCE_BREAK_PROP_SP) {
       +                /* sp-sequence begins or continued */
       +                state->saterm_close_sp_parasep_level = 3;
       +        } else if ((state->saterm_close_sp_parasep_level == 1 ||
       +                    state->saterm_close_sp_parasep_level == 2 ||
       +                    state->saterm_close_sp_parasep_level == 3) &&
       +                   (prop == SENTENCE_BREAK_PROP_SEP ||
       +                    prop == SENTENCE_BREAK_PROP_CR  ||
       +                    prop == SENTENCE_BREAK_PROP_LF)) {
       +                /* ParaSep at the end of the sequence */
       +                state->saterm_close_sp_parasep_level = 4;
       +        } else {
       +                /* sequence broke */
       +                state->saterm_close_sp_parasep_level = 0;
       +        }
       +}
        
       -                        if (res != SENTENCE_BREAK_PROP_EXTEND &&
       -                            res != SENTENCE_BREAK_PROP_FORMAT) {
       -                                skip.d = res;
       -                                break;
       -                        }
       -                }
       +static size_t
       +next_sentence_break(HERODOTUS_READER *r)
       +{
       +        HERODOTUS_READER tmp;
       +        enum sentence_break_property prop;
       +        struct proper p;
       +        struct sentence_break_state state = { 0 };
       +        uint_least32_t cp;
        
       -                /*
       -                 * Update the raw state by simply shifting everything
       -                 * in and, if we still have data left, determining
       -                 * the character class of the next codepoint.
       -                 *
       -                 */
       -                raw.a = raw.b;
       -                raw.b = raw.c;
       -                raw.c = raw.d;
       -                if ((new_off = off + get_codepoint(str, len, off, &cp)) < len) {
       -                        get_codepoint(str, len, new_off, &cp);
       -                        raw.d = get_break_prop(cp);
       -                } else {
       -                        raw.d = NUM_SENTENCE_BREAK_PROPS;
       -                }
       +        /*
       +         * Apply sentence breaking algorithm (UAX #29), see
       +         * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
       +         */
       +        proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS,
       +                    get_sentence_break_prop, is_skippable_sentence_prop,
       +                    sentence_skip_shift_callback, &p);
        
       +        while (!proper_advance(&p)) {
                        /* SB3 */
       -                if (raw.b == SENTENCE_BREAK_PROP_CR &&
       -                    raw.c == SENTENCE_BREAK_PROP_LF) {
       +                if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR &&
       +                    p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) {
                                continue;
                        }
        
                        /* SB4 */
       -                if (raw.b == SENTENCE_BREAK_PROP_SEP ||
       -                    raw.b == SENTENCE_BREAK_PROP_CR  ||
       -                    raw.b == SENTENCE_BREAK_PROP_LF) {
       +                if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP ||
       +                    p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR  ||
       +                    p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) {
                                break;
                        }
        
                        /* SB5 */
       -                if (raw.c == SENTENCE_BREAK_PROP_EXTEND ||
       -                    raw.c == SENTENCE_BREAK_PROP_FORMAT) {
       +                if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND ||
       +                    p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) {
                                continue;
                        }
        
                        /* SB6 */
       -                if (skip.b == SENTENCE_BREAK_PROP_ATERM &&
       -                    skip.c == SENTENCE_BREAK_PROP_NUMERIC) {
       +                if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
       +                    p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) {
                                continue;
                        }
        
                        /* SB7 */
       -                if (off > 1 &&
       -                    (skip.a == SENTENCE_BREAK_PROP_UPPER ||
       -                     skip.a == SENTENCE_BREAK_PROP_LOWER) &&
       -                    skip.b == SENTENCE_BREAK_PROP_ATERM &&
       -                    skip.c == SENTENCE_BREAK_PROP_UPPER) {
       +                if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER ||
       +                     p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) &&
       +                    p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
       +                    p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) {
                                continue;
                        }
        
                        /* SB8 */
       -                if (aterm_close_sp_level == 1 ||
       -                    aterm_close_sp_level == 2 ||
       -                    aterm_close_sp_level == 3) {
       +                if (state.aterm_close_sp_level == 1 ||
       +                    state.aterm_close_sp_level == 2 ||
       +                    state.aterm_close_sp_level == 3) {
                                /*
                                 * This is the most complicated rule, requiring
                                 * the right-hand-side to satisfy the regular expression
       @@ -262,67 +182,75 @@ next_sentence_break(const void *str, size_t len, size_t (*get_codepoint)
                                 *  ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
                                 *
                                 * which we simply check "manually" given LUT-lookups
       -                         * are very cheap.
       +                         * are very cheap by starting at the mid_reader.
                                 *
                                 */
       -                        for (tmp = off, res = NUM_SENTENCE_BREAK_PROPS; tmp < len; ) {
       -                                tmp += get_codepoint(str, len, tmp, &cp);
       -                                res = get_break_prop(cp);
       +                        herodotus_reader_copy(&(p.mid_reader), &tmp);
       +
       +                        prop = NUM_SENTENCE_BREAK_PROPS;
       +                        while (herodotus_read_codepoint(&tmp, true, &cp) ==
       +                               HERODOTUS_STATUS_SUCCESS) {
       +                                prop = get_sentence_break_prop(cp);
        
       -                                if (res == SENTENCE_BREAK_PROP_OLETTER ||
       -                                    res == SENTENCE_BREAK_PROP_UPPER   ||
       -                                    res == SENTENCE_BREAK_PROP_LOWER   ||
       -                                    res == SENTENCE_BREAK_PROP_SEP     ||
       -                                    res == SENTENCE_BREAK_PROP_CR      ||
       -                                    res == SENTENCE_BREAK_PROP_LF      ||
       -                                    res == SENTENCE_BREAK_PROP_STERM   ||
       -                                    res == SENTENCE_BREAK_PROP_ATERM) {
       +                                /*
       +                                 * the skippable properties are ignored
       +                                 * automatically here given they do not
       +                                 * match the following condition
       +                                 */
       +                                if (prop == SENTENCE_BREAK_PROP_OLETTER ||
       +                                    prop == SENTENCE_BREAK_PROP_UPPER   ||
       +                                    prop == SENTENCE_BREAK_PROP_LOWER   ||
       +                                    prop == SENTENCE_BREAK_PROP_SEP     ||
       +                                    prop == SENTENCE_BREAK_PROP_CR      ||
       +                                    prop == SENTENCE_BREAK_PROP_LF      ||
       +                                    prop == SENTENCE_BREAK_PROP_STERM   ||
       +                                    prop == SENTENCE_BREAK_PROP_ATERM) {
                                                break;
                                        }
                                }
        
       -                        if (res == SENTENCE_BREAK_PROP_LOWER) {
       +                        if (prop == SENTENCE_BREAK_PROP_LOWER) {
                                        continue;
                                }
                        }
        
                        /* SB8a */
       -                if ((saterm_close_sp_parasep_level == 1 ||
       -                     saterm_close_sp_parasep_level == 2 ||
       -                     saterm_close_sp_parasep_level == 3) &&
       -                    (skip.c == SENTENCE_BREAK_PROP_SCONTINUE ||
       -                     skip.c == SENTENCE_BREAK_PROP_STERM     ||
       -                     skip.c == SENTENCE_BREAK_PROP_ATERM)) {
       +                if ((state.saterm_close_sp_parasep_level == 1 ||
       +                     state.saterm_close_sp_parasep_level == 2 ||
       +                     state.saterm_close_sp_parasep_level == 3) &&
       +                    (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE ||
       +                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM     ||
       +                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) {
                                continue;
                        }
        
                        /* SB9 */
       -                if ((saterm_close_sp_parasep_level == 1 ||
       -                     saterm_close_sp_parasep_level == 2) &&
       -                    (skip.c == SENTENCE_BREAK_PROP_CLOSE ||
       -                     skip.c == SENTENCE_BREAK_PROP_SP    ||
       -                     skip.c == SENTENCE_BREAK_PROP_SEP   ||
       -                     skip.c == SENTENCE_BREAK_PROP_CR    ||
       -                     skip.c == SENTENCE_BREAK_PROP_LF)) {
       +                if ((state.saterm_close_sp_parasep_level == 1 ||
       +                     state.saterm_close_sp_parasep_level == 2) &&
       +                    (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE ||
       +                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP    ||
       +                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP   ||
       +                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR    ||
       +                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
                                continue;
                        }
        
                        /* SB10 */
       -                if ((saterm_close_sp_parasep_level == 1 ||
       -                     saterm_close_sp_parasep_level == 2 ||
       -                     saterm_close_sp_parasep_level == 3) &&
       -                    (skip.c == SENTENCE_BREAK_PROP_SP  ||
       -                     skip.c == SENTENCE_BREAK_PROP_SEP ||
       -                     skip.c == SENTENCE_BREAK_PROP_CR  ||
       -                     skip.c == SENTENCE_BREAK_PROP_LF)) {
       +                if ((state.saterm_close_sp_parasep_level == 1 ||
       +                     state.saterm_close_sp_parasep_level == 2 ||
       +                     state.saterm_close_sp_parasep_level == 3) &&
       +                    (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP  ||
       +                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
       +                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR  ||
       +                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
                                continue;
                        }
        
                        /* SB11 */
       -                if (saterm_close_sp_parasep_level == 1 ||
       -                    saterm_close_sp_parasep_level == 2 ||
       -                    saterm_close_sp_parasep_level == 3 ||
       -                    saterm_close_sp_parasep_level == 4) {
       +                if (state.saterm_close_sp_parasep_level == 1 ||
       +                    state.saterm_close_sp_parasep_level == 2 ||
       +                    state.saterm_close_sp_parasep_level == 3 ||
       +                    state.saterm_close_sp_parasep_level == 4) {
                                break;
                        }
        
       @@ -330,17 +258,25 @@ next_sentence_break(const void *str, size_t len, size_t (*get_codepoint)
                        continue;
                }
        
       -        return off;
       +        return herodotus_reader_number_read(&(p.mid_reader));
        }
        
        size_t
        grapheme_next_sentence_break(const uint_least32_t *str, size_t len)
        {
       -        return next_sentence_break(str, len, get_codepoint);
       +        HERODOTUS_READER r;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
       +
       +        return next_sentence_break(&r);
        }
        
        size_t
        grapheme_next_sentence_break_utf8(const char *str, size_t len)
        {
       -        return next_sentence_break(str, len, get_codepoint_utf8);
       +        HERODOTUS_READER r;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
       +
       +        return next_sentence_break(&r);
        }