Add "proper"-property-reader - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit b899fd685c50cbc61999296ce1e0a03a45e74f52
 (DIR) parent a4d42053f13e8471ee3903522f964fc0a1d3161a
 (HTM) Author: Laslo Hunhold <dev@frign.de>
       Date:   Sun,  2 Oct 2022 21:09:08 +0200
       
       Add "proper"-property-reader
       
       The word- and sentence-segmentation algorithms make use of complicated
       logic to accommodate "raw" and "skip" properties. The code is barely
       readable and doesn't separate its abstractions cleanly. Moreover, there
       is a high probability that certain edge-cases are not handled properly.
       
       To fix this, this commit adds a "proper"-property-reader, which
       basically handles all the dirty details in the background using
       well-commented and transparent code that builds on top of the
       herodotus-reader instead of doing this by hand. This ensures that we
       will (provably) never have buffer overflows unless there is a mistake
       in the implementation itself, which can be verified relatively easily
       given each function has a limited scope.
       
       Signed-off-by: Laslo Hunhold <dev@frign.de>
       
       Diffstat:
         M src/case.c                          |      25 ++++++++++++-------------
         M src/util.c                          |     159 ++++++++++++++++++++++++++++++-
         M src/util.h                          |      23 +++++++++++++++++++++++
       
       3 files changed, 190 insertions(+), 17 deletions(-)
       ---
 (DIR) diff --git a/src/case.c b/src/case.c
       @@ -147,18 +147,14 @@ to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
        static size_t
        herodotus_next_word_break(const HERODOTUS_READER *r)
        {
       -        if (r->src == NULL || r->off > r->srclen) {
       -                return 0;
       -        }
       +        HERODOTUS_READER tmp;
       +
       +        herodotus_reader_copy(r, &tmp);
        
                if (r->type == HERODOTUS_TYPE_CODEPOINT) {
       -                return grapheme_next_word_break(
       -                        ((const uint_least32_t *)(r->src)) + r->off,
       -                        r->srclen - r->off);
       +                return grapheme_next_word_break(tmp.src, tmp.srclen);
                } else { /* r->type == HERODOTUS_TYPE_UTF8 */
       -                return grapheme_next_word_break_utf8(
       -                        ((const char *)(r->src)) + r->off,
       -                        r->srclen - r->off);
       +                return grapheme_next_word_break_utf8(tmp.src, tmp.srclen);
                }
        }
        
       @@ -168,9 +164,10 @@ to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
                enum case_property prop;
                enum herodotus_status s;
                uint_least32_t cp;
       +        size_t nwb;
        
       -        for (;;) {
       -                herodotus_reader_push_advance_limit(r, herodotus_next_word_break(r));
       +        for (; (nwb = herodotus_next_word_break(r)) > 0;) {
       +                herodotus_reader_push_advance_limit(r, nwb);
                        for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
                                /* check if we have a cased character */
                                prop = get_case_property(cp);
       @@ -354,9 +351,10 @@ is_titlecase(HERODOTUS_READER *r, size_t *output)
                enum herodotus_status s;
                bool ret = true;
                uint_least32_t cp;
       +        size_t nwb;
        
       -        for (;;) {
       -                herodotus_reader_push_advance_limit(r, herodotus_next_word_break(r));
       +        for (; (nwb = herodotus_next_word_break(r)) > 0;) {
       +                herodotus_reader_push_advance_limit(r, nwb);
                        for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
                                /* check if we have a cased character */
                                prop = get_case_property(cp);
       @@ -377,6 +375,7 @@ is_titlecase(HERODOTUS_READER *r, size_t *output)
                                 * we did not encounter any cased character
                                 * up to the word break
                                 */
       +                        herodotus_reader_pop_limit(r);
                                continue;
                        } else {
                                /*
 (DIR) diff --git a/src/util.c b/src/util.c
       @@ -30,14 +30,31 @@ herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *dest)
        {
                size_t i;
        
       +        /*
       +         * we copy such that we have a "fresh" start and build
       +         * on the fact that src->soft_limit[i] for any i and src->srclen
       +         * are always larger or equal to src->off
       +         */
                dest->type = src->type;
       -        dest->src = src->src;
       -        dest->srclen = src->srclen;
       -        dest->off = src->off;
       +        if (src->type == HERODOTUS_TYPE_CODEPOINT) {
       +                dest->src = ((const uint_least32_t *)(src->src)) + src->off;
       +        } else { /* src->type == HERODOTUS_TYPE_UTF8 */
       +                dest->src = ((const char *)(src->src)) + src->off;
       +        }
       +        if (src->srclen == SIZE_MAX) {
       +                dest->srclen = SIZE_MAX;
       +        } else {
       +                dest->srclen = src->srclen - src->off;
       +        }
       +        dest->off = 0;
                dest->terminated_by_null = src->terminated_by_null;
        
                for (i = 0; i < LEN(src->soft_limit); i++) {
       -                dest->soft_limit[i] = src->soft_limit[i];
       +                if (src->soft_limit[i] == SIZE_MAX) {
       +                        dest->soft_limit[i] = src->soft_limit[i];
       +                } else {
       +                        dest->soft_limit[i] = src->soft_limit[i] - src->off;
       +                }
                }
        }
        
       @@ -258,6 +275,140 @@ herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp)
                }
        }
        
       +void
       +proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop,
       +            uint_least8_t (*get_break_prop)(uint_least32_t),
       +            bool (*is_skippable_prop)(uint_least8_t),
       +            void (*skip_shift_callback)(uint_least8_t, void *),
       +            struct proper *p)
       +{
       +        uint_least8_t prop;
       +        uint_least32_t cp;
       +        size_t i;
       +
       +        /* set internal variables */
       +        p->state = state;
       +        p->no_prop = no_prop;
       +        p->get_break_prop = get_break_prop;
       +        p->is_skippable_prop = is_skippable_prop;
       +        p->skip_shift_callback = skip_shift_callback;
       +
       +        /*
       +         * Initialize mid-reader, which is basically just there
       +         * to reflect the current position of the viewing-line
       +         */
       +        herodotus_reader_copy(r, &(p->mid_reader));
       +
       +        /*
       +         * In the initialization, we simply (try to) fill in next_prop.
       +         * If we cannot read in more (due to the buffer ending), we
       +         * fill in the prop as invalid
       +         */
       +
       +        /*
       +         * initialize the previous properties to have no property
       +         * (given we are at the start of the buffer)
       +         */
       +        p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop;
       +        p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop;
       +
       +        /*
       +         * initialize the next properties
       +         */
       +
       +        /* initialize the raw reader */
       +        herodotus_reader_copy(r, &(p->raw_reader));
       +
       +        /* fill in the two next raw properties (after no-initialization) */
       +        p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop;
       +        for (i = 0; i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
       +             HERODOTUS_STATUS_SUCCESS; ) {
       +                p->raw.next_prop[i++] = p->get_break_prop(cp);
       +        }
       +
       +        /* initialize the skip reader */
       +        herodotus_reader_copy(r, &(p->skip_reader));
       +
       +        /* fill in the two next skip properties (after no-initialization) */
       +        p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop;
       +        for (i = 0; i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
       +             HERODOTUS_STATUS_SUCCESS; ) {
       +                prop = p->get_break_prop(cp);
       +                if (!p->is_skippable_prop(prop)) {
       +                        p->skip.next_prop[i++] = prop;
       +                }
       +        }
       +}
       +
       +int
       +proper_advance(struct proper *p)
       +{
       +        uint_least8_t prop;
       +        uint_least32_t cp;
       +
       +        /* read in next "raw" property */
       +        if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
       +            HERODOTUS_STATUS_SUCCESS) {
       +                prop = p->get_break_prop(cp);
       +        } else {
       +                prop = p->no_prop;
       +        }
       +
       +        /*
       +         * do a shift-in, unless we find that the property that is to
       +         * be moved past the "raw-viewing-line" (this property is stored
       +         * in p->raw.next_prop[0]) is a no_prop, indicating that
       +         * we are at the end of the buffer.
       +         */
       +        if (p->raw.next_prop[0] == p->no_prop) {
       +                return 1;
       +        }
       +
       +        /* shift in the properties */
       +        p->raw.prev_prop[1] = p->raw.prev_prop[0];
       +        p->raw.prev_prop[0] = p->raw.next_prop[0];
       +        p->raw.next_prop[0] = p->raw.next_prop[1];
       +        p->raw.next_prop[1] = prop;
       +
       +        /* advance the middle reader viewing-line */
       +        (void)herodotus_read_codepoint(&(p->mid_reader), true, &cp);
       +
       +        /* check skippability-property */
       +        if (!p->is_skippable_prop(p->raw.prev_prop[0])) {
       +                /*
       +                 * the property that has moved past the "raw-viewing-line"
       +                 * (this property is now (after the raw-shift) stored in
       +                 * p->raw.prev_prop[0] and guaranteed not to be a no-prop,
       +                 * guaranteeing that we won't shift a no-prop past the
       +                 * "viewing-line" in the skip-properties) is not a skippable
       +                 * property, thus we need to shift the skip property as well.
       +                 */
       +                p->skip.prev_prop[1] = p->skip.prev_prop[0];
       +                p->skip.prev_prop[0] = p->skip.next_prop[0];
       +                p->skip.next_prop[0] = p->skip.next_prop[1];
       +
       +                /*
       +                 * call the skip-shift-callback on the property that
       +                 * passed the skip-viewing-line (this property is now
       +                 * stored in p->skip.prev_prop[0]).
       +                 */
       +                p->skip_shift_callback(p->skip.prev_prop[0], p->state);
       +
       +                /* determine the next shift property */
       +                p->skip.next_prop[1] = p->no_prop;
       +                while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
       +                       HERODOTUS_STATUS_SUCCESS) {
       +                        prop = p->get_break_prop(cp);
       +                        if (!p->is_skippable_prop(prop)) {
       +                                p->skip.next_prop[1] = prop;
       +                                break;
       +                        }
       +                }
       +        }
       +
       +        return 0;
       +}
       +
        inline size_t
        get_codepoint(const void *str, size_t len, size_t offset, uint_least32_t *cp)
        {
 (DIR) diff --git a/src/util.h b/src/util.h
       @@ -74,6 +74,22 @@ typedef struct herodotus_writer {
                size_t first_unwritable_offset;
        } HERODOTUS_WRITER;
        
       +struct proper {
       +        /*
       +         * prev_prop[1] prev_prop[0] | next_prop[0] next_prop[1]
       +         */
       +        struct {
       +                uint_least8_t prev_prop[2];
       +                uint_least8_t next_prop[2];
       +        } raw, skip;
       +        HERODOTUS_READER mid_reader, raw_reader, skip_reader;
       +        void *state;
       +        uint_least8_t no_prop;
       +        uint_least8_t (*get_break_prop)(uint_least32_t);
       +        bool (*is_skippable_prop)(uint_least8_t);
       +        void (*skip_shift_callback)(uint_least8_t, void *);
       +};
       +
        void herodotus_reader_init(HERODOTUS_READER *, enum herodotus_type,
                                   const void *, size_t);
        void herodotus_reader_copy(const HERODOTUS_READER *, HERODOTUS_READER *);
       @@ -90,6 +106,13 @@ void herodotus_writer_nul_terminate(HERODOTUS_WRITER *);
        size_t herodotus_writer_number_written(const HERODOTUS_WRITER *);
        void herodotus_write_codepoint(HERODOTUS_WRITER *, uint_least32_t);
        
       +void proper_init(const HERODOTUS_READER *, void *, uint_least8_t,
       +                 uint_least8_t (*get_break_prop)(uint_least32_t),
       +                 bool (*is_skippable_prop)(uint_least8_t),
       +                 void (*skip_shift_callback)(uint_least8_t, void *),
       +                 struct proper *);
       +int proper_advance(struct proper *);
       +
        size_t get_codepoint(const void *, size_t, size_t, uint_least32_t *);
        size_t get_codepoint_utf8(const void *, size_t, size_t, uint_least32_t *);