Refactor case-conversion-functions with Herodotus - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 5332f7ee034081618617c2b0785733ccc9ec8753
 (DIR) parent 563eb65bfbaa4f27c77d73ae81b51882c916993d
 (HTM) Author: Laslo Hunhold <dev@frign.de>
       Date:   Wed, 21 Sep 2022 20:16:00 +0200
       
       Refactor case-conversion-functions with Herodotus
       
       The readability of the code is greatly improved, and the code is now
       much more robust than before.
       
       Signed-off-by: Laslo Hunhold <dev@frign.de>
       
       Diffstat:
         M src/case.c                          |     255 ++++++++++++++-----------------
       
       1 file changed, 112 insertions(+), 143 deletions(-)
       ---
 (DIR) diff --git a/src/case.c b/src/case.c
       @@ -33,22 +33,18 @@ get_case_offset(uint_least32_t cp, const uint_least16_t *major,
        }
        
        static inline size_t
       -to_case(const void *src, size_t srclen, void *dest, size_t destlen,
       -        size_t srcnumprocess, uint_least8_t final_sigma_level,
       -        size_t (*get_codepoint)(const void *, size_t, size_t, uint_least32_t *),
       -        size_t (*set_codepoint)(uint_least32_t, void *, size_t, size_t),
       -        const uint_least16_t *major, const int_least32_t *minor,
       -        const struct special_case *sc)
       +to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
       +        uint_least8_t final_sigma_level, const uint_least16_t *major,
       +        const int_least32_t *minor, const struct special_case *sc)
        {
       +        HERODOTUS_READER tmp;
                enum case_property prop;
       -        size_t srcoff, destoff, res, tmp, off, i;
       +        enum herodotus_status s;
       +        size_t off, i;
                uint_least32_t cp, tmp_cp;
                int_least32_t map;
        
       -        for (srcoff = 0, destoff = 0; srcoff < srcnumprocess; srcoff += res) {
       -                /* read in next source codepoint */
       -                res = get_codepoint((const char *)src, srclen, srcoff, &cp);
       -
       +        for (; herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCESS;) {
                        if (sc == lower_special) {
                                /*
                                 * For the special Final_Sigma-rule (see SpecialCasing.txt),
       @@ -72,8 +68,10 @@ to_case(const void *src, size_t srclen, void *dest, size_t destlen,
                                         * if the succeeding character is cased, invalidating
                                         * the after-condition
                                         */
       -                                for (tmp = srcoff + res, prop = NUM_CASE_PROPS; tmp < srclen; ) {
       -                                        tmp += get_codepoint(src, srclen, tmp, &tmp_cp);
       +                                herodotus_reader_copy(r, &tmp);
       +                                for (prop = NUM_CASE_PROPS;
       +                                     (s = herodotus_read_codepoint(&tmp, true, &tmp_cp)) ==
       +                                     HERODOTUS_STATUS_SUCCESS; ) {
                                                prop = get_case_property(tmp_cp);
        
                                                if (prop != CASE_PROP_CASE_IGNORABLE &&
       @@ -83,20 +81,19 @@ to_case(const void *src, size_t srclen, void *dest, size_t destlen,
                                        }
        
                                        /*
       -                                 * Now prop is something other than case-ignorable.
       +                                 * Now prop is something other than case-ignorable or
       +                                 * the source-string ended.
                                         * If it is something other than cased, we know
                                         * that the after-condition holds
                                         */
       -                                if (prop != CASE_PROP_CASED &&
       -                                    prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
       +                                if (s != HERODOTUS_STATUS_SUCCESS ||
       +                                    (prop != CASE_PROP_CASED &&
       +                                     prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
                                                /*
                                                 * write GREEK SMALL LETTER FINAL SIGMA to
                                                 * destination
                                                 */
       -                                        destoff += set_codepoint(UINT32_C(0x03C2),
       -                                                                 dest,
       -                                                                 destlen,
       -                                                                 destoff);
       +                                        herodotus_write_codepoint(w, UINT32_C(0x03C2));
                                                
                                                /* reset Final_Sigma-state and continue */
                                                final_sigma_level = 0;
       @@ -132,191 +129,163 @@ to_case(const void *src, size_t srclen, void *dest, size_t destlen,
                                off = (uint_least32_t)map - UINT32_C(0x110000);
        
                                for (i = 0; i < sc[off].cplen; i++) {
       -                                if (likely(destoff < destlen)) {
       -                                        /*
       -                                         * write special mapping to destination
       -                                         */
       -                                        destoff += set_codepoint(sc[off].cp[i],
       -                                                                 dest,
       -                                                                 destlen,
       -                                                                 destoff);
       -                                } else {
       -                                        /*
       -                                         * further increase destoff to indicate
       -                                         * how much buffer space we need
       -                                         */
       -                                        destoff += set_codepoint(sc[off].cp[i],
       -                                                                 NULL, 0, 0);
       -                                }
       +                                herodotus_write_codepoint(w, sc[off].cp[i]);
                                }
                        } else {
                                /* we have a simple mapping */
       -                        if (likely(destoff < destlen)) {
       -                                destoff += set_codepoint((uint_least32_t)((int_least32_t)cp + map),
       -                                                         dest, destlen, destoff);
       -                        } else {
       -                                destoff += set_codepoint((uint_least32_t)((int_least32_t)cp + map),
       -                                                         NULL, 0, 0);
       -                        }
       +                        herodotus_write_codepoint(w, (uint_least32_t)
       +                                                  ((int_least32_t)cp + map));
                        }
                }
        
       -        if (set_codepoint == set_codepoint_utf8 && destlen > 0) {
       -                /*
       -                 * NUL-terminate destination to always ensure NUL-termination,
       -                 * unless in check mode.
       -                 * Just like with snprintf() a return value >= destlen indicates
       -                 * truncation.
       -                 */
       -                ((char *)dest)[(destoff < destlen) ? destoff : (destlen - 1)] = '\0';
       +        herodotus_writer_nul_terminate(w);
       +
       +        return herodotus_writer_number_written(w);
       +}
       +
       +static size_t
       +herodotus_next_word_break(const HERODOTUS_READER *r)
       +{
       +        if (r->src == NULL || r->off > r->srclen) {
       +                return 0;
                }
        
       -        return destoff;
       +        if (r->type == HERODOTUS_TYPE_CODEPOINT) {
       +                return grapheme_next_word_break(
       +                        ((const uint_least32_t *)(r->src)) + r->off,
       +                        r->srclen - r->off);
       +        } else { /* r->type == HERODOTUS_TYPE_UTF8 */
       +                return grapheme_next_word_break_utf8(
       +                        ((const char *)(r->src)) + r->off,
       +                        r->srclen - r->off);
       +        }
        }
        
        static inline size_t
       -to_titlecase(const void *src, size_t srclen, void *dest, size_t destlen,
       -             size_t (*get_codepoint)(const void *, size_t, size_t, uint_least32_t *),
       -             size_t (*set_codepoint)(uint_least32_t, void *, size_t, size_t))
       +to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
        {
                enum case_property prop;
       -        size_t next_wb, srcoff, destoff, res;
       +        enum herodotus_status s;
                uint_least32_t cp;
        
       -        for (srcoff = destoff = 0; ; ) {
       -                if (get_codepoint == get_codepoint_utf8) {
       -                        if ((next_wb = grapheme_next_word_break_utf8((const char *)src + srcoff,
       -                                                                     srclen - srcoff)) == 0) {
       -                                /* we consumed all of the string */
       -                                break;
       -                        }
       -                } else {
       -                        if ((next_wb = grapheme_next_word_break((const uint_least32_t *)src + srcoff,
       -                                                                srclen - srcoff)) == 0) {
       -                                /* we consumed all of the string */
       -                                break;
       -                        }
       -                }
       -
       -                for (; next_wb > 0 && srcoff < srclen; next_wb -= res, srcoff += res) {
       +        for (;;) {
       +                herodotus_reader_push_advance_limit(r, herodotus_next_word_break(r));
       +                for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
                                /* check if we have a cased character */
       -                        res = get_codepoint(src, srclen, srcoff, &cp);
                                prop = get_case_property(cp);
                                if (prop == CASE_PROP_CASED ||
                                    prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
                                        break;
                                } else {
                                        /* write the data to the output verbatim, if it permits */
       -                                destoff += set_codepoint_utf8(cp, dest, destlen, destoff);
       -                        }
       -                }
       +                                herodotus_write_codepoint(w, cp);
        
       -                if (next_wb > 0) {
       -                        /* get character length */
       -                        res = get_codepoint(src, srclen, srcoff, &cp);
       -
       -                        /* we have a cased character at srcoff, map it to titlecase */
       -                        if (get_codepoint == get_codepoint_utf8) {
       -                                destoff += to_case((const char *)src + srcoff,
       -                                                   srclen - srcoff,
       -                                                   (char *)dest + destoff,
       -                                                   (destoff < destlen) ? (destlen - destoff) : 0,
       -                                                   res, 0,
       -                                                   get_codepoint_utf8,
       -                                                   set_codepoint_utf8, title_major,
       -                                                   title_minor, title_special);
       -                        } else {
       -                                destoff += to_case((const uint_least32_t *)src + srcoff,
       -                                                   srclen - srcoff,
       -                                                   (uint_least32_t *)dest + destoff,
       -                                                   (destoff < destlen) ? (destlen - destoff) : 0,
       -                                                   res, 0,
       -                                                   get_codepoint,
       -                                                   set_codepoint, title_major,
       -                                                   title_minor, title_special);
       +                                /* increment reader */
       +                                herodotus_read_codepoint(r, true, &cp);
                                }
       -
       -                        /* we consumed a character */
       -                        srcoff += res;
       -                        next_wb -= res;
                        }
        
       -                /* cast the rest of the codepoints in the word to lowercase */
       -                if (get_codepoint == get_codepoint_utf8) {
       -                        destoff += to_case((const char *)src + srcoff,
       -                                           srclen - srcoff,
       -                                           (char *)dest + destoff,
       -                                           (destoff < destlen) ? (destlen - destoff) : 0,
       -                                           next_wb, 1,
       -                                           get_codepoint_utf8,
       -                                           set_codepoint_utf8, lower_major,
       -                                           lower_minor, lower_special);
       +                if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
       +                        /* we are done */
       +                        break;
       +                } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
       +                        /*
       +                         * we did not encounter any cased character
       +                         * up to the word break
       +                         */
       +                        continue;
                        } else {
       -                        destoff += to_case((const uint_least32_t *)src + srcoff,
       -                                           srclen - srcoff,
       -                                           (uint_least32_t *)dest + destoff,
       -                                           (destoff < destlen) ? (destlen - destoff) : 0,
       -                                           next_wb, 1,
       -                                           get_codepoint,
       -                                           set_codepoint, lower_major,
       -                                           lower_minor, lower_special);
       +                        /*
       +                         * we encountered a cased character before the word
       +                         * break, convert it to titlecase
       +                         */
       +                        herodotus_reader_push_advance_limit(r,
       +                                herodotus_reader_next_codepoint_break(r));
       +                        to_case(r, w, 0, title_major, title_minor, title_special);
       +                        herodotus_reader_pop_limit(r);
                        }
       -                srcoff += next_wb;
       -        }
        
       -        if (set_codepoint == set_codepoint_utf8) {
       -                /*
       -                 * NUL-terminate destination to always ensure NUL-termination.
       -                 * Just like with snprintf() a return value >= destlen indicates
       -                 * truncation.
       -                 */
       -                ((char *)dest)[(destoff < destlen) ? destoff : (destlen - 1)] = '\0';
       +                /* cast the rest of the codepoints in the word to lowercase */
       +                to_case(r, w, 1, lower_major, lower_minor, lower_special);
       +
       +                herodotus_reader_pop_limit(r);
                }
        
       -        return destoff;
       +        herodotus_writer_nul_terminate(w);
       +
       +        return herodotus_writer_number_written(w);
        }
        
        size_t
        grapheme_to_uppercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
        {
       -        return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint, set_codepoint,
       -                       upper_major, upper_minor, upper_special);
       +        HERODOTUS_READER r;
       +        HERODOTUS_WRITER w;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
       +        herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
       +
       +        return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
        }
        
        size_t
        grapheme_to_lowercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
        {
       -        return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint, set_codepoint,
       -                       lower_major, lower_minor, lower_special);
       +        HERODOTUS_READER r;
       +        HERODOTUS_WRITER w;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
       +        herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
       +
       +        return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
        }
        
        size_t
        grapheme_to_titlecase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
        {
       -        return to_titlecase(src, srclen, dest, destlen, get_codepoint,
       -                            set_codepoint);
       +        HERODOTUS_READER r;
       +        HERODOTUS_WRITER w;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
       +        herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
       +
       +        return to_titlecase(&r, &w);
        }
        
        size_t
        grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
        {
       -        return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint_utf8, set_codepoint_utf8,
       -                       upper_major, upper_minor, upper_special);
       +        HERODOTUS_READER r;
       +        HERODOTUS_WRITER w;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
       +        herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
       +
       +        return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
        }
        
        size_t
        grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
        {
       -        return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint_utf8, set_codepoint_utf8,
       -                       lower_major, lower_minor, lower_special);
       +        HERODOTUS_READER r;
       +        HERODOTUS_WRITER w;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
       +        herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
        
       +        return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
        }
        
        size_t
        grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
        {
       -        return to_titlecase(src, srclen, dest, destlen, get_codepoint_utf8,
       -                            set_codepoint_utf8);
       +        HERODOTUS_READER r;
       +        HERODOTUS_WRITER w;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
       +        herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
       +
       +        return to_titlecase(&r, &w);
        }
        
        static inline bool