Convert GRAPHEME_STATE to uint_least16_t and remove it - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit fc73d06fed76dd7cde37d3704949d01391ea0032
 (DIR) parent a815be4b5de7f7df2da664049fdb04874d37016a
 (HTM) Author: Laslo Hunhold <dev@frign.de>
       Date:   Mon,  3 Oct 2022 23:13:26 +0200
       
       Convert GRAPHEME_STATE to uint_least16_t and remove it
       
       I was never quite happy with the fact that the internal state-struct
       was visible in the grapheme.h-header, given the declaration of the
       fields only namely served internal purposes and were useless noise to
       the reader.
       
       To keep it in was merely a choice made because I had always hoped
       to be able to implement maybe a few more state-based pairwise
       segmentation check functions and use the GRAPHEME_STATE type in more
       places, but now after implementing the algorithms it becomes clear that
       they all do not satisfy these pairwise semantics.
       
       The first logical step was to convert the struct to an uint_least16_t,
       which provides enough space (at least 16 bits) to store all the complete
       state, and have internal deserialiation and serialization functions.
       
       The remaining question was if the
       
               typedef uint_least16_t GRAPHEME_STATE
       
       should be removed. I took inspiration from the Linux kernel coding
       style[0], which in section 5b lays out the exact case of typedeffing
       an integer that is meant to store flags (just like in our case).
       It is argued there that there needs to be a good reason to typedef an
       integer (e.g. given it might change by architecture or maybe change in
       later versions). Both cases are not given here (we will _never_ need
       more than 16 bits to store the grapheme cluster break state and you can
       even reduce more wastage, e.g. for storing the prop which never exceeds
       4 bits given NUM_CHAR_BREAK_PROPS == 14 < 15 == 2^4-1), and I must admit
       that it improves readability a bit given you finally know what you're
       dealing with.
       
       The expression
       
               GRAPHEME_STATE state = 0;
       
       admittedly looks a little fishy, given you don't really know what happens
       behind the scenes unless you look in the header, and I want all of the
       semantics to be crystal clear to the end-user.
       
       [0]:https://www.kernel.org/doc/html/latest/process/coding-style.html#typedefs
       
       Signed-off-by: Laslo Hunhold <dev@frign.de>
       
       Diffstat:
         M benchmark/character.c               |       2 +-
         M grapheme.h                          |       9 +--------
         M man/grapheme_is_character_break.sh  |       4 ++--
         M src/character.c                     |      60 +++++++++++++++++++++++--------
       
       4 files changed, 49 insertions(+), 26 deletions(-)
       ---
 (DIR) diff --git a/benchmark/character.c b/benchmark/character.c
       @@ -23,7 +23,7 @@ struct break_benchmark_payload {
        void
        libgrapheme(const void *payload)
        {
       -        GRAPHEME_STATE state = { 0 };
       +        uint_least16_t state = 0;
                const struct break_benchmark_payload *p = payload;
                size_t i;
        
 (DIR) diff --git a/grapheme.h b/grapheme.h
       @@ -6,19 +6,12 @@
        #include <stddef.h>
        #include <stdint.h>
        
       -typedef struct grapheme_internal_segmentation_state {
       -        uint_least8_t prop;
       -        bool prop_set;
       -        bool gb11_flag;
       -        bool gb12_13_flag;
       -} GRAPHEME_STATE;
       -
        #define GRAPHEME_INVALID_CODEPOINT UINT32_C(0xFFFD)
        
        size_t grapheme_decode_utf8(const char *, size_t, uint_least32_t *);
        size_t grapheme_encode_utf8(uint_least32_t, char *, size_t);
        
       -bool grapheme_is_character_break(uint_least32_t, uint_least32_t, GRAPHEME_STATE *);
       +bool grapheme_is_character_break(uint_least32_t, uint_least32_t, uint_least16_t *);
        
        bool grapheme_is_lowercase(const uint_least32_t *, size_t, size_t *);
        bool grapheme_is_uppercase(const uint_least32_t *, size_t, size_t *);
 (DIR) diff --git a/man/grapheme_is_character_break.sh b/man/grapheme_is_character_break.sh
       @@ -8,7 +8,7 @@ cat << EOF
        .Sh SYNOPSIS
        .In grapheme.h
        .Ft size_t
       -.Fn grapheme_is_character_break "uint_least32_t cp1" "uint_least32_t cp2" "GRAPHEME_STATE *state"
       +.Fn grapheme_is_character_break "uint_least32_t cp1" "uint_least32_t cp2" "uint_least16_t *state"
        .Sh DESCRIPTION
        The
        .Fn grapheme_is_character_break
       @@ -52,7 +52,7 @@ if there is not.
        int
        main(void)
        {
       -        GRAPHEME_STATE state = { 0 };
       +        uint_least16_t state = 0;
                uint_least32_t s1[] = ..., s2[] = ...; /* two input arrays */
                size_t i;
        
 (DIR) diff --git a/src/character.c b/src/character.c
       @@ -7,6 +7,13 @@
        #include "../grapheme.h"
        #include "util.h"
        
       +struct character_break_state {
       +        uint_least8_t prop;
       +        bool prop_set;
       +        bool gb11_flag;
       +        bool gb12_13_flag;
       +};
       +
        static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = {
                [CHAR_BREAK_PROP_OTHER] =
                        UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
       @@ -106,38 +113,59 @@ get_break_prop(uint_least32_t cp)
        {
                if (likely(cp <= 0x10FFFF)) {
                        return (enum char_break_property)
       -                       char_break_minor[char_break_major[cp >> 8] + (cp & 0xff)];
       +                       char_break_minor[char_break_major[cp >> 8] + (cp & 0xFF)];
                } else {
                        return CHAR_BREAK_PROP_OTHER;
                }
        }
        
       +static inline void
       +state_serialize(const struct character_break_state *in, uint_least16_t *out)
       +{
       +        *out = (uint_least16_t)(in->prop & UINT8_C(0xFF))                   | /* first 8 bits */
       +               (uint_least16_t)(((uint_least16_t)(in->prop_set))     <<  8) | /* 9th bit */
       +               (uint_least16_t)(((uint_least16_t)(in->gb11_flag))    <<  9) | /* 10th bit */
       +               (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag)) << 10);  /* 11th bit */
       +}
       +
       +static inline void
       +state_deserialize(uint_least16_t in, struct character_break_state *out)
       +{
       +        out->prop         = in & UINT8_C(0xFF);
       +        out->prop_set     = in & (((uint_least16_t)(1)) <<  8);
       +        out->gb11_flag    = in & (((uint_least16_t)(1)) <<  9);
       +        out->gb12_13_flag = in & (((uint_least16_t)(1)) << 10);
       +}
       +
        bool
       -grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STATE *state)
       +grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, uint_least16_t *s)
        {
       +        struct character_break_state state;
                enum char_break_property cp0_prop, cp1_prop;
                bool notbreak = false;
        
       -        if (likely(state)) {
       -                if (likely(state->prop_set)) {
       -                        cp0_prop = state->prop;
       +        if (likely(s)) {
       +                state_deserialize(*s, &state);
       +
       +                if (likely(state.prop_set)) {
       +                        cp0_prop = state.prop;
                        } else {
                                cp0_prop = get_break_prop(cp0);
                        }
                        cp1_prop = get_break_prop(cp1);
        
                        /* preserve prop of right codepoint for next iteration */
       -                state->prop = (uint_least8_t)cp1_prop;
       -                state->prop_set = true;
       +                state.prop = (uint_least8_t)cp1_prop;
       +                state.prop_set = true;
        
                        /* update flags */
       -                state->gb11_flag =
       +                state.gb11_flag =
                                flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
       -                                         state->gb11_flag] &
       +                                         state.gb11_flag] &
                                UINT16_C(1) << cp1_prop;
       -                state->gb12_13_flag =
       +                state.gb12_13_flag =
                                flag_update_gb12_13[cp0_prop + NUM_CHAR_BREAK_PROPS *
       -                                            state->gb12_13_flag] &
       +                                            state.gb12_13_flag] &
                                UINT16_C(1) << cp1_prop;
        
                        /*
       @@ -145,17 +173,19 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA
                         * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
                         */
                        notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
       -                           (dont_break_gb11[cp0_prop + state->gb11_flag *
       +                           (dont_break_gb11[cp0_prop + state.gb11_flag *
                                                    NUM_CHAR_BREAK_PROPS] &
                                    (UINT16_C(1) << cp1_prop)) ||
       -                           (dont_break_gb12_13[cp0_prop + state->gb12_13_flag *
       +                           (dont_break_gb12_13[cp0_prop + state.gb12_13_flag *
                                                       NUM_CHAR_BREAK_PROPS] &
                                    (UINT16_C(1) << cp1_prop));
        
                        /* update or reset flags (when we have a break) */
                        if (likely(!notbreak)) {
       -                        state->gb11_flag = state->gb12_13_flag = false;
       +                        state.gb11_flag = state.gb12_13_flag = false;
                        }
       +
       +                state_serialize(&state, s);
                } else {
                        cp0_prop = get_break_prop(cp0);
                        cp1_prop = get_break_prop(cp1);
       @@ -178,7 +208,7 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA
        static size_t
        next_character_break(HERODOTUS_READER *r)
        {
       -        GRAPHEME_STATE state = { 0 };
       +        uint_least16_t state = 0;
                uint_least32_t cp0 = 0, cp1 = 0;
        
                for (herodotus_read_codepoint(r, true, &cp0);