Refactor state into unsigned integer - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit ea1be565ad117a3e9846ae0e855d41021d94ee8a
 (DIR) parent f517655a98a155694cf57c180531724baa081c26
 (HTM) Author: Laslo Hunhold <dev@frign.de>
       Date:   Mon, 21 Nov 2022 11:05:26 +0100
       
       Refactor state into unsigned integer
       
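       Now that the level determination itself has been separated out,
       there is no need to keep the state in a signed integer. With an
       unsigned state, the separate clear mask can be dropped in favour
       of complementing the filter mask, which simplifies the masking.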
       
       Diffstat:
         M grapheme.h                          |       6 +++---
         M src/bidirectional.c                 |     122 +++++++++----------------------
         M test/bidirectional.c                |       2 +-
       
       3 files changed, 37 insertions(+), 93 deletions(-)
       ---
 (DIR) diff --git a/grapheme.h b/grapheme.h
       @@ -16,14 +16,14 @@ enum grapheme_bidirectional_override {
        };
        
        void grapheme_bidirectional_get_line_embedding_levels(
       -        const int_least32_t *, size_t, int_least8_t *);
       +        const uint_least32_t *, size_t, int_least8_t *);
        
        size_t grapheme_bidirectional_preprocess(
                const uint_least32_t *, size_t, enum grapheme_bidirectional_override,
       -        int_least32_t *, size_t);
       +        uint_least32_t *, size_t);
        size_t grapheme_bidirectional_preprocess_utf8(
                const char *, size_t, enum grapheme_bidirectional_override,
       -        int_least32_t *, size_t);
       +        uint_least32_t *, size_t);
        
        size_t grapheme_bidirectional_reorder_line(
                const uint_least32_t *, const int_least8_t *, size_t,
 (DIR) diff --git a/src/bidirectional.c b/src/bidirectional.c
       @@ -8,127 +8,71 @@
        
        #define MAX_DEPTH 125
        
       -#if 0
        enum state_type {
                STATE_PROP,            /* in 0..23, bidi_property */
       +        STATE_PRESERVED_PROP,  /* in 0..23, preserved bidi_property for L1-rules */
                STATE_BRACKET_OFF,     /* in 0..255, offset in bidi_bracket */
                STATE_LEVEL,           /* in 0..MAX_DEPTH+1=126, embedding level */
                STATE_PARAGRAPH_LEVEL, /* in 0..1, paragraph embedding level */
                STATE_VISITED,         /* in 0..1, visited within isolating run */
        };
        
       -/* without rawprop, as it should be */
        static struct {
       -        int_least32_t filter_mask;
       -        int_least32_t clear_mask;
       +        uint_least32_t filter_mask;
                size_t mask_shift;
                int_least16_t value_offset;
        } state_lut[] = {
                [STATE_PROP] = {
       -                .filter_mask  = 0x00001F, /* 00000000 00000000 00011111 */
       -                .clear_mask   = 0x3FFFE0, /* 00111111 11111111 11100000 */
       +                .filter_mask  = 0x000001F, /* 00000000 00000000 00000000 00011111 */
                        .mask_shift   = 0,
                        .value_offset = 0,
                },
       -        [STATE_BRACKET_OFF] = {
       -                .filter_mask  = 0x001FE0, /* 00000000 00011111 11100000 */
       -                .clear_mask   = 0x3FE01F, /* 00111111 11100000 00011111 */
       +        [STATE_PRESERVED_PROP] = {
       +                .filter_mask  = 0x00003E0, /* 00000000 00000000 00000011 11100000 */
                        .mask_shift   = 5,
                        .value_offset = 0,
                },
       -        [STATE_LEVEL] = {
       -                .filter_mask  = 0x0FE000, /* 00001111 11100000 00000000 */
       -                .clear_mask   = 0x301FFF, /* 00110000 00011111 11111111 */
       -                .mask_shift   = 13,
       -                .value_offset = -1,
       -        },
       -        [STATE_PARAGRAPH_LEVEL] = {
       -                .filter_mask  = 0x100000, /* 00010000 00000000 00000000 */
       -                .clear_mask   = 0x2FFFFF, /* 00101111 11111111 11111111 */
       -                .mask_shift   = 20,
       -                .value_offset = 0,
       -        },
       -        [STATE_VISITED] = {
       -                .filter_mask  = 0x200000, /* 00100000 00000000 00000000 */
       -                .clear_mask   = 0x1FFFFF, /* 00011111 11111111 11111111 */
       -                .mask_shift   = 21,
       -                .value_offset = 0,
       -        },
       -};
       -#endif
       -
       -enum state_type {
       -        STATE_PROP,            /* in 0..23, bidi_property */
       -        STATE_BRACKET_OFF,     /* in 0..255, offset in bidi_bracket */
       -        STATE_LEVEL,           /* in 0..MAX_DEPTH+1=126, embedding level */
       -        STATE_PARAGRAPH_LEVEL, /* in 0..1, paragraph embedding level */
       -        STATE_VISITED,         /* in 0..1, visited within isolating run */
       -        STATE_RAWPROP,
       -};
       -
       -static struct {
       -        int_least32_t filter_mask;
       -        int_least32_t clear_mask;
       -        size_t mask_shift;
       -        int_least16_t value_offset;
       -} state_lut[] = {
       -        [STATE_PROP] = {
       -                .filter_mask  = 0x000001F, /* 00000000 00000000 00000000 00011111 */
       -                .clear_mask   = 0x7FFFFE0, /* 00000111 11111111 11111111 11100000 */
       -                .mask_shift   = 0,
       -                .value_offset = 0,
       -        },
                [STATE_BRACKET_OFF] = {
       -                .filter_mask  = 0x0001FE0, /* 00000000 00000000 00011111 11100000 */
       -                .clear_mask   = 0x7FFE01F, /* 00000111 11111111 11100000 00011111 */
       -                .mask_shift   = 5,
       +                .filter_mask  = 0x003FC00, /* 00000000 00000011 11111100 00000000 */
       +                .mask_shift   = 10,
                        .value_offset = 0,
                },
                [STATE_LEVEL] = {
       -                .filter_mask  = 0x00FE000, /* 00000000 00001111 11100000 00000000 */
       -                .clear_mask   = 0x7F01FFF, /* 00000111 11110000 00011111 11111111 */
       -                .mask_shift   = 13,
       +                .filter_mask  = 0x1FC0000, /* 00000001 11111100 00000000 00000000 */
       +                .mask_shift   = 18,
                        .value_offset = -1,
                },
                [STATE_PARAGRAPH_LEVEL] = {
       -                .filter_mask  = 0x0100000, /* 00000000 00010000 00000000 00000000 */
       -                .clear_mask   = 0x7EFFFFF, /* 00000111 11101111 11111111 11111111 */
       -                .mask_shift   = 20,
       +                .filter_mask  = 0x2000000, /* 00000010 00000000 00000000 00000000 */
       +                .mask_shift   = 25,
                        .value_offset = 0,
                },
                [STATE_VISITED] = {
       -                .filter_mask  = 0x0200000, /* 00000000 00100000 00000000 00000000 */
       -                .clear_mask   = 0x7DFFFFF, /* 00000111 11011111 11111111 11111111 */
       -                .mask_shift   = 21,
       -                .value_offset = 0,
       -        },
       -        [STATE_RAWPROP] = {
       -                .filter_mask  = 0x7C00000, /* 00000111 11000000 00000000 00000000 */
       -                .clear_mask   = 0x03FFFFF, /* 00000000 00111111 11111111 11111111 */
       -                .mask_shift   = 22,
       +                .filter_mask  = 0x4000000, /* 00000100 00000000 00000000 00000000 */
       +                .mask_shift   = 26,
                        .value_offset = 0,
                },
        };
        
        static inline int_least16_t
       -get_state(enum state_type t, int_least32_t input)
       +get_state(enum state_type t, uint_least32_t input)
        {
       -        return (int_least16_t)(((input & state_lut[t].filter_mask) >>
       -                                state_lut[t].mask_shift) +
       -                               state_lut[t].value_offset);
       +        return (int_least16_t)((input & state_lut[t].filter_mask) >>
       +                               state_lut[t].mask_shift) +
       +                               state_lut[t].value_offset;
        }
        
        static inline void
       -set_state(enum state_type t, int_least16_t value, int_least32_t *output)
       +set_state(enum state_type t, int_least16_t value, uint_least32_t *output)
        {
       -        *output &= state_lut[t].clear_mask;
       -        *output |= ((value - state_lut[t].value_offset)
       +        *output &= ~state_lut[t].filter_mask;
       +        *output |= ((uint_least32_t)(value - state_lut[t].value_offset)
                            << state_lut[t].mask_shift) &
                           state_lut[t].filter_mask;
        }
        
        struct isolate_runner {
       -        int_least32_t *buf;
       +        uint_least32_t *buf;
                size_t buflen;
        
                struct {
       @@ -179,7 +123,7 @@ ir_set_current_prop(struct isolate_runner *ir, enum bidi_property prop)
        }
        
        static void
       -ir_init(int_least32_t *buf, size_t buflen, size_t off,
       +ir_init(uint_least32_t *buf, size_t buflen, size_t off,
                uint_least8_t paragraph_level, bool within, struct isolate_runner *ir)
        {
                size_t i;
       @@ -385,7 +329,7 @@ ir_advance(struct isolate_runner *ir)
        }
        
        static size_t
       -preprocess_isolating_run_sequence(int_least32_t *buf, size_t buflen, size_t off,
       +preprocess_isolating_run_sequence(uint_least32_t *buf, size_t buflen, size_t off,
                                          uint_least8_t paragraph_level)
        {
                enum bidi_property sequence_prop, prop;
       @@ -597,7 +541,7 @@ preprocess_isolating_run_sequence(int_least32_t *buf, size_t buflen, size_t off,
        
        static uint_least8_t
        get_paragraph_level(enum grapheme_bidirectional_override override,
       -                    bool terminate_on_pdi, const int_least32_t *buf,
       +                    bool terminate_on_pdi, const uint_least32_t *buf,
                            size_t buflen)
        {
                enum bidi_property prop;
       @@ -653,7 +597,7 @@ get_paragraph_level(enum grapheme_bidirectional_override override,
        
        static void
        preprocess_paragraph(enum grapheme_bidirectional_override override,
       -                     int_least32_t *buf, size_t buflen)
       +                     uint_least32_t *buf, size_t buflen)
        {
                enum bidi_property prop;
                int_least8_t level;
       @@ -961,7 +905,7 @@ again:
                runsince = SIZE_MAX;
                for (bufoff = 0; bufoff < buflen; bufoff++) {
                        level = (int_least8_t)get_state(STATE_LEVEL, buf[bufoff]);
       -                prop = (uint_least8_t)get_state(STATE_RAWPROP, buf[bufoff]);
       +                prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP, buf[bufoff]);
        
                        if (level == -1) {
                                /* ignored character */
       @@ -1038,7 +982,7 @@ get_bidi_bracket_off(uint_least32_t cp)
        static size_t
        preprocess(HERODOTUS_READER *r,
                   enum grapheme_bidirectional_override override,
       -           int_least32_t *buf, size_t buflen)
       +           uint_least32_t *buf, size_t buflen)
        {
                size_t bufoff, bufsize, lastparoff;
                uint_least32_t cp;
       @@ -1075,7 +1019,7 @@ preprocess(HERODOTUS_READER *r,
                                set_state(STATE_LEVEL, 0, &(buf[bufoff]));
                                set_state(STATE_PARAGRAPH_LEVEL, 0, &(buf[bufoff]));
                                set_state(STATE_VISITED, 0, &(buf[bufoff]));
       -                        set_state(STATE_RAWPROP,
       +                        set_state(STATE_PRESERVED_PROP,
                                          (uint_least8_t)get_bidi_property(cp),
                                          &(buf[bufoff]));
                        }
       @@ -1110,7 +1054,7 @@ preprocess(HERODOTUS_READER *r,
        size_t
        grapheme_bidirectional_preprocess(
                const uint_least32_t *src, size_t srclen,
       -        enum grapheme_bidirectional_override override, int_least32_t *dest,
       +        enum grapheme_bidirectional_override override, uint_least32_t *dest,
                size_t destlen)
        {
                HERODOTUS_READER r;
       @@ -1123,7 +1067,7 @@ grapheme_bidirectional_preprocess(
        size_t
        grapheme_bidirectional_preprocess_utf8(
                const char *src, size_t srclen,
       -        enum grapheme_bidirectional_override override, int_least32_t *dest,
       +        enum grapheme_bidirectional_override override, uint_least32_t *dest,
                size_t destlen)
        {
                HERODOTUS_READER r;
       @@ -1135,7 +1079,7 @@ grapheme_bidirectional_preprocess_utf8(
        
        void
        grapheme_bidirectional_get_line_embedding_levels(
       -        const int_least32_t *linedata, size_t linelen, int_least8_t *linelevel)
       +        const uint_least32_t *linedata, size_t linelen, int_least8_t *linelevel)
        {
                enum bidi_property prop;
                size_t i, runsince;
       @@ -1143,7 +1087,7 @@ grapheme_bidirectional_get_line_embedding_levels(
                /* rule L1.4 */
                runsince = SIZE_MAX;
                for (i = 0; i < linelen; i++) {
       -                prop = (uint_least8_t)get_state(STATE_RAWPROP, linedata[i]);
       +                prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP, linedata[i]);
        
                        /* write level into level array */
                        if ((linelevel[i] = (int_least8_t)get_state(
       @@ -1171,7 +1115,7 @@ grapheme_bidirectional_get_line_embedding_levels(
                         */
                        for (i = runsince; i < linelen; i++) {
                                if (linelevel[i] != -1) {
       -                                linelevel[i] = get_state(
       +                                linelevel[i] = (int_least8_t)get_state(
                                                STATE_PARAGRAPH_LEVEL, linedata[i]);
                                }
                        }
 (DIR) diff --git a/test/bidirectional.c b/test/bidirectional.c
       @@ -12,7 +12,7 @@
        int
        main(int argc, char *argv[])
        {
       -        int_least32_t data[512]; /* TODO iterate and get max, allocate */
       +        uint_least32_t data[512]; /* TODO iterate and get max, allocate */
                int_least8_t lev[512];
                size_t i, num_tests, failed, datalen, ret, j, m;