Refactor bidi and add reordering function - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 52ee78ea80d51b163f7fc85e9387389266d2331b
 (DIR) parent 77e30a69ce0807fbee01d43eebedda34b54f41af
 (HTM) Author: Laslo Hunhold <dev@frign.de>
       Date:   Fri, 26 May 2023 09:40:10 +0200
       
       Refactor bidi and add reordering function
       
       - Rename bidi-override enum to bidi-direction, including entries. This
         better reflects the general nature of it.
       - Remove UTF-8-related bidi-functions, given it would be too complicated
         to reflect in an API and opens up some very difficult challenges.
       - Rename *_preprocess to *_preprocess_paragraph and return the resolved
         paragraph embedding level as an optional out-parameter. This is the
         only way to meaningfully handle large chunks of text with paragraphs
         of different embedding levels.
       - Separate the get_paragraph_level() function into two for
         isolated-paragraphs and whole paragraphs. This simplifies it a lot, as
         we don't have the crazy bool-flag-mess any more.
       - Add a grapheme_bidirectional_reorder_line function that directly
         operates on preprocessed data and returns the reordered string without
         any additionally necessary buffering. For this the
         get_line_embedding_levels had to be made a bit more general to allow
         different ways of writing the levels into the output.
         This function makes use of the mirror-LUT and has a small section
         still commented out regarding the proper inversion of grapheme
         clusters that will need more investigation.
       
       Signed-off-by: Laslo Hunhold <dev@frign.de>
       
       Diffstat:
         M gen/bidirectional-test.c            |      44 ++++++++++++++++---------------
         M grapheme.h                          |      24 ++++++++----------------
         M src/bidirectional.c                 |     432 ++++++++++++++++++++++++-------
       
       3 files changed, 366 insertions(+), 134 deletions(-)
       ---
 (DIR) diff --git a/gen/bidirectional-test.c b/gen/bidirectional-test.c
       @@ -12,7 +12,7 @@
        struct bidirectional_test {
                uint_least32_t *cp;
                size_t cplen;
       -        enum grapheme_bidirectional_override mode[3];
       +        enum grapheme_bidirectional_direction mode[3];
                size_t modelen;
                int_least8_t *level;
                int_least8_t *reorder;
       @@ -210,7 +210,7 @@ bidirectional_test_list_print(const struct bidirectional_test *test,
                printf("static const struct {\n"
                       "\tuint_least32_t *cp;\n"
                       "\tsize_t cplen;\n"
       -               "\tenum grapheme_bidirectional_override *mode;\n"
       +               "\tenum grapheme_bidirectional_direction *mode;\n"
                       "\tsize_t modelen;\n"
                       "\tint_least8_t *level;\n"
                       "\tint_least8_t *reorder;\n"
       @@ -230,18 +230,18 @@ bidirectional_test_list_print(const struct bidirectional_test *test,
                        printf("\t\t.cplen      = %zu,\n", test[i].cplen);
        
                        printf("\t\t.mode       = (enum "
       -                       "grapheme_bidirectional_override[]){");
       +                       "grapheme_bidirectional_direction[]){");
                        for (j = 0; j < test[i].modelen; j++) {
                                if (test[i].mode[j] ==
       -                            GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL) {
       -                                printf(" GRAPHEME_BIDIRECTIONAL_OVERRIDE_"
       +                            GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL) {
       +                                printf(" GRAPHEME_BIDIRECTIONAL_DIRECTION_"
                                               "NEUTRAL");
                                } else if (test[i].mode[j] ==
       -                                   GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) {
       -                                printf(" GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR");
       +                                   GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
       +                                printf(" GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR");
                                } else if (test[i].mode[j] ==
       -                                   GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) {
       -                                printf(" GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL");
       +                                   GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
       +                                printf(" GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL");
                                }
                                if (j + 1 < test[i].modelen) {
                                        putchar(',');
       @@ -374,32 +374,32 @@ test_callback(const char *file, char **field, size_t nfields, char *comment,
                                exit(1);
                        } else if (field[1][0] == '2') {
                                test[testlen - 1].mode[0] =
       -                                GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR;
       +                                GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR;
                                test[testlen - 1].modelen = 1;
                        } else if (field[1][0] == '3') {
                                /* auto=0 and LTR=1 */
                                test[testlen - 1].mode[0] =
       -                                GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL;
       +                                GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
                                test[testlen - 1].mode[1] =
       -                                GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR;
       +                                GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR;
                                test[testlen - 1].modelen = 2;
                        } else if (field[1][0] == '4') {
                                test[testlen - 1].mode[0] =
       -                                GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL;
       +                                GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
                                test[testlen - 1].modelen = 1;
                        } else if (field[1][0] == '5') {
                                test[testlen - 1].mode[0] =
       -                                GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL;
       +                                GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
                                test[testlen - 1].mode[1] =
       -                                GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL;
       +                                GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
                                test[testlen - 1].modelen = 2;
                        } else if (field[1][0] == '7') {
                                test[testlen - 1].mode[0] =
       -                                GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL;
       +                                GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
                                test[testlen - 1].mode[1] =
       -                                GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR;
       +                                GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR;
                                test[testlen - 1].mode[2] =
       -                                GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL;
       +                                GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
                                test[testlen - 1].modelen = 3;
                        } else {
                                fprintf(stderr,
       @@ -445,12 +445,14 @@ character_test_callback(const char *file, char **field, size_t nfields,
                        fprintf(stderr, "malformed paragraph-level-setting.\n");
                        exit(1);
                } else if (field[1][0] == '0') {
       -                test[testlen - 1].mode[0] = GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR;
       +                test[testlen - 1].mode[0] =
       +                        GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR;
                } else if (field[1][0] == '1') {
       -                test[testlen - 1].mode[0] = GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL;
       +                test[testlen - 1].mode[0] =
       +                        GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
                } else if (field[1][0] == '2') {
                        test[testlen - 1].mode[0] =
       -                        GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL;
       +                        GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
                } else {
                        fprintf(stderr, "unhandled paragraph-level-setting.\n");
                        exit(1);
 (DIR) diff --git a/grapheme.h b/grapheme.h
       @@ -8,31 +8,23 @@
        
        #define GRAPHEME_INVALID_CODEPOINT UINT32_C(0xFFFD)
        
       -/* TODO call it simply "direction" without override */
       -enum grapheme_bidirectional_override {
       -        GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL,
       -        GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR,
       -        GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL,
       +enum grapheme_bidirectional_direction {
       +        GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL,
       +        GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR,
       +        GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL,
        };
        
        size_t grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *,
                                                                size_t, int_least8_t *,
                                                                size_t);
        
       -size_t grapheme_bidirectional_preprocess(const uint_least32_t *, size_t,
       -                                         enum grapheme_bidirectional_override,
       -                                         uint_least32_t *, size_t);
       -size_t
       -grapheme_bidirectional_preprocess_utf8(const char *, size_t,
       -                                       enum grapheme_bidirectional_override,
       -                                       uint_least32_t *, size_t);
       +size_t grapheme_bidirectional_preprocess_paragraph(
       +        const uint_least32_t *, size_t, enum grapheme_bidirectional_direction,
       +        uint_least32_t *, size_t, enum grapheme_bidirectional_direction *);
        
        size_t grapheme_bidirectional_reorder_line(const uint_least32_t *,
       -                                           const int_least8_t *, size_t,
       +                                           const uint_least32_t *, size_t,
                                                   uint_least32_t *, size_t);
       -size_t grapheme_bidirectional_reorder_line_utf8(const char *,
       -                                                const int_least8_t *, size_t,
       -                                                char *, size_t);
        
        size_t grapheme_decode_utf8(const char *, size_t, uint_least32_t *);
        size_t grapheme_encode_utf8(uint_least32_t, char *, size_t);
 (DIR) diff --git a/src/bidirectional.c b/src/bidirectional.c
       @@ -895,28 +895,17 @@ preprocess_isolating_run_sequence(uint_least32_t *buf, size_t buflen,
        }
        
        static uint_least8_t
       -get_paragraph_level(enum grapheme_bidirectional_override override,
       -                    bool terminate_on_pdi, const uint_least32_t *buf,
       -                    size_t buflen)
       +get_isolated_paragraph_level(const uint_least32_t *state, size_t statelen)
        {
                enum bidi_property prop;
                int_least8_t isolate_level;
       -        size_t bufoff;
       +        size_t stateoff;
        
       -        /* check overrides first according to rule HL1 */
       -        if (override == GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) {
       -                return 0;
       -        } else if (override == GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) {
       -                return 1;
       -        }
       -
       -        /* determine paragraph level (rules P1-P3) */
       +        /* determine paragraph level (rules P1-P3) and terminate on PDI */
       +        for (stateoff = 0, isolate_level = 0; stateoff < statelen; stateoff++) {
       +                prop = get_state(STATE_PROP, state[stateoff]);
        
       -        for (bufoff = 0, isolate_level = 0; bufoff < buflen; bufoff++) {
       -                prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]);
       -
       -                if (prop == BIDI_PROP_PDI && isolate_level == 0 &&
       -                    terminate_on_pdi) {
       +                if (prop == BIDI_PROP_PDI && isolate_level == 0) {
                                /*
                                 * we are in a FSI-subsection of a paragraph and
                                 * matched with the terminating PDI
       @@ -950,28 +939,86 @@ get_paragraph_level(enum grapheme_bidirectional_override override,
                return 0;
        }
        
       +static inline uint_least8_t
       +get_bidi_property(uint_least32_t cp)
       +{
       +        if (likely(cp <= 0x10FFFF)) {
       +                return (bidi_minor[bidi_major[cp >> 8] + (cp & 0xff)]) &
       +                       0x1F /* 00011111 */;
       +        } else {
       +                return BIDI_PROP_L;
       +        }
       +}
       +
       +static uint_least8_t
       +get_paragraph_level(enum grapheme_bidirectional_direction override,
       +                    const HERODOTUS_READER *r)
       +{
       +        HERODOTUS_READER tmp;
       +        enum bidi_property prop;
       +        int_least8_t isolate_level;
       +        uint_least32_t cp;
       +
       +        /* check overrides first according to rule HL1 */
       +        if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
       +                return 0;
       +        } else if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
       +                return 1;
       +        }
       +
       +        /* copy reader into temporary reader */
       +        herodotus_reader_copy(r, &tmp);
       +
       +        /* determine paragraph level (rules P1-P3) */
       +        for (isolate_level = 0; herodotus_read_codepoint(&tmp, true, &cp) ==
       +                                HERODOTUS_STATUS_SUCCESS;) {
       +                prop = get_bidi_property(cp);
       +
       +                /* BD8/BD9 */
       +                if ((prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
       +                     prop == BIDI_PROP_FSI) &&
       +                    isolate_level < MAX_DEPTH) {
       +                        /* we hit an isolate initiator, increment counter */
       +                        isolate_level++;
       +                } else if (prop == BIDI_PROP_PDI && isolate_level > 0) {
       +                        isolate_level--;
       +                }
       +
       +                /* P2 */
       +                if (isolate_level > 0) {
       +                        continue;
       +                }
       +
       +                /* P3 */
       +                if (prop == BIDI_PROP_L) {
       +                        return 0;
       +                } else if (prop == BIDI_PROP_AL || prop == BIDI_PROP_R) {
       +                        return 1;
       +                }
       +        }
       +
       +        return 0;
       +}
       +
        static void
       -preprocess_paragraph(enum grapheme_bidirectional_override override,
       -                     uint_least32_t *buf, size_t buflen)
       +preprocess_paragraph(uint_least8_t paragraph_level, uint_least32_t *buf,
       +                     size_t buflen)
        {
                enum bidi_property prop;
                int_least8_t level;
        
                struct {
                        int_least8_t level;
       -                enum grapheme_bidirectional_override override;
       +                enum grapheme_bidirectional_direction override;
                        bool directional_isolate;
                } directional_status[MAX_DEPTH + 2], *dirstat = directional_status;
        
                size_t overflow_isolate_count, overflow_embedding_count,
                        valid_isolate_count, bufoff, i, runsince;
       -        uint_least8_t paragraph_level;
       -
       -        paragraph_level = get_paragraph_level(override, false, buf, buflen);
        
                /* X1 */
                dirstat->level = (int_least8_t)paragraph_level;
       -        dirstat->override = GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL;
       +        dirstat->override = GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
                dirstat->directional_isolate = false;
                overflow_isolate_count = overflow_embedding_count =
                        valid_isolate_count = 0;
       @@ -995,7 +1042,7 @@ again:
                                                (dirstat - 1)->level +
                                                ((dirstat - 1)->level % 2 != 0) + 1;
                                        dirstat->override =
       -                                        GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL;
       +                                        GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
                                        dirstat->directional_isolate = false;
                                } else {
                                        /* overflow RLE */
       @@ -1014,7 +1061,7 @@ again:
                                                (dirstat - 1)->level +
                                                ((dirstat - 1)->level % 2 == 0) + 1;
                                        dirstat->override =
       -                                        GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL;
       +                                        GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
                                        dirstat->directional_isolate = false;
                                } else {
                                        /* overflow LRE */
       @@ -1033,7 +1080,7 @@ again:
                                                (dirstat - 1)->level +
                                                ((dirstat - 1)->level % 2 != 0) + 1;
                                        dirstat->override =
       -                                        GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL;
       +                                        GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
                                        dirstat->directional_isolate = false;
                                } else {
                                        /* overflow RLO */
       @@ -1052,7 +1099,7 @@ again:
                                                (dirstat - 1)->level +
                                                ((dirstat - 1)->level % 2 == 0) + 1;
                                        dirstat->override =
       -                                        GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR;
       +                                        GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR;
                                        dirstat->directional_isolate = false;
                                } else {
                                        /* overflow LRO */
       @@ -1063,11 +1110,11 @@ again:
                                /* X5a */
                                set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
                                if (dirstat->override ==
       -                            GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) {
       +                            GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
                                        set_state(STATE_PROP, BIDI_PROP_L,
                                                  &(buf[bufoff]));
                                } else if (dirstat->override ==
       -                                   GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) {
       +                                   GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
                                        set_state(STATE_PROP, BIDI_PROP_R,
                                                  &(buf[bufoff]));
                                }
       @@ -1084,7 +1131,7 @@ again:
                                                (dirstat - 1)->level +
                                                ((dirstat - 1)->level % 2 != 0) + 1;
                                        dirstat->override =
       -                                        GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL;
       +                                        GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
                                        dirstat->directional_isolate = true;
                                } else {
                                        /* overflow RLI */
       @@ -1094,11 +1141,11 @@ again:
                                /* X5b */
                                set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
                                if (dirstat->override ==
       -                            GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) {
       +                            GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
                                        set_state(STATE_PROP, BIDI_PROP_L,
                                                  &(buf[bufoff]));
                                } else if (dirstat->override ==
       -                                   GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) {
       +                                   GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
                                        set_state(STATE_PROP, BIDI_PROP_R,
                                                  &(buf[bufoff]));
                                }
       @@ -1115,7 +1162,7 @@ again:
                                                (dirstat - 1)->level +
                                                ((dirstat - 1)->level % 2 == 0) + 1;
                                        dirstat->override =
       -                                        GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL;
       +                                        GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
                                        dirstat->directional_isolate = true;
                                } else {
                                        /* overflow LRI */
       @@ -1123,9 +1170,8 @@ again:
                                }
                        } else if (prop == BIDI_PROP_FSI) {
                                /* X5c */
       -                        if (get_paragraph_level(
       -                                    GRAPHEME_BIDIRECTIONAL_OVERRIDE_NEUTRAL,
       -                                    true, buf + (bufoff + 1),
       +                        if (get_isolated_paragraph_level(
       +                                    buf + (bufoff + 1),
                                            buflen - (bufoff + 1)) == 1) {
                                        prop = BIDI_PROP_RLI;
                                        goto again;
       @@ -1138,11 +1184,11 @@ again:
                                /* X6 */
                                set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
                                if (dirstat->override ==
       -                            GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) {
       +                            GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
                                        set_state(STATE_PROP, BIDI_PROP_L,
                                                  &(buf[bufoff]));
                                } else if (dirstat->override ==
       -                                   GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) {
       +                                   GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
                                        set_state(STATE_PROP, BIDI_PROP_R,
                                                  &(buf[bufoff]));
                                }
       @@ -1190,11 +1236,11 @@ again:
        
                                set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
                                if (dirstat->override ==
       -                            GRAPHEME_BIDIRECTIONAL_OVERRIDE_LTR) {
       +                            GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
                                        set_state(STATE_PROP, BIDI_PROP_L,
                                                  &(buf[bufoff]));
                                } else if (dirstat->override ==
       -                                   GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL) {
       +                                   GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
                                        set_state(STATE_PROP, BIDI_PROP_R,
                                                  &(buf[bufoff]));
                                }
       @@ -1317,17 +1363,6 @@ again:
        }
        
        static inline uint_least8_t
       -get_bidi_property(uint_least32_t cp)
       -{
       -        if (likely(cp <= 0x10FFFF)) {
       -                return (bidi_minor[bidi_major[cp >> 8] + (cp & 0xff)]) &
       -                       0x1F /* 00011111 */;
       -        } else {
       -                return BIDI_PROP_L;
       -        }
       -}
       -
       -static inline uint_least8_t
        get_bidi_bracket_off(uint_least32_t cp)
        {
                if (likely(cp <= 0x10FFFF)) {
       @@ -1340,20 +1375,35 @@ get_bidi_bracket_off(uint_least32_t cp)
        }
        
        static size_t
       -preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_override override,
       -           uint_least32_t *buf, size_t buflen)
       +preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_direction override,
       +           uint_least32_t *buf, size_t buflen,
       +           enum grapheme_bidirectional_direction *resolved)
        {
       -        size_t bufoff, bufsize, lastparoff;
       +        HERODOTUS_READER tmp;
       +        size_t bufoff, bufsize, paragraph_len;
                uint_least32_t cp;
       +        uint_least8_t paragraph_level;
        
       -        if (buf == NULL) {
       -                for (; herodotus_read_codepoint(r, true, &cp) ==
       -                       HERODOTUS_STATUS_SUCCESS;) {
       -                        ;
       +        /* determine length and level of the paragraph */
       +        herodotus_reader_copy(r, &tmp);
       +        for (; herodotus_read_codepoint(&tmp, true, &cp) ==
       +               HERODOTUS_STATUS_SUCCESS;) {
       +                /* break on paragraph separator */
       +                if (get_bidi_property(cp) == BIDI_PROP_B) {
       +                        break;
                        }
       +        }
       +        paragraph_len = herodotus_reader_number_read(&tmp);
       +        paragraph_level = get_paragraph_level(override, r);
       +
       +        if (resolved != NULL) {
       +                /* store resolved paragraph level in output variable */
       +                *resolved = paragraph_level;
       +        }
        
       +        if (buf == NULL) {
                        /* see below for return value reasoning */
       -                return herodotus_reader_number_read(r);
       +                return paragraph_len;
                }
        
                /*
       @@ -1361,6 +1411,7 @@ preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_override override,
                 * and store them in the buffer
                 */
                for (bufoff = 0;
       +             bufoff < paragraph_len &&
                     herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCESS;
                     bufoff++) {
                        if (bufoff < buflen) {
       @@ -1385,7 +1436,7 @@ preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_override override,
                }
                bufsize = herodotus_reader_number_read(r);
        
       -        for (bufoff = 0, lastparoff = 0; bufoff < bufsize; bufoff++) {
       +        for (bufoff = 0; bufoff < bufsize; bufoff++) {
                        if (get_state(STATE_PROP, buf[bufoff]) != BIDI_PROP_B &&
                            bufoff != bufsize - 1) {
                                continue;
       @@ -1398,9 +1449,8 @@ preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_override override,
                         * the terminating character or last character of the
                         * string respectively
                         */
       -                preprocess_paragraph(override, buf + lastparoff,
       -                                     bufoff + 1 - lastparoff);
       -                lastparoff = bufoff + 1;
       +                preprocess_paragraph(paragraph_level, buf, bufoff + 1);
       +                break;
                }
        
                /*
       @@ -1411,50 +1461,41 @@ preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_override override,
        }
        
        size_t
       -grapheme_bidirectional_preprocess(const uint_least32_t *src, size_t srclen,
       -                                  enum grapheme_bidirectional_override override,
       -                                  uint_least32_t *dest, size_t destlen)
       +grapheme_bidirectional_preprocess_paragraph(
       +        const uint_least32_t *src, size_t srclen,
       +        enum grapheme_bidirectional_direction override, uint_least32_t *dest,
       +        size_t destlen, enum grapheme_bidirectional_direction *resolved)
        {
                HERODOTUS_READER r;
        
                herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
        
       -        return preprocess(&r, override, dest, destlen);
       +        return preprocess(&r, override, dest, destlen, resolved);
        }
        
       -size_t
       -grapheme_bidirectional_preprocess_utf8(
       -        const char *src, size_t srclen,
       -        enum grapheme_bidirectional_override override, uint_least32_t *dest,
       -        size_t destlen)
       -{
       -        HERODOTUS_READER r;
       -
       -        herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
       -
       -        return preprocess(&r, override, dest, destlen);
       -}
       -
       -size_t
       -grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *linedata,
       -                                                 size_t linelen,
       -                                                 int_least8_t *lev,
       -                                                 size_t levlen)
       +static inline size_t
       +get_line_embedding_levels(const uint_least32_t *linedata, size_t linelen,
       +                          int_least8_t (*get_level)(const void *, size_t),
       +                          void (*set_level)(void *, size_t, int_least8_t),
       +                          void *lev, size_t levsize, bool skipignored)
        {
                enum bidi_property prop;
       -        size_t i, runsince;
       -        int_least8_t level;
       +        size_t i, levlen, runsince;
       +        int_least8_t level, runlevel;
        
                /* rule L1.4 */
                runsince = SIZE_MAX;
       -        for (i = 0; i < linelen; i++) {
       +        for (i = 0, levlen = 0; i < linelen; i++) {
                        level = (int_least8_t)get_state(STATE_LEVEL, linedata[i]);
                        prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP,
                                                        linedata[i]);
        
                        /* write level into level array if we still have space */
       -                if (i < levlen) {
       -                        lev[i] = level;
       +                if (level != -1 || skipignored == false) {
       +                        if (levlen <= levsize) {
       +                                set_level(lev, levlen, level);
       +                        }
       +                        levlen++;
                        }
        
                        if (level == -1) {
       @@ -1467,11 +1508,14 @@ grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *linedata,
                            prop == BIDI_PROP_PDI) {
                                if (runsince == SIZE_MAX) {
                                        /* a new run has begun */
       -                                runsince = i;
       +                                runsince = levlen - 1; /* levlen > 0 */
       +                                runlevel = get_state(STATE_PARAGRAPH_LEVEL,
       +                                                     linedata[i]);
                                }
                        } else {
                                /* sequence ended */
                                runsince = SIZE_MAX;
       +                        runlevel = -1;
                        }
                }
                if (runsince != SIZE_MAX) {
       @@ -1479,13 +1523,207 @@ grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *linedata,
                         * we hit the end of the line but were in a run;
                         * reset the line levels to the paragraph level
                         */
       -                for (i = runsince; i < MIN(linelen, levlen); i++) {
       -                        if (lev[i] != -1) {
       -                                lev[i] = (int_least8_t)get_state(
       -                                        STATE_PARAGRAPH_LEVEL, linedata[i]);
       +                for (i = runsince; i < MIN(linelen, levsize); i++) {
       +                        if (get_level(lev, i) != -1) {
       +                                set_level(lev, i, runlevel);
       +                        }
       +                }
       +        }
       +
       +        return levlen;
       +}
       +
       +static inline int_least8_t
       +get_level_int8(const void *lev, size_t off)
       +{
       +        return ((int_least8_t *)lev)[off];
       +}
       +
       +static inline void
       +set_level_int8(void *lev, size_t off, int_least8_t value)
       +{
       +        ((int_least8_t *)lev)[off] = value;
       +}
       +
       +size_t
       +grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *linedata,
       +                                                 size_t linelen,
       +                                                 int_least8_t *lev,
       +                                                 size_t levlen)
       +{
       +        return get_line_embedding_levels(linedata, linelen, get_level_int8,
       +                                         set_level_int8, lev, levlen, false);
       +}
       +
       +static inline int_least8_t
       +get_level_uint32(const void *lev, size_t off)
       +{
       +        return (int_least8_t)((((uint_least32_t *)lev)[off] &
       +                               UINT32_C(0x1FE00000)) >>
       +                              21) -
       +               1;
       +}
       +
       +static inline void
       +set_level_uint32(void *lev, size_t off, int_least8_t value)
       +{
       +        ((uint_least32_t *)lev)[off] ^=
       +                ((uint_least32_t *)lev)[off] & UINT32_C(0x1FE00000);
       +        ((uint_least32_t *)lev)[off] |= ((uint_least32_t)(value + 1)) << 21;
       +}
       +
       +static inline int_least16_t
       +get_mirror_offset(uint_least32_t cp)
       +{
       +        if (cp <= UINT32_C(0x10FFFF)) {
       +                return mirror_minor[mirror_major[cp >> 8] + (cp & 0xFF)];
       +        } else {
       +                return 0;
       +        }
       +}
       +
       +size_t
       +grapheme_bidirectional_reorder_line(const uint_least32_t *line,
       +                                    const uint_least32_t *linedata,
       +                                    size_t linelen, uint_least32_t *output,
       +                                    size_t outputsize)
       +{
       +        size_t i, outputlen, first, last, j, k, l, laststart;
       +        int_least8_t level, min_odd_level = MAX_DEPTH + 2, max_level = 0;
       +        uint_least32_t tmp;
       +
       +        /* write output characters */
       +        for (i = 0, outputlen = 0; i < linelen; i++) {
       +                if (get_state(STATE_LEVEL, linedata[i]) != -1) {
       +                        if (outputlen < outputsize) {
       +                                output[outputlen] = line[i];
       +                        }
       +                        outputlen++;
       +                }
       +        }
       +        if (outputlen >= outputsize) {
       +                /* clear output buffer */
       +                for (i = 0; i < outputsize; i++) {
       +                        output[i] = GRAPHEME_INVALID_CODEPOINT;
       +                }
       +
       +                /* return required size */
       +                return outputlen;
       +        }
       +
       +        /*
       +         * write line embedding levels as metadata and codepoints into the
       +         * output
       +         */
       +        get_line_embedding_levels(linedata, linelen, get_level_uint32,
       +                                  set_level_uint32, output, linelen, true);
       +
       +        /* determine level range */
       +        for (i = 0; i < outputlen; i++) {
       +                level = get_level_uint32(output, i);
       +
       +                if (level == -1) {
       +                        /* ignored character */
       +                        continue;
       +                }
       +
       +                if (level % 2 == 1 && level < min_odd_level) {
       +                        min_odd_level = level;
       +                }
       +                if (level > max_level) {
       +                        max_level = level;
       +                }
       +        }
       +
       +        for (level = max_level; level >= min_odd_level /* > 0 */; level--) {
       +                for (i = 0; i < outputlen; i++) {
       +                        if (get_level_uint32(output, i) >= level) {
       +                                /*
       +                                 * the current character has the desired level
       +                                 */
       +                                first = last = i;
       +
       +                                /* find the end of the level-sequence */
       +                                for (i++; i < outputlen; i++) {
       +                                        if (get_level_uint32(output, i) >=
       +                                            level) {
       +                                                /* the sequence continues */
       +                                                last = i;
       +                                        } else {
       +                                                break;
       +                                        }
       +                                }
       +
       +                                /* invert the sequence first..last respecting
       +                                 * grapheme clusters
       +                                 *
       +                                 * The standard only speaks of combining marks
       +                                 * inversion, but we should in the perfect case
       +                                 * respect _all_ grapheme clusters, which we do
       +                                 * here!
       +                                 */
       +
       +                                /* mark grapheme cluster breaks */
       +                                for (j = first; j <= last;
       +                                     j += grapheme_next_character_break(
       +                                             line + j, outputlen - j)) {
       +                                        /*
       +                                         * we use a special trick here: The
       +                                         * first 21 bits of the state are filled
       +                                         * with the codepoint, the next 8 bits
       +                                         * are used for the level, so we can use
       +                                         * the 30th bit to mark the grapheme
       +                                         * cluster breaks. This allows us to
       +                                         * reinvert the grapheme clusters into
       +                                         * the proper direction later.
       +                                         */
       +                                        output[j] |= UINT32_C(1) << 29;
       +                                }
       +
       +                                /* global inversion */
       +                                for (k = first, l = last; k < l; k++, l--) {
       +                                        /* swap */
       +                                        tmp = output[k];
       +                                        output[k] = output[l];
       +                                        output[l] = tmp;
       +                                }
       +
       +                                /* grapheme cluster reinversion */
       +#if 0
       +                                for (j = first, laststart = first; j <= last;
       +                                     j++) {
       +                                        if (output[j] & (UINT32_C(1) << 29)) {
       +                                                /* we hit a mark! given the
       +                                                 * grapheme cluster is inverted,
       +                                                 * this means that the cluster
       +                                                 * ended and we now reinvert it
       +                                                 * again
       +                                                 */
       +                                                for (k = laststart, l = j;
       +                                                     k < l; k++, l--) {
       +                                                        /* swap */
       +                                                        tmp = output[k];
       +                                                        output[k] = output[l];
       +                                                        output[l] = tmp;
       +                                                }
       +                                                laststart = j + 1;
       +                                        }
       +                                }
       +#endif
       +
       +                                /* unmark grapheme cluster breaks */
       +                                for (j = first; j <= last; j++) {
       +                                        output[j] ^= output[j] &
       +                                                     UINT32_C(0x20000000);
       +                                }
                                }
                        }
                }
        
       -        return linelen;
       +        /* remove embedding level metadata */
       +        for (i = 0; i < outputlen; i++) {
       +                output[i] ^= output[i] & UINT32_C(0x1FE00000);
       +        }
       +
       +        return outputlen;
        }