bidirectional.c - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       bidirectional.c (48331B)
       ---
            1 /* See LICENSE file for copyright and license details. */
            2 #include <stdbool.h>
            3 #include <stddef.h>
            4 
            5 #include "../gen/bidirectional.h"
            6 #include "../grapheme.h"
            7 #include "util.h"
            8 
            9 #define MAX_DEPTH 125
           10 
           11 enum state_type {
           12         STATE_PROP,            /* in 0..23, bidi_property */
           13         STATE_PRESERVED_PROP,  /* in 0..23, preserved bidi_prop for L1-rule */
           14         STATE_BRACKET_OFF,     /* in 0..255, offset in bidi_bracket */
           15         STATE_LEVEL,           /* in 0..MAX_DEPTH+1=126, embedding level */
           16         STATE_PARAGRAPH_LEVEL, /* in 0..1, paragraph embedding level */
           17         STATE_VISITED,         /* in 0..1, visited within isolating run */
           18 };
           19 
           20 static struct {
           21         uint_least32_t filter_mask;
           22         size_t mask_shift;
           23         int_least16_t value_offset;
           24 } state_lut[] = {
           25         [STATE_PROP] = {
           26                 .filter_mask  = 0x000001F, /* 00000000 00000000 00000000 00011111 */
           27                 .mask_shift   = 0,
           28                 .value_offset = 0,
           29         },
           30         [STATE_PRESERVED_PROP] = {
           31                 .filter_mask  = 0x00003E0, /* 00000000 00000000 00000011 11100000 */
           32                 .mask_shift   = 5,
           33                 .value_offset = 0,
           34         },
           35         [STATE_BRACKET_OFF] = {
           36                 .filter_mask  = 0x003FC00, /* 00000000 00000011 11111100 00000000 */
           37                 .mask_shift   = 10,
           38                 .value_offset = 0,
           39         },
           40         [STATE_LEVEL] = {
           41                 .filter_mask  = 0x1FC0000, /* 00000001 11111100 00000000 00000000 */
           42                 .mask_shift   = 18,
           43                 .value_offset = -1,
           44         },
           45         [STATE_PARAGRAPH_LEVEL] = {
           46                 .filter_mask  = 0x2000000, /* 00000010 00000000 00000000 00000000 */
           47                 .mask_shift   = 25,
           48                 .value_offset = 0,
           49         },
           50         [STATE_VISITED] = {
           51                 .filter_mask  = 0x4000000, /* 00000100 00000000 00000000 00000000 */
           52                 .mask_shift   = 26,
           53                 .value_offset = 0,
           54         },
           55 };
           56 
           57 static inline int_least16_t
           58 get_state(enum state_type t, uint_least32_t input)
           59 {
           60         return (int_least16_t)((input & state_lut[t].filter_mask) >>
           61                                state_lut[t].mask_shift) +
           62                state_lut[t].value_offset;
           63 }
           64 
           65 static inline void
           66 set_state(enum state_type t, int_least16_t value, uint_least32_t *output)
           67 {
           68         *output &= ~state_lut[t].filter_mask;
           69         *output |= ((uint_least32_t)(value - state_lut[t].value_offset)
           70                     << state_lut[t].mask_shift) &
           71                    state_lut[t].filter_mask;
           72 }
           73 
           74 struct isolate_runner {
           75         uint_least32_t *buf;
           76         size_t buflen;
           77         size_t start;
           78 
           79         struct {
           80                 size_t off;
           81         } prev, cur, next;
           82 
           83         enum bidi_property sos, eos;
           84 
           85         uint_least8_t paragraph_level;
           86         int_least8_t isolating_run_level;
           87 };
           88 
           89 static inline enum bidi_property
           90 ir_get_previous_prop(const struct isolate_runner *ir)
           91 {
           92         return (ir->prev.off == SIZE_MAX) ?
           93                        ir->sos :
           94                        (uint_least8_t)get_state(STATE_PROP,
           95                                                 ir->buf[ir->prev.off]);
           96 }
           97 
           98 static inline enum bidi_property
           99 ir_get_current_prop(const struct isolate_runner *ir)
          100 {
          101         return (uint_least8_t)get_state(STATE_PROP, ir->buf[ir->cur.off]);
          102 }
          103 
          104 static inline enum bidi_property
          105 ir_get_next_prop(const struct isolate_runner *ir)
          106 {
          107         return (ir->next.off == SIZE_MAX) ?
          108                        ir->eos :
          109                        (uint_least8_t)get_state(STATE_PROP,
          110                                                 ir->buf[ir->next.off]);
          111 }
          112 
          113 static inline enum bidi_property
          114 ir_get_current_preserved_prop(const struct isolate_runner *ir)
          115 {
          116         return (uint_least8_t)get_state(STATE_PRESERVED_PROP,
          117                                         ir->buf[ir->cur.off]);
          118 }
          119 
          120 static inline int_least8_t
          121 ir_get_current_level(const struct isolate_runner *ir)
          122 {
          123         return (int_least8_t)get_state(STATE_LEVEL, ir->buf[ir->cur.off]);
          124 }
          125 
          126 static inline const struct bracket *
          127 ir_get_current_bracket_prop(const struct isolate_runner *ir)
          128 {
          129         return bidi_bracket +
          130                (int_least8_t)get_state(STATE_BRACKET_OFF, ir->buf[ir->cur.off]);
          131 }
          132 
          133 static void
          134 ir_set_current_prop(const struct isolate_runner *ir, enum bidi_property prop)
          135 {
          136         set_state(STATE_PROP, (int_least16_t)prop, &(ir->buf[ir->cur.off]));
          137 }
          138 
          139 static void
          140 ir_init(uint_least32_t *buf, size_t buflen, size_t off,
          141         uint_least8_t paragraph_level, bool within, struct isolate_runner *ir)
          142 {
          143         size_t i;
          144         int_least8_t sos_level;
          145 
          146         /* initialize invariants */
          147         ir->buf = buf;
          148         ir->buflen = buflen;
          149         ir->paragraph_level = paragraph_level;
          150         ir->start = off;
          151 
          152         /* advance off until we are at a non-removed character */
          153         for (; off < buflen; off++) {
          154                 if (get_state(STATE_LEVEL, buf[off]) != -1) {
          155                         break;
          156                 }
          157         }
          158         if (off == buflen) {
          159                 /* we encountered no more non-removed character, terminate */
          160                 ir->next.off = SIZE_MAX;
          161                 return;
          162         }
          163 
          164         /* set the isolating run level to that of the current offset */
          165         ir->isolating_run_level =
          166                 (int_least8_t)get_state(STATE_LEVEL, buf[off]);
          167 
          168         /* initialize sos and eos to dummy values */
          169         ir->sos = ir->eos = NUM_BIDI_PROPS;
          170 
          171         /*
          172          * we write the information of the "current" state into next,
          173          * so that the shift-in at the first advancement moves it in
          174          * cur, as desired.
          175          */
          176         ir->next.off = off;
          177 
          178         /*
          179          * determine the previous state but store its offset in cur.off,
          180          * given it's shifted in on the first advancement
          181          */
          182         ir->cur.off = SIZE_MAX;
          183         for (i = off, sos_level = -1; i >= 1; i--) {
          184                 if (get_state(STATE_LEVEL, buf[i - 1]) != -1) {
          185                         /*
          186                          * we found a character that has not been
          187                          * removed in X9
          188                          */
          189                         sos_level = (int_least8_t)get_state(STATE_LEVEL,
          190                                                             buf[i - 1]);
          191 
          192                         if (within) {
          193                                 /* we just take it */
          194                                 ir->cur.off = i;
          195                         }
          196 
          197                         break;
          198                 }
          199         }
          200         if (sos_level == -1) {
          201                 /*
          202                  * there were no preceding non-removed characters, set
          203                  * sos-level to paragraph embedding level
          204                  */
          205                 sos_level = (int_least8_t)paragraph_level;
          206         }
          207 
          208         if (!within || ir->cur.off == SIZE_MAX) {
          209                 /*
          210                  * we are at the beginning of the sequence; initialize
          211                  * it faithfully according to the algorithm by looking
          212                  * at the sos-level
          213                  */
          214                 if (MAX(sos_level, ir->isolating_run_level) % 2 == 0) {
          215                         /* the higher level is even, set sos to L */
          216                         ir->sos = BIDI_PROP_L;
          217                 } else {
          218                         /* the higher level is odd, set sos to R */
          219                         ir->sos = BIDI_PROP_R;
          220                 }
          221         }
          222 }
          223 
          224 static int
          225 ir_advance(struct isolate_runner *ir)
          226 {
          227         enum bidi_property prop;
          228         int_least8_t level, isolate_level, last_isolate_level;
          229         size_t i;
          230 
          231         if (ir->next.off == SIZE_MAX) {
          232                 /* the sequence is over */
          233                 return 1;
          234         }
          235 
          236         /* shift in */
          237         ir->prev.off = ir->cur.off;
          238         ir->cur.off = ir->next.off;
          239 
          240         /* mark as visited */
          241         set_state(STATE_VISITED, 1, &(ir->buf[ir->cur.off]));
          242 
          243         /* initialize next state by going to the next character in the sequence
          244          */
          245         ir->next.off = SIZE_MAX;
          246 
          247         last_isolate_level = -1;
          248         for (i = ir->cur.off, isolate_level = 0; i < ir->buflen; i++) {
          249                 level = (int_least8_t)get_state(STATE_LEVEL, ir->buf[i]);
          250                 prop = (uint_least8_t)get_state(STATE_PROP, ir->buf[i]);
          251 
          252                 if (level == -1) {
          253                         /* this is one of the ignored characters, skip */
          254                         continue;
          255                 } else if (level == ir->isolating_run_level) {
          256                         last_isolate_level = level;
          257                 }
          258 
          259                 /* follow BD8/BD9 and P2 to traverse the current sequence */
          260                 if (prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
          261                     prop == BIDI_PROP_FSI) {
          262                         /*
          263                          * we encountered an isolate initiator, increment
          264                          * counter, but go into processing when we
          265                          * were not isolated before
          266                          */
          267                         if (isolate_level < MAX_DEPTH) {
          268                                 isolate_level++;
          269                         }
          270                         if (isolate_level != 1) {
          271                                 continue;
          272                         }
          273                 } else if (prop == BIDI_PROP_PDI && isolate_level > 0) {
          274                         isolate_level--;
          275 
          276                         /*
          277                          * if the current PDI dropped the isolate-level
          278                          * to zero, it is itself part of the isolating
          279                          * run sequence; otherwise we simply continue.
          280                          */
          281                         if (isolate_level > 0) {
          282                                 continue;
          283                         }
          284                 } else if (isolate_level > 0) {
          285                         /* we are in an isolating sequence */
          286                         continue;
          287                 }
          288 
          289                 /*
          290                  * now we either still are in our sequence or we hit
          291                  * the eos-case as we left the sequence and hit the
          292                  * first non-isolating-sequence character.
          293                  */
          294                 if (i == ir->cur.off) {
          295                         /* we were in the first initializing round */
          296                         continue;
          297                 } else if (level == ir->isolating_run_level) {
          298                         /* isolate_level-skips have been handled before, we're
          299                          * good */
          300                         /* still in the sequence */
          301                         ir->next.off = i;
          302                 } else {
          303                         /* out of sequence or isolated, compare levels via eos
          304                          */
          305                         ir->next.off = SIZE_MAX;
          306                         if (MAX(last_isolate_level, level) % 2 == 0) {
          307                                 ir->eos = BIDI_PROP_L;
          308                         } else {
          309                                 ir->eos = BIDI_PROP_R;
          310                         }
          311                 }
          312                 break;
          313         }
          314         if (i == ir->buflen) {
          315                 /*
          316                  * the sequence ended before we could grab an offset.
          317                  * we need to determine the eos-prop by comparing the
          318                  * level of the last element in the isolating run sequence
          319                  * with the paragraph level.
          320                  */
          321                 ir->next.off = SIZE_MAX;
          322                 if (MAX(last_isolate_level, ir->paragraph_level) % 2 == 0) {
          323                         /* the higher level is even, set eos to L */
          324                         ir->eos = BIDI_PROP_L;
          325                 } else {
          326                         /* the higher level is odd, set eos to R */
          327                         ir->eos = BIDI_PROP_R;
          328                 }
          329         }
          330 
          331         return 0;
          332 }
          333 
          334 static enum bidi_property
          335 ir_get_last_strong_prop(const struct isolate_runner *ir)
          336 {
          337         struct isolate_runner tmp;
          338         enum bidi_property last_strong_prop = ir->sos, prop;
          339 
          340         ir_init(ir->buf, ir->buflen, ir->start, ir->paragraph_level, false,
          341                 &tmp);
          342         for (; !ir_advance(&tmp) && tmp.cur.off < ir->cur.off;) {
          343                 prop = ir_get_current_prop(&tmp);
          344 
          345                 if (prop == BIDI_PROP_R || prop == BIDI_PROP_L ||
          346                     prop == BIDI_PROP_AL) {
          347                         last_strong_prop = prop;
          348                 }
          349         }
          350 
          351         return last_strong_prop;
          352 }
          353 
          354 static enum bidi_property
          355 ir_get_last_strong_or_number_prop(const struct isolate_runner *ir)
          356 {
          357         struct isolate_runner tmp;
          358         enum bidi_property last_strong_or_number_prop = ir->sos, prop;
          359 
          360         ir_init(ir->buf, ir->buflen, ir->start, ir->paragraph_level, false,
          361                 &tmp);
          362         for (; !ir_advance(&tmp) && tmp.cur.off < ir->cur.off;) {
          363                 prop = ir_get_current_prop(&tmp);
          364 
          365                 if (prop == BIDI_PROP_R || prop == BIDI_PROP_L ||
          366                     prop == BIDI_PROP_AL || prop == BIDI_PROP_EN ||
          367                     prop == BIDI_PROP_AN) {
          368                         last_strong_or_number_prop = prop;
          369                 }
          370         }
          371 
          372         return last_strong_or_number_prop;
          373 }
          374 
          375 static void
          376 preprocess_bracket_pair(const struct isolate_runner *start,
          377                         const struct isolate_runner *end)
          378 {
          379         enum bidi_property prop, bracket_prop, last_strong_or_number_prop;
          380         struct isolate_runner ir;
          381         size_t strong_type_off;
          382 
          383         /*
          384          * check if the bracket contains a strong type (L or R|EN|AN)
          385          */
          386         for (ir = *start, strong_type_off = SIZE_MAX,
          387             bracket_prop = NUM_BIDI_PROPS;
          388              !ir_advance(&ir) && ir.cur.off < end->cur.off;) {
          389                 prop = ir_get_current_prop(&ir);
          390 
          391                 if (prop == BIDI_PROP_L) {
          392                         strong_type_off = ir.cur.off;
          393                         if (ir.isolating_run_level % 2 == 0) {
          394                                 /*
          395                                  * set the type for both brackets to L (so they
          396                                  * match the strong type they contain)
          397                                  */
          398                                 bracket_prop = BIDI_PROP_L;
          399                         }
          400                 } else if (prop == BIDI_PROP_R || prop == BIDI_PROP_EN ||
          401                            prop == BIDI_PROP_AN) {
          402                         strong_type_off = ir.cur.off;
          403                         if (ir.isolating_run_level % 2 != 0) {
          404                                 /*
          405                                  * set the type for both brackets to R (so they
          406                                  * match the strong type they contain)
          407                                  */
          408                                 bracket_prop = BIDI_PROP_R;
          409                         }
          410                 }
          411         }
          412         if (strong_type_off == SIZE_MAX) {
          413                 /*
          414                  * there are no strong types within the brackets and we just
          415                  * leave the brackets as is
          416                  */
          417                 return;
          418         }
          419 
          420         if (bracket_prop == NUM_BIDI_PROPS) {
          421                 /*
          422                  * We encountered a strong type, but it was opposite
          423                  * to the embedding direction.
          424                  * Check the previous strong type before the opening
          425                  * bracket
          426                  */
          427                 last_strong_or_number_prop =
          428                         ir_get_last_strong_or_number_prop(start);
          429                 if (last_strong_or_number_prop == BIDI_PROP_L &&
          430                     ir.isolating_run_level % 2 != 0) {
          431                         /*
          432                          * the previous strong type is also opposite
          433                          * to the embedding direction, so the context
          434                          * was established and we set the brackets
          435                          * accordingly.
          436                          */
          437                         bracket_prop = BIDI_PROP_L;
          438                 } else if ((last_strong_or_number_prop == BIDI_PROP_R ||
          439                             last_strong_or_number_prop == BIDI_PROP_EN ||
          440                             last_strong_or_number_prop == BIDI_PROP_AN) &&
          441                            ir.isolating_run_level % 2 == 0) {
          442                         /*
          443                          * the previous strong type is also opposite
          444                          * to the embedding direction, so the context
          445                          * was established and we set the brackets
          446                          * accordingly.
          447                          */
          448                         bracket_prop = BIDI_PROP_R;
          449                 } else {
          450                         /* set brackets to the embedding direction */
          451                         if (ir.isolating_run_level % 2 == 0) {
          452                                 bracket_prop = BIDI_PROP_L;
          453                         } else {
          454                                 bracket_prop = BIDI_PROP_R;
          455                         }
          456                 }
          457         }
          458 
          459         ir_set_current_prop(start, bracket_prop);
          460         ir_set_current_prop(end, bracket_prop);
          461 
          462         /*
          463          * any sequence of NSMs after opening or closing brackets get
          464          * the same property as the one we set on the brackets
          465          */
          466         for (ir = *start; !ir_advance(&ir) && ir_get_current_preserved_prop(
          467                                                       &ir) == BIDI_PROP_NSM;) {
          468                 ir_set_current_prop(&ir, bracket_prop);
          469         }
          470         for (ir = *end; !ir_advance(&ir) &&
          471                         ir_get_current_preserved_prop(&ir) == BIDI_PROP_NSM;) {
          472                 ir_set_current_prop(&ir, bracket_prop);
          473         }
          474 }
          475 
          476 static void
          477 preprocess_bracket_pairs(uint_least32_t *buf, size_t buflen, size_t off,
          478                          uint_least8_t paragraph_level)
          479 {
          480         /*
          481          * The N0-rule deals with bracket pairs that shall be determined
          482          * with the rule BD16. This is specified as an algorithm with a
          483          * stack of 63 bracket openings that are used to resolve into a
          484          * separate list of pairs, which is then to be sorted by opening
          485          * position. Thus, even though the bracketing-depth is limited
          486          * by 63, the algorithm, as is, requires dynamic memory
          487          * management.
          488          *
          489          * A naive approach (used by Fribidi) would be to screw the
          490          * stack-approach and simply directly determine the
          491          * corresponding closing bracket offset for a given opening
          492          * bracket, leading to O(n²) time complexity in the worst case
          493          * with a lot of brackets. While many brackets are not common,
          494          * it is still possible to find a middle ground where you obtain
          495          * strongly linear time complexity in most common cases:
          496          *
          497          * Instead of a stack, we use a FIFO data structure which is
          498          * filled with bracket openings in the order of appearance (thus
          499          * yielding an implicit sorting!) at the top. If the
          500          * corresponding closing bracket is encountered, it is added to
          501          * the respective entry, making it ready to "move out" at the
          502          * bottom (i.e. passed to the bracket processing). Due to the
          503          * nature of the specified pair detection algorithm, which only
          504          * cares about the bracket type and nothing else (bidi class,
          505          * level, etc.), we can mix processing and bracket detection.
          506          *
          507          * Obviously, if you, for instance, have one big bracket pair at
          508          * the bottom that has not been closed yet, it will block the
          509          * bracket processing and the FIFO might hit its capacity limit.
          510          * At this point, the blockage is manually resolved using the
          511          * naive quadratic approach.
          512          *
          513          * To remain within the specified standard behaviour, which
          514          * mandates that processing of brackets should stop when the
          515          * bracketing-depth is at 63, we simply check in an "overflow"
          516          * scenario if all 63 elements in the LIFO are unfinished, which
          517          * corresponds with such a bracketing depth.
          518          */
          519         enum bidi_property prop;
          520 
          521         struct {
          522                 bool complete;
          523                 size_t bracket_class;
          524                 struct isolate_runner start;
          525                 struct isolate_runner end;
          526         } fifo[63];
          527         const struct bracket *bracket_prop, *tmp_bracket_prop;
          528         struct isolate_runner ir, tmp_ir;
          529         size_t fifo_len = 0, i, blevel, j, k;
          530 
          531         ir_init(buf, buflen, off, paragraph_level, false, &ir);
          532         while (!ir_advance(&ir)) {
          533                 prop = ir_get_current_prop(&ir);
          534                 bracket_prop = ir_get_current_bracket_prop(&ir);
          535                 if (prop == BIDI_PROP_ON &&
          536                     bracket_prop->type == BIDI_BRACKET_OPEN) {
          537                         if (fifo_len == LEN(fifo)) {
          538                                 /*
          539                                  * The FIFO is full, check first if it's
          540                                  * completely blocked (i.e. no finished
          541                                  * bracket pairs, triggering the standard
          542                                  * that mandates to abort in such a case
          543                                  */
          544                                 for (i = 0; i < fifo_len; i++) {
          545                                         if (fifo[i].complete) {
          546                                                 break;
          547                                         }
          548                                 }
          549                                 if (i == fifo_len) {
          550                                         /* abort processing */
          551                                         return;
          552                                 }
          553 
          554                                 /*
          555                                  * by construction, the bottom entry
          556                                  * in the FIFO is guaranteed to be
          557                                  * unfinished (given we "consume" all
          558                                  * finished bottom entries after each
          559                                  * iteration).
          560                                  *
          561                                  * iterate, starting after the opening
          562                                  * bracket, and find the corresponding
          563                                  * closing bracket.
          564                                  *
          565                                  * if we find none, just drop the FIFO
          566                                  * entry silently
          567                                  */
          568                                 for (tmp_ir = fifo[0].start, blevel = 0;
          569                                      !ir_advance(&tmp_ir);) {
          570                                         tmp_bracket_prop =
          571                                                 ir_get_current_bracket_prop(
          572                                                         &tmp_ir);
          573 
          574                                         if (tmp_bracket_prop->type ==
          575                                                     BIDI_BRACKET_OPEN &&
          576                                             tmp_bracket_prop->class ==
          577                                                     bracket_prop->class) {
          578                                                 /* we encountered another
          579                                                  * opening bracket of the
          580                                                  * same class */
          581                                                 blevel++;
          582 
          583                                         } else if (tmp_bracket_prop->type ==
          584                                                            BIDI_BRACKET_CLOSE &&
          585                                                    tmp_bracket_prop->class ==
          586                                                            bracket_prop
          587                                                                    ->class) {
          588                                                 /* we encountered a closing
          589                                                  * bracket of the same class
          590                                                  */
          591                                                 if (blevel == 0) {
          592                                                         /* this is the
          593                                                          * corresponding
          594                                                          * closing bracket
          595                                                          */
          596                                                         fifo[0].complete = true;
          597                                                         fifo[0].end = ir;
          598                                                 } else {
          599                                                         blevel--;
          600                                                 }
          601                                         }
          602                                 }
          603                                 if (fifo[0].complete) {
          604                                         /* we found the matching bracket */
          605                                         preprocess_bracket_pair(
          606                                                 &(fifo[i].start),
          607                                                 &(fifo[i].end));
          608                                 }
          609 
          610                                 /* shift FIFO one to the left */
          611                                 for (i = 1; i < fifo_len; i++) {
          612                                         fifo[i - 1] = fifo[i];
          613                                 }
          614                                 fifo_len--;
          615                         }
          616 
          617                         /* add element to the FIFO */
          618                         fifo_len++;
          619                         fifo[fifo_len - 1].complete = false;
          620                         fifo[fifo_len - 1].bracket_class = bracket_prop->class;
          621                         fifo[fifo_len - 1].start = ir;
          622                 } else if (prop == BIDI_PROP_ON &&
          623                            bracket_prop->type == BIDI_BRACKET_CLOSE) {
          624                         /*
          625                          * go backwards in the FIFO, skip finished entries
          626                          * and simply ignore (do nothing) the closing
          627                          * bracket if we do not match anything
          628                          */
          629                         for (i = fifo_len; i > 0; i--) {
          630                                 if (bracket_prop->class ==
          631                                             fifo[i - 1].bracket_class &&
          632                                     !fifo[i - 1].complete) {
          633                                         /* we have found a pair */
          634                                         fifo[i - 1].complete = true;
          635                                         fifo[i - 1].end = ir;
          636 
          637                                         /* remove all uncompleted FIFO elements
          638                                          * above i - 1 */
          639                                         for (j = i; j < fifo_len;) {
          640                                                 if (fifo[j].complete) {
          641                                                         j++;
          642                                                         continue;
          643                                                 }
          644 
          645                                                 /* shift FIFO one to the left */
          646                                                 for (k = j + 1; k < fifo_len;
          647                                                      k++) {
          648                                                         fifo[k - 1] = fifo[k];
          649                                                 }
          650                                                 fifo_len--;
          651                                         }
          652                                         break;
          653                                 }
          654                         }
          655                 }
          656 
          657                 /* process all ready bracket pairs from the FIFO bottom */
          658                 while (fifo_len > 0 && fifo[0].complete) {
          659                         preprocess_bracket_pair(&(fifo[0].start),
          660                                                 &(fifo[0].end));
          661 
          662                         /* shift FIFO one to the left */
          663                         for (j = 0; j + 1 < fifo_len; j++) {
          664                                 fifo[j] = fifo[j + 1];
          665                         }
          666                         fifo_len--;
          667                 }
          668         }
          669 
          670         /*
          671          * afterwards, we still might have unfinished bracket pairs
          672          * that will remain as such, but the subsequent finished pairs
          673          * also need to be taken into account, so we go through the
          674          * FIFO once more and process all finished pairs
          675          */
          676         for (i = 0; i < fifo_len; i++) {
          677                 if (fifo[i].complete) {
          678                         preprocess_bracket_pair(&(fifo[i].start),
          679                                                 &(fifo[i].end));
          680                 }
          681         }
          682 }
          683 
          684 static size_t
          685 preprocess_isolating_run_sequence(uint_least32_t *buf, size_t buflen,
          686                                   size_t off, uint_least8_t paragraph_level)
          687 {
          688         enum bidi_property sequence_prop, prop;
          689         struct isolate_runner ir, tmp;
          690         size_t runsince, sequence_end;
          691 
          692         /* W1 */
          693         ir_init(buf, buflen, off, paragraph_level, false, &ir);
          694         while (!ir_advance(&ir)) {
          695                 if (ir_get_current_prop(&ir) == BIDI_PROP_NSM) {
          696                         prop = ir_get_previous_prop(&ir);
          697 
          698                         if (prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
          699                             prop == BIDI_PROP_FSI || prop == BIDI_PROP_PDI) {
          700                                 ir_set_current_prop(&ir, BIDI_PROP_ON);
          701                         } else {
          702                                 ir_set_current_prop(&ir, prop);
          703                         }
          704                 }
          705         }
          706 
          707         /* W2 */
          708         ir_init(buf, buflen, off, paragraph_level, false, &ir);
          709         while (!ir_advance(&ir)) {
          710                 if (ir_get_current_prop(&ir) == BIDI_PROP_EN &&
          711                     ir_get_last_strong_prop(&ir) == BIDI_PROP_AL) {
          712                         ir_set_current_prop(&ir, BIDI_PROP_AN);
          713                 }
          714         }
          715 
          716         /* W3 */
          717         ir_init(buf, buflen, off, paragraph_level, false, &ir);
          718         while (!ir_advance(&ir)) {
          719                 if (ir_get_current_prop(&ir) == BIDI_PROP_AL) {
          720                         ir_set_current_prop(&ir, BIDI_PROP_R);
          721                 }
          722         }
          723 
          724         /* W4 */
          725         ir_init(buf, buflen, off, paragraph_level, false, &ir);
          726         while (!ir_advance(&ir)) {
          727                 if (ir_get_previous_prop(&ir) == BIDI_PROP_EN &&
          728                     (ir_get_current_prop(&ir) == BIDI_PROP_ES ||
          729                      ir_get_current_prop(&ir) == BIDI_PROP_CS) &&
          730                     ir_get_next_prop(&ir) == BIDI_PROP_EN) {
          731                         ir_set_current_prop(&ir, BIDI_PROP_EN);
          732                 }
          733 
          734                 if (ir_get_previous_prop(&ir) == BIDI_PROP_AN &&
          735                     ir_get_current_prop(&ir) == BIDI_PROP_CS &&
          736                     ir_get_next_prop(&ir) == BIDI_PROP_AN) {
          737                         ir_set_current_prop(&ir, BIDI_PROP_AN);
          738                 }
          739         }
          740 
          741         /* W5 */
          742         runsince = SIZE_MAX;
          743         ir_init(buf, buflen, off, paragraph_level, false, &ir);
          744         while (!ir_advance(&ir)) {
          745                 if (ir_get_current_prop(&ir) == BIDI_PROP_ET) {
          746                         if (runsince == SIZE_MAX) {
          747                                 /* a new run has begun */
          748                                 runsince = ir.cur.off;
          749                         }
          750                 } else if (ir_get_current_prop(&ir) == BIDI_PROP_EN) {
          751                         /* set the preceding sequence */
          752                         if (runsince != SIZE_MAX) {
          753                                 ir_init(buf, buflen, runsince, paragraph_level,
          754                                         (runsince > off), &tmp);
          755                                 while (!ir_advance(&tmp) &&
          756                                        tmp.cur.off < ir.cur.off) {
          757                                         ir_set_current_prop(&tmp, BIDI_PROP_EN);
          758                                 }
          759                                 runsince = SIZE_MAX;
          760                         } else {
          761                                 ir_init(buf, buflen, ir.cur.off,
          762                                         paragraph_level, (ir.cur.off > off),
          763                                         &tmp);
          764                                 ir_advance(&tmp);
          765                         }
          766                         /* follow the succeeding sequence */
          767                         while (!ir_advance(&tmp)) {
          768                                 if (ir_get_current_prop(&tmp) != BIDI_PROP_ET) {
          769                                         break;
          770                                 }
          771                                 ir_set_current_prop(&tmp, BIDI_PROP_EN);
          772                         }
          773                 } else {
          774                         /* sequence ended */
          775                         runsince = SIZE_MAX;
          776                 }
          777         }
          778 
          779         /* W6 */
          780         ir_init(buf, buflen, off, paragraph_level, false, &ir);
          781         while (!ir_advance(&ir)) {
          782                 prop = ir_get_current_prop(&ir);
          783 
          784                 if (prop == BIDI_PROP_ES || prop == BIDI_PROP_ET ||
          785                     prop == BIDI_PROP_CS) {
          786                         ir_set_current_prop(&ir, BIDI_PROP_ON);
          787                 }
          788         }
          789 
          790         /* W7 */
          791         ir_init(buf, buflen, off, paragraph_level, false, &ir);
          792         while (!ir_advance(&ir)) {
          793                 if (ir_get_current_prop(&ir) == BIDI_PROP_EN &&
          794                     ir_get_last_strong_prop(&ir) == BIDI_PROP_L) {
          795                         ir_set_current_prop(&ir, BIDI_PROP_L);
          796                 }
          797         }
          798 
          799         /* N0 */
          800         preprocess_bracket_pairs(buf, buflen, off, paragraph_level);
          801 
          802         /* N1 */
          803         sequence_end = SIZE_MAX;
          804         sequence_prop = NUM_BIDI_PROPS;
          805         ir_init(buf, buflen, off, paragraph_level, false, &ir);
          806         while (!ir_advance(&ir)) {
          807                 if (sequence_end == SIZE_MAX) {
          808                         prop = ir_get_current_prop(&ir);
          809 
          810                         if (prop == BIDI_PROP_B || prop == BIDI_PROP_S ||
          811                             prop == BIDI_PROP_WS || prop == BIDI_PROP_ON ||
          812                             prop == BIDI_PROP_FSI || prop == BIDI_PROP_LRI ||
          813                             prop == BIDI_PROP_RLI || prop == BIDI_PROP_PDI) {
          814                                 /* the current character is an NI (neutral
          815                                  * or isolate) */
          816 
          817                                 /* scan ahead to the end of the NI-sequence
          818                                  */
          819                                 ir_init(buf, buflen, ir.cur.off,
          820                                         paragraph_level, (ir.cur.off > off),
          821                                         &tmp);
          822                                 while (!ir_advance(&tmp)) {
          823                                         prop = ir_get_next_prop(&tmp);
          824 
          825                                         if (prop != BIDI_PROP_B &&
          826                                             prop != BIDI_PROP_S &&
          827                                             prop != BIDI_PROP_WS &&
          828                                             prop != BIDI_PROP_ON &&
          829                                             prop != BIDI_PROP_FSI &&
          830                                             prop != BIDI_PROP_LRI &&
          831                                             prop != BIDI_PROP_RLI &&
          832                                             prop != BIDI_PROP_PDI) {
          833                                                 break;
          834                                         }
          835                                 }
          836 
          837                                 /*
          838                                  * check what follows and see if the text
          839                                  * has the same direction on both sides
          840                                  */
          841                                 if (ir_get_previous_prop(&ir) == BIDI_PROP_L &&
          842                                     ir_get_next_prop(&tmp) == BIDI_PROP_L) {
          843                                         sequence_end = tmp.cur.off;
          844                                         sequence_prop = BIDI_PROP_L;
          845                                 } else if ((ir_get_previous_prop(&ir) ==
          846                                                     BIDI_PROP_R ||
          847                                             ir_get_previous_prop(&ir) ==
          848                                                     BIDI_PROP_EN ||
          849                                             ir_get_previous_prop(&ir) ==
          850                                                     BIDI_PROP_AN) &&
          851                                            (ir_get_next_prop(&tmp) ==
          852                                                     BIDI_PROP_R ||
          853                                             ir_get_next_prop(&tmp) ==
          854                                                     BIDI_PROP_EN ||
          855                                             ir_get_next_prop(&tmp) ==
          856                                                     BIDI_PROP_AN)) {
          857                                         sequence_end = tmp.cur.off;
          858                                         sequence_prop = BIDI_PROP_R;
          859                                 }
          860                         }
          861                 }
          862 
          863                 if (sequence_end != SIZE_MAX) {
          864                         if (ir.cur.off <= sequence_end) {
          865                                 ir_set_current_prop(&ir, sequence_prop);
          866                         } else {
          867                                 /* end of sequence, reset */
          868                                 sequence_end = SIZE_MAX;
          869                                 sequence_prop = NUM_BIDI_PROPS;
          870                         }
          871                 }
          872         }
          873 
          874         /* N2 */
          875         ir_init(buf, buflen, off, paragraph_level, false, &ir);
          876         while (!ir_advance(&ir)) {
          877                 prop = ir_get_current_prop(&ir);
          878 
          879                 if (prop == BIDI_PROP_B || prop == BIDI_PROP_S ||
          880                     prop == BIDI_PROP_WS || prop == BIDI_PROP_ON ||
          881                     prop == BIDI_PROP_FSI || prop == BIDI_PROP_LRI ||
          882                     prop == BIDI_PROP_RLI || prop == BIDI_PROP_PDI) {
          883                         /* N2 */
          884                         if (ir_get_current_level(&ir) % 2 == 0) {
          885                                 /* even embedding level */
          886                                 ir_set_current_prop(&ir, BIDI_PROP_L);
          887                         } else {
          888                                 /* odd embedding level */
          889                                 ir_set_current_prop(&ir, BIDI_PROP_R);
          890                         }
          891                 }
          892         }
          893 
          894         return 0;
          895 }
          896 
          897 static uint_least8_t
          898 get_isolated_paragraph_level(const uint_least32_t *state, size_t statelen)
          899 {
          900         enum bidi_property prop;
          901         int_least8_t isolate_level;
          902         size_t stateoff;
          903 
          904         /* determine paragraph level (rules P1-P3) and terminate on PDI */
          905         for (stateoff = 0, isolate_level = 0; stateoff < statelen; stateoff++) {
          906                 prop = (uint_least8_t)get_state(STATE_PROP, state[stateoff]);
          907 
          908                 if (prop == BIDI_PROP_PDI && isolate_level == 0) {
          909                         /*
          910                          * we are in a FSI-subsection of a paragraph and
          911                          * matched with the terminating PDI
          912                          */
          913                         break;
          914                 }
          915 
          916                 /* BD8/BD9 */
          917                 if ((prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
          918                      prop == BIDI_PROP_FSI) &&
          919                     isolate_level < MAX_DEPTH) {
          920                         /* we hit an isolate initiator, increment counter */
          921                         isolate_level++;
          922                 } else if (prop == BIDI_PROP_PDI && isolate_level > 0) {
          923                         isolate_level--;
          924                 }
          925 
          926                 /* P2 */
          927                 if (isolate_level > 0) {
          928                         continue;
          929                 }
          930 
          931                 /* P3 */
          932                 if (prop == BIDI_PROP_L) {
          933                         return 0;
          934                 } else if (prop == BIDI_PROP_AL || prop == BIDI_PROP_R) {
          935                         return 1;
          936                 }
          937         }
          938 
          939         return 0;
          940 }
          941 
          942 static inline uint_least8_t
          943 get_bidi_property(uint_least32_t cp)
          944 {
          945         if (likely(cp <= 0x10FFFF)) {
          946                 return (bidi_minor[bidi_major[cp >> 8] + (cp & 0xff)]) &
          947                        0x1F /* 00011111 */;
          948         } else {
          949                 return BIDI_PROP_L;
          950         }
          951 }
          952 
          953 static uint_least8_t
          954 get_paragraph_level(enum grapheme_bidirectional_direction override,
          955                     const HERODOTUS_READER *r)
          956 {
          957         HERODOTUS_READER tmp;
          958         enum bidi_property prop;
          959         int_least8_t isolate_level;
          960         uint_least32_t cp;
          961 
          962         /* check overrides first according to rule HL1 */
          963         if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
          964                 return 0;
          965         } else if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
          966                 return 1;
          967         }
          968 
          969         /* copy reader into temporary reader */
          970         herodotus_reader_copy(r, &tmp);
          971 
          972         /* determine paragraph level (rules P1-P3) */
          973         for (isolate_level = 0; herodotus_read_codepoint(&tmp, true, &cp) ==
          974                                 HERODOTUS_STATUS_SUCCESS;) {
          975                 prop = get_bidi_property(cp);
          976 
          977                 /* BD8/BD9 */
          978                 if ((prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
          979                      prop == BIDI_PROP_FSI) &&
          980                     isolate_level < MAX_DEPTH) {
          981                         /* we hit an isolate initiator, increment counter */
          982                         isolate_level++;
          983                 } else if (prop == BIDI_PROP_PDI && isolate_level > 0) {
          984                         isolate_level--;
          985                 }
          986 
          987                 /* P2 */
          988                 if (isolate_level > 0) {
          989                         continue;
          990                 }
          991 
          992                 /* P3 */
          993                 if (prop == BIDI_PROP_L) {
          994                         return 0;
          995                 } else if (prop == BIDI_PROP_AL || prop == BIDI_PROP_R) {
          996                         return 1;
          997                 }
          998         }
          999 
         1000         return 0;
         1001 }
         1002 
         1003 static void
         1004 preprocess_paragraph(uint_least8_t paragraph_level, uint_least32_t *buf,
         1005                      size_t buflen)
         1006 {
         1007         enum bidi_property prop;
         1008         int_least8_t level;
         1009 
         1010         struct {
         1011                 int_least8_t level;
         1012                 enum grapheme_bidirectional_direction override;
         1013                 bool directional_isolate;
         1014         } directional_status[MAX_DEPTH + 2], *dirstat = directional_status;
         1015 
         1016         size_t overflow_isolate_count, overflow_embedding_count,
         1017                 valid_isolate_count, bufoff, i, runsince;
         1018 
         1019         /* X1 */
         1020         dirstat->level = (int_least8_t)paragraph_level;
         1021         dirstat->override = GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
         1022         dirstat->directional_isolate = false;
         1023         overflow_isolate_count = overflow_embedding_count =
         1024                 valid_isolate_count = 0;
         1025 
         1026         for (bufoff = 0; bufoff < buflen; bufoff++) {
         1027                 prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]);
         1028 
         1029                 /* set paragraph level we need for line-level-processing */
         1030                 set_state(STATE_PARAGRAPH_LEVEL, paragraph_level,
         1031                           &(buf[bufoff]));
         1032 again:
         1033                 if (prop == BIDI_PROP_RLE) {
         1034                         /* X2 */
         1035                         if (dirstat->level + (dirstat->level % 2 != 0) + 1 <=
         1036                                     MAX_DEPTH &&
         1037                             overflow_isolate_count == 0 &&
         1038                             overflow_embedding_count == 0) {
         1039                                 /* valid RLE */
         1040                                 dirstat++;
         1041                                 dirstat->level =
         1042                                         (dirstat - 1)->level +
         1043                                         ((dirstat - 1)->level % 2 != 0) + 1;
         1044                                 dirstat->override =
         1045                                         GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
         1046                                 dirstat->directional_isolate = false;
         1047                         } else {
         1048                                 /* overflow RLE */
         1049                                 overflow_embedding_count +=
         1050                                         (overflow_isolate_count == 0);
         1051                         }
         1052                 } else if (prop == BIDI_PROP_LRE) {
         1053                         /* X3 */
         1054                         if (dirstat->level + (dirstat->level % 2 == 0) + 1 <=
         1055                                     MAX_DEPTH &&
         1056                             overflow_isolate_count == 0 &&
         1057                             overflow_embedding_count == 0) {
         1058                                 /* valid LRE */
         1059                                 dirstat++;
         1060                                 dirstat->level =
         1061                                         (dirstat - 1)->level +
         1062                                         ((dirstat - 1)->level % 2 == 0) + 1;
         1063                                 dirstat->override =
         1064                                         GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
         1065                                 dirstat->directional_isolate = false;
         1066                         } else {
         1067                                 /* overflow LRE */
         1068                                 overflow_embedding_count +=
         1069                                         (overflow_isolate_count == 0);
         1070                         }
         1071                 } else if (prop == BIDI_PROP_RLO) {
         1072                         /* X4 */
         1073                         if (dirstat->level + (dirstat->level % 2 != 0) + 1 <=
         1074                                     MAX_DEPTH &&
         1075                             overflow_isolate_count == 0 &&
         1076                             overflow_embedding_count == 0) {
         1077                                 /* valid RLO */
         1078                                 dirstat++;
         1079                                 dirstat->level =
         1080                                         (dirstat - 1)->level +
         1081                                         ((dirstat - 1)->level % 2 != 0) + 1;
         1082                                 dirstat->override =
         1083                                         GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
         1084                                 dirstat->directional_isolate = false;
         1085                         } else {
         1086                                 /* overflow RLO */
         1087                                 overflow_embedding_count +=
         1088                                         (overflow_isolate_count == 0);
         1089                         }
         1090                 } else if (prop == BIDI_PROP_LRO) {
         1091                         /* X5 */
         1092                         if (dirstat->level + (dirstat->level % 2 == 0) + 1 <=
         1093                                     MAX_DEPTH &&
         1094                             overflow_isolate_count == 0 &&
         1095                             overflow_embedding_count == 0) {
         1096                                 /* valid LRE */
         1097                                 dirstat++;
         1098                                 dirstat->level =
         1099                                         (dirstat - 1)->level +
         1100                                         ((dirstat - 1)->level % 2 == 0) + 1;
         1101                                 dirstat->override =
         1102                                         GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR;
         1103                                 dirstat->directional_isolate = false;
         1104                         } else {
         1105                                 /* overflow LRO */
         1106                                 overflow_embedding_count +=
         1107                                         (overflow_isolate_count == 0);
         1108                         }
         1109                 } else if (prop == BIDI_PROP_RLI) {
         1110                         /* X5a */
         1111                         set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
         1112                         if (dirstat->override ==
         1113                             GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
         1114                                 set_state(STATE_PROP, BIDI_PROP_L,
         1115                                           &(buf[bufoff]));
         1116                         } else if (dirstat->override ==
         1117                                    GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
         1118                                 set_state(STATE_PROP, BIDI_PROP_R,
         1119                                           &(buf[bufoff]));
         1120                         }
         1121 
         1122                         if (dirstat->level + (dirstat->level % 2 != 0) + 1 <=
         1123                                     MAX_DEPTH &&
         1124                             overflow_isolate_count == 0 &&
         1125                             overflow_embedding_count == 0) {
         1126                                 /* valid RLI */
         1127                                 valid_isolate_count++;
         1128 
         1129                                 dirstat++;
         1130                                 dirstat->level =
         1131                                         (dirstat - 1)->level +
         1132                                         ((dirstat - 1)->level % 2 != 0) + 1;
         1133                                 dirstat->override =
         1134                                         GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
         1135                                 dirstat->directional_isolate = true;
         1136                         } else {
         1137                                 /* overflow RLI */
         1138                                 overflow_isolate_count++;
         1139                         }
         1140                 } else if (prop == BIDI_PROP_LRI) {
         1141                         /* X5b */
         1142                         set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
         1143                         if (dirstat->override ==
         1144                             GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
         1145                                 set_state(STATE_PROP, BIDI_PROP_L,
         1146                                           &(buf[bufoff]));
         1147                         } else if (dirstat->override ==
         1148                                    GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
         1149                                 set_state(STATE_PROP, BIDI_PROP_R,
         1150                                           &(buf[bufoff]));
         1151                         }
         1152 
         1153                         if (dirstat->level + (dirstat->level % 2 == 0) + 1 <=
         1154                                     MAX_DEPTH &&
         1155                             overflow_isolate_count == 0 &&
         1156                             overflow_embedding_count == 0) {
         1157                                 /* valid LRI */
         1158                                 valid_isolate_count++;
         1159 
         1160                                 dirstat++;
         1161                                 dirstat->level =
         1162                                         (dirstat - 1)->level +
         1163                                         ((dirstat - 1)->level % 2 == 0) + 1;
         1164                                 dirstat->override =
         1165                                         GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
         1166                                 dirstat->directional_isolate = true;
         1167                         } else {
         1168                                 /* overflow LRI */
         1169                                 overflow_isolate_count++;
         1170                         }
         1171                 } else if (prop == BIDI_PROP_FSI) {
         1172                         /* X5c */
         1173                         if (get_isolated_paragraph_level(
         1174                                     buf + (bufoff + 1),
         1175                                     buflen - (bufoff + 1)) == 1) {
         1176                                 prop = BIDI_PROP_RLI;
         1177                                 goto again;
         1178                         } else { /* ... == 0 */
         1179                                 prop = BIDI_PROP_LRI;
         1180                                 goto again;
         1181                         }
         1182                 } else if (prop != BIDI_PROP_B && prop != BIDI_PROP_BN &&
         1183                            prop != BIDI_PROP_PDF && prop != BIDI_PROP_PDI) {
         1184                         /* X6 */
         1185                         set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
         1186                         if (dirstat->override ==
         1187                             GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
         1188                                 set_state(STATE_PROP, BIDI_PROP_L,
         1189                                           &(buf[bufoff]));
         1190                         } else if (dirstat->override ==
         1191                                    GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
         1192                                 set_state(STATE_PROP, BIDI_PROP_R,
         1193                                           &(buf[bufoff]));
         1194                         }
         1195                 } else if (prop == BIDI_PROP_PDI) {
         1196                         /* X6a */
         1197                         if (overflow_isolate_count > 0) {
         1198                                 /* PDI matches an overflow isolate initiator
         1199                                  */
         1200                                 overflow_isolate_count--;
         1201                         } else if (valid_isolate_count > 0) {
         1202                                 /* PDI matches a normal isolate initiator */
         1203                                 overflow_embedding_count = 0;
         1204                                 while (dirstat->directional_isolate == false &&
         1205                                        dirstat > directional_status) {
         1206                                         /*
         1207                                          * we are safe here as given the
         1208                                          * valid isolate count is positive
         1209                                          * there must be a stack-entry
         1210                                          * with positive directional
         1211                                          * isolate status, but we take
         1212                                          * no chances and include an
         1213                                          * explicit check
         1214                                          *
         1215                                          * POSSIBLE OPTIMIZATION: Whenever
         1216                                          * we push on the stack, check if it
         1217                                          * has the directional isolate
         1218                                          * status true and store a pointer
         1219                                          * to it so we can jump to it very
         1220                                          * quickly.
         1221                                          */
         1222                                         dirstat--;
         1223                                 }
         1224 
         1225                                 /*
         1226                                  * as above, the following check is not
         1227                                  * necessary, given we are guaranteed to
         1228                                  * have at least one stack entry left,
         1229                                  * but it's better to be safe
         1230                                  */
         1231                                 if (dirstat > directional_status) {
         1232                                         dirstat--;
         1233                                 }
         1234                                 valid_isolate_count--;
         1235                         }
         1236 
         1237                         set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
         1238                         if (dirstat->override ==
         1239                             GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
         1240                                 set_state(STATE_PROP, BIDI_PROP_L,
         1241                                           &(buf[bufoff]));
         1242                         } else if (dirstat->override ==
         1243                                    GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
         1244                                 set_state(STATE_PROP, BIDI_PROP_R,
         1245                                           &(buf[bufoff]));
         1246                         }
         1247                 } else if (prop == BIDI_PROP_PDF) {
         1248                         /* X7 */
         1249                         if (overflow_isolate_count > 0) {
         1250                                 /* do nothing */
         1251                         } else if (overflow_embedding_count > 0) {
         1252                                 overflow_embedding_count--;
         1253                         } else if (dirstat->directional_isolate == false &&
         1254                                    dirstat > directional_status) {
         1255                                 dirstat--;
         1256                         }
         1257                 } else if (prop == BIDI_PROP_B) {
         1258                         /* X8 */
         1259                         set_state(STATE_LEVEL, paragraph_level, &(buf[bufoff]));
         1260                 }
         1261 
         1262                 /* X9 */
         1263                 if (prop == BIDI_PROP_RLE || prop == BIDI_PROP_LRE ||
         1264                     prop == BIDI_PROP_RLO || prop == BIDI_PROP_LRO ||
         1265                     prop == BIDI_PROP_PDF || prop == BIDI_PROP_BN) {
         1266                         set_state(STATE_LEVEL, -1, &(buf[bufoff]));
         1267                 }
         1268         }
         1269 
         1270         /* X10 (W1-W7, N0-N2) */
         1271         for (bufoff = 0; bufoff < buflen; bufoff++) {
         1272                 if (get_state(STATE_VISITED, buf[bufoff]) == 0 &&
         1273                     get_state(STATE_LEVEL, buf[bufoff]) != -1) {
         1274                         bufoff += preprocess_isolating_run_sequence(
         1275                                 buf, buflen, bufoff, paragraph_level);
         1276                 }
         1277         }
         1278 
         1279         /*
         1280          * I1-I2 (given our sequential approach to processing the
         1281          * isolating run sequences, we apply this rule separately)
         1282          */
         1283         for (bufoff = 0; bufoff < buflen; bufoff++) {
         1284                 level = (int_least8_t)get_state(STATE_LEVEL, buf[bufoff]);
         1285                 prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]);
         1286 
         1287                 if (level % 2 == 0) {
         1288                         /* even level */
         1289                         if (prop == BIDI_PROP_R) {
         1290                                 set_state(STATE_LEVEL, level + 1,
         1291                                           &(buf[bufoff]));
         1292                         } else if (prop == BIDI_PROP_AN ||
         1293                                    prop == BIDI_PROP_EN) {
         1294                                 set_state(STATE_LEVEL, level + 2,
         1295                                           &(buf[bufoff]));
         1296                         }
         1297                 } else {
         1298                         /* odd level */
         1299                         if (prop == BIDI_PROP_L || prop == BIDI_PROP_EN ||
         1300                             prop == BIDI_PROP_AN) {
         1301                                 set_state(STATE_LEVEL, level + 1,
         1302                                           &(buf[bufoff]));
         1303                         }
         1304                 }
         1305         }
         1306 
         1307         /* L1 (rules 1-3) */
         1308         runsince = SIZE_MAX;
         1309         for (bufoff = 0; bufoff < buflen; bufoff++) {
         1310                 level = (int_least8_t)get_state(STATE_LEVEL, buf[bufoff]);
         1311                 prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP,
         1312                                                 buf[bufoff]);
         1313 
         1314                 if (level == -1) {
         1315                         /* ignored character */
         1316                         continue;
         1317                 }
         1318 
         1319                 /* rules 1 and 2 */
         1320                 if (prop == BIDI_PROP_S || prop == BIDI_PROP_B) {
         1321                         set_state(STATE_LEVEL, paragraph_level, &(buf[bufoff]));
         1322                 }
         1323 
         1324                 /* rule 3 */
         1325                 if (prop == BIDI_PROP_WS || prop == BIDI_PROP_FSI ||
         1326                     prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
         1327                     prop == BIDI_PROP_PDI) {
         1328                         if (runsince == SIZE_MAX) {
         1329                                 /* a new run has begun */
         1330                                 runsince = bufoff;
         1331                         }
         1332                 } else if ((prop == BIDI_PROP_S || prop == BIDI_PROP_B) &&
         1333                            runsince != SIZE_MAX) {
         1334                         /*
         1335                          * we hit a segment or paragraph separator in a
         1336                          * sequence, reset sequence-levels
         1337                          */
         1338                         for (i = runsince; i < bufoff; i++) {
         1339                                 if (get_state(STATE_LEVEL, buf[i]) != -1) {
         1340                                         set_state(STATE_LEVEL, paragraph_level,
         1341                                                   &(buf[i]));
         1342                                 }
         1343                         }
         1344                         runsince = SIZE_MAX;
         1345                 } else {
         1346                         /* sequence ended */
         1347                         runsince = SIZE_MAX;
         1348                 }
         1349         }
         1350         if (runsince != SIZE_MAX) {
         1351                 /*
         1352                  * this is the end of the paragraph and we
         1353                  * are in a run
         1354                  */
         1355                 for (i = runsince; i < buflen; i++) {
         1356                         if (get_state(STATE_LEVEL, buf[i]) != -1) {
         1357                                 set_state(STATE_LEVEL, paragraph_level,
         1358                                           &(buf[i]));
         1359                         }
         1360                 }
         1361                 runsince = SIZE_MAX;
         1362         }
         1363 }
         1364 
         1365 static inline uint_least8_t
         1366 get_bidi_bracket_off(uint_least32_t cp)
         1367 {
         1368         if (likely(cp <= 0x10FFFF)) {
         1369                 return (uint_least8_t)((bidi_minor[bidi_major[cp >> 8] +
         1370                                                    (cp & 0xff)]) >>
         1371                                        5);
         1372         } else {
         1373                 return 0;
         1374         }
         1375 }
         1376 
         1377 static size_t
         1378 preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_direction override,
         1379            uint_least32_t *buf, size_t buflen,
         1380            enum grapheme_bidirectional_direction *resolved)
         1381 {
         1382         HERODOTUS_READER tmp;
         1383         size_t bufoff, bufsize, paragraph_len;
         1384         uint_least32_t cp;
         1385         uint_least8_t paragraph_level;
         1386 
         1387         /* determine length and level of the paragraph */
         1388         herodotus_reader_copy(r, &tmp);
         1389         for (; herodotus_read_codepoint(&tmp, true, &cp) ==
         1390                HERODOTUS_STATUS_SUCCESS;) {
         1391                 /* break on paragraph separator */
         1392                 if (get_bidi_property(cp) == BIDI_PROP_B) {
         1393                         break;
         1394                 }
         1395         }
         1396         paragraph_len = herodotus_reader_number_read(&tmp);
         1397         paragraph_level = get_paragraph_level(override, r);
         1398 
         1399         if (resolved != NULL) {
         1400                 /* store resolved paragraph level in output variable */
         1401                 /* TODO use enum-type */
         1402                 *resolved = (paragraph_level == 0) ?
         1403                                     GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR :
         1404                                     GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
         1405         }
         1406 
         1407         if (buf == NULL) {
         1408                 /* see below for return value reasoning */
         1409                 return paragraph_len;
         1410         }
         1411 
         1412         /*
         1413          * the first step is to determine the bidirectional properties
         1414          * and store them in the buffer
         1415          */
         1416         for (bufoff = 0;
         1417              bufoff < paragraph_len &&
         1418              herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCESS;
         1419              bufoff++) {
         1420                 if (bufoff < buflen) {
         1421                         /*
         1422                          * actually only do something when we have
         1423                          * space in the level-buffer. We continue
         1424                          * the iteration to be able to give a good
         1425                          * return value
         1426                          */
         1427                         set_state(STATE_PROP,
         1428                                   (uint_least8_t)get_bidi_property(cp),
         1429                                   &(buf[bufoff]));
         1430                         set_state(STATE_BRACKET_OFF, get_bidi_bracket_off(cp),
         1431                                   &(buf[bufoff]));
         1432                         set_state(STATE_LEVEL, 0, &(buf[bufoff]));
         1433                         set_state(STATE_PARAGRAPH_LEVEL, 0, &(buf[bufoff]));
         1434                         set_state(STATE_VISITED, 0, &(buf[bufoff]));
         1435                         set_state(STATE_PRESERVED_PROP,
         1436                                   (uint_least8_t)get_bidi_property(cp),
         1437                                   &(buf[bufoff]));
         1438                 }
         1439         }
         1440         bufsize = herodotus_reader_number_read(r);
         1441 
         1442         for (bufoff = 0; bufoff < bufsize; bufoff++) {
         1443                 if (get_state(STATE_PROP, buf[bufoff]) != BIDI_PROP_B &&
         1444                     bufoff != bufsize - 1) {
         1445                         continue;
         1446                 }
         1447 
         1448                 /*
         1449                  * we either encountered a paragraph terminator or this
         1450                  * is the last character in the string.
         1451                  * Call the paragraph handler on the paragraph, including
         1452                  * the terminating character or last character of the
         1453                  * string respectively
         1454                  */
         1455                 preprocess_paragraph(paragraph_level, buf, bufoff + 1);
         1456                 break;
         1457         }
         1458 
         1459         /*
         1460          * we return the number of total bytes read, as the function
         1461          * should indicate if the given level-buffer is too small
         1462          */
         1463         return herodotus_reader_number_read(r);
         1464 }
         1465 
         1466 size_t
         1467 grapheme_bidirectional_preprocess_paragraph(
         1468         const uint_least32_t *src, size_t srclen,
         1469         enum grapheme_bidirectional_direction override, uint_least32_t *dest,
         1470         size_t destlen, enum grapheme_bidirectional_direction *resolved)
         1471 {
         1472         HERODOTUS_READER r;
         1473 
         1474         herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
         1475 
         1476         return preprocess(&r, override, dest, destlen, resolved);
         1477 }
         1478 
         1479 static inline size_t
         1480 get_line_embedding_levels(const uint_least32_t *linedata, size_t linelen,
         1481                           int_least8_t (*get_level)(const void *, size_t),
         1482                           void (*set_level)(void *, size_t, int_least8_t),
         1483                           void *lev, size_t levsize, bool skipignored)
         1484 {
         1485         enum bidi_property prop;
         1486         size_t i, levlen, runsince;
         1487         int_least8_t level, runlevel;
         1488 
         1489         /* rule L1.4 */
         1490         runsince = SIZE_MAX;
         1491         runlevel = -1;
         1492         for (i = 0, levlen = 0; i < linelen; i++) {
         1493                 level = (int_least8_t)get_state(STATE_LEVEL, linedata[i]);
         1494                 prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP,
         1495                                                 linedata[i]);
         1496 
         1497                 /* write level into level array if we still have space */
         1498                 if (level != -1 || skipignored == false) {
         1499                         if (levlen <= levsize) {
         1500                                 set_level(lev, levlen, level);
         1501                         }
         1502                         levlen++;
         1503                 }
         1504 
         1505                 if (level == -1) {
         1506                         /* ignored character */
         1507                         continue;
         1508                 }
         1509 
         1510                 if (prop == BIDI_PROP_WS || prop == BIDI_PROP_FSI ||
         1511                     prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
         1512                     prop == BIDI_PROP_PDI) {
         1513                         if (runsince == SIZE_MAX) {
         1514                                 /* a new run has begun */
         1515                                 runsince = levlen - 1; /* levlen > 0 */
         1516                                 runlevel = (int_least8_t)get_state(
         1517                                         STATE_PARAGRAPH_LEVEL, linedata[i]);
         1518                         }
         1519                 } else {
         1520                         /* sequence ended */
         1521                         runsince = SIZE_MAX;
         1522                         runlevel = -1;
         1523                 }
         1524         }
         1525         if (runsince != SIZE_MAX) {
         1526                 /*
         1527                  * we hit the end of the line but were in a run;
         1528                  * reset the line levels to the paragraph level
         1529                  */
         1530                 for (i = runsince; i < MIN(linelen, levlen); i++) {
         1531                         if (get_level(lev, i) != -1) {
         1532                                 set_level(lev, i, runlevel);
         1533                         }
         1534                 }
         1535         }
         1536 
         1537         return levlen;
         1538 }
         1539 
         1540 static inline int_least8_t
         1541 get_level_int8(const void *lev, size_t off)
         1542 {
         1543         return ((const int_least8_t *)lev)[off];
         1544 }
         1545 
         1546 static inline void
         1547 set_level_int8(void *lev, size_t off, int_least8_t value)
         1548 {
         1549         ((int_least8_t *)lev)[off] = value;
         1550 }
         1551 
         1552 size_t
         1553 grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *linedata,
         1554                                                  size_t linelen,
         1555                                                  int_least8_t *lev,
         1556                                                  size_t levlen)
         1557 {
         1558         return get_line_embedding_levels(linedata, linelen, get_level_int8,
         1559                                          set_level_int8, lev, levlen, false);
         1560 }
         1561 
         1562 static inline int_least8_t
         1563 get_level_uint32(const void *lev, size_t off)
         1564 {
         1565         return (int_least8_t)((((const uint_least32_t *)lev)[off] &
         1566                                UINT32_C(0x1FE00000)) >>
         1567                               21) -
         1568                1;
         1569 }
         1570 
         1571 static inline void
         1572 set_level_uint32(void *lev, size_t off, int_least8_t value)
         1573 {
         1574         ((uint_least32_t *)lev)[off] ^=
         1575                 ((uint_least32_t *)lev)[off] & UINT32_C(0x1FE00000);
         1576         ((uint_least32_t *)lev)[off] |= ((uint_least32_t)(value + 1)) << 21;
         1577 }
         1578 
         1579 static inline int_least16_t
         1580 get_mirror_offset(uint_least32_t cp)
         1581 {
         1582         if (cp <= UINT32_C(0x10FFFF)) {
         1583                 return mirror_minor[mirror_major[cp >> 8] + (cp & 0xFF)];
         1584         } else {
         1585                 return 0;
         1586         }
         1587 }
         1588 
         1589 size_t
         1590 grapheme_bidirectional_reorder_line(const uint_least32_t *line,
         1591                                     const uint_least32_t *linedata,
         1592                                     size_t linelen, uint_least32_t *output,
         1593                                     size_t outputsize)
         1594 {
         1595         size_t i, outputlen, first, last, j, k, l /*, laststart*/;
         1596         int_least8_t level, min_odd_level = MAX_DEPTH + 2, max_level = 0;
         1597         uint_least32_t tmp;
         1598 
         1599         /* write output characters (and apply possible mirroring) */
         1600         for (i = 0, outputlen = 0; i < linelen; i++) {
         1601                 if (get_state(STATE_LEVEL, linedata[i]) != -1) {
         1602                         if (outputlen < outputsize) {
         1603                                 output[outputlen] =
         1604                                         (uint_least32_t)((int_least32_t)
         1605                                                                  line[i] +
         1606                                                          get_mirror_offset(
         1607                                                                  line[i]));
         1608                         }
         1609                         outputlen++;
         1610                 }
         1611         }
         1612         if (outputlen >= outputsize) {
         1613                 /* clear output buffer */
         1614                 for (i = 0; i < outputsize; i++) {
         1615                         output[i] = GRAPHEME_INVALID_CODEPOINT;
         1616                 }
         1617 
         1618                 /* return required size */
         1619                 return outputlen;
         1620         }
         1621 
         1622         /*
         1623          * write line embedding levels as metadata and codepoints into the
         1624          * output
         1625          */
         1626         get_line_embedding_levels(linedata, linelen, get_level_uint32,
         1627                                   set_level_uint32, output, outputsize, true);
         1628 
         1629         /* determine level range */
         1630         for (i = 0; i < outputlen; i++) {
         1631                 level = get_level_uint32(output, i);
         1632 
         1633                 if (level == -1) {
         1634                         /* ignored character */
         1635                         continue;
         1636                 }
         1637 
         1638                 if (level % 2 == 1 && level < min_odd_level) {
         1639                         min_odd_level = level;
         1640                 }
         1641                 if (level > max_level) {
         1642                         max_level = level;
         1643                 }
         1644         }
         1645 
         1646         for (level = max_level; level >= min_odd_level /* > 0 */; level--) {
         1647                 for (i = 0; i < outputlen; i++) {
         1648                         if (get_level_uint32(output, i) >= level) {
         1649                                 /*
         1650                                  * the current character has the desired level
         1651                                  */
         1652                                 first = last = i;
         1653 
         1654                                 /* find the end of the level-sequence */
         1655                                 for (i++; i < outputlen; i++) {
         1656                                         if (get_level_uint32(output, i) >=
         1657                                             level) {
         1658                                                 /* the sequence continues */
         1659                                                 last = i;
         1660                                         } else {
         1661                                                 break;
         1662                                         }
         1663                                 }
         1664 
         1665                                 /* invert the sequence first..last respecting
         1666                                  * grapheme clusters
         1667                                  *
         1668                                  * The standard only speaks of combining marks
         1669                                  * inversion, but we should in the perfect case
         1670                                  * respect _all_ grapheme clusters, which we do
         1671                                  * here!
         1672                                  */
         1673 
         1674                                 /* mark grapheme cluster breaks */
         1675                                 for (j = first; j <= last;
         1676                                      j += grapheme_next_character_break(
         1677                                              line + j, outputlen - j)) {
         1678                                         /*
         1679                                          * we use a special trick here: The
         1680                                          * first 21 bits of the state are filled
         1681                                          * with the codepoint, the next 8 bits
         1682                                          * are used for the level, so we can use
         1683                                          * the 30th bit to mark the grapheme
         1684                                          * cluster breaks. This allows us to
         1685                                          * reinvert the grapheme clusters into
         1686                                          * the proper direction later.
         1687                                          */
         1688                                         output[j] |= UINT32_C(1) << 29;
         1689                                 }
         1690 
         1691                                 /* global inversion */
         1692                                 for (k = first, l = last; k < l; k++, l--) {
         1693                                         /* swap */
         1694                                         tmp = output[k];
         1695                                         output[k] = output[l];
         1696                                         output[l] = tmp;
         1697                                 }
         1698 
         1699                                 /* grapheme cluster reinversion */
         1700 #if 0
         1701                                 for (j = first, laststart = first; j <= last;
         1702                                      j++) {
         1703                                         if (output[j] & (UINT32_C(1) << 29)) {
         1704                                                 /* we hit a mark! given the
         1705                                                  * grapheme cluster is inverted,
         1706                                                  * this means that the cluster
         1707                                                  * ended and we now reinvert it
         1708                                                  * again
         1709                                                  */
         1710                                                 for (k = laststart, l = j;
         1711                                                      k < l; k++, l--) {
         1712                                                         /* swap */
         1713                                                         tmp = output[k];
         1714                                                         output[k] = output[l];
         1715                                                         output[l] = tmp;
         1716                                                 }
         1717                                                 laststart = j + 1;
         1718                                         }
         1719                                 }
         1720 #endif
         1721 
         1722                                 /* unmark grapheme cluster breaks */
         1723                                 for (j = first; j <= last; j++) {
         1724                                         output[j] ^= output[j] &
         1725                                                      UINT32_C(0x20000000);
         1726                                 }
         1727                         }
         1728                 }
         1729         }
         1730 
         1731         /* remove embedding level metadata */
         1732         for (i = 0; i < outputlen; i++) {
         1733                 output[i] ^= output[i] & UINT32_C(0x1FE00000);
         1734         }
         1735 
         1736         return outputlen;
         1737 }