bidirectional.c - libgrapheme - unicode string library
(HTM) git clone git://git.suckless.org/libgrapheme
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
bidirectional.c (48331B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <stdbool.h>
3 #include <stddef.h>
4
5 #include "../gen/bidirectional.h"
6 #include "../grapheme.h"
7 #include "util.h"
8
9 #define MAX_DEPTH 125
10
11 enum state_type {
12 STATE_PROP, /* in 0..23, bidi_property */
13 STATE_PRESERVED_PROP, /* in 0..23, preserved bidi_prop for L1-rule */
14 STATE_BRACKET_OFF, /* in 0..255, offset in bidi_bracket */
15 STATE_LEVEL, /* in 0..MAX_DEPTH+1=126, embedding level */
16 STATE_PARAGRAPH_LEVEL, /* in 0..1, paragraph embedding level */
17 STATE_VISITED, /* in 0..1, visited within isolating run */
18 };
19
20 static struct {
21 uint_least32_t filter_mask;
22 size_t mask_shift;
23 int_least16_t value_offset;
24 } state_lut[] = {
25 [STATE_PROP] = {
26 .filter_mask = 0x000001F, /* 00000000 00000000 00000000 00011111 */
27 .mask_shift = 0,
28 .value_offset = 0,
29 },
30 [STATE_PRESERVED_PROP] = {
31 .filter_mask = 0x00003E0, /* 00000000 00000000 00000011 11100000 */
32 .mask_shift = 5,
33 .value_offset = 0,
34 },
35 [STATE_BRACKET_OFF] = {
36 .filter_mask = 0x003FC00, /* 00000000 00000011 11111100 00000000 */
37 .mask_shift = 10,
38 .value_offset = 0,
39 },
40 [STATE_LEVEL] = {
41 .filter_mask = 0x1FC0000, /* 00000001 11111100 00000000 00000000 */
42 .mask_shift = 18,
43 .value_offset = -1,
44 },
45 [STATE_PARAGRAPH_LEVEL] = {
46 .filter_mask = 0x2000000, /* 00000010 00000000 00000000 00000000 */
47 .mask_shift = 25,
48 .value_offset = 0,
49 },
50 [STATE_VISITED] = {
51 .filter_mask = 0x4000000, /* 00000100 00000000 00000000 00000000 */
52 .mask_shift = 26,
53 .value_offset = 0,
54 },
55 };
56
57 static inline int_least16_t
58 get_state(enum state_type t, uint_least32_t input)
59 {
60 return (int_least16_t)((input & state_lut[t].filter_mask) >>
61 state_lut[t].mask_shift) +
62 state_lut[t].value_offset;
63 }
64
65 static inline void
66 set_state(enum state_type t, int_least16_t value, uint_least32_t *output)
67 {
68 *output &= ~state_lut[t].filter_mask;
69 *output |= ((uint_least32_t)(value - state_lut[t].value_offset)
70 << state_lut[t].mask_shift) &
71 state_lut[t].filter_mask;
72 }
73
74 struct isolate_runner {
75 uint_least32_t *buf;
76 size_t buflen;
77 size_t start;
78
79 struct {
80 size_t off;
81 } prev, cur, next;
82
83 enum bidi_property sos, eos;
84
85 uint_least8_t paragraph_level;
86 int_least8_t isolating_run_level;
87 };
88
89 static inline enum bidi_property
90 ir_get_previous_prop(const struct isolate_runner *ir)
91 {
92 return (ir->prev.off == SIZE_MAX) ?
93 ir->sos :
94 (uint_least8_t)get_state(STATE_PROP,
95 ir->buf[ir->prev.off]);
96 }
97
98 static inline enum bidi_property
99 ir_get_current_prop(const struct isolate_runner *ir)
100 {
101 return (uint_least8_t)get_state(STATE_PROP, ir->buf[ir->cur.off]);
102 }
103
104 static inline enum bidi_property
105 ir_get_next_prop(const struct isolate_runner *ir)
106 {
107 return (ir->next.off == SIZE_MAX) ?
108 ir->eos :
109 (uint_least8_t)get_state(STATE_PROP,
110 ir->buf[ir->next.off]);
111 }
112
113 static inline enum bidi_property
114 ir_get_current_preserved_prop(const struct isolate_runner *ir)
115 {
116 return (uint_least8_t)get_state(STATE_PRESERVED_PROP,
117 ir->buf[ir->cur.off]);
118 }
119
120 static inline int_least8_t
121 ir_get_current_level(const struct isolate_runner *ir)
122 {
123 return (int_least8_t)get_state(STATE_LEVEL, ir->buf[ir->cur.off]);
124 }
125
126 static inline const struct bracket *
127 ir_get_current_bracket_prop(const struct isolate_runner *ir)
128 {
129 return bidi_bracket +
130 (int_least8_t)get_state(STATE_BRACKET_OFF, ir->buf[ir->cur.off]);
131 }
132
133 static void
134 ir_set_current_prop(const struct isolate_runner *ir, enum bidi_property prop)
135 {
136 set_state(STATE_PROP, (int_least16_t)prop, &(ir->buf[ir->cur.off]));
137 }
138
139 static void
140 ir_init(uint_least32_t *buf, size_t buflen, size_t off,
141 uint_least8_t paragraph_level, bool within, struct isolate_runner *ir)
142 {
143 size_t i;
144 int_least8_t sos_level;
145
146 /* initialize invariants */
147 ir->buf = buf;
148 ir->buflen = buflen;
149 ir->paragraph_level = paragraph_level;
150 ir->start = off;
151
152 /* advance off until we are at a non-removed character */
153 for (; off < buflen; off++) {
154 if (get_state(STATE_LEVEL, buf[off]) != -1) {
155 break;
156 }
157 }
158 if (off == buflen) {
159 /* we encountered no more non-removed character, terminate */
160 ir->next.off = SIZE_MAX;
161 return;
162 }
163
164 /* set the isolating run level to that of the current offset */
165 ir->isolating_run_level =
166 (int_least8_t)get_state(STATE_LEVEL, buf[off]);
167
168 /* initialize sos and eos to dummy values */
169 ir->sos = ir->eos = NUM_BIDI_PROPS;
170
171 /*
172 * we write the information of the "current" state into next,
173 * so that the shift-in at the first advancement moves it in
174 * cur, as desired.
175 */
176 ir->next.off = off;
177
178 /*
179 * determine the previous state but store its offset in cur.off,
180 * given it's shifted in on the first advancement
181 */
182 ir->cur.off = SIZE_MAX;
183 for (i = off, sos_level = -1; i >= 1; i--) {
184 if (get_state(STATE_LEVEL, buf[i - 1]) != -1) {
185 /*
186 * we found a character that has not been
187 * removed in X9
188 */
189 sos_level = (int_least8_t)get_state(STATE_LEVEL,
190 buf[i - 1]);
191
192 if (within) {
193 /* we just take it */
194 ir->cur.off = i;
195 }
196
197 break;
198 }
199 }
200 if (sos_level == -1) {
201 /*
202 * there were no preceding non-removed characters, set
203 * sos-level to paragraph embedding level
204 */
205 sos_level = (int_least8_t)paragraph_level;
206 }
207
208 if (!within || ir->cur.off == SIZE_MAX) {
209 /*
210 * we are at the beginning of the sequence; initialize
211 * it faithfully according to the algorithm by looking
212 * at the sos-level
213 */
214 if (MAX(sos_level, ir->isolating_run_level) % 2 == 0) {
215 /* the higher level is even, set sos to L */
216 ir->sos = BIDI_PROP_L;
217 } else {
218 /* the higher level is odd, set sos to R */
219 ir->sos = BIDI_PROP_R;
220 }
221 }
222 }
223
224 static int
225 ir_advance(struct isolate_runner *ir)
226 {
227 enum bidi_property prop;
228 int_least8_t level, isolate_level, last_isolate_level;
229 size_t i;
230
231 if (ir->next.off == SIZE_MAX) {
232 /* the sequence is over */
233 return 1;
234 }
235
236 /* shift in */
237 ir->prev.off = ir->cur.off;
238 ir->cur.off = ir->next.off;
239
240 /* mark as visited */
241 set_state(STATE_VISITED, 1, &(ir->buf[ir->cur.off]));
242
243 /* initialize next state by going to the next character in the sequence
244 */
245 ir->next.off = SIZE_MAX;
246
247 last_isolate_level = -1;
248 for (i = ir->cur.off, isolate_level = 0; i < ir->buflen; i++) {
249 level = (int_least8_t)get_state(STATE_LEVEL, ir->buf[i]);
250 prop = (uint_least8_t)get_state(STATE_PROP, ir->buf[i]);
251
252 if (level == -1) {
253 /* this is one of the ignored characters, skip */
254 continue;
255 } else if (level == ir->isolating_run_level) {
256 last_isolate_level = level;
257 }
258
259 /* follow BD8/BD9 and P2 to traverse the current sequence */
260 if (prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
261 prop == BIDI_PROP_FSI) {
262 /*
263 * we encountered an isolate initiator, increment
264 * counter, but go into processing when we
265 * were not isolated before
266 */
267 if (isolate_level < MAX_DEPTH) {
268 isolate_level++;
269 }
270 if (isolate_level != 1) {
271 continue;
272 }
273 } else if (prop == BIDI_PROP_PDI && isolate_level > 0) {
274 isolate_level--;
275
276 /*
277 * if the current PDI dropped the isolate-level
278 * to zero, it is itself part of the isolating
279 * run sequence; otherwise we simply continue.
280 */
281 if (isolate_level > 0) {
282 continue;
283 }
284 } else if (isolate_level > 0) {
285 /* we are in an isolating sequence */
286 continue;
287 }
288
289 /*
290 * now we either still are in our sequence or we hit
291 * the eos-case as we left the sequence and hit the
292 * first non-isolating-sequence character.
293 */
294 if (i == ir->cur.off) {
295 /* we were in the first initializing round */
296 continue;
297 } else if (level == ir->isolating_run_level) {
298 /* isolate_level-skips have been handled before, we're
299 * good */
300 /* still in the sequence */
301 ir->next.off = i;
302 } else {
303 /* out of sequence or isolated, compare levels via eos
304 */
305 ir->next.off = SIZE_MAX;
306 if (MAX(last_isolate_level, level) % 2 == 0) {
307 ir->eos = BIDI_PROP_L;
308 } else {
309 ir->eos = BIDI_PROP_R;
310 }
311 }
312 break;
313 }
314 if (i == ir->buflen) {
315 /*
316 * the sequence ended before we could grab an offset.
317 * we need to determine the eos-prop by comparing the
318 * level of the last element in the isolating run sequence
319 * with the paragraph level.
320 */
321 ir->next.off = SIZE_MAX;
322 if (MAX(last_isolate_level, ir->paragraph_level) % 2 == 0) {
323 /* the higher level is even, set eos to L */
324 ir->eos = BIDI_PROP_L;
325 } else {
326 /* the higher level is odd, set eos to R */
327 ir->eos = BIDI_PROP_R;
328 }
329 }
330
331 return 0;
332 }
333
334 static enum bidi_property
335 ir_get_last_strong_prop(const struct isolate_runner *ir)
336 {
337 struct isolate_runner tmp;
338 enum bidi_property last_strong_prop = ir->sos, prop;
339
340 ir_init(ir->buf, ir->buflen, ir->start, ir->paragraph_level, false,
341 &tmp);
342 for (; !ir_advance(&tmp) && tmp.cur.off < ir->cur.off;) {
343 prop = ir_get_current_prop(&tmp);
344
345 if (prop == BIDI_PROP_R || prop == BIDI_PROP_L ||
346 prop == BIDI_PROP_AL) {
347 last_strong_prop = prop;
348 }
349 }
350
351 return last_strong_prop;
352 }
353
354 static enum bidi_property
355 ir_get_last_strong_or_number_prop(const struct isolate_runner *ir)
356 {
357 struct isolate_runner tmp;
358 enum bidi_property last_strong_or_number_prop = ir->sos, prop;
359
360 ir_init(ir->buf, ir->buflen, ir->start, ir->paragraph_level, false,
361 &tmp);
362 for (; !ir_advance(&tmp) && tmp.cur.off < ir->cur.off;) {
363 prop = ir_get_current_prop(&tmp);
364
365 if (prop == BIDI_PROP_R || prop == BIDI_PROP_L ||
366 prop == BIDI_PROP_AL || prop == BIDI_PROP_EN ||
367 prop == BIDI_PROP_AN) {
368 last_strong_or_number_prop = prop;
369 }
370 }
371
372 return last_strong_or_number_prop;
373 }
374
375 static void
376 preprocess_bracket_pair(const struct isolate_runner *start,
377 const struct isolate_runner *end)
378 {
379 enum bidi_property prop, bracket_prop, last_strong_or_number_prop;
380 struct isolate_runner ir;
381 size_t strong_type_off;
382
383 /*
384 * check if the bracket contains a strong type (L or R|EN|AN)
385 */
386 for (ir = *start, strong_type_off = SIZE_MAX,
387 bracket_prop = NUM_BIDI_PROPS;
388 !ir_advance(&ir) && ir.cur.off < end->cur.off;) {
389 prop = ir_get_current_prop(&ir);
390
391 if (prop == BIDI_PROP_L) {
392 strong_type_off = ir.cur.off;
393 if (ir.isolating_run_level % 2 == 0) {
394 /*
395 * set the type for both brackets to L (so they
396 * match the strong type they contain)
397 */
398 bracket_prop = BIDI_PROP_L;
399 }
400 } else if (prop == BIDI_PROP_R || prop == BIDI_PROP_EN ||
401 prop == BIDI_PROP_AN) {
402 strong_type_off = ir.cur.off;
403 if (ir.isolating_run_level % 2 != 0) {
404 /*
405 * set the type for both brackets to R (so they
406 * match the strong type they contain)
407 */
408 bracket_prop = BIDI_PROP_R;
409 }
410 }
411 }
412 if (strong_type_off == SIZE_MAX) {
413 /*
414 * there are no strong types within the brackets and we just
415 * leave the brackets as is
416 */
417 return;
418 }
419
420 if (bracket_prop == NUM_BIDI_PROPS) {
421 /*
422 * We encountered a strong type, but it was opposite
423 * to the embedding direction.
424 * Check the previous strong type before the opening
425 * bracket
426 */
427 last_strong_or_number_prop =
428 ir_get_last_strong_or_number_prop(start);
429 if (last_strong_or_number_prop == BIDI_PROP_L &&
430 ir.isolating_run_level % 2 != 0) {
431 /*
432 * the previous strong type is also opposite
433 * to the embedding direction, so the context
434 * was established and we set the brackets
435 * accordingly.
436 */
437 bracket_prop = BIDI_PROP_L;
438 } else if ((last_strong_or_number_prop == BIDI_PROP_R ||
439 last_strong_or_number_prop == BIDI_PROP_EN ||
440 last_strong_or_number_prop == BIDI_PROP_AN) &&
441 ir.isolating_run_level % 2 == 0) {
442 /*
443 * the previous strong type is also opposite
444 * to the embedding direction, so the context
445 * was established and we set the brackets
446 * accordingly.
447 */
448 bracket_prop = BIDI_PROP_R;
449 } else {
450 /* set brackets to the embedding direction */
451 if (ir.isolating_run_level % 2 == 0) {
452 bracket_prop = BIDI_PROP_L;
453 } else {
454 bracket_prop = BIDI_PROP_R;
455 }
456 }
457 }
458
459 ir_set_current_prop(start, bracket_prop);
460 ir_set_current_prop(end, bracket_prop);
461
462 /*
463 * any sequence of NSMs after opening or closing brackets get
464 * the same property as the one we set on the brackets
465 */
466 for (ir = *start; !ir_advance(&ir) && ir_get_current_preserved_prop(
467 &ir) == BIDI_PROP_NSM;) {
468 ir_set_current_prop(&ir, bracket_prop);
469 }
470 for (ir = *end; !ir_advance(&ir) &&
471 ir_get_current_preserved_prop(&ir) == BIDI_PROP_NSM;) {
472 ir_set_current_prop(&ir, bracket_prop);
473 }
474 }
475
476 static void
477 preprocess_bracket_pairs(uint_least32_t *buf, size_t buflen, size_t off,
478 uint_least8_t paragraph_level)
479 {
480 /*
481 * The N0-rule deals with bracket pairs that shall be determined
482 * with the rule BD16. This is specified as an algorithm with a
483 * stack of 63 bracket openings that are used to resolve into a
484 * separate list of pairs, which is then to be sorted by opening
485 * position. Thus, even though the bracketing-depth is limited
486 * by 63, the algorithm, as is, requires dynamic memory
487 * management.
488 *
489 * A naive approach (used by Fribidi) would be to screw the
490 * stack-approach and simply directly determine the
491 * corresponding closing bracket offset for a given opening
492 * bracket, leading to O(n²) time complexity in the worst case
493 * with a lot of brackets. While many brackets are not common,
494 * it is still possible to find a middle ground where you obtain
495 * strongly linear time complexity in most common cases:
496 *
497 * Instead of a stack, we use a FIFO data structure which is
498 * filled with bracket openings in the order of appearance (thus
499 * yielding an implicit sorting!) at the top. If the
500 * corresponding closing bracket is encountered, it is added to
501 * the respective entry, making it ready to "move out" at the
502 * bottom (i.e. passed to the bracket processing). Due to the
503 * nature of the specified pair detection algorithm, which only
504 * cares about the bracket type and nothing else (bidi class,
505 * level, etc.), we can mix processing and bracket detection.
506 *
507 * Obviously, if you, for instance, have one big bracket pair at
508 * the bottom that has not been closed yet, it will block the
509 * bracket processing and the FIFO might hit its capacity limit.
510 * At this point, the blockage is manually resolved using the
511 * naive quadratic approach.
512 *
513 * To remain within the specified standard behaviour, which
514 * mandates that processing of brackets should stop when the
515 * bracketing-depth is at 63, we simply check in an "overflow"
516 * scenario if all 63 elements in the LIFO are unfinished, which
517 * corresponds with such a bracketing depth.
518 */
519 enum bidi_property prop;
520
521 struct {
522 bool complete;
523 size_t bracket_class;
524 struct isolate_runner start;
525 struct isolate_runner end;
526 } fifo[63];
527 const struct bracket *bracket_prop, *tmp_bracket_prop;
528 struct isolate_runner ir, tmp_ir;
529 size_t fifo_len = 0, i, blevel, j, k;
530
531 ir_init(buf, buflen, off, paragraph_level, false, &ir);
532 while (!ir_advance(&ir)) {
533 prop = ir_get_current_prop(&ir);
534 bracket_prop = ir_get_current_bracket_prop(&ir);
535 if (prop == BIDI_PROP_ON &&
536 bracket_prop->type == BIDI_BRACKET_OPEN) {
537 if (fifo_len == LEN(fifo)) {
538 /*
539 * The FIFO is full, check first if it's
540 * completely blocked (i.e. no finished
541 * bracket pairs, triggering the standard
542 * that mandates to abort in such a case
543 */
544 for (i = 0; i < fifo_len; i++) {
545 if (fifo[i].complete) {
546 break;
547 }
548 }
549 if (i == fifo_len) {
550 /* abort processing */
551 return;
552 }
553
554 /*
555 * by construction, the bottom entry
556 * in the FIFO is guaranteed to be
557 * unfinished (given we "consume" all
558 * finished bottom entries after each
559 * iteration).
560 *
561 * iterate, starting after the opening
562 * bracket, and find the corresponding
563 * closing bracket.
564 *
565 * if we find none, just drop the FIFO
566 * entry silently
567 */
568 for (tmp_ir = fifo[0].start, blevel = 0;
569 !ir_advance(&tmp_ir);) {
570 tmp_bracket_prop =
571 ir_get_current_bracket_prop(
572 &tmp_ir);
573
574 if (tmp_bracket_prop->type ==
575 BIDI_BRACKET_OPEN &&
576 tmp_bracket_prop->class ==
577 bracket_prop->class) {
578 /* we encountered another
579 * opening bracket of the
580 * same class */
581 blevel++;
582
583 } else if (tmp_bracket_prop->type ==
584 BIDI_BRACKET_CLOSE &&
585 tmp_bracket_prop->class ==
586 bracket_prop
587 ->class) {
588 /* we encountered a closing
589 * bracket of the same class
590 */
591 if (blevel == 0) {
592 /* this is the
593 * corresponding
594 * closing bracket
595 */
596 fifo[0].complete = true;
597 fifo[0].end = ir;
598 } else {
599 blevel--;
600 }
601 }
602 }
603 if (fifo[0].complete) {
604 /* we found the matching bracket */
605 preprocess_bracket_pair(
606 &(fifo[i].start),
607 &(fifo[i].end));
608 }
609
610 /* shift FIFO one to the left */
611 for (i = 1; i < fifo_len; i++) {
612 fifo[i - 1] = fifo[i];
613 }
614 fifo_len--;
615 }
616
617 /* add element to the FIFO */
618 fifo_len++;
619 fifo[fifo_len - 1].complete = false;
620 fifo[fifo_len - 1].bracket_class = bracket_prop->class;
621 fifo[fifo_len - 1].start = ir;
622 } else if (prop == BIDI_PROP_ON &&
623 bracket_prop->type == BIDI_BRACKET_CLOSE) {
624 /*
625 * go backwards in the FIFO, skip finished entries
626 * and simply ignore (do nothing) the closing
627 * bracket if we do not match anything
628 */
629 for (i = fifo_len; i > 0; i--) {
630 if (bracket_prop->class ==
631 fifo[i - 1].bracket_class &&
632 !fifo[i - 1].complete) {
633 /* we have found a pair */
634 fifo[i - 1].complete = true;
635 fifo[i - 1].end = ir;
636
637 /* remove all uncompleted FIFO elements
638 * above i - 1 */
639 for (j = i; j < fifo_len;) {
640 if (fifo[j].complete) {
641 j++;
642 continue;
643 }
644
645 /* shift FIFO one to the left */
646 for (k = j + 1; k < fifo_len;
647 k++) {
648 fifo[k - 1] = fifo[k];
649 }
650 fifo_len--;
651 }
652 break;
653 }
654 }
655 }
656
657 /* process all ready bracket pairs from the FIFO bottom */
658 while (fifo_len > 0 && fifo[0].complete) {
659 preprocess_bracket_pair(&(fifo[0].start),
660 &(fifo[0].end));
661
662 /* shift FIFO one to the left */
663 for (j = 0; j + 1 < fifo_len; j++) {
664 fifo[j] = fifo[j + 1];
665 }
666 fifo_len--;
667 }
668 }
669
670 /*
671 * afterwards, we still might have unfinished bracket pairs
672 * that will remain as such, but the subsequent finished pairs
673 * also need to be taken into account, so we go through the
674 * FIFO once more and process all finished pairs
675 */
676 for (i = 0; i < fifo_len; i++) {
677 if (fifo[i].complete) {
678 preprocess_bracket_pair(&(fifo[i].start),
679 &(fifo[i].end));
680 }
681 }
682 }
683
684 static size_t
685 preprocess_isolating_run_sequence(uint_least32_t *buf, size_t buflen,
686 size_t off, uint_least8_t paragraph_level)
687 {
688 enum bidi_property sequence_prop, prop;
689 struct isolate_runner ir, tmp;
690 size_t runsince, sequence_end;
691
692 /* W1 */
693 ir_init(buf, buflen, off, paragraph_level, false, &ir);
694 while (!ir_advance(&ir)) {
695 if (ir_get_current_prop(&ir) == BIDI_PROP_NSM) {
696 prop = ir_get_previous_prop(&ir);
697
698 if (prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
699 prop == BIDI_PROP_FSI || prop == BIDI_PROP_PDI) {
700 ir_set_current_prop(&ir, BIDI_PROP_ON);
701 } else {
702 ir_set_current_prop(&ir, prop);
703 }
704 }
705 }
706
707 /* W2 */
708 ir_init(buf, buflen, off, paragraph_level, false, &ir);
709 while (!ir_advance(&ir)) {
710 if (ir_get_current_prop(&ir) == BIDI_PROP_EN &&
711 ir_get_last_strong_prop(&ir) == BIDI_PROP_AL) {
712 ir_set_current_prop(&ir, BIDI_PROP_AN);
713 }
714 }
715
716 /* W3 */
717 ir_init(buf, buflen, off, paragraph_level, false, &ir);
718 while (!ir_advance(&ir)) {
719 if (ir_get_current_prop(&ir) == BIDI_PROP_AL) {
720 ir_set_current_prop(&ir, BIDI_PROP_R);
721 }
722 }
723
724 /* W4 */
725 ir_init(buf, buflen, off, paragraph_level, false, &ir);
726 while (!ir_advance(&ir)) {
727 if (ir_get_previous_prop(&ir) == BIDI_PROP_EN &&
728 (ir_get_current_prop(&ir) == BIDI_PROP_ES ||
729 ir_get_current_prop(&ir) == BIDI_PROP_CS) &&
730 ir_get_next_prop(&ir) == BIDI_PROP_EN) {
731 ir_set_current_prop(&ir, BIDI_PROP_EN);
732 }
733
734 if (ir_get_previous_prop(&ir) == BIDI_PROP_AN &&
735 ir_get_current_prop(&ir) == BIDI_PROP_CS &&
736 ir_get_next_prop(&ir) == BIDI_PROP_AN) {
737 ir_set_current_prop(&ir, BIDI_PROP_AN);
738 }
739 }
740
741 /* W5 */
742 runsince = SIZE_MAX;
743 ir_init(buf, buflen, off, paragraph_level, false, &ir);
744 while (!ir_advance(&ir)) {
745 if (ir_get_current_prop(&ir) == BIDI_PROP_ET) {
746 if (runsince == SIZE_MAX) {
747 /* a new run has begun */
748 runsince = ir.cur.off;
749 }
750 } else if (ir_get_current_prop(&ir) == BIDI_PROP_EN) {
751 /* set the preceding sequence */
752 if (runsince != SIZE_MAX) {
753 ir_init(buf, buflen, runsince, paragraph_level,
754 (runsince > off), &tmp);
755 while (!ir_advance(&tmp) &&
756 tmp.cur.off < ir.cur.off) {
757 ir_set_current_prop(&tmp, BIDI_PROP_EN);
758 }
759 runsince = SIZE_MAX;
760 } else {
761 ir_init(buf, buflen, ir.cur.off,
762 paragraph_level, (ir.cur.off > off),
763 &tmp);
764 ir_advance(&tmp);
765 }
766 /* follow the succeeding sequence */
767 while (!ir_advance(&tmp)) {
768 if (ir_get_current_prop(&tmp) != BIDI_PROP_ET) {
769 break;
770 }
771 ir_set_current_prop(&tmp, BIDI_PROP_EN);
772 }
773 } else {
774 /* sequence ended */
775 runsince = SIZE_MAX;
776 }
777 }
778
779 /* W6 */
780 ir_init(buf, buflen, off, paragraph_level, false, &ir);
781 while (!ir_advance(&ir)) {
782 prop = ir_get_current_prop(&ir);
783
784 if (prop == BIDI_PROP_ES || prop == BIDI_PROP_ET ||
785 prop == BIDI_PROP_CS) {
786 ir_set_current_prop(&ir, BIDI_PROP_ON);
787 }
788 }
789
790 /* W7 */
791 ir_init(buf, buflen, off, paragraph_level, false, &ir);
792 while (!ir_advance(&ir)) {
793 if (ir_get_current_prop(&ir) == BIDI_PROP_EN &&
794 ir_get_last_strong_prop(&ir) == BIDI_PROP_L) {
795 ir_set_current_prop(&ir, BIDI_PROP_L);
796 }
797 }
798
799 /* N0 */
800 preprocess_bracket_pairs(buf, buflen, off, paragraph_level);
801
802 /* N1 */
803 sequence_end = SIZE_MAX;
804 sequence_prop = NUM_BIDI_PROPS;
805 ir_init(buf, buflen, off, paragraph_level, false, &ir);
806 while (!ir_advance(&ir)) {
807 if (sequence_end == SIZE_MAX) {
808 prop = ir_get_current_prop(&ir);
809
810 if (prop == BIDI_PROP_B || prop == BIDI_PROP_S ||
811 prop == BIDI_PROP_WS || prop == BIDI_PROP_ON ||
812 prop == BIDI_PROP_FSI || prop == BIDI_PROP_LRI ||
813 prop == BIDI_PROP_RLI || prop == BIDI_PROP_PDI) {
814 /* the current character is an NI (neutral
815 * or isolate) */
816
817 /* scan ahead to the end of the NI-sequence
818 */
819 ir_init(buf, buflen, ir.cur.off,
820 paragraph_level, (ir.cur.off > off),
821 &tmp);
822 while (!ir_advance(&tmp)) {
823 prop = ir_get_next_prop(&tmp);
824
825 if (prop != BIDI_PROP_B &&
826 prop != BIDI_PROP_S &&
827 prop != BIDI_PROP_WS &&
828 prop != BIDI_PROP_ON &&
829 prop != BIDI_PROP_FSI &&
830 prop != BIDI_PROP_LRI &&
831 prop != BIDI_PROP_RLI &&
832 prop != BIDI_PROP_PDI) {
833 break;
834 }
835 }
836
837 /*
838 * check what follows and see if the text
839 * has the same direction on both sides
840 */
841 if (ir_get_previous_prop(&ir) == BIDI_PROP_L &&
842 ir_get_next_prop(&tmp) == BIDI_PROP_L) {
843 sequence_end = tmp.cur.off;
844 sequence_prop = BIDI_PROP_L;
845 } else if ((ir_get_previous_prop(&ir) ==
846 BIDI_PROP_R ||
847 ir_get_previous_prop(&ir) ==
848 BIDI_PROP_EN ||
849 ir_get_previous_prop(&ir) ==
850 BIDI_PROP_AN) &&
851 (ir_get_next_prop(&tmp) ==
852 BIDI_PROP_R ||
853 ir_get_next_prop(&tmp) ==
854 BIDI_PROP_EN ||
855 ir_get_next_prop(&tmp) ==
856 BIDI_PROP_AN)) {
857 sequence_end = tmp.cur.off;
858 sequence_prop = BIDI_PROP_R;
859 }
860 }
861 }
862
863 if (sequence_end != SIZE_MAX) {
864 if (ir.cur.off <= sequence_end) {
865 ir_set_current_prop(&ir, sequence_prop);
866 } else {
867 /* end of sequence, reset */
868 sequence_end = SIZE_MAX;
869 sequence_prop = NUM_BIDI_PROPS;
870 }
871 }
872 }
873
874 /* N2 */
875 ir_init(buf, buflen, off, paragraph_level, false, &ir);
876 while (!ir_advance(&ir)) {
877 prop = ir_get_current_prop(&ir);
878
879 if (prop == BIDI_PROP_B || prop == BIDI_PROP_S ||
880 prop == BIDI_PROP_WS || prop == BIDI_PROP_ON ||
881 prop == BIDI_PROP_FSI || prop == BIDI_PROP_LRI ||
882 prop == BIDI_PROP_RLI || prop == BIDI_PROP_PDI) {
883 /* N2 */
884 if (ir_get_current_level(&ir) % 2 == 0) {
885 /* even embedding level */
886 ir_set_current_prop(&ir, BIDI_PROP_L);
887 } else {
888 /* odd embedding level */
889 ir_set_current_prop(&ir, BIDI_PROP_R);
890 }
891 }
892 }
893
894 return 0;
895 }
896
897 static uint_least8_t
898 get_isolated_paragraph_level(const uint_least32_t *state, size_t statelen)
899 {
900 enum bidi_property prop;
901 int_least8_t isolate_level;
902 size_t stateoff;
903
904 /* determine paragraph level (rules P1-P3) and terminate on PDI */
905 for (stateoff = 0, isolate_level = 0; stateoff < statelen; stateoff++) {
906 prop = (uint_least8_t)get_state(STATE_PROP, state[stateoff]);
907
908 if (prop == BIDI_PROP_PDI && isolate_level == 0) {
909 /*
910 * we are in a FSI-subsection of a paragraph and
911 * matched with the terminating PDI
912 */
913 break;
914 }
915
916 /* BD8/BD9 */
917 if ((prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
918 prop == BIDI_PROP_FSI) &&
919 isolate_level < MAX_DEPTH) {
920 /* we hit an isolate initiator, increment counter */
921 isolate_level++;
922 } else if (prop == BIDI_PROP_PDI && isolate_level > 0) {
923 isolate_level--;
924 }
925
926 /* P2 */
927 if (isolate_level > 0) {
928 continue;
929 }
930
931 /* P3 */
932 if (prop == BIDI_PROP_L) {
933 return 0;
934 } else if (prop == BIDI_PROP_AL || prop == BIDI_PROP_R) {
935 return 1;
936 }
937 }
938
939 return 0;
940 }
941
942 static inline uint_least8_t
943 get_bidi_property(uint_least32_t cp)
944 {
945 if (likely(cp <= 0x10FFFF)) {
946 return (bidi_minor[bidi_major[cp >> 8] + (cp & 0xff)]) &
947 0x1F /* 00011111 */;
948 } else {
949 return BIDI_PROP_L;
950 }
951 }
952
953 static uint_least8_t
954 get_paragraph_level(enum grapheme_bidirectional_direction override,
955 const HERODOTUS_READER *r)
956 {
957 HERODOTUS_READER tmp;
958 enum bidi_property prop;
959 int_least8_t isolate_level;
960 uint_least32_t cp;
961
962 /* check overrides first according to rule HL1 */
963 if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
964 return 0;
965 } else if (override == GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
966 return 1;
967 }
968
969 /* copy reader into temporary reader */
970 herodotus_reader_copy(r, &tmp);
971
972 /* determine paragraph level (rules P1-P3) */
973 for (isolate_level = 0; herodotus_read_codepoint(&tmp, true, &cp) ==
974 HERODOTUS_STATUS_SUCCESS;) {
975 prop = get_bidi_property(cp);
976
977 /* BD8/BD9 */
978 if ((prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
979 prop == BIDI_PROP_FSI) &&
980 isolate_level < MAX_DEPTH) {
981 /* we hit an isolate initiator, increment counter */
982 isolate_level++;
983 } else if (prop == BIDI_PROP_PDI && isolate_level > 0) {
984 isolate_level--;
985 }
986
987 /* P2 */
988 if (isolate_level > 0) {
989 continue;
990 }
991
992 /* P3 */
993 if (prop == BIDI_PROP_L) {
994 return 0;
995 } else if (prop == BIDI_PROP_AL || prop == BIDI_PROP_R) {
996 return 1;
997 }
998 }
999
1000 return 0;
1001 }
1002
1003 static void
1004 preprocess_paragraph(uint_least8_t paragraph_level, uint_least32_t *buf,
1005 size_t buflen)
1006 {
1007 enum bidi_property prop;
1008 int_least8_t level;
1009
1010 struct {
1011 int_least8_t level;
1012 enum grapheme_bidirectional_direction override;
1013 bool directional_isolate;
1014 } directional_status[MAX_DEPTH + 2], *dirstat = directional_status;
1015
1016 size_t overflow_isolate_count, overflow_embedding_count,
1017 valid_isolate_count, bufoff, i, runsince;
1018
1019 /* X1 */
1020 dirstat->level = (int_least8_t)paragraph_level;
1021 dirstat->override = GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
1022 dirstat->directional_isolate = false;
1023 overflow_isolate_count = overflow_embedding_count =
1024 valid_isolate_count = 0;
1025
1026 for (bufoff = 0; bufoff < buflen; bufoff++) {
1027 prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]);
1028
1029 /* set paragraph level we need for line-level-processing */
1030 set_state(STATE_PARAGRAPH_LEVEL, paragraph_level,
1031 &(buf[bufoff]));
1032 again:
1033 if (prop == BIDI_PROP_RLE) {
1034 /* X2 */
1035 if (dirstat->level + (dirstat->level % 2 != 0) + 1 <=
1036 MAX_DEPTH &&
1037 overflow_isolate_count == 0 &&
1038 overflow_embedding_count == 0) {
1039 /* valid RLE */
1040 dirstat++;
1041 dirstat->level =
1042 (dirstat - 1)->level +
1043 ((dirstat - 1)->level % 2 != 0) + 1;
1044 dirstat->override =
1045 GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
1046 dirstat->directional_isolate = false;
1047 } else {
1048 /* overflow RLE */
1049 overflow_embedding_count +=
1050 (overflow_isolate_count == 0);
1051 }
1052 } else if (prop == BIDI_PROP_LRE) {
1053 /* X3 */
1054 if (dirstat->level + (dirstat->level % 2 == 0) + 1 <=
1055 MAX_DEPTH &&
1056 overflow_isolate_count == 0 &&
1057 overflow_embedding_count == 0) {
1058 /* valid LRE */
1059 dirstat++;
1060 dirstat->level =
1061 (dirstat - 1)->level +
1062 ((dirstat - 1)->level % 2 == 0) + 1;
1063 dirstat->override =
1064 GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
1065 dirstat->directional_isolate = false;
1066 } else {
1067 /* overflow LRE */
1068 overflow_embedding_count +=
1069 (overflow_isolate_count == 0);
1070 }
1071 } else if (prop == BIDI_PROP_RLO) {
1072 /* X4 */
1073 if (dirstat->level + (dirstat->level % 2 != 0) + 1 <=
1074 MAX_DEPTH &&
1075 overflow_isolate_count == 0 &&
1076 overflow_embedding_count == 0) {
1077 /* valid RLO */
1078 dirstat++;
1079 dirstat->level =
1080 (dirstat - 1)->level +
1081 ((dirstat - 1)->level % 2 != 0) + 1;
1082 dirstat->override =
1083 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
1084 dirstat->directional_isolate = false;
1085 } else {
1086 /* overflow RLO */
1087 overflow_embedding_count +=
1088 (overflow_isolate_count == 0);
1089 }
1090 } else if (prop == BIDI_PROP_LRO) {
1091 /* X5 */
1092 if (dirstat->level + (dirstat->level % 2 == 0) + 1 <=
1093 MAX_DEPTH &&
1094 overflow_isolate_count == 0 &&
1095 overflow_embedding_count == 0) {
1096 /* valid LRE */
1097 dirstat++;
1098 dirstat->level =
1099 (dirstat - 1)->level +
1100 ((dirstat - 1)->level % 2 == 0) + 1;
1101 dirstat->override =
1102 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR;
1103 dirstat->directional_isolate = false;
1104 } else {
1105 /* overflow LRO */
1106 overflow_embedding_count +=
1107 (overflow_isolate_count == 0);
1108 }
1109 } else if (prop == BIDI_PROP_RLI) {
1110 /* X5a */
1111 set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
1112 if (dirstat->override ==
1113 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
1114 set_state(STATE_PROP, BIDI_PROP_L,
1115 &(buf[bufoff]));
1116 } else if (dirstat->override ==
1117 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
1118 set_state(STATE_PROP, BIDI_PROP_R,
1119 &(buf[bufoff]));
1120 }
1121
1122 if (dirstat->level + (dirstat->level % 2 != 0) + 1 <=
1123 MAX_DEPTH &&
1124 overflow_isolate_count == 0 &&
1125 overflow_embedding_count == 0) {
1126 /* valid RLI */
1127 valid_isolate_count++;
1128
1129 dirstat++;
1130 dirstat->level =
1131 (dirstat - 1)->level +
1132 ((dirstat - 1)->level % 2 != 0) + 1;
1133 dirstat->override =
1134 GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
1135 dirstat->directional_isolate = true;
1136 } else {
1137 /* overflow RLI */
1138 overflow_isolate_count++;
1139 }
1140 } else if (prop == BIDI_PROP_LRI) {
1141 /* X5b */
1142 set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
1143 if (dirstat->override ==
1144 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
1145 set_state(STATE_PROP, BIDI_PROP_L,
1146 &(buf[bufoff]));
1147 } else if (dirstat->override ==
1148 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
1149 set_state(STATE_PROP, BIDI_PROP_R,
1150 &(buf[bufoff]));
1151 }
1152
1153 if (dirstat->level + (dirstat->level % 2 == 0) + 1 <=
1154 MAX_DEPTH &&
1155 overflow_isolate_count == 0 &&
1156 overflow_embedding_count == 0) {
1157 /* valid LRI */
1158 valid_isolate_count++;
1159
1160 dirstat++;
1161 dirstat->level =
1162 (dirstat - 1)->level +
1163 ((dirstat - 1)->level % 2 == 0) + 1;
1164 dirstat->override =
1165 GRAPHEME_BIDIRECTIONAL_DIRECTION_NEUTRAL;
1166 dirstat->directional_isolate = true;
1167 } else {
1168 /* overflow LRI */
1169 overflow_isolate_count++;
1170 }
1171 } else if (prop == BIDI_PROP_FSI) {
1172 /* X5c */
1173 if (get_isolated_paragraph_level(
1174 buf + (bufoff + 1),
1175 buflen - (bufoff + 1)) == 1) {
1176 prop = BIDI_PROP_RLI;
1177 goto again;
1178 } else { /* ... == 0 */
1179 prop = BIDI_PROP_LRI;
1180 goto again;
1181 }
1182 } else if (prop != BIDI_PROP_B && prop != BIDI_PROP_BN &&
1183 prop != BIDI_PROP_PDF && prop != BIDI_PROP_PDI) {
1184 /* X6 */
1185 set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
1186 if (dirstat->override ==
1187 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
1188 set_state(STATE_PROP, BIDI_PROP_L,
1189 &(buf[bufoff]));
1190 } else if (dirstat->override ==
1191 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
1192 set_state(STATE_PROP, BIDI_PROP_R,
1193 &(buf[bufoff]));
1194 }
1195 } else if (prop == BIDI_PROP_PDI) {
1196 /* X6a */
1197 if (overflow_isolate_count > 0) {
1198 /* PDI matches an overflow isolate initiator
1199 */
1200 overflow_isolate_count--;
1201 } else if (valid_isolate_count > 0) {
1202 /* PDI matches a normal isolate initiator */
1203 overflow_embedding_count = 0;
1204 while (dirstat->directional_isolate == false &&
1205 dirstat > directional_status) {
1206 /*
1207 * we are safe here as given the
1208 * valid isolate count is positive
1209 * there must be a stack-entry
1210 * with positive directional
1211 * isolate status, but we take
1212 * no chances and include an
1213 * explicit check
1214 *
1215 * POSSIBLE OPTIMIZATION: Whenever
1216 * we push on the stack, check if it
1217 * has the directional isolate
1218 * status true and store a pointer
1219 * to it so we can jump to it very
1220 * quickly.
1221 */
1222 dirstat--;
1223 }
1224
1225 /*
1226 * as above, the following check is not
1227 * necessary, given we are guaranteed to
1228 * have at least one stack entry left,
1229 * but it's better to be safe
1230 */
1231 if (dirstat > directional_status) {
1232 dirstat--;
1233 }
1234 valid_isolate_count--;
1235 }
1236
1237 set_state(STATE_LEVEL, dirstat->level, &(buf[bufoff]));
1238 if (dirstat->override ==
1239 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR) {
1240 set_state(STATE_PROP, BIDI_PROP_L,
1241 &(buf[bufoff]));
1242 } else if (dirstat->override ==
1243 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL) {
1244 set_state(STATE_PROP, BIDI_PROP_R,
1245 &(buf[bufoff]));
1246 }
1247 } else if (prop == BIDI_PROP_PDF) {
1248 /* X7 */
1249 if (overflow_isolate_count > 0) {
1250 /* do nothing */
1251 } else if (overflow_embedding_count > 0) {
1252 overflow_embedding_count--;
1253 } else if (dirstat->directional_isolate == false &&
1254 dirstat > directional_status) {
1255 dirstat--;
1256 }
1257 } else if (prop == BIDI_PROP_B) {
1258 /* X8 */
1259 set_state(STATE_LEVEL, paragraph_level, &(buf[bufoff]));
1260 }
1261
1262 /* X9 */
1263 if (prop == BIDI_PROP_RLE || prop == BIDI_PROP_LRE ||
1264 prop == BIDI_PROP_RLO || prop == BIDI_PROP_LRO ||
1265 prop == BIDI_PROP_PDF || prop == BIDI_PROP_BN) {
1266 set_state(STATE_LEVEL, -1, &(buf[bufoff]));
1267 }
1268 }
1269
1270 /* X10 (W1-W7, N0-N2) */
1271 for (bufoff = 0; bufoff < buflen; bufoff++) {
1272 if (get_state(STATE_VISITED, buf[bufoff]) == 0 &&
1273 get_state(STATE_LEVEL, buf[bufoff]) != -1) {
1274 bufoff += preprocess_isolating_run_sequence(
1275 buf, buflen, bufoff, paragraph_level);
1276 }
1277 }
1278
1279 /*
1280 * I1-I2 (given our sequential approach to processing the
1281 * isolating run sequences, we apply this rule separately)
1282 */
1283 for (bufoff = 0; bufoff < buflen; bufoff++) {
1284 level = (int_least8_t)get_state(STATE_LEVEL, buf[bufoff]);
1285 prop = (uint_least8_t)get_state(STATE_PROP, buf[bufoff]);
1286
1287 if (level % 2 == 0) {
1288 /* even level */
1289 if (prop == BIDI_PROP_R) {
1290 set_state(STATE_LEVEL, level + 1,
1291 &(buf[bufoff]));
1292 } else if (prop == BIDI_PROP_AN ||
1293 prop == BIDI_PROP_EN) {
1294 set_state(STATE_LEVEL, level + 2,
1295 &(buf[bufoff]));
1296 }
1297 } else {
1298 /* odd level */
1299 if (prop == BIDI_PROP_L || prop == BIDI_PROP_EN ||
1300 prop == BIDI_PROP_AN) {
1301 set_state(STATE_LEVEL, level + 1,
1302 &(buf[bufoff]));
1303 }
1304 }
1305 }
1306
1307 /* L1 (rules 1-3) */
1308 runsince = SIZE_MAX;
1309 for (bufoff = 0; bufoff < buflen; bufoff++) {
1310 level = (int_least8_t)get_state(STATE_LEVEL, buf[bufoff]);
1311 prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP,
1312 buf[bufoff]);
1313
1314 if (level == -1) {
1315 /* ignored character */
1316 continue;
1317 }
1318
1319 /* rules 1 and 2 */
1320 if (prop == BIDI_PROP_S || prop == BIDI_PROP_B) {
1321 set_state(STATE_LEVEL, paragraph_level, &(buf[bufoff]));
1322 }
1323
1324 /* rule 3 */
1325 if (prop == BIDI_PROP_WS || prop == BIDI_PROP_FSI ||
1326 prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
1327 prop == BIDI_PROP_PDI) {
1328 if (runsince == SIZE_MAX) {
1329 /* a new run has begun */
1330 runsince = bufoff;
1331 }
1332 } else if ((prop == BIDI_PROP_S || prop == BIDI_PROP_B) &&
1333 runsince != SIZE_MAX) {
1334 /*
1335 * we hit a segment or paragraph separator in a
1336 * sequence, reset sequence-levels
1337 */
1338 for (i = runsince; i < bufoff; i++) {
1339 if (get_state(STATE_LEVEL, buf[i]) != -1) {
1340 set_state(STATE_LEVEL, paragraph_level,
1341 &(buf[i]));
1342 }
1343 }
1344 runsince = SIZE_MAX;
1345 } else {
1346 /* sequence ended */
1347 runsince = SIZE_MAX;
1348 }
1349 }
1350 if (runsince != SIZE_MAX) {
1351 /*
1352 * this is the end of the paragraph and we
1353 * are in a run
1354 */
1355 for (i = runsince; i < buflen; i++) {
1356 if (get_state(STATE_LEVEL, buf[i]) != -1) {
1357 set_state(STATE_LEVEL, paragraph_level,
1358 &(buf[i]));
1359 }
1360 }
1361 runsince = SIZE_MAX;
1362 }
1363 }
1364
1365 static inline uint_least8_t
1366 get_bidi_bracket_off(uint_least32_t cp)
1367 {
1368 if (likely(cp <= 0x10FFFF)) {
1369 return (uint_least8_t)((bidi_minor[bidi_major[cp >> 8] +
1370 (cp & 0xff)]) >>
1371 5);
1372 } else {
1373 return 0;
1374 }
1375 }
1376
1377 static size_t
1378 preprocess(HERODOTUS_READER *r, enum grapheme_bidirectional_direction override,
1379 uint_least32_t *buf, size_t buflen,
1380 enum grapheme_bidirectional_direction *resolved)
1381 {
1382 HERODOTUS_READER tmp;
1383 size_t bufoff, bufsize, paragraph_len;
1384 uint_least32_t cp;
1385 uint_least8_t paragraph_level;
1386
1387 /* determine length and level of the paragraph */
1388 herodotus_reader_copy(r, &tmp);
1389 for (; herodotus_read_codepoint(&tmp, true, &cp) ==
1390 HERODOTUS_STATUS_SUCCESS;) {
1391 /* break on paragraph separator */
1392 if (get_bidi_property(cp) == BIDI_PROP_B) {
1393 break;
1394 }
1395 }
1396 paragraph_len = herodotus_reader_number_read(&tmp);
1397 paragraph_level = get_paragraph_level(override, r);
1398
1399 if (resolved != NULL) {
1400 /* store resolved paragraph level in output variable */
1401 /* TODO use enum-type */
1402 *resolved = (paragraph_level == 0) ?
1403 GRAPHEME_BIDIRECTIONAL_DIRECTION_LTR :
1404 GRAPHEME_BIDIRECTIONAL_DIRECTION_RTL;
1405 }
1406
1407 if (buf == NULL) {
1408 /* see below for return value reasoning */
1409 return paragraph_len;
1410 }
1411
1412 /*
1413 * the first step is to determine the bidirectional properties
1414 * and store them in the buffer
1415 */
1416 for (bufoff = 0;
1417 bufoff < paragraph_len &&
1418 herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCESS;
1419 bufoff++) {
1420 if (bufoff < buflen) {
1421 /*
1422 * actually only do something when we have
1423 * space in the level-buffer. We continue
1424 * the iteration to be able to give a good
1425 * return value
1426 */
1427 set_state(STATE_PROP,
1428 (uint_least8_t)get_bidi_property(cp),
1429 &(buf[bufoff]));
1430 set_state(STATE_BRACKET_OFF, get_bidi_bracket_off(cp),
1431 &(buf[bufoff]));
1432 set_state(STATE_LEVEL, 0, &(buf[bufoff]));
1433 set_state(STATE_PARAGRAPH_LEVEL, 0, &(buf[bufoff]));
1434 set_state(STATE_VISITED, 0, &(buf[bufoff]));
1435 set_state(STATE_PRESERVED_PROP,
1436 (uint_least8_t)get_bidi_property(cp),
1437 &(buf[bufoff]));
1438 }
1439 }
1440 bufsize = herodotus_reader_number_read(r);
1441
1442 for (bufoff = 0; bufoff < bufsize; bufoff++) {
1443 if (get_state(STATE_PROP, buf[bufoff]) != BIDI_PROP_B &&
1444 bufoff != bufsize - 1) {
1445 continue;
1446 }
1447
1448 /*
1449 * we either encountered a paragraph terminator or this
1450 * is the last character in the string.
1451 * Call the paragraph handler on the paragraph, including
1452 * the terminating character or last character of the
1453 * string respectively
1454 */
1455 preprocess_paragraph(paragraph_level, buf, bufoff + 1);
1456 break;
1457 }
1458
1459 /*
1460 * we return the number of total bytes read, as the function
1461 * should indicate if the given level-buffer is too small
1462 */
1463 return herodotus_reader_number_read(r);
1464 }
1465
1466 size_t
1467 grapheme_bidirectional_preprocess_paragraph(
1468 const uint_least32_t *src, size_t srclen,
1469 enum grapheme_bidirectional_direction override, uint_least32_t *dest,
1470 size_t destlen, enum grapheme_bidirectional_direction *resolved)
1471 {
1472 HERODOTUS_READER r;
1473
1474 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
1475
1476 return preprocess(&r, override, dest, destlen, resolved);
1477 }
1478
1479 static inline size_t
1480 get_line_embedding_levels(const uint_least32_t *linedata, size_t linelen,
1481 int_least8_t (*get_level)(const void *, size_t),
1482 void (*set_level)(void *, size_t, int_least8_t),
1483 void *lev, size_t levsize, bool skipignored)
1484 {
1485 enum bidi_property prop;
1486 size_t i, levlen, runsince;
1487 int_least8_t level, runlevel;
1488
1489 /* rule L1.4 */
1490 runsince = SIZE_MAX;
1491 runlevel = -1;
1492 for (i = 0, levlen = 0; i < linelen; i++) {
1493 level = (int_least8_t)get_state(STATE_LEVEL, linedata[i]);
1494 prop = (uint_least8_t)get_state(STATE_PRESERVED_PROP,
1495 linedata[i]);
1496
1497 /* write level into level array if we still have space */
1498 if (level != -1 || skipignored == false) {
1499 if (levlen <= levsize) {
1500 set_level(lev, levlen, level);
1501 }
1502 levlen++;
1503 }
1504
1505 if (level == -1) {
1506 /* ignored character */
1507 continue;
1508 }
1509
1510 if (prop == BIDI_PROP_WS || prop == BIDI_PROP_FSI ||
1511 prop == BIDI_PROP_LRI || prop == BIDI_PROP_RLI ||
1512 prop == BIDI_PROP_PDI) {
1513 if (runsince == SIZE_MAX) {
1514 /* a new run has begun */
1515 runsince = levlen - 1; /* levlen > 0 */
1516 runlevel = (int_least8_t)get_state(
1517 STATE_PARAGRAPH_LEVEL, linedata[i]);
1518 }
1519 } else {
1520 /* sequence ended */
1521 runsince = SIZE_MAX;
1522 runlevel = -1;
1523 }
1524 }
1525 if (runsince != SIZE_MAX) {
1526 /*
1527 * we hit the end of the line but were in a run;
1528 * reset the line levels to the paragraph level
1529 */
1530 for (i = runsince; i < MIN(linelen, levlen); i++) {
1531 if (get_level(lev, i) != -1) {
1532 set_level(lev, i, runlevel);
1533 }
1534 }
1535 }
1536
1537 return levlen;
1538 }
1539
1540 static inline int_least8_t
1541 get_level_int8(const void *lev, size_t off)
1542 {
1543 return ((const int_least8_t *)lev)[off];
1544 }
1545
1546 static inline void
1547 set_level_int8(void *lev, size_t off, int_least8_t value)
1548 {
1549 ((int_least8_t *)lev)[off] = value;
1550 }
1551
1552 size_t
1553 grapheme_bidirectional_get_line_embedding_levels(const uint_least32_t *linedata,
1554 size_t linelen,
1555 int_least8_t *lev,
1556 size_t levlen)
1557 {
1558 return get_line_embedding_levels(linedata, linelen, get_level_int8,
1559 set_level_int8, lev, levlen, false);
1560 }
1561
1562 static inline int_least8_t
1563 get_level_uint32(const void *lev, size_t off)
1564 {
1565 return (int_least8_t)((((const uint_least32_t *)lev)[off] &
1566 UINT32_C(0x1FE00000)) >>
1567 21) -
1568 1;
1569 }
1570
1571 static inline void
1572 set_level_uint32(void *lev, size_t off, int_least8_t value)
1573 {
1574 ((uint_least32_t *)lev)[off] ^=
1575 ((uint_least32_t *)lev)[off] & UINT32_C(0x1FE00000);
1576 ((uint_least32_t *)lev)[off] |= ((uint_least32_t)(value + 1)) << 21;
1577 }
1578
1579 static inline int_least16_t
1580 get_mirror_offset(uint_least32_t cp)
1581 {
1582 if (cp <= UINT32_C(0x10FFFF)) {
1583 return mirror_minor[mirror_major[cp >> 8] + (cp & 0xFF)];
1584 } else {
1585 return 0;
1586 }
1587 }
1588
1589 size_t
1590 grapheme_bidirectional_reorder_line(const uint_least32_t *line,
1591 const uint_least32_t *linedata,
1592 size_t linelen, uint_least32_t *output,
1593 size_t outputsize)
1594 {
1595 size_t i, outputlen, first, last, j, k, l /*, laststart*/;
1596 int_least8_t level, min_odd_level = MAX_DEPTH + 2, max_level = 0;
1597 uint_least32_t tmp;
1598
1599 /* write output characters (and apply possible mirroring) */
1600 for (i = 0, outputlen = 0; i < linelen; i++) {
1601 if (get_state(STATE_LEVEL, linedata[i]) != -1) {
1602 if (outputlen < outputsize) {
1603 output[outputlen] =
1604 (uint_least32_t)((int_least32_t)
1605 line[i] +
1606 get_mirror_offset(
1607 line[i]));
1608 }
1609 outputlen++;
1610 }
1611 }
1612 if (outputlen >= outputsize) {
1613 /* clear output buffer */
1614 for (i = 0; i < outputsize; i++) {
1615 output[i] = GRAPHEME_INVALID_CODEPOINT;
1616 }
1617
1618 /* return required size */
1619 return outputlen;
1620 }
1621
1622 /*
1623 * write line embedding levels as metadata and codepoints into the
1624 * output
1625 */
1626 get_line_embedding_levels(linedata, linelen, get_level_uint32,
1627 set_level_uint32, output, outputsize, true);
1628
1629 /* determine level range */
1630 for (i = 0; i < outputlen; i++) {
1631 level = get_level_uint32(output, i);
1632
1633 if (level == -1) {
1634 /* ignored character */
1635 continue;
1636 }
1637
1638 if (level % 2 == 1 && level < min_odd_level) {
1639 min_odd_level = level;
1640 }
1641 if (level > max_level) {
1642 max_level = level;
1643 }
1644 }
1645
1646 for (level = max_level; level >= min_odd_level /* > 0 */; level--) {
1647 for (i = 0; i < outputlen; i++) {
1648 if (get_level_uint32(output, i) >= level) {
1649 /*
1650 * the current character has the desired level
1651 */
1652 first = last = i;
1653
1654 /* find the end of the level-sequence */
1655 for (i++; i < outputlen; i++) {
1656 if (get_level_uint32(output, i) >=
1657 level) {
1658 /* the sequence continues */
1659 last = i;
1660 } else {
1661 break;
1662 }
1663 }
1664
1665 /* invert the sequence first..last respecting
1666 * grapheme clusters
1667 *
1668 * The standard only speaks of combining marks
1669 * inversion, but we should in the perfect case
1670 * respect _all_ grapheme clusters, which we do
1671 * here!
1672 */
1673
1674 /* mark grapheme cluster breaks */
1675 for (j = first; j <= last;
1676 j += grapheme_next_character_break(
1677 line + j, outputlen - j)) {
1678 /*
1679 * we use a special trick here: The
1680 * first 21 bits of the state are filled
1681 * with the codepoint, the next 8 bits
1682 * are used for the level, so we can use
1683 * the 30th bit to mark the grapheme
1684 * cluster breaks. This allows us to
1685 * reinvert the grapheme clusters into
1686 * the proper direction later.
1687 */
1688 output[j] |= UINT32_C(1) << 29;
1689 }
1690
1691 /* global inversion */
1692 for (k = first, l = last; k < l; k++, l--) {
1693 /* swap */
1694 tmp = output[k];
1695 output[k] = output[l];
1696 output[l] = tmp;
1697 }
1698
1699 /* grapheme cluster reinversion */
1700 #if 0
1701 for (j = first, laststart = first; j <= last;
1702 j++) {
1703 if (output[j] & (UINT32_C(1) << 29)) {
1704 /* we hit a mark! given the
1705 * grapheme cluster is inverted,
1706 * this means that the cluster
1707 * ended and we now reinvert it
1708 * again
1709 */
1710 for (k = laststart, l = j;
1711 k < l; k++, l--) {
1712 /* swap */
1713 tmp = output[k];
1714 output[k] = output[l];
1715 output[l] = tmp;
1716 }
1717 laststart = j + 1;
1718 }
1719 }
1720 #endif
1721
1722 /* unmark grapheme cluster breaks */
1723 for (j = first; j <= last; j++) {
1724 output[j] ^= output[j] &
1725 UINT32_C(0x20000000);
1726 }
1727 }
1728 }
1729 }
1730
1731 /* remove embedding level metadata */
1732 for (i = 0; i < outputlen; i++) {
1733 output[i] ^= output[i] & UINT32_C(0x1FE00000);
1734 }
1735
1736 return outputlen;
1737 }