line.c - libgrapheme - unicode string library
(HTM) git clone git://git.suckless.org/libgrapheme
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
line.c (14397B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <stdbool.h>
3 #include <stddef.h>
4
5 #include "../gen/line.h"
6 #include "../grapheme.h"
7 #include "util.h"
8
9 static inline enum line_break_property
10 get_break_prop(uint_least32_t cp)
11 {
12 if (likely(cp <= UINT32_C(0x10FFFF))) {
13 return (enum line_break_property)
14 line_break_minor[line_break_major[cp >> 8] +
15 (cp & 0xff)];
16 } else {
17 return LINE_BREAK_PROP_AL;
18 }
19 }
20
21 static size_t
22 next_line_break(HERODOTUS_READER *r)
23 {
24 HERODOTUS_READER tmp;
25 enum line_break_property cp0_prop, cp1_prop, last_non_cm_or_zwj_prop,
26 last_non_sp_prop, last_non_sp_cm_or_zwj_prop;
27 uint_least32_t cp;
28 uint_least8_t lb25_level = 0;
29 bool lb21a_flag = false, ri_even = true;
30
31 /*
32 * Apply line breaking algorithm (UAX #14), see
33 * https://unicode.org/reports/tr14/#Algorithm and tailoring
34 * https://unicode.org/reports/tr14/#Examples (example 7),
35 * given the automatic test-cases implement this example for
36 * better number handling.
37 *
38 */
39
40 /*
41 * Initialize the different properties such that we have
42 * a good state after the state-update in the loop
43 */
44 last_non_cm_or_zwj_prop = LINE_BREAK_PROP_AL; /* according to LB10 */
45 last_non_sp_prop = last_non_sp_cm_or_zwj_prop = NUM_LINE_BREAK_PROPS;
46
47 for (herodotus_read_codepoint(r, true, &cp),
48 cp0_prop = get_break_prop(cp);
49 herodotus_read_codepoint(r, false, &cp) ==
50 HERODOTUS_STATUS_SUCCESS;
51 herodotus_read_codepoint(r, true, &cp), cp0_prop = cp1_prop) {
52 /* get property of the right codepoint */
53 cp1_prop = get_break_prop(cp);
54
55 /* update retention-states */
56
57 /*
58 * store the last observed non-CM-or-ZWJ-property for
59 * LB9 and following.
60 */
61 if (cp0_prop != LINE_BREAK_PROP_CM &&
62 cp0_prop != LINE_BREAK_PROP_ZWJ) {
63 /*
64 * check if the property we are overwriting now is an
65 * HL. If so, we set the LB21a-flag which depends on
66 * this knowledge.
67 */
68 lb21a_flag =
69 (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL);
70
71 /* check regional indicator state */
72 if (cp0_prop == LINE_BREAK_PROP_RI) {
73 /*
74 * The property we just shifted in is
75 * a regional indicator, increasing the
76 * number of consecutive RIs on the left
77 * side of the breakpoint by one, changing
78 * the oddness.
79 *
80 */
81 ri_even = !ri_even;
82 } else {
83 /*
84 * We saw no regional indicator, so the
85 * number of consecutive RIs on the left
86 * side of the breakpoint is zero, which
87 * is an even number.
88 *
89 */
90 ri_even = true;
91 }
92
93 /*
94 * Here comes a bit of magic. The tailored rule
95 * LB25 (using example 7) has a very complicated
96 * left-hand-side-rule of the form
97 *
98 * NU (NU | SY | IS)* (CL | CP)?
99 *
100 * but instead of backtracking, we keep the state
101 * as some kind of "power level" in the variable
102 *
103 * lb25_level
104 *
105 * that goes from 0 to 3
106 *
107 * 0: we are not in the sequence
108 * 1: we have one NU to the left of the middle
109 * spot
110 * 2: we have one NU and one or more (NU | SY | IS)
111 * to the left of the middle spot
112 * 3: we have one NU, zero or more (NU | SY | IS)
113 * and one (CL | CP) to the left of the middle
114 * spot
115 */
116 if ((lb25_level == 0 || lb25_level == 1) &&
117 cp0_prop == LINE_BREAK_PROP_NU) {
118 /* sequence has begun */
119 lb25_level = 1;
120 } else if ((lb25_level == 1 || lb25_level == 2) &&
121 (cp0_prop == LINE_BREAK_PROP_NU ||
122 cp0_prop == LINE_BREAK_PROP_SY ||
123 cp0_prop == LINE_BREAK_PROP_IS)) {
124 /* (NU | SY | IS) sequence begins or continued
125 */
126 lb25_level = 2;
127 } else if (
128 (lb25_level == 1 || lb25_level == 2) &&
129 (cp0_prop == LINE_BREAK_PROP_CL ||
130 cp0_prop ==
131 LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
132 cp0_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
133 /* CL or CP at the end of the sequence */
134 lb25_level = 3;
135 } else {
136 /* sequence broke */
137 lb25_level = 0;
138 }
139
140 last_non_cm_or_zwj_prop = cp0_prop;
141 }
142
143 /*
144 * store the last observed non-SP-property for LB8, LB14,
145 * LB15, LB16 and LB17. LB8 gets its own unskipped property,
146 * whereas the others build on top of the CM-ZWJ-skipped
147 * properties as they come after LB9
148 */
149 if (cp0_prop != LINE_BREAK_PROP_SP) {
150 last_non_sp_prop = cp0_prop;
151 }
152 if (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP) {
153 last_non_sp_cm_or_zwj_prop = last_non_cm_or_zwj_prop;
154 }
155
156 /* apply the algorithm */
157
158 /* LB4 */
159 if (cp0_prop == LINE_BREAK_PROP_BK) {
160 break;
161 }
162
163 /* LB5 */
164 if (cp0_prop == LINE_BREAK_PROP_CR &&
165 cp1_prop == LINE_BREAK_PROP_LF) {
166 continue;
167 }
168 if (cp0_prop == LINE_BREAK_PROP_CR ||
169 cp0_prop == LINE_BREAK_PROP_LF ||
170 cp0_prop == LINE_BREAK_PROP_NL) {
171 break;
172 }
173
174 /* LB6 */
175 if (cp1_prop == LINE_BREAK_PROP_BK ||
176 cp1_prop == LINE_BREAK_PROP_CR ||
177 cp1_prop == LINE_BREAK_PROP_LF ||
178 cp1_prop == LINE_BREAK_PROP_NL) {
179 continue;
180 }
181
182 /* LB7 */
183 if (cp1_prop == LINE_BREAK_PROP_SP ||
184 cp1_prop == LINE_BREAK_PROP_ZW) {
185 continue;
186 }
187
188 /* LB8 */
189 if (last_non_sp_prop == LINE_BREAK_PROP_ZW) {
190 break;
191 }
192
193 /* LB8a */
194 if (cp0_prop == LINE_BREAK_PROP_ZWJ) {
195 continue;
196 }
197
198 /* LB9 */
199 if ((cp0_prop != LINE_BREAK_PROP_BK &&
200 cp0_prop != LINE_BREAK_PROP_CR &&
201 cp0_prop != LINE_BREAK_PROP_LF &&
202 cp0_prop != LINE_BREAK_PROP_NL &&
203 cp0_prop != LINE_BREAK_PROP_SP &&
204 cp0_prop != LINE_BREAK_PROP_ZW) &&
205 (cp1_prop == LINE_BREAK_PROP_CM ||
206 cp1_prop == LINE_BREAK_PROP_ZWJ)) {
207 /*
208 * given we skip them, we don't break in such
209 * a sequence
210 */
211 continue;
212 }
213
214 /* LB10 is baked into the following rules */
215
216 /* LB11 */
217 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_WJ ||
218 cp1_prop == LINE_BREAK_PROP_WJ) {
219 continue;
220 }
221
222 /* LB12 */
223 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_GL) {
224 continue;
225 }
226
227 /* LB12a */
228 if ((last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP &&
229 last_non_cm_or_zwj_prop != LINE_BREAK_PROP_BA &&
230 last_non_cm_or_zwj_prop != LINE_BREAK_PROP_HY) &&
231 cp1_prop == LINE_BREAK_PROP_GL) {
232 continue;
233 }
234
235 /* LB13 (affected by tailoring for LB25, see example 7) */
236 if (cp1_prop == LINE_BREAK_PROP_EX ||
237 (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_NU &&
238 (cp1_prop == LINE_BREAK_PROP_CL ||
239 cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
240 cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF ||
241 cp1_prop == LINE_BREAK_PROP_IS ||
242 cp1_prop == LINE_BREAK_PROP_SY))) {
243 continue;
244 }
245
246 /* LB14 */
247 if (last_non_sp_cm_or_zwj_prop ==
248 LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
249 last_non_sp_cm_or_zwj_prop ==
250 LINE_BREAK_PROP_OP_WITH_EAW_HWF) {
251 continue;
252 }
253
254 /* LB15 */
255 if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_QU &&
256 (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
257 cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF)) {
258 continue;
259 }
260
261 /* LB16 */
262 if ((last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CL ||
263 last_non_sp_cm_or_zwj_prop ==
264 LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
265 last_non_sp_cm_or_zwj_prop ==
266 LINE_BREAK_PROP_CP_WITH_EAW_HWF) &&
267 cp1_prop == LINE_BREAK_PROP_NS) {
268 continue;
269 }
270
271 /* LB17 */
272 if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_B2 &&
273 cp1_prop == LINE_BREAK_PROP_B2) {
274 continue;
275 }
276
277 /* LB18 */
278 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SP) {
279 break;
280 }
281
282 /* LB19 */
283 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_QU ||
284 cp1_prop == LINE_BREAK_PROP_QU) {
285 continue;
286 }
287
288 /* LB20 */
289 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CB ||
290 cp1_prop == LINE_BREAK_PROP_CB) {
291 break;
292 }
293
294 /* LB21 */
295 if (cp1_prop == LINE_BREAK_PROP_BA ||
296 cp1_prop == LINE_BREAK_PROP_HY ||
297 cp1_prop == LINE_BREAK_PROP_NS ||
298 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BB) {
299 continue;
300 }
301
302 /* LB21a */
303 if (lb21a_flag &&
304 (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY ||
305 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BA)) {
306 continue;
307 }
308
309 /* LB21b */
310 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SY &&
311 cp1_prop == LINE_BREAK_PROP_HL) {
312 continue;
313 }
314
315 /* LB22 */
316 if (cp1_prop == LINE_BREAK_PROP_IN) {
317 continue;
318 }
319
320 /* LB23 */
321 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
322 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
323 cp1_prop == LINE_BREAK_PROP_NU) {
324 continue;
325 }
326 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU &&
327 (cp1_prop == LINE_BREAK_PROP_AL ||
328 cp1_prop == LINE_BREAK_PROP_HL)) {
329 continue;
330 }
331
332 /* LB23a */
333 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
334 (cp1_prop == LINE_BREAK_PROP_ID ||
335 cp1_prop == LINE_BREAK_PROP_EB ||
336 cp1_prop == LINE_BREAK_PROP_EM)) {
337 continue;
338 }
339 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_ID ||
340 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB ||
341 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EM) &&
342 cp1_prop == LINE_BREAK_PROP_PO) {
343 continue;
344 }
345
346 /* LB24 */
347 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
348 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO) &&
349 (cp1_prop == LINE_BREAK_PROP_AL ||
350 cp1_prop == LINE_BREAK_PROP_HL)) {
351 continue;
352 }
353 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
354 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
355 (cp1_prop == LINE_BREAK_PROP_PR ||
356 cp1_prop == LINE_BREAK_PROP_PO)) {
357 continue;
358 }
359
360 /* LB25 (tailored with example 7) */
361 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
362 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO)) {
363 if (cp1_prop == LINE_BREAK_PROP_NU) {
364 continue;
365 }
366
367 /* this stupid rule is the reason why we cannot
368 * simply have a stateful break-detection between
369 * two adjacent codepoints as we have it with
370 * characters.
371 */
372 herodotus_reader_copy(r, &tmp);
373 herodotus_read_codepoint(&tmp, true, &cp);
374 if (herodotus_read_codepoint(&tmp, true, &cp) ==
375 HERODOTUS_STATUS_SUCCESS &&
376 (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
377 cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
378 cp1_prop == LINE_BREAK_PROP_HY)) {
379 if (get_break_prop(cp) == LINE_BREAK_PROP_NU) {
380 continue;
381 }
382 }
383 }
384 if ((last_non_cm_or_zwj_prop ==
385 LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
386 last_non_cm_or_zwj_prop ==
387 LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
388 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY) &&
389 cp1_prop == LINE_BREAK_PROP_NU) {
390 continue;
391 }
392 if (lb25_level == 1 && (cp1_prop == LINE_BREAK_PROP_NU ||
393 cp1_prop == LINE_BREAK_PROP_SY ||
394 cp1_prop == LINE_BREAK_PROP_IS)) {
395 continue;
396 }
397 if ((lb25_level == 1 || lb25_level == 2) &&
398 (cp1_prop == LINE_BREAK_PROP_NU ||
399 cp1_prop == LINE_BREAK_PROP_SY ||
400 cp1_prop == LINE_BREAK_PROP_IS ||
401 cp1_prop == LINE_BREAK_PROP_CL ||
402 cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
403 cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
404 continue;
405 }
406 if ((lb25_level == 1 || lb25_level == 2 || lb25_level == 3) &&
407 (cp1_prop == LINE_BREAK_PROP_PO ||
408 cp1_prop == LINE_BREAK_PROP_PR)) {
409 continue;
410 }
411
412 /* LB26 */
413 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL &&
414 (cp1_prop == LINE_BREAK_PROP_JL ||
415 cp1_prop == LINE_BREAK_PROP_JV ||
416 cp1_prop == LINE_BREAK_PROP_H2 ||
417 cp1_prop == LINE_BREAK_PROP_H3)) {
418 continue;
419 }
420 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
421 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2) &&
422 (cp1_prop == LINE_BREAK_PROP_JV ||
423 cp1_prop == LINE_BREAK_PROP_JT)) {
424 continue;
425 }
426 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
427 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
428 cp1_prop == LINE_BREAK_PROP_JT) {
429 continue;
430 }
431
432 /* LB27 */
433 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL ||
434 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
435 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
436 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2 ||
437 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
438 cp1_prop == LINE_BREAK_PROP_PO) {
439 continue;
440 }
441 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
442 (cp1_prop == LINE_BREAK_PROP_JL ||
443 cp1_prop == LINE_BREAK_PROP_JV ||
444 cp1_prop == LINE_BREAK_PROP_JT ||
445 cp1_prop == LINE_BREAK_PROP_H2 ||
446 cp1_prop == LINE_BREAK_PROP_H3)) {
447 continue;
448 }
449
450 /* LB28 */
451 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
452 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
453 (cp1_prop == LINE_BREAK_PROP_AL ||
454 cp1_prop == LINE_BREAK_PROP_HL)) {
455 continue;
456 }
457
458 /* LB29 */
459 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_IS &&
460 (cp1_prop == LINE_BREAK_PROP_AL ||
461 cp1_prop == LINE_BREAK_PROP_HL)) {
462 continue;
463 }
464
465 /* LB30 */
466 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
467 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL ||
468 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU) &&
469 cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF) {
470 continue;
471 }
472 if (last_non_cm_or_zwj_prop ==
473 LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF &&
474 (cp1_prop == LINE_BREAK_PROP_AL ||
475 cp1_prop == LINE_BREAK_PROP_HL ||
476 cp1_prop == LINE_BREAK_PROP_NU)) {
477 continue;
478 }
479
480 /* LB30a */
481 if (!ri_even && last_non_cm_or_zwj_prop == LINE_BREAK_PROP_RI &&
482 cp1_prop == LINE_BREAK_PROP_RI) {
483 continue;
484 }
485
486 /* LB30b */
487 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB &&
488 cp1_prop == LINE_BREAK_PROP_EM) {
489 continue;
490 }
491 if (last_non_cm_or_zwj_prop ==
492 LINE_BREAK_PROP_BOTH_CN_EXTPICT &&
493 cp1_prop == LINE_BREAK_PROP_EM) {
494 continue;
495 }
496
497 /* LB31 */
498 break;
499 }
500
501 return herodotus_reader_number_read(r);
502 }
503
504 size_t
505 grapheme_next_line_break(const uint_least32_t *str, size_t len)
506 {
507 HERODOTUS_READER r;
508
509 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
510
511 return next_line_break(&r);
512 }
513
514 size_t
515 grapheme_next_line_break_utf8(const char *str, size_t len)
516 {
517 HERODOTUS_READER r;
518
519 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
520
521 return next_line_break(&r);
522 }