line.c - libgrapheme - unicode string library
(HTM) git clone git://git.suckless.org/libgrapheme
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
line.c (11273B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5
6 #include "util.h"
7
8 #define FILE_EAW "data/EastAsianWidth.txt"
9 #define FILE_EMOJI "data/emoji-data.txt"
10 #define FILE_LINE "data/LineBreak.txt"
11
12 static const struct property_spec line_break_property[] = {
13 {
14 .enumname = "AL",
15 .file = FILE_LINE,
16 .ucdname = "AL",
17 },
18 /*
19 * Both extended pictographic and cn are large classes,
20 * but we are only interested in their intersection for LB30b,
21 * so we have the following two temporary classes. At first
22 * the extpict-class is filled, then the cn-class, which leads
23 * to conflicts (that we handle by putting them in the "proper"
24 * class BOTH_CN_EXTPICT). We make use of the fact that there
25 * is no intersection between AL and Cn.
26 *
27 * Any consecutive conflicts are permitted to overwrite
28 * TMP_EXTENDED_PICTOGRAPHIC and TMP_CN, because we don't need
29 * them, and in the final postprocessing we "reset" all
30 * remaining matches (that then didn't fit any of the other
31 * classes) to the generic class AL.
32 */
33 {
34 .enumname = "TMP_CN",
35 .file = FILE_LINE,
36 .ucdname = "Cn",
37 },
38 {
39 .enumname = "TMP_EXTENDED_PICTOGRAPHIC",
40 .file = FILE_EMOJI,
41 .ucdname = "Extended_Pictographic",
42 },
43 /* end of special block */
44 {
45 .enumname = "B2",
46 .file = FILE_LINE,
47 .ucdname = "B2",
48 },
49 {
50 .enumname = "BA",
51 .file = FILE_LINE,
52 .ucdname = "BA",
53 },
54 {
55 .enumname = "BB",
56 .file = FILE_LINE,
57 .ucdname = "BB",
58 },
59 {
60 .enumname = "BK",
61 .file = FILE_LINE,
62 .ucdname = "BK",
63 },
64 {
65 .enumname = "BOTH_CN_EXTPICT",
66 .file = NULL,
67 .ucdname = NULL,
68 },
69 {
70 .enumname = "CB",
71 .file = FILE_LINE,
72 .ucdname = "CB",
73 },
74 {
75 .enumname = "CL",
76 .file = FILE_LINE,
77 .ucdname = "CL",
78 },
79 {
80 .enumname = "CM",
81 .file = FILE_LINE,
82 .ucdname = "CM",
83 },
84 {
85 .enumname = "CP_WITHOUT_EAW_HWF",
86 .file = FILE_LINE,
87 .ucdname = "CP",
88 },
89 {
90 .enumname = "CP_WITH_EAW_HWF",
91 .file = NULL,
92 .ucdname = NULL,
93 },
94 {
95 .enumname = "CR",
96 .file = FILE_LINE,
97 .ucdname = "CR",
98 },
99 {
100 .enumname = "EB",
101 .file = FILE_LINE,
102 .ucdname = "EB",
103 },
104 {
105 .enumname = "EM",
106 .file = FILE_LINE,
107 .ucdname = "EM",
108 },
109 {
110 .enumname = "EX",
111 .file = FILE_LINE,
112 .ucdname = "EX",
113 },
114 {
115 .enumname = "GL",
116 .file = FILE_LINE,
117 .ucdname = "GL",
118 },
119 {
120 .enumname = "H2",
121 .file = FILE_LINE,
122 .ucdname = "H2",
123 },
124 {
125 .enumname = "H3",
126 .file = FILE_LINE,
127 .ucdname = "H3",
128 },
129 {
130 .enumname = "HL",
131 .file = FILE_LINE,
132 .ucdname = "HL",
133 },
134 {
135 .enumname = "HY",
136 .file = FILE_LINE,
137 .ucdname = "HY",
138 },
139 {
140 .enumname = "ID",
141 .file = FILE_LINE,
142 .ucdname = "ID",
143 },
144 {
145 .enumname = "IN",
146 .file = FILE_LINE,
147 .ucdname = "IN",
148 },
149 {
150 .enumname = "IS",
151 .file = FILE_LINE,
152 .ucdname = "IS",
153 },
154 {
155 .enumname = "JL",
156 .file = FILE_LINE,
157 .ucdname = "JL",
158 },
159 {
160 .enumname = "JT",
161 .file = FILE_LINE,
162 .ucdname = "JT",
163 },
164 {
165 .enumname = "JV",
166 .file = FILE_LINE,
167 .ucdname = "JV",
168 },
169 {
170 .enumname = "LF",
171 .file = FILE_LINE,
172 .ucdname = "LF",
173 },
174 {
175 .enumname = "NL",
176 .file = FILE_LINE,
177 .ucdname = "NL",
178 },
179 {
180 .enumname = "NS",
181 .file = FILE_LINE,
182 .ucdname = "NS",
183 },
184 {
185 .enumname = "NU",
186 .file = FILE_LINE,
187 .ucdname = "NU",
188 },
189 {
190 .enumname = "OP_WITHOUT_EAW_HWF",
191 .file = FILE_LINE,
192 .ucdname = "OP",
193 },
194 {
195 .enumname = "OP_WITH_EAW_HWF",
196 .file = NULL,
197 .ucdname = NULL,
198 },
199 {
200 .enumname = "PO",
201 .file = FILE_LINE,
202 .ucdname = "PO",
203 },
204 {
205 .enumname = "PR",
206 .file = FILE_LINE,
207 .ucdname = "PR",
208 },
209 {
210 .enumname = "QU",
211 .file = FILE_LINE,
212 .ucdname = "QU",
213 },
214 {
215 .enumname = "RI",
216 .file = FILE_LINE,
217 .ucdname = "RI",
218 },
219 {
220 .enumname = "SP",
221 .file = FILE_LINE,
222 .ucdname = "SP",
223 },
224 {
225 .enumname = "SY",
226 .file = FILE_LINE,
227 .ucdname = "SY",
228 },
229 {
230 .enumname = "WJ",
231 .file = FILE_LINE,
232 .ucdname = "WJ",
233 },
234 {
235 .enumname = "ZW",
236 .file = FILE_LINE,
237 .ucdname = "ZW",
238 },
239 {
240 .enumname = "ZWJ",
241 .file = FILE_LINE,
242 .ucdname = "ZWJ",
243 },
244 {
245 .enumname = "TMP_AI",
246 .file = FILE_LINE,
247 .ucdname = "AI",
248 },
249 {
250 .enumname = "TMP_CJ",
251 .file = FILE_LINE,
252 .ucdname = "CJ",
253 },
254 {
255 .enumname = "TMP_XX",
256 .file = NULL,
257 .ucdname = NULL,
258 },
259 {
260 .enumname = "TMP_MN",
261 .file = FILE_LINE,
262 .ucdname = "Mn",
263 },
264 {
265 .enumname = "TMP_MC",
266 .file = FILE_LINE,
267 .ucdname = "Mc",
268 },
269 {
270 .enumname = "TMP_SA_WITHOUT_MN_OR_MC",
271 .file = FILE_LINE,
272 .ucdname = "SA",
273 },
274 {
275 .enumname = "TMP_SA_WITH_MN_OR_MC",
276 .file = FILE_LINE,
277 .ucdname = "SA",
278 },
279 {
280 .enumname = "TMP_SG",
281 .file = FILE_LINE,
282 .ucdname = "SG",
283 },
284 {
285 .enumname = "TMP_EAW_H",
286 .file = FILE_EAW,
287 .ucdname = "H",
288 },
289 {
290 .enumname = "TMP_EAW_W",
291 .file = FILE_EAW,
292 .ucdname = "W",
293 },
294 {
295 .enumname = "TMP_EAW_F",
296 .file = FILE_EAW,
297 .ucdname = "F",
298 },
299 };
300
301 static uint_least8_t
302 handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
303 {
304 uint_least8_t result = prop2;
305 char *target = NULL;
306
307 (void)cp;
308
309 if ((!strcmp(line_break_property[prop1].enumname, "TMP_EAW_H") ||
310 !strcmp(line_break_property[prop1].enumname, "TMP_EAW_W") ||
311 !strcmp(line_break_property[prop1].enumname, "TMP_EAW_F")) ||
312 (!strcmp(line_break_property[prop2].enumname, "TMP_EAW_H") ||
313 !strcmp(line_break_property[prop2].enumname, "TMP_EAW_W") ||
314 !strcmp(line_break_property[prop2].enumname, "TMP_EAW_F"))) {
315 if (!strcmp(line_break_property[prop1].enumname,
316 "CP_WITHOUT_EAW_HWF") ||
317 !strcmp(line_break_property[prop2].enumname,
318 "CP_WITHOUT_EAW_HWF")) {
319 target = "CP_WITH_EAW_HWF";
320 } else if (!strcmp(line_break_property[prop1].enumname,
321 "OP_WITHOUT_EAW_HWF") ||
322 !strcmp(line_break_property[prop2].enumname,
323 "OP_WITHOUT_EAW_HWF")) {
324 target = "OP_WITH_EAW_HWF";
325 } else {
326 /* ignore EAW for the rest */
327 if ((!strcmp(line_break_property[prop1].enumname,
328 "TMP_EAW_H") ||
329 !strcmp(line_break_property[prop1].enumname,
330 "TMP_EAW_W") ||
331 !strcmp(line_break_property[prop1].enumname,
332 "TMP_EAW_F"))) {
333 result = prop2;
334 } else {
335 result = prop1;
336 }
337 }
338 } else if ((!strcmp(line_break_property[prop1].enumname, "TMP_MN") ||
339 !strcmp(line_break_property[prop1].enumname, "TMP_MC")) ||
340 (!strcmp(line_break_property[prop2].enumname, "TMP_MN") ||
341 !strcmp(line_break_property[prop2].enumname, "TMP_MC"))) {
342 if (!strcmp(line_break_property[prop1].enumname,
343 "SA_WITHOUT_MN_OR_MC") ||
344 !strcmp(line_break_property[prop2].enumname,
345 "SA_WITHOUT_MN_OR_MC")) {
346 target = "SA_WITH_MN_OR_MC";
347 } else {
348 /* ignore Mn and Mc for the rest */
349 if ((!strcmp(line_break_property[prop1].enumname,
350 "TMP_MN") ||
351 !strcmp(line_break_property[prop1].enumname,
352 "TMP_MC"))) {
353 result = prop2;
354 } else {
355 result = prop1;
356 }
357 }
358 } else if (!strcmp(line_break_property[prop1].enumname, "TMP_CN") ||
359 !strcmp(line_break_property[prop2].enumname, "TMP_CN")) {
360 if (!strcmp(line_break_property[prop1].enumname,
361 "TMP_EXTENDED_PICTOGRAPHIC") ||
362 !strcmp(line_break_property[prop2].enumname,
363 "TMP_EXTENDED_PICTOGRAPHIC")) {
364 target = "BOTH_CN_EXTPICT";
365 } else {
366 /* ignore Cn for all the other properties */
367 if (!strcmp(line_break_property[prop1].enumname,
368 "TMP_CN")) {
369 result = prop2;
370 } else {
371 result = prop1;
372 }
373 }
374 } else if (!strcmp(line_break_property[prop1].enumname,
375 "TMP_EXTENDED_PICTOGRAPHIC") ||
376 !strcmp(line_break_property[prop2].enumname,
377 "TMP_EXTENDED_PICTOGRAPHIC")) {
378 if (!strcmp(line_break_property[prop1].enumname, "TMP_CN") ||
379 !strcmp(line_break_property[prop2].enumname, "TMP_CN")) {
380 target = "BOTH_CN_EXTPICT";
381 } else {
382 /* ignore Extended_Pictographic for all the other
383 * properties */
384 if (!strcmp(line_break_property[prop1].enumname,
385 "TMP_EXTENDED_PICTOGRAPHIC")) {
386 result = prop2;
387 } else {
388 result = prop1;
389 }
390 }
391 } else {
392 fprintf(stderr,
393 "handle_conflict: Cannot handle conflict %s <- %s.\n",
394 line_break_property[prop1].enumname,
395 line_break_property[prop2].enumname);
396 exit(1);
397 }
398
399 if (target) {
400 for (result = 0; result < LEN(line_break_property); result++) {
401 if (!strcmp(line_break_property[result].enumname,
402 target)) {
403 break;
404 }
405 }
406 if (result == LEN(line_break_property)) {
407 fprintf(stderr, "handle_conflict: Internal error.\n");
408 exit(1);
409 }
410 }
411
412 return result;
413 }
414
415 static void
416 post_process(struct properties *prop)
417 {
418 const char *target;
419 uint_least8_t result;
420 size_t i;
421
422 /* post-mapping according to the line breaking algorithm */
423 for (i = 0; i < UINT32_C(0x110000); i++) {
424 /* LB1 */
425 if (!strcmp(line_break_property[prop[i].property].enumname,
426 "TMP_AI") ||
427 !strcmp(line_break_property[prop[i].property].enumname,
428 "TMP_SG") ||
429 !strcmp(line_break_property[prop[i].property].enumname,
430 "TMP_XX")) {
431 /* map AI, SG and XX to AL */
432 target = "AL";
433 } else if (!strcmp(line_break_property[prop[i].property]
434 .enumname,
435 "TMP_SA_WITH_MN_OR_MC")) {
436 /* map SA (with General_Category Mn or Mc) to CM */
437 target = "CM";
438 } else if (!strcmp(line_break_property[prop[i].property]
439 .enumname,
440 "TMP_SA_WITHOUT_MN_OR_MC")) {
441 /* map SA (without General_Category Mn or Mc) to AL */
442 target = "AL";
443 } else if (!strcmp(line_break_property[prop[i].property]
444 .enumname,
445 "TMP_CJ")) {
446 /* map CJ to NS */
447 target = "NS";
448 } else if (
449 !strcmp(line_break_property[prop[i].property].enumname,
450 "TMP_CN") ||
451 !strcmp(line_break_property[prop[i].property].enumname,
452 "TMP_EXTENDED_PICTOGRAPHIC") ||
453 !strcmp(line_break_property[prop[i].property].enumname,
454 "TMP_MN") ||
455 !strcmp(line_break_property[prop[i].property].enumname,
456 "TMP_MC") ||
457 !strcmp(line_break_property[prop[i].property].enumname,
458 "TMP_EAW_H") ||
459 !strcmp(line_break_property[prop[i].property].enumname,
460 "TMP_EAW_W") ||
461 !strcmp(line_break_property[prop[i].property].enumname,
462 "TMP_EAW_F")) {
463 /* map all the temporary classes "residue" to AL */
464 target = "AL";
465 } else {
466 target = NULL;
467 }
468
469 if (target) {
470 for (result = 0; result < LEN(line_break_property);
471 result++) {
472 if (!strcmp(line_break_property[result]
473 .enumname,
474 target)) {
475 break;
476 }
477 }
478 if (result == LEN(line_break_property)) {
479 fprintf(stderr,
480 "handle_conflict: Internal error.\n");
481 exit(1);
482 }
483
484 prop[i].property = result;
485 }
486 }
487 }
488
489 int
490 main(int argc, char *argv[])
491 {
492 (void)argc;
493
494 properties_generate_break_property(
495 line_break_property, LEN(line_break_property), NULL,
496 handle_conflict, post_process, "line_break", argv[0]);
497
498 return 0;
499 }