bidirectional.c - libgrapheme - unicode string library
(HTM) git clone git://git.suckless.org/libgrapheme
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
bidirectional.c (12515B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <errno.h>
3 #include <inttypes.h>
4 #include <stddef.h>
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <string.h>
8
9 #include "util.h"
10
11 #define FILE_BIDI_BRACKETS "data/BidiBrackets.txt"
12 #define FILE_BIDI_CLASS "data/DerivedBidiClass.txt"
13 #define FILE_BIDI_MIRRORING "data/BidiMirroring.txt"
14 #define FILE_UNICODE_DATA "data/UnicodeData.txt"
15
16 #define NUM_BRACKET_ALIASES 20
17
18 static const struct property_spec bidi_property[] = {
19 {
20 /* default */
21 .enumname = "L",
22 .file = FILE_BIDI_CLASS,
23 .ucdname = "L",
24 },
25 {
26 .enumname = "AL",
27 .file = FILE_BIDI_CLASS,
28 .ucdname = "AL",
29 },
30 {
31 .enumname = "AN",
32 .file = FILE_BIDI_CLASS,
33 .ucdname = "AN",
34 },
35 {
36 .enumname = "B",
37 .file = FILE_BIDI_CLASS,
38 .ucdname = "B",
39 },
40 {
41 .enumname = "BN",
42 .file = FILE_BIDI_CLASS,
43 .ucdname = "BN",
44 },
45 {
46 .enumname = "CS",
47 .file = FILE_BIDI_CLASS,
48 .ucdname = "CS",
49 },
50 {
51 .enumname = "EN",
52 .file = FILE_BIDI_CLASS,
53 .ucdname = "EN",
54 },
55 {
56 .enumname = "ES",
57 .file = FILE_BIDI_CLASS,
58 .ucdname = "ES",
59 },
60 {
61 .enumname = "ET",
62 .file = FILE_BIDI_CLASS,
63 .ucdname = "ET",
64 },
65 {
66 .enumname = "FSI",
67 .file = FILE_BIDI_CLASS,
68 .ucdname = "FSI",
69 },
70 {
71 .enumname = "LRE",
72 .file = FILE_BIDI_CLASS,
73 .ucdname = "LRE",
74 },
75 {
76 .enumname = "LRI",
77 .file = FILE_BIDI_CLASS,
78 .ucdname = "LRI",
79 },
80 {
81 .enumname = "LRO",
82 .file = FILE_BIDI_CLASS,
83 .ucdname = "LRO",
84 },
85 {
86 .enumname = "NSM",
87 .file = FILE_BIDI_CLASS,
88 .ucdname = "NSM",
89 },
90 {
91 .enumname = "ON",
92 .file = FILE_BIDI_CLASS,
93 .ucdname = "ON",
94 },
95 {
96 .enumname = "PDF",
97 .file = FILE_BIDI_CLASS,
98 .ucdname = "PDF",
99 },
100 {
101 .enumname = "PDI",
102 .file = FILE_BIDI_CLASS,
103 .ucdname = "PDI",
104 },
105 {
106 .enumname = "R",
107 .file = FILE_BIDI_CLASS,
108 .ucdname = "R",
109 },
110 {
111 .enumname = "RLE",
112 .file = FILE_BIDI_CLASS,
113 .ucdname = "RLE",
114 },
115 {
116 .enumname = "RLI",
117 .file = FILE_BIDI_CLASS,
118 .ucdname = "RLI",
119 },
120 {
121 .enumname = "RLO",
122 .file = FILE_BIDI_CLASS,
123 .ucdname = "RLO",
124 },
125 {
126 .enumname = "S",
127 .file = FILE_BIDI_CLASS,
128 .ucdname = "S",
129 },
130 {
131 .enumname = "WS",
132 .file = FILE_BIDI_CLASS,
133 .ucdname = "WS",
134 },
135 };
136
/*
 * Request/response record handed to decomposition_callback() through
 * its payload pointer.
 */
struct decomposition_payload {
	/* in: code point whose UnicodeData.txt line is being looked up */
	uint_least32_t cp;
	/* out: code point that cp decomposes to (cp itself if it has none) */
	uint_least32_t decomposition;
};
141
142 static int
143 decomposition_callback(const char *file, char **field, size_t nfields,
144 char *comment, void *payload)
145 {
146 char *p;
147 struct decomposition_payload *decomp =
148 (struct decomposition_payload *)payload;
149 uint_least32_t cp;
150
151 (void)file;
152 (void)comment;
153
154 if (nfields < 6) {
155 /* we have fewer than 6 fields, discard the line */
156 return 0;
157 }
158
159 hextocp(field[0], strlen(field[0]), &cp);
160
161 if (decomp->cp == cp) {
162 /* we hit the line that contains our decomposition target */
163 if (strlen(field[5]) > 0) {
164 p = field[5];
165 if (*p == '<') {
166 /*
167 * the decomposition contains some metadata
168 * <...> we skip
169 */
170 for (; *p != '\0'; p++) {
171 if (*p == '>') {
172 p++;
173 while (*p == ' ') {
174 p++;
175 }
176 break;
177 }
178 }
179 }
180 hextocp(p, strlen(p), &(decomp->decomposition));
181 } else {
182 decomp->decomposition = decomp->cp;
183 }
184 }
185
186 return 0;
187 }
188
189 static struct {
190 uint_least32_t base[NUM_BRACKET_ALIASES];
191 size_t baselen;
192 uint_least32_t pair[NUM_BRACKET_ALIASES];
193 size_t pairlen;
194 uint_least8_t class;
195 char type;
196 } *b = NULL;
197
198 static size_t blen;
199 static uint_least8_t bracket_class_count = 1;
200
201 static int
202 bracket_callback(const char *file, char **field, size_t nfields, char *comment,
203 void *payload)
204 {
205 size_t i, j;
206 struct decomposition_payload decomp_base, decomp_pair;
207 uint_least32_t cp_base, cp_pair;
208
209 (void)file;
210 (void)comment;
211 (void)payload;
212
213 if (nfields < 3) {
214 /* we have fewer than 3 fields, discard the line */
215 return 0;
216 }
217
218 /* parse field data */
219 hextocp(field[0], strlen(field[0]), &cp_base);
220 hextocp(field[1], strlen(field[1]), &cp_pair);
221
222 /* determine decomposition of the base and pair codepoints */
223 decomp_base.cp = cp_base;
224 decomp_pair.cp = cp_pair;
225 parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callback,
226 &decomp_base);
227 parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callback,
228 &decomp_pair);
229
230 /*
231 * check if we already have the canonical form in the bracket array,
232 * per convention the canonical form is the first element of the alias
233 * array
234 */
235 for (i = 0; i < blen; i++) {
236 if (decomp_base.decomposition == b[i].base[0]) {
237 /* we have a match, check type */
238 if (strlen(field[2]) != 1 ||
239 (field[2][0] != 'o' && field[2][0] != 'c')) {
240 /* malformed line */
241 return 1;
242 } else if (b[i].type != field[2][0]) {
243 /* mismatching types */
244 return 1;
245 }
246
247 /*
248 * add our base alias to the base array unless it isn't
249 * already in it
250 */
251 for (j = 0; j < b[i].baselen; j++) {
252 if (cp_base == b[i].base[j]) {
253 /* already in array, do nothing */
254 break;
255 }
256 }
257 if (j == b[i].baselen) {
258 /*
259 * the base alias is not already in the array,
260 * add it
261 */
262 if (b[i].baselen == NUM_BRACKET_ALIASES) {
263 fprintf(stderr, "too many aliases\n");
264 return 1;
265 }
266 b[i].baselen++;
267 b[i].base[b[i].baselen - 1] = cp_base;
268 }
269
270 /*
271 * also add our pair alias to the pair array unless
272 * it isn't already in it
273 */
274 for (j = 0; j < b[i].pairlen; j++) {
275 if (cp_pair == b[i].pair[j]) {
276 /* already in array, do nothing */
277 break;
278 }
279 }
280 if (j == b[i].pairlen) {
281 /*
282 * the pair alias is not already in the array,
283 * add it
284 */
285 if (b[i].pairlen == NUM_BRACKET_ALIASES) {
286 fprintf(stderr, "too many aliases\n");
287 return 1;
288 }
289 b[i].pairlen++;
290 b[i].pair[b[i].pairlen - 1] = cp_pair;
291 }
292
293 return 0;
294 }
295 }
296
297 /* extend bracket pair array, as this is a new bracket type */
298 if (!(b = realloc(b, (++blen) * sizeof(*b)))) {
299 fprintf(stderr, "realloc: %s\n", strerror(errno));
300 exit(1);
301 }
302
303 /* fill field data by adding the canonical form first */
304 b[blen - 1].base[0] = decomp_base.decomposition;
305 b[blen - 1].baselen = 1;
306 b[blen - 1].pair[0] = decomp_pair.decomposition;
307 b[blen - 1].pairlen = 1;
308
309 /* add alias if it differs from the canonical form */
310 if (cp_base != decomp_base.decomposition) {
311 b[blen - 1].base[1] = cp_base;
312 b[blen - 1].baselen = 2;
313 }
314 if (cp_pair != decomp_pair.decomposition) {
315 b[blen - 1].pair[1] = cp_pair;
316 b[blen - 1].pairlen = 2;
317 }
318
319 /* add bracket type */
320 if (strlen(field[2]) != 1 ||
321 (field[2][0] != 'o' && field[2][0] != 'c')) {
322 /* malformed line */
323 return 1;
324 } else {
325 b[blen - 1].type = field[2][0];
326 }
327
328 /*
329 * determine bracket class by iterating over the bracket-array
330 * and seeing if our current canonical cp already has a matching pair.
331 * We only need to check the first entry in each bracket alias
332 * list, as this is, per convention, the canonical form.
333 * If not, add a new class.
334 */
335 for (i = 0; i + 1 < blen; i++) {
336 if (b[i].pair[0] == b[blen - 1].base[0]) {
337 /* matched class */
338 b[blen - 1].class = b[i].class;
339 break;
340 }
341 }
342 if (i + 1 == blen) {
343 /* no match, assign a new class */
344 b[blen - 1].class = bracket_class_count++;
345 }
346
347 return 0;
348 }
349
350 static void
351 post_process(struct properties *prop)
352 {
353 size_t i, j;
354
355 for (i = 0; i < blen; i++) {
356 /*
357 * given the base property fits in 5 bits, we simply
358 * store the bracket-offset in the bits above that.
359 *
360 * All those properties that are not set here implicitly
361 * have offset 0, which we prepared to contain a stub
362 * for a character that is not a bracket.
363 */
364 for (j = 0; j < b[i].baselen; j++) {
365 prop[b[i].base[j]].property |= (i << 5);
366 }
367 }
368 }
369
/*
 * Return the bidi class to assume for a code point that has no
 * explicit class, based on the @missing-properties in
 * data/DerivedBidiClass.txt. Anything outside the listed ranges is
 * class L.
 */
static uint_least8_t
fill_missing(uint_least32_t cp)
{
	/* numeric values of the classes handed back to the caller */
	enum {
		CLASS_L = 0,
		CLASS_AL = 1,
		CLASS_ET = 8,
		CLASS_R = 17,
	};
	static const struct {
		uint_least32_t lower; /* first code point of the range */
		uint_least32_t upper; /* last code point of the range */
		uint_least8_t class;
	} range[] = {
		{ UINT32_C(0x0590), UINT32_C(0x05FF), CLASS_R },
		{ UINT32_C(0x07C0), UINT32_C(0x085F), CLASS_R },
		{ UINT32_C(0xFB1D), UINT32_C(0xFB4F), CLASS_R },
		{ UINT32_C(0x10800), UINT32_C(0x10CFF), CLASS_R },
		{ UINT32_C(0x10D40), UINT32_C(0x10EBF), CLASS_R },
		{ UINT32_C(0x10F00), UINT32_C(0x10F2F), CLASS_R },
		{ UINT32_C(0x10F70), UINT32_C(0x10FFF), CLASS_R },
		{ UINT32_C(0x1E800), UINT32_C(0x1EC6F), CLASS_R },
		{ UINT32_C(0x1ECC0), UINT32_C(0x1ECFF), CLASS_R },
		{ UINT32_C(0x1ED50), UINT32_C(0x1EDFF), CLASS_R },
		{ UINT32_C(0x1EF00), UINT32_C(0x1EFFF), CLASS_R },
		{ UINT32_C(0x0600), UINT32_C(0x07BF), CLASS_AL },
		{ UINT32_C(0x0860), UINT32_C(0x08FF), CLASS_AL },
		{ UINT32_C(0xFB50), UINT32_C(0xFDCF), CLASS_AL },
		{ UINT32_C(0xFDF0), UINT32_C(0xFDFF), CLASS_AL },
		{ UINT32_C(0xFE70), UINT32_C(0xFEFF), CLASS_AL },
		{ UINT32_C(0x10D00), UINT32_C(0x10D3F), CLASS_AL },
		{ UINT32_C(0x10EC0), UINT32_C(0x10EFF), CLASS_AL },
		{ UINT32_C(0x10F30), UINT32_C(0x10F6F), CLASS_AL },
		{ UINT32_C(0x1EC70), UINT32_C(0x1ECBF), CLASS_AL },
		{ UINT32_C(0x1ED00), UINT32_C(0x1ED4F), CLASS_AL },
		{ UINT32_C(0x1EE00), UINT32_C(0x1EEFF), CLASS_AL },
		{ UINT32_C(0x20A0), UINT32_C(0x20CF), CLASS_ET },
	};
	size_t k;

	for (k = 0; k < sizeof(range) / sizeof(*range); k++) {
		if (cp >= range[k].lower && cp <= range[k].upper) {
			return range[k].class;
		}
	}

	return CLASS_L;
}
404
405 static struct properties *prop_mirror = NULL;
406
407 static int
408 mirror_callback(const char *file, char **field, size_t nfields, char *comment,
409 void *payload)
410 {
411 uint_least32_t cp, cp_mirror;
412
413 (void)file;
414 (void)comment;
415 (void)payload;
416
417 hextocp(field[0], strlen(field[0]), &cp);
418
419 cp_mirror = cp;
420
421 if (nfields >= 2 && strlen(field[1]) > 0 &&
422 hextocp(field[1], strlen(field[1]), &cp_mirror)) {
423 return 1;
424 }
425
426 prop_mirror[cp].property = (int_least32_t)cp_mirror - (int_least32_t)cp;
427
428 return 0;
429 }
430
431 static int_least64_t
432 get_value(const struct properties *prop, size_t offset)
433 {
434 return prop[offset].property;
435 }
436
437 int
438 main(int argc, char *argv[])
439 {
440 struct properties_compressed comp_mirror;
441 struct properties_major_minor mm_mirror;
442 size_t i;
443
444 (void)argc;
445
446 /*
447 * the first element in the bracket array is initialized to
448 * all-zeros, as we use the implicit 0-offset for all those
449 * codepoints that are not a bracket
450 */
451 if (!(b = calloc((blen = 1), sizeof(*b)))) {
452 fprintf(stderr, "calloc: %s\n", strerror(errno));
453 exit(1);
454 }
455 parse_file_with_callback(FILE_BIDI_BRACKETS, bracket_callback, NULL);
456
457 properties_generate_break_property(bidi_property, LEN(bidi_property),
458 fill_missing, NULL, post_process,
459 "bidi", argv[0]);
460
461 printf("\nenum bracket_type {\n\tBIDI_BRACKET_NONE,\n\t"
462 "BIDI_BRACKET_OPEN,\n\tBIDI_BRACKET_CLOSE,\n};\n\n"
463 "static const struct bracket {\n\tenum bracket_type type;\n"
464 "\tuint_least8_t class;\n} bidi_bracket[] = {\n");
465 for (i = 0; i < blen; i++) {
466 printf("\t{\n\t\t.type = %s,\n\t\t.class = "
467 "%" PRIuLEAST8 ",\n\t},\n",
468 (b[i].type == 'o') ? "BIDI_BRACKET_OPEN" :
469 (b[i].type == 'c') ? "BIDI_BRACKET_CLOSE" :
470 "BIDI_BRACKET_NONE",
471 b[i].class);
472 }
473 printf("};\n");
474
475 /*
476 * allocate property buffer for all 0x110000 codepoints
477 *
478 * the buffers contain the offset from the "base" character
479 * to the respective mirrored character. By callocing we set all
480 * fields to zero, which is also the Unicode "default" in the sense
481 * that the coe point is its mirror (unless we fill it in)
482 */
483 if (!(prop_mirror = calloc(UINT32_C(0x110000), sizeof(*prop_mirror)))) {
484 fprintf(stderr, "calloc: %s\n", strerror(errno));
485 exit(1);
486 }
487 parse_file_with_callback(FILE_BIDI_MIRRORING, mirror_callback, NULL);
488
489 /* compress properties */
490 properties_compress(prop_mirror, &comp_mirror);
491
492 fprintf(stderr, "%s: mirror-LUT compression-ratio: %.2f%%\n", argv[0],
493 properties_get_major_minor(&comp_mirror, &mm_mirror));
494
495 /* print tables */
496 properties_print_lookup_table("mirror_major", mm_mirror.major, 0x1100);
497 printf("\n");
498 properties_print_derived_lookup_table("mirror_minor", mm_mirror.minor,
499 mm_mirror.minorlen, get_value,
500 comp_mirror.data);
501
502 free(comp_mirror.data);
503 free(comp_mirror.offset);
504 free(mm_mirror.major);
505 free(mm_mirror.minor);
506
507 return 0;
508 }