util.c - libgrapheme - unicode string library
(HTM) git clone git://git.suckless.org/libgrapheme
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
util.c (19744B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <ctype.h>
3 #include <errno.h>
4 #include <inttypes.h>
5 #include <stdbool.h>
6 #include <stddef.h>
7 #include <stdint.h>
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <string.h>
11
12 #include "util.h"
13
14 struct range {
15 uint_least32_t lower;
16 uint_least32_t upper;
17 };
18
19 static int
20 hextocp(const char *str, size_t len, uint_least32_t *cp)
21 {
22 size_t i;
23 int off;
24 char relative;
25
26 /* the maximum valid codepoint is 0x10FFFF */
27 if (len > 6) {
28 fprintf(stderr, "hextocp: '%.*s' is too long.\n", (int)len,
29 str);
30 return 1;
31 }
32
33 for (i = 0, *cp = 0; i < len; i++) {
34 if (str[i] >= '0' && str[i] <= '9') {
35 relative = '0';
36 off = 0;
37 } else if (str[i] >= 'a' && str[i] <= 'f') {
38 relative = 'a';
39 off = 10;
40 } else if (str[i] >= 'A' && str[i] <= 'F') {
41 relative = 'A';
42 off = 10;
43 } else {
44 fprintf(stderr, "hextocp: '%.*s' is not hexadecimal.\n",
45 (int)len, str);
46 return 1;
47 }
48
49 *cp += ((uint_least32_t)1 << (4 * (len - i - 1))) *
50 (uint_least32_t)(str[i] - relative + off);
51 }
52
53 if (*cp > UINT32_C(0x10FFFF)) {
54 fprintf(stderr, "hextocp: '%.*s' is too large.\n", (int)len,
55 str);
56 return 1;
57 }
58
59 return 0;
60 }
61
62 int
63 parse_range(const char *str, struct range *range)
64 {
65 char *p;
66
67 if ((p = strstr(str, "..")) == NULL) {
68 /* input has the form "XXXXXX" */
69 if (hextocp(str, strlen(str), &range->lower)) {
70 return 1;
71 }
72 range->upper = range->lower;
73 } else {
74 /* input has the form "XXXXXX..XXXXXX" */
75 if (hextocp(str, (size_t)(p - str), &range->lower) ||
76 hextocp(p + 2, strlen(p + 2), &range->upper)) {
77 return 1;
78 }
79 }
80
81 return 0;
82 }
83
84 static bool
85 get_line(char **buf, size_t *bufsize, FILE *fp, size_t *len)
86 {
87 int ret = EOF;
88
89 for (*len = 0;; (*len)++) {
90 if (*len > 0 && *buf != NULL && (*buf)[*len - 1] == '\n') {
91 /*
92 * if the previously read character was a newline,
93 * we fake an end-of-file so we NUL-terminate and
94 * are done.
95 */
96 ret = EOF;
97 } else {
98 ret = fgetc(fp);
99 }
100
101 if (*len >= *bufsize) {
102 /* the buffer needs to be expanded */
103 *bufsize += 512;
104 if ((*buf = realloc(*buf, *bufsize)) == NULL) {
105 fprintf(stderr, "get_line: Out of memory.\n");
106 exit(1);
107 }
108 }
109
110 if (ret != EOF) {
111 (*buf)[*len] = (char)ret;
112 } else {
113 (*buf)[*len] = '\0';
114 break;
115 }
116 }
117
118 return *len == 0 && (feof(fp) || ferror(fp));
119 }
120
121 static char *
122 duplicate_string(const char *src)
123 {
124 size_t len = strlen(src);
125 char *dest;
126
127 if (!(dest = malloc(len + 1))) {
128 fprintf(stderr, "malloc: %s\n", strerror(errno));
129 exit(1);
130 }
131
132 memcpy(dest, src, len);
133 dest[len] = '\0';
134
135 return dest;
136 }
137
138 void
139 duplicate_codepoint_property(const struct codepoint_property *src,
140 struct codepoint_property *dest)
141 {
142 size_t i;
143
144 /* copy the field count */
145 dest->field_count = src->field_count;
146
147 /* allocate the field array */
148 if (!(dest->fields = calloc(dest->field_count, sizeof(*(dest->fields))))) {
149 fprintf(stderr, "malloc: %s\n", strerror(errno));
150 exit(1);
151 }
152
153 for (i = 0; i < dest->field_count; i++) {
154 dest->fields[i] = duplicate_string(src->fields[i]);
155 }
156 }
157
158 static void
159 free_codepoint_property(struct codepoint_property *p)
160 {
161 size_t i;
162
163 for (i = 0; i < p->field_count; i++) {
164 free(p->fields[i]);
165 }
166 free(p->fields);
167 }
168
169 static void
170 free_codepoint_property_set(struct codepoint_property_set *p)
171 {
172 size_t i;
173
174 for (i = 0; i < p->property_count; i++) {
175 free_codepoint_property(&(p->properties[i]));
176 }
177 free(p->properties);
178 }
179
180 void
181 free_codepoint_property_set_array(struct codepoint_property_set *p)
182 {
183 uint_least32_t cp;
184
185 for (cp = 0; cp < UINT32_C(0x110000); cp++) {
186 free_codepoint_property_set(&(p[cp]));
187 }
188 free(p);
189 }
190
191 const struct codepoint_property *
192 match_in_codepoint_property_set(const struct codepoint_property_set *p,
193 const char *str, size_t field_offset)
194 {
195 size_t i;
196
197 for (i = 0; i < p->property_count; i++) {
198 /* there must be enough fields to reach the offset */
199 if (field_offset >= p->properties[i].field_count) {
200 continue;
201 }
202
203 /* check for a match */
204 if (strcmp(p->properties[i].fields[field_offset], str) == 0) {
205 return &(p->properties[i]);
206 }
207 }
208
209 return NULL;
210 }
211
212 struct codepoint_property_set *
213 parse_property_file(const char *fname)
214 {
215 FILE *fp;
216 struct codepoint_property p;
217 struct codepoint_property_set *cpp, *missing;
218 struct range range;
219 char *line = NULL, **field = NULL, *comment;
220 uint_least32_t cp;
221 size_t linebufsize = 0, i, fieldbufsize = 0, j, nfields, len;
222 bool is_missing;
223
224 /*
225 * allocate cpp buffer of length 0x11000, initialised to zeros
226 * (NULL 'properties' pointer, zero property_count)
227 */
228 if (!(cpp = calloc((size_t)UINT32_C(0x110000), sizeof(*cpp)))) {
229 fprintf(stderr, "calloc: %s\n", strerror(errno));
230 exit(1);
231 }
232
233 /*
234 * allocate same-sized array for 'missing' fields. The data files
235 * have this strange notion of defining some properties in bloody
236 * comments, and in a way that says 'yeah, if we did not define
237 * something for some, then replace it with this value'. However,
238 * it complicates things, as multiple, overlapping @missing
239 * can be defined in a single file. One can deduce that subsequent
240 * @missing just overwrite each other, but there's no way to
241 * track which properties have not been set without running
242 * through the file multiple times, which we want to avoid, so
243 * we accumulate the (self-overwriting) @missing set separately
244 * and fill it in at the end.
245 */
246 if (!(missing = calloc((size_t)UINT32_C(0x110000), sizeof(*missing)))) {
247 fprintf(stderr, "calloc: %s\n", strerror(errno));
248 exit(1);
249 }
250
251 /* open the property file */
252 if (!(fp = fopen(fname, "r"))) {
253 fprintf(stderr, "parse_property_file: fopen '%s': %s.\n",
254 fname, strerror(errno));
255 exit(1);
256 }
257
258 while (!get_line(&line, &linebufsize, fp, &len)) {
259 /* remove trailing newline */
260 if (len > 0 && line[len - 1] == '\n') {
261 line[len - 1] = '\0';
262 len--;
263 }
264
265 /* skip empty lines */
266 if (len == 0) {
267 continue;
268 }
269
270 /* comment, check if it's a @missing */
271 is_missing = false;
272 if (line[0] == '#') {
273 if (strncmp(line + 1, " @missing: ", LEN(" @missing: ") - 1) == 0) {
274 /*
275 * this comment specifies a 'missing' line.
276 * We shift it to the left and treat it
277 * like any other line, differentiating it
278 * with the 'is_missing' flag.
279 */
280 size_t offset = 1 + (LEN(" @missing: ") - 1);
281
282 memmove(line, line + offset, len - offset + 1);
283 len -= offset;
284 is_missing = true;
285 } else {
286 /* skip unrelated comment line */
287 continue;
288 }
289 }
290
291 /* tokenize line into fields */
292 for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) {
293 /* skip leading whitespace */
294 while (line[i] == ' ') {
295 i++;
296 }
297
298 /* check if we crashed into the comment */
299 if (line[i] != '#') {
300 /* extend field buffer, if necessary */
301 if (++nfields > fieldbufsize) {
302 if ((field = realloc(
303 field,
304 nfields *
305 sizeof(*field))) ==
306 NULL) {
307 fprintf(stderr,
308 "parse_property_"
309 "file: realloc: "
310 "%s.\n",
311 strerror(errno));
312 exit(1);
313 }
314 fieldbufsize = nfields;
315 }
316
317 /* set current position as field start */
318 field[nfields - 1] = &line[i];
319
320 /* continue until we reach ';' or '#' or end */
321 while (line[i] != ';' && line[i] != '#' &&
322 line[i] != '\0') {
323 i++;
324 }
325 }
326
327 if (line[i] == '#') {
328 /* set comment-variable for later */
329 comment = &line[i + 1];
330 }
331
332 /* go back whitespace and terminate field there */
333 if (i > 0) {
334 for (j = i - 1; line[j] == ' '; j--) {
335 ;
336 }
337 line[j + 1] = '\0';
338 } else {
339 line[i] = '\0';
340 }
341
342 /* if comment is set, we are done */
343 if (comment != NULL) {
344 break;
345 }
346 }
347
348 /* skip leading whitespace in comment */
349 while (comment != NULL && comment[0] == ' ') {
350 comment++;
351 }
352
353 /*
354 * We now have an array 'field' with 'nfields'. We can
355 * follow from the file format that, if nfields > 0,
356 * field[0] specifies a codepoint or range of
357 * codepoints, which we parse as the first step.
358 */
359 if (nfields < 2) {
360 /*
361 * either there is only a range or no range at
362 * all. Report this
363 */
364 fprintf(stderr, "parse_property_file: malformed "
365 "line with either no range or no entry\n");
366 exit(1);
367 }
368
369 if (parse_range(field[0], &range)) {
370 fprintf(stderr, "parse_property_file: parse_range of "
371 "'%s' failed.\n", field[0]);
372 exit(1);
373 }
374
375 /*
376 * fill a codepoint_property from the remaining
377 * fields (no allocations on a stack-allocated struct
378 */
379 p.fields = field + 1;
380 p.field_count = nfields - 1;
381
382 /*
383 * add a duplicate of the filled codepoint_property
384 * to all codepoints in the range (i.e. the cpp or
385 * missing array, depending on is_missing)
386 */
387 for (cp = range.lower; cp <= range.upper; cp++) {
388 if (is_missing) {
389 /*
390 * we overwrite the set at the codepoint,
391 * as the @missing properties overwrite
392 * each other (bad design)
393 */
394 if (missing[cp].property_count == 0) {
395 /*
396 * set is still empty, add space
397 * for one pointer to a
398 * codepoint_property
399 */
400 if (!(missing[cp].properties =
401 malloc(sizeof(*(missing[cp].properties))))) {
402 fprintf(stderr, "malloc: %s\n",
403 strerror(errno));
404 exit(1);
405 }
406 missing[cp].property_count = 1;
407 } else if (missing[cp].property_count == 1) {
408 /* free the old property */
409 free_codepoint_property(
410 &(missing[cp].properties[0]));
411 } else {
412 /* this shouldn't happen */
413 fprintf(stderr, "parse_property_file: "
414 "malformed missing set\n");
415 exit(1);
416 }
417
418 /* copy in the new one */
419 duplicate_codepoint_property(
420 &p, &(missing[cp].properties[0]));
421 } else {
422 /* expand the set by one element */
423 if (!(cpp[cp].properties = realloc(
424 cpp[cp].properties,
425 sizeof(*(cpp[cp].properties)) *
426 (++cpp[cp].property_count)))) {
427 fprintf(stderr, "malloc: %s\n",
428 strerror(errno));
429 exit(1);
430 }
431
432 duplicate_codepoint_property(&p,
433 &(cpp[cp].properties[cpp[cp].property_count - 1]));
434 }
435 }
436 }
437
438 /*
439 * now we add the missing elements. We purposefully do not
440 * follow the interpretation in DerivedCoreProperties.txt for
441 * the @missing 'InCB; None'. Missing, for us, means that
442 * _no_ properties have been extracted and thus property_count
443 * is zero, not that some first field is not set. Absolute
444 * madness to even publish data like this...
445 */
446 for (cp = 0; cp < UINT32_C(0x110000); cp++) {
447 if (cpp[cp].property_count == 0) {
448 /* swap the whole lot */
449 cpp[cp].properties = missing[cp].properties;
450 cpp[cp].property_count = missing[cp].property_count;
451 missing[cp].properties = NULL;
452 missing[cp].property_count = 0;
453 }
454 }
455
456 /* close file */
457 if (fclose(fp)) {
458 fprintf(stderr, "parse_property_file: fclose '%s': %s.\n",
459 fname, strerror(errno));
460 exit(1);
461 }
462
463 /* cleanup */
464 free_codepoint_property_set_array(missing);
465 free(line);
466 free(field);
467
468 /* return codepoint properties array */
469 return cpp;
470 }
471
472 struct compression_output {
473 uint_least64_t *major;
474 uint_least64_t *minor;
475 size_t block_size_shift;
476 size_t block_size;
477 size_t major_size;
478 size_t minor_size;
479 size_t major_entry_size;
480 size_t minor_entry_size;
481 size_t total_size;
482 };
483
484 static void
485 compress_array(const uint_least64_t *array, size_t block_size_shift,
486 struct compression_output *co)
487 {
488 size_t i, j, major_maximum, minor_maximum;
489
490 /* set some preliminary data in the output struct */
491 co->block_size_shift = block_size_shift;
492 co->block_size = ((size_t)1) << block_size_shift;
493 co->major_size = UINT32_C(0x110000) / co->block_size;
494
495 /* allocate/initialise */
496 if (!(co->major = malloc(co->major_size * sizeof(*(co->major))))) {
497 fprintf(stderr, "malloc: %s\n", strerror(errno));
498 exit(1);
499 }
500 co->minor = NULL;
501 co->minor_size = 0;
502
503 /* iterate over all blocks in the array */
504 for (i = 0; i < co->major_size; i++) {
505 size_t block_offset = i * co->block_size;
506
507 /*
508 * iterate over all possible matching block starting
509 * positions in the minor array
510 */
511 for (j = 0; j + co->block_size < co->minor_size; j++) {
512 /*
513 * check if our current block matches the
514 * candidate block in the minor array
515 */
516 if (memcmp(array + block_offset,
517 co->minor + j,
518 co->block_size * sizeof(*array)) == 0) {
519 /*
520 * we have a match, so we don't have to
521 * store this block explicitly and just
522 * point to the offset in minor
523 */
524 co->major[i] = j;
525 break;
526 }
527 }
528 if (j + co->block_size >= co->minor_size) {
529 /*
530 * we found no matching subblock in minor. Add it
531 * to the minor array. The index to the first
532 * element we add now is exactly the size
533 * of the minor array.
534 */
535 co->major[i] = co->minor_size;
536 co->minor_size += co->block_size;
537 if (!(co->minor = realloc(co->minor, co->minor_size *
538 sizeof(*(co->minor))))) {
539 fprintf(stderr, "malloc: %s\n",
540 strerror(errno));
541 exit(1);
542 }
543 memcpy(co->minor + co->minor_size - co->block_size,
544 array + block_offset,
545 co->block_size * sizeof(*array));
546 }
547 }
548
549 /* the compression is done. Now we derive some metadata */
550
551 /* determine maxima */
552 for (i = 0, major_maximum = 0; i < co->major_size; i++) {
553 if (co->major[i] > major_maximum) {
554 major_maximum = co->major[i];
555 }
556 }
557 for (i = 0, minor_maximum = 0; i < co->minor_size; i++) {
558 if (co->minor[i] > minor_maximum) {
559 minor_maximum = co->minor[i];
560 }
561 }
562
563 /* determine entry sizes */
564 if (major_maximum < UINT64_C(1) << 8) {
565 co->major_entry_size = 8;
566 } else if (major_maximum < UINT64_C(1) << 16) {
567 co->major_entry_size = 16;
568 } else if (major_maximum < UINT64_C(1) << 32) {
569 co->major_entry_size = 32;
570 } else {
571 co->major_entry_size = 64;
572 }
573
574 if (minor_maximum < UINT64_C(1) << 4) {
575 /* using 4 bit packed nibbles */
576 co->minor_entry_size = 4;
577 } else if (minor_maximum < UINT64_C(1) << 8) {
578 co->minor_entry_size = 8;
579 } else if (minor_maximum < UINT64_C(1) << 16) {
580 co->minor_entry_size = 16;
581 } else if (minor_maximum < UINT64_C(1) << 32) {
582 co->minor_entry_size = 32;
583 } else {
584 co->minor_entry_size = 64;
585 }
586
587 /* total data size in bytes */
588 co->total_size = ((co->major_size * co->major_entry_size) +
589 (co->minor_size * co->minor_entry_size)) / 8;
590 }
591
592 void
593 free_compressed_data(struct compression_output *co)
594 {
595 free(co->major);
596 free(co->minor);
597 memset(co, 0, sizeof(*co));
598 }
599
600 void
601 print_array(const uint_least64_t *array, size_t array_size,
602 size_t array_entry_size, const char *prefix,
603 const char *name)
604 {
605 size_t i;
606
607 if (array_entry_size == 4) {
608 /* we store two 4-bit nibbles in one byte */
609 if (array_size % 2 != 0) {
610 /* ensure array lenght is even */
611 fprintf(stderr, "print_array: 4-bit array "
612 "is not implemented for odd length.");
613 exit(1);
614 }
615
616 printf("static const uint_least8_t %s_%s[] = {",
617 prefix, name);
618 for (i = 0; i < array_size; i += 2) {
619 if ((i / 2) % 8 == 0) {
620 printf("\n\t");
621 } else {
622 printf(" ");
623 }
624
625 /* write high and low nibbles */
626 printf("%zu", (array[i] << 4) | array[i + 1]);
627
628 if (i + 2 < array_size) {
629 printf(",");
630 }
631 }
632 printf("\n};\n");
633 } else {
634 printf("static const uint_least%zu_t %s_%s[] = {",
635 array_entry_size, prefix, name);
636 for (i = 0; i < array_size; i++) {
637 if (i % 8 == 0) {
638 printf("\n\t");
639 } else {
640 printf(" ");
641 }
642 printf("%zu", array[i]);
643 if (i + 1 < array_size) {
644 printf(",");
645 }
646 }
647 printf("\n};\n");
648 }
649 }
650
651 void
652 compress_and_output(uint_least64_t *array, const char *prefix)
653 {
654 struct compression_output co, co_best;
655 size_t i, j, dictionary_size, dictionary_entry_size;
656 uint_least64_t maximum = 0, *dictionary, *keys;
657
658 /* initialise dictionary */
659 dictionary = NULL;
660 dictionary_size = 0;
661
662 /* initialise keys */
663 if (!(keys = calloc(UINT32_C(0x110000), sizeof(*keys)))) {
664 fprintf(stderr, "calloc: %s\n", strerror(errno));
665 exit(1);
666 }
667
668 for (i = 0; i < UINT32_C(0x110000); i++) {
669 /* maximum determination */
670 if (array[i] > maximum) {
671 maximum = array[i];
672 }
673
674 /* check if the array value is already in the dictionary */
675 for (j = 0; j < dictionary_size; j++) {
676 if (array[i] == dictionary[j]) {
677 break;
678 }
679 }
680 if (j == dictionary_size) {
681 /* not in the dictionary, insert the array value */
682 if (!(dictionary = realloc(dictionary, (++dictionary_size) *
683 sizeof(*dictionary)))) {
684 fprintf(stderr, "realloc: %s\n", strerror(errno));
685 exit(1);
686 }
687 dictionary[dictionary_size - 1] = array[i];
688 }
689
690 /* set the index (key) of the matched dictionary entry */
691 keys[i] = j;
692 }
693
694 /* check if dictionary size is below a reasonable threshold */
695 if (dictionary_size > 256) {
696 fprintf(stderr, "compress_and_output: dictionary too large\n");
697 exit(1);
698 }
699
700 /*
701 * compress keys array with different block size shifts
702 * (block sizes from 16=1<<4 to 32768=1<<15)
703 */
704 memset(&co_best, 0, sizeof(co_best));
705 co_best.total_size = SIZE_MAX;
706 for (i = 15; i >= 4; i--) {
707 compress_array(keys, i, &co);
708
709 fprintf(stderr, "compress_and_output: "
710 "block size %zu -> data size %.1f kB (%zu,%zu)\n",
711 ((size_t)1) << i, (double)co.total_size / 1000, co.major_entry_size, co.minor_entry_size);
712
713 if (co.total_size < co_best.total_size) {
714 /* we have a new best compression */
715 free_compressed_data(&co_best);
716 co_best = co;
717 } else {
718 /* not optimal, discard it */
719 free_compressed_data(&co);
720 }
721 }
722 fprintf(stderr, "compress_and_output: choosing optimal block size %zu (%zu,%zu)\n",
723 co_best.block_size, co_best.major_entry_size, co_best.minor_entry_size);
724
725 /* output header */
726 printf("/* Automatically generated by gen2/%s */\n"
727 "#include <stdint.h>\n\n"
728 "#include \"character.h\"\n\n", prefix);
729
730 /* output dictionary */
731 if (maximum < UINT64_C(1) << 8) {
732 dictionary_entry_size = 8;
733 } else if (maximum < UINT64_C(1) << 16) {
734 dictionary_entry_size = 16;
735 } else if (maximum < UINT64_C(1) << 32) {
736 dictionary_entry_size = 32;
737 } else {
738 dictionary_entry_size = 64;
739 }
740
741 print_array(dictionary, dictionary_size, dictionary_entry_size,
742 prefix, "dictionary");
743 printf("\n");
744
745 /* output major array */
746 print_array(co_best.major, co_best.major_size,
747 co_best.major_entry_size, prefix, "major");
748 printf("\n");
749
750 /* output minor array */
751 print_array(co_best.minor, co_best.minor_size,
752 co_best.minor_entry_size, prefix, "minor");
753 printf("\n");
754
755 /* output accessor function (major is never 4-bits per entry) */
756 printf("static inline uint_least%zu_t\n"
757 "get_%s_property(uint_least32_t cp)\n"
758 "{\n"
759 "\tuint_least%zu_t minor_offset = %s_major[cp >> %zu]\n"
760 "\t\t+ (cp & ((UINT32_C(1) << %zu) - 1));\n",
761 dictionary_entry_size, prefix, co_best.major_entry_size,
762 prefix, co_best.block_size_shift, co_best.block_size_shift);
763 if (co_best.minor_entry_size == 4) {
764 printf("\tuint_least8_t packed_value = %s_minor[minor_offset / 2];\n\n"
765 "\tif (minor_offset & UINT8_C(1) == 0) {\n"
766 "\t\t/* high nibble */\n"
767 "\t\treturn %s_dictionary[packed_value >> 4];\n"
768 "\t} else {\n"
769 "\t\t/* low nibble */\n"
770 "\t\treturn %s_dictionary[packed_value & UINT8_C(0x0f)];\n"
771 "\t}\n",
772 prefix, prefix, prefix);
773 } else {
774 printf("\n\treturn %s_dictionary[%s_minor[minor_offset]];\n",
775 prefix, prefix);
776 }
777 printf("}\n");
778
779 /* cleanup */
780 free(dictionary);
781 free(keys);
782 }