case.c - libgrapheme - unicode string library
(HTM) git clone git://git.suckless.org/libgrapheme
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
case.c (8442B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <errno.h>
3 #include <stdint.h>
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <string.h>
7
8 #include "util.h"
9
10 #define FILE_DCP "data/DerivedCoreProperties.txt"
11
12 static const struct property_spec case_property[] = {
13 {
14 .enumname = "OTHER",
15 .file = NULL,
16 .ucdname = NULL,
17 },
18 {
19 .enumname = "BOTH_CASED_CASE_IGNORABLE",
20 .file = NULL,
21 .ucdname = NULL,
22 },
23 {
24 .enumname = "CASED",
25 .file = FILE_DCP,
26 .ucdname = "Cased",
27 },
28 {
29 .enumname = "CASE_IGNORABLE",
30 .file = FILE_DCP,
31 .ucdname = "Case_Ignorable",
32 },
33 {
34 .enumname = "UNCASED",
35 .file = FILE_DCP,
36 .ucdname = "Uncased",
37 },
38 };
39
40 static uint_least8_t
41 handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
42 {
43 uint_least8_t result;
44
45 (void)cp;
46
47 if ((!strcmp(case_property[prop1].enumname, "CASED") &&
48 !strcmp(case_property[prop2].enumname, "CASE_IGNORABLE")) ||
49 (!strcmp(case_property[prop1].enumname, "CASE_IGNORABLE") &&
50 !strcmp(case_property[prop2].enumname, "CASED"))) {
51 for (result = 0; result < LEN(case_property); result++) {
52 if (!strcmp(case_property[result].enumname,
53 "BOTH_CASED_CASE_IGNORABLE")) {
54 break;
55 }
56 }
57 if (result == LEN(case_property)) {
58 fprintf(stderr, "handle_conflict: Internal error.\n");
59 exit(1);
60 }
61 } else {
62 fprintf(stderr, "handle_conflict: Cannot handle conflict.\n");
63 exit(1);
64 }
65
66 return result;
67 }
68
/*
 * Per-codepoint tables for the upper-, lower- and titlecase mappings.
 * They are allocated in main() and filled in by the parser callbacks.
 */
static struct properties *prop_upper = NULL;
static struct properties *prop_lower = NULL;
static struct properties *prop_title = NULL;

/*
 * One rule from SpecialCasing.txt: for each of the three case
 * mappings a list of codepoints together with its length.
 */
struct special_case {
	struct {
		uint_least32_t *cp;
		size_t cplen;
	} upper, lower, title;
};

/* growing array of special-case rules and its current length */
static struct special_case *sc = NULL;
static size_t sclen = 0;
79
80 static int
81 unicodedata_callback(const char *file, char **field, size_t nfields,
82 char *comment, void *payload)
83 {
84 uint_least32_t cp, upper, lower, title;
85
86 (void)file;
87 (void)comment;
88 (void)payload;
89
90 hextocp(field[0], strlen(field[0]), &cp);
91
92 upper = lower = title = cp;
93
94 if ((strlen(field[12]) > 0 &&
95 hextocp(field[12], strlen(field[12]), &upper)) ||
96 (strlen(field[13]) > 0 &&
97 hextocp(field[13], strlen(field[13]), &lower)) ||
98 (nfields >= 15 && strlen(field[14]) > 0 &&
99 hextocp(field[14], strlen(field[14]), &title))) {
100 return 1;
101 }
102
103 prop_upper[cp].property = (int_least32_t)upper - (int_least32_t)cp;
104 prop_lower[cp].property = (int_least32_t)lower - (int_least32_t)cp;
105 prop_title[cp].property = (int_least32_t)title - (int_least32_t)cp;
106
107 return 0;
108 }
109
110 static int
111 specialcasing_callback(const char *file, char **field, size_t nfields,
112 char *comment, void *payload)
113 {
114 uint_least32_t cp;
115
116 (void)file;
117 (void)comment;
118 (void)payload;
119
120 if (nfields > 4 && strlen(field[4]) > 0) {
121 /*
122 * we have more than 4 fields, i.e. the rule has a
123 * condition (language-sensitive, etc.) and is discarded
124 */
125 return 0;
126 }
127
128 /* parse affected codepoint */
129 hextocp(field[0], strlen(field[0]), &cp);
130
131 /* extend special case array */
132 if (!(sc = realloc(sc, (++sclen) * sizeof(*sc)))) {
133 fprintf(stderr, "realloc: %s\n", strerror(errno));
134 exit(1);
135 }
136
137 /* parse field data */
138 parse_cp_list(field[3], &(sc[sclen - 1].upper.cp),
139 &(sc[sclen - 1].upper.cplen));
140 parse_cp_list(field[1], &(sc[sclen - 1].lower.cp),
141 &(sc[sclen - 1].lower.cplen));
142 parse_cp_list(field[2], &(sc[sclen - 1].title.cp),
143 &(sc[sclen - 1].title.cplen));
144
145 /*
146 * overwrite value in "single mapping" property table by the
147 * special value 0x110000 + (offset in special case array),
148 * even if the special case has length 1
149 */
150 prop_upper[cp].property =
151 (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
152 prop_lower[cp].property =
153 (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
154 prop_title[cp].property =
155 (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
156
157 return 0;
158 }
159
160 static int_least64_t
161 get_value(const struct properties *prop, size_t offset)
162 {
163 return prop[offset].property;
164 }
165
166 int
167 main(int argc, char *argv[])
168 {
169 struct properties_compressed comp_upper, comp_lower, comp_title;
170 struct properties_major_minor mm_upper, mm_lower, mm_title;
171 size_t i, j;
172
173 (void)argc;
174
175 /* generate case property table from the specification */
176 properties_generate_break_property(case_property, LEN(case_property),
177 NULL, handle_conflict, NULL, "case",
178 argv[0]);
179
180 /*
181 * allocate property buffers for all 0x110000 codepoints
182 *
183 * the buffers contain the offset from the "base" character
184 * to the respective case mapping. By callocing we set all fields
185 * to zero, which is also the Unicode "default" in the sense that
186 * there is no case mapping by default (unless we fill it in)
187 */
188 if (!(prop_upper = calloc(UINT32_C(0x110000), sizeof(*prop_upper))) ||
189 !(prop_lower = calloc(UINT32_C(0x110000), sizeof(*prop_lower))) ||
190 !(prop_title = calloc(UINT32_C(0x110000), sizeof(*prop_title)))) {
191 fprintf(stderr, "calloc: %s\n", strerror(errno));
192 exit(1);
193 }
194 parse_file_with_callback("data/UnicodeData.txt", unicodedata_callback,
195 NULL);
196 parse_file_with_callback("data/SpecialCasing.txt",
197 specialcasing_callback, NULL);
198
199 /* compress properties */
200 properties_compress(prop_upper, &comp_upper);
201 properties_compress(prop_lower, &comp_lower);
202 properties_compress(prop_title, &comp_title);
203
204 fprintf(stderr,
205 "%s: LUT compression-ratios: upper=%.2f%%, lower=%.2f%%, "
206 "title=%.2f%%\n",
207 argv[0], properties_get_major_minor(&comp_upper, &mm_upper),
208 properties_get_major_minor(&comp_lower, &mm_lower),
209 properties_get_major_minor(&comp_title, &mm_title));
210
211 /* print tables */
212 printf("/* Automatically generated by %s */\n#include "
213 "<stdint.h>\n#include <stddef.h>\n\n",
214 argv[0]);
215
216 printf("struct special_case {\n\tuint_least32_t *cp;\n\tsize_t "
217 "cplen;\n};\n\n");
218
219 properties_print_lookup_table("upper_major", mm_upper.major, 0x1100);
220 printf("\n");
221 properties_print_derived_lookup_table("upper_minor", mm_upper.minor,
222 mm_upper.minorlen, get_value,
223 comp_upper.data);
224 printf("\n");
225 properties_print_lookup_table("lower_major", mm_lower.major, 0x1100);
226 printf("\n");
227 properties_print_derived_lookup_table("lower_minor", mm_lower.minor,
228 mm_lower.minorlen, get_value,
229 comp_lower.data);
230 printf("\n");
231 properties_print_lookup_table("title_major", mm_title.major, 0x1100);
232 printf("\n");
233 properties_print_derived_lookup_table("title_minor", mm_title.minor,
234 mm_title.minorlen, get_value,
235 comp_title.data);
236 printf("\n");
237
238 printf("static const struct special_case upper_special[] = {\n");
239 for (i = 0; i < sclen; i++) {
240 printf("\t{\n");
241
242 printf("\t\t.cp = (uint_least32_t[]){");
243 for (j = 0; j < sc[i].upper.cplen; j++) {
244 printf(" UINT32_C(0x%06X)", sc[i].upper.cp[j]);
245 if (j + 1 < sc[i].upper.cplen) {
246 putchar(',');
247 }
248 }
249 printf(" },\n");
250 printf("\t\t.cplen = %zu,\n", sc[i].upper.cplen);
251 printf("\t},\n");
252 }
253 printf("};\n\n");
254
255 printf("static const struct special_case lower_special[] = {\n");
256 for (i = 0; i < sclen; i++) {
257 printf("\t{\n");
258
259 printf("\t\t.cp = (uint_least32_t[]){");
260 for (j = 0; j < sc[i].lower.cplen; j++) {
261 printf(" UINT32_C(0x%06X)", sc[i].lower.cp[j]);
262 if (j + 1 < sc[i].lower.cplen) {
263 putchar(',');
264 }
265 }
266 printf(" },\n");
267 printf("\t\t.cplen = %zu,\n", sc[i].lower.cplen);
268 printf("\t},\n");
269 }
270 printf("};\n\n");
271
272 printf("static const struct special_case title_special[] = {\n");
273 for (i = 0; i < sclen; i++) {
274 printf("\t{\n");
275
276 printf("\t\t.cp = (uint_least32_t[]){");
277 for (j = 0; j < sc[i].title.cplen; j++) {
278 printf(" UINT32_C(0x%06X)", sc[i].title.cp[j]);
279 if (j + 1 < sc[i].title.cplen) {
280 putchar(',');
281 }
282 }
283 printf(" },\n");
284 printf("\t\t.cplen = %zu,\n", sc[i].title.cplen);
285 printf("\t},\n");
286 }
287 printf("};\n\n");
288
289 free(comp_lower.data);
290 free(comp_lower.offset);
291 free(comp_title.data);
292 free(comp_title.offset);
293 free(comp_upper.data);
294 free(comp_upper.offset);
295 free(mm_lower.major);
296 free(mm_lower.minor);
297 free(mm_title.major);
298 free(mm_title.minor);
299 free(mm_upper.major);
300 free(mm_upper.minor);
301
302 return 0;
303 }