case.c - libgrapheme - unicode string library
(HTM) git clone git://git.suckless.org/libgrapheme
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
case.c (12993B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <stddef.h>
3 #include <stdint.h>
4
5 #include "../gen/case.h"
6 #include "../grapheme.h"
7 #include "util.h"
8
9 static inline enum case_property
10 get_case_property(uint_least32_t cp)
11 {
12 if (likely(cp <= UINT32_C(0x10FFFF))) {
13 return (enum case_property)
14 case_minor[case_major[cp >> 8] + (cp & 0xFF)];
15 } else {
16 return CASE_PROP_OTHER;
17 }
18 }
19
/*
 * Fetch the raw mapping entry for codepoint cp from the two-stage
 * table given by major and minor. Codepoints above 0x10FFFF are not
 * looked up and yield 0.
 *
 * Callers must be aware that the returned entry can be larger than or
 * equal to 0x110000; such entries denote a special-case-mapping and
 * have to be treated separately from plain offsets.
 */
static inline int_least32_t
get_case_offset(uint_least32_t cp, const uint_least16_t *major,
                const int_least32_t *minor)
{
	if (unlikely(cp > UINT32_C(0x10FFFF))) {
		/* outside the table range */
		return 0;
	}

	return minor[major[cp >> 8] + (cp & 0xFF)];
}
35
36 static inline size_t
37 to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
38 uint_least8_t final_sigma_level, const uint_least16_t *major,
39 const int_least32_t *minor, const struct special_case *sc)
40 {
41 HERODOTUS_READER tmp;
42 enum case_property prop;
43 enum herodotus_status s;
44 size_t off, i;
45 uint_least32_t cp, tmp_cp;
46 int_least32_t map;
47
48 for (; herodotus_read_codepoint(r, true, &cp) ==
49 HERODOTUS_STATUS_SUCCESS;) {
50 if (sc == lower_special) {
51 /*
52 * For the special Final_Sigma-rule (see
53 * SpecialCasing.txt), which is the only non-localized
54 * case-dependent rule, we apply a different mapping
55 * when a sigma is at the end of a word.
56 *
57 * Before: cased case-ignorable*
58 * After: not(case-ignorable* cased)
59 *
60 * We check the after-condition on demand, but the
61 * before- condition is best checked using the
62 * "level"-heuristic also used in the sentence and line
63 * breaking-implementations.
64 */
65 if (cp == UINT32_C(0x03A3) && /* GREEK CAPITAL LETTER
66 SIGMA */
67 (final_sigma_level == 1 ||
68 final_sigma_level == 2)) {
69 /*
70 * check succeeding characters by first skipping
71 * all case-ignorable characters and then
72 * checking if the succeeding character is
73 * cased, invalidating the after-condition
74 */
75 herodotus_reader_copy(r, &tmp);
76 for (prop = NUM_CASE_PROPS;
77 (s = herodotus_read_codepoint(&tmp, true,
78 &tmp_cp)) ==
79 HERODOTUS_STATUS_SUCCESS;) {
80 prop = get_case_property(tmp_cp);
81
82 if (prop != CASE_PROP_CASE_IGNORABLE &&
83 prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
84 break;
85 }
86 }
87
88 /*
89 * Now prop is something other than
90 * case-ignorable or the source-string ended. If
91 * it is something other than cased, we know
92 * that the after-condition holds
93 */
94 if (s != HERODOTUS_STATUS_SUCCESS ||
95 (prop != CASE_PROP_CASED &&
96 prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
97 /*
98 * write GREEK SMALL LETTER FINAL SIGMA
99 * to destination
100 */
101 herodotus_write_codepoint(
102 w, UINT32_C(0x03C2));
103
104 /* reset Final_Sigma-state and continue
105 */
106 final_sigma_level = 0;
107 continue;
108 }
109 }
110
111 /* update state */
112 prop = get_case_property(cp);
113 if ((final_sigma_level == 0 ||
114 final_sigma_level == 1) &&
115 (prop == CASE_PROP_CASED ||
116 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
117 /* sequence has begun */
118 final_sigma_level = 1;
119 } else if (
120 (final_sigma_level == 1 ||
121 final_sigma_level == 2) &&
122 (prop == CASE_PROP_CASE_IGNORABLE ||
123 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
124 /* case-ignorable sequence begins or continued
125 */
126 final_sigma_level = 2;
127 } else {
128 /* sequence broke */
129 final_sigma_level = 0;
130 }
131 }
132
133 /* get and handle case mapping */
134 if (unlikely((map = get_case_offset(cp, major, minor)) >=
135 INT32_C(0x110000))) {
136 /* we have a special case and the offset in the sc-array
137 * is the difference to 0x110000*/
138 off = (uint_least32_t)map - UINT32_C(0x110000);
139
140 for (i = 0; i < sc[off].cplen; i++) {
141 herodotus_write_codepoint(w, sc[off].cp[i]);
142 }
143 } else {
144 /* we have a simple mapping */
145 herodotus_write_codepoint(
146 w, (uint_least32_t)((int_least32_t)cp + map));
147 }
148 }
149
150 herodotus_writer_nul_terminate(w);
151
152 return herodotus_writer_number_written(w);
153 }
154
155 static size_t
156 herodotus_next_word_break(const HERODOTUS_READER *r)
157 {
158 HERODOTUS_READER tmp;
159
160 herodotus_reader_copy(r, &tmp);
161
162 if (r->type == HERODOTUS_TYPE_CODEPOINT) {
163 return grapheme_next_word_break(tmp.src, tmp.srclen);
164 } else { /* r->type == HERODOTUS_TYPE_UTF8 */
165 return grapheme_next_word_break_utf8(tmp.src, tmp.srclen);
166 }
167 }
168
169 static inline size_t
170 to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
171 {
172 enum case_property prop;
173 enum herodotus_status s;
174 uint_least32_t cp;
175 size_t nwb;
176
177 for (; (nwb = herodotus_next_word_break(r)) > 0;) {
178 herodotus_reader_push_advance_limit(r, nwb);
179 for (; (s = herodotus_read_codepoint(r, false, &cp)) ==
180 HERODOTUS_STATUS_SUCCESS;) {
181 /* check if we have a cased character */
182 prop = get_case_property(cp);
183 if (prop == CASE_PROP_CASED ||
184 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
185 break;
186 } else {
187 /* write the data to the output verbatim, it if
188 * permits */
189 herodotus_write_codepoint(w, cp);
190
191 /* increment reader */
192 herodotus_read_codepoint(r, true, &cp);
193 }
194 }
195
196 if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
197 /* we are done */
198 herodotus_reader_pop_limit(r);
199 break;
200 } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
201 /*
202 * we did not encounter any cased character
203 * up to the word break
204 */
205 herodotus_reader_pop_limit(r);
206 continue;
207 } else {
208 /*
209 * we encountered a cased character before the word
210 * break, convert it to titlecase
211 */
212 herodotus_reader_push_advance_limit(
213 r, herodotus_reader_next_codepoint_break(r));
214 to_case(r, w, 0, title_major, title_minor,
215 title_special);
216 herodotus_reader_pop_limit(r);
217 }
218
219 /* cast the rest of the codepoints in the word to lowercase */
220 to_case(r, w, 1, lower_major, lower_minor, lower_special);
221
222 /* remove the limit on the word before the next iteration */
223 herodotus_reader_pop_limit(r);
224 }
225
226 herodotus_writer_nul_terminate(w);
227
228 return herodotus_writer_number_written(w);
229 }
230
231 size_t
232 grapheme_to_uppercase(const uint_least32_t *src, size_t srclen,
233 uint_least32_t *dest, size_t destlen)
234 {
235 HERODOTUS_READER r;
236 HERODOTUS_WRITER w;
237
238 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
239 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
240
241 return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
242 }
243
244 size_t
245 grapheme_to_lowercase(const uint_least32_t *src, size_t srclen,
246 uint_least32_t *dest, size_t destlen)
247 {
248 HERODOTUS_READER r;
249 HERODOTUS_WRITER w;
250
251 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
252 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
253
254 return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
255 }
256
257 size_t
258 grapheme_to_titlecase(const uint_least32_t *src, size_t srclen,
259 uint_least32_t *dest, size_t destlen)
260 {
261 HERODOTUS_READER r;
262 HERODOTUS_WRITER w;
263
264 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
265 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
266
267 return to_titlecase(&r, &w);
268 }
269
270 size_t
271 grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest,
272 size_t destlen)
273 {
274 HERODOTUS_READER r;
275 HERODOTUS_WRITER w;
276
277 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
278 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
279
280 return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
281 }
282
283 size_t
284 grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest,
285 size_t destlen)
286 {
287 HERODOTUS_READER r;
288 HERODOTUS_WRITER w;
289
290 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
291 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
292
293 return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
294 }
295
296 size_t
297 grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest,
298 size_t destlen)
299 {
300 HERODOTUS_READER r;
301 HERODOTUS_WRITER w;
302
303 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
304 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
305
306 return to_titlecase(&r, &w);
307 }
308
309 static inline bool
310 is_case(HERODOTUS_READER *r, const uint_least16_t *major,
311 const int_least32_t *minor, const struct special_case *sc,
312 size_t *output)
313 {
314 size_t off, i;
315 bool ret = true;
316 uint_least32_t cp;
317 int_least32_t map;
318
319 for (; herodotus_read_codepoint(r, false, &cp) ==
320 HERODOTUS_STATUS_SUCCESS;) {
321 /* get and handle case mapping */
322 if (unlikely((map = get_case_offset(cp, major, minor)) >=
323 INT32_C(0x110000))) {
324 /* we have a special case and the offset in the sc-array
325 * is the difference to 0x110000*/
326 off = (uint_least32_t)map - UINT32_C(0x110000);
327
328 for (i = 0; i < sc[off].cplen; i++) {
329 if (herodotus_read_codepoint(r, false, &cp) ==
330 HERODOTUS_STATUS_SUCCESS) {
331 if (cp != sc[off].cp[i]) {
332 ret = false;
333 goto done;
334 } else {
335 /* move forward */
336 herodotus_read_codepoint(
337 r, true, &cp);
338 }
339 } else {
340 /*
341 * input ended and we didn't see
342 * any difference so far, so this
343 * string is in fact okay
344 */
345 ret = true;
346 goto done;
347 }
348 }
349 } else {
350 /* we have a simple mapping */
351 if (cp != (uint_least32_t)((int_least32_t)cp + map)) {
352 /* we have a difference */
353 ret = false;
354 goto done;
355 } else {
356 /* move forward */
357 herodotus_read_codepoint(r, true, &cp);
358 }
359 }
360 }
361 done:
362 if (output) {
363 *output = herodotus_reader_number_read(r);
364 }
365 return ret;
366 }
367
368 static inline bool
369 is_titlecase(HERODOTUS_READER *r, size_t *output)
370 {
371 enum case_property prop;
372 enum herodotus_status s;
373 bool ret = true;
374 uint_least32_t cp;
375 size_t nwb;
376
377 for (; (nwb = herodotus_next_word_break(r)) > 0;) {
378 herodotus_reader_push_advance_limit(r, nwb);
379 for (; (s = herodotus_read_codepoint(r, false, &cp)) ==
380 HERODOTUS_STATUS_SUCCESS;) {
381 /* check if we have a cased character */
382 prop = get_case_property(cp);
383 if (prop == CASE_PROP_CASED ||
384 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
385 break;
386 } else {
387 /* increment reader */
388 herodotus_read_codepoint(r, true, &cp);
389 }
390 }
391
392 if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
393 /* we are done */
394 break;
395 } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
396 /*
397 * we did not encounter any cased character
398 * up to the word break
399 */
400 herodotus_reader_pop_limit(r);
401 continue;
402 } else {
403 /*
404 * we encountered a cased character before the word
405 * break, check if it's titlecase
406 */
407 herodotus_reader_push_advance_limit(
408 r, herodotus_reader_next_codepoint_break(r));
409 if (!is_case(r, title_major, title_minor, title_special,
410 NULL)) {
411 ret = false;
412 goto done;
413 }
414 herodotus_reader_pop_limit(r);
415 }
416
417 /* check if the rest of the codepoints in the word are lowercase
418 */
419 if (!is_case(r, lower_major, lower_minor, lower_special,
420 NULL)) {
421 ret = false;
422 goto done;
423 }
424
425 /* remove the limit on the word before the next iteration */
426 herodotus_reader_pop_limit(r);
427 }
428 done:
429 if (output) {
430 *output = herodotus_reader_number_read(r);
431 }
432 return ret;
433 }
434
435 bool
436 grapheme_is_uppercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
437 {
438 HERODOTUS_READER r;
439
440 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
441
442 return is_case(&r, upper_major, upper_minor, upper_special, caselen);
443 }
444
445 bool
446 grapheme_is_lowercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
447 {
448 HERODOTUS_READER r;
449
450 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
451
452 return is_case(&r, lower_major, lower_minor, lower_special, caselen);
453 }
454
455 bool
456 grapheme_is_titlecase(const uint_least32_t *src, size_t srclen, size_t *caselen)
457 {
458 HERODOTUS_READER r;
459
460 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
461
462 return is_titlecase(&r, caselen);
463 }
464
465 bool
466 grapheme_is_uppercase_utf8(const char *src, size_t srclen, size_t *caselen)
467 {
468 HERODOTUS_READER r;
469
470 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
471
472 return is_case(&r, upper_major, upper_minor, upper_special, caselen);
473 }
474
475 bool
476 grapheme_is_lowercase_utf8(const char *src, size_t srclen, size_t *caselen)
477 {
478 HERODOTUS_READER r;
479
480 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
481
482 return is_case(&r, lower_major, lower_minor, lower_special, caselen);
483 }
484
485 bool
486 grapheme_is_titlecase_utf8(const char *src, size_t srclen, size_t *caselen)
487 {
488 HERODOTUS_READER r;
489
490 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
491
492 return is_titlecase(&r, caselen);
493 }