character.c - libgrapheme - unicode string library
(HTM) git clone git://git.suckless.org/libgrapheme
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
character.c (18706B)
---
1 #include <stdio.h>
2
3 /* See LICENSE file for copyright and license details. */
4 #include <limits.h>
5 #include <stdbool.h>
6 #include <stddef.h>
7
8 #include "../gen/character.h"
9 #include "../grapheme.h"
10 #include "util.h"
11
/*
 * Working state of the grapheme cluster break detection. Between calls
 * it is stored in packed form, see state_serialize() and
 * state_deserialize().
 */
struct character_break_state {
	/* break property of the right-hand codepoint of the previous call */
	uint_least8_t prop;

	/* true once prop holds a remembered property */
	bool prop_set;

	/* selects the lower or upper half of the GB11 lookup tables */
	bool gb11_flag;

	/* selects the lower or upper half of the GB12/GB13 lookup tables */
	bool gb12_13_flag;

	/* progress (0..3) through the GB9c indic conjunct prefix */
	uint_least8_t gb9c_level;
};
19
20 static const uint_least32_t dont_break[NUM_CHAR_BREAK_PROPS] = {
21 [CHAR_BREAK_PROP_OTHER] =
22 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
23 UINT32_C(1)
24 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
25 UINT32_C(1)
26 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
27 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
28 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
29 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
30 [CHAR_BREAK_PROP_ICB_CONSONANT] =
31 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
32 UINT32_C(1)
33 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
34 UINT32_C(1)
35 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
36 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
37 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
38 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
39 [CHAR_BREAK_PROP_ICB_EXTEND] =
40 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
41 UINT32_C(1)
42 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
43 UINT32_C(1)
44 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
45 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
46 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
47 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
48 [CHAR_BREAK_PROP_ICB_LINKER] =
49 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
50 UINT32_C(1)
51 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
52 UINT32_C(1)
53 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
54 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
55 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
56 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
57 [CHAR_BREAK_PROP_CR] = UINT32_C(1) << CHAR_BREAK_PROP_LF, /* GB3 */
58 [CHAR_BREAK_PROP_EXTEND] =
59 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
60 UINT32_C(1)
61 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
62 UINT32_C(1)
63 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
64 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
65 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
66 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
67 [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND] =
68 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
69 UINT32_C(1)
70 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
71 UINT32_C(1)
72 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
73 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
74 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
75 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
76 [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER] =
77 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
78 UINT32_C(1)
79 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
80 UINT32_C(1)
81 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
82 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
83 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
84 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
85 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
86 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
87 UINT32_C(1)
88 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
89 UINT32_C(1)
90 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
91 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
92 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
93 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
94 [CHAR_BREAK_PROP_HANGUL_L] =
95 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_L | /* GB6 */
96 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB6 */
97 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LV | /* GB6 */
98 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6 */
99 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
100 UINT32_C(1)
101 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
102 UINT32_C(1)
103 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
104 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
105 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
106 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
107 [CHAR_BREAK_PROP_HANGUL_V] =
108 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */
109 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */
110 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
111 UINT32_C(1)
112 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
113 UINT32_C(1)
114 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
115 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
116 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
117 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
118 [CHAR_BREAK_PROP_HANGUL_T] =
119 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */
120 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
121 UINT32_C(1)
122 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
123 UINT32_C(1)
124 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
125 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
126 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
127 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
128 [CHAR_BREAK_PROP_HANGUL_LV] =
129 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */
130 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */
131 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
132 UINT32_C(1)
133 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
134 UINT32_C(1)
135 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
136 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
137 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
138 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
139 [CHAR_BREAK_PROP_HANGUL_LVT] =
140 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */
141 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
142 UINT32_C(1)
143 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
144 UINT32_C(1)
145 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
146 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
147 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
148 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
149 [CHAR_BREAK_PROP_PREPEND] =
150 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
151 UINT32_C(1)
152 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
153 UINT32_C(1)
154 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
155 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
156 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
157 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* GB9a */
158 (UINT32_C(0xFFFFFFFF) &
159 ~(UINT32_C(1) << CHAR_BREAK_PROP_CR |
160 UINT32_C(1) << CHAR_BREAK_PROP_LF |
161 UINT32_C(1) << CHAR_BREAK_PROP_CONTROL)), /* GB9b */
162 [CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
163 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
164 UINT32_C(1)
165 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
166 UINT32_C(1)
167 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
168 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
169 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
170 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
171 [CHAR_BREAK_PROP_SPACINGMARK] =
172 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
173 UINT32_C(1)
174 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
175 UINT32_C(1)
176 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
177 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
178 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
179 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
180 [CHAR_BREAK_PROP_ZWJ] =
181 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
182 UINT32_C(1)
183 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
184 UINT32_C(1)
185 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
186 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
187 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
188 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
189 [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND] =
190 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
191 UINT32_C(1)
192 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
193 UINT32_C(1)
194 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER | /* GB9 */
195 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
196 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
197 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
198
199 };
200 static const uint_least32_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
201 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
202 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
203 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9 */
204 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
205 UINT32_C(1)
206 << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9 */
207 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER, /* GB9 */
208 [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
209 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
210 [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] =
211 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
212 [CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] =
213 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
214 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
215 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |
216 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
217 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND,
218 [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] =
219 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
220 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
221 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |
222 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
223 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND,
224 [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER + NUM_CHAR_BREAK_PROPS] =
225 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
226 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
227 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |
228 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
229 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND,
230 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] =
231 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
232 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND |
233 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
234 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
235 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER,
236 };
237 static const uint_least32_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
238 [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
239 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
240 [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] =
241 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
242 };
243 static const uint_least32_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
244 [CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
245 UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
246 };
247 static const uint_least32_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
248 [CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] =
249 UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
250 };
251
252 static inline enum char_break_property
253 get_break_prop(uint_least32_t cp)
254 {
255 if (likely(cp <= UINT32_C(0x10FFFF))) {
256 return (enum char_break_property)
257 char_break_minor[char_break_major[cp >> 8] +
258 (cp & 0xFF)];
259 } else {
260 return CHAR_BREAK_PROP_OTHER;
261 }
262 }
263
264 static inline void
265 state_serialize(const struct character_break_state *in, uint_least16_t *out)
266 {
267 *out = (uint_least16_t)(in->prop & UINT8_C(0xFF)) | /* first 8 bits */
268 (uint_least16_t)(((uint_least16_t)(in->prop_set))
269 << 8) | /* 9th bit */
270 (uint_least16_t)(((uint_least16_t)(in->gb11_flag))
271 << 9) | /* 10th bit */
272 (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag))
273 << 10) | /* 11th bit */
274 (uint_least16_t)(((uint_least16_t)(in->gb9c_level & 0x3))
275 << 11); /* 12th and 13th bit */
276 }
277
278 static inline void
279 state_deserialize(uint_least16_t in, struct character_break_state *out)
280 {
281 out->prop = in & UINT8_C(0xFF);
282 out->prop_set = in & (UINT16_C(1) << 8);
283 out->gb11_flag = in & (UINT16_C(1) << 9);
284 out->gb12_13_flag = in & (UINT16_C(1) << 10);
285 out->gb9c_level = (uint_least8_t)(in >> 11) & UINT8_C(0x3);
286 }
287
288 bool
289 grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1,
290 uint_least16_t *s)
291 {
292 struct character_break_state state;
293 enum char_break_property cp0_prop, cp1_prop;
294 bool notbreak = false;
295
296 if (likely(s)) {
297 state_deserialize(*s, &state);
298
299 if (likely(state.prop_set)) {
300 cp0_prop = state.prop;
301 } else {
302 cp0_prop = get_break_prop(cp0);
303 }
304 cp1_prop = get_break_prop(cp1);
305
306 /* preserve prop of right codepoint for next iteration */
307 state.prop = (uint_least8_t)cp1_prop;
308 state.prop_set = true;
309
310 /* update flags */
311 state.gb11_flag =
312 flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
313 state.gb11_flag] &
314 UINT32_C(1) << cp1_prop;
315 state.gb12_13_flag =
316 flag_update_gb12_13[cp0_prop +
317 NUM_CHAR_BREAK_PROPS *
318 state.gb12_13_flag] &
319 UINT32_C(1) << cp1_prop;
320
321 /*
322 * update GB9c state, which deals with indic conjunct breaks.
323 * We want to detect the following prefix:
324 *
325 * ICB_CONSONANT
326 * [ICB_EXTEND ICB_LINKER]*
327 * ICB_LINKER
328 * [ICB_EXTEND ICB_LINKER]*
329 *
330 * This representation is not ideal: In reality, what is
331 * meant is that the prefix is a sequence of [ICB_EXTEND
332 * ICB_LINKER]*, following an ICB_CONSONANT, that contains at
333 * least one ICB_LINKER. We thus use the following equivalent
334 * representation that allows us to store the levels 0..3 in 2
335 * bits.
336 *
337 * ICB_CONSONANT -- Level 1
338 * ICB_EXTEND* -- Level 2
339 * ICB_LINKER -- Level 3
340 * [ICB_EXTEND ICB_LINKER]* -- Level 3
341 *
342 * The following chain of if-else-blocks is a bit redundant and
343 * of course could be optimised, but this is kept as is for
344 * best readability.
345 */
346 if (state.gb9c_level == 0 &&
347 cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) {
348 /* the sequence has begun */
349 state.gb9c_level = 1;
350 } else if ((state.gb9c_level == 1 || state.gb9c_level == 2) &&
351 (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND ||
352 cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND ||
353 cp0_prop ==
354 CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND)) {
355 /*
356 * either the level is 1 and thus the ICB consonant is
357 * followed by an ICB extend, where we jump
358 * to level 2, or we are at level 2 and just witness
359 * more ICB extends, staying at level 2.
360 */
361 state.gb9c_level = 2;
362 } else if ((state.gb9c_level == 1 || state.gb9c_level == 2) &&
363 (cp0_prop == CHAR_BREAK_PROP_ICB_LINKER ||
364 cp0_prop ==
365 CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER)) {
366 /*
367 * witnessing an ICB linker directly lifts us up to
368 * level 3
369 */
370 state.gb9c_level = 3;
371 } else if (state.gb9c_level == 3 &&
372 (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND ||
373 cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND ||
374 cp0_prop ==
375 CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND ||
376 cp0_prop == CHAR_BREAK_PROP_ICB_LINKER ||
377 cp0_prop ==
378 CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER)) {
379 /*
380 * we stay at level 3 when we observe either ICB
381 * extends or linkers
382 */
383 state.gb9c_level = 3;
384 } else {
385 /*
386 * the sequence has collapsed, but it could be
387 * that the left property is ICB consonant, which
388 * means that we jump right back to level 1 instead
389 * of 0
390 */
391 if (cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) {
392 state.gb9c_level = 1;
393 } else {
394 state.gb9c_level = 0;
395 }
396 }
397
398 /*
399 * Apply grapheme cluster breaking algorithm (UAX #29), see
400 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
401 */
402 notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_prop)) ||
403 (state.gb9c_level == 3 &&
404 cp1_prop == CHAR_BREAK_PROP_ICB_CONSONANT) ||
405 (dont_break_gb11[cp0_prop +
406 state.gb11_flag *
407 NUM_CHAR_BREAK_PROPS] &
408 (UINT32_C(1) << cp1_prop)) ||
409 (dont_break_gb12_13[cp0_prop +
410 state.gb12_13_flag *
411 NUM_CHAR_BREAK_PROPS] &
412 (UINT32_C(1) << cp1_prop));
413
414 /* update or reset flags (when we have a break) */
415 if (likely(!notbreak)) {
416 state.gb11_flag = state.gb12_13_flag = false;
417 }
418
419 state_serialize(&state, s);
420 } else {
421 cp0_prop = get_break_prop(cp0);
422 cp1_prop = get_break_prop(cp1);
423
424 /*
425 * Apply grapheme cluster breaking algorithm (UAX #29), see
426 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
427 *
428 * Given we have no state, this behaves as if the state-booleans
429 * were all set to false
430 */
431 notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_prop)) ||
432 (dont_break_gb11[cp0_prop] &
433 (UINT32_C(1) << cp1_prop)) ||
434 (dont_break_gb12_13[cp0_prop] &
435 (UINT32_C(1) << cp1_prop));
436 }
437
438 return !notbreak;
439 }
440
441 static size_t
442 next_character_break(HERODOTUS_READER *r)
443 {
444 uint_least16_t state = 0;
445 uint_least32_t cp0 = 0, cp1 = 0;
446
447 for (herodotus_read_codepoint(r, true, &cp0);
448 herodotus_read_codepoint(r, false, &cp1) ==
449 HERODOTUS_STATUS_SUCCESS;
450 herodotus_read_codepoint(r, true, &cp0)) {
451 if (grapheme_is_character_break(cp0, cp1, &state)) {
452 break;
453 }
454 }
455
456 return herodotus_reader_number_read(r);
457 }
458
459 size_t
460 grapheme_next_character_break(const uint_least32_t *str, size_t len)
461 {
462 HERODOTUS_READER r;
463
464 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
465
466 return next_character_break(&r);
467 }
468
469 size_t
470 grapheme_next_character_break_utf8(const char *str, size_t len)
471 {
472 HERODOTUS_READER r;
473
474 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
475
476 return next_character_break(&r);
477 }