word.c - libgrapheme - unicode string library
(HTM) git clone git://git.suckless.org/libgrapheme
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
word.c (8052B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <stdbool.h>
3 #include <stddef.h>
4
5 #include "../gen/word.h"
6 #include "../grapheme.h"
7 #include "util.h"
8
/*
 * Extra state carried through one run of the word-break algorithm.
 * It is updated by word_skip_shift_callback() and consulted by the
 * regional-indicator rules (WB15/WB16) in next_word_break().
 */
struct word_break_state {
	/*
	 * true while the number of consecutive regional indicators
	 * on the left side of the breakpoint is even (zero included)
	 */
	bool ri_even;
};
12
13 static inline uint_least8_t
14 get_word_break_prop(uint_least32_t cp)
15 {
16 if (likely(cp <= UINT32_C(0x10FFFF))) {
17 return (uint_least8_t)
18 word_break_minor[word_break_major[cp >> 8] +
19 (cp & 0xff)];
20 } else {
21 return WORD_BREAK_PROP_OTHER;
22 }
23 }
24
25 static bool
26 is_skippable_word_prop(uint_least8_t prop)
27 {
28 return prop == WORD_BREAK_PROP_EXTEND ||
29 prop == WORD_BREAK_PROP_FORMAT || prop == WORD_BREAK_PROP_ZWJ;
30 }
31
32 static void
33 word_skip_shift_callback(uint_least8_t prop, void *s)
34 {
35 struct word_break_state *state = (struct word_break_state *)s;
36
37 if (prop == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
38 /*
39 * The property we just shifted in is
40 * a regional indicator, increasing the
41 * number of consecutive RIs on the left
42 * side of the breakpoint by one, changing
43 * the oddness.
44 *
45 */
46 state->ri_even = !(state->ri_even);
47 } else {
48 /*
49 * We saw no regional indicator, so the
50 * number of consecutive RIs on the left
51 * side of the breakpoint is zero, which
52 * is an even number.
53 *
54 */
55 state->ri_even = true;
56 }
57 }
58
59 static size_t
60 next_word_break(HERODOTUS_READER *r)
61 {
62 struct proper p;
63 struct word_break_state state = { .ri_even = true };
64
65 /*
66 * Apply word breaking algorithm (UAX #29), see
67 * https://unicode.org/reports/tr29/#Word_Boundary_Rules
68 */
69 proper_init(r, &state, NUM_WORD_BREAK_PROPS, get_word_break_prop,
70 is_skippable_word_prop, word_skip_shift_callback, &p);
71
72 while (!proper_advance(&p)) {
73 /* WB3 */
74 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_CR &&
75 p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
76 continue;
77 }
78
79 /* WB3a */
80 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_NEWLINE ||
81 p.raw.prev_prop[0] == WORD_BREAK_PROP_CR ||
82 p.raw.prev_prop[0] == WORD_BREAK_PROP_LF) {
83 break;
84 }
85
86 /* WB3b */
87 if (p.raw.next_prop[0] == WORD_BREAK_PROP_NEWLINE ||
88 p.raw.next_prop[0] == WORD_BREAK_PROP_CR ||
89 p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
90 break;
91 }
92
93 /* WB3c */
94 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_ZWJ &&
95 (p.raw.next_prop[0] ==
96 WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
97 p.raw.next_prop[0] ==
98 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
99 continue;
100 }
101
102 /* WB3d */
103 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_WSEGSPACE &&
104 p.raw.next_prop[0] == WORD_BREAK_PROP_WSEGSPACE) {
105 continue;
106 }
107
108 /* WB4 */
109 if (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTEND ||
110 p.raw.next_prop[0] == WORD_BREAK_PROP_FORMAT ||
111 p.raw.next_prop[0] == WORD_BREAK_PROP_ZWJ) {
112 continue;
113 }
114
115 /* WB5 */
116 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
117 p.skip.prev_prop[0] ==
118 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
119 p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
120 (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
121 p.skip.next_prop[0] ==
122 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
123 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
124 continue;
125 }
126
127 /* WB6 */
128 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
129 p.skip.prev_prop[0] ==
130 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
131 p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
132 (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
133 p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
134 p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
135 (p.skip.next_prop[1] == WORD_BREAK_PROP_ALETTER ||
136 p.skip.next_prop[1] ==
137 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
138 p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
139 continue;
140 }
141
142 /* WB7 */
143 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
144 p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
145 p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
146 (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
147 p.skip.next_prop[0] ==
148 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
149 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
150 (p.skip.prev_prop[1] == WORD_BREAK_PROP_ALETTER ||
151 p.skip.prev_prop[1] ==
152 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
153 p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
154 continue;
155 }
156
157 /* WB7a */
158 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
159 p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) {
160 continue;
161 }
162
163 /* WB7b */
164 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
165 p.skip.next_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
166 p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
167 continue;
168 }
169
170 /* WB7c */
171 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
172 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
173 p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
174 continue;
175 }
176
177 /* WB8 */
178 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
179 p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
180 continue;
181 }
182
183 /* WB9 */
184 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
185 p.skip.prev_prop[0] ==
186 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
187 p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
188 p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
189 continue;
190 }
191
192 /* WB10 */
193 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
194 (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
195 p.skip.next_prop[0] ==
196 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
197 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
198 continue;
199 }
200
201 /* WB11 */
202 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUM ||
203 p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
204 p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
205 p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC &&
206 p.skip.prev_prop[1] == WORD_BREAK_PROP_NUMERIC) {
207 continue;
208 }
209
210 /* WB12 */
211 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
212 (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUM ||
213 p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
214 p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
215 p.skip.next_prop[1] == WORD_BREAK_PROP_NUMERIC) {
216 continue;
217 }
218
219 /* WB13 */
220 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA &&
221 p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA) {
222 continue;
223 }
224
225 /* WB13a */
226 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
227 p.skip.prev_prop[0] ==
228 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
229 p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
230 p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC ||
231 p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA ||
232 p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) &&
233 p.skip.next_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) {
234 continue;
235 }
236
237 /* WB13b */
238 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET &&
239 (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
240 p.skip.next_prop[0] ==
241 WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
242 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
243 p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC ||
244 p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA)) {
245 continue;
246 }
247
248 /* WB15 and WB16 */
249 if (!state.ri_even &&
250 p.skip.next_prop[0] == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
251 continue;
252 }
253
254 /* WB999 */
255 break;
256 }
257
258 return herodotus_reader_number_read(&(p.mid_reader));
259 }
260
261 size_t
262 grapheme_next_word_break(const uint_least32_t *str, size_t len)
263 {
264 HERODOTUS_READER r;
265
266 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
267
268 return next_word_break(&r);
269 }
270
271 size_t
272 grapheme_next_word_break_utf8(const char *str, size_t len)
273 {
274 HERODOTUS_READER r;
275
276 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
277
278 return next_word_break(&r);
279 }