sentence.c - libgrapheme - unicode string library
(HTM) git clone git://git.suckless.org/libgrapheme
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
sentence.c (8420B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <stdbool.h>
3 #include <stddef.h>
4
5 #include "../gen/sentence.h"
6 #include "../grapheme.h"
7 #include "util.h"
8
/*
 * Running state of the sentence-break scanner. Both members are
 * "levels" that sentence_skip_shift_callback() updates every time
 * it is handed a property; 0 always means "not inside the sequence".
 */
struct sentence_break_state {
	/* progress through "ATerm Close* Sp*", values 0 to 3 */
	uint_least8_t aterm_close_sp_level;

	/* progress through "SATerm Close* Sp* ParaSep?", values 0 to 4 */
	uint_least8_t saterm_close_sp_parasep_level;
};
13
14 static inline uint_least8_t
15 get_sentence_break_prop(uint_least32_t cp)
16 {
17 if (likely(cp <= UINT32_C(0x10FFFF))) {
18 return (uint_least8_t)
19 sentence_break_minor[sentence_break_major[cp >> 8] +
20 (cp & 0xff)];
21 } else {
22 return SENTENCE_BREAK_PROP_OTHER;
23 }
24 }
25
26 static bool
27 is_skippable_sentence_prop(uint_least8_t prop)
28 {
29 return prop == SENTENCE_BREAK_PROP_EXTEND ||
30 prop == SENTENCE_BREAK_PROP_FORMAT;
31 }
32
33 static void
34 sentence_skip_shift_callback(uint_least8_t prop, void *s)
35 {
36 struct sentence_break_state *state = (struct sentence_break_state *)s;
37
38 /*
39 * Here comes a bit of magic. The rules
40 * SB8, SB8a, SB9 and SB10 have very complicated
41 * left-hand-side-rules of the form
42 *
43 * ATerm Close* Sp*
44 * SATerm Close*
45 * SATerm Close* Sp*
46 * SATerm Close* Sp* ParaSep?
47 *
48 * but instead of backtracking, we keep the
49 * state as some kind of "power level" in
50 * two state-variables
51 *
52 * aterm_close_sp_level
53 * saterm_close_sp_parasep_level
54 *
55 * that go from 0 to 3/4:
56 *
57 * 0: we are not in the sequence
58 * 1: we have one ATerm/SATerm to the left of
59 * the middle spot
60 * 2: we have one ATerm/SATerm and one or more
61 * Close to the left of the middle spot
62 * 3: we have one ATerm/SATerm, zero or more
63 * Close and one or more Sp to the left of
64 * the middle spot.
65 * 4: we have one SATerm, zero or more Close,
66 * zero or more Sp and one ParaSep to the
67 * left of the middle spot.
68 *
69 */
70 if ((state->aterm_close_sp_level == 0 ||
71 state->aterm_close_sp_level == 1) &&
72 prop == SENTENCE_BREAK_PROP_ATERM) {
73 /* sequence has begun */
74 state->aterm_close_sp_level = 1;
75 } else if ((state->aterm_close_sp_level == 1 ||
76 state->aterm_close_sp_level == 2) &&
77 prop == SENTENCE_BREAK_PROP_CLOSE) {
78 /* close-sequence begins or continued */
79 state->aterm_close_sp_level = 2;
80 } else if ((state->aterm_close_sp_level == 1 ||
81 state->aterm_close_sp_level == 2 ||
82 state->aterm_close_sp_level == 3) &&
83 prop == SENTENCE_BREAK_PROP_SP) {
84 /* sp-sequence begins or continued */
85 state->aterm_close_sp_level = 3;
86 } else {
87 /* sequence broke */
88 state->aterm_close_sp_level = 0;
89 }
90
91 if ((state->saterm_close_sp_parasep_level == 0 ||
92 state->saterm_close_sp_parasep_level == 1) &&
93 (prop == SENTENCE_BREAK_PROP_STERM ||
94 prop == SENTENCE_BREAK_PROP_ATERM)) {
95 /* sequence has begun */
96 state->saterm_close_sp_parasep_level = 1;
97 } else if ((state->saterm_close_sp_parasep_level == 1 ||
98 state->saterm_close_sp_parasep_level == 2) &&
99 prop == SENTENCE_BREAK_PROP_CLOSE) {
100 /* close-sequence begins or continued */
101 state->saterm_close_sp_parasep_level = 2;
102 } else if ((state->saterm_close_sp_parasep_level == 1 ||
103 state->saterm_close_sp_parasep_level == 2 ||
104 state->saterm_close_sp_parasep_level == 3) &&
105 prop == SENTENCE_BREAK_PROP_SP) {
106 /* sp-sequence begins or continued */
107 state->saterm_close_sp_parasep_level = 3;
108 } else if ((state->saterm_close_sp_parasep_level == 1 ||
109 state->saterm_close_sp_parasep_level == 2 ||
110 state->saterm_close_sp_parasep_level == 3) &&
111 (prop == SENTENCE_BREAK_PROP_SEP ||
112 prop == SENTENCE_BREAK_PROP_CR ||
113 prop == SENTENCE_BREAK_PROP_LF)) {
114 /* ParaSep at the end of the sequence */
115 state->saterm_close_sp_parasep_level = 4;
116 } else {
117 /* sequence broke */
118 state->saterm_close_sp_parasep_level = 0;
119 }
120 }
121
122 static size_t
123 next_sentence_break(HERODOTUS_READER *r)
124 {
125 HERODOTUS_READER tmp;
126 enum sentence_break_property prop;
127 struct proper p;
128 struct sentence_break_state state = { 0 };
129 uint_least32_t cp;
130
131 /*
132 * Apply sentence breaking algorithm (UAX #29), see
133 * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
134 */
135 proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS,
136 get_sentence_break_prop, is_skippable_sentence_prop,
137 sentence_skip_shift_callback, &p);
138
139 while (!proper_advance(&p)) {
140 /* SB3 */
141 if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR &&
142 p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) {
143 continue;
144 }
145
146 /* SB4 */
147 if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP ||
148 p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR ||
149 p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) {
150 break;
151 }
152
153 /* SB5 */
154 if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND ||
155 p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) {
156 continue;
157 }
158
159 /* SB6 */
160 if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
161 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) {
162 continue;
163 }
164
165 /* SB7 */
166 if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER ||
167 p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) &&
168 p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
169 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) {
170 continue;
171 }
172
173 /* SB8 */
174 if (state.aterm_close_sp_level == 1 ||
175 state.aterm_close_sp_level == 2 ||
176 state.aterm_close_sp_level == 3) {
177 /*
178 * This is the most complicated rule, requiring
179 * the right-hand-side to satisfy the regular expression
180 *
181 * ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )*
182 * Lower
183 *
184 * which we simply check "manually" given LUT-lookups
185 * are very cheap by starting at the mid_reader.
186 *
187 */
188 herodotus_reader_copy(&(p.mid_reader), &tmp);
189
190 prop = NUM_SENTENCE_BREAK_PROPS;
191 while (herodotus_read_codepoint(&tmp, true, &cp) ==
192 HERODOTUS_STATUS_SUCCESS) {
193 prop = get_sentence_break_prop(cp);
194
195 /*
196 * the skippable properties are ignored
197 * automatically here given they do not
198 * match the following condition
199 */
200 if (prop == SENTENCE_BREAK_PROP_OLETTER ||
201 prop == SENTENCE_BREAK_PROP_UPPER ||
202 prop == SENTENCE_BREAK_PROP_LOWER ||
203 prop == SENTENCE_BREAK_PROP_SEP ||
204 prop == SENTENCE_BREAK_PROP_CR ||
205 prop == SENTENCE_BREAK_PROP_LF ||
206 prop == SENTENCE_BREAK_PROP_STERM ||
207 prop == SENTENCE_BREAK_PROP_ATERM) {
208 break;
209 }
210 }
211
212 if (prop == SENTENCE_BREAK_PROP_LOWER) {
213 continue;
214 }
215 }
216
217 /* SB8a */
218 if ((state.saterm_close_sp_parasep_level == 1 ||
219 state.saterm_close_sp_parasep_level == 2 ||
220 state.saterm_close_sp_parasep_level == 3) &&
221 (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE ||
222 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM ||
223 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) {
224 continue;
225 }
226
227 /* SB9 */
228 if ((state.saterm_close_sp_parasep_level == 1 ||
229 state.saterm_close_sp_parasep_level == 2) &&
230 (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE ||
231 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
232 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
233 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
234 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
235 continue;
236 }
237
238 /* SB10 */
239 if ((state.saterm_close_sp_parasep_level == 1 ||
240 state.saterm_close_sp_parasep_level == 2 ||
241 state.saterm_close_sp_parasep_level == 3) &&
242 (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
243 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
244 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
245 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
246 continue;
247 }
248
249 /* SB11 */
250 if (state.saterm_close_sp_parasep_level == 1 ||
251 state.saterm_close_sp_parasep_level == 2 ||
252 state.saterm_close_sp_parasep_level == 3 ||
253 state.saterm_close_sp_parasep_level == 4) {
254 break;
255 }
256
257 /* SB998 */
258 continue;
259 }
260
261 return herodotus_reader_number_read(&(p.mid_reader));
262 }
263
264 size_t
265 grapheme_next_sentence_break(const uint_least32_t *str, size_t len)
266 {
267 HERODOTUS_READER r;
268
269 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
270
271 return next_sentence_break(&r);
272 }
273
274 size_t
275 grapheme_next_sentence_break_utf8(const char *str, size_t len)
276 {
277 HERODOTUS_READER r;
278
279 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
280
281 return next_sentence_break(&r);
282 }