util.c - libgrapheme - unicode string library
(HTM) git clone git://git.suckless.org/libgrapheme
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
util.c (11480B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <limits.h>
3 #include <stdbool.h>
4 #include <stddef.h>
5 #include <stdint.h>
6
7 #include "../gen/types.h"
8 #include "../grapheme.h"
9 #include "util.h"
10
11 void
12 herodotus_reader_init(HERODOTUS_READER *r, enum herodotus_type type,
13 const void *src, size_t srclen)
14 {
15 size_t i;
16
17 r->type = type;
18 r->src = src;
19 r->srclen = srclen;
20 r->off = 0;
21 r->terminated_by_null = false;
22
23 for (i = 0; i < LEN(r->soft_limit); i++) {
24 r->soft_limit[i] = SIZE_MAX;
25 }
26 }
27
28 void
29 herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *dest)
30 {
31 size_t i;
32
33 /*
34 * we copy such that we have a "fresh" start and build on the
35 * fact that src->soft_limit[i] for any i and src->srclen are
36 * always larger or equal to src->off
37 */
38 dest->type = src->type;
39 if (src->type == HERODOTUS_TYPE_CODEPOINT) {
40 dest->src =
41 (src->src == NULL) ?
42 NULL :
43 ((const uint_least32_t *)(src->src)) + src->off;
44 } else { /* src->type == HERODOTUS_TYPE_UTF8 */
45 dest->src = (src->src == NULL) ?
46 NULL :
47 ((const char *)(src->src)) + src->off;
48 }
49 if (src->srclen == SIZE_MAX) {
50 dest->srclen = SIZE_MAX;
51 } else {
52 dest->srclen =
53 (src->off < src->srclen) ? src->srclen - src->off : 0;
54 }
55 dest->off = 0;
56 dest->terminated_by_null = src->terminated_by_null;
57
58 for (i = 0; i < LEN(src->soft_limit); i++) {
59 if (src->soft_limit[i] == SIZE_MAX) {
60 dest->soft_limit[i] = SIZE_MAX;
61 } else {
62 /*
63 * if we have a degenerate case where the offset is
64 * higher than the soft-limit, we simply clamp the
65 * soft-limit to zero given we can't decide here
66 * to release the limit and, instead, we just
67 * prevent any more reads
68 */
69 dest->soft_limit[i] =
70 (src->off < src->soft_limit[i]) ?
71 src->soft_limit[i] - src->off :
72 0;
73 }
74 }
75 }
76
77 void
78 herodotus_reader_push_advance_limit(HERODOTUS_READER *r, size_t count)
79 {
80 size_t i;
81
82 for (i = LEN(r->soft_limit) - 1; i >= 1; i--) {
83 r->soft_limit[i] = r->soft_limit[i - 1];
84 }
85 r->soft_limit[0] = r->off + count;
86 }
87
88 void
89 herodotus_reader_pop_limit(HERODOTUS_READER *r)
90 {
91 size_t i;
92
93 for (i = 0; i < LEN(r->soft_limit) - 1; i++) {
94 r->soft_limit[i] = r->soft_limit[i + 1];
95 }
96 r->soft_limit[LEN(r->soft_limit) - 1] = SIZE_MAX;
97 }
98
99 size_t
100 herodotus_reader_next_word_break(const HERODOTUS_READER *r)
101 {
102 if (r->type == HERODOTUS_TYPE_CODEPOINT) {
103 return grapheme_next_word_break(
104 (const uint_least32_t *)(r->src) + r->off,
105 MIN(r->srclen, r->soft_limit[0]) - r->off);
106 } else { /* r->type == HERODOTUS_TYPE_UTF8 */
107 return grapheme_next_word_break_utf8(
108 (const char *)(r->src) + r->off,
109 MIN(r->srclen, r->soft_limit[0]) - r->off);
110 }
111 }
112
113 size_t
114 herodotus_reader_next_codepoint_break(const HERODOTUS_READER *r)
115 {
116 if (r->type == HERODOTUS_TYPE_CODEPOINT) {
117 return (r->off < MIN(r->srclen, r->soft_limit[0])) ? 1 : 0;
118 } else { /* r->type == HERODOTUS_TYPE_UTF8 */
119 return grapheme_decode_utf8(
120 (const char *)(r->src) + r->off,
121 MIN(r->srclen, r->soft_limit[0]) - r->off, NULL);
122 }
123 }
124
125 size_t
126 herodotus_reader_number_read(const HERODOTUS_READER *r)
127 {
128 return r->off;
129 }
130
131 enum herodotus_status
132 herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32_t *cp)
133 {
134 size_t ret;
135
136 if (r->terminated_by_null || r->off >= r->srclen || r->src == NULL) {
137 *cp = GRAPHEME_INVALID_CODEPOINT;
138 return HERODOTUS_STATUS_END_OF_BUFFER;
139 }
140
141 if (r->off >= r->soft_limit[0]) {
142 *cp = GRAPHEME_INVALID_CODEPOINT;
143 return HERODOTUS_STATUS_SOFT_LIMIT_REACHED;
144 }
145
146 if (r->type == HERODOTUS_TYPE_CODEPOINT) {
147 *cp = ((const uint_least32_t *)(r->src))[r->off];
148 ret = 1;
149 } else { /* r->type == HERODOTUS_TYPE_UTF8 */
150 ret = grapheme_decode_utf8(
151 (const char *)r->src + r->off,
152 MIN(r->srclen, r->soft_limit[0]) - r->off, cp);
153 }
154
155 if (unlikely(r->srclen == SIZE_MAX && *cp == 0)) {
156 /*
157 * We encountered a null-codepoint. Don't increment
158 * offset and return as if the buffer had ended here all
159 * along
160 */
161 r->terminated_by_null = true;
162 return HERODOTUS_STATUS_END_OF_BUFFER;
163 }
164
165 if (r->off + ret > MIN(r->srclen, r->soft_limit[0])) {
166 /*
167 * we want more than we have; instead of returning
168 * garbage we terminate here.
169 */
170 return HERODOTUS_STATUS_END_OF_BUFFER;
171 }
172
173 /*
174 * Increase offset which we now know won't surpass the limits,
175 * unless we got told otherwise
176 */
177 if (advance) {
178 r->off += ret;
179 }
180
181 return HERODOTUS_STATUS_SUCCESS;
182 }
183
184 void
185 herodotus_writer_init(HERODOTUS_WRITER *w, enum herodotus_type type, void *dest,
186 size_t destlen)
187 {
188 w->type = type;
189 w->dest = dest;
190 w->destlen = destlen;
191 w->off = 0;
192 w->first_unwritable_offset = SIZE_MAX;
193 }
194
195 void
196 herodotus_writer_nul_terminate(HERODOTUS_WRITER *w)
197 {
198 if (w->dest == NULL) {
199 return;
200 }
201
202 if (w->off < w->destlen) {
203 /* We still have space in the buffer. Simply use it */
204 if (w->type == HERODOTUS_TYPE_CODEPOINT) {
205 ((uint_least32_t *)(w->dest))[w->off] = 0;
206 } else { /* w->type == HERODOTUS_TYPE_UTF8 */
207 ((char *)(w->dest))[w->off] = '\0';
208 }
209 } else if (w->first_unwritable_offset < w->destlen) {
210 /*
211 * There is no more space in the buffer. However,
212 * we have noted down the first offset we couldn't
213 * use to write into the buffer and it's smaller than
214 * destlen. Thus we bailed writing into the
215 * destination when a multibyte-codepoint couldn't be
216 * written. So the last "real" byte might be at
217 * destlen-4, destlen-3, destlen-2 or destlen-1
218 * (the last case meaning truncation).
219 */
220 if (w->type == HERODOTUS_TYPE_CODEPOINT) {
221 ((uint_least32_t
222 *)(w->dest))[w->first_unwritable_offset] = 0;
223 } else { /* w->type == HERODOTUS_TYPE_UTF8 */
224 ((char *)(w->dest))[w->first_unwritable_offset] = '\0';
225 }
226 } else if (w->destlen > 0) {
227 /*
228 * In this case, there is no more space in the buffer and
229 * the last unwritable offset is larger than
230 * or equal to the destination buffer length. This means
231 * that we are forced to simply write into the last
232 * byte.
233 */
234 if (w->type == HERODOTUS_TYPE_CODEPOINT) {
235 ((uint_least32_t *)(w->dest))[w->destlen - 1] = 0;
236 } else { /* w->type == HERODOTUS_TYPE_UTF8 */
237 ((char *)(w->dest))[w->destlen - 1] = '\0';
238 }
239 }
240
241 /* w->off is not incremented in any case */
242 }
243
244 size_t
245 herodotus_writer_number_written(const HERODOTUS_WRITER *w)
246 {
247 return w->off;
248 }
249
250 void
251 herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp)
252 {
253 size_t ret;
254
255 /*
256 * This function will always faithfully say how many codepoints
257 * were written, even if the buffer ends. This is used to enable
258 * truncation detection.
259 */
260 if (w->type == HERODOTUS_TYPE_CODEPOINT) {
261 if (w->dest != NULL && w->off < w->destlen) {
262 ((uint_least32_t *)(w->dest))[w->off] = cp;
263 }
264
265 w->off += 1;
266 } else { /* w->type == HERODOTUS_TYPE_UTF8 */
267 /*
268 * First determine how many bytes we need to encode the
269 * codepoint
270 */
271 ret = grapheme_encode_utf8(cp, NULL, 0);
272
273 if (w->dest != NULL && w->off + ret < w->destlen) {
274 /* we still have enough room in the buffer */
275 grapheme_encode_utf8(cp, (char *)(w->dest) + w->off,
276 w->destlen - w->off);
277 } else if (w->first_unwritable_offset == SIZE_MAX) {
278 /*
279 * the first unwritable offset has not been
280 * noted down, so this is the first time we can't
281 * write (completely) to an offset
282 */
283 w->first_unwritable_offset = w->off;
284 }
285
286 w->off += ret;
287 }
288 }
289
290 void
291 proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop,
292 uint_least8_t (*get_break_prop)(uint_least32_t),
293 bool (*is_skippable_prop)(uint_least8_t),
294 void (*skip_shift_callback)(uint_least8_t, void *),
295 struct proper *p)
296 {
297 uint_least8_t prop;
298 uint_least32_t cp;
299 size_t i;
300
301 /* set internal variables */
302 p->state = state;
303 p->no_prop = no_prop;
304 p->get_break_prop = get_break_prop;
305 p->is_skippable_prop = is_skippable_prop;
306 p->skip_shift_callback = skip_shift_callback;
307
308 /*
309 * Initialize mid-reader, which is basically just there
310 * to reflect the current position of the viewing-line
311 */
312 herodotus_reader_copy(r, &(p->mid_reader));
313
314 /*
315 * In the initialization, we simply (try to) fill in next_prop.
316 * If we cannot read in more (due to the buffer ending), we
317 * fill in the prop as invalid
318 */
319
320 /*
321 * initialize the previous properties to have no property
322 * (given we are at the start of the buffer)
323 */
324 p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop;
325 p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop;
326
327 /*
328 * initialize the next properties
329 */
330
331 /* initialize the raw reader */
332 herodotus_reader_copy(r, &(p->raw_reader));
333
334 /* fill in the two next raw properties (after no-initialization) */
335 p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop;
336 for (i = 0;
337 i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
338 HERODOTUS_STATUS_SUCCESS;) {
339 p->raw.next_prop[i++] = p->get_break_prop(cp);
340 }
341
342 /* initialize the skip reader */
343 herodotus_reader_copy(r, &(p->skip_reader));
344
345 /* fill in the two next skip properties (after no-initialization) */
346 p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop;
347 for (i = 0;
348 i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
349 HERODOTUS_STATUS_SUCCESS;) {
350 prop = p->get_break_prop(cp);
351 if (!p->is_skippable_prop(prop)) {
352 p->skip.next_prop[i++] = prop;
353 }
354 }
355 }
356
357 int
358 proper_advance(struct proper *p)
359 {
360 uint_least8_t prop;
361 uint_least32_t cp;
362
363 /* read in next "raw" property */
364 if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
365 HERODOTUS_STATUS_SUCCESS) {
366 prop = p->get_break_prop(cp);
367 } else {
368 prop = p->no_prop;
369 }
370
371 /*
372 * do a shift-in, unless we find that the property that is to
373 * be moved past the "raw-viewing-line" (this property is stored
374 * in p->raw.next_prop[0]) is a no_prop, indicating that
375 * we are at the end of the buffer.
376 */
377 if (p->raw.next_prop[0] == p->no_prop) {
378 return 1;
379 }
380
381 /* shift in the properties */
382 p->raw.prev_prop[1] = p->raw.prev_prop[0];
383 p->raw.prev_prop[0] = p->raw.next_prop[0];
384 p->raw.next_prop[0] = p->raw.next_prop[1];
385 p->raw.next_prop[1] = prop;
386
387 /* advance the middle reader viewing-line */
388 (void)herodotus_read_codepoint(&(p->mid_reader), true, &cp);
389
390 /* check skippability-property */
391 if (!p->is_skippable_prop(p->raw.prev_prop[0])) {
392 /*
393 * the property that has moved past the "raw-viewing-line"
394 * (this property is now (after the raw-shift) stored in
395 * p->raw.prev_prop[0] and guaranteed not to be a no-prop,
396 * guaranteeing that we won't shift a no-prop past the
397 * "viewing-line" in the skip-properties) is not a skippable
398 * property, thus we need to shift the skip property as well.
399 */
400 p->skip.prev_prop[1] = p->skip.prev_prop[0];
401 p->skip.prev_prop[0] = p->skip.next_prop[0];
402 p->skip.next_prop[0] = p->skip.next_prop[1];
403
404 /*
405 * call the skip-shift-callback on the property that
406 * passed the skip-viewing-line (this property is now
407 * stored in p->skip.prev_prop[0]).
408 */
409 p->skip_shift_callback(p->skip.prev_prop[0], p->state);
410
411 /* determine the next shift property */
412 p->skip.next_prop[1] = p->no_prop;
413 while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
414 HERODOTUS_STATUS_SUCCESS) {
415 prop = p->get_break_prop(cp);
416 if (!p->is_skippable_prop(prop)) {
417 p->skip.next_prop[1] = prop;
418 break;
419 }
420 }
421 }
422
423 return 0;
424 }