utf8-decode.c - libgrapheme - unicode string library
(HTM) git clone git://git.suckless.org/libgrapheme
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
utf8-decode.c (7826B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <stddef.h>
3 #include <stdint.h>
4 #include <stdio.h>
5 #include <string.h>
6
7 #include "../grapheme.h"
8 #include "util.h"
9
10 static const struct {
11 char *arr; /* UTF-8 byte sequence */
12 size_t len; /* length of UTF-8 byte sequence */
13 size_t exp_len; /* expected length returned */
14 uint_least32_t exp_cp; /* expected codepoint returned */
15 } dec_test[] = {
16 {
17 /* empty sequence
18 * [ ] ->
19 * INVALID
20 */
21 .arr = NULL,
22 .len = 0,
23 .exp_len = 0,
24 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
25 },
26 {
27 /* invalid lead byte
28 * [ 11111101 ] ->
29 * INVALID
30 */
31 .arr = (char *)(unsigned char[]) { 0xFD },
32 .len = 1,
33 .exp_len = 1,
34 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
35 },
36 {
37 /* valid 1-byte sequence
38 * [ 00000001 ] ->
39 * 0000001
40 */
41 .arr = (char *)(unsigned char[]) { 0x01 },
42 .len = 1,
43 .exp_len = 1,
44 .exp_cp = 0x1,
45 },
46 {
47 /* valid 2-byte sequence
48 * [ 11000011 10111111 ] ->
49 * 00011111111
50 */
51 .arr = (char *)(unsigned char[]) { 0xC3, 0xBF },
52 .len = 2,
53 .exp_len = 2,
54 .exp_cp = 0xFF,
55 },
56 {
57 /* invalid 2-byte sequence (second byte missing)
58 * [ 11000011 ] ->
59 * INVALID
60 */
61 .arr = (char *)(unsigned char[]) { 0xC3 },
62 .len = 1,
63 .exp_len = 2,
64 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
65 },
66 {
67 /* invalid 2-byte sequence (second byte malformed)
68 * [ 11000011 11111111 ] ->
69 * INVALID
70 */
71 .arr = (char *)(unsigned char[]) { 0xC3, 0xFF },
72 .len = 2,
73 .exp_len = 1,
74 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
75 },
76 {
77 /* invalid 2-byte sequence (overlong encoded)
78 * [ 11000001 10111111 ] ->
79 * INVALID
80 */
81 .arr = (char *)(unsigned char[]) { 0xC1, 0xBF },
82 .len = 2,
83 .exp_len = 2,
84 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
85 },
86 {
87 /* valid 3-byte sequence
88 * [ 11100000 10111111 10111111 ] ->
89 * 0000111111111111
90 */
91 .arr = (char *)(unsigned char[]) { 0xE0, 0xBF, 0xBF },
92 .len = 3,
93 .exp_len = 3,
94 .exp_cp = 0xFFF,
95 },
96 {
97 /* invalid 3-byte sequence (second byte missing)
98 * [ 11100000 ] ->
99 * INVALID
100 */
101 .arr = (char *)(unsigned char[]) { 0xE0 },
102 .len = 1,
103 .exp_len = 3,
104 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
105 },
106 {
107 /* invalid 3-byte sequence (second byte malformed)
108 * [ 11100000 01111111 10111111 ] ->
109 * INVALID
110 */
111 .arr = (char *)(unsigned char[]) { 0xE0, 0x7F, 0xBF },
112 .len = 3,
113 .exp_len = 1,
114 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
115 },
116 {
117 /* invalid 3-byte sequence (short string, second byte malformed)
118 * [ 11100000 01111111 ] ->
119 * INVALID
120 */
121 .arr = (char *)(unsigned char[]) { 0xE0, 0x7F },
122 .len = 2,
123 .exp_len = 1,
124 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
125 },
126 {
127 /* invalid 3-byte sequence (third byte missing)
128 * [ 11100000 10111111 ] ->
129 * INVALID
130 */
131 .arr = (char *)(unsigned char[]) { 0xE0, 0xBF },
132 .len = 2,
133 .exp_len = 3,
134 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
135 },
136 {
137 /* invalid 3-byte sequence (third byte malformed)
138 * [ 11100000 10111111 01111111 ] ->
139 * INVALID
140 */
141 .arr = (char *)(unsigned char[]) { 0xE0, 0xBF, 0x7F },
142 .len = 3,
143 .exp_len = 2,
144 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
145 },
146 {
147 /* invalid 3-byte sequence (overlong encoded)
148 * [ 11100000 10011111 10111111 ] ->
149 * INVALID
150 */
151 .arr = (char *)(unsigned char[]) { 0xE0, 0x9F, 0xBF },
152 .len = 3,
153 .exp_len = 3,
154 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
155 },
156 {
157 /* invalid 3-byte sequence (UTF-16 surrogate half)
158 * [ 11101101 10100000 10000000 ] ->
159 * INVALID
160 */
161 .arr = (char *)(unsigned char[]) { 0xED, 0xA0, 0x80 },
162 .len = 3,
163 .exp_len = 3,
164 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
165 },
166 {
167 /* valid 4-byte sequence
168 * [ 11110011 10111111 10111111 10111111 ] ->
169 * 011111111111111111111
170 */
171 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF, 0xBF },
172 .len = 4,
173 .exp_len = 4,
174 .exp_cp = UINT32_C(0xFFFFF),
175 },
176 {
177 /* invalid 4-byte sequence (second byte missing)
178 * [ 11110011 ] ->
179 * INVALID
180 */
181 .arr = (char *)(unsigned char[]) { 0xF3 },
182 .len = 1,
183 .exp_len = 4,
184 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
185 },
186 {
187 /* invalid 4-byte sequence (second byte malformed)
188 * [ 11110011 01111111 10111111 10111111 ] ->
189 * INVALID
190 */
191 .arr = (char *)(unsigned char[]) { 0xF3, 0x7F, 0xBF, 0xBF },
192 .len = 4,
193 .exp_len = 1,
194 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
195 },
196 {
197 /* invalid 4-byte sequence (short string 1, second byte
198 * malformed) [ 11110011 011111111 ] -> INVALID
199 */
200 .arr = (char *)(unsigned char[]) { 0xF3, 0x7F },
201 .len = 2,
202 .exp_len = 1,
203 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
204 },
205 {
206 /* invalid 4-byte sequence (short string 2, second byte
207 * malformed) [ 11110011 011111111 10111111 ] -> INVALID
208 */
209 .arr = (char *)(unsigned char[]) { 0xF3, 0x7F, 0xBF },
210 .len = 3,
211 .exp_len = 1,
212 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
213 },
214
215 {
216 /* invalid 4-byte sequence (third byte missing)
217 * [ 11110011 10111111 ] ->
218 * INVALID
219 */
220 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF },
221 .len = 2,
222 .exp_len = 4,
223 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
224 },
225 {
226 /* invalid 4-byte sequence (third byte malformed)
227 * [ 11110011 10111111 01111111 10111111 ] ->
228 * INVALID
229 */
230 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0x7F, 0xBF },
231 .len = 4,
232 .exp_len = 2,
233 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
234 },
235 {
236 /* invalid 4-byte sequence (short string, third byte malformed)
237 * [ 11110011 10111111 01111111 ] ->
238 * INVALID
239 */
240 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0x7F },
241 .len = 3,
242 .exp_len = 2,
243 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
244 },
245 {
246 /* invalid 4-byte sequence (fourth byte missing)
247 * [ 11110011 10111111 10111111 ] ->
248 * INVALID
249 */
250 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF },
251 .len = 3,
252 .exp_len = 4,
253 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
254 },
255 {
256 /* invalid 4-byte sequence (fourth byte malformed)
257 * [ 11110011 10111111 10111111 01111111 ] ->
258 * INVALID
259 */
260 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF, 0x7F },
261 .len = 4,
262 .exp_len = 3,
263 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
264 },
265 {
266 /* invalid 4-byte sequence (overlong encoded)
267 * [ 11110000 10000000 10000001 10111111 ] ->
268 * INVALID
269 */
270 .arr = (char *)(unsigned char[]) { 0xF0, 0x80, 0x81, 0xBF },
271 .len = 4,
272 .exp_len = 4,
273 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
274 },
275 {
276 /* invalid 4-byte sequence (UTF-16-unrepresentable)
277 * [ 11110100 10010000 10000000 10000000 ] ->
278 * INVALID
279 */
280 .arr = (char *)(unsigned char[]) { 0xF4, 0x90, 0x80, 0x80 },
281 .len = 4,
282 .exp_len = 4,
283 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
284 },
285 };
286
287 int
288 main(int argc, char *argv[])
289 {
290 size_t i, failed;
291
292 (void)argc;
293
294 /* UTF-8 decoder test */
295 for (i = 0, failed = 0; i < LEN(dec_test); i++) {
296 size_t len;
297 uint_least32_t cp;
298
299 len = grapheme_decode_utf8(dec_test[i].arr, dec_test[i].len,
300 &cp);
301
302 if (len != dec_test[i].exp_len || cp != dec_test[i].exp_cp) {
303 fprintf(stderr,
304 "%s: Failed test %zu: "
305 "Expected (%zx,%u), but got (%zx,%u).\n",
306 argv[0], i, dec_test[i].exp_len,
307 dec_test[i].exp_cp, len, cp);
308 failed++;
309 }
310 }
311 printf("%s: %zu/%zu unit tests passed.\n", argv[0],
312 LEN(dec_test) - failed, LEN(dec_test));
313
314 return (failed > 0) ? 1 : 0;
315 }