utf8.c - libgrapheme - unicode string library
(HTM) git clone git://git.suckless.org/libgrapheme
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
utf8.c (6131B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <stddef.h>
3 #include <stdint.h>
4
5 #include "../grapheme.h"
6 #include "util.h"
7
8 #define BETWEEN(c, l, u) ((c) >= (l) && (c) <= (u))
9
10 /* lookup-table for the types of sequence first bytes */
11 static const struct {
12 uint_least8_t lower; /* lower bound of sequence first byte */
13 uint_least8_t upper; /* upper bound of sequence first byte */
14 uint_least32_t mincp; /* smallest non-overlong encoded codepoint */
15 uint_least32_t maxcp; /* largest encodable codepoint */
16 /*
17 * implicit: table-offset represents the number of following
18 * bytes of the form 10xxxxxx (6 bits capacity each)
19 */
20 } lut[] = {
21 [0] = {
22 /* 0xxxxxxx */
23 .lower = 0x00, /* 00000000 */
24 .upper = 0x7F, /* 01111111 */
25 .mincp = (uint_least32_t)0,
26 .maxcp = ((uint_least32_t)1 << 7) - 1, /* 7 bits capacity */
27 },
28 [1] = {
29 /* 110xxxxx */
30 .lower = 0xC0, /* 11000000 */
31 .upper = 0xDF, /* 11011111 */
32 .mincp = (uint_least32_t)1 << 7,
33 .maxcp = ((uint_least32_t)1 << 11) - 1, /* 5+6=11 bits capacity */
34 },
35 [2] = {
36 /* 1110xxxx */
37 .lower = 0xE0, /* 11100000 */
38 .upper = 0xEF, /* 11101111 */
39 .mincp = (uint_least32_t)1 << 11,
40 .maxcp = ((uint_least32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */
41 },
42 [3] = {
43 /* 11110xxx */
44 .lower = 0xF0, /* 11110000 */
45 .upper = 0xF7, /* 11110111 */
46 .mincp = (uint_least32_t)1 << 16,
47 .maxcp = ((uint_least32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */
48 },
49 };
50
51 size_t
52 grapheme_decode_utf8(const char *str, size_t len, uint_least32_t *cp)
53 {
54 size_t off, i;
55 uint_least32_t tmp;
56
57 if (cp == NULL) {
58 /*
59 * instead of checking every time if cp is NULL within
60 * the decoder, simply point it at a dummy variable here.
61 */
62 cp = &tmp;
63 }
64
65 if (str == NULL || len == 0) {
66 /* a sequence must be at least 1 byte long */
67 *cp = GRAPHEME_INVALID_CODEPOINT;
68 return 0;
69 }
70
71 /* identify sequence type with the first byte */
72 for (off = 0; off < LEN(lut); off++) {
73 if (BETWEEN(((const unsigned char *)str)[0], lut[off].lower,
74 lut[off].upper)) {
75 /*
76 * first byte is within the bounds; fill
77 * p with the the first bits contained in
78 * the first byte (by subtracting the high bits)
79 */
80 *cp = ((const unsigned char *)str)[0] - lut[off].lower;
81 break;
82 }
83 }
84 if (off == LEN(lut)) {
85 /*
86 * first byte does not match a sequence type;
87 * set cp as invalid and return 1 byte processed
88 *
89 * this also includes the cases where bits higher than
90 * the 8th are set on systems with CHAR_BIT > 8
91 */
92 *cp = GRAPHEME_INVALID_CODEPOINT;
93 return 1;
94 }
95 if (1 + off > len) {
96 /*
97 * input is not long enough, set cp as invalid
98 */
99 *cp = GRAPHEME_INVALID_CODEPOINT;
100
101 /*
102 * count the following continuation bytes, but nothing
103 * else in case we have a "rogue" case where e.g. such a
104 * sequence starter occurs right before a NUL-byte.
105 */
106 for (i = 0; 1 + i < len; i++) {
107 if (!BETWEEN(((const unsigned char *)str)[1 + i], 0x80,
108 0xBF)) {
109 break;
110 }
111 }
112
113 /*
114 * if the continuation bytes do not continue until
115 * the end, return the incomplete sequence length.
116 * Otherwise return the number of bytes we actually
117 * expected, which is larger than n.
118 */
119 return ((1 + i) < len) ? (1 + i) : (1 + off);
120 }
121
122 /*
123 * process 'off' following bytes, each of the form 10xxxxxx
124 * (i.e. between 0x80 (10000000) and 0xBF (10111111))
125 */
126 for (i = 1; i <= off; i++) {
127 if (!BETWEEN(((const unsigned char *)str)[i], 0x80, 0xBF)) {
128 /*
129 * byte does not match format; return
130 * number of bytes processed excluding the
131 * unexpected character as recommended since
132 * Unicode 6 (chapter 3)
133 *
134 * this also includes the cases where bits
135 * higher than the 8th are set on systems
136 * with CHAR_BIT > 8
137 */
138 *cp = GRAPHEME_INVALID_CODEPOINT;
139 return 1 + (i - 1);
140 }
141 /*
142 * shift codepoint by 6 bits and add the 6 stored bits
143 * in s[i] to it using the bitmask 0x3F (00111111)
144 */
145 *cp = (*cp << 6) | (((const unsigned char *)str)[i] & 0x3F);
146 }
147
148 if (*cp < lut[off].mincp ||
149 BETWEEN(*cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
150 *cp > UINT32_C(0x10FFFF)) {
151 /*
152 * codepoint is overlong encoded in the sequence, is a
153 * high or low UTF-16 surrogate half (0xD800..0xDFFF) or
154 * not representable in UTF-16 (>0x10FFFF) (RFC-3629
155 * specifies the latter two conditions)
156 */
157 *cp = GRAPHEME_INVALID_CODEPOINT;
158 }
159
160 return 1 + off;
161 }
162
163 size_t
164 grapheme_encode_utf8(uint_least32_t cp, char *str, size_t len)
165 {
166 size_t off, i;
167
168 if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
169 cp > UINT32_C(0x10FFFF)) {
170 /*
171 * codepoint is a high or low UTF-16 surrogate half
172 * (0xD800..0xDFFF) or not representable in UTF-16
173 * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8.
174 */
175 cp = GRAPHEME_INVALID_CODEPOINT;
176 }
177
178 /* determine necessary sequence type */
179 for (off = 0; off < LEN(lut); off++) {
180 if (cp <= lut[off].maxcp) {
181 break;
182 }
183 }
184 if (1 + off > len || str == NULL || len == 0) {
185 /*
186 * specified buffer is too small to store sequence or
187 * the caller just wanted to know how many bytes the
188 * codepoint needs by passing a NULL-buffer.
189 */
190 return 1 + off;
191 }
192
193 /* build sequence by filling cp-bits into each byte */
194
195 /*
196 * lut[off].lower is the bit-format for the first byte and
197 * the bits to fill into it are determined by shifting the
198 * cp 6 times the number of following bytes, as each
199 * following byte stores 6 bits, yielding the wanted bits.
200 *
201 * We do not overwrite the mask because we guaranteed earlier
202 * that there are no bits higher than the mask allows.
203 */
204 ((unsigned char *)str)[0] =
205 lut[off].lower | (uint_least8_t)(cp >> (6 * off));
206
207 for (i = 1; i <= off; i++) {
208 /*
209 * the bit-format for following bytes is 10000000 (0x80)
210 * and it each stores 6 bits in the 6 low bits that we
211 * extract from the properly-shifted value using the
212 * mask 00111111 (0x3F)
213 */
214 ((unsigned char *)str)[i] =
215 0x80 | ((cp >> (6 * (off - i))) & 0x3F);
216 }
217
218 return 1 + off;
219 }