json.c - tscrape - twitter scraper (not working anymore)
(HTM) git clone git://git.codemadness.org/tscrape
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
json.c (7806B)
---
1 #include <ctype.h>
2 #include <errno.h>
3 #include <stdint.h>
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <string.h>
7
8 #define GETNEXT getchar
9
10 #include "json.h"
11
/*
 * Encode code point r as UTF-8 into s and return the number of bytes written
 * (0 to 4). Code point 0 writes nothing and returns 0. The caller must make
 * sure s has room for up to 4 bytes. No NUL terminator is written.
 */
static int
codepointtoutf8(long r, char *s)
{
	if (r == 0)
		return 0; /* NUL byte: nothing is emitted */

	if (r <= 0x7F) {
		/* single byte: 0aaaaaaa */
		s[0] = r;
		return 1;
	}

	if (r <= 0x07FF) {
		/* two bytes: 110aaaaa 10bbbbbb */
		s[0] = 0xC0 | ((r >> 6) & 0x1F);
		s[1] = 0x80 | (r & 0x3F);
		return 2;
	}

	if (r <= 0xFFFF) {
		/* three bytes: 1110aaaa 10bbbbbb 10cccccc */
		s[0] = 0xE0 | ((r >> 12) & 0x0F);
		s[1] = 0x80 | ((r >> 6) & 0x3F);
		s[2] = 0x80 | (r & 0x3F);
		return 3;
	}

	/* four bytes: 11110aaa 10bbbbbb 10cccccc 10dddddd */
	s[0] = 0xF0 | ((r >> 18) & 0x07);
	s[1] = 0x80 | ((r >> 12) & 0x3F);
	s[2] = 0x80 | ((r >> 6) & 0x3F);
	s[3] = 0x80 | (r & 0x3F);
	return 4;
}
41
/*
 * Return the numeric value (0-15) of the hexadecimal digit character c.
 * Any character that is not a hex digit yields 0.
 */
static int
hexdigit(int c)
{
	int value = 0;

	if ('0' <= c && c <= '9')
		value = c - '0';
	else if ('a' <= c && c <= 'f')
		value = (c - 'a') + 10;
	else if ('A' <= c && c <= 'F')
		value = (c - 'A') + 10;

	return value;
}
53
/*
 * Make sure the buffer *value (currently *sz bytes) can hold cur + inc bytes.
 *
 * When it is too small the buffer is reallocated to a size strictly greater
 * than cur + inc, found by doubling from max(*sz, 64); requests above
 * SIZE_MAX / 2 ask for SIZE_MAX bytes. On success *value and *sz are updated.
 *
 * Returns 0 on success. Returns -1 when cur + inc overflows (errno is set to
 * EOVERFLOW) or when realloc() fails; in both cases *value and *sz are left
 * untouched and it is up to the caller to free *value.
 */
static int
capacity(char **value, size_t *sz, size_t cur, size_t inc)
{
	size_t wanted, grown;
	char *tmp;

	/* cur + inc must be representable as a size_t */
	if (inc > SIZE_MAX - cur) {
		errno = EOVERFLOW;
		return -1;
	}
	wanted = cur + inc;

	/* already large enough: nothing to do */
	if (wanted <= *sz)
		return 0;

	if (wanted > SIZE_MAX / 2) {
		/* doubling could wrap around: request the maximum instead */
		grown = SIZE_MAX;
	} else {
		grown = *sz;
		if (grown < 64)
			grown = 64;
		while (grown <= wanted)
			grown *= 2;
	}

	tmp = realloc(*value, grown);
	if (tmp == NULL)
		return -1; /* old buffer is still valid and owned by the caller */

	*value = tmp;
	*sz = grown;
	return 0;
}
81
/*
 * Sets of characters that are acceptable as the next significant
 * (non-whitespace) input character. parsejson() points `expect` at one of
 * these and rejects any character that is not in the current set.
 */
#define EXPECT_VALUE         "{[\"-0123456789tfn"
#define EXPECT_STRING        "\""
#define EXPECT_END           "}],"
#define EXPECT_OBJECT_STRING EXPECT_STRING "}"
#define EXPECT_OBJECT_KEY    ":"
#define EXPECT_ARRAY_VALUE   EXPECT_VALUE "]"

/*
 * Abort parsing with JSON_ERROR_INVALID. Requires a local variable `ret` and
 * a label `end` in the calling function.
 * Deliberately has no trailing semicolon: the caller's own `;` completes the
 * statement, so the macro also works as the body of an if that has an else.
 */
#define JSON_INVALID()       do { ret = JSON_ERROR_INVALID; goto end; } while (0)
90
91 int
92 parsejson(void (*cb)(struct json_node *, size_t, const char *))
93 {
94 struct json_node nodes[JSON_MAX_NODE_DEPTH] = { 0 };
95 size_t depth = 0, p = 0, len, sz = 0;
96 long cp, hi, lo;
97 char pri[128], *str = NULL;
98 int c, i, escape, iskey = 0, ret = JSON_ERROR_MEM;
99 const char *expect = EXPECT_VALUE;
100
101 if (capacity(&(nodes[0].name), &(nodes[0].namesiz), 0, 1) == -1)
102 goto end;
103 nodes[0].name[0] = '\0';
104
105 while (1) {
106 c = GETNEXT();
107 handlechr:
108 if (c == EOF)
109 break;
110
111 /* skip JSON white-space, (NOTE: no \v, \f, \b etc) */
112 if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
113 continue;
114
115 if (!c || !strchr(expect, c))
116 JSON_INVALID();
117
118 switch (c) {
119 case ':':
120 iskey = 0;
121 expect = EXPECT_VALUE;
122 break;
123 case '"':
124 nodes[depth].type = JSON_TYPE_STRING;
125 escape = 0;
126 len = 0;
127 while (1) {
128 c = GETNEXT();
129 chr:
130 /* EOF or control char: 0x7f is not defined as a control char in RFC8259 */
131 if (c < 0x20)
132 JSON_INVALID();
133
134 if (escape) {
135 escchr:
136 escape = 0;
137 switch (c) {
138 case '"': /* FALLTHROUGH */
139 case '\\':
140 case '/': break;
141 case 'b': c = '\b'; break;
142 case 'f': c = '\f'; break;
143 case 'n': c = '\n'; break;
144 case 'r': c = '\r'; break;
145 case 't': c = '\t'; break;
146 case 'u': /* hex hex hex hex */
147 if (capacity(&str, &sz, len, 4) == -1)
148 goto end;
149 for (i = 12, cp = 0; i >= 0; i -= 4) {
150 if ((c = GETNEXT()) == EOF || !isxdigit(c))
151 JSON_INVALID(); /* invalid code point */
152 cp |= (hexdigit(c) << i);
153 }
154 /* RFC8259 - 7. Strings - surrogates.
155 * 0xd800 - 0xdb7f - high surrogates */
156 if (cp >= 0xd800 && cp <= 0xdb7f) {
157 if ((c = GETNEXT()) != '\\') {
158 len += codepointtoutf8(cp, &str[len]);
159 goto chr;
160 }
161 if ((c = GETNEXT()) != 'u') {
162 len += codepointtoutf8(cp, &str[len]);
163 goto escchr;
164 }
165 for (hi = cp, i = 12, lo = 0; i >= 0; i -= 4) {
166 if ((c = GETNEXT()) == EOF || !isxdigit(c))
167 JSON_INVALID(); /* invalid code point */
168 lo |= (hexdigit(c) << i);
169 }
170 /* 0xdc00 - 0xdfff - low surrogates */
171 if (lo >= 0xdc00 && lo <= 0xdfff) {
172 cp = (hi << 10) + lo - 56613888; /* - offset */
173 } else {
174 /* handle graceful: raw invalid output bytes */
175 len += codepointtoutf8(hi, &str[len]);
176 if (capacity(&str, &sz, len, 4) == -1)
177 goto end;
178 len += codepointtoutf8(lo, &str[len]);
179 continue;
180 }
181 }
182 len += codepointtoutf8(cp, &str[len]);
183 continue;
184 default:
185 JSON_INVALID(); /* invalid escape char */
186 }
187 if (capacity(&str, &sz, len, 1) == -1)
188 goto end;
189 str[len++] = c;
190 } else if (c == '\\') {
191 escape = 1;
192 } else if (c == '"') {
193 if (capacity(&str, &sz, len, 1) == -1)
194 goto end;
195 str[len++] = '\0';
196
197 if (iskey) {
198 /* copy string as key, including NUL byte */
199 if (capacity(&(nodes[depth].name), &(nodes[depth].namesiz), len, 1) == -1)
200 goto end;
201 memcpy(nodes[depth].name, str, len);
202 } else {
203 cb(nodes, depth + 1, str);
204 }
205 break;
206 } else {
207 if (capacity(&str, &sz, len, 1) == -1)
208 goto end;
209 str[len++] = c;
210 }
211 }
212 if (iskey)
213 expect = EXPECT_OBJECT_KEY;
214 else
215 expect = EXPECT_END;
216 break;
217 case '[':
218 case '{':
219 if (depth + 1 >= JSON_MAX_NODE_DEPTH)
220 JSON_INVALID(); /* too deep */
221
222 nodes[depth].index = 0;
223 if (c == '[') {
224 nodes[depth].type = JSON_TYPE_ARRAY;
225 expect = EXPECT_ARRAY_VALUE;
226 } else if (c == '{') {
227 iskey = 1;
228 nodes[depth].type = JSON_TYPE_OBJECT;
229 expect = EXPECT_OBJECT_STRING;
230 }
231
232 cb(nodes, depth + 1, "");
233
234 depth++;
235 nodes[depth].index = 0;
236 if (capacity(&(nodes[depth].name), &(nodes[depth].namesiz), 0, 1) == -1)
237 goto end;
238 nodes[depth].name[0] = '\0';
239 break;
240 case ']':
241 case '}':
242 if (!depth ||
243 (c == ']' && nodes[depth - 1].type != JSON_TYPE_ARRAY) ||
244 (c == '}' && nodes[depth - 1].type != JSON_TYPE_OBJECT))
245 JSON_INVALID(); /* unbalanced nodes */
246
247 nodes[--depth].index++;
248 expect = EXPECT_END;
249 break;
250 case ',':
251 if (!depth)
252 JSON_INVALID(); /* unbalanced nodes */
253
254 nodes[depth - 1].index++;
255 if (nodes[depth - 1].type == JSON_TYPE_OBJECT) {
256 iskey = 1;
257 expect = EXPECT_STRING;
258 } else {
259 expect = EXPECT_VALUE;
260 }
261 break;
262 case 't': /* true */
263 if (GETNEXT() != 'r' || GETNEXT() != 'u' || GETNEXT() != 'e')
264 JSON_INVALID();
265 nodes[depth].type = JSON_TYPE_BOOL;
266 cb(nodes, depth + 1, "true");
267 expect = EXPECT_END;
268 break;
269 case 'f': /* false */
270 if (GETNEXT() != 'a' || GETNEXT() != 'l' || GETNEXT() != 's' ||
271 GETNEXT() != 'e')
272 JSON_INVALID();
273 nodes[depth].type = JSON_TYPE_BOOL;
274 cb(nodes, depth + 1, "false");
275 expect = EXPECT_END;
276 break;
277 case 'n': /* null */
278 if (GETNEXT() != 'u' || GETNEXT() != 'l' || GETNEXT() != 'l')
279 JSON_INVALID();
280 nodes[depth].type = JSON_TYPE_NULL;
281 cb(nodes, depth + 1, "null");
282 expect = EXPECT_END;
283 break;
284 default: /* number */
285 nodes[depth].type = JSON_TYPE_NUMBER;
286 p = 0;
287 pri[p++] = c;
288 expect = EXPECT_END;
289 while (1) {
290 c = GETNEXT();
291 if (c == EOF ||
292 !c || !strchr("0123456789eE+-.", c) ||
293 p + 1 >= sizeof(pri)) {
294 pri[p] = '\0';
295 cb(nodes, depth + 1, pri);
296 goto handlechr; /* do not read next char, handle this */
297 } else {
298 pri[p++] = c;
299 }
300 }
301 }
302 }
303 if (depth)
304 JSON_INVALID(); /* unbalanced nodes */
305
306 ret = 0; /* success */
307 end:
308 for (depth = 0; depth < sizeof(nodes) / sizeof(nodes[0]); depth++)
309 free(nodes[depth].name);
310 free(str);
311
312 return ret;
313 }