extractjson.c - extractjson - extract embedded JSON metadata from HTML pages
(HTM) git clone git://git.codemadness.org/extractjson
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
extractjson.c (7744B)
---
1 #include <ctype.h>
2 #include <errno.h>
3 #include <stdio.h>
4 #include <stdlib.h>
5 #include <string.h>
6 #include <strings.h>
7
/* Reads the next input character through the `getnext` function pointer, so
 * the parser can switch the reader at run time. */
#define GETNEXT getnext

/*
 * ASCII-only, locale-independent character classes.
 *
 * The argument is parenthesized before the cast so that a compound argument
 * (for example a conditional expression) is converted to unsigned as a whole.
 * The range test relies on unsigned wrap-around: values below the range
 * (including EOF) become huge and fail the `<` comparison.
 *
 * NOTE: ISSPACE() evaluates its argument more than once; do not pass an
 * expression with side effects.
 */
#define ISALPHA(c) (((((unsigned)(c)) | 32) - 'a') < 26)
#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
12
/* State of the parser: the current tag, the current attribute name and a
 * scratch buffer. */
struct xmlparser {
	char   tag[1024];    /* current tag */
	size_t taglen;       /* length of the current tag */
	int    isshorttag;   /* current tag is in shortform ? <tag /> */
	char   name[1024];   /* current attribute name */
	char   data[BUFSIZ]; /* buffer for tag data, cdata and attribute data */
};

typedef struct xmlparser XMLParser;
24
25 static XMLParser parser;
26 static int isjson;
27 static const char *ignorestate, *endtag;
28 static int (*getnext)(void) = getchar;
29
30 /* ignore parsing all HTML data inside <script> tags, because they may contain
31 characters such as '<' and '>' */
32 static int
33 getnext_json(void)
34 {
35 int c;
36
37 if ((c = getchar()) == EOF)
38 return EOF;
39
40 if (tolower(c) == tolower((unsigned char)*ignorestate)) {
41 ignorestate++;
42 if (*ignorestate == '\0') {
43 getnext = getchar; /* restore */
44 putchar('\n');
45 isjson = 0;
46 return c;
47 }
48
49 } else {
50 ignorestate = endtag;
51 if (c != '\r' && c != '\n')
52 putchar(c);
53 }
54
55 return ' ';
56 }
57
58 static void
59 xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
60 const char *v, size_t vl)
61 {
62 if (!strcasecmp(t, "script") &&
63 !strcasecmp(a, "type") &&
64 (strstr(v, "application/json") ||
65 strstr(v, "application/ld+json") ||
66 strstr(v, "text/json")))
67 isjson = 1;
68 }
69
70 static void
71 xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
72 {
73 if (!strcasecmp(t, "script") && isjson) {
74 ignorestate = endtag = "</script>";
75 getnext = getnext_json;
76 return;
77 }
78 }
79
80 static void
81 xml_parseattrs(XMLParser *x)
82 {
83 size_t namelen = 0, valuelen;
84 int c, endsep, endname = 0, valuestart = 0;
85
86 while ((c = GETNEXT()) != EOF) {
87 if (ISSPACE(c)) {
88 if (namelen)
89 endname = 1;
90 continue;
91 } else if (c == '?')
92 ; /* ignore */
93 else if (c == '=') {
94 x->name[namelen] = '\0';
95 valuestart = 1;
96 endname = 1;
97 } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
98 /* attribute without value */
99 xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
100 x->name[namelen] = '\0';
101 endname = 0;
102 x->name[0] = c;
103 namelen = 1;
104 } else if (namelen && valuestart) {
105 /* attribute with value */
106 valuelen = 0;
107 if (c == '\'' || c == '"') {
108 endsep = c;
109 } else {
110 endsep = ' '; /* ISSPACE() */
111 goto startvalue;
112 }
113
114 while ((c = GETNEXT()) != EOF) {
115 startvalue:
116 if (c == '&') { /* entities */
117 x->data[valuelen] = '\0';
118 /* call data function with data before entity if there is data */
119 if (valuelen)
120 xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
121 x->data[0] = c;
122 valuelen = 1;
123 while ((c = GETNEXT()) != EOF) {
124 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
125 break;
126 if (valuelen < sizeof(x->data) - 1)
127 x->data[valuelen++] = c;
128 else {
129 /* entity too long for buffer, handle as normal data */
130 x->data[valuelen] = '\0';
131 xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
132 x->data[0] = c;
133 valuelen = 1;
134 break;
135 }
136 if (c == ';') {
137 x->data[valuelen] = '\0';
138 valuelen = 0;
139 break;
140 }
141 }
142 } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
143 if (valuelen < sizeof(x->data) - 1) {
144 x->data[valuelen++] = c;
145 } else {
146 x->data[valuelen] = '\0';
147 xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
148 x->data[0] = c;
149 valuelen = 1;
150 }
151 }
152 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
153 x->data[valuelen] = '\0';
154 xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
155 break;
156 }
157 }
158 namelen = endname = valuestart = 0;
159 } else if (namelen < sizeof(x->name) - 1) {
160 x->name[namelen++] = c;
161 }
162 if (c == '>') {
163 break;
164 } else if (c == '/') {
165 x->isshorttag = 1;
166 x->name[0] = '\0';
167 namelen = 0;
168 }
169 }
170 }
171
172 static void
173 xml_parsecomment(XMLParser *x)
174 {
175 int c, i = 0;
176
177 while ((c = GETNEXT()) != EOF) {
178 if (c == '-') {
179 if (++i > 2)
180 i = 2;
181 continue;
182 } else if (c == '>' && i == 2) {
183 return;
184 } else if (i) {
185 i = 0;
186 }
187 }
188 }
189
190 static void
191 xml_parsecdata(XMLParser *x)
192 {
193 size_t datalen = 0, i = 0;
194 int c;
195
196 while ((c = GETNEXT()) != EOF) {
197 if (c == ']') {
198 if (++i > 2)
199 i = 2;
200 continue;
201 } else if (c == '>' && i == 2) {
202 return;
203 } else if (i) {
204 i = 0;
205 }
206
207 if (datalen < sizeof(x->data) - 1) {
208 x->data[datalen++] = c;
209 } else {
210 x->data[datalen] = '\0';
211 x->data[0] = c;
212 datalen = 1;
213 }
214 }
215 }
216
217 static void
218 xml_parse(XMLParser *x)
219 {
220 size_t datalen, tagdatalen;
221 int c, isend;
222
223 while ((c = GETNEXT()) != EOF && c != '<')
224 ; /* skip until < */
225
226 while (c != EOF) {
227 if (c == '<') { /* parse tag */
228 if ((c = GETNEXT()) == EOF)
229 return;
230
231 if (c == '!') { /* cdata and comments */
232 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
233 /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
234 if (tagdatalen <= sizeof("[CDATA[") - 1)
235 x->data[tagdatalen++] = c;
236 if (c == '>')
237 break;
238 else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
239 (x->data[0] == '-')) {
240 xml_parsecomment(x);
241 break;
242 } else if (c == '[') {
243 if (tagdatalen == sizeof("[CDATA[") - 1 &&
244 !strncmp(x->data, "[CDATA[", tagdatalen)) {
245 xml_parsecdata(x);
246 break;
247 }
248 }
249 }
250 } else {
251 /* normal tag (open, short open, close), processing instruction. */
252 x->tag[0] = c;
253 x->taglen = 1;
254 x->isshorttag = isend = 0;
255
256 /* treat processing instruction as shorttag, don't strip "?" prefix. */
257 if (c == '?') {
258 x->isshorttag = 1;
259 } else if (c == '/') {
260 if ((c = GETNEXT()) == EOF)
261 return;
262 x->tag[0] = c;
263 isend = 1;
264 }
265
266 while ((c = GETNEXT()) != EOF) {
267 if (c == '/')
268 x->isshorttag = 1; /* short tag */
269 else if (c == '>' || ISSPACE(c)) {
270 x->tag[x->taglen] = '\0';
271 if (isend) { /* end tag, starts with </ */
272 while (c != '>' && c != EOF) /* skip until > */
273 c = GETNEXT();
274 x->tag[0] = '\0';
275 x->taglen = 0;
276 } else {
277 /* start tag */
278 if (ISSPACE(c))
279 xml_parseattrs(x);
280 xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
281 }
282 /* call tagend for shortform or processing instruction */
283 if (x->isshorttag) {
284 x->tag[0] = '\0';
285 x->taglen = 0;
286 }
287 break;
288 } else if (x->taglen < sizeof(x->tag) - 1)
289 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
290 }
291 }
292 } else {
293 /* parse tag data */
294 datalen = 0;
295 while ((c = GETNEXT()) != EOF) {
296 if (c == '&') {
297 if (datalen)
298 x->data[datalen] = '\0';
299 x->data[0] = c;
300 datalen = 1;
301 while ((c = GETNEXT()) != EOF) {
302 if (c == '<')
303 break;
304 if (datalen < sizeof(x->data) - 1)
305 x->data[datalen++] = c;
306 else {
307 /* entity too long for buffer, handle as normal data */
308 x->data[datalen] = '\0';
309 x->data[0] = c;
310 datalen = 1;
311 break;
312 }
313 if (c == ';') {
314 x->data[datalen] = '\0';
315 datalen = 0;
316 break;
317 }
318 }
319 } else if (c != '<') {
320 if (datalen < sizeof(x->data) - 1) {
321 x->data[datalen++] = c;
322 } else {
323 x->data[datalen] = '\0';
324 x->data[0] = c;
325 datalen = 1;
326 }
327 }
328 if (c == '<') {
329 x->data[datalen] = '\0';
330 break;
331 }
332 }
333 }
334 }
335 }
336
337 int
338 main(void)
339 {
340 xml_parse(&parser);
341
342 return 0;
343 }