xml.c - webdump - HTML to plain-text converter for webpages
(HTM) git clone git://git.codemadness.org/webdump
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
xml.c (12246B)
---
1 #include <errno.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5
6 #include "xml.h"
7
8 /* ifdef for HTML mode. To differentiate xml.c and webdump HTML changes */
9 #define HTML_MODE
10
/*
 * Locale-independent ASCII character class tests.
 *
 * The argument is converted to unsigned so that values below the start of a
 * range wrap around to a large number; a single "<" then checks both bounds.
 * OR-ing with 32 folds ASCII upper case onto lower case.
 *
 * The argument is fully parenthesized so compound expressions are classified
 * as a whole. NOTE: ISSPACE() and ISXDIGIT() evaluate their argument twice,
 * so do not pass expressions with side effects.
 */
#define ISALPHA(c)  ((((unsigned)(c)) | 32) - 'a' < 26)
#define ISDIGIT(c)  (((unsigned)(c)) - '0' < 10)
#define ISSPACE(c)  ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
#define ISXDIGIT(c) ((((unsigned)(c)) - '0' < 10) || (((unsigned)(c)) | 32) - 'a' < 6)
15
16 static void
17 xml_parseattrs(XMLParser *x)
18 {
19 size_t namelen = 0, valuelen;
20 int c, endsep, endname = 0, valuestart = 0;
21
22 while ((c = GETNEXT()) != EOF) {
23 if (ISSPACE(c)) {
24 if (namelen)
25 endname = 1;
26 continue;
27 } else if (c == '?')
28 ; /* ignore */
29 else if (c == '=') {
30 x->name[namelen] = '\0';
31 valuestart = 1;
32 endname = 1;
33 } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
34 /* attribute without value */
35 x->name[namelen] = '\0';
36 if (x->xmlattrstart)
37 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
38 if (x->xmlattr)
39 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
40 if (x->xmlattrend)
41 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
42 endname = 0;
43 x->name[0] = c;
44 namelen = 1;
45 } else if (namelen && valuestart) {
46 /* attribute with value */
47 if (x->xmlattrstart)
48 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
49
50 valuelen = 0;
51 if (c == '\'' || c == '"') {
52 endsep = c;
53 } else {
54 endsep = ' '; /* ISSPACE() */
55 goto startvalue;
56 }
57
58 while ((c = GETNEXT()) != EOF) {
59 startvalue:
60 if (c == '&') { /* entities */
61 x->data[valuelen] = '\0';
62 /* call data function with data before entity if there is data */
63 if (valuelen && x->xmlattr)
64 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
65 x->data[0] = c;
66 valuelen = 1;
67 while ((c = GETNEXT()) != EOF) {
68 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
69 break;
70 if (valuelen < sizeof(x->data) - 1)
71 x->data[valuelen++] = c;
72 else {
73 /* entity too long for buffer, handle as normal data */
74 x->data[valuelen] = '\0';
75 if (x->xmlattr)
76 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
77 x->data[0] = c;
78 valuelen = 1;
79 break;
80 }
81 if (c == ';') {
82 x->data[valuelen] = '\0';
83 if (x->xmlattrentity)
84 x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
85 valuelen = 0;
86 break;
87 }
88 }
89 } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
90 if (valuelen < sizeof(x->data) - 1) {
91 x->data[valuelen++] = c;
92 } else {
93 x->data[valuelen] = '\0';
94 if (x->xmlattr)
95 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
96 x->data[0] = c;
97 valuelen = 1;
98 }
99 }
100 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
101 x->data[valuelen] = '\0';
102 if (x->xmlattr)
103 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
104 if (x->xmlattrend)
105 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
106 break;
107 }
108 }
109 namelen = endname = valuestart = 0;
110 } else if (namelen < sizeof(x->name) - 1) {
111 x->name[namelen++] = c;
112 }
113 if (c == '>') {
114 break;
115 } else if (c == '/') {
116 x->isshorttag = 1;
117 x->name[0] = '\0';
118 namelen = 0;
119 }
120 }
121 }
122
123 static void
124 xml_parsecomment(XMLParser *x)
125 {
126 size_t datalen = 0, i = 0;
127 int c;
128
129 if (x->xmlcommentstart)
130 x->xmlcommentstart(x);
131 while ((c = GETNEXT()) != EOF) {
132 if (c == '-' || c == '>') {
133 if (x->xmlcomment && datalen) {
134 x->data[datalen] = '\0';
135 x->xmlcomment(x, x->data, datalen);
136 datalen = 0;
137 }
138 }
139
140 if (c == '-') {
141 if (++i > 2) {
142 if (x->xmlcomment)
143 for (; i > 2; i--)
144 x->xmlcomment(x, "-", 1);
145 i = 2;
146 }
147 continue;
148 } else if (c == '>' && i == 2) {
149 if (x->xmlcommentend)
150 x->xmlcommentend(x);
151 return;
152 } else if (i) {
153 if (x->xmlcomment) {
154 for (; i > 0; i--)
155 x->xmlcomment(x, "-", 1);
156 }
157 i = 0;
158 }
159
160 if (datalen < sizeof(x->data) - 1) {
161 x->data[datalen++] = c;
162 } else {
163 x->data[datalen] = '\0';
164 if (x->xmlcomment)
165 x->xmlcomment(x, x->data, datalen);
166 x->data[0] = c;
167 datalen = 1;
168 }
169 }
170 }
171
172 static void
173 xml_parsecdata(XMLParser *x)
174 {
175 size_t datalen = 0, i = 0;
176 int c;
177
178 if (x->xmlcdatastart)
179 x->xmlcdatastart(x);
180 while ((c = GETNEXT()) != EOF) {
181 if (c == ']' || c == '>') {
182 if (x->xmlcdata && datalen) {
183 x->data[datalen] = '\0';
184 x->xmlcdata(x, x->data, datalen);
185 datalen = 0;
186 }
187 }
188
189 if (c == ']') {
190 if (++i > 2) {
191 if (x->xmlcdata)
192 for (; i > 2; i--)
193 x->xmlcdata(x, "]", 1);
194 i = 2;
195 }
196 continue;
197 } else if (c == '>' && i == 2) {
198 if (x->xmlcdataend)
199 x->xmlcdataend(x);
200 return;
201 } else if (i) {
202 if (x->xmlcdata)
203 for (; i > 0; i--)
204 x->xmlcdata(x, "]", 1);
205 i = 0;
206 }
207
208 if (datalen < sizeof(x->data) - 1) {
209 x->data[datalen++] = c;
210 } else {
211 x->data[datalen] = '\0';
212 if (x->xmlcdata)
213 x->xmlcdata(x, x->data, datalen);
214 x->data[0] = c;
215 datalen = 1;
216 }
217 }
218 }
219
/*
 * Encode code point r as UTF-8 into s.
 *
 * Returns the number of bytes written (1 to 4), or 0 for r == 0 in which case
 * nothing is written. The output is not NUL-terminated and s must have room
 * for 4 bytes. The value is not validated here; callers check the range.
 */
static int
codepointtoutf8(long r, char *s)
{
	if (r == 0)
		return 0; /* NUL byte */

	if (r <= 0x7F) {
		/* 1 byte: 0aaaaaaa */
		s[0] = r;
		return 1;
	}

	if (r <= 0x07FF) {
		/* 2 bytes: 110aaaaa 10bbbbbb */
		s[0] = 0xC0 | ((r >> 6) & 0x1F);
		s[1] = 0x80 | (r & 0x3F);
		return 2;
	}

	if (r <= 0xFFFF) {
		/* 3 bytes: 1110aaaa 10bbbbbb 10cccccc */
		s[0] = 0xE0 | ((r >> 12) & 0x0F);
		s[1] = 0x80 | ((r >> 6) & 0x3F);
		s[2] = 0x80 | (r & 0x3F);
		return 3;
	}

	/* 4 bytes: 11110aaa 10bbbbbb 10cccccc 10dddddd */
	s[0] = 0xF0 | ((r >> 18) & 0x07);
	s[1] = 0x80 | ((r >> 12) & 0x3F);
	s[2] = 0x80 | ((r >> 6) & 0x3F);
	s[3] = 0x80 | (r & 0x3F);
	return 4;
}
249
/* One row of the named entity table: a name and the code point it maps to. */
struct namedentity {
	const char *entity; /* name, compared with strcmp() */
	long cp;            /* code point, encoded with codepointtoutf8() */
};

/*
 * Comparison function for bsearch(): orders two struct namedentity objects
 * by their entity name using strcmp(). Returns <0, 0 or >0 accordingly.
 */
static int
namedentitycmp(const void *v1, const void *v2)
{
	/* keep the const qualifier: the entries are only read */
	const struct namedentity *n1 = v1;
	const struct namedentity *n2 = v2;

	return strcmp(n1->entity, n2->entity);
}
263
/*
 * Table of named entities, filled in from namedentities.h.
 * namedentitytostr() searches it with bsearch() and namedentitycmp(), so the
 * entries must be sorted by name in strcmp() order.
 */
static const struct namedentity entities[] = {
#include "namedentities.h"
};
267
268 static int
269 namedentitytostr(const char *e, char *buf, size_t bufsiz)
270 {
271 struct namedentity find, *found;
272 size_t i;
273
274 /* buffer is too small */
275 if (bufsiz < 5)
276 return -1;
277
278 find.entity = e;
279 found = bsearch(&find, entities, sizeof(entities) / sizeof(*entities),
280 sizeof(*entities), namedentitycmp);
281 if (found) {
282 i = codepointtoutf8(found->cp, buf);
283 buf[i] = '\0';
284 return i;
285 }
286 return -1;
287 }
288
/*
 * Convert the numeric entity e to UTF-8 and write it, NUL-terminated, to buf.
 *
 * e is the text following "&#": decimal digits, or 'x' followed by hex
 * digits, terminated by ';' and the end of the string. With HTML_MODE defined
 * an uppercase 'X' prefix is accepted as well, as HTML allows both.
 *
 * Returns the number of bytes written excluding the NUL (0 for code point 0),
 * or -1 when bufsiz is smaller than 5, the entity is not well-formed, or the
 * value is not a valid code point (above 0x10ffff or a surrogate).
 */
static int
numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
	long l;
	int base = 10, len;
	const char *s;
	char *end;

	/* buffer is too small: up to 4 UTF-8 bytes plus the NUL are written */
	if (bufsiz < 5)
		return -1;

	/* hex (base 16) or decimal (base 10) */
	if (*e == 'x')
		base = 16;
#ifdef HTML_MODE
	else if (*e == 'X')
		base = 16;
#endif
	if (base == 16)
		e++; /* skip the prefix */

	/*
	 * Validate every character up front: strtol() on its own would also
	 * accept leading whitespace, a sign and a "0x" prefix.
	 */
	for (s = e; *s && *s != ';'; s++) {
		if (base == 16) {
			if (!ISXDIGIT((unsigned char)*s))
				return -1; /* invalid: no hex */
		} else {
			if (!ISDIGIT((unsigned char)*s))
				return -1; /* invalid: no digits */
		}
	}
	if (*s != ';' || *(s + 1) != '\0')
		return -1; /* must end with ';' NUL */

	errno = 0;
	l = strtol(e, &end, base);

	/* invalid value or not a well-formed entity or invalid code point */
	if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff ||
	    (l >= 0xd800 && l <= 0xdfff)) /* surrogate range */
		return -1;
	len = codepointtoutf8(l, buf);
	buf[len] = '\0';

	return len;
}
332
/*
 * Convert an entity string (named or numeric) to its string form in buf.
 * The entity must start with '&'; a '#' directly after it selects the numeric
 * form, anything else is looked up as a named entity.
 * Returns the byte-length of the resulting string or -1 on failure.
 */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
	const char *body;

	if (*e != '&')
		return -1; /* not an entity */

	body = e + 1;
	if (*body == '#')
		return numericentitytostr(body + 1, buf, bufsiz);

	return namedentitytostr(body, buf, bufsiz);
}
347
/*
 * Parse the whole input, read character by character with GETNEXT(), until
 * EOF and report what is found through the callbacks set on x. Callbacks that
 * are not set are skipped.
 *
 * - "<!--" and "<![CDATA[" are handed to xml_parsecomment() and
 *   xml_parsecdata(); any other "<!...>" construct is skipped.
 * - tags: xmltagstart, the attribute callbacks (see xml_parseattrs()),
 *   xmltagstartparsed, and xmltagend for end tags, short tags and processing
 *   instructions. The tag name is kept in x->tag / x->taglen.
 * - text between tags: xmldatastart, xmldata / xmldataentity, xmldataend.
 *   Text is delivered in chunks, split at entities and when x->data is full.
 *
 * With HTML_MODE defined, text before the first tag and text that is still
 * pending at EOF is reported as well.
 */
void
xml_parse(XMLParser *x)
{
	size_t datalen, tagdatalen;
	int c, isend;

#ifdef HTML_MODE
	/*
	 * HTML: process data before the first tag as well. This jumps into
	 * the data branch of the loop below; c is assigned there before it is
	 * used.
	 */
	goto read_data;
#else
	while ((c = GETNEXT()) != EOF && c != '<')
		; /* skip until < */
#endif

	while (c != EOF) {
		if (c == '<') { /* parse tag */
			if ((c = GETNEXT()) == EOF)
				return;

			if (c == '!') { /* CDATA and comments */
				/* buffer the first characters to recognize "--" and "[CDATA[" */
				for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
					/* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
					if (tagdatalen <= sizeof("[CDATA[") - 1)
						x->data[tagdatalen++] = c;
					if (c == '>')
						break; /* some other <!...> construct: ignored */
					else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
					         (x->data[0] == '-')) {
						xml_parsecomment(x);
						break;
					} else if (c == '[') {
						if (tagdatalen == sizeof("[CDATA[") - 1 &&
						    !strncmp(x->data, "[CDATA[", tagdatalen)) {
							xml_parsecdata(x);
							break;
						}
					}
				}
			} else {
				/* normal tag (open, short open, close), processing instruction. */
				x->tag[0] = c;
				x->taglen = 1;
				x->isshorttag = isend = 0;

				/* treat processing instruction as short tag, don't strip "?" prefix. */
				if (c == '?') {
					x->isshorttag = 1;
				} else if (c == '/') {
					/* end tag: the name starts after the '/' */
					if ((c = GETNEXT()) == EOF)
						return;
					x->tag[0] = c;
					isend = 1;
				}

				/* read the rest of the tag name */
				while ((c = GETNEXT()) != EOF) {
					if (c == '/')
						x->isshorttag = 1; /* short tag */
					else if (c == '>' || ISSPACE(c)) {
						x->tag[x->taglen] = '\0';
						if (isend) { /* end tag, starts with </ */
							while (c != '>' && c != EOF) /* skip until > */
								c = GETNEXT();
							if (x->xmltagend)
								x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
							x->tag[0] = '\0';
							x->taglen = 0;
						} else {
							/* start tag */
							if (x->xmltagstart)
								x->xmltagstart(x, x->tag, x->taglen);
							/* whitespace after the name: attributes may follow */
							if (ISSPACE(c))
								xml_parseattrs(x);
							if (x->xmltagstartparsed)
								x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
						}
						/* call tagend for short tag or processing instruction */
						if (x->isshorttag) {
							if (x->xmltagend)
								x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
							x->tag[0] = '\0';
							x->taglen = 0;
						}
						break;
					} else if (x->taglen < sizeof(x->tag) - 1)
						x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
				}
			}
		} else {
#ifdef HTML_MODE
read_data:
#endif
			/* parse tag data */
			datalen = 0;
			if (x->xmldatastart)
				x->xmldatastart(x);
			while ((c = GETNEXT()) != EOF) {
				if (c == '&') { /* entities */
					/* report the data collected before the entity */
					if (datalen) {
						x->data[datalen] = '\0';
						if (x->xmldata)
							x->xmldata(x, x->data, datalen);
					}
					x->data[0] = c;
					datalen = 1;
					while ((c = GETNEXT()) != EOF) {
						if (c == '<')
							break;
						if (datalen < sizeof(x->data) - 1)
							x->data[datalen++] = c;
						else {
							/* entity too long for buffer, handle as normal data */
							x->data[datalen] = '\0';
							if (x->xmldata)
								x->xmldata(x, x->data, datalen);
							x->data[0] = c;
							datalen = 1;
							break;
						}
						if (c == ';') {
							/* complete entity, including '&' and ';' */
							x->data[datalen] = '\0';
							if (x->xmldataentity)
								x->xmldataentity(x, x->data, datalen);
							datalen = 0;
							break;
						}
					}
				} else if (c != '<') {
					if (datalen < sizeof(x->data) - 1) {
						x->data[datalen++] = c;
					} else {
						/* buffer full: report the chunk and start a new one */
						x->data[datalen] = '\0';
						if (x->xmldata)
							x->xmldata(x, x->data, datalen);
						x->data[0] = c;
						datalen = 1;
					}
				}
				/* start of a tag ends the data: report what is left */
				if (c == '<') {
					x->data[datalen] = '\0';
					if (x->xmldata && datalen)
						x->xmldata(x, x->data, datalen);
					if (x->xmldataend)
						x->xmldataend(x);
#ifdef HTML_MODE
					/* nothing is pending anymore for the check below */
					datalen = 0;
#endif
					break;
				}
			}

#ifdef HTML_MODE
			/* pending data, even if a tag didn't close (EOF, etc). */
			if (datalen) {
				x->data[datalen] = '\0';
				if (x->xmldata && datalen)
					x->xmldata(x, x->data, datalen);
				if (x->xmldataend)
					x->xmldataend(x);
				datalen = 0;
			}
#endif
		}
	}
}