xml.c - frontends - front-ends for some sites (experiment)
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
xml.c (11981B)
---
1 #include <errno.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5
6 #include "xml.h"
7
/*
 * ASCII-only character class tests that do not depend on the locale.
 * The argument is converted to unsigned so that a single comparison checks
 * both ends of a range (values below the range wrap around to large numbers).
 * The parameter is fully parenthesized so the cast applies to the whole
 * argument, also when it is an expression.
 * NOTE: ISSPACE() and ISXDIGIT() evaluate their argument more than once,
 * do not pass expressions with side effects.
 */
#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
#define ISDIGIT(c) (((unsigned)(c)) - '0' < 10)
#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
#define ISXDIGIT(c) ((((unsigned)(c)) - '0' < 10) || (((unsigned)(c)) | 32) - 'a' < 6)
12
/* Input state used for parsing XML: set by setxmldata(), read by getnext(). */
/* start of the data to parse; the buffer itself is not copied */
static const unsigned char *xml_data_buf;
/* total number of bytes available in xml_data_buf */
static size_t xml_data_size;
/* index of the next byte that getnext() returns */
static size_t xml_data_off;
17
18 void
19 setxmldata(const char *s, size_t len)
20 {
21 xml_data_off = 0;
22 xml_data_size = len;
23 xml_data_buf = (unsigned char *)s;
24 }
25
26 static int
27 getnext(void)
28 {
29 if (xml_data_off >= xml_data_size)
30 return EOF;
31 return xml_data_buf[xml_data_off++];
32 }
33
/*
 * Parse the attributes of the tag currently stored in x->tag.
 * Characters are read with GETNEXT() until the closing '>' of the tag or
 * EOF. For each attribute the callbacks xmlattrstart, xmlattr, xmlattrentity
 * and xmlattrend are called when they are set.
 *
 * - The attribute name is collected in x->name, characters that do not fit
 *   are dropped (the name is truncated).
 * - The attribute value is collected in x->data. When the buffer is full the
 *   data so far is passed to xmlattr and the buffer is reused, so xmlattr can
 *   be called multiple times for one attribute.
 * - A '/' marks the tag as a short tag (x->isshorttag).
 */
static void
xml_parseattrs(XMLParser *x)
{
	size_t namelen = 0, valuelen;
	/* endname: whitespace or '=' was seen after the name.
	 * valuestart: '=' was seen, the next non-space character starts the value.
	 * endsep: character that ends the value, ' ' means unquoted. */
	int c, endsep, endname = 0, valuestart = 0;

	while ((c = GETNEXT()) != EOF) {
		if (ISSPACE(c)) {
			/* whitespace after a name ends the name, otherwise skip it */
			if (namelen)
				endname = 1;
			continue;
		} else if (c == '?')
			; /* ignore */
		else if (c == '=') {
			x->name[namelen] = '\0';
			valuestart = 1;
			endname = 1;
		} else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
			/* attribute without value: the name ended and either a new
			 * name starts or the tag ends, report it with an empty value */
			x->name[namelen] = '\0';
			if (x->xmlattrstart)
				x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
			if (x->xmlattr)
				x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
			if (x->xmlattrend)
				x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
			endname = 0;
			/* the current character is the first of the next name,
			 * for '/' the name is reset again at the end of the loop */
			x->name[0] = c;
			namelen = 1;
		} else if (namelen && valuestart) {
			/* attribute with value */
			if (x->xmlattrstart)
				x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);

			valuelen = 0;
			if (c == '\'' || c == '"') {
				/* quoted value: ends at the same quote character */
				endsep = c;
			} else {
				/* unquoted value: ends at whitespace or '>'. The current
				 * character is already part of the value, so jump into
				 * the loop body without reading a new character. */
				endsep = ' '; /* ISSPACE() */
				goto startvalue;
			}

			while ((c = GETNEXT()) != EOF) {
startvalue:
				if (c == '&') { /* entities */
					x->data[valuelen] = '\0';
					/* call data function with data before entity if there is data */
					if (valuelen && x->xmlattr)
						x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
					/* collect the entity including the leading '&' */
					x->data[0] = c;
					valuelen = 1;
					while ((c = GETNEXT()) != EOF) {
						/* end of value before ';': the collected text is
						 * reported as normal data by the check below */
						if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
							break;
						if (valuelen < sizeof(x->data) - 1)
							x->data[valuelen++] = c;
						else {
							/* entity too long for buffer, handle as normal data */
							x->data[valuelen] = '\0';
							if (x->xmlattr)
								x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
							x->data[0] = c;
							valuelen = 1;
							break;
						}
						if (c == ';') {
							/* complete entity, passed including '&' and ';' */
							x->data[valuelen] = '\0';
							if (x->xmlattrentity)
								x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
							valuelen = 0;
							break;
						}
					}
				} else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
					/* normal value character */
					if (valuelen < sizeof(x->data) - 1) {
						x->data[valuelen++] = c;
					} else {
						/* buffer full: pass the data so far and start over */
						x->data[valuelen] = '\0';
						if (x->xmlattr)
							x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
						x->data[0] = c;
						valuelen = 1;
					}
				}
				/* end of value: pass the remaining data (can be empty) */
				if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
					x->data[valuelen] = '\0';
					if (x->xmlattr)
						x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
					if (x->xmlattrend)
						x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
					break;
				}
			}
			/* reset the state for the next attribute */
			namelen = endname = valuestart = 0;
		} else if (namelen < sizeof(x->name) - 1) {
			/* NOTE: attribute name truncation */
			x->name[namelen++] = c;
		}
		if (c == '>') {
			/* end of the tag */
			break;
		} else if (c == '/') {
			x->isshorttag = 1;
			x->name[0] = '\0';
			namelen = 0;
		}
	}
}
140
141 static void
142 xml_parsecomment(XMLParser *x)
143 {
144 size_t datalen = 0, i = 0;
145 int c;
146
147 if (x->xmlcommentstart)
148 x->xmlcommentstart(x);
149 while ((c = GETNEXT()) != EOF) {
150 if (c == '-' || c == '>') {
151 if (x->xmlcomment && datalen) {
152 x->data[datalen] = '\0';
153 x->xmlcomment(x, x->data, datalen);
154 datalen = 0;
155 }
156 }
157
158 if (c == '-') {
159 if (++i > 2) {
160 if (x->xmlcomment)
161 for (; i > 2; i--)
162 x->xmlcomment(x, "-", 1);
163 i = 2;
164 }
165 continue;
166 } else if (c == '>' && i == 2) {
167 if (x->xmlcommentend)
168 x->xmlcommentend(x);
169 return;
170 } else if (i) {
171 if (x->xmlcomment) {
172 for (; i > 0; i--)
173 x->xmlcomment(x, "-", 1);
174 }
175 i = 0;
176 }
177
178 if (datalen < sizeof(x->data) - 1) {
179 x->data[datalen++] = c;
180 } else {
181 x->data[datalen] = '\0';
182 if (x->xmlcomment)
183 x->xmlcomment(x, x->data, datalen);
184 x->data[0] = c;
185 datalen = 1;
186 }
187 }
188 }
189
190 static void
191 xml_parsecdata(XMLParser *x)
192 {
193 size_t datalen = 0, i = 0;
194 int c;
195
196 if (x->xmlcdatastart)
197 x->xmlcdatastart(x);
198 while ((c = GETNEXT()) != EOF) {
199 if (c == ']' || c == '>') {
200 if (x->xmlcdata && datalen) {
201 x->data[datalen] = '\0';
202 x->xmlcdata(x, x->data, datalen);
203 datalen = 0;
204 }
205 }
206
207 if (c == ']') {
208 if (++i > 2) {
209 if (x->xmlcdata)
210 for (; i > 2; i--)
211 x->xmlcdata(x, "]", 1);
212 i = 2;
213 }
214 continue;
215 } else if (c == '>' && i == 2) {
216 if (x->xmlcdataend)
217 x->xmlcdataend(x);
218 return;
219 } else if (i) {
220 if (x->xmlcdata)
221 for (; i > 0; i--)
222 x->xmlcdata(x, "]", 1);
223 i = 0;
224 }
225
226 if (datalen < sizeof(x->data) - 1) {
227 x->data[datalen++] = c;
228 } else {
229 x->data[datalen] = '\0';
230 if (x->xmlcdata)
231 x->xmlcdata(x, x->data, datalen);
232 x->data[0] = c;
233 datalen = 1;
234 }
235 }
236 }
237
/*
 * Encode the code point `r` as UTF-8 into `s`.
 * Returns the number of bytes written (1 to 4), or 0 for the code point 0
 * (nothing is written). No NUL terminator is added and the range of `r` is
 * not validated, `s` must have room for 4 bytes.
 */
static int
codepointtoutf8(long r, char *s)
{
	/* NUL byte: nothing to encode */
	if (r == 0)
		return 0;

	/* 1 byte: 0aaaaaaa */
	if (r <= 0x7F) {
		s[0] = r;
		return 1;
	}

	/* 2 bytes: 110aaaaa 10bbbbbb */
	if (r <= 0x07FF) {
		s[0] = 0xC0 | ((r >> 6) & 0x1F);
		s[1] = 0x80 | (r & 0x3F);
		return 2;
	}

	/* 3 bytes: 1110aaaa 10bbbbbb 10cccccc */
	if (r <= 0xFFFF) {
		s[0] = 0xE0 | ((r >> 12) & 0x0F);
		s[1] = 0x80 | ((r >> 6) & 0x3F);
		s[2] = 0x80 | (r & 0x3F);
		return 3;
	}

	/* 4 bytes: 11110aaa 10bbbbbb 10cccccc 10dddddd */
	s[0] = 0xF0 | ((r >> 18) & 0x07);
	s[1] = 0x80 | ((r >> 12) & 0x3F);
	s[2] = 0x80 | ((r >> 6) & 0x3F);
	s[3] = 0x80 | (r & 0x3F);
	return 4;
}
267
/*
 * Convert a named entity to a string.
 * `e` is the entity name without the leading '&' but including the
 * trailing ';', for example "amp;". Only the names in the table below are
 * known, in all-lowercase or all-uppercase form.
 * On a match the character and a NUL terminator are written to `buf` and 1
 * is returned. Returns -1 when the name is unknown or when `bufsiz` is
 * smaller than 2.
 */
static int
namedentitytostr(const char *e, char *buf, size_t bufsiz)
{
	static const struct {
		const char *name;
		char ch;
	} table[] = {
		{ "amp;",  '&'  },
		{ "lt;",   '<'  },
		{ "gt;",   '>'  },
		{ "apos;", '\'' },
		{ "quot;", '"'  },
		{ "AMP;",  '&'  },
		{ "LT;",   '<'  },
		{ "GT;",   '>'  },
		{ "APOS;", '\'' },
		{ "QUOT;", '"'  }
	};
	const size_t count = sizeof(table) / sizeof(table[0]);
	size_t n;

	/* need room for the character and the NUL terminator */
	if (bufsiz < 2)
		return -1;

	for (n = 0; n < count; n++) {
		if (strcmp(e, table[n].name) != 0)
			continue;
		buf[0] = table[n].ch;
		buf[1] = '\0';
		return 1;
	}

	return -1;
}
301
/*
 * Convert a numeric entity to a UTF-8 string.
 * `e` is the entity without the leading "&#", for example "39;" (decimal)
 * or "x27;" (hexadecimal), and must end with ';' directly followed by the
 * NUL terminator.
 * The UTF-8 bytes and a NUL terminator are written to `buf`. Returns the
 * number of bytes written (without the terminator), or -1 when `bufsiz` is
 * smaller than 5, the entity is malformed or the value is not a valid code
 * point (above 0x10ffff or in the surrogate range).
 */
static int
numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
	const char *p;
	char *end;
	long cp;
	int base = 10, n;

	/* need room for 4 UTF-8 bytes and the NUL terminator */
	if (bufsiz < 5)
		return -1;

	/* a leading 'x' selects hex (base 16), otherwise decimal (base 10) */
	if (*e == 'x') {
		base = 16;
		e++;
	}

	/* only digits of the selected base are allowed up to the ';' */
	for (p = e; *p && *p != ';'; p++) {
		if (base == 16) {
			if (!ISXDIGIT((unsigned char)*p))
				return -1; /* invalid: no hex */
		} else {
			if (!ISDIGIT((unsigned char)*p))
				return -1; /* invalid: no digits */
		}
	}

	/* must end with ';' NUL */
	if (p[0] != ';' || p[1] != '\0')
		return -1;

	errno = 0;
	cp = strtol(e, &end, base);

	/* invalid value or not a well-formed entity */
	if (errno || end == e || *end != ';')
		return -1;
	/* invalid code point */
	if (cp < 0 || cp > 0x10ffff)
		return -1;
	/* surrogate range */
	if (cp >= 0xd800 && cp <= 0xdfff)
		return -1;

	n = codepointtoutf8(cp, buf);
	buf[n] = '\0';

	return n;
}
345
/*
 * Convert an entity string to its text and store it in `buf`.
 * `e` is the complete entity including the leading '&'. An entity starting
 * with "&#" is handled as a numeric entity, anything else as a named entity.
 * Returns the byte-length of the string written to `buf` or -1 on failure.
 */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
	/* not an entity: doesn't start with & */
	if (e[0] != '&')
		return -1;

	return (e[1] == '#')
	    ? numericentitytostr(e + 2, buf, bufsiz) /* skip "&#" */
	    : namedentitytostr(e + 1, buf, bufsiz);  /* skip "&" */
}
360
/*
 * Parse XML read with GETNEXT() until EOF and call the callbacks that are
 * set in `x`.
 *
 * Everything before the first '<' is skipped. After that the input is
 * handled as either a tag (starting with '<') or data between tags:
 * - "<!--" starts a comment, see xml_parsecomment().
 * - "<![CDATA[" starts a CDATA section, see xml_parsecdata().
 * - other "<!" constructs are skipped up to the next '>'.
 * - for other tags xmltagstart, xmltagstartparsed and xmltagend are called,
 *   attributes are parsed by xml_parseattrs().
 * - for data xmldatastart, xmldata, xmldataentity and xmldataend are called.
 *
 * The tag name is stored in x->tag and is truncated when it does not fit.
 * Data is buffered in x->data, when the buffer is full the data so far is
 * passed to xmldata and the buffer is reused.
 */
void
xml_parse(XMLParser *x)
{
	size_t datalen, tagdatalen;
	int c, isend;

	while ((c = GETNEXT()) != EOF && c != '<')
		; /* skip until < */

	while (c != EOF) {
		if (c == '<') { /* parse tag */
			if ((c = GETNEXT()) == EOF)
				return;

			if (c == '!') { /* CDATA and comments */
				/* collect the first characters after "<!" in x->data to
				 * detect "--" or "[CDATA[" */
				for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
					/* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
					if (tagdatalen <= sizeof("[CDATA[") - 1)
						x->data[tagdatalen++] = c;
					if (c == '>')
						break; /* end of a "<!" construct that is not handled */
					else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
					         (x->data[0] == '-')) {
						xml_parsecomment(x);
						break;
					} else if (c == '[') {
						if (tagdatalen == sizeof("[CDATA[") - 1 &&
						    !strncmp(x->data, "[CDATA[", tagdatalen)) {
							xml_parsecdata(x);
							break;
						}
					}
				}
			} else {
				/* normal tag (open, short open, close), processing instruction. */
				x->tag[0] = c;
				x->taglen = 1;
				x->isshorttag = isend = 0;

				/* treat processing instruction as short tag, don't strip "?" prefix. */
				if (c == '?') {
					x->isshorttag = 1;
				} else if (c == '/') {
					/* end tag: the name starts after the '/' */
					if ((c = GETNEXT()) == EOF)
						return;
					x->tag[0] = c;
					isend = 1;
				}

				/* read the rest of the tag name */
				while ((c = GETNEXT()) != EOF) {
					if (c == '/')
						x->isshorttag = 1; /* short tag */
					else if (c == '>' || ISSPACE(c)) {
						/* end of the tag name */
						x->tag[x->taglen] = '\0';
						if (isend) { /* end tag, starts with </ */
							while (c != '>' && c != EOF) /* skip until > */
								c = GETNEXT();
							if (x->xmltagend)
								x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
							x->tag[0] = '\0';
							x->taglen = 0;
						} else {
							/* start tag */
							if (x->xmltagstart)
								x->xmltagstart(x, x->tag, x->taglen);
							/* whitespace after the name: attributes can follow */
							if (ISSPACE(c))
								xml_parseattrs(x);
							if (x->xmltagstartparsed)
								x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
						}
						/* call tagend for short tag or processing instruction */
						if (x->isshorttag) {
							if (x->xmltagend)
								x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
							x->tag[0] = '\0';
							x->taglen = 0;
						}
						break;
					} else if (x->taglen < sizeof(x->tag) - 1)
						x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
				}
			}
		} else {
			/* parse tag data */
			datalen = 0;
			if (x->xmldatastart)
				x->xmldatastart(x);
			while ((c = GETNEXT()) != EOF) {
				if (c == '&') { /* entities */
					/* pass the data before the entity if there is data */
					if (datalen) {
						x->data[datalen] = '\0';
						if (x->xmldata)
							x->xmldata(x, x->data, datalen);
					}
					/* collect the entity including the leading '&' */
					x->data[0] = c;
					datalen = 1;
					while ((c = GETNEXT()) != EOF) {
						/* tag starts before ';': the collected text is
						 * passed as normal data by the check below */
						if (c == '<')
							break;
						if (datalen < sizeof(x->data) - 1)
							x->data[datalen++] = c;
						else {
							/* entity too long for buffer, handle as normal data */
							x->data[datalen] = '\0';
							if (x->xmldata)
								x->xmldata(x, x->data, datalen);
							x->data[0] = c;
							datalen = 1;
							break;
						}
						if (c == ';') {
							/* complete entity, passed including '&' and ';' */
							x->data[datalen] = '\0';
							if (x->xmldataentity)
								x->xmldataentity(x, x->data, datalen);
							datalen = 0;
							break;
						}
					}
				} else if (c != '<') {
					/* normal data character */
					if (datalen < sizeof(x->data) - 1) {
						x->data[datalen++] = c;
					} else {
						/* buffer full: pass the data so far and start over */
						x->data[datalen] = '\0';
						if (x->xmldata)
							x->xmldata(x, x->data, datalen);
						x->data[0] = c;
						datalen = 1;
					}
				}
				/* start of the next tag: pass the remaining data, the
				 * outer loop continues with parsing the tag */
				if (c == '<') {
					x->data[datalen] = '\0';
					if (x->xmldata && datalen)
						x->xmldata(x, x->data, datalen);
					if (x->xmldataend)
						x->xmldataend(x);
					break;
				}
			}
		}
	}
}
501 }