xml.c - randomcrap - random crap programs of varying quality
(HTM) git clone git://git.codemadness.org/randomcrap
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
xml.c (10936B)
---
1 #include <errno.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5
6 #include "xml.h"
7
/*
 * Locale-independent character classification for ASCII input.
 *
 * ISALPHA(c): non-zero for 'A'-'Z' and 'a'-'z'. OR-ing with 32 folds upper
 * case onto lower case; the subtraction is done in unsigned arithmetic so
 * that anything below 'a' wraps around to a large value and one comparison
 * checks both bounds.
 *
 * ISSPACE(c): non-zero for ' ' and for the five consecutive characters
 * starting at '\t' ('\t', '\n', '\v', '\f', '\r' in ASCII).
 *
 * The argument is fully parenthesized so that compound expressions (for
 * example a conditional expression) are cast as a whole. Without this the
 * cast would only apply to the first operand and the range check would be
 * evaluated in signed arithmetic, giving false positives.
 * NOTE: ISSPACE() evaluates its argument twice: do not pass expressions
 * with side effects.
 */
#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
10
11 static void
12 xml_parseattrs(XMLParser *x)
13 {
14 size_t namelen = 0, valuelen;
15 int c, endsep, endname = 0, valuestart = 0;
16
17 while ((c = GETNEXT()) != EOF) {
18 if (ISSPACE(c)) {
19 if (namelen)
20 endname = 1;
21 continue;
22 } else if (c == '?')
23 ; /* ignore */
24 else if (c == '=') {
25 x->name[namelen] = '\0';
26 valuestart = 1;
27 endname = 1;
28 } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
29 /* attribute without value */
30 x->name[namelen] = '\0';
31 if (x->xmlattrstart)
32 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
33 if (x->xmlattr)
34 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
35 if (x->xmlattrend)
36 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
37 endname = 0;
38 x->name[0] = c;
39 namelen = 1;
40 } else if (namelen && valuestart) {
41 /* attribute with value */
42 if (x->xmlattrstart)
43 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
44
45 valuelen = 0;
46 if (c == '\'' || c == '"') {
47 endsep = c;
48 } else {
49 endsep = ' '; /* ISSPACE() */
50 goto startvalue;
51 }
52
53 while ((c = GETNEXT()) != EOF) {
54 startvalue:
55 if (c == '&') { /* entities */
56 x->data[valuelen] = '\0';
57 /* call data function with data before entity if there is data */
58 if (valuelen && x->xmlattr)
59 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
60 x->data[0] = c;
61 valuelen = 1;
62 while ((c = GETNEXT()) != EOF) {
63 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
64 break;
65 if (valuelen < sizeof(x->data) - 1)
66 x->data[valuelen++] = c;
67 else {
68 /* entity too long for buffer, handle as normal data */
69 x->data[valuelen] = '\0';
70 if (x->xmlattr)
71 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
72 x->data[0] = c;
73 valuelen = 1;
74 break;
75 }
76 if (c == ';') {
77 x->data[valuelen] = '\0';
78 if (x->xmlattrentity)
79 x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
80 valuelen = 0;
81 break;
82 }
83 }
84 } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
85 if (valuelen < sizeof(x->data) - 1) {
86 x->data[valuelen++] = c;
87 } else {
88 x->data[valuelen] = '\0';
89 if (x->xmlattr)
90 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
91 x->data[0] = c;
92 valuelen = 1;
93 }
94 }
95 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
96 x->data[valuelen] = '\0';
97 if (x->xmlattr)
98 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
99 if (x->xmlattrend)
100 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
101 break;
102 }
103 }
104 namelen = endname = valuestart = 0;
105 } else if (namelen < sizeof(x->name) - 1) {
106 x->name[namelen++] = c;
107 }
108 if (c == '>') {
109 break;
110 } else if (c == '/') {
111 x->isshorttag = 1;
112 x->name[0] = '\0';
113 namelen = 0;
114 }
115 }
116 }
117
118 static void
119 xml_parsecomment(XMLParser *x)
120 {
121 size_t datalen = 0, i = 0;
122 int c;
123
124 if (x->xmlcommentstart)
125 x->xmlcommentstart(x);
126 while ((c = GETNEXT()) != EOF) {
127 if (c == '-' || c == '>') {
128 if (x->xmlcomment && datalen) {
129 x->data[datalen] = '\0';
130 x->xmlcomment(x, x->data, datalen);
131 datalen = 0;
132 }
133 }
134
135 if (c == '-') {
136 if (++i > 2) {
137 if (x->xmlcomment)
138 for (; i > 2; i--)
139 x->xmlcomment(x, "-", 1);
140 i = 2;
141 }
142 continue;
143 } else if (c == '>' && i == 2) {
144 if (x->xmlcommentend)
145 x->xmlcommentend(x);
146 return;
147 } else if (i) {
148 if (x->xmlcomment) {
149 for (; i > 0; i--)
150 x->xmlcomment(x, "-", 1);
151 }
152 i = 0;
153 }
154
155 if (datalen < sizeof(x->data) - 1) {
156 x->data[datalen++] = c;
157 } else {
158 x->data[datalen] = '\0';
159 if (x->xmlcomment)
160 x->xmlcomment(x, x->data, datalen);
161 x->data[0] = c;
162 datalen = 1;
163 }
164 }
165 }
166
167 static void
168 xml_parsecdata(XMLParser *x)
169 {
170 size_t datalen = 0, i = 0;
171 int c;
172
173 if (x->xmlcdatastart)
174 x->xmlcdatastart(x);
175 while ((c = GETNEXT()) != EOF) {
176 if (c == ']' || c == '>') {
177 if (x->xmlcdata && datalen) {
178 x->data[datalen] = '\0';
179 x->xmlcdata(x, x->data, datalen);
180 datalen = 0;
181 }
182 }
183
184 if (c == ']') {
185 if (++i > 2) {
186 if (x->xmlcdata)
187 for (; i > 2; i--)
188 x->xmlcdata(x, "]", 1);
189 i = 2;
190 }
191 continue;
192 } else if (c == '>' && i == 2) {
193 if (x->xmlcdataend)
194 x->xmlcdataend(x);
195 return;
196 } else if (i) {
197 if (x->xmlcdata)
198 for (; i > 0; i--)
199 x->xmlcdata(x, "]", 1);
200 i = 0;
201 }
202
203 if (datalen < sizeof(x->data) - 1) {
204 x->data[datalen++] = c;
205 } else {
206 x->data[datalen] = '\0';
207 if (x->xmlcdata)
208 x->xmlcdata(x, x->data, datalen);
209 x->data[0] = c;
210 datalen = 1;
211 }
212 }
213 }
214
/*
 * Encode code point r as UTF-8 into s and return the number of bytes
 * written (1 to 4). Code point 0 writes nothing and returns 0.
 * No NUL terminator is written and the range of r is not validated here:
 * s must have room for 4 bytes.
 */
static int
codepointtoutf8(long r, char *s)
{
	if (r == 0)
		return 0; /* NUL byte */

	if (r <= 0x7F) {
		/* 1 byte: 0aaaaaaa */
		s[0] = r;
		return 1;
	}

	if (r <= 0x07FF) {
		/* 2 bytes: 110aaaaa 10bbbbbb */
		s[0] = 0xC0 | ((r >> 6) & 0x1F);
		s[1] = 0x80 | (r & 0x3F);
		return 2;
	}

	if (r <= 0xFFFF) {
		/* 3 bytes: 1110aaaa 10bbbbbb 10cccccc */
		s[0] = 0xE0 | ((r >> 12) & 0x0F);
		s[1] = 0x80 | ((r >> 6) & 0x3F);
		s[2] = 0x80 | (r & 0x3F);
		return 3;
	}

	/* 4 bytes: 11110aaa 10bbbbbb 10cccccc 10dddddd */
	s[0] = 0xF0 | ((r >> 18) & 0x07);
	s[1] = 0x80 | ((r >> 12) & 0x3F);
	s[2] = 0x80 | ((r >> 6) & 0x3F);
	s[3] = 0x80 | (r & 0x3F);
	return 4;
}
244
/*
 * Convert one of the five predefined named entities to its character.
 * e is the entity without the leading '&' but including the trailing ';'
 * (for example "amp;"). On success the character and a NUL terminator are
 * written to buf and 1 is returned. Returns -1 when bufsiz is smaller than
 * 2 or when the name is not known.
 */
static int
namedentitytostr(const char *e, char *buf, size_t bufsiz)
{
	static const char *const names[] = { "amp;", "lt;", "gt;", "apos;", "quot;" };
	static const char chars[] = { '&', '<', '>', '\'', '"' };
	size_t n;

	/* buffer is too small */
	if (bufsiz < 2)
		return -1;

	for (n = 0; n < sizeof(names) / sizeof(names[0]); n++) {
		if (strcmp(e, names[n]) != 0)
			continue;
		buf[0] = chars[n];
		buf[1] = '\0';
		return 1;
	}

	return -1;
}
273
/*
 * Convert a numeric character reference to a UTF-8 string.
 * e is the reference without the leading "&#": decimal digits or 'x'
 * followed by hexadecimal digits, terminated by ';' (for example "65;" or
 * "x41;"). On success the NUL-terminated UTF-8 encoding is written to buf
 * and its byte length is returned (0 for code point 0).
 * Returns -1 when bufsiz is smaller than 5, when the reference is not
 * well-formed, or when the value is not a valid code point (negative,
 * above 0x10ffff or a surrogate).
 */
static int
numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
	long l;
	int len, base = 10;
	unsigned char first;
	char *end;

	/* buffer is too small */
	if (bufsiz < 5)
		return -1;

	/* hex (16) or decimal (10) */
	if (*e == 'x') {
		base = 16;
		e++;
	}

	/* strtol() also accepts leading white-space, a sign and (for base 16)
	 * a "0x" prefix. None of these are allowed in a character reference,
	 * so require the value to start with a digit of the base and reject
	 * the prefix explicitly. */
	first = (unsigned char)*e;
	if (!((first >= '0' && first <= '9') ||
	      (base == 16 && (first | 32) >= 'a' && (first | 32) <= 'f')))
		return -1;
	if (base == 16 && first == '0' && (e[1] == 'x' || e[1] == 'X'))
		return -1;

	errno = 0;
	l = strtol(e, &end, base);
	/* invalid value or not a well-formed entity or invalid code point */
	if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff ||
	    (l >= 0xd800 && l <= 0xdfff))
		return -1;
	len = codepointtoutf8(l, buf);
	buf[len] = '\0';

	return len;
}
300
/* Convert a named or numeric entity string (starting with '&') to a string
 * in buf. A '#' directly after the '&' selects numeric conversion, anything
 * else is looked up as a named entity.
 * Returns the byte-length of the string or -1 on failure. */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
	const char *rest;

	/* doesn't start with & */
	if (*e != '&')
		return -1;

	rest = e + 1;
	if (*rest == '#') /* numeric entity */
		return numericentitytostr(rest + 1, buf, bufsiz);

	/* named entity */
	return namedentitytostr(rest, buf, bufsiz);
}
315
316 void
317 xml_parse(XMLParser *x)
318 {
319 size_t datalen, tagdatalen;
320 int c, isend;
321
322 while ((c = GETNEXT()) != EOF && c != '<')
323 ; /* skip until < */
324
325 while (c != EOF) {
326 if (c == '<') { /* parse tag */
327 if ((c = GETNEXT()) == EOF)
328 return;
329
330 if (c == '!') { /* CDATA and comments */
331 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
332 /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
333 if (tagdatalen <= sizeof("[CDATA[") - 1)
334 x->data[tagdatalen++] = c;
335 if (c == '>')
336 break;
337 else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
338 (x->data[0] == '-')) {
339 xml_parsecomment(x);
340 break;
341 } else if (c == '[') {
342 if (tagdatalen == sizeof("[CDATA[") - 1 &&
343 !strncmp(x->data, "[CDATA[", tagdatalen)) {
344 xml_parsecdata(x);
345 break;
346 }
347 }
348 }
349 } else {
350 /* normal tag (open, short open, close), processing instruction. */
351 x->tag[0] = c;
352 x->taglen = 1;
353 x->isshorttag = isend = 0;
354
355 /* treat processing instruction as short tag, don't strip "?" prefix. */
356 if (c == '?') {
357 x->isshorttag = 1;
358 } else if (c == '/') {
359 if ((c = GETNEXT()) == EOF)
360 return;
361 x->tag[0] = c;
362 isend = 1;
363 }
364
365 while ((c = GETNEXT()) != EOF) {
366 if (c == '/')
367 x->isshorttag = 1; /* short tag */
368 else if (c == '>' || ISSPACE(c)) {
369 x->tag[x->taglen] = '\0';
370 if (isend) { /* end tag, starts with </ */
371 while (c != '>' && c != EOF) /* skip until > */
372 c = GETNEXT();
373 if (x->xmltagend)
374 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
375 x->tag[0] = '\0';
376 x->taglen = 0;
377 } else {
378 /* start tag */
379 if (x->xmltagstart)
380 x->xmltagstart(x, x->tag, x->taglen);
381 if (ISSPACE(c))
382 xml_parseattrs(x);
383 if (x->xmltagstartparsed)
384 x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
385 }
386 /* call tagend for short tag or processing instruction */
387 if (x->isshorttag) {
388 if (x->xmltagend)
389 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
390 x->tag[0] = '\0';
391 x->taglen = 0;
392 }
393 break;
394 } else if (x->taglen < sizeof(x->tag) - 1)
395 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
396 }
397 }
398 } else {
399 /* parse tag data */
400 datalen = 0;
401 if (x->xmldatastart)
402 x->xmldatastart(x);
403 while ((c = GETNEXT()) != EOF) {
404 if (c == '&') {
405 if (datalen) {
406 x->data[datalen] = '\0';
407 if (x->xmldata)
408 x->xmldata(x, x->data, datalen);
409 }
410 x->data[0] = c;
411 datalen = 1;
412 while ((c = GETNEXT()) != EOF) {
413 if (c == '<')
414 break;
415 if (datalen < sizeof(x->data) - 1)
416 x->data[datalen++] = c;
417 else {
418 /* entity too long for buffer, handle as normal data */
419 x->data[datalen] = '\0';
420 if (x->xmldata)
421 x->xmldata(x, x->data, datalen);
422 x->data[0] = c;
423 datalen = 1;
424 break;
425 }
426 if (c == ';') {
427 x->data[datalen] = '\0';
428 if (x->xmldataentity)
429 x->xmldataentity(x, x->data, datalen);
430 datalen = 0;
431 break;
432 }
433 }
434 } else if (c != '<') {
435 if (datalen < sizeof(x->data) - 1) {
436 x->data[datalen++] = c;
437 } else {
438 x->data[datalen] = '\0';
439 if (x->xmldata)
440 x->xmldata(x, x->data, datalen);
441 x->data[0] = c;
442 datalen = 1;
443 }
444 }
445 if (c == '<') {
446 x->data[datalen] = '\0';
447 if (x->xmldata && datalen)
448 x->xmldata(x, x->data, datalen);
449 if (x->xmldataend)
450 x->xmldataend(x);
451 break;
452 }
453 }
454 }
455 }
456 }