xml.c - sfeed - RSS and Atom parser
(HTM) git clone git://git.codemadness.org/sfeed
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
xml.c (10616B)
---
1 #include <errno.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5
6 #include "xml.h"
7
/*
 * Locale-independent ASCII character class tests.
 *
 * Each test casts its argument to unsigned so that a single comparison
 * rejects both values below and values above the wanted range; negative
 * values such as EOF therefore never match.
 * The argument is parenthesized inside the cast so that any expression can
 * be passed. NOTE: ISSPACE() and ISXDIGIT() evaluate their argument more
 * than once, so do not pass expressions with side effects.
 */
#define ISALPHA(c)  ((((unsigned)(c)) | 32) - 'a' < 26)
#define ISDIGIT(c)  (((unsigned)(c)) - '0' < 10)
#define ISSPACE(c)  ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
#define ISXDIGIT(c) ((((unsigned)(c)) - '0' < 10) || (((unsigned)(c)) | 32) - 'a' < 6)
12
13 static void
14 xml_parseattrs(XMLParser *x)
15 {
16 size_t namelen = 0, valuelen;
17 int c, endsep, endname = 0, valuestart = 0;
18
19 while ((c = GETNEXT()) != EOF) {
20 if (ISSPACE(c)) {
21 if (namelen)
22 endname = 1;
23 continue;
24 } else if (c == '?')
25 ; /* ignore */
26 else if (c == '=') {
27 x->name[namelen] = '\0';
28 valuestart = 1;
29 endname = 1;
30 } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
31 /* attribute without value */
32 x->name[namelen] = '\0';
33 if (x->xmlattrstart)
34 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
35 if (x->xmlattr)
36 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
37 if (x->xmlattrend)
38 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
39 endname = 0;
40 x->name[0] = c;
41 namelen = 1;
42 } else if (namelen && valuestart) {
43 /* attribute with value */
44 if (x->xmlattrstart)
45 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
46
47 valuelen = 0;
48 if (c == '\'' || c == '"') {
49 endsep = c;
50 } else {
51 endsep = ' '; /* ISSPACE() */
52 goto startvalue;
53 }
54
55 while ((c = GETNEXT()) != EOF) {
56 startvalue:
57 if (c == '&') { /* entities */
58 x->data[valuelen] = '\0';
59 /* call data function with data before entity if there is data */
60 if (valuelen && x->xmlattr)
61 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
62 x->data[0] = c;
63 valuelen = 1;
64 while ((c = GETNEXT()) != EOF) {
65 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
66 break;
67 if (valuelen < sizeof(x->data) - 1)
68 x->data[valuelen++] = c;
69 else {
70 /* entity too long for buffer, handle as normal data */
71 x->data[valuelen] = '\0';
72 if (x->xmlattr)
73 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
74 x->data[0] = c;
75 valuelen = 1;
76 break;
77 }
78 if (c == ';') {
79 x->data[valuelen] = '\0';
80 if (x->xmlattrentity)
81 x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
82 valuelen = 0;
83 break;
84 }
85 }
86 } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
87 if (valuelen < sizeof(x->data) - 1) {
88 x->data[valuelen++] = c;
89 } else {
90 x->data[valuelen] = '\0';
91 if (x->xmlattr)
92 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
93 x->data[0] = c;
94 valuelen = 1;
95 }
96 }
97 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
98 x->data[valuelen] = '\0';
99 if (x->xmlattr)
100 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
101 if (x->xmlattrend)
102 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
103 break;
104 }
105 }
106 namelen = endname = valuestart = 0;
107 } else if (namelen < sizeof(x->name) - 1) {
108 x->name[namelen++] = c;
109 }
110 if (c == '>') {
111 break;
112 } else if (c == '/') {
113 x->isshorttag = 1;
114 x->name[0] = '\0';
115 namelen = 0;
116 }
117 }
118 }
119
120 static void
121 xml_parsecomment(XMLParser *x)
122 {
123 int c, i = 0;
124
125 while ((c = GETNEXT()) != EOF) {
126 if (c == '-') {
127 if (++i > 2)
128 i = 2;
129 continue;
130 } else if (c == '>' && i == 2) {
131 return;
132 } else if (i) {
133 i = 0;
134 }
135 }
136 }
137
138 static void
139 xml_parsecdata(XMLParser *x)
140 {
141 size_t datalen = 0, i = 0;
142 int c;
143
144 while ((c = GETNEXT()) != EOF) {
145 if (c == ']' || c == '>') {
146 if (x->xmlcdata && datalen) {
147 x->data[datalen] = '\0';
148 x->xmlcdata(x, x->data, datalen);
149 datalen = 0;
150 }
151 }
152
153 if (c == ']') {
154 if (++i > 2) {
155 if (x->xmlcdata)
156 for (; i > 2; i--)
157 x->xmlcdata(x, "]", 1);
158 i = 2;
159 }
160 continue;
161 } else if (c == '>' && i == 2) {
162 return;
163 } else if (i) {
164 if (x->xmlcdata)
165 for (; i > 0; i--)
166 x->xmlcdata(x, "]", 1);
167 i = 0;
168 }
169
170 if (datalen < sizeof(x->data) - 1) {
171 x->data[datalen++] = c;
172 } else {
173 x->data[datalen] = '\0';
174 if (x->xmlcdata)
175 x->xmlcdata(x, x->data, datalen);
176 x->data[0] = c;
177 datalen = 1;
178 }
179 }
180 }
181
/*
 * Encode the Unicode code point r as UTF-8 into s.
 *
 * s must have room for at least 4 bytes; the output is not NUL-terminated.
 * Returns the number of bytes written (1 to 4). Returns 0 and writes nothing
 * for the NUL code point and for values that have no valid UTF-8 encoding:
 * negative values, UTF-16 surrogates (0xD800-0xDFFF) and values above
 * 0x10FFFF.
 */
static int
codepointtoutf8(long r, char *s)
{
	/* not encodable as UTF-8 */
	if (r < 0 || r > 0x10FFFF || (r >= 0xD800 && r <= 0xDFFF))
		return 0;

	if (r == 0)
		return 0; /* NUL byte */

	if (r <= 0x7F) {
		/* 1 byte: 0aaaaaaa */
		s[0] = (char)r;
		return 1;
	}
	if (r <= 0x07FF) {
		/* 2 bytes: 00000aaa aabbbbbb */
		s[0] = (char)(0xC0 | ((r & 0x0007C0) >> 6)); /* 110aaaaa */
		s[1] = (char)(0x80 | (r & 0x00003F));        /* 10bbbbbb */
		return 2;
	}
	if (r <= 0xFFFF) {
		/* 3 bytes: aaaabbbb bbcccccc */
		s[0] = (char)(0xE0 | ((r & 0x00F000) >> 12)); /* 1110aaaa */
		s[1] = (char)(0x80 | ((r & 0x000FC0) >> 6));  /* 10bbbbbb */
		s[2] = (char)(0x80 | (r & 0x00003F));         /* 10cccccc */
		return 3;
	}

	/* 4 bytes: 000aaabb bbbbcccc ccdddddd */
	s[0] = (char)(0xF0 | ((r & 0x1C0000) >> 18)); /* 11110aaa */
	s[1] = (char)(0x80 | ((r & 0x03F000) >> 12)); /* 10bbbbbb */
	s[2] = (char)(0x80 | ((r & 0x000FC0) >> 6));  /* 10cccccc */
	s[3] = (char)(0x80 | (r & 0x00003F));         /* 10dddddd */
	return 4;
}
211
/*
 * Convert a named entity to its character.
 *
 * e is the entity name without the leading '&' but including the trailing
 * ';', for example "amp;". Only amp, lt, gt, apos and quot are known.
 * On success the character and a terminating NUL are written to buf and 1 is
 * returned. Returns -1 when the name is unknown or bufsiz is smaller than 2.
 */
static int
namedentitytostr(const char *e, char *buf, size_t bufsiz)
{
	static const struct namedentity {
		const char *entity;
		int c;
	} entities[] = {
		{ "amp;",  '&'  },
		{ "lt;",   '<'  },
		{ "gt;",   '>'  },
		{ "apos;", '\'' },
		{ "quot;", '"'  },
	};
	const struct namedentity *ent;
	const struct namedentity *last = entities + sizeof(entities) / sizeof(entities[0]);

	/* need room for one character and the terminating NUL */
	if (bufsiz < 2)
		return -1;

	for (ent = entities; ent != last; ent++) {
		if (strcmp(e, ent->entity) != 0)
			continue;
		buf[0] = ent->c;
		buf[1] = '\0';
		return 1;
	}

	return -1; /* unknown entity */
}
240
/*
 * Convert a numeric entity to a UTF-8 string.
 *
 * e is the entity text after the leading "&#": decimal digits, or 'x'
 * followed by hexadecimal digits, then ';' and the terminating NUL.
 * On success the NUL-terminated UTF-8 encoding is written to buf and its
 * length in bytes is returned. Returns -1 when bufsiz is smaller than 5, when
 * the entity is not well-formed or when the value is not a valid code point
 * (negative, above 0x10ffff or in the surrogate range).
 */
static int
numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
	const char *p;
	char *stop;
	long cp;
	int hex = 0, n;

	/* need room for 4 UTF-8 bytes and the terminating NUL */
	if (bufsiz < 5)
		return -1;

	/* hex (base 16) or decimal (base 10) */
	if (*e == 'x') {
		hex = 1;
		e++;
	}

	/* only digits of the selected base may precede the ';' */
	for (p = e; *p && *p != ';'; p++) {
		if (hex) {
			if (!ISXDIGIT((unsigned char)*p))
				return -1; /* invalid: no hex */
		} else {
			if (!ISDIGIT((unsigned char)*p))
				return -1; /* invalid: no digits */
		}
	}
	if (p[0] != ';' || p[1] != '\0')
		return -1; /* must end with ';' NUL */

	errno = 0;
	cp = strtol(e, &stop, hex ? 16 : 10);

	/* conversion failed or not a well-formed entity */
	if (errno || stop == e || *stop != ';')
		return -1;
	/* invalid code point, including the surrogate range */
	if (cp < 0 || cp > 0x10ffff || (cp >= 0xd800 && cp <= 0xdfff))
		return -1;

	n = codepointtoutf8(cp, buf);
	buf[n] = '\0';

	return n;
}
284
/*
 * Convert a named or numeric entity string, such as "&amp;" or "&#x41;",
 * to its string value in buf.
 * Returns the byte-length of the string or -1 on failure.
 */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
	/* an entity always starts with '&' */
	if (e[0] != '&')
		return -1;

	/* "&#..." is a numeric entity, anything else a named entity */
	return (e[1] == '#')
	    ? numericentitytostr(e + 2, buf, bufsiz)
	    : namedentitytostr(e + 1, buf, bufsiz);
}
299
300 void
301 xml_parse(XMLParser *x)
302 {
303 size_t datalen, tagdatalen;
304 int c, isend;
305
306 while ((c = GETNEXT()) != EOF && c != '<')
307 ; /* skip until < */
308
309 while (c != EOF) {
310 if (c == '<') { /* parse tag */
311 if ((c = GETNEXT()) == EOF)
312 return;
313
314 if (c == '!') { /* CDATA and comments */
315 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
316 /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
317 if (tagdatalen <= sizeof("[CDATA[") - 1)
318 x->data[tagdatalen++] = c;
319 if (c == '>')
320 break;
321 else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
322 (x->data[0] == '-')) {
323 xml_parsecomment(x);
324 break;
325 } else if (c == '[') {
326 if (tagdatalen == sizeof("[CDATA[") - 1 &&
327 !strncmp(x->data, "[CDATA[", tagdatalen)) {
328 xml_parsecdata(x);
329 break;
330 }
331 }
332 }
333 } else {
334 /* normal tag (open, short open, close), processing instruction. */
335 x->tag[0] = c;
336 x->taglen = 1;
337 x->isshorttag = isend = 0;
338
339 /* treat processing instruction as short tag, don't strip "?" prefix. */
340 if (c == '?') {
341 x->isshorttag = 1;
342 } else if (c == '/') {
343 if ((c = GETNEXT()) == EOF)
344 return;
345 x->tag[0] = c;
346 isend = 1;
347 }
348
349 while ((c = GETNEXT()) != EOF) {
350 if (c == '/')
351 x->isshorttag = 1; /* short tag */
352 else if (c == '>' || ISSPACE(c)) {
353 x->tag[x->taglen] = '\0';
354 if (isend) { /* end tag, starts with </ */
355 while (c != '>' && c != EOF) /* skip until > */
356 c = GETNEXT();
357 if (x->xmltagend)
358 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
359 x->tag[0] = '\0';
360 x->taglen = 0;
361 } else {
362 /* start tag */
363 if (x->xmltagstart)
364 x->xmltagstart(x, x->tag, x->taglen);
365 if (ISSPACE(c))
366 xml_parseattrs(x);
367 if (x->xmltagstartparsed)
368 x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
369 }
370 /* call tagend for short tag or processing instruction */
371 if (x->isshorttag) {
372 if (x->xmltagend)
373 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
374 x->tag[0] = '\0';
375 x->taglen = 0;
376 }
377 break;
378 } else if (x->taglen < sizeof(x->tag) - 1)
379 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
380 }
381 }
382 } else {
383 /* parse tag data */
384 datalen = 0;
385 while ((c = GETNEXT()) != EOF) {
386 if (c == '&') { /* entities */
387 if (datalen) {
388 x->data[datalen] = '\0';
389 if (x->xmldata)
390 x->xmldata(x, x->data, datalen);
391 }
392 x->data[0] = c;
393 datalen = 1;
394 while ((c = GETNEXT()) != EOF) {
395 if (c == '<')
396 break;
397 if (datalen < sizeof(x->data) - 1)
398 x->data[datalen++] = c;
399 else {
400 /* entity too long for buffer, handle as normal data */
401 x->data[datalen] = '\0';
402 if (x->xmldata)
403 x->xmldata(x, x->data, datalen);
404 x->data[0] = c;
405 datalen = 1;
406 break;
407 }
408 if (c == ';') {
409 x->data[datalen] = '\0';
410 if (x->xmldataentity)
411 x->xmldataentity(x, x->data, datalen);
412 datalen = 0;
413 break;
414 }
415 }
416 } else if (c != '<') {
417 if (datalen < sizeof(x->data) - 1) {
418 x->data[datalen++] = c;
419 } else {
420 x->data[datalen] = '\0';
421 if (x->xmldata)
422 x->xmldata(x, x->data, datalen);
423 x->data[0] = c;
424 datalen = 1;
425 }
426 }
427 if (c == '<') {
428 x->data[datalen] = '\0';
429 if (x->xmldata && datalen)
430 x->xmldata(x, x->data, datalen);
431 break;
432 }
433 }
434 }
435 }
436 }