xml.c - bag - BAG Kadaster Extract parser (subset)
(HTM) git clone git://git.codemadness.org/bag
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
xml.c (11249B)
---
1 #include <errno.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5
6 #include "xml.h"
7
/*
 * ASCII-only character classification. The argument is converted to unsigned
 * so that a single "value - low < count" comparison covers the whole range:
 * anything below the range wraps around to a huge value and fails the test.
 * The argument is fully parenthesised so that expression arguments (for
 * example a conditional expression) are converted as a whole.
 */
/* ISALPHA: fold to lower case by setting bit 5, then test for 'a'..'z'. */
#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
/* ISSPACE: ' ' or one of the five characters '\t', '\n', '\v', '\f', '\r'.
 * NOTE: evaluates its argument twice. */
#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
10
/* Read buffer for getnext(): rbuf[roffset..rtotal) holds bytes that were read
 * from stdin but not yet returned. */
static int roffset, rtotal;
static char rbuf[4096*4];

/*
 * Return the next byte from stdin as a value in the range 0..255, or EOF when
 * the input is exhausted. Input is read in blocks of sizeof(rbuf) bytes.
 * Exits the process with status 1 on a read error.
 */
int
getnext(void)
{
	size_t n;

	if (roffset >= rtotal) {
		n = fread(rbuf, 1, sizeof(rbuf), stdin);
		if (ferror(stdin))
			exit(1);
		roffset = 0;
		rtotal = (int)n; /* n <= sizeof(rbuf), always fits */
		/* Only a read of zero bytes means end of input: the last,
		 * short block sets the EOF indicator but still carries data. */
		if (n == 0)
			return EOF;
	}
	/* convert via unsigned char so that bytes >= 0x80 are not returned as
	 * negative values (0xFF would otherwise be indistinguishable from EOF) */
	return (unsigned char)rbuf[roffset++];
}
33
/* GETNEXT is the byte source used by all parsing functions in this file: it
 * is called without arguments and its result is compared against EOF.
 * The commented-out line selects the buffered getnext() defined in this file
 * instead of getchar_unlocked(). */
//#define GETNEXT getnext
#define GETNEXT getchar_unlocked
36
/*
 * Parse the attribute list of a tag. Consumes input through GETNEXT() until
 * the '>' that closes the tag, or until EOF.
 *
 * The attribute name is collected in x->name (silently truncated to
 * sizeof(x->name) - 1 bytes) and the value in x->data. For each attribute
 * the callbacks are invoked, when set, in this order:
 *   xmlattrstart, then zero or more xmlattr / xmlattrentity calls carrying
 *   pieces of the value, then xmlattrend.
 * A value longer than the data buffer is delivered in several xmlattr calls.
 * Values may be quoted with ' or ", or unquoted (then they end at whitespace
 * or '>'). A '/' seen outside a value marks the tag as a short tag.
 */
static void
xml_parseattrs(XMLParser *x)
{
	size_t namelen = 0, valuelen;
	int c, endsep, endname = 0, valuestart = 0;

	while ((c = GETNEXT()) != EOF) {
		if (ISSPACE(c)) {
			/* whitespace after at least one name byte terminates the name */
			if (namelen)
				endname = 1;
			continue;
		} else if (c == '?')
			; /* ignore */
		else if (c == '=') {
			x->name[namelen] = '\0';
			valuestart = 1;
			endname = 1;
		} else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
			/* attribute without value */
			x->name[namelen] = '\0';
			if (x->xmlattrstart)
				x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
			if (x->xmlattr)
				x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
			if (x->xmlattrend)
				x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
			endname = 0;
			/* c starts the next name; when c is '>' or '/' the checks at
			 * the bottom of the loop end the tag or reset the name */
			x->name[0] = c;
			namelen = 1;
		} else if (namelen && valuestart) {
			/* attribute with value */
			if (x->xmlattrstart)
				x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);

			valuelen = 0;
			if (c == '\'' || c == '"') {
				endsep = c;
			} else {
				/* unquoted value: ' ' is a marker meaning "ends at
				 * whitespace or '>'"; c is already the first value byte,
				 * so jump into the loop body without reading another */
				endsep = ' '; /* ISSPACE() */
				goto startvalue;
			}

			while ((c = GETNEXT()) != EOF) {
startvalue:
				if (c == '&') { /* entities */
					x->data[valuelen] = '\0';
					/* call data function with data before entity if there is data */
					if (valuelen && x->xmlattr)
						x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
					/* collect the entity, including '&' and ';', in x->data */
					x->data[0] = c;
					valuelen = 1;
					while ((c = GETNEXT()) != EOF) {
						/* value ended before ';': what was collected is
						 * reported as plain data by the end check below */
						if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
							break;
						if (valuelen < sizeof(x->data) - 1)
							x->data[valuelen++] = c;
						else {
							/* entity too long for buffer, handle as normal data */
							x->data[valuelen] = '\0';
							if (x->xmlattr)
								x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
							x->data[0] = c;
							valuelen = 1;
							break;
						}
						if (c == ';') {
							x->data[valuelen] = '\0';
							if (x->xmlattrentity)
								x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
							valuelen = 0;
							break;
						}
					}
				} else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
					/* ordinary value byte: append, flushing the buffer
					 * through xmlattr first when it is full */
					if (valuelen < sizeof(x->data) - 1) {
						x->data[valuelen++] = c;
					} else {
						x->data[valuelen] = '\0';
						if (x->xmlattr)
							x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
						x->data[0] = c;
						valuelen = 1;
					}
				}
				/* end of value: deliver the remaining data (possibly empty)
				 * and close the attribute */
				if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
					x->data[valuelen] = '\0';
					if (x->xmlattr)
						x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
					if (x->xmlattrend)
						x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
					break;
				}
			}
			namelen = endname = valuestart = 0;
		} else if (namelen < sizeof(x->name) - 1) {
			/* name byte; bytes beyond the buffer size are dropped */
			x->name[namelen++] = c;
		}
		if (c == '>') {
			break;
		} else if (c == '/') {
			x->isshorttag = 1;
			x->name[0] = '\0';
			namelen = 0;
		}
	}
}
143
144 static void
145 xml_parsecomment(XMLParser *x)
146 {
147 size_t datalen = 0, i = 0;
148 int c;
149
150 if (x->xmlcommentstart)
151 x->xmlcommentstart(x);
152 while ((c = GETNEXT()) != EOF) {
153 if (c == '-' || c == '>') {
154 if (x->xmlcomment && datalen) {
155 x->data[datalen] = '\0';
156 x->xmlcomment(x, x->data, datalen);
157 datalen = 0;
158 }
159 }
160
161 if (c == '-') {
162 if (++i > 2) {
163 if (x->xmlcomment)
164 for (; i > 2; i--)
165 x->xmlcomment(x, "-", 1);
166 i = 2;
167 }
168 continue;
169 } else if (c == '>' && i == 2) {
170 if (x->xmlcommentend)
171 x->xmlcommentend(x);
172 return;
173 } else if (i) {
174 if (x->xmlcomment) {
175 for (; i > 0; i--)
176 x->xmlcomment(x, "-", 1);
177 }
178 i = 0;
179 }
180
181 if (datalen < sizeof(x->data) - 1) {
182 x->data[datalen++] = c;
183 } else {
184 x->data[datalen] = '\0';
185 if (x->xmlcomment)
186 x->xmlcomment(x, x->data, datalen);
187 x->data[0] = c;
188 datalen = 1;
189 }
190 }
191 }
192
193 static void
194 xml_parsecdata(XMLParser *x)
195 {
196 size_t datalen = 0, i = 0;
197 int c;
198
199 if (x->xmlcdatastart)
200 x->xmlcdatastart(x);
201 while ((c = GETNEXT()) != EOF) {
202 if (c == ']' || c == '>') {
203 if (x->xmlcdata && datalen) {
204 x->data[datalen] = '\0';
205 x->xmlcdata(x, x->data, datalen);
206 datalen = 0;
207 }
208 }
209
210 if (c == ']') {
211 if (++i > 2) {
212 if (x->xmlcdata)
213 for (; i > 2; i--)
214 x->xmlcdata(x, "]", 1);
215 i = 2;
216 }
217 continue;
218 } else if (c == '>' && i == 2) {
219 if (x->xmlcdataend)
220 x->xmlcdataend(x);
221 return;
222 } else if (i) {
223 if (x->xmlcdata)
224 for (; i > 0; i--)
225 x->xmlcdata(x, "]", 1);
226 i = 0;
227 }
228
229 if (datalen < sizeof(x->data) - 1) {
230 x->data[datalen++] = c;
231 } else {
232 x->data[datalen] = '\0';
233 if (x->xmlcdata)
234 x->xmlcdata(x, x->data, datalen);
235 x->data[0] = c;
236 datalen = 1;
237 }
238 }
239 }
240
/*
 * Encode code point r as UTF-8 into s and return the number of bytes written
 * (1 to 4). Returns 0 and writes nothing when r is 0. No NUL terminator is
 * written and r is not range checked; s must have room for 4 bytes.
 */
static int
codepointtoutf8(long r, char *s)
{
	int len, i;
	long lead, mask;

	if (r == 0)
		return 0; /* NUL byte */
	if (r <= 0x7F) {
		/* single byte: 0aaaaaaa */
		s[0] = r;
		return 1;
	}

	/* pick the sequence length, the marker bits of the leading byte and
	 * the mask for the payload bits that fit into the leading byte */
	if (r <= 0x07FF) {
		len = 2;
		lead = 0xC0; /* 110aaaaa */
		mask = 0x1F;
	} else if (r <= 0xFFFF) {
		len = 3;
		lead = 0xE0; /* 1110aaaa */
		mask = 0x0F;
	} else {
		len = 4;
		lead = 0xF0; /* 11110aaa */
		mask = 0x07;
	}

	/* continuation bytes (10xxxxxx) carry 6 bits each, filled back to front */
	for (i = len - 1; i > 0; i--) {
		s[i] = 0x80 | (r & 0x3F);
		r >>= 6;
	}
	s[0] = lead | (r & mask);

	return len;
}
270
/*
 * Translate a predefined named entity to its character.
 *
 * e is the entity without the leading '&' but including the trailing ';'
 * (for example "amp;"). On a match the character and a NUL terminator are
 * stored in buf and 1 is returned. Returns -1 when bufsiz is less than 2 or
 * when the name is not one of amp, lt, gt, apos, quot.
 */
static int
namedentitytostr(const char *e, char *buf, size_t bufsiz)
{
	/* names[k] translates to chars[k]; names is NULL terminated */
	static const char *const names[] = {
		"amp;", "lt;", "gt;", "apos;", "quot;", NULL
	};
	static const char chars[] = { '&', '<', '>', '\'', '"' };
	size_t k;

	/* need room for one character plus the terminator */
	if (bufsiz < 2)
		return -1;

	for (k = 0; names[k] != NULL && strcmp(e, names[k]) != 0; k++)
		;
	if (names[k] == NULL)
		return -1;

	buf[0] = chars[k];
	buf[1] = '\0';
	return 1;
}
299
/*
 * Translate a numeric character reference to UTF-8.
 *
 * e is the reference without the leading "&#" but including the trailing
 * ';': decimal digits ("65;") or 'x' followed by hexadecimal digits
 * ("x41;"). On success the encoded code point and a NUL terminator are
 * stored in buf and the number of encoded bytes is returned (0 for code
 * point 0). Returns -1 when bufsiz is less than 5, when the reference is not
 * well-formed, or when the value is not a valid code point (above 0x10ffff
 * or a surrogate, 0xd800..0xdfff).
 *
 * The digits are parsed by hand rather than with strtol(): strtol() skips
 * leading whitespace and accepts a sign and, for base 16, a "0x" prefix,
 * which would let malformed references such as "&# 65;", "&#+65;" or
 * "&#x0x41;" through.
 */
static int
numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
	const char *p;
	long l = 0;
	int base = 10, digit, len;

	/* buffer is too small */
	if (bufsiz < 5)
		return -1;

	/* hex (16) or decimal (10) */
	if (*e == 'x') {
		base = 16;
		e++;
	}

	for (p = e; ; p++) {
		if (*p >= '0' && *p <= '9')
			digit = *p - '0';
		else if (base == 16 && (*p | 32) >= 'a' && (*p | 32) <= 'f')
			digit = (*p | 32) - 'a' + 10; /* | 32 folds A-F to a-f */
		else
			break;
		l = l * base + digit;
		/* reject as soon as the value leaves the Unicode range; this
		 * also keeps l far away from overflowing */
		if (l > 0x10ffff)
			return -1;
	}

	/* no digits at all, not terminated by ';', or a surrogate */
	if (p == e || *p != ';' || (l >= 0xd800 && l <= 0xdfff))
		return -1;

	len = codepointtoutf8(l, buf);
	buf[len] = '\0';

	return len;
}
326
/* Convert an entity string, named ("&amp;") or numeric ("&#38;", "&#x26;"),
 * to the string it stands for and store that string in buf.
 * Returns the byte-length of the resulting string or -1 on failure; a string
 * that does not start with '&' is a failure. */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
	if (e[0] == '&') {
		/* "&#..." is a numeric reference, anything else a named entity */
		return e[1] == '#'
		    ? numericentitytostr(e + 2, buf, bufsiz)
		    : namedentitytostr(e + 1, buf, bufsiz);
	}
	return -1;
}
341
/*
 * Parse XML read through GETNEXT() until EOF, reporting what is found through
 * the callbacks stored in x. Every callback is optional (checked for NULL
 * before each call).
 *
 * Input before the first '<' is skipped. After that the input is handled as
 * an alternation of
 *  - markup starting with '<': comments ("<!--", see xml_parsecomment()),
 *    CDATA sections ("<![CDATA[", see xml_parsecdata()), other "<!"
 *    constructs (skipped up to the next '>'), and tags, whose name is
 *    collected in x->tag (silently truncated to sizeof(x->tag) - 1 bytes);
 *  - character data up to the next '<', delivered in pieces through xmldata,
 *    with "&...;" sequences delivered separately through xmldataentity.
 */
void
xml_parse(XMLParser *x)
{
	size_t datalen, tagdatalen;
	int c, isend;

	while ((c = GETNEXT()) != EOF && c != '<')
		; /* skip until < */

	while (c != EOF) {
		if (c == '<') { /* parse tag */
			if ((c = GETNEXT()) == EOF)
				return;

			if (c == '!') { /* CDATA and comments */
				/* collect the first bytes after "<!" in x->data to
				 * recognise "--" and "[CDATA["; anything else is
				 * skipped up to the next '>' */
				for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
					/* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
					if (tagdatalen <= sizeof("[CDATA[") - 1)
						x->data[tagdatalen++] = c;
					if (c == '>')
						break;
					else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
					         (x->data[0] == '-')) {
						xml_parsecomment(x);
						break;
					} else if (c == '[') {
						if (tagdatalen == sizeof("[CDATA[") - 1 &&
						    !strncmp(x->data, "[CDATA[", tagdatalen)) {
							xml_parsecdata(x);
							break;
						}
					}
				}
			} else {
				/* normal tag (open, short open, close), processing instruction. */
				x->tag[0] = c;
				x->taglen = 1;
				x->isshorttag = isend = 0;

				/* treat processing instruction as short tag, don't strip "?" prefix. */
				if (c == '?') {
					x->isshorttag = 1;
				} else if (c == '/') {
					/* end tag: drop the '/', the name starts at the next byte */
					if ((c = GETNEXT()) == EOF)
						return;
					x->tag[0] = c;
					isend = 1;
				}

				/* read the rest of the tag name; it ends at '>' or whitespace */
				while ((c = GETNEXT()) != EOF) {
					if (c == '/')
						x->isshorttag = 1; /* short tag */
					else if (c == '>' || ISSPACE(c)) {
						x->tag[x->taglen] = '\0';
						if (isend) { /* end tag, starts with </ */
							if (x->xmltagend)
								x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
							x->tag[0] = '\0';
							x->taglen = 0;
						} else {
							/* start tag */
							if (x->xmltagstart)
								x->xmltagstart(x, x->tag, x->taglen);
							/* whitespace after the name: attributes may
							 * follow; xml_parseattrs() reads up to '>' */
							if (ISSPACE(c))
								xml_parseattrs(x);
							if (x->xmltagstartparsed)
								x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
						}
						/* call tagend for short tag or processing instruction */
						if (x->isshorttag) {
							if (x->xmltagend)
								x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
							x->tag[0] = '\0';
							x->taglen = 0;
						}
						break;
					} else if (x->taglen < sizeof(x->tag) - 1)
						x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
				}
			}
		} else {
			/* parse tag data */
			/* c is the byte that ended the preceding markup and is not
			 * part of the data; xmldatastart and xmldataend are called
			 * even when no data follows before the next '<' */
			datalen = 0;
			if (x->xmldatastart)
				x->xmldatastart(x);
			while ((c = GETNEXT()) != EOF) {
				if (c == '&') {
					/* deliver the data before the entity first */
					if (datalen) {
						x->data[datalen] = '\0';
						if (x->xmldata)
							x->xmldata(x, x->data, datalen);
					}
					/* collect the entity, including '&' and ';', in x->data */
					x->data[0] = c;
					datalen = 1;
					while ((c = GETNEXT()) != EOF) {
						/* data ended before ';': what was collected is
						 * delivered as plain data by the '<' check below */
						if (c == '<')
							break;
						if (datalen < sizeof(x->data) - 1)
							x->data[datalen++] = c;
						else {
							/* entity too long for buffer, handle as normal data */
							x->data[datalen] = '\0';
							if (x->xmldata)
								x->xmldata(x, x->data, datalen);
							x->data[0] = c;
							datalen = 1;
							break;
						}
						if (c == ';') {
							x->data[datalen] = '\0';
							if (x->xmldataentity)
								x->xmldataentity(x, x->data, datalen);
							datalen = 0;
							break;
						}
					}
				} else if (c != '<') {
					/* ordinary data byte: append, flushing the buffer
					 * through xmldata first when it is full */
					if (datalen < sizeof(x->data) - 1) {
						x->data[datalen++] = c;
					} else {
						x->data[datalen] = '\0';
						if (x->xmldata)
							x->xmldata(x, x->data, datalen);
						x->data[0] = c;
						datalen = 1;
					}
				}
				/* start of the next markup: deliver what is left and
				 * return to the outer loop with c == '<' */
				if (c == '<') {
					x->data[datalen] = '\0';
					if (x->xmldata && datalen)
						x->xmldata(x, x->data, datalen);
					if (x->xmldataend)
						x->xmldataend(x);
					break;
				}
			}
		}
	}
}