xml.c - xml2tsv - a simple xml-to-tsv converter, based on xmlparser
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) Tags
(DIR) README
(DIR) LICENSE
---
xml.c (10070B)
---
1 #include <ctype.h>
2 #include <errno.h>
3 #include <stdio.h>
4 #include <stdlib.h>
5 #include <string.h>
6
7 #include "xml.h"
8
9 static void
10 xml_parseattrs(XMLParser *x)
11 {
12 size_t namelen = 0, valuelen;
13 int c, endsep, endname = 0, valuestart = 0;
14
15 while ((c = GETNEXT()) != EOF) {
16 if (isspace(c)) {
17 if (namelen)
18 endname = 1;
19 continue;
20 } else if (c == '?')
21 ; /* ignore */
22 else if (c == '=') {
23 x->name[namelen] = '\0';
24 valuestart = 1;
25 endname = 1;
26 } else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) {
27 /* attribute without value */
28 x->name[namelen] = '\0';
29 if (x->xmlattrstart)
30 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
31 if (x->xmlattr)
32 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
33 if (x->xmlattrend)
34 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
35 endname = 0;
36 x->name[0] = c;
37 namelen = 1;
38 } else if (namelen && valuestart) {
39 /* attribute with value */
40 if (x->xmlattrstart)
41 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
42
43 valuelen = 0;
44 if (c == '\'' || c == '"') {
45 endsep = c;
46 } else {
47 endsep = ' '; /* isspace() */
48 goto startvalue;
49 }
50
51 while ((c = GETNEXT()) != EOF) {
52 startvalue:
53 if (c == '&') { /* entities */
54 x->data[valuelen] = '\0';
55 /* call data function with data before entity if there is data */
56 if (valuelen && x->xmlattr)
57 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
58 x->data[0] = c;
59 valuelen = 1;
60 while ((c = GETNEXT()) != EOF) {
61 if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c))))
62 break;
63 if (valuelen < sizeof(x->data) - 1)
64 x->data[valuelen++] = c;
65 else {
66 /* entity too long for buffer, handle as normal data */
67 x->data[valuelen] = '\0';
68 if (x->xmlattr)
69 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
70 x->data[0] = c;
71 valuelen = 1;
72 break;
73 }
74 if (c == ';') {
75 x->data[valuelen] = '\0';
76 if (x->xmlattrentity)
77 x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
78 valuelen = 0;
79 break;
80 }
81 }
82 } else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) {
83 if (valuelen < sizeof(x->data) - 1) {
84 x->data[valuelen++] = c;
85 } else {
86 x->data[valuelen] = '\0';
87 if (x->xmlattr)
88 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
89 x->data[0] = c;
90 valuelen = 1;
91 }
92 }
93 if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) {
94 x->data[valuelen] = '\0';
95 if (x->xmlattr)
96 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
97 if (x->xmlattrend)
98 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
99 break;
100 }
101 }
102 namelen = endname = valuestart = 0;
103 } else if (namelen < sizeof(x->name) - 1) {
104 x->name[namelen++] = c;
105 }
106 if (c == '>') {
107 break;
108 } else if (c == '/') {
109 x->isshorttag = 1;
110 x->name[0] = '\0';
111 namelen = 0;
112 }
113 }
114 }
115
116 static void
117 xml_parsecomment(XMLParser *x)
118 {
119 size_t i = 0;
120 int c;
121
122 while ((c = GETNEXT()) != EOF) {
123 if (c == '-') {
124 if (++i > 2)
125 i = 2;
126 continue;
127 } else if (c == '>' && i == 2) {
128 return;
129 } else if (i) {
130 i = 0;
131 }
132 }
133 }
134
135 static void
136 xml_parsecdata(XMLParser *x)
137 {
138 size_t datalen = 0, i = 0;
139 int c;
140
141 if (x->xmlcdatastart)
142 x->xmlcdatastart(x);
143 while ((c = GETNEXT()) != EOF) {
144 if (c == ']' || c == '>') {
145 if (x->xmlcdata && datalen) {
146 x->data[datalen] = '\0';
147 x->xmlcdata(x, x->data, datalen);
148 datalen = 0;
149 }
150 }
151
152 if (c == ']') {
153 if (++i > 2) {
154 if (x->xmlcdata)
155 for (; i > 2; i--)
156 x->xmlcdata(x, "]", 1);
157 i = 2;
158 }
159 continue;
160 } else if (c == '>' && i == 2) {
161 if (x->xmlcdataend)
162 x->xmlcdataend(x);
163 return;
164 } else if (i) {
165 if (x->xmlcdata)
166 for (; i > 0; i--)
167 x->xmlcdata(x, "]", 1);
168 i = 0;
169 }
170
171 if (datalen < sizeof(x->data) - 1) {
172 x->data[datalen++] = c;
173 } else {
174 x->data[datalen] = '\0';
175 if (x->xmlcdata)
176 x->xmlcdata(x, x->data, datalen);
177 x->data[0] = c;
178 datalen = 1;
179 }
180 }
181 }
182
/*
 * Encode the Unicode code point r as UTF-8 into s.
 *
 * s must have room for 4 bytes; no NUL terminator is written.
 * Returns the number of bytes written (1-4), or 0 when nothing is written:
 * that is the case for r == 0 (NUL byte) and for values that are not a
 * Unicode scalar value (negative, a surrogate U+D800..U+DFFF, or above
 * U+10FFFF), which cannot be represented in well-formed UTF-8.
 */
static int
codepointtoutf8(long r, char *s)
{
	/* write through unsigned char: bytes >= 0x80 do not fit a signed char */
	unsigned char *u = (unsigned char *)s;

	if (r <= 0 || r > 0x10FFFF || (r >= 0xD800 && r <= 0xDFFF))
		return 0;

	if (r <= 0x7F) {
		/* 1 byte: 0aaaaaaa */
		u[0] = (unsigned char)r;
		return 1;
	}
	if (r <= 0x07FF) {
		/* 2 bytes: 00000aaa aabbbbbb */
		u[0] = (unsigned char)(0xC0 | ((r & 0x0007C0) >> 6)); /* 110aaaaa */
		u[1] = (unsigned char)(0x80 | (r & 0x00003F));        /* 10bbbbbb */
		return 2;
	}
	if (r <= 0xFFFF) {
		/* 3 bytes: aaaabbbb bbcccccc */
		u[0] = (unsigned char)(0xE0 | ((r & 0x00F000) >> 12)); /* 1110aaaa */
		u[1] = (unsigned char)(0x80 | ((r & 0x000FC0) >> 6));  /* 10bbbbbb */
		u[2] = (unsigned char)(0x80 | (r & 0x00003F));         /* 10cccccc */
		return 3;
	}
	/* 4 bytes: 000aaabb bbbbcccc ccdddddd */
	u[0] = (unsigned char)(0xF0 | ((r & 0x1C0000) >> 18)); /* 11110aaa */
	u[1] = (unsigned char)(0x80 | ((r & 0x03F000) >> 12)); /* 10bbbbbb */
	u[2] = (unsigned char)(0x80 | ((r & 0x000FC0) >> 6));  /* 10cccccc */
	u[3] = (unsigned char)(0x80 | (r & 0x00003F));         /* 10dddddd */
	return 4;
}
212
/*
 * Translate one of the five predefined named entities into its character.
 *
 * e is the entity text without the leading '&' and including the trailing
 * ';' (for example "amp;"). On a match the character and a NUL terminator
 * are stored in buf and 1 is returned. Returns -1 when bufsiz is smaller
 * than 2 or when the name is not recognised.
 */
static int
namedentitytostr(const char *e, char *buf, size_t bufsiz)
{
	/* names[k] maps to chars[k] */
	static const char *const names[] = { "amp;", "lt;", "gt;", "apos;", "quot;" };
	static const char chars[] = { '&', '<', '>', '\'', '"' };
	const size_t count = sizeof(names) / sizeof(names[0]);
	size_t k = 0;

	if (bufsiz < 2)
		return -1; /* no room for the character plus terminator */

	while (k < count && strcmp(e, names[k]) != 0)
		k++;
	if (k == count)
		return -1;

	buf[0] = chars[k];
	buf[1] = '\0';
	return 1;
}
241
/*
 * Convert a numeric character reference to a NUL-terminated UTF-8 string.
 *
 * e is the reference text after the leading "&#": decimal digits, or 'x'
 * followed by hexadecimal digits, terminated by ';'. buf must hold at least
 * 5 bytes (up to 4 bytes of UTF-8 plus the terminator).
 *
 * Returns the byte length stored in buf (0 for code point 0), or -1 when
 * the buffer is too small, the reference is malformed, or the value is not
 * a Unicode scalar value (above U+10FFFF or a surrogate U+D800..U+DFFF).
 */
static int
numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
	const char *p;
	char *end;
	long l;
	int base = 10, len;

	/* buffer is too small */
	if (bufsiz < 5)
		return -1;

	/* hex (16) or decimal (10) */
	if (*e == 'x') {
		base = 16;
		e++;
	}

	/*
	 * Require a non-empty run of digits directly followed by ';'.
	 * strtol() alone would also accept leading whitespace, a sign and a
	 * "0x" prefix, none of which may appear in a character reference.
	 */
	for (p = e; base == 16 ? isxdigit((unsigned char)*p) : isdigit((unsigned char)*p); p++)
		;
	if (p == e || *p != ';')
		return -1;

	errno = 0;
	l = strtol(e, &end, base);
	/* out of range for long, or not a valid code point */
	if (errno || end != p || l < 0 || l > 0x10ffff ||
	    (l >= 0xd800 && l <= 0xdfff))
		return -1;

	len = codepointtoutf8(l, buf);
	buf[len] = '\0';

	return len;
}
267
/*
 * Convert an entity string ("&name;" or "&#number;") to its string value.
 *
 * e must start with '&'; a '#' right after it selects the numeric
 * conversion, anything else the named one. The result is stored in buf,
 * which has room for bufsiz bytes.
 * Returns the byte length of the resulting string or -1 on failure.
 */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
	const char *body;

	if (e[0] != '&')
		return -1; /* not an entity */

	body = e + 1;
	return (body[0] == '#')
	    ? numericentitytostr(body + 1, buf, bufsiz)
	    : namedentitytostr(body, buf, bufsiz);
}
282
283 void
284 xml_parse(XMLParser *x)
285 {
286 size_t datalen, tagdatalen;
287 int c, isend;
288
289 while ((c = GETNEXT()) != EOF && c != '<')
290 ; /* skip until < */
291
292 while (c != EOF) {
293 if (c == '<') { /* parse tag */
294 if ((c = GETNEXT()) == EOF)
295 return;
296
297 if (c == '!') { /* cdata and comments */
298 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
299 /* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */
300 if (tagdatalen <= sizeof("[CDATA[") - 1)
301 x->data[tagdatalen++] = c;
302 if (c == '>')
303 break;
304 else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
305 (x->data[0] == '-')) {
306 xml_parsecomment(x);
307 break;
308 } else if (c == '[') {
309 if (tagdatalen == sizeof("[CDATA[") - 1 &&
310 !strncmp(x->data, "[CDATA[", tagdatalen)) {
311 xml_parsecdata(x);
312 break;
313 }
314 }
315 }
316 } else {
317 /* normal tag (open, short open, close), processing instruction. */
318 x->tag[0] = c;
319 x->taglen = 1;
320 x->isshorttag = isend = 0;
321
322 /* treat processing instruction as shorttag, don't strip "?" prefix. */
323 if (c == '?') {
324 x->isshorttag = 1;
325 } else if (c == '/') {
326 if ((c = GETNEXT()) == EOF)
327 return;
328 x->tag[0] = c;
329 isend = 1;
330 }
331
332 while ((c = GETNEXT()) != EOF) {
333 if (c == '/')
334 x->isshorttag = 1; /* short tag */
335 else if (c == '>' || isspace(c)) {
336 x->tag[x->taglen] = '\0';
337 if (isend) { /* end tag, starts with </ */
338 if (x->xmltagend)
339 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
340 x->tag[0] = '\0';
341 x->taglen = 0;
342 } else {
343 /* start tag */
344 if (x->xmltagstart)
345 x->xmltagstart(x, x->tag, x->taglen);
346 if (isspace(c))
347 xml_parseattrs(x);
348 if (x->xmltagstartparsed)
349 x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
350 }
351 /* call tagend for shortform or processing instruction */
352 if (x->isshorttag) {
353 if (x->xmltagend)
354 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
355 x->tag[0] = '\0';
356 x->taglen = 0;
357 }
358 break;
359 } else if (x->taglen < sizeof(x->tag) - 1)
360 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
361 }
362 }
363 } else {
364 /* parse tag data */
365 datalen = 0;
366 if (x->xmldatastart)
367 x->xmldatastart(x);
368 while ((c = GETNEXT()) != EOF) {
369 if (c == '&') {
370 if (datalen) {
371 x->data[datalen] = '\0';
372 if (x->xmldata)
373 x->xmldata(x, x->data, datalen);
374 }
375 x->data[0] = c;
376 datalen = 1;
377 while ((c = GETNEXT()) != EOF) {
378 if (c == '<')
379 break;
380 if (datalen < sizeof(x->data) - 1)
381 x->data[datalen++] = c;
382 else {
383 /* entity too long for buffer, handle as normal data */
384 x->data[datalen] = '\0';
385 if (x->xmldata)
386 x->xmldata(x, x->data, datalen);
387 x->data[0] = c;
388 datalen = 1;
389 break;
390 }
391 if (c == ';') {
392 x->data[datalen] = '\0';
393 if (x->xmldataentity)
394 x->xmldataentity(x, x->data, datalen);
395 datalen = 0;
396 break;
397 }
398 }
399 } else if (c != '<') {
400 if (datalen < sizeof(x->data) - 1) {
401 x->data[datalen++] = c;
402 } else {
403 x->data[datalen] = '\0';
404 if (x->xmldata)
405 x->xmldata(x, x->data, datalen);
406 x->data[0] = c;
407 datalen = 1;
408 }
409 }
410 if (c == '<') {
411 x->data[datalen] = '\0';
412 if (x->xmldata && datalen)
413 x->xmldata(x, x->data, datalen);
414 if (x->xmldataend)
415 x->xmldataend(x);
416 break;
417 }
418 }
419 }
420 }
421 }