xml.c - sub - subscene.com subtitle search
(HTM) git clone git://git.codemadness.org/sub
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
xml.c (11059B)
---
1 #include <sys/types.h>
2
3 #include <ctype.h>
4 #include <errno.h>
5 #include <limits.h>
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <string.h>
9
10 #include "xml.h"
11
12 static void
13 xml_parseattrs(XMLParser *x)
14 {
15 size_t namelen = 0, valuelen;
16 int c, endsep, endname = 0, valuestart = 0;
17
18 while ((c = x->getnext()) != EOF) {
19 if (isspace(c)) {
20 if (namelen)
21 endname = 1;
22 continue;
23 } else if (c == '?')
24 ; /* ignore */
25 else if (c == '=') {
26 x->name[namelen] = '\0';
27 valuestart = 1;
28 endname = 1;
29 } else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) {
30 /* attribute without value */
31 x->name[namelen] = '\0';
32 if (x->xmlattrstart)
33 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
34 if (x->xmlattr)
35 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
36 if (x->xmlattrend)
37 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
38 endname = 0;
39 x->name[0] = c;
40 namelen = 1;
41 } else if (namelen && valuestart) {
42 /* attribute with value */
43 if (x->xmlattrstart)
44 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
45
46 valuelen = 0;
47 if (c == '\'' || c == '"') {
48 endsep = c;
49 } else {
50 endsep = ' '; /* isspace() */
51 goto startvalue;
52 }
53
54 while ((c = x->getnext()) != EOF) {
55 startvalue:
56 if (c == '&') { /* entities */
57 x->data[valuelen] = '\0';
58 /* call data function with data before entity if there is data */
59 if (valuelen && x->xmlattr)
60 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
61 x->data[0] = c;
62 valuelen = 1;
63 while ((c = x->getnext()) != EOF) {
64 if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c))))
65 break;
66 if (valuelen < sizeof(x->data) - 1)
67 x->data[valuelen++] = c;
68 else {
69 /* entity too long for buffer, handle as normal data */
70 x->data[valuelen] = '\0';
71 if (x->xmlattr)
72 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
73 x->data[0] = c;
74 valuelen = 1;
75 break;
76 }
77 if (c == ';') {
78 x->data[valuelen] = '\0';
79 if (x->xmlattrentity)
80 x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
81 valuelen = 0;
82 break;
83 }
84 }
85 } else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) {
86 if (valuelen < sizeof(x->data) - 1) {
87 x->data[valuelen++] = c;
88 } else {
89 x->data[valuelen] = '\0';
90 if (x->xmlattr)
91 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
92 x->data[0] = c;
93 valuelen = 1;
94 }
95 }
96 if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) {
97 x->data[valuelen] = '\0';
98 if (x->xmlattr)
99 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
100 if (x->xmlattrend)
101 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
102 break;
103 }
104 }
105 namelen = endname = valuestart = 0;
106 } else if (namelen < sizeof(x->name) - 1) {
107 x->name[namelen++] = c;
108 }
109 if (c == '>') {
110 break;
111 } else if (c == '/') {
112 x->isshorttag = 1;
113 x->name[0] = '\0';
114 namelen = 0;
115 }
116 }
117 }
118
119 static void
120 xml_parsecomment(XMLParser *x)
121 {
122 size_t datalen = 0, i = 0;
123 int c;
124
125 if (x->xmlcommentstart)
126 x->xmlcommentstart(x);
127 while ((c = x->getnext()) != EOF) {
128 if (c == '-' || c == '>') {
129 if (x->xmlcomment) {
130 x->data[datalen] = '\0';
131 x->xmlcomment(x, x->data, datalen);
132 datalen = 0;
133 }
134 }
135
136 if (c == '-') {
137 if (++i > 2) {
138 if (x->xmlcomment)
139 for (; i > 2; i--)
140 x->xmlcomment(x, "-", 1);
141 i = 2;
142 }
143 continue;
144 } else if (c == '>' && i == 2) {
145 if (x->xmlcommentend)
146 x->xmlcommentend(x);
147 return;
148 } else if (i) {
149 if (x->xmlcomment) {
150 for (; i > 0; i--)
151 x->xmlcomment(x, "-", 1);
152 }
153 i = 0;
154 }
155
156 if (datalen < sizeof(x->data) - 1) {
157 x->data[datalen++] = c;
158 } else {
159 x->data[datalen] = '\0';
160 if (x->xmlcomment)
161 x->xmlcomment(x, x->data, datalen);
162 x->data[0] = c;
163 datalen = 1;
164 }
165 }
166 }
167
168 static void
169 xml_parsecdata(XMLParser *x)
170 {
171 size_t datalen = 0, i = 0;
172 int c;
173
174 if (x->xmlcdatastart)
175 x->xmlcdatastart(x);
176 while ((c = x->getnext()) != EOF) {
177 if (c == ']' || c == '>') {
178 if (x->xmlcdata) {
179 x->data[datalen] = '\0';
180 x->xmlcdata(x, x->data, datalen);
181 datalen = 0;
182 }
183 }
184
185 if (c == ']') {
186 if (++i > 2) {
187 if (x->xmlcdata)
188 for (; i > 2; i--)
189 x->xmlcdata(x, "]", 1);
190 i = 2;
191 }
192 continue;
193 } else if (c == '>' && i == 2) {
194 if (x->xmlcdataend)
195 x->xmlcdataend(x);
196 return;
197 } else if (i) {
198 if (x->xmlcdata)
199 for (; i > 0; i--)
200 x->xmlcdata(x, "]", 1);
201 i = 0;
202 }
203
204 if (datalen < sizeof(x->data) - 1) {
205 x->data[datalen++] = c;
206 } else {
207 x->data[datalen] = '\0';
208 if (x->xmlcdata)
209 x->xmlcdata(x, x->data, datalen);
210 x->data[0] = c;
211 datalen = 1;
212 }
213 }
214 }
215
/* Encode the Unicode code point r as UTF-8 into s.
 * s must have room for at least 4 bytes; no NUL terminator is written.
 * Returns the number of bytes written (1-4), or 0 when nothing was written:
 * for the NUL code point and for values that have no valid UTF-8 encoding
 * (negative, above 0x10FFFF, or a surrogate in the range 0xD800-0xDFFF). */
static int
codepointtoutf8(long r, char *s)
{
	/* NUL byte, or not encodable as well-formed UTF-8 */
	if (r <= 0 || r > 0x10FFFF || (r >= 0xD800 && r <= 0xDFFF))
		return 0;

	if (r <= 0x7F) {
		/* 1 byte: 0aaaaaaa */
		s[0] = (char)r;
		return 1;
	} else if (r <= 0x07FF) {
		/* 2 bytes: 00000aaa aabbbbbb */
		s[0] = (char)(0xC0 | ((r & 0x0007C0) >> 6)); /* 110aaaaa */
		s[1] = (char)(0x80 | (r & 0x00003F));        /* 10bbbbbb */
		return 2;
	} else if (r <= 0xFFFF) {
		/* 3 bytes: aaaabbbb bbcccccc */
		s[0] = (char)(0xE0 | ((r & 0x00F000) >> 12)); /* 1110aaaa */
		s[1] = (char)(0x80 | ((r & 0x000FC0) >> 6));  /* 10bbbbbb */
		s[2] = (char)(0x80 | (r & 0x00003F));         /* 10cccccc */
		return 3;
	} else {
		/* 4 bytes: 000aaabb bbbbcccc ccdddddd */
		s[0] = (char)(0xF0 | ((r & 0x1C0000) >> 18)); /* 11110aaa */
		s[1] = (char)(0x80 | ((r & 0x03F000) >> 12)); /* 10bbbbbb */
		s[2] = (char)(0x80 | ((r & 0x000FC0) >> 6));  /* 10cccccc */
		s[3] = (char)(0x80 | (r & 0x00003F));         /* 10dddddd */
		return 4;
	}
}
245
/* Convert one of the predefined named entities (amp, lt, gt, apos, quot;
 * all-lowercase or all-uppercase spelling) to the character it stands for.
 * e is the complete entity text including the leading '&' and the trailing
 * ';'. On a match the character and a NUL terminator are stored in buf.
 * Returns 1 on a match, 0 when e is not a known entity and -1 when buf
 * (bufsiz bytes) is too small to hold the result. */
static int
namedentitytostr(const char *e, char *buf, size_t bufsiz)
{
	static const struct {
		const char *entity;
		int c;
	} entities[] = {
		{ "&amp;",  '&'  },
		{ "&lt;",   '<'  },
		{ "&gt;",   '>'  },
		{ "&apos;", '\'' },
		{ "&quot;", '"'  },
		{ "&AMP;",  '&'  },
		{ "&LT;",   '<'  },
		{ "&GT;",   '>'  },
		{ "&APOS;", '\'' },
		{ "&QUOT;", '"'  }
	};
	size_t i;

	/* buffer is too small */
	if (bufsiz < 2)
		return -1;

	/* doesn't start with &: can't match */
	if (*e != '&')
		return 0;

	for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
		if (!strcmp(e, entities[i].entity)) {
			buf[0] = entities[i].c;
			buf[1] = '\0';
			return 1;
		}
	}
	return 0;
}
283
/* Convert a numeric character reference ("&#NNN;" decimal or "&#xHHH;"
 * hexadecimal) to its UTF-8 byte sequence.
 * e is the complete entity text including the leading "&#" and the trailing
 * ';'. On success the bytes followed by a NUL terminator are stored in buf.
 * Returns the number of bytes stored (without the terminator), 0 when e is
 * not a well-formed reference to an encodable code point (this includes the
 * NUL code point), and -1 when buf (bufsiz bytes) is smaller than 5 bytes. */
static int
numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
	long l;
	int len, base = 10;
	char *end;

	/* buffer is too small */
	if (bufsiz < 5)
		return -1;

	/* not a numeric entity */
	if (e[0] != '&' || e[1] != '#')
		return 0;

	e += 2; /* skip "&#" */
	/* hex (16) or decimal (10) */
	if (*e == 'x') {
		e++;
		base = 16;
	}

	/* strtol() rather than strtoul(): the result is kept in a signed long,
	 * so a negative or out-of-range input must be visible as such instead
	 * of wrapping around to a seemingly valid value */
	errno = 0;
	l = strtol(e, &end, base);

	/* invalid value, no digits at all, not terminated by ';', or not a
	 * code point that can be encoded (negative, too high, surrogate) */
	if (errno || end == e || *end != ';' || l < 0 || l > 0x10FFFF ||
	    (l >= 0xD800 && l <= 0xDFFF))
		return 0;

	len = codepointtoutf8(l, buf);
	buf[len] = '\0';

	return len;
}
315
/* Convert an entity string (named such as "&amp;" or numeric such as
 * "&#38;") to the string it represents, stored in buf.
 * Returns the byte-length of the stored string, 0 when e does not start
 * with '&' (otherwise the result of the named or numeric converter), and -1
 * when bufsiz is smaller than 5. */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
	/* room for the longest result is required up front */
	if (bufsiz < 5)
		return -1;

	/* anything not starting with & is not an entity */
	if (e[0] != '&')
		return 0;

	/* "&#..." is a numeric reference, every other "&..." a named entity */
	return (e[1] == '#') ? numericentitytostr(e, buf, bufsiz)
	                     : namedentitytostr(e, buf, bufsiz);
}
333
334 void
335 xml_parse(XMLParser *x)
336 {
337 int c, ispi;
338 size_t datalen, tagdatalen, taglen;
339
340 if (!x->getnext)
341 return;
342 while ((c = x->getnext()) != EOF && c != '<')
343 ; /* skip until < */
344
345 while (c != EOF) {
346 if (c == '<') { /* parse tag */
347 if ((c = x->getnext()) == EOF)
348 return;
349
350 if (c == '!') { /* cdata and comments */
351 for (tagdatalen = 0; (c = x->getnext()) != EOF;) {
352 /* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */
353 if (tagdatalen <= sizeof("[CDATA[") - 1)
354 x->data[tagdatalen++] = c;
355 if (c == '>')
356 break;
357 else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
358 (x->data[0] == '-')) {
359 xml_parsecomment(x);
360 break;
361 } else if (c == '[') {
362 if (tagdatalen == sizeof("[CDATA[") - 1 &&
363 !strncmp(x->data, "[CDATA[", tagdatalen)) {
364 xml_parsecdata(x);
365 break;
366 }
367 }
368 }
369 } else {
370 x->tag[0] = '\0';
371 x->taglen = 0;
372
373 /* normal tag (open, short open, close), processing instruction. */
374 if (isspace(c))
375 while ((c = x->getnext()) != EOF && isspace(c))
376 ;
377 if (c == EOF)
378 return;
379 x->tag[0] = c;
380 ispi = (c == '?') ? 1 : 0;
381 x->isshorttag = ispi;
382 taglen = 1;
383 while ((c = x->getnext()) != EOF) {
384 if (c == '/')
385 x->isshorttag = 1; /* short tag */
386 else if (c == '>' || isspace(c)) {
387 x->tag[taglen] = '\0';
388 if (x->tag[0] == '/') { /* end tag, starts with </ */
389 x->taglen = --taglen; /* len -1 because of / */
390 if (taglen && x->xmltagend)
391 x->xmltagend(x, &(x->tag)[1], x->taglen, 0);
392 } else {
393 x->taglen = taglen;
394 /* start tag */
395 if (x->xmltagstart)
396 x->xmltagstart(x, x->tag, x->taglen);
397 if (isspace(c))
398 xml_parseattrs(x);
399 if (x->xmltagstartparsed)
400 x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
401 }
402 /* call tagend for shortform or processing instruction */
403 if ((x->isshorttag || ispi) && x->xmltagend)
404 x->xmltagend(x, x->tag, x->taglen, 1);
405 break;
406 } else if (taglen < sizeof(x->tag) - 1)
407 x->tag[taglen++] = c; /* NOTE: tag name truncation */
408 }
409 }
410 } else {
411 /* parse tag data */
412 datalen = 0;
413 if (x->xmldatastart)
414 x->xmldatastart(x);
415 while ((c = x->getnext()) != EOF) {
416 if (c == '&') {
417 if (datalen) {
418 x->data[datalen] = '\0';
419 if (x->xmldata)
420 x->xmldata(x, x->data, datalen);
421 }
422 x->data[0] = c;
423 datalen = 1;
424 while ((c = x->getnext()) != EOF) {
425 if (c == '<')
426 break;
427 if (datalen < sizeof(x->data) - 1)
428 x->data[datalen++] = c;
429 else {
430 /* entity too long for buffer, handle as normal data */
431 x->data[datalen] = '\0';
432 if (x->xmldata)
433 x->xmldata(x, x->data, datalen);
434 x->data[0] = c;
435 datalen = 1;
436 break;
437 }
438 if (c == ';') {
439 x->data[datalen] = '\0';
440 if (x->xmldataentity)
441 x->xmldataentity(x, x->data, datalen);
442 datalen = 0;
443 break;
444 }
445 }
446 } else if (c != '<') {
447 if (datalen < sizeof(x->data) - 1) {
448 x->data[datalen++] = c;
449 } else {
450 x->data[datalen] = '\0';
451 if (x->xmldata)
452 x->xmldata(x, x->data, datalen);
453 x->data[0] = c;
454 datalen = 1;
455 }
456 }
457 if (c == '<') {
458 x->data[datalen] = '\0';
459 if (x->xmldata && datalen)
460 x->xmldata(x, x->data, datalen);
461 if (x->xmldataend)
462 x->xmldataend(x);
463 break;
464 }
465 }
466 }
467 }
468 }