xml.c - grabtitle - stupid HTML title grabber
(HTM) git clone git://git.codemadness.org/grabtitle
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
xml.c (8078B)
---
1 #include <errno.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5
6 #include "xml.h"
7
/*
 * ASCII-only character classification (locale independent).
 *
 * ISALPHA: true for 'A'-'Z' and 'a'-'z'. OR-ing with 32 folds upper case onto
 *          lower case; the unsigned subtraction makes everything below 'a'
 *          wrap around to a huge value so one comparison suffices.
 * ISSPACE: true for ' ' and for '\t' through '\r' (the 5 values 9..13).
 *
 * The argument is fully parenthesized so that compound expressions are cast
 * as a whole. NOTE: ISSPACE evaluates its argument twice; do not pass an
 * expression with side effects.
 */
#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
10
11 static void
12 xml_parseattrs(XMLParser *x)
13 {
14 size_t namelen = 0;
15 int c, endsep, endname = 0, valuestart = 0;
16
17 while ((c = GETNEXT()) != EOF) {
18 if (ISSPACE(c)) {
19 if (namelen)
20 endname = 1;
21 continue;
22 } else if (c == '?')
23 ; /* ignore */
24 else if (c == '=') {
25 valuestart = 1;
26 endname = 1;
27 } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
28 endname = 0;
29 namelen = 1;
30 } else if (namelen && valuestart) {
31 /* attribute with value */
32 if (c == '\'' || c == '"') {
33 endsep = c;
34 while ((c = GETNEXT()) != EOF) {
35 if (c == endsep)
36 break;
37 }
38 } else {
39 while ((c = GETNEXT()) != EOF) {
40 if (c == '>' || ISSPACE(c))
41 break;
42 }
43 }
44 namelen = endname = valuestart = 0;
45 } else {
46 namelen = 1;
47 }
48 if (c == '>') {
49 break;
50 } else if (c == '/') {
51 x->isshorttag = 1;
52 namelen = 0;
53 }
54 }
55 }
56
57 static void
58 xml_parsecomment(XMLParser *x)
59 {
60 size_t i = 0;
61 int c;
62
63 while ((c = GETNEXT()) != EOF) {
64 if (c == '-') {
65 if (i < 2)
66 i++;
67 } else if (c == '>' && i == 2) {
68 return;
69 } else {
70 i = 0;
71 }
72 }
73 }
74
75 static void
76 xml_parsecdata(XMLParser *x)
77 {
78 size_t datalen = 0, i = 0;
79 int c;
80
81 while ((c = GETNEXT()) != EOF) {
82 if (c == ']' || c == '>') {
83 if (x->xmlcdata) {
84 x->data[datalen] = '\0';
85 x->xmlcdata(x, x->data, datalen);
86 datalen = 0;
87 }
88 }
89
90 if (c == ']') {
91 if (++i > 2) {
92 if (x->xmlcdata)
93 for (; i > 2; i--)
94 x->xmlcdata(x, "]", 1);
95 i = 2;
96 }
97 continue;
98 } else if (c == '>' && i == 2) {
99 return;
100 } else {
101 if (x->xmlcdata)
102 for (; i > 0; i--)
103 x->xmlcdata(x, "]", 1);
104 i = 0;
105 }
106
107 if (datalen < sizeof(x->data) - 1) {
108 x->data[datalen++] = c;
109 } else {
110 x->data[datalen] = '\0';
111 if (x->xmlcdata)
112 x->xmlcdata(x, x->data, datalen);
113 x->data[0] = c;
114 datalen = 1;
115 }
116 }
117 }
118
/*
 * Encode code point r as UTF-8 into s (not NUL-terminated).
 * Returns the number of bytes written: 0 for r == 0 (nothing is written),
 * otherwise 1 to 4. s must have room for 4 bytes.
 */
static int
codepointtoutf8(long r, char *s)
{
	if (r == 0)
		return 0; /* NUL byte */

	if (r <= 0x7F) {
		/* 0aaaaaaa */
		s[0] = r;
		return 1;
	}

	if (r <= 0x07FF) {
		/* 110aaaaa 10bbbbbb */
		s[0] = 0xC0 | ((r >> 6) & 0x1F);
		s[1] = 0x80 | (r & 0x3F);
		return 2;
	}

	if (r <= 0xFFFF) {
		/* 1110aaaa 10bbbbbb 10cccccc */
		s[0] = 0xE0 | ((r >> 12) & 0x0F);
		s[1] = 0x80 | ((r >> 6) & 0x3F);
		s[2] = 0x80 | (r & 0x3F);
		return 3;
	}

	/* 11110aaa 10bbbbbb 10cccccc 10dddddd */
	s[0] = 0xF0 | ((r >> 18) & 0x07);
	s[1] = 0x80 | ((r >> 12) & 0x3F);
	s[2] = 0x80 | ((r >> 6) & 0x3F);
	s[3] = 0x80 | (r & 0x3F);
	return 4;
}
148
/* One row of the named entity table: entity name and the code point it maps to. */
struct namedentity {
	const char *entity; /* lookup key, compared with strcmp() */
	long        cp;     /* code point, encoded with codepointtoutf8() */
};
153
154 int
155 namedentitycmp(const void *v1, const void *v2)
156 {
157 struct namedentity *n1 = (struct namedentity *)v1;
158 struct namedentity *n2 = (struct namedentity *)v2;
159
160 return strcmp(n1->entity, n2->entity);
161 }
162
163 static int
164 namedentitytostr(const char *e, char *buf, size_t bufsiz)
165 {
166 static const struct namedentity entities[] = {
167 #include "namedentities.h"
168 };
169 struct namedentity find, *found;
170 size_t i;
171
172 /* buffer is too small */
173 if (bufsiz < 5)
174 return -1;
175
176 find.entity = e;
177 found = bsearch(&find, entities, sizeof(entities) / sizeof(*entities),
178 sizeof(*entities), namedentitycmp);
179 if (found) {
180 i = codepointtoutf8(found->cp, buf);
181 buf[i] = '\0';
182 return i;
183 }
184 return -1;
185 }
186
/*
 * Convert a numeric character reference to UTF-8. e points just past "&#":
 * either decimal digits or 'x' followed by hex digits, terminated by ';'.
 * The NUL-terminated result is written to buf.
 * Returns the byte length written (excluding the NUL), or -1 when bufsiz is
 * below 5, the number is malformed, or the value is not a valid code point
 * (negative, above 0x10ffff, or in the surrogate range 0xd800-0xdfff).
 */
static int
numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
	const char *digits = e;
	char *end;
	long cp;
	int base = 10, n;

	/* buffer is too small */
	if (bufsiz < 5)
		return -1;

	/* an 'x' prefix selects hexadecimal */
	if (*digits == 'x') {
		digits++;
		base = 16;
	}

	errno = 0;
	cp = strtol(digits, &end, base);

	/* conversion error, no digits, or missing ';' terminator */
	if (errno || end == digits || *end != ';')
		return -1;
	/* out of range or surrogate: not a valid code point */
	if (cp < 0 || cp > 0x10ffff || (cp >= 0xd800 && cp <= 0xdfff))
		return -1;

	n = codepointtoutf8(cp, buf);
	buf[n] = '\0';

	return n;
}
213
/*
 * Convert a named or numeric entity string (starting with '&') to its UTF-8
 * representation in buf. "&#..." is handled as a numeric entity, anything
 * else as a named entity.
 * Returns the byte length of the resulting string or -1 on failure.
 */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
	/* doesn't start with & */
	if (e[0] != '&')
		return -1;

	return (e[1] == '#')
	    ? numericentitytostr(e + 2, buf, bufsiz) /* numeric entity */
	    : namedentitytostr(e + 1, buf, bufsiz);  /* named entity */
}
228
229 void
230 xml_parse(XMLParser *x)
231 {
232 size_t datalen, tagdatalen;
233 int c, isend;
234
235 while ((c = GETNEXT()) != EOF && c != '<')
236 ; /* skip until < */
237
238 while (c != EOF) {
239 if (c == '<') { /* parse tag */
240 if ((c = GETNEXT()) == EOF)
241 return;
242
243 if (c == '!') { /* CDATA and comments */
244 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
245 /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
246 if (tagdatalen <= sizeof("[CDATA[") - 1)
247 x->data[tagdatalen++] = c;
248 if (c == '>')
249 break;
250 else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
251 (x->data[0] == '-')) {
252 xml_parsecomment(x);
253 break;
254 } else if (c == '[') {
255 if (tagdatalen == sizeof("[CDATA[") - 1 &&
256 !strncmp(x->data, "[CDATA[", tagdatalen)) {
257 xml_parsecdata(x);
258 break;
259 }
260 }
261 }
262 } else {
263 /* normal tag (open, short open, close), processing instruction. */
264 x->tag[0] = c;
265 x->taglen = 1;
266 x->isshorttag = isend = 0;
267
268 /* treat processing instruction as short tag, don't strip "?" prefix. */
269 if (c == '?') {
270 x->isshorttag = 1;
271 } else if (c == '/') {
272 if ((c = GETNEXT()) == EOF)
273 return;
274 x->tag[0] = c;
275 isend = 1;
276 }
277
278 while ((c = GETNEXT()) != EOF) {
279 if (c == '/')
280 x->isshorttag = 1; /* short tag */
281 else if (c == '>' || ISSPACE(c)) {
282 x->tag[x->taglen] = '\0';
283 if (isend) { /* end tag, starts with </ */
284 while (c != '>' && c != EOF) /* skip until > */
285 c = GETNEXT();
286 if (x->xmltagend)
287 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
288 x->tag[0] = '\0';
289 x->taglen = 0;
290 } else {
291 /* start tag */
292 if (x->xmltagstart)
293 x->xmltagstart(x, x->tag, x->taglen);
294 if (ISSPACE(c))
295 xml_parseattrs(x);
296 }
297 /* call tagend for short tag or processing instruction */
298 if (x->isshorttag) {
299 if (x->xmltagend)
300 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
301 x->tag[0] = '\0';
302 x->taglen = 0;
303 }
304 break;
305 } else if (x->taglen < sizeof(x->tag) - 1)
306 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
307 }
308 }
309 } else {
310 /* parse tag data */
311 datalen = 0;
312 while ((c = GETNEXT()) != EOF) {
313 if (c == '&') {
314 if (datalen) {
315 x->data[datalen] = '\0';
316 if (x->xmldata)
317 x->xmldata(x, x->data, datalen);
318 }
319 x->data[0] = c;
320 datalen = 1;
321 while ((c = GETNEXT()) != EOF) {
322 if (c == '<')
323 break;
324 if (datalen < sizeof(x->data) - 1)
325 x->data[datalen++] = c;
326 else {
327 /* entity too long for buffer, handle as normal data */
328 x->data[datalen] = '\0';
329 if (x->xmldata)
330 x->xmldata(x, x->data, datalen);
331 x->data[0] = c;
332 datalen = 1;
333 break;
334 }
335 if (c == ';') {
336 x->data[datalen] = '\0';
337 if (x->xmldataentity)
338 x->xmldataentity(x, x->data, datalen);
339 datalen = 0;
340 break;
341 }
342 }
343 } else if (c != '<') {
344 if (datalen < sizeof(x->data) - 1) {
345 x->data[datalen++] = c;
346 } else {
347 x->data[datalen] = '\0';
348 if (x->xmldata)
349 x->xmldata(x, x->data, datalen);
350 x->data[0] = c;
351 datalen = 1;
352 }
353 }
354 if (c == '<') {
355 x->data[datalen] = '\0';
356 if (x->xmldata && datalen)
357 x->xmldata(x, x->data, datalen);
358 break;
359 }
360 }
361 }
362 }
363 }