checkhtml.c - randomcrap - random crap programs of varying quality
(HTM) git clone git://git.codemadness.org/randomcrap
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
checkhtml.c (10551B)
---
1 /*
2 Do some checks on XHTML and HTML, with some extra strict rules applied:
3 - Checks unclosed/unbalanced tags.
4 - It does not check all HTML named entities (there are many).
5
6 Examples:
7
8 Check a whole directory of HTML files:
9
10 for f in *.html; do checkhtml < "$f"; done
11
12 Check a single XHTML file for errors:
13
14 checkhtml -x < somefile.html
15 */
16
17 #include <ctype.h>
18 #include <errno.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22
23 #ifdef __OpenBSD__
24 #include <unistd.h>
25 #else
26 #define pledge(p1,p2) 0
27 #endif
28
/* maximum number of simultaneously open tags that can be tracked */
#define MAX_DEPTH 256

/*
 * Stack of currently open tags; `depth` indexes the next free slot.
 *
 * tag      - name of the opened tag. Sized like the parser's own tag buffer
 *            (XMLParser.tag, 1024 bytes): with a smaller buffer the stored
 *            open-tag name would be truncated and a closing tag with a long
 *            name would never compare equal to it, producing a bogus
 *            "unbalanced tag" error.
 * linechar - column position recorded when the tag was opened.
 * linenr   - line number recorded when the tag was opened.
 */
static struct {
	char tag[1024];
	size_t linechar;
	size_t linenr;
} nodes[MAX_DEPTH];
35
/* depth: number of open tags currently stored in nodes[].
 * linechar, linenr: current input position (both start at 1), maintained by
 * getnext() for every character read. */
static size_t depth, linechar = 1, linenr = 1;
/* checkxhtml: set to 1 by the -x option, disables the autoclose exception.
 * exitstatus: process exit status, set to 1 when an invalid entity is reported. */
static int checkxhtml, exitstatus;
38
/* Tags that may be left unclosed and are closed automatically.
 * When not checking XHTML (-x not given), xmltagstartparsed() does not push
 * these names on the open-tag stack. The comparison there is an exact
 * strcmp(), so only these lowercase spellings match. */
static const char *autoclose[] = {
	"area", "base", "br", "col", "embed", "hr", "img", "input", "link",
	"meta", "param", "source", "track", "wbr"
};
44
45 int
46 getnext(void)
47 {
48 int c;
49
50 if ((c = getchar()) == '\n') {
51 linechar = 1;
52 linenr++;
53 } else {
54 linechar++;
55 }
56
57 return c;
58 }
59 #define GETNEXT getnext
60
/* Parser context: optional callbacks plus the scratch buffers the parser
 * functions fill while reading input through GETNEXT(). */
typedef struct xmlparser {
	/* handlers: each one is only called when it is non-NULL */
	/* entity inside an attribute value: (parser, tag, taglen, attribute
	 * name, namelen, entity text starting with '&' and ending in ';', length) */
	void (*xmlattrentity)(struct xmlparser *, const char *, size_t,
	      const char *, size_t, const char *, size_t);
	/* entity inside tag data: (parser, entity text starting with '&' and
	 * ending in ';', length) */
	void (*xmldataentity)(struct xmlparser *, const char *, size_t);
	/* end of a tag: (parser, tag, taglen, isshorttag) */
	void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
	/* start tag including its attributes has been parsed:
	 * (parser, tag, taglen, isshorttag) */
	void (*xmltagstartparsed)(struct xmlparser *, const char *,
	      size_t, int);

	/* current tag; longer names are truncated to fit */
	char tag[1024];
	size_t taglen;
	/* current tag is in short form ? <tag /> */
	int isshorttag;
	/* current attribute name */
	char name[1024];
	/* data buffer used for tag data, cdata and attribute data */
	char data[BUFSIZ];
} XMLParser;
80
/*
 * Parse the attributes of the current start tag, reading via GETNEXT() until
 * the closing '>' or EOF.
 *
 * Attribute names are collected in x->name and values in x->data. The only
 * callback made from here is x->xmlattrentity, invoked for each "&...;"
 * sequence found inside an attribute value. A '/' seen outside a value marks
 * the tag as a short tag (x->isshorttag = 1).
 */
void
xml_parseattrs(XMLParser *x)
{
	size_t namelen = 0, valuelen;
	/* endname: whitespace or '=' was seen after the name;
	 * valuestart: '=' was seen, the next non-space character starts the value */
	int c, endsep, endname = 0, valuestart = 0;

	while ((c = GETNEXT()) != EOF) {
		if (isspace(c)) {
			if (namelen)
				endname = 1;
			continue;
		} else if (c == '?')
			; /* ignore */
		else if (c == '=') {
			x->name[namelen] = '\0';
			valuestart = 1;
			endname = 1;
		} else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) {
			/* attribute without value */
			x->name[namelen] = '\0';
			endname = 0;
			/* the current character becomes the first character of the next name */
			x->name[0] = c;
			namelen = 1;
		} else if (namelen && valuestart) {
			/* attribute with value */
			valuelen = 0;
			if (c == '\'' || c == '"') {
				endsep = c;
			} else {
				/* unquoted value: c already is its first character, so
				 * jump into the loop body without reading another one */
				endsep = ' '; /* isspace() */
				goto startvalue;
			}

			while ((c = GETNEXT()) != EOF) {
startvalue:
				if (c == '&') { /* entities */
					/* restart the buffer so it holds only the entity text */
					x->data[valuelen] = '\0';
					x->data[0] = c;
					valuelen = 1;
					while ((c = GETNEXT()) != EOF) {
						/* the value ended before the entity was terminated by ';' */
						if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c))))
							break;
						if (valuelen < sizeof(x->data) - 1)
							x->data[valuelen++] = c;
						else {
							/* entity too long for buffer, handle as normal data */
							x->data[valuelen] = '\0';
							x->data[0] = c;
							valuelen = 1;
							break;
						}
						if (c == ';') {
							/* complete "&...;" sequence: hand it to the handler */
							x->data[valuelen] = '\0';
							if (x->xmlattrentity)
								x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
							valuelen = 0;
							break;
						}
					}
				} else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) {
					/* ordinary value character; when the buffer is full it
					 * is restarted with this character */
					if (valuelen < sizeof(x->data) - 1) {
						x->data[valuelen++] = c;
					} else {
						x->data[valuelen] = '\0';
						x->data[0] = c;
						valuelen = 1;
					}
				}
				/* end of value: closing quote, or for an unquoted value
				 * whitespace or '>' */
				if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) {
					x->data[valuelen] = '\0';
					break;
				}
			}
			/* reset the state for the next attribute */
			namelen = endname = valuestart = 0;
		} else if (namelen < sizeof(x->name) - 1) {
			/* part of an attribute name; excess characters are dropped */
			x->name[namelen++] = c;
		}
		if (c == '>') {
			/* end of the tag */
			break;
		} else if (c == '/') {
			x->isshorttag = 1;
			x->name[0] = '\0';
			namelen = 0;
		}
	}
}
167
168 void
169 xml_parsecomment(XMLParser *x)
170 {
171 size_t i = 0;
172 int c;
173
174 while ((c = GETNEXT()) != EOF) {
175 if (c == '-') {
176 if (++i > 2) {
177 i = 2;
178 }
179 continue;
180 } else if (c == '>' && i == 2) {
181 return;
182 } else if (i) {
183 i = 0;
184 }
185 }
186 }
187
188 void
189 xml_parsecdata(XMLParser *x)
190 {
191 size_t i = 0;
192 int c;
193
194 while ((c = GETNEXT()) != EOF) {
195 if (c == ']') {
196 if (++i > 2) {
197 i = 2;
198 }
199 continue;
200 } else if (c == '>' && i == 2) {
201 return;
202 } else if (i) {
203 i = 0;
204 }
205 }
206 }
207
/*
 * Check a named entity against the small built-in list.
 *
 * e is the entity text without the leading '&' but including the trailing
 * ';' (for example "amp;"). The comparison is exact and case-sensitive.
 * Returns 1 when the name is in the list, 0 otherwise.
 */
int
checknamedentity(const char *e)
{
	static const char *known[] = {
		"amp;", "lt;", "gt;", "apos;", "quot;", "nbsp;", "copy;",
		"ndash;", "euro;", "dollar;", "yen;"
	};
	const char **p;
	const char **last = known + sizeof(known) / sizeof(known[0]);

	for (p = known; p != last; p++) {
		if (strcmp(e, *p) == 0)
			return 1;
	}

	return 0;
}
223
/*
 * Check a numeric character reference.
 *
 * e is the text after "&#": either decimal digits followed by ';' or an 'x'
 * followed by hexadecimal digits and ';' (for example "65;" or "x41;").
 * Returns 1 when it is well-formed and the codepoint is at most 0x10FFFF,
 * 0 otherwise.
 */
int
checknumericentity(const char *e)
{
	const char *s;
	char *end;
	long l;
	int base = 10;

	/* hex (16) or decimal (10) */
	if (*e == 'x') {
		base = 16;
		e++;
	}

	/* strtol() on its own skips leading whitespace, accepts a sign and, in
	 * base 16, a "0x" prefix. None of those are allowed in an entity, so
	 * require a non-empty run of digits directly followed by ';'. */
	for (s = e; base == 16 ? isxdigit((unsigned char)*s) :
	                         isdigit((unsigned char)*s); s++)
		;
	if (s == e || *s != ';')
		return 0;

	errno = 0;
	l = strtol(e, &end, base);
	/* out of range value, not fully consumed or too high codepoint */
	if (errno || end != s || l < 0 || l > 0x10FFFF)
		return 0;
	return 1;
}
241
/* Check an entity string of the form "&...;": dispatches to the numeric
 * check when '#' follows the '&', otherwise to the named entity check.
 * Returns 1 when the entity is accepted, 0 otherwise (also when the string
 * does not start with '&'). */
int
checkentity(const char *e)
{
	if (*e != '&')
		return 0;

	e++; /* skip '&' */
	return (*e == '#') ? checknumericentity(e + 1) : checknamedentity(e);
}
256
/*
 * Main parser loop: reads the whole input through GETNEXT() until EOF.
 *
 * Input before the first '<' is skipped. After that the loop alternates
 * between parsing a tag (comment, CDATA, processing instruction, start, end
 * or short tag) and parsing the data that follows it. Callbacks made from
 * here, each only when set: x->xmltagstartparsed, x->xmltagend and
 * x->xmldataentity. Attribute parsing is delegated to xml_parseattrs().
 */
void
xml_parse(XMLParser *x)
{
	size_t datalen, tagdatalen;
	int c, isend;

	while ((c = GETNEXT()) != EOF && c != '<')
		; /* skip until < */

	while (c != EOF) {
		if (c == '<') { /* parse tag */
			if ((c = GETNEXT()) == EOF)
				return;

			if (c == '!') { /* cdata and comments */
				/* buffer the first characters after "<!" to recognize
				 * "--" (comment) or "[CDATA[" */
				for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
					/* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
					if (tagdatalen <= sizeof("[CDATA[") - 1)
						x->data[tagdatalen++] = c;
					if (c == '>')
						break;
					else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
					         (x->data[0] == '-')) {
						/* "<!--": skip to the end of the comment */
						xml_parsecomment(x);
						break;
					} else if (c == '[') {
						if (tagdatalen == sizeof("[CDATA[") - 1 &&
						    !strncmp(x->data, "[CDATA[", tagdatalen)) {
							/* "<![CDATA[": skip to the end of the section */
							xml_parsecdata(x);
							break;
						}
					}
				}
			} else {
				/* normal tag (open, short open, close), processing instruction. */
				x->tag[0] = c;
				x->taglen = 1;
				x->isshorttag = isend = 0;

				/* treat processing instruction as shorttag, don't strip "?" prefix. */
				if (c == '?') {
					x->isshorttag = 1;
				} else if (c == '/') {
					/* end tag: the name starts after the '/' */
					if ((c = GETNEXT()) == EOF)
						return;
					x->tag[0] = c;
					isend = 1;
				}

				while ((c = GETNEXT()) != EOF) {
					if (c == '/')
						x->isshorttag = 1; /* short tag */
					else if (c == '>' || isspace(c)) {
						/* tag name is complete */
						x->tag[x->taglen] = '\0';
						if (isend) { /* end tag, starts with </ */
							if (x->xmltagend)
								x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
							x->tag[0] = '\0';
							x->taglen = 0;
						} else {
							/* start tag */
							/* whitespace after the name: attributes may
							 * follow, parse up to the closing '>' */
							if (isspace(c))
								xml_parseattrs(x);
							if (x->xmltagstartparsed)
								x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
						}
						/* call tagend for shortform or processing instruction */
						if (x->isshorttag) {
							if (x->xmltagend)
								x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
							x->tag[0] = '\0';
							x->taglen = 0;
						}
						break;
					} else if (x->taglen < sizeof(x->tag) - 1)
						x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
				}
			}
		} else {
			/* parse tag data */
			datalen = 0;
			while ((c = GETNEXT()) != EOF) {
				if (c == '&') {
					if (datalen) {
						x->data[datalen] = '\0';
					}
					/* restart the buffer so it holds only the entity text */
					x->data[0] = c;
					datalen = 1;
					while ((c = GETNEXT()) != EOF) {
						/* a new tag starts before the entity was terminated by ';' */
						if (c == '<')
							break;
						if (datalen < sizeof(x->data) - 1)
							x->data[datalen++] = c;
						else {
							/* entity too long for buffer, handle as normal data */
							x->data[datalen] = '\0';
							x->data[0] = c;
							datalen = 1;
							break;
						}
						if (c == ';') {
							/* complete "&...;" sequence: hand it to the handler */
							x->data[datalen] = '\0';
							if (x->xmldataentity)
								x->xmldataentity(x, x->data, datalen);
							datalen = 0;
							break;
						}
					}
				} else if (c != '<') {
					/* ordinary data character; when the buffer is full it
					 * is restarted with this character */
					if (datalen < sizeof(x->data) - 1) {
						x->data[datalen++] = c;
					} else {
						x->data[datalen] = '\0';
						x->data[0] = c;
						datalen = 1;
					}
				}
				if (c == '<') {
					/* end of data: the outer loop parses the tag next */
					x->data[datalen] = '\0';
					break;
				}
			}
		}
	}
}
382
383 void
384 xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
385 const char *v, size_t vl)
386 {
387 if (!checkentity(v)) {
388 printf("%zu:%zu: invalid entity in attribute: %s %s %s\n",
389 linenr, linechar, t, a, v);
390 exitstatus = 1;
391 }
392 }
393
394 void
395 xmldataentity(XMLParser *x, const char *d, size_t dl)
396 {
397 if (!checkentity(d)) {
398 printf("%zu:%zu: invalid entity: %s\n", linenr, linechar, d);
399 exitstatus = 1;
400 }
401 }
402
403 void
404 xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
405 {
406 if (isshort)
407 return;
408
409 if (depth)
410 depth--;
411 else
412 goto unbalanced;
413
414 if (nodes[depth].tag[0] && strcmp(t, nodes[depth].tag))
415 goto unbalanced;
416
417 memset(&nodes[depth], 0, sizeof(nodes[0]));
418 return;
419
420 unbalanced:
421 printf("%zu:%zu: unbalanced tag %s, expected: </%s> (ends at %zu:%zu)\n",
422 nodes[depth].linenr, nodes[depth].linechar, t, nodes[depth].tag,
423 linenr, linechar);
424 exit(1);
425 }
426
427 void
428 xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
429 {
430 int i;
431
432 if (isshort) {
433 return;
434 } else if (!checkxhtml) {
435 /* HTML has tag that don't need to be closed, allow this. */
436 for (i = 0; i < sizeof(autoclose) / sizeof(autoclose[0]); i++)
437 if (!strcmp(t, autoclose[i]))
438 return;
439 }
440
441 if (depth + 1 >= MAX_DEPTH) {
442 printf("%zu:%zu: too deep >= %d\n", linenr, linechar, MAX_DEPTH);
443 exit(1);
444 }
445 snprintf(nodes[depth].tag, sizeof(nodes[0].tag), "%s", t);
446 nodes[depth].linenr = linenr;
447 nodes[depth].linechar = linechar;
448 depth++;
449 }
450
/* Print the usage line for program name argv0 to stderr and exit with
 * status 1; does not return. */
void
usage(const char *argv0)
{
	FILE *out = stderr;

	fprintf(out, "usage: %s [-x]\n", argv0);
	exit(1);
}
457
458 int
459 main(int argc, char *argv[])
460 {
461 XMLParser x = { 0 };
462
463 if (pledge("stdio", NULL) == -1) {
464 perror("pledge");
465 exit(1);
466 }
467
468 if (argc > 1) {
469 if (argc == 2 && !strcmp(argv[1], "-x"))
470 checkxhtml = 1;
471 else
472 usage(argv[0]);
473 }
474
475 x.xmlattrentity = xmlattrentity;
476 x.xmldataentity = xmldataentity;
477 x.xmltagend = xmltagend;
478 x.xmltagstartparsed = xmltagstartparsed;
479 xml_parse(&x);
480
481 if (depth) {
482 printf("%zu:%zu: unbalanced: %s\n",
483 nodes[depth - 1].linenr, nodes[depth - 1].linechar,
484 nodes[depth - 1].tag);
485 exit(1);
486 }
487
488 return exitstatus;
489 }