checkhtml.c - randomcrap - random crap programs of varying quality
 (HTM) git clone git://git.codemadness.org/randomcrap
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       checkhtml.c (10551B)
       ---
            1 /*
            2 Do some checks on XHTML and HTML, with some extra strict rules applied:
            3 - Checks unclosed/unbalanced tags.
            4 - It does not check all HTML named entities (there are many).
            5 
            6 Examples:
            7 
            8 Check a whole directory of HTML files:
            9 
           10         for f in *.html; do checkhtml < $f; done
           11 
           12 Check a single XHTML file for errors:
           13 
           14         checkhtml -x < somefile.html
           15 */
           16 
           17 #include <ctype.h>
           18 #include <errno.h>
           19 #include <stdio.h>
           20 #include <stdlib.h>
           21 #include <string.h>
           22 
           23 #ifdef __OpenBSD__
           24 #include <unistd.h>
           25 #else
           26 #define pledge(p1,p2) 0
           27 #endif
           28 
           29 #define MAX_DEPTH 256
           30 static struct {
           31         char tag[256];
           32         size_t linechar;
           33         size_t linenr;
           34 } nodes[MAX_DEPTH];
           35 
           36 static size_t depth, linechar = 1, linenr = 1;
           37 static int checkxhtml, exitstatus;
           38 
           39 /* tags that may be unclosed and are closed automatically */
           40 static const char *autoclose[] = {
           41         "area", "base", "br", "col", "embed", "hr", "img", "input", "link",
           42         "meta", "param", "source", "track", "wbr"
           43 };
           44 
           45 int
           46 getnext(void)
           47 {
           48         int c;
           49 
           50         if ((c = getchar()) == '\n') {
           51                 linechar = 1;
           52                 linenr++;
           53         } else {
           54                 linechar++;
           55         }
           56 
           57         return c;
           58 }
           59 #define GETNEXT getnext
           60 
           61 typedef struct xmlparser {
           62         /* handlers */
           63         void (*xmlattrentity)(struct xmlparser *, const char *, size_t,
           64               const char *, size_t, const char *, size_t);
           65         void (*xmldataentity)(struct xmlparser *, const char *, size_t);
           66         void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
           67         void (*xmltagstartparsed)(struct xmlparser *, const char *,
           68               size_t, int);
           69 
           70         /* current tag */
           71         char tag[1024];
           72         size_t taglen;
           73         /* current tag is in short form ? <tag /> */
           74         int isshorttag;
           75         /* current attribute name */
           76         char name[1024];
           77         /* data buffer used for tag data, cdata and attribute data */
           78         char data[BUFSIZ];
           79 } XMLParser;
           80 
           81 void
           82 xml_parseattrs(XMLParser *x)
           83 {
           84         size_t namelen = 0, valuelen;
           85         int c, endsep, endname = 0, valuestart = 0;
           86 
           87         while ((c = GETNEXT()) != EOF) {
           88                 if (isspace(c)) {
           89                         if (namelen)
           90                                 endname = 1;
           91                         continue;
           92                 } else if (c == '?')
           93                         ; /* ignore */
           94                 else if (c == '=') {
           95                         x->name[namelen] = '\0';
           96                         valuestart = 1;
           97                         endname = 1;
           98                 } else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) {
           99                         /* attribute without value */
          100                         x->name[namelen] = '\0';
          101                         endname = 0;
          102                         x->name[0] = c;
          103                         namelen = 1;
          104                 } else if (namelen && valuestart) {
          105                         /* attribute with value */
          106                         valuelen = 0;
          107                         if (c == '\'' || c == '"') {
          108                                 endsep = c;
          109                         } else {
          110                                 endsep = ' '; /* isspace() */
          111                                 goto startvalue;
          112                         }
          113 
          114                         while ((c = GETNEXT()) != EOF) {
          115 startvalue:
          116                                 if (c == '&') { /* entities */
          117                                         x->data[valuelen] = '\0';
          118                                         x->data[0] = c;
          119                                         valuelen = 1;
          120                                         while ((c = GETNEXT()) != EOF) {
          121                                                 if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c))))
          122                                                         break;
          123                                                 if (valuelen < sizeof(x->data) - 1)
          124                                                         x->data[valuelen++] = c;
          125                                                 else {
          126                                                         /* entity too long for buffer, handle as normal data */
          127                                                         x->data[valuelen] = '\0';
          128                                                         x->data[0] = c;
          129                                                         valuelen = 1;
          130                                                         break;
          131                                                 }
          132                                                 if (c == ';') {
          133                                                         x->data[valuelen] = '\0';
          134                                                         if (x->xmlattrentity)
          135                                                                 x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          136                                                         valuelen = 0;
          137                                                         break;
          138                                                 }
          139                                         }
          140                                 } else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) {
          141                                         if (valuelen < sizeof(x->data) - 1) {
          142                                                 x->data[valuelen++] = c;
          143                                         } else {
          144                                                 x->data[valuelen] = '\0';
          145                                                 x->data[0] = c;
          146                                                 valuelen = 1;
          147                                         }
          148                                 }
          149                                 if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) {
          150                                         x->data[valuelen] = '\0';
          151                                         break;
          152                                 }
          153                         }
          154                         namelen = endname = valuestart = 0;
          155                 } else if (namelen < sizeof(x->name) - 1) {
          156                         x->name[namelen++] = c;
          157                 }
          158                 if (c == '>') {
          159                         break;
          160                 } else if (c == '/') {
          161                         x->isshorttag = 1;
          162                         x->name[0] = '\0';
          163                         namelen = 0;
          164                 }
          165         }
          166 }
          167 
          168 void
          169 xml_parsecomment(XMLParser *x)
          170 {
          171         size_t i = 0;
          172         int c;
          173 
          174         while ((c = GETNEXT()) != EOF) {
          175                 if (c == '-') {
          176                         if (++i > 2) {
          177                                 i = 2;
          178                         }
          179                         continue;
          180                 } else if (c == '>' && i == 2) {
          181                         return;
          182                 } else if (i) {
          183                         i = 0;
          184                 }
          185         }
          186 }
          187 
          188 void
          189 xml_parsecdata(XMLParser *x)
          190 {
          191         size_t i = 0;
          192         int c;
          193 
          194         while ((c = GETNEXT()) != EOF) {
          195                 if (c == ']') {
          196                         if (++i > 2) {
          197                                 i = 2;
          198                         }
          199                         continue;
          200                 } else if (c == '>' && i == 2) {
          201                         return;
          202                 } else if (i) {
          203                         i = 0;
          204                 }
          205         }
          206 }
          207 
          208 int
          209 checknamedentity(const char *e)
          210 {
          211         static const char *entities[] = {
          212                 "amp;", "lt;", "gt;", "apos;", "quot;", "nbsp;", "copy;",
          213                 "ndash;", "euro;", "dollar;", "yen;"
          214         };
          215         size_t i;
          216 
          217         for (i = 0; i < sizeof(entities) / sizeof(*entities); i++)
          218                 if (!strcmp(e, entities[i]))
          219                         return 1;
          220 
          221         return 0;
          222 }
          223 
          224 int
          225 checknumericentity(const char *e)
          226 {
          227         long l;
          228         char *end;
          229 
          230         errno = 0;
          231         /* hex (16) or decimal (10) */
          232         if (*e == 'x')
          233                 l = strtol(++e, &end, 16);
          234         else
          235                 l = strtol(e, &end, 10);
          236         /* invalid value or not a well-formed entity or too high codepoint */
          237         if (errno || e == end || *end != ';' || l < 0 || l > 0x10FFFF)
          238                 return 0;
          239         return 1;
          240 }
          241 
          242 /* convert named- or numeric entity string to buffer string
          243  * returns byte-length of string. */
          244 int
          245 checkentity(const char *e)
          246 {
          247         /* doesn't start with & */
          248         if (e[0] != '&')
          249                 return 0;
          250         /* numeric entity */
          251         if (e[1] == '#')
          252                 return checknumericentity(e + 2);
          253         else /* named entity */
          254                 return checknamedentity(e + 1);
          255 }
          256 
          257 void
          258 xml_parse(XMLParser *x)
          259 {
          260         size_t datalen, tagdatalen;
          261         int c, isend;
          262 
          263         while ((c = GETNEXT()) != EOF && c != '<')
          264                 ; /* skip until < */
          265 
          266         while (c != EOF) {
          267                 if (c == '<') { /* parse tag */
          268                         if ((c = GETNEXT()) == EOF)
          269                                 return;
          270 
          271                         if (c == '!') { /* cdata and comments */
          272                                 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
          273                                         /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
          274                                         if (tagdatalen <= sizeof("[CDATA[") - 1)
          275                                                 x->data[tagdatalen++] = c;
          276                                         if (c == '>')
          277                                                 break;
          278                                         else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
          279                                                         (x->data[0] == '-')) {
          280                                                 xml_parsecomment(x);
          281                                                 break;
          282                                         } else if (c == '[') {
          283                                                 if (tagdatalen == sizeof("[CDATA[") - 1 &&
          284                                                     !strncmp(x->data, "[CDATA[", tagdatalen)) {
          285                                                         xml_parsecdata(x);
          286                                                         break;
          287                                                 }
          288                                         }
          289                                 }
          290                         } else {
          291                                 /* normal tag (open, short open, close), processing instruction. */
          292                                 x->tag[0] = c;
          293                                 x->taglen = 1;
          294                                 x->isshorttag = isend = 0;
          295 
          296                                 /* treat processing instruction as shorttag, don't strip "?" prefix. */
          297                                 if (c == '?') {
          298                                         x->isshorttag = 1;
          299                                 } else if (c == '/') {
          300                                         if ((c = GETNEXT()) == EOF)
          301                                                 return;
          302                                         x->tag[0] = c;
          303                                         isend = 1;
          304                                 }
          305 
          306                                 while ((c = GETNEXT()) != EOF) {
          307                                         if (c == '/')
          308                                                 x->isshorttag = 1; /* short tag */
          309                                         else if (c == '>' || isspace(c)) {
          310                                                 x->tag[x->taglen] = '\0';
          311                                                 if (isend) { /* end tag, starts with </ */
          312                                                         if (x->xmltagend)
          313                                                                 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
          314                                                         x->tag[0] = '\0';
          315                                                         x->taglen = 0;
          316                                                 } else {
          317                                                         /* start tag */
          318                                                         if (isspace(c))
          319                                                                 xml_parseattrs(x);
          320                                                         if (x->xmltagstartparsed)
          321                                                                 x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
          322                                                 }
          323                                                 /* call tagend for shortform or processing instruction */
          324                                                 if (x->isshorttag) {
          325                                                         if (x->xmltagend)
          326                                                                 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
          327                                                         x->tag[0] = '\0';
          328                                                         x->taglen = 0;
          329                                                 }
          330                                                 break;
          331                                         } else if (x->taglen < sizeof(x->tag) - 1)
          332                                                 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
          333                                 }
          334                         }
          335                 } else {
          336                         /* parse tag data */
          337                         datalen = 0;
          338                         while ((c = GETNEXT()) != EOF) {
          339                                 if (c == '&') {
          340                                         if (datalen) {
          341                                                 x->data[datalen] = '\0';
          342                                         }
          343                                         x->data[0] = c;
          344                                         datalen = 1;
          345                                         while ((c = GETNEXT()) != EOF) {
          346                                                 if (c == '<')
          347                                                         break;
          348                                                 if (datalen < sizeof(x->data) - 1)
          349                                                         x->data[datalen++] = c;
          350                                                 else {
          351                                                         /* entity too long for buffer, handle as normal data */
          352                                                         x->data[datalen] = '\0';
          353                                                         x->data[0] = c;
          354                                                         datalen = 1;
          355                                                         break;
          356                                                 }
          357                                                 if (c == ';') {
          358                                                         x->data[datalen] = '\0';
          359                                                         if (x->xmldataentity)
          360                                                                 x->xmldataentity(x, x->data, datalen);
          361                                                         datalen = 0;
          362                                                         break;
          363                                                 }
          364                                         }
          365                                 } else if (c != '<') {
          366                                         if (datalen < sizeof(x->data) - 1) {
          367                                                 x->data[datalen++] = c;
          368                                         } else {
          369                                                 x->data[datalen] = '\0';
          370                                                 x->data[0] = c;
          371                                                 datalen = 1;
          372                                         }
          373                                 }
          374                                 if (c == '<') {
          375                                         x->data[datalen] = '\0';
          376                                         break;
          377                                 }
          378                         }
          379                 }
          380         }
          381 }
          382 
          383 void
          384 xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
          385               const char *v, size_t vl)
          386 {
          387         if (!checkentity(v)) {
          388                 printf("%zu:%zu: invalid entity in attribute: %s %s %s\n",
          389                        linenr, linechar, t, a, v);
          390                 exitstatus = 1;
          391         }
          392 }
          393 
          394 void
          395 xmldataentity(XMLParser *x, const char *d, size_t dl)
          396 {
          397         if (!checkentity(d)) {
          398                 printf("%zu:%zu: invalid entity: %s\n", linenr, linechar, d);
          399                 exitstatus = 1;
          400         }
          401 }
          402 
          403 void
          404 xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
          405 {
          406         if (isshort)
          407                 return;
          408 
          409         if (depth)
          410                 depth--;
          411         else
          412                 goto unbalanced;
          413 
          414         if (nodes[depth].tag[0] && strcmp(t, nodes[depth].tag))
          415                 goto unbalanced;
          416 
          417         memset(&nodes[depth], 0, sizeof(nodes[0]));
          418         return;
          419 
          420 unbalanced:
          421         printf("%zu:%zu: unbalanced tag %s, expected: </%s> (ends at %zu:%zu)\n",
          422                nodes[depth].linenr, nodes[depth].linechar, t, nodes[depth].tag,
          423                linenr, linechar);
          424         exit(1);
          425 }
          426 
          427 void
          428 xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
          429 {
          430         int i;
          431 
          432         if (isshort) {
          433                 return;
          434         } else if (!checkxhtml) {
          435                 /* HTML has tag that don't need to be closed, allow this. */
          436                 for (i = 0; i < sizeof(autoclose) / sizeof(autoclose[0]); i++)
          437                         if (!strcmp(t, autoclose[i]))
          438                                 return;
          439         }
          440 
          441         if (depth + 1 >= MAX_DEPTH) {
          442                 printf("%zu:%zu: too deep >= %d\n", linenr, linechar, MAX_DEPTH);
          443                 exit(1);
          444         }
          445         snprintf(nodes[depth].tag, sizeof(nodes[0].tag), "%s", t);
          446         nodes[depth].linenr = linenr;
          447         nodes[depth].linechar = linechar;
          448         depth++;
          449 }
          450 
          451 void
          452 usage(const char *argv0)
          453 {
          454         fprintf(stderr, "usage: %s [-x]\n", argv0);
          455         exit(1);
          456 }
          457 
          458 int
          459 main(int argc, char *argv[])
          460 {
          461         XMLParser x = { 0 };
          462 
          463         if (pledge("stdio", NULL) == -1) {
          464                 perror("pledge");
          465                 exit(1);
          466         }
          467 
          468         if (argc > 1) {
          469                 if (argc == 2 && !strcmp(argv[1], "-x"))
          470                         checkxhtml = 1;
          471                 else
          472                         usage(argv[0]);
          473         }
          474 
          475         x.xmlattrentity = xmlattrentity;
          476         x.xmldataentity = xmldataentity;
          477         x.xmltagend = xmltagend;
          478         x.xmltagstartparsed = xmltagstartparsed;
          479         xml_parse(&x);
          480 
          481         if (depth) {
          482                 printf("%zu:%zu: unbalanced: %s\n",
          483                         nodes[depth - 1].linenr, nodes[depth - 1].linechar,
          484                         nodes[depth - 1].tag);
          485                 exit(1);
          486         }
          487 
          488         return exitstatus;
          489 }