xml.c - grabtitle - stupid HTML title grabber
 (HTM) git clone git://git.codemadness.org/grabtitle
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       xml.c (8078B)
       ---
            1 #include <errno.h>
            2 #include <stdio.h>
            3 #include <stdlib.h>
            4 #include <string.h>
            5 
            6 #include "xml.h"
            7 
            8 #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26)
            9 #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5))
           10 
           11 static void
           12 xml_parseattrs(XMLParser *x)
           13 {
           14         size_t namelen = 0;
           15         int c, endsep, endname = 0, valuestart = 0;
           16 
           17         while ((c = GETNEXT()) != EOF) {
           18                 if (ISSPACE(c)) {
           19                         if (namelen)
           20                                 endname = 1;
           21                         continue;
           22                 } else if (c == '?')
           23                         ; /* ignore */
           24                 else if (c == '=') {
           25                         valuestart = 1;
           26                         endname = 1;
           27                 } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
           28                         endname = 0;
           29                         namelen = 1;
           30                 } else if (namelen && valuestart) {
           31                         /* attribute with value */
           32                         if (c == '\'' || c == '"') {
           33                                 endsep = c;
           34                                 while ((c = GETNEXT()) != EOF) {
           35                                         if (c == endsep)
           36                                                 break;
           37                                 }
           38                         } else {
           39                                 while ((c = GETNEXT()) != EOF) {
           40                                         if (c == '>' || ISSPACE(c))
           41                                                 break;
           42                                 }
           43                         }
           44                         namelen = endname = valuestart = 0;
           45                 } else {
           46                         namelen = 1;
           47                 }
           48                 if (c == '>') {
           49                         break;
           50                 } else if (c == '/') {
           51                         x->isshorttag = 1;
           52                         namelen = 0;
           53                 }
           54         }
           55 }
           56 
           57 static void
           58 xml_parsecomment(XMLParser *x)
           59 {
           60         size_t i = 0;
           61         int c;
           62 
           63         while ((c = GETNEXT()) != EOF) {
           64                 if (c == '-') {
           65                         if (i < 2)
           66                                 i++;
           67                 } else if (c == '>' && i == 2) {
           68                         return;
           69                 } else {
           70                         i = 0;
           71                 }
           72         }
           73 }
           74 
           75 static void
           76 xml_parsecdata(XMLParser *x)
           77 {
           78         size_t datalen = 0, i = 0;
           79         int c;
           80 
           81         while ((c = GETNEXT()) != EOF) {
           82                 if (c == ']' || c == '>') {
           83                         if (x->xmlcdata) {
           84                                 x->data[datalen] = '\0';
           85                                 x->xmlcdata(x, x->data, datalen);
           86                                 datalen = 0;
           87                         }
           88                 }
           89 
           90                 if (c == ']') {
           91                         if (++i > 2) {
           92                                 if (x->xmlcdata)
           93                                         for (; i > 2; i--)
           94                                                 x->xmlcdata(x, "]", 1);
           95                                 i = 2;
           96                         }
           97                         continue;
           98                 } else if (c == '>' && i == 2) {
           99                         return;
          100                 } else {
          101                         if (x->xmlcdata)
          102                                 for (; i > 0; i--)
          103                                         x->xmlcdata(x, "]", 1);
          104                         i = 0;
          105                 }
          106 
          107                 if (datalen < sizeof(x->data) - 1) {
          108                         x->data[datalen++] = c;
          109                 } else {
          110                         x->data[datalen] = '\0';
          111                         if (x->xmlcdata)
          112                                 x->xmlcdata(x, x->data, datalen);
          113                         x->data[0] = c;
          114                         datalen = 1;
          115                 }
          116         }
          117 }
          118 
          119 static int
          120 codepointtoutf8(long r, char *s)
          121 {
          122         if (r == 0) {
          123                 return 0; /* NUL byte */
          124         } else if (r <= 0x7F) {
          125                 /* 1 byte: 0aaaaaaa */
          126                 s[0] = r;
          127                 return 1;
          128         } else if (r <= 0x07FF) {
          129                 /* 2 bytes: 00000aaa aabbbbbb */
          130                 s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
          131                 s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
          132                 return 2;
          133         } else if (r <= 0xFFFF) {
          134                 /* 3 bytes: aaaabbbb bbcccccc */
          135                 s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
          136                 s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
          137                 s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
          138                 return 3;
          139         } else {
          140                 /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
          141                 s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
          142                 s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
          143                 s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
          144                 s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
          145                 return 4;
          146         }
          147 }
          148 
          149 struct namedentity {
          150         const char *entity;
          151         long cp;
          152 };
          153 
          154 int
          155 namedentitycmp(const void *v1, const void *v2)
          156 {
          157         struct namedentity *n1 = (struct namedentity *)v1;
          158         struct namedentity *n2 = (struct namedentity *)v2;
          159 
          160         return strcmp(n1->entity, n2->entity);
          161 }
          162 
          163 static int
          164 namedentitytostr(const char *e, char *buf, size_t bufsiz)
          165 {
          166         static const struct namedentity entities[] = {
          167 #include "namedentities.h"
          168         };
          169         struct namedentity find, *found;
          170         size_t i;
          171 
          172         /* buffer is too small */
          173         if (bufsiz < 5)
          174                 return -1;
          175 
          176         find.entity = e;
          177         found = bsearch(&find, entities, sizeof(entities) / sizeof(*entities),
          178                 sizeof(*entities), namedentitycmp);
          179         if (found) {
          180                 i = codepointtoutf8(found->cp, buf);
          181                 buf[i] = '\0';
          182                 return i;
          183         }
          184         return -1;
          185 }
          186 
          187 static int
          188 numericentitytostr(const char *e, char *buf, size_t bufsiz)
          189 {
          190         long l;
          191         int len;
          192         char *end;
          193 
          194         /* buffer is too small */
          195         if (bufsiz < 5)
          196                 return -1;
          197 
          198         errno = 0;
          199         /* hex (16) or decimal (10) */
          200         if (*e == 'x')
          201                 l = strtol(++e, &end, 16);
          202         else
          203                 l = strtol(e, &end, 10);
          204         /* invalid value or not a well-formed entity or invalid code point */
          205         if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff ||
          206             (l >= 0xd800 && l <= 0xdfff))
          207                 return -1;
          208         len = codepointtoutf8(l, buf);
          209         buf[len] = '\0';
          210 
          211         return len;
          212 }
          213 
          214 /* convert named- or numeric entity string to buffer string
          215  * returns byte-length of string or -1 on failure. */
          216 int
          217 xml_entitytostr(const char *e, char *buf, size_t bufsiz)
          218 {
          219         /* doesn't start with & */
          220         if (e[0] != '&')
          221                 return -1;
          222         /* numeric entity */
          223         if (e[1] == '#')
          224                 return numericentitytostr(e + 2, buf, bufsiz);
          225         else /* named entity */
          226                 return namedentitytostr(e + 1, buf, bufsiz);
          227 }
          228 
/*
 * Main parse loop: reads characters with GETNEXT() until EOF and invokes
 * the callbacks set on x (each one is optional and checked before use):
 *   xmltagstart(x, tag, taglen)            - start tag or processing instruction
 *   xmltagend(x, tag, taglen, isshorttag)  - end tag, short tag or
 *                                            processing instruction
 *   xmldata(x, data, datalen)              - character data between tags
 *   xmldataentity(x, data, datalen)        - an entity ("&...;") in data
 * Comments are skipped, CDATA sections are handled by xml_parsecdata() and
 * attributes are skipped by xml_parseattrs().
 *
 * Everything before the first '<' is ignored. Tag names longer than
 * sizeof(x->tag) - 1 are truncated; data longer than sizeof(x->data) - 1
 * is delivered in multiple chunks.
 * NOTE(review): data still buffered when EOF is reached is not passed to
 * xmldata; only data terminated by '<' is flushed.
 */
void
xml_parse(XMLParser *x)
{
        size_t datalen, tagdatalen;
        int c, isend;

        while ((c = GETNEXT()) != EOF && c != '<')
                ; /* skip until < */

        /* alternates between tag parsing (c == '<') and data parsing */
        while (c != EOF) {
                if (c == '<') { /* parse tag */
                        if ((c = GETNEXT()) == EOF)
                                return;

                        if (c == '!') { /* CDATA and comments */
                                /* collect the first bytes after "<!" in x->data to
                                 * recognize "--" (comment) or "[CDATA[" */
                                for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
                                        /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
                                        if (tagdatalen <= sizeof("[CDATA[") - 1)
                                                x->data[tagdatalen++] = c;
                                        if (c == '>')
                                                break;
                                        else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
                                                        (x->data[0] == '-')) {
                                                xml_parsecomment(x);
                                                break;
                                        } else if (c == '[') {
                                                if (tagdatalen == sizeof("[CDATA[") - 1 &&
                                                    !strncmp(x->data, "[CDATA[", tagdatalen)) {
                                                        xml_parsecdata(x);
                                                        break;
                                                }
                                        }
                                }
                        } else {
                                /* normal tag (open, short open, close), processing instruction. */
                                x->tag[0] = c;
                                x->taglen = 1;
                                x->isshorttag = isend = 0;

                                /* treat processing instruction as short tag, don't strip "?" prefix. */
                                if (c == '?') {
                                        x->isshorttag = 1;
                                } else if (c == '/') {
                                        /* end tag: the name starts after the '/' */
                                        if ((c = GETNEXT()) == EOF)
                                                return;
                                        x->tag[0] = c;
                                        isend = 1;
                                }

                                /* read the rest of the tag name */
                                while ((c = GETNEXT()) != EOF) {
                                        if (c == '/')
                                                x->isshorttag = 1; /* short tag */
                                        else if (c == '>' || ISSPACE(c)) {
                                                x->tag[x->taglen] = '\0';
                                                if (isend) { /* end tag, starts with </ */
                                                        while (c != '>' && c != EOF) /* skip until > */
                                                                c = GETNEXT();
                                                        if (x->xmltagend)
                                                                x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
                                                        x->tag[0] = '\0';
                                                        x->taglen = 0;
                                                } else {
                                                        /* start tag */
                                                        if (x->xmltagstart)
                                                                x->xmltagstart(x, x->tag, x->taglen);
                                                        /* xml_parseattrs() reads up to the closing '>'
                                                         * using its own variable; c here keeps the
                                                         * whitespace character */
                                                        if (ISSPACE(c))
                                                                xml_parseattrs(x);
                                                }
                                                /* call tagend for short tag or processing instruction */
                                                if (x->isshorttag) {
                                                        if (x->xmltagend)
                                                                x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
                                                        x->tag[0] = '\0';
                                                        x->taglen = 0;
                                                }
                                                break;
                                        } else if (x->taglen < sizeof(x->tag) - 1)
                                                x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
                                }
                        }
                } else {
                        /* parse tag data */
                        datalen = 0;
                        while ((c = GETNEXT()) != EOF) {
                                if (c == '&') {
                                        /* report data collected before the entity */
                                        if (datalen) {
                                                x->data[datalen] = '\0';
                                                if (x->xmldata)
                                                        x->xmldata(x, x->data, datalen);
                                        }
                                        /* collect the entity, including '&' and ';', in x->data */
                                        x->data[0] = c;
                                        datalen = 1;
                                        while ((c = GETNEXT()) != EOF) {
                                                /* '<' before ';': not an entity; the text collected
                                                 * so far is reported as data by the check below */
                                                if (c == '<')
                                                        break;
                                                if (datalen < sizeof(x->data) - 1)
                                                        x->data[datalen++] = c;
                                                else {
                                                        /* entity too long for buffer, handle as normal data */
                                                        x->data[datalen] = '\0';
                                                        if (x->xmldata)
                                                                x->xmldata(x, x->data, datalen);
                                                        x->data[0] = c;
                                                        datalen = 1;
                                                        break;
                                                }
                                                if (c == ';') {
                                                        x->data[datalen] = '\0';
                                                        if (x->xmldataentity)
                                                                x->xmldataentity(x, x->data, datalen);
                                                        datalen = 0;
                                                        break;
                                                }
                                        }
                                } else if (c != '<') {
                                        if (datalen < sizeof(x->data) - 1) {
                                                x->data[datalen++] = c;
                                        } else {
                                                /* buffer full: report it and start a new chunk with c */
                                                x->data[datalen] = '\0';
                                                if (x->xmldata)
                                                        x->xmldata(x, x->data, datalen);
                                                x->data[0] = c;
                                                datalen = 1;
                                        }
                                }
                                /* start of the next tag: flush pending data and go back
                                 * to tag parsing in the outer loop */
                                if (c == '<') {
                                        x->data[datalen] = '\0';
                                        if (x->xmldata && datalen)
                                                x->xmldata(x, x->data, datalen);
                                        break;
                                }
                        }
                }
        }
}