xml.c - sfeed - RSS and Atom parser
 (HTM) git clone git://git.codemadness.org/sfeed
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       xml.c (10616B)
       ---
            1 #include <errno.h>
            2 #include <stdio.h>
            3 #include <stdlib.h>
            4 #include <string.h>
            5 
            6 #include "xml.h"
            7 
            /* ASCII-only character class tests.
             * The argument is converted to unsigned before subtracting the lower bound,
             * so values below the range wrap around to large numbers and one comparison
             * checks both bounds. The argument is parenthesized inside the cast so that
             * an expression (not only a plain variable) is converted as a whole.
             * NOTE: ISSPACE() and ISXDIGIT() evaluate their argument more than once,
             * do not pass expressions with side effects. */
            #define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)   /* 'a'-'z', 'A'-'Z' */
            #define ISDIGIT(c) (((unsigned)(c)) - '0' < 10)          /* '0'-'9' */
            #define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5)) /* ' ', '\t'-'\r' */
            #define ISXDIGIT(c) ((((unsigned)(c)) - '0' < 10) || (((unsigned)(c)) | 32) - 'a' < 6)
           12 
           13 static void
           14 xml_parseattrs(XMLParser *x)
           15 {
           16         size_t namelen = 0, valuelen;
           17         int c, endsep, endname = 0, valuestart = 0;
           18 
           19         while ((c = GETNEXT()) != EOF) {
           20                 if (ISSPACE(c)) {
           21                         if (namelen)
           22                                 endname = 1;
           23                         continue;
           24                 } else if (c == '?')
           25                         ; /* ignore */
           26                 else if (c == '=') {
           27                         x->name[namelen] = '\0';
           28                         valuestart = 1;
           29                         endname = 1;
           30                 } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
           31                         /* attribute without value */
           32                         x->name[namelen] = '\0';
           33                         if (x->xmlattrstart)
           34                                 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
           35                         if (x->xmlattr)
           36                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
           37                         if (x->xmlattrend)
           38                                 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
           39                         endname = 0;
           40                         x->name[0] = c;
           41                         namelen = 1;
           42                 } else if (namelen && valuestart) {
           43                         /* attribute with value */
           44                         if (x->xmlattrstart)
           45                                 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
           46 
           47                         valuelen = 0;
           48                         if (c == '\'' || c == '"') {
           49                                 endsep = c;
           50                         } else {
           51                                 endsep = ' '; /* ISSPACE() */
           52                                 goto startvalue;
           53                         }
           54 
           55                         while ((c = GETNEXT()) != EOF) {
           56 startvalue:
           57                                 if (c == '&') { /* entities */
           58                                         x->data[valuelen] = '\0';
           59                                         /* call data function with data before entity if there is data */
           60                                         if (valuelen && x->xmlattr)
           61                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           62                                         x->data[0] = c;
           63                                         valuelen = 1;
           64                                         while ((c = GETNEXT()) != EOF) {
           65                                                 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
           66                                                         break;
           67                                                 if (valuelen < sizeof(x->data) - 1)
           68                                                         x->data[valuelen++] = c;
           69                                                 else {
           70                                                         /* entity too long for buffer, handle as normal data */
           71                                                         x->data[valuelen] = '\0';
           72                                                         if (x->xmlattr)
           73                                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           74                                                         x->data[0] = c;
           75                                                         valuelen = 1;
           76                                                         break;
           77                                                 }
           78                                                 if (c == ';') {
           79                                                         x->data[valuelen] = '\0';
           80                                                         if (x->xmlattrentity)
           81                                                                 x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           82                                                         valuelen = 0;
           83                                                         break;
           84                                                 }
           85                                         }
           86                                 } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
           87                                         if (valuelen < sizeof(x->data) - 1) {
           88                                                 x->data[valuelen++] = c;
           89                                         } else {
           90                                                 x->data[valuelen] = '\0';
           91                                                 if (x->xmlattr)
           92                                                         x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           93                                                 x->data[0] = c;
           94                                                 valuelen = 1;
           95                                         }
           96                                 }
           97                                 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
           98                                         x->data[valuelen] = '\0';
           99                                         if (x->xmlattr)
          100                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          101                                         if (x->xmlattrend)
          102                                                 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
          103                                         break;
          104                                 }
          105                         }
          106                         namelen = endname = valuestart = 0;
          107                 } else if (namelen < sizeof(x->name) - 1) {
          108                         x->name[namelen++] = c;
          109                 }
          110                 if (c == '>') {
          111                         break;
          112                 } else if (c == '/') {
          113                         x->isshorttag = 1;
          114                         x->name[0] = '\0';
          115                         namelen = 0;
          116                 }
          117         }
          118 }
          119 
          120 static void
          121 xml_parsecomment(XMLParser *x)
          122 {
          123         int c, i = 0;
          124 
          125         while ((c = GETNEXT()) != EOF) {
          126                 if (c == '-') {
          127                         if (++i > 2)
          128                                 i = 2;
          129                         continue;
          130                 } else if (c == '>' && i == 2) {
          131                         return;
          132                 } else if (i) {
          133                         i = 0;
          134                 }
          135         }
          136 }
          137 
          138 static void
          139 xml_parsecdata(XMLParser *x)
          140 {
          141         size_t datalen = 0, i = 0;
          142         int c;
          143 
          144         while ((c = GETNEXT()) != EOF) {
          145                 if (c == ']' || c == '>') {
          146                         if (x->xmlcdata && datalen) {
          147                                 x->data[datalen] = '\0';
          148                                 x->xmlcdata(x, x->data, datalen);
          149                                 datalen = 0;
          150                         }
          151                 }
          152 
          153                 if (c == ']') {
          154                         if (++i > 2) {
          155                                 if (x->xmlcdata)
          156                                         for (; i > 2; i--)
          157                                                 x->xmlcdata(x, "]", 1);
          158                                 i = 2;
          159                         }
          160                         continue;
          161                 } else if (c == '>' && i == 2) {
          162                         return;
          163                 } else if (i) {
          164                         if (x->xmlcdata)
          165                                 for (; i > 0; i--)
          166                                         x->xmlcdata(x, "]", 1);
          167                         i = 0;
          168                 }
          169 
          170                 if (datalen < sizeof(x->data) - 1) {
          171                         x->data[datalen++] = c;
          172                 } else {
          173                         x->data[datalen] = '\0';
          174                         if (x->xmlcdata)
          175                                 x->xmlcdata(x, x->data, datalen);
          176                         x->data[0] = c;
          177                         datalen = 1;
          178                 }
          179         }
          180 }
          181 
          /* Encode the code point r as UTF-8 into s and return the number of bytes
           * written (1 to 4), or 0 without writing anything when r is 0.
           * s is not NUL-terminated and must have room for 4 bytes.
           * The value of r is not validated here. */
          static int
          codepointtoutf8(long r, char *s)
          {
                  int len, i;

                  if (r == 0)
                          return 0; /* NUL byte */
                  if (r <= 0x7F) {
                          s[0] = r; /* 0aaaaaaa */
                          return 1;
                  }

                  /* the lead byte holds the sequence length and the topmost bits */
                  if (r <= 0x07FF) {
                          len = 2;
                          s[0] = 0xC0 | ((r >> 6) & 0x1F);  /* 110aaaaa */
                  } else if (r <= 0xFFFF) {
                          len = 3;
                          s[0] = 0xE0 | ((r >> 12) & 0x0F); /* 1110aaaa */
                  } else {
                          len = 4;
                          s[0] = 0xF0 | ((r >> 18) & 0x07); /* 11110aaa */
                  }

                  /* continuation bytes (10xxxxxx): 6 bits each, filled in from the end */
                  for (i = len - 1; i > 0; i--) {
                          s[i] = 0x80 | (r & 0x3F);
                          r >>= 6;
                  }
                  return len;
          }
          211 
          /* Convert a named entity to its character.
           * e is the entity name without the leading '&' but with the trailing ';'
           * (for example "amp;"). Only amp, lt, gt, apos and quot are known.
           * On a match the character and a NUL byte are stored in buf and 1 is
           * returned. Returns -1 when the name is unknown or bufsiz is less than 2. */
          static int
          namedentitytostr(const char *e, char *buf, size_t bufsiz)
          {
                  /* names[k] is replaced by chars[k] */
                  static const char *const names[] = { "amp;", "lt;", "gt;", "apos;", "quot;" };
                  static const char chars[] = { '&', '<', '>', '\'', '"' };
                  size_t k;

                  /* need room for the character and the terminating NUL */
                  if (bufsiz < 2)
                          return -1;

                  for (k = 0; k < sizeof(names) / sizeof(names[0]); k++) {
                          if (strcmp(e, names[k]) != 0)
                                  continue;
                          buf[0] = chars[k];
                          buf[1] = '\0';
                          return 1;
                  }
                  return -1;
          }
          240 
          /* Convert a numeric entity to a UTF-8 string.
           * e is the text after "&#": decimal digits, or 'x' followed by hexadecimal
           * digits, then ';' and the end of the string.
           * On success the NUL-terminated UTF-8 sequence is stored in buf and its
           * length in bytes (without the NUL) is returned; this is 0 for the
           * code point 0.
           * Returns -1 when bufsiz is less than 5, when e is not of the form above or
           * when the value is not a valid code point (above 0x10ffff or in the
           * surrogate range 0xd800-0xdfff). */
          static int
          numericentitytostr(const char *e, char *buf, size_t bufsiz)
          {
                  const char *digits = e, *p;
                  char *stop;
                  long cp;
                  int base = 10, n;

                  /* need room for 4 bytes of UTF-8 and the terminating NUL */
                  if (bufsiz < 5)
                          return -1;

                  /* an 'x' prefix selects hexadecimal, decimal otherwise */
                  if (*e == 'x') {
                          base = 16;
                          digits = e + 1;
                  }

                  /* only digits of the selected base are allowed in front of the ';',
                   * this also keeps signs and whitespace away from strtol() */
                  for (p = digits; *p != '\0' && *p != ';'; p++) {
                          if (base == 16 ? !ISXDIGIT((unsigned char)*p) : !ISDIGIT((unsigned char)*p))
                                  return -1;
                  }
                  /* the ';' must be the last character */
                  if (p[0] != ';' || p[1] != '\0')
                          return -1;

                  errno = 0;
                  cp = strtol(digits, &stop, base);

                  /* conversion failed, no digits at all or not stopped at the ';' */
                  if (errno || stop == digits || *stop != ';')
                          return -1;
                  /* outside of the code point range */
                  if (cp < 0 || cp > 0x10ffff)
                          return -1;
                  /* surrogate range */
                  if (cp >= 0xd800 && cp <= 0xdfff)
                          return -1;

                  n = codepointtoutf8(cp, buf);
                  buf[n] = '\0';

                  return n;
          }
          284 
          /* Convert an entity string (named such as "&amp;" or numeric such as
           * "&#38;" / "&#x26;") to the string it stands for, stored in buf.
           * Returns the byte-length of that string or -1 on failure, which includes
           * e not starting with '&'. */
          int
          xml_entitytostr(const char *e, char *buf, size_t bufsiz)
          {
                  /* every entity starts with '&' */
                  if (e[0] != '&')
                          return -1;

                  /* "&#..." is numeric, anything else is looked up by name */
                  return (e[1] == '#') ?
                         numericentitytostr(e + 2, buf, bufsiz) :
                         namedentitytostr(e + 1, buf, bufsiz);
          }
          299 
          300 void
          301 xml_parse(XMLParser *x)
          302 {
          303         size_t datalen, tagdatalen;
          304         int c, isend;
          305 
          306         while ((c = GETNEXT()) != EOF && c != '<')
          307                 ; /* skip until < */
          308 
          309         while (c != EOF) {
          310                 if (c == '<') { /* parse tag */
          311                         if ((c = GETNEXT()) == EOF)
          312                                 return;
          313 
          314                         if (c == '!') { /* CDATA and comments */
          315                                 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
          316                                         /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
          317                                         if (tagdatalen <= sizeof("[CDATA[") - 1)
          318                                                 x->data[tagdatalen++] = c;
          319                                         if (c == '>')
          320                                                 break;
          321                                         else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
          322                                                         (x->data[0] == '-')) {
          323                                                 xml_parsecomment(x);
          324                                                 break;
          325                                         } else if (c == '[') {
          326                                                 if (tagdatalen == sizeof("[CDATA[") - 1 &&
          327                                                     !strncmp(x->data, "[CDATA[", tagdatalen)) {
          328                                                         xml_parsecdata(x);
          329                                                         break;
          330                                                 }
          331                                         }
          332                                 }
          333                         } else {
          334                                 /* normal tag (open, short open, close), processing instruction. */
          335                                 x->tag[0] = c;
          336                                 x->taglen = 1;
          337                                 x->isshorttag = isend = 0;
          338 
          339                                 /* treat processing instruction as short tag, don't strip "?" prefix. */
          340                                 if (c == '?') {
          341                                         x->isshorttag = 1;
          342                                 } else if (c == '/') {
          343                                         if ((c = GETNEXT()) == EOF)
          344                                                 return;
          345                                         x->tag[0] = c;
          346                                         isend = 1;
          347                                 }
          348 
          349                                 while ((c = GETNEXT()) != EOF) {
          350                                         if (c == '/')
          351                                                 x->isshorttag = 1; /* short tag */
          352                                         else if (c == '>' || ISSPACE(c)) {
          353                                                 x->tag[x->taglen] = '\0';
          354                                                 if (isend) { /* end tag, starts with </ */
          355                                                         while (c != '>' && c != EOF) /* skip until > */
          356                                                                 c = GETNEXT();
          357                                                         if (x->xmltagend)
          358                                                                 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
          359                                                         x->tag[0] = '\0';
          360                                                         x->taglen = 0;
          361                                                 } else {
          362                                                         /* start tag */
          363                                                         if (x->xmltagstart)
          364                                                                 x->xmltagstart(x, x->tag, x->taglen);
          365                                                         if (ISSPACE(c))
          366                                                                 xml_parseattrs(x);
          367                                                         if (x->xmltagstartparsed)
          368                                                                 x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
          369                                                 }
          370                                                 /* call tagend for short tag or processing instruction */
          371                                                 if (x->isshorttag) {
          372                                                         if (x->xmltagend)
          373                                                                 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
          374                                                         x->tag[0] = '\0';
          375                                                         x->taglen = 0;
          376                                                 }
          377                                                 break;
          378                                         } else if (x->taglen < sizeof(x->tag) - 1)
          379                                                 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
          380                                 }
          381                         }
          382                 } else {
          383                         /* parse tag data */
          384                         datalen = 0;
          385                         while ((c = GETNEXT()) != EOF) {
          386                                 if (c == '&') { /* entities */
          387                                         if (datalen) {
          388                                                 x->data[datalen] = '\0';
          389                                                 if (x->xmldata)
          390                                                         x->xmldata(x, x->data, datalen);
          391                                         }
          392                                         x->data[0] = c;
          393                                         datalen = 1;
          394                                         while ((c = GETNEXT()) != EOF) {
          395                                                 if (c == '<')
          396                                                         break;
          397                                                 if (datalen < sizeof(x->data) - 1)
          398                                                         x->data[datalen++] = c;
          399                                                 else {
          400                                                         /* entity too long for buffer, handle as normal data */
          401                                                         x->data[datalen] = '\0';
          402                                                         if (x->xmldata)
          403                                                                 x->xmldata(x, x->data, datalen);
          404                                                         x->data[0] = c;
          405                                                         datalen = 1;
          406                                                         break;
          407                                                 }
          408                                                 if (c == ';') {
          409                                                         x->data[datalen] = '\0';
          410                                                         if (x->xmldataentity)
          411                                                                 x->xmldataentity(x, x->data, datalen);
          412                                                         datalen = 0;
          413                                                         break;
          414                                                 }
          415                                         }
          416                                 } else if (c != '<') {
          417                                         if (datalen < sizeof(x->data) - 1) {
          418                                                 x->data[datalen++] = c;
          419                                         } else {
          420                                                 x->data[datalen] = '\0';
          421                                                 if (x->xmldata)
          422                                                         x->xmldata(x, x->data, datalen);
          423                                                 x->data[0] = c;
          424                                                 datalen = 1;
          425                                         }
          426                                 }
          427                                 if (c == '<') {
          428                                         x->data[datalen] = '\0';
          429                                         if (x->xmldata && datalen)
          430                                                 x->xmldata(x, x->data, datalen);
          431                                         break;
          432                                 }
          433                         }
          434                 }
          435         }
          436 }