xml.c - randomcrap - random crap programs of varying quality
 (HTM) git clone git://git.codemadness.org/randomcrap
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       xml.c (10936B)
       ---
            1 #include <errno.h>
            2 #include <stdio.h>
            3 #include <stdlib.h>
            4 #include <string.h>
            5 
            6 #include "xml.h"
            7 
            8 #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26)
            9 #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5))
           10 
/*
 * Parse the attributes of the tag whose name is currently in x->tag.
 *
 * Reads characters with GETNEXT() until a '>' is seen or EOF is reached.
 * Attribute names are collected in x->name, attribute values in x->data.
 * For every attribute the optional callbacks are invoked in this order:
 *   xmlattrstart   once, before any value data
 *   xmlattr        zero or more times with chunks of value data (a value is
 *                  split when it contains an entity or exceeds the buffer)
 *   xmlattrentity  for each "&...;" sequence inside a value, passed verbatim
 *                  including the leading '&' and trailing ';'
 *   xmlattrend     once, after the value
 * A '/' outside of a value sets x->isshorttag.
 */
static void
xml_parseattrs(XMLParser *x)
{
        /* namelen: bytes stored in x->name; valuelen: bytes stored in x->data */
        size_t namelen = 0, valuelen;
        /* endname: the name is complete (whitespace or '=' followed it);
         * valuestart: a '=' was seen, so the next token is the value;
         * endsep: character that terminates the current value */
        int c, endsep, endname = 0, valuestart = 0;

        while ((c = GETNEXT()) != EOF) {
                if (ISSPACE(c)) {
                        /* whitespace after at least one name byte ends the name */
                        if (namelen)
                                endname = 1;
                        continue;
                } else if (c == '?')
                        ; /* ignore: '?' is never stored in the name */
                else if (c == '=') {
                        x->name[namelen] = '\0';
                        valuestart = 1;
                        endname = 1;
                } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
                        /* attribute without value: either a new name starts after
                         * whitespace, or the tag ends. Because this branch is tested
                         * before the value branch, a '>' or '/' directly after '='
                         * also ends up here. */
                        x->name[namelen] = '\0';
                        if (x->xmlattrstart)
                                x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
                        if (x->xmlattr)
                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
                        if (x->xmlattrend)
                                x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
                        endname = 0;
                        /* c becomes the first byte of the next name; for '>' the loop
                         * ends below and for '/' the name is cleared again below */
                        x->name[0] = c;
                        namelen = 1;
                } else if (namelen && valuestart) {
                        /* attribute with value */
                        if (x->xmlattrstart)
                                x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);

                        valuelen = 0;
                        if (c == '\'' || c == '"') {
                                /* quoted value: ends at the matching quote */
                                endsep = c;
                        } else {
                                /* unquoted value: ' ' is a marker meaning "any
                                 * ISSPACE() character or '>'"; c is already the first
                                 * byte of the value, so jump into the loop body */
                                endsep = ' '; /* ISSPACE() */
                                goto startvalue;
                        }

                        while ((c = GETNEXT()) != EOF) {
startvalue:
                                if (c == '&') { /* entities */
                                        x->data[valuelen] = '\0';
                                        /* call data function with data before entity if there is data */
                                        if (valuelen && x->xmlattr)
                                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
                                        /* collect the entity, starting with its '&' */
                                        x->data[0] = c;
                                        valuelen = 1;
                                        while ((c = GETNEXT()) != EOF) {
                                                /* value ended before ';': leave the partial
                                                 * entity in the buffer, it is reported as
                                                 * plain data by the end-of-value check below */
                                                if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
                                                        break;
                                                if (valuelen < sizeof(x->data) - 1)
                                                        x->data[valuelen++] = c;
                                                else {
                                                        /* entity too long for buffer, handle as normal data */
                                                        x->data[valuelen] = '\0';
                                                        if (x->xmlattr)
                                                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
                                                        x->data[0] = c;
                                                        valuelen = 1;
                                                        break;
                                                }
                                                if (c == ';') {
                                                        /* complete entity: report "&...;" as-is */
                                                        x->data[valuelen] = '\0';
                                                        if (x->xmlattrentity)
                                                                x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
                                                        valuelen = 0;
                                                        break;
                                                }
                                        }
                                } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
                                        /* ordinary value byte; flush the buffer first when full */
                                        if (valuelen < sizeof(x->data) - 1) {
                                                x->data[valuelen++] = c;
                                        } else {
                                                x->data[valuelen] = '\0';
                                                if (x->xmlattr)
                                                        x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
                                                x->data[0] = c;
                                                valuelen = 1;
                                        }
                                }
                                /* end of value: report the remaining data (this call is
                                 * made even when valuelen is 0) and close the attribute */
                                if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
                                        x->data[valuelen] = '\0';
                                        if (x->xmlattr)
                                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
                                        if (x->xmlattrend)
                                                x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
                                        break;
                                }
                        }
                        /* reset the state for the next attribute */
                        namelen = endname = valuestart = 0;
                } else if (namelen < sizeof(x->name) - 1) {
                        /* name byte; bytes beyond the buffer size are dropped */
                        x->name[namelen++] = c;
                }
                if (c == '>') {
                        /* end of the tag (also reached when an unquoted value was
                         * terminated by '>') */
                        break;
                } else if (c == '/') {
                        /* short tag marker: discard whatever was stored in the name */
                        x->isshorttag = 1;
                        x->name[0] = '\0';
                        namelen = 0;
                }
        }
}
          117 
          118 static void
          119 xml_parsecomment(XMLParser *x)
          120 {
          121         size_t datalen = 0, i = 0;
          122         int c;
          123 
          124         if (x->xmlcommentstart)
          125                 x->xmlcommentstart(x);
          126         while ((c = GETNEXT()) != EOF) {
          127                 if (c == '-' || c == '>') {
          128                         if (x->xmlcomment && datalen) {
          129                                 x->data[datalen] = '\0';
          130                                 x->xmlcomment(x, x->data, datalen);
          131                                 datalen = 0;
          132                         }
          133                 }
          134 
          135                 if (c == '-') {
          136                         if (++i > 2) {
          137                                 if (x->xmlcomment)
          138                                         for (; i > 2; i--)
          139                                                 x->xmlcomment(x, "-", 1);
          140                                 i = 2;
          141                         }
          142                         continue;
          143                 } else if (c == '>' && i == 2) {
          144                         if (x->xmlcommentend)
          145                                 x->xmlcommentend(x);
          146                         return;
          147                 } else if (i) {
          148                         if (x->xmlcomment) {
          149                                 for (; i > 0; i--)
          150                                         x->xmlcomment(x, "-", 1);
          151                         }
          152                         i = 0;
          153                 }
          154 
          155                 if (datalen < sizeof(x->data) - 1) {
          156                         x->data[datalen++] = c;
          157                 } else {
          158                         x->data[datalen] = '\0';
          159                         if (x->xmlcomment)
          160                                 x->xmlcomment(x, x->data, datalen);
          161                         x->data[0] = c;
          162                         datalen = 1;
          163                 }
          164         }
          165 }
          166 
          167 static void
          168 xml_parsecdata(XMLParser *x)
          169 {
          170         size_t datalen = 0, i = 0;
          171         int c;
          172 
          173         if (x->xmlcdatastart)
          174                 x->xmlcdatastart(x);
          175         while ((c = GETNEXT()) != EOF) {
          176                 if (c == ']' || c == '>') {
          177                         if (x->xmlcdata && datalen) {
          178                                 x->data[datalen] = '\0';
          179                                 x->xmlcdata(x, x->data, datalen);
          180                                 datalen = 0;
          181                         }
          182                 }
          183 
          184                 if (c == ']') {
          185                         if (++i > 2) {
          186                                 if (x->xmlcdata)
          187                                         for (; i > 2; i--)
          188                                                 x->xmlcdata(x, "]", 1);
          189                                 i = 2;
          190                         }
          191                         continue;
          192                 } else if (c == '>' && i == 2) {
          193                         if (x->xmlcdataend)
          194                                 x->xmlcdataend(x);
          195                         return;
          196                 } else if (i) {
          197                         if (x->xmlcdata)
          198                                 for (; i > 0; i--)
          199                                         x->xmlcdata(x, "]", 1);
          200                         i = 0;
          201                 }
          202 
          203                 if (datalen < sizeof(x->data) - 1) {
          204                         x->data[datalen++] = c;
          205                 } else {
          206                         x->data[datalen] = '\0';
          207                         if (x->xmlcdata)
          208                                 x->xmlcdata(x, x->data, datalen);
          209                         x->data[0] = c;
          210                         datalen = 1;
          211                 }
          212         }
          213 }
          214 
          215 static int
          216 codepointtoutf8(long r, char *s)
          217 {
          218         if (r == 0) {
          219                 return 0; /* NUL byte */
          220         } else if (r <= 0x7F) {
          221                 /* 1 byte: 0aaaaaaa */
          222                 s[0] = r;
          223                 return 1;
          224         } else if (r <= 0x07FF) {
          225                 /* 2 bytes: 00000aaa aabbbbbb */
          226                 s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
          227                 s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
          228                 return 2;
          229         } else if (r <= 0xFFFF) {
          230                 /* 3 bytes: aaaabbbb bbcccccc */
          231                 s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
          232                 s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
          233                 s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
          234                 return 3;
          235         } else {
          236                 /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
          237                 s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
          238                 s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
          239                 s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
          240                 s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
          241                 return 4;
          242         }
          243 }
          244 
          245 static int
          246 namedentitytostr(const char *e, char *buf, size_t bufsiz)
          247 {
          248         static const struct {
          249                 const char *entity;
          250                 int c;
          251         } entities[] = {
          252                 { "amp;",  '&'  },
          253                 { "lt;",   '<'  },
          254                 { "gt;",   '>'  },
          255                 { "apos;", '\'' },
          256                 { "quot;", '"'  },
          257         };
          258         size_t i;
          259 
          260         /* buffer is too small */
          261         if (bufsiz < 2)
          262                 return -1;
          263 
          264         for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
          265                 if (!strcmp(e, entities[i].entity)) {
          266                         buf[0] = entities[i].c;
          267                         buf[1] = '\0';
          268                         return 1;
          269                 }
          270         }
          271         return -1;
          272 }
          273 
          274 static int
          275 numericentitytostr(const char *e, char *buf, size_t bufsiz)
          276 {
          277         long l;
          278         int len;
          279         char *end;
          280 
          281         /* buffer is too small */
          282         if (bufsiz < 5)
          283                 return -1;
          284 
          285         errno = 0;
          286         /* hex (16) or decimal (10) */
          287         if (*e == 'x')
          288                 l = strtol(++e, &end, 16);
          289         else
          290                 l = strtol(e, &end, 10);
          291         /* invalid value or not a well-formed entity or invalid code point */
          292         if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff ||
          293             (l >= 0xd800 && l <= 0xdfff))
          294                 return -1;
          295         len = codepointtoutf8(l, buf);
          296         buf[len] = '\0';
          297 
          298         return len;
          299 }
          300 
          301 /* convert named- or numeric entity string to buffer string
          302  * returns byte-length of string or -1 on failure. */
          303 int
          304 xml_entitytostr(const char *e, char *buf, size_t bufsiz)
          305 {
          306         /* doesn't start with & */
          307         if (e[0] != '&')
          308                 return -1;
          309         /* numeric entity */
          310         if (e[1] == '#')
          311                 return numericentitytostr(e + 2, buf, bufsiz);
          312         else /* named entity */
          313                 return namedentitytostr(e + 1, buf, bufsiz);
          314 }
          315 
/*
 * Parse input read with GETNEXT() until EOF and report what is found through
 * the optional callbacks stored in x.
 *
 * Everything before the first '<' is skipped. After that the input is
 * handled as an alternation of:
 *   - "<!--" comments (xml_parsecomment) and "<![CDATA[" sections
 *     (xml_parsecdata); any other "<!" construct is consumed up to the first
 *     '>' without calling a callback
 *   - tags and processing instructions: xmltagstart, attribute callbacks via
 *     xml_parseattrs, xmltagstartparsed, xmltagend
 *   - character data between tags: xmldatastart, xmldata, xmldataentity,
 *     xmldataend
 * The tag name is stored in x->tag / x->taglen, data chunks in x->data.
 */
void
xml_parse(XMLParser *x)
{
        /* datalen: bytes buffered in x->data while reading character data;
         * tagdatalen: bytes buffered in x->data after "<!" */
        size_t datalen, tagdatalen;
        /* isend: the current tag started with "</" */
        int c, isend;

        while ((c = GETNEXT()) != EOF && c != '<')
                ; /* skip until < */

        while (c != EOF) {
                if (c == '<') { /* parse tag */
                        if ((c = GETNEXT()) == EOF)
                                return;

                        if (c == '!') { /* CDATA and comments */
                                /* keep only the first few bytes after "<!": enough to
                                 * recognize "--" or "[CDATA[" */
                                for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
                                        /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
                                        if (tagdatalen <= sizeof("[CDATA[") - 1)
                                                x->data[tagdatalen++] = c;
                                        if (c == '>')
                                                break;
                                        else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
                                                        (x->data[0] == '-')) {
                                                /* "<!--" */
                                                xml_parsecomment(x);
                                                break;
                                        } else if (c == '[') {
                                                /* "<![CDATA[" */
                                                if (tagdatalen == sizeof("[CDATA[") - 1 &&
                                                    !strncmp(x->data, "[CDATA[", tagdatalen)) {
                                                        xml_parsecdata(x);
                                                        break;
                                                }
                                        }
                                }
                        } else {
                                /* normal tag (open, short open, close), processing instruction. */
                                x->tag[0] = c;
                                x->taglen = 1;
                                x->isshorttag = isend = 0;

                                /* treat processing instruction as short tag, don't strip "?" prefix. */
                                if (c == '?') {
                                        x->isshorttag = 1;
                                } else if (c == '/') {
                                        /* end tag: the '/' is replaced by the first byte of
                                         * the name */
                                        if ((c = GETNEXT()) == EOF)
                                                return;
                                        x->tag[0] = c;
                                        isend = 1;
                                }

                                /* read the rest of the tag name */
                                while ((c = GETNEXT()) != EOF) {
                                        if (c == '/')
                                                x->isshorttag = 1; /* short tag */
                                        else if (c == '>' || ISSPACE(c)) {
                                                /* the name is complete */
                                                x->tag[x->taglen] = '\0';
                                                if (isend) { /* end tag, starts with </ */
                                                        while (c != '>' && c != EOF) /* skip until > */
                                                                c = GETNEXT();
                                                        if (x->xmltagend)
                                                                x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
                                                        x->tag[0] = '\0';
                                                        x->taglen = 0;
                                                } else {
                                                        /* start tag */
                                                        if (x->xmltagstart)
                                                                x->xmltagstart(x, x->tag, x->taglen);
                                                        /* attributes can only follow whitespace;
                                                         * xml_parseattrs() may set x->isshorttag */
                                                        if (ISSPACE(c))
                                                                xml_parseattrs(x);
                                                        if (x->xmltagstartparsed)
                                                                x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
                                                }
                                                /* call tagend for short tag or processing instruction */
                                                if (x->isshorttag) {
                                                        if (x->xmltagend)
                                                                x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
                                                        x->tag[0] = '\0';
                                                        x->taglen = 0;
                                                }
                                                break;
                                        } else if (x->taglen < sizeof(x->tag) - 1)
                                                x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
                                }
                        }
                } else {
                        /* parse tag data */
                        /* the current c ended the previous construct and is not part of
                         * the data; the data starts with the next character read */
                        datalen = 0;
                        if (x->xmldatastart)
                                x->xmldatastart(x);
                        /* NOTE: when EOF ends this loop, data still buffered is not
                         * reported and xmldataend is not called */
                        while ((c = GETNEXT()) != EOF) {
                                if (c == '&') {
                                        /* report the data in front of the entity first */
                                        if (datalen) {
                                                x->data[datalen] = '\0';
                                                if (x->xmldata)
                                                        x->xmldata(x, x->data, datalen);
                                        }
                                        /* collect the entity, starting with its '&' */
                                        x->data[0] = c;
                                        datalen = 1;
                                        while ((c = GETNEXT()) != EOF) {
                                                /* a tag starts before ';' was seen: the partial
                                                 * entity is reported as plain data below */
                                                if (c == '<')
                                                        break;
                                                if (datalen < sizeof(x->data) - 1)
                                                        x->data[datalen++] = c;
                                                else {
                                                        /* entity too long for buffer, handle as normal data */
                                                        x->data[datalen] = '\0';
                                                        if (x->xmldata)
                                                                x->xmldata(x, x->data, datalen);
                                                        x->data[0] = c;
                                                        datalen = 1;
                                                        break;
                                                }
                                                if (c == ';') {
                                                        /* complete entity: report "&...;" as-is */
                                                        x->data[datalen] = '\0';
                                                        if (x->xmldataentity)
                                                                x->xmldataentity(x, x->data, datalen);
                                                        datalen = 0;
                                                        break;
                                                }
                                        }
                                } else if (c != '<') {
                                        /* ordinary data byte; flush the buffer first when full */
                                        if (datalen < sizeof(x->data) - 1) {
                                                x->data[datalen++] = c;
                                        } else {
                                                x->data[datalen] = '\0';
                                                if (x->xmldata)
                                                        x->xmldata(x, x->data, datalen);
                                                x->data[0] = c;
                                                datalen = 1;
                                        }
                                }
                                if (c == '<') {
                                        /* next tag: report the remaining data and leave c as
                                         * '<' so that the outer loop parses the tag */
                                        x->data[datalen] = '\0';
                                        if (x->xmldata && datalen)
                                                x->xmldata(x, x->data, datalen);
                                        if (x->xmldataend)
                                                x->xmldataend(x);
                                        break;
                                }
                        }
                }
        }
}