xml.c - frontends - front-ends for some sites (experiment)
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       xml.c (11981B)
       ---
            1 #include <errno.h>
            2 #include <stdio.h>
            3 #include <stdlib.h>
            4 #include <string.h>
            5 
            6 #include "xml.h"
            7 
            8 #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26)
            9 #define ISDIGIT(c) (((unsigned)c) - '0' < 10)
           10 #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5))
           11 #define ISXDIGIT(c) ((((unsigned)c) - '0' < 10) || (((unsigned)c) | 32) - 'a' < 6)
           12 
           13 /* data buffers, size and offset used for parsing XML, see getnext() */
           14 static const unsigned char *xml_data_buf;
           15 static size_t xml_data_size;
           16 static size_t xml_data_off;
           17 
           18 void
           19 setxmldata(const char *s, size_t len)
           20 {
           21         xml_data_off = 0;
           22         xml_data_size = len;
           23         xml_data_buf = (unsigned char *)s;
           24 }
           25 
           26 static int
           27 getnext(void)
           28 {
           29         if (xml_data_off >= xml_data_size)
           30                 return EOF;
           31         return xml_data_buf[xml_data_off++];
           32 }
           33 
           34 static void
           35 xml_parseattrs(XMLParser *x)
           36 {
           37         size_t namelen = 0, valuelen;
           38         int c, endsep, endname = 0, valuestart = 0;
           39 
           40         while ((c = GETNEXT()) != EOF) {
           41                 if (ISSPACE(c)) {
           42                         if (namelen)
           43                                 endname = 1;
           44                         continue;
           45                 } else if (c == '?')
           46                         ; /* ignore */
           47                 else if (c == '=') {
           48                         x->name[namelen] = '\0';
           49                         valuestart = 1;
           50                         endname = 1;
           51                 } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
           52                         /* attribute without value */
           53                         x->name[namelen] = '\0';
           54                         if (x->xmlattrstart)
           55                                 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
           56                         if (x->xmlattr)
           57                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
           58                         if (x->xmlattrend)
           59                                 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
           60                         endname = 0;
           61                         x->name[0] = c;
           62                         namelen = 1;
           63                 } else if (namelen && valuestart) {
           64                         /* attribute with value */
           65                         if (x->xmlattrstart)
           66                                 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
           67 
           68                         valuelen = 0;
           69                         if (c == '\'' || c == '"') {
           70                                 endsep = c;
           71                         } else {
           72                                 endsep = ' '; /* ISSPACE() */
           73                                 goto startvalue;
           74                         }
           75 
           76                         while ((c = GETNEXT()) != EOF) {
           77 startvalue:
           78                                 if (c == '&') { /* entities */
           79                                         x->data[valuelen] = '\0';
           80                                         /* call data function with data before entity if there is data */
           81                                         if (valuelen && x->xmlattr)
           82                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           83                                         x->data[0] = c;
           84                                         valuelen = 1;
           85                                         while ((c = GETNEXT()) != EOF) {
           86                                                 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
           87                                                         break;
           88                                                 if (valuelen < sizeof(x->data) - 1)
           89                                                         x->data[valuelen++] = c;
           90                                                 else {
           91                                                         /* entity too long for buffer, handle as normal data */
           92                                                         x->data[valuelen] = '\0';
           93                                                         if (x->xmlattr)
           94                                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           95                                                         x->data[0] = c;
           96                                                         valuelen = 1;
           97                                                         break;
           98                                                 }
           99                                                 if (c == ';') {
          100                                                         x->data[valuelen] = '\0';
          101                                                         if (x->xmlattrentity)
          102                                                                 x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          103                                                         valuelen = 0;
          104                                                         break;
          105                                                 }
          106                                         }
          107                                 } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
          108                                         if (valuelen < sizeof(x->data) - 1) {
          109                                                 x->data[valuelen++] = c;
          110                                         } else {
          111                                                 x->data[valuelen] = '\0';
          112                                                 if (x->xmlattr)
          113                                                         x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          114                                                 x->data[0] = c;
          115                                                 valuelen = 1;
          116                                         }
          117                                 }
          118                                 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
          119                                         x->data[valuelen] = '\0';
          120                                         if (x->xmlattr)
          121                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          122                                         if (x->xmlattrend)
          123                                                 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
          124                                         break;
          125                                 }
          126                         }
          127                         namelen = endname = valuestart = 0;
          128                 } else if (namelen < sizeof(x->name) - 1) {
          129                         x->name[namelen++] = c;
          130                 }
          131                 if (c == '>') {
          132                         break;
          133                 } else if (c == '/') {
          134                         x->isshorttag = 1;
          135                         x->name[0] = '\0';
          136                         namelen = 0;
          137                 }
          138         }
          139 }
          140 
          141 static void
          142 xml_parsecomment(XMLParser *x)
          143 {
          144         size_t datalen = 0, i = 0;
          145         int c;
          146 
          147         if (x->xmlcommentstart)
          148                 x->xmlcommentstart(x);
          149         while ((c = GETNEXT()) != EOF) {
          150                 if (c == '-' || c == '>') {
          151                         if (x->xmlcomment && datalen) {
          152                                 x->data[datalen] = '\0';
          153                                 x->xmlcomment(x, x->data, datalen);
          154                                 datalen = 0;
          155                         }
          156                 }
          157 
          158                 if (c == '-') {
          159                         if (++i > 2) {
          160                                 if (x->xmlcomment)
          161                                         for (; i > 2; i--)
          162                                                 x->xmlcomment(x, "-", 1);
          163                                 i = 2;
          164                         }
          165                         continue;
          166                 } else if (c == '>' && i == 2) {
          167                         if (x->xmlcommentend)
          168                                 x->xmlcommentend(x);
          169                         return;
          170                 } else if (i) {
          171                         if (x->xmlcomment) {
          172                                 for (; i > 0; i--)
          173                                         x->xmlcomment(x, "-", 1);
          174                         }
          175                         i = 0;
          176                 }
          177 
          178                 if (datalen < sizeof(x->data) - 1) {
          179                         x->data[datalen++] = c;
          180                 } else {
          181                         x->data[datalen] = '\0';
          182                         if (x->xmlcomment)
          183                                 x->xmlcomment(x, x->data, datalen);
          184                         x->data[0] = c;
          185                         datalen = 1;
          186                 }
          187         }
          188 }
          189 
          190 static void
          191 xml_parsecdata(XMLParser *x)
          192 {
          193         size_t datalen = 0, i = 0;
          194         int c;
          195 
          196         if (x->xmlcdatastart)
          197                 x->xmlcdatastart(x);
          198         while ((c = GETNEXT()) != EOF) {
          199                 if (c == ']' || c == '>') {
          200                         if (x->xmlcdata && datalen) {
          201                                 x->data[datalen] = '\0';
          202                                 x->xmlcdata(x, x->data, datalen);
          203                                 datalen = 0;
          204                         }
          205                 }
          206 
          207                 if (c == ']') {
          208                         if (++i > 2) {
          209                                 if (x->xmlcdata)
          210                                         for (; i > 2; i--)
          211                                                 x->xmlcdata(x, "]", 1);
          212                                 i = 2;
          213                         }
          214                         continue;
          215                 } else if (c == '>' && i == 2) {
          216                         if (x->xmlcdataend)
          217                                 x->xmlcdataend(x);
          218                         return;
          219                 } else if (i) {
          220                         if (x->xmlcdata)
          221                                 for (; i > 0; i--)
          222                                         x->xmlcdata(x, "]", 1);
          223                         i = 0;
          224                 }
          225 
          226                 if (datalen < sizeof(x->data) - 1) {
          227                         x->data[datalen++] = c;
          228                 } else {
          229                         x->data[datalen] = '\0';
          230                         if (x->xmlcdata)
          231                                 x->xmlcdata(x, x->data, datalen);
          232                         x->data[0] = c;
          233                         datalen = 1;
          234                 }
          235         }
          236 }
          237 
          238 static int
          239 codepointtoutf8(long r, char *s)
          240 {
          241         if (r == 0) {
          242                 return 0; /* NUL byte */
          243         } else if (r <= 0x7F) {
          244                 /* 1 byte: 0aaaaaaa */
          245                 s[0] = r;
          246                 return 1;
          247         } else if (r <= 0x07FF) {
          248                 /* 2 bytes: 00000aaa aabbbbbb */
          249                 s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
          250                 s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
          251                 return 2;
          252         } else if (r <= 0xFFFF) {
          253                 /* 3 bytes: aaaabbbb bbcccccc */
          254                 s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
          255                 s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
          256                 s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
          257                 return 3;
          258         } else {
          259                 /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
          260                 s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
          261                 s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
          262                 s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
          263                 s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
          264                 return 4;
          265         }
          266 }
          267 
          268 static int
          269 namedentitytostr(const char *e, char *buf, size_t bufsiz)
          270 {
          271         static const struct {
          272                 const char *entity;
          273                 int c;
          274         } entities[] = {
          275                 { "amp;",  '&'  },
          276                 { "lt;",   '<'  },
          277                 { "gt;",   '>'  },
          278                 { "apos;", '\'' },
          279                 { "quot;", '"'  },
          280                 { "AMP;",  '&'  },
          281                 { "LT;",   '<'  },
          282                 { "GT;",   '>'  },
          283                 { "APOS;", '\'' },
          284                 { "QUOT;", '"'  }
          285         };
          286         size_t i;
          287 
          288         /* buffer is too small */
          289         if (bufsiz < 2)
          290                 return -1;
          291 
          292         for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
          293                 if (!strcmp(e, entities[i].entity)) {
          294                         buf[0] = entities[i].c;
          295                         buf[1] = '\0';
          296                         return 1;
          297                 }
          298         }
          299         return -1;
          300 }
          301 
          302 static int
          303 numericentitytostr(const char *e, char *buf, size_t bufsiz)
          304 {
          305         long l;
          306         int base, len;
          307         const char *s;
          308         char *end;
          309 
          310         /* buffer is too small */
          311         if (bufsiz < 5)
          312                 return -1;
          313 
          314         /* hex (base 16) or decimal (base 10) */
          315         if (*e == 'x') {
          316                 e++;
          317                 for (s = e; *s && *s != ';'; s++) {
          318                         if (!ISXDIGIT((unsigned char)*s))
          319                                 return -1; /* invalid: no hex */
          320                 }
          321                 base = 16;
          322 
          323         } else {
          324                 for (s = e; *s && *s != ';'; s++) {
          325                         if (!ISDIGIT((unsigned char)*s))
          326                                 return -1; /* invalid: no digits */
          327                 }
          328                 base = 10;
          329         }
          330         if (*s != ';' || *(s + 1) != '\0')
          331                 return -1; /* must end with ';' NUL */
          332 
          333         errno = 0;
          334         l = strtol(e, &end, base);
          335 
          336         /* invalid value or not a well-formed entity or invalid code point */
          337         if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff ||
          338             (l >= 0xd800 && l <= 0xdfff)) /* surrogate range */
          339                 return -1;
          340         len = codepointtoutf8(l, buf);
          341         buf[len] = '\0';
          342 
          343         return len;
          344 }
          345 
          346 /* convert named- or numeric entity string to buffer string
          347  * returns byte-length of string or -1 on failure. */
          348 int
          349 xml_entitytostr(const char *e, char *buf, size_t bufsiz)
          350 {
          351         /* doesn't start with & */
          352         if (e[0] != '&')
          353                 return -1;
          354         /* numeric entity */
          355         if (e[1] == '#')
          356                 return numericentitytostr(e + 2, buf, bufsiz);
          357         else /* named entity */
          358                 return namedentitytostr(e + 1, buf, bufsiz);
          359 }
          360 
          361 void
          362 xml_parse(XMLParser *x)
          363 {
          364         size_t datalen, tagdatalen;
          365         int c, isend;
          366 
          367         while ((c = GETNEXT()) != EOF && c != '<')
          368                 ; /* skip until < */
          369 
          370         while (c != EOF) {
          371                 if (c == '<') { /* parse tag */
          372                         if ((c = GETNEXT()) == EOF)
          373                                 return;
          374 
          375                         if (c == '!') { /* CDATA and comments */
          376                                 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
          377                                         /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
          378                                         if (tagdatalen <= sizeof("[CDATA[") - 1)
          379                                                 x->data[tagdatalen++] = c;
          380                                         if (c == '>')
          381                                                 break;
          382                                         else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
          383                                                         (x->data[0] == '-')) {
          384                                                 xml_parsecomment(x);
          385                                                 break;
          386                                         } else if (c == '[') {
          387                                                 if (tagdatalen == sizeof("[CDATA[") - 1 &&
          388                                                     !strncmp(x->data, "[CDATA[", tagdatalen)) {
          389                                                         xml_parsecdata(x);
          390                                                         break;
          391                                                 }
          392                                         }
          393                                 }
          394                         } else {
          395                                 /* normal tag (open, short open, close), processing instruction. */
          396                                 x->tag[0] = c;
          397                                 x->taglen = 1;
          398                                 x->isshorttag = isend = 0;
          399 
          400                                 /* treat processing instruction as short tag, don't strip "?" prefix. */
          401                                 if (c == '?') {
          402                                         x->isshorttag = 1;
          403                                 } else if (c == '/') {
          404                                         if ((c = GETNEXT()) == EOF)
          405                                                 return;
          406                                         x->tag[0] = c;
          407                                         isend = 1;
          408                                 }
          409 
          410                                 while ((c = GETNEXT()) != EOF) {
          411                                         if (c == '/')
          412                                                 x->isshorttag = 1; /* short tag */
          413                                         else if (c == '>' || ISSPACE(c)) {
          414                                                 x->tag[x->taglen] = '\0';
          415                                                 if (isend) { /* end tag, starts with </ */
          416                                                         while (c != '>' && c != EOF) /* skip until > */
          417                                                                 c = GETNEXT();
          418                                                         if (x->xmltagend)
          419                                                                 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
          420                                                         x->tag[0] = '\0';
          421                                                         x->taglen = 0;
          422                                                 } else {
          423                                                         /* start tag */
          424                                                         if (x->xmltagstart)
          425                                                                 x->xmltagstart(x, x->tag, x->taglen);
          426                                                         if (ISSPACE(c))
          427                                                                 xml_parseattrs(x);
          428                                                         if (x->xmltagstartparsed)
          429                                                                 x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
          430                                                 }
          431                                                 /* call tagend for short tag or processing instruction */
          432                                                 if (x->isshorttag) {
          433                                                         if (x->xmltagend)
          434                                                                 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
          435                                                         x->tag[0] = '\0';
          436                                                         x->taglen = 0;
          437                                                 }
          438                                                 break;
          439                                         } else if (x->taglen < sizeof(x->tag) - 1)
          440                                                 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
          441                                 }
          442                         }
          443                 } else {
          444                         /* parse tag data */
          445                         datalen = 0;
          446                         if (x->xmldatastart)
          447                                 x->xmldatastart(x);
          448                         while ((c = GETNEXT()) != EOF) {
          449                                 if (c == '&') { /* entities */
          450                                         if (datalen) {
          451                                                 x->data[datalen] = '\0';
          452                                                 if (x->xmldata)
          453                                                         x->xmldata(x, x->data, datalen);
          454                                         }
          455                                         x->data[0] = c;
          456                                         datalen = 1;
          457                                         while ((c = GETNEXT()) != EOF) {
          458                                                 if (c == '<')
          459                                                         break;
          460                                                 if (datalen < sizeof(x->data) - 1)
          461                                                         x->data[datalen++] = c;
          462                                                 else {
          463                                                         /* entity too long for buffer, handle as normal data */
          464                                                         x->data[datalen] = '\0';
          465                                                         if (x->xmldata)
          466                                                                 x->xmldata(x, x->data, datalen);
          467                                                         x->data[0] = c;
          468                                                         datalen = 1;
          469                                                         break;
          470                                                 }
          471                                                 if (c == ';') {
          472                                                         x->data[datalen] = '\0';
          473                                                         if (x->xmldataentity)
          474                                                                 x->xmldataentity(x, x->data, datalen);
          475                                                         datalen = 0;
          476                                                         break;
          477                                                 }
          478                                         }
          479                                 } else if (c != '<') {
          480                                         if (datalen < sizeof(x->data) - 1) {
          481                                                 x->data[datalen++] = c;
          482                                         } else {
          483                                                 x->data[datalen] = '\0';
          484                                                 if (x->xmldata)
          485                                                         x->xmldata(x, x->data, datalen);
          486                                                 x->data[0] = c;
          487                                                 datalen = 1;
          488                                         }
          489                                 }
          490                                 if (c == '<') {
          491                                         x->data[datalen] = '\0';
          492                                         if (x->xmldata && datalen)
          493                                                 x->xmldata(x, x->data, datalen);
          494                                         if (x->xmldataend)
          495                                                 x->xmldataend(x);
          496                                         break;
          497                                 }
          498                         }
          499                 }
          500         }
          501 }