xml.c - webdump - HTML to plain-text converter for webpages
 (HTM) git clone git://git.codemadness.org/webdump
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       xml.c (12246B)
       ---
            1 #include <errno.h>
            2 #include <stdio.h>
            3 #include <stdlib.h>
            4 #include <string.h>
            5 
            6 #include "xml.h"
            7 
            8 /* ifdef for HTML mode. To differentiate xml.c and webdump HTML changes */
            9 #define HTML_MODE
           10 
           11 #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26)
           12 #define ISDIGIT(c) (((unsigned)c) - '0' < 10)
           13 #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5))
           14 #define ISXDIGIT(c) ((((unsigned)c) - '0' < 10) || (((unsigned)c) | 32) - 'a' < 6)
           15 
/*
 * Parse the attributes that follow a tag name.
 *
 * Characters are read with GETNEXT() until '>' or EOF. For every attribute
 * the callbacks on x are invoked, when set, in the order
 * xmlattrstart -> xmlattr / xmlattrentity (zero or more times) -> xmlattrend,
 * each receiving the current tag (x->tag, x->taglen) and attribute name.
 *
 * The attribute name is collected in x->name; characters beyond
 * sizeof(x->name) - 1 are dropped. The value is collected in x->data and is
 * handed to xmlattr in several chunks when it does not fit the buffer.
 * A '/' marks the tag as a short tag by setting x->isshorttag.
 */
static void
xml_parseattrs(XMLParser *x)
{
        size_t namelen = 0, valuelen;
        /* endsep:     terminator of the current value: the opening quote
         *             character, or ' ' meaning "unquoted value".
         * endname:    set once whitespace follows a non-empty name or '=' is seen.
         * valuestart: set once '=' is seen; the next non-space character
         *             then begins the value (if a name was collected). */
        int c, endsep, endname = 0, valuestart = 0;

        while ((c = GETNEXT()) != EOF) {
                if (ISSPACE(c)) {
                        if (namelen)
                                endname = 1;
                        continue;
                } else if (c == '?')
                        ; /* ignore */
                else if (c == '=') {
                        x->name[namelen] = '\0';
                        valuestart = 1;
                        endname = 1;
                } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
                        /* attribute without value */
                        x->name[namelen] = '\0';
                        if (x->xmlattrstart)
                                x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
                        if (x->xmlattr)
                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
                        if (x->xmlattrend)
                                x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
                        endname = 0;
                        /* c becomes the first character of the next name; for
                         * '>' and '/' this is undone by the checks at the end
                         * of the loop body */
                        x->name[0] = c;
                        namelen = 1;
                } else if (namelen && valuestart) {
                        /* attribute with value */
                        if (x->xmlattrstart)
                                x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);

                        valuelen = 0;
                        if (c == '\'' || c == '"') {
                                endsep = c;
                        } else {
                                endsep = ' '; /* ISSPACE() */
                                /* unquoted: c already is the first character
                                 * of the value, so skip the read below */
                                goto startvalue;
                        }

                        while ((c = GETNEXT()) != EOF) {
startvalue:
                                if (c == '&') { /* entities */
                                        x->data[valuelen] = '\0';
                                        /* call data function with data before entity if there is data */
                                        if (valuelen && x->xmlattr)
                                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
                                        x->data[0] = c;
                                        valuelen = 1;
                                        /* collect the entity, "&" up to and including ";" */
                                        while ((c = GETNEXT()) != EOF) {
                                                /* value ended before ';': the text collected so far
                                                 * stays in x->data and is reported as plain value
                                                 * data by the end-of-value check below */
                                                if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
                                                        break;
                                                if (valuelen < sizeof(x->data) - 1)
                                                        x->data[valuelen++] = c;
                                                else {
                                                        /* entity too long for buffer, handle as normal data */
                                                        x->data[valuelen] = '\0';
                                                        if (x->xmlattr)
                                                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
                                                        x->data[0] = c;
                                                        valuelen = 1;
                                                        break;
                                                }
                                                if (c == ';') {
                                                        x->data[valuelen] = '\0';
                                                        if (x->xmlattrentity)
                                                                x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
                                                        valuelen = 0;
                                                        break;
                                                }
                                        }
                                } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
                                        /* ordinary value character: buffer it, flushing
                                         * the buffer through xmlattr first when it is full */
                                        if (valuelen < sizeof(x->data) - 1) {
                                                x->data[valuelen++] = c;
                                        } else {
                                                x->data[valuelen] = '\0';
                                                if (x->xmlattr)
                                                        x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
                                                x->data[0] = c;
                                                valuelen = 1;
                                        }
                                }
                                /* end of value: closing quote, or for an unquoted
                                 * value whitespace or '>' */
                                if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
                                        x->data[valuelen] = '\0';
                                        if (x->xmlattr)
                                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
                                        if (x->xmlattrend)
                                                x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
                                        break;
                                }
                        }
                        /* reset the state for the next attribute */
                        namelen = endname = valuestart = 0;
                } else if (namelen < sizeof(x->name) - 1) {
                        /* part of the attribute name; excess characters are dropped */
                        x->name[namelen++] = c;
                }
                if (c == '>') {
                        /* end of the tag */
                        break;
                } else if (c == '/') {
                        /* short tag marker; it is not part of any attribute name */
                        x->isshorttag = 1;
                        x->name[0] = '\0';
                        namelen = 0;
                }
        }
}
          122 
          123 static void
          124 xml_parsecomment(XMLParser *x)
          125 {
          126         size_t datalen = 0, i = 0;
          127         int c;
          128 
          129         if (x->xmlcommentstart)
          130                 x->xmlcommentstart(x);
          131         while ((c = GETNEXT()) != EOF) {
          132                 if (c == '-' || c == '>') {
          133                         if (x->xmlcomment && datalen) {
          134                                 x->data[datalen] = '\0';
          135                                 x->xmlcomment(x, x->data, datalen);
          136                                 datalen = 0;
          137                         }
          138                 }
          139 
          140                 if (c == '-') {
          141                         if (++i > 2) {
          142                                 if (x->xmlcomment)
          143                                         for (; i > 2; i--)
          144                                                 x->xmlcomment(x, "-", 1);
          145                                 i = 2;
          146                         }
          147                         continue;
          148                 } else if (c == '>' && i == 2) {
          149                         if (x->xmlcommentend)
          150                                 x->xmlcommentend(x);
          151                         return;
          152                 } else if (i) {
          153                         if (x->xmlcomment) {
          154                                 for (; i > 0; i--)
          155                                         x->xmlcomment(x, "-", 1);
          156                         }
          157                         i = 0;
          158                 }
          159 
          160                 if (datalen < sizeof(x->data) - 1) {
          161                         x->data[datalen++] = c;
          162                 } else {
          163                         x->data[datalen] = '\0';
          164                         if (x->xmlcomment)
          165                                 x->xmlcomment(x, x->data, datalen);
          166                         x->data[0] = c;
          167                         datalen = 1;
          168                 }
          169         }
          170 }
          171 
          172 static void
          173 xml_parsecdata(XMLParser *x)
          174 {
          175         size_t datalen = 0, i = 0;
          176         int c;
          177 
          178         if (x->xmlcdatastart)
          179                 x->xmlcdatastart(x);
          180         while ((c = GETNEXT()) != EOF) {
          181                 if (c == ']' || c == '>') {
          182                         if (x->xmlcdata && datalen) {
          183                                 x->data[datalen] = '\0';
          184                                 x->xmlcdata(x, x->data, datalen);
          185                                 datalen = 0;
          186                         }
          187                 }
          188 
          189                 if (c == ']') {
          190                         if (++i > 2) {
          191                                 if (x->xmlcdata)
          192                                         for (; i > 2; i--)
          193                                                 x->xmlcdata(x, "]", 1);
          194                                 i = 2;
          195                         }
          196                         continue;
          197                 } else if (c == '>' && i == 2) {
          198                         if (x->xmlcdataend)
          199                                 x->xmlcdataend(x);
          200                         return;
          201                 } else if (i) {
          202                         if (x->xmlcdata)
          203                                 for (; i > 0; i--)
          204                                         x->xmlcdata(x, "]", 1);
          205                         i = 0;
          206                 }
          207 
          208                 if (datalen < sizeof(x->data) - 1) {
          209                         x->data[datalen++] = c;
          210                 } else {
          211                         x->data[datalen] = '\0';
          212                         if (x->xmlcdata)
          213                                 x->xmlcdata(x, x->data, datalen);
          214                         x->data[0] = c;
          215                         datalen = 1;
          216                 }
          217         }
          218 }
          219 
          220 static int
          221 codepointtoutf8(long r, char *s)
          222 {
          223         if (r == 0) {
          224                 return 0; /* NUL byte */
          225         } else if (r <= 0x7F) {
          226                 /* 1 byte: 0aaaaaaa */
          227                 s[0] = r;
          228                 return 1;
          229         } else if (r <= 0x07FF) {
          230                 /* 2 bytes: 00000aaa aabbbbbb */
          231                 s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
          232                 s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
          233                 return 2;
          234         } else if (r <= 0xFFFF) {
          235                 /* 3 bytes: aaaabbbb bbcccccc */
          236                 s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
          237                 s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
          238                 s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
          239                 return 3;
          240         } else {
          241                 /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
          242                 s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
          243                 s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
          244                 s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
          245                 s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
          246                 return 4;
          247         }
          248 }
          249 
          250 struct namedentity {
          251         const char *entity;
          252         long cp;
          253 };
          254 
          255 static int
          256 namedentitycmp(const void *v1, const void *v2)
          257 {
          258         struct namedentity *n1 = (struct namedentity *)v1;
          259         struct namedentity *n2 = (struct namedentity *)v2;
          260 
          261         return strcmp(n1->entity, n2->entity);
          262 }
          263 
/*
 * Table of the known named entities; the initializers come from
 * namedentities.h.
 * NOTE(review): namedentitytostr() searches this table with bsearch() and
 * namedentitycmp(), so the entries in namedentities.h presumably have to be
 * sorted in strcmp() order of the entity name - confirm against the header.
 */
static const struct namedentity entities[] = {
#include "namedentities.h"
};
          267 
          268 static int
          269 namedentitytostr(const char *e, char *buf, size_t bufsiz)
          270 {
          271         struct namedentity find, *found;
          272         size_t i;
          273 
          274         /* buffer is too small */
          275         if (bufsiz < 5)
          276                 return -1;
          277 
          278         find.entity = e;
          279         found = bsearch(&find, entities, sizeof(entities) / sizeof(*entities),
          280                 sizeof(*entities), namedentitycmp);
          281         if (found) {
          282                 i = codepointtoutf8(found->cp, buf);
          283                 buf[i] = '\0';
          284                 return i;
          285         }
          286         return -1;
          287 }
          288 
          289 static int
          290 numericentitytostr(const char *e, char *buf, size_t bufsiz)
          291 {
          292         long l;
          293         int base, len;
          294         const char *s;
          295         char *end;
          296 
          297         /* buffer is too small */
          298         if (bufsiz < 5)
          299                 return -1;
          300 
          301         /* hex (base 16) or decimal (base 10) */
          302         if (*e == 'x') {
          303                 e++;
          304                 for (s = e; *s && *s != ';'; s++) {
          305                         if (!ISXDIGIT((unsigned char)*s))
          306                                 return -1; /* invalid: no hex */
          307                 }
          308                 base = 16;
          309 
          310         } else {
          311                 for (s = e; *s && *s != ';'; s++) {
          312                         if (!ISDIGIT((unsigned char)*s))
          313                                 return -1; /* invalid: no digits */
          314                 }
          315                 base = 10;
          316         }
          317         if (*s != ';' || *(s + 1) != '\0')
          318                 return -1; /* must end with ';' NUL */
          319 
          320         errno = 0;
          321         l = strtol(e, &end, base);
          322 
          323         /* invalid value or not a well-formed entity or invalid code point */
          324         if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff ||
          325             (l >= 0xd800 && l <= 0xdfff)) /* surrogate range */
          326                 return -1;
          327         len = codepointtoutf8(l, buf);
          328         buf[len] = '\0';
          329 
          330         return len;
          331 }
          332 
          333 /* convert named- or numeric entity string to buffer string
          334  * returns byte-length of string or -1 on failure. */
          335 int
          336 xml_entitytostr(const char *e, char *buf, size_t bufsiz)
          337 {
          338         /* doesn't start with & */
          339         if (e[0] != '&')
          340                 return -1;
          341         /* numeric entity */
          342         if (e[1] == '#')
          343                 return numericentitytostr(e + 2, buf, bufsiz);
          344         else /* named entity */
          345                 return namedentitytostr(e + 1, buf, bufsiz);
          346 }
          347 
/*
 * Parse the whole input, read one character at a time with GETNEXT(), until
 * EOF and report what is found through the callbacks set on x (callbacks
 * that are not set are skipped):
 *
 *   tags:     xmltagstart, xmltagstartparsed (after the attributes), xmltagend
 *   text:     xmldatastart, xmldata / xmldataentity, xmldataend
 *   comments and CDATA sections: see xml_parsecomment() and xml_parsecdata()
 *
 * The tag name is kept in x->tag / x->taglen and text is buffered in
 * x->data; text longer than the buffer is reported in several xmldata calls.
 *
 * With HTML_MODE defined, text before the first tag and text still pending
 * at EOF is reported as well.
 */
void
xml_parse(XMLParser *x)
{
        size_t datalen, tagdatalen;
        int c, isend;

#ifdef HTML_MODE
        /* HTML: also process data that occurs before the first tag */
        goto read_data;
#else
        while ((c = GETNEXT()) != EOF && c != '<')
                ; /* skip until < */
#endif

        while (c != EOF) {
                if (c == '<') { /* parse tag */
                        if ((c = GETNEXT()) == EOF)
                                return;

                        if (c == '!') { /* CDATA and comments */
                                /* the first bytes after "<!" are kept in x->data to
                                 * recognize "--" and "[CDATA["; anything else is
                                 * skipped up to the next '>' without callbacks */
                                for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
                                        /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
                                        if (tagdatalen <= sizeof("[CDATA[") - 1)
                                                x->data[tagdatalen++] = c;
                                        if (c == '>')
                                                break;
                                        else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
                                                        (x->data[0] == '-')) {
                                                xml_parsecomment(x);
                                                break;
                                        } else if (c == '[') {
                                                if (tagdatalen == sizeof("[CDATA[") - 1 &&
                                                    !strncmp(x->data, "[CDATA[", tagdatalen)) {
                                                        xml_parsecdata(x);
                                                        break;
                                                }
                                        }
                                }
                        } else {
                                /* normal tag (open, short open, close), processing instruction. */
                                x->tag[0] = c;
                                x->taglen = 1;
                                x->isshorttag = isend = 0;

                                /* treat processing instruction as short tag, don't strip "?" prefix. */
                                if (c == '?') {
                                        x->isshorttag = 1;
                                } else if (c == '/') {
                                        /* end tag: the name starts after the '/' */
                                        if ((c = GETNEXT()) == EOF)
                                                return;
                                        x->tag[0] = c;
                                        isend = 1;
                                }

                                /* read the rest of the tag name, it ends at whitespace or '>' */
                                while ((c = GETNEXT()) != EOF) {
                                        if (c == '/')
                                                x->isshorttag = 1; /* short tag */
                                        else if (c == '>' || ISSPACE(c)) {
                                                x->tag[x->taglen] = '\0';
                                                if (isend) { /* end tag, starts with </ */
                                                        while (c != '>' && c != EOF) /* skip until > */
                                                                c = GETNEXT();
                                                        if (x->xmltagend)
                                                                x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
                                                        x->tag[0] = '\0';
                                                        x->taglen = 0;
                                                } else {
                                                        /* start tag */
                                                        if (x->xmltagstart)
                                                                x->xmltagstart(x, x->tag, x->taglen);
                                                        /* attributes can only follow when the name
                                                         * ended with whitespace */
                                                        if (ISSPACE(c))
                                                                xml_parseattrs(x);
                                                        if (x->xmltagstartparsed)
                                                                x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
                                                }
                                                /* call tagend for short tag or processing instruction */
                                                if (x->isshorttag) {
                                                        if (x->xmltagend)
                                                                x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
                                                        x->tag[0] = '\0';
                                                        x->taglen = 0;
                                                }
                                                break;
                                        } else if (x->taglen < sizeof(x->tag) - 1)
                                                x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
                                }
                        }
                } else {
#ifdef HTML_MODE
read_data:
#endif
                        /* parse tag data */
                        datalen = 0;
                        if (x->xmldatastart)
                                x->xmldatastart(x);
                        while ((c = GETNEXT()) != EOF) {
                                if (c == '&') { /* entities */
                                        /* report the text before the entity first */
                                        if (datalen) {
                                                x->data[datalen] = '\0';
                                                if (x->xmldata)
                                                        x->xmldata(x, x->data, datalen);
                                        }
                                        x->data[0] = c;
                                        datalen = 1;
                                        /* collect the entity, "&" up to and including ";" */
                                        while ((c = GETNEXT()) != EOF) {
                                                /* a tag starts before ';': the text collected so far
                                                 * is reported as normal data by the check below */
                                                if (c == '<')
                                                        break;
                                                if (datalen < sizeof(x->data) - 1)
                                                        x->data[datalen++] = c;
                                                else {
                                                        /* entity too long for buffer, handle as normal data */
                                                        x->data[datalen] = '\0';
                                                        if (x->xmldata)
                                                                x->xmldata(x, x->data, datalen);
                                                        x->data[0] = c;
                                                        datalen = 1;
                                                        break;
                                                }
                                                if (c == ';') {
                                                        x->data[datalen] = '\0';
                                                        if (x->xmldataentity)
                                                                x->xmldataentity(x, x->data, datalen);
                                                        datalen = 0;
                                                        break;
                                                }
                                        }
                                } else if (c != '<') {
                                        /* ordinary text: buffer it, flushing the buffer
                                         * through xmldata first when it is full */
                                        if (datalen < sizeof(x->data) - 1) {
                                                x->data[datalen++] = c;
                                        } else {
                                                x->data[datalen] = '\0';
                                                if (x->xmldata)
                                                        x->xmldata(x, x->data, datalen);
                                                x->data[0] = c;
                                                datalen = 1;
                                        }
                                }
                                /* start of a tag ends the data; the outer loop parses the tag */
                                if (c == '<') {
                                        x->data[datalen] = '\0';
                                        if (x->xmldata && datalen)
                                                x->xmldata(x, x->data, datalen);
                                        if (x->xmldataend)
                                                x->xmldataend(x);
#ifdef HTML_MODE
                                        datalen = 0;
#endif
                                        break;
                                }
                        }

#ifdef HTML_MODE
                        /* pending data, even if a tag didn't close (EOF, etc). */
                        if (datalen) {
                                x->data[datalen] = '\0';
                                if (x->xmldata && datalen)
                                        x->xmldata(x, x->data, datalen);
                                if (x->xmldataend)
                                        x->xmldataend(x);
                                datalen = 0;
                        }
#endif
                }
        }
}