xml.c - xml2tsv - a simple xml-to-tsv converter, based on xmlparser
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) Tags
 (DIR) README
 (DIR) LICENSE
       ---
       xml.c (10070B)
       ---
            1 #include <ctype.h>
            2 #include <errno.h>
            3 #include <stdio.h>
            4 #include <stdlib.h>
            5 #include <string.h>
            6 
            7 #include "xml.h"
            8 
            9 static void
           10 xml_parseattrs(XMLParser *x)
           11 {
                      /*
                       * Parse the attribute part of the current tag: read characters
                       * with GETNEXT() until '>' or EOF, collect each attribute name in
                       * x->name and each value in x->data, and report them through the
                       * x->xmlattrstart, x->xmlattr, x->xmlattrentity and x->xmlattrend
                       * callbacks (a callback is only invoked when it is set). The tag
                       * passed to the callbacks is x->tag with length x->taglen.
                       * x->isshorttag is set when a '/' is read outside a value.
                       */
           12         size_t namelen = 0, valuelen;
           13         int c, endsep, endname = 0, valuestart = 0;
           14 
           15         while ((c = GETNEXT()) != EOF) {
           16                 if (isspace(c)) {
                                      /* whitespace after a started name marks that name as finished */
           17                         if (namelen)
           18                                 endname = 1;
           19                         continue;
           20                 } else if (c == '?')
           21                         ; /* ignore: '?' is neither stored in the name nor starts a value */
           22                 else if (c == '=') {
                                      /* terminate the name; valuestart enables the value branch below */
           23                         x->name[namelen] = '\0';
           24                         valuestart = 1;
           25                         endname = 1;
           26                 } else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) {
           27                         /* attribute without value: either an alphabetic character follows
                                       * a finished name that had no '=', or the tag ends ('>' or '/').
                                       * It is reported with an empty value. */
           28                         x->name[namelen] = '\0';
           29                         if (x->xmlattrstart)
           30                                 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
           31                         if (x->xmlattr)
           32                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
           33                         if (x->xmlattrend)
           34                                 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
           35                         endname = 0;
                                      /* c becomes the first character of the next name; for '>' the loop
                                       * ends below and for '/' the name is reset below */
           36                         x->name[0] = c;
           37                         namelen = 1;
           38                 } else if (namelen && valuestart) {
           39                         /* attribute with value */
           40                         if (x->xmlattrstart)
           41                                 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
           42 
           43                         valuelen = 0;
           44                         if (c == '\'' || c == '"') {
                                              /* quoted value: ends at the matching quote character */
           45                                 endsep = c;
           46                         } else {
                                              /* unquoted value: endsep == ' ' is used as a marker meaning
                                               * "ends at any whitespace or at '>'". c is already the first
                                               * character of the value, so jump into the loop body without
                                               * reading another character. */
           47                                 endsep = ' '; /* isspace() */
           48                                 goto startvalue;
           49                         }
           50 
           51                         while ((c = GETNEXT()) != EOF) {
           52 startvalue:
           53                                 if (c == '&') { /* entities */
           54                                         x->data[valuelen] = '\0';
           55                                         /* call data function with data before entity if there is data */
           56                                         if (valuelen && x->xmlattr)
           57                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
                                                      /* restart the buffer with '&' and collect the entity up to ';' */
           58                                         x->data[0] = c;
           59                                         valuelen = 1;
           60                                         while ((c = GETNEXT()) != EOF) {
                                                              /* value ended before ';': the end-of-value check after
                                                               * this loop reports the collected text via xmlattr */
           61                                                 if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c))))
           62                                                         break;
           63                                                 if (valuelen < sizeof(x->data) - 1)
           64                                                         x->data[valuelen++] = c;
           65                                                 else {
           66                                                         /* entity too long for buffer, handle as normal data */
           67                                                         x->data[valuelen] = '\0';
           68                                                         if (x->xmlattr)
           69                                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           70                                                         x->data[0] = c;
           71                                                         valuelen = 1;
           72                                                         break;
           73                                                 }
           74                                                 if (c == ';') {
                                                                      /* complete entity ("&...;"): pass it undecoded to the entity callback */
           75                                                         x->data[valuelen] = '\0';
           76                                                         if (x->xmlattrentity)
           77                                                                 x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           78                                                         valuelen = 0;
           79                                                         break;
           80                                                 }
           81                                         }
           82                                 } else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) {
                                                      /* ordinary value character; when the buffer is full the
                                                       * collected chunk is reported and the buffer restarted */
           83                                         if (valuelen < sizeof(x->data) - 1) {
           84                                                 x->data[valuelen++] = c;
           85                                         } else {
           86                                                 x->data[valuelen] = '\0';
           87                                                 if (x->xmlattr)
           88                                                         x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           89                                                 x->data[0] = c;
           90                                                 valuelen = 1;
           91                                         }
           92                                 }
                                              /* end of value: report the remaining data (possibly empty)
                                               * and close the attribute */
           93                                 if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) {
           94                                         x->data[valuelen] = '\0';
           95                                         if (x->xmlattr)
           96                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           97                                         if (x->xmlattrend)
           98                                                 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
           99                                         break;
          100                                 }
          101                         }
                                      /* reset the state for the next attribute */
          102                         namelen = endname = valuestart = 0;
          103                 } else if (namelen < sizeof(x->name) - 1) {
                                      /* attribute name character; characters that do not fit are dropped */
          104                         x->name[namelen++] = c;
          105                 }
          106                 if (c == '>') {
                                      /* end of the tag */
          107                         break;
          108                 } else if (c == '/') {
                                      /* short tag marker: flag it and discard the name collected so far */
          109                         x->isshorttag = 1;
          110                         x->name[0] = '\0';
          111                         namelen = 0;
          112                 }
          113         }
          114 }
          115 
          116 static void
          117 xml_parsecomment(XMLParser *x)
          118 {
          119         size_t i = 0;
          120         int c;
          121 
          122         while ((c = GETNEXT()) != EOF) {
          123                 if (c == '-') {
          124                         if (++i > 2)
          125                                 i = 2;
          126                         continue;
          127                 } else if (c == '>' && i == 2) {
          128                         return;
          129                 } else if (i) {
          130                         i = 0;
          131                 }
          132         }
          133 }
          134 
          135 static void
          136 xml_parsecdata(XMLParser *x)
          137 {
          138         size_t datalen = 0, i = 0;
          139         int c;
          140 
          141         if (x->xmlcdatastart)
          142                 x->xmlcdatastart(x);
          143         while ((c = GETNEXT()) != EOF) {
          144                 if (c == ']' || c == '>') {
          145                         if (x->xmlcdata && datalen) {
          146                                 x->data[datalen] = '\0';
          147                                 x->xmlcdata(x, x->data, datalen);
          148                                 datalen = 0;
          149                         }
          150                 }
          151 
          152                 if (c == ']') {
          153                         if (++i > 2) {
          154                                 if (x->xmlcdata)
          155                                         for (; i > 2; i--)
          156                                                 x->xmlcdata(x, "]", 1);
          157                                 i = 2;
          158                         }
          159                         continue;
          160                 } else if (c == '>' && i == 2) {
          161                         if (x->xmlcdataend)
          162                                 x->xmlcdataend(x);
          163                         return;
          164                 } else if (i) {
          165                         if (x->xmlcdata)
          166                                 for (; i > 0; i--)
          167                                         x->xmlcdata(x, "]", 1);
          168                         i = 0;
          169                 }
          170 
          171                 if (datalen < sizeof(x->data) - 1) {
          172                         x->data[datalen++] = c;
          173                 } else {
          174                         x->data[datalen] = '\0';
          175                         if (x->xmlcdata)
          176                                 x->xmlcdata(x, x->data, datalen);
          177                         x->data[0] = c;
          178                         datalen = 1;
          179                 }
          180         }
          181 }
          182 
          /* Encode the code point r as UTF-8 into s (no NUL terminator is written).
           * Returns the number of bytes stored: 0 when r is 0, otherwise 1 to 4.
           * The value of r is not validated here. */
          static int
          codepointtoutf8(long r, char *s)
          {
                  if (r == 0)
                          return 0; /* NUL byte: nothing is written */

                  if (r <= 0x7F) {
                          /* 0aaaaaaa */
                          s[0] = r;
                          return 1;
                  }

                  if (r <= 0x07FF) {
                          /* 110aaaaa 10bbbbbb */
                          s[0] = 0xC0 | ((r >> 6) & 0x1F);
                          s[1] = 0x80 | (r & 0x3F);
                          return 2;
                  }

                  if (r <= 0xFFFF) {
                          /* 1110aaaa 10bbbbbb 10cccccc */
                          s[0] = 0xE0 | ((r >> 12) & 0x0F);
                          s[1] = 0x80 | ((r >> 6) & 0x3F);
                          s[2] = 0x80 | (r & 0x3F);
                          return 3;
                  }

                  /* 11110aaa 10bbbbbb 10cccccc 10dddddd */
                  s[0] = 0xF0 | ((r >> 18) & 0x07);
                  s[1] = 0x80 | ((r >> 12) & 0x3F);
                  s[2] = 0x80 | ((r >> 6) & 0x3F);
                  s[3] = 0x80 | (r & 0x3F);
                  return 4;
          }
          212 
          /* Translate a named entity, given without the leading '&' but including
           * the trailing ';' (e.g. "amp;"), into its single character.
           * On success the character and a NUL terminator are stored in buf and 1
           * is returned. Returns -1 when bufsiz is smaller than 2 or the name is
           * not one of the five names in the table below. */
          static int
          namedentitytostr(const char *e, char *buf, size_t bufsiz)
          {
                  static const char *const names[] = {
                          "amp;", "lt;", "gt;", "apos;", "quot;"
                  };
                  static const char chars[] = {
                          '&', '<', '>', '\'', '"'
                  };
                  size_t k;

                  /* need room for one character plus the terminator */
                  if (bufsiz < 2)
                          return -1;

                  for (k = 0; k < sizeof(names) / sizeof(names[0]); k++) {
                          if (strcmp(e, names[k]) != 0)
                                  continue;
                          buf[0] = chars[k];
                          buf[1] = '\0';
                          return 1;
                  }

                  return -1;
          }
          241 
          /* Convert a numeric character reference to UTF-8.
           * e points just past the "&#" prefix: either decimal digits or 'x'
           * followed by hexadecimal digits, terminated by ';'.
           * On success the encoded bytes and a NUL terminator are stored in buf
           * and the byte length (without the terminator) is returned; this is 0
           * for code point 0. Returns -1 when bufsiz is smaller than 5, the number
           * is malformed or not terminated by ';', or the value is not an
           * encodable code point (negative, above 0x10ffff, or a UTF-16 surrogate
           * 0xd800..0xdfff, which must not appear in UTF-8).
           * NOTE(review): strtol() also accepts leading whitespace and a sign;
           * that leniency is kept unchanged here. */
          static int
          numericentitytostr(const char *e, char *buf, size_t bufsiz)
          {
                  long l;
                  int len;
                  char *end;

                  /* buffer is too small: up to 4 UTF-8 bytes plus the terminator */
                  if (bufsiz < 5)
                          return -1;

                  errno = 0;
                  /* hex (16) or decimal (10) */
                  if (*e == 'x')
                          l = strtol(++e, &end, 16);
                  else
                          l = strtol(e, &end, 10);

                  /* conversion error, no digits at all, or missing ';' */
                  if (errno || e == end || *end != ';')
                          return -1;
                  /* outside the code point range, or a surrogate: encoding it would
                   * produce invalid UTF-8 */
                  if (l < 0 || l > 0x10ffff || (l >= 0xd800 && l <= 0xdfff))
                          return -1;

                  len = codepointtoutf8(l, buf);
                  buf[len] = '\0';

                  return len;
          }
          267 
          /* Convert an entity string to its text form in buf.
           * e must start with '&'; "&#..." is handled as a numeric entity by
           * numericentitytostr(), anything else as a named entity by
           * namedentitytostr().
           * Returns the byte length of the string stored in buf, or -1 on failure. */
          int
          xml_entitytostr(const char *e, char *buf, size_t bufsiz)
          {
                  /* not an entity at all */
                  if (e[0] != '&')
                          return -1;

                  return (e[1] == '#')
                          ? numericentitytostr(e + 2, buf, bufsiz)  /* skip "&#" */
                          : namedentitytostr(e + 1, buf, bufsiz);   /* skip "&" */
          }
          282 
          283 void
          284 xml_parse(XMLParser *x)
          285 {
                      /*
                       * Parse the input read with GETNEXT() until EOF and report what is
                       * found through the callbacks stored in x (a callback is only
                       * invoked when it is set): tags via xmltagstart, xmltagstartparsed
                       * and xmltagend, attributes via xml_parseattrs(), CDATA sections via
                       * xml_parsecdata(), and character data via xmldatastart, xmldata,
                       * xmldataentity and xmldataend. Comments are skipped with
                       * xml_parsecomment(). x->tag and x->data are used as work buffers.
                       */
          286         size_t datalen, tagdatalen;
          287         int c, isend;
          288 
          289         while ((c = GETNEXT()) != EOF && c != '<')
          290                 ; /* skip until < */
          291 
                      /* each iteration handles one tag (c == '<') or one run of character data */
          292         while (c != EOF) {
          293                 if (c == '<') { /* parse tag */
          294                         if ((c = GETNEXT()) == EOF)
          295                                 return;
          296 
          297                         if (c == '!') { /* cdata and comments */
                                              /* the first characters after "<!" are stored in x->data to
                                               * recognize "--" or "[CDATA["; anything else is skipped up
                                               * to the next '>' */
          298                                 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
          299                                         /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
          300                                         if (tagdatalen <= sizeof("[CDATA[") - 1)
          301                                                 x->data[tagdatalen++] = c;
          302                                         if (c == '>')
          303                                                 break;
          304                                         else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
          305                                                         (x->data[0] == '-')) {
                                                              /* "<!--": skip the comment */
          306                                                 xml_parsecomment(x);
          307                                                 break;
          308                                         } else if (c == '[') {
          309                                                 if (tagdatalen == sizeof("[CDATA[") - 1 &&
          310                                                     !strncmp(x->data, "[CDATA[", tagdatalen)) {
                                                                      /* "<![CDATA[": parse the section content */
          311                                                         xml_parsecdata(x);
          312                                                         break;
          313                                                 }
          314                                         }
          315                                 }
          316                         } else {
          317                                 /* normal tag (open, short open, close), processing instruction. */
          318                                 x->tag[0] = c;
          319                                 x->taglen = 1;
          320                                 x->isshorttag = isend = 0;
          321 
          322                                 /* treat processing instruction as shorttag, don't strip "?" prefix. */
          323                                 if (c == '?') {
          324                                         x->isshorttag = 1;
          325                                 } else if (c == '/') {
                                                      /* end tag: the '/' is not kept, the character after it
                                                       * becomes the first character of the tag name */
          326                                         if ((c = GETNEXT()) == EOF)
          327                                                 return;
          328                                         x->tag[0] = c;
          329                                         isend = 1;
          330                                 }
          331 
                                              /* read the rest of the tag name up to '>' or whitespace */
          332                                 while ((c = GETNEXT()) != EOF) {
          333                                         if (c == '/')
          334                                                 x->isshorttag = 1; /* short tag */
          335                                         else if (c == '>' || isspace(c)) {
          336                                                 x->tag[x->taglen] = '\0';
          337                                                 if (isend) { /* end tag, starts with </ */
          338                                                         if (x->xmltagend)
          339                                                                 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
          340                                                         x->tag[0] = '\0';
          341                                                         x->taglen = 0;
          342                                                 } else {
          343                                                         /* start tag */
          344                                                         if (x->xmltagstart)
          345                                                                 x->xmltagstart(x, x->tag, x->taglen);
                                                                      /* attributes are only parsed when whitespace followed the name */
          346                                                         if (isspace(c))
          347                                                                 xml_parseattrs(x);
          348                                                         if (x->xmltagstartparsed)
          349                                                                 x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
          350                                                 }
          351                                                 /* call tagend for shortform or processing instruction */
          352                                                 if (x->isshorttag) {
          353                                                         if (x->xmltagend)
          354                                                                 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
          355                                                         x->tag[0] = '\0';
          356                                                         x->taglen = 0;
          357                                                 }
          358                                                 break;
          359                                         } else if (x->taglen < sizeof(x->tag) - 1)
          360                                                 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
          361                                 }
          362                         }
          363                 } else {
          364                         /* parse tag data */
                                      /* c itself (the character that ended the previous construct) is
                                       * not part of the data; collection starts with the next character.
                                       * NOTE: if EOF is reached before a '<', the buffered data is not
                                       * reported and xmldataend is not called. */
          365                         datalen = 0;
          366                         if (x->xmldatastart)
          367                                 x->xmldatastart(x);
          368                         while ((c = GETNEXT()) != EOF) {
          369                                 if (c == '&') {
                                                      /* entity: report the data collected before it, then
                                                       * collect "&...;" in x->data */
          370                                         if (datalen) {
          371                                                 x->data[datalen] = '\0';
          372                                                 if (x->xmldata)
          373                                                         x->xmldata(x, x->data, datalen);
          374                                         }
          375                                         x->data[0] = c;
          376                                         datalen = 1;
          377                                         while ((c = GETNEXT()) != EOF) {
                                                              /* a tag starts before ';': the check for '<' after
                                                               * this loop reports the collected text via xmldata */
          378                                                 if (c == '<')
          379                                                         break;
          380                                                 if (datalen < sizeof(x->data) - 1)
          381                                                         x->data[datalen++] = c;
          382                                                 else {
          383                                                         /* entity too long for buffer, handle as normal data */
          384                                                         x->data[datalen] = '\0';
          385                                                         if (x->xmldata)
          386                                                                 x->xmldata(x, x->data, datalen);
          387                                                         x->data[0] = c;
          388                                                         datalen = 1;
          389                                                         break;
          390                                                 }
          391                                                 if (c == ';') {
                                                                      /* complete entity: pass it undecoded to the entity callback */
          392                                                         x->data[datalen] = '\0';
          393                                                         if (x->xmldataentity)
          394                                                                 x->xmldataentity(x, x->data, datalen);
          395                                                         datalen = 0;
          396                                                         break;
          397                                                 }
          398                                         }
          399                                 } else if (c != '<') {
                                                      /* ordinary data character; when the buffer is full the
                                                       * collected chunk is reported and the buffer restarted */
          400                                         if (datalen < sizeof(x->data) - 1) {
          401                                                 x->data[datalen++] = c;
          402                                         } else {
          403                                                 x->data[datalen] = '\0';
          404                                                 if (x->xmldata)
          405                                                         x->xmldata(x, x->data, datalen);
          406                                                 x->data[0] = c;
          407                                                 datalen = 1;
          408                                         }
          409                                 }
                                              /* start of the next tag: report the remaining data and end the
                                               * data run; the outer loop then parses the tag */
          410                                 if (c == '<') {
          411                                         x->data[datalen] = '\0';
          412                                         if (x->xmldata && datalen)
          413                                                 x->xmldata(x, x->data, datalen);
          414                                         if (x->xmldataend)
          415                                                 x->xmldataend(x);
          416                                         break;
          417                                 }
          418                         }
          419                 }
          420         }
          421 }