xml.c - bag - BAG Kadaster Extract parser (subset)
 (HTM) git clone git://git.codemadness.org/bag
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       xml.c (11249B)
       ---
            1 #include <errno.h>
            2 #include <stdio.h>
            3 #include <stdlib.h>
            4 #include <string.h>
            5 
            6 #include "xml.h"
            7 
            /* ASCII-only, locale-independent character classification.
             * ISALPHA: true for 'A'-'Z' and 'a'-'z' (bit 5 folds upper to lower case,
             * the unsigned subtraction turns the range test into one comparison).
             * ISSPACE: true for ' ' and for '\t', '\n', '\v', '\f', '\r'.
             * The argument is fully parenthesised so that any expression may be
             * passed; note that ISSPACE evaluates its argument twice. */
            #define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
            #define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
           10 
           /* Read-ahead state of getnext(): rbuf[roffset] up to rbuf[rtotal - 1]
            * are the bytes that were read from stdin but not handed out yet. */
           static size_t roffset, rtotal;
           static char rbuf[4096*4];

           /* Return the next byte of stdin as a value in the range 0..255, or EOF
            * once no more input is available. Input is read in blocks of
            * sizeof(rbuf) bytes. A read error terminates the process with exit
            * status 1. */
           int
           getnext(void)
           {
                   if (roffset >= rtotal) {
                           roffset = 0;
                           rtotal = fread(rbuf, 1, sizeof(rbuf), stdin);
                           if (ferror(stdin))
                                   exit(1);
                           /* A short read sets the end-of-file indicator but still
                            * delivers bytes: only report EOF when nothing was read. */
                           if (rtotal == 0)
                                   return EOF;
                   }
                   /* convert through unsigned char so that bytes >= 0x80 are not
                    * returned as negative values (0xFF would look like EOF) */
                   return (unsigned char)rbuf[roffset++];
           }
           33 
           /* Input primitive of the parser: every routine below reads its input
            * with GETNEXT() and compares the int result against EOF. The active
            * definition is getchar_unlocked; the disabled line selects getnext()
            * instead. */
           34 //#define GETNEXT getnext
           35 #define GETNEXT getchar_unlocked
           36 
           /* Parse the attributes of the tag whose name is in x->tag, reading with
            * GETNEXT() until the closing '>' of the tag (or EOF).
            *
            * Callbacks (each one is optional and skipped when NULL):
            *   xmlattrstart  - once per attribute, before its value
            *   xmlattr       - value text; called with "" for an attribute without a
            *                   value and possibly several times for one value (before
            *                   an entity and whenever x->data is full)
            *   xmlattrentity - an entity ("&...;") found inside a value
            *   xmlattrend    - once per attribute, after its value
            *
            * The attribute name is collected in x->name; characters that do not fit
            * are dropped. A '/' outside of a value sets x->isshorttag, a '?' is
            * ignored. */
           37 static void
           38 xml_parseattrs(XMLParser *x)
           39 {
           40         size_t namelen = 0, valuelen;
           41         int c, endsep, endname = 0, valuestart = 0;
           42 
           43         while ((c = GETNEXT()) != EOF) {
           44                 if (ISSPACE(c)) {
                              /* whitespace after at least one name character ends the name */
           45                         if (namelen)
           46                                 endname = 1;
           47                         continue;
           48                 } else if (c == '?')
           49                         ; /* ignore */
           50                 else if (c == '=') {
           51                         x->name[namelen] = '\0';
           52                         valuestart = 1;
           53                         endname = 1;
           54                 } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
           55                         /* attribute without value */
           56                         x->name[namelen] = '\0';
           57                         if (x->xmlattrstart)
           58                                 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
           59                         if (x->xmlattr)
           60                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
           61                         if (x->xmlattrend)
           62                                 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
           63                         endname = 0;
                                      /* c becomes the first character of the next name; for '>'
                                       * the loop is left below and for '/' the name is cleared
                                       * again below */
           64                         x->name[0] = c;
           65                         namelen = 1;
           66                 } else if (namelen && valuestart) {
           67                         /* attribute with value */
           68                         if (x->xmlattrstart)
           69                                 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
           70 
           71                         valuelen = 0;
           72                         if (c == '\'' || c == '"') {
           73                                 endsep = c;
           74                         } else {
                                              /* unquoted value: c already is its first character, so
                                               * enter the loop body without reading another one */
           75                                 endsep = ' '; /* ISSPACE() */
           76                                 goto startvalue;
           77                         }
           78 
           79                         while ((c = GETNEXT()) != EOF) {
           80 startvalue:
           81                                 if (c == '&') { /* entities */
           82                                         x->data[valuelen] = '\0';
           83                                         /* call data function with data before entity if there is data */
           84                                         if (valuelen && x->xmlattr)
           85                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           86                                         x->data[0] = c;
           87                                         valuelen = 1;
           88                                         while ((c = GETNEXT()) != EOF) {
                                                              /* end of the value: the closing quote, or whitespace
                                                               * or '>' for an unquoted value */
           89                                                 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
           90                                                         break;
           91                                                 if (valuelen < sizeof(x->data) - 1)
           92                                                         x->data[valuelen++] = c;
           93                                                 else {
           94                                                         /* entity too long for buffer, handle as normal data */
           95                                                         x->data[valuelen] = '\0';
           96                                                         if (x->xmlattr)
           97                                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
           98                                                         x->data[0] = c;
           99                                                         valuelen = 1;
          100                                                         break;
          101                                                 }
          102                                                 if (c == ';') {
          103                                                         x->data[valuelen] = '\0';
          104                                                         if (x->xmlattrentity)
          105                                                                 x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          106                                                         valuelen = 0;
          107                                                         break;
          108                                                 }
          109                                         }
          110                                 } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
          111                                         if (valuelen < sizeof(x->data) - 1) {
          112                                                 x->data[valuelen++] = c;
          113                                         } else {
                                                              /* buffer full: deliver this chunk and start a new one with c */
          114                                                 x->data[valuelen] = '\0';
          115                                                 if (x->xmlattr)
          116                                                         x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          117                                                 x->data[0] = c;
          118                                                 valuelen = 1;
          119                                         }
          120                                 }
                                              /* end of the value: deliver what is left (xmlattr is called
                                               * even when valuelen is 0) and close the attribute */
          121                                 if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
          122                                         x->data[valuelen] = '\0';
          123                                         if (x->xmlattr)
          124                                                 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          125                                         if (x->xmlattrend)
          126                                                 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
          127                                         break;
          128                                 }
          129                         }
                                      /* reset the state for the next attribute */
          130                         namelen = endname = valuestart = 0;
          131                 } else if (namelen < sizeof(x->name) - 1) {
          132                         x->name[namelen++] = c;
          133                 }
                              /* '>' ends the tag; '/' marks a short tag and is not part of a name */
          134                 if (c == '>') {
          135                         break;
          136                 } else if (c == '/') {
          137                         x->isshorttag = 1;
          138                         x->name[0] = '\0';
          139                         namelen = 0;
          140                 }
          141         }
          142 }
          143 
          144 static void
          145 xml_parsecomment(XMLParser *x)
          146 {
          147         size_t datalen = 0, i = 0;
          148         int c;
          149 
          150         if (x->xmlcommentstart)
          151                 x->xmlcommentstart(x);
          152         while ((c = GETNEXT()) != EOF) {
          153                 if (c == '-' || c == '>') {
          154                         if (x->xmlcomment && datalen) {
          155                                 x->data[datalen] = '\0';
          156                                 x->xmlcomment(x, x->data, datalen);
          157                                 datalen = 0;
          158                         }
          159                 }
          160 
          161                 if (c == '-') {
          162                         if (++i > 2) {
          163                                 if (x->xmlcomment)
          164                                         for (; i > 2; i--)
          165                                                 x->xmlcomment(x, "-", 1);
          166                                 i = 2;
          167                         }
          168                         continue;
          169                 } else if (c == '>' && i == 2) {
          170                         if (x->xmlcommentend)
          171                                 x->xmlcommentend(x);
          172                         return;
          173                 } else if (i) {
          174                         if (x->xmlcomment) {
          175                                 for (; i > 0; i--)
          176                                         x->xmlcomment(x, "-", 1);
          177                         }
          178                         i = 0;
          179                 }
          180 
          181                 if (datalen < sizeof(x->data) - 1) {
          182                         x->data[datalen++] = c;
          183                 } else {
          184                         x->data[datalen] = '\0';
          185                         if (x->xmlcomment)
          186                                 x->xmlcomment(x, x->data, datalen);
          187                         x->data[0] = c;
          188                         datalen = 1;
          189                 }
          190         }
          191 }
          192 
          193 static void
          194 xml_parsecdata(XMLParser *x)
          195 {
          196         size_t datalen = 0, i = 0;
          197         int c;
          198 
          199         if (x->xmlcdatastart)
          200                 x->xmlcdatastart(x);
          201         while ((c = GETNEXT()) != EOF) {
          202                 if (c == ']' || c == '>') {
          203                         if (x->xmlcdata && datalen) {
          204                                 x->data[datalen] = '\0';
          205                                 x->xmlcdata(x, x->data, datalen);
          206                                 datalen = 0;
          207                         }
          208                 }
          209 
          210                 if (c == ']') {
          211                         if (++i > 2) {
          212                                 if (x->xmlcdata)
          213                                         for (; i > 2; i--)
          214                                                 x->xmlcdata(x, "]", 1);
          215                                 i = 2;
          216                         }
          217                         continue;
          218                 } else if (c == '>' && i == 2) {
          219                         if (x->xmlcdataend)
          220                                 x->xmlcdataend(x);
          221                         return;
          222                 } else if (i) {
          223                         if (x->xmlcdata)
          224                                 for (; i > 0; i--)
          225                                         x->xmlcdata(x, "]", 1);
          226                         i = 0;
          227                 }
          228 
          229                 if (datalen < sizeof(x->data) - 1) {
          230                         x->data[datalen++] = c;
          231                 } else {
          232                         x->data[datalen] = '\0';
          233                         if (x->xmlcdata)
          234                                 x->xmlcdata(x, x->data, datalen);
          235                         x->data[0] = c;
          236                         datalen = 1;
          237                 }
          238         }
          239 }
          240 
          /* Encode code point `r` as UTF-8 into `s` and return the number of bytes
           * written (1 to 4). Code point 0 writes nothing and returns 0. No NUL
           * terminator is added and the value is not validated: `s` must have room
           * for 4 bytes. */
          static int
          codepointtoutf8(long r, char *s)
          {
                  if (r == 0)
                          return 0; /* NUL byte */

                  if (r <= 0x7F) {
                          /* 0aaaaaaa */
                          s[0] = r;
                          return 1;
                  }
                  if (r <= 0x07FF) {
                          /* 110aaaaa 10bbbbbb */
                          s[0] = 0xC0 | ((r >> 6) & 0x1F);
                          s[1] = 0x80 | (r & 0x3F);
                          return 2;
                  }
                  if (r <= 0xFFFF) {
                          /* 1110aaaa 10bbbbbb 10cccccc */
                          s[0] = 0xE0 | ((r >> 12) & 0x0F);
                          s[1] = 0x80 | ((r >> 6) & 0x3F);
                          s[2] = 0x80 | (r & 0x3F);
                          return 3;
                  }
                  /* 11110aaa 10bbbbbb 10cccccc 10dddddd */
                  s[0] = 0xF0 | ((r >> 18) & 0x07);
                  s[1] = 0x80 | ((r >> 12) & 0x3F);
                  s[2] = 0x80 | ((r >> 6) & 0x3F);
                  s[3] = 0x80 | (r & 0x3F);
                  return 4;
          }
          270 
          /* Translate a predefined named entity to its character.
           * `e` is the entity text without the leading '&' but including the
           * trailing ';' (for example "amp;"). On a match the character and a NUL
           * terminator are stored in `buf` and 1 is returned. Returns -1 for an
           * unknown name or when `bufsiz` is smaller than 2. */
          static int
          namedentitytostr(const char *e, char *buf, size_t bufsiz)
          {
                  /* names[k] is replaced by chars[k] */
                  static const char *const names[] = {
                          "amp;", "lt;", "gt;", "apos;", "quot;"
                  };
                  static const char chars[] = "&<>'\"";
                  size_t k;

                  /* need room for one character and the terminator */
                  if (bufsiz < 2)
                          return -1;

                  for (k = 0; k < sizeof(names) / sizeof(names[0]); k++) {
                          if (strcmp(e, names[k]) != 0)
                                  continue;
                          buf[0] = chars[k];
                          buf[1] = '\0';
                          return 1;
                  }
                  return -1;
          }
          299 
          /* Convert a numeric character reference to UTF-8.
           * `e` is the text after "&#": decimal digits, or 'x' followed by
           * hexadecimal digits, terminated by ';' (for example "65;" or "x41;").
           * The encoded character and a NUL terminator are stored in `buf`.
           *
           * Returns the number of bytes stored, not counting the terminator (0 for
           * code point 0), or -1 when `bufsiz` is smaller than 5, the reference is
           * not well-formed, or the value is not a valid code point (above 0x10ffff
           * or a surrogate 0xd800-0xdfff). */
          static int
          numericentitytostr(const char *e, char *buf, size_t bufsiz)
          {
                  const char *digits = "0123456789";
                  size_t ndigits;
                  long l;
                  int len, base = 10;
                  char *end;

                  /* buffer is too small: up to 4 UTF-8 bytes plus the terminator */
                  if (bufsiz < 5)
                          return -1;

                  /* hex (16) or decimal (10) */
                  if (*e == 'x') {
                          e++;
                          base = 16;
                          digits = "0123456789abcdefABCDEF";
                  }

                  /* Only digits of the base may appear before the ';'. strtol() on
                   * its own would also skip leading whitespace and accept a sign or,
                   * for base 16, a "0x" prefix, none of which is allowed here. */
                  ndigits = strspn(e, digits);
                  if (ndigits == 0 || e[ndigits] != ';')
                          return -1;

                  errno = 0;
                  l = strtol(e, &end, base);
                  /* value does not fit in a long or is an invalid code point */
                  if (errno || end != e + ndigits || l < 0 || l > 0x10ffff ||
                      (l >= 0xd800 && l <= 0xdfff))
                          return -1;

                  len = codepointtoutf8(l, buf);
                  buf[len] = '\0';

                  return len;
          }
          326 
          /* Convert an entity string to the text it stands for.
           * `e` must start with '&'; "&#..." is handled as a numeric entity and
           * anything else as a named entity. The result is stored in `buf`
           * (capacity `bufsiz`).
           * Returns the byte length of the stored string or -1 on failure. */
          int
          xml_entitytostr(const char *e, char *buf, size_t bufsiz)
          {
                  /* not an entity at all */
                  if (*e != '&')
                          return -1;
                  e++;

                  /* numeric entity */
                  if (*e == '#')
                          return numericentitytostr(e + 1, buf, bufsiz);

                  /* named entity */
                  return namedentitytostr(e, buf, bufsiz);
          }
          341 
          /* Parse the input read with GETNEXT() until EOF and report what is found
           * through the callbacks stored in `x` (every callback is optional and
           * skipped when NULL).
           *
           * Everything before the first '<' is skipped. After that the input is
           * handled as a sequence of:
           *   - "<!" constructs: comments and CDATA sections are passed on to
           *     xml_parsecomment() / xml_parsecdata(), anything else is skipped up
           *     to the next '>'
           *   - tags: xmltagstart, the attributes via xml_parseattrs(),
           *     xmltagstartparsed and xmltagend; processing instructions ("<?")
           *     are handled as short tags
           *   - character data between tags: xmldatastart, xmldata (possibly in
           *     several chunks), xmldataentity for "&...;" and xmldataend
           *
           * The tag name is kept in x->tag / x->taglen and is truncated when it does
           * not fit. Character data still buffered when EOF is reached is not
           * reported. */
          342 void
          343 xml_parse(XMLParser *x)
          344 {
          345         size_t datalen, tagdatalen;
          346         int c, isend;
          347 
          348         while ((c = GETNEXT()) != EOF && c != '<')
          349                 ; /* skip until < */
          350 
          351         while (c != EOF) {
          352                 if (c == '<') { /* parse tag */
          353                         if ((c = GETNEXT()) == EOF)
          354                                 return;
          355 
          356                         if (c == '!') { /* CDATA and comments */
                                              /* the first characters after "<!" are stored in x->data
                                               * to recognise "--" and "[CDATA[" */
          357                                 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
          358                                         /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
          359                                         if (tagdatalen <= sizeof("[CDATA[") - 1)
          360                                                 x->data[tagdatalen++] = c;
          361                                         if (c == '>')
          362                                                 break;
          363                                         else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
          364                                                         (x->data[0] == '-')) {
          365                                                 xml_parsecomment(x);
          366                                                 break;
          367                                         } else if (c == '[') {
          368                                                 if (tagdatalen == sizeof("[CDATA[") - 1 &&
          369                                                     !strncmp(x->data, "[CDATA[", tagdatalen)) {
          370                                                         xml_parsecdata(x);
          371                                                         break;
          372                                                 }
          373                                         }
          374                                 }
          375                         } else {
          376                                 /* normal tag (open, short open, close), processing instruction. */
          377                                 x->tag[0] = c;
          378                                 x->taglen = 1;
          379                                 x->isshorttag = isend = 0;
          380 
          381                                 /* treat processing instruction as short tag, don't strip "?" prefix. */
          382                                 if (c == '?') {
          383                                         x->isshorttag = 1;
          384                                 } else if (c == '/') {
                                                      /* end tag: the name starts with the character after '/' */
          385                                         if ((c = GETNEXT()) == EOF)
          386                                                 return;
          387                                         x->tag[0] = c;
          388                                         isend = 1;
          389                                 }
          390 
                                              /* read the rest of the tag name up to whitespace or '>' */
          391                                 while ((c = GETNEXT()) != EOF) {
          392                                         if (c == '/')
          393                                                 x->isshorttag = 1; /* short tag */
          394                                         else if (c == '>' || ISSPACE(c)) {
          395                                                 x->tag[x->taglen] = '\0';
          396                                                 if (isend) { /* end tag, starts with </ */
          397                                                         if (x->xmltagend)
          398                                                                 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
          399                                                         x->tag[0] = '\0';
          400                                                         x->taglen = 0;
          401                                                 } else {
          402                                                         /* start tag */
          403                                                         if (x->xmltagstart)
          404                                                                 x->xmltagstart(x, x->tag, x->taglen);
                                                                      /* attributes can only follow when the name ended
                                                                       * with whitespace; xml_parseattrs() reads up to
                                                                       * the closing '>' */
          405                                                         if (ISSPACE(c))
          406                                                                 xml_parseattrs(x);
          407                                                         if (x->xmltagstartparsed)
          408                                                                 x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
          409                                                 }
          410                                                 /* call tagend for short tag or processing instruction */
          411                                                 if (x->isshorttag) {
          412                                                         if (x->xmltagend)
          413                                                                 x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
          414                                                         x->tag[0] = '\0';
          415                                                         x->taglen = 0;
          416                                                 }
          417                                                 break;
          418                                         } else if (x->taglen < sizeof(x->tag) - 1)
          419                                                 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
          420                                 }
          421                         }
          422                 } else {
          423                         /* parse tag data */
                                      /* the current c (not a '<') is not stored: the data starts
                                       * with the next character read */
          424                         datalen = 0;
          425                         if (x->xmldatastart)
          426                                 x->xmldatastart(x);
          427                         while ((c = GETNEXT()) != EOF) {
          428                                 if (c == '&') {
                                                      /* report the data before the entity, then collect the
                                                       * entity itself starting with the '&' */
          429                                         if (datalen) {
          430                                                 x->data[datalen] = '\0';
          431                                                 if (x->xmldata)
          432                                                         x->xmldata(x, x->data, datalen);
          433                                         }
          434                                         x->data[0] = c;
          435                                         datalen = 1;
          436                                         while ((c = GETNEXT()) != EOF) {
          437                                                 if (c == '<')
          438                                                         break;
          439                                                 if (datalen < sizeof(x->data) - 1)
          440                                                         x->data[datalen++] = c;
          441                                                 else {
          442                                                         /* entity too long for buffer, handle as normal data */
          443                                                         x->data[datalen] = '\0';
          444                                                         if (x->xmldata)
          445                                                                 x->xmldata(x, x->data, datalen);
          446                                                         x->data[0] = c;
          447                                                         datalen = 1;
          448                                                         break;
          449                                                 }
          450                                                 if (c == ';') {
          451                                                         x->data[datalen] = '\0';
          452                                                         if (x->xmldataentity)
          453                                                                 x->xmldataentity(x, x->data, datalen);
          454                                                         datalen = 0;
          455                                                         break;
          456                                                 }
          457                                         }
          458                                 } else if (c != '<') {
          459                                         if (datalen < sizeof(x->data) - 1) {
          460                                                 x->data[datalen++] = c;
          461                                         } else {
                                                              /* buffer full: deliver this chunk and start a new one with c */
          462                                                 x->data[datalen] = '\0';
          463                                                 if (x->xmldata)
          464                                                         x->xmldata(x, x->data, datalen);
          465                                                 x->data[0] = c;
          466                                                 datalen = 1;
          467                                         }
          468                                 }
                                              /* '<' ends the data; c stays '<' so that the outer loop
                                               * continues with the tag */
          469                                 if (c == '<') {
          470                                         x->data[datalen] = '\0';
          471                                         if (x->xmldata && datalen)
          472                                                 x->xmldata(x, x->data, datalen);
          473                                         if (x->xmldataend)
          474                                                 x->xmldataend(x);
          475                                         break;
          476                                 }
          477                         }
          478                 }
          479         }
          480 }