integrate XML parser and inline read loop - osm-zipcodes - Extract (dutch) addresses from OpenStreetMap OSM XML 
 (HTM) git clone git://git.codemadness.org/osm-zipcodes
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit b81f0c77edbee0be59ca8d14b3fd060aff838486
 (DIR) parent 918d881b3982b6ceb90ee1eaaa3a1f7c11addb4c
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Thu, 11 Apr 2019 18:10:59 +0200
       
       integrate XML parser and inline read loop
       
       18s -> 6s on a small .osm region
       
       Diffstat:
         M Makefile                            |       2 +-
         M main.c                              |     502 ++++++++++++++++++++++++++++++-
         D xml.c                               |     468 -------------------------------
         D xml.h                               |      40 -------------------------------
       
       4 files changed, 493 insertions(+), 519 deletions(-)
       ---
 (DIR) diff --git a/Makefile b/Makefile
       @@ -1,5 +1,5 @@
        build: clean
       -        cc xml.c main.c -o main -O3 -Wall -static
       +        cc main.c -o main -O3 -Wall -static
                strip main
        
        clean:
 (DIR) diff --git a/main.c b/main.c
       @@ -2,14 +2,53 @@
        #include <sys/stat.h>
        #include <sys/types.h>
        
       +#include <sys/types.h>
       +
        #include <ctype.h>
        #include <err.h>
       +#include <errno.h>
        #include <fcntl.h>
       +#include <limits.h>
        #include <stdio.h>
       +#include <stdlib.h>
        #include <string.h>
        #include <unistd.h>
        
       -#include "xml.h"
       +typedef struct xmlparser { /* parser state and event callbacks used by xml_parse() */
       +        /* handlers: every handler is optional; the parse functions only call
       +         * one when its pointer is non-NULL. The first argument is always the
       +         * parser itself. The (const char *, size_t) pairs that follow are, in
       +         * order and where present: tag name and length, attribute name and
       +         * length, then the data (value, entity or text chunk) and its length.
       +         * The trailing int of the tag handlers is the isshorttag flag. */
       +        void (*xmlattr)(struct xmlparser *, const char *, size_t,
       +              const char *, size_t, const char *, size_t);
       +        void (*xmlattrend)(struct xmlparser *, const char *, size_t,
       +              const char *, size_t);
       +        void (*xmlattrstart)(struct xmlparser *, const char *, size_t,
       +              const char *, size_t);
       +        void (*xmlattrentity)(struct xmlparser *, const char *, size_t,
       +              const char *, size_t, const char *, size_t);
       +        void (*xmlcdatastart)(struct xmlparser *);
       +        void (*xmlcdata)(struct xmlparser *, const char *, size_t);
       +        void (*xmlcdataend)(struct xmlparser *);
       +        void (*xmlcommentstart)(struct xmlparser *);
       +        void (*xmlcomment)(struct xmlparser *, const char *, size_t);
       +        void (*xmlcommentend)(struct xmlparser *);
       +        void (*xmldata)(struct xmlparser *, const char *, size_t);
       +        void (*xmldataend)(struct xmlparser *);
       +        void (*xmldataentity)(struct xmlparser *, const char *, size_t);
       +        void (*xmldatastart)(struct xmlparser *);
       +        void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
       +        void (*xmltagstart)(struct xmlparser *, const char *, size_t);
       +        void (*xmltagstartparsed)(struct xmlparser *, const char *,
       +              size_t, int);
       +
       +        /* current tag: NUL-terminated name, truncated to fit the buffer */
       +        char tag[1024];
       +        size_t taglen;
       +        /* current tag is in short form ? <tag /> */
       +        int isshorttag;
       +        /* current attribute name: NUL-terminated, truncated to fit the buffer */
       +        char name[1024];
       +        /* data buffer used for tag data, cdata and attribute data; content
       +         * longer than the buffer is handed to the handlers in several chunks */
       +        char data[BUFSIZ];
       +} XMLParser;
        
        enum FieldType {
                Postcode = 1,
       @@ -50,6 +89,458 @@ struct stat st;
        unsigned char *reg;
        size_t len, off;
        
       +#define GETNEXT() (off >= len ? EOF : reg[off++]) /* next byte of reg[], or EOF once off reaches len */
       +/* Parse the attribute list of the current tag (x->tag), reading bytes with
       + * GETNEXT() until the closing '>' or the end of the input.
       + * For every attribute the xmlattrstart, xmlattr and xmlattrend handlers are
       + * called when set; xmlattr may be called several times for one value (before
       + * an entity, when x->data is full and at the end of the value). Entities in
       + * a value ("&...;") are passed to xmlattrentity instead of xmlattr.
       + * A '/' outside of an attribute value sets x->isshorttag. */
       +static void
       +xml_parseattrs(XMLParser *x)
       +{
       +        size_t namelen = 0, valuelen;
       +        int c, endsep, endname = 0, valuestart = 0; /* endname: name is complete, valuestart: '=' was seen */
       +
       +        while ((c = GETNEXT()) != EOF) {
       +                if (isspace(c)) {
       +                        if (namelen)
       +                                endname = 1;
       +                        continue;
       +                } else if (c == '?')
       +                        ; /* ignore */
       +                else if (c == '=') {
       +                        x->name[namelen] = '\0';
       +                        valuestart = 1;
       +                        endname = 1;
       +                } else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) {
       +                        /* attribute without value */
       +                        x->name[namelen] = '\0';
       +                        if (x->xmlattrstart)
       +                                x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
       +                        if (x->xmlattr)
       +                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
       +                        if (x->xmlattrend)
       +                                x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
       +                        endname = 0;
       +                        x->name[0] = c; /* c starts the next name; '>' and '/' are handled at the end of the loop body */
       +                        namelen = 1;
       +                } else if (namelen && valuestart) {
       +                        /* attribute with value */
       +                        if (x->xmlattrstart)
       +                                x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
       +
       +                        valuelen = 0;
       +                        if (c == '\'' || c == '"') {
       +                                endsep = c;
       +                        } else {
       +                                endsep = ' '; /* unquoted value: ends at whitespace (isspace()) or '>' */
       +                                goto startvalue; /* c is already the first byte of the value */
       +                        }
       +
       +                        while ((c = GETNEXT()) != EOF) {
       +startvalue:
       +                                if (c == '&') { /* entities */
       +                                        x->data[valuelen] = '\0';
       +                                        /* call data function with data before entity if there is data */
       +                                        if (valuelen && x->xmlattr)
       +                                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       +                                        x->data[0] = c;
       +                                        valuelen = 1;
       +                                        while ((c = GETNEXT()) != EOF) {
       +                                                if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c))))
       +                                                        break;
       +                                                if (valuelen < sizeof(x->data) - 1)
       +                                                        x->data[valuelen++] = c;
       +                                                else {
       +                                                        /* entity too long for buffer, handle as normal data */
       +                                                        x->data[valuelen] = '\0';
       +                                                        if (x->xmlattr)
       +                                                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       +                                                        x->data[0] = c;
       +                                                        valuelen = 1;
       +                                                        break;
       +                                                }
       +                                                if (c == ';') {
       +                                                        x->data[valuelen] = '\0';
       +                                                        if (x->xmlattrentity)
       +                                                                x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       +                                                        valuelen = 0;
       +                                                        break;
       +                                                }
       +                                        }
       +                                } else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) {
       +                                        if (valuelen < sizeof(x->data) - 1) {
       +                                                x->data[valuelen++] = c;
       +                                        } else {
       +                                                x->data[valuelen] = '\0'; /* buffer full: flush it and start a new chunk with c */
       +                                                if (x->xmlattr)
       +                                                        x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       +                                                x->data[0] = c;
       +                                                valuelen = 1;
       +                                        }
       +                                }
       +                                if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) { /* end of the value */
       +                                        x->data[valuelen] = '\0';
       +                                        if (x->xmlattr)
       +                                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       +                                        if (x->xmlattrend)
       +                                                x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
       +                                        break;
       +                                }
       +                        }
       +                        namelen = endname = valuestart = 0; /* reset the state for the next attribute */
       +                } else if (namelen < sizeof(x->name) - 1) {
       +                        x->name[namelen++] = c; /* NOTE: attribute name truncation */
       +                }
       +                if (c == '>') {
       +                        break;
       +                } else if (c == '/') {
       +                        x->isshorttag = 1;
       +                        x->name[0] = '\0';
       +                        namelen = 0;
       +                }
       +        }
       +}
       +/* Parse the body of a comment; xml_parse() calls this after it has read
       + * "<!--". Calls xmlcommentstart, then passes the comment text to xmlcomment
       + * in chunks, and calls xmlcommentend once the terminating "-->" is read.
       + * Returns without calling xmlcommentend if the input ends first. */
       +static void
       +xml_parsecomment(XMLParser *x)
       +{
       +        size_t datalen = 0, i = 0; /* i: number of consecutive '-' read so far */
       +        int c;
       +
       +        if (x->xmlcommentstart)
       +                x->xmlcommentstart(x);
       +        while ((c = GETNEXT()) != EOF) {
       +                if (c == '-' || c == '>') { /* possible terminator: flush the buffered text first */
       +                        if (x->xmlcomment) {
       +                                x->data[datalen] = '\0';
       +                                x->xmlcomment(x, x->data, datalen);
       +                                datalen = 0;
       +                        }
       +                }
       +
       +                if (c == '-') {
       +                        if (++i > 2) { /* more than two dashes in a row: the extra ones are comment text */
       +                                if (x->xmlcomment)
       +                                        for (; i > 2; i--)
       +                                                x->xmlcomment(x, "-", 1);
       +                                i = 2;
       +                        }
       +                        continue;
       +                } else if (c == '>' && i == 2) { /* "-->" */
       +                        if (x->xmlcommentend)
       +                                x->xmlcommentend(x);
       +                        return;
       +                } else if (i) { /* the dashes were not part of "-->": emit them as text */
       +                        if (x->xmlcomment) {
       +                                for (; i > 0; i--)
       +                                        x->xmlcomment(x, "-", 1);
       +                        }
       +                        i = 0;
       +                }
       +
       +                if (datalen < sizeof(x->data) - 1) {
       +                        x->data[datalen++] = c;
       +                } else {
       +                        x->data[datalen] = '\0'; /* buffer full: flush it and start a new chunk with c */
       +                        if (x->xmlcomment)
       +                                x->xmlcomment(x, x->data, datalen);
       +                        x->data[0] = c;
       +                        datalen = 1;
       +                }
       +        }
       +}
       +/* Parse the body of a CDATA section; xml_parse() calls this after it has
       + * read "<![CDATA[". Calls xmlcdatastart, then passes the text to xmlcdata in
       + * chunks, and calls xmlcdataend once the terminating "]]>" is read.
       + * Returns without calling xmlcdataend if the input ends first. */
       +static void
       +xml_parsecdata(XMLParser *x)
       +{
       +        size_t datalen = 0, i = 0; /* i: number of consecutive ']' read so far */
       +        int c;
       +
       +        if (x->xmlcdatastart)
       +                x->xmlcdatastart(x);
       +        while ((c = GETNEXT()) != EOF) {
       +                if (c == ']' || c == '>') { /* possible terminator: flush the buffered text first */
       +                        if (x->xmlcdata) {
       +                                x->data[datalen] = '\0';
       +                                x->xmlcdata(x, x->data, datalen);
       +                                datalen = 0;
       +                        }
       +                }
       +
       +                if (c == ']') {
       +                        if (++i > 2) { /* more than two brackets in a row: the extra ones are text */
       +                                if (x->xmlcdata)
       +                                        for (; i > 2; i--)
       +                                                x->xmlcdata(x, "]", 1);
       +                                i = 2;
       +                        }
       +                        continue;
       +                } else if (c == '>' && i == 2) { /* "]]>" */
       +                        if (x->xmlcdataend)
       +                                x->xmlcdataend(x);
       +                        return;
       +                } else if (i) { /* the brackets were not part of "]]>": emit them as text */
       +                        if (x->xmlcdata)
       +                                for (; i > 0; i--)
       +                                        x->xmlcdata(x, "]", 1);
       +                        i = 0;
       +                }
       +
       +                if (datalen < sizeof(x->data) - 1) {
       +                        x->data[datalen++] = c;
       +                } else {
       +                        x->data[datalen] = '\0'; /* buffer full: flush it and start a new chunk with c */
       +                        if (x->xmlcdata)
       +                                x->xmlcdata(x, x->data, datalen);
       +                        x->data[0] = c;
       +                        datalen = 1;
       +                }
       +        }
       +}
       +
       +/* Encode the code point r as UTF-8 into s; s is not NUL-terminated and
       + * must have room for up to 4 bytes.
       + * Returns the number of bytes written: 0 for r == 0, otherwise 1 to 4. */
       +static int
       +codepointtoutf8(long r, char *s)
       +{
       +        /* NUL byte: nothing is written */
       +        if (r == 0)
       +                return 0;
       +
       +        /* 1 byte: 0aaaaaaa */
       +        if (r <= 0x7F) {
       +                s[0] = r;
       +                return 1;
       +        }
       +
       +        /* 2 bytes: 110aaaaa 10bbbbbb */
       +        if (r <= 0x07FF) {
       +                s[0] = 0xC0 | ((r >> 6) & 0x1F);
       +                s[1] = 0x80 | (r & 0x3F);
       +                return 2;
       +        }
       +
       +        /* 3 bytes: 1110aaaa 10bbbbbb 10cccccc */
       +        if (r <= 0xFFFF) {
       +                s[0] = 0xE0 | ((r >> 12) & 0x0F);
       +                s[1] = 0x80 | ((r >> 6) & 0x3F);
       +                s[2] = 0x80 | (r & 0x3F);
       +                return 3;
       +        }
       +
       +        /* 4 bytes: 11110aaa 10bbbbbb 10cccccc 10dddddd */
       +        s[0] = 0xF0 | ((r >> 18) & 0x07);
       +        s[1] = 0x80 | ((r >> 12) & 0x3F);
       +        s[2] = 0x80 | ((r >> 6) & 0x3F);
       +        s[3] = 0x80 | (r & 0x3F);
       +        return 4;
       +}
       +
       +/* Translate a named entity without its leading '&' (e.g. "amp;") to the
       + * character it stands for, stored in buf as a NUL-terminated string.
       + * Only the five predefined XML entities are known, in all-lowercase or
       + * all-uppercase spelling.
       + * Returns 1 on success, 0 if the name is unknown and -1 if bufsiz < 2. */
       +static int
       +namedentitytostr(const char *e, char *buf, size_t bufsiz)
       +{
       +        static const struct {
       +                const char *entity;
       +                int c;
       +        } entities[] = {
       +                { "amp;",  '&'  },
       +                { "lt;",   '<'  },
       +                { "gt;",   '>'  },
       +                { "apos;", '\'' },
       +                { "quot;", '"'  },
       +                { "AMP;",  '&'  },
       +                { "LT;",   '<'  },
       +                { "GT;",   '>'  },
       +                { "APOS;", '\'' },
       +                { "QUOT;", '"'  }
       +        };
       +        const size_t count = sizeof(entities) / sizeof(entities[0]);
       +        size_t idx = 0;
       +
       +        /* need room for one character and the terminating NUL */
       +        if (bufsiz < 2)
       +                return -1;
       +
       +        /* linear search for an exact match */
       +        while (idx < count && strcmp(e, entities[idx].entity) != 0)
       +                idx++;
       +        if (idx == count)
       +                return 0;
       +
       +        buf[0] = entities[idx].c;
       +        buf[1] = '\0';
       +        return 1;
       +}
       +
       +/* Convert the body of a numeric character reference (the text following
       + * "&#", for example "65;" or "x41;") to a NUL-terminated UTF-8 string.
       + * Returns the number of bytes stored in buf (excluding the NUL byte),
       + * 0 if the reference is malformed or is not a valid code point, or -1 if
       + * bufsiz is less than 5 (4 UTF-8 bytes + NUL). */
       +static int
       +numericentitytostr(const char *e, char *buf, size_t bufsiz)
       +{
       +        long l;
       +        int len;
       +        char *end;
       +
       +        /* buffer is too small */
       +        if (bufsiz < 5)
       +                return -1;
       +
       +        errno = 0;
       +        /* hex (16) or decimal (10); parsed as a signed value so that
       +         * negative and out-of-range input can be detected below */
       +        if (*e == 'x')
       +                l = strtol(++e, &end, 16);
       +        else
       +                l = strtol(e, &end, 10);
       +        /* reject: conversion error, no digits at all, missing ';',
       +         * negative value (strtol() accepts a leading '-'), code point
       +         * above U+10FFFF, or a UTF-16 surrogate (not valid in UTF-8) */
       +        if (errno || end == e || *end != ';' || l < 0 || l > 0x10FFFF ||
       +            (l >= 0xD800 && l <= 0xDFFF))
       +                return 0;
       +        len = codepointtoutf8(l, buf);
       +        buf[len] = '\0';
       +
       +        return len;
       +}
       +
       +/* Decode one entity string, including its leading '&' (e.g. "&amp;" or
       + * "&#x41;"), into buf. A '#' after the '&' selects the numeric decoder,
       + * anything else the named one.
       + * Returns the value of the selected decoder, or 0 when e does not start
       + * with '&'. */
       +int
       +xml_entitytostr(const char *e, char *buf, size_t bufsiz)
       +{
       +        if (*e == '&') {
       +                /* named entity */
       +                if (e[1] != '#')
       +                        return namedentitytostr(e + 1, buf, bufsiz);
       +                /* numeric entity */
       +                return numericentitytostr(e + 2, buf, bufsiz);
       +        }
       +        /* doesn't start with & */
       +        return 0;
       +}
       +/* Parse the whole input, read byte by byte with GETNEXT(), and call the
       + * handlers set in x. Everything before the first '<' is skipped. After that
       + * the loop alternates between parsing a tag (comment, CDATA section,
       + * start/end/short tag or processing instruction) and parsing the character
       + * data that follows it. Returns when the end of the input is reached. */
       +void
       +xml_parse(XMLParser *x)
       +{
       +        size_t datalen, tagdatalen;
       +        int c, isend;
       +
       +        while ((c = GETNEXT()) != EOF && c != '<')
       +                ; /* skip until < */
       +
       +        while (c != EOF) {
       +                if (c == '<') { /* parse tag */
       +                        if ((c = GETNEXT()) == EOF)
       +                                return;
       +
       +                        if (c == '!') { /* cdata and comments; any other "<!...>" is skipped up to '>' */
       +                                for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
       +                                        /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
       +                                        if (tagdatalen <= sizeof("[CDATA[") - 1)
       +                                                x->data[tagdatalen++] = c;
       +                                        if (c == '>')
       +                                                break;
       +                                        else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
       +                                                        (x->data[0] == '-')) { /* "<!--" was read */
       +                                                xml_parsecomment(x);
       +                                                break;
       +                                        } else if (c == '[') {
       +                                                if (tagdatalen == sizeof("[CDATA[") - 1 &&
       +                                                    !strncmp(x->data, "[CDATA[", tagdatalen)) { /* "<![CDATA[" was read */
       +                                                        xml_parsecdata(x);
       +                                                        break;
       +                                                }
       +                                        }
       +                                }
       +                        } else {
       +                                /* normal tag (open, short open, close), processing instruction. */
       +                                x->tag[0] = c;
       +                                x->taglen = 1;
       +                                x->isshorttag = isend = 0;
       +
       +                                /* treat processing instruction as shorttag, don't strip "?" prefix. */
       +                                if (c == '?') {
       +                                        x->isshorttag = 1;
       +                                } else if (c == '/') {
       +                                        if ((c = GETNEXT()) == EOF)
       +                                                return;
       +                                        x->tag[0] = c; /* end tag: the name starts after the '/' */
       +                                        isend = 1;
       +                                }
       +
       +                                while ((c = GETNEXT()) != EOF) {
       +                                        if (c == '/')
       +                                                x->isshorttag = 1; /* short tag */
       +                                        else if (c == '>' || isspace(c)) { /* end of the tag name */
       +                                                x->tag[x->taglen] = '\0';
       +                                                if (isend) { /* end tag, starts with </ */
       +                                                        if (x->xmltagend)
       +                                                                x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
       +                                                        x->tag[0] = '\0';
       +                                                        x->taglen = 0;
       +                                                } else {
       +                                                        /* start tag */
       +                                                        if (x->xmltagstart)
       +                                                                x->xmltagstart(x, x->tag, x->taglen);
       +                                                        if (isspace(c))
       +                                                                xml_parseattrs(x);
       +                                                        if (x->xmltagstartparsed)
       +                                                                x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
       +                                                }
       +                                                /* call tagend for shortform or processing instruction */
       +                                                if (x->isshorttag) {
       +                                                        if (x->xmltagend)
       +                                                                x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
       +                                                        x->tag[0] = '\0';
       +                                                        x->taglen = 0;
       +                                                }
       +                                                break;
       +                                        } else if (x->taglen < sizeof(x->tag) - 1)
       +                                                x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
       +                                }
       +                        }
       +                } else {
       +                        /* parse tag data */
       +                        datalen = 0;
       +                        if (x->xmldatastart)
       +                                x->xmldatastart(x);
       +                        while ((c = GETNEXT()) != EOF) {
       +                                if (c == '&') { /* entity: flush the text before it, then collect up to ';' */
       +                                        if (datalen) {
       +                                                x->data[datalen] = '\0';
       +                                                if (x->xmldata)
       +                                                        x->xmldata(x, x->data, datalen);
       +                                        }
       +                                        x->data[0] = c;
       +                                        datalen = 1;
       +                                        while ((c = GETNEXT()) != EOF) {
       +                                                if (c == '<')
       +                                                        break;
       +                                                if (datalen < sizeof(x->data) - 1)
       +                                                        x->data[datalen++] = c;
       +                                                else {
       +                                                        /* entity too long for buffer, handle as normal data */
       +                                                        x->data[datalen] = '\0';
       +                                                        if (x->xmldata)
       +                                                                x->xmldata(x, x->data, datalen);
       +                                                        x->data[0] = c;
       +                                                        datalen = 1;
       +                                                        break;
       +                                                }
       +                                                if (c == ';') {
       +                                                        x->data[datalen] = '\0';
       +                                                        if (x->xmldataentity)
       +                                                                x->xmldataentity(x, x->data, datalen);
       +                                                        datalen = 0;
       +                                                        break;
       +                                                }
       +                                        }
       +                                } else if (c != '<') {
       +                                        if (datalen < sizeof(x->data) - 1) {
       +                                                x->data[datalen++] = c;
       +                                        } else {
       +                                                x->data[datalen] = '\0'; /* buffer full: flush it and start a new chunk with c */
       +                                                if (x->xmldata)
       +                                                        x->xmldata(x, x->data, datalen);
       +                                                x->data[0] = c;
       +                                                datalen = 1;
       +                                        }
       +                                }
       +                                if (c == '<') { /* start of the next tag: flush the remaining data */
       +                                        x->data[datalen] = '\0';
       +                                        if (x->xmldata && datalen)
       +                                                x->xmldata(x, x->data, datalen);
       +                                        if (x->xmldataend)
       +                                                x->xmldataend(x);
       +                                        break;
       +                                }
       +                        }
       +                }
       +        }
       +}
       +
       +
        /* ignore control chars (such as TABs) */
        static inline void
        printfield(const char *s)
       @@ -228,14 +719,6 @@ xmlattrentity(XMLParser *x, const char *t, size_t tl,
                        xmlattr(x, t, tl, a, al, buf, len);
        }
        
       -static inline int
       -getnext(void)
       -{
       -        if (off >= len)
       -                return EOF;
       -        return reg[off++];
       -}
       -
        int
        main(int argc, char *argv[])
        {
       @@ -261,7 +744,6 @@ main(int argc, char *argv[])
                if ((reg = mmap(0, len, PROT_READ, MAP_SHARED|MAP_FILE, fd, off)) == MAP_FAILED)
                        err(1, "mmap");
        
       -        x.getnext = getnext;
                xml_parse(&x);
        
                /* progress meter */
 (DIR) diff --git a/xml.c b/xml.c
       @@ -1,468 +0,0 @@
       -#include <sys/types.h>
       -
       -#include <ctype.h>
       -#include <errno.h>
       -#include <limits.h>
       -#include <stdio.h>
       -#include <stdlib.h>
       -#include <string.h>
       -
       -#include "xml.h"
       -
       -static void
       -xml_parseattrs(XMLParser *x)
       -{
       -        size_t namelen = 0, valuelen;
       -        int c, endsep, endname = 0, valuestart = 0;
       -
       -        while ((c = x->getnext()) != EOF) {
       -                if (isspace(c)) {
       -                        if (namelen)
       -                                endname = 1;
       -                        continue;
       -                } else if (c == '?')
       -                        ; /* ignore */
       -                else if (c == '=') {
       -                        x->name[namelen] = '\0';
       -                        valuestart = 1;
       -                        endname = 1;
       -                } else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) {
       -                        /* attribute without value */
       -                        x->name[namelen] = '\0';
       -                        if (x->xmlattrstart)
       -                                x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
       -                        if (x->xmlattr)
       -                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
       -                        if (x->xmlattrend)
       -                                x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
       -                        endname = 0;
       -                        x->name[0] = c;
       -                        namelen = 1;
       -                } else if (namelen && valuestart) {
       -                        /* attribute with value */
       -                        if (x->xmlattrstart)
       -                                x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
       -
       -                        valuelen = 0;
       -                        if (c == '\'' || c == '"') {
       -                                endsep = c;
       -                        } else {
       -                                endsep = ' '; /* isspace() */
       -                                goto startvalue;
       -                        }
       -
       -                        while ((c = x->getnext()) != EOF) {
       -startvalue:
       -                                if (c == '&') { /* entities */
       -                                        x->data[valuelen] = '\0';
       -                                        /* call data function with data before entity if there is data */
       -                                        if (valuelen && x->xmlattr)
       -                                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       -                                        x->data[0] = c;
       -                                        valuelen = 1;
       -                                        while ((c = x->getnext()) != EOF) {
       -                                                if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c))))
       -                                                        break;
       -                                                if (valuelen < sizeof(x->data) - 1)
       -                                                        x->data[valuelen++] = c;
       -                                                else {
       -                                                        /* entity too long for buffer, handle as normal data */
       -                                                        x->data[valuelen] = '\0';
       -                                                        if (x->xmlattr)
       -                                                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       -                                                        x->data[0] = c;
       -                                                        valuelen = 1;
       -                                                        break;
       -                                                }
       -                                                if (c == ';') {
       -                                                        x->data[valuelen] = '\0';
       -                                                        if (x->xmlattrentity)
       -                                                                x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       -                                                        valuelen = 0;
       -                                                        break;
       -                                                }
       -                                        }
       -                                } else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) {
       -                                        if (valuelen < sizeof(x->data) - 1) {
       -                                                x->data[valuelen++] = c;
       -                                        } else {
       -                                                x->data[valuelen] = '\0';
       -                                                if (x->xmlattr)
       -                                                        x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       -                                                x->data[0] = c;
       -                                                valuelen = 1;
       -                                        }
       -                                }
       -                                if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) {
       -                                        x->data[valuelen] = '\0';
       -                                        if (x->xmlattr)
       -                                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       -                                        if (x->xmlattrend)
       -                                                x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
       -                                        break;
       -                                }
       -                        }
       -                        namelen = endname = valuestart = 0;
       -                } else if (namelen < sizeof(x->name) - 1) {
       -                        x->name[namelen++] = c;
       -                }
       -                if (c == '>') {
       -                        break;
       -                } else if (c == '/') {
       -                        x->isshorttag = 1;
       -                        x->name[0] = '\0';
       -                        namelen = 0;
       -                }
       -        }
       -}
       -
       -static void
       -xml_parsecomment(XMLParser *x)
       -{
       -        size_t datalen = 0, i = 0;
       -        int c;
       -
       -        if (x->xmlcommentstart)
       -                x->xmlcommentstart(x);
       -        while ((c = x->getnext()) != EOF) {
       -                if (c == '-' || c == '>') {
       -                        if (x->xmlcomment) {
       -                                x->data[datalen] = '\0';
       -                                x->xmlcomment(x, x->data, datalen);
       -                                datalen = 0;
       -                        }
       -                }
       -
       -                if (c == '-') {
       -                        if (++i > 2) {
       -                                if (x->xmlcomment)
       -                                        for (; i > 2; i--)
       -                                                x->xmlcomment(x, "-", 1);
       -                                i = 2;
       -                        }
       -                        continue;
       -                } else if (c == '>' && i == 2) {
       -                        if (x->xmlcommentend)
       -                                x->xmlcommentend(x);
       -                        return;
       -                } else if (i) {
       -                        if (x->xmlcomment) {
       -                                for (; i > 0; i--)
       -                                        x->xmlcomment(x, "-", 1);
       -                        }
       -                        i = 0;
       -                }
       -
       -                if (datalen < sizeof(x->data) - 1) {
       -                        x->data[datalen++] = c;
       -                } else {
       -                        x->data[datalen] = '\0';
       -                        if (x->xmlcomment)
       -                                x->xmlcomment(x, x->data, datalen);
       -                        x->data[0] = c;
       -                        datalen = 1;
       -                }
       -        }
       -}
       -
       -static void
       -xml_parsecdata(XMLParser *x)
       -{
       -        size_t datalen = 0, i = 0;
       -        int c;
       -
       -        if (x->xmlcdatastart)
       -                x->xmlcdatastart(x);
       -        while ((c = x->getnext()) != EOF) {
       -                if (c == ']' || c == '>') {
       -                        if (x->xmlcdata) {
       -                                x->data[datalen] = '\0';
       -                                x->xmlcdata(x, x->data, datalen);
       -                                datalen = 0;
       -                        }
       -                }
       -
       -                if (c == ']') {
       -                        if (++i > 2) {
       -                                if (x->xmlcdata)
       -                                        for (; i > 2; i--)
       -                                                x->xmlcdata(x, "]", 1);
       -                                i = 2;
       -                        }
       -                        continue;
       -                } else if (c == '>' && i == 2) {
       -                        if (x->xmlcdataend)
       -                                x->xmlcdataend(x);
       -                        return;
       -                } else if (i) {
       -                        if (x->xmlcdata)
       -                                for (; i > 0; i--)
       -                                        x->xmlcdata(x, "]", 1);
       -                        i = 0;
       -                }
       -
       -                if (datalen < sizeof(x->data) - 1) {
       -                        x->data[datalen++] = c;
       -                } else {
       -                        x->data[datalen] = '\0';
       -                        if (x->xmlcdata)
       -                                x->xmlcdata(x, x->data, datalen);
       -                        x->data[0] = c;
       -                        datalen = 1;
       -                }
       -        }
       -}
       -
       -static int
       -codepointtoutf8(long r, char *s)
       -{
       -        if (r == 0) {
       -                return 0; /* NUL byte */
       -        } else if (r <= 0x7F) {
       -                /* 1 byte: 0aaaaaaa */
       -                s[0] = r;
       -                return 1;
       -        } else if (r <= 0x07FF) {
       -                /* 2 bytes: 00000aaa aabbbbbb */
       -                s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
       -                s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
       -                return 2;
       -        } else if (r <= 0xFFFF) {
       -                /* 3 bytes: aaaabbbb bbcccccc */
       -                s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
       -                s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
       -                s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
       -                return 3;
       -        } else {
       -                /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
       -                s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
       -                s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
       -                s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
       -                s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
       -                return 4;
       -        }
       -}
       -
       -static int
       -namedentitytostr(const char *e, char *buf, size_t bufsiz)
       -{
       -        static const struct {
       -                char *entity;
       -                int c;
       -        } entities[] = {
       -                { "&amp;",  '&'  },
       -                { "&lt;",   '<'  },
       -                { "&gt;",   '>'  },
       -                { "&apos;", '\'' },
       -                { "&quot;", '"'  },
       -                { "&AMP;",  '&'  },
       -                { "&LT;",   '<'  },
       -                { "&GT;",   '>'  },
       -                { "&APOS;", '\'' },
       -                { "&QUOT;", '"'  }
       -        };
       -        size_t i;
       -
       -        /* buffer is too small */
       -        if (bufsiz < 2)
       -                return -1;
       -
       -        /* doesn't start with &: can't match */
       -        if (*e != '&')
       -                return 0;
       -
       -        for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
       -                if (!strcmp(e, entities[i].entity)) {
       -                        buf[0] = entities[i].c;
       -                        buf[1] = '\0';
       -                        return 1;
       -                }
       -        }
       -        return 0;
       -}
       -
       -static int
       -numericentitytostr(const char *e, char *buf, size_t bufsiz)
       -{
       -        long l;
       -        int len;
       -        char *end;
       -
       -        /* buffer is too small */
       -        if (bufsiz < 5)
       -                return -1;
       -
       -        /* not a numeric entity */
       -        if (e[0] != '&' || e[1] != '#')
       -                return 0;
       -
       -        /* e[1] == '#', numeric / hexadecimal entity */
       -        e += 2; /* skip "&#" */
       -        errno = 0;
       -        /* hex (16) or decimal (10) */
       -        if (*e == 'x')
       -                l = strtoul(e + 1, &end, 16);
       -        else
       -                l = strtoul(e, &end, 10);
       -        /* invalid value or not a well-formed entity or too high codepoint */
       -        if (errno || *end != ';' || l > 0x10FFFF)
       -                return 0;
       -        len = codepointtoutf8(l, buf);
       -        buf[len] = '\0';
       -
       -        return len;
       -}
       -
       -/* convert named- or numeric entity string to buffer string
       - * returns byte-length of string. */
       -int
       -xml_entitytostr(const char *e, char *buf, size_t bufsiz)
       -{
       -        /* buffer is too small */
       -        if (bufsiz < 5)
       -                return -1;
       -        /* doesn't start with & */
       -        if (e[0] != '&')
       -                return 0;
       -        /* named entity */
       -        if (e[1] != '#')
       -                return namedentitytostr(e, buf, bufsiz);
       -        else /* numeric entity */
       -                return numericentitytostr(e, buf, bufsiz);
       -}
       -
       -void
       -xml_parse(XMLParser *x)
       -{
       -        int c, ispi;
       -        size_t datalen, tagdatalen, taglen;
       -
       -        if (!x->getnext)
       -                return;
       -        while ((c = x->getnext()) != EOF && c != '<')
       -                ; /* skip until < */
       -
       -        while (c != EOF) {
       -                if (c == '<') { /* parse tag */
       -                        if ((c = x->getnext()) == EOF)
       -                                return;
       -
       -                        if (c == '!') { /* cdata and comments */
       -                                for (tagdatalen = 0; (c = x->getnext()) != EOF;) {
       -                                        /* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */
       -                                        if (tagdatalen <= sizeof("[CDATA[") - 1)
       -                                                x->data[tagdatalen++] = c;
       -                                        if (c == '>')
       -                                                break;
       -                                        else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
       -                                                        (x->data[0] == '-')) {
       -                                                xml_parsecomment(x);
       -                                                break;
       -                                        } else if (c == '[') {
       -                                                if (tagdatalen == sizeof("[CDATA[") - 1 &&
       -                                                    !strncmp(x->data, "[CDATA[", tagdatalen)) {
       -                                                        xml_parsecdata(x);
       -                                                        break;
       -                                                }
       -                                        }
       -                                }
       -                        } else {
       -                                x->tag[0] = '\0';
       -                                x->taglen = 0;
       -
       -                                /* normal tag (open, short open, close), processing instruction. */
       -                                if (isspace(c))
       -                                        while ((c = x->getnext()) != EOF && isspace(c))
       -                                                ;
       -                                if (c == EOF)
       -                                        return;
       -                                x->tag[0] = c;
       -                                ispi = (c == '?') ? 1 : 0;
       -                                x->isshorttag = ispi;
       -                                taglen = 1;
       -                                while ((c = x->getnext()) != EOF) {
       -                                        if (c == '/')
       -                                                x->isshorttag = 1; /* short tag */
       -                                        else if (c == '>' || isspace(c)) {
       -                                                x->tag[taglen] = '\0';
       -                                                if (x->tag[0] == '/') { /* end tag, starts with </ */
       -                                                        x->taglen = --taglen; /* len -1 because of / */
       -                                                        if (taglen && x->xmltagend)
       -                                                                x->xmltagend(x, &(x->tag)[1], x->taglen, 0);
       -                                                } else {
       -                                                        x->taglen = taglen;
       -                                                        /* start tag */
       -                                                        if (x->xmltagstart)
       -                                                                x->xmltagstart(x, x->tag, x->taglen);
       -                                                        if (isspace(c))
       -                                                                xml_parseattrs(x);
       -                                                        if (x->xmltagstartparsed)
       -                                                                x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
       -                                                }
       -                                                /* call tagend for shortform or processing instruction */
       -                                                if ((x->isshorttag || ispi) && x->xmltagend)
       -                                                        x->xmltagend(x, x->tag, x->taglen, 1);
       -                                                break;
       -                                        } else if (taglen < sizeof(x->tag) - 1)
       -                                                x->tag[taglen++] = c; /* NOTE: tag name truncation */
       -                                }
       -                        }
       -                } else {
       -                        /* parse tag data */
       -                        datalen = 0;
       -                        if (x->xmldatastart)
       -                                x->xmldatastart(x);
       -                        while ((c = x->getnext()) != EOF) {
       -                                if (c == '&') {
       -                                        if (datalen) {
       -                                                x->data[datalen] = '\0';
       -                                                if (x->xmldata)
       -                                                        x->xmldata(x, x->data, datalen);
       -                                        }
       -                                        x->data[0] = c;
       -                                        datalen = 1;
       -                                        while ((c = x->getnext()) != EOF) {
       -                                                if (c == '<')
       -                                                        break;
       -                                                if (datalen < sizeof(x->data) - 1)
       -                                                        x->data[datalen++] = c;
       -                                                else {
       -                                                        /* entity too long for buffer, handle as normal data */
       -                                                        x->data[datalen] = '\0';
       -                                                        if (x->xmldata)
       -                                                                x->xmldata(x, x->data, datalen);
       -                                                        x->data[0] = c;
       -                                                        datalen = 1;
       -                                                        break;
       -                                                }
       -                                                if (c == ';') {
       -                                                        x->data[datalen] = '\0';
       -                                                        if (x->xmldataentity)
       -                                                                x->xmldataentity(x, x->data, datalen);
       -                                                        datalen = 0;
       -                                                        break;
       -                                                }
       -                                        }
       -                                } else if (c != '<') {
       -                                        if (datalen < sizeof(x->data) - 1) {
       -                                                x->data[datalen++] = c;
       -                                        } else {
       -                                                x->data[datalen] = '\0';
       -                                                if (x->xmldata)
       -                                                        x->xmldata(x, x->data, datalen);
       -                                                x->data[0] = c;
       -                                                datalen = 1;
       -                                        }
       -                                }
       -                                if (c == '<') {
       -                                        x->data[datalen] = '\0';
       -                                        if (x->xmldata && datalen)
       -                                                x->xmldata(x, x->data, datalen);
       -                                        if (x->xmldataend)
       -                                                x->xmldataend(x);
       -                                        break;
       -                                }
       -                        }
       -                }
       -        }
       -}
 (DIR) diff --git a/xml.h b/xml.h
       @@ -1,40 +0,0 @@
       -typedef struct xmlparser {
       -        /* handlers */
       -        void (*xmlattr)(struct xmlparser *, const char *, size_t,
       -              const char *, size_t, const char *, size_t);
       -        void (*xmlattrend)(struct xmlparser *, const char *, size_t,
       -              const char *, size_t);
       -        void (*xmlattrstart)(struct xmlparser *, const char *, size_t,
       -              const char *, size_t);
       -        void (*xmlattrentity)(struct xmlparser *, const char *, size_t,
       -              const char *, size_t, const char *, size_t);
       -        void (*xmlcdatastart)(struct xmlparser *);
       -        void (*xmlcdata)(struct xmlparser *, const char *, size_t);
       -        void (*xmlcdataend)(struct xmlparser *);
       -        void (*xmlcommentstart)(struct xmlparser *);
       -        void (*xmlcomment)(struct xmlparser *, const char *, size_t);
       -        void (*xmlcommentend)(struct xmlparser *);
       -        void (*xmldata)(struct xmlparser *, const char *, size_t);
       -        void (*xmldataend)(struct xmlparser *);
       -        void (*xmldataentity)(struct xmlparser *, const char *, size_t);
       -        void (*xmldatastart)(struct xmlparser *);
       -        void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
       -        void (*xmltagstart)(struct xmlparser *, const char *, size_t);
       -        void (*xmltagstartparsed)(struct xmlparser *, const char *,
       -              size_t, int);
       -
       -        int (*getnext)(void);
       -
       -        /* current tag */
       -        char tag[1024];
       -        size_t taglen;
       -        /* current tag is in short form ? <tag /> */
       -        int isshorttag;
       -        /* current attribute name */
       -        char name[1024];
       -        /* data buffer used for tag data, cdata and attribute data */
       -        char data[BUFSIZ];
       -} XMLParser;
       -
       -int xml_entitytostr(const char *, char *, size_t);
       -void xml_parse(XMLParser *);