txml: sync many XML parser improvements - webdump - [FORK] git://git.codemadness.org/webdump
 (HTM) git clone git://git.z3bra.org/webdump.git
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit d22cedcf1a4d6a4066489e029ee2888d76308318
 (DIR) parent b0fd3fce528a98b283ee135d2a09da04191223c3
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sun, 26 Aug 2018 15:27:26 +0200
       
       xml: sync many XML parser improvements
       
       Diffstat:
         M main.c                              |       1 -
         M xml.c                               |     250 +++++++++++++++++--------------
         M xml.h                               |      10 +++-------
       
       3 files changed, 144 insertions(+), 117 deletions(-)
       ---
 (DIR) diff --git a/main.c b/main.c
       t@@ -3,7 +3,6 @@
        
        #include <ctype.h>
        #include <err.h>
       -#include <stdint.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>
 (DIR) diff --git a/xml.c b/xml.c
       t@@ -1,7 +1,8 @@
       +#include <sys/types.h>
       +
        #include <ctype.h>
        #include <errno.h>
        #include <limits.h>
       -#include <stdint.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>
       t@@ -12,19 +13,20 @@ static void
        xml_parseattrs(XMLParser *x)
        {
                size_t namelen = 0, valuelen;
       -        int c, endsep, endname = 0;
       +        int c, endsep, endname = 0, valuestart = 0;
        
                while ((c = x->getnext()) != EOF) {
       -                if (isspace(c)) { /* TODO: simplify endname ? */
       +                if (isspace(c)) {
                                if (namelen)
                                        endname = 1;
                                continue;
       -                }
       -                if (c == '?')
       +                } else if (c == '?')
                                ; /* ignore */
                        else if (c == '=') {
                                x->name[namelen] = '\0';
       -                } else if (namelen && ((endname && isalpha(c)) || (c == '>' || c == '/'))) {
       +                        valuestart = 1;
       +                        endname = 1;
       +                } else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) {
                                /* attribute without value */
                                x->name[namelen] = '\0';
                                if (x->xmlattrstart)
       t@@ -36,12 +38,21 @@ xml_parseattrs(XMLParser *x)
                                endname = 0;
                                x->name[0] = c;
                                namelen = 1;
       -                } else if (namelen && (c == '\'' || c == '"')) {
       +                } else if (namelen && valuestart) {
                                /* attribute with value */
       -                        endsep = c; /* c is end separator */
                                if (x->xmlattrstart)
                                        x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
       -                        for (valuelen = 0; (c = x->getnext()) != EOF;) {
       +
       +                        valuelen = 0;
       +                        if (c == '\'' || c == '"') {
       +                                endsep = c;
       +                        } else {
       +                                endsep = ' '; /* isspace() */
       +                                goto startvalue;
       +                        }
       +
       +                        while ((c = x->getnext()) != EOF) {
       +startvalue:
                                        if (c == '&') { /* entities */
                                                x->data[valuelen] = '\0';
                                                /* call data function with data before entity if there is data */
       t@@ -50,16 +61,17 @@ xml_parseattrs(XMLParser *x)
                                                x->data[0] = c;
                                                valuelen = 1;
                                                while ((c = x->getnext()) != EOF) {
       -                                                if (c == endsep)
       +                                                if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c))))
                                                                break;
                                                        if (valuelen < sizeof(x->data) - 1)
                                                                x->data[valuelen++] = c;
                                                        else {
       -                                                        /* TODO: entity too long? this should be very strange. */
       +                                                        /* entity too long for buffer, handle as normal data */
                                                                x->data[valuelen] = '\0';
                                                                if (x->xmlattr)
                                                                        x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       -                                                        valuelen = 0;
       +                                                        x->data[0] = c;
       +                                                        valuelen = 1;
                                                                break;
                                                        }
                                                        if (c == ';') {
       t@@ -70,7 +82,7 @@ xml_parseattrs(XMLParser *x)
                                                                break;
                                                        }
                                                }
       -                                } else if (c != endsep) {
       +                                } else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) {
                                                if (valuelen < sizeof(x->data) - 1) {
                                                        x->data[valuelen++] = c;
                                                } else {
       t@@ -81,7 +93,7 @@ xml_parseattrs(XMLParser *x)
                                                        valuelen = 1;
                                                }
                                        }
       -                                if (c == endsep) {
       +                                if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) {
                                                x->data[valuelen] = '\0';
                                                if (x->xmlattr)
                                                        x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       t@@ -90,8 +102,7 @@ xml_parseattrs(XMLParser *x)
                                                break;
                                        }
                                }
       -                        namelen = 0;
       -                        endname = 0;
       +                        namelen = endname = valuestart = 0;
                        } else if (namelen < sizeof(x->name) - 1) {
                                x->name[namelen++] = c;
                        }
       t@@ -99,8 +110,8 @@ xml_parseattrs(XMLParser *x)
                                break;
                        } else if (c == '/') {
                                x->isshorttag = 1;
       -                        namelen = 0;
                                x->name[0] = '\0';
       +                        namelen = 0;
                        }
                }
        }
       t@@ -108,36 +119,41 @@ xml_parseattrs(XMLParser *x)
        static void
        xml_parsecomment(XMLParser *x)
        {
       -        static const char *end = "-->";
                size_t datalen = 0, i = 0;
       -        char tmp[4];
                int c;
        
                if (x->xmlcommentstart)
                        x->xmlcommentstart(x);
                while ((c = x->getnext()) != EOF) {
       -                if (c == end[i]) {
       -                        if (end[++i] == '\0') { /* end */
       +                if (c == '-' || c == '>') {
       +                        if (x->xmlcomment) {
                                        x->data[datalen] = '\0';
       +                                x->xmlcomment(x, x->data, datalen);
       +                                datalen = 0;
       +                        }
       +                }
       +
       +                if (c == '-') {
       +                        if (++i > 2) {
                                        if (x->xmlcomment)
       -                                        x->xmlcomment(x, x->data, datalen);
       -                                if (x->xmlcommentend)
       -                                        x->xmlcommentend(x);
       -                                return;
       +                                        for (; i > 2; i--)
       +                                                x->xmlcomment(x, "-", 1);
       +                                i = 2;
                                }
       +                        continue;
       +                } else if (c == '>' && i == 2) {
       +                        if (x->xmlcommentend)
       +                                x->xmlcommentend(x);
       +                        return;
                        } else if (i) {
                                if (x->xmlcomment) {
       -                                x->data[datalen] = '\0';
       -                                if (datalen)
       -                                        x->xmlcomment(x, x->data, datalen);
       -                                memcpy(tmp, end, i);
       -                                tmp[i] = '\0';
       -                                x->xmlcomment(x, tmp, i);
       +                                for (; i > 0; i--)
       +                                        x->xmlcomment(x, "-", 1);
                                }
                                i = 0;
       -                        x->data[0] = c;
       -                        datalen = 1;
       -                } else if (datalen < sizeof(x->data) - 1) {
       +                }
       +
       +                if (datalen < sizeof(x->data) - 1) {
                                x->data[datalen++] = c;
                        } else {
                                x->data[datalen] = '\0';
       t@@ -152,36 +168,40 @@ xml_parsecomment(XMLParser *x)
        static void
        xml_parsecdata(XMLParser *x)
        {
       -        static const char *end = "]]>";
                size_t datalen = 0, i = 0;
       -        char tmp[4];
                int c;
        
                if (x->xmlcdatastart)
                        x->xmlcdatastart(x);
                while ((c = x->getnext()) != EOF) {
       -                if (c == end[i]) {
       -                        if (end[++i] == '\0') { /* end */
       +                if (c == ']' || c == '>') {
       +                        if (x->xmlcdata) {
                                        x->data[datalen] = '\0';
       +                                x->xmlcdata(x, x->data, datalen);
       +                                datalen = 0;
       +                        }
       +                }
       +
       +                if (c == ']') {
       +                        if (++i > 2) {
                                        if (x->xmlcdata)
       -                                        x->xmlcdata(x, x->data, datalen);
       -                                if (x->xmlcdataend)
       -                                        x->xmlcdataend(x);
       -                                return;
       +                                        for (; i > 2; i--)
       +                                                x->xmlcdata(x, "]", 1);
       +                                i = 2;
                                }
       +                        continue;
       +                } else if (c == '>' && i == 2) {
       +                        if (x->xmlcdataend)
       +                                x->xmlcdataend(x);
       +                        return;
                        } else if (i) {
       -                        x->data[datalen] = '\0';
       -                        if (x->xmlcdata) {
       -                                if (datalen)
       -                                        x->xmlcdata(x, x->data, datalen);
       -                                memcpy(tmp, end, i);
       -                                tmp[i] = '\0';
       -                                x->xmlcdata(x, tmp, i);
       -                        }
       +                        if (x->xmlcdata)
       +                                for (; i > 0; i--)
       +                                        x->xmlcdata(x, "]", 1);
                                i = 0;
       -                        x->data[0] = c;
       -                        datalen = 1;
       -                } else if (datalen < sizeof(x->data) - 1) {
       +                }
       +
       +                if (datalen < sizeof(x->data) - 1) {
                                x->data[datalen++] = c;
                        } else {
                                x->data[datalen] = '\0';
       t@@ -193,48 +213,53 @@ xml_parsecdata(XMLParser *x)
                }
        }
        
       -int
       -xml_codepointtoutf8(uint32_t cp, uint32_t *utf)
       +static int
       +codepointtoutf8(long r, char *s)
        {
       -        if (cp >= 0x10000) {
       -                /* 4 bytes */
       -                *utf = 0xf0808080 | ((cp & 0xfc0000) << 6) |
       -                       ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) |
       -                       (cp & 0x3f);
       -                return 4;
       -        } else if (cp >= 0x00800) {
       -                /* 3 bytes */
       -                *utf = 0xe08080 |
       -                       ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) |
       -                       (cp & 0x3f);
       -                return 3;
       -        } else if (cp >= 0x80) {
       -                /* 2 bytes */
       -                *utf = 0xc080 |
       -                       ((cp & 0xfc0) << 2) | (cp & 0x3f);
       +        if (r == 0) {
       +                return 0; /* NUL byte */
       +        } else if (r <= 0x7F) {
       +                /* 1 byte: 0aaaaaaa */
       +                s[0] = r;
       +                return 1;
       +        } else if (r <= 0x07FF) {
       +                /* 2 bytes: 00000aaa aabbbbbb */
       +                s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
       +                s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
                        return 2;
       +        } else if (r <= 0xFFFF) {
       +                /* 3 bytes: aaaabbbb bbcccccc */
       +                s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
       +                s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
       +                s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
       +                return 3;
       +        } else {
       +                /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
       +                s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
       +                s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
       +                s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
       +                s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
       +                return 4;
                }
       -        *utf = cp & 0xff;
       -        return *utf ? 1 : 0; /* 1 byte */
        }
        
       -ssize_t
       -xml_namedentitytostr(const char *e, char *buf, size_t bufsiz)
       +static int
       +namedentitytostr(const char *e, char *buf, size_t bufsiz)
        {
       -        const struct {
       +        static const struct {
                        char *entity;
                        int c;
                } entities[] = {
       -                { .entity = "&amp;",  .c = '&'  },
       -                { .entity = "&lt;",   .c = '<'  },
       -                { .entity = "&gt;",   .c = '>'  },
       -                { .entity = "&apos;", .c = '\'' },
       -                { .entity = "&quot;", .c = '"'  },
       -                { .entity = "&AMP;",  .c = '&'  },
       -                { .entity = "&LT;",   .c = '<'  },
       -                { .entity = "&GT;",   .c = '>'  },
       -                { .entity = "&APOS;", .c = '\'' },
       -                { .entity = "&QUOT;", .c = '"'  }
       +                { "&amp;",  '&'  },
       +                { "&lt;",   '<'  },
       +                { "&gt;",   '>'  },
       +                { "&apos;", '\'' },
       +                { "&quot;", '"'  },
       +                { "&AMP;",  '&'  },
       +                { "&LT;",   '<'  },
       +                { "&GT;",   '>'  },
       +                { "&APOS;", '\'' },
       +                { "&QUOT;", '"'  }
                };
                size_t i;
        
       t@@ -256,11 +281,11 @@ xml_namedentitytostr(const char *e, char *buf, size_t bufsiz)
                return 0;
        }
        
       -ssize_t
       -xml_numericentitytostr(const char *e, char *buf, size_t bufsiz)
       +static int
       +numericentitytostr(const char *e, char *buf, size_t bufsiz)
        {
       -        uint32_t l = 0, cp = 0;
       -        size_t b, len;
       +        long l;
       +        int len;
                char *end;
        
                /* buffer is too small */
       t@@ -268,7 +293,7 @@ xml_numericentitytostr(const char *e, char *buf, size_t bufsiz)
                        return -1;
        
                /* not a numeric entity */
       -        if (!(e[0] == '&' && e[1] == '#'))
       +        if (e[0] != '&' || e[1] != '#')
                        return 0;
        
                /* e[1] == '#', numeric / hexadecimal entity */
       t@@ -279,21 +304,18 @@ xml_numericentitytostr(const char *e, char *buf, size_t bufsiz)
                        l = strtoul(e + 1, &end, 16);
                else
                        l = strtoul(e, &end, 10);
       -        /* invalid value or not a well-formed entity */
       -        if (errno || *end != ';')
       +        /* invalid value or not a well-formed entity or too high codepoint */
       +        if (errno || *end != ';' || l > 0x10FFFF)
                        return 0;
       -        len = xml_codepointtoutf8(l, &cp);
       -        /* make string */
       -        for (b = 0; b < len; b++)
       -                buf[b] = (cp >> (8 * (len - 1 - b))) & 0xff;
       +        len = codepointtoutf8(l, buf);
                buf[len] = '\0';
        
       -        return (ssize_t)len;
       +        return len;
        }
        
        /* convert named- or numeric entity string to buffer string
         * returns byte-length of string. */
       -ssize_t
       +int
        xml_entitytostr(const char *e, char *buf, size_t bufsiz)
        {
                /* buffer is too small */
       t@@ -304,9 +326,9 @@ xml_entitytostr(const char *e, char *buf, size_t bufsiz)
                        return 0;
                /* named entity */
                if (e[1] != '#')
       -                return xml_namedentitytostr(e, buf, bufsiz);
       +                return namedentitytostr(e, buf, bufsiz);
                else /* numeric entity */
       -                return xml_numericentitytostr(e, buf, bufsiz);
       +                return numericentitytostr(e, buf, bufsiz);
        }
        
        void
       t@@ -324,12 +346,12 @@ xml_parse(XMLParser *x)
                        if (c == '<') { /* parse tag */
                                if ((c = x->getnext()) == EOF)
                                        return;
       -                        x->tag[0] = '\0';
       -                        x->taglen = 0;
       +
                                if (c == '!') { /* cdata and comments */
                                        for (tagdatalen = 0; (c = x->getnext()) != EOF;) {
       -                                        if (tagdatalen <= sizeof("[CDATA[") - 1) /* if (d < sizeof(x->data)) */
       -                                                x->data[tagdatalen++] = c; /* TODO: prevent overflow */
       +                                        /* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */
       +                                        if (tagdatalen <= sizeof("[CDATA[") - 1)
       +                                                x->data[tagdatalen++] = c;
                                                if (c == '>')
                                                        break;
                                                else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
       t@@ -345,6 +367,9 @@ xml_parse(XMLParser *x)
                                                }
                                        }
                                } else {
       +                                x->tag[0] = '\0';
       +                                x->taglen = 0;
       +
                                        /* normal tag (open, short open, close), processing instruction. */
                                        if (isspace(c))
                                                while ((c = x->getnext()) != EOF && isspace(c))
       t@@ -356,7 +381,7 @@ xml_parse(XMLParser *x)
                                        x->isshorttag = ispi;
                                        taglen = 1;
                                        while ((c = x->getnext()) != EOF) {
       -                                        if (c == '/') /* TODO: simplify short tag? */
       +                                        if (c == '/')
                                                        x->isshorttag = 1; /* short tag */
                                                else if (c == '>' || isspace(c)) {
                                                        x->tag[taglen] = '\0';
       t@@ -379,7 +404,7 @@ xml_parse(XMLParser *x)
                                                                x->xmltagend(x, x->tag, x->taglen, 1);
                                                        break;
                                                } else if (taglen < sizeof(x->tag) - 1)
       -                                                x->tag[taglen++] = c;
       +                                                x->tag[taglen++] = c; /* NOTE: tag name truncation */
                                        }
                                }
                        } else {
       t@@ -401,9 +426,16 @@ xml_parse(XMLParser *x)
                                                                break;
                                                        if (datalen < sizeof(x->data) - 1)
                                                                x->data[datalen++] = c;
       -                                                if (isspace(c))
       +                                                else {
       +                                                        /* entity too long for buffer, handle as normal data */
       +                                                        x->data[datalen] = '\0';
       +                                                        if (x->xmldata)
       +                                                                x->xmldata(x, x->data, datalen);
       +                                                        x->data[0] = c;
       +                                                        datalen = 1;
                                                                break;
       -                                                else if (c == ';') {
       +                                                }
       +                                                if (c == ';') {
                                                                x->data[datalen] = '\0';
                                                                if (x->xmldataentity)
                                                                        x->xmldataentity(x, x->data, datalen);
 (DIR) diff --git a/xml.h b/xml.h
       t@@ -31,14 +31,10 @@ typedef struct xmlparser {
                /* current tag is in short form ? <tag /> */
                int isshorttag;
                /* current attribute name */
       -        char name[256];
       +        char name[1024];
                /* data buffer used for tag data, cdata and attribute data */
                char data[BUFSIZ];
        } XMLParser;
        
       -int     xml_codepointtoutf8(uint32_t, uint32_t *);
       -ssize_t xml_entitytostr(const char *, char *, size_t);
       -ssize_t xml_namedentitytostr(const char *, char *, size_t);
       -ssize_t xml_numericentitytostr(const char *, char *, size_t);
       -
       -void    xml_parse(XMLParser *);
       +int xml_entitytostr(const char *, char *, size_t);
       +void xml_parse(XMLParser *);