txml: sync many XML parser improvements - webdump - [FORK] git://git.codemadness.org/webdump
(HTM) git clone git://git.z3bra.org/webdump.git
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit d22cedcf1a4d6a4066489e029ee2888d76308318
(DIR) parent b0fd3fce528a98b283ee135d2a09da04191223c3
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Sun, 26 Aug 2018 15:27:26 +0200
xml: sync many XML parser improvements
Diffstat:
M main.c | 1 -
M xml.c | 250 +++++++++++++++++--------------
M xml.h | 10 +++-------
3 files changed, 144 insertions(+), 117 deletions(-)
---
(DIR) diff --git a/main.c b/main.c
t@@ -3,7 +3,6 @@
#include <ctype.h>
#include <err.h>
-#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
(DIR) diff --git a/xml.c b/xml.c
t@@ -1,7 +1,8 @@
+#include <sys/types.h>
+
#include <ctype.h>
#include <errno.h>
#include <limits.h>
-#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
t@@ -12,19 +13,20 @@ static void
xml_parseattrs(XMLParser *x)
{
size_t namelen = 0, valuelen;
- int c, endsep, endname = 0;
+ int c, endsep, endname = 0, valuestart = 0;
while ((c = x->getnext()) != EOF) {
- if (isspace(c)) { /* TODO: simplify endname ? */
+ if (isspace(c)) {
if (namelen)
endname = 1;
continue;
- }
- if (c == '?')
+ } else if (c == '?')
; /* ignore */
else if (c == '=') {
x->name[namelen] = '\0';
- } else if (namelen && ((endname && isalpha(c)) || (c == '>' || c == '/'))) {
+ valuestart = 1;
+ endname = 1;
+ } else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) {
/* attribute without value */
x->name[namelen] = '\0';
if (x->xmlattrstart)
t@@ -36,12 +38,21 @@ xml_parseattrs(XMLParser *x)
endname = 0;
x->name[0] = c;
namelen = 1;
- } else if (namelen && (c == '\'' || c == '"')) {
+ } else if (namelen && valuestart) {
/* attribute with value */
- endsep = c; /* c is end separator */
if (x->xmlattrstart)
x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
- for (valuelen = 0; (c = x->getnext()) != EOF;) {
+
+ valuelen = 0;
+ if (c == '\'' || c == '"') {
+ endsep = c;
+ } else {
+ endsep = ' '; /* isspace() */
+ goto startvalue;
+ }
+
+ while ((c = x->getnext()) != EOF) {
+startvalue:
if (c == '&') { /* entities */
x->data[valuelen] = '\0';
/* call data function with data before entity if there is data */
t@@ -50,16 +61,17 @@ xml_parseattrs(XMLParser *x)
x->data[0] = c;
valuelen = 1;
while ((c = x->getnext()) != EOF) {
- if (c == endsep)
+ if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c))))
break;
if (valuelen < sizeof(x->data) - 1)
x->data[valuelen++] = c;
else {
- /* TODO: entity too long? this should be very strange. */
+ /* entity too long for buffer, handle as normal data */
x->data[valuelen] = '\0';
if (x->xmlattr)
x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
- valuelen = 0;
+ x->data[0] = c;
+ valuelen = 1;
break;
}
if (c == ';') {
t@@ -70,7 +82,7 @@ xml_parseattrs(XMLParser *x)
break;
}
}
- } else if (c != endsep) {
+ } else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) {
if (valuelen < sizeof(x->data) - 1) {
x->data[valuelen++] = c;
} else {
t@@ -81,7 +93,7 @@ xml_parseattrs(XMLParser *x)
valuelen = 1;
}
}
- if (c == endsep) {
+ if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) {
x->data[valuelen] = '\0';
if (x->xmlattr)
x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
t@@ -90,8 +102,7 @@ xml_parseattrs(XMLParser *x)
break;
}
}
- namelen = 0;
- endname = 0;
+ namelen = endname = valuestart = 0;
} else if (namelen < sizeof(x->name) - 1) {
x->name[namelen++] = c;
}
t@@ -99,8 +110,8 @@ xml_parseattrs(XMLParser *x)
break;
} else if (c == '/') {
x->isshorttag = 1;
- namelen = 0;
x->name[0] = '\0';
+ namelen = 0;
}
}
}
t@@ -108,36 +119,41 @@ xml_parseattrs(XMLParser *x)
static void
xml_parsecomment(XMLParser *x)
{
- static const char *end = "-->";
size_t datalen = 0, i = 0;
- char tmp[4];
int c;
if (x->xmlcommentstart)
x->xmlcommentstart(x);
while ((c = x->getnext()) != EOF) {
- if (c == end[i]) {
- if (end[++i] == '\0') { /* end */
+ if (c == '-' || c == '>') {
+ if (x->xmlcomment) {
x->data[datalen] = '\0';
+ x->xmlcomment(x, x->data, datalen);
+ datalen = 0;
+ }
+ }
+
+ if (c == '-') {
+ if (++i > 2) {
if (x->xmlcomment)
- x->xmlcomment(x, x->data, datalen);
- if (x->xmlcommentend)
- x->xmlcommentend(x);
- return;
+ for (; i > 2; i--)
+ x->xmlcomment(x, "-", 1);
+ i = 2;
}
+ continue;
+ } else if (c == '>' && i == 2) {
+ if (x->xmlcommentend)
+ x->xmlcommentend(x);
+ return;
} else if (i) {
if (x->xmlcomment) {
- x->data[datalen] = '\0';
- if (datalen)
- x->xmlcomment(x, x->data, datalen);
- memcpy(tmp, end, i);
- tmp[i] = '\0';
- x->xmlcomment(x, tmp, i);
+ for (; i > 0; i--)
+ x->xmlcomment(x, "-", 1);
}
i = 0;
- x->data[0] = c;
- datalen = 1;
- } else if (datalen < sizeof(x->data) - 1) {
+ }
+
+ if (datalen < sizeof(x->data) - 1) {
x->data[datalen++] = c;
} else {
x->data[datalen] = '\0';
t@@ -152,36 +168,40 @@ xml_parsecomment(XMLParser *x)
static void
xml_parsecdata(XMLParser *x)
{
- static const char *end = "]]>";
size_t datalen = 0, i = 0;
- char tmp[4];
int c;
if (x->xmlcdatastart)
x->xmlcdatastart(x);
while ((c = x->getnext()) != EOF) {
- if (c == end[i]) {
- if (end[++i] == '\0') { /* end */
+ if (c == ']' || c == '>') {
+ if (x->xmlcdata) {
x->data[datalen] = '\0';
+ x->xmlcdata(x, x->data, datalen);
+ datalen = 0;
+ }
+ }
+
+ if (c == ']') {
+ if (++i > 2) {
if (x->xmlcdata)
- x->xmlcdata(x, x->data, datalen);
- if (x->xmlcdataend)
- x->xmlcdataend(x);
- return;
+ for (; i > 2; i--)
+ x->xmlcdata(x, "]", 1);
+ i = 2;
}
+ continue;
+ } else if (c == '>' && i == 2) {
+ if (x->xmlcdataend)
+ x->xmlcdataend(x);
+ return;
} else if (i) {
- x->data[datalen] = '\0';
- if (x->xmlcdata) {
- if (datalen)
- x->xmlcdata(x, x->data, datalen);
- memcpy(tmp, end, i);
- tmp[i] = '\0';
- x->xmlcdata(x, tmp, i);
- }
+ if (x->xmlcdata)
+ for (; i > 0; i--)
+ x->xmlcdata(x, "]", 1);
i = 0;
- x->data[0] = c;
- datalen = 1;
- } else if (datalen < sizeof(x->data) - 1) {
+ }
+
+ if (datalen < sizeof(x->data) - 1) {
x->data[datalen++] = c;
} else {
x->data[datalen] = '\0';
t@@ -193,48 +213,53 @@ xml_parsecdata(XMLParser *x)
}
}
-int
-xml_codepointtoutf8(uint32_t cp, uint32_t *utf)
+static int
+codepointtoutf8(long r, char *s)
{
- if (cp >= 0x10000) {
- /* 4 bytes */
- *utf = 0xf0808080 | ((cp & 0xfc0000) << 6) |
- ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) |
- (cp & 0x3f);
- return 4;
- } else if (cp >= 0x00800) {
- /* 3 bytes */
- *utf = 0xe08080 |
- ((cp & 0x3f000) << 4) | ((cp & 0xfc0) << 2) |
- (cp & 0x3f);
- return 3;
- } else if (cp >= 0x80) {
- /* 2 bytes */
- *utf = 0xc080 |
- ((cp & 0xfc0) << 2) | (cp & 0x3f);
+ if (r == 0) {
+ return 0; /* NUL byte */
+ } else if (r <= 0x7F) {
+ /* 1 byte: 0aaaaaaa */
+ s[0] = r;
+ return 1;
+ } else if (r <= 0x07FF) {
+ /* 2 bytes: 00000aaa aabbbbbb */
+ s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */
+ s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */
return 2;
+ } else if (r <= 0xFFFF) {
+ /* 3 bytes: aaaabbbb bbcccccc */
+ s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
+ s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */
+ s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */
+ return 3;
+ } else {
+ /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
+ s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
+ s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
+ s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */
+ s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */
+ return 4;
}
- *utf = cp & 0xff;
- return *utf ? 1 : 0; /* 1 byte */
}
-ssize_t
-xml_namedentitytostr(const char *e, char *buf, size_t bufsiz)
+static int
+namedentitytostr(const char *e, char *buf, size_t bufsiz)
{
- const struct {
+ static const struct {
char *entity;
int c;
} entities[] = {
- { .entity = "&amp;", .c = '&' },
- { .entity = "&lt;", .c = '<' },
- { .entity = "&gt;", .c = '>' },
- { .entity = "&apos;", .c = '\'' },
- { .entity = "&quot;", .c = '"' },
- { .entity = "&AMP;", .c = '&' },
- { .entity = "&LT;", .c = '<' },
- { .entity = "&GT;", .c = '>' },
- { .entity = "&APOS;", .c = '\'' },
- { .entity = "&QUOT;", .c = '"' }
+ { "&amp;", '&' },
+ { "&lt;", '<' },
+ { "&gt;", '>' },
+ { "&apos;", '\'' },
+ { "&quot;", '"' },
+ { "&AMP;", '&' },
+ { "&LT;", '<' },
+ { "&GT;", '>' },
+ { "&APOS;", '\'' },
+ { "&QUOT;", '"' }
};
size_t i;
t@@ -256,11 +281,11 @@ xml_namedentitytostr(const char *e, char *buf, size_t bufsiz)
return 0;
}
-ssize_t
-xml_numericentitytostr(const char *e, char *buf, size_t bufsiz)
+static int
+numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
- uint32_t l = 0, cp = 0;
- size_t b, len;
+ long l;
+ int len;
char *end;
/* buffer is too small */
t@@ -268,7 +293,7 @@ xml_numericentitytostr(const char *e, char *buf, size_t bufsiz)
return -1;
/* not a numeric entity */
- if (!(e[0] == '&' && e[1] == '#'))
+ if (e[0] != '&' || e[1] != '#')
return 0;
/* e[1] == '#', numeric / hexadecimal entity */
t@@ -279,21 +304,18 @@ xml_numericentitytostr(const char *e, char *buf, size_t bufsiz)
l = strtoul(e + 1, &end, 16);
else
l = strtoul(e, &end, 10);
- /* invalid value or not a well-formed entity */
- if (errno || *end != ';')
+ /* invalid value or not a well-formed entity or too high codepoint */
+ if (errno || *end != ';' || l > 0x10FFFF)
return 0;
- len = xml_codepointtoutf8(l, &cp);
- /* make string */
- for (b = 0; b < len; b++)
- buf[b] = (cp >> (8 * (len - 1 - b))) & 0xff;
+ len = codepointtoutf8(l, buf);
buf[len] = '\0';
- return (ssize_t)len;
+ return len;
}
/* convert named- or numeric entity string to buffer string
* returns byte-length of string. */
-ssize_t
+int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
/* buffer is too small */
t@@ -304,9 +326,9 @@ xml_entitytostr(const char *e, char *buf, size_t bufsiz)
return 0;
/* named entity */
if (e[1] != '#')
- return xml_namedentitytostr(e, buf, bufsiz);
+ return namedentitytostr(e, buf, bufsiz);
else /* numeric entity */
- return xml_numericentitytostr(e, buf, bufsiz);
+ return numericentitytostr(e, buf, bufsiz);
}
void
t@@ -324,12 +346,12 @@ xml_parse(XMLParser *x)
if (c == '<') { /* parse tag */
if ((c = x->getnext()) == EOF)
return;
- x->tag[0] = '\0';
- x->taglen = 0;
+
if (c == '!') { /* cdata and comments */
for (tagdatalen = 0; (c = x->getnext()) != EOF;) {
- if (tagdatalen <= sizeof("[CDATA[") - 1) /* if (d < sizeof(x->data)) */
- x->data[tagdatalen++] = c; /* TODO: prevent overflow */
+ /* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */
+ if (tagdatalen <= sizeof("[CDATA[") - 1)
+ x->data[tagdatalen++] = c;
if (c == '>')
break;
else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
t@@ -345,6 +367,9 @@ xml_parse(XMLParser *x)
}
}
} else {
+ x->tag[0] = '\0';
+ x->taglen = 0;
+
/* normal tag (open, short open, close), processing instruction. */
if (isspace(c))
while ((c = x->getnext()) != EOF && isspace(c))
t@@ -356,7 +381,7 @@ xml_parse(XMLParser *x)
x->isshorttag = ispi;
taglen = 1;
while ((c = x->getnext()) != EOF) {
- if (c == '/') /* TODO: simplify short tag? */
+ if (c == '/')
x->isshorttag = 1; /* short tag */
else if (c == '>' || isspace(c)) {
x->tag[taglen] = '\0';
t@@ -379,7 +404,7 @@ xml_parse(XMLParser *x)
x->xmltagend(x, x->tag, x->taglen, 1);
break;
} else if (taglen < sizeof(x->tag) - 1)
- x->tag[taglen++] = c;
+ x->tag[taglen++] = c; /* NOTE: tag name truncation */
}
}
} else {
t@@ -401,9 +426,16 @@ xml_parse(XMLParser *x)
break;
if (datalen < sizeof(x->data) - 1)
x->data[datalen++] = c;
- if (isspace(c))
+ else {
+ /* entity too long for buffer, handle as normal data */
+ x->data[datalen] = '\0';
+ if (x->xmldata)
+ x->xmldata(x, x->data, datalen);
+ x->data[0] = c;
+ datalen = 1;
break;
- else if (c == ';') {
+ }
+ if (c == ';') {
x->data[datalen] = '\0';
if (x->xmldataentity)
x->xmldataentity(x, x->data, datalen);
(DIR) diff --git a/xml.h b/xml.h
t@@ -31,14 +31,10 @@ typedef struct xmlparser {
/* current tag is in short form ? <tag /> */
int isshorttag;
/* current attribute name */
- char name[256];
+ char name[1024];
/* data buffer used for tag data, cdata and attribute data */
char data[BUFSIZ];
} XMLParser;
-int xml_codepointtoutf8(uint32_t, uint32_t *);
-ssize_t xml_entitytostr(const char *, char *, size_t);
-ssize_t xml_namedentitytostr(const char *, char *, size_t);
-ssize_t xml_numericentitytostr(const char *, char *, size_t);
-
-void xml_parse(XMLParser *);
+int xml_entitytostr(const char *, char *, size_t);
+void xml_parse(XMLParser *);