integrate XML parser and inline read loop - osm-zipcodes - Extract (dutch) addresses from OpenStreetMap OSM XML
(HTM) git clone git://git.codemadness.org/osm-zipcodes
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit b81f0c77edbee0be59ca8d14b3fd060aff838486
(DIR) parent 918d881b3982b6ceb90ee1eaaa3a1f7c11addb4c
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Thu, 11 Apr 2019 18:10:59 +0200
integrate XML parser and inline read loop
18s -> 6s on a small .osm region
Diffstat:
M Makefile | 2 +-
M main.c | 502 ++++++++++++++++++++++++++++++-
D xml.c | 468 -------------------------------
D xml.h | 40 -------------------------------
4 files changed, 493 insertions(+), 519 deletions(-)
---
(DIR) diff --git a/Makefile b/Makefile
@@ -1,5 +1,5 @@
build: clean
- cc xml.c main.c -o main -O3 -Wall -static
+ cc main.c -o main -O3 -Wall -static
strip main
clean:
(DIR) diff --git a/main.c b/main.c
@@ -2,14 +2,53 @@
#include <sys/stat.h>
#include <sys/types.h>
+#include <sys/types.h>
+
#include <ctype.h>
#include <err.h>
+#include <errno.h>
#include <fcntl.h>
+#include <limits.h>
#include <stdio.h>
+#include <stdlib.h>
#include <string.h>
#include <unistd.h>
-#include "xml.h"
+typedef struct xmlparser {
+ /* handlers */
+ void (*xmlattr)(struct xmlparser *, const char *, size_t,
+ const char *, size_t, const char *, size_t);
+ void (*xmlattrend)(struct xmlparser *, const char *, size_t,
+ const char *, size_t);
+ void (*xmlattrstart)(struct xmlparser *, const char *, size_t,
+ const char *, size_t);
+ void (*xmlattrentity)(struct xmlparser *, const char *, size_t,
+ const char *, size_t, const char *, size_t);
+ void (*xmlcdatastart)(struct xmlparser *);
+ void (*xmlcdata)(struct xmlparser *, const char *, size_t);
+ void (*xmlcdataend)(struct xmlparser *);
+ void (*xmlcommentstart)(struct xmlparser *);
+ void (*xmlcomment)(struct xmlparser *, const char *, size_t);
+ void (*xmlcommentend)(struct xmlparser *);
+ void (*xmldata)(struct xmlparser *, const char *, size_t);
+ void (*xmldataend)(struct xmlparser *);
+ void (*xmldataentity)(struct xmlparser *, const char *, size_t);
+ void (*xmldatastart)(struct xmlparser *);
+ void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
+ void (*xmltagstart)(struct xmlparser *, const char *, size_t);
+ void (*xmltagstartparsed)(struct xmlparser *, const char *,
+ size_t, int);
+
+ /* current tag */
+ char tag[1024];
+ size_t taglen;
+ /* current tag is in short form ? <tag /> */
+ int isshorttag;
+ /* current attribute name */
+ char name[1024];
+ /* data buffer used for tag data, cdata and attribute data */
+ char data[BUFSIZ];
+} XMLParser;
enum FieldType {
Postcode = 1,
@@ -50,6 +89,458 @@ struct stat st;
unsigned char *reg;
size_t len, off;
+#define GETNEXT() (off >= len ? EOF : reg[off++])
+
+static void
+xml_parseattrs(XMLParser *x)
+{
+ size_t namelen = 0, valuelen;
+ int c, endsep, endname = 0, valuestart = 0;
+
+ while ((c = GETNEXT()) != EOF) {
+ if (isspace(c)) {
+ if (namelen)
+ endname = 1;
+ continue;
+ } else if (c == '?')
+ ; /* ignore */
+ else if (c == '=') {
+ x->name[namelen] = '\0';
+ valuestart = 1;
+ endname = 1;
+ } else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) {
+ /* attribute without value */
+ x->name[namelen] = '\0';
+ if (x->xmlattrstart)
+ x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
+ if (x->xmlattr)
+ x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
+ if (x->xmlattrend)
+ x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
+ endname = 0;
+ x->name[0] = c;
+ namelen = 1;
+ } else if (namelen && valuestart) {
+ /* attribute with value */
+ if (x->xmlattrstart)
+ x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
+
+ valuelen = 0;
+ if (c == '\'' || c == '"') {
+ endsep = c;
+ } else {
+ endsep = ' '; /* isspace() */
+ goto startvalue;
+ }
+
+ while ((c = GETNEXT()) != EOF) {
+startvalue:
+ if (c == '&') { /* entities */
+ x->data[valuelen] = '\0';
+ /* call data function with data before entity if there is data */
+ if (valuelen && x->xmlattr)
+ x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+ x->data[0] = c;
+ valuelen = 1;
+ while ((c = GETNEXT()) != EOF) {
+ if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c))))
+ break;
+ if (valuelen < sizeof(x->data) - 1)
+ x->data[valuelen++] = c;
+ else {
+ /* entity too long for buffer, handle as normal data */
+ x->data[valuelen] = '\0';
+ if (x->xmlattr)
+ x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+ x->data[0] = c;
+ valuelen = 1;
+ break;
+ }
+ if (c == ';') {
+ x->data[valuelen] = '\0';
+ if (x->xmlattrentity)
+ x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+ valuelen = 0;
+ break;
+ }
+ }
+ } else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) {
+ if (valuelen < sizeof(x->data) - 1) {
+ x->data[valuelen++] = c;
+ } else {
+ x->data[valuelen] = '\0';
+ if (x->xmlattr)
+ x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+ x->data[0] = c;
+ valuelen = 1;
+ }
+ }
+ if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) {
+ x->data[valuelen] = '\0';
+ if (x->xmlattr)
+ x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+ if (x->xmlattrend)
+ x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
+ break;
+ }
+ }
+ namelen = endname = valuestart = 0;
+ } else if (namelen < sizeof(x->name) - 1) {
+ x->name[namelen++] = c;
+ }
+ if (c == '>') {
+ break;
+ } else if (c == '/') {
+ x->isshorttag = 1;
+ x->name[0] = '\0';
+ namelen = 0;
+ }
+ }
+}
+
+static void
+xml_parsecomment(XMLParser *x)
+{
+ size_t datalen = 0, i = 0;
+ int c;
+
+ if (x->xmlcommentstart)
+ x->xmlcommentstart(x);
+ while ((c = GETNEXT()) != EOF) {
+ if (c == '-' || c == '>') {
+ if (x->xmlcomment) {
+ x->data[datalen] = '\0';
+ x->xmlcomment(x, x->data, datalen);
+ datalen = 0;
+ }
+ }
+
+ if (c == '-') {
+ if (++i > 2) {
+ if (x->xmlcomment)
+ for (; i > 2; i--)
+ x->xmlcomment(x, "-", 1);
+ i = 2;
+ }
+ continue;
+ } else if (c == '>' && i == 2) {
+ if (x->xmlcommentend)
+ x->xmlcommentend(x);
+ return;
+ } else if (i) {
+ if (x->xmlcomment) {
+ for (; i > 0; i--)
+ x->xmlcomment(x, "-", 1);
+ }
+ i = 0;
+ }
+
+ if (datalen < sizeof(x->data) - 1) {
+ x->data[datalen++] = c;
+ } else {
+ x->data[datalen] = '\0';
+ if (x->xmlcomment)
+ x->xmlcomment(x, x->data, datalen);
+ x->data[0] = c;
+ datalen = 1;
+ }
+ }
+}
+
+static void
+xml_parsecdata(XMLParser *x)
+{
+ size_t datalen = 0, i = 0;
+ int c;
+
+ if (x->xmlcdatastart)
+ x->xmlcdatastart(x);
+ while ((c = GETNEXT()) != EOF) {
+ if (c == ']' || c == '>') {
+ if (x->xmlcdata) {
+ x->data[datalen] = '\0';
+ x->xmlcdata(x, x->data, datalen);
+ datalen = 0;
+ }
+ }
+
+ if (c == ']') {
+ if (++i > 2) {
+ if (x->xmlcdata)
+ for (; i > 2; i--)
+ x->xmlcdata(x, "]", 1);
+ i = 2;
+ }
+ continue;
+ } else if (c == '>' && i == 2) {
+ if (x->xmlcdataend)
+ x->xmlcdataend(x);
+ return;
+ } else if (i) {
+ if (x->xmlcdata)
+ for (; i > 0; i--)
+ x->xmlcdata(x, "]", 1);
+ i = 0;
+ }
+
+ if (datalen < sizeof(x->data) - 1) {
+ x->data[datalen++] = c;
+ } else {
+ x->data[datalen] = '\0';
+ if (x->xmlcdata)
+ x->xmlcdata(x, x->data, datalen);
+ x->data[0] = c;
+ datalen = 1;
+ }
+ }
+}
+
+static int
+codepointtoutf8(long r, char *s)
+{
+ if (r == 0) {
+ return 0; /* NUL byte */
+ } else if (r <= 0x7F) {
+ /* 1 byte: 0aaaaaaa */
+ s[0] = r;
+ return 1;
+ } else if (r <= 0x07FF) {
+ /* 2 bytes: 00000aaa aabbbbbb */
+ s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */
+ s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */
+ return 2;
+ } else if (r <= 0xFFFF) {
+ /* 3 bytes: aaaabbbb bbcccccc */
+ s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
+ s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */
+ s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */
+ return 3;
+ } else {
+ /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
+ s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
+ s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
+ s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */
+ s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */
+ return 4;
+ }
+}
+
+static int
+namedentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+ static const struct {
+ const char *entity;
+ int c;
+ } entities[] = {
+ { "amp;", '&' },
+ { "lt;", '<' },
+ { "gt;", '>' },
+ { "apos;", '\'' },
+ { "quot;", '"' },
+ { "AMP;", '&' },
+ { "LT;", '<' },
+ { "GT;", '>' },
+ { "APOS;", '\'' },
+ { "QUOT;", '"' }
+ };
+ size_t i;
+
+ /* buffer is too small */
+ if (bufsiz < 2)
+ return -1;
+
+ for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
+ if (!strcmp(e, entities[i].entity)) {
+ buf[0] = entities[i].c;
+ buf[1] = '\0';
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static int
+numericentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+ long l;
+ int len;
+ char *end;
+
+ /* buffer is too small */
+ if (bufsiz < 5)
+ return -1;
+
+ errno = 0;
+ /* hex (16) or decimal (10) */
+ if (*e == 'x')
+ l = strtoul(e + 1, &end, 16);
+ else
+ l = strtoul(e, &end, 10);
+ /* invalid value or not a well-formed entity or too high codepoint */
+ if (errno || *end != ';' || l > 0x10FFFF)
+ return 0;
+ len = codepointtoutf8(l, buf);
+ buf[len] = '\0';
+
+ return len;
+}
+
+/* convert named- or numeric entity string to buffer string
+ * returns byte-length of string. */
+int
+xml_entitytostr(const char *e, char *buf, size_t bufsiz)
+{
+ /* doesn't start with & */
+ if (e[0] != '&')
+ return 0;
+ /* numeric entity */
+ if (e[1] == '#')
+ return numericentitytostr(e + 2, buf, bufsiz);
+ else /* named entity */
+ return namedentitytostr(e + 1, buf, bufsiz);
+}
+
+void
+xml_parse(XMLParser *x)
+{
+ size_t datalen, tagdatalen;
+ int c, isend;
+
+ while ((c = GETNEXT()) != EOF && c != '<')
+ ; /* skip until < */
+
+ while (c != EOF) {
+ if (c == '<') { /* parse tag */
+ if ((c = GETNEXT()) == EOF)
+ return;
+
+ if (c == '!') { /* cdata and comments */
+ for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
+ /* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */
+ if (tagdatalen <= sizeof("[CDATA[") - 1)
+ x->data[tagdatalen++] = c;
+ if (c == '>')
+ break;
+ else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
+ (x->data[0] == '-')) {
+ xml_parsecomment(x);
+ break;
+ } else if (c == '[') {
+ if (tagdatalen == sizeof("[CDATA[") - 1 &&
+ !strncmp(x->data, "[CDATA[", tagdatalen)) {
+ xml_parsecdata(x);
+ break;
+ }
+ }
+ }
+ } else {
+ /* normal tag (open, short open, close), processing instruction. */
+ x->tag[0] = c;
+ x->taglen = 1;
+ x->isshorttag = isend = 0;
+
+ /* treat processing instruction as shorttag, don't strip "?" prefix. */
+ if (c == '?') {
+ x->isshorttag = 1;
+ } else if (c == '/') {
+ if ((c = GETNEXT()) == EOF)
+ return;
+ x->tag[0] = c;
+ isend = 1;
+ }
+
+ while ((c = GETNEXT()) != EOF) {
+ if (c == '/')
+ x->isshorttag = 1; /* short tag */
+ else if (c == '>' || isspace(c)) {
+ x->tag[x->taglen] = '\0';
+ if (isend) { /* end tag, starts with </ */
+ if (x->xmltagend)
+ x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
+ x->tag[0] = '\0';
+ x->taglen = 0;
+ } else {
+ /* start tag */
+ if (x->xmltagstart)
+ x->xmltagstart(x, x->tag, x->taglen);
+ if (isspace(c))
+ xml_parseattrs(x);
+ if (x->xmltagstartparsed)
+ x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
+ }
+ /* call tagend for shortform or processing instruction */
+ if (x->isshorttag) {
+ if (x->xmltagend)
+ x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
+ x->tag[0] = '\0';
+ x->taglen = 0;
+ }
+ break;
+ } else if (x->taglen < sizeof(x->tag) - 1)
+ x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
+ }
+ }
+ } else {
+ /* parse tag data */
+ datalen = 0;
+ if (x->xmldatastart)
+ x->xmldatastart(x);
+ while ((c = GETNEXT()) != EOF) {
+ if (c == '&') {
+ if (datalen) {
+ x->data[datalen] = '\0';
+ if (x->xmldata)
+ x->xmldata(x, x->data, datalen);
+ }
+ x->data[0] = c;
+ datalen = 1;
+ while ((c = GETNEXT()) != EOF) {
+ if (c == '<')
+ break;
+ if (datalen < sizeof(x->data) - 1)
+ x->data[datalen++] = c;
+ else {
+ /* entity too long for buffer, handle as normal data */
+ x->data[datalen] = '\0';
+ if (x->xmldata)
+ x->xmldata(x, x->data, datalen);
+ x->data[0] = c;
+ datalen = 1;
+ break;
+ }
+ if (c == ';') {
+ x->data[datalen] = '\0';
+ if (x->xmldataentity)
+ x->xmldataentity(x, x->data, datalen);
+ datalen = 0;
+ break;
+ }
+ }
+ } else if (c != '<') {
+ if (datalen < sizeof(x->data) - 1) {
+ x->data[datalen++] = c;
+ } else {
+ x->data[datalen] = '\0';
+ if (x->xmldata)
+ x->xmldata(x, x->data, datalen);
+ x->data[0] = c;
+ datalen = 1;
+ }
+ }
+ if (c == '<') {
+ x->data[datalen] = '\0';
+ if (x->xmldata && datalen)
+ x->xmldata(x, x->data, datalen);
+ if (x->xmldataend)
+ x->xmldataend(x);
+ break;
+ }
+ }
+ }
+ }
+}
+
+
/* ignore control chars (such as TABs) */
static inline void
printfield(const char *s)
@@ -228,14 +719,6 @@ xmlattrentity(XMLParser *x, const char *t, size_t tl,
xmlattr(x, t, tl, a, al, buf, len);
}
-static inline int
-getnext(void)
-{
- if (off >= len)
- return EOF;
- return reg[off++];
-}
-
int
main(int argc, char *argv[])
{
@@ -261,7 +744,6 @@ main(int argc, char *argv[])
if ((reg = mmap(0, len, PROT_READ, MAP_SHARED|MAP_FILE, fd, off)) == MAP_FAILED)
err(1, "mmap");
- x.getnext = getnext;
xml_parse(&x);
/* progress meter */
(DIR) diff --git a/xml.c b/xml.c
@@ -1,468 +0,0 @@
-#include <sys/types.h>
-
-#include <ctype.h>
-#include <errno.h>
-#include <limits.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "xml.h"
-
-static void
-xml_parseattrs(XMLParser *x)
-{
- size_t namelen = 0, valuelen;
- int c, endsep, endname = 0, valuestart = 0;
-
- while ((c = x->getnext()) != EOF) {
- if (isspace(c)) {
- if (namelen)
- endname = 1;
- continue;
- } else if (c == '?')
- ; /* ignore */
- else if (c == '=') {
- x->name[namelen] = '\0';
- valuestart = 1;
- endname = 1;
- } else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) {
- /* attribute without value */
- x->name[namelen] = '\0';
- if (x->xmlattrstart)
- x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
- if (x->xmlattr)
- x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
- if (x->xmlattrend)
- x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
- endname = 0;
- x->name[0] = c;
- namelen = 1;
- } else if (namelen && valuestart) {
- /* attribute with value */
- if (x->xmlattrstart)
- x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
-
- valuelen = 0;
- if (c == '\'' || c == '"') {
- endsep = c;
- } else {
- endsep = ' '; /* isspace() */
- goto startvalue;
- }
-
- while ((c = x->getnext()) != EOF) {
-startvalue:
- if (c == '&') { /* entities */
- x->data[valuelen] = '\0';
- /* call data function with data before entity if there is data */
- if (valuelen && x->xmlattr)
- x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
- x->data[0] = c;
- valuelen = 1;
- while ((c = x->getnext()) != EOF) {
- if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c))))
- break;
- if (valuelen < sizeof(x->data) - 1)
- x->data[valuelen++] = c;
- else {
- /* entity too long for buffer, handle as normal data */
- x->data[valuelen] = '\0';
- if (x->xmlattr)
- x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
- x->data[0] = c;
- valuelen = 1;
- break;
- }
- if (c == ';') {
- x->data[valuelen] = '\0';
- if (x->xmlattrentity)
- x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
- valuelen = 0;
- break;
- }
- }
- } else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) {
- if (valuelen < sizeof(x->data) - 1) {
- x->data[valuelen++] = c;
- } else {
- x->data[valuelen] = '\0';
- if (x->xmlattr)
- x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
- x->data[0] = c;
- valuelen = 1;
- }
- }
- if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) {
- x->data[valuelen] = '\0';
- if (x->xmlattr)
- x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
- if (x->xmlattrend)
- x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
- break;
- }
- }
- namelen = endname = valuestart = 0;
- } else if (namelen < sizeof(x->name) - 1) {
- x->name[namelen++] = c;
- }
- if (c == '>') {
- break;
- } else if (c == '/') {
- x->isshorttag = 1;
- x->name[0] = '\0';
- namelen = 0;
- }
- }
-}
-
-static void
-xml_parsecomment(XMLParser *x)
-{
- size_t datalen = 0, i = 0;
- int c;
-
- if (x->xmlcommentstart)
- x->xmlcommentstart(x);
- while ((c = x->getnext()) != EOF) {
- if (c == '-' || c == '>') {
- if (x->xmlcomment) {
- x->data[datalen] = '\0';
- x->xmlcomment(x, x->data, datalen);
- datalen = 0;
- }
- }
-
- if (c == '-') {
- if (++i > 2) {
- if (x->xmlcomment)
- for (; i > 2; i--)
- x->xmlcomment(x, "-", 1);
- i = 2;
- }
- continue;
- } else if (c == '>' && i == 2) {
- if (x->xmlcommentend)
- x->xmlcommentend(x);
- return;
- } else if (i) {
- if (x->xmlcomment) {
- for (; i > 0; i--)
- x->xmlcomment(x, "-", 1);
- }
- i = 0;
- }
-
- if (datalen < sizeof(x->data) - 1) {
- x->data[datalen++] = c;
- } else {
- x->data[datalen] = '\0';
- if (x->xmlcomment)
- x->xmlcomment(x, x->data, datalen);
- x->data[0] = c;
- datalen = 1;
- }
- }
-}
-
-static void
-xml_parsecdata(XMLParser *x)
-{
- size_t datalen = 0, i = 0;
- int c;
-
- if (x->xmlcdatastart)
- x->xmlcdatastart(x);
- while ((c = x->getnext()) != EOF) {
- if (c == ']' || c == '>') {
- if (x->xmlcdata) {
- x->data[datalen] = '\0';
- x->xmlcdata(x, x->data, datalen);
- datalen = 0;
- }
- }
-
- if (c == ']') {
- if (++i > 2) {
- if (x->xmlcdata)
- for (; i > 2; i--)
- x->xmlcdata(x, "]", 1);
- i = 2;
- }
- continue;
- } else if (c == '>' && i == 2) {
- if (x->xmlcdataend)
- x->xmlcdataend(x);
- return;
- } else if (i) {
- if (x->xmlcdata)
- for (; i > 0; i--)
- x->xmlcdata(x, "]", 1);
- i = 0;
- }
-
- if (datalen < sizeof(x->data) - 1) {
- x->data[datalen++] = c;
- } else {
- x->data[datalen] = '\0';
- if (x->xmlcdata)
- x->xmlcdata(x, x->data, datalen);
- x->data[0] = c;
- datalen = 1;
- }
- }
-}
-
-static int
-codepointtoutf8(long r, char *s)
-{
- if (r == 0) {
- return 0; /* NUL byte */
- } else if (r <= 0x7F) {
- /* 1 byte: 0aaaaaaa */
- s[0] = r;
- return 1;
- } else if (r <= 0x07FF) {
- /* 2 bytes: 00000aaa aabbbbbb */
- s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */
- s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */
- return 2;
- } else if (r <= 0xFFFF) {
- /* 3 bytes: aaaabbbb bbcccccc */
- s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
- s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */
- s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */
- return 3;
- } else {
- /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
- s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
- s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
- s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */
- s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */
- return 4;
- }
-}
-
-static int
-namedentitytostr(const char *e, char *buf, size_t bufsiz)
-{
- static const struct {
- char *entity;
- int c;
- } entities[] = {
- { "&amp;", '&' },
- { "&lt;", '<' },
- { "&gt;", '>' },
- { "&apos;", '\'' },
- { "&quot;", '"' },
- { "&AMP;", '&' },
- { "&LT;", '<' },
- { "&GT;", '>' },
- { "&APOS;", '\'' },
- { "&QUOT;", '"' }
- };
- size_t i;
-
- /* buffer is too small */
- if (bufsiz < 2)
- return -1;
-
- /* doesn't start with &: can't match */
- if (*e != '&')
- return 0;
-
- for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
- if (!strcmp(e, entities[i].entity)) {
- buf[0] = entities[i].c;
- buf[1] = '\0';
- return 1;
- }
- }
- return 0;
-}
-
-static int
-numericentitytostr(const char *e, char *buf, size_t bufsiz)
-{
- long l;
- int len;
- char *end;
-
- /* buffer is too small */
- if (bufsiz < 5)
- return -1;
-
- /* not a numeric entity */
- if (e[0] != '&' || e[1] != '#')
- return 0;
-
- /* e[1] == '#', numeric / hexadecimal entity */
- e += 2; /* skip "&#" */
- errno = 0;
- /* hex (16) or decimal (10) */
- if (*e == 'x')
- l = strtoul(e + 1, &end, 16);
- else
- l = strtoul(e, &end, 10);
- /* invalid value or not a well-formed entity or too high codepoint */
- if (errno || *end != ';' || l > 0x10FFFF)
- return 0;
- len = codepointtoutf8(l, buf);
- buf[len] = '\0';
-
- return len;
-}
-
-/* convert named- or numeric entity string to buffer string
- * returns byte-length of string. */
-int
-xml_entitytostr(const char *e, char *buf, size_t bufsiz)
-{
- /* buffer is too small */
- if (bufsiz < 5)
- return -1;
- /* doesn't start with & */
- if (e[0] != '&')
- return 0;
- /* named entity */
- if (e[1] != '#')
- return namedentitytostr(e, buf, bufsiz);
- else /* numeric entity */
- return numericentitytostr(e, buf, bufsiz);
-}
-
-void
-xml_parse(XMLParser *x)
-{
- int c, ispi;
- size_t datalen, tagdatalen, taglen;
-
- if (!x->getnext)
- return;
- while ((c = x->getnext()) != EOF && c != '<')
- ; /* skip until < */
-
- while (c != EOF) {
- if (c == '<') { /* parse tag */
- if ((c = x->getnext()) == EOF)
- return;
-
- if (c == '!') { /* cdata and comments */
- for (tagdatalen = 0; (c = x->getnext()) != EOF;) {
- /* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */
- if (tagdatalen <= sizeof("[CDATA[") - 1)
- x->data[tagdatalen++] = c;
- if (c == '>')
- break;
- else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
- (x->data[0] == '-')) {
- xml_parsecomment(x);
- break;
- } else if (c == '[') {
- if (tagdatalen == sizeof("[CDATA[") - 1 &&
- !strncmp(x->data, "[CDATA[", tagdatalen)) {
- xml_parsecdata(x);
- break;
- }
- }
- }
- } else {
- x->tag[0] = '\0';
- x->taglen = 0;
-
- /* normal tag (open, short open, close), processing instruction. */
- if (isspace(c))
- while ((c = x->getnext()) != EOF && isspace(c))
- ;
- if (c == EOF)
- return;
- x->tag[0] = c;
- ispi = (c == '?') ? 1 : 0;
- x->isshorttag = ispi;
- taglen = 1;
- while ((c = x->getnext()) != EOF) {
- if (c == '/')
- x->isshorttag = 1; /* short tag */
- else if (c == '>' || isspace(c)) {
- x->tag[taglen] = '\0';
- if (x->tag[0] == '/') { /* end tag, starts with </ */
- x->taglen = --taglen; /* len -1 because of / */
- if (taglen && x->xmltagend)
- x->xmltagend(x, &(x->tag)[1], x->taglen, 0);
- } else {
- x->taglen = taglen;
- /* start tag */
- if (x->xmltagstart)
- x->xmltagstart(x, x->tag, x->taglen);
- if (isspace(c))
- xml_parseattrs(x);
- if (x->xmltagstartparsed)
- x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
- }
- /* call tagend for shortform or processing instruction */
- if ((x->isshorttag || ispi) && x->xmltagend)
- x->xmltagend(x, x->tag, x->taglen, 1);
- break;
- } else if (taglen < sizeof(x->tag) - 1)
- x->tag[taglen++] = c; /* NOTE: tag name truncation */
- }
- }
- } else {
- /* parse tag data */
- datalen = 0;
- if (x->xmldatastart)
- x->xmldatastart(x);
- while ((c = x->getnext()) != EOF) {
- if (c == '&') {
- if (datalen) {
- x->data[datalen] = '\0';
- if (x->xmldata)
- x->xmldata(x, x->data, datalen);
- }
- x->data[0] = c;
- datalen = 1;
- while ((c = x->getnext()) != EOF) {
- if (c == '<')
- break;
- if (datalen < sizeof(x->data) - 1)
- x->data[datalen++] = c;
- else {
- /* entity too long for buffer, handle as normal data */
- x->data[datalen] = '\0';
- if (x->xmldata)
- x->xmldata(x, x->data, datalen);
- x->data[0] = c;
- datalen = 1;
- break;
- }
- if (c == ';') {
- x->data[datalen] = '\0';
- if (x->xmldataentity)
- x->xmldataentity(x, x->data, datalen);
- datalen = 0;
- break;
- }
- }
- } else if (c != '<') {
- if (datalen < sizeof(x->data) - 1) {
- x->data[datalen++] = c;
- } else {
- x->data[datalen] = '\0';
- if (x->xmldata)
- x->xmldata(x, x->data, datalen);
- x->data[0] = c;
- datalen = 1;
- }
- }
- if (c == '<') {
- x->data[datalen] = '\0';
- if (x->xmldata && datalen)
- x->xmldata(x, x->data, datalen);
- if (x->xmldataend)
- x->xmldataend(x);
- break;
- }
- }
- }
- }
-}
(DIR) diff --git a/xml.h b/xml.h
@@ -1,40 +0,0 @@
-typedef struct xmlparser {
- /* handlers */
- void (*xmlattr)(struct xmlparser *, const char *, size_t,
- const char *, size_t, const char *, size_t);
- void (*xmlattrend)(struct xmlparser *, const char *, size_t,
- const char *, size_t);
- void (*xmlattrstart)(struct xmlparser *, const char *, size_t,
- const char *, size_t);
- void (*xmlattrentity)(struct xmlparser *, const char *, size_t,
- const char *, size_t, const char *, size_t);
- void (*xmlcdatastart)(struct xmlparser *);
- void (*xmlcdata)(struct xmlparser *, const char *, size_t);
- void (*xmlcdataend)(struct xmlparser *);
- void (*xmlcommentstart)(struct xmlparser *);
- void (*xmlcomment)(struct xmlparser *, const char *, size_t);
- void (*xmlcommentend)(struct xmlparser *);
- void (*xmldata)(struct xmlparser *, const char *, size_t);
- void (*xmldataend)(struct xmlparser *);
- void (*xmldataentity)(struct xmlparser *, const char *, size_t);
- void (*xmldatastart)(struct xmlparser *);
- void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
- void (*xmltagstart)(struct xmlparser *, const char *, size_t);
- void (*xmltagstartparsed)(struct xmlparser *, const char *,
- size_t, int);
-
- int (*getnext)(void);
-
- /* current tag */
- char tag[1024];
- size_t taglen;
- /* current tag is in short form ? <tag /> */
- int isshorttag;
- /* current attribute name */
- char name[1024];
- /* data buffer used for tag data, cdata and attribute data */
- char data[BUFSIZ];
-} XMLParser;
-
-int xml_entitytostr(const char *, char *, size_t);
-void xml_parse(XMLParser *);