initial repo - bag - BAG Kadaster Extract parser (subset)
(HTM) git clone git://git.codemadness.org/bag
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit bc7bd116af0cada05627c574f5b0f6c69a82da36
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Sat, 18 Nov 2023 23:23:31 +0100
initial repo
Diffstat:
A LICENSE | 15 +++++++++++++++
A Makefile | 6 ++++++
A README | 21 +++++++++++++++++++++
A glue.awk | 39 +++++++++++++++++++++++++++++++
A glue.c | 146 +++++++++++++++++++++++++++++++
A parse.c | 718 +++++++++++++++++++++++++++++++
A process.sh | 71 +++++++++++++++++++++++++++++++
A xml.c | 480 +++++++++++++++++++++++++++++++
A xml.h | 44 +++++++++++++++++++++++++++++++
9 files changed, 1540 insertions(+), 0 deletions(-)
---
(DIR) diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,15 @@
+ISC License
+
+Copyright (c) 2023 Hiltjo Posthuma <hiltjo@codemadness.org>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
(DIR) diff --git a/Makefile b/Makefile
@@ -0,0 +1,6 @@
+# Build both tools from their single source files: parse and glue.
+build:
+ ${CC} -o parse parse.c -O3 -Wall
+ ${CC} -o glue glue.c -O3 -Wall
+
+# Remove the compiled binaries.
+clean:
+ rm -f glue parse
(DIR) diff --git a/README b/README
@@ -0,0 +1,21 @@
+BAG Kadaster extract parser
+
+
+# Usage
+
+Download extract:
+https://www.kadaster.nl/zakelijk/producten/adressen-en-gebouwen/bag-2.0-extract
+Free version: https://www.kadaster.nl/-/kosteloze-download-bag-2-0-extract
+
+* unzip 9999VBO*.zip and 9999NUM*.zip files into the same directory.
+* Edit parse.c if needed.
+* Compile by running:
+ make
+* Edit settings such as the files directory in process.sh if needed.
+* Run:
+ ./process.sh
+
+
+# Tested
+
+Tested on Linux, OpenBSD and Windows (mingw gcc and tcc).
(DIR) diff --git a/glue.awk b/glue.awk
@@ -0,0 +1,39 @@
+# Merge consecutive TAB-separated records that share the same key (field 1)
+# into a single record. The input must be sorted on field 1. For every other
+# field the last non-empty value seen for the key wins.
+#
+# fields:
+# 1. bagnr
+# 2. postcode
+# 3. huisnummer
+# 4. huisletter
+# 5. huisnummertoevoeging
+# 6. status
+# 7. oppervlakte
+# 8. gebruiksdoel
+BEGIN {
+	FS = OFS = "\t";
+}
+
+# Print the record collected so far. Nothing is printed while no key has been
+# collected, so there is no empty record at the start or for empty input
+# (same behaviour as printfields() in glue.c).
+function printrecord() {
+	if (v1 != "")
+		print v1, v2, v3, v4, v5, v6, v7, v8;
+}
+
+{
+	# Concatenating "" forces a string comparison: input fields that look
+	# like numbers are otherwise compared numerically, which loses precision
+	# for 16-digit keys and ignores leading zeros.
+	if (($1 "") != prev) {
+		printrecord();
+		v1 = v2 = v3 = v4 = v5 = v6 = v7 = v8 = "";
+		prev = $1 "";
+	}
+
+	if ($1 != "")
+		v1 = $1;
+	if ($2 != "")
+		v2 = $2;
+	if ($3 != "")
+		v3 = $3;
+	if ($4 != "")
+		v4 = $4;
+	if ($5 != "")
+		v5 = $5;
+	if ($6 != "")
+		v6 = $6;
+	if ($7 != "")
+		v7 = $7;
+	if ($8 != "")
+		v8 = $8;
+}
+END {
+	printrecord();
+}
(DIR) diff --git a/glue.c b/glue.c
@@ -0,0 +1,146 @@
+#if WIN32
+#include <io.h>
+#endif
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define PUTCHAR putchar
+#define FPUTS fputs
+
+/* number of TAB-separated fields in a record */
+#define FieldLast 8
+
+/* growable NUL-terminated string buffer */
+struct string {
+	char *data; /* allocated buffer */
+	size_t len; /* string length, excluding the NUL byte */
+	size_t cap; /* allocated size of data in bytes */
+};
+
+/* values merged so far for the current key, one buffer per field */
+static struct string mergedfields[FieldLast];
+/* fields of the line being processed, pointing into the line buffer */
+static char *fields[FieldLast];
+
+/* Split the line buffer in place: each TAB separator is overwritten with a
+ * NUL ('\0') byte and fields[] is pointed at the start of each field. At most
+ * FieldLast - 1 separators are consumed, so the last field keeps any
+ * remaining TABs. Fields missing from the line point to an empty string
+ * constant. */
+void
+parseline(char *line, char *fields[FieldLast])
+{
+	char *cur = line, *tab;
+	size_t n = 0;
+
+	while (n < FieldLast - 1 && (tab = strchr(cur, '\t')) != NULL) {
+		*tab = '\0';
+		fields[n++] = cur;
+		cur = tab + 1;
+	}
+	/* remainder of the line is the last parsed field */
+	fields[n++] = cur;
+
+	/* make non-parsed fields empty. */
+	while (n < FieldLast)
+		fields[n++] = "";
+}
+
+/* Write the merged record to stdout as one TAB-separated line.
+ * Nothing is written while no key (field 0) has been collected. */
+void
+printfields(void)
+{
+	size_t i;
+
+	if (mergedfields[0].len == 0)
+		return;
+
+	for (i = 0; i < FieldLast; i++) {
+		if (i > 0)
+			fputs("\t", stdout);
+		fputs(mergedfields[i].data, stdout);
+	}
+	fputs("\n", stdout);
+}
+
+/* Make the string empty; the allocated buffer and its capacity are kept. */
+void
+string_reset(struct string *d)
+{
+	d->len = 0;
+	*d->data = '\0';
+}
+
+/* Replace the contents of d with a copy of the NUL-terminated string data,
+ * growing the buffer when needed. Exits the program when out of memory. */
+void
+string_set(struct string *d, const char *data)
+{
+	size_t n = strlen(data);
+	size_t need = n + 1; /* including the NUL byte */
+
+	if (need >= d->cap) {
+		d->cap += need;
+		d->data = realloc(d->data, d->cap);
+		if (d->data == NULL) {
+			perror(NULL);
+			exit(1);
+		}
+	}
+	memcpy(d->data, data, need);
+	d->len = n;
+}
+
+/* Read TAB-separated records from stdin (sorted on the first field), merge
+ * consecutive records with the same first field and write one merged record
+ * per key to stdout. For each other field the last non-empty value wins.
+ * Returns 0 on success, exits with status 1 on an I/O or memory error or
+ * when an input line does not fit the line buffer. */
+int
+main(void)
+{
+	char line[4096], *p;
+	size_t i;
+
+#if WIN32
+	/* Windows: use binary mode for stdin, stdout and stderr. */
+	_setmode(0, 0x8000); /* 0x8000 is O_BINARY */
+	_setmode(1, 0x8000);
+	_setmode(2, 0x8000);
+#endif
+
+	for (i = 0; i < FieldLast; ++i) {
+		mergedfields[i].cap = 4096;
+		if (!(mergedfields[i].data = calloc(1, 4096))) {
+			perror(NULL);
+			exit(1);
+		}
+		mergedfields[i].len = 0;
+	}
+
+	while (fgets(line, sizeof(line), stdin)) {
+		if ((p = strchr(line, '\n'))) {
+			*p = '\0';
+		} else if (getc(stdin) != EOF) {
+			/* no newline but more input follows: the line was cut
+			 * off by the buffer size. Processing the remainder as
+			 * a new record would produce bogus keys. */
+			fputs("line too long\n", stderr);
+			exit(1);
+		}
+
+		parseline(line, fields);
+
+		/* primary key changed: flush the merged record, start anew */
+		if (strcmp(fields[0], mergedfields[0].data)) {
+			printfields();
+			for (i = 0; i < FieldLast; ++i)
+				string_reset(&mergedfields[i]);
+			string_set(&mergedfields[0], fields[0]);
+		}
+
+		for (i = 1; i < FieldLast; ++i) {
+			/* field is set: override with next */
+			if (!fields[i][0])
+				continue;
+			string_set(&mergedfields[i], fields[i]);
+		}
+	}
+	printfields();
+
+	/* an earlier failed write leaves the error indicator set even when
+	 * the final flush itself succeeds, so check both. */
+	if (ferror(stdin) || fflush(stdout) == EOF || ferror(stdout)) {
+		perror(NULL);
+		exit(1);
+	}
+
+	return 0;
+}
(DIR) diff --git a/parse.c b/parse.c
@@ -0,0 +1,718 @@
+#define USE_MMAP
+
+#if WIN32
+#include <io.h>
+#endif
+
+#ifdef USE_MMAP
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <err.h>
+#include <fcntl.h>
+#endif
+
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/* ctype-like macros, but always compatible with ASCII / UTF-8.
+ * The argument is fully parenthesized so the cast applies to the whole
+ * argument expression, not only to its first operand. */
+#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
+#define ISCNTRL(c) ((c) < ' ' || (c) == 0x7f)
+#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
+
+#define PUTCHAR putchar_unlocked
+/*#define PUTCHAR putchar*/
+
+/* Fields collected for one bagObject; each is a NUL-terminated string that
+ * is filled by xmldata() from the element named in the comment and printed
+ * by printaddress(). */
+struct address {
+ char bagnr[64]; /* Objecten:identificatie / Objecten-ref:NummeraanduidingRef */
+ char oppervlakte[256]; /* Objecten:oppervlakte */
+ char status[256]; /* Objecten:status */
+ char gebruiksdoel[256]; /* Objecten:gebruiksdoel, multiple values joined with ", " */
+ char huisnummer[32]; /* Objecten:huisnummer */
+ char huisletter[32]; /* Objecten:huisletter */
+ char huisnummertoevoeging[32]; /* Objecten:huisnummertoevoeging */
+ char postcode[8]; /* Objecten:postcode */
+};
+
+/* XML parser state. This variant has no handler pointers: the parser calls
+ * the static xmlattr(), xmldata(), xmltagstart() and xmltagend() functions
+ * of this file directly. */
+typedef struct xmlparser {
+ /* current tag */
+ char tag[1024];
+ size_t taglen;
+ /* current tag is a short tag ? <tag /> */
+ int isshorttag;
+ /* current attribute name */
+ char name[1024];
+ /* data buffer used for tag data, CDATA and attribute data */
+ char data[BUFSIZ];
+} XMLParser;
+
+int xml_entitytostr(const char *, char *, size_t);
+void xml_parse(XMLParser *);
+
+static void xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
+ const char *v, size_t vl);
+static void xmldata(XMLParser *x, const char *d, size_t dl);
+static void xmltagend(XMLParser *x, const char *t, size_t tl, int isshort);
+static void xmltagstart(XMLParser *x, const char *t, size_t tl);
+
+/* the single parser instance passed to xml_parse() */
+static XMLParser x;
+/* fields collected for the bagObject that is currently being parsed */
+static struct address address;
+/* set while inside the corresponding element, see xmltagstart() and xmltagend() */
+static int inbagobject, innummeraanduiding, inhoofdadres;
+/* set by xmlattr() when the "domein" attribute is "NL.IMBAG.Nummeraanduiding" */
+static int isbagnrtype;
+/* set when a Historie:eindGeldigheid tag was seen; printaddress() then skips the record */
+static int eindgeldig;
+
+/* different readers, performance differs per platform */
+#ifdef USE_MMAP
+
+/* descriptor of the input file, opened in main() */
+static int fd;
+/* fstat() result of the input file; st_size gives the mapping length */
+struct stat st;
+/* start of the memory-mapped input file */
+unsigned char *reg;
+/* length of the mapping and the current read offset into it */
+size_t len, off;
+
+/* next input byte as a non-negative value, or EOF once off reaches len */
+#define GETNEXT() (off >= len ? EOF : reg[off++])
+
+#else
+
+#if 1
+#define GETNEXT getchar_unlocked
+#else
+/* buffered reader state: rbuf[roffset] .. rbuf[rtotal - 1] are unread bytes */
+static size_t roffset, rtotal;
+/* unsigned so bytes >= 0x80 are returned as positive values and the byte
+ * 0xFF cannot be mistaken for EOF */
+static unsigned char rbuf[4096*4];
+
+/* Return the next byte of stdin as a value in the range 0-255, or EOF at the
+ * end of the input. Exits the program with status 1 on a read error. */
+int
+getnext(void)
+{
+	if (roffset >= rtotal) {
+		rtotal = fread(rbuf, 1, sizeof(rbuf), stdin);
+		roffset = 0;
+		if (ferror(stdin)) {
+			perror(NULL);
+			exit(1);
+		}
+		/* Only an empty read means end of input. The last read of a
+		 * file is usually short and sets the end-of-file indicator
+		 * while still returning data that must be processed. */
+		if (rtotal == 0)
+			return EOF;
+	}
+	return rbuf[roffset++];
+}
+
+#define GETNEXT getnext
+#endif
+#endif
+
+/* Parse the attributes of the current tag, reading with GETNEXT() until the
+ * closing '>' or EOF. xmlattr() is called for each attribute; a value that
+ * does not fit x->data is passed in several chunks. An entity (&...;) inside
+ * a value is collected in x->data and dropped when its ';' is read, it is
+ * not decoded. A '/' marks the tag as a short tag. */
+static void
+xml_parseattrs(XMLParser *x)
+{
+ size_t namelen = 0, valuelen;
+ int c, endsep, endname = 0, valuestart = 0;
+
+ while ((c = GETNEXT()) != EOF) {
+ if (ISSPACE(c)) {
+ /* whitespace after a name ends the name */
+ if (namelen)
+ endname = 1;
+ continue;
+ } else if (c == '?')
+ ; /* ignore */
+ else if (c == '=') {
+ x->name[namelen] = '\0';
+ valuestart = 1;
+ endname = 1;
+ } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
+ /* attribute without value */
+ x->name[namelen] = '\0';
+ xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
+ endname = 0;
+ /* c is the first character of the next name */
+ x->name[0] = c;
+ namelen = 1;
+ } else if (namelen && valuestart) {
+ /* attribute with value */
+
+ valuelen = 0;
+ if (c == '\'' || c == '"') {
+ endsep = c;
+ } else {
+ /* unquoted value: c is already its first character */
+ endsep = ' '; /* ISSPACE() */
+ goto startvalue;
+ }
+
+ while ((c = GETNEXT()) != EOF) {
+startvalue:
+ if (c == '&') { /* entities */
+ x->data[valuelen] = '\0';
+ /* call data function with data before entity if there is data */
+ if (valuelen)
+ xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+ x->data[0] = c;
+ valuelen = 1;
+ while ((c = GETNEXT()) != EOF) {
+ if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
+ break;
+ if (valuelen < sizeof(x->data) - 1)
+ x->data[valuelen++] = c;
+ else {
+ /* entity too long for buffer, handle as normal data */
+ x->data[valuelen] = '\0';
+ xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+ x->data[0] = c;
+ valuelen = 1;
+ break;
+ }
+ if (c == ';') {
+ /* complete entity: dropped, no handler is called */
+ x->data[valuelen] = '\0';
+ valuelen = 0;
+ break;
+ }
+ }
+ } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
+ if (valuelen < sizeof(x->data) - 1) {
+ x->data[valuelen++] = c;
+ } else {
+ /* buffer full: pass this chunk on and start a new one */
+ x->data[valuelen] = '\0';
+ xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+ x->data[0] = c;
+ valuelen = 1;
+ }
+ }
+ if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
+ /* end of the value: pass on what is left (may be empty) */
+ x->data[valuelen] = '\0';
+ xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+ break;
+ }
+ }
+ namelen = endname = valuestart = 0;
+ } else if (namelen < sizeof(x->name) - 1) {
+ x->name[namelen++] = c;
+ }
+ if (c == '>') {
+ break;
+ } else if (c == '/') {
+ x->isshorttag = 1;
+ x->name[0] = '\0';
+ namelen = 0;
+ }
+ }
+}
+
+/* Skip the rest of a comment: consume input up to and including the first
+ * '>' that directly follows two or more '-' characters, or up to EOF. */
+static void
+xml_parsecomment(XMLParser *x)
+{
+	size_t dashes = 0; /* consecutive '-' seen, capped at 2 */
+	int ch;
+
+	for (;;) {
+		ch = GETNEXT();
+		if (ch == EOF)
+			return;
+		if (ch == '-') {
+			if (dashes < 2)
+				dashes++;
+			continue;
+		}
+		if (ch == '>' && dashes == 2)
+			return;
+		dashes = 0;
+	}
+}
+
+/* Parse a CDATA section up to and including the terminating "]]>" (or up to
+ * EOF) and pass its content to xmldata() in chunks. ']' characters that turn
+ * out not to be part of the terminator are passed on as data one at a time. */
+static void
+xml_parsecdata(XMLParser *x)
+{
+	size_t n = 0;        /* bytes buffered in x->data */
+	size_t brackets = 0; /* pending consecutive ']', capped at 2 */
+	int ch;
+
+	for (ch = GETNEXT(); ch != EOF; ch = GETNEXT()) {
+		/* flush buffered data before a possible terminator character */
+		if ((ch == ']' || ch == '>') && n) {
+			x->data[n] = '\0';
+			xmldata(x, x->data, n);
+			n = 0;
+		}
+
+		if (ch == ']') {
+			/* a third ']' means the oldest one was plain data */
+			if (brackets == 2)
+				xmldata(x, "]", 1);
+			else
+				brackets++;
+			continue;
+		}
+		if (ch == '>' && brackets == 2)
+			return;
+
+		/* the pending ']' were not a terminator: emit them as data */
+		while (brackets > 0) {
+			xmldata(x, "]", 1);
+			brackets--;
+		}
+
+		if (n >= sizeof(x->data) - 1) {
+			/* buffer full: pass this chunk on and start a new one */
+			x->data[n] = '\0';
+			xmldata(x, x->data, n);
+			n = 0;
+		}
+		x->data[n++] = ch;
+	}
+}
+
+/* Encode code point r as UTF-8 into s (no NUL terminator is written).
+ * Returns the number of bytes written (1-4), or 0 for code point 0, in
+ * which case s is left untouched. s must have room for 4 bytes. */
+static int
+codepointtoutf8(long r, char *s)
+{
+	if (r == 0)
+		return 0; /* NUL byte */
+
+	if (r <= 0x7F) {
+		/* 1 byte: 0aaaaaaa */
+		s[0] = r;
+		return 1;
+	}
+	if (r <= 0x07FF) {
+		/* 2 bytes: 110aaaaa 10bbbbbb */
+		s[0] = 0xC0 | ((r >> 6) & 0x1F);
+		s[1] = 0x80 | (r & 0x3F);
+		return 2;
+	}
+	if (r <= 0xFFFF) {
+		/* 3 bytes: 1110aaaa 10bbbbbb 10cccccc */
+		s[0] = 0xE0 | ((r >> 12) & 0x0F);
+		s[1] = 0x80 | ((r >> 6) & 0x3F);
+		s[2] = 0x80 | (r & 0x3F);
+		return 3;
+	}
+	/* 4 bytes: 11110aaa 10bbbbbb 10cccccc 10dddddd */
+	s[0] = 0xF0 | ((r >> 18) & 0x07);
+	s[1] = 0x80 | ((r >> 12) & 0x3F);
+	s[2] = 0x80 | ((r >> 6) & 0x3F);
+	s[3] = 0x80 | (r & 0x3F);
+	return 4;
+}
+
+/* Convert a named entity, given without the leading '&' but including the
+ * trailing ';' (for example "amp;"), to its character. On success the
+ * character and a NUL byte are written to buf and 1 is returned. Returns -1
+ * for an unknown entity or when bufsiz is smaller than 2. */
+static int
+namedentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	static const char *const names[] = {
+		"amp;", "lt;", "gt;", "apos;", "quot;",
+	};
+	static const char chars[] = {
+		'&', '<', '>', '\'', '"',
+	};
+	size_t k;
+
+	/* buffer is too small */
+	if (bufsiz < 2)
+		return -1;
+
+	for (k = 0; k < sizeof(names) / sizeof(names[0]); k++) {
+		if (strcmp(e, names[k]) != 0)
+			continue;
+		buf[0] = chars[k];
+		buf[1] = '\0';
+		return 1;
+	}
+	return -1;
+}
+
+/* Convert a numeric entity, given without the leading "&#" but including the
+ * trailing ';' (decimal "233;" or hexadecimal "xe9;"), to UTF-8 in buf,
+ * NUL-terminated. Returns the byte length of the result or -1 when bufsiz is
+ * smaller than 5, the entity is malformed or the code point is invalid. */
+static int
+numericentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	char *end;
+	long cp;
+	int base = 10, n;
+
+	/* buffer is too small */
+	if (bufsiz < 5)
+		return -1;
+
+	/* a leading 'x' selects hexadecimal */
+	if (*e == 'x') {
+		base = 16;
+		e++;
+	}
+
+	errno = 0;
+	cp = strtol(e, &end, base);
+	/* conversion error, no digits or not terminated by ';' */
+	if (errno != 0 || end == e || *end != ';')
+		return -1;
+	/* outside of the Unicode range or a surrogate */
+	if (cp < 0 || cp > 0x10ffff)
+		return -1;
+	if (cp >= 0xd800 && cp <= 0xdfff)
+		return -1;
+
+	n = codepointtoutf8(cp, buf);
+	buf[n] = '\0';
+
+	return n;
+}
+
+/* Convert an entity string ("&name;" or "&#number;") to its string value in
+ * buf. Returns the byte length of the result or -1 on failure. */
+int
+xml_entitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	/* every entity starts with '&' */
+	if (e[0] != '&')
+		return -1;
+
+	/* "&#" introduces a numeric entity, anything else is a named one */
+	return e[1] == '#' ?
+	    numericentitytostr(e + 2, buf, bufsiz) :
+	    namedentitytostr(e + 1, buf, bufsiz);
+}
+
+/* Parse the whole input read with GETNEXT(). Input before the first '<' is
+ * skipped; after that the parser alternates between tags and tag data and
+ * calls xmltagstart(), xmltagend() and xmldata() (attributes are handled by
+ * xml_parseattrs()). Data that does not fit x->data is passed in several
+ * chunks. An entity (&...;) in tag data is collected and dropped when its
+ * ';' is read, it is not decoded. Returns at EOF. */
+void
+xml_parse(XMLParser *x)
+{
+ size_t datalen, tagdatalen;
+ int c, isend;
+
+ while ((c = GETNEXT()) != EOF && c != '<')
+ ; /* skip until < */
+
+ while (c != EOF) {
+ if (c == '<') { /* parse tag */
+ if ((c = GETNEXT()) == EOF)
+ return;
+
+ if (c == '!') { /* CDATA and comments */
+ for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
+ /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
+ if (tagdatalen <= sizeof("[CDATA[") - 1)
+ x->data[tagdatalen++] = c;
+ if (c == '>')
+ break;
+ else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
+ (x->data[0] == '-')) {
+ /* "<!--": comment */
+ xml_parsecomment(x);
+ break;
+ } else if (c == '[') {
+ if (tagdatalen == sizeof("[CDATA[") - 1 &&
+ !strncmp(x->data, "[CDATA[", tagdatalen)) {
+ xml_parsecdata(x);
+ break;
+ }
+ }
+ }
+ } else {
+ /* normal tag (open, short open, close), processing instruction. */
+ x->tag[0] = c;
+ x->taglen = 1;
+ x->isshorttag = isend = 0;
+
+ /* treat processing instruction as short tag, don't strip "?" prefix. */
+ if (c == '?') {
+ x->isshorttag = 1;
+ } else if (c == '/') {
+ /* end tag: the name starts after the '/' */
+ if ((c = GETNEXT()) == EOF)
+ return;
+ x->tag[0] = c;
+ isend = 1;
+ }
+
+ while ((c = GETNEXT()) != EOF) {
+ if (c == '/')
+ x->isshorttag = 1; /* short tag */
+ else if (c == '>' || ISSPACE(c)) {
+ /* end of the tag name */
+ x->tag[x->taglen] = '\0';
+ if (isend) { /* end tag, starts with </ */
+ xmltagend(x, x->tag, x->taglen, x->isshorttag);
+ x->tag[0] = '\0';
+ x->taglen = 0;
+ } else {
+ /* start tag */
+ xmltagstart(x, x->tag, x->taglen);
+ if (ISSPACE(c))
+ xml_parseattrs(x);
+ }
+ /* call tagend for short tag or processing instruction */
+ if (x->isshorttag) {
+ xmltagend(x, x->tag, x->taglen, x->isshorttag);
+ x->tag[0] = '\0';
+ x->taglen = 0;
+ }
+ break;
+ } else if (x->taglen < sizeof(x->tag) - 1)
+ x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
+ }
+ }
+ } else {
+ /* parse tag data */
+ datalen = 0;
+ while ((c = GETNEXT()) != EOF) {
+ if (c == '&') {
+ /* pass on the data before the entity */
+ if (datalen) {
+ x->data[datalen] = '\0';
+ xmldata(x, x->data, datalen);
+ }
+ x->data[0] = c;
+ datalen = 1;
+ while ((c = GETNEXT()) != EOF) {
+ if (c == '<')
+ break;
+ if (datalen < sizeof(x->data) - 1)
+ x->data[datalen++] = c;
+ else {
+ /* entity too long for buffer, handle as normal data */
+ x->data[datalen] = '\0';
+ xmldata(x, x->data, datalen);
+ x->data[0] = c;
+ datalen = 1;
+ break;
+ }
+ if (c == ';') {
+ /* complete entity: dropped, no handler is called */
+ x->data[datalen] = '\0';
+ datalen = 0;
+ break;
+ }
+ }
+ } else if (c != '<') {
+ if (datalen < sizeof(x->data) - 1) {
+ x->data[datalen++] = c;
+ } else {
+ /* buffer full: pass this chunk on and start a new one */
+ x->data[datalen] = '\0';
+ xmldata(x, x->data, datalen);
+ x->data[0] = c;
+ datalen = 1;
+ }
+ }
+ if (c == '<') {
+ /* start of the next tag: pass on the remaining data */
+ x->data[datalen] = '\0';
+ if (datalen)
+ xmldata(x, x->data, datalen);
+ break;
+ }
+ }
+ }
+ }
+}
+
+/* Make every field of the address an empty string. */
+static void
+clearaddress(struct address *a)
+{
+	a->bagnr[0] = a->oppervlakte[0] = a->status[0] = a->gebruiksdoel[0] = '\0';
+	a->huisnummer[0] = a->huisletter[0] = a->huisnummertoevoeging[0] = '\0';
+	a->postcode[0] = '\0';
+}
+
+/* Return a pointer to the first character of s that is not whitespace
+ * (this is the terminating NUL byte when s is all whitespace). */
+static char *
+ltrim(const char *s)
+{
+	while (ISSPACE((unsigned char)*s))
+		s++;
+	return (char *)s;
+}
+
+/* Changed version of strlcat: append src to the NUL-terminated string in
+ * dst (buffer size dsize), copying only non-control characters and always
+ * keeping dst NUL-terminated. Characters that do not fit are dropped.
+ * Returns the length of dst before the call plus the full length of src
+ * (control characters included); when no NUL byte is found within dsize
+ * bytes of dst nothing is copied and dsize + strlen(src) is returned.
+ *
+ * The earlier `dst = ltrim(dst)` call was removed: skipping leading
+ * whitespace before scanning for the end of dst never changed the result,
+ * but it moved dst without adjusting the bound, so the scan could read past
+ * dsize on an unterminated buffer. */
+static size_t
+concat(char *dst, const char *src, size_t dsize)
+{
+	const char *odst = dst;
+	const char *osrc = src;
+	size_t n = dsize;
+	size_t dlen;
+
+	/* Find the end of dst and adjust bytes left but don't go past end. */
+	while (n-- != 0 && *dst != '\0')
+		dst++;
+	dlen = dst - odst;
+	n = dsize - dlen;
+
+	/* no room, not even for the NUL byte */
+	if (n-- == 0)
+		return(dlen + strlen(src));
+	while (*src != '\0') {
+		if (n != 0 && !ISCNTRL((unsigned char)*src)) {
+			*dst++ = *src;
+			n--;
+		}
+		src++;
+	}
+	*dst = '\0';
+
+	return(dlen + (src - osrc)); /* count does not include NUL */
+}
+
+/* Write one field to stdout as-is. */
+static void
+printfield(const char *s)
+{
+	fputs(s, stdout);
+}
+
+/* Write the collected address to stdout as one TAB-separated line:
+ * bagnr, postcode, huisnummer, huisletter, huisnummertoevoeging (NUM),
+ * status, oppervlakte, gebruiksdoel (VBO).
+ * Nothing is written when there is no bagnr or the record is historical. */
+static void
+printaddress(void)
+{
+	const char *cols[] = {
+		address.bagnr,
+		/* NUM */
+		address.postcode,
+		address.huisnummer,
+		address.huisletter,
+		address.huisnummertoevoeging,
+		/* VBO */
+		address.status,
+		address.oppervlakte,
+		address.gebruiksdoel,
+	};
+	size_t i, ncols = sizeof(cols) / sizeof(cols[0]);
+
+	/* no key, or historical: ignore */
+	if (!address.bagnr[0] || eindgeldig)
+		return;
+
+	for (i = 0; i < ncols; i++) {
+		printfield(cols[i]);
+		PUTCHAR(i + 1 < ncols ? '\t' : '\n');
+	}
+}
+
+/* Attribute handler: set isbagnrtype when the attribute "domein" of an
+ * Objecten:identificatie or Objecten-ref:NummeraanduidingRef tag has the
+ * value "NL.IMBAG.Nummeraanduiding". */
+static void
+xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
+	const char *v, size_t vl)
+{
+	/* cheap reject on the first character before comparing strings */
+	if (a[0] != 'd' || t[0] != 'O')
+		return;
+	if (strcmp(t, "Objecten:identificatie") != 0 &&
+	    strcmp(t, "Objecten-ref:NummeraanduidingRef") != 0)
+		return;
+	if (strcmp(a, "domein") != 0)
+		return;
+	if (strcmp(v, "NL.IMBAG.Nummeraanduiding") == 0)
+		isbagnrtype = 1;
+}
+
+/* Data handler: append the data chunk d to the address field that belongs
+ * to the current tag (x->tag). Data of other tags is ignored. */
+static void
+xmldata(XMLParser *x, const char *d, size_t dl)
+{
+	/* tags whose data is always appended to a fixed field */
+	static const struct {
+		const char *tag;
+		char *dst;
+		size_t size;
+	} plain[] = {
+		{ "Objecten:postcode", address.postcode, sizeof(address.postcode) },
+		{ "Objecten:huisnummer", address.huisnummer, sizeof(address.huisnummer) },
+		{ "Objecten:huisletter", address.huisletter, sizeof(address.huisletter) },
+		{ "Objecten:huisnummertoevoeging", address.huisnummertoevoeging, sizeof(address.huisnummertoevoeging) },
+		{ "Objecten:oppervlakte", address.oppervlakte, sizeof(address.oppervlakte) },
+		{ "Objecten:status", address.status, sizeof(address.status) },
+	};
+	const char *tag = x->tag;
+	size_t i;
+
+	/* cheap reject: all tags of interest start with 'O' */
+	if (tag[0] != 'O')
+		return;
+
+	for (i = 0; i < sizeof(plain) / sizeof(plain[0]); i++) {
+		if (!strcmp(tag, plain[i].tag)) {
+			concat(plain[i].dst, d, plain[i].size);
+			return;
+		}
+	}
+
+	if (!strcmp(tag, "Objecten:identificatie")) {
+		/* only an identification of the Nummeraanduiding domain */
+		if (isbagnrtype)
+			concat(address.bagnr, d, sizeof(address.bagnr));
+		return;
+	}
+	if (!strcmp(tag, "Objecten-ref:NummeraanduidingRef")) {
+		/* only the reference inside Objecten:heeftAlsHoofdadres */
+		if (inhoofdadres && isbagnrtype)
+			concat(address.bagnr, d, sizeof(address.bagnr));
+		return;
+	}
+	if (!strcmp(tag, "Objecten:gebruiksdoel")) {
+		/* multiple values are joined with ", " */
+		if (address.gebruiksdoel[0])
+			concat(address.gebruiksdoel, ", ", sizeof(address.gebruiksdoel));
+		concat(address.gebruiksdoel, d, sizeof(address.gebruiksdoel));
+	}
+}
+
+/* End tag handler: print and reset the address at the end of a bagObject,
+ * otherwise clear the state flag that belongs to the closed element. */
+static void
+xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
+{
+	/* cheap reject on the first character before comparing strings */
+	if (t[0] != 's' && t[0] != 'O')
+		return;
+
+	if (inbagobject && !strcmp(t, "sl-bag-extract:bagObject")) {
+		/* print before the reset: printaddress() reads eindgeldig */
+		printaddress();
+
+		inbagobject = innummeraanduiding = inhoofdadres = eindgeldig = 0;
+		clearaddress(&address);
+		return;
+	}
+
+	/* while inside a Nummeraanduiding no other end tag is handled */
+	if (innummeraanduiding) {
+		if (!strcmp(t, "Objecten:Nummeraanduiding") ||
+		    !strcmp(t, "Objecten-ref:NummeraanduidingRef"))
+			innummeraanduiding = isbagnrtype = 0;
+		return;
+	}
+
+	if (isbagnrtype && !strcmp(t, "Objecten:identificatie")) {
+		isbagnrtype = 0;
+		return;
+	}
+
+	if (inhoofdadres && !strcmp(t, "Objecten:heeftAlsHoofdadres"))
+		inhoofdadres = 0;
+}
+
+/* Start tag handler: start a new address at the start of a bagObject and,
+ * inside a bagObject, set the state flag that belongs to the opened element. */
+static void
+xmltagstart(XMLParser *x, const char *t, size_t tl)
+{
+	/* cheap reject on the first character before comparing strings */
+	if (t[0] != 's' && t[0] != 'O' && t[0] != 'H')
+		return;
+
+	if (!inbagobject) {
+		/* outside of a bagObject only its start tag is of interest */
+		if (!strcmp(t, "sl-bag-extract:bagObject")) {
+			inbagobject = 1;
+			eindgeldig = 0;
+			clearaddress(&address);
+		}
+		return;
+	}
+
+	if (!innummeraanduiding && !strcmp(t, "Objecten:Nummeraanduiding"))
+		innummeraanduiding = 1;
+
+	if (!inhoofdadres && !strcmp(t, "Objecten:heeftAlsHoofdadres"))
+		inhoofdadres = 1;
+
+	/* a new identification tag starts: its domein attribute is parsed
+	 * after this handler and decides the type again */
+	if (isbagnrtype &&
+	    (!strcmp(x->tag, "Objecten:identificatie") ||
+	     !strcmp(x->tag, "Objecten-ref:NummeraanduidingRef")))
+		isbagnrtype = 0;
+
+	/* historical document */
+	if (!strcmp(x->tag, "Historie:eindGeldigheid"))
+		eindgeldig = 1;
+}
+
+/* Parse a BAG extract XML file and write one TAB-separated line per
+ * bagObject to stdout. With USE_MMAP the file named by the first argument
+ * is memory-mapped, otherwise the XML is read from stdin.
+ * Returns 0 on success and 1 on a usage or output error; errors opening or
+ * mapping the file exit through err(). */
+int
+main(int argc, char *argv[])
+{
+#ifdef USE_MMAP
+	if (argc < 2) {
+		fprintf(stderr, "usage: %s <file>\n", argv[0]);
+		return 1;
+	}
+
+	if ((fd = open(argv[1], O_RDONLY)) < 0)
+		err(1, "open");
+	if (fstat(fd, &st) < 0)
+		err(1, "fstat");
+
+	off = 0;
+	len = st.st_size;
+	/* mmap() fails for a length of zero: an empty file is not mapped,
+	 * GETNEXT() then returns EOF right away without touching reg. */
+	if (len > 0 &&
+	    (reg = mmap(0, len, PROT_READ, MAP_SHARED|MAP_FILE, fd, off)) == MAP_FAILED)
+		err(1, "mmap");
+
+	xml_parse(&x);
+
+	if (len > 0)
+		munmap(reg, len);
+	close(fd);
+#else
+#if WIN32
+	/* Windows: use binary mode for stdin, stdout and stderr. */
+	_setmode(0, 0x8000); /* 0x8000 is O_BINARY */
+	_setmode(1, 0x8000);
+	_setmode(2, 0x8000);
+#endif
+
+	xml_parse(&x);
+#endif
+
+	/* print a record that was not closed by an end tag */
+	printaddress();
+
+	/* report write errors (full disk, closed pipe), like glue does */
+	if (fflush(stdout) == EOF || ferror(stdout)) {
+		perror(NULL);
+		return 1;
+	}
+
+	return 0;
+}
(DIR) diff --git a/process.sh b/process.sh
@@ -0,0 +1,71 @@
+#!/bin/sh
+
+# parse program, run once per input file.
+bin="./parse"
+# directory that is searched for the *.xml input files.
+d="../data"
+# glue program, merges the sorted records.
+glue="./glue"
+
+# maximum number of parse jobs that xargs runs in parallel.
+maxjobs=64
+
+# log a message to stderr.
+log() {
+ echo "$1" >&2
+}
+
+# child process job: parse each file and process them to a file in parallel.
+# This branch runs when the script is started with CHILD_PROC=1, which is how
+# xargs invokes it at the bottom of this script.
+if test "$CHILD_PROC" = "1"; then
+ # arguments: count, name, infile, outfile
+ log "[$1] $2 started"
+
+ # mmap version
+ "$bin" "$3" > "$4"
+
+ # stdin version
+ #"$bin" < "$3" > "$4"
+ # exit status of the parse command above, comments do not change $?
+ status="$?"
+
+ log "[$1] $2 done"
+ exit "$status"
+fi
+
+# generate a list of jobs for processing.
+# Writes four NUL-terminated items per job to stdout: count, name, infile
+# and outfile (tmp/<name>).
+list() {
+	i=1
+	for f in "$d"/*.xml; do
+		# when nothing matches, the pattern itself is left in $f:
+		# do not create a job for a file that does not exist.
+		test -f "$f" || continue
+
+		b="${f##*/}"
+		out="tmp/$b"
+
+		printf '%s\0%s\0%s\0%s\0' "$i" "$b" "$f" "$out"
+		i=$((i+1))
+	done
+}
+
+# old awk version of glueing records, very slow on some platforms.
+#awk_glue() {
+# LC_ALL=C awk -f glue.awk
+#}
+
+# Sort the parsed records on their key, merge the records that share a key
+# with $glue and sort the merged records for the final output.
+# Reads results.csv, writes results_sorted.csv, results2.csv and final.csv.
+merge() {
+	# The records are TAB-separated and fields can be empty or contain
+	# spaces. Without -t sort splits on runs of blanks, so the -k field
+	# numbers would not match the TAB-separated fields used by glue.
+	tab="$(printf '\t')"
+
+	log "Sorting data before merging records..."
+	LC_ALL=C sort -t "$tab" -k1,1 -k8,8 results.csv > results_sorted.csv
+
+	log "Merging records..."
+	"$glue" < results_sorted.csv > results2.csv
+
+	log "Sorting resulting data by zipcode, address number, etc..."
+	# sort results by zipcode, address number, etc.
+	LC_ALL=C sort -t "$tab" -k2,2 -k3,3n -k4,4 results2.csv > final.csv
+}
+
+# start with an empty directory for the per-file results.
+rm -rf tmp
+mkdir -p tmp
+
+# parse in parallel.
+# Each job consists of the four NUL-terminated items written by list(); this
+# script is run again for each job with CHILD_PROC=1 set.
+list | CHILD_PROC="1" xargs -r -0 -P "${maxjobs}" -L 4 "$(readlink -f "$0")"
+
+# concat results to one file.
+cat tmp/* > results.csv
+
+# merge results together.
+merge
+
+# cleanup temp files.
+rm -rf tmp
(DIR) diff --git a/xml.c b/xml.c
@@ -0,0 +1,480 @@
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "xml.h"
+
+/* ctype-like macros, always compatible with ASCII / UTF-8. The argument is
+ * fully parenthesized so the cast applies to the whole argument expression,
+ * not only to its first operand. */
+#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
+#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
+
+/* buffered reader state: rbuf[roffset] .. rbuf[rtotal - 1] are unread bytes */
+static size_t roffset, rtotal;
+/* unsigned so bytes >= 0x80 are returned as positive values and the byte
+ * 0xFF cannot be mistaken for EOF */
+static unsigned char rbuf[4096*4];
+
+/* Return the next byte of stdin as a value in the range 0-255, or EOF at the
+ * end of the input. Exits the program with status 1 on a read error. */
+int
+getnext(void)
+{
+	if (roffset >= rtotal) {
+		rtotal = fread(rbuf, 1, sizeof(rbuf), stdin);
+		roffset = 0;
+		if (ferror(stdin)) {
+			perror(NULL);
+			exit(1);
+		}
+		/* Only an empty read means end of input. The last read of a
+		 * file is usually short and sets the end-of-file indicator
+		 * while still returning data that must be processed. */
+		if (rtotal == 0)
+			return EOF;
+	}
+	return rbuf[roffset++];
+}
+
+//#define GETNEXT getnext
+#define GETNEXT getchar_unlocked
+
+/* Parse the attributes of the current tag, reading with GETNEXT() until the
+ * closing '>' or EOF. For each attribute the handlers that are set are
+ * called: xmlattrstart, then xmlattr for each chunk of the value (a value
+ * that does not fit x->data is passed in several chunks), xmlattrentity for
+ * each complete entity (&...;) and finally xmlattrend. A '/' marks the tag
+ * as a short tag. */
+static void
+xml_parseattrs(XMLParser *x)
+{
+ size_t namelen = 0, valuelen;
+ int c, endsep, endname = 0, valuestart = 0;
+
+ while ((c = GETNEXT()) != EOF) {
+ if (ISSPACE(c)) {
+ /* whitespace after a name ends the name */
+ if (namelen)
+ endname = 1;
+ continue;
+ } else if (c == '?')
+ ; /* ignore */
+ else if (c == '=') {
+ x->name[namelen] = '\0';
+ valuestart = 1;
+ endname = 1;
+ } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
+ /* attribute without value */
+ x->name[namelen] = '\0';
+ if (x->xmlattrstart)
+ x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
+ if (x->xmlattr)
+ x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
+ if (x->xmlattrend)
+ x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
+ endname = 0;
+ /* c is the first character of the next name */
+ x->name[0] = c;
+ namelen = 1;
+ } else if (namelen && valuestart) {
+ /* attribute with value */
+ if (x->xmlattrstart)
+ x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
+
+ valuelen = 0;
+ if (c == '\'' || c == '"') {
+ endsep = c;
+ } else {
+ /* unquoted value: c is already its first character */
+ endsep = ' '; /* ISSPACE() */
+ goto startvalue;
+ }
+
+ while ((c = GETNEXT()) != EOF) {
+startvalue:
+ if (c == '&') { /* entities */
+ x->data[valuelen] = '\0';
+ /* call data function with data before entity if there is data */
+ if (valuelen && x->xmlattr)
+ x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+ x->data[0] = c;
+ valuelen = 1;
+ while ((c = GETNEXT()) != EOF) {
+ if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
+ break;
+ if (valuelen < sizeof(x->data) - 1)
+ x->data[valuelen++] = c;
+ else {
+ /* entity too long for buffer, handle as normal data */
+ x->data[valuelen] = '\0';
+ if (x->xmlattr)
+ x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+ x->data[0] = c;
+ valuelen = 1;
+ break;
+ }
+ if (c == ';') {
+ /* complete entity, passed on undecoded including '&' and ';' */
+ x->data[valuelen] = '\0';
+ if (x->xmlattrentity)
+ x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+ valuelen = 0;
+ break;
+ }
+ }
+ } else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
+ if (valuelen < sizeof(x->data) - 1) {
+ x->data[valuelen++] = c;
+ } else {
+ /* buffer full: pass this chunk on and start a new one */
+ x->data[valuelen] = '\0';
+ if (x->xmlattr)
+ x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+ x->data[0] = c;
+ valuelen = 1;
+ }
+ }
+ if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
+ /* end of the value: pass on what is left (may be empty) */
+ x->data[valuelen] = '\0';
+ if (x->xmlattr)
+ x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+ if (x->xmlattrend)
+ x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
+ break;
+ }
+ }
+ namelen = endname = valuestart = 0;
+ } else if (namelen < sizeof(x->name) - 1) {
+ x->name[namelen++] = c;
+ }
+ if (c == '>') {
+ break;
+ } else if (c == '/') {
+ x->isshorttag = 1;
+ x->name[0] = '\0';
+ namelen = 0;
+ }
+ }
+}
+
+/* Parse a comment up to and including the terminating "-->" (or up to EOF).
+ * Calls xmlcommentstart, xmlcomment for each chunk of comment text and
+ * xmlcommentend, for the handlers that are set. '-' characters that turn out
+ * not to be part of the terminator are passed on one at a time. */
+static void
+xml_parsecomment(XMLParser *x)
+{
+ size_t datalen = 0, i = 0; /* i: pending consecutive '-', capped at 2 */
+ int c;
+
+ if (x->xmlcommentstart)
+ x->xmlcommentstart(x);
+ while ((c = GETNEXT()) != EOF) {
+ /* flush buffered text before a possible terminator character */
+ if (c == '-' || c == '>') {
+ if (x->xmlcomment && datalen) {
+ x->data[datalen] = '\0';
+ x->xmlcomment(x, x->data, datalen);
+ datalen = 0;
+ }
+ }
+
+ if (c == '-') {
+ if (++i > 2) {
+ /* more than two in a row: the oldest one was plain text */
+ if (x->xmlcomment)
+ for (; i > 2; i--)
+ x->xmlcomment(x, "-", 1);
+ i = 2;
+ }
+ continue;
+ } else if (c == '>' && i == 2) {
+ if (x->xmlcommentend)
+ x->xmlcommentend(x);
+ return;
+ } else if (i) {
+ /* the pending '-' were not a terminator: pass them on as text */
+ if (x->xmlcomment) {
+ for (; i > 0; i--)
+ x->xmlcomment(x, "-", 1);
+ }
+ i = 0;
+ }
+
+ if (datalen < sizeof(x->data) - 1) {
+ x->data[datalen++] = c;
+ } else {
+ /* buffer full: pass this chunk on and start a new one */
+ x->data[datalen] = '\0';
+ if (x->xmlcomment)
+ x->xmlcomment(x, x->data, datalen);
+ x->data[0] = c;
+ datalen = 1;
+ }
+ }
+}
+
+/* Parse a CDATA section up to and including the terminating "]]>" (or up to
+ * EOF). Calls xmlcdatastart, xmlcdata for each chunk of content and
+ * xmlcdataend, for the handlers that are set. ']' characters that turn out
+ * not to be part of the terminator are passed on one at a time. */
+static void
+xml_parsecdata(XMLParser *x)
+{
+ size_t datalen = 0, i = 0; /* i: pending consecutive ']', capped at 2 */
+ int c;
+
+ if (x->xmlcdatastart)
+ x->xmlcdatastart(x);
+ while ((c = GETNEXT()) != EOF) {
+ /* flush buffered data before a possible terminator character */
+ if (c == ']' || c == '>') {
+ if (x->xmlcdata && datalen) {
+ x->data[datalen] = '\0';
+ x->xmlcdata(x, x->data, datalen);
+ datalen = 0;
+ }
+ }
+
+ if (c == ']') {
+ if (++i > 2) {
+ /* more than two in a row: the oldest one was plain data */
+ if (x->xmlcdata)
+ for (; i > 2; i--)
+ x->xmlcdata(x, "]", 1);
+ i = 2;
+ }
+ continue;
+ } else if (c == '>' && i == 2) {
+ if (x->xmlcdataend)
+ x->xmlcdataend(x);
+ return;
+ } else if (i) {
+ /* the pending ']' were not a terminator: pass them on as data */
+ if (x->xmlcdata)
+ for (; i > 0; i--)
+ x->xmlcdata(x, "]", 1);
+ i = 0;
+ }
+
+ if (datalen < sizeof(x->data) - 1) {
+ x->data[datalen++] = c;
+ } else {
+ /* buffer full: pass this chunk on and start a new one */
+ x->data[datalen] = '\0';
+ if (x->xmlcdata)
+ x->xmlcdata(x, x->data, datalen);
+ x->data[0] = c;
+ datalen = 1;
+ }
+ }
+}
+
+/* Encode code point r as UTF-8 into s (no NUL terminator is written).
+ * Returns the number of bytes written (1-4), or 0 for code point 0, in
+ * which case s is left untouched. s must have room for 4 bytes. */
+static int
+codepointtoutf8(long r, char *s)
+{
+	if (r == 0)
+		return 0; /* NUL byte */
+
+	if (r <= 0x7F) {
+		/* 1 byte: 0aaaaaaa */
+		s[0] = r;
+		return 1;
+	}
+	if (r <= 0x07FF) {
+		/* 2 bytes: 110aaaaa 10bbbbbb */
+		s[0] = 0xC0 | ((r >> 6) & 0x1F);
+		s[1] = 0x80 | (r & 0x3F);
+		return 2;
+	}
+	if (r <= 0xFFFF) {
+		/* 3 bytes: 1110aaaa 10bbbbbb 10cccccc */
+		s[0] = 0xE0 | ((r >> 12) & 0x0F);
+		s[1] = 0x80 | ((r >> 6) & 0x3F);
+		s[2] = 0x80 | (r & 0x3F);
+		return 3;
+	}
+	/* 4 bytes: 11110aaa 10bbbbbb 10cccccc 10dddddd */
+	s[0] = 0xF0 | ((r >> 18) & 0x07);
+	s[1] = 0x80 | ((r >> 12) & 0x3F);
+	s[2] = 0x80 | ((r >> 6) & 0x3F);
+	s[3] = 0x80 | (r & 0x3F);
+	return 4;
+}
+
+/* Convert a named entity, given without the leading '&' but including the
+ * trailing ';' (for example "amp;"), to its character. On success the
+ * character and a NUL byte are written to buf and 1 is returned. Returns -1
+ * for an unknown entity or when bufsiz is smaller than 2. */
+static int
+namedentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	static const char *const names[] = {
+		"amp;", "lt;", "gt;", "apos;", "quot;",
+	};
+	static const char chars[] = {
+		'&', '<', '>', '\'', '"',
+	};
+	size_t k;
+
+	/* buffer is too small */
+	if (bufsiz < 2)
+		return -1;
+
+	for (k = 0; k < sizeof(names) / sizeof(names[0]); k++) {
+		if (strcmp(e, names[k]) != 0)
+			continue;
+		buf[0] = chars[k];
+		buf[1] = '\0';
+		return 1;
+	}
+	return -1;
+}
+
+/* Convert a numeric entity, given without the leading "&#" but including the
+ * trailing ';' (decimal "233;" or hexadecimal "xe9;"), to UTF-8 in buf,
+ * NUL-terminated. Returns the byte length of the result or -1 when bufsiz is
+ * smaller than 5, the entity is malformed or the code point is invalid. */
+static int
+numericentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	char *end;
+	long cp;
+	int base = 10, n;
+
+	/* buffer is too small */
+	if (bufsiz < 5)
+		return -1;
+
+	/* a leading 'x' selects hexadecimal */
+	if (*e == 'x') {
+		base = 16;
+		e++;
+	}
+
+	errno = 0;
+	cp = strtol(e, &end, base);
+	/* conversion error, no digits or not terminated by ';' */
+	if (errno != 0 || end == e || *end != ';')
+		return -1;
+	/* outside of the Unicode range or a surrogate */
+	if (cp < 0 || cp > 0x10ffff)
+		return -1;
+	if (cp >= 0xd800 && cp <= 0xdfff)
+		return -1;
+
+	n = codepointtoutf8(cp, buf);
+	buf[n] = '\0';
+
+	return n;
+}
+
+/* Convert an entity string ("&name;" or "&#number;") to its string value in
+ * buf. Returns the byte length of the result or -1 on failure. */
+int
+xml_entitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	/* every entity starts with '&' */
+	if (e[0] != '&')
+		return -1;
+
+	/* "&#" introduces a numeric entity, anything else is a named one */
+	return e[1] == '#' ?
+	    numericentitytostr(e + 2, buf, bufsiz) :
+	    namedentitytostr(e + 1, buf, bufsiz);
+}
+
+/* Parse the whole input read with GETNEXT(). Input before the first '<' is
+ * skipped; after that the parser alternates between tags and tag data and
+ * calls the handlers of x that are set (a handler that is NULL is skipped).
+ * Data that does not fit x->data is passed in several chunks. Entities are
+ * passed undecoded to xmldataentity. Returns at EOF. */
+void
+xml_parse(XMLParser *x)
+{
+ size_t datalen, tagdatalen;
+ int c, isend;
+
+ while ((c = GETNEXT()) != EOF && c != '<')
+ ; /* skip until < */
+
+ while (c != EOF) {
+ if (c == '<') { /* parse tag */
+ if ((c = GETNEXT()) == EOF)
+ return;
+
+ if (c == '!') { /* CDATA and comments */
+ for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
+ /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
+ if (tagdatalen <= sizeof("[CDATA[") - 1)
+ x->data[tagdatalen++] = c;
+ if (c == '>')
+ break;
+ else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
+ (x->data[0] == '-')) {
+ /* "<!--": comment */
+ xml_parsecomment(x);
+ break;
+ } else if (c == '[') {
+ if (tagdatalen == sizeof("[CDATA[") - 1 &&
+ !strncmp(x->data, "[CDATA[", tagdatalen)) {
+ xml_parsecdata(x);
+ break;
+ }
+ }
+ }
+ } else {
+ /* normal tag (open, short open, close), processing instruction. */
+ x->tag[0] = c;
+ x->taglen = 1;
+ x->isshorttag = isend = 0;
+
+ /* treat processing instruction as short tag, don't strip "?" prefix. */
+ if (c == '?') {
+ x->isshorttag = 1;
+ } else if (c == '/') {
+ /* end tag: the name starts after the '/' */
+ if ((c = GETNEXT()) == EOF)
+ return;
+ x->tag[0] = c;
+ isend = 1;
+ }
+
+ while ((c = GETNEXT()) != EOF) {
+ if (c == '/')
+ x->isshorttag = 1; /* short tag */
+ else if (c == '>' || ISSPACE(c)) {
+ /* end of the tag name */
+ x->tag[x->taglen] = '\0';
+ if (isend) { /* end tag, starts with </ */
+ if (x->xmltagend)
+ x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
+ x->tag[0] = '\0';
+ x->taglen = 0;
+ } else {
+ /* start tag */
+ if (x->xmltagstart)
+ x->xmltagstart(x, x->tag, x->taglen);
+ if (ISSPACE(c))
+ xml_parseattrs(x);
+ /* called after the attributes (if any) were parsed */
+ if (x->xmltagstartparsed)
+ x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
+ }
+ /* call tagend for short tag or processing instruction */
+ if (x->isshorttag) {
+ if (x->xmltagend)
+ x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
+ x->tag[0] = '\0';
+ x->taglen = 0;
+ }
+ break;
+ } else if (x->taglen < sizeof(x->tag) - 1)
+ x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
+ }
+ }
+ } else {
+ /* parse tag data */
+ datalen = 0;
+ if (x->xmldatastart)
+ x->xmldatastart(x);
+ while ((c = GETNEXT()) != EOF) {
+ if (c == '&') {
+ /* pass on the data before the entity */
+ if (datalen) {
+ x->data[datalen] = '\0';
+ if (x->xmldata)
+ x->xmldata(x, x->data, datalen);
+ }
+ x->data[0] = c;
+ datalen = 1;
+ while ((c = GETNEXT()) != EOF) {
+ if (c == '<')
+ break;
+ if (datalen < sizeof(x->data) - 1)
+ x->data[datalen++] = c;
+ else {
+ /* entity too long for buffer, handle as normal data */
+ x->data[datalen] = '\0';
+ if (x->xmldata)
+ x->xmldata(x, x->data, datalen);
+ x->data[0] = c;
+ datalen = 1;
+ break;
+ }
+ if (c == ';') {
+ /* complete entity, passed on undecoded including '&' and ';' */
+ x->data[datalen] = '\0';
+ if (x->xmldataentity)
+ x->xmldataentity(x, x->data, datalen);
+ datalen = 0;
+ break;
+ }
+ }
+ } else if (c != '<') {
+ if (datalen < sizeof(x->data) - 1) {
+ x->data[datalen++] = c;
+ } else {
+ /* buffer full: pass this chunk on and start a new one */
+ x->data[datalen] = '\0';
+ if (x->xmldata)
+ x->xmldata(x, x->data, datalen);
+ x->data[0] = c;
+ datalen = 1;
+ }
+ }
+ if (c == '<') {
+ /* start of the next tag: pass on the remaining data */
+ x->data[datalen] = '\0';
+ if (x->xmldata && datalen)
+ x->xmldata(x, x->data, datalen);
+ if (x->xmldataend)
+ x->xmldataend(x);
+ break;
+ }
+ }
+ }
+ }
+}
(DIR) diff --git a/xml.h b/xml.h
@@ -0,0 +1,44 @@
+#ifndef XML_H
+#define XML_H
+
+#include <stdio.h>
+
+/* XML parser state and callbacks. Every handler is optional: the parser in
+ * xml.c checks each pointer and skips handlers that are NULL. The string
+ * arguments are passed together with their length. */
+typedef struct xmlparser {
+ /* handlers */
+ /* attribute value chunk: tag, attribute name, value */
+ void (*xmlattr)(struct xmlparser *, const char *, size_t,
+ const char *, size_t, const char *, size_t);
+ /* end of an attribute: tag, attribute name */
+ void (*xmlattrend)(struct xmlparser *, const char *, size_t,
+ const char *, size_t);
+ /* start of an attribute: tag, attribute name */
+ void (*xmlattrstart)(struct xmlparser *, const char *, size_t,
+ const char *, size_t);
+ /* entity inside an attribute value: tag, attribute name, entity text */
+ void (*xmlattrentity)(struct xmlparser *, const char *, size_t,
+ const char *, size_t, const char *, size_t);
+ /* CDATA section: start, content chunk, end */
+ void (*xmlcdatastart)(struct xmlparser *);
+ void (*xmlcdata)(struct xmlparser *, const char *, size_t);
+ void (*xmlcdataend)(struct xmlparser *);
+ /* comment: start, text chunk, end */
+ void (*xmlcommentstart)(struct xmlparser *);
+ void (*xmlcomment)(struct xmlparser *, const char *, size_t);
+ void (*xmlcommentend)(struct xmlparser *);
+ /* tag data: data chunk, end, entity text, start */
+ void (*xmldata)(struct xmlparser *, const char *, size_t);
+ void (*xmldataend)(struct xmlparser *);
+ void (*xmldataentity)(struct xmlparser *, const char *, size_t);
+ void (*xmldatastart)(struct xmlparser *);
+ /* end of a tag: tag name, is short tag */
+ void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
+ /* start of a tag, called before its attributes are parsed: tag name */
+ void (*xmltagstart)(struct xmlparser *, const char *, size_t);
+ /* start tag completely parsed, attributes included: tag name, is short tag */
+ void (*xmltagstartparsed)(struct xmlparser *, const char *,
+ size_t, int);
+
+ /* current tag */
+ char tag[1024];
+ size_t taglen;
+ /* current tag is a short tag ? <tag /> */
+ int isshorttag;
+ /* current attribute name */
+ char name[1024];
+ /* data buffer used for tag data, CDATA and attribute data */
+ char data[BUFSIZ];
+} XMLParser;
+
+int xml_entitytostr(const char *, char *, size_t);
+void xml_parse(XMLParser *);
+#endif