initial import (from randomcrap repo) - osm-zipcodes - Extract (dutch) addresses from OpenStreetMap OSM XML
(HTM) git clone git://git.codemadness.org/osm-zipcodes
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit c5dbbf96ed47a3b6f680a17a6881cd8f8a4169e5
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Fri, 18 May 2018 16:09:17 +0200
initial import (from randomcrap repo)
Diffstat:
A LICENSE | 15 +++++++++++++++
A Makefile | 6 ++++++
A README | 23 +++++++++++++++++++++++
A main.c | 269 +++++++++++++++++++++++++++++++
A sort.sh | 20 ++++++++++++++++++++
A xml.c | 446 ++++++++++++++++++++++++++++++
A xml.h | 44 +++++++++++++++++++++++++++++++
7 files changed, 823 insertions(+), 0 deletions(-)
---
(DIR) diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,15 @@
+ISC License
+
+Copyright (c) 2018 Hiltjo Posthuma <hiltjo@codemadness.org>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
(DIR) diff --git a/Makefile b/Makefile
@@ -0,0 +1,6 @@
+# Build the statically linked, stripped "main" binary.
+# Depends on "clean", so every build starts from scratch.
+build: clean
+ cc xml.c main.c -o main -O3 -Wall -static
+ strip main
+
+# Remove object files and the "main" binary.
+clean:
+ rm -f *.o main
(DIR) diff --git a/README b/README
@@ -0,0 +1,23 @@
+Purpose
+-------
+
+Extract (Dutch) zipcode and address data from OpenStreetMap OSM XML files.
+
+
+How to use
+----------
+
+1. Get the OSM XML file, for example from: http://download.geofabrik.de/europe/netherlands.html
+ Tested and intended to be used with The Netherlands.
+2. Build program:
+ make
+3. Run program and process data:
+ ./main netherlands-latest.osm > addr.csv
+4. Sort on city, street, housenumber (output fields 7, 5 and 6), see the file sort.sh
+ LC_ALL=C sort -k 7,7 -k 5,5 -k 6,6n -t "$(printf '\t')" -T "$(pwd)" -o addr_sort.csv addr.csv
+5. Unique entries, ignoring the first three fields (node ID, latitude, longitude).
+ LC_ALL=C uniq -f 3 addr_sort.csv > addr_sort_uniq.csv
+
+
+If you think this program is useful, please consider donating money or
+other means of contributing to the OpenStreetMap project.
(DIR) diff --git a/main.c b/main.c
@@ -0,0 +1,269 @@
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <ctype.h>
+#include <err.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "xml.h"
+
+/* Address field selected by the "k" attribute of the <tag> being parsed.
+ * The value 0 (no enumerator) is used for "no field of interest". */
+enum FieldType {
+ Postcode = 1,
+ Street = 2,
+ Housenr = 3,
+ City = 4,
+};
+
+/* Data collected for one <node>; every field is a NUL-terminated string and
+ * an empty string means "not seen". A line is only printed when all fields
+ * are non-empty (see printaddress()). */
+struct node_address {
+ char id[16]; /* node ID */
+ char lat[16]; /* node latitude */
+ char lon[16]; /* node longitude, must be same buffer size as lat */
+ char postcode[16]; /* value of the addr:postcode tag */
+ char street[128]; /* value of the addr:street tag */
+ char housenr[16]; /* value of the addr:housenumber tag, may hold a ';'-separated list */
+ char city[128]; /* value of the addr:city tag */
+};
+
+/* Key ("k" attribute) and value ("v" attribute) of the <tag> being parsed. */
+struct node_tag {
+ char key[16];
+ char value[256];
+};
+
+/* XML parser callbacks; main() installs them on the XMLParser. */
+void xmltagstart(XMLParser *x, const char *t, size_t tl);
+void xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
+ const char *v, size_t vl);
+void xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a,
+ size_t al, const char *v, size_t vl);
+void xmltagend(XMLParser *x, const char *t, size_t tl, int isshort);
+
+/* address fields collected for the <node> currently being parsed */
+static struct node_address na;
+/* key/value of the <tag> currently being parsed */
+static struct node_tag nt;
+/* set while inside a <node> element / inside a <tag> element of a node */
+static int isnode, istag;
+/* enum FieldType matching the current tag's key, 0 when it is not of interest */
+static int fieldtype;
+
+/* input file descriptor and its fstat() result */
+static int fd;
+struct stat st;
+/* memory-mapped input: base address, length in bytes and current read offset */
+unsigned char *reg;
+size_t len, off;
+
+/* Write the NUL-terminated string s to stdout, skipping control characters
+ * (such as TABs) so a value can never break the tab-separated output. */
+static inline void
+printfield(const char *s)
+{
+	for (; *s; s++) {
+		/* <ctype.h> functions need an unsigned char value: bytes of
+		 * UTF-8 text are negative where plain char is signed. */
+		if (!iscntrl((unsigned char)*s))
+			putchar(*s);
+	}
+}
+
+/* Write the first zipcode of s to stdout: output stops at the first ';'
+ * (list separator) and whitespace / control characters are removed
+ * (dutch format: "1234AB"). */
+static inline void
+printzipcode(const char *s)
+{
+	unsigned char c;
+
+	for (; *s && *s != ';'; s++) {
+		/* <ctype.h> functions need an unsigned char value */
+		c = (unsigned char)*s;
+		if (!isspace(c) && !iscntrl(c))
+			putchar(c);
+	}
+}
+
+/* Print the collected node address as tab-separated lines on stdout:
+ * id, lat, lon, zipcode, street, housenr, city.
+ * Nothing is printed unless every field is set. A ';'-separated list of
+ * house numbers produces one output line per list element. */
+static inline void
+printaddress(void)
+{
+	char *cur, *sep;
+
+	if (!(na.id[0] && na.lat[0] && na.lon[0] && na.postcode[0] &&
+	      na.street[0] && na.housenr[0] && na.city[0]))
+		return;
+
+	cur = na.housenr;
+	while (cur != NULL) {
+		/* end of the current house number, if more follow */
+		sep = strchr(cur, ';');
+
+		printfield(na.id);
+		putchar('\t');
+		printfield(na.lat);
+		putchar('\t');
+		printfield(na.lon);
+		putchar('\t');
+		printzipcode(na.postcode);
+		putchar('\t');
+		printfield(na.street);
+		putchar('\t');
+
+		/* temporarily cut the list at the separator while printing */
+		if (sep != NULL)
+			*sep = '\0';
+		printfield(cur);
+		if (sep != NULL) {
+			*sep = ';';
+			cur = sep + 1;
+		} else {
+			cur = NULL;
+		}
+
+		putchar('\t');
+		printfield(na.city);
+		putchar('\n');
+	}
+}
+
+/* Start-tag handler: records that a <node> was entered, or that a <tag>
+ * was entered while inside a node. t/tl is the tag name and its length. */
+void
+xmltagstart(XMLParser *x, const char *t, size_t tl)
+{
+	if (tl == 4 && !memcmp(t, "node", 4))
+		isnode = 1;
+	else if (isnode && tl == 3 && !memcmp(t, "tag", 3))
+		istag = 1;
+}
+
+/* End-tag handler.
+ * End of a node: print the collected address and reset all node state.
+ * End of a tag inside a node: copy the collected value into the address
+ * field selected by fieldtype, then reset the tag state. */
+void
+xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
+{
+	if (isnode && tl == 4 && !memcmp(t, "node", 4)) {
+		printaddress();
+
+		/* start the next node with empty fields */
+		na.id[0] = na.lat[0] = na.lon[0] = '\0';
+		na.postcode[0] = na.street[0] = '\0';
+		na.housenr[0] = na.city[0] = '\0';
+		fieldtype = 0;
+		isnode = 0;
+		return;
+	}
+
+	if (!(istag && tl == 3 && !memcmp(t, "tag", 3)))
+		return;
+
+	/* NOTE: assumes key attribute is parsed first */
+	if (fieldtype == Postcode)
+		strlcpy(na.postcode, nt.value, sizeof(na.postcode));
+	else if (fieldtype == Street)
+		strlcpy(na.street, nt.value, sizeof(na.street));
+	else if (fieldtype == Housenr)
+		strlcpy(na.housenr, nt.value, sizeof(na.housenr));
+	else if (fieldtype == City)
+		strlcpy(na.city, nt.value, sizeof(na.city));
+
+	nt.value[0] = '\0';
+	nt.key[0] = '\0';
+	fieldtype = 0;
+	istag = 0;
+}
+
+/* Attribute value handler.
+ *  - On a <node> element itself: store the "id", "lat" and "lon" values
+ *    (values that do not fit the destination buffer are ignored).
+ *  - On a <tag> inside a node: a "k" attribute of the form addr:<field>
+ *    selects the address field (fieldtype); a "v" attribute is appended
+ *    to nt.value, so a value may arrive in several calls.
+ * a/al is the attribute name, v/vl the NUL-terminated value (or chunk). */
+void
+xmlattr(XMLParser *x, const char *t, size_t tl,
+	const char *a, size_t al, const char *v, size_t vl)
+{
+	if (isnode && !istag) {
+		/* The copies are NUL-terminated explicitly: the buffers are
+		 * reused for every node, so a shorter value would otherwise
+		 * keep trailing bytes of the previous node's value. */
+		if (al == 2 && a[0] == 'i' && a[1] == 'd' && vl + 1 < sizeof(na.id)) {
+			/* id */
+			memcpy(na.id, v, vl);
+			na.id[vl] = '\0';
+		} else if (al == 3 && a[0] == 'l' && vl + 1 < sizeof(na.lat)) {
+			if (a[1] == 'a' && a[2] == 't') {
+				/* lat */
+				memcpy(na.lat, v, vl);
+				na.lat[vl] = '\0';
+			} else if (a[1] == 'o' && a[2] == 'n') {
+				/* lon: relies on lon and lat having the same size */
+				memcpy(na.lon, v, vl);
+				na.lon[vl] = '\0';
+			}
+		}
+		return;
+	}
+	/* Only k/v attributes of a <tag> inside a node are of interest. The
+	 * tag state is only reset at the end of such a tag, so anything
+	 * collected elsewhere would leak into the next node tag. */
+	if (!istag || al != 1)
+		return;
+
+	if (a[0] == 'k' && !strncmp(v, "addr", 4)) {
+		if (!strcmp(v + 4, ":postcode")) {
+			fieldtype = Postcode;
+			strlcat(nt.key, v, sizeof(nt.key));
+		} else if (!strcmp(v + 4, ":street")) {
+			fieldtype = Street;
+			strlcat(nt.key, v, sizeof(nt.key));
+		} else if (!strcmp(v + 4, ":housenumber")) {
+			fieldtype = Housenr;
+			strlcat(nt.key, v, sizeof(nt.key));
+		} else if (!strcmp(v + 4, ":city")) {
+			fieldtype = City;
+			strlcat(nt.key, v, sizeof(nt.key));
+		}
+		return;
+	} else if (a[0] == 'v') {
+		strlcat(nt.value, v, sizeof(nt.value));
+	}
+}
+
+/* Attribute entity handler: called with an entity (such as "&amp;") found
+ * inside an attribute value. Only entities in the "v" attribute of a <tag>
+ * are handled; the decoded text is forwarded to xmlattr(). An entity that
+ * cannot be decoded is forwarded unchanged. */
+void
+xmlattrentity(XMLParser *x, const char *t, size_t tl,
+	const char *a, size_t al, const char *v, size_t vl)
+{
+	char buf[16];
+	ssize_t n;
+
+	if (!istag || al != 1 || a[0] != 'v')
+		return;
+
+	/* xml_entitytostr() returns 0 for an unknown entity and < 0 on error;
+	 * in both cases buf holds nothing usable, so only a positive length
+	 * may be forwarded from buf. */
+	n = xml_entitytostr(v, buf, sizeof(buf));
+	if (n > 0)
+		xmlattr(x, t, tl, a, al, buf, (size_t)n);
+	else
+		xmlattr(x, t, tl, a, al, v, vl);
+}
+
+/* Input callback for the XML parser: return the next byte of the mapped
+ * file, or EOF once all len bytes were consumed. A progress percentage is
+ * written to stderr each time the offset is a multiple of 10 MiB. */
+static inline int
+getnext(void)
+{
+	if (off < len) {
+		/* progress meter */
+		if (!(off % (1048576*10)))
+			fprintf(stderr, "\rProgress: %.2f%%", ((float)off / (float)len) * 100.0);
+		return reg[off++];
+	}
+	return EOF;
+}
+
+/* Usage: main <file>
+ * Memory-map the OSM XML file given as first argument, run the XML parser
+ * over it and print the extracted addresses to stdout.
+ * Returns 0 on success, 1 on a usage error; exits through err(3) when the
+ * file cannot be opened, inspected or mapped. */
+int
+main(int argc, char *argv[])
+{
+	XMLParser x = { 0 };
+
+	if (argc < 2) {
+		fprintf(stderr, "usage: %s <file>\n", argv[0]);
+		return 1;
+	}
+
+	x.xmltagstart = xmltagstart;
+	x.xmlattr = xmlattr;
+	x.xmlattrentity = xmlattrentity;
+	x.xmltagend = xmltagend;
+
+	if ((fd = open(argv[1], O_RDONLY)) < 0)
+		err(1, "open");
+	if (fstat(fd, &st) < 0)
+		err(1, "fstat");
+
+	off = 0;
+	len = st.st_size;
+	/* mmap() rejects a zero-length mapping; an empty file has no data */
+	if (len == 0) {
+		close(fd);
+		return 0;
+	}
+
+	/* mmap() reports failure with MAP_FAILED, never with NULL */
+	reg = mmap(NULL, len, PROT_READ, MAP_SHARED|MAP_FILE, fd, 0);
+	if (reg == MAP_FAILED)
+		err(1, "mmap");
+
+	x.getnext = getnext;
+	xml_parse(&x);
+
+	munmap(reg, len);
+	close(fd);
+
+	return 0;
+}
(DIR) diff --git a/sort.sh b/sort.sh
@@ -0,0 +1,20 @@
+#!/bin/sh
+# Sort addr.csv (tab-separated output of ./main) into addr_sort.csv and
+# write the de-duplicated result to addr_sort_uniq.csv.
+
+# override malloc options for sort, else it is too slow.
+export MALLOC_OPTIONS="scfgju"
+# simple/binary collation.
+export LC_ALL=C
+
+# sort on city (field 7), street (field 5), housenumber (field 6, numeric)
+sort \
+ --mmap \
+ -k 7,7 -k 5,5 -k 6,6n \
+ -t "$(printf '\t')" \
+ -S 1G \
+ -T "$(pwd)" \
+ -o addr_sort.csv addr.csv
+
+# unique entries: -f 3 skips the first three fields (node id, latitude,
+# longitude) when comparing lines.
+uniq -f 3 addr_sort.csv > addr_sort_uniq.csv
+
+# DEBUG: show duplicate entries.
+#uniq -d -f 3 addr_sort.csv > addr_sort_uniq.csv
(DIR) diff --git a/xml.c b/xml.c
@@ -0,0 +1,446 @@
+#include <sys/types.h>
+
+#include <ctype.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "xml.h"
+
+/* Parse the attributes of the current start tag: reads characters through
+ * x->getnext() until '>' or EOF. Attribute names are collected in x->name
+ * and values in x->data; the xmlattrstart, xmlattr, xmlattrentity and
+ * xmlattrend handlers are called when they are set. A '/' sets
+ * x->isshorttag. */
+static void
+xml_parseattrs(XMLParser *x)
+{
+ size_t namelen = 0, valuelen;
+ int c, endsep, endname = 0;
+
+ while ((c = x->getnext()) != EOF) {
+ /* whitespace after a name marks that name as complete */
+ if (isspace(c)) { /* TODO: simplify endname ? */
+ if (namelen)
+ endname = 1;
+ continue;
+ }
+ if (c == '?')
+ ; /* ignore */
+ else if (c == '=') {
+ x->name[namelen] = '\0';
+ } else if (namelen && ((endname && isalpha(c)) || (c == '>' || c == '/'))) {
+ /* attribute without value */
+ x->name[namelen] = '\0';
+ if (x->xmlattrstart)
+ x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
+ if (x->xmlattr)
+ x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
+ if (x->xmlattrend)
+ x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
+ endname = 0;
+ /* c becomes the first character of the next name */
+ x->name[0] = c;
+ namelen = 1;
+ } else if (namelen && (c == '\'' || c == '"')) {
+ /* attribute with value */
+ endsep = c; /* c is end separator */
+ if (x->xmlattrstart)
+ x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
+ for (valuelen = 0; (c = x->getnext()) != EOF;) {
+ if (c == '&') { /* entities */
+ x->data[valuelen] = '\0';
+ /* call data function with data before entity if there is data */
+ if (valuelen && x->xmlattr)
+ x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+ x->data[0] = c;
+ valuelen = 1;
+ /* collect the entity up to ';', the end separator or a full buffer */
+ while ((c = x->getnext()) != EOF) {
+ if (c == endsep)
+ break;
+ if (valuelen < sizeof(x->data) - 1)
+ x->data[valuelen++] = c;
+ else {
+ /* TODO: entity too long? this should be very strange. */
+ x->data[valuelen] = '\0';
+ if (x->xmlattr)
+ x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+ valuelen = 0;
+ break;
+ }
+ if (c == ';') {
+ x->data[valuelen] = '\0';
+ if (x->xmlattrentity)
+ x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+ valuelen = 0;
+ break;
+ }
+ }
+ } else if (c != endsep) {
+ if (valuelen < sizeof(x->data) - 1) {
+ x->data[valuelen++] = c;
+ } else {
+ /* buffer full: pass the collected part on and start a new chunk with c */
+ x->data[valuelen] = '\0';
+ if (x->xmlattr)
+ x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+ x->data[0] = c;
+ valuelen = 1;
+ }
+ }
+ /* end of value: pass on the remaining data (possibly empty) */
+ if (c == endsep) {
+ x->data[valuelen] = '\0';
+ if (x->xmlattr)
+ x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
+ if (x->xmlattrend)
+ x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
+ break;
+ }
+ }
+ namelen = endname = 0;
+ } else if (namelen < sizeof(x->name) - 1) {
+ /* part of an attribute name; characters that do not fit are dropped */
+ x->name[namelen++] = c;
+ }
+ if (c == '>') {
+ break;
+ } else if (c == '/') {
+ x->isshorttag = 1;
+ namelen = 0;
+ x->name[0] = '\0';
+ }
+ }
+}
+
+/* Parse a comment body (xml_parse() calls this after reading "<!--") until
+ * "-->" or EOF. The text is passed to x->xmlcomment in chunks; the
+ * xmlcommentstart and xmlcommentend handlers are called when set. */
+static void
+xml_parsecomment(XMLParser *x)
+{
+ /* i counts the consecutive '-' characters seen so far */
+ size_t datalen = 0, i = 0;
+ int c;
+
+ if (x->xmlcommentstart)
+ x->xmlcommentstart(x);
+ while ((c = x->getnext()) != EOF) {
+ /* a possible terminator character: pass on the data collected so far */
+ if (c == '-' || c == '>') {
+ if (x->xmlcomment) {
+ x->data[datalen] = '\0';
+ x->xmlcomment(x, x->data, datalen);
+ datalen = 0;
+ }
+ }
+
+ if (c == '-') {
+ /* more than two dashes in a row: report the surplus as data, keep two pending */
+ if (++i > 2) {
+ if (x->xmlcomment)
+ for (; i > 2; i--)
+ x->xmlcomment(x, "-", 1);
+ i = 2;
+ }
+ continue;
+ } else if (c == '>' && i == 2) {
+ /* "-->": end of comment */
+ if (x->xmlcommentend)
+ x->xmlcommentend(x);
+ return;
+ } else if (i) {
+ /* the pending dashes were not a terminator: report them as data */
+ if (x->xmlcomment) {
+ for (; i > 0; i--)
+ x->xmlcomment(x, "-", 1);
+ }
+ i = 0;
+ }
+
+ if (datalen < sizeof(x->data) - 1) {
+ x->data[datalen++] = c;
+ } else {
+ /* buffer full: pass the chunk on and start a new one with c */
+ x->data[datalen] = '\0';
+ if (x->xmlcomment)
+ x->xmlcomment(x, x->data, datalen);
+ x->data[0] = c;
+ datalen = 1;
+ }
+ }
+}
+
+/* Parse a CDATA section body (xml_parse() calls this after reading
+ * "<![CDATA[") until "]]>" or EOF. The text is passed to x->xmlcdata in
+ * chunks; the xmlcdatastart and xmlcdataend handlers are called when set. */
+static void
+xml_parsecdata(XMLParser *x)
+{
+ /* i counts the consecutive ']' characters seen so far */
+ size_t datalen = 0, i = 0;
+ int c;
+
+ if (x->xmlcdatastart)
+ x->xmlcdatastart(x);
+ while ((c = x->getnext()) != EOF) {
+ /* a possible terminator character: pass on the data collected so far */
+ if (c == ']' || c == '>') {
+ if (x->xmlcdata) {
+ x->data[datalen] = '\0';
+ x->xmlcdata(x, x->data, datalen);
+ datalen = 0;
+ }
+ }
+
+ if (c == ']') {
+ /* more than two brackets in a row: report the surplus as data, keep two pending */
+ if (++i > 2) {
+ if (x->xmlcdata)
+ for (; i > 2; i--)
+ x->xmlcdata(x, "]", 1);
+ i = 2;
+ }
+ continue;
+ } else if (c == '>' && i == 2) {
+ /* "]]>": end of the CDATA section */
+ if (x->xmlcdataend)
+ x->xmlcdataend(x);
+ return;
+ } else if (i) {
+ /* the pending brackets were not a terminator: report them as data */
+ if (x->xmlcdata)
+ for (; i > 0; i--)
+ x->xmlcdata(x, "]", 1);
+ i = 0;
+ }
+
+ if (datalen < sizeof(x->data) - 1) {
+ x->data[datalen++] = c;
+ } else {
+ /* buffer full: pass the chunk on and start a new one with c */
+ x->data[datalen] = '\0';
+ if (x->xmlcdata)
+ x->xmlcdata(x, x->data, datalen);
+ x->data[0] = c;
+ datalen = 1;
+ }
+ }
+}
+
+/* Encode code point cp as UTF-8. The encoded bytes are packed into *utf
+ * with the first byte of the sequence in the most significant used byte.
+ * Returns the number of bytes in the sequence (1 to 4), or 0 when cp is 0. */
+int
+xml_codepointtoutf8(uint32_t cp, uint32_t *utf)
+{
+	/* payload bits of cp, already moved to their place in the result */
+	const uint32_t b0 = cp & 0x3f;
+	const uint32_t b1 = (cp & 0xfc0) << 2;
+	const uint32_t b2 = (cp & 0x3f000) << 4;
+	const uint32_t b3 = (cp & 0xfc0000) << 6;
+
+	if (cp < 0x80) {
+		/* 1 byte */
+		*utf = cp & 0xff;
+		return *utf ? 1 : 0;
+	}
+	if (cp < 0x00800) {
+		/* 2 bytes */
+		*utf = 0xc080 | b1 | b0;
+		return 2;
+	}
+	if (cp < 0x10000) {
+		/* 3 bytes */
+		*utf = 0xe08080 | b2 | b1 | b0;
+		return 3;
+	}
+	/* 4 bytes */
+	*utf = 0xf0808080 | b3 | b2 | b1 | b0;
+	return 4;
+}
+
+/* Convert a named XML entity to its character.
+ * e is the NUL-terminated entity including the leading '&' and the
+ * trailing ';' (for example "&amp;"); the five predefined XML entities are
+ * known in lower- and upper-case spelling.
+ * Returns 1 and stores the character as a string in buf on a match,
+ * 0 when e is not a known entity (buf is then set to the empty string),
+ * -1 when bufsiz is smaller than 2. */
+ssize_t
+xml_namedentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	static const struct {
+		const char *entity;
+		int c;
+	} entities[] = {
+		{ .entity = "&amp;",  .c = '&'  },
+		{ .entity = "&lt;",   .c = '<'  },
+		{ .entity = "&gt;",   .c = '>'  },
+		{ .entity = "&apos;", .c = '\'' },
+		{ .entity = "&quot;", .c = '"'  },
+		{ .entity = "&AMP;",  .c = '&'  },
+		{ .entity = "&LT;",   .c = '<'  },
+		{ .entity = "&GT;",   .c = '>'  },
+		{ .entity = "&APOS;", .c = '\'' },
+		{ .entity = "&QUOT;", .c = '"'  }
+	};
+	size_t i;
+
+	/* buffer is too small */
+	if (bufsiz < 2)
+		return -1;
+
+	/* never leave buf undefined for callers that read it on a 0 return */
+	buf[0] = '\0';
+
+	/* doesn't start with &: can't match */
+	if (*e != '&')
+		return 0;
+
+	for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
+		if (!strcmp(e, entities[i].entity)) {
+			buf[0] = entities[i].c;
+			buf[1] = '\0';
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/* Convert a numeric XML entity ("&#8364;" or "&#x20ac;") to UTF-8.
+ * e is the NUL-terminated entity including the leading "&#" and the
+ * trailing ';'.
+ * Returns the byte length of the NUL-terminated string stored in buf,
+ * 0 when e is not a well-formed numeric entity or its value is above
+ * U+10FFFF (buf is then set to the empty string), -1 when bufsiz is
+ * smaller than 5. */
+ssize_t
+xml_numericentitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	unsigned long l;
+	uint32_t cp = 0;
+	size_t b, len;
+	const char *start;
+	char *end;
+	int base;
+
+	/* buffer is too small */
+	if (bufsiz < 5)
+		return -1;
+
+	/* never leave buf undefined for callers that read it on a 0 return */
+	buf[0] = '\0';
+
+	/* not a numeric entity */
+	if (e[0] != '&' || e[1] != '#')
+		return 0;
+
+	/* skip "&#"; a following 'x' selects hexadecimal (16), else decimal (10) */
+	e += 2;
+	if (*e == 'x') {
+		start = e + 1;
+		base = 16;
+	} else {
+		start = e;
+		base = 10;
+	}
+
+	errno = 0;
+	l = strtoul(start, &end, base);
+	/* invalid value, no digits at all or not a well-formed entity */
+	if (errno || end == start || *end != ';')
+		return 0;
+	/* Check the range before narrowing to 32 bits: unsigned long may be
+	 * wider, and nothing above U+10FFFF can be encoded as UTF-8. */
+	if (l > 0x10ffff)
+		return 0;
+
+	len = xml_codepointtoutf8((uint32_t)l, &cp);
+	/* make string: the first byte of the sequence is the most significant */
+	for (b = 0; b < len; b++)
+		buf[b] = (cp >> (8 * (len - 1 - b))) & 0xff;
+	buf[len] = '\0';
+
+	return (ssize_t)len;
+}
+
+/* Convert a named or numeric entity string e to its text in buf.
+ * Returns the byte length of the resulting string, 0 when e does not
+ * start with '&' (or the entity-specific converter returns 0), and -1
+ * when bufsiz is smaller than 5. */
+ssize_t
+xml_entitytostr(const char *e, char *buf, size_t bufsiz)
+{
+	if (bufsiz < 5)
+		return -1; /* buffer is too small */
+	if (e[0] != '&')
+		return 0; /* not an entity */
+
+	/* "&#" introduces a numeric entity, anything else is a named one */
+	return (e[1] == '#') ?
+	    xml_numericentitytostr(e, buf, bufsiz) :
+	    xml_namedentitytostr(e, buf, bufsiz);
+}
+
+/* Parse the XML input delivered one character at a time by x->getnext()
+ * until EOF, calling the handlers set in x for tags, attributes, comments,
+ * CDATA sections and character data. Returns immediately when x->getnext
+ * is not set. */
+void
+xml_parse(XMLParser *x)
+{
+ int c, ispi;
+ size_t datalen, tagdatalen, taglen;
+
+ if (!x->getnext)
+ return;
+ while ((c = x->getnext()) != EOF && c != '<')
+ ; /* skip until < */
+
+ while (c != EOF) {
+ if (c == '<') { /* parse tag */
+ if ((c = x->getnext()) == EOF)
+ return;
+ x->tag[0] = '\0';
+ x->taglen = 0;
+ if (c == '!') { /* cdata and comments */
+ /* keep the first characters after "<!" in x->data to recognize "--" and "[CDATA[" */
+ for (tagdatalen = 0; (c = x->getnext()) != EOF;) {
+ if (tagdatalen <= sizeof("[CDATA[") - 1) /* if (d < sizeof(x->data)) */
+ x->data[tagdatalen++] = c; /* TODO: prevent overflow */
+ if (c == '>')
+ break;
+ else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
+ (x->data[0] == '-')) {
+ xml_parsecomment(x);
+ break;
+ } else if (c == '[') {
+ if (tagdatalen == sizeof("[CDATA[") - 1 &&
+ !strncmp(x->data, "[CDATA[", tagdatalen)) {
+ xml_parsecdata(x);
+ break;
+ }
+ }
+ }
+ } else {
+ /* normal tag (open, short open, close), processing instruction. */
+ if (isspace(c))
+ while ((c = x->getnext()) != EOF && isspace(c))
+ ;
+ if (c == EOF)
+ return;
+ x->tag[0] = c;
+ ispi = (c == '?') ? 1 : 0;
+ x->isshorttag = ispi;
+ taglen = 1;
+ /* collect the tag name up to whitespace or '>' */
+ while ((c = x->getnext()) != EOF) {
+ if (c == '/') /* TODO: simplify short tag? */
+ x->isshorttag = 1; /* short tag */
+ else if (c == '>' || isspace(c)) {
+ x->tag[taglen] = '\0';
+ if (x->tag[0] == '/') { /* end tag, starts with </ */
+ x->taglen = --taglen; /* len -1 because of / */
+ if (taglen && x->xmltagend)
+ x->xmltagend(x, &(x->tag)[1], x->taglen, 0);
+ } else {
+ x->taglen = taglen;
+ /* start tag */
+ if (x->xmltagstart)
+ x->xmltagstart(x, x->tag, x->taglen);
+ /* whitespace after the name: attributes may follow */
+ if (isspace(c))
+ xml_parseattrs(x);
+ if (x->xmltagstartparsed)
+ x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
+ }
+ /* call tagend for shortform or processing instruction */
+ if ((x->isshorttag || ispi) && x->xmltagend)
+ x->xmltagend(x, x->tag, x->taglen, 1);
+ break;
+ } else if (taglen < sizeof(x->tag) - 1)
+ x->tag[taglen++] = c;
+ }
+ }
+ } else {
+ /* parse tag data */
+ datalen = 0;
+ if (x->xmldatastart)
+ x->xmldatastart(x);
+ while ((c = x->getnext()) != EOF) {
+ if (c == '&') {
+ /* entity: first pass on the data collected before it */
+ if (datalen) {
+ x->data[datalen] = '\0';
+ if (x->xmldata)
+ x->xmldata(x, x->data, datalen);
+ }
+ x->data[0] = c;
+ datalen = 1;
+ /* collect the entity up to ';', whitespace or the next '<' */
+ while ((c = x->getnext()) != EOF) {
+ if (c == '<')
+ break;
+ if (datalen < sizeof(x->data) - 1)
+ x->data[datalen++] = c;
+ if (isspace(c))
+ break;
+ else if (c == ';') {
+ x->data[datalen] = '\0';
+ if (x->xmldataentity)
+ x->xmldataentity(x, x->data, datalen);
+ datalen = 0;
+ break;
+ }
+ }
+ } else if (c != '<') {
+ if (datalen < sizeof(x->data) - 1) {
+ x->data[datalen++] = c;
+ } else {
+ /* buffer full: pass the chunk on and start a new one with c */
+ x->data[datalen] = '\0';
+ if (x->xmldata)
+ x->xmldata(x, x->data, datalen);
+ x->data[0] = c;
+ datalen = 1;
+ }
+ }
+ /* start of the next tag: pass on the remaining data, the outer loop parses the tag */
+ if (c == '<') {
+ x->data[datalen] = '\0';
+ if (x->xmldata && datalen)
+ x->xmldata(x, x->data, datalen);
+ if (x->xmldataend)
+ x->xmldataend(x);
+ break;
+ }
+ }
+ }
+ }
+}
(DIR) diff --git a/xml.h b/xml.h
@@ -0,0 +1,44 @@
+/* XML parser context: the optional event handlers, the input callback and
+ * the buffers the parser works with.
+ * NOTE: this header uses size_t and BUFSIZ (and ssize_t / uint32_t further
+ * down) without including anything itself; the including file must provide
+ * them. */
+typedef struct xmlparser {
+ /* handlers; each one is only called when it is not NULL */
+ /* arguments: parser, tag name, tag length, attribute name, name length, value, value length */
+ void (*xmlattr)(struct xmlparser *, const char *, size_t,
+ const char *, size_t, const char *, size_t);
+ void (*xmlattrend)(struct xmlparser *, const char *, size_t,
+ const char *, size_t);
+ void (*xmlattrstart)(struct xmlparser *, const char *, size_t,
+ const char *, size_t);
+ void (*xmlattrentity)(struct xmlparser *, const char *, size_t,
+ const char *, size_t, const char *, size_t);
+ void (*xmlcdatastart)(struct xmlparser *);
+ void (*xmlcdata)(struct xmlparser *, const char *, size_t);
+ void (*xmlcdataend)(struct xmlparser *);
+ void (*xmlcommentstart)(struct xmlparser *);
+ void (*xmlcomment)(struct xmlparser *, const char *, size_t);
+ void (*xmlcommentend)(struct xmlparser *);
+ void (*xmldata)(struct xmlparser *, const char *, size_t);
+ void (*xmldataend)(struct xmlparser *);
+ void (*xmldataentity)(struct xmlparser *, const char *, size_t);
+ void (*xmldatastart)(struct xmlparser *);
+ /* arguments: parser, tag name, tag length, 1 for a short tag or processing instruction */
+ void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
+ void (*xmltagstart)(struct xmlparser *, const char *, size_t);
+ void (*xmltagstartparsed)(struct xmlparser *, const char *,
+ size_t, int);
+
+ /* input callback: returns the next character or EOF; must be set */
+ int (*getnext)(void);
+
+ /* current tag */
+ char tag[1024];
+ size_t taglen;
+ /* current tag is in short form ? <tag /> */
+ int isshorttag;
+ /* current attribute name */
+ char name[256];
+ /* data buffer used for tag data, cdata and attribute data */
+ char data[BUFSIZ];
+} XMLParser;
+
+/* encode a code point as UTF-8 packed into a uint32_t, returns the byte count */
+int xml_codepointtoutf8(uint32_t, uint32_t *);
+/* entity string (entity, buffer, buffer size) to text; returns the byte
+ * length, 0 when nothing was converted, -1 when the buffer is too small */
+ssize_t xml_entitytostr(const char *, char *, size_t);
+ssize_t xml_namedentitytostr(const char *, char *, size_t);
+ssize_t xml_numericentitytostr(const char *, char *, size_t);
+
+/* parse the input delivered by the getnext callback, calling the handlers set */
+void xml_parse(XMLParser *);