parse.c - bag - Dutch BAG Kadaster Extract parser (subset)
(HTM) git clone git://git.codemadness.org/bag
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
parse.c (16615B)
---
1 #define USE_MMAP
2
3 #if WIN32
4 #include <io.h> /* for setmode() */
5 #endif
6
7 #ifdef USE_MMAP
8 #include <sys/mman.h>
9 #include <sys/stat.h>
10 #include <sys/types.h>
11
12 #include <err.h>
13 #include <fcntl.h>
14 #endif
15
16 #include <errno.h>
17 #include <limits.h>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <unistd.h>
22
/*
 * ctype-like macros, but always compatible with ASCII / UTF-8.
 *
 * The argument is fully parenthesised before the cast to unsigned so that
 * compound expressions (e.g. a conditional expression) are converted as a
 * whole; the unsigned subtraction then wraps around for values below the
 * range start, which makes a single "<" comparison a full range check.
 *
 * ISCNTRL() compares the value as-is: callers holding a plain char must cast
 * it to unsigned char first, otherwise bytes >= 0x80 may compare as negative.
 */
#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
#define ISCNTRL(c) ((c) < ' ' || (c) == 0x7f)
#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))

/* character output routine used by printaddress() */
#define PUTCHAR putchar_unlocked
/*#define PUTCHAR putchar*/
30
/*
 * One output record. The fields are filled by xmldata() from the character
 * data of the matching XML elements, emitted by printaddress() and reset by
 * clearaddress(). All fields are NUL-terminated strings.
 */
struct address {
	char bagnr[64]; /* data of Objecten:identificatie or Objecten-ref:NummeraanduidingRef */
	char oppervlakte[256]; /* data of Objecten:oppervlakte */
	char status[256]; /* data of Objecten:status */
	char gebruiksdoel[256]; /* data of Objecten:gebruiksdoel, multiple values joined with ", " */
	char huisnummer[32]; /* data of Objecten:huisnummer */
	char huisletter[32]; /* data of Objecten:huisletter */
	char huisnummertoevoeging[32]; /* data of Objecten:huisnummertoevoeging */
	char postcode[8]; /* data of Objecten:postcode */
};
41
/*
 * Parser state shared between xml_parse(), its helpers and the callbacks
 * (xmlattr, xmldata, xmltagstart, xmltagend).
 */
typedef struct xmlparser {
	/* current tag (NUL-terminated, truncated to fit the buffer) */
	char tag[1024];
	size_t taglen;
	/* current tag is a short tag ? <tag /> */
	int isshorttag;
	/* current attribute name (NUL-terminated, truncated to fit the buffer) */
	char name[1024];
	/* data buffer used for tag data, CDATA and attribute data;
	 * longer data is handed to the callbacks in several chunks */
	char data[BUFSIZ];
} XMLParser;
53
/* parser entry points */
int xml_entitytostr(const char *, char *, size_t);
void xml_parse(XMLParser *);

/* callbacks invoked by the parser: attribute, character data, end of tag
 * and start of tag */
static void xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
                    const char *v, size_t vl);
static void xmldata(XMLParser *x, const char *d, size_t dl);
static void xmltagend(XMLParser *x, const char *t, size_t tl, int isshort);
static void xmltagstart(XMLParser *x, const char *t, size_t tl);

/* the single parser instance and the record currently being collected */
static XMLParser x;
static struct address address;
/* nesting flags: set by xmltagstart() and cleared by xmltagend() for the
 * sl-bag-extract:bagObject, Objecten:Nummeraanduiding and
 * Objecten:heeftAlsHoofdadres elements respectively */
static int inbagobject, innummeraanduiding, inhoofdadres;
/* set by xmlattr() when the attribute domein="NL.IMBAG.Nummeraanduiding" is
 * seen on an identification tag; xmldata() only stores bagnr while it is set */
static int isbagnrtype;
/* set when a Historie:eindGeldigheid tag starts inside the current object;
 * printaddress() skips the record when it is set */
static int eindgeldig;
68
69 /* different readers, performance differs per platform */
70 #ifdef USE_MMAP
71
static int fd; /* descriptor of the input file, opened in main() */
struct stat st; /* fstat() result for fd; st_size is used as mapping length */
unsigned char *reg; /* start of the memory-mapped input, set in main() */
size_t len, off; /* length of the mapping and current read offset into it */

/* return the next input byte, or EOF once the read offset reaches len */
#define GETNEXT() (off >= len ? EOF : reg[off++])
78
79 #else
80
81 #if 1
82 #define GETNEXT getchar_unlocked
83 #else
/* read position and number of valid bytes in rbuf */
static size_t roffset, rtotal;
/* unsigned so that bytes >= 0x80 are never returned as negative values
 * (0xFF would otherwise be indistinguishable from EOF) */
static unsigned char rbuf[4096*4];

/*
 * Buffered reader for stdin.
 *
 * Returns the next input byte as a value in the range 0..255, or EOF when
 * the input is exhausted. Prints a message and exits with status 1 on a
 * read error.
 */
int
getnext(void)
{
	size_t n;

	if (roffset >= rtotal) {
		n = fread(rbuf, 1, sizeof(rbuf), stdin);
		if (ferror(stdin)) {
			perror(NULL);
			exit(1);
		}
		roffset = 0;
		rtotal = n;
		/* only an empty read means end of input: the final read is
		 * usually short and sets the EOF indicator while still
		 * delivering data that must not be dropped */
		if (n == 0)
			return EOF;
	}
	return rbuf[roffset++];
}
108
109 #define GETNEXT getnext
110 #endif
111 #endif
112
/*
 * Parse the attributes of the current tag, starting after the whitespace
 * that follows the tag name, and stop after the closing '>' (or at EOF).
 *
 * For every attribute xmlattr() is called with the tag name, the attribute
 * name (kept in x->name, silently truncated to the buffer size) and the
 * value. Values longer than x->data are delivered in several xmlattr()
 * calls. An attribute without a value is reported with an empty value.
 * A '/' sets x->isshorttag.
 *
 * Entities inside a value are not reported: the text from '&' up to and
 * including ';' is collected in x->data and then discarded.
 */
static void
xml_parseattrs(XMLParser *x)
{
	size_t namelen = 0, valuelen;
	int c, endsep, endname = 0, valuestart = 0;

	while ((c = GETNEXT()) != EOF) {
		if (ISSPACE(c)) {
			/* whitespace after a name terminates the name */
			if (namelen)
				endname = 1;
			continue;
		} else if (c == '?')
			; /* ignore */
		else if (c == '=') {
			x->name[namelen] = '\0';
			valuestart = 1;
			endname = 1;
		} else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
			/* attribute without value */
			x->name[namelen] = '\0';
			xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
			endname = 0;
			/* the current character already starts the next name */
			x->name[0] = c;
			namelen = 1;
		} else if (namelen && valuestart) {
			/* attribute with value */

			valuelen = 0;
			if (c == '\'' || c == '"') {
				endsep = c;
			} else {
				/* unquoted value: c is its first character and
				 * the value ends at whitespace or '>' */
				endsep = ' '; /* ISSPACE() */
				goto startvalue;
			}

			while ((c = GETNEXT()) != EOF) {
startvalue:
				if (c == '&') { /* entities */
					x->data[valuelen] = '\0';
					/* call data function with data before entity if there is data */
					if (valuelen)
						xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
					x->data[0] = c;
					valuelen = 1;
					while ((c = GETNEXT()) != EOF) {
						if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
							break;
						if (valuelen < sizeof(x->data) - 1)
							x->data[valuelen++] = c;
						else {
							/* entity too long for buffer, handle as normal data */
							x->data[valuelen] = '\0';
							xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
							x->data[0] = c;
							valuelen = 1;
							break;
						}
						if (c == ';') {
							/* complete entity: drop it without reporting */
							x->data[valuelen] = '\0';
							valuelen = 0;
							break;
						}
					}
				} else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
					if (valuelen < sizeof(x->data) - 1) {
						x->data[valuelen++] = c;
					} else {
						/* buffer full: report this chunk and start a new one */
						x->data[valuelen] = '\0';
						xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
						x->data[0] = c;
						valuelen = 1;
					}
				}
				if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
					/* end of value: report the (remaining) data */
					x->data[valuelen] = '\0';
					xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
					break;
				}
			}
			namelen = endname = valuestart = 0;
		} else if (namelen < sizeof(x->name) - 1) {
			/* part of an attribute name; excess characters are dropped */
			x->name[namelen++] = c;
		}
		if (c == '>') {
			break;
		} else if (c == '/') {
			x->isshorttag = 1;
			x->name[0] = '\0';
			namelen = 0;
		}
	}
}
205
206 static void
207 xml_parsecomment(XMLParser *x)
208 {
209 size_t i = 0;
210 int c;
211
212 while ((c = GETNEXT()) != EOF) {
213 if (c == '-') {
214 if (++i > 2) {
215 i = 2;
216 }
217 continue;
218 } else if (c == '>' && i == 2) {
219 return;
220 } else if (i) {
221 i = 0;
222 }
223 }
224 }
225
226 static void
227 xml_parsecdata(XMLParser *x)
228 {
229 size_t datalen = 0, i = 0;
230 int c;
231
232 while ((c = GETNEXT()) != EOF) {
233 if (c == ']' || c == '>') {
234 if (datalen) {
235 x->data[datalen] = '\0';
236 xmldata(x, x->data, datalen);
237 datalen = 0;
238 }
239 }
240
241 if (c == ']') {
242 if (++i > 2) {
243 for (; i > 2; i--)
244 xmldata(x, "]", 1);
245 i = 2;
246 }
247 continue;
248 } else if (c == '>' && i == 2) {
249 return;
250 } else if (i) {
251 for (; i > 0; i--)
252 xmldata(x, "]", 1);
253 i = 0;
254 }
255
256 if (datalen < sizeof(x->data) - 1) {
257 x->data[datalen++] = c;
258 } else {
259 x->data[datalen] = '\0';
260 xmldata(x, x->data, datalen);
261 x->data[0] = c;
262 datalen = 1;
263 }
264 }
265 }
266
/*
 * Encode code point r as UTF-8 into s (not NUL-terminated).
 * Returns the number of bytes written (1 to 4), or 0 for r == 0, in which
 * case nothing is written. s must have room for 4 bytes. The value is not
 * validated here.
 */
static int
codepointtoutf8(long r, char *s)
{
	int nbytes, i;

	if (r == 0)
		return 0; /* NUL byte */
	if (r <= 0x7F) {
		/* 1 byte: 0aaaaaaa */
		s[0] = r;
		return 1;
	}

	if (r <= 0x07FF)
		nbytes = 2;
	else if (r <= 0xFFFF)
		nbytes = 3;
	else
		nbytes = 4;

	/* continuation bytes (10xxxxxx), filled from the last one backwards,
	 * each taking the next 6 low bits */
	for (i = nbytes - 1; i > 0; i--) {
		s[i] = 0x80 | (r & 0x3F);
		r >>= 6;
	}

	/* lead byte: length marker plus the remaining high bits */
	if (nbytes == 2)
		s[0] = 0xC0 | (r & 0x1F); /* 110aaaaa */
	else if (nbytes == 3)
		s[0] = 0xE0 | (r & 0x0F); /* 1110aaaa */
	else
		s[0] = 0xF0 | (r & 0x07); /* 11110aaa */

	return nbytes;
}
296
/*
 * Translate a named entity, given without the leading '&' but including the
 * trailing ';' (e.g. "amp;"), into its character.
 * On success the character and a terminating NUL are stored in buf and 1 is
 * returned. Returns -1 when bufsiz is smaller than 2 or the name is unknown.
 */
static int
namedentitytostr(const char *e, char *buf, size_t bufsiz)
{
	static const char *const names[] = { "amp;", "lt;", "gt;", "apos;", "quot;" };
	static const char chars[] = { '&', '<', '>', '\'', '"' };
	size_t k;

	/* need room for one character plus NUL */
	if (bufsiz < 2)
		return -1;

	for (k = 0; k < sizeof(names) / sizeof(names[0]); k++) {
		if (strcmp(e, names[k]) != 0)
			continue;
		buf[0] = chars[k];
		buf[1] = '\0';
		return 1;
	}
	return -1;
}
325
/*
 * Translate a numeric character reference, given without the leading "&#"
 * but including the trailing ';' (e.g. "65;" or "x41;"), into UTF-8.
 *
 * On success the encoded bytes and a terminating NUL are stored in buf and
 * the byte length is returned (0 for code point 0). Returns -1 when bufsiz
 * is smaller than 5, when the reference is malformed, or when the value is
 * out of range (above 0x10ffff or a surrogate 0xd800-0xdfff).
 */
static int
numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
	const char *digits = "0123456789";
	size_t ndigits;
	long l;
	int len, base = 10;
	char *end;

	/* buffer is too small */
	if (bufsiz < 5)
		return -1;

	/* hex (16) or decimal (10) */
	if (*e == 'x') {
		e++;
		base = 16;
		digits = "0123456789abcdefABCDEF";
	}

	/* strtol() on its own also accepts leading whitespace, a sign and
	 * (for base 16) a "0x" prefix; require digits only, followed by ';' */
	ndigits = strspn(e, digits);
	if (ndigits == 0 || e[ndigits] != ';')
		return -1;

	errno = 0;
	l = strtol(e, &end, base);
	/* invalid value or not a well-formed entity or invalid code point */
	if (errno || end != e + ndigits || *end != ';' || l < 0 || l > 0x10ffff ||
	    (l >= 0xd800 && l <= 0xdfff))
		return -1;
	len = codepointtoutf8(l, buf);
	buf[len] = '\0';

	return len;
}
352
/*
 * Convert a named or numeric entity string (starting with '&') to its
 * string value in buf.
 * Returns the byte length of the result, or -1 on failure (including when
 * the input does not start with '&').
 */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
	if (e[0] != '&')
		return -1;

	/* "&#..." is a numeric reference, everything else a named entity */
	return (e[1] == '#')
	    ? numericentitytostr(e + 2, buf, bufsiz)
	    : namedentitytostr(e + 1, buf, bufsiz);
}
367
/*
 * Parse the whole input obtained through GETNEXT() and drive the callbacks:
 * xmltagstart() / xmltagend() for tags, xmlattr() for attributes (through
 * xml_parseattrs()) and xmldata() for character data and CDATA content.
 *
 * Everything before the first '<' is skipped. Tag names longer than x->tag
 * are truncated; data longer than x->data is delivered in several chunks.
 * Entities inside character data are not reported: the text from '&' up to
 * and including ';' is collected in x->data and then discarded.
 */
void
xml_parse(XMLParser *x)
{
	size_t datalen, tagdatalen;
	int c, isend;

	while ((c = GETNEXT()) != EOF && c != '<')
		; /* skip until < */

	while (c != EOF) {
		if (c == '<') { /* parse tag */
			if ((c = GETNEXT()) == EOF)
				return;

			if (c == '!') { /* CDATA and comments */
				/* buffer the first characters after "<!" to tell a
				 * comment, a CDATA section and other declarations apart;
				 * other declarations are skipped up to the next '>' */
				for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
					/* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
					if (tagdatalen <= sizeof("[CDATA[") - 1)
						x->data[tagdatalen++] = c;
					if (c == '>')
						break;
					else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
					         (x->data[0] == '-')) {
						xml_parsecomment(x);
						break;
					} else if (c == '[') {
						if (tagdatalen == sizeof("[CDATA[") - 1 &&
						    !strncmp(x->data, "[CDATA[", tagdatalen)) {
							xml_parsecdata(x);
							break;
						}
					}
				}
			} else {
				/* normal tag (open, short open, close), processing instruction. */
				x->tag[0] = c;
				x->taglen = 1;
				x->isshorttag = isend = 0;

				/* treat processing instruction as short tag, don't strip "?" prefix. */
				if (c == '?') {
					x->isshorttag = 1;
				} else if (c == '/') {
					/* end tag: the name starts after the '/' */
					if ((c = GETNEXT()) == EOF)
						return;
					x->tag[0] = c;
					isend = 1;
				}

				while ((c = GETNEXT()) != EOF) {
					if (c == '/')
						x->isshorttag = 1; /* short tag */
					else if (c == '>' || ISSPACE(c)) {
						/* end of the tag name */
						x->tag[x->taglen] = '\0';
						if (isend) { /* end tag, starts with </ */
							xmltagend(x, x->tag, x->taglen, x->isshorttag);
							x->tag[0] = '\0';
							x->taglen = 0;
						} else {
							/* start tag */
							xmltagstart(x, x->tag, x->taglen);
							if (ISSPACE(c))
								xml_parseattrs(x);
						}
						/* call tagend for short tag or processing instruction */
						if (x->isshorttag) {
							xmltagend(x, x->tag, x->taglen, x->isshorttag);
							x->tag[0] = '\0';
							x->taglen = 0;
						}
						break;
					} else if (x->taglen < sizeof(x->tag) - 1)
						x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
				}
			}
		} else {
			/* parse tag data */
			datalen = 0;
			while ((c = GETNEXT()) != EOF) {
				if (c == '&') {
					/* report the data collected before the entity */
					if (datalen) {
						x->data[datalen] = '\0';
						xmldata(x, x->data, datalen);
					}
					x->data[0] = c;
					datalen = 1;
					while ((c = GETNEXT()) != EOF) {
						if (c == '<')
							break;
						if (datalen < sizeof(x->data) - 1)
							x->data[datalen++] = c;
						else {
							/* entity too long for buffer, handle as normal data */
							x->data[datalen] = '\0';
							xmldata(x, x->data, datalen);
							x->data[0] = c;
							datalen = 1;
							break;
						}
						if (c == ';') {
							/* complete entity: drop it without reporting */
							x->data[datalen] = '\0';
							datalen = 0;
							break;
						}
					}
				} else if (c != '<') {
					if (datalen < sizeof(x->data) - 1) {
						x->data[datalen++] = c;
					} else {
						/* buffer full: report this chunk and start a new one */
						x->data[datalen] = '\0';
						xmldata(x, x->data, datalen);
						x->data[0] = c;
						datalen = 1;
					}
				}
				if (c == '<') {
					/* next tag starts: report the remaining data */
					x->data[datalen] = '\0';
					if (datalen)
						xmldata(x, x->data, datalen);
					break;
				}
			}
		}
	}
}
493
494 static void
495 clearaddress(struct address *a)
496 {
497 a->bagnr[0] = '\0';
498 a->oppervlakte[0] = '\0';
499 a->status[0] = '\0';
500 a->gebruiksdoel[0] = '\0';
501 a->huisnummer[0] = '\0';
502 a->huisletter[0] = '\0';
503 a->huisnummertoevoeging[0] = '\0';
504 a->postcode[0] = '\0';
505 }
506
/* Return a pointer to the first character of s that is not whitespace
 * according to ISSPACE(); this is the terminating NUL if there is none. */
static char *
ltrim(const char *s)
{
	const char *p = s;

	while (ISSPACE((unsigned char)*p))
		p++;

	return (char *)p;
}
514
/*
 * Modified strlcat-style append: add src to the string in dst (buffer size
 * dsize), leaving out control characters (see ISCNTRL) and always keeping
 * the result NUL-terminated when there is room for the terminator.
 *
 * Returns the length of the existing string plus the number of source bytes
 * examined (skipped control characters included, terminating NUL excluded).
 */
static size_t
concat(char *dst, const char *src, size_t dsize)
{
	char *end;
	const char *in;
	size_t used, room, scan;

	/* locate the end of the current string: skip leading whitespace, then
	 * advance at most dsize bytes up to the terminating NUL */
	end = ltrim(dst);
	for (scan = dsize; scan != 0 && *end != '\0'; scan--)
		end++;
	used = end - dst;
	room = dsize - used;

	/* no space at all, not even for a terminator */
	if (room == 0)
		return used + strlen(src);
	room--; /* reserve the terminating NUL */

	for (in = src; *in != '\0'; in++) {
		if (room == 0 || ISCNTRL((unsigned char)*in))
			continue; /* buffer full or control character: drop */
		*end++ = *in;
		room--;
	}
	*end = '\0';

	return used + (in - src); /* count does not include NUL */
}
545
/* Write the string s to stdout without any separator or newline. */
static void
printfield(const char *s)
{
	fwrite(s, 1, strlen(s), stdout);
}
553
554 static void
555 printaddress(void)
556 {
557 if (!address.bagnr[0])
558 return;
559 /* historical: ignore */
560 if (eindgeldig)
561 return;
562
563 printfield(address.bagnr);
564 PUTCHAR('\t');
565 /* NUM */
566 printfield(address.postcode);
567 PUTCHAR('\t');
568 printfield(address.huisnummer);
569 PUTCHAR('\t');
570 printfield(address.huisletter);
571 PUTCHAR('\t');
572 printfield(address.huisnummertoevoeging);
573 PUTCHAR('\t');
574 /* VBO */
575 printfield(address.status);
576 PUTCHAR('\t');
577 printfield(address.oppervlakte);
578 PUTCHAR('\t');
579 printfield(address.gebruiksdoel);
580 PUTCHAR('\n');
581 }
582
583 static void
584 xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
585 const char *v, size_t vl)
586 {
587 if (a[0] != 'd' || t[0] != 'O')
588 return;
589 if (!strcmp(t, "Objecten:identificatie") || !strcmp(t, "Objecten-ref:NummeraanduidingRef"))
590 if (!strcmp(a, "domein") && !strcmp(v, "NL.IMBAG.Nummeraanduiding")) {
591 isbagnrtype = 1;
592 }
593 }
594
595 static void
596 xmldata(XMLParser *x, const char *d, size_t dl)
597 {
598 if (x->tag[0] != 'O')
599 return;
600
601 if (!strcmp(x->tag, "Objecten:postcode")) {
602 concat(address.postcode, d, sizeof(address.postcode));
603 } else if (!strcmp(x->tag, "Objecten:huisnummer")) {
604 concat(address.huisnummer, d, sizeof(address.huisnummer));
605 } else if (!strcmp(x->tag, "Objecten:huisletter")) {
606 concat(address.huisletter, d, sizeof(address.huisletter));
607 } else if (!strcmp(x->tag, "Objecten:huisnummertoevoeging")) {
608 concat(address.huisnummertoevoeging, d, sizeof(address.huisnummertoevoeging));
609 } else if (isbagnrtype && !strcmp(x->tag, "Objecten:identificatie")) {
610 concat(address.bagnr, d, sizeof(address.bagnr));
611 } else if (inhoofdadres && isbagnrtype && !strcmp(x->tag, "Objecten-ref:NummeraanduidingRef")) {
612 concat(address.bagnr, d, sizeof(address.bagnr));
613 } else if (!strcmp(x->tag, "Objecten:oppervlakte")) {
614 concat(address.oppervlakte, d, sizeof(address.oppervlakte));
615 } else if (!strcmp(x->tag, "Objecten:status")) {
616 concat(address.status, d, sizeof(address.status));
617 } else if (!strcmp(x->tag, "Objecten:gebruiksdoel")) {
618 if (address.gebruiksdoel[0])
619 concat(address.gebruiksdoel, ", ", sizeof(address.gebruiksdoel));
620 concat(address.gebruiksdoel, d, sizeof(address.gebruiksdoel));
621 }
622 }
623
624 static void
625 xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
626 {
627 if (t[0] != 's' && t[0] != 'O')
628 return;
629 if (inbagobject && !strcmp(t, "sl-bag-extract:bagObject")) {
630 printaddress();
631
632 inbagobject = 0;
633 innummeraanduiding = 0;
634 inhoofdadres = 0;
635 eindgeldig = 0;
636 clearaddress(&address);
637 } else if (innummeraanduiding) {
638 if (!strcmp(t, "Objecten:Nummeraanduiding") || !strcmp(t, "Objecten-ref:NummeraanduidingRef")) {
639 innummeraanduiding = 0;
640 isbagnrtype = 0;
641 }
642 } else if (isbagnrtype && !strcmp(t, "Objecten:identificatie")) {
643 isbagnrtype = 0;
644 } else if (inhoofdadres && !strcmp(t, "Objecten:heeftAlsHoofdadres")) {
645 inhoofdadres = 0;
646 }
647 }
648
649 static void
650 xmltagstart(XMLParser *x, const char *t, size_t tl)
651 {
652 if (t[0] != 's' && t[0] != 'O' && t[0] != 'H')
653 return;
654 if (!inbagobject && !strcmp(t, "sl-bag-extract:bagObject")) {
655 inbagobject = 1;
656 eindgeldig = 0;
657 clearaddress(&address);
658 } else if (inbagobject) {
659 if (!innummeraanduiding && !strcmp(t, "Objecten:Nummeraanduiding"))
660 innummeraanduiding = 1;
661
662 if (!inhoofdadres && !strcmp(t, "Objecten:heeftAlsHoofdadres"))
663 inhoofdadres = 1;
664
665 if (isbagnrtype) {
666 if (!strcmp(x->tag, "Objecten:identificatie") || !strcmp(x->tag, "Objecten-ref:NummeraanduidingRef"))
667 isbagnrtype = 0;
668 }
669 /* historical document */
670 if (!strcmp(x->tag, "Historie:eindGeldigheid")) {
671 eindgeldig = 1;
672 }
673 }
674 }
675
676 int
677 main(int argc, char *argv[])
678 {
679 #ifdef USE_MMAP
680 if (argc < 2) {
681 fprintf(stderr, "usage: %s <file>\n", argv[0]);
682 return 1;
683 }
684
685 if ((fd = open(argv[1], O_RDONLY)) < 0)
686 err(1, "open");
687 if (fstat(fd, &st) < 0)
688 err(1, "fstat");
689
690 off = 0;
691 len = st.st_size;
692 /*posix_fadvise(fd, 0, len, POSIX_FADV_SEQUENTIAL);*/ /* Linux */
693 if ((reg = mmap(0, len, PROT_READ, MAP_SHARED|MAP_FILE, fd, off)) == MAP_FAILED)
694 err(1, "mmap");
695
696 xml_parse(&x);
697
698 /* progress meter */
699 /*fprintf(stderr, "\rProgress: %.2f%%\n", 100.0);*/
700
701 munmap(reg, len);
702 close(fd);
703 #else
704 /* required for Windows binary mode aka more retarded bullshit. */
705 #if WIN32
706 /* binary mode for stdin, stdout and stderr */
707 _setmode(0, 0x8000); /* 0x8000 is O_BINARY */
708 _setmode(1, 0x8000);
709 _setmode(2, 0x8000);
710 #endif
711
712 xml_parse(&x);
713 #endif
714
715 printaddress();
716
717 return 0;
718 }