main.c - osm-zipcodes - Extract (Dutch) addresses from OpenStreetMap OSM XML
(HTM) git clone git://git.codemadness.org/osm-zipcodes
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
main.c (14375B)
---
1 #include <sys/mman.h>
2 #include <sys/stat.h>
3 #include <sys/types.h>
4
5 #include <sys/types.h>
6
7 #include <err.h>
8 #include <errno.h>
9 #include <fcntl.h>
10 #include <limits.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <unistd.h>
15
/*
 * Prototypes for the BSD bounded string copy / concatenate helpers.
 * NOTE(review): presumably declared here because not every libc header
 * exposes them; the definitions are not part of this file.
 */
size_t strlcpy(char *dst, const char *src, size_t dsize);
size_t strlcat(char *dst, const char *src, size_t dsize);
18
/*
 * ctype-like macros, but always compatible with ASCII / UTF-8.
 *
 * The argument is fully parenthesized in every expansion, so any
 * expression (including an assignment) may be passed.  ISCNTRL and ISSPACE
 * may evaluate their argument twice: avoid arguments with side effects
 * there.  ISCNTRL compares the value as given, so pass bytes as
 * unsigned char to keep values >= 0x80 from comparing below ' '.
 */
#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
#define ISCNTRL(c) ((c) < ' ' || (c) == 0x7f)
#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
/* write one byte to stdout without taking the stdio lock */
#define PUTCHAR(c) putchar_unlocked(c)
24
/* Scratch state of the XML parser: the tag being parsed plus work buffers. */
typedef struct xmlparser {
	char tag[1024];    /* name of the current tag */
	size_t taglen;     /* number of bytes used in tag */
	int isshorttag;    /* is the current tag in short form? <tag /> */
	char name[1024];   /* name of the current attribute */
	char data[BUFSIZ]; /* buffer for tag data, cdata and attribute data */
} XMLParser;
36
/*
 * Address field selected by the key of a <tag> element.
 * Values start at 1 so that 0 can mean "no field".
 */
enum FieldType {
	Postcode = 1,
	Street,
	Housenr,
	City
};
43
/* One address record; every member is a NUL-terminated string. */
struct node_address {
	/* node ID */
	char id[16];
	/* node latitude */
	char lat[16];
	/* node longitude, must be same buffer size as lat */
	char lon[16];
	char postcode[16];
	char street[128];
	char housenr[16];
	char city[128];
};
53
/* Key / value pair of a <tag> element, both NUL-terminated strings. */
struct node_tag {
	char key[16];    /* tag key */
	char value[256]; /* tag value */
};
58
/*
 * Callbacks invoked by the XML parser.
 *   t, tl: tag name and its length
 *   a, al: attribute name and its length
 *   v, vl: attribute value (or entity text) and its length
 */
void xmltagstart(const char *t, size_t tl);
void xmltagend(const char *t, size_t tl, int isshort);
void xmlattr(const char *t, size_t tl,
             const char *a, size_t al, const char *v, size_t vl);
void xmlattrentity(const char *t, size_t tl,
                   const char *a, size_t al, const char *v, size_t vl);
65
66 static XMLParser x;
67
68 static struct node_address na;
69 static struct node_tag nt;
70 static int isnode, istag;
71 static int fieldtype;
72
73 static int fd;
74 struct stat st;
75 unsigned char *reg;
76 size_t len, off;
77
78 #define GETNEXT() (off >= len ? EOF : reg[off++])
79
80 static void
81 xml_parseattrs(XMLParser *x)
82 {
83 size_t namelen = 0, valuelen;
84 int c, endsep, endname = 0, valuestart = 0;
85
86 while ((c = GETNEXT()) != EOF) {
87 if (ISSPACE(c)) {
88 if (namelen)
89 endname = 1;
90 continue;
91 } else if (c == '?')
92 ; /* ignore */
93 else if (c == '=') {
94 x->name[namelen] = '\0';
95 valuestart = 1;
96 endname = 1;
97 } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
98 /* attribute without value */
99 x->name[namelen] = '\0';
100 xmlattr(x->tag, x->taglen, x->name, namelen, "", 0);
101 endname = 0;
102 x->name[0] = c;
103 namelen = 1;
104 } else if (namelen && valuestart) {
105 /* attribute with value */
106 valuelen = 0;
107 if (c == '\'' || c == '"') {
108 endsep = c;
109 } else {
110 endsep = ' '; /* isspace() */
111 goto startvalue;
112 }
113
114 while ((c = GETNEXT()) != EOF) {
115 startvalue:
116 if (c == '&') { /* entities */
117 x->data[valuelen] = '\0';
118 /* call data function with data before entity if there is data */
119 if (valuelen)
120 xmlattr(x->tag, x->taglen, x->name, namelen, x->data, valuelen);
121 x->data[0] = c;
122 valuelen = 1;
123 while ((c = GETNEXT()) != EOF) {
124 if (c == endsep)
125 break;
126 if (valuelen < sizeof(x->data) - 1)
127 x->data[valuelen++] = c;
128 else {
129 /* entity too long for buffer, handle as normal data */
130 x->data[valuelen] = '\0';
131 xmlattr(x->tag, x->taglen, x->name, namelen, x->data, valuelen);
132 x->data[0] = c;
133 valuelen = 1;
134 break;
135 }
136 if (c == ';') {
137 x->data[valuelen] = '\0';
138 xmlattrentity(x->tag, x->taglen, x->name, namelen, x->data, valuelen);
139 valuelen = 0;
140 break;
141 }
142 }
143 } else if (c != endsep) {
144 if (valuelen < sizeof(x->data) - 1) {
145 x->data[valuelen++] = c;
146 } else {
147 x->data[valuelen] = '\0';
148 xmlattr(x->tag, x->taglen, x->name, namelen, x->data, valuelen);
149 x->data[0] = c;
150 valuelen = 1;
151 }
152 }
153 if (c == endsep) {
154 x->data[valuelen] = '\0';
155 xmlattr(x->tag, x->taglen, x->name, namelen, x->data, valuelen);
156 break;
157 }
158 }
159 namelen = endname = valuestart = 0;
160 } else if (namelen < sizeof(x->name) - 1) {
161 x->name[namelen++] = c;
162 }
163 if (c == '>') {
164 break;
165 } else if (c == '/') {
166 x->isshorttag = 1;
167 x->name[0] = '\0';
168 namelen = 0;
169 }
170 }
171 }
172
173 static void
174 xml_parsecomment(XMLParser *x)
175 {
176 size_t i = 0;
177 int c;
178
179 while ((c = GETNEXT()) != EOF) {
180 if (c == '-') {
181 if (++i > 2) {
182 i = 2;
183 }
184 continue;
185 } else if (c == '>' && i == 2) {
186 return;
187 } else if (i) {
188 i = 0;
189 }
190 }
191 }
192
193 static void
194 xml_parsecdata(XMLParser *x)
195 {
196 size_t i = 0;
197 int c;
198
199 while ((c = GETNEXT()) != EOF) {
200 if (c == ']') {
201 if (++i > 2) {
202 i = 2;
203 }
204 continue;
205 } else if (c == '>' && i == 2) {
206 return;
207 } else if (i) {
208 i = 0;
209 }
210 }
211 }
212
/*
 * Encode codepoint r as UTF-8 into s (not NUL-terminated).
 * Returns the number of bytes written (1 to 4), or 0 for the NUL codepoint,
 * in which case nothing is written.  s must have room for 4 bytes.
 */
static int
codepointtoutf8(long r, char *s)
{
	if (r == 0)
		return 0; /* NUL byte */

	if (r <= 0x7F) {
		/* 0aaaaaaa */
		s[0] = r;
		return 1;
	}
	if (r <= 0x07FF) {
		/* 110aaaaa 10bbbbbb */
		s[0] = 0xC0 | ((r >> 6) & 0x1F);
		s[1] = 0x80 | (r & 0x3F);
		return 2;
	}
	if (r <= 0xFFFF) {
		/* 1110aaaa 10bbbbbb 10cccccc */
		s[0] = 0xE0 | ((r >> 12) & 0x0F);
		s[1] = 0x80 | ((r >> 6) & 0x3F);
		s[2] = 0x80 | (r & 0x3F);
		return 3;
	}
	/* 11110aaa 10bbbbbb 10cccccc 10dddddd */
	s[0] = 0xF0 | ((r >> 18) & 0x07);
	s[1] = 0x80 | ((r >> 12) & 0x3F);
	s[2] = 0x80 | ((r >> 6) & 0x3F);
	s[3] = 0x80 | (r & 0x3F);
	return 4;
}
242
/*
 * Translate a named entity, given without the leading '&' but including the
 * trailing ';' (for example "amp;"), to its one-character string in buf.
 * Only the five predefined XML entities are known, in all-lowercase or
 * all-uppercase spelling.
 * Returns 1 on success, 0 when the entity is unknown (buf is left
 * untouched) and -1 when buf cannot hold one character plus the NUL.
 */
static int
namedentitytostr(const char *e, char *buf, size_t bufsiz)
{
	static const struct {
		const char *entity;
		int c;
	} table[] = {
		{ "amp;",  '&'  },
		{ "lt;",   '<'  },
		{ "gt;",   '>'  },
		{ "apos;", '\'' },
		{ "quot;", '"'  },
		{ "AMP;",  '&'  },
		{ "LT;",   '<'  },
		{ "GT;",   '>'  },
		{ "APOS;", '\'' },
		{ "QUOT;", '"'  }
	};
	const size_t count = sizeof(table) / sizeof(table[0]);
	size_t n;

	/* buffer is too small */
	if (bufsiz < 2)
		return -1;

	for (n = 0; n < count; n++) {
		if (strcmp(e, table[n].entity) != 0)
			continue;
		buf[0] = table[n].c;
		buf[1] = '\0';
		return 1;
	}
	return 0;
}
276
/*
 * Translate a numeric character reference, given without the leading "&#"
 * but including the trailing ';' ("233;" or "xE9;"), to UTF-8 in buf.
 *
 * Returns the byte length of the NUL-terminated string stored in buf, 0 when
 * the reference is not well-formed or does not name an encodable codepoint
 * (buf then holds the empty string), or -1 when buf is smaller than 5 bytes.
 *
 * strtoul() on its own also accepts leading whitespace, a sign and (for base
 * 16) a "0x" prefix, and silently negates "-1" to ULONG_MAX.  None of that is
 * valid in a character reference, so the first character is required to be
 * a digit of the selected base.  Values above U+10FFFF and the surrogate
 * range U+D800..U+DFFF are rejected because they have no valid UTF-8 form.
 */
static int
numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
	unsigned long cp;
	unsigned char first;
	int n, base = 10;
	char *end;

	/* buffer is too small */
	if (bufsiz < 5)
		return -1;
	/* keep buf a valid string on every return path */
	buf[0] = '\0';

	/* hex (16) or decimal (10) */
	if (*e == 'x') {
		base = 16;
		e++;
		if (e[0] == '0' && (e[1] == 'x' || e[1] == 'X'))
			return 0;
	}
	first = (unsigned char)*e;
	if (!(first >= '0' && first <= '9') &&
	    !(base == 16 && (first | 32) >= 'a' && (first | 32) <= 'f'))
		return 0;

	errno = 0;
	cp = strtoul(e, &end, base);
	/* invalid value or not a well-formed entity or too high codepoint */
	if (errno || *end != ';' || cp > 0x10FFFF)
		return 0;
	if (cp >= 0xD800 && cp <= 0xDFFF)
		return 0;

	n = codepointtoutf8((long)cp, buf);
	buf[n] = '\0';

	return n;
}
302
/*
 * Convert a named or numeric entity string (starting with '&') to its
 * string form in buf.
 * Returns the byte length of the converted string, or 0 when e does not
 * start with '&'.  Otherwise the result of the named or numeric converter
 * is returned unchanged.
 */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
	if (e[0] != '&')
		return 0; /* not an entity */

	/* "&#..." is a numeric entity, anything else a named one */
	return e[1] == '#'
	    ? numericentitytostr(e + 2, buf, bufsiz)
	    : namedentitytostr(e + 1, buf, bufsiz);
}
317
318 void
319 xml_parse(XMLParser *x)
320 {
321 size_t datalen, tagdatalen;
322 int c, isend;
323
324 while ((c = GETNEXT()) != EOF && c != '<')
325 ; /* skip until < */
326
327 while (c != EOF) {
328 if (c == '<') { /* parse tag */
329 if ((c = GETNEXT()) == EOF)
330 return;
331
332 if (c == '!') { /* cdata and comments */
333 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
334 /* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */
335 if (tagdatalen <= sizeof("[CDATA[") - 1)
336 x->data[tagdatalen++] = c;
337 if (c == '>')
338 break;
339 else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
340 (x->data[0] == '-')) {
341 xml_parsecomment(x);
342 break;
343 } else if (c == '[') {
344 if (tagdatalen == sizeof("[CDATA[") - 1 &&
345 !strncmp(x->data, "[CDATA[", tagdatalen)) {
346 xml_parsecdata(x);
347 break;
348 }
349 }
350 }
351 } else {
352 /* normal tag (open, short open, close), processing instruction. */
353 x->tag[0] = c;
354 x->taglen = 1;
355 x->isshorttag = isend = 0;
356
357 /* treat processing instruction as shorttag, don't strip "?" prefix. */
358 if (c == '?') {
359 x->isshorttag = 1;
360 } else if (c == '/') {
361 if ((c = GETNEXT()) == EOF)
362 return;
363 x->tag[0] = c;
364 isend = 1;
365 }
366
367 while ((c = GETNEXT()) != EOF) {
368 if (c == '/')
369 x->isshorttag = 1; /* short tag */
370 else if (c == '>' || ISSPACE(c)) {
371 x->tag[x->taglen] = '\0';
372 if (isend) { /* end tag, starts with </ */
373 xmltagend(x->tag, x->taglen, x->isshorttag);
374 x->tag[0] = '\0';
375 x->taglen = 0;
376 } else {
377 /* start tag */
378 xmltagstart(x->tag, x->taglen);
379 if (ISSPACE(c))
380 xml_parseattrs(x);
381 }
382 /* call tagend for shortform or processing instruction */
383 if (x->isshorttag) {
384 xmltagend(x->tag, x->taglen, x->isshorttag);
385 x->tag[0] = '\0';
386 x->taglen = 0;
387 }
388 break;
389 } else if (x->taglen < sizeof(x->tag) - 1)
390 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
391 }
392 }
393 } else {
394 /* parse tag data */
395 datalen = 0;
396 while ((c = GETNEXT()) != EOF) {
397 if (c == '&') {
398 if (datalen) {
399 x->data[datalen] = '\0';
400 }
401 x->data[0] = c;
402 datalen = 1;
403 while ((c = GETNEXT()) != EOF) {
404 if (c == '<')
405 break;
406 if (datalen < sizeof(x->data) - 1)
407 x->data[datalen++] = c;
408 else {
409 /* entity too long for buffer, handle as normal data */
410 x->data[datalen] = '\0';
411 x->data[0] = c;
412 datalen = 1;
413 break;
414 }
415 if (c == ';') {
416 x->data[datalen] = '\0';
417 datalen = 0;
418 break;
419 }
420 }
421 } else if (c != '<') {
422 if (datalen < sizeof(x->data) - 1) {
423 x->data[datalen++] = c;
424 } else {
425 x->data[datalen] = '\0';
426 x->data[0] = c;
427 datalen = 1;
428 }
429 }
430 if (c == '<') {
431 x->data[datalen] = '\0';
432 break;
433 }
434 }
435 }
436 }
437 }
438
439
/* write string s to stdout, leaving out control chars (such as TABs) */
static inline void
printfield(const char *s)
{
	const char *p = s;

	while (*p != '\0') {
		if (!ISCNTRL((unsigned char)*p))
			PUTCHAR(*p);
		p++;
	}
}
448
/*
 * write the first zipcode of s (everything before the first ';') to stdout,
 * leaving out whitespace and control chars (dutch format: "1234AB")
 */
static inline void
printzipcode(const char *s)
{
	const char *p;
	unsigned char uc;

	for (p = s; *p != '\0' && *p != ';'; p++) {
		uc = (unsigned char)*p;
		if (ISSPACE(uc) || ISCNTRL(uc))
			continue;
		PUTCHAR(*p);
	}
}
457
458 static inline void
459 printaddress(void)
460 {
461 char *p, *s;
462
463 if (!na.id[0] || !na.lat[0] || !na.lon[0] || !na.postcode[0] ||
464 !na.street[0] || !na.housenr[0] || !na.city[0])
465 return;
466
467 /* print each housenr as a separate line */
468 for (s = na.housenr; s; ) {
469 printfield(na.id);
470 PUTCHAR('\t');
471 printfield(na.lat);
472 PUTCHAR('\t');
473 printfield(na.lon);
474 PUTCHAR('\t');
475 printzipcode(na.postcode);
476 PUTCHAR('\t');
477 printfield(na.street);
478 PUTCHAR('\t');
479
480 /* housenr */
481 if ((p = strchr(s, ';'))) {
482 *p = '\0';
483 printfield(s);
484 *p = ';';
485 s = p + 1;
486 } else {
487 printfield(s);
488 s = NULL;
489 }
490
491 PUTCHAR('\t');
492 printfield(na.city);
493 PUTCHAR('\n');
494 }
495 }
496
497 void
498 xmltagstart(const char *t, size_t tl)
499 {
500 if (tl == 4 && t[0] == 'n' && t[1] == 'o' && t[2] == 'd' && t[3] == 'e') {
501 isnode = 1;
502 return;
503 }
504 if (!isnode)
505 return;
506
507 if (tl == 3 && t[0] == 't' && t[1] == 'a' && t[2] == 'g') {
508 istag = 1;
509 return;
510 }
511 }
512
513 void
514 xmltagend(const char *t, size_t tl, int isshort)
515 {
516 static size_t nodecount;
517
518 if (isnode && tl == 4 && t[0] == 'n' && t[1] == 'o' && t[2] == 'd' && t[3] == 'e') {
519 printaddress();
520
521 /* progress meter */
522 if ((nodecount++ % 100000) == 0)
523 fprintf(stderr, "\rProgress: %.2f%%", ((float)off / (float)len) * 100.0);
524
525 isnode = 0;
526 fieldtype = 0; /* reset fieldtype */
527 na.id[0] = '\0';
528 na.lat[0] = '\0';
529 na.lon[0] = '\0';
530 na.postcode[0] = '\0';
531 na.street[0] = '\0';
532 na.housenr[0] = '\0';
533 na.city[0] = '\0';
534 return;
535 } else if (istag && tl == 3 && t[0] == 't' && t[1] == 'a' && t[2] == 'g') {
536 /* NOTE: assumes key attribute is parsed first */
537 switch (fieldtype) {
538 case Postcode:
539 strlcpy(na.postcode, nt.value, sizeof(na.postcode));
540 break;
541 case Street:
542 strlcpy(na.street, nt.value, sizeof(na.street));
543 break;
544 case Housenr:
545 strlcpy(na.housenr, nt.value, sizeof(na.housenr));
546 break;
547 case City:
548 strlcpy(na.city, nt.value, sizeof(na.city));
549 break;
550 }
551
552 istag = 0;
553 fieldtype = 0;
554 nt.key[0] = '\0';
555 nt.value[0] = '\0';
556 return;
557 }
558 }
559
560 void
561 xmlattr(const char *t, size_t tl,
562 const char *a, size_t al, const char *v, size_t vl)
563 {
564 if (isnode && !istag) {
565 if (al == 2 && a[0] == 'i' && a[1] == 'd' && vl + 1 < sizeof(na.id)) {
566 /* id */
567 strlcpy(na.id, v, sizeof(na.id));
568 } else if (al == 3 && a[0] == 'l' && vl + 1 < sizeof(na.lat)) {
569 /* lat */
570 if (a[1] == 'a' && a[2] == 't') {
571 strlcpy(na.lat, v, sizeof(na.lat));
572 } else if (a[1] == 'o' && a[2] == 'n') {
573 /* lon */
574 strlcpy(na.lon, v, sizeof(na.lon));
575 }
576 }
577 return;
578 }
579 if (al != 1)
580 return;
581
582 if (a[0] == 'k' && v[0] == 'a' && v[1] == 'd' && v[2] == 'd' && v[3] == 'r') {
583 if (!strcmp(v + 4, ":postcode")) {
584 fieldtype = Postcode;
585 strlcat(nt.key, v, sizeof(nt.key));
586 } else if (!strcmp(v + 4, ":street")) {
587 fieldtype = Street;
588 strlcat(nt.key, v, sizeof(nt.key));
589 } else if (!strcmp(v + 4, ":housenumber")) {
590 fieldtype = Housenr;
591 strlcat(nt.key, v, sizeof(nt.key));
592 } else if (!strcmp(v + 4, ":city")) {
593 fieldtype = City;
594 strlcat(nt.key, v, sizeof(nt.key));
595 }
596 return;
597 } else if (a[0] == 'v') {
598 strlcat(nt.value, v, sizeof(nt.value));
599 }
600 }
601
602 void
603 xmlattrentity(const char *t, size_t tl,
604 const char *a, size_t al, const char *v, size_t vl)
605 {
606 char buf[16];
607 ssize_t len;
608
609 if (!istag || al != 1 || a[0] != 'v')
610 return;
611
612 if ((len = xml_entitytostr(v, buf, sizeof(buf))) < 0)
613 xmlattr(t, tl, a, al, v, vl);
614 else
615 xmlattr(t, tl, a, al, buf, len);
616 }
617
618 int
619 main(int argc, char *argv[])
620 {
621 if (argc < 2) {
622 fprintf(stderr, "usage: %s <file>\n", argv[0]);
623 return 1;
624 }
625
626 if ((fd = open(argv[1], O_RDONLY)) < 0)
627 err(1, "open");
628 if (fstat(fd, &st) < 0)
629 err(1, "fstat");
630
631 off = 0;
632 len = st.st_size;
633 if ((reg = mmap(0, len, PROT_READ, MAP_SHARED|MAP_FILE, fd, off)) == MAP_FAILED)
634 err(1, "mmap");
635
636 xml_parse(&x);
637
638 /* progress meter */
639 fprintf(stderr, "\rProgress: %.2f%%\n", 100.0);
640
641 munmap(reg, len);
642 close(fd);
643
644 return 0;
645 }