main.c - osm-zipcodes - Extract (Dutch) addresses from OpenStreetMap OSM XML 
 (HTM) git clone git://git.codemadness.org/osm-zipcodes
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       main.c (14375B)
       ---
            1 #include <sys/mman.h>
            2 #include <sys/stat.h>
            3 #include <sys/types.h>
            4 
            5 #include <sys/types.h>
            6 
            7 #include <err.h>
            8 #include <errno.h>
            9 #include <fcntl.h>
           10 #include <limits.h>
           11 #include <stdio.h>
           12 #include <stdlib.h>
           13 #include <string.h>
           14 #include <unistd.h>
           15 
           /* BSD bounded string append/copy helpers, declared here directly
            * instead of through a header.
            * NOTE(review): presumably implemented elsewhere in the project for
            * libcs that lack them - confirm. */
           16 size_t strlcat(char *dst, const char *src, size_t dsize);
           17 size_t strlcpy(char *dst, const char *src, size_t dsize);
           18 
           /*
            * ctype-like macros, but always compatible with ASCII / UTF-8.
            *
            * The argument is fully parenthesized before the cast to unsigned so
            * that an expression argument (e.g. a conditional expression) is
            * converted as a whole; the range checks rely on unsigned wrap-around.
            * Note that ISCNTRL() and ISSPACE() evaluate their argument more than
            * once, so do not pass expressions with side effects.
            */
           #define ISALPHA(c) (((((unsigned)(c)) | 32) - 'a') < 26)
           #define ISCNTRL(c) ((c) < ' ' || (c) == 0x7f)
           #define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
           /* write one byte to stdout without taking the stdio lock */
           #define PUTCHAR(c) putchar_unlocked(c)
           24 
           /* Parser state: the tag and attribute currently being read plus a
            * scratch data buffer. */
           25 typedef struct xmlparser {
           26         /* current tag name, NUL-terminated; names that do not fit are truncated */
           27         char tag[1024];
           28         size_t taglen;
           29         /* current tag is in short form ? <tag /> (also set for processing instructions) */
           30         int isshorttag;
           31         /* current attribute name; names that do not fit are truncated */
           32         char name[1024];
           33         /* data buffer used for tag data, cdata and attribute data */
           34         char data[BUFSIZ];
           35 } XMLParser;
           36 
           /* Address field selected by the key of the <tag> being parsed.
            * Numbering starts at 1, leaving 0 free to mean "no address field". */
           enum FieldType {
                   Postcode = 1,
                   Street,
                   Housenr,
                   City
           };
           43 
           /* Address data collected for the <node> element being parsed.
            * All members are NUL-terminated strings; an empty string means the
            * field has not been seen. */
           44 struct node_address {
           45         char id[16];  /* node ID */
           46         char lat[16]; /* node latitude */
           47         char lon[16]; /* node longitude, must be same buffer size as lat (xmlattr() bounds both with sizeof(lat)) */
           48         char postcode[16];
           49         char street[128];
           50         char housenr[16];
           51         char city[128];
           52 };
           53 
              /* Key and value of the <tag> element being parsed. xmlattr() appends
               * to both with strlcat(), so a value delivered in several pieces is
               * concatenated and silently truncated to the buffer size. */
           54 struct node_tag {
           55         char key[16];
           56         char value[256];
           57 };
           58 
           /* Callbacks invoked by the parser; t/tl is the tag name and its length,
            * a/al the attribute name and its length, v/vl the value and its length. */
           59 void xmltagstart(const char *t, size_t tl);
           60 void xmlattr(const char *t, size_t tl, const char *a, size_t al,
           61              const char *v, size_t vl);
           62 void xmlattrentity(const char *t, size_t tl, const char *a,
           63                    size_t al, const char *v, size_t vl);
           64 void xmltagend(const char *t, size_t tl, int isshort);
           65 
              /* the single parser instance */
           66 static XMLParser x;
           67 
              /* data of the node / tag being parsed and the flags telling whether
               * the parser is currently inside a <node> and inside a <tag> of it */
           68 static struct node_address na;
           69 static struct node_tag nt;
           70 static int isnode, istag;
              /* enum FieldType of the current tag, 0 when it is not an address field */
           71 static int fieldtype;
           72 
              /* input file: descriptor, stat data, memory-mapped contents (reg),
               * its length (len) and the current read offset (off) */
           73 static int fd;
           74 struct stat st;
           75 unsigned char *reg;
           76 size_t len, off;
           77 
              /* return the next input byte (0-255) and advance, or EOF at the end */
           78 #define GETNEXT() (off >= len ? EOF : reg[off++])
           79 
           /*
            * Parse the attributes of the current start tag, reading input with
            * GETNEXT() up to and including the closing '>' or until end of input.
            *
            * xmlattr() is called for attribute values and for attributes without
            * a value; xmlattrentity() is called for each complete "&...;" entity
            * inside a value. A value that does not fit x->data is delivered in
            * several xmlattr() calls. Attribute names that do not fit x->name are
            * truncated. x->isshorttag is set when a '/' is read outside a value.
            */
           80 static void
           81 xml_parseattrs(XMLParser *x)
           82 {
           83         size_t namelen = 0, valuelen;
           84         int c, endsep, endname = 0, valuestart = 0;
           85 
           86         while ((c = GETNEXT()) != EOF) {
           87                 if (ISSPACE(c)) {
                                      /* whitespace after a name ends that name */
           88                         if (namelen)
           89                                 endname = 1;
           90                         continue;
           91                 } else if (c == '?')
           92                         ; /* ignore */
           93                 else if (c == '=') {
           94                         x->name[namelen] = '\0';
           95                         valuestart = 1;
           96                         endname = 1;
           97                 } else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
           98                         /* attribute without value */
           99                         x->name[namelen] = '\0';
          100                         xmlattr(x->tag, x->taglen, x->name, namelen, "", 0);
          101                         endname = 0;
                                      /* c becomes the first character of the next name */
          102                         x->name[0] = c;
          103                         namelen = 1;
          104                 } else if (namelen && valuestart) {
          105                         /* attribute with value */
          106                         valuelen = 0;
          107                         if (c == '\'' || c == '"') {
          108                                 endsep = c;
          109                         } else {
                                              /* unquoted value: c is already its first character and
                                               * only a literal space terminates it */
          110                                 endsep = ' '; /* isspace() */
          111                                 goto startvalue;
          112                         }
          113 
          114                         while ((c = GETNEXT()) != EOF) {
          115 startvalue:
          116                                 if (c == '&') { /* entities */
          117                                         x->data[valuelen] = '\0';
          118                                         /* call data function with data before entity if there is data */
          119                                         if (valuelen)
          120                                                 xmlattr(x->tag, x->taglen, x->name, namelen, x->data, valuelen);
                                                      /* collect the entity, including the leading '&', in x->data */
          121                                         x->data[0] = c;
          122                                         valuelen = 1;
          123                                         while ((c = GETNEXT()) != EOF) {
          124                                                 if (c == endsep)
          125                                                         break;
          126                                                 if (valuelen < sizeof(x->data) - 1)
          127                                                         x->data[valuelen++] = c;
          128                                                 else {
          129                                                         /* entity too long for buffer, handle as normal data */
          130                                                         x->data[valuelen] = '\0';
          131                                                         xmlattr(x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          132                                                         x->data[0] = c;
          133                                                         valuelen = 1;
          134                                                         break;
          135                                                 }
          136                                                 if (c == ';') {
                                                                      /* complete entity: hand it to the entity callback */
          137                                                         x->data[valuelen] = '\0';
          138                                                         xmlattrentity(x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          139                                                         valuelen = 0;
          140                                                         break;
          141                                                 }
          142                                         }
          143                                 } else if (c != endsep) {
          144                                         if (valuelen < sizeof(x->data) - 1) {
          145                                                 x->data[valuelen++] = c;
          146                                         } else {
                                                              /* buffer full: flush it and start a new piece with c */
          147                                                 x->data[valuelen] = '\0';
          148                                                 xmlattr(x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          149                                                 x->data[0] = c;
          150                                                 valuelen = 1;
          151                                         }
          152                                 }
          153                                 if (c == endsep) {
                                                      /* end of value: deliver the remaining (possibly empty) piece */
          154                                         x->data[valuelen] = '\0';
          155                                         xmlattr(x->tag, x->taglen, x->name, namelen, x->data, valuelen);
          156                                         break;
          157                                 }
          158                         }
          159                         namelen = endname = valuestart = 0;
          160                 } else if (namelen < sizeof(x->name) - 1) {
                                      /* part of an attribute name; excess characters are dropped */
          161                         x->name[namelen++] = c;
          162                 }
          163                 if (c == '>') {
          164                         break;
          165                 } else if (c == '/') {
          166                         x->isshorttag = 1;
          167                         x->name[0] = '\0';
          168                         namelen = 0;
          169                 }
          170         }
          171 }
          172 
          173 static void
          174 xml_parsecomment(XMLParser *x)
          175 {
          176         size_t i = 0;
          177         int c;
          178 
          179         while ((c = GETNEXT()) != EOF) {
          180                 if (c == '-') {
          181                         if (++i > 2) {
          182                                 i = 2;
          183                         }
          184                         continue;
          185                 } else if (c == '>' && i == 2) {
          186                         return;
          187                 } else if (i) {
          188                         i = 0;
          189                 }
          190         }
          191 }
          192 
          193 static void
          194 xml_parsecdata(XMLParser *x)
          195 {
          196         size_t i = 0;
          197         int c;
          198 
          199         while ((c = GETNEXT()) != EOF) {
          200                 if (c == ']') {
          201                         if (++i > 2) {
          202                                 i = 2;
          203                         }
          204                         continue;
          205                 } else if (c == '>' && i == 2) {
          206                         return;
          207                 } else if (i) {
          208                         i = 0;
          209                 }
          210         }
          211 }
          212 
          /* Encode code point r as UTF-8 into s (no NUL terminator is written).
           * Returns the number of bytes stored (1 to 4), or 0 for r == 0 in which
           * case nothing is stored. The caller provides room for 4 bytes and is
           * responsible for range-checking r. */
          static int
          codepointtoutf8(long r, char *s)
          {
                  if (r == 0)
                          return 0; /* NUL byte */

                  if (r <= 0x7F) {
                          /* 0aaaaaaa */
                          s[0] = r;
                          return 1;
                  }
                  if (r <= 0x07FF) {
                          /* 110aaaaa 10bbbbbb */
                          s[0] = 0xC0 | ((r >> 6) & 0x1F);
                          s[1] = 0x80 | (r & 0x3F);
                          return 2;
                  }
                  if (r <= 0xFFFF) {
                          /* 1110aaaa 10bbbbbb 10cccccc */
                          s[0] = 0xE0 | ((r >> 12) & 0x0F);
                          s[1] = 0x80 | ((r >> 6) & 0x3F);
                          s[2] = 0x80 | (r & 0x3F);
                          return 3;
                  }
                  /* 11110aaa 10bbbbbb 10cccccc 10dddddd */
                  s[0] = 0xF0 | ((r >> 18) & 0x07);
                  s[1] = 0x80 | ((r >> 12) & 0x3F);
                  s[2] = 0x80 | ((r >> 6) & 0x3F);
                  s[3] = 0x80 | (r & 0x3F);
                  return 4;
          }
          242 
          /* Translate a named entity, given without the leading '&' but with
           * the trailing ';' (e.g. "amp;"), into the character it stands for.
           * Returns 1 and stores the character plus a NUL in buf on a match,
           * 0 when the entity is not known (buf is left untouched) and -1 when
           * buf is smaller than 2 bytes. */
          static int
          namedentitytostr(const char *e, char *buf, size_t bufsiz)
          {
                  static const struct {
                          const char *entity;
                          int c;
                  } table[] = {
                          { "amp;",  '&'  },
                          { "lt;",   '<'  },
                          { "gt;",   '>'  },
                          { "apos;", '\'' },
                          { "quot;", '"'  },
                          { "AMP;",  '&'  },
                          { "LT;",   '<'  },
                          { "GT;",   '>'  },
                          { "APOS;", '\'' },
                          { "QUOT;", '"'  }
                  };
                  const size_t count = sizeof(table) / sizeof(table[0]);
                  size_t n;

                  /* need room for one character and the terminator */
                  if (bufsiz < 2)
                          return -1;

                  for (n = 0; n < count; n++) {
                          if (strcmp(e, table[n].entity) != 0)
                                  continue;
                          buf[0] = table[n].c;
                          buf[1] = '\0';
                          return 1;
                  }
                  return 0;
          }
          276 
          /*
           * Convert a numeric entity, given without the leading "&#" but with
           * the trailing ';' (e.g. "233;" or "xE9;"), to NUL-terminated UTF-8.
           *
           * Returns the number of bytes stored in buf (excluding the NUL), 0
           * when the entity is not a valid code point reference (buf is left
           * untouched) or encodes U+0000, and -1 when buf is smaller than 5
           * bytes.
           *
           * The value is parsed as a signed long so that negative input, an
           * empty digit string, out-of-range values and UTF-16 surrogates
           * (U+D800..U+DFFF, not valid in UTF-8) are all rejected instead of
           * being encoded as garbage bytes.
           */
          static int
          numericentitytostr(const char *e, char *buf, size_t bufsiz)
          {
                  long l;
                  int len;
                  char *end;

                  /* buffer is too small: up to 4 UTF-8 bytes plus the terminator */
                  if (bufsiz < 5)
                          return -1;

                  errno = 0;
                  /* hex (16) or decimal (10) */
                  if (*e == 'x')
                          l = strtol(++e, &end, 16);
                  else
                          l = strtol(e, &end, 10);
                  /* conversion error, no digits, not terminated by ';' or not a
                   * valid Unicode scalar value */
                  if (errno || e == end || *end != ';' || l < 0 || l > 0x10FFFF ||
                      (l >= 0xD800 && l <= 0xDFFF))
                          return 0;
                  len = codepointtoutf8(l, buf);
                  buf[len] = '\0';

                  return len;
          }
          302 
          /* Convert a named or numeric entity string, which must start with
           * '&', into buf. Returns 0 when e does not start with '&'; otherwise
           * returns whatever numericentitytostr() ("&#...") or
           * namedentitytostr() (anything else) returns for the remainder. */
          int
          xml_entitytostr(const char *e, char *buf, size_t bufsiz)
          {
                  if (e[0] == '&') {
                          if (e[1] == '#') /* numeric entity */
                                  return numericentitytostr(e + 2, buf, bufsiz);
                          /* named entity */
                          return namedentitytostr(e + 1, buf, bufsiz);
                  }
                  return 0;
          }
          317 
          /*
           * Parse the whole input, read with GETNEXT(), and invoke xmltagstart()
           * and xmltagend() for tags; attributes are handled by xml_parseattrs().
           * Comments and CDATA sections are skipped. Text between tags is
           * consumed into x->data but no callback is invoked for it here.
           */
          318 void
          319 xml_parse(XMLParser *x)
          320 {
          321         size_t datalen, tagdatalen;
          322         int c, isend;
          323 
          324         while ((c = GETNEXT()) != EOF && c != '<')
          325                 ; /* skip until < */
          326 
          327         while (c != EOF) {
          328                 if (c == '<') { /* parse tag */
          329                         if ((c = GETNEXT()) == EOF)
          330                                 return;
          331 
          332                         if (c == '!') { /* cdata and comments */
          333                                 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
          334                                         /* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
          335                                         if (tagdatalen <= sizeof("[CDATA[") - 1)
          336                                                 x->data[tagdatalen++] = c;
          337                                         if (c == '>')
          338                                                 break;
          339                                         else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
          340                                                         (x->data[0] == '-')) {
                                                              /* "<!--" seen: skip the comment body */
          341                                                 xml_parsecomment(x);
          342                                                 break;
          343                                         } else if (c == '[') {
          344                                                 if (tagdatalen == sizeof("[CDATA[") - 1 &&
          345                                                     !strncmp(x->data, "[CDATA[", tagdatalen)) {
                                                                      /* "<![CDATA[" seen: skip the section body */
          346                                                         xml_parsecdata(x);
          347                                                         break;
          348                                                 }
          349                                         }
          350                                 }
          351                         } else {
          352                                 /* normal tag (open, short open, close), processing instruction. */
          353                                 x->tag[0] = c;
          354                                 x->taglen = 1;
          355                                 x->isshorttag = isend = 0;
          356 
          357                                 /* treat processing instruction as shorttag, don't strip "?" prefix. */
          358                                 if (c == '?') {
          359                                         x->isshorttag = 1;
          360                                 } else if (c == '/') {
                                                      /* end tag: the name starts after the '/' */
          361                                         if ((c = GETNEXT()) == EOF)
          362                                                 return;
          363                                         x->tag[0] = c;
          364                                         isend = 1;
          365                                 }
          366 
          367                                 while ((c = GETNEXT()) != EOF) {
          368                                         if (c == '/')
          369                                                 x->isshorttag = 1; /* short tag */
          370                                         else if (c == '>' || ISSPACE(c)) {
          371                                                 x->tag[x->taglen] = '\0';
          372                                                 if (isend) { /* end tag, starts with </ */
          373                                                         xmltagend(x->tag, x->taglen, x->isshorttag);
          374                                                         x->tag[0] = '\0';
          375                                                         x->taglen = 0;
          376                                                 } else {
          377                                                         /* start tag */
          378                                                         xmltagstart(x->tag, x->taglen);
                                                                      /* whitespace after the name: attributes may follow */
          379                                                         if (ISSPACE(c))
          380                                                                 xml_parseattrs(x);
          381                                                 }
          382                                                 /* call tagend for shortform or processing instruction */
          383                                                 if (x->isshorttag) {
          384                                                         xmltagend(x->tag, x->taglen, x->isshorttag);
          385                                                         x->tag[0] = '\0';
          386                                                         x->taglen = 0;
          387                                                 }
          388                                                 break;
          389                                         } else if (x->taglen < sizeof(x->tag) - 1)
          390                                                 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
          391                                 }
          392                         }
          393                 } else {
          394                         /* parse tag data */
                                      /* NOTE: the data is only buffered in x->data and then
                                       * discarded; this loop effectively advances to the next '<' */
          395                         datalen = 0;
          396                         while ((c = GETNEXT()) != EOF) {
          397                                 if (c == '&') {
          398                                         if (datalen) {
          399                                                 x->data[datalen] = '\0';
          400                                         }
          401                                         x->data[0] = c;
          402                                         datalen = 1;
          403                                         while ((c = GETNEXT()) != EOF) {
          404                                                 if (c == '<')
          405                                                         break;
          406                                                 if (datalen < sizeof(x->data) - 1)
          407                                                         x->data[datalen++] = c;
          408                                                 else {
          409                                                         /* entity too long for buffer, handle as normal data */
          410                                                         x->data[datalen] = '\0';
          411                                                         x->data[0] = c;
          412                                                         datalen = 1;
          413                                                         break;
          414                                                 }
          415                                                 if (c == ';') {
          416                                                         x->data[datalen] = '\0';
          417                                                         datalen = 0;
          418                                                         break;
          419                                                 }
          420                                         }
          421                                 } else if (c != '<') {
          422                                         if (datalen < sizeof(x->data) - 1) {
          423                                                 x->data[datalen++] = c;
          424                                         } else {
          425                                                 x->data[datalen] = '\0';
          426                                                 x->data[0] = c;
          427                                                 datalen = 1;
          428                                         }
          429                                 }
          430                                 if (c == '<') {
          431                                         x->data[datalen] = '\0';
          432                                         break;
          433                                 }
          434                         }
          435                 }
          436         }
          437 }
          438 
          439 
          /* Write the string s with PUTCHAR(), leaving out control characters
           * (such as TABs) so that they cannot break the TAB-separated output. */
          static inline void
          printfield(const char *s)
          {
                  const char *p = s;

                  while (*p != '\0') {
                          if (!ISCNTRL((unsigned char)*p))
                                  PUTCHAR(*p);
                          p++;
                  }
          }
          448 
          /* Write the first zipcode of s (everything before the first ';') with
           * PUTCHAR(), leaving out whitespace and control characters
           * (Dutch format: "1234AB"). */
          static inline void
          printzipcode(const char *s)
          {
                  unsigned char ch;

                  while ((ch = (unsigned char)*s) != '\0' && ch != ';') {
                          if (!(ISSPACE(ch) || ISCNTRL(ch)))
                                  PUTCHAR(*s);
                          s++;
                  }
          }
          457 
          458 static inline void
          459 printaddress(void)
          460 {
          461         char *p, *s;
          462 
          463         if (!na.id[0] || !na.lat[0] || !na.lon[0] || !na.postcode[0] ||
          464             !na.street[0] || !na.housenr[0] || !na.city[0])
          465                 return;
          466 
          467         /* print each housenr as a separate line */
          468         for (s = na.housenr; s; ) {
          469                 printfield(na.id);
          470                 PUTCHAR('\t');
          471                 printfield(na.lat);
          472                 PUTCHAR('\t');
          473                 printfield(na.lon);
          474                 PUTCHAR('\t');
          475                 printzipcode(na.postcode);
          476                 PUTCHAR('\t');
          477                 printfield(na.street);
          478                 PUTCHAR('\t');
          479 
          480                 /* housenr */
          481                 if ((p = strchr(s, ';'))) {
          482                         *p = '\0';
          483                         printfield(s);
          484                         *p = ';';
          485                         s = p + 1;
          486                 } else {
          487                         printfield(s);
          488                         s = NULL;
          489                 }
          490 
          491                 PUTCHAR('\t');
          492                 printfield(na.city);
          493                 PUTCHAR('\n');
          494         }
          495 }
          496 
          497 void
          498 xmltagstart(const char *t, size_t tl)
          499 {
          500         if (tl == 4 && t[0] == 'n' && t[1] == 'o' && t[2] == 'd' && t[3] == 'e') {
          501                 isnode = 1;
          502                 return;
          503         }
          504         if (!isnode)
          505                 return;
          506 
          507         if (tl == 3 && t[0] == 't' && t[1] == 'a' && t[2] == 'g') {
          508                 istag = 1;
          509                 return;
          510         }
          511 }
          512 
          513 void
          514 xmltagend(const char *t, size_t tl, int isshort)
          515 {
          516         static size_t nodecount;
          517 
          518         if (isnode && tl == 4 && t[0] == 'n' && t[1] == 'o' && t[2] == 'd' && t[3] == 'e') {
          519                 printaddress();
          520 
          521                 /* progress meter */
          522                 if ((nodecount++ % 100000) == 0)
          523                         fprintf(stderr, "\rProgress: %.2f%%", ((float)off / (float)len) * 100.0);
          524 
          525                 isnode = 0;
          526                 fieldtype = 0; /* reset fieldtype */
          527                 na.id[0] = '\0';
          528                 na.lat[0] = '\0';
          529                 na.lon[0] = '\0';
          530                 na.postcode[0] = '\0';
          531                 na.street[0] = '\0';
          532                 na.housenr[0] = '\0';
          533                 na.city[0] = '\0';
          534                 return;
          535         } else if (istag && tl == 3 && t[0] == 't' && t[1] == 'a' && t[2] == 'g') {
          536                 /* NOTE: assumes key attribute is parsed first */
          537                 switch (fieldtype) {
          538                 case Postcode:
          539                         strlcpy(na.postcode, nt.value, sizeof(na.postcode));
          540                         break;
          541                 case Street:
          542                         strlcpy(na.street, nt.value, sizeof(na.street));
          543                         break;
          544                 case Housenr:
          545                         strlcpy(na.housenr, nt.value, sizeof(na.housenr));
          546                         break;
          547                 case City:
          548                         strlcpy(na.city, nt.value, sizeof(na.city));
          549                         break;
          550                 }
          551 
          552                 istag = 0;
          553                 fieldtype = 0;
          554                 nt.key[0] = '\0';
          555                 nt.value[0] = '\0';
          556                 return;
          557         }
          558 }
          559 
          560 void
          561 xmlattr(const char *t, size_t tl,
          562         const char *a, size_t al, const char *v, size_t vl)
          563 {
          564         if (isnode && !istag) {
          565                 if (al == 2 && a[0] == 'i' && a[1] == 'd' && vl + 1 < sizeof(na.id)) {
          566                         /* id */
          567                         strlcpy(na.id, v, sizeof(na.id));
          568                 } else if (al == 3 && a[0] == 'l' && vl + 1 < sizeof(na.lat)) {
          569                         /* lat */
          570                         if (a[1] == 'a' && a[2] == 't') {
          571                                 strlcpy(na.lat, v, sizeof(na.lat));
          572                         } else if (a[1] == 'o' && a[2] == 'n') {
          573                                 /* lon */
          574                                 strlcpy(na.lon, v, sizeof(na.lon));
          575                         }
          576                 }
          577                 return;
          578         }
          579         if (al != 1)
          580                 return;
          581 
          582         if (a[0] == 'k' && v[0] == 'a' && v[1] == 'd' && v[2] == 'd' && v[3] == 'r') {
          583                 if (!strcmp(v + 4, ":postcode")) {
          584                         fieldtype = Postcode;
          585                         strlcat(nt.key, v, sizeof(nt.key));
          586                 } else if (!strcmp(v + 4, ":street")) {
          587                         fieldtype = Street;
          588                         strlcat(nt.key, v, sizeof(nt.key));
          589                 } else if (!strcmp(v + 4, ":housenumber")) {
          590                         fieldtype = Housenr;
          591                         strlcat(nt.key, v, sizeof(nt.key));
          592                 } else if (!strcmp(v + 4, ":city")) {
          593                         fieldtype = City;
          594                         strlcat(nt.key, v, sizeof(nt.key));
          595                 }
          596                 return;
          597         } else if (a[0] == 'v') {
          598                 strlcat(nt.value, v, sizeof(nt.value));
          599         }
          600 }
          601 
          602 void
          603 xmlattrentity(const char *t, size_t tl,
          604               const char *a, size_t al, const char *v, size_t vl)
          605 {
          606         char buf[16];
          607         ssize_t len;
          608 
          609         if (!istag || al != 1 || a[0] != 'v')
          610                 return;
          611 
          612         if ((len = xml_entitytostr(v, buf, sizeof(buf))) < 0)
          613                 xmlattr(t, tl, a, al, v, vl);
          614         else
          615                 xmlattr(t, tl, a, al, buf, len);
          616 }
          617 
          618 int
          619 main(int argc, char *argv[])
          620 {
          621         if (argc < 2) {
          622                 fprintf(stderr, "usage: %s <file>\n", argv[0]);
          623                 return 1;
          624         }
          625 
          626         if ((fd = open(argv[1], O_RDONLY)) < 0)
          627                 err(1, "open");
          628         if (fstat(fd, &st) < 0)
          629                 err(1, "fstat");
          630 
          631         off = 0;
          632         len = st.st_size;
          633         if ((reg = mmap(0, len, PROT_READ, MAP_SHARED|MAP_FILE, fd, off)) == MAP_FAILED)
          634                 err(1, "mmap");
          635 
          636         xml_parse(&x);
          637 
          638         /* progress meter */
          639         fprintf(stderr, "\rProgress: %.2f%%\n", 100.0);
          640 
          641         munmap(reg, len);
          642         close(fd);
          643 
          644         return 0;
          645 }