process.sh - bag - Dutch BAG Kadaster Extract parser (subset)
(HTM) git clone git://git.codemadness.org/bag
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
process.sh (1305B)
---
#!/bin/sh

# Paths are relative to the directory the script is started from.
# parser executable, run once per input file by the child job.
bin=./parse
# directory that is scanned for the *.xml input files.
d=../data
# program that merges the sorted records (see merge()).
glue=./glue

# upper bound on concurrently running parse jobs (passed to xargs -P).
maxjobs=64
8
# log: write a diagnostic message to stderr.
# arguments: $1 - the message text, printed verbatim followed by a newline.
log() {
	# printf instead of echo: echo may treat a message such as "-n" as an
	# option or interpret backslashes, depending on the shell.
	printf '%s\n' "$1" >&2
}
12
# Worker mode: when the script is re-executed with CHILD_PROC=1 in the
# environment it parses exactly one input file into one output file and
# exits with the parser's status, never reaching the driver code below.
if [ "$CHILD_PROC" = "1" ]; then
	# arguments: count, name, infile, outfile
	job_no="$1"
	job_name="$2"
	infile="$3"
	outfile="$4"

	log "[$job_no] $job_name started"

	# mmap version: the input path is passed as an argument.
	"$bin" "$infile" > "$outfile"
	status="$?"

	# stdin version (disabled):
	#"$bin" < "$infile" > "$outfile"

	log "[$job_no] $job_name done"
	exit "$status"
fi
28
# list: generate the list of jobs for processing.
# Reads the global $d (input directory) and writes, for every "$d"/*.xml
# file, four NUL-terminated fields to stdout:
#   count (starting at 1), file base name, input path, output path (tmp/<name>)
# Writes nothing when no input file exists.
list() {
	i=1
	for f in "$d"/*.xml; do
		# an unmatched glob is left as the literal pattern; skip it
		# instead of emitting a job for a non-existent file.
		[ -e "$f" ] || continue

		b="${f##*/}"
		out="tmp/$b"

		printf '%s\0%s\0%s\0%s\0' "$i" "$b" "$f" "$out"
		i=$((i+1))
	done
}
40
41 # old awk version of glueing records, very slow on some platforms.
42 #awk_glue() {
43 # LC_ALL=C awk -f glue.awk
44 #}
45
# merge: turn results.csv into final.csv in three steps.
#   1. sort results.csv on fields 1 and 8  -> results_sorted.csv
#   2. run "$glue" over the sorted records -> results2.csv
#   3. sort on field 2, field 3 (numeric) and field 4 -> final.csv
# Reads the global $glue. All files live in the current directory.
# Returns: 0 on success, otherwise the status of the first failing step;
# later steps are then skipped so final.csv is not built from partial data.
merge() {
	log "Sorting data before merging records..."
	LC_ALL=C sort -k1,1 -k8,8 results.csv > results_sorted.csv || return

	log "Merging records..."
	"$glue" < results_sorted.csv > results2.csv || return

	log "Sorting resulting data by zipcode, address number, etc..."
	# sort results by zipcode, address number, etc.
	LC_ALL=C sort -k2,2 -k3,3n -k4,4 results2.csv > final.csv
}
57
# Driver (parent) mode: parse all input files in parallel, concatenate the
# per-file output, merge it into final.csv and remove the scratch directory.
# Exits non-zero as soon as one stage fails.

# start from an empty scratch directory for the per-file parser output.
rm -rf tmp
mkdir -p tmp || exit 1

# parse in parallel: xargs re-runs this script in child mode (CHILD_PROC=1)
# with the 4 NUL-separated fields of one job from list() as arguments.
list | CHILD_PROC="1" xargs -r -0 -P "${maxjobs}" -L 4 "$(readlink -f "$0")"
status="$?"
if test "$status" -ne 0; then
	# a failed job leaves incomplete output in tmp/; do not merge it as if
	# it were complete. tmp/ is kept so the partial output can be inspected.
	log "Parsing failed (xargs exit status: $status), aborting"
	exit "$status"
fi

# concat results to one file.
if ! cat tmp/* > results.csv; then
	log "Concatenating parsed output failed, aborting"
	exit 1
fi

# merge results together.
merge || exit 1

# cleanup temp files.
rm -rf tmp