bag

Dutch BAG Kadaster Extract parser (subset)
git clone git://git.codemadness.org/bag
Log | Files | Refs | README | LICENSE

process.sh (1305B)


      1 #!/bin/sh
      2 
      3 bin="./parse"
      4 d="../data"
      5 glue="./glue"
      6 
      7 maxjobs=64
      8 
      9 log() {
     10 	echo "$1" >&2
     11 }
     12 
     13 # child process job: parse each file and process them to a file in parallel.
     14 if test "$CHILD_PROC" = "1"; then
     15 	# arguments: count, name, infile, outfile
     16 	log "[$1] $2 started"
     17 
     18 	# mmap version
     19 	"$bin" "$3" > "$4"
     20 
     21 	# stdin version
     22 	#"$bin" < "$3" > "$4"
     23 	status="$?"
     24 
     25 	log "[$1] $2 done"
     26 	exit "$status"
     27 fi
     28 
     29 # generate a list of jobs for processing.
     30 list() {
     31 	i=1
     32 	for f in "$d"/*.xml; do
     33 		b="${f##*/}"
     34 		out="tmp/$b"
     35 
     36 		printf '%s\0%s\0%s\0%s\0' "$i" "$b" "$f" "$out"
     37 		i=$((i+1))
     38 	done
     39 }
     40 
     41 # old awk version of glueing records, very slow on some platforms.
     42 #awk_glue() {
     43 #	LC_ALL=C awk -f glue.awk
     44 #}
     45 
     46 merge() {
     47 	log "Sorting data before merging records..."
     48 	LC_ALL=C sort -k1,1 -k8,8 results.csv > results_sorted.csv
     49 
     50 	log "Merging records..."
     51 	"$glue" < results_sorted.csv > results2.csv
     52 
     53 	log "Sorting resulting data by zipcode, address number, etc..."
     54 	# sort results by zipcode, address number, etc.
     55 	LC_ALL=C sort -k2,2 -k3,3n -k4,4 results2.csv > final.csv
     56 }
     57 
     58 rm -rf tmp
     59 mkdir -p tmp
     60 
     61 # parse in parallel.
     62 list | CHILD_PROC="1" xargs -r -0 -P "${maxjobs}" -L 4 "$(readlink -f "$0")"
     63 
     64 # concat results to one file.
     65 cat tmp/* > results.csv
     66 
     67 # merge results together.
     68 merge
     69 
     70 # cleanup temp files.
     71 rm -rf tmp