process.sh (1305B)
#!/bin/sh

bin="./parse"
d="../data"
glue="./glue"

maxjobs=64

log() {
	echo "$1" >&2
}

# child process job: parse a single input file and write the result to its
# output file; many of these jobs run in parallel via xargs below.
if test "$CHILD_PROC" = "1"; then
	# arguments: count, name, infile, outfile
	log "[$1] $2 started"

	# mmap version
	"$bin" "$3" > "$4"

	# stdin version
	#"$bin" < "$3" > "$4"
	status="$?"

	log "[$1] $2 done"
	exit "$status"
fi

# generate a list of jobs for processing: one record per input file, made of
# four NUL-separated fields (count, name, infile, outfile) for xargs -0 below.
list() {
	i=1
	for f in "$d"/*.xml; do
		b="${f##*/}"
		out="tmp/$b"

		printf '%s\0%s\0%s\0%s\0' "$i" "$b" "$f" "$out"
		i=$((i+1))
	done
}

# old awk version of gluing records, very slow on some platforms.
#awk_glue() {
#	LC_ALL=C awk -f glue.awk
#}

merge() {
	log "Sorting data before merging records..."
	LC_ALL=C sort -k1,1 -k8,8 results.csv > results_sorted.csv

	log "Merging records..."
	"$glue" < results_sorted.csv > results2.csv

	log "Sorting resulting data by zipcode, address number, etc..."
	# sort results by zipcode, address number, etc.
	LC_ALL=C sort -k2,2 -k3,3n -k4,4 results2.csv > final.csv
}

rm -rf tmp
mkdir -p tmp

# parse in parallel: xargs re-invokes this script with CHILD_PROC=1 set,
# passing the 4 fields of each job record as arguments.
list | CHILD_PROC="1" xargs -r -0 -P "${maxjobs}" -L 4 "$(readlink -f "$0")"

# concatenate the per-file results into one file.
cat tmp/* > results.csv

# merge results together.
merge

# cleanup temp files.
rm -rf tmp
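
# Illustrative only: a sketch of what one child invocation produced by the
# xargs line above looks like, assuming a hypothetical input file
# "../data/example.xml" (real names depend on the contents of $d, and the
# script path comes from readlink -f "$0"):
#
#   CHILD_PROC=1 /abs/path/to/process.sh 1 example.xml ../data/example.xml tmp/example.xml
#
# which, in the CHILD_PROC branch at the top, runs:
#
#   ./parse ../data/example.xml > tmp/example.xml
#
# and logs "[1] example.xml started" / "[1] example.xml done" to stderr.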