From 2c07fc793f2ec581c16cca6080f928c508c4149c Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Fri, 9 Jan 2015 08:50:51 +0000 Subject: [PATCH 1/4] add python lib imposm.parser --- README.md | 13 ++++++++++++- imposm-parser.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++ run.sh | 31 +++++++++++++++++-------------- 3 files changed, 75 insertions(+), 15 deletions(-) create mode 100644 imposm-parser.py diff --git a/README.md b/README.md index c9e7511..278fc5d 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ The tests involve decompressing a PBF extract of London stored on SSD and serial - `node-osmium` https://github.com/osmcode/node-osmium - `node-osmium-stream` https://github.com/geopipes/osmium-stream - `go-osmpbf` https://github.com/qedus/osmpbf +- `py-imposm-parser` https://github.com/omniscale/imposm-parser ## results @@ -40,21 +41,31 @@ Make sure you have the most current versions of the following installed: - nodejs - golang - mercurial (for the golang dep) +- python for impartial PBF stats I use: - osmconvert (sudo apt-get install osmctools) ### dependencies +node/golang + ```bash go get github.com/qedus/osmpbf; npm install; ``` +python + +```bash +sudo apt-get install build-essential python-dev python-pip protobuf-compiler libprotobuf-dev; +[sudo] pip install imposm.parser; +``` + ### run test ```bash -bash run.sh +bash run.sh; ``` ### drive performance diff --git a/imposm-parser.py b/imposm-parser.py new file mode 100644 index 0000000..f0e7c7d --- /dev/null +++ b/imposm-parser.py @@ -0,0 +1,46 @@ + +import sys +import json +from imposm.parser import OSMParser +from collections import OrderedDict # requires python 2.7+? + +# http://newbebweb.blogspot.co.uk/2012/02/python-head-ioerror-errno-32-broken.html +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE,SIG_DFL) + +# simple class that handles the parsed OSM data. +class JsonOutput(object): + + def nodes(self, nodes): + for osmid, tags, coords in nodes: + output = OrderedDict([ + ('type','node'), + ('id', osmid), + ('lat', coords[1]), + ('lon', coords[0]), + ('tags', tags) + ]) + sys.stdout.write( json.dumps(output, separators=(',',':')) + '\n' ) + + def ways(self, ways): + for osmid, tags, refs in ways: + output = OrderedDict([ + ('type','way'), + ('id', osmid), + ('refs', refs), + ('tags', tags) + ]) + sys.stdout.write( json.dumps(output, separators=(',',':')) + '\n' ) + + def relations(self, relations): + return; # do nothing (yet) + +# instantiate counter and parser and start parsing +jsonify = JsonOutput() +p = OSMParser( + # concurrency=4, # defaults to the number of CPU and cores of the host system + nodes_callback=jsonify.nodes, + ways_callback=jsonify.ways, + relations_callback=jsonify.relations +) +p.parse(sys.argv[1]) \ No newline at end of file diff --git a/run.sh b/run.sh index 14e4d3b..e9c24e3 100755 --- a/run.sh +++ b/run.sh @@ -15,25 +15,28 @@ stats(){ echo "shasum: (`shasum tmpfile`)"; } -echo '--- osm-pbf-parser ---'; -time node osm-pbf-parser $PBF_FILE >tmpfile; -stats; rm tmpfile; echo; +# echo '--- osm-pbf-parser ---'; +# time node osm-pbf-parser $PBF_FILE >tmpfile; +# stats; rm tmpfile; echo; -echo '--- osm-read ---'; -time node osm-read $PBF_FILE >tmpfile; -cp tmpfile tmp1; -stats; rm tmpfile; echo; +# echo '--- osm-read ---'; +# time node osm-read $PBF_FILE >tmpfile; +# cp tmpfile tmp1; +# stats; rm tmpfile; echo; -echo '--- node-osmium ---'; -time node node-osmium $PBF_FILE >tmpfile; -cp tmpfile tmp2; -stats; rm tmpfile; echo; +# echo '--- node-osmium ---'; +# time node node-osmium $PBF_FILE >tmpfile; +# cp tmpfile tmp2; +# stats; rm tmpfile; echo; -echo '--- node-osmium-stream ---'; -time node node-osmium-stream $PBF_FILE >tmpfile; -stats; rm tmpfile; echo; +# echo '--- node-osmium-stream ---'; +# time node node-osmium-stream $PBF_FILE >tmpfile; +# stats; rm tmpfile; echo; echo '--- go-osmpbf ---'; time go run osmpbf.go $PBF_FILE >tmpfile; stats; rm tmpfile; echo; +echo '--- py-imposm-parser ---'; +time python imposm-parser.py $PBF_FILE >tmpfile; +stats; rm tmpfile; echo; From b2ce6cb8c17e91130897cf696921df525d031a25 Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Fri, 9 Jan 2015 09:30:19 +0000 Subject: [PATCH 2/4] add coords_callback --- imposm-parser.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/imposm-parser.py b/imposm-parser.py index f0e7c7d..da54530 100644 --- a/imposm-parser.py +++ b/imposm-parser.py @@ -11,13 +11,24 @@ # simple class that handles the parsed OSM data. class JsonOutput(object): + # coords are nodes without tags + def coords(self, coords): + for osmid, lon, lat in coords: + output = OrderedDict([ + ('type','node'), + ('id', osmid), + ('lat', lat), + ('lon', lon) + ]) + sys.stdout.write( json.dumps(output, separators=(',',':')) + '\n' ) + def nodes(self, nodes): - for osmid, tags, coords in nodes: + for osmid, tags, centroid in nodes: output = OrderedDict([ ('type','node'), ('id', osmid), - ('lat', coords[1]), - ('lon', coords[0]), + ('lat', centroid[1]), + ('lon', centroid[0]), ('tags', tags) ]) sys.stdout.write( json.dumps(output, separators=(',',':')) + '\n' ) @@ -39,6 +50,7 @@ def relations(self, relations): jsonify = JsonOutput() p = OSMParser( # concurrency=4, # defaults to the number of CPU and cores of the host system + coords_callback=jsonify.coords, nodes_callback=jsonify.nodes, ways_callback=jsonify.ways, relations_callback=jsonify.relations From 0f729ffa603b1075d8ce24e9eb26c5019df2e991 Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Fri, 9 Jan 2015 09:57:22 +0000 Subject: [PATCH 3/4] better grepping --- run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/run.sh b/run.sh index e9c24e3..30e5ad6 100755 --- a/run.sh +++ b/run.sh @@ -10,8 +10,8 @@ osmconvert --out-statistics $PBF_FILE; echo; stats(){ echo "total lines: `cat tmpfile | wc -l`"; - echo "total nodes: `cat tmpfile | grep node | wc -l`"; - echo "total ways: `cat tmpfile | grep refs | wc -l`"; + echo "total nodes: `cat tmpfile | grep '\"node\"' | wc -l`"; + echo "total ways: `cat tmpfile | grep '\"way\"' | wc -l`"; echo "shasum: (`shasum tmpfile`)"; } From bb7f5cd470621fb610bdb27a434723ab196a69c4 Mon Sep 17 00:00:00 2001 From: Peter Johnson Date: Fri, 9 Jan 2015 12:02:55 +0000 Subject: [PATCH 4/4] far more performant json serializer, unordered json properties --- README.md | 1 + imposm-parser.py | 65 +++++++++++++++++++++++++++++++----------------- 2 files changed, 43 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 278fc5d..72e8a23 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ python ```bash sudo apt-get install build-essential python-dev python-pip protobuf-compiler libprotobuf-dev; [sudo] pip install imposm.parser; +[sudo] pip install ujson; ``` ### run test diff --git a/imposm-parser.py b/imposm-parser.py index da54530..024685f 100644 --- a/imposm-parser.py +++ b/imposm-parser.py @@ -1,6 +1,6 @@ import sys -import json +import ujson from imposm.parser import OSMParser from collections import OrderedDict # requires python 2.7+? @@ -14,34 +14,53 @@ class JsonOutput(object): # coords are nodes without tags def coords(self, coords): for osmid, lon, lat in coords: - output = OrderedDict([ - ('type','node'), - ('id', osmid), - ('lat', lat), - ('lon', lon) - ]) - sys.stdout.write( json.dumps(output, separators=(',',':')) + '\n' ) + # output = OrderedDict([ + # ('type','node'), + # ('id', osmid), + # ('lat', lat), + # ('lon', lon) + # ]) + output = { + 'type': 'node', + 'id': osmid, + 'lat': lat, + 'lon': lon + } + sys.stdout.write( ujson.dumps(output) + '\n' ) def nodes(self, nodes): for osmid, tags, centroid in nodes: - output = OrderedDict([ - ('type','node'), - ('id', osmid), - ('lat', centroid[1]), - ('lon', centroid[0]), - ('tags', tags) - ]) - sys.stdout.write( json.dumps(output, separators=(',',':')) + '\n' ) + # output = OrderedDict([ + # ('type','node'), + # ('id', osmid), + # ('lat', centroid[1]), + # ('lon', centroid[0]), + # ('tags', tags) + # ]) + output = { + 'type': 'node', + 'id': osmid, + 'lat': centroid[1], + 'lon': centroid[0], + 'tags': tags + } + sys.stdout.write( ujson.dumps(output) + '\n' ) def ways(self, ways): for osmid, tags, refs in ways: - output = OrderedDict([ - ('type','way'), - ('id', osmid), - ('refs', refs), - ('tags', tags) - ]) - sys.stdout.write( json.dumps(output, separators=(',',':')) + '\n' ) + # output = OrderedDict([ + # ('type','way'), + # ('id', osmid), + # ('refs', refs), + # ('tags', tags) + # ]) + output = { + 'type': 'way', + 'id': osmid, + 'refs': refs, + 'tags': tags + } + sys.stdout.write( ujson.dumps(output) + '\n' ) def relations(self, relations): return; # do nothing (yet)