From 9fb9c71002fa63cc643f4c27ca21253fdf3b1950 Mon Sep 17 00:00:00 2001 From: Lucy Park Date: Wed, 17 Apr 2013 23:02:54 +0900 Subject: [PATCH] Read unicode in CSVs Former-commit-id: d1364e98bc0f06820600ab06fab71e4171a46bb4 [formerly 9bed84d3a68ac50b2436623b6a366a1f6b4cf1a1] [formerly 1d35c9d1201e2de6e868361d502d60ec0d6d0c21] [formerly 3853bc169c80bb6c599e355f81f4481be1fbc6ce [formerly 3853bc169c80bb6c599e355f81f4481be1fbc6ce [formerly 3af396dee1584e309ad4af01d286ee5863db0958]]] Former-commit-id: 3853bc169c80bb6c599e355f81f4481be1fbc6ce Former-commit-id: 747da1f83cdaa45ccc255d04a29166ee9761bf4c Former-commit-id: d2f60b3c74df0ba6a6d6104425e13379f8efd8ff --- Makefile | 1 + gadm/shp/convert.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 gadm/shp/convert.py diff --git a/Makefile b/Makefile index a97e69a..35328f8 100644 --- a/Makefile +++ b/Makefile @@ -1,2 +1,3 @@ get_gadm: python gadm/gadm.py + python gadm/shp/convert.py diff --git a/gadm/shp/convert.py b/gadm/shp/convert.py new file mode 100644 index 0000000..16b76a6 --- /dev/null +++ b/gadm/shp/convert.py @@ -0,0 +1,42 @@ +#! 
#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
"""Rewrite CSV files so escaped code points become real UTF-8 text.

Run as a script, every ``*.csv`` file in the current working directory is
read, each cell is passed through :func:`sanitize`, and the result is
written next to the input as ``<name>-re.csv``.
"""

import csv
import glob
import os
import re
import sys

PY2 = sys.version_info[0] == 2
if not PY2:
    # Python 3 spells these two built-ins differently; alias them so the
    # rest of the module can use a single name on either interpreter.
    unichr = chr
    unicode = str

# NOTE(review): the pattern in the reviewed source was an empty string
# (apparently stripped as markup), which makes ``group(1)`` fail on every
# call. ``<U+XXXX>`` is reconstructed from the hex parsing of group 1 and
# from the way R escapes characters it cannot represent -- confirm against
# the real input files.
_ESCAPED_CODEPOINT = re.compile(r'<U\+([0-9A-Fa-f]{4,8})>')

# Suffix appended to the stem of every converted file.
_OUTPUT_SUFFIX = '-re.csv'


def _decode_escape(match):
    """Return the character named by one escape, or the escape unchanged."""
    try:
        return unichr(int(match.group(1), 16))
    except (ValueError, OverflowError):
        # Code point outside the range this interpreter can represent:
        # keep the escape visible instead of aborting the whole file.
        return match.group(0)


def _open_csv(path, mode):
    """Open *path* the way the ``csv`` module expects on this interpreter."""
    if PY2:
        # The Python 2 csv module works on bytes.
        return open(path, mode + 'b')
    # The Python 3 csv module works on text and does its own newline
    # handling, hence newline=''.
    return open(path, mode, encoding='utf-8', newline='')


def sanitize(item):
    """Return *item* as text with escapes decoded and spaces removed.

    Args:
        item: One CSV cell, either UTF-8 encoded bytes or already-decoded
            text.

    Returns:
        Text (``unicode`` on Python 2, ``str`` on Python 3).

    Raises:
        UnicodeDecodeError: If *item* is bytes that are not valid UTF-8.
    """
    # Decode first: substituting unicode characters into a byte string
    # forces an implicit ASCII decode on Python 2 and fails as soon as the
    # cell also holds raw non-ASCII bytes.
    if not isinstance(item, unicode):
        item = item.decode('utf-8')

    item = _ESCAPED_CODEPOINT.sub(_decode_escape, item)
    return item.replace(' ', '')


def read_data(infile):
    """Read the CSV file *infile* and return its sanitized rows.

    Args:
        infile: Path of a comma-delimited, double-quote-quoted CSV file
            encoded as UTF-8.

    Returns:
        A list of rows, each row a list of sanitized text cells.
    """
    with _open_csv(infile, 'r') as f:
        csvreader = csv.reader(f, delimiter=',', quotechar='"')
        # Build real lists: a generator per row could only be consumed once.
        return [[sanitize(item) for item in row] for row in csvreader]


def write_data(data, outfile):
    """Write *data* to *outfile* as UTF-8 encoded CSV.

    Args:
        data: Iterable of rows, each an iterable of text cells.
        outfile: Path of the file to create or overwrite.
    """
    with _open_csv(outfile, 'w') as f:
        # csv.writer quotes cells containing the delimiter, quotes or line
        # breaks, which a plain ','.join() would silently corrupt.
        csvwriter = csv.writer(f, delimiter=',', quotechar='"',
                               lineterminator='\n')
        for row in data:
            cells = list(row)
            if PY2:
                cells = [cell.encode('utf-8') if isinstance(cell, unicode)
                         else cell
                         for cell in cells]
            csvwriter.writerow(cells)


def main():
    """Convert every ``*.csv`` file in the current working directory."""
    for infile in glob.glob('*.csv'):
        # Skip files produced by an earlier run so they are not converted
        # again into ``<name>-re-re.csv``.
        if infile.endswith(_OUTPUT_SUFFIX):
            continue

        data = read_data(infile)
        # splitext keeps dots inside the stem ("a.b.csv" -> "a.b").
        outfile = os.path.splitext(infile)[0] + _OUTPUT_SUFFIX
        write_data(data, outfile)
        print('Data written to ' + outfile)

    print('Done.')


if __name__ == '__main__':
    main()