Skip to content

Commit

Permalink
Read unicode in CSVs
Browse files Browse the repository at this point in the history
Former-commit-id: d1364e9 [formerly 9bed84d3a68ac50b2436623b6a366a1f6b4cf1a1] [formerly 1d35c9d1201e2de6e868361d502d60ec0d6d0c21] [formerly 3853bc169c80bb6c599e355f81f4481be1fbc6ce [formerly 3853bc169c80bb6c599e355f81f4481be1fbc6ce [formerly 3af396d]]]
Former-commit-id: 3853bc169c80bb6c599e355f81f4481be1fbc6ce
Former-commit-id: 747da1f83cdaa45ccc255d04a29166ee9761bf4c
Former-commit-id: d2f60b3c74df0ba6a6d6104425e13379f8efd8ff
  • Loading branch information
e9t committed Apr 17, 2013
1 parent ad0cf9c commit 9fb9c71
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 0 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
get_gadm:
python gadm/gadm.py
python gadm/shp/convert.py
42 changes: 42 additions & 0 deletions gadm/shp/convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#! /usr/bin/python2.7
# -*- coding: utf-8 -*-

import csv
import glob
import os
import re

def sanitize(item):
    """Return *item* as unicode text with ``<U+XXXX>`` markers expanded.

    Byte strings are decoded as UTF-8; text that is already unicode is used
    as is. Every ``<U+XXXX>`` marker (``XXXX`` being hexadecimal digits) is
    replaced by the character with that code point, and every space
    character is then removed.

    Markers whose payload is not hexadecimal, or whose value lies beyond
    U+10FFFF, are left in the text untouched instead of raising.

    Raises:
        UnicodeDecodeError: if a byte-string *item* is not valid UTF-8.
    """
    # Decode before substituting: splicing a unicode replacement into a
    # byte string makes Python 2 fall back to an implicit ASCII decode,
    # which fails as soon as the field holds any non-ASCII byte.
    # (`bytes` is an alias of `str` on Python 2.)
    if isinstance(item, bytes):
        item = item.decode('utf-8')

    def expand_marker(match):
        code_point = int(match.group(1), 16)
        if code_point > 0x10FFFF:
            # Not a valid Unicode code point; keep the marker verbatim.
            return match.group(0)
        # Going through a \UXXXXXXXX escape instead of unichr() also covers
        # code points above U+FFFF on narrow Python 2 builds.
        escape = '\\U%08x' % code_point
        return escape.encode('ascii').decode('unicode-escape')

    item = re.sub(r'<U\+([0-9A-Fa-f]+)>', expand_marker, item)

    # NOTE(review): this strips *all* spaces, including those between
    # words -- confirm that is what the downstream consumer expects.
    return item.replace(' ', '')

def read_data(infile):
    """Read the CSV file at *infile* and return its rows, sanitized.

    The file is parsed as comma-delimited CSV with ``"`` as the quote
    character, and every cell is passed through ``sanitize``.

    Returns:
        A list with one entry per CSV row; each entry is a list of the
        sanitized cells of that row.
    """
    # The Python 2 csv module expects its input file in binary mode.
    with open(infile, 'rb') as f:
        csvreader = csv.reader(f, delimiter=',', quotechar='"')
        # Build real lists rather than generator expressions, so rows can
        # be traversed more than once and sanitizing errors surface here
        # instead of later while the output is being written.
        return [[sanitize(item) for item in row] for row in csvreader]

def write_data(data, outfile):
    """Write *data* to *outfile* as UTF-8 encoded, comma-delimited CSV.

    Args:
        data: iterable of rows, each row an iterable of unicode strings.
        outfile: path of the file to create or overwrite.

    Cells containing the delimiter, the quote character or a line break are
    quoted by the csv module, so the output can be parsed back into the
    same cells. Rows are terminated with ``'\\n'``.
    """
    # The Python 2 csv module expects a binary-mode file and byte strings.
    with open(outfile, 'wb') as f:
        writer = csv.writer(f, delimiter=',', quotechar='"',
                            lineterminator='\n')
        for row in data:
            writer.writerow([item.encode('utf-8') for item in row])

if __name__ == '__main__':
    # Convert every CSV file in the current working directory, writing the
    # sanitized copy of NAME.csv next to it as NAME-re.csv.
    out_suffix = '-re.csv'

    for infile in glob.glob('*.csv'):
        # Outputs of an earlier run also match '*.csv'; skip them so they
        # are not converted again into NAME-re-re.csv.
        if infile.endswith(out_suffix):
            continue

        data = read_data(infile)
        # splitext strips only the final extension, so names that contain
        # additional dots keep their full stem.
        outfile = os.path.splitext(infile)[0] + out_suffix
        write_data(data, outfile)
        print('Data written to ' + outfile)

    print('Done.')

0 comments on commit 9fb9c71

Please sign in to comment.