This repository has been archived by the owner on Sep 30, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert.py
127 lines (117 loc) · 2.54 KB
/
convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# Script to convert Kentucky Voter Registration database from fixed width to
# CSV with the proper fields
# All information in this script current as of 2017
# By: Jesse Hazel for The Courier-Journal
# To run: python convert.py {FILENAME}.txt
from sys import argv
import csv
import codecs
# Expect exactly one command-line argument: the fixed-width input file.
# Exit with a usage hint instead of a tuple-unpacking traceback when the
# argument is missing or extra arguments are given.
if len(argv) != 2:
    raise SystemExit("Usage: python convert.py {FILENAME}.txt")
script, filename = argv
# These are the field positions (start, end) as described in the documentation
# from the State Board of Elections
# Record layout: one (column name, first position, last position) entry per
# output column. Positions are 1-based and inclusive. HISTORY_RAW covers the
# whole 222-241 voting-history area; the H_YR* columns that follow slice the
# same area into its individual pieces.
_fieldLayout = (
    ("COUNTY_CODE", 1, 3),
    ("PRECINCT_CODE", 4, 7),
    ("CITY_CODE", 8, 8),
    ("LAST_NAME", 9, 33),
    ("FIRST_NAME", 34, 48),
    ("MIDDLE_NAME", 49, 58),
    ("GENDER", 59, 59),
    ("PARTY", 60, 60),
    ("OTHER_CODE", 61, 63),
    ("ADDRESS_RESIDENCE", 64, 103),
    ("CITY_RESIDENCE", 104, 123),
    ("STATE_RESIDENCE", 124, 125),
    ("ZIP_RESIDENCE", 126, 134),
    ("ADDRESS_MAILING", 135, 174),
    ("CITY_MAILING", 175, 194),
    ("STATE_MAILING", 195, 196),
    ("ZIP_MAILING", 197, 205),
    ("DOB", 206, 213),
    ("REGISTRATION_DATE", 214, 221),
    ("HISTORY_RAW", 222, 241),
    ("H_YR1_LABEL", 222, 223),
    ("H_YR1_PRIMARY", 224, 224),
    ("H_YR1_GENERAL", 225, 225),
    ("H_YR2_LABEL", 226, 227),
    ("H_YR2_PRIMARY", 228, 228),
    ("H_YR2_GENERAL", 229, 229),
    ("H_YR3_LABEL", 230, 231),
    ("H_YR3_PRIMARY", 232, 232),
    ("H_YR3_GENERAL", 233, 233),
    ("H_YR4_LABEL", 234, 235),
    ("H_YR4_PRIMARY", 236, 236),
    ("H_YR4_GENERAL", 237, 237),
    ("H_YR5_LABEL", 238, 239),
    ("H_YR5_PRIMARY", 240, 240),
    ("H_YR5_GENERAL", 241, 241),
)

# The rest of the script reads these two parallel tuples, so derive both from
# the single table above to keep names and positions aligned.
fieldWidths = tuple((first, last) for _name, first, last in _fieldLayout)
fieldNames = tuple(name for name, _first, _last in _fieldLayout)
#String formatter function to clean up some of the fields
def formatString(label, field):
    """Clean up a single field value based on the column it belongs to.

    label -- column name (one of fieldNames) the value was read for
    field -- the text sliced out of the input line for that column

    Returns the cleaned value:
      * ZIP_RESIDENCE / ZIP_MAILING: the first 5 characters when the value
        is all digits, otherwise "" (malformed ZIPs are discarded).
      * DOB / REGISTRATION_DATE: MMDDYYYY rearranged to YYYY-MM-DD.
      * any other column, or any empty value: returned unchanged.
    """
    # Handle empty values first. The previous conditions were written as
    # `a or b and field != ""`; because `and` binds tighter than `or`, the
    # emptiness check only applied to the second label, so an empty DOB came
    # back as "--" while an empty REGISTRATION_DATE came back as "".
    if field == "":
        return field
    # Use 5 digit zips, discard ones that are malformed (non numeric)
    if label in ("ZIP_RESIDENCE", "ZIP_MAILING"):
        return field[0:5] if field.isdigit() else ""
    # Format date to YYYY-MM-DD
    if label in ("DOB", "REGISTRATION_DATE"):
        month = field[0:2]
        day = field[2:4]
        year = field[4:8]
        return year + "-" + month + "-" + day
    return field
# Convert the fixed-width input file into output.csv, one CSV row per input
# line, preceded by a header row of the column names.
# NOTE: like the rest of this script, this targets Python 2 (the csv writer is
# fed UTF-8 encoded byte strings).
from sys import stdout

#Open output and write the header
# Python 2's csv module expects a binary-mode file; in text mode every row
# terminator gains an extra carriage return on Windows. The with-block also
# guarantees output.csv is flushed and closed, even if a line fails to parse.
i = 0
with open('output.csv', 'wb') as outputFile:
    writer = csv.writer(outputFile, dialect='excel')
    writer.writerow(fieldNames)
    #Loop through the file and parse each line writing to output.csv
    with codecs.open(filename, 'r', 'utf-8') as f:
        for i, line in enumerate(f, 1):
            lineOut = []
            for label, (start, end) in zip(fieldNames, fieldWidths):
                # Positions are 1-based and inclusive, hence start - 1.
                # Trim and upper-case while the text is still unicode so
                # non-ASCII letters are upper-cased too, then encode once
                # for the csv writer.
                newField = line[start - 1:end].strip().upper().encode('utf-8')
                lineOut.append(formatString(label, newField))
            writer.writerow(lineOut)
            # Progress counter rewritten in place on one console line; the
            # trailing carriage return (and no newline) matches the old
            # `print ...,` output.
            stdout.write("*Wrote line {}\r".format(i))
print("Done. Wrote {} lines to output.csv".format(i))