# run_spacescans.py - command-line entry point for creating, listing and cleaning SPACE-SCANS projects.
import argparse
import logging
import os
import json
import pandas as pd
from datetime import datetime
from spacescans.dataclean import address_cleaning as addr
#========= GLOBAL VARIABLES ===========
# Root logger shared by every function in this script; handlers are attached in main().
logger = logging.getLogger()

# Maps the level names accepted by this script to the numeric levels of the logging module.
valid_log_levels = {
    level_name: getattr(logging, level_name)
    for level_name in ('DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL')
}

# Name of the active project; empty until one is chosen.
project_name = ''
#=========== End Global Variables =========
#============ Helper Functions =================
def get_project_name(name='new_project'):
    """Return a project name that does not clash with an existing project file.

    Project files live in ``<current working directory>/projects/<name>.json``.
    If ``<name>.json`` is free, ``name`` is returned unchanged; otherwise the
    first free ``<name>_1``, ``<name>_2``, ... is returned.

    Args:
        name: Requested project name. An empty string or ``None`` falls back
            to ``'new_project'``.

    Returns:
        The first available project name (without the ``.json`` extension).
    """
    # Treat every "no name given" value the same way, not only the empty string.
    if not name:
        name = 'new_project'

    projects_dir = os.path.join(os.getcwd(), 'projects')
    candidate = name
    suffix = 0
    # Keep appending an increasing numeric suffix until no file uses the name.
    while os.path.exists(os.path.join(projects_dir, f'{candidate}.json')):
        suffix += 1
        candidate = f'{name}_{suffix}'
    return candidate
def is_valid_date(date):
    """Report whether ``date`` is a real calendar date written as ``yyyy-mm-dd``.

    Args:
        date: Value to check; normally a string typed by the user.

    Returns:
        True when ``date`` parses with the ``%Y-%m-%d`` format, False for any
        other string and for non-string values such as ``None``.
    """
    try:
        datetime.strptime(date, '%Y-%m-%d')
    except (ValueError, TypeError):
        # ValueError: wrong format or impossible date.
        # TypeError: not a string at all; a validator should answer False, not crash.
        return False
    return True
def is_valid_project(project_name):
    """Report whether the project file for ``project_name`` can be opened for reading.

    The file checked is ``projects/<project_name>.json``, relative to the
    current working directory.

    Args:
        project_name: Name of the project, without the ``.json`` extension.

    Returns:
        True when the file opens for reading, False otherwise.
    """
    try:
        # Opening (rather than only checking existence) also rejects unreadable files.
        with open(f'projects/{project_name}.json', 'r'):
            return True
    except (OSError, ValueError):
        # OSError: missing file, directory, or permission problem.
        # ValueError: the name cannot form a path (e.g. embedded NUL byte).
        return False
#============ End Helper Functions ================
def list_projects():
    """Print a numbered list of the projects stored in the ``projects`` directory.

    A project is any ``*.json`` file in ``<current working directory>/projects``;
    names are printed without the extension, in alphabetical order.

    Raises:
        SystemExit: With status 0 after printing a hint, when the directory is
            missing or contains no project files.
    """
    projects_dir = os.path.join(os.getcwd(), 'projects')
    try:
        entries = sorted(os.listdir(projects_dir))
    except OSError as e:
        # A missing or unreadable directory is handled the same as an empty one.
        logger.debug(f"Caught the following exception when attempting to list projects: {e}")
        entries = []

    # Only *.json files are projects; strip just the extension, not every
    # occurrence of ".json" inside the name.
    projects = [os.path.splitext(entry)[0] for entry in entries if entry.endswith('.json')]

    if not projects:
        print("No projects found! Create a new project with 'python3 run_spacescans.py create_project'.")
        # Safely exit when no project files are found.
        raise SystemExit(0)

    for position, proj in enumerate(projects, start=1):
        print(f'{position}. {proj}')
def _prompt_until_valid(value, first_prompt, retry_prompt, is_valid):
    """Return ``value`` once it passes ``is_valid``, asking on stdin as often as needed.

    Args:
        value: Initial value, or ``None`` when nothing was supplied yet.
        first_prompt: Prompt shown when ``value`` is ``None``.
        retry_prompt: Callable taking the rejected value and returning the
            prompt to show for the next attempt.
        is_valid: Callable returning True for an acceptable value.
    """
    if value is None:
        value = input(first_prompt)
    while not is_valid(value):
        value = input(retry_prompt(value))
    return value


def build_project(project_name, start_date, end_date, geoid, filepath):
    """Collect the settings of a new project and write them to ``projects/<name>.json``.

    Every argument may be ``None``, in which case the user is asked for it
    interactively. Values that were passed in are still validated, and the
    user is re-prompted until each one is acceptable.

    Args:
        project_name: Desired project name; made unique through
            ``get_project_name`` so an existing project file is not reused.
        start_date: Start of the study period, ``yyyy-mm-dd``.
        end_date: End of the study period, ``yyyy-mm-dd``.
        geoid: Geoidentifier used by the dataset, ``'zip9'`` or ``'zip5'``.
        filepath: Path to the patient dataset; must be an existing file.
    """
    valid_geoids = ('zip9', 'zip5')

    if project_name is None:
        project_name = input("Please enter a name for your project or hit enter to give a default name: ")
    # Pass the name to the function to handle duplicate project names
    project_name = get_project_name(project_name)

    # Get necessary values or validate passed in values
    start_date = _prompt_until_valid(
        start_date,
        "Please input a start date for the study period in the format yyyy-mm-dd: ",
        lambda _: "Date invalid! Please input a start date for the study period in the format yyyy-mm-dd: ",
        is_valid_date,
    )
    end_date = _prompt_until_valid(
        end_date,
        "Please input an end date for the study period in the format yyyy-mm-dd: ",
        lambda _: "Date invalid! Please input an end date for the study period in the format yyyy-mm-dd: ",
        is_valid_date,
    )
    # The prompts list both accepted options (previously only zip9 was shown).
    geoid = _prompt_until_valid(
        geoid,
        "Please give the geoidentifier for this dataset. (Options: zip9, zip5): ",
        lambda _: "Invalid input! Please give the geoidentifier for this dataset. (Options: zip9, zip5): ",
        lambda candidate: candidate in valid_geoids,
    )
    filepath = _prompt_until_valid(
        filepath,
        "Please enter the location of the patient dataset. Ex: /path/to/data/my_data.csv: ",
        lambda rejected: f"Invalid input! {rejected} does not exist. Please enter the location of the patient dataset. Ex: /path/to/data/my_data.csv: ",
        os.path.isfile,
    )

    logger.debug('Writing the project file out with the following parameters:')
    logger.debug(f'Start date: {start_date}')
    logger.debug(f'End date: {end_date}')
    logger.debug(f'Geoid: {geoid}')
    logger.debug(f'File path: {filepath}')
    print('\nWriting the project file out with the following parameters:')
    print(f'Start date: {start_date}')
    print(f'End date: {end_date}')
    print(f'Geoid: {geoid}')
    print(f'File path: {filepath}')

    data = {
        "project_name": project_name,
        "start_date": start_date,
        "end_date": end_date,
        "geoid": geoid,
        "filepath": filepath
    }

    # A first run has no projects folder yet; create it instead of failing on open().
    os.makedirs('projects', exist_ok=True)
    with open(f'projects/{project_name}.json', 'w') as f:
        json.dump(data, f, indent=4)

    print(f'Sucessfully created {project_name}.json in projects folder!')
    logger.info("Successfully wrote out the project file")
def run_address_cleaning(project_name, zip9_path='test_data/combined_zip9s.csv'):
    """Run the address-cleaning pipeline for a project and write the result as CSV.

    Reads ``projects/<project_name>.json``, loads the patient dataset named
    there together with the zip9 reference table, passes both through the
    ``address_cleaning`` steps in order, and writes the final frame to
    ``output/<project_name>_cleaned_patient_data.csv``.

    Args:
        project_name: Name of an existing project (without ``.json``).
        zip9_path: Location of the zip9 reference CSV. Defaults to the path
            that was previously hard-coded.

    Raises:
        OSError: If the project file or either CSV cannot be read.
        KeyError: If the project file lacks a required setting.
    """
    logger.info(f'Performing data_clean for {project_name}...')
    with open(f'projects/{project_name}.json', 'r') as f:
        project = json.load(f)

    start_date = addr.parse_date(project['start_date'])
    end_date = addr.parse_date(project['end_date'])
    file_path = project['filepath']
    # NOTE(review): the project's 'geoid' setting is not consulted here; the
    # pipeline always reads the ADDRESS_ZIP9 column - confirm that is intended.

    # Both key columns are read through str so they stay text.
    patient_file = pd.read_csv(file_path, converters={'ADDRESS_ZIP9': str})
    zip9_file = pd.read_csv(zip9_path, converters={'AREAKEY': str})

    # The steps below must stay in this order: each consumes the previous result.
    addr.validate_csvs(patient_file, zip9_file)
    ldszip9 = addr.filter_good_zip9s(patient_file, zip9_file)
    ids_with_missingness = addr.find_ids_with_missingness(ldszip9)
    ldsz9_no_nulls = addr.fix_nulls(ldszip9, ids_with_missingness, start_date, end_date)
    ldsz9_continuous = addr.fix_gaps_overlaps_dupes(ldsz9_no_nulls)
    ldsz9_in_daterange = addr.limit_timeframe(ldsz9_continuous, start_date, end_date)
    logger.info('Success!')

    outfile = project_name + '_cleaned_patient_data.csv'
    outpath = os.path.join('output', outfile)
    # to_csv does not create missing directories, so make sure the folder exists.
    os.makedirs('output', exist_ok=True)
    logger.info(f'Writing the output to {outpath}')
    ldsz9_in_daterange.to_csv(outpath, index=False)
def main():
    """Parse the command line, configure logging and dispatch the chosen command.

    Commands: ``clean_data``, ``link``, ``projects``, ``create_project`` and
    ``catalog``. ``link`` and ``catalog`` are parsed but have no
    implementation yet; running without a command prints the help text.

    Raises:
        SystemExit: With status 1 when ``clean_data`` names an unknown
            project, and as raised by argparse for invalid arguments.
    """
    #============= PARSER DECLARATIONS ==============
    # Define parser and command line arguments
    parser = argparse.ArgumentParser(prog='run_spacescans.py')
    parser.add_argument(
        '-l', '--log_level',
        default='INFO',
        choices=list(valid_log_levels),
        help='Minimum severity of the messages written to the log file and console'
    )
    subparser = parser.add_subparsers(dest='command')

    # Parsing for the patient dataset preprocess (Address_cleaning, etc)
    clean_parse = subparser.add_parser('clean_data', help='Process to perform data cleans on the patient dataset')
    clean_parse.add_argument(
        '-p', '--project_name',
        required=True,
        help='Name of the project to run'
    )

    # Parsing for linkage
    # TODO: add the remaining flags to the link parser.
    link_parse = subparser.add_parser('link', help='Link the chosen exposomes to the submitted dataset')
    link_parse.add_argument(
        '-g', '--geoid',
        required=True,
        help='Pass the geoidentifier used in the patient dataset (9-digit zipcode, 5-digit zipcode, etc)'
    )

    # Parsing for listing projects
    subparser.add_parser('projects', help='List all user created projects')

    # Parsing for creating a project
    create_proj_parser = subparser.add_parser('create_project', help='Create a new project to run linkage against')
    create_proj_parser.add_argument(
        '-n', '--project_name',
        default=None,
        required=False,
        help='Name of the project'
    )
    create_proj_parser.add_argument(
        '-s', '--start_date',
        default=None,
        required=False,
        help='Start date of the study period'
    )
    create_proj_parser.add_argument(
        '-e', '--end_date',
        default=None,
        required=False,
        help='End date of the study period'
    )
    create_proj_parser.add_argument(
        '-g', '--geoid',
        default=None,
        required=False,
        help='The geoidentifier used in the patient dataset (9-digit zipcode, 5-digit zipcode, etc)'
    )
    create_proj_parser.add_argument(
        '-f', '--filepath',
        default=None,
        required=False,
        help='The location of your patient dataset.'
    )

    subparser.add_parser('catalog', help='Display the full exposome catalog available in the databases')
    #=========== END PARSER DECLARATIONS ============

    # Parse before building the logger so the requested level can be applied
    # and so --help or a usage error does not leave an empty log file behind.
    args = parser.parse_args()

    #================ BUILD LOGGER ==================
    log_directory = 'log_files'
    os.makedirs(log_directory, exist_ok=True)
    logger.setLevel(valid_log_levels[args.log_level])

    # No ':' or spaces in the file name: colons are not allowed in Windows file names.
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    log_file_path = os.path.join(log_directory, f'{timestamp}.log')

    # Both handlers accept everything; the logger level above does the filtering.
    file_handler = logging.FileHandler(log_file_path, 'w')
    file_handler.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)

    # Create a formatter and add it to the handlers
    formatter = logging.Formatter('%(asctime)s %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
    file_handler.setFormatter(formatter)
    console_handler.setFormatter(formatter)

    # Add the handlers to the logger
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)
    #=============== END LOGGER BUILD ===============

    logger.debug(f'The following arguments were passed in: {args}')

    if args.command == 'clean_data':
        if not is_valid_project(args.project_name):
            logger.error(f'Could not find project named \'{args.project_name}\'')
            # Report the failure to the calling shell instead of exiting with 0.
            raise SystemExit(1)
        run_address_cleaning(args.project_name)
    elif args.command == 'projects':
        list_projects()
    elif args.command == 'create_project':
        build_project(args.project_name, args.start_date, args.end_date, args.geoid, args.filepath)
    elif args.command in ('link', 'catalog'):
        # Parsed but not wired up yet; say so rather than silently doing nothing.
        logger.warning(f"The '{args.command}' command is not implemented yet.")
    else:
        # No command given.
        parser.print_help()


if __name__ == '__main__':
    main()