file_generator.py
import glob
import json
import logging
import os
import pandas as pd
import numpy as np
import yaml
import lar_constraints
import lar_generator
from rules_engine import rules_engine
from test_file_generator import test_data
import utils
class FileGenerator(object):
    def __init__(self, filename=''):
        #Loads the filepath configuration.
        with open('configurations/test_filepaths.yaml') as f:
            #Uses safe_load instead of load.
            self.filepaths = yaml.safe_load(f)
        #Loads the geographic file configuration.
        with open('configurations/geographic_data.yaml') as f:
            #Uses safe_load instead of load.
            self.geographic = yaml.safe_load(f)
        #Loads the clean file configuration.
        with open(filename) as f:
            #Uses safe_load instead of load.
            self.clean_config = yaml.safe_load(f)
        #Loads the edit report configuration.
        with open('configurations/edit_report_config.yaml') as f:
            #Uses safe_load instead of load.
            self.edit_report_config = yaml.safe_load(f)
        #Stores the column names for the file containing geographic data.
        file_cols = self.geographic['file_columns']
        #Sets the logging parameters.
        #Uses a log file name and file writing mode from the
        #test_filepaths yaml.
        logging.basicConfig(filename=self.filepaths['log_filename'],
            format='%(asctime)s %(message)s',
            datefmt='%m/%d/%Y %I:%M:%S %p',
            filemode=self.filepaths['log_mode'],
            level=logging.INFO)
        #Loads geographic data from the filepath named in the
        #geographic_data yaml file.
        self.geographic_data = pd.read_csv(self.geographic['geographic_data_file'],
            delimiter='|', header=None, names=file_cols, dtype=str)
        #Creates county, tract, and small-county data from the file
        #containing geographic data.
        self.geographic_data['county_fips'] = self.geographic_data['state_code'] + self.geographic_data['county']
        self.geographic_data["tract_fips"] = self.geographic_data.county_fips + self.geographic_data.tracts
        self.counties = list(self.geographic_data.county_fips)
        self.tracts = list(self.geographic_data.tract_fips)
        self.small_counties = list(self.geographic_data.county_fips[self.geographic_data.small_county == "S"])
        #Loads schemas for LAR and TS.
        #Schemas contain valid enumerations, including NA values, for each
        #field in the dataset.
        with open(self.filepaths['lar_schema_json'], 'r') as f:
            self.lar_schema_df = pd.DataFrame(json.load(f))
        with open(self.filepaths['ts_schema_json'], 'r') as f:
            self.ts_schema_df = pd.DataFrame(json.load(f))
        #Instantiates the other classes.
        #lar_gen is responsible for generating data according to the values
        #in the schema.
        self.lar_gen = lar_generator.lar_gen(self.lar_schema_df,
            self.ts_schema_df, counties=self.counties, tracts=self.tracts)
        #lar_constraints is responsible for modifying generated data so
        #that the resulting file passes syntax and validity edits.
        self.lar_const = lar_constraints.lar_constraints(counties=self.counties,
            tracts=self.tracts)
        #lar_validator checks a dataframe and returns a JSON with
        #edit pass/fail results.
        self.lar_validator = rules_engine(lar_schema=self.lar_schema_df,
            ts_schema=self.ts_schema_df, geographic_data=self.geographic_data)
            #tracts=tracts, counties=counties, small_counties=small_counties)
        #Stores the number of rows in the test file.
        self.file_length = self.clean_config["file_length"]["value"]
        #Stores the LEI for the test file.
        self.lei = self.clean_config["lei"]["value"]
    def get_const_list(self):
        """Creates a list of constraint function names from the
        lar_constraints object that apply syntax and validity edits."""
        #Creates an empty list to store constraint function names.
        constraints = []
        #Takes each function pertaining to a syntax or validity edit
        #in the lar_constraints class and adds it to the constraints list.
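        #The filter below keeps names that begin with "s" or "v" followed
        #by a three-digit edit number (so, for example, a function
        #implementing edit V610 would have a name starting with "v610";
        #the exact names come from lar_constraints).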
        for func in dir(self.lar_const):
            if func[:1] in ("s", "v") and func[1:4].isdigit():
                constraints.append(func)
        #Returns the list of constraints as strings.
        return constraints
    def apply_constraint(self, row, func):
        """Applies a constraint (func) to a generated row
        of LAR data and returns a new row in a dictionary format."""
        #Copies the row.
        row_start = row.copy()
        #Uses getattr to apply the constraint from the lar_constraints class.
        row = getattr(self.lar_const, func)(row)
        #Logs the changes to the initial row after the constraint
        #has been applied.
        diff_1, diff_2 = self.get_diff(row, row_start)
        if len(diff_1) > 0:
            logging.info(str(func))
            logging.info(diff_1)
            logging.info(diff_2)
        return row
    def get_diff(self, row, row_base):
        """Checks the difference between an initial row and the
        row after constraints are applied."""
        #Converts the initial row to a set.
        initial_row = set(row_base.items())
        #Converts the constrained row to a set.
        changed_row = set(row.items())
        #Subtracts the row sets to show changes from constraint functions.
        diff_1 = changed_row - initial_row
        diff_2 = initial_row - changed_row
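        #For example, if a constraint changed a single field 'field_name'
        #(an illustrative name) from '1' to '2', diff_1 would be
        #{('field_name', '2')} and diff_2 would be {('field_name', '1')}.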
        return diff_1, diff_2
    def constraints_loop(self, constraints=[], row=None, row_base=None):
        """Applies the generated list of constraints to a row."""
        for const in constraints:
            row = self.apply_constraint(row, const)
            diff = self.get_diff(row, row_base)
        return row
    def validation(self, row, ts_row):
        """
        Applies the syntax and validity rules engine logic
        to the LAR row to create an edit report.
        """
        #Creates dataframes of LAR and TS data.
        lar_data = pd.DataFrame(row, index=[1])
        ts_data = pd.DataFrame(ts_row, index=[0])
        #Instantiates a rules checker to check the row against
        #edits in the rules engine.
        rules_check = rules_engine(lar_schema=self.lar_schema_df,
            ts_schema=self.ts_schema_df, geographic_data=self.geographic_data)
            #tracts=tracts, counties=counties) #instantiate edits rules engine
        #Loads LAR and TS data into the rules engine.
        rules_check.load_lar_data(lar_data)
        rules_check.load_ts_data(ts_data)
        #Runs the edits against the LAR row and produces edit check results.
        for func in dir(rules_check):
            if func[:1] in ("s", "v") and func[1:4].isdigit():
                #print("applying:", func)
                getattr(rules_check, func)()
        #Returns edit check results.
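        #Each entry in results is a dict produced by the rules engine;
        #code elsewhere in this module relies on at least its 'status'
        #('passed'/'failed') and 'row_ids' fields.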
        return rules_check.results
    def make_ts_row(self):
        """
        Uses the lar_gen object to create a TS row as a
        dictionary object.
        """
        self.ts_row = self.lar_gen.make_ts_row(self.clean_config)
        return self.ts_row
    def make_clean_lar_row(self, ts_row):
        """Uses the lar_gen object and a TS row to create a LAR row that
        passes syntax and validity edits according to the FIG."""
        #Stores the stop condition and the initial number of iterations.
        stop = False
        iters = 1
        #Makes a new row using the lar generator.
        row = self.lar_gen.make_row(lei=self.lei)
        #Begins a loop that creates the LAR row. The loop generates the row
        #with the lar_generator and then validates the row against the
        #rules engine for syntax or validity edits. If syntax or validity
        #edit failures are present, the row is run through the constraints
        #and validated again.
        while not stop:
            #Copies the row to enable diff logging.
            row_base = row.copy()
            #Creates an edit report based on the validation.
            res = pd.DataFrame(self.validation(row, ts_row))
            #Logs the results of edits that have failed.
            logging.info(res[res.status == "failed"])
            #If no syntax or validity edit failures are present, the stop
            #condition is invoked and the row is returned.
            if len(res[res.status == "failed"]) == 0:
                stop = True
            #If syntax or validity edit failures are present, the
            #constraints are applied and the row is revalidated while
            #stop is False.
            else:
                message = "\nstarting constraints iteration {iter}".format(
                    iter=iters)
                logging.info(message)
                row = self.constraints_loop(self.get_const_list(), row,
                    row_base)
                iters += 1
        return row
    def create_files(self, kind):
        """Creates a clean file or a set of edit files specified by the
        function call."""
        #Creates TS row data.
        self.ts_row = self.make_ts_row()
        #Creates a TS row dataframe.
        ts_df = pd.DataFrame(self.ts_row, index=[1])
        #The following produces a clean file.
        if kind == 'clean_file':
            #Creates a first row of LAR to begin the dataframe.
            #All other rows are concatenated to the dataframe until the
            #length of the dataframe reaches the file length specified in
            #the clean file configuration.
            for i in range(0, self.file_length):
                print('Creating row {i}'.format(i=i))
                if i == 0:
                    first_row = self.make_clean_lar_row(ts_row=self.ts_row)
                    lar_frame = pd.DataFrame(first_row, index=[1])
                else:
                    new_row = self.make_clean_lar_row(ts_row=self.ts_row)
                    new_row = pd.DataFrame(new_row, index=[1])
                    lar_frame = pd.concat([lar_frame, new_row], axis=0)
            #Writes the file to the clean filepath specified in the
            #test_filepaths configuration.
            utils.write_file(ts_input=ts_df, lar_input=lar_frame,
                path=self.filepaths['clean_filepath'].format(
                    bank_name=self.clean_config["name"]["value"]),
                name=self.filepaths['clean_filename'].format(
                    n=self.file_length, bank_name=self.clean_config["name"]["value"]))
        #For error files.
        if kind == 'error_files':
            #Modifies clean data and outputs resulting files that fail
            #specific edits.
            #Instantiates the edit file maker.
            file_maker = test_data(ts_schema=self.ts_schema_df,
                lar_schema=self.lar_schema_df,
                geographic_data=self.geographic_data)
            #Pulls in the clean data filepath and name from the
            #test filepaths yaml file.
            ts_data, lar_data = utils.read_data_file(
                path=self.filepaths['clean_filepath'].format(
                    bank_name=self.clean_config["name"]["value"]),
                data_file=self.filepaths["clean_filename"].format(
                    bank_name=self.clean_config["name"]["value"],
                    n=self.clean_config["file_length"]["value"]))
            #Passes clean file data to the file maker object.
            file_maker.load_data_frames(ts_data, lar_data)
            #Generates a file for each edit function in file maker.
            #Loops over all data modification functions.
            for func in dir(file_maker):
                #Checks if the function is a numbered syntax, validity,
                #or quality edit.
                if func[:1] in ("s", "v", "q") and func[1:4].isdigit():
                    print("applying:", func)
                    #Applies the data modification function and produces
                    #a file.
                    getattr(file_maker, func)()
    def validate_quality_edit_file(self, quality_filename):
        """
        The test file generator logic for creating quality edit test files
        may cause rows to fail not only quality edits, but also syntax or
        validity edits in the creation process.
        This function takes a quality edit test file from the quality
        filepaths directory in the test filepaths yaml, drops rows that
        have syntax or validity edit failures, and duplicates the remaining
        clean rows to the length of the original file.
        The file is then saved in a new directory for quality edit test
        files that also pass syntax and validity edits.
        NOTE: This function allows LAR rows to pass syntax and validity
        edits; it does not validate the TS sheet.
        """
        try:
            #Instantiates an edit checker object with rules_engine.
            checker = rules_engine(lar_schema=self.lar_schema_df,
                ts_schema=self.ts_schema_df, geographic_data=self.geographic_data)
            #Reads the file and separates the data into TS and LAR frames.
            ts_df, lar_df = utils.read_data_file(
                path=self.filepaths['quality_filepath'].format(bank_name=self.clean_config['name']['value']),
                data_file=quality_filename)
            #Stores the original length of the file.
            original_length = len(lar_df.index)
            #Loads data into the checker object.
            checker.load_data_frames(ts_df, lar_df)
            #Produces a report on which syntax or validity edits have
            #passed or failed based on logic in the rules_engine.
            for func in dir(checker):
                if func[:1] in ("s", "v") and func[1:4].isdigit():
                    getattr(checker, func)()
            #Creates a results dataframe and keeps the results that
            #have failed.
            report_df = pd.DataFrame(checker.results)
            report_df = report_df[(report_df['status'] == 'failed')]
            #The function ignores TS edits and drops results related
            #to edit failures from the TS.
            report_df = report_df[report_df['row_ids'] != 'TS']
            if len(report_df) == 0:
                #If there are no syntax or validity edit failures, the data
                #is written to a new directory for quality test files that
                #pass syntax and validity edits.
                utils.write_file(path=self.filepaths['quality_pass_s_v_filepath'].format(
                    bank_name=self.clean_config['name']['value']),
                    ts_input=ts_df,
                    lar_input=lar_df,
                    name=quality_filename)
            #The case where rows failed syntax or validity edits.
            else:
                #Creates an empty list for storing the row IDs of rows
                #where edits have failed.
                uli_list = []
                #Iterates through the report and appends the lists of ULIs
                #for rows where syntax or validity edits have failed.
                for index, row in report_df.iterrows():
                    uli_list.append(row.row_ids)
                #Drops not-a-number entries from the ULI list.
                if np.nan in uli_list:
                    uli_list.remove(np.nan)
                #Creates a new list to store the ULIs without nested
                #brackets.
                new_list = []
                for i in range(len(uli_list)):
                    for n in range(len(uli_list[i])):
                        new_list.append(uli_list[i][n])
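                #For example, [['uli_a', 'uli_b'], ['uli_b']] flattens to
                #['uli_a', 'uli_b', 'uli_b'] (ULI values illustrative).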
                #Creates a list that removes repeated ULIs.
                unique_uli_list = []
                for i in new_list:
                    if i not in unique_uli_list:
                        unique_uli_list.append(i)
                #Creates a list of row numbers corresponding to the ULIs
                #that have failed syntax or validity edits.
                bad_rows = []
                for index, row in lar_df.iterrows():
                    if row['uli'] in unique_uli_list:
                        failed_uli = row['uli']
                        bad_rows.append(lar_df[lar_df['uli'] == failed_uli].index.values.astype(int)[0])
                #Drops all rows that failed syntax or validity edits
                #from the original LAR dataframe.
                lar_df = lar_df.drop(bad_rows)
                #Creates new LAR rows up to the original length of the file
                #using the utils new_lar_rows function.
                ts_df, lar_df = utils.new_lar_rows(final_row_count=original_length,
                    lar_df=lar_df, ts_df=ts_df)
                #Writes the file to the new path for quality test files
                #that pass syntax and validity edits.
                utils.write_file(path=self.filepaths['quality_pass_s_v_filepath'].format(bank_name=self.clean_config['name']['value']),
                    ts_input=ts_df, lar_input=lar_df, name=quality_filename)
                #Prints the name of the file being changed to the console.
                print("Adjusting {file} to pass syntax and validity edits.".format(file=quality_filename))
                print("File saved in {path}".format(path=self.filepaths['quality_pass_s_v_filepath'].format(bank_name=self.clean_config['name']['value'])))
        #The condition where there are no clean rows present in the file.
        except ZeroDivisionError as e:
            #Prints a message to indicate that the file has not been
            #validated.
            print(e)
            print("Sorry, no clean file available for {file}.".format(file=quality_filename))
    def edit_report(self):
        """
        Takes a filepath and name from the edit report configuration and
        produces a report on whether any rows of the data failed syntax,
        validity, or quality edits.
        The report contains, among its fields, the edit name, the status
        of the edit, the number of rows failed by the edit, if any, and
        the ULIs or NULIs (loan IDs) of the rows that fail the edit.
        The resulting report is saved as a csv file using configurations
        from the test_filepaths yaml file.
        """
        #Instantiates the rules engine class as a checker object with a
        #LAR schema, a TS schema, and geographic data.
        checker = rules_engine(lar_schema=self.lar_schema_df,
            ts_schema=self.ts_schema_df, geographic_data=self.geographic_data)
        #Separates data from the filepath and filename into a TS dataframe
        #and a LAR dataframe.
        ts_df, lar_df = utils.read_data_file(path=self.edit_report_config['data_filepath'],
            data_file=self.edit_report_config['data_filename'])
        #Loads the TS and LAR dataframes into the checker object.
        checker.load_data_frames(ts_df, lar_df)
        #Applies each function in the rules engine that checks for edits
        #and creates a results list of edits failed or passed.
        for func in dir(checker):
            if func[:1] in ("s", "v", "q") and func[1:4].isdigit():
                getattr(checker, func)()
        #Creates a dataframe of results from the checker.
        report_df = pd.DataFrame(checker.results)
        #Writes the report to the filepath and name designated in
        #the test_filepaths yaml.
        edit_report_path = self.edit_report_config['edit_report_output_filepath']
        if not os.path.exists(edit_report_path):
            os.makedirs(edit_report_path)
        report_df.to_csv(os.path.join(edit_report_path,
            self.edit_report_config['edit_report_output_filename']))
        #Logs the result.
        logging.info("Edit Report has been created in {filepath}".format(
            filepath=edit_report_path))
    def create_custom_row(self, dictionary, clean_filepath, clean_filename):
        """
        Creates a custom clean LAR row by passing in a dictionary of
        columns and new values, modifying all the rows of an existing
        clean file, filtering the modified file for clean rows, and then
        pulling the first row from the file.
        Pulls rows from the clean file last generated. It is suggested
        that the file pulled contain 1000 or more original rows, to ensure
        that modified clean rows can be found.
        """
        #Creates a TS and LAR dataframe from the clean filepath and name
        #specified.
        ts_df, lar_df = utils.read_data_file(
            path=clean_filepath,
            data_file=clean_filename)
        #Changes each column (key in the dictionary) to the new value in
        #the dictionary.
        for key, value in dictionary.items():
            lar_df[key] = value
        #Instantiates a rules engine checker object.
        checker = rules_engine(lar_schema=self.lar_schema_df,
            ts_schema=self.ts_schema_df,
            geographic_data=self.geographic_data)
        #Loads the TS and LAR dataframes into the checker object.
        checker.load_data_frames(ts_df, lar_df)
        #Produces a report on which syntax or validity edits have passed
        #or failed based on logic in the rules_engine.
        for func in dir(checker):
            if func[:1] in ("s", "v") and func[1:4].isdigit():
                getattr(checker, func)()
        #Creates a results dataframe and keeps the results that
        #have failed.
        report_df = pd.DataFrame(checker.results)
        report_df = report_df[(report_df['status'] == 'failed')].copy()
        #The function ignores TS edits and drops results related
        #to edit failures from the TS.
        report_df = report_df[report_df['row_ids'] != 'TS']
        if len(report_df) == 0:
            #If there are no syntax or validity edit failures, the first
            #row of data is taken as the custom clean row.
            lar_row = lar_df[0:1]
        #The case where rows failed syntax or validity edits.
        else:
            #Creates a list of ULIs corresponding to rows where syntax or
            #validity edits have failed. The result is a list of lists:
            #one list of failed ULIs for each failed edit.
            uli_list = list(report_df.row_ids)
            #Converts the list of lists to a single list.
            single_uli_list = []
            for i in uli_list:
                single_uli_list = single_uli_list + i
            #Creates a set that removes repeated ULIs.
            unique_uli_list = set(single_uli_list)
            #Drops rows containing syntax or validity edit failures from
            #the data.
            lar_df = lar_df[~lar_df.uli.isin(unique_uli_list)].copy()
            #Only one row is needed for output.
            #The following takes the first row of data from the clean
            #dataframe.
            lar_row = lar_df[0:1]
        return lar_row
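
#Minimal usage sketch: drives the generator end to end. It assumes a bank
#configuration yaml such as 'configurations/clean_file_config.yaml' (a
#hypothetical name) containing the "name", "lei", and "file_length" keys
#loaded in __init__.
if __name__ == '__main__':
    #Instantiates the generator with a clean file configuration.
    file_gen = FileGenerator(filename='configurations/clean_file_config.yaml')
    #Writes a clean file that passes syntax and validity edits.
    file_gen.create_files(kind='clean_file')
    #Writes one test file per edit function in the test file maker.
    file_gen.create_files(kind='error_files')
    #Produces an edit report for the file named in edit_report_config.yaml.
    file_gen.edit_report()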