Skip to content

Commit

Permalink
Added truncated field to validation groups in json (#205)
Browse files Browse the repository at this point in the history
Closes #204
  • Loading branch information
jcadam14 authored Jun 5, 2024
1 parent 62400c0 commit eec6b3e
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 5 deletions.
10 changes: 7 additions & 3 deletions src/regtech_data_validator/data_formatters.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,13 @@ def df_to_dicts(df: pd.DataFrame, max_records: int = 10000, max_group_size: int
total_errors_per_group[group_name] = max_group_size
for validation_id, group in df.groupby("validation_id"):
check = find_check(validation_id, checks)
truncated_group = truncate_validation_group_records(group, total_errors_per_group[validation_id])
truncated_group, need_to_truncate = truncate_validation_group_records(
group, total_errors_per_group[validation_id]
)
group_json = process_chunk(truncated_group, validation_id, check)
if group_json:
json_results.append(process_chunk(truncated_group, validation_id, check))
group_json["validation"]["is_truncated"] = need_to_truncate
json_results.append(group_json)
json_results = sorted(json_results, key=lambda x: x['validation']['id'])
return json_results

Expand Down Expand Up @@ -202,9 +205,10 @@ def calculate_group_chunk_sizes(grouped_df, max_records):
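# Cuts off the number of records in a validation group. A plain 'head' on the group
# won't work because of the dataframe structure (rows are selected per record_no, not per row),
# so this function uses the group's allowed error count to truncate on unique record numbers.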
def truncate_validation_group_records(group, group_size):
    """Limit a validation group to its first ``group_size`` distinct records.

    Rows are selected by ``record_no`` value rather than by row position, so
    every row that belongs to a kept record number is retained.

    Args:
        group: DataFrame of findings for a single validation; must contain a
            ``record_no`` column.
        group_size: Maximum number of distinct ``record_no`` values to keep.

    Returns:
        A ``(truncated_group, need_to_truncate)`` tuple: the filtered
        DataFrame, and a flag that is true when the group held more distinct
        record numbers than ``group_size``.
    """
    # Compute the distinct record numbers once; they drive both the flag and the filter.
    record_nos = group['record_no'].unique()
    need_to_truncate = len(record_nos) > group_size
    truncated_group = group[group['record_no'].isin(record_nos[:group_size])]
    # Always return the pair: the caller unpacks two values.
    return truncated_group, need_to_truncate


def process_chunk(df: pd.DataFrame, validation_id: str, check: SBLCheck) -> [dict]:
Expand Down
11 changes: 9 additions & 2 deletions tests/test_output_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ def test_output_json(self):
"severity": "Error",
"scope": "multi-field",
"fig_link": "https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/#4.2.7",
"is_truncated": False,
},
"records": [
{
Expand All @@ -127,6 +128,7 @@ def test_output_json(self):
"severity": "Error",
"scope": "register",
"fig_link": "https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/#4.3.1",
"is_truncated": False,
},
"records": [
{
Expand Down Expand Up @@ -157,6 +159,7 @@ def test_output_json_with_max_group_size(self):
"severity": "Error",
"scope": "multi-field",
"fig_link": "https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/#4.2.7",
"is_truncated": False,
},
"records": [
{
Expand All @@ -174,6 +177,7 @@ def test_output_json_with_max_group_size(self):
"severity": "Error",
"scope": "register",
"fig_link": "https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/#4.3.1",
"is_truncated": True,
},
"records": [
{
Expand All @@ -199,6 +203,7 @@ def test_output_json_with_max_records(self):
"severity": "Error",
"scope": "single-field",
"fig_link": "https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/#4.1.4",
"is_truncated": True,
},
"records": [
{"record_no": 4, "uid": "12345678901234567890", "fields": [{"name": "app_method", "value": "5"}]}
Expand All @@ -212,6 +217,7 @@ def test_output_json_with_max_records(self):
"severity": "Error",
"scope": "multi-field",
"fig_link": "https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/#4.2.7",
"is_truncated": False,
},
"records": [
{
Expand All @@ -225,8 +231,9 @@ def test_output_json_with_max_records(self):
expected_output = ujson.dumps(results_object, indent=4, escape_forward_slashes=False)

error_df = pd.DataFrame(self.input_df)
error_df.loc[-1] = [4, '12345678901234567890', 'app_method', '5', 'E0040']
error_df.index = error_df.index + 1
error_df.loc[-1] = [5, '12345678901234567890', 'app_method', '5', 'E0040']
error_df.loc[-2] = [4, '12345678901234567890', 'app_method', '5', 'E0040']
error_df.index = error_df.index + 2
error_df.sort_index(inplace=True)

actual_output = df_to_json(error_df, max_records=2)
Expand Down

0 comments on commit eec6b3e

Please sign in to comment.