Added `is_truncated` field to validation groups in the JSON output (#205)

Closes #204
cfpb · Jun 5, 2024 · eec6b3e · eec6b3e
1 parent 62400c0
commit eec6b3e
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 5 deletions.
diff --git a/src/regtech_data_validator/data_formatters.py b/src/regtech_data_validator/data_formatters.py
@@ -156,10 +156,13 @@ def df_to_dicts(df: pd.DataFrame, max_records: int = 10000, max_group_size: int
                 total_errors_per_group[group_name] = max_group_size
         for validation_id, group in df.groupby("validation_id"):
             check = find_check(validation_id, checks)
-            truncated_group = truncate_validation_group_records(group, total_errors_per_group[validation_id])
+            truncated_group, need_to_truncate = truncate_validation_group_records(
+                group, total_errors_per_group[validation_id]
+            )
             group_json = process_chunk(truncated_group, validation_id, check)
             if group_json:
-                json_results.append(process_chunk(truncated_group, validation_id, check))
+                group_json["validation"]["is_truncated"] = need_to_truncate
+                json_results.append(group_json)
         json_results = sorted(json_results, key=lambda x: x['validation']['id'])
     return json_results
 
@@ -202,9 +205,10 @@ def calculate_group_chunk_sizes(grouped_df, max_records):
 # Cuts off the number of records.  Can't just 'head' on the group due to the dataframe structure.
 # So this function uses the group error counts to truncate on record numbers
 def truncate_validation_group_records(group, group_size):
+    need_to_truncate = len(group['record_no'].unique()) > group_size
     unique_record_nos = group['record_no'].unique()[:group_size]
     truncated_group = group[group['record_no'].isin(unique_record_nos)]
-    return truncated_group
+    return truncated_group, need_to_truncate
 
 
 def process_chunk(df: pd.DataFrame, validation_id: str, check: SBLCheck) -> [dict]:

diff --git a/tests/test_output_formats.py b/tests/test_output_formats.py
@@ -110,6 +110,7 @@ def test_output_json(self):
                     "severity": "Error",
                     "scope": "multi-field",
                     "fig_link": "https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/#4.2.7",
+                    "is_truncated": False,
                 },
                 "records": [
                     {
@@ -127,6 +128,7 @@ def test_output_json(self):
                     "severity": "Error",
                     "scope": "register",
                     "fig_link": "https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/#4.3.1",
+                    "is_truncated": False,
                 },
                 "records": [
                     {
@@ -157,6 +159,7 @@ def test_output_json_with_max_group_size(self):
                     "severity": "Error",
                     "scope": "multi-field",
                     "fig_link": "https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/#4.2.7",
+                    "is_truncated": False,
                 },
                 "records": [
                     {
@@ -174,6 +177,7 @@ def test_output_json_with_max_group_size(self):
                     "severity": "Error",
                     "scope": "register",
                     "fig_link": "https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/#4.3.1",
+                    "is_truncated": True,
                 },
                 "records": [
                     {
@@ -199,6 +203,7 @@ def test_output_json_with_max_records(self):
                     "severity": "Error",
                     "scope": "single-field",
                     "fig_link": "https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/#4.1.4",
+                    "is_truncated": True,
                 },
                 "records": [
                     {"record_no": 4, "uid": "12345678901234567890", "fields": [{"name": "app_method", "value": "5"}]}
@@ -212,6 +217,7 @@ def test_output_json_with_max_records(self):
                     "severity": "Error",
                     "scope": "multi-field",
                     "fig_link": "https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/#4.2.7",
+                    "is_truncated": False,
                 },
                 "records": [
                     {
@@ -225,8 +231,9 @@ def test_output_json_with_max_records(self):
         expected_output = ujson.dumps(results_object, indent=4, escape_forward_slashes=False)
 
         error_df = pd.DataFrame(self.input_df)
-        error_df.loc[-1] = [4, '12345678901234567890', 'app_method', '5', 'E0040']
-        error_df.index = error_df.index + 1
+        error_df.loc[-1] = [5, '12345678901234567890', 'app_method', '5', 'E0040']
+        error_df.loc[-2] = [4, '12345678901234567890', 'app_method', '5', 'E0040']
+        error_df.index = error_df.index + 2
         error_df.sort_index(inplace=True)
 
         actual_output = df_to_json(error_df, max_records=2)