Task 42: recreating the PR (#50)
Co-authored-by: Nargis Sultani <[email protected]>
nargis-sultani and Nargis Sultani authored Sep 21, 2023
1 parent d893989 commit cb55a7f
Showing 3 changed files with 304 additions and 44 deletions.
212 changes: 212 additions & 0 deletions src/tests/test_schema_functions.py
@@ -0,0 +1,212 @@
import pandas as pd

from validator.create_schemas import get_phase_1_schema_for_lei, get_phase_2_schema_for_lei, validate, validate_phases


class TestUtil:
valid_response = {"response": "No validation errors or warnings"}

def get_data(self, update_data: dict[str, list[str]] = {}) -> dict[str, list[str]]:
default = {
"uid": ["000TESTFIUIDDONOTUSEXGXVID11XTC1"],
"app_date": ["20241201"],
"app_method": ["1"],
"app_recipient": ["1"],
"ct_credit_product": ["988"],
"ct_credit_product_ff": [""],
"ct_guarantee": ["999"],
"ct_guarantee_ff": [""],
"ct_loan_term_flag": ["999"],
"ct_loan_term": [""],
"credit_purpose": ["999"],
"credit_purpose_ff": [""],
"amount_applied_for_flag": ["999"],
"amount_applied_for": [""],
"amount_approved": [""],
"action_taken": ["5"],
"action_taken_date": ["20241231"],
"denial_reasons": ["999"],
"denial_reasons_ff": [""],
"pricing_interest_rate_type": ["999"],
"pricing_init_rate_period": [""],
"pricing_fixed_rate": [""],
"pricing_adj_margin": [""],
"pricing_adj_index_name": ["999"],
"pricing_adj_index_name_ff": [""],
"pricing_adj_index_value": [""],
"pricing_origination_charges": [""],
"pricing_broker_fees": [""],
"pricing_initial_charges": [""],
"pricing_mca_addcost_flag": ["999"],
"pricing_mca_addcost": [""],
"pricing_prepenalty_allowed": ["999"],
"pricing_prepenalty_exists": ["999"],
"census_tract_adr_type": ["988"],
"census_tract_number": [""],
"gross_annual_revenue_flag": ["988"],
"gross_annual_revenue": [""],
"naics_code_flag": ["988"],
"naics_code": [""],
"number_of_workers": ["988"],
"time_in_business_type": ["988"],
"time_in_business": [""],
"business_ownership_status": ["988"],
"num_principal_owners_flag": ["988"],
"num_principal_owners": [""],
"po_1_ethnicity": [""],
"po_1_ethnicity_ff": [""],
"po_1_race": [""],
"po_1_race_anai_ff": [""],
"po_1_race_asian_ff": [""],
"po_1_race_baa_ff": [""],
"po_1_race_pi_ff": [""],
"po_1_gender_flag": [""],
"po_1_gender_ff": [""],
"po_2_ethnicity": [""],
"po_2_ethnicity_ff": [""],
"po_2_race": [""],
"po_2_race_anai_ff": [""],
"po_2_race_asian_ff": [""],
"po_2_race_baa_ff": [""],
"po_2_race_pi_ff": [""],
"po_2_gender_flag": [""],
"po_2_gender_ff": [""],
"po_3_ethnicity": [""],
"po_3_ethnicity_ff": [""],
"po_3_race": [""],
"po_3_race_anai_ff": [""],
"po_3_race_asian_ff": [""],
"po_3_race_baa_ff": [""],
"po_3_race_pi_ff": [""],
"po_3_gender_flag": [""],
"po_3_gender_ff": [""],
"po_4_ethnicity": [""],
"po_4_ethnicity_ff": [""],
"po_4_race": [""],
"po_4_race_anai_ff": [""],
"po_4_race_asian_ff": [""],
"po_4_race_baa_ff": [""],
"po_4_race_pi_ff": [""],
"po_4_gender_flag": [""],
"po_4_gender_ff": [""],
}
default.update(update_data)
return default


class TestValidate:
util = TestUtil()
phase1_schema = get_phase_1_schema_for_lei()
phase2_schema = get_phase_2_schema_for_lei()

def test_with_valid_dataframe(self):
df = pd.DataFrame(data=self.util.get_data())
result = validate(self.phase1_schema, df)
ph2_result = validate(self.phase2_schema, df)
assert len(result) == 0
assert len(ph2_result) == 0

def test_with_valid_lei(self):
lei = "000TESTFIUIDDONOTUSE"
phase1_schema_by_lei = get_phase_1_schema_for_lei(lei)
phase2_schema_by_lei = get_phase_2_schema_for_lei(lei)
df = pd.DataFrame(data=self.util.get_data())
result = validate(phase1_schema_by_lei, df)
ph2_result = validate(phase2_schema_by_lei, df)
assert len(result) == 0
assert len(ph2_result) == 0

def test_with_invalid_dataframe(self):
df = pd.DataFrame(data=self.util.get_data({"ct_credit_product": ["989"]}))
result = validate(self.phase1_schema, df)
ph2_result = validate(self.phase2_schema, df)
assert len(result) == 1
assert len(ph2_result) == 0

def test_with_multi_invalid_dataframe(self):
df = pd.DataFrame(
data=self.util.get_data(
{
"ct_credit_product": ["989"],
"num_principal_owners": ["1"],
"action_taken": ["2"],
}
)
)
result = validate(self.phase1_schema, df)
assert len(result) == 1

ph2_result = validate(self.phase2_schema, df)
assert len(ph2_result) == 3

def test_with_invalid_lei(self):
lei = "000TESTFIUIDDONOTUS1"
phase1_schema_by_lei = get_phase_1_schema_for_lei(lei)
phase2_schema_by_lei = get_phase_2_schema_for_lei(lei)
df = pd.DataFrame(data=self.util.get_data({"ct_credit_product": ["989"]}))
result = validate(phase1_schema_by_lei, df)
ph2_result = validate(phase2_schema_by_lei, df)
assert len(result) == 2
assert len(ph2_result) == 0


class TestValidatePhases:
util = TestUtil()

def test_with_valid_data(self):
result = validate_phases(pd.DataFrame(data=self.util.get_data()))

assert len(result) == 1
assert result[0] == self.util.valid_response

def test_with_valid_lei(self):
lei = "000TESTFIUIDDONOTUSE"
df = pd.DataFrame(data=self.util.get_data())
result = validate_phases(df, lei)
assert len(result) == 1
assert result[0] == self.util.valid_response

def test_with_invalid_data(self):
result = validate_phases(pd.DataFrame(data=self.util.get_data({"ct_credit_product": ["989"]})))

assert len(result) == 1
assert result[0] != self.util.valid_response

def test_with_multi_invalid_data_with_phase1(self):
result = validate_phases(
pd.DataFrame(
data=self.util.get_data(
{
"ct_credit_product": ["989"],
"num_principal_owners": ["1"],
"action_taken": ["2"],
}
)
)
)
# Should only return the phase 1 validation findings, since phase 1 failed.
assert len(result) == 1
assert result[0] != self.util.valid_response

def test_with_multi_invalid_data_with_phase2(self):
result = validate_phases(
pd.DataFrame(
data=self.util.get_data(
{
"num_principal_owners": ["1"],
"action_taken": ["2"],
}
)
),
)
# Since the data passes phase 1 validations,
# this should return the phase 2 findings.
assert len(result) == 3
assert result[0] != self.util.valid_response

def test_with_invalid_lei(self):
lei = "000TESTFIUIDDONOTUS1"
df = pd.DataFrame(data=self.util.get_data())
result = validate_phases(df, lei)
assert len(result) == 1
assert result[0] != self.util.valid_response
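For reference, a minimal way to exercise this new test module, assuming pytest is installed and src/ is the working directory so that the validator and tests packages resolve the same way the import at the top of the file does:

import pytest

# Run only the new schema-function tests, verbosely; roughly equivalent to
# running `python -m pytest tests/test_schema_functions.py -v` from within src/.
pytest.main(["tests/test_schema_functions.py", "-v"])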
113 changes: 90 additions & 23 deletions src/validator/create_schemas.py
@@ -1,6 +1,8 @@
"""Creates two DataFrameSchema objects by rendering the schema template
with validations listed in phase 1 and phase 2."""

import pandas as pd
from checks import SBLCheck
from pandera import DataFrameSchema
from pandera.errors import SchemaErrors
from phase_validations import get_phase_1_and_2_validations_for_lei
@@ -20,32 +22,97 @@ def get_schema_by_phase_for_lei(template: dict, phase: str, lei: str = None):
return DataFrameSchema(template)
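To make the template-to-schema relationship concrete, here is a rough sketch of the shape the rendered template takes, assuming standard pandera usage. The real column definitions and SBLCheck validations live in phase_validations.py and the schema template module, neither of which appears in this diff, so the column and allowed values below are illustrative only.

from pandera import Check, Column, DataFrameSchema

# Illustrative template: one column with a tiny subset of its allowed values.
template = {
    "ct_credit_product": Column(str, checks=[Check.isin(["1", "2", "988"])]),
}
schema = DataFrameSchema(template)

# Validating lazily collects every failure instead of stopping at the first one,
# which is how validate() below gathers all findings in a single pass:
# schema(df, lazy=True)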


def print_schema_errors(errors: SchemaErrors, phase: str):
for error in errors.schema_errors:
# The SchemaError object describing this failed check
schema_error = error["error"]
check_id = "n/a"

# Built-in checks such as unique=True are unfortunately handled differently
# than custom checks, so the name has to be accessed differently
try:
check_name = schema_error.check.name
check_id = schema_error.check.id
# This will either be a boolean series or a single bool
check_output = schema_error.check_output
except AttributeError:
check_name = schema_error.check
# this is just a string that we'd need to parse manually
check_output = schema_error.args[0]

print(f"{phase} Validation `{check_name}` with id: `{check_id}` failed for column `{{column_name}}`")
print(check_output)
print("")


def get_phase_1_schema_for_lei(lei: str = None):
return get_schema_by_phase_for_lei(phase_1_template, "phase_1", lei)


def get_phase_2_schema_for_lei(lei: str = None):
return get_schema_by_phase_for_lei(phase_2_template, "phase_2", lei)


def validate(schema: DataFrameSchema, df: pd.DataFrame):
"""
Validate the received dataframe against the given schema and return a list
of schema error findings.
Args:
schema (DataFrameSchema): schema to be used for validation
df (pd.DataFrame): data parsed into a dataframe
Returns:
list of schema error findings
"""
findings = []
try:
schema(df, lazy=True)
except SchemaErrors as errors:
for error in errors.schema_errors:
check: SBLCheck = error.check
column_name = error.schema.name
check_id = "n/a"

fields: list[str] = [column_name]

if hasattr(check, "name") and hasattr(check, "id"):
check_name: str = check.name
check_id: str = check.id

if check.groupby:
fields += check.groupby # type: ignore

# This will either be a boolean series or a single bool
check_output = error.check_output
else:
# This means the check's column has unique=True set. We shouldn't use the
# unique flag because it doesn't return a series of validation results;
# it only returns a printable summary string.
raise AttributeError(f"{str(check)}")

# Remove duplicates, but keep as `list` for JSON-friendliness
fields = list(set(fields))

if check_output is not None:
# `check_output` must be sorted so its index lines up with `df`'s index
check_output.sort_index(inplace=True)

# Filter records using pandas boolean indexing, which keeps the rows where
# the mask is True. The `~` inverts `check_output`, so it's the failing
# (False) records that are kept here.
# http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing
failed_check_fields_df = df[~check_output][fields].fillna("")

# Create list of dicts representing the failed validations and the
# associated field data for each invalid record.
records = []
for idx, row in failed_check_fields_df.iterrows():
record = {"number": idx + 1, "field_values": {}}
for field in fields:
record["field_values"][field] = row[field]
records.append(record)

validation_findings = {
"validation": {
"id": check_id,
"name": check_name,
"description": check.description,
"fields": fields,
"severity": "warning" if check.warning else "error",
},
"records": records,
}

findings.append(validation_findings)

return findings
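A toy illustration of the boolean-indexing step above, using made-up column values: check_output marks passing rows as True, so inverting it with `~` keeps only the failing records.

import pandas as pd

df = pd.DataFrame({"action_taken": ["1", "2", "5"], "amount_approved": ["", "10000", ""]})
fields = ["action_taken", "amount_approved"]
check_output = pd.Series([True, False, True])  # row 1 failed the (hypothetical) check

# Keep only the failing row, restricted to the fields tied to the check.
failed_check_fields_df = df[~check_output][fields].fillna("")
print(failed_check_fields_df)  # only index 1 remains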


def validate_phases(df: pd.DataFrame, lei: str = None) -> list:
phase1_findings = validate(get_phase_1_schema_for_lei(lei), df)
if phase1_findings:
return phase1_findings
else:
phase2_findings = validate(get_phase_2_schema_for_lei(lei), df)
if phase2_findings:
return phase2_findings
else:
return [{"response": "No validation errors or warnings"}]
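A minimal usage sketch of the new helpers, mirroring the imports in the test module above; whether `tests` is importable alongside `validator` depends on how the packages sit on the path, so treat that import as an assumption.

import pandas as pd

from tests.test_schema_functions import TestUtil  # assumed importable; supplies a valid default record
from validator.create_schemas import get_phase_1_schema_for_lei, validate, validate_phases

# One-row submission with a deliberately invalid ct_credit_product.
df = pd.DataFrame(data=TestUtil().get_data({"ct_credit_product": ["989"]}))

# Validate a single phase explicitly...
phase1_findings = validate(get_phase_1_schema_for_lei(), df)

# ...or let validate_phases gate phase 2 behind a clean phase 1 run.
# Here phase 1 fails, so only the phase 1 findings come back.
findings = validate_phases(df)

# Each finding follows the structure assembled in validate():
# {"validation": {"id", "name", "description", "fields", "severity"},
#  "records": [{"number": <1-based row>, "field_values": {...}}]}
print(findings)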
23 changes: 2 additions & 21 deletions src/validator/main.py
@@ -8,12 +8,7 @@
import sys

import pandas as pd
from create_schemas import (
get_phase_1_schema_for_lei,
get_phase_2_schema_for_lei,
print_schema_errors,
)
from pandera.errors import SchemaErrors
from create_schemas import validate_phases


def csv_to_df(path: str) -> pd.DataFrame:
@@ -32,21 +27,7 @@ def run_validation_on_df(df: pd.DataFrame, lei: str) -> None:
print(df)
print("")

phase_1_failure_cases = None

phase_1_sblar_schema = get_phase_1_schema_for_lei(lei)
try:
phase_1_sblar_schema(df, lazy=True)
except SchemaErrors as errors:
phase_1_failure_cases = errors.failure_cases
print_schema_errors(errors, "Phase 1")

if phase_1_failure_cases is None:
phase_2_sblar_schema = get_phase_2_schema_for_lei(lei)
try:
phase_2_sblar_schema(df, lazy=True)
except SchemaErrors as errors:
print_schema_errors(errors, "Phase 2")
print(validate_phases(df, lei))


if __name__ == "__main__":