Task 42: recreating the PR (#50)
Co-authored-by: Nargis Sultani <[email protected]>
nargis-sultani and Nargis Sultani authored Sep 21, 2023
1 parent d893989 commit cb55a7f
Showing 3 changed files with 304 additions and 44 deletions.
212 changes: 212 additions & 0 deletions src/tests/test_schema_functions.py
@@ -0,0 +1,212 @@
import pandas as pd

from validator.create_schemas import get_phase_1_schema_for_lei, get_phase_2_schema_for_lei, validate, validate_phases


class TestUtil:
valid_response = {"response": "No validation errors or warnings"}

def get_data(self, update_data: dict[str, list[str]] = {}) -> dict[str, list[str]]:
default = {
"uid": ["000TESTFIUIDDONOTUSEXGXVID11XTC1"],
"app_date": ["20241201"],
"app_method": ["1"],
"app_recipient": ["1"],
"ct_credit_product": ["988"],
"ct_credit_product_ff": [""],
"ct_guarantee": ["999"],
"ct_guarantee_ff": [""],
"ct_loan_term_flag": ["999"],
"ct_loan_term": [""],
"credit_purpose": ["999"],
"credit_purpose_ff": [""],
"amount_applied_for_flag": ["999"],
"amount_applied_for": [""],
"amount_approved": [""],
"action_taken": ["5"],
"action_taken_date": ["20241231"],
"denial_reasons": ["999"],
"denial_reasons_ff": [""],
"pricing_interest_rate_type": ["999"],
"pricing_init_rate_period": [""],
"pricing_fixed_rate": [""],
"pricing_adj_margin": [""],
"pricing_adj_index_name": ["999"],
"pricing_adj_index_name_ff": [""],
"pricing_adj_index_value": [""],
"pricing_origination_charges": [""],
"pricing_broker_fees": [""],
"pricing_initial_charges": [""],
"pricing_mca_addcost_flag": ["999"],
"pricing_mca_addcost": [""],
"pricing_prepenalty_allowed": ["999"],
"pricing_prepenalty_exists": ["999"],
"census_tract_adr_type": ["988"],
"census_tract_number": [""],
"gross_annual_revenue_flag": ["988"],
"gross_annual_revenue": [""],
"naics_code_flag": ["988"],
"naics_code": [""],
"number_of_workers": ["988"],
"time_in_business_type": ["988"],
"time_in_business": [""],
"business_ownership_status": ["988"],
"num_principal_owners_flag": ["988"],
"num_principal_owners": [""],
"po_1_ethnicity": [""],
"po_1_ethnicity_ff": [""],
"po_1_race": [""],
"po_1_race_anai_ff": [""],
"po_1_race_asian_ff": [""],
"po_1_race_baa_ff": [""],
"po_1_race_pi_ff": [""],
"po_1_gender_flag": [""],
"po_1_gender_ff": [""],
"po_2_ethnicity": [""],
"po_2_ethnicity_ff": [""],
"po_2_race": [""],
"po_2_race_anai_ff": [""],
"po_2_race_asian_ff": [""],
"po_2_race_baa_ff": [""],
"po_2_race_pi_ff": [""],
"po_2_gender_flag": [""],
"po_2_gender_ff": [""],
"po_3_ethnicity": [""],
"po_3_ethnicity_ff": [""],
"po_3_race": [""],
"po_3_race_anai_ff": [""],
"po_3_race_asian_ff": [""],
"po_3_race_baa_ff": [""],
"po_3_race_pi_ff": [""],
"po_3_gender_flag": [""],
"po_3_gender_ff": [""],
"po_4_ethnicity": [""],
"po_4_ethnicity_ff": [""],
"po_4_race": [""],
"po_4_race_anai_ff": [""],
"po_4_race_asian_ff": [""],
"po_4_race_baa_ff": [""],
"po_4_race_pi_ff": [""],
"po_4_gender_flag": [""],
"po_4_gender_ff": [""],
}
default.update(update_data)
return default


class TestValidate:
util = TestUtil()
phase1_schema = get_phase_1_schema_for_lei()
phase2_schema = get_phase_2_schema_for_lei()

def test_with_valid_dataframe(self):
df = pd.DataFrame(data=self.util.get_data())
result = validate(self.phase1_schema, df)
ph2_result = validate(self.phase2_schema, df)
assert len(result) == 0
assert len(ph2_result) == 0

def test_with_valid_lei(self):
lei = "000TESTFIUIDDONOTUSE"
phase1_schema_by_lei = get_phase_1_schema_for_lei(lei)
phase2_schema_by_lei = get_phase_2_schema_for_lei(lei)
df = pd.DataFrame(data=self.util.get_data())
result = validate(phase1_schema_by_lei, df)
ph2_result = validate(phase2_schema_by_lei, df)
assert len(result) == 0
assert len(ph2_result) == 0

def test_with_invalid_dataframe(self):
df = pd.DataFrame(data=self.util.get_data({"ct_credit_product": ["989"]}))
result = validate(self.phase1_schema, df)
ph2_result = validate(self.phase2_schema, df)
assert len(result) == 1
assert len(ph2_result) == 0

def test_with_multi_invalid_dataframe(self):
df = pd.DataFrame(
data=self.util.get_data(
{
"ct_credit_product": ["989"],
"num_principal_owners": ["1"],
"action_taken": ["2"],
}
)
)
result = validate(self.phase1_schema, df)
assert len(result) == 1

ph2_result = validate(self.phase2_schema, df)
assert len(ph2_result) == 3

def test_with_invalid_lei(self):
lei = "000TESTFIUIDDONOTUS1"
phase1_schema_by_lei = get_phase_1_schema_for_lei(lei)
phase2_schema_by_lei = get_phase_2_schema_for_lei(lei)
df = pd.DataFrame(data=self.util.get_data({"ct_credit_product": ["989"]}))
result = validate(phase1_schema_by_lei, df)
ph2_result = validate(phase2_schema_by_lei, df)
assert len(result) == 2
assert len(ph2_result) == 0


class TestValidatePhases:
util = TestUtil()

def test_with_valid_data(self):
result = validate_phases(pd.DataFrame(data=self.util.get_data()))

assert len(result) == 1
assert result[0] == self.util.valid_response

def test_with_valid_lei(self):
lei = "000TESTFIUIDDONOTUSE"
df = pd.DataFrame(data=self.util.get_data())
result = validate_phases(df, lei)
assert len(result) == 1
assert result[0] == self.util.valid_response

def test_with_invalid_data(self):
result = validate_phases(pd.DataFrame(data=self.util.get_data({"ct_credit_product": ["989"]})))

assert len(result) == 1
assert result[0] != self.util.valid_response

def test_with_multi_invalid_data_with_phase1(self):
result = validate_phases(
pd.DataFrame(
data=self.util.get_data(
{
"ct_credit_product": ["989"],
"num_principal_owners": ["1"],
"action_taken": ["2"],
}
)
)
)
# Should only return the phase 1 validation findings, since phase 1 failed.
assert len(result) == 1
assert result[0] != self.util.valid_response

def test_with_multi_invalid_data_with_phase2(self):
result = validate_phases(
pd.DataFrame(
data=self.util.get_data(
{
"num_principal_owners": ["1"],
"action_taken": ["2"],
}
)
),
)
# Since the data passes phase 1 validations,
# this should return the phase 2 findings.
assert len(result) == 3
assert result[0] != self.util.valid_response

def test_with_invalid_lei(self):
lei = "000TESTFIUIDDONOTUS1"
df = pd.DataFrame(data=self.util.get_data())
result = validate_phases(df, lei)
assert len(result) == 1
assert result[0] != self.util.valid_response
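For reference, a minimal way to exercise this new test module, assuming pytest is installed and src/ is the working directory so that the validator and tests packages resolve the same way the import at the top of the file does:

import pytest

# Run only the new schema-function tests, verbosely; roughly equivalent to
# running `python -m pytest tests/test_schema_functions.py -v` from within src/.
pytest.main(["tests/test_schema_functions.py", "-v"])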
113 changes: 90 additions & 23 deletions src/validator/create_schemas.py
@@ -1,6 +1,8 @@
"""Creates two DataFrameSchema objects by rendering the schema template
with validations listed in phase 1 and phase 2."""

import pandas as pd
from checks import SBLCheck
from pandera import DataFrameSchema
from pandera.errors import SchemaErrors
from phase_validations import get_phase_1_and_2_validations_for_lei
@@ -20,32 +22,97 @@ def get_schema_by_phase_for_lei(template: dict, phase: str, lei: str = None):
return DataFrameSchema(template)
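To make the template-to-schema relationship concrete, here is a rough sketch of the shape the rendered template takes, assuming standard pandera usage. The real column definitions and SBLCheck validations live in phase_validations.py and the schema template module, neither of which appears in this diff, so the column and allowed values below are illustrative only.

from pandera import Check, Column, DataFrameSchema

# Illustrative template: one column with a tiny subset of its allowed values.
template = {
    "ct_credit_product": Column(str, checks=[Check.isin(["1", "2", "988"])]),
}
schema = DataFrameSchema(template)

# Validating lazily collects every failure instead of stopping at the first one,
# which is how validate() below gathers all findings in a single pass:
# schema(df, lazy=True)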


def print_schema_errors(errors: SchemaErrors, phase: str):
for error in errors.schema_errors:
# The SchemaError object describing this failed check
schema_error = error["error"]
check_id = "n/a"

# Built-in checks such as unique=True are unfortunately handled differently
# than custom checks, so the name has to be accessed differently
try:
check_name = schema_error.check.name
check_id = schema_error.check.id
# This will either be a boolean series or a single bool
check_output = schema_error.check_output
except AttributeError:
check_name = schema_error.check
# this is just a string that we'd need to parse manually
check_output = schema_error.args[0]

print(f"{phase} Validation `{check_name}` with id: `{check_id}` failed for column `{{column_name}}`")
print(check_output)
print("")


def get_phase_1_schema_for_lei(lei: str = None):
return get_schema_by_phase_for_lei(phase_1_template, "phase_1", lei)


def get_phase_2_schema_for_lei(lei: str = None):
return get_schema_by_phase_for_lei(phase_2_template, "phase_2", lei)


def validate(schema: DataFrameSchema, df: pd.DataFrame):
"""
Validate the received dataframe against the given schema and return a list
of schema error findings.
Args:
schema (DataFrameSchema): schema to be used for validation
df (pd.DataFrame): data parsed into a dataframe
Returns:
list of schema error findings
"""
findings = []
try:
schema(df, lazy=True)
except SchemaErrors as errors:
for error in errors.schema_errors:
check: SBLCheck = error.check
column_name = error.schema.name
check_id = "n/a"

fields: list[str] = [column_name]

if hasattr(check, "name") and hasattr(check, "id"):
check_name: str = check.name
check_id: str = check.id

if check.groupby:
fields += check.groupby # type: ignore

# This will either be a boolean series or a single bool
check_output = error.check_output
else:
# This means the check's column has unique=True set. We shouldn't use the
# unique flag because it doesn't return a series of validation results;
# it only returns a printable summary string.
raise AttributeError(f"{str(check)}")

# Remove duplicates, but keep as `list` for JSON-friendliness
fields = list(set(fields))

if check_output is not None:
# `check_output` must be sorted so its index lines up with `df`'s index
check_output.sort_index(inplace=True)

# Filter records using pandas boolean indexing, which keeps the rows where
# the mask is True. The `~` inverts `check_output`, so it's the failing
# (False) records that are kept here.
# http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing
failed_check_fields_df = df[~check_output][fields].fillna("")

# Create list of dicts representing the failed validations and the
# associated field data for each invalid record.
records = []
for idx, row in failed_check_fields_df.iterrows():
record = {"number": idx + 1, "field_values": {}}
for field in fields:
record["field_values"][field] = row[field]
records.append(record)

validation_findings = {
"validation": {
"id": check_id,
"name": check_name,
"description": check.description,
"fields": fields,
"severity": "warning" if check.warning else "error",
},
"records": records,
}

findings.append(validation_findings)

return findings
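A toy illustration of the boolean-indexing step above, using made-up column values: check_output marks passing rows as True, so inverting it with `~` keeps only the failing records.

import pandas as pd

df = pd.DataFrame({"action_taken": ["1", "2", "5"], "amount_approved": ["", "10000", ""]})
fields = ["action_taken", "amount_approved"]
check_output = pd.Series([True, False, True])  # row 1 failed the (hypothetical) check

# Keep only the failing row, restricted to the fields tied to the check.
failed_check_fields_df = df[~check_output][fields].fillna("")
print(failed_check_fields_df)  # only index 1 remains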


def validate_phases(df: pd.DataFrame, lei: str = None) -> list:
phase1_findings = validate(get_phase_1_schema_for_lei(lei), df)
if phase1_findings:
return phase1_findings
else:
phase2_findings = validate(get_phase_2_schema_for_lei(lei), df)
if phase2_findings:
return phase2_findings
else:
return [{"response": "No validation errors or warnings"}]
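A minimal usage sketch of the new helpers, mirroring the imports in the test module above; whether `tests` is importable alongside `validator` depends on how the packages sit on the path, so treat that import as an assumption.

import pandas as pd

from tests.test_schema_functions import TestUtil  # assumed importable; supplies a valid default record
from validator.create_schemas import get_phase_1_schema_for_lei, validate, validate_phases

# One-row submission with a deliberately invalid ct_credit_product.
df = pd.DataFrame(data=TestUtil().get_data({"ct_credit_product": ["989"]}))

# Validate a single phase explicitly...
phase1_findings = validate(get_phase_1_schema_for_lei(), df)

# ...or let validate_phases gate phase 2 behind a clean phase 1 run.
# Here phase 1 fails, so only the phase 1 findings come back.
findings = validate_phases(df)

# Each finding follows the structure assembled in validate():
# {"validation": {"id", "name", "description", "fields", "severity"},
#  "records": [{"number": <1-based row>, "field_values": {...}}]}
print(findings)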
23 changes: 2 additions & 21 deletions src/validator/main.py
@@ -8,12 +8,7 @@
import sys

import pandas as pd
from create_schemas import (
get_phase_1_schema_for_lei,
get_phase_2_schema_for_lei,
print_schema_errors,
)
from pandera.errors import SchemaErrors
from create_schemas import validate_phases


def csv_to_df(path: str) -> pd.DataFrame:
@@ -32,21 +27,7 @@ def run_validation_on_df(df: pd.DataFrame, lei: str) -> None:
print(df)
print("")

phase_1_failure_cases = None

phase_1_sblar_schema = get_phase_1_schema_for_lei(lei)
try:
phase_1_sblar_schema(df, lazy=True)
except SchemaErrors as errors:
phase_1_failure_cases = errors.failure_cases
print_schema_errors(errors, "Phase 1")

if phase_1_failure_cases is None:
phase_2_sblar_schema = get_phase_2_schema_for_lei(lei)
try:
phase_2_sblar_schema(df, lazy=True)
except SchemaErrors as errors:
print_schema_errors(errors, "Phase 2")
print(validate_phases(df, lei))


if __name__ == "__main__":