From 0ff0392411b63d5275f65bf9d78a0f16e4b7617c Mon Sep 17 00:00:00 2001 From: Nargis Sultani Date: Fri, 15 Sep 2023 12:41:12 -0400 Subject: [PATCH] Task 42: recreating the PR --- src/validator/create_schemas.py | 92 +++++++++++++++++++++++++++++++++ src/validator/main.py | 17 +----- 2 files changed, 94 insertions(+), 15 deletions(-) diff --git a/src/validator/create_schemas.py b/src/validator/create_schemas.py index e582747b..22a27827 100644 --- a/src/validator/create_schemas.py +++ b/src/validator/create_schemas.py @@ -1,6 +1,7 @@ """Creates two DataFrameSchema objects by rendering the schema template with validations listed in phase 1 and phase 2.""" +import pandas as pd from pandera import DataFrameSchema from pandera.errors import SchemaErrors from phase_validations import get_phase_1_and_2_validations_for_lei @@ -49,3 +50,94 @@ def get_phase_1_schema_for_lei(lei: str = None): def get_phase_2_schema_for_lei(lei: str = None): return get_schema_by_phase_for_lei(phase_2_template, "phase_2", lei) + + +def validate(schema: DataFrameSchema, df: pd.DataFrame): + """ + validate received dataframe with schema and return list of + schema errors + + Args: + schema (DataFrameSchema): schema to be used for validation + df (pd.DataFrame): data parsed into dataframe + + Returns: + list of schema error + """ + findings = [] + try: + schema(df, lazy=True) + except SchemaErrors as errors: + for schema_error in errors.schema_errors: + error = schema_error["error"] + check: SBLCheck = error.check + column_name = error.schema.name + check_id = "n/a" + + fields: list[str] = [column_name] + + if hasattr(check, "name"): + check_name: str = check.name + + if check.groupby: + fields += check.groupby # type: ignore + + # This will either be a boolean series or a single bool + check_output = error.check_output + else: + # This means this check's column has unique set to True. + # we shouldn't be using Unique flag as it doesn't return series of + # validation result . it returns just a printout result string/txt + raise AttributeError(f"{str(check)}") + + if hasattr(check, "id"): + check_id: str = check.id + + # Remove duplicates, but keep as `list` for JSON-friendliness + fields = list(set(fields)) + + if check_output is not None: + # `check_output` must be sorted so its index lines up with `df`'s index + check_output.sort_index(inplace=True) + + # Filter records using Pandas's boolean indexing, where all False values + # get filtered out. The `~` does the inverse since it's actually the + # False values we want to keep. + # http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing + failed_check_fields_df = df[~check_output][fields].fillna("") + + # Create list of dicts representing the failed validations and the + # associated field data for each invalid record. + records = [] + for idx, row in failed_check_fields_df.iterrows(): + record = {"number": idx + 1, "field_values": {}} + for field in fields: + record["field_values"][field] = row[field] + records.append(record) + + validation_findings = { + "validation": { + "id": check_id, + "name": check_name, + "description": check.description, + "fields": fields, + "severity": "warning" if check.warning else "error", + }, + "records": records, + } + + findings.append(validation_findings) + + return findings + + +def validate_phases_by_lei(df: pd.DataFrame, lei: str) -> list: + phase1_findings = validate(get_phase_1_schema_for_lei(lei), df) + if phase1_findings: + return phase1_findings + else: + phase2_findings = validate(get_phase_2_schema_for_lei((lei)), df) + if phase2_findings: + return phase2_findings + else: + return [{"response": "No validations errors or warnings"}] diff --git a/src/validator/main.py b/src/validator/main.py index 433275be..58fa0b7a 100644 --- a/src/validator/main.py +++ b/src/validator/main.py @@ -12,6 +12,7 @@ get_phase_1_schema_for_lei, get_phase_2_schema_for_lei, print_schema_errors, + validate_phases_by_lei, ) from pandera.errors import SchemaErrors @@ -32,21 +33,7 @@ def run_validation_on_df(df: pd.DataFrame, lei: str) -> None: print(df) print("") - phase_1_failure_cases = None - - phase_1_sblar_schema = get_phase_1_schema_for_lei(lei) - try: - phase_1_sblar_schema(df, lazy=True) - except SchemaErrors as errors: - phase_1_failure_cases = errors.failure_cases - print_schema_errors(errors, "Phase 1") - - if phase_1_failure_cases is None: - phase_2_sblar_schema = get_phase_2_schema_for_lei(lei) - try: - phase_2_sblar_schema(df, lazy=True) - except SchemaErrors as errors: - print_schema_errors(errors, "Phase 2") + print(validate_phases_by_lei(df, lei)) if __name__ == "__main__":