Skip to content

Commit

Permalink
Simplify SBLCheck and make severity more explicit
Browse files Browse the repository at this point in the history
- Refactor `SBLCheck` to accept Pandera `Check` constructor's
  params, and set the severity explicitly instead of `warning` bool.
- Fixed the severity of several checks
- Improve exception handling for SchemaErrors
- Simple type checker fixups
  • Loading branch information
hkeeler committed Oct 16, 2023
1 parent ac668a0 commit bd8f3c0
Show file tree
Hide file tree
Showing 6 changed files with 292 additions and 178 deletions.
81 changes: 34 additions & 47 deletions regtech_data_validator/checks.py
Original file line number Diff line number Diff line change
@@ -1,71 +1,58 @@
"""Custom subclass for warnings and errors.
The class SBLCheck is a subclass of the standard Pandera Check class
that requires the `name` kwarg to be supplied. Errors and warnings are
distinguished based on the value of the warning attribute. It defaults
to false but can be set to True during init to indicate the validation
should be handled as a warning rather than an error.
Examples:
warning_check = SBLCheck(
lambda: True,
warning=True,
name="Just a Warning"
)
error_check_implied = SBLCheck(lambda: True, name="Error Check")
error_check_explicit = SBLCheck(
lambda: True,
warning=False,
name="Also an Error"
)
"""
Subclasses of Pandera's `Check` class
"""


from enum import Enum, auto
from typing import Any, Callable, Type

from pandera import Check
from pandera.backends.base import BaseCheckBackend
from pandera.backends.pandas.checks import PandasCheckBackend

class Severity(Enum):
ERROR = auto()
WARNING = auto()

class SBLCheck(Check):
"""A custom Pandera.Check subclasss that requires a `name` and an `id` be
"""
A Pandera.Check subclass that requires a `name` and an `id` be
specified. Additionally, an attribute named `warning` is added to
the class to enable distinction between warnings and errors. The
default value of warning is `False` which corresponds to an error.
Don't use this class directly. Make use of the SBLErrorCheck and
SBLWarningCheck subclasses below."""

def __init__(self, check_fn: Callable, id: str = None, warning=False, *args, **kwargs):
"""Custom init method that verifies the presence of `name` and `id` in
kwargs creates a custom class attribute called `warning`. All
other initialization is handled by the parent Check class.
SBLWarningCheck subclasses below.
"""

def __init__(self,
check_fn: Callable,
id: str,
name: str,
description: str,
severity: Severity,
**check_kwargs):
"""
Subclass of Pandera's `Check`, with special handling for severity level
Args:
check_fn (Callable): A function which evaluates the validity
of the column(s) being tested.
id (str, required): Each check must have an id.
warning (bool, optional): Boolean specifying whether to
treat the check as a warning rather than an error.
check_fn (Callable): A function which evaluates the validity of the column(s) being tested.
id (str, required): Unique identifier for a check
name (str, required): Unique name for a check
description (str, required): Long-form description of a check
severity (Severity, required): The severity of a check (error or warning)
check_kwargs (Any, optional): Parameters passed to `check_fn` function
Raises:
ValueError: Raised if `name` not supplied in kwargs and if id is not
supplied or None.
"""

self.id = id

if "name" not in kwargs or id is None:
raise ValueError("Each check must be assigned a `name` and an `id`.")

# if warning==False treat check as an error check
self.warning = warning
self.severity = severity

super().__init__(check_fn=check_fn, *args, **kwargs)
super().__init__(
check_fn,
title=id,
name=name,
description=description,
**check_kwargs
)

@classmethod
def get_backend(cls, check_obj: Any) -> Type[BaseCheckBackend]:
Expand Down
122 changes: 63 additions & 59 deletions regtech_data_validator/create_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import pandas as pd
from pandera import DataFrameSchema
from pandera.errors import SchemaErrors
from pandera.errors import SchemaErrors, SchemaError

from regtech_data_validator.checks import SBLCheck
from regtech_data_validator.phase_validations import get_phase_1_and_2_validations_for_lei
Expand All @@ -17,22 +17,22 @@
phase_2_template = get_template()


def get_schema_by_phase_for_lei(template: dict, phase: str, lei: str = None):
def get_schema_by_phase_for_lei(template: dict, phase: str, lei: str|None = None):
for column in get_phase_1_and_2_validations_for_lei(lei):
validations = get_phase_1_and_2_validations_for_lei(lei)[column]
template[column].checks = validations[phase]
return DataFrameSchema(template)


def get_phase_1_schema_for_lei(lei: str = None):
def get_phase_1_schema_for_lei(lei: str|None = None):
return get_schema_by_phase_for_lei(phase_1_template, "phase_1", lei)


def get_phase_2_schema_for_lei(lei: str = None):
def get_phase_2_schema_for_lei(lei: str|None = None):
return get_schema_by_phase_for_lei(phase_2_template, "phase_2", lei)


def validate(schema: DataFrameSchema, df: pd.DataFrame):
def validate(schema: DataFrameSchema, df: pd.DataFrame) -> list[dict]:
"""
validate received dataframe with schema and return list of
schema errors
Expand All @@ -42,73 +42,77 @@ def validate(schema: DataFrameSchema, df: pd.DataFrame):
df (pd.DataFrame): data parsed into dataframe
Returns:
list of schema error
list of validation findings (warnings and errors)
"""
findings = []
try:
schema(df, lazy=True)
except SchemaErrors as errors:
for error in errors.schema_errors:
check: SBLCheck = error.check
column_name = error.schema.name
check_id = "n/a"

fields: list[str] = [column_name]

if hasattr(check, "name") and hasattr(check, "id"):
check_name: str = check.name
check_id: str = check.id
except SchemaErrors as err:

# WARN: SchemaErrors.schema_errors is supposed to be of type
# list[dict[str,Any]], but it's actually of type SchemaError
schema_error: SchemaError
for schema_error in err.schema_errors: # type: ignore
check = schema_error.check
column_name = schema_error.schema.name

if not check:
raise RuntimeError(
f'SchemaError occurred with no associated Check for {column_name} column'
) from schema_error

if not isinstance(check, SBLCheck):
raise RuntimeError(
f'Check {check} type on {column_name} column not supported'
) from schema_error
else:
fields: list[str] = [column_name]

if check.groupby:
fields += check.groupby # type: ignore

# This will either be a boolean series or a single bool
check_output = error.check_output
else:
# This means this check's column has unique set to True.
# we shouldn't be using Unique flag as it doesn't return series of
# validation result . it returns just a printout result string/txt
raise AttributeError(f"{str(check)}")

# Remove duplicates, but keep as `list` for JSON-friendliness
fields = list(set(fields))

if check_output is not None:
# `check_output` must be sorted so its index lines up with `df`'s index
check_output.sort_index(inplace=True)

# Filter records using Pandas's boolean indexing, where all False values
# get filtered out. The `~` does the inverse since it's actually the
# False values we want to keep.
# http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing
failed_check_fields_df = df[~check_output][fields].fillna("")

# Create list of dicts representing the failed validations and the
# associated field data for each invalid record.
records = []
for idx, row in failed_check_fields_df.iterrows():
record = {"number": idx + 1, "field_values": {}}
for field in fields:
record["field_values"][field] = row[field]
records.append(record)

validation_findings = {
"validation": {
"id": check_id,
"name": check_name,
"description": check.description,
"fields": fields,
"severity": "warning" if check.warning else "error",
},
"records": records,
}

findings.append(validation_findings)
check_output = schema_error.check_output

# Remove duplicates, but keep as `list` for JSON-friendliness
fields = list(set(fields))

if check_output is not None:
# `check_output` must be sorted so its index lines up with `df`'s index
check_output.sort_index(inplace=True)

# Filter records using Pandas's boolean indexing, where all False values
# get filtered out. The `~` does the inverse since it's actually the
# False values we want to keep.
# http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing
failed_check_fields_df = df[~check_output][fields].fillna("")

# Create list of dicts representing the failed validations and the
# associated field data for each invalid record.
records = []
for idx, row in failed_check_fields_df.iterrows():
record = {"number": idx + 1, "field_values": {}}
for field in fields:
record["field_values"][field] = row[field]
records.append(record)

validation_findings = {
"validation": {
"id": check.title,
"name": check.name,
"description": check.description,
"severity": check.severity,
"fields": fields,
},
"records": records,
}

findings.append(validation_findings)

return findings


def validate_phases(df: pd.DataFrame, lei: str = None) -> list:
def validate_phases(df: pd.DataFrame, lei: str|None = None) -> list:
phase1_findings = validate(get_phase_1_schema_for_lei(lei), df)
if phase1_findings:
return phase1_findings
Expand Down
4 changes: 2 additions & 2 deletions regtech_data_validator/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def csv_to_df(path: str) -> pd.DataFrame:
return pd.read_csv(path, dtype=str, na_filter=False)


def run_validation_on_df(df: pd.DataFrame, lei: str) -> None:
def run_validation_on_df(df: pd.DataFrame, lei: str|None) -> None:
"""
Run validation on the supplied dataframe and print a report to
the terminal.
Expand All @@ -27,7 +27,7 @@ def run_validation_on_df(df: pd.DataFrame, lei: str) -> None:

if __name__ == "__main__":
csv_path = None
lei: str = None
lei: str|None = None
if len(sys.argv) == 1:
raise ValueError("csv_path arg not provided")
elif len(sys.argv) == 2:
Expand Down
Loading

0 comments on commit bd8f3c0

Please sign in to comment.