Skip to content

Commit

Permalink
Simplify SBLCheck and make severity more explicit
Browse files Browse the repository at this point in the history
- Refactor `SBLCheck` to accept Pandera `Check` constructor's
  params, and set the severity explicitly instead of `warning` bool.
- Fixed the severity of several checks
- Improve exception handling for SchemaErrors
- Simple type checker fixups
  • Loading branch information
hkeeler committed Oct 16, 2023
1 parent ac668a0 commit bd8f3c0
Show file tree
Hide file tree
Showing 6 changed files with 292 additions and 178 deletions.
81 changes: 34 additions & 47 deletions regtech_data_validator/checks.py
Original file line number Diff line number Diff line change
@@ -1,71 +1,58 @@
"""Custom subclass for warnings and errors.
The class SBLCheck is a subclass of the standard Pandera Check class
that requires the `name` kwarg to be supplied. Errors and warnings are
distinguished based on the value of the warning attribute. It defaults
to false but can be set to True during init to indicate the validation
should be handled as a warning rather than an error.
Examples:
warning_check = SBLCheck(
lambda: True,
warning=True,
name="Just a Warning"
)
error_check_implied = SBLCheck(lambda: True, name="Error Check")
error_check_explicit = SBLCheck(
lambda: True,
warning=False,
name="Also an Error"
)
"""
Subclasses of Pandera's `Check` class
"""


from enum import Enum, auto
from typing import Any, Callable, Type

from pandera import Check
from pandera.backends.base import BaseCheckBackend
from pandera.backends.pandas.checks import PandasCheckBackend

class Severity(Enum):
ERROR = auto()
WARNING = auto()

class SBLCheck(Check):
"""A custom Pandera.Check subclasss that requires a `name` and an `id` be
"""
A Pandera.Check subclass that requires a `name` and an `id` be
specified. Additionally, an attribute named `warning` is added to
the class to enable distinction between warnings and errors. The
default value of warning is `False` which corresponds to an error.
Don't use this class directly. Make use of the SBLErrorCheck and
SBLWarningCheck subclasses below."""

def __init__(self, check_fn: Callable, id: str = None, warning=False, *args, **kwargs):
"""Custom init method that verifies the presence of `name` and `id` in
kwargs creates a custom class attribute called `warning`. All
other initialization is handled by the parent Check class.
SBLWarningCheck subclasses below.
"""

def __init__(self,
check_fn: Callable,
id: str,
name: str,
description: str,
severity: Severity,
**check_kwargs):
"""
Subclass of Pandera's `Check`, with special handling for severity level
Args:
check_fn (Callable): A function which evaluates the validity
of the column(s) being tested.
id (str, required): Each check must have an id.
warning (bool, optional): Boolean specifying whether to
treat the check as a warning rather than an error.
check_fn (Callable): A function which evaluates the validity of the column(s) being tested.
id (str, required): Unique identifier for a check
name (str, required): Unique name for a check
description (str, required): Long-form description of a check
severity (Severity, required): The severity of a check (error or warning)
check_kwargs (Any, optional): Parameters passed to `check_fn` function
Raises:
ValueError: Raised if `name` not supplied in kwargs and if id is not
supplied or None.
"""

self.id = id

if "name" not in kwargs or id is None:
raise ValueError("Each check must be assigned a `name` and an `id`.")

# if warning==False treat check as an error check
self.warning = warning
self.severity = severity

super().__init__(check_fn=check_fn, *args, **kwargs)
super().__init__(
check_fn,
title=id,
name=name,
description=description,
**check_kwargs
)

@classmethod
def get_backend(cls, check_obj: Any) -> Type[BaseCheckBackend]:
Expand Down
122 changes: 63 additions & 59 deletions regtech_data_validator/create_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import pandas as pd
from pandera import DataFrameSchema
from pandera.errors import SchemaErrors
from pandera.errors import SchemaErrors, SchemaError

from regtech_data_validator.checks import SBLCheck
from regtech_data_validator.phase_validations import get_phase_1_and_2_validations_for_lei
Expand All @@ -17,22 +17,22 @@
phase_2_template = get_template()


def get_schema_by_phase_for_lei(template: dict, phase: str, lei: str = None):
def get_schema_by_phase_for_lei(template: dict, phase: str, lei: str|None = None):
for column in get_phase_1_and_2_validations_for_lei(lei):
validations = get_phase_1_and_2_validations_for_lei(lei)[column]
template[column].checks = validations[phase]
return DataFrameSchema(template)


def get_phase_1_schema_for_lei(lei: str = None):
def get_phase_1_schema_for_lei(lei: str|None = None):
return get_schema_by_phase_for_lei(phase_1_template, "phase_1", lei)


def get_phase_2_schema_for_lei(lei: str = None):
def get_phase_2_schema_for_lei(lei: str|None = None):
return get_schema_by_phase_for_lei(phase_2_template, "phase_2", lei)


def validate(schema: DataFrameSchema, df: pd.DataFrame):
def validate(schema: DataFrameSchema, df: pd.DataFrame) -> list[dict]:
"""
validate received dataframe with schema and return list of
schema errors
Expand All @@ -42,73 +42,77 @@ def validate(schema: DataFrameSchema, df: pd.DataFrame):
df (pd.DataFrame): data parsed into dataframe
Returns:
list of schema error
list of validation findings (warnings and errors)
"""
findings = []
try:
schema(df, lazy=True)
except SchemaErrors as errors:
for error in errors.schema_errors:
check: SBLCheck = error.check
column_name = error.schema.name
check_id = "n/a"

fields: list[str] = [column_name]

if hasattr(check, "name") and hasattr(check, "id"):
check_name: str = check.name
check_id: str = check.id
except SchemaErrors as err:

# WARN: SchemaErrors.schema_errors is supposed to be of type
# list[dict[str,Any]], but it's actually of type SchemaError
schema_error: SchemaError
for schema_error in err.schema_errors: # type: ignore
check = schema_error.check
column_name = schema_error.schema.name

if not check:
raise RuntimeError(
f'SchemaError occurred with no associated Check for {column_name} column'
) from schema_error

if not isinstance(check, SBLCheck):
raise RuntimeError(
f'Check {check} type on {column_name} column not supported'
) from schema_error
else:
fields: list[str] = [column_name]

if check.groupby:
fields += check.groupby # type: ignore

# This will either be a boolean series or a single bool
check_output = error.check_output
else:
# This means this check's column has unique set to True.
# we shouldn't be using Unique flag as it doesn't return series of
# validation result . it returns just a printout result string/txt
raise AttributeError(f"{str(check)}")

# Remove duplicates, but keep as `list` for JSON-friendliness
fields = list(set(fields))

if check_output is not None:
# `check_output` must be sorted so its index lines up with `df`'s index
check_output.sort_index(inplace=True)

# Filter records using Pandas's boolean indexing, where all False values
# get filtered out. The `~` does the inverse since it's actually the
# False values we want to keep.
# http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing
failed_check_fields_df = df[~check_output][fields].fillna("")

# Create list of dicts representing the failed validations and the
# associated field data for each invalid record.
records = []
for idx, row in failed_check_fields_df.iterrows():
record = {"number": idx + 1, "field_values": {}}
for field in fields:
record["field_values"][field] = row[field]
records.append(record)

validation_findings = {
"validation": {
"id": check_id,
"name": check_name,
"description": check.description,
"fields": fields,
"severity": "warning" if check.warning else "error",
},
"records": records,
}

findings.append(validation_findings)
check_output = schema_error.check_output

# Remove duplicates, but keep as `list` for JSON-friendliness
fields = list(set(fields))

if check_output is not None:
# `check_output` must be sorted so its index lines up with `df`'s index
check_output.sort_index(inplace=True)

# Filter records using Pandas's boolean indexing, where all False values
# get filtered out. The `~` does the inverse since it's actually the
# False values we want to keep.
# http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing
failed_check_fields_df = df[~check_output][fields].fillna("")

# Create list of dicts representing the failed validations and the
# associated field data for each invalid record.
records = []
for idx, row in failed_check_fields_df.iterrows():
record = {"number": idx + 1, "field_values": {}}
for field in fields:
record["field_values"][field] = row[field]
records.append(record)

validation_findings = {
"validation": {
"id": check.title,
"name": check.name,
"description": check.description,
"severity": check.severity,
"fields": fields,
},
"records": records,
}

findings.append(validation_findings)

return findings


def validate_phases(df: pd.DataFrame, lei: str = None) -> list:
def validate_phases(df: pd.DataFrame, lei: str|None = None) -> list:
phase1_findings = validate(get_phase_1_schema_for_lei(lei), df)
if phase1_findings:
return phase1_findings
Expand Down
4 changes: 2 additions & 2 deletions regtech_data_validator/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def csv_to_df(path: str) -> pd.DataFrame:
return pd.read_csv(path, dtype=str, na_filter=False)


def run_validation_on_df(df: pd.DataFrame, lei: str) -> None:
def run_validation_on_df(df: pd.DataFrame, lei: str|None) -> None:
"""
Run validation on the supplied dataframe and print a report to
the terminal.
Expand All @@ -27,7 +27,7 @@ def run_validation_on_df(df: pd.DataFrame, lei: str) -> None:

if __name__ == "__main__":
csv_path = None
lei: str = None
lei: str|None = None
if len(sys.argv) == 1:
raise ValueError("csv_path arg not provided")
elif len(sys.argv) == 2:
Expand Down
Loading

0 comments on commit bd8f3c0

Please sign in to comment.