From ba6a1c495ba243fd0a84f59572106429229bfd77 Mon Sep 17 00:00:00 2001 From: Hans Keeler Date: Fri, 20 Oct 2023 19:56:46 -0400 Subject: [PATCH] refactor: standardize repo structure and other prep for open-sourcing (#60) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Grab bag of tune-up in prep for open-sourcing this repo. 1. Restructure repo to be more compliant with modern Python projects. 1. Move `tests` out to top-level directory. 2. Rename `src/validator` to `regtech_data_validator`. 2. Consolidate external datasource code and data to `data` dir. 1. Move `config.py` settings into their respective scripts, and file paths are now passed in as CLI args instead. 3. Move processed CSV files into the project itself. This allows for simpler data lookups via package name via `importlib.resources`. This allowed the removal of the `ROOT_PATH` Python path logic in all of the `__init__.py`s. 4. Refactor `global_data.py` to load data only once where module is first imported. 5. Refactor `SBLCheck`'s 1. `warning: bool` for a more explicit `severity`, backed by an enum that only allows `ERROR` and `WARNING`. 1. Several of the warning-level validations were not setting `warning=True`, and were thus defaulting to `False`. This will prevent that. I also fixed all these instances. 2. Removes the need for translation to `severity` when building JSON output. 2. Use explicit args in the constructor, and pass all shared args on to parent class. This removes the need for the arg `name`/`id` error handling. 6. Switch CLI output from Python dict to JSON. 7. Rollback `black` version used in linting Action due to bug in latest version. - https://github.com/psf/black/issues/3953 **Note:** Some of the files that I both moved _and_ changed seem to now show as having deleted the old file and created a new one. I'm not sure why it's doing this. 
I did the moves and changes in separate commits, which usually prevents this, but doesn't seem to be the case here. Perhaps there's just so much change in some that git considers it a whole new file? 🤷 It's kind of annoying, especially if it results in losing git history for those files. --- .devcontainer/devcontainer.json | 2 +- .github/workflows/linters.yml | 5 +- config.py | 19 -- data/census/README.md | 3 + {tools => data/census}/process_census.py | 65 ++--- data/naics/README.md | 3 + data/naics/process_naics.py | 57 +++++ poetry.lock | 29 ++- pyproject.toml | 12 +- .../__init__.py | 0 .../check_functions.py | 0 regtech_data_validator/checks.py | 47 ++++ .../create_schemas.py | 69 ++--- .../data/census}/Census2022.processed.csv | 0 .../data/census}/__init__.py | 0 .../data/naics}/2022_codes.csv | 0 .../data/naics/__init__.py | 0 regtech_data_validator/global_data.py | 20 ++ .../main.py | 20 +- .../phase_validations.py | 236 ++++++++++++++---- .../schema_template.py | 7 +- src/tests/__init__.py | 6 - src/tests/test_checks.py | 27 -- src/tests/test_global_data.py | 25 -- src/validator/__init__.py | 5 - src/validator/checks.py | 73 ------ src/validator/global_data.py | 36 --- tests/__init__.py | 0 .../data/sbl-validations-fail.csv | 0 .../data/sbl-validations-pass.csv | 0 {src/tests => tests}/test_check_functions.py | 8 +- tests/test_global_data.py | 9 + {src/tests => tests}/test_sample_data.py | 12 +- {src/tests => tests}/test_schema_functions.py | 7 +- tools/process_naics.py | 49 ---- 35 files changed, 469 insertions(+), 382 deletions(-) delete mode 100644 config.py create mode 100644 data/census/README.md rename {tools => data/census}/process_census.py (56%) create mode 100644 data/naics/README.md create mode 100644 data/naics/process_naics.py rename __init__.py => regtech_data_validator/__init__.py (100%) rename {src/validator => regtech_data_validator}/check_functions.py (100%) create mode 100644 regtech_data_validator/checks.py rename {src/validator => 
regtech_data_validator}/create_schemas.py (63%) rename {data/census/processed => regtech_data_validator/data/census}/Census2022.processed.csv (100%) rename {tools => regtech_data_validator/data/census}/__init__.py (100%) rename {data/naics/processed => regtech_data_validator/data/naics}/2022_codes.csv (100%) rename src/tests/.gitkeep => regtech_data_validator/data/naics/__init__.py (100%) create mode 100644 regtech_data_validator/global_data.py rename {src/validator => regtech_data_validator}/main.py (71%) rename {src/validator => regtech_data_validator}/phase_validations.py (93%) rename {src/validator => regtech_data_validator}/schema_template.py (98%) delete mode 100644 src/tests/__init__.py delete mode 100644 src/tests/test_checks.py delete mode 100644 src/tests/test_global_data.py delete mode 100644 src/validator/__init__.py delete mode 100644 src/validator/checks.py delete mode 100644 src/validator/global_data.py create mode 100644 tests/__init__.py rename {src/tests => tests}/data/sbl-validations-fail.csv (100%) rename {src/tests => tests}/data/sbl-validations-pass.csv (100%) rename {src/tests => tests}/test_check_functions.py (99%) create mode 100644 tests/test_global_data.py rename {src/tests => tests}/test_sample_data.py (82%) rename {src/tests => tests}/test_schema_functions.py (98%) delete mode 100644 tools/process_naics.py diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 6fc52aad..9656043b 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -49,7 +49,7 @@ "python.testing.unittestEnabled": false, "python.testing.pytestArgs": [ "--rootdir", - "${workspaceFolder}/src/tests" + "${workspaceFolder}/tests" ] } } diff --git a/.github/workflows/linters.yml b/.github/workflows/linters.yml index 2f242985..db76efdf 100644 --- a/.github/workflows/linters.yml +++ b/.github/workflows/linters.yml @@ -8,8 +8,11 @@ jobs: steps: - uses: actions/checkout@v3 - uses: psf/black@stable + with: + options: 
"--check --diff --verbose" + version: "~= 22.0" ruff: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - uses: chartboost/ruff-action@v1 \ No newline at end of file + - uses: chartboost/ruff-action@v1 diff --git a/config.py b/config.py deleted file mode 100644 index 00a125f2..00000000 --- a/config.py +++ /dev/null @@ -1,19 +0,0 @@ -# path to original/raw NAICS excel file -NAICS_EXCEL_PATH = "./data/naics/raw/2-6 digit_2022_Codes.xlsx" -# path to parsed/filtered naics codes file -NAICS_CSV_PATH = "./data/naics/processed/2022_codes.csv" -# column header text containing naics code -NAICS_CODE_COL = "2022 NAICS US Code" -# column header text containing naics title/description -NAICS_TITLE_COL = "2022 NAICS US Title" - -# path to original/raw NAICS zip file -CENSUS_RAW_ZIP_PATH = "./data/census/raw/CensusFlatFile2022.zip" -# path to parsed/filtered naics codes file -CENSUS_PROCESSED_CSV_PATH = "./data/census/processed/Census2022.processed.csv" -# census file col indexes -CENSUS_STATE_COL_INDEX = 2 -CENSUS_COUNTY_COL_INDEX = 3 -CENSUS_TRACT_COL_INDEX = 4 - -CENSUS_GEOID_COL = "geoid" diff --git a/data/census/README.md b/data/census/README.md new file mode 100644 index 00000000..0b57af2e --- /dev/null +++ b/data/census/README.md @@ -0,0 +1,3 @@ +# FFIEC's Census Flat File + +- https://www.ffiec.gov/censusapp.htm diff --git a/tools/process_census.py b/data/census/process_census.py similarity index 56% rename from tools/process_census.py rename to data/census/process_census.py index 0686b9c5..95a1445d 100644 --- a/tools/process_census.py +++ b/data/census/process_census.py @@ -5,10 +5,12 @@ import pandas as pd -ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # noqa: E402 -sys.path.append(ROOT_DIR) # noqa: E402 +# census file col indexes +CENSUS_STATE_COL_INDEX = 2 +CENSUS_COUNTY_COL_INDEX = 3 +CENSUS_TRACT_COL_INDEX = 4 -import config # noqa: E402 +CENSUS_GEOID_COL = "geoid" # helper function to check number (float/int/negative) @@ -21,24 
+23,22 @@ def _is_number(s): # helper function to unzip census file and extract CSV file -def _extract_census_zip_file(): - CENSUS_TMP_CSV_PATH = config.CENSUS_RAW_ZIP_PATH + ".tmp.csv" +def _extract_census_zip_file(raw_src): + census_tmp_csv_path = raw_src + ".tmp.csv" # unzip and extract csv files - with zipfile.ZipFile(config.CENSUS_RAW_ZIP_PATH, "r") as zip_ref: + with zipfile.ZipFile(raw_src, "r") as zip_ref: for file in zip_ref.namelist(): # iterate over files in archive if file[-4:] == ".csv": - print("Extracting CSV to {}".format(CENSUS_TMP_CSV_PATH)) - with open(CENSUS_TMP_CSV_PATH, "wb") as outfile: + print("Extracting CSV to {}".format(census_tmp_csv_path)) + with open(census_tmp_csv_path, "wb") as outfile: outfile.write(zip_ref.read(file)) - # it should only have one csv file - return CENSUS_TMP_CSV_PATH + # it should only have one csv file + + return census_tmp_csv_path # helper function to read extracted csv file and filter only geo-tract-id -def _read_census_csv(src_path: str, csv_path: str): - STATE_COL = config.CENSUS_STATE_COL_INDEX - COUNTY_COL = config.CENSUS_COUNTY_COL_INDEX - TRACT_COL = config.CENSUS_TRACT_COL_INDEX +def _process_census_csv(src_path: str, csv_path: str): # check paths if not os.path.isfile(src_path): @@ -52,14 +52,14 @@ def _read_census_csv(src_path: str, csv_path: str): ) # add header - result = [[config.CENSUS_GEOID_COL]] + result = [[CENSUS_GEOID_COL]] # read excel file # and create csv data list for index, row in df.iterrows(): - state_value = str(row[STATE_COL]) - county_value = str(row[COUNTY_COL]) - tract_value = str(row[TRACT_COL]) + state_value = str(row[CENSUS_STATE_COL_INDEX]) + county_value = str(row[CENSUS_COUNTY_COL_INDEX]) + tract_value = str(row[CENSUS_TRACT_COL_INDEX]) if ( _is_number(state_value) and _is_number(county_value) @@ -84,14 +84,23 @@ def _read_census_csv(src_path: str, csv_path: str): - output to defined output file """ if __name__ == "__main__": - CSV_PATH = config.CENSUS_PROCESSED_CSV_PATH - - 
if os.path.isfile(CSV_PATH): - error_msg = "Output {} csv file existed".format(CSV_PATH) - raise FileExistsError(error_msg) - - tmp_census_csv_file = _extract_census_zip_file() - print("Reading extracted CSV File . {}".format(tmp_census_csv_file)) - _read_census_csv(tmp_census_csv_file, CSV_PATH) - print("Removing extracted CSV File") + if len(sys.argv) != 3: + print(f"Usage: {sys.argv[0]} <raw_src> <csv_dest>") + exit(1) + + raw_src = sys.argv[1] + csv_dest = sys.argv[2] + + if not os.path.isfile(raw_src): + print(f"source file not existed: {raw_src}") + exit(2) + + if os.path.isfile(csv_dest): + print(f"destination file already existed: {csv_dest}") + exit(3) + + tmp_census_csv_file = _extract_census_zip_file(raw_src) + print(f"Reading extracted CSV file: {tmp_census_csv_file}") + _process_census_csv(tmp_census_csv_file, csv_dest) + print("Removing extracted CSV file") + os.remove(tmp_census_csv_file) diff --git a/data/naics/README.md b/data/naics/README.md new file mode 100644 index 00000000..fce44290 --- /dev/null +++ b/data/naics/README.md @@ -0,0 +1,3 @@ +# North American Industry Classification System (NAICS) codes + +- https://www.census.gov/naics/?48967 diff --git a/data/naics/process_naics.py b/data/naics/process_naics.py new file mode 100644 index 00000000..f6e1a251 --- /dev/null +++ b/data/naics/process_naics.py @@ -0,0 +1,57 @@ +import csv +import os +import sys + +import pandas as pd + + +# column header text containing naics code +NAICS_CODE_COL = "2022 NAICS US Code" +# column header text containing naics title/description +NAICS_TITLE_COL = "2022 NAICS US Title" + + +""" +filter NAICS data with only 3 digit codes + +Raises: + FileNotFoundError: when input excel file not existed + FileExistsError: when output csv file existed +""" +if __name__ == "__main__": + if len(sys.argv) != 3: + print(f"Usage: {sys.argv[0]} <raw_src> <csv_dest>") + exit(1) + + raw_src = sys.argv[1] + csv_dest = sys.argv[2] + + if not os.path.isfile(raw_src): + print(f"source file not existed: {raw_src}") + exit(2) + + 
if os.path.isfile(csv_dest): + print(f"destination file already existed: {csv_dest}") + exit(3) + + df = pd.read_excel(raw_src, dtype=str, na_filter=False) + + print(f'source file successfully read: {raw_src}') + + # add header + result = [["code", "title"]] + + # read excel file + # and create csv data list + for index, row in df.iterrows(): + code = str(row[NAICS_CODE_COL]) + if len(code) == 3: + a_row = [code, str(row[NAICS_TITLE_COL])] + result.append(a_row) + + # output data to csv file + with open(csv_dest, "w") as f: + writer = csv.writer(f) + writer.writerows(result) + + print(f'destination file successfully written: {csv_dest}') diff --git a/poetry.lock b/poetry.lock index 7fcbe376..600691d0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. [[package]] name = "black" @@ -136,6 +136,17 @@ files = [ [package.extras] toml = ["tomli"] +[[package]] +name = "et-xmlfile" +version = "1.1.0" +description = "An implementation of lxml.xmlfile for the standard library" +optional = false +python-versions = ">=3.6" +files = [ + {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"}, + {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"}, +] + [[package]] name = "iniconfig" version = "2.0.0" @@ -203,6 +214,20 @@ files = [ {file = "numpy-1.25.2.tar.gz", hash = "sha256:fd608e19c8d7c55021dffd43bfe5492fab8cc105cc8986f813f8c3c048b38760"}, ] +[[package]] +name = "openpyxl" +version = "3.1.2" +description = "A Python library to read/write Excel 2010 xlsx/xlsm files" +optional = false +python-versions = ">=3.6" +files = [ + {file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"}, + 
{file = "openpyxl-3.1.2.tar.gz", hash = "sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"}, +] + +[package.dependencies] +et-xmlfile = "*" + [[package]] name = "packaging" version = "23.1" @@ -642,4 +667,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "03e6adb7dcecd12194f8c44033d68666019c5bb52f8fd4bccd7301067832c9e1" +content-hash = "ac6360d9068e34f6bbad74a6c3339a85dd1968267f7272b48b8a99dfc5702812" diff --git a/pyproject.toml b/pyproject.toml index b84d616d..ea4c165f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,9 @@ pytest-cov = "4.1.0" black = "23.3.0" ruff = "0.0.259" +[tool.poetry.group.data.dependencies] +openpyxl = "^3.1.2" + [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" @@ -33,7 +36,6 @@ exclude = ''' | .gitignore | .github | data - | tools )/ ''' @@ -49,18 +51,18 @@ addopts = [ "--cov-branch", "--cov-report=xml", "--cov-report=term", - "--cov=src", + "--cov=regtech_data_validator", "-vv", "--strict-markers", "-rfE", ] testpaths = [ - "src/tests", + "tests", ] [tool.coverage.run] relative_files = true -source = ["src"] +source = ["regtech_data_validator"] [tool.coverage.report] -skip_empty = true \ No newline at end of file +skip_empty = true diff --git a/__init__.py b/regtech_data_validator/__init__.py similarity index 100% rename from __init__.py rename to regtech_data_validator/__init__.py diff --git a/src/validator/check_functions.py b/regtech_data_validator/check_functions.py similarity index 100% rename from src/validator/check_functions.py rename to regtech_data_validator/check_functions.py diff --git a/regtech_data_validator/checks.py b/regtech_data_validator/checks.py new file mode 100644 index 00000000..802dcbc7 --- /dev/null +++ b/regtech_data_validator/checks.py @@ -0,0 +1,47 @@ +""" +Subclasses of Pandera's `Check` class +""" + +from enum import StrEnum +from typing import Any, Callable, Type + +from pandera import Check +from 
pandera.backends.base import BaseCheckBackend +from pandera.backends.pandas.checks import PandasCheckBackend + + +class Severity(StrEnum): + ERROR = 'error' + WARNING = 'warning' + + +class SBLCheck(Check): + """ + A Pandera.Check subclass that requires a `name` and an `id` be + specified. Additionally, an attribute named `severity` is added to + the class to enable distinction between warnings and errors, backed + by the `Severity` enum which only allows `ERROR` and `WARNING`. + Since `severity` is a required constructor argument, every check + must explicitly declare whether it is an error or a warning. + """ + + def __init__(self, check_fn: Callable, id: str, name: str, description: str, severity: Severity, **check_kwargs): + """ + Subclass of Pandera's `Check`, with special handling for severity level + Args: + check_fn (Callable): A function which evaluates the validity of the column(s) being tested. + id (str, required): Unique identifier for a check + name (str, required): Unique name for a check + description (str, required): Long-form description of a check + severity (Severity, required): The severity of a check (error or warning) + check_kwargs (Any, optional): Parameters passed to `check_fn` function + """ + + self.severity = severity + + super().__init__(check_fn, title=id, name=name, description=description, **check_kwargs) + + @classmethod + def get_backend(cls, check_obj: Any) -> Type[BaseCheckBackend]: + """Assume Pandas DataFrame and return PandasCheckBackend""" + return PandasCheckBackend diff --git a/src/validator/create_schemas.py b/regtech_data_validator/create_schemas.py similarity index 63% rename from src/validator/create_schemas.py rename to regtech_data_validator/create_schemas.py index cc3cf8d7..bbb5e99d 100644 --- a/src/validator/create_schemas.py +++ b/regtech_data_validator/create_schemas.py @@ -2,11 +2,13 @@ with validations listed in phase 1 and phase 2.""" import pandas as pd -from checks import SBLCheck from pandera import DataFrameSchema -from 
pandera.errors import SchemaErrors -from phase_validations import get_phase_1_and_2_validations_for_lei -from schema_template import get_template +from pandera.errors import SchemaErrors, SchemaError + +from regtech_data_validator.checks import SBLCheck +from regtech_data_validator.phase_validations import get_phase_1_and_2_validations_for_lei +from regtech_data_validator.schema_template import get_template + # Get separate schema templates for phase 1 and 2 @@ -15,58 +17,59 @@ phase_2_template = get_template() -def get_schema_by_phase_for_lei(template: dict, phase: str, lei: str = None): +def get_schema_by_phase_for_lei(template: dict, phase: str, lei: str | None = None): for column in get_phase_1_and_2_validations_for_lei(lei): validations = get_phase_1_and_2_validations_for_lei(lei)[column] template[column].checks = validations[phase] return DataFrameSchema(template) -def get_phase_1_schema_for_lei(lei: str = None): +def get_phase_1_schema_for_lei(lei: str | None = None): return get_schema_by_phase_for_lei(phase_1_template, "phase_1", lei) -def get_phase_2_schema_for_lei(lei: str = None): +def get_phase_2_schema_for_lei(lei: str | None = None): return get_schema_by_phase_for_lei(phase_2_template, "phase_2", lei) -def validate(schema: DataFrameSchema, df: pd.DataFrame): +def validate(schema: DataFrameSchema, df: pd.DataFrame) -> list[dict]: """ validate received dataframe with schema and return list of schema errors - Args: schema (DataFrameSchema): schema to be used for validation df (pd.DataFrame): data parsed into dataframe - Returns: - list of schema error + list of validation findings (warnings and errors) """ findings = [] try: schema(df, lazy=True) - except SchemaErrors as errors: - for error in errors.schema_errors: - check: SBLCheck = error.check - column_name = error.schema.name - check_id = "n/a" + except SchemaErrors as err: + # WARN: SchemaErrors.schema_errors is supposed to be of type + # list[dict[str,Any]], but it's actually of type SchemaError + 
schema_error: SchemaError + for schema_error in err.schema_errors: # type: ignore + check = schema_error.check + column_name = schema_error.schema.name + + if not check: + raise RuntimeError( + f'SchemaError occurred with no associated Check for {column_name} column' + ) from schema_error + + if not isinstance(check, SBLCheck): + raise RuntimeError( + f'Check {check} type on {column_name} column not supported. Must be of type {SBLCheck}' + ) from schema_error fields: list[str] = [column_name] - if hasattr(check, "name") and hasattr(check, "id"): - check_name: str = check.name - check_id: str = check.id - - if check.groupby: - fields += check.groupby # type: ignore + if check.groupby: + fields += check.groupby # type: ignore - # This will either be a boolean series or a single bool - check_output = error.check_output - else: - # This means this check's column has unique set to True. - # we shouldn't be using Unique flag as it doesn't return series of - # validation result . it returns just a printout result string/txt - raise AttributeError(f"{str(check)}") + # This will either be a boolean series or a single bool + check_output = schema_error.check_output # Remove duplicates, but keep as `list` for JSON-friendliness fields = list(set(fields)) @@ -92,11 +95,11 @@ def validate(schema: DataFrameSchema, df: pd.DataFrame): validation_findings = { "validation": { - "id": check_id, - "name": check_name, + "id": check.title, + "name": check.name, "description": check.description, + "severity": check.severity, "fields": fields, - "severity": "warning" if check.warning else "error", }, "records": records, } @@ -106,7 +109,7 @@ def validate(schema: DataFrameSchema, df: pd.DataFrame): return findings -def validate_phases(df: pd.DataFrame, lei: str = None) -> list: +def validate_phases(df: pd.DataFrame, lei: str | None = None) -> list: phase1_findings = validate(get_phase_1_schema_for_lei(lei), df) if phase1_findings: return phase1_findings diff --git 
a/data/census/processed/Census2022.processed.csv b/regtech_data_validator/data/census/Census2022.processed.csv similarity index 100% rename from data/census/processed/Census2022.processed.csv rename to regtech_data_validator/data/census/Census2022.processed.csv diff --git a/tools/__init__.py b/regtech_data_validator/data/census/__init__.py similarity index 100% rename from tools/__init__.py rename to regtech_data_validator/data/census/__init__.py diff --git a/data/naics/processed/2022_codes.csv b/regtech_data_validator/data/naics/2022_codes.csv similarity index 100% rename from data/naics/processed/2022_codes.csv rename to regtech_data_validator/data/naics/2022_codes.csv diff --git a/src/tests/.gitkeep b/regtech_data_validator/data/naics/__init__.py similarity index 100% rename from src/tests/.gitkeep rename to regtech_data_validator/data/naics/__init__.py diff --git a/regtech_data_validator/global_data.py b/regtech_data_validator/global_data.py new file mode 100644 index 00000000..d02d4fc7 --- /dev/null +++ b/regtech_data_validator/global_data.py @@ -0,0 +1,20 @@ +import csv +from importlib.resources import files + + +# global variable for NAICS codes +naics_codes: dict[str, str] = {} +naics_file_path = files('regtech_data_validator.data.naics').joinpath('2022_codes.csv') + +with naics_file_path.open('r') as f: + for row in csv.DictReader(f): + naics_codes[row['code']] = row['title'] + + +# global variable for Census GEOIDs +census_geoids: set[str] = set() +census_file_path = files('regtech_data_validator.data.census').joinpath('Census2022.processed.csv') + +with census_file_path.open('r') as f: + for row in csv.DictReader(f): + census_geoids.add(row['geoid']) diff --git a/src/validator/main.py b/regtech_data_validator/main.py similarity index 71% rename from src/validator/main.py rename to regtech_data_validator/main.py index 20972759..bed8bb3a 100644 --- a/src/validator/main.py +++ b/regtech_data_validator/main.py @@ -5,29 +5,33 @@ Run from the terminal to see 
the generated output. """ -import pprint +import json import sys import pandas as pd -from create_schemas import validate_phases + +from regtech_data_validator.create_schemas import validate_phases def csv_to_df(path: str) -> pd.DataFrame: return pd.read_csv(path, dtype=str, na_filter=False) -def run_validation_on_df(df: pd.DataFrame, lei: str) -> None: +def run_validation_on_df(df: pd.DataFrame, lei: str | None) -> None: """ Run validation on the supplied dataframe and print a report to the terminal. """ - pprint.pprint(validate_phases(df, lei)) + validation_dict = validate_phases(df, lei) + validation_json = json.dumps(validation_dict, indent=4) + print(validation_json) -if __name__ == "__main__": + +def main(): csv_path = None - lei: str = None + lei: str | None = None if len(sys.argv) == 1: raise ValueError("csv_path arg not provided") elif len(sys.argv) == 2: @@ -40,3 +44,7 @@ def run_validation_on_df(df: pd.DataFrame, lei: str) -> None: df = csv_to_df(csv_path) run_validation_on_df(df, lei) + + +if __name__ == "__main__": + main() diff --git a/src/validator/phase_validations.py b/regtech_data_validator/phase_validations.py similarity index 93% rename from src/validator/phase_validations.py rename to regtech_data_validator/phase_validations.py index fc7b0a22..20b23c06 100644 --- a/src/validator/phase_validations.py +++ b/regtech_data_validator/phase_validations.py @@ -4,8 +4,8 @@ an instance of a PanderaSchema object for phase 1 and phase 2.""" -import global_data -from check_functions import ( +from regtech_data_validator import global_data +from regtech_data_validator.check_functions import ( has_correct_length, has_no_conditional_field_conflict, has_valid_enum_pair, @@ -28,13 +28,10 @@ meets_multi_value_field_restriction, string_contains, ) -from checks import SBLCheck +from regtech_data_validator.checks import SBLCheck, Severity -# read and populate global naics code (this should be called only once) -global_data.read_naics_codes() - -def 
get_phase_1_and_2_validations_for_lei(lei: str = None): +def get_phase_1_and_2_validations_for_lei(lei: str | None = None): return { "uid": { "phase_1": [ @@ -46,6 +43,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "Any 'unique identifier' may not be used in more than one " "record within a small business lending application register." ), + severity=Severity.ERROR, groupby="uid", ), SBLCheck.str_length( @@ -57,6 +55,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "'Unique identifier' must be at least 21 characters " "in length and at most 45 characters in length." ), + severity=Severity.ERROR, ), SBLCheck( has_valid_format, @@ -67,6 +66,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "numbers and/or uppercase letters (i.e., 0-9 and A-Z), " "and must not contain any other characters." ), + severity=Severity.ERROR, element_wise=True, regex="^[A-Z0-9]+$", ), @@ -79,6 +79,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " match the Legal Entity Identifier (LEI) for the financial" " institution." 
), + severity=Severity.WARNING, element_wise=True, containing_value=lei, end_idx=20, @@ -93,6 +94,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0020", name="app_date.invalid_date_format", description="'Application date' must be a real calendar date using YYYYMMDD format.", + severity=Severity.ERROR, element_wise=True, ), ], @@ -105,6 +107,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0040", name="app_method.invalid_enum_value", description="'Application method' must equal 1, 2, 3, or 4.", + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -123,6 +126,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0060", name="app_recipient.invalid_enum_value", description="'Application recipient' must equal 1 or 2", + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -139,6 +143,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0080", name="ct_credit_product.invalid_enum_value", description="'Credit product' must equal 1, 2, 3, 4, 5, 6, 7, 8, 977, or 988.", + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -158,6 +163,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): }, "ct_credit_product_ff": { "phase_1": [ + # FIXME: built-in Pandera checks do not support add'l params like `severity` SBLCheck.str_length( 0, 300, @@ -166,6 +172,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): description=( "'Free-form text field for other credit products' must not exceed 300 characters in length." ), + severity=Severity.ERROR, ) ], "phase_2": [ @@ -179,6 +186,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "When 'credit product' equals 977, 'free-form text field " "for other credit products' must not be blank." 
), + severity=Severity.ERROR, groupby="ct_credit_product", condition_values={"977"}, ), @@ -195,6 +203,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " semicolons) must equal 1, 2, 3, 4, 5, 6, 7, 8," " 9, 10, 11, 977, or 999." ), + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -222,6 +231,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "'Type of guarantee' must contain at least one and at" " most five values, separated by semicolons." ), + severity=Severity.ERROR, element_wise=True, min_length=1, max_length=5, @@ -229,21 +239,21 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): SBLCheck( is_unique_in_field, id="W0123", - warning=True, name="ct_guarantee.duplicates_in_field", description="'Type of guarantee' should not contain duplicated values.", + severity=Severity.WARNING, element_wise=True, ), SBLCheck( meets_multi_value_field_restriction, id="W0122", - warning=True, name="ct_guarantee.multi_value_field_restriction", description=( "When 'type of guarantee' contains 999 (no guarantee)," " 'type of guarantee' should not contain more than one" " value." ), + severity=Severity.WARNING, element_wise=True, single_values={"999"}, ), @@ -257,6 +267,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0140", name="ct_guarantee_ff.invalid_text_length", description="'Free-form text field for other guarantee' must not exceed 300 characters in length", + severity=Severity.ERROR, ), ], "phase_2": [ @@ -270,13 +281,13 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "When 'type of guarantee' contains 977, 'free-form text field" " for other guarantee' must not be blank." 
), + severity=Severity.ERROR, groupby="ct_guarantee", condition_values={"977"}, ), SBLCheck( has_valid_multi_field_value_count, id="W2006", - warning=True, name="ct_guarantee_ff.multi_invalid_number_of_values", description=( "'Type of guarantee' and 'free-form text field for other " @@ -285,6 +296,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "toward the maximum number of values for the purpose of this " "validation check." ), + severity=Severity.WARNING, groupby="ct_guarantee", ignored_values={"977"}, max_length=5, @@ -300,6 +312,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): description=( "Each value in 'Loan term: NA/NP flag' (separated by semicolons) must equal 900, 988, or 999." ), + severity=Severity.ERROR, element_wise=True, accepted_values=[ "900", @@ -321,6 +334,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "and otherwise undetermined), 'loan term: NA/NP flag' must" "equal 999." ), + severity=Severity.ERROR, groupby="ct_credit_product", conditions=[ { @@ -346,6 +360,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0180", name="ct_loan_term.invalid_numeric_format", description="When present, 'loan term' must be a whole number.", + severity=Severity.ERROR, element_wise=True, accept_blank=True, ), @@ -360,6 +375,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "and reported), 'loan term' must be blank. When 'loan term:" "NA/NP flag' equals 900, 'loan term' must not be blank." 
), + severity=Severity.ERROR, groupby="ct_loan_term_flag", condition_values={"900"}, ), @@ -368,6 +384,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0181", name="ct_loan_term.invalid_numeric_value", description="When present, 'loan term' must be greater than or equal to 1.", + severity=Severity.ERROR, element_wise=True, min_value="1", accept_blank=True, @@ -377,6 +394,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="W0182", name="ct_loan_term.unreasonable_numeric_value", description="When present, 'loan term' should be less than 1200 (100 years).", + severity=Severity.WARNING, element_wise=True, max_value="1200", accept_blank=True, @@ -394,6 +412,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " semicolons) must equal 1, 2, 3, 4, 5, 6, 7, 8," " 9, 10, 11, 977, 988, or 999." ), + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -421,6 +440,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): description=( "'Credit purpose' must contain at least one and at most three values, separated by semicolons." ), + severity=Severity.ERROR, element_wise=True, min_length=1, max_length=3, @@ -428,13 +448,13 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): SBLCheck( meets_multi_value_field_restriction, id="W0202", - warning=True, name="credit_purpose.multi_value_field_restriction", description=( "When 'credit purpose' contains 988 or 999," " 'credit purpose' should not contain more than one" " value." 
), + severity=Severity.WARNING, element_wise=True, single_values={ "988", @@ -444,9 +464,9 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): SBLCheck( is_unique_in_field, id="W0203", - warning=True, name="credit_purpose.duplicates_in_field", description="'Credit purpose' should not contain duplicated values.", + severity=Severity.WARNING, element_wise=True, ), ], @@ -461,6 +481,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): description=( "'Free-form text field for other credit purpose' must not exceed 300 characters in length" ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -474,13 +495,13 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "When 'credit purpose' contains 977, 'free-form text field for" "other credit purpose' must not be blank." ), + severity=Severity.ERROR, groupby="credit_purpose", condition_values={"977"}, ), SBLCheck( has_valid_multi_field_value_count, id="W2006", - warning=True, name="credit_purpose_ff.multi_invalid_number_of_values", description=( "'Credit purpose' and 'free-form text field for other credit " @@ -489,6 +510,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "toward the maximum number of values for the purpose of " "this validation check." 
), + severity=Severity.WARNING, groupby="credit_purpose", ignored_values={"977"}, max_length=3, @@ -502,6 +524,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0240", name="amount_applied_for_flag.invalid_enum_value", description="'Amount applied For: NA/NP flag' must equal 900, 988, or 999.", + severity=Severity.ERROR, element_wise=True, accepted_values=[ "900", @@ -519,6 +542,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0260", name="amount_applied_for.invalid_numeric_format", description="When present, 'amount applied for' must be a numeric value.", + severity=Severity.ERROR, element_wise=True, accept_blank=True, ), @@ -534,6 +558,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "When 'amount applied for: NA/NP flag' equals 900, " "'amount applied for' must not be blank." ), + severity=Severity.ERROR, groupby="amount_applied_for_flag", condition_values={"900"}, ), @@ -542,6 +567,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0261", name="amount_applied_for.invalid_numeric_value", description="When present, 'amount applied for' must be greater than 0.", + severity=Severity.ERROR, element_wise=True, min_value="0", accept_blank=True, @@ -555,6 +581,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0280", name="amount_approved.invalid_numeric_format", description="When present, 'amount approved or originated' must be a numeric value.", + severity=Severity.ERROR, element_wise=True, accept_blank=True, ), @@ -565,6 +592,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0281", name="amount_approved.invalid_numeric_value", description="When present, 'amount approved or originated' must be greater than 0.", + severity=Severity.ERROR, element_wise=True, min_value="0", accept_blank=True, @@ -580,6 +608,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "equals 1 or 2, 'amount approved or originated' must " "not be blank." 
), + severity=Severity.ERROR, groupby="action_taken", condition_values={"1", "2"}, ), @@ -592,6 +621,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0300", name="action_taken.invalid_enum_value", description="'Action taken' must equal 1, 2, 3, 4, or 5.", + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -621,6 +651,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "'Total origination charges', 'Amount of " "total broker fees', 'Initial annual charges'" ), + severity=Severity.ERROR, groupby=[ "pricing_interest_rate_type", "pricing_mca_addcost_flag", @@ -656,6 +687,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "penalty could be imposed', 'Prepayment " "penalty exists'" ), + severity=Severity.ERROR, groupby=[ "pricing_origination_charges", "pricing_broker_fees", @@ -681,6 +713,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0320", name="action_taken_date.invalid_date_format", description="'Action taken date' must be a real calendar date using YYYYMMDD format.", + severity=Severity.ERROR, element_wise=True, ), ], @@ -694,6 +727,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " within the current reporting period:" " October 1, 2024 to December 31, 2024." ), + severity=Severity.ERROR, element_wise=True, start_date_value="20241001", end_date_value="20241231", @@ -703,6 +737,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E2009", name="action_taken_date.date_value_conflict", description="The date indicated by 'action taken date' must occur on or after 'application date'.", + severity=Severity.ERROR, groupby="app_date", ), SBLCheck( @@ -714,6 +749,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " generally be less than two years (730 days) before" " 'action taken date'." 
), + severity=Severity.WARNING, groupby="app_date", days_value=730, ), @@ -729,6 +765,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "Each value in 'denial reason(s)' (separated by semicolons)" "must equal 1, 2, 3, 4, 5, 6, 7, 8, 9, 977, or 999." ), + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -753,6 +790,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): description=( "'Denial reason(s)' must contain at least one and at most fourvalues, separated by semicolons." ), + severity=Severity.ERROR, element_wise=True, min_length=1, max_length=4, @@ -766,6 +804,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "contain 999. When 'action taken' does not equal 3, 'denial" "reason(s)' must equal 999." ), + severity=Severity.ERROR, groupby="action_taken", conditions=[ { @@ -785,21 +824,21 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): SBLCheck( meets_multi_value_field_restriction, id="W0340", - warning=True, name="denial_reasons.multi_value_field_restriction", description=( "When 'denial reason(s)' contains 999 (not applicable)," "'denial reason(s)' should not contain more than one value." ), + severity=Severity.WARNING, element_wise=True, single_values={"999"}, ), SBLCheck( is_unique_in_field, id="W0341", - warning=True, name="denial_reasons.duplicates_in_field", description="'Denial reason(s)' should not contain duplicated values.", + severity=Severity.WARNING, element_wise=True, ), ], @@ -814,6 +853,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): description=( "'Free-form text field for other denial reason(s)'must not exceed 300 characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -827,13 +867,13 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "blank. When 'denial reason(s)' contains 977, 'free-form text" "field for other denial reason(s)' must not be blank." 
), + severity=Severity.ERROR, groupby="denial_reasons", condition_values={"977"}, ), SBLCheck( has_valid_multi_field_value_count, id="W2013", - warning=True, name="denial_reasons_ff.multi_invalid_number_of_values", description=( "'Denial reason(s)' and 'free-form text field for other " @@ -842,6 +882,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "does not count toward the maximum number of values for " "the purpose of this validation check." ), + severity=Severity.WARNING, groupby="denial_reasons", ignored_values={"977"}, max_length=4, @@ -858,6 +899,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "Each value in 'Interest rate type' (separated by " " semicolons) Must equal 1, 2, 3, 4, 5, 6, or 999" ), + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -878,7 +920,8 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): is_number, id="E0400", name="pricing_init_rate_period.invalid_numeric_format", - description=("When present, 'initial rate period' must be a whole number.",), + description="When present, 'initial rate period' must be a whole number.", + severity=Severity.ERROR, element_wise=True, accept_blank=True, ), @@ -897,6 +940,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "be blank. 
When 'interest rate type' equals 3, 4, 5, or 6, " "'initial rate period' must not be blank" ), + severity=Severity.ERROR, groupby="pricing_interest_rate_type", condition_values={"3", "4", "5", "6"}, ), @@ -904,7 +948,8 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): is_greater_than, id="E0401", name="pricing_init_rate_period.invalid_numeric_value", - description=("When present, 'initial rate period' must be greater than 0",), + description="When present, 'initial rate period' must be greater than 0", + severity=Severity.ERROR, element_wise=True, min_value="0", accept_blank=True, @@ -918,6 +963,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0420", name="pricing_fixed_rate.invalid_numeric_format", description="When present, 'fixed rate: interest rate' must be a numeric value.", + severity=Severity.ERROR, element_wise=True, accept_blank=True, ), @@ -936,6 +982,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " blank. When 'interest rate type' equals 2, 4, or 6," " 'fixed rate: interest rate' must not be blank." ), + severity=Severity.ERROR, groupby="pricing_interest_rate_type", condition_values={"2", "4", "6"}, ), @@ -944,6 +991,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="W0420", name="pricing_fixed_rate.unreasonable_numeric_value", description="When present, 'fixed rate: interest rate' should generally be greater than 0.1.", + severity=Severity.WARNING, element_wise=True, min_value="0.1", accept_blank=True, @@ -957,6 +1005,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0440", name="pricing_adj_margin.invalid_numeric_format", description="When present, 'adjustable rate transaction: margin' must be a numeric value.", + severity=Severity.ERROR, element_wise=True, accept_blank=True, ), @@ -975,6 +1024,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "be blank. 
When 'interest rate type' equals 1, 3, or 5, " "'variable rate transaction: margin' must not be blank." ), + severity=Severity.ERROR, groupby="pricing_interest_rate_type", condition_values={"1", "3", "5"}, ), @@ -985,6 +1035,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): description=( "When present, 'adjustable rate transaction: margin' should generally be greater than 0.1." ), + severity=Severity.ERROR, element_wise=True, min_value="0.1", accept_blank=True, @@ -1001,6 +1052,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "'Adjustable rate transaction: index name' must equal " "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 977, or 999." ), + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -1032,6 +1084,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "When 'interest rate type' equals 1, 3, or 5, 'adjustable rate" "transaction: index name' must not equal 999." ), + severity=Severity.ERROR, groupby="pricing_interest_rate_type", conditions=[ { @@ -1060,6 +1113,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): description=( "'Adjustable rate transaction: index name: other' must not exceed 300 characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -1075,6 +1129,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "'adjustable rate transaction: index name: other' must not be" "blank." ), + severity=Severity.ERROR, groupby="pricing_adj_index_name", condition_values={"977"}, ), @@ -1087,6 +1142,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0500", name="pricing_adj_index_value.invalid_numeric_format", description="When present, 'adjustable rate transaction: index value' must be a numeric value.", + severity=Severity.ERROR, element_wise=True, accept_blank=True, ), @@ -1104,6 +1160,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " blank. 
When 'interest rate type' equals 1 or 3," " 'adjustable rate transaction: index value' must not be blank." ), + severity=Severity.ERROR, groupby="pricing_interest_rate_type", condition_values={"1", "3"}, ), @@ -1115,10 +1172,8 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): is_number, id="E0520", name="pricing_origination_charges.invalid_numeric_format", - description=( - "When present, 'total origination charges' must be a numeric", - "value.", - ), + description="When present, 'total origination charges' must be a numeric value.", + severity=Severity.ERROR, element_wise=True, accept_blank=True, ), @@ -1131,10 +1186,8 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): is_number, id="E0540", name="pricing_broker_fees.invalid_numeric_format", - description=( - "When present, 'amount of total broker fees' must be a", - "numeric value.", - ), + description="When present, 'amount of total broker fees' must be a numeric value.", + severity=Severity.ERROR, element_wise=True, accept_blank=True, ), @@ -1148,6 +1201,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0560", name="pricing_initial_charges.invalid_numeric_format", description="When present, 'initial annual charges' must be anumeric value.", + severity=Severity.ERROR, element_wise=True, accept_blank=True, ), @@ -1165,6 +1219,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "advances or other sales-based financing: NA flag' " "must equal 900 or 999." ), + severity=Severity.ERROR, element_wise=True, accepted_values=[ "900", @@ -1184,6 +1239,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "merchant cash advances or other sales-based financing: " "NA flag' must be 999 (not applicable)." 
), + severity=Severity.ERROR, groupby="ct_credit_product", conditions=[ { @@ -1207,6 +1263,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "merchant cash advances or other sales-based financing' " "must be a numeric value" ), + severity=Severity.ERROR, element_wise=True, accept_blank=True, ), @@ -1227,6 +1284,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "additional cost for merchant cash advances or other " "sales-based financing’ must not be blank." ), + severity=Severity.ERROR, groupby="pricing_mca_addcost_flag", condition_values={"900"}, ), @@ -1239,6 +1297,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0620", name="pricing_prepenalty_allowed.invalid_enum_value", description="'Prepayment penalty could be imposed' must equal 1, 2, or 999.", + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -1256,6 +1315,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0640", name="pricing_prepenalty_exists.invalid_enum_value", description="'Prepayment penalty exists' must equal 1, 2, or 999.", + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -1273,6 +1333,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0640", name="census_tract_adr_type.invalid_enum_value", description="'Census tract: type of address' must equal 1, 2, 3, or 988.", + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -1291,6 +1352,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0680", name="census_tract_number.invalid_text_length", description="When present, 'census tract: tract number' must be a GEOID with exactly 11 digits.", + severity=Severity.ERROR, element_wise=True, accepted_length=11, accept_blank=True, @@ -1312,6 +1374,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "location associated with the applicant), 'census tract:" " tract number' must not be blank." 
), + severity=Severity.ERROR, groupby="census_tract_adr_type", conditions=[ { @@ -1337,6 +1400,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0700", name="gross_annual_revenue_flag.invalid_enum_value", description="'Gross annual revenue: NP flag' must equal 900 or 988.", + severity=Severity.ERROR, element_wise=True, accepted_values=[ "900", @@ -1353,6 +1417,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0720", name="gross_annual_revenue.invalid_numeric_format", description="When present, 'gross annual revenue' must be a numeric value.", + severity=Severity.ERROR, element_wise=True, accept_blank=True, ), @@ -1368,6 +1433,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "'gross annual revenue: NP flag' equals 900, " "'gross annual revenue' must not be blank." ), + severity=Severity.ERROR, groupby="gross_annual_revenue_flag", condition_values={"900"}, ), @@ -1380,8 +1446,9 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0720", name="naics_code_flag.invalid_enum_value", description=( - "'North American Industry Classification System (NAICS) code: NP flag' must equal 900 or 988." + "'North American Industry Classification System (NAICS) code: NP flag'must equal 900 or 988." ), + severity=Severity.ERROR, element_wise=True, accepted_values=[ "900", @@ -1401,6 +1468,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "'North American Industry Classification System " "(NAICS) code' may only contain numeric characters." ), + severity=Severity.ERROR, element_wise=True, accept_blank=True, ), @@ -1414,6 +1482,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "When present, 'North American Industry Classification System " "(NAICS) code' must be three digits in length." 
), + severity=Severity.ERROR, element_wise=True, accepted_length=3, accept_blank=True, @@ -1426,6 +1495,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "When present, 'North American Industry Classification System " "(NAICS) code' should be a valid NAICS code." ), + severity=Severity.WARNING, element_wise=True, accept_blank=True, codes=global_data.naics_codes, @@ -1440,6 +1510,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "When 'type of guarantee' contains 977, 'free-form text field" " for other guarantee' must not be blank." ), + severity=Severity.ERROR, groupby="naics_code_flag", condition_values={"900"}, ), @@ -1452,6 +1523,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0780", name="number_of_workers.invalid_enum_value", description="'Number of workers' must equal 1, 2, 3, 4, 5, 6, 7, 8, 9, or 988.", + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -1476,6 +1548,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0800", name="time_in_business_type.invalid_enum_value", description="'Time in business: type of response' must equal 1, 2, 3, or 988.", + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -1494,6 +1567,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0820", name="time_in_business.invalid_numeric_format", description="When present, 'time in business' must be a whole number.", + severity=Severity.ERROR, element_wise=True, accept_blank=True, ), @@ -1504,6 +1578,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0821", name="time_in_business.invalid_numeric_value", description="When present, 'time in business' must be greater than or equal to 0.", + severity=Severity.ERROR, element_wise=True, min_value="0", accept_blank=True, @@ -1520,6 +1595,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " 'time in business: type of response' equals 1," " 'time in business' must not be blank." 
), + severity=Severity.ERROR, groupby="time_in_business_type", condition_values={"1"}, ), @@ -1536,6 +1612,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " (separated by semicolons) must equal 1, 2, 3," " 955, 966, or 988." ), + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -1553,21 +1630,21 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0841", name="business_ownership_status.invalid_number_of_values", description="'Business ownership status' must contain at least one value.", + severity=Severity.ERROR, element_wise=True, min_length=1, ), SBLCheck( is_unique_in_field, id="W0842", - warning=True, name="business_ownership_status.duplicates_in_field", description="'Business ownership status' should not contain duplicated values.", + severity=Severity.WARNING, element_wise=True, ), SBLCheck( meets_multi_value_field_restriction, id="W0843", - warning=True, name="business_ownership_status.multi_value_field_restriction", description=( "When 'business ownership status' contains 966" @@ -1576,6 +1653,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " by applicant), 'business ownership status' should" " not contain more than one value." ), + severity=Severity.WARNING, element_wise=True, single_values={"966", "988"}, ), @@ -1588,6 +1666,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0860", name="num_principal_owners_flag.invalid_enum_value", description="'Number of principal owners: NP flag' must equal 900 or 988.", + severity=Severity.ERROR, element_wise=True, accepted_values=[ "900", @@ -1605,6 +1684,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "demographic fields for principal owners 1, 2, 3, and 4 " "should be blank." ), + severity=Severity.WARNING, groupby=[ "po_1_ethnicity", "po_1_race", @@ -1646,6 +1726,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " blank. Demographic fields for principal owners 2, 3, and 4 " "should be blank." 
), + severity=Severity.WARNING, groupby=[ "po_1_ethnicity", "po_1_race", @@ -1686,6 +1767,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "owner 1 and 2', and 'sex/gender of principal owner 1 and 2: " "NP flag' should not be blank." ), + severity=Severity.WARNING, groupby=[ "po_1_ethnicity", "po_1_race", @@ -1727,6 +1809,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "and 3: NP flag' should not be blank. Demographic fields for " "principal owner 4 should be blank." ), + severity=Severity.WARNING, groupby=[ "po_1_ethnicity", "po_1_race", @@ -1768,6 +1851,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "and 'sex/gender of principal owner 1, 2, 3, and 4: NP flag'" " should not be blank." ), + severity=Severity.WARNING, groupby=[ "po_1_ethnicity", "po_1_race", @@ -1807,6 +1891,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E0880", name="num_principal_owners.invalid_enum_value", description="When present, 'number of principal owners' must equal 0, 1, 2, 3, or 4.", + severity=Severity.ERROR, element_wise=True, accepted_values=["0", "1", "2", "3", "4"], accept_blank=True, @@ -1823,6 +1908,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): "When 'number of principal owners: NP flag' equals 900, " "'number of principal owners' must not be blank." ), + severity=Severity.ERROR, groupby="num_principal_owners_flag", condition_values={"900"}, ), @@ -1840,6 +1926,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " semicolons) must equal 1, 11, 12," " 13, 14, 2, 966, 977, or 988." 
), + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -1859,15 +1946,14 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): SBLCheck( is_unique_in_field, id="W0901", - warning=True, name="po_1_ethnicity.duplicates_in_field", description="'Ethnicity of principal owner 1' should not contain duplicated values.", + severity=Severity.WARNING, element_wise=True, ), SBLCheck( meets_multi_value_field_restriction, id="W0902", - warning=True, name="po_1_ethnicity.multi_value_field_restriction", description=( "When 'ethnicity of principal owner 1' contains" @@ -1876,6 +1962,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " (not provided by applicant), 'ethnicity of" " principal owner 1' should not contain more than one value." ), + severity=Severity.WARNING, element_wise=True, single_values={"966", "988"}, ), @@ -1893,6 +1980,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " text field for other Hispanic or Latino'" " must not exceed 300 characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -1910,6 +1998,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " owner 1: free-form text field for other Hispanic" " or Latino' must not be blank." ), + severity=Severity.ERROR, groupby="po_1_ethnicity", condition_values={"977"}, ), @@ -1929,6 +2018,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " 34, 35, 36, 37, 4, 41, 42, 43, 44," " 5, 966, 971, 972, 973, 974, or 988." 
), + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -1968,15 +2058,14 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): SBLCheck( is_unique_in_field, id="W0941", - warning=True, name="po_1_race.duplicates_in_field", description="'Race of principal owner 1' should not contain duplicated values.", + severity=Severity.WARNING, element_wise=True, ), SBLCheck( meets_multi_value_field_restriction, id="W0942", - warning=True, name="po_1_race.multi_value_field_restriction", description=( "When 'race of principal owner 1' contains" @@ -1986,6 +2075,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " 'race of principal owner 1' should not" " contain more than one value." ), + severity=Severity.WARNING, element_wise=True, single_values={"966", "988"}, ), @@ -2004,6 +2094,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " Native Enrolled or Principal Tribe' must" " not exceed 300 characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -2024,6 +2115,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " for American Indian or Alaska Native Enrolled or" " Principal Tribe' must not be blank." ), + severity=Severity.ERROR, groupby="po_1_race", condition_values={"971"}, ), @@ -2041,6 +2133,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " field for other Asian' must not exceed 300" " characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -2057,6 +2150,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " 972, 'race of principal owner 1: free-form text field" " for other Asian' must not be blank." ), + severity=Severity.ERROR, groupby="po_1_race", condition_values={"972"}, ), @@ -2074,6 +2168,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " field for other Black or African American'" " must not exceed 300 characters in length." 
), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -2090,6 +2185,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " contains 973, 'race of principal owner 1: free-form text" " field for other Black or African American' must not be blank." ), + severity=Severity.ERROR, groupby="po_1_race", condition_values={"973"}, ), @@ -2107,6 +2203,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " field for other Pacific Islander race' must" " not exceed 300 characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -2123,6 +2220,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " contains 974, 'race of principal owner 1: free-form text" " field for other Pacific Islander race' must not be blank." ), + severity=Severity.ERROR, groupby="po_1_race", condition_values={"974"}, ), @@ -2135,6 +2233,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E1040", name="po_1_gender_flag.invalid_enum_value", description="When present, 'sex/gender of principal owner 1: NP flag' must equal 1, 966, or 988.", + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -2158,6 +2257,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " text field for self-identified sex/gender'" " must not exceed 300 characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -2175,6 +2275,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " of principal owner 1: free-form text field for" " self-identified sex/gender' must not be blank." ), + severity=Severity.ERROR, groupby="po_1_gender_flag", condition_values={"1"}, ), @@ -2192,6 +2293,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " semicolons) must equal 1, 11, 12," " 13, 14, 2, 966, 977, or 988." 
), + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -2211,15 +2313,14 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): SBLCheck( is_unique_in_field, id="W0901", - warning=True, name="po_2_ethnicity.duplicates_in_field", description="'Ethnicity of principal owner 2' should not contain duplicated values.", + severity=Severity.WARNING, element_wise=True, ), SBLCheck( meets_multi_value_field_restriction, id="W0902", - warning=True, name="po_2_ethnicity.multi_value_field_restriction", description=( "When 'ethnicity of principal owner 2' contains" @@ -2228,6 +2329,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " (not provided by applicant), 'ethnicity of" " principal owner 2' should not contain more than one value." ), + severity=Severity.WARNING, element_wise=True, single_values={"966", "988"}, ), @@ -2245,6 +2347,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " text field for other Hispanic or Latino'" " must not exceed 300 characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -2262,6 +2365,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " owner 2: free-form text field for other Hispanic" " or Latino' must not be blank." ), + severity=Severity.ERROR, groupby="po_2_ethnicity", condition_values={"977"}, ), @@ -2281,6 +2385,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " 34, 35, 36, 37, 4, 41, 42, 43, 44," " 5, 966, 971, 972, 973, 974, or 988." 
), + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -2320,15 +2425,14 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): SBLCheck( is_unique_in_field, id="W0941", - warning=True, name="po_2_race.duplicates_in_field", description="'Race of principal owner 2' should not contain duplicated values.", + severity=Severity.WARNING, element_wise=True, ), SBLCheck( meets_multi_value_field_restriction, id="W0942", - warning=True, name="po_2_race.multi_value_field_restriction", description=( "When 'race of principal owner 2' contains" @@ -2338,6 +2442,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " 'race of principal owner 2' should not" " contain more than one value." ), + severity=Severity.WARNING, element_wise=True, single_values={"966", "988"}, ), @@ -2356,6 +2461,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " Native Enrolled or Principal Tribe' must" " not exceed 300 characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -2376,6 +2482,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " for American Indian or Alaska Native Enrolled or" " Principal Tribe' must not be blank." ), + severity=Severity.ERROR, groupby="po_2_race", condition_values={"971"}, ), @@ -2393,6 +2500,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " field for other Asian' must not exceed 300" " characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -2409,6 +2517,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " 972, 'race of principal owner 2: free-form text field" " for other Asian' must not be blank." ), + severity=Severity.ERROR, groupby="po_2_race", condition_values={"972"}, ), @@ -2426,6 +2535,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " field for other Black or African American'" " must not exceed 300 characters in length." 
), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -2442,6 +2552,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " contains 973, 'race of principal owner 2: free-form text" " field for other Black or African American' must not be blank." ), + severity=Severity.ERROR, groupby="po_2_race", condition_values={"973"}, ), @@ -2459,6 +2570,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " field for other Pacific Islander race' must" " not exceed 300 characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -2475,6 +2587,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " contains 974, 'race of principal owner 2: free-form text" " field for other Pacific Islander race' must not be blank." ), + severity=Severity.ERROR, groupby="po_2_race", condition_values={"974"}, ), @@ -2487,6 +2600,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E1040", name="po_2_gender_flag.invalid_enum_value", description="When present, 'sex/gender of principal owner 2: NP flag' must equal 1, 966, or 988.", + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -2510,6 +2624,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " text field for self-identified sex/gender'" " must not exceed 300 characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -2527,6 +2642,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " of principal owner 2: free-form text field for" " self-identified sex/gender' must not be blank." ), + severity=Severity.ERROR, groupby="po_2_gender_flag", condition_values={"1"}, ), @@ -2544,6 +2660,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " semicolons) must equal 1, 11, 12," " 13, 14, 2, 966, 977, or 988." 
), + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -2563,15 +2680,14 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): SBLCheck( is_unique_in_field, id="W0901", - warning=True, name="po_3_ethnicity.duplicates_in_field", description="'Ethnicity of principal owner 3' should not contain duplicated values.", + severity=Severity.WARNING, element_wise=True, ), SBLCheck( meets_multi_value_field_restriction, id="W0902", - warning=True, name="po_3_ethnicity.multi_value_field_restriction", description=( "When 'ethnicity of principal owner 3' contains" @@ -2580,6 +2696,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " (not provided by applicant), 'ethnicity of" " principal owner 3' should not contain more than one value." ), + severity=Severity.WARNING, element_wise=True, single_values={"966", "988"}, ), @@ -2597,6 +2714,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " text field for other Hispanic or Latino'" " must not exceed 300 characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -2614,6 +2732,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " owner 3: free-form text field for other Hispanic" " or Latino' must not be blank." ), + severity=Severity.ERROR, groupby="po_3_ethnicity", condition_values={"977"}, ), @@ -2633,6 +2752,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " 34, 35, 36, 37, 4, 41, 42, 43, 44," " 5, 966, 971, 972, 973, 974, or 988." 
), + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -2672,15 +2792,14 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): SBLCheck( is_unique_in_field, id="W0941", - warning=True, name="po_3_race.duplicates_in_field", description="'Race of principal owner 3' should not contain duplicated values.", + severity=Severity.WARNING, element_wise=True, ), SBLCheck( meets_multi_value_field_restriction, id="W0942", - warning=True, name="po_3_race.multi_value_field_restriction", description=( "When 'race of principal owner 3' contains" @@ -2690,6 +2809,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " 'race of principal owner 3' should not" " contain more than one value." ), + severity=Severity.WARNING, element_wise=True, single_values={"966", "988"}, ), @@ -2708,6 +2828,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " Native Enrolled or Principal Tribe' must" " not exceed 300 characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -2728,6 +2849,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " for American Indian or Alaska Native Enrolled or" " Principal Tribe' must not be blank." ), + severity=Severity.ERROR, groupby="po_3_race", condition_values={"971"}, ), @@ -2745,6 +2867,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " field for other Asian' must not exceed 300" " characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -2761,6 +2884,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " 972, 'race of principal owner 3: free-form text field" " for other Asian' must not be blank." ), + severity=Severity.ERROR, groupby="po_3_race", condition_values={"972"}, ), @@ -2778,6 +2902,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " field for other Black or African American'" " must not exceed 300 characters in length." 
), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -2794,6 +2919,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " contains 973, 'race of principal owner 3: free-form text" " field for other Black or African American' must not be blank." ), + severity=Severity.ERROR, groupby="po_3_race", condition_values={"973"}, ), @@ -2811,6 +2937,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " field for other Pacific Islander race' must" " not exceed 300 characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -2827,6 +2954,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " contains 974, 'race of principal owner 3: free-form text" " field for other Pacific Islander race' must not be blank." ), + severity=Severity.ERROR, groupby="po_3_race", condition_values={"974"}, ), @@ -2839,6 +2967,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E1040", name="po_3_gender_flag.invalid_enum_value", description="When present, 'sex/gender of principal owner 3: NP flag' must equal 1, 966, or 988.", + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -2862,6 +2991,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " text field for self-identified sex/gender'" " must not exceed 300 characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -2879,6 +3009,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " of principal owner 3: free-form text field for" " self-identified sex/gender' must not be blank." ), + severity=Severity.ERROR, groupby="po_3_gender_flag", condition_values={"1"}, ), @@ -2896,6 +3027,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " semicolons) must equal 1, 11, 12," " 13, 14, 2, 966, 977, or 988." 
), + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -2915,15 +3047,14 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): SBLCheck( is_unique_in_field, id="W0901", - warning=True, name="po_4_ethnicity.duplicates_in_field", description="'Ethnicity of principal owner 4' should not contain duplicated values.", + severity=Severity.WARNING, element_wise=True, ), SBLCheck( meets_multi_value_field_restriction, id="W0902", - warning=True, name="po_4_ethnicity.multi_value_field_restriction", description=( "When 'ethnicity of principal owner 4' contains" @@ -2932,6 +3063,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " (not provided by applicant), 'ethnicity of" " principal owner 4' should not contain more than one value." ), + severity=Severity.WARNING, element_wise=True, single_values={"966", "988"}, ), @@ -2949,6 +3081,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " text field for other Hispanic or Latino'" " must not exceed 300 characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -2966,6 +3099,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " owner 4: free-form text field for other Hispanic" " or Latino' must not be blank." ), + severity=Severity.ERROR, groupby="po_4_ethnicity", condition_values={"977"}, ), @@ -2985,6 +3119,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " 34, 35, 36, 37, 4, 41, 42, 43, 44," " 5, 966, 971, 972, 973, 974, or 988." 
), + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -3024,15 +3159,14 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): SBLCheck( is_unique_in_field, id="W0941", - warning=True, name="po_4_race.duplicates_in_field", description="'Race of principal owner 4' should not contain duplicated values.", + severity=Severity.WARNING, element_wise=True, ), SBLCheck( meets_multi_value_field_restriction, id="W0942", - warning=True, name="po_4_race.multi_value_field_restriction", description=( "When 'race of principal owner 4' contains" @@ -3042,6 +3176,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " 'race of principal owner 4' should not" " contain more than one value." ), + severity=Severity.WARNING, element_wise=True, single_values={"966", "988"}, ), @@ -3060,6 +3195,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " Native Enrolled or Principal Tribe' must" " not exceed 300 characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -3080,6 +3216,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " for American Indian or Alaska Native Enrolled or" " Principal Tribe' must not be blank." ), + severity=Severity.ERROR, groupby="po_4_race", condition_values={"971"}, ), @@ -3097,6 +3234,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " field for other Asian' must not exceed 300" " characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -3113,6 +3251,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " 972, 'race of principal owner 4: free-form text field" " for other Asian' must not be blank." ), + severity=Severity.ERROR, groupby="po_4_race", condition_values={"972"}, ), @@ -3130,6 +3269,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " field for other Black or African American'" " must not exceed 300 characters in length." 
), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -3146,6 +3286,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " contains 973, 'race of principal owner 4: free-form text" " field for other Black or African American' must not be blank." ), + severity=Severity.ERROR, groupby="po_4_race", condition_values={"973"}, ), @@ -3163,6 +3304,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " field for other Pacific Islander race' must" " not exceed 300 characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -3179,6 +3321,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " contains 974, 'race of principal owner 4: free-form text" " field for other Pacific Islander race' must not be blank." ), + severity=Severity.ERROR, groupby="po_4_race", condition_values={"974"}, ), @@ -3191,6 +3334,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): id="E1040", name="po_4_gender_flag.invalid_enum_value", description="When present, 'sex/gender of principal owner 4: NP flag' must equal 1, 966, or 988.", + severity=Severity.ERROR, element_wise=True, accepted_values=[ "1", @@ -3214,6 +3358,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " text field for self-identified sex/gender'" " must not exceed 300 characters in length." ), + severity=Severity.ERROR, ), ], "phase_2": [ @@ -3231,6 +3376,7 @@ def get_phase_1_and_2_validations_for_lei(lei: str = None): " of principal owner 4: free-form text field for" " self-identified sex/gender' must not be blank." 
), + severity=Severity.ERROR, groupby="po_4_gender_flag", condition_values={"1"}, ), diff --git a/src/validator/schema_template.py b/regtech_data_validator/schema_template.py similarity index 98% rename from src/validator/schema_template.py rename to regtech_data_validator/schema_template.py index 2aada648..a2229cdd 100644 --- a/src/validator/schema_template.py +++ b/regtech_data_validator/schema_template.py @@ -1,4 +1,4 @@ -"""This is a 'blank' Pandera template for SBLAR. All columns in the fig are present, +"""This is a 'blank' Pandera template for SBLAR. All columns in the FIG are present, but the checks need to be populated. Do not import _schema_template from this module directly. Instead, make use of the @@ -170,10 +170,7 @@ ), "pricing_mca_addcost": Column( str, - title=( - "Field 31: MCA/sales-based: additional cost for merchant cash ", - "advances or other sales-based financing", - ), + title="Field 31: MCA/sales-based: additional cost for merchant cash advances or other sales-based financing", checks=[], ), "pricing_prepenalty_allowed": Column( diff --git a/src/tests/__init__.py b/src/tests/__init__.py deleted file mode 100644 index 238d07e8..00000000 --- a/src/tests/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -import os -import sys - -ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - -sys.path.append(os.path.join(ROOT_DIR, "validator")) diff --git a/src/tests/test_checks.py b/src/tests/test_checks.py deleted file mode 100644 index bac4cc75..00000000 --- a/src/tests/test_checks.py +++ /dev/null @@ -1,27 +0,0 @@ -import pytest - -from validator.checks import SBLCheck - - -class TestSBLCheck: - def test_no_id_check(self): - with pytest.raises(Exception) as exc: - SBLCheck(lambda: True, warning=True, name="Just a Warning") - - assert "Each check must be assigned a `name` and an `id`." 
in str(exc.value) - assert exc.type == ValueError - - def test_no_name_check(self): - with pytest.raises(Exception) as exc: - SBLCheck(lambda: True, id="00000", warning=True) - - assert "Each check must be assigned a `name` and an `id`." in str(exc.value) - assert exc.type == ValueError - - def test_name_and_id_check(self): - raised = False - try: - SBLCheck(lambda: True, id="00000", warning=True, name="Just a Warning") - except ValueError: - raised = True - assert raised is False diff --git a/src/tests/test_global_data.py b/src/tests/test_global_data.py deleted file mode 100644 index 6e8fc13f..00000000 --- a/src/tests/test_global_data.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest - -from validator import global_data - - -class TestGlobalData: - def test_valid_naics_codes(self): - global_data.read_naics_codes() - assert len(global_data.naics_codes) == 96 - - def test_valid_geoids(self): - global_data.read_geoids() - assert len(global_data.census_geoids) == 87275 - - def test_invalid_naics_file(self): - failed_fpath = "./data/naics/processed/2022_codes.csv1" - with pytest.raises(Exception) as exc: - global_data.read_naics_codes(failed_fpath) - assert exc.type == FileNotFoundError - - def test_invalid_geoids_file(self): - failed_fpath = "./data/census/processed/Census2022.processed.csv2" - with pytest.raises(Exception) as exc: - global_data.read_geoids(failed_fpath) - assert exc.type == FileNotFoundError diff --git a/src/validator/__init__.py b/src/validator/__init__.py deleted file mode 100644 index 836099bf..00000000 --- a/src/validator/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -import os -import sys - -ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -sys.path.append(ROOT_DIR) diff --git a/src/validator/checks.py b/src/validator/checks.py deleted file mode 100644 index 29677a18..00000000 --- a/src/validator/checks.py +++ /dev/null @@ -1,73 +0,0 @@ -"""Custom subclass for warnings and errors. 
- -The class SBLCheck is a subclass of the standard Pandera Check class -that requires the `name` kwarg to be supplied. Errors and warnings are -distinguised based on the value of the warning attribute. It defaults -to false but can be set to True during init to indicate the validation -should be handled as a warning rather than an error. - -Examples: - - warning_check = SBLCheck( - lambda: True, - warning=True, - name="Just a Warning" - ) - - error_check_implied = SBLCheck(lambda: True, name="Error Check") - - error_check_explicit = SBLCheck( - lambda: True, - warning=False, - name="Also an Error" - ) -""" - - -from typing import Any, Callable, Type - -from pandera import Check -from pandera.backends.base import BaseCheckBackend -from pandera.backends.pandas.checks import PandasCheckBackend - - -class SBLCheck(Check): - """A custom Pandera.Check subclasss that requires a `name` and an `id` be - specified. Additionally, an attribute named `warning` is added to - the class to enable distinction between warnings and errors. The - default value of warning is `False` which corresponds to an error. - - Don't use this class directly. Make use of the SBLErrorCheck and - SBLWarningCheck subclasses below.""" - - def __init__(self, check_fn: Callable, id: str = None, warning=False, *args, **kwargs): - """Custom init method that verifies the presence of `name` and `id` in - kwargs creates a custom class attribute called `warning`. All - other initializaiton is handled by the parent Check class. - - Args: - check_fn (Callable): A function which evaluates the validity - of the column(s) being tested. - id (str, required): Each check mut have an id. - warning (bool, optional): Boolean specifying whether to - treat the check as a warning rather than an error. - - Raises: - ValueError: Raised if `name` not supplied in kwargs and if id is not - supplied or None. 
- """ - - self.id = id - - if "name" not in kwargs or id is None: - raise ValueError("Each check must be assigned a `name` and an `id`.") - - # if warning==False treat check as an error check - self.warning = warning - - super().__init__(check_fn=check_fn, *args, **kwargs) - - @classmethod - def get_backend(cls, check_obj: Any) -> Type[BaseCheckBackend]: - """Assume Pandas DataFrame and return PandasCheckBackend""" - return PandasCheckBackend diff --git a/src/validator/global_data.py b/src/validator/global_data.py deleted file mode 100644 index a9c54f04..00000000 --- a/src/validator/global_data.py +++ /dev/null @@ -1,36 +0,0 @@ -import os -import sys - -import pandas as pd - -ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # noqa: E402 -sys.path.append(ROOT_DIR) # noqa: E402 - -from config import CENSUS_PROCESSED_CSV_PATH, NAICS_CSV_PATH # noqa: E402 - -naics_codes = {} - -# global variable for geoids -census_geoids = {} - - -def read_naics_codes(csv_path: str = NAICS_CSV_PATH): - """ - read NAICS CSV file with this format: (code, description) - and populate global value: naics_codes - """ - naics_codes.clear() - df = pd.read_csv(csv_path, dtype=str, na_filter=False) - for _, row in df.iterrows(): - naics_codes.update({row.iloc[0]: row.iloc[1]}) - - -def read_geoids(csv_path: str = CENSUS_PROCESSED_CSV_PATH): - """ - read geoids CSV file with this format: (code) - and populate global value: census_geoids - """ - census_geoids.clear() - df = pd.read_csv(csv_path, dtype=str, na_filter=False) - for _, row in df.iterrows(): - census_geoids.update({row.iloc[0]: None}) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/tests/data/sbl-validations-fail.csv b/tests/data/sbl-validations-fail.csv similarity index 100% rename from src/tests/data/sbl-validations-fail.csv rename to tests/data/sbl-validations-fail.csv diff --git a/src/tests/data/sbl-validations-pass.csv 
b/tests/data/sbl-validations-pass.csv similarity index 100% rename from src/tests/data/sbl-validations-pass.csv rename to tests/data/sbl-validations-pass.csv diff --git a/src/tests/test_check_functions.py b/tests/test_check_functions.py similarity index 99% rename from src/tests/test_check_functions.py rename to tests/test_check_functions.py index 65175eff..c93b5613 100644 --- a/src/tests/test_check_functions.py +++ b/tests/test_check_functions.py @@ -1,7 +1,7 @@ import pandas as pd -from validator import global_data -from validator.check_functions import ( +from regtech_data_validator import global_data +from regtech_data_validator.check_functions import ( has_correct_length, has_no_conditional_field_conflict, has_valid_enum_pair, @@ -474,28 +474,24 @@ def test_with_incorrect_length(self): class TestIsValidCode: def test_with_valid_code(self): - global_data.read_naics_codes() result = is_valid_code("111", False, global_data.naics_codes) assert result is True result = is_valid_code("111", True, global_data.naics_codes) assert result is True def test_with_invalid_code(self): - global_data.read_naics_codes() result = is_valid_code("101", False, global_data.naics_codes) assert result is False result = is_valid_code("101", True, global_data.naics_codes) assert result is False def test_with_accepted_blank(self): - global_data.read_naics_codes() result = is_valid_code("", True, global_data.naics_codes) assert result is True result = is_valid_code(" ", True, global_data.naics_codes) assert result is True def test_with_invalid_blank(self): - global_data.read_naics_codes() result = is_valid_code("", False, global_data.naics_codes) assert result is False result = is_valid_code(" ", False, global_data.naics_codes) diff --git a/tests/test_global_data.py b/tests/test_global_data.py new file mode 100644 index 00000000..20f84c40 --- /dev/null +++ b/tests/test_global_data.py @@ -0,0 +1,9 @@ +from regtech_data_validator import global_data + + +class TestGlobalData: + def 
test_valid_naics_codes(self): + assert len(global_data.naics_codes) == 96 + + def test_valid_geoids(self): + assert len(global_data.census_geoids) == 87275 diff --git a/src/tests/test_sample_data.py b/tests/test_sample_data.py similarity index 82% rename from src/tests/test_sample_data.py rename to tests/test_sample_data.py index e1d6de07..35c1fe42 100644 --- a/src/tests/test_sample_data.py +++ b/tests/test_sample_data.py @@ -1,16 +1,10 @@ -import os -import sys - import pandas as pd import pytest -from validator.create_schemas import validate_phases - -ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # noqa: E402 -sys.path.append(ROOT_DIR) # noqa: E402 +from regtech_data_validator.create_schemas import validate_phases -GOOD_FILE_PATH = "./src/tests/data/sbl-validations-pass.csv" -BAD_FILE_PATH = "./src/tests/data/sbl-validations-fail.csv" +GOOD_FILE_PATH = "./tests/data/sbl-validations-pass.csv" +BAD_FILE_PATH = "./tests/data/sbl-validations-fail.csv" class TestValidatingSampleData: diff --git a/src/tests/test_schema_functions.py b/tests/test_schema_functions.py similarity index 98% rename from src/tests/test_schema_functions.py rename to tests/test_schema_functions.py index 911dc673..7c141dee 100644 --- a/src/tests/test_schema_functions.py +++ b/tests/test_schema_functions.py @@ -1,6 +1,11 @@ import pandas as pd -from validator.create_schemas import get_phase_1_schema_for_lei, get_phase_2_schema_for_lei, validate, validate_phases +from regtech_data_validator.create_schemas import ( + get_phase_1_schema_for_lei, + get_phase_2_schema_for_lei, + validate, + validate_phases, +) class TestUtil: diff --git a/tools/process_naics.py b/tools/process_naics.py deleted file mode 100644 index b202407c..00000000 --- a/tools/process_naics.py +++ /dev/null @@ -1,49 +0,0 @@ -import csv -import os -import sys - -import pandas as pd - -ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # noqa: E402 
-sys.path.append(ROOT_DIR) # noqa: E402 - -import config # noqa: E402 - -""" -filter NAICS data with only 3 digit codes - -Raises: - FileNotFoundError: when input excel file not existed - FileExistsError: when output csv file existed -""" -if __name__ == "__main__": - EXCEL_PATH = config.NAICS_EXCEL_PATH - CSV_PATH = config.NAICS_CSV_PATH - CODE_COL = config.NAICS_CODE_COL - TITLE_COL = config.NAICS_TITLE_COL - - # check for paths - if not os.path.isfile(EXCEL_PATH): - error_msg = "Input excel file not existed" - raise FileNotFoundError(error_msg) - if os.path.isfile(CSV_PATH): - error_msg = "Output csv file existed" - raise FileExistsError(error_msg) - - df = pd.read_excel(EXCEL_PATH, dtype=str, na_filter=False) - - # add header - result = [["code", "title"]] - - # read excel file - # and create csv data list - for index, row in df.iterrows(): - code = str(row[CODE_COL]) - if len(code) == 3: - a_row = [code, str(row[TITLE_COL])] - result.append(a_row) - - # output data to csv file - with open(CSV_PATH, "w") as f: - writer = csv.writer(f) - writer.writerows(result)