refactor: standardize repo structure and other prep for open-sourcing (#60)

A grab bag of tune-ups in prep for open-sourcing this repo.

1. Restructure repo to better follow modern Python project conventions.
    1. Move `tests` out to top-level directory.
    2. Rename `src/validator` to `regtech_data_validator`.
2. Consolidate external datasource code and data into the `data` dir.
    1. Move `config.py` settings into their respective scripts; file
        paths are now passed in as CLI args instead.
3. Move processed CSV files into the project itself. This allows for
    simpler data lookups by package name via `importlib.resources`, and
    it allowed the removal of the `ROOT_PATH` Python path logic in all of
    the `__init__.py`s. See the sketch after this list.
4. Refactor `global_data.py` to load data only once, when the module is
    first imported.
5. Refactor `SBLCheck`:
    1. Replace `warning: bool` with a more explicit `severity`, backed by
        an enum that only allows `ERROR` and `WARNING`.
        1. Several of the warning-level validations were not setting
            `warning=True`, and were thus defaulting to `False`. The new
            required `severity` arg prevents that, and I fixed all these
            instances.
        2. This removes the need for translation to `severity` when
            building JSON output.
    2. Use explicit args in the constructor, and pass all shared args on
        to the parent class. This removes the need for the `name`/`id` arg
        error handling.
6. Switch CLI output from Python dict to JSON.
7. Roll back the `black` version used in the linting Action due to a bug in
    the latest version.
    - psf/black#3953
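
To illustrate items 3 and 4, here's a minimal sketch of the pattern: package
data loaded once, at module import time, via `importlib.resources`. The
subpackage path, file name, and column name below are assumptions for
illustration, not the literal contents of `global_data.py`.

```python
# Minimal sketch, NOT the literal global_data.py. Assumes the processed
# CSVs live in a `regtech_data_validator.data` subpackage and that the
# NAICS file has a `code` column.
from importlib.resources import files

import pandas as pd

# Module-level code runs once, on first import; later imports reuse the
# cached module, so the CSV is read a single time per process.
with files("regtech_data_validator.data").joinpath("naics.csv").open("r") as f:
    naics_df = pd.read_csv(f, dtype=str)

naics_codes = set(naics_df["code"])
```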

**Note:** Some of the files that I both moved _and_ changed now show as
having deleted the old file and created a new one. I'm not sure why. I did
the moves and changes in separate commits, which usually prevents this, but
that doesn't seem to be the case here. Perhaps there's just so much change
in some of them that git considers each a whole new file? 🤷 (git detects
renames by content similarity, so heavy edits can drop a file below its
threshold.) It's kind of annoying, especially if it results in losing git
history for those files, though `git log --follow` should still be able to
trace it.
hkeeler authored Oct 20, 2023
1 parent c6585d2 commit ba6a1c4
Showing 35 changed files with 469 additions and 382 deletions.
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
@@ -49,7 +49,7 @@
"python.testing.unittestEnabled": false,
"python.testing.pytestArgs": [
"--rootdir",
"${workspaceFolder}/src/tests"
"${workspaceFolder}/tests"
]
}
}
5 changes: 4 additions & 1 deletion .github/workflows/linters.yml
@@ -8,8 +8,11 @@ jobs:
steps:
- uses: actions/checkout@v3
- uses: psf/black@stable
+  with:
+    options: "--check --diff --verbose"
+    version: "~= 22.0"
ruff:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: chartboost/ruff-action@v1
19 changes: 0 additions & 19 deletions config.py

This file was deleted.

3 changes: 3 additions & 0 deletions data/census/README.md
@@ -0,0 +1,3 @@
# FFIEC's Census Flat File

- https://www.ffiec.gov/censusapp.htm
65 changes: 37 additions & 28 deletions tools/process_census.py → data/census/process_census.py
@@ -5,10 +5,12 @@

import pandas as pd

-ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # noqa: E402
-sys.path.append(ROOT_DIR)  # noqa: E402
+# census file col indexes
+CENSUS_STATE_COL_INDEX = 2
+CENSUS_COUNTY_COL_INDEX = 3
+CENSUS_TRACT_COL_INDEX = 4
 
-import config  # noqa: E402
+CENSUS_GEOID_COL = "geoid"


# helper function to check number (float/int/negative)
@@ -21,24 +23,22 @@ def _is_number(s):


# helper function to unzip census file and extract CSV file
-def _extract_census_zip_file():
-    CENSUS_TMP_CSV_PATH = config.CENSUS_RAW_ZIP_PATH + ".tmp.csv"
+def _extract_census_zip_file(raw_src):
+    census_tmp_csv_path = raw_src + ".tmp.csv"
     # unzip and extract csv files
-    with zipfile.ZipFile(config.CENSUS_RAW_ZIP_PATH, "r") as zip_ref:
+    with zipfile.ZipFile(raw_src, "r") as zip_ref:
         for file in zip_ref.namelist():  # iterate over files in archive
             if file[-4:] == ".csv":
-                print("Extracting CSV to {}".format(CENSUS_TMP_CSV_PATH))
-                with open(CENSUS_TMP_CSV_PATH, "wb") as outfile:
+                print("Extracting CSV to {}".format(census_tmp_csv_path))
+                with open(census_tmp_csv_path, "wb") as outfile:
                     outfile.write(zip_ref.read(file))
-    # it should only have one csv file
-    return CENSUS_TMP_CSV_PATH
+    # it should only have one csv file
+
+    return census_tmp_csv_path


# helper function to read extracted csv file and filter only geo-tract-id
-def _read_census_csv(src_path: str, csv_path: str):
-    STATE_COL = config.CENSUS_STATE_COL_INDEX
-    COUNTY_COL = config.CENSUS_COUNTY_COL_INDEX
-    TRACT_COL = config.CENSUS_TRACT_COL_INDEX
+def _process_census_csv(src_path: str, csv_path: str):

# check paths
if not os.path.isfile(src_path):
@@ -52,14 +52,14 @@ def _read_census_csv(src_path: str, csv_path: str):
)

# add header
-    result = [[config.CENSUS_GEOID_COL]]
+    result = [[CENSUS_GEOID_COL]]

# read excel file
# and create csv data list
for index, row in df.iterrows():
-        state_value = str(row[STATE_COL])
-        county_value = str(row[COUNTY_COL])
-        tract_value = str(row[TRACT_COL])
+        state_value = str(row[CENSUS_STATE_COL_INDEX])
+        county_value = str(row[CENSUS_COUNTY_COL_INDEX])
+        tract_value = str(row[CENSUS_TRACT_COL_INDEX])
if (
_is_number(state_value)
and _is_number(county_value)
@@ -84,14 +84,23 @@ def _read_census_csv(src_path: str, csv_path: str):
- output to defined output file
"""
if __name__ == "__main__":
-    CSV_PATH = config.CENSUS_PROCESSED_CSV_PATH
-
-    if os.path.isfile(CSV_PATH):
-        error_msg = "Output {} csv file existed".format(CSV_PATH)
-        raise FileExistsError(error_msg)
-
-    tmp_census_csv_file = _extract_census_zip_file()
-    print("Reading extracted CSV File . {}".format(tmp_census_csv_file))
-    _read_census_csv(tmp_census_csv_file, CSV_PATH)
-    print("Removing extracted CSV File")
+    if len(sys.argv) != 3:
+        print(f"Usage: {sys.argv[0]} <raw-src> <csv-dest>")
+        exit(1)
+
+    raw_src = sys.argv[1]
+    csv_dest = sys.argv[2]
+
+    if not os.path.isfile(raw_src):
+        print(f"source file does not exist: {raw_src}")
+        exit(2)
+
+    if os.path.isfile(csv_dest):
+        print(f"destination file already exists: {csv_dest}")
+        exit(3)
+
+    tmp_census_csv_file = _extract_census_zip_file(raw_src)
+    print(f"Reading extracted CSV file: {tmp_census_csv_file}")
+    _process_census_csv(tmp_census_csv_file, csv_dest)
+    print("Removing extracted CSV file")
os.remove(tmp_census_csv_file)
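
For context, a hypothetical invocation of the reworked script, now that the
paths are CLI args rather than `config.py` settings (the file names here are
illustrative, not prescribed by the commit):
`python data/census/process_census.py CensusFlatFile2022.zip census_processed.csv`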
3 changes: 3 additions & 0 deletions data/naics/README.md
@@ -0,0 +1,3 @@
# North American Industry Classification System (NAICS) codes

- https://www.census.gov/naics/?48967
57 changes: 57 additions & 0 deletions data/naics/process_naics.py
@@ -0,0 +1,57 @@
import csv
import os
import sys

import pandas as pd


# column header text containing naics code
NAICS_CODE_COL = "2022 NAICS US Code"
# column header text containing naics title/description
NAICS_TITLE_COL = "2022 NAICS US Title"


"""
filter NAICS data with only 3 digit codes
Raises:
FileNotFoundError: when input excel file not existed
FileExistsError: when output csv file existed
"""
if __name__ == "__main__":
if len(sys.argv) != 3:
print(f"Usage: {sys.argv[0]} <raw-src> <csv-dest>")
exit(1)

raw_src = sys.argv[1]
csv_dest = sys.argv[2]

if not os.path.isfile(raw_src):
print(f"source file not existed: {raw_src}")
exit(2)

if os.path.isfile(csv_dest):
print("destination file already existed: {csv_dest}")
exit(3)

df = pd.read_excel(raw_src, dtype=str, na_filter=False)

print(f'source file successfully read: {raw_src}')

# add header
result = [["code", "title"]]

# read excel file
# and create csv data list
for index, row in df.iterrows():
code = str(row[NAICS_CODE_COL])
if len(code) == 3:
a_row = [code, str(row[NAICS_TITLE_COL])]
result.append(a_row)

# output data to csv file
with open(csv_dest, "w") as f:
writer = csv.writer(f)
writer.writerows(result)

print(f'destination file successfully written: {csv_dest}')
29 changes: 27 additions & 2 deletions poetry.lock

Some generated files are not rendered by default.

12 changes: 7 additions & 5 deletions pyproject.toml
@@ -16,6 +16,9 @@ pytest-cov = "4.1.0"
black = "23.3.0"
ruff = "0.0.259"

+[tool.poetry.group.data.dependencies]
+openpyxl = "^3.1.2"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
@@ -33,7 +36,6 @@ exclude = '''
| .gitignore
| .github
| data
-| tools
)/
'''

@@ -49,18 +51,18 @@ addopts = [
"--cov-branch",
"--cov-report=xml",
"--cov-report=term",
"--cov=src",
"--cov=regtech_data_validator",
"-vv",
"--strict-markers",
"-rfE",
]
testpaths = [
"src/tests",
"tests",
]

[tool.coverage.run]
relative_files = true
source = ["src"]
source = ["regtech_data_validator"]

[tool.coverage.report]
skip_empty = true
File renamed without changes.
File renamed without changes.
47 changes: 47 additions & 0 deletions regtech_data_validator/checks.py
@@ -0,0 +1,47 @@
"""
Subclasses of Pandera's `Check` class
"""

from enum import StrEnum
from typing import Any, Callable, Type

from pandera import Check
from pandera.backends.base import BaseCheckBackend
from pandera.backends.pandas.checks import PandasCheckBackend


class Severity(StrEnum):
ERROR = 'error'
WARNING = 'warning'


class SBLCheck(Check):
"""
A Pandera `Check` subclass that requires a `name` and an `id` be
specified. Additionally, a `severity` attribute distinguishes
warning-level checks from error-level checks, backed by the
`Severity` enum above.
"""

def __init__(self, check_fn: Callable, id: str, name: str, description: str, severity: Severity, **check_kwargs):
"""
Subclass of Pandera's `Check`, with special handling for severity level
Args:
check_fn (Callable): A function which evaluates the validity of the column(s) being tested.
id (str, required): Unique identifier for a check
name (str, required): Unique name for a check
description (str, required): Long-form description of a check
severity (Severity, required): The severity of a check (error or warning)
check_kwargs (Any, optional): Parameters passed to `check_fn` function
"""

self.severity = severity

super().__init__(check_fn, title=id, name=name, description=description, **check_kwargs)

@classmethod
def get_backend(cls, check_obj: Any) -> Type[BaseCheckBackend]:
"""Assume Pandas DataFrame and return PandasCheckBackend"""
return PandasCheckBackend
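
For reference, a hypothetical usage sketch of the new `severity`-aware check.
The check function, `id`, `name`, and `description` below are invented for
illustration; the real validations are defined elsewhere in the package.

```python
import pandas as pd

from regtech_data_validator.checks import SBLCheck, Severity

# Hypothetical check function: passes when values are at most 300 chars.
def max_text_length(series: pd.Series) -> pd.Series:
    return series.str.len() <= 300

# All identifiers here are made up for the example.
check = SBLCheck(
    max_text_length,
    id="W0001",
    name="example_field.max_text_length",
    description="Example free-form text field should not exceed 300 characters.",
    severity=Severity.WARNING,
)

assert check.severity == Severity.WARNING
assert check.title == "W0001"  # `id` is passed to Pandera as `title`
```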