refactor: standardize repo structure and other prep for open-sourcing (#60)

A grab bag of tune-ups in prep for open-sourcing this repo.

1. Restructure repo to better follow modern Python project conventions.
    1. Move `tests` out to top-level directory.
    2. Rename `src/validator` to `regtech_data_validator`.
2. Consolidate external datasource code and data into the `data` dir.
    1. Move `config.py` settings into their respective scripts; file
        paths are now passed in as CLI args instead.
3. Move processed CSV files into the project itself. This allows for
    simpler data lookups by package name via `importlib.resources`, and
    it allowed the removal of the `ROOT_PATH` Python path logic in all of
    the `__init__.py`s. See the sketch after this list.
4. Refactor `global_data.py` to load data only once, when the module is
    first imported.
5. Refactor `SBLCheck`:
    1. Replace `warning: bool` with a more explicit `severity`, backed by
        an enum that only allows `ERROR` and `WARNING`.
        1. Several of the warning-level validations were not setting
            `warning=True`, and were thus defaulting to `False`. The new
            required `severity` arg prevents that, and I fixed all these
            instances.
        2. This removes the need for translation to `severity` when
            building JSON output.
    2. Use explicit args in the constructor, and pass all shared args on
        to the parent class. This removes the need for the `name`/`id` arg
        error handling.
6. Switch CLI output from Python dict to JSON.
7. Roll back the `black` version used in the linting Action due to a bug in
    the latest version.
    - psf/black#3953
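
To illustrate items 3 and 4, here's a minimal sketch of the pattern: package
data loaded once, at module import time, via `importlib.resources`. The
subpackage path, file name, and column name below are assumptions for
illustration, not the literal contents of `global_data.py`.

```python
# Minimal sketch, NOT the literal global_data.py. Assumes the processed
# CSVs live in a `regtech_data_validator.data` subpackage and that the
# NAICS file has a `code` column.
from importlib.resources import files

import pandas as pd

# Module-level code runs once, on first import; later imports reuse the
# cached module, so the CSV is read a single time per process.
with files("regtech_data_validator.data").joinpath("naics.csv").open("r") as f:
    naics_df = pd.read_csv(f, dtype=str)

naics_codes = set(naics_df["code"])
```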

**Note:** Some of the files that I both moved _and_ changed now show as
having deleted the old file and created a new one. I'm not sure why. I did
the moves and changes in separate commits, which usually prevents this, but
that doesn't seem to be the case here. Perhaps there's just so much change
in some of them that git considers each a whole new file? 🤷 (git detects
renames by content similarity, so heavy edits can drop a file below its
threshold.) It's kind of annoying, especially if it results in losing git
history for those files, though `git log --follow` should still be able to
trace it.
hkeeler authored Oct 20, 2023
1 parent c6585d2 commit ba6a1c4
Showing 35 changed files with 469 additions and 382 deletions.
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
@@ -49,7 +49,7 @@
"python.testing.unittestEnabled": false,
"python.testing.pytestArgs": [
"--rootdir",
"${workspaceFolder}/src/tests"
"${workspaceFolder}/tests"
]
}
}
5 changes: 4 additions & 1 deletion .github/workflows/linters.yml
@@ -8,8 +8,11 @@ jobs:
steps:
- uses: actions/checkout@v3
- uses: psf/black@stable
+  with:
+    options: "--check --diff --verbose"
+    version: "~= 22.0"
ruff:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: chartboost/ruff-action@v1
19 changes: 0 additions & 19 deletions config.py

This file was deleted.

3 changes: 3 additions & 0 deletions data/census/README.md
@@ -0,0 +1,3 @@
# FFIEC's Census Flat File

- https://www.ffiec.gov/censusapp.htm
65 changes: 37 additions & 28 deletions tools/process_census.py → data/census/process_census.py
@@ -5,10 +5,12 @@

import pandas as pd

-ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # noqa: E402
-sys.path.append(ROOT_DIR)  # noqa: E402
+# census file col indexes
+CENSUS_STATE_COL_INDEX = 2
+CENSUS_COUNTY_COL_INDEX = 3
+CENSUS_TRACT_COL_INDEX = 4
 
-import config  # noqa: E402
+CENSUS_GEOID_COL = "geoid"


# helper function to check number (float/int/negative)
@@ -21,24 +23,22 @@ def _is_number(s):


# helper function to unzip census file and extract CSV file
-def _extract_census_zip_file():
-    CENSUS_TMP_CSV_PATH = config.CENSUS_RAW_ZIP_PATH + ".tmp.csv"
+def _extract_census_zip_file(raw_src):
+    census_tmp_csv_path = raw_src + ".tmp.csv"
     # unzip and extract csv files
-    with zipfile.ZipFile(config.CENSUS_RAW_ZIP_PATH, "r") as zip_ref:
+    with zipfile.ZipFile(raw_src, "r") as zip_ref:
         for file in zip_ref.namelist():  # iterate over files in archive
             if file[-4:] == ".csv":
-                print("Extracting CSV to {}".format(CENSUS_TMP_CSV_PATH))
-                with open(CENSUS_TMP_CSV_PATH, "wb") as outfile:
+                print("Extracting CSV to {}".format(census_tmp_csv_path))
+                with open(census_tmp_csv_path, "wb") as outfile:
                     outfile.write(zip_ref.read(file))
-    # it should only have one csv file
-    return CENSUS_TMP_CSV_PATH
+    # it should only have one csv file
+
+    return census_tmp_csv_path


# helper function to read extracted csv file and filter only geo-tract-id
-def _read_census_csv(src_path: str, csv_path: str):
-    STATE_COL = config.CENSUS_STATE_COL_INDEX
-    COUNTY_COL = config.CENSUS_COUNTY_COL_INDEX
-    TRACT_COL = config.CENSUS_TRACT_COL_INDEX
+def _process_census_csv(src_path: str, csv_path: str):

# check paths
if not os.path.isfile(src_path):
@@ -52,14 +52,14 @@ def _read_census_csv(src_path: str, csv_path: str):
)

# add header
-    result = [[config.CENSUS_GEOID_COL]]
+    result = [[CENSUS_GEOID_COL]]

# read excel file
# and create csv data list
for index, row in df.iterrows():
-        state_value = str(row[STATE_COL])
-        county_value = str(row[COUNTY_COL])
-        tract_value = str(row[TRACT_COL])
+        state_value = str(row[CENSUS_STATE_COL_INDEX])
+        county_value = str(row[CENSUS_COUNTY_COL_INDEX])
+        tract_value = str(row[CENSUS_TRACT_COL_INDEX])
if (
_is_number(state_value)
and _is_number(county_value)
@@ -84,14 +84,23 @@ def _read_census_csv(src_path: str, csv_path: str):
- output to defined output file
"""
if __name__ == "__main__":
-    CSV_PATH = config.CENSUS_PROCESSED_CSV_PATH
-
-    if os.path.isfile(CSV_PATH):
-        error_msg = "Output {} csv file existed".format(CSV_PATH)
-        raise FileExistsError(error_msg)
-
-    tmp_census_csv_file = _extract_census_zip_file()
-    print("Reading extracted CSV File . {}".format(tmp_census_csv_file))
-    _read_census_csv(tmp_census_csv_file, CSV_PATH)
-    print("Removing extracted CSV File")
+    if len(sys.argv) != 3:
+        print(f"Usage: {sys.argv[0]} <raw-src> <csv-dest>")
+        exit(1)
+
+    raw_src = sys.argv[1]
+    csv_dest = sys.argv[2]
+
+    if not os.path.isfile(raw_src):
+        print(f"source file does not exist: {raw_src}")
+        exit(2)
+
+    if os.path.isfile(csv_dest):
+        print(f"destination file already exists: {csv_dest}")
+        exit(3)
+
+    tmp_census_csv_file = _extract_census_zip_file(raw_src)
+    print(f"Reading extracted CSV file: {tmp_census_csv_file}")
+    _process_census_csv(tmp_census_csv_file, csv_dest)
+    print("Removing extracted CSV file")
os.remove(tmp_census_csv_file)
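
For context, a hypothetical invocation of the reworked script, now that the
paths are CLI args rather than `config.py` settings (the file names here are
illustrative, not prescribed by the commit):
`python data/census/process_census.py CensusFlatFile2022.zip census_processed.csv`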
3 changes: 3 additions & 0 deletions data/naics/README.md
@@ -0,0 +1,3 @@
# North American Industry Classification System (NAICS) codes

- https://www.census.gov/naics/?48967
57 changes: 57 additions & 0 deletions data/naics/process_naics.py
@@ -0,0 +1,57 @@
import csv
import os
import sys

import pandas as pd


# column header text containing naics code
NAICS_CODE_COL = "2022 NAICS US Code"
# column header text containing naics title/description
NAICS_TITLE_COL = "2022 NAICS US Title"


"""
filter NAICS data with only 3 digit codes
Raises:
FileNotFoundError: when input excel file not existed
FileExistsError: when output csv file existed
"""
if __name__ == "__main__":
if len(sys.argv) != 3:
print(f"Usage: {sys.argv[0]} <raw-src> <csv-dest>")
exit(1)

raw_src = sys.argv[1]
csv_dest = sys.argv[2]

if not os.path.isfile(raw_src):
print(f"source file not existed: {raw_src}")
exit(2)

if os.path.isfile(csv_dest):
print("destination file already existed: {csv_dest}")
exit(3)

df = pd.read_excel(raw_src, dtype=str, na_filter=False)

print(f'source file successfully read: {raw_src}')

# add header
result = [["code", "title"]]

# read excel file
# and create csv data list
for index, row in df.iterrows():
code = str(row[NAICS_CODE_COL])
if len(code) == 3:
a_row = [code, str(row[NAICS_TITLE_COL])]
result.append(a_row)

# output data to csv file
with open(csv_dest, "w") as f:
writer = csv.writer(f)
writer.writerows(result)

print(f'destination file successfully written: {csv_dest}')
29 changes: 27 additions & 2 deletions poetry.lock

Some generated files are not rendered by default.

12 changes: 7 additions & 5 deletions pyproject.toml
@@ -16,6 +16,9 @@ pytest-cov = "4.1.0"
black = "23.3.0"
ruff = "0.0.259"

+[tool.poetry.group.data.dependencies]
+openpyxl = "^3.1.2"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
@@ -33,7 +36,6 @@ exclude = '''
| .gitignore
| .github
| data
-| tools
)/
'''

@@ -49,18 +51,18 @@ addopts = [
"--cov-branch",
"--cov-report=xml",
"--cov-report=term",
"--cov=src",
"--cov=regtech_data_validator",
"-vv",
"--strict-markers",
"-rfE",
]
testpaths = [
"src/tests",
"tests",
]

[tool.coverage.run]
relative_files = true
source = ["src"]
source = ["regtech_data_validator"]

[tool.coverage.report]
skip_empty = true
File renamed without changes.
File renamed without changes.
47 changes: 47 additions & 0 deletions regtech_data_validator/checks.py
@@ -0,0 +1,47 @@
"""
Subclasses of Pandera's `Check` class
"""

from enum import StrEnum
from typing import Any, Callable, Type

from pandera import Check
from pandera.backends.base import BaseCheckBackend
from pandera.backends.pandas.checks import PandasCheckBackend


class Severity(StrEnum):
ERROR = 'error'
WARNING = 'warning'


class SBLCheck(Check):
"""
A Pandera `Check` subclass that requires a `name` and an `id` be
specified. Additionally, a `severity` attribute distinguishes
warning-level checks from error-level checks, backed by the
`Severity` enum above.
"""

def __init__(self, check_fn: Callable, id: str, name: str, description: str, severity: Severity, **check_kwargs):
"""
Subclass of Pandera's `Check`, with special handling for severity level
Args:
check_fn (Callable): A function which evaluates the validity of the column(s) being tested.
id (str, required): Unique identifier for a check
name (str, required): Unique name for a check
description (str, required): Long-form description of a check
severity (Severity, required): The severity of a check (error or warning)
check_kwargs (Any, optional): Parameters passed to `check_fn` function
"""

self.severity = severity

super().__init__(check_fn, title=id, name=name, description=description, **check_kwargs)

@classmethod
def get_backend(cls, check_obj: Any) -> Type[BaseCheckBackend]:
"""Assume Pandas DataFrame and return PandasCheckBackend"""
return PandasCheckBackend
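
For reference, a hypothetical usage sketch of the new `severity`-aware check.
The check function, `id`, `name`, and `description` below are invented for
illustration; the real validations are defined elsewhere in the package.

```python
import pandas as pd

from regtech_data_validator.checks import SBLCheck, Severity

# Hypothetical check function: passes when values are at most 300 chars.
def max_text_length(series: pd.Series) -> pd.Series:
    return series.str.len() <= 300

# All identifiers here are made up for the example.
check = SBLCheck(
    max_text_length,
    id="W0001",
    name="example_field.max_text_length",
    description="Example free-form text field should not exceed 300 characters.",
    severity=Severity.WARNING,
)

assert check.severity == Severity.WARNING
assert check.title == "W0001"  # `id` is passed to Pandera as `title`
```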