Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: standardize repo structure and other prep for open-sourcing #60

Merged
merged 13 commits into from
Oct 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
"python.testing.unittestEnabled": false,
"python.testing.pytestArgs": [
"--rootdir",
"${workspaceFolder}/src/tests"
"${workspaceFolder}/tests"
]
}
}
Expand Down
5 changes: 4 additions & 1 deletion .github/workflows/linters.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@ jobs:
steps:
- uses: actions/checkout@v3
- uses: psf/black@stable
with:
options: "--check --diff --verbose"
version: "~= 22.0"
ruff:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: chartboost/ruff-action@v1
- uses: chartboost/ruff-action@v1
19 changes: 0 additions & 19 deletions config.py

This file was deleted.

3 changes: 3 additions & 0 deletions data/census/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# FFIEC's Census Flat File

- https://www.ffiec.gov/censusapp.htm
65 changes: 37 additions & 28 deletions tools/process_census.py → data/census/process_census.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@

import pandas as pd

ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # noqa: E402
sys.path.append(ROOT_DIR) # noqa: E402
# census file col indexes
CENSUS_STATE_COL_INDEX = 2
CENSUS_COUNTY_COL_INDEX = 3
CENSUS_TRACT_COL_INDEX = 4

import config # noqa: E402
CENSUS_GEOID_COL = "geoid"


# helper function to check number (float/int/negative)
Expand All @@ -21,24 +23,22 @@ def _is_number(s):


# helper function to unzip census file and extract CSV file
def _extract_census_zip_file():
CENSUS_TMP_CSV_PATH = config.CENSUS_RAW_ZIP_PATH + ".tmp.csv"
def _extract_census_zip_file(raw_src):
census_tmp_csv_path = raw_src + ".tmp.csv"
# unzip and extract csv files
with zipfile.ZipFile(config.CENSUS_RAW_ZIP_PATH, "r") as zip_ref:
with zipfile.ZipFile(raw_src, "r") as zip_ref:
for file in zip_ref.namelist(): # iterate over files in archive
if file[-4:] == ".csv":
print("Extracting CSV to {}".format(CENSUS_TMP_CSV_PATH))
with open(CENSUS_TMP_CSV_PATH, "wb") as outfile:
print("Extracting CSV to {}".format(census_tmp_csv_path))
with open(census_tmp_csv_path, "wb") as outfile:
outfile.write(zip_ref.read(file))
# it should only have one csv file
return CENSUS_TMP_CSV_PATH
# it should only have one csv file

return census_tmp_csv_path


# helper function to read extracted csv file and filter only geo-tract-id
def _read_census_csv(src_path: str, csv_path: str):
STATE_COL = config.CENSUS_STATE_COL_INDEX
COUNTY_COL = config.CENSUS_COUNTY_COL_INDEX
TRACT_COL = config.CENSUS_TRACT_COL_INDEX
def _process_census_csv(src_path: str, csv_path: str):

# check paths
if not os.path.isfile(src_path):
Expand All @@ -52,14 +52,14 @@ def _read_census_csv(src_path: str, csv_path: str):
)

# add header
result = [[config.CENSUS_GEOID_COL]]
result = [[CENSUS_GEOID_COL]]

# read excel file
# and create csv data list
for index, row in df.iterrows():
state_value = str(row[STATE_COL])
county_value = str(row[COUNTY_COL])
tract_value = str(row[TRACT_COL])
state_value = str(row[CENSUS_STATE_COL_INDEX])
county_value = str(row[CENSUS_COUNTY_COL_INDEX])
tract_value = str(row[CENSUS_TRACT_COL_INDEX])
if (
_is_number(state_value)
and _is_number(county_value)
Expand All @@ -84,14 +84,23 @@ def _read_census_csv(src_path: str, csv_path: str):
- output to defined output file
"""
if __name__ == "__main__":
CSV_PATH = config.CENSUS_PROCESSED_CSV_PATH

if os.path.isfile(CSV_PATH):
error_msg = "Output {} csv file existed".format(CSV_PATH)
raise FileExistsError(error_msg)

tmp_census_csv_file = _extract_census_zip_file()
print("Reading extracted CSV File . {}".format(tmp_census_csv_file))
_read_census_csv(tmp_census_csv_file, CSV_PATH)
print("Removing extracted CSV File")
if len(sys.argv) != 3:
print(f"Usage: {sys.argv[0]} <raw-src> <csv-dest>")
exit(1)

raw_src = sys.argv[1]
csv_dest = sys.argv[2]

if not os.path.isfile(raw_src):
print(f"source file not existed: {raw_src}")
exit(2)

if os.path.isfile(csv_dest):
print("destination file already existed: {csv_dest}")
exit(3)

tmp_census_csv_file = _extract_census_zip_file(raw_src)
print(f"Reading extracted CSV file: {tmp_census_csv_file}")
_process_census_csv(tmp_census_csv_file, csv_dest)
print("Removing extracted CSV file")
os.remove(tmp_census_csv_file)
3 changes: 3 additions & 0 deletions data/naics/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# North American Industry Classification System (NAICS) codes

- https://www.census.gov/naics/?48967
57 changes: 57 additions & 0 deletions data/naics/process_naics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import csv
import os
import sys

import pandas as pd


# column header text containing naics code
NAICS_CODE_COL = "2022 NAICS US Code"
# column header text containing naics title/description
NAICS_TITLE_COL = "2022 NAICS US Title"


def main(argv=None):
    """Filter NAICS data down to the 3-digit codes and write them to a CSV.

    Usage: process_naics.py <raw-src> <csv-dest>

    Args:
        argv: Argument vector; defaults to ``sys.argv`` when run as a script.

    Exits with status 1 on bad usage, 2 when the source Excel file does not
    exist, and 3 when the destination CSV file already exists.
    """
    argv = sys.argv if argv is None else argv

    if len(argv) != 3:
        print(f"Usage: {argv[0]} <raw-src> <csv-dest>")
        sys.exit(1)

    raw_src = argv[1]
    csv_dest = argv[2]

    if not os.path.isfile(raw_src):
        print(f"source file not existed: {raw_src}")
        sys.exit(2)

    if os.path.isfile(csv_dest):
        # BUG FIX: original used a plain string literal here, so the text
        # "{csv_dest}" was printed instead of the actual path.
        print(f"destination file already existed: {csv_dest}")
        sys.exit(3)

    # Read every cell as a string so codes keep leading zeros; na_filter=False
    # keeps empty cells as "" rather than NaN.
    df = pd.read_excel(raw_src, dtype=str, na_filter=False)

    print(f'source file successfully read: {raw_src}')

    # CSV header row.
    result = [["code", "title"]]

    # Keep only the rows whose NAICS code is exactly 3 digits long.
    for _, row in df.iterrows():
        code = str(row[NAICS_CODE_COL])
        if len(code) == 3:
            result.append([code, str(row[NAICS_TITLE_COL])])

    # newline="" lets csv.writer control line endings (avoids blank rows on
    # Windows).
    with open(csv_dest, "w", newline="") as f:
        csv.writer(f).writerows(result)

    print(f'destination file successfully written: {csv_dest}')


if __name__ == "__main__":
    main()
29 changes: 27 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 7 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ pytest-cov = "4.1.0"
black = "23.3.0"
ruff = "0.0.259"

[tool.poetry.group.data.dependencies]
openpyxl = "^3.1.2"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this used? I may have missed the usage

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it's required by the NAICS code processing script.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can add that detail to the new README I added for that dataset. Each of those could use instructions on how to run those two scripts too.


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
Expand All @@ -33,7 +36,6 @@ exclude = '''
| .gitignore
| .github
| data
| tools
)/
'''

Expand All @@ -49,18 +51,18 @@ addopts = [
"--cov-branch",
"--cov-report=xml",
"--cov-report=term",
"--cov=src",
"--cov=regtech_data_validator",
"-vv",
"--strict-markers",
"-rfE",
]
testpaths = [
"src/tests",
"tests",
]

[tool.coverage.run]
relative_files = true
source = ["src"]
source = ["regtech_data_validator"]

[tool.coverage.report]
skip_empty = true
skip_empty = true
File renamed without changes.
47 changes: 47 additions & 0 deletions regtech_data_validator/checks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
Subclasses of Pandera's `Check` class
"""

from enum import StrEnum
from typing import Any, Callable, Type

from pandera import Check
from pandera.backends.base import BaseCheckBackend
from pandera.backends.pandas.checks import PandasCheckBackend


class Severity(StrEnum):
ERROR = 'error'
WARNING = 'warning'


class SBLCheck(Check):
    """
    A Pandera `Check` subclass that requires an `id`, `name`, `description`,
    and `severity` to be specified for every check. The `severity` attribute
    distinguishes blocking errors from warnings when reporting results.

    NOTE: an earlier revision exposed a boolean `warning` attribute and
    `SBLErrorCheck`/`SBLWarningCheck` subclasses; those are gone — pass
    `severity=Severity.ERROR` or `severity=Severity.WARNING` instead.
    """

    def __init__(self, check_fn: Callable, id: str, name: str, description: str, severity: Severity, **check_kwargs):
        """
        Subclass of Pandera's `Check`, with special handling for severity level

        Args:
            check_fn (Callable): A function which evaluates the validity of the column(s) being tested.
            id (str, required): Unique identifier for a check
            name (str, required): Unique name for a check
            description (str, required): Long-form description of a check
            severity (Severity, required): The severity of a check (error or warning)
            check_kwargs (Any, optional): Parameters passed to `check_fn` function
        """

        # `severity` is our extension; Pandera's base Check knows nothing of it.
        self.severity = severity

        # Pandera's Check has no dedicated `id` field, so the id is carried
        # in the `title` attribute.
        super().__init__(check_fn, title=id, name=name, description=description, **check_kwargs)

    @classmethod
    def get_backend(cls, check_obj: Any) -> Type[BaseCheckBackend]:
        """Assume Pandas DataFrame and return PandasCheckBackend"""
        return PandasCheckBackend
Loading
Loading