Skip to content

Commit

Permalink
Merge branch 'main' into features/update_po_validation_ids
Browse files Browse the repository at this point in the history
  • Loading branch information
Aldrian Harjati committed Oct 23, 2023
2 parents 3ad46cf + ba6a1c4 commit 07d40c6
Show file tree
Hide file tree
Showing 35 changed files with 467 additions and 383 deletions.
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
"python.testing.unittestEnabled": false,
"python.testing.pytestArgs": [
"--rootdir",
"${workspaceFolder}/src/tests"
"${workspaceFolder}/tests"
]
}
}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/linters.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: chartboost/ruff-action@v1
- uses: chartboost/ruff-action@v1
19 changes: 0 additions & 19 deletions config.py

This file was deleted.

3 changes: 3 additions & 0 deletions data/census/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# FFIEC's Census Flat File

- https://www.ffiec.gov/censusapp.htm
65 changes: 37 additions & 28 deletions tools/process_census.py → data/census/process_census.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@

import pandas as pd

ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # noqa: E402
sys.path.append(ROOT_DIR) # noqa: E402
# census file col indexes
CENSUS_STATE_COL_INDEX = 2
CENSUS_COUNTY_COL_INDEX = 3
CENSUS_TRACT_COL_INDEX = 4

import config # noqa: E402
CENSUS_GEOID_COL = "geoid"


# helper function to check number (float/int/negative)
Expand All @@ -21,24 +23,22 @@ def _is_number(s):


# helper function to unzip census file and extract CSV file
def _extract_census_zip_file():
CENSUS_TMP_CSV_PATH = config.CENSUS_RAW_ZIP_PATH + ".tmp.csv"
def _extract_census_zip_file(raw_src):
census_tmp_csv_path = raw_src + ".tmp.csv"
# unzip and extract csv files
with zipfile.ZipFile(config.CENSUS_RAW_ZIP_PATH, "r") as zip_ref:
with zipfile.ZipFile(raw_src, "r") as zip_ref:
for file in zip_ref.namelist(): # iterate over files in archive
if file[-4:] == ".csv":
print("Extracting CSV to {}".format(CENSUS_TMP_CSV_PATH))
with open(CENSUS_TMP_CSV_PATH, "wb") as outfile:
print("Extracting CSV to {}".format(census_tmp_csv_path))
with open(census_tmp_csv_path, "wb") as outfile:
outfile.write(zip_ref.read(file))
# it should only have one csv file
return CENSUS_TMP_CSV_PATH
# it should only have one csv file

return census_tmp_csv_path


# helper function to read extracted csv file and filter only geo-tract-id
def _read_census_csv(src_path: str, csv_path: str):
STATE_COL = config.CENSUS_STATE_COL_INDEX
COUNTY_COL = config.CENSUS_COUNTY_COL_INDEX
TRACT_COL = config.CENSUS_TRACT_COL_INDEX
def _process_census_csv(src_path: str, csv_path: str):

# check paths
if not os.path.isfile(src_path):
Expand All @@ -52,14 +52,14 @@ def _read_census_csv(src_path: str, csv_path: str):
)

# add header
result = [[config.CENSUS_GEOID_COL]]
result = [[CENSUS_GEOID_COL]]

# read excel file
# and create csv data list
for index, row in df.iterrows():
state_value = str(row[STATE_COL])
county_value = str(row[COUNTY_COL])
tract_value = str(row[TRACT_COL])
state_value = str(row[CENSUS_STATE_COL_INDEX])
county_value = str(row[CENSUS_COUNTY_COL_INDEX])
tract_value = str(row[CENSUS_TRACT_COL_INDEX])
if (
_is_number(state_value)
and _is_number(county_value)
Expand All @@ -84,14 +84,23 @@ def _read_census_csv(src_path: str, csv_path: str):
- output to defined output file
"""
if __name__ == "__main__":
    # Usage: process_census.py <raw-src> <csv-dest>
    # Exit codes: 1 = bad arguments, 2 = missing source, 3 = destination exists
    if len(sys.argv) != 3:
        print(f"Usage: {sys.argv[0]} <raw-src> <csv-dest>")
        exit(1)

    raw_src = sys.argv[1]
    csv_dest = sys.argv[2]

    if not os.path.isfile(raw_src):
        print(f"source file not existed: {raw_src}")
        exit(2)

    if os.path.isfile(csv_dest):
        # BUG FIX: the original string lacked the f-prefix, so the literal
        # text "{csv_dest}" was printed instead of the destination path
        print(f"destination file already existed: {csv_dest}")
        exit(3)

    # unzip the raw archive, process the extracted CSV, then clean up
    tmp_census_csv_file = _extract_census_zip_file(raw_src)
    print(f"Reading extracted CSV file: {tmp_census_csv_file}")
    _process_census_csv(tmp_census_csv_file, csv_dest)
    print("Removing extracted CSV file")
    os.remove(tmp_census_csv_file)
3 changes: 3 additions & 0 deletions data/naics/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# North American Industry Classification System (NAICS) codes

- https://www.census.gov/naics/?48967
57 changes: 57 additions & 0 deletions data/naics/process_naics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""
filter NAICS data with only 3 digit codes

Usage: process_naics.py <raw-src> <csv-dest>
Exit codes: 1 = bad arguments, 2 = missing source, 3 = destination exists
"""
import csv
import os
import sys

import pandas as pd


# column header text containing naics code
NAICS_CODE_COL = "2022 NAICS US Code"
# column header text containing naics title/description
NAICS_TITLE_COL = "2022 NAICS US Title"


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print(f"Usage: {sys.argv[0]} <raw-src> <csv-dest>")
        exit(1)

    raw_src = sys.argv[1]
    csv_dest = sys.argv[2]

    if not os.path.isfile(raw_src):
        print(f"source file not existed: {raw_src}")
        exit(2)

    if os.path.isfile(csv_dest):
        # BUG FIX: the original string lacked the f-prefix, so the literal
        # text "{csv_dest}" was printed instead of the destination path
        print(f"destination file already existed: {csv_dest}")
        exit(3)

    # read every cell as a string (dtype=str) and keep blanks as empty
    # strings rather than NaN (na_filter=False)
    df = pd.read_excel(raw_src, dtype=str, na_filter=False)

    print(f"source file successfully read: {raw_src}")

    # add header
    result = [["code", "title"]]

    # read excel file and keep only rows whose NAICS code is exactly 3 digits
    for index, row in df.iterrows():
        code = str(row[NAICS_CODE_COL])
        if len(code) == 3:
            a_row = [code, str(row[NAICS_TITLE_COL])]
            result.append(a_row)

    # output data to csv file
    with open(csv_dest, "w") as f:
        writer = csv.writer(f)
        writer.writerows(result)

    print(f"destination file successfully written: {csv_dest}")
29 changes: 27 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 7 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ pytest-cov = "4.1.0"
black = "23.3.0"
ruff = "0.0.259"

[tool.poetry.group.data.dependencies]
openpyxl = "^3.1.2"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
Expand All @@ -33,7 +36,6 @@ exclude = '''
| .gitignore
| .github
| data
| tools
)/
'''

Expand All @@ -49,18 +51,18 @@ addopts = [
"--cov-branch",
"--cov-report=xml",
"--cov-report=term",
"--cov=src",
"--cov=regtech_data_validator",
"-vv",
"--strict-markers",
"-rfE",
]
testpaths = [
"src/tests",
"tests",
]

[tool.coverage.run]
relative_files = true
source = ["src"]
source = ["regtech_data_validator"]

[tool.coverage.report]
skip_empty = true
skip_empty = true
File renamed without changes.
File renamed without changes.
47 changes: 47 additions & 0 deletions regtech_data_validator/checks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
Subclasses of Pandera's `Check` class
"""

from enum import StrEnum
from typing import Any, Callable, Type

from pandera import Check
from pandera.backends.base import BaseCheckBackend
from pandera.backends.pandas.checks import PandasCheckBackend


class Severity(StrEnum):
    """Severity level of a check; values compare equal to their plain strings."""

    ERROR = 'error'
    WARNING = 'warning'


class SBLCheck(Check):
    """
    A `pandera.Check` subclass that requires an `id`, `name`, `description`,
    and `severity` to be specified.

    The extra `severity` attribute (a `Severity` value, error or warning)
    lets callers distinguish hard failures from warnings. The check's `id`
    is stored in Pandera's `title` field.
    """

    # NOTE(review): the `id` parameter shadows the `id` builtin; kept as-is
    # since it is part of the public signature.
    def __init__(self, check_fn: Callable, id: str, name: str, description: str, severity: Severity, **check_kwargs):
        """
        Subclass of Pandera's `Check`, with special handling for severity level

        Args:
            check_fn (Callable): A function which evaluates the validity of the column(s) being tested.
            id (str, required): Unique identifier for a check
            name (str, required): Unique name for a check
            description (str, required): Long-form description of a check
            severity (Severity, required): The severity of a check (error or warning)
            check_kwargs (Any, optional): Parameters passed to `check_fn` function
        """

        # severity is not part of the base Check API; stored on the instance
        # so consumers can tell errors and warnings apart
        self.severity = severity

        # map `id` onto Pandera's `title` attribute
        super().__init__(check_fn, title=id, name=name, description=description, **check_kwargs)

    @classmethod
    def get_backend(cls, check_obj: Any) -> Type[BaseCheckBackend]:
        """Assume Pandas DataFrame and return PandasCheckBackend"""
        return PandasCheckBackend
Loading

0 comments on commit 07d40c6

Please sign in to comment.