Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: standardize repo structure and other prep for open-sourcing #60

Merged
merged 13 commits into from
Oct 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
"python.testing.unittestEnabled": false,
"python.testing.pytestArgs": [
"--rootdir",
"${workspaceFolder}/src/tests"
"${workspaceFolder}/tests"
]
}
}
Expand Down
5 changes: 4 additions & 1 deletion .github/workflows/linters.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@ jobs:
steps:
- uses: actions/checkout@v3
- uses: psf/black@stable
with:
options: "--check --diff --verbose"
version: "~= 22.0"
ruff:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: chartboost/ruff-action@v1
- uses: chartboost/ruff-action@v1
19 changes: 0 additions & 19 deletions config.py

This file was deleted.

3 changes: 3 additions & 0 deletions data/census/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# FFIEC's Census Flat File

- https://www.ffiec.gov/censusapp.htm
65 changes: 37 additions & 28 deletions tools/process_census.py → data/census/process_census.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@

import pandas as pd

ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # noqa: E402
sys.path.append(ROOT_DIR) # noqa: E402
# census file col indexes
CENSUS_STATE_COL_INDEX = 2
CENSUS_COUNTY_COL_INDEX = 3
CENSUS_TRACT_COL_INDEX = 4

import config # noqa: E402
CENSUS_GEOID_COL = "geoid"


# helper function to check number (float/int/negative)
Expand All @@ -21,24 +23,22 @@ def _is_number(s):


# helper function to unzip census file and extract CSV file
def _extract_census_zip_file():
CENSUS_TMP_CSV_PATH = config.CENSUS_RAW_ZIP_PATH + ".tmp.csv"
def _extract_census_zip_file(raw_src):
census_tmp_csv_path = raw_src + ".tmp.csv"
# unzip and extract csv files
with zipfile.ZipFile(config.CENSUS_RAW_ZIP_PATH, "r") as zip_ref:
with zipfile.ZipFile(raw_src, "r") as zip_ref:
for file in zip_ref.namelist(): # iterate over files in archive
if file[-4:] == ".csv":
print("Extracting CSV to {}".format(CENSUS_TMP_CSV_PATH))
with open(CENSUS_TMP_CSV_PATH, "wb") as outfile:
print("Extracting CSV to {}".format(census_tmp_csv_path))
with open(census_tmp_csv_path, "wb") as outfile:
outfile.write(zip_ref.read(file))
# it should only have one csv file
return CENSUS_TMP_CSV_PATH
# it should only have one csv file

return census_tmp_csv_path


# helper function to read extracted csv file and filter only geo-tract-id
def _read_census_csv(src_path: str, csv_path: str):
STATE_COL = config.CENSUS_STATE_COL_INDEX
COUNTY_COL = config.CENSUS_COUNTY_COL_INDEX
TRACT_COL = config.CENSUS_TRACT_COL_INDEX
def _process_census_csv(src_path: str, csv_path: str):

# check paths
if not os.path.isfile(src_path):
Expand All @@ -52,14 +52,14 @@ def _read_census_csv(src_path: str, csv_path: str):
)

# add header
result = [[config.CENSUS_GEOID_COL]]
result = [[CENSUS_GEOID_COL]]

# read excel file
# and create csv data list
for index, row in df.iterrows():
state_value = str(row[STATE_COL])
county_value = str(row[COUNTY_COL])
tract_value = str(row[TRACT_COL])
state_value = str(row[CENSUS_STATE_COL_INDEX])
county_value = str(row[CENSUS_COUNTY_COL_INDEX])
tract_value = str(row[CENSUS_TRACT_COL_INDEX])
if (
_is_number(state_value)
and _is_number(county_value)
Expand All @@ -84,14 +84,23 @@ def _read_census_csv(src_path: str, csv_path: str):
- output to defined output file
"""
if __name__ == "__main__":
CSV_PATH = config.CENSUS_PROCESSED_CSV_PATH

if os.path.isfile(CSV_PATH):
error_msg = "Output {} csv file existed".format(CSV_PATH)
raise FileExistsError(error_msg)

tmp_census_csv_file = _extract_census_zip_file()
print("Reading extracted CSV File . {}".format(tmp_census_csv_file))
_read_census_csv(tmp_census_csv_file, CSV_PATH)
print("Removing extracted CSV File")
if len(sys.argv) != 3:
print(f"Usage: {sys.argv[0]} <raw-src> <csv-dest>")
exit(1)

raw_src = sys.argv[1]
csv_dest = sys.argv[2]

if not os.path.isfile(raw_src):
print(f"source file not existed: {raw_src}")
exit(2)

if os.path.isfile(csv_dest):
print("destination file already existed: {csv_dest}")
exit(3)

tmp_census_csv_file = _extract_census_zip_file(raw_src)
print(f"Reading extracted CSV file: {tmp_census_csv_file}")
_process_census_csv(tmp_census_csv_file, csv_dest)
print("Removing extracted CSV file")
os.remove(tmp_census_csv_file)
3 changes: 3 additions & 0 deletions data/naics/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# North American Industry Classification System (NAICS) codes

- https://www.census.gov/naics/?48967
57 changes: 57 additions & 0 deletions data/naics/process_naics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import csv
import os
import sys

import pandas as pd


# column header text containing naics code
NAICS_CODE_COL = "2022 NAICS US Code"
# column header text containing naics title/description
NAICS_TITLE_COL = "2022 NAICS US Title"


def main(argv=None):
    """Filter NAICS data down to the 3-digit codes and write them to a CSV.

    Usage: process_naics.py <raw-src> <csv-dest>

    Args:
        argv: Argument vector; defaults to ``sys.argv`` when run as a script.

    Exits with status 1 on bad usage, 2 when the source Excel file does not
    exist, and 3 when the destination CSV file already exists.
    """
    argv = sys.argv if argv is None else argv

    if len(argv) != 3:
        print(f"Usage: {argv[0]} <raw-src> <csv-dest>")
        sys.exit(1)

    raw_src = argv[1]
    csv_dest = argv[2]

    if not os.path.isfile(raw_src):
        print(f"source file not existed: {raw_src}")
        sys.exit(2)

    if os.path.isfile(csv_dest):
        # BUG FIX: original used a plain string literal here, so the text
        # "{csv_dest}" was printed instead of the actual path.
        print(f"destination file already existed: {csv_dest}")
        sys.exit(3)

    # Read every cell as a string so codes keep leading zeros; na_filter=False
    # keeps empty cells as "" rather than NaN.
    df = pd.read_excel(raw_src, dtype=str, na_filter=False)

    print(f'source file successfully read: {raw_src}')

    # CSV header row.
    result = [["code", "title"]]

    # Keep only the rows whose NAICS code is exactly 3 digits long.
    for _, row in df.iterrows():
        code = str(row[NAICS_CODE_COL])
        if len(code) == 3:
            result.append([code, str(row[NAICS_TITLE_COL])])

    # newline="" lets csv.writer control line endings (avoids blank rows on
    # Windows).
    with open(csv_dest, "w", newline="") as f:
        csv.writer(f).writerows(result)

    print(f'destination file successfully written: {csv_dest}')


if __name__ == "__main__":
    main()
29 changes: 27 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 7 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ pytest-cov = "4.1.0"
black = "23.3.0"
ruff = "0.0.259"

[tool.poetry.group.data.dependencies]
openpyxl = "^3.1.2"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this used? I may have missed the usage

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it's required by the NAICS code processing script.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can add that detail to the new README I added for that dataset. Each of those could use instructions on how to run those two scripts too.


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
Expand All @@ -33,7 +36,6 @@ exclude = '''
| .gitignore
| .github
| data
| tools
)/
'''

Expand All @@ -49,18 +51,18 @@ addopts = [
"--cov-branch",
"--cov-report=xml",
"--cov-report=term",
"--cov=src",
"--cov=regtech_data_validator",
"-vv",
"--strict-markers",
"-rfE",
]
testpaths = [
"src/tests",
"tests",
]

[tool.coverage.run]
relative_files = true
source = ["src"]
source = ["regtech_data_validator"]

[tool.coverage.report]
skip_empty = true
skip_empty = true
File renamed without changes.
47 changes: 47 additions & 0 deletions regtech_data_validator/checks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
Subclasses of Pandera's `Check` class
"""

from enum import StrEnum
from typing import Any, Callable, Type

from pandera import Check
from pandera.backends.base import BaseCheckBackend
from pandera.backends.pandas.checks import PandasCheckBackend


class Severity(StrEnum):
ERROR = 'error'
WARNING = 'warning'


class SBLCheck(Check):
    """
    A Pandera `Check` subclass that requires an `id`, `name`, `description`,
    and `severity` to be specified for every check. The `severity` attribute
    distinguishes blocking errors from warnings when reporting results.

    NOTE: an earlier revision exposed a boolean `warning` attribute and
    `SBLErrorCheck`/`SBLWarningCheck` subclasses; those are gone — pass
    `severity=Severity.ERROR` or `severity=Severity.WARNING` instead.
    """

    def __init__(self, check_fn: Callable, id: str, name: str, description: str, severity: Severity, **check_kwargs):
        """
        Subclass of Pandera's `Check`, with special handling for severity level

        Args:
            check_fn (Callable): A function which evaluates the validity of the column(s) being tested.
            id (str, required): Unique identifier for a check
            name (str, required): Unique name for a check
            description (str, required): Long-form description of a check
            severity (Severity, required): The severity of a check (error or warning)
            check_kwargs (Any, optional): Parameters passed to `check_fn` function
        """

        # `severity` is our extension; Pandera's base Check knows nothing of it.
        self.severity = severity

        # Pandera's Check has no dedicated `id` field, so the id is carried
        # in the `title` attribute.
        super().__init__(check_fn, title=id, name=name, description=description, **check_kwargs)

    @classmethod
    def get_backend(cls, check_obj: Any) -> Type[BaseCheckBackend]:
        """Assume Pandas DataFrame and return PandasCheckBackend"""
        return PandasCheckBackend
Loading
Loading