Skip to content

Commit

Permalink
Fix issues related to repo restructure
Browse files Browse the repository at this point in the history
- Fixed all imports
- Fixed test and coverage settings in pyproject.toml
- Removed all Python path magic in __init__.py files
- Moved data files into the repo, and used `importlib.resources` to load
  them by package name instead of by filesystem path. This is more
  portable, especially once we turn this into a distributable package.
- Refactored global_data to only load data once at module load time
  • Loading branch information
hkeeler committed Oct 17, 2023
1 parent 722b981 commit d4529ec
Show file tree
Hide file tree
Showing 16 changed files with 39 additions and 87 deletions.
8 changes: 4 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,18 +49,18 @@ addopts = [
"--cov-branch",
"--cov-report=xml",
"--cov-report=term",
"--cov=src",
"--cov=regtech_data_validator",
"-vv",
"--strict-markers",
"-rfE",
]
testpaths = [
"src/tests",
"tests",
]

[tool.coverage.run]
relative_files = true
source = ["src"]
source = ["regtech_data_validator"]

[tool.coverage.report]
skip_empty = true
skip_empty = true
5 changes: 0 additions & 5 deletions regtech_data_validator/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +0,0 @@
import os
import sys

ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(ROOT_DIR)
7 changes: 4 additions & 3 deletions regtech_data_validator/create_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
with validations listed in phase 1 and phase 2."""

import pandas as pd
from checks import SBLCheck
from pandera import DataFrameSchema
from pandera.errors import SchemaErrors
from phase_validations import get_phase_1_and_2_validations_for_lei
from schema_template import get_template

from regtech_data_validator.checks import SBLCheck
from regtech_data_validator.phase_validations import get_phase_1_and_2_validations_for_lei
from regtech_data_validator.schema_template import get_template

# Get separate schema templates for phase 1 and 2

Expand Down
File renamed without changes.
File renamed without changes.
Empty file.
44 changes: 14 additions & 30 deletions regtech_data_validator/global_data.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,20 @@
import os
import sys
import csv
from importlib.resources import files

import pandas as pd

ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # noqa: E402
sys.path.append(ROOT_DIR) # noqa: E402
# global variable for NAICS codes
naics_codes: dict[str,str] = {}
naics_file_path = files('regtech_data_validator.data.naics').joinpath('2022_codes.csv')

from config import CENSUS_PROCESSED_CSV_PATH, NAICS_CSV_PATH # noqa: E402
with naics_file_path.open('r') as f:
for row in csv.DictReader(f):
naics_codes[row['code']] = row['title']

naics_codes = {}

# global variable for geoids
census_geoids = {}
# global variable for Census GEOIDs
census_geoids: set[str] = set()
census_file_path = files('regtech_data_validator.data.census').joinpath('Census2022.processed.csv')


def read_naics_codes(csv_path: str = NAICS_CSV_PATH):
"""
read NAICS CSV file with this format: (code, description)
and populate global value: naics_codes
"""
naics_codes.clear()
df = pd.read_csv(csv_path, dtype=str, na_filter=False)
for _, row in df.iterrows():
naics_codes.update({row.iloc[0]: row.iloc[1]})


def read_geoids(csv_path: str = CENSUS_PROCESSED_CSV_PATH):
"""
read geoids CSV file with this format: (code)
and populate global value: census_geoids
"""
census_geoids.clear()
df = pd.read_csv(csv_path, dtype=str, na_filter=False)
for _, row in df.iterrows():
census_geoids.update({row.iloc[0]: None})
with census_file_path.open('r') as f:
for row in csv.DictReader(f):
census_geoids.add(row['geoid'])
3 changes: 2 additions & 1 deletion regtech_data_validator/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
import sys

import pandas as pd
from create_schemas import validate_phases

from regtech_data_validator.create_schemas import validate_phases


def csv_to_df(path: str) -> pd.DataFrame:
Expand Down
9 changes: 3 additions & 6 deletions regtech_data_validator/phase_validations.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
an instance of a PanderaSchema object for phase 1 and phase 2."""


import global_data
from check_functions import (
from regtech_data_validator import global_data
from regtech_data_validator.check_functions import (
has_correct_length,
has_no_conditional_field_conflict,
has_valid_enum_pair,
Expand All @@ -28,10 +28,7 @@
meets_multi_value_field_restriction,
string_contains,
)
from checks import SBLCheck

# read and populate global naics code (this should be called only once)
global_data.read_naics_codes()
from regtech_data_validator.checks import SBLCheck


def get_phase_1_and_2_validations_for_lei(lei: str = None):
Expand Down
6 changes: 0 additions & 6 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +0,0 @@
import os
import sys

ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

sys.path.append(os.path.join(ROOT_DIR, "validator"))
8 changes: 2 additions & 6 deletions tests/test_check_functions.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd

from validator import global_data
from validator.check_functions import (
from regtech_data_validator import global_data
from regtech_data_validator.check_functions import (
has_correct_length,
has_no_conditional_field_conflict,
has_valid_enum_pair,
Expand Down Expand Up @@ -474,28 +474,24 @@ def test_with_incorrect_length(self):

class TestIsValidCode:
def test_with_valid_code(self):
global_data.read_naics_codes()
result = is_valid_code("111", False, global_data.naics_codes)
assert result is True
result = is_valid_code("111", True, global_data.naics_codes)
assert result is True

def test_with_invalid_code(self):
global_data.read_naics_codes()
result = is_valid_code("101", False, global_data.naics_codes)
assert result is False
result = is_valid_code("101", True, global_data.naics_codes)
assert result is False

def test_with_accepted_blank(self):
global_data.read_naics_codes()
result = is_valid_code("", True, global_data.naics_codes)
assert result is True
result = is_valid_code(" ", True, global_data.naics_codes)
assert result is True

def test_with_invalid_blank(self):
global_data.read_naics_codes()
result = is_valid_code("", False, global_data.naics_codes)
assert result is False
result = is_valid_code(" ", False, global_data.naics_codes)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_checks.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest

from validator.checks import SBLCheck
from regtech_data_validator.checks import SBLCheck


class TestSBLCheck:
Expand Down
16 changes: 1 addition & 15 deletions tests/test_global_data.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,11 @@
import pytest

from validator import global_data
from regtech_data_validator import global_data


class TestGlobalData:
def test_valid_naics_codes(self):
global_data.read_naics_codes()
assert len(global_data.naics_codes) == 96

def test_valid_geoids(self):
global_data.read_geoids()
assert len(global_data.census_geoids) == 87275

def test_invalid_naics_file(self):
failed_fpath = "./data/naics/processed/2022_codes.csv1"
with pytest.raises(Exception) as exc:
global_data.read_naics_codes(failed_fpath)
assert exc.type == FileNotFoundError

def test_invalid_geoids_file(self):
failed_fpath = "./data/census/processed/Census2022.processed.csv2"
with pytest.raises(Exception) as exc:
global_data.read_geoids(failed_fpath)
assert exc.type == FileNotFoundError
12 changes: 3 additions & 9 deletions tests/test_sample_data.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,10 @@
import os
import sys

import pandas as pd
import pytest

from validator.create_schemas import validate_phases

ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # noqa: E402
sys.path.append(ROOT_DIR) # noqa: E402
from regtech_data_validator.create_schemas import validate_phases

GOOD_FILE_PATH = "./src/tests/data/sbl-validations-pass.csv"
BAD_FILE_PATH = "./src/tests/data/sbl-validations-fail.csv"
GOOD_FILE_PATH = "./tests/data/sbl-validations-pass.csv"
BAD_FILE_PATH = "./tests/data/sbl-validations-fail.csv"


class TestValidatingSampleData:
Expand Down
6 changes: 5 additions & 1 deletion tests/test_schema_functions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
import pandas as pd

from validator.create_schemas import get_phase_1_schema_for_lei, get_phase_2_schema_for_lei, validate, validate_phases
from regtech_data_validator.create_schemas import (
get_phase_1_schema_for_lei,
get_phase_2_schema_for_lei,
validate, validate_phases
)


class TestUtil:
Expand Down

0 comments on commit d4529ec

Please sign in to comment.