From d4529ecb175a2692e35ca9376d18240ee6644d3c Mon Sep 17 00:00:00 2001 From: Hans Keeler Date: Tue, 17 Oct 2023 01:08:56 -0400 Subject: [PATCH] Fix issues related to repo restructure - Fixed all imports - Fixed test and coverage settings in pyproject.toml - Removed all Python path magic in __init__.py files - Moved data files into the repo, and used `importlib` to load files by package name instead of path. This is more portable, especially once we turn this into a distributable package. - Refactored global_data to only load data once at module load time --- pyproject.toml | 8 ++-- regtech_data_validator/__init__.py | 5 --- regtech_data_validator/create_schemas.py | 7 +-- .../data/census}/Census2022.processed.csv | 0 .../data/census/__init__.py | 0 .../data/naics}/2022_codes.csv | 0 regtech_data_validator/data/naics/__init__.py | 0 regtech_data_validator/global_data.py | 44 ++++++------------- regtech_data_validator/main.py | 3 +- regtech_data_validator/phase_validations.py | 9 ++-- tests/__init__.py | 6 --- tests/test_check_functions.py | 8 +--- tests/test_checks.py | 2 +- tests/test_global_data.py | 16 +------ tests/test_sample_data.py | 12 ++--- tests/test_schema_functions.py | 6 ++- 16 files changed, 39 insertions(+), 87 deletions(-) rename {data/census/processed => regtech_data_validator/data/census}/Census2022.processed.csv (100%) rename __init__.py => regtech_data_validator/data/census/__init__.py (100%) rename {data/naics/processed => regtech_data_validator/data/naics}/2022_codes.csv (100%) create mode 100644 regtech_data_validator/data/naics/__init__.py diff --git a/pyproject.toml b/pyproject.toml index b84d616d..4e13e24c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,18 +49,18 @@ addopts = [ "--cov-branch", "--cov-report=xml", "--cov-report=term", - "--cov=src", + "--cov=regtech_data_validator", "-vv", "--strict-markers", "-rfE", ] testpaths = [ - "src/tests", + "tests", ] [tool.coverage.run] relative_files = true -source = ["src"] +source = 
["regtech_data_validator"] [tool.coverage.report] -skip_empty = true \ No newline at end of file +skip_empty = true diff --git a/regtech_data_validator/__init__.py b/regtech_data_validator/__init__.py index 836099bf..e69de29b 100644 --- a/regtech_data_validator/__init__.py +++ b/regtech_data_validator/__init__.py @@ -1,5 +0,0 @@ -import os -import sys - -ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -sys.path.append(ROOT_DIR) diff --git a/regtech_data_validator/create_schemas.py b/regtech_data_validator/create_schemas.py index cc3cf8d7..76784af3 100644 --- a/regtech_data_validator/create_schemas.py +++ b/regtech_data_validator/create_schemas.py @@ -2,11 +2,12 @@ with validations listed in phase 1 and phase 2.""" import pandas as pd -from checks import SBLCheck from pandera import DataFrameSchema from pandera.errors import SchemaErrors -from phase_validations import get_phase_1_and_2_validations_for_lei -from schema_template import get_template + +from regtech_data_validator.checks import SBLCheck +from regtech_data_validator.phase_validations import get_phase_1_and_2_validations_for_lei +from regtech_data_validator.schema_template import get_template # Get separate schema templates for phase 1 and 2 diff --git a/data/census/processed/Census2022.processed.csv b/regtech_data_validator/data/census/Census2022.processed.csv similarity index 100% rename from data/census/processed/Census2022.processed.csv rename to regtech_data_validator/data/census/Census2022.processed.csv diff --git a/__init__.py b/regtech_data_validator/data/census/__init__.py similarity index 100% rename from __init__.py rename to regtech_data_validator/data/census/__init__.py diff --git a/data/naics/processed/2022_codes.csv b/regtech_data_validator/data/naics/2022_codes.csv similarity index 100% rename from data/naics/processed/2022_codes.csv rename to regtech_data_validator/data/naics/2022_codes.csv diff --git a/regtech_data_validator/data/naics/__init__.py 
b/regtech_data_validator/data/naics/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/regtech_data_validator/global_data.py b/regtech_data_validator/global_data.py index a9c54f04..b3364354 100644 --- a/regtech_data_validator/global_data.py +++ b/regtech_data_validator/global_data.py @@ -1,36 +1,20 @@ -import os -import sys +import csv +from importlib.resources import files -import pandas as pd -ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # noqa: E402 -sys.path.append(ROOT_DIR) # noqa: E402 +# global variable for NAICS codes +naics_codes: dict[str,str] = {} +naics_file_path = files('regtech_data_validator.data.naics').joinpath('2022_codes.csv') -from config import CENSUS_PROCESSED_CSV_PATH, NAICS_CSV_PATH # noqa: E402 +with naics_file_path.open('r') as f: + for row in csv.DictReader(f): + naics_codes[row['code']] = row['title'] -naics_codes = {} -# global variable for geoids -census_geoids = {} +# global variable for Census GEOIDs +census_geoids: set[str] = set() +census_file_path = files('regtech_data_validator.data.census').joinpath('Census2022.processed.csv') - -def read_naics_codes(csv_path: str = NAICS_CSV_PATH): - """ - read NAICS CSV file with this format: (code, description) - and populate global value: naics_codes - """ - naics_codes.clear() - df = pd.read_csv(csv_path, dtype=str, na_filter=False) - for _, row in df.iterrows(): - naics_codes.update({row.iloc[0]: row.iloc[1]}) - - -def read_geoids(csv_path: str = CENSUS_PROCESSED_CSV_PATH): - """ - read geoids CSV file with this format: (code) - and populate global value: census_geoids - """ - census_geoids.clear() - df = pd.read_csv(csv_path, dtype=str, na_filter=False) - for _, row in df.iterrows(): - census_geoids.update({row.iloc[0]: None}) +with census_file_path.open('r') as f: + for row in csv.DictReader(f): + census_geoids.add(row['geoid']) diff --git a/regtech_data_validator/main.py b/regtech_data_validator/main.py index 
20972759..e7df5b11 100644 --- a/regtech_data_validator/main.py +++ b/regtech_data_validator/main.py @@ -9,7 +9,8 @@ import sys import pandas as pd -from create_schemas import validate_phases + +from regtech_data_validator.create_schemas import validate_phases def csv_to_df(path: str) -> pd.DataFrame: diff --git a/regtech_data_validator/phase_validations.py b/regtech_data_validator/phase_validations.py index fc7b0a22..fc2ad5ea 100644 --- a/regtech_data_validator/phase_validations.py +++ b/regtech_data_validator/phase_validations.py @@ -4,8 +4,8 @@ an instance of a PanderaSchema object for phase 1 and phase 2.""" -import global_data -from check_functions import ( +from regtech_data_validator import global_data +from regtech_data_validator.check_functions import ( has_correct_length, has_no_conditional_field_conflict, has_valid_enum_pair, @@ -28,10 +28,7 @@ meets_multi_value_field_restriction, string_contains, ) -from checks import SBLCheck - -# read and populate global naics code (this should be called only once) -global_data.read_naics_codes() +from regtech_data_validator.checks import SBLCheck def get_phase_1_and_2_validations_for_lei(lei: str = None): diff --git a/tests/__init__.py b/tests/__init__.py index 238d07e8..e69de29b 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,6 +0,0 @@ -import os -import sys - -ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - -sys.path.append(os.path.join(ROOT_DIR, "validator")) diff --git a/tests/test_check_functions.py b/tests/test_check_functions.py index 65175eff..c93b5613 100644 --- a/tests/test_check_functions.py +++ b/tests/test_check_functions.py @@ -1,7 +1,7 @@ import pandas as pd -from validator import global_data -from validator.check_functions import ( +from regtech_data_validator import global_data +from regtech_data_validator.check_functions import ( has_correct_length, has_no_conditional_field_conflict, has_valid_enum_pair, @@ -474,28 +474,24 @@ def test_with_incorrect_length(self): 
class TestIsValidCode: def test_with_valid_code(self): - global_data.read_naics_codes() result = is_valid_code("111", False, global_data.naics_codes) assert result is True result = is_valid_code("111", True, global_data.naics_codes) assert result is True def test_with_invalid_code(self): - global_data.read_naics_codes() result = is_valid_code("101", False, global_data.naics_codes) assert result is False result = is_valid_code("101", True, global_data.naics_codes) assert result is False def test_with_accepted_blank(self): - global_data.read_naics_codes() result = is_valid_code("", True, global_data.naics_codes) assert result is True result = is_valid_code(" ", True, global_data.naics_codes) assert result is True def test_with_invalid_blank(self): - global_data.read_naics_codes() result = is_valid_code("", False, global_data.naics_codes) assert result is False result = is_valid_code(" ", False, global_data.naics_codes) diff --git a/tests/test_checks.py b/tests/test_checks.py index bac4cc75..dafaf512 100644 --- a/tests/test_checks.py +++ b/tests/test_checks.py @@ -1,6 +1,6 @@ import pytest -from validator.checks import SBLCheck +from regtech_data_validator.checks import SBLCheck class TestSBLCheck: diff --git a/tests/test_global_data.py b/tests/test_global_data.py index 6e8fc13f..f939cacf 100644 --- a/tests/test_global_data.py +++ b/tests/test_global_data.py @@ -1,25 +1,11 @@ import pytest -from validator import global_data +from regtech_data_validator import global_data class TestGlobalData: def test_valid_naics_codes(self): - global_data.read_naics_codes() assert len(global_data.naics_codes) == 96 def test_valid_geoids(self): - global_data.read_geoids() assert len(global_data.census_geoids) == 87275 - - def test_invalid_naics_file(self): - failed_fpath = "./data/naics/processed/2022_codes.csv1" - with pytest.raises(Exception) as exc: - global_data.read_naics_codes(failed_fpath) - assert exc.type == FileNotFoundError - - def test_invalid_geoids_file(self): - 
failed_fpath = "./data/census/processed/Census2022.processed.csv2" - with pytest.raises(Exception) as exc: - global_data.read_geoids(failed_fpath) - assert exc.type == FileNotFoundError diff --git a/tests/test_sample_data.py b/tests/test_sample_data.py index e1d6de07..35c1fe42 100644 --- a/tests/test_sample_data.py +++ b/tests/test_sample_data.py @@ -1,16 +1,10 @@ -import os -import sys - import pandas as pd import pytest -from validator.create_schemas import validate_phases - -ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # noqa: E402 -sys.path.append(ROOT_DIR) # noqa: E402 +from regtech_data_validator.create_schemas import validate_phases -GOOD_FILE_PATH = "./src/tests/data/sbl-validations-pass.csv" -BAD_FILE_PATH = "./src/tests/data/sbl-validations-fail.csv" +GOOD_FILE_PATH = "./tests/data/sbl-validations-pass.csv" +BAD_FILE_PATH = "./tests/data/sbl-validations-fail.csv" class TestValidatingSampleData: diff --git a/tests/test_schema_functions.py b/tests/test_schema_functions.py index 911dc673..0b8e0c1b 100644 --- a/tests/test_schema_functions.py +++ b/tests/test_schema_functions.py @@ -1,6 +1,10 @@ import pandas as pd -from validator.create_schemas import get_phase_1_schema_for_lei, get_phase_2_schema_for_lei, validate, validate_phases +from regtech_data_validator.create_schemas import ( + get_phase_1_schema_for_lei, + get_phase_2_schema_for_lei, + validate, validate_phases +) class TestUtil: