Fix issues related to repo restructure

- Fixed all imports
- Fixed test and coverage settings in pyproject.toml
- Removed all Python path magic in `__init__.py` files
- Moved data files into the repo, and used `importlib.resources` to load files by package name instead of path. This is more portable, especially once we turn this into a distributable package.
- Refactored global_data to only load data once at module load time
cfpb · Oct 17, 2023 · d4529ec · d4529ec
1 parent 722b981
commit d4529ec
Show file tree

Hide file tree

Showing 16 changed files with 39 additions and 87 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -49,18 +49,18 @@ addopts = [
     "--cov-branch",
     "--cov-report=xml",
     "--cov-report=term",
-    "--cov=src",
+    "--cov=regtech_data_validator",
     "-vv",
     "--strict-markers",
     "-rfE",
 ]
 testpaths = [
-    "src/tests",
+    "tests",
 ]
 
 [tool.coverage.run]
 relative_files = true
-source = ["src"]
+source = ["regtech_data_validator"]
 
 [tool.coverage.report]
-skip_empty = true
+skip_empty = true
diff --git a/regtech_data_validator/__init__.py b/regtech_data_validator/__init__.py
@@ -1,5 +0,0 @@
-import os
-import sys
-
-ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-sys.path.append(ROOT_DIR)

diff --git a/regtech_data_validator/create_schemas.py b/regtech_data_validator/create_schemas.py
@@ -2,11 +2,12 @@
 with validations listed in phase 1 and phase 2."""
 
 import pandas as pd
-from checks import SBLCheck
 from pandera import DataFrameSchema
 from pandera.errors import SchemaErrors
-from phase_validations import get_phase_1_and_2_validations_for_lei
-from schema_template import get_template
+
+from regtech_data_validator.checks import SBLCheck
+from regtech_data_validator.phase_validations import get_phase_1_and_2_validations_for_lei
+from regtech_data_validator.schema_template import get_template
 
 # Get separate schema templates for phase 1 and 2
 

diff --git a/...census/processed/Census2022.processed.csv → ...ator/data/census/Census2022.processed.csv b/...census/processed/Census2022.processed.csv → ...ator/data/census/Census2022.processed.csv
diff --git a/__init__.py → ...ch_data_validator/data/census/__init__.py b/__init__.py → ...ch_data_validator/data/census/__init__.py
diff --git a/data/naics/processed/2022_codes.csv → ..._data_validator/data/naics/2022_codes.csv b/data/naics/processed/2022_codes.csv → ..._data_validator/data/naics/2022_codes.csv
diff --git a/regtech_data_validator/data/naics/__init__.py b/regtech_data_validator/data/naics/__init__.py
diff --git a/regtech_data_validator/global_data.py b/regtech_data_validator/global_data.py
@@ -1,36 +1,20 @@
-import os
-import sys
+import csv
+from importlib.resources import files
 
-import pandas as pd
 
-ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  # noqa: E402
-sys.path.append(ROOT_DIR)  # noqa: E402
+# global variable for NAICS codes
+naics_codes: dict[str,str] = {}
+naics_file_path = files('regtech_data_validator.data.naics').joinpath('2022_codes.csv')
 
-from config import CENSUS_PROCESSED_CSV_PATH, NAICS_CSV_PATH  # noqa: E402
+with naics_file_path.open('r') as f:
+    for row in csv.DictReader(f):
+        naics_codes[row['code']] = row['title']
 
-naics_codes = {}
 
-# global variable for geoids
-census_geoids = {}
+# global variable for Census GEOIDs
+census_geoids: set[str] = set()
+census_file_path = files('regtech_data_validator.data.census').joinpath('Census2022.processed.csv')
 
-
-def read_naics_codes(csv_path: str = NAICS_CSV_PATH):
-    """
-    read NAICS CSV file with this format: (code, description)
-    and populate global value: naics_codes
-    """
-    naics_codes.clear()
-    df = pd.read_csv(csv_path, dtype=str, na_filter=False)
-    for _, row in df.iterrows():
-        naics_codes.update({row.iloc[0]: row.iloc[1]})
-
-
-def read_geoids(csv_path: str = CENSUS_PROCESSED_CSV_PATH):
-    """
-    read geoids CSV file with this format: (code)
-    and populate global value: census_geoids
-    """
-    census_geoids.clear()
-    df = pd.read_csv(csv_path, dtype=str, na_filter=False)
-    for _, row in df.iterrows():
-        census_geoids.update({row.iloc[0]: None})
+with census_file_path.open('r') as f:
+    for row in csv.DictReader(f):
+        census_geoids.add(row['geoid'])
diff --git a/regtech_data_validator/main.py b/regtech_data_validator/main.py
@@ -9,7 +9,8 @@
 import sys
 
 import pandas as pd
-from create_schemas import validate_phases
+
+from regtech_data_validator.create_schemas import validate_phases
 
 
 def csv_to_df(path: str) -> pd.DataFrame:

diff --git a/regtech_data_validator/phase_validations.py b/regtech_data_validator/phase_validations.py
@@ -4,8 +4,8 @@
 an instance of a PanderaSchema object for phase 1 and phase 2."""
 
 
-import global_data
-from check_functions import (
+from regtech_data_validator import global_data
+from regtech_data_validator.check_functions import (
     has_correct_length,
     has_no_conditional_field_conflict,
     has_valid_enum_pair,
@@ -28,10 +28,7 @@
     meets_multi_value_field_restriction,
     string_contains,
 )
-from checks import SBLCheck
-
-# read and populate global naics code (this should be called only once)
-global_data.read_naics_codes()
+from regtech_data_validator.checks import SBLCheck
 
 
 def get_phase_1_and_2_validations_for_lei(lei: str = None):

diff --git a/tests/__init__.py b/tests/__init__.py
@@ -1,6 +0,0 @@
-import os
-import sys
-
-ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-
-sys.path.append(os.path.join(ROOT_DIR, "validator"))

diff --git a/tests/test_check_functions.py b/tests/test_check_functions.py
@@ -1,7 +1,7 @@
 import pandas as pd
 
-from validator import global_data
-from validator.check_functions import (
+from regtech_data_validator import global_data
+from regtech_data_validator.check_functions import (
     has_correct_length,
     has_no_conditional_field_conflict,
     has_valid_enum_pair,
@@ -474,28 +474,24 @@ def test_with_incorrect_length(self):
 
 class TestIsValidCode:
     def test_with_valid_code(self):
-        global_data.read_naics_codes()
         result = is_valid_code("111", False, global_data.naics_codes)
         assert result is True
         result = is_valid_code("111", True, global_data.naics_codes)
         assert result is True
 
     def test_with_invalid_code(self):
-        global_data.read_naics_codes()
         result = is_valid_code("101", False, global_data.naics_codes)
         assert result is False
         result = is_valid_code("101", True, global_data.naics_codes)
         assert result is False
 
     def test_with_accepted_blank(self):
-        global_data.read_naics_codes()
         result = is_valid_code("", True, global_data.naics_codes)
         assert result is True
         result = is_valid_code(" ", True, global_data.naics_codes)
         assert result is True
 
     def test_with_invalid_blank(self):
-        global_data.read_naics_codes()
         result = is_valid_code("", False, global_data.naics_codes)
         assert result is False
         result = is_valid_code(" ", False, global_data.naics_codes)

diff --git a/tests/test_checks.py b/tests/test_checks.py
@@ -1,6 +1,6 @@
 import pytest
 
-from validator.checks import SBLCheck
+from regtech_data_validator.checks import SBLCheck
 
 
 class TestSBLCheck:

diff --git a/tests/test_global_data.py b/tests/test_global_data.py
@@ -1,25 +1,11 @@
 import pytest
 
-from validator import global_data
+from regtech_data_validator import global_data
 
 
 class TestGlobalData:
     def test_valid_naics_codes(self):
-        global_data.read_naics_codes()
         assert len(global_data.naics_codes) == 96
 
     def test_valid_geoids(self):
-        global_data.read_geoids()
         assert len(global_data.census_geoids) == 87275
-
-    def test_invalid_naics_file(self):
-        failed_fpath = "./data/naics/processed/2022_codes.csv1"
-        with pytest.raises(Exception) as exc:
-            global_data.read_naics_codes(failed_fpath)
-        assert exc.type == FileNotFoundError
-
-    def test_invalid_geoids_file(self):
-        failed_fpath = "./data/census/processed/Census2022.processed.csv2"
-        with pytest.raises(Exception) as exc:
-            global_data.read_geoids(failed_fpath)
-        assert exc.type == FileNotFoundError
diff --git a/tests/test_sample_data.py b/tests/test_sample_data.py
@@ -1,16 +1,10 @@
-import os
-import sys
-
 import pandas as pd
 import pytest
 
-from validator.create_schemas import validate_phases
-
-ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  # noqa: E402
-sys.path.append(ROOT_DIR)  # noqa: E402
+from regtech_data_validator.create_schemas import validate_phases
 
-GOOD_FILE_PATH = "./src/tests/data/sbl-validations-pass.csv"
-BAD_FILE_PATH = "./src/tests/data/sbl-validations-fail.csv"
+GOOD_FILE_PATH = "./tests/data/sbl-validations-pass.csv"
+BAD_FILE_PATH = "./tests/data/sbl-validations-fail.csv"
 
 
 class TestValidatingSampleData:

diff --git a/tests/test_schema_functions.py b/tests/test_schema_functions.py
@@ -1,6 +1,10 @@
 import pandas as pd
 
-from validator.create_schemas import get_phase_1_schema_for_lei, get_phase_2_schema_for_lei, validate, validate_phases
+from regtech_data_validator.create_schemas import (
+    get_phase_1_schema_for_lei,
+    get_phase_2_schema_for_lei,
+    validate, validate_phases
+)
 
 
 class TestUtil: