From d4529ecb175a2692e35ca9376d18240ee6644d3c Mon Sep 17 00:00:00 2001
From: Hans Keeler <hans.keeler@cfpb.gov>
Date: Tue, 17 Oct 2023 01:08:56 -0400
Subject: [PATCH] Fix issues related to repo restructure

- Fixed all imports
- Fixed test and coverage settings in pyproject.toml
- Removed all Python path magic in __init__.py files
- Moved data files into the `regtech_data_validator` package, and used
  `importlib.resources` to load them by package name instead of by
  filesystem path. This is more portable, especially once we turn this
  into a distributable package.
- Refactored global_data to only load data once at module load time
---
 pyproject.toml                                |  8 ++--
 regtech_data_validator/__init__.py            |  5 ---
 regtech_data_validator/create_schemas.py      |  7 +--
 .../data/census}/Census2022.processed.csv     |  0
 .../data/census/__init__.py                   |  0
 .../data/naics}/2022_codes.csv                |  0
 regtech_data_validator/data/naics/__init__.py |  0
 regtech_data_validator/global_data.py         | 44 ++++++-------------
 regtech_data_validator/main.py                |  3 +-
 regtech_data_validator/phase_validations.py   |  9 ++--
 tests/__init__.py                             |  6 ---
 tests/test_check_functions.py                 |  8 +---
 tests/test_checks.py                          |  2 +-
 tests/test_global_data.py                     | 16 +------
 tests/test_sample_data.py                     | 12 ++---
 tests/test_schema_functions.py                |  6 ++-
 16 files changed, 39 insertions(+), 87 deletions(-)
 rename {data/census/processed => regtech_data_validator/data/census}/Census2022.processed.csv (100%)
 rename __init__.py => regtech_data_validator/data/census/__init__.py (100%)
 rename {data/naics/processed => regtech_data_validator/data/naics}/2022_codes.csv (100%)
 create mode 100644 regtech_data_validator/data/naics/__init__.py

diff --git a/pyproject.toml b/pyproject.toml
index b84d616d..4e13e24c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,18 +49,18 @@ addopts = [
     "--cov-branch",
     "--cov-report=xml",
     "--cov-report=term",
-    "--cov=src",
+    "--cov=regtech_data_validator",
     "-vv",
     "--strict-markers",
     "-rfE",
 ]
 testpaths = [
-    "src/tests",
+    "tests",
 ]
 
 [tool.coverage.run]
 relative_files = true
-source = ["src"]
+source = ["regtech_data_validator"]
 
 [tool.coverage.report]
-skip_empty = true
\ No newline at end of file
+skip_empty = true
diff --git a/regtech_data_validator/__init__.py b/regtech_data_validator/__init__.py
index 836099bf..e69de29b 100644
--- a/regtech_data_validator/__init__.py
+++ b/regtech_data_validator/__init__.py
@@ -1,5 +0,0 @@
-import os
-import sys
-
-ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-sys.path.append(ROOT_DIR)
diff --git a/regtech_data_validator/create_schemas.py b/regtech_data_validator/create_schemas.py
index cc3cf8d7..76784af3 100644
--- a/regtech_data_validator/create_schemas.py
+++ b/regtech_data_validator/create_schemas.py
@@ -2,11 +2,12 @@
 with validations listed in phase 1 and phase 2."""
 
 import pandas as pd
-from checks import SBLCheck
 from pandera import DataFrameSchema
 from pandera.errors import SchemaErrors
-from phase_validations import get_phase_1_and_2_validations_for_lei
-from schema_template import get_template
+
+from regtech_data_validator.checks import SBLCheck
+from regtech_data_validator.phase_validations import get_phase_1_and_2_validations_for_lei
+from regtech_data_validator.schema_template import get_template
 
 # Get separate schema templates for phase 1 and 2
 
diff --git a/data/census/processed/Census2022.processed.csv b/regtech_data_validator/data/census/Census2022.processed.csv
similarity index 100%
rename from data/census/processed/Census2022.processed.csv
rename to regtech_data_validator/data/census/Census2022.processed.csv
diff --git a/__init__.py b/regtech_data_validator/data/census/__init__.py
similarity index 100%
rename from __init__.py
rename to regtech_data_validator/data/census/__init__.py
diff --git a/data/naics/processed/2022_codes.csv b/regtech_data_validator/data/naics/2022_codes.csv
similarity index 100%
rename from data/naics/processed/2022_codes.csv
rename to regtech_data_validator/data/naics/2022_codes.csv
diff --git a/regtech_data_validator/data/naics/__init__.py b/regtech_data_validator/data/naics/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/regtech_data_validator/global_data.py b/regtech_data_validator/global_data.py
index a9c54f04..b3364354 100644
--- a/regtech_data_validator/global_data.py
+++ b/regtech_data_validator/global_data.py
@@ -1,36 +1,20 @@
-import os
-import sys
+import csv
+from importlib.resources import files
 
-import pandas as pd
 
-ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  # noqa: E402
-sys.path.append(ROOT_DIR)  # noqa: E402
+# global variable for NAICS codes
+naics_codes: dict[str,str] = {}
+naics_file_path = files('regtech_data_validator.data.naics').joinpath('2022_codes.csv')
 
-from config import CENSUS_PROCESSED_CSV_PATH, NAICS_CSV_PATH  # noqa: E402
+with naics_file_path.open('r') as f:
+    for row in csv.DictReader(f):
+        naics_codes[row['code']] = row['title']
 
-naics_codes = {}
 
-# global variable for geoids
-census_geoids = {}
+# global variable for Census GEOIDs
+census_geoids: set[str] = set()
+census_file_path = files('regtech_data_validator.data.census').joinpath('Census2022.processed.csv')
 
-
-def read_naics_codes(csv_path: str = NAICS_CSV_PATH):
-    """
-    read NAICS CSV file with this format: (code, description)
-    and populate global value: naics_codes
-    """
-    naics_codes.clear()
-    df = pd.read_csv(csv_path, dtype=str, na_filter=False)
-    for _, row in df.iterrows():
-        naics_codes.update({row.iloc[0]: row.iloc[1]})
-
-
-def read_geoids(csv_path: str = CENSUS_PROCESSED_CSV_PATH):
-    """
-    read geoids CSV file with this format: (code)
-    and populate global value: census_geoids
-    """
-    census_geoids.clear()
-    df = pd.read_csv(csv_path, dtype=str, na_filter=False)
-    for _, row in df.iterrows():
-        census_geoids.update({row.iloc[0]: None})
+with census_file_path.open('r') as f:
+    for row in csv.DictReader(f):
+        census_geoids.add(row['geoid'])
diff --git a/regtech_data_validator/main.py b/regtech_data_validator/main.py
index 20972759..e7df5b11 100644
--- a/regtech_data_validator/main.py
+++ b/regtech_data_validator/main.py
@@ -9,7 +9,8 @@
 import sys
 
 import pandas as pd
-from create_schemas import validate_phases
+
+from regtech_data_validator.create_schemas import validate_phases
 
 
 def csv_to_df(path: str) -> pd.DataFrame:
diff --git a/regtech_data_validator/phase_validations.py b/regtech_data_validator/phase_validations.py
index fc7b0a22..fc2ad5ea 100644
--- a/regtech_data_validator/phase_validations.py
+++ b/regtech_data_validator/phase_validations.py
@@ -4,8 +4,8 @@
 an instance of a PanderaSchema object for phase 1 and phase 2."""
 
 
-import global_data
-from check_functions import (
+from regtech_data_validator import global_data
+from regtech_data_validator.check_functions import (
     has_correct_length,
     has_no_conditional_field_conflict,
     has_valid_enum_pair,
@@ -28,10 +28,7 @@
     meets_multi_value_field_restriction,
     string_contains,
 )
-from checks import SBLCheck
-
-# read and populate global naics code (this should be called only once)
-global_data.read_naics_codes()
+from regtech_data_validator.checks import SBLCheck
 
 
 def get_phase_1_and_2_validations_for_lei(lei: str = None):
diff --git a/tests/__init__.py b/tests/__init__.py
index 238d07e8..e69de29b 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,6 +0,0 @@
-import os
-import sys
-
-ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-
-sys.path.append(os.path.join(ROOT_DIR, "validator"))
diff --git a/tests/test_check_functions.py b/tests/test_check_functions.py
index 65175eff..c93b5613 100644
--- a/tests/test_check_functions.py
+++ b/tests/test_check_functions.py
@@ -1,7 +1,7 @@
 import pandas as pd
 
-from validator import global_data
-from validator.check_functions import (
+from regtech_data_validator import global_data
+from regtech_data_validator.check_functions import (
     has_correct_length,
     has_no_conditional_field_conflict,
     has_valid_enum_pair,
@@ -474,28 +474,24 @@ def test_with_incorrect_length(self):
 
 class TestIsValidCode:
     def test_with_valid_code(self):
-        global_data.read_naics_codes()
         result = is_valid_code("111", False, global_data.naics_codes)
         assert result is True
         result = is_valid_code("111", True, global_data.naics_codes)
         assert result is True
 
     def test_with_invalid_code(self):
-        global_data.read_naics_codes()
         result = is_valid_code("101", False, global_data.naics_codes)
         assert result is False
         result = is_valid_code("101", True, global_data.naics_codes)
         assert result is False
 
     def test_with_accepted_blank(self):
-        global_data.read_naics_codes()
         result = is_valid_code("", True, global_data.naics_codes)
         assert result is True
         result = is_valid_code(" ", True, global_data.naics_codes)
         assert result is True
 
     def test_with_invalid_blank(self):
-        global_data.read_naics_codes()
         result = is_valid_code("", False, global_data.naics_codes)
         assert result is False
         result = is_valid_code(" ", False, global_data.naics_codes)
diff --git a/tests/test_checks.py b/tests/test_checks.py
index bac4cc75..dafaf512 100644
--- a/tests/test_checks.py
+++ b/tests/test_checks.py
@@ -1,6 +1,6 @@
 import pytest
 
-from validator.checks import SBLCheck
+from regtech_data_validator.checks import SBLCheck
 
 
 class TestSBLCheck:
diff --git a/tests/test_global_data.py b/tests/test_global_data.py
index 6e8fc13f..f939cacf 100644
--- a/tests/test_global_data.py
+++ b/tests/test_global_data.py
@@ -1,25 +1,11 @@
 import pytest
 
-from validator import global_data
+from regtech_data_validator import global_data
 
 
 class TestGlobalData:
     def test_valid_naics_codes(self):
-        global_data.read_naics_codes()
         assert len(global_data.naics_codes) == 96
 
     def test_valid_geoids(self):
-        global_data.read_geoids()
         assert len(global_data.census_geoids) == 87275
-
-    def test_invalid_naics_file(self):
-        failed_fpath = "./data/naics/processed/2022_codes.csv1"
-        with pytest.raises(Exception) as exc:
-            global_data.read_naics_codes(failed_fpath)
-        assert exc.type == FileNotFoundError
-
-    def test_invalid_geoids_file(self):
-        failed_fpath = "./data/census/processed/Census2022.processed.csv2"
-        with pytest.raises(Exception) as exc:
-            global_data.read_geoids(failed_fpath)
-        assert exc.type == FileNotFoundError
diff --git a/tests/test_sample_data.py b/tests/test_sample_data.py
index e1d6de07..35c1fe42 100644
--- a/tests/test_sample_data.py
+++ b/tests/test_sample_data.py
@@ -1,16 +1,10 @@
-import os
-import sys
-
 import pandas as pd
 import pytest
 
-from validator.create_schemas import validate_phases
-
-ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  # noqa: E402
-sys.path.append(ROOT_DIR)  # noqa: E402
+from regtech_data_validator.create_schemas import validate_phases
 
-GOOD_FILE_PATH = "./src/tests/data/sbl-validations-pass.csv"
-BAD_FILE_PATH = "./src/tests/data/sbl-validations-fail.csv"
+GOOD_FILE_PATH = "./tests/data/sbl-validations-pass.csv"
+BAD_FILE_PATH = "./tests/data/sbl-validations-fail.csv"
 
 
 class TestValidatingSampleData:
diff --git a/tests/test_schema_functions.py b/tests/test_schema_functions.py
index 911dc673..0b8e0c1b 100644
--- a/tests/test_schema_functions.py
+++ b/tests/test_schema_functions.py
@@ -1,6 +1,10 @@
 import pandas as pd
 
-from validator.create_schemas import get_phase_1_schema_for_lei, get_phase_2_schema_for_lei, validate, validate_phases
+from regtech_data_validator.create_schemas import (
+    get_phase_1_schema_for_lei,
+    get_phase_2_schema_for_lei,
+    validate, validate_phases
+)
 
 
 class TestUtil: