Skip to content

Commit

Permalink
Merge config.py and tools under data dir
Browse files Browse the repository at this point in the history
- Move file format settings from config.py into their respective data
  transform scripts
- Move src/dest file settings from config.py to CLI args
- Use consistent CLI argument and file-existence handling
- Add openpyxl dependency for reading the NAICS Excel file
  • Loading branch information
hkeeler committed Oct 17, 2023
1 parent ccee738 commit baeb814
Show file tree
Hide file tree
Showing 6 changed files with 93 additions and 67 deletions.
65 changes: 37 additions & 28 deletions data/census/process_census.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@

import pandas as pd

ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # noqa: E402
sys.path.append(ROOT_DIR) # noqa: E402
# census file col indexes
CENSUS_STATE_COL_INDEX = 2
CENSUS_COUNTY_COL_INDEX = 3
CENSUS_TRACT_COL_INDEX = 4

import config # noqa: E402
CENSUS_GEOID_COL = "geoid"


# helper function to check number (float/int/negative)
Expand All @@ -21,24 +23,22 @@ def _is_number(s):


# helper function to unzip census file and extract CSV file
def _extract_census_zip_file():
CENSUS_TMP_CSV_PATH = config.CENSUS_RAW_ZIP_PATH + ".tmp.csv"
def _extract_census_zip_file(raw_src):
census_tmp_csv_path = raw_src + ".tmp.csv"
# unzip and extract csv files
with zipfile.ZipFile(config.CENSUS_RAW_ZIP_PATH, "r") as zip_ref:
with zipfile.ZipFile(raw_src, "r") as zip_ref:
for file in zip_ref.namelist(): # iterate over files in archive
if file[-4:] == ".csv":
print("Extracting CSV to {}".format(CENSUS_TMP_CSV_PATH))
with open(CENSUS_TMP_CSV_PATH, "wb") as outfile:
print("Extracting CSV to {}".format(census_tmp_csv_path))
with open(census_tmp_csv_path, "wb") as outfile:
outfile.write(zip_ref.read(file))
# it should only have one csv file
return CENSUS_TMP_CSV_PATH
# it should only have one csv file

return census_tmp_csv_path


# helper function to read extracted csv file and filter only geo-tract-id
def _read_census_csv(src_path: str, csv_path: str):
STATE_COL = config.CENSUS_STATE_COL_INDEX
COUNTY_COL = config.CENSUS_COUNTY_COL_INDEX
TRACT_COL = config.CENSUS_TRACT_COL_INDEX
def _process_census_csv(src_path: str, csv_path: str):

# check paths
if not os.path.isfile(src_path):
Expand All @@ -52,14 +52,14 @@ def _read_census_csv(src_path: str, csv_path: str):
)

# add header
result = [[config.CENSUS_GEOID_COL]]
result = [[CENSUS_GEOID_COL]]

# read excel file
# and create csv data list
for index, row in df.iterrows():
state_value = str(row[STATE_COL])
county_value = str(row[COUNTY_COL])
tract_value = str(row[TRACT_COL])
state_value = str(row[CENSUS_STATE_COL_INDEX])
county_value = str(row[CENSUS_COUNTY_COL_INDEX])
tract_value = str(row[CENSUS_TRACT_COL_INDEX])
if (
_is_number(state_value)
and _is_number(county_value)
Expand All @@ -84,14 +84,23 @@ def _read_census_csv(src_path: str, csv_path: str):
- output to defined output file
"""
if __name__ == "__main__":
CSV_PATH = config.CENSUS_PROCESSED_CSV_PATH

if os.path.isfile(CSV_PATH):
error_msg = "Output {} csv file existed".format(CSV_PATH)
raise FileExistsError(error_msg)

tmp_census_csv_file = _extract_census_zip_file()
print("Reading extracted CSV File . {}".format(tmp_census_csv_file))
_read_census_csv(tmp_census_csv_file, CSV_PATH)
print("Removing extracted CSV File")
if len(sys.argv) != 3:
print(f"Usage: {sys.argv[0]} <raw-src> <csv-dest>")
exit(1)

raw_src = sys.argv[1]
csv_dest = sys.argv[2]

if not os.path.isfile(raw_src):
print(f"source file not existed: {raw_src}")
exit(2)

if os.path.isfile(csv_dest):
print("destination file already existed: {csv_dest}")
exit(3)

tmp_census_csv_file = _extract_census_zip_file(raw_src)
print(f"Reading extracted CSV file: {tmp_census_csv_file}")
_process_census_csv(tmp_census_csv_file, csv_dest)
print("Removing extracted CSV file")
os.remove(tmp_census_csv_file)
19 changes: 0 additions & 19 deletions data/config.py

This file was deleted.

44 changes: 26 additions & 18 deletions data/naics/process_naics.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@

import pandas as pd

ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) # noqa: E402
sys.path.append(ROOT_DIR) # noqa: E402

import config # noqa: E402
# column header text containing naics code
NAICS_CODE_COL = "2022 NAICS US Code"
# column header text containing naics title/description
NAICS_TITLE_COL = "2022 NAICS US Title"


"""
filter NAICS data with only 3 digit codes
Expand All @@ -17,33 +19,39 @@
FileExistsError: when output csv file existed
"""
if __name__ == "__main__":
EXCEL_PATH = config.NAICS_EXCEL_PATH
CSV_PATH = config.NAICS_CSV_PATH
CODE_COL = config.NAICS_CODE_COL
TITLE_COL = config.NAICS_TITLE_COL
if len(sys.argv) != 3:
print(f"Usage: {sys.argv[0]} <raw-src> <csv-dest>")
exit(1)

raw_src = sys.argv[1]
csv_dest = sys.argv[2]

if not os.path.isfile(raw_src):
print(f"source file not existed: {raw_src}")
exit(2)

# check for paths
if not os.path.isfile(EXCEL_PATH):
error_msg = "Input excel file not existed"
raise FileNotFoundError(error_msg)
if os.path.isfile(CSV_PATH):
error_msg = "Output csv file existed"
raise FileExistsError(error_msg)
if os.path.isfile(csv_dest):
print("destination file already existed: {csv_dest}")
exit(3)

df = pd.read_excel(EXCEL_PATH, dtype=str, na_filter=False)
df = pd.read_excel(raw_src, dtype=str, na_filter=False)

print(f'source file successfully read: {raw_src}')

# add header
result = [["code", "title"]]

# read excel file
# and create csv data list
for index, row in df.iterrows():
code = str(row[CODE_COL])
code = str(row[NAICS_CODE_COL])
if len(code) == 3:
a_row = [code, str(row[TITLE_COL])]
a_row = [code, str(row[NAICS_TITLE_COL])]
result.append(a_row)

# output data to csv file
with open(CSV_PATH, "w") as f:
with open(csv_dest, "w") as f:
writer = csv.writer(f)
writer.writerows(result)

print(f'destination file successfully written: {csv_dest}')
29 changes: 27 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ pytest-cov = "4.1.0"
black = "23.3.0"
ruff = "0.0.259"

[tool.poetry.group.data.dependencies]
openpyxl = "^3.1.2"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
Expand Down
Empty file removed tools/__init__.py
Empty file.

0 comments on commit baeb814

Please sign in to comment.