Replace the shared data/config.py module with per-script constants and
command-line arguments in the census and NAICS processing scripts.

* data/census/process_census.py and data/naics/process_naics.py now read the
  raw source path from argv[1] and the CSV destination path from argv[2].
  They exit with status 1 on a wrong argument count, 2 when the source file
  does not exist, and 3 when the destination file already exists.
* The column constants previously read from data/config.py are defined in the
  script that uses them; data/config.py and tools/__init__.py are deleted.
* openpyxl is added to a new "data" Poetry dependency group and poetry.lock
  is updated accordingly (openpyxl plus its dependency et-xmlfile).

NOTE(review): this patch was recovered from a whitespace-collapsed copy. Line
breaks, indentation and blank context lines are reconstructed so that the
hunk counts add up; verify against the original commit before applying.
NOTE(review): the docstring context in process_naics.py still mentions
FileExistsError, but the script now prints a message and exits with status 3.
NOTE(review): the poetry.lock header changes from Poetry 1.6.1 to 1.5.1, i.e.
the lock file appears to have been regenerated with an older Poetry; confirm.
NOTE(review): in process_census.py the temporary CSV is only removed when
processing succeeds; consider try/finally in a follow-up.

diff --git a/data/census/process_census.py b/data/census/process_census.py
index 0686b9c5..95a1445d 100644
--- a/data/census/process_census.py
+++ b/data/census/process_census.py
@@ -5,10 +5,12 @@
 
 import pandas as pd
 
-ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # noqa: E402
-sys.path.append(ROOT_DIR)  # noqa: E402
+# census file col indexes
+CENSUS_STATE_COL_INDEX = 2
+CENSUS_COUNTY_COL_INDEX = 3
+CENSUS_TRACT_COL_INDEX = 4
 
-import config  # noqa: E402
+CENSUS_GEOID_COL = "geoid"
 
 
 # helper function to check number (float/int/negative)
@@ -21,24 +23,22 @@ def _is_number(s):
 
 
 # helper function to unzip census file and extract CSV file
-def _extract_census_zip_file():
-    CENSUS_TMP_CSV_PATH = config.CENSUS_RAW_ZIP_PATH + ".tmp.csv"
+def _extract_census_zip_file(raw_src):
+    census_tmp_csv_path = raw_src + ".tmp.csv"
     # unzip and extract csv files
-    with zipfile.ZipFile(config.CENSUS_RAW_ZIP_PATH, "r") as zip_ref:
+    with zipfile.ZipFile(raw_src, "r") as zip_ref:
         for file in zip_ref.namelist():  # iterate over files in archive
             if file[-4:] == ".csv":
-                print("Extracting CSV to {}".format(CENSUS_TMP_CSV_PATH))
-                with open(CENSUS_TMP_CSV_PATH, "wb") as outfile:
+                print("Extracting CSV to {}".format(census_tmp_csv_path))
+                with open(census_tmp_csv_path, "wb") as outfile:
                     outfile.write(zip_ref.read(file))
-                # it should only have one csv file
-                return CENSUS_TMP_CSV_PATH
+                    # it should only have one csv file
+
+    return census_tmp_csv_path
 
 
 # helper function to read extracted csv file and filter only geo-tract-id
-def _read_census_csv(src_path: str, csv_path: str):
-    STATE_COL = config.CENSUS_STATE_COL_INDEX
-    COUNTY_COL = config.CENSUS_COUNTY_COL_INDEX
-    TRACT_COL = config.CENSUS_TRACT_COL_INDEX
+def _process_census_csv(src_path: str, csv_path: str):
 
     # check paths
     if not os.path.isfile(src_path):
@@ -52,14 +52,14 @@ def _read_census_csv(src_path: str, csv_path: str):
     )
 
     # add header
-    result = [[config.CENSUS_GEOID_COL]]
+    result = [[CENSUS_GEOID_COL]]
 
     # read excel file
     # and create csv data list
     for index, row in df.iterrows():
-        state_value = str(row[STATE_COL])
-        county_value = str(row[COUNTY_COL])
-        tract_value = str(row[TRACT_COL])
+        state_value = str(row[CENSUS_STATE_COL_INDEX])
+        county_value = str(row[CENSUS_COUNTY_COL_INDEX])
+        tract_value = str(row[CENSUS_TRACT_COL_INDEX])
         if (
             _is_number(state_value)
             and _is_number(county_value)
@@ -84,14 +84,23 @@ def _read_census_csv(src_path: str, csv_path: str):
 - output to defined output file
 """
 if __name__ == "__main__":
-    CSV_PATH = config.CENSUS_PROCESSED_CSV_PATH
-
-    if os.path.isfile(CSV_PATH):
-        error_msg = "Output {} csv file existed".format(CSV_PATH)
-        raise FileExistsError(error_msg)
-
-    tmp_census_csv_file = _extract_census_zip_file()
-    print("Reading extracted CSV File . {}".format(tmp_census_csv_file))
-    _read_census_csv(tmp_census_csv_file, CSV_PATH)
-    print("Removing extracted CSV File")
+    if len(sys.argv) != 3:
+        print(f"Usage: {sys.argv[0]} <raw_src> <csv_dest>")
+        sys.exit(1)
+
+    raw_src = sys.argv[1]
+    csv_dest = sys.argv[2]
+
+    if not os.path.isfile(raw_src):
+        print(f"source file not existed: {raw_src}")
+        sys.exit(2)
+
+    if os.path.isfile(csv_dest):
+        print(f"destination file already existed: {csv_dest}")
+        sys.exit(3)
+
+    tmp_census_csv_file = _extract_census_zip_file(raw_src)
+    print(f"Reading extracted CSV file: {tmp_census_csv_file}")
+    _process_census_csv(tmp_census_csv_file, csv_dest)
+    print("Removing extracted CSV file")
     os.remove(tmp_census_csv_file)
diff --git a/data/config.py b/data/config.py
deleted file mode 100644
index 00a125f2..00000000
--- a/data/config.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# path to original/raw NAICS excel file
-NAICS_EXCEL_PATH = "./data/naics/raw/2-6 digit_2022_Codes.xlsx"
-# path to parsed/filtered naics codes file
-NAICS_CSV_PATH = "./data/naics/processed/2022_codes.csv"
-# column header text containing naics code
-NAICS_CODE_COL = "2022 NAICS US Code"
-# column header text containing naics title/description
-NAICS_TITLE_COL = "2022 NAICS US Title"
-
-# path to original/raw NAICS zip file
-CENSUS_RAW_ZIP_PATH = "./data/census/raw/CensusFlatFile2022.zip"
-# path to parsed/filtered naics codes file
-CENSUS_PROCESSED_CSV_PATH = "./data/census/processed/Census2022.processed.csv"
-# census file col indexes
-CENSUS_STATE_COL_INDEX = 2
-CENSUS_COUNTY_COL_INDEX = 3
-CENSUS_TRACT_COL_INDEX = 4
-
-CENSUS_GEOID_COL = "geoid"
diff --git a/data/naics/process_naics.py b/data/naics/process_naics.py
index b202407c..f6e1a251 100644
--- a/data/naics/process_naics.py
+++ b/data/naics/process_naics.py
@@ -4,10 +4,12 @@
 
 import pandas as pd
 
-ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # noqa: E402
-sys.path.append(ROOT_DIR)  # noqa: E402
 
-import config  # noqa: E402
+# column header text containing naics code
+NAICS_CODE_COL = "2022 NAICS US Code"
+# column header text containing naics title/description
+NAICS_TITLE_COL = "2022 NAICS US Title"
+
 
 """
 filter NAICS data with only 3 digit codes
@@ -17,20 +19,24 @@
 FileExistsError: when output csv file existed
 """
 if __name__ == "__main__":
-    EXCEL_PATH = config.NAICS_EXCEL_PATH
-    CSV_PATH = config.NAICS_CSV_PATH
-    CODE_COL = config.NAICS_CODE_COL
-    TITLE_COL = config.NAICS_TITLE_COL
+    if len(sys.argv) != 3:
+        print(f"Usage: {sys.argv[0]} <raw_src> <csv_dest>")
+        sys.exit(1)
+
+    raw_src = sys.argv[1]
+    csv_dest = sys.argv[2]
+
+    if not os.path.isfile(raw_src):
+        print(f"source file not existed: {raw_src}")
+        sys.exit(2)
 
-    # check for paths
-    if not os.path.isfile(EXCEL_PATH):
-        error_msg = "Input excel file not existed"
-        raise FileNotFoundError(error_msg)
-    if os.path.isfile(CSV_PATH):
-        error_msg = "Output csv file existed"
-        raise FileExistsError(error_msg)
+    if os.path.isfile(csv_dest):
+        print(f"destination file already existed: {csv_dest}")
+        sys.exit(3)
 
-    df = pd.read_excel(EXCEL_PATH, dtype=str, na_filter=False)
+    df = pd.read_excel(raw_src, dtype=str, na_filter=False)
+
+    print(f"source file successfully read: {raw_src}")
 
     # add header
     result = [["code", "title"]]
@@ -38,12 +44,14 @@
     # read excel file
     # and create csv data list
     for index, row in df.iterrows():
-        code = str(row[CODE_COL])
+        code = str(row[NAICS_CODE_COL])
         if len(code) == 3:
-            a_row = [code, str(row[TITLE_COL])]
+            a_row = [code, str(row[NAICS_TITLE_COL])]
             result.append(a_row)
 
     # output data to csv file
-    with open(CSV_PATH, "w") as f:
+    with open(csv_dest, "w") as f:
         writer = csv.writer(f)
         writer.writerows(result)
+
+    print(f"destination file successfully written: {csv_dest}")
diff --git a/poetry.lock b/poetry.lock
index 7fcbe376..600691d0 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
 
 [[package]]
 name = "black"
@@ -136,6 +136,17 @@ files = [
 [package.extras]
 toml = ["tomli"]
 
+[[package]]
+name = "et-xmlfile"
+version = "1.1.0"
+description = "An implementation of lxml.xmlfile for the standard library"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"},
+    {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"},
+]
+
 [[package]]
 name = "iniconfig"
 version = "2.0.0"
@@ -203,6 +214,20 @@ files = [
     {file = "numpy-1.25.2.tar.gz", hash = "sha256:fd608e19c8d7c55021dffd43bfe5492fab8cc105cc8986f813f8c3c048b38760"},
 ]
 
+[[package]]
+name = "openpyxl"
+version = "3.1.2"
+description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"},
+    {file = "openpyxl-3.1.2.tar.gz", hash = "sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"},
+]
+
+[package.dependencies]
+et-xmlfile = "*"
+
 [[package]]
 name = "packaging"
 version = "23.1"
@@ -642,4 +667,4 @@ files = [
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "03e6adb7dcecd12194f8c44033d68666019c5bb52f8fd4bccd7301067832c9e1"
+content-hash = "ac6360d9068e34f6bbad74a6c3339a85dd1968267f7272b48b8a99dfc5702812"
diff --git a/pyproject.toml b/pyproject.toml
index 4e13e24c..e959153d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,6 +16,9 @@ pytest-cov = "4.1.0"
 black = "23.3.0"
 ruff = "0.0.259"
 
+[tool.poetry.group.data.dependencies]
+openpyxl = "^3.1.2"
+
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
diff --git a/tools/__init__.py b/tools/__init__.py
deleted file mode 100644
index e69de29b..00000000