From a4fff74f96125840bbab69948de92d87c7636915 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff
Date: Wed, 14 Aug 2024 10:51:12 +0100
Subject: [PATCH] Add IRS PUF

Add an IRS_PUF dataset that reads the PUF and PUF demographics CSVs and
stores them as the "puf" and "puf_demographics" tables of an HDF5 file,
plus an IRS_PUF_2015 subclass that expects the inputs under ~/Downloads.

Import STORAGE_FOLDER in policyengine_cps.py from
policyengine_us_data.data_storage, dropping the duplicate
policyengine_us import of it and the UpratedCPS import.

Add a generation test for IRS_PUF_2015 and a test that CPS_2022 loads
into a Microsimulation with no missing household_net_income values.
---
 policyengine_us_data/irs_puf.py            | 42 +++++++++++++++++++
 policyengine_us_data/policyengine_cps.py   |  4 +-
 .../tests/test_datasets/test_irs_puf.py    | 12 ++++++
 .../test_datasets/test_policyengine_cps.py | 17 ++++++++
 4 files changed, 72 insertions(+), 3 deletions(-)
 create mode 100644 policyengine_us_data/irs_puf.py
 create mode 100644 policyengine_us_data/tests/test_datasets/test_irs_puf.py

diff --git a/policyengine_us_data/irs_puf.py b/policyengine_us_data/irs_puf.py
new file mode 100644
index 0000000..a1813d3
--- /dev/null
+++ b/policyengine_us_data/irs_puf.py
@@ -0,0 +1,42 @@
+from policyengine_core.data import Dataset
+from policyengine_us_data.data_storage import STORAGE_FOLDER
+from pathlib import Path
+
+
+class IRS_PUF(Dataset):
+    """Dataset containing IRS PUF tables."""
+
+    puf_file_path: Path
+    puf_demographics_file_path: Path
+    data_format = Dataset.TABLES
+
+    def generate(self):
+        """Build the HDF5 file at ``self.file_path`` from the two PUF CSVs.
+
+        Raises:
+            FileNotFoundError: If either CSV is missing after expanding
+                ``~`` and resolving the configured path.
+        """
+        import pandas as pd
+
+        puf_file_path = Path(self.puf_file_path).expanduser().resolve()
+        puf_demographics_file_path = Path(self.puf_demographics_file_path).expanduser().resolve()
+
+        if not puf_file_path.exists():
+            raise FileNotFoundError(f"PUF file not found at {puf_file_path}. Either put it there, or change {Path(__file__)} to point to a different path.")
+
+        if not puf_demographics_file_path.exists():
+            raise FileNotFoundError(f"PUF demographics file not found at {puf_demographics_file_path}. Either put it there, or change {Path(__file__)} to point to a different path.")
+
+        with pd.HDFStore(self.file_path, mode="w") as storage:
+            storage.put("puf", pd.read_csv(puf_file_path))
+            storage.put("puf_demographics", pd.read_csv(puf_demographics_file_path))
+
+
+class IRS_PUF_2015(IRS_PUF):
+    name = "irs_puf_2015"
+    label = "IRS PUF (2015)"
+    time_period = 2015
+    puf_file_path = "~/Downloads/puf_2015.csv"
+    puf_demographics_file_path = "~/Downloads/demographics_2015.csv"
+    file_path = STORAGE_FOLDER / "irs_puf_2015.h5"
diff --git a/policyengine_us_data/policyengine_cps.py b/policyengine_us_data/policyengine_cps.py
index 66f1e7b..6c84028 100644
--- a/policyengine_us_data/policyengine_cps.py
+++ b/policyengine_us_data/policyengine_cps.py
@@ -1,5 +1,5 @@
 from policyengine_core.data import Dataset
-from policyengine_us.data.storage import STORAGE_FOLDER
+from policyengine_us_data.data_storage import STORAGE_FOLDER
 import h5py
 from policyengine_us.data.datasets.cps.raw_cps import (
     RawCPS_2018,
@@ -9,8 +9,6 @@
     RawCPS_2022,
     RawCPS,
 )
-from policyengine_us.data.datasets.cps.uprated_cps import UpratedCPS
-from policyengine_us.data.storage import STORAGE_FOLDER
 from pandas import DataFrame, Series
 import numpy as np
 import pandas as pd
diff --git a/policyengine_us_data/tests/test_datasets/test_irs_puf.py b/policyengine_us_data/tests/test_datasets/test_irs_puf.py
new file mode 100644
index 0000000..9f5915a
--- /dev/null
+++ b/policyengine_us_data/tests/test_datasets/test_irs_puf.py
@@ -0,0 +1,12 @@
+import pytest
+
+
+@pytest.mark.parametrize("year", [2015])
+def test_irs_puf_generates(year: int):
+    from policyengine_us_data.irs_puf import IRS_PUF_2015
+
+    dataset_by_year = {
+        2015: IRS_PUF_2015,
+    }
+
+    dataset_by_year[year](require=True)
diff --git a/policyengine_us_data/tests/test_datasets/test_policyengine_cps.py b/policyengine_us_data/tests/test_datasets/test_policyengine_cps.py
index 402b25f..24b147c 100644
--- a/policyengine_us_data/tests/test_datasets/test_policyengine_cps.py
+++ b/policyengine_us_data/tests/test_datasets/test_policyengine_cps.py
@@ -10,3 +10,20 @@ def test_policyengine_cps_generates(year: int):
     }
 
     dataset_by_year[year](require=True)
+
+
+@pytest.mark.parametrize("year", [2022])
+def test_policyengine_cps_loads(year: int):
+    from policyengine_us_data.policyengine_cps import CPS_2022
+
+    dataset_by_year = {
+        2022: CPS_2022,
+    }
+
+    dataset = dataset_by_year[year]
+
+    from policyengine_us import Microsimulation
+
+    sim = Microsimulation(dataset=dataset)
+
+    assert not sim.calculate("household_net_income").isna().any()