From d91726f467f64dfdd9c0227134f7d93ea6139041 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Mon, 19 Aug 2024 19:31:34 +0200 Subject: [PATCH] Add uprating tools --- .../datasets/cps/policyengine_cps.py | 25 +++++++++++++++++ .../datasets/puf/policyengine_puf.py | 21 +++++++++++++++ policyengine_us_data/utils/uprating.py | 27 +++++++++++++++---- 3 files changed, 68 insertions(+), 5 deletions(-) diff --git a/policyengine_us_data/datasets/cps/policyengine_cps.py b/policyengine_us_data/datasets/cps/policyengine_cps.py index 7c76973..b98ad40 100644 --- a/policyengine_us_data/datasets/cps/policyengine_cps.py +++ b/policyengine_us_data/datasets/cps/policyengine_cps.py @@ -8,6 +8,7 @@ import os import yaml from typing import Type +from policyengine_us_data.utils.uprating import create_policyengine_uprating_factors_table class CPS(Dataset): @@ -22,6 +23,24 @@ def generate(self): Technical documentation and codebook here: https://www2.census.gov/programs-surveys/cps/techdocs/cpsmar21.pdf """ + if self.raw_cps is None: + # Extrapolate from CPS 2022 + + cps_2022 = CPS_2022(require=True) + print("Creating uprating factors table...") + uprating = create_policyengine_uprating_factors_table() + arrays = cps_2022.load_dataset() + for variable in uprating: + if variable in arrays: + current_index = uprating[uprating.Variable == variable][self.time_period].values[0] + start_index = uprating[uprating.Variable == variable][2021].values[0] + growth = current_index / start_index + print(f"Uprating {variable} by {growth-1:.1%}") + arrays[variable] = arrays[variable] * growth + + self.save_dataset(arrays) + return + raw_data = self.raw_cps(require=True).load() cps = h5py.File(self.file_path, mode="w") @@ -536,3 +555,9 @@ class CPS_2022(CPS): previous_year_raw_cps = CensusCPS_2021 file_path = STORAGE_FOLDER / "cps_2022.h5" time_period = 2022 + +class CPS_2024(CPS): + name = "cps_2024" + label = "CPS 2024" + file_path = STORAGE_FOLDER / "cps_2024.h5" + time_period = 2024 diff --git a/policyengine_us_data/datasets/puf/policyengine_puf.py b/policyengine_us_data/datasets/puf/policyengine_puf.py index fc87a77..6bcc890 100644 --- a/policyengine_us_data/datasets/puf/policyengine_puf.py +++ b/policyengine_us_data/datasets/puf/policyengine_puf.py @@ -7,6 +7,7 @@ from .uprate_puf import uprate_puf from survey_enhance import Imputation from .irs_puf import IRS_PUF_2015 +from policyengine_us_data.utils.uprating import create_policyengine_uprating_factors_table rng = np.random.default_rng(seed=64) @@ -290,6 +291,20 @@ def generate(self): if self.time_period == 2021: puf = uprate_puf(puf, 2015, self.time_period) + elif self.time_period >= 2021: + puf_2021 = PUF_2021(require=True) + print("Creating uprating factors table...") + uprating = create_policyengine_uprating_factors_table() + arrays = puf_2021.load_dataset() + for variable in uprating: + if variable in arrays: + current_index = uprating[uprating.Variable == variable][self.time_period].values[0] + start_index = uprating[uprating.Variable == variable][2021].values[0] + growth = current_index / start_index + print(f"Uprating {variable} by {growth-1:.1%}") + arrays[variable] = arrays[variable] * growth + self.save_dataset(arrays) + return puf = puf[puf.MARS != 0] # Remove aggregate records @@ -476,3 +491,9 @@ class PUF_2021(PUF): name = "puf_2021" time_period = 2021 file_path = STORAGE_FOLDER / "pe_puf_2021.h5" + +class PUF_2024(PUF): + label = "PUF 2024" + name = "puf_2024" + time_period = 2024 + file_path = STORAGE_FOLDER / "pe_puf_2024.h5" diff --git a/policyengine_us_data/utils/uprating.py b/policyengine_us_data/utils/uprating.py index f57a2c0..f655815 100644 --- a/policyengine_us_data/utils/uprating.py +++ b/policyengine_us_data/utils/uprating.py @@ -1,4 +1,4 @@ -from policyengine_core.model_api import Variable +from policyengine_us_data.data_storage import STORAGE_FOLDER import pandas as pd START_YEAR = 2020 @@ -13,14 +13,22 @@ def create_policyengine_uprating_factors_table(): years = [] index_values = [] + population_size = system.parameters.get_child("calibration.gov.census.populations.total") + for variable in system.variables.values(): if variable.uprating is not None: parameter = system.parameters.get_child(variable.uprating) start_value = parameter(START_YEAR) - for year in range(START_YEAR, END_YEAR): + for year in range(START_YEAR, END_YEAR + 1): + population_growth = population_size(year) / population_size(START_YEAR) variable_names.append(variable.name) years.append(year) - index_values.append(round(parameter(year) / start_value, 3)) + growth = parameter(year) / start_value + if "_weight" not in variable.name: + per_capita_growth = growth / population_growth + else: + per_capita_growth = growth + index_values.append(round(per_capita_growth, 3)) df["Variable"] = variable_names df["Year"] = years @@ -28,6 +36,15 @@ def create_policyengine_uprating_factors_table(): # Convert to there is a column for each year df = df.pivot(index="Variable", columns="Year", values="Value") + df = df.sort_values("Variable") + df.to_csv(STORAGE_FOLDER / "uprating_factors.csv") - return df.sort_values("Variable") - + # Create a table with growth factors by year + + df_growth = df.copy() + for year in range(END_YEAR, START_YEAR, -1): + df_growth[year] = df_growth[year] / df_growth[year - 1] - 1 + df_growth[START_YEAR] = 0 + + df_growth.to_csv(STORAGE_FOLDER / "uprating_growth_factors.csv") + return df \ No newline at end of file