# Streamlit page: compares each dataset's weighted estimates against the
# calibration targets and lets the user inspect one target at a time.
import streamlit as st

st.set_page_config(layout="wide")

st.title("Benchmarks")

from policyengine_us_data.datasets import CPS_2024, PUF_2024, EnhancedCPS_2024
from policyengine_us_data.utils import build_loss_matrix
from policyengine_us import Microsimulation
import pandas as pd
import plotly.express as px


@st.cache_data
def compare_datasets():
    """Return a long table of estimate-vs-target comparisons per dataset.

    For each benchmark dataset the household weights are multiplied through
    the loss matrix to obtain one estimate per target.

    Returns:
        pd.DataFrame: one row per (dataset, target) with the columns
        ``Target``, ``Estimate``, ``Actual``, ``Error``, ``Abs. Error``,
        ``Abs. Error %`` and ``Dataset``.
    """
    comparisons = []
    for dataset in [CPS_2024, PUF_2024, EnhancedCPS_2024]:
        sim = Microsimulation(dataset=dataset)
        weights = sim.calculate("household_weight").values
        loss_matrix, targets_array = build_loss_matrix(dataset, 2024)
        comparison = pd.DataFrame(
            {
                "Target": loss_matrix.columns,
                "Estimate": weights @ loss_matrix.values,
                "Actual": targets_array,
            }
        )
        comparison["Error"] = comparison["Estimate"] - comparison["Actual"]
        comparison["Abs. Error"] = comparison["Error"].abs()
        # A zero target has no meaningful relative error. Dividing by it
        # would yield inf and turn the per-dataset mean below into inf, so
        # zero targets are masked to NaN (which mean() skips).
        nonzero_actual = comparison["Actual"].where(comparison["Actual"] != 0)
        comparison["Abs. Error %"] = comparison["Abs. Error"] / nonzero_actual
        comparison["Dataset"] = dataset.label
        comparisons.append(comparison)

    # Concatenate once rather than growing a frame inside the loop.
    return pd.concat(comparisons)


df = compare_datasets()

mean_relative_error_by_dataset = (
    df.groupby("Dataset")["Abs. Error %"].mean().reset_index()
)

st.write(mean_relative_error_by_dataset)

metric = st.selectbox(
    "Metric", ["Estimate", "Error", "Abs. Error", "Abs. Error %"]
)
target = st.selectbox("Target", df["Target"].unique())

selected = df[df["Target"] == target]

fig = px.bar(
    selected,
    x="Dataset",
    y=metric,
    title=f"{metric} for {target}",
)

if metric == "Estimate":
    # Add a dashed line at the target, spanning however many bars are shown
    # (categorical bars sit at x = 0, 1, 2, ...).
    actual = selected["Actual"].values[0]
    fig.add_shape(
        type="line",
        x0=-0.5,
        x1=selected["Dataset"].nunique() - 0.5,
        y0=actual,
        y1=actual,
        line=dict(dash="dash"),
    )

st.plotly_chart(fig, use_container_width=True)
estate_income,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 excess_withheld_payroll_tax,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 @@ -42,6 +42,7 @@ non_sch_d_capital_gains,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467, other_credits,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 partnership_s_corp_income,1.0,0.997,1.542,1.581,1.685,1.753,1.789,1.827,1.837,1.859,1.891,1.929,1.969,2.009,2.074 person_weight,1.0,1.003,1.007,1.016,1.027,1.039,1.049,1.056,1.061,1.066,1.071,1.076,1.081,1.086,1.09 +population,1.0,1.0027545812166367,1.0065863897282326,1.0155402789988688,1.0271017184625957,1.0389212123758114,1.0486882732256506,1.0560668301011513,1.061272928587932,1.0663860074475715,1.0714000654138023,1.0763030999540903,1.0810831085359012,1.0857250879935667,1.0902200364276862 pre_tax_contributions,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 prior_year_minimum_tax_credit,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 qualified_dividend_income,1.0,1.2,1.269,1.283,1.325,1.376,1.414,1.445,1.483,1.533,1.624,1.714,1.801,1.885,1.966 @@ -58,7 +59,7 @@ self_employed_pension_contribution_ald,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.3 self_employed_pension_contributions,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 self_employment_income,1.0,1.255,1.322,1.357,1.446,1.504,1.535,1.567,1.576,1.595,1.622,1.655,1.689,1.723,1.779 self_employment_income_before_lsr,1.0,1.255,1.322,1.357,1.446,1.504,1.535,1.567,1.576,1.595,1.622,1.655,1.689,1.723,1.779 -self_employment_income_last_year,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +self_employment_income_last_year,1.0,1.255,1.322,1.357,1.446,1.504,1.535,1.567,1.576,1.595,1.622,1.655,1.689,1.723,1.779 
short_term_capital_gains,1.0,0.997,1.59,1.711,1.781,1.711,1.633,1.607,1.612,1.639,1.68,1.727,1.781,1.838,1.898 snap_reported,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 social_security,1.0,1.276,1.355,1.55,1.718,1.841,1.937,2.031,2.143,2.268,2.398,2.519,2.654,2.805,2.951 diff --git a/policyengine_us_data/data_storage/uprating_growth_factors.csv b/policyengine_us_data/data_storage/uprating_growth_factors.csv index e2d1f93..f06776b 100644 --- a/policyengine_us_data/data_storage/uprating_growth_factors.csv +++ b/policyengine_us_data/data_storage/uprating_growth_factors.csv @@ -15,7 +15,7 @@ early_withdrawal_penalty,0,0.16599999999999993,-0.015437392795883409,0.058362369 educator_expense,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876 employment_income,0,0.06899999999999995,0.07483629560336769,0.05395996518711921,0.04376548307184147,0.03322784810126578,0.03215926493108734,0.03115727002967339,0.03453237410071952,0.033379694019471495,0.03364737550471064,0.033203125,0.032766225582860686,0.032946918852959195,0.03248670998227987 employment_income_before_lsr,0,0.06899999999999995,0.07483629560336769,0.05395996518711921,0.04376548307184147,0.03322784810126578,0.03215926493108734,0.03115727002967339,0.03453237410071952,0.033379694019471495,0.03364737550471064,0.033203125,0.032766225582860686,0.032946918852959195,0.03248670998227987 -employment_income_last_year,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876 
+employment_income_last_year,0,0.06899999999999995,0.07483629560336769,0.05395996518711921,0.04376548307184147,0.03322784810126578,0.03215926493108734,0.03115727002967339,0.03453237410071952,0.033379694019471495,0.03364737550471064,0.033203125,0.032766225582860686,0.032946918852959195,0.03248670998227987 energy_efficient_home_improvement_credit,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876 estate_income,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876 excess_withheld_payroll_tax,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876 @@ -42,6 +42,7 @@ non_sch_d_capital_gains,0,0.16599999999999993,-0.015437392795883409,0.0583623693 other_credits,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876 partnership_s_corp_income,0,-0.0030000000000000027,0.546639919759278,0.02529182879377423,0.0657811511701456,0.04035608308605321,0.02053622361665708,0.02124091671324768,0.005473453749315738,0.011976047904191711,0.017213555675094083,0.0200951877313591,0.02073613271124941,0.02031488065007614,0.03235440517670485 
person_weight,0,0.0029999999999998916,0.003988035892322994,0.008937437934458892,0.010826771653543288,0.011684518013632017,0.009624639076034613,0.006673021925643674,0.004734848484848397,0.004712535344015167,0.004690431519699612,0.004668534080298992,0.004646840148698761,0.0046253469010177906,0.0036832412523020164 +population,0,0.0027545812166367423,0.0038212824786565402,0.008895301349200357,0.011384520833702672,0.011507617698184314,0.009401156443330061,0.007035986826480878,0.0049297055246797505,0.00481787363260322,0.004701916502291681,0.004576287325868789,0.004441136127931511,0.004293822945723447,0.004140042892834206 pre_tax_contributions,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876 prior_year_minimum_tax_credit,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876 qualified_dividend_income,0,0.19999999999999996,0.057499999999999885,0.011032308904649346,0.03273577552611062,0.03849056603773571,0.02761627906976738,0.02192362093352207,0.02629757785467124,0.033715441672285795,0.05936073059360747,0.05541871921182251,0.05075845974329063,0.046640755136035494,0.04297082228116711 @@ -58,7 +59,7 @@ self_employed_pension_contribution_ald,0,0.16599999999999993,-0.0154373927958834 self_employed_pension_contributions,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876 
self_employment_income,0,0.2549999999999999,0.05338645418326715,0.02647503782148264,0.06558585114222537,0.04011065006915637,0.020611702127659504,0.02084690553745938,0.00574345883854499,0.012055837563451632,0.01692789968652053,0.020345252774352618,0.020543806646525775,0.02013025458851403,0.032501450957632017 self_employment_income_before_lsr,0,0.2549999999999999,0.05338645418326715,0.02647503782148264,0.06558585114222537,0.04011065006915637,0.020611702127659504,0.02084690553745938,0.00574345883854499,0.012055837563451632,0.01692789968652053,0.020345252774352618,0.020543806646525775,0.02013025458851403,0.032501450957632017 -self_employment_income_last_year,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876 +self_employment_income_last_year,0,0.2549999999999999,0.05338645418326715,0.02647503782148264,0.06558585114222537,0.04011065006915637,0.020611702127659504,0.02084690553745938,0.00574345883854499,0.012055837563451632,0.01692789968652053,0.020345252774352618,0.020543806646525775,0.02013025458851403,0.032501450957632017 short_term_capital_gains,0,-0.0030000000000000027,0.5947843530591777,0.0761006289308177,0.040911747516072294,-0.03930376193149909,-0.0455873758036236,-0.015921616656460524,0.0031113876789048422,0.01674937965260548,0.02501525320317266,0.0279761904761906,0.03126809496236227,0.03200449185850651,0.03264417845484213 snap_reported,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876 
social_security,0,0.276,0.06191222570532906,0.1439114391143912,0.10838709677419356,0.07159487776484275,0.05214557305812062,0.04852865255549821,0.05514524864598713,0.058329444703686395,0.057319223985890844,0.050458715596330306,0.05359269551409285,0.056895252449133515,0.05204991087344024 diff --git a/policyengine_us_data/datasets/cps/__init__.py b/policyengine_us_data/datasets/cps/__init__.py index 213a613..2411ca4 100644 --- a/policyengine_us_data/datasets/cps/__init__.py +++ b/policyengine_us_data/datasets/cps/__init__.py @@ -1,2 +1,3 @@ from .cps import * from .extended_cps import * +from .enhanced_cps import * diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 5d87f29..7a79d3e 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -562,6 +562,6 @@ class CPS_2022(CPS): class CPS_2024(CPS): name = "cps_2024" - label = "CPS 2024" + label = "CPS 2024 (2022-based)" file_path = STORAGE_FOLDER / "cps_2024.h5" time_period = 2024 diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 572824c..f960612 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -1,5 +1,208 @@ from policyengine_core.data import Dataset +import pandas as pd +from policyengine_us_data.utils import ( + pe_to_soi, + get_soi, + build_loss_matrix, + fmt, +) +import numpy as np +from typing import Type +from policyengine_us_data.data_storage import STORAGE_FOLDER +from policyengine_us_data.datasets.cps import ExtendedCPS_2024 +import torch + + +def build_loss_matrix(dataset: type, time_period): + loss_matrix = pd.DataFrame() + df = pe_to_soi(dataset, time_period) + agi = df["adjusted_gross_income"].values + filer = df["is_tax_filer"].values + soi_subset = get_soi(time_period) + targets_array = [] + agi_level_targeted_variables = [ + "adjusted_gross_income", + "count", + 
"employment_income", + "business_net_profits", + "capital_gains_gross", + "ordinary_dividends", + "partnership_and_s_corp_income", + "qualified_dividends", + "taxable_interest_income", + "total_pension_income", + "total_social_security", + ] + aggregate_level_targeted_variables = [ + "business_net_losses", + "capital_gains_distributions", + "capital_gains_losses", + "estate_income", + "estate_losses", + "exempt_interest", + "ira_distributions", + "partnership_and_s_corp_losses", + "rent_and_royalty_net_income", + "rent_and_royalty_net_losses", + "taxable_pension_income", + "taxable_social_security", + "unemployment_compensation", + ] + aggregate_level_targeted_variables = [ + variable + for variable in aggregate_level_targeted_variables + if variable in df.columns + ] + soi_subset = soi_subset[ + soi_subset.Variable.isin(agi_level_targeted_variables) + & ( + (soi_subset["AGI lower bound"] != -np.inf) + | (soi_subset["AGI upper bound"] != np.inf) + ) + | ( + soi_subset.Variable.isin(aggregate_level_targeted_variables) + & (soi_subset["AGI lower bound"] == -np.inf) + & (soi_subset["AGI upper bound"] == np.inf) + ) + ] + for _, row in soi_subset.iterrows(): + if row["Taxable only"]: + continue # exclude "taxable returns" statistics + + mask = ( + (agi >= row["AGI lower bound"]) + * (agi < row["AGI upper bound"]) + * filer + ) > 0 + + if row["Filing status"] == "Single": + mask *= df["filing_status"].values == "SINGLE" + elif row["Filing status"] == "Married Filing Jointly/Surviving Spouse": + mask *= df["filing_status"].values == "JOINT" + elif row["Filing status"] == "Head of Household": + mask *= df["filing_status"].values == "HEAD_OF_HOUSEHOLD" + elif row["Filing status"] == "Married Filing Separately": + mask *= df["filing_status"].values == "SEPARATE" + + values = df[row["Variable"]].values + + if row["Count"]: + values = (values > 0).astype(float) + + agi_range_label = ( + f"{fmt(row['AGI lower bound'])}-{fmt(row['AGI upper bound'])}" + ) + taxable_label = ( + 
def reweight(
    original_weights,
    loss_matrix,
    targets_array,
    epochs=1_000,
    learning_rate=1e-2,
):
    """Calibrate weights so weighted column totals approach the targets.

    The weights are optimised in log space, which keeps them non-negative,
    using Adam to minimise the mean squared relative error between
    ``weights @ loss_matrix`` and ``targets_array``.

    Args:
        original_weights: 1-D array-like of starting weights, one per row of
            ``loss_matrix``. Must be finite and non-negative. A zero weight
            has a log of ``-inf`` and so starts out contributing nothing.
        loss_matrix: 2-D array-like (NumPy array or pandas DataFrame) with
            one row per weight and one column per target.
        targets_array: 1-D array-like of target totals, one per column of
            ``loss_matrix``. Must be finite and non-zero, because the loss
            divides by each target.
        epochs: Number of Adam steps to run.
        learning_rate: Adam learning rate.

    Returns:
        numpy.ndarray: the optimised weights (float32).

    Raises:
        ValueError: if the weights contain negative or non-finite values, or
            the targets contain zero or non-finite values. Previously these
            inputs silently produced an all-NaN result.
    """
    original_weights = np.asarray(original_weights, dtype=np.float64)
    if np.any(~np.isfinite(original_weights)) or np.any(original_weights < 0):
        raise ValueError(
            "original_weights must be finite and non-negative to be "
            "optimised in log space"
        )

    # np.asarray lets callers pass either a raw array or a DataFrame.
    loss_matrix = torch.tensor(np.asarray(loss_matrix), dtype=torch.float32)
    targets_array = torch.tensor(
        np.asarray(targets_array), dtype=torch.float32
    )
    # Checked after the float32 conversion, since that is the precision the
    # division below actually happens in.
    if not bool(
        torch.all(torch.isfinite(targets_array) & (targets_array != 0))
    ):
        raise ValueError(
            "targets_array must be finite and non-zero to compute "
            "relative errors"
        )

    # TODO: replace this with a call to the python reweight.py package.
    def loss(weights):
        estimate = weights @ loss_matrix
        rel_error = ((estimate - targets_array) / targets_array) ** 2
        return rel_error.mean()

    # log(0) = -inf is intentional for zero weights; silence the warning.
    with np.errstate(divide="ignore"):
        log_weights = torch.tensor(
            np.log(original_weights), requires_grad=True, dtype=torch.float32
        )
    optimizer = torch.optim.Adam([log_weights], lr=learning_rate)

    # The progress bar is a nicety; fall back to a plain range without tqdm.
    try:
        from tqdm import trange
    except ImportError:
        iterator = range(epochs)
    else:
        iterator = trange(epochs)
    report = getattr(iterator, "set_postfix", None)

    for _ in iterator:
        optimizer.zero_grad()
        loss_value = loss(torch.exp(log_weights))
        loss_value.backward()
        if report is not None:
            report({"loss": loss_value.item()})
        optimizer.step()

    return torch.exp(log_weights).detach().numpy()
] -IMPUTED_VARIABLES = ["employment_income"] - class ExtendedCPS(Dataset): cps: Type[CPS] @@ -95,21 +93,39 @@ def generate(self): model = Imputation() - model.train(X_train, y_train, verbose=True) + model.train(X_train, y_train, verbose=True, num_trees=10) X = cps_sim.calculate_dataframe(INPUTS) y = model.predict(X, verbose=True) original_dataset = cps_sim.to_input_dataframe() - original_dataset["employment_income"] = ( - original_dataset.employment_income_before_lsr - ) - original_dataset["self_employment_income"] = ( - original_dataset.self_employment_income_before_lsr - ) + renames = { + f"employment_income_before_lsr__{self.time_period}": f"employment_income__{self.time_period}", + f"self_employment_income_before_lsr__{self.time_period}": f"self_employment_income__{self.time_period}", + } + for a, b in renames.items(): + original_dataset[b] = original_dataset[a] + del original_dataset[a] imputed_dataset = original_dataset.copy().reset_index() - imputed_dataset[IMPUTED_VARIABLES] = y + for variable in IMPUTED_VARIABLES: + imputed_dataset[f"{variable}__{self.time_period}"] = y[variable] + + ENTITIES = ("person", "tax_unit", "family", "spm_unit", "household") + for entity in ENTITIES: + for id_name in [ + f"{entity}_id__{self.time_period}", + f"person_{entity}_id__{self.time_period}", + ]: + if "person_person" in id_name: + continue + original_ids = original_dataset[id_name].values + new_ids = original_ids + original_ids.max() + imputed_dataset[id_name] = new_ids + + for variable in imputed_dataset.columns: + if "_weight" in variable: + imputed_dataset[variable] = 0 original_dataset["data_source"] = "cps" imputed_dataset["data_source"] = "puf_imputed" combined = pd.concat([original_dataset, imputed_dataset]) @@ -123,3 +139,4 @@ class ExtendedCPS_2024(ExtendedCPS): name = "extended_cps_2024" label = "Extended CPS (2024)" file_path = STORAGE_FOLDER / "extended_cps_2024.csv" + time_period = 2024 diff --git a/policyengine_us_data/datasets/puf/puf.py 
def fmt(x):
    """Format a number as a short human-readable label.

    Used to build AGI-range labels such as ``25k-50k``. Magnitudes below
    1,000 are shown as whole numbers, thousands as ``k``, millions as ``m``
    and anything larger as ``bn`` with one decimal place.

    Args:
        x: A real number; may be infinite or NaN.

    Returns:
        str: the compact label, e.g. ``"-inf"``, ``"999"``, ``"25k"``,
        ``"1m"`` or ``"1.5bn"``.
    """
    if x == -np.inf:
        return "-inf"
    if x == np.inf:
        return "inf"
    if np.isnan(x):
        # Every comparison below is False for NaN, which used to fall
        # through to the nonsensical "nanbn".
        return "nan"
    if x < 0:
        # Abbreviate by magnitude so -5000 reads "-5k", not "-5000".
        return "-" + fmt(-x)
    if x < 1e3:
        return f"{x:.0f}"
    if x < 1e6:
        return f"{x/1e3:.0f}k"
    if x < 1e9:
        return f"{x/1e6:.0f}m"
    return f"{x/1e9:.1f}bn"
"partnership_and_s_corp_losses", + "rent_and_royalty_net_income", + "rent_and_royalty_net_losses", + "taxable_pension_income", + "taxable_social_security", + "unemployment_compensation", + ] + aggregate_level_targeted_variables = [ + variable + for variable in aggregate_level_targeted_variables + if variable in df.columns + ] + soi_subset = soi_subset[ + soi_subset.Variable.isin(agi_level_targeted_variables) + & ( + (soi_subset["AGI lower bound"] != -np.inf) + | (soi_subset["AGI upper bound"] != np.inf) + ) + | ( + soi_subset.Variable.isin(aggregate_level_targeted_variables) + & (soi_subset["AGI lower bound"] == -np.inf) + & (soi_subset["AGI upper bound"] == np.inf) + ) + ] + for _, row in soi_subset.iterrows(): + if row["Taxable only"]: + continue # exclude "taxable returns" statistics + + mask = ( + (agi >= row["AGI lower bound"]) + * (agi < row["AGI upper bound"]) + * filer + ) > 0 + + if row["Filing status"] == "Single": + mask *= df["filing_status"].values == "SINGLE" + elif row["Filing status"] == "Married Filing Jointly/Surviving Spouse": + mask *= df["filing_status"].values == "JOINT" + elif row["Filing status"] == "Head of Household": + mask *= df["filing_status"].values == "HEAD_OF_HOUSEHOLD" + elif row["Filing status"] == "Married Filing Separately": + mask *= df["filing_status"].values == "SEPARATE" + + values = df[row["Variable"]].values + + if row["Count"]: + values = (values > 0).astype(float) + + agi_range_label = ( + f"{fmt(row['AGI lower bound'])}-{fmt(row['AGI upper bound'])}" + ) + taxable_label = ( + "taxable" if row["Taxable only"] else "all" + " returns" + ) + filing_status_label = row["Filing status"] + + variable_label = row["Variable"].replace("_", " ") + + if row["Count"] and not row["Variable"] == "count": + label = ( + f"{variable_label}/count/AGI in " + f"{agi_range_label}/{taxable_label}/{filing_status_label}" + ) + elif row["Variable"] == "count": + label = ( + f"{variable_label}/count/AGI in " + 
def pe_to_soi(pe_dataset, year):
    """Build a tax-unit-level table of PolicyEngine results under SOI names.

    Runs a microsimulation on ``pe_dataset`` and stores each calculated
    variable under the column name used for the matching SOI statistic.
    Net-income variables that can be negative are split into a non-negative
    income/profit column and a non-negative loss column.

    Args:
        pe_dataset: Dataset passed to ``Microsimulation(dataset=...)``.
        year: Period for which every variable is calculated. It was
            previously accepted but ignored, so all values came from the
            simulation's default period.

    Returns:
        pd.DataFrame: one row per tax unit.
    """
    from policyengine_us import Microsimulation

    pe_sim = Microsimulation(dataset=pe_dataset)
    df = pd.DataFrame()

    def pe(variable):
        # Calculate a variable for the requested year at tax-unit level.
        return np.array(
            pe_sim.calculate(variable, period=year, map_to="tax_unit")
        )

    def split_by_sign(variable):
        # Return (positive part, magnitude of negative part).
        values = pe(variable)
        return values * (values > 0), -values * (values < 0)

    df["adjusted_gross_income"] = pe("adjusted_gross_income")
    df["exemption"] = pe("exemptions")
    df["itemded"] = pe("itemized_taxable_income_deductions")
    df["income_tax_after_credits"] = pe("income_tax")
    df["total_income_tax"] = pe("income_tax_before_credits")
    df["taxable_income"] = pe("taxable_income")
    (
        df["business_net_profits"],
        df["business_net_losses"],
    ) = split_by_sign("self_employment_income")
    df["capital_gains_distributions"] = pe("non_sch_d_capital_gains")
    (
        df["capital_gains_gross"],
        df["capital_gains_losses"],
    ) = split_by_sign("loss_limited_net_capital_gains")
    (
        df["estate_income"],
        df["estate_losses"],
    ) = split_by_sign("estate_income")
    df["exempt_interest"] = pe("tax_exempt_interest_income")
    df["ira_distributions"] = pe("taxable_ira_distributions")
    df["count_of_exemptions"] = pe("exemptions_count")
    df["ordinary_dividends"] = pe("non_qualified_dividend_income") + pe(
        "qualified_dividend_income"
    )
    (
        df["partnership_and_s_corp_income"],
        df["partnership_and_s_corp_losses"],
    ) = split_by_sign("partnership_s_corp_income")
    df["total_pension_income"] = pe("pension_income")
    df["taxable_pension_income"] = pe("taxable_pension_income")
    df["qualified_dividends"] = pe("qualified_dividend_income")
    (
        df["rent_and_royalty_net_income"],
        df["rent_and_royalty_net_losses"],
    ) = split_by_sign("rental_income")
    df["total_social_security"] = pe("social_security")
    df["taxable_social_security"] = pe("taxable_social_security")
    df["income_tax_before_credits"] = pe("income_tax_before_credits")
    df["taxable_interest_income"] = pe("taxable_interest_income")
    df["unemployment_compensation"] = pe("taxable_unemployment_compensation")
    df["employment_income"] = pe("irs_employment_income")
    df["qualified_business_income_deduction"] = pe(
        "qualified_business_income_deduction"
    )
    df["charitable_contributions_deduction"] = pe("charitable_deduction")
    df["interest_paid_deductions"] = pe("interest_deduction")
    df["medical_expense_deductions_uncapped"] = pe("medical_expense_deduction")
    df["state_and_local_tax_deductions"] = pe("salt_deduction")
    df["itemized_state_income_and_sales_tax_deductions"] = pe(
        "state_and_local_sales_or_income_tax"
    )
    df["itemized_real_estate_tax_deductions"] = pe("real_estate_taxes")
    df["is_tax_filer"] = pe("tax_unit_is_filer")
    # Constant column so that "count" can be targeted like any other variable.
    df["count"] = 1

    df["filing_status"] = pe("filing_status")
    df["weight"] = pe("household_weight")
    df["household_id"] = pe("household_id")

    return df
def get_soi(year: int) -> pd.DataFrame:
    """Return the latest SOI statistics with values uprated to ``year``.

    Loads ``soi.csv``, keeps only its most recent year, and scales each
    row's ``Value`` by the growth of the matching PolicyEngine uprating
    index between that base year and ``year``.

    Args:
        year: Year to which the SOI values are projected.

    Returns:
        pd.DataFrame: the most recent year's SOI rows with uprated values.
    """
    uprating = create_policyengine_uprating_factors_table()

    # SOI variable name -> PolicyEngine variable whose index uprates it.
    uprating_map = {
        "adjusted_gross_income": "adjusted_gross_income",
        "count": "population",
        "employment_income": "employment_income",
        "business_net_profits": "self_employment_income",
        "capital_gains_gross": "long_term_capital_gains",
        "ordinary_dividends": "non_qualified_dividend_income",
        "partnership_and_s_corp_income": "partnership_s_corp_income",
        "qualified_dividends": "qualified_dividend_income",
        "taxable_interest_income": "taxable_interest_income",
        "total_pension_income": "pension_income",
        "total_social_security": "social_security",
        "business_net_losses": "self_employment_income",
        "capital_gains_distributions": "long_term_capital_gains",
        "capital_gains_losses": "long_term_capital_gains",
        "estate_income": "estate_income",
        "estate_losses": "estate_income",
        "exempt_interest": "tax_exempt_interest_income",
        "ira_distributions": "taxable_ira_distributions",
        "partnership_and_s_corp_losses": "partnership_s_corp_income",
        "rent_and_royalty_net_income": "rental_income",
        "rent_and_royalty_net_losses": "rental_income",
        "taxable_pension_income": "taxable_pension_income",
        "taxable_social_security": "taxable_social_security",
        "unemployment_compensation": "unemployment_compensation",
    }
    soi = pd.read_csv(STORAGE_FOLDER / "soi.csv")

    base_year = soi.Year.max()
    # Copy so the in-place scaling below does not write into a slice.
    soi = soi[soi.Year == base_year].copy()
    # Scaling factors are floats; make sure the column can hold them.
    soi["Value"] = soi["Value"].astype(float)

    for soi_variable in soi.Variable.unique():
        # Translate the SOI name to the PolicyEngine name. The mapping used
        # to be defined but never applied, so only variables whose names
        # happened to coincide were uprated. Unmapped names still fall back
        # to a same-named index, as before.
        pe_variable = uprating_map.get(soi_variable, soi_variable)
        if pe_variable not in uprating.index:
            # No index available: the value stays at its base-year level.
            continue
        uprating_factor = (
            uprating.loc[pe_variable, year]
            / uprating.loc[pe_variable, base_year]
        )
        soi.loc[soi.Variable == soi_variable, "Value"] *= uprating_factor

    return soi