Skip to content

Commit

Permalink
Improve calibration for ECPS
Browse files Browse the repository at this point in the history
  • Loading branch information
nikhilwoodruff committed Aug 24, 2024
1 parent 9e759c5 commit c28821b
Show file tree
Hide file tree
Showing 14 changed files with 653 additions and 18 deletions.
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,7 @@ install:
pip install -e .[dev]

# Build the container image for linux/amd64 and tag it policyengine-us-data:latest.
docker:
	docker buildx build --platform linux/amd64 . -t policyengine-us-data:latest

# Launch the Streamlit documentation app, starting from docs/Home.py.
documentation:
	streamlit run docs/Home.py
3 changes: 3 additions & 0 deletions docs/Home.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import streamlit as st

# Heading rendered at the top of the documentation home page.
st.title("PolicyEngine US Data")
72 changes: 72 additions & 0 deletions docs/pages/Benchmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import streamlit as st

# Request Streamlit's "wide" page layout for this page.
st.set_page_config(layout="wide")

# Page heading.
st.title("Benchmarks")

from policyengine_us_data.datasets import CPS_2024, PUF_2024, EnhancedCPS_2024
from policyengine_us_data.utils import build_loss_matrix
from policyengine_us import Microsimulation
import pandas as pd
import plotly.express as px


@st.cache_data
def compare_datasets():
    """Compare weighted estimates against calibration targets per dataset.

    For each of CPS_2024, PUF_2024 and EnhancedCPS_2024, the household
    weights are multiplied into the matrix returned by
    ``build_loss_matrix(dataset, 2024)`` to produce one estimate per target.

    Returns:
        pandas.DataFrame: one row per (dataset, target) pair with columns
        "Target", "Estimate", "Actual", "Error", "Abs. Error",
        "Abs. Error %" and "Dataset". "Abs. Error %" holds the ratio
        ``Abs. Error / Actual`` (a fraction; it is not multiplied by 100).
        Row labels restart at 0 for every dataset because the per-dataset
        frames are concatenated without resetting the index.
    """
    comparisons = []
    for dataset in [CPS_2024, PUF_2024, EnhancedCPS_2024]:
        sim = Microsimulation(dataset=dataset)
        weights = sim.calculate("household_weight").values
        loss_matrix, targets_array = build_loss_matrix(dataset, 2024)
        # Weighted column sums: one estimate for each target column.
        estimates = weights @ loss_matrix.values
        comparison = pd.DataFrame(
            {
                "Target": loss_matrix.columns,
                "Estimate": estimates,
                "Actual": targets_array,
            }
        )
        comparison["Error"] = comparison["Estimate"] - comparison["Actual"]
        comparison["Abs. Error"] = comparison["Error"].abs()
        comparison["Abs. Error %"] = (
            comparison["Abs. Error"] / comparison["Actual"]
        )
        comparison["Dataset"] = dataset.label
        comparisons.append(comparison)

    # Concatenate once at the end rather than growing a frame inside the
    # loop: this avoids re-copying earlier rows on every iteration and avoids
    # concatenating with an empty placeholder frame.
    return pd.concat(comparisons)


# Per-dataset comparison table (compare_datasets is wrapped in st.cache_data).
df = compare_datasets()

# Summary table: mean of the "Abs. Error %" column for each dataset.
grouped_errors = df.groupby("Dataset")["Abs. Error %"]
mean_relative_error_by_dataset = grouped_errors.mean().reset_index()
st.write(mean_relative_error_by_dataset)

# Let the user pick which column to plot and which target to inspect.
metric_options = ["Estimate", "Error", "Abs. Error", "Abs. Error %"]
metric = st.selectbox("Metric", metric_options)
target = st.selectbox("Target", df["Target"].unique())

is_selected_target = df["Target"] == target
fig = px.bar(
    df[is_selected_target],
    x="Dataset",
    y=metric,
    title=f"{metric} for {target}",
)

if metric == "Estimate":
    # Dashed horizontal reference line at the target's "Actual" value, taken
    # from the first matching row. The fixed x-range of -0.5 to 2.5 matches
    # three bars (presumably one per dataset - adjust if datasets change).
    actual_value = df.loc[is_selected_target, "Actual"].values[0]
    fig.add_shape(
        type="line",
        x0=-0.5,
        x1=2.5,
        y0=actual_value,
        y1=actual_value,
        line={"dash": "dash"},
    )

st.plotly_chart(fig, use_container_width=True)
5 changes: 3 additions & 2 deletions policyengine_us_data/data_storage/uprating_factors.csv
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ early_withdrawal_penalty,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467
educator_expense,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718
employment_income,1.0,1.069,1.149,1.211,1.264,1.306,1.348,1.39,1.438,1.486,1.536,1.587,1.639,1.693,1.748
employment_income_before_lsr,1.0,1.069,1.149,1.211,1.264,1.306,1.348,1.39,1.438,1.486,1.536,1.587,1.639,1.693,1.748
employment_income_last_year,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718
employment_income_last_year,1.0,1.069,1.149,1.211,1.264,1.306,1.348,1.39,1.438,1.486,1.536,1.587,1.639,1.693,1.748
energy_efficient_home_improvement_credit,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718
estate_income,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718
excess_withheld_payroll_tax,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718
Expand All @@ -42,6 +42,7 @@ non_sch_d_capital_gains,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,
other_credits,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718
partnership_s_corp_income,1.0,0.997,1.542,1.581,1.685,1.753,1.789,1.827,1.837,1.859,1.891,1.929,1.969,2.009,2.074
person_weight,1.0,1.003,1.007,1.016,1.027,1.039,1.049,1.056,1.061,1.066,1.071,1.076,1.081,1.086,1.09
population,1.0,1.0027545812166367,1.0065863897282326,1.0155402789988688,1.0271017184625957,1.0389212123758114,1.0486882732256506,1.0560668301011513,1.061272928587932,1.0663860074475715,1.0714000654138023,1.0763030999540903,1.0810831085359012,1.0857250879935667,1.0902200364276862
pre_tax_contributions,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718
prior_year_minimum_tax_credit,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718
qualified_dividend_income,1.0,1.2,1.269,1.283,1.325,1.376,1.414,1.445,1.483,1.533,1.624,1.714,1.801,1.885,1.966
Expand All @@ -58,7 +59,7 @@ self_employed_pension_contribution_ald,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.3
self_employed_pension_contributions,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718
self_employment_income,1.0,1.255,1.322,1.357,1.446,1.504,1.535,1.567,1.576,1.595,1.622,1.655,1.689,1.723,1.779
self_employment_income_before_lsr,1.0,1.255,1.322,1.357,1.446,1.504,1.535,1.567,1.576,1.595,1.622,1.655,1.689,1.723,1.779
self_employment_income_last_year,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718
self_employment_income_last_year,1.0,1.255,1.322,1.357,1.446,1.504,1.535,1.567,1.576,1.595,1.622,1.655,1.689,1.723,1.779
short_term_capital_gains,1.0,0.997,1.59,1.711,1.781,1.711,1.633,1.607,1.612,1.639,1.68,1.727,1.781,1.838,1.898
snap_reported,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718
social_security,1.0,1.276,1.355,1.55,1.718,1.841,1.937,2.031,2.143,2.268,2.398,2.519,2.654,2.805,2.951
Expand Down
5 changes: 3 additions & 2 deletions policyengine_us_data/data_storage/uprating_growth_factors.csv
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ early_withdrawal_penalty,0,0.16599999999999993,-0.015437392795883409,0.058362369
educator_expense,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876
employment_income,0,0.06899999999999995,0.07483629560336769,0.05395996518711921,0.04376548307184147,0.03322784810126578,0.03215926493108734,0.03115727002967339,0.03453237410071952,0.033379694019471495,0.03364737550471064,0.033203125,0.032766225582860686,0.032946918852959195,0.03248670998227987
employment_income_before_lsr,0,0.06899999999999995,0.07483629560336769,0.05395996518711921,0.04376548307184147,0.03322784810126578,0.03215926493108734,0.03115727002967339,0.03453237410071952,0.033379694019471495,0.03364737550471064,0.033203125,0.032766225582860686,0.032946918852959195,0.03248670998227987
employment_income_last_year,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876
employment_income_last_year,0,0.06899999999999995,0.07483629560336769,0.05395996518711921,0.04376548307184147,0.03322784810126578,0.03215926493108734,0.03115727002967339,0.03453237410071952,0.033379694019471495,0.03364737550471064,0.033203125,0.032766225582860686,0.032946918852959195,0.03248670998227987
energy_efficient_home_improvement_credit,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876
estate_income,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876
excess_withheld_payroll_tax,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876
Expand All @@ -42,6 +42,7 @@ non_sch_d_capital_gains,0,0.16599999999999993,-0.015437392795883409,0.0583623693
other_credits,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876
partnership_s_corp_income,0,-0.0030000000000000027,0.546639919759278,0.02529182879377423,0.0657811511701456,0.04035608308605321,0.02053622361665708,0.02124091671324768,0.005473453749315738,0.011976047904191711,0.017213555675094083,0.0200951877313591,0.02073613271124941,0.02031488065007614,0.03235440517670485
person_weight,0,0.0029999999999998916,0.003988035892322994,0.008937437934458892,0.010826771653543288,0.011684518013632017,0.009624639076034613,0.006673021925643674,0.004734848484848397,0.004712535344015167,0.004690431519699612,0.004668534080298992,0.004646840148698761,0.0046253469010177906,0.0036832412523020164
population,0,0.0027545812166367423,0.0038212824786565402,0.008895301349200357,0.011384520833702672,0.011507617698184314,0.009401156443330061,0.007035986826480878,0.0049297055246797505,0.00481787363260322,0.004701916502291681,0.004576287325868789,0.004441136127931511,0.004293822945723447,0.004140042892834206
pre_tax_contributions,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876
prior_year_minimum_tax_credit,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876
qualified_dividend_income,0,0.19999999999999996,0.057499999999999885,0.011032308904649346,0.03273577552611062,0.03849056603773571,0.02761627906976738,0.02192362093352207,0.02629757785467124,0.033715441672285795,0.05936073059360747,0.05541871921182251,0.05075845974329063,0.046640755136035494,0.04297082228116711
Expand All @@ -58,7 +59,7 @@ self_employed_pension_contribution_ald,0,0.16599999999999993,-0.0154373927958834
self_employed_pension_contributions,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876
self_employment_income,0,0.2549999999999999,0.05338645418326715,0.02647503782148264,0.06558585114222537,0.04011065006915637,0.020611702127659504,0.02084690553745938,0.00574345883854499,0.012055837563451632,0.01692789968652053,0.020345252774352618,0.020543806646525775,0.02013025458851403,0.032501450957632017
self_employment_income_before_lsr,0,0.2549999999999999,0.05338645418326715,0.02647503782148264,0.06558585114222537,0.04011065006915637,0.020611702127659504,0.02084690553745938,0.00574345883854499,0.012055837563451632,0.01692789968652053,0.020345252774352618,0.020543806646525775,0.02013025458851403,0.032501450957632017
self_employment_income_last_year,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876
self_employment_income_last_year,0,0.2549999999999999,0.05338645418326715,0.02647503782148264,0.06558585114222537,0.04011065006915637,0.020611702127659504,0.02084690553745938,0.00574345883854499,0.012055837563451632,0.01692789968652053,0.020345252774352618,0.020543806646525775,0.02013025458851403,0.032501450957632017
short_term_capital_gains,0,-0.0030000000000000027,0.5947843530591777,0.0761006289308177,0.040911747516072294,-0.03930376193149909,-0.0455873758036236,-0.015921616656460524,0.0031113876789048422,0.01674937965260548,0.02501525320317266,0.0279761904761906,0.03126809496236227,0.03200449185850651,0.03264417845484213
snap_reported,0,0.16599999999999993,-0.015437392795883409,0.058362369337979336,0.05349794238683114,0.02968750000000009,0.02427921092564489,0.028888888888888742,0.02807775377969768,0.027310924369748024,0.03135650988411709,0.03172504957039002,0.03203074951953888,0.0322780881440099,0.03307276007215876
social_security,0,0.276,0.06191222570532906,0.1439114391143912,0.10838709677419356,0.07159487776484275,0.05214557305812062,0.04852865255549821,0.05514524864598713,0.058329444703686395,0.057319223985890844,0.050458715596330306,0.05359269551409285,0.056895252449133515,0.05204991087344024
Expand Down
1 change: 1 addition & 0 deletions policyengine_us_data/datasets/cps/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .cps import *
from .extended_cps import *
from .enhanced_cps import *
2 changes: 1 addition & 1 deletion policyengine_us_data/datasets/cps/cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,6 +562,6 @@ class CPS_2022(CPS):

class CPS_2024(CPS):
    """CPS dataset for the 2024 time period, stored as cps_2024.h5."""

    name = "cps_2024"
    label = "CPS 2024 (2022-based)"
    file_path = STORAGE_FOLDER / "cps_2024.h5"
    time_period = 2024
205 changes: 204 additions & 1 deletion policyengine_us_data/datasets/cps/enhanced_cps.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,208 @@
from policyengine_core.data import Dataset
import pandas as pd
from policyengine_us_data.utils import (
pe_to_soi,
get_soi,
build_loss_matrix,
fmt,
)
import numpy as np
from typing import Type
from policyengine_us_data.data_storage import STORAGE_FOLDER
from policyengine_us_data.datasets.cps import ExtendedCPS_2024
import torch


def build_loss_matrix(dataset: type, time_period):
    """Build the household-level calibration matrix and its SOI targets.

    Every retained SOI statistic becomes one column. For each tax unit the
    column holds that unit's contribution to the statistic: the variable's
    value (or 1.0 where the value is positive, for count statistics),
    multiplied by a mask that is zero for non-filers and for units outside
    the statistic's AGI band or filing status. Columns are then summed within
    households.

    NOTE(review): this function has the same name as the
    ``build_loss_matrix`` imported from ``policyengine_us_data.utils`` at the
    top of the file, so it replaces that import in this module's namespace -
    confirm this is intended.

    Args:
        dataset: Dataset class handed to ``pe_to_soi`` and
            ``Microsimulation``.
        time_period: Period handed to ``pe_to_soi`` and ``get_soi``.

    Returns:
        tuple: ``(matrix, targets)``. ``matrix`` is a 2-D numpy array with
        one row per distinct household id (rows follow the group order
        produced by ``DataFrame.groupby``) and one column per target.
        ``targets`` is a 1-D numpy array of the SOI "Value" entries, in the
        same order as the columns.
    """
    from policyengine_us import Microsimulation

    df = pe_to_soi(dataset, time_period)
    agi = df["adjusted_gross_income"].values
    filer = df["is_tax_filer"].values
    filing_status = df["filing_status"].values
    soi_subset = get_soi(time_period)

    # Variables targeted within individual AGI bands.
    agi_level_targeted_variables = [
        "adjusted_gross_income",
        "count",
        "employment_income",
        "business_net_profits",
        "capital_gains_gross",
        "ordinary_dividends",
        "partnership_and_s_corp_income",
        "qualified_dividends",
        "taxable_interest_income",
        "total_pension_income",
        "total_social_security",
    ]
    # Variables targeted only as an all-AGI aggregate, limited to those that
    # actually exist in the converted dataset.
    aggregate_level_targeted_variables = [
        variable
        for variable in [
            "business_net_losses",
            "capital_gains_distributions",
            "capital_gains_losses",
            "estate_income",
            "estate_losses",
            "exempt_interest",
            "ira_distributions",
            "partnership_and_s_corp_losses",
            "rent_and_royalty_net_income",
            "rent_and_royalty_net_losses",
            "taxable_pension_income",
            "taxable_social_security",
            "unemployment_compensation",
        ]
        if variable in df.columns
    ]

    covers_all_agi = (soi_subset["AGI lower bound"] == -np.inf) & (
        soi_subset["AGI upper bound"] == np.inf
    )
    soi_subset = soi_subset[
        (
            soi_subset.Variable.isin(agi_level_targeted_variables)
            & (
                (soi_subset["AGI lower bound"] != -np.inf)
                | (soi_subset["AGI upper bound"] != np.inf)
            )
        )
        | (
            soi_subset.Variable.isin(aggregate_level_targeted_variables)
            & covers_all_agi
        )
    ]

    # SOI filing-status names -> dataset filing_status codes. Any other
    # status value leaves the mask unrestricted by filing status.
    filing_status_codes = {
        "Single": "SINGLE",
        "Married Filing Jointly/Surviving Spouse": "JOINT",
        "Head of Household": "HEAD_OF_HOUSEHOLD",
        "Married Filing Separately": "SEPARATE",
    }

    # Collect columns in a dict and build the frame once; inserting columns
    # one by one into a DataFrame fragments it.
    columns = {}
    targets_array = []
    for _, row in soi_subset.iterrows():
        if row["Taxable only"]:
            continue  # exclude "taxable returns" statistics

        mask = (
            (agi >= row["AGI lower bound"])
            * (agi < row["AGI upper bound"])
            * filer
        ) > 0

        status_code = filing_status_codes.get(row["Filing status"])
        if status_code is not None:
            mask = mask & (filing_status == status_code)

        values = df[row["Variable"]].values
        if row["Count"]:
            # Count statistics: 1.0 for units with a positive value.
            values = (values > 0).astype(float)

        agi_range_label = (
            f"{fmt(row['AGI lower bound'])}-{fmt(row['AGI upper bound'])}"
        )
        # Parenthesised so " returns" is appended to both alternatives.
        taxable_label = (
            "taxable" if row["Taxable only"] else "all"
        ) + " returns"
        filing_status_label = row["Filing status"]
        variable_label = row["Variable"].replace("_", " ")
        is_count = row["Count"] or row["Variable"] == "count"
        statistic_label = "count" if is_count else "total"

        label = (
            f"{variable_label}/{statistic_label}/AGI in "
            f"{agi_range_label}/{taxable_label}/{filing_status_label}"
        )

        # Keep only the first statistic for a given label, and record its
        # target in the same step so columns and targets stay aligned.
        if label not in columns:
            columns[label] = mask * values
            targets_array.append(row["Value"])

    loss_matrix = pd.DataFrame(columns)

    # Convert tax-unit level df to household-level df

    sim = Microsimulation(dataset=dataset)
    hh_id = sim.calculate("household_id", map_to="person")
    tax_unit_hh_id = sim.map_result(
        hh_id, "person", "tax_unit", how="value_from_first_person"
    )

    loss_matrix = loss_matrix.groupby(tax_unit_hh_id).sum()

    return loss_matrix.values, np.array(targets_array)


def reweight(
    original_weights,
    loss_matrix,
    targets_array,
    epochs=1_000,
    learning_rate=1e-2,
):
    """Adjust weights so weighted column sums approach the targets.

    The weights are optimised in log space with Adam, so the returned
    weights (``exp`` of the optimised values) are never negative. The
    objective is the mean over targets of the squared relative error
    ``((weights @ loss_matrix - targets) / targets) ** 2``.

    Args:
        original_weights: Starting weights. Their natural log is taken, so
            they need to be strictly positive for the optimisation to start
            from finite values.
        loss_matrix: 2-D array-like; ``weights @ loss_matrix`` must yield one
            estimate per entry of ``targets_array``.
        targets_array: Target value for each column of ``loss_matrix``. A
            zero target makes the relative error undefined.
        epochs: Number of Adam steps to run (default 1,000).
        learning_rate: Adam learning rate (default 1e-2).

    Returns:
        numpy.ndarray: The optimised weights as float32.
    """
    from tqdm import trange

    loss_matrix = torch.tensor(loss_matrix, dtype=torch.float32)
    targets_array = torch.tensor(targets_array, dtype=torch.float32)

    # TODO: replace this with a call to the python reweight.py package.
    def loss(weights):
        estimate = weights @ loss_matrix
        rel_error = ((estimate - targets_array) / targets_array) ** 2
        return rel_error.mean()

    log_weights = torch.tensor(
        np.log(original_weights), requires_grad=True, dtype=torch.float32
    )
    optimizer = torch.optim.Adam([log_weights], lr=learning_rate)

    progress = trange(epochs)
    for _ in progress:
        optimizer.zero_grad()
        current_loss = loss(torch.exp(log_weights))
        current_loss.backward()
        progress.set_postfix({"loss": current_loss.item()})
        optimizer.step()

    return torch.exp(log_weights).detach().numpy()


class EnhancedCPS(Dataset):
    """Flat-file dataset whose household weights are recalibrated per year.

    Subclasses supply ``input_dataset`` plus the inclusive year range
    ``start_year``..``end_year`` for which weights are produced.
    """

    data_format = Dataset.FLAT_FILE
    input_dataset: Type[Dataset]
    start_year: int
    end_year: int

    def generate(self):
        """Reweight the input dataset for each year and save the result.

        For every year in the range a ``household_weight__<year>`` column is
        added to the loaded input data, holding the reweighted household
        weights mapped to persons.
        """
        df = self.input_dataset(require=True).load()
        from policyengine_us import Microsimulation

        sim = Microsimulation(dataset=self.input_dataset)
        base_weights = sim.calculate("household_weight")
        # Normal(mean=10, sd=1) noise is added to every starting weight.
        # NOTE(review): presumably this keeps weights away from zero before
        # the log transform in reweight - confirm. No seed is set here, so
        # results may differ between runs unless one is set elsewhere.
        noise = np.random.normal(10, 1, len(base_weights))
        starting_weights = base_weights.values + noise

        years = range(self.start_year, self.end_year + 1)
        for year in years:
            print(f"Enhancing CPS for {year}")
            matrix, targets = build_loss_matrix(self.input_dataset, year)
            calibrated_weights = reweight(starting_weights, matrix, targets)
            weight_column = f"household_weight__{year}"
            df[weight_column] = sim.map_result(
                calibrated_weights, "household", "person"
            )

        self.save_dataset(df)


class EnhancedCPS_2024(EnhancedCPS):
    """Enhanced CPS for 2024, derived from ExtendedCPS_2024."""

    # Identity and storage location of the generated dataset.
    name = "enhanced_cps_2024"
    label = "Enhanced CPS 2024"
    file_path = STORAGE_FOLDER / "enhanced_cps_2024.csv"

    # Source dataset and the (single-year) range to produce weights for.
    input_dataset = ExtendedCPS_2024
    start_year = 2024
    end_year = 2024


if __name__ == "__main__":
    # Running this module as a script generates the 2024 enhanced dataset.
    enhanced_cps = EnhancedCPS_2024()
    enhanced_cps.generate()
Loading

0 comments on commit c28821b

Please sign in to comment.