-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
9e759c5
commit c28821b
Showing
14 changed files
with
653 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Landing page for the PolicyEngine US Data Streamlit app.
import streamlit as st

st.title("PolicyEngine US Data")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
# Streamlit page benchmarking each dataset's estimates against
# IRS SOI calibration targets.
import streamlit as st

st.set_page_config(layout="wide")

st.title("Benchmarks")

# Heavy imports are deferred until after the page config/title render,
# so the page shell appears before the simulation stack loads.
from policyengine_us_data.datasets import CPS_2024, PUF_2024, EnhancedCPS_2024
from policyengine_us_data.utils import build_loss_matrix
from policyengine_us import Microsimulation
import pandas as pd
import plotly.express as px
@st.cache_data
def compare_datasets():
    """Compare each dataset's target estimates against actual SOI values.

    Runs a Microsimulation for each candidate dataset, estimates every
    calibration target as ``weights @ loss_matrix``, and tabulates the
    errors.

    Returns:
        A DataFrame with one row per (target, dataset) pair and columns
        Target, Estimate, Actual, Error, Abs. Error, Abs. Error % and
        Dataset.
    """
    comparisons = []
    for dataset in [CPS_2024, PUF_2024, EnhancedCPS_2024]:
        sim = Microsimulation(dataset=dataset)
        weights = sim.calculate("household_weight").values
        loss_matrix, targets_array = build_loss_matrix(dataset, 2024)
        target_names = loss_matrix.columns
        # One estimate per target: weighted sum of household contributions.
        estimates = weights @ loss_matrix.values
        comparison = pd.DataFrame(
            {
                "Target": target_names,
                "Estimate": estimates,
                "Actual": targets_array,
            }
        )
        comparison["Error"] = comparison["Estimate"] - comparison["Actual"]
        comparison["Abs. Error"] = comparison["Error"].abs()
        comparison["Abs. Error %"] = (
            comparison["Abs. Error"] / comparison["Actual"]
        )
        comparison["Dataset"] = dataset.label
        comparisons.append(comparison)

    # Concatenate once at the end: growing a DataFrame with pd.concat
    # inside the loop re-copies all prior rows on every iteration.
    return pd.concat(comparisons)
|
||
|
||
df = compare_datasets()

# Mean absolute relative error per dataset: lower means better calibrated.
mean_relative_error_by_dataset = (
    df.groupby("Dataset")["Abs. Error %"].mean().reset_index()
)

st.write(mean_relative_error_by_dataset)

metric = st.selectbox(
    "Metric", ["Estimate", "Error", "Abs. Error", "Abs. Error %"]
)
target = st.selectbox("Target", df["Target"].unique())

# One row per dataset for the chosen target.
target_df = df[df["Target"] == target]

fig = px.bar(
    target_df,
    x="Dataset",
    y=metric,
    title=f"{metric} for {target}",
)

if metric == "Estimate":
    # Dashed reference line at the actual target value. The line spans
    # all dataset bars; the span was previously hard-coded to x1=2.5
    # (exactly three datasets) and would miss bars if a dataset were
    # added to compare_datasets().
    actual_value = target_df["Actual"].values[0]
    fig.add_shape(
        type="line",
        x0=-0.5,
        x1=target_df["Dataset"].nunique() - 0.5,
        y0=actual_value,
        y1=actual_value,
        line=dict(dash="dash"),
    )

st.plotly_chart(fig, use_container_width=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
from .cps import * | ||
from .extended_cps import * | ||
from .enhanced_cps import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,208 @@ | ||
from policyengine_core.data import Dataset | ||
import pandas as pd | ||
from policyengine_us_data.utils import ( | ||
pe_to_soi, | ||
get_soi, | ||
build_loss_matrix, | ||
fmt, | ||
) | ||
import numpy as np | ||
from typing import Type | ||
from policyengine_us_data.data_storage import STORAGE_FOLDER | ||
from policyengine_us_data.datasets.cps import ExtendedCPS_2024 | ||
import torch | ||
|
||
|
||
def build_loss_matrix(dataset: type, time_period):
    """Build a household-level calibration matrix and its target vector.

    Each column of the matrix is one IRS SOI statistic (a variable,
    within an AGI band and filing status); each row is a household.
    Entry (h, t) is household h's contribution to target t, so that
    ``weights @ matrix`` estimates every target at once.

    NOTE(review): this shadows the ``build_loss_matrix`` imported from
    ``policyengine_us_data.utils`` at the top of the module — confirm
    which one is intended to be public.

    Args:
        dataset: the Dataset class to simulate.
        time_period: the year whose SOI statistics to target.

    Returns:
        A tuple ``(loss_matrix_values, targets_array)``: a
        (households, targets) ndarray and the matching SOI target values.
    """
    loss_matrix = pd.DataFrame()
    df = pe_to_soi(dataset, time_period)
    agi = df["adjusted_gross_income"].values
    filer = df["is_tax_filer"].values
    soi_subset = get_soi(time_period)
    targets_array = []
    # Variables targeted within each AGI band.
    agi_level_targeted_variables = [
        "adjusted_gross_income",
        "count",
        "employment_income",
        "business_net_profits",
        "capital_gains_gross",
        "ordinary_dividends",
        "partnership_and_s_corp_income",
        "qualified_dividends",
        "taxable_interest_income",
        "total_pension_income",
        "total_social_security",
    ]
    # Variables targeted only as population-wide aggregates.
    aggregate_level_targeted_variables = [
        "business_net_losses",
        "capital_gains_distributions",
        "capital_gains_losses",
        "estate_income",
        "estate_losses",
        "exempt_interest",
        "ira_distributions",
        "partnership_and_s_corp_losses",
        "rent_and_royalty_net_income",
        "rent_and_royalty_net_losses",
        "taxable_pension_income",
        "taxable_social_security",
        "unemployment_compensation",
    ]
    # Only keep aggregate variables the simulated frame actually has.
    aggregate_level_targeted_variables = [
        variable
        for variable in aggregate_level_targeted_variables
        if variable in df.columns
    ]
    # Keep banded rows for AGI-level variables, and whole-population
    # (-inf, inf) rows for aggregate-level variables.
    soi_subset = soi_subset[
        soi_subset.Variable.isin(agi_level_targeted_variables)
        & (
            (soi_subset["AGI lower bound"] != -np.inf)
            | (soi_subset["AGI upper bound"] != np.inf)
        )
        | (
            soi_subset.Variable.isin(aggregate_level_targeted_variables)
            & (soi_subset["AGI lower bound"] == -np.inf)
            & (soi_subset["AGI upper bound"] == np.inf)
        )
    ]
    for _, row in soi_subset.iterrows():
        if row["Taxable only"]:
            continue  # exclude "taxable returns" statistics

        # Tax units that are filers inside this AGI band.
        mask = (
            (agi >= row["AGI lower bound"])
            * (agi < row["AGI upper bound"])
            * filer
        ) > 0

        if row["Filing status"] == "Single":
            mask *= df["filing_status"].values == "SINGLE"
        elif row["Filing status"] == "Married Filing Jointly/Surviving Spouse":
            mask *= df["filing_status"].values == "JOINT"
        elif row["Filing status"] == "Head of Household":
            mask *= df["filing_status"].values == "HEAD_OF_HOUSEHOLD"
        elif row["Filing status"] == "Married Filing Separately":
            mask *= df["filing_status"].values == "SEPARATE"

        values = df[row["Variable"]].values

        # Count targets tally returns (value > 0) rather than sum values.
        if row["Count"]:
            values = (values > 0).astype(float)

        agi_range_label = (
            f"{fmt(row['AGI lower bound'])}-{fmt(row['AGI upper bound'])}"
        )
        # Fixed operator precedence: the original expression parsed as
        # `"taxable" if ... else ("all" + " returns")`, which dropped the
        # " returns" suffix on the taxable branch. (Latent: taxable rows
        # are skipped by the `continue` above, so output is unchanged.)
        taxable_label = (
            "taxable" if row["Taxable only"] else "all"
        ) + " returns"
        filing_status_label = row["Filing status"]

        variable_label = row["Variable"].replace("_", " ")

        # Count-style targets (including the "count" variable itself) use
        # the "count" label; everything else is a "total". The original
        # had two branches producing byte-identical labels — merged.
        stat_type = (
            "count"
            if row["Count"] or row["Variable"] == "count"
            else "total"
        )
        label = (
            f"{variable_label}/{stat_type}/AGI in "
            f"{agi_range_label}/{taxable_label}/{filing_status_label}"
        )

        if label not in loss_matrix.columns:
            loss_matrix[label] = mask * values
            targets_array.append(row["Value"])

    # Convert tax-unit level df to household-level df.

    from policyengine_us import Microsimulation

    sim = Microsimulation(dataset=dataset)
    hh_id = sim.calculate("household_id", map_to="person")
    tax_unit_hh_id = sim.map_result(
        hh_id, "person", "tax_unit", how="value_from_first_person"
    )

    # Sum tax-unit contributions within each household.
    loss_matrix = loss_matrix.groupby(tax_unit_hh_id).sum()

    return loss_matrix.values, np.array(targets_array)
|
||
|
||
def reweight(
    original_weights,
    loss_matrix,
    targets_array,
):
    """Optimise household weights to match calibration targets.

    Minimises the mean squared relative error of ``weights @ loss_matrix``
    against ``targets_array`` with Adam, optimising in log-weight space so
    the resulting weights are guaranteed positive.

    Args:
        original_weights: starting household weights; must be strictly
            positive (their log is taken).
        loss_matrix: (households, targets) array of per-household
            contributions to each target.
        targets_array: target values to calibrate towards; must be
            non-zero (used as the denominator of the relative error).

    Returns:
        A NumPy float array of optimised, strictly positive weights with
        the same length as ``original_weights``.
    """
    loss_matrix = torch.tensor(loss_matrix, dtype=torch.float32)
    targets_array = torch.tensor(targets_array, dtype=torch.float32)

    # TODO: replace this with a call to the python reweight.py package.
    def loss(weights):
        # Mean squared relative error across all targets.
        estimate = weights @ loss_matrix
        rel_error = ((estimate - targets_array) / targets_array) ** 2
        return rel_error.mean()

    log_weights = torch.tensor(
        np.log(original_weights), requires_grad=True, dtype=torch.float32
    )
    optimizer = torch.optim.Adam([log_weights], lr=1e-2)

    # tqdm is display-only; fall back to a plain range so the numeric
    # routine still works when tqdm is not installed.
    try:
        from tqdm import trange

        iterator = trange(1_000)
        report = iterator.set_postfix
    except ImportError:
        iterator = range(1_000)

        def report(postfix):
            pass

    for _ in iterator:
        optimizer.zero_grad()
        step_loss = loss(torch.exp(log_weights))
        step_loss.backward()
        report({"loss": step_loss.item()})
        optimizer.step()

    return torch.exp(log_weights).detach().numpy()
|
||
|
||
class EnhancedCPS(Dataset):
    """A CPS dataset whose weights are reweighted to hit SOI targets.

    Subclasses set ``input_dataset``, ``start_year`` and ``end_year``;
    ``generate()`` writes a flat file with one household-weight column
    per year.
    """

    # Removed a stray dead `pass` statement that preceded these
    # attribute declarations.
    data_format = Dataset.FLAT_FILE
    # Dataset class to start from (e.g. an ExtendedCPS year).
    input_dataset: Type[Dataset]
    # First and last calendar years (inclusive) to produce weights for.
    start_year: int
    end_year: int

    def generate(self):
        """Build the enhanced dataset and save it via save_dataset."""
        df = self.input_dataset(require=True).load()
        from policyengine_us import Microsimulation

        sim = Microsimulation(dataset=self.input_dataset)
        original_weights = sim.calculate("household_weight")
        # NOTE(review): normal(10, 1) shifts every weight up by ~10 on
        # average rather than just jittering it — confirm the offset is
        # intentional; the draw is also unseeded, so runs differ.
        original_weights = original_weights.values + np.random.normal(
            10, 1, len(original_weights)
        )
        for year in range(self.start_year, self.end_year + 1):
            print(f"Enhancing CPS for {year}")
            loss_matrix, targets_array = build_loss_matrix(
                self.input_dataset, year
            )
            optimised_weights = reweight(
                original_weights, loss_matrix, targets_array
            )
            # Store person-level weights under a per-year column.
            df[f"household_weight__{year}"] = sim.map_result(
                optimised_weights, "household", "person"
            )

        self.save_dataset(df)
|
||
|
||
class EnhancedCPS_2024(EnhancedCPS):
    """Enhanced CPS calibrated for the single year 2024."""

    # Start from the extended CPS for 2024.
    input_dataset = ExtendedCPS_2024
    # Reweight for 2024 only.
    start_year = 2024
    end_year = 2024
    name = "enhanced_cps_2024"
    label = "Enhanced CPS 2024"
    file_path = STORAGE_FOLDER / "enhanced_cps_2024.csv"
|
||
|
||
# Build and save the 2024 enhanced dataset when run as a script.
if __name__ == "__main__":
    EnhancedCPS_2024().generate()
Oops, something went wrong.