Skip to content

Commit

Permalink
Calibrate EITC by number of qualifying dependents and apply dropout (#95)
Browse files Browse the repository at this point in the history

* Add dropout

* Add dropout and EITC calibration

* Format

* Add install catch

* Change download folder

* Flip order of US install and bump US

* Add EITC targets

* Update data links
  • Loading branch information
nikhilwoodruff authored Oct 7, 2024
1 parent 371f77a commit 48e0011
Show file tree
Hide file tree
Showing 10 changed files with 123 additions and 71 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@
!uprating_factors.csv
!uprating_growth_factors.csv
!healthcare_spending.csv
!eitc.csv
!spm_threshold_agi.csv
**/_build
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ test:
pytest

install:
pip install -e ".[dev]"
pip install policyengine-us==1.100.0
pip install policyengine-us==1.109.0
pip install -e ".[dev]" --config-settings editable_mode=compat

changelog:
build-changelog changelog.yaml --output changelog.yaml --update-last-date --start-from 1.0.0 --append-file changelog_entry.yaml
Expand Down
5 changes: 5 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
- bump: minor
changes:
added:
- EITC calibration by child counts.
- 10% dropout during weight calibration.
110 changes: 55 additions & 55 deletions docs/validation.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion policyengine_us_data/datasets/acs/acs.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ class ACS_2022(ACS):
time_period = 2022
file_path = STORAGE_FOLDER / "acs_2022.h5"
census_acs = CensusACS_2022
url = "release://PolicyEngine/policyengine-us-data/1.8.0/acs_2022.h5"
url = "release://PolicyEngine/policyengine-us-data/1.9.0/acs_2022.h5"


if __name__ == "__main__":
Expand Down
4 changes: 2 additions & 2 deletions policyengine_us_data/datasets/cps/cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,7 +622,7 @@ class CPS_2024(CPS):
label = "CPS 2024 (2022-based)"
file_path = STORAGE_FOLDER / "cps_2024.h5"
time_period = 2024
url = "release://policyengine/policyengine-us-data/1.8.0/cps_2024.h5"
url = "release://policyengine/policyengine-us-data/1.9.0/cps_2024.h5"


class PooledCPS(Dataset):
Expand Down Expand Up @@ -681,7 +681,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS):
CPS_2023,
]
time_period = 2023
url = "release://PolicyEngine/policyengine-us-data/1.8.0/pooled_3_year_cps_2023.h5"
url = "release://PolicyEngine/policyengine-us-data/1.9.0/pooled_3_year_cps_2023.h5"


if __name__ == "__main__":
Expand Down
18 changes: 15 additions & 3 deletions policyengine_us_data/datasets/cps/enhanced_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def reweight(
original_weights,
loss_matrix,
targets_array,
dropout_rate=0.1,
):
target_names = np.array(loss_matrix.columns)
loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32)
Expand All @@ -50,15 +51,26 @@ def loss(weights):
raise ValueError("Relative error contains NaNs")
return rel_error.mean()

def dropout_weights(weights, p):
    """Randomly replace a fraction of ``weights`` with the mean of the rest.

    Each entry is selected independently with probability ``p``; selected
    entries are overwritten with the mean of the entries that were *not*
    selected. The input tensor is never modified in place.

    Args:
        weights: Floating-point torch tensor of weights (on whatever scale
            the caller stores them; the averaging happens on that scale).
        p: Dropout probability as a fraction (``0.1`` means roughly 10% of
            the entries are replaced), not a percentage.

    Returns:
        ``weights`` itself when ``p == 0`` or when every entry was selected
        (there is then no remainder to average); otherwise a new tensor of
        the same shape with the selected entries replaced.
    """
    if p == 0:
        return weights
    # True marks the entries to be replaced on this call.
    mask = torch.rand_like(weights) < p
    kept = weights[~mask]
    if kept.numel() == 0:
        # Every entry was selected (always the case for p >= 1, and possible
        # by chance on small tensors). The mean of an empty tensor is NaN,
        # which would overwrite every weight with NaN, so skip dropout.
        return weights
    masked_weights = weights.clone()
    masked_weights[mask] = kept.mean()
    return masked_weights

optimizer = torch.optim.Adam([weights], lr=1e-1)
from tqdm import trange

start_loss = None

iterator = trange(1_000)
iterator = trange(5_000)
for i in iterator:
optimizer.zero_grad()
l = loss(torch.exp(weights))
weights_ = dropout_weights(weights, dropout_rate)
l = loss(torch.exp(weights_))
if start_loss is None:
start_loss = l.item()
loss_rel_change = (l.item() - start_loss) / start_loss
Expand Down Expand Up @@ -177,7 +189,7 @@ class EnhancedCPS_2024(EnhancedCPS):
name = "enhanced_cps_2024"
label = "Enhanced CPS 2024"
file_path = STORAGE_FOLDER / "enhanced_cps_2024.h5"
url = "release://policyengine/policyengine-us-data/1.8.0/enhanced_cps_2024.h5"
url = "release://policyengine/policyengine-us-data/1.9.0/enhanced_cps_2024.h5"


if __name__ == "__main__":
Expand Down
7 changes: 3 additions & 4 deletions policyengine_us_data/storage/download_public_prerequisites.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
from policyengine_us_data.utils.github import download
from pathlib import Path

FOLDER = Path(__file__).parent
from policyengine_us_data.storage import STORAGE_FOLDER

download(
"PolicyEngine",
"policyengine-us-data",
"release",
"soi.csv",
FOLDER / "soi.csv",
STORAGE_FOLDER / "soi.csv",
)
download(
"PolicyEngine",
"policyengine-us-data",
"release",
"np2023_d5_mid.csv",
FOLDER / "np2023_d5_mid.csv",
STORAGE_FOLDER / "np2023_d5_mid.csv",
)
4 changes: 4 additions & 0 deletions policyengine_us_data/storage/eitc.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
count_children,eitc_returns,eitc_total,year
0,15108515,12427886000,2021
1,8500483,19879365000,2021
2,5542949,20472827000,2021
39 changes: 35 additions & 4 deletions policyengine_us_data/utils/loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,11 +188,42 @@ def build_loss_matrix(dataset: type, time_period):
loss_matrix["treasury/eitc"] = sim.calculate(
"eitc", map_to="household"
).values
targets_array.append(
sim.tax_benefit_system.parameters(
time_period
).calibration.gov.treasury.tax_expenditures.eitc
eitc_spending = (
sim.tax_benefit_system.parameters.calibration.gov.treasury.tax_expenditures.eitc
)
targets_array.append(eitc_spending(time_period))

# IRS EITC filers and totals by child counts
eitc_stats = pd.read_csv(STORAGE_FOLDER / "eitc.csv")

eitc_spending_uprating = eitc_spending(time_period) / eitc_spending(2021)
population = (
sim.tax_benefit_system.parameters.calibration.gov.census.populations.total
)
population_uprating = population(time_period) / population(2021)

for _, row in eitc_stats.iterrows():
returns_label = (
f"irs/eitc/returns/count_children_{row['count_children']}"
)
eitc_eligible_children = sim.calculate("eitc_child_count").values
eitc = sim.calculate("eitc").values
loss_matrix[returns_label] = sim.map_result(
(eitc > 0) * (eitc_eligible_children == row["count_children"]),
"tax_unit",
"household",
)
targets_array.append(row["eitc_returns"] * population_uprating)

spending_label = (
f"irs/eitc/spending/count_children_{row['count_children']}"
)
loss_matrix[spending_label] = sim.map_result(
eitc * (eitc_eligible_children == row["count_children"]),
"tax_unit",
"household",
)
targets_array.append(row["eitc_total"] * eitc_spending_uprating)

# CPS-derived statistics
# Medical expenses, sum of spm thresholds
Expand Down

0 comments on commit 48e0011

Please sign in to comment.