-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #10 from PolicyEngine/imputations
Add BRMAs and CG imputation
- Loading branch information
Showing
6 changed files
with
247 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file added
BIN
+1.64 KB
policyengine_uk_data/storage/capital_gains_distribution_advani_summers.csv.gz
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
184 changes: 184 additions & 0 deletions
184
policyengine_uk_data/utils/imputations/capital_gains.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
import pandas as pd | ||
import numpy as np | ||
|
||
# Fit a spline to each income band's percentiles | ||
from scipy.interpolate import UnivariateSpline | ||
from policyengine_uk import Microsimulation | ||
from tqdm import tqdm | ||
from policyengine_uk_data.storage import STORAGE_FOLDER | ||
from policyengine_uk.system import system | ||
import copy | ||
|
||
import torch | ||
from torch.optim import Adam | ||
from tqdm import tqdm | ||
|
||
capital_gains = pd.read_csv( | ||
STORAGE_FOLDER / "capital_gains_distribution_advani_summers.csv.gz" | ||
) | ||
capital_gains["maximum_total_income"] = ( | ||
capital_gains.minimum_total_income.shift(-1).fillna(np.inf) | ||
) | ||
|
||
|
||
def impute_capital_gains(dataset, time_period: int): | ||
"""Assumes that the capital gains distribution is the same for all years.""" | ||
|
||
sim = Microsimulation(dataset=dataset) | ||
ti = sim.calculate("total_income", time_period) | ||
household_weight = sim.calculate("household_weight", time_period).values | ||
first_half = ( | ||
np.concatenate( | ||
[ | ||
np.ones(len(household_weight) // 2), | ||
np.zeros(len(household_weight) // 2), | ||
] | ||
) | ||
> 0 | ||
) | ||
# Give capital gains to one adult aged 15+ in each household | ||
adult_index = sim.calculate("adult_index", time_period) | ||
in_person_second_half = np.zeros(len(ti)) > 0 | ||
in_person_second_half[len(ti) // 2 :] = True | ||
has_cg = np.zeros(len(ti)) > 0 | ||
has_cg[adult_index & in_person_second_half] = True | ||
blend_factor = torch.tensor( | ||
np.zeros(first_half.sum()), requires_grad=True, dtype=torch.float32 | ||
) | ||
household_weight = torch.tensor(household_weight, dtype=torch.float32) | ||
sigmoid = torch.nn.Sigmoid() | ||
|
||
def loss(blend_factor): | ||
loss = 0 | ||
|
||
blended_household_weight = household_weight.clone() | ||
adjusted_blend_factor = sigmoid(blend_factor) | ||
blended_household_weight[first_half] = ( | ||
adjusted_blend_factor * blended_household_weight[first_half] | ||
) | ||
blended_household_weight[~first_half] = ( | ||
1 - adjusted_blend_factor | ||
) * blended_household_weight[first_half] | ||
for i in range(len(capital_gains)): | ||
lower = capital_gains.minimum_total_income.iloc[i] | ||
upper = capital_gains.maximum_total_income.iloc[i] | ||
true_pct_with_gains = capital_gains.percent_with_gains.iloc[i] | ||
|
||
ti_in_range = (ti >= lower) * (ti < upper) | ||
cg_in_income_range = has_cg * ti_in_range | ||
household_ti_in_range_count = torch.tensor( | ||
sim.map_result(ti_in_range, "person", "household", how="sum") | ||
) | ||
household_cg_in_income_range_count = torch.tensor( | ||
sim.map_result( | ||
cg_in_income_range, "person", "household", how="sum" | ||
) | ||
) | ||
pred_ti_in_range = ( | ||
blended_household_weight * household_ti_in_range_count | ||
).sum() | ||
pred_cg_in_income_range = ( | ||
blended_household_weight * household_cg_in_income_range_count | ||
).sum() | ||
pred_pct_with_gains = pred_cg_in_income_range / pred_ti_in_range | ||
loss += (pred_pct_with_gains - true_pct_with_gains) ** 2 | ||
|
||
return loss | ||
|
||
optimiser = Adam([blend_factor], lr=1e-1) | ||
progress = tqdm(range(1000)) | ||
for i in progress: | ||
optimiser.zero_grad() | ||
loss_value = loss(blend_factor) | ||
loss_value.backward() | ||
optimiser.step() | ||
progress.set_description(f"Loss: {loss_value.item()}") | ||
if loss_value.item() < 1e-5: | ||
break | ||
|
||
new_household_weight = household_weight.detach().numpy() | ||
original_household_weight = new_household_weight.copy() | ||
blend_factor = sigmoid(blend_factor).detach().numpy() | ||
new_household_weight[first_half] = ( | ||
blend_factor * original_household_weight[first_half] | ||
) | ||
new_household_weight[~first_half] = ( | ||
1 - blend_factor | ||
) * original_household_weight[first_half] | ||
|
||
# Impute actual capital gains amounts given gains | ||
new_cg = np.zeros(len(ti)) | ||
|
||
for i in range(len(capital_gains)): | ||
row = capital_gains.iloc[i] | ||
spline = UnivariateSpline( | ||
[0.05, 0.1, 0.25, 0.5, 0.75, 0.90, 0.95], | ||
[row.p05, row.p10, row.p25, row.p50, row.p75, row.p90, row.p95], | ||
k=1, | ||
) | ||
lower = row.minimum_total_income | ||
upper = row.maximum_total_income | ||
ti_in_range = (ti >= lower) * (ti < upper) | ||
in_target_range = has_cg * ti_in_range | ||
quantiles = np.random.random(int(in_target_range.sum())) | ||
pred_capital_gains = spline(quantiles) | ||
new_cg[in_target_range] = pred_capital_gains | ||
|
||
aggregate_cg = system.parameters.calibration.programs.capital_gains.total | ||
uprating = aggregate_cg(time_period) / aggregate_cg("2017-01-01") | ||
new_cg *= uprating | ||
|
||
return new_cg, new_household_weight | ||
|
||
|
||
def stack_datasets(data_1, data_2): | ||
assert isinstance( | ||
data_1[list(data_1.keys())[0]], dict | ||
), "Data must be in variable-time-period format." | ||
joined_data = {} | ||
|
||
for variable in data_1: | ||
joined_data[variable] = {} | ||
for time_period in data_1[variable]: | ||
if "_id" in variable: | ||
joined_data[variable][time_period] = np.concatenate( | ||
[ | ||
data_1[variable][time_period], | ||
data_2[variable][time_period] | ||
+ data_1[variable][time_period].max(), | ||
] | ||
) | ||
else: | ||
joined_data[variable][time_period] = np.concatenate( | ||
[ | ||
data_1[variable][time_period], | ||
data_2[variable][time_period], | ||
] | ||
) | ||
|
||
return joined_data | ||
|
||
|
||
def impute_cg_to_dataset(dataset): | ||
data = dataset.load_dataset() | ||
zero_weight_copy_1 = copy.deepcopy(data) | ||
zero_weight_copy_2 = copy.deepcopy(data) | ||
|
||
for time_period in zero_weight_copy_2["household_weight"]: | ||
zero_weight_copy_2["household_weight"][time_period] = np.zeros_like( | ||
zero_weight_copy_1["household_weight"][time_period] | ||
) | ||
|
||
data = stack_datasets(data, zero_weight_copy_2) | ||
|
||
dataset.save_dataset(data) | ||
|
||
pred_cg, household_weights_22 = impute_capital_gains(dataset, 2022) | ||
|
||
data["capital_gains"] = {2022: pred_cg} | ||
data["household_weight"][2022] = household_weights_22 | ||
|
||
for year in range(2023, 2028): | ||
_, data["household_weight"][year] = impute_capital_gains(dataset, year) | ||
|
||
dataset.save_dataset(data) |