Commit

Merge pull request #10 from PolicyEngine/imputations
Add BRMAs and CG imputation
nikhilwoodruff authored Sep 16, 2024
2 parents 9a52a55 + 695faaa commit 6a1e5c1
Showing 6 changed files with 247 additions and 1 deletion.
7 changes: 7 additions & 0 deletions policyengine_uk_data/datasets/frs/enhanced_frs.py
@@ -5,6 +5,9 @@
from policyengine_uk_data.datasets.frs.frs import FRS_2022_23
from policyengine_uk_data.utils.loss import create_target_matrix
import torch
from policyengine_uk_data.utils.imputations.capital_gains import (
    impute_cg_to_dataset,
)


class EnhancedFRS(Dataset):
@@ -22,6 +25,10 @@ def generate(self):

        self.save_dataset(data)

        # Capital gains imputation
        impute_cg_to_dataset(self)


class ReweightedFRS_2022_23(EnhancedFRS):
    name = "reweighted_frs_2022_23"
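For orientation, a minimal usage sketch of the new hook. The `EnhancedFRS_2022_23` subclass name below is assumed from the repository's naming pattern; only `EnhancedFRS` and `ReweightedFRS_2022_23` appear in this diff.

# Hypothetical usage sketch; the subclass name is an assumption.
from policyengine_uk_data.datasets.frs.enhanced_frs import EnhancedFRS_2022_23

dataset = EnhancedFRS_2022_23()
# generate() reweights and saves the data, then calls
# impute_cg_to_dataset(self) to add capital gains and blended weights.
dataset.generate()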
55 changes: 55 additions & 0 deletions policyengine_uk_data/datasets/frs/frs.py
@@ -89,6 +89,10 @@ def generate(self):

        self.save_dataset(frs)

        # impute_brmas builds a Microsimulation from the dataset saved just
        # above, so: save, impute, then save again with BRMAs included.
        impute_brmas(self, frs)

        self.save_dataset(frs)


class FRS_2020_21(FRS):
    dwp_frs = DWP_FRS_2020_21
@@ -781,6 +785,57 @@ def add_benunit_variables(frs: h5py.File, benunit: DataFrame):
    frs["benunit_rent"] = np.maximum(benunit.BURENT.fillna(0) * 52, 0)


def impute_brmas(dataset, frs):
    """Randomly assign a broad rental market area (BRMA) to each benefit
    unit, conditional on its region and LHA category."""
    from policyengine_uk import Microsimulation

    sim = Microsimulation(dataset=dataset)
    region = (
        sim.populations["benunit"]
        .household("region", dataset.time_period)
        .decode_to_str()
    )
    lha_category = sim.calculate("LHA_category")

    brma = np.empty(len(region), dtype=object)

    # Sample a BRMA for each benefit unit from those in its region and LHA
    # category, weighted by the number of observations of each BRMA in the
    # list-of-rents table.
    lha_list_of_rents = pd.read_csv(
        STORAGE_FOLDER / "lha_list_of_rents.csv.gz"
    )

    for possible_region in lha_list_of_rents.region.unique():
        for possible_lha_category in lha_list_of_rents.lha_category.unique():
            lor_mask = (lha_list_of_rents.region == possible_region) & (
                lha_list_of_rents.lha_category == possible_lha_category
            )
            mask = (region == possible_region) & (
                lha_category == possible_lha_category
            )
            brma[mask] = (
                lha_list_of_rents[lor_mask]
                .brma.sample(n=len(region[mask]), replace=True)
                .values
            )

    # Convert benunit-level BRMAs to household-level BRMAs by picking a
    # random benefit unit's BRMA within each household.
    df = pd.DataFrame(
        {
            "brma": brma,
            "household_id": sim.populations["benunit"].household(
                "household_id", dataset.time_period
            ),
        }
    )

    df = df.groupby("household_id").brma.aggregate(
        lambda x: x.sample(n=1).iloc[0]
    )
    brmas = df[sim.calculate("household_id")].values

    frs["brma"] = {dataset.time_period: brmas}


if __name__ == "__main__":
    FRS_2020_21().generate()
    FRS_2021_22().generate()
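As an aside, a self-contained sketch of the sampling rule impute_brmas applies within each (region, LHA category) cell: rows are drawn uniformly from the list-of-rents table, so each BRMA's probability is proportional to its number of observations. The toy table is invented.

import pandas as pd

# Invented list-of-rents extract: "Inner North London" appears twice, so
# it should be drawn roughly twice as often as "Outer London".
lor = pd.DataFrame(
    {
        "region": ["LONDON"] * 3,
        "lha_category": ["B"] * 3,
        "brma": ["Inner North London", "Inner North London", "Outer London"],
    }
)
mask = (lor.region == "LONDON") & (lor.lha_category == "B")
draws = lor[mask].brma.sample(n=10_000, replace=True, random_state=0)
print(draws.value_counts(normalize=True))  # roughly 2/3 and 1/3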
Binary file not shown.
Binary file not shown.
2 changes: 1 addition & 1 deletion policyengine_uk_data/utils/download_docs_prerequisites.py
@@ -1,5 +1,5 @@
 from policyengine_uk_data.utils.github import download
-from policyengine_uk_data.data_storage import STORAGE_FOLDER
+from policyengine_uk_data.storage import STORAGE_FOLDER

 PREREQUISITES = [
     {
184 changes: 184 additions & 0 deletions policyengine_uk_data/utils/imputations/capital_gains.py
@@ -0,0 +1,184 @@
import copy

import numpy as np
import pandas as pd
import torch
from scipy.interpolate import UnivariateSpline
from torch.optim import Adam
from tqdm import tqdm

from policyengine_uk import Microsimulation
from policyengine_uk.system import system
from policyengine_uk_data.storage import STORAGE_FOLDER

capital_gains = pd.read_csv(
    STORAGE_FOLDER / "capital_gains_distribution_advani_summers.csv.gz"
)
# Each income band's upper bound is the next band's lower bound; the top
# band is unbounded.
capital_gains["maximum_total_income"] = (
    capital_gains.minimum_total_income.shift(-1).fillna(np.inf)
)


def impute_capital_gains(dataset, time_period: int):
    """Impute capital gains for the given year, assuming the distribution
    of gains conditional on total income is the same in all years."""

    sim = Microsimulation(dataset=dataset)
    ti = sim.calculate("total_income", time_period)
    household_weight = sim.calculate("household_weight", time_period).values
    # The dataset has been stacked with a copy of itself: the first half of
    # households are the originals, the second half receive imputed gains.
    first_half = np.concatenate(
        [
            np.ones(len(household_weight) // 2, dtype=bool),
            np.zeros(len(household_weight) // 2, dtype=bool),
        ]
    )
    # Give capital gains to one adult in each second-half household.
    adult_index = sim.calculate("adult_index", time_period)
    in_person_second_half = np.zeros(len(ti), dtype=bool)
    in_person_second_half[len(ti) // 2 :] = True
    has_cg = np.zeros(len(ti), dtype=bool)
    has_cg[(adult_index == 1).values & in_person_second_half] = True
    # One free parameter per original household; its sigmoid is the share
    # of the household's weight kept by the no-gains copy.
    blend_factor = torch.tensor(
        np.zeros(first_half.sum()), requires_grad=True, dtype=torch.float32
    )
    household_weight = torch.tensor(household_weight, dtype=torch.float32)
    sigmoid = torch.nn.Sigmoid()

    def loss(blend_factor):
        loss = 0

        # Split each original household's weight between its two copies:
        # sigmoid(b) to the no-gains copy, 1 - sigmoid(b) to the with-gains
        # copy, so each household's total weight is preserved.
        adjusted_blend_factor = sigmoid(blend_factor)
        base_weight = household_weight[first_half]
        blended_household_weight = household_weight.clone()
        blended_household_weight[first_half] = (
            adjusted_blend_factor * base_weight
        )
        blended_household_weight[~first_half] = (
            1 - adjusted_blend_factor
        ) * base_weight
        # Penalise deviation from the target share of people with gains in
        # each total-income band.
        for i in range(len(capital_gains)):
            lower = capital_gains.minimum_total_income.iloc[i]
            upper = capital_gains.maximum_total_income.iloc[i]
            true_pct_with_gains = capital_gains.percent_with_gains.iloc[i]

            ti_in_range = (ti >= lower) * (ti < upper)
            cg_in_income_range = has_cg * ti_in_range
            household_ti_in_range_count = torch.tensor(
                sim.map_result(ti_in_range, "person", "household", how="sum")
            )
            household_cg_in_income_range_count = torch.tensor(
                sim.map_result(
                    cg_in_income_range, "person", "household", how="sum"
                )
            )
            pred_ti_in_range = (
                blended_household_weight * household_ti_in_range_count
            ).sum()
            pred_cg_in_income_range = (
                blended_household_weight * household_cg_in_income_range_count
            ).sum()
            pred_pct_with_gains = pred_cg_in_income_range / pred_ti_in_range
            loss += (pred_pct_with_gains - true_pct_with_gains) ** 2

        return loss

    optimiser = Adam([blend_factor], lr=1e-1)
    progress = tqdm(range(1000))
    for i in progress:
        optimiser.zero_grad()
        loss_value = loss(blend_factor)
        loss_value.backward()
        optimiser.step()
        progress.set_description(f"Loss: {loss_value.item()}")
        if loss_value.item() < 1e-5:
            break

    # Apply the learned split to produce the final household weights.
    new_household_weight = household_weight.detach().numpy().copy()
    original_household_weight = new_household_weight.copy()
    blend_factor = sigmoid(blend_factor).detach().numpy()
    new_household_weight[first_half] = (
        blend_factor * original_household_weight[first_half]
    )
    new_household_weight[~first_half] = (
        1 - blend_factor
    ) * original_household_weight[first_half]

    # Impute gains amounts for those with gains: fit a piecewise-linear
    # (k=1) quantile function through each income band's reported
    # percentiles and evaluate it at uniform random quantiles.
    new_cg = np.zeros(len(ti))

    for i in range(len(capital_gains)):
        row = capital_gains.iloc[i]
        spline = UnivariateSpline(
            [0.05, 0.1, 0.25, 0.5, 0.75, 0.90, 0.95],
            [row.p05, row.p10, row.p25, row.p50, row.p75, row.p90, row.p95],
            k=1,
        )
        lower = row.minimum_total_income
        upper = row.maximum_total_income
        ti_in_range = (ti >= lower) * (ti < upper)
        in_target_range = has_cg * ti_in_range
        quantiles = np.random.random(int(in_target_range.sum()))
        pred_capital_gains = spline(quantiles)
        new_cg[in_target_range] = pred_capital_gains

    # Uprate amounts from the distribution's 2017 base year to the target
    # year using the calibrated aggregate capital gains series.
    aggregate_cg = system.parameters.calibration.programs.capital_gains.total
    uprating = aggregate_cg(time_period) / aggregate_cg("2017-01-01")
    new_cg *= uprating

    return new_cg, new_household_weight


def stack_datasets(data_1, data_2):
    """Concatenate two datasets in variable-time-period format, offsetting
    ID variables in the second so identifiers remain unique."""
    assert isinstance(
        data_1[list(data_1.keys())[0]], dict
    ), "Data must be in variable-time-period format."
    joined_data = {}

    for variable in data_1:
        joined_data[variable] = {}
        for time_period in data_1[variable]:
            if "_id" in variable:
                joined_data[variable][time_period] = np.concatenate(
                    [
                        data_1[variable][time_period],
                        data_2[variable][time_period]
                        + data_1[variable][time_period].max(),
                    ]
                )
            else:
                joined_data[variable][time_period] = np.concatenate(
                    [
                        data_1[variable][time_period],
                        data_2[variable][time_period],
                    ]
                )

    return joined_data


def impute_cg_to_dataset(dataset):
    data = dataset.load_dataset()

    # Stack the dataset with a zero-weight copy of itself; the optimiser in
    # impute_capital_gains then shifts weight from each original (no-gains)
    # household to its copy, which receives imputed gains.
    zero_weight_copy = copy.deepcopy(data)
    for time_period in zero_weight_copy["household_weight"]:
        zero_weight_copy["household_weight"][time_period] = np.zeros_like(
            zero_weight_copy["household_weight"][time_period]
        )

    data = stack_datasets(data, zero_weight_copy)

    dataset.save_dataset(data)

    pred_cg, household_weights_22 = impute_capital_gains(dataset, 2022)

    data["capital_gains"] = {2022: pred_cg}
    data["household_weight"][2022] = household_weights_22

    # For later years, keep only the recalibrated weights; the imputed
    # gains amounts for those years are discarded.
    for year in range(2023, 2028):
        _, data["household_weight"][year] = impute_capital_gains(dataset, year)

    dataset.save_dataset(data)
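A quick sketch (toy numbers) of the weight-splitting identity the optimiser in impute_capital_gains relies on: wherever blend_factor moves, each household's total weight across its two copies is unchanged, so calibration shifts composition rather than population size.

import torch

w = torch.tensor([100.0, 250.0])  # original household weights
b = torch.tensor([0.3, -1.2])  # arbitrary blend parameters
s = torch.sigmoid(b)
w_no_gains = s * w  # weight kept by the no-gains copy
w_with_gains = (1 - s) * w  # weight moved to the with-gains copy
assert torch.allclose(w_no_gains + w_with_gains, w)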

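Similarly, a sketch of the amounts step: a piecewise-linear quantile function through one band's reported percentiles, evaluated at uniform random draws. The percentile values here are invented, not taken from the Advani & Summers table.

import numpy as np
from scipy.interpolate import UnivariateSpline

quantile_points = [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]
gains_at_percentiles = [200, 500, 2_000, 8_000, 25_000, 90_000, 180_000]
quantile_fn = UnivariateSpline(quantile_points, gains_at_percentiles, k=1)

# Uniform quantiles mapped through the quantile function simulate gains
# amounts consistent with the stated percentiles.
draws = quantile_fn(np.random.default_rng(0).random(5))
print(draws)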
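Finally, a toy illustration of stack_datasets' ID handling: ID variables in the second copy are offset by the first copy's maximum so identifiers stay unique after stacking, while all other variables are concatenated as-is.

import numpy as np

household_id_1 = np.array([1, 2, 3])
household_id_2 = np.array([1, 2, 3])
# Offsetting by the first array's maximum keeps IDs unique.
stacked = np.concatenate(
    [household_id_1, household_id_2 + household_id_1.max()]
)
print(stacked)  # [1 2 3 4 5 6]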