diff --git a/policyengine_uk_data/datasets/frs/enhanced_frs.py b/policyengine_uk_data/datasets/frs/enhanced_frs.py
index cd4c5da..f023d49 100644
--- a/policyengine_uk_data/datasets/frs/enhanced_frs.py
+++ b/policyengine_uk_data/datasets/frs/enhanced_frs.py
@@ -5,6 +5,9 @@
 from policyengine_uk_data.datasets.frs.frs import FRS_2022_23
 from policyengine_uk_data.utils.loss import create_target_matrix
 import torch
+from policyengine_uk_data.utils.imputations.capital_gains import (
+    impute_cg_to_dataset,
+)
 
 
 class EnhancedFRS(Dataset):
@@ -22,6 +25,10 @@ def generate(self):
 
         self.save_dataset(data)
 
+        # Capital gains imputation (loads the dataset saved above).
+
+        impute_cg_to_dataset(self)
+
 
 class ReweightedFRS_2022_23(EnhancedFRS):
     name = "reweighted_frs_2022_23"
diff --git a/policyengine_uk_data/datasets/frs/frs.py b/policyengine_uk_data/datasets/frs/frs.py
index e377d0f..4c9d821 100644
--- a/policyengine_uk_data/datasets/frs/frs.py
+++ b/policyengine_uk_data/datasets/frs/frs.py
@@ -89,6 +89,10 @@ def generate(self):
 
         self.save_dataset(frs)
 
+        impute_brmas(self, frs)
+
+        self.save_dataset(frs)
+
 
 class FRS_2020_21(FRS):
     dwp_frs = DWP_FRS_2020_21
@@ -781,6 +785,57 @@ def add_benunit_variables(frs: h5py.File, benunit: DataFrame):
     frs["benunit_rent"] = np.maximum(benunit.BURENT.fillna(0) * 52, 0)
 
 
+def impute_brmas(dataset, frs):
+    # Randomly assign broad rental market areas (BRMAs) to households,
+    # given their region and LHA category.
+    from policyengine_uk import Microsimulation
+
+    sim = Microsimulation(dataset=dataset)
+    region = (
+        sim.populations["benunit"]
+        .household("region", dataset.time_period)
+        .decode_to_str()
+    )
+    lha_category = sim.calculate("LHA_category")
+
+    brma = np.empty(len(region), dtype=object)
+
+    # Sample a BRMA for each benunit from those in its region and LHA
+    # category, weighted by each BRMA's number of observations in the
+    # list of rents.
+    lha_list_of_rents = pd.read_csv(
+        STORAGE_FOLDER / "lha_list_of_rents.csv.gz"
+    )
+
+    for possible_region in lha_list_of_rents.region.unique():
+        for possible_lha_category in lha_list_of_rents.lha_category.unique():
+            lor_mask = (lha_list_of_rents.region == possible_region) & (
+                lha_list_of_rents.lha_category == possible_lha_category
+            )
+            mask = (region == possible_region) & (
+                lha_category == possible_lha_category
+            )
+            brma[mask] = lha_list_of_rents[lor_mask].brma.sample(
+                n=len(region[mask]), replace=True
+            )
+
+    # Convert benunit-level BRMAs to household-level BRMAs (pick a random
+    # benunit's BRMA per household).
+    df = pd.DataFrame(
+        {
+            "brma": brma,
+            "household_id": sim.populations["benunit"].household(
+                "household_id", dataset.time_period
+            ),
+        }
+    )
+
+    df = df.groupby("household_id").brma.aggregate(
+        lambda x: x.sample(n=1).iloc[0]
+    )
+    brmas = df[sim.calculate("household_id")].values
+
+    frs["brma"] = {dataset.time_period: brmas}
+
+
 if __name__ == "__main__":
     FRS_2020_21().generate()
     FRS_2021_22().generate()
diff --git a/policyengine_uk_data/storage/capital_gains_distribution_advani_summers.csv.gz b/policyengine_uk_data/storage/capital_gains_distribution_advani_summers.csv.gz
new file mode 100644
index 0000000..01e4ee0
Binary files /dev/null and b/policyengine_uk_data/storage/capital_gains_distribution_advani_summers.csv.gz differ
diff --git a/policyengine_uk_data/storage/lha_list_of_rents.csv.gz b/policyengine_uk_data/storage/lha_list_of_rents.csv.gz
new file mode 100644
index 0000000..af02bfd
Binary files /dev/null and b/policyengine_uk_data/storage/lha_list_of_rents.csv.gz differ
diff --git a/policyengine_uk_data/utils/download_docs_prerequisites.py b/policyengine_uk_data/utils/download_docs_prerequisites.py
index 6fae94d..dd015ec 100644
--- a/policyengine_uk_data/utils/download_docs_prerequisites.py
+++ b/policyengine_uk_data/utils/download_docs_prerequisites.py
@@ -1,5 +1,5 @@
 from policyengine_uk_data.utils.github import download
-from policyengine_uk_data.data_storage import STORAGE_FOLDER
+from policyengine_uk_data.storage import STORAGE_FOLDER
 
 PREREQUISITES = [
     {
diff --git a/policyengine_uk_data/utils/imputations/capital_gains.py b/policyengine_uk_data/utils/imputations/capital_gains.py
new file mode 100644
index 0000000..e53536d
--- /dev/null
+++ b/policyengine_uk_data/utils/imputations/capital_gains.py
@@ -0,0 +1,184 @@
+import copy
+
+import pandas as pd
+import numpy as np
+
+# Fit a linear spline to each income band's capital gains percentiles.
+from scipy.interpolate import UnivariateSpline
+from policyengine_uk import Microsimulation
+from policyengine_uk.system import system
+from policyengine_uk_data.storage import STORAGE_FOLDER
+
+import torch
+from torch.optim import Adam
+from tqdm import tqdm
+
+# Advani & Summers distribution: for each band of total income, the share of
+# people with capital gains and percentiles of the gains amount.
+capital_gains = pd.read_csv(
+    STORAGE_FOLDER / "capital_gains_distribution_advani_summers.csv.gz"
+)
+# Bands run between consecutive minimums; the top band is unbounded.
+capital_gains["maximum_total_income"] = (
+    capital_gains.minimum_total_income.shift(-1).fillna(np.inf)
+)
+
+
+def impute_capital_gains(dataset, time_period: int):
+    """Impute capital gains and reweight households. Assumes that the capital
+    gains distribution is the same for all years."""
+
+    sim = Microsimulation(dataset=dataset)
+    ti = sim.calculate("total_income", time_period)
+    household_weight = sim.calculate("household_weight", time_period).values
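+    # At this point the dataset is two stacked copies of the FRS: the first
+    # half is the original data and the second half an identical, zero-weight
+    # copy which will receive capital gains. We learn a per-household blend
+    # factor in (0, 1) that splits each household's weight between its two
+    # copies, so that the share of people with gains in each total income
+    # band matches the Advani-Summers distribution.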
+    first_half = (
+        np.concatenate(
+            [
+                np.ones(len(household_weight) // 2),
+                np.zeros(len(household_weight) // 2),
+            ]
+        )
+        > 0
+    )
+    # Give capital gains to one adult (aged 15+) in each second-half
+    # household: the first adult, with adult_index == 1.
+    adult_index = sim.calculate("adult_index", time_period)
+    in_person_second_half = np.zeros(len(ti)) > 0
+    in_person_second_half[len(ti) // 2 :] = True
+    has_cg = np.zeros(len(ti)) > 0
+    has_cg[(adult_index == 1) & in_person_second_half] = True
+    blend_factor = torch.tensor(
+        np.zeros(first_half.sum()), requires_grad=True, dtype=torch.float32
+    )
+    household_weight = torch.tensor(household_weight, dtype=torch.float32)
+    sigmoid = torch.nn.Sigmoid()
+
+    def loss(blend_factor):
+        loss = 0
+
+        blended_household_weight = household_weight.clone()
+        adjusted_blend_factor = sigmoid(blend_factor)
+        # Split each original household's weight between its two copies.
+        blended_household_weight[first_half] = (
+            adjusted_blend_factor * household_weight[first_half]
+        )
+        blended_household_weight[~first_half] = (
+            1 - adjusted_blend_factor
+        ) * household_weight[first_half]
+        for i in range(len(capital_gains)):
+            lower = capital_gains.minimum_total_income.iloc[i]
+            upper = capital_gains.maximum_total_income.iloc[i]
+            true_pct_with_gains = capital_gains.percent_with_gains.iloc[i]
+
+            ti_in_range = (ti >= lower) * (ti < upper)
+            cg_in_income_range = has_cg * ti_in_range
+            household_ti_in_range_count = torch.tensor(
+                sim.map_result(ti_in_range, "person", "household", how="sum")
+            )
+            household_cg_in_income_range_count = torch.tensor(
+                sim.map_result(
+                    cg_in_income_range, "person", "household", how="sum"
+                )
+            )
+            pred_ti_in_range = (
+                blended_household_weight * household_ti_in_range_count
+            ).sum()
+            pred_cg_in_income_range = (
+                blended_household_weight * household_cg_in_income_range_count
+            ).sum()
+            pred_pct_with_gains = pred_cg_in_income_range / pred_ti_in_range
+            loss += (pred_pct_with_gains - true_pct_with_gains) ** 2
+
+        return loss
+
+    optimiser = Adam([blend_factor], lr=1e-1)
+    progress = tqdm(range(1000))
+    for i in progress:
+        optimiser.zero_grad()
+        loss_value = loss(blend_factor)
+        loss_value.backward()
+        optimiser.step()
+        progress.set_description(f"Loss: {loss_value.item()}")
+        if loss_value.item() < 1e-5:
+            break
+
+    # Apply the optimised blend factors to the original weights.
+    new_household_weight = household_weight.detach().numpy()
+    original_household_weight = new_household_weight.copy()
+    blend_factor = sigmoid(blend_factor).detach().numpy()
+    new_household_weight[first_half] = (
+        blend_factor * original_household_weight[first_half]
+    )
+    new_household_weight[~first_half] = (
+        1 - blend_factor
+    ) * original_household_weight[first_half]
+
+    # Impute actual capital gains amounts, given that a person has gains.
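+    # Each band's published percentiles (p05 to p95) trace out an approximate
+    # inverse CDF; fitting a linear spline to them lets us draw a uniform
+    # quantile per recipient and read off a gains amount (quantiles outside
+    # [0.05, 0.95] extrapolate linearly).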
+    new_cg = np.zeros(len(ti))
+
+    for i in range(len(capital_gains)):
+        row = capital_gains.iloc[i]
+        spline = UnivariateSpline(
+            [0.05, 0.1, 0.25, 0.5, 0.75, 0.90, 0.95],
+            [row.p05, row.p10, row.p25, row.p50, row.p75, row.p90, row.p95],
+            k=1,
+        )
+        lower = row.minimum_total_income
+        upper = row.maximum_total_income
+        ti_in_range = (ti >= lower) * (ti < upper)
+        in_target_range = has_cg * ti_in_range
+        quantiles = np.random.random(int(in_target_range.sum()))
+        pred_capital_gains = spline(quantiles)
+        new_cg[in_target_range] = pred_capital_gains
+
+    # Uprate gains to the target year using the calibrated aggregate capital
+    # gains parameter (the distribution is expressed in 2017 levels).
+    aggregate_cg = system.parameters.calibration.programs.capital_gains.total
+    uprating = aggregate_cg(time_period) / aggregate_cg("2017-01-01")
+    new_cg *= uprating
+
+    return new_cg, new_household_weight
+
+
+def stack_datasets(data_1, data_2):
+    """Concatenate two datasets in variable-time-period format."""
+    assert isinstance(
+        data_1[list(data_1.keys())[0]], dict
+    ), "Data must be in variable-time-period format."
+    joined_data = {}
+
+    for variable in data_1:
+        joined_data[variable] = {}
+        for time_period in data_1[variable]:
+            if "_id" in variable:
+                # Offset ID variables in the second copy so its entities stay
+                # linked to each other and distinct from the first copy's.
+                joined_data[variable][time_period] = np.concatenate(
+                    [
+                        data_1[variable][time_period],
+                        data_2[variable][time_period]
+                        + data_1[variable][time_period].max(),
+                    ]
+                )
+            else:
+                joined_data[variable][time_period] = np.concatenate(
+                    [
+                        data_1[variable][time_period],
+                        data_2[variable][time_period],
+                    ]
+                )
+
+    return joined_data
+
+
+def impute_cg_to_dataset(dataset):
+    data = dataset.load_dataset()
+    # Stack a zero-weight copy of the dataset onto itself; the optimiser in
+    # impute_capital_gains then shifts weight onto the copy, which receives
+    # capital gains.
+    zero_weight_copy = copy.deepcopy(data)
+
+    for time_period in zero_weight_copy["household_weight"]:
+        zero_weight_copy["household_weight"][time_period] = np.zeros_like(
+            zero_weight_copy["household_weight"][time_period]
+        )
+
+    data = stack_datasets(data, zero_weight_copy)
+
+    dataset.save_dataset(data)
+
+    pred_cg, household_weights_22 = impute_capital_gains(dataset, 2022)
+
+    data["capital_gains"] = {2022: pred_cg}
+    data["household_weight"][2022] = household_weights_22
+
+    # Reweight (but do not re-draw gains) for each projection year.
+    for year in range(2023, 2028):
+        _, data["household_weight"][year] = impute_capital_gains(dataset, year)
+
+    dataset.save_dataset(data)