Commit

Merge pull request #10 from PolicyEngine/imputations
Add BRMAs and CG imputation
nikhilwoodruff authored Sep 16, 2024
2 parents 9a52a55 + 695faaa commit 6a1e5c1
Showing 6 changed files with 247 additions and 1 deletion.
7 changes: 7 additions & 0 deletions policyengine_uk_data/datasets/frs/enhanced_frs.py
@@ -5,6 +5,9 @@
from policyengine_uk_data.datasets.frs.frs import FRS_2022_23
from policyengine_uk_data.utils.loss import create_target_matrix
import torch
from policyengine_uk_data.utils.imputations.capital_gains import (
    impute_cg_to_dataset,
)


class EnhancedFRS(Dataset):
@@ -22,6 +25,10 @@ def generate(self):

        self.save_dataset(data)

        # Capital gains imputation
        impute_cg_to_dataset(self)


class ReweightedFRS_2022_23(EnhancedFRS):
    name = "reweighted_frs_2022_23"
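For orientation, a minimal usage sketch of the new hook. The `EnhancedFRS_2022_23` subclass name below is assumed from the repository's naming pattern; only `EnhancedFRS` and `ReweightedFRS_2022_23` appear in this diff.

# Hypothetical usage sketch; the subclass name is an assumption.
from policyengine_uk_data.datasets.frs.enhanced_frs import EnhancedFRS_2022_23

dataset = EnhancedFRS_2022_23()
# generate() reweights and saves the data, then calls
# impute_cg_to_dataset(self) to add capital gains and blended weights.
dataset.generate()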
55 changes: 55 additions & 0 deletions policyengine_uk_data/datasets/frs/frs.py
@@ -89,6 +89,10 @@ def generate(self):

        self.save_dataset(frs)

        # impute_brmas builds a Microsimulation from the dataset saved just
        # above, so: save, impute, then save again with BRMAs included.
        impute_brmas(self, frs)

        self.save_dataset(frs)


class FRS_2020_21(FRS):
    dwp_frs = DWP_FRS_2020_21
@@ -781,6 +785,57 @@ def add_benunit_variables(frs: h5py.File, benunit: DataFrame):
    frs["benunit_rent"] = np.maximum(benunit.BURENT.fillna(0) * 52, 0)


def impute_brmas(dataset, frs):
    """Randomly assign a broad rental market area (BRMA) to each benefit
    unit, conditional on its region and LHA category."""
    from policyengine_uk import Microsimulation

    sim = Microsimulation(dataset=dataset)
    region = (
        sim.populations["benunit"]
        .household("region", dataset.time_period)
        .decode_to_str()
    )
    lha_category = sim.calculate("LHA_category")

    brma = np.empty(len(region), dtype=object)

    # Sample a BRMA for each benefit unit from those in its region and LHA
    # category, weighted by the number of observations of each BRMA in the
    # list-of-rents table.
    lha_list_of_rents = pd.read_csv(
        STORAGE_FOLDER / "lha_list_of_rents.csv.gz"
    )

    for possible_region in lha_list_of_rents.region.unique():
        for possible_lha_category in lha_list_of_rents.lha_category.unique():
            lor_mask = (lha_list_of_rents.region == possible_region) & (
                lha_list_of_rents.lha_category == possible_lha_category
            )
            mask = (region == possible_region) & (
                lha_category == possible_lha_category
            )
            brma[mask] = (
                lha_list_of_rents[lor_mask]
                .brma.sample(n=len(region[mask]), replace=True)
                .values
            )

    # Convert benunit-level BRMAs to household-level BRMAs by picking a
    # random benefit unit's BRMA within each household.
    df = pd.DataFrame(
        {
            "brma": brma,
            "household_id": sim.populations["benunit"].household(
                "household_id", dataset.time_period
            ),
        }
    )

    df = df.groupby("household_id").brma.aggregate(
        lambda x: x.sample(n=1).iloc[0]
    )
    brmas = df[sim.calculate("household_id")].values

    frs["brma"] = {dataset.time_period: brmas}


if __name__ == "__main__":
    FRS_2020_21().generate()
    FRS_2021_22().generate()
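As an aside, a self-contained sketch of the sampling rule impute_brmas applies within each (region, LHA category) cell: rows are drawn uniformly from the list-of-rents table, so each BRMA's probability is proportional to its number of observations. The toy table is invented.

import pandas as pd

# Invented list-of-rents extract: "Inner North London" appears twice, so
# it should be drawn roughly twice as often as "Outer London".
lor = pd.DataFrame(
    {
        "region": ["LONDON"] * 3,
        "lha_category": ["B"] * 3,
        "brma": ["Inner North London", "Inner North London", "Outer London"],
    }
)
mask = (lor.region == "LONDON") & (lor.lha_category == "B")
draws = lor[mask].brma.sample(n=10_000, replace=True, random_state=0)
print(draws.value_counts(normalize=True))  # roughly 2/3 and 1/3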
Binary file not shown.
Binary file not shown.
2 changes: 1 addition & 1 deletion policyengine_uk_data/utils/download_docs_prerequisites.py
@@ -1,5 +1,5 @@
 from policyengine_uk_data.utils.github import download
-from policyengine_uk_data.data_storage import STORAGE_FOLDER
+from policyengine_uk_data.storage import STORAGE_FOLDER

 PREREQUISITES = [
     {
184 changes: 184 additions & 0 deletions policyengine_uk_data/utils/imputations/capital_gains.py
@@ -0,0 +1,184 @@
import copy

import numpy as np
import pandas as pd
import torch
from scipy.interpolate import UnivariateSpline
from torch.optim import Adam
from tqdm import tqdm

from policyengine_uk import Microsimulation
from policyengine_uk.system import system
from policyengine_uk_data.storage import STORAGE_FOLDER

capital_gains = pd.read_csv(
    STORAGE_FOLDER / "capital_gains_distribution_advani_summers.csv.gz"
)
# Each income band's upper bound is the next band's lower bound; the top
# band is unbounded.
capital_gains["maximum_total_income"] = (
    capital_gains.minimum_total_income.shift(-1).fillna(np.inf)
)


def impute_capital_gains(dataset, time_period: int):
    """Impute capital gains for the given year, assuming the distribution
    of gains conditional on total income is the same in all years."""

    sim = Microsimulation(dataset=dataset)
    ti = sim.calculate("total_income", time_period)
    household_weight = sim.calculate("household_weight", time_period).values
    # The dataset has been stacked with a copy of itself: the first half of
    # households are the originals, the second half receive imputed gains.
    first_half = np.concatenate(
        [
            np.ones(len(household_weight) // 2, dtype=bool),
            np.zeros(len(household_weight) // 2, dtype=bool),
        ]
    )
    # Give capital gains to one adult in each second-half household.
    adult_index = sim.calculate("adult_index", time_period)
    in_person_second_half = np.zeros(len(ti), dtype=bool)
    in_person_second_half[len(ti) // 2 :] = True
    has_cg = np.zeros(len(ti), dtype=bool)
    has_cg[(adult_index == 1).values & in_person_second_half] = True
    # One free parameter per original household; its sigmoid is the share
    # of the household's weight kept by the no-gains copy.
    blend_factor = torch.tensor(
        np.zeros(first_half.sum()), requires_grad=True, dtype=torch.float32
    )
    household_weight = torch.tensor(household_weight, dtype=torch.float32)
    sigmoid = torch.nn.Sigmoid()

    def loss(blend_factor):
        loss = 0

        # Split each original household's weight between its two copies:
        # sigmoid(b) to the no-gains copy, 1 - sigmoid(b) to the with-gains
        # copy, so each household's total weight is preserved.
        adjusted_blend_factor = sigmoid(blend_factor)
        base_weight = household_weight[first_half]
        blended_household_weight = household_weight.clone()
        blended_household_weight[first_half] = (
            adjusted_blend_factor * base_weight
        )
        blended_household_weight[~first_half] = (
            1 - adjusted_blend_factor
        ) * base_weight
        # Penalise deviation from the target share of people with gains in
        # each total-income band.
        for i in range(len(capital_gains)):
            lower = capital_gains.minimum_total_income.iloc[i]
            upper = capital_gains.maximum_total_income.iloc[i]
            true_pct_with_gains = capital_gains.percent_with_gains.iloc[i]

            ti_in_range = (ti >= lower) * (ti < upper)
            cg_in_income_range = has_cg * ti_in_range
            household_ti_in_range_count = torch.tensor(
                sim.map_result(ti_in_range, "person", "household", how="sum")
            )
            household_cg_in_income_range_count = torch.tensor(
                sim.map_result(
                    cg_in_income_range, "person", "household", how="sum"
                )
            )
            pred_ti_in_range = (
                blended_household_weight * household_ti_in_range_count
            ).sum()
            pred_cg_in_income_range = (
                blended_household_weight * household_cg_in_income_range_count
            ).sum()
            pred_pct_with_gains = pred_cg_in_income_range / pred_ti_in_range
            loss += (pred_pct_with_gains - true_pct_with_gains) ** 2

        return loss

    optimiser = Adam([blend_factor], lr=1e-1)
    progress = tqdm(range(1000))
    for i in progress:
        optimiser.zero_grad()
        loss_value = loss(blend_factor)
        loss_value.backward()
        optimiser.step()
        progress.set_description(f"Loss: {loss_value.item()}")
        if loss_value.item() < 1e-5:
            break

    # Apply the learned split to produce the final household weights.
    new_household_weight = household_weight.detach().numpy().copy()
    original_household_weight = new_household_weight.copy()
    blend_factor = sigmoid(blend_factor).detach().numpy()
    new_household_weight[first_half] = (
        blend_factor * original_household_weight[first_half]
    )
    new_household_weight[~first_half] = (
        1 - blend_factor
    ) * original_household_weight[first_half]

    # Impute gains amounts for those with gains: fit a piecewise-linear
    # (k=1) quantile function through each income band's reported
    # percentiles and evaluate it at uniform random quantiles.
    new_cg = np.zeros(len(ti))

    for i in range(len(capital_gains)):
        row = capital_gains.iloc[i]
        spline = UnivariateSpline(
            [0.05, 0.1, 0.25, 0.5, 0.75, 0.90, 0.95],
            [row.p05, row.p10, row.p25, row.p50, row.p75, row.p90, row.p95],
            k=1,
        )
        lower = row.minimum_total_income
        upper = row.maximum_total_income
        ti_in_range = (ti >= lower) * (ti < upper)
        in_target_range = has_cg * ti_in_range
        quantiles = np.random.random(int(in_target_range.sum()))
        pred_capital_gains = spline(quantiles)
        new_cg[in_target_range] = pred_capital_gains

    # Uprate amounts from the distribution's 2017 base year to the target
    # year using the calibrated aggregate capital gains series.
    aggregate_cg = system.parameters.calibration.programs.capital_gains.total
    uprating = aggregate_cg(time_period) / aggregate_cg("2017-01-01")
    new_cg *= uprating

    return new_cg, new_household_weight


def stack_datasets(data_1, data_2):
    """Concatenate two datasets in variable-time-period format, offsetting
    ID variables in the second so identifiers remain unique."""
    assert isinstance(
        data_1[list(data_1.keys())[0]], dict
    ), "Data must be in variable-time-period format."
    joined_data = {}

    for variable in data_1:
        joined_data[variable] = {}
        for time_period in data_1[variable]:
            if "_id" in variable:
                joined_data[variable][time_period] = np.concatenate(
                    [
                        data_1[variable][time_period],
                        data_2[variable][time_period]
                        + data_1[variable][time_period].max(),
                    ]
                )
            else:
                joined_data[variable][time_period] = np.concatenate(
                    [
                        data_1[variable][time_period],
                        data_2[variable][time_period],
                    ]
                )

    return joined_data


def impute_cg_to_dataset(dataset):
    data = dataset.load_dataset()

    # Stack the dataset with a zero-weight copy of itself; the optimiser in
    # impute_capital_gains then shifts weight from each original (no-gains)
    # household to its copy, which receives imputed gains.
    zero_weight_copy = copy.deepcopy(data)
    for time_period in zero_weight_copy["household_weight"]:
        zero_weight_copy["household_weight"][time_period] = np.zeros_like(
            zero_weight_copy["household_weight"][time_period]
        )

    data = stack_datasets(data, zero_weight_copy)

    dataset.save_dataset(data)

    pred_cg, household_weights_22 = impute_capital_gains(dataset, 2022)

    data["capital_gains"] = {2022: pred_cg}
    data["household_weight"][2022] = household_weights_22

    # For later years, keep only the recalibrated weights; the imputed
    # gains amounts for those years are discarded.
    for year in range(2023, 2028):
        _, data["household_weight"][year] = impute_capital_gains(dataset, year)

    dataset.save_dataset(data)
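A quick sketch (toy numbers) of the weight-splitting identity the optimiser in impute_capital_gains relies on: wherever blend_factor moves, each household's total weight across its two copies is unchanged, so calibration shifts composition rather than population size.

import torch

w = torch.tensor([100.0, 250.0])  # original household weights
b = torch.tensor([0.3, -1.2])  # arbitrary blend parameters
s = torch.sigmoid(b)
w_no_gains = s * w  # weight kept by the no-gains copy
w_with_gains = (1 - s) * w  # weight moved to the with-gains copy
assert torch.allclose(w_no_gains + w_with_gains, w)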

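Similarly, a sketch of the amounts step: a piecewise-linear quantile function through one band's reported percentiles, evaluated at uniform random draws. The percentile values here are invented, not taken from the Advani & Summers table.

import numpy as np
from scipy.interpolate import UnivariateSpline

quantile_points = [0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]
gains_at_percentiles = [200, 500, 2_000, 8_000, 25_000, 90_000, 180_000]
quantile_fn = UnivariateSpline(quantile_points, gains_at_percentiles, k=1)

# Uniform quantiles mapped through the quantile function simulate gains
# amounts consistent with the stated percentiles.
draws = quantile_fn(np.random.default_rng(0).random(5))
print(draws)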
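Finally, a toy illustration of stack_datasets' ID handling: ID variables in the second copy are offset by the first copy's maximum so identifiers stay unique after stacking, while all other variables are concatenated as-is.

import numpy as np

household_id_1 = np.array([1, 2, 3])
household_id_2 = np.array([1, 2, 3])
# Offsetting by the first array's maximum keeps IDs unique.
stacked = np.concatenate(
    [household_id_1, household_id_2 + household_id_1.max()]
)
print(stacked)  # [1 2 3 4 5 6]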