PolicyEngine · nikhilwoodruff · Nov 29, 2024 · Nov 29, 2024 · Nov 29, 2024 · Nov 29, 2024
diff --git a/.gitignore b/.gitignore
@@ -12,4 +12,5 @@
 !tax_benefit.csv
 !demographics.csv
 !incomes_projection.csv
+!policyengine_uk_data/datasets/frs/local_areas/**/*.csv
 **/_build
diff --git a/Makefile b/Makefile
@@ -28,6 +28,7 @@ data:
 	python policyengine_uk_data/datasets/frs/frs.py
 	python policyengine_uk_data/datasets/frs/extended_frs.py
 	python policyengine_uk_data/datasets/frs/enhanced_frs.py
+	python policyengine_uk_data/datasets/frs/local_areas/constituencies/calibrate.py
 
 build:
 	python -m build

diff --git a/policyengine_uk_data/datasets/frs/enhanced_frs.py b/policyengine_uk_data/datasets/frs/enhanced_frs.py
@@ -8,6 +8,7 @@
 from policyengine_uk_data.utils.imputations.capital_gains import (
     impute_cg_to_dataset,
 )
+from policyengine_uk_data.utils.reweight import reweight
 
 try:
     import torch
@@ -24,8 +25,7 @@ def generate(self):
         # Capital gains imputation
 
         impute_cg_to_dataset(self)
-
-        self.save_dataset(data)
+        data = self.load_dataset()
 
         self.add_random_variables(data)
 
@@ -84,69 +84,6 @@ class EnhancedFRS_2022_23(EnhancedFRS):
     input_frs = ExtendedFRS_2022_23
     time_period = 2022
     end_year = 2028
-    url = "hf://policyengine/policyengine-uk-data"
-
-
-def reweight(
-    original_weights,
-    loss_matrix,
-    targets_array,
-    dropout_rate=0.05,
-):
-    target_names = np.array(loss_matrix.columns)
-    loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32)
-    targets_array = torch.tensor(targets_array, dtype=torch.float32)
-    weights = torch.tensor(
-        np.log(original_weights), requires_grad=True, dtype=torch.float32
-    )
-
-    # TODO: replace this with a call to the python reweight.py package.
-    def loss(weights):
-        # Check for Nans in either the weights or the loss matrix
-        if torch.isnan(weights).any():
-            raise ValueError("Weights contain NaNs")
-        if torch.isnan(loss_matrix).any():
-            raise ValueError("Loss matrix contains NaNs")
-        estimate = weights @ loss_matrix
-        if torch.isnan(estimate).any():
-            raise ValueError("Estimate contains NaNs")
-        rel_error = (
-            ((estimate - targets_array) + 1) / (targets_array + 1)
-        ) ** 2
-        if torch.isnan(rel_error).any():
-            raise ValueError("Relative error contains NaNs")
-        return rel_error.mean()
-
-    def dropout_weights(weights, p):
-        if p == 0:
-            return weights
-        # Replace p% of the weights with the mean value of the rest of them
-        mask = torch.rand_like(weights) < p
-        mean = weights[~mask].mean()
-        masked_weights = weights.clone()
-        masked_weights[mask] = mean
-        return masked_weights
-
-    optimizer = torch.optim.Adam([weights], lr=1e-1)
-    from tqdm import trange
-
-    start_loss = None
-
-    iterator = trange(10_000)
-    for i in iterator:
-        optimizer.zero_grad()
-        weights_ = dropout_weights(weights, dropout_rate)
-        l = loss(torch.exp(weights_))
-        if start_loss is None:
-            start_loss = l.item()
-        loss_rel_change = (l.item() - start_loss) / start_loss
-        l.backward()
-        iterator.set_postfix(
-            {"loss": l.item(), "loss_rel_change": loss_rel_change}
-        )
-        optimizer.step()
-
-    return torch.exp(weights).detach().numpy()
 
 
 if __name__ == "__main__":

diff --git a/policyengine_uk_data/datasets/frs/frs.py b/policyengine_uk_data/datasets/frs/frs.py
@@ -138,7 +138,6 @@ class FRS_2022_23(FRS):
     label = "FRS (2022-23)"
     file_path = STORAGE_FOLDER / "frs_2022_23.h5"
     time_period = 2022
-    url = "hf://policyengine/policyengine-uk-data"
 
 
 def add_id_variables(frs: h5py.File, person: DataFrame, household: DataFrame):

diff --git a/policyengine_uk_data/datasets/frs/local_areas/constituencies/calibrate.py b/policyengine_uk_data/datasets/frs/local_areas/constituencies/calibrate.py
@@ -0,0 +1,91 @@
+import torch
+from policyengine_uk import Microsimulation
+import pandas as pd
+import numpy as np
+from tqdm import tqdm
+import h5py
+from policyengine_uk_data.datasets.frs.local_areas.constituencies.transform_constituencies import (
+    transform_2010_to_2024,
+)
+
+# Fill in missing constituencies with average column values
+import pandas as pd
+import numpy as np
+
+from policyengine_uk_data.datasets.frs.local_areas.constituencies.loss import (
+    create_constituency_target_matrix,
+    create_national_target_matrix,
+)
+from pathlib import Path
+from policyengine_uk_data.storage import STORAGE_FOLDER
+
+FOLDER = Path(__file__).parent
+
+
+def calibrate():
+    """Calibrate per-constituency household weights and save them to HDF5.
+
+    Builds the constituency-level and national target matrices for the
+    ``enhanced_frs_2022_23`` dataset in 2025, then optimises one log-weight
+    per (constituency, household) pair with Adam so that the weighted
+    household metrics match both sets of targets in relative terms. The
+    optimised weights are passed through ``update_weights`` with the
+    2010-to-2024 mapping matrix and written to
+    ``parliamentary_constituency_weights.h5`` in ``STORAGE_FOLDER`` under
+    the key ``"2025"``.
+    """
+    matrix, y = create_constituency_target_matrix("enhanced_frs_2022_23", 2025)
+
+    m_national, y_national = create_national_target_matrix(
+        "enhanced_frs_2022_23", 2025
+    )
+
+    sim = Microsimulation(dataset="enhanced_frs_2022_23")
+
+    COUNT_CONSTITUENCIES = 650
+
+    # Every constituency starts from an equal share of the national household
+    # weights. Weights are optimised in log space and exponentiated before
+    # use, which keeps them positive.
+    original_weights = np.log(
+        sim.calculate("household_weight", 2025).values / COUNT_CONSTITUENCIES
+    )
+    # One row of log-weights per constituency, one column per household.
+    weights = torch.tensor(
+        np.ones((COUNT_CONSTITUENCIES, len(original_weights)))
+        * original_weights,
+        dtype=torch.float32,
+        requires_grad=True,
+    )
+    metrics = torch.tensor(matrix.values, dtype=torch.float32)
+    y = torch.tensor(y.values, dtype=torch.float32)
+    matrix_national = torch.tensor(m_national.values, dtype=torch.float32)
+    y_national = torch.tensor(y_national.values, dtype=torch.float32)
+
+    def loss(w):
+        # Weighted sum of each household metric per constituency. A matrix
+        # product gives the same result as broadcasting to a
+        # (constituency, household, metric) tensor and summing over
+        # households, without materialising that 3-D intermediate.
+        # NOTE(review): assumes `metrics` has one row per household and `y`
+        # one row per constituency -- confirm against the target CSVs.
+        pred_c = w @ metrics
+        mse_c = torch.mean((pred_c / (1 + y) - 1) ** 2)
+
+        # National estimate: total weight per household across all
+        # constituencies, applied to the national metrics.
+        pred_n = w.sum(dim=0) @ matrix_national
+        mse_n = torch.mean((pred_n / (1 + y_national) - 1) ** 2)
+
+        return mse_c + mse_n
+
+    optimizer = torch.optim.Adam([weights], lr=0.1)
+
+    for epoch in range(512):
+        optimizer.zero_grad()
+        l = loss(torch.exp(weights))
+        l.backward()
+        optimizer.step()
+        if epoch % 50 == 0:
+            print(f"Loss: {l.item()}, Epoch: {epoch}")
+
+    final_weights = torch.exp(weights).detach().numpy()
+    mapping_matrix = pd.read_csv(
+        FOLDER / "mapping_2010_to_2024" / "mapping_matrix.csv"
+    )
+    # NOTE(review): this relies on the rows of `final_weights` being in the
+    # same constituency order as the rows of the mapping matrix -- confirm.
+    final_weights = update_weights(final_weights, mapping_matrix)
+
+    with h5py.File(
+        STORAGE_FOLDER / "parliamentary_constituency_weights.h5", "w"
+    ) as f:
+        f.create_dataset("2025", data=final_weights)
+
+
+def update_weights(weights, mapping_matrix):
+    """Re-map weights through a column-normalised mapping matrix.
+
+    The first column of ``mapping_matrix`` is used as its index. Each
+    remaining column is divided by its own sum, and the transposed result
+    is matrix-multiplied with ``weights``, giving one output row per
+    mapping-matrix column.
+    """
+    indexed = mapping_matrix.set_index(mapping_matrix.columns[0])
+    column_totals = indexed.sum()
+    normalised = indexed.div(column_totals, axis="columns")
+    return normalised.transpose().dot(weights)
+
+
+if __name__ == "__main__":
+    calibrate()
diff --git a/policyengine_uk_data/datasets/frs/local_areas/constituencies/loss.py b/policyengine_uk_data/datasets/frs/local_areas/constituencies/loss.py
@@ -0,0 +1,93 @@
+import torch
+from policyengine_uk import Microsimulation
+import pandas as pd
+import numpy as np
+
+# Fill in missing constituencies with average column values
+import pandas as pd
+import numpy as np
+from pathlib import Path
+
+from policyengine_uk_data.utils.loss import (
+    create_target_matrix as create_national_target_matrix,
+)
+
+FOLDER = Path(__file__).parent
+
+
+def create_constituency_target_matrix(
+    dataset: str = "enhanced_frs_2022_23", time_period: int = 2025, reform=None
+):
+    """Build household-level metrics and the matching target values.
+
+    Returns a pair ``(matrix, y)`` of DataFrames that share column names.
+    ``matrix`` holds person-level values from the microsimulation mapped to
+    households; ``y`` holds the corresponding targets read from the CSV
+    files in the ``targets`` folder next to this module. Columns cover
+    total income (amount and count), ten-year age bands from 0 to 80, and
+    employment income (count and amount) per income band.
+    """
+    targets_dir = FOLDER / "targets"
+    ages = pd.read_csv(targets_dir / "age.csv")
+    incomes = pd.read_csv(targets_dir / "total_income.csv")
+    employment_incomes = pd.read_csv(targets_dir / "employment_income.csv")
+
+    sim = Microsimulation(dataset=dataset, reform=reform)
+    sim.default_calculation_period = time_period
+
+    def to_household(person_values):
+        # Map person-level values onto households.
+        return sim.map_result(person_values, "person", "household")
+
+    matrix = pd.DataFrame()
+    y = pd.DataFrame()
+
+    # Total income: amount, and number of people with non-zero income.
+    total_income = sim.calculate("total_income").values
+    matrix["hmrc/total_income/amount"] = to_household(total_income)
+    y["hmrc/total_income/amount"] = incomes["total_income_amount"]
+
+    has_income = total_income != 0
+    matrix["hmrc/total_income/count"] = to_household(has_income)
+    y["hmrc/total_income/count"] = incomes["total_income_count"]
+
+    # Population counts in ten-year age bands [0, 10), ..., [70, 80).
+    age = sim.calculate("age").values
+    for band_start in range(0, 80, 10):
+        band_end = band_start + 10
+        column = f"age/{band_start}_{band_end}"
+
+        in_age_band = (age >= band_start) & (age < band_end)
+        matrix[column] = to_household(in_age_band)
+
+        # The age CSV has one column per single year of age, named by the
+        # year as a string; sum the ten that make up this band.
+        year_columns = [str(year) for year in range(band_start, band_end)]
+        y[column] = ages[year_columns].sum(axis=1).values
+
+    # Employment income: count and amount per band, with band edges taken
+    # from the distinct lower bounds in the target file.
+    employment_income = sim.calculate("employment_income").values
+    lower_bounds = employment_incomes.employment_income_lower_bound
+    upper_bounds = employment_incomes.employment_income_upper_bound
+    bounds = list(lower_bounds.sort_values().unique()) + [np.inf]
+
+    for lower_bound, upper_bound in zip(bounds[:-1], bounds[1:]):
+        # Bands starting at 70,000 or above, or below 12,570, are not
+        # used as targets.
+        if lower_bound >= 70_000 or lower_bound < 12_570:
+            continue
+
+        in_bound = (
+            (employment_income >= lower_bound)
+            & (employment_income < upper_bound)
+            & (employment_income != 0)
+            & (age >= 16)
+        )
+        band_rows = employment_incomes[
+            (lower_bounds == lower_bound) & (upper_bounds == upper_bound)
+        ]
+
+        band_str = f"{lower_bound}_{upper_bound}"
+        count_column = f"hmrc/employment_income/count/{band_str}"
+        amount_column = f"hmrc/employment_income/amount/{band_str}"
+
+        matrix[count_column] = to_household(in_bound)
+        y[count_column] = band_rows.employment_income_count.values
+
+        matrix[amount_column] = to_household(employment_income * in_bound)
+        y[amount_column] = band_rows.employment_income_amount.values
+
+    return matrix, y
diff --git a/.../datasets/frs/local_areas/constituencies/mapping_2010_to_2024/construct_mapping_matrix.py b/.../datasets/frs/local_areas/constituencies/mapping_2010_to_2024/construct_mapping_matrix.py
@@ -0,0 +1,57 @@
+"""Build the 2010-to-2024 parliamentary constituency mapping matrix.
+
+Reads ``mapping_raw.csv`` (pairs of ``PCON10CD`` and ``PCON24CD`` codes) and
+writes ``mapping_matrix.csv``: one row per 2010 code, one column per 2024
+code. Each 2024 column gives equal weight to every 2010 code paired with
+it, so each column sums to 1.
+"""
+from pathlib import Path
+
+import pandas as pd
+import numpy as np
+
+# Resolve the data files next to this script so that it does not depend on
+# the current working directory.
+FOLDER = Path(__file__).parent
+
+mapping_raw = pd.read_csv(FOLDER / "mapping_raw.csv")
+mapping_raw = mapping_raw.sort_values(["PCON10CD", "PCON24CD"])
+mapping_raw = mapping_raw.reset_index(drop=True)
+
+# Create sets of unique values for both columns
+unique_pcon10 = mapping_raw["PCON10CD"].unique()
+unique_pcon24 = mapping_raw["PCON24CD"].unique()
+
+# Create an empty matrix filled with zeros. It is created as float because
+# the entries assigned below are fractional weights.
+mapping_matrix = pd.DataFrame(
+    0.0, index=unique_pcon10, columns=unique_pcon24
+)
+
+# Let's check the first constituency to see what's happening
+example_pcon24 = unique_pcon24[0]
+print(f"Example 2024 constituency: {example_pcon24}")
+
+# Check if we can find it in mapping_raw
+matching_rows = mapping_raw[mapping_raw["PCON24CD"] == example_pcon24]
+print("\nMatching rows found:", len(matching_rows))
+print(matching_rows)
+
+# Now fill the matrix with proper checks
+for pcon24 in unique_pcon24:
+    # Get matching 2010 constituencies
+    matching_2010 = mapping_raw[mapping_raw["PCON24CD"] == pcon24]["PCON10CD"]
+
+    if len(matching_2010) > 0:  # Check if we found any matches
+        # Split the 2024 constituency equally across its 2010 matches.
+        weight = 1 / len(matching_2010)
+        mapping_matrix.loc[matching_2010, pcon24] = weight
+    else:
+        print(f"No matches found for {pcon24}")
+
+# Verify results
+print("\nFirst few rows of result:")
+print(mapping_matrix.head())
+print("\nColumn sums (should be 1):")
+print(mapping_matrix.sum().head())
+
+# Show non-zero entries for first column
+first_col = mapping_matrix[mapping_matrix.columns[0]]
+print("\nNon-zero entries in first column:")
+print(first_col[first_col > 0])
+
+mapping_matrix.to_csv(FOLDER / "mapping_matrix.csv")