Add local authority weights (#52)

* Add local authority weight generation Fixes #51 * Add LA weights and documentation * Versioning
PolicyEngine · Dec 9, 2024 · 426e527 · 426e527
1 parent 7ca372f
commit 426e527
Show file tree

Hide file tree

Showing 20 changed files with 9,110 additions and 1,645 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.11.0] - 2024-12-09 11:51:05
+
+### Added
+
+- Local authority weights.
+
 ## [1.10.1] - 2024-12-03 17:25:35
 
 ### Added
@@ -115,6 +121,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 
 
+[1.11.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.10.1...1.11.0
 [1.10.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.10.0...1.10.1
 [1.10.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.9.2...1.10.0
 [1.9.2]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.9.1...1.9.2

diff --git a/Makefile b/Makefile
@@ -8,6 +8,7 @@ test:
 
 install:
 	pip install policyengine-uk
+	pip install policyengine
 	pip install -e ".[dev]" --config-settings editable_mode=compat
 
 download:
@@ -29,6 +30,7 @@ data:
 	python policyengine_uk_data/datasets/frs/extended_frs.py
 	python policyengine_uk_data/datasets/frs/enhanced_frs.py
 	python policyengine_uk_data/datasets/frs/local_areas/constituencies/calibrate.py
+	python policyengine_uk_data/datasets/frs/local_areas/local_authorities/calibrate.py
 
 build:
 	python -m build

diff --git a/changelog.yaml b/changelog.yaml
@@ -95,3 +95,8 @@
     added:
     - Dropout in constituency calibration.
   date: 2024-12-03 17:25:35
+- bump: minor
+  changes:
+    added:
+    - Local authority weights.
+  date: 2024-12-09 11:51:05
diff --git a/docs/_config.yml b/docs/_config.yml
@@ -4,7 +4,7 @@ copyright: "2024"
 logo: logo.png
 
 execute:
-  execute_notebooks: off
+  execute_notebooks: force
 
 repository:
   url: https://github.com/policyengine/policyengine-uk-data

diff --git a/docs/methodology.ipynb b/docs/methodology.ipynb
diff --git a/docs/utils.py b/docs/utils.py
diff --git a/docs/validation/constituencies.ipynb b/docs/validation/constituencies.ipynb
diff --git a/docs/validation/local_authorities.ipynb b/docs/validation/local_authorities.ipynb
diff --git a/docs/validation.ipynb → docs/validation/national.ipynb b/docs/validation.ipynb → docs/validation/national.ipynb
@@ -4,7 +4,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Validation"
+    "# National dataset validation"
    ]
   },
   {

diff --git a/policyengine_uk_data/datasets/frs/local_areas/constituencies/ageing.ipynb b/policyengine_uk_data/datasets/frs/local_areas/constituencies/ageing.ipynb
diff --git a/policyengine_uk_data/datasets/frs/local_areas/local_authorities/calibrate.py b/policyengine_uk_data/datasets/frs/local_areas/local_authorities/calibrate.py
@@ -0,0 +1,86 @@
+import torch
+from policyengine_uk import Microsimulation
+import pandas as pd
+import numpy as np
+from tqdm import tqdm
+import h5py
+from policyengine_uk_data.storage import STORAGE_FOLDER
+
+
+from loss import (
+    create_local_authority_target_matrix,
+    create_national_target_matrix,
+)
+
+
+def calibrate():
+    matrix, y = create_local_authority_target_matrix(
+        "enhanced_frs_2022_23", 2025
+    )
+
+    m_national, y_national = create_national_target_matrix(
+        "enhanced_frs_2022_23", 2025
+    )
+
+    sim = Microsimulation(dataset="enhanced_frs_2022_23")
+
+    count_local_authority = 360
+
+    # Weights - 360 x 100180
+    original_weights = np.log(
+        sim.calculate("household_weight", 2025).values / count_local_authority
+    )
+    weights = torch.tensor(
+        np.ones((count_local_authority, len(original_weights)))
+        * original_weights,
+        dtype=torch.float32,
+        requires_grad=True,
+    )
+    metrics = torch.tensor(matrix.values, dtype=torch.float32)
+    y = torch.tensor(y.values, dtype=torch.float32)
+    matrix_national = torch.tensor(m_national.values, dtype=torch.float32)
+    y_national = torch.tensor(y_national.values, dtype=torch.float32)
+
+    def loss(w):
+        pred_c = (w.unsqueeze(-1) * metrics.unsqueeze(0)).sum(dim=1)
+        mse_c = torch.mean((pred_c / (1 + y) - 1) ** 2)
+
+        pred_n = (w.sum(axis=0) * matrix_national.T).sum(axis=1)
+        mse_n = torch.mean((pred_n / (1 + y_national) - 1) ** 2)
+
+        return mse_c + mse_n
+
+    def dropout_weights(weights, p):
+        if p == 0:
+            return weights
+        # Replace p% of the weights with the mean value of the rest of them
+        mask = torch.rand_like(weights) < p
+        mean = weights[~mask].mean()
+        masked_weights = weights.clone()
+        masked_weights[mask] = mean
+        return masked_weights
+
+    optimizer = torch.optim.Adam([weights], lr=0.1)
+
+    desc = range(512)
+
+    for epoch in desc:
+        optimizer.zero_grad()
+        weights_ = dropout_weights(weights, 0.05)
+        l = loss(torch.exp(weights_))
+        l.backward()
+        optimizer.step()
+        if epoch % 50 == 0:
+            print(f"Loss: {l.item()}, Epoch: {epoch}")
+
+        if epoch % 100 == 0:
+            final_weights = torch.exp(weights).detach().numpy()
+
+            with h5py.File(
+                STORAGE_FOLDER / "local_authority_weights.h5", "w"
+            ) as f:
+                f.create_dataset("2025", data=final_weights)
+
+
+# Run the calibration when this file is executed as a script.
+if __name__ == "__main__":
+    calibrate()
diff --git a/policyengine_uk_data/datasets/frs/local_areas/local_authorities/loss.py b/policyengine_uk_data/datasets/frs/local_areas/local_authorities/loss.py
@@ -0,0 +1,155 @@
+import torch
+from policyengine_uk import Microsimulation
+import pandas as pd
+import numpy as np
+from pathlib import Path
+
+from policyengine_uk_data.utils.loss import (
+    create_target_matrix as create_national_target_matrix,
+)
+
+FOLDER = Path(__file__).parent
+
+
+def create_local_authority_target_matrix(
+    dataset: str = "enhanced_frs_2022_23",
+    time_period: int = 2025,
+    reform=None,
+    uprate: bool = True,
+):
+    ages = pd.read_csv(FOLDER / "targets" / "age.csv")
+    incomes = pd.read_csv(FOLDER / "targets" / "total_income.csv")
+    employment_incomes = pd.read_csv(
+        FOLDER / "targets" / "employment_income.csv"
+    )
+
+    sim = Microsimulation(dataset=dataset, reform=reform)
+    sim.default_calculation_period = time_period
+
+    matrix = pd.DataFrame()
+    y = pd.DataFrame()
+
+    total_income = sim.calculate("total_income").values
+    matrix["hmrc/total_income/amount"] = sim.map_result(
+        total_income, "person", "household"
+    )
+    y["hmrc/total_income/amount"] = incomes["total_income_amount"]
+
+    matrix["hmrc/total_income/count"] = sim.map_result(
+        total_income != 0, "person", "household"
+    )
+    y["hmrc/total_income/count"] = incomes["total_income_count"]
+
+    age = sim.calculate("age").values
+    for lower_age in range(0, 80, 10):
+        upper_age = lower_age + 10
+
+        in_age_band = (age >= lower_age) & (age < upper_age)
+
+        age_str = f"{lower_age}_{upper_age}"
+        matrix[f"age/{age_str}"] = sim.map_result(
+            in_age_band, "person", "household"
+        )
+
+        age_count = ages[
+            [str(age) for age in range(lower_age, upper_age)]
+        ].sum(axis=1)
+
+        age_str = f"{lower_age}_{upper_age}"
+        y[f"age/{age_str}"] = age_count.values
+
+    employment_income = sim.calculate("employment_income").values
+    bounds = list(
+        employment_incomes.employment_income_lower_bound.sort_values().unique()
+    ) + [np.inf]
+
+    for lower_bound, upper_bound in zip(bounds[:-1], bounds[1:]):
+        if lower_bound >= 70_000 or lower_bound < 12_570:
+            continue
+        in_bound = (
+            (employment_income >= lower_bound)
+            & (employment_income < upper_bound)
+            & (employment_income != 0)
+            & (age >= 16)
+        )
+        band_str = f"{lower_bound}_{upper_bound}"
+        matrix[f"hmrc/employment_income/count/{band_str}"] = sim.map_result(
+            in_bound, "person", "household"
+        )
+        y[f"hmrc/employment_income/count/{band_str}"] = employment_incomes[
+            (employment_incomes.employment_income_lower_bound == lower_bound)
+            & (employment_incomes.employment_income_upper_bound == upper_bound)
+        ].employment_income_count.values
+
+        matrix[f"hmrc/employment_income/amount/{band_str}"] = sim.map_result(
+            employment_income * in_bound, "person", "household"
+        )
+        y[f"hmrc/employment_income/amount/{band_str}"] = employment_incomes[
+            (employment_incomes.employment_income_lower_bound == lower_bound)
+            & (employment_incomes.employment_income_upper_bound == upper_bound)
+        ].employment_income_amount.values
+
+    if uprate:
+        y = uprate_targets(y, time_period)
+
+    return matrix, y
+
+
+def uprate_targets(y: pd.DataFrame, target_year: int = 2025) -> pd.DataFrame:
+    # Uprate age targets from 2020, taxable income targets from 2021, employment income targets from 2023.
+    # Use PolicyEngine uprating factors.
+    sim = Microsimulation(dataset="frs_2020_21")
+    matrix_20, y_20 = create_local_authority_target_matrix(
+        "frs_2020_21", 2020, uprate=False
+    )
+    matrix_21, y_21 = create_local_authority_target_matrix(
+        "frs_2020_21", 2021, uprate=False
+    )
+    matrix_23, y_23 = create_local_authority_target_matrix(
+        "frs_2020_21", 2023, uprate=False
+    )
+    matrix_final, y_final = create_local_authority_target_matrix(
+        "frs_2020_21", target_year, uprate=False
+    )
+    weights_20 = sim.calculate("household_weight", 2020)
+    weights_21 = sim.calculate("household_weight", 2021)
+    weights_23 = sim.calculate("household_weight", 2023)
+    weights_final = sim.calculate("household_weight", target_year)
+
+    rel_change_20_final = (weights_final @ matrix_final) / (
+        weights_20 @ matrix_20
+    ) - 1
+    is_uprated_from_2020 = [
+        col.startswith("age/") for col in matrix_20.columns
+    ]
+    uprating_from_2020 = np.zeros_like(matrix_20.columns, dtype=float)
+    uprating_from_2020[is_uprated_from_2020] = rel_change_20_final[
+        is_uprated_from_2020
+    ]
+
+    rel_change_21_final = (weights_final @ matrix_final) / (
+        weights_21 @ matrix_21
+    ) - 1
+    is_uprated_from_2021 = [
+        col.startswith("hmrc/") for col in matrix_21.columns
+    ]
+    uprating_from_2021 = np.zeros_like(matrix_21.columns, dtype=float)
+    uprating_from_2021[is_uprated_from_2021] = rel_change_21_final[
+        is_uprated_from_2021
+    ]
+
+    rel_change_23_final = (weights_final @ matrix_final) / (
+        weights_23 @ matrix_23
+    ) - 1
+    is_uprated_from_2023 = [
+        col.startswith("hmrc/") for col in matrix_23.columns
+    ]
+    uprating_from_2023 = np.zeros_like(matrix_23.columns, dtype=float)
+    uprating_from_2023[is_uprated_from_2023] = rel_change_23_final[
+        is_uprated_from_2023
+    ]
+
+    uprating = uprating_from_2020 + uprating_from_2021 + uprating_from_2023
+    y = y * (1 + uprating)
+
+    return y
diff --git a/policyengine_uk_data/datasets/frs/local_areas/local_authorities/targets/README.md b/policyengine_uk_data/datasets/frs/local_areas/local_authorities/targets/README.md
@@ -0,0 +1,6 @@
+# Data
+
+* Age is from [the ONS](https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/parliamentaryconstituencymidyearpopulationestimates/mid2020sape23dt7/sape23dt7mid2020parliconsyoaestimatesunformatted.xlsx) and has single-year age counts for each parliamentary constituency (2010) in the UK. The data is from 2020.
+* Employment incomes are from Nomis, and are from 2023.
+* HMRC total income is from 2021.
+