Skip to content

Commit

Permalink
Add local authority weights (#52)
Browse files Browse the repository at this point in the history
* Add local authority weight generation
Fixes #51

* Add LA weights and documentation

* Versioning
  • Loading branch information
nikhilwoodruff authored Dec 9, 2024
1 parent 7ca372f commit 426e527
Show file tree
Hide file tree
Showing 20 changed files with 9,110 additions and 1,645 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.11.0] - 2024-12-09 11:51:05

### Added

- Local authority weights.

## [1.10.1] - 2024-12-03 17:25:35

### Added
Expand Down Expand Up @@ -115,6 +121,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0



[1.11.0]: https://github.com/PolicyEngine/policyengine-uk-data/compare/1.10.1...1.11.0
[1.10.1]: https://github.com/PolicyEngine/policyengine-uk-data/compare/1.10.0...1.10.1
[1.10.0]: https://github.com/PolicyEngine/policyengine-uk-data/compare/1.9.2...1.10.0
[1.9.2]: https://github.com/PolicyEngine/policyengine-uk-data/compare/1.9.1...1.9.2
Expand Down
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ test:

install:
pip install policyengine-uk
pip install policyengine
pip install -e ".[dev]" --config-settings editable_mode=compat

download:
Expand All @@ -29,6 +30,7 @@ data:
python policyengine_uk_data/datasets/frs/extended_frs.py
python policyengine_uk_data/datasets/frs/enhanced_frs.py
python policyengine_uk_data/datasets/frs/local_areas/constituencies/calibrate.py
python policyengine_uk_data/datasets/frs/local_areas/local_authorities/calibrate.py

build:
python -m build
Expand Down
5 changes: 5 additions & 0 deletions changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,8 @@
added:
- Dropout in constituency calibration.
date: 2024-12-03 17:25:35
- bump: minor
changes:
added:
- Local authority weights.
date: 2024-12-09 11:51:05
2 changes: 1 addition & 1 deletion docs/_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ copyright: "2024"
logo: logo.png

execute:
execute_notebooks: off
execute_notebooks: force

repository:
url: https://github.com/policyengine/policyengine-uk-data
Expand Down
3,111 changes: 1,548 additions & 1,563 deletions docs/methodology.ipynb

Large diffs are not rendered by default.

7 changes: 0 additions & 7 deletions docs/utils.py

This file was deleted.

631 changes: 631 additions & 0 deletions docs/validation/constituencies.ipynb

Large diffs are not rendered by default.

631 changes: 631 additions & 0 deletions docs/validation/local_authorities.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/validation.ipynb → docs/validation/national.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Validation"
"# National dataset validation"
]
},
{
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import torch
from policyengine_uk import Microsimulation
import pandas as pd
import numpy as np
from tqdm import tqdm
import h5py
from policyengine_uk_data.storage import STORAGE_FOLDER


from loss import (
create_local_authority_target_matrix,
create_national_target_matrix,
)


def calibrate():
    """Calibrate per-local-authority household weights and write them to disk.

    Builds the local-authority and national target matrices for the
    ``enhanced_frs_2022_23`` dataset in 2025, then optimises (in log space,
    with Adam) one weight per (local authority, household) pair so that the
    weighted household metrics match both the local-authority targets and the
    national targets. The exponentiated weights are written to
    ``STORAGE_FOLDER / "local_authority_weights.h5"`` under the dataset key
    ``"2025"``: periodically during training and once more after the final
    epoch.
    """
    matrix, y = create_local_authority_target_matrix(
        "enhanced_frs_2022_23", 2025
    )

    m_national, y_national = create_national_target_matrix(
        "enhanced_frs_2022_23", 2025
    )

    sim = Microsimulation(dataset="enhanced_frs_2022_23")

    count_local_authority = 360

    # Log-space weights with one row per local authority and one column per
    # household. Every area starts from an equal share of the national
    # household weight.
    original_weights = np.log(
        sim.calculate("household_weight", 2025).values / count_local_authority
    )
    weights = torch.tensor(
        np.ones((count_local_authority, len(original_weights)))
        * original_weights,
        dtype=torch.float32,
        requires_grad=True,
    )
    metrics = torch.tensor(matrix.values, dtype=torch.float32)
    y = torch.tensor(y.values, dtype=torch.float32)
    matrix_national = torch.tensor(m_national.values, dtype=torch.float32)
    y_national = torch.tensor(y_national.values, dtype=torch.float32)

    def loss(w):
        """Return local plus national mean squared relative error for w."""
        # (areas x households) @ (households x targets). This is the same sum
        # as broadcasting and reducing over households, but without
        # materialising the (areas x households x targets) intermediate.
        pred_c = w @ metrics
        mse_c = torch.mean((pred_c / (1 + y) - 1) ** 2)

        # Summing over areas gives the implied national household weights.
        pred_n = w.sum(dim=0) @ matrix_national
        mse_n = torch.mean((pred_n / (1 + y_national) - 1) ** 2)

        return mse_c + mse_n

    def dropout_weights(weights, p):
        """Return a copy of weights with a random fraction p set to the mean."""
        if p == 0:
            return weights
        # Replace p% of the weights with the mean value of the rest of them
        mask = torch.rand_like(weights) < p
        mean = weights[~mask].mean()
        masked_weights = weights.clone()
        masked_weights[mask] = mean
        return masked_weights

    def save_weights():
        """Write the current (exponentiated) weights to the output file."""
        final_weights = torch.exp(weights).detach().numpy()

        with h5py.File(
            STORAGE_FOLDER / "local_authority_weights.h5", "w"
        ) as f:
            f.create_dataset("2025", data=final_weights)

    optimizer = torch.optim.Adam([weights], lr=0.1)

    epochs = 512

    for epoch in range(epochs):
        optimizer.zero_grad()
        weights_ = dropout_weights(weights, 0.05)
        loss_value = loss(torch.exp(weights_))
        loss_value.backward()
        optimizer.step()
        if epoch % 50 == 0:
            print(f"Loss: {loss_value.item()}, Epoch: {epoch}")

        # Periodic checkpoint so an interrupted run still leaves usable output.
        if epoch % 100 == 0:
            save_weights()

    # The last periodic checkpoint is at epoch 500; save again so the file
    # reflects the weights after the final optimisation step.
    save_weights()


# Run the calibration only when this file is executed as a script (the
# Makefile's `data` target invokes it this way), not when it is imported.
if __name__ == "__main__":
    calibrate()
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
import torch
from policyengine_uk import Microsimulation
import pandas as pd
import numpy as np
from pathlib import Path

from policyengine_uk_data.utils.loss import (
create_target_matrix as create_national_target_matrix,
)

FOLDER = Path(__file__).parent


def create_local_authority_target_matrix(
    dataset: str = "enhanced_frs_2022_23",
    time_period: int = 2025,
    reform=None,
    uprate: bool = True,
):
    """Build the household metric matrix and local-authority target table.

    Reads ``age.csv``, ``total_income.csv`` and ``employment_income.csv``
    from ``FOLDER / "targets"`` and simulates ``dataset`` for ``time_period``.

    Args:
        dataset: Dataset name passed to ``Microsimulation``.
        time_period: Period used as the simulation's default calculation
            period (and passed to ``uprate_targets`` when ``uprate`` is set).
        reform: Optional reform passed to ``Microsimulation``.
        uprate: When true, the targets are passed through ``uprate_targets``
            before being returned.

    Returns:
        A ``(matrix, y)`` pair of DataFrames sharing the same column names:
        ``matrix`` holds person-level values mapped to households, and ``y``
        holds the corresponding targets read from the CSV files.
    """
    targets_dir = FOLDER / "targets"
    age_targets = pd.read_csv(targets_dir / "age.csv")
    income_targets = pd.read_csv(targets_dir / "total_income.csv")
    emp_targets = pd.read_csv(targets_dir / "employment_income.csv")

    sim = Microsimulation(dataset=dataset, reform=reform)
    sim.default_calculation_period = time_period

    def to_household(person_values):
        # Map a person-level array onto households.
        return sim.map_result(person_values, "person", "household")

    matrix = pd.DataFrame()
    y = pd.DataFrame()

    # HMRC total income: amount, and count of people with non-zero income.
    total_income = sim.calculate("total_income").values
    matrix["hmrc/total_income/amount"] = to_household(total_income)
    y["hmrc/total_income/amount"] = income_targets["total_income_amount"]

    has_total_income = total_income != 0
    matrix["hmrc/total_income/count"] = to_household(has_total_income)
    y["hmrc/total_income/count"] = income_targets["total_income_count"]

    # Population counts in ten-year age bands from 0 up to (not including) 80.
    person_age = sim.calculate("age").values
    for band_start in range(0, 80, 10):
        band_end = band_start + 10
        label = f"age/{band_start}_{band_end}"

        in_band = (person_age >= band_start) & (person_age < band_end)
        matrix[label] = to_household(in_band)

        # The age targets have one column per single year of age.
        year_columns = list(map(str, range(band_start, band_end)))
        y[label] = age_targets[year_columns].sum(axis=1).values

    # Employment income counts and amounts per income band.
    employment_income = sim.calculate("employment_income").values
    thresholds = [
        *emp_targets.employment_income_lower_bound.sort_values().unique(),
        np.inf,
    ]

    for lower_bound, upper_bound in zip(thresholds, thresholds[1:]):
        # Only bands starting in [12,570, 70,000) are used as targets.
        if lower_bound < 12_570 or lower_bound >= 70_000:
            continue

        within_band = (employment_income >= lower_bound) & (
            employment_income < upper_bound
        )
        in_bound = within_band & (employment_income != 0) & (person_age >= 16)

        band_rows = emp_targets[
            (emp_targets.employment_income_lower_bound == lower_bound)
            & (emp_targets.employment_income_upper_bound == upper_bound)
        ]

        band_str = f"{lower_bound}_{upper_bound}"
        count_label = f"hmrc/employment_income/count/{band_str}"
        amount_label = f"hmrc/employment_income/amount/{band_str}"

        matrix[count_label] = to_household(in_bound)
        y[count_label] = band_rows.employment_income_count.values

        matrix[amount_label] = to_household(employment_income * in_bound)
        y[amount_label] = band_rows.employment_income_amount.values

    if not uprate:
        return matrix, y

    return matrix, uprate_targets(y, time_period)


def uprate_targets(y: pd.DataFrame, target_year: int = 2025) -> pd.DataFrame:
    """Uprate local-authority targets from their source year to target_year.

    Each target column is scaled by the relative change in the corresponding
    weighted national total, computed from the ``frs_2020_21`` dataset,
    between the column's source year and ``target_year``:

    * ``age/...`` columns are uprated from 2020,
    * ``hmrc/total_income/...`` columns from 2021,
    * ``hmrc/employment_income/...`` columns from 2023.

    Columns matching none of these prefixes are left unchanged.

    Args:
        y: Target table whose columns are in the same order as the matrix
            returned by ``create_local_authority_target_matrix``.
        target_year: Year to uprate the targets to.

    Returns:
        ``y`` with every column multiplied by one plus its uprating factor.
    """
    # Source year per column family. The two HMRC families need distinct
    # prefixes: matching both on "hmrc/" would add the 2021 and 2023 growth
    # rates together for every HMRC column.
    base_year_by_prefix = {
        "age/": 2020,
        "hmrc/total_income/": 2021,
        "hmrc/employment_income/": 2023,
    }

    sim = Microsimulation(dataset="frs_2020_21")

    def weighted_totals(year):
        # Weighted total of every target metric in the given year.
        matrix, _ = create_local_authority_target_matrix(
            "frs_2020_21", year, uprate=False
        )
        weights = sim.calculate("household_weight", year)
        return matrix.columns, weights @ matrix

    columns, totals_final = weighted_totals(target_year)
    uprating = np.zeros(len(columns), dtype=float)

    for prefix, base_year in base_year_by_prefix.items():
        _, totals_base = weighted_totals(base_year)
        rel_change = totals_final / totals_base - 1
        is_uprated = [col.startswith(prefix) for col in columns]
        uprating[is_uprated] = rel_change[is_uprated]

    return y * (1 + uprating)
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Data

* Age is from [the ONS](https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/populationestimates/datasets/parliamentaryconstituencymidyearpopulationestimates/mid2020sape23dt7/sape23dt7mid2020parliconsyoaestimatesunformatted.xlsx) and has single-year age counts for each parliamentary constituency (2010 boundaries) in the UK. The data is from 2020.
* Employment incomes are from Nomis, and are from 2023.
* HMRC total income is from 2021.

Loading

0 comments on commit 426e527

Please sign in to comment.