Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add constituency local area data generation #44

Merged
merged 7 commits into from
Nov 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@
!tax_benefit.csv
!demographics.csv
!incomes_projection.csv
!policyengine_uk_data/datasets/frs/local_areas/**/*.csv
**/_build
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ data:
python policyengine_uk_data/datasets/frs/frs.py
python policyengine_uk_data/datasets/frs/extended_frs.py
python policyengine_uk_data/datasets/frs/enhanced_frs.py
python policyengine_uk_data/datasets/frs/local_areas/constituencies/calibrate.py

build:
python -m build
Expand Down
67 changes: 2 additions & 65 deletions policyengine_uk_data/datasets/frs/enhanced_frs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from policyengine_uk_data.utils.imputations.capital_gains import (
impute_cg_to_dataset,
)
from policyengine_uk_data.utils.reweight import reweight

try:
import torch
Expand All @@ -24,8 +25,7 @@ def generate(self):
# Capital gains imputation

impute_cg_to_dataset(self)

self.save_dataset(data)
data = self.load_dataset()

self.add_random_variables(data)

Expand Down Expand Up @@ -84,69 +84,6 @@ class EnhancedFRS_2022_23(EnhancedFRS):
input_frs = ExtendedFRS_2022_23
time_period = 2022
end_year = 2028
url = "hf://policyengine/policyengine-uk-data"


def reweight(
    original_weights,
    loss_matrix,
    targets_array,
    dropout_rate=0.05,
    epochs=10_000,
    learning_rate=1e-1,
):
    """Adjust weights so that weighted column totals move towards targets.

    The weights are optimised in log space with Adam, so the returned values
    are ``exp`` of the optimised parameters.

    Args:
        original_weights: Starting weights, one per row of ``loss_matrix``.
            Their logarithm is taken, so they are assumed to be strictly
            positive (TODO confirm with callers).
        loss_matrix: Table exposing ``.values`` (e.g. a pandas DataFrame) with
            one column per target.
        targets_array: Target value for each column of ``loss_matrix``.
        dropout_rate: Probability with which each log-weight is replaced, for
            one step, by the mean of the log-weights that were kept. ``0``
            disables dropout.
        epochs: Number of optimisation steps (previously fixed at 10,000).
        learning_rate: Adam learning rate (previously fixed at 0.1).

    Returns:
        A numpy array with the calibrated weights.

    Raises:
        ValueError: If NaNs appear in the weights, the loss matrix, the
            estimates or the relative errors.
    """
    loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32)
    targets_array = torch.tensor(targets_array, dtype=torch.float32)
    weights = torch.tensor(
        np.log(original_weights), requires_grad=True, dtype=torch.float32
    )

    # The loss matrix never changes during optimisation, so validate the
    # inputs once up front rather than on every step.
    if torch.isnan(weights).any():
        raise ValueError("Weights contain NaNs")
    if torch.isnan(loss_matrix).any():
        raise ValueError("Loss matrix contains NaNs")

    # TODO: replace this with a call to the python reweight.py package.
    def loss(candidate_weights):
        if torch.isnan(candidate_weights).any():
            raise ValueError("Weights contain NaNs")
        estimate = candidate_weights @ loss_matrix
        if torch.isnan(estimate).any():
            raise ValueError("Estimate contains NaNs")
        # NOTE(review): because of the +1 in the numerator, a perfect match
        # does not give a zero error - confirm this is intended.
        rel_error = (
            ((estimate - targets_array) + 1) / (targets_array + 1)
        ) ** 2
        if torch.isnan(rel_error).any():
            raise ValueError("Relative error contains NaNs")
        return rel_error.mean()

    def dropout_weights(log_weights, p):
        if p == 0:
            return log_weights
        # Replace p% of the weights with the mean value of the rest of them
        mask = torch.rand_like(log_weights) < p
        mean = log_weights[~mask].mean()
        masked_weights = log_weights.clone()
        masked_weights[mask] = mean
        return masked_weights

    optimizer = torch.optim.Adam([weights], lr=learning_rate)

    try:
        from tqdm import trange
    except ImportError:
        # The progress bar is cosmetic; calibration still runs without tqdm.
        iterator = range(epochs)
        report_progress = None
    else:
        iterator = trange(epochs)
        report_progress = iterator.set_postfix

    start_loss = None
    for _ in iterator:
        optimizer.zero_grad()
        step_weights = dropout_weights(weights, dropout_rate)
        current_loss = loss(torch.exp(step_weights))
        loss_value = current_loss.item()
        if start_loss is None:
            start_loss = loss_value
        # Guard the purely informational ratio against a zero starting loss.
        loss_rel_change = (
            (loss_value - start_loss) / start_loss if start_loss else 0.0
        )
        current_loss.backward()
        if report_progress is not None:
            report_progress(
                {"loss": loss_value, "loss_rel_change": loss_rel_change}
            )
        optimizer.step()

    return torch.exp(weights).detach().numpy()


if __name__ == "__main__":
Expand Down
1 change: 0 additions & 1 deletion policyengine_uk_data/datasets/frs/frs.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,6 @@ class FRS_2022_23(FRS):
label = "FRS (2022-23)"
file_path = STORAGE_FOLDER / "frs_2022_23.h5"
time_period = 2022
url = "hf://policyengine/policyengine-uk-data"


def add_id_variables(frs: h5py.File, person: DataFrame, household: DataFrame):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import torch
from policyengine_uk import Microsimulation
import pandas as pd
import numpy as np
from tqdm import tqdm
import h5py
from policyengine_uk_data.datasets.frs.local_areas.constituencies.transform_constituencies import (
transform_2010_to_2024,
)

# Fill in missing constituencies with average column values
import pandas as pd
import numpy as np

from policyengine_uk_data.datasets.frs.local_areas.constituencies.loss import (
create_constituency_target_matrix,
create_national_target_matrix,
)
from pathlib import Path
from policyengine_uk_data.storage import STORAGE_FOLDER

FOLDER = Path(__file__).parent


def calibrate():
    """Fit per-constituency household weights and store them in an HDF5 file.

    A (constituency x household) weight matrix is optimised in log space with
    Adam so that weighted household metrics approach both the constituency
    targets and the national targets. The fitted weights are then passed
    through the 2010-to-2024 mapping matrix and written as dataset ``"2025"``
    to ``parliamentary_constituency_weights.h5`` in ``STORAGE_FOLDER``.
    """
    matrix, y = create_constituency_target_matrix("enhanced_frs_2022_23", 2025)

    m_national, y_national = create_national_target_matrix(
        "enhanced_frs_2022_23", 2025
    )

    sim = Microsimulation(dataset="enhanced_frs_2022_23")

    COUNT_CONSTITUENCIES = 650

    # Every constituency starts from an equal share of the national household
    # weights, held as logs so that exp() keeps the fitted weights positive.
    # Weights - 650 x 100180 (household count as noted by the original author)
    original_weights = np.log(
        sim.calculate("household_weight", 2025).values / COUNT_CONSTITUENCIES
    )
    weights = torch.tensor(
        np.ones((COUNT_CONSTITUENCIES, len(original_weights)))
        * original_weights,
        dtype=torch.float32,
        requires_grad=True,
    )
    metrics = torch.tensor(matrix.values, dtype=torch.float32)
    y = torch.tensor(y.values, dtype=torch.float32)
    matrix_national = torch.tensor(m_national.values, dtype=torch.float32)
    y_national = torch.tensor(y_national.values, dtype=torch.float32)

    def loss(w):
        # (constituency x household) @ (household x metric). A plain matrix
        # product gives the same totals as broadcasting to a 3-D tensor and
        # summing over households, without materialising that tensor.
        pred_c = w @ metrics
        mse_c = torch.mean((pred_c / (1 + y) - 1) ** 2)

        # National totals use each household's weight summed over all
        # constituencies.
        pred_n = w.sum(dim=0) @ matrix_national
        mse_n = torch.mean((pred_n / (1 + y_national) - 1) ** 2)

        return mse_c + mse_n

    optimizer = torch.optim.Adam([weights], lr=0.1)

    for epoch in range(512):
        optimizer.zero_grad()
        current_loss = loss(torch.exp(weights))
        current_loss.backward()
        optimizer.step()
        if epoch % 50 == 0:
            print(f"Loss: {current_loss.item()}, Epoch: {epoch}")

    final_weights = torch.exp(weights).detach().numpy()
    mapping_matrix = pd.read_csv(
        FOLDER / "mapping_2010_to_2024" / "mapping_matrix.csv"
    )
    final_weights = update_weights(final_weights, mapping_matrix)

    with h5py.File(
        STORAGE_FOLDER / "parliamentary_constituency_weights.h5", "w"
    ) as f:
        f.create_dataset("2025", data=final_weights)


def update_weights(weights, mapping_matrix):
    """Project weights through a column-normalised mapping matrix.

    The first column of ``mapping_matrix`` becomes its index. Each remaining
    column is divided by its own sum, and the transposed result is
    matrix-multiplied with ``weights``.

    Returns:
        A DataFrame with one row per mapping-matrix column.
    """
    key_column = mapping_matrix.columns[0]
    indexed = mapping_matrix.set_index(key_column)
    column_totals = indexed.sum()
    normalised = indexed.div(column_totals, axis="columns")
    return normalised.T.dot(weights)


if __name__ == "__main__":
calibrate()
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import torch
from policyengine_uk import Microsimulation
import pandas as pd
import numpy as np

# Fill in missing constituencies with average column values
import pandas as pd
import numpy as np
from pathlib import Path

from policyengine_uk_data.utils.loss import (
create_target_matrix as create_national_target_matrix,
)

FOLDER = Path(__file__).parent


def create_constituency_target_matrix(
    dataset: str = "enhanced_frs_2022_23", time_period: int = 2025, reform=None
):
    """Build the household metric matrix and the matching target table.

    Targets are read from ``age.csv``, ``total_income.csv`` and
    ``employment_income.csv`` in the ``targets`` folder next to this module.
    Person-level simulation results are mapped to households for every target
    column.

    Args:
        dataset: Dataset name handed to ``Microsimulation``.
        time_period: Default calculation period set on the simulation.
        reform: Optional reform handed to ``Microsimulation``.

    Returns:
        ``(matrix, y)``: two DataFrames sharing the same column labels.
        ``matrix`` holds the household-level metrics; ``y`` holds the target
        values read from the CSV files (presumably one row per constituency -
        verify against the target files).
    """
    targets_dir = FOLDER / "targets"
    ages = pd.read_csv(targets_dir / "age.csv")
    incomes = pd.read_csv(targets_dir / "total_income.csv")
    employment_incomes = pd.read_csv(targets_dir / "employment_income.csv")

    sim = Microsimulation(dataset=dataset, reform=reform)
    sim.default_calculation_period = time_period

    def to_household(person_values):
        # Map a person-level array to the household level.
        return sim.map_result(person_values, "person", "household")

    matrix = pd.DataFrame()
    y = pd.DataFrame()

    # Total income: amount and number of people with non-zero income.
    total_income = sim.calculate("total_income").values
    matrix["hmrc/total_income/amount"] = to_household(total_income)
    y["hmrc/total_income/amount"] = incomes["total_income_amount"]

    matrix["hmrc/total_income/count"] = to_household(total_income != 0)
    y["hmrc/total_income/count"] = incomes["total_income_count"]

    # Population counts in ten-year age bands from 0 up to (not including) 80.
    age = sim.calculate("age").values
    for lower_age in range(0, 80, 10):
        upper_age = lower_age + 10
        column = f"age/{lower_age}_{upper_age}"

        matrix[column] = to_household((age >= lower_age) & (age < upper_age))

        # The age targets have one column per single year of age.
        single_year_columns = [
            str(year) for year in range(lower_age, upper_age)
        ]
        y[column] = ages[single_year_columns].sum(axis=1).values

    # Employment income bands, delimited by the lower bounds in the targets.
    employment_income = sim.calculate("employment_income").values
    lower_bounds = (
        employment_incomes.employment_income_lower_bound.sort_values().unique()
    )
    bounds = list(lower_bounds) + [np.inf]

    for lower_bound, upper_bound in zip(bounds[:-1], bounds[1:]):
        # Only bands starting in [12,570, 70,000) are used as targets.
        if lower_bound >= 70_000 or lower_bound < 12_570:
            continue

        in_bound = (
            (employment_income >= lower_bound)
            & (employment_income < upper_bound)
            & (employment_income != 0)
            & (age >= 16)
        )
        band_str = f"{lower_bound}_{upper_bound}"
        band_targets = employment_incomes[
            (employment_incomes.employment_income_lower_bound == lower_bound)
            & (employment_incomes.employment_income_upper_bound == upper_bound)
        ]

        count_column = f"hmrc/employment_income/count/{band_str}"
        matrix[count_column] = to_household(in_bound)
        y[count_column] = band_targets.employment_income_count.values

        amount_column = f"hmrc/employment_income/amount/{band_str}"
        matrix[amount_column] = to_household(employment_income * in_bound)
        y[amount_column] = band_targets.employment_income_amount.values

    return matrix, y
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import pandas as pd
import numpy as np

# Build the 2010 -> 2024 constituency mapping matrix: rows are 2010 codes
# (PCON10CD), columns are 2024 codes (PCON24CD), and each 2024 column spreads
# a weight of 1 equally over the 2010 constituencies mapped to it.
mapping_raw = pd.read_csv("mapping_2010_to_2024/mapping_raw.csv")
mapping_raw = mapping_raw.sort_values(["PCON10CD", "PCON24CD"])
mapping_raw = mapping_raw.reset_index(drop=True)

# Unique codes define the matrix axes
unique_pcon10 = mapping_raw["PCON10CD"].unique()
unique_pcon24 = mapping_raw["PCON24CD"].unique()

# Start from float zeros: the cells receive fractional weights below, and
# writing floats into an integer frame relies on a deprecated pandas upcast.
# (An earlier 0/1 fill of this matrix was discarded before use and is gone.)
mapping_matrix = pd.DataFrame(
    0.0, index=unique_pcon10, columns=unique_pcon24
)

# Let's check the first constituency to see what's happening
example_pcon24 = unique_pcon24[0]
print(f"Example 2024 constituency: {example_pcon24}")

# Check if we can find it in mapping_raw
matching_rows = mapping_raw[mapping_raw["PCON24CD"] == example_pcon24]
print("\nMatching rows found:", len(matching_rows))
print(matching_rows)

# Now fill the matrix with proper checks
for pcon24 in unique_pcon24:
    # Get matching 2010 constituencies
    matching_2010 = mapping_raw.loc[
        mapping_raw["PCON24CD"] == pcon24, "PCON10CD"
    ]

    if len(matching_2010) > 0:  # Check if we found any matches
        weight = 1 / len(matching_2010)
        mapping_matrix.loc[matching_2010, pcon24] = weight
    else:
        # Reached when the code cannot be matched by equality (e.g. missing)
        print(f"No matches found for {pcon24}")

# Verify results
print("\nFirst few rows of result:")
print(mapping_matrix.head())
print("\nColumn sums (should be 1):")
print(mapping_matrix.sum().head())

# Show non-zero entries for first column
first_col = mapping_matrix[mapping_matrix.columns[0]]
print("\nNon-zero entries in first column:")
print(first_col[first_col > 0])

mapping_matrix.to_csv("mapping_2010_to_2024/mapping_matrix.csv")
Loading