Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add constituency methodology documentation #57

Merged
merged 14 commits into from
Dec 23, 2024
Merged
14 changes: 14 additions & 0 deletions .github/workflows/pull_request.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ jobs:
test:
name: Build and test
runs-on: ubuntu-latest
env:
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
steps:
- name: Checkout code
uses: actions/checkout@v4
Expand All @@ -47,7 +49,19 @@ jobs:
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
- name: Build datasets
run: make data
env:
DATA_LITE: true
- name: Run tests
run: pytest
- name: Test documentation builds
run: make documentation

- name: Check documentation build
run: |
for notebook in $(find docs/_build/jupyter_execute -name "*.ipynb"); do
if grep -q '"output_type": "error"' "$notebook"; then
echo "Error found in $notebook"
cat "$notebook"
exit 1
fi
done
2 changes: 2 additions & 0 deletions .github/workflows/push.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ jobs:
test:
name: Build and test
runs-on: ubuntu-latest
env:
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
steps:
- name: Checkout code
uses: actions/checkout@v4
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ test:

install:
pip install policyengine-uk
pip install policyengine
pip install policyengine>=2.4
pip install -e ".[dev]" --config-settings editable_mode=compat

download:
Expand All @@ -22,7 +22,7 @@ docker:

documentation:
jb clean docs && jb build docs
python docs/add_plotly_to_book.py docs/book
python docs/add_plotly_to_book.py docs

data:
python policyengine_uk_data/datasets/frs/dwp_frs.py
Expand Down
1 change: 1 addition & 0 deletions docs/_toc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ chapters:
- file: validation/constituencies.ipynb
- file: validation/local_authorities.ipynb
- file: pension_contributions.ipynb
- file: constituency_methodology.ipynb
5,697 changes: 5,697 additions & 0 deletions docs/constituency_methodology.ipynb

Large diffs are not rendered by default.

Binary file added docs/pictures/earning_dist.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/pictures/nomis_screenshot1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/pictures/parliamentary_earnings.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
148 changes: 77 additions & 71 deletions docs/validation/constituencies.ipynb

Large diffs are not rendered by default.

81 changes: 32 additions & 49 deletions docs/validation/local_authorities.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np
from tqdm import tqdm
import h5py
import os
from policyengine_uk_data.datasets.frs.local_areas.constituencies.transform_constituencies import (
transform_2010_to_2024,
)
Expand Down Expand Up @@ -57,6 +58,18 @@ def loss(w):

return mse_c + mse_n

def pct_close(w, t=0.1):
    """Return the fraction of calibration targets hit to within tolerance.

    A target counts as "hit" when the weighted prediction is within ``t``
    (relative terms) of it. Constituency-level and national-level targets
    are pooled into one fraction in [0, 1].

    Relies on ``metrics``, ``y``, ``matrix_national`` and ``y_national``
    from the enclosing script scope.

    NOTE(review): the relative error here is ``pred / (1 + y) - 1``, which
    differs slightly from the ``(pred - y) / (1 + y)`` form used in
    reweight.py — confirm this is intentional.
    """
    # Constituency-level: predictions per (constituency, metric) pair.
    constituency_pred = (w.unsqueeze(-1) * metrics.unsqueeze(0)).sum(dim=1)
    hits_constituency = torch.sum(
        torch.abs(constituency_pred / (1 + y) - 1) < t
    )
    n_constituency = constituency_pred.shape[0] * constituency_pred.shape[1]

    # National-level: collapse weights over constituencies first.
    national_pred = (w.sum(axis=0) * matrix_national.T).sum(axis=1)
    hits_national = torch.sum(
        torch.abs(national_pred / (1 + y_national) - 1) < t
    )
    n_national = national_pred.shape[0]

    return (hits_constituency + hits_national) / (n_constituency + n_national)

def dropout_weights(weights, p):
if p == 0:
return weights
Expand All @@ -69,16 +82,17 @@ def dropout_weights(weights, p):

optimizer = torch.optim.Adam([weights], lr=0.1)

desc = range(512)
desc = range(32) if os.environ.get("DATA_LITE") else range(256)

for epoch in desc:
optimizer.zero_grad()
weights_ = dropout_weights(weights, 0.05)
l = loss(torch.exp(weights_))
l.backward()
optimizer.step()
if epoch % 50 == 0:
print(f"Loss: {l.item()}, Epoch: {epoch}")
close = pct_close(torch.exp(weights_))
if epoch % 10 == 0:
print(f"Loss: {l.item()}, Epoch: {epoch}, Within 10%: {close:.2%}")

final_weights = torch.exp(weights).detach().numpy()
mapping_matrix = pd.read_csv(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
import numpy as np
from tqdm import tqdm
import h5py
import os
from policyengine_uk_data.storage import STORAGE_FOLDER


from loss import (
from policyengine_uk_data.datasets.frs.local_areas.local_authorities.loss import (
create_local_authority_target_matrix,
create_national_target_matrix,
)
Expand Down Expand Up @@ -50,6 +51,18 @@ def loss(w):

return mse_c + mse_n

def pct_close(w, t=0.1):
    """Return the fraction of calibration targets hit to within tolerance.

    A target counts as "hit" when the weighted prediction lies within ``t``
    (relative terms) of it. Local-authority-level and national-level targets
    are pooled into a single fraction in [0, 1].

    Relies on ``metrics``, ``y``, ``matrix_national`` and ``y_national``
    from the enclosing script scope.

    NOTE(review): relative error is computed as ``pred / (1 + y) - 1``,
    not ``(pred - y) / (1 + y)`` as in reweight.py — confirm intended.
    """
    # Local-authority-level: predictions per (area, metric) pair.
    area_pred = (w.unsqueeze(-1) * metrics.unsqueeze(0)).sum(dim=1)
    hits_area = torch.sum(torch.abs(area_pred / (1 + y) - 1) < t)
    n_area = area_pred.shape[0] * area_pred.shape[1]

    # National-level: collapse weights over areas before comparing.
    national_pred = (w.sum(axis=0) * matrix_national.T).sum(axis=1)
    hits_national = torch.sum(
        torch.abs(national_pred / (1 + y_national) - 1) < t
    )
    n_national = national_pred.shape[0]

    return (hits_area + hits_national) / (n_area + n_national)

def dropout_weights(weights, p):
if p == 0:
return weights
Expand All @@ -62,16 +75,17 @@ def dropout_weights(weights, p):

optimizer = torch.optim.Adam([weights], lr=0.1)

desc = range(512)
desc = range(32) if os.environ.get("DATA_LITE") else range(256)

for epoch in desc:
optimizer.zero_grad()
weights_ = dropout_weights(weights, 0.05)
l = loss(torch.exp(weights_))
l.backward()
optimizer.step()
if epoch % 50 == 0:
print(f"Loss: {l.item()}, Epoch: {epoch}")
close = pct_close(torch.exp(weights_))
if epoch % 10 == 0:
print(f"Loss: {l.item()}, Epoch: {epoch}, Within 10%: {close:.2%}")

if epoch % 100 == 0:
final_weights = torch.exp(weights).detach().numpy()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,5 @@ def extract_zipped_folder(folder):
repo_filename=file.name,
local_folder=file.parent,
)
print(f"Extracting {file}")
extract_zipped_folder(file)
file.unlink()
2 changes: 1 addition & 1 deletion policyengine_uk_data/utils/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ def download(
token = os.environ.get(
"HUGGING_FACE_TOKEN",
)
login(token=token)

hf_hub_download(
repo_id=repo,
repo_type="model",
filename=repo_filename,
local_dir=local_folder,
revision=version,
token=token,
)


Expand Down
14 changes: 12 additions & 2 deletions policyengine_uk_data/utils/reweight.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
import torch
import os


def reweight(
Expand Down Expand Up @@ -32,6 +33,12 @@ def loss(weights):
raise ValueError("Relative error contains NaNs")
return rel_error.mean()

def pct_close(weights, t=0.1):
    """Return the fraction of targets whose estimate is within ``t`` (relative).

    Uses ``loss_matrix`` and ``targets_array`` from the enclosing
    ``reweight`` scope; the ``1 +`` in the denominator guards against
    zero-valued targets.
    """
    estimate = weights @ loss_matrix
    rel_error = (estimate - targets_array) / (1 + targets_array)
    hits = (torch.abs(rel_error) < t).sum()
    return hits / rel_error.numel()

def dropout_weights(weights, p):
if p == 0:
return weights
Expand All @@ -47,17 +54,20 @@ def dropout_weights(weights, p):

start_loss = None

iterator = range(1_000)
iterator = range(128) if os.environ.get("DATA_LITE") else range(2048)
for i in iterator:
optimizer.zero_grad()
weights_ = dropout_weights(weights, dropout_rate)
l = loss(torch.exp(weights_))
close = pct_close(torch.exp(weights_))
if start_loss is None:
start_loss = l.item()
loss_rel_change = (l.item() - start_loss) / start_loss
l.backward()
if i % 100 == 0:
print(f"Loss: {l.item()}, Rel change: {loss_rel_change}")
print(
f"Loss: {l.item()}, Rel change: {loss_rel_change}, Epoch: {i}, Within 10%: {close:.2%}"
)
optimizer.step()

return torch.exp(weights).detach().numpy()
Loading