Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add constituency methodology documentation #57

Merged
merged 14 commits into from
Dec 23, 2024
Merged
14 changes: 14 additions & 0 deletions .github/workflows/pull_request.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ jobs:
test:
name: Build and test
runs-on: ubuntu-latest
env:
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
steps:
- name: Checkout code
uses: actions/checkout@v4
Expand All @@ -47,7 +49,19 @@ jobs:
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
- name: Build datasets
run: make data
env:
DATA_LITE: true
- name: Run tests
run: pytest
- name: Test documentation builds
run: make documentation

- name: Check documentation build
run: |
for notebook in $(find docs/_build/jupyter_execute -name "*.ipynb"); do
if grep -q '"output_type": "error"' "$notebook"; then
echo "Error found in $notebook"
cat "$notebook"
exit 1
fi
done
2 changes: 2 additions & 0 deletions .github/workflows/push.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ jobs:
test:
name: Build and test
runs-on: ubuntu-latest
env:
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
steps:
- name: Checkout code
uses: actions/checkout@v4
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ test:

install:
pip install policyengine-uk
pip install policyengine
pip install policyengine>=2.4
pip install -e ".[dev]" --config-settings editable_mode=compat

download:
Expand All @@ -22,7 +22,7 @@ docker:

documentation:
jb clean docs && jb build docs
python docs/add_plotly_to_book.py docs/book
python docs/add_plotly_to_book.py docs

data:
python policyengine_uk_data/datasets/frs/dwp_frs.py
Expand Down
1 change: 1 addition & 0 deletions docs/_toc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ chapters:
- file: validation/constituencies.ipynb
- file: validation/local_authorities.ipynb
- file: pension_contributions.ipynb
- file: constituency_methodology.ipynb
5,697 changes: 5,697 additions & 0 deletions docs/constituency_methodology.ipynb

Large diffs are not rendered by default.

Binary file added docs/pictures/earning_dist.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/pictures/nomis_screenshot1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/pictures/parliamentary_earnings.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
148 changes: 77 additions & 71 deletions docs/validation/constituencies.ipynb

Large diffs are not rendered by default.

81 changes: 32 additions & 49 deletions docs/validation/local_authorities.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np
from tqdm import tqdm
import h5py
import os
from policyengine_uk_data.datasets.frs.local_areas.constituencies.transform_constituencies import (
transform_2010_to_2024,
)
Expand Down Expand Up @@ -57,6 +58,18 @@ def loss(w):

return mse_c + mse_n

def pct_close(w, t=0.1):
    """Return the fraction of calibration targets hit to within tolerance.

    A target counts as "hit" when the weighted prediction is within ``t``
    (relative terms) of it. Constituency-level and national-level targets
    are pooled into one fraction in [0, 1].

    Relies on ``metrics``, ``y``, ``matrix_national`` and ``y_national``
    from the enclosing script scope.

    NOTE(review): the relative error here is ``pred / (1 + y) - 1``, which
    differs slightly from the ``(pred - y) / (1 + y)`` form used in
    reweight.py — confirm this is intentional.
    """
    # Constituency-level: predictions per (constituency, metric) pair.
    constituency_pred = (w.unsqueeze(-1) * metrics.unsqueeze(0)).sum(dim=1)
    hits_constituency = torch.sum(
        torch.abs(constituency_pred / (1 + y) - 1) < t
    )
    n_constituency = constituency_pred.shape[0] * constituency_pred.shape[1]

    # National-level: collapse weights over constituencies first.
    national_pred = (w.sum(axis=0) * matrix_national.T).sum(axis=1)
    hits_national = torch.sum(
        torch.abs(national_pred / (1 + y_national) - 1) < t
    )
    n_national = national_pred.shape[0]

    return (hits_constituency + hits_national) / (n_constituency + n_national)

def dropout_weights(weights, p):
if p == 0:
return weights
Expand All @@ -69,16 +82,17 @@ def dropout_weights(weights, p):

optimizer = torch.optim.Adam([weights], lr=0.1)

desc = range(512)
desc = range(32) if os.environ.get("DATA_LITE") else range(256)

for epoch in desc:
optimizer.zero_grad()
weights_ = dropout_weights(weights, 0.05)
l = loss(torch.exp(weights_))
l.backward()
optimizer.step()
if epoch % 50 == 0:
print(f"Loss: {l.item()}, Epoch: {epoch}")
close = pct_close(torch.exp(weights_))
if epoch % 10 == 0:
print(f"Loss: {l.item()}, Epoch: {epoch}, Within 10%: {close:.2%}")

final_weights = torch.exp(weights).detach().numpy()
mapping_matrix = pd.read_csv(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
import numpy as np
from tqdm import tqdm
import h5py
import os
from policyengine_uk_data.storage import STORAGE_FOLDER


from loss import (
from policyengine_uk_data.datasets.frs.local_areas.local_authorities.loss import (
create_local_authority_target_matrix,
create_national_target_matrix,
)
Expand Down Expand Up @@ -50,6 +51,18 @@ def loss(w):

return mse_c + mse_n

def pct_close(w, t=0.1):
    """Return the fraction of calibration targets hit to within tolerance.

    A target counts as "hit" when the weighted prediction lies within ``t``
    (relative terms) of it. Local-authority-level and national-level targets
    are pooled into a single fraction in [0, 1].

    Relies on ``metrics``, ``y``, ``matrix_national`` and ``y_national``
    from the enclosing script scope.

    NOTE(review): relative error is computed as ``pred / (1 + y) - 1``,
    not ``(pred - y) / (1 + y)`` as in reweight.py — confirm intended.
    """
    # Local-authority-level: predictions per (area, metric) pair.
    area_pred = (w.unsqueeze(-1) * metrics.unsqueeze(0)).sum(dim=1)
    hits_area = torch.sum(torch.abs(area_pred / (1 + y) - 1) < t)
    n_area = area_pred.shape[0] * area_pred.shape[1]

    # National-level: collapse weights over areas before comparing.
    national_pred = (w.sum(axis=0) * matrix_national.T).sum(axis=1)
    hits_national = torch.sum(
        torch.abs(national_pred / (1 + y_national) - 1) < t
    )
    n_national = national_pred.shape[0]

    return (hits_area + hits_national) / (n_area + n_national)

def dropout_weights(weights, p):
if p == 0:
return weights
Expand All @@ -62,16 +75,17 @@ def dropout_weights(weights, p):

optimizer = torch.optim.Adam([weights], lr=0.1)

desc = range(512)
desc = range(32) if os.environ.get("DATA_LITE") else range(256)

for epoch in desc:
optimizer.zero_grad()
weights_ = dropout_weights(weights, 0.05)
l = loss(torch.exp(weights_))
l.backward()
optimizer.step()
if epoch % 50 == 0:
print(f"Loss: {l.item()}, Epoch: {epoch}")
close = pct_close(torch.exp(weights_))
if epoch % 10 == 0:
print(f"Loss: {l.item()}, Epoch: {epoch}, Within 10%: {close:.2%}")

if epoch % 100 == 0:
final_weights = torch.exp(weights).detach().numpy()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,5 @@ def extract_zipped_folder(folder):
repo_filename=file.name,
local_folder=file.parent,
)
print(f"Extracting {file}")
extract_zipped_folder(file)
file.unlink()
2 changes: 1 addition & 1 deletion policyengine_uk_data/utils/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ def download(
token = os.environ.get(
"HUGGING_FACE_TOKEN",
)
login(token=token)

hf_hub_download(
repo_id=repo,
repo_type="model",
filename=repo_filename,
local_dir=local_folder,
revision=version,
token=token,
)


Expand Down
14 changes: 12 additions & 2 deletions policyengine_uk_data/utils/reweight.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
import torch
import os


def reweight(
Expand Down Expand Up @@ -32,6 +33,12 @@ def loss(weights):
raise ValueError("Relative error contains NaNs")
return rel_error.mean()

def pct_close(weights, t=0.1):
    """Return the fraction of targets whose estimate is within ``t`` (relative).

    Uses ``loss_matrix`` and ``targets_array`` from the enclosing
    ``reweight`` scope; the ``1 +`` in the denominator guards against
    zero-valued targets.
    """
    estimate = weights @ loss_matrix
    rel_error = (estimate - targets_array) / (1 + targets_array)
    hits = (torch.abs(rel_error) < t).sum()
    return hits / rel_error.numel()

def dropout_weights(weights, p):
if p == 0:
return weights
Expand All @@ -47,17 +54,20 @@ def dropout_weights(weights, p):

start_loss = None

iterator = range(1_000)
iterator = range(128) if os.environ.get("DATA_LITE") else range(2048)
for i in iterator:
optimizer.zero_grad()
weights_ = dropout_weights(weights, dropout_rate)
l = loss(torch.exp(weights_))
close = pct_close(torch.exp(weights_))
if start_loss is None:
start_loss = l.item()
loss_rel_change = (l.item() - start_loss) / start_loss
l.backward()
if i % 100 == 0:
print(f"Loss: {l.item()}, Rel change: {loss_rel_change}")
print(
f"Loss: {l.item()}, Rel change: {loss_rel_change}, Epoch: {i}, Within 10%: {close:.2%}"
)
optimizer.step()

return torch.exp(weights).detach().numpy()
Loading