Calibration improvements (#36)

* Increase epochs per year to 10k
* Update data URLs
* Add calibration improvements
PolicyEngine · Oct 18, 2024 · b8d68f3
1 parent a5dbc4d
commit b8d68f3
Show file tree

Hide file tree

Showing 11 changed files with 450 additions and 74 deletions.
diff --git a/.gitignore b/.gitignore
@@ -11,4 +11,5 @@
 !incomes.csv
 !tax_benefit.csv
 !demographics.csv
+!incomes_projection.csv
 **/_build
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.6.0] - 2024-10-18 16:05:10
+
+### Added
+
+- Future year income targeting.
+- Random takeup variable values.
+
 ## [1.5.0] - 2024-10-16 17:05:58
 
 ### Added
@@ -66,6 +73,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 
 
+[1.6.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.5.0...1.6.0
 [1.5.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.4.0...1.5.0
 [1.4.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.3.0...1.4.0
 [1.3.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.2.5...1.3.0

diff --git a/changelog.yaml b/changelog.yaml
@@ -54,3 +54,9 @@
     added:
     - Moved epoch count to 10k per year.
   date: 2024-10-16 17:05:58
+- bump: minor
+  changes:
+    added:
+    - Future year income targeting.
+    - Random takeup variable values.
+  date: 2024-10-18 16:05:10
diff --git a/policyengine_uk_data/datasets/frs/enhanced_frs.py b/policyengine_uk_data/datasets/frs/enhanced_frs.py
@@ -11,6 +11,7 @@
 
 try:
     import torch
+    from policyengine_uk_data.utils.reweight import reweight
 except ImportError:
     torch = None
 
@@ -59,68 +60,6 @@ class EnhancedFRS_2022_23(EnhancedFRS):
     url = "release://PolicyEngine/ukda/1.5.0/enhanced_frs_2022_23.h5"
 
 
-def reweight(
-    original_weights,
-    loss_matrix,
-    targets_array,
-    dropout_rate=0.05,
-):
-    target_names = np.array(loss_matrix.columns)
-    loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32)
-    targets_array = torch.tensor(targets_array, dtype=torch.float32)
-    weights = torch.tensor(
-        np.log(original_weights), requires_grad=True, dtype=torch.float32
-    )
-
-    # TODO: replace this with a call to the python reweight.py package.
-    def loss(weights):
-        # Check for Nans in either the weights or the loss matrix
-        if torch.isnan(weights).any():
-            raise ValueError("Weights contain NaNs")
-        if torch.isnan(loss_matrix).any():
-            raise ValueError("Loss matrix contains NaNs")
-        estimate = weights @ loss_matrix
-        if torch.isnan(estimate).any():
-            raise ValueError("Estimate contains NaNs")
-        rel_error = (
-            ((estimate - targets_array) + 1) / (targets_array + 1)
-        ) ** 2
-        if torch.isnan(rel_error).any():
-            raise ValueError("Relative error contains NaNs")
-        return rel_error.mean()
-
-    def dropout_weights(weights, p):
-        if p == 0:
-            return weights
-        # Replace p% of the weights with the mean value of the rest of them
-        mask = torch.rand_like(weights) < p
-        mean = weights[~mask].mean()
-        masked_weights = weights.clone()
-        masked_weights[mask] = mean
-        return masked_weights
-
-    optimizer = torch.optim.Adam([weights], lr=1e-1)
-    from tqdm import trange
-
-    start_loss = None
-
-    iterator = trange(10_000)
-    for i in iterator:
-        optimizer.zero_grad()
-        weights_ = dropout_weights(weights, dropout_rate)
-        l = loss(torch.exp(weights_))
-        if start_loss is None:
-            start_loss = l.item()
-        loss_rel_change = (l.item() - start_loss) / start_loss
-        l.backward()
-        iterator.set_postfix(
-            {"loss": l.item(), "loss_rel_change": loss_rel_change}
-        )
-        optimizer.step()
-
-    return torch.exp(weights).detach().numpy()
-
-
 if __name__ == "__main__":
     ReweightedFRS_2022_23().generate()
     EnhancedFRS_2022_23().generate()
diff --git a/policyengine_uk_data/datasets/frs/frs.py b/policyengine_uk_data/datasets/frs/frs.py
@@ -92,6 +92,29 @@ def generate(self):
 
         self.save_dataset(frs)
 
+        self.add_random_variables(frs)
+
+    def add_random_variables(self, frs: dict):
+        from policyengine_uk import Microsimulation
+
+        simulation = Microsimulation(dataset=self)
+        RANDOM_VARIABLES = [
+            "attends_private_school",
+            "would_evade_tv_licence_fee",
+            "would_claim_pc",
+            "would_claim_uc",
+            "would_claim_child_benefit",
+            "main_residential_property_purchased_is_first_home",
+            "household_owns_tv",
+            "is_higher_earner",
+        ]
+        INPUT_PERIODS = list(range(self.time_period, self.time_period + 10))
+        for variable in RANDOM_VARIABLES:
+            value = simulation.calculate(variable, self.time_period).values
+            frs[variable] = {period: value for period in INPUT_PERIODS}
+
+        self.save_dataset(frs)
+
 
 class FRS_2020_21(FRS):
     dwp_frs = DWP_FRS_2020_21

diff --git a/policyengine_uk_data/datasets/spi.py b/policyengine_uk_data/datasets/spi.py
@@ -64,8 +64,29 @@ def generate(self):
         data["savings_starter_rate_income"] = np.zeros(len(df))
         data["capital_allowances"] = df.CAPALL
         data["loss_relief"] = df.LOSSBF
-        data["is_SP_age"] = df.SPA == 1
-        data["state_pension"] = df.SRP
+
+        AGE_RANGES = {
+            -1: (16, 70),
+            1: (16, 25),
+            2: (25, 35),
+            3: (35, 45),
+            4: (45, 55),
+            5: (55, 65),
+            6: (65, 74),
+            7: (74, 90),
+        }
+        age_range = df.AGERANGE
+
+        # Randomly assign ages in age ranges
+
+        percent_along_age_range = np.random.rand(len(df))
+        min_age = np.array([AGE_RANGES[age][0] for age in age_range])
+        max_age = np.array([AGE_RANGES[age][1] for age in age_range])
+        data["age"] = (
+            min_age + (max_age - min_age) * percent_along_age_range
+        ).astype(int)
+
+        data["state_pension_reported"] = df.SRP
         data["other_tax_credits"] = df.TAX_CRED
         data["miscellaneous_income"] = (
             df.MOTHINC