Skip to content

Commit

Permalink
Calibration improvements (#36)
Browse files Browse the repository at this point in the history
* Increase epochs per year to 10k

* Update data urls

* Add calibration improvements
  • Loading branch information
nikhilwoodruff authored Oct 18, 2024
1 parent a5dbc4d commit b8d68f3
Show file tree
Hide file tree
Showing 11 changed files with 450 additions and 74 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@
!incomes.csv
!tax_benefit.csv
!demographics.csv
!incomes_projection.csv
**/_build
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.6.0] - 2024-10-18 16:05:10

### Added

- Future year income targeting.
- Random takeup variable values.

## [1.5.0] - 2024-10-16 17:05:58

### Added
Expand Down Expand Up @@ -66,6 +73,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0



[1.6.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.5.0...1.6.0
[1.5.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.4.0...1.5.0
[1.4.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.3.0...1.4.0
[1.3.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.2.5...1.3.0
Expand Down
6 changes: 6 additions & 0 deletions changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,9 @@
added:
- Moved epoch count to 10k per year.
date: 2024-10-16 17:05:58
- bump: minor
changes:
added:
- Future year income targeting.
- Random takeup variable values.
date: 2024-10-18 16:05:10
63 changes: 1 addition & 62 deletions policyengine_uk_data/datasets/frs/enhanced_frs.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

try:
import torch
from policyengine_uk_data.utils.reweight import reweight
except ImportError:
torch = None

Expand Down Expand Up @@ -59,68 +60,6 @@ class EnhancedFRS_2022_23(EnhancedFRS):
url = "release://PolicyEngine/ukda/1.5.0/enhanced_frs_2022_23.h5"


def reweight(
    original_weights,
    loss_matrix,
    targets_array,
    dropout_rate=0.05,
    epochs=10_000,
    learning_rate=1e-1,
):
    """Calibrate weights so weighted column totals approach the targets.

    The weights are optimised in log space with Adam, so the returned
    weights are always the exponential of a real number.

    Args:
        original_weights: Starting weights, one per row of ``loss_matrix``.
            Their logarithm is taken, so they are expected to be positive.
        loss_matrix: Object exposing ``.values`` (e.g. a pandas DataFrame)
            with one row per weight and one column per target.
        targets_array: Target value for each column of ``loss_matrix``.
        dropout_rate: Probability with which each log-weight is replaced by
            the mean of the non-replaced log-weights on a training step.
            ``0`` disables dropout.
        epochs: Number of optimisation steps (default matches the previous
            hard-coded value).
        learning_rate: Adam learning rate (default matches the previous
            hard-coded value).

    Returns:
        A NumPy array of the calibrated weights.

    Raises:
        ValueError: If NaNs appear in the weights, the loss matrix, the
            weighted estimate or the relative error.
    """
    loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32)
    targets_array = torch.tensor(targets_array, dtype=torch.float32)
    weights = torch.tensor(
        np.log(original_weights), requires_grad=True, dtype=torch.float32
    )

    # Input validation is loop-invariant, so do it once up front instead of
    # on every optimisation step.
    if torch.isnan(weights).any():
        raise ValueError("Weights contain NaNs")
    if torch.isnan(loss_matrix).any():
        raise ValueError("Loss matrix contains NaNs")

    # TODO: replace this with a call to the python reweight.py package.
    def loss(weights):
        # The weights change every step, so they are re-checked here.
        if torch.isnan(weights).any():
            raise ValueError("Weights contain NaNs")
        estimate = weights @ loss_matrix
        if torch.isnan(estimate).any():
            raise ValueError("Estimate contains NaNs")
        # NOTE(review): the "+ 1" in the numerator means this is minimised
        # at estimate == target - 1 rather than at the target itself; kept
        # as-is to preserve existing calibration results -- confirm intent.
        rel_error = (
            ((estimate - targets_array) + 1) / (targets_array + 1)
        ) ** 2
        if torch.isnan(rel_error).any():
            raise ValueError("Relative error contains NaNs")
        return rel_error.mean()

    def dropout_weights(weights, p):
        if p == 0:
            return weights
        # Replace p% of the weights with the mean value of the rest of them
        mask = torch.rand_like(weights) < p
        if mask.all():
            # Nothing is left to average over; the mean of an empty
            # selection would be NaN, so skip dropout for this step.
            return weights
        mean = weights[~mask].mean()
        masked_weights = weights.clone()
        masked_weights[mask] = mean
        return masked_weights

    optimizer = torch.optim.Adam([weights], lr=learning_rate)

    # The progress bar is a nicety; fall back to a plain range without tqdm.
    try:
        from tqdm import trange
    except ImportError:
        trange = None
    iterator = trange(epochs) if trange is not None else range(epochs)

    start_loss = None

    for _ in iterator:
        optimizer.zero_grad()
        weights_ = dropout_weights(weights, dropout_rate)
        loss_value = loss(torch.exp(weights_))
        current_loss = loss_value.item()
        if start_loss is None:
            start_loss = current_loss
        # Guard against a zero starting loss when reporting relative change.
        loss_rel_change = (
            (current_loss - start_loss) / start_loss if start_loss else 0.0
        )
        loss_value.backward()
        if trange is not None:
            iterator.set_postfix(
                {"loss": current_loss, "loss_rel_change": loss_rel_change}
            )
        optimizer.step()

    return torch.exp(weights).detach().numpy()


if __name__ == "__main__":
    # Script entry point: generate the reweighted dataset first, then the
    # enhanced one, each from a freshly constructed instance.
    for dataset_class in (ReweightedFRS_2022_23, EnhancedFRS_2022_23):
        dataset_class().generate()
23 changes: 23 additions & 0 deletions policyengine_uk_data/datasets/frs/frs.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,29 @@ def generate(self):

self.save_dataset(frs)

self.add_random_variables(frs)

def add_random_variables(self, frs: dict):
    """Store simulated values of the listed variables in ``frs`` and re-save.

    A microsimulation is built on this dataset; each listed variable is
    calculated once for ``self.time_period`` and that same array is written
    under every one of the ten periods starting at ``self.time_period``.
    The updated ``frs`` mapping is then saved via ``self.save_dataset``.

    Args:
        frs: Dataset mapping of variable name to values; mutated in place.
    """
    from policyengine_uk import Microsimulation

    sim = Microsimulation(dataset=self)
    # Variables grouped as "random" by the original author -- presumably so
    # their draws stay fixed once stored; confirm against policyengine_uk.
    random_variable_names = (
        "attends_private_school",
        "would_evade_tv_licence_fee",
        "would_claim_pc",
        "would_claim_uc",
        "would_claim_child_benefit",
        "main_residential_property_purchased_is_first_home",
        "household_owns_tv",
        "is_higher_earner",
    )
    periods = range(self.time_period, self.time_period + 10)
    for name in random_variable_names:
        values = sim.calculate(name, self.time_period).values
        # Every period maps to the same calculated array.
        frs[name] = dict.fromkeys(periods, values)

    self.save_dataset(frs)


class FRS_2020_21(FRS):
dwp_frs = DWP_FRS_2020_21
Expand Down
25 changes: 23 additions & 2 deletions policyengine_uk_data/datasets/spi.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,29 @@ def generate(self):
data["savings_starter_rate_income"] = np.zeros(len(df))
data["capital_allowances"] = df.CAPALL
data["loss_relief"] = df.LOSSBF
data["is_SP_age"] = df.SPA == 1
data["state_pension"] = df.SRP

AGE_RANGES = {
-1: (16, 70),
1: (16, 25),
2: (25, 35),
3: (35, 45),
4: (45, 55),
5: (55, 65),
6: (65, 74),
7: (74, 90),
}
age_range = df.AGERANGE

# Randomly assign ages in age ranges

percent_along_age_range = np.random.rand(len(df))
min_age = np.array([AGE_RANGES[age][0] for age in age_range])
max_age = np.array([AGE_RANGES[age][1] for age in age_range])
data["age"] = (
min_age + (max_age - min_age) * percent_along_age_range
).astype(int)

data["state_pension_reported"] = df.SRP
data["other_tax_credits"] = df.TAX_CRED
data["miscellaneous_income"] = (
df.MOTHINC
Expand Down
Loading

0 comments on commit b8d68f3

Please sign in to comment.