From 1142c8ce9cbe803b8d97a34ca43d528ef2672d72 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 17 Sep 2024 11:31:49 +0100 Subject: [PATCH 01/14] Lighten dependencies and add changelog --- .github/CONTRIBUTING.md | 3 +++ .github/changelog_template.md | 8 ++++++ .github/fetch_version.py | 13 ++++++++++ .github/get-changelog-diff.sh | 2 ++ .github/has-functional-changes.sh | 12 +++++++++ .github/is-version-number-acceptable.sh | 33 +++++++++++++++++++++++++ .github/publish-git-tag.sh | 4 +++ .github/workflows/ci_cd.yaml | 29 ++++++++++++++++++++++ changelog.yaml | 5 ++++ changelog_entry.yaml | 4 +++ pyproject.toml | 6 ++--- 11 files changed, 116 insertions(+), 3 deletions(-) create mode 100644 .github/CONTRIBUTING.md create mode 100644 .github/changelog_template.md create mode 100644 .github/fetch_version.py create mode 100755 .github/get-changelog-diff.sh create mode 100755 .github/has-functional-changes.sh create mode 100755 .github/is-version-number-acceptable.sh create mode 100755 .github/publish-git-tag.sh create mode 100644 changelog.yaml create mode 100644 changelog_entry.yaml diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md new file mode 100644 index 0000000..5b5d24e --- /dev/null +++ b/.github/CONTRIBUTING.md @@ -0,0 +1,3 @@ +## Updating data + +If your changes present a non-bugfix change to one or more datasets which are cloud-hosted (FRS and EFRS), then please change both the filename and URL (in both the class definition file and in `storage/upload_completed_datasets.py`). This enables us to store historical versions of datasets separately and reproducibly. diff --git a/.github/changelog_template.md b/.github/changelog_template.md new file mode 100644 index 0000000..8a1e679 --- /dev/null +++ b/.github/changelog_template.md @@ -0,0 +1,8 @@ +# Changelog + +All notable changes to this project will be documented in this file. 
+ +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +{{changelog}} \ No newline at end of file diff --git a/.github/fetch_version.py b/.github/fetch_version.py new file mode 100644 index 0000000..f130f40 --- /dev/null +++ b/.github/fetch_version.py @@ -0,0 +1,13 @@ +from policyengine_us_data.__version__ import __version__ + + +def fetch_version(): + try: + return __version__ + except Exception as e: + print(f"Error fetching version: {e}") + return None + + +if __name__ == "__main__": + print(fetch_version()) diff --git a/.github/get-changelog-diff.sh b/.github/get-changelog-diff.sh new file mode 100755 index 0000000..66c2bfd --- /dev/null +++ b/.github/get-changelog-diff.sh @@ -0,0 +1,2 @@ +last_tagged_commit=`git describe --tags --abbrev=0 --first-parent` +git --no-pager diff $last_tagged_commit -- CHANGELOG.md \ No newline at end of file diff --git a/.github/has-functional-changes.sh b/.github/has-functional-changes.sh new file mode 100755 index 0000000..169689a --- /dev/null +++ b/.github/has-functional-changes.sh @@ -0,0 +1,12 @@ +#! /usr/bin/env bash + +IGNORE_DIFF_ON="README.md CONTRIBUTING.md Makefile docs/* .gitignore LICENSE* .github/* data/*" + +last_tagged_commit=`git describe --tags --abbrev=0 --first-parent` # --first-parent ensures we don't follow tags not published in master through an unlikely intermediary merge commit + +if git diff-index --name-only --exit-code $last_tagged_commit -- . `echo " $IGNORE_DIFF_ON" | sed 's/ / :(exclude)/g'` # Check if any file that has not been listed in IGNORE_DIFF_ON has changed since the last tag was published. +then + echo "No functional changes detected." + exit 1 +else echo "The functional files above were changed." 
+fi diff --git a/.github/is-version-number-acceptable.sh b/.github/is-version-number-acceptable.sh new file mode 100755 index 0000000..a9067e6 --- /dev/null +++ b/.github/is-version-number-acceptable.sh @@ -0,0 +1,33 @@ +#! /usr/bin/env bash + +if [[ ${GITHUB_REF#refs/heads/} == master ]] +then + echo "No need for a version check on master." + exit 0 +fi + +if ! $(dirname "$BASH_SOURCE")/has-functional-changes.sh +then + echo "No need for a version update." + exit 0 +fi + +current_version=`python .github/fetch_version.py` + +if git rev-parse --verify --quiet $current_version +then + echo "Version $current_version already exists in commit:" + git --no-pager log -1 $current_version + echo + echo "Update the version number in setup.py before merging this branch into master." + echo "Look at the CONTRIBUTING.md file to learn how the version number should be updated." + exit 1 +fi + +if ! $(dirname "$BASH_SOURCE")/has-functional-changes.sh | grep --quiet CHANGELOG.md +then + echo "CHANGELOG.md has not been modified, while functional changes were made." + echo "Explain what you changed before merging this branch into master." + echo "Look at the CONTRIBUTING.md file to learn how to write the changelog." + exit 2 +fi diff --git a/.github/publish-git-tag.sh b/.github/publish-git-tag.sh new file mode 100755 index 0000000..9437a66 --- /dev/null +++ b/.github/publish-git-tag.sh @@ -0,0 +1,4 @@ +#! 
/usr/bin/env bash + +git tag `python .github/fetch_version.py` # create a new tag +git push --tags || true # update the repository version diff --git a/.github/workflows/ci_cd.yaml b/.github/workflows/ci_cd.yaml index 3db31f2..4df3e8c 100644 --- a/.github/workflows/ci_cd.yaml +++ b/.github/workflows/ci_cd.yaml @@ -68,6 +68,35 @@ jobs: run: make data - name: Run tests run: pytest + check-version: + name: Check version + if: github.event_name == 'pull_request' + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch all history for all tags and branches + repository: ${{ github.event.pull_request.head.repo.full_name }} + ref: ${{ github.event.pull_request.head.ref }} + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + - name: Build changelog + run: pip install "yaml-changelog>=0.1.7" && make changelog + - name: Preview changelog update + run: ".github/get-changelog-diff.sh" + - name: Check version number has been properly updated + run: ".github/is-version-number-acceptable.sh" + - name: Update changelog + uses: EndBug/add-and-commit@v9 + with: + add: "." + committer_name: Github Actions[bot] + author_name: Github Actions[bot] + message: Update PolicyEngine US data + github_token: ${{ secrets.POLICYENGINE_GITHUB }} docker: name: Docker diff --git a/changelog.yaml b/changelog.yaml new file mode 100644 index 0000000..8944a4b --- /dev/null +++ b/changelog.yaml @@ -0,0 +1,5 @@ +- changes: + added: + - Initialized changelogging + date: 2024-09-09 17:29:10 + version: 1.0.0 diff --git a/changelog_entry.yaml b/changelog_entry.yaml new file mode 100644 index 0000000..f3b708c --- /dev/null +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + changed: + - Lightened dependency list. 
diff --git a/pyproject.toml b/pyproject.toml index 640810d..2ecfff7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,9 +14,6 @@ license = {file = "LICENSE"} requires-python = ">=3.6" dependencies = [ "policyengine_core", - "tables", - "survey_enhance", - "torch", "requests", "tqdm", "tabulate", @@ -29,6 +26,9 @@ dev = [ "pytest", "policyengine_uk>=1.8.0", "streamlit", + "survey_enhance", + "torch", + "tables", ] [tool.setuptools] From 84ab3ff508ead2588174a203f86b9e17ffd064cc Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 17 Sep 2024 11:59:00 +0100 Subject: [PATCH 02/14] Add missing uploads --- Makefile | 1 + .../datasets/frs/enhanced_frs.py | 2 ++ .../datasets/frs/extended_frs.py | 1 + .../storage/upload_completed_datasets.py | 17 +++++++++++++++++ 4 files changed, 21 insertions(+) diff --git a/Makefile b/Makefile index b254604..ed38cdd 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,7 @@ documentation: data: python policyengine_uk_data/datasets/frs/dwp_frs.py python policyengine_uk_data/datasets/frs/frs.py + python policyengine_uk_data/datasets/frs/enhanced_frs.py build: python -m build diff --git a/policyengine_uk_data/datasets/frs/enhanced_frs.py b/policyengine_uk_data/datasets/frs/enhanced_frs.py index 28b873b..6a773e9 100644 --- a/policyengine_uk_data/datasets/frs/enhanced_frs.py +++ b/policyengine_uk_data/datasets/frs/enhanced_frs.py @@ -38,6 +38,7 @@ class ReweightedFRS_2022_23(EnhancedFRS): input_frs = FRS_2022_23 time_period = 2022 end_year = 2022 + url = "release://PolicyEngine/ukda/reweighted_frs_2022_23.h5" class EnhancedFRS_2022_23(EnhancedFRS): @@ -100,4 +101,5 @@ def loss(weights): if __name__ == "__main__": + ReweightedFRS_2022_23().generate() EnhancedFRS_2022_23().generate() diff --git a/policyengine_uk_data/datasets/frs/extended_frs.py b/policyengine_uk_data/datasets/frs/extended_frs.py index a421d0d..085ef38 100644 --- a/policyengine_uk_data/datasets/frs/extended_frs.py +++ b/policyengine_uk_data/datasets/frs/extended_frs.py 
@@ -94,6 +94,7 @@ class ExtendedFRS_2022_23(ExtendedFRS): data_format = Dataset.TIME_PERIOD_ARRAYS input_frs = FRS_2022_23 time_period = 2022 + url = "release://PolicyEngine/ukda/extended_frs_2022_23.h5" if __name__ == "__main__": diff --git a/policyengine_uk_data/storage/upload_completed_datasets.py b/policyengine_uk_data/storage/upload_completed_datasets.py index 8d97ba1..6d6490b 100644 --- a/policyengine_uk_data/storage/upload_completed_datasets.py +++ b/policyengine_uk_data/storage/upload_completed_datasets.py @@ -1,4 +1,21 @@ from policyengine_uk_data.utils.github import upload from pathlib import Path +from tqdm import tqdm FOLDER = Path(__file__).parent + +FILES = [ + "cps_2022_23.h5", + "enhanced_frs_2022_23.h5", + "extended_frs_2022_23.h5", + "reweighted_frs_2022_23.h5", +] + +for file in tqdm(FILES): + upload( + "PolicyEngine", + "ukda", + "release", + file, + FOLDER / file, + ) From 153339bc0d05f46eec90d5955eef9a330d6000cf Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 17 Sep 2024 12:17:18 +0100 Subject: [PATCH 03/14] Fix bug in uploads --- .../storage/upload_completed_datasets.py | 2 +- policyengine_uk_data/utils/github.py | 77 +++++++------------ 2 files changed, 28 insertions(+), 51 deletions(-) diff --git a/policyengine_uk_data/storage/upload_completed_datasets.py b/policyengine_uk_data/storage/upload_completed_datasets.py index 6d6490b..513bbbb 100644 --- a/policyengine_uk_data/storage/upload_completed_datasets.py +++ b/policyengine_uk_data/storage/upload_completed_datasets.py @@ -5,7 +5,7 @@ FOLDER = Path(__file__).parent FILES = [ - "cps_2022_23.h5", + "frs_2022_23.h5", "enhanced_frs_2022_23.h5", "extended_frs_2022_23.h5", "reweighted_frs_2022_23.h5", diff --git a/policyengine_uk_data/utils/github.py b/policyengine_uk_data/utils/github.py index 43a05e2..27c88e1 100644 --- a/policyengine_uk_data/utils/github.py +++ b/policyengine_uk_data/utils/github.py @@ -1,8 +1,6 @@ import os import requests from tqdm import tqdm -from 
requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry import time auth_headers = { @@ -62,66 +60,45 @@ def download( f.write(response.content) -def create_session_with_retries(): - session = requests.Session() - retries = Retry( - total=5, backoff_factor=1, status_forcelist=[502, 503, 504] - ) - session.mount("https://", HTTPAdapter(max_retries=retries)) - return session - - def upload( org: str, repo: str, release_tag: str, file_name: str, file_path: str ) -> bytes: release_id = get_release_id(org, repo, release_tag) + + # First, list release assets + url = f"https://api.github.com/repos/{org}/{repo}/releases/{release_id}/assets" + response = requests.get(url, headers=auth_headers).json() + names = [asset["name"] for asset in response] + if file_name in names: + print( + f"Asset {file_name} already exists in release {release_tag} of {org}/{repo}, skipping." + ) + return + url = f"https://uploads.github.com/repos/{org}/{repo}/releases/{release_id}/assets?name={file_name}" - file_size = os.path.getsize(file_path) headers = { "Accept": "application/vnd.github.v3+json", "Content-Type": "application/octet-stream", **auth_headers, } - session = create_session_with_retries() - - max_retries = 3 - for attempt in range(max_retries): - try: - with open(file_path, "rb") as f: - with tqdm(total=file_size, unit="B", unit_scale=True) as pbar: - response = session.post( - url, - headers=headers, - data=f, - stream=True, - hooks=dict( - response=lambda r, *args, **kwargs: pbar.update( - len(r.content) - ) - ), - timeout=300, # 5 minutes timeout - ) - - if response.status_code == 201: - return response.json() - else: - print( - f"Attempt {attempt + 1} failed with status code {response.status_code}. 
Response: {response.text}" - ) - - except requests.exceptions.RequestException as e: - print(f"Attempt {attempt + 1} failed with error: {str(e)}") - - if attempt < max_retries - 1: - wait_time = ( - attempt + 1 - ) * 60 # Wait 1 minute, then 2 minutes, then 3 minutes - print(f"Waiting {wait_time} seconds before retrying...") - time.sleep(wait_time) - - raise ValueError(f"Failed to upload file after {max_retries} attempts.") + with open(file_path, "rb") as f: + data = f.read() + + response = requests.post( + url, + headers=headers, + data=data, + ) + + if response.status_code != 201: + raise ValueError( + f"Invalid response code {response.status_code} for url {url}. Received: {response.text}" + ) + + return response.json() + def set_pr_auto_review_comment(text: str): From b67924cca8fa9fc6298b3f5ef976f2f770561a28 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 17 Sep 2024 18:29:44 +0100 Subject: [PATCH 04/14] Add Jupyter Book documentation --- .github/workflows/ci_cd.yaml | 27 ++- .gitignore | 1 + Makefile | 2 +- docker/docs.Dockerfile | 5 - docs/_config.yml | 22 ++ docs/_static/style.css | 2 + docs/_toc.yml | 5 + docs/{Home.py => intro.md} | 16 +- docs/logo.png | Bin 0 -> 10188 bytes docs/pages/Methodology.py | 301 --------------------------- docs/pages/Validation.py | 88 -------- docs/utils.py | 7 + policyengine_uk_data/utils/github.py | 7 +- pyproject.toml | 3 +- 14 files changed, 71 insertions(+), 415 deletions(-) delete mode 100644 docker/docs.Dockerfile create mode 100644 docs/_config.yml create mode 100644 docs/_static/style.css create mode 100644 docs/_toc.yml rename docs/{Home.py => intro.md} (81%) create mode 100644 docs/logo.png delete mode 100644 docs/pages/Methodology.py delete mode 100644 docs/pages/Validation.py create mode 100644 docs/utils.py diff --git a/.github/workflows/ci_cd.yaml b/.github/workflows/ci_cd.yaml index 4df3e8c..0e7afc5 100644 --- a/.github/workflows/ci_cd.yaml +++ b/.github/workflows/ci_cd.yaml @@ -30,6 +30,29 @@ jobs: 
user: __token__ password: ${{ secrets.PYPI }} skip-existing: true + publish-docs: + name: Publish documentation + runs-on: ubuntu-latest + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch all history for all tags and branches + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.12 + - name: Install package + run: pip install -e ".[dev]" + - name: Build Jupyter Book + run: make documentation + - name: Deploy documentation + uses: JamesIves/github-pages-deploy-action@releases/v4 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + BRANCH: gh-pages + FOLDER: docs/_build/html lint: runs-on: ubuntu-latest name: Lint @@ -47,7 +70,7 @@ jobs: run: black . -l 79 --check test: - name: Build and Test + name: Build and test runs-on: ubuntu-latest steps: - name: Checkout code @@ -68,6 +91,8 @@ jobs: run: make data - name: Run tests run: pytest + - name: Test documentation builds + run: make documentation check-version: name: Check version if: github.event_name == 'pull_request' diff --git a/.gitignore b/.gitignore index e417f9d..61b49b8 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ !incomes.csv !tax_benefit.csv !demographics.csv +**/_build diff --git a/Makefile b/Makefile index ed38cdd..f78fd5b 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ docker: docker buildx build --platform linux/amd64 . -t policyengine-uk-data:latest documentation: - streamlit run docs/Home.py + jb clean docs && jb build docs data: python policyengine_uk_data/datasets/frs/dwp_frs.py diff --git a/docker/docs.Dockerfile b/docker/docs.Dockerfile deleted file mode 100644 index 090ca7e..0000000 --- a/docker/docs.Dockerfile +++ /dev/null @@ -1,5 +0,0 @@ -FROM python:latest -COPY . . 
-RUN make install -EXPOSE 8080 -ENTRYPOINT ["streamlit", "run", "docs/Home.py", "--server.port=8080", "--server.address=0.0.0.0"] diff --git a/docs/_config.yml b/docs/_config.yml new file mode 100644 index 0000000..711cc60 --- /dev/null +++ b/docs/_config.yml @@ -0,0 +1,22 @@ +title: PolicyEngine UK data +author: PolicyEngine +copyright: "2024" +logo: logo.png + +execute: + execute_notebooks: off + +repository: + url: https://github.com/policyengine/policyengine-uk-data + branch: master + path_to_book: docs + +sphinx: + config: + html_js_files: + - https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.7/require.min.js + html_theme: furo + pygments_style: default + html_css_files: + - style.css + nb_remove_code_source: true \ No newline at end of file diff --git a/docs/_static/style.css b/docs/_static/style.css new file mode 100644 index 0000000..e511f94 --- /dev/null +++ b/docs/_static/style.css @@ -0,0 +1,2 @@ +@import url('https://fonts.googleapis.com/css2?family=Roboto+Serif:opsz@8..144&family=Roboto:wght@300&display=swap'); + diff --git a/docs/_toc.yml b/docs/_toc.yml new file mode 100644 index 0000000..4b8640a --- /dev/null +++ b/docs/_toc.yml @@ -0,0 +1,5 @@ +format: jb-book +root: intro +chapters: +- file: methodology.ipynb +- file: validation.ipynb diff --git a/docs/Home.py b/docs/intro.md similarity index 81% rename from docs/Home.py rename to docs/intro.md index 674f08e..af04a85 100644 --- a/docs/Home.py +++ b/docs/intro.md @@ -1,16 +1,5 @@ -import streamlit as st -from policyengine_uk_data.utils.download_docs_prerequisites import ( - download_data, -) +# Introduction -download_data() - -st.set_page_config(layout="wide") - -st.title("PolicyEngine-UK-Data") - -st.write( - """ PolicyEngine-UK-Data is a package that creates representative microdata for the UK, designed for input in the PolicyEngine tax-benefit microsimulation model. 
This tool allows users to explore the data sources, validation processes, and enhancements @@ -22,5 +11,4 @@ * An accurate representation of the current UK household sector *now*. This repository is dedicated to the second of those. In this documentation, we'll explain how we do that, but we'll also use our model (the first bullet) to see what we end up with when we combine the two, and measure up against other organisations doing the same thing. -""" -) + diff --git a/docs/logo.png b/docs/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..12736e4dce8158bb6ed2557a2bf9d0856811758e GIT binary patch literal 10188 zcmd5?_divC{J+^Fgp@KXD=Sg4M+(7Dak3Ok3EL!r2nD3~2_tiu%TSLinlj4$ou(w#vd2fn8 zv={V2=i{s%T15&QsS|e;rdUcLdcSPf_49o@HO);#va{3}qa%=N+>UbYYlM3-!6fQ-CVxAd+QXv zdrLCQR{zB*F+$;Vb_(#ljwr;k7H?#jG7dwV5ULBD$ge1JRqGNgZ10Tfx~{ufiaeYJ z@r_XH6QMc!arP%te79p{QmUrwRW41yUktcap(fU_;QmF0+cnNbQj%GV`RU-A3|Zc* z;Tv%y@!hA-EUz~5Brv2aHxwn5CM&Fb>*ibN=j1_~_v%2yLOND9KQc``t+Nh?EZgw- zV5u=TL#W92TaLYLgxxt?n^kN~vBF!8UIn|ol3BjH8gMn%_H!VbIL%{kYtZLjzMAE# zV5Y?nBQS3}k1dHW(y3FRgV+)H1k6~RQgUnNXlX0xg&OFnmD+=N)jMc!`$dDY*)yNw z+IXl~Nhg|xy)vh7f!i*{!vevP#21M3~X z9yfdTv{K_WJG>s^<>S9Q;gI zYTmiHDSI@d{dVL}_nh>WJP7278l&Ev;y`+$zwXVC=+Z>j;}$~qMozwFt?-^u3t*cX zVV@2ISMxw(bGFX{K|ZE9OMA)<70Id2lN_eEMxo6w{VmD%M=)v!TTv#z_SP<0CaKoT zpFE%46gvk3l@5PZg|(OOOUmTl-YcpuL)XMC*P0~Tm;WgynsoIE5O|$ovhCb#L6EsU zS7Zq!g%MVFmz(a2R@vhXh`sE4A$ZyI1e*&W(64}>=jR$W()xF_P|Ee4d%R7iKSYcn z&bE9d;L~o8UF*ucE`y&)0@ePUu^TNKd=-=K ztL(Skw+P1wQ92WXWvFV-fr6{A^0e3GXc;D{@6Wtnh{&56`sg_-oW-eo2Q|0IKe+u> zl*65k%Qa1UI!4VM1ma~XrJ0+wv)mamb+hGB9-k^91Pvx$f1hLi&`#&ef|=wB{`&Yx z+MR#Ww@PW}BGApgyCRn_2F;h^n#c8e9@iMZ6DY7uJK)d}loZ;gy{LO-y_Hu61d97u zMY*|Qvfeu^%nD00^oJsw73P)BITv99^T*>Pl#RUe4OppaZ?`jmKy!tVkO`?It%*mj zV-BaLMV_auvNvtxxuRfVKaCyp4UVb*$>k@7SOp?;R}>;e(gql?E2ljL*JXCJbGfZ4 zI3vKu?{2pgM+N2^#LrYA$S2w70K(lI9#$34QHbP!>>i;{jQ+hs#4_$y9bUH`=$?4} zUoX$``YCk#@H$cBv_a8a+BVmPtZ}@>*%-!kcl7R1=}4(A%F%tVsQe+#x3VA}y7MRq 
z=U?*-UYUfNA4x0f=IPGqyGzN7&HhSldp?~bykB&&8@x?L-sNOmI3D?1rek!#^4K!E zZL&Ld%qybb%eL2wk&dy9)|H65*~3&p-|8Cdriv#WD3>c3BzPNhG;dnr`tbTDd7bF(0e= zP!KuXMYL&)S_;p)G^UbJe9FY%-4~47N=Qr&o8Xo@3sJ3_n8on1z%W?Lmtb?!i1O1C z6~JakQ0&K~w;=Y?CVkS5y%Jw-wl!^s~0^;H3UyE0Z0GQ9n6uowv;M(aXk`b_W<0RF{5TU=-q>!c?-L zN8u@D$!o4wHg5v??c^VG2*pvG`PZ=*lPS?X7hcSGD>7lY2ie~v%+ucI_e1Y=&VGJt}8isubbg1%+jSPb`~&1AJ(-Gy1{cE8oU`ppl-ubCT3 zul%wmP$8S_cTAG#{pndkm@xIJSf0AaKQ5Px$An~XPOIo#VNrx zKp@q)F+Fo0l=4HGD8Mab+2)>x+21&Bi-5r_3hzH2x64hNKIKkMQIP1y$m2$G$2Q86 z_TrYxj^4Ai*v?Vc?h`daf7DSmpCyh#rG?6FHyHP{QkijTbnQ_OqsT3#$fj){=H_i1 z@BW{sX-S8`c|%XF1;258Tac^OeX$pT@%E5>+@bF0`!tc}9}c$~b4+alO>Z9O8y6dqHSEV44<#&zdBfF$ zqt?m4KQ(DPJ`mdc+TB2vS7_t6)AWTWJ#P^^)42Z``tccKXmobKza)z?!>cN1KiYd} zyz{KX`U}oaV5GC?<%1IENG%UvVhGAZO}G=1AZd{l?Yhgki9C4htzymF;$4;-?mOzbPTina8{<%OHd{b_Ui%d}0=&e@E9 z9Jrr}NA%}C$w44{m;1cF97X;4R1JE*`^E0>eIW5g$a@b95-iPv%L`%GLnda7UXDZ3 z9$@HZ8O`+mk!_~c&w{w($GhU2Aq)X5j*2xk7DFiS^;#Y~_5trt%k7QN87)7BF};?j z&KH*XQR^PX{f}uVe^#p_Ix_fzBj+C5>EEG6)SKr`Y5r&lnEPbNuMzp0dS(iqgXgoH zR~g3;qBz$Vn4X%Hv(dJkf$2igHFa$HS6!r&dzw>)oj#d)|7e16)JD5pUpk2Y8_gCM z3tQ1_{KdP|KbbtesBboXHYM4Z1;pbxPo#SF*PKE}lbteswYQr3X{Z(qFvy+uAb;j(-q@|QdJcP7tqk|}RQW#f8}@t-L>A_b(CfX@Aq zgA!|6BwGs{dU$=GVd09;#DWL!wZZa?FH{qY8w&IVsx@UIu}bQzj?-!v-?j0i_K0IQ z$S1FNpHF#P_D-d~iv&R1m%OO^GW>!IXhFK0AFXOLNTq#4F>FF~lnqov6GV5<`n9-pB+vSS z@>V#6abL|nh;#-gQasMLMSRdcEI0KhP4FiQDuqN9dOFD7Sdb;%G!r<(Rf5!~6x)YP zxNoo7-27sSt-MP_@3`7rXD)E-TSF4+HjI9_2m?I_&}o^lY^b{7_&g^IWP%64_vHxM+|?oxO?|}QZ^d|q0Baj zoa)6gglEfIO0ViNT0x5M8_OHCGnpFx%V2&lu=}sW%)Toecc`ZL`HtM8<=`Rek~Wh+ zbDpf(iMtO#h3o3%iE9)>$&ssCnS~|N5e_jIz5Z!<1t64I;Xlix1MtZjsM9wi-_ry> zN3TW8rx^OQB*yt>SBV>0cXIB%`|fnf#{8$`*l^dSXaBA>H!94pA6E;)YgsY*NQ;BD zADBxYb*ydQJKw)pPlgg>n=g7>XDFr;ETCmd7moMNJ0R!Kyf73?nQL4zp4Xp9Yo9l@ zN($h6-g~^gi+xh=5y%(WP;uNbmVaEy_GhPje}??lq&wSKU#z78QS#~jpEI&H~ zogpl@gTlP@3b=4B?j@Mjew`Q#%)8aR`K-3|a_GnXQ$=mpt^s3kQSL_XwOdJLjg-ZN zUj=rH*mR1n!e%nr4}5zPj~9}U-4XKCA={IlC-rqayGMSq*| zRXB+@9bfMin=+@VfkMADzk<9kkUFxr{I1LM<&Yi#C+O3K5XBd|*!8q!)3X{l=r!9X 
ze_|17vA5OE6mjcJLJ)A|=pMJfUy`CBEmLDW=cE%8kMu_;KLwQw?Jf23Plk(EN~i%r zlS?7Bpxe?%CmUL`(i>XU;QugR0$~1twfuvn9ycxnMQkTa6|~ z@zO^Dbt{;;*`KhziZ)Yw;znz(+c~WdjOj!y(fifnn~!GvzNc9H$uB+8G?@Wio*rPr zl~J)cDJqZEr2x(esl6}@?VoYO!1vvUC%^YRQ)uVE@e6e8^bljnDNE{X4 z;Cwwk^tHQ`HJ7Pmy)v^?P}GFzpI_a+CFdh{*Y8I>udD=m3lqAKTHaYFyr<>$b&i(} z%(rmL`x+UitJ&l2Qwf?T&ydvQh7Zg$Y?vb)nqM5ICHj=q{_a=CRPJg5`Wn!Y`G@p3 zm7$)R&OFQtbUrnlOoQ>X-sNzA#0N_r@4;$Yl9}ji?<_O@^t8zE$|yaQP`<@n;B5Q9 z*FJsRQCBN-4D}zCVJ5134iPzjnI1 zc`6Y1vC(YiwegzG+Z~#3H)c^X0k!cNQ763Lnoz(>YI)T7_@wlQ3Pl?e_8}bCh|W9s zc9@|=qzVvV_C2~~FjU%V+BSrE=iqEMRW$v3(%+9&GMGeQtd`S~j9-y?iM|eN`Ss z?8njEyB~S%i71uFml>^Iq72i!1z+)8G5b@U9~ICJ4SE+=BUW9(sWLxN+S}w9bmAak z6Fa)}keT{95Hj6XuFZtE=}^~8Zoz{ddlz8a>U0nI8FwTtGK87Fgvjy&cq(8d^N;MG zDJzC~iRwB8q-k;Dgp~H|zs2K?6-NF_PyrHwF0R1T_Dm0gVBvA-hsz^{RT=hoQX;J# zhYu<_6l{-?@kwwXc=FfdK6aZG>EwE31x{QlUbMZd!&ne~Hc*)+y7YtGb(Np|Q3fax zd7)h-^HCaX4JDyxDSGuu;QV)7(4s}|l$=+9;ThU)Qfx-8r@`(P20=ybfb|4IbmY!^ zj6{06oGuDjoT^eXw~IVqoSdhw52{0ieW96kl;r8e(mT}2o72n!2P8e@En}amC#bt& z@6{9&rq({m1-+AaCvXS@mORD^W)VBfBYbw80940l=Gf-cS9`EsJcC&e9{4+ zyB7oBZ?Zxi^ZA{^s$)*8KX55q4W_%(#8lZp zrgk)wqB+bu6f0<+k)!_Y{=uK&FDmTXz3Diq5@77lyh1a_urqTRFSV@dFz7nwHkXfm z4O{r3qJN(#%7Geo;96!<=-{5=5Nho^*ltj^$Z0svLL--9=Z!{{Ah=^qQ)phtp#~iB;dladMd^T$HINwpDqv%)sbY-7wz>K3U>c{)$lz2Rk;uls87VtI~3#5g+>)ozYX zdxS1=fVv-~{MBARL+p9~{$NmPKteURR5a1R0mkJZB_O}1JR*sy95zFaLl_pD^AAx* z@zIi5GZh(^vHF%qU*nJpfmIFoVM2{SDYj`~ah#C9WfXd6`WSIV7hpAz>20OtgCEqP z^r4;WY7^m!#m)IDWz||$#hIydB`$es%~`8GVr5`odCwez=~muWgfOr=EW_B^uufC3 zoFNqQpR@{bbgVYhTlo#p+SvN4O_<_chfKc;x;Ll4V)A#WnhxDpKZVf<|CBWMu+2{g z399k}HKc&ucyvYF_0rXob*S=x3>m8`!Z+SDxK8xD7khQNsQ6fVqN)%kG014vYjI~a zwbu|YPq$F~2dH?M-Z~{t>KWYnT5JQW4zd<)<7iF&E?kunh~8brw$U18s7O6xV=9m&ZsMF#s(MjeIoT~41S*;dbbb#ZuUCq zZkJ2(lrc%7DIGG$^c2XZYy?7(Y)iNnnzSd0dFo5#x zVqVh4oTd2v0w_CN%{NZ@)WSfn5;5yWGY2?(NCO}6+$-Y{5%%!ahWfA<&w#F4?l1H^ z6v@QFbEWvcw*;#f!`0-BQ`e8r9*|@j0eYvA^mcwai0%QEJ%L@Yz*1FneTBnON-;YNmM>_PbA& zral~0ijAhWeGy?;4Uw|vabn7epy?7`{I`*s@|_+^SA55H9s 
zqmyxJMXcdiu^(>&9u;&bJi6R5*Ptb`ysw5U4i-v{vhvL$sGW9cV?v*8mEu@Fmj!mk z01Kz8*h4TU3|n;+Ybfg*K_Q_0N#p)r@t0;n0n#RQ?Y(1#S`m+??ic-G#1Rj0G;4}= z?TrtWQgMe8C*&btMKwzCig5 z|JqAG+c|z(Xv~Z9drD^W7pH~I!#h`+I2forZ(BgD&Z^&m754~b`b1s-n)h~~fsjlc zR_<_;I#vczQoqct&aHVrqNxX@*0pCF;$7V-Am-0{+k1o1aut4)OTGOrTTsh`i{s!>t_XpZcJ5UK`s=L>Ng74v2-8DYq8B z!w>hE!NmeC_Lt)A9Z;w1)Dx^=7K;(fcV}|HXetq>YS|fDnI6UHiMRyCdSCnbb@RpV zN*k}UA$3C}6f>yd4^;Mg!_4<2faMK=s13{WNbS5%Y)Ok5-r3bUgO{$yAbRp(QJ*Qo z9PE9c+2NLMqS(#RnUF?AX2&~=ky|tt;dEya$@zs=9*+ElKrnS8-sLqLZDjyu;pQC|i zLF#L+zi9o##5+L9{KZGOspW%*-*eR3tBj@9&!3#)x@TKJlywH|5JwqULr(d97`d#}E~v zc?g+{q#*tY#LFk9#dbOHy}G%Idy@yur{TzGnZJY%u%saKLA%sG1{D9*XG5Cgv#Xhq zI$q(^YpVB4tZQkFhXD$34h3>(K_ntf?e|*@#<|lZ zlu^*?>kOkD-OLDKrT;Sr0YgB)Dn1^Ic_bC>qkjs~Q&QoY=Y4;srhw$qZyA+|x^fXA zERo|Q_@{U&9A|09|GWBlClZ=wq+Zm4Fdt5$P3h3_wevzZAldCJ7-}OMe1K+RVHlxv zq;$x%V}rT9dNiISR!D1c7Ky98y5Yl+w(=Q$mZc}SCl7-|-4(|tt{;|yxvu{qsThu|K1+mF4W<%2a14_o+^Jeyc z?b}=}3ky;59N*0RSq(p*{I1AV7!q&3TM7k--E(!o@jPG?i_{EA4B`q?zEttt z%z(pi1TpNgTE#4e2Xda_V#V1=sx=n1bB z%Mx{-+lbLzA-+K9uWR;S7Uf0vOXgg2o&o2Y?QVU-wlVMOcqgs5HD+D($VcQ9{XOz5 zf5WPf2z>_FXo9R(&?Ts2(xeWYW$;od{-?%S&*pe;3XK>`$F$5=?Xau02JncLmTJ-- zz1mrCvmLF<9PV%0qOU~ATqebrGEfz{i$rKR5lzkbc_L_7KvXF=H(bCdE4<7yU#WFMF^rmxCH&`ph`WjtU+ewZ!z2O92@QqnD6x~M$o=I*qb zXTCSv!y;T1)8FqR81vVn6lo+PHUjggC3fiXR>}L=@70>`uX%nBGp5*l?KF-|lqR)w zl?G`RBhMapDB0f3u(Ub6y^tw$T4l9!Y*!+ID6%{*eIVc^V?bZ_`Oa06>EUC8Cs!M^ z2pMZ)Qeh#-OP0UYL;Zd2y~Y}1W_Y0&Pu^6(m%RVzBx~#dg+TQG6av$;C{Tx*0XVD3 zz}BpG7MkVG1SSQ_?ht;k`&PhvA4iaTv2E>86Wpd$2qs%h)^GHwJw1R=e3&thIBhYR zML##;I^FC?WPTf~^bwpnaX&D0Zah7{f94t1+c8|O22@igxKLdFMwl~luPD3tKBnmD z-uC9l8VMz_S&jE%-QxS|!dM^tGc&Qnj?)?k!bqg<3uQl-O<00?FO{^{-kO%HN!xgq zn2Rq9CJhW6>dNK)%a4SA%^G>OsjN=cD*bu^#Kx4`o zE8Xl-30WN`l_F5t(D1Gqk0Y>dzxAaMX5b)UHY9ecyjLVEhLfeW7{P~}IXhU!+3OJB z)l~bY2`qnLH6F9QvaGTOG&oCB)aL|L@c}!zz;G?FOE5WDYv4RD04KO;Fn&=Sxl`e! 
zX7brLVRfc#v?Z1s=UqhPwBw<{XyY%xHkZC3m^=1H;w5|3<)g9}ABUmhcYC8d+Qe73 z`N-~$BJVeNfVV;vK)*j+#lLZ{Sxa(DiL+HVA{!=Xn`l{L2y1!6iuI)Vw6P^n)vUtF z7@sPzhIy$p8+uTAxX(mwv!MR2xIAsirfmhIQ<;&jG?dT9<(Pj;G27Or_=b?%27mzc N=z-pS#Jwk>{{sb>U7r8| literal 0 HcmV?d00001 diff --git a/docs/pages/Methodology.py b/docs/pages/Methodology.py deleted file mode 100644 index af6a120..0000000 --- a/docs/pages/Methodology.py +++ /dev/null @@ -1,301 +0,0 @@ -import streamlit as st -from policyengine_uk_data.utils.download_docs_prerequisites import ( - download_data, -) - -download_data() - -st.set_page_config(layout="wide") - -from policyengine_uk_data.utils import get_loss_results -from policyengine_uk_data import ( - FRS_2022_23, - ExtendedFRS_2022_23, - EnhancedFRS_2022_23, - ReweightedFRS_2022_23, -) -from policyengine_core.model_api import Reform -import plotly.express as px -import pandas as pd - -st.title("Methodology") - -st.write( - """ -In this page, we'll walk through step-by-step the process we use to create PolicyEngine's dataset. -* **Family Resources Survey**: we'll start with the FRS, looking at close it is to reality. To take an actual concrete starting point, we'll assume benefit payments are as reported in the survey. -* **FRS (+ tax-benefit model)**: we need to make sure that our tax-benefit model isn't doing anything unexpected. If we turn on simulation of taxes and benefits, does anything look unexpected? If not- great, we've turned a household survey into something useful for policy analysis. We'll also take stock here of what we're missing from reality. -* **Wealth and consumption**: the most obvious thing we're missing is wealth and consumption. We'll impute those here. -* **Fine-tuning**: we'll use reweighting to make some final adjustments to make sure our dataset is as close to reality as possible. -* **Validation**: we'll compare our dataset to the UK's official statistics, and see how we're doing. 
-""" -) - -st.subheader("Family Resources Survey") - -st.write( - """First, we'll start with the FRS as-is. Skipping over the technical details for how we actually feed this data into the model (you can find that in `policyengine_uk_data/datasets/frs/`), we need to decide how we're actually going to measure 'close to reality'. We need to define an objective function, and if our final dataset improves it a lot, we can call that a success. - -We'll define this objective function using public statistics that we can generally agree are of high importance to describing the UK household sector. These are things that, if the survey gets them wrong, we'd expect to cause inaccuracy in our model, and if we get them all mostly right, we'd expect to have confidence that it's a pretty accurate tax-benefit model. - -For this, we've gone through and collected: - -* **Demographics** from the ONS: ten-year age band populations by region of the UK, national family type populations and national tenure type populations. -* **Incomes** from HMRC: for each of 14 total income bands, the number of people with income and combined income of the seven income types that account for over 99% of total income: employment, self-employment, State Pension, private pension, property, savings interest, and dividends. -* **Tax-benefit programs** from the DWP and OBR: statistics on caseloads, expenditures and revenues for all 20 major tax-benefit programs. - -Let's first take a look at the initial FRS, our starting point, and what is generally considered the best dataset to use (mostly completely un-modified across major tax-benefit models), and see how close it is to reproducing these statistics. - -The table below shows the result, and: it's really quite bad! Look at the relative errors. 
-""" -) - - -@st.cache_data -def get_loss(dataset, reform, time_period): - loss_results = get_loss_results(dataset, time_period, reform) - - def get_type(name): - if "hmrc" in name: - return "Income" - if "ons" in name: - return "Demographics" - if "obr" in name: - return "Tax-benefit" - return "Other" - - loss_results["type"] = loss_results.name.apply(get_type) - return loss_results - - -reported_benefits = Reform.from_dict( - { - "gov.contrib.policyengine.disable_simulated_benefits": True, - } -) -loss_results = get_loss( - dataset=FRS_2022_23, reform=reported_benefits, time_period=2022 -).copy() -with st.expander(expanded=True, label="Objective function deep dive"): - st.dataframe(loss_results, use_container_width=True) - -st.write( - "It's easier to understand 'what kind of bad' this is by splitting out the statistics into those three categories. Here's a histogram of the absolute relative errors." -) - -fig = px.histogram( - loss_results, - x="abs_rel_error", - nbins=25, - title="Distribution of absolute relative errors", - labels={ - "value": "Absolute relative error", - "count": "Number of variables", - }, - color="type", -) - -st.plotly_chart(fig, use_container_width=True) - -st.write( - """A few notes: - -* We're comparing things in the same relevant time period (2022), and only doing a tiny amount of adjustment to the statistics: OBR statistics are taken directly from the latest EFO, ONS statistics are the most recent projections for 2022, and HMRC statistics are uprated from 2021 to 2022 using the same standard uprating factors we use in the model (and it's only one year adjustment). -* Demogaphics look basically fine: that's expected, because the DWP applies an optimisation algorithm to optimise the household weights to be as close as possible to a similar set of demographic statistics. It's a good sign that we use slightly different statistics than it was trained on and get good accuracy. -* Incomes look *not great at all*. 
We'll take a closer look below to understand why. But the FRS is well-known to under-report income significantly. -* Tax-benefit programs also look *not good*. And this is a concern! Because we're using this dataset to answer questions about tax-benefit programs, and the FRS isn't even providing a good representation of them under baseline law. -""" -) - -incomes = loss_results[loss_results.type == "Income"] -incomes["band"] = incomes.name.apply( - lambda x: x.split("band_")[1].split("_")[0] -).astype(int) -incomes["count"] = incomes.name.apply(lambda x: "count" in x) -incomes["variable"] = incomes.name.apply( - lambda x: x.split("_income_band")[0].split("_count")[0].split("hmrc/")[-1] -) - -variable = st.selectbox("Select income variable", incomes.variable.unique()) -count = st.checkbox("Count") -variable_df = incomes[ - (incomes.variable == variable) & (incomes["count"] == count) -] - -fig = px.bar( - variable_df, - x="band", - y=[ - "target", - "estimate", - "error", - "rel_error", - "abs_error", - "abs_rel_error", - ], - barmode="group", -) -st.plotly_chart(fig, use_container_width=True) - -st.write( - """There are a few interesting things here: - -* The FRS over-estimates incomes in the upper-middle of the distribution and under-estimates them in the top of the distribution. The reason for this is probably: the FRS misses out the top completely, and then because of the weight optimisation (which scales up the working-age age groups to hit their population targets), the middle of the distribution is inflated, overcompensating. -* Some income types are severely under-estimated across all bands: notably capital incomes. This probably reflects issues with the survey questionnaire design more than sampling bias. -""" -) -st.write("OK, so what can we do about it?") - -st.subheader("FRS (+ tax-benefit model)") - -st.write( - "First, let's turn on the model and check nothing unexpected happens." 
-) - - -original_frs_loss = loss_results.copy() -frs_loss = get_loss(FRS_2022_23, None, 2022).copy() -combined_frs_loss = pd.merge( - on="name", - left=original_frs_loss, - right=frs_loss, - suffixes=("_original", "_simulated"), -) -combined_frs_loss["change_in_abs_rel_error"] = ( - combined_frs_loss["abs_rel_error_simulated"] - - combined_frs_loss["abs_rel_error_original"] -) -# Sort columns -combined_frs_loss.sort_index(axis=1, inplace=True) -combined_frs_loss = combined_frs_loss.set_index("name") - -st.dataframe(combined_frs_loss, use_container_width=True) - -st.write( - """Again, a few notes: - -* You might be thinking: 'why do some of the HMRC income statistics change?'. That's because of the State Pension, which is simulated in the model. The State Pension is a component of total income, so people might be moved from one income band to another if we adjust their State Pension payments slightly. -* Some of the tax-benefit statistics change, and get better and worse. This is expected for a variety of reasons- one is that incomes and benefits are often out of sync with each other in the data (the income in the survey week might not match income in the benefits assessment time period). -""" -) - -st.subheader("Adding imputations") - -st.write( - """Now, let's add in the imputations for wealth and consumption. For this, we train *quantile regression forests* (essentially, random forest models that capture the conditional distribution of the data) to predict wealth and consumption variables from FRS-shared variables in other surveys. - -The datasets we use are: -* The Wealth and Assets Survey (WAS) for wealth imputations. -* The Living Costs and Food Survey (LCFS) for most consumption imputations. -* The Effects of Taxes and Benefits on Household Income (ETB) for '£ consumption that is full VAT rateable'. For example, different households will have different profiles in terms of the share of their consumption that falls on the VATable items. 
- -Below is a table showing how just adding these imputations changes our objective statistics (filtered to just rows which changed). Not bad pre-calibrated performance! And we've picked up an extra £200bn in taxes. -""" -) - -new_loss = get_loss(ExtendedFRS_2022_23, None, 2022).copy() -new_loss_against_old = pd.merge( - on="name", - left=frs_loss, - right=new_loss, - suffixes=("_simulated", "_imputed"), -) -new_loss_against_old["change_in_abs_rel_error"] = ( - new_loss_against_old["abs_rel_error_imputed"] - - new_loss_against_old["abs_rel_error_simulated"] -) - -st.dataframe( - new_loss_against_old[ - new_loss_against_old.change_in_abs_rel_error.abs() > 0.01 - ] -) - -st.subheader("Calibration") - -st.write( - "Now, we've got a dataset that's performs pretty well without explicitly targeting the official statistics we care about. So it's time to add the final touch- calibrating the weights to explicitly minimise error against the target set." -) - -calibrated_loss = get_loss(ReweightedFRS_2022_23, None, 2022).copy() -calibrated_loss_against_imputed = pd.merge( - on="name", - left=new_loss, - right=calibrated_loss, - suffixes=("_imputed", "_calibrated"), -) - -calibrated_loss_against_imputed["change_in_abs_rel_error"] = ( - calibrated_loss_against_imputed["abs_rel_error_calibrated"] - - calibrated_loss_against_imputed["abs_rel_error_imputed"] -) - -st.dataframe(calibrated_loss_against_imputed) - -st.write( - "The above table shows what this did to our target set. Mostly, we're hitting targets! But we are still under on income tax and many of the highest income band statistics. Let's take another look at the incomes, but with this new calibrated dataset." 
-) - -incomes = calibrated_loss[loss_results.type == "Income"] -incomes["band"] = incomes.name.apply( - lambda x: x.split("band_")[1].split("_")[0] -).astype(int) -incomes["count"] = incomes.name.apply(lambda x: "count" in x) -incomes["variable"] = incomes.name.apply( - lambda x: x.split("_income_band")[0].split("_count")[0].split("hmrc/")[-1] -) - -variable = st.selectbox( - "Select income variable", - incomes.variable.unique(), - key=1, -) -count = st.checkbox("Count", key=2) -variable_df = incomes[ - (incomes.variable == variable) & (incomes["count"] == count) -] - -fig = px.bar( - variable_df, - x="band", - y=[ - "target", - "estimate", - "error", - "rel_error", - "abs_error", - "abs_rel_error", - ], - barmode="group", -) -st.plotly_chart(fig, use_container_width=True) - -st.write( - """ -So, what's happening here seems like: the FRS just doesn't have enough high-income records for calibration to work straight away. The optimiser can't just set really high weights for the few rich people we do have, because it'd hurt performance on the demographic statistics. - -So, we need a solution to add more high-income records. What we'll do is: - -* Train a QRF model to predict the distributions of income variables from the Survey of Personal Incomes from FRS demographic variables. -* For each FRS person, add an 'imputed income' clone with zero weight. -* Run the calibration again. 
-""" -) - -st.subheader("The Enhanced FRS") - -st.write("Let's see how this new dataset performs.") - -efrs_loss = get_loss(EnhancedFRS_2022_23, None, 2022).copy() -efrs_loss_against_calibrated = pd.merge( - on="name", - left=calibrated_loss, - right=efrs_loss, - suffixes=("_calibrated", "_enhanced"), -) -efrs_loss_against_calibrated["change_in_abs_rel_error"] = ( - efrs_loss_against_calibrated["abs_rel_error_enhanced"] - - efrs_loss_against_calibrated["abs_rel_error_calibrated"] -) - -st.dataframe(efrs_loss_against_calibrated) diff --git a/docs/pages/Validation.py b/docs/pages/Validation.py deleted file mode 100644 index 39ea1f2..0000000 --- a/docs/pages/Validation.py +++ /dev/null @@ -1,88 +0,0 @@ -import streamlit as st -from policyengine_uk_data.utils.download_docs_prerequisites import ( - download_data, -) - -download_data() - -st.set_page_config(layout="wide") - -st.title("Validation") - -from policyengine_uk_data import EnhancedFRS_2022_23, FRS_2022_23, SPI_2020_21 -from policyengine_uk_data.utils.loss import get_loss_results -import pandas as pd - - -@st.cache_data -def get_validation(): - df = pd.DataFrame() - for dataset in [FRS_2022_23, EnhancedFRS_2022_23]: - for year in range(2022, 2029): - print(dataset.label, year) - loss_results = get_loss_results(dataset, year) - loss_results["time_period"] = year - loss_results["dataset"] = dataset.label - df = pd.concat([df, loss_results]) - df = df.reset_index(drop=True) - return df - - -df = get_validation() -truth_df = df[df.dataset == df.dataset.unique()[0]].reset_index() -truth_df["estimate"] = truth_df["target"] -truth_df["error"] = truth_df["estimate"] - truth_df["target"] -truth_df["abs_error"] = truth_df["error"].abs() -truth_df["rel_error"] = truth_df["error"] / truth_df["target"] -truth_df["abs_rel_error"] = truth_df["rel_error"].abs() -truth_df["dataset"] = "Official" -df = pd.concat([df, truth_df]).reset_index(drop=True) - -st.write( - "Calibration check: the table below shows how both the original 
and enhanced FRS datasets compare to over 2,000 official statistics (which the EFRS was explicitly calibrated to hit) from the OBR, DWP and HMRC." -) - -st.write( - "Since the EFRS is calibrated to these statistics, high performance is expected and achieved." -) - -a, b = st.columns(2) - -with a: - frs_mean = df[df.dataset == "FRS (2022-23)"].abs_rel_error.mean() - st.metric("FRS average error", f"{frs_mean:.2%}") -with b: - efrs_mean = df[df.dataset == "Enhanced FRS (2022-23)"].abs_rel_error.mean() - st.metric("Enhanced FRS average error", f"{efrs_mean:.2%}") - -selected_metrics = st.selectbox("Select statistic", df.name.unique()) -comparison = st.selectbox( - "Select metric", - ["estimate", "error", "abs_error", "rel_error", "abs_rel_error"], -) - -# Bar chart showing datasets and a dotted line for actual - -import plotly.express as px - -comparison_df = ( - df[df.name == selected_metrics] - .groupby(["dataset", "time_period"])[comparison] - .mean() - .reset_index() -) - -fig = px.bar( - comparison_df, - x="time_period", - y=comparison, - color="dataset", - barmode="group", - title=f"{selected_metrics} {comparison} comparison", -) -st.plotly_chart(fig, use_container_width=True) - - -st.dataframe(df) - -st.dataframe(df[df.name == selected_metrics]) diff --git a/docs/utils.py b/docs/utils.py new file mode 100644 index 0000000..d61f829 --- /dev/null +++ b/docs/utils.py @@ -0,0 +1,7 @@ +import plotly.io as pio +from IPython.display import HTML + + +def show(fig): + html = pio.to_html(fig) + return HTML(html) diff --git a/policyengine_uk_data/utils/github.py b/policyengine_uk_data/utils/github.py index 27c88e1..d949ac8 100644 --- a/policyengine_uk_data/utils/github.py +++ b/policyengine_uk_data/utils/github.py @@ -74,7 +74,7 @@ def upload( f"Asset {file_name} already exists in release {release_tag} of {org}/{repo}, skipping." 
) return - + url = f"https://uploads.github.com/repos/{org}/{repo}/releases/{release_id}/assets?name={file_name}" headers = { @@ -85,7 +85,7 @@ def upload( with open(file_path, "rb") as f: data = f.read() - + response = requests.post( url, headers=headers, @@ -96,9 +96,8 @@ def upload( raise ValueError( f"Invalid response code {response.status_code} for url {url}. Received: {response.text}" ) - - return response.json() + return response.json() def set_pr_auto_review_comment(text: str): diff --git a/pyproject.toml b/pyproject.toml index 2ecfff7..cd227f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,10 +25,11 @@ dev = [ "black", "pytest", "policyengine_uk>=1.8.0", - "streamlit", "survey_enhance", "torch", "tables", + "furo", + "jupyter-book", ] [tool.setuptools] From 8276a00fff7bff57ad0c78abd9ba8d99c610a6f1 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 17 Sep 2024 18:32:38 +0100 Subject: [PATCH 05/14] Add changelog rule --- Makefile | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Makefile b/Makefile index f78fd5b..a02c02f 100644 --- a/Makefile +++ b/Makefile @@ -31,3 +31,10 @@ build: publish: twine upload dist/* + +changelog: + build-changelog changelog.yaml --output changelog.yaml --update-last-date --start-from 1.0.0 --append-file changelog_entry.yaml + build-changelog changelog.yaml --org PolicyEngine --repo policyengine-us-data --output CHANGELOG.md --template .github/changelog_template.md + bump-version changelog.yaml pyproject.toml + rm changelog_entry.yaml || true + touch changelog_entry.yaml From c40523ca564d94bab6f0ca4124ab097618a9c327 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 17 Sep 2024 18:38:42 +0100 Subject: [PATCH 06/14] Ensure Extended FRS generates --- Makefile | 1 + policyengine_uk_data/utils/imputations/capital_gains.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a02c02f..975f206 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,7 @@ documentation: data: 
python policyengine_uk_data/datasets/frs/dwp_frs.py python policyengine_uk_data/datasets/frs/frs.py + python policyengine_uk_data/datasets/frs/extended_frs.py python policyengine_uk_data/datasets/frs/enhanced_frs.py build: diff --git a/policyengine_uk_data/utils/imputations/capital_gains.py b/policyengine_uk_data/utils/imputations/capital_gains.py index 54c7862..3a9b571 100644 --- a/policyengine_uk_data/utils/imputations/capital_gains.py +++ b/policyengine_uk_data/utils/imputations/capital_gains.py @@ -86,7 +86,7 @@ def loss(blend_factor): return loss optimiser = Adam([blend_factor], lr=1e-1) - progress = tqdm(range(1000)) + progress = range(1000) for i in progress: optimiser.zero_grad() loss_value = loss(blend_factor) From f3f42c37c8a0a5f75e72e80a0c96444c9ddf2747 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 17 Sep 2024 18:41:22 +0100 Subject: [PATCH 07/14] Add initial changelog --- CHANGELOG.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..309434f --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,13 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +## [1.0.0] - 2024-09-09 17:29:10 + +### Added + +- Initialized changelogging + From 1748e2af06c81e7b57be353cd74a2880c9607491 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 17 Sep 2024 19:03:02 +0100 Subject: [PATCH 08/14] Add download links for all microdata --- policyengine_uk_data/datasets/frs/dwp_frs.py | 16 ++----- policyengine_uk_data/datasets/frs/frs.py | 2 - .../storage/download_private_prerequisites.py | 43 ++++++++++--------- policyengine_uk_data/utils/__init__.py | 1 - .../utils/download_docs_prerequisites.py | 25 ----------- 5 files changed, 26 insertions(+), 61 deletions(-) delete mode 100644 policyengine_uk_data/utils/download_docs_prerequisites.py diff --git a/policyengine_uk_data/datasets/frs/dwp_frs.py b/policyengine_uk_data/datasets/frs/dwp_frs.py index 982fde2..cc3f3cf 100644 --- a/policyengine_uk_data/datasets/frs/dwp_frs.py +++ b/policyengine_uk_data/datasets/frs/dwp_frs.py @@ -22,14 +22,6 @@ def generate(self): if isinstance(tab_folder, str): tab_folder = Path(tab_folder) - # Folder might be either a folder, or a zipped folder. 
- - if tab_folder.suffix == ".zip": - import zipfile - - with zipfile.ZipFile(tab_folder, "r") as zip_ref: - zip_ref.extractall(tab_folder.parent) - tab_folder = Path(tab_folder.parent / tab_folder.stem) # Load the data tables = {} @@ -93,7 +85,7 @@ def generate(self): class DWP_FRS_2020_21(DWP_FRS): - folder = STORAGE_FOLDER / "frs_2020_21.zip" + folder = STORAGE_FOLDER / "frs_2020_21" name = "dwp_frs_2020_21" label = "DWP FRS (2020-21)" file_path = STORAGE_FOLDER / "dwp_frs_2020_21.h5" @@ -101,7 +93,7 @@ class DWP_FRS_2020_21(DWP_FRS): class DWP_FRS_2021_22(DWP_FRS): - folder = STORAGE_FOLDER / "frs_2021_22.zip" + folder = STORAGE_FOLDER / "frs_2021_22" name = "dwp_frs_2021_22" label = "DWP FRS (2021-22)" file_path = STORAGE_FOLDER / "dwp_frs_2021_22.h5" @@ -109,7 +101,7 @@ class DWP_FRS_2021_22(DWP_FRS): class DWP_FRS_2022_23(DWP_FRS): - folder = STORAGE_FOLDER / "frs_2022_23.zip" + folder = STORAGE_FOLDER / "frs_2022_23" name = "dwp_frs_2022_23" label = "DWP FRS (2022-23)" file_path = STORAGE_FOLDER / "dwp_frs_2022_23.h5" @@ -117,6 +109,4 @@ class DWP_FRS_2022_23(DWP_FRS): if __name__ == "__main__": - DWP_FRS_2020_21().generate() - DWP_FRS_2021_22().generate() DWP_FRS_2022_23().generate() diff --git a/policyengine_uk_data/datasets/frs/frs.py b/policyengine_uk_data/datasets/frs/frs.py index 4c9d821..6ba72c8 100644 --- a/policyengine_uk_data/datasets/frs/frs.py +++ b/policyengine_uk_data/datasets/frs/frs.py @@ -837,6 +837,4 @@ def impute_brmas(dataset, frs): if __name__ == "__main__": - FRS_2020_21().generate() - FRS_2021_22().generate() FRS_2022_23().generate() diff --git a/policyengine_uk_data/storage/download_private_prerequisites.py b/policyengine_uk_data/storage/download_private_prerequisites.py index 09b5852..b5f8e34 100644 --- a/policyengine_uk_data/storage/download_private_prerequisites.py +++ b/policyengine_uk_data/storage/download_private_prerequisites.py @@ -1,26 +1,29 @@ from policyengine_uk_data.utils.github import download from pathlib import Path 
+import zipfile + +def extract_zipped_folder(folder): + folder = Path(folder) + with zipfile.ZipFile(folder, "r") as zip_ref: + zip_ref.extractall(folder.parent) FOLDER = Path(__file__).parent -download( - "PolicyEngine", - "ukda", - "release", - "frs_2020_21.zip", - FOLDER / "frs_2020_21.zip", -) -download( - "PolicyEngine", - "ukda", - "release", - "frs_2021_22.zip", - FOLDER / "frs_2021_22.zip", -) -download( - "PolicyEngine", - "ukda", - "release", +FILES = [ "frs_2022_23.zip", - FOLDER / "frs_2022_23.zip", -) + "lcfs_2021_22.zip", + "was_2006_20.zip", + "etb_1977_21.zip", + "spi_2020_21.zip", +] + +for file in FILES: + download( + "PolicyEngine", + "ukda", + "release", + file, + FOLDER / file, + ) + extract_zipped_folder(FOLDER / file) + (FOLDER / file).unlink() diff --git a/policyengine_uk_data/utils/__init__.py b/policyengine_uk_data/utils/__init__.py index f1c9f22..983bc4a 100644 --- a/policyengine_uk_data/utils/__init__.py +++ b/policyengine_uk_data/utils/__init__.py @@ -2,4 +2,3 @@ from .uprating import * from .datasets import * from .loss import * -from .download_docs_prerequisites import * diff --git a/policyengine_uk_data/utils/download_docs_prerequisites.py b/policyengine_uk_data/utils/download_docs_prerequisites.py deleted file mode 100644 index dd015ec..0000000 --- a/policyengine_uk_data/utils/download_docs_prerequisites.py +++ /dev/null @@ -1,25 +0,0 @@ -from policyengine_uk_data.utils.github import download -from policyengine_uk_data.storage import STORAGE_FOLDER - -PREREQUISITES = [ - { - "repo": "ukda", - "file_name": "frs_2022_23.h5", - }, - { - "repo": "ukda", - "file_name": "enhanced_frs_2022_23.h5", - }, -] - - -def download_data(): - for prerequisite in PREREQUISITES: - if not (STORAGE_FOLDER / prerequisite["file_name"]).exists(): - download( - "PolicyEngine", - prerequisite["repo"], - "release", - prerequisite["file_name"], - STORAGE_FOLDER / prerequisite["file_name"], - ) From d127581a8e9571035df5d31fcf7755adc6c13fdc Mon Sep 17 00:00:00 
2001 From: Nikhil Woodruff Date: Tue, 17 Sep 2024 19:05:06 +0100 Subject: [PATCH 09/14] Try fix for changelog action --- .github/workflows/ci_cd.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/ci_cd.yaml b/.github/workflows/ci_cd.yaml index 0e7afc5..aee95d4 100644 --- a/.github/workflows/ci_cd.yaml +++ b/.github/workflows/ci_cd.yaml @@ -110,10 +110,6 @@ jobs: python-version: "3.10" - name: Build changelog run: pip install "yaml-changelog>=0.1.7" && make changelog - - name: Preview changelog update - run: ".github/get-changelog-diff.sh" - - name: Check version number has been properly updated - run: ".github/is-version-number-acceptable.sh" - name: Update changelog uses: EndBug/add-and-commit@v9 with: From 745473cecb130f7b41c9a467e27e841048a866b2 Mon Sep 17 00:00:00 2001 From: "Github Actions[bot]" Date: Tue, 17 Sep 2024 18:05:28 +0000 Subject: [PATCH 10/14] Update PolicyEngine US data --- CHANGELOG.md | 9 +++++++++ changelog.yaml | 5 +++++ changelog_entry.yaml | 4 ---- pyproject.toml | 2 +- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 309434f..9d89251 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,9 +5,18 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.1.0] - 2024-09-17 18:05:27 + +### Changed + +- Lightened dependency list. + ## [1.0.0] - 2024-09-09 17:29:10 ### Added - Initialized changelogging + + +[1.1.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.0.0...1.1.0 diff --git a/changelog.yaml b/changelog.yaml index 8944a4b..317f2a1 100644 --- a/changelog.yaml +++ b/changelog.yaml @@ -3,3 +3,8 @@ - Initialized changelogging date: 2024-09-09 17:29:10 version: 1.0.0 +- bump: minor + changes: + changed: + - Lightened dependency list. 
+ date: 2024-09-17 18:05:27 diff --git a/changelog_entry.yaml b/changelog_entry.yaml index f3b708c..e69de29 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +0,0 @@ -- bump: minor - changes: - changed: - - Lightened dependency list. diff --git a/pyproject.toml b/pyproject.toml index cd227f4..e2ec2ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "policyengine_uk_data" -version = "1.0.0" +version = "1.1.0" description = "A package to create representative microdata for the UK." readme = "README.md" authors = [ From 433757e135de8a0e83eaa85754d61dfc9446fb3d Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 17 Sep 2024 19:11:16 +0100 Subject: [PATCH 11/14] Use correct folder names --- .gitignore | 1 + policyengine_uk_data/storage/download_private_prerequisites.py | 2 ++ policyengine_uk_data/utils/imputations/consumption.py | 2 +- policyengine_uk_data/utils/imputations/income.py | 2 +- policyengine_uk_data/utils/imputations/vat.py | 2 +- policyengine_uk_data/utils/imputations/wealth.py | 2 +- 6 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 61b49b8..f07f442 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ **/*.csv **/*.zip **/*.pkl +**/*.tab !uprating_factors.csv !uprating_growth_factors.csv !incomes.csv diff --git a/policyengine_uk_data/storage/download_private_prerequisites.py b/policyengine_uk_data/storage/download_private_prerequisites.py index b5f8e34..f2813a5 100644 --- a/policyengine_uk_data/storage/download_private_prerequisites.py +++ b/policyengine_uk_data/storage/download_private_prerequisites.py @@ -18,6 +18,8 @@ def extract_zipped_folder(folder): ] for file in FILES: + if (FOLDER / file).exists(): + continue download( "PolicyEngine", "ukda", diff --git a/policyengine_uk_data/utils/imputations/consumption.py b/policyengine_uk_data/utils/imputations/consumption.py index 6de810f..2270f72 100644 --- 
a/policyengine_uk_data/utils/imputations/consumption.py +++ b/policyengine_uk_data/utils/imputations/consumption.py @@ -5,7 +5,7 @@ import yaml from policyengine_uk_data.storage import STORAGE_FOLDER -LCFS_TAB_FOLDER = Path("/Users/nikhilwoodruff/Downloads/UKDA-9123-tab/tab") +LCFS_TAB_FOLDER = STORAGE_FOLDER / "lcfs_2021_22" REGIONS = { 1: "NORTH_EAST", diff --git a/policyengine_uk_data/utils/imputations/income.py b/policyengine_uk_data/utils/imputations/income.py index 7cc2558..95d3947 100644 --- a/policyengine_uk_data/utils/imputations/income.py +++ b/policyengine_uk_data/utils/imputations/income.py @@ -4,7 +4,7 @@ import numpy as np from policyengine_uk_data.storage import STORAGE_FOLDER -SPI_TAB_FOLDER = Path("/Users/nikhilwoodruff/Downloads/UKDA-9121-tab/tab") +SPI_TAB_FOLDER = STORAGE_FOLDER / "spi_2020_21" SPI_RENAMES = dict( private_pension_income="PENSION", self_employment_income="PROFITS", diff --git a/policyengine_uk_data/utils/imputations/vat.py b/policyengine_uk_data/utils/imputations/vat.py index 868f441..9519212 100644 --- a/policyengine_uk_data/utils/imputations/vat.py +++ b/policyengine_uk_data/utils/imputations/vat.py @@ -4,7 +4,7 @@ import numpy as np from policyengine_uk_data.storage import STORAGE_FOLDER -ETB_TAB_FOLDER = Path("/Users/nikhilwoodruff/Downloads/UKDA-8856-tab/tab") +ETB_TAB_FOLDER = STORAGE_FOLDER / "etb_1977_21" CONSUMPTION_PCT_REDUCED_RATE = 0.03 # From OBR's VAT page CURRENT_VAT_RATE = 0.2 diff --git a/policyengine_uk_data/utils/imputations/wealth.py b/policyengine_uk_data/utils/imputations/wealth.py index 686a9b3..2513f5d 100644 --- a/policyengine_uk_data/utils/imputations/wealth.py +++ b/policyengine_uk_data/utils/imputations/wealth.py @@ -5,7 +5,7 @@ import yaml from policyengine_uk_data.storage import STORAGE_FOLDER -WAS_TAB_FOLDER = Path("/Users/nikhilwoodruff/Downloads/UKDA-7215-tab/tab") +WAS_TAB_FOLDER = STORAGE_FOLDER / "was_2006_20" REGIONS = { 1: "NORTH_EAST", From d66eae0ce0e3128e5a569e808d96f738f24ffeca Mon Sep 17 
00:00:00 2001 From: Nikhil Woodruff Date: Tue, 17 Sep 2024 19:16:37 +0100 Subject: [PATCH 12/14] Remove check version action --- .github/workflows/ci_cd.yaml | 25 ------------------- .../storage/download_private_prerequisites.py | 2 ++ 2 files changed, 2 insertions(+), 25 deletions(-) diff --git a/.github/workflows/ci_cd.yaml b/.github/workflows/ci_cd.yaml index aee95d4..0fc23c4 100644 --- a/.github/workflows/ci_cd.yaml +++ b/.github/workflows/ci_cd.yaml @@ -93,31 +93,6 @@ jobs: run: pytest - name: Test documentation builds run: make documentation - check-version: - name: Check version - if: github.event_name == 'pull_request' - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 # Fetch all history for all tags and branches - repository: ${{ github.event.pull_request.head.repo.full_name }} - ref: ${{ github.event.pull_request.head.ref }} - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - name: Build changelog - run: pip install "yaml-changelog>=0.1.7" && make changelog - - name: Update changelog - uses: EndBug/add-and-commit@v9 - with: - add: "." 
- committer_name: Github Actions[bot] - author_name: Github Actions[bot] - message: Update PolicyEngine US data - github_token: ${{ secrets.POLICYENGINE_GITHUB }} docker: name: Docker diff --git a/policyengine_uk_data/storage/download_private_prerequisites.py b/policyengine_uk_data/storage/download_private_prerequisites.py index f2813a5..4234142 100644 --- a/policyengine_uk_data/storage/download_private_prerequisites.py +++ b/policyengine_uk_data/storage/download_private_prerequisites.py @@ -2,11 +2,13 @@ from pathlib import Path import zipfile + def extract_zipped_folder(folder): folder = Path(folder) with zipfile.ZipFile(folder, "r") as zip_ref: zip_ref.extractall(folder.parent) + FOLDER = Path(__file__).parent FILES = [ From ada4e73169d0bcda844b24b6929979bfb7c131a8 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 17 Sep 2024 19:17:51 +0100 Subject: [PATCH 13/14] Add to CONTRIBUTING.md --- .github/CONTRIBUTING.md | 4 ++++ pyproject.toml | 1 + 2 files changed, 5 insertions(+) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 5b5d24e..4604959 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1,3 +1,7 @@ ## Updating data If your changes present a non-bugfix change to one or more datasets which are cloud-hosted (FRS and EFRS), then please change both the filename and URL (in both the class definition file and in `storage/upload_completed_datasets.py`). This enables us to store historical versions of datasets separately and reproducibly. + +## Updating the versioning + +Please add to `changelog.yaml` and then run `make changelog` before committing the results ONCE in this PR. 
diff --git a/pyproject.toml b/pyproject.toml index e2ec2ae..ca7ab03 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ dev = [ "tables", "furo", "jupyter-book", + "yaml-changelog>=0.1.7", ] [tool.setuptools] From 930653fabd256915ed9376f90abaa392d869ad06 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Tue, 17 Sep 2024 19:18:31 +0100 Subject: [PATCH 14/14] Remove bad tqdm --- policyengine_uk_data/utils/imputations/capital_gains.py | 1 - 1 file changed, 1 deletion(-) diff --git a/policyengine_uk_data/utils/imputations/capital_gains.py b/policyengine_uk_data/utils/imputations/capital_gains.py index 3a9b571..c4e4551 100644 --- a/policyengine_uk_data/utils/imputations/capital_gains.py +++ b/policyengine_uk_data/utils/imputations/capital_gains.py @@ -92,7 +92,6 @@ def loss(blend_factor): loss_value = loss(blend_factor) loss_value.backward() optimiser.step() - progress.set_description(f"Loss: {loss_value.item()}") if loss_value.item() < 1e-3: break