diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index ff36d52..94b4be6 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1,3 +1,7 @@ ## Updating data If your changes present a non-bugfix change to one or more datasets which are cloud-hosted (CPS, ECPS and PUF), then please change both the filename and URL (in both the class definition file and in `storage/upload_completed_datasets.py`. This enables us to store historical versions of datasets separately and reproducibly. + +## Updating versioning + +Please add a versioning entry to `changelog_entry.yaml` (see previous PRs for examples), then run `make changelog` and commit the results ONCE in this PR. diff --git a/.github/workflows/ci_cd.yaml b/.github/workflows/ci_cd.yaml index 2c564f7..c2da057 100644 --- a/.github/workflows/ci_cd.yaml +++ b/.github/workflows/ci_cd.yaml @@ -22,37 +22,8 @@ jobs: pip install black - name: Check formatting run: black . -l 79 --check - check-version: - name: Check version - if: github.event_name == 'pull_request' - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 # Fetch all history for all tags and branches - repository: ${{ github.event.pull_request.head.repo.full_name }} - ref: ${{ github.event.pull_request.head.ref }} - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - name: Build changelog - run: pip install "yaml-changelog>=0.1.7" && make changelog - - name: Preview changelog update - run: ".github/get-changelog-diff.sh" - - name: Check version number has been properly updated - run: ".github/is-version-number-acceptable.sh" - - name: Update changelog - uses: EndBug/add-and-commit@v9 - with: - add: "." 
- committer_name: Github Actions[bot] - author_name: Github Actions[bot] - message: Update PolicyEngine US data - github_token: ${{ secrets.POLICYENGINE_GITHUB }} test: - name: Build and Test + name: Build and test runs-on: ubuntu-latest steps: - name: Checkout code @@ -71,8 +42,12 @@ jobs: POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }} - name: Build datasets run: make data + env: + TEST_LITE: true - name: Run tests run: pytest + - name: Test documentation builds + run: make documentation publish-to-pypi: name: Publish to PyPI runs-on: ubuntu-latest @@ -115,8 +90,31 @@ jobs: run: docker build . -f docker/policyengine_us_data.Dockerfile -t ghcr.io/policyengine/policyengine-us-data:latest - name: Push container run: docker push ghcr.io/policyengine/policyengine-us-data:latest + publish-docs: + name: Publish documentation + runs-on: ubuntu-latest + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch all history for all tags and branches + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.12 + - name: Install package + run: pip install -e ".[dev]" + - name: Build Jupyter Book + run: make documentation + - name: Deploy documentation + uses: JamesIves/github-pages-deploy-action@releases/v4 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + BRANCH: gh-pages + FOLDER: docs/_build/html upload: - name: Upload Data + name: Upload data runs-on: ubuntu-latest needs: [lint, test] if: github.event_name == 'push' && github.ref == 'refs/heads/main' @@ -141,12 +139,12 @@ jobs: uses: actions/upload-artifact@v4 with: name: cps_2024.h5 - path: policyengine_us_data/data_storage/cps_2024.h5 + path: policyengine_us_data/storage/cps_2024.h5 - name: Upload ECPS 2024 uses: actions/upload-artifact@v4 with: name: enhanced_cps_2024.h5 - path: policyengine_us_data/data_storage/enhanced_cps_2024.h5 + path: 
policyengine_us_data/storage/enhanced_cps_2024.h5 - name: Upload data run: make upload env: diff --git a/.gitignore b/.gitignore index 240807c..f191579 100644 --- a/.gitignore +++ b/.gitignore @@ -2,9 +2,9 @@ **/__pycache__ **/.DS_STORE **/*.h5 -*.ipynb **/*.csv !uprating_factors.csv !uprating_growth_factors.csv !healthcare_spending.csv !spm_threshold_agi.csv +**/_build diff --git a/CHANGELOG.md b/CHANGELOG.md index a0591f4..5dcacbe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.3.1] - 2024-09-17 19:37:44 + +### Added + +- Jupyter Book documentation. + ## [1.3.0] - 2024-09-17 10:27:10 ### Fixed @@ -59,6 +65,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 +[1.3.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.3.0...1.3.1 [1.3.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.2.1...1.3.0 [1.2.1]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.2.0...1.2.1 [1.2.0]: https://github.com/PolicyEngine/policyengine-us-data/compare/1.1.1...1.2.0 diff --git a/Makefile b/Makefile index 707c4bf..43de4c5 100644 --- a/Makefile +++ b/Makefile @@ -19,25 +19,25 @@ changelog: touch changelog_entry.yaml download: - python policyengine_us_data/data_storage/download_public_prerequisites.py - python policyengine_us_data/data_storage/download_private_prerequisites.py + python policyengine_us_data/storage/download_public_prerequisites.py + python policyengine_us_data/storage/download_private_prerequisites.py upload: - python policyengine_us_data/data_storage/upload_completed_datasets.py + python policyengine_us_data/storage/upload_completed_datasets.py docker: docker buildx build --platform linux/amd64 . 
-t policyengine-us-data:latest - + documentation: - streamlit run docs/Home.py + jb clean docs && jb build docs data: python policyengine_us_data/datasets/cps/cps.py python policyengine_us_data/datasets/cps/enhanced_cps.py clean: - rm -f policyengine_us_data/data_storage/puf_2015.csv - rm -f policyengine_us_data/data_storage/demographics_2015.csv + rm -f policyengine_us_data/storage/puf_2015.csv + rm -f policyengine_us_data/storage/demographics_2015.csv build: python -m build diff --git a/changelog.yaml b/changelog.yaml index 77b6197..5ab54e0 100644 --- a/changelog.yaml +++ b/changelog.yaml @@ -40,3 +40,8 @@ fixed: - Moved heavy dependencies to dev. date: 2024-09-17 10:27:10 +- bump: patch + changes: + added: + - Jupyter Book documentation. + date: 2024-09-17 19:37:44 diff --git a/docker/docs.Dockerfile b/docker/docs.Dockerfile deleted file mode 100644 index 090ca7e..0000000 --- a/docker/docs.Dockerfile +++ /dev/null @@ -1,5 +0,0 @@ -FROM python:latest -COPY . . -RUN make install -EXPOSE 8080 -ENTRYPOINT ["streamlit", "run", "docs/Home.py", "--server.port=8080", "--server.address=0.0.0.0"] diff --git a/docs/Home.py b/docs/Home.py deleted file mode 100644 index 4cbc16f..0000000 --- a/docs/Home.py +++ /dev/null @@ -1,42 +0,0 @@ -import streamlit as st -from policyengine_us_data.utils.docs_prerequisites_download import ( - download_data, -) - -download_data() - -st.title("PolicyEngine-US-Data") - -st.write( - """PolicyEngine-US-Data is a package to create representative microdata for the US, designed for input in the PolicyEngine tax-benefit microsimulation model.""" -) - -st.subheader("What does this repo do?") - -st.write( - """Principally, this package creates a (partly synthetic) dataset of households (with incomes, demographics and more) that describes the U.S. household sector. 
This dataset synthesises multiple sources of data (the Current Population Survey, the IRS Public Use File, and administrative statistics) to improve upon the accuracy of **any** of them.""" -) - -st.subheader("What does this dataset look like?") - -st.write( - "The below table shows an extract of the person records in one household in the dataset." -) - - -@st.cache_data -def sample_household(): - import pandas as pd - from policyengine_us_data.datasets import EnhancedCPS_2024 - from policyengine_us import Microsimulation - - df = Microsimulation(dataset=EnhancedCPS_2024).to_input_dataframe() - - household_id = df.person_household_id__2024.values[10] - people_in_household = df[df.person_household_id__2024 == household_id] - return people_in_household - - -people_in_household = sample_household() - -st.dataframe(people_in_household.T, use_container_width=True) diff --git a/docs/_config.yml b/docs/_config.yml new file mode 100644 index 0000000..5636ac9 --- /dev/null +++ b/docs/_config.yml @@ -0,0 +1,22 @@ +title: PolicyEngine US data +author: PolicyEngine +copyright: "2024" +logo: logo.png + +execute: + execute_notebooks: off + +repository: + url: https://github.com/policyengine/policyengine-us-data + branch: master + path_to_book: docs + +sphinx: + config: + html_js_files: + - https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.7/require.min.js + html_theme: furo + pygments_style: default + html_css_files: + - style.css + nb_remove_code_source: true \ No newline at end of file diff --git a/docs/_static/style.css b/docs/_static/style.css new file mode 100644 index 0000000..e511f94 --- /dev/null +++ b/docs/_static/style.css @@ -0,0 +1,2 @@ +@import url('https://fonts.googleapis.com/css2?family=Roboto+Serif:opsz@8..144&family=Roboto:wght@300&display=swap'); + diff --git a/docs/_toc.yml b/docs/_toc.yml new file mode 100644 index 0000000..5dd18b6 --- /dev/null +++ b/docs/_toc.yml @@ -0,0 +1,4 @@ +format: jb-book +root: intro +chapters: +- file: validation.ipynb diff 
--git a/docs/intro.md b/docs/intro.md new file mode 100644 index 0000000..b220cd0 --- /dev/null +++ b/docs/intro.md @@ -0,0 +1,14 @@ +# Introduction + +PolicyEngine-US-Data is a package that creates representative microdata for the US, +designed for input in the PolicyEngine tax-benefit microsimulation model. This tool +allows users to explore the data sources, validation processes, and enhancements +made to ensure accurate and reliable microsimulation results. + +PolicyEngine is a tool with a clear purpose: given assumptions about US government policy and US households, to predict what US households will look like in the next few years. To do that, we need two things: + +* An accurate model of the effects of policy rules on households. +* An accurate representation of the current US household sector *now*. + +This repository is dedicated to the second of those. In this documentation, we'll explain how we do that, but we'll also use our model (the first bullet) to see what we end up with when we combine the two, and measure up against other organisations doing the same thing. 
+ diff --git a/docs/logo.png b/docs/logo.png new file mode 100644 index 0000000..12736e4 Binary files /dev/null and b/docs/logo.png differ diff --git a/docs/pages/Aggregates.py b/docs/pages/Aggregates.py deleted file mode 100644 index d036263..0000000 --- a/docs/pages/Aggregates.py +++ /dev/null @@ -1,48 +0,0 @@ -import streamlit as st -from policyengine_us_data.utils.docs_prerequisites_download import ( - download_data, -) - -download_data() - -st.title("Aggregates") - -st.write( - """The table below shows the totals for calendar year 2024 for the Enhanced CPS dataset variables.""" -) - - -@st.cache_data -def sample_household(): - from policyengine_us import Microsimulation - from policyengine_us_data import EnhancedCPS_2024 - from policyengine_us_data.datasets.cps.extended_cps import ( - IMPUTED_VARIABLES as FINANCE_VARIABLES, - ) - import pandas as pd - - sim = Microsimulation(dataset=EnhancedCPS_2024) - - df = ( - pd.DataFrame( - { - "Variable": FINANCE_VARIABLES, - "Total ($bn)": [ - round( - sim.calculate(variable, map_to="household").sum() - / 1e9, - 1, - ) - for variable in FINANCE_VARIABLES - ], - } - ) - .sort_values("Total ($bn)", ascending=False) - .set_index("Variable") - ) - return df - - -df = sample_household() - -st.dataframe(df, use_container_width=True) diff --git a/docs/pages/Benchmarks.py b/docs/pages/Benchmarks.py deleted file mode 100644 index 3bb926d..0000000 --- a/docs/pages/Benchmarks.py +++ /dev/null @@ -1,118 +0,0 @@ -import streamlit as st -from policyengine_us_data.utils.docs_prerequisites_download import ( - download_data, -) - -download_data() - -st.title("Benchmarks") - -from policyengine_us_data.datasets import CPS_2024, PUF_2024, EnhancedCPS_2024 -from policyengine_us_data.utils import build_loss_matrix -from policyengine_us import Microsimulation -import pandas as pd -import plotly.express as px -import numpy as np - - -@st.cache_data -def compare_datasets(): - comparison_combined = pd.DataFrame() - for dataset in [CPS_2024, 
PUF_2024, EnhancedCPS_2024]: - sim = Microsimulation(dataset=dataset) - weights = sim.calculate("household_weight").values - loss_matrix, targets_array = build_loss_matrix(dataset, 2024) - target_names = loss_matrix.columns - estimates = weights @ loss_matrix.values - comparison = pd.DataFrame( - { - "Target": target_names, - "Estimate": estimates, - "Actual": targets_array, - } - ) - comparison["Error"] = comparison["Estimate"] - comparison["Actual"] - comparison["Abs. Error"] = comparison["Error"].abs() - comparison["Abs. Error %"] = ( - (comparison["Abs. Error"] / comparison["Actual"].abs()) - .replace([np.inf, -np.inf], np.nan) - .fillna(0) - ) - comparison["Dataset"] = dataset.label - comparison_combined = pd.concat([comparison_combined, comparison]) - - return comparison_combined - - -df = compare_datasets() - -mean_relative_error_by_dataset = ( - df.groupby("Dataset")["Abs. Error %"] - .mean() - .sort_values(ascending=False) - .apply(lambda x: round(x, 3)) -) - -st.write( - f"PolicyEngine uses **{len(df.Target.unique())}** targets for calibration in the Enhanced CPS. This page compares the estimates and errors for these targets across the three datasets." -) - -st.dataframe(mean_relative_error_by_dataset, use_container_width=True) - -metric = st.selectbox( - "Metric", ["Estimate", "Error", "Abs. Error", "Abs. Error %"] -) -target = st.selectbox("Target", df["Target"].unique()) - -fig = px.bar( - df[df["Target"] == target], - x="Dataset", - y=metric, - title=f"{metric} for {target}", -) - -if metric == "Estimate": - # Add a dashed line at the target - fig.add_shape( - type="line", - x0=-0.5, - x1=2.5, - y0=df.loc[df["Target"] == target, "Actual"].values[0], - y1=df.loc[df["Target"] == target, "Actual"].values[0], - line=dict(dash="dash"), - ) - -st.subheader("Dataset comparisons") -st.write( - "The chart below, for a selected target and metric, shows the estimates and errors for each dataset." 
-) - -st.plotly_chart(fig, use_container_width=True) - -ecps_df = df[df["Dataset"] == "Enhanced CPS 2024"] - -st.subheader("Enhanced CPS 2024") -st.write( - "The table below shows the error for each target in the Enhanced CPS 2024 dataset." -) - -st.dataframe(ecps_df, use_container_width=True) - -st.subheader("Relative errors by dataset") - -st.write( - "The table below shows the relative error for each target in each dataset, and the change after moving the ECPS." -) - -long_to_wide = df.pivot( - index="Target", columns="Dataset", values="Abs. Error %" -).reset_index() -long_to_wide["CPS to ECPS change"] = ( - long_to_wide["Enhanced CPS 2024"] - long_to_wide["CPS 2024 (2022-based)"] -) -long_to_wide["PUF to ECPS change"] = ( - long_to_wide["Enhanced CPS 2024"] - long_to_wide["PUF 2024 (2015-based)"] -) -long_to_wide.sort_values("cps_to_ecps_change", ascending=False) - -st.dataframe(long_to_wide, use_container_width=True) diff --git a/docs/pages/Distributions.py b/docs/pages/Distributions.py deleted file mode 100644 index 7e3d0dd..0000000 --- a/docs/pages/Distributions.py +++ /dev/null @@ -1,81 +0,0 @@ -import streamlit as st -from policyengine_us_data.utils.docs_prerequisites_download import ( - download_data, -) - -download_data() - -st.title("Distributions") - -st.write( - "This page has several visualisations of the distributions of different variables in the PE-compatible datasets." -) - -from policyengine_us_data import CPS_2024, EnhancedCPS_2024, PUF_2024 -from policyengine_us_data.utils.soi import ( - pe_to_soi, - get_soi, - compare_soi_replication_to_soi, -) -from policyengine_us_data.utils.loss import fmt -import pandas as pd -import plotly.express as px -import numpy as np - -st.subheader("IRS SOI") - -st.write( - "Use these controls to see how the different PE-compatible datasets compare to (extrapolated from 2021 to 2024) IRS SOI data." 
-) - - -@st.cache_data -def _get_soi(year): - return get_soi(year) - - -soi = _get_soi(2024) - - -@st.cache_data -def get_soi_replication(dataset, year): - df = compare_soi_replication_to_soi(pe_to_soi(dataset, year), soi) - return df - - -variable = st.selectbox("Variable", soi.Variable.unique()) -filing_status = st.selectbox("Filing status", soi["Filing status"].unique()) -taxable = False -count = st.checkbox("Count") - - -def get_bar_chart(variable, filing_status, taxable, count): - df = soi[soi.Variable == variable] - df["Dataset"] = "SOI" - for dataset in [EnhancedCPS_2024, PUF_2024, CPS_2024]: - replication = get_soi_replication(dataset, 2024) - replication["Dataset"] = dataset.label - df = pd.concat([df, replication[replication.Variable == variable]]) - - df = df[df["Filing status"] == filing_status] - df = df[df["Taxable only"] == taxable] - df = df[df["Count"] == count] - df = df[ - ~( - (df["AGI lower bound"] == -np.inf) - & (df["AGI upper bound"] == np.inf) - ) - ] - - df["AGI lower bound"] = df["AGI lower bound"].apply(fmt) - - return px.bar( - df, - x="AGI lower bound", - y="Value", - color="Dataset", - barmode="group", - ) - - -st.plotly_chart(get_bar_chart(variable, filing_status, taxable, count)) diff --git a/docs/pages/Reforms.py b/docs/pages/Reforms.py deleted file mode 100644 index 7938650..0000000 --- a/docs/pages/Reforms.py +++ /dev/null @@ -1,132 +0,0 @@ -import streamlit as st -from policyengine_us_data.utils.docs_prerequisites_download import ( - download_data, -) - -download_data() - -st.title("Reforms") - -from policyengine_us import Microsimulation -from policyengine_core.reforms import Reform -from pathlib import Path -import pandas as pd - -FOLDER = Path(__file__).parent -scores = ( - pd.read_csv(FOLDER / "scores.csv") - if (FOLDER / "scores.csv").exists() - else pd.DataFrame( - { - "reform_id": [], - "dataset": [], - "year": [], - } - ) -) - - -@st.cache_data -def get_budget(dataset: str, year: int, reform_id: int = None) -> float: - 
from policyengine_us_data import EnhancedCPS_2024, CPS_2024, PUF_2024 - - dataset = {ds.name: ds for ds in [EnhancedCPS_2024, CPS_2024, PUF_2024]}[ - dataset - ] - - if reform_id is None: - reform = None - else: - reform = Reform.from_api(reform_id, "us") - - sim = Microsimulation(dataset=dataset, reform=reform) - tax_revenues = ( - sim.calculate( - "household_tax_before_refundable_credits", period=year - ).sum() - - sim.calculate("household_refundable_tax_credits", period=year).sum() - ) - benefit_spending = sim.calculate("household_benefits", period=year).sum() - govt_balance = tax_revenues - benefit_spending - - return govt_balance - - -@st.cache_data -def get_budgetary_impact(dataset: str, year: int, reform_id: int) -> float: - baseline = get_budget(dataset, year) - with_reform = get_budget(dataset, year, reform_id) - scores = ( - pd.read_csv(FOLDER / "scores.csv") - if (FOLDER / "scores.csv").exists() - else pd.DataFrame( - { - "reform_id": [], - "dataset": [], - "year": [], - "budgetary_impact": [], - } - ) - ) - - if not scores[scores.reform_id == reform_id][scores.dataset == dataset][ - scores.year == year - ].empty: - scores = scores.drop( - scores[scores.reform_id == reform_id][scores.dataset == dataset][ - scores.year == year - ].index - ) - scores = pd.concat( - [ - scores, - pd.DataFrame( - { - "reform_id": [reform_id], - "dataset": [dataset], - "year": [year], - "budgetary_impact": [ - round((with_reform - baseline) / 1e9, 1) - ], - } - ), - ] - ) - scores.to_csv(FOLDER / "scores.csv", index=False) - - -st.write( - "Use this page to compare the computed budgetary impacts of reforms by dataset." 
-) - -dataset = st.selectbox( - "Dataset", ["enhanced_cps_2024", "cps_2024", "puf_2024"] -) -num_years = st.slider("Number of years", 1, 11, 3) -reform_id = st.text_input("Reform ID", "1") -reform = Reform.from_api(reform_id, "us") -if reform is not None: - st.info(reform.name) - compute = st.button("Compute") - if compute: - for year in range(2024, 2024 + num_years): - get_budgetary_impact(dataset, year, reform_id) - -scores = ( - pd.read_csv(FOLDER / "scores.csv") - if (FOLDER / "scores.csv").exists() - else pd.DataFrame( - {"reform_id": [], "dataset": [], "year": [], "budgetary_impact": []} - ) -) -scores.year = scores.year.astype(int) -scores.reform_id = scores.reform_id.astype(int) - -# Convert to a table restricted to the given reform with a row for each dataset in scores.csv and a column for each year. - -scores_wide = ( - scores[scores.reform_id == int(reform_id)] - .pivot(index="dataset", columns="year", values="budgetary_impact") - .fillna(0) -) -st.dataframe(scores_wide, use_container_width=True) diff --git a/docs/utils.py b/docs/utils.py new file mode 100644 index 0000000..d61f829 --- /dev/null +++ b/docs/utils.py @@ -0,0 +1,7 @@ +import plotly.io as pio +from IPython.display import HTML + + +def show(fig): + html = pio.to_html(fig) + return HTML(html) diff --git a/docs/validation.ipynb b/docs/validation.ipynb new file mode 100644 index 0000000..e5f0972 --- /dev/null +++ b/docs/validation.ipynb @@ -0,0 +1,561 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Validation\n", + "\n", + "This page shows how different datasets (for 2024) perform at reproducing various official statistics when used with the PolicyEngine US microsimulation model.\n", + "\n", + "Note that the Enhanced CPS dataset is explicitly calibrated to these official statistics, so it is expected to perform well. Since these statistics are large in number and diverse, we expect this to improve the dataset's performance at predicting reform impacts." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "
\n", + "
\n", + "This is the init_notebook_mode cell from ITables v2.1.5
\n", + "(you should not see this message - is your notebook trusted?)\n", + "
\n", + "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "\n", + "\n", + "
nameactualestimate_cpsestimate_pufestimate_ecpsabs_rel_error_cpsabs_rel_error_pufabs_rel_error_ecpsecps_abs_rel_error_change_over_cpsecps_abs_rel_error_change_over_pufecps_abs_rel_error_change_over_prev_best
\n", + "\n", + "
\n", + "Loading ITables v2.1.5 from the init_notebook_mode cell...\n", + "(need help?)
\n", + "\n" + ], + "text/plain": [ + " name actual \\\n", + "0 irs/adjusted gross income/total/AGI in -inf-0/... -1.517672e+10 \n", + "1 irs/adjusted gross income/total/AGI in -inf-in... 1.641182e+13 \n", + "2 irs/adjusted gross income/total/AGI in 1-5k/ta... 5.335097e+08 \n", + "3 irs/adjusted gross income/total/AGI in 5k-10k/... 1.606361e+09 \n", + "4 irs/adjusted gross income/total/AGI in 10k-15k... 1.698206e+10 \n", + ".. ... ... \n", + "638 census/count_in_spm_threshold_decile_8 1.406893e+07 \n", + "639 census/agi_in_spm_threshold_decile_9 2.540801e+12 \n", + "640 census/count_in_spm_threshold_decile_9 1.406985e+07 \n", + "641 census/agi_in_spm_threshold_decile_10 2.856966e+12 \n", + "642 census/count_in_spm_threshold_decile_10 1.406946e+07 \n", + "\n", + " estimate_cps estimate_puf estimate_ecps abs_rel_error_cps \\\n", + "0 -2.957629e+06 -7.553493e+06 -4.571500e+07 0.999805 \n", + "1 1.376702e+13 1.368676e+13 1.628153e+13 0.161152 \n", + "2 2.547288e+05 1.145203e+08 6.778941e+07 0.999523 \n", + "3 0.000000e+00 6.417706e+08 1.014265e+09 1.000000 \n", + "4 4.979286e+09 6.875298e+09 1.594325e+10 0.706792 \n", + ".. ... ... ... ... \n", + "638 1.275242e+07 0.000000e+00 1.406233e+07 0.093576 \n", + "639 1.986276e+12 0.000000e+00 2.542540e+12 0.218248 \n", + "640 1.313452e+07 0.000000e+00 1.406882e+07 0.066478 \n", + "641 1.825988e+12 0.000000e+00 2.857821e+12 0.360865 \n", + "642 1.122834e+07 0.000000e+00 1.406499e+07 0.201935 \n", + "\n", + " abs_rel_error_puf abs_rel_error_ecps \\\n", + "0 0.999502 0.996988 \n", + "1 0.166042 0.007938 \n", + "2 0.785345 0.872937 \n", + "3 0.600482 0.368594 \n", + "4 0.595144 0.061171 \n", + ".. ... ... 
\n", + "638 1.000000 0.000469 \n", + "639 1.000000 0.000684 \n", + "640 1.000000 0.000074 \n", + "641 1.000000 0.000300 \n", + "642 1.000000 0.000317 \n", + "\n", + " ecps_abs_rel_error_change_over_cps ecps_abs_rel_error_change_over_puf \\\n", + "0 -0.002817 -0.002514 \n", + "1 -0.153213 -0.158104 \n", + "2 -0.126586 0.087592 \n", + "3 -0.631406 -0.231887 \n", + "4 -0.645620 -0.533972 \n", + ".. ... ... \n", + "638 -0.093107 -0.999531 \n", + "639 -0.217564 -0.999316 \n", + "640 -0.066404 -0.999926 \n", + "641 -0.360565 -0.999700 \n", + "642 -0.201618 -0.999683 \n", + "\n", + " ecps_abs_rel_error_change_over_prev_best \n", + "0 -0.002514 \n", + "1 -0.153213 \n", + "2 0.087592 \n", + "3 -0.231887 \n", + "4 -0.533972 \n", + ".. ... \n", + "638 -0.093107 \n", + "639 -0.217564 \n", + "640 -0.066404 \n", + "641 -0.360565 \n", + "642 -0.201618 \n", + "\n", + "[643 rows x 11 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from policyengine_us_data.datasets import CPS_2024, PUF_2024, EnhancedCPS_2024\n", + "from policyengine_us_data.utils import build_loss_matrix\n", + "from policyengine_us import Microsimulation\n", + "import pandas as pd\n", + "import plotly.express as px\n", + "import numpy as np\n", + "\n", + "\n", + "def compare_datasets():\n", + " comparison_combined = pd.DataFrame()\n", + " for dataset in [CPS_2024, PUF_2024, EnhancedCPS_2024]:\n", + " sim = Microsimulation(dataset=dataset)\n", + " weights = sim.calculate(\"household_weight\").values\n", + " loss_matrix, targets_array = build_loss_matrix(dataset, 2024)\n", + " target_names = loss_matrix.columns\n", + " estimates = weights @ loss_matrix.values\n", + " comparison = pd.DataFrame(\n", + " {\n", + " \"name\": target_names,\n", + " \"estimate\": estimates,\n", + " \"actual\": targets_array,\n", + " }\n", + " )\n", + " comparison[\"error\"] = comparison[\"estimate\"] - comparison[\"actual\"]\n", + " comparison[\"rel_error\"] = 
comparison[\"error\"] / comparison[\"actual\"]\n", + " comparison[\"abs_error\"] = comparison[\"error\"].abs()\n", + " comparison[\"abs_rel_error\"] = (\n", + " (comparison[\"abs_error\"] / comparison[\"actual\"].abs())\n", + " .replace([np.inf, -np.inf], np.nan)\n", + " .fillna(0)\n", + " )\n", + " comparison[\"dataset\"] = dataset.label\n", + " comparison_combined = pd.concat([comparison_combined, comparison])\n", + "\n", + " return comparison_combined\n", + "\n", + "df = compare_datasets()\n", + "\n", + "merged = pd.merge(\n", + " df[df.dataset == \"CPS 2024 (2022-based)\"],\n", + " df[df.dataset == \"Enhanced CPS 2024\"],\n", + " on=[\"name\"],\n", + " suffixes=(\"_cps\", \"_ecps\"),\n", + ")\n", + "merged = pd.merge(\n", + " merged,\n", + " df[df.dataset == \"PUF 2024 (2015-based)\"].rename(columns={col: col + \"_puf\" for col in df.columns if col != \"name\"}),\n", + " on=[\"name\"],\n", + ")\n", + "import pandas as pd\n", + "from itables import init_notebook_mode\n", + "import itables.options as opt\n", + "init_notebook_mode(all_interactive=True)\n", + "opt.maxBytes = \"1MB\"\n", + "# Set max cols to inf\n", + "merged[\"ecps_abs_rel_error_change_over_cps\"] = merged[\"abs_rel_error_ecps\"] - merged[\"abs_rel_error_cps\"]\n", + "merged[\"ecps_abs_rel_error_change_over_puf\"] = merged[\"abs_rel_error_ecps\"] - merged[\"abs_rel_error_puf\"]\n", + "merged[\"ecps_abs_rel_error_change_over_prev_best\"] = merged[\"abs_rel_error_ecps\"] - np.minimum(merged[\"abs_rel_error_cps\"], merged[\"abs_rel_error_puf\"])\n", + "pd.set_option('display.max_columns', None)\n", + "merged.rename(columns={\n", + " \"actual_cps\": \"actual\"\n", + "})[[\n", + " \"name\", \"actual\",\n", + " \"estimate_cps\",\n", + " \"estimate_puf\",\n", + " \"estimate_ecps\",\n", + " \"abs_rel_error_cps\",\n", + " \"abs_rel_error_puf\",\n", + " \"abs_rel_error_ecps\",\n", + " \"ecps_abs_rel_error_change_over_cps\",\n", + " \"ecps_abs_rel_error_change_over_puf\",\n", + " 
\"ecps_abs_rel_error_change_over_prev_best\",\n", + "]]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "Overall, the ECPS outperforms the Census' CPS in **88.0%** of the targets and the IRS' PUF in **86.9%** of the targets." + ], + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from IPython.display import Markdown\n", + "\n", + "ecps_outperforms_puf = merged[\"ecps_abs_rel_error_change_over_puf\"] < 0\n", + "ecps_outperforms_cps = merged[\"ecps_abs_rel_error_change_over_cps\"] < 0\n", + "\n", + "Markdown(f\"Overall, the ECPS outperforms the Census' CPS in **{ecps_outperforms_cps.mean():.1%}** of the targets and the IRS' PUF in **{ecps_outperforms_puf.mean():.1%}** of the targets.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The below histogram shows the distribution of 'relative error change under the ECPS', comparing each metric's ECPS performance to the best of either the CPS or the PUF." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from utils import show\n", + "import plotly.express as px\n", + "\n", + "clipped = merged[\"ecps_abs_rel_error_change_over_prev_best\"].clip(-1, 1)\n", + "\n", + "fig = px.histogram(clipped, nbins=100, title=\"ECPS improvement over best of CPS and PUF\").update_layout(\n", + " xaxis_range=[-1, 1],\n", + " showlegend=False,\n", + " xaxis_title=\"Absolute relative error change\",\n", + " xaxis_tickformat=\".0%\",\n", + " yaxis_title=\"Count\",\n", + ")\n", + "show(fig)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/policyengine_us_data/datasets/cps/census_cps.py b/policyengine_us_data/datasets/cps/census_cps.py index 8fcac9e..aa650ee 100644 --- a/policyengine_us_data/datasets/cps/census_cps.py +++ b/policyengine_us_data/datasets/cps/census_cps.py @@ -4,7 +4,7 @@ from io import BytesIO from zipfile import ZipFile import pandas as pd -from policyengine_us_data.data_storage import STORAGE_FOLDER +from policyengine_us_data.storage import STORAGE_FOLDER class CensusCPS(Dataset): diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index d212851..8a4494f 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -1,5 +1,5 @@ from policyengine_core.data import Dataset -from policyengine_us_data.data_storage import STORAGE_FOLDER +from policyengine_us_data.storage import STORAGE_FOLDER import h5py from policyengine_us_data.datasets.cps.census_cps 
import * from pandas import DataFrame, Series diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index d738f03..993a9c5 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -8,13 +8,14 @@ ) import numpy as np from typing import Type -from policyengine_us_data.data_storage import STORAGE_FOLDER +from policyengine_us_data.storage import STORAGE_FOLDER from policyengine_us_data.datasets.cps.extended_cps import ( ExtendedCPS_2024, CPS_2019, CPS_2024, ) import torch +import os def reweight( @@ -51,7 +52,9 @@ def loss(weights): optimizer = torch.optim.Adam([weights], lr=1e-2) from tqdm import trange - iterator = trange(10_000) + iterator = ( + trange(10_000) if not os.environ.get("TEST_LITE") else trange(100) + ) for i in iterator: optimizer.zero_grad() l, worst_name, worst_val = loss(torch.exp(weights)) diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 32608db..970fd6b 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -1,9 +1,10 @@ from policyengine_core.data import Dataset -from policyengine_us_data.data_storage import STORAGE_FOLDER +from policyengine_us_data.storage import STORAGE_FOLDER from typing import Type from .cps import * from ..puf import * import pandas as pd +import os # These are sorted by magnitude. # First 15 contain 90%. 
@@ -68,6 +69,9 @@ "recapture_of_investment_credit", ] +if os.environ.get("TEST_LITE"): + IMPUTED_VARIABLES = IMPUTED_VARIABLES[:7] + class ExtendedCPS(Dataset): cps: Type[CPS] diff --git a/policyengine_us_data/datasets/puf/irs_puf.py b/policyengine_us_data/datasets/puf/irs_puf.py index ab16abb..a2c2809 100644 --- a/policyengine_us_data/datasets/puf/irs_puf.py +++ b/policyengine_us_data/datasets/puf/irs_puf.py @@ -1,5 +1,5 @@ from policyengine_core.data import Dataset -from policyengine_us_data.data_storage import STORAGE_FOLDER +from policyengine_us_data.storage import STORAGE_FOLDER from pathlib import Path diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py index 9ed7021..f0148bd 100644 --- a/policyengine_us_data/datasets/puf/puf.py +++ b/policyengine_us_data/datasets/puf/puf.py @@ -3,7 +3,7 @@ import pandas as pd from microdf import MicroDataFrame from policyengine_core.data import Dataset -from policyengine_us_data.data_storage import STORAGE_FOLDER +from policyengine_us_data.storage import STORAGE_FOLDER from .uprate_puf import uprate_puf from survey_enhance import Imputation from .irs_puf import IRS_PUF_2015 diff --git a/policyengine_us_data/datasets/puf/uprate_puf.py b/policyengine_us_data/datasets/puf/uprate_puf.py index 2edb172..a5c2b8d 100644 --- a/policyengine_us_data/datasets/puf/uprate_puf.py +++ b/policyengine_us_data/datasets/puf/uprate_puf.py @@ -1,6 +1,6 @@ import pandas as pd import numpy as np -from policyengine_us_data.data_storage import STORAGE_FOLDER +from policyengine_us_data.storage import STORAGE_FOLDER ITMDED_GROW_RATE = 0.02 # annual growth rate in itemized deduction amounts diff --git a/policyengine_us_data/evaluation/__init__.py b/policyengine_us_data/evaluation/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/policyengine_us_data/evaluation/loss.py b/policyengine_us_data/evaluation/loss.py deleted file mode 100644 index f0da3f8..0000000 --- 
a/policyengine_us_data/evaluation/loss.py +++ /dev/null @@ -1,10 +0,0 @@ -import numpy as np -import pandas as pd - - -def create_statistical_target_matrix() -> np.array: - pass - - -def create_statistical_targets() -> pd.DataFrame: - pass diff --git a/policyengine_us_data/evaluation/report.py b/policyengine_us_data/evaluation/report.py deleted file mode 100644 index 02b2589..0000000 --- a/policyengine_us_data/evaluation/report.py +++ /dev/null @@ -1,42 +0,0 @@ -from policyengine_us_data.data_storage import STORAGE_FOLDER -import argparse - - -def create_report(): - from policyengine_us import Microsimulation - from policyengine_us_data import CPS_2024 - import pandas as pd - - sim = Microsimulation(dataset=CPS_2024) - - START_YEAR = 2024 - BUDGET_WINDOW = 10 - - hnet_totals = [] - years = [] - for year in range(START_YEAR, START_YEAR + BUDGET_WINDOW): - hnet_totals.append( - round(sim.calculate("household_net_income", year).sum() / 1e9, 1) - ) - years.append(year) - - df = pd.DataFrame( - {"Year": years, "Household net income": hnet_totals} - ).set_index("Year", drop=True) - - report = f"""# Economy summary - -## Household net income -{df.T.to_markdown(index=False)} -""" - - return report - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--output", type=str, default="report.md") - args = parser.parse_args() - report = create_report() - with open(STORAGE_FOLDER / args.output, "w") as f: - f.write(report) diff --git a/policyengine_us_data/data_storage/__init__.py b/policyengine_us_data/storage/__init__.py similarity index 100% rename from policyengine_us_data/data_storage/__init__.py rename to policyengine_us_data/storage/__init__.py diff --git a/policyengine_us_data/data_storage/download_private_prerequisites.py b/policyengine_us_data/storage/download_private_prerequisites.py similarity index 100% rename from policyengine_us_data/data_storage/download_private_prerequisites.py rename to 
policyengine_us_data/storage/download_private_prerequisites.py diff --git a/policyengine_us_data/data_storage/download_public_prerequisites.py b/policyengine_us_data/storage/download_public_prerequisites.py similarity index 100% rename from policyengine_us_data/data_storage/download_public_prerequisites.py rename to policyengine_us_data/storage/download_public_prerequisites.py diff --git a/policyengine_us_data/data_storage/healthcare_spending.csv b/policyengine_us_data/storage/healthcare_spending.csv similarity index 100% rename from policyengine_us_data/data_storage/healthcare_spending.csv rename to policyengine_us_data/storage/healthcare_spending.csv diff --git a/policyengine_us_data/data_storage/spm_threshold_agi.csv b/policyengine_us_data/storage/spm_threshold_agi.csv similarity index 100% rename from policyengine_us_data/data_storage/spm_threshold_agi.csv rename to policyengine_us_data/storage/spm_threshold_agi.csv diff --git a/policyengine_us_data/data_storage/upload_completed_datasets.py b/policyengine_us_data/storage/upload_completed_datasets.py similarity index 100% rename from policyengine_us_data/data_storage/upload_completed_datasets.py rename to policyengine_us_data/storage/upload_completed_datasets.py diff --git a/policyengine_us_data/data_storage/uprating_factors.csv b/policyengine_us_data/storage/uprating_factors.csv similarity index 100% rename from policyengine_us_data/data_storage/uprating_factors.csv rename to policyengine_us_data/storage/uprating_factors.csv diff --git a/policyengine_us_data/data_storage/uprating_growth_factors.csv b/policyengine_us_data/storage/uprating_growth_factors.csv similarity index 100% rename from policyengine_us_data/data_storage/uprating_growth_factors.csv rename to policyengine_us_data/storage/uprating_growth_factors.csv diff --git a/policyengine_us_data/utils/docs_prerequisites_download.py b/policyengine_us_data/utils/docs_prerequisites_download.py deleted file mode 100644 index 0307f98..0000000 --- 
a/policyengine_us_data/utils/docs_prerequisites_download.py +++ /dev/null @@ -1,41 +0,0 @@ -from policyengine_us_data.utils.github import download -from policyengine_us_data.data_storage import STORAGE_FOLDER - -PREREQUISITES = [ - { - "repo": "policyengine-us-data", - "file_name": "enhanced_cps_2024.h5", - }, - { - "repo": "policyengine-us-data", - "file_name": "cps_2024.h5", - }, - { - "repo": "irs-soi-puf", - "file_name": "puf_2024.h5", - }, - { - "repo": "policyengine-us-data", - "file_name": "soi.csv", - }, - { - "repo": "policyengine-us-data", - "file_name": "np2023_d5_mid.csv", - }, - { - "repo": "policyengine-us-data", - "file_name": "soi.csv", - }, -] - - -def download_data(): - for prerequisite in PREREQUISITES: - if not (STORAGE_FOLDER / prerequisite["file_name"]).exists(): - download( - "PolicyEngine", - prerequisite["repo"], - "release", - prerequisite["file_name"], - STORAGE_FOLDER / prerequisite["file_name"], - ) diff --git a/policyengine_us_data/utils/github.py b/policyengine_us_data/utils/github.py index 48abfb8..007a8d6 100644 --- a/policyengine_us_data/utils/github.py +++ b/policyengine_us_data/utils/github.py @@ -143,14 +143,10 @@ def upload( # in case our subsequent delete-upload fails print( - f"Asset {file_name} already exists in release {release_tag}. Downloading a backup..." + f"Asset {file_name} already exists in release {release_tag}. Skipping." 
) - download(org, repo, release_tag, file_name, temp_file_path) - - # Now, delete the asset from the release - print(f"Deleting asset {file_name} from release {release_tag}...") - delete_asset(org, repo, asset_id) + return # Now, upload the asset print(f"Uploading {file_name} to release {release_tag}...") diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index 4b5ea04..f068011 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -1,7 +1,7 @@ import pandas as pd from .soi import pe_to_soi, get_soi import numpy as np -from policyengine_us_data.data_storage import STORAGE_FOLDER +from policyengine_us_data.storage import STORAGE_FOLDER def fmt(x): @@ -61,10 +61,6 @@ def build_loss_matrix(dataset: type, time_period): ] soi_subset = soi_subset[ soi_subset.Variable.isin(agi_level_targeted_variables) - & ( - (soi_subset["AGI lower bound"] != -np.inf) - | (soi_subset["AGI upper bound"] != np.inf) - ) | ( soi_subset.Variable.isin(aggregate_level_targeted_variables) & (soi_subset["AGI lower bound"] == -np.inf) diff --git a/policyengine_us_data/utils/soi.py b/policyengine_us_data/utils/soi.py index af52638..9d53a14 100644 --- a/policyengine_us_data/utils/soi.py +++ b/policyengine_us_data/utils/soi.py @@ -1,7 +1,7 @@ import pandas as pd import numpy as np from .uprating import create_policyengine_uprating_factors_table -from policyengine_us_data.data_storage import STORAGE_FOLDER +from policyengine_us_data.storage import STORAGE_FOLDER def pe_to_soi(pe_dataset, year): diff --git a/policyengine_us_data/utils/uprating.py b/policyengine_us_data/utils/uprating.py index 05fd6a2..6dd2f89 100644 --- a/policyengine_us_data/utils/uprating.py +++ b/policyengine_us_data/utils/uprating.py @@ -1,4 +1,4 @@ -from policyengine_us_data.data_storage import STORAGE_FOLDER +from policyengine_us_data.storage import STORAGE_FOLDER import pandas as pd START_YEAR = 2020 diff --git a/pyproject.toml b/pyproject.toml index 
6bc4e29..65122a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "policyengine_us_data" -version = "1.3.0" +version = "1.3.1" description = "A package to create representative microdata for the US." readme = "README.md" authors = [ @@ -16,8 +16,6 @@ dependencies = [ "policyengine_core", "requests", "tqdm", - "build", - "tomli" ] [project.optional-dependencies] @@ -30,6 +28,11 @@ dev = [ "torch", "tables", "tabulate", + "furo", + "jupyter-book", + "yaml-changelog>=0.1.7", + "build", + "tomli", ] [tool.setuptools]