diff --git a/.github/review_pull_request.py b/.github/review_pull_request.py deleted file mode 100644 index a646b40..0000000 --- a/.github/review_pull_request.py +++ /dev/null @@ -1,21 +0,0 @@ -import pandas as pd -from policyengine_us_data.data_storage import STORAGE_FOLDER -from IPython.display import Markdown - -def main(): - df = pd.read_csv(STORAGE_FOLDER / "evaluation.csv") - - most_recent_rows = df[df.Date == df.Date.max()].sort_values(["Variable", "Time period"]).set_index(["Variable", "Time period"]).Total - second_most_recent_rows = df[df.Date == df.Date.sort_values().unique()[-2]].reset_index(drop=True).sort_values(["Variable", "Time period"]).set_index(["Variable", "Time period"]).Total - - diff = (most_recent_rows - second_most_recent_rows) - # Convert to df - diff = diff.reset_index() - table = diff.to_markdown(index=False) - - review_text = f"""## National projection changes\n\n{table}""" - - print(review_text) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/.github/upload_evaluation.py b/.github/upload_evaluation.py deleted file mode 100644 index 851ff65..0000000 --- a/.github/upload_evaluation.py +++ /dev/null @@ -1,13 +0,0 @@ -from policyengine_us_data.evaluation.summary import main -from policyengine_us_data.utils.github import * -from policyengine_us_data.data_storage import STORAGE_FOLDER - -if __name__ == "__main__": - main() - upload( - "policyengine", - "policyengine-us-data", - "release", - "evaluation.csv", - STORAGE_FOLDER / "evaluation.csv", - ) diff --git a/.github/workflows/pull_request.yaml b/.github/workflows/pull_request.yaml index b6b910f..7050058 100644 --- a/.github/workflows/pull_request.yaml +++ b/.github/workflows/pull_request.yaml @@ -7,26 +7,31 @@ on: jobs: build: - name: Build and test + name: Test runs-on: ubuntu-latest - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - steps: - name: Checkout code uses: actions/checkout@v2 - - name: Set up Python - uses: actions/setup-python@v2 - - - 
name: Install dependencies + uses: actions/setup-python@v5 + with: + python-version: 3.12 + - name: Install package run: make install - + - name: Download data inputs + run: make download + env: + POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }} + - name: Build datasets + run: make data - name: Run tests run: make test - - - name: Run evaluation - run: make evaluate - - - name: Add review comment - run: python .github/review_pull_request.py \ No newline at end of file + lint: + runs-on: ubuntu-latest + name: Lint + steps: + - uses: actions/checkout@v4 + - name: Check formatting + uses: "lgeiger/black-action@master" + with: + args: ". -l 79 --check" diff --git a/.github/workflows/push.yaml b/.github/workflows/push.yaml index a7332c2..06e761b 100644 --- a/.github/workflows/push.yaml +++ b/.github/workflows/push.yaml @@ -7,26 +7,56 @@ on: jobs: build: - name: Build and test + name: Test runs-on: ubuntu-latest - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - steps: - name: Checkout code uses: actions/checkout@v2 - - name: Set up Python - uses: actions/setup-python@v2 - - - name: Install dependencies + uses: actions/setup-python@v5 + with: + python-version: 3.12 + - name: Install package run: make install - + - name: Download data inputs + run: make download + env: + POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }} + - name: Build datasets + run: make data - name: Run tests run: make test - - - name: Run evaluation - run: make evaluate - - - name: Upload evaluation - run: python .github/upload_evaluation.py \ No newline at end of file + - name: Upload completed datasets + run: make upload + env: + POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }} + lint: + runs-on: ubuntu-latest + name: Lint + steps: + - uses: actions/checkout@v4 + - name: Check formatting + uses: "lgeiger/black-action@master" + with: + args: ". 
-l 79 --check" + publish: + runs-on: ubuntu-latest + name: Publish + steps: + - name: Checkout code + uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.12 + - name: Install package + run: make install + - name: Build package + run: make build + - name: Publish a Python distribution to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.PYPI }} + skip-existing: true + diff --git a/.gitignore b/.gitignore index 8f5d40a..4f04c4f 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,5 @@ **/*.h5 *.ipynb **/*.csv +!uprating_factors.csv +!uprating_growth_factors.csv diff --git a/Dockerfile b/Dockerfile index 93a2419..f3dfb18 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,5 +2,4 @@ FROM python:latest COPY . . # Install RUN make install -# Run tests -CMD ["make", "test"] \ No newline at end of file +RUN ["make", "data"] diff --git a/Makefile b/Makefile index 952913d..2dc2ed1 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,5 @@ +all: data test + format: black . -l 79 @@ -7,8 +9,24 @@ test: install: pip install -e .[dev] +download: + python policyengine_us_data/data_storage/download_public_prerequisites.py + python policyengine_us_data/data_storage/download_private_prerequisites.py + +upload: + python policyengine_us_data/data_storage/upload_completed_datasets.py + docker: docker buildx build --platform linux/amd64 . 
-t policyengine-us-data:latest -evaluate: - python policyengine_us_data/evaluation/summary.py \ No newline at end of file +documentation: + streamlit run docs/Home.py + +data: + python policyengine_us_data/datasets/cps/enhanced_cps.py + +clean: + rm policyengine_us_data/data_storage/puf_2015.csv + rm policyengine_us_data/data_storage/demographics_2015.csv +build: + python setup.py sdist bdist_wheel \ No newline at end of file diff --git a/docs/Dockerfile b/docs/Dockerfile new file mode 100644 index 0000000..796f72b --- /dev/null +++ b/docs/Dockerfile @@ -0,0 +1,8 @@ +FROM python:latest +COPY . . +# Install +RUN make download +RUN make install +RUN python docs/download.py +EXPOSE 8080 +ENTRYPOINT ["streamlit", "run", "docs/Home.py", "--server.port=8080", "--server.address=0.0.0.0"] \ No newline at end of file diff --git a/docs/Home.py b/docs/Home.py new file mode 100644 index 0000000..3eb34da --- /dev/null +++ b/docs/Home.py @@ -0,0 +1,37 @@ +import streamlit as st + +st.title("PolicyEngine-US-Data") + +st.write( + """PolicyEngine-US-Data is a package to create representative microdata for the US, designed for input in the PolicyEngine tax-benefit microsimulation model.""" +) + +st.subheader("What does this repo do?") + +st.write( + """Principally, this package creates a (partly synthetic) dataset of households (with incomes, demographics and more) that describes the U.S. household sector. This dataset synthesises multiple sources of data (the Current Population Survey, the IRS Public Use File, and administrative statistics) to improve upon the accuracy of **any** of them.""" +) + +st.subheader("What does this dataset look like?") + +st.write( + "The below table shows an extract of the person records in one household in the dataset." 
+) + + +@st.cache_data +def sample_household(): + import pandas as pd + from policyengine_us_data.datasets import EnhancedCPS_2024 + from policyengine_us import Microsimulation + + df = Microsimulation(dataset=EnhancedCPS_2024).to_input_dataframe() + + household_id = df.person_household_id__2024.values[10] + people_in_household = df[df.person_household_id__2024 == household_id] + return people_in_household + + +people_in_household = sample_household() + +st.dataframe(people_in_household.T, use_container_width=True) diff --git a/docs/download.py b/docs/download.py new file mode 100644 index 0000000..5de2582 --- /dev/null +++ b/docs/download.py @@ -0,0 +1,26 @@ +from policyengine_us_data.utils.github import download +from policyengine_us_data.data_storage import STORAGE_FOLDER + +download( + "PolicyEngine", + "policyengine-us-data", + "release", + "enhanced_cps_2024.h5", + STORAGE_FOLDER / "enhanced_cps_2024.h5", +) + +download( + "PolicyEngine", + "policyengine-us-data", + "release", + "cps_2024.h5", + STORAGE_FOLDER / "cps_2024.h5", +) + +download( + "PolicyEngine", + "irs-soi-puf", + "release", + "puf_2024.h5", + STORAGE_FOLDER / "puf_2024.h5", +) diff --git a/docs/pages/Aggregates.py b/docs/pages/Aggregates.py new file mode 100644 index 0000000..2070084 --- /dev/null +++ b/docs/pages/Aggregates.py @@ -0,0 +1,43 @@ +import streamlit as st + +st.title("Aggregates") + +st.write( + """The table below shows the totals for calendar year 2024 for the Enhanced CPS dataset variables.""" +) + + +@st.cache_data +def sample_household(): + from policyengine_us import Microsimulation + from policyengine_us_data import EnhancedCPS_2024 + from policyengine_us_data.datasets.cps.extended_cps import ( + IMPUTED_VARIABLES as FINANCE_VARIABLES, + ) + import pandas as pd + + sim = Microsimulation(dataset=EnhancedCPS_2024) + + df = ( + pd.DataFrame( + { + "Variable": FINANCE_VARIABLES, + "Total ($bn)": [ + round( + sim.calculate(variable, map_to="household").sum() + / 1e9, + 1, + ) + 
for variable in FINANCE_VARIABLES + ], + } + ) + .sort_values("Total ($bn)", ascending=False) + .set_index("Variable") + ) + return df + + +df = sample_household() + +st.dataframe(df, use_container_width=True) diff --git a/docs/pages/Benchmarks.py b/docs/pages/Benchmarks.py new file mode 100644 index 0000000..4422ec6 --- /dev/null +++ b/docs/pages/Benchmarks.py @@ -0,0 +1,94 @@ +import streamlit as st + +st.title("Benchmarks") + +from policyengine_us_data.datasets import CPS_2024, PUF_2024, EnhancedCPS_2024 +from policyengine_us_data.utils import build_loss_matrix +from policyengine_us import Microsimulation +import pandas as pd +import plotly.express as px +import numpy as np + + +@st.cache_data +def compare_datasets(): + comparison_combined = pd.DataFrame() + for dataset in [CPS_2024, PUF_2024, EnhancedCPS_2024]: + sim = Microsimulation(dataset=dataset) + weights = sim.calculate("household_weight").values + loss_matrix, targets_array = build_loss_matrix(dataset, 2024) + target_names = loss_matrix.columns + estimates = weights @ loss_matrix.values + comparison = pd.DataFrame( + { + "Target": target_names, + "Estimate": estimates, + "Actual": targets_array, + } + ) + comparison["Error"] = comparison["Estimate"] - comparison["Actual"] + comparison["Abs. Error"] = comparison["Error"].abs() + comparison["Abs. Error %"] = ( + (comparison["Abs. Error"] / comparison["Actual"].abs()) + .replace([np.inf, -np.inf], np.nan) + .fillna(1) + ) + comparison["Dataset"] = dataset.label + comparison_combined = pd.concat([comparison_combined, comparison]) + + return comparison_combined + + +df = compare_datasets() + +mean_relative_error_by_dataset = ( + df.groupby("Dataset")["Abs. Error %"] + .mean() + .sort_values(ascending=False) + .apply(lambda x: round(x, 3)) +) + +st.write( + f"PolicyEngine uses **{len(df.Target.unique())}** targets for calibration in the Enhanced CPS. This page compares the estimates and errors for these targets across the three datasets." 
+) + +st.dataframe(mean_relative_error_by_dataset, use_container_width=True) + +metric = st.selectbox( + "Metric", ["Estimate", "Error", "Abs. Error", "Abs. Error %"] +) +target = st.selectbox("Target", df["Target"].unique()) + +fig = px.bar( + df[df["Target"] == target], + x="Dataset", + y=metric, + title=f"{metric} for {target}", +) + +if metric == "Estimate": + # Add a dashed line at the target + fig.add_shape( + type="line", + x0=-0.5, + x1=2.5, + y0=df.loc[df["Target"] == target, "Actual"].values[0], + y1=df.loc[df["Target"] == target, "Actual"].values[0], + line=dict(dash="dash"), + ) + +st.subheader("Dataset comparisons") +st.write( + "The chart below, for a selected target and metric, shows the estimates and errors for each dataset." +) + +st.plotly_chart(fig, use_container_width=True) + +ecps_df = df[df["Dataset"] == "Enhanced CPS 2024"] + +st.subheader("Enhanced CPS 2024") +st.write( + "The table below shows the error for each target in the Enhanced CPS 2024 dataset." +) + +st.dataframe(ecps_df, use_container_width=True) diff --git a/docs/pages/Distributions.py b/docs/pages/Distributions.py new file mode 100644 index 0000000..427bf8b --- /dev/null +++ b/docs/pages/Distributions.py @@ -0,0 +1,76 @@ +import streamlit as st + +st.title("Distributions") + +st.write( + "This page has several visualisations of the distributions of different variables in the PE-compatible datasets." +) + +from policyengine_us_data import CPS_2024, EnhancedCPS_2024, PUF_2024 +from policyengine_us_data.utils.soi import ( + pe_to_soi, + get_soi, + compare_soi_replication_to_soi, +) +from policyengine_us_data.utils.loss import fmt +import pandas as pd +import plotly.express as px +import numpy as np + +st.subheader("IRS SOI") + +st.write( + "Use these controls to see how the different PE-compatible datasets compare to (extrapolated from 2021 to 2024) IRS SOI data." 
+)
+
+
+@st.cache_data
+def _get_soi(year):
+    return get_soi(year)
+
+
+soi = _get_soi(2024)
+
+
+@st.cache_data
+def get_soi_replication(dataset, year):
+    df = compare_soi_replication_to_soi(pe_to_soi(dataset, year), soi)
+    return df
+
+
+variable = st.selectbox("Variable", soi.Variable.unique())
+filing_status = st.selectbox("Filing status", soi["Filing status"].unique())
+taxable = False
+count = st.checkbox("Count")
+
+
+def get_bar_chart(variable, filing_status, taxable, count):
+    df = soi[soi.Variable == variable].copy()  # detach from soi
+    df["Dataset"] = "SOI"
+    for dataset in [EnhancedCPS_2024, PUF_2024, CPS_2024]:
+        replication = get_soi_replication(dataset, 2024)
+        replication["Dataset"] = dataset.label
+        df = pd.concat([df, replication[replication.Variable == variable]])
+
+    df = df[df["Filing status"] == filing_status]
+    df = df[df["Taxable only"] == taxable]
+    df = df[df["Count"] == count]
+    df = df[
+        ~(
+            (df["AGI lower bound"] == -np.inf)
+            & (df["AGI upper bound"] == np.inf)
+        )
+    ]
+
+    df["AGI lower bound"] = df["AGI lower bound"].apply(fmt)
+
+    return px.bar(
+        df,
+        x="AGI lower bound",
+        y="Value",
+        color="Dataset",
+        barmode="group",
+    )
+
+
+st.plotly_chart(get_bar_chart(variable, filing_status, taxable, count))
diff --git a/docs/pages/Reforms.py b/docs/pages/Reforms.py
new file mode 100644
index 0000000..640aff9
--- /dev/null
+++ b/docs/pages/Reforms.py
@@ -0,0 +1,127 @@
+import streamlit as st
+
+st.title("Reforms")
+
+from policyengine_us import Microsimulation
+from policyengine_core.reforms import Reform
+from pathlib import Path
+import pandas as pd
+
+FOLDER = Path(__file__).parent
+scores = (
+    pd.read_csv(FOLDER / "scores.csv")
+    if (FOLDER / "scores.csv").exists()
+    else pd.DataFrame(
+        {
+            "reform_id": [],
+            "dataset": [],
+            "year": [],
+        }
+    )
+)
+
+
+@st.cache_data
+def get_budget(dataset: str, year: int, reform_id: int = None) -> float:
+    from policyengine_us_data import EnhancedCPS_2024, CPS_2024, PUF_2024
+
+    dataset = {ds.name: ds for ds in [EnhancedCPS_2024, CPS_2024, PUF_2024]}[
+        dataset
+    ]
+
+    if reform_id is None:
+        reform = None
+    else:
+        reform = Reform.from_api(reform_id, "us")
+
+    sim = Microsimulation(dataset=dataset, reform=reform)
+    tax_revenues = (
+        sim.calculate(
+            "household_tax_before_refundable_credits", period=year
+        ).sum()
+        - sim.calculate("household_refundable_tax_credits", period=year).sum()
+    )
+    benefit_spending = sim.calculate("household_benefits", period=year).sum()
+    govt_balance = tax_revenues - benefit_spending
+
+    return govt_balance
+
+
+@st.cache_data
+def get_budgetary_impact(dataset: str, year: int, reform_id: int) -> float:
+    baseline = get_budget(dataset, year)
+    with_reform = get_budget(dataset, year, reform_id)
+    scores = (
+        pd.read_csv(FOLDER / "scores.csv")
+        if (FOLDER / "scores.csv").exists()
+        else pd.DataFrame(
+            {
+                "reform_id": [],
+                "dataset": [],
+                "year": [],
+                "budgetary_impact": [],
+            }
+        )
+    )
+
+    # Replace any earlier score for this reform/dataset/year; reform_id
+    # may arrive as text from the UI, so compare it as an int.
+    stale = (
+        (scores.reform_id == int(reform_id))
+        & (scores.dataset == dataset)
+        & (scores.year == year)
+    )
+    scores = scores[~stale]
+    scores = pd.concat(
+        [
+            scores,
+            pd.DataFrame(
+                {
+                    "reform_id": [reform_id],
+                    "dataset": [dataset],
+                    "year": [year],
+                    "budgetary_impact": [
+                        round((with_reform - baseline) / 1e9, 1)
+                    ],
+                }
+            ),
+        ]
+    )
+    scores.to_csv(FOLDER / "scores.csv", index=False)
+
+
+st.write(
+    "Use this page to compare the computed budgetary impacts of reforms by dataset."
+) + +dataset = st.selectbox( + "Dataset", ["enhanced_cps_2024", "cps_2024", "puf_2024"] +) +num_years = st.slider("Number of years", 1, 11, 3) +reform_id = st.text_input("Reform ID", "1") +reform = Reform.from_api(reform_id, "us") +if reform is not None: + st.info(reform.name) + compute = st.button("Compute") + if compute: + for year in range(2024, 2024 + num_years): + get_budgetary_impact(dataset, year, reform_id) + +scores = ( + pd.read_csv(FOLDER / "scores.csv") + if (FOLDER / "scores.csv").exists() + else pd.DataFrame( + {"reform_id": [], "dataset": [], "year": [], "budgetary_impact": []} + ) +) +scores.year = scores.year.astype(int) +scores.reform_id = scores.reform_id.astype(int) + +# Convert to a table restricted to the given reform with a row for each dataset in scores.csv and a column for each year. + +scores_wide = ( + scores[scores.reform_id == int(reform_id)] + .pivot(index="dataset", columns="year", values="budgetary_impact") + .fillna(0) +) +st.dataframe(scores_wide, use_container_width=True) diff --git a/policyengine_us_data/__init__.py b/policyengine_us_data/__init__.py index e69de29..975d883 100644 --- a/policyengine_us_data/__init__.py +++ b/policyengine_us_data/__init__.py @@ -0,0 +1 @@ +from .datasets import * diff --git a/policyengine_us_data/data_storage/download_private_prerequisites.py b/policyengine_us_data/data_storage/download_private_prerequisites.py new file mode 100644 index 0000000..eb073c0 --- /dev/null +++ b/policyengine_us_data/data_storage/download_private_prerequisites.py @@ -0,0 +1,19 @@ +from policyengine_us_data.utils.github import download +from pathlib import Path + +FOLDER = Path(__file__).parent + +download( + "PolicyEngine", + "irs-soi-puf", + "release", + "puf_2015.csv", + FOLDER / "puf_2015.csv", +) +download( + "PolicyEngine", + "irs-soi-puf", + "release", + "demographics_2015.csv", + FOLDER / "demographics_2015.csv", +) diff --git a/policyengine_us_data/data_storage/download_public_prerequisites.py 
b/policyengine_us_data/data_storage/download_public_prerequisites.py
new file mode 100644
index 0000000..585ca53
--- /dev/null
+++ b/policyengine_us_data/data_storage/download_public_prerequisites.py
@@ -0,0 +1,19 @@
+from policyengine_us_data.utils.github import download
+from pathlib import Path
+
+FOLDER = Path(__file__).parent
+
+download(
+    "PolicyEngine",
+    "policyengine-us-data",
+    "release",
+    "soi.csv",
+    FOLDER / "soi.csv",
+)
+download(
+    "PolicyEngine",
+    "policyengine-us-data",
+    "release",
+    "np2023_d5_mid.csv",
+    FOLDER / "np2023_d5_mid.csv",
+)
diff --git a/policyengine_us_data/data_storage/upload_completed_datasets.py b/policyengine_us_data/data_storage/upload_completed_datasets.py
new file mode 100644
index 0000000..335a99f
--- /dev/null
+++ b/policyengine_us_data/data_storage/upload_completed_datasets.py
@@ -0,0 +1,28 @@
+from policyengine_us_data.utils.github import upload
+from pathlib import Path
+
+FOLDER = Path(__file__).parent
+
+upload(
+    "PolicyEngine",
+    "policyengine-us-data",
+    "release",
+    "enhanced_cps_2024.h5",
+    FOLDER / "enhanced_cps_2024.h5",
+)
+
+upload(
+    "PolicyEngine",
+    "policyengine-us-data",
+    "release",
+    "cps_2024.h5",
+    FOLDER / "cps_2024.h5",
+)
+
+upload(
+    "PolicyEngine",
+    "irs-soi-puf",
+    "release",
+    "puf_2024.h5",
+    FOLDER / "puf_2024.h5",  # local file must match the asset name
+)
diff --git a/policyengine_us_data/data_storage/uprating_factors.csv b/policyengine_us_data/data_storage/uprating_factors.csv
new file mode 100644
index 0000000..9462adc
--- /dev/null
+++ b/policyengine_us_data/data_storage/uprating_factors.csv
@@ -0,0 +1,112 @@
+Variable,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034
+alimony_expense,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718
+alimony_income,1.0,1.255,1.322,1.357,1.446,1.504,1.535,1.567,1.576,1.595,1.622,1.655,1.689,1.723,1.779
+american_opportunity_credit,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718
+amt_foreign_tax_credit,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +casualty_loss,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +cdcc_relevant_expenses,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +charitable_cash_donations,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +charitable_non_cash_donations,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +child_support_expense,1.0,1.011,1.083,1.142,1.164,1.161,1.18,1.198,1.219,1.238,1.26,1.282,1.305,1.329,1.353 +child_support_received,1.0,1.011,1.083,1.142,1.164,1.161,1.18,1.198,1.219,1.238,1.26,1.282,1.305,1.329,1.353 +disability_benefits,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +domestic_production_ald,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +early_withdrawal_penalty,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +educator_expense,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +employment_income,1.0,1.069,1.149,1.211,1.264,1.306,1.348,1.39,1.438,1.486,1.536,1.587,1.639,1.693,1.748 +employment_income_before_lsr,1.0,1.069,1.149,1.211,1.264,1.306,1.348,1.39,1.438,1.486,1.536,1.587,1.639,1.693,1.748 +employment_income_last_year,1.0,1.069,1.149,1.211,1.264,1.306,1.348,1.39,1.438,1.486,1.536,1.587,1.639,1.693,1.748 +energy_efficient_home_improvement_credit,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +estate_income,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +excess_withheld_payroll_tax,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +family_weight,1.0,1.003,1.007,1.016,1.027,1.039,1.049,1.056,1.061,1.066,1.071,1.076,1.081,1.086,1.09 
+farm_income,1.0,1.867,1.838,1.945,2.049,2.111,2.161,2.224,2.287,2.349,2.423,2.499,2.579,2.662,2.751 +farm_rent_income,1.0,1.357,1.335,1.413,1.489,1.534,1.571,1.616,1.662,1.707,1.76,1.816,1.874,1.935,1.999 +foreign_tax_credit,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +free_school_meals_reported,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +general_business_credit,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +health_insurance_premiums,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +health_savings_account_ald,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +household_weight,1.0,1.003,1.007,1.016,1.027,1.039,1.049,1.056,1.061,1.066,1.071,1.076,1.081,1.086,1.09 +interest_deduction,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +investment_income_elected_form_4952,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +keogh_distributions,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +long_term_capital_gains,1.0,1.824,1.11,1.195,1.244,1.195,1.14,1.122,1.126,1.145,1.173,1.206,1.243,1.283,1.326 +long_term_capital_gains_on_collectibles,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +medical_expense,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +medical_out_of_pocket_expenses,1.0,1.011,1.083,1.142,1.164,1.161,1.18,1.198,1.219,1.238,1.26,1.282,1.305,1.329,1.353 +misc_deduction,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +miscellaneous_income,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +non_qualified_dividend_income,1.0,1.2,1.269,1.283,1.325,1.376,1.414,1.445,1.483,1.533,1.624,1.714,1.801,1.885,1.966 
+non_sch_d_capital_gains,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +other_credits,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +partnership_s_corp_income,1.0,0.997,1.542,1.581,1.685,1.753,1.789,1.827,1.837,1.859,1.891,1.929,1.969,2.009,2.074 +person_weight,1.0,1.003,1.007,1.016,1.027,1.039,1.049,1.056,1.061,1.066,1.071,1.076,1.081,1.086,1.09 +population,1.0,1.003,1.007,1.016,1.027,1.039,1.049,1.056,1.061,1.066,1.071,1.076,1.081,1.086,1.09 +pre_tax_contributions,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +prior_year_minimum_tax_credit,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +qualified_dividend_income,1.0,1.2,1.269,1.283,1.325,1.376,1.414,1.445,1.483,1.533,1.624,1.714,1.801,1.885,1.966 +qualified_tuition_expenses,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +real_estate_taxes,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +recapture_of_investment_credit,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +rental_income,1.0,0.976,0.961,1.017,1.071,1.104,1.13,1.163,1.196,1.228,1.266,1.307,1.348,1.392,1.438 +roth_401k_contributions,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +roth_ira_contributions,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +salt_refund_income,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +savers_credit,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +self_employed_health_insurance_ald,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +self_employed_pension_contribution_ald,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 
+self_employed_pension_contributions,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +self_employment_income,1.0,1.255,1.322,1.357,1.446,1.504,1.535,1.567,1.576,1.595,1.622,1.655,1.689,1.723,1.779 +self_employment_income_before_lsr,1.0,1.255,1.322,1.357,1.446,1.504,1.535,1.567,1.576,1.595,1.622,1.655,1.689,1.723,1.779 +self_employment_income_last_year,1.0,1.255,1.322,1.357,1.446,1.504,1.535,1.567,1.576,1.595,1.622,1.655,1.689,1.723,1.779 +short_term_capital_gains,1.0,0.997,1.59,1.711,1.781,1.711,1.633,1.607,1.612,1.639,1.68,1.727,1.781,1.838,1.898 +snap_reported,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +social_security,1.0,1.276,1.355,1.55,1.718,1.841,1.937,2.031,2.143,2.268,2.398,2.519,2.654,2.805,2.951 +social_security_dependents,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +social_security_disability,1.0,0.997,0.993,1.071,1.093,1.115,1.13,1.151,1.166,1.186,1.206,1.227,1.249,1.272,1.295 +social_security_retirement,1.0,0.997,0.993,1.071,1.093,1.115,1.13,1.151,1.166,1.186,1.206,1.227,1.249,1.272,1.295 +social_security_survivors,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +spm_unit_broadband_subsidy_reported,1.0,1.011,1.083,1.142,1.164,1.161,1.18,1.198,1.219,1.238,1.26,1.282,1.305,1.329,1.353 +spm_unit_capped_housing_subsidy_reported,1.0,1.011,1.083,1.142,1.164,1.161,1.18,1.198,1.219,1.238,1.26,1.282,1.305,1.329,1.353 +spm_unit_capped_work_childcare_expenses,1.0,1.011,1.083,1.142,1.164,1.161,1.18,1.198,1.219,1.238,1.26,1.282,1.305,1.329,1.353 +spm_unit_energy_subsidy_reported,1.0,1.011,1.083,1.142,1.164,1.161,1.18,1.198,1.219,1.238,1.26,1.282,1.305,1.329,1.353 +spm_unit_federal_tax_reported,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +spm_unit_medical_expenses,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 
+spm_unit_net_income_reported,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +spm_unit_payroll_tax_reported,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +spm_unit_pre_subsidy_childcare_expenses,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +spm_unit_spm_threshold,1.0,1.011,1.083,1.142,1.164,1.161,1.18,1.198,1.219,1.238,1.26,1.282,1.305,1.329,1.353 +spm_unit_state_tax_reported,1.0,1.011,1.083,1.142,1.164,1.161,1.18,1.198,1.219,1.238,1.26,1.282,1.305,1.329,1.353 +spm_unit_total_income_reported,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +spm_unit_weight,1.0,1.003,1.007,1.016,1.027,1.039,1.049,1.056,1.061,1.066,1.071,1.076,1.081,1.086,1.09 +spm_unit_wic_reported,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +ssi_reported,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +state_and_local_sales_or_income_tax,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +strike_benefits,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +student_loan_interest,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +tanf_reported,1.0,1.011,1.083,1.142,1.164,1.161,1.18,1.198,1.219,1.238,1.26,1.282,1.305,1.329,1.353 +tax_exempt_401k_distributions,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +tax_exempt_403b_distributions,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +tax_exempt_interest_income,1.0,0.805,0.892,0.97,1.128,1.292,1.369,1.4,1.458,1.514,1.541,1.573,1.616,1.647,1.683 +tax_exempt_ira_distributions,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 
+tax_exempt_pension_income,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +tax_exempt_private_pension_income,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +tax_exempt_sep_distributions,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +tax_unit_weight,1.0,1.003,1.007,1.016,1.027,1.039,1.049,1.056,1.061,1.066,1.071,1.076,1.081,1.086,1.09 +taxable_401k_distributions,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +taxable_403b_distributions,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +taxable_interest_income,1.0,0.805,0.892,0.97,1.128,1.292,1.369,1.4,1.458,1.514,1.541,1.573,1.616,1.647,1.683 +taxable_ira_distributions,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +taxable_pension_income,1.0,1.122,1.101,1.206,1.308,1.345,1.371,1.401,1.433,1.463,1.495,1.531,1.572,1.623,1.674 +taxable_private_pension_income,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +taxable_sep_distributions,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +taxable_unemployment_compensation,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +traditional_401k_contributions,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +traditional_ira_contributions,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +unemployment_compensation,1.0,0.512,0.544,0.622,0.689,0.739,0.777,0.815,0.86,0.91,0.962,1.011,1.065,1.126,1.184 +unrecaptured_section_1250_gain,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +unreported_payroll_tax,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 
+veterans_benefits,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +w2_wages_from_qualified_business,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 +workers_compensation,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718 diff --git a/policyengine_us_data/data_storage/uprating_growth_factors.csv b/policyengine_us_data/data_storage/uprating_growth_factors.csv new file mode 100644 index 0000000..851a4c8 --- /dev/null +++ b/policyengine_us_data/data_storage/uprating_growth_factors.csv @@ -0,0 +1,112 @@ +Variable,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034 +alimony_expense,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +alimony_income,0,0.255,0.053,0.026,0.066,0.04,0.021,0.021,0.006,0.012,0.017,0.02,0.021,0.02,0.033 +american_opportunity_credit,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +amt_foreign_tax_credit,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +casualty_loss,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +cdcc_relevant_expenses,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +charitable_cash_donations,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +charitable_non_cash_donations,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +child_support_expense,0,0.011,0.071,0.054,0.019,-0.003,0.016,0.015,0.018,0.016,0.018,0.017,0.018,0.018,0.018 +child_support_received,0,0.011,0.071,0.054,0.019,-0.003,0.016,0.015,0.018,0.016,0.018,0.017,0.018,0.018,0.018 +disability_benefits,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 
+domestic_production_ald,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +early_withdrawal_penalty,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +educator_expense,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +employment_income,0,0.069,0.075,0.054,0.044,0.033,0.032,0.031,0.035,0.033,0.034,0.033,0.033,0.033,0.032 +employment_income_before_lsr,0,0.069,0.075,0.054,0.044,0.033,0.032,0.031,0.035,0.033,0.034,0.033,0.033,0.033,0.032 +employment_income_last_year,0,0.069,0.075,0.054,0.044,0.033,0.032,0.031,0.035,0.033,0.034,0.033,0.033,0.033,0.032 +energy_efficient_home_improvement_credit,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +estate_income,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +excess_withheld_payroll_tax,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +family_weight,0,0.003,0.004,0.009,0.011,0.012,0.01,0.007,0.005,0.005,0.005,0.005,0.005,0.005,0.004 +farm_income,0,0.867,-0.016,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.032,0.031,0.032,0.032,0.033 +farm_rent_income,0,0.357,-0.016,0.058,0.054,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.033,0.033 +foreign_tax_credit,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +free_school_meals_reported,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +general_business_credit,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +health_insurance_premiums,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +health_savings_account_ald,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +household_weight,0,0.003,0.004,0.009,0.011,0.012,0.01,0.007,0.005,0.005,0.005,0.005,0.005,0.005,0.004 
+interest_deduction,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +investment_income_elected_form_4952,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +keogh_distributions,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +long_term_capital_gains,0,0.824,-0.391,0.077,0.041,-0.039,-0.046,-0.016,0.004,0.017,0.024,0.028,0.031,0.032,0.034 +long_term_capital_gains_on_collectibles,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +medical_expense,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +medical_out_of_pocket_expenses,0,0.011,0.071,0.054,0.019,-0.003,0.016,0.015,0.018,0.016,0.018,0.017,0.018,0.018,0.018 +misc_deduction,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +miscellaneous_income,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +non_qualified_dividend_income,0,0.2,0.057,0.011,0.033,0.038,0.028,0.022,0.026,0.034,0.059,0.055,0.051,0.047,0.043 +non_sch_d_capital_gains,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +other_credits,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +partnership_s_corp_income,0,-0.003,0.547,0.025,0.066,0.04,0.021,0.021,0.005,0.012,0.017,0.02,0.021,0.02,0.032 +person_weight,0,0.003,0.004,0.009,0.011,0.012,0.01,0.007,0.005,0.005,0.005,0.005,0.005,0.005,0.004 +population,0,0.003,0.004,0.009,0.011,0.012,0.01,0.007,0.005,0.005,0.005,0.005,0.005,0.005,0.004 +pre_tax_contributions,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +prior_year_minimum_tax_credit,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +qualified_dividend_income,0,0.2,0.057,0.011,0.033,0.038,0.028,0.022,0.026,0.034,0.059,0.055,0.051,0.047,0.043 
+qualified_tuition_expenses,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +real_estate_taxes,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +recapture_of_investment_credit,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +rental_income,0,-0.024,-0.015,0.058,0.053,0.031,0.024,0.029,0.028,0.027,0.031,0.032,0.031,0.033,0.033 +roth_401k_contributions,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +roth_ira_contributions,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +salt_refund_income,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +savers_credit,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +self_employed_health_insurance_ald,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +self_employed_pension_contribution_ald,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +self_employed_pension_contributions,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +self_employment_income,0,0.255,0.053,0.026,0.066,0.04,0.021,0.021,0.006,0.012,0.017,0.02,0.021,0.02,0.033 +self_employment_income_before_lsr,0,0.255,0.053,0.026,0.066,0.04,0.021,0.021,0.006,0.012,0.017,0.02,0.021,0.02,0.033 +self_employment_income_last_year,0,0.255,0.053,0.026,0.066,0.04,0.021,0.021,0.006,0.012,0.017,0.02,0.021,0.02,0.033 +short_term_capital_gains,0,-0.003,0.595,0.076,0.041,-0.039,-0.046,-0.016,0.003,0.017,0.025,0.028,0.031,0.032,0.033 +snap_reported,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +social_security,0,0.276,0.062,0.144,0.108,0.072,0.052,0.049,0.055,0.058,0.057,0.05,0.054,0.057,0.052 
+social_security_dependents,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +social_security_disability,0,-0.003,-0.004,0.079,0.021,0.02,0.013,0.019,0.013,0.017,0.017,0.017,0.018,0.018,0.018 +social_security_retirement,0,-0.003,-0.004,0.079,0.021,0.02,0.013,0.019,0.013,0.017,0.017,0.017,0.018,0.018,0.018 +social_security_survivors,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +spm_unit_broadband_subsidy_reported,0,0.011,0.071,0.054,0.019,-0.003,0.016,0.015,0.018,0.016,0.018,0.017,0.018,0.018,0.018 +spm_unit_capped_housing_subsidy_reported,0,0.011,0.071,0.054,0.019,-0.003,0.016,0.015,0.018,0.016,0.018,0.017,0.018,0.018,0.018 +spm_unit_capped_work_childcare_expenses,0,0.011,0.071,0.054,0.019,-0.003,0.016,0.015,0.018,0.016,0.018,0.017,0.018,0.018,0.018 +spm_unit_energy_subsidy_reported,0,0.011,0.071,0.054,0.019,-0.003,0.016,0.015,0.018,0.016,0.018,0.017,0.018,0.018,0.018 +spm_unit_federal_tax_reported,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +spm_unit_medical_expenses,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +spm_unit_net_income_reported,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +spm_unit_payroll_tax_reported,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +spm_unit_pre_subsidy_childcare_expenses,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +spm_unit_spm_threshold,0,0.011,0.071,0.054,0.019,-0.003,0.016,0.015,0.018,0.016,0.018,0.017,0.018,0.018,0.018 +spm_unit_state_tax_reported,0,0.011,0.071,0.054,0.019,-0.003,0.016,0.015,0.018,0.016,0.018,0.017,0.018,0.018,0.018 +spm_unit_total_income_reported,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +spm_unit_weight,0,0.003,0.004,0.009,0.011,0.012,0.01,0.007,0.005,0.005,0.005,0.005,0.005,0.005,0.004 
+spm_unit_wic_reported,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +ssi_reported,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +state_and_local_sales_or_income_tax,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +strike_benefits,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +student_loan_interest,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +tanf_reported,0,0.011,0.071,0.054,0.019,-0.003,0.016,0.015,0.018,0.016,0.018,0.017,0.018,0.018,0.018 +tax_exempt_401k_distributions,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +tax_exempt_403b_distributions,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +tax_exempt_interest_income,0,-0.195,0.108,0.087,0.163,0.145,0.06,0.023,0.041,0.038,0.018,0.021,0.027,0.019,0.022 +tax_exempt_ira_distributions,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +tax_exempt_pension_income,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +tax_exempt_private_pension_income,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +tax_exempt_sep_distributions,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +tax_unit_weight,0,0.003,0.004,0.009,0.011,0.012,0.01,0.007,0.005,0.005,0.005,0.005,0.005,0.005,0.004 +taxable_401k_distributions,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +taxable_403b_distributions,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +taxable_interest_income,0,-0.195,0.108,0.087,0.163,0.145,0.06,0.023,0.041,0.038,0.018,0.021,0.027,0.019,0.022 
+taxable_ira_distributions,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +taxable_pension_income,0,0.122,-0.019,0.095,0.085,0.028,0.019,0.022,0.023,0.021,0.022,0.024,0.027,0.032,0.031 +taxable_private_pension_income,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +taxable_sep_distributions,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +taxable_unemployment_compensation,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +traditional_401k_contributions,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +traditional_ira_contributions,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +unemployment_compensation,0,-0.488,0.062,0.143,0.108,0.073,0.051,0.049,0.055,0.058,0.057,0.051,0.053,0.057,0.052 +unrecaptured_section_1250_gain,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +unreported_payroll_tax,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +veterans_benefits,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +w2_wages_from_qualified_business,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 +workers_compensation,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033 diff --git a/policyengine_us_data/datasets/__init__.py b/policyengine_us_data/datasets/__init__.py index 1212726..146d980 100644 --- a/policyengine_us_data/datasets/__init__.py +++ b/policyengine_us_data/datasets/__init__.py @@ -1,3 +1,4 @@ -from .cps import CPS_2022 +from .cps import * +from .puf import * -DATASETS = [CPS_2022] +DATASETS = [CPS_2022, PUF_2021] diff --git a/policyengine_us_data/datasets/cps/__init__.py b/policyengine_us_data/datasets/cps/__init__.py index 7acafb3..2411ca4 100644 --- 
a/policyengine_us_data/datasets/cps/__init__.py +++ b/policyengine_us_data/datasets/cps/__init__.py @@ -1 +1,3 @@ -from .policyengine_cps import CPS_2022 +from .cps import * +from .extended_cps import * +from .enhanced_cps import * diff --git a/policyengine_us_data/datasets/cps/policyengine_cps.py b/policyengine_us_data/datasets/cps/cps.py similarity index 94% rename from policyengine_us_data/datasets/cps/policyengine_cps.py rename to policyengine_us_data/datasets/cps/cps.py index 7c76973..02f18ca 100644 --- a/policyengine_us_data/datasets/cps/policyengine_cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -8,6 +8,9 @@ import os import yaml from typing import Type +from policyengine_us_data.utils.uprating import ( + create_policyengine_uprating_factors_table, +) class CPS(Dataset): @@ -22,6 +25,28 @@ def generate(self): Technical documentation and codebook here: https://www2.census.gov/programs-surveys/cps/techdocs/cpsmar21.pdf """ + if self.raw_cps is None: + # Extrapolate from CPS 2022 + + cps_2022 = CPS_2022(require=True) + print("Creating uprating factors table...") + uprating = create_policyengine_uprating_factors_table() + arrays = cps_2022.load_dataset() + for variable in uprating.index.unique(): + if variable in arrays: + current_index = uprating[uprating.index == variable][ + self.time_period + ].values[0] + start_index = uprating[uprating.index == variable][ + 2021 + ].values[0] + growth = current_index / start_index + print(f"Uprating {variable} by {growth-1:.1%}") + arrays[variable] = arrays[variable] * growth + + self.save_dataset(arrays) + return + raw_data = self.raw_cps(require=True).load() cps = h5py.File(self.file_path, mode="w") @@ -40,9 +65,6 @@ def generate(self): raw_data.close() cps.close() - cps = h5py.File(self.file_path, mode="a") - cps.close() - def add_id_variables( cps: h5py.File, @@ -75,23 +97,6 @@ def add_id_variables( cps["person_household_id"] = person.PH_SEQ cps["person_family_id"] = person.PH_SEQ * 10 + person.PF_SEQ - # Add 
weights - # Weights are multiplied by 100 to avoid decimals - cps["person_weight"] = person.A_FNLWGT / 1e2 - cps["family_weight"] = family.FSUP_WGT / 1e2 - - # Tax unit weight is the weight of the containing family. - family_weight = Series( - cps["family_weight"][...], index=cps["family_id"][...] - ) - person_family_id = cps["person_family_id"][...] - persons_family_weight = Series(family_weight[person_family_id]) - cps["tax_unit_weight"] = persons_family_weight.groupby( - cps["person_tax_unit_id"][...] - ).first() - - cps["spm_unit_weight"] = spm_unit.SPM_WEIGHT / 1e2 - cps["household_weight"] = household.HSUP_WGT / 1e2 # Marital units @@ -536,3 +541,10 @@ class CPS_2022(CPS): previous_year_raw_cps = CensusCPS_2021 file_path = STORAGE_FOLDER / "cps_2022.h5" time_period = 2022 + + +class CPS_2024(CPS): + name = "cps_2024" + label = "CPS 2024 (2022-based)" + file_path = STORAGE_FOLDER / "cps_2024.h5" + time_period = 2024 diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py new file mode 100644 index 0000000..dc6aa81 --- /dev/null +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -0,0 +1,147 @@ +from policyengine_core.data import Dataset +import pandas as pd +from policyengine_us_data.utils import ( + pe_to_soi, + get_soi, + build_loss_matrix, + fmt, +) +import numpy as np +from typing import Type +from policyengine_us_data.data_storage import STORAGE_FOLDER +from policyengine_us_data.datasets.cps.extended_cps import ( + ExtendedCPS_2024, + CPS_2019, +) +import torch + + +def reweight( + original_weights, + loss_matrix, + targets_array, +): + target_names = np.array(loss_matrix.columns) + loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32) + targets_array = torch.tensor(targets_array, dtype=torch.float32) + weights = torch.tensor( + np.log(original_weights), requires_grad=True, dtype=torch.float32 + ) + + # TODO: replace this with a call to the python reweight.py package. 
+ def loss(weights): + # Check for Nans in either the weights or the loss matrix + if torch.isnan(weights).any(): + raise ValueError("Weights contain NaNs") + if torch.isnan(loss_matrix).any(): + raise ValueError("Loss matrix contains NaNs") + estimate = weights @ loss_matrix + if torch.isnan(estimate).any(): + raise ValueError("Estimate contains NaNs") + rel_error = ( + ((estimate - targets_array) + 1) / (targets_array + 1) + ) ** 2 + if torch.isnan(rel_error).any(): + raise ValueError("Relative error contains NaNs") + worst_name = target_names[torch.argmax(rel_error)] + worst_val = rel_error[torch.argmax(rel_error)].item() + return rel_error.mean(), worst_name, worst_val + + optimizer = torch.optim.Adam([weights], lr=1) + from tqdm import trange + + iterator = trange(1_000) + for i in iterator: + optimizer.zero_grad() + l, worst_name, worst_val = loss(torch.exp(weights)) + l.backward() + iterator.set_postfix( + {"loss": l.item(), "worst": worst_name, "val": worst_val} + ) + optimizer.step() + + return torch.exp(weights).detach().numpy() + + +def train_previous_year_income_model(): + from policyengine_us import Microsimulation + + sim = Microsimulation(dataset=CPS_2019) + + VARIABLES = [ + "previous_year_income_available", + "employment_income", + "self_employment_income", + "age", + "is_male", + "spm_unit_state_fips", + "dividend_income", + "interest_income", + "social_security", + "capital_gains", + "is_disabled", + "is_blind", + "is_married", + "tax_unit_children", + "pension_income", + ] + + OUTPUTS = [ + "employment_income_last_year", + "self_employment_income_last_year", + ] + + df = sim.calculate_dataframe(VARIABLES + OUTPUTS, 2019, map_to="person") + df_train = df[df.previous_year_income_available] + + from survey_enhance import Imputation + + income_last_year = Imputation() + X = df_train[VARIABLES[1:]] + y = df_train[OUTPUTS] + + income_last_year.train(X, y) + + return income_last_year + + +class EnhancedCPS(Dataset): + data_format = 
Dataset.TIME_PERIOD_ARRAYS + input_dataset: Type[Dataset] + start_year: int + end_year: int + + def generate(self): + df = self.input_dataset(require=True).load() + from policyengine_us import Microsimulation + + sim = Microsimulation(dataset=self.input_dataset) + data = sim.dataset.load_dataset() + data["household_weight"] = {} + original_weights = sim.calculate("household_weight") + original_weights = original_weights.values + np.random.normal( + 1, 0.1, len(original_weights) + ) + for year in range(self.start_year, self.end_year + 1): + loss_matrix, targets_array = build_loss_matrix( + self.input_dataset, year + ) + optimised_weights = reweight( + original_weights, loss_matrix, targets_array + ) + data["household_weight"][year] = optimised_weights + + self.save_dataset(data) + + +class EnhancedCPS_2024(EnhancedCPS): + input_dataset = ExtendedCPS_2024 + start_year = 2024 + end_year = 2024 + name = "enhanced_cps_2024" + label = "Enhanced CPS 2024" + file_path = STORAGE_FOLDER / "enhanced_cps_2024.h5" + + +if __name__ == "__main__": + EnhancedCPS_2024().generate() diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py new file mode 100644 index 0000000..9f43122 --- /dev/null +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -0,0 +1,146 @@ +from policyengine_core.data import Dataset +from policyengine_us_data.data_storage import STORAGE_FOLDER +from typing import Type +from .cps import * +from ..puf import * +import pandas as pd + +IMPUTED_VARIABLES = [ + "alimony_expense", + "alimony_income", + "american_opportunity_credit", + "amt_foreign_tax_credit", + "casualty_loss", + "cdcc_relevant_expenses", + "charitable_cash_donations", + "charitable_non_cash_donations", + "domestic_production_ald", + "early_withdrawal_penalty", + "educator_expense", + "employment_income", + "energy_efficient_home_improvement_credit", + "estate_income", + "excess_withheld_payroll_tax", + "farm_income", + 
"farm_rent_income", + "foreign_tax_credit", + "general_business_credit", + "health_savings_account_ald", + "interest_deduction", + "investment_income_elected_form_4952", + "long_term_capital_gains", + "long_term_capital_gains_on_collectibles", + "medical_expense", + "misc_deduction", + "miscellaneous_income", + "non_qualified_dividend_income", + "non_sch_d_capital_gains", + "other_credits", + "partnership_s_corp_income", + "pre_tax_contributions", + "prior_year_minimum_tax_credit", + "qualified_dividend_income", + "qualified_tuition_expenses", + "real_estate_taxes", + "recapture_of_investment_credit", + "rental_income", + "salt_refund_income", + "savers_credit", + "self_employed_health_insurance_ald", + "self_employed_pension_contribution_ald", + "self_employment_income", + "short_term_capital_gains", + "social_security", + "student_loan_interest", + "tax_exempt_interest_income", + "tax_exempt_pension_income", + "taxable_interest_income", + "taxable_ira_distributions", + "taxable_pension_income", + "taxable_unemployment_compensation", + "traditional_ira_contributions", + "unrecaptured_section_1250_gain", + "unreported_payroll_tax", + "w2_wages_from_qualified_business", +] + + +class ExtendedCPS(Dataset): + cps: Type[CPS] + puf: Type[PUF] + data_format = Dataset.TIME_PERIOD_ARRAYS + + def generate(self): + from policyengine_us import Microsimulation + from survey_enhance import Imputation + + cps_sim = Microsimulation(dataset=self.cps) + puf_sim = Microsimulation(dataset=self.puf) + + INPUTS = [ + "age", + "is_male", + "tax_unit_is_joint", + "tax_unit_count_dependents", + "is_tax_unit_head", + "is_tax_unit_spouse", + "is_tax_unit_dependent", + ] + + X_train = puf_sim.calculate_dataframe(INPUTS) + y_train = puf_sim.calculate_dataframe(IMPUTED_VARIABLES) + X = cps_sim.calculate_dataframe(INPUTS) + y = pd.DataFrame(columns=IMPUTED_VARIABLES, index=X.index) + + model = Imputation() + model.train( + X_train, + y_train, + verbose=True, + sample_weight=puf_sim.calculate( + 
"household_weight", map_to="person" + ).values, + ) + y = model.predict(X, verbose=True) + + data = cps_sim.dataset.load_dataset() + new_data = {} + + for variable in list(data) + IMPUTED_VARIABLES: + variable_metadata = cps_sim.tax_benefit_system.variables.get( + variable + ) + if variable in data: + values = data[variable][...] + else: + values = cps_sim.calculate(variable).values + if variable in IMPUTED_VARIABLES: + pred_values = y[variable].values + entity = variable_metadata.entity.key + if entity != "person": + pred_values = cps_sim.populations[ + entity + ].value_from_first_person(pred_values) + values = np.concatenate([values, pred_values]) + elif variable == "person_id": + values = np.concatenate([values, values + values.max()]) + elif "_id" in variable: + values = np.concatenate([values, values + values.max()]) + elif "_weight" in variable: + values = np.concatenate([values, values * 0]) + else: + values = np.concatenate([values, values]) + new_data[variable] = { + self.time_period: values, + } + + self.save_dataset(new_data) + + +class ExtendedCPS_2024(ExtendedCPS): + cps = CPS_2024 + puf = PUF_2024 + name = "extended_cps_2024" + label = "Extended CPS (2024)" + file_path = STORAGE_FOLDER / "extended_cps_2024.h5" + time_period = 2024 diff --git a/policyengine_us_data/datasets/puf/__init__.py b/policyengine_us_data/datasets/puf/__init__.py index e69de29..2866243 100644 --- a/policyengine_us_data/datasets/puf/__init__.py +++ b/policyengine_us_data/datasets/puf/__init__.py @@ -0,0 +1 @@ +from .puf import * diff --git a/policyengine_us_data/datasets/puf/irs_puf.py b/policyengine_us_data/datasets/puf/irs_puf.py index 31de459..ab16abb 100644 --- a/policyengine_us_data/datasets/puf/irs_puf.py +++ b/policyengine_us_data/datasets/puf/irs_puf.py @@ -39,6 +39,6 @@ class IRS_PUF_2015(IRS_PUF): name = "irs_puf_2015" label = "IRS PUF (2015)" time_period = 2015 - puf_file_path = "~/Downloads/puf_2015.csv" - puf_demographics_file_path = 
"~/Downloads/demographics_2015.csv" + puf_file_path = STORAGE_FOLDER / "puf_2015.csv" + puf_demographics_file_path = STORAGE_FOLDER / "demographics_2015.csv" file_path = STORAGE_FOLDER / "irs_puf_2015.h5" diff --git a/policyengine_us_data/datasets/puf/policyengine_puf.py b/policyengine_us_data/datasets/puf/policyengine_puf.py deleted file mode 100644 index e69de29..0000000 diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py new file mode 100644 index 0000000..57f24da --- /dev/null +++ b/policyengine_us_data/datasets/puf/puf.py @@ -0,0 +1,507 @@ +from tqdm import tqdm +import numpy as np +import pandas as pd +from microdf import MicroDataFrame +from policyengine_core.data import Dataset +from policyengine_us_data.data_storage import STORAGE_FOLDER +from .uprate_puf import uprate_puf +from survey_enhance import Imputation +from .irs_puf import IRS_PUF_2015 +from policyengine_us_data.utils.uprating import ( + create_policyengine_uprating_factors_table, +) + +rng = np.random.default_rng(seed=64) + + +def impute_pension_contributions_to_puf(puf_df): + from policyengine_us import Microsimulation + from policyengine_us_data.datasets.cps import CPS_2021 + + cps = Microsimulation(dataset=CPS_2021) + cps_df = cps.calculate_dataframe( + ["employment_income", "household_weight", "pre_tax_contributions"] + ) + + pension_contributions = Imputation() + + pension_contributions.train( + X=cps_df[["employment_income"]], + Y=cps_df[["pre_tax_contributions"]], + sample_weight=cps_df["household_weight"], + ) + return pension_contributions.predict( + X=puf_df[["employment_income"]], + ) + + +def impute_missing_demographics( + puf: pd.DataFrame, demographics: pd.DataFrame +) -> pd.DataFrame: + puf_with_demographics = ( + puf[puf.RECID.isin(demographics.RECID)] + .merge(demographics, on="RECID") + .fillna(0) + ) + + DEMOGRAPHIC_VARIABLES = [ + "AGEDP1", + "AGEDP2", + "AGEDP3", + "AGERANGE", + "EARNSPLIT", + "GENDER", + ] + 
NON_DEMOGRAPHIC_VARIABLES = [ + "E00200", + "MARS", + "DSI", + "EIC", + "XTOT", + ] + + demographics_from_puf = Imputation() + + demographics_from_puf.train( + X=puf_with_demographics[NON_DEMOGRAPHIC_VARIABLES], + Y=puf_with_demographics[DEMOGRAPHIC_VARIABLES], + ) + + puf_without_demographics = puf[ + ~puf.RECID.isin(puf_with_demographics.RECID) + ].reset_index() + predicted_demographics = demographics_from_puf.predict( + X=puf_without_demographics, + ) + puf_with_imputed_demographics = pd.concat( + [puf_without_demographics, predicted_demographics], axis=1 + ) + + weighted_puf_with_demographics = MicroDataFrame( + puf_with_demographics, weights="S006" + ) + weighted_puf_with_imputed_demographics = MicroDataFrame( + puf_with_imputed_demographics, weights="S006" + ) + + puf_combined = pd.concat( + [ + weighted_puf_with_demographics, + weighted_puf_with_imputed_demographics, + ] + ) + + return puf_combined + + +def decode_age_filer(age_range: int) -> int: + if age_range == 0: + return 40 + AGERANGE_FILER_DECODE = { + 1: 18, + 2: 26, + 3: 35, + 4: 45, + 5: 55, + 6: 65, + 7: 80, + } + lower = AGERANGE_FILER_DECODE[age_range] + upper = AGERANGE_FILER_DECODE[age_range + 1] + return rng.integers(low=lower, high=upper, endpoint=False) + + +def decode_age_dependent(age_range: int) -> int: + if age_range == 0: + return 0 + AGERANGE_DEPENDENT_DECODE = { + 0: 0, + 1: 0, + 2: 5, + 3: 13, + 4: 17, + 5: 19, + 6: 25, + 7: 30, + } + lower = AGERANGE_DEPENDENT_DECODE[age_range] + upper = AGERANGE_DEPENDENT_DECODE[age_range + 1] + return rng.integers(low=lower, high=upper, endpoint=False) + + +def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame: + # Add variable renames + puf.S006 = puf.S006 / 100 + # puf["adjusted_gross_income"] = puf.E00100 + puf["alimony_expense"] = puf.E03500 + puf["alimony_income"] = puf.E00800 + puf["casualty_loss"] = puf.E20500 + puf["cdcc_relevant_expenses"] = puf.E32800 + puf["charitable_cash_donations"] = puf.E19800 + puf["charitable_non_cash_donations"] 
= puf.E20100 + puf["domestic_production_ald"] = puf.E03240 + puf["early_withdrawal_penalty"] = puf.E03400 + puf["educator_expense"] = puf.E03220 + puf["employment_income"] = puf.E00200 + puf["estate_income"] = puf.E26390 - puf.E26400 + puf["farm_income"] = puf.T27800 + puf["farm_rent_income"] = puf.E27200 + puf["health_savings_account_ald"] = puf.E03290 + puf["interest_deduction"] = puf.E19200 + puf["long_term_capital_gains"] = puf.P23250 + puf["long_term_capital_gains_on_collectibles"] = puf.E24518 + puf["medical_expense"] = puf.E17500 + puf["misc_deduction"] = puf.E20400 + puf["non_qualified_dividend_income"] = puf.E00600 - puf.E00650 + puf["partnership_s_corp_income"] = puf.E26270 + puf["qualified_dividend_income"] = puf.E00650 + puf["qualified_tuition_expenses"] = puf.E03230 + puf["real_estate_taxes"] = puf.E18500 + puf["rental_income"] = puf.E25850 - puf.E25860 + puf["self_employment_income"] = puf.E00900 + puf["self_employed_health_insurance_ald"] = puf.E03270 + puf["self_employed_pension_contribution_ald"] = puf.E03300 + puf["short_term_capital_gains"] = puf.P22250 + puf["social_security"] = puf.E02400 + puf["state_and_local_sales_or_income_tax"] = puf.E18400 + puf["student_loan_interest"] = puf.E03210 + puf["taxable_interest_income"] = puf.E00300 + puf["taxable_pension_income"] = puf.E01700 + puf["taxable_unemployment_compensation"] = puf.E02300 + puf["taxable_ira_distributions"] = puf.E01400 + puf["tax_exempt_interest_income"] = puf.E00400 + puf["tax_exempt_pension_income"] = puf.E01500 - puf.E01700 + puf["traditional_ira_contributions"] = puf.E03150 + puf["unrecaptured_section_1250_gain"] = puf.E24515 + + puf["foreign_tax_credit"] = puf.E07300 + puf["amt_foreign_tax_credit"] = puf.E62900 + puf["miscellaneous_income"] = puf.E01200 + puf["salt_refund_income"] = puf.E00700 + puf["investment_income_elected_form_4952"] = puf.E58990 + puf["general_business_credit"] = puf.E07400 + puf["prior_year_minimum_tax_credit"] = puf.E07600 + 
puf["excess_withheld_payroll_tax"] = puf.E11200 + puf["non_sch_d_capital_gains"] = puf.E01100 + puf["american_opportunity_credit"] = puf.E87521 + puf["energy_efficient_home_improvement_credit"] = puf.E07260 + puf["early_withdrawal_penalty"] = puf.E09900 + # puf["qualified_tuition_expenses"] = puf.E87530 # PE uses the same variable for qualified tuition (general) and qualified tuition (Lifetime Learning Credit). Revisit here. + puf["other_credits"] = puf.P08000 + puf["savers_credit"] = puf.E07240 + puf["recapture_of_investment_credit"] = puf.E09700 + puf["unreported_payroll_tax"] = puf.E09800 + # Ignore f2441 (AMT form attached) + # Ignore cmbtp (estimate of AMT income not in AGI) + # Ignore k1bx14s and k1bx14p (partner self-employment income included in partnership and S-corp income) + qbi = np.maximum(0, puf.E00900 + puf.E26270 + puf.E02100 + puf.E27200) + W2_WAGES_SCALE = 0.16 + puf["w2_wages_from_qualified_business"] = qbi * W2_WAGES_SCALE + + # Remove aggregate records + puf = puf[puf.MARS != 0] + + puf["filing_status"] = puf.MARS.map( + { + 1: "SINGLE", + 2: "JOINT", + 3: "SEPARATE", + 4: "HEAD_OF_HOUSEHOLD", + } + ) + puf["household_id"] = puf.RECID + puf["household_weight"] = puf.S006 + puf["exemptions_count"] = puf.XTOT + + return puf + + +FINANCIAL_SUBSET = [ + # "adjusted_gross_income", + "alimony_expense", + "alimony_income", + "casualty_loss", + "cdcc_relevant_expenses", + "charitable_cash_donations", + "charitable_non_cash_donations", + "domestic_production_ald", + "early_withdrawal_penalty", + "educator_expense", + "employment_income", + "estate_income", + "farm_income", + "farm_rent_income", + "health_savings_account_ald", + "interest_deduction", + "long_term_capital_gains", + "long_term_capital_gains_on_collectibles", + "medical_expense", + "misc_deduction", + "non_qualified_dividend_income", + "non_sch_d_capital_gains", + "partnership_s_corp_income", + "qualified_dividend_income", + "qualified_tuition_expenses", + "real_estate_taxes", + 
"rental_income", + "self_employment_income", + "self_employed_health_insurance_ald", + "self_employed_pension_contribution_ald", + "short_term_capital_gains", + "social_security", + "state_and_local_sales_or_income_tax", + "student_loan_interest", + "taxable_interest_income", + "taxable_pension_income", + "taxable_unemployment_compensation", + "taxable_ira_distributions", + "tax_exempt_interest_income", + "tax_exempt_pension_income", + "traditional_ira_contributions", + "unrecaptured_section_1250_gain", + "foreign_tax_credit", + "amt_foreign_tax_credit", + "miscellaneous_income", + "salt_refund_income", + "investment_income_elected_form_4952", + "general_business_credit", + "prior_year_minimum_tax_credit", + "excess_withheld_payroll_tax", + "american_opportunity_credit", + "energy_efficient_home_improvement_credit", + "other_credits", + "savers_credit", + "recapture_of_investment_credit", + "unreported_payroll_tax", + "pre_tax_contributions", + "w2_wages_from_qualified_business", +] + + +class PUF(Dataset): + time_period = None + data_format = Dataset.ARRAYS + + def generate(self): + from policyengine_us.system import system + + print("Importing PolicyEngine US variable metadata...") + + irs_puf = IRS_PUF_2015(require=True) + + puf = irs_puf.load("puf") + demographics = irs_puf.load("puf_demographics") + + if self.time_period == 2021: + puf = uprate_puf(puf, 2015, self.time_period) + elif self.time_period >= 2021: + puf_2021 = PUF_2021(require=True) + print("Creating uprating factors table...") + uprating = create_policyengine_uprating_factors_table() + arrays = puf_2021.load_dataset() + for variable in uprating: + if variable in arrays: + current_index = uprating[uprating.Variable == variable][ + self.time_period + ].values[0] + start_index = uprating[uprating.Variable == variable][ + 2021 + ].values[0] + growth = current_index / start_index + print(f"Uprating {variable} by {growth-1:.1%}") + arrays[variable] = arrays[variable] * growth + self.save_dataset(arrays) 
+ return + + puf = puf[puf.MARS != 0] # Remove aggregate records + + print("Pre-processing PUF...") + original_recid = puf.RECID.values.copy() + puf = preprocess_puf(puf) + print("Imputing missing PUF demographics...") + puf = impute_missing_demographics(puf, demographics) + print("Imputing PUF pension contributions...") + puf["pre_tax_contributions"] = impute_pension_contributions_to_puf( + puf[["employment_income"]] + ) + + # Sort in original PUF order + puf = puf.set_index("RECID").loc[original_recid].reset_index() + puf = puf.fillna(0) + self.variable_to_entity = { + variable: system.variables[variable].entity.key + for variable in system.variables + } + + VARIABLES = [ + "person_id", + "tax_unit_id", + "marital_unit_id", + "spm_unit_id", + "family_id", + "household_id", + "person_tax_unit_id", + "person_marital_unit_id", + "person_spm_unit_id", + "person_family_id", + "person_household_id", + "age", + "household_weight", + "is_male", + "filing_status", + "is_tax_unit_head", + "is_tax_unit_spouse", + "is_tax_unit_dependent", + ] + FINANCIAL_SUBSET + + self.holder = {variable: [] for variable in VARIABLES} + + i = 0 + self.earn_splits = [] + for _, row in tqdm( + puf.iterrows(), + total=len(puf), + desc="Constructing hierarchical PUF", + ): + i += 1 + exemptions = row["exemptions_count"] + tax_unit_id = row["household_id"] + self.add_tax_unit(row, tax_unit_id) + self.add_filer(row, tax_unit_id) + exemptions -= 1 + if row["filing_status"] == "JOINT": + self.add_spouse(row, tax_unit_id) + exemptions -= 1 + + for j in range(min(3, exemptions)): + self.add_dependent(row, tax_unit_id, j) + + groups_assumed_to_be_tax_unit_like = [ + "family", + "spm_unit", + "household", + ] + + for group in groups_assumed_to_be_tax_unit_like: + self.holder[f"{group}_id"] = self.holder["tax_unit_id"] + self.holder[f"person_{group}_id"] = self.holder[ + "person_tax_unit_id" + ] + + for key in self.holder: + if key == "filing_status": + self.holder[key] = 
np.array(self.holder[key]).astype("S") + else: + self.holder[key] = np.array(self.holder[key]).astype(float) + assert not np.isnan(self.holder[key]).any(), f"{key} has NaNs." + + self.save_dataset(self.holder) + + def add_tax_unit(self, row, tax_unit_id): + self.holder["tax_unit_id"].append(tax_unit_id) + + for key in FINANCIAL_SUBSET: + if self.variable_to_entity[key] == "tax_unit": + self.holder[key].append(row[key]) + + earnings_split = round(row["EARNSPLIT"]) + if earnings_split > 0: + SPLIT_DECODES = { + 1: 0.0, + 2: 0.25, + 3: 0.75, + 4: 1.0, + } + lower = SPLIT_DECODES[earnings_split] + upper = SPLIT_DECODES[earnings_split + 1] + frac = (upper - lower) * rng.random() + lower + self.earn_splits.append(1.0 - frac) + else: + self.earn_splits.append(1.0) + + self.holder["filing_status"].append(row["filing_status"]) + + def add_filer(self, row, tax_unit_id): + person_id = int(tax_unit_id * 1e2 + 1) + self.holder["person_id"].append(person_id) + self.holder["person_tax_unit_id"].append(tax_unit_id) + self.holder["person_marital_unit_id"].append(person_id) + self.holder["marital_unit_id"].append(person_id) + self.holder["is_tax_unit_head"].append(True) + self.holder["is_tax_unit_spouse"].append(False) + self.holder["is_tax_unit_dependent"].append(False) + + self.holder["age"].append(decode_age_filer(round(row["AGERANGE"]))) + + self.holder["household_weight"].append(row["household_weight"]) + self.holder["is_male"].append(row["GENDER"] == 1) + + for key in FINANCIAL_SUBSET: + if self.variable_to_entity[key] == "person": + self.holder[key].append(row[key] * self.earn_splits[-1]) + + def add_spouse(self, row, tax_unit_id): + person_id = int(tax_unit_id * 1e2 + 2) + self.holder["person_id"].append(person_id) + self.holder["person_tax_unit_id"].append(tax_unit_id) + self.holder["person_marital_unit_id"].append(person_id - 1) + self.holder["is_tax_unit_head"].append(False) + self.holder["is_tax_unit_spouse"].append(True) + 
self.holder["is_tax_unit_dependent"].append(False) + + self.holder["age"].append( + decode_age_filer(round(row["AGERANGE"])) + ) # Assume same age as filer for now + + # 96% of joint filers are opposite-gender + + is_opposite_gender = rng.random() < 0.96 + opposite_gender_code = 0 if row["GENDER"] == 1 else 1 + same_gender_code = 1 - opposite_gender_code + self.holder["is_male"].append( + opposite_gender_code if is_opposite_gender else same_gender_code + ) + + for key in FINANCIAL_SUBSET: + if self.variable_to_entity[key] == "person": + self.holder[key].append(row[key] * (1 - self.earn_splits[-1])) + + def add_dependent(self, row, tax_unit_id, dependent_id): + person_id = int(tax_unit_id * 1e2 + 3 + dependent_id) + self.holder["person_id"].append(person_id) + self.holder["person_tax_unit_id"].append(tax_unit_id) + self.holder["person_marital_unit_id"].append(person_id) + self.holder["marital_unit_id"].append(person_id) + self.holder["is_tax_unit_head"].append(False) + self.holder["is_tax_unit_spouse"].append(False) + self.holder["is_tax_unit_dependent"].append(True) + + age = decode_age_dependent(round(row[f"AGEDP{dependent_id + 1}"])) + self.holder["age"].append(age) + + for key in FINANCIAL_SUBSET: + if self.variable_to_entity[key] == "person": + self.holder[key].append(0) + + self.holder["is_male"].append(rng.choice([0, 1])) + + +class PUF_2015(PUF): + label = "PUF 2015" + name = "puf_2015" + time_period = 2015 + file_path = STORAGE_FOLDER / "puf_2015.h5" + + +class PUF_2021(PUF): + label = "PUF 2021" + name = "puf_2021" + time_period = 2021 + file_path = STORAGE_FOLDER / "puf_2021.h5" + + +class PUF_2024(PUF): + label = "PUF 2024 (2015-based)" + name = "puf_2024" + time_period = 2024 + file_path = STORAGE_FOLDER / "puf_2024.h5" diff --git a/policyengine_us_data/datasets/puf/uprate_puf.py b/policyengine_us_data/datasets/puf/uprate_puf.py index 3f37dcb..3b8f8de 100644 --- a/policyengine_us_data/datasets/puf/uprate_puf.py +++ 
b/policyengine_us_data/datasets/puf/uprate_puf.py @@ -87,7 +87,8 @@ "E09800", ] -soi = pd.read_csv(STORAGE_FOLDER / "soi.csv") +if (STORAGE_FOLDER / "soi.csv").exists(): + soi = pd.read_csv(STORAGE_FOLDER / "soi.csv") def get_soi_aggregate(variable, year, is_count): diff --git a/policyengine_us_data/evaluation/loss.py b/policyengine_us_data/evaluation/loss.py new file mode 100644 index 0000000..f0da3f8 --- /dev/null +++ b/policyengine_us_data/evaluation/loss.py @@ -0,0 +1,10 @@ +import numpy as np +import pandas as pd + + +def create_statistical_target_matrix() -> np.array: + pass + + +def create_statistical_targets() -> pd.DataFrame: + pass diff --git a/policyengine_us_data/evaluation/report.py b/policyengine_us_data/evaluation/report.py new file mode 100644 index 0000000..02b2589 --- /dev/null +++ b/policyengine_us_data/evaluation/report.py @@ -0,0 +1,42 @@ +from policyengine_us_data.data_storage import STORAGE_FOLDER +import argparse + + +def create_report(): + from policyengine_us import Microsimulation + from policyengine_us_data import CPS_2024 + import pandas as pd + + sim = Microsimulation(dataset=CPS_2024) + + START_YEAR = 2024 + BUDGET_WINDOW = 10 + + hnet_totals = [] + years = [] + for year in range(START_YEAR, START_YEAR + BUDGET_WINDOW): + hnet_totals.append( + round(sim.calculate("household_net_income", year).sum() / 1e9, 1) + ) + years.append(year) + + df = pd.DataFrame( + {"Year": years, "Household net income": hnet_totals} + ).set_index("Year", drop=True) + + report = f"""# Economy summary + +## Household net income +{df.T.to_markdown(index=False)} +""" + + return report + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--output", type=str, default="report.md") + args = parser.parse_args() + report = create_report() + with open(STORAGE_FOLDER / args.output, "w") as f: + f.write(report) diff --git a/policyengine_us_data/evaluation/summary.py b/policyengine_us_data/evaluation/summary.py deleted file mode 
100644 index 15489c0..0000000 --- a/policyengine_us_data/evaluation/summary.py +++ /dev/null @@ -1,87 +0,0 @@ -from policyengine_us_data.data_storage import STORAGE_FOLDER -from policyengine_core.data import Dataset -import pandas as pd -from pathlib import Path -from typing import Type -from tqdm import tqdm - - -def evaluate_dataset(dataset: Type[Dataset]) -> pd.DataFrame: - from policyengine_us import Microsimulation - - sim = Microsimulation(dataset=dataset) - - KEY_VARIABLES = [ - "household_net_income", - "income_tax", - "snap", - "ssi", - ] - - KEY_TIME_PERIODS = [ - 2024, - 2025, - 2026, - 2027, - 2028, - 2029, - 2030, - 2031, - 2032, - 2033, - ] - variables = [] - time_periods = [] - totals = [] - - for time_period in tqdm(KEY_TIME_PERIODS[:3]): - for variable in KEY_VARIABLES: - total = round(sim.calculate(variable, time_period).sum() / 1e9, 1) - variables.append(variable) - time_periods.append(time_period) - totals.append(total) - - df = pd.DataFrame( - { - "Variable": variables, - "Time period": time_periods, - "Total": totals, - } - ) - - df["Date"] = pd.Timestamp("now") - df["Dataset"] = dataset.name - - return df - - -def main(): - from policyengine_us_data.datasets import DATASETS - from policyengine_us_data.utils.github import download - - try: - download( - "policyengine", - "policyengine-us-data", - "release", - "evaluation.csv", - STORAGE_FOLDER / "evaluation.csv", - ) - except: - pass - - df = pd.DataFrame() - - for dataset in DATASETS: - df = pd.concat([df, evaluate_dataset(dataset)]) - - file_path = Path(STORAGE_FOLDER / "evaluation.csv") - if file_path.exists(): - existing_df = pd.read_csv(file_path) - df = pd.concat([existing_df, df]) - - df.to_csv(file_path, index=False) - - -if __name__ == "__main__": - main() diff --git a/policyengine_us_data/tests/test_datasets/test_policyengine_cps.py b/policyengine_us_data/tests/test_datasets/test_cps.py similarity index 78% rename from policyengine_us_data/tests/test_datasets/test_policyengine_cps.py 
rename to policyengine_us_data/tests/test_datasets/test_cps.py index 291edbb..0cf2bba 100644 --- a/policyengine_us_data/tests/test_datasets/test_policyengine_cps.py +++ b/policyengine_us_data/tests/test_datasets/test_cps.py @@ -3,7 +3,7 @@ @pytest.mark.parametrize("year", [2022]) def test_policyengine_cps_generates(year: int): - from policyengine_us_data.datasets.cps.policyengine_cps import CPS_2022 + from policyengine_us_data.datasets.cps.cps import CPS_2022 dataset_by_year = { 2022: CPS_2022, @@ -14,7 +14,7 @@ def test_policyengine_cps_generates(year: int): @pytest.mark.parametrize("year", [2022]) def test_policyengine_cps_loads(year: int): - from policyengine_us_data.datasets.cps.policyengine_cps import CPS_2022 + from policyengine_us_data.datasets.cps.cps import CPS_2022 dataset_by_year = { 2022: CPS_2022, diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py new file mode 100644 index 0000000..41fdb1a --- /dev/null +++ b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py @@ -0,0 +1,29 @@ +import pytest + + +@pytest.mark.parametrize("year", [2024]) +def test_policyengine_cps_generates(year: int): + from policyengine_us_data.datasets.cps import EnhancedCPS_2024 + + dataset_by_year = { + 2024: EnhancedCPS_2024, + } + + dataset_by_year[year](require=True) + + +@pytest.mark.parametrize("year", [2024]) +def test_policyengine_cps_loads(year: int): + from policyengine_us_data.datasets.cps import EnhancedCPS_2024 + + dataset_by_year = { + 2024: EnhancedCPS_2024, + } + + dataset = dataset_by_year[year] + + from policyengine_us import Microsimulation + + sim = Microsimulation(dataset=dataset) + + assert not sim.calculate("household_net_income").isna().any() diff --git a/policyengine_us_data/tests/test_datasets/test_irs_puf.py b/policyengine_us_data/tests/test_datasets/test_irs_puf.py index 529bee8..f6e8472 100644 --- a/policyengine_us_data/tests/test_datasets/test_irs_puf.py 
+++ b/policyengine_us_data/tests/test_datasets/test_irs_puf.py @@ -1,5 +1,6 @@ import pytest + @pytest.mark.skip(reason="This test requires private data.") @pytest.mark.parametrize("year", [2015]) def test_irs_puf_generates(year: int): diff --git a/policyengine_us_data/utils/__init__.py b/policyengine_us_data/utils/__init__.py index e69de29..1ccbd39 100644 --- a/policyengine_us_data/utils/__init__.py +++ b/policyengine_us_data/utils/__init__.py @@ -0,0 +1,4 @@ +from .github import * +from .soi import * +from .uprating import * +from .loss import * diff --git a/policyengine_us_data/utils/github.py b/policyengine_us_data/utils/github.py index 380b4ef..f9f5ce2 100644 --- a/policyengine_us_data/utils/github.py +++ b/policyengine_us_data/utils/github.py @@ -2,7 +2,7 @@ import requests auth_headers = { - "Authorization": f"token {os.environ['GITHUB_TOKEN']}", + "Authorization": f"token {os.environ.get('POLICYENGINE_US_DATA_GITHUB_TOKEN')}", } @@ -79,3 +79,25 @@ def upload( ) return response.json() + + +def set_pr_auto_review_comment(text: str): + # On a pull request, set a review comment with the given text. + + pr_number = os.environ["GITHUB_PR_NUMBER"] + + url = f"https://api.github.com/repos/{os.environ['GITHUB_REPOSITORY']}/pulls/{pr_number}/reviews" + + response = requests.post( + url, + headers=auth_headers, + json={ + "body": text, + "event": "COMMENT", + }, + ) + + if response.status_code != 200: + raise ValueError( + f"Invalid response code {response.status_code} for url {url}. 
Received: {response.text}" + ) diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py new file mode 100644 index 0000000..39f99e7 --- /dev/null +++ b/policyengine_us_data/utils/loss.py @@ -0,0 +1,193 @@ +import pandas as pd +from .soi import pe_to_soi, get_soi +import numpy as np +from policyengine_us_data.data_storage import STORAGE_FOLDER + + +def fmt(x): + if x == -np.inf: + return "-inf" + if x == np.inf: + return "inf" + if x < 1e3: + return f"{x:.0f}" + if x < 1e6: + return f"{x/1e3:.0f}k" + if x < 1e9: + return f"{x/1e6:.0f}m" + return f"{x/1e9:.1f}bn" + + +def build_loss_matrix(dataset: type, time_period): + loss_matrix = pd.DataFrame() + df = pe_to_soi(dataset, time_period) + agi = df["adjusted_gross_income"].values + filer = df["is_tax_filer"].values + taxable = df["total_income_tax"].values > 0 + soi_subset = get_soi(time_period) + targets_array = [] + agi_level_targeted_variables = [ + "adjusted_gross_income", + "count", + "employment_income", + "business_net_profits", + "capital_gains_gross", + "ordinary_dividends", + "partnership_and_s_corp_income", + "qualified_dividends", + "taxable_interest_income", + "total_pension_income", + "total_social_security", + ] + aggregate_level_targeted_variables = [ + "business_net_losses", + "capital_gains_distributions", + "capital_gains_losses", + "estate_income", + "estate_losses", + "exempt_interest", + "ira_distributions", + "partnership_and_s_corp_losses", + "rent_and_royalty_net_income", + "rent_and_royalty_net_losses", + "taxable_pension_income", + "taxable_social_security", + "unemployment_compensation", + ] + aggregate_level_targeted_variables = [ + variable + for variable in aggregate_level_targeted_variables + if variable in df.columns + ] + soi_subset = soi_subset[ + soi_subset.Variable.isin(agi_level_targeted_variables) + & ( + (soi_subset["AGI lower bound"] != -np.inf) + | (soi_subset["AGI upper bound"] != np.inf) + ) + | ( + 
soi_subset.Variable.isin(aggregate_level_targeted_variables) + & (soi_subset["AGI lower bound"] == -np.inf) + & (soi_subset["AGI upper bound"] == np.inf) + ) + ] + for _, row in soi_subset.iterrows(): + if not row["Taxable only"]: + continue # exclude non "taxable returns" statistics + + mask = ( + (agi >= row["AGI lower bound"]) + * (agi < row["AGI upper bound"]) + * filer + ) > 0 + + if row["Filing status"] == "Single": + mask *= df["filing_status"].values == "SINGLE" + elif row["Filing status"] == "Married Filing Jointly/Surviving Spouse": + mask *= df["filing_status"].values == "JOINT" + elif row["Filing status"] == "Head of Household": + mask *= df["filing_status"].values == "HEAD_OF_HOUSEHOLD" + elif row["Filing status"] == "Married Filing Separately": + mask *= df["filing_status"].values == "SEPARATE" + + values = df[row["Variable"]].values + + if row["Taxable only"]: + mask *= taxable + + if row["Count"]: + values = (values > 0).astype(float) + + agi_range_label = ( + f"{fmt(row['AGI lower bound'])}-{fmt(row['AGI upper bound'])}" + ) + taxable_label = ( + "taxable" if row["Taxable only"] else "all" + " returns" + ) + filing_status_label = row["Filing status"] + + variable_label = row["Variable"].replace("_", " ") + + if row["Count"] and not row["Variable"] == "count": + label = ( + f"irs/{variable_label}/count/AGI in " + f"{agi_range_label}/{taxable_label}/{filing_status_label}" + ) + elif row["Variable"] == "count": + label = ( + f"irs/{variable_label}/count/AGI in " + f"{agi_range_label}/{taxable_label}/{filing_status_label}" + ) + else: + label = ( + f"irs/{variable_label}/total/AGI in " + f"{agi_range_label}/{taxable_label}/{filing_status_label}" + ) + + if label not in loss_matrix.columns: + loss_matrix[label] = mask * values + targets_array.append(row["Value"]) + + # Convert tax-unit level df to household-level df + + from policyengine_us import Microsimulation + + sim = Microsimulation(dataset=dataset) + hh_id = sim.calculate("household_id", 
map_to="person") + tax_unit_hh_id = sim.map_result( + hh_id, "person", "tax_unit", how="value_from_first_person" + ) + + loss_matrix = loss_matrix.groupby(tax_unit_hh_id).sum() + + hh_id = sim.calculate("household_id").values + loss_matrix = loss_matrix.loc[hh_id] + + # Census single-year age population projections + + populations = pd.read_csv(STORAGE_FOLDER / "np2023_d5_mid.csv") + populations = populations[populations.SEX == 0][populations.RACE_HISP == 0] + populations = ( + populations.groupby("YEAR") + .sum()[[f"POP_{i}" for i in range(0, 86)]] + .T[time_period] + .values + ) # Array of [age_0_pop, age_1_pop, ...] for the given year + age = sim.calculate("age").values + for year in range(len(populations)): + label = f"census/population_by_age/{year}" + loss_matrix[label] = sim.map_result( + (age >= year) * (age < year + 1), "person", "household" + ) + targets_array.append(populations[year]) + + # CBO projections + + PROGRAMS = [ + "income_tax", + "snap", + "social_security", + "ssi", + "unemployment_compensation", + ] + + for variable_name in PROGRAMS: + label = f"cbo/{variable_name}" + loss_matrix[label] = sim.calculate( + variable_name, map_to="household" + ).values + if any(loss_matrix[label].isna()): + raise ValueError(f"Missing values for {label}") + targets_array.append( + sim.tax_benefit_system.parameters( + time_period + ).calibration.gov.cbo._children[variable_name] + ) + + if any(loss_matrix.isna().sum() > 0): + raise ValueError("Some targets are missing from the loss matrix") + + if any(pd.isna(targets_array)): + raise ValueError("Some targets are missing from the targets array") + + return loss_matrix, np.array(targets_array) diff --git a/policyengine_us_data/utils/soi.py b/policyengine_us_data/utils/soi.py new file mode 100644 index 0000000..af52638 --- /dev/null +++ b/policyengine_us_data/utils/soi.py @@ -0,0 +1,272 @@ +import pandas as pd +import numpy as np +from .uprating import create_policyengine_uprating_factors_table +from 
policyengine_us_data.data_storage import STORAGE_FOLDER + + +def pe_to_soi(pe_dataset, year): + from policyengine_us import Microsimulation + + pe_sim = Microsimulation(dataset=pe_dataset) + pe_sim.default_calculation_period = year + df = pd.DataFrame() + + pe = lambda variable: np.array( + pe_sim.calculate(variable, map_to="tax_unit") + ) + + df["adjusted_gross_income"] = pe("adjusted_gross_income") + df["exemption"] = pe("exemptions") + df["itemded"] = pe("itemized_taxable_income_deductions") + df["income_tax_after_credits"] = pe("income_tax") + df["total_income_tax"] = pe("income_tax_before_credits") + df["taxable_income"] = pe("taxable_income") + df["business_net_profits"] = pe("self_employment_income") * ( + pe("self_employment_income") > 0 + ) + df["business_net_losses"] = -pe("self_employment_income") * ( + pe("self_employment_income") < 0 + ) + df["capital_gains_distributions"] = pe("non_sch_d_capital_gains") + df["capital_gains_gross"] = pe("loss_limited_net_capital_gains") * ( + pe("loss_limited_net_capital_gains") > 0 + ) + df["capital_gains_losses"] = -pe("loss_limited_net_capital_gains") * ( + pe("loss_limited_net_capital_gains") < 0 + ) + df["estate_income"] = pe("estate_income") * (pe("estate_income") > 0) + df["estate_losses"] = -pe("estate_income") * (pe("estate_income") < 0) + df["exempt_interest"] = pe("tax_exempt_interest_income") + df["ira_distributions"] = pe("taxable_ira_distributions") + df["count_of_exemptions"] = pe("exemptions_count") + df["ordinary_dividends"] = pe("non_qualified_dividend_income") + pe( + "qualified_dividend_income" + ) + df["partnership_and_s_corp_income"] = pe("partnership_s_corp_income") * ( + pe("partnership_s_corp_income") > 0 + ) + df["partnership_and_s_corp_losses"] = -pe("partnership_s_corp_income") * ( + pe("partnership_s_corp_income") < 0 + ) + df["total_pension_income"] = pe("pension_income") + df["taxable_pension_income"] = pe("taxable_pension_income") + df["qualified_dividends"] = 
pe("qualified_dividend_income") + df["rent_and_royalty_net_income"] = pe("rental_income") * ( + pe("rental_income") > 0 + ) + df["rent_and_royalty_net_losses"] = -pe("rental_income") * ( + pe("rental_income") < 0 + ) + df["total_social_security"] = pe("social_security") + df["taxable_social_security"] = pe("taxable_social_security") + df["income_tax_before_credits"] = pe("income_tax_before_credits") + df["taxable_interest_income"] = pe("taxable_interest_income") + df["unemployment_compensation"] = pe("taxable_unemployment_compensation") + df["employment_income"] = pe("irs_employment_income") + df["qualified_business_income_deduction"] = pe( + "qualified_business_income_deduction" + ) + df["charitable_contributions_deduction"] = pe("charitable_deduction") + df["interest_paid_deductions"] = pe("interest_deduction") + df["medical_expense_deductions_uncapped"] = pe("medical_expense_deduction") + df["state_and_local_tax_deductions"] = pe("salt_deduction") + df["itemized_state_income_and_sales_tax_deductions"] = pe( + "state_and_local_sales_or_income_tax" + ) + df["itemized_real_estate_tax_deductions"] = pe("real_estate_taxes") + df["is_tax_filer"] = pe("tax_unit_is_filer") + df["count"] = 1 + + df["filing_status"] = pe("filing_status") + df["weight"] = pe("tax_unit_weight") + df["household_id"] = pe("household_id") + + return df + + +def puf_to_soi(puf, year): + df = pd.DataFrame() + + df["adjusted_gross_income"] = puf.E00100 + df["total_income_tax"] = puf.E06500 + df["employment_income"] = puf.E00200 + df["capital_gains_distributions"] = puf.E01100 + df["capital_gains_gross"] = puf["E01000"] * (puf["E01000"] > 0) + df["capital_gains_losses"] = -puf["E01000"] * (puf["E01000"] < 0) + df["estate_income"] = puf.E26390 + df["estate_losses"] = puf.E26400 + df["exempt_interest"] = puf.E00400 + df["ira_distributions"] = puf.E01400 + df["count_of_exemptions"] = puf.XTOT + df["ordinary_dividends"] = puf.E00600 + df["partnership_and_s_corp_income"] = puf.E26270 * (puf.E26270 > 0) 
+ df["partnership_and_s_corp_losses"] = -puf.E26270 * (puf.E26270 < 0) + df["total_pension_income"] = puf.E01500 + df["taxable_pension_income"] = puf.E01700 + df["qualified_dividends"] = puf.E00650 + df["rent_and_royalty_net_income"] = puf.E25850 + df["rent_and_royalty_net_losses"] = puf.E25860 + df["total_social_security"] = puf.E02400 + df["taxable_social_security"] = puf.E02500 + df["income_tax_before_credits"] = puf.E06500 + df["taxable_interest_income"] = puf.E00300 + df["unemployment_compensation"] = puf.E02300 + df["employment_income"] = puf.E00200 + df["charitable_contributions_deduction"] = puf.E19700 + df["interest_paid_deductions"] = puf.E19200 + df["medical_expense_deductions_uncapped"] = puf.E17500 + df["itemized_state_income_and_sales_tax_deductions"] = puf.E18400 + df["itemized_real_estate_tax_deductions"] = puf.E18500 + df["state_and_local_tax_deductions"] = puf.E18400 + puf.E18500 + df["income_tax_after_credits"] = puf.E08800 + df["business_net_profits"] = puf.E00900 * (puf.E00900 > 0) + df["business_net_losses"] = -puf.E00900 * (puf.E00900 < 0) + df["taxable_income"] = puf.E04800 + df["is_tax_filer"] = True + df["count"] = 1 + df["filing_status"] = puf.MARS.map( + { + 0: "SINGLE", # Assume the aggregate record is single + 1: "SINGLE", + 2: "JOINT", + 3: "SEPARATE", + 4: "HEAD_OF_HOUSEHOLD", + } + ) + + df["weight"] = puf["S006"] / 100 + + return df + + +def get_soi(year: int) -> pd.DataFrame: + uprating = create_policyengine_uprating_factors_table() + + uprating_map = { + "adjusted_gross_income": "adjusted_gross_income", + "count": "population", + "employment_income": "employment_income", + "business_net_profits": "self_employment_income", + "capital_gains_gross": "long_term_capital_gains", + "ordinary_dividends": "non_qualified_dividend_income", + "partnership_and_s_corp_income": "partnership_s_corp_income", + "qualified_dividends": "qualified_dividend_income", + "taxable_interest_income": "taxable_interest_income", + "total_pension_income": 
"pension_income", + "total_social_security": "social_security", + "business_net_losses": "self_employment_income", + "capital_gains_distributions": "long_term_capital_gains", + "capital_gains_losses": "long_term_capital_gains", + "estate_income": "estate_income", + "estate_losses": "estate_income", + "exempt_interest": "tax_exempt_interest_income", + "ira_distributions": "taxable_ira_distributions", + "partnership_and_s_corp_losses": "partnership_s_corp_income", + "rent_and_royalty_net_income": "rental_income", + "rent_and_royalty_net_losses": "rental_income", + "taxable_pension_income": "taxable_pension_income", + "taxable_social_security": "taxable_social_security", + "unemployment_compensation": "unemployment_compensation", + } + soi = pd.read_csv(STORAGE_FOLDER / "soi.csv") + soi = soi[soi.Year == soi.Year.max()] + + uprating_factors = {} + for variable in uprating_map: + pe_name = uprating_map.get(variable) + if pe_name in uprating.index: + uprating_factors[variable] = ( + uprating.loc[pe_name, year] + / uprating.loc[pe_name, soi.Year.max()] + ) + else: + uprating_factors[variable] = ( + uprating.loc["employment_income", year] + / uprating.loc["employment_income", soi.Year.max()] + ) + + for variable, uprating_factor in uprating_factors.items(): + soi.loc[soi.Variable == variable, "Value"] *= uprating_factor + + return soi + + +def compare_soi_replication_to_soi(df, soi): + variables = [] + filing_statuses = [] + agi_lower_bounds = [] + agi_upper_bounds = [] + counts = [] + taxables = [] + full_pops = [] + values = [] + soi_values = [] + + for i, row in soi.iterrows(): + if row.Variable not in df.columns: + continue + + subset = df[df.adjusted_gross_income >= row["AGI lower bound"]][ + df.adjusted_gross_income < row["AGI upper bound"] + ] + + variable = row["Variable"] + + fs = row["Filing status"] + if fs == "Single": + subset = subset[subset.filing_status == "SINGLE"] + elif fs == "Head of Household": + subset = subset[subset.filing_status == 
"HEAD_OF_HOUSEHOLD"] + elif fs == "Married Filing Jointly/Surviving Spouse": + subset = subset[subset.filing_status.isin(["JOINT", "WIDOW"])] + elif fs == "Married Filing Separately": + subset = subset[subset.filing_status == "SEPARATE"] + + if row["Taxable only"]: + subset = subset[subset.total_income_tax > 0] + else: + subset = subset[subset.is_tax_filer.values > 0] + + if row["Count"]: + value = subset[subset[variable] > 0].weight.sum() + else: + value = (subset[variable] * subset.weight).sum() + + variables.append(row["Variable"]) + filing_statuses.append(row["Filing status"]) + agi_lower_bounds.append(row["AGI lower bound"]) + agi_upper_bounds.append(row["AGI upper bound"]) + counts.append(row["Count"] or (row["Variable"] == "count")) + taxables.append(row["Taxable only"]) + full_pops.append(row["Full population"]) + values.append(value) + soi_values.append(row["Value"]) + + soi_replication = pd.DataFrame( + { + "Variable": variables, + "Filing status": filing_statuses, + "AGI lower bound": agi_lower_bounds, + "AGI upper bound": agi_upper_bounds, + "Count": counts, + "Taxable only": taxables, + "Full population": full_pops, + "Value": values, + "SOI Value": soi_values, + } + ) + + soi_replication["Error"] = ( + soi_replication["Value"] - soi_replication["SOI Value"] + ) + soi_replication["Absolute error"] = soi_replication["Error"].abs() + soi_replication["Relative error"] = ( + (soi_replication["Error"] / soi_replication["SOI Value"]) + .replace([np.inf, -np.inf], np.nan) + .fillna(0) + ) + soi_replication["Absolute relative error"] = soi_replication[ + "Relative error" + ].abs() + + return soi_replication diff --git a/policyengine_us_data/utils/uprating.py b/policyengine_us_data/utils/uprating.py new file mode 100644 index 0000000..05fd6a2 --- /dev/null +++ b/policyengine_us_data/utils/uprating.py @@ -0,0 +1,64 @@ +from policyengine_us_data.data_storage import STORAGE_FOLDER +import pandas as pd + +START_YEAR = 2020 +END_YEAR = 2034 + + +def 
create_policyengine_uprating_factors_table(): + from policyengine_us.system import system + + df = pd.DataFrame() + + variable_names = [] + years = [] + index_values = [] + + population_size = system.parameters.get_child( + "calibration.gov.census.populations.total" + ) + + for variable in system.variables.values(): + if variable.uprating is not None: + parameter = system.parameters.get_child(variable.uprating) + start_value = parameter(START_YEAR) + for year in range(START_YEAR, END_YEAR + 1): + population_growth = population_size(year) / population_size( + START_YEAR + ) + variable_names.append(variable.name) + years.append(year) + growth = parameter(year) / start_value + if "_weight" not in variable.name: + per_capita_growth = growth / population_growth + else: + per_capita_growth = growth + index_values.append(round(per_capita_growth, 3)) + + # Add population growth + + for year in range(START_YEAR, END_YEAR + 1): + variable_names.append("population") + years.append(year) + index_values.append( + round(population_size(year) / population_size(START_YEAR), 3) + ) + + df["Variable"] = variable_names + df["Year"] = years + df["Value"] = index_values + + # Convert to there is a column for each year + df = df.pivot(index="Variable", columns="Year", values="Value") + df = df.sort_values("Variable") + df.to_csv(STORAGE_FOLDER / "uprating_factors.csv") + + # Create a table with growth factors by year + + df_growth = df.copy() + for year in range(END_YEAR, START_YEAR, -1): + df_growth[year] = round(df_growth[year] / df_growth[year - 1] - 1, 3) + df_growth[START_YEAR] = 0 + + df_growth.to_csv(STORAGE_FOLDER / "uprating_growth_factors.csv") + return df diff --git a/setup.py b/setup.py index ce419ad..830aead 100644 --- a/setup.py +++ b/setup.py @@ -11,14 +11,19 @@ install_requires=[ "policyengine_core", "tables", + "survey_enhance", + "torch", + "requests", + "tqdm", + "tabulate", + "tables", ], extras_require={ "dev": [ "black", "pytest", - "tqdm", - "requests", - 
"policyengine_us", + "policyengine_us==1.69.0", + "streamlit", ], }, )