diff --git a/Makefile b/Makefile index 91c454e..9bf8088 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: all format test install download upload docker documentation data clean build +.PHONY: all format test install download upload docker documentation data clean build paper clean-paper all: data test @@ -49,3 +49,15 @@ build: publish: twine upload dist/* + +paper: paper/main.pdf + +paper/main.pdf: $(wildcard paper/sections/**/*.tex) $(wildcard paper/bibliography/*.bib) paper/main.tex paper/macros.tex + cd paper && \ + BIBINPUTS=./bibliography pdflatex main && \ + BIBINPUTS=./bibliography bibtex main && \ + pdflatex main && \ + pdflatex main + +clean-paper: + rm -f paper/*.aux paper/*.bbl paper/*.blg paper/*.log paper/*.out paper/*.toc paper/main.pdf paper/sections/**/*.aux diff --git a/README.md b/README.md index f46bc85..76e686a 100644 --- a/README.md +++ b/README.md @@ -1 +1,50 @@ # PolicyEngine US Data + +## Installation + +```bash +pip install policyengine-us-data +``` + +## Building the Paper + +### Prerequisites + +The paper requires a LaTeX distribution (e.g., TeXLive or MiKTeX) with the following packages: + +- graphicx (for figures) +- amsmath (for mathematical notation) +- natbib (for bibliography management) +- hyperref (for PDF links) +- booktabs (for tables) +- geometry (for page layout) +- microtype (for typography) +- xcolor (for colored links) + +On Ubuntu/Debian, you can install these with: + +```bash +sudo apt-get install texlive-latex-base texlive-latex-recommended texlive-latex-extra texlive-fonts-recommended +``` + +On macOS with Homebrew: + +```bash +brew install --cask mactex +``` + +### Building + +To build the paper: + +```bash +make paper +``` + +To clean LaTeX build files: + +```bash +make clean-paper +``` + +The output PDF will be at `paper/main.pdf`. diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29..6664d51 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Paper on methodology. diff --git a/paper/.gitignore b/paper/.gitignore new file mode 100644 index 0000000..b18fe6e --- /dev/null +++ b/paper/.gitignore @@ -0,0 +1,32 @@ +## Core latex/pdflatex auxiliary files: +*.aux +*.lof +*.log +*.lot +*.fls +*.out +*.toc +*.fmt +*.fot +*.cb +*.cb2 +.*.lb + +## Generated if empty string is given at "Please type another file name for output:" +.pdf + +## Bibliography auxiliary files (bibtex/biblatex/biber): +*.bbl +*.bcf +*.blg +*-blx.aux +*-blx.bib +*.run.xml + +## Build tool auxiliary files: +*.fdb_latexmk +*.synctex +*.synctex(busy) +*.synctex.gz +*.synctex.gz(busy) +*.pdfsync diff --git a/paper/bibliography/references.bib b/paper/bibliography/references.bib new file mode 100644 index 0000000..3414672 --- /dev/null +++ b/paper/bibliography/references.bib @@ -0,0 +1,186 @@ +@techreport{cbo2018, + title = {An Overview of CBO's Microsimulation Tax Model}, + author = {{Congressional Budget Office}}, + institution = {Congressional Budget Office}, + year = {2018}, + url = {https://www.cbo.gov/publication/54096} +} + +@techreport{jct2023, + title = {Overview of JCT Revenue Estimating Methods}, + author = {{Joint Committee on Taxation}}, + institution = {Joint Committee on Taxation}, + number = {JCX-48-23}, + year = {2023}, + url = {https://www.jct.gov/publications/2023/jcx-48-23/} +} + +@techreport{ota2012, + title = {Revenue Estimating Models at the U.S. Treasury Department}, + author = {{Office of Tax Analysis}}, + institution = {U.S. Department of the Treasury}, + number = {Technical Paper 12}, + year = {2012}, + url = {https://home.treasury.gov/system/files/131/TP-12.pdf} +} + +@article{saez2012, + title = {The Elasticity of Taxable Income with Respect to Marginal Tax Rates: A Critical Review}, + author = {Saez, Emmanuel and Slemrod, Joel and Giertz, Seth H}, + journal = {Journal of Economic Literature}, + volume = {50}, + number = {1}, + pages = {3--50}, + year = {2012} +} + +@misc{tpc2022, + title = {Brief Description of the Tax Model}, + author = {{Tax Policy Center}}, + year = {2022}, + url = {https://www.taxpolicycenter.org/resources/brief-description-tax-model}, + note = {Updated March 2022} +} + +@misc{itep2024, + title = {ITEP Tax Model Overview}, + author = {{Institute on Taxation and Economic Policy}}, + year = {2024}, + url = {https://itep.org/itep-tax-model/} +} + +@misc{tf2024, + title = {Overview of the Tax Foundation's Taxes and Growth Model}, + author = {{Tax Foundation}}, + year = {2024}, + url = {https://taxfoundation.org/research/all/federal/overview-tax-foundations-taxes-growth-model/} +} + +@misc{trim2024, + title = {TRIM3 Project Documentation: Transfer Income Model, Version 3}, + author = {{Urban Institute}}, + year = {2024}, + url = {https://boreas.urban.org/documentation/input/Concepts%20and%20Procedures/Modifications%20to%20the%20Underlying%20Surveys.php} +} + +@misc{attis2024, + title = {ATTIS Microsimulation Model}, + author = {{Urban Institute}}, + year = {2024}, + url = {https://www.urban.org/research-methods/attis-microsimulation-model} +} + +@misc{budgetlab2024, + title = {Tax Microsimulation at The Budget Lab}, + author = {{Budget Lab}}, + institution = {Yale University}, + year = {2024}, + url = {https://budgetlab.yale.edu/research/tax-microsimulation-budget-lab} +} + +@misc{psl2024, + title = {Tax-Data Documentation}, + author = {{Policy Simulation Library}}, + year = {2024}, + url = {https://github.com/PSLmodels/taxdata} +} + +@article{ohare2009, + title = {Statistical Matching Using the Current Population Survey as the Donor: Techniques and Issues}, + author = {O'Hare, William P}, + journal = {National Tax Journal}, + volume = {62}, + number = {3}, + pages = {519--537}, + year = {2009} +} + +@techreport{piketty2018, + title = {Distributional National Accounts: Methods and Estimates for the United States}, + author = {Piketty, Thomas and Saez, Emmanuel and Zucman, Gabriel}, + institution = {National Bureau of Economic Research}, + number = {w22945}, + year = {2018} +} + +@article{burkhauser2012, + title = {Recent Trends in Top Income Shares in the United States: Reconciling Estimates from March CPS and IRS Tax Return Data}, + author = {Burkhauser, Richard V and Feng, Shuaizhang and Jenkins, Stephen P and Larrimore, Jeff}, + journal = {Review of Economics and Statistics}, + volume = {94}, + number = {2}, + pages = {371--388}, + year = {2012} +} + +@article{auerbach2018, + title = {Macroeconomic Modeling of Tax Policy: A Comparison of Current Methodologies}, + author = {Auerbach, Alan J and Kotlikoff, Laurence J and Koehler, Darryl}, + journal = {National Tax Journal}, + volume = {71}, + number = {3}, + pages = {541--576}, + year = {2018} +} + +@techreport{bryant2023a, + title = {General Description Booklet for the 2015 Public Use Tax File}, + author = {Bryant, Victoria}, + institution = {Statistics of Income Division, Internal Revenue Service}, + year = {2023}, + month = {February}, + type = {Technical Documentation}, + url = {https://drive.google.com/file/d/1WoTU70GEjYMO0KHsHvTTH0NwCc-kN5cE/view} +} + +@techreport{bryant2023b, + title = {General Description Booklet for the 2015 Public Use Tax File Demographic File}, + author = {Bryant, Victoria}, + institution = {Statistics of Income Division, Internal Revenue Service}, + year = {2023}, + month = {February}, + type = {Technical Documentation}, + url = {https://drive.google.com/file/d/1WoTU70GEjYMO0KHsHvTTH0NwCc-kN5cE/view} +} + +@techreport{census2024, + title = {Current Population Survey, 2024 Annual Social and Economic (ASEC) Supplement}, + author = {{U.S. Census Bureau}}, + institution = {U.S. Census Bureau}, + year = {2024}, + url = {https://www2.census.gov/programs-surveys/cps/datasets/2024/march/asec2024_ddl_pub_full.pdf} +} + +@article{meinshausen2006quantile, + title = {Quantile regression forests}, + author = {Meinshausen, Nicolai and Ridgeway, Greg}, + journal = {Journal of machine learning research}, + volume = {7}, + number = {6}, + year = {2006} +} + +@misc{zillow2024quantile, + title = {quantile-forest: Scikit-learn compatible quantile regression forests}, + author = {{Zillow Group}}, + year = {2024}, + howpublished = {\url{https://zillow.github.io/quantile-forest/}} +} + +@article{pytorch2019, + title = {PyTorch: An Imperative Style, High-Performance Deep Learning Library}, + author = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and others}, + journal = {Advances in Neural Information Processing Systems}, + volume = {32}, + year = {2019} +} + +@techreport{woodruff2023survey, + title = {Surveying the (loss) landscape: using machine learning to improve household survey accuracy}, + author = {Woodruff, Nikhil}, + institution = {University of Durham}, + year = {2023}, + month = {April}, + note = {Demonstrates superiority of machine learning approaches over traditional methods for survey enhancement through comprehensive benchmarking}, + url = {https://github.com/policyengine/survey-enhance/blob/main/docs/paper/project_paper.pdf} +} diff --git a/paper/figures/data_flow.png b/paper/figures/data_flow.png new file mode 100644 index 0000000..133551d Binary files /dev/null and b/paper/figures/data_flow.png differ diff --git a/paper/figures/ecps_vs_cps_puf.png b/paper/figures/ecps_vs_cps_puf.png new file mode 100644 index 0000000..f1eec3a Binary files /dev/null and b/paper/figures/ecps_vs_cps_puf.png differ diff --git a/paper/macros.tex b/paper/macros.tex new file mode 100644 index 0000000..8c52fb7 --- /dev/null +++ b/paper/macros.tex @@ -0,0 +1,4 @@ +% Custom commands and mathematics macros +\newcommand{\policyengine}{\textsc{PolicyEngine}} +\newcommand{\cps}{\textsc{CPS}} +\newcommand{\puf}{\textsc{PUF}} diff --git a/paper/main.pdf b/paper/main.pdf new file mode 100644 index 0000000..f59ade1 Binary files /dev/null and b/paper/main.pdf differ diff --git a/paper/main.tex b/paper/main.tex new file mode 100644 index 0000000..2f9aed1 --- /dev/null +++ b/paper/main.tex @@ -0,0 +1,55 @@ +\documentclass[12pt]{article} + +\usepackage{graphicx} +\usepackage{amsmath} +\usepackage[round]{natbib} % Keep round option +\usepackage{hyperref} +\usepackage{booktabs} +\usepackage{geometry} +\usepackage{microtype} +\usepackage{xcolor} + +% Set citation style in preamble +\bibpunct{(}{)}{;}{a}{,}{,} % Move here +\setcitestyle{authoryear,round} % Move here + +\input{macros} + +\geometry{margin=1in} +\hypersetup{ + colorlinks=true, + linkcolor=blue, + filecolor=magenta, + urlcolor=blue, + citecolor=blue, +} + + +\title{Enhancing Survey Microdata with Administrative Records: \\ A Novel Approach to Microsimulation Dataset Construction} +% Define the \samethanks command +\newcommand*\samethanks[1][\value{footnote}]{\footnotemark[#1]} + +% Define authors with the same affiliation +\author{ + Nikhil Woodruff\thanks{PolicyEngine} \and + Max Ghenis\samethanks +} +\date{\today} + +\begin{document} + +\maketitle + +\input{sections/abstract} +\input{sections/introduction} +\input{sections/background} +\input{sections/data} +\input{sections/methodology} +\input{sections/results} +\input{sections/discussion} +\input{sections/conclusion} + +\bibliographystyle{plainnat} +\bibliography{./bibliography/references} + +\end{document} \ No newline at end of file diff --git a/paper/sections/abstract.tex b/paper/sections/abstract.tex new file mode 100644 index 0000000..2261f0a --- /dev/null +++ b/paper/sections/abstract.tex @@ -0,0 +1,3 @@ +\section*{Abstract} + +We combine the demographic detail of the Current Population Survey (CPS) with the tax precision of the IRS Public Use File (PUF) to create an enhanced microsimulation dataset. Our method uses quantile regression forests to transfer income and tax variables from the PUF to demographically-similar CPS households. We create a synthetic CPS-structured dataset using PUF tax information, stack it alongside the original CPS records, then use dropout-regularized gradient descent to reweight households toward administrative targets from IRS Statistics of Income, Census population estimates, and program participation data. This preserves the CPS's granular demographic and geographic information while leveraging the PUF's tax reporting accuracy. The enhanced dataset provides a foundation for analyzing federal tax policy, state tax systems, and benefit programs. We release both the enhanced dataset and our open-source enhancement procedure to support transparent policy analysis. \ No newline at end of file diff --git a/paper/sections/background.tex b/paper/sections/background.tex new file mode 100644 index 0000000..d7ea8a5 --- /dev/null +++ b/paper/sections/background.tex @@ -0,0 +1,144 @@ +\section{Background} + +Tax microsimulation models are essential tools for analyzing the distributional and revenue impacts of tax policy changes. By simulating individual tax units rather than relying on aggregate statistics, these models can capture the complex interactions between different provisions of the tax code and heterogeneous effects across the population. The core challenges these models face include: + +\begin{itemize} + \item Combining multiple data sources while preserving statistical validity + \item Aging historical data to represent current and future years + \item Imputing variables not observed in the source data + \item Modeling behavioral responses to policy changes + \item Calibrating results to match administrative totals +\end{itemize} + +Each existing model approaches these challenges differently, making tradeoffs between precision, comprehensiveness, and transparency. We build on their methods while introducing new techniques for data synthesis and uncertainty quantification. + +\subsection{Government Agency Models} + +The U.S. federal government maintains several microsimulation capabilities through its policy analysis agencies, which form the foundation for official policy analysis and revenue estimation. + +The Congressional Budget Office's model emphasizes behavioral responses and their macroeconomic effects \citep{cbo2018}. Their approach uses a two-stage estimation process: + +\begin{enumerate} + \item Static scoring: calculating mechanical revenue effects assuming no behavioral change + \item Dynamic scoring: incorporating behavioral responses calibrated to empirical literature +\end{enumerate} + +CBO's elasticity assumptions have evolved over time in response to new research, particularly regarding the elasticity of taxable income (ETI). Their current approach varies ETI by income level and type of tax change, broadly consistent with the academic consensus surveyed in \citep{saez2012}. The model also incorporates detailed projections of demographic change and economic growth from CBO's other forecasting models. + +The Joint Committee on Taxation employs a similar approach but with particular focus on conventional revenue estimates \citep{jct2023}. Their model maintains detailed imputations for: + +\begin{itemize} + \item Business income allocation between tax forms + \item Retirement account contributions and distributions + \item Asset basis and unrealized capital gains + \item International income and foreign tax credits +\end{itemize} + +A distinguishing feature is their treatment of tax expenditure interactions - addressing both mechanical overlap (e.g., between itemized deductions) and behavioral responses (e.g., between savings incentives). + +The Treasury's Office of Tax Analysis model features additional detail on corporate tax incidence and international provisions \citep{ota2012}. Their approach emphasizes the relationship between different types of tax instruments through a series of linked models: + +\begin{itemize} + \item Individual income tax model using matched administrative data + \item Corporate microsimulation using tax returns and financial statements + \item International tax model incorporating country-by-country reporting + \item Estate tax model with SCF-based wealth imputations +\end{itemize} + +This integration allows OTA to analyze proposals affecting multiple parts of the tax system consistently. + +\subsection{Research Institution Models} + +\subsubsection{Urban Institute Family of Models} + +The Urban Institute maintains several complementary microsimulation models, each emphasizing different aspects of tax and transfer policy analysis. + +The Urban-Brookings Tax Policy Center model \citep{tpc2022} combines the IRS Public Use File with Current Population Survey data through predictive mean matching, an approach similar to what we employ in Section~\ref{sec:methodology}. Their imputation strategy aims to preserve joint distributions across variables using regression-based techniques for: + +\begin{itemize} + \item Wealth holdings (18 asset and debt categories) + \item Education expenses (by level and institution type) + \item Consumption patterns (16 expenditure categories) + \item Health insurance status (plan type and premiums) + \item Retirement accounts (DB/DC split and contribution levels) +\end{itemize} + +TRIM3 emphasizes the time dimension of policy analysis, with sophisticated procedures for converting annual survey data into monthly variables \citep{trim2024}. Key innovations include: + +\begin{itemize} + \item Allocation of employment spells to specific weeks using BLS benchmarks + \item Probabilistic monthly assignment of benefit receipt + \item State-specific program rules and eligibility determination + \item Integration of administrative data for validation +\end{itemize} + +This monthly allocation approach informs our treatment of time variation in Section~\ref{sec:data}. + +The newer ATTIS model \citep{attis2024} focuses on interactions between tax and transfer programs. Building on the American Community Survey rather than the CPS provides better geographic detail at the cost of requiring additional tax variable imputations. Their approach to correcting for benefit underreporting in survey data parallels our methods in Section~\ref{sec:methodology}. + +\subsubsection{Other Research Institution Models} + +The Institute on Taxation and Economic Policy model \citep{itep2024} is unique in its comprehensive treatment of federal, state and local taxes. Key features include: + +\begin{itemize} + \item Integration of income, sales, and property tax microsimulation + \item Detailed state-specific tax calculators + \item Consumer expenditure imputations for indirect tax analysis + \item Race/ethnicity analysis through statistical matching +\end{itemize} + +The Tax Foundation's Taxes and Growth model \citep{tf2024} emphasizes macroeconomic feedback effects through a neoclassical growth framework. Their approach includes: + +\begin{itemize} + \item Production function based on CES technology + \item Endogenous labor supply responses + \item Investment responses to cost of capital + \item International capital flow effects +\end{itemize} + +\subsection{Open Source Initiatives} + +Recent years have seen growing interest in open source approaches that promote transparency and reproducibility in tax policy modeling. + +The Budget Lab at Yale \citep{budgetlab2024} maintains a fully open source federal tax model distinguished by: + +\begin{itemize} + \item Modular codebase with clear separation of concerns + \item Flexible behavioral response specification + \item Comprehensive test suite and documentation + \item Version control and continuous integration +\end{itemize} + +Their approach to code organization and testing informs our own development practices. + +The Policy Simulation Library's Tax-Data project \citep{psl2024} provides building blocks for tax microsimulation including: + +\begin{itemize} + \item Data processing and cleaning routines + \item Statistical matching algorithms + \item Variable imputation methods + \item Growth factor calculation + \item Validation frameworks +\end{itemize} + +We build directly on several Tax-Data components while introducing new methods for synthesis and uncertainty quantification described in Section~\ref{sec:methodology}. + +\subsection{Key Methodological Challenges} + +This review of existing models highlights several common methodological challenges that our approach aims to address: + +\begin{enumerate} + \item \textbf{Data Limitations}: Each primary data source (tax returns, surveys) has significant limitations. Tax returns lack demographic detail; surveys underreport income and benefits. While existing models use various matching techniques to combine sources, maintaining consistent joint distributions remains difficult. + + \item \textbf{Aging and Extrapolation}: Forward projection requires both technical adjustments (e.g., inflation indexing) and assumptions about behavioral and demographic change. Current approaches range from simple factor adjustment to complex forecasting models. + + \item \textbf{Behavioral Response}: Models must balance tractability with realism in specifying how taxpayers respond to policy changes. Key challenges include heterogeneous elasticities, extensive margin responses, and general equilibrium effects. + + \item \textbf{Uncertainty Quantification}: Most models provide point estimates without formal measures of uncertainty from parameter estimates, data quality, or specification choices. +\end{enumerate} + +Our methodology, detailed in Section~\ref{sec:methodology}, introduces novel approaches to these challenges while building on existing techniques that have proven successful. We particularly focus on quantifying and communicating uncertainty throughout the modeling process. + +\subsubsection{Empirical Evaluation of Enhancement Methods} + +Recent work has systematically compared different approaches to survey enhancement. \citet{woodruff2023survey} evaluated traditional techniques like percentile matching against machine learning methods including gradient descent reweighting and synthetic data generation. Their results showed ML-based approaches substantially outperforming conventional methods, with combined synthetic data and reweighting reducing error by 88\% compared to baseline surveys. Importantly, their cross-validation analysis demonstrated these improvements generalized to out-of-sample targets, suggesting the methods avoid overfitting to specific statistical measures. This empirical evidence informs our methodological choices, particularly around combining multiple enhancement techniques. \ No newline at end of file diff --git a/paper/sections/conclusion.tex b/paper/sections/conclusion.tex new file mode 100644 index 0000000..b5ca65e --- /dev/null +++ b/paper/sections/conclusion.tex @@ -0,0 +1,9 @@ +\section{Conclusion} + +This paper presents a novel approach to constructing enhanced microdata for tax-benefit microsimulation by combining survey and administrative data sources. Our methodology leverages machine learning techniques – specifically quantile regression forests and gradient descent optimization – to preserve the strengths of each source while mitigating their weaknesses. The resulting dataset outperforms both the Current Population Survey and IRS Public Use File across a majority of validation targets, with particularly strong improvements in areas crucial for policy analysis such as income distributions and program participation rates. + +The enhanced dataset addresses a key challenge in tax-benefit microsimulation: the need for both detailed demographic information and accurate tax/income data. By maintaining the CPS's rich household structure while incorporating the PUF's tax precision, our approach enables more reliable analysis of policies that depend on both demographic characteristics and economic circumstances. The systematic validation against hundreds of administrative targets provides confidence in the dataset's reliability while helping users understand its limitations. + +Our open-source implementation and automatically updated validation metrics establish a new standard for transparency in microsimulation data enhancement. This enables other researchers to build upon our work, adapt the methodology to other jurisdictions, or extend it to incorporate additional data sources. Future work could expand the approach to finer geographic levels, integrate data from additional surveys, or apply similar techniques to other domains requiring the combination of survey and administrative data. + +The enhanced CPS represents a significant advance in the quality of openly available microdata for tax-benefit analysis. By reducing error rates across a broad range of metrics while preserving essential relationships in the data, it provides a more reliable foundation for understanding the impacts of complex policy reforms on American households. \ No newline at end of file diff --git a/paper/sections/data.tex b/paper/sections/data.tex new file mode 100644 index 0000000..9fc4580 --- /dev/null +++ b/paper/sections/data.tex @@ -0,0 +1,105 @@ +\section{Data}\label{sec:data} + +\subsection{Current Population Survey} + +The Census Bureau administers the Current Population Survey Annual Social and Economic Supplement (CPS ASEC, or hereafter the CPS) each March. In March 2024, they surveyed 89,473 households representing the U.S. civilian non-institutional population about their activities in the 2023 calendar year. + +The CPS's key strengths include: +\begin{itemize} + \item Rich demographic detail including age, sex, race, ethnicity, and education + \item Complete household relationship matrices + \item Program participation indicators + \item State identifiers, and partial county identifiers +\end{itemize} + +However, the CPS has known limitations for tax modeling: +\begin{itemize} + \item Underreporting of income, particularly at the top of the distribution due to top-coding + \item Limited tax-relevant information (e.g., itemized deductions) + \item No direct observation of tax units within households + \item Imprecise measurement of certain income types (e.g., capital gains) +\end{itemize} + +\subsection{IRS Public Use File} + +The Internal Revenue Service Public Use File (PUF) is a national sample of individual income tax returns, representing the 151.2 million Form 1040, Form 1040A, and Form 1040EZ Federal Individual Income Tax Returns filed for Tax Year 2015. The file contains 119,675 records sampled at varying rates across strata, with 0.07 percent sampling for strata 7 through 13 \citep{bryant2023b}. The data are extensively transformed to protect taxpayer privacy while preserving statistical properties. + +The Public Use Tax Demographic File supplements the PUF with: +\begin{itemize} + \item Age ranges for primary taxpayers (different ranges for dependent vs non-dependent filers) + \item Dependent age information in six categories (under 5, 5-13, 13-17, 17-19, 19-24, 24+) + \item Gender of primary taxpayer + \item Earnings splits for joint filers (categorizing primary earner share) +\end{itemize} + +Key disclosure protections include: +\begin{itemize} + \item Demographic information limited to returns in strata 7-13 + \item Suppression of dependent ages for returns with farm income or homebuyer credits + \item Minimum population thresholds for dependent age reporting + \item Sequential limits on dependent counts by filing status +\end{itemize} + +The PUF's key strengths include: +\begin{itemize} + \item Precise income amounts derived from information returns + \item Complete tax return information including itemized deductions + \item Actual tax unit structure + \item Accurate income type classification +\end{itemize} + +The PUF's limitations include: +\begin{itemize} + \item Limited demographic information + \item No household structure beyond the tax unit + \item No geographic information such as state + \item No program participation information + \item Privacy protections that mask extreme values + \item Lag; the latest version as of November 2024 is for the 2015 tax year +\end{itemize} + +\subsection{External Validation Sources} + +We validate our enhanced dataset against 570 targets from several external sources: + +\subsubsection{IRS Statistics of Income} + +The Statistics of Income (SOI) Division publishes detailed tabulations of tax return data, including: +\begin{itemize} + \item Income amounts by source and adjusted gross income bracket + \item Number of returns by filing status + \item Itemized deduction amounts and counts + \item Tax credits and their distribution +\end{itemize} + +These tabulations serve as key targets in our reweighting procedure and validation metrics. + +\subsubsection{CPS ASEC Public Tables} + +Census Bureau publications provide demographic and program participation benchmarks, including: +\begin{itemize} + \item Age distribution by state + \item Household size distribution + \item Program participation rates +\end{itemize} + +\subsubsection{Administrative Program Totals} + +We incorporate official totals from various agencies, including but not limited to: +\begin{itemize} + \item Social Security Administration beneficiary counts and benefit amounts + \item SNAP participation and benefits from USDA + \item Earned Income Tax Credit statistics from IRS + \item Unemployment Insurance claims and benefits from Department of Labor +\end{itemize} + +\subsection{Variable Harmonization} + +A crucial preparatory step is harmonizing variables across datasets. We develop a detailed crosswalk between CPS and PUF variables, accounting for definitional differences. Key considerations include: +\begin{itemize} + \item Income classification (e.g., business vs. wage income) + \item Geographic definitions + \item Family relationship categories +\end{itemize} + +For some variables, direct correspondence is impossible, requiring imputation strategies described in the methodology section. The complete variable crosswalk is available in our open-source repository. \ No newline at end of file diff --git a/paper/sections/discussion.tex b/paper/sections/discussion.tex new file mode 100644 index 0000000..4eb9ae7 --- /dev/null +++ b/paper/sections/discussion.tex @@ -0,0 +1,29 @@ +\section{Discussion} + +This paper introduces a novel approach to constructing an enhanced microsimulation dataset by integrating survey and administrative data sources. Our methodology, which combines quantile regression forests (QRF) and dropout-regularized gradient descent reweighting, demonstrates substantial improvements in accurately capturing both demographic and tax-related variables. In this section, we discuss the strengths, limitations, potential applications, and future directions of this approach. + +\subsection{Strengths of the Enhanced Dataset} + +The enhanced dataset achieves a unique balance between demographic detail and tax precision, addressing a long-standing gap in microsimulation modeling. The use of QRF allows for more accurate transfer of income and tax distributions from the IRS Public Use File (PUF) to the Current Population Survey (CPS), preserving complex variable relationships that are critical for policy analysis. Additionally, the dropout-regularized gradient descent reweighting effectively calibrates household weights to align with administrative benchmarks, reducing error rates across a broad range of demographic and economic metrics. + +Our validation results show that the enhanced CPS (ECPS) improves on both source datasets, particularly in tax-related variables that are essential for analyzing income distributions and program participation. By providing a publicly available, open-source dataset with extensive validation against external benchmarks, we support more transparent and reliable policy analysis. + +\subsection{Limitations and Potential Biases} + +Despite these strengths, the enhanced dataset has limitations that merit careful consideration. One key challenge lies in maintaining consistency in relationships across diverse variables, especially in cases where nonlinear or unexpected correlations exist. Although QRF is well-suited for capturing non-linear relationships, biases may still arise due to assumptions made during variable imputation. + +A second limitation is the reliance on older IRS data, which may not fully capture recent demographic and economic shifts. While our reweighting procedure attempts to mitigate this through adjustment to more current administrative targets, future iterations could benefit from updated IRS data or alternative administrative sources that better reflect the contemporary population. + +Further, our approach may introduce biases when aligning household records with administrative targets. These biases can impact analyses that depend heavily on small demographic subgroups or specific income brackets. Future improvements could involve fine-tuning the reweighting process to minimize potential overfitting in cases where data are sparse. + +\subsection{Applications of the Enhanced Dataset} + +The enhanced CPS dataset expands the scope and accuracy of microsimulation analyses in several policy domains. By combining the CPS's household structure with the PUF's tax precision, this dataset is well-suited for both federal and state-level tax analysis, particularly in modeling income-based benefits and tax credits. Researchers and policymakers could leverage this dataset to evaluate the distributional impacts of various tax reforms, analyze the implications of benefit programs across income levels, and assess policy proposals that rely on a precise understanding of income and demographic characteristics. + +Additional applications extend to labor market studies, health policy analysis, and state-specific program evaluations. With further adaptation, the methodology could also support microsimulation in international contexts, providing a flexible tool for policy modeling across diverse regions and socioeconomic conditions. + +\subsection{Future Directions} + +Building on the success of this methodology, future work could aim to expand the dataset's geographic granularity and incorporate additional data sources. Integrating state-specific datasets or additional federal data on healthcare and education would further enrich the dataset's utility for policy analysis. Moreover, the dataset could benefit from continued refinement of its reweighting procedure, including the use of ensemble methods to capture a broader range of variable interactions. + +Another promising direction involves the development of interactive tools that allow researchers and policymakers to explore the dataset in real time, enhancing transparency and accessibility. By providing both the enhanced dataset and the codebase as open-source resources, we establish a foundation for collaborative improvement and iterative updates that respond to changing policy needs and data availability. diff --git a/paper/sections/introduction.tex b/paper/sections/introduction.tex new file mode 100644 index 0000000..b3c66be --- /dev/null +++ b/paper/sections/introduction.tex @@ -0,0 +1,21 @@ +\section{Introduction} + +Microsimulation models are essential tools for analyzing the distributional impacts of tax and transfer policies. These models require microdata that accurately represent both the demographic composition of a population and their economic circumstances, particularly their tax situations. However, available data sources typically excel in one dimension while falling short in another. + +The Current Population Survey (CPS), conducted by the U.S. Census Bureau, provides rich demographic detail and household relationships but suffers from underreporting of income and lacks tax information. Conversely, the Internal Revenue Service's Public Use File (PUF) offers precise tax data but contains limited demographic information and obscures household structure. This tradeoff between demographic detail and tax precision poses a significant challenge for policy analysis. + +This paper presents a novel approach to combining these complementary data sources. We develop a methodology that preserves the demographic richness of the CPS while incorporating the tax precision of the PUF, creating an enhanced dataset that serves as the foundation for PolicyEngine's microsimulation capabilities. Our approach differs from previous efforts in three key ways: + +First, we employ quantile regression forests to transfer distributions rather than point estimates between datasets, preserving the complex relationships between variables. Second, we maintain household structure throughout the enhancement process, ensuring that family relationships crucial for benefit calculations remain intact. Third, we implement a sophisticated reweighting procedure that simultaneously matches dozens of demographic and economic targets while avoiding overfitting through a dropout-enhanced gradient descent approach. + +The resulting dataset demonstrates superior performance in both tax and transfer policy simulation. When compared to administrative totals, our enhanced dataset reduces discrepancies in key tax components by an average of 40\% relative to the baseline CPS, while maintaining or improving the accuracy of demographic and program participation variables. + +The remainder of this paper is organized as follows: Section 2 reviews related work in survey enhancement and microsimulation data construction. Section 3 describes our data sources and their characteristics. Section 4 presents our methodology in detail. Section 5 validates our results against external benchmarks. Section 6 discusses implications and limitations, and Section 7 concludes. + +Our contributions include: +\begin{itemize} + \item A novel methodology for combining survey and administrative data while preserving distributional relationships + \item An open-source implementation that can be adapted for other jurisdictions and policy models + \item A validation framework comparing enhanced estimates against multiple external benchmarks + \item A new, publicly available microdata file suitable for US tax and benefit policy analysis +\end{itemize} \ No newline at end of file diff --git a/paper/sections/methodology.tex b/paper/sections/methodology.tex new file mode 100644 index 0000000..9ed4ad0 --- /dev/null +++ b/paper/sections/methodology.tex @@ -0,0 +1,17 @@ +\section{Methodology}\label{sec:methodology} + +\begin{figure}[h] + \centering + \includegraphics[width=\textwidth]{figures/data_flow.png} + \caption{Data flow diagram for integrating CPS and PUF microdata. The process ages both datasets to a common year, integrates demographic and income information through quantile regression forests, and optimizes household weights using gradient descent.} + \label{fig:data_flow} +\end{figure} + +\input{sections/methodology/overview} +\input{sections/methodology/demographic_variables} +\input{sections/methodology/puf_preprocessing} +\input{sections/methodology/aging} +\input{sections/methodology/quantile_forests} +\input{sections/methodology/loss_matrix} +\input{sections/methodology/reweighting} +\input{sections/methodology/pipeline} \ No newline at end of file diff --git a/paper/sections/methodology/aging.tex b/paper/sections/methodology/aging.tex new file mode 100644 index 0000000..ef70cfc --- /dev/null +++ b/paper/sections/methodology/aging.tex @@ -0,0 +1,49 @@ +\subsection{Data Aging and Indexing} + +The process of projecting historical microdata involves both demographic aging and economic indexing based on US government forecasts. Our aging process occurs in two stages: first to reach our baseline year (2024), and then to project the calibrated dataset forward. + +\subsubsection{Growth Factor Construction} + +For each variable in the tax-benefit system with a specified growth parameter, we compute change factors from the base year through 2034: + +\[ \text{Index Factor}_{t} = \frac{\text{Index}_{t}}{\text{Index}_{\text{base}}} \] + +\subsubsection{Population Adjustment} + +Most economic variables are adjusted for changes in total population: + +\[ \text{Per Capita Factor}_{t} = \frac{\text{Index Factor}_{t}}{\text{Population Growth}_{t}} \] + +Exceptions include: +\begin{itemize} + \item Weight variables maintain raw growth + \item Population itself uses Census projections directly +\end{itemize} + +\subsubsection{Data Sources} + +Projection factors come from: +\begin{itemize} + \item Congressional Budget Office economic projections + \item Census Bureau population estimates + \item Social Security Administration wage index forecasts + \item Treasury tax parameter indexing +\end{itemize} + +\subsubsection{Initial Aging Implementation} + +For any variable y, the projected value to reach our baseline year is computed as: + +\[ y_{2024} = y_{2023} \cdot \frac{f(2024)}{f(2023)} \] + +where f(t) represents the index factor for time t. + +\subsubsection{Forward Projection} + +After constructing and calibrating the enhanced 2024 dataset, we project it to future years using the same indexing framework. This maintains the dataset's enhanced distributional properties while reflecting: + +\begin{itemize} + \item Economic growth forecasts for monetary variables + \item Statutory adjustments to program parameters + \item Population projections applied to household weights +\end{itemize} \ No newline at end of file diff --git a/paper/sections/methodology/demographic_variables.tex b/paper/sections/methodology/demographic_variables.tex new file mode 100644 index 0000000..9d3c637 --- /dev/null +++ b/paper/sections/methodology/demographic_variables.tex @@ -0,0 +1,76 @@ +\subsection{Demographic Variable Construction} + +Following the IRS specifications for the Public Use File, we construct three key demographic variables: dependent ages, primary taxpayer age ranges, and earnings splits between spouses. + +\subsection{Dependent Ages} + +For each dependent, we construct age categories following IRS constraints: +\begin{itemize} + \item Under 5 + \item 5 under 13 + \item 13 under 17 + \item 17 under 19 + \item 19 under 24 + \item 24 or older +\end{itemize} + +The number of dependents is limited by filing status: +\begin{itemize} + \item Up to 3 dependents for joint returns and head of household returns + \item Up to 2 dependents for single returns + \item Up to 1 dependent for married filing separately returns +\end{itemize} + +Dependents are ordered sequentially by type: +\begin{enumerate} + \item Children living at home + \item Children living away from home + \item Other dependents + \item Parents +\end{enumerate} + +\subsubsection{Primary Taxpayer Age} + +Age ranges are constructed differently for dependent and non-dependent returns: + +For non-dependent returns: +\begin{itemize} + \item Under 26 + \item 26 under 35 + \item 35 under 45 + \item 45 under 55 + \item 55 under 65 + \item 65 or older +\end{itemize} + +For dependent returns: +\begin{itemize} + \item Under 18 + \item 18 under 26 + \item 26 or older +\end{itemize} + +\subsubsection{Earnings Splits} + +For joint returns, we calculate the primary earner's share of total earnings: + +\[ \text{Primary Share} = \frac{\text{Primary Wages} + \text{Primary SE Income}}{\text{Total Wages} + \text{Total SE Income}} \] + +Where: +\begin{itemize} + \item Primary wages and SE income = E30400 - E30500 + \item Secondary wages and SE income = E30500 +\end{itemize} + +This share is categorized into: +\begin{itemize} + \item 75 percent or more earned by primary + \item Less than 75 percent but more than 25 percent earned by primary + \item Less than 25 percent earned by primary +\end{itemize} + +\subsubsection{Implementation Details} + +When decoding age ranges into specific ages, we use random assignment within the range to avoid unrealistic bunching. For example, when the PUF indicates age 80, we randomly assign an age between 80 and 84. + +The ordering of dependents is preserved when constructing synthetic tax units to maintain consistency with the original data structure. \ No newline at end of file diff --git a/paper/sections/methodology/loss_matrix.tex b/paper/sections/methodology/loss_matrix.tex new file mode 100644 index 0000000..677ad71 --- /dev/null +++ b/paper/sections/methodology/loss_matrix.tex @@ -0,0 +1,128 @@ +\subsection{Loss Matrix Construction} + +The loss matrix measures deviation from 570 administrative targets sourced from IRS Statistics of Income (SOI), Census population estimates, CBO projections, and other administrative data. + +\subsubsection{IRS Statistics of Income Targets} + +For each combination of AGI bracket and filing status, we create targets for: + +\begin{itemize} + \item Adjusted gross income + \item Count of returns + \item Employment income + \item Business net profits + \item Capital gains (gross) + \item Ordinary dividends + \item Partnership and S-corporation income + \item Qualified dividends + \item Taxable interest income + \item Total pension income + \item Total social security +\end{itemize} + +For aggregate-level targets only, we track: +\begin{itemize} + \item Business net losses + \item Capital gains distributions + \item Capital gains losses + \item Estate income and losses + \item Exempt interest + \item IRA distributions + \item Partnership and S-corporation losses + \item Rent and royalty net income and losses + \item Taxable pension income + \item Taxable social security + \item Unemployment compensation +\end{itemize} + +\subsubsection{Census Population Targets} + +From Census population projections (np2023\_d5\_mid.csv), we include: +\begin{itemize} + \item Single-year age population counts from age 0 to 85 + \item Filtered to total population (SEX = 0, RACE\_HISP = 0) + \item Projected to the target year +\end{itemize} + +\subsubsection{CBO Program Totals} + +From CBO projections, we calibrate: +\begin{itemize} + \item Income tax + \item SNAP benefits + \item Social security benefits + \item SSI payments + \item Unemployment compensation +\end{itemize} + +\subsubsection{EITC Statistics} + +From Treasury EITC data (eitc.csv), we target: +\begin{itemize} + \item EITC recipient counts by number of qualifying children + \item Total EITC amounts by number of qualifying children +\end{itemize} + +The EITC values are uprated by: +\begin{itemize} + \item EITC spending growth for amounts + \item Population growth for recipient counts +\end{itemize} + +\subsubsection{CPS-Derived Statistics} + +We calibrate to hardcoded totals for: +\begin{itemize} + \item Health insurance premiums without Medicare Part B: \$385B + \item Other medical expenses: \$278B + \item Medicare Part B premiums: \$112B + \item Over-the-counter health expenses: \$72B + \item SPM unit thresholds sum: \$3,945B + \item Child support expense: \$33B + \item Child support received: \$33B + \item SPM unit capped work childcare expenses: \$348B + \item SPM unit capped housing subsidy: \$35B + \item TANF: \$9B + \item Alimony income: \$13B + \item Alimony expense: \$13B + \item Real estate taxes: \$400B + \item Rent: \$735B +\end{itemize} + +\subsubsection{Market Income Targets} + +From IRS SOI PUF estimates: +\begin{itemize} + \item Total negative household market income: -\$138B + \item Count of households with negative market income: 3M +\end{itemize} + +\subsubsection{Healthcare Spending by Age} + +Using healthcare\_spending.csv, we target healthcare expenditures by: +\begin{itemize} + \item 10-year age groups + \item Four expense categories: + \begin{itemize} + \item Health insurance premiums without Medicare Part B + \item Over-the-counter health expenses + \item Other medical expenses + \item Medicare Part B premiums + \end{itemize} +\end{itemize} + +\subsubsection{AGI by SPM Threshold} + +From spm\_threshold\_agi.csv, we target: +\begin{itemize} + \item Adjusted gross income totals by SPM threshold decile + \item Count of households in each SPM threshold decile +\end{itemize} + +\subsubsection{Target Validation} + +The loss matrix construction enforces two key checks: +\begin{itemize} + \item No missing values in any target row + \item No NaN values in the targets array +\end{itemize} \ No newline at end of file diff --git a/paper/sections/methodology/overview.tex b/paper/sections/methodology/overview.tex new file mode 100644 index 0000000..da293ae --- /dev/null +++ b/paper/sections/methodology/overview.tex @@ -0,0 +1,13 @@ +\subsection{Overview} + +Our approach enhances the Current Population Survey (CPS) with information from the IRS Public Use File (PUF) through a multi-stage process. This design is motivated by empirical evidence from \citet{woodruff2023survey} showing that combining synthetic data generation with weight optimization achieves substantially better results than either technique alone or traditional enhancement methods. Their comprehensive benchmarking demonstrated an 88\% reduction in survey error through this combined approach, with improvements that generalized across multiple validation metrics. + +\begin{enumerate} + \item Train quantile regression forests on PUF tax records to learn distributions of tax-related variables + \item Generate a synthetic dataset that combines PUF tax precision with CPS-like demographic detail + \item Stack these synthetic records alongside the original CPS records + \item Run the PolicyEngine US tax-benefit model on the stacked dataset to generate tax and benefit amounts + \item Optimize household weights to match administrative benchmarks while determining the optimal mix of original and synthetic records +\end{enumerate} + +This method preserves the CPS's demographic richness and household relationships while incorporating the PUF's precise tax information. Each component is detailed in the following sections. \ No newline at end of file diff --git a/paper/sections/methodology/pipeline.tex b/paper/sections/methodology/pipeline.tex new file mode 100644 index 0000000..6abf9b0 --- /dev/null +++ b/paper/sections/methodology/pipeline.tex @@ -0,0 +1,48 @@ +\subsection{Complete Enhancement Pipeline} + +The full dataset enhancement process proceeds through the following sequential steps, each documented in the preceding sections: + +\subsubsection{Initial Data Loading} + +\begin{enumerate} + \item Load CPS ASEC public use file + \item Load IRS public use file (if before 2021) + \item Load external validation sources (SOI, Census data) +\end{enumerate} + +\subsubsection{PUF Processing (Pre-2021)} + +\begin{enumerate} + \item Apply variable renaming and recoding + \item Construct derived variables + \item Split income components + \item Remove aggregate records + \item Generate synthetic demographic variables +\end{enumerate} + +\subsubsection{CPS Enhancement} + +\begin{enumerate} + \item Project both datasets to target year using uprating factors + \item Train quantile regression forests on PUF records + \item Generate synthetic CPS-structured records + \item Stack synthetic records with original CPS +\end{enumerate} + +\subsubsection{Final Calibration} + +\begin{enumerate} + \item Construct loss matrix from administrative targets + \item Initialize weights for original and synthetic records + \item Optimize weights using gradient descent with dropout + \item Validate results against external benchmarks +\end{enumerate} + +Each step is implemented in separate modules: +\begin{itemize} + \item PUF processing in datasets/puf/puf.py + \item Uprating in utils/uprating.py + \item QRF models in utils/qrf.py + \item Loss matrix in utils/loss.py + \item Weight optimization in datasets/cps/enhanced\_cps.py +\end{itemize} \ No newline at end of file diff --git a/paper/sections/methodology/puf_preprocessing.tex b/paper/sections/methodology/puf_preprocessing.tex new file mode 100644 index 0000000..760d8fc --- /dev/null +++ b/paper/sections/methodology/puf_preprocessing.tex @@ -0,0 +1,95 @@ +\subsection{PUF Data Preprocessing} + +The preprocessing of the IRS Public Use File involves variable renaming, recoding, and construction of derived variables to align with PolicyEngine's analytical framework. + +\subsubsection{Medical Expense Categories} + +Total medical expenses are decomposed into specific categories using fixed ratios derived from external data: + +\begin{itemize} + \item Health insurance premiums without Medicare Part B: 45.3\% + \item Other medical expenses: 32.5\% + \item Medicare Part B premiums: 13.7\% + \item Over-the-counter health expenses: 8.5\% +\end{itemize} + +\subsubsection{Variable Construction} + +Key derived variables include: + +\paragraph{Qualified Business Income (QBI)} +Calculated as the maximum of zero and the sum of: +\begin{itemize} + \item Schedule E income (E00900) + \item Partnership and S-corporation income (E26270) + \item Farm income (E02100) + \item Rental income (E27200) +\end{itemize} + +W2 wages from qualified business are then computed as 16\% of QBI. + +\paragraph{Filing Status} +Mapped from MARS codes: +\begin{itemize} + \item 1 $\rightarrow$ SINGLE + \item 2 $\rightarrow$ JOINT + \item 3 $\rightarrow$ SEPARATE + \item 4 $\rightarrow$ HEAD\_OF\_HOUSEHOLD +\end{itemize} + +Records with MARS = 0 (aggregate records) are excluded. + +\subsubsection{Income Component Separation} + +Several income sources are separated into positive and negative components: + +\begin{itemize} + \item Business income split into net profits (positive) and losses (negative) + \item Capital gains split into gross gains and losses + \item Partnership and S-corporation income split into income and losses + \item Rental income split into net income and losses +\end{itemize} + +\subsubsection{Variable Renaming} + +The following PUF variables are renamed to align with PolicyEngine conventions: + +\paragraph{Direct Renames} +\begin{itemize} + \item E03500 $\rightarrow$ alimony\_expense + \item E00800 $\rightarrow$ alimony\_income + \item E20500 $\rightarrow$ casualty\_loss + \item E32800 $\rightarrow$ cdcc\_relevant\_expenses + \item E19800 $\rightarrow$ charitable\_cash\_donations + \item E20100 $\rightarrow$ charitable\_non\_cash\_donations + \item E03240 $\rightarrow$ domestic\_production\_ald + \item E03400 $\rightarrow$ early\_withdrawal\_penalty + \item E03220 $\rightarrow$ educator\_expense + \item E00200 $\rightarrow$ employment\_income + \item E26390 - E26400 $\rightarrow$ estate\_income + \item T27800 $\rightarrow$ farm\_income + \item E27200 $\rightarrow$ farm\_rent\_income + \item E03290 $\rightarrow$ health\_savings\_account\_ald + \item E19200 $\rightarrow$ interest\_deduction + \item P23250 $\rightarrow$ long\_term\_capital\_gains + \item E24518 $\rightarrow$ long\_term\_capital\_gains\_on\_collectibles + \item E20400 $\rightarrow$ misc\_deduction + \item E00600 - E00650 $\rightarrow$ non\_qualified\_dividend\_income + \item E00650 $\rightarrow$ qualified\_dividend\_income + \item E03230 $\rightarrow$ qualified\_tuition\_expenses + \item E18500 $\rightarrow$ real\_estate\_taxes +\end{itemize} + +\paragraph{Weight Adjustment} +S006 weights are divided by 100 to convert to population units. + +\subsubsection{Data Cleaning} + +The preprocessing includes: +\begin{itemize} + \item Removal of aggregate records (MARS = 0) + \item Missing value imputation with zeros + \item Construction of unique household identifiers from RECID + \item Assignment of household weights from S006 + \item Extraction of exemption counts from XTOT +\end{itemize} \ No newline at end of file diff --git a/paper/sections/methodology/quantile_forests.tex b/paper/sections/methodology/quantile_forests.tex new file mode 100644 index 0000000..8080a40 --- /dev/null +++ b/paper/sections/methodology/quantile_forests.tex @@ -0,0 +1,23 @@ +\subsection{Quantile Regression Forests} + +Our implementation uses quantile regression forests (QRF) \citep{meinshausen2006quantile}, which extend random forests to estimate conditional quantiles. Building on \citet{woodruff2023survey}, we use the quantile-forest package \citep{zillow2024quantile}, a scikit-learn compatible implementation that provides efficient, Cython-optimized estimation of arbitrary quantiles at prediction time without retraining. + +QRF works by generating an ensemble of regression trees, where each tree recursively partitions the feature space. Unlike standard random forests that only store mean values in leaf nodes, QRF maintains the full empirical distribution of training observations in each leaf. To estimate conditional quantiles, the model identifies relevant leaf nodes for new observations, aggregates the weighted empirical distributions across all trees, and computes the desired quantiles from the combined distribution. + +The key advantages over traditional quantile regression include QRF's ability to capture non-linear relationships without explicit specification, model heteroscedastic variance across the feature space, estimate any quantile without retraining, and maintain the computational efficiency of random forests. + +\subsubsection{PUF Integration: Synthetic Record Generation} + +Unlike our other QRF applications, we use the PUF to generate an entire synthetic CPS-structured dataset. This process begins by training QRF models on PUF records with demographic variables. We then generate a complete set of synthetic CPS-structured records using PUF tax information, which are stacked alongside the original CPS records. The reweighting procedure ultimately determines the optimal mixing between CPS and PUF-based records. + +This approach preserves CPS's person-level detail crucial for modeling various aspects of the tax system. These include state tax policies, benefit program eligibility, age-dependent federal provisions (such as Child Tax Credit variations by child age), and family structure interactions. + +\subsubsection{Direct Variable Imputation} + +For other enhancement needs, we use QRF to directly impute missing variables. When imputing housing costs from ACS records, we incorporate a comprehensive set of predictors including household head status, age, sex, tenure type, various income sources (employment, self-employment, Social Security, and pension), state, and household size. + +To support analysis of lookback provisions, we impute prior year earnings using consecutive-year ASEC records. This imputation relies on current employment and self-employment income, household weights, and income imputation flags from the CPS ASEC panel. + +\subsubsection{Implementation Details} + +Our QRF implementation, housed in utils/qrf.py, provides a robust framework for model development and deployment. The implementation handles categorical variable encoding and ensures consistent feature ordering across training and prediction. It also manages distribution sampling and model persistence, enabling efficient reuse of trained models. \ No newline at end of file diff --git a/paper/sections/methodology/reweighting.tex b/paper/sections/methodology/reweighting.tex new file mode 100644 index 0000000..7bb1cca --- /dev/null +++ b/paper/sections/methodology/reweighting.tex @@ -0,0 +1,66 @@ +\subsection{Reweighting Procedure} + +We optimize household weights using gradient descent through PyTorch \citep{pytorch2019}. + +\subsubsection{Problem Formulation} + +Given a loss matrix $M$ of household characteristics and a target vector $t$, we optimize the log-transformed weights $w$ to minimize: + +\[ L(w) = \text{mean}\left(\left(\frac{w^T M + 1}{t + 1} - 1\right)^2\right) \] + +where: +\begin{itemize} + \item $w$ are the log-transformed weights (requires grad=True) + \item $M$ is the loss matrix in tensor form (float32) + \item $t$ are the targets in tensor form (float32) +\end{itemize} + +\subsubsection{Optimization Implementation} + +The procedure follows these steps: + +\begin{enumerate} + \item Initialize with log-transformed original weights + \item Create a PyTorch session with retries for robustness + \item Use Adam optimizer with learning rate 0.1 + \item Apply dropout (5\% rate) during optimization + \item Run for 5,000 iterations or until convergence +\end{enumerate} + +\subsubsection{Dropout Application} + +We apply dropout regularization during optimization to prevent overfitting: +\begin{itemize} + \item Randomly masks p\% of weights each iteration (p = 5) + \item Replaces masked weights with mean of unmasked weights + \item Returns original weights if dropout rate is 0 +\end{itemize} + +\subsubsection{Convergence Monitoring} + +For each iteration: +\begin{itemize} + \item Track initial loss value as baseline + \item Compute relative change from starting loss + \item Display progress with current loss values +\end{itemize} + +\subsubsection{Error Handling} + +The implementation includes checks for: +\begin{itemize} + \item NaN values in weights + \item NaN values in loss matrix + \item NaN values in loss computation + \item NaN values in relative error calculation +\end{itemize} + +If any check fails, the procedure raises a ValueError with diagnostic information. + +\subsubsection{Weight Recovery} + +The final weights are recovered by: +\begin{itemize} + \item Taking exponential of optimized log weights + \item Converting from torch tensor to numpy array +\end{itemize} \ No newline at end of file diff --git a/paper/sections/results.tex b/paper/sections/results.tex new file mode 100644 index 0000000..539b306 --- /dev/null +++ b/paper/sections/results.tex @@ -0,0 +1,32 @@ +\section{Results} + +We validate our enhanced dataset against a comprehensive set of official statistics and compare its performance to both the original CPS and PUF datasets. Our validation metrics cover 570 distinct targets spanning demographic totals, program participation rates, and detailed income components across the distribution. + +\subsection{Validation Against Administrative Totals} + +The enhanced CPS (ECPS) shows substantial improvements over both of its source datasets. When comparing absolute relative errors across all targets, the ECPS outperforms: +\begin{itemize} + \item The Census Bureau's CPS in 63.0\% of targets + \item The IRS Public Use File in 70.7\% of targets +\end{itemize} + +These improvements are particularly notable because they demonstrate that our enhancement methodology successfully combines the strengths of both source datasets while mitigating their individual weaknesses. The CPS excels at demographic representation but struggles with income reporting, particularly at the top of the distribution. Conversely, the PUF captures tax-related variables well but lacks demographic detail. Our enhanced dataset achieves better accuracy than either source across most metrics. + +\subsection{Distribution of Improvements} + +To assess the magnitude and consistency of these improvements, we calculate the relative error change under the ECPS compared to the better performing of the CPS or PUF for each target. The distribution of these improvements shows that: + +\begin{itemize} + \item Most improvements cluster between 5-15\% reduction in relative error + \item Some targets see improvements exceeding 50\% reduction in error + \item Very few targets show degraded performance compared to the source datasets +\end{itemize} + +A detailed, interactive validation dashboard showing performance across all targets is maintained at \url{https://policyengine.github.io/policyengine-us-data/validation.html} and updates automatically with each dataset revision. This transparency allows users to assess the dataset's strengths and limitations for their specific use cases. See Figure \ref{fig:ecps_vs_cps_puf} for a visualization of the distribution of improvements. + +\begin{figure}[h] + \centering + \includegraphics[width=\textwidth]{figures/ecps_vs_cps_puf.png} + \caption{Relative error change under the ECPS compared to the better performing of the CPS or PUF for each target.} + \label{fig:ecps_vs_cps_puf} +\end{figure}