PolicyEngine · MaxGhenis · Sep 29, 2024 · Sep 28, 2024 · Sep 28, 2024 · Sep 28, 2024
diff --git a/.github/workflows/pull_request.yaml b/.github/workflows/pull_request.yaml
@@ -67,12 +67,9 @@ jobs:
           POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }}
       - name: Build datasets
         run: make data
+        env:
+          LITE_MODE: "true"  # any non-empty value switches the build to lite mode
       - name: Run tests
         run: pytest
       - name: Test documentation builds
         run: make documentation
-      - name: Upload ECPS 2024
-        uses: actions/upload-artifact@v4
-        with:
-          name: enhanced_cps_2024.h5
-          path: policyengine_us_data/storage/enhanced_cps_2024.h5
diff --git a/.github/workflows/push_2.yaml → .github/workflows/push.yaml b/.github/workflows/push_2.yaml → .github/workflows/push.yaml
@@ -1,10 +1,10 @@
 # After successful versioning, this script runs various 
 # parts of the push process
-name: Push 2
+name: Push
 
 on:
   workflow_run:
-    workflows: ["Push 1"]
+    workflows: ["Update versioning"]
     types: [completed]
 
 jobs:
@@ -46,11 +46,19 @@ jobs:
       - name: Build datasets
         run: make data
         env:
-          TEST_LITE: true
+          LITE_MODE: "true"  # any non-empty value switches the build to lite mode
       - name: Run tests
         run: pytest
       - name: Test documentation builds
         run: make documentation
+      # `make documentation` already ran in the "Test documentation builds" step
+      # above, so deploy that output instead of building the Jupyter Book twice.
+      - name: Deploy documentation
+        uses: JamesIves/github-pages-deploy-action@releases/v4
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          BRANCH: gh-pages
+          FOLDER: docs/_build/html
   publish-to-pypi:
     name: Publish to PyPI
     runs-on: ubuntu-latest
@@ -92,63 +100,4 @@ jobs:
       - name: Build container
         run: docker build . -f docker/policyengine_us_data.Dockerfile -t ghcr.io/policyengine/policyengine-us-data:latest
       - name: Push container
-        run: docker push ghcr.io/policyengine/policyengine-us-data:latest
-  publish-docs:
-    name: Publish documentation
-    runs-on: ubuntu-latest
-    if: ${{ github.event.workflow_run.conclusion == 'success' }}
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0 # Fetch all history for all tags and branches
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: 3.12
-      - name: Install package
-        run: pip install -e ".[dev]"
-      - name: Build Jupyter Book
-        run: make documentation
-      - name: Deploy documentation
-        uses: JamesIves/github-pages-deploy-action@releases/v4
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          BRANCH: gh-pages
-          FOLDER: docs/_build/html
-  upload:
-    name: Upload data 
-    runs-on: ubuntu-latest
-    needs: [lint, test]
-    if: ${{ github.event.workflow_run.conclusion == 'success' }}
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0 # Fetch all history for all tags and branches
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: 3.12
-      - name: Install package
-        run: pip install -e ".[dev]"
-      - name: Download data inputs
-        run: make download
-        env:
-          POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }}
-      - name: Build datasets
-        run: make data
-      - name: Upload CPS 2024
-        uses: actions/upload-artifact@v4
-        with:
-          name: cps_2024.h5
-          path: policyengine_us_data/storage/cps_2024.h5
-      - name: Upload ECPS 2024
-        uses: actions/upload-artifact@v4
-        with:
-          name: enhanced_cps_2024.h5
-          path: policyengine_us_data/storage/enhanced_cps_2024.h5
-      - name: Upload data
-        run: make upload
-        env:
-          POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }}
+        run: docker push ghcr.io/policyengine/policyengine-us-data:latest
diff --git a/.github/workflows/push_1.yaml → .github/workflows/update_versioning.yaml b/.github/workflows/push_1.yaml → .github/workflows/update_versioning.yaml
@@ -5,7 +5,7 @@
 # This script must run first and complete to allow for 
 # proper versioning.
 
-name: Push 1
+name: Update versioning
 
 on:
   push:

diff --git a/Makefile b/Makefile
@@ -34,6 +34,8 @@ documentation:
 data:
 	python policyengine_us_data/datasets/acs/acs.py
 	python policyengine_us_data/datasets/cps/cps.py
+	python policyengine_us_data/datasets/puf/irs_puf.py
+	python policyengine_us_data/datasets/puf/puf.py
 	python policyengine_us_data/datasets/cps/extended_cps.py
 	python policyengine_us_data/datasets/cps/enhanced_cps.py
 

diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,4 @@
+- bump: minor
+  changes:
+    changed:
+      - Bump to policyengine-us 1.100.0.
diff --git a/docs/validation.ipynb b/docs/validation.ipynb
diff --git a/policyengine_us_data/datasets/__init__.py b/policyengine_us_data/datasets/__init__.py
@@ -18,4 +18,11 @@
 from .puf import PUF_2015, PUF_2021, PUF_2024, IRS_PUF_2015
 from .acs import ACS_2022
 
-DATASETS = [CPS_2022, PUF_2021, CPS_2024, EnhancedCPS_2024, ACS_2022]
+DATASETS = [
+    CPS_2022,
+    PUF_2021,
+    CPS_2024,
+    EnhancedCPS_2024,
+    ACS_2022,
+    Pooled_3_Year_CPS_2023,  # NOTE(review): import not visible in this hunk — confirm
+]
diff --git a/policyengine_us_data/datasets/acs/acs.py b/policyengine_us_data/datasets/acs/acs.py
@@ -111,7 +111,7 @@ class ACS_2022(ACS):
     time_period = 2022
     file_path = STORAGE_FOLDER / "acs_2022.h5"
     census_acs = CensusACS_2022
-    url = "release://PolicyEngine/policyengine-us-data/release/acs_2022.h5"
+    url = "release://PolicyEngine/policyengine-us-data/1.7.0/acs_2022.h5"
 
 
 if __name__ == "__main__":

diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -620,9 +620,9 @@ class CPS_2023(CPS):
 class CPS_2024(CPS):
     name = "cps_2024"
     label = "CPS 2024 (2022-based)"
-    file_path = STORAGE_FOLDER / "cps_2024_v1_6_1.h5"
+    file_path = STORAGE_FOLDER / "cps_2024.h5"
     time_period = 2024
-    url = "release://policyengine/policyengine-us-data/release/cps_2024_v1_6_1.h5"
+    url = "release://policyengine/policyengine-us-data/1.7.0/cps_2024.h5"
 
 
 class PooledCPS(Dataset):
@@ -674,14 +674,14 @@ def generate(self):
 class Pooled_3_Year_CPS_2023(PooledCPS):
     label = "CPS 2023 (3-year pooled)"
     name = "pooled_3_year_cps_2023"
-    file_path = STORAGE_FOLDER / "pooled_3_year_cps_2023_v1_6_1.h5"
+    file_path = STORAGE_FOLDER / "pooled_3_year_cps_2023.h5"
     input_datasets = [
         CPS_2021,
         CPS_2022,
         CPS_2023,
     ]
     time_period = 2023
-    url = "release://PolicyEngine/policyengine-us-data/release/pooled_3_year_cps_2023_v1_6_1.h5"
+    url = "release://PolicyEngine/policyengine-us-data/1.7.0/pooled_3_year_cps_2023.h5"
 
 
 if __name__ == "__main__":

diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -176,8 +176,8 @@ class EnhancedCPS_2024(EnhancedCPS):
     end_year = 2024
     name = "enhanced_cps_2024"
     label = "Enhanced CPS 2024"
-    file_path = STORAGE_FOLDER / "enhanced_cps_2024_v1_6_1.h5"
-    url = "release://policyengine/policyengine-us-data/release/enhanced_cps_2024_v1_6_1.h5"
+    file_path = STORAGE_FOLDER / "enhanced_cps_2024.h5"
+    url = "release://policyengine/policyengine-us-data/1.7.0/enhanced_cps_2024.h5"
 
 
 if __name__ == "__main__":

diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -85,6 +85,9 @@ def generate(self):
         cps_sim = Microsimulation(dataset=self.cps)
         puf_sim = Microsimulation(dataset=self.puf)
 
+        if os.environ.get("LITE_MODE"):
+            puf_sim.subsample(10_000)
+
         INPUTS = [
             "age",
             "is_male",
@@ -114,7 +117,7 @@ def generate(self):
         print(
             f"Predicting imputed values took {time.time() - start:.2f} seconds"
         )
-
+        cps_sim = Microsimulation(dataset=self.cps)
         data = cps_sim.dataset.load_dataset()
         new_data = {}
 

diff --git a/policyengine_us_data/datasets/puf/irs_puf.py b/policyengine_us_data/datasets/puf/irs_puf.py
@@ -42,3 +42,7 @@ class IRS_PUF_2015(IRS_PUF):
     puf_file_path = STORAGE_FOLDER / "puf_2015.csv"
     puf_demographics_file_path = STORAGE_FOLDER / "demographics_2015.csv"
     file_path = STORAGE_FOLDER / "irs_puf_2015.h5"
+
+
+if __name__ == "__main__":  # script entry point; the Makefile `data` target runs this file
+    IRS_PUF_2015().generate()
diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py
@@ -4,8 +4,8 @@
 from microdf import MicroDataFrame
 from policyengine_core.data import Dataset
 from policyengine_us_data.storage import STORAGE_FOLDER
-from .uprate_puf import uprate_puf
-from .irs_puf import IRS_PUF_2015
+from policyengine_us_data.datasets.puf.uprate_puf import uprate_puf
+from policyengine_us_data.datasets.puf.irs_puf import IRS_PUF_2015
 from policyengine_us_data.utils.uprating import (
     create_policyengine_uprating_factors_table,
 )
@@ -357,11 +357,7 @@ def generate(self):
 
         i = 0
         self.earn_splits = []
-        for _, row in tqdm(
-            puf.iterrows(),
-            total=len(puf),
-            desc="Constructing hierarchical PUF",
-        ):
+        for _, row in puf.iterrows():
             i += 1
             exemptions = row["exemptions_count"]
             tax_unit_id = row["household_id"]
@@ -497,15 +493,15 @@ class PUF_2021(PUF):
     name = "puf_2021"
     time_period = 2021
     file_path = STORAGE_FOLDER / "puf_2021.h5"
-    url = "release://policyengine/irs-soi-puf/release/puf_2021.h5"
+    url = "release://policyengine/irs-soi-puf/1.7.0/puf_2021.h5"
 
 
 class PUF_2024(PUF):
     label = "PUF 2024 (2015-based)"
     name = "puf_2024"
     time_period = 2024
     file_path = STORAGE_FOLDER / "puf_2024.h5"
-    url = "release://policyengine/irs-soi-puf/release/puf_2024.h5"
+    url = "release://policyengine/irs-soi-puf/1.7.0/puf_2024.h5"
 
 
 MEDICAL_EXPENSE_CATEGORY_BREAKDOWNS = {
@@ -514,3 +510,8 @@ class PUF_2024(PUF):
     "medicare_part_b_premiums": 0.137,
     "over_the_counter_health_expenses": 0.085,
 }
+
+if __name__ == "__main__":
+    PUF_2015().generate()
+    PUF_2021().generate()
+    PUF_2024().generate()
diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py
@@ -226,6 +226,19 @@ def build_loss_matrix(dataset: type, time_period):
             raise ValueError(f"Missing values for {label}")
         targets_array.append(target)
 
+    # Rough estimate of total negative household market income, from the IRS SOI PUF
+
+    market_income = sim.calculate("household_market_income").values
+    loss_matrix["irs/negative_household_market_income_total"] = (
+        market_income * (market_income < 0)
+    )
+    targets_array.append(-138e9)
+
+    loss_matrix["irs/negative_household_market_income_count"] = (
+        market_income < 0
+    ).astype(float)
+    targets_array.append(3e6)
+
     # Healthcare spending by age
 
     healthcare = pd.read_csv(STORAGE_FOLDER / "healthcare_spending.csv")

diff --git a/pyproject.toml b/pyproject.toml
@@ -22,7 +22,7 @@ dependencies = [
 dev = [
     "black",
     "pytest",
-    "policyengine_us==1.88.0",
+    "policyengine_us==1.100.0",
     "quantile-forest",
     "torch",
     "tables",