Calibrate state populations (and under-5s) (#121)

* Add dataset comparisons to docs
* Add inequality to book
* Add population by state to calibration
* Update uprating factors
* Update versioning and dataset URLs
PolicyEngine · Nov 19, 2024 · 786b18e · 786b18e
1 parent 2f0e8cf
commit 786b18e
Show file tree

Hide file tree

Showing 12 changed files with 115 additions and 6 deletions.
diff --git a/.gitignore b/.gitignore
@@ -9,3 +9,4 @@
 !eitc.csv
 !spm_threshold_agi.csv
 **/_build
+!population_by_state.csv
diff --git a/Makefile b/Makefile
@@ -31,6 +31,8 @@ docker:
 
 documentation:
 	jb clean docs && jb build docs
+	python docs/add_plotly_to_book.py docs
+
 
 data:
 	python policyengine_us_data/datasets/acs/acs.py

diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,5 @@
+- bump: minor
+  changes:
+    added:
+    - Metric comparisons by dataset to the documentation.
+    - Calibration of state populations.
diff --git a/docs/add_plotly_to_book.py b/docs/add_plotly_to_book.py
@@ -0,0 +1,27 @@
+import argparse
+from pathlib import Path
+
+# This command-line tool enables Plotly charts to show in the HTML files for the Jupyter Book documentation.
+
+parser = argparse.ArgumentParser()
+parser.add_argument("book_path", help="Path to the Jupyter Book.")
+
+args = parser.parse_args()
+
+# Find every HTML file in the Jupyter Book. Then, add a script tag to the start of the <head> tag in each file, with the contents:
+# <script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
+
+REQUIRE_JS = '<script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>'
+
+for html_file in Path(args.book_path).glob("**/*.html"):
+    # Read as UTF-8 explicitly, not the platform default (assumes the built HTML is UTF-8 - TODO confirm).
+    html = html_file.read_text(encoding="utf-8")
+
+    # Skip files that already carry the tag so that re-running this script does not inject it twice.
+    if REQUIRE_JS in html:
+        continue
+
+    # Add the script tag to the start of the first <head> tag only; later literal matches are left alone.
+    html = html.replace("<head>", "<head>" + REQUIRE_JS, 1)
+
+    html_file.write_text(html, encoding="utf-8")
diff --git a/docs/results.ipynb b/docs/results.ipynb
@@ -310,7 +310,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 92,
+   "execution_count": 94,
    "metadata": {},
    "outputs": [
     {
@@ -7270,7 +7270,8 @@
     "        title=\"Weight\",\n",
     "        type=\"log\",\n",
     "    ),\n",
-    ")"
+    ")\n",
+    "fig"
    ]
   }
  ],

diff --git a/policyengine_us_data/datasets/acs/acs.py b/policyengine_us_data/datasets/acs/acs.py
@@ -111,7 +111,7 @@ class ACS_2022(ACS):
     time_period = 2022
     file_path = STORAGE_FOLDER / "acs_2022.h5"
     census_acs = CensusACS_2022
-    url = "release://PolicyEngine/policyengine-us-data/1.11.0/acs_2022.h5"
+    url = "release://PolicyEngine/policyengine-us-data/1.13.0/acs_2022.h5"
 
 
 if __name__ == "__main__":

diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
@@ -648,7 +648,7 @@ class CPS_2024(CPS):
     label = "CPS 2024 (2022-based)"
     file_path = STORAGE_FOLDER / "cps_2024.h5"
     time_period = 2024
-    url = "release://policyengine/policyengine-us-data/1.11.0/cps_2024.h5"
+    url = "release://policyengine/policyengine-us-data/1.13.0/cps_2024.h5"
 
 
 class PooledCPS(Dataset):
@@ -707,7 +707,7 @@ class Pooled_3_Year_CPS_2023(PooledCPS):
         CPS_2023,
     ]
     time_period = 2023
-    url = "release://PolicyEngine/policyengine-us-data/1.11.0/pooled_3_year_cps_2023.h5"
+    url = "release://PolicyEngine/policyengine-us-data/1.13.0/pooled_3_year_cps_2023.h5"
 
 
 if __name__ == "__main__":

diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -189,7 +189,7 @@ class EnhancedCPS_2024(EnhancedCPS):
     name = "enhanced_cps_2024"
     label = "Enhanced CPS 2024"
     file_path = STORAGE_FOLDER / "enhanced_cps_2024.h5"
-    url = "release://policyengine/policyengine-us-data/1.11.0/enhanced_cps_2024.h5"
+    url = "release://policyengine/policyengine-us-data/1.13.0/enhanced_cps_2024.h5"
 
 
 if __name__ == "__main__":

diff --git a/policyengine_us_data/storage/population_by_state.csv b/policyengine_us_data/storage/population_by_state.csv
@@ -0,0 +1,53 @@
+state,population,population_under_5
+CA,38965193.00,2104120.00
+TX,30503301.00,1921708.00
+FL,22610726.00,1130536.00
+NY,19571216.00,1037274.00
+PA,12961683.00,661046.00
+IL,12549689.00,665134.00
+OH,11785935.00,660012.00
+GA,11029227.00,639695.00
+NC,10835491.00,606787.00
+MI,10037261.00,531975.00
+NJ,9290841.00,520287.00
+VA,8715698.00,488079.00
+WA,7812880.00,421896.00
+AZ,7431344.00,393861.00
+TN,7126489.00,413336.00
+MA,7001399.00,343069.00
+IN,6862199.00,404870.00
+MO,6196156.00,353181.00
+MD,6180253.00,352274.00
+WI,5910955.00,313281.00
+CO,5877610.00,311513.00
+MN,5737915.00,327061.00
+SC,5373555.00,290172.00
+AL,5108468.00,291183.00
+LA,4573749.00,278999.00
+KY,4526154.00,262517.00
+OR,4233358.00,203201.00
+OK,4053824.00,243229.00
+CT,3617176.00,180859.00
+UT,3417734.00,232406.00
+IA,3207004.00,186006.00
+PR,3205691.00,96171.00
+NV,3194176.00,172486.00
+AR,3067732.00,180996.00
+KS,2940546.00,176433.00
+MS,2939690.00,173442.00
+NM,2114371.00,107833.00
+NE,1978379.00,124638.00
+ID,1964726.00,113954.00
+WV,1770071.00,86733.00
+HI,1435138.00,77497.00
+NH,1402054.00,63092.00
+ME,1395722.00,61412.00
+MT,1132812.00,57773.00
+RI,1095962.00,52606.00
+DE,1031890.00,54690.00
+SD,919318.00,57917.00
+ND,783926.00,49387.00
+AK,733406.00,46205.00
+DC,678972.00,38701.00
+VT,647464.00,27193.00
+WY,584057.00,30955.00
diff --git a/policyengine_us_data/storage/uprating_factors.csv b/policyengine_us_data/storage/uprating_factors.csv
@@ -3,6 +3,7 @@ alimony_expense,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.
 alimony_income,1.0,1.255,1.322,1.357,1.446,1.504,1.535,1.567,1.576,1.595,1.622,1.655,1.689,1.723,1.779
 american_opportunity_credit,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718
 amt_foreign_tax_credit,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718
+capital_gains_before_response,1.0,1.824,1.11,1.195,1.244,1.195,1.14,1.122,1.126,1.145,1.173,1.206,1.243,1.283,1.326
 casualty_loss,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718
 cdcc_relevant_expenses,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718
 charitable_cash_donations,1.0,1.166,1.148,1.215,1.28,1.318,1.35,1.389,1.428,1.467,1.513,1.561,1.611,1.663,1.718

diff --git a/policyengine_us_data/storage/uprating_growth_factors.csv b/policyengine_us_data/storage/uprating_growth_factors.csv
@@ -3,6 +3,7 @@ alimony_expense,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.
 alimony_income,0,0.255,0.053,0.026,0.066,0.04,0.021,0.021,0.006,0.012,0.017,0.02,0.021,0.02,0.033
 american_opportunity_credit,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033
 amt_foreign_tax_credit,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033
+capital_gains_before_response,0,0.824,-0.391,0.077,0.041,-0.039,-0.046,-0.016,0.004,0.017,0.024,0.028,0.031,0.032,0.034
 casualty_loss,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033
 cdcc_relevant_expenses,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033
 charitable_cash_donations,0,0.166,-0.015,0.058,0.053,0.03,0.024,0.029,0.028,0.027,0.031,0.032,0.032,0.032,0.033

diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py
@@ -322,6 +322,24 @@ def build_loss_matrix(dataset: type, time_period):
         )
         targets_array.append(row["count"])
 
+    # Population by state and population under 5 by state
+
+    state_population = pd.read_csv(STORAGE_FOLDER / "population_by_state.csv")
+
+    for _, row in state_population.iterrows():
+        in_state = sim.calculate("state_code", map_to="person") == row["state"]
+        label = f"census/population_by_state/{row['state']}"
+        loss_matrix[label] = sim.map_result(in_state, "person", "household")
+        targets_array.append(row["population"])
+
+        under_5 = sim.calculate("age").values < 5
+        in_state_under_5 = in_state * under_5
+        label = f"census/population_under_5_by_state/{row['state']}"
+        loss_matrix[label] = sim.map_result(
+            in_state_under_5, "person", "household"
+        )
+        targets_array.append(row["population_under_5"])
+
     if any(loss_matrix.isna().sum() > 0):
         raise ValueError("Some targets are missing from the loss matrix")