Merge pull request PSLmodels#164 from PSLmodels/fix-even-more-warnings

Fix warnings in taxcalc_dataset.py code
donboyd5 · Aug 22, 2024 · 7dc5728 · 7dc5728
2 parents 093235d + 13d7686
commit 7dc5728
Showing 1 changed file with 122 additions and 130 deletions.
diff --git a/tax_microdata_benchmarking/datasets/taxcalc_dataset.py b/tax_microdata_benchmarking/datasets/taxcalc_dataset.py
@@ -1,18 +1,16 @@
 # Create a Tax-Calculator-compatible dataset from any PolicyEngine hierarchical dataset.
+import yaml
 from typing import Type
-import pandas as pd
 import numpy as np
-import yaml
+import pandas as pd
 from tax_microdata_benchmarking.storage import STORAGE_FOLDER
-import taxcalc
+from tax_microdata_benchmarking.datasets.puf import PUF_2015, PUF_2021
+from policyengine_us import Microsimulation
+from policyengine_us.system import system
 
 
 def create_tc_dataset(pe_dataset: Type, year: int) -> pd.DataFrame:
-    from policyengine_us import Microsimulation
-    from policyengine_us.system import system
-
     pe_sim = Microsimulation(dataset=pe_dataset)
-    df = pd.DataFrame()
 
     print(f"Creating tc dataset from '{pe_dataset.label}' for year {year}...")
 
@@ -21,59 +19,114 @@ def create_tc_dataset(pe_dataset: Type, year: int) -> pd.DataFrame:
 
     def pe(variable):
         if system.variables[variable].entity.key == "person":
-            # Sum over non-dependents
+            # sum over non-dependents
             values = pe_sim.calculate(variable).values
             return np.array(tax_unit.sum(values * is_non_dep))
         else:
             return np.array(pe_sim.calculate(variable, map_to="tax_unit"))
 
-    df["E03500"] = pe("alimony_expense")
-    df["E00800"] = pe("alimony_income")
-    df["G20500"] = pe(
-        "casualty_loss"
-    )  # Amend with taxdata treatment from e20500
-    df["E32800"] = pe("cdcc_relevant_expenses")
-    df["E19800"] = pe("charitable_cash_donations")
-    df["E20100"] = pe("charitable_non_cash_donations")
-    df["XTOT"] = pe("exemptions_count")
-    df["E03240"] = pe("domestic_production_ald")
-    df["E03400"] = pe("early_withdrawal_penalty")
-    df["E03220"] = pe("educator_expense")
-    df["E00200"] = pe("employment_income")
-    df["E02100"] = pe("farm_income")
-    df["E27200"] = pe("farm_rent_income")
-    df["E03290"] = pe("health_savings_account_ald")
-    df["E19200"] = pe("interest_deduction")
-    df["P23250"] = pe("long_term_capital_gains")
-    df["E24518"] = pe("long_term_capital_gains_on_collectibles")
-    df["E17500"] = pe("medical_expense")
-    df["E00600"] = pe("non_qualified_dividend_income") + pe(
+    # specify tcname-to-pename dictionary
+    vnames = {
+        "RECID": "household_id",
+        "S006": "tax_unit_weight",
+        "E03500": "alimony_expense",
+        "E00800": "alimony_income",
+        "G20500": "casualty_loss",
+        "E32800": "cdcc_relevant_expenses",
+        "E19800": "charitable_cash_donations",
+        "E20100": "charitable_non_cash_donations",
+        "XTOT": "exemptions_count",
+        "E03240": "domestic_production_ald",
+        "E03400": "early_withdrawal_penalty",
+        "E03220": "educator_expense",
+        "E00200": "employment_income",
+        "E02100": "farm_income",
+        "E27200": "farm_rent_income",
+        "E03290": "health_savings_account_ald",
+        "E19200": "interest_deduction",
+        "P23250": "long_term_capital_gains",
+        "E24518": "long_term_capital_gains_on_collectibles",
+        "E17500": "medical_expense",
+        "E00650": "qualified_dividend_income",
+        "E26270": "partnership_s_corp_income",
+        "E03230": "qualified_tuition_expenses",
+        "e87530": "qualified_tuition_expenses",
+        "E18500": "real_estate_taxes",
+        "E00900": "self_employment_income",
+        "E03270": "self_employed_health_insurance_ald",
+        "E03300": "self_employed_pension_contribution_ald",
+        "P22250": "short_term_capital_gains",
+        "E02400": "social_security",
+        "E18400": "state_and_local_sales_or_income_tax",
+        "E03210": "student_loan_interest",
+        "E00300": "taxable_interest_income",
+        "E01700": "taxable_pension_income",
+        "E02300": "taxable_unemployment_compensation",
+        "E01400": "taxable_ira_distributions",
+        "E00400": "tax_exempt_interest_income",
+        "E01700": "taxable_pension_income",
+        "E03150": "traditional_ira_contributions",
+        "E24515": "unrecaptured_section_1250_gain",
+        "E27200": "farm_rent_income",
+        "PT_binc_w2_wages": "w2_wages_from_qualified_business",
+        "e20400": "misc_deduction",
+        "e07300": "foreign_tax_credit",
+        "e62900": "amt_foreign_tax_credit",
+        "e01200": "miscellaneous_income",
+        "e00700": "salt_refund_income",
+        "e58990": "investment_income_elected_form_4952",
+        "e07400": "general_business_credit",
+        "e07600": "prior_year_minimum_tax_credit",
+        "e11200": "excess_withheld_payroll_tax",
+        "e01100": "non_sch_d_capital_gains",
+        "e87521": "american_opportunity_credit",
+        "e07260": "energy_efficient_home_improvement_credit",
+        "e09900": "early_withdrawal_penalty",
+        "p08000": "other_credits",
+        "e07240": "savers_credit",
+        "e09700": "recapture_of_investment_credit",
+        "e09800": "unreported_payroll_tax",
+        "f2441": "count_cdcc_eligible",
+    }
+    # specify Tax-Calculator names of variables that have zero values
+    zero_names = [
+        "a_lineno",  # taxdata-specific (CPS matched person ID)
+        "agi_bin",  # taxdata-specific (AGI bin)
+        "h_seq",  # taxdata-specific (CPS matched household ID)
+        "ffpos",  # taxdata-specific (CPS matched family ID)
+        "fips",  # no FIPS data
+        "DSI",  # claimed as dependent on another return, assume not
+        "MIDR",  # separately filing spouse itemizes, assume not
+        "PT_SSTB_income",  # PT SSTB business income, assume none
+        "PT_ubia_property",  # PT business capital, assume none
+        "cmbtp",
+        "f6251",
+        "k1bx14p",
+        "k1bx14s",
+        "tanf_ben",  # TANF benefits, assume none
+        "vet_ben",  # veteran's benefits, assume none
+        "wic_ben",  # WIC benefits, assume none
+        "snap_ben",  # SNAP benefits, assume none
+        "housing_ben",  # housing benefits, assume none
+        "ssi_ben",  # SSI benefits, assume none
+        "mcare_ben",  # Medicare benefits, assume none
+        "mcaid_ben",  # Medicaid benefits, assume none
+        "other_ben",  # Other benefits, assume none
+    ]
+    # specify Tax-Calculator array variable dictionary
+    var = {}
+    for tcname, pename in vnames.items():
+        var[tcname] = pe(pename)
+    zeros = np.zeros_like(var["RECID"], dtype=int)
+    for tcname in zero_names:
+        var[tcname] = zeros
+    var["E00600"] = pe("non_qualified_dividend_income") + pe(
         "qualified_dividend_income"
     )
-    df["E00650"] = pe("qualified_dividend_income")
-    df["E26270"] = pe("partnership_s_corp_income")
-    df["E03230"] = pe("qualified_tuition_expenses")
-    df["E18500"] = pe("real_estate_taxes")
-    df["E00900"] = pe("self_employment_income")
-    df["E03270"] = pe("self_employed_health_insurance_ald")
-    df["E03300"] = pe("self_employed_pension_contribution_ald")
-    df["P22250"] = pe("short_term_capital_gains")
-    df["E02400"] = pe("social_security")
-    df["E18400"] = pe("state_and_local_sales_or_income_tax")
-    df["E03210"] = pe("student_loan_interest")
-    df["E00300"] = pe("taxable_interest_income")
-    df["E01700"] = pe("taxable_pension_income")
-    df["E02300"] = pe("taxable_unemployment_compensation")
-    df["E01400"] = pe("taxable_ira_distributions")
-    df["E00400"] = pe("tax_exempt_interest_income")
-    df["E01500"] = pe("tax_exempt_pension_income") + pe(
+    var["E01500"] = pe("tax_exempt_pension_income") + pe(
         "taxable_pension_income"
     )
-    df["E01700"] = pe("taxable_pension_income")
-    df["E03150"] = pe("traditional_ira_contributions")
-    df["E24515"] = pe("unrecaptured_section_1250_gain")
-    df["E27200"] = pe("farm_rent_income")
-    df["MARS"] = (
+    var["MARS"] = (
         pd.Series(pe("filing_status"))
         .map(
             {
@@ -86,143 +139,82 @@ def pe(variable):
         )
         .values
     )
-    df["RECID"] = pe("household_id")
-    df["S006"] = pe("tax_unit_weight")
-    df["a_lineno"] = 0  # TD-specific (CPS matched person ID)
-    df["agi_bin"] = 0  # TD-specific (AGI bin)
-    df["h_seq"] = 0  # TD-specific (CPS matched household ID)
-    df["ffpos"] = 0  # TD-specific (CPS matched family ID)
-    df["fips"] = 0  # No FIPS data
-    df["DSI"] = 0  # Claimed as dependent on another return, assume not
-    df["EIC"] = np.minimum(pe("eitc_child_count"), 3)
-    df["FLPDYR"] = year
-    df["MIDR"] = 0  # Separately filing spouse itemizes, assume not
-    df["PT_SSTB_income"] = (
-        0  # Business income is from specified service trade or business, assume not
-    )
-    df["tanf_ben"] = 0  # TANF benefits, assume none
-    df["vet_ben"] = 0  # Veteran's benefits, assume none
-    df["wic_ben"] = 0  # WIC benefits, assume none
-    df["snap_ben"] = 0  # SNAP benefits, assume none
-    df["housing_ben"] = 0  # Housing benefits, assume none
-    df["ssi_ben"] = 0  # SSI benefits, assume none
-    df["mcare_ben"] = 0  # Medicare benefits, assume none
-    df["mcaid_ben"] = 0  # Medicaid benefits, assume none
-    df["other_ben"] = 0  # Other benefits, assume none
-    df["PT_binc_w2_wages"] = pe("w2_wages_from_qualified_business")
-    df["PT_ubia_property"] = 0
-    df["data_source"] = 1 if "puf" in pe_dataset.__name__.lower() else 0
-    df["e02000"] = (
+    var["EIC"] = np.minimum(pe("eitc_child_count"), 3)
+    ones = np.ones_like(var["RECID"], dtype=int)
+    var["FLPDYR"] = ones * year
+    if "puf" in pe_dataset.__name__.lower():
+        var["data_source"] = ones
+    else:
+        var["data_source"] = zeros
+    var["e02000"] = (
         pe("rental_income")
         + pe("partnership_s_corp_income")
         + pe("estate_income")
         + pe("farm_rent_income")
     )
-    df["e20400"] = pe("misc_deduction")
-
-    df["e07300"] = pe("foreign_tax_credit")
-    df["e62900"] = pe("amt_foreign_tax_credit")
-    df["e01200"] = pe("miscellaneous_income")
-    df["e00700"] = pe("salt_refund_income")
-    df["e58990"] = pe("investment_income_elected_form_4952")
-    df["e07400"] = pe("general_business_credit")
-    df["e07600"] = pe("prior_year_minimum_tax_credit")
-    df["e11200"] = pe("excess_withheld_payroll_tax")
-    df["e01100"] = pe("non_sch_d_capital_gains")
-    df["e87521"] = pe("american_opportunity_credit")
-    df["e07260"] = pe("energy_efficient_home_improvement_credit")
-    df["e09900"] = pe("early_withdrawal_penalty")
-    df["p08000"] = pe("other_credits")
-    df["e07240"] = pe("savers_credit")
-    df["e09700"] = pe("recapture_of_investment_credit")
-    df["e09800"] = pe("unreported_payroll_tax")
-    df["f2441"] = pe("count_cdcc_eligible")
-    df["cmbtp"] = 0
-    df["e87530"] = df[
-        "E03230"
-    ]  # Assume same definition for tuition expenses (for now).
-    df["f6251"] = 0
-    df["k1bx14p"] = 0
-    df["k1bx14s"] = 0
-
-    # Filer and spouse pairs
+    df = pd.DataFrame(var)
 
     map_to_tax_unit = lambda arr: pe_sim.map_result(arr, "person", "tax_unit")
 
-    filer = pe_sim.calculate("is_tax_unit_head").values
+    # specify df head/spouse variables
+    head = pe_sim.calculate("is_tax_unit_head").values
     spouse = pe_sim.calculate("is_tax_unit_spouse").values
 
     employment_income = pe_sim.calculate("employment_income").values
     self_employment_income = pe_sim.calculate("self_employment_income").values
     farm_income = pe_sim.calculate("farm_income").values
     pre_tax_contributions = pe_sim.calculate("pre_tax_contributions").values
 
-    df["e00200p"] = map_to_tax_unit(employment_income * filer)
+    df["e00200p"] = map_to_tax_unit(employment_income * head)
     df["e00200s"] = map_to_tax_unit(employment_income * spouse)
-    df["e00900p"] = map_to_tax_unit(self_employment_income * filer)
+    df["e00900p"] = map_to_tax_unit(self_employment_income * head)
     df["e00900s"] = map_to_tax_unit(self_employment_income * spouse)
-    df["e02100p"] = map_to_tax_unit(farm_income * filer)
+    df["e02100p"] = map_to_tax_unit(farm_income * head)
     df["e02100s"] = map_to_tax_unit(farm_income * spouse)
-    df["pencon_p"] = map_to_tax_unit(pre_tax_contributions * filer)
+    df["pencon_p"] = map_to_tax_unit(pre_tax_contributions * head)
     df["pencon_s"] = map_to_tax_unit(pre_tax_contributions * spouse)
 
-    # Demographics
-
+    # specify df demographics
     age = pe_sim.calculate("age").values
-    head = pe_sim.calculate("is_tax_unit_head").values
-    spouse = pe_sim.calculate("is_tax_unit_spouse").values
     dependent = pe_sim.calculate("is_tax_unit_dependent").values
     blind = pe_sim.calculate("is_blind").values
 
     df["age_head"] = map_to_tax_unit(age * head)
     df["age_spouse"] = map_to_tax_unit(age * spouse)
-
     df["blind_head"] = map_to_tax_unit(blind * head)
     df["blind_spouse"] = map_to_tax_unit(blind * spouse)
-
     df["nu18"] = map_to_tax_unit((age < 18) * dependent)
     df["nu13"] = map_to_tax_unit((age < 13) * dependent)
     df["nu06"] = map_to_tax_unit((age < 6) * dependent)
     df["n1820"] = map_to_tax_unit(((age >= 18) & (age < 21)) * dependent)
     df["n21"] = map_to_tax_unit((age >= 21) * dependent)
-    df["n24"] = map_to_tax_unit(
-        (age < 17) * dependent
-    )  # Following TaxData code.
+    df["n24"] = map_to_tax_unit((age < 17) * dependent)  # usinng taxdata logic
     df["elderly_dependents"] = map_to_tax_unit((age >= 65) * dependent)
 
-    # Correct case of variable names for Tax-Calculator
+    # correct case of df variable names for Tax-Calculator
     tc_variable_metadata = yaml.safe_load(
         open(STORAGE_FOLDER / "input" / "taxcalc_variable_metadata.yaml", "r")
     )
-
     renames = {}
     for variable in df.columns:
         if variable.upper() in tc_variable_metadata["read"]:
             renames[variable] = variable.upper()
         elif variable.lower() in tc_variable_metadata["read"]:
             renames[variable] = variable.lower()
-
     df = df.rename(columns=renames)
 
     return df
 
 
 def create_tc_puf_2015():
-    from tax_microdata_benchmarking.datasets.puf import PUF_2015
-
     return create_tc_dataset(PUF_2015, 2015)
 
 
 def create_tc_puf_2021():
-    from tax_microdata_benchmarking.datasets.puf import PUF_2021
-
     return create_tc_dataset(PUF_2021, 2021)
 
 
 if __name__ == "__main__":
-    from tax_microdata_benchmarking.datasets.puf import PUF_2015, PUF_2021
-    from tax_microdata_benchmarking.storage import STORAGE_FOLDER
-
     create_tc_dataset(PUF_2015).to_csv(
         STORAGE_FOLDER / "output" / "tc_puf_2015.csv.gz", index=False
     )