Skip to content

Commit

Permalink
Merge pull request PSLmodels#164 from PSLmodels/fix-even-more-warnings
Browse files Browse the repository at this point in the history
Fix warnings in taxcalc_dataset.py code
  • Loading branch information
martinholmer authored Aug 22, 2024
2 parents 093235d + 13d7686 commit 7dc5728
Showing 1 changed file with 122 additions and 130 deletions.
252 changes: 122 additions & 130 deletions tax_microdata_benchmarking/datasets/taxcalc_dataset.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@
# Create a Tax-Calculator-compatible dataset from any PolicyEngine hierarchical dataset.
import yaml
from typing import Type
import pandas as pd
import numpy as np
import yaml
import pandas as pd
from tax_microdata_benchmarking.storage import STORAGE_FOLDER
import taxcalc
from tax_microdata_benchmarking.datasets.puf import PUF_2015, PUF_2021
from policyengine_us import Microsimulation
from policyengine_us.system import system


def create_tc_dataset(pe_dataset: Type, year: int) -> pd.DataFrame:
from policyengine_us import Microsimulation
from policyengine_us.system import system

pe_sim = Microsimulation(dataset=pe_dataset)
df = pd.DataFrame()

print(f"Creating tc dataset from '{pe_dataset.label}' for year {year}...")

Expand All @@ -21,59 +19,114 @@ def create_tc_dataset(pe_dataset: Type, year: int) -> pd.DataFrame:

def pe(variable):
if system.variables[variable].entity.key == "person":
# Sum over non-dependents
# sum over non-dependents
values = pe_sim.calculate(variable).values
return np.array(tax_unit.sum(values * is_non_dep))
else:
return np.array(pe_sim.calculate(variable, map_to="tax_unit"))

df["E03500"] = pe("alimony_expense")
df["E00800"] = pe("alimony_income")
df["G20500"] = pe(
"casualty_loss"
) # Amend with taxdata treatment from e20500
df["E32800"] = pe("cdcc_relevant_expenses")
df["E19800"] = pe("charitable_cash_donations")
df["E20100"] = pe("charitable_non_cash_donations")
df["XTOT"] = pe("exemptions_count")
df["E03240"] = pe("domestic_production_ald")
df["E03400"] = pe("early_withdrawal_penalty")
df["E03220"] = pe("educator_expense")
df["E00200"] = pe("employment_income")
df["E02100"] = pe("farm_income")
df["E27200"] = pe("farm_rent_income")
df["E03290"] = pe("health_savings_account_ald")
df["E19200"] = pe("interest_deduction")
df["P23250"] = pe("long_term_capital_gains")
df["E24518"] = pe("long_term_capital_gains_on_collectibles")
df["E17500"] = pe("medical_expense")
df["E00600"] = pe("non_qualified_dividend_income") + pe(
# specify tcname-to-pename dictionary
vnames = {
"RECID": "household_id",
"S006": "tax_unit_weight",
"E03500": "alimony_expense",
"E00800": "alimony_income",
"G20500": "casualty_loss",
"E32800": "cdcc_relevant_expenses",
"E19800": "charitable_cash_donations",
"E20100": "charitable_non_cash_donations",
"XTOT": "exemptions_count",
"E03240": "domestic_production_ald",
"E03400": "early_withdrawal_penalty",
"E03220": "educator_expense",
"E00200": "employment_income",
"E02100": "farm_income",
"E27200": "farm_rent_income",
"E03290": "health_savings_account_ald",
"E19200": "interest_deduction",
"P23250": "long_term_capital_gains",
"E24518": "long_term_capital_gains_on_collectibles",
"E17500": "medical_expense",
"E00650": "qualified_dividend_income",
"E26270": "partnership_s_corp_income",
"E03230": "qualified_tuition_expenses",
"e87530": "qualified_tuition_expenses",
"E18500": "real_estate_taxes",
"E00900": "self_employment_income",
"E03270": "self_employed_health_insurance_ald",
"E03300": "self_employed_pension_contribution_ald",
"P22250": "short_term_capital_gains",
"E02400": "social_security",
"E18400": "state_and_local_sales_or_income_tax",
"E03210": "student_loan_interest",
"E00300": "taxable_interest_income",
"E01700": "taxable_pension_income",
"E02300": "taxable_unemployment_compensation",
"E01400": "taxable_ira_distributions",
"E00400": "tax_exempt_interest_income",
"E01700": "taxable_pension_income",
"E03150": "traditional_ira_contributions",
"E24515": "unrecaptured_section_1250_gain",
"E27200": "farm_rent_income",
"PT_binc_w2_wages": "w2_wages_from_qualified_business",
"e20400": "misc_deduction",
"e07300": "foreign_tax_credit",
"e62900": "amt_foreign_tax_credit",
"e01200": "miscellaneous_income",
"e00700": "salt_refund_income",
"e58990": "investment_income_elected_form_4952",
"e07400": "general_business_credit",
"e07600": "prior_year_minimum_tax_credit",
"e11200": "excess_withheld_payroll_tax",
"e01100": "non_sch_d_capital_gains",
"e87521": "american_opportunity_credit",
"e07260": "energy_efficient_home_improvement_credit",
"e09900": "early_withdrawal_penalty",
"p08000": "other_credits",
"e07240": "savers_credit",
"e09700": "recapture_of_investment_credit",
"e09800": "unreported_payroll_tax",
"f2441": "count_cdcc_eligible",
}
# specify Tax-Calculator names of variables that have zero values
zero_names = [
"a_lineno", # taxdata-specific (CPS matched person ID)
"agi_bin", # taxdata-specific (AGI bin)
"h_seq", # taxdata-specific (CPS matched household ID)
"ffpos", # taxdata-specific (CPS matched family ID)
"fips", # no FIPS data
"DSI", # claimed as dependent on another return, assume not
"MIDR", # separately filing spouse itemizes, assume not
"PT_SSTB_income", # PT SSTB business income, assume none
"PT_ubia_property", # PT business capital, assume none
"cmbtp",
"f6251",
"k1bx14p",
"k1bx14s",
"tanf_ben", # TANF benefits, assume none
"vet_ben", # veteran's benefits, assume none
"wic_ben", # WIC benefits, assume none
"snap_ben", # SNAP benefits, assume none
"housing_ben", # housing benefits, assume none
"ssi_ben", # SSI benefits, assume none
"mcare_ben", # Medicare benefits, assume none
"mcaid_ben", # Medicaid benefits, assume none
"other_ben", # Other benefits, assume none
]
# specify Tax-Calculator array variable dictionary
var = {}
for tcname, pename in vnames.items():
var[tcname] = pe(pename)
zeros = np.zeros_like(var["RECID"], dtype=int)
for tcname in zero_names:
var[tcname] = zeros
var["E00600"] = pe("non_qualified_dividend_income") + pe(
"qualified_dividend_income"
)
df["E00650"] = pe("qualified_dividend_income")
df["E26270"] = pe("partnership_s_corp_income")
df["E03230"] = pe("qualified_tuition_expenses")
df["E18500"] = pe("real_estate_taxes")
df["E00900"] = pe("self_employment_income")
df["E03270"] = pe("self_employed_health_insurance_ald")
df["E03300"] = pe("self_employed_pension_contribution_ald")
df["P22250"] = pe("short_term_capital_gains")
df["E02400"] = pe("social_security")
df["E18400"] = pe("state_and_local_sales_or_income_tax")
df["E03210"] = pe("student_loan_interest")
df["E00300"] = pe("taxable_interest_income")
df["E01700"] = pe("taxable_pension_income")
df["E02300"] = pe("taxable_unemployment_compensation")
df["E01400"] = pe("taxable_ira_distributions")
df["E00400"] = pe("tax_exempt_interest_income")
df["E01500"] = pe("tax_exempt_pension_income") + pe(
var["E01500"] = pe("tax_exempt_pension_income") + pe(
"taxable_pension_income"
)
df["E01700"] = pe("taxable_pension_income")
df["E03150"] = pe("traditional_ira_contributions")
df["E24515"] = pe("unrecaptured_section_1250_gain")
df["E27200"] = pe("farm_rent_income")
df["MARS"] = (
var["MARS"] = (
pd.Series(pe("filing_status"))
.map(
{
Expand All @@ -86,143 +139,82 @@ def pe(variable):
)
.values
)
df["RECID"] = pe("household_id")
df["S006"] = pe("tax_unit_weight")
df["a_lineno"] = 0 # TD-specific (CPS matched person ID)
df["agi_bin"] = 0 # TD-specific (AGI bin)
df["h_seq"] = 0 # TD-specific (CPS matched household ID)
df["ffpos"] = 0 # TD-specific (CPS matched family ID)
df["fips"] = 0 # No FIPS data
df["DSI"] = 0 # Claimed as dependent on another return, assume not
df["EIC"] = np.minimum(pe("eitc_child_count"), 3)
df["FLPDYR"] = year
df["MIDR"] = 0 # Separately filing spouse itemizes, assume not
df["PT_SSTB_income"] = (
0 # Business income is from specified service trade or business, assume not
)
df["tanf_ben"] = 0 # TANF benefits, assume none
df["vet_ben"] = 0 # Veteran's benefits, assume none
df["wic_ben"] = 0 # WIC benefits, assume none
df["snap_ben"] = 0 # SNAP benefits, assume none
df["housing_ben"] = 0 # Housing benefits, assume none
df["ssi_ben"] = 0 # SSI benefits, assume none
df["mcare_ben"] = 0 # Medicare benefits, assume none
df["mcaid_ben"] = 0 # Medicaid benefits, assume none
df["other_ben"] = 0 # Other benefits, assume none
df["PT_binc_w2_wages"] = pe("w2_wages_from_qualified_business")
df["PT_ubia_property"] = 0
df["data_source"] = 1 if "puf" in pe_dataset.__name__.lower() else 0
df["e02000"] = (
var["EIC"] = np.minimum(pe("eitc_child_count"), 3)
ones = np.ones_like(var["RECID"], dtype=int)
var["FLPDYR"] = ones * year
if "puf" in pe_dataset.__name__.lower():
var["data_source"] = ones
else:
var["data_source"] = zeros
var["e02000"] = (
pe("rental_income")
+ pe("partnership_s_corp_income")
+ pe("estate_income")
+ pe("farm_rent_income")
)
df["e20400"] = pe("misc_deduction")

df["e07300"] = pe("foreign_tax_credit")
df["e62900"] = pe("amt_foreign_tax_credit")
df["e01200"] = pe("miscellaneous_income")
df["e00700"] = pe("salt_refund_income")
df["e58990"] = pe("investment_income_elected_form_4952")
df["e07400"] = pe("general_business_credit")
df["e07600"] = pe("prior_year_minimum_tax_credit")
df["e11200"] = pe("excess_withheld_payroll_tax")
df["e01100"] = pe("non_sch_d_capital_gains")
df["e87521"] = pe("american_opportunity_credit")
df["e07260"] = pe("energy_efficient_home_improvement_credit")
df["e09900"] = pe("early_withdrawal_penalty")
df["p08000"] = pe("other_credits")
df["e07240"] = pe("savers_credit")
df["e09700"] = pe("recapture_of_investment_credit")
df["e09800"] = pe("unreported_payroll_tax")
df["f2441"] = pe("count_cdcc_eligible")
df["cmbtp"] = 0
df["e87530"] = df[
"E03230"
] # Assume same definition for tuition expenses (for now).
df["f6251"] = 0
df["k1bx14p"] = 0
df["k1bx14s"] = 0

# Filer and spouse pairs
df = pd.DataFrame(var)

map_to_tax_unit = lambda arr: pe_sim.map_result(arr, "person", "tax_unit")

filer = pe_sim.calculate("is_tax_unit_head").values
# specify df head/spouse variables
head = pe_sim.calculate("is_tax_unit_head").values
spouse = pe_sim.calculate("is_tax_unit_spouse").values

employment_income = pe_sim.calculate("employment_income").values
self_employment_income = pe_sim.calculate("self_employment_income").values
farm_income = pe_sim.calculate("farm_income").values
pre_tax_contributions = pe_sim.calculate("pre_tax_contributions").values

df["e00200p"] = map_to_tax_unit(employment_income * filer)
df["e00200p"] = map_to_tax_unit(employment_income * head)
df["e00200s"] = map_to_tax_unit(employment_income * spouse)
df["e00900p"] = map_to_tax_unit(self_employment_income * filer)
df["e00900p"] = map_to_tax_unit(self_employment_income * head)
df["e00900s"] = map_to_tax_unit(self_employment_income * spouse)
df["e02100p"] = map_to_tax_unit(farm_income * filer)
df["e02100p"] = map_to_tax_unit(farm_income * head)
df["e02100s"] = map_to_tax_unit(farm_income * spouse)
df["pencon_p"] = map_to_tax_unit(pre_tax_contributions * filer)
df["pencon_p"] = map_to_tax_unit(pre_tax_contributions * head)
df["pencon_s"] = map_to_tax_unit(pre_tax_contributions * spouse)

# Demographics

# specify df demographics
age = pe_sim.calculate("age").values
head = pe_sim.calculate("is_tax_unit_head").values
spouse = pe_sim.calculate("is_tax_unit_spouse").values
dependent = pe_sim.calculate("is_tax_unit_dependent").values
blind = pe_sim.calculate("is_blind").values

df["age_head"] = map_to_tax_unit(age * head)
df["age_spouse"] = map_to_tax_unit(age * spouse)

df["blind_head"] = map_to_tax_unit(blind * head)
df["blind_spouse"] = map_to_tax_unit(blind * spouse)

df["nu18"] = map_to_tax_unit((age < 18) * dependent)
df["nu13"] = map_to_tax_unit((age < 13) * dependent)
df["nu06"] = map_to_tax_unit((age < 6) * dependent)
df["n1820"] = map_to_tax_unit(((age >= 18) & (age < 21)) * dependent)
df["n21"] = map_to_tax_unit((age >= 21) * dependent)
df["n24"] = map_to_tax_unit(
(age < 17) * dependent
) # Following TaxData code.
df["n24"] = map_to_tax_unit((age < 17) * dependent) # using taxdata logic
df["elderly_dependents"] = map_to_tax_unit((age >= 65) * dependent)

# Correct case of variable names for Tax-Calculator
# correct case of df variable names for Tax-Calculator
tc_variable_metadata = yaml.safe_load(
open(STORAGE_FOLDER / "input" / "taxcalc_variable_metadata.yaml", "r")
)

renames = {}
for variable in df.columns:
if variable.upper() in tc_variable_metadata["read"]:
renames[variable] = variable.upper()
elif variable.lower() in tc_variable_metadata["read"]:
renames[variable] = variable.lower()

df = df.rename(columns=renames)

return df


def create_tc_puf_2015():
    """Build the Tax-Calculator dataset from the PUF_2015 dataset.

    Thin convenience wrapper: passes ``PUF_2015`` and the year 2015 to
    ``create_tc_dataset`` and returns its result unchanged.
    """
    from tax_microdata_benchmarking.datasets.puf import PUF_2015

    tc_frame = create_tc_dataset(PUF_2015, 2015)
    return tc_frame


def create_tc_puf_2021():
    """Build the Tax-Calculator dataset from the PUF_2021 dataset.

    Thin convenience wrapper: passes ``PUF_2021`` and the year 2021 to
    ``create_tc_dataset`` and returns its result unchanged.
    """
    from tax_microdata_benchmarking.datasets.puf import PUF_2021

    tc_frame = create_tc_dataset(PUF_2021, 2021)
    return tc_frame


if __name__ == "__main__":
    from tax_microdata_benchmarking.datasets.puf import PUF_2015, PUF_2021
    from tax_microdata_benchmarking.storage import STORAGE_FOLDER

    # create_tc_dataset(pe_dataset, year) has no default for `year`;
    # calling it with the dataset alone raises TypeError, so pass the
    # year explicitly (same arguments as create_tc_puf_2015 uses).
    create_tc_dataset(PUF_2015, 2015).to_csv(
        STORAGE_FOLDER / "output" / "tc_puf_2015.csv.gz", index=False
    )
Expand Down

0 comments on commit 7dc5728

Please sign in to comment.