diff --git a/tmd/areas/targets/prepare/prepare_states/R/libraries.R b/tmd/areas/targets/prepare/prepare_states/R/libraries.R index 5814517a..7d1d0917 100644 --- a/tmd/areas/targets/prepare/prepare_states/R/libraries.R +++ b/tmd/areas/targets/prepare/prepare_states/R/libraries.R @@ -25,6 +25,8 @@ library(tidycensus) library(tigris) options(tigris_use_cache = TRUE) +library(janitor) + # possible libraries ------------------------------------------------------ diff --git a/tmd/areas/targets/prepare/prepare_states/developing_SALT_targets.qmd b/tmd/areas/targets/prepare/prepare_states/SALT_analysis.qmd similarity index 92% rename from tmd/areas/targets/prepare/prepare_states/developing_SALT_targets.qmd rename to tmd/areas/targets/prepare/prepare_states/SALT_analysis.qmd index 76cf1f76..5eb646e2 100644 --- a/tmd/areas/targets/prepare/prepare_states/developing_SALT_targets.qmd +++ b/tmd/areas/targets/prepare/prepare_states/SALT_analysis.qmd @@ -131,18 +131,18 @@ Unfortunately, the published SOI Historical Table 2 data do not capture potentia basesort <- c("18400", "18500", "18425", "18450", "18800", "18460", "18300") tabdata <- soilong |> - filter(basevname %in% basesort, stabbr=="US", agistub==0, year==2021) |> + filter(basesoivname %in% basesort, stabbr=="US", agistub==0, year==2021) |> mutate(description=udescription[vtype=="amount"], description=str_remove(description, " amount"), - .by=basevname) |> - select(basevname, vtype, value, description) |> + .by=basesoivname) |> + select(basesoivname, vtype, value, description) |> pivot_wider(names_from = vtype) |> - select(basevname, description, count, amount) |> - mutate(basevname=factor(basevname, levels=basesort)) |> - arrange(basevname) + select(basesoivname, description, count, amount) |> + mutate(basesoivname=factor(basesoivname, levels=basesort)) |> + arrange(basesoivname) tabdata |> - mutate(vgroup=basevname %in% basesort[1:2]) |> + mutate(vgroup=basesoivname %in% basesort[1:2]) |> gt() |> cols_hide(vgroup) |> 
tab_header(title="Actual SALT deductions in 2021 IRS-published data", @@ -183,21 +183,21 @@ The table below shows how SALT amounts have changed over time. We can see the la basesort <- c("18400", "18500", "18425", "18450", "18800", "18460", "18300") soilong |> - filter(basevname %in% basesort, stabbr=="US", agistub==0) |> - select(stabbr, basevname, vname, vtype, year, value, udescription) |> + filter(basesoivname %in% basesort, stabbr=="US", agistub==0) |> + select(stabbr, basesoivname, soivname, vtype, year, value, udescription) |> pivot_wider(names_from = year) |> - mutate(basevname=factor(basevname, levels=basesort)) |> + mutate(basesoivname=factor(basesoivname, levels=basesort)) |> mutate(udescription=udescription[vtype=="amount"], - .by=basevname) |> - arrange(vtype, basevname) |> - select(-vname) |> + .by=basesoivname) |> + arrange(vtype, basesoivname) |> + select(-soivname) |> gt() |> tab_header(title=html("Actual SALT variables for the U.S. over time
Amounts in $ billions, counts in millions"), subtitle = "Source: IRS SOI Historical Table 2") |> - fmt_currency(columns = -c(stabbr, basevname, vtype, udescription), + fmt_currency(columns = -c(stabbr, basesoivname, vtype, udescription), rows = vtype=="amount", scale=1e-9, decimals=2) |> - fmt_number(columns = -c(stabbr, basevname, vtype, udescription), + fmt_number(columns = -c(stabbr, basesoivname, vtype, udescription), rows = vtype=="count", scale=1e-6, decimals=2) |> sub_missing(columns=everything(), @@ -216,7 +216,7 @@ Note that in the IRS SOI data, there are no deductions at all on returns with AG #| label: income-sales-tax-by-agirange-amounts-over-time soilong |> - filter(vname=="a18400", stabbr=="US") |> + filter(soivname=="a18400", stabbr=="US") |> select(year, agistub, agilabel, value) |> pivot_wider(names_from = year) |> gt() |> @@ -240,16 +240,16 @@ We do this both statistically (correlation coefficients) and graphically. salt <- soilong |> filter(year %in% c(2017, 2018, 2021), - basevname %in% c("18400", "18500"), + basesoivname %in% c("18400", "18500"), !stabbr %in% c("US", "OA", "PR")) |> mutate(agistubf=factor(agistub, levels=agilabels$agistub, labels=agilabels$agilabel)) |> - select(stabbr, year, basevname, vtype, udescription, agistub, agistubf, value) |> + select(stabbr, year, basesoivname, vtype, udescription, agistub, agistubf, value) |> pivot_wider(names_from = year, names_prefix = "y") saltshares <- salt |> mutate(across(starts_with("y"), \(x) x / sum(x)), - .by=c(agistub, basevname, vtype)) + .by=c(agistub, basesoivname, vtype)) ``` @@ -273,16 +273,16 @@ corrs <- saltshares |> filter(agistub != 1) |> summarise(cor2017_2018=cor(y2017, y2018, use = "complete.obs"), cor2018_2021=cor(y2018, y2021, use = "complete.obs"), - .by=c(basevname, vtype, udescription, agistub, agistubf)) |> + .by=c(basesoivname, vtype, udescription, agistub, agistubf)) |> mutate(agistub=factor(agistub), - udescription=ifelse(basevname=="18400" & vtype=="count", + 
udescription=ifelse(basesoivname=="18400" & vtype=="count", "Number of returns with state and local income or sales taxes (estimated)", udescription)) corrs |> summarise(across(c(cor2017_2018, cor2018_2021), list(min=min, max=max)), - .by=c(basevname, vtype, udescription)) |> + .by=c(basesoivname, vtype, udescription)) |> gt() |> tab_header(title="Min and max correlations across states for state SALT variables as share of national total", subtitle = "Comparing 2017 to 2018, and 2018 to 2021") |> @@ -294,7 +294,7 @@ corrs |> cor2017_2018_max = "max") |> cols_label(cor2018_2021_min= "min", cor2018_2021_max = "max") |> - fmt_number(-c(basevname, vtype, udescription), + fmt_number(-c(basesoivname, vtype, udescription), decimals=3) corrs |> @@ -303,7 +303,7 @@ corrs |> subtitle = "Comparing 2017 to 2018, and 2018 to 2021") |> cols_label(cor2017_2018 = "Correlation between 2017 and 2018", cor2018_2021= "Correlation between 2018 and 2021") |> - fmt_number(-c(basevname, vtype, udescription), + fmt_number(-c(basesoivname, vtype, udescription), decimals=3) ``` @@ -325,7 +325,7 @@ ub <- .075 saltshares |> filter(!stabbr %in% c("CA", "NY")) |> filter(!agistub %in% c(0, 1)) |> - filter(basevname=="18400", vtype=="amount") |> + filter(basesoivname=="18400", vtype=="amount") |> ggplot(aes(x=y2018, y=y2017)) + geom_point(colour="blue", size=0.5) + @@ -354,7 +354,7 @@ ub <- .11 saltshares |> filter(!stabbr %in% c("CA", "NY")) |> filter(!agistub %in% c(0, 1)) |> - filter(basevname=="18500", vtype=="amount") |> + filter(basesoivname=="18500", vtype=="amount") |> ggplot(aes(x=y2018, y=y2017)) + geom_point(colour="blue", size=0.5) + @@ -382,10 +382,10 @@ Finally, all of the state shares are shown in the filter-able and sortable table saltshares |> filter(agistub != 1) |> - select(stabbr, basevname, vtype, udescription, agistub, agistubf, y2017, y2018, y2021) |> + select(stabbr, basesoivname, vtype, udescription, agistub, agistubf, y2017, y2018, y2021) |> mutate(y2018m2017=y2018 - 
y2017, y2021m2018=y2021 - y2018, - across(c(stabbr, basevname, vtype, udescription, agistub), + across(c(stabbr, basesoivname, vtype, udescription, agistub), \(x) factor(x))) |> DT::datatable(rownames = FALSE, caption = htmltools::tags$caption( diff --git a/tmd/areas/targets/prepare/prepare_states/_quarto.yml b/tmd/areas/targets/prepare/prepare_states/_quarto.yml index a81070c8..0c4f2bc7 100644 --- a/tmd/areas/targets/prepare/prepare_states/_quarto.yml +++ b/tmd/areas/targets/prepare/prepare_states/_quarto.yml @@ -45,10 +45,12 @@ book: - construct_long_soi_data_file.qmd - part: "Analysis of SALT variables and other issues" chapters: - - developing_SALT_targets.qmd - - part: "Create basefile for state targets" + - SALT_analysis.qmd + - part: "Create data from which to extract state target files" chapters: - create_state_targets_basefile.qmd + - create_additional_state_targets.qmd + - combine_base_and_additional_targets.qmd # old files maybe use as base for new work # - cd_create_variable_mapping.qmd # - cd_compare_us_totals_tmd_vs_irs_published.qmd diff --git a/tmd/areas/targets/prepare/prepare_states/combine_base_and_additional_targets.qmd b/tmd/areas/targets/prepare/prepare_states/combine_base_and_additional_targets.qmd new file mode 100644 index 00000000..a3bafac7 --- /dev/null +++ b/tmd/areas/targets/prepare/prepare_states/combine_base_and_additional_targets.qmd @@ -0,0 +1,108 @@ +--- +output: html_document +editor_options: + chunk_output_type: console +--- + +# Combine base and additional target files + + +```{r} +#| label: setup +#| output: false + +suppressPackageStartupMessages(source(here::here("R", "libraries.R"))) +source(here::here("R", "constants.R")) +source(here::here("R", "functions.R")) + +``` + + +## Stack basefile targets and additional targets + +```{r} +#| label: stack-targets +#| output: false + +base_targets <- read_csv(fs::path(DINTERMEDIATE, "base_targets.csv")) +additional_targets <- read_csv(fs::path(DINTERMEDIATE, 
"additional_targets.csv")) +ns(additional_targets) + +setdiff(names(base_targets), names(additional_targets)) # none missing +setdiff(names(additional_targets), names(base_targets)) # see below +# "soi_ussum" "soi_share" "tmdvar" "tmdsum" +# we can drop all of these + +# re-examine additional targets +glimpse(additional_targets) +count(additional_targets, basesoivname, soivname, description) +count(additional_targets, tmdvar, basesoivname, soivname, description) + +stack <- bind_rows(base_targets, + additional_targets |> + select(all_of(names(base_targets)))) |> + mutate(sort=ifelse(basesoivname=="XTOT" & + soivname=="XTOT" & + scope==0 & + str_detect(description, "population"), + 1, NA_real_)) |> + # sort is 1 for the population record, NA for others - so pop sorts first + # set desired order + arrange(stabbr, sort, scope, fstatus, basesoivname, count, agistub) |> + # now calc sort + mutate(sort=row_number(), .by=stabbr) |> + select(stabbr, sort, count, scope, agilo, agihi, fstatus, target, basesoivname, soivname, description, agistub, agilabel) + +# varname,count,scope,agilo,agihi,fstatus,target +check <- stack |> filter(stabbr=="NY") +check2 <- count(check, basesoivname, soivname, description) + +write_csv(stack, fs::path(DINTERMEDIATE, "enhanced_targets.csv")) + +``` + + + +## Additional notes + +```{r} +#| label: notes +#| output: false + +# documentation for the targets.csv data file + +# sample file excerpt +# varname,count,scope,agilo,agihi,fstatus,target +# XTOT, 0, 0,-9e99, 9e99, 0, 33e6 +# e00300, 0, 1,-9e99, 9e99, 0, 20e9 +# e00900, 0, 1,-9e99, 9e99, 0, 30e9 +# e00200, 0, 1,-9e99, 9e99, 0,1000e9 +# e02000, 0, 1,-9e99, 9e99, 0, 30e9 +# e02400, 0, 1,-9e99, 9e99, 0, 60e9 + +# varname: any Tax-Calculator input variable name plus any Tax-Calculator calculated variable in the list of cached variables in the tmd/storage/__init__.py file +# count: integer in [0,4] range: +# count==0 implies dollar total of varname is tabulated +# count==1 implies number of tax 
units with any value of varname is tabulated +# count==2 implies number of tax units with a nonzero value of varname is tabulated +# count==3 implies number of tax units with a positive value of varname is tabulated +# count==4 implies number of tax units with a negative value of varname is tabulated + +# scope: integer in [0,2] range: +# scope==0 implies all tax units are tabulated +# scope==1 implies only PUF-derived filing units are tabulated +# scope==2 implies only CPS-derived filing units are tabulated + +# agilo: float representing lower bound of the AGI range (which is included in the range) that is tabulated. +# agihi: float representing upper bound of the AGI range (which is excluded from the range) that is tabulated. + +# fstatus: integer in [0,5] range: +# fstatus=0 implies all filing statuses are tabulated +# other fstatus values imply just the tax units with the Tax-Calculator MARS variable equal to fstatus are included in the tabulation + +# target: target amount: +# dollars if count==0 +# number of tax units if count>0 + +``` + diff --git a/tmd/areas/targets/prepare/prepare_states/construct_long_soi_data_file.qmd b/tmd/areas/targets/prepare/prepare_states/construct_long_soi_data_file.qmd index b9f2d69d..ee370555 100644 --- a/tmd/areas/targets/prepare/prepare_states/construct_long_soi_data_file.qmd +++ b/tmd/areas/targets/prepare/prepare_states/construct_long_soi_data_file.qmd @@ -67,7 +67,7 @@ csvdata2 <- csvdata |> rename(stabbr=state, agistub=agi_stub) |> mutate(year=as.integer(year)) |> pivot_longer(-c(stabbr, year, agistub), - names_to = "vname") |> + names_to = "soivname") |> filter(!is.na(value)) saveRDS(csvdata2, fs::path(DINTERMEDIATE, "soilong_raw.rds")) @@ -85,14 +85,14 @@ glimpse(soilong_raw) # investigate the data to make sure correct check <- soilong_raw |> - filter(str_sub(vname, 2, -1) %in% c("18425", "18450")) + filter(str_sub(soivname, 2, -1) %in% c("18425", "18450")) #.. 
18400 State and local income or sales tax (estimated) est18400 <- soilong_raw |> - filter(str_sub(vname, 2, -1) %in% c("18425", "18450")) |> - mutate(vname=paste0(str_sub(vname, 1, 1), "18400")) |> + filter(str_sub(soivname, 2, -1) %in% c("18425", "18450")) |> + mutate(soivname=paste0(str_sub(soivname, 1, 1), "18400")) |> summarise(value=sum(value), - .by=c(stabbr, agistub, year, vname)) + .by=c(stabbr, agistub, year, soivname)) glimpse(est18400) skim(est18400) @@ -116,14 +116,14 @@ agilabels <- read_csv(fs::path(DINTERMEDIATE, "agilabels.csv")) soilong <- soilong1 |> left_join(variable_descriptions, - by = join_by(vname, year)) |> + by = join_by(soivname, year)) |> left_join(agilabels, by = join_by(agistub)) |> mutate(value=ifelse(vtype=="amount", value * 1000, value)) |> - select(stabbr, vname, basevname, vtype, agistub, agilo, agihi, agilabel, year, value, udescription, description) |> - arrange(stabbr, vname, basevname, vtype, agistub, year) + select(stabbr, soivname, basesoivname, vtype, agistub, agilo, agihi, agilabel, year, value, udescription, description) |> + arrange(stabbr, soivname, basesoivname, vtype, agistub, year) skim(soilong) -check <- count(soilong, basevname, vtype, vname, udescription) +check <- count(soilong, basesoivname, vtype, soivname, udescription) saveRDS(soilong, fs::path(DINTERMEDIATE, "soilong.rds")) @@ -145,7 +145,7 @@ count(soilong, stabbr) # 54: 50 states, DC, PR, OA, US soilong |> filter(is.na(vtype)) |> - count(vname) # should be zero recs + count(soivname) # should be zero recs soilong |> filter(is.na(vtype)) |> @@ -154,15 +154,13 @@ soilong |> # n17000 had been one of the all-missing values variables in some years # we have since dropped all missing values variable_descriptions |> - filter(vname=="n17000") # Number of returns with Total medical and dental expense deduction + filter(soivname=="n17000") # Number of returns with Total medical and dental expense deduction soilong |> - filter(stabbr=="NY", vname %in% c("n17000", 
"a17000"), agistub==0) |> - select(stabbr, agistub, vname, vtype, year, value, udescription) |> + filter(stabbr=="NY", soivname %in% c("n17000", "a17000"), agistub==0) |> + select(stabbr, agistub, soivname, vtype, year, value, udescription) |> arrange(vtype, year) - - ``` diff --git a/tmd/areas/targets/prepare/prepare_states/construct_soi_documentation.qmd b/tmd/areas/targets/prepare/prepare_states/construct_soi_documentation.qmd index ef406d7b..e3ca0be0 100644 --- a/tmd/areas/targets/prepare/prepare_states/construct_soi_documentation.qmd +++ b/tmd/areas/targets/prepare/prepare_states/construct_soi_documentation.qmd @@ -39,12 +39,12 @@ fpath <- fs::path(DRAW, fname) get_year <- function(year){ df1 <- readxl::read_xlsx(fpath, sheet = as.character(year), col_types = "text") df2 <- df1 |> - select(vname=1, description=2, reference=3, type=4) |> + select(soivname=1, description=2, reference=3, type=4) |> filter(if_any(everything(), ~!is.na(.))) |> - # after verifying that AGI_STUB is the only variable with NA in vname + # after verifying that AGI_STUB is the only variable with NA in soivname # fill down and then concatenate the reference column - fill(vname, description, type, .direction="down") |> - mutate(reference = paste(reference, collapse = "\n"), .by=vname) |> + fill(soivname, description, type, .direction="down") |> + mutate(reference = paste(reference, collapse = "\n"), .by=soivname) |> distinct() |> # for now, make mistaken references NA mutate(reference=ifelse(!is.na(as.numeric(reference)), NA_character_, reference), @@ -65,6 +65,7 @@ get_year <- function(year){ stacked_docs <- purrr::map(2015:2021, get_year) |> list_rbind() +glimpse(stacked_docs) count(stacked_docs, year) @@ -110,18 +111,18 @@ str_distance <- function(strings) { # identify instances where the descriptions are highly dissimilar distances <- stacked_docs |> - select(year, vname, description) |> - arrange(vname, year) |> + select(year, soivname, description) |> + arrange(soivname, year) |> # 
fix known error - mutate(vname=ifelse(vname=="CREP", "CPREP", vname)) |> + mutate(soivname=ifelse(soivname=="CREP", "CPREP", soivname)) |> # prepare to determine distances mutate(n=n(), nunique=length(unique(description)), - .by=vname) |> + .by=soivname) |> mutate(cleaned=cleanit(description), distance=str_distance(cleaned), maxdist=max(distance, na.rm=TRUE), - .by=vname) + .by=soivname) count(distances, maxdist) @@ -140,7 +141,7 @@ distances |> filter(maxdist == 32) # A11070 distances |> filter(maxdist == 37) # A11450, N11450 adjusted_descriptions <- read_delim( -"vname; adjusted +"soivname; adjusted A00100; Adjusted gross income (AGI) amount A10971; Economic impact payment amount (pre-2021 is different) N10971; Number of returns with economic impact payment (pre-2021 is different) @@ -164,35 +165,35 @@ adjusted_descriptions adjusted1 <- distances |> mutate(last=last(cleaned), - .by=vname) |> + .by=soivname) |> left_join(adjusted_descriptions, - by = join_by(vname)) |> + by = join_by(soivname)) |> mutate(udescription=ifelse(!is.na(adjusted), adjusted, last)) cleaned_descriptions <- adjusted1 |> - mutate(vname=str_to_lower(vname)) |> - mutate(length=nchar(vname), + mutate(soivname=str_to_lower(soivname)) |> + mutate(length=nchar(soivname), vtype=case_when( - vname == "numdep" ~ "count", + soivname == "numdep" ~ "count", length != 6 ~ "count", - length == 6 & str_sub(vname, 1, 1) == "n" ~ "count", - length == 6 & str_sub(vname, 1, 1) == "a" ~ "amount", + length == 6 & str_sub(soivname, 1, 1) == "n" ~ "count", + length == 6 & str_sub(soivname, 1, 1) == "a" ~ "amount", .default = "ERROR"), - basevname=case_when( - vname == "numdep" ~ "numdep", - length == 6 ~ str_sub(vname, 2, 6), - .default = vname + basesoivname=case_when( + soivname == "numdep" ~ "numdep", + length == 6 ~ str_sub(soivname, 2, 6), + .default = soivname )) |> - select(year, vname, basevname, vtype, description, udescription) + select(year, soivname, basesoivname, vtype, description, udescription) 
count(cleaned_descriptions, vtype) # verify no more than one description per variable per year glimpse(cleaned_descriptions) cleaned_descriptions |> - summarise(n=n(), .by=c(vname, year)) |> + summarise(n=n(), .by=c(soivname, year)) |> filter(n > 1) # should be an empty tibble @@ -210,7 +211,7 @@ write_csv(cleaned_descriptions, fs::path(DINTERMEDIATE, "soi_documentation_by_ye ``` -## Add variables we may create that are not in the SOI documentation +## Add variables not in the SOI documentation that we may create ```{r} #| label: new-variables @@ -220,13 +221,13 @@ cleaned1 <- read_csv(fs::path(DINTERMEDIATE, "soi_documentation_by_year_raw.csv" glimpse(cleaned1) count(cleaned1 |> - filter(str_starts(basevname, "18")), - vname, udescription, year) + filter(str_starts(basesoivname, "18")), + soivname, udescription, year) -# salt variables basevname 18400, sum of 18425 and 18450, both avail 2015-2021 +# salt variables basesoivname 18400, sum of 18425 and 18450, both avail 2015-2021 # add documentation for n00100 agi count EVEN THOUGH it's not in the data newvars_base <- read_delim( -"vname; basevname; description +"soivname; basesoivname; description n18400; 18400; Number of returns with state and local income or sales tax (estimated) a18400; 18400; State and local income or sales tax (estimated) amount n00100; 00100; Number of returns with adjusted gross income (AGI) @@ -238,15 +239,15 @@ crosses <- crossing(year=2015:2021, vtype=c("amount", "count")) newvars <- newvars_base |> cross_join(tibble(year=2015:2021)) |> - mutate(vtype=case_when(str_sub(vname, 1, 1)=="n" ~ "count", - str_sub(vname, 1, 1)=="a" ~ "amount", + mutate(vtype=case_when(str_sub(soivname, 1, 1)=="n" ~ "count", + str_sub(soivname, 1, 1)=="a" ~ "amount", .default = "ERROR"), udescription=description) |> - select(year, vname, basevname, vtype, description, udescription) + select(year, soivname, basesoivname, vtype, description, udescription) newvars cleaned_new <- bind_rows(cleaned1, newvars) |> - 
arrange(basevname, vtype, year) + arrange(basesoivname, vtype, year) ``` @@ -260,18 +261,17 @@ cleaned_new <- bind_rows(cleaned1, newvars) |> write_csv(cleaned_new, fs::path(DINTERMEDIATE, "soi_documentation_by_year.csv")) uniform_descriptions <- cleaned_new |> - select(vname, basevname, vtype, udescription) |> + select(soivname, basesoivname, vtype, udescription) |> distinct() # verify only one description per variable -anyDuplicated(uniform_descriptions$vname) +anyDuplicated(uniform_descriptions$soivname) write_csv(uniform_descriptions, fs::path(DINTERMEDIATE, "soi_documentation.csv")) uniform_descriptions <- read_csv(fs::path(DINTERMEDIATE, "soi_documentation.csv")) -check <- count(uniform_descriptions, basevname, vtype, vname, udescription) - +check <- count(uniform_descriptions, basesoivname, vtype, soivname, udescription) ``` diff --git a/tmd/areas/targets/prepare/prepare_states/create_additional_state_targets.qmd b/tmd/areas/targets/prepare/prepare_states/create_additional_state_targets.qmd new file mode 100644 index 00000000..84cd45c7 --- /dev/null +++ b/tmd/areas/targets/prepare/prepare_states/create_additional_state_targets.qmd @@ -0,0 +1,142 @@ +--- +output: html_document +editor_options: + chunk_output_type: console +--- + +# Enhance state targets basefile + +The targets basefile created in a prior .qmd file has a population target for each state and many SOI-based targets. However, for some target concepts SOI data are either not available or are inappropriate. + +For example, for SALT deductions, for variables ending in 18400 (state and local income or sales tax deduction) or 18500 (real estate taxes paid) the SOI data for 2021 represent the amount available to be deducted by 2021 itemizers (at 2021 levels, under 2021 law), before the SALT cap. Because the TCJA raised the standard deduction sharply, there are far fewer itemizers in 2021 than pre-TCJA. 
We want our targets to represent deductions available to all filers, including nonitemizers, in a manner consistent with the way the data are measured in our TMD variables e18400 and e18500. + +We construct alternative SALT targets below. + + +```{r} +#| label: setup +#| output: false + +suppressPackageStartupMessages(source(here::here("R", "libraries.R"))) +source(here::here("R", "constants.R")) +source(here::here("R", "functions.R")) + +``` + + +Define which SOI variable will be used to share (allocate) each TMD variable's national total to the states. + +```{r} +#| label: construct-mappings +#| output: false + +mappings <- read_csv( +"tmdvar, basesoivname +e18400, 18400 +e18500, 18500 +", col_types="cc") + +mappings + +``` + + + +## Get data + +```{r} +#| label: get-data +#| output: false + +agilabels <- read_csv(fs::path(DINTERMEDIATE, "agilabels.csv")) +agilabels +STAGICUTS <- c(agilabels$agilo, agilabels$agihi) |> unique() |> sort() +# agistub=cut(c00100, STAGICUTS, right = FALSE, ordered_result = TRUE) |> as.integer() + +fpath <- fs::path(TMDDATA, "cached_allvars.csv") +tmd2021 <- vroom(fpath) +ns(tmd2021) + +tmd2021 <- tmd2021 |> + mutate(agistublab=cut(c00100, STAGICUTS, right = FALSE, ordered_result = TRUE), + agistub=agistublab |> as.integer()) |> + left_join(agilabels, by = join_by(agistub)) +# has more agistub info than we need but makes it easy to be sure we have the right stubs +count(tmd2021, agistub, agistublab, agilo, agihi, agilabel) + +base_targets <- read_csv(fs::path(DINTERMEDIATE, "base_targets.csv")) +glimpse(base_targets) + +``` + +## Variables shared to states + +```{r} +#| label: get-tmdsums +#| output: false + +tmdsums1 <- tmd2021 |> + filter(data_source==1) |> + select(s006, agistub, agilabel, all_of(mappings$tmdvar)) |> + pivot_longer(-c(s006, agistub, agilabel), + names_to = "tmdvar") |> + summarize(nzcount=sum(s006 * (value !=0)), + amount=sum(s006 * value), + .by=c(tmdvar, agistub, agilabel)) |> + arrange(tmdvar, agistub) + +tmdallincomes <- tmdsums1 |> + summarise(nzcount=sum(nzcount), + 
amount=sum(amount), + .by=c(tmdvar)) |> + mutate(agistub=0, agilabel="Total") + +tmdsums <- bind_rows(tmdsums1, + tmdallincomes) |> + arrange(tmdvar, agistub) |> + pivot_longer(-c(tmdvar, agistub, agilabel), + names_to = "vtype", + values_to = "tmdsum") |> + mutate(fstatus=0, scope=1, + count=case_when(vtype=="nzcount" ~ 2, + vtype=="amount" ~ 0, + .default = -9e99)) |> + left_join(mappings, + by = join_by(tmdvar)) |> + arrange(tmdvar, scope, count, fstatus) +tmdsums + +``` + + +```{r} +#| label: get-variable-shares +#| output: false + +# note: by using the us record we include the (trivial) OA amounts, which +# seems right - implicitly they are in the tmd data +soivname_shares <- base_targets |> + filter(basesoivname %in% mappings$basesoivname) |> + mutate(soi_ussum=target[stabbr=="US"], + soi_share=ifelse(soi_ussum==0, 0, target / soi_ussum), + .by=c(basesoivname, count, scope, fstatus, agistub)) + +check <- soivname_shares |> filter(stabbr=="US") +check <- soivname_shares |> filter(stabbr=="NY") + +soivname_targets <- soivname_shares |> + left_join(tmdsums |> + select(tmdvar, basesoivname, agistub, scope, fstatus, count, tmdsum), + by = join_by(basesoivname, scope, fstatus, count, agistub)) |> + mutate(target=tmdsum * soi_share, + basesoivname=paste0("tmd", str_sub(tmdvar, 2, -1), "_shared_by_soi", basesoivname), + soivname=paste0(str_sub(soivname, 1, 1), basesoivname)) + +check <- soivname_targets |> filter(stabbr=="NY") + +count(soivname_targets, tmdvar, basesoivname, soivname) + +write_csv(soivname_targets, fs::path(DINTERMEDIATE, "additional_targets.csv")) + +``` + diff --git a/tmd/areas/targets/prepare/prepare_states/create_state_targets_basefile.qmd b/tmd/areas/targets/prepare/prepare_states/create_state_targets_basefile.qmd index 170d89d8..d2f6027a 100644 --- a/tmd/areas/targets/prepare/prepare_states/create_state_targets_basefile.qmd +++ b/tmd/areas/targets/prepare/prepare_states/create_state_targets_basefile.qmd @@ -6,6 +6,16 @@ editor_options: # Create 
state targets basefile +Create and save as .csv a state targets basefile ("base_targets.csv") that has most of what we will need to write \[xx\]\_targets.csv files. It is a data frame with population and SOI target information for each state: + +- an initial "XTOT" record with the total population for the state +- records for data from SOI Historical Table 2 with the following information: + - `soivname` -- the SOI variable name (later, when we write target files we will map the SOI variable name to the TMD variable name and we will put the TMD variable name on the \[xx\]\_targets.csv file) + - `count`, `scope`, `agilo`, `agihi`, `fstatus`, and `target` -- variables with the meanings noted in the [main documentation](https://github.com/PSLmodels/tax-microdata-benchmarking/tree/master/tmd/areas/targets) + - certain other useful identifying information + +The information it will be missing at this point is: (1) any targets we need to estimate in other ways in cases where SOI targets are inappropriate or not available for a concept we care about, and (2) the TMD variable names. We hold off on adding TMD variable names until the last step because there are some challenges in mapping SOI concepts to TMD concepts. We want to have the targets right and then, finally, deal with the mapping. 
+ ```{r} #| label: setup #| output: false @@ -16,7 +26,7 @@ source(here::here("R", "functions.R")) ``` -# get data +## get data ```{r} #| label: get-data @@ -32,7 +42,6 @@ pop2021 <- read_csv(fs::path(DRAW, "statepop_2021.csv")) ``` - ```{r} #| label: clean-soi-all-years #| output: false @@ -48,14 +57,14 @@ soi2 <- soilong |> count = case_when( vtype == "amount" ~ 0, vtype == "count" & - vname %in% allcounts ~ 1, # count for any value for these vars + soivname %in% allcounts ~ 1, # count for any value for these vars vtype == "count" & - !vname %in% allcounts ~ 2, # counts when var is nonzero + !soivname %in% allcounts ~ 2, # counts when var is nonzero .default = -9e9), count = as.integer(count), fstatus = case_when( - str_starts(vname, "mars") ~ str_sub(vname, -1), + str_starts(soivname, "mars") ~ str_sub(soivname, -1), .default = "0"), fstatus = as.integer(fstatus)) @@ -65,7 +74,6 @@ count(soi2, fstatus) ``` - ```{r} #| label: winnow-down-soi2021 #| output: false @@ -74,7 +82,7 @@ count(soi2, stabbr) # 54: 50 states, DC, OA, PR, US # do areas sum to US? 
soi2 |> - filter(vname=="a00100", year==2021) |> + filter(soivname=="a00100", year==2021) |> mutate(groupus=stabbr=="US") |> summarise(value=sum(value), .by=groupus) |> gt() |> @@ -84,10 +92,9 @@ soi2 |> soi2021 <- soi2 |> filter(year==2021, stabbr != "OA") |> # to conform to pop data - arrange(stabbr, vname, scope, fstatus, count, agistub) |> - mutate(sort = row_number() + 1, .by=stabbr) |> - select(sort, vname, count, scope, agilo, agihi, fstatus, target=value, - stabbr, basevname, description=udescription, agistub, agilabel) + arrange(stabbr, soivname, scope, fstatus, count, agistub) |> + select(soivname, count, scope, agilo, agihi, fstatus, target=value, + stabbr, basesoivname, description=udescription, agistub, agilabel) soi2021 |> filter(stabbr=="NY") @@ -96,7 +103,6 @@ count(soi2021, stabbr) # 54 ``` - ```{r} #| label: prepare-population #| output: false @@ -114,15 +120,14 @@ pop2021 |> # create the pop recs poprecs <- pop2021 |> - mutate(vname="XTOT", - basevname="XTOT", - description = "CD population in 2021 per Census ACS", + mutate(soivname="XTOT", + basesoivname="XTOT", + description = "State population in 2021 per Census Population Estimates Program", agistub = 0, count = 0, scope = 0, fstatus = 0, - target = pop2021, - sort = 1) |> + target = pop2021) |> left_join(agilabels, by = join_by(agistub)) |> select(all_of(names(soi2021))) |> @@ -130,9 +135,10 @@ poprecs <- pop2021 |> ``` +## Combine population and SOI data and save ```{r} -#| label: combine-pop-and-soi-data +#| label: combine-pop-and-soi-data-and-save #| output: false count(soi2021, stabbr) # 53 @@ -140,16 +146,19 @@ count(poprecs, stabbr) # 53 skim(soi2021) skim(poprecs) -basetargets <- bind_rows(poprecs, soi2021) |> - arrange(stabbr, sort) +base_targets <- bind_rows(poprecs, soi2021) |> + arrange(stabbr, scope, fstatus, basesoivname, count, agistub) -check <- basetargets |> +check <- base_targets |> filter(stabbr=="NY") -write_csv(basetargets, fs::path(DINTERMEDIATE, "basetargets.csv")) 
+write_csv(base_targets, fs::path(DINTERMEDIATE, "base_targets.csv")) + +base_targets <- read_csv(fs::path(DINTERMEDIATE, "base_targets.csv")) ``` +## Additional notes ```{r} #| label: notes @@ -190,8 +199,4 @@ write_csv(basetargets, fs::path(DINTERMEDIATE, "basetargets.csv")) # dollars if count==0 # number of tax units if count>0 - - - ``` - diff --git a/tmd/areas/targets/prepare/prepare_states/renv.lock b/tmd/areas/targets/prepare/prepare_states/renv.lock index 906620a3..0825f0ea 100644 --- a/tmd/areas/targets/prepare/prepare_states/renv.lock +++ b/tmd/areas/targets/prepare/prepare_states/renv.lock @@ -904,6 +904,28 @@ ], "Hash": "0080607b4a1a7b28979aecef976d8bc2" }, + "janitor": { + "Package": "janitor", + "Version": "2.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "dplyr", + "hms", + "lifecycle", + "lubridate", + "magrittr", + "purrr", + "rlang", + "snakecase", + "stringi", + "stringr", + "tidyr", + "tidyselect" + ], + "Hash": "5baae149f1082f466df9d1442ba7aa65" + }, "jquerylib": { "Package": "jquerylib", "Version": "0.1.4", @@ -963,14 +985,14 @@ }, "later": { "Package": "later", - "Version": "1.4.0", + "Version": "1.4.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "Rcpp", "rlang" ], - "Hash": "dd8a8b6833989ba10fba1bf1ee7d3860" + "Hash": "501744395cac0bab0fbcfab9375ae92c" }, "lattice": { "Package": "lattice", @@ -1203,7 +1225,7 @@ }, "promises": { "Package": "promises", - "Version": "1.3.1", + "Version": "1.3.2", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1215,7 +1237,7 @@ "rlang", "stats" ], - "Hash": "08049fb8ae7205a0f8f83da772761e3a" + "Hash": "c84fd4f75ea1f5434735e08b7f50fbca" }, "proxy": { "Package": "proxy", @@ -1580,6 +1602,18 @@ ], "Hash": "8f138ff2c8fbea9e0a523f6c399c0386" }, + "snakecase": { + "Package": "snakecase", + "Version": "0.11.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "stringi", + "stringr" + ], + "Hash": 
"58767e44739b76965332e8a4fe3f91f1" + }, "stringdist": { "Package": "stringdist", "Version": "0.9.12",