Skip to content

Commit

Permalink
Merge pull request PSLmodels#317 from PSLmodels/PR-add-targets-shared…
Browse files Browse the repository at this point in the history
…-down-from-tmd

PR add targets shared down from tmd
  • Loading branch information
donboyd5 authored Dec 1, 2024
2 parents 1e4bff0 + 0be7ae8 commit 33a1f10
Show file tree
Hide file tree
Showing 9 changed files with 402 additions and 111 deletions.
2 changes: 2 additions & 0 deletions tmd/areas/targets/prepare/prepare_states/R/libraries.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ library(tidycensus)
library(tigris)
options(tigris_use_cache = TRUE)

library(janitor)


# possible libraries ------------------------------------------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,18 +131,18 @@ Unfortunately, the published SOI Historical Table 2 data do not capture potentia
basesort <- c("18400", "18500", "18425", "18450", "18800", "18460", "18300")
tabdata <- soilong |>
filter(basevname %in% basesort, stabbr=="US", agistub==0, year==2021) |>
filter(basesoivname %in% basesort, stabbr=="US", agistub==0, year==2021) |>
mutate(description=udescription[vtype=="amount"],
description=str_remove(description, " amount"),
.by=basevname) |>
select(basevname, vtype, value, description) |>
.by=basesoivname) |>
select(basesoivname, vtype, value, description) |>
pivot_wider(names_from = vtype) |>
select(basevname, description, count, amount) |>
mutate(basevname=factor(basevname, levels=basesort)) |>
arrange(basevname)
select(basesoivname, description, count, amount) |>
mutate(basesoivname=factor(basesoivname, levels=basesort)) |>
arrange(basesoivname)
tabdata |>
mutate(vgroup=basevname %in% basesort[1:2]) |>
mutate(vgroup=basesoivname %in% basesort[1:2]) |>
gt() |>
cols_hide(vgroup) |>
tab_header(title="Actual SALT deductions in 2021 IRS-published data",
Expand Down Expand Up @@ -183,21 +183,21 @@ The table below shows how SALT amounts have changed over time. We can see the la
basesort <- c("18400", "18500", "18425", "18450", "18800", "18460", "18300")
soilong |>
filter(basevname %in% basesort, stabbr=="US", agistub==0) |>
select(stabbr, basevname, vname, vtype, year, value, udescription) |>
filter(basesoivname %in% basesort, stabbr=="US", agistub==0) |>
select(stabbr, basesoivname, soivname, vtype, year, value, udescription) |>
pivot_wider(names_from = year) |>
mutate(basevname=factor(basevname, levels=basesort)) |>
mutate(basesoivname=factor(basesoivname, levels=basesort)) |>
mutate(udescription=udescription[vtype=="amount"],
.by=basevname) |>
arrange(vtype, basevname) |>
select(-vname) |>
.by=basesoivname) |>
arrange(vtype, basesoivname) |>
select(-soivname) |>
gt() |>
tab_header(title=html("Actual SALT variables for the U.S. over time<br>Amounts in $ billions, counts in millions"),
subtitle = "Source: IRS SOI Historical Table 2") |>
fmt_currency(columns = -c(stabbr, basevname, vtype, udescription),
fmt_currency(columns = -c(stabbr, basesoivname, vtype, udescription),
rows = vtype=="amount",
scale=1e-9, decimals=2) |>
fmt_number(columns = -c(stabbr, basevname, vtype, udescription),
fmt_number(columns = -c(stabbr, basesoivname, vtype, udescription),
rows = vtype=="count",
scale=1e-6, decimals=2) |>
sub_missing(columns=everything(),
Expand All @@ -216,7 +216,7 @@ Note that in the IRS SOI data, there are no deductions at all on returns with AG
#| label: income-sales-tax-by-agirange-amounts-over-time
soilong |>
filter(vname=="a18400", stabbr=="US") |>
filter(soivname=="a18400", stabbr=="US") |>
select(year, agistub, agilabel, value) |>
pivot_wider(names_from = year) |>
gt() |>
Expand All @@ -240,16 +240,16 @@ We do this both statistically (correlation coefficients) and graphically.
salt <- soilong |>
filter(year %in% c(2017, 2018, 2021),
basevname %in% c("18400", "18500"),
basesoivname %in% c("18400", "18500"),
!stabbr %in% c("US", "OA", "PR")) |>
mutate(agistubf=factor(agistub, levels=agilabels$agistub, labels=agilabels$agilabel)) |>
select(stabbr, year, basevname, vtype, udescription, agistub, agistubf, value) |>
select(stabbr, year, basesoivname, vtype, udescription, agistub, agistubf, value) |>
pivot_wider(names_from = year, names_prefix = "y")
saltshares <- salt |>
mutate(across(starts_with("y"),
\(x) x / sum(x)),
.by=c(agistub, basevname, vtype))
.by=c(agistub, basesoivname, vtype))
```

Expand All @@ -273,16 +273,16 @@ corrs <- saltshares |>
filter(agistub != 1) |>
summarise(cor2017_2018=cor(y2017, y2018, use = "complete.obs"),
cor2018_2021=cor(y2018, y2021, use = "complete.obs"),
.by=c(basevname, vtype, udescription, agistub, agistubf)) |>
.by=c(basesoivname, vtype, udescription, agistub, agistubf)) |>
mutate(agistub=factor(agistub),
udescription=ifelse(basevname=="18400" & vtype=="count",
udescription=ifelse(basesoivname=="18400" & vtype=="count",
"Number of returns with state and local income or sales taxes (estimated)",
udescription))
corrs |>
summarise(across(c(cor2017_2018, cor2018_2021),
list(min=min, max=max)),
.by=c(basevname, vtype, udescription)) |>
.by=c(basesoivname, vtype, udescription)) |>
gt() |>
tab_header(title="Min and max correlations across states for state SALT variables as share of national total",
subtitle = "Comparing 2017 to 2018, and 2018 to 2021") |>
Expand All @@ -294,7 +294,7 @@ corrs |>
cor2017_2018_max = "max") |>
cols_label(cor2018_2021_min= "min",
cor2018_2021_max = "max") |>
fmt_number(-c(basevname, vtype, udescription),
fmt_number(-c(basesoivname, vtype, udescription),
decimals=3)
corrs |>
Expand All @@ -303,7 +303,7 @@ corrs |>
subtitle = "Comparing 2017 to 2018, and 2018 to 2021") |>
cols_label(cor2017_2018 = "Correlation between 2017 and 2018",
cor2018_2021= "Correlation between 2018 and 2021") |>
fmt_number(-c(basevname, vtype, udescription),
fmt_number(-c(basesoivname, vtype, udescription),
decimals=3)
```
Expand All @@ -325,7 +325,7 @@ ub <- .075
saltshares |>
filter(!stabbr %in% c("CA", "NY")) |>
filter(!agistub %in% c(0, 1)) |>
filter(basevname=="18400", vtype=="amount") |>
filter(basesoivname=="18400", vtype=="amount") |>
ggplot(aes(x=y2018, y=y2017)) +
geom_point(colour="blue",
size=0.5) +
Expand Down Expand Up @@ -354,7 +354,7 @@ ub <- .11
saltshares |>
filter(!stabbr %in% c("CA", "NY")) |>
filter(!agistub %in% c(0, 1)) |>
filter(basevname=="18500", vtype=="amount") |>
filter(basesoivname=="18500", vtype=="amount") |>
ggplot(aes(x=y2018, y=y2017)) +
geom_point(colour="blue",
size=0.5) +
Expand Down Expand Up @@ -382,10 +382,10 @@ Finally, all of the state shares are shown in the filter-able and sortable table
saltshares |>
filter(agistub != 1) |>
select(stabbr, basevname, vtype, udescription, agistub, agistubf, y2017, y2018, y2021) |>
select(stabbr, basesoivname, vtype, udescription, agistub, agistubf, y2017, y2018, y2021) |>
mutate(y2018m2017=y2018 - y2017,
y2021m2018=y2021 - y2018,
across(c(stabbr, basevname, vtype, udescription, agistub),
across(c(stabbr, basesoivname, vtype, udescription, agistub),
\(x) factor(x))) |>
DT::datatable(rownames = FALSE,
caption = htmltools::tags$caption(
Expand Down
6 changes: 4 additions & 2 deletions tmd/areas/targets/prepare/prepare_states/_quarto.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,12 @@ book:
- construct_long_soi_data_file.qmd
- part: "Analysis of SALT variables and other issues"
chapters:
- developing_SALT_targets.qmd
- part: "Create basefile for state targets"
- SALT_analysis.qmd
- part: "Create data from which to extract state target files"
chapters:
- create_state_targets_basefile.qmd
- create_additional_state_targets.qmd
- combine_base_and_additional_targets.qmd
# old files maybe use as base for new work
# - cd_create_variable_mapping.qmd
# - cd_compare_us_totals_tmd_vs_irs_published.qmd
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
---
output: html_document
editor_options:
chunk_output_type: console
---

# Combine base and additional target files


```{r}
#| label: setup
#| output: false
# Load the project's shared setup scripts, resolved relative to the project
# root via here::here().
# libraries.R attaches the packages used below; startup messages are
# suppressed to keep output clean.
suppressPackageStartupMessages(source(here::here("R", "libraries.R")))
# NOTE(review): presumably defines DINTERMEDIATE (used for file paths in the
# next chunk) -- confirm against R/constants.R.
source(here::here("R", "constants.R"))
# NOTE(review): presumably defines the ns() helper called in the next chunk --
# confirm against R/functions.R.
source(here::here("R", "functions.R"))
```


## Stack basefile targets and additional targets

```{r}
#| label: stack-targets
#| output: false
# Read the two intermediate target files that are to be stacked.
base_targets <- read_csv(fs::path(DINTERMEDIATE, "base_targets.csv"))
additional_targets <- read_csv(
  fs::path(DINTERMEDIATE, "additional_targets.csv")
)

# Interactive checks: compare the column sets of the two files.
ns(additional_targets)
# Columns in base_targets that additional_targets lacks: none missing.
setdiff(names(base_targets), names(additional_targets))
# Columns found only in additional_targets:
#   "soi_ussum" "soi_share" "tmdvar" "tmdsum"
# All of these can be dropped.
setdiff(names(additional_targets), names(base_targets))

# Re-examine the additional targets before stacking.
glimpse(additional_targets)
count(additional_targets, basesoivname, soivname, description)
count(additional_targets, tmdvar, basesoivname, soivname, description)

# Stack base targets on top of the additional targets (restricted to the
# base-file columns), then number the records within each state.
stack <- base_targets |>
  bind_rows(select(additional_targets, all_of(names(base_targets)))) |>
  # Provisional sort key: 1 for the population record, NA for every other
  # record. arrange() places NA last, so the population record comes first
  # within each state.
  mutate(
    sort = ifelse(
      basesoivname == "XTOT" &
        soivname == "XTOT" &
        scope == 0 &
        str_detect(description, "population"),
      1,
      NA_real_
    )
  ) |>
  # Put the records into the desired order ...
  arrange(stabbr, sort, scope, fstatus, basesoivname, count, agistub) |>
  # ... and replace the provisional key with a running number per state.
  mutate(sort = row_number(), .by = stabbr) |>
  # Leading columns after stabbr/sort follow the targets.csv layout:
  #   varname,count,scope,agilo,agihi,fstatus,target
  select(
    stabbr, sort, count, scope, agilo, agihi, fstatus, target,
    basesoivname, soivname, description, agistub, agilabel
  )

# Spot-check a single state (NY) and the variables it contains.
check <- filter(stack, stabbr == "NY")
check2 <- check |>
  count(basesoivname, soivname, description)

# Save the combined targets.
write_csv(stack, fs::path(DINTERMEDIATE, "enhanced_targets.csv"))
```



## Additional notes

```{r}
#| label: notes
#| output: false
# documentation for the targets.csv data file
# sample file excerpt
# varname,count,scope,agilo,agihi,fstatus,target
# XTOT, 0, 0,-9e99, 9e99, 0, 33e6
# e00300, 0, 1,-9e99, 9e99, 0, 20e9
# e00900, 0, 1,-9e99, 9e99, 0, 30e9
# e00200, 0, 1,-9e99, 9e99, 0,1000e9
# e02000, 0, 1,-9e99, 9e99, 0, 30e9
# e02400, 0, 1,-9e99, 9e99, 0, 60e9
# varname: any Tax-Calculator input variable name plus any Tax-Calculator calculated variable in the list of cached variables in the tmd/storage/__init__.py file
# count: integer in [0,4] range:
# count==0 implies dollar total of varname is tabulated
# count==1 implies number of tax units with any value of varname is tabulated
# count==2 implies number of tax units with a nonzero value of varname is tabulated
# count==3 implies number of tax units with a positive value of varname is tabulated
# count==4 implies number of tax units with a negative value of varname is tabulated
# scope: integer in [0,2] range:
# scope==0 implies all tax units are tabulated
# scope==1 implies only PUF-derived filing units are tabulated
# scope==2 implies only CPS-derived filing units are tabulated
# agilo: float representing lower bound of the AGI range (which is included in the range) that is tabulated.
# agihi: float representing upper bound of the AGI range (which is excluded from the range) that is tabulated.
# fstatus: integer in [0,5] range:
# fstatus=0 implies all filing statuses are tabulated
# other fstatus values imply just the tax units with the Tax-Calculator MARS variable equal to fstatus are included in the tabulation
# target: target amount:
# dollars if count==0
# number of tax units if count>0
```

Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ csvdata2 <- csvdata |>
rename(stabbr=state, agistub=agi_stub) |>
mutate(year=as.integer(year)) |>
pivot_longer(-c(stabbr, year, agistub),
names_to = "vname") |>
names_to = "soivname") |>
filter(!is.na(value))
saveRDS(csvdata2, fs::path(DINTERMEDIATE, "soilong_raw.rds"))
Expand All @@ -85,14 +85,14 @@ glimpse(soilong_raw)
# investigate the data to make sure correct
check <- soilong_raw |>
filter(str_sub(vname, 2, -1) %in% c("18425", "18450"))
filter(str_sub(soivname, 2, -1) %in% c("18425", "18450"))
#.. 18400 State and local income or sales tax (estimated)
est18400 <- soilong_raw |>
filter(str_sub(vname, 2, -1) %in% c("18425", "18450")) |>
mutate(vname=paste0(str_sub(vname, 1, 1), "18400")) |>
filter(str_sub(soivname, 2, -1) %in% c("18425", "18450")) |>
mutate(soivname=paste0(str_sub(soivname, 1, 1), "18400")) |>
summarise(value=sum(value),
.by=c(stabbr, agistub, year, vname))
.by=c(stabbr, agistub, year, soivname))
glimpse(est18400)
skim(est18400)
Expand All @@ -116,14 +116,14 @@ agilabels <- read_csv(fs::path(DINTERMEDIATE, "agilabels.csv"))
soilong <- soilong1 |>
left_join(variable_descriptions,
by = join_by(vname, year)) |>
by = join_by(soivname, year)) |>
left_join(agilabels, by = join_by(agistub)) |>
mutate(value=ifelse(vtype=="amount", value * 1000, value)) |>
select(stabbr, vname, basevname, vtype, agistub, agilo, agihi, agilabel, year, value, udescription, description) |>
arrange(stabbr, vname, basevname, vtype, agistub, year)
select(stabbr, soivname, basesoivname, vtype, agistub, agilo, agihi, agilabel, year, value, udescription, description) |>
arrange(stabbr, soivname, basesoivname, vtype, agistub, year)
skim(soilong)
check <- count(soilong, basevname, vtype, vname, udescription)
check <- count(soilong, basesoivname, vtype, soivname, udescription)
saveRDS(soilong, fs::path(DINTERMEDIATE, "soilong.rds"))
Expand All @@ -145,7 +145,7 @@ count(soilong, stabbr) # 54: 50 states, DC, PR, OA, US
soilong |>
filter(is.na(vtype)) |>
count(vname) # should be zero recs
count(soivname) # should be zero recs
soilong |>
filter(is.na(vtype)) |>
Expand All @@ -154,15 +154,13 @@ soilong |>
# n17000 had been one of the all-missing values variables in some years
# we have since dropped all missing values
variable_descriptions |>
filter(vname=="n17000") # Number of returns with Total medical and dental expense deduction
filter(soivname=="n17000") # Number of returns with Total medical and dental expense deduction
soilong |>
filter(stabbr=="NY", vname %in% c("n17000", "a17000"), agistub==0) |>
select(stabbr, agistub, vname, vtype, year, value, udescription) |>
filter(stabbr=="NY", soivname %in% c("n17000", "a17000"), agistub==0) |>
select(stabbr, agistub, soivname, vtype, year, value, udescription) |>
arrange(vtype, year)
```


Expand Down
Loading

0 comments on commit 33a1f10

Please sign in to comment.