Merge pull request PSLmodels#317 from PSLmodels/PR-add-targets-shared-down-from-tmd

PR add targets shared down from tmd
donboyd5 · Dec 1, 2024 · 33a1f10 · 33a1f10
2 parents 1e4bff0 + 0be7ae8
commit 33a1f10
Show file tree

Hide file tree

Showing 9 changed files with 402 additions and 111 deletions.
diff --git a/tmd/areas/targets/prepare/prepare_states/R/libraries.R b/tmd/areas/targets/prepare/prepare_states/R/libraries.R
@@ -25,6 +25,8 @@ library(tidycensus)
 library(tigris)
 options(tigris_use_cache = TRUE)
 
+library(janitor)
+
 
 # possible libraries ------------------------------------------------------
 

diff --git a/...repare_states/developing_SALT_targets.qmd → .../prepare/prepare_states/SALT_analysis.qmd b/...repare_states/developing_SALT_targets.qmd → .../prepare/prepare_states/SALT_analysis.qmd
@@ -131,18 +131,18 @@ Unfortunately, the published SOI Historical Table 2 data do not capture potentia
 basesort <- c("18400", "18500", "18425", "18450", "18800", "18460", "18300")
 
 tabdata <- soilong |> 
-  filter(basevname %in% basesort, stabbr=="US", agistub==0, year==2021) |> 
+  filter(basesoivname %in% basesort, stabbr=="US", agistub==0, year==2021) |> 
   mutate(description=udescription[vtype=="amount"],
          description=str_remove(description, " amount"),
-         .by=basevname) |> 
-  select(basevname, vtype, value, description) |> 
+         .by=basesoivname) |> 
+  select(basesoivname, vtype, value, description) |> 
   pivot_wider(names_from = vtype) |> 
-  select(basevname, description, count, amount) |> 
-  mutate(basevname=factor(basevname, levels=basesort)) |> 
-  arrange(basevname)
+  select(basesoivname, description, count, amount) |> 
+  mutate(basesoivname=factor(basesoivname, levels=basesort)) |> 
+  arrange(basesoivname)
 
 tabdata |> 
-  mutate(vgroup=basevname %in% basesort[1:2]) |> 
+  mutate(vgroup=basesoivname %in% basesort[1:2]) |> 
   gt() |> 
   cols_hide(vgroup) |> 
   tab_header(title="Actual SALT deductions in 2021 IRS-published data",
@@ -183,21 +183,21 @@ The table below shows how SALT amounts have changed over time. We can see the la
 basesort <- c("18400", "18500", "18425", "18450", "18800", "18460", "18300")
 
 soilong |> 
-  filter(basevname %in% basesort, stabbr=="US", agistub==0) |> 
-  select(stabbr, basevname, vname, vtype, year, value, udescription) |> 
+  filter(basesoivname %in% basesort, stabbr=="US", agistub==0) |> 
+  select(stabbr, basesoivname, soivname, vtype, year, value, udescription) |> 
   pivot_wider(names_from = year) |> 
-  mutate(basevname=factor(basevname, levels=basesort)) |> 
+  mutate(basesoivname=factor(basesoivname, levels=basesort)) |> 
   mutate(udescription=udescription[vtype=="amount"],
-         .by=basevname) |> 
-  arrange(vtype, basevname) |> 
-  select(-vname) |> 
+         .by=basesoivname) |> 
+  arrange(vtype, basesoivname) |> 
+  select(-soivname) |> 
   gt() |> 
   tab_header(title=html("Actual SALT variables for the U.S. over time<br>Amounts in $ billions, counts in millions"),
              subtitle = "Source: IRS SOI Historical Table 2") |> 
-  fmt_currency(columns = -c(stabbr, basevname, vtype, udescription), 
+  fmt_currency(columns = -c(stabbr, basesoivname, vtype, udescription), 
                rows = vtype=="amount",
                scale=1e-9, decimals=2) |> 
-  fmt_number(columns = -c(stabbr, basevname, vtype, udescription), 
+  fmt_number(columns = -c(stabbr, basesoivname, vtype, udescription), 
                rows = vtype=="count",
                scale=1e-6, decimals=2) |> 
   sub_missing(columns=everything(),
@@ -216,7 +216,7 @@ Note that in the IRS SOI data, there are no deductions at all on returns with AG
 #| label: income-sales-tax-by-agirange-amounts-over-time
 
 soilong |> 
-  filter(vname=="a18400", stabbr=="US") |> 
+  filter(soivname=="a18400", stabbr=="US") |> 
   select(year, agistub, agilabel, value) |> 
   pivot_wider(names_from = year) |> 
   gt() |> 
@@ -240,16 +240,16 @@ We do this both statistically (correlation coefficients) and graphically.
 
 salt <- soilong |> 
   filter(year %in% c(2017, 2018, 2021),
-         basevname %in% c("18400", "18500"),
+         basesoivname %in% c("18400", "18500"),
          !stabbr %in% c("US", "OA", "PR")) |> 
   mutate(agistubf=factor(agistub, levels=agilabels$agistub, labels=agilabels$agilabel)) |> 
-  select(stabbr, year, basevname, vtype, udescription, agistub, agistubf, value) |> 
+  select(stabbr, year, basesoivname, vtype, udescription, agistub, agistubf, value) |> 
   pivot_wider(names_from = year, names_prefix = "y")
 
 saltshares <- salt |> 
   mutate(across(starts_with("y"),
                 \(x) x / sum(x)),
-                .by=c(agistub, basevname, vtype))
+                .by=c(agistub, basesoivname, vtype))
 
 ```
 
@@ -273,16 +273,16 @@ corrs <- saltshares |>
   filter(agistub != 1) |> 
   summarise(cor2017_2018=cor(y2017, y2018, use = "complete.obs"),
             cor2018_2021=cor(y2018, y2021, use = "complete.obs"),
-            .by=c(basevname, vtype, udescription, agistub, agistubf)) |>
+            .by=c(basesoivname, vtype, udescription, agistub, agistubf)) |>
   mutate(agistub=factor(agistub),
-         udescription=ifelse(basevname=="18400" & vtype=="count",
+         udescription=ifelse(basesoivname=="18400" & vtype=="count",
                              "Number of returns with state and local income or sales taxes (estimated)",
                              udescription))
 
 corrs |> 
   summarise(across(c(cor2017_2018, cor2018_2021),
                    list(min=min, max=max)),
-            .by=c(basevname, vtype, udescription)) |> 
+            .by=c(basesoivname, vtype, udescription)) |> 
   gt() |> 
   tab_header(title="Min and max correlations across states for state SALT variables as share of national total",
              subtitle = "Comparing 2017 to 2018, and 2018 to 2021") |> 
@@ -294,7 +294,7 @@ corrs |>
              cor2017_2018_max = "max") |> 
   cols_label(cor2018_2021_min= "min",
              cor2018_2021_max = "max") |> 
-  fmt_number(-c(basevname, vtype, udescription),
+  fmt_number(-c(basesoivname, vtype, udescription),
              decimals=3)
 
 corrs |> 
@@ -303,7 +303,7 @@ corrs |>
              subtitle = "Comparing 2017 to 2018, and 2018 to 2021") |> 
   cols_label(cor2017_2018 = "Correlation between 2017 and 2018",
              cor2018_2021= "Correlation between 2018 and 2021") |> 
-  fmt_number(-c(basevname, vtype, udescription),
+  fmt_number(-c(basesoivname, vtype, udescription),
              decimals=3)
 
 ```
@@ -325,7 +325,7 @@ ub <- .075
 saltshares |> 
   filter(!stabbr %in% c("CA", "NY")) |> 
   filter(!agistub %in% c(0, 1)) |> 
-  filter(basevname=="18400", vtype=="amount") |> 
+  filter(basesoivname=="18400", vtype=="amount") |> 
   ggplot(aes(x=y2018, y=y2017)) +
   geom_point(colour="blue",
              size=0.5) +
@@ -354,7 +354,7 @@ ub <- .11
 saltshares |> 
   filter(!stabbr %in% c("CA", "NY")) |> 
   filter(!agistub %in% c(0, 1)) |> 
-  filter(basevname=="18500", vtype=="amount") |> 
+  filter(basesoivname=="18500", vtype=="amount") |> 
   ggplot(aes(x=y2018, y=y2017)) +
   geom_point(colour="blue",
              size=0.5) +
@@ -382,10 +382,10 @@ Finally, all of the state shares are shown in the filter-able and sortable table
 
 saltshares |> 
   filter(agistub != 1) |> 
-  select(stabbr, basevname, vtype, udescription, agistub, agistubf, y2017, y2018, y2021) |> 
+  select(stabbr, basesoivname, vtype, udescription, agistub, agistubf, y2017, y2018, y2021) |> 
   mutate(y2018m2017=y2018 - y2017,
          y2021m2018=y2021 - y2018,
-         across(c(stabbr, basevname, vtype, udescription, agistub),
+         across(c(stabbr, basesoivname, vtype, udescription, agistub),
                 \(x) factor(x))) |> 
   DT::datatable(rownames = FALSE,
                 caption = htmltools::tags$caption(

diff --git a/tmd/areas/targets/prepare/prepare_states/_quarto.yml b/tmd/areas/targets/prepare/prepare_states/_quarto.yml
@@ -45,10 +45,12 @@ book:
         - construct_long_soi_data_file.qmd
     - part: "Analysis of SALT variables and other issues"
       chapters:
-        - developing_SALT_targets.qmd
-    - part: "Create basefile for state targets"
+        - SALT_analysis.qmd
+    - part: "Create data from which to extract state target files"
       chapters: 
         - create_state_targets_basefile.qmd
+        - create_additional_state_targets.qmd
+        - combine_base_and_additional_targets.qmd
         # old files maybe use as base for new work
         # - cd_create_variable_mapping.qmd
         # - cd_compare_us_totals_tmd_vs_irs_published.qmd

diff --git a/tmd/areas/targets/prepare/prepare_states/combine_base_and_additional_targets.qmd b/tmd/areas/targets/prepare/prepare_states/combine_base_and_additional_targets.qmd
@@ -0,0 +1,108 @@
+---
+output: html_document
+editor_options: 
+  chunk_output_type: console
+---
+
+# Combine base and additional target files
+
+
+```{r}
+#| label: setup
+#| output: false
+
+suppressPackageStartupMessages(source(here::here("R", "libraries.R")))
+source(here::here("R", "constants.R"))
+source(here::here("R", "functions.R"))
+
+```
+
+
+## Stack basefile targets and additional targets
+
+```{r}
+#| label: stack-targets
+#| output: false
+
+base_targets <- read_csv(fs::path(DINTERMEDIATE, "base_targets.csv"))
+additional_targets <- read_csv(fs::path(DINTERMEDIATE, "additional_targets.csv"))
+ns(additional_targets)
+
+setdiff(names(base_targets), names(additional_targets)) # none missing
+setdiff(names(additional_targets), names(base_targets)) # see below
+# "soi_ussum" "soi_share" "tmdvar" "tmdsum"
+# we can drop all of these
+
+# re-examine additional targets
+glimpse(additional_targets)
+count(additional_targets, basesoivname, soivname, description)
+count(additional_targets, tmdvar, basesoivname, soivname, description)
+
+stack <- bind_rows(base_targets, 
+                   additional_targets |> 
+                     select(all_of(names(base_targets)))) |> 
+  mutate(sort=ifelse(basesoivname=="XTOT" &
+                       soivname=="XTOT" &
+                       scope==0 &
+                       str_detect(description, "population"),
+                     1, NA_real_)) |> 
+  # temporary flag: sort is 1 for the population record and NA for all others; arrange() puts NA last, so population sorts first
+  # set desired row order: by state, population record first, then scope, fstatus, basesoivname, count, agistub
+  arrange(stabbr, sort, scope, fstatus, basesoivname, count, agistub) |> 
+  # overwrite the temporary flag with a sequential row number within each state
+  mutate(sort=row_number(), .by=stabbr) |> 
+  select(stabbr, sort, count, scope, agilo, agihi, fstatus, target, basesoivname, soivname, description, agistub, agilabel)
+
+# varname,count,scope,agilo,agihi,fstatus,target
+check <- stack |> filter(stabbr=="NY")
+check2 <- count(check, basesoivname, soivname, description)
+
+write_csv(stack, fs::path(DINTERMEDIATE, "enhanced_targets.csv"))
+
+```
+
+
+
+## Additional notes
+
+```{r}
+#| label: notes
+#| output: false
+
+# documentation for the targets.csv data file
+
+# sample file excerpt
+# varname,count,scope,agilo,agihi,fstatus,target
+# XTOT,       0,    0,-9e99, 9e99,      0,  33e6
+# e00300,     0,    1,-9e99, 9e99,      0,  20e9
+# e00900,     0,    1,-9e99, 9e99,      0,  30e9
+# e00200,     0,    1,-9e99, 9e99,      0,1000e9
+# e02000,     0,    1,-9e99, 9e99,      0,  30e9
+# e02400,     0,    1,-9e99, 9e99,      0,  60e9
+
+# varname: any Tax-Calculator input variable name plus any Tax-Calculator calculated variable in the list of cached variables in the tmd/storage/__init__.py file
+# count: integer in [0,4] range:
+# count==0 implies dollar total of varname is tabulated
+# count==1 implies number of tax units with any value of varname is tabulated
+# count==2 implies number of tax units with a nonzero value of varname is tabulated
+# count==3 implies number of tax units with a positive value of varname is tabulated
+# count==4 implies number of tax units with a negative value of varname is tabulated
+
+# scope: integer in [0,2] range:
+# scope==0 implies all tax units are tabulated
+# scope==1 implies only PUF-derived filing units are tabulated
+# scope==2 implies only CPS-derived filing units are tabulated
+
+# agilo: float representing lower bound of the AGI range (which is included in the range) that is tabulated.
+# agihi: float representing upper bound of the AGI range (which is excluded from the range) that is tabulated.
+
+# fstatus: integer in [0,5] range:
+# fstatus==0 implies all filing statuses are tabulated
+# other fstatus values imply that only the tax units whose Tax-Calculator MARS variable equals fstatus are included in the tabulation
+
+# target: target amount:
+# dollars if count==0
+# number of tax units if count>0
+
+```
+
diff --git a/tmd/areas/targets/prepare/prepare_states/construct_long_soi_data_file.qmd b/tmd/areas/targets/prepare/prepare_states/construct_long_soi_data_file.qmd
@@ -67,7 +67,7 @@ csvdata2 <- csvdata |>
   rename(stabbr=state, agistub=agi_stub) |> 
   mutate(year=as.integer(year)) |> 
   pivot_longer(-c(stabbr, year, agistub),
-               names_to = "vname") |> 
+               names_to = "soivname") |> 
   filter(!is.na(value))
 saveRDS(csvdata2, fs::path(DINTERMEDIATE, "soilong_raw.rds"))
 
@@ -85,14 +85,14 @@ glimpse(soilong_raw)
 
 # investigate the data to make sure it is correct
 check <- soilong_raw |> 
-  filter(str_sub(vname, 2, -1) %in% c("18425", "18450"))
+  filter(str_sub(soivname, 2, -1) %in% c("18425", "18450"))
 
 #.. 18400 State and local income or sales tax (estimated)
 est18400 <- soilong_raw |> 
-  filter(str_sub(vname, 2, -1) %in% c("18425", "18450")) |> 
-  mutate(vname=paste0(str_sub(vname, 1, 1), "18400")) |> 
+  filter(str_sub(soivname, 2, -1) %in% c("18425", "18450")) |> 
+  mutate(soivname=paste0(str_sub(soivname, 1, 1), "18400")) |> 
   summarise(value=sum(value),
-            .by=c(stabbr, agistub, year, vname))
+            .by=c(stabbr, agistub, year, soivname))
 glimpse(est18400)
 skim(est18400)
 
@@ -116,14 +116,14 @@ agilabels <- read_csv(fs::path(DINTERMEDIATE, "agilabels.csv"))
 
 soilong <- soilong1 |> 
   left_join(variable_descriptions,
-            by = join_by(vname, year)) |> 
+            by = join_by(soivname, year)) |> 
   left_join(agilabels, by = join_by(agistub)) |> 
   mutate(value=ifelse(vtype=="amount", value * 1000, value)) |> 
-  select(stabbr, vname, basevname, vtype, agistub, agilo, agihi, agilabel, year, value, udescription, description) |> 
-  arrange(stabbr, vname, basevname, vtype, agistub, year)
+  select(stabbr, soivname, basesoivname, vtype, agistub, agilo, agihi, agilabel, year, value, udescription, description) |> 
+  arrange(stabbr, soivname, basesoivname, vtype, agistub, year)
 
 skim(soilong)
-check <- count(soilong, basevname, vtype, vname, udescription)
+check <- count(soilong, basesoivname, vtype, soivname, udescription)
 
 saveRDS(soilong, fs::path(DINTERMEDIATE, "soilong.rds"))
 
@@ -145,7 +145,7 @@ count(soilong, stabbr) # 54: 50 states, DC, PR, OA, US
 
 soilong |>
   filter(is.na(vtype)) |>
-  count(vname) # should be zero recs
+  count(soivname) # should be zero recs
 
 soilong |>
   filter(is.na(vtype)) |> 
@@ -154,15 +154,13 @@ soilong |>
 # n17000 had been one of the variables with all-missing values in some years;
 # we have since dropped all missing values
 variable_descriptions |> 
-  filter(vname=="n17000") # Number of returns with Total medical and dental expense deduction
+  filter(soivname=="n17000") # Number of returns with Total medical and dental expense deduction
 
 soilong |> 
-  filter(stabbr=="NY", vname %in% c("n17000", "a17000"), agistub==0) |> 
-  select(stabbr, agistub, vname, vtype, year, value, udescription) |> 
+  filter(stabbr=="NY", soivname %in% c("n17000", "a17000"), agistub==0) |> 
+  select(stabbr, agistub, soivname, vtype, year, value, udescription) |> 
   arrange(vtype, year)
 
-
-
 ```