added function to restore types in ARDs

insightsengineering · Nov 1, 2024 · fd17eb7 · fd17eb7
1 parent 9c0f38c
commit fd17eb7
Show file tree

Hide file tree

Showing 15 changed files with 269 additions and 46 deletions.
diff --git a/R/ard_categorical.survey.design.R b/R/ard_categorical.survey.design.R
@@ -27,7 +27,7 @@
 #'   a named list, a list of formulas,
 #'   or a single formula where the list element is a named list of functions
 #'   (or the RHS of a formula),
-#'   e.g. `list(mpg = list(mean = \(x) round(x, digits = 2) |> as.character))`.
+#'   e.g. `list(mpg = list(mean = \(x) round(x, digits = 2) |> as.character()))`.
 #' @param stat_label ([`formula-list-selector`][cards::syntax])\cr
 #'   a named list, a list of formulas, or a single formula where
 #'   the list element is either a named list or a list of formulas defining the
@@ -207,6 +207,7 @@ ard_categorical.survey.design <- function(data,
 
   # return final object --------------------------------------------------------
   cards |>
+    .restore_original_column_types(data = data$variables) |>
     dplyr::mutate(
       context = "categorical",
       warning = list(NULL),

diff --git a/R/ard_categorical_ci.survey.design.R b/R/ard_categorical_ci.survey.design.R
@@ -145,12 +145,13 @@ ard_categorical_ci.survey.design <- function(data,
       fmt_fn = map(.data$stat, ~ case_switch(is.numeric(.x) ~ 2L, .default = as.character))
     ) |>
     cards::as_card() |>
-    cards::tidy_ard_column_order()
+    cards::tidy_ard_column_order() |>
+    .restore_original_column_types(data = data$variables)
 
   # if a value was passed for the variable, subset on those results
   if (!is_empty(value)) {
     df_full <- df_full |>
-      dplyr::filter(.data$variable_level %in% .env$value)
+      dplyr::filter(unlist(.data$variable_level) %in% .env$value)
   }
 
   df_full

diff --git a/R/ard_continuous.survey.design.R b/R/ard_continuous.survey.design.R
@@ -99,8 +99,8 @@ ard_continuous.survey.design <- function(data, variables, by = NULL,
         )
       }
     ) |>
-    dplyr::bind_rows()
-
+    dplyr::bind_rows() |>
+    .restore_original_column_types(data = data$variables)
 
   # add stat_labels ------------------------------------------------------------
   df_stats <-
@@ -319,3 +319,112 @@ accepted_svy_stats <- function(expand_quantiles = TRUE) {
       )
     )
 }
+
+
+# some operations coerce the variable types to character.
+# this function will convert the `_level` values to their original types
+.restore_original_column_types <- function(ard, data) {
+  # identify grouping variable names with associated levels --------------------
+  by <- character()
+  for (v in names(dplyr::select(ard, cards::all_ard_groups("names")))) {
+    if (paste0(v, "_level") %in% names(ard)) by <- c(by, ard[[v]][1]) # styler: off
+  }
+  variables <- character()
+  if ("variable" %in% names(ard)) {
+    variables <- ard[["variable"]] |> unique()
+  }
+
+  # if there are no levels to correct, then return ard as it is
+  if (is_empty(variables) && is_empty(by)) return(ard) # styler: off
+
+  # add an ID for sorting
+  ard$...ard_id_for_sorting... <- seq_len(nrow(ard))
+
+  # nest the raw data with original types --------------------------------------
+  if (!is_empty(variables)) {
+    if (!"variable_level" %in% names(ard)) df_variable_orginal_types <- unique(ard["variable"]) # styler: off
+    else if (!all(variables %in% names(data))) { # for survfit summaries, the times/probs var is not in the data
+      df_variable_orginal_types <- unique(ard[c("variable", "variable_level")])
+    }
+    else {
+      df_variable_orginal_types <-
+        map(
+          variables,
+          ~ cards::nest_for_ard(data, by = .x, include_data = FALSE) |>
+            stats::setNames(c("variable", "variable_level"))
+        ) |>
+        dplyr::bind_rows()
+    }
+  }
+  if (!is_empty(by)) {
+    df_by_orginal_types <-
+      cards::nest_for_ard(data, by = by, include_data = FALSE)
+  }
+
+  # combine groups and variables together
+  if (!is_empty(variables) && !is_empty(by)) {
+    df_original_types <-
+      dplyr::cross_join(df_by_orginal_types, df_variable_orginal_types)
+  }
+  else if (!is_empty(variables)) {
+    df_original_types <- df_variable_orginal_types
+  }
+  else if (!is_empty(by)) {
+    df_original_types <- df_by_orginal_types
+  }
+
+  # unlisting the sorting according the character value
+  df_original_types <- df_original_types |>
+    dplyr::arrange(across(everything(), ~map(., as.character) |> unlist(.)))
+
+  ard_nested <- ard |>
+    tidyr::nest(..ard_data... = -c(cards::all_ard_groups(), cards::all_ard_variables())) |>
+    dplyr::arrange(across(c(cards::all_ard_groups(), cards::all_ard_variables()), unlist))
+
+  # if all columns match, then replace the coerced character cols with their original type/class
+  all_cols_equal <-
+    every(
+      names(df_original_types) |> setdiff("variable_level"),
+      ~ all(
+        unlist(ard_nested[[.x]]) == as.character(unlist(df_original_types[[.x]])) |
+          (is.na(unlist(ard_nested[[.x]])) & is.na(unlist(df_original_types[[.x]])))
+      )
+    )
+  # the variable level needs to be handled separately because there can be mixed type and we can't unlist
+  if (isTRUE(all_cols_equal) && "variable_level" %in% names(df_original_types)) {
+    all_cols_equal <-
+      seq_len(nrow(df_original_types)) |>
+      map_lgl(
+        ~identical(
+          as.character(df_original_types[["variable_level"]][[.x]]),
+          as.character(ard_nested[["variable_level"]][[.x]])
+        )
+      ) |>
+      all()
+  }
+
+  if (isTRUE(all_cols_equal)) {
+    return(
+      dplyr::bind_cols(
+        df_original_types,
+        dplyr::select(ard_nested, -all_of(names(df_original_types))),
+        .name_repair = "minimal"
+      ) |>
+        tidyr::unnest(cols = "..ard_data...") |>
+        dplyr::arrange(.data$...ard_id_for_sorting...) |>
+        dplyr::select(-"...ard_id_for_sorting...") |>
+        cards::as_card()
+    )
+  }
+
+  # I hope this message is never triggered!
+  cli::cli_inform(c(
+    "If you see this message, variable levels have been coerced to character, which could cause downstream issues.",
+    "*" = "Please post a reproducible example to {.url https://github.com/insightsengineering/cardx/issues/new}, so we can address in the next release.",
+    "i" = "You can create a minimal reproducible example with {.fun reprex::reprex}."
+  ))
+
+  ard |>
+    dplyr::arrange(.data$...ard_id_for_sorting...) |>
+    dplyr::select(-"...ard_id_for_sorting...")
+}
diff --git a/R/ard_continuous_ci.survey.design.R b/R/ard_continuous_ci.survey.design.R
@@ -74,7 +74,8 @@ ard_continuous_ci.survey.design <- function(data,
     method = method,
     df = df,
     ...
-  )
+  ) |>
+    .restore_original_column_types(data = data$variables)
 }
 
 .calculate_ard_continuous_survey_ci <- function(FUN, data, variables, by, conf.level, ...) {

diff --git a/R/ard_survival_survfit.R b/R/ard_survival_survfit.R
@@ -179,7 +179,8 @@ ard_survival_survfit.data.frame <- function(x, y, variables,
     package = "survival",
     method.args = {{ method.args }}
   ) |>
-    ard_survival_survfit(times = times, probs = probs, type = type)
+      ard_survival_survfit(times = times, probs = probs, type = type) |>
+      .restore_original_column_types(data = x)
 }
 
 #' Process Survival Fit For Time Estimates

diff --git a/man/ard_categorical.survey.design.Rd b/man/ard_categorical.survey.design.Rd
diff --git a/man/ard_dichotomous.survey.design.Rd b/man/ard_dichotomous.survey.design.Rd
diff --git a/man/ard_missing.survey.design.Rd b/man/ard_missing.survey.design.Rd
diff --git a/tests/testthat/_snaps/ard_categorical_ci.survey.design.md b/tests/testthat/_snaps/ard_categorical_ci.survey.design.md
@@ -1,28 +1,30 @@
 # ard_categorical_ci(data)
 
     Code
-      dplyr::select(as.data.frame(ard_categorical_ci(dclus1, variables = c(both,
-        awards))), -warning, -error, -fmt_fn, -context)
+      dplyr::select(ard_categorical_ci(dclus1, variables = c(both, awards)), -warning,
+      -error, -fmt_fn, -context)
+    Message
+      {cards} data frame: 20 x 5
     Output
-         variable variable_level  stat_name stat_label      stat
-      1      both             No   estimate   estimate  0.273224
-      2      both             No   conf.low   conf.low 0.2131745
-      3      both             No  conf.high  conf.high  0.342819
-      4      both             No     method     method     logit
-      5      both             No conf.level conf.level      0.95
-      6      both            Yes   estimate   estimate  0.726776
-      7      both            Yes   conf.low   conf.low  0.657181
-      8      both            Yes  conf.high  conf.high 0.7868255
-      9      both            Yes     method     method     logit
-      10     both            Yes conf.level conf.level      0.95
-      11   awards             No   estimate   estimate 0.2896175
-      12   awards             No   conf.low   conf.low 0.2241835
-      13   awards             No  conf.high  conf.high 0.3651608
-      14   awards             No     method     method     logit
-      15   awards             No conf.level conf.level      0.95
-      16   awards            Yes   estimate   estimate 0.7103825
-      17   awards            Yes   conf.low   conf.low 0.6348392
-      18   awards            Yes  conf.high  conf.high 0.7758165
-      19   awards            Yes     method     method     logit
-      20   awards            Yes conf.level conf.level      0.95
+         variable variable_level  stat_name stat_label  stat
+      1      both             No   estimate   estimate 0.273
+      2      both             No   conf.low   conf.low 0.213
+      3      both             No  conf.high  conf.high 0.343
+      4      both             No     method     method logit
+      5      both             No conf.level  conf.lev…  0.95
+      6      both            Yes   estimate   estimate 0.727
+      7      both            Yes   conf.low   conf.low 0.657
+      8      both            Yes  conf.high  conf.high 0.787
+      9      both            Yes     method     method logit
+      10     both            Yes conf.level  conf.lev…  0.95
+      11   awards             No   estimate   estimate  0.29
+      12   awards             No   conf.low   conf.low 0.224
+      13   awards             No  conf.high  conf.high 0.365
+      14   awards             No     method     method logit
+      15   awards             No conf.level  conf.lev…  0.95
+      16   awards            Yes   estimate   estimate  0.71
+      17   awards            Yes   conf.low   conf.low 0.635
+      18   awards            Yes  conf.high  conf.high 0.776
+      19   awards            Yes     method     method logit
+      20   awards            Yes conf.level  conf.lev…  0.95
 
diff --git a/tests/testthat/_snaps/ard_survival_survfit.md b/tests/testthat/_snaps/ard_survival_survfit.md
@@ -364,7 +364,7 @@
 # ard_survival_survfit.data.frame() works as expected
 
     Code
-      res_quo <- print(dplyr::mutate(ard_survival_survfit.data.frame(x = mtcars, y = "survival::Surv(mpg, am)",
+      res_quo <- print(dplyr::mutate(ard_survival_survfit(x = mtcars, y = "survival::Surv(mpg, am)",
         variables = "vs", times = 20, method.args = list(start.time = 0, id = cyl)),
       stat = lapply(stat, function(x) ifelse(is.numeric(x), cards::round5(x, 3), x))),
       n = Inf)

diff --git a/tests/testthat/test-ard_categorical.survey.design.R b/tests/testthat/test-ard_categorical.survey.design.R
@@ -1324,3 +1324,47 @@ test_that("ard_categorical follows ard structure", {
       cards::check_ard_structure(method = FALSE)
   )
 })
+
+test_that("ard_categorical.survey.design() original types are retained", {
+  svy_titanic <-
+    survey::svydesign(
+      ~1,
+      data = as.data.frame(Titanic) |> dplyr::mutate(Class.dbl = as.numeric(Class),
+                                                     Class.int = as.integer(Class)),
+      weights = ~Freq
+    )
+
+  # factors and integer check
+  expect_silent(
+    ard <-
+      ard_categorical(svy_titanic, variables = c(Class, Age, Class.int, Class.dbl), by = Survived)
+  )
+  expect_equal(
+    unlist(ard$group1_level) |> levels(),
+    levels(as.data.frame(Titanic)$Survived)
+  )
+  expect_true(
+    dplyr::filter(ard, variable %in% "Class") |>
+      dplyr::pull("variable_level") |>
+      getElement(1L) |>
+      is.factor()
+  )
+  expect_true(
+    dplyr::filter(ard, variable %in% "Age") |>
+      dplyr::pull("variable_level") |>
+      getElement(1L) |>
+      is.factor()
+  )
+  expect_true(
+    dplyr::filter(ard, variable %in% "Class.int") |>
+      dplyr::pull("variable_level") |>
+      getElement(1L) |>
+      is.integer()
+  )
+  expect_true(
+    dplyr::filter(ard, variable %in% "Class.dbl") |>
+      dplyr::pull("variable_level") |>
+      getElement(1L) |>
+      is.numeric()
+  )
+})
diff --git a/tests/testthat/test-ard_categorical_ci.survey.design.R b/tests/testthat/test-ard_categorical_ci.survey.design.R
@@ -6,7 +6,6 @@ dclus1 <- survey::svydesign(id = ~dnum, weights = ~pw, data = apiclus1, fpc = ~f
 test_that("ard_categorical_ci(data)", {
   expect_snapshot(
     ard_categorical_ci(dclus1, variables = c(both, awards)) |>
-      as.data.frame() |>
       dplyr::select(-warning, -error, -fmt_fn, -context)
   )
 })
@@ -17,7 +16,7 @@ test_that("ard_categorical_ci(variables)", {
   )
 
   expect_equal(
-    cards::get_ard_statistics(ard, variable %in% "both", variable_level %in% "No")[c("estimate", "conf.low", "conf.high")],
+    cards::get_ard_statistics(ard, variable %in% "both", map(variable_level, as.character) %in% "No")[c("estimate", "conf.low", "conf.high")],
     survey::svyciprop(~ I(both == "No"), design = dclus1, method = "logit", level = 0.95) %>%
       {c(as.list(.), as.list(attr(., "ci")))} |> # styler: off
       set_names(c("estimate", "conf.low", "conf.high"))
@@ -56,9 +55,9 @@ test_that("ard_categorical_ci(by)", {
   expect_equal(
     cards::get_ard_statistics(
       ard,
-      group1_level %in% "No",
+      map(group1_level, as.character) %in% "No",
       variable %in% "both",
-      variable_level %in% "No",
+      map(variable_level, as.character) %in% "No",
       stat_name %in% c("estimate", "conf.low", "conf.high")
     ),
     survey::svyciprop(~ I(both == "No"), design = dclus1 |> subset(sch.wide == "No")) %>%
@@ -92,7 +91,7 @@ test_that("ard_categorical_ci(conf.level)", {
   )
 
   expect_equal(
-    cards::get_ard_statistics(ard, variable %in% "both", variable_level == "No", stat_name %in% c("estimate", "conf.low", "conf.high")),
+    cards::get_ard_statistics(ard, variable %in% "both", map(variable_level, as.character) %in% "No", stat_name %in% c("estimate", "conf.low", "conf.high")),
     survey::svyciprop(~ I(both == "No"), design = dclus1, level = 0.80, df = survey::degf(dclus1)) %>%
       {c(as.list(.), as.list(attr(., "ci")))} |> # styler: off
       set_names(c("estimate", "conf.low", "conf.high"))
@@ -105,11 +104,31 @@ test_that("ard_categorical_ci(method)", {
   )
 
   expect_equal(
-    cards::get_ard_statistics(ard, variable %in% "both", variable_level == "No", stat_name %in% c("estimate", "conf.low", "conf.high")),
+    cards::get_ard_statistics(ard, variable %in% "both", map(variable_level, as.character) %in% "No", stat_name %in% c("estimate", "conf.low", "conf.high")),
     survey::svyciprop(~ I(both == "No"), design = dclus1, method = "likelihood", df = survey::degf(dclus1)) %>%
       {c(as.list(.), as.list(attr(., "ci")))} |> # styler: off
       set_names(c("estimate", "conf.low", "conf.high"))
   )
+
+  # check type
+  expect_true(ard$variable_level |> unique() |> map_lgl(is.factor) |> all())
+})
+
+test_that("ard_categorical_ci(value)", {
+  data(api, package = "survey")
+  dclus1 <- survey::svydesign(id = ~dnum, weights = ~pw, data = apiclus1, fpc = ~fpc)
+
+  expect_equal(
+    ard_categorical_ci(dclus1, variables = sch.wide, value = sch.wide ~ "Yes", method = "xlogit"),
+    ard_categorical_ci(dclus1, variables = sch.wide, method = "xlogit") |>
+      dplyr::filter(unlist(variable_level) %in% "Yes")
+  )
+
+  expect_equal(
+    ard_categorical_ci(dclus1, variables = c(sch.wide, both), value = list(sch.wide ~ "Yes", both ~ "Yes"), method = "xlogit"),
+    ard_categorical_ci(dclus1, variables = c(sch.wide, both), method = "xlogit") |>
+      dplyr::filter(map(variable_level, as.character) %in% "Yes")
+  )
 })
 
 test_that("ard_categorical_ci.survey.design() follows ard structure", {