Added function to restore types in ARDs (#231)

**What changes are proposed in this pull request?** * Methods in the {survey} and {survival} packages do not retain inputs variables types in their outputs. We now are able retain these variable types in ARDs returned by `ard_continuous.survey.design()`, `ard_categorical.survey.design()`, `ard_continuous_ci.survey.design()`, `ard_categorical_ci.survey.design()`, and `ard_survival_survfit.data.frame()`. The survey results were being all converted to character, which resulted in incorrect ordering/sorting downstream. I was trying to handle each case separately, but in the end it's just easier to handle the issue at the source (despite this code being a bit ugly). -------------------------------------------------------------------------------- Pre-review Checklist (if item does not apply, mark is as complete) - [x] **All** GitHub Action workflows pass with a ✅ - [x] PR branch has pulled the most recent updates from master branch: `usethis::pr_merge_main()` - [x] If a bug was fixed, a unit test was added. - [x] If a new `ard_*()` function was added, it passes the ARD structural checks from `cards::check_ard_structure()`. - [x] If a new `ard_*()` function was added, `set_cli_abort_call()` has been set. - [x] If a new `ard_*()` function was added and it depends on another package (such as, `broom`), `is_pkg_installed("broom")` has been set in the function call and the following added to the roxygen comments: `@examplesIf do.call(asNamespace("cardx")$is_pkg_installed, list(pkg = "broom""))` - [x] Code coverage is suitable for any new functions/features (generally, 100% coverage for new code): `devtools::test_coverage()` Reviewer Checklist (if item does not apply, mark is as complete) - [ ] If a bug was fixed, a unit test was added. - [ ] Code coverage is suitable for any new functions/features: `devtools::test_coverage()` When the branch is ready to be merged: - [ ] Update `NEWS.md` with the changes from this pull request under the heading "`# cardx (development version)`". If there is an issue associated with the pull request, reference it in parentheses at the end update (see `NEWS.md` for examples). - [ ] **All** GitHub Action workflows pass with a ✅ - [ ] Approve Pull Request - [ ] Merge the PR. Please use "Squash and merge" or "Rebase and merge".
insightsengineering · Nov 4, 2024 · bc1c9b0 · bc1c9b0
1 parent e5f7b41
commit bc1c9b0
Show file tree

Hide file tree

Showing 17 changed files with 288 additions and 48 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,11 +1,13 @@
 # cardx 0.2.1.9011
 
-* Implemented `summary(extend=TRUE)` in `ard_survival_survfit()` to return results for time points out of bounds. (#224)
-
 * Added a `data.frame` method to `ard_survival_survfit()`.
 
 * Added a warning for incorrect formula type to `ard_survival_survfit()`. (#223)
 
+* Implemented `summary(extend=TRUE)` in `ard_survival_survfit()` to return results for time points out of bounds. (#224)
+
+* Methods in the {survey} and {survival} packages do not retain inputs variables types in their outputs. We now are able retain these variable types in ARDs returned by `ard_continuous.survey.design()`, `ard_categorical.survey.design()`, `ard_continuous_ci.survey.design()`, `ard_categorical_ci.survey.design()`, and `ard_survival_survfit.data.frame()` (and notably, _not_ in `ard_survival_survfit.survfit()`).
+
 # cardx 0.2.1
 
 ## New Features and Updates

diff --git a/R/ard_categorical.survey.design.R b/R/ard_categorical.survey.design.R
@@ -27,7 +27,7 @@
 #'   a named list, a list of formulas,
 #'   or a single formula where the list element is a named list of functions
 #'   (or the RHS of a formula),
-#'   e.g. `list(mpg = list(mean = \(x) round(x, digits = 2) |> as.character))`.
+#'   e.g. `list(mpg = list(mean = \(x) round(x, digits = 2) |> as.character()))`.
 #' @param stat_label ([`formula-list-selector`][cards::syntax])\cr
 #'   a named list, a list of formulas, or a single formula where
 #'   the list element is either a named list or a list of formulas defining the
@@ -207,6 +207,7 @@ ard_categorical.survey.design <- function(data,
 
   # return final object --------------------------------------------------------
   cards |>
+    .restore_original_column_types(data = data$variables) |>
     dplyr::mutate(
       context = "categorical",
       warning = list(NULL),

diff --git a/R/ard_categorical_ci.survey.design.R b/R/ard_categorical_ci.survey.design.R
@@ -145,12 +145,13 @@ ard_categorical_ci.survey.design <- function(data,
       fmt_fn = map(.data$stat, ~ case_switch(is.numeric(.x) ~ 2L, .default = as.character))
     ) |>
     cards::as_card() |>
-    cards::tidy_ard_column_order()
+    cards::tidy_ard_column_order() |>
+    .restore_original_column_types(data = data$variables)
 
   # if a value was passed for the variable, subset on those results
   if (!is_empty(value)) {
     df_full <- df_full |>
-      dplyr::filter(.data$variable_level %in% .env$value)
+      dplyr::filter(unlist(.data$variable_level) %in% .env$value)
   }
 
   df_full

diff --git a/R/ard_continuous.survey.design.R b/R/ard_continuous.survey.design.R
@@ -99,8 +99,8 @@ ard_continuous.survey.design <- function(data, variables, by = NULL,
         )
       }
     ) |>
-    dplyr::bind_rows()
-
+    dplyr::bind_rows() |>
+    .restore_original_column_types(data = data$variables)
 
   # add stat_labels ------------------------------------------------------------
   df_stats <-
@@ -319,3 +319,109 @@ accepted_svy_stats <- function(expand_quantiles = TRUE) {
       )
     )
 }
+
+
+# some operations coerce the variable types to character.
+# this function will convert the `_level` values to their original types
+.restore_original_column_types <- function(ard, data) {
+  # identify grouping variable names with associated levels --------------------
+  by <- character()
+  for (v in names(dplyr::select(ard, cards::all_ard_groups("names")))) {
+    if (paste0(v, "_level") %in% names(ard)) by <- c(by, ard[[v]][1]) # styler: off
+  }
+  variables <- character()
+  if ("variable" %in% names(ard)) {
+    variables <- ard[["variable"]] |> unique()
+  }
+
+  # if there are no levels to correct, then return ard as it is
+  if (is_empty(variables) && is_empty(by)) return(ard) # styler: off
+
+  # add an ID for sorting
+  ard$...ard_id_for_sorting... <- seq_len(nrow(ard))
+
+  # nest the raw data with original types --------------------------------------
+  if (!is_empty(variables)) {
+    if (!"variable_level" %in% names(ard)) df_variable_orginal_types <- unique(ard["variable"]) # styler: off
+    else if (!all(variables %in% names(data))) { # for survfit summaries, the times/probs var is not in the data
+      df_variable_orginal_types <- unique(ard[c("variable", "variable_level")])
+    } else {
+      df_variable_orginal_types <-
+        map(
+          variables,
+          ~ cards::nest_for_ard(data, by = .x, include_data = FALSE) |>
+            stats::setNames(c("variable", "variable_level"))
+        ) |>
+        dplyr::bind_rows()
+    }
+  }
+  if (!is_empty(by)) {
+    df_by_orginal_types <-
+      cards::nest_for_ard(data, by = by, include_data = FALSE)
+  }
+
+  # combine groups and variables together
+  if (!is_empty(variables) && !is_empty(by)) {
+    df_original_types <-
+      dplyr::cross_join(df_by_orginal_types, df_variable_orginal_types)
+  } else if (!is_empty(variables)) {
+    df_original_types <- df_variable_orginal_types
+  } else if (!is_empty(by)) {
+    df_original_types <- df_by_orginal_types
+  }
+
+  # unlisting the sorting according the character value
+  df_original_types <- df_original_types |>
+    dplyr::arrange(across(everything(), ~ map(., as.character) |> unlist()))
+
+  ard_nested <- ard |>
+    tidyr::nest(..ard_data... = -c(cards::all_ard_groups(), cards::all_ard_variables())) |>
+    dplyr::arrange(across(c(cards::all_ard_groups(), cards::all_ard_variables()), unlist))
+
+  # if all columns match, then replace the coerced character cols with their original type/class
+  all_cols_equal <-
+    every(
+      names(df_original_types) |> setdiff("variable_level"),
+      ~ all(
+        unlist(ard_nested[[.x]]) == as.character(unlist(df_original_types[[.x]])) |
+          (is.na(unlist(ard_nested[[.x]])) & is.na(unlist(df_original_types[[.x]])))
+      )
+    )
+  # the variable level needs to be handled separately because there can be mixed type and we can't unlist
+  if (isTRUE(all_cols_equal) && "variable_level" %in% names(df_original_types)) {
+    all_cols_equal <-
+      seq_len(nrow(df_original_types)) |>
+      map_lgl(
+        ~ identical(
+          as.character(df_original_types[["variable_level"]][[.x]]),
+          as.character(ard_nested[["variable_level"]][[.x]])
+        )
+      ) |>
+      all()
+  }
+
+  if (isTRUE(all_cols_equal)) {
+    return(
+      dplyr::bind_cols(
+        df_original_types,
+        dplyr::select(ard_nested, -all_of(names(df_original_types))),
+        .name_repair = "minimal"
+      ) |>
+        tidyr::unnest(cols = "..ard_data...") |>
+        dplyr::arrange(.data$...ard_id_for_sorting...) |>
+        dplyr::select(-"...ard_id_for_sorting...") |>
+        cards::as_card()
+    )
+  }
+
+  # I hope this message is never triggered!
+  cli::cli_inform(c(
+    "If you see this message, variable levels have been coerced to character, which could cause downstream issues.",
+    "*" = "Please post a reproducible example to {.url https://github.com/insightsengineering/cardx/issues/new}, so we can address in the next release.",
+    "i" = "You can create a minimal reproducible example with {.fun reprex::reprex}."
+  ))
+
+  ard |>
+    dplyr::arrange(.data$...ard_id_for_sorting...) |>
+    dplyr::select(-"...ard_id_for_sorting...")
+}
diff --git a/R/ard_continuous_ci.survey.design.R b/R/ard_continuous_ci.survey.design.R
@@ -74,7 +74,8 @@ ard_continuous_ci.survey.design <- function(data,
     method = method,
     df = df,
     ...
-  )
+  ) |>
+    .restore_original_column_types(data = data$variables)
 }
 
 .calculate_ard_continuous_survey_ci <- function(FUN, data, variables, by, conf.level, ...) {

diff --git a/R/ard_survival_survfit.R b/R/ard_survival_survfit.R
@@ -52,6 +52,13 @@
 #' ```
 #' You **cannot**, however, pass a stored formula, e.g. `survfit(my_formula, lung)`
 #'
+#' @section Variable Classes:
+#' When the `survfit` method is called, the class of the stratifying variables
+#' will be returned as a factor.
+#'
+#' When the data frame method is called, the original classes are retained in the
+#' resulting ARD.
+#'
 #' @return an ARD data frame of class 'card'
 #' @name ard_survival_survfit
 #'
@@ -179,7 +186,8 @@ ard_survival_survfit.data.frame <- function(x, y, variables,
     package = "survival",
     method.args = {{ method.args }}
   ) |>
-    ard_survival_survfit(times = times, probs = probs, type = type)
+    ard_survival_survfit(times = times, probs = probs, type = type) |>
+    .restore_original_column_types(data = x)
 }
 
 #' Process Survival Fit For Time Estimates

diff --git a/man/ard_categorical.survey.design.Rd b/man/ard_categorical.survey.design.Rd
diff --git a/man/ard_dichotomous.survey.design.Rd b/man/ard_dichotomous.survey.design.Rd
diff --git a/man/ard_missing.survey.design.Rd b/man/ard_missing.survey.design.Rd
diff --git a/man/ard_survival_survfit.Rd b/man/ard_survival_survfit.Rd
diff --git a/tests/testthat/_snaps/ard_categorical_ci.survey.design.md b/tests/testthat/_snaps/ard_categorical_ci.survey.design.md
@@ -1,28 +1,30 @@
 # ard_categorical_ci(data)
 
     Code
-      dplyr::select(as.data.frame(ard_categorical_ci(dclus1, variables = c(both,
-        awards))), -warning, -error, -fmt_fn, -context)
+      dplyr::select(ard_categorical_ci(dclus1, variables = c(both, awards)), -warning,
+      -error, -fmt_fn, -context)
+    Message
+      {cards} data frame: 20 x 5
     Output
-         variable variable_level  stat_name stat_label      stat
-      1      both             No   estimate   estimate  0.273224
-      2      both             No   conf.low   conf.low 0.2131745
-      3      both             No  conf.high  conf.high  0.342819
-      4      both             No     method     method     logit
-      5      both             No conf.level conf.level      0.95
-      6      both            Yes   estimate   estimate  0.726776
-      7      both            Yes   conf.low   conf.low  0.657181
-      8      both            Yes  conf.high  conf.high 0.7868255
-      9      both            Yes     method     method     logit
-      10     both            Yes conf.level conf.level      0.95
-      11   awards             No   estimate   estimate 0.2896175
-      12   awards             No   conf.low   conf.low 0.2241835
-      13   awards             No  conf.high  conf.high 0.3651608
-      14   awards             No     method     method     logit
-      15   awards             No conf.level conf.level      0.95
-      16   awards            Yes   estimate   estimate 0.7103825
-      17   awards            Yes   conf.low   conf.low 0.6348392
-      18   awards            Yes  conf.high  conf.high 0.7758165
-      19   awards            Yes     method     method     logit
-      20   awards            Yes conf.level conf.level      0.95
+         variable variable_level  stat_name stat_label  stat
+      1      both             No   estimate   estimate 0.273
+      2      both             No   conf.low   conf.low 0.213
+      3      both             No  conf.high  conf.high 0.343
+      4      both             No     method     method logit
+      5      both             No conf.level  conf.lev…  0.95
+      6      both            Yes   estimate   estimate 0.727
+      7      both            Yes   conf.low   conf.low 0.657
+      8      both            Yes  conf.high  conf.high 0.787
+      9      both            Yes     method     method logit
+      10     both            Yes conf.level  conf.lev…  0.95
+      11   awards             No   estimate   estimate  0.29
+      12   awards             No   conf.low   conf.low 0.224
+      13   awards             No  conf.high  conf.high 0.365
+      14   awards             No     method     method logit
+      15   awards             No conf.level  conf.lev…  0.95
+      16   awards            Yes   estimate   estimate  0.71
+      17   awards            Yes   conf.low   conf.low 0.635
+      18   awards            Yes  conf.high  conf.high 0.776
+      19   awards            Yes     method     method logit
+      20   awards            Yes conf.level  conf.lev…  0.95
 
diff --git a/tests/testthat/_snaps/ard_survival_survfit.md b/tests/testthat/_snaps/ard_survival_survfit.md
@@ -364,7 +364,7 @@
 # ard_survival_survfit.data.frame() works as expected
 
     Code
-      res_quo <- print(dplyr::mutate(ard_survival_survfit.data.frame(x = mtcars, y = "survival::Surv(mpg, am)",
+      res_quo <- print(dplyr::mutate(ard_survival_survfit(x = mtcars, y = "survival::Surv(mpg, am)",
         variables = "vs", times = 20, method.args = list(start.time = 0, id = cyl)),
       stat = lapply(stat, function(x) ifelse(is.numeric(x), cards::round5(x, 3), x))),
       n = Inf)

diff --git a/tests/testthat/test-ard_categorical.survey.design.R b/tests/testthat/test-ard_categorical.survey.design.R
@@ -1324,3 +1324,49 @@ test_that("ard_categorical follows ard structure", {
       cards::check_ard_structure(method = FALSE)
   )
 })
+
+test_that("ard_categorical.survey.design() original types are retained", {
+  svy_titanic <-
+    survey::svydesign(
+      ~1,
+      data = as.data.frame(Titanic) |> dplyr::mutate(
+        Class.dbl = as.numeric(Class),
+        Class.int = as.integer(Class)
+      ),
+      weights = ~Freq
+    )
+
+  # factors and integer check
+  expect_silent(
+    ard <-
+      ard_categorical(svy_titanic, variables = c(Class, Age, Class.int, Class.dbl), by = Survived)
+  )
+  expect_equal(
+    unlist(ard$group1_level) |> levels(),
+    levels(as.data.frame(Titanic)$Survived)
+  )
+  expect_true(
+    dplyr::filter(ard, variable %in% "Class") |>
+      dplyr::pull("variable_level") |>
+      getElement(1L) |>
+      is.factor()
+  )
+  expect_true(
+    dplyr::filter(ard, variable %in% "Age") |>
+      dplyr::pull("variable_level") |>
+      getElement(1L) |>
+      is.factor()
+  )
+  expect_true(
+    dplyr::filter(ard, variable %in% "Class.int") |>
+      dplyr::pull("variable_level") |>
+      getElement(1L) |>
+      is.integer()
+  )
+  expect_true(
+    dplyr::filter(ard, variable %in% "Class.dbl") |>
+      dplyr::pull("variable_level") |>
+      getElement(1L) |>
+      is.numeric()
+  )
+})