in progress

insightsengineering · Jun 14, 2024 · 9c5e3e3 · 9c5e3e3
1 parent bc7c3c3
commit 9c5e3e3
Show file tree

Hide file tree

Showing 12 changed files with 541 additions and 55 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -3,6 +3,8 @@
 S3method(ard_attributes,survey.design)
 S3method(ard_categorical,survey.design)
 S3method(ard_continuous,survey.design)
+S3method(ard_dichotomous,survey.design)
+S3method(ard_missing,survey.design)
 S3method(ard_regression,default)
 S3method(ard_stats_anova,anova)
 S3method(ard_stats_anova,data.frame)

diff --git a/R/ard_categorical.survey.design.R b/R/ard_categorical.survey.design.R
@@ -12,7 +12,7 @@
 #' @param data (`survey.design`)\cr
 #'   a design object often created with [`survey::svydesign()`].
 #' @param variables ([`tidy-select`][dplyr::dplyr_tidy_select])\cr
-#'   columns to include in summaries. Default is `everything()`.
+#'   columns to include in summaries.
 #' @param by ([`tidy-select`][dplyr::dplyr_tidy_select])\cr
 #'   results are calculated for **all combinations** of the column specified
 #'   and the variables. A single column may be specified.
@@ -43,7 +43,7 @@
 #'
 #' ard_categorical(svy_titanic, variables = c(Class, Age), by = Survived)
 ard_categorical.survey.design <- function(data,
-                                          variables = everything(),
+                                          variables,
                                           by = NULL,
                                           statistic = everything() ~ c("n", "N", "p", "p.std.error", "deff", "n_unweighted", "N_unweighted", "p_unweighted"),
                                           denominator = c("column", "row", "cell"),
@@ -52,9 +52,12 @@ ard_categorical.survey.design <- function(data,
                                                                            "n" = "Unweighted n", "N" = "Unweighted N", "p" = "Unweighted %"),
                                           ...) {
   set_cli_abort_call()
+  check_pkg_installed(pkg = "survey", reference_pkg = "cardx")
+  check_dots_empty()
   deff <- TRUE # we may update in the future to make this an argument for users
 
   # process arguments ----------------------------------------------------------
+  check_not_missing(variables)
   cards::process_selectors(
     data = data$variables,
     variables = {{ variables }},
@@ -81,7 +84,7 @@ ard_categorical.survey.design <- function(data,
     x = statistic,
     predicate = \(x) all(x %in% accepted_svy_stats),
     error_msg = c("Error in the values of the {.arg statistic} argument.",
-      i = "Values must be in {.val {accepted_svy_stats}}"
+                  i = "Values must be in {.val {accepted_svy_stats}}"
     )
   )
   denominator <- arg_match(denominator)
@@ -136,6 +139,13 @@ ard_categorical.survey.design <- function(data,
       by = any_of(by),
       denominator = denominator
     ) |>
+    # all the survey levels are reported as character, so we do the same here.
+    dplyr::mutate(
+      across(
+        c(cards::all_ard_groups("levels"), cards::all_ard_variables("levels")),
+        ~map(.x, as.character)
+      )
+    ) |>
     dplyr::select(-c("stat_label", "fmt_fn", "warning", "error")) |>
     dplyr::mutate(
       stat_name =
@@ -166,9 +176,10 @@ ard_categorical.survey.design <- function(data,
       context = "categorical",
       warning = list(NULL),
       error = list(NULL),
-    ) |>
+    )  |>
     cards::tidy_ard_column_order() %>%
-    {structure(., class = c("card", class(.)))} # styler: off
+    {structure(., class = c("card", class(.)))} |> # styler: off
+    cards::tidy_ard_row_order()
 }
 
 
@@ -177,24 +188,52 @@ ard_categorical.survey.design <- function(data,
   lapply(
     variables,
     \(variable) {
+      # convert the variable to a factor if not already one or a lgl, so we get the correct rate stats from svymean
+      if (!inherits(data$variables[[variable]], c("factor", "logical"))) {
+        data$variables[[variable]] <- factor(data$variables[[variable]])
+      }
+
+      # there are issues with svymean() when a variable has only one level. adding a second as needed
+      variable_lvls <- .unique_values_sort(data$variables, variable)
+      if (!is_empty(by)) by_lvls <- .unique_values_sort(data$variables, by) # styler: off
+      if (length(variable_lvls) == 1L) {
+        data$variables[[variable]] <-
+          factor(data$variables[[variable]], levels = c(variable_lvls, paste("not", variable_lvls)))
+      }
+      if (!is_empty(by) && length(by_lvls) == 1L) {
+        data$variables[[by]] <-
+          factor(data$variables[[by]], levels = c(by_lvls, paste("not", by_lvls)))
+      }
+
       # each combination of denominator and whether there is a by variable is handled separately
-      case_switch(
-        # by variable and column percentages
-        !is_empty(by) && denominator == "column" ~
-          .one_svytable_rates_by_column(data, variable, by, deff),
-        # by variable and row percentages
-        !is_empty(by) && denominator == "row" ~
-          .one_svytable_rates_by_row(data, variable, by, deff),
-        # by variable and cell percentages
-        !is_empty(by) && denominator == "cell" ~
-          .one_svytable_rates_by_cell(data, variable, by, deff),
-        # no by variable and column/cell percentages
-        denominator %in% c("column", "cell") ~
-          .one_svytable_rates_no_by_column_and_cell(data, variable, deff),
-        # no by variable and row percentages
-        denominator == "row" ~
-          .one_svytable_rates_no_by_row(data, variable, deff)
-      )
+      result <-
+        case_switch(
+          # by variable and column percentages
+          !is_empty(by) && denominator == "column" ~
+            .one_svytable_rates_by_column(data, variable, by, deff),
+          # by variable and row percentages
+          !is_empty(by) && denominator == "row" ~
+            .one_svytable_rates_by_row(data, variable, by, deff),
+          # by variable and cell percentages
+          !is_empty(by) && denominator == "cell" ~
+            .one_svytable_rates_by_cell(data, variable, by, deff),
+          # no by variable and column/cell percentages
+          denominator %in% c("column", "cell") ~
+            .one_svytable_rates_no_by_column_and_cell(data, variable, deff),
+          # no by variable and row percentages
+          denominator == "row" ~
+            .one_svytable_rates_no_by_row(data, variable, deff)
+        )
+
+      # if a level was added, remove the fake level
+      if (length(variable_lvls) == 1L) {
+        result <- result |> dplyr::filter(.data$variable_level %in% variable_lvls)
+      }
+      if (!is_empty(by) && length(by_lvls) == 1L) {
+        result <- result |> dplyr::filter(.data$group1_level %in% by_lvls)
+      }
+
+      result
     }
   ) |>
     dplyr::bind_rows()
@@ -262,9 +301,9 @@ ard_categorical.survey.design <- function(data,
         ),
       name =
         str_remove_all(.data$name, "se\\.") %>%
-          str_remove_all("DEff\\.") %>%
-          str_remove_all(by) %>%
-          str_remove_all("`")
+        str_remove_all("DEff\\.") %>%
+        str_remove_all(by) %>%
+        str_remove_all("`")
     ) |>
     tidyr::pivot_wider(names_from = "stat", values_from = "value") |>
     set_names(c("variable_level", "group1_level", "p", "p.std.error", "deff")) |>
@@ -295,9 +334,9 @@ ard_categorical.survey.design <- function(data,
         ),
       name =
         str_remove_all(.data$name, "se\\.") %>%
-          str_remove_all("DEff\\.") %>%
-          str_remove_all(variable) %>%
-          str_remove_all("`")
+        str_remove_all("DEff\\.") %>%
+        str_remove_all(variable) %>%
+        str_remove_all("`")
     ) |>
     tidyr::pivot_wider(names_from = "stat", values_from = "value") |>
     set_names(c("group1_level", "variable_level", "p", "p.std.error", "deff")) |>
@@ -339,27 +378,27 @@ ard_categorical.survey.design <- function(data,
 
   # add big N and p, then return data frame of results
   switch(denominator,
-    "column" =
-      df_counts |>
-        dplyr::mutate(
-          .by = any_of("group1_level"),
-          N = sum(.data$n),
-          p = .data$n / .data$N
-        ),
-    "row" =
-      df_counts |>
-        dplyr::mutate(
-          .by = any_of("variable_level"),
-          N = sum(.data$n),
-          p = .data$n / .data$N
-        ),
-    "cell" =
-      df_counts |>
-        dplyr::mutate(
-          .by = any_of(c("group1_level", "variable_level")),
-          N = sum(.data$n),
-          p = .data$n / .data$N
-        )
+         "column" =
+           df_counts |>
+           dplyr::mutate(
+             .by = any_of("group1_level"),
+             N = sum(.data$n),
+             p = .data$n / .data$N
+           ),
+         "row" =
+           df_counts |>
+           dplyr::mutate(
+             .by = any_of("variable_level"),
+             N = sum(.data$n),
+             p = .data$n / .data$N
+           ),
+         "cell" =
+           df_counts |>
+           dplyr::mutate(
+             .by = any_of(c("group1_level", "variable_level")),
+             N = sum(.data$n),
+             p = .data$n / .data$N
+           )
   )
 }
 

diff --git a/R/ard_continuous.survey.design.R b/R/ard_continuous.survey.design.R
@@ -5,7 +5,7 @@
 #' @param data (`survey.design`)\cr
 #'   a design object often created with [`survey::svydesign()`].
 #' @param variables ([`tidy-select`][dplyr::dplyr_tidy_select])\cr
-#'   columns to include in summaries. Default is `everything()`.
+#'   columns to include in summaries.
 #' @param by ([`tidy-select`][dplyr::dplyr_tidy_select])\cr
 #'   results are calculated for **all combinations** of the columns specified,
 #'   including unobserved combinations and unobserved factor levels.
@@ -56,8 +56,6 @@ ard_continuous.survey.design <- function(data, variables, by = NULL,
   check_pkg_installed(pkg = "survey", reference_pkg = "cardx")
 
   # check inputs ---------------------------------------------------------------
-  check_not_missing(data)
-  check_class(data, cls = "survey.design")
   check_not_missing(variables)
 
   # process inputs -------------------------------------------------------------

diff --git a/R/ard_dichotomous.survey.design.R b/R/ard_dichotomous.survey.design.R
@@ -0,0 +1,146 @@
+#' ARD Dichotomous Survey Statistics
+#'
+#' Compute Analysis Results Data (ARD) for dichotomous summary statistics.
+#'
+#' @inheritParams ard_categorical.survey.design
+#' @param value (named `list`)\cr
+#'   named list of dichotomous values to tabulate.
+#'   Default is `cards::maximum_variable_value(data$variables)`,
+#'   which returns the largest/last value after a sort.
+#'
+#' @return an ARD data frame of class 'card'
+#' @export
+#'
+#' @examples
+#' survey::svydesign(ids = ~1, data = mtcars, weights = ~1) |>
+#'   ard_dichotomous(by = vs, variables = c(cyl, am), value = list(cyl = 4))
+ard_dichotomous.survey.design <- function(data,
+                                          variables,
+                                          by = NULL,
+                                          value = cards::maximum_variable_value(data$variables[variables]),
+                                          statistic = everything() ~ c("n", "N", "p", "p.std.error", "deff", "n_unweighted", "N_unweighted", "p_unweighted"),
+                                          denominator = c("column", "row", "cell"),
+                                          fmt_fn = NULL,
+                                          stat_label = everything() ~ list(p = "%", p.std.error = "SE(%)", deff = "Design Effect",
+                                                                           "n" = "Unweighted n", "N" = "Unweighted N", "p" = "Unweighted %"),
+                                          ...) {
+  set_cli_abort_call()
+  check_dots_empty()
+  check_pkg_installed(pkg = "survey", reference_pkg = "cardx")
+
+  # check inputs ---------------------------------------------------------------
+  check_not_missing(variables)
+
+  # process inputs -------------------------------------------------------------
+  cards::process_selectors(data$variables, variables = {{ variables }})
+  cards::process_formula_selectors(data$variables[variables], value = value)
+  cards::fill_formula_selectors(
+    data$variables[variables],
+    value = formals(asNamespace("cardx")[["ard_dichotomous.survey.design"]])[["value"]] |> eval()
+  )
+  .check_dichotomous_value(data$variables, value)
+
+  # return empty tibble if no variables selected -------------------------------
+  if (is_empty(variables)) {
+    return(dplyr::tibble())
+  }
+
+  # calculate summary statistics -----------------------------------------------
+  ard_categorical(
+    data = data,
+    variables = all_of(variables),
+    by = {{ by }},
+    statistic = statistic,
+    denominator = denominator,
+    fmt_fn = fmt_fn,
+    stat_label = stat_label
+  ) |>
+    dplyr::filter(
+      pmap(
+        list(.data$variable, .data$variable_level),
+        function(variable, variable_level) {
+          variable_level %in% .env$value[[variable]]
+        }
+      ) |>
+        unlist()
+    ) |>
+    dplyr::mutate(context = "dichotomous")
+}
+
+#' Perform Value Checks
+#'
+#' Check the validity of the values passed in `ard_dichotomous(value)`.
+#'
+#' @param data (`data.frame`)\cr
+#'   a data frame
+#' @param value (named `list`)\cr
+#'   a named list
+#'
+#' @return returns invisible if check is successful, throws an error message if not.
+#' @keywords internal
+#'
+#' @examples
+#' cardx:::.check_dichotomous_value(mtcars, list(cyl = 4))
+.check_dichotomous_value <- function(data, value) {
+  imap(
+    value,
+    function(value, column) {
+      accepted_values <- .unique_and_sorted(data[[column]])
+      if (length(value) != 1L || !value %in% accepted_values) {
+        message <- "Error in argument {.arg value} for variable {.val {column}}."
+        cli::cli_abort(
+          if (length(value) != 1L) {
+            c(message, "i" = "The value must be one of {.val {accepted_values}}.")
+          } else {
+            c(message, "i" = "A value of {.val {value}} was passed, but must be one of {.val {accepted_values}}.")
+          },
+          call = get_cli_abort_call()
+        )
+      }
+    }
+  ) |>
+    invisible()
+}
+
+#' ARD-flavor of unique()
+#'
+#' Essentially a wrapper for `unique(x) |> sort()` with `NA` levels removed.
+#' For factors, all levels are returned even if they are unobserved.
+#' Similarly, logical vectors always return `c(TRUE, FALSE)`, even if
+#' both levels are not observed.
+#'
+#' @param x (`any`)\cr
+#'   a vector
+#'
+#' @return a vector
+#' @keywords internal
+#'
+#' @examples
+#' cards:::.unique_and_sorted(factor(letters[c(5, 5:1)], levels = letters))
+#'
+#' cards:::.unique_and_sorted(c(FALSE, TRUE, TRUE, FALSE))
+#'
+#' cards:::.unique_and_sorted(c(5, 5:1))
+.unique_and_sorted <- function(x, useNA = c("no", "always")) {
+  # styler: off
+  useNA <- match.arg(useNA)
+  # if a factor return a factor that includes the same levels (including unobserved levels)
+  if (inherits(x, "factor")) {
+    return(
+      factor(
+        if (useNA == "no") levels(x)
+        else c(levels(x), NA_character_),
+        levels = levels(x)
+      )
+    )
+  }
+  if (inherits(x, "logical")) {
+    if (useNA == "no") return(c(TRUE, FALSE))
+    else return(c(TRUE, FALSE, NA))
+  }
+
+  # otherwise, return a simple unique and sort of the vector
+  if (useNA == "no") return(unique(x) |> sort())
+  else return(unique(x) |> sort() |> c(NA))
+  # styler: on
+}