Catching errors for mismatches in alt_counts_df splits (#721)

* error catching * adding mix cases * feat: we need this only in the error * adding more explicit check * news * correction * chore: adding "" * adding more things and discovering a possible issue * spaces fix * Update R/tt_dotabulation.R Co-authored-by: Joe Zhu <[email protected]> Signed-off-by: Davide Garolini <[email protected]> --------- Signed-off-by: Davide Garolini <[email protected]> Signed-off-by: Davide Garolini <[email protected]> Co-authored-by: Joe Zhu <[email protected]>
insightsengineering · Aug 29, 2023 · 7c23ddf · 7c23ddf
1 parent 25600cb
commit 7c23ddf
Show file tree

Hide file tree

Showing 4 changed files with 95 additions and 0 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -16,6 +16,7 @@
  * Cleaned up spelling in documentation ([#685](https://github.com/insightsengineering/rtables/issues/685)) 
  * Added `qtable_layout` and fixed `qtable` labeling via `row_labels` ([#698](https://github.com/insightsengineering/rtables/issues/698))
  * Added vignette on exploratory analysis with `qtable`.
+ * Error catching and test coverage for cases where `alt_counts_df` presents different splits from `df`.
 
 ## rtables 0.6.2
  * Fixed major regressions for `page_by` machinery caused by migration to `formatters` 0.5.1 pagination framework.

diff --git a/R/tt_dotabulation.R b/R/tt_dotabulation.R
@@ -753,12 +753,46 @@ setMethod(".make_split_kids", "Split",
                                         spl_context = spl_context)[["datasplit"]],
                                error = function(e) e)
 
+        # Removing NA rows - to explore why this happens at all in a split
+        # This would be a fix but it is done in post-processing instead of pre-proc -> xxx
+        # x alt_dfpart <- lapply(alt_dfpart, function(data) {
+        # x    data[!apply(is.na(data), 1, all), ]
+        # x })
+
         # Error localization
         if (is(alt_dfpart, "error")) {
             stop("Following error encountered in splitting alt_counts_df: ", 
                  alt_dfpart$message,
                  call. = FALSE)
         }
+        # Error if split does not have the same values in the alt_df (and order)
+        # The following breaks if there are different levels (do_split returns empty list)
+        # or if there are different number of the same levels. Added handling of NAs
+        # in the values of the factor when is all only NAs
+        is_all_na <- all(is.na(alt_df[[spl_payload(spl)]]))
+
+        if (!all(names(dataspl) %in% names(alt_dfpart)) ||
+            length(alt_dfpart) != length(dataspl) ||
+            is_all_na) {
+            alt_df_spl_vals <- unique(alt_df[[spl_payload(spl)]])
+            end_part <- ""
+
+            if (!all(alt_df_spl_vals %in% levels(alt_df_spl_vals))) {
+                end_part <- paste0(" and following levels: ",
+                                   paste_vec(levels(alt_df_spl_vals)))
+            }
+
+            if (is_all_na) {
+                end_part <- ". Found only NAs in alt_counts_df split"
+            }
+
+            stop("alt_counts_df split variable(s) [", spl_payload(spl), 
+                 "] (in split ", as.character(class(spl)), 
+                 ") does not have the same factor levels of df.\ndf has c(", '"', 
+                 paste(names(dataspl), collapse = '", "'), '"', ") levels while alt_counts_df has ", 
+                 ifelse(length(alt_df_spl_vals) > 0, paste_vec(alt_df_spl_vals), ""), 
+                 " unique values", end_part)
+        }
     } else {
         alt_dfpart <- setNames(rep(list(NULL), length(dataspl)), names(dataspl))
     }

diff --git a/R/utils.R b/R/utils.R
@@ -97,3 +97,8 @@ spl_context_to_disp_path <- function(ctx) {
         ret <- "root"
     ret
 }
+
+# Utility function to paste vector of values in a nice way
+paste_vec <- function(vec) {
+    paste0('c("', paste(vec, collapse = '", "'), '")')
+}
diff --git a/tests/testthat/test-tab_afun_cfun.R b/tests/testthat/test-tab_afun_cfun.R
@@ -185,6 +185,61 @@ test_that("Error localization for missing split variable when done in alt_count_
     expect_silent(lyt_row %>% build_table(ex_adsl, alt_counts_df = DM))
 })
 
+test_that("Error localization for missmatch split variable when done in alt_count_df", {
+    afun_tmp <- function(x, .alt_df_row, .spl_context,...) {
+        # Important check that order is aligned even if source levels are not
+        check_val <- unique(.alt_df_row$ARMCD)
+        # This is something mysterious happening in splits for which if the values are all
+        # NAs in the split column, the dataspl has the nrow of the data in NA rows. xxx ToFix
+        check_val <- check_val[!is.na(check_val)] 
+        stopifnot(as.character(check_val) == .spl_context$value[2])
+        mean(x)
+    }
+    lyt_row <- basic_table() %>% split_rows_by("ARMCD") %>% analyze("BMRKR1", afun = afun_tmp)
+
+    # Mismatch in the number of splits (NA is 0)
+    DM_tmp <- DM %>% mutate("ARMCD" = NA_character_)
+    expect_error(lyt_row %>% build_table(ex_adsl, alt_counts_df = DM_tmp), 
+                 regexp = paste0("alt_counts_df split variable\\(s\\) \\[ARMCD\\] *"))
+
+    # Mismatch of levels
+    armcd_col <- factor(sample(c("arm A", "arm B", "arm C"), nrow(DM), replace = TRUE))
+    DM_tmp <- DM %>% mutate("ARMCD" = armcd_col)
+    expect_error(lyt_row %>% build_table(ex_adsl, alt_counts_df = DM_tmp), 
+                 regexp = paste0("alt_counts_df split variable\\(s\\) \\[ARMCD\\] *"))
+
+    # Mix mismatch of levels
+    armcd_col <- factor(sample(c("arm A", "ARM B", "ARM C"), nrow(DM), replace = TRUE))
+    DM_tmp <- DM %>% mutate("ARMCD" = armcd_col)
+    expect_error(lyt_row %>% build_table(ex_adsl, alt_counts_df = DM_tmp), 
+                 regexp = paste0("alt_counts_df split variable\\(s\\) \\[ARMCD\\] *"))
+
+    # Mismatch in the number of levels 
+    armcd_col2 <- factor(sample(levels(ex_adsl$ARMCD)[c(1, 2)], nrow(DM), replace = TRUE))
+    DM_tmp <- DM %>% mutate("ARMCD" = armcd_col2)
+    expect_error(lyt_row %>% build_table(ex_adsl, alt_counts_df = DM_tmp), 
+                 regexp = paste0("alt_counts_df split variable\\(s\\) \\[ARMCD\\] *"))
+
+    # Another order -> should work? yes, split is valid
+    levels(armcd_col) <- levels(ex_adsl$ARMCD)[c(1, 3, 2)]
+    DM_tmp <- DM %>% mutate("ARMCD" = armcd_col)
+    expect_silent(lyt_row %>% build_table(ex_adsl, alt_counts_df = DM_tmp))
+
+    # Mix mismatch of levels but covering them all -> valid split
+    armcd_col <- factor(sample(c("arm A", levels(ex_adsl$ARMCD)), nrow(DM), replace = TRUE))
+    DM_tmp <- DM %>% mutate("ARMCD" = armcd_col)
+    expect_silent(lyt_row %>% build_table(ex_adsl, alt_counts_df = DM_tmp))
+
+    # Values are all NA, but the levels are correct
+    DM_tmp$ARMCD <- factor(NA, levels = levels(ex_adsl$ARMCD))
+    expect_error(lyt_row %>% build_table(ex_adsl, alt_counts_df = DM_tmp), 
+                 regexp = paste0("alt_counts_df split variable\\(s\\) \\[ARMCD\\] *"))
+
+    DM_tmp$ARMCD <- factor(NA, levels = levels(ex_adsl$ARMCD))
+    DM_tmp$ARMCD[seq_along(levels(ex_adsl$ARMCD))] <- levels(ex_adsl$ARMCD) 
+    expect_silent(lyt_row %>% build_table(ex_adsl, alt_counts_df = DM_tmp))
+})
+
 context("Content functions (cfun)")
 
 test_that(".alt_df_row appears in cfun but not in afun.", {