From b671d2e7b3fe3a6132bad26e190d237935cc3a2e Mon Sep 17 00:00:00 2001 From: Melkiades Date: Thu, 18 Jul 2024 11:38:56 +0200 Subject: [PATCH] add coverage and precise testing + fix for keep_spec_rows --- DESCRIPTION | 3 +- R/utils_dim_control_and_checks.R | 52 ++++++++------ .../test-utils_dim_control_and_checks.R | 69 +++++++++++++++++-- 3 files changed, 96 insertions(+), 28 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 129922c8..2ecdd3a3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -41,7 +41,8 @@ Suggests: knitr (>= 1.42), rmarkdown (>= 2.23), testthat (>= 3.0.4), - withr (>= 2.0.0) + withr (>= 2.0.0), + ggplot2 VignetteBuilder: knitr RdMacros: diff --git a/R/utils_dim_control_and_checks.R b/R/utils_dim_control_and_checks.R index 6e83540e..838ec5f7 100644 --- a/R/utils_dim_control_and_checks.R +++ b/R/utils_dim_control_and_checks.R @@ -54,9 +54,9 @@ reduce_num_levels_in_df <- function(dt, checkmate::assert_string(variable) checkmate::assert_character(add_specific_value, null.ok = TRUE) checkmate::assert_choice(variable, names(dt)) - checkmate::assert_integer(keep_spec_rows, - null.ok = TRUE, - lower = 1, upper = nrow(dt), unique = TRUE + checkmate::assert_integerish(keep_spec_rows, + null.ok = TRUE, + lower = 1, upper = nrow(dt), unique = TRUE ) checkmate::assert_flag(explorative) cur_vec <- dt[[variable]] @@ -166,38 +166,46 @@ reduce_num_levels_in_df <- function(dt, checkmate::assert_subset(add_specific_value, names(lev_freq)) lev_to_keep <- unique(c(lev_to_keep, add_specific_value)) } + out <- dt %>% filter(!!sym(variable) %in% lev_to_keep) if (!is.null(keep_spec_rows)) { - lev_to_keep <- c(lev_to_keep, keep_spec_rows) - filter1 <- which(cur_vec %in% lev_to_keep) - keep_spec_rows <- keep_spec_rows[!keep_spec_rows %in% filter1] + lev_new_rows <- cur_vec[keep_spec_rows] + what_is_new_row <- which(!lev_new_rows %in% lev_to_keep) + lev_new_rows <- lev_new_rows[what_is_new_row] + keep_spec_rows <- keep_spec_rows[what_is_new_row] + + if (length(keep_spec_rows) > 0) { + out <- rbind(out, dt %>% slice(keep_spec_rows)) + } } if (interactive()) { - msg <- paste0( - "Reducing levels of ", deparse(substitute(dt)), " for variable ", - variable, ": keeping ", length(lev_to_keep), - " levels out of ", length(lev_freq), " levels. Total kept (%): ", - round(sum(lev_freq[lev_to_keep]) * 100 / sum(lev_freq), 1) - ) if (length(keep_spec_rows) > 0) { - msg <- paste0( - msg, "\n", "Keeping other rows for a total: ", + core_msg <- paste0( + length(lev_to_keep), " + ", length(keep_spec_rows), " (from keep_spec_rows) levels out of ", + length(lev_freq), " levels. Total rows kept (%): ", round((sum(lev_freq[lev_to_keep]) + length(keep_spec_rows)) * 100 / sum(lev_freq), 1) ) + } else { + core_msg <- paste0( + length(lev_to_keep), " levels out of ", length(lev_freq), " levels. Total rows kept (%): ", + round(sum(lev_freq[lev_to_keep]) * 100 / sum(lev_freq), 1) + ) } + msg <- paste0( + "Reducing levels of ", deparse(substitute(dt)), " for variable ", + variable, ": keeping ", core_msg + ) message(msg) } - out <- dt %>% filter(!!sym(variable) %in% lev_to_keep) - - if (length(keep_spec_rows) > 0) { - out <- rbind(out, dt %>% slice(keep_spec_rows)) - } - # Simple check of filtering - stopifnot(nrow(out) == sum(cur_vec %in% lev_to_keep)) + stopifnot(nrow(out) == sum(cur_vec %in% lev_to_keep) + length(keep_spec_rows)) + + # We want a factor anyway (drop unused levels) + out <- out %>% + mutate(!!sym(variable) := factor(!!sym(variable))) - out + invisible(out) } } diff --git a/tests/testthat/test-utils_dim_control_and_checks.R b/tests/testthat/test-utils_dim_control_and_checks.R index e615e153..ae46e7ae 100644 --- a/tests/testthat/test-utils_dim_control_and_checks.R +++ b/tests/testthat/test-utils_dim_control_and_checks.R @@ -1,12 +1,71 @@ -test_that("Checking that levels are reduced correctly for multiple variables", { - expect_message( - reduce_num_levels_in_df(cadae, "AEDECOD"), - "Reducing levels of cadae for variable AEDECOD: keeping 6 levels out of 10 levels. Total kept \\(%\\): 63.3" +test_that("Checking that levels are reduced correctly for multiple variables with defaults", { + rlang::with_interactive( + expect_message( + out <- reduce_num_levels_in_df(cadae, "AEDECOD"), + "Reducing levels of cadae for variable AEDECOD: keeping 6 levels out of 10 levels. Total kept \\(%\\): 63.3" + ) ) - out <- suppressMessages(reduce_num_levels_in_df(cadae, "AEDECOD")) expect_equal(length(levels(out$AEDECOD)), 6L) skip_if_not_installed(ggplot2) suppressMessages(a_plot <- reduce_num_levels_in_df(cadae, "AEDECOD", explorative = TRUE)) expect_true(ggplot2::is.ggplot(a_plot)) }) + +test_that("reduce_num_levels_in_df(num_max_values) works", { + rlang::with_interactive( + expect_message( + out <- reduce_num_levels_in_df(cadae, "AEDECOD", num_max_values = 5), + "keeping 5 levels out of 10 levels" + ) + ) + expect_equal(length(levels(out$AEDECOD)), 5L) +}) + +test_that("reduce_num_levels_in_df(num_max_values, num_of_rare_values) works", { + cadae_tmp <- cadae %>% mutate(AEDECOD = as.character(AEDECOD)) + cadae_tmp$AEDECOD[1] <- "an_outlier" + rlang::with_interactive( + expect_message( + out <- reduce_num_levels_in_df(cadae_tmp, "AEDECOD", num_max_values = 5, num_of_rare_values = 2), + "keeping 5 levels out of 11 levels" + ) + ) + + expect_equal(length(levels(out$AEDECOD)), 5L) + expect_true(cadae_tmp$AEDECOD[1] %in% names(table(out$AEDECOD))) + + expect_error(reduce_num_levels_in_df(cadae_tmp, "AEDECOD", num_max_values = 5, num_of_rare_values = 6)) +}) + +test_that("reduce_num_levels_in_df(add_specific_value) works", { + cadae_tmp <- cadae %>% mutate(AEDECOD = as.character(AEDECOD)) + cadae_tmp$AEDECOD[1] <- "an_outlier" + rlang::with_interactive( + expect_message( + out <- reduce_num_levels_in_df(cadae_tmp, "AEDECOD", num_max_values = 5, add_specific_value = "an_outlier"), + "keeping 6 levels out of 11 levels" + ) + ) + + expect_equal(length(levels(out$AEDECOD)), 6L) + expect_true(cadae_tmp$AEDECOD[1] %in% names(table(out$AEDECOD))) + + expect_error(reduce_num_levels_in_df(cadae_tmp, "AEDECOD", num_max_values = 5, add_specific_value = 6)) +}) + +test_that("reduce_num_levels_in_df(add_specific_value) works", { + cadae_tmp <- cadae %>% mutate(AEDECOD = as.character(AEDECOD)) + cadae_tmp$AEDECOD[1] <- "an_outlier" + rlang::with_interactive( + expect_message( + out <- reduce_num_levels_in_df(cadae_tmp, "AEDECOD", num_max_values = 5, keep_spec_rows = c(1, 4)), + "keeping 5 \\+ 1 \\(from keep_spec_rows\\)" + ) + ) + + expect_equal(length(levels(out$AEDECOD)), 6L) + expect_true(cadae_tmp$AEDECOD[1] %in% names(table(out$AEDECOD))) + + expect_error(reduce_num_levels_in_df(cadae_tmp, "AEDECOD", num_max_values = 5, keep_spec_rows = "asdsa")) +})