Skip to content

Commit

Permalink
add coverage and precise testing + fix for keep_spec_rows
Browse files Browse the repository at this point in the history
  • Loading branch information
Melkiades committed Jul 18, 2024
1 parent 30e27a7 commit b671d2e
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 28 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ Suggests:
knitr (>= 1.42),
rmarkdown (>= 2.23),
testthat (>= 3.0.4),
withr (>= 2.0.0)
withr (>= 2.0.0),
ggplot2
VignetteBuilder:
knitr
RdMacros:
Expand Down
52 changes: 30 additions & 22 deletions R/utils_dim_control_and_checks.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,9 @@ reduce_num_levels_in_df <- function(dt,
checkmate::assert_string(variable)
checkmate::assert_character(add_specific_value, null.ok = TRUE)
checkmate::assert_choice(variable, names(dt))
checkmate::assert_integer(keep_spec_rows,
null.ok = TRUE,
lower = 1, upper = nrow(dt), unique = TRUE
checkmate::assert_integerish(keep_spec_rows,
null.ok = TRUE,

Check warning on line 58 in R/utils_dim_control_and_checks.R

View workflow job for this annotation

GitHub Actions / SuperLinter 🦸‍♀️ / Lint R code 🧶

file=R/utils_dim_control_and_checks.R,line=58,col=31,[indentation_linter] Indentation should be 4 spaces but is 31 spaces.
lower = 1, upper = nrow(dt), unique = TRUE
)
checkmate::assert_flag(explorative)
cur_vec <- dt[[variable]]
Expand Down Expand Up @@ -166,38 +166,46 @@ reduce_num_levels_in_df <- function(dt,
checkmate::assert_subset(add_specific_value, names(lev_freq))
lev_to_keep <- unique(c(lev_to_keep, add_specific_value))
}
out <- dt %>% filter(!!sym(variable) %in% lev_to_keep)

if (!is.null(keep_spec_rows)) {
lev_to_keep <- c(lev_to_keep, keep_spec_rows)
filter1 <- which(cur_vec %in% lev_to_keep)
keep_spec_rows <- keep_spec_rows[!keep_spec_rows %in% filter1]
lev_new_rows <- cur_vec[keep_spec_rows]
what_is_new_row <- which(!lev_new_rows %in% lev_to_keep)
lev_new_rows <- lev_new_rows[what_is_new_row]
keep_spec_rows <- keep_spec_rows[what_is_new_row]

if (length(keep_spec_rows) > 0) {
out <- rbind(out, dt %>% slice(keep_spec_rows))
}
}

if (interactive()) {
msg <- paste0(
"Reducing levels of ", deparse(substitute(dt)), " for variable ",
variable, ": keeping ", length(lev_to_keep),
" levels out of ", length(lev_freq), " levels. Total kept (%): ",
round(sum(lev_freq[lev_to_keep]) * 100 / sum(lev_freq), 1)
)
if (length(keep_spec_rows) > 0) {
msg <- paste0(
msg, "\n", "Keeping other rows for a total: ",
core_msg <- paste0(
length(lev_to_keep), " + ", length(keep_spec_rows), " (from keep_spec_rows) levels out of ",
length(lev_freq), " levels. Total rows kept (%): ",
round((sum(lev_freq[lev_to_keep]) + length(keep_spec_rows)) * 100 / sum(lev_freq), 1)
)
} else {
core_msg <- paste0(
length(lev_to_keep), " levels out of ", length(lev_freq), " levels. Total rows kept (%): ",
round(sum(lev_freq[lev_to_keep]) * 100 / sum(lev_freq), 1)
)
}
msg <- paste0(
"Reducing levels of ", deparse(substitute(dt)), " for variable ",
variable, ": keeping ", core_msg
)
message(msg)
}

out <- dt %>% filter(!!sym(variable) %in% lev_to_keep)

if (length(keep_spec_rows) > 0) {
out <- rbind(out, dt %>% slice(keep_spec_rows))
}

# Simple check of filtering
stopifnot(nrow(out) == sum(cur_vec %in% lev_to_keep))
stopifnot(nrow(out) == sum(cur_vec %in% lev_to_keep) + length(keep_spec_rows))

# We want a factor anyway (drop unused levels)
out <- out %>%
mutate(!!sym(variable) := factor(!!sym(variable)))

out
invisible(out)
}
}
69 changes: 64 additions & 5 deletions tests/testthat/test-utils_dim_control_and_checks.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,71 @@
test_that("Checking that levels are reduced correctly for multiple variables", {
expect_message(
reduce_num_levels_in_df(cadae, "AEDECOD"),
"Reducing levels of cadae for variable AEDECOD: keeping 6 levels out of 10 levels. Total kept \\(%\\): 63.3"
test_that("Checking that levels are reduced correctly for multiple variables with defaults", {
rlang::with_interactive(
expect_message(
out <- reduce_num_levels_in_df(cadae, "AEDECOD"),
"Reducing levels of cadae for variable AEDECOD: keeping 6 levels out of 10 levels. Total kept \\(%\\): 63.3"
)
)
out <- suppressMessages(reduce_num_levels_in_df(cadae, "AEDECOD"))
expect_equal(length(levels(out$AEDECOD)), 6L)

skip_if_not_installed(ggplot2)
suppressMessages(a_plot <- reduce_num_levels_in_df(cadae, "AEDECOD", explorative = TRUE))
expect_true(ggplot2::is.ggplot(a_plot))
})

test_that("reduce_num_levels_in_df(num_max_values) works", {
rlang::with_interactive(
expect_message(
out <- reduce_num_levels_in_df(cadae, "AEDECOD", num_max_values = 5),
"keeping 5 levels out of 10 levels"
)
)
expect_equal(length(levels(out$AEDECOD)), 5L)
})

test_that("reduce_num_levels_in_df(num_max_values, num_of_rare_values) works", {
cadae_tmp <- cadae %>% mutate(AEDECOD = as.character(AEDECOD))
cadae_tmp$AEDECOD[1] <- "an_outlier"
rlang::with_interactive(
expect_message(
out <- reduce_num_levels_in_df(cadae_tmp, "AEDECOD", num_max_values = 5, num_of_rare_values = 2),
"keeping 5 levels out of 11 levels"
)
)

expect_equal(length(levels(out$AEDECOD)), 5L)
expect_true(cadae_tmp$AEDECOD[1] %in% names(table(out$AEDECOD)))

expect_error(reduce_num_levels_in_df(cadae_tmp, "AEDECOD", num_max_values = 5, num_of_rare_values = 6))
})

test_that("reduce_num_levels_in_df(add_specific_value) works", {
cadae_tmp <- cadae %>% mutate(AEDECOD = as.character(AEDECOD))
cadae_tmp$AEDECOD[1] <- "an_outlier"
rlang::with_interactive(
expect_message(
out <- reduce_num_levels_in_df(cadae_tmp, "AEDECOD", num_max_values = 5, add_specific_value = "an_outlier"),
"keeping 6 levels out of 11 levels"
)
)

expect_equal(length(levels(out$AEDECOD)), 6L)
expect_true(cadae_tmp$AEDECOD[1] %in% names(table(out$AEDECOD)))

expect_error(reduce_num_levels_in_df(cadae_tmp, "AEDECOD", num_max_values = 5, add_specific_value = 6))
})

test_that("reduce_num_levels_in_df(add_specific_value) works", {
cadae_tmp <- cadae %>% mutate(AEDECOD = as.character(AEDECOD))
cadae_tmp$AEDECOD[1] <- "an_outlier"
rlang::with_interactive(
expect_message(
out <- reduce_num_levels_in_df(cadae_tmp, "AEDECOD", num_max_values = 5, keep_spec_rows = c(1, 4)),
"keeping 5 \\+ 1 \\(from keep_spec_rows\\)"
)
)

expect_equal(length(levels(out$AEDECOD)), 6L)
expect_true(cadae_tmp$AEDECOD[1] %in% names(table(out$AEDECOD)))

expect_error(reduce_num_levels_in_df(cadae_tmp, "AEDECOD", num_max_values = 5, keep_spec_rows = "asdsa"))
})

0 comments on commit b671d2e

Please sign in to comment.