Skip to content

Commit

Permalink
adding the detector and descriptor
Browse files Browse the repository at this point in the history
  • Loading branch information
Melkiades committed Jul 29, 2024
1 parent 5cc29cd commit ca863df
Show file tree
Hide file tree
Showing 8 changed files with 450 additions and 30 deletions.
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
# Generated by roxygen2: do not edit by hand

export(apply_metadata)
export(check_long_strings)
export(check_special_chars)
export(describe_cols)
export(detect_issues)
export(is_empty_string)
export(mutate_na)
export(radab)
export(radae)
Expand Down
290 changes: 273 additions & 17 deletions R/utils_dim_control_and_checks.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
#' Use this function to reduce the number of levels in a `data.frame` column called `variable`.
#' This function calculates the frequency distribution of levels and sets either a soft threshold
#' based on the probability cut-off (`p_to_keep`) and/or an hard threshold (`num_max_values`).
#' Consider checking the number of unique values in each pivotal data column with [describe_cols()].
#'
#' @param dt (`data.frame`)\cr data.frame with too many levels for a `variable`.
#' @param df (`data.frame`)\cr data.frame with too many levels for a `variable`.
#' @param variable (`character(1)`)\cr string with the name of the column to be reduced.
#' @param p_to_keep (`numeric(1)`)\cr probability cut-off to keep the most frequent levels. If `num_max_values` is
#' specified (i.e. not `NULL`) this is not used.
Expand All @@ -32,17 +33,17 @@
#'
#' @examples
#' # real case scenario: trimming of variables with too many levels
# random.cdisc.data::cadae %>%
#' # reduce_num_levels_in_df(adae_pharmaverse, "AEDECOD",
#' # num_max_values = 7, num_of_rare_values = 1,
#' # add_specific_value = c(
#' # "VOMITING", "NAUSEA", "SKIN IRRITATION", "HEADACHE", # For SMQ01NAM, SMQ02NAM, CQ01NAM
#' # "MYOCARDIAL INFARCTION" # for aet07 AESDTH == "Y"
#' # )
#' # )
#' random.cdisc.data::cadae %>%
#' reduce_num_levels_in_df(adae_pharmaverse, "AEDECOD",
#' num_max_values = 7, num_of_rare_values = 1,
#' add_specific_value = c(
#' "VOMITING", "NAUSEA", "SKIN IRRITATION", "HEADACHE", # For SMQ01NAM, SMQ02NAM, CQ01NAM
#' "MYOCARDIAL INFARCTION" # for aet07 AESDTH == "Y"
#' )
#' )
#'
#' @export
reduce_num_levels_in_df <- function(dt,
reduce_num_levels_in_df <- function(df,
variable,
p_to_keep = 0.7,
num_max_values = NULL,
Expand All @@ -52,17 +53,17 @@ reduce_num_levels_in_df <- function(dt,
explorative = FALSE,
verbose = TRUE) {
checkmate::assert_number(p_to_keep, lower = 0, upper = 1)
checkmate::assert_data_frame(dt)
checkmate::assert_data_frame(df)
checkmate::assert_string(variable)
checkmate::assert_character(add_specific_value, null.ok = TRUE)
checkmate::assert_choice(variable, names(dt))
checkmate::assert_choice(variable, names(df))
checkmate::assert_integerish(keep_spec_rows,
null.ok = TRUE,
lower = 1, upper = nrow(dt), unique = TRUE
lower = 1, upper = nrow(df), unique = TRUE
)
checkmate::assert_flag(explorative)
checkmate::assert_flag(verbose)
cur_vec <- dt[[variable]]
cur_vec <- df[[variable]]

if (is.factor(cur_vec)) {
cur_vec <- as.character(cur_vec)
Expand Down Expand Up @@ -169,7 +170,7 @@ reduce_num_levels_in_df <- function(dt,
checkmate::assert_subset(add_specific_value, names(lev_freq))
lev_to_keep <- unique(c(lev_to_keep, add_specific_value))
}
out <- dt %>% filter(!!sym(variable) %in% lev_to_keep)
out <- df %>% filter(!!sym(variable) %in% lev_to_keep)

if (!is.null(keep_spec_rows)) {
lev_new_rows <- cur_vec[keep_spec_rows]
Expand All @@ -178,7 +179,7 @@ reduce_num_levels_in_df <- function(dt,
keep_spec_rows <- keep_spec_rows[what_is_new_row]

if (length(keep_spec_rows) > 0) {
out <- rbind(out, dt %>% slice(keep_spec_rows))
out <- rbind(out, df %>% slice(keep_spec_rows))
}
}

Expand All @@ -196,7 +197,7 @@ reduce_num_levels_in_df <- function(dt,
)
}
msg <- paste0(
"Reducing levels of ", deparse(substitute(dt)), " for variable ",
"Reducing levels of ", deparse(substitute(df)), " for variable ",
variable, ": keeping ", core_msg
)
message(msg)
Expand All @@ -211,3 +212,258 @@ reduce_num_levels_in_df <- function(dt,

invisible(out)
}


# Descriptive tools ------------------------------------------------------------
#' Describe columns of a data frame
#'
#' @description
#' This function uses some simple descriptor to describe the columns of a data frame. It is an indicative
#' function and it is not intended to be used for a full description or analysis of the data.
#'
#' @param df (`data.frame`)\cr Data like `random.cdisc.data::cadae`.
#' @param additional_checks (named `list` of functions)\cr List of functions that take in input a column
#' and return a vector of logicals. List names are used as column names for the returning `tibble`. Needs
#' to have at least a `"column"` parameter. See [check_long_strings()] for an example.
#' @param max_desc_length (`integer(1)`)\cr Maximum length of the description of any column name. You can retrieve this
#' attribute using [formatters::var_labels()] on the data. `NA` is returned when absent.
#' @param column (`character`)\cr Column to describe. If using a custom function, please consider how to handle
#' all types and missing values (`NA`).
#' @param max_length (`integer(1)`)\cr Maximum length of a string.
#' @param special_chr (`character(1)`)\cr Regular expression to detect special characters.
#'
#' @return A `tibble` with the following columns:
#' \itemize{
#' \item `col.name`: column name. We do not expect this code name to be longer than 5 - 10 characters.
#' \item `desc`: description of the column (output of [formatters::var_labels()]).
#' \item `long_desc`: `TRUE` if `desc` is longer than 80 characters. Use `max_desc_length` to change this variable.
#' \item `nrows`: number of elements or rows (useful when having multiple data to compare).
#' \item `type`: type of the column.
#' \item `is_numeric`: logical indicating if the column is numeric.
#' \item `n_na`: number of missing values.
#' \item `n_empty`: number of empty strings.
#' \item `n_unique`: number of unique values (everything is casted as factor).
#' \item `n_levels`: number of levels (if factor) or number of unique values (if character).
#' \item `n_empty_levels`: number of empty levels if > 0. If it is -1 the column is a factor that
#' contains some missing values (`NA`).
#' \item `additional_checks` list names: output of additional checks functions.
#' \item `mean`: mean of the column (if numeric).
#' \item `sd`: standard deviation of the column (if numeric).
#' \item `quartiles`: quartiles of the column (if numeric).
#' }
#'
#' @seealso [reduce_num_levels_in_df()] for how to reduce pivotal values when `n_unique` is too high.
#'
#' @examples
#' # Describe columns of a data frame
#' describe_cols(random.cdisc.data::cadae)
#'
#' adae <- random.cdisc.data::cadae
#' adae$STUDYID[1] <- "missing" # We search for this string
#' adae$USUBJID[2] <- paste0(rep("a", 40), collapse = "_") # Too long for us!!
#'
#' # Let's add one custom check function (additional_checks param)
#' check_spec_missing_str <- function(column, missing_str) {
#' # We want characters
#' column <- as.character(column)
#' # We do not want NAs (already taken into account before)
#' if (anyNA(column)) {
#' column[is.na(column)] <- "NA"
#' }
#' # Checking if it is "missing"
#' if (is.character(column)) {
#' return(column == missing_str)
#' } else {
#' return(0) # not applicable (e.g. is numeric)
#' }
#' }
#'
#' # Our description function
#' out <- describe_cols(
#' adae,
#' additional_checks = list(
#' "very_long" = check_long_strings,
#' "n_missing" = check_spec_missing_str
#' ),
#' extra_args = list( # extra arguments for our additional_checks functions
#' "max_length" = 70,
#' "missing_str" = "missing"
#' )
#' )
#' out[out$n_missing > 0, ] # STUDYID has 1 "missing" value
#' out[out$very_long > 0, ] # USUBJID has 1 long string
#'
#' @export
describe_cols <- function(df,
additional_checks = list(
"too_long" = check_long_strings,
"special_chars" = check_special_chars
),
extra_args = list(max_length = 80, special_chr = "outlier"),
max_desc_length = 80) {
checkmate::assert_data_frame(df)
checkmate::assert_list(additional_checks, any.missing = FALSE)
checkmate::assert_list(extra_args, unique = TRUE)
checkmate::assert_int(max_desc_length, lower = 1, upper = 500)

if (is.null(names(additional_checks))) {
names(additional_checks) <- paste0("check_", seq_along(additional_checks))
}

# Values of columns
var_lb <- formatters::var_labels(df)
lb_tb <- tibble::tibble(
col.name = names(var_lb),
desc = as.character(var_lb),
long_desc = nchar(desc) > max_desc_length
)

# main descriptor
lb_tb2 <- dplyr::bind_cols(lb_tb,
"nrows" = nrow(df),
"type" = df %>% dplyr::summarise_all(~ class(.)[1]) %>% t() %>% c(),
"is_numeric" = df %>% dplyr::summarise_all(~ is.numeric(.)) %>% t() %>% c(),
"n_na" = df %>% dplyr::summarise_all(~ sum(is.na(.))) %>% t() %>% c(),
"n_empty" = df %>% dplyr::summarise_all(~ sum(is_empty_string(.))) %>% t() %>% c(),
"n_unique" = df %>% dplyr::summarise_all(~ length(unique(as.character(.)))) %>% t() %>% c(),
"n_levels" = df %>% dplyr::summarise_all(~ {
ifelse(is.factor(.), length(levels(.)), length(unique(.)))
}) %>% t() %>% c()

Check warning on line 331 in R/utils_dim_control_and_checks.R

View workflow job for this annotation

GitHub Actions / SuperLinter 🦸‍♀️ / Lint R code 🧶

file=R/utils_dim_control_and_checks.R,line=331,col=8,[pipe_continuation_linter] `%>%` should always have a space before it and a new line after it, unless the full pipeline fits on one line.

Check warning on line 331 in R/utils_dim_control_and_checks.R

View workflow job for this annotation

GitHub Actions / SuperLinter 🦸‍♀️ / Lint R code 🧶

file=R/utils_dim_control_and_checks.R,line=331,col=16,[pipe_continuation_linter] `%>%` should always have a space before it and a new line after it, unless the full pipeline fits on one line.
) %>% dplyr::mutate(

Check warning on line 332 in R/utils_dim_control_and_checks.R

View workflow job for this annotation

GitHub Actions / SuperLinter 🦸‍♀️ / Lint R code 🧶

file=R/utils_dim_control_and_checks.R,line=332,col=5,[pipe_continuation_linter] `%>%` should always have a space before it and a new line after it, unless the full pipeline fits on one line.
"n_empty_levels" = n_levels - n_unique # 0 if character, -1 if NAs are present
)

# Adding custom checks
for (custom_function_i in seq_along(additional_checks)) {
col_fun_name <- names(additional_checks)[custom_function_i]
lb_tb2 <- dplyr::bind_cols(
lb_tb2,
"very_tmp_name" = df %>% dplyr::summarise_all(~ {
col <- .
cur_fun <- additional_checks[[custom_function_i]]

cur_extra_args <- as.list(args(cur_fun))
drop_empty_params <- names(cur_extra_args) == ""
cur_extra_args <- cur_extra_args[!drop_empty_params]
if (names(cur_extra_args)[1] != "column") {
stop("The first element of additional_checks must have a column parameter.")
}
cur_extra_args$column <- col
param_overload <- names(cur_extra_args) %in% names(extra_args)

if (any(param_overload)) {
cur_extra_args[param_overload] <- extra_args[names(cur_extra_args)[param_overload]]
}

sum(do.call(cur_fun, args = cur_extra_args))
}) |> t() |> c()

Check warning on line 359 in R/utils_dim_control_and_checks.R

View workflow job for this annotation

GitHub Actions / SuperLinter 🦸‍♀️ / Lint R code 🧶

file=R/utils_dim_control_and_checks.R,line=359,col=10,[pipe_continuation_linter] `|>` should always have a space before it and a new line after it, unless the full pipeline fits on one line.

Check warning on line 359 in R/utils_dim_control_and_checks.R

View workflow job for this annotation

GitHub Actions / SuperLinter 🦸‍♀️ / Lint R code 🧶

file=R/utils_dim_control_and_checks.R,line=359,col=17,[pipe_continuation_linter] `|>` should always have a space before it and a new line after it, unless the full pipeline fits on one line.
)
lb_tb2 <- dplyr::rename(lb_tb2, !!col_fun_name := very_tmp_name)
}

if (any(lb_tb2$is_numeric)) {
lb_tb2 <- dplyr::mutate(lb_tb2,
"mean" = NA_real_,
"sd" = NA_real_,
"quantile" = NA_real_
)
# for numeric
lb_tb2$mean[lb_tb2$is_numeric] <- apply(df[, lb_tb2$is_numeric], 2, function(i) {
mean(i, na.rm = TRUE)
})
lb_tb2$sd[lb_tb2$is_numeric] <- apply(df[, lb_tb2$is_numeric], 2, function(i) {
sd(i, na.rm = TRUE)
})
lb_tb2$quantile[lb_tb2$is_numeric] <- apply(df[, lb_tb2$is_numeric], 2, function(i) {
as.list(quantile(i, na.rm = TRUE))
})
}

# return
return(lb_tb2)
}

# Any element of column is an empty string? columns are vectors
#' @rdname describe_cols
#' @export
is_empty_string <- function(column) {
if (anyNA(column)) {
column[is.na(column)] <- "NA"
}
return(as.character(column) == "")
}
# Function to check for long strings
#' @rdname describe_cols
#' @export
check_long_strings <- function(column, max_length = 80) {
checkmate::assert_int(max_length, lower = 1, upper = 500, null.ok = FALSE)
column <- as.character(column)
if (anyNA(column)) {
column[is.na(column)] <- "NA"
}
long_strings <- nchar(column) > max_length
if (any(long_strings, na.rm = TRUE)) {
return(long_strings)
} else {
return(NULL)
}
}

# Function to check for specific special characters
#' @rdname describe_cols
#' @export
check_special_chars <- function(column, special_chr = "[\\n\\r\\{\\}\\[\\]]") {
checkmate::assert_character(special_chr, null.ok = FALSE)
column <- as.character(column)
if (anyNA(column)) {
column[is.na(column)] <- "NA"
}
special_chars <- stringr::str_detect(column, stringr::regex(special_chr))
if (any(special_chars, na.rm = TRUE)) {
return(special_chars)
} else {
return(NULL)
}
}

#' @examples
#' # Lets find those values that are too long
#'
#' @rdname describe_cols
#' @export
detect_issues <- function(df,
additional_checks = list(
"too_long" = check_long_strings,
"special_chars" = check_special_chars
),
extra_args = list(max_length = 80, special_chr = "outlier"),
max_desc_length = 80) {
checkmate::assert_int(max_desc_length, lower = 1, upper = 500, null.ok = FALSE)
df_col_desc <- describe_cols(df = df, additional_checks = additional_checks, extra_args = extra_args)
condition_v <- list(
"have_long_desc" = ifelse(is.na(df_col_desc$desc), FALSE, nchar(df_col_desc$desc) > max_desc_length),
"have_empty_levels" = df_col_desc$n_empty_levels > 0,
"have_nas_not_levels" = df_col_desc$n_empty_levels < 0
)
if (any(condition_v$have_long_desc)) {
cur_c <- condition_v$have_long_desc
message(
"The following columns have long descriptions: ",
paste(df_col_desc$col.name[cur_c], collapse = ", ")
)
}
if (any(condition_v$have_empty_levels)) {
cur_c <- condition_v$have_empty_levels
message(
"The following columns are ", unique(df_col_desc$type[cur_c]), " and have empty levels: ",
paste(df_col_desc$col.name[cur_c], collapse = ", ")
)
}
if (any(condition_v$have_nas_not_levels)) {
cur_c <- condition_v$have_nas_not_levels
message(
"The following columns are ", unique(df_col_desc$type[cur_c]), " and have NAs that are not levels: ",
paste(df_col_desc$col.name[cur_c], collapse = ", ")
)
}
}
7 changes: 7 additions & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,10 @@ reference:
- sample_fct
- rcd_var_relabel
- visit_schedule

- title: Utility Functions For Dimensionaly Control and Reduction
desc: Function to simplify data pre-processing and recognize error-prone inputs
contents:
- reduce_num_levels_in_df
- describe_cols
-
2 changes: 2 additions & 0 deletions inst/WORDLIST
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,15 @@ WOS
Xiuting
Xuefeng
anthropometric
casted
cdisc
datetime
de
dipietrc
funder
metres
npaszty
quartiles
repo
tomlinsj
wojciakw
Loading

0 comments on commit ca863df

Please sign in to comment.