Basic support for "conditioned" data frames

- Adds a new S3 class (cnd_df) for represented conditioned data frames, i.e. data frames that carry metadata about what records should be used for derivations - Adds support for basic pretty printing of cnd_df objects - Adds a user-facing function for creating such cnd_df objects: `condition_by` - Adds experimental "mutate"-version function for these conditioned data frames: `derive_by_condition()`
pharmaverse · May 24, 2024 · 8dfa57f · 8dfa57f
1 parent f9816fb
commit 8dfa57f
Show file tree

Hide file tree

Showing 11 changed files with 1,782 additions and 1 deletion.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -46,7 +46,8 @@ Imports:
     tibble,
     vctrs,
     readr,
-    glue
+    glue,
+    pillar
 Suggests:
     knitr,
     rmarkdown,

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,10 +1,12 @@
 # Generated by roxygen2: do not edit by hand
 
 S3method(print,iso8601)
+S3method(tbl_sum,cnd_df)
 export(assign_ct)
 export(assign_datetime)
 export(assign_no_ct)
 export(clear_cache)
+export(condition_by)
 export(create_iso8601)
 export(ct_map)
 export(ct_spec_example)
@@ -16,6 +18,8 @@ export(hardcode_no_ct)
 export(problems)
 export(read_ct_spec)
 export(read_ct_spec_example)
+export(rm_cnd_df)
+importFrom(pillar,tbl_sum)
 importFrom(rlang,"%||%")
 importFrom(rlang,":=")
 importFrom(rlang,.data)

diff --git a/R/assertions.R b/R/assertions.R
@@ -0,0 +1,16 @@
+# Surprisingly, admiraldev doesn't provide `assert_logical_vector`.
+assert_logical_vector <- function(arg, optional = FALSE) {
+  if (optional && is.null(arg)) {
+    return(invisible(arg))
+  }
+
+  if (!is.logical(arg)) {
+    err_msg <- sprintf(
+      "`arg` must be a logical vector but is %s.",
+      admiraldev::what_is_it(arg)
+    )
+    rlang::abort(err_msg)
+  }
+
+  invisible(arg)
+}
diff --git a/R/cnd_df.R b/R/cnd_df.R
@@ -0,0 +1,299 @@
+# ------------------------------------------------------------------------------
+# File: cnd_df.R
+# Package: sdtm.oak
+# Author: Ramiro Magno <[email protected]>
+# Created: 2024-05-23
+# Last Modified: 2024-05-23
+# ------------------------------------------------------------------------------
+# Description:
+#
+# This file contains functions and scripts related to the so-called conditioned
+# data frames, i.e. those data frames extended with class `cnd_df`.
+#
+# Functions:
+#
+# - `new_cnd_df()`: Create a "conditioned data set" (class cnd_df).
+# - `is_cnd_df()`: Assess whether the argument is a `cnd_df` data frame.
+# - `get_cnd_df_cnd()`: Extract the attribute "cnd" from a `cnd_df` data frame.
+# - `get_cnd_df_cnd_sum()`: Extract the attribute "cnd_sum" from a `cnd_df` data frame.
+# - `rm_cnd_df()`: De-class a cnd_df data frame.
+# - `tbl_sum.cnd_df()`: Provide a tibble header print method for `cnd_df` tibbles.
+# - `ctl_new_rowid_pillar.cnd_df()`: A print method for the row ids cnd_df` tibbles.
+# - `eval_conditions()`: Find which rows match a set of conditions.
+# - `condition_by()`: Create a conditioned data frame (user facing).
+# - `derive_by_condition()`: Perform a derivation on a conditioned data frame.
+
+#' Create a data frame with filtering tags
+#'
+#' [new_cnd_df()] creates a _conditioned_ data frame, classed `cnd_df`, meaning
+#' that this function extends the data frame passed as argument by storing a
+#' logical vector `cnd` (as attribute) that marks rows for posterior conditional
+#' transformation by methods that support _conditioned_ data frames.
+#'
+#' @param dat A data frame.
+#' @param cnd A logical vector. Length must match the number of rows in `dat`.
+#' @param .warn Whether to warn about creating a new _conditioned_ data frame
+#' in case that `dat` already is one.
+#'
+#' @returns A data frame `dat` with the additional class `"cnd_df"` and the
+#'   following attributes:
+#'
+#' - `cnd`: The logical vector passed as argument `cnd`: `TRUE` values mark
+#' rows in `dat` to be used for transformations; rows marked with `FALSE` are
+#' not transformed; and `NA` mark rows whose transformations are to be applied
+#' resulting in `NA`.
+#' - `cnd_sum`: An integer vector of three elements providing the sum of `TRUE`,
+#' `FALSE` and `NA` values in `cnd`, respectively.
+#'
+#' @examples
+#' df <- data.frame(x = 1:3, y = letters[1:3])
+#' sdtm.oak:::new_cnd_df(dat = df, cnd = c(FALSE, NA, TRUE))
+#'
+#' @keywords internal
+new_cnd_df <- function(dat, cnd, .warn = TRUE) {
+
+  admiraldev::assert_data_frame(dat)
+  assert_logical_vector(cnd)
+
+  if (!identical(nrow(dat), length(cnd))) {
+    msg <- c(
+      "Number of rows in `dat` must match length of `cond`."
+    )
+    rlang::abort(message = msg)
+  }
+
+  is_cnd_df <- inherits(dat, "cnd_df")
+  if (.warn && is_cnd_df) {
+    msg <- "`dat` is already a conditioned data frame (`cnd_df`)."
+    rlang::warn(message = msg)
+  }
+
+  if (!is_cnd_df) {
+    class(dat) <- c("cnd_df", class(dat))
+  }
+
+  n_true <- sum(cnd, na.rm = TRUE)
+  n_false <- sum(!cnd, na.rm = TRUE)
+  n_na <- length(cnd) - (n_true + n_false)
+  cnd_sum <- c(n_true = n_true, n_false = n_false, n_na = n_na)
+
+  attr(dat, "cnd") <- cnd
+  attr(dat, "cnd_sum") <- cnd_sum
+
+  return(dat)
+}
+
+is_cnd_df <- function(dat) {
+  inherits(dat, "cnd_df")
+}
+
+get_cnd_df_cnd <- function(dat) {
+  if (is_cnd_df(dat)) {
+    attr(dat, "cnd")
+  } else {
+    NULL
+  }
+}
+
+get_cnd_df_cnd_sum <- function(dat) {
+  if (is_cnd_df(dat)) {
+    attr(dat, "cnd_sum")
+  } else {
+    NULL
+  }
+}
+
+#' Remove the cnd_df class from a data frame
+#'
+#' This function removes the 'cnd_df' class, along with its attributes, if
+#' applicable.
+#'
+#' @param dat A data frame.
+#' @return The input `dat` without the 'cnd_df' class.
+#'
+#' @export
+rm_cnd_df <- function(dat) {
+  if (is_cnd_df(dat)) {
+    class(dat) <- class(dat)[class(dat) != "cnd_df"]
+    attr(dat, "cnd") <- NULL
+    attr(dat, "cnd_sum") <- NULL
+  }
+  return(dat)
+}
+
+#' Print
+#'
+#' Blah
+#'
+#' @param x A conditioned tibble of class 'cnd_df'.
+#' @param ... Additional arguments passed to the default print method.
+#'
+#' @importFrom pillar tbl_sum
+#' @export
+tbl_sum.cnd_df <- function(x, ...) {
+  default_header <- NextMethod()
+
+  tally <- get_cnd_df_cnd_sum(x)
+  h2 <- sprintf("%d/%d/%d", tally[1], tally[2], tally[3])
+  c(default_header, "Cond. tbl" = h2)
+}
+
+lgl_to_chr <- function(x) {
+  ifelse(is.na(x), "-", ifelse(x, "T", "F"))
+}
+
+ctl_new_rowid_pillar.cnd_df <- function(controller, x, width, ...) {
+
+  out <- NextMethod()
+  n_row <- nrow(x)
+  idx <- seq_len(n_row)
+  i <- sprintf("%d", idx)
+  i_width <- nchar(as.character(i))
+  i_max_width <- max(i_width)
+  max_width <- i_max_width + 2
+  ws <- strrep(" ", max_width - i_width - 1)
+  abb_lgl <- lgl_to_chr(attr(controller, "cnd")[idx])
+
+  row_ids <- paste0(i, ws, abb_lgl)
+  width <- max(nchar(as.character(row_ids)))
+  pillar::new_pillar(
+    list(
+      title = out$title,
+      type = out$type,
+      data = pillar::pillar_component(
+        pillar::new_pillar_shaft(list(row_ids = row_ids),
+                                 width = width,
+                                 class = "pillar_rif_shaft"
+        )
+      )
+    ),
+    width = width
+  )
+}
+
+#' Evaluate conditions
+#'
+#' @description
+#' [eval_conditions()] evaluates a set of conditions in the context of a
+#' data frame and an optional environment.
+#'
+#' The utility of this function is to provide an easy way to generate a logical
+#' vector of matching records from a set of logical conditions involving
+#' variables in a data frame (`dat`) and optionally in a supplementary
+#' environment (`.env`). The set of logical conditions are provided as
+#' expressions to be evaluated in the context of `dat` and `.env`.
+#'
+#' Variables are looked up in `dat`, then in `.env`, then in the calling
+#' function's environment, followed by its parent environments.
+#'
+#' @param dat A data frame
+#' @param ... A set of logical conditions, e.g. `y & z, x | z` (`x`, `y`, `z`
+#'   would have to exist either as columns in `dat` or in the enviroment
+#'   `.env`). If multiple expressions are included, they are combined with the
+#'   `&` operator.
+#' @param .na Return value to be used when the conditions evalute to `NA`.
+#' @param .env An optional environment to look for variables involved in logical
+#'   expression passed in `...`. A data frame or a list can also be passed that
+#'   will be coerced to an environment internally.
+#'
+#' @returns A logical vector reflecting matching rows in `dat`.
+#'
+#' @examples
+#' # Create a sample data frame
+#' df <- data.frame(
+#'   x = c(1, 2, 3, 4, 5),
+#'   y = c(TRUE, FALSE, TRUE, FALSE, TRUE),
+#'   z = c("a", "b", "a", "b", "a")
+#' )
+#'
+#' # Simple condition on one column
+#' sdtm.oak:::eval_conditions(df, x > 2)
+#'
+#' # Combined conditions on multiple columns
+#' sdtm.oak:::eval_conditions(df, x > 2, y)
+#'
+#' # Using conditions with NA handling
+#' df_with_na <- data.frame(
+#'   x = c(1, 2, NA, 4, 5),
+#'   y = c(TRUE, FALSE, TRUE, FALSE, TRUE)
+#' )
+#' sdtm.oak:::eval_conditions(df_with_na, x > 2, .na = FALSE)
+#'
+#' # The environment where `eval_conditions()` is called is also inspected
+#' # when evaluating conditions in `...`.
+#' w <- 1
+#' sdtm.oak:::eval_conditions(df, x > w)
+#'
+#' # Using an environment
+#' env <- rlang::env(w = 2)
+#' sdtm.oak:::eval_conditions(df, x > w, .env = env)
+#'
+#' # In place of an environment, you may alternatively pass a list or data frame.
+#' sdtm.oak:::eval_conditions(df, x > w, .env = list(w = 3))
+#' sdtm.oak:::eval_conditions(df, x > w, .env = tibble::tibble(w = 4))
+#'
+#' @keywords internal
+eval_conditions <- function(dat,
+                            ...,
+                            .na = NA,
+                            .env = rlang::env()) {
+
+  conditions <- rlang::enexprs(...)
+
+  # List (or data frame).
+  if (is.list(.env)) {
+    .env <- rlang::as_environment(.env, parent = rlang::env())
+  }
+
+  lgl_vctrs <-
+    conditions |>
+    purrr::map(~ rlang::eval_tidy(.x, dat, env = .env)) |>
+    purrr::map(~ dplyr::if_else(is.na(.x), .na, .x))
+
+  cnd <- purrr::reduce(lgl_vctrs, `&`, .init = rep(TRUE, nrow(dat)))
+
+  cnd
+}
+
+#' Condition a data set based on specified conditions
+#'
+#' This function tags records in a data set, indicating which rows match the
+#' specified conditions.
+#'
+#' @param dat A tibble.
+#' @param ... Conditions to filter the tibble.
+#' @return A tibble with an additional class 'cnd_df' and a logical vector
+#'   attribute indicating matching rows.
+#' @param .na Return value to be used when the conditions evalute to `NA`.
+#' @param .env An optional environment to look for variables involved in logical
+#'   expression passed in `...`. A data frame or a list can also be passed that
+#'   will be coerced to an environment internally.
+#'
+#' @export
+condition_by <- function(dat, ..., .na = NA, .env = rlang::env()) {
+
+  if (is_cnd_df(dat)) {
+    rlang::warn(
+      c("`dat` is already a conditioned data frame (`cnd_df`).",
+      "The previous condition will be replaced by the new one.")
+    )
+  }
+
+  cnd <- eval_conditions(dat = dat, ..., .na = .na, .env = .env)
+  new_cnd_df(dat, cnd = cnd, .warn = FALSE)
+}
+
+#' @keywords internal
+derive_by_condition <- function(dat, ...) {
+
+  cnd <- get_cnd_df_cnd(dat)
+  derivations <- rlang::enquos(...)
+  derived_vars <- names(derivations)
+
+  lst <- purrr::map(derivations, ~ rlang::expr(dplyr::if_else({{cnd}}, !!.x, NA)))
+  lst <- rlang::set_names(lst, derived_vars)
+  dat2 <- dplyr::mutate({{dat}}, !!!lst)
+  rm_cnd_df(dat2)
+}
+
+
diff --git a/man/condition_by.Rd b/man/condition_by.Rd
-Original file line number
+Diff line change
@@ Expand Up / @@ -46,7 +46,8 @@ Imports: @@
         tibble,
         vctrs,
         readr,
-        glue
+        glue,
+        pillar
     Suggests:
         knitr,
         rmarkdown,
@@ Expand Down @@