Skip to content

Commit

Permalink
Extensive support for conditioned tibbles
Browse files Browse the repository at this point in the history
- Joins by raw and target data sets are now aware of conditioned tibbles
- Transformation functions, namely `assign_datetime()`, `hardcode*()` and `assign*` are also conditioned-tibble aware
- Unit test coverage for most cases indicated at #54

I believe the essential components are here to support the if_then_else algorithm via conditioned tibbles. Now, further testing, assertions and documentation is needed.
  • Loading branch information
ramiromagno committed May 29, 2024
1 parent d08794b commit 00e758a
Show file tree
Hide file tree
Showing 16 changed files with 420 additions and 80 deletions.
34 changes: 10 additions & 24 deletions R/assign.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#' the variables indicated in `id_vars`. This parameter is optional, see
#' section Value for how the output changes depending on this argument value.
#' @param id_vars Key variables to be used in the join between the raw dataset
#' (`raw_dat`) and the target data set (`raw_dat`).
#' (`raw_dat`) and the target data set (`tgt_dat`).
#'
#' @returns The returned data set depends on the value of `tgt_dat`:
#' - If no target dataset is supplied, meaning that `tgt_dat` defaults to
Expand Down Expand Up @@ -53,33 +53,19 @@ sdtm_assign <- function(raw_dat,
assert_ct_spec(ct_spec, optional = TRUE)
assert_ct_clst(ct_spec = ct_spec, ct_clst = ct_clst, optional = TRUE)

# Recode the raw variable following terminology.
tgt_val <- ct_map(raw_dat[[raw_var]], ct_spec = ct_spec, ct_clst = ct_clst)

# Apply derivation by assigning `raw_var` to `tgt_var`.
# `der_dat`: derived dataset.
der_dat <-
join_dat <-
raw_dat |>
dplyr::select(c(id_vars, raw_var)) |>
mutate("{tgt_var}" := tgt_val) |> # nolint object_name_linter()
dplyr::select(-rlang::sym(raw_var))
dplyr::select(dplyr::all_of(c(id_vars, raw_var))) |>
sdtm_join(tgt_dat = tgt_dat, id_vars = id_vars)

# If a target dataset is supplied, then join the so far derived dataset with
# the target dataset (`tgt_dat`), otherwise leave it be.
der_dat <-
if (!is.null(tgt_dat)) {
# If variable `tgt_var` exists in `tgt_dat` remove it as we want to
# keep the derived variable in `der_dat`.
tgt_dat <- dplyr::select(tgt_dat, -dplyr::any_of(tgt_var))
# Recode the raw variable following terminology.
tgt_val <- ct_map(join_dat[[raw_var]], ct_spec = ct_spec, ct_clst = ct_clst)

der_dat |>
dplyr::right_join(y = tgt_dat, by = id_vars) |>
dplyr::relocate(tgt_var, .after = dplyr::last_col())
} else {
der_dat
}
join_dat |>
mutate("{tgt_var}" := tgt_val) |> # nolint object_name_linter()
dplyr::select(-dplyr::any_of(setdiff(raw_var, tgt_var))) |>
dplyr::relocate(dplyr::all_of(tgt_var), .after = dplyr::last_col())

der_dat
}

#' Derive an SDTM variable
Expand Down
34 changes: 12 additions & 22 deletions R/assign_datetime.R
Original file line number Diff line number Diff line change
Expand Up @@ -170,30 +170,20 @@ assign_datetime <-
admiraldev::assert_character_vector(raw_unk)
admiraldev::assert_logical_scalar(.warn)

tgt_val <-
create_iso8601(!!!raw_dat[raw_var],
.format = raw_fmt,
.na = raw_unk,
.warn = .warn
)

der_dat <-
join_dat <-
raw_dat |>
dplyr::select(dplyr::all_of(c(id_vars, raw_var))) |>
dplyr::mutate("{tgt_var}" := tgt_val) |> # nolint object_name_linter()
dplyr::select(-dplyr::any_of(raw_var))
sdtm_join(tgt_dat = tgt_dat, id_vars = id_vars)

der_dat <-
if (!is.null(tgt_dat)) {
# If variable `tgt_var` exists in `tgt_dat` remove it as we want to
# keep the derived variable in `der_dat`.
tgt_dat <- dplyr::select(tgt_dat, -dplyr::any_of(tgt_var))
der_dat |>
dplyr::right_join(y = tgt_dat, by = id_vars) |>
dplyr::relocate(tgt_var, .after = dplyr::last_col())
} else {
der_dat
}
tgt_val <-
create_iso8601(!!!join_dat[raw_var],
.format = raw_fmt,
.na = raw_unk,
.warn = .warn
)

der_dat
join_dat |>
mutate("{tgt_var}" := tgt_val) |> # nolint object_name_linter()
dplyr::select(-dplyr::any_of(setdiff(raw_var, tgt_var))) |>
dplyr::relocate(dplyr::all_of(tgt_var), .after = dplyr::last_col())
}
23 changes: 18 additions & 5 deletions R/cnd_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,7 @@ rm_cnd_df <- function(dat) {
return(dat)
}

#' Print
#'
#' Blah
#' Conditioned tibble header print method
#'
#' @param x A conditioned tibble of class `cnd_df`.
#' @param ... Additional arguments passed to the default print method.
Expand All @@ -143,6 +141,9 @@ lgl_to_chr <- function(x) {
ifelse(is.na(x), "-", ifelse(x, "T", "F"))
}

#' Conditioned tibble pillar print method
#'
#' @inheritParams pillar::ctl_new_rowid_pillar
#' @importFrom pillar ctl_new_rowid_pillar
#' @export
ctl_new_rowid_pillar.cnd_df <- function(controller, x, width, ...) {
Expand Down Expand Up @@ -261,7 +262,7 @@ eval_conditions <- function(dat,
#' Condition a data set based on specified conditions
#'
#' This function tags records in a data set, indicating which rows match the
#' specified conditions.
#' specified conditions, resulting in a conditioned data frame.
#'
#' @param dat A tibble.
#' @param ... Conditions to filter the tibble.
Expand All @@ -272,6 +273,8 @@ eval_conditions <- function(dat,
#' expression passed in `...`. A data frame or a list can also be passed that
#' will be coerced to an environment internally.
#'
#' @returns A conditioned data frame.
#'
#' @export
condition_by <- function(dat, ..., .na = NA, .env = rlang::env()) {

Expand All @@ -286,6 +289,16 @@ condition_by <- function(dat, ..., .na = NA, .env = rlang::env()) {
new_cnd_df(dat, cnd = cnd, .warn = FALSE)
}

#' Mutate method for conditioned data frames
#'
#' [mutate.cnd_df()] is an S3 method to be dispatched by [mutate][dplyr::mutate]
#' generic on conditioned data frames. This function implements a conditional
#' mutate by only changing rows for which the condition stored in the
#' conditioned data frame is `TRUE`.
#'
#' @param .data A conditioned data frame.
#'
#' @inheritParams dplyr::mutate
#' @importFrom dplyr mutate
#' @export
mutate.cnd_df <- function(.data,
Expand All @@ -304,5 +317,5 @@ mutate.cnd_df <- function(.data,
lst <- purrr::map(derivations, ~ rlang::expr(dplyr::if_else({{cnd}}, !!.x, NA)))
lst <- rlang::set_names(lst, derived_vars)

dplyr::mutate(dat, !!!lst)
dplyr::mutate(dat, !!!lst, .by = .by, .keep = .keep, .after = .after)
}
33 changes: 9 additions & 24 deletions R/hardcode.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#' the variables indicated in `id_vars`. This parameter is optional, see
#' section Value for how the output changes depending on this argument value.
#' @param id_vars Key variables to be used in the join between the raw dataset
#' (`raw_dat`) and the target data set (`raw_dat`).
#' (`raw_dat`) and the target data set (`tgt_dat`).
#'
#' @returns The returned data set depends on the value of `tgt_dat`:
#' - If no target dataset is supplied, meaning that `tgt_dat` defaults to
Expand Down Expand Up @@ -56,33 +56,18 @@ sdtm_hardcode <- function(raw_dat,
assert_ct_spec(ct_spec, optional = TRUE)
assert_ct_clst(ct_spec = ct_spec, ct_clst = ct_clst, optional = TRUE)

# Recode the hardcoded value following terminology.
tgt_val <- ct_map(tgt_val, ct_spec = ct_spec, ct_clst = ct_clst)

# Apply derivation of the hardcoded value.
# `der_dat`: derived dataset.
der_dat <-
join_dat <-
raw_dat |>
dplyr::select(dplyr::all_of(c(id_vars, raw_var))) |>
mutate("{tgt_var}" := recode(x = !!rlang::sym(raw_var), to = tgt_val)) |> # nolint object_name_linter()
dplyr::select(-dplyr::any_of(raw_var))
sdtm_join(tgt_dat = tgt_dat, id_vars = id_vars)

# If a target dataset is supplied, then join the so far derived dataset with
# the target dataset (`tgt_dat`), otherwise leave it be.
der_dat <-
if (!is.null(tgt_dat)) {
# If variable `tgt_var` exists in `tgt_dat` remove it as we want to
# keep the derived variable in `der_dat`.
tgt_dat <- dplyr::select(tgt_dat, -dplyr::any_of(tgt_var))

der_dat |>
dplyr::right_join(y = tgt_dat, by = id_vars) |>
dplyr::relocate(tgt_var, .after = dplyr::last_col())
} else {
der_dat
}
# Recode the hardcoded value following terminology.
tgt_val <- ct_map(tgt_val, ct_spec = ct_spec, ct_clst = ct_clst)

der_dat
join_dat |>
mutate("{tgt_var}" := recode(x = !!rlang::sym(raw_var), to = tgt_val)) |> # nolint object_name_linter()
dplyr::select(-dplyr::any_of(setdiff(raw_var, tgt_var))) |>
dplyr::relocate(dplyr::all_of(tgt_var), .after = dplyr::last_col())
}

#' Derive an SDTM variable with a hardcoded value
Expand Down
39 changes: 39 additions & 0 deletions R/join.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#' SDTM join
#'
#' [sdtm_join()] is a special join between a raw data set and a target data
#' set. This function supports conditioned data frames.
#'
#' @param raw_dat The raw dataset: a dataframe or a conditioned data frame. Must
#' include the variables passed in `id_vars`.
#' @param tgt_dat Target dataset: a data frame or a conditioned data frame to be
#' merged against `raw_dat` by the variables indicated in `id_vars`.
#' @param id_vars Key variables to be used in the join between the raw dataset
#' (`raw_dat`) and the target data set (`raw_dat`).
#'
#' @returns A data frame, or a conditioned data frame if at least one of the
#' input data sets is a conditioned data frame.
#'
#' @keywords internal
#' @importFrom rlang %||%
sdtm_join <- function(raw_dat,
tgt_dat = NULL,
id_vars = oak_id_vars()) {
raw_dat_cnd <- get_cnd_df_cnd(raw_dat) %||% rep(TRUE, nrow(raw_dat))
tgt_dat <- tgt_dat %||% raw_dat[id_vars]
tgt_dat_cnd <- get_cnd_df_cnd(tgt_dat) %||% rep(TRUE, nrow(tgt_dat))

# `rm_cnd_df()` prevents `mutate` from dispatching.
raw_dat <- dplyr::mutate(rm_cnd_df(raw_dat), `__raw_dat_cond__` = raw_dat_cnd)
tgt_dat <- dplyr::mutate(rm_cnd_df(tgt_dat), `__tgt_dat_cond__` = tgt_dat_cnd)

res <- dplyr::right_join(raw_dat, y = tgt_dat, by = id_vars)

cnd <- res$`__raw_dat_cond__` & res$`__tgt_dat_cond__`
res |>
dplyr::select(-dplyr::all_of(c(
"__raw_dat_cond__", "__tgt_dat_cond__"
))) |>
new_cnd_df(cnd = cnd, .warn = FALSE)

}

39 changes: 39 additions & 0 deletions R/sdtm_join.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#' SDTM join
#'
#' [sdtm_join()] is a special join between a raw data set and a target data
#' set. This function supports conditioned data frames.
#'
#' @param raw_dat The raw dataset: a dataframe or a conditioned data frame. Must
#' include the variables passed in `id_vars`.
#' @param tgt_dat Target dataset: a data frame or a conditioned data frame to be
#' merged against `raw_dat` by the variables indicated in `id_vars`.
#' @param id_vars Key variables to be used in the join between the raw dataset
#' (`raw_dat`) and the target data set (`tgt_dat`).
#'
#' @returns A data frame, or a conditioned data frame if, at least, one of the
#' input data sets is a conditioned data frame.
#'
#' @keywords internal
#' @importFrom rlang %||%
sdtm_join <- function(raw_dat,
tgt_dat = NULL,
id_vars = oak_id_vars()) {
raw_dat_cnd <- get_cnd_df_cnd(raw_dat) %||% rep(TRUE, nrow(raw_dat))
tgt_dat <- tgt_dat %||% raw_dat[id_vars]
tgt_dat_cnd <- get_cnd_df_cnd(tgt_dat) %||% rep(TRUE, nrow(tgt_dat))

# `rm_cnd_df()` prevents `mutate` from dispatching.
raw_dat <- dplyr::mutate(rm_cnd_df(raw_dat), `__raw_dat_cond__` = raw_dat_cnd)
tgt_dat <- dplyr::mutate(rm_cnd_df(tgt_dat), `__tgt_dat_cond__` = tgt_dat_cnd)

res <- dplyr::right_join(raw_dat, y = tgt_dat, by = id_vars)

cnd <- res$`__raw_dat_cond__` & res$`__tgt_dat_cond__`
res |>
dplyr::select(-dplyr::all_of(c(
"__raw_dat_cond__", "__tgt_dat_cond__"
))) |>
new_cnd_df(cnd = cnd, .warn = FALSE)

}

1 change: 1 addition & 0 deletions inst/WORDLIST
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ AE
AESTDY
CMSTDY
DM
ungrouped
4 changes: 3 additions & 1 deletion man/condition_by.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 20 additions & 0 deletions man/ctl_new_rowid_pillar.cnd_df.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

60 changes: 60 additions & 0 deletions man/mutate.cnd_df.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/sdtm_assign.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 00e758a

Please sign in to comment.