diff --git a/.Rbuildignore b/.Rbuildignore index c8038ef3..80fe0c2f 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -14,3 +14,4 @@ ^data-raw$ ^staged_dependencies.yaml$ ^vignettes/articles$ +^inst/ct/README.md$ diff --git a/DESCRIPTION b/DESCRIPTION index ce8b29a7..8abff617 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: sdtm.oak Type: Package Title: SDTM Data Transformation Engine -Version: 0.0.0.9001 +Version: 0.0.0.9002 Authors@R: c( person("Rammprasad", "Ganapathy", role = c("aut", "cre"), email = "ganapathy.rammprasad@gene.com"), @@ -38,11 +38,15 @@ Depends: R (>= 4.2) Imports: admiraldev, dplyr (>= 1.0.0), + memoise, assertthat, purrr (>= 0.3.3), rlang (>= 0.4.4), stringr (>= 1.4.0), - tibble + tibble, + vctrs, + readr, + glue Suggests: knitr, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index f1161f5e..6170cee4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,9 +1,22 @@ # Generated by roxygen2: do not edit by hand S3method(print,iso8601) +export(assign_ct) +export(assign_no_ct) +export(clear_cache) export(create_iso8601) +export(ct_map) +export(ct_spec_example) +export(ct_spec_vars) export(derive_study_day) export(fmt_cmp) +export(hardcode_ct) +export(hardcode_no_ct) export(problems) +export(read_ct_spec) +export(read_ct_spec_example) +importFrom(rlang,"%||%") +importFrom(rlang,":=") importFrom(rlang,.data) +importFrom(stats,na.omit) importFrom(tibble,tibble) diff --git a/NEWS.md b/NEWS.md index 77776156..297b7d53 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,15 @@ +# sdtm.oak 0.0.0.9002 (development version) + +## New Features + +* New function: `derive_study_day()` for study day calculation. + +* New functions for basic SDTM derivations: ` assign_no_ct()`, `assign_ct()`, +`hardcode_no_ct()` and `hardcode_ct()`. + +* New functions for handling controlled terminologies: `read_ct_spec()`, +`read_ct_spec_example()`, `ct_spec_example()` and `ct_map()`. + # sdtm.oak 0.0.0.9001 (development version) ## New Features diff --git a/R/.gitkeep b/R/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/R/assign.R b/R/assign.R new file mode 100644 index 00000000..a91eacaf --- /dev/null +++ b/R/assign.R @@ -0,0 +1,243 @@ +#' Derive an SDTM variable +#' +#' @description +#' [sdtm_assign()] is an internal function packing the same functionality as +#' [assign_no_ct()] and [assign_ct()] together but aimed at developers only. +#' As a user please use either [assign_no_ct()] or [assign_ct()]. +#' +#' @param raw_dat The raw dataset (dataframe); must include the +#' variables passed in `id_vars` and `raw_var`. +#' @param raw_var The raw variable: a single string indicating the name of the +#' raw variable in `raw_dat`. +#' @param tgt_var The target SDTM variable: a single string indicating the name +#' of variable to be derived. +#' @param ct_spec Study controlled terminology specification: a dataframe with a +#' minimal set of columns, see [ct_spec_vars()] for details. This parameter is +#' optional, if left as `NULL` no controlled terminology recoding is applied. +#' @param ct_clst A codelist code indicating which subset of the controlled +#' terminology to apply in the derivation. This parameter is optional, if left +#' as `NULL`, all possible recodings in `ct_spec` are attempted. +#' @param tgt_dat Target dataset: a data frame to be merged against `raw_dat` by +#' the variables indicated in `id_vars`. This parameter is optional, see +#' section Value for how the output changes depending on this argument value. +#' @param id_vars Key variables to be used in the join between the raw dataset +#' (`raw_dat`) and the target data set (`raw_dat`). +#' +#' @returns The returned data set depends on the value of `tgt_dat`: +#' - If no target dataset is supplied, meaning that `tgt_dat` defaults to +#' `NULL`, then the returned data set is `raw_dat`, selected for the variables +#' indicated in `id_vars`, and a new extra column: the derived variable, as +#' indicated in `tgt_var`. +#' - If the target dataset is provided, then it is merged with the raw data set +#' `raw_dat` by the variables indicated in `id_vars`, with a new column: the +#' derived variable, as indicated in `tgt_var`. +#' +#' +#' @importFrom rlang := +#' @keywords internal +sdtm_assign <- function(raw_dat, + raw_var, + tgt_var, + ct_spec = NULL, + ct_clst = NULL, + tgt_dat = NULL, + id_vars = oak_id_vars()) { + admiraldev::assert_character_scalar(raw_var) + admiraldev::assert_character_scalar(tgt_var) + admiraldev::assert_character_vector(id_vars) + assertthat::assert_that(contains_oak_id_vars(id_vars), + msg = "`id_vars` must include the oak id vars." + ) + admiraldev::assert_data_frame(raw_dat, required_vars = rlang::syms(c(id_vars, raw_var))) + admiraldev::assert_data_frame(tgt_dat, required_vars = rlang::syms(id_vars), optional = TRUE) + assert_ct_spec(ct_spec, optional = TRUE) + assert_ct_clst(ct_spec = ct_spec, ct_clst = ct_clst, optional = TRUE) + + # Recode the raw variable following terminology. + tgt_val <- ct_map(raw_dat[[raw_var]], ct_spec = ct_spec, ct_clst = ct_clst) + + # Apply derivation by assigning `raw_var` to `tgt_var`. + # `der_dat`: derived dataset. + der_dat <- + raw_dat |> + dplyr::select(c(id_vars, raw_var)) |> + dplyr::mutate("{tgt_var}" := tgt_val) |> # nolint object_name_linter() + dplyr::select(-rlang::sym(raw_var)) + + # If a target dataset is supplied, then join the so far derived dataset with + # the target dataset (`tgt_dat`), otherwise leave it be. + der_dat <- + if (!is.null(tgt_dat)) { + der_dat |> + dplyr::right_join(y = tgt_dat, by = id_vars) |> + dplyr::relocate(tgt_var, .after = dplyr::last_col()) + } else { + der_dat + } + + der_dat +} + +#' Derive an SDTM variable +#' +#' @description +#' - [assign_no_ct()] maps a variable in a raw dataset to a target SDTM +#' variable that has no terminology restrictions. +#' +#' - [assign_ct()] maps a variable in a raw dataset to a target SDTM variable +#' following controlled terminology recoding. +#' +#' @param raw_dat The raw dataset (dataframe); must include the +#' variables passed in `id_vars` and `raw_var`. +#' @param raw_var The raw variable: a single string indicating the name of the +#' raw variable in `raw_dat`. +#' @param tgt_var The target SDTM variable: a single string indicating the name +#' of variable to be derived. +#' @param ct_spec Study controlled terminology specification: a dataframe with a +#' minimal set of columns, see [ct_spec_vars()] for details. +#' @param ct_clst A codelist code indicating which subset of the controlled +#' terminology to apply in the derivation. +#' @param tgt_dat Target dataset: a data frame to be merged against `raw_dat` by +#' the variables indicated in `id_vars`. This parameter is optional, see +#' section Value for how the output changes depending on this argument value. +#' @param id_vars Key variables to be used in the join between the raw dataset +#' (`raw_dat`) and the target data set (`raw_dat`). +#' +#' @returns The returned data set depends on the value of `tgt_dat`: +#' - If no target dataset is supplied, meaning that `tgt_dat` defaults to +#' `NULL`, then the returned data set is `raw_dat`, selected for the variables +#' indicated in `id_vars`, and a new extra column: the derived variable, as +#' indicated in `tgt_var`. +#' - If the target dataset is provided, then it is merged with the raw data set +#' `raw_dat` by the variables indicated in `id_vars`, with a new column: the +#' derived variable, as indicated in `tgt_var`. +#' +#' @examples +#' +#' md1 <- +#' tibble::tibble( +#' oak_id = 1:14, +#' raw_source = "MD1", +#' patient_number = 101:114, +#' MDIND = c( +#' "NAUSEA", "NAUSEA", "ANEMIA", "NAUSEA", "PYREXIA", +#' "VOMITINGS", "DIARHHEA", "COLD", +#' "FEVER", "LEG PAIN", "FEVER", "COLD", "COLD", "PAIN" +#' ) +#' ) +#' +#' assign_no_ct( +#' raw_dat = md1, +#' raw_var = "MDIND", +#' tgt_var = "CMINDC", +#' ) +#' +#' cm_inter <- +#' tibble::tibble( +#' oak_id = 1:14, +#' raw_source = "MD1", +#' patient_number = 101:114, +#' CMTRT = c( +#' "BABY ASPIRIN", +#' "CORTISPORIN", +#' "ASPIRIN", +#' "DIPHENHYDRAMINE HCL", +#' "PARCETEMOL", +#' "VOMIKIND", +#' "ZENFLOX OZ", +#' "AMITRYPTYLINE", +#' "BENADRYL", +#' "DIPHENHYDRAMINE HYDROCHLORIDE", +#' "TETRACYCLINE", +#' "BENADRYL", +#' "SOMINEX", +#' "ZQUILL" +#' ), +#' CMROUTE = c( +#' "ORAL", +#' "ORAL", +#' NA, +#' "ORAL", +#' "ORAL", +#' "ORAL", +#' "INTRAMUSCULAR", +#' "INTRA-ARTERIAL", +#' NA, +#' "NON-STANDARD", +#' "RANDOM_VALUE", +#' "INTRA-ARTICULAR", +#' "TRANSDERMAL", +#' "OPHTHALMIC" +#' ) +#' ) +#' +#' # Controlled terminology specification +#' (ct_spec <- read_ct_spec_example("ct-01-cm")) +#' +#' assign_ct( +#' raw_dat = md1, +#' raw_var = "MDIND", +#' tgt_var = "CMINDC", +#' ct_spec = ct_spec, +#' ct_clst = "C66729", +#' tgt_dat = cm_inter +#' ) +#' +#' @name assign +NULL + +#' @order 1 +#' @export +#' @rdname assign +assign_no_ct <- function(raw_dat, + raw_var, + tgt_var, + tgt_dat = NULL, + id_vars = oak_id_vars()) { + admiraldev::assert_character_scalar(raw_var) + admiraldev::assert_character_scalar(tgt_var) + admiraldev::assert_character_vector(id_vars) + assertthat::assert_that(contains_oak_id_vars(id_vars), + msg = "`id_vars` must include the oak id vars." + ) + admiraldev::assert_data_frame(raw_dat, required_vars = rlang::syms(c(id_vars, raw_var))) + admiraldev::assert_data_frame(tgt_dat, required_vars = rlang::syms(id_vars), optional = TRUE) + + sdtm_assign( + raw_dat = raw_dat, + raw_var = raw_var, + tgt_var = tgt_var, + tgt_dat = tgt_dat, + id_vars = id_vars + ) +} + +#' @order 2 +#' @export +#' @rdname assign +assign_ct <- function(raw_dat, + raw_var, + tgt_var, + ct_spec, + ct_clst, + tgt_dat = NULL, + id_vars = oak_id_vars()) { + admiraldev::assert_character_scalar(raw_var) + admiraldev::assert_character_scalar(tgt_var) + admiraldev::assert_character_vector(id_vars) + assertthat::assert_that(contains_oak_id_vars(id_vars), + msg = "`id_vars` must include the oak id vars." + ) + admiraldev::assert_data_frame(raw_dat, required_vars = rlang::syms(c(id_vars, raw_var))) + admiraldev::assert_data_frame(tgt_dat, required_vars = rlang::syms(id_vars), optional = TRUE) + + sdtm_assign( + raw_dat = raw_dat, + raw_var = raw_var, + tgt_var = tgt_var, + tgt_dat = tgt_dat, + id_vars = id_vars, + ct_spec = ct_spec, + ct_clst = ct_clst + ) +} diff --git a/R/clear_cache.R b/R/clear_cache.R new file mode 100644 index 00000000..047bd23e --- /dev/null +++ b/R/clear_cache.R @@ -0,0 +1,19 @@ +#' Clear `{sdtm.oak}` cache of memoised functions +#' +#' @description +#' Some of `{sdtm.oak}` functions have their results cached for runtime +#' efficiency. Use this function to reset the cache. +#' +#' Memoised functions: +#' - [ct_mappings()] +#' +#' @return Returns a logical value, indicating whether the resetting of the +#' cache was successful (`TRUE`) or not (`FALSE`). +#' +#' @examples +#' clear_cache() +#' +#' @export +clear_cache <- function() { + memoise::forget(ct_mappings) +} diff --git a/R/ct.R b/R/ct.R new file mode 100644 index 00000000..32b0596a --- /dev/null +++ b/R/ct.R @@ -0,0 +1,434 @@ +#' Controlled terminology variables +#' +#' @description +#' [ct_spec_vars()] returns the mandatory variables to be present in a data set +#' representing a controlled terminology. By default, it returns all required +#' variables. +#' +#' If only the subset of variables used for matching terms are needed, then +#' request this subset of variables by passing the argument value `"from"`. If +#' only the mapping-to variable is to be requested, then simply pass `"to"`. If +#' only the codelist code variable name is needed then pass `"ct_clst"`. +#' +#' @param set A scalar character (string), one of: `"all"` (default), `"ct_clst"`, +#' `"from"` or `"to"`. +#' +#' @examples +#' # These two calls are equivalent and return all required variables in a +#' # controlled terminology data set. +#' sdtm.oak:::ct_spec_vars() +#' sdtm.oak:::ct_spec_vars("all") +#' +#' # "Codelist code" variable name. +#' sdtm.oak:::ct_spec_vars("ct_clst") +#' +#' # "From" variables +#' sdtm.oak:::ct_spec_vars("from") +#' +#' # The "to" variable. +#' sdtm.oak:::ct_spec_vars("to") +#' +#' @keywords internal +#' @export +ct_spec_vars <- function(set = c("all", "ct_clst", "from", "to")) { + admiraldev::assert_character_vector(set) + + set <- match.arg(set) + ct_clst_var <- "codelist_code" + from_vars <- c("collected_value", "term_synonyms") + to_var <- "term_value" + + if (identical(set, "all")) { + return(c(ct_clst_var, from_vars, to_var)) + } + + if (identical(set, "ct_clst")) { + return(ct_clst_var) + } + + if (identical(set, "from")) { + return(from_vars) + } + + if (identical(set, "to")) { + return(to_var) + } +} + +#' Assert a controlled terminology specification +#' +#' @description +#' [assert_ct_spec()] will check whether `ct_spec` is a data frame and if it contains the +#' variables: `r knitr::combine_words(ct_spec_vars())`. +#' +#' In addition, it will also check if the data frame is not empty (no rows), and +#' whether the columns \code{`r ct_spec_vars('ct_clst')`} and \code{`r ct_spec_vars('to')`} do +#' not contain any `NA` values. +#' +#' @param ct_spec A data frame to be asserted as a valid controlled terminology data +#' set. +#' +#' @returns The function throws an error if `ct_spec` is not a valid controlled +#' terminology data set; otherwise, `ct_spec` is returned invisibly. +#' +#' @examples +#' # If `ct_spec` is a valid controlled terminology then it is returned invisibly. +#' ct_spec_01 <- read_ct_spec_example("ct-01-cm") +#' all.equal(ct_spec_01, sdtm.oak:::assert_ct_spec(ct_spec_01)) +#' +#' # A minimal set of variables needs to be present in `ct_spec` for it to pass the +#' # assertion; `sdtm.oak:::ct_spec_vars()` defines their names. +#' (req_vars <- sdtm.oak:::ct_spec_vars()) +#' +#' # Other (facultative) variables also present in the controlled terminology +#' # example. +#' (opt_vars <- setdiff(colnames(ct_spec_01), req_vars)) +#' +#' # With only the mandatory variables, the assertion still passes. +#' sdtm.oak:::assert_ct_spec(ct_spec_01[req_vars]) +#' +#' # Not having the required variables results in an error. +#' try(sdtm.oak:::assert_ct_spec(ct_spec_01[opt_vars])) +#' +#' @keywords internal +assert_ct_spec <- function(ct_spec, optional = FALSE) { + admiraldev::assert_data_frame( + arg = ct_spec, + required_vars = rlang::syms(ct_spec_vars()), + optional = optional + ) + + if (!is.null(ct_spec) && nrow(ct_spec) == 0L) { + rlang::abort("`ct_spec` can't be empty.") + } + + if (!is.null(ct_spec) && anyNA(ct_spec[[ct_spec_vars("ct_clst")]])) { + rlang::abort(glue::glue("`{ct_spec_vars('ct_clst')}` can't have any NA values.")) + } + + if (!is.null(ct_spec) && anyNA(ct_spec[[ct_spec_vars("to")]])) { + rlang::abort(glue::glue("`{ct_spec_vars('to')}` can't have any NA values.")) + } + + invisible(ct_spec) +} + +#' Assert a codelist code +#' +#' [assert_ct_clst()] asserts the validity of a codelist code in the context of +#' a controlled terminology specification. +#' +#' @param ct_spec Either a data frame encoding a controlled terminology data set, or +#' `NULL`. +#' @param ct_clst A string with a to-be asserted codelist code, or `NULL`. +#' @param optional A scalar logical, indicating whether `ct_clst` can be `NULL` or +#' not. +#' +#' @returns The function throws an error if `ct_clst` is not a valid codelist code +#' given the controlled terminology data set; otherwise, `ct_clst` is returned +#' invisibly. +#' +#' @examples +#' # Load a controlled terminology example. +#' (ct_spec <- read_ct_spec_example("ct-01-cm")) +#' +#' # Should work fine. +#' sdtm.oak:::assert_ct_clst(ct_spec = ct_spec, ct_clst = "C71113") +#' +#' # In certain cases, you might allow `ct_clst` to be `NULL` as to indicate absence, +#' # in that case, set `optional` to `TRUE` to make `assert_ct_clst()` more +#' # forgiving. +#' sdtm.oak:::assert_ct_clst(ct_spec = ct_spec, ct_clst = NULL, optional = TRUE) +#' +#' # Otherwise it would err. +#' try(sdtm.oak:::assert_ct_clst(ct_spec = ct_spec, ct_clst = NULL, optional = FALSE)) +#' +#' @keywords internal +assert_ct_clst <- function(ct_spec, ct_clst, optional = FALSE) { + is_ct_spec_missing <- is.null(ct_spec) + is_ct_clst_missing <- is.null(ct_clst) + is_required_ct_clst_missing <- is_ct_clst_missing && !optional + is_ct_clst_without_ct_spec <- is_ct_spec_missing && !is_ct_clst_missing + are_ct_spec_ct_clst_available <- !is_ct_spec_missing && !is_ct_clst_missing + + if (!is_ct_clst_missing) { + admiraldev::assert_character_scalar(ct_clst) + } + + if (is_required_ct_clst_missing) { + rlang::abort("`ct_clst` is a required parameter.") + } + + if (is_ct_clst_without_ct_spec) { + rlang::abort("`ct_spec` must be a valid controlled terminology if `ct_clst` is supplied.") + } + + if (is_ct_clst_missing) { + return(invisible(NULL)) + } + + if (!is_ct_spec_missing && is.na(ct_clst)) { + rlang::abort("`ct_clst` can't be NA. Did you mean `NULL`?") + } + + if (are_ct_spec_ct_clst_available) { + assert_ct_spec(ct_spec, optional = FALSE) + ct_clst_possibilities <- + ct_spec |> + dplyr::pull(ct_spec_vars("ct_clst")) |> + unique() + admiraldev::assert_character_scalar(ct_clst, values = ct_clst_possibilities) + } + + return(ct_clst) +} + +#' Controlled terminology mappings +#' +#' @description +#' [ct_mappings()] takes a controlled terminology specification and returns the +#' mappings in the form of a [tibble][tibble::tibble-package] in long format, +#' i.e. the recoding of values in the `from` column to the `to` column values, +#' one mapping per row. +#' +#' The resulting mappings are unique, i.e. if `from` values are duplicated in +#' two `from` columns, the first column indicated in `from` takes precedence, +#' and only that mapping is retained in the controlled terminology map. +#' +#' @param ct_spec Controlled terminology specification as a +#' [tibble][tibble::tibble-package]. Each row is for a mapped controlled term. +#' Controlled terms are expected in the column indicated by `to_col`. +#' @param from A character vector of column names indicating the variables +#' containing values to be recoded. +#' @param to A single string indicating the column whose values are to be +#' recoded into. +#' +#' @returns A [tibble][tibble::tibble-package] with two columns, `from` and +#' `to`, indicating the mapping of values, one per row. +#' +#' @examples +#' # Read in a bundled controlled terminology spec example (ex. 01). +#' (ct_spec_01 <- read_ct_spec_example("ct-01-cm")) +#' +#' # Generate mappings from the terminology specification. +#' sdtm.oak:::ct_mappings(ct_spec = ct_spec_01) +#' +#' # Take a glimpse at those mappings where an actual recoding happens. +#' sdtm.oak:::ct_mappings(ct_spec = ct_spec_01) |> +#' dplyr::filter(from != to) |> +#' print(n = 20) +#' +#' @importFrom rlang .data +#' @keywords internal +ct_mappings <- function(ct_spec, from = ct_spec_vars("from"), to = ct_spec_vars("to")) { + assert_ct_spec(ct_spec) + cols <- c(to, from) + + ct_mappings <- + ct_spec |> + dplyr::mutate(to = !!rlang::sym(to)) |> + tidyr::pivot_longer( + cols = dplyr::all_of(cols), + values_to = "from", + names_to = "type" + ) |> + dplyr::select(c("type", "from", "to")) |> + dplyr::mutate(type = factor(.data$type, levels = cols)) |> + dplyr::arrange(.data$type) |> + dplyr::select(-"type") |> + tidyr::drop_na("from") |> + dplyr::mutate(from = str_split(.data$from)) |> + tidyr::unnest(from) |> + dplyr::filter(from != "") |> # In case the split resulted in empty strings. + dplyr::mutate(from = trimws(.data$from), to = trimws(.data$to)) |> + dplyr::distinct(.data$from, .keep_all = TRUE) + + ct_mappings +} + +#' Recode according to controlled terminology +#' +#' [ct_map()] recodes a vector following a controlled terminology. +#' +#' @param x A character vector of terms to be recoded following a controlled +#' terminology. +#' @param ct_spec A [tibble][tibble::tibble-package] providing a controlled +#' terminology specification. +#' @param ct_clst A character vector indicating a set of possible controlled +#' terminology codelists codes to be used for recoding. By default (`NULL`) +#' all codelists available in `ct_spec` are used. +#' @param from A character vector of column names indicating the variables +#' containing values to be matched against for terminology recoding. +#' @param to A single string indicating the column whose values are to be +#' recoded into. +#' +#' @returns A character vector of terminology recoded values from `x`. If no +#' match is found in the controlled terminology spec provided in `ct_spec`, then +#' `x` values are returned in uppercase. If `ct_spec` is not provided `x` is +#' returned unchanged. +#' +#' @examples +#' # A few example terms. +#' terms <- +#' c( +#' "/day", +#' "Yes", +#' "Unknown", +#' "Prior", +#' "Every 2 hours", +#' "Percentage", +#' "International Unit" +#' ) +#' +#' # Load a controlled terminology example +#' (ct_spec <- read_ct_spec_example("ct-01-cm")) +#' +#' # Use all possible matching terms in the controlled terminology. +#' ct_map(x = terms, ct_spec = ct_spec) +#' +#' # Note that if the controlled terminology mapping is restricted to a codelist +#' # code, e.g. C71113, then only `"/day"` and `"Every 2 hours"` get mapped to +#' # `"QD"` and `"Q2H"`, respectively; remaining terms won't match given the +#' # codelist code # restriction, and will be mapped to an uppercase version of +#' # the original terms. +#' ct_map(x = terms, ct_spec = ct_spec, ct_clst = "C71113") +#' +#' @importFrom rlang %||% .data +#' @export +ct_map <- + function(x, + ct_spec = NULL, + ct_clst = NULL, + from = ct_spec_vars("from"), + to = ct_spec_vars("to")) { + ct_spec %||% return(x) + assert_ct_spec(ct_spec) + + ct_clst <- ct_clst %||% unique(ct_spec[[ct_spec_vars("ct_clst")]]) + ct_spec <- dplyr::filter(ct_spec, .data[[ct_spec_vars("ct_clst")]] %in% ct_clst) + + mappings <- ct_mappings(ct_spec, from = from, to = to) + recode( + x, + from = mappings$from, + to = mappings$to, + .no_match = toupper(x) + ) + } + +#' Read in a controlled terminology +#' +#' [read_ct_spec()] imports a controlled terminology specification data set as a +#' [tibble][tibble::tibble-package]. +#' +#' @param file A path to a file containing a controlled terminology +#' specification data set. The following are expected of this file: +#' +#' - The file is expected to be a CSV file; +#' - The file is expected to contain a first row of column names; +#' - This minimal set of variables is expected: `r knitr::combine_words(ct_spec_vars())`. +#' +#' @returns A [tibble][tibble::tibble-package] with a controlled terminology +#' specification. +#' +#' @examples +#' # Get the local path to one of the controlled terminology example files. +#' path <- ct_spec_example("ct-01-cm") +#' +#' # Import it to R. +#' read_ct_spec(file = path) +#' +#' @export +read_ct_spec <- function(file = stop("`file` must be specified")) { + ct_spec <- readr::read_csv(file = file, col_types = "c") + assert_ct_spec(ct_spec) + + ct_spec +} + +#' Find the path to an example controlled terminology file +#' +#' [ct_spec_example()] resolves the local path to an example controlled +#' terminology file. +#' +#' @param example A string with either the basename, file name, or relative path +#' to a controlled terminology file bundled with `{stdm.oak}`, see examples. +#' +#' @returns The local path to an example file if `example` is supplied, or a +#' character vector of example file names. +#' +#' @examples +#' # Get the local path to controlled terminology example file 01 +#' # Using the basename only: +#' ct_spec_example("ct-01-cm") +#' +#' # Using the file name: +#' ct_spec_example("ct-01-cm.csv") +#' +#' # Using the relative path: +#' ct_spec_example("ct/ct-01-cm.csv") +#' +#' # If no example is provided it returns a vector of possible choices. +#' ct_spec_example() +#' +#' @export +ct_spec_example <- function(example) { + # If no example is requested, then return all available files. + if (missing(example)) { + ct_spec_path <- system.file("ct", package = "sdtm.oak", mustWork = TRUE) + ct_spec_files <- list.files(ct_spec_path, pattern = "*.csv") + return(ct_spec_files) + } + + # Otherwise, resolve the local path to the example requested. + admiraldev::assert_character_scalar(example, optional = TRUE) + base_name <- tools::file_path_sans_ext(basename(example)) + path <- file.path("ct", paste0(base_name, ".csv")) + local_path <- system.file(path, package = "sdtm.oak") + + if (identical(local_path, "")) { + stop( + glue::glue( + "'{example}' does not match any ct spec files. Run `ct_spec_example()` for options." + ), + call. = FALSE + ) + } else { + local_path <- + system.file(path, package = "sdtm.oak", mustWork = TRUE) + return(local_path) + } +} + +#' Read an example controlled terminology specification +#' +#' [read_ct_spec_example()] imports one of the bundled controlled terminology +#' specification data sets as a [tibble][tibble::tibble-package] into R. +#' +#' @param example The file name of a controlled terminology data set bundled +#' with `{stdm.oak}`, run `read_ct_spec_example()` for available example files. +#' +#' @returns A [tibble][tibble::tibble-package] with a controlled terminology +#' specification data set, or a character vector of example file names. +#' +#' @examples +#' # Leave the `example` parameter as missing for available example files. +#' read_ct_spec_example() +#' +#' # Read an example controlled terminology spec file. +#' read_ct_spec_example("ct-01-cm.csv") +#' +#' # You may omit the file extension. +#' read_ct_spec_example("ct-01-cm") +#' +#' @export +read_ct_spec_example <- function(example) { + if (missing(example)) { + return(ct_spec_example()) + } else { + admiraldev::assert_character_scalar(example) + } + + path <- ct_spec_example(example) + read_ct_spec(file = path) +} diff --git a/R/dtc_problems.R b/R/dtc_problems.R index c6ab3494..9c23544e 100644 --- a/R/dtc_problems.R +++ b/R/dtc_problems.R @@ -133,7 +133,7 @@ any_problems <- function(cap_matrices, .cutoff_2000 = 68L) { #' provides easy access to these parsing problems. #' #' @param x An object of class iso8601, as typically obtained from a call to -#' [create_iso8601()]. The argument can also be left empty, in that case it +#' [create_iso8601()]. The argument can also be left empty, in that case #' `problems()` will use the last returned value, making it convenient to use #' immediately after [create_iso8601()]. #' @@ -159,7 +159,7 @@ any_problems <- function(cap_matrices, .cutoff_2000 = 68L) { #' "20231225" #' ) #' -#' #' # By inspecting the problematic dates it can be understood that +#' # By inspecting the problematic dates it can be understood that #' # the `.format` parameter needs to updated to include other variations. #' iso8601_dttm <- create_iso8601(dates, .format = "y-m-d") #' problems(iso8601_dttm) diff --git a/R/hardcode.R b/R/hardcode.R new file mode 100644 index 00000000..31938689 --- /dev/null +++ b/R/hardcode.R @@ -0,0 +1,251 @@ +#' Derive an SDTM variable with a hardcoded value +#' +#' @description +#' [sdtm_hardcode()] is an internal function packing the same functionality as +#' [hardcode_no_ct()] and [hardcode_ct()] together but aimed at developers only. +#' As a user please use either [hardcode_no_ct()] or [hardcode_ct()]. +#' +#' @param raw_dat The raw dataset (dataframe); must include the +#' variables passed in `id_vars` and `raw_var`. +#' @param raw_var The raw variable: a single string indicating the name of the +#' raw variable in `raw_dat`. +#' @param tgt_var The target SDTM variable: a single string indicating the name +#' of variable to be derived. +#' @param tgt_val The target SDTM value to be hardcoded into the variable +#' indicated in `tgt_var`. +#' @param ct_spec Study controlled terminology specification: a dataframe with a +#' minimal set of columns, see [ct_spec_vars()] for details. This parameter is +#' optional, if left as `NULL` no controlled terminology recoding is applied. +#' @param ct_clst A codelist code indicating which subset of the controlled +#' terminology to apply in the derivation. This parameter is optional, if left +#' as `NULL`, all possible recodings in `ct_spec` are attempted. +#' @param tgt_dat Target dataset: a data frame to be merged against `raw_dat` by +#' the variables indicated in `id_vars`. This parameter is optional, see +#' section Value for how the output changes depending on this argument value. +#' @param id_vars Key variables to be used in the join between the raw dataset +#' (`raw_dat`) and the target data set (`raw_dat`). +#' +#' @returns The returned data set depends on the value of `tgt_dat`: +#' - If no target dataset is supplied, meaning that `tgt_dat` defaults to +#' `NULL`, then the returned data set is `raw_dat`, selected for the variables +#' indicated in `id_vars`, and a new extra column: the derived variable, as +#' indicated in `tgt_var`. +#' - If the target dataset is provided, then it is merged with the raw data set +#' `raw_dat` by the variables indicated in `id_vars`, with a new column: the +#' derived variable, as indicated in `tgt_var`. +#' +#' @importFrom rlang := +#' @keywords internal +sdtm_hardcode <- function(raw_dat, + raw_var, + tgt_var, + tgt_val, + ct_spec = NULL, + ct_clst = NULL, + tgt_dat = NULL, + id_vars = oak_id_vars()) { + admiraldev::assert_character_scalar(raw_var) + admiraldev::assert_character_scalar(tgt_var) + admiraldev::assert_character_scalar(tgt_val) + admiraldev::assert_character_vector(id_vars) + assertthat::assert_that(contains_oak_id_vars(id_vars), + msg = "`id_vars` must include the oak id vars." + ) + admiraldev::assert_data_frame(raw_dat, required_vars = rlang::syms(c(id_vars, raw_var))) + admiraldev::assert_data_frame(tgt_dat, required_vars = rlang::syms(id_vars), optional = TRUE) + assert_ct_spec(ct_spec, optional = TRUE) + assert_ct_clst(ct_spec = ct_spec, ct_clst = ct_clst, optional = TRUE) + + # Recode the hardcoded value following terminology. + tgt_val <- ct_map(tgt_val, ct_spec = ct_spec, ct_clst = ct_clst) + + # Apply derivation of the hardcoded value. + # `der_dat`: derived dataset. + der_dat <- + raw_dat |> + dplyr::select(c(id_vars, raw_var)) |> + dplyr::mutate("{tgt_var}" := recode(x = !!rlang::sym(raw_var), to = tgt_val)) |> # nolint object_name_linter() + dplyr::select(-rlang::sym(raw_var)) + + # If a target dataset is supplied, then join the so far derived dataset with + # the target dataset (`tgt_dat`), otherwise leave it be. + der_dat <- + if (!is.null(tgt_dat)) { + der_dat |> + dplyr::right_join(y = tgt_dat, by = id_vars) |> + dplyr::relocate(tgt_var, .after = dplyr::last_col()) + } else { + der_dat + } + + der_dat +} + +#' Derive an SDTM variable with a hardcoded value +#' +#' +#' @description +#' - [hardcode_no_ct()] maps a hardcoded value to a target SDTM variable that has +#' no terminology restrictions. +#' +#' - [hardcode_ct()] maps a hardcoded value to a target SDTM variable with +#' controlled terminology recoding. +#' +#' @param raw_dat The raw dataset (dataframe); must include the +#' variables passed in `id_vars` and `raw_var`. +#' @param raw_var The raw variable: a single string indicating the name of the +#' raw variable in `raw_dat`. +#' @param tgt_var The target SDTM variable: a single string indicating the name +#' of variable to be derived. +#' @param tgt_val The target SDTM value to be hardcoded into the variable +#' indicated in `tgt_var`. +#' @param ct_spec Study controlled terminology specification: a dataframe with a +#' minimal set of columns, see [ct_spec_vars()] for details. This parameter is +#' optional, if left as `NULL` no controlled terminology recoding is applied. +#' @param ct_clst A codelist code indicating which subset of the controlled +#' terminology to apply in the derivation. This parameter is optional, if left +#' as `NULL`, all possible recodings in `ct_spec` are attempted. +#' @param tgt_dat Target dataset: a data frame to be merged against `raw_dat` by +#' the variables indicated in `id_vars`. This parameter is optional, see +#' section Value for how the output changes depending on this argument value. +#' @param id_vars Key variables to be used in the join between the raw dataset +#' (`raw_dat`) and the target data set (`raw_dat`). +#' +#' @returns The returned data set depends on the value of `tgt_dat`: +#' - If no target dataset is supplied, meaning that `tgt_dat` defaults to +#' `NULL`, then the returned data set is `raw_dat`, selected for the variables +#' indicated in `id_vars`, and a new extra column: the derived variable, as +#' indicated in `tgt_var`. +#' - If the target dataset is provided, then it is merged with the raw data set +#' `raw_dat` by the variables indicated in `id_vars`, with a new column: the +#' derived variable, as indicated in `tgt_var`. +#' +#' @examples +#' md1 <- +#' tibble::tribble( +#' ~oak_id, ~raw_source, ~patient_number, ~MDRAW, +#' 1L, "MD1", 101L, "BABY ASPIRIN", +#' 2L, "MD1", 102L, "CORTISPORIN", +#' 3L, "MD1", 103L, NA_character_, +#' 4L, "MD1", 104L, "DIPHENHYDRAMINE HCL" +#' ) +#' +#' # Derive a new variable `CMCAT` by overwriting `MDRAW` with the +#' # hardcoded value "GENERAL CONCOMITANT MEDICATIONS". +#' hardcode_no_ct( +#' raw_dat = md1, +#' raw_var = "MDRAW", +#' tgt_var = "CMCAT", +#' tgt_val = "GENERAL CONCOMITANT MEDICATIONS" +#' ) +#' +#' cm_inter <- +#' tibble::tribble( +#' ~oak_id, ~raw_source, ~patient_number, ~CMTRT, ~CMINDC, +#' 1L, "MD1", 101L, "BABY ASPIRIN", NA, +#' 2L, "MD1", 102L, "CORTISPORIN", "NAUSEA", +#' 3L, "MD1", 103L, "ASPIRIN", "ANEMIA", +#' 4L, "MD1", 104L, "DIPHENHYDRAMINE HCL", "NAUSEA", +#' 5L, "MD1", 105L, "PARACETAMOL", "PYREXIA" +#' ) +#' +#' # Derive a new variable `CMCAT` by overwriting `MDRAW` with the +#' # hardcoded value "GENERAL CONCOMITANT MEDICATIONS" with a prior join to +#' # `target_dataset`. +#' hardcode_no_ct( +#' raw_dat = md1, +#' raw_var = "MDRAW", +#' tgt_var = "CMCAT", +#' tgt_val = "GENERAL CONCOMITANT MEDICATIONS", +#' tgt_dat = cm_inter +#' ) +#' +#' # Controlled terminology specification +#' (ct_spec <- read_ct_spec_example("ct-01-cm")) +#' +#' # Hardcoding of `CMCAT` with the value `"GENERAL CONCOMITANT MEDICATIONS"` +#' # involving terminology recoding. `NA` values in `MDRAW` are preserved in +#' # `CMCAT`. +#' hardcode_ct( +#' raw_dat = md1, +#' raw_var = "MDRAW", +#' tgt_var = "CMCAT", +#' tgt_val = "GENERAL CONCOMITANT MEDICATIONS", +#' ct_spec = ct_spec, +#' ct_clst = "C66729", +#' tgt_dat = cm_inter +#' ) +#' +#' @name harcode +NULL + +#' @export +#' @rdname harcode +hardcode_no_ct <- function(raw_dat, + raw_var, + tgt_var, + tgt_val, + tgt_dat = NULL, + id_vars = oak_id_vars()) { + admiraldev::assert_character_scalar(raw_var) + admiraldev::assert_character_scalar(tgt_var) + assertthat::assert_that(assertthat::is.scalar(tgt_val), + msg = "`tgt_val` must be a scalar value." + ) + admiraldev::assert_character_vector(id_vars) + assertthat::assert_that(contains_oak_id_vars(id_vars), + msg = "`id_vars` must include the oak id vars." + ) + admiraldev::assert_data_frame(raw_dat, required_vars = rlang::syms(c(id_vars, raw_var))) + admiraldev::assert_data_frame(tgt_dat, required_vars = rlang::syms(id_vars), optional = TRUE) + + sdtm_hardcode( + raw_dat = raw_dat, + raw_var = raw_var, + tgt_var = tgt_var, + tgt_val = tgt_val, + tgt_dat = tgt_dat, + id_vars = id_vars + ) +} + +#' @export +#' @rdname harcode +hardcode_ct <- + function(raw_dat, + raw_var, + tgt_var, + tgt_val, + ct_spec, + ct_clst, + tgt_dat = NULL, + id_vars = oak_id_vars()) { + admiraldev::assert_character_scalar(raw_var) + admiraldev::assert_character_scalar(tgt_var) + assertthat::assert_that(assertthat::is.scalar(tgt_val), + msg = "`tgt_val` must be a scalar value." + ) + admiraldev::assert_character_vector(id_vars) + assertthat::assert_that(contains_oak_id_vars(id_vars), + msg = "`id_vars` must include the oak id vars." + ) + admiraldev::assert_data_frame(raw_dat, required_vars = rlang::syms(c(id_vars, raw_var))) + admiraldev::assert_data_frame(tgt_dat, + required_vars = rlang::syms(id_vars), + optional = TRUE + ) + + assert_ct_spec(ct_spec, optional = FALSE) + assert_ct_clst(ct_spec = ct_spec, ct_clst = ct_clst, optional = FALSE) + + sdtm_hardcode( + raw_dat = raw_dat, + raw_var = raw_var, + tgt_var = tgt_var, + tgt_val = tgt_val, + ct_spec = ct_spec, + ct_clst = ct_clst, + tgt_dat = tgt_dat, + id_vars = id_vars + ) + } diff --git a/R/oak_id_vars.R b/R/oak_id_vars.R new file mode 100644 index 00000000..718d11da --- /dev/null +++ b/R/oak_id_vars.R @@ -0,0 +1,53 @@ +#' Raw dataset keys +#' +#' [oak_id_vars()] is a helper function providing the variable (column) names to +#' be regarded as keys in [tibbles][tibble::tibble-package] representing raw +#' datasets. By default, the set of names is +#' `r knitr::combine_words(oak_id_vars())`. Extra variable names may be +#' indicated and passed in `extra_vars` which are appended to the default names. +#' +#' @param extra_vars A character vector of extra column names to be appended to +#' the default names: `r knitr::combine_words(oak_id_vars())`. +#' +#' @returns A character vector of column names to be regarded +#' as keys in raw datasets. +#' +#' @examples +#' sdtm.oak:::oak_id_vars() +#' +#' sdtm.oak:::oak_id_vars(extra_vars = "sample_id") +#' +#' @keywords internal +oak_id_vars <- function(extra_vars = NULL) { + admiraldev::assert_character_vector(extra_vars, optional = TRUE) + unique(c("oak_id", "raw_source", "patient_number", extra_vars)) +} + +#' Does a vector contain the raw dataset key variables? +#' +#' [contains_oak_id_vars()] evaluates whether a character vector `x` contains +#' the raw dataset key variable names, i.e. the so called Oak identifier +#' variables --- these are defined by the return value of [oak_id_vars()]. +#' +#' @param x A character vector. +#' +#' @returns A logical scalar value. +#' +#' @examples +#' # `oak_id_vars()` is the function that defines what are the minimal set of +#' # oak keys. Hence, by definition, the following code should always return +#' # `TRUE`. +#' sdtm.oak:::contains_oak_id_vars(sdtm.oak:::oak_id_vars()) +#' +#' # Returns `FALSE`. +#' sdtm.oak:::contains_oak_id_vars(character()) +#' +#' # Another example that returns `FALSE` because it is missing +#' # `"patient_number"`. +#' sdtm.oak:::contains_oak_id_vars(c("oak_id", "raw_source")) +#' +#' @keywords internal +contains_oak_id_vars <- function(x) { + admiraldev::assert_character_vector(x) + all(oak_id_vars() %in% x) +} diff --git a/R/recode.R b/R/recode.R new file mode 100644 index 00000000..424096b2 --- /dev/null +++ b/R/recode.R @@ -0,0 +1,68 @@ +#' Determine Indices for Recoding +#' +#' [index_for_recode()] identifies the positions of elements in `x` that match +#' any of the values specified in the `from` vector. This function is primarily +#' used to facilitate the recoding of values by pinpointing which elements in +#' `x` correspond to the `from` values and thus need to be replaced or updated. +#' +#' @param x A vector of values in which to search for matches. +#' @param from A vector of values to match against the elements in `x`. +#' @return An integer vector of the same length as `x`, containing the indices +#' of the matched values from the `from` vector. If an element in `x` does not +#' match any value in `from`, the corresponding position in the output will be +#' `NA`. This index information is critical for subsequent recoding operations. +#' @examples +#' sdtm.oak:::index_for_recode(x = 1:5, from = c(2, 4)) +#' +#' @keywords internal +index_for_recode <- function(x, from) { + match(x, from) +} + +#' Recode values +#' +#' [recode()] recodes values in `x` by matching elements in `from` onto values +#' in `to`. +#' +#' @param x An atomic vector of values are to be recoded. +#' @param from A vector of values to be matched in `x` for recoding. +#' @param to A vector of values to be used as replacement for values in `from`. +#' @param .no_match Value to be used as replacement when cases in `from` are not +#' matched. +#' @param .na Value to be used to recode missing values. +#' +#' @returns A vector of recoded values. +#' +#' @examples +#' x <- c("male", "female", "x", NA) +#' sdtm.oak:::recode(x, +#' from = c("male", "female"), +#' to = c("M", "F") +#' ) +#' sdtm.oak:::recode( +#' x, +#' from = c("male", "female"), +#' to = c("M", "F"), +#' .no_match = "?" +#' ) +#' sdtm.oak:::recode( +#' x, +#' from = c("male", "female"), +#' to = c("M", "F"), +#' .na = "missing" +#' ) +#' +#' @keywords internal +recode <- function( + x, + from = unique(na.omit(x)), + to = from, + .no_match = x, + .na = NA) { + to <- vctrs::vec_recycle(to, length(from)) + index <- index_for_recode(x, from) + y <- ifelse(!is.na(index), to[index], .no_match) + y[is.na(x)] <- .na + + y +} diff --git a/R/sdtm.oak-package.R b/R/sdtm.oak-package.R index b1a48e6f..0ba23dc1 100644 --- a/R/sdtm.oak-package.R +++ b/R/sdtm.oak-package.R @@ -4,5 +4,6 @@ ## usethis namespace: start #' @importFrom tibble tibble #' @importFrom rlang .data +#' @importFrom stats na.omit ## usethis namespace: end NULL diff --git a/R/str_split.R b/R/str_split.R new file mode 100644 index 00000000..7b5b1125 --- /dev/null +++ b/R/str_split.R @@ -0,0 +1,13 @@ +str_split_ <- function(x, split = ";", quote = '"') { + scan( + text = x, + what = "character", + sep = split, + quote = quote, + quiet = TRUE + ) +} + +str_split <- function(x, split = ";", quote = '"') { + lapply(x, str_split_, split = split, quote = quote) +} diff --git a/R/zzz.R b/R/zzz.R new file mode 100644 index 00000000..e05837ee --- /dev/null +++ b/R/zzz.R @@ -0,0 +1,3 @@ +.onLoad <- function(libname, pkgname) { + ct_mappings <<- memoise::memoise(ct_mappings) +} diff --git a/_pkgdown.yml b/_pkgdown.yml index d0716d79..2f306131 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -6,3 +6,29 @@ template: navbar: title: "sdtm.oak" + +reference: +- title: SDTM derivation + desc: Set of functions to perform SDTM derivations + contents: + - assign + - harcode + - derive_study_day + +- title: Controlled terminology + contents: + - read_ct_spec + - read_ct_spec_example + - ct_spec_example + - ct_map + +- title: Date and time conversion + contents: + - create_iso8601 + - fmt_cmp + - dtc_formats + - problems + +- title: Package global state + contents: + - clear_cache diff --git a/inst/WORDLIST b/inst/WORDLIST index 50b2eb7e..538f7a2e 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -10,6 +10,17 @@ ORCID PMDA CDISC iso +hardcoded +CDISC +PMDA +recode +recodes +recoded +recoding +tibble +codelist +Memoised +memoised AE AESTDY CMSTDY diff --git a/inst/ct/README.md b/inst/ct/README.md new file mode 100644 index 00000000..58433341 --- /dev/null +++ b/inst/ct/README.md @@ -0,0 +1,22 @@ +# Controlled terminology examples + +## Introduction + +The folder inst/ct is meant to hold examples of controlled terminology +specifications. These example files should be plain CSV files. + +If you're adding new example files please follow the naming convention: +`ct-
-.csv`, where `
` is a simple numerical identifier +for the example, i.e. 01, 02, etc.. The `` is a short keyword +providing a simple contextual annotation for the controlled terminology. + +For example, `ct-01-cm.csv` is an example of a controlled terminology +specification data set: + +- `01`: indicates that it is the first example +- `cm`: suggests that this data set is related to the Concomitant Medication +domain. + +## Examples + +- `ct-01-cm.csv`: TBD. diff --git a/inst/ct/ct-01-cm.csv b/inst/ct/ct-01-cm.csv new file mode 100644 index 00000000..ffbe0903 --- /dev/null +++ b/inst/ct/ct-01-cm.csv @@ -0,0 +1,34 @@ +codelist_code,term_code,CodedData,term_value,collected_value,term_preferred_term,term_synonyms,raw_codelist +C71113,C25473,QD,QD,QD (Every Day),Daily,/day; Daily; Per Day,FREQ_CV1 +C71113,C64496,BID,BID,BID (Twice a Day),Twice Daily,BD; Twice per day,FREQ_CV1 +C71113,C64499,PRN,PRN,PRN (As Needed),As Needed,As needed,FREQ_CV1 +C71113,C64516,Q2H,Q2H,Q2H (Every 2 Hours),Every Two Hours,Every 2 hours,FREQ_CV1 +C71113,C64530,QID,QID,QID (4 Times a Day),Four Times Daily,4 times per day,FREQ_CV1 +C66726,C25158,CAPSULE,CAPSULE,Capsule,Capsule Dosage Form,cap,FRM_CV1 +C66726,C25394,PILL,PILL,Pill,Pill Dosage Form,NA,FRM_CV1 +C66726,C29167,LOTION,LOTION,Lotion,Lotion Dosage Form,NA,FRM_CV1 +C66726,C42887,AEROSOL,AEROSOL,Aerosol,Aerosol Dosage Form,aer,FRM_CV1 +C66726,C42944,INHALANT,INHALANT,Inhalant,Inhalant Dosage Form,NA,FRM_CV1 +C66726,C42946,INJECTION,INJECTION,Injection,Injectable Dosage Form,NA,FRM_CV1 +C66726,C42953,LIQUID,LIQUID,Liquid,Liquid Dosage Form,NA,FRM_CV1 +C66726,C42998,TABLET,TABLET,Tablet,Tablet Dosage Form,tab,FRM_CV1 +C66742,C49488,Y,Y,Yes,Yes,Yes,NY_CV1 +C66729,C28161,INTRAMUSCULAR,INTRAMUSCULAR,IM (Intramuscular),Intramuscular Route of Administration,NA,ROUTE_CV1 +C66729,C38210,EPIDURAL,EPIDURAL,EP (Epidural),Epidural Route of Administration,NA,ROUTE_CV1 +C66729,C38222,INTRA-ARTERIAL,INTRA-ARTERIAL,IA (Intra-arterial),Intraarterial Route of Administration,NA,ROUTE_CV1 +C66729,C38223,INTRA-ARTICULAR,INTRA-ARTICULAR,IJ (Intra-articular),Intraarticular Route of Administration,NA,ROUTE_CV1 +C66729,C38287,OPHTHALMIC,OPHTHALMIC,OP (Ophthalmic),Ophthalmic Route of Administration,NA,ROUTE_CV1 +C66729,C38288,ORAL,ORAL,PO (Oral),Oral Route of Administration,Intraoral Route of Administration; PO,ROUTE_CV1 +C66729,C38305,TRANSDERMAL,TRANSDERMAL,DE (Transdermal),Transdermal Route of Administration,NA,ROUTE_CV1 +C66729,C38311,UNKNOWN,UNKNOWN,Unknown,Unknown Route of Administration,NA,ROUTE_CV1 +C71620,C25613,%,%,%,Percentage,Percentage,UNIT_CV1 +C71620,C28253,MG,mg,mg,Milligram,Milligram,UNIT_CV1 +C71620,C28254,ML,mL,mL,Milliliter,cm3; Milliliter,UNIT_CV1 +C71620,C48155,G,g,g,Gram,Gram,UNIT_CV1 +C71620,C48480,CAPSULE,CAPSULE,Capsule,Capsule Dosing Unit,cap; Capsule Dosing Unit,UNIT_CV1 +C71620,C48542,TABLET,TABLET,Tablet,Tablet Dosing Unit,tab; Tablet Dosing Unit,UNIT_CV1 +C71620,C48579,IU,IU,IU,International Unit,IE; International Unit,UNIT_CV1 +C71620,C28254,ML,mL,mL,Milliliter,cm3; Milliliter,UNIT_CV5 +C66728,C25629,BEFORE,BEFORE,Prior,Prior,,NA +C66728,C53279,ONGOING,ONGOING,Continue,Continue,Continuous,NA +C66734,C49568,CM,CM,Concomitant Medication Domain,Concomitant Medication Domain,Concomitant/Prior Medications,NA diff --git a/man/assert_ct_clst.Rd b/man/assert_ct_clst.Rd new file mode 100644 index 00000000..c0239c97 --- /dev/null +++ b/man/assert_ct_clst.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ct.R +\name{assert_ct_clst} +\alias{assert_ct_clst} +\title{Assert a codelist code} +\usage{ +assert_ct_clst(ct_spec, ct_clst, optional = FALSE) +} +\arguments{ +\item{ct_spec}{Either a data frame encoding a controlled terminology data set, or +\code{NULL}.} + +\item{ct_clst}{A string with a to-be asserted codelist code, or \code{NULL}.} + +\item{optional}{A scalar logical, indicating whether \code{ct_clst} can be \code{NULL} or +not.} +} +\value{ +The function throws an error if \code{ct_clst} is not a valid codelist code +given the controlled terminology data set; otherwise, \code{ct_clst} is returned +invisibly. +} +\description{ +\code{\link[=assert_ct_clst]{assert_ct_clst()}} asserts the validity of a codelist code in the context of +a controlled terminology specification. +} +\examples{ +# Load a controlled terminology example. +(ct_spec <- read_ct_spec_example("ct-01-cm")) + +# Should work fine. +sdtm.oak:::assert_ct_clst(ct_spec = ct_spec, ct_clst = "C71113") + +# In certain cases, you might allow `ct_clst` to be `NULL` as to indicate absence, +# in that case, set `optional` to `TRUE` to make `assert_ct_clst()` more +# forgiving. +sdtm.oak:::assert_ct_clst(ct_spec = ct_spec, ct_clst = NULL, optional = TRUE) + +# Otherwise it would err. +try(sdtm.oak:::assert_ct_clst(ct_spec = ct_spec, ct_clst = NULL, optional = FALSE)) + +} +\keyword{internal} diff --git a/man/assert_ct_spec.Rd b/man/assert_ct_spec.Rd new file mode 100644 index 00000000..5ce72ce4 --- /dev/null +++ b/man/assert_ct_spec.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ct.R +\name{assert_ct_spec} +\alias{assert_ct_spec} +\title{Assert a controlled terminology specification} +\usage{ +assert_ct_spec(ct_spec, optional = FALSE) +} +\arguments{ +\item{ct_spec}{A data frame to be asserted as a valid controlled terminology data +set.} +} +\value{ +The function throws an error if \code{ct_spec} is not a valid controlled +terminology data set; otherwise, \code{ct_spec} is returned invisibly. +} +\description{ +\code{\link[=assert_ct_spec]{assert_ct_spec()}} will check whether \code{ct_spec} is a data frame and if it contains the +variables: codelist_code, collected_value, term_synonyms, and term_value. + +In addition, it will also check if the data frame is not empty (no rows), and +whether the columns \code{codelist_code} and \code{term_value} do +not contain any \code{NA} values. +} +\examples{ +# If `ct_spec` is a valid controlled terminology then it is returned invisibly. +ct_spec_01 <- read_ct_spec_example("ct-01-cm") +all.equal(ct_spec_01, sdtm.oak:::assert_ct_spec(ct_spec_01)) + +# A minimal set of variables needs to be present in `ct_spec` for it to pass the +# assertion; `sdtm.oak:::ct_spec_vars()` defines their names. +(req_vars <- sdtm.oak:::ct_spec_vars()) + +# Other (facultative) variables also present in the controlled terminology +# example. +(opt_vars <- setdiff(colnames(ct_spec_01), req_vars)) + +# With only the mandatory variables, the assertion still passes. +sdtm.oak:::assert_ct_spec(ct_spec_01[req_vars]) + +# Not having the required variables results in an error. +try(sdtm.oak:::assert_ct_spec(ct_spec_01[opt_vars])) + +} +\keyword{internal} diff --git a/man/assign.Rd b/man/assign.Rd new file mode 100644 index 00000000..ff7df056 --- /dev/null +++ b/man/assign.Rd @@ -0,0 +1,141 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/assign.R +\name{assign_no_ct} +\alias{assign_no_ct} +\alias{assign_ct} +\alias{assign} +\title{Derive an SDTM variable} +\usage{ +assign_no_ct( + raw_dat, + raw_var, + tgt_var, + tgt_dat = NULL, + id_vars = oak_id_vars() +) + +assign_ct( + raw_dat, + raw_var, + tgt_var, + ct_spec, + ct_clst, + tgt_dat = NULL, + id_vars = oak_id_vars() +) +} +\arguments{ +\item{raw_dat}{The raw dataset (dataframe); must include the +variables passed in \code{id_vars} and \code{raw_var}.} + +\item{raw_var}{The raw variable: a single string indicating the name of the +raw variable in \code{raw_dat}.} + +\item{tgt_var}{The target SDTM variable: a single string indicating the name +of variable to be derived.} + +\item{tgt_dat}{Target dataset: a data frame to be merged against \code{raw_dat} by +the variables indicated in \code{id_vars}. This parameter is optional, see +section Value for how the output changes depending on this argument value.} + +\item{id_vars}{Key variables to be used in the join between the raw dataset +(\code{raw_dat}) and the target data set (\code{raw_dat}).} + +\item{ct_spec}{Study controlled terminology specification: a dataframe with a +minimal set of columns, see \code{\link[=ct_spec_vars]{ct_spec_vars()}} for details.} + +\item{ct_clst}{A codelist code indicating which subset of the controlled +terminology to apply in the derivation.} +} +\value{ +The returned data set depends on the value of \code{tgt_dat}: +\itemize{ +\item If no target dataset is supplied, meaning that \code{tgt_dat} defaults to +\code{NULL}, then the returned data set is \code{raw_dat}, selected for the variables +indicated in \code{id_vars}, and a new extra column: the derived variable, as +indicated in \code{tgt_var}. +\item If the target dataset is provided, then it is merged with the raw data set +\code{raw_dat} by the variables indicated in \code{id_vars}, with a new column: the +derived variable, as indicated in \code{tgt_var}. +} +} +\description{ +\itemize{ +\item \code{\link[=assign_no_ct]{assign_no_ct()}} maps a variable in a raw dataset to a target SDTM +variable that has no terminology restrictions. +\item \code{\link[=assign_ct]{assign_ct()}} maps a variable in a raw dataset to a target SDTM variable +following controlled terminology recoding. +} +} +\examples{ + +md1 <- + tibble::tibble( + oak_id = 1:14, + raw_source = "MD1", + patient_number = 101:114, + MDIND = c( + "NAUSEA", "NAUSEA", "ANEMIA", "NAUSEA", "PYREXIA", + "VOMITINGS", "DIARHHEA", "COLD", + "FEVER", "LEG PAIN", "FEVER", "COLD", "COLD", "PAIN" + ) + ) + +assign_no_ct( + raw_dat = md1, + raw_var = "MDIND", + tgt_var = "CMINDC", +) + +cm_inter <- + tibble::tibble( + oak_id = 1:14, + raw_source = "MD1", + patient_number = 101:114, + CMTRT = c( + "BABY ASPIRIN", + "CORTISPORIN", + "ASPIRIN", + "DIPHENHYDRAMINE HCL", + "PARCETEMOL", + "VOMIKIND", + "ZENFLOX OZ", + "AMITRYPTYLINE", + "BENADRYL", + "DIPHENHYDRAMINE HYDROCHLORIDE", + "TETRACYCLINE", + "BENADRYL", + "SOMINEX", + "ZQUILL" + ), + CMROUTE = c( + "ORAL", + "ORAL", + NA, + "ORAL", + "ORAL", + "ORAL", + "INTRAMUSCULAR", + "INTRA-ARTERIAL", + NA, + "NON-STANDARD", + "RANDOM_VALUE", + "INTRA-ARTICULAR", + "TRANSDERMAL", + "OPHTHALMIC" + ) + ) + +# Controlled terminology specification +(ct_spec <- read_ct_spec_example("ct-01-cm")) + +assign_ct( + raw_dat = md1, + raw_var = "MDIND", + tgt_var = "CMINDC", + ct_spec = ct_spec, + ct_clst = "C66729", + tgt_dat = cm_inter +) + +} diff --git a/man/clear_cache.Rd b/man/clear_cache.Rd new file mode 100644 index 00000000..212f245e --- /dev/null +++ b/man/clear_cache.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clear_cache.R +\name{clear_cache} +\alias{clear_cache} +\title{Clear \code{{sdtm.oak}} cache of memoised functions} +\usage{ +clear_cache() +} +\value{ +Returns a logical value, indicating whether the resetting of the +cache was successful (\code{TRUE}) or not (\code{FALSE}). +} +\description{ +Some of \code{{sdtm.oak}} functions have their results cached for runtime +efficiency. Use this function to reset the cache. + +Memoised functions: +\itemize{ +\item \code{\link[=ct_mappings]{ct_mappings()}} +} +} +\examples{ +clear_cache() + +} diff --git a/man/contains_oak_id_vars.Rd b/man/contains_oak_id_vars.Rd new file mode 100644 index 00000000..c872bbbe --- /dev/null +++ b/man/contains_oak_id_vars.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/oak_id_vars.R +\name{contains_oak_id_vars} +\alias{contains_oak_id_vars} +\title{Does a vector contain the raw dataset key variables?} +\usage{ +contains_oak_id_vars(x) +} +\arguments{ +\item{x}{A character vector.} +} +\value{ +A logical scalar value. +} +\description{ +\code{\link[=contains_oak_id_vars]{contains_oak_id_vars()}} evaluates whether a character vector \code{x} contains +the raw dataset key variable names, i.e. the so called Oak identifier +variables --- these are defined by the return value of \code{\link[=oak_id_vars]{oak_id_vars()}}. +} +\examples{ +# `oak_id_vars()` is the function that defines what are the minimal set of +# oak keys. Hence, by definition, the following code should always return +# `TRUE`. +sdtm.oak:::contains_oak_id_vars(sdtm.oak:::oak_id_vars()) + +# Returns `FALSE`. +sdtm.oak:::contains_oak_id_vars(character()) + +# Another example that returns `FALSE` because it is missing +# `"patient_number"`. +sdtm.oak:::contains_oak_id_vars(c("oak_id", "raw_source")) + +} +\keyword{internal} diff --git a/man/ct_map.Rd b/man/ct_map.Rd new file mode 100644 index 00000000..0f00c10a --- /dev/null +++ b/man/ct_map.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ct.R +\name{ct_map} +\alias{ct_map} +\title{Recode according to controlled terminology} +\usage{ +ct_map( + x, + ct_spec = NULL, + ct_clst = NULL, + from = ct_spec_vars("from"), + to = ct_spec_vars("to") +) +} +\arguments{ +\item{x}{A character vector of terms to be recoded following a controlled +terminology.} + +\item{ct_spec}{A \link[tibble:tibble-package]{tibble} providing a controlled +terminology specification.} + +\item{ct_clst}{A character vector indicating a set of possible controlled +terminology codelists codes to be used for recoding. By default (\code{NULL}) +all codelists available in \code{ct_spec} are used.} + +\item{from}{A character vector of column names indicating the variables +containing values to be matched against for terminology recoding.} + +\item{to}{A single string indicating the column whose values are to be +recoded into.} +} +\value{ +A character vector of terminology recoded values from \code{x}. If no +match is found in the controlled terminology spec provided in \code{ct_spec}, then +\code{x} values are returned in uppercase. If \code{ct_spec} is not provided \code{x} is +returned unchanged. +} +\description{ +\code{\link[=ct_map]{ct_map()}} recodes a vector following a controlled terminology. +} +\examples{ +# A few example terms. +terms <- + c( + "/day", + "Yes", + "Unknown", + "Prior", + "Every 2 hours", + "Percentage", + "International Unit" + ) + +# Load a controlled terminology example +(ct_spec <- read_ct_spec_example("ct-01-cm")) + +# Use all possible matching terms in the controlled terminology. +ct_map(x = terms, ct_spec = ct_spec) + +# Note that if the controlled terminology mapping is restricted to a codelist +# code, e.g. C71113, then only `"/day"` and `"Every 2 hours"` get mapped to +# `"QD"` and `"Q2H"`, respectively; remaining terms won't match given the +# codelist code # restriction, and will be mapped to an uppercase version of +# the original terms. +ct_map(x = terms, ct_spec = ct_spec, ct_clst = "C71113") + +} diff --git a/man/ct_mappings.Rd b/man/ct_mappings.Rd new file mode 100644 index 00000000..83ca4898 --- /dev/null +++ b/man/ct_mappings.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ct.R +\name{ct_mappings} +\alias{ct_mappings} +\title{Controlled terminology mappings} +\usage{ +ct_mappings(ct_spec, from = ct_spec_vars("from"), to = ct_spec_vars("to")) +} +\arguments{ +\item{ct_spec}{Controlled terminology specification as a +\link[tibble:tibble-package]{tibble}. Each row is for a mapped controlled term. +Controlled terms are expected in the column indicated by \code{to_col}.} + +\item{from}{A character vector of column names indicating the variables +containing values to be recoded.} + +\item{to}{A single string indicating the column whose values are to be +recoded into.} +} +\value{ +A \link[tibble:tibble-package]{tibble} with two columns, \code{from} and +\code{to}, indicating the mapping of values, one per row. +} +\description{ +\code{\link[=ct_mappings]{ct_mappings()}} takes a controlled terminology specification and returns the +mappings in the form of a \link[tibble:tibble-package]{tibble} in long format, +i.e. the recoding of values in the \code{from} column to the \code{to} column values, +one mapping per row. + +The resulting mappings are unique, i.e. if \code{from} values are duplicated in +two \code{from} columns, the first column indicated in \code{from} takes precedence, +and only that mapping is retained in the controlled terminology map. +} +\examples{ +# Read in a bundled controlled terminology spec example (ex. 01). +(ct_spec_01 <- read_ct_spec_example("ct-01-cm")) + +# Generate mappings from the terminology specification. +sdtm.oak:::ct_mappings(ct_spec = ct_spec_01) + +# Take a glimpse at those mappings where an actual recoding happens. +sdtm.oak:::ct_mappings(ct_spec = ct_spec_01) |> + dplyr::filter(from != to) |> + print(n = 20) + +} +\keyword{internal} diff --git a/man/ct_spec_example.Rd b/man/ct_spec_example.Rd new file mode 100644 index 00000000..2a7f2e3f --- /dev/null +++ b/man/ct_spec_example.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ct.R +\name{ct_spec_example} +\alias{ct_spec_example} +\title{Find the path to an example controlled terminology file} +\usage{ +ct_spec_example(example) +} +\arguments{ +\item{example}{A string with either the basename, file name, or relative path +to a controlled terminology file bundled with \code{{stdm.oak}}, see examples.} +} +\value{ +The local path to an example file if \code{example} is supplied, or a +character vector of example file names. +} +\description{ +\code{\link[=ct_spec_example]{ct_spec_example()}} resolves the local path to an example controlled +terminology file. +} +\examples{ +# Get the local path to controlled terminology example file 01 +# Using the basename only: +ct_spec_example("ct-01-cm") + +# Using the file name: +ct_spec_example("ct-01-cm.csv") + +# Using the relative path: +ct_spec_example("ct/ct-01-cm.csv") + +# If no example is provided it returns a vector of possible choices. +ct_spec_example() + +} diff --git a/man/ct_spec_vars.Rd b/man/ct_spec_vars.Rd new file mode 100644 index 00000000..fff7e108 --- /dev/null +++ b/man/ct_spec_vars.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ct.R +\name{ct_spec_vars} +\alias{ct_spec_vars} +\title{Controlled terminology variables} +\usage{ +ct_spec_vars(set = c("all", "ct_clst", "from", "to")) +} +\arguments{ +\item{set}{A scalar character (string), one of: \code{"all"} (default), \code{"ct_clst"}, +\code{"from"} or \code{"to"}.} +} +\description{ +\code{\link[=ct_spec_vars]{ct_spec_vars()}} returns the mandatory variables to be present in a data set +representing a controlled terminology. By default, it returns all required +variables. + +If only the subset of variables used for matching terms are needed, then +request this subset of variables by passing the argument value \code{"from"}. If +only the mapping-to variable is to be requested, then simply pass \code{"to"}. If +only the codelist code variable name is needed then pass \code{"ct_clst"}. +} +\examples{ +# These two calls are equivalent and return all required variables in a +# controlled terminology data set. +sdtm.oak:::ct_spec_vars() +sdtm.oak:::ct_spec_vars("all") + +# "Codelist code" variable name. +sdtm.oak:::ct_spec_vars("ct_clst") + +# "From" variables +sdtm.oak:::ct_spec_vars("from") + +# The "to" variable. +sdtm.oak:::ct_spec_vars("to") + +} +\keyword{internal} diff --git a/man/harcode.Rd b/man/harcode.Rd new file mode 100644 index 00000000..e38424a5 --- /dev/null +++ b/man/harcode.Rd @@ -0,0 +1,133 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hardcode.R +\name{harcode} +\alias{harcode} +\alias{hardcode_no_ct} +\alias{hardcode_ct} +\title{Derive an SDTM variable with a hardcoded value} +\usage{ +hardcode_no_ct( + raw_dat, + raw_var, + tgt_var, + tgt_val, + tgt_dat = NULL, + id_vars = oak_id_vars() +) + +hardcode_ct( + raw_dat, + raw_var, + tgt_var, + tgt_val, + ct_spec, + ct_clst, + tgt_dat = NULL, + id_vars = oak_id_vars() +) +} +\arguments{ +\item{raw_dat}{The raw dataset (dataframe); must include the +variables passed in \code{id_vars} and \code{raw_var}.} + +\item{raw_var}{The raw variable: a single string indicating the name of the +raw variable in \code{raw_dat}.} + +\item{tgt_var}{The target SDTM variable: a single string indicating the name +of variable to be derived.} + +\item{tgt_val}{The target SDTM value to be hardcoded into the variable +indicated in \code{tgt_var}.} + +\item{tgt_dat}{Target dataset: a data frame to be merged against \code{raw_dat} by +the variables indicated in \code{id_vars}. This parameter is optional, see +section Value for how the output changes depending on this argument value.} + +\item{id_vars}{Key variables to be used in the join between the raw dataset +(\code{raw_dat}) and the target data set (\code{raw_dat}).} + +\item{ct_spec}{Study controlled terminology specification: a dataframe with a +minimal set of columns, see \code{\link[=ct_spec_vars]{ct_spec_vars()}} for details. This parameter is +optional, if left as \code{NULL} no controlled terminology recoding is applied.} + +\item{ct_clst}{A codelist code indicating which subset of the controlled +terminology to apply in the derivation. This parameter is optional, if left +as \code{NULL}, all possible recodings in \code{ct_spec} are attempted.} +} +\value{ +The returned data set depends on the value of \code{tgt_dat}: +\itemize{ +\item If no target dataset is supplied, meaning that \code{tgt_dat} defaults to +\code{NULL}, then the returned data set is \code{raw_dat}, selected for the variables +indicated in \code{id_vars}, and a new extra column: the derived variable, as +indicated in \code{tgt_var}. +\item If the target dataset is provided, then it is merged with the raw data set +\code{raw_dat} by the variables indicated in \code{id_vars}, with a new column: the +derived variable, as indicated in \code{tgt_var}. +} +} +\description{ +\itemize{ +\item \code{\link[=hardcode_no_ct]{hardcode_no_ct()}} maps a hardcoded value to a target SDTM variable that has +no terminology restrictions. +\item \code{\link[=hardcode_ct]{hardcode_ct()}} maps a hardcoded value to a target SDTM variable with +controlled terminology recoding. +} +} +\examples{ +md1 <- + tibble::tribble( + ~oak_id, ~raw_source, ~patient_number, ~MDRAW, + 1L, "MD1", 101L, "BABY ASPIRIN", + 2L, "MD1", 102L, "CORTISPORIN", + 3L, "MD1", 103L, NA_character_, + 4L, "MD1", 104L, "DIPHENHYDRAMINE HCL" + ) + +# Derive a new variable `CMCAT` by overwriting `MDRAW` with the +# hardcoded value "GENERAL CONCOMITANT MEDICATIONS". +hardcode_no_ct( + raw_dat = md1, + raw_var = "MDRAW", + tgt_var = "CMCAT", + tgt_val = "GENERAL CONCOMITANT MEDICATIONS" +) + +cm_inter <- + tibble::tribble( + ~oak_id, ~raw_source, ~patient_number, ~CMTRT, ~CMINDC, + 1L, "MD1", 101L, "BABY ASPIRIN", NA, + 2L, "MD1", 102L, "CORTISPORIN", "NAUSEA", + 3L, "MD1", 103L, "ASPIRIN", "ANEMIA", + 4L, "MD1", 104L, "DIPHENHYDRAMINE HCL", "NAUSEA", + 5L, "MD1", 105L, "PARACETAMOL", "PYREXIA" + ) + +# Derive a new variable `CMCAT` by overwriting `MDRAW` with the +# hardcoded value "GENERAL CONCOMITANT MEDICATIONS" with a prior join to +# `target_dataset`. +hardcode_no_ct( + raw_dat = md1, + raw_var = "MDRAW", + tgt_var = "CMCAT", + tgt_val = "GENERAL CONCOMITANT MEDICATIONS", + tgt_dat = cm_inter +) + +# Controlled terminology specification +(ct_spec <- read_ct_spec_example("ct-01-cm")) + +# Hardcoding of `CMCAT` with the value `"GENERAL CONCOMITANT MEDICATIONS"` +# involving terminology recoding. `NA` values in `MDRAW` are preserved in +# `CMCAT`. +hardcode_ct( + raw_dat = md1, + raw_var = "MDRAW", + tgt_var = "CMCAT", + tgt_val = "GENERAL CONCOMITANT MEDICATIONS", + ct_spec = ct_spec, + ct_clst = "C66729", + tgt_dat = cm_inter +) + +} diff --git a/man/index_for_recode.Rd b/man/index_for_recode.Rd new file mode 100644 index 00000000..2362517f --- /dev/null +++ b/man/index_for_recode.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recode.R +\name{index_for_recode} +\alias{index_for_recode} +\title{Determine Indices for Recoding} +\usage{ +index_for_recode(x, from) +} +\arguments{ +\item{x}{A vector of values in which to search for matches.} + +\item{from}{A vector of values to match against the elements in \code{x}.} +} +\value{ +An integer vector of the same length as \code{x}, containing the indices +of the matched values from the \code{from} vector. If an element in \code{x} does not +match any value in \code{from}, the corresponding position in the output will be +\code{NA}. This index information is critical for subsequent recoding operations. +} +\description{ +\code{\link[=index_for_recode]{index_for_recode()}} identifies the positions of elements in \code{x} that match +any of the values specified in the \code{from} vector. This function is primarily +used to facilitate the recoding of values by pinpointing which elements in +\code{x} correspond to the \code{from} values and thus need to be replaced or updated. +} +\examples{ +sdtm.oak:::index_for_recode(x = 1:5, from = c(2, 4)) + +} +\keyword{internal} diff --git a/man/oak_id_vars.Rd b/man/oak_id_vars.Rd new file mode 100644 index 00000000..af5550a3 --- /dev/null +++ b/man/oak_id_vars.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/oak_id_vars.R +\name{oak_id_vars} +\alias{oak_id_vars} +\title{Raw dataset keys} +\usage{ +oak_id_vars(extra_vars = NULL) +} +\arguments{ +\item{extra_vars}{A character vector of extra column names to be appended to +the default names: oak_id, raw_source, and patient_number.} +} +\value{ +A character vector of column names to be regarded +as keys in raw datasets. +} +\description{ +\code{\link[=oak_id_vars]{oak_id_vars()}} is a helper function providing the variable (column) names to +be regarded as keys in \link[tibble:tibble-package]{tibbles} representing raw +datasets. By default, the set of names is +oak_id, raw_source, and patient_number. Extra variable names may be +indicated and passed in \code{extra_vars} which are appended to the default names. +} +\examples{ +sdtm.oak:::oak_id_vars() + +sdtm.oak:::oak_id_vars(extra_vars = "sample_id") + +} +\keyword{internal} diff --git a/man/problems.Rd b/man/problems.Rd index ce68ad46..8a5c23fe 100644 --- a/man/problems.Rd +++ b/man/problems.Rd @@ -8,7 +8,7 @@ problems(x = .Last.value) } \arguments{ \item{x}{An object of class iso8601, as typically obtained from a call to -\code{\link[=create_iso8601]{create_iso8601()}}. The argument can also be left empty, in that case it +\code{\link[=create_iso8601]{create_iso8601()}}. The argument can also be left empty, in that case \code{problems()} will use the last returned value, making it convenient to use immediately after \code{\link[=create_iso8601]{create_iso8601()}}.} } @@ -42,7 +42,7 @@ dates <- "20231225" ) -#' # By inspecting the problematic dates it can be understood that +# By inspecting the problematic dates it can be understood that # the `.format` parameter needs to updated to include other variations. iso8601_dttm <- create_iso8601(dates, .format = "y-m-d") problems(iso8601_dttm) diff --git a/man/read_ct_spec.Rd b/man/read_ct_spec.Rd new file mode 100644 index 00000000..21d6103d --- /dev/null +++ b/man/read_ct_spec.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ct.R +\name{read_ct_spec} +\alias{read_ct_spec} +\title{Read in a controlled terminology} +\usage{ +read_ct_spec(file = stop("`file` must be specified")) +} +\arguments{ +\item{file}{A path to a file containing a controlled terminology +specification data set. The following are expected of this file: +\itemize{ +\item The file is expected to be a CSV file; +\item The file is expected to contain a first row of column names; +\item This minimal set of variables is expected: codelist_code, collected_value, term_synonyms, and term_value. +}} +} +\value{ +A \link[tibble:tibble-package]{tibble} with a controlled terminology +specification. +} +\description{ +\code{\link[=read_ct_spec]{read_ct_spec()}} imports a controlled terminology specification data set as a +\link[tibble:tibble-package]{tibble}. +} +\examples{ +# Get the local path to one of the controlled terminology example files. +path <- ct_spec_example("ct-01-cm") + +# Import it to R. +read_ct_spec(file = path) + +} diff --git a/man/read_ct_spec_example.Rd b/man/read_ct_spec_example.Rd new file mode 100644 index 00000000..b3b26c08 --- /dev/null +++ b/man/read_ct_spec_example.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ct.R +\name{read_ct_spec_example} +\alias{read_ct_spec_example} +\title{Read an example controlled terminology specification} +\usage{ +read_ct_spec_example(example) +} +\arguments{ +\item{example}{The file name of a controlled terminology data set bundled +with \code{{stdm.oak}}, run \code{read_ct_spec_example()} for available example files.} +} +\value{ +A \link[tibble:tibble-package]{tibble} with a controlled terminology +specification data set, or a character vector of example file names. +} +\description{ +\code{\link[=read_ct_spec_example]{read_ct_spec_example()}} imports one of the bundled controlled terminology +specification data sets as a \link[tibble:tibble-package]{tibble} into R. +} +\examples{ +# Leave the `example` parameter as missing for available example files. +read_ct_spec_example() + +# Read an example controlled terminology spec file. +read_ct_spec_example("ct-01-cm.csv") + +# You may omit the file extension. +read_ct_spec_example("ct-01-cm") + +} diff --git a/man/recode.Rd b/man/recode.Rd new file mode 100644 index 00000000..aca082ac --- /dev/null +++ b/man/recode.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recode.R +\name{recode} +\alias{recode} +\title{Recode values} +\usage{ +recode(x, from = unique(na.omit(x)), to = from, .no_match = x, .na = NA) +} +\arguments{ +\item{x}{An atomic vector of values are to be recoded.} + +\item{from}{A vector of values to be matched in \code{x} for recoding.} + +\item{to}{A vector of values to be used as replacement for values in \code{from}.} + +\item{.no_match}{Value to be used as replacement when cases in \code{from} are not +matched.} + +\item{.na}{Value to be used to recode missing values.} +} +\value{ +A vector of recoded values. +} +\description{ +\code{\link[=recode]{recode()}} recodes values in \code{x} by matching elements in \code{from} onto values +in \code{to}. +} +\examples{ +x <- c("male", "female", "x", NA) +sdtm.oak:::recode(x, + from = c("male", "female"), + to = c("M", "F") +) +sdtm.oak:::recode( + x, + from = c("male", "female"), + to = c("M", "F"), + .no_match = "?" +) +sdtm.oak:::recode( + x, + from = c("male", "female"), + to = c("M", "F"), + .na = "missing" +) + +} +\keyword{internal} diff --git a/man/sdtm_assign.Rd b/man/sdtm_assign.Rd new file mode 100644 index 00000000..676979dc --- /dev/null +++ b/man/sdtm_assign.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/assign.R +\name{sdtm_assign} +\alias{sdtm_assign} +\title{Derive an SDTM variable} +\usage{ +sdtm_assign( + raw_dat, + raw_var, + tgt_var, + ct_spec = NULL, + ct_clst = NULL, + tgt_dat = NULL, + id_vars = oak_id_vars() +) +} +\arguments{ +\item{raw_dat}{The raw dataset (dataframe); must include the +variables passed in \code{id_vars} and \code{raw_var}.} + +\item{raw_var}{The raw variable: a single string indicating the name of the +raw variable in \code{raw_dat}.} + +\item{tgt_var}{The target SDTM variable: a single string indicating the name +of variable to be derived.} + +\item{ct_spec}{Study controlled terminology specification: a dataframe with a +minimal set of columns, see \code{\link[=ct_spec_vars]{ct_spec_vars()}} for details. This parameter is +optional, if left as \code{NULL} no controlled terminology recoding is applied.} + +\item{ct_clst}{A codelist code indicating which subset of the controlled +terminology to apply in the derivation. This parameter is optional, if left +as \code{NULL}, all possible recodings in \code{ct_spec} are attempted.} + +\item{tgt_dat}{Target dataset: a data frame to be merged against \code{raw_dat} by +the variables indicated in \code{id_vars}. This parameter is optional, see +section Value for how the output changes depending on this argument value.} + +\item{id_vars}{Key variables to be used in the join between the raw dataset +(\code{raw_dat}) and the target data set (\code{raw_dat}).} +} +\value{ +The returned data set depends on the value of \code{tgt_dat}: +\itemize{ +\item If no target dataset is supplied, meaning that \code{tgt_dat} defaults to +\code{NULL}, then the returned data set is \code{raw_dat}, selected for the variables +indicated in \code{id_vars}, and a new extra column: the derived variable, as +indicated in \code{tgt_var}. +\item If the target dataset is provided, then it is merged with the raw data set +\code{raw_dat} by the variables indicated in \code{id_vars}, with a new column: the +derived variable, as indicated in \code{tgt_var}. +} +} +\description{ +\code{\link[=sdtm_assign]{sdtm_assign()}} is an internal function packing the same functionality as +\code{\link[=assign_no_ct]{assign_no_ct()}} and \code{\link[=assign_ct]{assign_ct()}} together but aimed at developers only. +As a user please use either \code{\link[=assign_no_ct]{assign_no_ct()}} or \code{\link[=assign_ct]{assign_ct()}}. +} +\keyword{internal} diff --git a/man/sdtm_hardcode.Rd b/man/sdtm_hardcode.Rd new file mode 100644 index 00000000..5c3435b5 --- /dev/null +++ b/man/sdtm_hardcode.Rd @@ -0,0 +1,63 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hardcode.R +\name{sdtm_hardcode} +\alias{sdtm_hardcode} +\title{Derive an SDTM variable with a hardcoded value} +\usage{ +sdtm_hardcode( + raw_dat, + raw_var, + tgt_var, + tgt_val, + ct_spec = NULL, + ct_clst = NULL, + tgt_dat = NULL, + id_vars = oak_id_vars() +) +} +\arguments{ +\item{raw_dat}{The raw dataset (dataframe); must include the +variables passed in \code{id_vars} and \code{raw_var}.} + +\item{raw_var}{The raw variable: a single string indicating the name of the +raw variable in \code{raw_dat}.} + +\item{tgt_var}{The target SDTM variable: a single string indicating the name +of variable to be derived.} + +\item{tgt_val}{The target SDTM value to be hardcoded into the variable +indicated in \code{tgt_var}.} + +\item{ct_spec}{Study controlled terminology specification: a dataframe with a +minimal set of columns, see \code{\link[=ct_spec_vars]{ct_spec_vars()}} for details. This parameter is +optional, if left as \code{NULL} no controlled terminology recoding is applied.} + +\item{ct_clst}{A codelist code indicating which subset of the controlled +terminology to apply in the derivation. This parameter is optional, if left +as \code{NULL}, all possible recodings in \code{ct_spec} are attempted.} + +\item{tgt_dat}{Target dataset: a data frame to be merged against \code{raw_dat} by +the variables indicated in \code{id_vars}. This parameter is optional, see +section Value for how the output changes depending on this argument value.} + +\item{id_vars}{Key variables to be used in the join between the raw dataset +(\code{raw_dat}) and the target data set (\code{raw_dat}).} +} +\value{ +The returned data set depends on the value of \code{tgt_dat}: +\itemize{ +\item If no target dataset is supplied, meaning that \code{tgt_dat} defaults to +\code{NULL}, then the returned data set is \code{raw_dat}, selected for the variables +indicated in \code{id_vars}, and a new extra column: the derived variable, as +indicated in \code{tgt_var}. +\item If the target dataset is provided, then it is merged with the raw data set +\code{raw_dat} by the variables indicated in \code{id_vars}, with a new column: the +derived variable, as indicated in \code{tgt_var}. +} +} +\description{ +\code{\link[=sdtm_hardcode]{sdtm_hardcode()}} is an internal function packing the same functionality as +\code{\link[=hardcode_no_ct]{hardcode_no_ct()}} and \code{\link[=hardcode_ct]{hardcode_ct()}} together but aimed at developers only. +As a user please use either \code{\link[=hardcode_no_ct]{hardcode_no_ct()}} or \code{\link[=hardcode_ct]{hardcode_ct()}}. +} +\keyword{internal} diff --git a/renv.lock b/renv.lock index 329ec463..ebc34e12 100644 --- a/renv.lock +++ b/renv.lock @@ -128,6 +128,30 @@ ], "Hash": "543776ae6848fde2f48ff3816d0628bc" }, + "bit": { + "Package": "bit", + "Version": "4.0.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "d242abec29412ce988848d0294b208fd" + }, + "bit64": { + "Package": "bit64", + "Version": "4.0.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "bit", + "methods", + "stats", + "utils" + ], + "Hash": "9fe98599ca456d6552421db0d6772d8f" + }, "brew": { "Package": "brew", "Version": "1.0-8", @@ -911,6 +935,19 @@ ], "Hash": "e9d21e79848e02e524bea6f5bd53e7e4" }, + "progress": { + "Package": "progress", + "Version": "1.2.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R6", + "crayon", + "hms", + "prettyunits" + ], + "Hash": "14dc9f7a3c91ebb14ec5bb9208a07061" + }, "promises": { "Package": "promises", "Version": "1.2.0.1", @@ -995,6 +1032,29 @@ ], "Hash": "8f25ebe2ec38b1f2aef3b0d2ef76f6c4" }, + "readr": { + "Package": "readr", + "Version": "2.1.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "cli", + "clipr", + "cpp11", + "crayon", + "hms", + "lifecycle", + "methods", + "rlang", + "tibble", + "tzdb", + "utils", + "vroom" + ], + "Hash": "b5047343b3825f37ad9d3b5d89aa1078" + }, "rematch2": { "Package": "rematch2", "Version": "2.1.2", @@ -1007,7 +1067,7 @@ }, "remotes": { "Package": "remotes", - "Version": "2.4.2.1", + "Version": "2.5.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1017,7 +1077,7 @@ "tools", "utils" ], - "Hash": "63d15047eb239f95160112bcadc4fcb9" + "Hash": "3ee025083e66f18db6cf27b56e23e141" }, "renv": { "Package": "renv", @@ -1424,6 +1484,17 @@ ], "Hash": "e4e357f28c2edff493936b6cb30c3d65" }, + "tzdb": { + "Package": "tzdb", + "Version": "0.3.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cpp11" + ], + "Hash": "b2e1cbce7c903eaf23ec05c58e59fb5e" + }, "urlchecker": { "Package": "urlchecker", "Version": "1.0.1", @@ -1493,6 +1564,32 @@ ], "Hash": "a745bda7aff4734c17294bb41d4e4607" }, + "vroom": { + "Package": "vroom", + "Version": "1.6.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "bit64", + "cli", + "cpp11", + "crayon", + "glue", + "hms", + "lifecycle", + "methods", + "progress", + "rlang", + "stats", + "tibble", + "tidyselect", + "tzdb", + "vctrs", + "withr" + ], + "Hash": "7015a74373b83ffaef64023f4a0f5033" + }, "waldo": { "Package": "waldo", "Version": "0.4.0", diff --git a/renv/profiles/4.2/renv.lock b/renv/profiles/4.2/renv.lock index 494312ef..885895ef 100644 --- a/renv/profiles/4.2/renv.lock +++ b/renv/profiles/4.2/renv.lock @@ -128,6 +128,30 @@ ], "Hash": "543776ae6848fde2f48ff3816d0628bc" }, + "bit": { + "Package": "bit", + "Version": "4.0.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "d242abec29412ce988848d0294b208fd" + }, + "bit64": { + "Package": "bit64", + "Version": "4.0.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "bit", + "methods", + "stats", + "utils" + ], + "Hash": "9fe98599ca456d6552421db0d6772d8f" + }, "brew": { "Package": "brew", "Version": "1.0-8", @@ -912,6 +936,19 @@ ], "Hash": "e9d21e79848e02e524bea6f5bd53e7e4" }, + "progress": { + "Package": "progress", + "Version": "1.2.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R6", + "crayon", + "hms", + "prettyunits" + ], + "Hash": "14dc9f7a3c91ebb14ec5bb9208a07061" + }, "promises": { "Package": "promises", "Version": "1.2.0.1", @@ -996,6 +1033,29 @@ ], "Hash": "8f25ebe2ec38b1f2aef3b0d2ef76f6c4" }, + "readr": { + "Package": "readr", + "Version": "2.1.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "cli", + "clipr", + "cpp11", + "crayon", + "hms", + "lifecycle", + "methods", + "rlang", + "tibble", + "tzdb", + "utils", + "vroom" + ], + "Hash": "b5047343b3825f37ad9d3b5d89aa1078" + }, "rematch2": { "Package": "rematch2", "Version": "2.1.2", @@ -1008,7 +1068,7 @@ }, "remotes": { "Package": "remotes", - "Version": "2.4.2.1", + "Version": "2.5.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1018,7 +1078,7 @@ "tools", "utils" ], - "Hash": "63d15047eb239f95160112bcadc4fcb9" + "Hash": "3ee025083e66f18db6cf27b56e23e141" }, "renv": { "Package": "renv", @@ -1424,6 +1484,17 @@ ], "Hash": "c0f007e2eeed7722ce13d42b84a22e07" }, + "tzdb": { + "Package": "tzdb", + "Version": "0.3.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cpp11" + ], + "Hash": "b2e1cbce7c903eaf23ec05c58e59fb5e" + }, "urlchecker": { "Package": "urlchecker", "Version": "1.0.1", @@ -1493,6 +1564,32 @@ ], "Hash": "e4ffa94ceed5f124d429a5a5f0f5b378" }, + "vroom": { + "Package": "vroom", + "Version": "1.6.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "bit64", + "cli", + "cpp11", + "crayon", + "glue", + "hms", + "lifecycle", + "methods", + "progress", + "rlang", + "stats", + "tibble", + "tidyselect", + "tzdb", + "vctrs", + "withr" + ], + "Hash": "7015a74373b83ffaef64023f4a0f5033" + }, "waldo": { "Package": "waldo", "Version": "0.4.0", diff --git a/renv/profiles/4.3/renv.lock b/renv/profiles/4.3/renv.lock index 329ec463..ebc34e12 100644 --- a/renv/profiles/4.3/renv.lock +++ b/renv/profiles/4.3/renv.lock @@ -128,6 +128,30 @@ ], "Hash": "543776ae6848fde2f48ff3816d0628bc" }, + "bit": { + "Package": "bit", + "Version": "4.0.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "d242abec29412ce988848d0294b208fd" + }, + "bit64": { + "Package": "bit64", + "Version": "4.0.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "bit", + "methods", + "stats", + "utils" + ], + "Hash": "9fe98599ca456d6552421db0d6772d8f" + }, "brew": { "Package": "brew", "Version": "1.0-8", @@ -911,6 +935,19 @@ ], "Hash": "e9d21e79848e02e524bea6f5bd53e7e4" }, + "progress": { + "Package": "progress", + "Version": "1.2.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R6", + "crayon", + "hms", + "prettyunits" + ], + "Hash": "14dc9f7a3c91ebb14ec5bb9208a07061" + }, "promises": { "Package": "promises", "Version": "1.2.0.1", @@ -995,6 +1032,29 @@ ], "Hash": "8f25ebe2ec38b1f2aef3b0d2ef76f6c4" }, + "readr": { + "Package": "readr", + "Version": "2.1.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "cli", + "clipr", + "cpp11", + "crayon", + "hms", + "lifecycle", + "methods", + "rlang", + "tibble", + "tzdb", + "utils", + "vroom" + ], + "Hash": "b5047343b3825f37ad9d3b5d89aa1078" + }, "rematch2": { "Package": "rematch2", "Version": "2.1.2", @@ -1007,7 +1067,7 @@ }, "remotes": { "Package": "remotes", - "Version": "2.4.2.1", + "Version": "2.5.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1017,7 +1077,7 @@ "tools", "utils" ], - "Hash": "63d15047eb239f95160112bcadc4fcb9" + "Hash": "3ee025083e66f18db6cf27b56e23e141" }, "renv": { "Package": "renv", @@ -1424,6 +1484,17 @@ ], "Hash": "e4e357f28c2edff493936b6cb30c3d65" }, + "tzdb": { + "Package": "tzdb", + "Version": "0.3.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cpp11" + ], + "Hash": "b2e1cbce7c903eaf23ec05c58e59fb5e" + }, "urlchecker": { "Package": "urlchecker", "Version": "1.0.1", @@ -1493,6 +1564,32 @@ ], "Hash": "a745bda7aff4734c17294bb41d4e4607" }, + "vroom": { + "Package": "vroom", + "Version": "1.6.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "bit64", + "cli", + "cpp11", + "crayon", + "glue", + "hms", + "lifecycle", + "methods", + "progress", + "rlang", + "stats", + "tibble", + "tidyselect", + "tzdb", + "vctrs", + "withr" + ], + "Hash": "7015a74373b83ffaef64023f4a0f5033" + }, "waldo": { "Package": "waldo", "Version": "0.4.0", diff --git a/tests/testthat/test-ct.R b/tests/testthat/test-ct.R new file mode 100644 index 00000000..e1fb5ca7 --- /dev/null +++ b/tests/testthat/test-ct.R @@ -0,0 +1,268 @@ +test_that("ct_spec_vars() works as expected", { + expect_identical( + ct_spec_vars(), + c( + "codelist_code", + "collected_value", + "term_synonyms", + "term_value" + ) + ) + + expect_identical( + ct_spec_vars(set = "all"), + c( + "codelist_code", + "collected_value", + "term_synonyms", + "term_value" + ) + ) + + expect_identical( + ct_spec_vars(set = "ct_clst"), + "codelist_code" + ) + + expect_identical( + ct_spec_vars(set = "from"), + c( + "collected_value", + "term_synonyms" + ) + ) + + expect_identical(ct_spec_vars(set = "to"), "term_value") +}) + +test_that("ct_spec_vars() fails with invalid input choice", { + expect_error(ct_spec_vars("foo")) + expect_error(ct_spec_vars(1L)) + expect_error(ct_spec_vars(FALSE)) + expect_error(ct_spec_vars(NULL)) +}) + +test_that("assert_ct_spec() works as expected", { + # Load an example controlled terminology spec. + ct_spec <- read_ct_spec_example("ct-01-cm") + cols <- colnames(ct_spec) + ct_clst_col <- ct_spec_vars("ct_clst") + to_col <- ct_spec_vars("to") + + expect_no_error(assert_ct_spec(ct_spec, optional = FALSE)) + expect_no_error(assert_ct_spec(ct_spec, optional = TRUE)) + expect_identical(assert_ct_spec(ct_spec, optional = FALSE), ct_spec) + expect_identical(assert_ct_spec(ct_spec, optional = TRUE), ct_spec) + expect_null(assert_ct_spec(NULL, optional = TRUE)) + + # Codelist code column is one of the key variables that must be present + # in `ct_spec`, so being missing should trigger an error. + expect_error(assert_ct_spec(ct_spec[setdiff(cols, ct_clst_col)], optional = FALSE)) + expect_error(assert_ct_spec(ct_spec[setdiff(cols, ct_clst_col)], optional = TRUE)) + + # The codelist code and the "to" columns of a controlled terminology should + # not contain NAs, as otherwise the mapping is undefined. If that happens + # an error is triggered. + ct_spec01 <- ct_spec + ct_spec01[[ct_clst_col]] <- NA_character_ + expect_error(assert_ct_spec(ct_spec01, optional = FALSE)) + expect_error(assert_ct_spec(ct_spec01, optional = TRUE)) + + ct_spec02 <- ct_spec + ct_spec02[[to_col]] <- NA_character_ + expect_error(assert_ct_spec(ct_spec01, optional = FALSE)) + expect_error(assert_ct_spec(ct_spec01, optional = TRUE)) + + ct_spec_empty <- + data.frame( + codelist_code = character(), + collected_value = character(), + term_synonyms = character(), + term_value = character(), + stringsAsFactors = FALSE + ) + + # `ct_spec` cannot be empty as that means that there are no mappings. + expect_error(assert_ct_spec(ct_spec_empty, optional = TRUE)) + expect_error(assert_ct_spec(ct_spec_empty, optional = FALSE)) +}) + +test_that("assert_ct_clst() works as expected", { + # Read in a controlled terminology example. + ct_spec <- read_ct_spec_example("ct-01-cm") + + # If `ct_clst` is not supplied and is not optional, then it should err. + expect_error(assert_ct_clst( + ct_spec = NULL, + ct_clst = NULL, + optional = FALSE + )) + + # If `ct_clst` is not supplied but it is optional, then all fine. + expect_no_error(assert_ct_clst( + ct_spec = NULL, + ct_clst = NULL, + optional = TRUE + )) + # Moreover, in case of no error, `ct_clst` should be returned invisibly, in this + # case `NULL`. + expect_null(assert_ct_clst( + ct_spec = NULL, + ct_clst = NULL, + optional = TRUE + )) + + # If `ct_clst` is supplied but `ct_spec` is not, then err. + expect_error(assert_ct_clst( + ct_spec = NULL, + ct_clst = "C71113", + optional = FALSE + )) + expect_error(assert_ct_clst( + ct_spec = NULL, + ct_clst = "C71113", + optional = TRUE + )) + + # If `ct_spec` is supplied but `ct_clst` is NULL, then err if `ct_clst` is not optional, or + # return `ct_clst` invisibly. + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = NULL, + optional = FALSE + )) + expect_no_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = NULL, + optional = TRUE + )) + expect_null(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = NULL, + optional = TRUE + )) + + # If both `ct_spec` and `ct_clst` are supplied, then `ct_spec` must be a valid controlled + # terminology data set and `ct_clst` must contain a codelist code available among + # the possibilities in column `codelist_code` (as returned by `ct_spec_vars("ct_clst")`). + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = "foo", + optional = FALSE + )) + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = "", + optional = FALSE + )) + + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = NA_character_, + optional = FALSE + )) + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = NA_character_, + optional = TRUE + )) + + expect_identical(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = "C71113", + optional = FALSE + ), "C71113") + expect_identical(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = "C66726", + optional = FALSE + ), "C66726") + expect_identical(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = "C71113", + optional = TRUE + ), "C71113") + expect_identical(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = "C66726", + optional = TRUE + ), "C66726") +}) + +test_that("assert_ct_clst(): when ct_spec is empty", { + ct_spec <- + data.frame( + codelist_code = character(), + collected_value = character(), + term_synonyms = character(), + term_value = character(), + stringsAsFactors = FALSE + ) + + # If `ct_spec` is supplied but `ct_clst` is NULL, then err if `ct_clst` is not optional, or + # return `ct_clst` invisibly. + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = NULL, + optional = FALSE + )) + expect_no_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = NULL, + optional = TRUE + )) + expect_null(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = NULL, + optional = TRUE + )) + + # If both `ct_spec` and `ct_clst` are supplied, then `ct_spec` must be a valid controlled + # terminology data set and `ct_clst` must contain a codelist code available among + # the possibilities in column `codelist_code` (as returned by `ct_spec_vars("ct_clst")`). + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = "foo", + optional = FALSE + )) + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = "", + optional = FALSE + )) + + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = NA_character_, + optional = FALSE + )) + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = NA_character_, + optional = TRUE + )) + + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = "C71113", + optional = FALSE + )) + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = "C71113", + optional = TRUE + )) +}) + +test_that("ct_mappings(): works as expected", { + ct_spec <- read_ct_spec_example("ct-01-cm") + ct_spec_qd <- dplyr::filter(ct_spec, term_code == "C25473") + + expect_identical( + ct_mappings(ct_spec = ct_spec_qd), + tibble::tibble( + from = c("QD", "QD (Every Day)", "/day", "Daily", "Per Day"), + to = rep("QD", 5L) + ) + ) +}) diff --git a/tests/testthat/test-recode.R b/tests/testthat/test-recode.R new file mode 100644 index 00000000..87562015 --- /dev/null +++ b/tests/testthat/test-recode.R @@ -0,0 +1,125 @@ +test_that("recode() works as intended on typical input", { + x <- c("a", NA_character_, "α") + # Map letters from lowercase to uppercase. NA is left as NA. Unmatched + # values are returned as inputted. + expect_identical( + recode(x = x, from = letters, to = LETTERS), + c("A", NA_character_, "α") + ) + + # The same as before but map now to integer values. Note though that the + # type of the returned vector is nonetheless character because "α" is not + # matched and will be preserved in the output, forcing coercion to character. + expect_identical( + recode( + x = x, + from = letters, + to = seq_along(LETTERS) + ), + c("1", NA_character_, "α") + ) + + # Now that `.no_match` is of integer type, like the vector supplied in `to`, + # the returned vector is also integer + expect_identical( + recode( + x = x, + from = letters, + to = seq_along(LETTERS), + .no_match = 0L + ), + c(1L, NA_integer_, 0L) + ) +}) + +test_that("recode() handling of NAs in input", { + x <- c("aye", "nay", "maybe", NA_character_) + from <- c("aye", "nay") + to <- c("yes", "no") + + expect_identical( + recode(x = x, from = from, to = to), + c("yes", "no", "maybe", NA_character_) + ) + expect_identical( + recode( + x = x, + from = from, + to = to, + .na = "uh?" + ), + c("yes", "no", "maybe", "uh?") + ) + + # The type of the vector in the output is always the most compatible across + # the types of `to`, `.no_match` and `.na`. + expect_identical( + recode( + x = x, + from = from, + to = to, + .na = NA + ), + c("yes", "no", "maybe", NA_character_) + ) + expect_identical( + recode( + x = x, + from = from, + to = to, + .na = NA_integer_ + ), + c("yes", "no", "maybe", NA_character_) + ) + expect_identical( + recode( + x = x, + from = from, + to = to, + .na = NA_character_ + ), + c("yes", "no", "maybe", NA_character_) + ) +}) + +test_that("recode(): recycling between `from` and `to` parameters", { + x <- c("aye", "nay", "maybe", NA_character_) + from <- c("aye", "nay") + to <- "?" + + # Mapping one to many values + expect_identical( + recode(x = x, from = from, to = to), + c("?", "?", "maybe", NA_character_) + ) + + # Enforce every value to become the hardcoded value specified in `to`. + expect_identical( + recode( + x = x, + from = from, + to = to, + .no_match = to, + .na = to + ), + c("?", "?", "?", "?") + ) +}) + +test_that("recode(): notable cases", { + x <- c(letters[1L:3L], NA_character_) + + # Identity: no recoding. + expect_identical(recode(x = x), x) + + # Hardcode all values, leave NA at peace + expect_identical(recode(x = x, to = "X"), c(rep("X", 3L), NA_character_)) + + # Or, really hardcode every single value + expect_identical(recode( + x = x, + to = "X", + .no_match = "X", + .na = "X" + ), rep("X", 4L)) +})