From 5fc61afa1ceb0376237e2df90684910aae13182f Mon Sep 17 00:00:00 2001 From: Ramiro Magno Date: Wed, 10 Apr 2024 16:11:51 +0100 Subject: [PATCH] Implementation of `hardcode_no_ct()`, `hardcode_ct()`, `assign_no_ct()` and `assign_ct()` (#41) * First mockup of `hardcode_no_ct()` * Update `hardcode_no_ct()` Update `hardcode_no_ct()` by allowing the rewriting of the `target_sdtm_variable` variable to preserve `NA` * Align `hardcode_no_ct()` code style with Ramm's expectations * Add `hardcode_*()` and `assign_*()` functions * hardcode_no_ct algorithm code changes (#45) * hardcode_no_ct algorithm code changes * harcode_ct working as expected * assign_ct and assign_no_ct works great. * address review comments * Add `oak_id_vars()` * Fix typo in `recode()` * Simplify `oak_id_vars()` docs * Update `assign_*` and `hardcode_*` implementations * Introduce memoisation of `ct_mappings()` * Update of README introductory paragraph * Update hardcode_* functions' interface * Add `contains_oak_id_vars()` function * Update `contains_oak_id_vars()` doc examples * Update `sdtm_harcode()` and dependant functions * Update `assign_*` and `hardcore_*` related functions * Automatic renv profile update. * Automatic renv profile update. * Make `ct` and `cl` parameters mandatory for `assign_ct()` * Add functions ct importing - Adds three new user facing ct-related functions: `read_ct_example()`, `ct_example()` and `read_ct()` - Provides a ct example file in inst/ct/ * Bring `hardcode*()` and `assign*()` related assertions closer to user calling functions * Add lagging behind Rd for `ct_example()` * Add `assert_ct()` * Add ct assertions * Remove R/.gitkeep As it is no longer needed. * Add unit tests for `ct_vars()` * Update dependencies * Export `ct_vars()` Export `ct_vars()` such that we can cross-reference it from other functions' documentation. * Update `assert_ct()` docs * Clarify `assign_ct()`/`assign_no_ct()` doc * Improve grammar in doc * Remove last empty line from ct example file * Add documentation to `sdtm_assign()` and ct-related unit tests Although we had discussed to keep assertions only at the user facing functions, I am getting the feeling we would miss assertions also at the internal function... because of several reasons: firstly, the internal function is more flexible having more optional parameters, which requires extra assertion logic, and also because eventually we will be checking code coverage and we will regret not having done this now. * Update hardcode-related fns * Changes to meet linter issues * Code reformatting * Code reflow * Improve `assert_cl()` docs * Update `read_ct()` docs * Automatic renv profile update. * Automatic renv profile update. * Add units tests for `recode()` * Remove `are_to_recode()` function Ended up not using this function. * Add units tests for `assert_ct()` * Add one more test for `assert_ct()` * Add a basic unit test for `ct_mappings()` * Fill in some doc details of ct-related functions * Remove leftover doc text in `assign` * Update website's reference * Styling update * Bump version and update NEWS * Fix a few lintr issues * Add examples to `ct_map()` doc * Fix typo in `problems()` doc * Fix typo * Remove lint issues * Replace `.data` usage in tidyselect expressions See https://github.com/tidyverse/tidyverse.org/pull/600 for more details. * Variable renaming - `ct` to `ct_spec` (ct specification) - `cl` to `ct_cltc` (codelist code) * Finish pending renaming of variables * Rename code-list to codelist * Fix style * Fix style * Update `ct_map()` doc example * Make tibbles more readable in doc examples * Rename `ct_cltc` to `ct_clst` As per @rammprasad's suggestion. --------- Co-authored-by: Ram Ganapathy Co-authored-by: ramiromagno --- .Rbuildignore | 1 + DESCRIPTION | 8 +- NAMESPACE | 13 ++ NEWS.md | 12 + R/.gitkeep | 0 R/assign.R | 243 ++++++++++++++++++++ R/clear_cache.R | 19 ++ R/ct.R | 434 +++++++++++++++++++++++++++++++++++ R/dtc_problems.R | 4 +- R/hardcode.R | 251 ++++++++++++++++++++ R/oak_id_vars.R | 53 +++++ R/recode.R | 68 ++++++ R/sdtm.oak-package.R | 1 + R/str_split.R | 13 ++ R/zzz.R | 3 + _pkgdown.yml | 26 +++ inst/WORDLIST | 11 + inst/ct/README.md | 22 ++ inst/ct/ct-01-cm.csv | 34 +++ man/assert_ct_clst.Rd | 43 ++++ man/assert_ct_spec.Rd | 45 ++++ man/assign.Rd | 141 ++++++++++++ man/clear_cache.Rd | 25 ++ man/contains_oak_id_vars.Rd | 34 +++ man/ct_map.Rd | 67 ++++++ man/ct_mappings.Rd | 47 ++++ man/ct_spec_example.Rd | 35 +++ man/ct_spec_vars.Rd | 39 ++++ man/harcode.Rd | 133 +++++++++++ man/index_for_recode.Rd | 30 +++ man/oak_id_vars.Rd | 30 +++ man/problems.Rd | 4 +- man/read_ct_spec.Rd | 33 +++ man/read_ct_spec_example.Rd | 31 +++ man/recode.Rd | 48 ++++ man/sdtm_assign.Rd | 59 +++++ man/sdtm_hardcode.Rd | 63 +++++ renv.lock | 101 +++++++- renv/profiles/4.2/renv.lock | 101 +++++++- renv/profiles/4.3/renv.lock | 101 +++++++- tests/testthat/test-ct.R | 268 +++++++++++++++++++++ tests/testthat/test-recode.R | 125 ++++++++++ 42 files changed, 2807 insertions(+), 12 deletions(-) delete mode 100644 R/.gitkeep create mode 100644 R/assign.R create mode 100644 R/clear_cache.R create mode 100644 R/ct.R create mode 100644 R/hardcode.R create mode 100644 R/oak_id_vars.R create mode 100644 R/recode.R create mode 100644 R/str_split.R create mode 100644 R/zzz.R create mode 100644 inst/ct/README.md create mode 100644 inst/ct/ct-01-cm.csv create mode 100644 man/assert_ct_clst.Rd create mode 100644 man/assert_ct_spec.Rd create mode 100644 man/assign.Rd create mode 100644 man/clear_cache.Rd create mode 100644 man/contains_oak_id_vars.Rd create mode 100644 man/ct_map.Rd create mode 100644 man/ct_mappings.Rd create mode 100644 man/ct_spec_example.Rd create mode 100644 man/ct_spec_vars.Rd create mode 100644 man/harcode.Rd create mode 100644 man/index_for_recode.Rd create mode 100644 man/oak_id_vars.Rd create mode 100644 man/read_ct_spec.Rd create mode 100644 man/read_ct_spec_example.Rd create mode 100644 man/recode.Rd create mode 100644 man/sdtm_assign.Rd create mode 100644 man/sdtm_hardcode.Rd create mode 100644 tests/testthat/test-ct.R create mode 100644 tests/testthat/test-recode.R diff --git a/.Rbuildignore b/.Rbuildignore index c8038ef3..80fe0c2f 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -14,3 +14,4 @@ ^data-raw$ ^staged_dependencies.yaml$ ^vignettes/articles$ +^inst/ct/README.md$ diff --git a/DESCRIPTION b/DESCRIPTION index ce8b29a7..8abff617 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: sdtm.oak Type: Package Title: SDTM Data Transformation Engine -Version: 0.0.0.9001 +Version: 0.0.0.9002 Authors@R: c( person("Rammprasad", "Ganapathy", role = c("aut", "cre"), email = "ganapathy.rammprasad@gene.com"), @@ -38,11 +38,15 @@ Depends: R (>= 4.2) Imports: admiraldev, dplyr (>= 1.0.0), + memoise, assertthat, purrr (>= 0.3.3), rlang (>= 0.4.4), stringr (>= 1.4.0), - tibble + tibble, + vctrs, + readr, + glue Suggests: knitr, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index f1161f5e..6170cee4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,9 +1,22 @@ # Generated by roxygen2: do not edit by hand S3method(print,iso8601) +export(assign_ct) +export(assign_no_ct) +export(clear_cache) export(create_iso8601) +export(ct_map) +export(ct_spec_example) +export(ct_spec_vars) export(derive_study_day) export(fmt_cmp) +export(hardcode_ct) +export(hardcode_no_ct) export(problems) +export(read_ct_spec) +export(read_ct_spec_example) +importFrom(rlang,"%||%") +importFrom(rlang,":=") importFrom(rlang,.data) +importFrom(stats,na.omit) importFrom(tibble,tibble) diff --git a/NEWS.md b/NEWS.md index 77776156..297b7d53 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,15 @@ +# sdtm.oak 0.0.0.9002 (development version) + +## New Features + +* New function: `derive_study_day()` for study day calculation. + +* New functions for basic SDTM derivations: ` assign_no_ct()`, `assign_ct()`, +`hardcode_no_ct()` and `hardcode_ct()`. + +* New functions for handling controlled terminologies: `read_ct_spec()`, +`read_ct_spec_example()`, `ct_spec_example()` and `ct_map()`. + # sdtm.oak 0.0.0.9001 (development version) ## New Features diff --git a/R/.gitkeep b/R/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/R/assign.R b/R/assign.R new file mode 100644 index 00000000..a91eacaf --- /dev/null +++ b/R/assign.R @@ -0,0 +1,243 @@ +#' Derive an SDTM variable +#' +#' @description +#' [sdtm_assign()] is an internal function packing the same functionality as +#' [assign_no_ct()] and [assign_ct()] together but aimed at developers only. +#' As a user please use either [assign_no_ct()] or [assign_ct()]. +#' +#' @param raw_dat The raw dataset (dataframe); must include the +#' variables passed in `id_vars` and `raw_var`. +#' @param raw_var The raw variable: a single string indicating the name of the +#' raw variable in `raw_dat`. +#' @param tgt_var The target SDTM variable: a single string indicating the name +#' of variable to be derived. +#' @param ct_spec Study controlled terminology specification: a dataframe with a +#' minimal set of columns, see [ct_spec_vars()] for details. This parameter is +#' optional, if left as `NULL` no controlled terminology recoding is applied. +#' @param ct_clst A codelist code indicating which subset of the controlled +#' terminology to apply in the derivation. This parameter is optional, if left +#' as `NULL`, all possible recodings in `ct_spec` are attempted. +#' @param tgt_dat Target dataset: a data frame to be merged against `raw_dat` by +#' the variables indicated in `id_vars`. This parameter is optional, see +#' section Value for how the output changes depending on this argument value. +#' @param id_vars Key variables to be used in the join between the raw dataset +#' (`raw_dat`) and the target data set (`raw_dat`). +#' +#' @returns The returned data set depends on the value of `tgt_dat`: +#' - If no target dataset is supplied, meaning that `tgt_dat` defaults to +#' `NULL`, then the returned data set is `raw_dat`, selected for the variables +#' indicated in `id_vars`, and a new extra column: the derived variable, as +#' indicated in `tgt_var`. +#' - If the target dataset is provided, then it is merged with the raw data set +#' `raw_dat` by the variables indicated in `id_vars`, with a new column: the +#' derived variable, as indicated in `tgt_var`. +#' +#' +#' @importFrom rlang := +#' @keywords internal +sdtm_assign <- function(raw_dat, + raw_var, + tgt_var, + ct_spec = NULL, + ct_clst = NULL, + tgt_dat = NULL, + id_vars = oak_id_vars()) { + admiraldev::assert_character_scalar(raw_var) + admiraldev::assert_character_scalar(tgt_var) + admiraldev::assert_character_vector(id_vars) + assertthat::assert_that(contains_oak_id_vars(id_vars), + msg = "`id_vars` must include the oak id vars." + ) + admiraldev::assert_data_frame(raw_dat, required_vars = rlang::syms(c(id_vars, raw_var))) + admiraldev::assert_data_frame(tgt_dat, required_vars = rlang::syms(id_vars), optional = TRUE) + assert_ct_spec(ct_spec, optional = TRUE) + assert_ct_clst(ct_spec = ct_spec, ct_clst = ct_clst, optional = TRUE) + + # Recode the raw variable following terminology. + tgt_val <- ct_map(raw_dat[[raw_var]], ct_spec = ct_spec, ct_clst = ct_clst) + + # Apply derivation by assigning `raw_var` to `tgt_var`. + # `der_dat`: derived dataset. + der_dat <- + raw_dat |> + dplyr::select(c(id_vars, raw_var)) |> + dplyr::mutate("{tgt_var}" := tgt_val) |> # nolint object_name_linter() + dplyr::select(-rlang::sym(raw_var)) + + # If a target dataset is supplied, then join the so far derived dataset with + # the target dataset (`tgt_dat`), otherwise leave it be. + der_dat <- + if (!is.null(tgt_dat)) { + der_dat |> + dplyr::right_join(y = tgt_dat, by = id_vars) |> + dplyr::relocate(tgt_var, .after = dplyr::last_col()) + } else { + der_dat + } + + der_dat +} + +#' Derive an SDTM variable +#' +#' @description +#' - [assign_no_ct()] maps a variable in a raw dataset to a target SDTM +#' variable that has no terminology restrictions. +#' +#' - [assign_ct()] maps a variable in a raw dataset to a target SDTM variable +#' following controlled terminology recoding. +#' +#' @param raw_dat The raw dataset (dataframe); must include the +#' variables passed in `id_vars` and `raw_var`. +#' @param raw_var The raw variable: a single string indicating the name of the +#' raw variable in `raw_dat`. +#' @param tgt_var The target SDTM variable: a single string indicating the name +#' of variable to be derived. +#' @param ct_spec Study controlled terminology specification: a dataframe with a +#' minimal set of columns, see [ct_spec_vars()] for details. +#' @param ct_clst A codelist code indicating which subset of the controlled +#' terminology to apply in the derivation. +#' @param tgt_dat Target dataset: a data frame to be merged against `raw_dat` by +#' the variables indicated in `id_vars`. This parameter is optional, see +#' section Value for how the output changes depending on this argument value. +#' @param id_vars Key variables to be used in the join between the raw dataset +#' (`raw_dat`) and the target data set (`raw_dat`). +#' +#' @returns The returned data set depends on the value of `tgt_dat`: +#' - If no target dataset is supplied, meaning that `tgt_dat` defaults to +#' `NULL`, then the returned data set is `raw_dat`, selected for the variables +#' indicated in `id_vars`, and a new extra column: the derived variable, as +#' indicated in `tgt_var`. +#' - If the target dataset is provided, then it is merged with the raw data set +#' `raw_dat` by the variables indicated in `id_vars`, with a new column: the +#' derived variable, as indicated in `tgt_var`. +#' +#' @examples +#' +#' md1 <- +#' tibble::tibble( +#' oak_id = 1:14, +#' raw_source = "MD1", +#' patient_number = 101:114, +#' MDIND = c( +#' "NAUSEA", "NAUSEA", "ANEMIA", "NAUSEA", "PYREXIA", +#' "VOMITINGS", "DIARHHEA", "COLD", +#' "FEVER", "LEG PAIN", "FEVER", "COLD", "COLD", "PAIN" +#' ) +#' ) +#' +#' assign_no_ct( +#' raw_dat = md1, +#' raw_var = "MDIND", +#' tgt_var = "CMINDC", +#' ) +#' +#' cm_inter <- +#' tibble::tibble( +#' oak_id = 1:14, +#' raw_source = "MD1", +#' patient_number = 101:114, +#' CMTRT = c( +#' "BABY ASPIRIN", +#' "CORTISPORIN", +#' "ASPIRIN", +#' "DIPHENHYDRAMINE HCL", +#' "PARCETEMOL", +#' "VOMIKIND", +#' "ZENFLOX OZ", +#' "AMITRYPTYLINE", +#' "BENADRYL", +#' "DIPHENHYDRAMINE HYDROCHLORIDE", +#' "TETRACYCLINE", +#' "BENADRYL", +#' "SOMINEX", +#' "ZQUILL" +#' ), +#' CMROUTE = c( +#' "ORAL", +#' "ORAL", +#' NA, +#' "ORAL", +#' "ORAL", +#' "ORAL", +#' "INTRAMUSCULAR", +#' "INTRA-ARTERIAL", +#' NA, +#' "NON-STANDARD", +#' "RANDOM_VALUE", +#' "INTRA-ARTICULAR", +#' "TRANSDERMAL", +#' "OPHTHALMIC" +#' ) +#' ) +#' +#' # Controlled terminology specification +#' (ct_spec <- read_ct_spec_example("ct-01-cm")) +#' +#' assign_ct( +#' raw_dat = md1, +#' raw_var = "MDIND", +#' tgt_var = "CMINDC", +#' ct_spec = ct_spec, +#' ct_clst = "C66729", +#' tgt_dat = cm_inter +#' ) +#' +#' @name assign +NULL + +#' @order 1 +#' @export +#' @rdname assign +assign_no_ct <- function(raw_dat, + raw_var, + tgt_var, + tgt_dat = NULL, + id_vars = oak_id_vars()) { + admiraldev::assert_character_scalar(raw_var) + admiraldev::assert_character_scalar(tgt_var) + admiraldev::assert_character_vector(id_vars) + assertthat::assert_that(contains_oak_id_vars(id_vars), + msg = "`id_vars` must include the oak id vars." + ) + admiraldev::assert_data_frame(raw_dat, required_vars = rlang::syms(c(id_vars, raw_var))) + admiraldev::assert_data_frame(tgt_dat, required_vars = rlang::syms(id_vars), optional = TRUE) + + sdtm_assign( + raw_dat = raw_dat, + raw_var = raw_var, + tgt_var = tgt_var, + tgt_dat = tgt_dat, + id_vars = id_vars + ) +} + +#' @order 2 +#' @export +#' @rdname assign +assign_ct <- function(raw_dat, + raw_var, + tgt_var, + ct_spec, + ct_clst, + tgt_dat = NULL, + id_vars = oak_id_vars()) { + admiraldev::assert_character_scalar(raw_var) + admiraldev::assert_character_scalar(tgt_var) + admiraldev::assert_character_vector(id_vars) + assertthat::assert_that(contains_oak_id_vars(id_vars), + msg = "`id_vars` must include the oak id vars." + ) + admiraldev::assert_data_frame(raw_dat, required_vars = rlang::syms(c(id_vars, raw_var))) + admiraldev::assert_data_frame(tgt_dat, required_vars = rlang::syms(id_vars), optional = TRUE) + + sdtm_assign( + raw_dat = raw_dat, + raw_var = raw_var, + tgt_var = tgt_var, + tgt_dat = tgt_dat, + id_vars = id_vars, + ct_spec = ct_spec, + ct_clst = ct_clst + ) +} diff --git a/R/clear_cache.R b/R/clear_cache.R new file mode 100644 index 00000000..047bd23e --- /dev/null +++ b/R/clear_cache.R @@ -0,0 +1,19 @@ +#' Clear `{sdtm.oak}` cache of memoised functions +#' +#' @description +#' Some of `{sdtm.oak}` functions have their results cached for runtime +#' efficiency. Use this function to reset the cache. +#' +#' Memoised functions: +#' - [ct_mappings()] +#' +#' @return Returns a logical value, indicating whether the resetting of the +#' cache was successful (`TRUE`) or not (`FALSE`). +#' +#' @examples +#' clear_cache() +#' +#' @export +clear_cache <- function() { + memoise::forget(ct_mappings) +} diff --git a/R/ct.R b/R/ct.R new file mode 100644 index 00000000..32b0596a --- /dev/null +++ b/R/ct.R @@ -0,0 +1,434 @@ +#' Controlled terminology variables +#' +#' @description +#' [ct_spec_vars()] returns the mandatory variables to be present in a data set +#' representing a controlled terminology. By default, it returns all required +#' variables. +#' +#' If only the subset of variables used for matching terms are needed, then +#' request this subset of variables by passing the argument value `"from"`. If +#' only the mapping-to variable is to be requested, then simply pass `"to"`. If +#' only the codelist code variable name is needed then pass `"ct_clst"`. +#' +#' @param set A scalar character (string), one of: `"all"` (default), `"ct_clst"`, +#' `"from"` or `"to"`. +#' +#' @examples +#' # These two calls are equivalent and return all required variables in a +#' # controlled terminology data set. +#' sdtm.oak:::ct_spec_vars() +#' sdtm.oak:::ct_spec_vars("all") +#' +#' # "Codelist code" variable name. +#' sdtm.oak:::ct_spec_vars("ct_clst") +#' +#' # "From" variables +#' sdtm.oak:::ct_spec_vars("from") +#' +#' # The "to" variable. +#' sdtm.oak:::ct_spec_vars("to") +#' +#' @keywords internal +#' @export +ct_spec_vars <- function(set = c("all", "ct_clst", "from", "to")) { + admiraldev::assert_character_vector(set) + + set <- match.arg(set) + ct_clst_var <- "codelist_code" + from_vars <- c("collected_value", "term_synonyms") + to_var <- "term_value" + + if (identical(set, "all")) { + return(c(ct_clst_var, from_vars, to_var)) + } + + if (identical(set, "ct_clst")) { + return(ct_clst_var) + } + + if (identical(set, "from")) { + return(from_vars) + } + + if (identical(set, "to")) { + return(to_var) + } +} + +#' Assert a controlled terminology specification +#' +#' @description +#' [assert_ct_spec()] will check whether `ct_spec` is a data frame and if it contains the +#' variables: `r knitr::combine_words(ct_spec_vars())`. +#' +#' In addition, it will also check if the data frame is not empty (no rows), and +#' whether the columns \code{`r ct_spec_vars('ct_clst')`} and \code{`r ct_spec_vars('to')`} do +#' not contain any `NA` values. +#' +#' @param ct_spec A data frame to be asserted as a valid controlled terminology data +#' set. +#' +#' @returns The function throws an error if `ct_spec` is not a valid controlled +#' terminology data set; otherwise, `ct_spec` is returned invisibly. +#' +#' @examples +#' # If `ct_spec` is a valid controlled terminology then it is returned invisibly. +#' ct_spec_01 <- read_ct_spec_example("ct-01-cm") +#' all.equal(ct_spec_01, sdtm.oak:::assert_ct_spec(ct_spec_01)) +#' +#' # A minimal set of variables needs to be present in `ct_spec` for it to pass the +#' # assertion; `sdtm.oak:::ct_spec_vars()` defines their names. +#' (req_vars <- sdtm.oak:::ct_spec_vars()) +#' +#' # Other (facultative) variables also present in the controlled terminology +#' # example. +#' (opt_vars <- setdiff(colnames(ct_spec_01), req_vars)) +#' +#' # With only the mandatory variables, the assertion still passes. +#' sdtm.oak:::assert_ct_spec(ct_spec_01[req_vars]) +#' +#' # Not having the required variables results in an error. +#' try(sdtm.oak:::assert_ct_spec(ct_spec_01[opt_vars])) +#' +#' @keywords internal +assert_ct_spec <- function(ct_spec, optional = FALSE) { + admiraldev::assert_data_frame( + arg = ct_spec, + required_vars = rlang::syms(ct_spec_vars()), + optional = optional + ) + + if (!is.null(ct_spec) && nrow(ct_spec) == 0L) { + rlang::abort("`ct_spec` can't be empty.") + } + + if (!is.null(ct_spec) && anyNA(ct_spec[[ct_spec_vars("ct_clst")]])) { + rlang::abort(glue::glue("`{ct_spec_vars('ct_clst')}` can't have any NA values.")) + } + + if (!is.null(ct_spec) && anyNA(ct_spec[[ct_spec_vars("to")]])) { + rlang::abort(glue::glue("`{ct_spec_vars('to')}` can't have any NA values.")) + } + + invisible(ct_spec) +} + +#' Assert a codelist code +#' +#' [assert_ct_clst()] asserts the validity of a codelist code in the context of +#' a controlled terminology specification. +#' +#' @param ct_spec Either a data frame encoding a controlled terminology data set, or +#' `NULL`. +#' @param ct_clst A string with a to-be asserted codelist code, or `NULL`. +#' @param optional A scalar logical, indicating whether `ct_clst` can be `NULL` or +#' not. +#' +#' @returns The function throws an error if `ct_clst` is not a valid codelist code +#' given the controlled terminology data set; otherwise, `ct_clst` is returned +#' invisibly. +#' +#' @examples +#' # Load a controlled terminology example. +#' (ct_spec <- read_ct_spec_example("ct-01-cm")) +#' +#' # Should work fine. +#' sdtm.oak:::assert_ct_clst(ct_spec = ct_spec, ct_clst = "C71113") +#' +#' # In certain cases, you might allow `ct_clst` to be `NULL` as to indicate absence, +#' # in that case, set `optional` to `TRUE` to make `assert_ct_clst()` more +#' # forgiving. +#' sdtm.oak:::assert_ct_clst(ct_spec = ct_spec, ct_clst = NULL, optional = TRUE) +#' +#' # Otherwise it would err. +#' try(sdtm.oak:::assert_ct_clst(ct_spec = ct_spec, ct_clst = NULL, optional = FALSE)) +#' +#' @keywords internal +assert_ct_clst <- function(ct_spec, ct_clst, optional = FALSE) { + is_ct_spec_missing <- is.null(ct_spec) + is_ct_clst_missing <- is.null(ct_clst) + is_required_ct_clst_missing <- is_ct_clst_missing && !optional + is_ct_clst_without_ct_spec <- is_ct_spec_missing && !is_ct_clst_missing + are_ct_spec_ct_clst_available <- !is_ct_spec_missing && !is_ct_clst_missing + + if (!is_ct_clst_missing) { + admiraldev::assert_character_scalar(ct_clst) + } + + if (is_required_ct_clst_missing) { + rlang::abort("`ct_clst` is a required parameter.") + } + + if (is_ct_clst_without_ct_spec) { + rlang::abort("`ct_spec` must be a valid controlled terminology if `ct_clst` is supplied.") + } + + if (is_ct_clst_missing) { + return(invisible(NULL)) + } + + if (!is_ct_spec_missing && is.na(ct_clst)) { + rlang::abort("`ct_clst` can't be NA. Did you mean `NULL`?") + } + + if (are_ct_spec_ct_clst_available) { + assert_ct_spec(ct_spec, optional = FALSE) + ct_clst_possibilities <- + ct_spec |> + dplyr::pull(ct_spec_vars("ct_clst")) |> + unique() + admiraldev::assert_character_scalar(ct_clst, values = ct_clst_possibilities) + } + + return(ct_clst) +} + +#' Controlled terminology mappings +#' +#' @description +#' [ct_mappings()] takes a controlled terminology specification and returns the +#' mappings in the form of a [tibble][tibble::tibble-package] in long format, +#' i.e. the recoding of values in the `from` column to the `to` column values, +#' one mapping per row. +#' +#' The resulting mappings are unique, i.e. if `from` values are duplicated in +#' two `from` columns, the first column indicated in `from` takes precedence, +#' and only that mapping is retained in the controlled terminology map. +#' +#' @param ct_spec Controlled terminology specification as a +#' [tibble][tibble::tibble-package]. Each row is for a mapped controlled term. +#' Controlled terms are expected in the column indicated by `to_col`. +#' @param from A character vector of column names indicating the variables +#' containing values to be recoded. +#' @param to A single string indicating the column whose values are to be +#' recoded into. +#' +#' @returns A [tibble][tibble::tibble-package] with two columns, `from` and +#' `to`, indicating the mapping of values, one per row. +#' +#' @examples +#' # Read in a bundled controlled terminology spec example (ex. 01). +#' (ct_spec_01 <- read_ct_spec_example("ct-01-cm")) +#' +#' # Generate mappings from the terminology specification. +#' sdtm.oak:::ct_mappings(ct_spec = ct_spec_01) +#' +#' # Take a glimpse at those mappings where an actual recoding happens. +#' sdtm.oak:::ct_mappings(ct_spec = ct_spec_01) |> +#' dplyr::filter(from != to) |> +#' print(n = 20) +#' +#' @importFrom rlang .data +#' @keywords internal +ct_mappings <- function(ct_spec, from = ct_spec_vars("from"), to = ct_spec_vars("to")) { + assert_ct_spec(ct_spec) + cols <- c(to, from) + + ct_mappings <- + ct_spec |> + dplyr::mutate(to = !!rlang::sym(to)) |> + tidyr::pivot_longer( + cols = dplyr::all_of(cols), + values_to = "from", + names_to = "type" + ) |> + dplyr::select(c("type", "from", "to")) |> + dplyr::mutate(type = factor(.data$type, levels = cols)) |> + dplyr::arrange(.data$type) |> + dplyr::select(-"type") |> + tidyr::drop_na("from") |> + dplyr::mutate(from = str_split(.data$from)) |> + tidyr::unnest(from) |> + dplyr::filter(from != "") |> # In case the split resulted in empty strings. + dplyr::mutate(from = trimws(.data$from), to = trimws(.data$to)) |> + dplyr::distinct(.data$from, .keep_all = TRUE) + + ct_mappings +} + +#' Recode according to controlled terminology +#' +#' [ct_map()] recodes a vector following a controlled terminology. +#' +#' @param x A character vector of terms to be recoded following a controlled +#' terminology. +#' @param ct_spec A [tibble][tibble::tibble-package] providing a controlled +#' terminology specification. +#' @param ct_clst A character vector indicating a set of possible controlled +#' terminology codelists codes to be used for recoding. By default (`NULL`) +#' all codelists available in `ct_spec` are used. +#' @param from A character vector of column names indicating the variables +#' containing values to be matched against for terminology recoding. +#' @param to A single string indicating the column whose values are to be +#' recoded into. +#' +#' @returns A character vector of terminology recoded values from `x`. If no +#' match is found in the controlled terminology spec provided in `ct_spec`, then +#' `x` values are returned in uppercase. If `ct_spec` is not provided `x` is +#' returned unchanged. +#' +#' @examples +#' # A few example terms. +#' terms <- +#' c( +#' "/day", +#' "Yes", +#' "Unknown", +#' "Prior", +#' "Every 2 hours", +#' "Percentage", +#' "International Unit" +#' ) +#' +#' # Load a controlled terminology example +#' (ct_spec <- read_ct_spec_example("ct-01-cm")) +#' +#' # Use all possible matching terms in the controlled terminology. +#' ct_map(x = terms, ct_spec = ct_spec) +#' +#' # Note that if the controlled terminology mapping is restricted to a codelist +#' # code, e.g. C71113, then only `"/day"` and `"Every 2 hours"` get mapped to +#' # `"QD"` and `"Q2H"`, respectively; remaining terms won't match given the +#' # codelist code # restriction, and will be mapped to an uppercase version of +#' # the original terms. +#' ct_map(x = terms, ct_spec = ct_spec, ct_clst = "C71113") +#' +#' @importFrom rlang %||% .data +#' @export +ct_map <- + function(x, + ct_spec = NULL, + ct_clst = NULL, + from = ct_spec_vars("from"), + to = ct_spec_vars("to")) { + ct_spec %||% return(x) + assert_ct_spec(ct_spec) + + ct_clst <- ct_clst %||% unique(ct_spec[[ct_spec_vars("ct_clst")]]) + ct_spec <- dplyr::filter(ct_spec, .data[[ct_spec_vars("ct_clst")]] %in% ct_clst) + + mappings <- ct_mappings(ct_spec, from = from, to = to) + recode( + x, + from = mappings$from, + to = mappings$to, + .no_match = toupper(x) + ) + } + +#' Read in a controlled terminology +#' +#' [read_ct_spec()] imports a controlled terminology specification data set as a +#' [tibble][tibble::tibble-package]. +#' +#' @param file A path to a file containing a controlled terminology +#' specification data set. The following are expected of this file: +#' +#' - The file is expected to be a CSV file; +#' - The file is expected to contain a first row of column names; +#' - This minimal set of variables is expected: `r knitr::combine_words(ct_spec_vars())`. +#' +#' @returns A [tibble][tibble::tibble-package] with a controlled terminology +#' specification. +#' +#' @examples +#' # Get the local path to one of the controlled terminology example files. +#' path <- ct_spec_example("ct-01-cm") +#' +#' # Import it to R. +#' read_ct_spec(file = path) +#' +#' @export +read_ct_spec <- function(file = stop("`file` must be specified")) { + ct_spec <- readr::read_csv(file = file, col_types = "c") + assert_ct_spec(ct_spec) + + ct_spec +} + +#' Find the path to an example controlled terminology file +#' +#' [ct_spec_example()] resolves the local path to an example controlled +#' terminology file. +#' +#' @param example A string with either the basename, file name, or relative path +#' to a controlled terminology file bundled with `{stdm.oak}`, see examples. +#' +#' @returns The local path to an example file if `example` is supplied, or a +#' character vector of example file names. +#' +#' @examples +#' # Get the local path to controlled terminology example file 01 +#' # Using the basename only: +#' ct_spec_example("ct-01-cm") +#' +#' # Using the file name: +#' ct_spec_example("ct-01-cm.csv") +#' +#' # Using the relative path: +#' ct_spec_example("ct/ct-01-cm.csv") +#' +#' # If no example is provided it returns a vector of possible choices. +#' ct_spec_example() +#' +#' @export +ct_spec_example <- function(example) { + # If no example is requested, then return all available files. + if (missing(example)) { + ct_spec_path <- system.file("ct", package = "sdtm.oak", mustWork = TRUE) + ct_spec_files <- list.files(ct_spec_path, pattern = "*.csv") + return(ct_spec_files) + } + + # Otherwise, resolve the local path to the example requested. + admiraldev::assert_character_scalar(example, optional = TRUE) + base_name <- tools::file_path_sans_ext(basename(example)) + path <- file.path("ct", paste0(base_name, ".csv")) + local_path <- system.file(path, package = "sdtm.oak") + + if (identical(local_path, "")) { + stop( + glue::glue( + "'{example}' does not match any ct spec files. Run `ct_spec_example()` for options." + ), + call. = FALSE + ) + } else { + local_path <- + system.file(path, package = "sdtm.oak", mustWork = TRUE) + return(local_path) + } +} + +#' Read an example controlled terminology specification +#' +#' [read_ct_spec_example()] imports one of the bundled controlled terminology +#' specification data sets as a [tibble][tibble::tibble-package] into R. +#' +#' @param example The file name of a controlled terminology data set bundled +#' with `{stdm.oak}`, run `read_ct_spec_example()` for available example files. +#' +#' @returns A [tibble][tibble::tibble-package] with a controlled terminology +#' specification data set, or a character vector of example file names. +#' +#' @examples +#' # Leave the `example` parameter as missing for available example files. +#' read_ct_spec_example() +#' +#' # Read an example controlled terminology spec file. +#' read_ct_spec_example("ct-01-cm.csv") +#' +#' # You may omit the file extension. +#' read_ct_spec_example("ct-01-cm") +#' +#' @export +read_ct_spec_example <- function(example) { + if (missing(example)) { + return(ct_spec_example()) + } else { + admiraldev::assert_character_scalar(example) + } + + path <- ct_spec_example(example) + read_ct_spec(file = path) +} diff --git a/R/dtc_problems.R b/R/dtc_problems.R index c6ab3494..9c23544e 100644 --- a/R/dtc_problems.R +++ b/R/dtc_problems.R @@ -133,7 +133,7 @@ any_problems <- function(cap_matrices, .cutoff_2000 = 68L) { #' provides easy access to these parsing problems. #' #' @param x An object of class iso8601, as typically obtained from a call to -#' [create_iso8601()]. The argument can also be left empty, in that case it +#' [create_iso8601()]. The argument can also be left empty, in that case #' `problems()` will use the last returned value, making it convenient to use #' immediately after [create_iso8601()]. #' @@ -159,7 +159,7 @@ any_problems <- function(cap_matrices, .cutoff_2000 = 68L) { #' "20231225" #' ) #' -#' #' # By inspecting the problematic dates it can be understood that +#' # By inspecting the problematic dates it can be understood that #' # the `.format` parameter needs to updated to include other variations. #' iso8601_dttm <- create_iso8601(dates, .format = "y-m-d") #' problems(iso8601_dttm) diff --git a/R/hardcode.R b/R/hardcode.R new file mode 100644 index 00000000..31938689 --- /dev/null +++ b/R/hardcode.R @@ -0,0 +1,251 @@ +#' Derive an SDTM variable with a hardcoded value +#' +#' @description +#' [sdtm_hardcode()] is an internal function packing the same functionality as +#' [hardcode_no_ct()] and [hardcode_ct()] together but aimed at developers only. +#' As a user please use either [hardcode_no_ct()] or [hardcode_ct()]. +#' +#' @param raw_dat The raw dataset (dataframe); must include the +#' variables passed in `id_vars` and `raw_var`. +#' @param raw_var The raw variable: a single string indicating the name of the +#' raw variable in `raw_dat`. +#' @param tgt_var The target SDTM variable: a single string indicating the name +#' of variable to be derived. +#' @param tgt_val The target SDTM value to be hardcoded into the variable +#' indicated in `tgt_var`. +#' @param ct_spec Study controlled terminology specification: a dataframe with a +#' minimal set of columns, see [ct_spec_vars()] for details. This parameter is +#' optional, if left as `NULL` no controlled terminology recoding is applied. +#' @param ct_clst A codelist code indicating which subset of the controlled +#' terminology to apply in the derivation. This parameter is optional, if left +#' as `NULL`, all possible recodings in `ct_spec` are attempted. +#' @param tgt_dat Target dataset: a data frame to be merged against `raw_dat` by +#' the variables indicated in `id_vars`. This parameter is optional, see +#' section Value for how the output changes depending on this argument value. +#' @param id_vars Key variables to be used in the join between the raw dataset +#' (`raw_dat`) and the target data set (`raw_dat`). +#' +#' @returns The returned data set depends on the value of `tgt_dat`: +#' - If no target dataset is supplied, meaning that `tgt_dat` defaults to +#' `NULL`, then the returned data set is `raw_dat`, selected for the variables +#' indicated in `id_vars`, and a new extra column: the derived variable, as +#' indicated in `tgt_var`. +#' - If the target dataset is provided, then it is merged with the raw data set +#' `raw_dat` by the variables indicated in `id_vars`, with a new column: the +#' derived variable, as indicated in `tgt_var`. +#' +#' @importFrom rlang := +#' @keywords internal +sdtm_hardcode <- function(raw_dat, + raw_var, + tgt_var, + tgt_val, + ct_spec = NULL, + ct_clst = NULL, + tgt_dat = NULL, + id_vars = oak_id_vars()) { + admiraldev::assert_character_scalar(raw_var) + admiraldev::assert_character_scalar(tgt_var) + admiraldev::assert_character_scalar(tgt_val) + admiraldev::assert_character_vector(id_vars) + assertthat::assert_that(contains_oak_id_vars(id_vars), + msg = "`id_vars` must include the oak id vars." + ) + admiraldev::assert_data_frame(raw_dat, required_vars = rlang::syms(c(id_vars, raw_var))) + admiraldev::assert_data_frame(tgt_dat, required_vars = rlang::syms(id_vars), optional = TRUE) + assert_ct_spec(ct_spec, optional = TRUE) + assert_ct_clst(ct_spec = ct_spec, ct_clst = ct_clst, optional = TRUE) + + # Recode the hardcoded value following terminology. + tgt_val <- ct_map(tgt_val, ct_spec = ct_spec, ct_clst = ct_clst) + + # Apply derivation of the hardcoded value. + # `der_dat`: derived dataset. + der_dat <- + raw_dat |> + dplyr::select(c(id_vars, raw_var)) |> + dplyr::mutate("{tgt_var}" := recode(x = !!rlang::sym(raw_var), to = tgt_val)) |> # nolint object_name_linter() + dplyr::select(-rlang::sym(raw_var)) + + # If a target dataset is supplied, then join the so far derived dataset with + # the target dataset (`tgt_dat`), otherwise leave it be. + der_dat <- + if (!is.null(tgt_dat)) { + der_dat |> + dplyr::right_join(y = tgt_dat, by = id_vars) |> + dplyr::relocate(tgt_var, .after = dplyr::last_col()) + } else { + der_dat + } + + der_dat +} + +#' Derive an SDTM variable with a hardcoded value +#' +#' +#' @description +#' - [hardcode_no_ct()] maps a hardcoded value to a target SDTM variable that has +#' no terminology restrictions. +#' +#' - [hardcode_ct()] maps a hardcoded value to a target SDTM variable with +#' controlled terminology recoding. +#' +#' @param raw_dat The raw dataset (dataframe); must include the +#' variables passed in `id_vars` and `raw_var`. +#' @param raw_var The raw variable: a single string indicating the name of the +#' raw variable in `raw_dat`. +#' @param tgt_var The target SDTM variable: a single string indicating the name +#' of variable to be derived. +#' @param tgt_val The target SDTM value to be hardcoded into the variable +#' indicated in `tgt_var`. +#' @param ct_spec Study controlled terminology specification: a dataframe with a +#' minimal set of columns, see [ct_spec_vars()] for details. This parameter is +#' optional, if left as `NULL` no controlled terminology recoding is applied. +#' @param ct_clst A codelist code indicating which subset of the controlled +#' terminology to apply in the derivation. This parameter is optional, if left +#' as `NULL`, all possible recodings in `ct_spec` are attempted. +#' @param tgt_dat Target dataset: a data frame to be merged against `raw_dat` by +#' the variables indicated in `id_vars`. This parameter is optional, see +#' section Value for how the output changes depending on this argument value. +#' @param id_vars Key variables to be used in the join between the raw dataset +#' (`raw_dat`) and the target data set (`raw_dat`). +#' +#' @returns The returned data set depends on the value of `tgt_dat`: +#' - If no target dataset is supplied, meaning that `tgt_dat` defaults to +#' `NULL`, then the returned data set is `raw_dat`, selected for the variables +#' indicated in `id_vars`, and a new extra column: the derived variable, as +#' indicated in `tgt_var`. +#' - If the target dataset is provided, then it is merged with the raw data set +#' `raw_dat` by the variables indicated in `id_vars`, with a new column: the +#' derived variable, as indicated in `tgt_var`. +#' +#' @examples +#' md1 <- +#' tibble::tribble( +#' ~oak_id, ~raw_source, ~patient_number, ~MDRAW, +#' 1L, "MD1", 101L, "BABY ASPIRIN", +#' 2L, "MD1", 102L, "CORTISPORIN", +#' 3L, "MD1", 103L, NA_character_, +#' 4L, "MD1", 104L, "DIPHENHYDRAMINE HCL" +#' ) +#' +#' # Derive a new variable `CMCAT` by overwriting `MDRAW` with the +#' # hardcoded value "GENERAL CONCOMITANT MEDICATIONS". +#' hardcode_no_ct( +#' raw_dat = md1, +#' raw_var = "MDRAW", +#' tgt_var = "CMCAT", +#' tgt_val = "GENERAL CONCOMITANT MEDICATIONS" +#' ) +#' +#' cm_inter <- +#' tibble::tribble( +#' ~oak_id, ~raw_source, ~patient_number, ~CMTRT, ~CMINDC, +#' 1L, "MD1", 101L, "BABY ASPIRIN", NA, +#' 2L, "MD1", 102L, "CORTISPORIN", "NAUSEA", +#' 3L, "MD1", 103L, "ASPIRIN", "ANEMIA", +#' 4L, "MD1", 104L, "DIPHENHYDRAMINE HCL", "NAUSEA", +#' 5L, "MD1", 105L, "PARACETAMOL", "PYREXIA" +#' ) +#' +#' # Derive a new variable `CMCAT` by overwriting `MDRAW` with the +#' # hardcoded value "GENERAL CONCOMITANT MEDICATIONS" with a prior join to +#' # `target_dataset`. +#' hardcode_no_ct( +#' raw_dat = md1, +#' raw_var = "MDRAW", +#' tgt_var = "CMCAT", +#' tgt_val = "GENERAL CONCOMITANT MEDICATIONS", +#' tgt_dat = cm_inter +#' ) +#' +#' # Controlled terminology specification +#' (ct_spec <- read_ct_spec_example("ct-01-cm")) +#' +#' # Hardcoding of `CMCAT` with the value `"GENERAL CONCOMITANT MEDICATIONS"` +#' # involving terminology recoding. `NA` values in `MDRAW` are preserved in +#' # `CMCAT`. +#' hardcode_ct( +#' raw_dat = md1, +#' raw_var = "MDRAW", +#' tgt_var = "CMCAT", +#' tgt_val = "GENERAL CONCOMITANT MEDICATIONS", +#' ct_spec = ct_spec, +#' ct_clst = "C66729", +#' tgt_dat = cm_inter +#' ) +#' +#' @name harcode +NULL + +#' @export +#' @rdname harcode +hardcode_no_ct <- function(raw_dat, + raw_var, + tgt_var, + tgt_val, + tgt_dat = NULL, + id_vars = oak_id_vars()) { + admiraldev::assert_character_scalar(raw_var) + admiraldev::assert_character_scalar(tgt_var) + assertthat::assert_that(assertthat::is.scalar(tgt_val), + msg = "`tgt_val` must be a scalar value." + ) + admiraldev::assert_character_vector(id_vars) + assertthat::assert_that(contains_oak_id_vars(id_vars), + msg = "`id_vars` must include the oak id vars." + ) + admiraldev::assert_data_frame(raw_dat, required_vars = rlang::syms(c(id_vars, raw_var))) + admiraldev::assert_data_frame(tgt_dat, required_vars = rlang::syms(id_vars), optional = TRUE) + + sdtm_hardcode( + raw_dat = raw_dat, + raw_var = raw_var, + tgt_var = tgt_var, + tgt_val = tgt_val, + tgt_dat = tgt_dat, + id_vars = id_vars + ) +} + +#' @export +#' @rdname harcode +hardcode_ct <- + function(raw_dat, + raw_var, + tgt_var, + tgt_val, + ct_spec, + ct_clst, + tgt_dat = NULL, + id_vars = oak_id_vars()) { + admiraldev::assert_character_scalar(raw_var) + admiraldev::assert_character_scalar(tgt_var) + assertthat::assert_that(assertthat::is.scalar(tgt_val), + msg = "`tgt_val` must be a scalar value." + ) + admiraldev::assert_character_vector(id_vars) + assertthat::assert_that(contains_oak_id_vars(id_vars), + msg = "`id_vars` must include the oak id vars." + ) + admiraldev::assert_data_frame(raw_dat, required_vars = rlang::syms(c(id_vars, raw_var))) + admiraldev::assert_data_frame(tgt_dat, + required_vars = rlang::syms(id_vars), + optional = TRUE + ) + + assert_ct_spec(ct_spec, optional = FALSE) + assert_ct_clst(ct_spec = ct_spec, ct_clst = ct_clst, optional = FALSE) + + sdtm_hardcode( + raw_dat = raw_dat, + raw_var = raw_var, + tgt_var = tgt_var, + tgt_val = tgt_val, + ct_spec = ct_spec, + ct_clst = ct_clst, + tgt_dat = tgt_dat, + id_vars = id_vars + ) + } diff --git a/R/oak_id_vars.R b/R/oak_id_vars.R new file mode 100644 index 00000000..718d11da --- /dev/null +++ b/R/oak_id_vars.R @@ -0,0 +1,53 @@ +#' Raw dataset keys +#' +#' [oak_id_vars()] is a helper function providing the variable (column) names to +#' be regarded as keys in [tibbles][tibble::tibble-package] representing raw +#' datasets. By default, the set of names is +#' `r knitr::combine_words(oak_id_vars())`. Extra variable names may be +#' indicated and passed in `extra_vars` which are appended to the default names. +#' +#' @param extra_vars A character vector of extra column names to be appended to +#' the default names: `r knitr::combine_words(oak_id_vars())`. +#' +#' @returns A character vector of column names to be regarded +#' as keys in raw datasets. +#' +#' @examples +#' sdtm.oak:::oak_id_vars() +#' +#' sdtm.oak:::oak_id_vars(extra_vars = "sample_id") +#' +#' @keywords internal +oak_id_vars <- function(extra_vars = NULL) { + admiraldev::assert_character_vector(extra_vars, optional = TRUE) + unique(c("oak_id", "raw_source", "patient_number", extra_vars)) +} + +#' Does a vector contain the raw dataset key variables? +#' +#' [contains_oak_id_vars()] evaluates whether a character vector `x` contains +#' the raw dataset key variable names, i.e. the so called Oak identifier +#' variables --- these are defined by the return value of [oak_id_vars()]. +#' +#' @param x A character vector. +#' +#' @returns A logical scalar value. +#' +#' @examples +#' # `oak_id_vars()` is the function that defines what are the minimal set of +#' # oak keys. Hence, by definition, the following code should always return +#' # `TRUE`. +#' sdtm.oak:::contains_oak_id_vars(sdtm.oak:::oak_id_vars()) +#' +#' # Returns `FALSE`. +#' sdtm.oak:::contains_oak_id_vars(character()) +#' +#' # Another example that returns `FALSE` because it is missing +#' # `"patient_number"`. +#' sdtm.oak:::contains_oak_id_vars(c("oak_id", "raw_source")) +#' +#' @keywords internal +contains_oak_id_vars <- function(x) { + admiraldev::assert_character_vector(x) + all(oak_id_vars() %in% x) +} diff --git a/R/recode.R b/R/recode.R new file mode 100644 index 00000000..424096b2 --- /dev/null +++ b/R/recode.R @@ -0,0 +1,68 @@ +#' Determine Indices for Recoding +#' +#' [index_for_recode()] identifies the positions of elements in `x` that match +#' any of the values specified in the `from` vector. This function is primarily +#' used to facilitate the recoding of values by pinpointing which elements in +#' `x` correspond to the `from` values and thus need to be replaced or updated. +#' +#' @param x A vector of values in which to search for matches. +#' @param from A vector of values to match against the elements in `x`. +#' @return An integer vector of the same length as `x`, containing the indices +#' of the matched values from the `from` vector. If an element in `x` does not +#' match any value in `from`, the corresponding position in the output will be +#' `NA`. This index information is critical for subsequent recoding operations. +#' @examples +#' sdtm.oak:::index_for_recode(x = 1:5, from = c(2, 4)) +#' +#' @keywords internal +index_for_recode <- function(x, from) { + match(x, from) +} + +#' Recode values +#' +#' [recode()] recodes values in `x` by matching elements in `from` onto values +#' in `to`. +#' +#' @param x An atomic vector of values are to be recoded. +#' @param from A vector of values to be matched in `x` for recoding. +#' @param to A vector of values to be used as replacement for values in `from`. +#' @param .no_match Value to be used as replacement when cases in `from` are not +#' matched. +#' @param .na Value to be used to recode missing values. +#' +#' @returns A vector of recoded values. +#' +#' @examples +#' x <- c("male", "female", "x", NA) +#' sdtm.oak:::recode(x, +#' from = c("male", "female"), +#' to = c("M", "F") +#' ) +#' sdtm.oak:::recode( +#' x, +#' from = c("male", "female"), +#' to = c("M", "F"), +#' .no_match = "?" +#' ) +#' sdtm.oak:::recode( +#' x, +#' from = c("male", "female"), +#' to = c("M", "F"), +#' .na = "missing" +#' ) +#' +#' @keywords internal +recode <- function( + x, + from = unique(na.omit(x)), + to = from, + .no_match = x, + .na = NA) { + to <- vctrs::vec_recycle(to, length(from)) + index <- index_for_recode(x, from) + y <- ifelse(!is.na(index), to[index], .no_match) + y[is.na(x)] <- .na + + y +} diff --git a/R/sdtm.oak-package.R b/R/sdtm.oak-package.R index b1a48e6f..0ba23dc1 100644 --- a/R/sdtm.oak-package.R +++ b/R/sdtm.oak-package.R @@ -4,5 +4,6 @@ ## usethis namespace: start #' @importFrom tibble tibble #' @importFrom rlang .data +#' @importFrom stats na.omit ## usethis namespace: end NULL diff --git a/R/str_split.R b/R/str_split.R new file mode 100644 index 00000000..7b5b1125 --- /dev/null +++ b/R/str_split.R @@ -0,0 +1,13 @@ +str_split_ <- function(x, split = ";", quote = '"') { + scan( + text = x, + what = "character", + sep = split, + quote = quote, + quiet = TRUE + ) +} + +str_split <- function(x, split = ";", quote = '"') { + lapply(x, str_split_, split = split, quote = quote) +} diff --git a/R/zzz.R b/R/zzz.R new file mode 100644 index 00000000..e05837ee --- /dev/null +++ b/R/zzz.R @@ -0,0 +1,3 @@ +.onLoad <- function(libname, pkgname) { + ct_mappings <<- memoise::memoise(ct_mappings) +} diff --git a/_pkgdown.yml b/_pkgdown.yml index d0716d79..2f306131 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -6,3 +6,29 @@ template: navbar: title: "sdtm.oak" + +reference: +- title: SDTM derivation + desc: Set of functions to perform SDTM derivations + contents: + - assign + - harcode + - derive_study_day + +- title: Controlled terminology + contents: + - read_ct_spec + - read_ct_spec_example + - ct_spec_example + - ct_map + +- title: Date and time conversion + contents: + - create_iso8601 + - fmt_cmp + - dtc_formats + - problems + +- title: Package global state + contents: + - clear_cache diff --git a/inst/WORDLIST b/inst/WORDLIST index 50b2eb7e..538f7a2e 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -10,6 +10,17 @@ ORCID PMDA CDISC iso +hardcoded +CDISC +PMDA +recode +recodes +recoded +recoding +tibble +codelist +Memoised +memoised AE AESTDY CMSTDY diff --git a/inst/ct/README.md b/inst/ct/README.md new file mode 100644 index 00000000..58433341 --- /dev/null +++ b/inst/ct/README.md @@ -0,0 +1,22 @@ +# Controlled terminology examples + +## Introduction + +The folder inst/ct is meant to hold examples of controlled terminology +specifications. These example files should be plain CSV files. + +If you're adding new example files please follow the naming convention: +`ct-
-.csv`, where `
` is a simple numerical identifier +for the example, i.e. 01, 02, etc.. The `` is a short keyword +providing a simple contextual annotation for the controlled terminology. + +For example, `ct-01-cm.csv` is an example of a controlled terminology +specification data set: + +- `01`: indicates that it is the first example +- `cm`: suggests that this data set is related to the Concomitant Medication +domain. + +## Examples + +- `ct-01-cm.csv`: TBD. diff --git a/inst/ct/ct-01-cm.csv b/inst/ct/ct-01-cm.csv new file mode 100644 index 00000000..ffbe0903 --- /dev/null +++ b/inst/ct/ct-01-cm.csv @@ -0,0 +1,34 @@ +codelist_code,term_code,CodedData,term_value,collected_value,term_preferred_term,term_synonyms,raw_codelist +C71113,C25473,QD,QD,QD (Every Day),Daily,/day; Daily; Per Day,FREQ_CV1 +C71113,C64496,BID,BID,BID (Twice a Day),Twice Daily,BD; Twice per day,FREQ_CV1 +C71113,C64499,PRN,PRN,PRN (As Needed),As Needed,As needed,FREQ_CV1 +C71113,C64516,Q2H,Q2H,Q2H (Every 2 Hours),Every Two Hours,Every 2 hours,FREQ_CV1 +C71113,C64530,QID,QID,QID (4 Times a Day),Four Times Daily,4 times per day,FREQ_CV1 +C66726,C25158,CAPSULE,CAPSULE,Capsule,Capsule Dosage Form,cap,FRM_CV1 +C66726,C25394,PILL,PILL,Pill,Pill Dosage Form,NA,FRM_CV1 +C66726,C29167,LOTION,LOTION,Lotion,Lotion Dosage Form,NA,FRM_CV1 +C66726,C42887,AEROSOL,AEROSOL,Aerosol,Aerosol Dosage Form,aer,FRM_CV1 +C66726,C42944,INHALANT,INHALANT,Inhalant,Inhalant Dosage Form,NA,FRM_CV1 +C66726,C42946,INJECTION,INJECTION,Injection,Injectable Dosage Form,NA,FRM_CV1 +C66726,C42953,LIQUID,LIQUID,Liquid,Liquid Dosage Form,NA,FRM_CV1 +C66726,C42998,TABLET,TABLET,Tablet,Tablet Dosage Form,tab,FRM_CV1 +C66742,C49488,Y,Y,Yes,Yes,Yes,NY_CV1 +C66729,C28161,INTRAMUSCULAR,INTRAMUSCULAR,IM (Intramuscular),Intramuscular Route of Administration,NA,ROUTE_CV1 +C66729,C38210,EPIDURAL,EPIDURAL,EP (Epidural),Epidural Route of Administration,NA,ROUTE_CV1 +C66729,C38222,INTRA-ARTERIAL,INTRA-ARTERIAL,IA (Intra-arterial),Intraarterial Route of Administration,NA,ROUTE_CV1 +C66729,C38223,INTRA-ARTICULAR,INTRA-ARTICULAR,IJ (Intra-articular),Intraarticular Route of Administration,NA,ROUTE_CV1 +C66729,C38287,OPHTHALMIC,OPHTHALMIC,OP (Ophthalmic),Ophthalmic Route of Administration,NA,ROUTE_CV1 +C66729,C38288,ORAL,ORAL,PO (Oral),Oral Route of Administration,Intraoral Route of Administration; PO,ROUTE_CV1 +C66729,C38305,TRANSDERMAL,TRANSDERMAL,DE (Transdermal),Transdermal Route of Administration,NA,ROUTE_CV1 +C66729,C38311,UNKNOWN,UNKNOWN,Unknown,Unknown Route of Administration,NA,ROUTE_CV1 +C71620,C25613,%,%,%,Percentage,Percentage,UNIT_CV1 +C71620,C28253,MG,mg,mg,Milligram,Milligram,UNIT_CV1 +C71620,C28254,ML,mL,mL,Milliliter,cm3; Milliliter,UNIT_CV1 +C71620,C48155,G,g,g,Gram,Gram,UNIT_CV1 +C71620,C48480,CAPSULE,CAPSULE,Capsule,Capsule Dosing Unit,cap; Capsule Dosing Unit,UNIT_CV1 +C71620,C48542,TABLET,TABLET,Tablet,Tablet Dosing Unit,tab; Tablet Dosing Unit,UNIT_CV1 +C71620,C48579,IU,IU,IU,International Unit,IE; International Unit,UNIT_CV1 +C71620,C28254,ML,mL,mL,Milliliter,cm3; Milliliter,UNIT_CV5 +C66728,C25629,BEFORE,BEFORE,Prior,Prior,,NA +C66728,C53279,ONGOING,ONGOING,Continue,Continue,Continuous,NA +C66734,C49568,CM,CM,Concomitant Medication Domain,Concomitant Medication Domain,Concomitant/Prior Medications,NA diff --git a/man/assert_ct_clst.Rd b/man/assert_ct_clst.Rd new file mode 100644 index 00000000..c0239c97 --- /dev/null +++ b/man/assert_ct_clst.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ct.R +\name{assert_ct_clst} +\alias{assert_ct_clst} +\title{Assert a codelist code} +\usage{ +assert_ct_clst(ct_spec, ct_clst, optional = FALSE) +} +\arguments{ +\item{ct_spec}{Either a data frame encoding a controlled terminology data set, or +\code{NULL}.} + +\item{ct_clst}{A string with a to-be asserted codelist code, or \code{NULL}.} + +\item{optional}{A scalar logical, indicating whether \code{ct_clst} can be \code{NULL} or +not.} +} +\value{ +The function throws an error if \code{ct_clst} is not a valid codelist code +given the controlled terminology data set; otherwise, \code{ct_clst} is returned +invisibly. +} +\description{ +\code{\link[=assert_ct_clst]{assert_ct_clst()}} asserts the validity of a codelist code in the context of +a controlled terminology specification. +} +\examples{ +# Load a controlled terminology example. +(ct_spec <- read_ct_spec_example("ct-01-cm")) + +# Should work fine. +sdtm.oak:::assert_ct_clst(ct_spec = ct_spec, ct_clst = "C71113") + +# In certain cases, you might allow `ct_clst` to be `NULL` as to indicate absence, +# in that case, set `optional` to `TRUE` to make `assert_ct_clst()` more +# forgiving. +sdtm.oak:::assert_ct_clst(ct_spec = ct_spec, ct_clst = NULL, optional = TRUE) + +# Otherwise it would err. +try(sdtm.oak:::assert_ct_clst(ct_spec = ct_spec, ct_clst = NULL, optional = FALSE)) + +} +\keyword{internal} diff --git a/man/assert_ct_spec.Rd b/man/assert_ct_spec.Rd new file mode 100644 index 00000000..5ce72ce4 --- /dev/null +++ b/man/assert_ct_spec.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ct.R +\name{assert_ct_spec} +\alias{assert_ct_spec} +\title{Assert a controlled terminology specification} +\usage{ +assert_ct_spec(ct_spec, optional = FALSE) +} +\arguments{ +\item{ct_spec}{A data frame to be asserted as a valid controlled terminology data +set.} +} +\value{ +The function throws an error if \code{ct_spec} is not a valid controlled +terminology data set; otherwise, \code{ct_spec} is returned invisibly. +} +\description{ +\code{\link[=assert_ct_spec]{assert_ct_spec()}} will check whether \code{ct_spec} is a data frame and if it contains the +variables: codelist_code, collected_value, term_synonyms, and term_value. + +In addition, it will also check if the data frame is not empty (no rows), and +whether the columns \code{codelist_code} and \code{term_value} do +not contain any \code{NA} values. +} +\examples{ +# If `ct_spec` is a valid controlled terminology then it is returned invisibly. +ct_spec_01 <- read_ct_spec_example("ct-01-cm") +all.equal(ct_spec_01, sdtm.oak:::assert_ct_spec(ct_spec_01)) + +# A minimal set of variables needs to be present in `ct_spec` for it to pass the +# assertion; `sdtm.oak:::ct_spec_vars()` defines their names. +(req_vars <- sdtm.oak:::ct_spec_vars()) + +# Other (facultative) variables also present in the controlled terminology +# example. +(opt_vars <- setdiff(colnames(ct_spec_01), req_vars)) + +# With only the mandatory variables, the assertion still passes. +sdtm.oak:::assert_ct_spec(ct_spec_01[req_vars]) + +# Not having the required variables results in an error. +try(sdtm.oak:::assert_ct_spec(ct_spec_01[opt_vars])) + +} +\keyword{internal} diff --git a/man/assign.Rd b/man/assign.Rd new file mode 100644 index 00000000..ff7df056 --- /dev/null +++ b/man/assign.Rd @@ -0,0 +1,141 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/assign.R +\name{assign_no_ct} +\alias{assign_no_ct} +\alias{assign_ct} +\alias{assign} +\title{Derive an SDTM variable} +\usage{ +assign_no_ct( + raw_dat, + raw_var, + tgt_var, + tgt_dat = NULL, + id_vars = oak_id_vars() +) + +assign_ct( + raw_dat, + raw_var, + tgt_var, + ct_spec, + ct_clst, + tgt_dat = NULL, + id_vars = oak_id_vars() +) +} +\arguments{ +\item{raw_dat}{The raw dataset (dataframe); must include the +variables passed in \code{id_vars} and \code{raw_var}.} + +\item{raw_var}{The raw variable: a single string indicating the name of the +raw variable in \code{raw_dat}.} + +\item{tgt_var}{The target SDTM variable: a single string indicating the name +of variable to be derived.} + +\item{tgt_dat}{Target dataset: a data frame to be merged against \code{raw_dat} by +the variables indicated in \code{id_vars}. This parameter is optional, see +section Value for how the output changes depending on this argument value.} + +\item{id_vars}{Key variables to be used in the join between the raw dataset +(\code{raw_dat}) and the target data set (\code{raw_dat}).} + +\item{ct_spec}{Study controlled terminology specification: a dataframe with a +minimal set of columns, see \code{\link[=ct_spec_vars]{ct_spec_vars()}} for details.} + +\item{ct_clst}{A codelist code indicating which subset of the controlled +terminology to apply in the derivation.} +} +\value{ +The returned data set depends on the value of \code{tgt_dat}: +\itemize{ +\item If no target dataset is supplied, meaning that \code{tgt_dat} defaults to +\code{NULL}, then the returned data set is \code{raw_dat}, selected for the variables +indicated in \code{id_vars}, and a new extra column: the derived variable, as +indicated in \code{tgt_var}. +\item If the target dataset is provided, then it is merged with the raw data set +\code{raw_dat} by the variables indicated in \code{id_vars}, with a new column: the +derived variable, as indicated in \code{tgt_var}. +} +} +\description{ +\itemize{ +\item \code{\link[=assign_no_ct]{assign_no_ct()}} maps a variable in a raw dataset to a target SDTM +variable that has no terminology restrictions. +\item \code{\link[=assign_ct]{assign_ct()}} maps a variable in a raw dataset to a target SDTM variable +following controlled terminology recoding. +} +} +\examples{ + +md1 <- + tibble::tibble( + oak_id = 1:14, + raw_source = "MD1", + patient_number = 101:114, + MDIND = c( + "NAUSEA", "NAUSEA", "ANEMIA", "NAUSEA", "PYREXIA", + "VOMITINGS", "DIARHHEA", "COLD", + "FEVER", "LEG PAIN", "FEVER", "COLD", "COLD", "PAIN" + ) + ) + +assign_no_ct( + raw_dat = md1, + raw_var = "MDIND", + tgt_var = "CMINDC", +) + +cm_inter <- + tibble::tibble( + oak_id = 1:14, + raw_source = "MD1", + patient_number = 101:114, + CMTRT = c( + "BABY ASPIRIN", + "CORTISPORIN", + "ASPIRIN", + "DIPHENHYDRAMINE HCL", + "PARCETEMOL", + "VOMIKIND", + "ZENFLOX OZ", + "AMITRYPTYLINE", + "BENADRYL", + "DIPHENHYDRAMINE HYDROCHLORIDE", + "TETRACYCLINE", + "BENADRYL", + "SOMINEX", + "ZQUILL" + ), + CMROUTE = c( + "ORAL", + "ORAL", + NA, + "ORAL", + "ORAL", + "ORAL", + "INTRAMUSCULAR", + "INTRA-ARTERIAL", + NA, + "NON-STANDARD", + "RANDOM_VALUE", + "INTRA-ARTICULAR", + "TRANSDERMAL", + "OPHTHALMIC" + ) + ) + +# Controlled terminology specification +(ct_spec <- read_ct_spec_example("ct-01-cm")) + +assign_ct( + raw_dat = md1, + raw_var = "MDIND", + tgt_var = "CMINDC", + ct_spec = ct_spec, + ct_clst = "C66729", + tgt_dat = cm_inter +) + +} diff --git a/man/clear_cache.Rd b/man/clear_cache.Rd new file mode 100644 index 00000000..212f245e --- /dev/null +++ b/man/clear_cache.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clear_cache.R +\name{clear_cache} +\alias{clear_cache} +\title{Clear \code{{sdtm.oak}} cache of memoised functions} +\usage{ +clear_cache() +} +\value{ +Returns a logical value, indicating whether the resetting of the +cache was successful (\code{TRUE}) or not (\code{FALSE}). +} +\description{ +Some of \code{{sdtm.oak}} functions have their results cached for runtime +efficiency. Use this function to reset the cache. + +Memoised functions: +\itemize{ +\item \code{\link[=ct_mappings]{ct_mappings()}} +} +} +\examples{ +clear_cache() + +} diff --git a/man/contains_oak_id_vars.Rd b/man/contains_oak_id_vars.Rd new file mode 100644 index 00000000..c872bbbe --- /dev/null +++ b/man/contains_oak_id_vars.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/oak_id_vars.R +\name{contains_oak_id_vars} +\alias{contains_oak_id_vars} +\title{Does a vector contain the raw dataset key variables?} +\usage{ +contains_oak_id_vars(x) +} +\arguments{ +\item{x}{A character vector.} +} +\value{ +A logical scalar value. +} +\description{ +\code{\link[=contains_oak_id_vars]{contains_oak_id_vars()}} evaluates whether a character vector \code{x} contains +the raw dataset key variable names, i.e. the so called Oak identifier +variables --- these are defined by the return value of \code{\link[=oak_id_vars]{oak_id_vars()}}. +} +\examples{ +# `oak_id_vars()` is the function that defines what are the minimal set of +# oak keys. Hence, by definition, the following code should always return +# `TRUE`. +sdtm.oak:::contains_oak_id_vars(sdtm.oak:::oak_id_vars()) + +# Returns `FALSE`. +sdtm.oak:::contains_oak_id_vars(character()) + +# Another example that returns `FALSE` because it is missing +# `"patient_number"`. +sdtm.oak:::contains_oak_id_vars(c("oak_id", "raw_source")) + +} +\keyword{internal} diff --git a/man/ct_map.Rd b/man/ct_map.Rd new file mode 100644 index 00000000..0f00c10a --- /dev/null +++ b/man/ct_map.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ct.R +\name{ct_map} +\alias{ct_map} +\title{Recode according to controlled terminology} +\usage{ +ct_map( + x, + ct_spec = NULL, + ct_clst = NULL, + from = ct_spec_vars("from"), + to = ct_spec_vars("to") +) +} +\arguments{ +\item{x}{A character vector of terms to be recoded following a controlled +terminology.} + +\item{ct_spec}{A \link[tibble:tibble-package]{tibble} providing a controlled +terminology specification.} + +\item{ct_clst}{A character vector indicating a set of possible controlled +terminology codelists codes to be used for recoding. By default (\code{NULL}) +all codelists available in \code{ct_spec} are used.} + +\item{from}{A character vector of column names indicating the variables +containing values to be matched against for terminology recoding.} + +\item{to}{A single string indicating the column whose values are to be +recoded into.} +} +\value{ +A character vector of terminology recoded values from \code{x}. If no +match is found in the controlled terminology spec provided in \code{ct_spec}, then +\code{x} values are returned in uppercase. If \code{ct_spec} is not provided \code{x} is +returned unchanged. +} +\description{ +\code{\link[=ct_map]{ct_map()}} recodes a vector following a controlled terminology. +} +\examples{ +# A few example terms. +terms <- + c( + "/day", + "Yes", + "Unknown", + "Prior", + "Every 2 hours", + "Percentage", + "International Unit" + ) + +# Load a controlled terminology example +(ct_spec <- read_ct_spec_example("ct-01-cm")) + +# Use all possible matching terms in the controlled terminology. +ct_map(x = terms, ct_spec = ct_spec) + +# Note that if the controlled terminology mapping is restricted to a codelist +# code, e.g. C71113, then only `"/day"` and `"Every 2 hours"` get mapped to +# `"QD"` and `"Q2H"`, respectively; remaining terms won't match given the +# codelist code # restriction, and will be mapped to an uppercase version of +# the original terms. +ct_map(x = terms, ct_spec = ct_spec, ct_clst = "C71113") + +} diff --git a/man/ct_mappings.Rd b/man/ct_mappings.Rd new file mode 100644 index 00000000..83ca4898 --- /dev/null +++ b/man/ct_mappings.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ct.R +\name{ct_mappings} +\alias{ct_mappings} +\title{Controlled terminology mappings} +\usage{ +ct_mappings(ct_spec, from = ct_spec_vars("from"), to = ct_spec_vars("to")) +} +\arguments{ +\item{ct_spec}{Controlled terminology specification as a +\link[tibble:tibble-package]{tibble}. Each row is for a mapped controlled term. +Controlled terms are expected in the column indicated by \code{to_col}.} + +\item{from}{A character vector of column names indicating the variables +containing values to be recoded.} + +\item{to}{A single string indicating the column whose values are to be +recoded into.} +} +\value{ +A \link[tibble:tibble-package]{tibble} with two columns, \code{from} and +\code{to}, indicating the mapping of values, one per row. +} +\description{ +\code{\link[=ct_mappings]{ct_mappings()}} takes a controlled terminology specification and returns the +mappings in the form of a \link[tibble:tibble-package]{tibble} in long format, +i.e. the recoding of values in the \code{from} column to the \code{to} column values, +one mapping per row. + +The resulting mappings are unique, i.e. if \code{from} values are duplicated in +two \code{from} columns, the first column indicated in \code{from} takes precedence, +and only that mapping is retained in the controlled terminology map. +} +\examples{ +# Read in a bundled controlled terminology spec example (ex. 01). +(ct_spec_01 <- read_ct_spec_example("ct-01-cm")) + +# Generate mappings from the terminology specification. +sdtm.oak:::ct_mappings(ct_spec = ct_spec_01) + +# Take a glimpse at those mappings where an actual recoding happens. +sdtm.oak:::ct_mappings(ct_spec = ct_spec_01) |> + dplyr::filter(from != to) |> + print(n = 20) + +} +\keyword{internal} diff --git a/man/ct_spec_example.Rd b/man/ct_spec_example.Rd new file mode 100644 index 00000000..2a7f2e3f --- /dev/null +++ b/man/ct_spec_example.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ct.R +\name{ct_spec_example} +\alias{ct_spec_example} +\title{Find the path to an example controlled terminology file} +\usage{ +ct_spec_example(example) +} +\arguments{ +\item{example}{A string with either the basename, file name, or relative path +to a controlled terminology file bundled with \code{{stdm.oak}}, see examples.} +} +\value{ +The local path to an example file if \code{example} is supplied, or a +character vector of example file names. +} +\description{ +\code{\link[=ct_spec_example]{ct_spec_example()}} resolves the local path to an example controlled +terminology file. +} +\examples{ +# Get the local path to controlled terminology example file 01 +# Using the basename only: +ct_spec_example("ct-01-cm") + +# Using the file name: +ct_spec_example("ct-01-cm.csv") + +# Using the relative path: +ct_spec_example("ct/ct-01-cm.csv") + +# If no example is provided it returns a vector of possible choices. +ct_spec_example() + +} diff --git a/man/ct_spec_vars.Rd b/man/ct_spec_vars.Rd new file mode 100644 index 00000000..fff7e108 --- /dev/null +++ b/man/ct_spec_vars.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ct.R +\name{ct_spec_vars} +\alias{ct_spec_vars} +\title{Controlled terminology variables} +\usage{ +ct_spec_vars(set = c("all", "ct_clst", "from", "to")) +} +\arguments{ +\item{set}{A scalar character (string), one of: \code{"all"} (default), \code{"ct_clst"}, +\code{"from"} or \code{"to"}.} +} +\description{ +\code{\link[=ct_spec_vars]{ct_spec_vars()}} returns the mandatory variables to be present in a data set +representing a controlled terminology. By default, it returns all required +variables. + +If only the subset of variables used for matching terms are needed, then +request this subset of variables by passing the argument value \code{"from"}. If +only the mapping-to variable is to be requested, then simply pass \code{"to"}. If +only the codelist code variable name is needed then pass \code{"ct_clst"}. +} +\examples{ +# These two calls are equivalent and return all required variables in a +# controlled terminology data set. +sdtm.oak:::ct_spec_vars() +sdtm.oak:::ct_spec_vars("all") + +# "Codelist code" variable name. +sdtm.oak:::ct_spec_vars("ct_clst") + +# "From" variables +sdtm.oak:::ct_spec_vars("from") + +# The "to" variable. +sdtm.oak:::ct_spec_vars("to") + +} +\keyword{internal} diff --git a/man/harcode.Rd b/man/harcode.Rd new file mode 100644 index 00000000..e38424a5 --- /dev/null +++ b/man/harcode.Rd @@ -0,0 +1,133 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hardcode.R +\name{harcode} +\alias{harcode} +\alias{hardcode_no_ct} +\alias{hardcode_ct} +\title{Derive an SDTM variable with a hardcoded value} +\usage{ +hardcode_no_ct( + raw_dat, + raw_var, + tgt_var, + tgt_val, + tgt_dat = NULL, + id_vars = oak_id_vars() +) + +hardcode_ct( + raw_dat, + raw_var, + tgt_var, + tgt_val, + ct_spec, + ct_clst, + tgt_dat = NULL, + id_vars = oak_id_vars() +) +} +\arguments{ +\item{raw_dat}{The raw dataset (dataframe); must include the +variables passed in \code{id_vars} and \code{raw_var}.} + +\item{raw_var}{The raw variable: a single string indicating the name of the +raw variable in \code{raw_dat}.} + +\item{tgt_var}{The target SDTM variable: a single string indicating the name +of variable to be derived.} + +\item{tgt_val}{The target SDTM value to be hardcoded into the variable +indicated in \code{tgt_var}.} + +\item{tgt_dat}{Target dataset: a data frame to be merged against \code{raw_dat} by +the variables indicated in \code{id_vars}. This parameter is optional, see +section Value for how the output changes depending on this argument value.} + +\item{id_vars}{Key variables to be used in the join between the raw dataset +(\code{raw_dat}) and the target data set (\code{raw_dat}).} + +\item{ct_spec}{Study controlled terminology specification: a dataframe with a +minimal set of columns, see \code{\link[=ct_spec_vars]{ct_spec_vars()}} for details. This parameter is +optional, if left as \code{NULL} no controlled terminology recoding is applied.} + +\item{ct_clst}{A codelist code indicating which subset of the controlled +terminology to apply in the derivation. This parameter is optional, if left +as \code{NULL}, all possible recodings in \code{ct_spec} are attempted.} +} +\value{ +The returned data set depends on the value of \code{tgt_dat}: +\itemize{ +\item If no target dataset is supplied, meaning that \code{tgt_dat} defaults to +\code{NULL}, then the returned data set is \code{raw_dat}, selected for the variables +indicated in \code{id_vars}, and a new extra column: the derived variable, as +indicated in \code{tgt_var}. +\item If the target dataset is provided, then it is merged with the raw data set +\code{raw_dat} by the variables indicated in \code{id_vars}, with a new column: the +derived variable, as indicated in \code{tgt_var}. +} +} +\description{ +\itemize{ +\item \code{\link[=hardcode_no_ct]{hardcode_no_ct()}} maps a hardcoded value to a target SDTM variable that has +no terminology restrictions. +\item \code{\link[=hardcode_ct]{hardcode_ct()}} maps a hardcoded value to a target SDTM variable with +controlled terminology recoding. +} +} +\examples{ +md1 <- + tibble::tribble( + ~oak_id, ~raw_source, ~patient_number, ~MDRAW, + 1L, "MD1", 101L, "BABY ASPIRIN", + 2L, "MD1", 102L, "CORTISPORIN", + 3L, "MD1", 103L, NA_character_, + 4L, "MD1", 104L, "DIPHENHYDRAMINE HCL" + ) + +# Derive a new variable `CMCAT` by overwriting `MDRAW` with the +# hardcoded value "GENERAL CONCOMITANT MEDICATIONS". +hardcode_no_ct( + raw_dat = md1, + raw_var = "MDRAW", + tgt_var = "CMCAT", + tgt_val = "GENERAL CONCOMITANT MEDICATIONS" +) + +cm_inter <- + tibble::tribble( + ~oak_id, ~raw_source, ~patient_number, ~CMTRT, ~CMINDC, + 1L, "MD1", 101L, "BABY ASPIRIN", NA, + 2L, "MD1", 102L, "CORTISPORIN", "NAUSEA", + 3L, "MD1", 103L, "ASPIRIN", "ANEMIA", + 4L, "MD1", 104L, "DIPHENHYDRAMINE HCL", "NAUSEA", + 5L, "MD1", 105L, "PARACETAMOL", "PYREXIA" + ) + +# Derive a new variable `CMCAT` by overwriting `MDRAW` with the +# hardcoded value "GENERAL CONCOMITANT MEDICATIONS" with a prior join to +# `target_dataset`. +hardcode_no_ct( + raw_dat = md1, + raw_var = "MDRAW", + tgt_var = "CMCAT", + tgt_val = "GENERAL CONCOMITANT MEDICATIONS", + tgt_dat = cm_inter +) + +# Controlled terminology specification +(ct_spec <- read_ct_spec_example("ct-01-cm")) + +# Hardcoding of `CMCAT` with the value `"GENERAL CONCOMITANT MEDICATIONS"` +# involving terminology recoding. `NA` values in `MDRAW` are preserved in +# `CMCAT`. +hardcode_ct( + raw_dat = md1, + raw_var = "MDRAW", + tgt_var = "CMCAT", + tgt_val = "GENERAL CONCOMITANT MEDICATIONS", + ct_spec = ct_spec, + ct_clst = "C66729", + tgt_dat = cm_inter +) + +} diff --git a/man/index_for_recode.Rd b/man/index_for_recode.Rd new file mode 100644 index 00000000..2362517f --- /dev/null +++ b/man/index_for_recode.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recode.R +\name{index_for_recode} +\alias{index_for_recode} +\title{Determine Indices for Recoding} +\usage{ +index_for_recode(x, from) +} +\arguments{ +\item{x}{A vector of values in which to search for matches.} + +\item{from}{A vector of values to match against the elements in \code{x}.} +} +\value{ +An integer vector of the same length as \code{x}, containing the indices +of the matched values from the \code{from} vector. If an element in \code{x} does not +match any value in \code{from}, the corresponding position in the output will be +\code{NA}. This index information is critical for subsequent recoding operations. +} +\description{ +\code{\link[=index_for_recode]{index_for_recode()}} identifies the positions of elements in \code{x} that match +any of the values specified in the \code{from} vector. This function is primarily +used to facilitate the recoding of values by pinpointing which elements in +\code{x} correspond to the \code{from} values and thus need to be replaced or updated. +} +\examples{ +sdtm.oak:::index_for_recode(x = 1:5, from = c(2, 4)) + +} +\keyword{internal} diff --git a/man/oak_id_vars.Rd b/man/oak_id_vars.Rd new file mode 100644 index 00000000..af5550a3 --- /dev/null +++ b/man/oak_id_vars.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/oak_id_vars.R +\name{oak_id_vars} +\alias{oak_id_vars} +\title{Raw dataset keys} +\usage{ +oak_id_vars(extra_vars = NULL) +} +\arguments{ +\item{extra_vars}{A character vector of extra column names to be appended to +the default names: oak_id, raw_source, and patient_number.} +} +\value{ +A character vector of column names to be regarded +as keys in raw datasets. +} +\description{ +\code{\link[=oak_id_vars]{oak_id_vars()}} is a helper function providing the variable (column) names to +be regarded as keys in \link[tibble:tibble-package]{tibbles} representing raw +datasets. By default, the set of names is +oak_id, raw_source, and patient_number. Extra variable names may be +indicated and passed in \code{extra_vars} which are appended to the default names. +} +\examples{ +sdtm.oak:::oak_id_vars() + +sdtm.oak:::oak_id_vars(extra_vars = "sample_id") + +} +\keyword{internal} diff --git a/man/problems.Rd b/man/problems.Rd index ce68ad46..8a5c23fe 100644 --- a/man/problems.Rd +++ b/man/problems.Rd @@ -8,7 +8,7 @@ problems(x = .Last.value) } \arguments{ \item{x}{An object of class iso8601, as typically obtained from a call to -\code{\link[=create_iso8601]{create_iso8601()}}. The argument can also be left empty, in that case it +\code{\link[=create_iso8601]{create_iso8601()}}. The argument can also be left empty, in that case \code{problems()} will use the last returned value, making it convenient to use immediately after \code{\link[=create_iso8601]{create_iso8601()}}.} } @@ -42,7 +42,7 @@ dates <- "20231225" ) -#' # By inspecting the problematic dates it can be understood that +# By inspecting the problematic dates it can be understood that # the `.format` parameter needs to updated to include other variations. iso8601_dttm <- create_iso8601(dates, .format = "y-m-d") problems(iso8601_dttm) diff --git a/man/read_ct_spec.Rd b/man/read_ct_spec.Rd new file mode 100644 index 00000000..21d6103d --- /dev/null +++ b/man/read_ct_spec.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ct.R +\name{read_ct_spec} +\alias{read_ct_spec} +\title{Read in a controlled terminology} +\usage{ +read_ct_spec(file = stop("`file` must be specified")) +} +\arguments{ +\item{file}{A path to a file containing a controlled terminology +specification data set. The following are expected of this file: +\itemize{ +\item The file is expected to be a CSV file; +\item The file is expected to contain a first row of column names; +\item This minimal set of variables is expected: codelist_code, collected_value, term_synonyms, and term_value. +}} +} +\value{ +A \link[tibble:tibble-package]{tibble} with a controlled terminology +specification. +} +\description{ +\code{\link[=read_ct_spec]{read_ct_spec()}} imports a controlled terminology specification data set as a +\link[tibble:tibble-package]{tibble}. +} +\examples{ +# Get the local path to one of the controlled terminology example files. +path <- ct_spec_example("ct-01-cm") + +# Import it to R. +read_ct_spec(file = path) + +} diff --git a/man/read_ct_spec_example.Rd b/man/read_ct_spec_example.Rd new file mode 100644 index 00000000..b3b26c08 --- /dev/null +++ b/man/read_ct_spec_example.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ct.R +\name{read_ct_spec_example} +\alias{read_ct_spec_example} +\title{Read an example controlled terminology specification} +\usage{ +read_ct_spec_example(example) +} +\arguments{ +\item{example}{The file name of a controlled terminology data set bundled +with \code{{stdm.oak}}, run \code{read_ct_spec_example()} for available example files.} +} +\value{ +A \link[tibble:tibble-package]{tibble} with a controlled terminology +specification data set, or a character vector of example file names. +} +\description{ +\code{\link[=read_ct_spec_example]{read_ct_spec_example()}} imports one of the bundled controlled terminology +specification data sets as a \link[tibble:tibble-package]{tibble} into R. +} +\examples{ +# Leave the `example` parameter as missing for available example files. +read_ct_spec_example() + +# Read an example controlled terminology spec file. +read_ct_spec_example("ct-01-cm.csv") + +# You may omit the file extension. +read_ct_spec_example("ct-01-cm") + +} diff --git a/man/recode.Rd b/man/recode.Rd new file mode 100644 index 00000000..aca082ac --- /dev/null +++ b/man/recode.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/recode.R +\name{recode} +\alias{recode} +\title{Recode values} +\usage{ +recode(x, from = unique(na.omit(x)), to = from, .no_match = x, .na = NA) +} +\arguments{ +\item{x}{An atomic vector of values are to be recoded.} + +\item{from}{A vector of values to be matched in \code{x} for recoding.} + +\item{to}{A vector of values to be used as replacement for values in \code{from}.} + +\item{.no_match}{Value to be used as replacement when cases in \code{from} are not +matched.} + +\item{.na}{Value to be used to recode missing values.} +} +\value{ +A vector of recoded values. +} +\description{ +\code{\link[=recode]{recode()}} recodes values in \code{x} by matching elements in \code{from} onto values +in \code{to}. +} +\examples{ +x <- c("male", "female", "x", NA) +sdtm.oak:::recode(x, + from = c("male", "female"), + to = c("M", "F") +) +sdtm.oak:::recode( + x, + from = c("male", "female"), + to = c("M", "F"), + .no_match = "?" +) +sdtm.oak:::recode( + x, + from = c("male", "female"), + to = c("M", "F"), + .na = "missing" +) + +} +\keyword{internal} diff --git a/man/sdtm_assign.Rd b/man/sdtm_assign.Rd new file mode 100644 index 00000000..676979dc --- /dev/null +++ b/man/sdtm_assign.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/assign.R +\name{sdtm_assign} +\alias{sdtm_assign} +\title{Derive an SDTM variable} +\usage{ +sdtm_assign( + raw_dat, + raw_var, + tgt_var, + ct_spec = NULL, + ct_clst = NULL, + tgt_dat = NULL, + id_vars = oak_id_vars() +) +} +\arguments{ +\item{raw_dat}{The raw dataset (dataframe); must include the +variables passed in \code{id_vars} and \code{raw_var}.} + +\item{raw_var}{The raw variable: a single string indicating the name of the +raw variable in \code{raw_dat}.} + +\item{tgt_var}{The target SDTM variable: a single string indicating the name +of variable to be derived.} + +\item{ct_spec}{Study controlled terminology specification: a dataframe with a +minimal set of columns, see \code{\link[=ct_spec_vars]{ct_spec_vars()}} for details. This parameter is +optional, if left as \code{NULL} no controlled terminology recoding is applied.} + +\item{ct_clst}{A codelist code indicating which subset of the controlled +terminology to apply in the derivation. This parameter is optional, if left +as \code{NULL}, all possible recodings in \code{ct_spec} are attempted.} + +\item{tgt_dat}{Target dataset: a data frame to be merged against \code{raw_dat} by +the variables indicated in \code{id_vars}. This parameter is optional, see +section Value for how the output changes depending on this argument value.} + +\item{id_vars}{Key variables to be used in the join between the raw dataset +(\code{raw_dat}) and the target data set (\code{raw_dat}).} +} +\value{ +The returned data set depends on the value of \code{tgt_dat}: +\itemize{ +\item If no target dataset is supplied, meaning that \code{tgt_dat} defaults to +\code{NULL}, then the returned data set is \code{raw_dat}, selected for the variables +indicated in \code{id_vars}, and a new extra column: the derived variable, as +indicated in \code{tgt_var}. +\item If the target dataset is provided, then it is merged with the raw data set +\code{raw_dat} by the variables indicated in \code{id_vars}, with a new column: the +derived variable, as indicated in \code{tgt_var}. +} +} +\description{ +\code{\link[=sdtm_assign]{sdtm_assign()}} is an internal function packing the same functionality as +\code{\link[=assign_no_ct]{assign_no_ct()}} and \code{\link[=assign_ct]{assign_ct()}} together but aimed at developers only. +As a user please use either \code{\link[=assign_no_ct]{assign_no_ct()}} or \code{\link[=assign_ct]{assign_ct()}}. +} +\keyword{internal} diff --git a/man/sdtm_hardcode.Rd b/man/sdtm_hardcode.Rd new file mode 100644 index 00000000..5c3435b5 --- /dev/null +++ b/man/sdtm_hardcode.Rd @@ -0,0 +1,63 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hardcode.R +\name{sdtm_hardcode} +\alias{sdtm_hardcode} +\title{Derive an SDTM variable with a hardcoded value} +\usage{ +sdtm_hardcode( + raw_dat, + raw_var, + tgt_var, + tgt_val, + ct_spec = NULL, + ct_clst = NULL, + tgt_dat = NULL, + id_vars = oak_id_vars() +) +} +\arguments{ +\item{raw_dat}{The raw dataset (dataframe); must include the +variables passed in \code{id_vars} and \code{raw_var}.} + +\item{raw_var}{The raw variable: a single string indicating the name of the +raw variable in \code{raw_dat}.} + +\item{tgt_var}{The target SDTM variable: a single string indicating the name +of variable to be derived.} + +\item{tgt_val}{The target SDTM value to be hardcoded into the variable +indicated in \code{tgt_var}.} + +\item{ct_spec}{Study controlled terminology specification: a dataframe with a +minimal set of columns, see \code{\link[=ct_spec_vars]{ct_spec_vars()}} for details. This parameter is +optional, if left as \code{NULL} no controlled terminology recoding is applied.} + +\item{ct_clst}{A codelist code indicating which subset of the controlled +terminology to apply in the derivation. This parameter is optional, if left +as \code{NULL}, all possible recodings in \code{ct_spec} are attempted.} + +\item{tgt_dat}{Target dataset: a data frame to be merged against \code{raw_dat} by +the variables indicated in \code{id_vars}. This parameter is optional, see +section Value for how the output changes depending on this argument value.} + +\item{id_vars}{Key variables to be used in the join between the raw dataset +(\code{raw_dat}) and the target data set (\code{raw_dat}).} +} +\value{ +The returned data set depends on the value of \code{tgt_dat}: +\itemize{ +\item If no target dataset is supplied, meaning that \code{tgt_dat} defaults to +\code{NULL}, then the returned data set is \code{raw_dat}, selected for the variables +indicated in \code{id_vars}, and a new extra column: the derived variable, as +indicated in \code{tgt_var}. +\item If the target dataset is provided, then it is merged with the raw data set +\code{raw_dat} by the variables indicated in \code{id_vars}, with a new column: the +derived variable, as indicated in \code{tgt_var}. +} +} +\description{ +\code{\link[=sdtm_hardcode]{sdtm_hardcode()}} is an internal function packing the same functionality as +\code{\link[=hardcode_no_ct]{hardcode_no_ct()}} and \code{\link[=hardcode_ct]{hardcode_ct()}} together but aimed at developers only. +As a user please use either \code{\link[=hardcode_no_ct]{hardcode_no_ct()}} or \code{\link[=hardcode_ct]{hardcode_ct()}}. +} +\keyword{internal} diff --git a/renv.lock b/renv.lock index 329ec463..ebc34e12 100644 --- a/renv.lock +++ b/renv.lock @@ -128,6 +128,30 @@ ], "Hash": "543776ae6848fde2f48ff3816d0628bc" }, + "bit": { + "Package": "bit", + "Version": "4.0.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "d242abec29412ce988848d0294b208fd" + }, + "bit64": { + "Package": "bit64", + "Version": "4.0.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "bit", + "methods", + "stats", + "utils" + ], + "Hash": "9fe98599ca456d6552421db0d6772d8f" + }, "brew": { "Package": "brew", "Version": "1.0-8", @@ -911,6 +935,19 @@ ], "Hash": "e9d21e79848e02e524bea6f5bd53e7e4" }, + "progress": { + "Package": "progress", + "Version": "1.2.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R6", + "crayon", + "hms", + "prettyunits" + ], + "Hash": "14dc9f7a3c91ebb14ec5bb9208a07061" + }, "promises": { "Package": "promises", "Version": "1.2.0.1", @@ -995,6 +1032,29 @@ ], "Hash": "8f25ebe2ec38b1f2aef3b0d2ef76f6c4" }, + "readr": { + "Package": "readr", + "Version": "2.1.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "cli", + "clipr", + "cpp11", + "crayon", + "hms", + "lifecycle", + "methods", + "rlang", + "tibble", + "tzdb", + "utils", + "vroom" + ], + "Hash": "b5047343b3825f37ad9d3b5d89aa1078" + }, "rematch2": { "Package": "rematch2", "Version": "2.1.2", @@ -1007,7 +1067,7 @@ }, "remotes": { "Package": "remotes", - "Version": "2.4.2.1", + "Version": "2.5.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1017,7 +1077,7 @@ "tools", "utils" ], - "Hash": "63d15047eb239f95160112bcadc4fcb9" + "Hash": "3ee025083e66f18db6cf27b56e23e141" }, "renv": { "Package": "renv", @@ -1424,6 +1484,17 @@ ], "Hash": "e4e357f28c2edff493936b6cb30c3d65" }, + "tzdb": { + "Package": "tzdb", + "Version": "0.3.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cpp11" + ], + "Hash": "b2e1cbce7c903eaf23ec05c58e59fb5e" + }, "urlchecker": { "Package": "urlchecker", "Version": "1.0.1", @@ -1493,6 +1564,32 @@ ], "Hash": "a745bda7aff4734c17294bb41d4e4607" }, + "vroom": { + "Package": "vroom", + "Version": "1.6.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "bit64", + "cli", + "cpp11", + "crayon", + "glue", + "hms", + "lifecycle", + "methods", + "progress", + "rlang", + "stats", + "tibble", + "tidyselect", + "tzdb", + "vctrs", + "withr" + ], + "Hash": "7015a74373b83ffaef64023f4a0f5033" + }, "waldo": { "Package": "waldo", "Version": "0.4.0", diff --git a/renv/profiles/4.2/renv.lock b/renv/profiles/4.2/renv.lock index 494312ef..885895ef 100644 --- a/renv/profiles/4.2/renv.lock +++ b/renv/profiles/4.2/renv.lock @@ -128,6 +128,30 @@ ], "Hash": "543776ae6848fde2f48ff3816d0628bc" }, + "bit": { + "Package": "bit", + "Version": "4.0.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "d242abec29412ce988848d0294b208fd" + }, + "bit64": { + "Package": "bit64", + "Version": "4.0.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "bit", + "methods", + "stats", + "utils" + ], + "Hash": "9fe98599ca456d6552421db0d6772d8f" + }, "brew": { "Package": "brew", "Version": "1.0-8", @@ -912,6 +936,19 @@ ], "Hash": "e9d21e79848e02e524bea6f5bd53e7e4" }, + "progress": { + "Package": "progress", + "Version": "1.2.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R6", + "crayon", + "hms", + "prettyunits" + ], + "Hash": "14dc9f7a3c91ebb14ec5bb9208a07061" + }, "promises": { "Package": "promises", "Version": "1.2.0.1", @@ -996,6 +1033,29 @@ ], "Hash": "8f25ebe2ec38b1f2aef3b0d2ef76f6c4" }, + "readr": { + "Package": "readr", + "Version": "2.1.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "cli", + "clipr", + "cpp11", + "crayon", + "hms", + "lifecycle", + "methods", + "rlang", + "tibble", + "tzdb", + "utils", + "vroom" + ], + "Hash": "b5047343b3825f37ad9d3b5d89aa1078" + }, "rematch2": { "Package": "rematch2", "Version": "2.1.2", @@ -1008,7 +1068,7 @@ }, "remotes": { "Package": "remotes", - "Version": "2.4.2.1", + "Version": "2.5.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1018,7 +1078,7 @@ "tools", "utils" ], - "Hash": "63d15047eb239f95160112bcadc4fcb9" + "Hash": "3ee025083e66f18db6cf27b56e23e141" }, "renv": { "Package": "renv", @@ -1424,6 +1484,17 @@ ], "Hash": "c0f007e2eeed7722ce13d42b84a22e07" }, + "tzdb": { + "Package": "tzdb", + "Version": "0.3.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cpp11" + ], + "Hash": "b2e1cbce7c903eaf23ec05c58e59fb5e" + }, "urlchecker": { "Package": "urlchecker", "Version": "1.0.1", @@ -1493,6 +1564,32 @@ ], "Hash": "e4ffa94ceed5f124d429a5a5f0f5b378" }, + "vroom": { + "Package": "vroom", + "Version": "1.6.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "bit64", + "cli", + "cpp11", + "crayon", + "glue", + "hms", + "lifecycle", + "methods", + "progress", + "rlang", + "stats", + "tibble", + "tidyselect", + "tzdb", + "vctrs", + "withr" + ], + "Hash": "7015a74373b83ffaef64023f4a0f5033" + }, "waldo": { "Package": "waldo", "Version": "0.4.0", diff --git a/renv/profiles/4.3/renv.lock b/renv/profiles/4.3/renv.lock index 329ec463..ebc34e12 100644 --- a/renv/profiles/4.3/renv.lock +++ b/renv/profiles/4.3/renv.lock @@ -128,6 +128,30 @@ ], "Hash": "543776ae6848fde2f48ff3816d0628bc" }, + "bit": { + "Package": "bit", + "Version": "4.0.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "d242abec29412ce988848d0294b208fd" + }, + "bit64": { + "Package": "bit64", + "Version": "4.0.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "bit", + "methods", + "stats", + "utils" + ], + "Hash": "9fe98599ca456d6552421db0d6772d8f" + }, "brew": { "Package": "brew", "Version": "1.0-8", @@ -911,6 +935,19 @@ ], "Hash": "e9d21e79848e02e524bea6f5bd53e7e4" }, + "progress": { + "Package": "progress", + "Version": "1.2.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R6", + "crayon", + "hms", + "prettyunits" + ], + "Hash": "14dc9f7a3c91ebb14ec5bb9208a07061" + }, "promises": { "Package": "promises", "Version": "1.2.0.1", @@ -995,6 +1032,29 @@ ], "Hash": "8f25ebe2ec38b1f2aef3b0d2ef76f6c4" }, + "readr": { + "Package": "readr", + "Version": "2.1.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "cli", + "clipr", + "cpp11", + "crayon", + "hms", + "lifecycle", + "methods", + "rlang", + "tibble", + "tzdb", + "utils", + "vroom" + ], + "Hash": "b5047343b3825f37ad9d3b5d89aa1078" + }, "rematch2": { "Package": "rematch2", "Version": "2.1.2", @@ -1007,7 +1067,7 @@ }, "remotes": { "Package": "remotes", - "Version": "2.4.2.1", + "Version": "2.5.0", "Source": "Repository", "Repository": "RSPM", "Requirements": [ @@ -1017,7 +1077,7 @@ "tools", "utils" ], - "Hash": "63d15047eb239f95160112bcadc4fcb9" + "Hash": "3ee025083e66f18db6cf27b56e23e141" }, "renv": { "Package": "renv", @@ -1424,6 +1484,17 @@ ], "Hash": "e4e357f28c2edff493936b6cb30c3d65" }, + "tzdb": { + "Package": "tzdb", + "Version": "0.3.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cpp11" + ], + "Hash": "b2e1cbce7c903eaf23ec05c58e59fb5e" + }, "urlchecker": { "Package": "urlchecker", "Version": "1.0.1", @@ -1493,6 +1564,32 @@ ], "Hash": "a745bda7aff4734c17294bb41d4e4607" }, + "vroom": { + "Package": "vroom", + "Version": "1.6.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "bit64", + "cli", + "cpp11", + "crayon", + "glue", + "hms", + "lifecycle", + "methods", + "progress", + "rlang", + "stats", + "tibble", + "tidyselect", + "tzdb", + "vctrs", + "withr" + ], + "Hash": "7015a74373b83ffaef64023f4a0f5033" + }, "waldo": { "Package": "waldo", "Version": "0.4.0", diff --git a/tests/testthat/test-ct.R b/tests/testthat/test-ct.R new file mode 100644 index 00000000..e1fb5ca7 --- /dev/null +++ b/tests/testthat/test-ct.R @@ -0,0 +1,268 @@ +test_that("ct_spec_vars() works as expected", { + expect_identical( + ct_spec_vars(), + c( + "codelist_code", + "collected_value", + "term_synonyms", + "term_value" + ) + ) + + expect_identical( + ct_spec_vars(set = "all"), + c( + "codelist_code", + "collected_value", + "term_synonyms", + "term_value" + ) + ) + + expect_identical( + ct_spec_vars(set = "ct_clst"), + "codelist_code" + ) + + expect_identical( + ct_spec_vars(set = "from"), + c( + "collected_value", + "term_synonyms" + ) + ) + + expect_identical(ct_spec_vars(set = "to"), "term_value") +}) + +test_that("ct_spec_vars() fails with invalid input choice", { + expect_error(ct_spec_vars("foo")) + expect_error(ct_spec_vars(1L)) + expect_error(ct_spec_vars(FALSE)) + expect_error(ct_spec_vars(NULL)) +}) + +test_that("assert_ct_spec() works as expected", { + # Load an example controlled terminology spec. + ct_spec <- read_ct_spec_example("ct-01-cm") + cols <- colnames(ct_spec) + ct_clst_col <- ct_spec_vars("ct_clst") + to_col <- ct_spec_vars("to") + + expect_no_error(assert_ct_spec(ct_spec, optional = FALSE)) + expect_no_error(assert_ct_spec(ct_spec, optional = TRUE)) + expect_identical(assert_ct_spec(ct_spec, optional = FALSE), ct_spec) + expect_identical(assert_ct_spec(ct_spec, optional = TRUE), ct_spec) + expect_null(assert_ct_spec(NULL, optional = TRUE)) + + # Codelist code column is one of the key variables that must be present + # in `ct_spec`, so being missing should trigger an error. + expect_error(assert_ct_spec(ct_spec[setdiff(cols, ct_clst_col)], optional = FALSE)) + expect_error(assert_ct_spec(ct_spec[setdiff(cols, ct_clst_col)], optional = TRUE)) + + # The codelist code and the "to" columns of a controlled terminology should + # not contain NAs, as otherwise the mapping is undefined. If that happens + # an error is triggered. + ct_spec01 <- ct_spec + ct_spec01[[ct_clst_col]] <- NA_character_ + expect_error(assert_ct_spec(ct_spec01, optional = FALSE)) + expect_error(assert_ct_spec(ct_spec01, optional = TRUE)) + + ct_spec02 <- ct_spec + ct_spec02[[to_col]] <- NA_character_ + expect_error(assert_ct_spec(ct_spec01, optional = FALSE)) + expect_error(assert_ct_spec(ct_spec01, optional = TRUE)) + + ct_spec_empty <- + data.frame( + codelist_code = character(), + collected_value = character(), + term_synonyms = character(), + term_value = character(), + stringsAsFactors = FALSE + ) + + # `ct_spec` cannot be empty as that means that there are no mappings. + expect_error(assert_ct_spec(ct_spec_empty, optional = TRUE)) + expect_error(assert_ct_spec(ct_spec_empty, optional = FALSE)) +}) + +test_that("assert_ct_clst() works as expected", { + # Read in a controlled terminology example. + ct_spec <- read_ct_spec_example("ct-01-cm") + + # If `ct_clst` is not supplied and is not optional, then it should err. + expect_error(assert_ct_clst( + ct_spec = NULL, + ct_clst = NULL, + optional = FALSE + )) + + # If `ct_clst` is not supplied but it is optional, then all fine. + expect_no_error(assert_ct_clst( + ct_spec = NULL, + ct_clst = NULL, + optional = TRUE + )) + # Moreover, in case of no error, `ct_clst` should be returned invisibly, in this + # case `NULL`. + expect_null(assert_ct_clst( + ct_spec = NULL, + ct_clst = NULL, + optional = TRUE + )) + + # If `ct_clst` is supplied but `ct_spec` is not, then err. + expect_error(assert_ct_clst( + ct_spec = NULL, + ct_clst = "C71113", + optional = FALSE + )) + expect_error(assert_ct_clst( + ct_spec = NULL, + ct_clst = "C71113", + optional = TRUE + )) + + # If `ct_spec` is supplied but `ct_clst` is NULL, then err if `ct_clst` is not optional, or + # return `ct_clst` invisibly. + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = NULL, + optional = FALSE + )) + expect_no_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = NULL, + optional = TRUE + )) + expect_null(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = NULL, + optional = TRUE + )) + + # If both `ct_spec` and `ct_clst` are supplied, then `ct_spec` must be a valid controlled + # terminology data set and `ct_clst` must contain a codelist code available among + # the possibilities in column `codelist_code` (as returned by `ct_spec_vars("ct_clst")`). + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = "foo", + optional = FALSE + )) + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = "", + optional = FALSE + )) + + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = NA_character_, + optional = FALSE + )) + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = NA_character_, + optional = TRUE + )) + + expect_identical(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = "C71113", + optional = FALSE + ), "C71113") + expect_identical(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = "C66726", + optional = FALSE + ), "C66726") + expect_identical(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = "C71113", + optional = TRUE + ), "C71113") + expect_identical(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = "C66726", + optional = TRUE + ), "C66726") +}) + +test_that("assert_ct_clst(): when ct_spec is empty", { + ct_spec <- + data.frame( + codelist_code = character(), + collected_value = character(), + term_synonyms = character(), + term_value = character(), + stringsAsFactors = FALSE + ) + + # If `ct_spec` is supplied but `ct_clst` is NULL, then err if `ct_clst` is not optional, or + # return `ct_clst` invisibly. + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = NULL, + optional = FALSE + )) + expect_no_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = NULL, + optional = TRUE + )) + expect_null(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = NULL, + optional = TRUE + )) + + # If both `ct_spec` and `ct_clst` are supplied, then `ct_spec` must be a valid controlled + # terminology data set and `ct_clst` must contain a codelist code available among + # the possibilities in column `codelist_code` (as returned by `ct_spec_vars("ct_clst")`). + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = "foo", + optional = FALSE + )) + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = "", + optional = FALSE + )) + + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = NA_character_, + optional = FALSE + )) + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = NA_character_, + optional = TRUE + )) + + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = "C71113", + optional = FALSE + )) + expect_error(assert_ct_clst( + ct_spec = ct_spec, + ct_clst = "C71113", + optional = TRUE + )) +}) + +test_that("ct_mappings(): works as expected", { + ct_spec <- read_ct_spec_example("ct-01-cm") + ct_spec_qd <- dplyr::filter(ct_spec, term_code == "C25473") + + expect_identical( + ct_mappings(ct_spec = ct_spec_qd), + tibble::tibble( + from = c("QD", "QD (Every Day)", "/day", "Daily", "Per Day"), + to = rep("QD", 5L) + ) + ) +}) diff --git a/tests/testthat/test-recode.R b/tests/testthat/test-recode.R new file mode 100644 index 00000000..87562015 --- /dev/null +++ b/tests/testthat/test-recode.R @@ -0,0 +1,125 @@ +test_that("recode() works as intended on typical input", { + x <- c("a", NA_character_, "α") + # Map letters from lowercase to uppercase. NA is left as NA. Unmatched + # values are returned as inputted. + expect_identical( + recode(x = x, from = letters, to = LETTERS), + c("A", NA_character_, "α") + ) + + # The same as before but map now to integer values. Note though that the + # type of the returned vector is nonetheless character because "α" is not + # matched and will be preserved in the output, forcing coercion to character. + expect_identical( + recode( + x = x, + from = letters, + to = seq_along(LETTERS) + ), + c("1", NA_character_, "α") + ) + + # Now that `.no_match` is of integer type, like the vector supplied in `to`, + # the returned vector is also integer + expect_identical( + recode( + x = x, + from = letters, + to = seq_along(LETTERS), + .no_match = 0L + ), + c(1L, NA_integer_, 0L) + ) +}) + +test_that("recode() handling of NAs in input", { + x <- c("aye", "nay", "maybe", NA_character_) + from <- c("aye", "nay") + to <- c("yes", "no") + + expect_identical( + recode(x = x, from = from, to = to), + c("yes", "no", "maybe", NA_character_) + ) + expect_identical( + recode( + x = x, + from = from, + to = to, + .na = "uh?" + ), + c("yes", "no", "maybe", "uh?") + ) + + # The type of the vector in the output is always the most compatible across + # the types of `to`, `.no_match` and `.na`. + expect_identical( + recode( + x = x, + from = from, + to = to, + .na = NA + ), + c("yes", "no", "maybe", NA_character_) + ) + expect_identical( + recode( + x = x, + from = from, + to = to, + .na = NA_integer_ + ), + c("yes", "no", "maybe", NA_character_) + ) + expect_identical( + recode( + x = x, + from = from, + to = to, + .na = NA_character_ + ), + c("yes", "no", "maybe", NA_character_) + ) +}) + +test_that("recode(): recycling between `from` and `to` parameters", { + x <- c("aye", "nay", "maybe", NA_character_) + from <- c("aye", "nay") + to <- "?" + + # Mapping one to many values + expect_identical( + recode(x = x, from = from, to = to), + c("?", "?", "maybe", NA_character_) + ) + + # Enforce every value to become the hardcoded value specified in `to`. + expect_identical( + recode( + x = x, + from = from, + to = to, + .no_match = to, + .na = to + ), + c("?", "?", "?", "?") + ) +}) + +test_that("recode(): notable cases", { + x <- c(letters[1L:3L], NA_character_) + + # Identity: no recoding. + expect_identical(recode(x = x), x) + + # Hardcode all values, leave NA at peace + expect_identical(recode(x = x, to = "X"), c(rep("X", 3L), NA_character_)) + + # Or, really hardcode every single value + expect_identical(recode( + x = x, + to = "X", + .no_match = "X", + .na = "X" + ), rep("X", 4L)) +})