Merge pull request #20 from atorus-research/add_control_term_selection

Adding in new functionality to select control terms for a given variable
atorus-research · Jan 7, 2022 · f94c402 · f94c402
2 parents ac85a78 + 9911d8a
commit f94c402
Show file tree

Hide file tree

Showing 8 changed files with 148 additions and 40 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: metacore
 Title: A Centralized Metadata Object Focus on Clinical Trial Data Programming Workflows
-Version: 0.0.1.0000
+Version: 0.0.1.1000
 Authors@R: 
     c(person(given = "Christina",
              family = "Fillmore",

diff --git a/NAMESPACE b/NAMESPACE
@@ -3,6 +3,7 @@
 export("%>%")
 export(create_tbl)
 export(define_to_MetaCore)
+export(get_control_term)
 export(load_metacore)
 export(metacore)
 export(metacore_example)
@@ -70,7 +71,9 @@ importFrom(purrr,reduce)
 importFrom(readxl,excel_sheets)
 importFrom(readxl,read_excel)
 importFrom(rlang,"!!")
+importFrom(rlang,as_label)
 importFrom(rlang,as_string)
+importFrom(rlang,enexpr)
 importFrom(rlang,expr)
 importFrom(rlang,prim_name)
 importFrom(rlang,sym)

diff --git a/NEWS.md b/NEWS.md
@@ -2,10 +2,11 @@
 
 This fixes the following issues:
 
-- #16 the metacore function now accepts any empty datasets and creates an empty dataset with the correct column names and types
-- #10 yn function checks for logicals and returns them
-- #11 updated function description to make this clearer
-- #12 updated regex so to [F|f]ormat so it can accept lower case
-- #14 added supp_flag to ds_vars (on a side note we did a really good job with this it was super easy to change and only required a few edits)
-- #15 modified create =tbl so if there are two potential matches in the same dataset and one is an exact match it uses that
+- [#16](https://github.com/atorus-research/metacore/issues/16) the metacore function now accepts any empty datasets and creates an empty dataset with the correct column names and types
+- [#10](https://github.com/atorus-research/metacore/issues/10) yn function checks for logicals and returns them
+- [#11](https://github.com/atorus-research/metacore/issues/11) updated function description to make this clearer
+- [#12](https://github.com/atorus-research/metacore/issues/12) updated regex to [F|f]ormat so it can accept lower case
+- [#14](https://github.com/atorus-research/metacore/issues/14) added supp_flag to ds_vars (on a side note we did a really good job with this it was super easy to change and only required a few edits)
+- [#15](https://github.com/atorus-research/metacore/issues/15) modified `create_tbl` so if there are two potential matches in the same dataset and one is an exact match it uses that
 
+Additionally, it adds the `get_control_term` function to pull out the control term for a given variable.
diff --git a/R/metacore.R b/R/metacore.R
@@ -288,6 +288,61 @@ select_dataset <- function(.data, dataset, simplify = FALSE) {
 }
 
 
+
+#' Get Control Term
+#'
+#' Returns the control term (a vector for permitted values and a tibble for code
+#' lists) for a given variable. The dataset can be optionally specified if there
+#' is different control terminology for different datasets.
+#'
+#' An error is raised when the dataset or the variable cannot be found in the
+#' `value_spec` table, when the variable maps to more than one code list, or
+#' when no matching entry exists in the `codelist` table.
+#'
+#' @param metacode metacore object
+#' @param variable A variable name to get the controlled terms for. This can
+#'   either be a string or just the name of the variable
+#' @param dataset A dataset name. This is not required if there is only one set
+#'   of control terminology across all datasets
+#'
+#' @return a vector for permitted values and a 2-column tibble for codelists
+#' @export
+#'
+#' @importFrom rlang as_label enexpr
+#'
+#' @examples
+#' meta_ex <- spec_to_metacore(metacore_example("p21_mock.xlsx"))
+#' get_control_term(meta_ex, QVAL, SUPPAE)
+#' get_control_term(meta_ex, "QVAL", "SUPPAE")
+get_control_term <- function(metacode, variable, dataset = NULL){
+   # Capture both arguments unevaluated so bare names and strings both work
+   var_expr <- enexpr(variable)
+   ds_expr <- enexpr(dataset)
+   var_str <- if(is.character(var_expr)) var_expr else as_label(var_expr)
+
+   # Inside filter() the bare names `dataset` and `variable` refer to the
+   # value_spec columns, not to this function's arguments
+   candidates <- metacode$value_spec
+   if(!is.null(ds_expr)){
+      dataset_val <- if(is.character(ds_expr)) ds_expr else as_label(ds_expr)
+      candidates <- candidates %>%
+         filter(dataset == dataset_val)
+      if(nrow(candidates) == 0){
+         stop(paste0(dataset_val, " not found in the value_spec table. Please check the dataset name"))
+      }
+   }
+
+   var_code_id <- candidates %>%
+      filter(variable == var_str) %>%
+      pull(code_id) %>%
+      unique()
+   if(length(var_code_id) == 0){
+      stop(paste0(var_str, " not found in the value_spec table. Please check the variable name"))
+   }
+   if(length(var_code_id) > 1){
+      stop(paste0(var_str, " does not have a unique control term, consider specifying a dataset"))
+   }
+
+   # An NA code_id, or one absent from the codelist table, matches no rows;
+   # report that explicitly instead of failing on the `[[1]]` extraction
+   term <- metacode$codelist %>%
+      filter(code_id == var_code_id) %>%
+      pull(codes)
+   if(length(term) == 0){
+      stop(paste0(var_str, " does not have a control term in the codelist table"))
+   }
+   term[[1]]
+}
+
+
+
 #' save metacore object
 #'
 #' @param metacore_object the metacore object in memory to save to disc

diff --git a/README.Rmd b/README.Rmd
@@ -8,12 +8,11 @@ output: github_document
 knitr::opts_chunk$set(
   collapse = TRUE,
   comment = "#>",
-  fig.path = "man/figures/README-",
-  out.width = "100%"
+  fig.path = "man/figures/README-"
 )
 ```
 
-# metacore <a href='https://github.com/atorus-research/metacore'><img src="man/figures/metacore.PNG" align="right" height="139"/></a>
+# metacore <a href='https://github.com/atorus-research/metacore'><img src="man/figures/metacore.PNG" align="right" style="height:139px;"/></a>
 
 <!-- badges: start -->
 [<img src="https://img.shields.io/badge/Slack-RValidationHub-blue?style=flat&logo=slack">](https://RValidationHub.slack.com)
@@ -55,7 +54,7 @@ Here is a schema of how all this fits together:
 
 ![](man/figures/schema-colors.png "man/figures/Metacore Schema")
 
-### ds_spec <img src="man/figures/labeled-ds_spec.png" align="right" height="150"/>
+### ds_spec <img src="man/figures/labeled-ds_spec.png" align="right" style="height:150px;"/>
 
 This table covers the basic information about each dataset. There is only a single row per dataset, with the following information:
 
@@ -65,7 +64,7 @@ This table covers the basic information about each dataset. There is only a sing
 
 -   *Label*: Dataset label
 
-### ds_vars <img src="man/figures/labeled-ds_vars.png" align="right" height="150"/>
+### ds_vars <img src="man/figures/labeled-ds_vars.png" align="right" style="height:150px;"/>
 
 This table contains the information that bridges between purely dataset level and purely variable level. There is one row per dataset per variable:
 
@@ -83,7 +82,7 @@ This table contains the information that bridges between purely dataset level an
 
 -   *supp_flag*: Logical to determine if the variable is in the supplementals
 
-### var_spec <img src="man/figures/labeled-var_spec.png" align="right" height="150"/>
+### var_spec <img src="man/figures/labeled-var_spec.png" align="right" style="height:150px;"/>
 
 This table contains the purely variable level information. The goal is to have a single row per variable, which is common across all datasets. This helps ensure variables follow the CDISC standard. But, this isn't always possible, so if information for a given variable differs across datasets, the variable will be recorded as dataset.variable in the variable column.
 
@@ -99,7 +98,7 @@ This table contains the information the purely variable level information. The g
 
 -   *format*: Variable format
 
-### value_spec <img src="man/figures/labeled-value_spec.png" align="right" height="150"/>
+### value_spec <img src="man/figures/labeled-value_spec.png" align="right" style="height:150px;"/>
 
 This table contains the information at the value level. There will be at least one row per dataset/variable combination. There is more than one row per dataset/variable combination if the combination has values which have differing metadata. For instance, LBORRES can have different data types depending on the value. The information contained are as follows:
 
@@ -117,15 +116,15 @@ This table contains the information the information at the value level. There wi
 
 -   *derivation_id*: ID for the derivation to match with the **derivation** table
 
-### derivation <img src="man/figures/labeled-derivation.png" align="right" height="150"/>
+### derivation <img src="man/figures/labeled-derivation.png" align="right" style="height:150px;"/>
 
 This table has all the derivation information, with one row per derivation ID and the following information:
 
 -   *derivation_id*: The ID, which should match to **value_spec**
 
 -   *derivation*: Text describing the derivation
 
-### codelist <img src="man/figures/labeled-code_list.png" align="right" height="150"/>
+### codelist <img src="man/figures/labeled-code_list.png" align="right" style="height:150px;"/>
 
 This table contains the code lists, permitted value lists, and external libraries nested within a tibble. There is only a single row per list/library, with the following information:
 

diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 
 <!-- README.md is generated from README.Rmd. Please edit that file -->
 
-# metacore <a href='https://github.com/atorus-research/metacore'><img src="man/figures/metacore.PNG" align="right" height="139"/></a>
+# metacore <a href='https://github.com/atorus-research/metacore'><img src="man/figures/metacore.PNG" align="right" style="height:139px;"/></a>
 
 <!-- badges: start -->
 
@@ -54,13 +54,13 @@ normalize the information as much as possible, while keeping together
 like information. Each table has a basic theme to make them easier to
 remember. They are as follows:
 
--   **ds\_spec**: Contains dataset level information
+-   **ds_spec**: Contains dataset level information
 
--   **ds\_vars**: Bridges the dataset and variable level information
+-   **ds_vars**: Bridges the dataset and variable level information
 
--   **var\_spec**: Contains variable level information
+-   **var_spec**: Contains variable level information
 
--   **value\_spec**: Contains value level information
+-   **value_spec**: Contains value level information
 
 -   **derivations**: Contains all derivations
 
@@ -71,29 +71,29 @@ Here is a schema of how all this fits together:
 
 ![](man/figures/schema-colors.png "man/figures/Metacore Schema")
 
-### ds\_spec <img src="man/figures/labeled-ds_spec.png" align="right" height="150"/>
+### ds_spec <img src="man/figures/labeled-ds_spec.png" align="right" style="height:150px;"/>
 
 This table covers the basic information about each dataset. There is
 only a single row per dataset, with the following information:
 
 -   *dataset*: The abbreviated name of the dataset (e.g. AE)
 
--   *structure*: Value structure of the dataset as a sting
+-   *Structure*: Value structure of the dataset as a string
 
--   *label*: Dataset label
+-   *Label*: Dataset label
 
-### ds\_vars <img src="man/figures/labeled-ds_vars.png" align="right" height="150"/>
+### ds_vars <img src="man/figures/labeled-ds_vars.png" align="right" style="height:150px;"/>
 
 This table contains the information that bridges between purely dataset
 level and purely variable level. There is one row per dataset per
 variable:
 
 -   *dataset*: The abbreviated name of the dataset. This will match to
-    the name in **ds\_spec**
+    the name in **ds_spec**
 
 -   *variable*: Variable name
 
--   *key\_seq*: Sequence key, which are the variables used to order a
+-   *key_seq*: Sequence key, which are the variables used to order a
     dataset. This is a column of integers, where 1 is the first sorting
     variable and 2 is the second etc. If the variable is not used in
     sorting it will be left `NA`
@@ -108,7 +108,10 @@ variable:
     “Conditionally Expected”, or NA. For more information about core see
     [CDISC](https://www.cdisc.org/standards/foundational/adam)
 
-### var\_spec <img src="man/figures/labeled-var_spec.png" align="right" height="150"/>
+-   *supp_flag*: Logical to determine if the variable is in the
+    supplementals
+
+### var_spec <img src="man/figures/labeled-var_spec.png" align="right" style="height:150px;"/>
 
 This table contains the purely variable level
 information. The goal is there is a single row per variable, which is
@@ -118,9 +121,9 @@ variable differs across datasets, the variable will be recorded as
 dataset.variable in the variable column.
 
 -   *variable*: Variable name, which should match the name in
-    **ds\_spec**. Unless the variable needs to be duplicated, then the
+    **ds_spec**. Unless the variable needs to be duplicated, then the
     name will be a combination of the dataset name and variable name
-    from **ds\_spec** (dataset.variable)
+    from **ds_spec** (dataset.variable)
 
 -   *type*: Variable class
 
@@ -133,7 +136,7 @@ dataset.variable in the variable column.
 
 -   *format*: Variable format
 
-### value\_spec <img src="man/figures/labeled-value_spec.png" align="right" height="150"/>
+### value_spec <img src="man/figures/labeled-value_spec.png" align="right" style="height:150px;"/>
 
 This table contains the information at the value level.
 There will be at least one row per dataset/variable combination. There
@@ -143,40 +146,40 @@ different data types depending on the value. The information contained
 are as follows:
 
 -   *dataset*: The abbreviated name of the dataset. This will match to
-    the name in **ds\_spec**
+    the name in **ds_spec**
 
 -   *variable*: Variable name. This will match to the name in
-    **ds\_spec**
+    **ds_spec**
 
 -   *type*: String of the value type
 
 -   *origin*: Origin of the value
 
--   *code\_id*: ID for the code list to match the id in the **codelist**
+-   *code_id*: ID for the code list to match the id in the **codelist**
     table
 
 -   *where*: Value of the variable
 
--   *derivation\_id*: ID for the derivation to match with the
+-   *derivation_id*: ID for the derivation to match with the
     **derivation** table
 
-### derivation <img src="man/figures/labeled-derivation.png" align="right" height="150"/>
+### derivation <img src="man/figures/labeled-derivation.png" align="right" style="height:150px;"/>
 
 This table has all the derivation information, with one row per
 derivation ID and the following information:
 
--   *derivation\_id*: The ID, which should match to **value\_spec**
+-   *derivation_id*: The ID, which should match to **value_spec**
 
 -   *derivation*: Text describing the derivation
 
-### codelist <img src="man/figures/labeled-code_list.png" align="right" height="150"/>
+### codelist <img src="man/figures/labeled-code_list.png" align="right" style="height:150px;"/>
 
 This table contains the code lists, permitted value lists, and external
 libraries nested within a tibble. There is only a single row per
 list/library, with the following information:
 
--   *code\_id*: the ID used to identify the code list. This should be
-    the same as the *code\_id* in **val\_spec**
+-   *code_id*: the ID used to identify the code list. This should be the
+    same as the *code_id* in **val_spec**
 
 -   *name*: Name of the code list
 

diff --git a/man/get_control_term.Rd b/man/get_control_term.Rd
diff --git a/tests/testthat/test-metacore.R b/tests/testthat/test-metacore.R
@@ -132,3 +132,20 @@ test_that("load metacore fails with no path and rdss in wd", {
    )
    unlink(my_temp_dir)
 })
+
+test_that("pulling out control terminology works", {
+   test <- suppressWarnings(
+      spec_to_metacore(metacore_example("p21_mock.xlsx"))
+   )
+   # Expected code list for QVAL in SUPPAE, shared by both calling styles
+   yes_no <- tibble(code = c("N", "Y"), decode = c("No", "Yes"))
+
+   # Pin each failure to its message so an unrelated error cannot pass the test
+   # QVAL without a dataset maps to more than one code list
+   expect_error(
+      get_control_term(test, QVAL),
+      "does not have a unique control term"
+   )
+   # NOTE(review): assumes LB is absent from the example spec - confirm
+   expect_error(
+      get_control_term(test, QVAL, LB),
+      "not found in the value_spec table"
+   )
+
+   # Bare names and strings must resolve to the same control term
+   expect_equal(get_control_term(test, QVAL, SUPPAE), yes_no)
+   expect_equal(get_control_term(test, "QVAL", "SUPPAE"), yes_no)
+})
+