From 63c52b1f8f872df5938717eb6420ef6ebad2c507 Mon Sep 17 00:00:00 2001 From: Eli Miller Date: Mon, 26 Feb 2024 16:09:46 -0500 Subject: [PATCH 1/2] Closes #183 Add `xportr_split` Functionality (#235) * add in split function * Add tests and functionality * Testing and doc updates * Update tests to correct skip condition and add cli_warn * Update with lint and style * exclude large file size from cov * Update R/split.R Co-authored-by: Ben Straub * Update R/split.R Co-authored-by: Ben Straub * Update R/write.R Co-authored-by: Ben Straub * Update for passing R CMD Check * Update R/split.R --------- Co-authored-by: Ben Straub --- NAMESPACE | 2 ++ NEWS.md | 1 + R/length.R | 2 +- R/split.R | 35 ++++++++++++++++++++ R/utils-xportr.R | 20 ++++++++++++ R/write.R | 40 ++++++++++++++++++++++- R/xportr-package.R | 4 +-- inst/WORDLIST | 5 +++ man/xportr-package.Rd | 1 + man/xportr_split.Rd | 39 ++++++++++++++++++++++ tests/testthat/test-write.R | 64 +++++++++++++++++++++++++++++++++++++ 11 files changed, 209 insertions(+), 4 deletions(-) create mode 100644 R/split.R create mode 100644 man/xportr_split.Rd diff --git a/NAMESPACE b/NAMESPACE index 44baa2a4..c0acf978 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,7 @@ export(xportr_length) export(xportr_metadata) export(xportr_options) export(xportr_order) +export(xportr_split) export(xportr_type) export(xportr_write) export(xpt_validate) @@ -32,6 +33,7 @@ importFrom(cli,cli_alert_success) importFrom(cli,cli_div) importFrom(cli,cli_h2) importFrom(cli,cli_text) +importFrom(cli,cli_warn) importFrom(dplyr,across) importFrom(dplyr,arrange) importFrom(dplyr,as_tibble) diff --git a/NEWS.md b/NEWS.md index 512b6001..30ce6846 100644 --- a/NEWS.md +++ b/NEWS.md @@ -21,6 +21,7 @@ * Added `xportr.character_metadata_types` and `xportr.numeric_metadata_types` to list the metadata types that are character or numeric. Updated `xportr.character_types` and `xportr.numeric_types` to list only the R types that are character and the R types that are numeric. This ensures that all R types, including dates, are now managed by xportr_type. If the R type differs from the metadata type, the variable is coerced (#161).. * Adds argument assertions to public functions using `{checkmate}` (#175) +* `xportr_split()` is a new function that allows users to split a dataset into multiple output files based on a variable. (#183) * `xportr_metadata()` can set `verbose` for a whole pipeline, i.e. setting `verbose` in `xportr_metadata()` will populate to all `xportr` functions. (#151) diff --git a/R/length.R b/R/length.R index 68fe8f8d..f2b91711 100644 --- a/R/length.R +++ b/R/length.R @@ -162,7 +162,7 @@ xportr_length <- function(.df, length_meta = as.numeric(length_msg[[paste0(variable_length, ".y")]]) ) %>% filter(length_df < length_meta) %>% - select(all_of(variable_name), length_df, length_meta) + select(any_of(c(variable_name, "length_df", "length_meta"))) max_length_msg(length_msg, verbose) } diff --git a/R/split.R b/R/split.R new file mode 100644 index 00000000..a4b02beb --- /dev/null +++ b/R/split.R @@ -0,0 +1,35 @@ +#' Split xpt file output +#' +#' Per the FDA Study Data Technical Conformance +#' Guide(https://www.fda.gov/media/88173/download) section 3.3.2, dataset files +#' sizes shouldn't exceed 5 GB. If datasets are large enough, they should be +#' split based on a variable. For example, laboratory readings in `ADLB` can be +#' split by `LBCAT` to split up hematology and chemistry data. +#' +#' This function will tell `xportr_write()` to split the data frame based on the +#' variable passed in `split_by`. When written, the file name will be prepended +#' with a number for uniqueness. These files should be noted in the Reviewer Guides per +#' CDISC guidance to note how you split your files. +#' +#' @inheritParams xportr_length +#' @param split_by A quoted variable that will be passed to `base::split()`. +#' +#' @return A data frame with an additional attribute added so `xportr_write()` +#' knows how to split the data frame. +#' +#' +#' @export +#' +#' @examples +#' +#' adlb <- data.frame( +#' USUBJID = c(1001, 1002, 1003), +#' LBCAT = c("HEMATOLOGY", "HEMATOLOGY", "CHEMISTRY") +#' ) +#' +#' adsl <- xportr_split(adsl, "LBCAT") +xportr_split <- function(.df, split_by = NULL) { + attr(.df, "_xportr.split_by_") <- split_by + + return(.df) +} diff --git a/R/utils-xportr.R b/R/utils-xportr.R index ca89ed74..ac7cfd0c 100644 --- a/R/utils-xportr.R +++ b/R/utils-xportr.R @@ -440,3 +440,23 @@ assert_metadata <- function(metadata, #' Internal choices for verbose option #' @noRd .internal_verbose_choices <- c("none", "warn", "message", "stop") + +#' Internal function to check xpt file size +#' @noRd +check_xpt_size <- function(path) { + fs <- file.size(path) + + fs_string <- c( + "i" = paste0("xpt file size is: ", round(fs / 1e+9, 2)), " GB.", + "x" = paste0( + "XPT file sizes should not exceed 5G. It is", + " recommended you call `xportr_split` to split the file into smaller files." + ) + ) + + if (fs > 5e+9) { + cli_warn(fs_string, class = "xportr.xpt_size") # nocov + } + + invisible(NULL) +} diff --git a/R/write.R b/R/write.R index 7d55c05e..7fa90201 100644 --- a/R/write.R +++ b/R/write.R @@ -105,7 +105,26 @@ xportr_write <- function(.df, data <- as.data.frame(.df) tryCatch( - write_xpt(data, path = path, version = 5, name = name), + { + # If data is not split, data is just written out + if (is.null(attr(data, "_xportr.split_by_"))) { + write_xpt(data, path = path, version = 5, name = name) + check_xpt_size(path) + } else { + # If data is split, perform the split and get an index for the for loop + split_data <- split(data, data[[attr(data, "_xportr.split_by_")]]) + split_index <- unique(data[[attr(data, "_xportr.split_by_")]]) + paths <- get_split_path(path, seq_along(split_index)) + # Iterate on the unique values of the split + for (i in seq_along(split_index)) { + # Write out the split data, `get_split_path` will determine file name + write_xpt(split_data[[i]], + path = paths[i], version = 5, name = name + ) + check_xpt_size(paths[i]) + } + } + }, error = function(err) { rlang::abort( paste0( @@ -118,3 +137,22 @@ xportr_write <- function(.df, invisible(data) } + +#' Figure out path for split data. +#' +#' Will append a number before the file extension to indicate the split. +#' +#' i.e. `adsl.xpt` will become `adsl1.xpt` and `adsl2.xpt` +#' +#' @param path Path variable specified by user +#' @param ind Index of split variable +#' +#' @noRd +get_split_path <- function(path, ind) { + paste0( + tools::file_path_sans_ext(path), + ind, + ".", + tools::file_ext(path) + ) +} diff --git a/R/xportr-package.R b/R/xportr-package.R index 65c1cc52..8f9b78e9 100644 --- a/R/xportr-package.R +++ b/R/xportr-package.R @@ -107,7 +107,7 @@ #' tribble if_else across as_tibble #' @importFrom glue glue glue_collapse #' @importFrom cli cli_alert_info cli_h2 cli_alert_success cli_div cli_text -#' cli_alert_danger +#' cli_alert_danger cli_warn #' @importFrom tidyselect all_of any_of where #' @importFrom utils capture.output str tail packageVersion #' @importFrom stringr str_detect str_extract str_replace str_replace_all @@ -125,7 +125,7 @@ globalVariables(c( "abbr_parsed", "abbr_stem", "adj_orig", "adj_parsed", "col_pos", "dict_varname", "lower_original_varname", "my_minlength", "num_st_ind", "original_varname", "renamed_n", "renamed_var", "use_bundle", "viable_start", "type.x", "type.y", - "variable", "length.x", "lenght.y" + "variable", "length.x", "lenght.y", "e", "length_df", "length_meta" )) # The following block is used by usethis to automatically manage diff --git a/inst/WORDLIST b/inst/WORDLIST index 692b1dae..be70b75c 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -1,6 +1,7 @@ ADAE ADSL ADaM +adlb AE Atorus BMI @@ -10,6 +11,7 @@ Completers DCREASCD DM Didenko +fda GSK JPT MMSE @@ -55,3 +57,6 @@ validator validators xportr's xpt +https +lbcat +www diff --git a/man/xportr-package.Rd b/man/xportr-package.Rd index d077e53c..f7e7c58e 100644 --- a/man/xportr-package.Rd +++ b/man/xportr-package.Rd @@ -3,6 +3,7 @@ \docType{package} \name{xportr-package} \alias{xportr-package} +\alias{_PACKAGE} \title{The \code{xportr} package} \description{ \code{xportr} is designed to be a clinical workflow friendly method for outputting diff --git a/man/xportr_split.Rd b/man/xportr_split.Rd new file mode 100644 index 00000000..5b4d3a44 --- /dev/null +++ b/man/xportr_split.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/split.R +\name{xportr_split} +\alias{xportr_split} +\title{Split xpt file output} +\usage{ +xportr_split(.df, split_by = NULL) +} +\arguments{ +\item{.df}{A data frame of CDISC standard.} + +\item{split_by}{A quoted variable that will be passed to \code{base::split()}.} +} +\value{ +A data frame with an additional attribute added so \code{xportr_write()} +knows how to split the data frame. +} +\description{ +Per the FDA Study Data Technical Conformance +Guide(https://www.fda.gov/media/88173/download) section 3.3.2, dataset files +sizes shouldn't exceed 5 GB. If datasets are large enough, they should be +split based on a variable. For example, laboratory readings in \code{ADLB} can be +split by \code{LBCAT} to split up hematology and chemistry data. +} +\details{ +This function will tell \code{xportr_write()} to split the data frame based on the +variable passed in \code{split_by}. When written, the file name will be prepended +with a number for uniqueness. These files should be noted in the sdrg per +CDISC guidance to note how you split your files. +} +\examples{ + +adlb <- data.frame( + USUBJID = c(1001, 1002, 1003), + LBCAT = c("HEMATOLOGY", "HEMATOLOGY", "CHEMISTRY") +) + +adsl <- xportr_split(adsl, "LBCAT") +} diff --git a/tests/testthat/test-write.R b/tests/testthat/test-write.R index 0517a462..bb036cf0 100644 --- a/tests/testthat/test-write.R +++ b/tests/testthat/test-write.R @@ -4,6 +4,9 @@ data_to_save <- function() { as_tibble() } +# Skip large file tests unless explicitly requested +test_large_files <- Sys.getenv("XPORTR.TEST_LARGE_FILES", FALSE) + test_that("xportr_write: exported data can be saved to a file", { skip_if_not_installed("withr") tmp <- withr::local_file("xyz.xpt") @@ -194,3 +197,64 @@ test_that("xportr_write: Capture errors by haven and report them as such", { "Error reported by haven" ) }) + +test_that("xportr_write: `split_by` attribute is used to split the data", { + tmpdir <- tempdir() + tmp <- file.path(tmpdir, "xyz.xpt") + + on.exit(unlink(tmpdir)) + + dts <- data_to_save() + dts %>% + xportr_split(split_by = "X") %>% + xportr_write(path = tmp) + + expect_true( + file.exists(file.path(tmpdir, "xyz1.xpt")) + ) + expect_true( + file.exists(file.path(tmpdir, "xyz2.xpt")) + ) + expect_true( + file.exists(file.path(tmpdir, "xyz3.xpt")) + ) + expect_equal( + read_xpt(file.path(tmpdir, "xyz1.xpt")) %>% + extract2("X") %>% + unique() %>% + length(), + 1 + ) + expect_equal( + read_xpt(file.path(tmpdir, "xyz2.xpt")) %>% + extract2("X") %>% + unique() %>% + length(), + 1 + ) + expect_equal( + read_xpt(file.path(tmpdir, "xyz3.xpt")) %>% + extract2("X") %>% + unique() %>% + length(), + 1 + ) +}) + +test_that("xportr_write: Large file sizes are reported and warned", { + skip_if_not(test_large_files) + tmpdir <- tempdir() + tmp <- file.path(tmpdir, "xyz.xpt") + + on.exit(unlink(tmpdir)) + + # Large_df should be at least 5GB + large_df <- do.call( + data.frame, replicate(80000, rep("large", 80000), simplify = FALSE) + ) + + expect_warning( + xportr_write(large_df, path = tmp), + class = "xportr.xpt_size" + ) +}) From bb9e7c50ff19557c45985fa1a56a38717a05408a Mon Sep 17 00:00:00 2001 From: bms63 Date: Mon, 26 Feb 2024 21:10:50 +0000 Subject: [PATCH 2/2] [skip actions] Bump version to 0.3.1.9024 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0061dd58..197358bd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: xportr Title: Utilities to Output CDISC SDTM/ADaM XPT Files -Version: 0.3.1.9023 +Version: 0.3.1.9024 Authors@R: c( person("Eli", "Miller", , "Eli.Miller@AtorusResearch.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-2127-9456")),