Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#92 Create function to derive reference dates in DM domain #98

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
2 changes: 2 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ Authors@R: c(
person("Pattern Institute", role = c("cph", "fnd")),
person("F. Hoffmann-La Roche AG", role = c("cph", "fnd")),
person("Pfizer Inc", role = c("cph", "fnd")),
person("Mohsin", "Uzzama", email = "[email protected]",
role = "aut"),
person("Transition Technologies Science", role = c("cph", "fnd"))
)
Maintainer: Rammprasad Ganapathy <[email protected]>
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ export("%.>%")
export(assign_ct)
export(assign_datetime)
export(assign_no_ct)
export(cal_min_max_date)
export(condition_add)
export(create_iso8601)
export(ct_map)
Expand All @@ -20,6 +21,7 @@ export(fmt_cmp)
export(generate_oak_id_vars)
export(hardcode_ct)
export(hardcode_no_ct)
export(oak_cal_ref_dates)
export(oak_id_vars)
export(problems)
export(read_ct_spec)
Expand Down
130 changes: 130 additions & 0 deletions R/cal_min_max_date.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#' Calculate minimum and maximum date and time in the data frame
#'
#' @description This function derives the earliest/latest ISO8601 datetime
#'
#' @param raw_dataset Raw source data frame
#' @param date_variable Single character string. Name of the date variable
#' @param time_variable Single character string. Name of the time variable
#' @param val_type Single character string determining whether to look
#' for the earliest or the latest datetime combination. Permitted values:
#' "min", "max". Default to "min".
#' @param date_format Format of source date variable
#' @param time_format Format of source time variable
#'
#' @return Data frame with 2 columns: unique patient_number and datetime variable
#' column storing the earliest/latest datetime.
#'
#' @export
#' @examples
#' EX <- tibble::tribble(
#' ~patient_number, ~EX_ST_DT, ~EX_ST_TM,
#' "001", "25-04-2022", "10:20",
#' "001", "25-04-2022", "10:15",
#' "001", "25-04-2022", "10:19",
#' "002", "26-05-2022", "UNK:UNK",
#' "002", "26-05-2022", "05:59"
#' )
#'
#' min <- cal_min_max_date(EX,
#' date_variable = "EX_ST_DT",
#' time_variable = "EX_ST_TM",
#' val_type = "min",
#' date_format = "dd-mmm-yyyy",
#' time_format = "H:M"
#' )
#'
#' max <- cal_min_max_date(EX,
#' date_variable = "EX_ST_DT",
#' time_variable = "EX_ST_TM",
#' val_type = "max",
#' date_format = "dd-mmm-yyyy",
#' time_format = "H:M"
#' )
#'
cal_min_max_date <- function(raw_dataset,
date_variable,
time_variable,
val_type = "min",
date_format,
time_format) {
# Check if date parameter is missing or date variable is present in the raw data frame
date_not_in_data <- is.na(date_variable) ||
!utils::hasName(raw_dataset, date_variable)

# Check if time variable is used and present in the raw data frame
time_not_in_data <- !is.na(time_variable) &&
!utils::hasName(raw_dataset, time_variable)

# If date/time variables not present return empty data frame
if (date_not_in_data || time_not_in_data) {
# Return Empty data frame with patient_number and datetime columns
empty_df <- stats::setNames(
data.frame(matrix(ncol = 2L, nrow = 0L)),
c("patient_number", "datetime")
)
cli::cli_warn(paste(
"Date variable", date_variable, "or Time variable", time_variable,
"not present in source data"
))
return(empty_df)
}

fin_df <- raw_dataset

# Time is not used in reference date then use only date
if (is.na(time_variable)) {
ShiyuC marked this conversation as resolved.
Show resolved Hide resolved
fin_df$datetime <- create_iso8601(raw_dataset[[date_variable]],
.format = date_format
)
} else {
ShiyuC marked this conversation as resolved.
Show resolved Hide resolved
# If both date and time variables are present use both date and time
raw_dataset$date_time <- paste0(
raw_dataset[[date_variable]],
raw_dataset[[time_variable]]
)
format <- paste0(date_format, time_format)

fin_df$datetime <- as.character(create_iso8601(raw_dataset$date_time,
.format = format,
.na = c(
"UNK", "NA", "U", "unk",
"u", "un", "UN"
)
))
}

fin_df <- fin_df |>
dplyr::select(c("patient_number", "datetime")) |>
unique()

fin_df <- fin_df |>
dplyr::mutate(date_time = datetime) |>
tidyr::separate(
date_time,
sep = "-|T|:",
into = c("year", "month", "day", "hour", "minute"),
fill = "right",
extra = "drop"
) |>
list() |>
stats::setNames("x") |>
with(replace(x, x == "UNK", NA)) |>
list() |>
stats::setNames("x") |>
with(replace(x, x == "", NA))

if (val_type == "min") {
final_df <- fin_df |>
dplyr::arrange(year, month, day, hour, minute)
} else {
final_df <- fin_df |>
dplyr::arrange(dplyr::desc(year), dplyr::desc(month), dplyr::desc(day), dplyr::desc(hour), dplyr::desc(minute))
}

# Keep first appearance in the data frame since it is already sorted
final_df <- final_df[!duplicated(final_df$patient_number), c("patient_number", "datetime")]

final_df <- final_df |> dplyr::filter(!is.na(datetime))

return(final_df)
}
4 changes: 3 additions & 1 deletion R/globals.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
utils::globalVariables(c(
"USUBJID", "VISIT", "dom_dt", "dom_tm", "ref_dt",
"ref_tm"
"ref_tm", "datetime", "date_time", "year", "month",
"day", "hour", "minute", "dataset_name", "date_var",
"dformat", "tformat", "sdtm_var_name", "patient_number"
))
120 changes: 120 additions & 0 deletions R/oak_cal_ref_dates.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#' Calculate Reference dates in ISO8601 character format.
#'
#' Populate RFSTDTC variable in demographic domain in ISO8601 character format.
#'
#' @description Derive RFSTDTC, RFENDTC, RFXENDTC, RFXSTDTC based on the input dates and time.
#'
#'
#' @param ds_in Data frame. DM domain.
#' @param der_var Character string. The reference date to be derived.
#' @param min_max Minimum or Maximum date to be calculated based on the input.
#' Default set to Minimum. Values should be min or max.
#' @param ref_date_config_df Data frame which has the details of the variables to
#' be used for the calculation of reference dates.
#' Should have columns listed below:
#' dataset_name : Name of the raw dataset.
#' date_var : Date variable name from the raw dataset.
#' time_var : Time variable name from the raw dataset.
#' dformat : Format of the date collected in raw data.
#' tformat: Format of the time collected in raw data.
#' sdtm_var_name : Reference variable name.
#' @param raw_source List contains all the raw datasets.
#' @return DM data frame with the reference dates populated.
#' @export
#' @examples
#' dm <- tibble::tribble(
#' ~patient_number, ~USUBJID, ~SUBJID, ~SEX,
#' "001", "XXXX-001", "001", "F",
#' "002", "XXXX-002", "002", "M",
#' "003", "XXXX-003", "003", "M"
#' )
#'
#' ref_date_config_df <- tibble::tribble(
#' ~dataset_name, ~date_var, ~time_var, ~dformat, ~tformat, ~sdtm_var_name,
#' "EX1", "EX_ST_DT1", "EX_ST_TM1", "dd-mm-yyyy", "H:M", "RFSTDTC",
#' "EX2", "EX_ST_DT2", NA, "dd-mmm-yyyy", NA, "RFSTDTC",
#' "EX1", "EX_EN_DT1", "EX_EN_TM1", "dd-mm-yyyy", "H:M", "RFENDTC",
#' "EX2", "EX_ST_DT2", NA, "dd-mmm-yyyy", NA, "RFENDTC"
#' )
#'
#' EX1 <- tibble::tribble(
#' ~patient_number, ~EX_ST_DT1, ~EX_EN_DT1, ~EX_ST_TM1, ~EX_EN_TM1,
#' "001", "15-05-2023", "15-05-2023", "10:20", "11:00",
#' "001", "15-05-2023", "15-05-2023", "9:15", "10:00",
#' "001", "15-05-2023", "15-05-2023", "8:19", "09:00",
#' "002", "02-10-2023", "02-10-2023", "UNK:UNK", NA,
#' "002", "03-11-2023", "03-11-2023", "11:19", NA
#' )
#'
#' EX2 <- tibble::tribble(
#' ~patient_number, ~EX_ST_DT2,
#' "001", "11-JUN-2023",
#' "002", "24-OCT-2023",
#' "002", "25-JUL-2023",
#' "002", "30-OCT-2023",
#' "002", "UNK-OCT-2023"
#' )
#'
#' raw_source <- list(EX1 = EX1, EX2 = EX2)
#'
#' dm_df <- oak_cal_ref_dates(dm,
#' der_var = "RFSTDTC",
#' min_max = "min",
#' ref_date_config_df = ref_date_config_df,
#' raw_source
#' )
#'
oak_cal_ref_dates <- function(ds_in = dm,
der_var,
min_max = "min",
ref_date_config_df,
raw_source) {
# Check if ref_date_config_df is a data frame and has all required variables
admiraldev::assert_data_frame(ref_date_config_df, required_vars = exprs(
dataset_name, date_var,
time_var, dformat,
tformat, sdtm_var_name
))

ShiyuC marked this conversation as resolved.
Show resolved Hide resolved
admiraldev::assert_list_of(raw_source, "data.frame")

ds_out <- data.frame()
for (i in seq_along(ref_date_config_df$dataset_name)) {
raw_dataset_name <- ref_date_config_df$dataset_name[i]
date_variable <- ref_date_config_df$date_var[i]
date_format <- ref_date_config_df$dformat[i]
time_var <- ref_date_config_df$time_var[i]
time_format <- ref_date_config_df$tformat[i]
sdtm_var <- ref_date_config_df$sdtm_var_name[i]
raw_dataset <- raw_source[[raw_dataset_name]]

if (der_var == sdtm_var && !is.null(raw_dataset)) {
ds_out1 <- cal_min_max_date(
raw_dataset = raw_dataset,
date_variable = date_variable,
time_variable = time_var,
date_format = date_format,
time_format = time_format,
val_type = min_max
)
ds_out <- rbind(ds_out, ds_out1)
} else if (der_var == sdtm_var && is.null(raw_dataset)) {
warning(paste0(
raw_dataset_name,
" is not present in the source data list but referenced in ref_date_config_df"
))
}
}

if (min_max == "min") {
df_final <- ds_out |> dplyr::arrange(patient_number, datetime)
} else {
df_final <- ds_out |> dplyr::arrange(dplyr::desc(datetime))
}

df_final <- df_final[!duplicated(df_final$patient_number), c("patient_number", "datetime")]
colnames(df_final)[colnames(df_final) == "datetime"] <- der_var

dm <- dplyr::left_join(ds_in, y = df_final, by = "patient_number")
return(dm)
}
10 changes: 10 additions & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,19 @@ reference:
- dtc_formats
- problems

- title: Calculation of reference dates in DM
contents:
- oak_cal_ref_dates

- title: Explicit dot pipe operator
desc: A simple alternative to `%>% {...}`
contents:
- "%.>%"

- title: Calculation of minimum/maximum ISO8601 dates
contents:
- cal_min_max_date

authors:
Ramiro Magno:
href: https://www.pattern.institute/team/rmagno/
Expand All @@ -63,3 +71,5 @@ authors:
href: https://www.linkedin.com/in/edgar-manukyan-20987927
Shiyu Chen:
href: https://www.linkedin.com/in/shiyu-chen-55a55410a/
Mohsin Uzzama:
href: https://www.linkedin.com/in/mohsin-uzzama-34066741/
64 changes: 64 additions & 0 deletions man/cal_min_max_date.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading