-
Notifications
You must be signed in to change notification settings - Fork 14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
#92 Create function to derive reference dates in DM domain #98
base: main
Are you sure you want to change the base?
Changes from 9 commits
a9dbb38
9c90a71
9216107
73f2d64
cdf00ce
1b90edb
badc493
93adfa2
40dc132
1d5bcb1
d798ae7
a1f0444
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,6 +21,8 @@ Authors@R: c( | |
person("Pattern Institute", role = c("cph", "fnd")), | ||
person("F. Hoffmann-La Roche AG", role = c("cph", "fnd")), | ||
person("Pfizer Inc", role = c("cph", "fnd")), | ||
person("Mohsin", "Uzzama", email = "[email protected]", | ||
role = "aut"), | ||
person("Transition Technologies Science", role = c("cph", "fnd")) | ||
) | ||
Maintainer: Rammprasad Ganapathy <[email protected]> | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,127 @@ | ||||||
#' Calculate minimum and maximum date and time in the dataframe | ||||||
#' | ||||||
#' @description This function derives the earliest/latest ISO8601 datetime | ||||||
#' | ||||||
#' @param raw_dataset Raw source data frame | ||||||
#' @param date_variable Single character string. Name of the date variable | ||||||
#' @param time_variable Single character string. Name of the time variable | ||||||
#' @param val_type Single character string determining whether to look | ||||||
#' for the earliest or the latest datetime combination. Permitted values: | ||||||
#' "min", "max". Default to "min". | ||||||
#' @param date_format Format of source date variable | ||||||
#' @param time_format Format of source time variable | ||||||
#' | ||||||
#' @return Data frame with 2 columns: unique patient_number and datetime variable | ||||||
#' column storing the earliest/latest datetime. | ||||||
#' | ||||||
#' @export | ||||||
#' @examples | ||||||
#' EX <- tibble::tribble( | ||||||
#' ~patient_number, ~EX_ST_DT, ~EX_ST_TM, | ||||||
#' "001", "26-04-2022", "10:20", | ||||||
#' "001", "25-04-2022", "10:15", | ||||||
#' "001", "25-04-2022", "10:19", | ||||||
#' "002", "26-05-2022", "UNK:UNK", | ||||||
#' "002", "26-05-2022", "05:59" | ||||||
#' ) | ||||||
#' | ||||||
#' min <- cal_min_max_date(EX, | ||||||
#' "EX_ST_DT", | ||||||
#' "EX_ST_TM", | ||||||
#' val_type = "min", | ||||||
#' date_format = "dd-mmm-yyyy", | ||||||
#' time_format = "H:M" | ||||||
#' ) | ||||||
#' | ||||||
#' max <- cal_min_max_date(EX, | ||||||
#' "EX_ST_DT", | ||||||
#' "EX_ST_TM", | ||||||
#' val_type = "max", | ||||||
#' date_format = "dd-mmm-yyyy", | ||||||
#' time_format = "H:M" | ||||||
#' ) | ||||||
#' | ||||||
cal_min_max_date <- function(raw_dataset, | ||||||
date_variable, | ||||||
time_variable, | ||||||
val_type = "min", | ||||||
date_format, | ||||||
time_format) { | ||||||
# Check if date is present in the raw data frame | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we need to check if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function is called by the oak_cal_ref_dates and parameters are read from the dataframe hence feel that this assertion is not required. |
||||||
date_not_in_data <- !(date_variable %in% colnames(raw_dataset)) | ||||||
ShiyuC marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
||||||
# Check if time variable is used and present in the raw data frame | ||||||
time_not_in_data <- !(time_variable %in% colnames(raw_dataset)) && !is.na(time_variable) | ||||||
ShiyuC marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
||||||
# If date/time variables not present return the empty data frame | ||||||
if (date_not_in_data || time_not_in_data) { | ||||||
# Return Empty data frame with patient_number and datetime columns | ||||||
empty_df <- stats::setNames( | ||||||
data.frame(matrix(ncol = 2L, nrow = 0L)), | ||||||
c("patient_number", "datetime") | ||||||
) | ||||||
cli::cli_warn(paste( | ||||||
"Date variable", date_variable, "or Time variable", time_variable, | ||||||
"not present in source data" | ||||||
)) | ||||||
return(empty_df) | ||||||
} | ||||||
|
||||||
fin_df <- raw_dataset | ||||||
# Time variable is not used then use only date | ||||||
if (is.na(time_variable)) { | ||||||
ShiyuC marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
fin_df$datetime <- create_iso8601(raw_dataset[[date_variable]], | ||||||
.format = date_format | ||||||
) | ||||||
} else { | ||||||
ShiyuC marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
# If both date and time variables are presen use both date and time | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated |
||||||
raw_dataset$date_time <- paste0( | ||||||
raw_dataset[[date_variable]], | ||||||
raw_dataset[[time_variable]] | ||||||
) | ||||||
format <- paste0(date_format, time_format) | ||||||
|
||||||
fin_df$datetime <- as.character(create_iso8601(raw_dataset$date_time, | ||||||
.format = format, | ||||||
.na = c( | ||||||
"UNK", "NA", "U", "unk", | ||||||
"u", "un", "UN" | ||||||
) | ||||||
)) | ||||||
} | ||||||
|
||||||
fin_df <- fin_df |> | ||||||
dplyr::select(c("patient_number", "datetime")) |> | ||||||
unique() | ||||||
|
||||||
fin_df <- fin_df |> | ||||||
dplyr::mutate(date_time = datetime) |> | ||||||
tidyr::separate( | ||||||
date_time, | ||||||
sep = "-|T|:", | ||||||
into = c("year", "month", "day", "hour", "minute"), | ||||||
fill = "right", | ||||||
extra = "drop" | ||||||
) |> | ||||||
list() |> | ||||||
stats::setNames("x") |> | ||||||
with(replace(x, x == "UNK", NA)) |> | ||||||
list() |> | ||||||
stats::setNames("x") |> | ||||||
with(replace(x, x == "", NA)) | ||||||
|
||||||
if (val_type == "min") { | ||||||
final_df <- fin_df |> | ||||||
dplyr::arrange(year, month, day, hour, minute) | ||||||
} else { | ||||||
final_df <- fin_df |> | ||||||
dplyr::arrange(dplyr::desc(year), dplyr::desc(month), dplyr::desc(day), dplyr::desc(hour), dplyr::desc(minute)) | ||||||
} | ||||||
|
||||||
# Keep first appearance in the data frame since it is already sorted | ||||||
final_df <- final_df[!duplicated(final_df$patient_number), c("patient_number", "datetime")] | ||||||
|
||||||
final_df <- final_df |> dplyr::filter(!is.na(datetime)) | ||||||
|
||||||
return(final_df) | ||||||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,6 @@ | ||
utils::globalVariables(c( | ||
"USUBJID", "VISIT", "dom_dt", "dom_tm", "ref_dt", | ||
"ref_tm" | ||
"ref_tm", "datetime", "date_time", "year", "month", | ||
"day", "hour", "minute", "dataset_name", "date_var", | ||
"dformat", "tformat", "sdtm_var_name", "patient_number" | ||
)) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
#' Calculate Reference dates in ISO8601 character format. | ||
#' | ||
#' Populate RFSTDTC variable in demographic domain in ISO8601 character format. | ||
#' | ||
#' @description Derive RFSTDTC, RFENDTC, RFXENDTC, RFXSTDTC based on the input dates and time. | ||
#' | ||
#' | ||
#' @param ds_in Data frame. DM domain. | ||
#' @param der_var Character string. The reference date to be derived. | ||
#' @param min_max Minimum or Maximum date to be calculated based on the input. | ||
#' Default set to Minimum. Values should be min or max. | ||
#' @param ref_date_config_df Data frame which has the details of the variables to | ||
#' be used for the calculation of reference dates. | ||
#' Should has columns listed below: | ||
muzzama-1990 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
#' dataset_name : Name of the raw dataset. | ||
#' date_var : Date variable name from the raw dataset. | ||
#' time_var : Time variable name from the raw dataset. | ||
#' dformat : Format of the date collected in raw data. | ||
#' tformat: Format of the time collected in raw data. | ||
#' sdtm_var_name : Reference variable name. | ||
#' @param raw_source List contains all the raw datasets. | ||
#' @return DM data frame with the reference dates populated. | ||
#' @export | ||
#' @examples | ||
#' dm <- tibble::tribble( | ||
#' ~patient_number, ~USUBJID, ~SUBJID, ~SEX, | ||
#' "001", "XXXX-001", "001", "F", | ||
#' "002", "XXXX-002", "002", "M", | ||
#' "003", "XXXX-003", "003", "M" | ||
#' ) | ||
#' | ||
#' ref_date_config_df <- tibble::tribble( | ||
#' ~dataset_name, ~date_var, ~time_var, ~dformat, ~tformat, ~sdtm_var_name, | ||
#' "EX1", "EX_ST_DT1", "EX_ST_TM1", "dd-mm-yyyy", "H:M", "RFSTDTC", | ||
#' "EX2", "EX_ST_DT2", NA, "dd-mmm-yyyy", NA, "RFSTDTC", | ||
#' "EX1", "EX_EN_DT1", "EX_EN_TM1", "dd-mm-yyyy", "H:M", "RFENDTC", | ||
#' "EX2", "EX_ST_DT2", NA, "dd-mmm-yyyy", NA, "RFENDTC" | ||
#' ) | ||
#' | ||
#' EX1 <- tibble::tribble( | ||
#' ~patient_number, ~EX_ST_DT1, ~EX_EN_DT1, ~EX_ST_TM1, ~EX_EN_TM1, | ||
#' "001", "15-05-2023", "15-05-2023", "10:20", "11:00", | ||
#' "001", "15-05-2023", "15-05-2023", "9:15", "10:00", | ||
#' "001", "15-05-2023", "15-05-2023", "8:19", "09:00", | ||
#' "002", "02-10-2023", "02-10-2023", "UNK:UNK", NA, | ||
#' "002", "03-11-2023", "03-11-2023", "11:19", NA | ||
#' ) | ||
#' | ||
#' EX2 <- tibble::tribble( | ||
#' ~patient_number, ~EX_ST_DT2, | ||
#' "001", "11-JUN-2023", | ||
#' "002", "24-OCT-2023", | ||
#' "002", "25-JUL-2023", | ||
#' "002", "30-OCT-2023", | ||
#' "002", "UNK-OCT-2023" | ||
#' ) | ||
#' | ||
#' raw_source <- list(EX1 = EX1, EX2 = EX2) | ||
#' | ||
#' dm_df <- oak_cal_ref_dates(dm, | ||
#' der_var = "RFSTDTC", | ||
#' min_max = "max", | ||
#' ref_date_config_df = ref_date_config_df, | ||
#' raw_source | ||
#' ) | ||
#' | ||
oak_cal_ref_dates <- function(ds_in = dm, | ||
der_var, | ||
min_max = "min", | ||
ref_date_config_df, | ||
raw_source) { | ||
# Check if ref_date_config_df is a data frame and has all required variables | ||
admiraldev::assert_data_frame(ref_date_config_df, required_vars = exprs( | ||
dataset_name, date_var, | ||
time_var, dformat, | ||
tformat, sdtm_var_name | ||
)) | ||
|
||
ShiyuC marked this conversation as resolved.
Show resolved
Hide resolved
|
||
ds_out <- data.frame() | ||
for (i in seq_along(ref_date_config_df$dataset_name)) { | ||
raw_dataset_name <- ref_date_config_df$dataset_name[i] | ||
date_variable <- ref_date_config_df$date_var[i] | ||
date_format <- ref_date_config_df$dformat[i] | ||
time_var <- ref_date_config_df$time_var[i] | ||
time_format <- ref_date_config_df$tformat[i] | ||
sdtm_var <- ref_date_config_df$sdtm_var_name[i] | ||
raw_dataset <- raw_source[[raw_dataset_name]] | ||
|
||
if (der_var == sdtm_var) { | ||
ds_out1 <- cal_min_max_date( | ||
raw_dataset = raw_dataset, | ||
date_variable = date_variable, | ||
time_variable = time_var, | ||
date_format = date_format, | ||
time_format = time_format, | ||
val_type = min_max | ||
) | ||
ds_out <- rbind(ds_out, ds_out1) | ||
} | ||
} | ||
|
||
if (min_max == "min") { | ||
df_final <- ds_out |> dplyr::arrange(patient_number, datetime) | ||
} else { | ||
df_final <- ds_out |> dplyr::arrange(dplyr::desc(datetime)) | ||
} | ||
|
||
df_final <- df_final[!duplicated(df_final$patient_number), c("patient_number", "datetime")] | ||
colnames(df_final)[colnames(df_final) == "datetime"] <- der_var | ||
|
||
dm <- dplyr::left_join(ds_in, y = df_final, by = "patient_number") | ||
return(dm) | ||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.