Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#92 Create function to derive reference dates in DM domain #98

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
2 changes: 2 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ Authors@R: c(
person("Pattern Institute", role = c("cph", "fnd")),
person("F. Hoffmann-La Roche AG", role = c("cph", "fnd")),
person("Pfizer Inc", role = c("cph", "fnd")),
person("Mohsin", "Uzzama", email = "[email protected]",
role = "aut"),
person("Transition Technologies Science", role = c("cph", "fnd"))
)
Maintainer: Rammprasad Ganapathy <[email protected]>
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ export("%.>%")
export(assign_ct)
export(assign_datetime)
export(assign_no_ct)
export(cal_min_max_date)
export(condition_add)
export(create_iso8601)
export(ct_map)
Expand All @@ -20,6 +21,7 @@ export(fmt_cmp)
export(generate_oak_id_vars)
export(hardcode_ct)
export(hardcode_no_ct)
export(oak_cal_ref_dates)
export(oak_id_vars)
export(problems)
export(read_ct_spec)
Expand Down
127 changes: 127 additions & 0 deletions R/cal_min_max_date.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#' Calculate minimum and maximum date and time in the dataframe
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
#' Calculate minimum and maximum date and time in the dataframe
#' Calculate minimum and maximum date and time in the data frame

#'
#' @description This function derives the earliest/latest ISO8601 datetime
#'
#' @param raw_dataset Raw source data frame
#' @param date_variable Single character string. Name of the date variable
#' @param time_variable Single character string. Name of the time variable
#' @param val_type Single character string determining whether to look
#' for the earliest or the latest datetime combination. Permitted values:
#' "min", "max". Default to "min".
#' @param date_format Format of source date variable
#' @param time_format Format of source time variable
#'
#' @return Data frame with 2 columns: unique patient_number and datetime variable
#' column storing the earliest/latest datetime.
#'
#' @export
#' @examples
#' EX <- tibble::tribble(
#' ~patient_number, ~EX_ST_DT, ~EX_ST_TM,
#' "001", "26-04-2022", "10:20",
#' "001", "25-04-2022", "10:15",
#' "001", "25-04-2022", "10:19",
#' "002", "26-05-2022", "UNK:UNK",
#' "002", "26-05-2022", "05:59"
#' )
#'
#' min <- cal_min_max_date(EX,
#' "EX_ST_DT",
#' "EX_ST_TM",
#' val_type = "min",
#' date_format = "dd-mmm-yyyy",
#' time_format = "H:M"
#' )
#'
#' max <- cal_min_max_date(EX,
#' "EX_ST_DT",
#' "EX_ST_TM",
#' val_type = "max",
#' date_format = "dd-mmm-yyyy",
#' time_format = "H:M"
#' )
#'
cal_min_max_date <- function(raw_dataset,
date_variable,
time_variable,
val_type = "min",
date_format,
time_format) {
# Check if date is present in the raw data frame
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need to check if date_variable, time_variable are a string of single character, in case users input a vector of variable names.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is called by the oak_cal_ref_dates and parameters are read from the dataframe hence feel that this assertion is not required.

date_not_in_data <- !(date_variable %in% colnames(raw_dataset))
ShiyuC marked this conversation as resolved.
Show resolved Hide resolved

# Check if time variable is used and present in the raw data frame
time_not_in_data <- !(time_variable %in% colnames(raw_dataset)) && !is.na(time_variable)
ShiyuC marked this conversation as resolved.
Show resolved Hide resolved

# If date/time variables not present return the empty data frame
if (date_not_in_data || time_not_in_data) {
# Return Empty data frame with patient_number and datetime columns
empty_df <- stats::setNames(
data.frame(matrix(ncol = 2L, nrow = 0L)),
c("patient_number", "datetime")
)
cli::cli_warn(paste(
"Date variable", date_variable, "or Time variable", time_variable,
"not present in source data"
))
return(empty_df)
}

fin_df <- raw_dataset
# Time variable is not used then use only date
if (is.na(time_variable)) {
ShiyuC marked this conversation as resolved.
Show resolved Hide resolved
fin_df$datetime <- create_iso8601(raw_dataset[[date_variable]],
.format = date_format
)
} else {
ShiyuC marked this conversation as resolved.
Show resolved Hide resolved
# If both date and time variables are presen use both date and time
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# If both date and time variables are presen use both date and time
# If both date and time variables are present use both date and time

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated

raw_dataset$date_time <- paste0(
raw_dataset[[date_variable]],
raw_dataset[[time_variable]]
)
format <- paste0(date_format, time_format)

fin_df$datetime <- as.character(create_iso8601(raw_dataset$date_time,
.format = format,
.na = c(
"UNK", "NA", "U", "unk",
"u", "un", "UN"
)
))
}

fin_df <- fin_df |>
dplyr::select(c("patient_number", "datetime")) |>
unique()

fin_df <- fin_df |>
dplyr::mutate(date_time = datetime) |>
tidyr::separate(
date_time,
sep = "-|T|:",
into = c("year", "month", "day", "hour", "minute"),
fill = "right",
extra = "drop"
) |>
list() |>
stats::setNames("x") |>
with(replace(x, x == "UNK", NA)) |>
list() |>
stats::setNames("x") |>
with(replace(x, x == "", NA))

if (val_type == "min") {
final_df <- fin_df |>
dplyr::arrange(year, month, day, hour, minute)
} else {
final_df <- fin_df |>
dplyr::arrange(dplyr::desc(year), dplyr::desc(month), dplyr::desc(day), dplyr::desc(hour), dplyr::desc(minute))
}

# Keep first appearance in the data frame since it is already sorted
final_df <- final_df[!duplicated(final_df$patient_number), c("patient_number", "datetime")]

final_df <- final_df |> dplyr::filter(!is.na(datetime))

return(final_df)
}
4 changes: 3 additions & 1 deletion R/globals.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
utils::globalVariables(c(
"USUBJID", "VISIT", "dom_dt", "dom_tm", "ref_dt",
"ref_tm"
"ref_tm", "datetime", "date_time", "year", "month",
"day", "hour", "minute", "dataset_name", "date_var",
"dformat", "tformat", "sdtm_var_name", "patient_number"
))
113 changes: 113 additions & 0 deletions R/oak_cal_ref_dates.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#' Calculate Reference dates in ISO8601 character format.
#'
#' Populate RFSTDTC variable in demographic domain in ISO8601 character format.
#'
#' @description Derive RFSTDTC, RFENDTC, RFXENDTC, RFXSTDTC based on the input dates and time.
#'
#'
#' @param ds_in Data frame. DM domain.
#' @param der_var Character string. The reference date to be derived.
#' @param min_max Minimum or Maximum date to be calculated based on the input.
#' Default set to Minimum. Values should be min or max.
#' @param ref_date_config_df Data frame which has the details of the variables to
#' be used for the calculation of reference dates.
#' Should has columns listed below:
muzzama-1990 marked this conversation as resolved.
Show resolved Hide resolved
#' dataset_name : Name of the raw dataset.
#' date_var : Date variable name from the raw dataset.
#' time_var : Time variable name from the raw dataset.
#' dformat : Format of the date collected in raw data.
#' tformat: Format of the time collected in raw data.
#' sdtm_var_name : Reference variable name.
#' @param raw_source List contains all the raw datasets.
#' @return DM data frame with the reference dates populated.
#' @export
#' @examples
#' dm <- tibble::tribble(
#' ~patient_number, ~USUBJID, ~SUBJID, ~SEX,
#' "001", "XXXX-001", "001", "F",
#' "002", "XXXX-002", "002", "M",
#' "003", "XXXX-003", "003", "M"
#' )
#'
#' ref_date_config_df <- tibble::tribble(
#' ~dataset_name, ~date_var, ~time_var, ~dformat, ~tformat, ~sdtm_var_name,
#' "EX1", "EX_ST_DT1", "EX_ST_TM1", "dd-mm-yyyy", "H:M", "RFSTDTC",
#' "EX2", "EX_ST_DT2", NA, "dd-mmm-yyyy", NA, "RFSTDTC",
#' "EX1", "EX_EN_DT1", "EX_EN_TM1", "dd-mm-yyyy", "H:M", "RFENDTC",
#' "EX2", "EX_ST_DT2", NA, "dd-mmm-yyyy", NA, "RFENDTC"
#' )
#'
#' EX1 <- tibble::tribble(
#' ~patient_number, ~EX_ST_DT1, ~EX_EN_DT1, ~EX_ST_TM1, ~EX_EN_TM1,
#' "001", "15-05-2023", "15-05-2023", "10:20", "11:00",
#' "001", "15-05-2023", "15-05-2023", "9:15", "10:00",
#' "001", "15-05-2023", "15-05-2023", "8:19", "09:00",
#' "002", "02-10-2023", "02-10-2023", "UNK:UNK", NA,
#' "002", "03-11-2023", "03-11-2023", "11:19", NA
#' )
#'
#' EX2 <- tibble::tribble(
#' ~patient_number, ~EX_ST_DT2,
#' "001", "11-JUN-2023",
#' "002", "24-OCT-2023",
#' "002", "25-JUL-2023",
#' "002", "30-OCT-2023",
#' "002", "UNK-OCT-2023"
#' )
#'
#' raw_source <- list(EX1 = EX1, EX2 = EX2)
#'
#' dm_df <- oak_cal_ref_dates(dm,
#' der_var = "RFSTDTC",
#' min_max = "max",
#' ref_date_config_df = ref_date_config_df,
#' raw_source
#' )
#'
oak_cal_ref_dates <- function(ds_in = dm,
der_var,
min_max = "min",
ref_date_config_df,
raw_source) {
# Check if ref_date_config_df is a data frame and has all required variables
admiraldev::assert_data_frame(ref_date_config_df, required_vars = exprs(
dataset_name, date_var,
time_var, dformat,
tformat, sdtm_var_name
))

ShiyuC marked this conversation as resolved.
Show resolved Hide resolved
ds_out <- data.frame()
for (i in seq_along(ref_date_config_df$dataset_name)) {
raw_dataset_name <- ref_date_config_df$dataset_name[i]
date_variable <- ref_date_config_df$date_var[i]
date_format <- ref_date_config_df$dformat[i]
time_var <- ref_date_config_df$time_var[i]
time_format <- ref_date_config_df$tformat[i]
sdtm_var <- ref_date_config_df$sdtm_var_name[i]
raw_dataset <- raw_source[[raw_dataset_name]]

if (der_var == sdtm_var) {
ds_out1 <- cal_min_max_date(
raw_dataset = raw_dataset,
date_variable = date_variable,
time_variable = time_var,
date_format = date_format,
time_format = time_format,
val_type = min_max
)
ds_out <- rbind(ds_out, ds_out1)
}
}

if (min_max == "min") {
df_final <- ds_out |> dplyr::arrange(patient_number, datetime)
} else {
df_final <- ds_out |> dplyr::arrange(dplyr::desc(datetime))
}

df_final <- df_final[!duplicated(df_final$patient_number), c("patient_number", "datetime")]
colnames(df_final)[colnames(df_final) == "datetime"] <- der_var

dm <- dplyr::left_join(ds_in, y = df_final, by = "patient_number")
return(dm)
}
10 changes: 10 additions & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,19 @@ reference:
- dtc_formats
- problems

- title: Calculation of reference dates in DM
contents:
- oak_cal_ref_dates

- title: Explicit dot pipe operator
desc: A simple alternative to `%>% {...}`
contents:
- "%.>%"

- title: Calculation of minimum/maximum ISO8601 dates
contents:
- cal_min_max_date

authors:
Ramiro Magno:
href: https://www.pattern.institute/team/rmagno/
Expand All @@ -63,3 +71,5 @@ authors:
href: https://www.linkedin.com/in/edgar-manukyan-20987927
Shiyu Chen:
href: https://www.linkedin.com/in/shiyu-chen-55a55410a/
Mohsin Uzzama:
href: https://www.linkedin.com/in/mohsin-uzzama-34066741/
64 changes: 64 additions & 0 deletions man/cal_min_max_date.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading