Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gdr 2516 #149

Merged
merged 14 commits into from
May 17, 2024
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ Type: Package
Package: gDRcore
Title: Processing functions and interface to process and analyze drug
dose-response data
Version: 1.1.20
Date: 2024-05-08
Version: 1.1.21
Date: 2024-05-13
Authors@R: c(
person("Bartosz", "Czech", , "[email protected]", role = "aut",
comment = c(ORCID = "0000-0002-9908-3007")),
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ export(create_and_normalize_SE)
export(data_model)
export(fit_SE)
export(fit_SE.combinations)
export(get_cellline_annotation_from_dt)
export(get_default_nested_identifiers)
export(get_drug_annotation_from_dt)
export(grr_matches)
export(identify_data_type)
export(identify_keys)
Expand Down
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
## gDRcore 1.1.21 - 2024-05-13
* add functions for retrieving annotation data from input dt

## gDRcore 1.1.20 - 2024-05-08
* fix typo

Expand Down
266 changes: 173 additions & 93 deletions R/add_annotation.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
#' used in the annotation file
#' @param fname string with file name with annotation
#' @param fill string indicating how unknown cell lines should be filled in the DB
#' @param annotationPackage string indication name of the package containing cellline annotation
#' @param externalSource string with path to external file with annotation data; by default it checks
#' @param annotation_package string indication name of the package containing cellline annotation
#' @param external_source string with path to external file with annotation data; by default it checks
#' 'GDR_CELLLINE_ANNOTATION' env var. This file should contain columns such as gnumber, drug_name and drug_moa
#' @keywords annotation
#' @details
Expand Down Expand Up @@ -42,35 +42,40 @@ add_CellLine_annotation <- function(
),
fname = "cell_lines.csv",
fill = "unknown",
annotationPackage = if ("gDRinternal" %in% .packages(all.available = TRUE)) {
annotation_package = if ("gDRinternal" %in% .packages(all.available = TRUE)) {
"gDRinternal"
} else {
"gDRtestData"
},
externalSource = Sys.getenv("GDR_CELLLINE_ANNOTATION")
external_source = Sys.getenv("GDR_CELLLINE_ANNOTATION")
) {

# Assertions:
checkmate::assert_data_table(dt_metadata)
checkmate::assert_string(fill, null.ok = TRUE)
checkmate::assert_string(externalSource, null.ok = TRUE)
checkmate::assert(checkmate::check_string(external_source, null.ok = TRUE),
checkmate::check_data_table(external_source,
col.names = "named"))


cellline <- gDRutils::get_env_identifiers("cellline")
cellline_name <- gDRutils::get_env_identifiers("cellline_name")
add_clid <- gDRutils::get_header("add_clid")

if (all(c(cellline, cellline_name, add_clid) %in% names(dt_metadata))) {
return(dt_metadata)
dt_metadata[, (unlist(add_clid)) := NULL]
}

CLs_info <- if (nchar(externalSource) && file.exists(externalSource)) {
data.table::fread(externalSource)
} else if (annotationPackage == "gDRtestData") {
CLs_info <- if (is.character(external_source) && nchar(external_source) && file.exists(external_source)) {
data.table::fread(external_source)
} else if (data.table::is.data.table(external_source)) {
external_source
} else if (annotation_package == "gDRtestData") {
data.table::fread(
system.file("annotation_data", fname, package = annotationPackage), header = TRUE
system.file("annotation_data", fname, package = annotation_package), header = TRUE
)
} else {
eval(parse(text = paste0(annotationPackage,
eval(parse(text = paste0(annotation_package,
"::",
"get_cell_line_annotations")))(dt_metadata[[cellline]])
}
Expand Down Expand Up @@ -124,8 +129,8 @@ add_CellLine_annotation <- function(
#' @param dt_metadata data.table with metadata
#' @param fname string with file name with annotation
#' @param fill string indicating how unknown cell lines should be filled in the DB
#' @param annotationPackage string indication name of the package containing drug annotation
#' @param externalSource string with path to external file with annotation data; by default it checks
#' @param annotation_package string indication name of the package containing drug annotation
#' @param external_source string with path to external file with annotation data; by default it checks
#' 'GDR_DRUG_ANNOTATION' env var. This file should contain columns such as gnumber, drug_name, and drug_moa
#' @keywords annotation
#' @details The logic of adding drug annotation for dt_metadata
Expand All @@ -143,19 +148,20 @@ add_Drug_annotation <- function(
dt_metadata,
fname = "drugs.csv",
fill = "unknown",
annotationPackage = if ("gDRinternal" %in% .packages(all.available = TRUE)) {
annotation_package = if ("gDRinternal" %in% .packages(all.available = TRUE)) {
"gDRinternal"
} else {
"gDRtestData"
},
externalSource = Sys.getenv("GDR_DRUG_ANNOTATION")
external_source = Sys.getenv("GDR_DRUG_ANNOTATION")
) {

# Assertions:
checkmate::assert_data_table(dt_metadata)
checkmate::assert_string(fill, null.ok = TRUE)
checkmate::assert_string(externalSource, null.ok = TRUE)

checkmate::assert(checkmate::check_string(external_source, null.ok = TRUE),
checkmate::check_data_table(external_source,
col.names = "named"))
nrows_df <- nrow(dt_metadata)

drug <- unlist(gDRutils::get_env_identifiers(c(
Expand All @@ -171,86 +177,88 @@ add_Drug_annotation <- function(
if (all(c(drug[["drug"]],
drug_name[["drug_name"]],
drug_moa[["drug_moa"]]) %in% names(dt_metadata))) {
return(dt_metadata)
} else {
drug_full_identifiers <- c(
drug[drug_idx],
drug_name[drug_idx],
drug_moa[drug_idx]
dt_metadata[, (unlist(intersect(c(drug_name, drug_moa), names(dt_metadata)))) := NULL]
}
drug_full_identifiers <- c(
drug[drug_idx],
drug_name[drug_idx],
drug_moa[drug_idx]
)
drug_ids <- stringr::str_extract(names(drug_full_identifiers), "[0-9]")
drug_ids <- ifelse(is.na(drug_ids), 1, drug_ids)
drug_identifiers_list <- split(drug_full_identifiers, drug_ids)
names(drug_identifiers_list) <- drug[drug_idx]

# Read local drugs annotations

Drug_info <- if (is.character(external_source) && nchar(external_source) && file.exists(external_source)) {
data.table::fread(external_source)
} else if (data.table::is.data.table(external_source)) {
external_source
} else if (annotation_package == "gDRtestData") {
data.table::fread(
system.file("annotation_data", fname, package = annotation_package), header = TRUE
)
drug_ids <- stringr::str_extract(names(drug_full_identifiers), "[0-9]")
drug_ids <- ifelse(is.na(drug_ids), 1, drug_ids)
drug_identifiers_list <- split(drug_full_identifiers, drug_ids)
names(drug_identifiers_list) <- drug[drug_idx]

# Read local drugs annotations
Drug_info <- if (nchar(externalSource) && file.exists(externalSource)) {
data.table::fread(externalSource)
} else if (annotationPackage == "gDRtestData") {
data.table::fread(
system.file("annotation_data", fname, package = annotationPackage), header = TRUE
)
} else {
eval(parse(text = paste0(annotationPackage,
"::",
"get_drug_annotations")))()
}


Drug_info <- Drug_info[, c("gnumber", "drug_name", "drug_moa"), with = FALSE]
data.table::setnames(Drug_info, c("gnumber", "drug_name", "drug_moa"),
c("drug", "drug_name", "drug_moa"))
drugsTreated <- drugsTreated[!drugsTreated %in% untreated_tag]
validatedDrugs <-
remove_drug_batch(drugsTreated) %in% remove_drug_batch(Drug_info[["drug"]])
#### function should be parallelizeized
missingTblDrugs <- NULL
if (!is.null(fill) && any(!validatedDrugs)) {
missingTblDrugs <- data.table::data.table(
drug = remove_drug_batch(drugsTreated[!validatedDrugs]),
drug_name = drugsTreated[!validatedDrugs],
drug_moa = fill
)
}

if (nrow(Drug_info) == 0) {
drug_name <- intersect(drug_name, colnames(dt_metadata))
drug <- intersect(drug, colnames(dt_metadata))
dt_metadata[, (drug_name) := get(drug)]
return(dt_metadata)
}
} else {
eval(parse(text = paste0(annotation_package,
"::",
"get_drug_annotations")))()
}

Drug_info <- rbind(Drug_info, missingTblDrugs)

Drug_info$drug <- remove_drug_batch(Drug_info$drug)
Drug_info <-
rbind(data.table::data.table(
drug = untreated_tag,
drug_name = untreated_tag,
drug_moa = untreated_tag
),
Drug_info)
Drug_info <- Drug_info[!duplicated(Drug_info[["drug"]]), ]
if (any(!remove_drug_batch(drugsTreated) %in% Drug_info$drug) &&
!is.null(missingTblDrugs)) {
Drug_info <- rbind(Drug_info, stats::setNames(
missingTblDrugs[!(remove_drug_batch(missingTblDrugs$drug) %in%
Drug_info$drug), ],
names(Drug_info)
))
}
for (drug_idf in names(drug_identifiers_list)) {
colnames(Drug_info) <- drug_identifiers_list[[drug_idf]]
dt_metadata$batch <- dt_metadata[[drug_idf]]
dt_metadata[[drug_idf]] <- remove_drug_batch(dt_metadata[[drug_idf]])
req_col <- c(drug_idf, setdiff(colnames(dt_metadata), colnames(Drug_info)))
dt_metadata <- Drug_info[dt_metadata[, req_col, with = FALSE], on = drug_idf]
dt_metadata[[drug_idf]] <- dt_metadata$batch
dt_metadata$batch <- NULL
}
stopifnot(nrows_df == nrow(dt_metadata))
dt_metadata
Drug_info <- Drug_info[, c("gnumber", "drug_name", "drug_moa"), with = FALSE]
data.table::setnames(Drug_info, c("gnumber", "drug_name", "drug_moa"),
c("drug", "drug_name", "drug_moa"))
drugsTreated <- drugsTreated[!drugsTreated %in% untreated_tag]
validatedDrugs <-
remove_drug_batch(drugsTreated) %in% remove_drug_batch(Drug_info[["drug"]])
#### function should be parallelizeized
missingTblDrugs <- NULL
if (!is.null(fill) && any(!validatedDrugs)) {
missingTblDrugs <- data.table::data.table(
drug = remove_drug_batch(drugsTreated[!validatedDrugs]),
drug_name = drugsTreated[!validatedDrugs],
drug_moa = fill
)
}

if (nrow(Drug_info) == 0) {
drug_name <- intersect(drug_name, colnames(dt_metadata))
drug <- intersect(drug, colnames(dt_metadata))
dt_metadata[, (drug_name) := get(drug)]
return(dt_metadata)
}

Drug_info <- rbind(Drug_info, missingTblDrugs)

Drug_info$drug <- remove_drug_batch(Drug_info$drug)
Drug_info <-
rbind(data.table::data.table(
drug = untreated_tag,
drug_name = untreated_tag,
drug_moa = untreated_tag
),
Drug_info)
Drug_info <- Drug_info[!duplicated(Drug_info[["drug"]]), ]
if (any(!remove_drug_batch(drugsTreated) %in% Drug_info$drug) &&
!is.null(missingTblDrugs)) {
Drug_info <- rbind(Drug_info, stats::setNames(
missingTblDrugs[!(remove_drug_batch(missingTblDrugs$drug) %in%
Drug_info$drug), ],
names(Drug_info)
))
}
for (drug_idf in names(drug_identifiers_list)) {
colnames(Drug_info) <- drug_identifiers_list[[drug_idf]]
dt_metadata$batch <- dt_metadata[[drug_idf]]
dt_metadata[[drug_idf]] <- remove_drug_batch(dt_metadata[[drug_idf]])
req_col <- c(drug_idf, setdiff(colnames(dt_metadata), colnames(Drug_info)))
dt_metadata <- Drug_info[dt_metadata[, req_col, with = FALSE], on = drug_idf]
dt_metadata[[drug_idf]] <- dt_metadata$batch
dt_metadata$batch <- NULL
}
stopifnot(nrows_df == nrow(dt_metadata))
dt_metadata
}

#' Remove batch from Gnumber
Expand All @@ -266,3 +274,75 @@ add_Drug_annotation <- function(
remove_drug_batch <- function(drug) {
gsub("\\.[0-9]+.*", "", drug)
}


#' Retrieve the drug annotation from the annotated dt input
#'
#' @param dt annotated data.table
#'
#' @return data.table with drug annotation
#' @export
#'
#' @examples
#' dt <- data.table::data.table(Gnumber = "A",
#' DrugName = "drugA",
#' drug_moa = "drug_moa_A")
#' get_drug_annotation_from_dt(dt)
get_drug_annotation_from_dt <- function(dt) {
checkmate::assert_data_table(dt)
drug_cols <- intersect(gDRutils::get_env_identifiers()
[grep("drug", names(gDRutils::get_env_identifiers()))],
names(dt))
dt_drug <- dt[, unlist(drug_cols), with = FALSE]
dt_long <- data.table::melt(dt_drug,
measure.vars = patterns(gsub("[0-9]",
".",
paste0("^",
unlist(drug_cols[c("drug",
"drug_name",
"drug_moa")])))),
value.name = unlist(drug_cols[c("drug",
"drug_name",
"drug_moa")]))
dt_long[, "variable" := NULL]
data.table::setnames(dt_long,
unlist(drug_cols[c("drug",
"drug_name",
"drug_moa")]),
c("gnumber", "drug_name", "drug_moa"))
unique_dt <- unique(dt_long)
unique_dt[!unique_dt$gnumber %in% gDRutils::get_env_identifiers("untreated_tag"), ]
}


#' Retrieve the cell line annotation from the annotated dt input
#'
#' @param dt annotated data.table
#'
#' @return data.table with cell line annotation
#' @export
#'
#' @examples
#' dt <- data.table::data.table(Gnumber = "A",
#' clid = "CL123",
#' CellLineName = "cl name",
#' Tissue = "Bone",
#' parental_identifier = "some cl",
#' subtype = "cortical",
#' ReferenceDivisionTime = 5)
#' get_cellline_annotation_from_dt(dt)
get_cellline_annotation_from_dt <- function(dt) {
checkmate::assert_data_table(dt)
cell_cols <- c(gDRutils::get_env_identifiers("cellline"),
gDRutils::get_header("add_clid"))
cell_dt <- dt[, unlist(cell_cols), with = FALSE]
data.table::setnames(cell_dt,
unlist(cell_cols),
c("cell_line_identifier",
"cell_line_name",
"primary_tissue",
"parental_identifier",
"subtype",
"doubling_time"))
unique(cell_dt)
}
1 change: 1 addition & 0 deletions R/packages.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

# data.table awareness
.datatable.aware <- TRUE
patterns <- data.table:::patterns

#' onload function
#'
Expand Down
Loading
Loading