Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gdr 2682 #132

Merged
merged 13 commits into from
Sep 30, 2024
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: gDRutils
Type: Package
Title: A package with helper functions for processing drug response data
Version: 1.3.12
Date: 2024-09-04
Version: 1.3.13
Date: 2024-09-16
Authors@R: c(person("Bartosz", "Czech", role=c("aut"),
comment = c(ORCID = "0000-0002-9908-3007")),
person("Arkadiusz", "Gladki", role=c("cre", "aut"), email="[email protected]",
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,9 @@ export(set_SE_processing_metadata)
export(set_constant_fit_params)
export(set_env_identifier)
export(set_unique_cl_names)
export(set_unique_cl_names_dt)
export(set_unique_drug_names)
export(set_unique_drug_names_dt)
export(set_unique_identifiers)
export(shorten_normalization_type_name)
export(split_SE_components)
Expand Down
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
## gDRutils 1.3.13 - 2024-09-16
* add functions set_unique_cl_names_dt and set_unique_drug_names_dt

## gDRutils 1.3.12 - 2024-09-04
* remove hack with checkDimnames

Expand Down
102 changes: 90 additions & 12 deletions R/standardize_MAE.R
Original file line number Diff line number Diff line change
Expand Up @@ -298,21 +298,64 @@ set_unique_cl_names <- function(se) {
checkmate::assert_class(se, "SummarizedExperiment")

col_data <- SummarizedExperiment::colData(se)
col_data_new <- set_unique_cl_names_dt(col_data)
SummarizedExperiment::colData(se) <- col_data_new

se
}

#' Set Unique Parental Identifiers in table
#'
#' This function sets the `CellLineName` field in
#' `colData` to be unique by appending the `clid` in parentheses for duplicates.
#'
#' @param col_data data.table or DFrame with col data
#' @param sep string with separator added before suffix
#' @return fixed input table with unique `CellLineName` in `colData`.
#' @examples
#' col_data <- S4Vectors::DataFrame(CellLineName = c("ID1", "ID1"), clid = c("C1", "C2"))
#' col_data <- set_unique_cl_names_dt(col_data)
#' @export
#' @keywords standardize_MAE
#'
set_unique_cl_names_dt <- function(col_data, sep = " ") {
stopifnot(any(inherits(col_data, "data.table") || inherits(col_data, "DFrame")))

cellline_name <- get_env_identifiers("cellline_name")
clid <- get_env_identifiers("cellline")

if (!is.null(col_data[[cellline_name]])) {
duplicated_ids <- col_data[[cellline_name]][duplicated(col_data[[cellline_name]])]
unique_col_names <- c(unlist(get_default_identifiers()[
c("cellline_name", "drug_name", "drug_name2",
"concentration2", "duration", "data_source")
]), "normalization_type")
unique_col_names <- intersect(unique_col_names, names(col_data))
unique_col_names_clid <- c(unique_col_names, get_default_identifiers()$cellline)
if (data.table::is.data.table(col_data)) {
duplicated_ids <- col_data[[cellline_name]][duplicated(col_data, by = unique_col_names)]
duplicated_ids_with_clid <- col_data[[cellline_name]][duplicated(col_data, by = unique_col_names_clid)]
} else {
duplicated_ids <- col_data[[cellline_name]][duplicated(col_data[unique_col_names])]
duplicated_ids_with_clid <- col_data[[cellline_name]][duplicated(col_data[unique_col_names_clid])]
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is there a different procedure for the colData of an SE and for a data.table?
(I understand that colData carries no information about the drug, but this is misleading: the same input content produces different output depending on the format when duplicated cell line names are present.)

>   dt <- data.table::data.table(
   DrugName = c("DrugA", "DrugB", "DrugC", "DrugD", "DrugC", "DrugD"), 
   Gnumber = c("G1", "G2", "G3", "G4", "G3", "G4"),
   CellLineName = c("ID1", "ID1", "ID2", "ID2", "ID2", "ID2"), 
   clid = c("C1", "C2", "C3", "C4", "C5", "C6")
  )
>   res_dt <- set_unique_cl_names_dt(dt)
> res_dt
   DrugName Gnumber CellLineName   clid
     <char>  <char>       <char> <char>
1:    DrugA      G1          ID1     C1
2:    DrugB      G2          ID1     C2
3:    DrugC      G3     ID2 (C3)     C3
4:    DrugD      G4     ID2 (C4)     C4
5:    DrugC      G3     ID2 (C5)     C5
6:    DrugD      G4     ID2 (C6)     C6

  dt <- S4Vectors::DataFrame(
    DrugName = c("DrugA", "DrugB", "DrugC", "DrugD", "DrugC", "DrugD"), 
    Gnumber = c("G1", "G2", "G3", "G4", "G3", "G4"),
    CellLineName = c("ID1", "ID1", "ID2", "ID2", "ID2", "ID2"), 
    clid = c("C1", "C2", "C3", "C4", "C5", "C6")
  )
>   res_S4 <- set_unique_cl_names_dt(dt)
> res_S4
DataFrame with 6 rows and 4 columns
     DrugName     Gnumber CellLineName        clid
  <character> <character>  <character> <character>
1       DrugA          G1     ID1 (C1)          C1
2       DrugB          G2     ID1 (C2)          C2
3       DrugC          G3     ID2 (C3)          C3
4       DrugD          G4     ID2 (C4)          C4
5       DrugC          G3     ID2 (C5)          C5
6       DrugD          G4     ID2 (C6)          C6

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see your point, but in my opinion it's a bit pointless. From a practical point of view, a DataFrame is only used for the colData or rowData of an SE, where such a situation cannot occur. I don't see the need to complicate this logic.
The more complex logic for data.table objects is required because of the specifics of that data.

Copy link
Contributor

@j-smola j-smola Sep 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO it is now complicated and inconsistent.
I would vote for the function always returning the same result, regardless of format. The user may want to use this feature in a context other than within the application.

The only change needed: instead of checking the format of the col_data input, just check whether unique_col_names is only "CellLineName".
If so, just add the suffix; if not, check the other columns and add the suffix accordingly.
You already have that code written.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but `duplicated` works differently for data.table and DataFrame.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm on the same page as @j-smola. It would be great to have consistent logic regardless of the data format.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. Added a fix and a test.

duplicated_ids <- setdiff(duplicated_ids, duplicated_ids_with_clid)

if (length(duplicated_ids) > 0) {
for (dup_id in unique(duplicated_ids)) {
dup_indices <- which(col_data[[cellline_name]] == dup_id)
col_data[[cellline_name]][dup_indices] <- paste0(col_data[[cellline_name]][dup_indices],
" (", col_data[[clid]][dup_indices], ")")
}
SummarizedExperiment::colData(se) <- col_data
for (dup_id in unique(duplicated_ids)) {
dup_indices <- which(col_data[[cellline_name]] == dup_id)
col_data[[cellline_name]][dup_indices] <-
paste0(
col_data[[cellline_name]][dup_indices],
sep,
"(",
col_data[[clid]][dup_indices],
")"
)
}
}
}
return(se)

col_data
}

#' Set Unique Drug Names
Expand All @@ -337,6 +380,36 @@ set_unique_drug_names <- function(se) {
checkmate::assert_class(se, "SummarizedExperiment")

row_data <- SummarizedExperiment::rowData(se)
row_data_new <- set_unique_drug_names_dt(row_data)

SummarizedExperiment::rowData(se) <- row_data_new
se
}


#' Set Unique Drug Names in table
#'
#' This function sets the `DrugName`, `DrugName_2`, and `DrugName_3` fields in
#' `rowData` to be unique by appending the corresponding `Gnumber`, `Gnumber_2`,
#' and `Gnumber_3` in parentheses for duplicates.
#'
#' @param row_data data.table or DFrame with row data
#' @param sep string with separator added before suffix
#' @return fixed input table with unique `DrugName` fields in `rowData`.
#' @examples
#' row_data <- S4Vectors::DataFrame(
#' DrugName = c("DrugA", "DrugA", "DrugB"),
#' Gnumber = c("G1", "G2", "G5"),
#' DrugName_2 = c("DrugC", "DrugC", "DrugD"),
#' Gnumber_2 = c("G3", "G4", "G5")
#' )
#' row_data <- set_unique_drug_names_dt(row_data)
#' @export
#' @keywords standardize_MAE
#'
set_unique_drug_names_dt <- function(row_data, sep = " ") {
stopifnot(any(inherits(row_data, "data.table") || inherits(row_data, "DFrame")))

drug_columns <- intersect(unlist(get_env_identifiers(c("drug_name", "drug_name2", "drug_name3"), simplify = FALSE)),
names(row_data))
gnumber_columns <- intersect(unlist(get_env_identifiers(c("drug", "drug2", "drug3"), simplify = FALSE)),
Expand All @@ -354,15 +427,20 @@ set_unique_drug_names <- function(se) {
for (dup_drug in unique_drugs) {
dup_indices <- which(row_data[[drug_col]] == dup_drug)
if (length(unique(row_data[[gnumber_col]][dup_indices])) > 1) {
row_data[[drug_col]][dup_indices] <- paste0(row_data[[drug_col]][dup_indices],
" (", row_data[[gnumber_col]][dup_indices], ")")
row_data[[drug_col]][dup_indices] <-
paste0(
row_data[[drug_col]][dup_indices],
sep,
"(",
row_data[[gnumber_col]][dup_indices],
")"
)
}
}
}
}

SummarizedExperiment::rowData(se) <- row_data
return(se)
row_data
}

#' Set Unique Identifiers in MultiAssayExperiment
Expand Down
25 changes: 25 additions & 0 deletions man/set_unique_cl_names_dt.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

31 changes: 31 additions & 0 deletions man/set_unique_drug_names_dt.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

95 changes: 95 additions & 0 deletions tests/testthat/test-standardize_MAE.R
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,100 @@ test_that("get_optional_rowdata_fields works as expected", {
})


# Exercises set_unique_cl_names_dt() and set_unique_drug_names_dt() on both
# S4Vectors::DataFrame and data.table inputs. Each fixture is passed to both
# functions so that every case also checks that the "other" function leaves
# the table untouched.
test_that("set_unique_cl_names_dt and set_unique_drug_names_dt works correctly", {

# DataFrame
## Duplicated CellLineName
# The table has no drug columns, so the drug-name function must return it
# unchanged, while the cell-line function appends the clid in parentheses.
col_data <- S4Vectors::DataFrame(CellLineName = c("ID1", "ID1"), clid = c("C1", "C2"))
res_1 <- set_unique_drug_names_dt(col_data)
res_2 <- set_unique_cl_names_dt(col_data)
expect_equal(col_data, res_1)
expect_false(identical(col_data, res_2))
expect_equal(c("ID1 (C1)", "ID1 (C2)"), res_2$CellLineName)

## Duplicated DrugName
# Mirror image of the case above: no cell-line columns, so only the drug-name
# function modifies the table (the Gnumber is appended in parentheses).
row_data <- S4Vectors::DataFrame(DrugName = c("DrugA", "DrugA"), Gnumber = c("G1", "G2"))
res_3 <- set_unique_drug_names_dt(row_data)
res_4 <- set_unique_cl_names_dt(row_data)
expect_false(identical(row_data, res_3))
expect_equal(row_data, res_4)
expect_equal(c("DrugA (G1)", "DrugA (G2)"), res_3$DrugName)

# data.table
## All different
# No duplicated names at all: both functions are expected to be no-ops.
dt <- data.table::data.table(
DrugName = c("DrugA", "DrugB", "DrugC", "DrugD"),
Gnumber = c("G1", "G2", "G3", "G4"),
CellLineName = c("ID1", "ID2", "ID3", "ID4"),
clid = c("C1", "C2", "C3", "C4")
)
res_5 <- set_unique_drug_names_dt(dt)
res_6 <- set_unique_cl_names_dt(dt)
expect_equal(res_5, dt)
expect_equal(res_6, dt)

## Duplicated CellLineName
# Every (DrugName, Gnumber) pairing is consistent, so the drug-name function
# must not change anything. The cell-line function must yield five distinct
# names; presumably "ID1" (whose two rows differ by drug) is kept as is and
# the four "ID2" rows each receive their clid suffix (1 + 4 = 5) -- TODO
# confirm against the implementation.
dt <- data.table::data.table(
DrugName = c("DrugA", "DrugB", "DrugC", "DrugD", "DrugC", "DrugD"),
Gnumber = c("G1", "G2", "G3", "G4", "G3", "G4"),
CellLineName = c("ID1", "ID1", "ID2", "ID2", "ID2", "ID2"),
clid = c("C1", "C2", "C3", "C4", "C5", "C6")
)
res_7 <- set_unique_drug_names_dt(dt)
res_8 <- set_unique_cl_names_dt(dt)
expect_equal(res_7, dt)
expect_false(identical(res_8, dt))
expect_equal(length(unique(res_8$CellLineName)), 5)

## Duplicated DrugName
# Each drug name maps to several Gnumbers (six distinct Gnumbers in total), so
# six distinct drug names are expected afterwards. Repeated cell line names
# here always carry the same clid, so the cell-line function must not change
# the table.
dt <- data.table::data.table(
DrugName = c("DrugA", "DrugA", "DrugB", "DrugB", "DrugB", "DrugB"),
Gnumber = c("G1", "G2", "G3", "G4", "G5", "G6"),
CellLineName = c("ID1", "ID2", "ID3", "ID4", "ID3", "ID4"),
clid = c("C1", "C2", "C3", "C4", "C3", "C4")
)
res_9 <- set_unique_drug_names_dt(dt)
res_10 <- set_unique_cl_names_dt(dt)
expect_false(identical(res_9, dt))
expect_equal(length(unique(res_9$DrugName)), 6)
expect_equal(res_10, dt)

## Duplicated both
# Both name columns contain duplicates with differing ids. Each function must
# disambiguate only its own column (4 distinct values) and leave the other
# column with its original 2 distinct values.
dt <- data.table::data.table(
DrugName = c("DrugA", "DrugA", "DrugB", "DrugB"),
Gnumber = c("G1", "G2", "G3", "G4"),
CellLineName = c("ID1", "ID1", "ID2", "ID2"),
clid = c("C1", "C2", "C3", "C4")
)
res_11 <- set_unique_drug_names_dt(dt)
res_12 <- set_unique_cl_names_dt(dt)
expect_false(identical(res_11, dt))
expect_equal(length(unique(res_11$DrugName)), 4)
expect_equal(length(unique(res_11$CellLineName)), 2)
expect_false(identical(res_12, dt))
expect_equal(length(unique(res_12$DrugName)), 2)
expect_equal(length(unique(res_12$CellLineName)), 4)

## Function works in the same way for data.table and DataFrame
# Regression test for the format-dependent behaviour raised in review: the
# same content is supplied once as data.table and once as DataFrame, and the
# two results are compared.
# NOTE(review): expect_equivalent is presumably used because the two results
# are of different classes -- confirm it compares the column contents as
# intended for a data.table vs. DFrame pair.
dt <- data.table::data.table(
DrugName = c("DrugA", "DrugB", "DrugC", "DrugD", "DrugC", "DrugD"),
Gnumber = c("G1", "G2", "G3", "G4", "G3", "G4"),
CellLineName = c("ID1", "ID1", "ID2", "ID2", "ID2", "ID2"),
clid = c("C1", "C2", "C3", "C4", "C5", "C6")
)
res_dt <- set_unique_cl_names_dt(dt)
df <- S4Vectors::DataFrame(
DrugName = c("DrugA", "DrugB", "DrugC", "DrugD", "DrugC", "DrugD"),
Gnumber = c("G1", "G2", "G3", "G4", "G3", "G4"),
CellLineName = c("ID1", "ID1", "ID2", "ID2", "ID2", "ID2"),
clid = c("C1", "C2", "C3", "C4", "C5", "C6")
)
res_S4 <- set_unique_cl_names_dt(df)
expect_equivalent(res_dt, res_S4)

})


test_that("set_unique_cl_names works correctly", {
se <- SummarizedExperiment::SummarizedExperiment(
assays = list(counts = matrix(1:4, ncol = 2)),
Expand All @@ -105,6 +199,7 @@ test_that("set_unique_cl_names works correctly", {
expect_equal(SummarizedExperiment::colData(se)$CellLineName, c("ID1 (C1)", "ID1 (C2)"))
})


test_that("set_unique_drug_names works correctly", {
se <- SummarizedExperiment::SummarizedExperiment(
assays = list(counts = matrix(1:4, ncol = 2)),
Expand Down
Loading