From 1fbe448f856e7110698501865b34318bc66f6dd3 Mon Sep 17 00:00:00 2001 From: Dariusz Scigocki Date: Thu, 24 Oct 2024 10:41:21 +0200 Subject: [PATCH 1/6] make average_biological_replicates_dt more restrictive - fewer columns to check for duplicates --- R/utils.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/utils.R b/R/utils.R index e96d68c2..ae3783be 100644 --- a/R/utils.R +++ b/R/utils.R @@ -476,7 +476,7 @@ average_biological_replicates_dt <- function( average_fields <- setdiff(names(Filter(is.numeric, data)), c(unlist(pidfs), var, iso_cols)) geometric_average_fields <- intersect(geometric_average_fields, names(dt)) fit_type_average_fields <- intersect(fit_type_average_fields, names(dt)) - group_by <- setdiff(names(data), c(average_fields, var, id_cols, fit_type_average_fields)) + group_by <- get_assay_req_uniq_cols(dt) if (add_sd) { # Calculate standard deviation for both average_fields and geometric_average_fields From dff1ec30fbfd3988173493685e9216359d0514a0 Mon Sep 17 00:00:00 2001 From: Dariusz Scigocki Date: Thu, 24 Oct 2024 10:42:22 +0200 Subject: [PATCH 2/6] bump version --- DESCRIPTION | 4 ++-- NEWS.md | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 65105279..5f2355c2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: gDRutils Type: Package Title: A package with helper functions for processing drug response data -Version: 1.3.16 -Date: 2024-10-11 +Version: 1.3.17 +Date: 2024-10-24 Authors@R: c(person("Bartosz", "Czech", role=c("aut"), comment = c(ORCID = "0000-0002-9908-3007")), person("Arkadiusz", "Gladki", role=c("cre", "aut"), email="gladki.arkadiusz@gmail.com", diff --git a/NEWS.md b/NEWS.md index b7ede461..310ce933 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,6 @@ +## gDRutils 1.3.17 - 2024-10-24 +*make average_biological_replicates_dt more restrictive - less column to check for duplicates + ## gDRutils 1.3.16 - 2024-10-11 * make duplicates' helpers supporting 
combo assays as well From 26f8860ed8b7746bbb5633e8c1350d844711d27b Mon Sep 17 00:00:00 2001 From: Dariusz Scigocki Date: Thu, 24 Oct 2024 12:05:10 +0200 Subject: [PATCH 3/6] updated tests --- tests/testthat/test-utils.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 4c61b0a5..7e5e9473 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -249,9 +249,9 @@ test_that("average_biological_replicates_dt works as expected", { av2b <- average_biological_replicates_dt(av2f, var = "source_id") expect_true(all.equal(av1f, av2b)) expect_true(nrow(av1f) == 1) + # even for an incorrect column name we check the differentiating columns av1i <- average_biological_replicates_dt(tdata, var = "source_id", fit_type_average_fields = "bad_value") - expect_true(nrow(av1i) == 8) - + expect_true(nrow(av1i) == 2) }) From ab2e075c648b2ff8aec39e64c453d2f5693e7bb4 Mon Sep 17 00:00:00 2001 From: darsoo Date: Thu, 24 Oct 2024 12:08:31 +0200 Subject: [PATCH 4/6] Update NEWS.md Co-authored-by: Bartek <32614650+bczech@users.noreply.github.com> --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 310ce933..c1598fbf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,5 @@ ## gDRutils 1.3.17 - 2024-10-24 -*make average_biological_replicates_dt more restrictive - less column to check for duplicates +* make average_biological_replicates_dt more restrictive - fewer columns to check for duplicates ## gDRutils 1.3.16 - 2024-10-11 * make duplicates' helpers supporting combo assays as well From 289a3d36eb2cb9e2518ab772b9cb5ab6a4c93eb0 Mon Sep 17 00:00:00 2001 From: Dariusz Scigocki Date: Mon, 28 Oct 2024 09:46:12 +0100 Subject: [PATCH 5/6] added blacklisted fields in averagings --- R/headers_list.R | 5 +++++ R/utils.R | 6 +++++- man/average_biological_replicates_dt.Rd | 4 ++++ tests/testthat/test-utils.R | 3 +-- 4 files changed, 15 insertions(+), 3 
deletions(-) diff --git a/R/headers_list.R b/R/headers_list.R index df09f265..9573201b 100644 --- a/R/headers_list.R +++ b/R/headers_list.R @@ -204,6 +204,11 @@ "Fit Type GR", "RV_fit_type", "GR_fit_type" + ), + blacklisted = c( + "Tissue", # sometimes this field is missing + "Reference Division Time", # sometimes this field is missing + "Parental Identifier" # sometimes suffixes incorrectly differentiate this field ) ) } diff --git a/R/utils.R b/R/utils.R index ae3783be..c3fd7861 100644 --- a/R/utils.R +++ b/R/utils.R @@ -435,6 +435,8 @@ geometric_mean <- function(x, fixed = TRUE, maxlog10Concentration = 1) { #' to take the geometric average of. #' @param fit_type_average_fields Character vector of column names in \code{dt} #' that should be treated as a column with fit type data +#' @param blacklisted_fields Character vector of column names in \code{dt} +#' that should be skipped in averaging #' @param add_sd Flag indicating whether to add standard deviation and count columns. #' #' @examples @@ -452,6 +454,7 @@ average_biological_replicates_dt <- function( fixed = TRUE, geometric_average_fields = get_header("metric_average_fields")$geometric_mean, fit_type_average_fields = get_header("metric_average_fields")$fit_type, + blacklisted_fields = get_header("metric_average_fields")$blacklisted, add_sd = FALSE) { checkmate::assert_data_table(dt) @@ -476,7 +479,8 @@ average_biological_replicates_dt <- function( average_fields <- setdiff(names(Filter(is.numeric, data)), c(unlist(pidfs), var, iso_cols)) geometric_average_fields <- intersect(geometric_average_fields, names(dt)) fit_type_average_fields <- intersect(fit_type_average_fields, names(dt)) - group_by <- get_assay_req_uniq_cols(dt) + blacklisted_fields <- intersect(blacklisted_fields, names(dt)) + group_by <- setdiff(names(data), c(average_fields, var, id_cols, fit_type_average_fields, blacklisted_fields)) if (add_sd) { # Calculate standard deviation for both average_fields and geometric_average_fields diff --git 
a/man/average_biological_replicates_dt.Rd b/man/average_biological_replicates_dt.Rd index ba726af1..d7a2bea8 100644 --- a/man/average_biological_replicates_dt.Rd +++ b/man/average_biological_replicates_dt.Rd @@ -11,6 +11,7 @@ average_biological_replicates_dt( fixed = TRUE, geometric_average_fields = get_header("metric_average_fields")$geometric_mean, fit_type_average_fields = get_header("metric_average_fields")$fit_type, + blacklisted_fields = get_header("metric_average_fields")$blacklisted, add_sd = FALSE ) } @@ -29,6 +30,9 @@ to take the geometric average of.} \item{fit_type_average_fields}{Character vector of column names in \code{dt} that should be treated as a column with fit type data} +\item{blacklisted_fields}{Character vector of column names in \code{dt} +that should be skipped in averaging} + \item{add_sd}{Flag indicating whether to add standard deviation and count columns.} } \value{ diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 7e5e9473..9c87834b 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -249,9 +249,8 @@ test_that("average_biological_replicates_dt works as expected", { av2b <- average_biological_replicates_dt(av2f, var = "source_id") expect_true(all.equal(av1f, av2b)) expect_true(nrow(av1f) == 1) - # even for an incorrect column name we check the differentiating columns av1i <- average_biological_replicates_dt(tdata, var = "source_id", fit_type_average_fields = "bad_value") - expect_true(nrow(av1i) == 2) + expect_true(nrow(av1i) == 8) }) From f624d67db7cbacd51efdb3a158b5573a9c717522 Mon Sep 17 00:00:00 2001 From: Dariusz Scigocki Date: Mon, 28 Oct 2024 10:18:30 +0100 Subject: [PATCH 6/6] added unprettified identifiers in blacklisted average fields --- R/headers_list.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/R/headers_list.R b/R/headers_list.R index 9573201b..70f70ae0 100644 --- a/R/headers_list.R +++ b/R/headers_list.R @@ -205,10 +205,16 @@ 
"RV_fit_type", "GR_fit_type" ), + # due to the fact that there is some freedom in what values are in individual fields, + # in order to avoid duplicates in the application we have to exclude some fields from + # recognizing duplicates in averaging blacklisted = c( "Tissue", # sometimes this field is missing - "Reference Division Time", # sometimes this field is missing - "Parental Identifier" # sometimes suffixes incorrectly differentiate this field + "cellline_tissue", + "Reference Division Time", # sometimes this field has `NA`s + "cellline_ref_div_time", + "Parental Identifier", # sometimes suffixes incorrectly differentiate this field + "cellline_parental_identifier" ) ) }