Add Aneuploidy Score computation #374

ShixiangWang · Aug 2, 2021 · faf0fa1 · faf0fa1
1 parent ed57a90
commit faf0fa1
Show file tree

Hide file tree

Showing 9 changed files with 411 additions and 74 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -29,6 +29,7 @@ export(enquo)
 export(enquos)
 export(enrich_component_strand_bias)
 export(expr)
+export(get_Aneuploidy_score)
 export(get_adj_p)
 export(get_bayesian_result)
 export(get_cn_freq_table)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,6 @@
 # sigminer 2.0.4
 
+- Implemented Cohen-Sharir method-like Aneuploidy Score.
 - Enhanced error handling in `show_sig_feature_corrplot()` (#376).
 - Fixed INDEL classification.
 - Fixed end position determination in `read_vcf()`.

diff --git a/R/get.R b/R/get.R
@@ -275,80 +275,6 @@ get_LengthFraction <- function(CN_data,
   segTab[, c(colnames(arm_data)[-1], "flag") := NULL]
   segTab
 
-
-  # .annot_fun <- function(chrom, start, end, p_start, p_end, p_length, q_start, q_end, q_length, total_size) {
-  #   if (end <= p_end & start >= p_start) {
-  #     ## 1L
-  #     location <- paste0(sub("chr", "", chrom), "p")
-  #     annotation <- "short arm"
-  #     fraction <- (end - start + 1) / (p_end - p_start + 1)
-  #   } else if (end <= q_end &
-  #     start >= q_start) {
-  #     ## 2L
-  #     location <- paste0(sub("chr", "", chrom), "q")
-  #     annotation <- "long arm"
-  #     fraction <- (end - start + 1) / (q_end - q_start + 1)
-  #   } else if (start >= p_start &
-  #     start <= p_end &
-  #     end >= q_start & end <= q_end) {
-  #     ## 3L
-  #     location <- paste0(sub("chr", "", chrom), "pq") # across p and q arm
-  #     annotation <- "across short and long arm"
-  #     fraction <- 2 * ((end - start + 1) / total_size)
-  #   } else if (start < p_end & end < q_start) {
-  #     ## 4L
-  #     location <- paste0(sub("chr", "", chrom), "p")
-  #     annotation <- "short arm intersect with centromere region"
-  #     # only calculate region does not intersect
-  #     fraction <- (end - start + 1 - (end - p_end)) / (p_end - p_start + 1)
-  #   } else if (start > p_end &
-  #     start < q_start & end > q_start) {
-  #     ## 5L
-  #     location <- paste0(sub("chr", "", chrom), "q")
-  #     annotation <- "long arm intersect with centromere region"
-  #     # only calculate region does not intersect
-  #     fraction <- (end - start + 1 - (start - q_start)) / (q_end - q_start + 1)
-  #   } else {
-  #     ## 6L
-  #     location <- paste0(sub("chr", "", chrom), "pq") # suppose as pq
-  #     annotation <- "segment locate in centromere region"
-  #     fraction <- 2 * ((end - start + 1) / total_size)
-  #   }
-  #
-  #   dplyr::tibble(location = location, annotation = annotation, fraction = fraction)
-  # }
-  #
-  # annot_fun <- function(chrom, start, end, p_start, p_end, p_length, q_start,
-  #                       q_end, q_length, total_size, .pb = NULL) {
-  #   if (.pb$i < .pb$n) .pb$tick()$print()
-  #   .annot_fun(
-  #     chrom, start, end, p_start, p_end, p_length, q_start,
-  #     q_end, q_length, total_size
-  #   )
-  # }
-  #
-  # pb <- progress_estimated(nrow(segTab), 0)
-  #
-  # annot <- purrr::pmap_df(
-  #   list(
-  #     chrom = segTab$chromosome,
-  #     start = segTab$start,
-  #     end = segTab$end,
-  #     p_start = segTab$p_start,
-  #     p_end = segTab$p_end,
-  #     p_length = segTab$p_length,
-  #     q_start = segTab$q_start,
-  #     q_end = segTab$q_end,
-  #     q_length = segTab$q_length,
-  #     total_size = segTab$total_size
-  #   ), annot_fun,
-  #   .pb = pb
-  # )
-  #
-  # cbind(
-  #   data.table::as.data.table(segTab)[, colnames(arm_data)[-1] := NULL],
-  #   data.table::as.data.table(annot)
-  # )
 }
 
 

diff --git a/R/get_Aneuploidy_score.R b/R/get_Aneuploidy_score.R
@@ -0,0 +1,105 @@
+#' Get Aneuploidy Score from Copy Number Profile
+#'
+#' This implements a Cohen-Sharir method (see reference) like "Aneuploidy Score" computation.
+#' You can read the source code to see how it works. Basically, it follows
+#' the logic of Cohen-Sharir method but with some difference in detail implementation.
+#' Their results should be counterpart, but with no data validation for now.
+#' **Please raise an issue if you find problem/bugs in this function**.
+#'
+#' @inheritParams read_copynumber
+#' @param data a CopyNumber object or a `data.frame` containing at least
+#' 'chromosome', 'start', 'end', 'segVal', 'sample' these columns.
+#' @param ploidy_df default is `NULL`, compute ploidy by segment-size weighted copy number
+#' aross autosome, see [get_cn_ploidy]. You can also provide a `data.frame` with 'sample'
+#' and 'ploidy' columns.
+#' @references
+#' - Cohen-Sharir, Y., McFarland, J. M., Abdusamad, M., Marquis, C., Bernhard, S. V., Kazachkova, M., ... & Ben-David, U. (2021). Aneuploidy renders cancer cells vulnerable to mitotic checkpoint inhibition. Nature, 1-6.
+#' - Logic reference: <https://github.com/quevedor2/aneuploidy_score/>.
+#'
+#' @return A `data.frame`
+#' @export
+#'
+#' @examples
+#' # Load copy number object
+#' load(system.file("extdata", "toy_copynumber.RData",
+#'   package = "sigminer", mustWork = TRUE
+#' ))
+#'
+#' df <- get_Aneuploidy_score(cn)
+#' df
+#'
+#' df2 <- get_Aneuploidy_score(cn@data)
+#' df2
+#'
+#' df3 <- get_Aneuploidy_score(cn@data,
+#'   ploidy_df = get_cn_ploidy(cn@data)
+#' )
+#' df3
+#' @testexamples
+#' expect_equal(nrow(df), 10L)
+#' expect_equal(nrow(df2), 10L)
+#' expect_equal(nrow(df3), 10L)
+get_Aneuploidy_score <- function(data, ploidy_df = NULL, genome_build = "hg19") {
+  stopifnot(is.data.frame(data) | inherits(data, "CopyNumber"))
+  if (is.data.frame(data)) {
+    nc_cols <- c("chromosome", "start", "end", "segVal", "sample")
+    if (!all(nc_cols %in% colnames(data))) {
+      stop("Invalid input, it must contain columns: ", paste(nc_cols, collapse = " "))
+    }
+    annot <- get_LengthFraction(data,
+      genome_build = genome_build,
+      seg_cols = c("chromosome", "start", "end", "segVal")
+    )
+  } else {
+    if (nrow(data@annotation) == 0L) {
+      annot <- get_LengthFraction(data@data,
+        genome_build = data@genome_build,
+        seg_cols = c("chromosome", "start", "end", "segVal")
+      )
+    } else {
+      annot <- data@annotation
+    }
+    data <- data@data
+  }
+
+  if (is.null(ploidy_df)) {
+    ploidy_df <- get_cn_ploidy(data)
+  } else {
+    ploidy_df <- data.table::as.data.table(ploidy_df)
+  }
+
+  annot <- merge(annot, ploidy_df, by = "sample", all.x = TRUE)
+  annot$type <- sub("[0-9]+", "", annot$location)
+
+  # For segments spans the centromere, remove it if length < 50%, i.e. 1 here.
+  # We assign average weights to all arms.
+  df <- annot[!(type == "pq" & round(fraction, 2) < 1) & (chromosome %in% paste0("chr", 1:22)), ] %>%
+    dplyr::mutate(
+      fraction = ifelse(type == "pq", .data$fraction / 2, .data$fraction),
+      type = ifelse(type == "pq", "p,q", type)
+    ) %>%
+    tidyr::separate_rows(type, sep = ",") %>%
+    data.table::as.data.table()
+
+  df <- df[, .(flag = as.integer(round(sum(segVal * fraction) - ploidy[1]))),
+    # overage = round(sum(fraction), 3)),
+    by = .(sample, chromosome, type)
+  ]
+
+  df %>%
+    dplyr::mutate(
+      flag = dplyr::case_when(
+        .data$flag > 1 ~ 1L,
+        .data$flag < -1 ~ -1L,
+        TRUE ~ .data$flag
+      ),
+      cytoband = paste(.data$chromosome, .data$type, sep = "-")
+    ) %>%
+    dplyr::select(-c("chromosome", "type")) %>%
+    tidyr::pivot_wider(id_cols = "sample", names_from = "cytoband", values_from = "flag", values_fill = 0L) %>%
+    dplyr::mutate(AScore = rowSums(abs(.[, -1]))) %>%
+    dplyr::select(c("sample", "AScore", dplyr::everything())) %>%
+    data.table::as.data.table()
+}
+
+utils::globalVariables(c("ploidy"))
diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -146,6 +146,7 @@ reference:
     contents:
     - transform_seg_table
     - get_cn_ploidy
+    - get_Aneuploidy_score
     - scoring
     - show_cn_profile
     - show_cn_circos