Merge pull request #16 from FredHutch/ki_qc

Quality Control -- Report Making Function
FredHutch · Mar 22, 2024 · 9aa337a · 9aa337a
2 parents 6027dad + 8dc306e
commit 9aa337a
Show file tree

Hide file tree

Showing 12 changed files with 33,591 additions and 99 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -22,8 +22,12 @@ Imports:
     rmarkdown,
     rcmdcheck,
     ggplot2,
+    magrittr, 
+    kableExtra, 
+    pheatmap,
 Suggests:
     testthat (>= 3.0.0),
+    roxygen2,
 Config/testthat/edition: 3
 Encoding: UTF-8
 RoxygenNote: 7.2.3

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,8 +1,19 @@
 # Generated by roxygen2: do not edit by hand
 
+export("%>%")
 export(annotate)
 export(calc_fc)
 export(calculate_gi)
-export(example_data)
+export(get_example_data)
 export(run_qc)
 export(setup_data)
+import(dplyr)
+import(ggplot2)
+import(kableExtra)
+importFrom(dplyr,mutate)
+importFrom(ggplot2,ggplot)
+importFrom(ggplot2,labs)
+importFrom(magrittr,"%<>%")
+importFrom(magrittr,"%>%")
+importFrom(pheatmap,pheatmap)
+importFrom(tidyr,pivot_longer)
diff --git a/R/00-setup_data.R b/R/00-setup_data.R
@@ -1,7 +1,8 @@
 #' Making a new gimap dataset
 #' @description This function allows people to have their data ready to be processed by gimap
 #' @param counts a matrix of data that contains the counts where rows are each paired_guide target and columns are each sample
-#' @param pg_metadata metadata associated with the pgRNA constructs that correspond to the rows of the counts data
+#' @param pg_ids the pgRNA IDs: metadata associated with the pgRNA constructs that correspond to the rows of the counts data
+#' @param pg_metadata construct metadata
 #' @param sample_metadata metadata associated with the samples of the dataset that correspond to the columns of the counts data
 #' @return A special gimap_dataset to be used with the other functions in this package.
 #' @export
@@ -12,20 +13,23 @@
 #'
 #' counts_data <- setup_data(data)
 #' }
-setup_data <- function(counts = NULL, pg_metadata = NULL, sample_metadata = NULL) {
-
+setup_data <- function(counts = NULL, pg_ids = NULL, pg_metadata = NULL, sample_metadata = NULL) {
   new_data <- gimap_data <- list(
-    raw_counts =  NULL,
+    raw_counts = NULL,
     counts_per_sample = NULL,
-    coverage = NULL,
     transformed_data = list(
-      long_form = NULL,
-      count_norm = NULL,
       count_norm = NULL,
       cpm = NULL,
-      log2_cpm = NULL),
-    metadata = list(pg_metadata = NULL,
-                    sample_metadata = NULL)
+      log2_cpm = NULL
+    ),
+    metadata = list(
+      pg_ids = NULL,
+      pg_metadata = NULL,
+      sample_metadata = NULL
+    ),
+    qc = list(
+      qc_filter = NULL
+    )
   )
 
   class(new_data) <- c("list", "gimap_dataset")
@@ -36,20 +40,20 @@ setup_data <- function(counts = NULL, pg_metadata = NULL, sample_metadata = NULL
   # If they don't give sample metadata, then we will make up a row id
   if (is.null(sample_metadata)) sample_metadata <- data.frame(id = 1:nrow(counts))
   if (!is.data.frame(sample_metadata)) stop("metadata can only be in the form of a data.frame")
-  if (nrow(sample_metadata) != ncol(counts)) stop("the number of rows in the sample metadata is not equal to the number of columns in the counts" )
+  if (nrow(sample_metadata) != ncol(counts)) stop("the number of rows in the sample metadata is not equal to the number of columns in the counts")
 
   if (!(length(unique(sample_metadata[, 1])) == length(sample_metadata[, 1]))) stop("The first column in sample metadata must be a unique ID")
 
   new_data$metadata$sample_metadata <- sample_metadata
 
-  # If they don't give paired guide metadata, then we will make up a row id
-  if (is.null(pg_metadata)) pg_metadata <- data.frame(id = 1:ncol(counts))
-  if (!is.data.frame(pg_metadata)) stop("metadata can only be in the form of a data.frame")
-  if (nrow(pg_metadata) != nrow(counts)) stop("the number of rows in the pg_info is not equal to the number of rows in the counts" )
+  # If they don't give paired guide ids, then we will make up a row id
+  if (is.null(pg_ids)) pg_ids <- data.frame(id = 1:ncol(counts))
+  if (!is.data.frame(pg_ids)) stop("pgRNA IDs can only be in the form of a data.frame")
+  if (nrow(pg_ids) != nrow(counts)) stop("the number of rows in the pg_info is not equal to the number of rows in the counts")
 
   # we need the first column to be a unique id
-  if (!(length(unique(pg_metadata[, 1])) == length(pg_metadata[, 1]))) stop("The first column in paired guide metadata must be a unique ID")
-  new_data$metadata$pg_metadata <- pg_metadata
+  if (!(nrow(unique(pg_ids[, 1])) == nrow(pg_ids[, 1]))) stop("The paired guide IDs must be a unique ID")
+  new_data$metadata$pg_ids <- pg_ids
 
   # Store the raw counts
   new_data$raw_counts <- counts
@@ -64,4 +68,3 @@ setup_data <- function(counts = NULL, pg_metadata = NULL, sample_metadata = NULL
 
   return(new_data)
 }
-
diff --git a/R/01-qc.R b/R/01-qc.R
@@ -1,39 +1,64 @@
-
-#' This is a title for a function
+#' A function to run QC
 #' @description This is a function here's where we describe what it does
-#' @param parameter Here's a parameter let's describe it here
+#' @param use_combined default it TRUE; if TRUE, both zero count and low plasmid CPM filters are applied and if either is TRUE, a pgRNA construct will be filtered out. If FALSE, need to allow to specify which should be used
+#' @param plots_dir default is `./qc_plots`; directory to save plots created with this function, if it doesn't exist already it will be created
+#' @param overwrite default is FALSE; whether to overwrite the QC Report file
+#' @param output_file_path default is `QC_Report`; name of the output QC report file
+#' @param ... additional parameters are sent to rmarkdown::render
 #' @export
 #' @importFrom tidyr pivot_longer
-#' @import ggplot2
+#' @importFrom magrittr %>%
 #' @examples \dontrun{
 #'
 #' }
+run_qc <- function(gimap_dataset,
+                   output_file = "./gimap_QC_Report.Rmd",
+                   plots_dir = "./qc_plots",
+                   overwrite = FALSE,
+                   ...) {
+
+  if (!("gimap_dataset" %in% class(gimap_dataset))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.")
 
-run_qc <- function(gimap_data, plots_dir = "./qc_plots", wide_ar = 0.75, square_ar = 1) {
+  # Determine the template
+  templateFile <- system.file("rmd/gimapQCTemplate.Rmd", package = "gimap")
 
-  if (!dir.exists(plots_dir)) {
-    dir.create(plots_dir, showWarnings = TRUE)
+  # Make sure that the directory exists!
+  directory <- dirname(output_file)
+
+  # If not, make the directory
+  if (directory != ".") {
+    if (!dir.exists(directory)) dir.create(directory, showWarnings = TRUE, recursive = TRUE)
   }
 
-  if (!("gimap_dataset" %in% class(gimap_data))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.")
+  # Now if the file exists,
+  if (file.exists(templateFile)) {
+    if (file.exists(output_file) & !overwrite) {
+      stop("there is already an output .Rmd file", output_file,
+           ". Please remove or rename this file, or choose another output_file name.",
+           call. = FALSE
+      )
+    } else {
+      file.copy(from = templateFile, to = output_file, overwrite = overwrite)
+    }
+  } else {
+    stop("The Rmd template file ", templateFile, " does not exist -- did you move it from the package files?",
+         call. = FALSE)
+  }
 
-  long_form <-
-    tidyr::pivot_longer(data.frame(gimap_dataset$transformed_data$count_norm),
-                        everything(),
-                        names_to = "sample",
-                        values_to = "count_normalized")
+  # Make a plots directory if it doesn't exist
+  if (!dir.exists(plots_dir)) dir.create(plots_dir, showWarnings = TRUE)
 
-  counts_cdf <- ggplot(long_form, aes(x = count_normalized, color = sample)) +
-    stat_ecdf() +
-    labs(x = "-log10(count/total_count)", # bquote(~-log[10]~"(count/total_count)")
-         y = "Expected_pgRNAs",
-         color = "Sample") +
-    plot_options() +
-    plot_theme() +
-    theme(aspect.ratio = wide_ar)
+  # Send the data to render it!
+  rmarkdown::render(output_file,
+                    params = list(dataset = gimap_dataset,
+                                  plots_dir = plots_dir),
+                                  ...)
 
-  counts_cdf
-  save_plot(counts_cdf, out_dir = plots_dir)
+  # Tell where the output is
+  results_file <- gsub("\\.Rmd$", "\\.html", output_file)
+  message("Results in: ", results_file)
 
+  results_file <- normalizePath(list.files(pattern = results_file, full.names = TRUE))
 
+  if (results_file != "") browseURL(results_file)
 }
diff --git a/R/02-foldchange.R b/R/02-foldchange.R
@@ -5,7 +5,6 @@
 #' @examples \dontrun{
 #'
 #' }
-
 calc_fc <- function() {
 
 }
diff --git a/R/03-annotate.R b/R/03-annotate.R
@@ -5,7 +5,6 @@
 #' @examples \dontrun{
 #'
 #' }
-
-annotate <- function() { 
+annotate <- function() {
 
 }
diff --git a/R/04-calculate_gi.R b/R/04-calculate_gi.R
@@ -5,7 +5,6 @@
 #' @examples \dontrun{
 #'
 #' }
-
-calculate_gi <- function() { 
+calculate_gi <- function() {
 
 }
diff --git a/R/qc-plots.R b/R/qc-plots.R
@@ -0,0 +1,155 @@
+#' Create a CDF for the pgRNA normalized counts
+#' @description This function uses pivot_longer to rearrange the data for plotting and then plots a CDF of the normalized counts
+#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data
+#' @param qc_obj The object that has the qc stuff stored
+#' @param wide_ar aspect ratio, default is 0.75
+#' @importFrom tidyr pivot_longer
+#' @importFrom ggplot2 ggplot labs
+#' @return counts_cdf a ggplot
+#' @examples \dontrun{
+#'
+#' }
+#'
+qc_cdf <- function(gimap_dataset, wide_ar = 0.75) {
+  long_form <-
+    tidyr::pivot_longer(data.frame(gimap_dataset$transformed_data$count_norm),
+      everything(),
+      names_to = "sample",
+      values_to = "count_normalized"
+    )
+
+  counts_cdf <- ggplot(long_form, aes(x = count_normalized, color = sample)) +
+    stat_ecdf() +
+    labs(
+      x = "-log10(count/total_count)",
+      y = "Expected_pgRNAs",
+      color = "Sample"
+    ) +
+    plot_options() +
+    plot_theme() +
+    theme(aspect.ratio = wide_ar)
+
+  return(counts_cdf)
+}
+
+#' Create a histogram for the pgRNA log2 CPMs, faceted by sample
+#' @description This function uses pivot_longer to rearrange the data for plotting and then plots sample specific histograms of the pgRNA cpm's
+#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data
+#' @param wide_ar aspect ratio, default is 0.75
+#' @importFrom tidyr pivot_longer
+#' @import ggplot2
+#' @return sample_cpm_histogram a ggplot
+#' @examples \dontrun{
+#'
+#' }
+#'
+qc_sample_hist <- function(gimap_dataset, wide_ar = 0.75) {
+  long_form <-
+    tidyr::pivot_longer(data.frame(gimap_dataset$transformed_data$log2_cpm),
+      everything(),
+      names_to = "sample",
+      values_to = "log2_cpm"
+    )
+
+  sample_cpm_histogram <- ggplot(long_form, aes(x = log2_cpm, fill = sample)) +
+    geom_histogram(color = "black", binwidth = 0.5) +
+    plot_options() +
+    plot_theme() +
+    theme(
+      aspect.ratio = wide_ar,
+      legend.position = "none"
+    ) +
+    facet_wrap(~sample, scales = "free_y", ncol = ceiling(ncol(gimap_dataset$raw_counts) / 2))
+
+  return(sample_cpm_histogram)
+}
+
+#' Create a correlation heatmap for the pgRNA CPMs
+#' @description This function uses the `cor` function to find correlations between the sample CPM's and then plots a heatmap of these
+#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data
+#' @param ... Additional arguments are passed in to the pheatmap function.
+#' @importFrom magrittr %>%
+#' @importFrom pheatmap pheatmap
+#' @return `sample_cor_heatmap` a pheatmap
+#' @examples \dontrun{
+#'
+#' }
+#'
+qc_cor_heatmap <- function(gimap_dataset, ...) {
+  cpm_cor <- gimap_dataset$transformed_data$cpm %>%
+    cor() %>%
+    round(2) %>%
+    data.frame()
+
+  sample_cor_heatmap <-
+    pheatmap::pheatmap(cpm_cor,
+      border_color = "white",
+      cellwidth = 20,
+      cellheight = 20,
+      treeheight_row = 20,
+      treeheight_col = 20,
+      ...
+    )
+
+  return(sample_cor_heatmap)
+}
+
+#' Create a histogram with plasmid log2 CPM values and ascertain a cutoff for low values
+#' @description Find the distribution of plasmid (day0 data) pgRNA log2 CPM values, and ascertain a cutoff or filter for low log2 CPM values.
+#' Assumes the first column of the dataset is the day0 data; do I need a better
+#' method to tell, especially if there are reps?
+#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data
+#' @param cutoff default is NULL, the cutoff for low log2 CPM values for the plasmid time period
+#' @param wide_ar aspect ratio, default is 0.75
+#' @importFrom magrittr %>%
+#' @import ggplot2
+#' @return a named list
+
+qc_plasmid_histogram <- function(gimap_dataset, cutoff = NULL, wide_ar = 0.75) {
+  to_plot <- data.frame(gimap_dataset$transformed_data$log2_cpm[, 1]) %>% `colnames<-`(c("log2_cpm"))
+
+  plasmid_cpm_histogram <- ggplot(to_plot, aes(x = log2_cpm)) +
+    geom_histogram(binwidth = 0.2, color = "black", fill = "gray60") +
+    plot_options() +
+    plot_theme() +
+    theme(aspect.ratio = wide_ar)
+
+  if (is.null(cutoff)) {
+    # if cutoff is null, suggest a cutoff and plot with suggested
+    quantile_info <- quantile(to_plot$log2_cpm)
+    plasmid_cpm_stats <- data.frame(
+      stat = c("median", "Q1", "Q3", "lower_outlier"),
+      log2_cpm_value = c(
+        quantile_info["50%"],
+        quantile_info["25%"],
+        quantile_info["75%"],
+        quantile_info["25%"] - (1.5 * (quantile_info["75%"] - quantile_info["25%"]))
+      )
+    )
+    cutoff <- plasmid_cpm_stats[which(plasmid_cpm_stats$stat == "lower_outlier"), "log2_cpm_value"]
+  } else {
+    plasmid_cpm_stats <- NULL
+  }
+
+  # plot with the cutoff
+  plasmid_cpm_hist_wcutoff <- plasmid_cpm_histogram +
+    geom_vline(
+      xintercept = cutoff,
+      linetype = "dashed"
+    )
+
+
+  plasmid_cpm_filter <- unlist(lapply(1:nrow(to_plot), function(x) to_plot$log2_cpm[x] < cutoff))
+
+  plasmid_filter_df <- data.frame("Plasmid_log2cpmBelowCutoff" = c(FALSE, TRUE), n = c(sum(!plasmid_cpm_filter), sum(plasmid_cpm_filter))) %>%
+    mutate(percent = round(((n / sum(n)) * 100), 2))
+
+  return(list(
+    plasmid_hist_nocutoff = plasmid_cpm_histogram,
+    plasmid_stats = plasmid_cpm_stats,
+    used_log2_cpm_cutoff = cutoff,
+    plasmid_hist_cutoff = plasmid_cpm_hist_wcutoff,
+    plasmid_filter = plasmid_cpm_filter,
+    plasmid_filter_report = plasmid_filter_df
+  ))
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,7 +5,6 @@ @@
     #' @examples \dontrun{
     #'
     #' }
     calc_fc <- function() {
     }