FredHutch · cansavvy · Mar 22, 2024 · Feb 8, 2024 · Feb 9, 2024 · Feb 9, 2024
diff --git a/R/00-setup_data.R b/R/00-setup_data.R
@@ -1,7 +1,8 @@
 #' Making a new gimap dataset
 #' @description This function allows people to have their data ready to be processed by gimap
 #' @param counts a matrix of data that contains the counts where rows are each paired_guide target and columns are each sample
-#' @param pg_metadata metadata associated with the pgRNA constructs that correspond to the rows of the counts data
+#' @param pg_ids the pgRNA IDs: metadata associated with the pgRNA constructs that correspond to the rows of the counts data
+#' @param pg_metadata construct metadata 
 #' @param sample_metadata metadata associated with the samples of the dataset that correspond to the columns of the counts data
 #' @return A special gimap_dataset to be used with the other functions in this package.
 #' @export
@@ -12,19 +13,21 @@
 #'
 #' counts_data <- setup_data(data)
 #' }
-setup_data <- function(counts = NULL, pg_metadata = NULL, sample_metadata = NULL) {
+setup_data <- function(counts = NULL, pg_ids = NULL, pg_metadata = NULL, sample_metadata = NULL) {
 
   new_data <- gimap_data <- list(
     raw_counts =  NULL,
     counts_per_sample = NULL,
-    coverage = NULL,
+    #coverage = NULL,
     transformed_data = list(
       long_form = NULL,
       count_norm = NULL,
       count_norm = NULL,
       cpm = NULL,
-      log2_cpm = NULL),
-    metadata = list(pg_metadata = NULL,
+      log2_cpm = NULL,
+      qc_filter = NULL),
+    metadata = list(pg_ids = NULL,
+                    pg_metadata = NULL,
                     sample_metadata = NULL)
   )
 
@@ -42,14 +45,14 @@ setup_data <- function(counts = NULL, pg_metadata = NULL, sample_metadata = NULL
 
   new_data$metadata$sample_metadata <- sample_metadata
 
-  # If they don't give paired guide metadata, then we will make up a row id
-  if (is.null(pg_metadata)) pg_metadata <- data.frame(id = 1:ncol(counts))
-  if (!is.data.frame(pg_metadata)) stop("metadata can only be in the form of a data.frame")
-  if (nrow(pg_metadata) != nrow(counts)) stop("the number of rows in the pg_info is not equal to the number of rows in the counts" )
+  # If they don't give paired guide ids, then we will make up a row id
+  if (is.null(pg_ids)) pg_ids <- data.frame(id = 1:ncol(counts))
+  if (!is.data.frame(pg_ids)) stop("pgRNA IDs can only be in the form of a data.frame")
+  if (nrow(pg_ids) != nrow(counts)) stop("the number of rows in the pg_info is not equal to the number of rows in the counts" )
 
   # we need the first column to be a unique id
-  if (!(length(unique(pg_metadata[, 1])) == length(pg_metadata[, 1]))) stop("The first column in paired guide metadata must be a unique ID")
-  new_data$metadata$pg_metadata <- pg_metadata
+  if (!(nrow(unique(pg_ids[, 1])) == nrow(pg_ids[, 1]))) stop("The paired guide IDs must be a unique ID")
+  new_data$metadata$pg_ids <- pg_ids
 
   # Store the raw counts
   new_data$raw_counts <- counts
@@ -59,8 +62,8 @@ setup_data <- function(counts = NULL, pg_metadata = NULL, sample_metadata = NULL
 
   # Transform the data
   new_data$transformed_data$count_norm <- apply(counts, 2, function(x) -log10((x+1)/sum(x)))
-  new_data$transformed_data$cpm <- apply(counts, 2, function(x) (x/new_data$counts_per_sample)*1e6)
-  new_data$transformed_data$log2_cpm <- log2(new_data$cpm +1)
+  new_data$transformed_data$cpm <- apply(counts, 2, function(x) (x/sum(x))*1e6)
+  new_data$transformed_data$log2_cpm <- log2(new_data$transformed_data$cpm +1)
 
   return(new_data)
 }

diff --git a/R/01-qc.R b/R/01-qc.R
@@ -1,39 +1,102 @@
 
-#' This is a title for a function
+#' A function to run QC
 #' @description This is a function here's where we describe what it does
 #' @param parameter Here's a parameter let's describe it here
 #' @export
 #' @importFrom tidyr pivot_longer
-#' @import ggplot2
+#' @importFrom magrittr %>%
+#' @import ggplot2 pheatmap
 #' @examples \dontrun{
 #'
 #' }
 
-run_qc <- function(gimap_data, plots_dir = "./qc_plots", wide_ar = 0.75, square_ar = 1) {
 
-  if (!dir.exists(plots_dir)) {
+run_qc <- function(gimap_dataset, plasmid_cutoff = NULL, use_combined = TRUE,
+                   params = list(cell_line = NULL), plots_dir = "./qc_plots", overwrite = FALSE, output_format = NULL, output_file = "QC_Report", output_dir = "./",
+                   wide_ar = 0.75, square_ar = 1) {
+
+  if (!dir.exists(plots_dir)){
     dir.create(plots_dir, showWarnings = TRUE)
   }
+
+  if (!("gimap_dataset" %in% class(gimap_dataset))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.")
+
+  # for Rmd report
+  list_of_qc_things <- list(counts_cdf = NULL,
+                            sample_cpm_histogram = NULL,
+                            sample_cor_heatmap_unfiltered = NULL,
+                            zerocount_df = NULL,
+                            plasmid_hist_nocutoff = NULL,
+                            plasmid_stats = NULL,
+                            plasmid_hist_cutoff = NULL,
+                            plasmid_filter_report = NULL,
+                            combined_filter_df = NULL,
+                            which_filter_df = NULL
+                            )
+
+  counts_cdf <- qc_cdf(gimap_dataset, wide_ar = wide_ar)
+
+  list_of_qc_things$counts_cdf <- counts_cdf #put it in the report
+  save_plot(counts_cdf, params = params, out_dir = plots_dir)
+
+  sample_cpm_histogram <- qc_sample_hist(gimap_dataset, wide_ar = wide_ar)
+
+  list_of_qc_things$sample_cpm_histogram <- sample_cpm_histogram #put it in the report
+  save_plot(sample_cpm_histogram, params = params, out_dir = plots_dir)
+
+  sample_cor_heatmap_unfiltered <- qc_cor_heatmap(gimap_dataset)
+
+  list_of_qc_things$sample_cor_heatmap_unfiltered <- sample_cor_heatmap_unfiltered #put it in the report
+  save_plot(sample_cor_heatmap_unfiltered, params = params, out_dir = plots_dir)
+  ## repeat after filtering?
+
+  ## flag pgRNAs with count = 0 at any time point
+  ## how many pgRNAs have a raw count of 0 at any time point/sample
+  counts_filter_list <- qc_filter_zerocounts(gimap_dataset)
+
+  counts_filter <- counts_filter_list$filter #TRUE means the filter applies
+
+  zerocount_df <- counts_filter_list$reportdf
+
+  list_of_qc_things$zerocount_df <- zerocount_df #put it in the report
+
+  ## flag pgRNAs with low plasmid read counts
+  ## plasmid reads are the first sample/day0 time point  
+  plasmid_filter_list <- qc_plasmid_histogram(gimap_dataset, cutoff = plasmid_cutoff, wide_ar = wide_ar)
+
+  list_of_qc_things$plasmid_hist_nocutoff <- plasmid_filter_list$plasmid_hist_nocutoff #put it in the report
+  save_plot(plasmid_filter_list$plasmid_hist_nocutoff, params = params, out_dir = plots_dir)
+
+
+  list_of_qc_things$plasmid_stats <- plasmid_filter_list$plasmid_stats #put it in the report
 
-  if (!("gimap_dataset" %in% class(gimap_data))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.")
-
-  long_form <-
-    tidyr::pivot_longer(data.frame(gimap_dataset$transformed_data$count_norm),
-                        everything(),
-                        names_to = "sample",
-                        values_to = "count_normalized")
-
-  counts_cdf <- ggplot(long_form, aes(x = count_normalized, color = sample)) +
-    stat_ecdf() +
-    labs(x = "-log10(count/total_count)", # bquote(~-log[10]~"(count/total_count)")
-         y = "Expected_pgRNAs",
-         color = "Sample") +
-    plot_options() +
-    plot_theme() +
-    theme(aspect.ratio = wide_ar)
-
-  counts_cdf
-  save_plot(counts_cdf, out_dir = plots_dir)
-
-
+  list_of_qc_things$plasmid_hist_cutoff <- plasmid_filter_list$plasmid_hist_cutoff #put it in the report
+  save_plot(plasmid_filter_list$plasmid_hist_cutoff, params = params, out_dir = plots_dir)
+
+
+  list_of_qc_things$plasmid_filter_report <- plasmid_filter_list$plasmid_filter_report #put it in the report
+
+  plasmid_filter <- plasmid_filter_list$plasmid_filter #TRUE means the filter applies
+
+  ## combine the filters (zerocount and plasmid)
+  ## how many pgRNAs are removed by both filters?
+
+  combined_filter_list <- qc_combine_filters(counts_filter, plasmid_filter, use_combined = use_combined)
+
+  list_of_qc_things$num_filtered_report <- combined_filter_df <- combined_filter_list$num_filtered_report #put it in the report
+
+  list_of_qc_things$which_filter_df <- which_filter_df <- combined_filter_list$num_which_filter_report #put it in the report
+
+  gimap_dataset$transformed_data$qc_filter <- combined_filter_list$combined_filter
+
+  #if (use_combined){
+    #filter gimap_dataset and metadata to only keep pgRNA passing both filters
+    #gimap_dataset$transformed_data$qc_filter <- combined_filter_list$keep_pgRNA
+  #} else {
+    #print out a message asking which filter to use?
+  #}
+
+  qc_generateReport(list_of_qc_things, overwrite = overwrite, output_file = output_file, output_dir = output_dir)
+
+  return(gimap_dataset)
 }
diff --git a/R/qc_cdf.R b/R/qc_cdf.R
@@ -0,0 +1,31 @@
+#' Create a CDF for the pgRNA normalized counts
+#' @description This function uses pivot_longer to rearrange the data for plotting and then plots a CDF of the normalized counts
+#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data
+#' @param wide_ar aspect ratio, default is 0.75
+#' @importFrom tidyr pivot_longer
+#' @import ggplot2
+#' @return counts_cdf a ggplot
+#' @examples \dontrun{
+#' 
+#' }
+#'
+
+qc_cdf <- function(gimap_dataset, wide_ar = 0.75){
+
+  long_form <- 
+    tidyr::pivot_longer(data.frame(gimap_dataset$transformed_data$count_norm),
+                        everything(),
+                        names_to = "sample",
+                        values_to = "count_normalized")
+
+  counts_cdf <- ggplot(long_form, aes(x=count_normalized, color = sample)) +
+    stat_ecdf() +
+    labs(x = "-log10(count/total_count)",
+         y = "Expected_pgRNAs",
+         color = "Sample") +
+    plot_options() +
+    plot_theme() +
+    theme(aspect.ratio = wide_ar)
+
+  return(counts_cdf)
+}
diff --git a/R/qc_combine_filters.R b/R/qc_combine_filters.R
@@ -0,0 +1,34 @@
+#'
+#'
+#'
+#'
+#'
+#' @importFrom magrittr %>% %<>%
+#' @import dplyr
+#' @return a named list
+#'
+
+qc_combine_filters <- function(filter_zerocount, filter_plasmid, use_combined = TRUE){
+  combined_filter <- cbind(data.frame(filter_zerocount), data.frame(filter_plasmid)) %>%
+    mutate(combined = filter_zerocount == TRUE | filter_plasmid == TRUE)
+
+  combined_filter_df <- data.frame("Filtered" = c("Yes", "No"), #Yes filtered means remove pgRNA
+                                   n = c(sum(combined_filter$combined), sum(!combined_filter$combined))) %>%
+    mutate(percent = round(((n/sum(n))*100),2))
+
+  which_filter_df <- data.frame("Which Filter" = c("Low plasmid cpm only", "Zero count for any sample", "Both", "Neither"),
+                                 n = c(sum(combined_filter$filter_zerocount == FALSE & combined_filter$filter_plasmid == TRUE),
+                                              sum(combined_filter$filter_zerocount == TRUE & combined_filter$filter_plasmid == FALSE),
+                                              sum(combined_filter$filter_zerocount == TRUE & combined_filter$filter_plasmid == TRUE),
+                                              sum(combined_filter$filter_zerocount == FALSE & combined_filter$filter_plasmid == FALSE))) %>%
+    mutate(percent = round(((n/sum(n)) *100),2))
+
+  if (use_combined){
+    combined_filter %<>% select(combined) %>% mutate(combined = as.logical(abs(combined -1))) %>% `colnames<-`(c("keep_pgRNA")) #invert the bits so boolean reports which ones you keep, not which ones you remove.
+  } else { combined_filter$keep_pgRNA <- NA }
+
+  return(list(combined_filter = combined_filter$keep_pgRNA, 
+              num_filtered_report = combined_filter_df,
+              num_which_filter_report = which_filter_df))
+
+}
diff --git a/R/qc_cor_heatmap.R b/R/qc_cor_heatmap.R
@@ -0,0 +1,27 @@
+#' Create a correlation heatmap for the pgRNA cpm's
+#' @description This function uses the cor function to find correlations between the sample CPM's and then plots a heatmap of these
+#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data
+#' @importFrom magrittr %>%
+#' @importFrom pheatmap pheatmap
+#' @return sample_cor_heatmap a pheatmap
+#' @examples \dontrun{
+#' 
+#' }
+#'
+
+qc_cor_heatmap <- function(gimap_dataset){
+
+  cpm_cor <- gimap_dataset$transformed_data$cpm %>%
+    cor() %>%
+    round(2) %>%
+    data.frame()
+
+  sample_cor_heatmap <-
+    pheatmap(cpm_cor,
+             border_color = "white",
+             cellwidth = 20, cellheight = 20,
+             treeheight_row = 20, treeheight_col = 20)
+
+  return (sample_cor_heatmap)
+
+}
diff --git a/R/qc_filter_zerocounts.R b/R/qc_filter_zerocounts.R
@@ -0,0 +1,21 @@
+#' Create a filter for pgRNAs which have a raw count of 0 for any sample/time point
+#' @description This function flags and reports which and how many pgRNAs have a raw count of 0 for any sample/time point
+#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data
+#' @importFrom magrittr %>%
+#' @importFrom dpylr mutate
+#' @return a named list with the filter `filter` specifying which pgRNA have a count zero for at least one sample/time point and a report df `reportdf` for the number and percent of pgRNA which have a count zero for at least one sample/time point 
+#' @examples \dontrun{
+#' 
+#' }
+#'
+
+qc_filter_zerocounts <- function(gimap_dataset){
+
+  counts_filter <- unlist(lapply(1:nrow(gimap_dataset$raw_counts), function(x) 0 %in% gimap_dataset$raw_counts[x,]))
+
+  zerocount_df <- data.frame("RawCount0" = c(FALSE, TRUE), n = c(sum(!counts_filter), sum(counts_filter))) %>%
+    mutate(percent = round(((n/sum(n))*100),2))
+
+  return(list(filter = counts_filter, reportdf = zerocount_df))
+
+}
diff --git a/R/qc_generateReport.R b/R/qc_generateReport.R
@@ -0,0 +1,38 @@
+#'
+#'
+#'
+#' Used https://github.com/csoneson/alevinQC/tree/devel and https://stackoverflow.com/a/37122513 as inspiration for producing an html QC report
+#'
+#' @import rmarkdown
+#'
+#'
+
+qc_generateReport = function(list_of_qc_things, overwrite = FALSE, output_format = NULL, output_file = "QC_Report.Rmd", output_dir="./"){
+
+  #Determine the template
+  templateFile = system.file("rmd/gimapQCTemplate.Rmd", package="gimap")
+
+  if (file.exists(templateFile)) {
+    if (file.exists(output_file) & !overwrite){
+      stop("there is already an output .Rmd file", output_file,
+           ". Please remove or rename this file, or choose another output_file name.", 
+           call. = FALSE)
+    } else {
+      file.copy(from = templateFile, to = output_file, overwrite = overwrite)
+    }
+
+  } else {
+    stop("The Rmd template file ", templateFile, " does not exist", call. = FALSE)
+  }
+
+  #Process the Arguments
+  args                <- list()
+  args$input          <- templateFile
+  args$output_dir     <- output_dir
+  args$output_format  <- output_format
+  args$output_file    <- output_file
+
+  #Run the render 
+  outputFileName = do.call('render', args=args)
+  invisible(outputFileName)
+}