-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #16 from FredHutch/ki_qc
Quality Control -- Report Making Function
- Loading branch information
Showing
12 changed files
with
33,591 additions
and
99 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,19 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
export("%>%") | ||
export(annotate) | ||
export(calc_fc) | ||
export(calculate_gi) | ||
export(example_data) | ||
export(get_example_data) | ||
export(run_qc) | ||
export(setup_data) | ||
import(dplyr) | ||
import(ggplot2) | ||
import(kableExtra) | ||
importFrom(dplyr,mutate) | ||
importFrom(ggplot2,ggplot) | ||
importFrom(ggplot2,labs) | ||
importFrom(magrittr,"%<>%") | ||
importFrom(magrittr,"%>%") | ||
importFrom(pheatmap,pheatmap) | ||
importFrom(tidyr,pivot_longer) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,39 +1,64 @@ | ||
|
||
#' This is a title for a function | ||
#' A function to run QC | ||
#' @description This is a function here's where we describe what it does | ||
#' @param parameter Here's a parameter let's describe it here | ||
#' @param use_combined default it TRUE; if TRUE, both zero count and low plasmid CPM filters are applied and if either is TRUE, a pgRNA construct will be filtered out. If FALSE, need to allow to specify which should be used | ||
#' @param plots_dir default is `./qc_plots`; directory to save plots created with this function, if it doesn't exist already it will be created | ||
#' @param overwrite default is FALSE; whether to overwrite the QC Report file | ||
#' @param output_file_path default is `QC_Report`; name of the output QC report file | ||
#' @param ... additional parameters are sent to rmarkdown::render | ||
#' @export | ||
#' @importFrom tidyr pivot_longer | ||
#' @import ggplot2 | ||
#' @importFrom magrittr %>% | ||
#' @examples \dontrun{ | ||
#' | ||
#' } | ||
run_qc <- function(gimap_dataset, | ||
output_file = "./gimap_QC_Report.Rmd", | ||
plots_dir = "./qc_plots", | ||
overwrite = FALSE, | ||
...) { | ||
|
||
if (!("gimap_dataset" %in% class(gimap_dataset))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.") | ||
|
||
run_qc <- function(gimap_data, plots_dir = "./qc_plots", wide_ar = 0.75, square_ar = 1) { | ||
# Determine the template | ||
templateFile <- system.file("rmd/gimapQCTemplate.Rmd", package = "gimap") | ||
|
||
if (!dir.exists(plots_dir)) { | ||
dir.create(plots_dir, showWarnings = TRUE) | ||
# Make sure that the directory exists! | ||
directory <- dirname(output_file) | ||
|
||
# If not, make the directory | ||
if (directory != ".") { | ||
if (!dir.exists(directory)) dir.create(directory, showWarnings = TRUE, recursive = TRUE) | ||
} | ||
|
||
if (!("gimap_dataset" %in% class(gimap_data))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.") | ||
# Now if the file exists, | ||
if (file.exists(templateFile)) { | ||
if (file.exists(output_file) & !overwrite) { | ||
stop("there is already an output .Rmd file", output_file, | ||
". Please remove or rename this file, or choose another output_file name.", | ||
call. = FALSE | ||
) | ||
} else { | ||
file.copy(from = templateFile, to = output_file, overwrite = overwrite) | ||
} | ||
} else { | ||
stop("The Rmd template file ", templateFile, " does not exist -- did you move it from the package files?", | ||
call. = FALSE) | ||
} | ||
|
||
long_form <- | ||
tidyr::pivot_longer(data.frame(gimap_dataset$transformed_data$count_norm), | ||
everything(), | ||
names_to = "sample", | ||
values_to = "count_normalized") | ||
# Make a plots directory if it doesn't exist | ||
if (!dir.exists(plots_dir)) dir.create(plots_dir, showWarnings = TRUE) | ||
|
||
counts_cdf <- ggplot(long_form, aes(x = count_normalized, color = sample)) + | ||
stat_ecdf() + | ||
labs(x = "-log10(count/total_count)", # bquote(~-log[10]~"(count/total_count)") | ||
y = "Expected_pgRNAs", | ||
color = "Sample") + | ||
plot_options() + | ||
plot_theme() + | ||
theme(aspect.ratio = wide_ar) | ||
# Send the data to render it! | ||
rmarkdown::render(output_file, | ||
params = list(dataset = gimap_dataset, | ||
plots_dir = plots_dir), | ||
...) | ||
|
||
counts_cdf | ||
save_plot(counts_cdf, out_dir = plots_dir) | ||
# Tell where the output is | ||
results_file <- gsub("\\.Rmd$", "\\.html", output_file) | ||
message("Results in: ", results_file) | ||
|
||
results_file <- normalizePath(list.files(pattern = results_file, full.names = TRUE)) | ||
|
||
if (results_file != "") browseURL(results_file) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,6 @@ | |
#' @examples \dontrun{ | ||
#' | ||
#' } | ||
|
||
calc_fc <- function() { | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,6 @@ | |
#' @examples \dontrun{ | ||
#' | ||
#' } | ||
|
||
annotate <- function() { | ||
annotate <- function() { | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,6 @@ | |
#' @examples \dontrun{ | ||
#' | ||
#' } | ||
|
||
calculate_gi <- function() { | ||
calculate_gi <- function() { | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
#' Create a CDF for the pgRNA normalized counts | ||
#' @description This function uses pivot_longer to rearrange the data for plotting and then plots a CDF of the normalized counts | ||
#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data | ||
#' @param qc_obj The object that has the qc stuff stored | ||
#' @param wide_ar aspect ratio, default is 0.75 | ||
#' @importFrom tidyr pivot_longer | ||
#' @importFrom ggplot2 ggplot labs | ||
#' @return counts_cdf a ggplot | ||
#' @examples \dontrun{ | ||
#' | ||
#' } | ||
#' | ||
qc_cdf <- function(gimap_dataset, wide_ar = 0.75) { | ||
long_form <- | ||
tidyr::pivot_longer(data.frame(gimap_dataset$transformed_data$count_norm), | ||
everything(), | ||
names_to = "sample", | ||
values_to = "count_normalized" | ||
) | ||
|
||
counts_cdf <- ggplot(long_form, aes(x = count_normalized, color = sample)) + | ||
stat_ecdf() + | ||
labs( | ||
x = "-log10(count/total_count)", | ||
y = "Expected_pgRNAs", | ||
color = "Sample" | ||
) + | ||
plot_options() + | ||
plot_theme() + | ||
theme(aspect.ratio = wide_ar) | ||
|
||
return(counts_cdf) | ||
} | ||
|
||
#' Create a histogram for the pgRNA log2 CPMs, faceted by sample | ||
#' @description This function uses pivot_longer to rearrange the data for plotting and then plots sample specific histograms of the pgRNA cpm's | ||
#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data | ||
#' @param wide_ar aspect ratio, default is 0.75 | ||
#' @importFrom tidyr pivot_longer | ||
#' @import ggplot2 | ||
#' @return sample_cpm_histogram a ggplot | ||
#' @examples \dontrun{ | ||
#' | ||
#' } | ||
#' | ||
qc_sample_hist <- function(gimap_dataset, wide_ar = 0.75) { | ||
long_form <- | ||
tidyr::pivot_longer(data.frame(gimap_dataset$transformed_data$log2_cpm), | ||
everything(), | ||
names_to = "sample", | ||
values_to = "log2_cpm" | ||
) | ||
|
||
sample_cpm_histogram <- ggplot(long_form, aes(x = log2_cpm, fill = sample)) + | ||
geom_histogram(color = "black", binwidth = 0.5) + | ||
plot_options() + | ||
plot_theme() + | ||
theme( | ||
aspect.ratio = wide_ar, | ||
legend.position = "none" | ||
) + | ||
facet_wrap(~sample, scales = "free_y", ncol = ceiling(ncol(gimap_dataset$raw_counts) / 2)) | ||
|
||
return(sample_cpm_histogram) | ||
} | ||
|
||
#' Create a correlation heatmap for the pgRNA CPMs | ||
#' @description This function uses the `cor` function to find correlations between the sample CPM's and then plots a heatmap of these | ||
#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data | ||
#' @param ... Additional arguments are passed in to the pheatmap function. | ||
#' @importFrom magrittr %>% | ||
#' @importFrom pheatmap pheatmap | ||
#' @return `sample_cor_heatmap` a pheatmap | ||
#' @examples \dontrun{ | ||
#' | ||
#' } | ||
#' | ||
qc_cor_heatmap <- function(gimap_dataset, ...) { | ||
cpm_cor <- gimap_dataset$transformed_data$cpm %>% | ||
cor() %>% | ||
round(2) %>% | ||
data.frame() | ||
|
||
sample_cor_heatmap <- | ||
pheatmap::pheatmap(cpm_cor, | ||
border_color = "white", | ||
cellwidth = 20, | ||
cellheight = 20, | ||
treeheight_row = 20, | ||
treeheight_col = 20, | ||
... | ||
) | ||
|
||
return(sample_cor_heatmap) | ||
} | ||
|
||
#' Create a histogram with plasmid log2 CPM values and ascertain a cutoff for low values | ||
#' @description Find the distribution of plasmid (day0 data) pgRNA log2 CPM values, and ascertain a cutoff or filter for low log2 CPM values. | ||
#' Assumes the first column of the dataset is the day0 data; do I need a better | ||
#' method to tell, especially if there are reps? | ||
#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data | ||
#' @param cutoff default is NULL, the cutoff for low log2 CPM values for the plasmid time period | ||
#' @param wide_ar aspect ratio, default is 0.75 | ||
#' @importFrom magrittr %>% | ||
#' @import ggplot2 | ||
#' @return a named list | ||
|
||
qc_plasmid_histogram <- function(gimap_dataset, cutoff = NULL, wide_ar = 0.75) { | ||
to_plot <- data.frame(gimap_dataset$transformed_data$log2_cpm[, 1]) %>% `colnames<-`(c("log2_cpm")) | ||
|
||
plasmid_cpm_histogram <- ggplot(to_plot, aes(x = log2_cpm)) + | ||
geom_histogram(binwidth = 0.2, color = "black", fill = "gray60") + | ||
plot_options() + | ||
plot_theme() + | ||
theme(aspect.ratio = wide_ar) | ||
|
||
if (is.null(cutoff)) { | ||
# if cutoff is null, suggest a cutoff and plot with suggested | ||
quantile_info <- quantile(to_plot$log2_cpm) | ||
plasmid_cpm_stats <- data.frame( | ||
stat = c("median", "Q1", "Q3", "lower_outlier"), | ||
log2_cpm_value = c( | ||
quantile_info["50%"], | ||
quantile_info["25%"], | ||
quantile_info["75%"], | ||
quantile_info["25%"] - (1.5 * (quantile_info["75%"] - quantile_info["25%"])) | ||
) | ||
) | ||
cutoff <- plasmid_cpm_stats[which(plasmid_cpm_stats$stat == "lower_outlier"), "log2_cpm_value"] | ||
} else { | ||
plasmid_cpm_stats <- NULL | ||
} | ||
|
||
# plot with the cutoff | ||
plasmid_cpm_hist_wcutoff <- plasmid_cpm_histogram + | ||
geom_vline( | ||
xintercept = cutoff, | ||
linetype = "dashed" | ||
) | ||
|
||
|
||
plasmid_cpm_filter <- unlist(lapply(1:nrow(to_plot), function(x) to_plot$log2_cpm[x] < cutoff)) | ||
|
||
plasmid_filter_df <- data.frame("Plasmid_log2cpmBelowCutoff" = c(FALSE, TRUE), n = c(sum(!plasmid_cpm_filter), sum(plasmid_cpm_filter))) %>% | ||
mutate(percent = round(((n / sum(n)) * 100), 2)) | ||
|
||
return(list( | ||
plasmid_hist_nocutoff = plasmid_cpm_histogram, | ||
plasmid_stats = plasmid_cpm_stats, | ||
used_log2_cpm_cutoff = cutoff, | ||
plasmid_hist_cutoff = plasmid_cpm_hist_wcutoff, | ||
plasmid_filter = plasmid_cpm_filter, | ||
plasmid_filter_report = plasmid_filter_df | ||
)) | ||
} |
Oops, something went wrong.