Skip to content

Commit

Permalink
Merge pull request #16 from FredHutch/ki_qc
Browse files Browse the repository at this point in the history
Quality Control -- Report Making Function
  • Loading branch information
cansavvy authored Mar 22, 2024
2 parents 6027dad + 8dc306e commit 9aa337a
Show file tree
Hide file tree
Showing 12 changed files with 33,591 additions and 99 deletions.
4 changes: 4 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,12 @@ Imports:
rmarkdown,
rcmdcheck,
ggplot2,
magrittr,
kableExtra,
pheatmap,
Suggests:
testthat (>= 3.0.0),
roxygen2,
Config/testthat/edition: 3
Encoding: UTF-8
RoxygenNote: 7.2.3
Expand Down
13 changes: 12 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,8 +1,19 @@
# Generated by roxygen2: do not edit by hand

export("%>%")
export(annotate)
export(calc_fc)
export(calculate_gi)
export(example_data)
export(get_example_data)
export(run_qc)
export(setup_data)
import(dplyr)
import(ggplot2)
import(kableExtra)
importFrom(dplyr,mutate)
importFrom(ggplot2,ggplot)
importFrom(ggplot2,labs)
importFrom(magrittr,"%<>%")
importFrom(magrittr,"%>%")
importFrom(pheatmap,pheatmap)
importFrom(tidyr,pivot_longer)
39 changes: 21 additions & 18 deletions R/00-setup_data.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#' Making a new gimap dataset
#' @description This function allows people to have their data ready to be processed by gimap
#' @param counts a matrix of data that contains the counts where rows are each paired_guide target and columns are each sample
#' @param pg_metadata metadata associated with the pgRNA constructs that correspond to the rows of the counts data
#' @param pg_ids the pgRNA IDs: metadata associated with the pgRNA constructs that correspond to the rows of the counts data
#' @param pg_metadata construct metadata
#' @param sample_metadata metadata associated with the samples of the dataset that correspond to the columns of the counts data
#' @return A special gimap_dataset to be used with the other functions in this package.
#' @export
Expand All @@ -12,20 +13,23 @@
#'
#' counts_data <- setup_data(data)
#' }
setup_data <- function(counts = NULL, pg_metadata = NULL, sample_metadata = NULL) {

setup_data <- function(counts = NULL, pg_ids = NULL, pg_metadata = NULL, sample_metadata = NULL) {
new_data <- gimap_data <- list(
raw_counts = NULL,
raw_counts = NULL,
counts_per_sample = NULL,
coverage = NULL,
transformed_data = list(
long_form = NULL,
count_norm = NULL,
count_norm = NULL,
cpm = NULL,
log2_cpm = NULL),
metadata = list(pg_metadata = NULL,
sample_metadata = NULL)
log2_cpm = NULL
),
metadata = list(
pg_ids = NULL,
pg_metadata = NULL,
sample_metadata = NULL
),
qc = list(
qc_filter = NULL
)
)

class(new_data) <- c("list", "gimap_dataset")
Expand All @@ -36,20 +40,20 @@ setup_data <- function(counts = NULL, pg_metadata = NULL, sample_metadata = NULL
# If they don't give sample metadata, then we will make up a row id
if (is.null(sample_metadata)) sample_metadata <- data.frame(id = 1:nrow(counts))
if (!is.data.frame(sample_metadata)) stop("metadata can only be in the form of a data.frame")
if (nrow(sample_metadata) != ncol(counts)) stop("the number of rows in the sample metadata is not equal to the number of columns in the counts" )
if (nrow(sample_metadata) != ncol(counts)) stop("the number of rows in the sample metadata is not equal to the number of columns in the counts")

if (!(length(unique(sample_metadata[, 1])) == length(sample_metadata[, 1]))) stop("The first column in sample metadata must be a unique ID")

new_data$metadata$sample_metadata <- sample_metadata

# If they don't give paired guide metadata, then we will make up a row id
if (is.null(pg_metadata)) pg_metadata <- data.frame(id = 1:ncol(counts))
if (!is.data.frame(pg_metadata)) stop("metadata can only be in the form of a data.frame")
if (nrow(pg_metadata) != nrow(counts)) stop("the number of rows in the pg_info is not equal to the number of rows in the counts" )
# If they don't give paired guide ids, then we will make up a row id
if (is.null(pg_ids)) pg_ids <- data.frame(id = 1:ncol(counts))
if (!is.data.frame(pg_ids)) stop("pgRNA IDs can only be in the form of a data.frame")
if (nrow(pg_ids) != nrow(counts)) stop("the number of rows in the pg_info is not equal to the number of rows in the counts")

# we need the first column to be a unique id
if (!(length(unique(pg_metadata[, 1])) == length(pg_metadata[, 1]))) stop("The first column in paired guide metadata must be a unique ID")
new_data$metadata$pg_metadata <- pg_metadata
if (!(nrow(unique(pg_ids[, 1])) == nrow(pg_ids[, 1]))) stop("The paired guide IDs must be a unique ID")
new_data$metadata$pg_ids <- pg_ids

# Store the raw counts
new_data$raw_counts <- counts
Expand All @@ -64,4 +68,3 @@ setup_data <- function(counts = NULL, pg_metadata = NULL, sample_metadata = NULL

return(new_data)
}

71 changes: 48 additions & 23 deletions R/01-qc.R
Original file line number Diff line number Diff line change
@@ -1,39 +1,64 @@

#' This is a title for a function
#' A function to run QC
#' @description This is a function here's where we describe what it does
#' @param parameter Here's a parameter let's describe it here
#' @param use_combined default it TRUE; if TRUE, both zero count and low plasmid CPM filters are applied and if either is TRUE, a pgRNA construct will be filtered out. If FALSE, need to allow to specify which should be used
#' @param plots_dir default is `./qc_plots`; directory to save plots created with this function, if it doesn't exist already it will be created
#' @param overwrite default is FALSE; whether to overwrite the QC Report file
#' @param output_file_path default is `QC_Report`; name of the output QC report file
#' @param ... additional parameters are sent to rmarkdown::render
#' @export
#' @importFrom tidyr pivot_longer
#' @import ggplot2
#' @importFrom magrittr %>%
#' @examples \dontrun{
#'
#' }
run_qc <- function(gimap_dataset,
output_file = "./gimap_QC_Report.Rmd",
plots_dir = "./qc_plots",
overwrite = FALSE,
...) {

if (!("gimap_dataset" %in% class(gimap_dataset))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.")

run_qc <- function(gimap_data, plots_dir = "./qc_plots", wide_ar = 0.75, square_ar = 1) {
# Determine the template
templateFile <- system.file("rmd/gimapQCTemplate.Rmd", package = "gimap")

if (!dir.exists(plots_dir)) {
dir.create(plots_dir, showWarnings = TRUE)
# Make sure that the directory exists!
directory <- dirname(output_file)

# If not, make the directory
if (directory != ".") {
if (!dir.exists(directory)) dir.create(directory, showWarnings = TRUE, recursive = TRUE)
}

if (!("gimap_dataset" %in% class(gimap_data))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.")
# Now if the file exists,
if (file.exists(templateFile)) {
if (file.exists(output_file) & !overwrite) {
stop("there is already an output .Rmd file", output_file,
". Please remove or rename this file, or choose another output_file name.",
call. = FALSE
)
} else {
file.copy(from = templateFile, to = output_file, overwrite = overwrite)
}
} else {
stop("The Rmd template file ", templateFile, " does not exist -- did you move it from the package files?",
call. = FALSE)
}

long_form <-
tidyr::pivot_longer(data.frame(gimap_dataset$transformed_data$count_norm),
everything(),
names_to = "sample",
values_to = "count_normalized")
# Make a plots directory if it doesn't exist
if (!dir.exists(plots_dir)) dir.create(plots_dir, showWarnings = TRUE)

counts_cdf <- ggplot(long_form, aes(x = count_normalized, color = sample)) +
stat_ecdf() +
labs(x = "-log10(count/total_count)", # bquote(~-log[10]~"(count/total_count)")
y = "Expected_pgRNAs",
color = "Sample") +
plot_options() +
plot_theme() +
theme(aspect.ratio = wide_ar)
# Send the data to render it!
rmarkdown::render(output_file,
params = list(dataset = gimap_dataset,
plots_dir = plots_dir),
...)

counts_cdf
save_plot(counts_cdf, out_dir = plots_dir)
# Tell where the output is
results_file <- gsub("\\.Rmd$", "\\.html", output_file)
message("Results in: ", results_file)

results_file <- normalizePath(list.files(pattern = results_file, full.names = TRUE))

if (results_file != "") browseURL(results_file)
}
1 change: 0 additions & 1 deletion R/02-foldchange.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
#' @examples \dontrun{
#'
#' }

calc_fc <- function() {

}
3 changes: 1 addition & 2 deletions R/03-annotate.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
#' @examples \dontrun{
#'
#' }

annotate <- function() {
annotate <- function() {

}
3 changes: 1 addition & 2 deletions R/04-calculate_gi.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
#' @examples \dontrun{
#'
#' }

calculate_gi <- function() {
calculate_gi <- function() {

}
155 changes: 155 additions & 0 deletions R/qc-plots.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#' Create a CDF for the pgRNA normalized counts
#' @description This function uses pivot_longer to rearrange the data for plotting and then plots a CDF of the normalized counts
#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data
#' @param qc_obj The object that has the qc stuff stored
#' @param wide_ar aspect ratio, default is 0.75
#' @importFrom tidyr pivot_longer
#' @importFrom ggplot2 ggplot labs
#' @return counts_cdf a ggplot
#' @examples \dontrun{
#'
#' }
#'
qc_cdf <- function(gimap_dataset, wide_ar = 0.75) {
long_form <-
tidyr::pivot_longer(data.frame(gimap_dataset$transformed_data$count_norm),
everything(),
names_to = "sample",
values_to = "count_normalized"
)

counts_cdf <- ggplot(long_form, aes(x = count_normalized, color = sample)) +
stat_ecdf() +
labs(
x = "-log10(count/total_count)",
y = "Expected_pgRNAs",
color = "Sample"
) +
plot_options() +
plot_theme() +
theme(aspect.ratio = wide_ar)

return(counts_cdf)
}

#' Create a histogram for the pgRNA log2 CPMs, faceted by sample
#' @description This function uses pivot_longer to rearrange the data for plotting and then plots sample specific histograms of the pgRNA cpm's
#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data
#' @param wide_ar aspect ratio, default is 0.75
#' @importFrom tidyr pivot_longer
#' @import ggplot2
#' @return sample_cpm_histogram a ggplot
#' @examples \dontrun{
#'
#' }
#'
qc_sample_hist <- function(gimap_dataset, wide_ar = 0.75) {
long_form <-
tidyr::pivot_longer(data.frame(gimap_dataset$transformed_data$log2_cpm),
everything(),
names_to = "sample",
values_to = "log2_cpm"
)

sample_cpm_histogram <- ggplot(long_form, aes(x = log2_cpm, fill = sample)) +
geom_histogram(color = "black", binwidth = 0.5) +
plot_options() +
plot_theme() +
theme(
aspect.ratio = wide_ar,
legend.position = "none"
) +
facet_wrap(~sample, scales = "free_y", ncol = ceiling(ncol(gimap_dataset$raw_counts) / 2))

return(sample_cpm_histogram)
}

#' Create a correlation heatmap for the pgRNA CPMs
#' @description This function uses the `cor` function to find correlations between the sample CPM's and then plots a heatmap of these
#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data
#' @param ... Additional arguments are passed in to the pheatmap function.
#' @importFrom magrittr %>%
#' @importFrom pheatmap pheatmap
#' @return `sample_cor_heatmap` a pheatmap
#' @examples \dontrun{
#'
#' }
#'
qc_cor_heatmap <- function(gimap_dataset, ...) {
cpm_cor <- gimap_dataset$transformed_data$cpm %>%
cor() %>%
round(2) %>%
data.frame()

sample_cor_heatmap <-
pheatmap::pheatmap(cpm_cor,
border_color = "white",
cellwidth = 20,
cellheight = 20,
treeheight_row = 20,
treeheight_col = 20,
...
)

return(sample_cor_heatmap)
}

#' Create a histogram with plasmid log2 CPM values and ascertain a cutoff for low values
#' @description Find the distribution of plasmid (day0 data) pgRNA log2 CPM values, and ascertain a cutoff or filter for low log2 CPM values.
#' Assumes the first column of the dataset is the day0 data; do I need a better
#' method to tell, especially if there are reps?
#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data
#' @param cutoff default is NULL, the cutoff for low log2 CPM values for the plasmid time period
#' @param wide_ar aspect ratio, default is 0.75
#' @importFrom magrittr %>%
#' @import ggplot2
#' @return a named list

qc_plasmid_histogram <- function(gimap_dataset, cutoff = NULL, wide_ar = 0.75) {
to_plot <- data.frame(gimap_dataset$transformed_data$log2_cpm[, 1]) %>% `colnames<-`(c("log2_cpm"))

plasmid_cpm_histogram <- ggplot(to_plot, aes(x = log2_cpm)) +
geom_histogram(binwidth = 0.2, color = "black", fill = "gray60") +
plot_options() +
plot_theme() +
theme(aspect.ratio = wide_ar)

if (is.null(cutoff)) {
# if cutoff is null, suggest a cutoff and plot with suggested
quantile_info <- quantile(to_plot$log2_cpm)
plasmid_cpm_stats <- data.frame(
stat = c("median", "Q1", "Q3", "lower_outlier"),
log2_cpm_value = c(
quantile_info["50%"],
quantile_info["25%"],
quantile_info["75%"],
quantile_info["25%"] - (1.5 * (quantile_info["75%"] - quantile_info["25%"]))
)
)
cutoff <- plasmid_cpm_stats[which(plasmid_cpm_stats$stat == "lower_outlier"), "log2_cpm_value"]
} else {
plasmid_cpm_stats <- NULL
}

# plot with the cutoff
plasmid_cpm_hist_wcutoff <- plasmid_cpm_histogram +
geom_vline(
xintercept = cutoff,
linetype = "dashed"
)


plasmid_cpm_filter <- unlist(lapply(1:nrow(to_plot), function(x) to_plot$log2_cpm[x] < cutoff))

plasmid_filter_df <- data.frame("Plasmid_log2cpmBelowCutoff" = c(FALSE, TRUE), n = c(sum(!plasmid_cpm_filter), sum(plasmid_cpm_filter))) %>%
mutate(percent = round(((n / sum(n)) * 100), 2))

return(list(
plasmid_hist_nocutoff = plasmid_cpm_histogram,
plasmid_stats = plasmid_cpm_stats,
used_log2_cpm_cutoff = cutoff,
plasmid_hist_cutoff = plasmid_cpm_hist_wcutoff,
plasmid_filter = plasmid_cpm_filter,
plasmid_filter_report = plasmid_filter_df
))
}
Loading

0 comments on commit 9aa337a

Please sign in to comment.