Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Quality Control -- Report Making Function #16

Merged
merged 31 commits into from
Mar 22, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
4d5757f
some additions
kweav Feb 8, 2024
6a6b8e1
try to merge
kweav Feb 9, 2024
3b4fbfe
correct cpm and log2_cpm calculations
kweav Feb 9, 2024
48f8725
starting to include ID mapping
kweav Feb 16, 2024
4b7469e
left out file name ugh
kweav Feb 20, 2024
8483167
still need to require metadata and combine filters
kweav Feb 21, 2024
96df654
some more qc progress
kweav Feb 21, 2024
b2085a6
need to make report and filter gimap_dataset
kweav Feb 24, 2024
9f96980
need to make report and filter gimap_dataset
kweav Feb 24, 2024
ea88770
testing the new functions
kweav Feb 26, 2024
359c193
git status says it be modified
kweav Feb 26, 2024
fc30903
additions for generating QC report and saving plots
kweav Mar 1, 2024
acd00ef
Update vignettes/getting-started.Rmd
kweav Mar 4, 2024
a75cedf
make qc plot file
kweav Mar 5, 2024
b589975
Merge branch 'ki_qc' of https://github.com/FredHutch/gimap into ki_qc
kweav Mar 5, 2024
fa2e9df
move qc_filter gimap loc, update docs, and specify which example_data…
kweav Mar 5, 2024
f08e240
add plot saved locations to html reports
kweav Mar 6, 2024
47b24d8
remove overwrite true again
kweav Mar 6, 2024
c7a1841
Simplifying
cansavvy Mar 7, 2024
800f065
simplifying
cansavvy Mar 7, 2024
1244234
it works
cansavvy Mar 7, 2024
e580aca
making the example data names a little more consistent
cansavvy Mar 12, 2024
e66e928
Update 00-setup_data.R
kweav Mar 18, 2024
f2264ff
Merge branch 'main' into ki_qc
cansavvy Mar 22, 2024
0004e69
Merge branch 'ki_qc' into cansavvy/refactor
cansavvy Mar 22, 2024
c734375
Merge pull request #17 from FredHutch/cansavvy/refactor
cansavvy Mar 22, 2024
254478e
Adding so report runs
cansavvy Mar 22, 2024
53b140f
Updates in vignette
cansavvy Mar 22, 2024
3e1f642
Updates for building purposes
cansavvy Mar 22, 2024
4eac61a
Remove filter from this branch (it will be in the other one)
cansavvy Mar 22, 2024
8dc306e
Merge branch 'main' into ki_qc
cansavvy Mar 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 16 additions & 13 deletions R/00-setup_data.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#' Making a new gimap dataset
#' @description This function allows people to have their data ready to be processed by gimap
#' @param counts a matrix of data that contains the counts where rows are each paired_guide target and columns are each sample
#' @param pg_metadata metadata associated with the pgRNA constructs that correspond to the rows of the counts data
#' @param pg_ids the pgRNA IDs: metadata associated with the pgRNA constructs that correspond to the rows of the counts data
#' @param pg_metadata construct metadata
#' @param sample_metadata metadata associated with the samples of the dataset that correspond to the columns of the counts data
#' @return A special gimap_dataset to be used with the other functions in this package.
#' @export
Expand All @@ -12,19 +13,21 @@
#'
#' counts_data <- setup_data(data)
#' }
setup_data <- function(counts = NULL, pg_metadata = NULL, sample_metadata = NULL) {
setup_data <- function(counts = NULL, pg_ids = NULL, pg_metadata = NULL, sample_metadata = NULL) {

new_data <- gimap_data <- list(
raw_counts = NULL,
counts_per_sample = NULL,
coverage = NULL,
#coverage = NULL,
kweav marked this conversation as resolved.
Show resolved Hide resolved
transformed_data = list(
long_form = NULL,
count_norm = NULL,
count_norm = NULL,
cpm = NULL,
log2_cpm = NULL),
metadata = list(pg_metadata = NULL,
log2_cpm = NULL,
qc_filter = NULL),
metadata = list(pg_ids = NULL,
pg_metadata = NULL,
sample_metadata = NULL)
)

Expand All @@ -42,14 +45,14 @@ setup_data <- function(counts = NULL, pg_metadata = NULL, sample_metadata = NULL

new_data$metadata$sample_metadata <- sample_metadata

# If they don't give paired guide metadata, then we will make up a row id
if (is.null(pg_metadata)) pg_metadata <- data.frame(id = 1:ncol(counts))
if (!is.data.frame(pg_metadata)) stop("metadata can only be in the form of a data.frame")
if (nrow(pg_metadata) != nrow(counts)) stop("the number of rows in the pg_info is not equal to the number of rows in the counts" )
# If they don't give paired guide ids, then we will make up a row id
if (is.null(pg_ids)) pg_ids <- data.frame(id = 1:ncol(counts))
cansavvy marked this conversation as resolved.
Show resolved Hide resolved
if (!is.data.frame(pg_ids)) stop("pgRNA IDs can only be in the form of a data.frame")
if (nrow(pg_ids) != nrow(counts)) stop("the number of rows in the pg_info is not equal to the number of rows in the counts" )

# we need the first column to be a unique id
if (!(length(unique(pg_metadata[, 1])) == length(pg_metadata[, 1]))) stop("The first column in paired guide metadata must be a unique ID")
new_data$metadata$pg_metadata <- pg_metadata
if (!(nrow(unique(pg_ids[, 1])) == nrow(pg_ids[, 1]))) stop("The paired guide IDs must be a unique ID")
new_data$metadata$pg_ids <- pg_ids

# Store the raw counts
new_data$raw_counts <- counts
Expand All @@ -59,8 +62,8 @@ setup_data <- function(counts = NULL, pg_metadata = NULL, sample_metadata = NULL

# Transform the data
new_data$transformed_data$count_norm <- apply(counts, 2, function(x) -log10((x+1)/sum(x)))
new_data$transformed_data$cpm <- apply(counts, 2, function(x) (x/new_data$counts_per_sample)*1e6)
new_data$transformed_data$log2_cpm <- log2(new_data$cpm +1)
new_data$transformed_data$cpm <- apply(counts, 2, function(x) (x/sum(x))*1e6)
new_data$transformed_data$log2_cpm <- log2(new_data$transformed_data$cpm +1)

return(new_data)
}
Expand Down
94 changes: 68 additions & 26 deletions R/01-qc.R
Original file line number Diff line number Diff line change
@@ -1,39 +1,81 @@

#' This is a title for a function
#' A function to run QC
#' @description This is a function here's where we describe what it does
#' @param parameter Here's a parameter let's describe it here
#' @export
#' @importFrom tidyr pivot_longer
#' @import ggplot2
#' @importFrom magrittr %>%
#' @import ggplot2 pheatmap
#' @examples \dontrun{
#'
#' }

run_qc <- function(gimap_data, plots_dir = "./qc_plots", wide_ar = 0.75, square_ar = 1) {

if (!dir.exists(plots_dir)) {
run_qc <- function(gimap_dataset, plasmid_cutoff = NULL, use_combined = TRUE, plots_dir = "./qc_plots", wide_ar = 0.75, square_ar = 1) {

if (!dir.exists(plots_dir)){
dir.create(plots_dir, showWarnings = TRUE)
}

if (!("gimap_dataset" %in% class(gimap_data))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.")

long_form <-
tidyr::pivot_longer(data.frame(gimap_dataset$transformed_data$count_norm),
everything(),
names_to = "sample",
values_to = "count_normalized")

counts_cdf <- ggplot(long_form, aes(x = count_normalized, color = sample)) +
stat_ecdf() +
labs(x = "-log10(count/total_count)", # bquote(~-log[10]~"(count/total_count)")
y = "Expected_pgRNAs",
color = "Sample") +
plot_options() +
plot_theme() +
theme(aspect.ratio = wide_ar)

counts_cdf
save_plot(counts_cdf, out_dir = plots_dir)



if (!("gimap_dataset" %in% class(gimap_dataset))) stop("This function only works with gimap_dataset objects which can be made with the setup_data() function.")

counts_cdf <- qc_cdf(gimap_dataset, wide_ar = wide_ar)

counts_cdf #put it in the report
#save_plot(counts_cdf)

sample_cpm_histogram <- qc_sample_hist(gimap_dataset, wide_ar = wide_ar)

sample_cpm_histogram #put it in the report
#save_plot(sample_cpm_histogram)

sample_cor_heatmap_unfiltered <- qc_cor_heatmap(gimap_dataset)

sample_cor_heatmap_unfiltered #put it in the report
#save_plot(sample_cor_heatmap_unfiltered)


## flag pgRNAs with count = 0 at any time point
## how many pgRNAs have a raw count of 0 at any time point/sample
counts_filter_list <- qc_filter_zerocounts(gimap_dataset)

counts_filter <- counts_filter_list$filter #TRUE means the filter applies
cansavvy marked this conversation as resolved.
Show resolved Hide resolved

zerocount_df <- counts_filter_list$reportdf

zerocount_df #put it in the report

## flag pgRNAs with low plasmid read counts
## plasmid reads are the first sample/day0 time point
plasmid_filter_list <- qc_plasmid_histogram(gimap_dataset, cutoff = plasmid_cutoff, wide_ar = wide_ar)

plasmid_filter_list$plasmid_hist_nocutoff #put it in the report

plasmid_filter_list$plasmid_stats #put it in the report

plasmid_filter_list$plasmid_hist_cutoff #put it in the report

plasmid_filter_list$plasmid_filter_report #put it in the report

plasmid_filter <- plasmid_filter_list$plasmid_filter #TRUE means the filter applies

## combine the filters (zerocount and plasmid)
## how many pgRNAs are removed by both filters?

combined_filter_list <- qc_combine_filters(counts_filter, plasmid_filter, use_combined = use_combined)

combined_filter_df <- combined_filter_list$num_filtered_report #put it in the report

which_filter_df <- combined_filter_list$num_which_filter_report #put it in the report

gimap_dataset$transformed_data$qc_filter <- combined_filter_list$combined_filter

#if (use_combined){
#filter gimap_dataset and metadata to only keep pgRNA passing both filters
#gimap_dataset$transformed_data$qc_filter <- combined_filter_list$keep_pgRNA
#} else {
#print out a message asking which filter to use?
#}

return(gimap_dataset)
}
31 changes: 31 additions & 0 deletions R/qc_cdf.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#' Create a CDF for the pgRNA normalized counts
#' @description This function uses pivot_longer to rearrange the data for plotting and then plots a CDF of the normalized counts
#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data
#' @param wide_ar aspect ratio, default is 0.75
#' @importFrom tidyr pivot_longer
#' @import ggplot2
#' @return counts_cdf a ggplot
#' @examples \dontrun{
#'
#' }
#'

qc_cdf <- function(gimap_dataset, wide_ar = 0.75){
kweav marked this conversation as resolved.
Show resolved Hide resolved

long_form <-
tidyr::pivot_longer(data.frame(gimap_dataset$transformed_data$count_norm),
everything(),
names_to = "sample",
values_to = "count_normalized")

counts_cdf <- ggplot(long_form, aes(x=count_normalized, color = sample)) +
stat_ecdf() +
labs(x = "-log10(count/total_count)",
y = "Expected_pgRNAs",
color = "Sample") +
plot_options() +
plot_theme() +
theme(aspect.ratio = wide_ar)

return(counts_cdf)
}
34 changes: 34 additions & 0 deletions R/qc_combine_filters.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#'
#'
#'
#'
#'
#' @importFrom magrittr %>% %<>%
#' @import dplyr
#' @return a named list
#'

qc_combine_filters <- function(filter_zerocount, filter_plasmid, use_combined = TRUE){
combined_filter <- cbind(data.frame(filter_zerocount), data.frame(filter_plasmid)) %>%
mutate(combined = filter_zerocount == TRUE | filter_plasmid == TRUE)

combined_filter_df <- data.frame("Filtered" = c("Yes", "No"), #Yes filtered means remove pgRNA
n = c(sum(combined_filter$combined), sum(!combined_filter$combined))) %>%
mutate(percent = round(((n/sum(n))*100),2))

which_filter_df <- data.frame("Which Filter" = c("Low plasmid cpm only", "Zero count for any sample", "Both", "Neither"),
n = c(sum(combined_filter$filter_zerocount == FALSE & combined_filter$filter_plasmid == TRUE),
sum(combined_filter$filter_zerocount == TRUE & combined_filter$filter_plasmid == FALSE),
sum(combined_filter$filter_zerocount == TRUE & combined_filter$filter_plasmid == TRUE),
sum(combined_filter$filter_zerocount == FALSE & combined_filter$filter_plasmid == FALSE))) %>%
mutate(percent = round(((n/sum(n)) *100),2))

if (use_combined){
combined_filter %<>% select(combined) %>% mutate(combined = as.logical(abs(combined -1))) %>% `colnames<-`(c("keep_pgRNA")) #invert the bits so boolean reports which ones you keep, not which ones you remove.
} else { combined_filter$keep_pgRNA <- NA }

return(list(combined_filter = combined_filter$keep_pgRNA,
num_filtered_report = combined_filter_df,
num_which_filter_report = which_filter_df))

}
27 changes: 27 additions & 0 deletions R/qc_cor_heatmap.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#' Create a correlation heatmap for the pgRNA cpm's
#' @description This function uses the cor function to find correlations between the sample CPM's and then plots a heatmap of these
#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data
#' @importFrom magrittr %>%
#' @importFrom pheatmap pheatmap
#' @return sample_cor_heatmap a pheatmap
#' @examples \dontrun{
#'
#' }
#'

qc_cor_heatmap <- function(gimap_dataset){

cpm_cor <- gimap_dataset$transformed_data$cpm %>%
cor() %>%
round(2) %>%
data.frame()

sample_cor_heatmap <-
pheatmap(cpm_cor,
border_color = "white",
cellwidth = 20, cellheight = 20,
treeheight_row = 20, treeheight_col = 20)

return (sample_cor_heatmap)

}
21 changes: 21 additions & 0 deletions R/qc_filter_zerocounts.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#' Create a filter for pgRNAs which have a raw count of 0 for any sample/time point
#' @description This function flags and reports which and how many pgRNAs have a raw count of 0 for any sample/time point
#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data
#' @importFrom magrittr %>%
#' @importFrom dpylr mutate
#' @return a named list with the filter `filter` specifying which pgRNA have a count zero for at least one sample/time point and a report df `reportdf` for the number and percent of pgRNA which have a count zero for at least one sample/time point
#' @examples \dontrun{
#'
#' }
#'

qc_filter_zerocounts <- function(gimap_dataset){

counts_filter <- unlist(lapply(1:nrow(gimap_dataset$raw_counts), function(x) 0 %in% gimap_dataset$raw_counts[x,]))

zerocount_df <- data.frame("RawCount0" = c(FALSE, TRUE), n = c(sum(!counts_filter), sum(counts_filter))) %>%
mutate(percent = round(((n/sum(n))*100),2))

return(list(filter = counts_filter, reportdf = zerocount_df))

}
52 changes: 52 additions & 0 deletions R/qc_plasmid_histogram.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#'
#'
#'
#'
#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data
#' @param cutoff default is NULL
#' @param wide_ar aspect ratio, default is 0.75
#' @importFrom magrittr %>%
#' @import ggplot2
#' @return a named list
#' Assumes the first column of the dataset is the day0 data; do I need a better
#' method to tell, especially if there are reps?

qc_plasmid_histogram <- function(gimap_dataset, cutoff = NULL, wide_ar = 0.75){
to_plot <- data.frame(gimap_dataset$transformed_data$log2_cpm[,1]) %>% `colnames<-`(c("log2_cpm"))
plasmid_cpm_histogram <- ggplot(to_plot, aes(x=log2_cpm)) +
geom_histogram(binwidth=0.2, color = "black", fill="gray60") +
plot_options() +
plot_theme() +
theme(aspect.ratio = wide_ar)

if (is.null(cutoff)){
#if cutoff is null, suggest a cutoff and plot with suggested
quantile_info <- quantile(to_plot$log2_cpm)
plasmid_cpm_stats <- data.frame(stat = c("median", "Q1", "Q3", "lower_outlier"),
log2_cpm_value = c(quantile_info["50%"],
quantile_info["25%"],
quantile_info["75%"],
quantile_info["25%"] - (1.5*(quantile_info["75%"] - quantile_info["25%"]))
)
)
cutoff <- plasmid_cpm_stats[which(plasmid_cpm_stats$stat == "lower_outlier"),"log2_cpm_value"]
} else{ plasmid_cpm_stats <- NULL }

#plot with the cutoff
plasmid_cpm_hist_wcutoff <- plasmid_cpm_histogram +
geom_vline(xintercept = cutoff,
linetype = "dashed")


plasmid_cpm_filter <- unlist(lapply(1:nrow(to_plot), function(x) to_plot$log2_cpm[x] < cutoff))

plasmid_filter_df <- data.frame("Plasmid_log2cpmBelowCutoff" = c(FALSE, TRUE), n = c(sum(!plasmid_cpm_filter), sum(plasmid_cpm_filter))) %>%
mutate(percent = round(((n/sum(n))*100),2))

return(list(plasmid_hist_nocutoff= plasmid_cpm_histogram,
plasmid_stats <- plasmid_cpm_stats,
used_log2_cpm_cutoff = cutoff,
plasmid_hist_cutoff=plasmid_cpm_hist_wcutoff,
plasmid_filter = plasmid_cpm_filter,
plasmid_filter_report = plasmid_filter_df))
}
30 changes: 30 additions & 0 deletions R/qc_sample_hist.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#' Create a histogram for the pgRNA cpm's, faceted by sample
#' @description This function uses pivot_longer to rearrange the data for plotting and then plots sample specific histograms of the pgRNA cpm's
#' @param gimap_dataset The special gimap_dataset from the `setup_data` function which contains the transformed data
#' @param wide_ar aspect ratio, default is 0.75
#' @importFrom tidyr pivot_longer
#' @import ggplot2
#' @return sample_cpm_histogram a ggplot
#' @examples \dontrun{
#'
#' }
#'

qc_sample_hist <- function(gimap_dataset, wide_ar = 0.75){

long_form <-
tidyr::pivot_longer(data.frame(gimap_dataset$transformed_data$log2_cpm),
everything(),
names_to = "sample",
values_to = "log2_cpm")

sample_cpm_histogram <- ggplot(long_form, aes(x=log2_cpm, fill = sample)) +
geom_histogram(color = "black", binwidth = 0.5) +
plot_options() +
plot_theme() +
theme(aspect.ratio = wide_ar,
legend.position = "none") +
facet_wrap(~sample, scales = "free_y", ncol = ceiling(ncol(gimap_dataset$raw_counts)/2))

return(sample_cpm_histogram)
}
Loading
Loading