diff --git a/PEPATACr/DESCRIPTION b/PEPATACr/DESCRIPTION index bf29c3f9..cfa97fcd 100644 --- a/PEPATACr/DESCRIPTION +++ b/PEPATACr/DESCRIPTION @@ -8,4 +8,4 @@ Depends: R (>= 3.5.1), data.table, pepr, gplots, grid, ggplot2, optigrab, scales License: BSD 2-Clause "Simplified" License Encoding: UTF-8 LazyData: true -RoxygenNote: 7.1.0 +RoxygenNote: 7.1.1 diff --git a/PEPATACr/NAMESPACE b/PEPATACr/NAMESPACE index e33ff021..8582fa62 100644 --- a/PEPATACr/NAMESPACE +++ b/PEPATACr/NAMESPACE @@ -2,6 +2,7 @@ export(consensusPeaks) export(createAssetsSummary) +export(createStatsSummary) export(fileIcon) export(getPrealignments) export(narrowPeakToBigBed) diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R index da1246d8..7d53940d 100644 --- a/PEPATACr/R/PEPATACr.R +++ b/PEPATACr/R/PEPATACr.R @@ -1641,7 +1641,7 @@ narrowPeakToBigBed <- function(input=input, chr_sizes=chr_sizes, #' #' @param input Path to narrowPeak file #' @param chr_sizes Genome chromosome sizes file. -#' @param output Output file. +#' @param output Output file name. #' @param normalize Remove overlaps and normalize the score. #' @keywords reduce fixed peaks #' @export @@ -1649,9 +1649,20 @@ reducePeaks <- function(input, chr_sizes, output=NA, normalize=FALSE) { info <- file.info(file.path(input)) if (file.exists(file.path(input)) && info$size != 0) { peaks <- fread(file.path(input)) - colnames(peaks) <- c("chr", "start", "end", - "name", "score", "strand", - "signalValue", "pValue", "qValue", "peak") + if (ncol(peaks) == 6) { + colnames(peaks) <- c("chr", "start", "end", + "name", "score", "strand") + bedOnly <- TRUE + } else if (ncol(peaks) == 10) { + colnames(peaks) <- c("chr", "start", "end", + "name", "score", "strand", + "signalValue", "pValue", "qValue", "peak") + bedOnly <- FALSE + } else { + warning(paste0(input, " did not contain a recognizable number", + " of columns (", ncol(peaks), ")")) + rm(peaks) + } setkey(peaks, chr, start, end) } else { if (info$size == 0) { @@ -1679,7 +1690,12 @@ reducePeaks <- function(input, chr_sizes, output=NA, normalize=FALSE) { hits <- foverlaps(peaks, peaks, by.x=c("chr", "start", "end"), type="any", which=TRUE, nomatch=0) - qVals <- data.table(index=rep(1:nrow(peaks)), qValue=peaks$qValue) + if (bedOnly) { + # Only have the "score" to rank peaks + qVals <- data.table(index=rep(1:nrow(peaks)), qValue=peaks$score) + } else { + qVals <- data.table(index=rep(1:nrow(peaks)), qValue=peaks$qValue) + } setkey(hits, xid) setkey(qVals, index) out <- hits[qVals, nomatch=0] @@ -1711,7 +1727,9 @@ reducePeaks <- function(input, chr_sizes, output=NA, normalize=FALSE) { } } else { - message("PEPATACr reducePeaks() failed. Check peak and chrom.sizes files.") + err_msg = paste0("PEPATACr reducePeaks() failed. ", + "Check peak and chrom.sizes files.") + warning(err_msg) } } @@ -2471,37 +2489,49 @@ countReproduciblePeaks <- function(peak_list, peak_DT) { #' #' @param sample_table A data.table object that includes paths to #' valid peak files. -#' @param chrom_sizes A data.table of genome chromosome sizes. +#' @param chr_sizes A data.table of genome chromosome sizes. #' @param min_samples A minimum number of samples a peak must be present #' in to keep. #' @param min_score A minimum peak score to keep an individual peak. -collapsePeaks <- function(sample_table, chrom_sizes, min_samples=2, min_score=5) { - final <- data.table(chr=character(), - start=integer(), - end=integer(), - name=character(), - score=numeric(), - strand=character(), - signalValue=numeric(), - pValue=numeric(), - qValue=numeric(), - peak=integer()) +#' @param min_olap A minimum number of bases between peaks to be +#' considered overlapping. +collapsePeaks <- function(sample_table, chr_sizes, + min_samples=2, min_score=5, min_olap=1) { # create combined peaks - peaks <- rbindlist(lapply(sample_table$peak_files, fread)) - colnames(peaks) <- c("chr", "start", "end", "name", "score", - "strand", "signalValue", "pValue", "qValue", - "peak") + peaks <- rbindlist(lapply(sample_table$peak_files, fread), idcol="file") + if (ncol(peaks) == 7) { + colnames(peaks) <- c("file", "chr", "start", "end", + "name", "score", "strand") + } else if (ncol(peaks) == 11) { + colnames(peaks) <- c("file", "chr", "start", "end", + "name", "score", "strand", + "signalValue", "pValue", "qValue", "peak") + } else { + warning(paste0("Peak files did not contain a recognizable number", + " of columns (", ncol(peaks), ")")) + rm(peaks) + final <- data.table(chr=character(), + start=integer(), + end=integer(), + name=character(), + score=numeric(), + strand=character(), + signalValue=numeric(), + pValue=numeric(), + qValue=numeric(), + peak=integer()) + return(final) + } setkey(peaks, chr, start, end) # keep highest scored peaks - # hits <- foverlaps(peaks, peaks, - # by.x=c("chr", "start", "end"), - # type="any", which=TRUE, nomatch=0) # split by chromosome to minimize memory requirements peaks_by_chr <- split(peaks, peaks$chr) hit_aggregator <- function(x) { + #message(paste0("x: ", unique(x$chr))) # DEBUG peaksGR <- makeGRangesFromDataFrame(x, keep.extra.columns=FALSE) hitsGR <- suppressWarnings( - findOverlaps(peaksGR, peaksGR, ignore.strand=TRUE)) + findOverlaps(peaksGR, peaksGR, + ignore.strand=TRUE, minoverlap=min_olap)) hits <- data.table::data.table(xid=queryHits(hitsGR), yid=subjectHits(hitsGR)) setkey(hits, xid) @@ -2510,20 +2540,22 @@ collapsePeaks <- function(sample_table, chrom_sizes, min_samples=2, min_score=5) out <- hits[scores, nomatch=0] keep <- out[out[,.I[which.max(score)],by=yid]$V1] indices <- unique(keep$xid) - final <- x[indices,] - final[start < 0, start := 0] - return(final) + reduced <- x[indices,] + reduced[start < 0, start := 0] + return(reduced) } final <- rbindlist(lapply(peaks_by_chr, hit_aggregator)) # can't extend past chromosome - for (i in nrow(chrom_sizes)) { - final[chr == chrom_sizes$chr[i] & end > chrom_sizes$size[i], - end := chrom_sizes$size[i]] + for (i in nrow(chr_sizes)) { + final[chr == chr_sizes$chr[i] & end > chr_sizes$size[i], + end := chr_sizes$size[i]] } # identify reproducible peaks - peaks[,group := gsub("_peak.*","",name)] + peaks[,group := sample_table$sample_name[file]] + peaks[,file:=NULL] + final[,file:=NULL] peak_list <- splitDataTable(peaks, "group") rm(peaks) invisible(gc()) @@ -2539,30 +2571,34 @@ collapsePeaks <- function(sample_table, chrom_sizes, min_samples=2, min_score=5) #' This function is meant to identify a project level set of consensus peaks. #' -#' @param project A PEPr Project object -#' @param output_dir A PEP project output directory path -#' @param results_subdir A PEP project results subdirectory path +#' @param sample_table A data.table containing sample names and corresponding +#' genomes. +#' @param summary_dir A directory path to place results of this analysis +#' @param results_subdir A project results subdirectory path #' @param assets A data.table containing file assets +#' @param min_samples A minimum number of samples a peak must be present +#' in to keep. +#' @param min_score A minimum peak score to keep an individual peak. +#' @param min_olap A minimum number of bases between peaks to be +#' considered overlapping. #' @keywords consensus peaks #' @export -consensusPeaks <- function(project, output_dir, results_subdir, assets) { - # Set the summary output directory - summary_dir <- suppressMessages(file.path(output_dir, "summary")) +consensusPeaks <- function(sample_table, summary_dir, results_subdir, assets, + min_samples=2, min_score=5, min_olap=1) { + # Produce summary output directory (if needed) dir.create(summary_dir, showWarnings = FALSE) - sample_table <- data.table(sample_name=pepr::sampleTable(prj)$sample_name, - genome=pepr::sampleTable(prj)$genome) setDT(sample_table)[assets[asset == 'chrom_sizes', ], c_path := i.path, on = 'sample_name'] # generate paths to peak files sample_table[,peak_files:=.((file.path( - results_subdir, - sample_table$sample_name, - paste0("peak_calling_", sample_table$genome), - paste0(sample_table$sample_name, - "_peaks_normalized.narrowPeak"))))] + results_subdir, + sample_table$sample_name, + paste0("peak_calling_", sample_table$genome), + paste0(sample_table$sample_name, + "_peaks_normalized.narrowPeak"))))] #Only keep samples with valid peak files file_list <- sample_table$peak_files @@ -2577,8 +2613,7 @@ consensusPeaks <- function(project, output_dir, results_subdir, assets) { if (nrow(files) == 0) { return(consensus_peak_files) } - #sample_table <- sample_table[files, .SD, nomatch=0L, - # on="peak_files", .SDcols=names(sample_table)] + sample_table <- unique( sample_table[sample_table$peak_files %in% files$peak_files,]) @@ -2609,8 +2644,9 @@ consensusPeaks <- function(project, output_dir, results_subdir, assets) { } message(paste0("Calculating ", g, " consensus peak set from ", nrow(st_list[[g]]), " samples...")) - final <- collapsePeaks(st_list[[g]], c_size) - #} + final <- collapsePeaks(st_list[[g]], c_size, + min_samples, min_score, min_olap) + if (!is.null(final)) { # save consensus peak set file_name <- paste0("_", g,"_consensusPeaks.narrowPeak") @@ -2680,31 +2716,27 @@ readPepatacPeakCounts = function(prj, results_subdir) { #' Produce a project level peak counts table #' -#' @param project A PEPr project object -#' @param output_dir A PEP project output directory path +#' @param sample_table A data.table containing sample names and their +#' corresponding genome +#' @param summary_dir A PEP project summary directory path #' @param results_subdir A PEP project results subdirectory path #' @param assets A data.table containing file assets #' @param poverlap Weight counts by the percentage overlap with peak #' @param norm Use normalized read counts #' @param cutoff Only keep peaks present in the `cutoff` number of samples +#' @param min_olap A minimum number of bases between peaks to be +#' considered overlapping. #' @keywords project peak counts #' @export -peakCounts <- function(project, output_dir, results_subdir, assets, - poverlap=FALSE, norm=FALSE, cutoff=2) { - # Set the output directory - summary_dir <- suppressMessages(file.path(output_dir, "summary")) +peakCounts <- function(sample_table, summary_dir, results_subdir, assets, + poverlap=FALSE, norm=FALSE, cutoff=2, min_olap=1) { # Produce output directory (if needed) dir.create(summary_dir, showWarnings = FALSE) - sample_names <- pepr::sampleTable(project)$sample_name - genomes <- as.list(pepr::sampleTable(project)$genome) + sample_names <- unique(as.character(sample_table$sample_name)) + genomes <- as.list(sample_table$genome) names(genomes) <- sample_names - sample_names <- unique(as.character(pepr::sampleTable(project)$sample_name)) - - sample_table <- data.table( - sample_name=pepr::sampleTable(project)$sample_name, - genome=pepr::sampleTable(project)$genome) - + setDT(sample_table)[assets[asset == 'chrom_sizes', ], c_path := i.path, on = 'sample_name'] @@ -2744,7 +2776,9 @@ peakCounts <- function(project, output_dir, results_subdir, assets, file_list <- sample_table$peak_files file_exists <- character() for (i in 1:length(file_list)) { - if(file.exists(file.path(file_list[i]))) { + # Check if file exists and is not empty + info <- file.info(file.path(file_list[i])) + if(file.exists(file.path(file_list[i])) & info$size != 0) { file_exists <- append(file_exists, file.path(file_list[i])) } } @@ -2753,8 +2787,7 @@ peakCounts <- function(project, output_dir, results_subdir, assets, if (nrow(files) == 0) { return(consensus_peak_files) } - #sample_table <- sample_table[files, .SD, nomatch=0L, - # on="peak_files", .SDcols=names(sample_table)] + sample_table <- unique( sample_table[sample_table$peak_files %in% files$peak_files,]) peak_files <- sample_table$peak_files @@ -2829,8 +2862,29 @@ peakCounts <- function(project, output_dir, results_subdir, assets, width=as.numeric(), frac=as.numeric(), norm=as.numeric()) - - peaks <- rbindlist(lapply(st_list[[g]]$peak_files, fread)) + + peaks <- tryCatch( + { + suppressMessages( + rbindlist(lapply(st_list[[g]]$peak_files, fread))) + }, + error=function(e) { + message("peakCounts() peak coverage file fread(): ", e) + return(NULL) + }, + warning=function(e) { + message("peakCounts() peak coverage file fread(): ", e) + return(NULL) + } + ) + + if (is.null(peaks)) { + warning(strwrap(prefix = " ", initial = "", + "Unable to produce a peak coverage file. Check that + individual peak coverage files are not empty.")) + return(NULL) + } + colnames(peaks) <- c("chr", "start", "end", "read_count", "base_count", "width", "frac", "norm") setkey(peaks, chr, start, end) @@ -2840,12 +2894,12 @@ peakCounts <- function(project, output_dir, results_subdir, assets, keep.extra.columns=TRUE) reduceGR <- reduce(peaksGR) - # instead, different column for each sample is the counts columns, plural + # instead, different column for each sample is the counts columns, + # plural reduce_dt <- data.table(chr=as.character(seqnames(reduceGR)), start=start(reduceGR), end=end(reduceGR)) f <- function(x) {list(0)} - #reduce_dt[, (sample_names) := f()] # Need to make syntactically valid names valid_names <- make.unique(make.names(st_list[[g]]$sample_name)) reduce_dt[, (valid_names) := f()] @@ -2873,16 +2927,15 @@ peakCounts <- function(project, output_dir, results_subdir, assets, p <- fread(file) #name <- gsub("_peaks_coverage.bed","", basename(file)) name <- make.unique(make.names(st_list[[g]][i]$sample_name)) - #message(paste0("name: ", name)) i <- i + 1 - #message(paste0("i: ", i)) colnames(p) <- c("chr", "start", "end", "read_count", "base_count", "width", "frac", "norm") setkey(p, chr, start, end) reduceGR <- makeGRangesFromDataFrame(reduce_dt) peaksGR <- makeGRangesFromDataFrame(p) - hitsGR <- findOverlaps(query=reduceGR, subject=peaksGR) + hitsGR <- findOverlaps(query=reduceGR, subject=peaksGR, + minoverlap=min_olap) # Weight counts by percent overlap olap <- pintersect(reduceGR[queryHits(hitsGR)], @@ -2962,18 +3015,61 @@ peakCounts <- function(project, output_dir, results_subdir, assets, } -#' Create and return assets spreadsheet and save the spreadsheet to file +#' Create and return sample statistics summary data table #' -#' @param project A PEPr project object -#' @param output_dir A PEP project output directory path +#' @param samples A PEP project character vector of sample names #' @param results_subdir A PEP project results subdirectory path #' @export -createAssetsSummary <- function(project, output_dir, results_subdir) { - # Convenience - project_name <- pepr::config(project)$name - +createStatsSummary <- function(samples, results_subdir) { + # Create stats_summary file + missing_files <- 0 + write(paste0("Creating stats summary..."), stdout()) + + for (sample in samples) { + sample_output_folder <- file.path(results_subdir, sample) + sample_assets_file <- file.path(sample_output_folder, "stats.tsv") + + if (!file.exists(sample_assets_file)) { + missing_files <- missing_files + 1 + next + } + + t <- fread(sample_assets_file, header=FALSE, + col.names=c('stat', 'val', 'annotation')) + # Remove complete duplicates + t <- t[!duplicated(t[, c('stat', 'val', 'annotation')], + fromLast=TRUE),] + max_time <- suppressWarnings(max(t[stat=="Time",]$val)) + # Keep max(Time) and last(Success) + t <- t[!duplicated(t[, c('stat', 'annotation')], + fromLast=TRUE),] + t[stat=="Time",]$val <- max_time + + t2 <- data.table(t(t$val)) + colnames(t2) <- t$stat + t2 <- cbind(data.table(sample_name=sample), t2) + if (exists("stats")) { + stats <- rbind(stats, t2, fill=TRUE) + } else { + stats <- t2 + } + } + + if (missing_files > 0) { + warning(sprintf("Stats files missing for %s samples.", missing_files)) + } + + return(stats) +} + + +#' Create and return assets summary data table +#' +#' @param samples A PEP project character vector of sample names +#' @param results_subdir A PEP project results subdirectory path +#' @export +createAssetsSummary <- function(samples, results_subdir) { # Create assets_summary file - project_samples <- pepr::sampleTable(project)$sample_name missing_files <- 0 assets <- data.table(sample_name=character(), asset=character(), @@ -2981,7 +3077,7 @@ createAssetsSummary <- function(project, output_dir, results_subdir) { annotation=character()) write(paste0("Creating assets summary..."), stdout()) - for (sample in project_samples) { + for (sample in samples) { sample_output_folder <- file.path(results_subdir, sample) sample_assets_file <- file.path(sample_output_folder, "assets.tsv") @@ -2997,16 +3093,11 @@ createAssetsSummary <- function(project, output_dir, results_subdir) { t[,sample_name:=sample] assets = rbind(assets, t) } - project_assets_file <- file.path(output_dir, - paste0(project_name, '_assets_summary.tsv')) + if (missing_files > 0) { warning(sprintf("Assets files missing for %s samples.", missing_files)) } - fwrite(assets, project_assets_file, sep="\t", col.names=FALSE) - - message(sprintf("Summary (n=%s): %s", - length(unique(assets$sample_name)), project_assets_file)) return(assets) } diff --git a/PEPATACr/man/collapsePeaks.Rd b/PEPATACr/man/collapsePeaks.Rd index c2e66676..b3a8a1ec 100644 --- a/PEPATACr/man/collapsePeaks.Rd +++ b/PEPATACr/man/collapsePeaks.Rd @@ -6,15 +6,27 @@ Take a set of peak files and identify only the reproducible minimally scoring peaks.} \usage{ -collapsePeaks(sample_table, chrom_sizes, min_score = 5) +collapsePeaks( + sample_table, + chr_sizes, + min_samples = 2, + min_score = 5, + min_olap = 1 +) } \arguments{ \item{sample_table}{A data.table object that includes paths to valid peak files.} -\item{chrom_sizes}{A data.table of genome chromosome sizes.} +\item{chr_sizes}{A data.table of genome chromosome sizes.} -\item{min_score}{A minimum peak score to keep.} +\item{min_samples}{A minimum number of samples a peak must be present +in to keep.} + +\item{min_score}{A minimum peak score to keep an individual peak.} + +\item{min_olap}{A minimum number of bases between peaks to be +considered overlapping.} } \description{ Internal helper function for \code{consensusPeaks} diff --git a/PEPATACr/man/consensusPeaks.Rd b/PEPATACr/man/consensusPeaks.Rd index 42af6e6f..0e2d64bc 100644 --- a/PEPATACr/man/consensusPeaks.Rd +++ b/PEPATACr/man/consensusPeaks.Rd @@ -4,16 +4,29 @@ \alias{consensusPeaks} \title{This function is meant to identify a project level set of consensus peaks.} \usage{ -consensusPeaks(project, output_dir, results_subdir, assets) +consensusPeaks( + sample_table, + summary_dir, + results_subdir, + assets, + min_samples = 2, + min_score = 5 +) } \arguments{ -\item{project}{A PEPr Project object} +\item{sample_table}{A data.table containing sample names and corresponding +genomes.} -\item{output_dir}{A PEP project output directory path} +\item{summary_dir}{A directory path to place results of this analysis} -\item{results_subdir}{A PEP project results subdirectory path} +\item{results_subdir}{A project results subdirectory path} \item{assets}{A data.table containing file assets} + +\item{min_samples}{A minimum number of samples a peak must be present +in to keep.} + +\item{min_score}{A minimum peak score to keep an individual peak.} } \description{ This function is meant to identify a project level set of consensus peaks. diff --git a/PEPATACr/man/countReproduciblePeaks.Rd b/PEPATACr/man/countReproduciblePeaks.Rd index 57f997d1..e961f85d 100644 --- a/PEPATACr/man/countReproduciblePeaks.Rd +++ b/PEPATACr/man/countReproduciblePeaks.Rd @@ -5,13 +5,13 @@ \title{Internal helper function for \code{collapsePeaks} Count the number of times a peak appears across a list of peak files} \usage{ -countReproduciblePeaks(peakList, peakDT) +countReproduciblePeaks(peak_list, peak_DT) } \arguments{ -\item{peakList}{A list of data.table objects representing +\item{peak_list}{A list of data.table objects representing narrowPeak files} -\item{peakDT}{A single data.table of peaks} +\item{peak_DT}{A single data.table of peaks} } \description{ Internal helper function for \code{collapsePeaks} diff --git a/PEPATACr/man/createAssetsSummary.Rd b/PEPATACr/man/createAssetsSummary.Rd index ad6b06d5..b25929eb 100644 --- a/PEPATACr/man/createAssetsSummary.Rd +++ b/PEPATACr/man/createAssetsSummary.Rd @@ -2,17 +2,15 @@ % Please edit documentation in R/PEPATACr.R \name{createAssetsSummary} \alias{createAssetsSummary} -\title{Create and return assets spreadsheet and save the spreadsheet to file} +\title{Create and return assets summary data table} \usage{ -createAssetsSummary(project, output_dir, results_subdir) +createAssetsSummary(samples, results_subdir) } \arguments{ -\item{project}{A PEPr project object} - -\item{output_dir}{A PEP project output directory path} +\item{samples}{A PEP project character vector of sample names} \item{results_subdir}{A PEP project results subdirectory path} } \description{ -Create and return assets spreadsheet and save the spreadsheet to file +Create and return assets summary data table } diff --git a/PEPATACr/man/createStatsSummary.Rd b/PEPATACr/man/createStatsSummary.Rd new file mode 100644 index 00000000..7843bcea --- /dev/null +++ b/PEPATACr/man/createStatsSummary.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PEPATACr.R +\name{createStatsSummary} +\alias{createStatsSummary} +\title{Create and return sample statistics summary data table} +\usage{ +createStatsSummary(samples, results_subdir) +} +\arguments{ +\item{samples}{A PEP project character vector of sample names} + +\item{results_subdir}{A PEP project results subdirectory path} +} +\description{ +Create and return sample statistics summary data table +} diff --git a/PEPATACr/man/is.empty.Rd b/PEPATACr/man/is.empty.Rd new file mode 100644 index 00000000..0b7ff004 --- /dev/null +++ b/PEPATACr/man/is.empty.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PEPATACr.R +\name{is.empty} +\alias{is.empty} +\title{Check if a data frame is empty.} +\usage{ +is.empty(df) +} +\arguments{ +\item{df}{A data frame to check for emptiness} +} +\description{ +Return TRUE if provided data frame is NULL, has no rows, or has no columns. +} diff --git a/PEPATACr/man/peakCounts.Rd b/PEPATACr/man/peakCounts.Rd index 5c26bae0..0247b881 100644 --- a/PEPATACr/man/peakCounts.Rd +++ b/PEPATACr/man/peakCounts.Rd @@ -4,16 +4,35 @@ \alias{peakCounts} \title{Produce a project level peak counts table} \usage{ -peakCounts(project, output_dir, results_subdir, assets) +peakCounts( + sample_table, + summary_dir, + results_subdir, + assets, + poverlap = FALSE, + norm = FALSE, + cutoff = 2, + min_olap = 1 +) } \arguments{ -\item{project}{A PEPr project object} +\item{sample_table}{A data.table containing sample names and their +corresponding genome} -\item{output_dir}{A PEP project output directory path} +\item{summary_dir}{A PEP project summary directory path} \item{results_subdir}{A PEP project results subdirectory path} \item{assets}{A data.table containing file assets} + +\item{poverlap}{Weight counts by the percentage overlap with peak} + +\item{norm}{Use normalized read counts} + +\item{cutoff}{Only keep peaks present in the `cutoff` number of samples} + +\item{min_olap}{A minimum number of bases between peaks to be +considered overlapping.} } \description{ Produce a project level peak counts table diff --git a/PEPATACr/man/reducePeaks.Rd b/PEPATACr/man/reducePeaks.Rd index 5a090ff5..25eb7b7d 100644 --- a/PEPATACr/man/reducePeaks.Rd +++ b/PEPATACr/man/reducePeaks.Rd @@ -5,14 +5,14 @@ \title{This function is meant to keep only the most significant non-overlapping peaks. It also trims peaks extending beyond the bounds of the chromosome.} \usage{ -reducePeaks(input, chr_sizes, output = NULL, normalize = FALSE) +reducePeaks(input, chr_sizes, output = NA, normalize = FALSE) } \arguments{ \item{input}{Path to narrowPeak file} \item{chr_sizes}{Genome chromosome sizes file. } -\item{output}{Output file.} +\item{output}{Output file name.} \item{normalize}{Remove overlaps and normalize the score.} } diff --git a/PEPATACr/man/tryCatchChromBins.Rd b/PEPATACr/man/tryCatchChromBins.Rd new file mode 100644 index 00000000..516c3832 --- /dev/null +++ b/PEPATACr/man/tryCatchChromBins.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PEPATACr.R +\name{tryCatchChromBins} +\alias{tryCatchChromBins} +\title{Handle warnings and errors while returning value from calcChromBinsRef()} +\usage{ +tryCatchChromBins(x, y) +} +\arguments{ +\item{x}{A GenomicRanges or GenomicRangesList object with query regions} + +\item{y}{A character vector representing a known genome that will be used +to grab chromosome sizes with \code{getChromSizes}} +} +\description{ +Handle warnings and errors while returning value from calcChromBinsRef() +} diff --git a/containers/pepatac.Dockerfile b/containers/pepatac.Dockerfile index 8403327f..ac10320a 100644 --- a/containers/pepatac.Dockerfile +++ b/containers/pepatac.Dockerfile @@ -1,116 +1,181 @@ # Pull base image -FROM phusion/baseimage:0.10.2 +FROM phusion/baseimage:master # Who maintains this image LABEL maintainer Jason Smith "jasonsmith@virginia.edu" # Version info -LABEL version 0.9.5 +LABEL version 0.9.16 # Use baseimage-docker's init system. CMD ["/sbin/my_init"] # Install dependencies +RUN add-apt-repository ppa:deadsnakes/ppa RUN apt-get update && \ - DEBIAN_FRONTEND=noninteractive apt-get install --assume-yes \ + DEBIAN_FRONTEND=noninteractive apt-get install --assume-yes \ + libbz2-dev \ + build-essential \ curl \ default-jre \ default-jdk \ + openjdk-8-jdk \ git \ libcommons-math3-java \ libcurl4-gnutls-dev \ libjbzip2-java \ + liblua5.1-0-dev \ libpng-dev \ libssl-dev \ libtbb2 \ libtbb-dev \ + lua-filesystem-dev \ + lua-lpeg-dev \ + lua-md5-dev \ + libexpat1-dev \ + libtre-dev \ + libcairo2-dev \ + libpango1.0-dev \ + libsqlite3-dev \ + libbam-dev \ + libxml2-dev \ openssl \ - pigz \ - python \ - python-pip python-dev build-essential \ - wget + perl \ + pigz=2.4-1 \ + software-properties-common \ + python3.8 \ + python3-pip \ + python3-dev \ + build-essential \ + rustc \ + wget \ + zlib1g \ + zlib1g-dev # Install MySQL server RUN DEBIAN_FRONTEND=noninteractive apt-get install --assume-yes mysql-server \ mysql-client \ libmysqlclient-dev - + # Install python tools -RUN pip install --upgrade pip -RUN pip install numpy # must install separate from MACS2 due to setup_requires conflicts +RUN python3.8 -m pip install -U pip +RUN pip install numpy>=1.17 # must install separate from MACS due to setup_requires conflicts RUN pip install virtualenv && \ - pip install cython && \ - pip install pandas && \ + pip install cython>=0.29 && \ + pip install cykhash && \ pip install pararead && \ - pip install piper + pip install attmap>=0.12.9 && \ + pip install codecov>=2.0 && \ + pip install colorama>=0.3.9 && \ + pip install cython>=0.29 && \ + pip install cykhash>=1.0.2 && \ + pip install jinja2>=2.11.2 && \ + pip install jsonschema>=3.0.1 && \ + pip install logmuse>=0.2.5 && \ + pip install oyaml>=0.9 && \ + pip install pandas>=0.20.2 && \ + pip install peppy>=0.31.0 && \ + pip install piper>=0.12.1 && \ + pip install psutil>=5.6.3 && \ + pip install pysam>=0.13 && \ + pip install python-Levenshtein>=0.12.0 && \ + pip install pyyaml>=3.13 && \ + pip install refgenconf>=0.7.0 && \ + pip install refgenie>=0.9.3 && \ + pip install ubiquerg>=0.6.1 && \ + pip install yacman>=0.6.7 # Install R -RUN DEBIAN_FRONTEND=noninteractive apt-get --assume-yes install r-base r-base-dev && \ - echo "r <- getOption('repos'); r['CRAN'] <- 'http://cran.us.r-project.org'; options(repos = r);" > ~/.Rprofile && \ - Rscript -e "install.packages('argparser')" && \ +RUN apt update -qq && \ + DEBIAN_FRONTEND=noninteractive apt --assume-yes install --no-install-recommends dirmngr && \ + apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 && \ + add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/" + +RUN DEBIAN_FRONTEND=noninteractive apt-get --assume-yes install r-base r-base-dev r-base-core r-recommended && \ + echo "r <- getOption('repos'); r['CRAN'] <- 'http://cran.us.r-project.org'; options(repos = r);" > ~/.Rprofile + +RUN Rscript -e "install.packages('argparser')" && \ + Rscript -e "install.packages('optigrab')" && \ Rscript -e "install.packages('data.table')" && \ + Rscript -e "install.packages('xml2')" && \ + Rscript -e "install.packages('roxygen2')" && \ + Rscript -e "install.packages('rversions')" && \ Rscript -e "install.packages('devtools')" && \ Rscript -e "devtools::install_github('pepkit/pepr')" && \ Rscript -e "install.packages('data.table')" && \ - Rscript -e "source('https://bioconductor.org/biocLite.R')" && \ - Rscript -e "source('https://bioconductor.org/biocLite.R'); biocLite('GenomicRanges')" && \ + Rscript -e "install.packages('BiocManager')" && \ + Rscript -e "BiocManager::install('GenomicRanges')" && \ + Rscript -e "BiocManager::install('BSgenome')" && \ + Rscript -e "BiocManager::install('GenomicFeatures')" && \ + Rscript -e "BiocManager::install('ensembldb')" && \ + Rscript -e "BiocManager::install('ExperimentHub')" && \ Rscript -e "devtools::install_github('databio/GenomicDistributions')" && \ + Rscript -e "install.packages('http://big.databio.org/GenomicDistributionsData/GenomicDistributionsData_0.0.2.tar.gz', repos=NULL)" &&\ Rscript -e "install.packages('ggrepel')" && \ Rscript -e "install.packages('ggplot2')" && \ Rscript -e "install.packages('gplots')" && \ Rscript -e "install.packages('grid')" && \ Rscript -e "install.packages('gtable')" && \ Rscript -e "install.packages('scales')" && \ - Rscript -e "install.packages('stringr')" - -# Install MACS2 from github -# Must install this way due to bug using easy_install in MACS2 setup.py with pip -WORKDIR /home/tools/ -RUN git clone git://github.com/taoliu/MACS.git && \ - cd /home/tools/MACS && \ - python setup_w_cython.py install && \ - python setup.py install + Rscript -e "install.packages('stringr')" && \ + Rscript -e "devtools::install_github('databio/pepatac/PEPATACr/')" # Install bedtools RUN DEBIAN_FRONTEND=noninteractive apt-get install --assume-yes \ ant \ - bedtools - + bedtools>=2.29.2 + # Install fastqc WORKDIR /home/tools/ -RUN wget http://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.11.7.zip && \ - unzip fastqc_v0.11.7.zip && \ +RUN wget https://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.11.9.zip && \ + unzip fastqc_v0.11.9.zip && \ cd /home/tools/FastQC && \ chmod 755 fastqc && \ ln -s /home/tools/FastQC/fastqc /usr/bin/ - + # Install htslib WORKDIR /home/src/ -RUN wget https://github.com/samtools/htslib/releases/download/1.7/htslib-1.7.tar.bz2 && \ - tar xf htslib-1.7.tar.bz2 && \ - cd /home/src/htslib-1.7 && \ - ./configure --prefix /home/tools/ && \ +RUN wget https://github.com/samtools/htslib/releases/download/1.12/htslib-1.12.tar.bz2 && \ + tar xf htslib-1.12.tar.bz2 && \ + cd /home/src/htslib-1.12 && \ + ./configure --prefix /home/src/ && \ make && \ make install +# Install MACS2 from PyPi +WORKDIR /home/tools/ +RUN pip install MACS2 + # Install samtools WORKDIR /home/src/ -RUN wget https://github.com/samtools/samtools/releases/download/1.7/samtools-1.7.tar.bz2 && \ - tar xf samtools-1.7.tar.bz2 && \ - cd /home/src/samtools-1.7 && \ +RUN wget https://github.com/samtools/samtools/releases/download/1.12/samtools-1.12.tar.bz2 && \ + tar xf samtools-1.12.tar.bz2 && \ + cd /home/src/samtools-1.12 && \ ./configure && \ make && \ make install && \ - ln -s /home/src/samtools-1.7/samtools /usr/bin/ + ln -s /home/src/samtools-1.12/samtools /usr/bin/ + +# Install preseq +WORKDIR /home/tools/ +RUN wget https://github.com/smithlabcode/preseq/releases/download/v3.1.2/preseq-3.1.2.tar.gz && \ + tar xf preseq-3.1.2.tar.gz && \ + cd preseq-3.1.2 && \ + mkdir build && cd build && \ + ../configure --enable-hts \ + CPPFLAGS=-I"/home/src/include" \ + LDFLAGS="-L/home/src/lib -Wl,-R/home/src/lib" && \ + make && \ + make install # Install bowtie2 WORKDIR /home/src/ -RUN wget https://downloads.sourceforge.net/project/bowtie-bio/bowtie2/2.3.4.1/bowtie2-2.3.4.1-source.zip && \ - unzip bowtie2-2.3.4.1-source.zip && \ - cd /home/src/bowtie2-2.3.4.1 && \ +RUN wget -O bowtie2-2.4.2-source.zip 'https://downloads.sourceforge.net/project/bowtie-bio/bowtie2/2.4.2/bowtie2-2.4.2-source.zip?ts=gAAAAABgfxZxKMUjBU0A0XjfO55q36KUoO9RRemjzTT_WCDpSSZCy8NtKrFODKV4xS_135KTiIdnBSaqmvHuQw9l6nqM2EULvw%3D%3D&r=https%3A%2F%2Fsourceforge.net%2Fprojects%2Fbowtie-bio%2Ffiles%2Fbowtie2%2F2.4.2%2Fbowtie2-2.4.2-source.zip%2Fdownload' && \ + unzip bowtie2-2.4.2-source.zip && \ + cd /home/src/bowtie2-2.4.2 && \ make && \ make install && \ - ln -s /home/src/bowtie2-2.3.4.1/bowtie2 /usr/bin/ + ln -s /home/src/bowtie2-2.4.2/bowtie2 /usr/bin/ # Install samblaster WORKDIR /home/tools/ @@ -126,15 +191,18 @@ RUN wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bedGraphToBigWig wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bigWigCat && \ wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/bedSort && \ wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/bedToBigBed && \ + wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bigWigMerge && \ chmod +x /home/tools/bedGraphToBigWig && \ chmod +x /home/tools/wigToBigWig && \ chmod +x /home/tools/bigWigCat && \ chmod +x /home/tools/bedSort && \ chmod +x /home/tools/bedToBigBed && \ + chmod +x /home/tools/bigWigMerge && \ ln -s /home/tools/bedGraphToBigWig /usr/bin/ && \ ln -s /home/tools/wigToBigWig /usr/bin/ && \ ln -s /home/tools/bigWigCat /usr/bin/ && \ ln -s /home/tools/bedSort /usr/bin/ && \ + ln -s /home/tools/bigWigMerge /usr/bin/ && \ ln -s /home/tools/bedToBigBed /usr/bin/ # Install Skewer @@ -145,33 +213,99 @@ RUN git clone https://github.com/relipmoc/skewer.git && \ make install # OPTIONAL REQUIREMENTS +# Install bwa +WORKDIR /home/src/ +RUN wget -O bwa-0.7.17.tar.bz2 'https://downloads.sourceforge.net/project/bio-bwa/bwa-0.7.17.tar.bz2?ts=gAAAAABgfZHQ5GyuWr5rjuP8wYOC1ueJsyJD3qN-MfiwKeN3jxzRMWGU1eUfKvYzteg86k_UeLBYZTRRm1deDQaokXbPaevHYw%3D%3D&r=https%3A%2F%2Fsourceforge.net%2Fprojects%2Fbio-bwa%2Ffiles%2Flatest%2Fdownload' && \ + tar xf bwa-0.7.17.tar.bz2 && \ + cd /home/src/bwa-0.7.17 && \ + make && \ + ln -s /home/src/bwa-0.7.17/bwa /usr/bin/ + +# Install F-seq2 +WORKDIR /home/src/ +RUN pip install pyBigWig +RUN wget https://github.com/Boyle-Lab/F-Seq2/archive/master.zip && \ + unzip master.zip && \ + cd /home/src/F-Seq2-master && \ + python3 setup.py install + +# Install genometools +WORKDIR /home/tools/ +RUN wget http://genometools.org/pub/genometools-1.6.1.tar.gz && \ + tar xf genometools-1.6.1.tar.gz && \ + cd /home/tools/genometools-1.6.1 && \ + make useshared=yes && \ + make install + +# Install Genrich +WORKDIR /home/tools/ +RUN git clone https://github.com/jsh58/Genrich.git && \ + cd Genrich && \ + make && \ + ln -s /home/tools/Genrich/Genrich /usr/bin/ + +# Install HMMRATAC +WORKDIR /home/tools/bin +RUN wget https://github.com/LiuLabUB/HMMRATAC/releases/download/1.2.10/HMMRATAC_V1.2.10_exe.jar && \ + chmod +x HMMRATAC_V1.2.10_exe.jar + +# Install HOMER +WORKDIR /home/tools/bin +RUN wget http://homer.ucsd.edu/homer/configureHomer.pl && \ + perl /home/tools/bin/configureHomer.pl -install && \ + perl /home/tools/bin/configureHomer.pl -install human && \ + perl /home/tools/bin/configureHomer.pl -install mouse + +# Install MACS3 from github +WORKDIR /home/tools/ +RUN git clone https://github.com/macs3-project/MACS.git --recurse-submodules && \ + cd /home/tools/MACS && \ + python3 setup.py install + # Install picard WORKDIR /home/tools/bin -RUN wget https://github.com/broadinstitute/picard/releases/download/2.18.0/picard.jar && \ +RUN wget https://github.com/broadinstitute/picard/releases/download/2.25.2/picard.jar && \ chmod +x picard.jar +# Install seqOutBias +WORKDIR /home/tools/ +RUN wget -O seqOutBias-v1.3.0.tar.gz 'https://github.com/guertinlab/seqOutBias/archive/refs/tags/v1.3.0.tar.gz' && \ + tar xf seqOutBias-v1.3.0.tar.gz && \ + cd seqOutBias-1.3.0 && \ + cargo build --release && \ + ln -s /home/tools/seqOutBias-1.3.0/target/release/seqOutBias /usr/bin/ + +# Install Trimmomatic +WORKDIR /home/src/ +RUN wget https://github.com/usadellab/Trimmomatic/files/5854859/Trimmomatic-0.39.zip && \ + unzip Trimmomatic-0.39.zip && \ + chmod +x Trimmomatic-0.39/trimmomatic-0.39.jar + # Install F-seq WORKDIR /home/src/ -RUN wget https://github.com/aboyle/F-seq/archive/master.zip && \ - unzip master.zip && \ - cd /home/src/F-seq-master && \ +RUN git clone https://github.com/aboyle/F-seq.git +ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ \ + PATH=$PATH:$JAVA_HOME/bin +RUN cd /home/src/F-seq && \ ant && \ cd dist~/ && \ tar xf fseq.tgz && \ - ln -s /home/src/F-seq-master/dist~/fseq/bin/fseq /usr/bin/ - -# Install Trimmomatic -WORKDIR /home/src/ -RUN wget http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-0.36.zip && \ - unzip Trimmomatic-0.36.zip && \ - chmod +x Trimmomatic-0.36/trimmomatic-0.36.jar + ln -s /home/src/F-seq/dist~/fseq/bin/fseq /usr/bin/ # Set environment variables -ENV PATH=/home/tools/bin:/home/tools/:/home/tools/bin/kentUtils/:/home/src/F-seq-master/dist~/fseq/bin:/home/src/bowtie2-2.3.4.1:/home/src/skewer:/home/src/samtools-1.7:/home/src/Trimmomatic-0.36/:/home/src/htslib-1.7:$PATH \ - TRIMMOMATIC=/home/src/Trimmomatic-0.36/trimmomatic-0.36.jar \ +ENV PATH=/home/tools/bin:/home/tools/:/home/tools/bin/kentUtils/:/home/src/bowtie2-2.4.2:/home/src/skewer:/home/src/samtools-1.12:/home/src/Trimmomatic-0.39/:/home/src/htslib-1.12:$PATH \ + TRIMMOMATIC=/home/src/Trimmomatic-0.39/trimmomatic-0.39.jar \ PICARD=/home/tools/bin/picard.jar \ + HMMRATAC=/home/tools/bin/HMMRATAC_V1.2.10_exe.jar \ R_LIBS_USER=/usr/local/lib/R/site-library/ \ - PYTHONPATH=/usr/local/lib/python2.7/dist-packages:$PYTHONPATH + PYTHONPATH=/usr/local/lib/python3.8/dist-packages:$PYTHONPATH + +# Create python alias +RUN echo 'alias python="/usr/bin/python3"' >> /root/.bashrc && \ + /bin/bash -c "source /root/.bashrc" + +# Make python3 default +RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 10 # Define default command WORKDIR /home/ diff --git a/docs/changelog.md b/docs/changelog.md index 4f418b03..c9e4e109 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,17 @@ All notable changes to this project will be documented in this file. +## [0.9.16] -- 2021-05-18 + +### Changed + - Updated consensus peak documentation + - Reintroduced single container images and corresponding documentation + - Added minimum score and minimum overlap parameters to consensus peak generation + - Restore use of original F-seq as optional peak caller + - Updated peak calling using F-seq, HOMER, and HMMRATAC + - Updated PEPATACr package to rely less on PEPs within functions + - Move stats summary generation to R and outside of looper table + ## [0.9.15] -- 2021-03-03 ### Added diff --git a/docs/consensus_peaks.md b/docs/consensus_peaks.md index c792c21d..f6ed4b1c 100644 --- a/docs/consensus_peaks.md +++ b/docs/consensus_peaks.md @@ -19,15 +19,30 @@ From the `PEPATAC` repository after you've run the test sample: R library(PEPATACr) pep <- "examples/test_project/test_config.yaml" + # Load the project metadata prj <- pepr::Project(pep) -project_name <- config(prj)$name -output_dir <- "pepatac_test/" -results_dir <- "pepatac_test/results_pipeline/" -# Identify which chrom_sizes file to use -assets <- createAssetsSummary(prj, output_dir, results_dir) +project_name <- pepr::config(prj)$name +project_samples <- pepr::sampleTable(prj)$sample_name +sample_table <- data.table::data.table( + sample_name=pepr::sampleTable(prj)$sample_name, + genome=pepr::sampleTable(prj)$genome) + +# Specify file locations +output_dir <- "pepatac_test/" +results_dir <- file.path(output_dir, "results_pipeline") +summary_dir <- file.path(output_dir, "summary") +# Produce output directory (if needed) +dir.create(summary_dir, showWarnings = FALSE) + +# Identify which chrom_sizes file was used from the project assets +assets <- createAssetsSummary(project_samples, results_dir) + # Generate consensus peaks and write to project output directory -peak_filepath <- consensusPeaks(prj, output_dir, results_dir, assets) +min_samples <- 2 +min_score <- 5 +peak_filepath <- consensusPeaks(sample_table, summary_dir, results_dir, assets, + min_samples, min_score) # Load the peak file into R -peaks <- fread(peak_filepath[[1]]) +peaks <- data.table::fread(peak_filepath[[1]]) ``` diff --git a/docs/install.md b/docs/install.md index 52249606..7c1456b3 100644 --- a/docs/install.md +++ b/docs/install.md @@ -8,7 +8,7 @@ git clone https://github.com/databio/pepatac.git ## 2: Install required software -You have two options for software prerequisites: 1) use containers, or 2) install all prerequisites natively. If you want to use containers, you need the [multi-container environment manager, `bulker`](https://bulker.databio.org/en/latest/), and either `docker` or `singularity` -- please see instructions in [how to run PEPATAC with containers](run-container.md). Otherwise, follow these instructions to install the requirements natively: +You have a few options for software prerequisites: 1) use containers, 2) install via `conda`, or 3) install all prerequisites natively. If you want to use containers, you can use either the [multi-container environment manager, `bulker`](https://bulker.databio.org/en/latest/) with `docker` or `singularity`, or just use either `docker` or `singularity` -- see instructions in [how to run PEPATAC with containers](run-container.md). Otherwise, follow these instructions to install the requirements with `conda`: ### Tools diff --git a/docs/run-container.md b/docs/run-container.md index 1b6bc903..61786224 100644 --- a/docs/run-container.md +++ b/docs/run-container.md @@ -2,7 +2,17 @@ Whether you are using `docker` or `singularity`, we have a solution to run the pipeline using containers that dramatically reduces the installation burden. -In addition to cloning the `PEPATAC` repository, this requires the installation and configuration of a single python package, our [multi-container environment manager `bulker`](https://bulker.databio.org/en/latest/). +In addition to cloning the `PEPATAC` repository, this requires the installation and configuration of a single python package, our [multi-container environment manager `bulker`](https://bulker.databio.org/en/latest/). We support using `bulker` for a few reasons: + +1. It simplifies container use by wrapping the complexities of `docker` or +`singularity` calls so that you can use a containerized program without even +realizing you're using a container. You can call a program at the command line +the same as your would *without* using bulker. +2. Similar to a dockerfile, you can distribute sets of tools *but* as a separate set of containers, not a single, unwieldy, and monolithic container. +3. Since `bulker` commands behave like native commands, a workflow becomes automatically containerized with bulker. +4. Finally, this makes bulker environments very portable, since the only requirement for native-like command use is `docker` or `singularity`. + +Yet, if you would still prefer using a single container, we do still provide a [PEPATAC dockerfile](https://github.com/databio/pepatac/blob/master/containers/pepatac.Dockerfile) and support for [running the pipeline in this manner](run-container.md#running-pepatac-without-bulker). ## Running `PEPATAC` using `bulker` @@ -14,14 +24,14 @@ Check out [the `bulker` setup guide to install bulker](https://bulker.databio.or We've already produced a `bulker` crate for `PEPATAC` that requires all software needed to run the pipeline. We can load this crate directly from the [`bulker registry`](http://hub.bulker.io/): ``` -bulker load databio/pepatac:1.0.4 -r +bulker load databio/pepatac:1.0.6 -r ``` ### 3. Activate the `PEPATAC` crate Now that we've loaded the `PEPATAC` crate, we need to activate that specific crate so its included tools are available. ``` -bulker activate databio/pepatac:1.0.4 +bulker activate databio/pepatac:1.0.6 ``` Now, you can run any of the commands in the crate as if they were natively installed, **but they're actually running in containers**! @@ -47,4 +57,122 @@ Since `bulker` automatically direct any calls to required software to instead be looper run examples/test_project/test_config.yaml ``` +## Running `PEPATAC` without `bulker` + +You can run `PEPATAC` as an individual pipeline on a single sample using these containers by directly calling `docker run` or `singularity exec`. Or, you can rely on `looper`, which is already set up to run any pipeline in existing containers using the `divvy` templating system. Instructions for both follow: + +First, make sure your environment is set up to run either docker or singularity containers. Then, pull the container image: + +**Docker**: You can pull the docker [databio/pepatac image](https://hub.docker.com/r/databio/pepatac/) from dockerhub like this: + +``` +docker pull databio/pepatac +``` + +Or build the image using the included Dockerfile (you can use a recipe in the included Makefile): +``` +cd pepatac/ +make docker +``` + +**Singularity**: You can [download the singularity image](http://big.databio.org/simages/pepatac) or build it from the docker image using the Makefile: +``` +cd pepatac/ +make singularity +``` + +Now you'll need to tell the pipeline where you saved the singularity image. You can either create an environment variable called `$SIMAGES` that points to the folder where your image is stored, or you can tweak the `pipeline_interface.yaml` file so that the `compute.singularity_image` attribute is pointing to the right location on disk. + +If your containers are set up correctly, then won't need to install any additional software. + +## Running individual samples in a container + +Individual jobs can be run in a container by simply running the `pepatac.py` command through `docker run` or `singularity exec`. You can run containers either on your local computer, or in an HPC environment, as long as you have `docker` or `singularity` installed. You will need to include any volumes that contain data required by the pipeline. For example, to utilize `refgenie` assets you'll need to ensure the volume containing those files is available. In the following example, we are including an environment variable (`$GENOMES`) which points to such a directory. + +For example, run it locally in singularity like this: +``` +singularity exec --bind $GENOMES $SIMAGES/pepatac pipelines/pepatac.py --help +``` + +With `docker`, you can use: +``` +docker run --rm -it databio/pepatac pipelines/pepatac.py --help +``` +Be sure to mount the volumes you need with `--volume`. If you're utilizing any environment variables (e.g. `$GENOMES`), don't forget to include those in your docker command with the `-e` option. + +### Container details + +#### Using `docker` +The pipeline has been successfully run in both a Linux and MacOS environment. With `docker` you need to bind mount your volume that contains the pipeline and your `refgenie` assets locations, as well as provide the container the same environment variables your host environment is using. + +In the first example, we're mounting our home user directory (`/home/jps3ag/`) which contains the parent directories to our `refgenie` assets (`$GENOMES`) and to the pipeline itself. We'll also provide the pipeline two environment variables, `$GENOMES` and `$HOME`. + +Here's that example command in a Linux environment to run the test example through the pipeline: +``` +docker run --rm -it --volume /home/jps3ag/:/home/jps3ag/ \ + -e GENOMES='/home/jps3ag/genomes/' \ + -e HOME='/home/jps3ag/' \ + databio/pepatac \ + /home/jps3ag/src/pepatac/pipelines/pepatac.py --single-or-paired paired \ + --prealignments rCRSd human_repeats \ + --genome hg38 \ + --sample-name test1 \ + --input /home/jps3ag/src/pepatac/examples/data/test1_r1.fastq.gz \ + --input2 /home/jps3ag/src/pepatac/examples/data/test1_r2.fastq.gz \ + --genome-size hs \ + -O $HOME/pepatac_test +``` + +In this second example, we'll perform the same command in a Mac environment using [Docker for Mac](https://docs.docker.com/v17.12/docker-for-mac/install/). + +This necessitates a few minor changes to run that same example: + +- replace `/home/` with `/Users/` format +- e.g. `--volume /Users/jps3ag/:/Users/jps3ag/` + +Remember to [allocate sufficient memory](https://docs.docker.com/docker-for-mac/#advanced) (6-8GB should generally be adequate) in Docker for Mac. + +``` +docker run --rm -it --volume /Users/jps3ag/:/Users/jps3ag/ \ + -e GENOMES="/Users/jps3ag/genomes" \ + -e HOME="/Users/jps3ag/" \ + databio/pepatac \ + /Users/jps3ag/src/pepatac/pipelines/pepatac.py --single-or-paired paired \ + --prealignments rCRSd human_repeats \ + --genome hg38 \ + --sample-name test1 \ + --input /Users/jps3ag/src/pepatac/examples/data/test1_r1.fastq.gz \ + --input2 /Users/jps3ag/src/pepatac/examples/data/test1_r2.fastq.gz \ + --genome-size hs \ + -O $HOME/pepatac_test +``` + +#### Using `singularity` + +First, build a singularity container from the docker image and create a running instance (be sure to mount your directories containing your `$GENOMES` folder and pipeline. +``` +singularity build pepatac docker://databio/pepatac:latest +singularity instance start -B /home/jps3ag/:/home/jps3aq/ pepatac pepatac_instance +``` + +Second, run your command. +``` +singularity exec instance://pepatac_instance \ + /home/jps3ag/src/pepatac/pipelines/pepatac.py --single-or-paired single \ + --prealignments rCRSd human_repeats \ + --genome hg38 \ + --sample-name test1 \ + --input /home/jps3ag/src/pepatac/examples/data/test1_r1.fastq.gz \ + --input2 /home/jps3ag/src/pepatac/examples/data/test1_r2.fastq.gz \ + --genome-size hs \ + -O $HOME/pepatac_test +``` + +Third, close your instance when finished. +``` +singularity instance stop pepatac_instance +``` + +## Running multiple samples in a container with looper +To run multiple samples in a container, you simply need to configure `looper` to use a container-compatible template. The looper documentation has instructions for [running jobs in containers](http://looper.databio.org/en/latest/containers/). \ No newline at end of file diff --git a/docs/usage.md b/docs/usage.md index 8ade7a46..e0818e3e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -12,7 +12,7 @@ usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V] INPUT_FILES [INPUT_FILES ...] [-I2 [INPUT_FILES2 [INPUT_FILES2 ...]]] -G GENOME_ASSEMBLY [-Q SINGLE_OR_PAIRED] [--aligner {bowtie2,bwa}] - [--peak-caller {fseq,genrich,hmmratac,homer,macs2}] + [--peak-caller {fseq,fseq2,genrich,hmmratac,homer,macs2}] [-gs GENOME_SIZE] [--trimmer {trimmomatic,pyadapt,skewer}] [--prealignments PREALIGNMENTS [PREALIGNMENTS ...]] [--deduplicator {picard,samblaster,samtools}] @@ -22,7 +22,7 @@ usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V] [--motif] [--sob] [--no-scale] [--prioritize] [--keep] [--noFIFO] [--lite] [--skipqc] [-V] -PEPATAC version 0.9.15 +PEPATAC version 0.9.16 optional arguments: -h, --help show this help message and exit @@ -49,7 +49,7 @@ optional arguments: Single- or paired-end sequencing protocol --aligner {bowtie2,bwa} Name of read aligner - --peak-caller {fseq,genrich,hmmratac,homer,macs2} + --peak-caller {fseq,fseq2,genrich,hmmratac,homer,macs2} Name of peak caller -gs GENOME_SIZE, --genome-size GENOME_SIZE Effective genome size. It can be 1.0e+9 or 1000000000: diff --git a/examples/gold_atac/README.md b/examples/gold_atac/README.md index a66f8156..f3852fcc 100644 --- a/examples/gold_atac/README.md +++ b/examples/gold_atac/README.md @@ -15,9 +15,34 @@ This does require having the [`NCBI SRA Toolkit`](https://trace.ncbi.nlm.nih.gov geofetch -i GSE94182 ``` -## Run pipeline +## Run the pipeline + +To make our software and code more portable, we create environment variables to point to software parent directories, data directories, and output directories. The [`gold_config.yaml`](https://github.com/databio/pepatac/tree/master/examples/gold_atac/metadata/gold_config.yaml) configuration file expects a few of these to already be configured. You could also overwrite these in the `gold_config.yaml` itself to be full paths should you prefer. + +Pipeline output parent directory: `export PROCESSED="/your/project/output/parent/dir" +Software and code parent directory: `export CODE="/your/path/to/code"` +Sample data directory: `export SRAFQ=/your/path/to/fastq/files"` + +### Run with default settings + +``` +looper run ${CODE}pepatac/examples/gold_atac/metadata/gold_config.yaml +``` + +### Run with alternative settings + +The configuration file contains numerous [amendments](http://peppy.databio.org/en/latest/feature5_amend/) to illustrate how you could run the pipeline using a different peak caller, deduplication tool, or alignment tool to name a few. + +For example, say you want to align the gold samples using `bwa` instead of `bowtie2`. First, you will of course need to have `bwa` installed or available with bulker, and second, you'll need to have [`refgenie` assets for `bwa`](http://refgenie.databio.org/en/latest/available_assets/#bwa_index). + +``` +looper run ${CODE}pepatac/examples/gold_atac/metadata/gold_config.yaml --amend bwa +``` + +Or, say you want to use `picard MarkDuplicates` instead of `samblaster` for deduplication: ``` -looper run ${CODE}pepatac/examples/gold_atac/metadata/project_config.yaml +looper run ${CODE}pepatac/examples/gold_atac/metadata/gold_config.yaml --amend picard ``` +All of the `project_modifiers:` subheaders in the `gold_config.yaml` file illustrate how you could, using the same base configuration file, call the pipeline on the gold samples with different settings. \ No newline at end of file diff --git a/examples/gold_atac/metadata/gold_atac_annotation.csv b/examples/gold_atac/metadata/gold_sample_table.csv similarity index 100% rename from examples/gold_atac/metadata/gold_atac_annotation.csv rename to examples/gold_atac/metadata/gold_sample_table.csv diff --git a/mkdocs.yml b/mkdocs.yml index b71a744c..6e3678b2 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,11 +1,12 @@ theme: databio -site_name: peppro +site_name: PEPATAC site_author: Jason Smith -site_url: http://peppro.databio.org/en/latest/ -site_logo: img/peppro_logo_gray.svg -repo_url: https://github.com/databio/peppro/ -#google_analytics: ['UA-127092878-1', 'code.databio.org/peppro'] +site_url: http://pepatac.databio.org/en/latest/ +site_logo: img/pepatac_logo_white.svg +repo_url: https://github.com/databio/pepatac/ +google_analytics: ['UA-127092878-1', 'pepatac.databio.org'] +paper_link: https://doi.org/10.1101/2020.10.21.347054 markdown_extensions: - fontawesome_markdown # pip install --user fontawesome-markdown diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 391316b5..7f459c75 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -5,7 +5,7 @@ __author__ = ["Jin Xu", "Nathan Sheffield", "Jason Smith"] __email__ = "jasonsmith@virginia.edu" -__version__ = "0.9.15" +__version__ = "0.9.16" from argparse import ArgumentParser @@ -24,7 +24,7 @@ TOOLS_FOLDER = "tools" ANNO_FOLDER = "anno" ALIGNERS = ["bowtie2", "bwa"] -PEAK_CALLERS = ["fseq", "genrich", "hmmratac", "homer", "macs2"] +PEAK_CALLERS = ["fseq", "fseq2", "genrich", "hmmratac", "homer", "macs2"] PEAK_TYPES = [ "fixed", "variable"] DEDUPLICATORS = ["picard", "samblaster", "samtools"] TRIMMERS = ["trimmomatic", "pyadapt", "skewer"] @@ -412,6 +412,22 @@ def tool_path(tool_name): TOOLS_FOLDER, tool_name) +def rescale(n, after=[0,1], before=[]): + """ + Helper function to rescale a vector between specified range of values + + :param numpy array n: a vector of numbers to be rescale + :param list after: range of values to which to scale n + :param list before: range of values in which n is contained + """ + if not before: + before=[min(n), max(n)] + if (before[1] - before[0]) == 0: + return n + return (((after[1] - after[0]) * (n - before[0]) / + (before[1] - before[0])) + after[0]) + + def check_commands(commands, ignore=''): """ Check if command(s) can be called @@ -567,7 +583,7 @@ def main(): ############################################################################ # Confirm required tools are all callable # ############################################################################ - opt_tools = ["fseq", "Genrich", "${HMMRATAC}", "${PICARD}", + opt_tools = ["fseq", "fseq2", "Genrich", "${HMMRATAC}", "${PICARD}", "${TRIMMOMATIC}", "pyadapt", "findMotifsGenome.pl", "findPeaks", "seqOutBias", "bigWigMerge", "bedGraphToBigWig", "pigz", "bwa"] @@ -606,6 +622,9 @@ def main(): if args.peak_caller == "fseq": if 'fseq' in opt_tools: opt_tools.remove('fseq') + if args.peak_caller == "fseq2": + if 'fseq2' in opt_tools: opt_tools.remove('fseq2') + if args.peak_caller == "genrich": if 'Genrich' in opt_tools: opt_tools.remove('Genrich') @@ -777,18 +796,19 @@ def main(): cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq( local_input_files, args.sample_name, args.paired_end, fastq_folder, zipmode=True) - #print(cmd) # DEBUG + + #print(f"{cmd}") # DEBUG # flatten nested list if any(isinstance(i, list) for i in unaligned_fastq): unaligned_fastq = [i for e in unaligned_fastq for i in e] # maintain order and remove duplicate entries - unaligned_fastq = list(dict.fromkeys(unaligned_fastq)) + if any(isinstance(i, dict) for i in local_input_files): + unaligned_fastq = list(dict.fromkeys(unaligned_fastq)) pm.run(cmd, unaligned_fastq, follow=ngstk.check_fastq( local_input_files, unaligned_fastq, args.paired_end)) - pm.clean_add(out_fastq_pre + "*.fastq", conditional=True) if args.paired_end: @@ -1064,14 +1084,14 @@ def no_handle(fq): if args.aligner.lower() == "bwa": if not param.bwa.params: - bwa_options = " -M" + bwa_options = " -M " else: bwa_options = param.bwa.params else: if not param.bowtie2.params: - bt2_options = " --very-sensitive" + bt2_options = " --very-sensitive " if args.paired_end: - bt2_options += " -X 2000" + bt2_options += " -X 2000 " else: bt2_options = param.bowtie2.params @@ -1972,6 +1992,30 @@ def report_peak_count(): pm.stop_pipeline() else: if args.peak_caller == "fseq": + if args.peak_type == "fixed": + err_msg = "Must use MACS2 when calling fixed width peaks." + pm.fail_pipeline(RuntimeError(err_msg)) + else: + fseq_cmd_chunks = [ + tools.fseq, + ("-o", peak_folder), + param.fseq.params + ] + # Create the peak calling command + fseq_cmd_chunks.append(peak_input_file) + fseq_cmd = build_command(fseq_cmd_chunks) + + # Create the file merge/delete commands. + chrom_peak_files = os.path.join(peak_folder, "*.npf") + merge_chrom_peaks_files = ( + "cat {peakfiles} > {combined_peak_file}" + .format(peakfiles=chrom_peak_files, + combined_peak_file=peak_output_file)) + pm.clean_add(chrom_peak_files) + + # Pypiper serially executes the commands. + cmd = [fseq_cmd, merge_chrom_peaks_files] + elif args.peak_caller == "fseq2": if args.peak_type == "fixed": err_msg = "Must use MACS2 when calling fixed width peaks." pm.fail_pipeline(RuntimeError(err_msg)) @@ -1980,7 +2024,7 @@ def report_peak_count(): tools.fseq, "callpeak", peak_input_file, - param.fseq.params, + param.fseq2.params, ("-name", args.sample_name), ("-cpus", pm.cores) ] @@ -1989,24 +2033,14 @@ def report_peak_count(): fseq_cmd_chunks.append("-pe_fragment_size_range auto") # Create the peak calling command fseq_cmd = build_command(fseq_cmd_chunks) - - # Create the file merge/delete commands. # F-seq1 - # chrom_peak_files = os.path.join(peak_folder, "*.npf") - # merge_chrom_peaks_files = ( - # "cat {peakfiles} > {combined_peak_file}" - # .format(peakfiles=chrom_peak_files, - # combined_peak_file=peak_output_file)) - # pm.clean_add(chrom_peak_files) - - # Pypiper serially executes the commands. - #cmd = [fseq_cmd, merge_chrom_peaks_files] # F-seq1 cmd = fseq_cmd - # TODO: move fixed plus not macs check early on before pipeline starts! elif args.peak_caller == "hmmratac" and args.paired_end: if args.peak_type == "fixed": err_msg = "Must use MACS2 when calling fixed width peaks." pm.fail_pipeline(RuntimeError(err_msg)) else: + gapped_peak_file = os.path.join(peak_folder, args.sample_name + + "_peaks.gappedPeak") fixed_header_bam = os.path.join(map_genome_folder, args.sample_name + "_fixed_header.bam") fixed_header_index = os.path.join(map_genome_folder, @@ -2025,9 +2059,16 @@ def report_peak_count(): cmd3 += " --genome " + chr_order if os.path.exists(res.blacklist): cmd3 += " --blacklist " + res.blacklist - cmd3 += param.hmmratac.params - cmd3 += " --output " + peak_output_file - cmd = cmd3 + cmd3 += " " + param.hmmratac.params + cmd3 += " --output " + cmd3 += os.path.join(peak_folder, args.sample_name) + # Drop HighCoveragePeaks [$13(e.g. the score)>1] + # Use the score as the qValue for compatibility downstream + cmd4 = ("awk -v OFS='\t' '$13>1 {print $1, $2, $3, $4, " + + "$13, $5, \".\", \".\", $13, \".\"}' " + + gapped_peak_file + " | sort -k1,1n -k2,2n > " + + peak_output_file) + cmd = [cmd3, cmd4] elif args.peak_caller == "hmmratac" and not args.paired_end: pm.info("HMMRATAC requires paired-end data. Defaulting to MACS2") cmd_base = [ @@ -2050,6 +2091,7 @@ def report_peak_count(): cmd2 = tools.genrich + " -j -t " + name_sort_rmdup if os.path.exists(res.blacklist): cmd2 += " -E " + res.blacklist + cmd2 += param.genrich.params cmd2 += " -o " + peak_output_file pm.clean_add(name_sort_rmdup) cmd = [cmd1, cmd2] @@ -2063,11 +2105,16 @@ def report_peak_count(): "_homer_peaks.tsv") cmd1 = ('makeTagDirectory ' + tag_directory + " " + rmdup_bam) cmd2 = (tools.homer_findpeaks + " " + tag_directory + " -o " + - homer_peaks + " -gsize " + args.genome_size + " ") + homer_peaks + " -gsize " + str(genome_size) + " ") cmd2 += param.homer_findpeaks.params - cmd3 = ('pos2bed.pl ' + homer_peaks + " | " + tools.bedtools + - " sort | " + tools.bedtools + " merge > " + + cmd3 = ("awk 'BEGIN{OFS=\"\t\";} /^[^#]/ " + + "{ print $2,$3,$4,$1,$8,$5 }' " + homer_peaks + + " | sort -k1,1 -k2,2n | " + tools.bedtools + " merge " + + "-c 4,5,6 -o distinct,sum,distinct > " + peak_output_file) + # cmd3 = ('pos2bed.pl ' + homer_peaks + " | " + tools.bedtools + + # " sort | " + tools.bedtools + " merge > " + + # peak_output_file) pm.clean_add(homer_peaks) pm.clean_add(tag_directory) cmd = [cmd1, cmd2, cmd3] @@ -2086,7 +2133,7 @@ def report_peak_count(): cmd = build_command(cmd_base) # Call peaks and report peak count. - # nofail true conditional on hmmratac which fails on small samples + # nofail true conditional on hmmratac/fseq2 which fails on small samples # TODO: there are downstream steps that require a peak file! # maybe it should just fail? if args.peak_caller == "hmmratac": @@ -2096,10 +2143,42 @@ def report_peak_count(): num_peaks = max(0, line_count - 1) pm.report_result("Peak_count", num_peaks) else: - # TODO: could just touch an empty file? Homer creates an empty file... + # just touch an empty file? Homer creates an empty file... cmd = "touch " + peak_output_file pm.run(cmd, peak_output_file) pm.warning("HMMRATAC failed to identify any peaks.") + elif args.peak_caller == "fseq" or args.peak_caller == "fseq2": + pm.run(cmd, peak_output_file, nofail=True) + if (os.path.exists(peak_output_file) and + os.stat(peak_output_file).st_size > 0): + df = pd.read_csv(peak_output_file, sep='\t', header=None, + names=("chr", "start", "end", "name", "score", + "strand", "signalValue", "pValue", + "qValue", "peak")) + nineNine = df['signalValue'].quantile(q=0.99) + df.loc[df['signalValue'] > nineNine, 'signalValue'] = nineNine + + # rescale score to be between 100 and 1000. + # See https://fureylab.web.unc.edu/software/fseq/ + df['score'] = rescale(np.log(df['signalValue']), [100, 1000]) + + # ensure score is a whole integer value + df['score'] = pd.to_numeric(df['score'].round(), + downcast='integer') + df.to_csv(peak_output_file, sep='\t', + header=False, index=False) + + line_count = int(ngstk.count_lines(peak_output_file).strip()) + num_peaks = max(0, line_count - 1) + pm.report_result("Peak_count", num_peaks) + else: + # just touch an empty file? Homer creates an empty file... + cmd = "touch " + peak_output_file + pm.run(cmd, peak_output_file) + if args.peak_caller == "fseq": + pm.warning("FSeq failed to identify any peaks.") + else: + pm.warning("FSeq2 failed to identify any peaks.") else: pm.run(cmd, peak_output_file, follow=report_peak_count) @@ -2324,13 +2403,6 @@ def report_peak_count(): nineNine = df['V5'].quantile(q=0.99) df.loc[df['V5'] > nineNine, 'V5'] = nineNine - def rescale(n, after=[0,1], before=[]): - if not before: - before=[min(n), max(n)] - if (before[1] - before[0]) == 0: - return n - return (((after[1] - after[0]) * (n - before[0]) / - (before[1] - before[0])) + after[0]) # rescale score to be between 0 and 1000 df['V5'] = rescale(np.log(df['V5']), [0, 1000]) diff --git a/pipelines/pepatac.yaml b/pipelines/pepatac.yaml index 7f395882..8c6975e3 100644 --- a/pipelines/pepatac.yaml +++ b/pipelines/pepatac.yaml @@ -20,7 +20,8 @@ tools: # absolute paths to required tools bedToBigBed: bedToBigBed # optional tools bwa: bwa - fseq: fseq2 + fseq: fseq + fseq2: fseq2 genrich: Genrich hmmratac: ${HMMRATAC} homer_findpeaks: findPeaks @@ -74,6 +75,12 @@ parameters: # parameters passed to bioinformatic tools, subclassed by tool # --shift: Assign an arbitrary shift in bp. See MACS documentation. # --nomodel: Will bybass building the shifting model. fseq: + params: '-of npf -l 600 -t 4.0 -s 1' + # -of: narrowPeak as output format. + # -l: feature length. + # -t: "threshold" (standard deviations). + # -s: wiggle track step. + fseq2: params: '-f 0 -l 600 -t 4.0' # -f: fragment size of treatment data # -l: feature length. @@ -82,10 +89,10 @@ parameters: # parameters passed to bioinformatic tools, subclassed by tool params: '' # -j: ATAC-seq mode on by default hmmratac: - params: '' + params: '--fragmem True --upper 10 --lower 5 --peaks True --window 500000' # --blacklist: Pipeline includes blacklist by default if the refgenie asset exists. homer_findpeaks: - params: 'minDist 150 -region' + params: '-minDist 150 -region' # Use region mode with minimum distance of 150 homer_motif: params: '-size given -mask' diff --git a/pipelines/pepatac_collator.py b/pipelines/pepatac_collator.py index 29f7a70b..2c085f6d 100755 --- a/pipelines/pepatac_collator.py +++ b/pipelines/pepatac_collator.py @@ -5,7 +5,7 @@ __author__ = ["Jason Smith", "Michal Stolarczyk"] __email__ = "jasonsmith@virginia.edu" -__version__ = "0.0.3" +__version__ = "0.0.4" from argparse import ArgumentParser import os @@ -54,6 +54,12 @@ def parse_arguments(): parser.add_argument("-m", "--cutoff", default=2, help="Only keep peaks present in at least this " + "number of samples.") + parser.add_argument("-s", "--min-score", default=5, + help="A minimum peak score to keep an " + + "individual peak.") + parser.add_argument("-l", "--min-olap", default=1, + help="A minimum number of overlapping bases to " + + "defined peaks as overlapping.") args = parser.parse_args() return args @@ -65,12 +71,9 @@ def main(): pm = pypiper.PipelineManager(name="PEPATAC_collator", outfolder=outfolder, args=args, version=__version__) - cmd = "Rscript {R_file} {cfg_file} {out_dir} {results_dir} {cutoff}".format( - R_file=tool_path("PEPATAC_summarizer.R"), - cfg_file=args.config_file, - out_dir=args.output_parent, - results_dir=args.results, - cutoff=args.cutoff) + cmd = (f"Rscript {tool_path('PEPATAC_summarizer.R')} " + f"{args.config_file} {args.output_parent} " + f"{args.results} {args.cutoff} {args.min_score} {args.min_olap}") if args.new_start: cmd += " --new-start" if args.skip_consensus: diff --git a/project_pipeline_interface.yaml b/project_pipeline_interface.yaml index 56741bb2..25b3ea71 100644 --- a/project_pipeline_interface.yaml +++ b/project_pipeline_interface.yaml @@ -3,7 +3,6 @@ pipeline_type: project path: pipelines/pepatac_collator.py output_schema: pepatac_output_schema.yaml command_template: > - looper table {looper.pep_config} && {pipeline.path} --config {looper.pep_config} -O {looper.output_dir} @@ -14,7 +13,7 @@ command_template: > compute: singularity_image: ${SIMAGES}pepatac docker_image: databio/pepatac - bulker_crate: databio/pepatac + bulker_crate: databio/pepatac:1.0.7 size_dependent_variables: resources-project.tsv bioconductor: diff --git a/requirements-conda.yml b/requirements-conda.yml index aea051f5..003bd155 100644 --- a/requirements-conda.yml +++ b/requirements-conda.yml @@ -20,7 +20,8 @@ dependencies: - bowtie2>=2.4.2 - bwa>=0.7.17 - fastqc>=0.11.9 - - genometools-genometools >=1.6.1 + - fseq + - genometools-genometools>=1.6.1 - genrich>=0.6 - hmmratac>=1.2.10 - homer>=4.11 @@ -66,5 +67,6 @@ dependencies: - pyyaml>=3.13 - refgenconf>=0.7.0 - refgenie>=0.9.3 + - typing_extensions>=3.7.4.1 - ubiquerg>=0.6.1 - yacman>=0.6.7 diff --git a/requirements.txt b/requirements.txt index 258d9995..e77370da 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ Cython>=0.29 cykhash>=1.0.2 divvy>=0.5.0 eido>=0.1.3 -fseq2>=2.0.2 +#fseq2>=2.0.2 hypothesis==4.38.0 jinja2 jsonschema>=3.0.1 diff --git a/resources-sample.tsv b/resources-sample.tsv index e8dc5d41..084c7b0a 100644 --- a/resources-sample.tsv +++ b/resources-sample.tsv @@ -1,6 +1,6 @@ -max_file_size cores mem time -0.05 4 8000 00-06:00:00 -0.5 8 12000 00-09:00:00 -1 16 16000 00-12:00:00 -10 32 24000 01-00:00:00 -NaN 32 32000 02-00:00:00 +max_file_size cores mem time +0.05 4 10000 00-03:00:00 +0.5 8 12000 00-08:00:00 +1 16 16000 00-12:00:00 +10 32 24000 01-00:00:00 +NaN 32 32000 02-00:00:00 diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index 55f2e46d..49998829 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -37,7 +37,7 @@ compute: singularity_image: ${SIMAGES}pepatac conda_env: pepatac docker_image: databio/pepatac - bulker_crate: databio/pepatac:1.0.6 + bulker_crate: databio/pepatac:1.0.7 size_dependent_variables: resources-sample.tsv bioconductor: diff --git a/tools/PEPATAC_summarizer.R b/tools/PEPATAC_summarizer.R index 41eb095c..aea77d75 100755 --- a/tools/PEPATAC_summarizer.R +++ b/tools/PEPATAC_summarizer.R @@ -11,7 +11,7 @@ # )) # # Created: 5/18/17 -# Last updated: 03/12/2020 +# Last updated: 05/10/2021 # # usage: Rscript /path/to/Rscript/PEPATAC_summarizer.R # /path/to/project_config.yaml @@ -65,6 +65,12 @@ p <- add_argument(p, arg="--normalized", short="-Z", flag=TRUE, p <- add_argument(p, arg="cutoff", short="-m", default=2, help=paste0("Only keep peaks present in at least this ", "number of samples.")) +p <- add_argument(p, arg="min-score", short="-s", default=5, + help=paste0("A minimum peak score to keep an", + " individual peak.")) +p <- add_argument(p, arg="min-olap", short="-l", default=1, + help=paste0("A minimum number of overlapping bases to", + " defined peaks as overlapping.")) # Parse the command line arguments argv <- parse_args(p) @@ -102,7 +108,10 @@ pep <- argv$config # Load the project prj <- invisible(suppressWarnings(pepr::Project(pep))) # Convenience -project_name <- config(prj)$name +project_name <- config(prj)$name +project_samples <- pepr::sampleTable(prj)$sample_name +sample_table <- data.table(sample_name=pepr::sampleTable(prj)$sample_name, + genome=pepr::sampleTable(prj)$genome) # Set the output directory summary_dir <- suppressMessages(file.path(argv$output, "summary")) @@ -121,11 +130,30 @@ if (dir.exists(argv$results)) { quit() } -# Get assets -assets <- PEPATACr::createAssetsSummary(prj, argv$output, results_subdir) + +# Generate stats summary +stats <- PEPATACr::createStatsSummary(project_samples, results_subdir) +if (nrow(stats) == 0) { + quit() +} +project_stats_file <- file.path(argv$output, + paste0(project_name, '_stats_summary.tsv')) +message(sprintf("Summary (n=%s): %s", + length(unique(stats$sample_name)), project_stats_file)) +fwrite(stats, project_stats_file, sep="\t", col.names=TRUE) + + +# Generate assets +assets <- PEPATACr::createAssetsSummary(project_samples, results_subdir) if (nrow(assets) == 0) { quit() } +project_assets_file <- file.path(argv$output, + paste0(project_name, '_assets_summary.tsv')) +message(sprintf("Summary (n=%s): %s", + length(unique(assets$sample_name)), project_assets_file)) +fwrite(assets, project_assets_file, sep="\t", col.names=FALSE) + # Produce project summary plots summarizer_flag <- PEPATACr::summarizer(prj, argv$output) @@ -133,6 +161,7 @@ if (is.null(summarizer_flag)) { summarizer_flag <- FALSE } + # Produce library complexity summary plots complexity_path <- file.path(summary_dir, paste0(project_name, "_libComplexity.pdf")) @@ -189,6 +218,7 @@ if (summarizer_flag && complexity_flag) { message("Successfully produced project summary plots.\n") } + # Report existing consensus peaks for (genome in unique(genomes)) { file_name <- paste0("_", genome,"_consensusPeaks.narrowPeak") @@ -199,11 +229,17 @@ for (genome in unique(genomes)) { } } + # Calculate consensus peaks if (!file.exists(consensus_path) || argv$new_start && !argv$skip_consensus) { #write(paste0("Creating consensus peak set..."), stdout()) - consensus_paths <- PEPATACr::consensusPeaks(prj, argv$output, - argv$results, assets) + consensus_paths <- PEPATACr::consensusPeaks(sample_table, + summary_dir, + argv$results, + assets, + argv$cutoff, + argv$min_score, + argv$min_olap) if (!length(consensus_paths) == 0) { for (consensus_file in consensus_paths) { if (file.exists(consensus_file)) { @@ -221,6 +257,7 @@ if (!file.exists(consensus_path) || argv$new_start && !argv$skip_consensus) { } } + # Report existing counts tables # TODO: move genome handling out of the called function? for (genome in unique(genomes)) { @@ -232,12 +269,20 @@ for (genome in unique(genomes)) { } } + # Create count matrix if (!file.exists(counts_path) || argv$new_start && !argv$skip_table) { - #write(paste0("Creating peak count table(s)..."), stdout()) - counts_paths <- PEPATACr::peakCounts(prj, argv$output, argv$results, assets, - argv$poverlap, argv$normalized, - argv$cutoff) + sample_table <- data.table::data.table( + sample_name=pepr::sampleTable(prj)$sample_name, + genome=pepr::sampleTable(prj)$genome) + counts_paths <- PEPATACr::peakCounts(sample_table, + summary_dir, + argv$results, + assets, + argv$poverlap, + argv$normalized, + argv$cutoff, + argv$min_olap) if (!length(counts_paths) == 0) { for (counts_table in counts_paths) { if (file.exists(counts_table)) { diff --git a/usage.txt b/usage.txt index 7b643167..980fc5ec 100644 --- a/usage.txt +++ b/usage.txt @@ -4,7 +4,7 @@ usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V] INPUT_FILES [INPUT_FILES ...] [-I2 [INPUT_FILES2 [INPUT_FILES2 ...]]] -G GENOME_ASSEMBLY [-Q SINGLE_OR_PAIRED] [--aligner {bowtie2,bwa}] - [--peak-caller {fseq,genrich,hmmratac,homer,macs2}] + [--peak-caller {fseq,fseq2,genrich,hmmratac,homer,macs2}] [-gs GENOME_SIZE] [--trimmer {trimmomatic,pyadapt,skewer}] [--prealignments PREALIGNMENTS [PREALIGNMENTS ...]] [--deduplicator {picard,samblaster,samtools}] @@ -14,7 +14,7 @@ usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V] [--motif] [--sob] [--no-scale] [--prioritize] [--keep] [--noFIFO] [--lite] [--skipqc] [-V] -PEPATAC version 0.9.15 +PEPATAC version 0.9.16 optional arguments: -h, --help show this help message and exit @@ -41,7 +41,7 @@ optional arguments: Single- or paired-end sequencing protocol --aligner {bowtie2,bwa} Name of read aligner - --peak-caller {fseq,genrich,hmmratac,homer,macs2} + --peak-caller {fseq,fseq2,genrich,hmmratac,homer,macs2} Name of peak caller -gs GENOME_SIZE, --genome-size GENOME_SIZE Effective genome size. It can be 1.0e+9 or 1000000000: