diff --git a/.gitignore b/.gitignore index 1f3396a1..1804b5b5 100644 --- a/.gitignore +++ b/.gitignore @@ -46,4 +46,5 @@ examples/test_project/test_fseq.yaml examples/test_project/test_genrich.yaml examples/test_project/test_hmmratac.yaml examples/test_project/test_homer.yaml -examples/test_project/test_macs.yaml \ No newline at end of file +examples/test_project/test_macs.yaml +.Rproj.user diff --git a/PEPATACr/DESCRIPTION b/PEPATACr/DESCRIPTION index cfa97fcd..8fcee6ef 100644 --- a/PEPATACr/DESCRIPTION +++ b/PEPATACr/DESCRIPTION @@ -1,11 +1,13 @@ Package: PEPATACr Title: Functions and libraries to analyze ATAC-seq data -Version: 0.0.1.0000 +Version: 0.0.2.0000 Authors@R: person("Jason", "Smith", email = "jasonsmith@virginia.edu", role = c("aut", "cre")) Maintainer: Jason Smith Description: Installs required libraries to calculate the fraction of reads in features, TSS enrichments, and fragment length distributions. -Depends: R (>= 3.5.1), data.table, pepr, gplots, grid, ggplot2, optigrab, scales, GenomicDistributions +BiocViews: +Depends: R (>= 4.0), GenomicRanges +Imports: data.table, pepr, gplots, grid, ggplot2, optigrab, scales, yaml, GenomicDistributions, GenomicDistributionsData License: BSD 2-Clause "Simplified" License Encoding: UTF-8 LazyData: true -RoxygenNote: 7.1.1 +RoxygenNote: 7.2.3 diff --git a/PEPATACr/NAMESPACE b/PEPATACr/NAMESPACE index 8582fa62..660437e1 100644 --- a/PEPATACr/NAMESPACE +++ b/PEPATACr/NAMESPACE @@ -23,3 +23,6 @@ export(sampleName) export(setPanelSize) export(splitDataTable) export(summarizer) +import(data.table) +import(ggplot2) +import(optigrab) diff --git a/PEPATACr/NAMESPACE.bak b/PEPATACr/NAMESPACE.bak new file mode 100644 index 00000000..8582fa62 --- /dev/null +++ b/PEPATACr/NAMESPACE.bak @@ -0,0 +1,25 @@ +# Generated by roxygen2: do not edit by hand + +export(consensusPeaks) +export(createAssetsSummary) +export(createStatsSummary) +export(fileIcon) +export(getPrealignments) +export(narrowPeakToBigBed) +export(peakCounts) +export(plotAlignedPct) +export(plotAlignedRaw) +export(plotAnno) +export(plotComplexityCurves) +export(plotFLD) +export(plotFRiF) +export(plotLibSizes) +export(plotTSS) +export(plotTSSscores) +export(readPepatacPeakCounts) +export(reducePeaks) +export(roundUpNice) +export(sampleName) +export(setPanelSize) +export(splitDataTable) +export(summarizer) diff --git a/PEPATACr/PEPATACr.Rproj b/PEPATACr/PEPATACr.Rproj index bfa3107e..f54a4b61 100644 --- a/PEPATACr/PEPATACr.Rproj +++ b/PEPATACr/PEPATACr.Rproj @@ -1,21 +1,21 @@ -Version: 1.0 - -RestoreWorkspace: No -SaveWorkspace: No -AlwaysSaveHistory: Default - -EnableCodeIndexing: Yes -UseSpacesForTab: Yes -NumSpacesForTab: 4 -Encoding: UTF-8 - -RnwWeave: Sweave -LaTeX: pdfLaTeX - -AutoAppendNewline: Yes -StripTrailingWhitespace: Yes - -BuildType: Package -PackageUseDevtools: Yes -PackageInstallArgs: --no-multiarch --with-keep.source -PackageRoxygenize: rd,collate,namespace +Version: 1.0 + +RestoreWorkspace: No +SaveWorkspace: No +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 4 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R old mode 100644 new mode 100755 index 5b9085d5..b3498489 --- a/PEPATACr/R/PEPATACr.R +++ b/PEPATACr/R/PEPATACr.R @@ -10,6 +10,9 @@ #' @author Jason Smith #' #' @references \url{http://github.com/databio/pepatac} +#' @import data.table +#' @import ggplot2 +#' @import optigrab NULL ################################################################################ @@ -20,8 +23,8 @@ NULL #' @examples #' theme_PEPATAC() theme_PEPATAC <- function(base_family = "sans", ...){ - theme_classic(base_family = base_family, base_size = 14, ...) + - theme( + ggplot2::theme_classic(base_family = base_family, base_size = 14, ...) + + ggplot2::theme( axis.line = element_line(size = 0.5), axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5), panel.grid.major = element_blank(), @@ -88,7 +91,7 @@ fileIcon <- function(x=seq(0:275), y=seq(0:275)) { y=c(157.5/275*maxY,157.5/275*maxY)), aes(x=x,y=y), color="gray", size=4, lineend="round") + theme_PEPATAC() + - theme(panel.border = element_blank(), + ggplot2::theme(panel.border = element_blank(), axis.line = element_blank(), axis.title.x=element_blank(), axis.text.x=element_blank(), @@ -191,12 +194,13 @@ plotComplexityCurves <- function(ccurves, } # Get the real counts if we have them - rcDT <- data.table(name = character(length(real_counts_path)), - total = integer(length(real_counts_path)), - unique = integer(length(real_counts_path))) + rcDT <- data.table::data.table( + name = character(length(real_counts_path)), + total = integer(length(real_counts_path)), + unique = integer(length(real_counts_path))) if (exists(real_counts_path[1])) { - rc_file <- data.table(get(real_counts_path[1])) + rc_file <- data.table::data.table(get(real_counts_path[1])) rcDT$name <- basename(rc_file$V1) rcDT$total <- as.integer(rc_file$V2) if (ncol(rc_file) == 3 && !ignore_unique) { @@ -221,7 +225,7 @@ plotComplexityCurves <- function(ccurves, for (rc in 1:length(real_counts_path)) { info <- file.info(file.path(real_counts_path[rc])) if (file.exists(real_counts_path[rc]) && info$size != 0) { - rc_file <- fread(real_counts_path[rc]) + rc_file <- data.table::fread(real_counts_path[rc]) rcDT$name[rc] <- basename(rc_file$V1) rcDT$total[rc] <- as.integer(rc_file$V2) if (ncol(rc_file) == 3 && !ignore_unique) { @@ -259,7 +263,8 @@ plotComplexityCurves <- function(ccurves, "#372B4C", "#E3DAC7", "#27CAE6", "#B361BC", "#897779", "#6114F8", "#19C42B", "#56B4E9")) - clist <- data.table(TOTAL_READS = list(), EXPECTED_DISTINCT = list()) + clist <- data.table::data.table(TOTAL_READS = list(), + EXPECTED_DISTINCT = list()) ccurves <- as.list(ccurves) colormap <- palette(length(ccurves)) for (c in 1:length(ccurves)) { @@ -270,9 +275,9 @@ plotComplexityCurves <- function(ccurves, #message(paste0("Processing ", sample_name)) if (exists(ccurves[[c]])) { - ctable <- data.table(get(ccurves[[c]])) + ctable <- data.table::data.table(get(ccurves[[c]])) } else if (file.exists(ccurves[[c]])) { - ctable <- fread(ccurves[[c]]) + ctable <- data.table::fread(ccurves[[c]]) } else { stop(paste0("FileExistsError: ", ccurves[[c]], " could not be found.")) @@ -313,7 +318,7 @@ plotComplexityCurves <- function(ccurves, } } if (c == 1) { - clist <- data.table( + clist <- data.table::data.table( SAMPLE_NAME = list(rep(sample_name, length(ccurve_TOTAL_READS))), TOTAL_READS = list(ccurve_TOTAL_READS), @@ -322,7 +327,7 @@ plotComplexityCurves <- function(ccurves, ) } else { clist <- rbindlist(list(clist, - data.table(SAMPLE_NAME = list(rep(sample_name, + data.table::data.table(SAMPLE_NAME = list(rep(sample_name, length(ccurve_TOTAL_READS))), TOTAL_READS = list(ccurve_TOTAL_READS), EXPECTED_DISTINCT = list(ccurve_EXPECTED_DISTINCT), @@ -499,11 +504,11 @@ plotComplexityCurves <- function(ccurves, labs(col = "") + scale_color_discrete(labels=c(clist$SAMPLE_NAME)) + theme_PEPATAC() + - theme(legend.position = "right", + ggplot2::theme(legend.position = "right", plot.caption = element_text(size = 8, face = "italic")) # inset zoom plot - zoom_theme <- theme(legend.position = "none", + zoom_theme <- ggplot2::theme(legend.position = "none", axis.line = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank(), @@ -563,7 +568,7 @@ plotComplexityCurves <- function(ccurves, # Don't include legend for single sample plots if (length(ccurves) == 1) { - fig <- fig + theme(legend.position = "none") + fig <- fig + ggplot2::theme(legend.position = "none") } return(fig) @@ -617,7 +622,7 @@ plotComplexityCurves <- function(ccurves, calcFRiF <- function(bedFile, total, reads) { colnames(bedFile) <- c("chromosome", "start", "end", "count", "bases", "width", "fraction") - grObj <- makeGRangesFromDataFrame(bedFile) + grObj <- GenomicRanges::makeGRangesFromDataFrame(bedFile) grObj <- reduce(grObj) redBed <- data.frame(chromosome=seqnames(grObj), start=start(grObj), end=end(grObj)) @@ -835,7 +840,7 @@ plotFRiF <- function(sample_name, num_reads, genome_size, p <- ggplot(covDF, aes(x=log10(cumSize), y=frip, group=feature, color=feature)) + #geom_line(aes(linetype=feature), size=2, alpha=0.5) + - geom_line(size=2, alpha=0.5) + + geom_line(linewidth=2, alpha=0.5) + guides(linetype = "none") + labs(x=expression(log[10]("number of bases")), y="FRiF") + @@ -846,7 +851,7 @@ plotFRiF <- function(sample_name, num_reads, genome_size, labels$val), values=labels$color) + labs(color="FRiF") + - theme(legend.position="right", + ggplot2::theme(legend.position="right", legend.justification=c(0.1,0.9), legend.background=element_blank(), legend.text = element_text(size = rel(0.65)), @@ -862,11 +867,11 @@ plotFRiF <- function(sample_name, num_reads, genome_size, coord_flip() + scale_x_discrete(position="top") + theme_PEPATAC() + - theme(plot.background = element_rect(fill = "transparent", - color = NA,), - panel.background = element_rect(fill = "transparent"), - rect = element_rect(fill = "transparent"), - plot.margin = unit(c(0,0,-6.5,-6.5),"mm")) + ggplot2::theme(plot.background = element_rect( + fill = "transparent", color = NA,), + panel.background = element_rect(fill = "transparent"), + rect = element_rect(fill = "transparent"), + plot.margin = unit(c(0,0,-6.5,-6.5),"mm")) g <- ggplotGrob(p2) min_x <- min(layer_scales(p)$x$range$range) @@ -884,7 +889,7 @@ plotFRiF <- function(sample_name, num_reads, genome_size, # group=feature, color=feature)) + p <- ggplot(covDF, aes(x=log10(cumSize), y=frip, group=feature, color=feature)) + - geom_line(size=2, alpha=0.5) + + geom_line(linewidth=2, alpha=0.5) + guides(linetype = "none") + labs(x=expression(log[10]("number of bases")), y="FRiF") + theme_PEPATAC() @@ -894,7 +899,7 @@ plotFRiF <- function(sample_name, num_reads, genome_size, labels$val), values=labels$color) + labs(color="FRiF") + - theme(legend.position=c(0.075,0.975), + ggplot2::theme(legend.position=c(0.075,0.975), legend.justification=c(0.1,0.9), legend.title = element_blank(), legend.text = element_text(size = rel(0.65)), @@ -929,7 +934,7 @@ plotFRiF <- function(sample_name, num_reads, genome_size, labels$val), values=labels$color) + labs(color="FRiF") + - theme(legend.position="right", + ggplot2::theme(legend.position="right", legend.justification=c(0.1,0.9), legend.background=element_blank(), legend.key = element_blank(), @@ -944,11 +949,11 @@ plotFRiF <- function(sample_name, num_reads, genome_size, coord_flip() + scale_x_discrete(position="top") + theme_PEPATAC() + - theme(plot.background = element_rect(fill = "transparent", - color = NA,), - panel.background = element_rect(fill = "transparent"), - rect = element_rect(fill = "transparent"), - plot.margin = unit(c(0,0,-6.5,-6.5),"mm")) + ggplot2::theme(plot.background = element_rect( + fill = "transparent", color = NA,), + panel.background = element_rect(fill = "transparent"), + rect = element_rect(fill = "transparent"), + plot.margin = unit(c(0,0,-6.5,-6.5),"mm")) g <- ggplotGrob(p2) min_x <- min(layer_scales(p)$x$range$range) @@ -1004,8 +1009,8 @@ plotTSS <- function(TSSfile, cutoff=6) { } } - t1 <- theme_classic(base_size=14) + - theme(plot.background = element_blank(), + t1 <- ggplot2::theme_classic(base_size=14) + + ggplot2::theme(plot.background = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.border = element_rect(colour = "black", @@ -1016,26 +1021,26 @@ plotTSS <- function(TSSfile, cutoff=6) { aspect.ratio = 1, axis.ticks.length = unit(2, "mm")) - iMat <- data.table(V1 = numeric()) + iMat <- data.table::data.table(V1 = numeric()) if (length(TSSfile) == 1) { if (exists(TSSfile)) { - iMat <- data.table(get(TSSfile)) + iMat <- data.table::data.table(get(TSSfile)) } else { - iMat <- fread(TSSfile) + iMat <- data.table::fread(TSSfile) } } else if (length(TSSfile) == 2) { for (i in 1:length(TSSfile)) { if (exists(TSSfile[i])) { if (i == 1) { - iMat <- data.table(get(TSSfile[i])) + iMat <- data.table::data.table(get(TSSfile[i])) } else { - iMat <- list(iMat, data.table(get(TSSfile[i]))) + iMat <- list(iMat, data.table::data.table(get(TSSfile[i]))) } } else { if (i == 1) { - iMat <- fread(TSSfile[i]) + iMat <- data.table::fread(TSSfile[i]) } else { - iMat <- list(iMat, fread(TSSfile[i])) + iMat <- list(iMat, data.table::fread(TSSfile[i])) } } } @@ -1192,11 +1197,11 @@ plotFLD <- function(fragL, max_fragment = 200) { if (exists(fragL_count)) { - dat <- data.table(get(fragL_count)) + dat <- data.table::data.table(get(fragL_count)) } else if (file.exists(fragL_count)) { info <- file.info(file.path(fragL_count)) if (info$size != 0) { - dat <- fread(fragL_count) + dat <- data.table::fread(fragL_count) } else { warning(paste0(fragL_count, " is an empty file.")) return(ggplot()) @@ -1207,9 +1212,9 @@ plotFLD <- function(fragL, } if (exists(fragL)) { - summary_table <- data.table(get(fragL)) + summary_table <- data.table::data.table(get(fragL)) } else if (file.exists(fragL)) { - summary_table <- fread(fragL) + summary_table <- data.table::fread(fragL) } else { stop(paste0("FileExistsError: ", fragL, " could not be found.")) quit(save = "no", status = 1, runLast = FALSE) @@ -1217,7 +1222,7 @@ plotFLD <- function(fragL, dat1 <- dat[dat$V2<=max_fragment,] tmp <- seq(1:as.numeric(dat1[1,2]-1)) - dat0 <- data.table(V1=rep(0,length(tmp)),V2=tmp) + dat0 <- data.table::data.table(V1=rep(0,length(tmp)),V2=tmp) dat2 <- rbind(dat0, dat1) x_min <- which.min(dat1$V1[1:which.max(dat1$V1)]) @@ -1237,15 +1242,16 @@ plotFLD <- function(fragL, geom_line(alpha=0.5) + labs(x="Fragment length", y=ylabel) + theme_PEPATAC() + - theme(axis.text.x = element_text(angle = 0, hjust = 0.5)) - - summ <- data.table(Min=min(summary_table$V1), - Max=max(summary_table$V1), - Median=median(summary_table$V1), - Mean=mean(summary_table$V1), - Stdev=sd(summary_table$V1)) + ggplot2::theme(axis.text.x = element_text(angle = 0, hjust = 0.5)) + + summ <- data.table::data.table( + Min=min(summary_table$V1), + Max=max(summary_table$V1), + Median=median(summary_table$V1), + Mean=mean(summary_table$V1), + Stdev=sd(summary_table$V1)) # Write summary table to stats file - fwrite(summ, file=fragL_txt, row.names=F, quote=F, sep="\t") + data.table::fwrite(summ, file=fragL_txt, row.names=F, quote=F, sep="\t") return(p) } @@ -1361,7 +1367,7 @@ tryCatchChromBins <- function(x, y) { withCallingHandlers( { msg <- "" - list(value = calcChromBinsRef(x, y), msg = msg) + list(value = GenomicDistributions::calcChromBinsRef(x, y), msg = msg) }, warning = function(e) { msg <<- trimws(paste0("WARNING: ", e)) @@ -1411,11 +1417,11 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), # load input file and convert to/ensure it is in BED6 format if (exists(input)) { - in_file <- data.table(get(input)) + in_file <- data.table::data.table(get(input)) } else { info <- file.info(file.path(input)) if (file.exists(file.path(input)) && info$size != 0) { - in_file <- fread(file.path(input)) + in_file <- data.table::fread(file.path(input)) } else { out_file <- file.path(output, paste(basename(sample_path), output_type, @@ -1436,7 +1442,7 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), } # Convert to GRanges Object - query <- makeGRangesFromDataFrame(in_bed, keep.extra.columns=TRUE) + query <- GenomicRanges::makeGRangesFromDataFrame(in_bed, keep.extra.columns=TRUE) if (tolower(plot) == "chromosome") { # Chromosome distribution plot @@ -1446,7 +1452,7 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), message(x$msg) } - if (x$value == "" || is.na(x$value) || is.null(x$value)) { + if (is.null(nrow(x$value))) { return(ggplot()) } # Don't plot lowest 10% represented chromosomes @@ -1455,7 +1461,7 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), keep <- tbl[tbl$Freq > cutoff, 1] x <- x$value[x$value$chr %in% keep,] if (nrow(x) > 0) { - ga_plot <- plotChromBins(x) + ga_plot <- GenomicDistributions::plotChromBins(x) return(ga_plot) } else { message("Too few peaks to plot. Check the genome alignment rates.") @@ -1465,15 +1471,15 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), # Feature distance distribution plots x <- tryCatch( { - suppressMessages(calcFeatureDistRefTSS(query, genome)) + suppressMessages(GenomicDistributions::calcFeatureDistRefTSS(query, genome)) }, error=function(e) { - message("calcFeatureDistRefTSS(): ", e) + message("GenomicDistributions::calcFeatureDistRefTSS(): ", e) return(NULL) }, warning=function(e) { - message("calcFeatureDistRefTSS(): ", e) - return(NULL) + message("GenomicDistributions::calcFeatureDistRefTSS(): ", e) + suppressMessages(GenomicDistributions::calcFeatureDistRefTSS(query, genome)) } ) @@ -1482,7 +1488,7 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), } if (!is.na(x[1])) { - TSS_plot <- plotFeatureDist(x, featureName="TSS") + TSS_plot <- GenomicDistributions::plotFeatureDist(x, featureName="TSS") return(TSS_plot) } else { message("Unable to produce TSS distribution plot.") @@ -1492,13 +1498,14 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), # Partition distribution plots knownGenomes <- c('hg19', 'hg38', 'mm9', 'mm10') if (exists(feat)) { - anno_file <- data.table(get(feat)) + anno_file <- data.table::data.table(get(feat)) } else { if (filetype(paste0(feat)) == "gzfile") { - anno_file <- fread(cmd=(sprintf('gzip -d -c %s', shQuote(file.path(feat))))) + anno_file <- data.table::fread( + cmd=(sprintf('gzip -d -c %s', shQuote(file.path(feat))))) suppressWarnings(closeAllConnections()) } else { - anno_file <- fread(file.path(feat)) + anno_file <- data.table::fread(file.path(feat)) } } @@ -1526,7 +1533,7 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), # Default to chromosome distribution plot # Chromosome distribution plot # Chromosome distribution plot - x <- chromBins(query, genome) + x <- GenomicDistributions::chromBins(query, genome) if (x$msg != "") { message(x$msg) @@ -1541,7 +1548,7 @@ plotAnno <- function(plot = c("chromosome", "tss", "genomic"), keep <- tbl[tbl$Freq > cutoff, 1] x <- x$value[x$value$chr %in% keep,] if (nrow(x) > 0) { - ga_plot <- plotChromBins(x) + ga_plot <- GenomicDistributions::plotChromBins(x) return(ga_plot) } else { message("Too few peaks to plot. Check the genome alignment rates.") @@ -1565,7 +1572,7 @@ narrowPeakToBigBed <- function(input=input, chr_sizes=chr_sizes, sample_path <- sampleName(input, 1) if (file.exists(file.path(chr_sizes))) { - chrom_sizes <- fread(file.path(chr_sizes)) + chrom_sizes <- data.table::fread(file.path(chr_sizes)) colnames(chrom_sizes) <- c("chr", "size") } else { err_msg <- paste0("Could not find: ", file.path(chr_sizes)) @@ -1574,7 +1581,7 @@ narrowPeakToBigBed <- function(input=input, chr_sizes=chr_sizes, info = file.info(file.path(input)) if (file.exists(file.path(input)) && info$size != 0) { - np <- fread(file.path(input)) + np <- data.table::fread(file.path(input)) colnames(np) <- c("chr", "chromStart", "chromEnd", "name", "score", "strand", "signalValue", "pValue", "qValue", "peak") } else { @@ -1610,8 +1617,8 @@ narrowPeakToBigBed <- function(input=input, chr_sizes=chr_sizes, as_file <- file.path(paste0(dirname(sample_path), "bigNarrowPeak.as")) out_file <- file.path(paste0(sample_path, "_peaks.bigBed")) - fwrite(np, file=tmp_file, col.names=FALSE, row.names=FALSE, - quote=FALSE, sep='\t') + data.table::fwrite(np, file=tmp_file, col.names=FALSE, row.names=FALSE, + quote=FALSE, sep='\t') cat("table bigNarrowPeak\n", "\"BED6+4 Peaks of signal enrichment based on pooled, normalized (interpreted) data.\"\n", @@ -1650,7 +1657,7 @@ narrowPeakToBigBed <- function(input=input, chr_sizes=chr_sizes, reducePeaks <- function(input, sample_name, chr_sizes, output=NA, normalize=FALSE) { info <- file.info(file.path(input)) if (file.exists(file.path(input)) && info$size != 0) { - peaks <- fread(file.path(input)) + peaks <- data.table::fread(file.path(input)) if (ncol(peaks) == 6) { colnames(peaks) <- c("chr", "start", "end", "name", "score", "strand") @@ -1677,7 +1684,7 @@ reducePeaks <- function(input, sample_name, chr_sizes, output=NA, normalize=FALS info <- file.info(file.path(chr_sizes)) if (file.exists(file.path(chr_sizes)) && info$size != 0) { - c_size <- fread(file.path(chr_sizes)) + c_size <- data.table::fread(file.path(chr_sizes)) colnames(c_size) <- c("chr", "size") } else { if (info$size == 0) { @@ -1689,14 +1696,16 @@ reducePeaks <- function(input, sample_name, chr_sizes, output=NA, normalize=FALS } if (exists("peaks") & exists("c_size")) { - hits <- foverlaps(peaks, peaks, + hits <- data.table::foverlaps(peaks, peaks, by.x=c("chr", "start", "end"), type="any", which=TRUE, nomatch=0) if (bedOnly) { # Only have the "score" to rank peaks - qVals <- data.table(index=rep(1:nrow(peaks)), qValue=peaks$score) + qVals <- data.table::data.table(index=rep(1:nrow(peaks)), + qValue=peaks$score) } else { - qVals <- data.table(index=rep(1:nrow(peaks)), qValue=peaks$qValue) + qVals <- data.table::data.table(index=rep(1:nrow(peaks)), + qValue=peaks$qValue) } setkey(hits, xid) setkey(qVals, index) @@ -1722,10 +1731,11 @@ reducePeaks <- function(input, sample_name, chr_sizes, output=NA, normalize=FALS # save final peak set if (is.na(output)) { file_path <- file.path(dirname(input), sample_name) - fwrite(final, paste0(file_path, "_peaks_normalized.narrowPeak"), - sep="\t", col.names=FALSE) + data.table::fwrite(final, + paste0(file_path, "_peaks_normalized.narrowPeak"), + sep="\t", col.names=FALSE) } else { - fwrite(final, output, sep="\t", col.names=FALSE) + data.table::fwrite(final, output, sep="\t", col.names=FALSE) } } else { @@ -1785,9 +1795,9 @@ setPanelSize <- function(p=NULL, g=ggplotGrob(p), file=NULL, if(!is.null(file)) ggsave(file, g, limitsize = FALSE, - width=convertWidth(sum(g$widths) + margin, + width=grid::convertWidth(sum(g$widths) + margin, unitTo="in", valueOnly=TRUE), - height=convertHeight(sum(g$heights) + margin, + height=grid::convertHeight(sum(g$heights) + margin, unitTo="in", valueOnly=TRUE)) invisible(g) } @@ -1818,17 +1828,19 @@ getPrealignments <- function(stats_file) { #' @export plotAlignedRaw <- function(prj, summary_dir, stats) { # Convenience - project_name <- config(prj)$name + project_name <- pepr::config(prj)$name - align_theme <- theme( + align_theme <- ggplot2::theme( plot.background = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.border = element_rect(colour = "black", fill = NA, size = 0.5), + panel.border = element_rect(colour = "black", fill = NA, + linewidth = 0.5), panel.background = element_blank(), axis.line = element_blank(), axis.text.x = element_text(face = "plain", color = "black", - size = 20, hjust = 0.5), + size = 20, angle = 90, hjust = 1, + vjust=0.5), axis.text.y = element_text(face = "plain", color = "black", size = 20, hjust = 1), axis.title.x = element_text(face = "plain", color = "black", @@ -1837,17 +1849,18 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { size = 22, hjust = 0.5), plot.title = element_text(face = "bold", color = "black", size = 12, hjust = 0.5), - axis.ticks = element_line(size = 0.5), + axis.ticks = element_line(linewidth = 0.5), axis.ticks.length = unit(2, "mm"), legend.background = element_rect(fill = "transparent"), legend.text = element_text(size = 16), - legend.title = element_blank() + legend.title = element_blank(), + aspect.ratio = 1 ) # Get prealignments if they exist prealignments <- getPrealignments(stats) - unaligned <- stats$Fastq_reads - stats$Aligned_reads + unaligned <- as.numeric(stats$Fastq_reads) - as.numeric(stats$Aligned_reads) # If prealignments exist...include in unaligned reads count if (!is.null(prealignments)) { for (i in 1:length(unlist(prealignments))) { @@ -1859,7 +1872,7 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { align_raw <- tryCatch( { - data.table(sample = stats$sample_name, + data.table::data.table(sample = stats$sample_name, unaligned = as.integer(unaligned), duplicates = as.integer(stats$Duplicate_reads)) }, @@ -1904,25 +1917,25 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { stats[, (paste("Aligned_reads", unlist(prealignments)[i], sep="_")), with=FALSE][[1]])] } - setcolorder(align_raw, c("sample", "unaligned", + data.table::setcolorder(align_raw, c("sample", "unaligned", paste(unlist(prealignments)), "duplicates", paste(unique(stats$Genome)))) } else { - setcolorder(align_raw, c("sample", "unaligned", "duplicates", + data.table::setcolorder(align_raw, c("sample", "unaligned", "duplicates", paste(unique(stats$Genome)))) } align_raw$sample <- factor(align_raw$sample, levels = unique(align_raw$sample)) - melt_align_raw <- melt(align_raw, id.vars = "sample") + melt_align_raw <- reshape2::melt(align_raw, id.vars = "sample") max_reads <- max(rowSums(align_raw[,2:ncol(align_raw)])) upper_limit <- roundUpNice(max_reads/1000000) - chart_height <- (length(unique(align_raw$sample))) * 0.75 - plot_colors <- data.table(unaligned="gray15") + chart_height <- (length(unique(align_raw$sample))) #* 0.75 + plot_colors <- data.table::data.table(unaligned="gray15") if (!is.null(prealignments)) { - more_colors <- colorpanel(length(unlist(prealignments)), + more_colors <- gplots::colorpanel(length(unlist(prealignments)), low="#FFE595", mid="#F6CAA6", high="#F6F2A6") for (i in 1:length(unlist(prealignments))) { plot_colors[, unlist(prealignments)[i] := more_colors[i]] @@ -1930,7 +1943,7 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { } plot_colors[, duplicates := "#FC1E25"] - more_colors <- colorpanel(length(genome_names), + more_colors <- gplots::colorpanel(length(genome_names), low="#4876FF", mid="#94D9CE", high="#7648FF") for (i in 1:length(genome_names)) { plot_colors[, (genome_names[i]) := more_colors[i]] @@ -1938,7 +1951,7 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { align_raw_plot <- ( ggplot(melt_align_raw, aes(x=sample, y=as.numeric(value)/1000000)) + - geom_col(aes(fill=variable), colour="black", size=0.25, width=0.8) + + geom_col(aes(fill=variable), colour="black", `linewidth`=0.25, width=0.8) + guides(fill=guide_legend(reverse=TRUE)) + labs(x="", y="Number of reads (M)") + scale_x_discrete(limits=rev(levels(melt_align_raw$sample))) + @@ -1954,7 +1967,7 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { setPanelSize( align_raw_plot, file=output_file, - width=unit(8,"inches"), + width=unit(chart_height,"inches"), height=unit(chart_height,"inches") ) ) @@ -1967,8 +1980,8 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { colnames(more_to_see) <- colnames(align_raw_thumb) align_raw_thumb <- rbind(align_raw_thumb, more_to_see) align_raw_thumb$sample <- droplevels(align_raw_thumb)$sample - melt_align_raw_thumb <- melt(align_raw_thumb, id.vars="sample") - chart_height <- (length(unique(align_raw_thumb$sample))) * 0.75 + melt_align_raw_thumb <- reshape2::melt(align_raw_thumb, id.vars="sample") + chart_height <- (length(unique(align_raw_thumb$sample))) #* 0.75 } else {melt_align_raw_thumb <- melt_align_raw} thumb_raw_plot <- ( @@ -1988,7 +2001,7 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { setPanelSize( thumb_raw_plot, file=output_file, - width=unit(8,"inches"), + width=unit(chart_height,"inches"), height=unit(chart_height,"inches") ) ) @@ -2004,17 +2017,19 @@ plotAlignedRaw <- function(prj, summary_dir, stats) { #' @export plotAlignedPct <- function(prj, summary_dir, stats) { # Convenience - project_name <- config(prj)$name + project_name <- pepr::config(prj)$name - align_theme <- theme( + align_theme <- ggplot2::theme( plot.background = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.border = element_rect(colour = "black", fill = NA, size = 0.5), + panel.border = element_rect(colour = "black", fill = NA, + linewidth = 0.5), panel.background = element_blank(), axis.line = element_blank(), axis.text.x = element_text(face = "plain", color = "black", - size = 20, hjust = 0.5), + size = 20, angle = 90, hjust = 1, + vjust=0.5), axis.text.y = element_text(face = "plain", color = "black", size = 20, hjust = 1), axis.title.x = element_text(face = "plain", color = "black", @@ -2023,14 +2038,15 @@ plotAlignedPct <- function(prj, summary_dir, stats) { size = 22, hjust = 0.5), plot.title = element_text(face = "bold", color = "black", size = 12, hjust = 0.5), - axis.ticks = element_line(size = 0.5), + axis.ticks = element_line(linewidth = 0.5), axis.ticks.length = unit(2, "mm"), legend.background = element_rect(fill = "transparent"), legend.text = element_text(size = 16), - legend.title = element_blank() + legend.title = element_blank(), + aspect.ratio = 1 ) - unaligned <- 100 - stats$Alignment_rate + unaligned <- 100 - as.numeric(stats$Alignment_rate) # Get prealignments if they exist prealignments <- getPrealignments(stats) @@ -2054,7 +2070,7 @@ plotAlignedPct <- function(prj, summary_dir, stats) { } } - align_percent <- data.table(sample=stats$sample_name, + align_percent <- data.table::data.table(sample=stats$sample_name, unaligned=unaligned, duplicates=as.numeric(duplicates)) @@ -2083,11 +2099,11 @@ plotAlignedPct <- function(prj, summary_dir, stats) { unlist(prealignments)[i], sep="_")), with=FALSE][[1]])] } - setcolorder(align_percent, c("sample", "unaligned", + data.table::setcolorder(align_percent, c("sample", "unaligned", paste(unlist(prealignments)), "duplicates", paste(unique(stats$Genome)))) } else { - setcolorder(align_percent, c("sample", "unaligned", "duplicates", + data.table::setcolorder(align_percent, c("sample", "unaligned", "duplicates", paste(unique(stats$Genome)))) } @@ -2117,14 +2133,14 @@ plotAlignedPct <- function(prj, summary_dir, stats) { print(aberrant_samples, row.names=FALSE) } - melt_align_percent <- melt(align_percent, id.vars="sample") + melt_align_percent <- reshape2::melt(align_percent, id.vars="sample") upper_limit <- 103 - chart_height <- (length(unique(align_percent$sample))) * 0.75 + chart_height <- (length(unique(align_percent$sample))) #* 0.75 - plot_colors <- data.table(unaligned="gray15") + plot_colors <- data.table::data.table(unaligned="gray15") if (!is.null(prealignments)) { - more_colors <- colorpanel(length(unlist(prealignments)), + more_colors <- gplots::colorpanel(length(unlist(prealignments)), low="#FFE595", mid="#F6CAA6", high="#F6F2A6") for (i in 1:length(unlist(prealignments))) { plot_colors[, unlist(prealignments)[i] := more_colors[i]] @@ -2132,7 +2148,7 @@ plotAlignedPct <- function(prj, summary_dir, stats) { } plot_colors[, duplicates := "#FC1E25"] - more_colors <- colorpanel(length(genome_names), + more_colors <- gplots::colorpanel(length(genome_names), low="#4876FF", mid="#94D9CE", high="#7648FF") for (i in 1:length(genome_names)) { plot_colors[, (genome_names[i]) := more_colors[i]] @@ -2156,7 +2172,7 @@ plotAlignedPct <- function(prj, summary_dir, stats) { setPanelSize( align_percent_plot, file=output_file, - width=unit(8,"inches"), + width=unit(chart_height,"inches"), height=unit(chart_height,"inches") ) ) @@ -2170,10 +2186,9 @@ plotAlignedPct <- function(prj, summary_dir, stats) { colnames(more_to_see) <- colnames(align_percent_thumb) align_percent_thumb <- rbind(align_percent_thumb, more_to_see) align_percent_thumb$sample <- droplevels(align_percent_thumb)$sample - melt_align_percent_thumb <- melt(align_percent_thumb, + melt_align_percent_thumb <- reshape2::melt(align_percent_thumb, id.vars="sample") - chart_height <- ((length(unique(align_percent_thumb$sample))) * - 0.75) + chart_height <- ((length(unique(align_percent_thumb$sample)))) } else {melt_align_percent_thumb <- melt_align_percent} thumb_percent_plot <- ( @@ -2193,7 +2208,7 @@ plotAlignedPct <- function(prj, summary_dir, stats) { setPanelSize( thumb_percent_plot, file=output_file, - width=unit(8,"inches"), + width=unit(chart_height,"inches"), height=unit(chart_height,"inches") ) ) @@ -2210,18 +2225,19 @@ plotAlignedPct <- function(prj, summary_dir, stats) { #' @export plotTSSscores <- function(prj, summary_dir, stats, cutoff=6) { # Convenience - project_name <- config(prj)$name + project_name <- pepr::config(prj)$name - align_theme <- theme( + align_theme <- ggplot2::theme( plot.background = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.border = element_rect(colour = "black", fill = NA, - size = 0.5), + linewidth = 0.5), panel.background = element_blank(), axis.line = element_blank(), axis.text.x = element_text(face = "plain", color = "black", - size = 20, hjust = 0.5), + size = 20, angle = 90, hjust = 1, + vjust=0.5), axis.text.y = element_text(face = "plain", color = "black", size = 20, hjust = 1), axis.title.x = element_text(face = "plain", color = "black", @@ -2230,23 +2246,24 @@ plotTSSscores <- function(prj, summary_dir, stats, cutoff=6) { size = 22, hjust = 0.5), plot.title = element_text(face = "bold", color = "black", size = 12, hjust = 0.5), - axis.ticks = element_line(size = 0.5), + axis.ticks = element_line(linewidth = 0.5), axis.ticks.length = unit(2, "mm"), legend.background = element_rect(fill = "transparent"), legend.text = element_text(size = 16), - legend.title = element_blank() + legend.title = element_blank(), + aspect.ratio = 1 ) # Establish red/green color scheme red_min <- 0 red_max <- cutoff-0.01 red_breaks <- seq(red_min,red_max,0.01) - red_colors <- colorpanel(length(red_breaks), + red_colors <- gplots::colorpanel(length(red_breaks), "#AF0000","#E40E00","#FF7A6A") green_min <- cutoff green_max <- 30 green_breaks <- seq(green_min,green_max,0.01) - green_colors <- colorpanel(length(green_breaks)-1, + green_colors <- gplots::colorpanel(length(green_breaks)-1, "#B4E896","#009405","#003B00") TSS_colors <- c(red_colors, green_colors) @@ -2278,7 +2295,7 @@ plotTSSscores <- function(prj, summary_dir, stats, cutoff=6) { max_TSS <- max(stats$TSS_score, na.rm=TRUE) upper_limit <- roundUpNice(max_TSS) - chart_height <- (length(unique(TSS_score$sample))) * 0.75 + chart_height <- (length(unique(TSS_score$sample))) #* 0.75 TSS_score$sample <- factor(TSS_score$sample, levels=unique(TSS_score$sample)) @@ -2308,7 +2325,7 @@ plotTSSscores <- function(prj, summary_dir, stats, cutoff=6) { # Limit to 25 samples max if (length(TSS_score$sample) > 25) { TSS_score_thumb <- TSS_score[1:25, ] - chart_height <- (length(unique(TSS_score_thumb$sample))) * 0.75 + chart_height <- (length(unique(TSS_score_thumb$sample))) #* 0.75 more_to_see <- data.frame(t(c("...", "0", "#AF0000"))) colnames(more_to_see) <- colnames(TSS_score_thumb) TSS_score_thumb <- rbind(TSS_score_thumb, more_to_see) @@ -2346,13 +2363,14 @@ plotTSSscores <- function(prj, summary_dir, stats, cutoff=6) { #' @export plotLibSizes <- function(prj, summary_dir, stats) { # Convenience - project_name <- config(prj)$name + project_name <- pepr::config(prj)$name - align_theme <- theme( + align_theme <- ggplot2::theme( plot.background = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.border = element_rect(colour = "black", fill = NA, size = 0.5), + panel.border = element_rect(colour = "black", fill = NA, + linewidth = 0.5), panel.background = element_blank(), axis.line = element_blank(), axis.text.x = element_text(face = "plain", color = "black", @@ -2365,11 +2383,12 @@ plotLibSizes <- function(prj, summary_dir, stats) { size = 22, hjust = 0.5), plot.title = element_text(face = "bold", color = "black", size = 12, hjust = 0.5), - axis.ticks = element_line(size = 0.5), + axis.ticks = element_line(linewidth = 0.5), axis.ticks.length = unit(2, "mm"), legend.background = element_rect(fill = "transparent"), legend.text = element_text(size = 16), - legend.title = element_blank() + legend.title = element_blank(), + aspect.ratio = 1 ) picard_lib_size <- cbind.data.frame( @@ -2411,6 +2430,34 @@ plotLibSizes <- function(prj, summary_dir, stats) { ) } +#' Internal helper function for \code{summarizer} +#' Convert the new pipestat yaml outputs into R readable data.table file +#' +#' @param sample_name A list project sample names +#' @param yaml_file A project level stats summary yaml file +yamlToDT <- function(sample_name, yaml_file) { + sample_DT <- data.table::as.data.table(yaml_file$PEPATAC$sample[[sample_name]]) + remove_cols <- c("Library complexity", + "TSS enrichment", + "TSS distance distribution", + "Fragment distribution", + "Peak chromosome distribution", + "Peak partition distribution", + "cFRiF", + "FRiF", + "FastQC report r1", + "FastQC report r2") + cols_present <- character() + for (column_name in remove_cols) { + if (any(grepl(column_name, names(sample_DT)))) { + cols_present <- c(cols_present, column_name) + } + } + sample_DT[, (cols_present) := NULL] + sample_DT <- unique(sample_DT) + sample_DT$sample_name <- sample_name + return(sample_DT) +} #' This function is meant to plot multiple summary graphs from the summary table #' made by the Looper summarize command @@ -2420,25 +2467,27 @@ plotLibSizes <- function(prj, summary_dir, stats) { #' @keywords summarize PEPATAC #' @export summarizer <- function(project, output_dir) { + options(scipen = 999) # Build the stats summary file path summary_file <- file.path(output_dir, - paste0(pepr::config(project)$name, "_stats_summary.tsv")) - + paste0(pepr::config(project)$name, "_stats_summary.yaml")) + if (file.exists(summary_file)) { + summary_file <- yaml::read_yaml(summary_file, + handlers=list(int=function(x) { as.numeric(x) })) + } else { + warning("PEPATAC.R summarizer was unable to locate the summary file.") + return(FALSE) + } + #message(paste0("summary file : ", summary_file)) # DEBUG write(paste0("Creating summary plots..."), stdout()) # Set the output directory summary_dir <- suppressMessages(file.path(output_dir, "summary")) # Produce output directory (if needed) dir.create(summary_dir, showWarnings = FALSE) - # read in stats summary file - if (file.exists(summary_file)) { - stats <- suppressWarnings(fread(summary_file, - header=TRUE, - check.names=FALSE)) - } else { - warning("PEPATAC.R summarizer was unable to locate the summary file.") - return(FALSE) - } + # convert yaml to data.table object + stats <- data.table::rbindlist(sapply(project_samples, FUN=yamlToDT, + yaml_file=summary_file), fill=TRUE) # Set absent values in table to zero stats[is.na(stats)] <- 0 @@ -2472,7 +2521,7 @@ summarizer <- function(project, output_dir) { countReproduciblePeaks <- function(peak_list, peak_DT) { setkey(peak_DT, chr, start, end) setkey(peak_list, chr, start, end) - hits <- foverlaps(peak_list, peak_DT, + hits <- data.table::foverlaps(peak_list, peak_DT, by.x=c("chr", "start", "end"), type="any", which=TRUE, nomatch=0) # track the number of overlaps of final peak set peaks @@ -2500,7 +2549,7 @@ countReproduciblePeaks <- function(peak_list, peak_DT) { collapsePeaks <- function(sample_table, chr_sizes, min_samples=2, min_score=5, min_olap=1) { # create combined peaks - peaks <- rbindlist(lapply(sample_table$peak_files, fread), idcol="file") + peaks <- rbindlist(lapply(sample_table$peak_files, data.table::fread), idcol="file") if (ncol(peaks) == 7) { colnames(peaks) <- c("file", "chr", "start", "end", "name", "score", "strand") @@ -2512,7 +2561,7 @@ collapsePeaks <- function(sample_table, chr_sizes, warning(paste0("Peak files did not contain a recognizable number", " of columns (", ncol(peaks), ")")) rm(peaks) - final <- data.table(chr=character(), + final <- data.table::data.table(chr=character(), start=integer(), end=integer(), name=character(), @@ -2530,14 +2579,14 @@ collapsePeaks <- function(sample_table, chr_sizes, peaks_by_chr <- split(peaks, peaks$chr) hit_aggregator <- function(x) { #message(paste0("x: ", unique(x$chr))) # DEBUG - peaksGR <- makeGRangesFromDataFrame(x, keep.extra.columns=FALSE) + peaksGR <- GenomicRanges::makeGRangesFromDataFrame(x, keep.extra.columns=FALSE) hitsGR <- suppressWarnings( findOverlaps(peaksGR, peaksGR, ignore.strand=TRUE, minoverlap=min_olap)) hits <- data.table::data.table(xid=queryHits(hitsGR), yid=subjectHits(hitsGR)) setkey(hits, xid) - scores <- data.table(index=rep(1:nrow(x)), score=x$score) + scores <- data.table::data.table(index=rep(1:nrow(x)), score=x$score) setkey(scores, index) out <- hits[scores, nomatch=0] keep <- out[out[,.I[which.max(score)],by=yid]$V1] @@ -2610,7 +2659,7 @@ consensusPeaks <- function(sample_table, summary_dir, results_subdir, assets, file_exists <- append(file_exists, file.path(file_list[i])) } } - files <- data.table(peak_files=file_exists) + files <- data.table::data.table(peak_files=file_exists) consensus_peak_files = list() if (nrow(files) == 0) { return(consensus_peak_files) @@ -2636,7 +2685,7 @@ consensusPeaks <- function(sample_table, summary_dir, results_subdir, assets, } c_path <- unique(sample_table[genome == g, c_path]) if (file.exists(c_path)) { - c_size <- fread(c_path) + c_size <- data.table::fread(c_path) colnames(c_size) <- c("chr", "size") } else { warning("Unable to load the chromosome sizes file.") @@ -2654,7 +2703,7 @@ consensusPeaks <- function(sample_table, summary_dir, results_subdir, assets, file_name <- paste0("_", g,"_consensusPeaks.narrowPeak") output_file <- file.path(summary_dir, paste0(project_name, file_name)) - fwrite(final, output_file, sep="\t", col.names=FALSE) + data.table::fwrite(final, output_file, sep="\t", col.names=FALSE) consensus_peak_files <- c(consensus_peak_files, output_file) rm(final) invisible(gc()) @@ -2704,7 +2753,7 @@ readPepatacPeakCounts = function(prj, results_subdir) { result <- lapply(paths, function(x){ info <- file.info(file.path(x)) if (file.exists(x) && info$size != 0) { - df <- fread(x) + df <- data.table::fread(x) colnames(df) <- c("chr", "start", "end", "read_count", "base_count", "width", "frac", "norm") gr <- GenomicRanges::GRanges(df) @@ -2784,7 +2833,7 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, file_exists <- append(file_exists, file.path(file_list[i])) } } - files <- data.table(peak_files=file_exists) + files <- data.table::data.table(peak_files=file_exists) consensus_peak_files = list() if (nrow(files) == 0) { return(consensus_peak_files) @@ -2811,7 +2860,7 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, } c_path <- unique(sample_table[genome == g, c_path]) if (file.exists(c_path)) { - c_size <- fread(c_path) + c_size <- data.table::fread(c_path) colnames(c_size) <- c("chr", "size") } else { warning("Unable to load the chromosome sizes file.") @@ -2823,7 +2872,7 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, nrow(st_list[[g]]), " samples...")) if (reference) { read_peaks <- function(x, y) { - fread(y) + data.table::fread(y) } # Load each peak file as list of named data.tables peaks <- mapply(FUN=read_peaks, @@ -2856,7 +2905,7 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, return(NULL) } } else { - peaks_dt <- data.table(chr=as.character(), + peaks_dt <- data.table::data.table(chr=as.character(), start=as.numeric(), end=as.numeric(), read_count=as.numeric(), @@ -2868,7 +2917,7 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, peaks <- tryCatch( { suppressMessages( - rbindlist(lapply(st_list[[g]]$peak_files, fread))) + rbindlist(lapply(st_list[[g]]$peak_files, data.table::fread))) }, error=function(e) { message("peakCounts() peak coverage file fread(): ", e) @@ -2892,15 +2941,16 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, setkey(peaks, chr, start, end) peaks_dt <- rbind(peaks_dt, peaks) # Convert to GRanges for more efficient findOverlaps vs data.table - peaksGR <- makeGRangesFromDataFrame(peaks_dt, + peaksGR <- GenomicRanges::makeGRangesFromDataFrame(peaks_dt, keep.extra.columns=TRUE) reduceGR <- reduce(peaksGR) # instead, different column for each sample is the counts columns, # plural - reduce_dt <- data.table(chr=as.character(seqnames(reduceGR)), - start=start(reduceGR), - end=end(reduceGR)) + reduce_dt <- data.table::data.table( + chr=as.character(seqnames(reduceGR)), + start=start(reduceGR), + end=end(reduceGR)) f <- function(x) {list(0)} # Need to make syntactically valid names valid_names <- make.unique(make.names(st_list[[g]]$sample_name)) @@ -2926,7 +2976,7 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, for (file in st_list[[g]]$peak_files) { info <- file.info(file.path(file)) if (file.exists(file.path(file)) && info$size != 0) { - p <- fread(file) + p <- data.table::fread(file) #name <- gsub("_peaks_coverage.bed","", basename(file)) name <- make.unique(make.names(st_list[[g]][i]$sample_name)) i <- i + 1 @@ -2934,8 +2984,8 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, "base_count", "width", "frac", "norm") setkey(p, chr, start, end) - reduceGR <- makeGRangesFromDataFrame(reduce_dt) - peaksGR <- makeGRangesFromDataFrame(p) + reduceGR <- GenomicRanges::makeGRangesFromDataFrame(reduce_dt) + peaksGR <- GenomicRanges::makeGRangesFromDataFrame(p) hitsGR <- findOverlaps(query=reduceGR, subject=peaksGR, minoverlap=min_olap) @@ -2945,21 +2995,22 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, polap <- width(olap) / width(peaksGR[subjectHits(hitsGR)]) if (poverlap & norm) { - counts <- data.table(index=rep(1:nrow(p)), + counts <- data.table::data.table(index=rep(1:nrow(p)), counts=p$norm*polap) } else if (!poverlap & norm) { - counts <- data.table(index=rep(1:nrow(p)), + counts <- data.table::data.table(index=rep(1:nrow(p)), counts=p$norm) } else if (poverlap & !norm) { - counts <- data.table(index=rep(1:nrow(p)), + counts <- data.table::data.table(index=rep(1:nrow(p)), counts=p$read_count*polap) } else { - counts <- data.table(index=rep(1:nrow(p)), + counts <- data.table::data.table(index=rep(1:nrow(p)), counts=p$read_count) } - hits <- data.table(xid=queryHits(hitsGR), - yid=subjectHits(hitsGR)) + hits <- data.table::data.table( + xid=queryHits(hitsGR), + yid=subjectHits(hitsGR)) setkey(hits, yid) setkey(counts, index) out <- hits[counts, nomatch=0] @@ -3004,7 +3055,8 @@ peakCounts <- function(sample_table, summary_dir, results_subdir, assets, output_file <- file.path(summary_dir, paste0(project_name, "_", g, "_peaks_coverage.tsv")) # save consensus peak set - fwrite(reduce_dt, output_file, sep="\t", col.names=TRUE) + data.table::fwrite(reduce_dt, output_file, + sep="\t", col.names=TRUE) counts_path <- c(counts_path, output_file) } else { warning(strwrap(prefix = " ", initial = "", @@ -3036,7 +3088,7 @@ createStatsSummary <- function(samples, results_subdir) { next } - t <- fread(sample_assets_file, header=FALSE, + t <- data.table::fread(sample_assets_file, header=FALSE, col.names=c('stat', 'val', 'annotation')) # Remove complete duplicates t <- t[!duplicated(t[, c('stat', 'val', 'annotation')], @@ -3047,9 +3099,9 @@ createStatsSummary <- function(samples, results_subdir) { fromLast=TRUE),] t[stat=="Time",]$val <- max_time - t2 <- data.table(t(t$val)) + t2 <- data.table::data.table(t(t$val)) colnames(t2) <- t$stat - t2 <- cbind(data.table(sample_name=sample), t2) + t2 <- cbind(data.table::data.table(sample_name=sample), t2) if (exists("stats")) { stats <- rbind(stats, t2, fill=TRUE) } else { @@ -3073,10 +3125,11 @@ createStatsSummary <- function(samples, results_subdir) { createAssetsSummary <- function(samples, results_subdir) { # Create assets_summary file missing_files <- 0 - assets <- data.table(sample_name=character(), - asset=character(), - path=character(), - annotation=character()) + assets <- data.table::data.table( + sample_name=character(), + asset=character(), + path=character(), + annotation=character()) write(paste0("Creating assets summary..."), stdout()) for (sample in samples) { @@ -3088,7 +3141,7 @@ createAssetsSummary <- function(samples, results_subdir) { next } - t <- fread(sample_assets_file, header=FALSE, + t <- data.table::fread(sample_assets_file, header=FALSE, col.names=c('asset', 'path', 'annotation')) t <- t[!duplicated(t[, c('asset', 'path', 'annotation')], fromLast=TRUE),] diff --git a/PEPATACr/man/consensusPeaks.Rd b/PEPATACr/man/consensusPeaks.Rd index 0e2d64bc..64248c4a 100644 --- a/PEPATACr/man/consensusPeaks.Rd +++ b/PEPATACr/man/consensusPeaks.Rd @@ -10,7 +10,8 @@ consensusPeaks( results_subdir, assets, min_samples = 2, - min_score = 5 + min_score = 5, + min_olap = 1 ) } \arguments{ @@ -27,6 +28,9 @@ genomes.} in to keep.} \item{min_score}{A minimum peak score to keep an individual peak.} + +\item{min_olap}{A minimum number of bases between peaks to be +considered overlapping.} } \description{ This function is meant to identify a project level set of consensus peaks. diff --git a/PEPATACr/man/reducePeaks.Rd b/PEPATACr/man/reducePeaks.Rd index 25eb7b7d..bc37d2dd 100644 --- a/PEPATACr/man/reducePeaks.Rd +++ b/PEPATACr/man/reducePeaks.Rd @@ -5,11 +5,13 @@ \title{This function is meant to keep only the most significant non-overlapping peaks. It also trims peaks extending beyond the bounds of the chromosome.} \usage{ -reducePeaks(input, chr_sizes, output = NA, normalize = FALSE) +reducePeaks(input, sample_name, chr_sizes, output = NA, normalize = FALSE) } \arguments{ \item{input}{Path to narrowPeak file} +\item{sample_name}{Sample name character string} + \item{chr_sizes}{Genome chromosome sizes file. } \item{output}{Output file name.} diff --git a/PEPATACr/man/yamlToDT.Rd b/PEPATACr/man/yamlToDT.Rd new file mode 100644 index 00000000..272027f5 --- /dev/null +++ b/PEPATACr/man/yamlToDT.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PEPATACr.R +\name{yamlToDT} +\alias{yamlToDT} +\title{Internal helper function for \code{summarizer} +Convert the new pipestat yaml outputs into R readable data.table file} +\usage{ +yamlToDT(sample_name, yaml_file) +} +\arguments{ +\item{sample_name}{A list project sample names} + +\item{yaml_file}{A project level stats summary yaml file} +} +\description{ +Internal helper function for \code{summarizer} +Convert the new pipestat yaml outputs into R readable data.table file +} diff --git a/checkinstall b/checkinstall index 736534a9..97766409 100755 --- a/checkinstall +++ b/checkinstall @@ -119,61 +119,282 @@ fi while IFS= read -r line; do [ "${line:0:1}" = "#" ] && continue - IFS='>=' read -r -a array <<< "$line" - package=${array[0]} - required=${array[2]} - required=$(trim ${required}) - IFS='.' read -r -a required_version <<< "$required" - declare -i rmajor - declare -i rminor - declare -i rpatch - rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') - rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') - rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') - - if ! pip_show "${package}"; then - echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") - printf "\n" - NATIVE_INSTALL=1 - BULKER_INSTALL=1 - else - installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') - installed=$(trim ${installed}) - IFS='.' read -r -a installed_version <<< "$installed" - declare -i imajor - declare -i iminor - declare -i ipatch - imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') - iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') - ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') - - if ! [ -z "$required" ]; then - if [ $imajor -lt $rmajor ]; then - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") - printf "\n" - NATIVE_INSTALL=1 - BULKER_INSTALL=1 - elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") - printf "\n" - NATIVE_INSTALL=1 - BULKER_INSTALL=1 - elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") - printf "\n" - NATIVE_INSTALL=1 - BULKER_INSTALL=1 + + if [[ $line == *","* ]]; then + IFS=',' read -r -a array <<< "$line" + first_half=${array[0]} + # Check the maximum requirement + IFS='<' read -r -a array <<< "$first_half" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, < $required. Try pip install $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -gt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, < $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -gt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, < $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -gt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, < $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + fi + fi + + + second_half=${array[2]} + # Check the minimum requirement + IFS='>=' read -r -a array <<< "$second_half" + required=${array[2]} + required=$(trim ${required}) + + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + fi + fi + elif [[ $line == *">="* ]]; then + IFS='>=' read -r -a array <<< "$line" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + fi + fi + elif [[ $line == *"=="* ]]; then + IFS='==' read -r -a array <<< "$line" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, == $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, == $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, == $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi else - echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") fi + fi + else + IFS='<' read -r -a array <<< "$line" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 else - echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + fi fi fi done < $REQS # Check tool installation -declare -a requiredCommands=("perl" "awk" "grep" "sed" "bedtools" "bowtie2" "fseq" "macs2" "preseq" "samblaster" "samtools" "skewer" "bedToBigBed" "bigWigCat" "wigToBigWig" "Rscript") +declare -a requiredCommands=("perl" "awk" "grep" "sed" "bedtools" "bowtie2" "macs3" "preseq" "samblaster" "samtools" "skewer" "bedToBigBed" "bigWigCat" "wigToBigWig" "Rscript") for cmd in ${requiredCommands[@]}; do if ! is_executable $cmd; then @@ -224,135 +445,346 @@ if ! is_executable "conda"; then CONDA_INSTALL=1 else eval "$(conda shell.bash hook)" - conda activate pepatac - - unset PYTHONPATH - unset R_LIBS - - # Check Python - if ! is_executable "python"; then - echo $(warn "WARNING: PEPATAC requires python 3.0 or greater. Install python and checkinstall again.") - printf "\n" - CONDA_INSTALL=1 - else - #echo "which python: $(which python)" - ver=$(python -V 2>&1 | sed 's/.* \([0-9]\).\([0-9]\).*/\1\2/') - if [ "$ver" -lt "30" ]; then - echo $(warn "WARNING: PEPATAC requires python 3.0 or greater. Update python and checkinstall again.") + + # Check if a pepatac conda environment exists + CONDA_ENVS=$(conda env list | grep -c '^pepatac\s') + + if [[ "$CONDA_ENVS" == 1 ]]; then + conda activate pepatac + + unset PYTHONPATH + unset R_LIBS + + # Check Python + if ! is_executable "python"; then + echo $(warn "WARNING: PEPATAC requires python 3.0 or greater. Install python and checkinstall again.") printf "\n" CONDA_INSTALL=1 + else + #echo "which python: $(which python)" + ver=$(python -V 2>&1 | sed 's/.* \([0-9]\).\([0-9]\).*/\1\2/') + if [ "$ver" -lt "30" ]; then + echo $(warn "WARNING: PEPATAC requires python 3.0 or greater. Update python and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + fi fi - fi - - # Check Python packages - if ! is_executable "pip"; then - echo $(warn "WARNING: PEPATAC requires pip. Please install pip and checkinstall again.") - printf "\n" - CONDA_INSTALL=1 - fi - - if [ -f "requirements.txt" ]; then - REQS="requirements.txt" - else - REQS=$(curl https://raw.githubusercontent.com/databio/pepatac/master/requirements.txt) - fi - while IFS= read -r line; do - [ "${line:0:1}" = "#" ] && continue - IFS='>=' read -r -a array <<< "$line" - package=${array[0]} - required=${array[2]} - required=$(trim ${required}) - IFS='.' read -r -a required_version <<< "$required" - declare -i rmajor - declare -i rminor - declare -i rpatch - rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') - rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') - rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') - - if ! pip_show "${package}"; then - echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") + # Check Python packages + if ! is_executable "pip"; then + echo $(warn "WARNING: PEPATAC requires pip. Please install pip and checkinstall again.") printf "\n" CONDA_INSTALL=1 + fi + + if [ -f "requirements.txt" ]; then + REQS="requirements.txt" else - installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') - installed=$(trim ${installed}) - IFS='.' read -r -a installed_version <<< "$installed" - declare -i imajor - declare -i iminor - declare -i ipatch - imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') - iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') - ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + REQS=$(curl https://raw.githubusercontent.com/databio/pepatac/master/requirements.txt) + fi - if ! [ -z "$required" ]; then - if [ $imajor -lt $rmajor ]; then - #echo -e "Installed major: ${imajor}\tRequired major: ${rmajor}" - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + while IFS= read -r line; do + [ "${line:0:1}" = "#" ] && continue + + if [[ $line == *","* ]]; then + IFS=',' read -r -a array <<< "$line" + first_half=${array[0]} + # Check the maximum requirement + IFS='<' read -r -a array <<< "$first_half" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, < $required. Try pip install $package and checkinstall again.") printf "\n" CONDA_INSTALL=1 - elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then - #echo -e "Installed minor: ${iminor}\tRequired minor: ${rminor}" - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + else + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -gt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, < $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -gt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, < $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -gt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, < $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + fi + fi + + + second_half=${array[2]} + # Check the minimum requirement + IFS='>=' read -r -a array <<< "$second_half" + required=${array[2]} + required=$(trim ${required}) + + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") printf "\n" CONDA_INSTALL=1 - elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then - #echo -e "Installed patch: ${ipatch}\tRequired patch: ${rpatch}" - echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + else + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + fi + fi + elif [[ $line == *">="* ]]; then + IFS='>=' read -r -a array <<< "$line" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") printf "\n" CONDA_INSTALL=1 else - echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + fi + fi + elif [[ $line == *"=="* ]]; then + IFS='==' read -r -a array <<< "$line" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, == $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, == $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, == $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + fi fi else - echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + IFS='<' read -r -a array <<< "$line" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPATAC requires the Python package, $package, >= $required. Try pip install $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + echo $(warn "WARNING: PEPATAC requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed_version}") + fi + fi fi - fi - done < $REQS + done < $REQS - # Check tool installation - declare -a requiredCommands=("perl" "awk" "grep" "sed" "bedtools" "bowtie2" "fseq" "macs2" "preseq" "samblaster" "samtools" "skewer" "bedToBigBed" "bigWigCat" "wigToBigWig" "Rscript") + # Check tool installation + declare -a requiredCommands=("perl" "awk" "grep" "sed" "bedtools" "bowtie2" "macs3" "preseq" "samblaster" "samtools" "skewer" "bedToBigBed" "bigWigCat" "wigToBigWig" "Rscript") - for cmd in ${requiredCommands[@]}; do - if ! is_executable $cmd; then - echo $(warn "WARNING: Please install $cmd and checkinstall again.") + for cmd in ${requiredCommands[@]}; do + if ! is_executable $cmd; then + echo $(warn "WARNING: Please install $cmd and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + echo -e $(success "SUCCESS: ${cmd}") + fi + done + + ## Check R packages + if ! is_executable "R"; then + echo $(warn "WARNING: PEPATAC requires R 3.5 or greater.\n Please install R>=3.5 and checkinstall again.") printf "\n" - CONDA_INSTALL=1 + exit 1 else - echo -e $(success "SUCCESS: ${cmd}") + rVer=$(R --version 2>&1 | grep 'R version' | awk '{print $3}') + rVer=$(echo "${rVer//.}") + if [ "$rVer" -lt "350" ]; then + echo $(warn "WARNING: PEPATAC requires R 3.5 or greater. Update R and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + fi fi - done - ## Check R packages - if ! is_executable "R"; then - echo $(warn "WARNING: PEPATAC requires R 3.5 or greater.\n Please install R>=3.5 and checkinstall again.") - printf "\n" - exit 1 + declare -a requiredRPackages=("optigrab" "devtools" "GenomicDistributions" "PEPATACr" "data.table" "pepr" "gplots" "grid" "ggplot2" "scales" "IRanges" "GenomicRanges") + for package in ${requiredRPackages[@]}; do + cmd=$(echo "Rscript -e 'library(\"$package\")'") + packageInstalled=$(eval $cmd 2>&1) + if [[ "$packageInstalled" == *Error* ]]; then + echo $(warn "WARNING: Please install the R package, $package, and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + echo -e $(success "SUCCESS: R package: ${package}") + fi + done + + conda deactivate else - rVer=$(R --version 2>&1 | grep 'R version' | awk '{print $3}') - rVer=$(echo "${rVer//.}") - if [ "$rVer" -lt "350" ]; then - echo $(warn "WARNING: PEPATAC requires R 3.5 or greater. Update R and checkinstall again.") - printf "\n" - CONDA_INSTALL=1 - fi + echo -e "The pepatac conda environment is not present." + CONDA_INSTALL=1 fi - - declare -a requiredRPackages=("optigrab" "devtools" "GenomicDistributions" "PEPATACr" "data.table" "pepr" "gplots" "grid" "ggplot2" "scales" "IRanges" "GenomicRanges") - for package in ${requiredRPackages[@]}; do - cmd=$(echo "Rscript -e 'library(\"$package\")'") - packageInstalled=$(eval $cmd 2>&1) - if [[ "$packageInstalled" == *Error* ]]; then - echo $(warn "WARNING: Please install the R package, $package, and checkinstall again.") - printf "\n" - CONDA_INSTALL=1 - else - echo -e $(success "SUCCESS: R package: ${package}") - fi - done - - conda deactivate fi ################################################################################ @@ -417,7 +849,7 @@ else fi if [ -f "$CWD/pipelines/pepatac.py" ]; then - PIPELINE="$CWD/pipelines/pepatac.py" + PIPELINE=$(cat "$CWD/pipelines/pepatac.py") else PIPELINE=$(curl https://raw.githubusercontent.com/databio/pepatac/master/pipelines/pepatac.py) fi diff --git a/containers/pepatac.Dockerfile b/containers/pepatac.Dockerfile old mode 100644 new mode 100755 diff --git a/docs/assets.md b/docs/assets.md old mode 100644 new mode 100755 diff --git a/docs/changelog.md b/docs/changelog.md old mode 100644 new mode 100755 index f0297257..5aef8b7e --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,6 +1,16 @@ # Change log All notable changes to this project will be documented in this file. +## [0.11.0] -- 2023-12-22 + +### Fixed + - adjusted requirements and docs for Looper 1.6.0, Pipestat v0.6.0, and Pypiper 0.14.0 + +### Changed +- pipeline uses MACS3 instead of MACS2 +- output schema is now a pipestat compatible JSON Schema +- collator now outputs with record_identifier name `summary` + ## [0.10.5] -- 2023-07-31 ### Fixed diff --git a/docs/detailed-install.md b/docs/detailed-install.md old mode 100644 new mode 100755 index 29cf6afe..e841c3d9 --- a/docs/detailed-install.md +++ b/docs/detailed-install.md @@ -29,8 +29,8 @@ We'll install each of these pieces of software before moving forward. Let's sta ```console cd tools/ wget https://github.com/arq5x/bedtools2/releases/download/v2.29.2/bedtools-2.29.2.tar.gz -tar -zxvf bedtools-2.29.0.tar.gz -rm bedtools-2.29.0.tar.gz +tar -zxvf bedtools-2.29.2.tar.gz +rm bedtools-2.29.2.tar.gz cd bedtools2 make ``` @@ -50,6 +50,7 @@ rm bowtie2-2.4.1-source.zip cd bowtie2-2.4.1 make ``` +Note: you may need to install `libtbb-dev` if `make` fails, e.g. using `apt install libtbb-dev` Again, let's add `bowtie2` to our `PATH` environment variable: @@ -59,6 +60,14 @@ export PATH="$PATH:/path/to/pepatac_tutorial/tools/bowtie2-2.4.1/" #### preseq The pipeline uses `preseq` to calculate library complexity. Check out the author's [page for more instruction](https://github.com/smithlabcode/preseq). +Note: If receiving the following error later in the tutorial: `preseq: error while loading shared libraries: libgsl.so.0: cannot open shared object file: No such file or directory` +you may need to install `libgsl-dev` using: `apt install libgsl-dev` and either: +1. `export LD_LIBRARY_PATH=/usr/local/lib` +2. link `libgsl.so.0` to an existing `libgsl`, e.g. `libgsl.so.27` + +More info can be found here: +https://www.gnu.org/software/gsl/doc/html/usage.html#shared-libraries + ```console wget http://smithlabresearch.org/downloads/preseq_linux_v2.0.tar.bz2 tar xvfj preseq_linux_v2.0.tar.bz2 @@ -84,7 +93,7 @@ wget https://github.com/samtools/samtools/releases/download/1.10/samtools-1.10.t tar xvfj samtools-1.10.tar.bz2 rm samtools-1.10.tar.bz2 cd samtools-1.10 -/configure +./configure ``` Alternatively, if you do not have the ability to install `samtools` to the default location, you can specify using the `--prefix=/install/destination/dir/` option. [Learn more about the `--prefix` option here](http://samtools.github.io/bcftools/howtos/install.html). ```console @@ -126,7 +135,10 @@ That should do it! Now we'll [install some **optional** packages](tutorial.md#1 `PEPATAC` uses `R` to generate quality control and read/peak annotation plots, so you'll need to have R functional if you want these outputs. We have packaged all the `R` code into a supporting package called [PEPATACr](https://github.com/databio/pepatac/tree/master/PEPATACr). The `PEPATAC` package relies on a few additional packages which can be installed at the command line as follows: +Note: if given error regarding `devtools` try: `apt install r-cran-devtools` before proceeding with installation. +Note: if receiving an error for GenomicDistributionsData_0.0.2.tar.gz, download the file manually and install directly using `install.packages("local/path/to/GenomicDistributionsData_0.0.2.tar.gz", repos=NULL)` ``` +Rscript -e 'install.packages('argparser')' Rscript -e 'install.packages("devtools")' Rscript -e 'devtools::install_github("pepkit/pepr")' Rscript -e 'install.packages("BiocManager")' @@ -187,10 +199,10 @@ export PICARD="/path/to/pepatac_tutorial/tools/picard.jar" To extract files quicker, `PEPATAC` can also utilize `pigz` in place of `gzip` if you have it installed. Let's go ahead and do that now. It's not required, but it can help speed everything up when you have many samples to process. ```console cd /path/to/pepatac_tutorial/tools/ -wget https://zlib.net/pigz/pigz-2.4.tar.gz -tar xvfz pigz-2.4.tar.gz -rm pigz-2.4.tar.gz -cd pigz-2.4/ +wget https://zlib.net/pigz/pigz-2.8.tar.gz +tar xvfz pigz-2.8.tar.gz +rm pigz-2.8.tar.gz +cd pigz-2.8/ make ``` Don't forget to add this to your `PATH` too! diff --git a/docs/reference_peaks.md b/docs/reference_peaks.md old mode 100644 new mode 100755 diff --git a/docs/run-cluster.md b/docs/run-cluster.md index 76fb98c7..9b5bb8db 100644 --- a/docs/run-cluster.md +++ b/docs/run-cluster.md @@ -47,3 +47,10 @@ divvy init $DIVCFG ``` Next, you edit that config file to add in any compute packages you need. `PEPATAC` will then give you access to any of your custom packages with `looper --package `. For complete instructions on how to create a custom compute package, read [how to configure divvy](https://divvy.databio.org/en/latest/configuration/). + +Alternatively, you can specify compute parameters via the CLI: + +```console +looper run examples/test_project/test_config_refgenie.yaml -d \ + --package slurm --compute PARTITION="your_cluster_partition_name" +``` \ No newline at end of file diff --git a/docs/run-conda.md b/docs/run-conda.md old mode 100644 new mode 100755 index 16eedd4d..649b4a10 --- a/docs/run-conda.md +++ b/docs/run-conda.md @@ -42,10 +42,13 @@ To ensure these packages are installed to the `pepatac` `conda` environment, mak conda activate pepatac unset R_LIBS export R_LIBS="$CONDA_PREFIX/lib/R/library" +export R_LIBS_USER="$CONDA_PREFIX/lib/R/library" ``` From the `pepatac/` directory, open `R` and install the following packages: +Note: if receiving an error for GenomicDistributionsData_0.0.2.tar.gz, download the file manually and install directly using `install.packages("local/path/to/GenomicDistributionsData_0.0.2.tar.gz", repos=NULL)` ```{R} +install.packages('argparser') install.packages("optigrab") devtools::install_github("databio/GenomicDistributions") install.packages("http://big.databio.org/GenomicDistributionsData/GenomicDistributionsData_0.0.2.tar.gz", repos=NULL) diff --git a/docs/run-directly.md b/docs/run-directly.md old mode 100644 new mode 100755 diff --git a/docs/tutorial.md b/docs/tutorial.md old mode 100644 new mode 100755 index 2ced155a..3f0c08b1 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -199,6 +199,7 @@ looper runp examples/tutorial/tutorial_refgenie.yaml This should take about a minute on the tutorial samples and will generate a `summary/` directory containing project level output in the parent project directory. You can [browse the tutorial data](browse_output.md) to see the example output. ## 7: Generate an `HTML` report using `looper` +Note: beginning with Looper 1.5.0, pipestat configuration is required to use `looper report`, please see here [Configuring Looper to use pipestat](https://looper.databio.org/en/dev/pipestat/) Let's take full advantage of `looper` and generate a pipeline `HTML` report that makes all our results easy to view and browse. If you'd like to skip right to the results and see what it looks like, [check out the tutorial results](files/examples/tutorial/PEPATAC_tutorial_summary.html). Otherwise, let's generate a report ourselves. @@ -376,8 +377,14 @@ cd ../tools/pepatac/ ``` Now, we'll use `looper` to run the sample locally. ```console -looper run examples/tutorial/tutorial_refgenie.yaml +looper run --looper-config examples/tutorial/.looper_tutorial_refgenie.yaml ``` +Note: if using Looper<1.5.0, the run method is via: +```console +looper run examples/tutorial/tutorial_refgenie.yaml +``` + + Congratulations! Your first samples should be running through the pipeline now. For both samples to run locally should take 30-50 minutes in total depending on your system. After the pipeline is finished, we can look through the output directory together. We've provided an example breakdown of just such a directory in the [browse output page](browse_output.md). @@ -385,6 +392,10 @@ After the pipeline is finished, we can look through the output directory togethe ## 6: Use `looper` to run the project level pipeline The pipeline also includes project level analyses that work on all samples concurrently. This allows for analyses that require output produced by individual sample analysis. We'll run the project analysis much like we run the sample analysis: ```console +looper runp --looper-config examples/tutorial/.looper_tutorial_refgenie.yaml +``` +Note: if using Looper<1.5.0, the run method is via: +```console looper runp examples/tutorial/tutorial_refgenie.yaml ``` This should take about a minute on the tutorial samples and will generate a `summary/` directory containing project level output in the parent project directory. You can [browse the tutorial data](browse_output.md) to see the example output. diff --git a/docs/usage.md b/docs/usage.md old mode 100644 new mode 100755 diff --git a/examples/tutorial/.looper_tutorial_refgenie.yaml b/examples/tutorial/.looper_tutorial_refgenie.yaml new file mode 100644 index 00000000..e3448e21 --- /dev/null +++ b/examples/tutorial/.looper_tutorial_refgenie.yaml @@ -0,0 +1,10 @@ +name: PEPATAC_tutorial +pep_config: tutorial_refgenie_project_config.yaml + +output_dir: "${TUTORIAL}/processed/" +pipeline_interfaces: + sample: ["${TUTORIAL}/tools/pepatac/sample_pipeline_interface.yaml"] + project: ["${TUTORIAL}/tools/pepatac/project_pipeline_interface.yaml"] + +pipestat: + results_file_path: "${TUTORIAL}/processed/results_pipeline/{record_identifier}/stats.yaml" diff --git a/examples/tutorial/tutorial_refgenie_project_config.yaml b/examples/tutorial/tutorial_refgenie_project_config.yaml new file mode 100644 index 00000000..a45528df --- /dev/null +++ b/examples/tutorial/tutorial_refgenie_project_config.yaml @@ -0,0 +1,23 @@ +name: PEPATAC_tutorial +pep_version: 2.0.0 +sample_table: tutorial.csv + +sample_modifiers: + derive: + attributes: [read1, read2] + sources: + # Obtain tutorial data from http://big.databio.org/pepatac/ then set + # path to your local saved files + R1: "${TUTORIAL}/tools/pepatac/examples/data/{sample_name}_r1.fastq.gz" + R2: "${TUTORIAL}/tools/pepatac/examples/data/{sample_name}_r2.fastq.gz" + imply: + - if: + organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] + then: + genome: hg38 + prealignment_names: ["rCRSd"] + deduplicator: samblaster # Default. [options: picard] + trimmer: skewer # Default. [options: pyadapt, trimmomatic] + peak_type: fixed # Default. [options: variable] + extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream. + frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run diff --git a/pepatac_input_schema.yaml b/pepatac_input_schema.yaml old mode 100644 new mode 100755 index 7472b587..5a40779e --- a/pepatac_input_schema.yaml +++ b/pepatac_input_schema.yaml @@ -46,7 +46,7 @@ properties: peak_caller: type: string description: "Specify the peak calling tool" - enum: ["fseq", "genrich", "hmmratac", "homer", "macs2"] + enum: ["fseq", "genrich", "hmmratac", "homer", "macs3"] genome_size: type: string description: "MACS2 effective genome size" @@ -98,6 +98,7 @@ properties: required: - sample_name - protocol + - read_type - read1 - genome required_files: diff --git a/pepatac_output_schema.yaml b/pepatac_output_schema.yaml index 19840b00..3c3626c8 100644 --- a/pepatac_output_schema.yaml +++ b/pepatac_output_schema.yaml @@ -1,67 +1,359 @@ +title: An example Pipestat output schema description: objects produced by PEPATAC pipeline. +type: object properties: + pipeline_name: PEPATAC samples: - type: array - items: - type: object - properties: - smooth_bw: - path: "aligned_{genome}/{sample_name}_smooth.bw" - type: string - description: "Smoothed signal track" - exact_bw: - path: "aligned_{genome}_exact/{sample_name}_exact.bw" - type: string - description: "Nucleotide-resolution signal track" - aligned_bam: - path: "aligned_{genome}/{sample_name}_sort_dedup.bam" - type: string - description: "Coordinate sorted deduplicated, aligned BAM file" - peak_file: - path: "peak_calling_{genome}/{sample_name}_peaks_normalized.narrowPeak" - type: string - description: "Sample peak file" - coverage_file: - path: "peak_calling_{genome}/{sample_name}_peaks_coverage.bed.gz" - type: string - description: "Sample peak coverage table" - summits_bed: - path: "peak_calling_{genome}/{sample_name}_summits.bed" - type: string - description: "Peak summit file" - alignment_percent_file: - title: "Alignment percent file" - description: "Plots percent of total alignment to all pre-alignments and primary genome." - thumbnail_path: "summary/{name}_alignmentPercent.png" - path: "summary/{name}_alignmentPercent.pdf" - type: image - alignment_raw_file: - title: "Alignment raw file" - description: "Plots raw alignment rates to all pre-alignments and primary genome." - thumbnail_path: "summary/{name}_alignmentRaw.png" - path: "summary/{name}_alignmentRaw.pdf" - type: image - tss_file: - title: "TSS enrichment file" - description: "Plots TSS scores for each sample." - thumbnail_path: "summary/{name}_TSSEnrichment.png" - path: "summary/{name}_TSSEnrichment.pdf" - type: image - library_complexity_file: - title: "Library complexity file" - description: "Plots each sample's library complexity on a single plot." - thumbnail_path: "summary/{name}_libComplexity.png" - path: "summary/{name}_libComplexity.pdf" - type: image - consensus_peaks_file: - title: "Consensus peaks file" - description: "A set of consensus peaks across samples." - thumbnail_path: "summary/{name}_*_consensusPeaks.png" - path: "summary/{name}_*_consensusPeaks.narrowPeak" - type: string - counts_table: - title: "Project peak coverage file" - description: "Project peak coverages: chr_start_end X sample" - thumbnail_path: "summary/{name}_*_peaks_coverage.png" - path: "summary/{name}_*_peaks_coverage.tsv" - type: string + type: object + properties: + smooth_bw: + type: string + description: "Smoothed signal track" + exact_bw: + type: string + description: "Nucleotide-resolution signal track" + aligned_bam: + type: string + description: "Coordinate sorted deduplicated, aligned BAM file" + peak_file: + type: string + description: "Sample peak file" + coverage_file: + type: string + description: "Sample peak coverage table" + summits_bed: + type: string + description: "Peak summit file" + File_mb: + type: number + description: "size of file" + Read_type: + type: string + description: "read_type" + Genome: + type: string + description: "e.g. hg38" + Raw_reads: + type: string + description: "raw reads" + Fastq_reads: + type: number + description: "fastq_reads" + Trimmed_reads: + type: number + description: "trimmed_reads" + Trimmed_loss_rate: + type: number + description: "trimmed loss rate" + FastQC report r1: + title: "FastQC report r1" + description: "FastQC report r1" + type: object + object_type: file + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + FastQC report r2: + title: "FastQC report r2" + description: "FastQC report r2" + type: object + object_type: file + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + Aligned_reads_rCRSd: + type: number + description: "Aligned_reads_rCRSd" + Alignment_rate_rCRSd: + type: number + description: "Alignment_rate_rCRSd" + Mapped_reads: + type: string + description: "mapped_reads" + QC_filtered_reads: + type: number + description: "QC_filtered_reads" + Aligned_reads: + type: string + description: "Aligned_reads" + Alignment_rate: + type: number + description: "Alignment_rate" + Total_efficiency: + type: number + description: "Total_efficiency" + Mitochondrial_reads: + type: number + description: "Mitochondrial_reads" + NRF: + type: number + description: "NRF" + PBC1: + type: number + description: "PBC1" + PBC2: + type: number + description: "PBC1" + Unmapped_reads: + type: number + description: "Unmapped_reads" + Duplicate_reads: + type: string + description: "Duplicate_reads" + Dedup_aligned_reads: + type: number + description: "Dedup_aligned_reads" + Dedup_alignment_rate: + type: number + description: "Dedup_alignment_rate" + Dedup_total_efficiency: + type: number + description: "Dedup_total_efficiency" + NFR_frac: + type: number + description: "NFR_frac" + mono_frac: + type: number + description: "mono_frac" + di_frac: + type: number + description: "di_frac" + tri_frac: + type: number + description: "tri_frac" + poly_frac: + type: number + description: "poly_frac" + Read_length: + type: number + description: "Read_length" + Genome_size: + type: number + description: "Read_length" + Frac_exp_unique_at_10M: + type: number + description: "Frac_exp_unique_at_10M" + TSS_score: + type: number + description: "TSS_score" + Fragment distribution: + title: "Fragment distribution" + description: "Fragment distribution" + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + Peak_count: + type: number + description: "Peak_count" + FRiP: + type: number + description: "FRiP" + Peak chromosome distribution: + title: "Peak chromosome distribution" + description: "Peak chromosome distribution" + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + TSS distance distribution: + title: "TSS distance distribution" + description: "TSS distance distribution" + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + Peak partition distribution: + title: "Peak partition distribution" + description: "Peak partition distribution" + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + cFRiF: + title: "cFRiF" + description: "cFRiF" + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + FRiF: + title: "FRiF" + description: "FRiF" + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + Time: + type: string + description: "time" + Success: + type: string + description: "success" + project: + type: object + properties: + alignment_percent_file: + title: "Alignment percent file" + description: "Plots percent of total alignment to all pre-alignments and primary genome." + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + alignment_raw_file: + title: "Alignment raw file" + description: "Plots raw alignment rates to all pre-alignments and primary genome." + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + TSS_enrichment: + title: "TSS enrichment file" + description: "Plots TSS scores for each sample." + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + Library_complexity: + title: "Library complexity file" + description: "Plots each sample's library complexity on a single plot." + type: object + object_type: image + properties: + path: + type: string + thumbnail_path: + type: string + title: + type: string + required: + - path + - thumbnail_path + - title + consensus_peaks_file: + title: "consesus peak file" + description: "A set of consensus peaks across samples." + type: object + object_type: file + properties: + path: + type: string + title: + type: string + thumbnail_path: + type: string + required: + - path + - title + counts_table: + title: "Project peak coverage file" + description: "Project peak coverages: chr_start_end X sample" + type: object + object_type: file + properties: + path: + type: string + title: + type: string + thumbnail_path: + type: string + required: + - path + - title \ No newline at end of file diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 31615429..2e0ac284 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -5,7 +5,7 @@ __author__ = ["Jin Xu", "Nathan Sheffield", "Jason Smith"] __email__ = "jasonsmith@virginia.edu" -__version__ = "0.10.4" +__version__ = "0.11.0" from argparse import ArgumentParser @@ -24,7 +24,7 @@ TOOLS_FOLDER = "tools" ANNO_FOLDER = "anno" ALIGNERS = ["bowtie2", "bwa"] -PEAK_CALLERS = ["fseq", "fseq2", "genrich", "hmmratac", "homer", "macs2"] +PEAK_CALLERS = ["fseq", "fseq2", "genrich", "hmmratac", "homer", "macs3"] PEAK_TYPES = [ "fixed", "variable"] DEDUPLICATORS = ["picard", "samblaster", "samtools"] TRIMMERS = ["trimmomatic", "pyadapt", "skewer"] @@ -57,7 +57,7 @@ def parse_arguments(): help="Name of deduplicator program.") parser.add_argument("--peak-caller", dest="peak_caller", type=str.lower, - default="macs2", choices=PEAK_CALLERS, + default="macs3", choices=PEAK_CALLERS, help="Name of peak caller.") parser.add_argument("-gs", "--genome-size", default="2.7e9", type=str.lower, @@ -68,7 +68,7 @@ def parse_arguments(): parser.add_argument("--peak-type", default="fixed", dest="peak_type", choices=PEAK_TYPES, type=str.lower, help="Call variable or fixed width peaks.\n" - "Fixed width requires MACS2.") + "Fixed width requires MACS3.") parser.add_argument("--extend", default=250, dest="extend", type=int, @@ -511,7 +511,7 @@ def main(): os.path.join(args.output_parent, args.sample_name)) global pm pm = pypiper.PipelineManager( - name="PEPATAC", outfolder=outfolder, args=args, version=__version__) + name="PEPATAC", outfolder=outfolder,pipestat_sample_name=args.sample_name, args=args, version=__version__) global ngstk ngstk = pypiper.NGSTk(pm=pm) @@ -530,8 +530,8 @@ def main(): "pigz", "bwa"] # Confirm compatible peak calling settings - if args.peak_type == "fixed" and not args.peak_caller == "macs2": - err_msg = ("Must use MACS2 with `--peak-type fixed` width peaks. " + + if args.peak_type == "fixed" and not args.peak_caller == "macs3": + err_msg = ("Must use MACS3 with `--peak-type fixed` width peaks. " + "Either change the " + "`--peak-caller {}` or ".format(PEAK_CALLERS) + "use `--peak-type variable`.") @@ -1929,7 +1929,7 @@ def report_peak_count(): else: if args.peak_caller == "fseq": if args.peak_type == "fixed": - err_msg = "Must use MACS2 when calling fixed width peaks." + err_msg = "Must use MACS3 when calling fixed width peaks." pm.fail_pipeline(RuntimeError(err_msg)) else: fseq_cmd_chunks = [ @@ -1953,7 +1953,7 @@ def report_peak_count(): cmd = [fseq_cmd, merge_chrom_peaks_files] elif args.peak_caller == "fseq2": if args.peak_type == "fixed": - err_msg = "Must use MACS2 when calling fixed width peaks." + err_msg = "Must use MACS3 when calling fixed width peaks." pm.fail_pipeline(RuntimeError(err_msg)) else: fseq_cmd_chunks = [ @@ -1972,7 +1972,7 @@ def report_peak_count(): cmd = fseq_cmd elif args.peak_caller == "hmmratac" and args.paired_end: if args.peak_type == "fixed": - err_msg = "Must use MACS2 when calling fixed width peaks." + err_msg = "Must use MACS3 when calling fixed width peaks." pm.fail_pipeline(RuntimeError(err_msg)) else: gapped_peak_file = os.path.join(peak_folder, args.sample_name + @@ -2010,20 +2010,20 @@ def report_peak_count(): cmd3 += os.path.join(peak_folder, args.sample_name) cmd = cmd3 elif args.peak_caller == "hmmratac" and not args.paired_end: - pm.info("HMMRATAC requires paired-end data. Defaulting to MACS2") + pm.info("HMMRATAC requires paired-end data. Defaulting to MACS3") cmd_base = [ - "{} callpeak".format(tools.macs2), + "{} callpeak".format(tools.macs3), ("-t", peak_input_file), ("-f", "BED"), ("--outdir", peak_folder), ("-n", args.sample_name), ("-g", args.genome_size) ] - cmd_base.extend(param.macs2.params.split()) + cmd_base.extend(param.macs3.params.split()) cmd = build_command(cmd_base) elif args.peak_caller == "genrich": if args.peak_type == "fixed": - err_msg = "Must use MACS2 when calling fixed width peaks." + err_msg = "Must use MACS3 when calling fixed width peaks." pm.fail_pipeline(RuntimeError(err_msg)) else: cmd1 = (tools.samtools + " sort -n " + rmdup_bam + " > " + @@ -2037,7 +2037,7 @@ def report_peak_count(): cmd = [cmd1, cmd2] elif args.peak_caller == "homer": if args.peak_type == "fixed": - err_msg = "Must use MACS2 when calling fixed width peaks." + err_msg = "Must use MACS3 when calling fixed width peaks." pm.fail_pipeline(RuntimeError(err_msg)) else: tag_directory = os.path.join(peak_folder, "HOMER_tags") @@ -2059,10 +2059,10 @@ def report_peak_count(): pm.clean_add(tag_directory) cmd = [cmd1, cmd2, cmd3] else: - # MACS2 + # MACS3 # Note: required input file is non-positional ("treatment" file -t) cmd_base = [ - "{} callpeak".format(tools.macs2), + "{} callpeak".format(tools.macs3), ("-t", peak_input_file), ("-f", "BED"), ("--outdir", peak_folder), @@ -2070,11 +2070,11 @@ def report_peak_count(): ("-g", args.genome_size) ] if args.peak_type == "fixed": - cmd_base.extend(param.macs2.params.split()) + cmd_base.extend(param.macs3.params.split()) elif args.peak_type == "variable": - cmd_base.extend(param.macs2_variable.params.split()) + cmd_base.extend(param.macs3_variable.params.split()) else: - cmd_base.extend(param.macs2.params.split()) + cmd_base.extend(param.macs3.params.split()) cmd = build_command(cmd_base) # Call peaks and report peak count. @@ -2138,7 +2138,7 @@ def report_peak_count(): fixed_peak_file = os.path.join(peak_folder, args.sample_name + "_peaks_fixedWidth.narrowPeak") # If using fixed peaks, extend from summit - if args.peak_type == "fixed" and args.peak_caller == "macs2": + if args.peak_type == "fixed" and args.peak_caller == "macs3": temp = tempfile.NamedTemporaryFile(dir=peak_folder, delete=False) # extend peaks from summit by 'extend' # start extend from center of peak diff --git a/pipelines/pepatac.yaml b/pipelines/pepatac.yaml old mode 100644 new mode 100755 index b8250748..9719cc3b --- a/pipelines/pepatac.yaml +++ b/pipelines/pepatac.yaml @@ -9,7 +9,7 @@ tools: # absolute paths to required tools bedtools: bedtools bowtie2: bowtie2 fastqc: fastqc - macs2: macs2 + macs3: macs3 preseq: preseq samblaster: samblaster skewer: skewer @@ -68,13 +68,13 @@ parameters: # parameters passed to bioinformatic tools, subclassed by tool samtools: params: '-q 10' # -q: skip alignments with MAPQ < 10. - macs2: + macs3: params: '--shift -75 --extsize 150 --nomodel --call-summits --nolambda --keep-dup all -p 0.01' # -f: Format of tag file. # -q: The qvalue (minimum FDR) cutoff to call significant regions. # --shift: Assign an arbitrary shift in bp. See MACS documentation. # --nomodel: Will bybass building the shifting model. - macs2_variable: + macs3_variable: params: '-f BED -q 0.01 --shift 0 --nomodel' # -f: Format of tag file. # -q: The qvalue (minimum FDR) cutoff to call significant regions. diff --git a/pipelines/pepatac_collator.py b/pipelines/pepatac_collator.py index 2c085f6d..e4ad73a1 100755 --- a/pipelines/pepatac_collator.py +++ b/pipelines/pepatac_collator.py @@ -5,12 +5,15 @@ __author__ = ["Jason Smith", "Michal Stolarczyk"] __email__ = "jasonsmith@virginia.edu" -__version__ = "0.0.4" +__version__ = "0.0.6" from argparse import ArgumentParser import os import sys import pypiper +import peppy +from peppy.utils import load_yaml +import yaml from ubiquerg import VersionInHelpParser def tool_path(tool_name): @@ -33,7 +36,7 @@ def parse_arguments(): """ parser = VersionInHelpParser(prog="PEPATAC_collator", description='PEPATAC collator' , version=__version__) - parser = pypiper.add_pypiper_args(parser, groups=['pypiper', 'looper']) + parser = pypiper.add_pypiper_args(parser, groups=['pypiper', 'looper', 'common']) parser.add_argument("-n", "--name", help="Name of the project to use.", type=str) parser.add_argument("-r", "--results", @@ -66,11 +69,38 @@ def parse_arguments(): def main(): args = parse_arguments() + outfolder = os.path.abspath(os.path.join(args.output_parent, "summary")) - pm = pypiper.PipelineManager(name="PEPATAC_collator", outfolder=outfolder, + pm = pypiper.PipelineManager(name="PEPATAC", outfolder=outfolder, + pipestat_sample_name="summary", + pipestat_pipeline_type="project", args=args, version=__version__) + pm.debug(f"\nargs: {args}\n") + + project = peppy.Project(args.config_file) + project_stats_file = os.path.join(args.output_parent, f"{project.name}_stats_summary.yaml") + stats_yaml_files = [] + sample_names = [] + for sample in project.sample_table.sample_name: + pm.debug(f"sample name: {sample}") + sample_names.append(sample) + stats_yaml_files.append(os.path.join(args.results, sample, "stats.yaml")) + num_samples = len(sample_names) + yaml_dict = load_yaml(stats_yaml_files[0]) + if num_samples > 1: + sample_names = sample_names[1:] + stats_yaml_files = stats_yaml_files[1:] + for i in range(len(stats_yaml_files)): + pm.debug(f"i: {i}") + sample_name = sample_names[i] + yaml_tmp = load_yaml(stats_yaml_files[i]) + yaml_dict['PEPATAC']['sample'][sample_name] = yaml_tmp['PEPATAC']['sample'][sample_name] + with open(project_stats_file, 'w') as file: + yaml.dump(yaml_dict, file) + print(f"Summary (n={num_samples}: {project_stats_file})") + cmd = (f"Rscript {tool_path('PEPATAC_summarizer.R')} " f"{args.config_file} {args.output_parent} " f"{args.results} {args.cutoff} {args.min_score} {args.min_olap}") @@ -86,13 +116,54 @@ def main(): cmd += " --normalized" complexity_file = os.path.join( - outfolder, "{name}_libComplexity.pdf".format(name=args.name)) - consensus_peaks_file = os.path.join( - outfolder, "{name}_*_consensusPeaks.narrowPeak".format(name=args.name)) - peak_coverage_file = os.path.join( - outfolder, "{name}_peaks_coverage.tsv".format(name=args.name)) + outfolder, f"{args.name}_libComplexity.pdf") + complexity_thumbnail = os.path.join( + outfolder, f"{args.name}_libComplexity.png") + + alignment_percent_file = os.path.join( + outfolder, f"{args.name}_alignmentPercent.pdf") + alignment_percent_thumbnail = os.path.join( + outfolder, f"{args.name}_alignmentPercent.png") + + alignment_raw_file = os.path.join( + outfolder, f"{args.name}_alignmentRaw.pdf") + alignment_raw_thumbnail = os.path.join( + outfolder, f"{args.name}_alignmentRaw.png") + + TSS_enrichment_file = os.path.join( + outfolder, f"{args.name}_TSSEnrichment.pdf") + TSS_enrichment_thumbnail = os.path.join( + outfolder, f"{args.name}_TSSEnrichment.png") + + for genome in project.sample_table.genome.unique(): + pm.debug(f"genome: {genome}") + consensus_peaks_file = os.path.join( + outfolder, f"{args.name}_{genome}_consensusPeaks.narrowPeak") + consensus_peaks_thumbnail = os.path.join( + outfolder, f"{args.name}_{genome}_consensusPeaks.png") + pm.debug(f"consensus_peaks_file: {consensus_peaks_file}") + peak_coverage_file = os.path.join( + outfolder, f"{args.name}_{genome}_peaks_coverage.tsv") + peak_coverage_thumbnail = os.path.join( + outfolder, f"{args.name}_{genome}_peaks_coverage.png") + pm.debug(f"peak_coverage_file(s): {peak_coverage_file}") pm.run(cmd, [complexity_file, consensus_peaks_file, peak_coverage_file]) + + # For pypiper's report object, anchor image = thumbnail path + pm.report_object("Library_complexity", complexity_file, + anchor_image=complexity_thumbnail) + pm.report_object("alignment_percent_file", alignment_percent_file, + anchor_image=alignment_percent_thumbnail) + pm.report_object("alignment_raw_file", alignment_raw_file, + anchor_image=alignment_raw_thumbnail) + pm.report_object("TSS_enrichment", TSS_enrichment_file, + anchor_image=TSS_enrichment_thumbnail) + pm.report_object("consensus_peaks_file", consensus_peaks_file, + anchor_image=consensus_peaks_thumbnail) + pm.report_object("counts_table", peak_coverage_file, + anchor_image=peak_coverage_thumbnail) + pm.stop_pipeline() diff --git a/project_pipeline_interface.yaml b/project_pipeline_interface.yaml index 42f0e554..febeda8d 100755 --- a/project_pipeline_interface.yaml +++ b/project_pipeline_interface.yaml @@ -1,11 +1,11 @@ -pipeline_name: PEPATAC_summarizer +pipeline_name: PEPATAC pipeline_type: project path: pipelines/pepatac_collator.py output_schema: pepatac_output_schema.yaml command_template: > {pipeline.path} --config {looper.pep_config} - -O {looper.output_dir} + -O {looper.results_subdir} -P {compute.cores} -M {compute.mem} -n {project.name} diff --git a/requirements-conda.yml b/requirements-conda.yml old mode 100644 new mode 100755 diff --git a/requirements.txt b/requirements.txt old mode 100644 new mode 100755 index c6debba6..0d49c1fd --- a/requirements.txt +++ b/requirements.txt @@ -1,29 +1,35 @@ -attmap>=0.13.0 +attmap>=0.13.2 bio>=0.2.4 +blosc2>=2.0.0 +clyent==1.2.1 codecov>=2.0 colorama>=0.3.9 Cython>=0.29 cykhash>=1.0.2 divvy>=0.6.0 -eido>=0.2.0 +eido>=0.2.2 +FuzzyTM>=0.4.0 #fseq2>=2.0.2 hypothesis==4.38.0 jinja2 jsonschema>=3.0.1 -logmuse>=0.2.6 -looper>=1.4.2 -MACS2>=2.2.7.1 -numpy>=1.21 -oyaml +logmuse>=0.2.7 +looper>=1.6.0 +MACS2>=2.2.9.1 +MACS3>=3.0.0b1 +numpy<1.25,>=1.21 +oyaml>=1.0 pararead>=0.7.0 pandas>=1.4 -peppy>=0.31.1 -piper>=0.12.3 +piper>=0.14.0 +pipestat>=0.6.0 psutil>=5.8 +pyqt5<5.16 +pyqtwebengine<5.16 pysam>=0.16 python-Levenshtein>=0.12 pyyaml>=3.13 refgenconf>=0.12.2 -#refgenie -ubiquerg>=0.6.1 -yacman>=0.8.4 +requests_mock +ubiquerg>=0.6.3 +yacman>=0.9.2 diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index 17de28ad..9b9f33ae 100755 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -23,7 +23,7 @@ command_template: > {% if sample.prealignment_index is defined %} --prealignment-index { sample.prealignment_index } {% endif %} {% if sample.prealignment_names is defined %} {% if aligner == "bowtie2" or sample.aligner == "bowtie2" %} --prealignment-index {% for p in sample.prealignment_names %} { p ~ '=' ~ refgenie[p].bowtie2_index.dir } {% endfor %} {% else %} --prealignment-index {% for p in sample.prealignment_names %} { p ~ '=' ~ refgenie[p].bwa_index.dir } {% endfor %} {% endif %} {% endif %} {% if sample.deduplicator is defined %} --deduplicator { sample.deduplicator } {% endif %} - {% if sample.peak_caller is defined %} --peak-caller { sample.peak_caller } {% else %} --peak-caller "macs2" {% endif %} + {% if sample.peak_caller is defined %} --peak-caller { sample.peak_caller } {% else %} --peak-caller "macs3" {% endif %} {% if sample.peak_type is defined %} --peak-type { sample.peak_type } {% else %} --peak-type "fixed" {% endif %} {% if sample.extend is defined %} --extend { sample.extend } {% else %} --extend 250 {% endif %} {% if sample.genome_size is defined %} --genome-size { sample.genome_size } {% else %} --genome-size "2.7e9" {% endif %} diff --git a/tests/microtest.csv b/tests/microtest.csv new file mode 100644 index 00000000..16906447 --- /dev/null +++ b/tests/microtest.csv @@ -0,0 +1,2 @@ +sample_name,protocol,read1,read2 +test_sample,ATAC-seq, diff --git a/tests/microtest.yaml b/tests/microtest.yaml new file mode 100644 index 00000000..607b8edc --- /dev/null +++ b/tests/microtest.yaml @@ -0,0 +1,18 @@ +pep_version: "2.1.0" +sample_table: microtest.csv + +looper: + output_dir: microtest + +sample_modifiers: + append: + genome: hg38_chr22 + pipeline_interfaces: ../sample_pipeline_interface.yaml + read1: R1 + read2: R2 + read_type: PAIRED + derive: + attributes: ["read1", "read2"] + sources: + R1: "${CODE}/microtest/data/atac-seq_PE_R1.fastq.gz" + R2: "${CODE}/microtest/data/atac-seq_PE_R2.fastq.gz" diff --git a/tools/PEPATAC.R b/tools/PEPATAC.R index 256444d1..23eb7387 100755 --- a/tools/PEPATAC.R +++ b/tools/PEPATAC.R @@ -6,7 +6,7 @@ version <- 0.6 ##### Load dependencies ##### -required_libraries <- c("PEPATACr") +required_libraries <- c("PEPATACr", "optigrab") for (i in required_libraries) { loadLibrary <- tryCatch ( { @@ -16,8 +16,6 @@ for (i in required_libraries) { error=function(e) { message("Error: Install the \"", i, "\" R package before proceeding.") - message('i.e. devtools::install_github("databio/pepatac",', - ' subdir="PEPATACr")') return(NULL) }, warning=function(e) { @@ -322,9 +320,15 @@ if (is.na(subcmd) || grepl("/R", subcmd)) { description="Output file name.") numArgs <- length(opt_get_args()) argGap <- ifelse(reads, 13, 12) - bed <- opt_get(name = c("bed", "b"), required=TRUE, - n=(numArgs - argGap), - description="Coverage file(s).") + + # Manually parse --bed argument + opt_args_dt <- data.table::as.data.table(opt_get_args()) + bed_start <- grep("--bed", opt_args_dt$V1) + 1 + bed_end <- nrow(opt_args_dt) + # bed <- opt_get(name = c("bed", "b"), required=TRUE, + # n=(numArgs - argGap), + # description="Coverage file(s).") + bed <- opt_args_dt$V1[bed_start:bed_end] p <- plotFRiF(sample_name = sample_name, num_reads = as.numeric(num_reads), diff --git a/tools/PEPATAC_summarizer.R b/tools/PEPATAC_summarizer.R index aea77d75..07d17cb1 100755 --- a/tools/PEPATAC_summarizer.R +++ b/tools/PEPATAC_summarizer.R @@ -11,11 +11,11 @@ # )) # # Created: 5/18/17 -# Last updated: 05/10/2021 +# Last updated: 12/18/2023 # # usage: Rscript /path/to/Rscript/PEPATAC_summarizer.R # /path/to/project_config.yaml -# Depends: R (>= 3.5.1) +# Depends: R (>= 4.0) # Imports: PEPATACr, argparser ############################################################################### ##### LOAD ARGUMENTPARSER ##### @@ -108,10 +108,11 @@ pep <- argv$config # Load the project prj <- invisible(suppressWarnings(pepr::Project(pep))) # Convenience -project_name <- config(prj)$name +project_name <- pepr::config(prj)$name project_samples <- pepr::sampleTable(prj)$sample_name -sample_table <- data.table(sample_name=pepr::sampleTable(prj)$sample_name, - genome=pepr::sampleTable(prj)$genome) +sample_table <- data.table::data.table( + sample_name=pepr::sampleTable(prj)$sample_name, + genome=pepr::sampleTable(prj)$genome) # Set the output directory summary_dir <- suppressMessages(file.path(argv$output, "summary")) @@ -130,19 +131,6 @@ if (dir.exists(argv$results)) { quit() } - -# Generate stats summary -stats <- PEPATACr::createStatsSummary(project_samples, results_subdir) -if (nrow(stats) == 0) { - quit() -} -project_stats_file <- file.path(argv$output, - paste0(project_name, '_stats_summary.tsv')) -message(sprintf("Summary (n=%s): %s", - length(unique(stats$sample_name)), project_stats_file)) -fwrite(stats, project_stats_file, sep="\t", col.names=TRUE) - - # Generate assets assets <- PEPATACr::createAssetsSummary(project_samples, results_subdir) if (nrow(assets) == 0) { @@ -152,7 +140,7 @@ project_assets_file <- file.path(argv$output, paste0(project_name, '_assets_summary.tsv')) message(sprintf("Summary (n=%s): %s", length(unique(assets$sample_name)), project_assets_file)) -fwrite(assets, project_assets_file, sep="\t", col.names=FALSE) +data.table::fwrite(assets, project_assets_file, sep="\t", col.names=FALSE) # Produce project summary plots @@ -189,15 +177,15 @@ if (!file.exists(complexity_path) || argv$new_start) { read_length = 0, real_counts_path = rcSub, ignore_unique = FALSE) - output_file <- file.path(summary_dir, + output_pdf <- file.path(summary_dir, paste0(project_name, "_libComplexity.pdf")) - pdf(file = output_file, width= 10, height = 7, useDingbats=F) + pdf(file = output_pdf, width= 10, height = 7, useDingbats=F) suppressWarnings(print(p)) invisible(dev.off()) - output_file <- file.path(summary_dir, + output_png <- file.path(summary_dir, paste0(project_name, "_libComplexity.png")) - png(filename = output_file, width = 686, height = 480) + png(filename = output_png, width = 686, height = 480) suppressWarnings(print(p)) invisible(dev.off()) } else { diff --git a/tools/bamSitesToWig.py b/tools/bamSitesToWig.py index bb365d57..f47e4712 100755 --- a/tools/bamSitesToWig.py +++ b/tools/bamSitesToWig.py @@ -99,27 +99,27 @@ def __call__(self, chrom): reads = self.fetch_chunk(chrom) chromOutFile = self._tempf(chrom.replace("|","_")) + chromOutFileWig = chromOutFile + ".wig" chromOutFileBw = chromOutFile + ".bw" + chromOutFileWigSm = chromOutFile + "_smooth.wig" chromOutFileBwSm = chromOutFile + "_smooth.bw" tmpFile = chromOutFile + "_cuts.txt" cutsToWig = os.path.join(os.path.dirname(__file__), "cutsToWig.pl") cmd1 = ("sort -n | perl " + cutsToWig + " " + str(chrom_size) + - " " + str(self.variable_step) + " " + str(self.scale)) - cmd2 = ("wigToBigWig -clip -fixedSummaries -keepAllChromosomes stdin " + + " " + str(self.variable_step) + " " + str(self.scale) + + " > " + chromOutFileWig) + cmd2 = ("wigToBigWig -clip -fixedSummaries -keepAllChromosomes " + + chromOutFileWig + " " + self.chrom_sizes_file + " " + chromOutFileBw) _LOGGER.debug(" cutsToWigProcess: " + cmd1) _LOGGER.debug(" wigToBigWigProcess: " + cmd2) if self.exactbw: cutsToWigProcess = subprocess.Popen(cmd1, shell=True, - stdin=subprocess.PIPE, stdout=subprocess.PIPE) - wigToBigWigProcess = subprocess.Popen( - ['wigToBigWig', '-clip', '-fixedSummaries', - '-keepAllChromosomes', 'stdin', - self.chrom_sizes_file, chromOutFileBw], - stdin=cutsToWigProcess.stdout) + stdin=subprocess.PIPE) + if self.smoothbw: cutsToWigSm = os.path.join(os.path.dirname(__file__), @@ -127,17 +127,13 @@ def __call__(self, chrom): cmd1 = ("sort -n | tee " + tmpFile + " | perl " + cutsToWigSm + " " + str(chrom_size) + " " + str(self.smooth_length) + " " + str(self.step_size) + " " + str(self.variable_step) + - " " + str(self.scale)) + " " + str(self.scale) + " > " + chromOutFileWigSm) cmd2 = ("wigToBigWig -clip -fixedSummaries " + - "-keepAllChromosomes stdin " + self.chrom_sizes_file + - " " + chromOutFileBwSm) + "-keepAllChromosomes " + chromOutFileWigSm + " " + + self.chrom_sizes_file + " " + chromOutFileBwSm) cutsToWigProcessSm = subprocess.Popen(cmd1, shell=True, - stdin=subprocess.PIPE, stdout=subprocess.PIPE) - wigToBigWigProcessSm = subprocess.Popen( - ['wigToBigWig', '-clip', '-fixedSummaries', - '-keepAllChromosomes', 'stdin', - self.chrom_sizes_file, chromOutFileBwSm], - stdin=cutsToWigProcessSm.stdout) + stdin=subprocess.PIPE) + if self.bedout: chromOutFileBed = chromOutFile + ".bed" @@ -241,8 +237,13 @@ def get_shifted_pos(read, shift_factor): # Clean up processes if self.exactbw: cutsToWigProcess.stdin.close() + cutsToWigProcess.communicate() _LOGGER.debug("Encoding exact bigwig for " + chrom + " (last read position:" + str(read.pos) + ")...") + wigToBigWigProcess = subprocess.Popen( + ['wigToBigWig', '-clip', '-fixedSummaries', + '-keepAllChromosomes', chromOutFileWig, + self.chrom_sizes_file, chromOutFileBw]) wigToBigWigProcess.communicate() if self.bedout: @@ -250,8 +251,13 @@ def get_shifted_pos(read, shift_factor): if self.smoothbw: cutsToWigProcessSm.stdin.close() + cutsToWigProcessSm.communicate() _LOGGER.debug("Encoding smooth bigwig for " + chrom + " (last read position:" + str(read.pos) + ")...") + wigToBigWigProcessSm = subprocess.Popen( + ['wigToBigWig', '-clip', '-fixedSummaries', + '-keepAllChromosomes', chromOutFileWigSm, + self.chrom_sizes_file, chromOutFileBwSm]) wigToBigWigProcessSm.communicate() except StopIteration as e: diff --git a/usage.txt b/usage.txt old mode 100644 new mode 100755