diff --git a/BiocProject/PEPATAC_BiocProject.Rmd b/BiocProject/PEPATAC_BiocProject.Rmd index c25cd3a8..ca4df839 100644 --- a/BiocProject/PEPATAC_BiocProject.Rmd +++ b/BiocProject/PEPATAC_BiocProject.Rmd @@ -1,6 +1,6 @@ --- title: "PEPATAC BiocProject" -author: "Michal Stolarczyk" +author: c("Michal Stolarczyk", "Jason Smith") date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > @@ -35,7 +35,7 @@ The way the output files are read is defined in a [function](https://github.com/ ```{r echo=T, message=FALSE} library(BiocProject) -ProjectConfig = "gold_hg19.yaml" +ProjectConfig = "gold_config.yaml" ``` ## Run the `BiocProject` function diff --git a/BiocProject/gold_atac_annotation.csv b/BiocProject/gold_atac_annotation.csv deleted file mode 100644 index 3683e246..00000000 --- a/BiocProject/gold_atac_annotation.csv +++ /dev/null @@ -1,6 +0,0 @@ -sample_name,sample_description,treatment_description,organism,protocol,data_source,SRR,SRX,Sample_geo_accession,Sample_series_id,read_type,Sample_instrument_model,read1,read2 -gold1,ATAC-seq from dendritic cell (ENCLB065VMV),Homo sapiens dendritic in vitro differentiated cells treated with 0 ng/mL Lipopolysaccharide for 0 hours,human,ATAC-seq,SRA,SRR5210416,SRX2523872,GSM2471255,GSE94182,PAIRED,Illumina HiSeq 2000,SRA_1,SRA_2 -gold2,ATAC-seq from dendritic cell (ENCLB811FLK),Homo sapiens dendritic in vitro differentiated cells treated with 0 ng/mL Lipopolysaccharide for 0 hours,human,ATAC-seq,SRA,SRR5210450,SRX2523906,GSM2471300,GSE94222,PAIRED,Illumina HiSeq 2000,SRA_1,SRA_2 -gold3,ATAC-seq from dendritic cell (ENCLB887PKE),Homo sapiens dendritic in vitro differentiated cells treated with 0 ng/mL Lipopolysaccharide for 0 hours,human,ATAC-seq,SRA,SRR5210398,SRX2523862,GSM2471249,GSE94177,PAIRED,Illumina NextSeq 500,SRA_1,SRA_2 -gold4,ATAC-seq from dendritic cell (ENCLB586KIS),Homo sapiens dendritic in vitro differentiated cells treated with 0 ng/mL Lipopolysaccharide for 0 hours,human,ATAC-seq,SRA,SRR5210428,SRX2523884,GSM2471269,GSE94196,PAIRED,Illumina HiSeq 2000,SRA_1,SRA_2 -gold5,ATAC-seq from dendritic cell (ENCLB384NOX),Homo sapiens dendritic in vitro differentiated cells treated with 0 ng/mL Lipopolysaccharide for 0 hours,human,ATAC-seq,SRA,SRR5210390,SRX2523854,GSM2471245,GSE94173,PAIRED,Illumina HiSeq 2000,SRA_1,SRA_2 diff --git a/BiocProject/gold_config.yaml b/BiocProject/gold_config.yaml new file mode 100644 index 00000000..150d8550 --- /dev/null +++ b/BiocProject/gold_config.yaml @@ -0,0 +1,40 @@ +# This project config file describes your project. See looper docs for details. +name: BiocProject # The name that summary files will be prefaced with + +pep_version: 2.0.0 +sample_table: gold_sample_table.csv # sheet listing all samples in the project + +looper: # relative paths are relative to this config file + output_dir: "$PROCESSED/pepatac/gold" # ABSOLUTE PATH to the parent, shared space where project results go + pipeline_interfaces: ["$CODE/pepatac/project_pipeline_interface.yaml"] # ABSOLUTE PATH to the directory where looper will find the pipeline repository + +sample_modifiers: + append: + pipeline_interfaces: ["$CODE/pepatac/sample_pipeline_interface.yaml"] + derive: + attributes: [read1, read2] + sources: + SRA_1: "${SRAFQ}{SRR}_1.fastq.gz" + SRA_2: "${SRAFQ}{SRR}_2.fastq.gz" + imply: + - if: + organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] + then: + genome: hg38 + macs_genome_size: hs + prealignments: rCRSd + aligner: bowtie2 # Default. [options: bwa] + deduplicator: samblaster # Default. [options: picard] + trimmer: skewer # Default. [options: pyadapt, trimmomatic] + peak_caller: macs2 # Default. [options: fseq, genrich, hmmratac, homer] + peak_type: fixed # Default. [options: variable] + extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream. + frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run + blacklist: $GENOMES/hg38/blacklist/default/hg38_blacklist.bed.gz + +bioconductor: + readFunName: readNarrowPeak + readFunPath: readNarrowPeak.R + funcArgs: + output_dir: "$PROCESSED/pepatac/gold" + results_subdir: "$PROCESSED/pepatac/gold/results_pipeline/" \ No newline at end of file diff --git a/BiocProject/gold_hg19.yaml b/BiocProject/gold_hg19.yaml deleted file mode 100644 index 32e071ed..00000000 --- a/BiocProject/gold_hg19.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Run gold standard samples through ATACseq pipeline. -name: gold_hg19 - -metadata: - sample_annotation: "$PROCESSED/gold/pepatac/hg19/gold_atac_annotation.csv" - output_dir: "$PROCESSED/gold/pepatac/hg19/10_08_18_wo" - pipeline_interfaces: "$CODE/pepatac/pipeline_interface.yaml" - -derived_columns: [read1, read2] - -data_sources: - SRA_1: "${SRAFQ}{SRR}_1.fastq.gz" - SRA_2: "${SRAFQ}{SRR}_2.fastq.gz" - -implied_columns: - organism: - human: - genome: hg19 - macs_genome_size: hs diff --git a/BiocProject/gold_sample_table.csv b/BiocProject/gold_sample_table.csv new file mode 100644 index 00000000..adb33194 --- /dev/null +++ b/BiocProject/gold_sample_table.csv @@ -0,0 +1,6 @@ +sample_name,organism,protocol,SRR,read_type,read1,read2 +gold1,human,ATAC-seq,SRR5210416,PAIRED,SRA_1,SRA_2 +gold2,human,ATAC-seq,SRR5210450,PAIRED,SRA_1,SRA_2 +gold3,human,ATAC-seq,SRR5210398,PAIRED,SRA_1,SRA_2 +gold4,human,ATAC-seq,SRR5210428,PAIRED,SRA_1,SRA_2 +gold5,human,ATAC-seq,SRR5210390,PAIRED,SRA_1,SRA_2 diff --git a/BiocProject/readConsensusPeak.R b/BiocProject/readConsensusPeak.R new file mode 100644 index 00000000..81165e86 --- /dev/null +++ b/BiocProject/readConsensusPeak.R @@ -0,0 +1,44 @@ +readConsensusPeak = function(project, summary_dir) { + cwd = getwd() + # Construct sample table + sample_table <- data.table( + genome=unique(pepr::sampleTable(project)$genome) + ) + sample_table[,peak_file_path:=.(file.path( + summary_dir, + paste0(config(project)$name, "_", sample_table$genome, + "_consensusPeaks.narrowPeak") + ))] + # Only keep samples with valid peak files + file_list <- sample_table$peak_file_path + file_exists <- character() + for (i in 1:length(file_list)) { + if(file.exists(file.path(file_list[i]))) { + file_exists <- append(file_exists, file.path(file_list[i])) + } + } + files <- data.table(peak_file_path=file_exists) + consensus_peak_files = list() + if (nrow(files) == 0) { + warning("No valid peak files exist.") + return(GenomicRanges::GRangesList()) + } + # Ensure duplicates do not exist + sample_table <- unique( + sample_table[sample_table$peak_file_path %in% files$peak_file_path,]) + # Load peak files as Granges objects + sample_table[, peak_file := .(lapply( + peak_file_path, + function(x) { + GenomicRanges::GRanges( + fread(x, col.names=c("chr", "start", "end", "name", "score", + "strand", "signalValue", "pValue", "qValue", + "peak"))) + }))] + # Convert to GRangesList and name the individual Granges + peak_files <- GenomicRanges::GRangesList(sample_table$peak_file) + names(peak_files) <- sample_table$genome + setwd(cwd) + return(peak_files) +} + diff --git a/BiocProject/readCoverage.R b/BiocProject/readCoverage.R new file mode 100644 index 00000000..7e8f2173 --- /dev/null +++ b/BiocProject/readCoverage.R @@ -0,0 +1,80 @@ +readCoverage = function(project, results_subdir) { + cwd = getwd() + # Construct sample table + sample_table <- data.table( + sample_name=pepr::sampleTable(project)$sample_name, + genome=pepr::sampleTable(project)$genome + ) + # Check if coverage files are compressed + if (any(file.exists(file.path(results_subdir, + sample_table$sample_name, + paste0("peak_calling_", sample_table$genome), + paste0(sample_table$sample_name, + "_ref_peaks_coverage.bed.gz"))))) { + ext <- ".bed.gz" + } else if (any(file.exists(file.path(results_subdir, + sample_table$sample_name, + paste0("peak_calling_", sample_table$genome), + paste0(sample_table$sample_name, + "_peaks_coverage.bed.gz"))))) { + ext <- ".bed.gz" + } else { + ext <- ".bed" + } + # Use reference peak coverage file if available + if (any(file.exists(file.path(results_subdir, + sample_table$sample_name, + paste0("peak_calling_", sample_table$genome), + paste0(sample_table$sample_name, + "_ref_peaks_coverage", ext))))) { + peak_file_name = paste0("_ref_peaks_coverage", ext) + reference = TRUE + } else { + warning("Peak coverage files are not derived from a singular reference peak set.") + reference = FALSE + peak_file_name = paste0("_peaks_coverage", ext) + } + # Construct paths to peak coverage files + sample_table[,cov_file_path:=.((file.path( + results_subdir, + sample_table$sample_name, + paste0("peak_calling_", sample_table$genome), + paste0(sample_table$sample_name, peak_file_name))))] + # Only keep samples with valid peak files + file_list <- sample_table$cov_file_path + file_exists <- character() + for (i in 1:length(file_list)) { + if(file.exists(file.path(file_list[i]))) { + file_exists <- append(file_exists, file.path(file_list[i])) + } + } + files <- data.table(cov_file_path=file_exists) + if (nrow(files) == 0) { + warning("No valid peak files exist.") + return(data.table()) + } + # Ensure duplicates do not exist + sample_table <- unique( + sample_table[sample_table$cov_file_path %in% files$cov_file_path,]) + # Column names are dependent on source file + if (reference) { + column_names <- c("chr", "start", "end", "name", "score", "strand", + "signalValue", "pValue", "qValue", "peak", + "read_count", "base_count", "width", "frac") + } else { + column_names <- c("chr", "start", "end", "read_count", + "base_count", "width", "frac", "norm") + } + # Load coverage files and return list of data.tables + sample_table[, cov_file := .(lapply(cov_file_path, + fread, col.names=column_names))] + # sample_table[, cov_file := .(lapply( + # cov_file_path, function(x) { + # x <- fread(x, col.names=column_names) + # data.table(x) + # }))] + cov_files <- sample_table$cov_file + names(cov_files) <- sample_table$sample_name + setwd(cwd) + return(cov_files) +} diff --git a/BiocProject/readNarrowPeak.R b/BiocProject/readNarrowPeak.R new file mode 100644 index 00000000..ff8a3ce8 --- /dev/null +++ b/BiocProject/readNarrowPeak.R @@ -0,0 +1,46 @@ +readNarrowPeak = function(project, results_subdir) { + cwd = getwd() + # Construct sample table + sample_table <- data.table( + sample_name=pepr::sampleTable(project)$sample_name, + genome=pepr::sampleTable(project)$genome + ) + # Identify peak file paths + sample_table[,peak_file_path:=.((file.path( + results_subdir, + sample_table$sample_name, + paste0("peak_calling_", sample_table$genome), + paste0(sample_table$sample_name, + "_peaks_normalized.narrowPeak"))))] + # Only keep samples with valid peak files + file_list <- sample_table$peak_file_path + file_exists <- character() + for (i in 1:length(file_list)) { + if(file.exists(file.path(file_list[i]))) { + file_exists <- append(file_exists, file.path(file_list[i])) + } + } + files <- data.table(peak_file_path=file_exists) + consensus_peak_files = list() + if (nrow(files) == 0) { + warning("No valid peak files exist.") + return(GenomicRanges::GRangesList()) + } + # Ensure duplicates do not exist + sample_table <- unique( + sample_table[sample_table$peak_file_path %in% files$peak_file_path,]) + # Load peak files as Granges objects + sample_table[, peak_file := .(lapply( + peak_file_path, + function(x) { + GenomicRanges::GRanges( + fread(x, col.names=c("chr", "start", "end", "name", "score", "strand", + "signalValue", "pValue", "qValue", "peak"))) + }))] + # Convert to GRangesList and name the individual Granges + peak_files <- GenomicRanges::GRangesList(sample_table$peak_file) + names(peak_files) <- sample_table$sample_name + setwd(cwd) + return(peak_files) +} + diff --git a/BiocProject/readPepatacPeakBeds.R b/BiocProject/readPepatacPeakBeds.R deleted file mode 100644 index 10791ceb..00000000 --- a/BiocProject/readPepatacPeakBeds.R +++ /dev/null @@ -1,18 +0,0 @@ -readPepatacPeakBeds = function(p, pipName="pepatac.py") { - readBed = function(fp) { - message("Reading: ", basename(fp)) - df = read.table(s, stringsAsFactors=F) - colnames(df)[1:3] = c('chr', 'start', 'end') - GenomicRanges::GRanges(df) - } - lapply(outputsByPipeline(p, pipName), function(f) { - lapply(f, function(s){ - if (!file.exists(s)) { - warning("File '", s, "' not found") - NULL - } else { - readBed(s) - } - }) - }) -} diff --git a/BiocProject/runCOCOA.R b/BiocProject/runCOCOA.R new file mode 100644 index 00000000..3e7d4435 --- /dev/null +++ b/BiocProject/runCOCOA.R @@ -0,0 +1,97 @@ +runCOCOA = function(project, results_subdir, summary_dir, + genome, conditions, lolaPath) { + # verify condition table + if(!all(c("sample_name", "condition") %in% tolower(colnames(conditions)))) { + warning("Conditions must include 'sample_name' and 'condition' columns.") + return(1) + } + # load required packages + required_packages <- c("COCOA", "LOLA", "data.table") + for (package in required_packages) { + loadLibrary <- tryCatch ( + { + suppressPackageStartupMessages( + suppressWarnings(library(package, character.only=TRUE))) + }, + error=function(e) { + message("Error: Install the \"", package, + "\" library before proceeding.") + return(NULL) + }, + warning=function(e) { + message(e) + return(1) + } + ) + if (length(loadLibrary)!=0) { + suppressWarnings(library(package, character.only=TRUE)) + } else { + warning() + return(1) + } + } + # Load peak count table + count_table_path = file.path(summary_dir, + paste0(config(project)$name, "_", genome, + "_peaks_coverage.tsv")) + if(file.exists(count_table_path)) { + ct <- data.table::fread(count_table_path) + } else { + warning(paste0("Could not load ", count_table_path)) + return(1) + } + # Subset count table by condition table + cols <- colnames(ct)[c(TRUE ,colnames(ct) %in% conditions$sample_name)] + ct <- ct[, cols, with=FALSE] + # Generate matrix + m <- as.matrix(ct[, 2:ncol(ct)]) + rownames(m) <- ct$name + mat <- t(m) + mat <- mat[, -grep("random", colnames(mat))] + mat <- mat[, -grep("GL", colnames(mat))] + mat <- mat[, -grep("chrUn", colnames(mat))] + pca <- prcomp(mat) + row.names(pca$x) <- rownames(mat) + loadings <- pca$rotation + scores <- pca$x + rownames(scores) <- rownames(mat) + + delIdx <- c(grep("random", rownames(m)), + grep("GL", rownames(m)), + grep("chrUn", rownames(m))) + coord_list <- data.table::tstrsplit(rownames(m)[-delIdx], "_", + type.convert = TRUE, fixed = TRUE) + peaks <- data.table::data.table(chr=coord_list[[1]], + start=coord_list[[2]], + end=coord_list[[3]]) + # Load region database + regionSetDB <- suppressWarnings(loadRegionDB(lolaPath)) + # Load the ENCODE transcription factor binding sites + regionAnno <- regionSetDB$regionAnno + collectionInd <- regionAnno$collection %in% "encode_tfbs" + #message("load just TFBS") + TFGR <- GRangesList(regionSetDB$regionGRL[collectionInd]) + regionAnno <- regionAnno[collectionInd, ] + regionSetName <- regionAnno$filename + regionSetDescription <- regionAnno$description + #message("remove regionSetDB") + # Remove full DB to release memory + rm("regionSetDB") + + # Subset principal components to annotate + PCsToAnnotate <- paste0("PC", 1:6) + correctSampleOrder <- 1:nrow(mat) + message("run COCOA") + tfRSS <- local(COCOA::runCOCOA(genomicSignal=loadings, + signalCoord=peaks, + GRList=TFGR, + signalCol=PCsToAnnotate, + sampleOrder=correctSampleOrder, + targetVar=scores, + scoringMetric="proportionWeightedMean", + absVal=TRUE)) + tfRSS$regionSetName <- regionSetName + tfRSS$regionSetDescription <- regionSetDescription + return(tfRSS) +} + diff --git a/PEPATACr/R/PEPATACr.R b/PEPATACr/R/PEPATACr.R index b0c612be..51f8fa35 100644 --- a/PEPATACr/R/PEPATACr.R +++ b/PEPATACr/R/PEPATACr.R @@ -631,6 +631,7 @@ calcFRiF <- function(bedFile, total, reads) { } bedFile <- bedFile[apply(bedFile != 0, 1, all),] + # if no regions have coverage, file is now empty if (reads) { bedFile <- cbind(bedFile, cumsum=cumsum(bedFile$count)) @@ -640,8 +641,11 @@ calcFRiF <- function(bedFile, total, reads) { bedFile <- cbind(bedFile, cumSize=cumsum(bedFile$size)) bedFile <- cbind(bedFile, frip=bedFile$cumsum/as.numeric(total)) - bedFile <- cbind(bedFile, numfeats=as.numeric(1:nrow(bedFile))) - return(bedFile) + if (is.empty(bedFile)) { + return(cbind(bedFile, numfeats=as.numeric())) + } else { + return(cbind(bedFile, numfeats=as.numeric(1:nrow(bedFile)))) + } } @@ -692,12 +696,18 @@ plotFRiF <- function(sample_name, num_reads, genome_size, bed <- get(bedFile[1]) bedCov <- calcFRiF(bed, num_reads, reads) name <- bedFile[1] - labels[1,] <- c(0.95*max(log10(bedCov$cumSize)), max(bedCov$frip)+0.001, - name, round(max(bedCov$frip),2), "#FF0703") - feature_dist[1,] <- c(name, nrow(bed), - as.numeric(sum(abs(bed$V3-bed$V2))), - as.numeric((sum(abs(bed$V3-bed$V2))/genome_size))) - bedCov$feature <- name + if (is.empty(bedCov)) { + labels[1,] <- c(0, 0, name,0, "#FF0703") + bedCov$feature <- as.character() + } else { + labels[1,] <- c(0.95*max(log10(bedCov$cumSize)), + max(bedCov$frip)+0.001, name, + round(max(bedCov$frip),2), "#FF0703") + bedCov$feature <- name + } + feature_dist[1,] <- c(name, + nrow(bed), as.numeric(sum(abs(bed$V3-bed$V2))), + as.numeric((sum(abs(bed$V3-bed$V2))/genome_size))) } else if (file.exists(file.path(bedFile[1])) && info$size != 0) { bed <- read.table(file.path(bedFile[1])) if (nrow(bed[which(bed$V5 != 0),]) == 0) { @@ -709,12 +719,18 @@ plotFRiF <- function(sample_name, num_reads, genome_size, name <- gsub("^.*?_", "", name) numFields <- 1 for(i in 1:numFields) name <- gsub("_[^_]*$", "", name) - labels[1,] <- c(0.95*max(log10(bedCov$cumSize)), max(bedCov$frip)+0.001, - name, round(max(bedCov$frip),2), "#FF0703") + if (is.empty(bedCov)) { + labels[1,] <- c(0, 0, name, 0, "#FF0703") + bedCov$feature <- as.character() + } else { + labels[1,] <- c(0.95*max(log10(bedCov$cumSize)), + max(bedCov$frip)+0.001, name, + round(max(bedCov$frip),2), "#FF0703") + bedCov$feature <- name + } feature_dist[1,] <- c(name, nrow(bed), - as.numeric(sum(abs(bed$V3-bed$V2))), - as.numeric((sum(abs(bed$V3-bed$V2))/genome_size))) - bedCov$feature <- name + as.numeric(sum(abs(bed$V3-bed$V2))), + as.numeric((sum(abs(bed$V3-bed$V2))/genome_size))) } } else { if (is.na(info[1])) { @@ -1235,6 +1251,15 @@ plotFLD <- function(fragL, } +#' Check if a data frame is empty. +#' +#' Return TRUE if provided data frame is NULL, has no rows, or has no columns. +#' +#' @param df A data frame to check for emptiness +is.empty <- function(df) { + (is.null(df) || nrow(df) == 0 || ncol(df) == 0) +} + #' Calculate mode(s) of data #' diff --git a/docs/changelog.md b/docs/changelog.md index 932e74a8..1e4431a7 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,6 +1,14 @@ # Change log All notable changes to this project will be documented in this file. + +## [0.9.10] -- 2020-11-16 + +### Changed + - Added check for rare lagging filter_paired_fq.pl command + - Fixed calcFRiF bug on zero coverage files + - Updated BiocProject examples + ## [0.9.9] -- 2020-11-02 ### Changed diff --git a/docs/usage.md b/docs/usage.md index e9040a1d..7b19ecd8 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -22,7 +22,7 @@ usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V] [--motif] [--sob] [--no-scale] [--prioritize] [--keep] [--noFIFO] [--lite] [--skipqc] [-V] -PEPATAC version 0.9.9 +PEPATAC version 0.9.10 optional arguments: -h, --help show this help message and exit diff --git a/pepatac_output_schema.yaml b/pepatac_output_schema.yaml index edd2dc1e..19840b00 100644 --- a/pepatac_output_schema.yaml +++ b/pepatac_output_schema.yaml @@ -1,4 +1,4 @@ -description: objects produced by PEPPRO pipeline. +description: objects produced by PEPATAC pipeline. properties: samples: type: array @@ -8,23 +8,27 @@ properties: smooth_bw: path: "aligned_{genome}/{sample_name}_smooth.bw" type: string - description: "Test sample property" + description: "Smoothed signal track" exact_bw: path: "aligned_{genome}_exact/{sample_name}_exact.bw" type: string - description: "Test sample property" + description: "Nucleotide-resolution signal track" aligned_bam: - path: "aligned_{genome}/{sample_name}_sort.bam" + path: "aligned_{genome}/{sample_name}_sort_dedup.bam" type: string - description: "Test sample property" - peaks_bed: - path: "peak_calling_{genome}/{sample_name}_peaks.bed" + description: "Coordinate sorted deduplicated, aligned BAM file" + peak_file: + path: "peak_calling_{genome}/{sample_name}_peaks_normalized.narrowPeak" type: string - description: "Test sample property" + description: "Sample peak file" + coverage_file: + path: "peak_calling_{genome}/{sample_name}_peaks_coverage.bed.gz" + type: string + description: "Sample peak coverage table" summits_bed: path: "peak_calling_{genome}/{sample_name}_summits.bed" type: string - description: "Test sample property" + description: "Peak summit file" alignment_percent_file: title: "Alignment percent file" description: "Plots percent of total alignment to all pre-alignments and primary genome." diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index c5db7b23..f4ca857b 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -5,7 +5,7 @@ __author__ = ["Jin Xu", "Nathan Sheffield", "Jason Smith"] __email__ = "jasonsmith@virginia.edu" -__version__ = "0.9.9" +__version__ = "0.9.10" from argparse import ArgumentParser @@ -565,7 +565,7 @@ def main(): ############################################################################ # Confirm required tools are all callable # ############################################################################ - opt_tools = ["fseq", "genrich", "${HMMRATAC}", "${PICARD}", + opt_tools = ["fseq", "Genrich", "${HMMRATAC}", "${PICARD}", "${TRIMMOMATIC}", "pyadapt", "findMotifsGenome.pl", "findPeaks", "seqOutBias", "bigWigMerge", "bedGraphToBigWig", "pigz", "bwa"] @@ -605,7 +605,7 @@ def main(): if 'fseq' in opt_tools: opt_tools.remove('fseq') if args.peak_caller == "genrich": - if 'genrich' in opt_tools: opt_tools.remove('genrich') + if 'Genrich' in opt_tools: opt_tools.remove('Genrich') if args.peak_caller == "hmmratac": if '${HMMRATAC}' in opt_tools: opt_tools.remove('${HMMRATAC}') @@ -981,6 +981,27 @@ def check_trim(): to_compress.append(unmap_fq2) pm.timestamp("### Compress all unmapped read files") + # Confirm pairing is complete + def check_pairing(fq1, fq2): + wc1 = ngstk.count_reads(fq1, args.paired_end) + wc2 = ngstk.count_reads(fq2, args.paired_end) + pm.debug("wc1: {}".format(str(wc1))) + pm.debug("wc2: {}".format(str(wc2))) + pm.debug("Return value: {}".format(str(wc1 == wc2))) + return wc1 == wc2 + + if args.paired_end: + if (not pypiper.is_gzipped_fastq(unmap_fq1) and + not pypiper.is_gzipped_fastq(unmap_fq2)): + checks = 1 + while not check_pairing(unmap_fq1, unmap_fq2) and checks < 100: + checks += 1 + pm.debug("Check count: {}".format(str(checks))) + if checks > 100 and not check_pairing(unmap_fq1, unmap_fq2): + err_msg = ("Fastq filter_paired_fq.pl function did not " + "complete successfully. Try running the pipeline " + "with `--keep`.") + pm.fail_pipeline(IOError(err_msg)) for unmapped_fq in to_compress: # Compress unmapped fastq reads if not pypiper.is_gzipped_fastq(unmapped_fq) and not unmapped_fq == '': diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index c84d2baa..e67213e1 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -13,9 +13,9 @@ command_template: > -P {compute.cores} -M {compute.mem} {% if sample.read2 is defined %} --input2 {sample.read2} {% endif %} + {% if sample.aligner is defined %} --aligner {sample.aligner} {% endif %} {% if sample.peak_caller is defined %} --peak-caller {sample.peak_caller} {% endif %} {% if sample.macs_genome_size is defined %} --genome-size {sample.macs_genome_size} {% endif %} - {% if sample.aligner is defined %} --aligner {sample.aligner} {% endif %} {% if sample.trimmer is defined %} --trimmer {sample.trimmer} {% endif %} {% if sample.prealignments is defined %} --prealignments {sample.prealignments} {% endif %} {% if sample.deduplicator is defined %} --deduplicator {sample.deduplicator} {% endif %} @@ -26,18 +26,19 @@ command_template: > {% if sample.extend is defined %} --extend {sample.extend} {% endif %} {% if sample.frip_ref_peaks is defined %} --frip-ref-peaks {sample.frip_ref_peaks} {% endif %} {% if sample.motif is defined %} --motif {% endif %} - {% if sample.no_scale is defined %} --no-scale {% endif %} {% if sample.sob is defined %} --sob {% endif %} + {% if sample.no_scale is defined %} --no-scale {% endif %} {% if sample.prioritize is defined %} --prioritize {% endif %} {% if sample.keep is defined %} --keep {% endif %} {% if sample.no_fifo is defined %} --noFIFO {% endif %} {% if sample.lite is defined %} --lite {% endif %} + {% if sample.skipqc is defined %} --skipqc {% endif %} compute: singularity_image: ${SIMAGES}pepatac docker_image: databio/pepatac - bulker_crate: databio/pepatac:1.0.4 + bulker_crate: databio/pepatac:1.0.5 size_dependent_variables: resources-sample.tsv bioconductor: - readFunName: readPepatacPeakBeds - readFunPath: BiocProject/readPepatacPeakBeds.R + readFunName: runCOCOA + readFunPath: BiocProject/runCOCOA.R diff --git a/usage.txt b/usage.txt index 60d598ad..c7b6ba4c 100644 --- a/usage.txt +++ b/usage.txt @@ -14,7 +14,7 @@ usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V] [--motif] [--sob] [--no-scale] [--prioritize] [--keep] [--noFIFO] [--lite] [--skipqc] [-V] -PEPATAC version 0.9.9 +PEPATAC version 0.9.10 optional arguments: -h, --help show this help message and exit