Merge pull request #161 from databio/feat_biocproject

Feat biocproject
databio · Nov 17, 2020 · 8c478e6 · 8c478e6
2 parents 3547db1 + 089f9a2
commit 8c478e6
Show file tree

Hide file tree

Showing 17 changed files with 406 additions and 77 deletions.
diff --git a/BiocProject/PEPATAC_BiocProject.Rmd b/BiocProject/PEPATAC_BiocProject.Rmd
@@ -1,6 +1,6 @@
 ---
 title: "PEPATAC BiocProject"
-author: "Michal Stolarczyk"
+author: c("Michal Stolarczyk", "Jason Smith")
 date: "`r Sys.Date()`"
 output: rmarkdown::html_vignette
 vignette: >
@@ -35,7 +35,7 @@ The way the output files are read is defined in a [function](https://github.com/
 
 ```{r echo=T, message=FALSE}
 library(BiocProject)
-ProjectConfig = "gold_hg19.yaml"
+ProjectConfig = "gold_config.yaml"
 ```
 
 ## Run the `BiocProject` function

diff --git a/BiocProject/gold_atac_annotation.csv b/BiocProject/gold_atac_annotation.csv
diff --git a/BiocProject/gold_config.yaml b/BiocProject/gold_config.yaml
@@ -0,0 +1,40 @@
+# This project config file describes your project. See looper docs for details.
+name: BiocProject # The name that summary files will be prefaced with
+
+pep_version: 2.0.0
+sample_table: gold_sample_table.csv  # sheet listing all samples in the project
+
+looper:  # relative paths are relative to this config file
+  output_dir: "$PROCESSED/pepatac/gold"  # ABSOLUTE PATH to the parent, shared space where project results go
+  pipeline_interfaces: ["$CODE/pepatac/project_pipeline_interface.yaml"]  # ABSOLUTE PATH to the directory where looper will find the pipeline repository
+
+sample_modifiers:
+  append:
+    pipeline_interfaces: ["$CODE/pepatac/sample_pipeline_interface.yaml"]
+  derive:
+    attributes: [read1, read2]
+    sources:
+      SRA_1: "${SRAFQ}{SRR}_1.fastq.gz"
+      SRA_2: "${SRAFQ}{SRR}_2.fastq.gz"
+  imply:
+    - if: 
+        organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"]
+      then: 
+        genome: hg38
+        macs_genome_size: hs
+        prealignments: rCRSd
+        aligner: bowtie2         # Default. [options: bwa]
+        deduplicator: samblaster # Default. [options: picard]
+        trimmer: skewer          # Default. [options: pyadapt, trimmomatic]
+        peak_caller: macs2       # Default. [options: fseq, genrich, hmmratac, homer]
+        peak_type: fixed         # Default. [options: variable]
+        extend: "250"            # Default. For fixed-width peaks, extend this distance up- and down-stream.
+        frip_ref_peaks: None     # Default. Use an external reference set of peaks instead of the peaks called from this run
+        blacklist: $GENOMES/hg38/blacklist/default/hg38_blacklist.bed.gz
+
+bioconductor:
+  readFunName: readNarrowPeak
+  readFunPath: readNarrowPeak.R
+  funcArgs:
+    output_dir: "$PROCESSED/pepatac/gold" 
+    results_subdir: "$PROCESSED/pepatac/gold/results_pipeline/" 
diff --git a/BiocProject/gold_hg19.yaml b/BiocProject/gold_hg19.yaml
diff --git a/BiocProject/gold_sample_table.csv b/BiocProject/gold_sample_table.csv
@@ -0,0 +1,6 @@
+sample_name,organism,protocol,SRR,read_type,read1,read2
+gold1,human,ATAC-seq,SRR5210416,PAIRED,SRA_1,SRA_2
+gold2,human,ATAC-seq,SRR5210450,PAIRED,SRA_1,SRA_2
+gold3,human,ATAC-seq,SRR5210398,PAIRED,SRA_1,SRA_2
+gold4,human,ATAC-seq,SRR5210428,PAIRED,SRA_1,SRA_2
+gold5,human,ATAC-seq,SRR5210390,PAIRED,SRA_1,SRA_2
diff --git a/BiocProject/readConsensusPeak.R b/BiocProject/readConsensusPeak.R
@@ -0,0 +1,44 @@
+readConsensusPeak = function(project, summary_dir) {
+    cwd = getwd()
+    # Construct sample table
+    sample_table <- data.table(
+        genome=unique(pepr::sampleTable(project)$genome)
+    )
+    sample_table[,peak_file_path:=.(file.path(
+                 summary_dir,
+                 paste0(config(project)$name, "_", sample_table$genome,
+                        "_consensusPeaks.narrowPeak")
+                 ))]
+    # Only keep samples with valid peak files
+    file_list   <- sample_table$peak_file_path
+    file_exists <- character()
+    for (i in 1:length(file_list)) {
+        if(file.exists(file.path(file_list[i]))) {
+            file_exists <- append(file_exists, file.path(file_list[i]))
+        }
+    }
+    files <- data.table(peak_file_path=file_exists)
+    consensus_peak_files = list()
+    if (nrow(files) == 0) {
+        warning("No valid peak files exist.")
+        return(GenomicRanges::GRangesList())
+    }
+    # Ensure duplicates do not exist
+    sample_table <- unique(
+        sample_table[sample_table$peak_file_path %in% files$peak_file_path,])
+    # Load peak files as Granges objects
+    sample_table[, peak_file := .(lapply(
+      peak_file_path,
+      function(x) {
+        GenomicRanges::GRanges(
+            fread(x, col.names=c("chr", "start", "end", "name", "score",
+                                 "strand", "signalValue", "pValue", "qValue",
+                                 "peak")))
+      }))]
+    # Convert to GRangesList and name the individual Granges
+    peak_files        <- GenomicRanges::GRangesList(sample_table$peak_file)
+    names(peak_files) <- sample_table$genome
+    setwd(cwd)
+    return(peak_files)
+}
+
diff --git a/BiocProject/readCoverage.R b/BiocProject/readCoverage.R
@@ -0,0 +1,80 @@
+readCoverage = function(project, results_subdir) {
+    cwd = getwd()
+    # Construct sample table
+    sample_table <- data.table(
+        sample_name=pepr::sampleTable(project)$sample_name,
+        genome=pepr::sampleTable(project)$genome
+    )
+    # Check if coverage files are compressed
+    if (any(file.exists(file.path(results_subdir,
+                        sample_table$sample_name,
+                        paste0("peak_calling_", sample_table$genome),
+                        paste0(sample_table$sample_name,
+                               "_ref_peaks_coverage.bed.gz"))))) {
+        ext <- ".bed.gz"
+    } else if (any(file.exists(file.path(results_subdir,
+                        sample_table$sample_name,
+                        paste0("peak_calling_", sample_table$genome),
+                        paste0(sample_table$sample_name,
+                               "_peaks_coverage.bed.gz"))))) {
+        ext <- ".bed.gz"
+    } else {
+        ext <- ".bed"
+    }
+    # Use reference peak coverage file if available
+    if (any(file.exists(file.path(results_subdir,
+                        sample_table$sample_name,
+                        paste0("peak_calling_", sample_table$genome),
+                        paste0(sample_table$sample_name,
+                               "_ref_peaks_coverage", ext))))) {
+        peak_file_name = paste0("_ref_peaks_coverage", ext)
+        reference = TRUE
+    } else {
+        warning("Peak coverage files are not derived from a singular reference peak set.")
+        reference = FALSE
+        peak_file_name = paste0("_peaks_coverage", ext)
+    }
+    # Construct paths to peak coverage files
+    sample_table[,cov_file_path:=.((file.path(
+                 results_subdir,
+                 sample_table$sample_name,
+                 paste0("peak_calling_", sample_table$genome),
+                 paste0(sample_table$sample_name, peak_file_name))))]
+    # Only keep samples with valid peak files
+    file_list   <- sample_table$cov_file_path
+    file_exists <- character()
+    for (i in 1:length(file_list)) {
+        if(file.exists(file.path(file_list[i]))) {
+            file_exists <- append(file_exists, file.path(file_list[i]))
+        }
+    }
+    files <- data.table(cov_file_path=file_exists)
+    if (nrow(files) == 0) {
+        warning("No valid peak files exist.")
+        return(data.table())
+    }
+    # Ensure duplicates do not exist
+    sample_table <- unique(
+        sample_table[sample_table$cov_file_path %in% files$cov_file_path,])
+    # Column names are dependent on source file
+    if (reference) {
+        column_names <- c("chr", "start", "end", "name", "score", "strand",
+                          "signalValue", "pValue", "qValue", "peak",
+                          "read_count", "base_count", "width", "frac")
+    } else {
+        column_names <- c("chr", "start", "end", "read_count",
+                          "base_count", "width", "frac", "norm")
+    }
+    # Load coverage files and return list of data.tables
+    sample_table[, cov_file := .(lapply(cov_file_path,
+                                        fread, col.names=column_names))]
+    # sample_table[, cov_file := .(lapply(
+      # cov_file_path, function(x) {
+        # x <- fread(x, col.names=column_names)
+        # data.table(x)
+    # }))]
+    cov_files        <- sample_table$cov_file
+    names(cov_files) <- sample_table$sample_name
+    setwd(cwd)
+    return(cov_files)
+}
diff --git a/BiocProject/readNarrowPeak.R b/BiocProject/readNarrowPeak.R
@@ -0,0 +1,46 @@
+readNarrowPeak = function(project, results_subdir) {
+    cwd = getwd()
+    # Construct sample table
+    sample_table <- data.table(
+        sample_name=pepr::sampleTable(project)$sample_name,
+        genome=pepr::sampleTable(project)$genome
+    )
+    # Identify peak file paths
+    sample_table[,peak_file_path:=.((file.path(
+                 results_subdir,
+                 sample_table$sample_name,
+                 paste0("peak_calling_", sample_table$genome),
+                 paste0(sample_table$sample_name,
+                 "_peaks_normalized.narrowPeak"))))]
+    # Only keep samples with valid peak files
+    file_list   <- sample_table$peak_file_path
+    file_exists <- character()
+    for (i in 1:length(file_list)) {
+        if(file.exists(file.path(file_list[i]))) {
+            file_exists <- append(file_exists, file.path(file_list[i]))
+        }
+    }
+    files <- data.table(peak_file_path=file_exists)
+    consensus_peak_files = list()
+    if (nrow(files) == 0) {
+        warning("No valid peak files exist.")
+        return(GenomicRanges::GRangesList())
+    }
+    # Ensure duplicates do not exist
+    sample_table <- unique(
+        sample_table[sample_table$peak_file_path %in% files$peak_file_path,])
+    # Load peak files as Granges objects
+    sample_table[, peak_file := .(lapply(
+      peak_file_path,
+      function(x) {
+        GenomicRanges::GRanges(
+            fread(x, col.names=c("chr", "start", "end", "name", "score", "strand",
+                                 "signalValue", "pValue", "qValue", "peak")))
+      }))]
+    # Convert to GRangesList and name the individual Granges
+    peak_files        <- GenomicRanges::GRangesList(sample_table$peak_file)
+    names(peak_files) <- sample_table$sample_name
+    setwd(cwd)
+    return(peak_files)
+}
+
diff --git a/BiocProject/readPepatacPeakBeds.R b/BiocProject/readPepatacPeakBeds.R
diff --git a/BiocProject/runCOCOA.R b/BiocProject/runCOCOA.R
@@ -0,0 +1,97 @@
+runCOCOA = function(project, results_subdir, summary_dir,
+                    genome, conditions, lolaPath) {
+    # verify condition table
+    if(!all(c("sample_name", "condition") %in% tolower(colnames(conditions)))) {
+        warning("Conditions must include 'sample_name' and 'condition' columns.")
+        return(1)
+    }
+    # load required packages
+    required_packages <- c("COCOA", "LOLA", "data.table")
+    for (package in required_packages) {
+        loadLibrary <- tryCatch (
+            {
+                suppressPackageStartupMessages(
+                    suppressWarnings(library(package, character.only=TRUE)))
+            },
+            error=function(e) {
+                message("Error: Install the \"", package,
+                        "\" library before proceeding.")
+                return(NULL)
+            },
+            warning=function(e) {
+                message(e)
+                return(1)
+            }
+        )
+        if (length(loadLibrary)!=0) {
+            suppressWarnings(library(package, character.only=TRUE))
+        } else {
+            warning()
+            return(1)
+        }
+    }
+    # Load peak count table
+    count_table_path = file.path(summary_dir,
+                                 paste0(config(project)$name, "_", genome,
+                                        "_peaks_coverage.tsv"))
+    if(file.exists(count_table_path)) {
+        ct <- data.table::fread(count_table_path)
+    } else {
+        warning(paste0("Could not load ", count_table_path))
+        return(1)
+    }
+    # Subset count table by condition table
+    cols <- colnames(ct)[c(TRUE ,colnames(ct) %in% conditions$sample_name)]
+    ct   <- ct[, cols, with=FALSE]
+    # Generate matrix
+    m           <- as.matrix(ct[, 2:ncol(ct)])
+    rownames(m) <- ct$name
+    mat <- t(m)
+    mat <- mat[, -grep("random", colnames(mat))]
+    mat <- mat[, -grep("GL", colnames(mat))]
+    mat <- mat[, -grep("chrUn", colnames(mat))]
+    pca              <- prcomp(mat)
+    row.names(pca$x) <- rownames(mat)
+    loadings         <- pca$rotation
+    scores           <- pca$x
+    rownames(scores) <- rownames(mat)
+
+    delIdx <- c(grep("random", rownames(m)),
+                grep("GL", rownames(m)),
+                grep("chrUn", rownames(m)))
+    coord_list <- data.table::tstrsplit(rownames(m)[-delIdx], "_",
+                                        type.convert = TRUE, fixed = TRUE)
+    peaks <- data.table::data.table(chr=coord_list[[1]],
+                                    start=coord_list[[2]],
+                                    end=coord_list[[3]])
+    # Load region database    
+    regionSetDB <- suppressWarnings(loadRegionDB(lolaPath))
+    # Load the ENCODE transcription factor binding sites
+    regionAnno    <- regionSetDB$regionAnno
+    collectionInd <- regionAnno$collection %in% "encode_tfbs"
+    #message("load just TFBS")
+    TFGR          <- GRangesList(regionSetDB$regionGRL[collectionInd])
+    regionAnno    <- regionAnno[collectionInd, ]
+    regionSetName        <- regionAnno$filename
+    regionSetDescription <- regionAnno$description
+    #message("remove regionSetDB")
+    # Remove full DB to release memory
+    rm("regionSetDB")
+
+    # Subset principal components to annotate
+    PCsToAnnotate      <- paste0("PC", 1:6)
+    correctSampleOrder <- 1:nrow(mat)
+    message("run COCOA")
+    tfRSS <- local(COCOA::runCOCOA(genomicSignal=loadings,
+                                   signalCoord=peaks,
+                                   GRList=TFGR,
+                                   signalCol=PCsToAnnotate,
+                                   sampleOrder=correctSampleOrder,
+                                   targetVar=scores,
+                                   scoringMetric="proportionWeightedMean",
+                                   absVal=TRUE))
+    tfRSS$regionSetName        <- regionSetName
+    tfRSS$regionSetDescription <- regionSetDescription
+    return(tfRSS)    
+}
+