Skip to content

Commit

Permalink
Merge pull request #161 from databio/feat_biocproject
Browse files Browse the repository at this point in the history
Feat biocproject
  • Loading branch information
jpsmith5 authored Nov 17, 2020
2 parents 3547db1 + 089f9a2 commit 8c478e6
Show file tree
Hide file tree
Showing 17 changed files with 406 additions and 77 deletions.
4 changes: 2 additions & 2 deletions BiocProject/PEPATAC_BiocProject.Rmd
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
---
title: "PEPATAC BiocProject"
author: "Michal Stolarczyk"
author: c("Michal Stolarczyk", "Jason Smith")
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
Expand Down Expand Up @@ -35,7 +35,7 @@ The way the output files are read is defined in a [function](https://github.com/

```{r echo=T, message=FALSE}
library(BiocProject)
ProjectConfig = "gold_hg19.yaml"
ProjectConfig = "gold_config.yaml"
```

## Run the `BiocProject` function
Expand Down
6 changes: 0 additions & 6 deletions BiocProject/gold_atac_annotation.csv

This file was deleted.

40 changes: 40 additions & 0 deletions BiocProject/gold_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# This project config file describes your project. See looper docs for details.
name: BiocProject # The name that summary files will be prefaced with

pep_version: 2.0.0
sample_table: gold_sample_table.csv # sheet listing all samples in the project

looper: # relative paths are relative to this config file
output_dir: "$PROCESSED/pepatac/gold" # ABSOLUTE PATH to the parent, shared space where project results go
pipeline_interfaces: ["$CODE/pepatac/project_pipeline_interface.yaml"] # ABSOLUTE PATH to the directory where looper will find the pipeline repository

sample_modifiers:
append:
pipeline_interfaces: ["$CODE/pepatac/sample_pipeline_interface.yaml"]
derive:
attributes: [read1, read2]
sources:
SRA_1: "${SRAFQ}{SRR}_1.fastq.gz"
SRA_2: "${SRAFQ}{SRR}_2.fastq.gz"
imply:
- if:
organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"]
then:
genome: hg38
macs_genome_size: hs
prealignments: rCRSd
aligner: bowtie2 # Default. [options: bwa]
deduplicator: samblaster # Default. [options: picard]
trimmer: skewer # Default. [options: pyadapt, trimmomatic]
peak_caller: macs2 # Default. [options: fseq, genrich, hmmratac, homer]
peak_type: fixed # Default. [options: variable]
extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream.
frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run
blacklist: $GENOMES/hg38/blacklist/default/hg38_blacklist.bed.gz

bioconductor:
readFunName: readNarrowPeak
readFunPath: readNarrowPeak.R
funcArgs:
output_dir: "$PROCESSED/pepatac/gold"
results_subdir: "$PROCESSED/pepatac/gold/results_pipeline/"
19 changes: 0 additions & 19 deletions BiocProject/gold_hg19.yaml

This file was deleted.

6 changes: 6 additions & 0 deletions BiocProject/gold_sample_table.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
sample_name,organism,protocol,SRR,read_type,read1,read2
gold1,human,ATAC-seq,SRR5210416,PAIRED,SRA_1,SRA_2
gold2,human,ATAC-seq,SRR5210450,PAIRED,SRA_1,SRA_2
gold3,human,ATAC-seq,SRR5210398,PAIRED,SRA_1,SRA_2
gold4,human,ATAC-seq,SRR5210428,PAIRED,SRA_1,SRA_2
gold5,human,ATAC-seq,SRR5210390,PAIRED,SRA_1,SRA_2
44 changes: 44 additions & 0 deletions BiocProject/readConsensusPeak.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
readConsensusPeak = function(project, summary_dir) {
cwd = getwd()
# Construct sample table
sample_table <- data.table(
genome=unique(pepr::sampleTable(project)$genome)
)
sample_table[,peak_file_path:=.(file.path(
summary_dir,
paste0(config(project)$name, "_", sample_table$genome,
"_consensusPeaks.narrowPeak")
))]
# Only keep samples with valid peak files
file_list <- sample_table$peak_file_path
file_exists <- character()
for (i in 1:length(file_list)) {
if(file.exists(file.path(file_list[i]))) {
file_exists <- append(file_exists, file.path(file_list[i]))
}
}
files <- data.table(peak_file_path=file_exists)
consensus_peak_files = list()
if (nrow(files) == 0) {
warning("No valid peak files exist.")
return(GenomicRanges::GRangesList())
}
# Ensure duplicates do not exist
sample_table <- unique(
sample_table[sample_table$peak_file_path %in% files$peak_file_path,])
# Load peak files as Granges objects
sample_table[, peak_file := .(lapply(
peak_file_path,
function(x) {
GenomicRanges::GRanges(
fread(x, col.names=c("chr", "start", "end", "name", "score",
"strand", "signalValue", "pValue", "qValue",
"peak")))
}))]
# Convert to GRangesList and name the individual Granges
peak_files <- GenomicRanges::GRangesList(sample_table$peak_file)
names(peak_files) <- sample_table$genome
setwd(cwd)
return(peak_files)
}

80 changes: 80 additions & 0 deletions BiocProject/readCoverage.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
readCoverage = function(project, results_subdir) {
cwd = getwd()
# Construct sample table
sample_table <- data.table(
sample_name=pepr::sampleTable(project)$sample_name,
genome=pepr::sampleTable(project)$genome
)
# Check if coverage files are compressed
if (any(file.exists(file.path(results_subdir,
sample_table$sample_name,
paste0("peak_calling_", sample_table$genome),
paste0(sample_table$sample_name,
"_ref_peaks_coverage.bed.gz"))))) {
ext <- ".bed.gz"
} else if (any(file.exists(file.path(results_subdir,
sample_table$sample_name,
paste0("peak_calling_", sample_table$genome),
paste0(sample_table$sample_name,
"_peaks_coverage.bed.gz"))))) {
ext <- ".bed.gz"
} else {
ext <- ".bed"
}
# Use reference peak coverage file if available
if (any(file.exists(file.path(results_subdir,
sample_table$sample_name,
paste0("peak_calling_", sample_table$genome),
paste0(sample_table$sample_name,
"_ref_peaks_coverage", ext))))) {
peak_file_name = paste0("_ref_peaks_coverage", ext)
reference = TRUE
} else {
warning("Peak coverage files are not derived from a singular reference peak set.")
reference = FALSE
peak_file_name = paste0("_peaks_coverage", ext)
}
# Construct paths to peak coverage files
sample_table[,cov_file_path:=.((file.path(
results_subdir,
sample_table$sample_name,
paste0("peak_calling_", sample_table$genome),
paste0(sample_table$sample_name, peak_file_name))))]
# Only keep samples with valid peak files
file_list <- sample_table$cov_file_path
file_exists <- character()
for (i in 1:length(file_list)) {
if(file.exists(file.path(file_list[i]))) {
file_exists <- append(file_exists, file.path(file_list[i]))
}
}
files <- data.table(cov_file_path=file_exists)
if (nrow(files) == 0) {
warning("No valid peak files exist.")
return(data.table())
}
# Ensure duplicates do not exist
sample_table <- unique(
sample_table[sample_table$cov_file_path %in% files$cov_file_path,])
# Column names are dependent on source file
if (reference) {
column_names <- c("chr", "start", "end", "name", "score", "strand",
"signalValue", "pValue", "qValue", "peak",
"read_count", "base_count", "width", "frac")
} else {
column_names <- c("chr", "start", "end", "read_count",
"base_count", "width", "frac", "norm")
}
# Load coverage files and return list of data.tables
sample_table[, cov_file := .(lapply(cov_file_path,
fread, col.names=column_names))]
# sample_table[, cov_file := .(lapply(
# cov_file_path, function(x) {
# x <- fread(x, col.names=column_names)
# data.table(x)
# }))]
cov_files <- sample_table$cov_file
names(cov_files) <- sample_table$sample_name
setwd(cwd)
return(cov_files)
}
46 changes: 46 additions & 0 deletions BiocProject/readNarrowPeak.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
readNarrowPeak = function(project, results_subdir) {
cwd = getwd()
# Construct sample table
sample_table <- data.table(
sample_name=pepr::sampleTable(project)$sample_name,
genome=pepr::sampleTable(project)$genome
)
# Identify peak file paths
sample_table[,peak_file_path:=.((file.path(
results_subdir,
sample_table$sample_name,
paste0("peak_calling_", sample_table$genome),
paste0(sample_table$sample_name,
"_peaks_normalized.narrowPeak"))))]
# Only keep samples with valid peak files
file_list <- sample_table$peak_file_path
file_exists <- character()
for (i in 1:length(file_list)) {
if(file.exists(file.path(file_list[i]))) {
file_exists <- append(file_exists, file.path(file_list[i]))
}
}
files <- data.table(peak_file_path=file_exists)
consensus_peak_files = list()
if (nrow(files) == 0) {
warning("No valid peak files exist.")
return(GenomicRanges::GRangesList())
}
# Ensure duplicates do not exist
sample_table <- unique(
sample_table[sample_table$peak_file_path %in% files$peak_file_path,])
# Load peak files as Granges objects
sample_table[, peak_file := .(lapply(
peak_file_path,
function(x) {
GenomicRanges::GRanges(
fread(x, col.names=c("chr", "start", "end", "name", "score", "strand",
"signalValue", "pValue", "qValue", "peak")))
}))]
# Convert to GRangesList and name the individual Granges
peak_files <- GenomicRanges::GRangesList(sample_table$peak_file)
names(peak_files) <- sample_table$sample_name
setwd(cwd)
return(peak_files)
}

18 changes: 0 additions & 18 deletions BiocProject/readPepatacPeakBeds.R

This file was deleted.

97 changes: 97 additions & 0 deletions BiocProject/runCOCOA.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
runCOCOA = function(project, results_subdir, summary_dir,
genome, conditions, lolaPath) {
# verify condition table
if(!all(c("sample_name", "condition") %in% tolower(colnames(conditions)))) {
warning("Conditions must include 'sample_name' and 'condition' columns.")
return(1)
}
# load required packages
required_packages <- c("COCOA", "LOLA", "data.table")
for (package in required_packages) {
loadLibrary <- tryCatch (
{
suppressPackageStartupMessages(
suppressWarnings(library(package, character.only=TRUE)))
},
error=function(e) {
message("Error: Install the \"", package,
"\" library before proceeding.")
return(NULL)
},
warning=function(e) {
message(e)
return(1)
}
)
if (length(loadLibrary)!=0) {
suppressWarnings(library(package, character.only=TRUE))
} else {
warning()
return(1)
}
}
# Load peak count table
count_table_path = file.path(summary_dir,
paste0(config(project)$name, "_", genome,
"_peaks_coverage.tsv"))
if(file.exists(count_table_path)) {
ct <- data.table::fread(count_table_path)
} else {
warning(paste0("Could not load ", count_table_path))
return(1)
}
# Subset count table by condition table
cols <- colnames(ct)[c(TRUE ,colnames(ct) %in% conditions$sample_name)]
ct <- ct[, cols, with=FALSE]
# Generate matrix
m <- as.matrix(ct[, 2:ncol(ct)])
rownames(m) <- ct$name
mat <- t(m)
mat <- mat[, -grep("random", colnames(mat))]
mat <- mat[, -grep("GL", colnames(mat))]
mat <- mat[, -grep("chrUn", colnames(mat))]
pca <- prcomp(mat)
row.names(pca$x) <- rownames(mat)
loadings <- pca$rotation
scores <- pca$x
rownames(scores) <- rownames(mat)

delIdx <- c(grep("random", rownames(m)),
grep("GL", rownames(m)),
grep("chrUn", rownames(m)))
coord_list <- data.table::tstrsplit(rownames(m)[-delIdx], "_",
type.convert = TRUE, fixed = TRUE)
peaks <- data.table::data.table(chr=coord_list[[1]],
start=coord_list[[2]],
end=coord_list[[3]])
# Load region database
regionSetDB <- suppressWarnings(loadRegionDB(lolaPath))
# Load the ENCODE transcription factor binding sites
regionAnno <- regionSetDB$regionAnno
collectionInd <- regionAnno$collection %in% "encode_tfbs"
#message("load just TFBS")
TFGR <- GRangesList(regionSetDB$regionGRL[collectionInd])
regionAnno <- regionAnno[collectionInd, ]
regionSetName <- regionAnno$filename
regionSetDescription <- regionAnno$description
#message("remove regionSetDB")
# Remove full DB to release memory
rm("regionSetDB")

# Subset principal components to annotate
PCsToAnnotate <- paste0("PC", 1:6)
correctSampleOrder <- 1:nrow(mat)
message("run COCOA")
tfRSS <- local(COCOA::runCOCOA(genomicSignal=loadings,
signalCoord=peaks,
GRList=TFGR,
signalCol=PCsToAnnotate,
sampleOrder=correctSampleOrder,
targetVar=scores,
scoringMetric="proportionWeightedMean",
absVal=TRUE))
tfRSS$regionSetName <- regionSetName
tfRSS$regionSetDescription <- regionSetDescription
return(tfRSS)
}

Loading

0 comments on commit 8c478e6

Please sign in to comment.