-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #161 from databio/feat_biocproject
Feat biocproject
- Loading branch information
Showing
17 changed files
with
406 additions
and
77 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# This project config file describes your project. See looper docs for details. | ||
name: BiocProject # The name that summary files will be prefaced with | ||
|
||
pep_version: 2.0.0 | ||
sample_table: gold_sample_table.csv # sheet listing all samples in the project | ||
|
||
looper: # relative paths are relative to this config file | ||
output_dir: "$PROCESSED/pepatac/gold" # ABSOLUTE PATH to the parent, shared space where project results go | ||
pipeline_interfaces: ["$CODE/pepatac/project_pipeline_interface.yaml"] # ABSOLUTE PATH to the directory where looper will find the pipeline repository | ||
|
||
sample_modifiers: | ||
append: | ||
pipeline_interfaces: ["$CODE/pepatac/sample_pipeline_interface.yaml"] | ||
derive: | ||
attributes: [read1, read2] | ||
sources: | ||
SRA_1: "${SRAFQ}{SRR}_1.fastq.gz" | ||
SRA_2: "${SRAFQ}{SRR}_2.fastq.gz" | ||
imply: | ||
- if: | ||
organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] | ||
then: | ||
genome: hg38 | ||
macs_genome_size: hs | ||
prealignments: rCRSd | ||
aligner: bowtie2 # Default. [options: bwa] | ||
deduplicator: samblaster # Default. [options: picard] | ||
trimmer: skewer # Default. [options: pyadapt, trimmomatic] | ||
peak_caller: macs2 # Default. [options: fseq, genrich, hmmratac, homer] | ||
peak_type: fixed # Default. [options: variable] | ||
extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream. | ||
frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run | ||
blacklist: $GENOMES/hg38/blacklist/default/hg38_blacklist.bed.gz | ||
|
||
bioconductor: | ||
readFunName: readNarrowPeak | ||
readFunPath: readNarrowPeak.R | ||
funcArgs: | ||
output_dir: "$PROCESSED/pepatac/gold" | ||
results_subdir: "$PROCESSED/pepatac/gold/results_pipeline/" |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
sample_name,organism,protocol,SRR,read_type,read1,read2 | ||
gold1,human,ATAC-seq,SRR5210416,PAIRED,SRA_1,SRA_2 | ||
gold2,human,ATAC-seq,SRR5210450,PAIRED,SRA_1,SRA_2 | ||
gold3,human,ATAC-seq,SRR5210398,PAIRED,SRA_1,SRA_2 | ||
gold4,human,ATAC-seq,SRR5210428,PAIRED,SRA_1,SRA_2 | ||
gold5,human,ATAC-seq,SRR5210390,PAIRED,SRA_1,SRA_2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
readConsensusPeak = function(project, summary_dir) { | ||
cwd = getwd() | ||
# Construct sample table | ||
sample_table <- data.table( | ||
genome=unique(pepr::sampleTable(project)$genome) | ||
) | ||
sample_table[,peak_file_path:=.(file.path( | ||
summary_dir, | ||
paste0(config(project)$name, "_", sample_table$genome, | ||
"_consensusPeaks.narrowPeak") | ||
))] | ||
# Only keep samples with valid peak files | ||
file_list <- sample_table$peak_file_path | ||
file_exists <- character() | ||
for (i in 1:length(file_list)) { | ||
if(file.exists(file.path(file_list[i]))) { | ||
file_exists <- append(file_exists, file.path(file_list[i])) | ||
} | ||
} | ||
files <- data.table(peak_file_path=file_exists) | ||
consensus_peak_files = list() | ||
if (nrow(files) == 0) { | ||
warning("No valid peak files exist.") | ||
return(GenomicRanges::GRangesList()) | ||
} | ||
# Ensure duplicates do not exist | ||
sample_table <- unique( | ||
sample_table[sample_table$peak_file_path %in% files$peak_file_path,]) | ||
# Load peak files as Granges objects | ||
sample_table[, peak_file := .(lapply( | ||
peak_file_path, | ||
function(x) { | ||
GenomicRanges::GRanges( | ||
fread(x, col.names=c("chr", "start", "end", "name", "score", | ||
"strand", "signalValue", "pValue", "qValue", | ||
"peak"))) | ||
}))] | ||
# Convert to GRangesList and name the individual Granges | ||
peak_files <- GenomicRanges::GRangesList(sample_table$peak_file) | ||
names(peak_files) <- sample_table$genome | ||
setwd(cwd) | ||
return(peak_files) | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
readCoverage = function(project, results_subdir) { | ||
cwd = getwd() | ||
# Construct sample table | ||
sample_table <- data.table( | ||
sample_name=pepr::sampleTable(project)$sample_name, | ||
genome=pepr::sampleTable(project)$genome | ||
) | ||
# Check if coverage files are compressed | ||
if (any(file.exists(file.path(results_subdir, | ||
sample_table$sample_name, | ||
paste0("peak_calling_", sample_table$genome), | ||
paste0(sample_table$sample_name, | ||
"_ref_peaks_coverage.bed.gz"))))) { | ||
ext <- ".bed.gz" | ||
} else if (any(file.exists(file.path(results_subdir, | ||
sample_table$sample_name, | ||
paste0("peak_calling_", sample_table$genome), | ||
paste0(sample_table$sample_name, | ||
"_peaks_coverage.bed.gz"))))) { | ||
ext <- ".bed.gz" | ||
} else { | ||
ext <- ".bed" | ||
} | ||
# Use reference peak coverage file if available | ||
if (any(file.exists(file.path(results_subdir, | ||
sample_table$sample_name, | ||
paste0("peak_calling_", sample_table$genome), | ||
paste0(sample_table$sample_name, | ||
"_ref_peaks_coverage", ext))))) { | ||
peak_file_name = paste0("_ref_peaks_coverage", ext) | ||
reference = TRUE | ||
} else { | ||
warning("Peak coverage files are not derived from a singular reference peak set.") | ||
reference = FALSE | ||
peak_file_name = paste0("_peaks_coverage", ext) | ||
} | ||
# Construct paths to peak coverage files | ||
sample_table[,cov_file_path:=.((file.path( | ||
results_subdir, | ||
sample_table$sample_name, | ||
paste0("peak_calling_", sample_table$genome), | ||
paste0(sample_table$sample_name, peak_file_name))))] | ||
# Only keep samples with valid peak files | ||
file_list <- sample_table$cov_file_path | ||
file_exists <- character() | ||
for (i in 1:length(file_list)) { | ||
if(file.exists(file.path(file_list[i]))) { | ||
file_exists <- append(file_exists, file.path(file_list[i])) | ||
} | ||
} | ||
files <- data.table(cov_file_path=file_exists) | ||
if (nrow(files) == 0) { | ||
warning("No valid peak files exist.") | ||
return(data.table()) | ||
} | ||
# Ensure duplicates do not exist | ||
sample_table <- unique( | ||
sample_table[sample_table$cov_file_path %in% files$cov_file_path,]) | ||
# Column names are dependent on source file | ||
if (reference) { | ||
column_names <- c("chr", "start", "end", "name", "score", "strand", | ||
"signalValue", "pValue", "qValue", "peak", | ||
"read_count", "base_count", "width", "frac") | ||
} else { | ||
column_names <- c("chr", "start", "end", "read_count", | ||
"base_count", "width", "frac", "norm") | ||
} | ||
# Load coverage files and return list of data.tables | ||
sample_table[, cov_file := .(lapply(cov_file_path, | ||
fread, col.names=column_names))] | ||
# sample_table[, cov_file := .(lapply( | ||
# cov_file_path, function(x) { | ||
# x <- fread(x, col.names=column_names) | ||
# data.table(x) | ||
# }))] | ||
cov_files <- sample_table$cov_file | ||
names(cov_files) <- sample_table$sample_name | ||
setwd(cwd) | ||
return(cov_files) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
readNarrowPeak = function(project, results_subdir) { | ||
cwd = getwd() | ||
# Construct sample table | ||
sample_table <- data.table( | ||
sample_name=pepr::sampleTable(project)$sample_name, | ||
genome=pepr::sampleTable(project)$genome | ||
) | ||
# Identify peak file paths | ||
sample_table[,peak_file_path:=.((file.path( | ||
results_subdir, | ||
sample_table$sample_name, | ||
paste0("peak_calling_", sample_table$genome), | ||
paste0(sample_table$sample_name, | ||
"_peaks_normalized.narrowPeak"))))] | ||
# Only keep samples with valid peak files | ||
file_list <- sample_table$peak_file_path | ||
file_exists <- character() | ||
for (i in 1:length(file_list)) { | ||
if(file.exists(file.path(file_list[i]))) { | ||
file_exists <- append(file_exists, file.path(file_list[i])) | ||
} | ||
} | ||
files <- data.table(peak_file_path=file_exists) | ||
consensus_peak_files = list() | ||
if (nrow(files) == 0) { | ||
warning("No valid peak files exist.") | ||
return(GenomicRanges::GRangesList()) | ||
} | ||
# Ensure duplicates do not exist | ||
sample_table <- unique( | ||
sample_table[sample_table$peak_file_path %in% files$peak_file_path,]) | ||
# Load peak files as Granges objects | ||
sample_table[, peak_file := .(lapply( | ||
peak_file_path, | ||
function(x) { | ||
GenomicRanges::GRanges( | ||
fread(x, col.names=c("chr", "start", "end", "name", "score", "strand", | ||
"signalValue", "pValue", "qValue", "peak"))) | ||
}))] | ||
# Convert to GRangesList and name the individual Granges | ||
peak_files <- GenomicRanges::GRangesList(sample_table$peak_file) | ||
names(peak_files) <- sample_table$sample_name | ||
setwd(cwd) | ||
return(peak_files) | ||
} | ||
|
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
runCOCOA = function(project, results_subdir, summary_dir, | ||
genome, conditions, lolaPath) { | ||
# verify condition table | ||
if(!all(c("sample_name", "condition") %in% tolower(colnames(conditions)))) { | ||
warning("Conditions must include 'sample_name' and 'condition' columns.") | ||
return(1) | ||
} | ||
# load required packages | ||
required_packages <- c("COCOA", "LOLA", "data.table") | ||
for (package in required_packages) { | ||
loadLibrary <- tryCatch ( | ||
{ | ||
suppressPackageStartupMessages( | ||
suppressWarnings(library(package, character.only=TRUE))) | ||
}, | ||
error=function(e) { | ||
message("Error: Install the \"", package, | ||
"\" library before proceeding.") | ||
return(NULL) | ||
}, | ||
warning=function(e) { | ||
message(e) | ||
return(1) | ||
} | ||
) | ||
if (length(loadLibrary)!=0) { | ||
suppressWarnings(library(package, character.only=TRUE)) | ||
} else { | ||
warning() | ||
return(1) | ||
} | ||
} | ||
# Load peak count table | ||
count_table_path = file.path(summary_dir, | ||
paste0(config(project)$name, "_", genome, | ||
"_peaks_coverage.tsv")) | ||
if(file.exists(count_table_path)) { | ||
ct <- data.table::fread(count_table_path) | ||
} else { | ||
warning(paste0("Could not load ", count_table_path)) | ||
return(1) | ||
} | ||
# Subset count table by condition table | ||
cols <- colnames(ct)[c(TRUE ,colnames(ct) %in% conditions$sample_name)] | ||
ct <- ct[, cols, with=FALSE] | ||
# Generate matrix | ||
m <- as.matrix(ct[, 2:ncol(ct)]) | ||
rownames(m) <- ct$name | ||
mat <- t(m) | ||
mat <- mat[, -grep("random", colnames(mat))] | ||
mat <- mat[, -grep("GL", colnames(mat))] | ||
mat <- mat[, -grep("chrUn", colnames(mat))] | ||
pca <- prcomp(mat) | ||
row.names(pca$x) <- rownames(mat) | ||
loadings <- pca$rotation | ||
scores <- pca$x | ||
rownames(scores) <- rownames(mat) | ||
|
||
delIdx <- c(grep("random", rownames(m)), | ||
grep("GL", rownames(m)), | ||
grep("chrUn", rownames(m))) | ||
coord_list <- data.table::tstrsplit(rownames(m)[-delIdx], "_", | ||
type.convert = TRUE, fixed = TRUE) | ||
peaks <- data.table::data.table(chr=coord_list[[1]], | ||
start=coord_list[[2]], | ||
end=coord_list[[3]]) | ||
# Load region database | ||
regionSetDB <- suppressWarnings(loadRegionDB(lolaPath)) | ||
# Load the ENCODE transcription factor binding sites | ||
regionAnno <- regionSetDB$regionAnno | ||
collectionInd <- regionAnno$collection %in% "encode_tfbs" | ||
#message("load just TFBS") | ||
TFGR <- GRangesList(regionSetDB$regionGRL[collectionInd]) | ||
regionAnno <- regionAnno[collectionInd, ] | ||
regionSetName <- regionAnno$filename | ||
regionSetDescription <- regionAnno$description | ||
#message("remove regionSetDB") | ||
# Remove full DB to release memory | ||
rm("regionSetDB") | ||
|
||
# Subset principal components to annotate | ||
PCsToAnnotate <- paste0("PC", 1:6) | ||
correctSampleOrder <- 1:nrow(mat) | ||
message("run COCOA") | ||
tfRSS <- local(COCOA::runCOCOA(genomicSignal=loadings, | ||
signalCoord=peaks, | ||
GRList=TFGR, | ||
signalCol=PCsToAnnotate, | ||
sampleOrder=correctSampleOrder, | ||
targetVar=scores, | ||
scoringMetric="proportionWeightedMean", | ||
absVal=TRUE)) | ||
tfRSS$regionSetName <- regionSetName | ||
tfRSS$regionSetDescription <- regionSetDescription | ||
return(tfRSS) | ||
} | ||
|
Oops, something went wrong.