Skip to content

Commit

Permalink
Merge pull request #169 from databio/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
jpsmith5 authored Feb 5, 2021
2 parents 3291788 + 9c6322d commit 6bc3cb9
Show file tree
Hide file tree
Showing 18 changed files with 740 additions and 125 deletions.
15 changes: 15 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ _site
.sass-cache
_site/
/_site/
_build/
.sass-cache/
.jekyll-metadata

Expand All @@ -32,3 +33,17 @@ anno/mm9_annotations.bed.gz
# Tutorial files
examples/data/tutorial_r1.fastq.gz
examples/data/tutorial_r2.fastq.gz
examples/gold_atac/metadata/distinct.bed
examples/gold_atac/metadata/distinct_only.bed
examples/gold_atac/metadata/gold_fseq.yaml
examples/gold_atac/metadata/gold_genrich.yaml
examples/gold_atac/metadata/gold_hmmratac.yaml
examples/gold_atac/metadata/gold_homer.yaml
examples/gold_atac/metadata/gold_picard_dedup.yaml
examples/gold_atac/metadata/gold_samtools_dedup.yaml
examples/test_project/test_bwa.yaml
examples/test_project/test_fseq.yaml
examples/test_project/test_genrich.yaml
examples/test_project/test_hmmratac.yaml
examples/test_project/test_homer.yaml
examples/test_project/test_macs.yaml
68 changes: 68 additions & 0 deletions PEP_schema.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Schema for a minimal PEP; the description and version are given below.
# NOTE(review): nesting indentation is not visible in this view. Presumably
# the keys from pep_version through project_modifiers belong to the config
# section -- confirm against the checked-in file.
description: "Schema for a minimal PEP"
version: "2.0.0"
properties:
# Project name: a string matching ^\S*$, i.e. containing no whitespace.
name:
type: string
pattern: "^\\S*$"
description: "Project name with no whitespace"
# Project configuration section.
config:
pep_version:
description: "Version of the PEP Schema this PEP follows"
type: string
# Paths (strings) to the sample and subsample annotation tables.
sample_table:
type: string
description: "Path to the sample annotation table with one row per sample"
subsample_table:
type: string
description: "Path to the subsample annotation table with one row per subsample and sample_name attribute matching an entry in the sample table"
# Sample modifiers: append, duplicate, imply and derive.
sample_modifiers:
type: object
properties:
append:
type: object
duplicate:
type: object
# imply is an array of objects, each with an "if" object and a "then" object.
imply:
type: array
items:
type: object
properties:
if:
type: object
then:
type: object
# derive lists attribute names (array of strings) and a sources object.
derive:
type: object
properties:
attributes:
type: array
items:
type: string
sources:
type: object
# Project modifiers: amend (object) and import (array of strings).
project_modifiers:
type: object
properties:
amend:
description: "Object overwriting original project attributes"
type: object
import:
description: "List of external PEP project config files to import"
type: array
items:
type: string
# pep_version is listed as required (presumably for the config section -- confirm).
required:
- pep_version
# samples is an array of objects; sample_name is a string with no whitespace.
samples:
type: array
items:
type: object
properties:
sample_name:
type: string
pattern: "^\\S*$"
description: "Unique name of the sample with no whitespace"
# sample_name is listed as required (presumably for each sample item -- confirm).
required:
- sample_name
# samples is listed as required (presumably at the top level -- confirm).
required:
- samples
8 changes: 8 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@
All notable changes to this project will be documented in this file.


## [0.9.14] -- 2021-02-05

### Changed
- Update F-Seq to F-Seq2
- Add the option to install using conda
- Add script for easing installation of seqOutBias
- Use https for PEP schema; list alternative local PEP schema approach

## [0.9.13] -- 2020-12-02

### Changed
Expand Down
13 changes: 8 additions & 5 deletions docs/files/examples/gold/gold_reports/fastqc_report_r1.html
Original file line number Diff line number Diff line change
Expand Up @@ -226,11 +226,14 @@
<body>
<div id="top"></div>
<div class="container">

<p><b>No objects to display for: <code>FastQC report r1</code></b><p>

<!-- Expects a 2 lists of lists: figures and links -->


<h5>FastQC r1 reports</h5>

<a href='../results_pipeline/gold1/fastqc/gold1_R1_trim_fastqc.html' class="list-group-item">gold1 FastQC report r1</a>
<a href='../results_pipeline/gold2/fastqc/gold2_R1_trim_fastqc.html' class="list-group-item">gold2 FastQC report r1</a>
<a href='../results_pipeline/gold3/fastqc/gold3_R1_trim_fastqc.html' class="list-group-item">gold3 FastQC report r1</a>
<a href='../results_pipeline/gold4/fastqc/gold4_R1_trim_fastqc.html' class="list-group-item">gold4 FastQC report r1</a>
<a href='../results_pipeline/gold5/fastqc/gold5_R1_trim_fastqc.html' class="list-group-item">gold5 FastQC report r1</a>

</div>
</body>
Expand Down
10 changes: 7 additions & 3 deletions docs/files/examples/gold/gold_reports/fastqc_report_r2.html
Original file line number Diff line number Diff line change
Expand Up @@ -227,9 +227,13 @@
<div id="top"></div>
<div class="container">

<p><b>No objects to display for: <code>FastQC report r2</code></b><p>

<!-- Expects a 2 lists of lists: figures and links -->
<h5>FastQC r2 reports</h5>

<a href='../results_pipeline/gold1/fastqc/gold1_R2_trim_fastqc.html' class="list-group-item">gold1 FastQC report r2</a>
<a href='../results_pipeline/gold2/fastqc/gold2_R2_trim_fastqc.html' class="list-group-item">gold2 FastQC report r2</a>
<a href='../results_pipeline/gold3/fastqc/gold3_R2_trim_fastqc.html' class="list-group-item">gold3 FastQC report r2</a>
<a href='../results_pipeline/gold4/fastqc/gold4_R2_trim_fastqc.html' class="list-group-item">gold4 FastQC report r2</a>
<a href='../results_pipeline/gold5/fastqc/gold5_R2_trim_fastqc.html' class="list-group-item">gold5 FastQC report r2</a>


</div>
Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

97 changes: 51 additions & 46 deletions docs/install.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,84 +6,89 @@
git clone https://github.com/databio/pepatac.git
```

## 2: Download `refgenie` assets
## 2: Install required software

PEPATAC uses [`refgenie`](http://refgenie.databio.org/) assets for alignment. If you haven't already, initialize a refgenie config file like this:

```console
pip install --user refgenie
export REFGENIE=your_genome_folder/genome_config.yaml
refgenie init -c $REFGENIE
```
You have two options for software prerequisites: 1) use containers, or 2) install all prerequisites natively. If you want to use containers, you need the [multi-container environment manager, `bulker`](https://bulker.databio.org/en/latest/), and either `docker` or `singularity` -- please see instructions in [how to run PEPATAC with containers](run-container.md). Otherwise, follow these instructions to install the requirements natively:

Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it persists.
### Tools

Next, pull the assets you need. Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). Download these required assets with this command:
You will need some common bioinformatics tools installed: [bedtools (v2.25.0+)](http://bedtools.readthedocs.io/en/latest/), [bowtie2 (v2.2.9+)](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml), [preseq (v2.0+)](http://smithlabresearch.org/software/preseq/), [samblaster (v0.1.24+)](https://github.com/GregoryFaust/samblaster), [samtools (v1.7+)](http://www.htslib.org/), [skewer (v0.1.126+)](https://github.com/relipmoc/skewer), [UCSC tools](http://hgdownload.soe.ucsc.edu/admin/exe/) (wigToBigWig, bigWigCat, bedToBigBed), [pigz (v2.3.4+)](https://zlib.net/pigz/).

```console
refgenie pull hg38/bowtie2_index refgene_anno feat_annotation
```
Optionally, `PEPATAC` can report on fastq quality ([FastQC](https://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqc)) and utilize swappable tools for adapter removal ([trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic)), deduplication ([picard](https://broadinstitute.github.io/picard/)), and signal track generation ([seqOutBias](https://github.com/guertinlab/seqOutBias), [bedGraphToBigWig](http://hgdownload.soe.ucsc.edu/admin/exe/), and [bigWigMerge](http://hgdownload.soe.ucsc.edu/admin/exe/)).

PEPATAC also requires `bowtie2_index` for any pre-alignment genomes:
The easiest and preferred way is to use `conda` to install all the tools in a single command, but be prepared for this initial installation to take more than an hour to complete.

```console
refgenie pull rCRSd/bowtie2_index
refgenie pull human_repeats/bowtie2_index
From the `pepatac/` directory:
```{bash}
conda env create -f requirements-conda.yml
```

## 3: Install required software

You have two options for software prerequisites: 1) use a container, or 2) install all prerequisites natively. If you want to use containers, you need our [multi-container environment manager, `bulker`](https://bulker.databio.org/en/latest/), and either `docker` or `singularity` -- please see instructions in [how to run PEPATAC with containers](run-container.md). Otherwise, follow these instructions to install the requirements natively:
Note: The subsequent steps all assume you have installed using `conda`. Alternatively, you can follow instructions to install each individual program natively. If you need additional direction with this approach, see the [detailed installation instructions](detailed-install.md).

### Python packages

`PEPATAC` uses several packages under the hood. From the `pepatac/` directory:
`PEPATAC` uses several Python packages under the hood. Not all of these are available through `conda`, so we'll install them ourselves into the `pepatac` `conda` environment. From the `pepatac/` directory:

```{bash}
pip install --user -r requirements.txt
conda activate pepatac
unset PYTHONPATH
python -m pip install --ignore-installed --upgrade -r requirements.txt
```

### R packages

`PEPATAC` uses `R` to generate quality control and read/peak annotation plots, so you'll need to have R functional if you want these outputs. We have packaged all the `R` code into a supporting package called [PEPATACr](https://github.com/databio/pepatac/tree/dev/PEPATACr). The `PEPATAC` package relies on a few additional packages which can be installed at the command line as follows:
`PEPATAC` uses `R` to generate quality control and read/peak annotation plots. We have packaged the `pepatac`-specific `R` code into a supporting package called [PEPATACr](https://github.com/databio/pepatac/tree/dev/PEPATACr). The `PEPATACr` package relies on a few additional packages, which can be installed to the `conda` environment.

```
Rscript -e 'install.packages("devtools")'
Rscript -e 'devtools::install_github("pepkit/pepr")'
Rscript -e 'install.packages("BiocManager")'
Rscript -e 'BiocManager::install("GenomicRanges")'
Rscript -e 'devtools::install_github("databio/GenomicDistributions")'
Rscript -e 'BiocManager::install(c("BSgenome", "GenomicFeatures", "ensembldb"))'
Rscript -e 'install.packages("http://big.databio.org/GenomicDistributionsData/GenomicDistributionsData_0.0.1.tar.gz", repos=NULL)'
To ensure these packages are installed to the `pepatac` `conda` environment, make sure to point your `R_LIBS` environment variable to the `conda` environment `R` library. For example:
```{bash}
conda activate pepatac
unset R_LIBS
export R_LIBS="$CONDA_PREFIX/lib/R/library"
```

Then, install the `PEPATAC` package. From the `pepatac/` directory:
```
Rscript -e 'devtools::install(file.path("PEPATACr/"), dependencies=TRUE, repos="https://cloud.r-project.org/")'
From the `pepatac/` directory, open `R` and install the following packages:
```{R}
install.packages("optigrab")
devtools::install_github("databio/GenomicDistributions")
install.packages("http://big.databio.org/GenomicDistributionsData/GenomicDistributionsData_0.0.2.tar.gz", repos=NULL)
devtools::install(file.path("PEPATACr/"), dependencies=TRUE, repos="https://cloud.r-project.org/")
```

### Tools
## 3: Download `refgenie` assets

We will need some common bioinformatics tools installed: [bedtools (v2.25.0+)](http://bedtools.readthedocs.io/en/latest/), [bowtie2 (v2.2.9+)](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml), [preseq (v2.0+)](http://smithlabresearch.org/software/preseq/), [samblaster (v0.1.24+)](https://github.com/GregoryFaust/samblaster), [samtools (v1.7+)](http://www.htslib.org/), [skewer (v0.1.126+)](https://github.com/relipmoc/skewer), [UCSC tools](http://hgdownload.soe.ucsc.edu/admin/exe/) (wigToBigWig, bigWigCat, bedToBigBed), [pigz (v2.3.4+)](https://zlib.net/pigz/). Optionally, `PEPATAC` can report on fastq quality ([FastQC](https://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqc)) and utilize swappable tools for adapter removal ([trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic)), deduplication ([picard](https://broadinstitute.github.io/picard/)), and signal track generation ([seqOutBias](https://github.com/guertinlab/seqOutBias), [bedGraphToBigWig](http://hgdownload.soe.ucsc.edu/admin/exe/), and [bigWigMerge](http://hgdownload.soe.ucsc.edu/admin/exe/)).
PEPATAC uses [`refgenie`](http://refgenie.databio.org/) assets for alignment. If you haven't already, initialize a refgenie config file like this:

You should follow instructions to install each individual program. If you need help installing these, see the [detailed installation instructions](detailed-install.md).
```console
export REFGENIE=/path/to/your_genome_folder/genome_config.yaml
refgenie init -c $REFGENIE
```

## 4: Run an example project through `PEPATAC`
Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it persists.

Start by running the example project (test_config.yaml) in the `examples/test_project/` folder. `PEPATAC` uses a project management tool called `looper` to run the pipeline across samples in a project. Let's use the `-d` argument to do a dry run, which will create job scripts for every sample in a project, but will not execute them:
Next, pull the assets you need. Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). Download these required assets with this command:

```console
refgenie pull hg38/fasta hg38/bowtie2_index hg38/refgene_anno hg38/ensembl_gtf hg38/ensembl_rb
refgenie build hg38/feat_annotation
```
cd pepatac
looper run -d examples/test_project/test_config.yaml

PEPATAC also requires a `bowtie2_index` asset for any pre-alignment genomes:

```console
refgenie pull rCRSd/bowtie2_index
refgenie pull human_repeats/bowtie2_index
```

If the looper executable is not in your $PATH, add the following line to your .bashrc or .profile:
## 4: Run an example project through `PEPATAC`

Start by running the example project (`test_config.yaml`) in the `examples/test_project/` folder. `PEPATAC` uses a project management tool called `looper` to run the pipeline across samples in a project. Let's use the `-d` argument to first try a dry run, which will create job scripts for every sample in a project, but will not execute them:

From the `pepatac/` folder:
```
export PATH=$PATH:~/.local/bin
looper run -d examples/test_project/test_config.yaml
```

If that worked, let's actually run the example by taking out the -d flag:
If that looked good, let's actually run the example by taking out the `-d` flag:
```
looper run examples/test_project/test_config.yaml
```
Expand All @@ -95,11 +100,11 @@ bulker activate databio/pepatac
looper run examples/test_project/test_config.yaml
```

There are lots of other cool things you can do with looper, like dry runs, summarize results, check on pipeline run status, clean intermediate files to save disk space, lump multiple samples into one job, and more. For details, consult the [looper docs](http://looper.databio.org/).
There are lots of other cool things you can do with looper, like dry runs, report results, check on pipeline run status, clean intermediate files to save disk space, lump multiple samples into one job, and more. For details, consult the [looper docs](http://looper.databio.org/).

## 5: Configure your project files

To run your own samples, you'll need to organize them in **PEP format**, which is explained in [how to create a PEP](https://pepkit.github.io/docs/home/) and is universal to all pipelines that read PEPs, including `PEPATAC`. To get you started, there are multiple examples you can adapt in the `examples/` folder (*e.g.* [example test PEP](https://github.com/databio/pepatac/tree/master/examples/test_project)). In short, you need two files for your project:
To run your own samples, you'll need to organize them in **PEP format**, which is explained in [how to create a PEP](http://pep.databio.org/en/latest/simple_example/) and is universal to all pipelines that read PEPs, including `PEPATAC`. To get you started, there are multiple examples you can adapt in the `examples/` folder (*e.g.* [example test PEP](https://github.com/databio/pepatac/tree/master/examples/test_project)). In short, you need two files for your project:

1. project config file -- describes output locations, pointers to data, etc.
2. sample annotation file -- comma-separated value (CSV) list of your samples.
Expand Down
2 changes: 1 addition & 1 deletion docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V]
[--motif] [--sob] [--no-scale] [--prioritize] [--keep]
[--noFIFO] [--lite] [--skipqc] [-V]
PEPATAC version 0.9.13
PEPATAC version 0.9.14
optional arguments:
-h, --help show this help message and exit
Expand Down
12 changes: 6 additions & 6 deletions examples/test_project/test_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@ pep_version: 2.0.0
sample_table: test_annotation.csv # sheet listing all samples in the project

looper: # relative paths are relative to this config file
output_dir: "$PROCESSED/pepatac_test" # ABSOLUTE PATH to the parent, shared space where project results go
pipeline_interfaces: ["$CODE/pepatac/project_pipeline_interface.yaml"] # ABSOLUTE PATH to the directory where looper will find the pipeline repository
output_dir: pepatac_test
pipeline_interfaces: ../../project_pipeline_interface.yaml # PATH to the directory where looper will find the pipeline repository.

sample_modifiers:
append:
pipeline_interfaces: ["$CODE/pepatac/sample_pipeline_interface.yaml"]
pipeline_interfaces: ../../sample_pipeline_interface.yaml
derive:
attributes: [read1, read2]
sources:
test_data_R1: "$CODE/pepatac/examples/data/{sample_name}_r1.fastq.gz"
test_data_R2: "$CODE/pepatac/examples/data/{sample_name}_r2.fastq.gz"
test_data_R1: "examples/data/{sample_name}_r1.fastq.gz"
test_data_R2: "examples/data/{sample_name}_r2.fastq.gz"
imply:
- if:
organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"]
Expand All @@ -29,4 +29,4 @@ sample_modifiers:
peak_type: fixed # Default. [options: variable]
extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream.
frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run
blacklist: $GENOMES/hg38/blacklist/default/hg38_blacklist.bed.gz

24 changes: 24 additions & 0 deletions install_sob.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
#
# Download the prebuilt seqOutBias v1.2.0 binary for this platform and link it
# into the bin/ directory of the `pepatac` conda environment.
#
# A download is attempted only on macOS (darwin) and Linux; for any other
# OSTYPE a pointer to the upstream installation instructions is printed.
# Requires `wget`, `tar` and `conda` on PATH, plus an existing conda
# environment named exactly `pepatac`.
#
# Exit status: non-zero if the download fails, the `pepatac` environment is
# missing, or unpacking/linking fails.

TARBALL="seqOutBias_v1.2.0.tgz"
UNPACK_DIR="seqOutBias_v1.2.0"
RELEASE_URL="https://github.com/guertinlab/seqOutBias/releases/download/v1.2.0"

# Fetch a release asset into $TARBALL. `wget -O` creates the output file even
# when the transfer fails, so remove it on error to avoid unpacking garbage.
download() {
    if ! wget -O "$TARBALL" "$RELEASE_URL/$1"; then
        rm -f "$TARBALL"
        exit 1
    fi
}

case "$OSTYPE" in
    solaris*) echo "SOLARIS; See https://github.com/guertinlab/seqOutBias for installation." ;;
    darwin*)  download "seqOutBias_v1.2.0.bin.osx.x86_64.tgz" ;;
    linux*)   download "seqOutBias_v1.2.0.bin.linux.x86_64.tgz" ;;
    bsd*)     echo "BSD; See https://github.com/guertinlab/seqOutBias for installation." ;;
    msys*)    echo "WINDOWS; See https://github.com/guertinlab/seqOutBias for installation." ;;
    *)        echo "unknown: $OSTYPE; See https://github.com/guertinlab/seqOutBias for installation." ;;
esac

# The tarball is only present when a download branch ran (or a previous run
# left one behind); on unsupported platforms there is nothing more to do.
if [ -f "$TARBALL" ]; then
    # Compare the environment name exactly: a substring match would also
    # accept unrelated environments such as `pepatac_old`.
    if conda env list | awk '$1 == "pepatac" { found = 1 } END { exit !found }'; then
        # `conda activate` is a shell function that is not defined in a
        # non-interactive script; load conda's bash hook before using it.
        eval "$(conda shell.bash hook)"
        conda activate pepatac || exit 1

        # -p and -f make the script safe to re-run after a partial install.
        mkdir -p "$UNPACK_DIR" \
            && tar xvf "$TARBALL" -C "$UNPACK_DIR" --strip-components 1 \
            && ln -sf "$(pwd)/$UNPACK_DIR/seqOutBias" "$CONDA_PREFIX/bin/seqOutBias" \
            || exit 1
    else
        # Single quotes keep the backticks literal; inside double quotes the
        # shell would execute the command instead of printing it.
        echo 'Error: Please run `conda env create -f requirements-conda.yml` first.'
        exit 1
    fi
fi
3 changes: 2 additions & 1 deletion pepatac_input_schema.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
description: A PEP for ATAC-seq samples for the PEPATAC pipeline.
imports:
- http://schema.databio.org/pep/2.0.0.yaml
- https://schema.databio.org/pep/2.0.0.yaml # Use a web-based schema source file
#- PEP_schema.yaml # Use a local (current directory) schema source file
properties:
samples:
type: array
Expand Down
Loading

0 comments on commit 6bc3cb9

Please sign in to comment.