Merge pull request #169 from databio/dev

Dev
databio · Feb 5, 2021 · 6bc3cb9 · 6bc3cb9
2 parents 3291788 + 9c6322d
commit 6bc3cb9
Show file tree

Hide file tree

Showing 18 changed files with 740 additions and 125 deletions.
diff --git a/.gitignore b/.gitignore
@@ -20,6 +20,7 @@ _site
 .sass-cache
 _site/
 /_site/
+_build/
 .sass-cache/
 .jekyll-metadata
 
@@ -32,3 +33,17 @@ anno/mm9_annotations.bed.gz
 # Tutorial files
 examples/data/tutorial_r1.fastq.gz
 examples/data/tutorial_r2.fastq.gz
+examples/gold_atac/metadata/distinct.bed
+examples/gold_atac/metadata/distinct_only.bed
+examples/gold_atac/metadata/gold_fseq.yaml
+examples/gold_atac/metadata/gold_genrich.yaml
+examples/gold_atac/metadata/gold_hmmratac.yaml
+examples/gold_atac/metadata/gold_homer.yaml
+examples/gold_atac/metadata/gold_picard_dedup.yaml
+examples/gold_atac/metadata/gold_samtools_dedup.yaml
+examples/test_project/test_bwa.yaml
+examples/test_project/test_fseq.yaml
+examples/test_project/test_genrich.yaml
+examples/test_project/test_hmmratac.yaml
+examples/test_project/test_homer.yaml
+examples/test_project/test_macs.yaml
diff --git a/PEP_schema.yaml b/PEP_schema.yaml
@@ -0,0 +1,73 @@
+# Local copy of the minimal PEP schema: validates the project `name`,
+# the project `config` settings, and the `samples` list.
+description: "Schema for a minimal PEP"
+version: "2.0.0"
+properties:
+  name:
+    type: string
+    pattern: "^\\S*$"
+    description: "Project name with no whitespace"
+  config:
+    # Config attributes are declared under `properties`; placed directly
+    # under `config` they would be unrecognized keywords and never validated.
+    properties:
+      pep_version:
+        description: "Version of the PEP Schema this PEP follows"
+        type: string
+      sample_table:
+        type: string
+        description: "Path to the sample annotation table with one row per sample"
+      subsample_table:
+        type: string
+        description: "Path to the subsample annotation table with one row per subsample and sample_name attribute matching an entry in the sample table"
+      sample_modifiers:
+        type: object
+        properties:
+          append:
+            type: object
+          duplicate:
+            type: object
+          imply:
+            type: array
+            items:
+              type: object
+              properties:
+                if:
+                  type: object
+                then:
+                  type: object
+          derive:
+            type: object
+            properties:
+              attributes:
+                type: array
+                items:
+                  type: string
+              sources:
+                type: object
+      project_modifiers:
+        type: object
+        properties:
+          amend:
+            description: "Object overwriting original project attributes"
+            type: object
+          import:
+            description: "List of external PEP project config files to import"
+            type: array
+            items:
+              type: string
+    required:
+      - pep_version
+  samples:
+    type: array
+    items:
+      type: object
+      properties:
+        sample_name:
+          type: string
+          pattern: "^\\S*$"
+          description: "Unique name of the sample with no whitespace"
+      required:
+        - sample_name
+required:
+  - samples
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -2,6 +2,14 @@
 All notable changes to this project will be documented in this file.
 
 
+## [0.9.14] -- 2021-02-05
+
+### Changed
+ - Update F-Seq to F-Seq2
+ - Add the option to install using conda
+ - Add script for easing installation of seqOutBias
+ - Use https for PEP schema; list alternative local PEP schema approach
+
 ## [0.9.13] -- 2020-12-02
 
 ### Changed

diff --git a/docs/files/examples/gold/gold_reports/fastqc_report_r1.html b/docs/files/examples/gold/gold_reports/fastqc_report_r1.html
@@ -226,11 +226,14 @@
     <body>
     	<div id="top"></div>
     	<div class="container">
-
-				<p><b>No objects to display for: <code>FastQC report r1</code></b><p>
-
-			<!-- Expects a 2 lists of lists: figures and links -->
-
+
+                <h5>FastQC r1 reports</h5>
+
+                    <a href='../results_pipeline/gold1/fastqc/gold1_R1_trim_fastqc.html' class="list-group-item">gold1 FastQC report r1</a>
+                    <a href='../results_pipeline/gold2/fastqc/gold2_R1_trim_fastqc.html' class="list-group-item">gold2 FastQC report r1</a>
+                    <a href='../results_pipeline/gold3/fastqc/gold3_R1_trim_fastqc.html' class="list-group-item">gold3 FastQC report r1</a>
+                    <a href='../results_pipeline/gold4/fastqc/gold4_R1_trim_fastqc.html' class="list-group-item">gold4 FastQC report r1</a>
+                    <a href='../results_pipeline/gold5/fastqc/gold5_R1_trim_fastqc.html' class="list-group-item">gold5 FastQC report r1</a>
 
     	</div>
     </body>

diff --git a/docs/files/examples/gold/gold_reports/fastqc_report_r2.html b/docs/files/examples/gold/gold_reports/fastqc_report_r2.html
@@ -227,9 +227,13 @@
     	<div id="top"></div>
     	<div class="container">
 
-				<p><b>No objects to display for: <code>FastQC report r2</code></b><p>
-
-			<!-- Expects a 2 lists of lists: figures and links -->
+				<h5>FastQC r2 reports</h5>
+
+                    <a href='../results_pipeline/gold1/fastqc/gold1_R2_trim_fastqc.html' class="list-group-item">gold1 FastQC report r2</a>
+                    <a href='../results_pipeline/gold2/fastqc/gold2_R2_trim_fastqc.html' class="list-group-item">gold2 FastQC report r2</a>
+                    <a href='../results_pipeline/gold3/fastqc/gold3_R2_trim_fastqc.html' class="list-group-item">gold3 FastQC report r2</a>
+                    <a href='../results_pipeline/gold4/fastqc/gold4_R2_trim_fastqc.html' class="list-group-item">gold4 FastQC report r2</a>
+                    <a href='../results_pipeline/gold5/fastqc/gold5_R2_trim_fastqc.html' class="list-group-item">gold5 FastQC report r2</a>
 
 
     	</div>

diff --git a/docs/files/examples/gold/results_pipeline/gold1/fastqc/gold1_R1_trim_fastqc.html b/docs/files/examples/gold/results_pipeline/gold1/fastqc/gold1_R1_trim_fastqc.html
diff --git a/docs/files/examples/gold/results_pipeline/gold3/fastqc/gold3_R1_trim_fastqc.html b/docs/files/examples/gold/results_pipeline/gold3/fastqc/gold3_R1_trim_fastqc.html
diff --git a/docs/install.md b/docs/install.md
@@ -6,84 +6,89 @@
 git clone https://github.com/databio/pepatac.git
 ```
 
-## 2: Download `refgenie` assets
+## 2: Install required software
 
-PEPATAC uses [`refgenie`](http://refgenie.databio.org/) assets for alignment. If you haven't already, initialize a refgenie config file like this:
-
-```console
-pip install --user refgenie
-export REFGENIE=your_genome_folder/genome_config.yaml
-refgenie init -c $REFGENIE
-```
+You have two options for software prerequisites: 1) use containers, or 2) install all prerequisites natively. If you want to use containers, you need the [multi-container environment manager, `bulker`](https://bulker.databio.org/en/latest/), and either `docker` or `singularity` -- please see instructions in [how to run PEPATAC with containers](run-container.md). Otherwise, follow these instructions to install the requirements natively:
 
-Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it persists. 
+### Tools
 
-Next, pull the assets you need. Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). Download these required assets with this command:
+You will need some common bioinformatics tools installed: [bedtools (v2.25.0+)](http://bedtools.readthedocs.io/en/latest/), [bowtie2 (v2.2.9+)](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml), [preseq (v2.0+)](http://smithlabresearch.org/software/preseq/), [samblaster (v0.1.24+)](https://github.com/GregoryFaust/samblaster), [samtools (v1.7+)](http://www.htslib.org/), [skewer (v0.1.126+)](https://github.com/relipmoc/skewer), [UCSC tools](http://hgdownload.soe.ucsc.edu/admin/exe/) (wigToBigWig, bigWigCat, bedToBigBed), [pigz (v2.3.4+)](https://zlib.net/pigz/). 
 
-```console
-refgenie pull hg38/bowtie2_index refgene_anno feat_annotation
-```
+Optionally, `PEPATAC` can report on fastq quality ([FastQC](https://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqc)) and utilize swappable tools for adapter removal ([trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic)), deduplication ([picard](https://broadinstitute.github.io/picard/)), and signal track generation ([seqOutBias](https://github.com/guertinlab/seqOutBias), [bedGraphToBigWig](http://hgdownload.soe.ucsc.edu/admin/exe/), and [bigWigMerge](http://hgdownload.soe.ucsc.edu/admin/exe/)).
 
-PEPATAC also requires `bowtie2_index` for any pre-alignment genomes:
+The easiest and preferred way is to use `conda` to install all the tools in a single command. Be prepared, however, for this initial installation to take more than an hour to complete.
 
-```console
-refgenie pull rCRSd/bowtie2_index
-refgenie pull human_repeats/bowtie2_index
+From the `pepatac/` directory:
+```{bash}
+conda env create -f requirements-conda.yml
 ```
 
-## 3: Install required software
-
-You have two options for software prerequisites: 1) use a container, or 2) install all prerequisites natively. If you want to use containers, you need our [multi-container environment manager, `bulker`](https://bulker.databio.org/en/latest/), and either `docker` or `singularity` -- please see instructions in [how to run PEPATAC with containers](run-container.md). Otherwise, follow these instructions to install the requirements natively:
+Note: The subsequent steps all assume you have installed using `conda`.  Alternatively, you can follow instructions to install each individual program natively. If you need additional direction with this approach, see the [detailed installation instructions](detailed-install.md).
 
 ### Python packages
 
-`PEPATAC` uses several packages under the hood. From the `pepatac/` directory:
+`PEPATAC` uses several Python packages under the hood. Not all of these are available through `conda`, so we'll install them ourselves into the `pepatac` `conda` environment. From the `pepatac/` directory:
 
 ```{bash}
-pip install --user -r requirements.txt
+conda activate pepatac
+unset PYTHONPATH
+python -m pip install --ignore-installed --upgrade -r requirements.txt
 ```
 
 ### R packages
 
-`PEPATAC` uses `R` to generate quality control and read/peak annotation plots, so you'll need to have R functional if you want these outputs. We have packaged all the `R` code into a supporting package called [PEPATACr](https://github.com/databio/pepatac/tree/dev/PEPATACr). The `PEPATAC` package relies on a few additional packages which can be installed at the command line as follows:
+`PEPATAC` uses `R` to generate quality control and read/peak annotation plots. We have packaged the `pepatac`-specific `R` code into a supporting package called [PEPATACr](https://github.com/databio/pepatac/tree/dev/PEPATACr). The `PEPATACr` package relies on a few additional packages, which can be installed to the `conda` environment.
 
-```
-Rscript -e 'install.packages("devtools")'
-Rscript -e 'devtools::install_github("pepkit/pepr")'
-Rscript -e 'install.packages("BiocManager")'
-Rscript -e 'BiocManager::install("GenomicRanges")'
-Rscript -e 'devtools::install_github("databio/GenomicDistributions")'
-Rscript -e 'BiocManager::install(c("BSgenome", "GenomicFeatures", "ensembldb"))'
-Rscript -e 'install.packages("http://big.databio.org/GenomicDistributionsData/GenomicDistributionsData_0.0.1.tar.gz", repos=NULL)'
+To ensure these packages are installed to the `pepatac` `conda` environment, make sure to point your `R_LIBS` environment variable to the `conda` environment `R` library. For example:
+```{bash}
+conda activate pepatac
+unset R_LIBS
+export R_LIBS="$CONDA_PREFIX/lib/R/library"
 ```
 
-Then, install the `PEPATAC` package.  From the `pepatac/` directory:
-```
-Rscript -e 'devtools::install(file.path("PEPATACr/"), dependencies=TRUE, repos="https://cloud.r-project.org/")'
+From the `pepatac/` directory, open `R` and install the following packages:
+```{R}
+install.packages("optigrab")
+devtools::install_github("databio/GenomicDistributions")
+install.packages("http://big.databio.org/GenomicDistributionsData/GenomicDistributionsData_0.0.2.tar.gz", repos=NULL)
+devtools::install(file.path("PEPATACr/"), dependencies=TRUE, repos="https://cloud.r-project.org/")
 ```
 
-### Tools
+## 3: Download `refgenie` assets
 
-We will need some common bioinformatics tools installed: [bedtools (v2.25.0+)](http://bedtools.readthedocs.io/en/latest/), [bowtie2 (v2.2.9+)](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml), [preseq (v2.0+)](http://smithlabresearch.org/software/preseq/), [samblaster (v0.1.24+)](https://github.com/GregoryFaust/samblaster), [samtools (v1.7+)](http://www.htslib.org/), [skewer (v0.1.126+)](https://github.com/relipmoc/skewer), [UCSC tools](http://hgdownload.soe.ucsc.edu/admin/exe/) (wigToBigWig, bigWigCat, bedToBigBed), [pigz (v2.3.4+)](https://zlib.net/pigz/).  Optionally, `PEPATAC` can report on fastq quality ([FastQC](https://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqc)) and utilize swappable tools for adapter removal ([trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic)), deduplication ([picard](https://broadinstitute.github.io/picard/)), and signal track generation ([seqOutBias](https://github.com/guertinlab/seqOutBias), [bedGraphToBigWig](http://hgdownload.soe.ucsc.edu/admin/exe/), and [bigWigMerge](http://hgdownload.soe.ucsc.edu/admin/exe/)).
+PEPATAC uses [`refgenie`](http://refgenie.databio.org/) assets for alignment. If you haven't already, initialize a refgenie config file like this:
 
-You should follow instructions to install each individual program. If you need help installing these, see the [detailed installation instructions](detailed-install.md).
+```console
+export REFGENIE=/path/to/your_genome_folder/genome_config.yaml
+refgenie init -c $REFGENIE
+```
 
-## 4: Run an example project through `PEPATAC`
+Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it persists. 
 
-Start by running the example project (test_config.yaml) in the `examples/test_project/` folder. `PEPATAC` uses a project management tool called `looper` to run the pipeline across samples in a project. Let's use the `-d` argument to do a dry run, which will create job scripts for every sample in a project, but will not execute them:
+Next, pull the assets you need. Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). Download these required assets with this command:
 
+```console
+refgenie pull hg38/fasta hg38/bowtie2_index hg38/refgene_anno hg38/ensembl_gtf hg38/ensembl_rb
+refgenie build hg38/feat_annotation
 ```
-cd pepatac
-looper run -d examples/test_project/test_config.yaml
+
+PEPATAC also requires a `bowtie2_index` asset for any pre-alignment genomes:
+
+```console
+refgenie pull rCRSd/bowtie2_index
+refgenie pull human_repeats/bowtie2_index
 ```
 
-If the looper executable is not in your $PATH, add the following line to your .bashrc or .profile:
+## 4: Run an example project through `PEPATAC`
+
+Start by running the example project (`test_config.yaml`) in the `examples/test_project/` folder. `PEPATAC` uses a project management tool called `looper` to run the pipeline across samples in a project. Let's use the `-d` argument to first try a dry run, which will create job scripts for every sample in a project, but will not execute them:
 
+From the `pepatac/` folder:
 ```
-export PATH=$PATH:~/.local/bin
+looper run -d examples/test_project/test_config.yaml
 ```
 
-If that worked, let's actually run the example by taking out the -d flag:
+If that looked good, let's actually run the example by taking out the `-d` flag:
 ```
 looper run examples/test_project/test_config.yaml
 ```
@@ -95,11 +100,11 @@ bulker activate databio/pepatac
 looper run examples/test_project/test_config.yaml
 ```
 
-There are lots of other cool things you can do with looper, like dry runs, summarize results, check on pipeline run status, clean intermediate files to save disk space, lump multiple samples into one job, and more. For details, consult the [looper docs](http://looper.databio.org/).
+There are lots of other cool things you can do with looper, like dry runs, report results, check on pipeline run status, clean intermediate files to save disk space, lump multiple samples into one job, and more. For details, consult the [looper docs](http://looper.databio.org/).
 
 ## 5: Configure your project files
 
-To run your own samples, you'll need to organize them in **PEP format**, which is explained in [how to create a PEP](https://pepkit.github.io/docs/home/) and is universal to all pipelines that read PEPs, including `PEPATAC`. To get you started, there are multiple examples you can adapt in the `examples/` folder (*e.g.* [example test PEP](https://github.com/databio/pepatac/tree/master/examples/test_project)). In short, you need two files for your project:
+To run your own samples, you'll need to organize them in **PEP format**, which is explained in [how to create a PEP](http://pep.databio.org/en/latest/simple_example/) and is universal to all pipelines that read PEPs, including `PEPATAC`. To get you started, there are multiple examples you can adapt in the `examples/` folder (*e.g.* [example test PEP](https://github.com/databio/pepatac/tree/master/examples/test_project)). In short, you need two files for your project:
 
   1. project config file -- describes output locations, pointers to data, etc.
   2. sample annotation file -- comma-separated value (CSV) list of your samples.

diff --git a/docs/usage.md b/docs/usage.md
@@ -22,7 +22,7 @@ usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V]
                   [--motif] [--sob] [--no-scale] [--prioritize] [--keep]
                   [--noFIFO] [--lite] [--skipqc] [-V]
 
-PEPATAC version 0.9.13
+PEPATAC version 0.9.14
 
 optional arguments:
   -h, --help            show this help message and exit

diff --git a/examples/test_project/test_config.yaml b/examples/test_project/test_config.yaml
@@ -5,17 +5,17 @@ pep_version: 2.0.0
 sample_table: test_annotation.csv  # sheet listing all samples in the project
 
 looper:  # relative paths are relative to this config file
-  output_dir: "$PROCESSED/pepatac_test"  # ABSOLUTE PATH to the parent, shared space where project results go
-  pipeline_interfaces: ["$CODE/pepatac/project_pipeline_interface.yaml"]  # ABSOLUTE PATH to the directory where looper will find the pipeline repository
+  output_dir: pepatac_test 
+  pipeline_interfaces: ../../project_pipeline_interface.yaml  # PATH to the project pipeline interface file
 
 sample_modifiers:
   append:
-    pipeline_interfaces: ["$CODE/pepatac/sample_pipeline_interface.yaml"]
+    pipeline_interfaces: ../../sample_pipeline_interface.yaml
   derive:
     attributes: [read1, read2]
     sources:
-      test_data_R1: "$CODE/pepatac/examples/data/{sample_name}_r1.fastq.gz"
-      test_data_R2: "$CODE/pepatac/examples/data/{sample_name}_r2.fastq.gz"
+      test_data_R1: "examples/data/{sample_name}_r1.fastq.gz"
+      test_data_R2: "examples/data/{sample_name}_r2.fastq.gz"
   imply:
     - if: 
         organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"]
@@ -29,4 +29,4 @@ sample_modifiers:
         peak_type: fixed         # Default. [options: variable]
         extend: "250"            # Default. For fixed-width peaks, extend this distance up- and down-stream.
         frip_ref_peaks: None     # Default. Use an external reference set of peaks instead of the peaks called from this run
-        blacklist: $GENOMES/hg38/blacklist/default/hg38_blacklist.bed.gz
+
diff --git a/install_sob.sh b/install_sob.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+case "$OSTYPE" in
+  solaris*) echo "SOLARIS; See https://github.com/guertinlab/seqOutBias for installation." ;;
+  darwin*)  wget -O seqOutBias_v1.2.0.tgz https://github.com/guertinlab/seqOutBias/releases/download/v1.2.0/seqOutBias_v1.2.0.bin.osx.x86_64.tgz ;; 
+  linux*)   wget -O seqOutBias_v1.2.0.tgz  https://github.com/guertinlab/seqOutBias/releases/download/v1.2.0/seqOutBias_v1.2.0.bin.linux.x86_64.tgz ;;
+  bsd*)     echo "BSD; See https://github.com/guertinlab/seqOutBias for installation." ;;
+  msys*)    echo "WINDOWS; See https://github.com/guertinlab/seqOutBias for installation." ;;
+  *)        echo "unknown: $OSTYPE; See https://github.com/guertinlab/seqOutBias for installation." ;;
+esac
+
+if [ -f "seqOutBias_v1.2.0.tgz" ]; then
+  ENVS=$(conda env list | awk '{print $1}' )
+  if [[ $ENVS = *"pepatac"* ]]; then
+    conda activate pepatac
+    mkdir seqOutBias_v1.2.0 && tar xvf seqOutBias_v1.2.0.tgz -C seqOutBias_v1.2.0 --strip-components 1
+    cd seqOutBias_v1.2.0/
+    ln -s `pwd`/seqOutBias $CONDA_PREFIX/bin/seqOutBias
+    cd ../
+  else 
+    echo "Error: Please run `conda env create -f requirements-conda.yml` first."
+    exit
+  fi;
+fi;
diff --git a/pepatac_input_schema.yaml b/pepatac_input_schema.yaml
@@ -1,6 +1,7 @@
 description: A PEP for ATAC-seq samples for the PEPATAC pipeline.
 imports: 
-  - http://schema.databio.org/pep/2.0.0.yaml
+  - https://schema.databio.org/pep/2.0.0.yaml  # Use a web-based schema source file
+  #- PEP_schema.yaml # Use a local (current directory) schema source file
 properties:
   samples:
     type: array