diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index dd84ea7..891c617 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -23,16 +23,5 @@ A clear and concise description of what you expected to happen. **Screenshots** If applicable, add screenshots to help explain your problem. -**Desktop (please complete the following information):** - - OS: [e.g. iOS] - - Browser [e.g. chrome, safari] - - Version [e.g. 22] - -**Smartphone (please complete the following information):** - - Device: [e.g. iPhone6] - - OS: [e.g. iOS8.1] - - Browser [e.g. stock browser, safari] - - Version [e.g. 22] - **Additional context** Add any other context about the problem here. diff --git a/.github/workflows/ATtRACT.yml b/.github/workflows/ATtRACT.yml new file mode 100644 index 0000000..412557a --- /dev/null +++ b/.github/workflows/ATtRACT.yml @@ -0,0 +1,56 @@ +name: ATtRACT + +on: push + +jobs: + + ATtRACT-db-test: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [macos-10.15, ubuntu-20.04] + + steps: + + - name: Checkout Repository + uses: actions/checkout@v2 + + - name: Setup miniconda & conda env + uses: conda-incubator/setup-miniconda@v2 + with: + python-version: 3.7.4 + miniconda-version: 4.7.12 + auto-update-conda: false + environment-file: envs/main.yml + activate-environment: bindz-rbp + auto-activate-base: false + + - name: Display all miniconda & env info + shell: bash -l {0} + run: | + conda info -a + conda list + + - name: Install md5sum with brew + shell: bash -l {0} + if: matrix.os == 'macos-10.15' + run: brew install md5sha1sum + + - name: Extract db backup + run: | + mkdir ATtRACT_backup_26082020 + unzip resources/ATtRACT_backup_26082020.zip -d ATtRACT_backup_26082020 + + - name: Extract hsa motifs + shell: bash -l {0} + run: | + mkdir tests/unit/format-ATtRACT-motifs/ATtRACT_hsa + python scripts/format-ATtRACT-motifs.py --pwms tests/unit/format-ATtRACT-motifs/ATtRACT/pwm.txt --names 
tests/unit/format-ATtRACT-motifs/ATtRACT/ATtRACT_db.txt --organism Homo_sapiens --outdir tests/unit/format-ATtRACT-motifs/ATtRACT_hsa + md5sum --check tests/unit/format-ATtRACT-motifs/expected_output_hsa.md5 + + - name: Extract mmu motifs + shell: bash -l {0} + run: | + mkdir tests/unit/format-ATtRACT-motifs/ATtRACT_mmu + python scripts/format-ATtRACT-motifs.py --pwms tests/unit/format-ATtRACT-motifs/ATtRACT/pwm.txt --names tests/unit/format-ATtRACT-motifs/ATtRACT/ATtRACT_db.txt --organism Mus_musculus --outdir tests/unit/format-ATtRACT-motifs/ATtRACT_mmu + md5sum --check tests/unit/format-ATtRACT-motifs/expected_output_mmu.md5 diff --git a/.github/workflows/test-conda.yml b/.github/workflows/test-conda.yml new file mode 100644 index 0000000..4fd4f2a --- /dev/null +++ b/.github/workflows/test-conda.yml @@ -0,0 +1,194 @@ +name: test-conda + +on: push + +jobs: + + dev-env: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [macos-10.15, ubuntu-20.04] + + steps: + + - name: Checkout Repository + uses: actions/checkout@v2 + + - name: Setup miniconda & conda env + uses: conda-incubator/setup-miniconda@v2 + with: + python-version: 3.7.4 + miniconda-version: 4.7.12 + auto-update-conda: false + environment-file: envs/dev.yml + activate-environment: bindz-rbp-dev + auto-activate-base: false + + - name: Display all miniconda & env info + shell: bash -l {0} + run: | + conda info -a + conda list + + plot-seq-logos: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [macos-10.15, ubuntu-20.04] + + steps: + + - name: Checkout Repository + uses: actions/checkout@v2 + + - name: Setup miniconda & conda env + uses: conda-incubator/setup-miniconda@v2 + with: + python-version: 3.7.4 + miniconda-version: 4.7.12 + auto-update-conda: false + environment-file: workflow/envs/plot_sequence_logos.yml + activate-environment: bindz-plot-sequence-logos + auto-activate-base: false + + - name: Display all miniconda & env info + shell: bash -l {0} + run: | + conda info -a + conda list + 
+ - name: Install md5sum with brew + shell: bash -l {0} + if: matrix.os == 'macos-10.15' + run: brew install md5sha1sum + + - name: Run script tests + shell: bash -l {0} + run: | + python workflow/scripts/sequence_logos.py --input_file tests/unit/plot_sequence_logos/motif_HNRNPF_820 --output_location tests/unit/plot_sequence_logos + python workflow/scripts/sequence_logos.py --input_file tests/unit/plot_sequence_logos/motif_HNRNPF_821 --output_location tests/unit/plot_sequence_logos + python workflow/scripts/sequence_logos.py --input_file tests/unit/plot_sequence_logos/motif_HNRNPF_822 --output_location tests/unit/plot_sequence_logos + python workflow/scripts/sequence_logos.py --input_file tests/unit/plot_sequence_logos/motif_HNRNPF_823 --output_location tests/unit/plot_sequence_logos + python workflow/scripts/sequence_logos.py --input_file tests/unit/plot_sequence_logos/motif_HNRNPF_824 --output_location tests/unit/plot_sequence_logos + + - name: MD5SUM check + shell: bash -l {0} + run: md5sum --check tests/unit/plot_sequence_logos/expected_output.md5 + + combine-MotEvo-results: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [macos-10.15, ubuntu-20.04] + + steps: + + - name: Checkout Repository + uses: actions/checkout@v2 + + - name: Setup miniconda & conda env + uses: conda-incubator/setup-miniconda@v2 + with: + python-version: 3.7.4 + miniconda-version: 4.7.12 + auto-update-conda: false + environment-file: workflow/envs/combine-motevo-results.yml + activate-environment: bindz-combine-motevo-results + auto-activate-base: false + + - name: Display all miniconda & env info + shell: bash -l {0} + run: | + conda info -a + conda list + + - name: Install md5sum with brew + shell: bash -l {0} + if: matrix.os == 'macos-10.15' + run: brew install md5sha1sum + + - name: Run script tests + shell: bash -l {0} + run: python workflow/scripts/combine-motevo-results.py --input_directories tests/unit/combine_results/motif_HNRNPF_820 
tests/unit/combine_results/motif_HNRNPF_821 tests/unit/combine_results/motif_HNRNPF_822 tests/unit/combine_results/motif_HNRNPF_823 tests/unit/combine_results/motif_HNRNPF_824 --filename posterior_sites --outfile tests/unit/combine_results/combined_MotEvo_results.tsv + + - name: MD5SUM check + shell: bash -l {0} + run: md5sum --check tests/unit/combine_results/expected_output.md5 + + plot-heatmap: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [macos-10.15, ubuntu-20.04] + + steps: + + - name: Checkout Repository + uses: actions/checkout@v2 + + - name: Setup miniconda & conda env + uses: conda-incubator/setup-miniconda@v2 + with: + miniconda-version: 4.7.12 + auto-update-conda: false + environment-file: workflow/envs/plot_heatmap_of_MotEvo_results.yml + activate-environment: bindz-plot-heatmap-of-MotEvo-results + auto-activate-base: false + + - name: Display all miniconda & env info + shell: bash -l {0} + run: | + conda info -a + conda list + + - name: Run script tests + shell: bash -l {0} + run: Rscript workflow/scripts/heatmap.r --input_tsv tests/unit/Plot-heatmap-for-motifs/combined_MotEvo_results.tsv --input_sequence ATGTGAGTGAAGTGTGGGAAAGATGACTCGATATATCTGGATGCTAGGGATCGGATGGCGATACG --outfile tests/unit/Plot-heatmap-for-motifs/ProbabilityvsSequences.pdf --sequence_logos_directory tests/unit/Plot-heatmap-for-motifs/sequence_logos + + pipeline-exec: + needs: [plot-seq-logos, combine-MotEvo-results, plot-heatmap] + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [macos-10.15, ubuntu-20.04] + + steps: + + - name: Checkout Repository + uses: actions/checkout@v2 + + - name: Setup miniconda & conda env + uses: conda-incubator/setup-miniconda@v2 + with: + python-version: 3.7.4 + miniconda-version: 4.7.12 + auto-update-conda: false + environment-file: envs/main.yml + activate-environment: bindz-rbp + auto-activate-base: false + + - name: Display all miniconda & env info + shell: bash -l {0} + run: | + conda info -a + conda list + + - name: Install md5sum 
with brew + shell: bash -l {0} + if: matrix.os == 'macos-10.15' + run: brew install md5sha1sum + + - name: Snakemake Rulegraph + shell: bash -l {0} + run: bash tests/integration/execution/snakemake_rulegraph_run.sh + + - name: Snakemake DAG + shell: bash -l {0} + run: bash tests/integration/execution/snakemake_dag_run.sh + + - name: Snakemake local run w/ conda envs + shell: bash -l {0} + run: | + bash tests/integration/execution/snakemake_local_run_conda_environments.sh + md5sum --check tests/integration/expected_output.md5 diff --git a/.github/workflows/test-singularity.yml b/.github/workflows/test-singularity.yml new file mode 100644 index 0000000..6d54e9b --- /dev/null +++ b/.github/workflows/test-singularity.yml @@ -0,0 +1,38 @@ +name: test-singularity + +on: push + +jobs: + + pipeline-exec: + runs-on: ubuntu-20.04 + + steps: + + - name: Checkout Repository + uses: actions/checkout@v2 + + - name: Setup miniconda & conda env + uses: conda-incubator/setup-miniconda@v2 + with: + python-version: 3.7.4 + miniconda-version: 4.7.12 + auto-update-conda: false + environment-file: envs/main.yml + activate-environment: bindz-rbp + auto-activate-base: false + + - name: Install Singularity + shell: bash -l {0} + run: conda install -c conda-forge singularity=3.5.2 + + - name: Display all miniconda & env info + shell: bash -l {0} + run: | + conda info -a + conda list + + - name: Snakemake local run w/ singularity containers + shell: bash -l {0} + run: | + bash tests/integration/execution/snakemake_local_run_singularity_containers.sh diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 5c14496..0000000 --- a/.travis.yml +++ /dev/null @@ -1,124 +0,0 @@ -language: bash - -os: - - linux - - osx - -install: - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt update; fi - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; fi - - if [ "$TRAVIS_OS_NAME" = "osx" ]; then brew update; 
fi - - if [ "$TRAVIS_OS_NAME" = "osx" ]; then wget https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh; fi - - bash miniconda.sh -b -p $HOME/miniconda - - source "$HOME/miniconda/etc/profile.d/conda.sh" - - hash -r - - conda config --set always_yes yes --set changeps1 no - - conda update -q conda - # Useful for debugging any issues with conda - - conda info -a - # Install singularity - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y wget; fi - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y build-essential; fi - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y uuid-dev ; fi - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y squashfs-tools; fi - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y libseccomp-dev; fi - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y pkg-config; fi - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get install -y cryptsetup-bin; fi - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then eval "$(gimme 1.13.1)"; fi - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then go version; fi - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then export VERSION=3.5.2; fi - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then wget https://github.com/sylabs/singularity/releases/download/v${VERSION}/singularity-${VERSION}.tar.gz; fi - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then tar -xzf singularity-${VERSION}.tar.gz; fi - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then cd singularity; fi - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then ./mconfig; fi - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then make -C ./builddir; fi - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo make -C ./builddir install; fi - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then singularity --version; fi - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then cd ..; fi - -before_script: - # Create conda virtual environments - - conda env create -f envs/main.yml - - conda env create -f envs/dev.yml - - 
conda env create -f workflow/envs/plot_heatmap_of_MotEvo_results.yml - - conda env create -f workflow/envs/combine-motevo-results.yml - - conda env create -f workflow/envs/plot_sequence_logos.yml - # Activate the main env - - conda activate binding-scanner && echo $CONDA_DEFAULT_ENV - -script: - # Download and extract the ATtRACT db - #- bash scripts/download-ATtRACT-motifs.sh -o ATtRACT # this curl/wget do not work on Travis machine... - # Extract the backup db - - mkdir ATtRACT_backup_26082020 - - unzip resources/ATtRACT_backup_26082020.zip -d ATtRACT_backup_26082020 - # Extract motifs for Homo sapiens - - mkdir tests/unit/format-ATtRACT-motifs/ATtRACT_hsa - - > - python scripts/format-ATtRACT-motifs.py - --pwms tests/unit/format-ATtRACT-motifs/ATtRACT/pwm.txt - --names tests/unit/format-ATtRACT-motifs/ATtRACT/ATtRACT_db.txt - --organism Homo_sapiens - --outdir tests/unit/format-ATtRACT-motifs/ATtRACT_hsa - - md5sum --check tests/unit/format-ATtRACT-motifs/expected_output_hsa.md5 - # Extract motifs for Mus musculus - - mkdir tests/unit/format-ATtRACT-motifs/ATtRACT_mmu - - > - python scripts/format-ATtRACT-motifs.py - --pwms tests/unit/format-ATtRACT-motifs/ATtRACT/pwm.txt - --names tests/unit/format-ATtRACT-motifs/ATtRACT/ATtRACT_db.txt - --organism Mus_musculus - --outdir tests/unit/format-ATtRACT-motifs/ATtRACT_mmu - - md5sum --check tests/unit/format-ATtRACT-motifs/expected_output_mmu.md5 - # Test sequence_logos.py script - - conda activate plot_sequence_logos - - > - python workflow/scripts/sequence_logos.py - --input_file tests/unit/plot_sequence_logos/motif_HNRNPF_820 - --output_location tests/unit/plot_sequence_logos - - > - python workflow/scripts/sequence_logos.py - --input_file tests/unit/plot_sequence_logos/motif_HNRNPF_821 - --output_location tests/unit/plot_sequence_logos - - > - python workflow/scripts/sequence_logos.py - --input_file tests/unit/plot_sequence_logos/motif_HNRNPF_822 - --output_location tests/unit/plot_sequence_logos - - > - python 
workflow/scripts/sequence_logos.py - --input_file tests/unit/plot_sequence_logos/motif_HNRNPF_823 - --output_location tests/unit/plot_sequence_logos - - > - python workflow/scripts/sequence_logos.py - --input_file tests/unit/plot_sequence_logos/motif_HNRNPF_824 - --output_location tests/unit/plot_sequence_logos - - md5sum --check tests/unit/plot_sequence_logos/expected_output.md5 - # Test combine-motevo-results.py script - - conda activate combine-motevo-results - - > - python workflow/scripts/combine-motevo-results.py - --input_directories tests/unit/combine_results/motif_HNRNPF_820 tests/unit/combine_results/motif_HNRNPF_821 tests/unit/combine_results/motif_HNRNPF_822 tests/unit/combine_results/motif_HNRNPF_823 tests/unit/combine_results/motif_HNRNPF_824 - --filename posterior_sites - --outfile tests/unit/combine_results/combined_MotEvo_results.tsv - - md5sum --check tests/unit/combine_results/expected_output.md5 - # Test heatmap.r script - - conda activate plot_heatmap_of_MotEvo_results - - > - Rscript workflow/scripts/heatmap.r - --input_tsv tests/unit/Plot-heatmap-for-motifs/combined_MotEvo_results.tsv - --input_sequence ATGTGAGTGAAGTGTGGGAAAGATGACTCGATATATCTGGATGCTAGGGATCGGATGGCGATACG - --outfile tests/unit/Plot-heatmap-for-motifs/ProbabilityvsSequences.pdf - --sequence_logos_directory tests/unit/Plot-heatmap-for-motifs/sequence_logos - #- md5sum --check tests/unit/Plot-heatmap-for-motifs/expected_output.md5 - - conda activate binding-scanner - # Test snakemake Rulegraph and DAG - - bash tests/integration/execution/snakemake_rulegraph_run.sh - - bash tests/integration/execution/snakemake_dag_run.sh - # Test pipeline execution: local, conda envs: - - bash tests/integration/execution/snakemake_local_run_conda_environments.sh - - md5sum --check tests/integration/expected_output.md5 - - rm -rf tests/integration/output/ - # On Linux: Test pipeline execution: local, singularity containers: - # - if [ "$TRAVIS_OS_NAME" = "linux" ]; then bash 
tests/integration/execution/snakemake_local_run_singularity_environments.sh; fi - # - if [ "$TRAVIS_OS_NAME" = "linux" ]; then md5sum --check tests/integration/expected_output_singularity.md5; fi - # - if [ "$TRAVIS_OS_NAME" = "linux" ]; then rm -rf tests/integration/output/; fi diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 09e4de0..defc225 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -35,33 +35,19 @@ working on them. For bug reports, it is essential that they include reproducible examples. Please **do not** use the issue tracker to ask usage questions, installation -problems etc., unless they appear to be bugs. For these issues, please use -the [communication channels](#communication) outlined below. +problems etc., unless they appear to be bugs. -## Communication - -If you want to reach out to us (e.g., to discuss potential issues for you to -work on), check the [main documentation][res-docs] for contact information. - -## Code style & testing +## Code style To make it easier for everyone to maintain, read and contribute to the code, as well as to ensure that the code base is robust and of high quality, we -would kindly ask you to stick to the following guidelines for code style and -testing. +would kindly ask you to stick to the following guidelines for code style. - Please use a recent version of [Python 3][res-py] (3.7.4+) - Please try to conform to the used code, docstring and commenting style within a project to maintain consistency -- Please use type hints for all function/method signatures - (exception: tests) - Please use the following linters (use default settings unless otherwise - stated): - - [`shellcheck`][res-sh-shellcheck] - - [`flake8`][res-py-flake8] -- Please use the following test suites: - - [`pytest`][res-py-pytest] - - [`coverage`][res-py-coverage] + stated): [`shellcheck`][res-sh-shellcheck], [`flake8`][res-py-flake8] ## Commit messages @@ -114,16 +100,15 @@ happen smoothly: code changes 3. 
If applicable, update relevant sections of the [documentation][res-documentation] 4. Add or update tests; untested code will not be merged; refer to the - [guidelines](#code-style--testing) above for details + [guidelines](#code-style) above for details 5. Ensure that your coding style is in line with the - [guidelines](#code-style--testing) described above -6. Ensure that all tests and linter checks configured in the [Travis - CI][res-travis-docs] [continuous integration][res-ci-cd] (CI) pipeline pass without + [guidelines](#code-style) described above +6. Ensure that all the checks configured in the [continuous integration][res-ci-cd] (CI) pipeline pass without issues 7. If necessary, clean up excessive commits with `git rebase`; cherry-pick and merge commits as you see fit; use concise and descriptive commit messages 8. Push your clean, tested and documented feature branch to the remote; make - sure the Travis CI pipeline passes + sure the CI pipeline passes 9. Issue a pull request against the default branch; follow the instructions in the [template][res-pull-request]; importantly, describe your changes in detail, yet with concise language, and do not forget to indicate which @@ -133,7 +118,7 @@ happen smoothly: [res-git]: [res-github]: [res-git-flow]: -[res-issue-tracker]: +[res-issue-tracker]: [res-bug-report]: .github/ISSUE_TEMPLATE/bug_report.md [res-feature-request]: .github/ISSUE_TEMPLATE/feature_request.md [res-py]: @@ -144,8 +129,6 @@ happen smoothly: [res-conv-commits]: [res-conv-commits-lint]: [res-conv-commits-blog]: -[res-docs]: README.md [res-documentation]: workflow/documentation.md [res-ci-cd]: -[res-travis-docs]: [res-pull-request]: PULL_REQUEST_TEMPLATE.md diff --git a/README.md b/README.md index 231b7d1..83c31e2 100644 --- a/README.md +++ b/README.md @@ -1,32 +1,40 @@ -# BindingScanner +# _bindz-rbp_ -BindingScanner is a tool for predicting binding sites of RNA-binding proteins in a given input RNA sequence, implemented in a snakemake pipeline. 
+[![test-conda](https://github.com/zavolanlab/bindz-rbp/workflows/test-conda/badge.svg?branch=dev)](https://github.com/zavolanlab/bindz-rbp/actions?query=workflow%3Atest-conda) +[![test-singularity](https://github.com/zavolanlab/bindz-rbp/workflows/test-singularity/badge.svg?branch=dev)](https://github.com/zavolanlab/bindz-rbp/actions?query=workflow%3Atest-singularity) +[![ATtRACT](https://github.com/zavolanlab/bindz-rbp/workflows/ATtRACT/badge.svg?branch=dev)](https://github.com/zavolanlab/bindz-rbp/actions?query=workflow%3AATtRACT) +[![GitHub issues](https://img.shields.io/github/issues/zavolanlab/bindz-rbp)](https://github.com/zavolanlab/bindz-rbp/issues) +[![GitHub license](https://img.shields.io/github/license/zavolanlab/bindz-rbp)](https://github.com/zavolanlab/bindz-rbp/blob/dev/LICENSE) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4063595.svg)](https://doi.org/10.5281/zenodo.4063595) + +bindz-rbp is a computational workflow which aims to predict binding sites of RNA-binding proteins in a given input RNA sequence, implemented in a snakemake pipeline 🐍 ## Table of Contents -- [BindingScanner](#bindingscanner) - - [Table of Contents](#table-of-contents) - - [General information](#general-information) - - [Installation instructions](#installation-instructions) - - [Step 1: Download and install Miniconda3](#step-1-download-and-install-miniconda3) - - [Step 2: Clone the repository](#step-2-clone-the-repository) - - [Step 3: Build and activate virtual environment for BindingScanner](#step-3-build-and-activate-virtual-environment-for-bindingscanner) - - [Optional: Download and parse PWMs from ATtRACT database](#optional-download-and-parse-pwms-from-attract-database) - - [Workflow execution](#workflow-execution) - - [Contributing](#contributing) - - [Contact](#contact) +- [General information](#general-information) +- [Installation instructions](#installation-instructions) + - [Step 1: Download and install Miniconda3](#step-1-download-and-install-miniconda3) 
+ - [Step 2: Clone the repository](#step-2-clone-the-repository) + - [Step 3: Build and activate virtual environment for bindz-rbp](#step-3-build-and-activate-virtual-environment-for-bindz-rbp) +- [Optional: Download and parse PWMs from ATtRACT database](#optional-download-and-parse-pwms-from-attract-database) +- [Workflow execution](#workflow-execution) +- [Contributing](#contributing) +- [Contact](#contact) ## General information -BindingScanner is a tool for predicting binding sites of distinct regulators in an RNA sequence by calculating posterior probabilities with [MotEvo], given the sequence specificity of regulators, represented as position-specific weight matrices. It is intended to help in the analysis of individual reporter sequences, by predicting regulatory that may act on the sequence as well as how the binding may be affected by specific mutations introduced in the reporter sequences. The tools scans the input sequence with a set of position-specific weight matrices (PWMs) representing the binding specificity of individual RNA-binding proteins. The run time scales linearly with both the sequence length and with the number of PWMs, so please make sure to test it on your architecture before running it on batches of sequences. +bindz-rbp predicts binding sites of distinct regulators in an RNA sequence by calculating posterior probabilities with [MotEvo], given the sequence specificity of regulators, represented as position-specific weight matrices. It is intended to help in the analysis of individual reporter sequences, by predicting regulators that may act on the sequence as well as how the binding may be affected by specific mutations introduced in the reporter sequences. The tool scans the input sequence with a set of position-specific weight matrices (PWMs) representing the binding specificity of individual RNA-binding proteins. 
The run time scales linearly with both the sequence length and with the number of PWMs, so please make sure to test it on your architecture before running it on batches of sequences. The tool is implemented as a [Snakemake] workflow. > ![rule_graph][rule-graph] -The main output of the pipeline are: a tab-separated file (`combined_MotEvo_results.tsv`) and a PDF-formatted image (`ProbabilityVsSequence.pdf`). The former collects all predicted binding sites of all analyzed motifs into one table and reports: binding positions (relative to the input sequence start), binding posterior probability, bound subsequence as well as binding energy. The latter is a visualisation of these binding probabilities in a form of a heatmap. +The main outputs of the pipeline are: +* `combined_MotEvo_results.tsv`: a tab-separated file which collects information related to all predicted binding sites of all analyzed motifs into one table. +* `binding_sites.bed`: simplified list of binding sites in a BED format. +* `ProbabilityVsSequence.pdf`: a visualisation of binding positions and probabilities in the form of a heatmap. ## Installation instructions @@ -60,24 +68,24 @@ Cloning repositories requires [git] to be installed (available via `conda`): conda install git ``` -Clone this git repository into a desired location (here: binding_scanner_git in the current working directory ) with the following command: +Clone this git repository into a desired location (here: bindz-rbp in the current working directory) with the following command: ```bash -git clone https://github.com/zavolanlab/binding-scanner.git binding_scanner_git +git clone https://github.com/zavolanlab/bindz-rbp ``` -### Step 3: Build and activate virtual environment for BindingScanner +### Step 3: Build and activate virtual environment for bindz-rbp -To help the users in the installation process we have prepared a recipe for a *conda* virtual environment that contains all the software needed to run BindingScanner. 
This environment can be created by the following script: +To help the users in the installation process we have prepared a recipe for a *conda* virtual environment that contains all the software needed to run bindz-rbp. This environment can be created by the following script: ```bash -bash binding_scanner_git/scripts/create-conda-environment-main.sh +bash bindz-rbp/scripts/create-conda-environment-main.sh ``` The built *conda* environment may then be activated with: ```bash -conda activate binding-scanner +conda activate bindz-rbp ``` ## Optional: Download and parse PWMs from ATtRACT database @@ -89,7 +97,7 @@ However, if the user would like to download and parse a new version of matrices Please change directory to the pipeline's root directory: ```bash -cd binding_scanner_git +cd bindz-rbp ``` To utilize position-specific weight matrices from the ATtRACT database of known RBPs' binding motifs we provide two scripts: @@ -131,7 +139,7 @@ To utilize position-specific weight matrices from the ATtRACT database of known Please change directory to the pipeline's root directory: ```bash -cd binding_scanner_git +cd bindz-rbp ``` All the input, output and parameters for the pipeline execution should be specified in a snakemake configuration file in YAML format. Such a file can be created based on our prepared template located at `workflow/config/config-template.yml`. Assuming that the user created a `config.yml` and saved it in the repository's root directory (and that it is the current working directory) the workflow can be executed on the local machine with: @@ -155,7 +163,9 @@ bash tests/integration/execution/snakemake_local_run_conda_environments.sh ## Contributing This project lives off your contributions, be it in the form of bug reports, -feature requests, discussions, or fixes and other code changes. Please refer +feature requests, discussions, or fixes and other code changes. 
🙂 + +Please refer to the [contributing guidelines](CONTRIBUTING.md) if you are interested to contribute. Please mind the [code of conduct](CODE_OF_CONDUCT.md) for all interactions with the community. @@ -164,9 +174,9 @@ interactions with the community. For questions or suggestions regarding the code, please use the [issue tracker][res-issue-tracker]. For any other inquiries, please contact us -by email: +by email: 📨 -(c) 2020 [Zavolan lab, Biozentrum, University of Basel][res-zavolab] +(c) 2022 [Zavolan lab, Biozentrum, University of Basel][res-zavolab] [MotEvo]: https://academic.oup.com/bioinformatics/article/28/4/487/212418 [Snakemake]: https://snakemake.readthedocs.io/en/stable/ @@ -177,5 +187,5 @@ by email: [miniconda]: https://docs.conda.io/en/latest/miniconda.html [git]: https://git-scm.com/ [ATtRACT]: https://attract.cnic.es/index -[res-issue-tracker]: +[res-issue-tracker]: [res-zavolab]: diff --git a/envs/dev.yml b/envs/dev.yml index e3fed9d..c754463 100644 --- a/envs/dev.yml +++ b/envs/dev.yml @@ -12,12 +12,12 @@ ############################################################################### --- - name: binding-scanner-dev - + name: bindz-rbp-dev + channels: - bioconda - conda-forge - + dependencies: - bash=4.4.18 - beautysh=6.0.1 @@ -34,5 +34,5 @@ - shellcheck=0.7.0 - snakemake=5.24.2 - unzip=6.0 - + ... 
diff --git a/envs/main.yml b/envs/main.yml index e00d647..04c643a 100644 --- a/envs/main.yml +++ b/envs/main.yml @@ -12,12 +12,12 @@ ############################################################################### --- - name: binding-scanner - + name: bindz-rbp + channels: - bioconda - conda-forge - + dependencies: - bash=4.4.18 - curl=7.68.0 diff --git a/images/rulegraph.svg b/images/rulegraph.svg index 140fec3..4f98f71 100644 --- a/images/rulegraph.svg +++ b/images/rulegraph.svg @@ -4,95 +4,90 @@ - - + + snakemake_dag - + 0 - -all + +all 1 - -plot_heatmap_of_MotEvo_results + +plot_heatmap_of_MotEvo_results -1->0 - - +1->0 + + 2 - -combine_MotEvo_results + +prepare_output_bedfile - -2->1 - - + +2->0 + + 3 - -plot_sequence_logos + +combine_MotEvo_results -3->1 - - +3->1 + + + + +3->2 + + 4 - -run_MotEvo_analysis + +plot_sequence_logos - -4->2 - - + +4->1 + + 5 - -create_results_directory + +run_MotEvo_analysis -5->3 - - +5->3 + + 6 - -prepare_MotEvo_parameters + +prepare_MotEvo_parameters - -5->6 - - + +6->5 + + 7 - -prepare_sequence_for_MotEvo - - -5->7 - - - - -6->4 - - - - -7->4 - - + +prepare_sequence_for_MotEvo + + +7->5 + + diff --git a/scripts/create-conda-environment-dev.sh b/scripts/create-conda-environment-dev.sh index 6afdf59..7ac5a03 100644 --- a/scripts/create-conda-environment-dev.sh +++ b/scripts/create-conda-environment-dev.sh @@ -1,6 +1,6 @@ ############################################################################### # -# Create conda virtual environment for the Binding Scanner (dev) +# Create conda virtual environment for bindz-rbp (dev) # # AUTHOR: Maciej_Bak # AFFILIATION: University_of_Basel diff --git a/scripts/create-conda-environment-main.sh b/scripts/create-conda-environment-main.sh index ca413f9..bcf7481 100644 --- a/scripts/create-conda-environment-main.sh +++ b/scripts/create-conda-environment-main.sh @@ -1,6 +1,6 @@ ############################################################################### # -# Create conda virtual environment 
for the Binding Scanner +# Create conda virtual environment for bindz-rbp # # AUTHOR: Maciej_Bak # AFFILIATION: University_of_Basel diff --git a/tests/integration/SLURM-cluster-config.json b/tests/integration/SLURM-cluster-config.json new file mode 100755 index 0000000..10cf332 --- /dev/null +++ b/tests/integration/SLURM-cluster-config.json @@ -0,0 +1,59 @@ +{ + + "__default__": + { + "queue": "30min", + "time": "00:30:00", + "threads": "1", + "mem": "5G" + }, + + "plot_sequence_logos": + { + "queue": "30min", + "time": "00:05:00", + "threads": "1", + "mem": "1G" + }, + + "prepare_MotEvo_parameters": + { + "queue": "30min", + "time": "00:05:00", + "threads": "1", + "mem": "1G" + }, + + "prepare_sequence_for_MotEvo": + { + "queue": "30min", + "time": "00:05:00", + "threads": "1", + "mem": "1G" + }, + + "run_MotEvo_analysis": + { + "queue": "30min", + "time": "00:05:00", + "threads": "1", + "mem": "1G" + }, + + "combine_MotEvo_results": + { + "queue": "30min", + "time": "00:05:00", + "threads": "1", + "mem": "1G" + }, + + "plot_heatmap_of_MotEvo_results": + { + "queue": "30min", + "time": "00:05:00", + "threads": "1", + "mem": "1G" + } + +} diff --git a/tests/integration/config.yml b/tests/integration/config.yml index bf2a5e1..84ef0ce 100755 --- a/tests/integration/config.yml +++ b/tests/integration/config.yml @@ -16,6 +16,7 @@ pipeline_path: "" # input sequence from the user +seq_name: "TestSequence" sequence: "ATGTGAGTGAAGTGTGGGAAAGATGACTCGATATATCTGGATGCTAGGGATCGGATGGCGATACG" # path to the directory with TRANSFAC-formatted PWM files diff --git a/tests/integration/execution/snakemake_cluster_run_conda_environments.sh b/tests/integration/execution/snakemake_cluster_run_conda_environments.sh index 4177d14..2517abe 100755 --- a/tests/integration/execution/snakemake_cluster_run_conda_environments.sh +++ b/tests/integration/execution/snakemake_cluster_run_conda_environments.sh @@ -1,4 +1,4 @@ -# Run the pipeline on a SLURM-managed computational cluster with conda 
environments +# Run the pipeline on the sciCORE computational cluster with conda environments cleanup () { rc=$? @@ -20,7 +20,7 @@ cd "$repo_dir" snakemake \ --snakefile="workflow/Snakefile" \ --configfile="tests/integration/config.yml" \ - --cluster-config "workflow/config/SLURM-cluster-config.json" \ + --cluster-config="tests/integration/SLURM-cluster-config.json" \ --use-conda \ --cores=128 \ --local-cores 1 \ diff --git a/tests/integration/execution/snakemake_cluster_run_singularity_containers.sh b/tests/integration/execution/snakemake_cluster_run_singularity_containers.sh new file mode 100755 index 0000000..65c4335 --- /dev/null +++ b/tests/integration/execution/snakemake_cluster_run_singularity_containers.sh @@ -0,0 +1,37 @@ +# Run the pipeline on the sciCORE computational cluster with singularity containers + +cleanup () { + rc=$? + rm -rf .snakemake/ + rm -rf ../output/ + cd "$user_dir" + echo "Exit status: $rc" +} +trap cleanup SIGINT + +set -eo pipefail # ensures that script exits at first command that exits with non-zero status +set -u # ensures that script exits when unset variables are used +set -x # facilitates debugging by printing out executed commands + +user_dir=$PWD +repo_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)/../../.." 
+cd "$repo_dir" + +snakemake \ + --snakefile="workflow/Snakefile" \ + --configfile="tests/integration/config.yml" \ + --cluster-config="tests/integration/SLURM-cluster-config.json" \ + --use-singularity \ + --cores=128 \ + --local-cores 1 \ + --printshellcmds \ + --verbose \ + --latency-wait 60 \ + --cluster \ + "sbatch \ + --cpus-per-task={cluster.threads} \ + --mem={cluster.mem} \ + --qos={cluster.queue} \ + --time={cluster.time} \ + --output={params.LOG_cluster_log}-%j-%N.log \ + -p scicore" diff --git a/tests/integration/execution/snakemake_local_run_singularity_containers.sh b/tests/integration/execution/snakemake_local_run_singularity_containers.sh new file mode 100755 index 0000000..98888cd --- /dev/null +++ b/tests/integration/execution/snakemake_local_run_singularity_containers.sh @@ -0,0 +1,26 @@ +# Run the pipeline on a local machine with singularity containers + +cleanup () { + rc=$? + rm -rf .snakemake/ + rm -rf ../output/ + cd "$user_dir" + echo "Exit status: $rc" +} +trap cleanup SIGINT + +set -eo pipefail # ensures that script exits at first command that exits with non-zero status +set -u # ensures that script exits when unset variables are used +set -x # facilitates debugging by printing out executed commands + +user_dir=$PWD +repo_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)/../../.." 
+cd "$repo_dir" + +snakemake \ + --snakefile="workflow/Snakefile" \ + --configfile="tests/integration/config.yml" \ + --use-singularity \ + --cores=1 \ + --printshellcmds \ + --verbose diff --git a/tests/integration/expected_output.md5 b/tests/integration/expected_output.md5 index 09202be..d28fe6f 100644 --- a/tests/integration/expected_output.md5 +++ b/tests/integration/expected_output.md5 @@ -1,26 +1,26 @@ -d0b39f0197807ab8fdfb9f8294b2bc16 ./tests/integration/output/sequence.fasta +9e4e01e58a8df52814a71b5cca26cc40 ./tests/integration/output/sequence.fasta 281a3ab4be432577381e6b64a628c154 ./tests/integration/output/MotEvo_parameters.txt -b1eb281f0f5b41516b7847b4398220dc ./tests/integration/output/motevo/motif_HNRNPF_826/posterior_sites +01cda68d6f5846d025b4eab37555af0b ./tests/integration/output/motevo/motif_HNRNPF_826/posterior_sites 1eb4365fcfc54bea454dc296863a9cad ./tests/integration/output/motevo/motif_HNRNPF_826/posteriors d41d8cd98f00b204e9800998ecf8427e ./tests/integration/output/motevo/motif_HNRNPF_821/posterior_sites 2f0bbba5237c73859cd212fec4dc3418 ./tests/integration/output/motevo/motif_HNRNPF_821/posteriors -15e254bc6041e4474b5c284ee2fa7dc0 ./tests/integration/output/motevo/motif_HNRNPF_828/posterior_sites +a52138378a40b3a020f12b4bc26ef50c ./tests/integration/output/motevo/motif_HNRNPF_828/posterior_sites dcc45178f17b2eea31474fb3764be204 ./tests/integration/output/motevo/motif_HNRNPF_828/posteriors d41d8cd98f00b204e9800998ecf8427e ./tests/integration/output/motevo/motif_HNRNPF_829/posterior_sites 75b0797db55efbbca8e28b908cc3c693 ./tests/integration/output/motevo/motif_HNRNPF_829/posteriors -c4c2dd970e1ce4f3e42b51b9c65860d4 ./tests/integration/output/motevo/motif_HNRNPF_820/posterior_sites +b59a7202350a340ab7102063899fdfa0 ./tests/integration/output/motevo/motif_HNRNPF_820/posterior_sites 4c595e3cee3e00a0675b4d01d71106e7 ./tests/integration/output/motevo/motif_HNRNPF_820/posteriors -d7a296a6140387564084c95732c0f692 
./tests/integration/output/motevo/motif_HNRNPF_827/posterior_sites +0b554d8e8f6dad9516ba1f4530d1c8b0 ./tests/integration/output/motevo/motif_HNRNPF_827/posterior_sites ecf600dbebb54ba5ecaaef9be59577d9 ./tests/integration/output/motevo/motif_HNRNPF_827/posteriors d41d8cd98f00b204e9800998ecf8427e ./tests/integration/output/motevo/motif_HNRNPF_822/posterior_sites 9f424c9af4c2c445836983d73829f412 ./tests/integration/output/motevo/motif_HNRNPF_822/posteriors -2b5276348c807646dc7f3436d342e999 ./tests/integration/output/motevo/motif_HNRNPF_825/posterior_sites +7de77b24c9455099011a4bf2a6c7b1ff ./tests/integration/output/motevo/motif_HNRNPF_825/posterior_sites d920cfaff4fb96ecce6b7d7cd1abfe93 ./tests/integration/output/motevo/motif_HNRNPF_825/posteriors -fe56b8577bada06fea45bb8835e7d7ee ./tests/integration/output/motevo/motif_HNRNPF_824/posterior_sites +b0ee32a9b14b9a31240cbfedd6ed90f3 ./tests/integration/output/motevo/motif_HNRNPF_824/posterior_sites faa2f9e3e8f432575e281878d93a85a6 ./tests/integration/output/motevo/motif_HNRNPF_824/posteriors -c0a4a9155ec7958b00bd3140e97c6dc9 ./tests/integration/output/motevo/motif_HNRNPF_823/posterior_sites +21e581eb309d21e194e33dbe1c4c0c05 ./tests/integration/output/motevo/motif_HNRNPF_823/posterior_sites 20e7921795665bbe24e585e741705634 ./tests/integration/output/motevo/motif_HNRNPF_823/posteriors -d00f0ca9b1561551a9cfe17a09747251 ./tests/integration/output/combined_MotEvo_results.tsv +cce86c9e75596fcc0861ef8d420dde69 ./tests/integration/output/combined_MotEvo_results.tsv 60dd0b7f9065f07660c385bac79ca94e ./tests/integration/output/sequence_logos/motif_HNRNPF_820.png 25eb71df5f876d8f575dbc417319487c ./tests/integration/output/sequence_logos/motif_HNRNPF_821.png 25eb71df5f876d8f575dbc417319487c ./tests/integration/output/sequence_logos/motif_HNRNPF_822.png @@ -31,3 +31,4 @@ b9c855c85a7777f8fb12806aed9fd6b9 ./tests/integration/output/sequence_logos/motif d1b324f46221514588b00148bac38e5c 
./tests/integration/output/sequence_logos/motif_HNRNPF_827.png 0dc19411798fda72ffcd8000b38d9dc9 ./tests/integration/output/sequence_logos/motif_HNRNPF_828.png 2b68e5759d8f598d39f4ab90ccb3da67 ./tests/integration/output/sequence_logos/motif_HNRNPF_829.png +0a2feaa1fbc4916ae0bd10c23e91892f ./tests/integration/output/binding_sites.bed diff --git a/tests/integration/expected_output.txt b/tests/integration/expected_output.txt index 51ae313..cddbc54 100644 --- a/tests/integration/expected_output.txt +++ b/tests/integration/expected_output.txt @@ -31,4 +31,4 @@ ./tests/integration/output/sequence_logos/motif_HNRNPF_827.png ./tests/integration/output/sequence_logos/motif_HNRNPF_828.png ./tests/integration/output/sequence_logos/motif_HNRNPF_829.png - +./tests/integration/output/binding_sites.bed diff --git a/tests/unit/Plot-heatmap-for-motifs/combined_MotEvo_results.tsv b/tests/unit/Plot-heatmap-for-motifs/combined_MotEvo_results.tsv index 727a882..33dc069 100644 --- a/tests/unit/Plot-heatmap-for-motifs/combined_MotEvo_results.tsv +++ b/tests/unit/Plot-heatmap-for-motifs/combined_MotEvo_results.tsv @@ -1,4 +1,4 @@ -pwm_id binding_position binding_sequence binding_posterior binding_energy +pwm_id binding_position binding_sequence binding_posterior LogLik_ratio_fg_bg HNRNPF_823 46-50 AGGGA 0.674978 6.71171 HNRNPF_824 15-19 TGGGA 0.560144 6.71171 HNRNPF_824 46-50 AGGGA 0.030141 2.51049 diff --git a/tests/unit/combine_results/expected_output.md5 b/tests/unit/combine_results/expected_output.md5 index afbe504..230c298 100644 --- a/tests/unit/combine_results/expected_output.md5 +++ b/tests/unit/combine_results/expected_output.md5 @@ -1 +1 @@ -1529b2a9ad9485a578805fe4ca8b0c11 ./tests/unit/combine_results/combined_MotEvo_results.tsv +3cdf20d93235192bb612c64bb69b4ba6 ./tests/unit/combine_results/combined_MotEvo_results.tsv diff --git a/workflow/Snakefile b/workflow/Snakefile index 8d35298..30517eb 100755 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -19,7 +19,7 @@ import glob 
import traceback # local rules -localrules: all, create_results_directory +localrules: all def gather_motifs_names(pwm_dir): """ @@ -32,6 +32,11 @@ def gather_motifs_names(pwm_dir): l = [i.split("/")[-1] for i in glob.glob(regex)] return(l) +onstart: + shell("mkdir -p " + config["outdir"]) + shell("mkdir -p " + config["outdir"] + "/local_log") + shell("mkdir -p " + config["outdir"] + "/cluster_log") + rule all: """ Target rule with final output of the pipeline @@ -43,45 +48,15 @@ rule all: "ProbabilityVsSequence.pdf" ), outdir = config["outdir"] - ) - -rule create_results_directory: - """ - Create directories for the results - """ - output: - TEMP_ = temp( + ), + BED_binding_sites = expand( os.path.join( "{outdir}", - "outdir" - ) - ) - - params: - DIR_output_dir = "{outdir}", - LOG_local_log = os.path.join( - "{outdir}", - "local_log" - ), - LOG_cluster_log = os.path.join( - "{outdir}", - "cluster_log" + "binding_sites.bed" + ), + outdir = config["outdir"] ) - conda: - "envs/bash.yml" - - singularity: - "docker://bash:4.4.18" - - shell: - """ - mkdir -p {params.DIR_output_dir}; \ - mkdir -p {params.LOG_local_log}; \ - mkdir -p {params.LOG_cluster_log}; \ - touch {output.TEMP_} - """ - rule plot_sequence_logos: """ Plot sequence logo for the motifs @@ -98,10 +73,6 @@ rule plot_sequence_logos: "workflow", "scripts", "sequence_logos.py" - ), - TEMP_ = os.path.join( - "{outdir}", - "outdir" ) output: @@ -159,12 +130,6 @@ rule prepare_MotEvo_parameters: """ Prepare text file with parameters for MotEvo runs """ - input: - TEMP_ = os.path.join( - "{outdir}", - "outdir" - ) - output: TXT_MotEvo_parameters = os.path.join( "{outdir}", @@ -204,7 +169,8 @@ rule prepare_MotEvo_parameters: ) run: - with open(output.TXT_MotEvo_parameters, "w") as ofile, open(log.LOG_local_stderr, "w") as logfile: + with open(output.TXT_MotEvo_parameters, "w") as ofile, \ + open(log.LOG_local_stderr, "w") as logfile: try: # ================================= bgprior = 
params.FLOAT_MotEvo_bg_binding_prior @@ -248,12 +214,6 @@ rule prepare_sequence_for_MotEvo: """ Create a FASTA-formatted file with the input sequence """ - input: - TEMP_ = os.path.join( - "{outdir}", - "outdir" - ) - output: FASTA_MotEvo_input = os.path.join( "{outdir}", @@ -262,7 +222,7 @@ rule prepare_sequence_for_MotEvo: params: STR_sequence = config["sequence"], - STR_MotEvo_fasta_header_tag = ">>MOTEVO_input-sequence", + STR_MotEvo_fasta_header_tag = ">>MOTEVO_" + config["seq_name"], LOG_cluster_log = os.path.join( "{outdir}", "cluster_log", @@ -527,3 +487,75 @@ rule plot_heatmap_of_MotEvo_results: --sequence_logos_directory {params.DIR_sequence_logos} \ 1> {log.LOG_local_stdout} 2> {log.LOG_local_stderr} """ + +rule prepare_output_bedfile: + """ + Prepare outfile in bed format + """ + input: + TSV_combined_MotEvo_results = os.path.join( + "{outdir}", + "combined_MotEvo_results.tsv" + ) + + output: + BED_binding_sites = os.path.join( + "{outdir}", + "binding_sites.bed" + ) + + params: + STR_seq_name = config["seq_name"], + LOG_cluster_log = os.path.join( + "{outdir}", + "cluster_log", + "prepare_output_bedfile.log" + ) + + log: + LOG_local_stdout = os.path.join( + "{outdir}", + "local_log", + "prepare_output_bedfile.stdout.log" + ), + LOG_local_stderr = os.path.join( + "{outdir}", + "local_log", + "prepare_output_bedfile.stderr.log" + ) + + benchmark: + os.path.join( + "{outdir}", + "local_log", + "prepare_output_bedfile.benchmark.log" + ) + + run: + with open(output.BED_binding_sites, "w") as ofile, \ + open(log.LOG_local_stderr, "w") as logfile: + try: + # ================================= + with open(input.TSV_combined_MotEvo_results) as ifile: + lines = ifile.read().splitlines() + for line in lines[1:]: + start = line.split("\t")[1].split("-")[0] + end = line.split("\t")[1].split("-")[-1] + ofile.write( + params.STR_seq_name + + "\t" + + start + + "\t" + + end + + "\t" + + line.split("\t")[0] + + "\t" + + line.split("\t")[3] + + "\t+" + + os.linesep + ) + 
except Exception: + traceback.print_exc(file = logfile) + raise Exception( + "Workflow error at rule: prepare_output_bedfile" + ) diff --git a/workflow/config/config-template.yml b/workflow/config/config-template.yml index d59ff39..1eaf077 100755 --- a/workflow/config/config-template.yml +++ b/workflow/config/config-template.yml @@ -15,6 +15,9 @@ # absolute path to the pipeline root directory pipeline_path: +# name of the user's sequence +# Please do not use spaces +seq_name: # input sequence from the user sequence: diff --git a/workflow/documentation.md b/workflow/documentation.md index 47c2b3e..5462ccf 100644 --- a/workflow/documentation.md +++ b/workflow/documentation.md @@ -1,22 +1,21 @@ -# BINDING-SCANNER: workflow documentation +# _bindz-rbp_: workflow documentation This document describes the individual steps of the workflow. For instructions -on installation and usage please see [here](../README.md). +on installation and usage please go [here](../README.md). ## Table of Contents - [**Description of workflow steps**](#description-of-workflow-steps) - [**Rule graph**](#rule-graph) - - [**Preparatory**](#preparatory) - - [**Config file**](#config-file) + - [**Config file**](#config-file) - [**Snakemake Rules**](#snakemake-rules) - [**all**](#all) - - [**create_results_directory**](#create_results_directory) - [**plot_sequence_logos**](#plot_sequence_logos) - [**prepare_MotEvo_parameters**](#prepare_MotEvo_parameters) - [**prepare_sequence_for_MotEvo**](#prepare_sequence_for_MotEvo) - [**run_MotEvo_analysis**](#run_MotEvo_analysis) - [**combine_MotEvo_results**](#combine_MotEvo_results) - [**plot_heatmap_of_MotEvo_results**](#plot_heatmap_of_MotEvo_results) + - [**prepare_output_bedfile**](#prepare_output_bedfile) ## Description of workflow steps @@ -27,18 +26,14 @@ on installation and usage please see [here](../README.md). Visual representation of workflow. Automatically prepared with [Snakemake][docs-snakemake]. 
-### Preparatory +### Config file -#### Config file - -##### Requirements - -- a config file as in [`config.yml`](config/config-template.yml) -- a pwm directory containing files with binding probabilities matrices of various motifs. +This workflow requires a config file as in [`config.yml`](config/config-template.yml) Parameter name | Description | Data type(s) --- | --- | --- pipeline_path | Absolute path to the pipeline directory | `str` +seq_name | Input Sequence ID | `str` sequence | Input Sequence | `str` pwm_directory | Path to the directory with TRANSFAC-formatted PWM files | `str` outdir | Path to the output directory | `str` @@ -53,46 +48,31 @@ MotEvo_Markov_chain_order | MotEvo parameter: order of the Markov chain | `float Target rule with final output of the pipeline - **Input** - - A heatmap depicting all the motifs with their sequence logos and names as y-axis tick-labels; input sequence as x-axis; and each cell representing the probability of corresponding motif and the part of the sequence. - -#### `create_results_directory` - - Create directories for the results - -- **Parameters** - - Path to output directory - - Path to local log directory - - Path to cluster log directory - -- **Output** - - An output directory which will be used in all the successive rules + - A heatmap depicting all the motifs with their sequence logos and names as y-axis tick-labels; input sequence as x-axis; and each cell representing per-position binding probability. + - BED-formatted list of inferred binding sites. #### `plot_sequence_logos` - Plot sequence logo for the motifs. This rule will run as many number as times as the number of motifs or number of files in the pwm directory. + Plot sequence logos for the motifs. This rule will run as many times as the number of motifs in the PWM directory. 
- **Input** - - PWM file containg binding probability matrices - - Script that processes matrices to sequence logos png [`sequence_logos.py`](../workflow/scripts/sequence_logos.py) - - Output directory generated in the rule [**create_results_directory**](#create_results_directory) + - PWM file containing binding probability matrix + - Script that generates sequence logos [`sequence_logos.py`](../workflow/scripts/sequence_logos.py) - **Parameters** - Output directory for the logos - - File path of logs for each Pwm + - File path for cluster logs - **Output** - - A png file for each motif containing the sequence logo which will be used in the rule [**plot_heatmap_of_MotEvo_results**](#plot_heatmap_of_MotEvo_results) + - A png file containing the sequence logo which will be used in the rule [**plot_heatmap_of_MotEvo_results**](#plot_heatmap_of_MotEvo_results) #### `prepare_MotEvo_parameters` Prepare text file with parameters for MotEvo runs -- **Input** - - Output directory generated in the rule [**create_results_directory**](#create_results_directory) - - **Parameters** - - Path to output directory for the logos - - File path for the logs for each Pwm file + - MotEvo parameters: bg binding probability, min binding posterior, Markov chain order + - File path for cluster logs - **Output** - A text file containing MotEvo paramaters which will be used in the rule [**run_MotEvo_analysis**](#run_MotEvo_analysis) @@ -101,13 +81,10 @@ Target rule with final output of the pipeline Create a FASTA-formatted file with the input sequence -- **Input** - - Output directory generated in the rule [**create_results_directory**](#create_results_directory) - - **Parameters** - - Input sequence - - Header tag for the sequence - - Path for the log of this rule + - Input sequence (from the configuration file) + - Header tag for the sequence (constructed from the configuration file) + - File path for cluster logs - **Output** - A fasta file which will be used in the rule 
[**run_MotEvo_analysis**](#run_MotEvo_analysis) @@ -118,48 +95,62 @@ Target rule with final output of the pipeline - **Input** - MotEvo parameters file generated in rule [**prepare_MotEvo_parameters**](#prepare_MotEvo_parameters) - - Path of the pwm files containing the binding probabilities matrices + - Path of the PWM directory containing the binding probability matrices - Fasta file generated in the rule [**prepare_sequence_for_MotEvo**](#prepare_sequence_for_MotEvo) - **Parameters** - Absolute path of MotEvo parameters file generated in rule [**prepare_MotEvo_parameters**](#prepare_MotEvo_parameters) - - Absolute path of Fasta file generated in the rule [**prepare_sequence_for_MotEvo**](#prepare_sequence_for_MotEvo) - - Path for the log of this rule + - Absolute path of FASTA file generated in the rule [**prepare_sequence_for_MotEvo**](#prepare_sequence_for_MotEvo) + - File path for cluster logs - **Output** - - A directory with files containing posterior sites information which will be used in the rule [**combine_MotEvo_results**](#combine_MotEvo_results) + - A directory with files containing binding posterior information which will be used in the rule [**combine_MotEvo_results**](#combine_MotEvo_results) #### `combine_MotEvo_results` - Combine all motevo results into one tsv file + Combine all motevo results into one TSV file - **Input** - - A directory with files containing posterior sites information which is generated in [**run_MotEvo_analysis**](#run_MotEvo_analysis) - - Script that will do the job of combinining results in one tsv file [`combine-motevo-results.py`](../workflow/scripts/combine-motevo-results.py) + - A directory with files containing binding posterior information which is generated in [**run_MotEvo_analysis**](#run_MotEvo_analysis) + - Script that will combine results into one TSV file [`combine-motevo-results.py`](../workflow/scripts/combine-motevo-results.py) - **Parameters** - - Name of the file which would contain posterior sites 
information - - Path for the log of this rule + - Name of the MotEvo output file which contains posterior sites information + - File path for cluster logs - **Output** - - A tsv file which gathers information of every analysed PWM directory. + - A TSV file which gathers information from every analysed PWM. #### `plot_heatmap_of_MotEvo_results` Plot heatmap from the combined_MotEvo_results.tsv file - **Input** - - A TSV file containing information of every analysed PWM directory generated from rule [**combine_MotEvo_results**](#combine_MotEvo_results) + - A TSV file containing information from every analysed PWM, generated from rule [**combine_MotEvo_results**](#combine_MotEvo_results) - Script that will plot the heatmap with sequence logos as y axis ticks [`heatmap.r`](../workflow/scripts/heatmap.r) - - Sequence logo generated for each motif from the rule [**plot_sequence_logos**](#plot_sequence_logos) + - Sequence logos generated for each motif from the rule [**plot_sequence_logos**](#plot_sequence_logos) + +- **Parameters** + - Input sequence (from the configuration file) + - Directory with all the sequence logos + - File path for cluster logs + +- **Output** + - A heatmap depicting all the motifs with their sequence logos and names as y-axis tick-labels; input sequence as x-axis; and each cell representing per-position binding probability. 
+ +#### `prepare_output_bedfile` + + Prepare a list of all inferred binding sites in a BED format + +- **Input** + - A TSV file containing information from every analysed PWM, generated from rule [**combine_MotEvo_results**](#combine_MotEvo_results) - **Parameters** - - Input sequence - - Directory of all sequence logos - - Path for the log of this rule + - Input sequence ID (from the configuration file) + - File path for cluster logs - **Output** - - A heatmap depicting all the motifs with their sequence logos and names as y-axis tick-labels; input sequence as x-axis; and each cell representing the probability of corresponding motif and the part of the sequence. + - A list of all inferred binding sites in a BED format [rule-graph]: ../images/rulegraph.svg [docs-snakemake]: \ No newline at end of file diff --git a/workflow/envs/bash.yml b/workflow/envs/bash.yml index f2957f3..e12e756 100644 --- a/workflow/envs/bash.yml +++ b/workflow/envs/bash.yml @@ -12,7 +12,7 @@ ############################################################################### --- - name: bash-4-4-18 + name: bindz-bash-4-4-18 channels: - conda-forge diff --git a/workflow/envs/combine-motevo-results.yml b/workflow/envs/combine-motevo-results.yml index 6759e73..90e8bad 100644 --- a/workflow/envs/combine-motevo-results.yml +++ b/workflow/envs/combine-motevo-results.yml @@ -11,7 +11,7 @@ ############################################################################### --- - name: combine-motevo-results + name: bindz-combine-motevo-results channels: - conda-forge diff --git a/workflow/envs/motevo.yml b/workflow/envs/motevo.yml index 42757de..357eb81 100644 --- a/workflow/envs/motevo.yml +++ b/workflow/envs/motevo.yml @@ -12,7 +12,7 @@ ############################################################################### --- - name: motevo + name: bindz-motevo channels: - vng diff --git a/workflow/envs/plot_heatmap_of_MotEvo_results.yml b/workflow/envs/plot_heatmap_of_MotEvo_results.yml index 954cf59..cc23d44 
100644 --- a/workflow/envs/plot_heatmap_of_MotEvo_results.yml +++ b/workflow/envs/plot_heatmap_of_MotEvo_results.yml @@ -11,7 +11,7 @@ ############################################################################### --- - name: plot_heatmap_of_MotEvo_results + name: bindz-plot-heatmap-of-MotEvo-results channels: - conda-forge diff --git a/workflow/envs/plot_sequence_logos.yml b/workflow/envs/plot_sequence_logos.yml index 01f79bd..8cdc43d 100644 --- a/workflow/envs/plot_sequence_logos.yml +++ b/workflow/envs/plot_sequence_logos.yml @@ -11,7 +11,7 @@ ############################################################################### --- - name: plot_sequence_logos + name: bindz-plot-sequence-logos channels: - conda-forge diff --git a/workflow/scripts/combine-motevo-results.py b/workflow/scripts/combine-motevo-results.py index cc9817b..4a02c12 100644 --- a/workflow/scripts/combine-motevo-results.py +++ b/workflow/scripts/combine-motevo-results.py @@ -58,7 +58,7 @@ tabb["pwm_id"] = [] tabb["binding_region"] = [] tabb["binding_sequence"] = [] -tabb["binding_energy"] = [] +tabb["LogLik_ratio_fg_bg"] = [] tabb["fasta_record"] = [] ##### Taking input from commandline ##### @@ -98,7 +98,7 @@ ##### Appending values ##### tabb["binding_sequence"] = tabb["binding_sequence"] + [each_word[0]] - tabb["binding_energy"] = tabb["binding_energy"] + [each_word[1]] + tabb["LogLik_ratio_fg_bg"] = tabb["LogLik_ratio_fg_bg"] + [each_word[1]] tabb["fasta_record"] = tabb["fasta_record"] + [each_word[2]] i = i + 1 # increment the value of i after we are done processing each line @@ -108,7 +108,7 @@ "binding_position", "binding_sequence", "binding_posterior", - "binding_energy", + "LogLik_ratio_fg_bg", ] # these will be the headers in the final tsv file tempDict1 = ( diff --git a/workflow/scripts/heatmap.r b/workflow/scripts/heatmap.r index ec5f2e9..3daae12 100644 --- a/workflow/scripts/heatmap.r +++ b/workflow/scripts/heatmap.r @@ -1,6 +1,8 @@ 
############################################################################### # # R script to generate a heatmap +# * suggested max input sequence length: 150 +# * suggested max n.o. PWMs scanned with the pipeline: 200 # # AUTHOR: Krish Agarwal # AFFILIATION: University_of_Basel @@ -21,25 +23,25 @@ library(ggtext) # list the command-line arguments option_list <- list( - make_option(c("--input_tsv"), + make_option("--input_tsv", action = "store", dest = "input_tsv", type = "character", help = "location and name of the tsv file" ), - make_option(c("--input_sequence"), + make_option("--input_sequence", action = "store", dest = "input_sequence", type = "character", help = "full input_sequence" ), - make_option(c("--outfile"), + make_option("--outfile", action = "store", dest = "output_tsv", type = "character", help = "location and name of output heatmap" ), - make_option(c("--sequence_logos_directory"), + make_option("--sequence_logos_directory", action = "store", dest = "sequence_logos_directory", type = "character", @@ -69,7 +71,7 @@ x_axis_numbers = 1:nchar(input_sequence) file = file(input_tsv, "r") -y_axis_labels = c() +y_axis_labels = vector() while (length(line <- readLines(file, n = 1)) > 0) # Read the file line by line { @@ -117,7 +119,7 @@ dff <-data.frame(col = rep(colnames(uniform_data), each = nrow(uniform_data)), input_seq = strsplit(input_sequence,"") -labels = c() +labels = vector() #### Add the sequence logo pngs to the dictionary #### if(sequence_logos_directory != FALSE)