From b1824be89d5163870204a7a00685dd1ed8fe7d05 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Sun, 8 Oct 2023 16:53:58 +0200 Subject: [PATCH] Run separate tutorials - Run per-virus tutorial in parallel - Reusing the installation tutorial's output - Use zstd compression --- .github/workflows/tutorials.yaml | 64 +++++++++++++++++++++++- docs/tutorial_hiv.md | 83 +++++++++++++++++--------------- docs/tutorial_sarscov2.md | 67 ++++++++++++++------------ 3 files changed, 141 insertions(+), 73 deletions(-) diff --git a/.github/workflows/tutorials.yaml b/.github/workflows/tutorials.yaml index 378f0a9e..49cc18f7 100644 --- a/.github/workflows/tutorials.yaml +++ b/.github/workflows/tutorials.yaml @@ -49,13 +49,13 @@ jobs: - name: Create V-pipe installation archive
# HACK this acceleraters upload while preserving Unix-specifics (case sensitivity, file attributes). - run: tar -cvf vpipeinstallation.tar ./docs/vp-analysis + run: tar --zstd -cvf vpipeinstallation.tar.zst ./docs/vp-analysis - name: Keep installation uses: actions/upload-artifact@v3 with: name: VPipeInstallation - path: vpipeinstallation.tar + path: vpipeinstallation.tar.zst if-no-files-found: error - name: Save notebooks @@ -65,6 +65,66 @@ jobs: path: ./docs/*.ipynb + tutorial: + needs: installer + + strategy: + max-parallel: 5 + fail-fast: false + matrix: + virus: ["hiv", "sarscov2"] + + runs-on: ubuntu-latest + + defaults: + run: + shell: bash -l {0} + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + sparse-checkout: | + docs + tests + sparse-checkout-cone-mode: false + lfs: false + + - name: Reuse installation + uses: actions/download-artifact@v3 + with: + name: VPipeInstallation + + - name: Extract V-pipe installation archive + run: tar --zstd -xvf vpipeinstallation.tar.zst + + - name: Install conda environment dependencies + uses: conda-incubator/setup-miniconda@v2 + with: + miniforge-version: latest + miniforge-variant: Mambaforge + python-version: "3.11" + 
mamba-version: "*" + channels: conda-forge,bioconda + channel-priority: strict + activate-environment: JupyText + auto-update-conda: true + environment-file: tests/conda_tutorials_env.yaml + + - name: Setup Jupyter kernel + run: | + python -m ipykernel install --user + + - name: Run analysis notebooks + working-directory: ./docs + run: ./convert.sh --branch tutorial*_${{ matrix.virus }}.md + + - name: Save notebooks + uses: actions/upload-artifact@v3 + with: + name: JupyterNotebooks + path: ./docs/*.ipynb + # - name: Publish # uses: peaceiris/actions-gh-pages@v3 # with: diff --git a/docs/tutorial_hiv.md b/docs/tutorial_hiv.md index 3e8d9d65..afcdc47a 100644 --- a/docs/tutorial_hiv.md +++ b/docs/tutorial_hiv.md @@ -15,13 +15,31 @@ jupyter: --- -# V-Pipe Tutorial +# V-Pipe HIV Tutorial V-pipe is a workflow designed for the analysis of next generation sequencing (NGS) data from viral pathogens. It produces a number of results in a curated format (e.g., consensus sequences, SNV calls, local/global haplotypes). V-pipe is written using the Snakemake workflow management system. +The present tutorial will show you how to apply V-pipe on HIV sequencing data. + ## Requirements -V-pipe is optimized for Linux or Mac OS systems. Therefore, we recommend users with a Windows system to [install WSL2](https://learn.microsoft.com/en-us/windows/wsl/install) - this is not a full virtual machine but rather a way to run Windows and Linux cooperatively at the same time. 
+The tutorial assumes that you have [installed V-pipe using the installation tutorial](tutorial_0_install.md), and that the workflow is set up with the following structure: + +```text +📁 [HOME] +└───📁vp-analysis + ├───📁V-pipe # V-pipe checked out from Github + ├───📁Miniforge3 # bioconda + conda-forge + mamba + Snakemake + ├───📁work # work directories + ├───📁work-tests # … + └───📁 … # … +``` + +- `vp-analysis` is the main directory where we installed everything in the previous tutorial +- `Miniforge3` has dependencies to start using V-pipe (bioconda, conda-forge, mamba, snakemake) +- `V-pipe` is the directory with V-pipe's own code +- and for this tutorial we will create a directory like `work…`, which will hold the configuration and the sequencing data for our analysis. + ## Organizing Data @@ -78,52 +96,31 @@ The files will have the following structure: ## Install V-pipe -V-pipe uses the [Bioconda](https://bioconda.github.io/) bioinformatics software repository for all its pipeline components. The pipeline itself is implemented using [Snakemake](https://snakemake.readthedocs.io/en/stable/). - -For advanced users: If your are fluent with these tools, you can: - -* directly download and install [bioconda](https://bioconda.github.io/user/install.html) and [snakemake](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html#installation-via-conda), -* specifiy your V-pipe configuration, and start using V-pipe - -Use `--use-conda` to [automatically download and install](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#integrated-package-management) any further pipeline dependencies. Please refer to the documentation for additional instructions. - -In this present tutorial you will learn how to setup a workflow for the example dataset. 
- -To deploy V-pipe, you can use the installation script with the following parameters: +After [having installed V-pipe using the installation tutorial](tutorial_0_install.md), create a new working directory for this analysis: ```bash -curl -O 'https://raw.githubusercontent.com/cbg-ethz/V-pipe/master/utils/quick_install.sh' -bash quick_install.sh -p testing -w work -``` +cd vp_analysis -Note that - -* using `-p` specifies the subdirectory where to download and install snakemake and V-pipe -* using `-w` will create a working directory and populate it. It will colloquial the references and the default `config/config.yaml`, and create a handy `vpipe` short-cut script to invoke `snakemake`. - - -If you get `zsh: permission denied: ./quick_install.sh`, run `chmod +x quick_install.sh` this gives the necessary permissions. - -Tip: To create and populate other new working directories, you can call init_project.sh from within the new directory: - -```console -mkdir -p working_2 -cd working_2 +# create a new directory and initialise it +mkdir -p work_hiv +cd work_hiv ../V-pipe/init_project.sh + +cd ../.. ``` ## Preparation -Copy the samples directory you created in the step "Preparing a small dataset" to this working directory. (You can display the directory structure with `tree testing/work/resources/samples` or `find testing/work/resources/samples`.) +Copy the samples directory you created in the step "Preparing a small dataset" to this working directory. (You can display the directory structure with `tree vp_analysis/work_hiv/resources/samples` or `find vp_analysis/work_hiv/resources/samples`.) 
```bash -mkdir -p testing/work/resources -mv testing/V-pipe/docs/example_HIV_data/samples testing/work/resources/samples +mkdir -p vp_analysis/work_hiv/resources +mv vp_analysis/V-pipe/docs/example_HIV_data/samples vp_analysis/work_hiv/resources/samples ``` Note that: -- by default V-pipe expects its samples in a directory `samples` contained directly in the working directory - i.e. `testing/work/sample`` +- by default V-pipe expects its samples in a directory `samples` contained directly in the working directory - i.e. `vp_analysis/work_hiv/samples` - in this tutorial we put them inside the `resources` subdirectory, and will set the config file accordingly. @@ -132,11 +129,11 @@ If you have a reference sequences that you would like to use for read mapping an ### Preparing V-pipe's configuration -In the `work` directory you can find the file `config.yaml`. This is where the V-Pipe configuation should be specified. See [here](https://github.com/cbg-ethz/V-pipe/tree/master/config#readme) for the documentation of the configuration. +In the `work_hiv` directory you can find the file `config.yaml`. This is where the V-Pipe configuration should be specified. See [here](https://github.com/cbg-ethz/V-pipe/tree/master/config#readme) for the documentation of the configuration. In this tutorial we are building our own configuration therefore `virus_base_config` will remain empty. Since we are working with HIV-1, V-Pipe is providing meta information that will be used for visualisation (metainfo_file and gff_directory). 
```bash -cat < ./testing/work/config.yaml +cat < ./vp_analysis/work_hiv/config.yaml general: virus_base_config: "" aligner: bwa @@ -147,6 +144,7 @@ input: reference: "{VPIPE_BASEDIR}/../resources/hiv/HXB2.fasta" metainfo_file: "{VPIPE_BASEDIR}/../resources/hiv/metainfo.yaml" gff_directory: "{VPIPE_BASEDIR}/../resources/hiv/gffs/" + # NOTE: this input datadir isn't standard datadir: resources/samples/ read_length: 301 samples_file: samples.tsv @@ -165,7 +163,7 @@ output: EOT ``` -Note: A YAML files use spaces as indentation, you can use 2 or 4 spaces for indentation, but no tab. There are also online YAML file validators that you might want to use if your YAML file is wrongly formatted. +**Note**: YAML files use spaces as indentation, you can use 2 or 4 spaces for indentation, but **no tab**. There are also [online YAML file validators](https://www.yamllint.com/) that you might want to use if your YAML file is wrongly formatted. ## Running V-pipe @@ -173,8 +171,10 @@ Note: A YAML files use spaces as indentation, you can use 2 or 4 spaces for inde Before running check what will be executed: ```bash -cd testing/work/ +cd vp_analysis/work_hiv/ + ./vpipe --dryrun + cd ../.. ``` @@ -183,7 +183,7 @@ As this is your first run of V-pipe, it will also generate the sample collection Note that the samples you have downloaded have reads of length 301 only. V-pipe's default parameters are optimized for reads of length 250. To adapt to the read length, add a third column in the tab-separated file as follows: ```bash -cat < testing/work/samples.tsv +cat < vp_analysis/work_hiv/samples.tsv CAP217 4390 301 CAP188 4 301 CAP188 30 301 @@ -198,8 +198,11 @@ You can safely delete it and re-run with option `--dry-run` to regenerate it. 
Finally, we can run the V-pipe analysis (the necessary dependencies will be downloaded and installed in conda environments managed by snakemake): ```bash -cd testing/work/ +cd vp_analysis/work_hiv/ + ./vpipe -p --cores 2 + +cd - ``` diff --git a/docs/tutorial_sarscov2.md b/docs/tutorial_sarscov2.md index 5499f046..84a42913 100644 --- a/docs/tutorial_sarscov2.md +++ b/docs/tutorial_sarscov2.md @@ -21,6 +21,24 @@ This tutorial shows the basics of how to interact with V-pipe. For the purpose of this Tutorial, we will work with the master branch of V-pipe and use the _sars-cov-2_ virus base config which is adapted for the SARS-CoV-2 virus. +## Requirements + +The tutorial assumes that you have [installed V-pipe using the installation tutorial](tutorial_0_install.md), and that the workflow is set up with the following structure: + +```text +📁 [HOME] +└───📁vp-analysis + ├───📁V-pipe # V-pipe checked out from Github + ├───📁Miniforge3 # bioconda + conda-forge + mamba + Snakemake + ├───📁work # work directories + ├───📁work-tests # … + └───📁 … # … +``` + +- `vp-analysis` is the main directory where we installed everything in the previous tutorial +- `Miniforge3` has dependencies to start using V-pipe (bioconda, conda-forge, mamba, snakemake) +- `V-pipe` is the directory with V-pipe's own code +- and for this tutorial we will create a directory like `work…`, which will hold the configuration and the sequencing data for our analysis. ## Organizing Data @@ -89,34 +107,20 @@ tree samples ## Install V-pipe -V-pipe uses the [Bioconda](https://bioconda.github.io/) bioinformatics software repository for all its pipeline components. The pipeline itself is written using [Snakemake](https://snakemake.readthedocs.io/en/stable/). 
- -> **For advanced users:** If your are fluent with these tools, you can: -> -> * directly download and install [bioconda](https://bioconda.github.io/user/install.html) and [snakemake](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html#installation-via-conda), -> * make sure to configure V-pipe to use the `sars-cov-2` virus-config -> * and start using V-pipe with them, using the --use-conda to [automatically download and install](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#integrated-package-management) any further pipeline dependencies. -> * please refer to the [documentation](https://github.com/cbg-ethz/V-pipe/blob/master/README.md) for additional instructions. -> -> The present tutorial will show simplified commands that automate much of this process. + -To deploy V-pipe, you can use the installation script with the following parameters: +After [having installed V-pipe using the installation tutorial](tutorial_0_install.md), create a new working directory for this analysis: ```bash -curl -O 'https://raw.githubusercontent.com/cbg-ethz/V-pipe/master/utils/quick_install.sh' -bash quick_install.sh -p tutorial -w work -``` +cd vp_analysis -* using `-p` specifies the subdirectory where to download and install snakemake and V-pipe -* using `-w` will create a working directory and populate it. It will copy over the references and the default `config/config.yaml`, and create a handy `vpipe` short-cut script to invoke `snakemake`. +# create a new directory and initialise it +mkdir -p work_sarscov2 +cd work_sarscov2 +../V-pipe/init_project.sh -> **Tip:** To create and populate other new working directories, you can call init_project.sh from within the new directory: -> -> ```console -> mkdir -p working_2 -> cd working_2 -> ../V-pipe/init_project.sh -> ``` +cd ../.. 
+``` ## Running V-pipe @@ -124,13 +128,13 @@ bash quick_install.sh -p tutorial -w work Copy the samples directory you created in the step [Preparing a small](#preparing-a-small-dataset) dataset to this working directory. (You can display the directory structure with `tree samples` or `find samples`.) ```bash -mv samples tutorial/work/ +mv samples vp_analysis/work_sarscov2/ ``` Prepare V-pipe's configuration. You can find more information in [the documentation](https://github.com/cbg-ethz/V-pipe/blob/master/config/README.md). In your local V-pipe installation, you will also find an exhaustive manual about all the configuration options inside `config/config.html`. ```bash -cat < tutorial/work/config.yaml +cat < vp_analysis/work_sarscov2/config.yaml general: virus_base_config: 'sars-cov-2' @@ -139,6 +143,7 @@ input: output: trim_primers: false + # NOTE: set "snv" to "true" to run the tutorial. We left "false" so automated test doesn't take too much time on GitHub. snv: false local: false global: false @@ -153,7 +158,7 @@ EOT Check what will be executed: ```bash -cd tutorial/work/ +cd vp_analysis/work_sarscov2/ ./vpipe --dryrun cd ../.. ``` @@ -163,7 +168,7 @@ As it is your first run of V-pipe, this will also generate the sample collection Note that the demo files you downloaded have reads of length 150 only. V-pipe’s default parameters are optimized for reads of length 250; add the third column in the tab-separated file: ```bash -cat < tutorial/work/samples.tsv +cat < vp_analysis/work_sarscov2/samples.tsv SRR10903401 20200102 150 SRR10903402 20200102 150 EOT @@ -176,7 +181,7 @@ You can safely delete it and re-run the `--dryrun` to regenerate it. 
Run the V-pipe analysis (the necessary dependencies will be downloaded and installed in conda environments managed by snakemake): ```bash -cd tutorial/work/ +cd vp_analysis/work_sarscov2/ ./vpipe -p --cores 2 ``` @@ -219,7 +224,7 @@ The most user friendly way to submit jobs to the cluster is using a special _sna [smk-simple-slurm](https://github.com/jdblischak/smk-simple-slurm) is a profile that works well in our experience with SLURM (for other platforms see suggestions in [the snakemake-profil documentation](https://github.com/snakemake-profiles/doc)). ```bash -cd tutorial/ +cd vp_analysis/ # download the profile git clone https://github.com/jdblischak/smk-simple-slurm.git # edit simple/config.yaml and either comment out the partition and qos or adapt to your local HPC @@ -249,7 +254,7 @@ printshellcmds: True scheduler: greedy use-conda: True EOT -cd work/ +cd work_sarscov2/ ./vpipe --dry-run --profile ../smk-simple-slurm/simple/ --jobs 100 cd ../.. ``` @@ -265,7 +270,7 @@ In addition, Snakemake has [parameters for conda](https://snakemake.readthedocs. - using `--conda-prefix=`_{DIR}_ stores the conda environments of dependencies in a common directory (thus possible to share and re-use between multiple instances of V-pipe). ```bash -cd tutorial/work/ +cd vp_analysis/work_sarscov2/ # First download all bioconda dependencies ahead of time ./vpipe --conda-prefix ../snake-envs --cores 1 --conda-create-envs-only # And then run on the cluster, the compute node will not need to download anything