From b1824be89d5163870204a7a00685dd1ed8fe7d05 Mon Sep 17 00:00:00 2001 From: Ivan Blagoev Topolsky Date: Sun, 8 Oct 2023 16:53:58 +0200 Subject: [PATCH] Run separate tutorials - Run per-virus tutorial in parallel - Reusing the installation tutorial's output - Use zstd compression --- .github/workflows/tutorials.yaml | 64 +++++++++++++++++++++++- docs/tutorial_hiv.md | 83 +++++++++++++++++--------------- docs/tutorial_sarscov2.md | 67 ++++++++++++++------------ 3 files changed, 141 insertions(+), 73 deletions(-) diff --git a/.github/workflows/tutorials.yaml b/.github/workflows/tutorials.yaml index 378f0a9e..49cc18f7 100644 --- a/.github/workflows/tutorials.yaml +++ b/.github/workflows/tutorials.yaml @@ -49,13 +49,13 @@ jobs: - name: Create V-pipe installation archive
# HACK this acceleraters upload while preserving Unix-specifics (case sensitivity, file attributes). - run: tar -cvf vpipeinstallation.tar ./docs/vp-analysis + run: tar --zstd -cvf vpipeinstallation.tar.zst ./docs/vp-analysis - name: Keep installation uses: actions/upload-artifact@v3 with: name: VPipeInstallation - path: vpipeinstallation.tar + path: vpipeinstallation.tar.zst if-no-files-found: error - name: Save notebooks @@ -65,6 +65,66 @@ jobs: path: ./docs/*.ipynb + tutorial: + needs: installer + + strategy: + max-parallel: 5 + fail-fast: false + matrix: + virus: ["hiv", "sarscov2"] + + runs-on: ubuntu-latest + + defaults: + run: + shell: bash -l {0} + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + sparse-checkout: | + docs + tests + sparse-checkout-cone-mode: false + lfs: false + + - name: Reuse installation + uses: actions/download-artifact@v3 + with: + name: VPipeInstallation + + - name: Extract V-pipe installation archive + run: tar --zstd -xvf vpipeinstallation.tar.zst + + - name: Install conda environment dependencies + uses: conda-incubator/setup-miniconda@v2 + with: + miniforge-version: latest + miniforge-variant: Mambaforge + python-version: "3.11" + 
mamba-version: "*" + channels: conda-forge,bioconda + channel-priority: strict + activate-environment: JupyText + auto-update-conda: true + environment-file: tests/conda_tutorials_env.yaml + + - name: Setup Jupyter kernel + run: | + python -m ipykernel install --user + + - name: Run analysis notebooks + working-directory: ./docs + run: ./convert.sh --branch tutorial*_${{ matrix.virus }}.md + + - name: Save notebooks + uses: actions/upload-artifact@v3 + with: + name: JupyterNotebooks + path: ./docs/*.ipynb + # - name: Publish # uses: peaceiris/actions-gh-pages@v3 # with: diff --git a/docs/tutorial_hiv.md b/docs/tutorial_hiv.md index 3e8d9d65..afcdc47a 100644 --- a/docs/tutorial_hiv.md +++ b/docs/tutorial_hiv.md @@ -15,13 +15,31 @@ jupyter: --- -# V-Pipe Tutorial +# V-Pipe HIV Tutorial V-pipe is a workflow designed for the analysis of next generation sequencing (NGS) data from viral pathogens. It produces a number of results in a curated format (e.g., consensus sequences, SNV calls, local/global haplotypes). V-pipe is written using the Snakemake workflow management system. +The present tutorial will show you how to apply V-pipe on HIV sequencing data. + ## Requirements -V-pipe is optimized for Linux or Mac OS systems. Therefore, we recommend users with a Windows system to [install WSL2](https://learn.microsoft.com/en-us/windows/wsl/install) - this is not a full virtual machine but rather a way to run Windows and Linux cooperatively at the same time. 
+The tutorial assumes that you have [installed V-pipe using the installation tutorial](tutorial_0_install.md), and that the workflow is set up with the following structure: + +```text +📁 [HOME] +└───📁vp-analysis + ├───📁V-pipe # V-pipe checked out from Github + ├───📁Miniforge3 # bioconda + conda-forge + mamba + Snakemake + ├───📁work # work directories + ├───📁work-tests # … + └───📁 … # … +``` + +- `vp-analysis` is the main directory where we installed everything in the previous tutorial +- `Miniforge3` has dependencies to start using V-pipe (bioconda, conda-forge, mamba, snakemake) +- `V-pipe` is the directory with V-pipe's own code +- and for this tutorial we will create a directory like `work…`, which will hold the configuration and the sequencing data for our analysis. + ## Organizing Data @@ -78,52 +96,31 @@ The files will have the following structure: ## Install V-pipe -V-pipe uses the [Bioconda](https://bioconda.github.io/) bioinformatics software repository for all its pipeline components. The pipeline itself is implemented using [Snakemake](https://snakemake.readthedocs.io/en/stable/). - -For advanced users: If your are fluent with these tools, you can: - -* directly download and install [bioconda](https://bioconda.github.io/user/install.html) and [snakemake](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html#installation-via-conda), -* specifiy your V-pipe configuration, and start using V-pipe - -Use `--use-conda` to [automatically download and install](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#integrated-package-management) any further pipeline dependencies. Please refer to the documentation for additional instructions. - -In this present tutorial you will learn how to setup a workflow for the example dataset. 
- -To deploy V-pipe, you can use the installation script with the following parameters: +After [having installed V-pipe using the installation tutorial](tutorial_0_install.md), create a new working directory for this analysis: ```bash -curl -O 'https://raw.githubusercontent.com/cbg-ethz/V-pipe/master/utils/quick_install.sh' -bash quick_install.sh -p testing -w work -``` +cd vp_analysis -Note that - -* using `-p` specifies the subdirectory where to download and install snakemake and V-pipe -* using `-w` will create a working directory and populate it. It will colloquial the references and the default `config/config.yaml`, and create a handy `vpipe` short-cut script to invoke `snakemake`. - - -If you get `zsh: permission denied: ./quick_install.sh`, run `chmod +x quick_install.sh` this gives the necessary permissions. - -Tip: To create and populate other new working directories, you can call init_project.sh from within the new directory: - -```console -mkdir -p working_2 -cd working_2 +# create a new directory and initialise it +mkdir -p work_hiv +cd work_hiv ../V-pipe/init_project.sh + +cd ../.. ``` ## Preparation -Copy the samples directory you created in the step "Preparing a small dataset" to this working directory. (You can display the directory structure with `tree testing/work/resources/samples` or `find testing/work/resources/samples`.) +Copy the samples directory you created in the step "Preparing a small dataset" to this working directory. (You can display the directory structure with `tree vp_analysis/work_hiv/resources/samples` or `find vp_analysis/work_hiv/resources/samples`.) 
```bash -mkdir -p testing/work/resources -mv testing/V-pipe/docs/example_HIV_data/samples testing/work/resources/samples +mkdir -p vp_analysis/work_hiv/resources +mv vp_analysis/V-pipe/docs/example_HIV_data/samples vp_analysis/work_hiv/resources/samples ``` Note that: -- by default V-pipe expects its samples in a directory `samples` contained directly in the working directory - i.e. `testing/work/sample`` +- by default V-pipe expects its samples in a directory `samples` contained directly in the working directory - i.e. `vp_analysis/work_hiv/samples` - in this tutorial we put them inside the `resources` subdirectory, and will set the config file accordingly. @@ -132,11 +129,11 @@ If you have a reference sequences that you would like to use for read mapping an ### Preparing V-pipe's configuration -In the `work` directory you can find the file `config.yaml`. This is where the V-Pipe configuation should be specified. See [here](https://github.com/cbg-ethz/V-pipe/tree/master/config#readme) for the documentation of the configuration. +In the `work_hiv` directory you can find the file `config.yaml`. This is where the V-Pipe configuration should be specified. See [here](https://github.com/cbg-ethz/V-pipe/tree/master/config#readme) for the documentation of the configuration. In this tutorial we are building our own configuration therefore `virus_base_config` will remain empty. Since we are working with HIV-1, V-Pipe is providing meta information that will be used for visualisation (metainfo_file and gff_directory). 
```bash -cat < ./testing/work/config.yaml +cat < ./vp_analysis/work_hiv/config.yaml general: virus_base_config: "" aligner: bwa @@ -147,6 +144,7 @@ input: reference: "{VPIPE_BASEDIR}/../resources/hiv/HXB2.fasta" metainfo_file: "{VPIPE_BASEDIR}/../resources/hiv/metainfo.yaml" gff_directory: "{VPIPE_BASEDIR}/../resources/hiv/gffs/" + # NOTE: this input datadir isn't standard datadir: resources/samples/ read_length: 301 samples_file: samples.tsv @@ -165,7 +163,7 @@ output: EOT ``` -Note: A YAML files use spaces as indentation, you can use 2 or 4 spaces for indentation, but no tab. There are also online YAML file validators that you might want to use if your YAML file is wrongly formatted. +**Note**: YAML files use spaces as indentation, you can use 2 or 4 spaces for indentation, but **no tab**. There are also [online YAML file validators](https://www.yamllint.com/) that you might want to use if your YAML file is wrongly formatted. ## Running V-pipe @@ -173,8 +171,10 @@ Note: A YAML files use spaces as indentation, you can use 2 or 4 spaces for inde Before running check what will be executed: ```bash -cd testing/work/ +cd vp_analysis/work_hiv/ + ./vpipe --dryrun + cd ../.. ``` @@ -183,7 +183,7 @@ As this is your first run of V-pipe, it will also generate the sample collection Note that the samples you have downloaded have reads of length 301 only. V-pipe's default parameters are optimized for reads of length 250. To adapt to the read length, add a third column in the tab-separated file as follows: ```bash -cat < testing/work/samples.tsv +cat < vp_analysis/work_hiv/samples.tsv CAP217 4390 301 CAP188 4 301 CAP188 30 301 @@ -198,8 +198,11 @@ You can safely delete it and re-run with option `--dry-run` to regenerate it. 
Finally, we can run the V-pipe analysis (the necessary dependencies will be downloaded and installed in conda environments managed by snakemake): ```bash -cd testing/work/ +cd vp_analysis/work_hiv/ + ./vpipe -p --cores 2 + +cd - ``` diff --git a/docs/tutorial_sarscov2.md b/docs/tutorial_sarscov2.md index 5499f046..84a42913 100644 --- a/docs/tutorial_sarscov2.md +++ b/docs/tutorial_sarscov2.md @@ -21,6 +21,24 @@ This tutorial shows the basics of how to interact with V-pipe. For the purpose of this Tutorial, we will work with the master branch of V-pipe and use the _sars-cov-2_ virus base config which is adapted for the SARS-CoV-2 virus. +## Requirements + +The tutorial assumes that you have [installed V-pipe using the installation tutorial](tutorial_0_install.md), and that the workflow is set up with the following structure: + +```text +📁 [HOME] +└───📁vp-analysis + ├───📁V-pipe # V-pipe checked out from Github + ├───📁Miniforge3 # bioconda + conda-forge + mamba + Snakemake + ├───📁work # work directories + ├───📁work-tests # … + └───📁 … # … +``` + +- `vp-analysis` is the main directory where we installed everything in the previous tutorial +- `Miniforge3` has dependencies to start using V-pipe (bioconda, conda-forge, mamba, snakemake) +- `V-pipe` is the directory with V-pipe's own code +- and for this tutorial we will create a directory like `work…`, which will hold the configuration and the sequencing data for our analysis. ## Organizing Data @@ -89,34 +107,20 @@ tree samples ## Install V-pipe -V-pipe uses the [Bioconda](https://bioconda.github.io/) bioinformatics software repository for all its pipeline components. The pipeline itself is written using [Snakemake](https://snakemake.readthedocs.io/en/stable/). 
- -> **For advanced users:** If your are fluent with these tools, you can: -> -> * directly download and install [bioconda](https://bioconda.github.io/user/install.html) and [snakemake](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html#installation-via-conda), -> * make sure to configure V-pipe to use the `sars-cov-2` virus-config -> * and start using V-pipe with them, using the --use-conda to [automatically download and install](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#integrated-package-management) any further pipeline dependencies. -> * please refer to the [documentation](https://github.com/cbg-ethz/V-pipe/blob/master/README.md) for additional instructions. -> -> The present tutorial will show simplified commands that automate much of this process. + -To deploy V-pipe, you can use the installation script with the following parameters: +After [having installed V-pipe using the installation tutorial](tutorial_0_install.md), create a new working directory for this analysis: ```bash -curl -O 'https://raw.githubusercontent.com/cbg-ethz/V-pipe/master/utils/quick_install.sh' -bash quick_install.sh -p tutorial -w work -``` +cd vp_analysis -* using `-p` specifies the subdirectory where to download and install snakemake and V-pipe -* using `-w` will create a working directory and populate it. It will copy over the references and the default `config/config.yaml`, and create a handy `vpipe` short-cut script to invoke `snakemake`. +# create a new directory and initialise it +mkdir -p work_sarscov2 +cd work_sarscov2 +../V-pipe/init_project.sh -> **Tip:** To create and populate other new working directories, you can call init_project.sh from within the new directory: -> -> ```console -> mkdir -p working_2 -> cd working_2 -> ../V-pipe/init_project.sh -> ``` +cd ../.. 
+``` ## Running V-pipe @@ -124,13 +128,13 @@ bash quick_install.sh -p tutorial -w work Copy the samples directory you created in the step [Preparing a small](#preparing-a-small-dataset) dataset to this working directory. (You can display the directory structure with `tree samples` or `find samples`.) ```bash -mv samples tutorial/work/ +mv samples vp_analysis/work_sarscov2/ ``` Prepare V-pipe's configuration. You can find more information in [the documentation](https://github.com/cbg-ethz/V-pipe/blob/master/config/README.md). In your local V-pipe installation, you will also find an exhaustive manual about all the configuration options inside `config/config.html`. ```bash -cat < tutorial/work/config.yaml +cat < vp_analysis/work_sarscov2/config.yaml general: virus_base_config: 'sars-cov-2' @@ -139,6 +143,7 @@ input: output: trim_primers: false + # NOTE: set "snv" to "true" to run the tutorial. We left "false" so automated test doesn't take too much time on GitHub. snv: false local: false global: false @@ -153,7 +158,7 @@ EOT Check what will be executed: ```bash -cd tutorial/work/ +cd vp_analysis/work_sarscov2/ ./vpipe --dryrun cd ../.. ``` @@ -163,7 +168,7 @@ As it is your first run of V-pipe, this will also generate the sample collection Note that the demo files you downloaded have reads of length 150 only. V-pipe’s default parameters are optimized for reads of length 250; add the third column in the tab-separated file: ```bash -cat < tutorial/work/samples.tsv +cat < vp_analysis/work_sarscov2/samples.tsv SRR10903401 20200102 150 SRR10903402 20200102 150 EOT @@ -176,7 +181,7 @@ You can safely delete it and re-run the `--dryrun` to regenerate it. 
Run the V-pipe analysis (the necessary dependencies will be downloaded and installed in conda environments managed by snakemake): ```bash -cd tutorial/work/ +cd vp_analysis/work_sarscov2/ ./vpipe -p --cores 2 ``` @@ -219,7 +224,7 @@ The most user friendly way to submit jobs to the cluster is using a special _sna [smk-simple-slurm](https://github.com/jdblischak/smk-simple-slurm) is a profile that works well in our experience with SLURM (for other platforms see suggestions in [the snakemake-profil documentation](https://github.com/snakemake-profiles/doc)). ```bash -cd tutorial/ +cd vp_analysis/ # download the profile git clone https://github.com/jdblischak/smk-simple-slurm.git # edit simple/config.yaml and either comment out the partition and qos or adapt to your local HPC @@ -249,7 +254,7 @@ printshellcmds: True scheduler: greedy use-conda: True EOT -cd work/ +cd work_sarscov2/ ./vpipe --dry-run --profile ../smk-simple-slurm/simple/ --jobs 100 cd ../.. ``` @@ -265,7 +270,7 @@ In addition, Snakemake has [parameters for conda](https://snakemake.readthedocs. - using `--conda-prefix=`_{DIR}_ stores the conda environments of dependencies in a common directory (thus possible to share and re-use between multiple instances of V-pipe). ```bash -cd tutorial/work/ +cd vp_analysis/work_sarscov2/ # First download all bioconda dependencies ahead of time ./vpipe --conda-prefix ../snake-envs --cores 1 --conda-create-envs-only # And then run on the cluster, the compute node will not need to download anything