ENH: add action to generate human pangenome index and filter against …

…it (#196) Co-authored-by: Christos Konstantinos Matzoros <[email protected]>
bokulich-lab · Aug 30, 2024 · e16ac9d · e16ac9d
1 parent 0914dd1
commit e16ac9d
Show file tree

Hide file tree

Showing 14 changed files with 474 additions and 10 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -112,7 +112,7 @@ jobs:
           mamba run -n conda-env qiime dev refresh-cache
 
       - name: Install dev dependencies
-        run: mamba run -n conda-env pip install pytest coverage parameterized
+        run: mamba run -n conda-env pip install pytest pytest-cov coverage parameterized
 
       - name: Run tests
         id: test

diff --git a/.github/workflows/q2-ci.yaml b/.github/workflows/q2-ci.yaml
@@ -37,5 +37,6 @@ jobs:
         name: 'Upload coverage'
         with:
           fail_ci_if_error: true
+          verbose: true
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/.github/workflows/upload-coverage.yaml b/.github/workflows/upload-coverage.yaml
@@ -61,5 +61,6 @@ jobs:
           fail_ci_if_error: true
           override_pr: ${{ steps.pr.outputs.result }}
           override_commit: ${{ github.event.workflow_run.head_sha }}
+          verbose: true
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/Makefile b/Makefile
@@ -12,8 +12,7 @@ test: all
 	py.test
 
 test-cov: all
-	coverage run -m pytest
-	coverage xml
+	python -m pytest --cov q2_moshpit --cov-report xml:coverage.xml
 
 install: all
 	$(PYTHON) setup.py install

diff --git a/ci/recipe/meta.yaml b/ci/recipe/meta.yaml
@@ -23,6 +23,7 @@ requirements:
     - busco ==5.5.0
     - diamond
     - eggnog-mapper >=2.1.10
+    - gfatools
     - kaiju
     - kraken2
     - metabat2
@@ -32,7 +33,9 @@ requirements:
     - q2-types {{ qiime2_epoch }}.*
     - q2templates {{ qiime2_epoch }}.*
     - q2-assembly {{ qiime2_epoch }}.*
+    - q2-quality-control {{ qiime2_epoch }}.*
     - samtools
+    - seqtk
     - tqdm
     - xmltodict
     - pyhmmer

diff --git a/q2_moshpit/__init__.py b/q2_moshpit/__init__.py
@@ -11,7 +11,7 @@
 from . import prodigal
 from ._version import get_versions
 from .dereplication import dereplicate_mags
-from .filtering import filter_derep_mags, filter_mags
+from .filtering import filter_derep_mags, filter_mags, filter_reads_pangenome
 from .kaiju import classification as kaiju_class, database as kaiju_db
 from .kraken2 import (
     classification as kraken_class,
@@ -28,5 +28,6 @@
     'metabat2', 'bracken', 'kraken_class', 'kraken_db',
     'kaiju_class', 'kaiju_db', 'dereplicate_mags', 'eggnog',
     'busco', 'prodigal', 'kraken_helpers', 'partition',
-    'filter_derep_mags', 'filter_mags', 'get_feature_lengths'
+    'filter_derep_mags', 'filter_mags', 'get_feature_lengths',
+    'filter_reads_pangenome'
 ]
diff --git a/q2_moshpit/filtering/__init__.py b/q2_moshpit/filtering/__init__.py
@@ -7,5 +7,6 @@
 # ----------------------------------------------------------------------------
 
 from .filter_mags import filter_derep_mags, filter_mags
+from .filter_pangenome import filter_reads_pangenome
 
-__all__ = ["filter_derep_mags", "filter_mags"]
+__all__ = ["filter_derep_mags", "filter_mags", "filter_reads_pangenome"]
diff --git a/q2_moshpit/filtering/filter_pangenome.py b/q2_moshpit/filtering/filter_pangenome.py
@@ -0,0 +1,179 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2022-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+import glob
+import os
+import shutil
+import subprocess
+import tempfile
+
+from q2_moshpit._utils import run_command
+
+EBI_SERVER_URL = ("ftp://ftp.sra.ebi.ac.uk/vol1/analysis/ERZ127/"
+                  "ERZ12792464/hprc-v1.0-pggb.gfa.gz")
+
+
+def _fetch_and_extract_pangenome(uri: str, dest_dir: str):
+    """
+    Fetches and extracts the human pangenome GFA file.
+
+    Args:
+        uri (str): The URI of the genome to fetch. Should be in the form
+                    ftp://host/path/to/file.
+        dest_dir (str): The directory where the data will be saved.
+    """
+    filename = os.path.basename(uri)
+    dest_fp = os.path.join(dest_dir, filename)
+
+    try:
+        print("Fetching the GFA file...")
+        run_command(["wget", uri, "-q", "-O", dest_fp])
+    except Exception as e:
+        raise Exception(
+            "Unable to connect to the server. Please try again later. "
+            f"The error was: {e}"
+        )
+
+    print("Download finished. Extracting files...")
+    run_command(["gunzip", dest_fp])
+
+
+def _extract_fasta_from_gfa(gfa_fp: str, fasta_fp: str):
+    """
+    Extracts a FASTA file from a GFA file using gfatools.
+
+    This function runs a subprocess calling 'gfatools' to convert a GFA file
+    into a FASTA file. If the conversion is successful, the original GFA
+    file is removed. Otherwise, an exception is raised.
+
+    Args:
+        gfa_fp (str): The file path to the input GFA file.
+        fasta_fp (str): The file path where the output FASTA will be saved.
+    """
+    cmd = ["gfatools", "gfa2fa", gfa_fp]
+    with open(fasta_fp, 'w') as f_out:
+        try:
+            subprocess.run(cmd, stdout=f_out)
+        except Exception as e:
+            raise Exception(
+                f"Failed to extract the fasta file from the GFA. "
+                f"The error was: {e}"
+            )
+    os.remove(gfa_fp)
+
+
+def _fetch_and_extract_grch38(get_ncbi_genomes: callable, dest_dir: str):
+    """
+    Fetches and extracts the GRCh38 human reference genome.
+
+    This function uses the RESCRIPt method provided through the callable
+    `get_ncbi_genomes` to fetch the GRCh38 human reference genome.
+    The fetched 'dna-sequences.fasta' file is renamed to 'grch38.fasta'.
+
+    Args:
+        get_ncbi_genomes (callable): A function to fetch genomes from NCBI.
+        dest_dir (str): The directory where the genome data will be saved.
+    """
+    results = get_ncbi_genomes(
+        taxon='Homo sapiens',
+        only_reference=True,
+        assembly_levels=['chromosome'],
+        assembly_source='refseq',
+        only_genomic=False
+    )
+    results.genome_assemblies.export_data(dest_dir)
+    shutil.move(
+        os.path.join(dest_dir, "dna-sequences.fasta"),
+        os.path.join(dest_dir, "grch38.fasta")
+    )
+
+
+def _combine_fasta_files(*fasta_in_fp, fasta_out_fp):
+    """
+    Combines multiple FASTA files into a single FASTA file.
+
+    This function uses 'seqtk' to format and combine multiple FASTA files
+    into a single file. Each input FASTA file is appended to the output file.
+    After processing, the input files are removed.
+
+    Args:
+        *fasta_in_fp: Variable length argument list of paths to input
+            FASTA files.
+        fasta_out_fp (str): The file path where the combined output FASTA
+            file should be saved.
+    """
+    with open(fasta_out_fp, 'a') as f_out:
+        for f_in in fasta_in_fp:
+            try:
+                subprocess.run(["seqtk", "seq", "-U", f_in], stdout=f_out)
+            except Exception as e:
+                raise Exception(
+                    f"Failed to add the {f_in} to the reference FASTA file. "
+                    f"The error was: {e}"
+                )
+            os.remove(f_in)
+
+
+def filter_reads_pangenome(
+        ctx, reads, index=None, n_threads=1, mode='local',
+        sensitivity='sensitive', ref_gap_open_penalty=5,
+        ref_gap_ext_penalty=3,
+):
+    """
+    Filters reads against a pangenome index, optionally generating the index
+    if not provided.
+
+    This function fetches and processes the human pangenome and GRCh38
+    reference genome, combines them into a single FASTA file, and then
+    generates a Bowtie 2 index if not already provided. It then filters
+    reads against this index according to the specified sensitivity.
+    """
+    get_ncbi_genomes = ctx.get_action("rescript", "get_ncbi_genomes")
+    build_index = ctx.get_action("quality_control", "bowtie2_build")
+    filter_reads = ctx.get_action("quality_control", "filter_reads")
+
+    with tempfile.TemporaryDirectory() as tmp:
+        if index is None:
+            print("Reference index was not provided - it will be generated.")
+            print("Fetching the human pangenome GFA file...")
+            _fetch_and_extract_pangenome(EBI_SERVER_URL, tmp)
+
+            print("Fetching the human GRCh38 reference genome...")
+            _fetch_and_extract_grch38(get_ncbi_genomes, tmp)
+
+            print("Converting pangenome GFA to FASTA...")
+            gfa_fp = glob.glob(os.path.join(tmp, "*.gfa"))[0]
+            pan_fasta_fp = os.path.join(tmp, "pangenome.fasta")
+            _extract_fasta_from_gfa(gfa_fp, pan_fasta_fp)
+
+            print("Generating an index of the combined reference...")
+            combined_fasta_fp = os.path.join(tmp, "combined.fasta")
+            _combine_fasta_files(
+                pan_fasta_fp, os.path.join(tmp, "grch38.fasta"),
+                fasta_out_fp=combined_fasta_fp
+            )
+            combined_reference = ctx.make_artifact(
+                "FeatureData[Sequence]", combined_fasta_fp
+            )
+            index, = build_index(
+                sequences=combined_reference, n_threads=n_threads
+            )
+
+        print("Filtering reads against the index...")
+        filter_params = {
+            k: v for k, v in locals().items() if k in
+            ['n_threads', 'mode', 'ref_gap_open_penalty',
+             'ref_gap_ext_penalty']
+        }
+        filtered_reads, = filter_reads(
+            demultiplexed_sequences=reads,
+            database=index,
+            exclude_seqs=True,
+            **filter_params
+        )
+
+    return filtered_reads, index
diff --git a/q2_moshpit/filtering/tests/data/pangenome/combined.fasta b/q2_moshpit/filtering/tests/data/pangenome/combined.fasta
@@ -0,0 +1,4 @@
+>ref2
+GACTAG
+>ref1
+ATGCGAT
diff --git a/q2_moshpit/filtering/tests/data/pangenome/grch38.fasta b/q2_moshpit/filtering/tests/data/pangenome/grch38.fasta
@@ -0,0 +1,2 @@
+>ref1
+ATGCGAT
diff --git a/q2_moshpit/filtering/tests/data/pangenome/pangenome.fasta b/q2_moshpit/filtering/tests/data/pangenome/pangenome.fasta
@@ -0,0 +1,2 @@
+>ref2
+GaCtAG
-Original file line number
+Diff line change
@@ -0,0 +1,4 @@
+    >ref2
+    GACTAG
+    >ref1
+    ATGCGAT