ENH: add action to generate human pangenome index and filter against it (#196)

Co-authored-by: Christos Konstantinos Matzoros <[email protected]>
misialq and ChristosMatzoros authored Aug 30, 2024
1 parent 0914dd1 commit e16ac9d
Showing 14 changed files with 474 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
@@ -112,7 +112,7 @@ jobs:
mamba run -n conda-env qiime dev refresh-cache
- name: Install dev dependencies
-        run: mamba run -n conda-env pip install pytest coverage parameterized
+        run: mamba run -n conda-env pip install pytest pytest-cov coverage parameterized

- name: Run tests
id: test
1 change: 1 addition & 0 deletions .github/workflows/q2-ci.yaml
@@ -37,5 +37,6 @@ jobs:
name: 'Upload coverage'
with:
fail_ci_if_error: true
verbose: true
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
1 change: 1 addition & 0 deletions .github/workflows/upload-coverage.yaml
@@ -61,5 +61,6 @@ jobs:
fail_ci_if_error: true
override_pr: ${{ steps.pr.outputs.result }}
override_commit: ${{ github.event.workflow_run.head_sha }}
verbose: true
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
3 changes: 1 addition & 2 deletions Makefile
@@ -12,8 +12,7 @@ test: all
py.test

test-cov: all
-	coverage run -m pytest
-	coverage xml
+	python -m pytest --cov q2_moshpit --cov-report xml:coverage.xml

install: all
$(PYTHON) setup.py install
3 changes: 3 additions & 0 deletions ci/recipe/meta.yaml
@@ -23,6 +23,7 @@ requirements:
- busco ==5.5.0
- diamond
- eggnog-mapper >=2.1.10
- gfatools
- kaiju
- kraken2
- metabat2
@@ -32,7 +33,9 @@ requirements:
- q2-types {{ qiime2_epoch }}.*
- q2templates {{ qiime2_epoch }}.*
- q2-assembly {{ qiime2_epoch }}.*
- q2-quality-control {{ qiime2_epoch }}.*
- samtools
- seqtk
- tqdm
- xmltodict
- pyhmmer
5 changes: 3 additions & 2 deletions q2_moshpit/__init__.py
@@ -11,7 +11,7 @@
from . import prodigal
from ._version import get_versions
from .dereplication import dereplicate_mags
-from .filtering import filter_derep_mags, filter_mags
+from .filtering import filter_derep_mags, filter_mags, filter_reads_pangenome
from .kaiju import classification as kaiju_class, database as kaiju_db
from .kraken2 import (
classification as kraken_class,
@@ -28,5 +28,6 @@
'metabat2', 'bracken', 'kraken_class', 'kraken_db',
'kaiju_class', 'kaiju_db', 'dereplicate_mags', 'eggnog',
'busco', 'prodigal', 'kraken_helpers', 'partition',
-    'filter_derep_mags', 'filter_mags', 'get_feature_lengths'
+    'filter_derep_mags', 'filter_mags', 'get_feature_lengths',
+    'filter_reads_pangenome'
]
3 changes: 2 additions & 1 deletion q2_moshpit/filtering/__init__.py
@@ -7,5 +7,6 @@
# ----------------------------------------------------------------------------

from .filter_mags import filter_derep_mags, filter_mags
from .filter_pangenome import filter_reads_pangenome

__all__ = ["filter_derep_mags", "filter_mags"]
__all__ = ["filter_derep_mags", "filter_mags", "filter_reads_pangenome"]
179 changes: 179 additions & 0 deletions q2_moshpit/filtering/filter_pangenome.py
@@ -0,0 +1,179 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2022-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import glob
import os
import shutil
import subprocess
import tempfile

from q2_moshpit._utils import run_command

EBI_SERVER_URL = ("ftp://ftp.sra.ebi.ac.uk/vol1/analysis/ERZ127/"
"ERZ12792464/hprc-v1.0-pggb.gfa.gz")


def _fetch_and_extract_pangenome(uri: str, dest_dir: str):
"""
Fetches and extracts the human pangenome GFA file.
Args:
uri (str): The URI of the genome to fetch. Should be in the form
ftp://host/path/to/file.
dest_dir (str): The directory where the data will be saved.
"""
filename = os.path.basename(uri)
dest_fp = os.path.join(dest_dir, filename)

try:
print("Fetching the GFA file...")
run_command(["wget", uri, "-q", "-O", dest_fp])
except Exception as e:
raise Exception(
"Unable to connect to the server. Please try again later. "
f"The error was: {e}"
)

print("Download finished. Extracting files...")
run_command(["gunzip", dest_fp])


def _extract_fasta_from_gfa(gfa_fp: str, fasta_fp: str):
"""
Extracts a FASTA file from a GFA file using gfatools.
This function runs a subprocess calling 'gfatools' to convert a GFA file
into a FASTA file. If the conversion is successful, the original GFA
file is removed. Otherwise, an exception is raised.
Args:
gfa_fp (str): The file path to the input GFA file.
fasta_fp (str): The file path where the output FASTA will be saved.
"""
cmd = ["gfatools", "gfa2fa", gfa_fp]
with open(fasta_fp, 'w') as f_out:
try:
subprocess.run(cmd, stdout=f_out)
except Exception as e:
raise Exception(
f"Failed to extract the fasta file from the GFA. "
f"The error was: {e}"
)
os.remove(gfa_fp)
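
# Usage sketch (illustrative): the conversion is equivalent to the shell
# call `gfatools gfa2fa in.gfa > out.fasta`, e.g.
#
#     _extract_fasta_from_gfa(
#         gfa_fp=os.path.join(tmp, "hprc-v1.0-pggb.gfa"),
#         fasta_fp=os.path.join(tmp, "pangenome.fasta"),
#     )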


def _fetch_and_extract_grch38(get_ncbi_genomes: callable, dest_dir: str):
"""
Fetches and extracts the GRCh38 human reference genome.
This function uses the RESCRIPt method provided through the callable
`get_ncbi_genomes` to fetch the GRCh38 human reference genome.
The fetched 'dna-sequences.fasta' file is renamed to 'grch38.fasta'.
Args:
get_ncbi_genomes (callable): A function to fetch genomes from NCBI.
dest_dir (str): The directory where the genome data will be saved.
"""
results = get_ncbi_genomes(
taxon='Homo sapiens',
only_reference=True,
assembly_levels=['chromosome'],
assembly_source='refseq',
only_genomic=False
)
results.genome_assemblies.export_data(dest_dir)
shutil.move(
os.path.join(dest_dir, "dna-sequences.fasta"),
os.path.join(dest_dir, "grch38.fasta")
)
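
# Note (inferred from the code above): `get_ncbi_genomes` is RESCRIPt's
# action for fetching NCBI assemblies, so `results.genome_assemblies` is
# a QIIME 2 artifact; exporting it yields `dna-sequences.fasta`, which is
# renamed to `grch38.fasta` to keep the two references distinguishable.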


def _combine_fasta_files(*fasta_in_fp, fasta_out_fp):
"""
Combines multiple FASTA files into a single FASTA file.
This function uses 'seqtk' to format and combine multiple FASTA files
into a single file. Each input FASTA file is appended to the output file.
After processing, the input files are removed.
Args:
*fasta_in_fp: Variable length argument list of paths to input
FASTA files.
fasta_out_fp (str): The file path where the combined output FASTA
file should be saved.
"""
    with open(fasta_out_fp, 'a') as f_out:
        for f_in in fasta_in_fp:
            try:
                # check=True makes a failed seqtk call raise an error
                # instead of silently truncating the combined output
                subprocess.run(
                    ["seqtk", "seq", "-U", f_in], stdout=f_out, check=True
                )
            except Exception as e:
                raise Exception(
                    f"Failed to add {f_in} to the reference FASTA file. "
                    f"The error was: {e}"
                )
            os.remove(f_in)
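
# Usage sketch mirroring the fixtures in tests/data/pangenome
# (illustrative): appending pangenome.fasta (ref2) first and grch38.fasta
# (ref1) second yields combined.fasta with ref2 before ref1, and
# `seqtk seq -U` uppercases the mixed-case GaCtAG to GACTAG:
#
#     _combine_fasta_files(
#         pan_fasta_fp, grch38_fp, fasta_out_fp=combined_fp
#     )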


def filter_reads_pangenome(
ctx, reads, index=None, n_threads=1, mode='local',
sensitivity='sensitive', ref_gap_open_penalty=5,
ref_gap_ext_penalty=3,
):
"""
    Filters reads against a pangenome index, generating the index first if
    one is not provided.

    When no index is given, this function fetches the human pangenome and
    the GRCh38 reference genome, converts and combines them into a single
    FASTA file, and builds a Bowtie 2 index from it. Reads are then
    filtered against the index with the specified mode and sensitivity.
"""
get_ncbi_genomes = ctx.get_action("rescript", "get_ncbi_genomes")
build_index = ctx.get_action("quality_control", "bowtie2_build")
filter_reads = ctx.get_action("quality_control", "filter_reads")

with tempfile.TemporaryDirectory() as tmp:
if index is None:
print("Reference index was not provided - it will be generated.")
print("Fetching the human pangenome GFA file...")
_fetch_and_extract_pangenome(EBI_SERVER_URL, tmp)

print("Fetching the human GRCh38 reference genome...")
_fetch_and_extract_grch38(get_ncbi_genomes, tmp)

print("Converting pangenome GFA to FASTA...")
gfa_fp = glob.glob(os.path.join(tmp, "*.gfa"))[0]
pan_fasta_fp = os.path.join(tmp, "pangenome.fasta")
_extract_fasta_from_gfa(gfa_fp, pan_fasta_fp)

print("Generating an index of the combined reference...")
combined_fasta_fp = os.path.join(tmp, "combined.fasta")
_combine_fasta_files(
pan_fasta_fp, os.path.join(tmp, "grch38.fasta"),
fasta_out_fp=combined_fasta_fp
)
combined_reference = ctx.make_artifact(
"FeatureData[Sequence]", combined_fasta_fp
)
index, = build_index(
sequences=combined_reference, n_threads=n_threads
)

print("Filtering reads against the index...")
        # forward only the parameters that filter_reads accepts;
        # 'sensitivity' is included so the preset promised in the
        # docstring is actually applied
        filter_params = {
            k: v for k, v in locals().items() if k in
            ['n_threads', 'mode', 'sensitivity', 'ref_gap_open_penalty',
             'ref_gap_ext_penalty']
        }
filtered_reads, = filter_reads(
demultiplexed_sequences=reads,
database=index,
exclude_seqs=True,
**filter_params
)

return filtered_reads, index
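
As a usage illustration (assumptions: the plugin is importable as
qiime2.plugins.moshpit and the action is registered as
filter_reads_pangenome; the registration code is not among the files
shown here), the pipeline could be driven from the QIIME 2 Python API
roughly like:

    import qiime2
    from qiime2.plugins import moshpit

    # a SampleData[PairedEndSequencesWithQuality] artifact with the reads
    reads = qiime2.Artifact.load("reads.qza")

    # index=None triggers the fetch/convert/combine/build branch above
    filtered_reads, index = moshpit.actions.filter_reads_pangenome(
        reads=reads, n_threads=4, sensitivity="sensitive"
    )
    filtered_reads.save("filtered-reads.qza")
    index.save("pangenome-index.qza")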
4 changes: 4 additions & 0 deletions q2_moshpit/filtering/tests/data/pangenome/combined.fasta
@@ -0,0 +1,4 @@
>ref2
GACTAG
>ref1
ATGCGAT
2 changes: 2 additions & 0 deletions q2_moshpit/filtering/tests/data/pangenome/grch38.fasta
@@ -0,0 +1,2 @@
>ref1
ATGCGAT
2 changes: 2 additions & 0 deletions q2_moshpit/filtering/tests/data/pangenome/pangenome.fasta
@@ -0,0 +1,2 @@
>ref2
GaCtAG
(diffs for the remaining changed files are not shown)
