diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index eb8e6a92..e9a2bec4 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -112,7 +112,7 @@ jobs:
          mamba run -n conda-env qiime dev refresh-cache

      - name: Install dev dependencies
-       run: mamba run -n conda-env pip install pytest coverage parameterized
+       run: mamba run -n conda-env pip install pytest pytest-cov coverage parameterized

      - name: Run tests
        id: test
diff --git a/.github/workflows/q2-ci.yaml b/.github/workflows/q2-ci.yaml
index b5b8308c..b604d221 100644
--- a/.github/workflows/q2-ci.yaml
+++ b/.github/workflows/q2-ci.yaml
@@ -37,5 +37,6 @@ jobs:
        name: 'Upload coverage'
        with:
          fail_ci_if_error: true
+         verbose: true
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/.github/workflows/upload-coverage.yaml b/.github/workflows/upload-coverage.yaml
index 39882403..019b40ae 100644
--- a/.github/workflows/upload-coverage.yaml
+++ b/.github/workflows/upload-coverage.yaml
@@ -61,5 +61,6 @@ jobs:
          fail_ci_if_error: true
          override_pr: ${{ steps.pr.outputs.result }}
          override_commit: ${{ github.event.workflow_run.head_sha }}
+         verbose: true
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/Makefile b/Makefile
index 7cdb2f39..24f84ef8 100644
--- a/Makefile
+++ b/Makefile
@@ -12,8 +12,7 @@ test: all
 	py.test

 test-cov: all
-	coverage run -m pytest
-	coverage xml
+	python -m pytest --cov q2_moshpit --cov-report xml:coverage.xml

 install: all
 	$(PYTHON) setup.py install
diff --git a/ci/recipe/meta.yaml b/ci/recipe/meta.yaml
index 60b0aa7b..f2d8dbf8 100644
--- a/ci/recipe/meta.yaml
+++ b/ci/recipe/meta.yaml
@@ -23,6 +23,7 @@ requirements:
     - busco ==5.5.0
     - diamond
     - eggnog-mapper >=2.1.10
+    - gfatools
     - kaiju
     - kraken2
     - metabat2
@@ -32,7 +33,9 @@
     - q2-types {{ qiime2_epoch }}.*
     - q2templates {{ qiime2_epoch }}.*
     - q2-assembly {{ qiime2_epoch }}.*
+    - q2-quality-control {{ qiime2_epoch }}.*
     - samtools
+    - seqtk
     - tqdm
    - xmltodict
     - pyhmmer
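The recipe additions above matter at runtime, not only at build time: the new filtering module shells out to `gfatools` and `seqtk`, and relies on q2-quality-control (which brings Bowtie 2) for index building and read filtering. A minimal, hypothetical environment sanity check — the binary list is an assumption drawn from the commands used later in this patch, not part of the change itself:

```python
import shutil

# Binaries the new filter_pangenome module (directly or via
# q2-quality-control) expects on PATH in the built conda env.
REQUIRED_BINARIES = ["gfatools", "seqtk", "wget", "gunzip", "bowtie2"]

missing = [b for b in REQUIRED_BINARIES if shutil.which(b) is None]
if missing:
    raise RuntimeError(f"Missing external tools: {', '.join(missing)}")
```
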
diff --git a/q2_moshpit/__init__.py b/q2_moshpit/__init__.py
index d8074c5b..69762283 100644
--- a/q2_moshpit/__init__.py
+++ b/q2_moshpit/__init__.py
@@ -12,7 +12,7 @@
 from . import prodigal
 from ._version import get_versions
 from .dereplication import dereplicate_mags
-from .filtering import filter_derep_mags, filter_mags
+from .filtering import filter_derep_mags, filter_mags, filter_reads_pangenome
 from .kaiju import classification as kaiju_class, database as kaiju_db
 from .kraken2 import (
     classification as kraken_class,
@@ -30,5 +30,5 @@
     'kaiju_class', 'kaiju_db', 'dereplicate_mags', 'eggnog', 'busco',
     'prodigal', 'kraken_helpers', 'partition', 'filter_derep_mags',
     'filter_mags', 'get_feature_lengths',
-    'abundance'
+    'abundance', 'filter_reads_pangenome'
 ]
diff --git a/q2_moshpit/filtering/__init__.py b/q2_moshpit/filtering/__init__.py
index 0ef6aef4..a8f8e4d5 100644
--- a/q2_moshpit/filtering/__init__.py
+++ b/q2_moshpit/filtering/__init__.py
@@ -7,5 +7,6 @@
 # ----------------------------------------------------------------------------

 from .filter_mags import filter_derep_mags, filter_mags
+from .filter_pangenome import filter_reads_pangenome

-__all__ = ["filter_derep_mags", "filter_mags"]
+__all__ = ["filter_derep_mags", "filter_mags", "filter_reads_pangenome"]
diff --git a/q2_moshpit/filtering/filter_pangenome.py b/q2_moshpit/filtering/filter_pangenome.py
new file mode 100644
index 00000000..81aa05af
--- /dev/null
+++ b/q2_moshpit/filtering/filter_pangenome.py
@@ -0,0 +1,179 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2022-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+import glob
+import os
+import shutil
+import subprocess
+import tempfile
+
+from q2_moshpit._utils import run_command
+
+EBI_SERVER_URL = ("ftp://ftp.sra.ebi.ac.uk/vol1/analysis/ERZ127/"
+                  "ERZ12792464/hprc-v1.0-pggb.gfa.gz")
+
+
+def _fetch_and_extract_pangenome(uri: str, dest_dir: str):
+    """
+    Fetches and extracts the human pangenome GFA file.
+
+    Args:
+        uri (str): The URI of the genome to fetch. Should be in the form
+            ftp://host/path/to/file.
+        dest_dir (str): The directory where the data will be saved.
+    """
+    filename = os.path.basename(uri)
+    dest_fp = os.path.join(dest_dir, filename)
+
+    try:
+        print("Fetching the GFA file...")
+        run_command(["wget", uri, "-q", "-O", dest_fp])
+    except Exception as e:
+        raise Exception(
+            "Unable to connect to the server. Please try again later. "
+            f"The error was: {e}"
+        )
+
+    print("Download finished. Extracting files...")
+    run_command(["gunzip", dest_fp])
+
+
+def _extract_fasta_from_gfa(gfa_fp: str, fasta_fp: str):
+    """
+    Extracts a FASTA file from a GFA file using gfatools.
+
+    This function runs a subprocess calling 'gfatools' to convert a GFA file
+    into a FASTA file. If the conversion is successful, the original GFA
+    file is removed. Otherwise, an exception is raised.
+
+    Args:
+        gfa_fp (str): The file path to the input GFA file.
+        fasta_fp (str): The file path where the output FASTA will be saved.
+    """
+    cmd = ["gfatools", "gfa2fa", gfa_fp]
+    with open(fasta_fp, 'w') as f_out:
+        try:
+            # check=True is required here: without it a non-zero exit from
+            # gfatools would not raise and the failure would pass silently
+            subprocess.run(cmd, stdout=f_out, check=True)
+        except Exception as e:
+            raise Exception(
+                f"Failed to extract the fasta file from the GFA. "
+                f"The error was: {e}"
+            )
+    os.remove(gfa_fp)
+
+
+def _fetch_and_extract_grch38(get_ncbi_genomes: callable, dest_dir: str):
+    """
+    Fetches and extracts the GRCh38 human reference genome.
+
+    This function uses the RESCRIPt method provided through the callable
+    `get_ncbi_genomes` to fetch the GRCh38 human reference genome.
+    The fetched 'dna-sequences.fasta' file is renamed to 'grch38.fasta'.
+
+    Args:
+        get_ncbi_genomes (callable): A function to fetch genomes from NCBI.
+        dest_dir (str): The directory where the genome data will be saved.
+    """
+    results = get_ncbi_genomes(
+        taxon='Homo sapiens',
+        only_reference=True,
+        assembly_levels=['chromosome'],
+        assembly_source='refseq',
+        only_genomic=False
+    )
+    results.genome_assemblies.export_data(dest_dir)
+    shutil.move(
+        os.path.join(dest_dir, "dna-sequences.fasta"),
+        os.path.join(dest_dir, "grch38.fasta")
+    )
+
+
+def _combine_fasta_files(*fasta_in_fp, fasta_out_fp):
+    """
+    Combines multiple FASTA files into a single FASTA file.
+
+    This function uses 'seqtk' to format and combine multiple FASTA files
+    into a single file. Each input FASTA file is appended to the output
+    file. After processing, the input files are removed.
+
+    Args:
+        *fasta_in_fp: Variable length argument list of paths to input
+            FASTA files.
+        fasta_out_fp (str): The file path where the combined output FASTA
+            file should be saved.
+    """
+    with open(fasta_out_fp, 'a') as f_out:
+        for f_in in fasta_in_fp:
+            try:
+                subprocess.run(
+                    ["seqtk", "seq", "-U", f_in], stdout=f_out, check=True
+                )
+            except Exception as e:
+                raise Exception(
+                    f"Failed to add the {f_in} to the reference FASTA file. "
+                    f"The error was: {e}"
+                )
+            os.remove(f_in)
+
+
+def filter_reads_pangenome(
+    ctx, reads, index=None, n_threads=1, mode='local',
+    sensitivity='sensitive', ref_gap_open_penalty=5,
+    ref_gap_ext_penalty=3,
+):
+    """
+    Filters reads against a pangenome index, optionally generating the
+    index if one was not provided.
+
+    This function fetches and processes the human pangenome and GRCh38
+    reference genome, combines them into a single FASTA file, and then
+    generates a Bowtie 2 index if not already provided. It then filters
+    reads against this index according to the specified sensitivity.
+    """
+    get_ncbi_genomes = ctx.get_action("rescript", "get_ncbi_genomes")
+    build_index = ctx.get_action("quality_control", "bowtie2_build")
+    filter_reads = ctx.get_action("quality_control", "filter_reads")
+
+    with tempfile.TemporaryDirectory() as tmp:
+        if index is None:
+            print("Reference index was not provided - it will be generated.")
+            print("Fetching the human pangenome GFA file...")
+            _fetch_and_extract_pangenome(EBI_SERVER_URL, tmp)
+
+            print("Fetching the human GRCh38 reference genome...")
+            _fetch_and_extract_grch38(get_ncbi_genomes, tmp)
+
+            print("Converting pangenome GFA to FASTA...")
+            gfa_fp = glob.glob(os.path.join(tmp, "*.gfa"))[0]
+            pan_fasta_fp = os.path.join(tmp, "pangenome.fasta")
+            _extract_fasta_from_gfa(gfa_fp, pan_fasta_fp)
+
+            print("Generating an index of the combined reference...")
+            combined_fasta_fp = os.path.join(tmp, "combined.fasta")
+            _combine_fasta_files(
+                pan_fasta_fp, os.path.join(tmp, "grch38.fasta"),
+                fasta_out_fp=combined_fasta_fp
+            )
+            combined_reference = ctx.make_artifact(
+                "FeatureData[Sequence]", combined_fasta_fp
+            )
+            index, = build_index(
+                sequences=combined_reference, n_threads=n_threads
+            )
+
+        print("Filtering reads against the index...")
+        filter_params = {
+            k: v for k, v in locals().items() if k in
+            ['n_threads', 'mode', 'sensitivity', 'ref_gap_open_penalty',
+             'ref_gap_ext_penalty']
+        }
+        filtered_reads, = filter_reads(
+            demultiplexed_sequences=reads,
+            database=index,
+            exclude_seqs=True,
+            **filter_params
+        )
+
+    return filtered_reads, index
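For context on the `gfatools gfa2fa` step above: segment ('S') records in a GFA file carry the sequences that become FASTA records. A rough Python equivalent of that conversion, for illustration only — the patch correctly shells out to gfatools, which also handles paths, overlaps, and other record types:

```python
# Illustrative sketch: emit one FASTA record per GFA segment ('S') line.
# This only shows the shape of the data flowing into _combine_fasta_files;
# it is not a replacement for gfatools.
def gfa_segments_to_fasta(gfa_fp: str, fasta_fp: str) -> None:
    with open(gfa_fp) as gfa, open(fasta_fp, "w") as fasta:
        for line in gfa:
            if line.startswith("S\t"):
                # S-line layout: S <name> <sequence> [optional tags]
                _, name, seq = line.rstrip("\n").split("\t")[:3]
                fasta.write(f">{name}\n{seq}\n")
```
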
+ """ + get_ncbi_genomes = ctx.get_action("rescript", "get_ncbi_genomes") + build_index = ctx.get_action("quality_control", "bowtie2_build") + filter_reads = ctx.get_action("quality_control", "filter_reads") + + with tempfile.TemporaryDirectory() as tmp: + if index is None: + print("Reference index was not provided - it will be generated.") + print("Fetching the human pangenome GFA file...") + _fetch_and_extract_pangenome(EBI_SERVER_URL, tmp) + + print("Fetching the human GRCh38 reference genome...") + _fetch_and_extract_grch38(get_ncbi_genomes, tmp) + + print("Converting pangenome GFA to FASTA...") + gfa_fp = glob.glob(os.path.join(tmp, "*.gfa"))[0] + pan_fasta_fp = os.path.join(tmp, "pangenome.fasta") + _extract_fasta_from_gfa(gfa_fp, pan_fasta_fp) + + print("Generating an index of the combined reference...") + combined_fasta_fp = os.path.join(tmp, "combined.fasta") + _combine_fasta_files( + pan_fasta_fp, os.path.join(tmp, "grch38.fasta"), + fasta_out_fp=combined_fasta_fp + ) + combined_reference = ctx.make_artifact( + "FeatureData[Sequence]", combined_fasta_fp + ) + index, = build_index( + sequences=combined_reference, n_threads=n_threads + ) + + print("Filtering reads against the index...") + filter_params = { + k: v for k, v in locals().items() if k in + ['n_threads', 'mode', 'ref_gap_open_penalty', + 'ref_gap_ext_penalty'] + } + filtered_reads, = filter_reads( + demultiplexed_sequences=reads, + database=index, + exclude_seqs=True, + **filter_params + ) + + return filtered_reads, index diff --git a/q2_moshpit/filtering/tests/data/pangenome/combined.fasta b/q2_moshpit/filtering/tests/data/pangenome/combined.fasta new file mode 100644 index 00000000..dae1405a --- /dev/null +++ b/q2_moshpit/filtering/tests/data/pangenome/combined.fasta @@ -0,0 +1,4 @@ +>ref2 +GACTAG +>ref1 +ATGCGAT diff --git a/q2_moshpit/filtering/tests/data/pangenome/grch38.fasta b/q2_moshpit/filtering/tests/data/pangenome/grch38.fasta new file mode 100644 index 00000000..1cc986b5 --- /dev/null +++ b/q2_moshpit/filtering/tests/data/pangenome/grch38.fasta @@ -0,0 +1,2 @@ +>ref1 +ATGCGAT diff --git a/q2_moshpit/filtering/tests/data/pangenome/pangenome.fasta b/q2_moshpit/filtering/tests/data/pangenome/pangenome.fasta new file mode 100644 index 00000000..54429c0a --- /dev/null +++ b/q2_moshpit/filtering/tests/data/pangenome/pangenome.fasta @@ -0,0 +1,2 @@ +>ref2 +GaCtAG diff --git a/q2_moshpit/filtering/tests/test_filter.py b/q2_moshpit/filtering/tests/test_filter.py index abb1d689..6db9a061 100644 --- a/q2_moshpit/filtering/tests/test_filter.py +++ b/q2_moshpit/filtering/tests/test_filter.py @@ -5,10 +5,21 @@ # # The full license is in the file LICENSE, distributed with this software. 
diff --git a/q2_moshpit/filtering/tests/test_filter.py b/q2_moshpit/filtering/tests/test_filter.py
index abb1d689..6db9a061 100644
--- a/q2_moshpit/filtering/tests/test_filter.py
+++ b/q2_moshpit/filtering/tests/test_filter.py
@@ -5,10 +5,21 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
+import filecmp
+import os
+import shutil
+import tempfile
 import unittest
+from unittest.mock import Mock, patch, ANY, call, MagicMock

 import pandas as pd
 import qiime2
+
+from q2_moshpit.filtering.filter_pangenome import (
+    _fetch_and_extract_grch38, _extract_fasta_from_gfa,
+    _fetch_and_extract_pangenome, filter_reads_pangenome,
+    _combine_fasta_files, EBI_SERVER_URL
+)
 from qiime2.plugin.testing import TestPluginBase

 from q2_moshpit.busco.types import BUSCOResultsFormat
@@ -165,6 +176,214 @@ def test_filter_mags_samples(self):
         exp_feature_count = 2
         self.assertEqual(obs_feature_count, exp_feature_count)

+    @patch('shutil.move')
+    def test_fetch_and_extract_grch38(self, p1):
+        fake_results = Mock()
+        fake_callable = Mock(return_value=fake_results)
+
+        _fetch_and_extract_grch38(fake_callable, "/some/where")
+
+        fake_callable.assert_called_once_with(
+            taxon='Homo sapiens',
+            only_reference=True,
+            assembly_levels=['chromosome'],
+            assembly_source='refseq',
+            only_genomic=False
+        )
+        fake_results.genome_assemblies.export_data.assert_called_once_with(
+            "/some/where"
+        )
+        p1.assert_called_once_with(
+            "/some/where/dna-sequences.fasta", "/some/where/grch38.fasta"
+        )
+
+    @patch("subprocess.run")
+    @patch("os.remove")
+    def test_extract_from_gfa(self, p1, p2):
+        fasta_fp = os.path.join(self.temp_dir.name, "some_fasta.fa")
+        _extract_fasta_from_gfa("/some/gfa", fasta_fp)
+
+        p2.assert_called_once_with(
+            ["gfatools", "gfa2fa", "/some/gfa"], stdout=ANY, check=True
+        )
+        p1.assert_called_once_with("/some/gfa")
+
+    @patch("subprocess.run", side_effect=OSError)
+    def test_extract_from_gfa_error(self, p1):
+        fasta_fp = os.path.join(self.temp_dir.name, "some_fasta.fa")
+        with self.assertRaisesRegex(
+            Exception, "Failed to extract"
+        ):
+            _extract_fasta_from_gfa("/some/gfa", fasta_fp)
+
+    @patch("q2_moshpit.filtering.filter_pangenome.run_command")
+    def test_fetch_and_extract_pangenome(self, p1):
+        uri = "http://hello.org/file123.gz"
+        _fetch_and_extract_pangenome(uri, "/some/where")
+
+        p1.assert_has_calls([
+            call(["wget", uri, "-q", "-O", "/some/where/file123.gz"]),
+            call(["gunzip", "/some/where/file123.gz"]),
+        ])
+
+    @patch(
+        "q2_moshpit.filtering.filter_pangenome.run_command",
+        side_effect=OSError
+    )
+    def test_fetch_and_extract_pangenome_error(self, p1):
+        with self.assertRaisesRegex(Exception, "Unable to connect"):
+            _fetch_and_extract_pangenome("http://hello.org", "/some/where")
+
+    def test_combine_fasta_files_single(self):
+        fname1 = "grch38"
+        file1 = os.path.join(self.temp_dir.name, f"{fname1}.fasta")
+        shutil.copy(
+            self.get_data_path(f"pangenome/{fname1}.fasta"),
+            file1
+        )
+        obs = os.path.join(self.temp_dir.name, "out.fasta")
+
+        _combine_fasta_files(file1, fasta_out_fp=obs)
+
+        self.assertTrue(
+            filecmp.cmp(
+                self.get_data_path(f"pangenome/{fname1}.fasta"), obs
+            ), "Files are not identical"
+        )
+
+    def test_combine_fasta_files_multi(self):
+        fname1, fname2 = "pangenome", "grch38"
+        file1 = os.path.join(self.temp_dir.name, f"{fname1}.fasta")
+        file2 = os.path.join(self.temp_dir.name, f"{fname2}.fasta")
+        shutil.copy(
+            self.get_data_path(f"pangenome/{fname1}.fasta"),
+            file1
+        )
+        shutil.copy(
+            self.get_data_path(f"pangenome/{fname2}.fasta"),
+            file2
+        )
+        obs = os.path.join(self.temp_dir.name, "out.fasta")
+
+        _combine_fasta_files(file1, file2, fasta_out_fp=obs)
+
+        self.assertTrue(
+            filecmp.cmp(
+                self.get_data_path("pangenome/combined.fasta"), obs
+            ), "Files are not identical"
+        )
+
@patch("subprocess.run", side_effect=OSError) + def test_combine_fasta_files_error(self, p1): + obs = os.path.join(self.temp_dir.name, "out.fasta") + + with self.assertRaisesRegex( + Exception, "Failed to add the /fake/file" + ): + _combine_fasta_files("/fake/file", fasta_out_fp=obs) + + @patch( + 'q2_moshpit.filtering.filter_pangenome._fetch_and_extract_pangenome' + ) + @patch('q2_moshpit.filtering.filter_pangenome._fetch_and_extract_grch38') + @patch('q2_moshpit.filtering.filter_pangenome._extract_fasta_from_gfa') + def test_filter_reads_pangenome( + self, mock_extract_fasta, mock_fetch_grch38, mock_fetch_pangenome + ): + # we don't use the temp_dir from the test class as its content + # would get deleted within the context that is being tested - + # we need to be able to read files from that directory after + # the function being tested exits + temp_dir = tempfile.mkdtemp() + + # we construct our own context so that we can control each "action" + # being retrieved from it + ctx = MagicMock() + ctx.get_action.return_value = MagicMock() + mock_build_index_result = MagicMock() + ctx.get_action( + "quality_control", "bowtie2_build" + ).return_value = (mock_build_index_result,) + + mock_filtered_reads_result = MagicMock() + ctx.get_action( + "quality_control", "filter_reads" + ).return_value = (mock_filtered_reads_result,) + ctx.make_artifact.return_value = MagicMock() + + reads = MagicMock() + + # prepare some files which will be used by _combined_fasta_files + open(os.path.join(temp_dir, "pangenome.gfa"), 'w').close() + shutil.copy( + self.get_data_path("pangenome/grch38.fasta"), + os.path.join(temp_dir, "grch38.fasta") + ) + shutil.copy( + self.get_data_path("pangenome/pangenome.fasta"), + os.path.join(temp_dir, "pangenome.fasta") + ) + + with patch('tempfile.TemporaryDirectory', + return_value=MagicMock(name='TemporaryDirectory', + __enter__=lambda x: temp_dir, + __exit__=lambda x, y, z, w: None)): + filtered_reads, generated_index = filter_reads_pangenome( + ctx=ctx, + reads=reads, + index=None, + n_threads=1 + ) + + # Assertions + ctx.get_action.assert_any_call("rescript", "get_ncbi_genomes") + ctx.get_action.assert_any_call("quality_control", "bowtie2_build") + ctx.get_action.assert_any_call("quality_control", "filter_reads") + + mock_fetch_pangenome.assert_called_once_with( + EBI_SERVER_URL, temp_dir + ) + mock_fetch_grch38.assert_called_once_with( + ctx.get_action("rescript", "get_ncbi_genomes"), temp_dir + ) + mock_extract_fasta.assert_called_once_with( + os.path.join(temp_dir, "pangenome.gfa"), + os.path.join(temp_dir, "pangenome.fasta") + ) + + self.assertTrue( + filecmp.cmp( + self.get_data_path('pangenome/combined.fasta'), + os.path.join(temp_dir, 'combined.fasta') + ), + "Files are not identical" + ) + + ctx.make_artifact.assert_called_once_with( + "FeatureData[Sequence]", + os.path.join(temp_dir, "combined.fasta") + ) + ctx.get_action( + 'quality_control', 'bowtie2_build' + ).assert_has_calls( + [call(sequences=ANY, n_threads=1)], any_order=True + ) + ctx.get_action( + 'quality_control', 'filter_reads' + ).assert_has_calls([call( + demultiplexed_sequences=reads, + database=generated_index, + exclude_seqs=True, + n_threads=1, + mode='local', + ref_gap_open_penalty=5, + ref_gap_ext_penalty=3 + )], any_order=True) + self.assertIsNotNone(generated_index) + + # clean up + shutil.rmtree(temp_dir) + if __name__ == "__main__": unittest.main() diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index 265898c1..60d1222c 100644 --- a/q2_moshpit/plugin_setup.py +++ 
diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py
index 265898c1..60d1222c 100644
--- a/q2_moshpit/plugin_setup.py
+++ b/q2_moshpit/plugin_setup.py
@@ -6,6 +6,10 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 import importlib
+
+from q2_quality_control.plugin_setup import (
+    filter_parameters, filter_parameter_descriptions
+)
 from qiime2.plugin import Metadata
 from q2_moshpit.eggnog.types import (
     EggnogHmmerIdmapDirectoryFmt, EggnogHmmerIdmapFileFmt, EggnogHmmerIdmap
@@ -14,6 +18,7 @@
     BUSCOResultsFormat, BUSCOResultsDirectoryFormat, BuscoDatabaseDirFmt,
     BUSCOResults, BuscoDB
 )
+from q2_types.bowtie2 import Bowtie2Index
 from q2_types.profile_hmms import ProfileHMM, MultipleProtein, PressedProtein
 from q2_types.distance_matrix import DistanceMatrix
 from q2_types.feature_data import (
@@ -34,8 +39,7 @@
 import q2_moshpit
 from q2_types.feature_data_mag import MAG
 from q2_types.genome_data import (
-    NOG, Orthologs,
-    GenomeData, Loci, Genes, Proteins
+    NOG, Orthologs, GenomeData, Loci, Genes, Proteins
 )
 from q2_types.kaiju import KaijuDB
 from q2_types.kraken2 import (
@@ -752,7 +756,7 @@
         'db_in_memory': Bool,
     },
     input_descriptions={
-        'sequences': 'Sequences to be searched for hits.',
+        'sequences': 'Sequences to be searched for ortholog hits.',
         'diamond_db': 'Diamond database.',
     },
     parameter_descriptions={
@@ -1582,6 +1586,51 @@
     ]
 )

+I_reads, O_reads = TypeMap({
+    SampleData[SequencesWithQuality]:
+        SampleData[SequencesWithQuality],
+    SampleData[PairedEndSequencesWithQuality]:
+        SampleData[PairedEndSequencesWithQuality],
+})
+
+plugin.pipelines.register_function(
+    function=q2_moshpit.filtering.filter_reads_pangenome,
+    inputs={
+        "reads": I_reads,
+        "index": Bowtie2Index
+    },
+    parameters={
+        k: v for (k, v) in filter_parameters.items() if k != "exclude_seqs"
+    },
+    outputs=[
+        ("filtered_reads", O_reads),
+        ("reference_index", Bowtie2Index)
+    ],
+    input_descriptions={
+        "reads": "Reads to be filtered against the human genome.",
+        "index": "Bowtie2 index of the reference human genome. If not "
+                 "provided, an index combining the GRCh38 reference human "
+                 "genome and the human pangenome will be generated."
+    },
+    parameter_descriptions={
+        k: v for (k, v) in filter_parameter_descriptions.items()
+        if k != "exclude_seqs"
+    },
+    output_descriptions={
+        "filtered_reads": "Original reads without the contaminating "
+                          "human reads.",
+        "reference_index": "Generated combined human reference index. If an "
+                           "index was provided as an input, it will be "
+                           "returned here instead."
+    },
+    name="Remove contaminating human reads.",
+    description="This method generates a Bowtie2 index of the combined human "
+                "GRCh38 reference genome and the draft human pangenome, and "
+                "uses that index to remove the contaminating human reads "
+                "from the reads provided as input.",
+    citations=[],
+)
+
 M_abundance_in, P_abundance_out = TypeMap({
     Str % Choices(['rpkm']): Properties('rpkm'),
     Str % Choices(['tpm']): Properties('tpm'),
diff --git a/setup.py b/setup.py
index b883c187..4ba8bd29 100644
--- a/setup.py
+++ b/setup.py
@@ -142,6 +142,7 @@
             'data/*',
             'data/mags/*',
             'data/mags/*/*',
+            'data/pangenome/*'
         ],
         'q2_moshpit.abundance.tests': [
             'data/*',
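With the pipeline registered, it can be invoked through the usual QIIME 2 interfaces. A hypothetical Artifact API invocation — this assumes the plugin is exposed as `moshpit` and that `reads.qza` is a `SampleData[PairedEndSequencesWithQuality]` artifact; omitting `index` triggers the download-and-build path, which requires network access:

```python
from qiime2 import Artifact
from qiime2.plugins import moshpit

reads = Artifact.load("reads.qza")

# First run: no index provided, so the combined GRCh38 + pangenome
# reference is fetched and indexed, then returned for reuse.
filtered_reads, reference_index = moshpit.actions.filter_reads_pangenome(
    reads=reads, n_threads=8
)
reference_index.save("human-reference-index.qza")
filtered_reads.save("filtered-reads.qza")
```

On subsequent runs, passing `index=reference_index` skips the costly fetch-and-build step, which is the rationale for returning the generated index as a second output.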