From 6a024273c454a1839adf80ebbcb06332fd566864 Mon Sep 17 00:00:00 2001 From: Michal Ziemski Date: Fri, 29 Sep 2023 17:52:19 +0200 Subject: [PATCH] ENH: allow passing MAGs to eggnog-diamond-search (#68) * ENH: allow passing MAGs to eggnog-diamond-search * fixup! ENH: allow passing MAGs to eggnog-diamond-search --- q2_moshpit/eggnog/_method.py | 61 +++++++++++-------- .../194b1aca-9373-4298-ba5c-05b382d1f553.fa | 10 +++ .../e1ba1d20-b466-4fef-ae19-6bf9c5c63d6f.fa | 6 ++ q2_moshpit/eggnog/tests/test_method.py | 42 ++++++++++--- q2_moshpit/plugin_setup.py | 18 +++--- setup.py | 1 + 6 files changed, 97 insertions(+), 41 deletions(-) create mode 100644 q2_moshpit/eggnog/tests/data/mag-sequences/194b1aca-9373-4298-ba5c-05b382d1f553.fa create mode 100644 q2_moshpit/eggnog/tests/data/mag-sequences/e1ba1d20-b466-4fef-ae19-6bf9c5c63d6f.fa diff --git a/q2_moshpit/eggnog/_method.py b/q2_moshpit/eggnog/_method.py index 291614ac..984be9f3 100644 --- a/q2_moshpit/eggnog/_method.py +++ b/q2_moshpit/eggnog/_method.py @@ -5,56 +5,69 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- - +import glob import subprocess import os import tempfile -import re +from typing import Union + import pandas as pd from q2_types_genomics.per_sample_data import ContigSequencesDirFmt from q2_types_genomics.genome_data import SeedOrthologDirFmt, OrthologFileFmt -from q2_types_genomics.feature_data import OrthologAnnotationDirFmt +from q2_types_genomics.feature_data import ( + OrthologAnnotationDirFmt, MAGSequencesDirFmt +) from q2_types_genomics.reference_db import EggnogRefDirFmt from q2_types.feature_data import DNAFASTAFormat from q2_types_genomics.reference_db import DiamondDatabaseDirFmt import qiime2.util -def eggnog_diamond_search(input_sequences: ContigSequencesDirFmt, - diamond_db: DiamondDatabaseDirFmt, - num_cpus: int = 1, db_in_memory: bool = False - ) -> (SeedOrthologDirFmt, pd.DataFrame): +def eggnog_diamond_search( + sequences: Union[ContigSequencesDirFmt, MAGSequencesDirFmt], + diamond_db: DiamondDatabaseDirFmt, + num_cpus: int = 1, db_in_memory: bool = False +) -> (SeedOrthologDirFmt, pd.DataFrame): diamond_db_fp = os.path.join(str(diamond_db), 'ref_db.dmnd') temp = tempfile.TemporaryDirectory() # run analysis - for relpath, obj_path in input_sequences.sequences.iter_views( - DNAFASTAFormat): - sample_label = str(relpath).rsplit(r'_', 1)[0] - - _diamond_search_runner(input_path=obj_path, - diamond_db=diamond_db_fp, - sample_label=sample_label, - output_loc=temp.name, - num_cpus=num_cpus, - db_in_memory=db_in_memory) + if isinstance(sequences, ContigSequencesDirFmt): + for relpath, obj_path in sequences.sequences.iter_views( + DNAFASTAFormat): + sample_id = str(relpath).rsplit(r'_', 1)[0] + _diamond_search_runner( + input_path=obj_path, diamond_db=diamond_db_fp, + sample_label=sample_id, output_loc=temp.name, + num_cpus=num_cpus, db_in_memory=db_in_memory + ) + elif isinstance(sequences, MAGSequencesDirFmt): + for mag_fp in glob.glob(f'{sequences.path}/*.fa*'): + sample_id = os.path.splitext(os.path.basename(mag_fp))[0] + _diamond_search_runner( + input_path=mag_fp, diamond_db=diamond_db_fp, + sample_label=sample_id, output_loc=temp.name, + num_cpus=num_cpus, db_in_memory=db_in_memory + ) result = SeedOrthologDirFmt() - - for item in os.listdir(temp.name): - if re.match(r".*\.seed_orthologs", item): - qiime2.util.duplicate(os.path.join(temp.name, item), - os.path.join(result.path, item)) + ortholog_fps = [ + os.path.basename(x) for x + in glob.glob(f'{temp.name}/*.seed_orthologs') + ] + for item in ortholog_fps: + qiime2.util.duplicate( + os.path.join(temp.name, item), os.path.join(result.path, item) + ) ft = _eggnog_feature_table(result) - return (result, ft) + return result, ft def _eggnog_feature_table(seed_orthologs: SeedOrthologDirFmt) -> pd.DataFrame: - per_sample_counts = [] for sample_path, obj in seed_orthologs.seed_orthologs.iter_views( diff --git a/q2_moshpit/eggnog/tests/data/mag-sequences/194b1aca-9373-4298-ba5c-05b382d1f553.fa b/q2_moshpit/eggnog/tests/data/mag-sequences/194b1aca-9373-4298-ba5c-05b382d1f553.fa new file mode 100644 index 00000000..cccc50c6 --- /dev/null +++ b/q2_moshpit/eggnog/tests/data/mag-sequences/194b1aca-9373-4298-ba5c-05b382d1f553.fa @@ -0,0 +1,10 @@ +>shouldnt-hit-0 +ATAAATTAGTTACACTCTCCGTGACTCGAGCTAACCTGAACTCGTAAGAGGGTCCCTTAGCTAGAGACTTGTCTTGACCCAAACTAGTAGTAACTGCAAAACGGAATCTTAACAAAGGTTGCTACTAATGGCACGTCGTCACTTTTCTGAATTCGCATATGGATCCACGAGGGGAAATTGGCTTTGGAGAGATACACATCTGCCGACCAGACGCGGAATCTCAGTGAGTGTCATTCATGGCCCCTACCCT +>shouldnt-hit-1 +ATGCGTTCGTCACGAGGTTGCAACGGGCCGCCTTGCTTCTTAGCTCGAGAGATAGTTACGGGTTTTAGTAGTAGGAGCGTATTCCATACCCACAATTCGGAACTGCCCATGAGCCGCCTAGTAAGGATAACCTTATGATAGCTATATGCTCTTCCTACTATCTAGCGGTGCTCAATTTGCCAATTTCCGGGTCCGACTACGAGGCCGGATCGCTGAGGAGTGACAACCCCTGTGCTATACATTACGGTCA +>should-hit-seq8a +TGCAAATGGAGCATGTGCACCTTTTGGAACTATCGCTTTTTTAGCCTGATTCTGTATTTTCGCACCACCAACGCGCTGACCGCGTGGACCTTTGTGTTTTGCTGGCCGAACCTGATTGGCCGCAGCATGAAACATGATGGCCTGTGCCATCATAGCGCGACCTATGTGTTTCATGCGATGATGGCGGAAATGGCGAAAGTGATGGATTTTTGCAGCGCGTGGGTGGAAGATGTGATGATGCCGATGCTGGGCTTTTATCATAACCTGTTTAACCCGCGCACCGGCAACGAAAACATTTGGAACGATAACTGCGAAGTGAACTGGACCGTGGTGATGAACGGCGGCATGATGTTTTTTGTGCTGTGGGATAAACTGATTATGGTGAGCGCGGAATGGAACAACTGGGCGCGCAAAATTGTGAAAGTGTATCGCGATAAAGATAACAAAATTGTGAGCCGCTGGTTTAGCTATCGCGATGGCGTGAGCTTTAACTTTAAAGGCTGCCTGCCGTGCTTTAAAAGCGGCATTATTCATCATTGGAACCATGATTTTGCGGCGTATAAAAACTGCGGCATGCCGGAAACCCGCCCGGATCTGTATTATGGCATGAGCTATGCGCTGTTTTATAGCCTGAAAGAAACCTTTGATGGCTTTCATATTGTGAACCCGGGCGAACTGAACGTGTGGCGCATTAAAAAACGCAAAGAAATGAACATGACCCTGGAACTGCATCATATGCATTGGTATGCGTGCAAATTTCTGGATCCGATTGGCAACGGCAAAATTGCGTGCAAATGCATTGATTATGTGTATCGCGTGCATGAAAGCTGCTGCGTGCGCCATCTGAGCCTGTGGGATTATTATTGCGAAGCGGATAACGAAGAAATGCTGGCGGCGACCGCGCCGAAAAACAAACGCCTGAGCGCGGCGCTGTATGATCGCTGCGGCTTTGATATGGATAAAGGCAGCGAACATGAATGCGAATTTAAACGCAAAGATATTGTGAAATATTTTATGATGATTGAAATGTATACCGGCACCCCGGTGCGCCCGTTTCGCCGCAAATGGCGCCTGTGCCATTGCCATAAAGAAAGCCGCTGGTGGTTTGTGTGCAGCCCGGGCCCGTTTTTTGGCTAT +>should-hit-seq8b +ACCGGCAACGAAAACATTTGGAACGATAACTGCGAAGTGAACTGGACCGTGGTGATGAACGGCGGCATGATGTTTTTTGTGCTGTGGGATAAACTGATTATGGTGAGCGCGGAATGGAACAACTGGGCGCGCAAAATTGTGAAAGTGTATCGCGATAAAGATAACAAAATTGTGAGCCGCTGGTTTAGCTATCGCGATGGCGTGAGCTTTAACTTTAAAGGCTGCCTGCCGTGCTTTAAAAGCGGCATTATTCATCATTGGAACCATGATTTTGCGGCGTATAAAAACTGCGGCATGCCGGAAACCCGCCCGGATCTGTATTATGGCATGAGCTATGCGCTGTTTTATAGCCTGAAAGAAACCTTTGATGGCTTTCATATTGTGAACCCGGGCGAACTGAACGTGTGGCGCATTAAAAAACGCAAAGAAATGAACATGACCCTGGAACTGCATCATATGCATTGGTATGCGTGCAAATTTCTGGATCCGATTGGCAACGGCAAAATTGCGTGCAAATGCATTGATTATGTGTATCGCGTGCATGAAAGCTGCTGCGTGCGCCATCTGAGCCTGTGGGATTATTATTGCGAAGCGGATAACGAAGAAATGCTGGCGGCGACCGCGCCGAAAAACAAACGCCTGAGCGCGGCGCTGTATGATCGCTGCGGCTTTGATATGGATAAAGGCAGCGAACATGAATGCGAATTTAAACGCAAAGATATTGTGAAATATTTTATGATGATTGAAATGTATACCGGCACCCCGGTGCGCCCGTTTCGCCGCAAATGGCGCCTGTGCCATTGCCATAAAGAAAGCCGCTGGTGGTTTGTGTGCAGCCCGGGCCCGTTTTTTGGCTAT +>should-hit-seq8c +TGCAAATGGAGCATGTGCACCTTTTGGAACTATCGCTTTTTTAGCCTGATTCTGTATTTTCGCACCACCAACGCGCTGACCGCGTGGACCTTTGTGTTTTGCTGGCCGAACCTGATTGGCCGCAGGAAACATGATGGCCTGTGCCATCATAGCGCGACCTATGTGTTTCATGCGATGATGGCGGAAATGGCGAAAGTGATGGATTTTTGCAGCGCGTGGGTGGAAGATGTGATGATGCCGATGCTGGGCTTTTATCATAACCTGTTTAACCCGCGCACCGGCAACGAAAACATTTGGAACGATAACTGCGAAGTGAACTGGACCGTGGTGATGAACGGCGGCATGATGTTTTTTGTGCTGTGGGATAAACTGATTATGGTGAGCGCGGAATGGAACAACTGGGCGCGCAAAATTGTGAAAGTGTATCGCGATAAAGATAACAAAATTGTGAGCCGCTGGTTTAGCTATCGCGATGGCGTGAGCTTTAACTTTAAAGGCTGCCTGCCGTGCTTTAAAAGCGGCATTATTCATCATTGGAACCATGATTTTGCGGCGTATAAAAACTGCGGCATGCCGGAAACCCGCCCGGATCTGTATTATGGCATGAGCTATGCGCTGTTTTATAGCCTGAAAGAAACCTTTGATGGCTTTCATATTGTGAACCCGGGCGAACTGAACGTGTGGCGCATTAAAAAACGCAAAGAAATGAACATGACCCTGGAACTGCATCATATGCATTGGTATGCGTGCAAATTTCTGGATCCGATTGGCAAC diff --git a/q2_moshpit/eggnog/tests/data/mag-sequences/e1ba1d20-b466-4fef-ae19-6bf9c5c63d6f.fa b/q2_moshpit/eggnog/tests/data/mag-sequences/e1ba1d20-b466-4fef-ae19-6bf9c5c63d6f.fa new file mode 100644 index 00000000..6a696e38 --- /dev/null +++ b/q2_moshpit/eggnog/tests/data/mag-sequences/e1ba1d20-b466-4fef-ae19-6bf9c5c63d6f.fa @@ -0,0 +1,6 @@ +>should-hit-seq-0 +AACGATACCGAAGCGGTGATTGCGCATAAAAACAGCCTGAACATGGCGATTCTGATTATGTTTGATGATGCGGTGGCGATTTTTGTGTATAGCAACCTGGTGTGCATTCGCTTTGATGAACATTATGGCTGGGGCAACGAAGCGAACCGCATTCCGCATATTCTGCCGGATATTCGCCGCTGCATGCCGTGGAGCCATAAAGGCGATCATAACCCGATGGGCAACATGAGCGCGCTGTGCAAATGGGCGACCTATCGCCCGATTAAAAGCTGGTGGAGCCCGTATAAAATTGCGCGCGTGTTTTTTTGCATTGTGCATTTTGTGAACACCCCGAAACCGAAATGGGGCATGGAATTTGATAAACGCGAAGGCATGGTGATTACCATGCGCATTTGGAAAAACTGCCTGGGCATGTGCCTGGAAAAAGCGAACATTTGCGAAGGCACCCGCAACTGGCGCATTAAAATGAGCATGTGGGCGGGCAGCTTTATTGCGCTGATGGATT +>should-hit-seq-2 +CTGGAAAGCGCGATGGGCGGCCCGCATATGAGCATTACCCCGGAAGAAAACGCGTTTGGCGGCTTTAACTTTTGCACCGGCGTGGTGACCGAACATATTCCGATGGATATTGTGGCGATTTGCTGGGCGCTGTTTAGCTGGGAAAACACCAAATTTGGCACCGTGAAAGATAACTGGCTGTATCGCTGGACCATTTGGTGGTGGTTTACCCTGGATACCGGCGCGAGCGTGGATTGCAAATGGGGCTGCAACCGCCGCGAACGCGCGATTTGGGTGTGCTGGAACCGCTTTATTACCATTAGCTTTCATAAACCGCGCGATTGCAAAACCAGCGCGTATCATACCGGCAAACCGGAAATGTATCTGGATCTGATGTGGATTTTTGTGAGCGTGCATGTGTTTATTATGACCCATCTGGCGGGCGATCATTTTCTGGGCCGCGTGCTGCTGCATCATAACAACGATGAAGATTATGATCGCAACTTTCCGATGCTGGATTTTAACTGCCATTGCTGGATTGCGATTTGGCATCGCGTGTGGTATCCGAGCAAAGTGCATGGCAGCGTGGATGCGCTGTTTGAATGGATTCCGCGCAACGGCGATTTTACCCTGCGCCGCAACGCGGGCGATCCGCGCTATACCAGCGCGAGCATGCGCTTTTTTGCGATGTGCGCGATGGAAATTATGCTGGCGCTGATGGGCGAAAGCATGAAACATGCGCTGGAAAGCGCGATGGGCGGCCCGCATATGAGCATTACCCCGGAAGAAAACGCGTTTGGCGGCTTTAACTTTTGCACCGGCGTGGTGACCGAACATATTCCGATGGATATTGTGGCGATTTGCTGGGCGCTGTTTAGCTGGGAAAACACCAAATTTGGCACCGTGAAAGATAACTGGCTGTATCGCTGGACCATTTGGTGGTGGTTTACCCTGGATACCGGCGCGAGCGTGGATTGCAAATGGGGCTGCAACCGCCGCGAACGCGCGATTTGGGTGTGCTGGAACCGCTTTATTACCATTAGCTTTCATAAACCGCGCGATTGCAAAACCAGCGCGTATCATACCGGCAAACCGGAAATGTATCTGGATCTGATGTGGATTTTTGTGAGCGTGCATGTGTTTATTATGACCCATCTGGCGGGCGATCATTTTCTGGGCCGCGTGCTGCTGCATCATAACAACGATGAAGATTATGATCGCAACTTTCCGATGCTGGATTTTAACTGCCATTGCTGGATTGCGATTTGGCATCGCGTGTGGTATCCGAGCAAAGTGCATGGCAGCGTGGATGCGCTGTTTGAATGGATTCCGCGCAACGGCGATTTTACCCTGCGCCGCAACGCGGGCGATCCGCGCTATACCAGCGCGAGCATGCGCTTTTTTGCGATGTGCGCGATGGAAATTATGCTGGCGCTGATGGGCGAAAGCATGAAACATGCGCATGGC +>shouldnt-hit +GCATTGAAGCTTTCTGACTGTTAAATAGTGTAGGCCCCAGCTGTTGATTTTTTAGACTAGAGGTGGGGCACTGTCCCGACACTTCTGGGTGTCCGCCACTGAGATGAACCCCACCGGGTCAAAGGATGTCAACGAAGTTCATTCAAGCTCACACGTCCAAGACCAGTGGTCAGGCTCTCTGTCATGCACCGTCCGCTTTGCAGCCGCGTCTCAGCGCCTCCCTACGCTCGAGATTGTCTGGCGCTCGGGTCATGGC diff --git a/q2_moshpit/eggnog/tests/test_method.py b/q2_moshpit/eggnog/tests/test_method.py index ed794899..d93b83d0 100644 --- a/q2_moshpit/eggnog/tests/test_method.py +++ b/q2_moshpit/eggnog/tests/test_method.py @@ -11,6 +11,8 @@ import qiime2 from qiime2.plugin.testing import TestPluginBase + +from q2_types_genomics.feature_data import MAGSequencesDirFmt from .._method import eggnog_diamond_search, eggnog_annotate from q2_types_genomics.reference_db import ( DiamondDatabaseDirFmt, EggnogRefDirFmt) @@ -21,26 +23,48 @@ class TestDiamond(TestPluginBase): package = 'q2_moshpit.eggnog.tests' - def test_good_small_search(self): - input_sequences = qiime2.Artifact.import_data( - 'SampleData[Contigs]', - self.get_data_path('contig-sequences-1') - ).view(ContigSequencesDirFmt) - - diamond_db = qiime2.Artifact.import_data( + def setUp(self): + super().setUp() + self.diamond_db = qiime2.Artifact.import_data( 'ReferenceDB[Diamond]', self.get_data_path('random-db-1') ).view(DiamondDatabaseDirFmt) + def test_good_small_search_contigs(self): + contigs = qiime2.Artifact.import_data( + 'SampleData[Contigs]', + self.get_data_path('contig-sequences-1') + ).view(ContigSequencesDirFmt) + _, obs = eggnog_diamond_search( - input_sequences=input_sequences, - diamond_db=diamond_db) + sequences=contigs, + diamond_db=self.diamond_db + ) exp = pd.DataFrame({'0': [1.0, 0.0], '2': [1.0, 0.0], '8': [0.0, 3.0]}, index=['s1', 's2']) exp.columns.name = 'sseqid' pdt.assert_frame_equal(obs, exp) + def test_good_small_search_mags(self): + mags = qiime2.Artifact.import_data( + 'FeatureData[MAG]', + self.get_data_path('mag-sequences') + ).view(MAGSequencesDirFmt) + + _, obs = eggnog_diamond_search( + sequences=mags, + diamond_db=self.diamond_db + ) + exp = pd.DataFrame( + {'8': [3.0, 0.0], '0': [0.0, 1.0], '2': [0.0, 1.0]}, + index=['194b1aca-9373-4298-ba5c-05b382d1f553', + 'e1ba1d20-b466-4fef-ae19-6bf9c5c63d6f'] + ) + exp.columns.name = 'sseqid' + + pdt.assert_frame_equal(obs, exp) + class TestAnnotate(TestPluginBase): package = 'q2_moshpit.eggnog.tests' diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index acc2adad..7c8f5282 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -376,16 +376,17 @@ plugin.methods.register_function( function=q2_moshpit.eggnog.eggnog_diamond_search, - inputs={'input_sequences': SampleData[Contigs], - 'diamond_db': ReferenceDB[Diamond], - }, + inputs={ + 'sequences': SampleData[Contigs] | FeatureData[MAG], + 'diamond_db': ReferenceDB[Diamond], + }, parameters={ 'num_cpus': Int, 'db_in_memory': Bool, }, input_descriptions={ - 'input_sequences': 'Sequence data of the contigs we want to ' - 'search for hits using the Diamond Database', + 'sequences': 'Sequence data of the contigs we want to ' + 'search for hits using the Diamond Database', 'diamond_db': 'The filepath to an artifact containing the' 'Diamond database', }, @@ -397,9 +398,10 @@ 'option should only be used on clusters or other ' 'machines with enough memory.', }, - outputs=[('eggnog_hits', SampleData[BLAST6]), - ('table', FeatureTable[Frequency]) - ], + outputs=[ + ('eggnog_hits', SampleData[BLAST6]), + ('table', FeatureTable[Frequency]) + ], name='Run eggNOG search using diamond aligner', description="This method performs the steps by which we find our " "possible target sequences to annotate using the diamond " diff --git a/setup.py b/setup.py index ee4b9c9f..89100c4c 100644 --- a/setup.py +++ b/setup.py @@ -39,6 +39,7 @@ 'q2_moshpit.eggnog': [ 'tests/data/*', 'tests/data/contig-sequences-1/*', + 'tests/data/mag-sequences/*', 'tests/data/random-db-1/*', 'tests/data/good_hits/*', 'tests/data/bad_hits/*',