Skip to content

Commit

Permalink
ENH: allow passing MAGs to eggnog-diamond-search (#68)
Browse files Browse the repository at this point in the history
* ENH: allow passing MAGs to eggnog-diamond-search
* fixup! ENH: allow passing MAGs to eggnog-diamond-search
  • Loading branch information
misialq authored Sep 29, 2023
1 parent 6059588 commit 6a02427
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 41 deletions.
61 changes: 37 additions & 24 deletions q2_moshpit/eggnog/_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,56 +5,69 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import glob
import subprocess
import os
import tempfile
import re
from typing import Union

import pandas as pd

from q2_types_genomics.per_sample_data import ContigSequencesDirFmt
from q2_types_genomics.genome_data import SeedOrthologDirFmt, OrthologFileFmt
from q2_types_genomics.feature_data import OrthologAnnotationDirFmt
from q2_types_genomics.feature_data import (
OrthologAnnotationDirFmt, MAGSequencesDirFmt
)
from q2_types_genomics.reference_db import EggnogRefDirFmt
from q2_types.feature_data import DNAFASTAFormat
from q2_types_genomics.reference_db import DiamondDatabaseDirFmt
import qiime2.util


def eggnog_diamond_search(
        sequences: Union[ContigSequencesDirFmt, MAGSequencesDirFmt],
        diamond_db: DiamondDatabaseDirFmt,
        num_cpus: int = 1, db_in_memory: bool = False
) -> (SeedOrthologDirFmt, pd.DataFrame):
    """Run the Diamond search on every input FASTA and collect the hits.

    Parameters
    ----------
    sequences : ContigSequencesDirFmt or MAGSequencesDirFmt
        Sequences to search. For contigs, every file yielded by
        ``sequences.sequences.iter_views(DNAFASTAFormat)`` is searched and
        its ID is the relative path with the part after the last
        underscore removed. For MAGs, every file matching ``*.fa*``
        directly under ``sequences.path`` is searched and its ID is the
        file's base name without its last extension.
    diamond_db : DiamondDatabaseDirFmt
        Directory format holding the Diamond database; the database file
        is looked up as ``ref_db.dmnd`` inside it.
    num_cpus : int
        Forwarded unchanged to ``_diamond_search_runner``.
    db_in_memory : bool
        Forwarded unchanged to ``_diamond_search_runner``.

    Returns
    -------
    (SeedOrthologDirFmt, pd.DataFrame)
        The directory format populated with every ``*.seed_orthologs``
        file found in the working directory, and the table built from it
        by ``_eggnog_feature_table``.

    Raises
    ------
    TypeError
        If ``sequences`` is neither of the two supported formats.
    """
    diamond_db_fp = os.path.join(str(diamond_db), 'ref_db.dmnd')

    # Collect (sample_id, fasta_path) pairs first so that both input kinds
    # share a single runner call below.
    if isinstance(sequences, ContigSequencesDirFmt):
        search_inputs = [
            (str(relpath).rsplit(r'_', 1)[0], obj_path)
            for relpath, obj_path
            in sequences.sequences.iter_views(DNAFASTAFormat)
        ]
    elif isinstance(sequences, MAGSequencesDirFmt):
        # glob order is arbitrary; sort so the runs happen in a
        # reproducible order.
        search_inputs = [
            (os.path.splitext(os.path.basename(mag_fp))[0], mag_fp)
            for mag_fp in sorted(glob.glob(f'{sequences.path}/*.fa*'))
        ]
    else:
        # Previously an unsupported input fell through both branches and
        # silently produced an empty result.
        raise TypeError(
            'sequences must be a ContigSequencesDirFmt or a '
            f'MAGSequencesDirFmt, got {type(sequences).__name__}.'
        )

    result = SeedOrthologDirFmt()

    # The working directory is removed as soon as the hits are duplicated
    # into `result`; the original dropped its only reference to the
    # directory on return, so the duplicated files never depended on it.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for sample_id, fasta_fp in search_inputs:
            # NOTE(review): the runner presumably writes
            # `<sample_label>.seed_orthologs` into `output_loc` - confirm
            # against _diamond_search_runner.
            _diamond_search_runner(
                input_path=fasta_fp, diamond_db=diamond_db_fp,
                sample_label=sample_id, output_loc=tmp_dir,
                num_cpus=num_cpus, db_in_memory=db_in_memory
            )

        for ortholog_fp in glob.glob(f'{tmp_dir}/*.seed_orthologs'):
            qiime2.util.duplicate(
                ortholog_fp,
                os.path.join(result.path, os.path.basename(ortholog_fp))
            )

    ft = _eggnog_feature_table(result)

    return result, ft


def _eggnog_feature_table(seed_orthologs: SeedOrthologDirFmt) -> pd.DataFrame:

per_sample_counts = []

for sample_path, obj in seed_orthologs.seed_orthologs.iter_views(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>shouldnt-hit-0
ATAAATTAGTTACACTCTCCGTGACTCGAGCTAACCTGAACTCGTAAGAGGGTCCCTTAGCTAGAGACTTGTCTTGACCCAAACTAGTAGTAACTGCAAAACGGAATCTTAACAAAGGTTGCTACTAATGGCACGTCGTCACTTTTCTGAATTCGCATATGGATCCACGAGGGGAAATTGGCTTTGGAGAGATACACATCTGCCGACCAGACGCGGAATCTCAGTGAGTGTCATTCATGGCCCCTACCCT
>shouldnt-hit-1
ATGCGTTCGTCACGAGGTTGCAACGGGCCGCCTTGCTTCTTAGCTCGAGAGATAGTTACGGGTTTTAGTAGTAGGAGCGTATTCCATACCCACAATTCGGAACTGCCCATGAGCCGCCTAGTAAGGATAACCTTATGATAGCTATATGCTCTTCCTACTATCTAGCGGTGCTCAATTTGCCAATTTCCGGGTCCGACTACGAGGCCGGATCGCTGAGGAGTGACAACCCCTGTGCTATACATTACGGTCA
>should-hit-seq8a
TGCAAATGGAGCATGTGCACCTTTTGGAACTATCGCTTTTTTAGCCTGATTCTGTATTTTCGCACCACCAACGCGCTGACCGCGTGGACCTTTGTGTTTTGCTGGCCGAACCTGATTGGCCGCAGCATGAAACATGATGGCCTGTGCCATCATAGCGCGACCTATGTGTTTCATGCGATGATGGCGGAAATGGCGAAAGTGATGGATTTTTGCAGCGCGTGGGTGGAAGATGTGATGATGCCGATGCTGGGCTTTTATCATAACCTGTTTAACCCGCGCACCGGCAACGAAAACATTTGGAACGATAACTGCGAAGTGAACTGGACCGTGGTGATGAACGGCGGCATGATGTTTTTTGTGCTGTGGGATAAACTGATTATGGTGAGCGCGGAATGGAACAACTGGGCGCGCAAAATTGTGAAAGTGTATCGCGATAAAGATAACAAAATTGTGAGCCGCTGGTTTAGCTATCGCGATGGCGTGAGCTTTAACTTTAAAGGCTGCCTGCCGTGCTTTAAAAGCGGCATTATTCATCATTGGAACCATGATTTTGCGGCGTATAAAAACTGCGGCATGCCGGAAACCCGCCCGGATCTGTATTATGGCATGAGCTATGCGCTGTTTTATAGCCTGAAAGAAACCTTTGATGGCTTTCATATTGTGAACCCGGGCGAACTGAACGTGTGGCGCATTAAAAAACGCAAAGAAATGAACATGACCCTGGAACTGCATCATATGCATTGGTATGCGTGCAAATTTCTGGATCCGATTGGCAACGGCAAAATTGCGTGCAAATGCATTGATTATGTGTATCGCGTGCATGAAAGCTGCTGCGTGCGCCATCTGAGCCTGTGGGATTATTATTGCGAAGCGGATAACGAAGAAATGCTGGCGGCGACCGCGCCGAAAAACAAACGCCTGAGCGCGGCGCTGTATGATCGCTGCGGCTTTGATATGGATAAAGGCAGCGAACATGAATGCGAATTTAAACGCAAAGATATTGTGAAATATTTTATGATGATTGAAATGTATACCGGCACCCCGGTGCGCCCGTTTCGCCGCAAATGGCGCCTGTGCCATTGCCATAAAGAAAGCCGCTGGTGGTTTGTGTGCAGCCCGGGCCCGTTTTTTGGCTAT
>should-hit-seq8b
ACCGGCAACGAAAACATTTGGAACGATAACTGCGAAGTGAACTGGACCGTGGTGATGAACGGCGGCATGATGTTTTTTGTGCTGTGGGATAAACTGATTATGGTGAGCGCGGAATGGAACAACTGGGCGCGCAAAATTGTGAAAGTGTATCGCGATAAAGATAACAAAATTGTGAGCCGCTGGTTTAGCTATCGCGATGGCGTGAGCTTTAACTTTAAAGGCTGCCTGCCGTGCTTTAAAAGCGGCATTATTCATCATTGGAACCATGATTTTGCGGCGTATAAAAACTGCGGCATGCCGGAAACCCGCCCGGATCTGTATTATGGCATGAGCTATGCGCTGTTTTATAGCCTGAAAGAAACCTTTGATGGCTTTCATATTGTGAACCCGGGCGAACTGAACGTGTGGCGCATTAAAAAACGCAAAGAAATGAACATGACCCTGGAACTGCATCATATGCATTGGTATGCGTGCAAATTTCTGGATCCGATTGGCAACGGCAAAATTGCGTGCAAATGCATTGATTATGTGTATCGCGTGCATGAAAGCTGCTGCGTGCGCCATCTGAGCCTGTGGGATTATTATTGCGAAGCGGATAACGAAGAAATGCTGGCGGCGACCGCGCCGAAAAACAAACGCCTGAGCGCGGCGCTGTATGATCGCTGCGGCTTTGATATGGATAAAGGCAGCGAACATGAATGCGAATTTAAACGCAAAGATATTGTGAAATATTTTATGATGATTGAAATGTATACCGGCACCCCGGTGCGCCCGTTTCGCCGCAAATGGCGCCTGTGCCATTGCCATAAAGAAAGCCGCTGGTGGTTTGTGTGCAGCCCGGGCCCGTTTTTTGGCTAT
>should-hit-seq8c
TGCAAATGGAGCATGTGCACCTTTTGGAACTATCGCTTTTTTAGCCTGATTCTGTATTTTCGCACCACCAACGCGCTGACCGCGTGGACCTTTGTGTTTTGCTGGCCGAACCTGATTGGCCGCAGGAAACATGATGGCCTGTGCCATCATAGCGCGACCTATGTGTTTCATGCGATGATGGCGGAAATGGCGAAAGTGATGGATTTTTGCAGCGCGTGGGTGGAAGATGTGATGATGCCGATGCTGGGCTTTTATCATAACCTGTTTAACCCGCGCACCGGCAACGAAAACATTTGGAACGATAACTGCGAAGTGAACTGGACCGTGGTGATGAACGGCGGCATGATGTTTTTTGTGCTGTGGGATAAACTGATTATGGTGAGCGCGGAATGGAACAACTGGGCGCGCAAAATTGTGAAAGTGTATCGCGATAAAGATAACAAAATTGTGAGCCGCTGGTTTAGCTATCGCGATGGCGTGAGCTTTAACTTTAAAGGCTGCCTGCCGTGCTTTAAAAGCGGCATTATTCATCATTGGAACCATGATTTTGCGGCGTATAAAAACTGCGGCATGCCGGAAACCCGCCCGGATCTGTATTATGGCATGAGCTATGCGCTGTTTTATAGCCTGAAAGAAACCTTTGATGGCTTTCATATTGTGAACCCGGGCGAACTGAACGTGTGGCGCATTAAAAAACGCAAAGAAATGAACATGACCCTGGAACTGCATCATATGCATTGGTATGCGTGCAAATTTCTGGATCCGATTGGCAAC
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>should-hit-seq-0
AACGATACCGAAGCGGTGATTGCGCATAAAAACAGCCTGAACATGGCGATTCTGATTATGTTTGATGATGCGGTGGCGATTTTTGTGTATAGCAACCTGGTGTGCATTCGCTTTGATGAACATTATGGCTGGGGCAACGAAGCGAACCGCATTCCGCATATTCTGCCGGATATTCGCCGCTGCATGCCGTGGAGCCATAAAGGCGATCATAACCCGATGGGCAACATGAGCGCGCTGTGCAAATGGGCGACCTATCGCCCGATTAAAAGCTGGTGGAGCCCGTATAAAATTGCGCGCGTGTTTTTTTGCATTGTGCATTTTGTGAACACCCCGAAACCGAAATGGGGCATGGAATTTGATAAACGCGAAGGCATGGTGATTACCATGCGCATTTGGAAAAACTGCCTGGGCATGTGCCTGGAAAAAGCGAACATTTGCGAAGGCACCCGCAACTGGCGCATTAAAATGAGCATGTGGGCGGGCAGCTTTATTGCGCTGATGGATT
>should-hit-seq-2
CTGGAAAGCGCGATGGGCGGCCCGCATATGAGCATTACCCCGGAAGAAAACGCGTTTGGCGGCTTTAACTTTTGCACCGGCGTGGTGACCGAACATATTCCGATGGATATTGTGGCGATTTGCTGGGCGCTGTTTAGCTGGGAAAACACCAAATTTGGCACCGTGAAAGATAACTGGCTGTATCGCTGGACCATTTGGTGGTGGTTTACCCTGGATACCGGCGCGAGCGTGGATTGCAAATGGGGCTGCAACCGCCGCGAACGCGCGATTTGGGTGTGCTGGAACCGCTTTATTACCATTAGCTTTCATAAACCGCGCGATTGCAAAACCAGCGCGTATCATACCGGCAAACCGGAAATGTATCTGGATCTGATGTGGATTTTTGTGAGCGTGCATGTGTTTATTATGACCCATCTGGCGGGCGATCATTTTCTGGGCCGCGTGCTGCTGCATCATAACAACGATGAAGATTATGATCGCAACTTTCCGATGCTGGATTTTAACTGCCATTGCTGGATTGCGATTTGGCATCGCGTGTGGTATCCGAGCAAAGTGCATGGCAGCGTGGATGCGCTGTTTGAATGGATTCCGCGCAACGGCGATTTTACCCTGCGCCGCAACGCGGGCGATCCGCGCTATACCAGCGCGAGCATGCGCTTTTTTGCGATGTGCGCGATGGAAATTATGCTGGCGCTGATGGGCGAAAGCATGAAACATGCGCTGGAAAGCGCGATGGGCGGCCCGCATATGAGCATTACCCCGGAAGAAAACGCGTTTGGCGGCTTTAACTTTTGCACCGGCGTGGTGACCGAACATATTCCGATGGATATTGTGGCGATTTGCTGGGCGCTGTTTAGCTGGGAAAACACCAAATTTGGCACCGTGAAAGATAACTGGCTGTATCGCTGGACCATTTGGTGGTGGTTTACCCTGGATACCGGCGCGAGCGTGGATTGCAAATGGGGCTGCAACCGCCGCGAACGCGCGATTTGGGTGTGCTGGAACCGCTTTATTACCATTAGCTTTCATAAACCGCGCGATTGCAAAACCAGCGCGTATCATACCGGCAAACCGGAAATGTATCTGGATCTGATGTGGATTTTTGTGAGCGTGCATGTGTTTATTATGACCCATCTGGCGGGCGATCATTTTCTGGGCCGCGTGCTGCTGCATCATAACAACGATGAAGATTATGATCGCAACTTTCCGATGCTGGATTTTAACTGCCATTGCTGGATTGCGATTTGGCATCGCGTGTGGTATCCGAGCAAAGTGCATGGCAGCGTGGATGCGCTGTTTGAATGGATTCCGCGCAACGGCGATTTTACCCTGCGCCGCAACGCGGGCGATCCGCGCTATACCAGCGCGAGCATGCGCTTTTTTGCGATGTGCGCGATGGAAATTATGCTGGCGCTGATGGGCGAAAGCATGAAACATGCGCATGGC
>shouldnt-hit
GCATTGAAGCTTTCTGACTGTTAAATAGTGTAGGCCCCAGCTGTTGATTTTTTAGACTAGAGGTGGGGCACTGTCCCGACACTTCTGGGTGTCCGCCACTGAGATGAACCCCACCGGGTCAAAGGATGTCAACGAAGTTCATTCAAGCTCACACGTCCAAGACCAGTGGTCAGGCTCTCTGTCATGCACCGTCCGCTTTGCAGCCGCGTCTCAGCGCCTCCCTACGCTCGAGATTGTCTGGCGCTCGGGTCATGGC
42 changes: 33 additions & 9 deletions q2_moshpit/eggnog/tests/test_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

import qiime2
from qiime2.plugin.testing import TestPluginBase

from q2_types_genomics.feature_data import MAGSequencesDirFmt
from .._method import eggnog_diamond_search, eggnog_annotate
from q2_types_genomics.reference_db import (
DiamondDatabaseDirFmt, EggnogRefDirFmt)
Expand All @@ -21,26 +23,48 @@
class TestDiamond(TestPluginBase):
    """Tests for ``eggnog_diamond_search`` on contig and MAG inputs."""

    package = 'q2_moshpit.eggnog.tests'

    def setUp(self):
        """Load the Diamond database fixture shared by every test."""
        super().setUp()
        self.diamond_db = qiime2.Artifact.import_data(
            'ReferenceDB[Diamond]',
            self.get_data_path('random-db-1')
        ).view(DiamondDatabaseDirFmt)

    def test_good_small_search_contigs(self):
        """Contig input yields the expected per-sample hit table."""
        contigs = qiime2.Artifact.import_data(
            'SampleData[Contigs]',
            self.get_data_path('contig-sequences-1')
        ).view(ContigSequencesDirFmt)

        _, obs = eggnog_diamond_search(
            sequences=contigs,
            diamond_db=self.diamond_db
        )
        exp = pd.DataFrame(
            {'0': [1.0, 0.0], '2': [1.0, 0.0], '8': [0.0, 3.0]},
            index=['s1', 's2']
        )
        exp.columns.name = 'sseqid'

        pdt.assert_frame_equal(obs, exp)

    def test_good_small_search_mags(self):
        """MAG input yields the expected hit table, one row per MAG file."""
        mags = qiime2.Artifact.import_data(
            'FeatureData[MAG]',
            self.get_data_path('mag-sequences')
        ).view(MAGSequencesDirFmt)

        _, obs = eggnog_diamond_search(
            sequences=mags,
            diamond_db=self.diamond_db
        )
        exp = pd.DataFrame(
            {'8': [3.0, 0.0], '0': [0.0, 1.0], '2': [0.0, 1.0]},
            index=['194b1aca-9373-4298-ba5c-05b382d1f553',
                   'e1ba1d20-b466-4fef-ae19-6bf9c5c63d6f']
        )
        exp.columns.name = 'sseqid'

        pdt.assert_frame_equal(obs, exp)


class TestAnnotate(TestPluginBase):
package = 'q2_moshpit.eggnog.tests'
Expand Down
18 changes: 10 additions & 8 deletions q2_moshpit/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,16 +376,17 @@

plugin.methods.register_function(
function=q2_moshpit.eggnog.eggnog_diamond_search,
inputs={'input_sequences': SampleData[Contigs],
'diamond_db': ReferenceDB[Diamond],
},
inputs={
'sequences': SampleData[Contigs] | FeatureData[MAG],
'diamond_db': ReferenceDB[Diamond],
},
parameters={
'num_cpus': Int,
'db_in_memory': Bool,
},
input_descriptions={
'input_sequences': 'Sequence data of the contigs we want to '
'search for hits using the Diamond Database',
'sequences': 'Sequence data of the contigs we want to '
'search for hits using the Diamond Database',
'diamond_db': 'The filepath to an artifact containing the'
'Diamond database',
},
Expand All @@ -397,9 +398,10 @@
'option should only be used on clusters or other '
'machines with enough memory.',
},
outputs=[('eggnog_hits', SampleData[BLAST6]),
('table', FeatureTable[Frequency])
],
outputs=[
('eggnog_hits', SampleData[BLAST6]),
('table', FeatureTable[Frequency])
],
name='Run eggNOG search using diamond aligner',
description="This method performs the steps by which we find our "
"possible target sequences to annotate using the diamond "
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
'q2_moshpit.eggnog': [
'tests/data/*',
'tests/data/contig-sequences-1/*',
'tests/data/mag-sequences/*',
'tests/data/random-db-1/*',
'tests/data/good_hits/*',
'tests/data/bad_hits/*',
Expand Down

0 comments on commit 6a02427

Please sign in to comment.