Skip to content

Commit

Permalink
ENH: allow SampleData[MAGs] as input to predict-genes-prodigal (b…
Browse files Browse the repository at this point in the history
…okulich-lab#154)

Co-authored-by: Michal Ziemski <[email protected]>
  • Loading branch information
Sann5 and misialq authored May 6, 2024
1 parent 897d7e0 commit 2b3b03d
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 28 deletions.
2 changes: 1 addition & 1 deletion q2_moshpit/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -1057,7 +1057,7 @@
plugin.methods.register_function(
function=q2_moshpit.prodigal.predict_genes_prodigal,
inputs={
'mags': FeatureData[MAG]
'mags': FeatureData[MAG] | SampleData[MAGs]
},
input_descriptions={
'mags': 'MAGs for which one wishes to predict genes.'
Expand Down
43 changes: 26 additions & 17 deletions q2_moshpit/prodigal/prodigal.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,17 @@
# ----------------------------------------------------------------------------
import os
import copy as cp
from typing import Union
from .._utils import run_command
from q2_types.feature_data_mag import MAGSequencesDirFmt
from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt
from q2_types.genome_data import (
LociDirectoryFormat, GenesDirectoryFormat, ProteinsDirectoryFormat,
)


def predict_genes_prodigal(
mags: MAGSequencesDirFmt,
mags: Union[MAGSequencesDirFmt, MultiMAGSequencesDirFmt],
translation_table_number: str = "11",
) -> (LociDirectoryFormat, GenesDirectoryFormat, ProteinsDirectoryFormat):

Expand All @@ -24,34 +26,41 @@ def predict_genes_prodigal(
genes = GenesDirectoryFormat()
proteins = ProteinsDirectoryFormat()

# Get paths to fasta files in input dir
fasta_files = [
file for file in os.listdir(mags.path)
if file.endswith(".fa") or file.endswith(".fasta")
]

# Define base command
base_cmd = [
"prodigal",
"-g", translation_table_number,
"-f", "gff"
]

# For every fasta file in mags.path call prodigal and write
# outputs to the corresponding directories.
for fasta_file in fasta_files:
# Get the filename from the file path
file_id = os.path.splitext(fasta_file)[0]
def _run_prodigal(path_to_input: str, mag_id: str, subdir: str = None):
# If a subdirectory is given, append a "/" so that the output paths
# built below are correct; otherwise use an empty string.
subdir = subdir + "/" if subdir else ""

# Adjust command and run
# Complete command and run
cmd = cp.deepcopy(base_cmd)
cmd.extend([
"-i", os.path.join(mags.path, fasta_file),
"-o", os.path.join(loci.path, f"{file_id}_loci.gff"),
"-a", os.path.join(proteins.path, f"{file_id}_proteins.fasta"),
"-d", os.path.join(genes.path, f"{file_id}_genes.fasta")
"-i", path_to_input,
"-o", os.path.join(loci.path, f"{subdir}{mag_id}.gff"),
"-a", os.path.join(proteins.path, f"{subdir}{mag_id}.fasta"),
"-d", os.path.join(genes.path, f"{subdir}{mag_id}.fasta")
])
run_command(cmd)

if isinstance(mags, MAGSequencesDirFmt):
for mag_id, mag_fp in mags.feature_dict().items():
_run_prodigal(mag_fp, mag_id)

elif isinstance(mags, MultiMAGSequencesDirFmt):
for sample_id, mags_dict in mags.sample_dict().items():
# Make sample_id folders in output locations
for output_object in [loci, genes, proteins]:
os.makedirs(os.path.join(output_object.path, sample_id))

# Run prodigal for each mag
for mag_id, mag_fp in mags_dict.items():
_run_prodigal(mag_fp, mag_id, sample_id)

# Return output directories
return loci, genes, proteins
Empty file.
52 changes: 42 additions & 10 deletions q2_moshpit/prodigal/tests/test_prodigal.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import os
from q2_moshpit.prodigal.prodigal import predict_genes_prodigal
from qiime2.plugin.testing import TestPluginBase
from q2_types.feature_data_mag import MAGSequencesDirFmt
from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt
from unittest.mock import patch, call
from q2_types.genome_data import (
LociDirectoryFormat, GenesDirectoryFormat, ProteinsDirectoryFormat,
Expand All @@ -20,7 +20,7 @@ class TestBUSCO(TestPluginBase):
package = "q2_moshpit.prodigal.tests"

@patch("subprocess.run")
def test_run_prodigal_1_mag(self, subp_run):
def test_run_prodigal_feature_data_1_mag(self, subp_run):
# Run prodigal with dummy data
p = self.get_data_path("dir_with_1_mag")
mags = MAGSequencesDirFmt(path=p, mode="r")
Expand All @@ -46,14 +46,14 @@ def test_run_prodigal_1_mag(self, subp_run):
"-g", "11",
"-f", "gff",
"-i", os.path.join(mags.path, f"{fasta_file}.fasta"),
"-o", os.path.join(loci.path, f"{fasta_file}_loci.gff"),
"-a", os.path.join(proteins.path, f"{fasta_file}_proteins.fasta"),
"-d", os.path.join(genes.path, f"{fasta_file}_genes.fasta")],
"-o", os.path.join(loci.path, f"{fasta_file}.gff"),
"-a", os.path.join(proteins.path, f"{fasta_file}.fasta"),
"-d", os.path.join(genes.path, f"{fasta_file}.fasta")],
check=True
)

@patch("subprocess.run")
def test_run_prodigal_3_mag(self, subp_run):
def test_run_prodigal_feature_data_3_mag(self, subp_run):
# Run prodigal with dummy data
p = self.get_data_path("dir_with_3_mag")
mags = MAGSequencesDirFmt(path=p, mode="r")
Expand All @@ -76,12 +76,44 @@ def test_run_prodigal_3_mag(self, subp_run):
"-g", "11",
"-f", "gff",
"-i", os.path.join(mags.path, f"{fasta_file}.fasta"),
"-o", os.path.join(loci.path, f"{fasta_file}_loci.gff"),
"-a", os.path.join(proteins.path, f"{fasta_file}_proteins.fasta"),
"-d", os.path.join(genes.path, f"{fasta_file}_genes.fasta")],
"-o", os.path.join(loci.path, f"{fasta_file}.gff"),
"-a", os.path.join(proteins.path, f"{fasta_file}.fasta"),
"-d", os.path.join(genes.path, f"{fasta_file}.fasta")],
check=True)
for fasta_file in fasta_files
]

# Assert that patch was called 3 times
subp_run.assert_has_calls(three_calls)
subp_run.assert_has_calls(three_calls, any_order=True)

@patch("subprocess.run")
def test_run_prodigal_sample_data(self, subp_run):
    """Run gene prediction on a per-sample MAG directory.

    The root of the test data directory is opened as a
    ``MultiMAGSequencesDirFmt``; every entry found directly inside it is
    treated as a sample and every file inside a sample as one MAG. The test
    checks the types of the three returned objects and that the patched
    ``subprocess.run`` received one matching prodigal command per MAG, with
    outputs written to a per-sample subdirectory.
    """
    data_root = self.get_data_path("")
    mags = MultiMAGSequencesDirFmt(path=data_root, mode="r")
    loci, genes, prot = predict_genes_prodigal(mags=mags)

    # Each returned object must be of the expected directory format
    expected_formats = (
        (loci, LociDirectoryFormat),
        (genes, GenesDirectoryFormat),
        (prot, ProteinsDirectoryFormat),
    )
    for result, directory_format in expected_formats:
        self.assertIsInstance(result, directory_format)

    def _expected_call(sample, file_id):
        # Command expected for a single MAG of a single sample
        return call(
            [
                "prodigal",
                "-g", "11",
                "-f", "gff",
                "-i", os.path.join(mags.path, sample, f"{file_id}.fasta"),
                "-o", os.path.join(loci.path, f"{sample}/{file_id}.gff"),
                "-a", os.path.join(prot.path, f"{sample}/{file_id}.fasta"),
                "-d", os.path.join(genes.path, f"{sample}/{file_id}.fasta"),
            ],
            check=True,
        )

    # Derive the expected commands from the layout of the test data dir
    expected_calls = [
        _expected_call(sample, os.path.splitext(fasta_file)[0])
        for sample in os.listdir(mags.path)
        for fasta_file in os.listdir(f"{mags.path}/{sample}")
    ]

    # Assert that the patch received every expected call, in any order
    subp_run.assert_has_calls(expected_calls, any_order=True)

0 comments on commit 2b3b03d

Please sign in to comment.