Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: allow SampleData[MAGs] as input to predict-genes-prodigal #154

Merged
merged 10 commits into from
May 6, 2024
2 changes: 1 addition & 1 deletion q2_moshpit/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -989,7 +989,7 @@
plugin.methods.register_function(
function=q2_moshpit.prodigal.predict_genes_prodigal,
inputs={
'mags': FeatureData[MAG]
'mags': FeatureData[MAG] | SampleData[MAGs]
},
input_descriptions={
'mags': 'MAGs for which one wishes to predict genes.'
Expand Down
68 changes: 46 additions & 22 deletions q2_moshpit/prodigal/prodigal.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,17 @@
# ----------------------------------------------------------------------------
import os
import copy as cp
from typing import Union
from .._utils import run_command
from q2_types.feature_data_mag import MAGSequencesDirFmt
from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt
from q2_types.genome_data import (
LociDirectoryFormat, GenesDirectoryFormat, ProteinsDirectoryFormat,
)


def predict_genes_prodigal(
mags: MAGSequencesDirFmt,
mags: Union[MAGSequencesDirFmt, MultiMAGSequencesDirFmt],
translation_table_number: str = "11",
) -> (LociDirectoryFormat, GenesDirectoryFormat, ProteinsDirectoryFormat):

Expand All @@ -24,34 +26,56 @@ def predict_genes_prodigal(
genes = GenesDirectoryFormat()
proteins = ProteinsDirectoryFormat()

# Get paths to fasta files in input dir
fasta_files = [
file for file in os.listdir(mags.path)
if file.endswith(".fa") or file.endswith(".fasta")
]

# Define base command
base_cmd = [
"prodigal",
"-g", translation_table_number,
"-f", "gff"
]

# For every fasta file in mags.path call prodigal and write
# outputs to the corresponding directories.
for fasta_file in fasta_files:
# Get the filename from the file path
file_id = os.path.splitext(fasta_file)[0]

# Adjust command and run
cmd = cp.deepcopy(base_cmd)
cmd.extend([
"-i", os.path.join(mags.path, fasta_file),
"-o", os.path.join(loci.path, f"{file_id}_loci.gff"),
"-a", os.path.join(proteins.path, f"{file_id}_proteins.fasta"),
"-d", os.path.join(genes.path, f"{file_id}_genes.fasta")
])
run_command(cmd)
def _process_fasta_files(fasta_files: list, prefix: str, input_path: str):
    """Run prodigal once for each entry of ``fasta_files``.

    Parameters
    ----------
    fasta_files : list
        File names (relative to ``input_path``) to feed to prodigal.
    prefix : str
        Text placed in front of every output file name.
    input_path : str
        Directory that holds the files named in ``fasta_files``.

    The command is built on top of ``base_cmd`` and the results are
    written below ``loci.path``, ``proteins.path`` and ``genes.path``,
    all of which come from the enclosing scope.
    """
    for fasta_name in fasta_files:
        # Everything before the last extension becomes the output stem
        stem, _ext = os.path.splitext(fasta_name)
        out_stem = f"{prefix}{stem}"

        loci_out = os.path.join(loci.path, f"{out_stem}_loci.gff")
        proteins_out = os.path.join(
            proteins.path, f"{out_stem}_proteins.fasta"
        )
        genes_out = os.path.join(genes.path, f"{out_stem}_genes.fasta")

        # Unpack into a fresh list so base_cmd itself is never mutated
        run_command([
            *base_cmd,
            "-i", os.path.join(input_path, fasta_name),
            "-o", loci_out,
            "-a", proteins_out,
            "-d", genes_out,
        ])
Sann5 marked this conversation as resolved.
Show resolved Hide resolved

if isinstance(mags, MAGSequencesDirFmt):
# List every entry of the input dir (bare names, no extension filter)
fasta_files = os.listdir(mags.path)
_process_fasta_files(fasta_files, '', mags.path)
Sann5 marked this conversation as resolved.
Show resolved Hide resolved

elif isinstance(mags, MultiMAGSequencesDirFmt):
# List all directories / samples
for sample_dir in os.listdir(mags.path):
sample_dir_path = os.path.join(mags.path, sample_dir)
if os.path.isdir(sample_dir_path):
Sann5 marked this conversation as resolved.
Show resolved Hide resolved
fasta_files = os.listdir(sample_dir_path)
print(fasta_files)
Sann5 marked this conversation as resolved.
Show resolved Hide resolved
_process_fasta_files(
Sann5 marked this conversation as resolved.
Show resolved Hide resolved
fasta_files,
f"{sample_dir}_",
sample_dir_path
)

# Return output directories
return loci, genes, proteins
Empty file.
43 changes: 40 additions & 3 deletions q2_moshpit/prodigal/tests/test_prodigal.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import os
from q2_moshpit.prodigal.prodigal import predict_genes_prodigal
from qiime2.plugin.testing import TestPluginBase
from q2_types.feature_data_mag import MAGSequencesDirFmt
from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt
from unittest.mock import patch, call
from q2_types.genome_data import (
LociDirectoryFormat, GenesDirectoryFormat, ProteinsDirectoryFormat,
Expand All @@ -20,7 +20,7 @@ class TestBUSCO(TestPluginBase):
package = "q2_moshpit.prodigal.tests"

@patch("subprocess.run")
def test_run_prodigal_1_mag(self, subp_run):
def test_run_prodigal_feature_data_1_mag(self, subp_run):
# Run prodigal with dummy data
p = self.get_data_path("dir_with_1_mag")
mags = MAGSequencesDirFmt(path=p, mode="r")
Expand Down Expand Up @@ -53,7 +53,7 @@ def test_run_prodigal_1_mag(self, subp_run):
)

@patch("subprocess.run")
def test_run_prodigal_3_mag(self, subp_run):
def test_run_prodigal_feature_data_3_mag(self, subp_run):
# Run prodigal with dummy data
p = self.get_data_path("dir_with_3_mag")
mags = MAGSequencesDirFmt(path=p, mode="r")
Expand Down Expand Up @@ -85,3 +85,40 @@ def test_run_prodigal_3_mag(self, subp_run):

# Assert that patch was called 3 times
subp_run.assert_has_calls(three_calls)

@patch("subprocess.run")
def test_run_prodigal_sample_data(self, subp_run):
    """Check predict_genes_prodigal on a MultiMAGSequencesDirFmt input.

    Every sub-directory of the test data dir is treated as one sample.
    For each file inside a sample, one prodigal call is expected whose
    output file names carry the ``<sample>_`` prefix.
    """
    # Run prodigal with dummy data
    p = self.get_data_path("")
    mags = MultiMAGSequencesDirFmt(path=p, mode="r")
    loci, genes, proteins = predict_genes_prodigal(mags=mags)

    # Check that output is correct type
    self.assertIsInstance(loci, LociDirectoryFormat)
    self.assertIsInstance(genes, GenesDirectoryFormat)
    self.assertIsInstance(proteins, ProteinsDirectoryFormat)

    # Build the expected calls from the contents of the test data dir
    calls = []
    for sample in os.listdir(mags.path):
        sample_path = os.path.join(mags.path, sample)
        # predict_genes_prodigal skips entries that are not directories,
        # so the expectation has to skip them as well
        if not os.path.isdir(sample_path):
            continue

        for fasta_file in os.listdir(sample_path):
            file_id = os.path.splitext(fasta_file)[0]
            # Define calls; "-i" uses the real file name so the
            # expectation does not depend on a particular extension
            calls.append(call(
                [
                    "prodigal",
                    "-g", "11",
                    "-f", "gff",
                    "-i", os.path.join(sample_path, fasta_file),
                    "-o",
                    os.path.join(
                        loci.path, f"{sample}_{file_id}_loci.gff"
                    ),
                    "-a",
                    os.path.join(
                        proteins.path,
                        f"{sample}_{file_id}_proteins.fasta"
                    ),
                    "-d",
                    os.path.join(
                        genes.path, f"{sample}_{file_id}_genes.fasta"
                    ),
                ],
                check=True,
            ))

    # Assert that prodigal was called once per file of every sample
    subp_run.assert_has_calls(calls)
Loading