From ae2c5d167b54c18745c079f3fc62112c279003df Mon Sep 17 00:00:00 2001 From: Santiago Castro Dau Date: Fri, 12 Apr 2024 16:10:27 +0200 Subject: [PATCH] Add SampleData[MAGs] to prodigal --- q2_moshpit/plugin_setup.py | 2 +- q2_moshpit/prodigal/prodigal.py | 64 ++++++++++++++++++++++----------- 2 files changed, 44 insertions(+), 22 deletions(-) diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py index c743dddb..03edfa07 100644 --- a/q2_moshpit/plugin_setup.py +++ b/q2_moshpit/plugin_setup.py @@ -989,7 +989,7 @@ plugin.methods.register_function( function=q2_moshpit.prodigal.predict_genes_prodigal, inputs={ - 'mags': FeatureData[MAG] + 'mags': FeatureData[MAG] | SampleData[MAGs] }, input_descriptions={ 'mags': 'MAGs for which one wishes to predict genes.' diff --git a/q2_moshpit/prodigal/prodigal.py b/q2_moshpit/prodigal/prodigal.py index 8af1d321..2754223d 100644 --- a/q2_moshpit/prodigal/prodigal.py +++ b/q2_moshpit/prodigal/prodigal.py @@ -7,15 +7,17 @@ # ---------------------------------------------------------------------------- import os import copy as cp +from typing import Union from .._utils import run_command from q2_types.feature_data_mag import MAGSequencesDirFmt +from q2_types.per_sample_sequences import MultiMAGSequencesDirFmt from q2_types.genome_data import ( LociDirectoryFormat, GenesDirectoryFormat, ProteinsDirectoryFormat, ) def predict_genes_prodigal( - mags: MAGSequencesDirFmt, + mags: Union[MAGSequencesDirFmt, MultiMAGSequencesDirFmt], translation_table_number: str = "11", ) -> (LociDirectoryFormat, GenesDirectoryFormat, ProteinsDirectoryFormat): @@ -24,12 +26,6 @@ def predict_genes_prodigal( genes = GenesDirectoryFormat() proteins = ProteinsDirectoryFormat() - # Get paths to fasta files in input dir - fasta_files = [ - file for file in os.listdir(mags.path) - if file.endswith(".fa") or file.endswith(".fasta") - ] - # Define base command base_cmd = [ "prodigal", @@ -37,21 +33,47 @@ def predict_genes_prodigal( "-f", "gff" ] - # For every fasta file in mags.path call prodigal and write - # outputs to the corresponding directories. - for fasta_file in fasta_files: - # Get the filename from the file path - file_id = os.path.splitext(fasta_file)[0] - - # Adjust command and run - cmd = cp.deepcopy(base_cmd) - cmd.extend([ - "-i", os.path.join(mags.path, fasta_file), - "-o", os.path.join(loci.path, f"{file_id}_loci.gff"), - "-a", os.path.join(proteins.path, f"{file_id}_proteins.fasta"), - "-d", os.path.join(genes.path, f"{file_id}_genes.fasta") - ]) + def _process_fasta_files(fasta_files: list, prefix: str, input_path: str): + # For every fasta file call prodigal and write + # outputs to the corresponding directories. + for fasta_file in fasta_files: + # Get the filename from the file path + file_id = os.path.splitext(fasta_file)[0] + + # Adjust command and run + cmd = cp.deepcopy(base_cmd) + cmd.extend([ + "-i", os.path.join(input_path, fasta_file), + "-o", + os.path.join( + loci.path, f"{prefix}{file_id}_loci.gff" + ), + "-a", + os.path.join( + proteins.path, f"{prefix}{file_id}_proteins.fasta" + ), + "-d", + os.path.join( + genes.path, f"{prefix}{file_id}_genes.fasta" + ) + ]) run_command(cmd) + if isinstance(mags, MAGSequencesDirFmt): + # Get paths to fasta files in input dir + fasta_files = os.listdir(mags.path) + _process_fasta_files(fasta_files, '', mags.path) + + elif isinstance(mags, MultiMAGSequencesDirFmt): + # List all directories / samples + for sample_dir in os.listdir(mags.path): + if os.path.isdir(os.path.join(mags.path, sample_dir)): + fasta_files = os.listdir(os.path.join(mags.path, sample_dir)) + _process_fasta_files( + fasta_files, + f"{sample_dir}_", + os.path.join(mags.path, sample_dir) + ) + # Return output directories return loci, genes, proteins