Skip to content

Commit

Permalink
ENH: add action to get feature lengths (#164)
Browse files Browse the repository at this point in the history
  • Loading branch information
misialq authored May 17, 2024
1 parent 74cf309 commit a1b8961
Show file tree
Hide file tree
Showing 7 changed files with 82 additions and 3 deletions.
4 changes: 3 additions & 1 deletion q2_moshpit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,14 @@
helpers as kraken_helpers
)
from .metabat2 import metabat2
from ._utils import get_feature_lengths

# Resolve the package version, then drop the helper from the namespace.
__version__ = get_versions()['version']
del get_versions

# Names exported by the package. Every entry must be followed by a comma:
# adjacent string literals are concatenated implicitly, so a missing comma
# silently merges two names into a single, non-existent export.
__all__ = [
    'metabat2', 'bracken', 'kraken_class', 'kraken_db',
    'kaiju_class', 'kaiju_db', 'dereplicate_mags', 'eggnog',
    'busco', 'prodigal', 'kraken_helpers', 'partition',
    'get_feature_lengths'
]
18 changes: 18 additions & 0 deletions q2_moshpit/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@
import hashlib
from typing import List

import pandas as pd
import skbio

from q2_types.feature_data_mag import MAGSequencesDirFmt


def run_command(cmd, env=None, verbose=True, pipe=False, **kwargs):
if verbose:
Expand Down Expand Up @@ -84,3 +89,16 @@ def _calculate_md5_from_file(file_path: str) -> str:
for chunk in iter(lambda: f.read(4096), b""):
md5_hash.update(chunk)
return md5_hash.hexdigest()


def get_feature_lengths(features: MAGSequencesDirFmt) -> pd.DataFrame:
    """Tabulate the total sequence length of every feature.

    Each entry returned by ``features.feature_dict()`` is read as a FASTA
    file, and the lengths of all sequences found in that file are summed
    into a single value for the feature.

    Parameters
    ----------
    features : MAGSequencesDirFmt
        Directory format whose ``feature_dict()`` yields
        ``feature id -> FASTA file path`` pairs.

    Returns
    -------
    pd.DataFrame
        Frame indexed by ``id`` with a single ``length`` column, one row
        per feature, in the order produced by ``feature_dict()``.
    """
    length_by_id = {}
    for feature_id, fasta_fp in features.feature_dict().items():
        # The format is given explicitly; ``verify=False`` is passed so
        # scikit-bio is not asked to double check it.
        records = skbio.io.read(fasta_fp, format='fasta', verify=False)
        length_by_id[feature_id] = sum(len(record) for record in records)

    table = pd.DataFrame({
        'id': list(length_by_id.keys()),
        'length': list(length_by_id.values()),
    })
    return table.set_index('id')
22 changes: 21 additions & 1 deletion q2_moshpit/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
)
from q2_types.distance_matrix import DistanceMatrix
from q2_types.feature_data import (
FeatureData, Sequence, Taxonomy, ProteinSequence
FeatureData, Sequence, Taxonomy, ProteinSequence, SequenceCharacteristics
)
from q2_types.feature_table import FeatureTable, Frequency, PresenceAbsence
from q2_types.per_sample_sequences import (
Expand Down Expand Up @@ -1194,6 +1194,26 @@
citations=[citations["menzel2016"]],
)

# Register ``get_feature_lengths`` as a plugin method: it takes
# FeatureData[MAG] as its only input, has no parameters, and produces a
# FeatureData[SequenceCharacteristics] artifact carrying the 'length'
# property.
plugin.methods.register_function(
    function=q2_moshpit._utils.get_feature_lengths,
    inputs={
        "features": FeatureData[MAG],
    },
    parameters={},
    outputs=[
        ('lengths',
         FeatureData[SequenceCharacteristics % Properties('length')])
    ],
    input_descriptions={
        "features": "Features to get lengths for."
    },
    parameter_descriptions={},
    output_descriptions={'lengths': 'Feature lengths.', },
    name='Get feature lengths.',
    description='This method extracts lengths for the provided feature set.',
    citations=[]
)

plugin.register_semantic_types(BUSCOResults)
plugin.register_semantic_type_to_format(
BUSCOResults,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>NZ_00000000.1_contig1
ATGTTTCCAGATGCAATGCGTGGGCACTG
>NZ_00000000.1_contig2
TTGACGTCAGTGAAAACCATGCAGTGTTGACGTCAGT
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>NZ_CP007255.1_contig1
GGCGTGCCCTCA
>NZ_CP007255.1_contig2
GGAACGAACTCGGTCCAGGCGTTCTCCCACAT
>NZ_CP007255.1_contig3
ATGATCGATCCTTCCCACCAAGCTCA
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
>NZ_CP018863.1_contig1
GCCTCCTCCCAGTTCGTCTCAGCGCTGCTGCTGGTCGGCGCCAAATTCCGTAACGGGCTG
GCACCTCGAACATTCCGGCCAGAGCGTCCCCAGCCTGGACCACGTTGCCATGACCGTGGC
GGTACTGCGCAGC
>NZ_CP018863.1_contig2
CCCGGGAGCTTCGTCGCAGTCCGGTGGGCCTCCTCCCAGTTCGTCTCAGCGCTGCTGCTG
GTCGGCGCCAAATTCCGTAACGGGCTGGCACCTCGAACATTCCGGCCAGAGCGTCCCCAG
CCTGGACCACGTTGCCATGACCGTGGCGGTACTGCGCAGC
>NZ_CP018863.1_contig3
GTCGGCGCCAAATTCCGTAACGGGCTGGCACCTCGAACATTCCGGCCAGAGCGTCCCCAG
GACGATCTGC
20 changes: 19 additions & 1 deletion q2_moshpit/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,14 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import unittest

import pandas as pd
from qiime2.plugin.testing import TestPluginBase

from q2_types.feature_data_mag import MAGSequencesDirFmt
from .._utils import (
_construct_param, _process_common_input_params,
_calculate_md5_from_file
_calculate_md5_from_file, get_feature_lengths
)


Expand Down Expand Up @@ -123,6 +127,20 @@ def test_calculate_md5_from_fail(self):
observed_hash = _calculate_md5_from_file(path_to_file)
self.assertNotEqual(observed_hash, "a583054a9831a6e7cc56ea5cd9cac40a")

def test_get_feature_lengths(self):
    # Expected total length per feature id in the 'mags-derep' test data.
    expected_lengths = {
        '24dee6fe-9b84-45bb-8145-de7b092533a1': 66,
        'ca7012fc-ba65-40c3-84f5-05aa478a7585': 70,
        'd65a71fa-4279-4588-b937-0747ed5d604d': 363,
    }
    exp = pd.DataFrame({
        'id': list(expected_lengths.keys()),
        'length': list(expected_lengths.values()),
    }).set_index('id')

    mags = MAGSequencesDirFmt(self.get_data_path('mags-derep'), mode='r')
    obs = get_feature_lengths(mags)

    # Compares values, dtypes, index and column labels.
    pd.testing.assert_frame_equal(obs, exp)


if __name__ == '__main__':
unittest.main()

0 comments on commit a1b8961

Please sign in to comment.