Skip to content

Commit

Permalink
added mafft keeplength parameter and tests to maff-add
Browse files Browse the repository at this point in the history
  • Loading branch information
mikerobeson committed Aug 19, 2024
1 parent 571f5e5 commit 49be297
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 5 deletions.
13 changes: 10 additions & 3 deletions q2_alignment/_mafft.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ def run_command(cmd, output_fp, verbose=True):
subprocess.run(cmd, stdout=output_f, check=True)


def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments):
def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments,
keeplength):
# Save original sequence IDs since long ids (~250 chars) can be truncated
# by mafft. We'll replace the IDs in the aligned sequences file output by
# mafft with the originals.
Expand Down Expand Up @@ -91,9 +92,13 @@ def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments):
if parttree:
cmd += ['--parttree']

if keeplength is not None:
cmd += ['--keeplength']

if alignment_fp is not None:
add_flag = '--addfragments' if addfragments else '--add'
cmd += [add_flag, sequences_fp, alignment_fp]

else:
cmd += [sequences_fp]

Expand Down Expand Up @@ -132,8 +137,10 @@ def mafft_add(alignment: AlignedDNAFASTAFormat,
sequences: DNAFASTAFormat,
n_threads: int = 1,
parttree: bool = False,
addfragments: bool = False) -> AlignedDNAFASTAFormat:
addfragments: bool = False,
keeplength: bool = False) -> AlignedDNAFASTAFormat:
alignment_fp = str(alignment)
sequences_fp = str(sequences)
return _mafft(
sequences_fp, alignment_fp, n_threads, parttree, addfragments)
sequences_fp, alignment_fp, n_threads, parttree, addfragments,
keeplength)
10 changes: 8 additions & 2 deletions q2_alignment/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@
'sequences': FeatureData[Sequence]},
parameters={'n_threads': Threads,
'parttree': Bool,
'addfragments': Bool},
'addfragments': Bool,
'keeplength': Bool},
outputs=[('expanded_alignment', FeatureData[AlignedSequence])],
input_descriptions={'alignment': 'The alignment to which '
'sequences should be added.',
Expand All @@ -60,7 +61,12 @@
'addfragments': 'Optimize for the addition of short sequence '
'fragments (for example, primer or amplicon '
'sequences). If not set, default sequence addition '
'is used.'},
'is used.',
'keeplength': 'If selected, the alignment length will be unchanged. '
'Any added sequence that would otherwise introduce new '
'insertions into the alignment, will have those '
'insertions deleted, to preserve original alignment '
'length.'},
output_descriptions={
'expanded_alignment': 'Alignment containing the provided aligned and '
'unaligned sequences.'},
Expand Down
5 changes: 5 additions & 0 deletions q2_alignment/tests/data/aligned-dna-sequences-2.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
>aln-seq-1
AGGG-GGC
>aln-seq-2
AGGGTGGC

4 changes: 4 additions & 0 deletions q2_alignment/tests/data/unaligned-dna-sequences-2.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>seq-3
AGGTTTTGGC
>seq-4
AGGTTATGGC
57 changes: 57 additions & 0 deletions q2_alignment/tests/test_mafft.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,45 @@ def _prepare_sequence_data(self):

return alignment, sequences, exp

def _prepare_sequence_data_2(self):
# for new alignment using `--keeplength` parameter
sequences_fp = self.get_data_path('unaligned-dna-sequences-2.fasta')
sequences = DNAFASTAFormat(sequences_fp, mode='r')
alignment_fp = self.get_data_path('aligned-dna-sequences-2.fasta')
alignment = AlignedDNAFASTAFormat(alignment_fp, mode='r')
exp = skbio.TabularMSA(
[skbio.DNA('AGGG-GGC',
metadata={'id': 'aln-seq-1', 'description': ''}),
skbio.DNA('AGGGTGGC',
metadata={'id': 'aln-seq-2', 'description': ''}),
skbio.DNA('AGGTTGGC',
metadata={'id': 'seq-3', 'description': ''}),
skbio.DNA('AGGATGGC',
metadata={'id': 'seq-4', 'description': ''})]
)

return alignment, sequences, exp

def _prepare_sequence_data_3(self):
# NOT using `--keeplength` parameter. To compare with
# _prepare_sequence_data_2 output
sequences_fp = self.get_data_path('unaligned-dna-sequences-2.fasta')
sequences = DNAFASTAFormat(sequences_fp, mode='r')
alignment_fp = self.get_data_path('aligned-dna-sequences-2.fasta')
alignment = AlignedDNAFASTAFormat(alignment_fp, mode='r')
exp = skbio.TabularMSA(
[skbio.DNA('AGG--G---GGC',
metadata={'id': 'aln-seq-1', 'description': ''}),
skbio.DNA('AGG--G--TGGC',
metadata={'id': 'aln-seq-2', 'description': ''}),
skbio.DNA('AGG--TTTTGGC',
metadata={'id': 'seq-3', 'description': ''}),
skbio.DNA('AGGTTA--TGGC',
metadata={'id': 'seq-4', 'description': ''})]
)

return alignment, sequences, exp

def test_mafft_add(self):
alignment, sequences, exp = self._prepare_sequence_data()

Expand All @@ -122,6 +161,24 @@ def test_mafft_add_fragments(self):
constructor=skbio.DNA)
self.assertEqual(obs, exp)

def test_mafft_add_fragments_keeplength(self):
alignment, sequences, exp = self._prepare_sequence_data_2()

with redirected_stdio(stderr=os.devnull):
result = mafft_add(alignment, sequences, addfragments=True)
obs = skbio.io.read(str(result), into=skbio.TabularMSA,
constructor=skbio.DNA)
self.assertEqual(obs, exp)

def test_mafft_add_fragments_no_keeplength(self):
alignment, sequences, exp = self._prepare_sequence_data_3()

with redirected_stdio(stderr=os.devnull):
result = mafft_add(alignment, sequences, addfragments=True)
obs = skbio.io.read(str(result), into=skbio.TabularMSA,
constructor=skbio.DNA)
self.assertEqual(obs, exp)

def test_mafft_add_flags(self):
alignment, sequences, exp = self._prepare_sequence_data()

Expand Down

0 comments on commit 49be297

Please sign in to comment.