From 98741b3442dbc6bff6ee203c3bd337d724e08763 Mon Sep 17 00:00:00 2001 From: Mike Robeson Date: Wed, 25 Sep 2024 19:21:09 -0500 Subject: [PATCH] Adds `--keeplength` flag to maff-add action. (#84) --- q2_alignment/_mafft.py | 15 +++-- q2_alignment/plugin_setup.py | 10 +++- .../tests/data/aligned-dna-sequences-2.fasta | 5 ++ .../data/unaligned-dna-sequences-2.fasta | 4 ++ q2_alignment/tests/test_mafft.py | 59 +++++++++++++++++++ 5 files changed, 87 insertions(+), 6 deletions(-) create mode 100644 q2_alignment/tests/data/aligned-dna-sequences-2.fasta create mode 100644 q2_alignment/tests/data/unaligned-dna-sequences-2.fasta diff --git a/q2_alignment/_mafft.py b/q2_alignment/_mafft.py index e1cbd65..377ca1c 100644 --- a/q2_alignment/_mafft.py +++ b/q2_alignment/_mafft.py @@ -26,7 +26,8 @@ def run_command(cmd, output_fp, verbose=True): subprocess.run(cmd, stdout=output_f, check=True) -def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments): +def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments, + keeplength): # Save original sequence IDs since long ids (~250 chars) can be truncated # by mafft. We'll replace the IDs in the aligned sequences file output by # mafft with the originals. @@ -91,9 +92,13 @@ def _mafft(sequences_fp, alignment_fp, n_threads, parttree, addfragments): if parttree: cmd += ['--parttree'] + if keeplength: + cmd += ['--keeplength'] + if alignment_fp is not None: add_flag = '--addfragments' if addfragments else '--add' cmd += [add_flag, sequences_fp, alignment_fp] + else: cmd += [sequences_fp] @@ -125,15 +130,17 @@ def mafft(sequences: DNAFASTAFormat, n_threads: int = 1, parttree: bool = False) -> AlignedDNAFASTAFormat: sequences_fp = str(sequences) - return _mafft(sequences_fp, None, n_threads, parttree, False) + return _mafft(sequences_fp, None, n_threads, parttree, False, False) def mafft_add(alignment: AlignedDNAFASTAFormat, sequences: DNAFASTAFormat, n_threads: int = 1, parttree: bool = False, - addfragments: bool = False) -> AlignedDNAFASTAFormat: + addfragments: bool = False, + keeplength: bool = False) -> AlignedDNAFASTAFormat: alignment_fp = str(alignment) sequences_fp = str(sequences) return _mafft( - sequences_fp, alignment_fp, n_threads, parttree, addfragments) + sequences_fp, alignment_fp, n_threads, parttree, addfragments, + keeplength) diff --git a/q2_alignment/plugin_setup.py b/q2_alignment/plugin_setup.py index d0f3ed6..132352d 100644 --- a/q2_alignment/plugin_setup.py +++ b/q2_alignment/plugin_setup.py @@ -47,7 +47,8 @@ 'sequences': FeatureData[Sequence]}, parameters={'n_threads': Threads, 'parttree': Bool, - 'addfragments': Bool}, + 'addfragments': Bool, + 'keeplength': Bool}, outputs=[('expanded_alignment', FeatureData[AlignedSequence])], input_descriptions={'alignment': 'The alignment to which ' 'sequences should be added.', @@ -60,7 +61,12 @@ 'addfragments': 'Optimize for the addition of short sequence ' 'fragments (for example, primer or amplicon ' 'sequences). If not set, default sequence addition ' - 'is used.'}, + 'is used.', + 'keeplength': 'If selected, the alignment length will be unchanged. ' + 'Any added sequence that would otherwise introduce new ' + 'insertions into the alignment, will have those ' + 'insertions deleted, to preserve original alignment ' + 'length.'}, output_descriptions={ 'expanded_alignment': 'Alignment containing the provided aligned and ' 'unaligned sequences.'}, diff --git a/q2_alignment/tests/data/aligned-dna-sequences-2.fasta b/q2_alignment/tests/data/aligned-dna-sequences-2.fasta new file mode 100644 index 0000000..35173e5 --- /dev/null +++ b/q2_alignment/tests/data/aligned-dna-sequences-2.fasta @@ -0,0 +1,5 @@ +>aln-seq-1 +AGGG-GGC +>aln-seq-2 +AGGGTGGC + diff --git a/q2_alignment/tests/data/unaligned-dna-sequences-2.fasta b/q2_alignment/tests/data/unaligned-dna-sequences-2.fasta new file mode 100644 index 0000000..5d16348 --- /dev/null +++ b/q2_alignment/tests/data/unaligned-dna-sequences-2.fasta @@ -0,0 +1,4 @@ +>seq-3 +AGGTTTTGGC +>seq-4 +AGGTTATGGC \ No newline at end of file diff --git a/q2_alignment/tests/test_mafft.py b/q2_alignment/tests/test_mafft.py index fd6b656..058c9b2 100644 --- a/q2_alignment/tests/test_mafft.py +++ b/q2_alignment/tests/test_mafft.py @@ -104,6 +104,45 @@ def _prepare_sequence_data(self): return alignment, sequences, exp + def _prepare_sequence_data_2(self): + # for new alignment using `--keeplength` parameter + sequences_fp = self.get_data_path('unaligned-dna-sequences-2.fasta') + sequences = DNAFASTAFormat(sequences_fp, mode='r') + alignment_fp = self.get_data_path('aligned-dna-sequences-2.fasta') + alignment = AlignedDNAFASTAFormat(alignment_fp, mode='r') + exp = skbio.TabularMSA( + [skbio.DNA('AGGG-GGC', + metadata={'id': 'aln-seq-1', 'description': ''}), + skbio.DNA('AGGGTGGC', + metadata={'id': 'aln-seq-2', 'description': ''}), + skbio.DNA('AGGTTGGC', + metadata={'id': 'seq-3', 'description': ''}), + skbio.DNA('AGGATGGC', + metadata={'id': 'seq-4', 'description': ''})] + ) + + return alignment, sequences, exp + + def _prepare_sequence_data_3(self): + # NOT using `--keeplength` parameter. To compare with + # _prepare_sequence_data_2 output + sequences_fp = self.get_data_path('unaligned-dna-sequences-2.fasta') + sequences = DNAFASTAFormat(sequences_fp, mode='r') + alignment_fp = self.get_data_path('aligned-dna-sequences-2.fasta') + alignment = AlignedDNAFASTAFormat(alignment_fp, mode='r') + exp = skbio.TabularMSA( + [skbio.DNA('AGG--G---GGC', + metadata={'id': 'aln-seq-1', 'description': ''}), + skbio.DNA('AGG--G--TGGC', + metadata={'id': 'aln-seq-2', 'description': ''}), + skbio.DNA('AGG--TTTTGGC', + metadata={'id': 'seq-3', 'description': ''}), + skbio.DNA('AGGTTA--TGGC', + metadata={'id': 'seq-4', 'description': ''})] + ) + + return alignment, sequences, exp + def test_mafft_add(self): alignment, sequences, exp = self._prepare_sequence_data() @@ -122,6 +161,26 @@ def test_mafft_add_fragments(self): constructor=skbio.DNA) self.assertEqual(obs, exp) + def test_mafft_add_fragments_keeplength(self): + alignment, sequences, exp = self._prepare_sequence_data_2() + + with redirected_stdio(stderr=os.devnull): + result = mafft_add(alignment, sequences, addfragments=True, + keeplength=True) + obs = skbio.io.read(str(result), into=skbio.TabularMSA, + constructor=skbio.DNA) + self.assertEqual(obs, exp) + + def test_mafft_add_fragments_no_keeplength(self): + alignment, sequences, exp = self._prepare_sequence_data_3() + + with redirected_stdio(stderr=os.devnull): + result = mafft_add(alignment, sequences, addfragments=True, + keeplength=False) + obs = skbio.io.read(str(result), into=skbio.TabularMSA, + constructor=skbio.DNA) + self.assertEqual(obs, exp) + def test_mafft_add_flags(self): alignment, sequences, exp = self._prepare_sequence_data()