Skip to content

Commit

Permalink
ENH: adds keeplength option to ensure the correct region is extract…
Browse files Browse the repository at this point in the history
…ed when using primers. (bokulich-lab#208)
  • Loading branch information
mikerobeson authored Dec 3, 2024
1 parent 83c9c15 commit 16a48fa
Show file tree
Hide file tree
Showing 7 changed files with 118 additions and 17 deletions.
2 changes: 2 additions & 0 deletions ci/recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ requirements:
- q2-types {{ qiime2_epoch }}.*
- q2-feature-classifier {{ qiime2_epoch }}.*
- q2-vizard {{ qiime2_epoch }}.*
- q2-alignment {{ qiime2_epoch }}.*
- qiime2 {{ qiime2_epoch }}.*

test:
Expand All @@ -36,6 +37,7 @@ test:
- q2-types >={{ q2_types }}
- q2-feature-classifier >={{ q2_feature_classifier }}
- q2-vizard >={{ q2_vizard }}
- q2-alignment >={{ q2_alignment }}
- qiime2 >={{ qiime2 }}

imports:
Expand Down
9 changes: 7 additions & 2 deletions rescript/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -1071,7 +1071,7 @@
'primer_rev': 'Reverse primer used to find the end position '
'for alignment trimming. Provide as 5\'-3\'.',
'position_start': 'Position within the alignment where the trimming '
'will begin. If not provided, alignment will not'
'will begin. If not provided, alignment will not '
'be trimmed at the beginning. If forward primer is'
'specified this parameter will be ignored.',
'position_end': 'Position within the alignment where the trimming '
Expand All @@ -1093,7 +1093,12 @@
"alignment will be generated to locate primer positions. "
"Subsequently, start (5'-most) and end (3'-most) position from fwd "
"and rev primer located within the new alignment is identified and "
"used for slicing the original alignment."),
"used for slicing the original alignment. That is, the primer region "
"will be included in the new alignment output. WARNING: finding "
"alignment positions via primer search can be inefficient for very "
"large alignments and is only recomended for small alignments. "
"For large alignments providing specific alignment positions is "
"ideal."),
)

T = TypeMatch([AlignedSequence, Sequence])
Expand Down
10 changes: 10 additions & 0 deletions rescript/tests/data/small-silva-full-len-alignment.fasta

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions rescript/tests/data/small-silva-v4-trim-keeplength.fasta

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions rescript/tests/data/small-silva-v4-trim-no-keeplength.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>AB299544.1.1336 Bacteria;Firmicutes;Clostridia;Oscillospirales;Ruminococcaceae;uncultured;uncultured Clostridiales bacterium
----------------------------------------------------T-A--G----G--G--A---AT-AA-C--AT-------------------------T-T-G-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>JF826529.1.1310 Bacteria;Proteobacteria;Gammaproteobacteria;Burkholderiales;Alcaligenaceae;Achromobacter;Achromobacter sp. NS014
----------------------------------------------------C-G--G----G--G--G---AT-AA-C--TA-------------------------C-C-C-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>CP011401.2356993.2358524 Bacteria;Proteobacteria;Gammaproteobacteria;Burkholderiales;Alcaligenaceae;Bordetella;Bordetella pertussis
----------------------------------------------------C-G--G----G--G--G---AT-AA-C--TA-------------------------C-G-C-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>HM446088.1.1410 Bacteria;Cyanobacteria;Cyanobacteriia;Synechococcales;Cyanobiaceae;Cyanobium PCC-6307;uncultured bacterium
----------------------------------------------------A-G--G----G--G--G---AT-AA-C--GG-------------------------C-T-G-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>FJ484772.1.1316 Bacteria;Proteobacteria;Gammaproteobacteria;Acidiferrobacterales;Acidiferrobacteraceae;Sulfurifustis;uncultured proteobacterium
----------------------------------------------------T-G--G----G--G--G---AC-AA-C--CC-------------------------G-G-C-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
82 changes: 69 additions & 13 deletions rescript/tests/test_trim_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@


import qiime2
import skbio
from qiime2.plugin.testing import TestPluginBase
from q2_types.feature_data import (
AlignedDNAFASTAFormat,
DNAIterator,)

from qiime2.plugins.alignment.methods import mafft_add

from rescript.trim_alignment import (
_prepare_positions, _process_primers, _locate_primer_positions,
_trim_all_sequences, _trim_alignment)
Expand Down Expand Up @@ -43,22 +46,52 @@ class TestExtractAlignmentRegion(TestPluginBase):
def setUp(self):
super().setUp()

aligned_seqs_fp = self.get_data_path('trim-test-alignment.fasta')
aligned_with_primers_fp = self.get_data_path(
'trim-test-alignment-with-primers.fasta')
aligned_silva_seqs_fp = self.get_data_path(
'small-silva-full-len-alignment.fasta')
self.aligned_silva_seqs = AlignedDNAFASTAFormat(
aligned_silva_seqs_fp, mode='r')

self.v4_primers_dict = {
"forward": "GTGYCAGCMGCCGCGGTAA",
"reverse": "GGACTACNVGGGTWTCTAAT"
}

silva_alignment_fp = self.get_data_path(
'small-silva-full-len-alignment.fasta')
self.silva_alignment = AlignedDNAFASTAFormat(
silva_alignment_fp, mode='r')
self.aligned_silva_seqs_art = qiime2.Artifact.import_data(
'FeatureData[AlignedSequence]',
self.silva_alignment)

silva_alignment_v4_trim_keeplen_wo_primers_fp = self.get_data_path(
'small-silva-v4-trim-keeplength.fasta')
self.silva_v4_trim_keeplen = AlignedDNAFASTAFormat(
silva_alignment_v4_trim_keeplen_wo_primers_fp, mode='r')

silva_alignment_v4_trim_no_keeplen_wo_primers_fp = self.get_data_path(
'small-silva-v4-trim-no-keeplength.fasta')
self.silva_v4_trim_no_keeplen = AlignedDNAFASTAFormat(
silva_alignment_v4_trim_no_keeplen_wo_primers_fp, mode='r')

aligned_seqs_fp = self.get_data_path('trim-test-alignment.fasta')
self.aligned_seqs = qiime2.Artifact.import_data(
'FeatureData[AlignedSequence]', aligned_seqs_fp)
self.aligned_seqs_fasta = AlignedDNAFASTAFormat(
aligned_seqs_fp, mode='r')

self.primers_dict = {
"forward": "GGGAATCTTCCACAATGG",
"reverse": "GTGTTCTTCTCTAACAACAG"
}

aligned_with_primers_fp = self.get_data_path(
'trim-test-alignment-with-primers.fasta')
self.aligned_with_primers = qiime2.Artifact.import_data(
'FeatureData[AlignedSequence]', aligned_with_primers_fp)
self.aligned_with_primers_fasta = AlignedDNAFASTAFormat(
aligned_with_primers_fp, mode='r')

self.aligned_mess_fasta = AlignedDNAFASTAFormat(
self.get_data_path(
'trim-test-alignment-with-primers-mess.fasta'), mode='r')
Expand Down Expand Up @@ -237,16 +270,39 @@ def test_trim_all_sequences_no_rev(self):
for seq in obs.view(DNAIterator)}
self.assertDictEqual(obs_seqs, self.exp_seqs_only_fwd)

# test trimming when both primers are given
def test_trim_alignment(self):
obs = _trim_alignment(
self.fake_ctx.get_action(1),
self.aligned_seqs_fasta,
self.primers_dict["forward"],
self.primers_dict["reverse"])
obs_seqs = {seq.metadata['id']: str(seq)
for seq in obs.view(DNAIterator)}
self.assertDictEqual(obs_seqs, self.exp_seqs_both_primers)
# test trimming when both primers are given and keeplength = False
# tests against expected alignment length
def test_trim_alignment_keeplen_false(self):
obs_v4_nokeep_aln = _trim_alignment(
mafft_add,
self.aligned_silva_seqs_art,
self.v4_primers_dict["forward"],
self.v4_primers_dict["reverse"],
keeplength=False)

obs_aln = skbio.io.read(str(obs_v4_nokeep_aln), into=skbio.TabularMSA,
constructor=skbio.DNA)
exp_aln = skbio.io.read(str(self.silva_v4_trim_no_keeplen),
into=skbio.TabularMSA,
constructor=skbio.DNA)
self.assertEqual(obs_aln, exp_aln)

# test trimming when both primers are given and keeplength = True
# tests against expected alignment length
def test_trim_alignment_keeplen_true(self):
obs_v4_keep_aln = _trim_alignment(
mafft_add,
self.aligned_silva_seqs_art,
self.v4_primers_dict["forward"],
self.v4_primers_dict["reverse"],
keeplength=True)

obs_aln = skbio.io.read(str(obs_v4_keep_aln), into=skbio.TabularMSA,
constructor=skbio.DNA)
exp_aln = skbio.io.read(str(self.silva_v4_trim_keeplen),
into=skbio.TabularMSA,
constructor=skbio.DNA)
self.assertEqual(obs_aln, exp_aln)

# test trimming when only fwd primer is given
def test_trim_alignment_only_fwd(self):
Expand Down
12 changes: 10 additions & 2 deletions rescript/trim_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,8 @@ def _trim_alignment(expand_alignment_action,
primer_rev=None,
position_start=None,
position_end=None,
n_threads=1) -> AlignedDNAFASTAFormat:
n_threads=1,
keeplength=True) -> AlignedDNAFASTAFormat:
"""
Trim alignment based on primer alignment or explicitly specified
positions. When at least one primer sequence is given, primer-based
Expand Down Expand Up @@ -221,11 +222,18 @@ def _trim_alignment(expand_alignment_action,
'FeatureData[Sequence]', primers_fasta)

# expand the existing alignment by addition of primers
# Use 'keeplength' to make sure original alignment length
# does not change. Otherwise there is potential for the incorrect
# region to be extracted from the alignment. This is important
# to make sure alignment positions can easily be referenced
# to the original alignment. See the 'q2-alignment' plugin,
# specifically the 'mafft-add' test cases for examples.
alignment_with_primers, = expand_alignment_action(
alignment=aligned_sequences,
sequences=primers,
addfragments=True,
n_threads=n_threads)
n_threads=n_threads,
keeplength=keeplength)

# find trim positions based on primer positions within alignment
trim_positions = _locate_primer_positions(alignment_with_primers)
Expand Down

0 comments on commit 16a48fa

Please sign in to comment.