Skip to content

Commit

Permalink
ENH: Added ability to specify segment coverage value for `extract-seq…
Browse files Browse the repository at this point in the history
…-segments`. (#199)
  • Loading branch information
mikerobeson authored Aug 14, 2024
1 parent 9c5c101 commit 4f7ab4b
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 7 deletions.
4 changes: 3 additions & 1 deletion rescript/extract_seq_segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
def extract_seq_segments(input_sequences: DNAFASTAFormat,
reference_segment_sequences: DNAFASTAFormat,
perc_identity: float = 0.7,
target_coverage: float = 0.9,
min_seq_len: int = None,
threads: int = 1
) -> (DNAFASTAFormat, DNAFASTAFormat):
Expand All @@ -25,7 +26,7 @@ def extract_seq_segments(input_sequences: DNAFASTAFormat,
# See here for more details:
# https://www.drive5.com/usearch/manual8.1/utax_user_train.html
#
# Warning: I hard set '--strand both', as multiple segments can be
# Warning: I hard set '--strand plus', otherwise multiple segments can be
# extracted from the same sequence, resulting in multiple output
# seqeunces to have the same ID, causing this action to fail.,
#
Expand All @@ -40,6 +41,7 @@ def extract_seq_segments(input_sequences: DNAFASTAFormat,
'--usearch_global', str(input_sequences),
'--db', str(reference_segment_sequences),
'--id', str(perc_identity),
'--target_cov', str(target_coverage),
'--strand', 'plus',
'--threads', str(threads),
'--qmask', 'none',
Expand Down
11 changes: 9 additions & 2 deletions rescript/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -1123,7 +1123,10 @@
'reference_segment_sequences': FeatureData[Sequence]},
parameters={'threads': VSEARCH_PARAMS['threads'],
'perc_identity': VSEARCH_PARAMS['perc_identity'],
'min_seq_len': Int % Range(1, None)
'target_coverage': Float % Range(0, 1,
inclusive_start=False,
inclusive_end=True),
'min_seq_len': Int % Range(1, None),
},
outputs=[('extracted_sequence_segments', FeatureData[Sequence]),
('unmatched_sequences', FeatureData[Sequence])],
Expand All @@ -1136,11 +1139,15 @@
'reference_segment_sequences': 'Reference sequence segments that '
'will be used to search for and '
'extract matching segments from '
'\'sequences\'.',
'\'input-sequences\'.',
},
parameter_descriptions={
'threads': VSEARCH_PARAM_DESCRIPTIONS['threads'],
'perc_identity': VSEARCH_PARAM_DESCRIPTIONS['perc_identity'],
'target_coverage': 'The minimum fraction of coverage that '
'\'reference-segment-sequences\' must have '
'in order to extract matching segments '
'from \'input-sequences\'.',
'min_seq_len': 'Minimum length of sequence allowed '
'for searching. Any sequence less than '
'this will be discarded. If not set, default '
Expand Down
2 changes: 2 additions & 0 deletions rescript/tests/data/full-length-query.fasta
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ CGCGACATAAAGACTCCTTATTTTCTTTTTCTTTATTTTATTGAAATTTCATAAACCTAAATTTAAAACTGAACTAAAGG
ACCATTTCCCATGCATCATCCTGGTGTAGTATTTGTGTCTATGTCAATTAAAGGGACTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTGAATGATTCAATAATAGAGATTATTTGCCCATATATGTTTATTTGTAGAGGTATTGTATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGTGAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGATGAAAAAAAAAGGAAGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATAGAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAATTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCGATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTGAATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCTAATTCTGAATTTTTCATATTTTAATTATCCATATAATATAATGGATTCGAGTCATGATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATTTATTAGGTATATAGGAACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAACTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCGAATTCTAGTTTGATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT
>fulllen04
ACCATTTCCCATGCATCATCCTGGTGTAGTGTTTGTGTCTATGTCAATTAAAGGGACTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTGAATGATTCAATAATAGAGATTATTTTCCCATATATGCTTATTTGTAGAGGTATTGTATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGTGAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGATGAAAAAAAAAGGAGGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATAGAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAATTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCGATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTGAATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCTAATTCTGAATTTTTCATATTATAATTATCCATATAATATAATGGATTCGAGTCATGATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATGTATTAGGTATATAGGAACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAACTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCTTATTCTAGTTTGATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT
>fulllen05
AAAATTTTGGGGCCCCAAAATTTTGGGGCCCC
3 changes: 2 additions & 1 deletion rescript/tests/data/ref-segs.fasta
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
ATCTTTATGTTTAGAAAAACAAGGGTTTTAATTGAAAAACTAGAAGAAAAAGG
>refsegment02
ATCCTGTTTTTCGAAAACAAACAAAGGTTTGTAAAGACAGAAGCAGAATACAAAAACAAAAG

>refsegment03
AAAATTTTGGGGCCCCAAAATTTTGGGGCCCCAAAATTTTGGGG
63 changes: 60 additions & 3 deletions rescript/tests/test_extract_seq_segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ class TestExtractSeqSegments(TestPluginBase):

def setUp(self):
super().setUp()

# NOTE: sequences must be longer than 32 bases for
# these vsearch tests.
input_seq_fp = self.get_data_path('full-length-query.fasta')
self.seqs = DNAFASTAFormat(input_seq_fp, mode='r')

Expand All @@ -30,15 +31,17 @@ def test_extract_seq_segments(self):
ext_segs, unmat_seqs, = extract_seq_segments(
input_sequences=self.seqs,
reference_segment_sequences=self.segref,
perc_identity=0.9)
perc_identity=0.9,
target_coverage=0.7)

obs_ext_segs = {seq.metadata['id']: str(seq)
for seq in ext_segs.view(DNAIterator)}
exp_ext_segs = {
'fulllen01': ('ATCTTTATGTTTAGAAAAACAAGGGTTTTAATTGAAA'
'AACTAGAAGAAAAAGG'),
'fulllen02': ('ATCCTGTTTTTCGAAAACAAACAAAGGTTTGTAAAGA'
'CAGAAGCAGAATACAAAAACAAAAG')}
'CAGAAGCAGAATACAAAAACAAAAG'),
'fulllen05': 'AAAATTTTGGGGCCCCAAAATTTTGGGGCCCC'}
self.assertEqual(obs_ext_segs, exp_ext_segs)

obs_unmat_seqs = {seq.metadata['id']: str(seq)
Expand Down Expand Up @@ -77,3 +80,57 @@ def test_extract_seq_segments(self):
'CTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCTTATTCTAGTTT'
'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT')}
self.assertEqual(obs_unmat_seqs, exp_unmat_seqs)

def test_high_coverage_extract_seq_segments(self):
ext_segs, unmat_seqs, = extract_seq_segments(
input_sequences=self.seqs,
reference_segment_sequences=self.segref,
perc_identity=0.9,
target_coverage=0.9)

obs_ext_segs = {seq.metadata['id']: str(seq)
for seq in ext_segs.view(DNAIterator)}
exp_ext_segs = {
'fulllen01': ('ATCTTTATGTTTAGAAAAACAAGGGTTTTAATTGAAA'
'AACTAGAAGAAAAAGG'),
'fulllen02': ('ATCCTGTTTTTCGAAAACAAACAAAGGTTTGTAAAGA'
'CAGAAGCAGAATACAAAAACAAAAG')}
self.assertEqual(obs_ext_segs, exp_ext_segs)

obs_unmat_seqs = {seq.metadata['id']: str(seq)
for seq in unmat_seqs.view(DNAIterator)}
exp_unmat_seqs = {
'fulllen03': (
'ACCATTTCCCATGCATCATCCTGGTGTAGTATTTGTGTCTATGTCAATTAAAGGGA'
'CTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTG'
'AATGATTCAATAATAGAGATTATTTGCCCATATATGTTTATTTGTAGAGGTATTGT'
'ATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGT'
'GAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGAT'
'GAAAAAAAAAGGAAGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATA'
'GAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAA'
'TTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCG'
'ATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTG'
'AATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCT'
'AATTCTGAATTTTTCATATTTTAATTATCCATATAATATAATGGATTCGAGTCATG'
'ATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATTTATTAGGTATATAGGA'
'ACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAA'
'CTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCGAATTCTAGTTT'
'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT'),
'fulllen04': (
'ACCATTTCCCATGCATCATCCTGGTGTAGTGTTTGTGTCTATGTCAATTAAAGGGA'
'CTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTG'
'AATGATTCAATAATAGAGATTATTTTCCCATATATGCTTATTTGTAGAGGTATTGT'
'ATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGT'
'GAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGAT'
'GAAAAAAAAAGGAGGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATA'
'GAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAA'
'TTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCG'
'ATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTG'
'AATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCT'
'AATTCTGAATTTTTCATATTATAATTATCCATATAATATAATGGATTCGAGTCATG'
'ATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATGTATTAGGTATATAGGA'
'ACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAA'
'CTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCTTATTCTAGTTT'
'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT'),
'fulllen05': 'AAAATTTTGGGGCCCCAAAATTTTGGGGCCCC'}
self.assertEqual(obs_unmat_seqs, exp_unmat_seqs)

0 comments on commit 4f7ab4b

Please sign in to comment.