Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Added ability to specify segment coverage value for extract-seq-segments. #199

Merged
merged 4 commits into from
Aug 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion rescript/extract_seq_segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
def extract_seq_segments(input_sequences: DNAFASTAFormat,
reference_segment_sequences: DNAFASTAFormat,
perc_identity: float = 0.7,
target_coverage: float = 0.9,
min_seq_len: int = None,
threads: int = 1
) -> (DNAFASTAFormat, DNAFASTAFormat):
Expand All @@ -25,7 +26,7 @@ def extract_seq_segments(input_sequences: DNAFASTAFormat,
# See here for more details:
# https://www.drive5.com/usearch/manual8.1/utax_user_train.html
#
# Warning: I hard set '--strand both', as multiple segments can be
# Warning: I hard set '--strand plus', otherwise multiple segments can be
# extracted from the same sequence, resulting in multiple output
# seqeunces to have the same ID, causing this action to fail.,
#
Expand All @@ -40,6 +41,7 @@ def extract_seq_segments(input_sequences: DNAFASTAFormat,
'--usearch_global', str(input_sequences),
'--db', str(reference_segment_sequences),
'--id', str(perc_identity),
'--target_cov', str(target_coverage),
'--strand', 'plus',
'--threads', str(threads),
'--qmask', 'none',
Expand Down
11 changes: 9 additions & 2 deletions rescript/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -1123,7 +1123,10 @@
'reference_segment_sequences': FeatureData[Sequence]},
parameters={'threads': VSEARCH_PARAMS['threads'],
'perc_identity': VSEARCH_PARAMS['perc_identity'],
'min_seq_len': Int % Range(1, None)
'target_coverage': Float % Range(0, 1,
inclusive_start=False,
inclusive_end=True),
'min_seq_len': Int % Range(1, None),
},
outputs=[('extracted_sequence_segments', FeatureData[Sequence]),
('unmatched_sequences', FeatureData[Sequence])],
Expand All @@ -1136,11 +1139,15 @@
'reference_segment_sequences': 'Reference sequence segments that '
'will be used to search for and '
'extract matching segments from '
'\'sequences\'.',
'\'input-sequences\'.',
},
parameter_descriptions={
'threads': VSEARCH_PARAM_DESCRIPTIONS['threads'],
'perc_identity': VSEARCH_PARAM_DESCRIPTIONS['perc_identity'],
'target_coverage': 'The minimum fraction of coverage that '
'\'reference-segment-sequences\' must have '
'in order to extract matching segments '
'from \'input-sequences\'.',
'min_seq_len': 'Minimum length of sequence allowed '
'for searching. Any sequence less than '
'this will be discarded. If not set, default '
Expand Down
2 changes: 2 additions & 0 deletions rescript/tests/data/full-length-query.fasta
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ CGCGACATAAAGACTCCTTATTTTCTTTTTCTTTATTTTATTGAAATTTCATAAACCTAAATTTAAAACTGAACTAAAGG
ACCATTTCCCATGCATCATCCTGGTGTAGTATTTGTGTCTATGTCAATTAAAGGGACTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTGAATGATTCAATAATAGAGATTATTTGCCCATATATGTTTATTTGTAGAGGTATTGTATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGTGAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGATGAAAAAAAAAGGAAGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATAGAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAATTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCGATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTGAATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCTAATTCTGAATTTTTCATATTTTAATTATCCATATAATATAATGGATTCGAGTCATGATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATTTATTAGGTATATAGGAACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAACTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCGAATTCTAGTTTGATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT
>fulllen04
ACCATTTCCCATGCATCATCCTGGTGTAGTGTTTGTGTCTATGTCAATTAAAGGGACTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTGAATGATTCAATAATAGAGATTATTTTCCCATATATGCTTATTTGTAGAGGTATTGTATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGTGAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGATGAAAAAAAAAGGAGGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATAGAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAATTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCGATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTGAATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCTAATTCTGAATTTTTCATATTATAATTATCCATATAATATAATGGATTCGAGTCATGATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATGTATTAGGTATATAGGAACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAACTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCTTATTCTAGTTTGATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT
>fulllen05
AAAATTTTGGGGCCCCAAAATTTTGGGGCCCC
3 changes: 2 additions & 1 deletion rescript/tests/data/ref-segs.fasta
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
ATCTTTATGTTTAGAAAAACAAGGGTTTTAATTGAAAAACTAGAAGAAAAAGG
>refsegment02
ATCCTGTTTTTCGAAAACAAACAAAGGTTTGTAAAGACAGAAGCAGAATACAAAAACAAAAG

>refsegment03
AAAATTTTGGGGCCCCAAAATTTTGGGGCCCCAAAATTTTGGGG
63 changes: 60 additions & 3 deletions rescript/tests/test_extract_seq_segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ class TestExtractSeqSegments(TestPluginBase):

def setUp(self):
super().setUp()

# NOTE: sequences must be longer than 32 bases for
# these vsearch tests.
input_seq_fp = self.get_data_path('full-length-query.fasta')
self.seqs = DNAFASTAFormat(input_seq_fp, mode='r')

Expand All @@ -30,15 +31,17 @@ def test_extract_seq_segments(self):
ext_segs, unmat_seqs, = extract_seq_segments(
input_sequences=self.seqs,
reference_segment_sequences=self.segref,
perc_identity=0.9)
perc_identity=0.9,
target_coverage=0.7)

obs_ext_segs = {seq.metadata['id']: str(seq)
for seq in ext_segs.view(DNAIterator)}
exp_ext_segs = {
'fulllen01': ('ATCTTTATGTTTAGAAAAACAAGGGTTTTAATTGAAA'
'AACTAGAAGAAAAAGG'),
'fulllen02': ('ATCCTGTTTTTCGAAAACAAACAAAGGTTTGTAAAGA'
'CAGAAGCAGAATACAAAAACAAAAG')}
'CAGAAGCAGAATACAAAAACAAAAG'),
'fulllen05': 'AAAATTTTGGGGCCCCAAAATTTTGGGGCCCC'}
self.assertEqual(obs_ext_segs, exp_ext_segs)

obs_unmat_seqs = {seq.metadata['id']: str(seq)
Expand Down Expand Up @@ -77,3 +80,57 @@ def test_extract_seq_segments(self):
'CTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCTTATTCTAGTTT'
'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT')}
self.assertEqual(obs_unmat_seqs, exp_unmat_seqs)

def test_high_coverage_extract_seq_segments(self):
ext_segs, unmat_seqs, = extract_seq_segments(
input_sequences=self.seqs,
reference_segment_sequences=self.segref,
perc_identity=0.9,
target_coverage=0.9)

obs_ext_segs = {seq.metadata['id']: str(seq)
for seq in ext_segs.view(DNAIterator)}
exp_ext_segs = {
'fulllen01': ('ATCTTTATGTTTAGAAAAACAAGGGTTTTAATTGAAA'
'AACTAGAAGAAAAAGG'),
'fulllen02': ('ATCCTGTTTTTCGAAAACAAACAAAGGTTTGTAAAGA'
'CAGAAGCAGAATACAAAAACAAAAG')}
self.assertEqual(obs_ext_segs, exp_ext_segs)

obs_unmat_seqs = {seq.metadata['id']: str(seq)
for seq in unmat_seqs.view(DNAIterator)}
exp_unmat_seqs = {
'fulllen03': (
'ACCATTTCCCATGCATCATCCTGGTGTAGTATTTGTGTCTATGTCAATTAAAGGGA'
'CTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTG'
'AATGATTCAATAATAGAGATTATTTGCCCATATATGTTTATTTGTAGAGGTATTGT'
'ATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGT'
'GAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGAT'
'GAAAAAAAAAGGAAGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATA'
'GAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAA'
'TTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCG'
'ATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTG'
'AATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCT'
'AATTCTGAATTTTTCATATTTTAATTATCCATATAATATAATGGATTCGAGTCATG'
'ATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATTTATTAGGTATATAGGA'
'ACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAA'
'CTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCGAATTCTAGTTT'
'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT'),
'fulllen04': (
'ACCATTTCCCATGCATCATCCTGGTGTAGTGTTTGTGTCTATGTCAATTAAAGGGA'
'CTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTG'
'AATGATTCAATAATAGAGATTATTTTCCCATATATGCTTATTTGTAGAGGTATTGT'
'ATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGT'
'GAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGAT'
'GAAAAAAAAAGGAGGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATA'
'GAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAA'
'TTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCG'
'ATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTG'
'AATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCT'
'AATTCTGAATTTTTCATATTATAATTATCCATATAATATAATGGATTCGAGTCATG'
'ATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATGTATTAGGTATATAGGA'
'ACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAA'
'CTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCTTATTCTAGTTT'
'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT'),
'fulllen05': 'AAAATTTTGGGGCCCCAAAATTTTGGGGCCCC'}
self.assertEqual(obs_unmat_seqs, exp_unmat_seqs)
Loading