From 4f7ab4b1918e92c573a69204d3357ae1014c7050 Mon Sep 17 00:00:00 2001 From: Mike Robeson Date: Wed, 14 Aug 2024 01:38:14 -0500 Subject: [PATCH] ENH: Added ability to specify segment coverage value for `extract-seq-segments`. (#199) --- rescript/extract_seq_segments.py | 4 +- rescript/plugin_setup.py | 11 +++- rescript/tests/data/full-length-query.fasta | 2 + rescript/tests/data/ref-segs.fasta | 3 +- rescript/tests/test_extract_seq_segments.py | 63 ++++++++++++++++++++- 5 files changed, 76 insertions(+), 7 deletions(-) diff --git a/rescript/extract_seq_segments.py b/rescript/extract_seq_segments.py index 73033488..bd19f89b 100644 --- a/rescript/extract_seq_segments.py +++ b/rescript/extract_seq_segments.py @@ -14,6 +14,7 @@ def extract_seq_segments(input_sequences: DNAFASTAFormat, reference_segment_sequences: DNAFASTAFormat, perc_identity: float = 0.7, + target_coverage: float = 0.9, min_seq_len: int = None, threads: int = 1 ) -> (DNAFASTAFormat, DNAFASTAFormat): @@ -25,7 +26,7 @@ def extract_seq_segments(input_sequences: DNAFASTAFormat, # See here for more details: # https://www.drive5.com/usearch/manual8.1/utax_user_train.html # - # Warning: I hard set '--strand both', as multiple segments can be + # Warning: I hard set '--strand plus', otherwise multiple segments can be # extracted from the same sequence, resulting in multiple output # seqeunces to have the same ID, causing this action to fail., # @@ -40,6 +41,7 @@ def extract_seq_segments(input_sequences: DNAFASTAFormat, '--usearch_global', str(input_sequences), '--db', str(reference_segment_sequences), '--id', str(perc_identity), + '--target_cov', str(target_coverage), '--strand', 'plus', '--threads', str(threads), '--qmask', 'none', diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py index 50e50f1c..5a5c6baf 100644 --- a/rescript/plugin_setup.py +++ b/rescript/plugin_setup.py @@ -1123,7 +1123,10 @@ 'reference_segment_sequences': FeatureData[Sequence]}, parameters={'threads': VSEARCH_PARAMS['threads'], 'perc_identity': VSEARCH_PARAMS['perc_identity'], - 'min_seq_len': Int % Range(1, None) + 'target_coverage': Float % Range(0, 1, + inclusive_start=False, + inclusive_end=True), + 'min_seq_len': Int % Range(1, None), }, outputs=[('extracted_sequence_segments', FeatureData[Sequence]), ('unmatched_sequences', FeatureData[Sequence])], @@ -1136,11 +1139,15 @@ 'reference_segment_sequences': 'Reference sequence segments that ' 'will be used to search for and ' 'extract matching segments from ' - '\'sequences\'.', + '\'input-sequences\'.', }, parameter_descriptions={ 'threads': VSEARCH_PARAM_DESCRIPTIONS['threads'], 'perc_identity': VSEARCH_PARAM_DESCRIPTIONS['perc_identity'], + 'target_coverage': 'The minimum fraction of coverage that ' + '\'reference-segment-sequences\' must have ' + 'in order to extract matching segments ' + 'from \'input-sequences\'.', 'min_seq_len': 'Minimum length of sequence allowed ' 'for searching. Any sequence less than ' 'this will be discarded. If not set, default ' diff --git a/rescript/tests/data/full-length-query.fasta b/rescript/tests/data/full-length-query.fasta index ab6ad1d5..17a3acf1 100644 --- a/rescript/tests/data/full-length-query.fasta +++ b/rescript/tests/data/full-length-query.fasta @@ -6,3 +6,5 @@ CGCGACATAAAGACTCCTTATTTTCTTTTTCTTTATTTTATTGAAATTTCATAAACCTAAATTTAAAACTGAACTAAAGG ACCATTTCCCATGCATCATCCTGGTGTAGTATTTGTGTCTATGTCAATTAAAGGGACTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTGAATGATTCAATAATAGAGATTATTTGCCCATATATGTTTATTTGTAGAGGTATTGTATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGTGAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGATGAAAAAAAAAGGAAGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATAGAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAATTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCGATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTGAATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCTAATTCTGAATTTTTCATATTTTAATTATCCATATAATATAATGGATTCGAGTCATGATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATTTATTAGGTATATAGGAACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAACTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCGAATTCTAGTTTGATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT >fulllen04 ACCATTTCCCATGCATCATCCTGGTGTAGTGTTTGTGTCTATGTCAATTAAAGGGACTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTGAATGATTCAATAATAGAGATTATTTTCCCATATATGCTTATTTGTAGAGGTATTGTATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGTGAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGATGAAAAAAAAAGGAGGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATAGAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAATTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCGATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTGAATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCTAATTCTGAATTTTTCATATTATAATTATCCATATAATATAATGGATTCGAGTCATGATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATGTATTAGGTATATAGGAACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAACTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCTTATTCTAGTTTGATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT +>fulllen05 +AAAATTTTGGGGCCCCAAAATTTTGGGGCCCC diff --git a/rescript/tests/data/ref-segs.fasta b/rescript/tests/data/ref-segs.fasta index 37795156..3dccd96c 100644 --- a/rescript/tests/data/ref-segs.fasta +++ b/rescript/tests/data/ref-segs.fasta @@ -2,4 +2,5 @@ ATCTTTATGTTTAGAAAAACAAGGGTTTTAATTGAAAAACTAGAAGAAAAAGG >refsegment02 ATCCTGTTTTTCGAAAACAAACAAAGGTTTGTAAAGACAGAAGCAGAATACAAAAACAAAAG - +>refsegment03 +AAAATTTTGGGGCCCCAAAATTTTGGGGCCCCAAAATTTTGGGG diff --git a/rescript/tests/test_extract_seq_segments.py b/rescript/tests/test_extract_seq_segments.py index e4f2b96d..606b4b2e 100644 --- a/rescript/tests/test_extract_seq_segments.py +++ b/rescript/tests/test_extract_seq_segments.py @@ -19,7 +19,8 @@ class TestExtractSeqSegments(TestPluginBase): def setUp(self): super().setUp() - + # NOTE: sequences must be longer than 32 bases for + # these vsearch tests. input_seq_fp = self.get_data_path('full-length-query.fasta') self.seqs = DNAFASTAFormat(input_seq_fp, mode='r') @@ -30,7 +31,8 @@ def test_extract_seq_segments(self): ext_segs, unmat_seqs, = extract_seq_segments( input_sequences=self.seqs, reference_segment_sequences=self.segref, - perc_identity=0.9) + perc_identity=0.9, + target_coverage=0.7) obs_ext_segs = {seq.metadata['id']: str(seq) for seq in ext_segs.view(DNAIterator)} @@ -38,7 +40,8 @@ def test_extract_seq_segments(self): 'fulllen01': ('ATCTTTATGTTTAGAAAAACAAGGGTTTTAATTGAAA' 'AACTAGAAGAAAAAGG'), 'fulllen02': ('ATCCTGTTTTTCGAAAACAAACAAAGGTTTGTAAAGA' - 'CAGAAGCAGAATACAAAAACAAAAG')} + 'CAGAAGCAGAATACAAAAACAAAAG'), + 'fulllen05': 'AAAATTTTGGGGCCCCAAAATTTTGGGGCCCC'} self.assertEqual(obs_ext_segs, exp_ext_segs) obs_unmat_seqs = {seq.metadata['id']: str(seq) @@ -77,3 +80,57 @@ def test_extract_seq_segments(self): 'CTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCTTATTCTAGTTT' 'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT')} self.assertEqual(obs_unmat_seqs, exp_unmat_seqs) + + def test_high_coverage_extract_seq_segments(self): + ext_segs, unmat_seqs, = extract_seq_segments( + input_sequences=self.seqs, + reference_segment_sequences=self.segref, + perc_identity=0.9, + target_coverage=0.9) + + obs_ext_segs = {seq.metadata['id']: str(seq) + for seq in ext_segs.view(DNAIterator)} + exp_ext_segs = { + 'fulllen01': ('ATCTTTATGTTTAGAAAAACAAGGGTTTTAATTGAAA' + 'AACTAGAAGAAAAAGG'), + 'fulllen02': ('ATCCTGTTTTTCGAAAACAAACAAAGGTTTGTAAAGA' + 'CAGAAGCAGAATACAAAAACAAAAG')} + self.assertEqual(obs_ext_segs, exp_ext_segs) + + obs_unmat_seqs = {seq.metadata['id']: str(seq) + for seq in unmat_seqs.view(DNAIterator)} + exp_unmat_seqs = { + 'fulllen03': ( + 'ACCATTTCCCATGCATCATCCTGGTGTAGTATTTGTGTCTATGTCAATTAAAGGGA' + 'CTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTG' + 'AATGATTCAATAATAGAGATTATTTGCCCATATATGTTTATTTGTAGAGGTATTGT' + 'ATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGT' + 'GAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGAT' + 'GAAAAAAAAAGGAAGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATA' + 'GAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAA' + 'TTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCG' + 'ATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTG' + 'AATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCT' + 'AATTCTGAATTTTTCATATTTTAATTATCCATATAATATAATGGATTCGAGTCATG' + 'ATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATTTATTAGGTATATAGGA' + 'ACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAA' + 'CTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCGAATTCTAGTTT' + 'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT'), + 'fulllen04': ( + 'ACCATTTCCCATGCATCATCCTGGTGTAGTGTTTGTGTCTATGTCAATTAAAGGGA' + 'CTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTG' + 'AATGATTCAATAATAGAGATTATTTTCCCATATATGCTTATTTGTAGAGGTATTGT' + 'ATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGT' + 'GAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGAT' + 'GAAAAAAAAAGGAGGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATA' + 'GAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAA' + 'TTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCG' + 'ATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTG' + 'AATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCT' + 'AATTCTGAATTTTTCATATTATAATTATCCATATAATATAATGGATTCGAGTCATG' + 'ATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATGTATTAGGTATATAGGA' + 'ACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAA' + 'CTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCTTATTCTAGTTT' + 'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT'), + 'fulllen05': 'AAAATTTTGGGGCCCCAAAATTTTGGGGCCCC'} + self.assertEqual(obs_unmat_seqs, exp_unmat_seqs)