From 83c9c1524d71f7d2234f6217d9fd3d66babe729e Mon Sep 17 00:00:00 2001 From: Mike Robeson Date: Tue, 19 Nov 2024 09:02:54 -0600 Subject: [PATCH] ENH: added `--p-max-seq-len` to `extract-seq-segments` and associated test code (#209) * added maxseqlen and test code * added default length settings as per code review * remove commented lines --- rescript/extract_seq_segments.py | 7 +- rescript/plugin_setup.py | 11 ++- rescript/tests/test_extract_seq_segments.py | 88 +++++++++++++++++++++ 3 files changed, 99 insertions(+), 7 deletions(-) diff --git a/rescript/extract_seq_segments.py b/rescript/extract_seq_segments.py index bd19f89..5fd3a51 100644 --- a/rescript/extract_seq_segments.py +++ b/rescript/extract_seq_segments.py @@ -15,7 +15,8 @@ def extract_seq_segments(input_sequences: DNAFASTAFormat, reference_segment_sequences: DNAFASTAFormat, perc_identity: float = 0.7, target_coverage: float = 0.9, - min_seq_len: int = None, + min_seq_len: int = 32, + max_seq_len: int = 50000, threads: int = 1 ) -> (DNAFASTAFormat, DNAFASTAFormat): # Note: the `--db` is actually comprised of the short amplicon @@ -43,11 +44,11 @@ def extract_seq_segments(input_sequences: DNAFASTAFormat, '--id', str(perc_identity), '--target_cov', str(target_coverage), '--strand', 'plus', + '--minseqlength', str(min_seq_len), + '--maxseqlength', str(max_seq_len), '--threads', str(threads), '--qmask', 'none', '--qsegout', extr_f.name, '--notmatched', no_match_f.name] - if min_seq_len: - cmd += ['--minseqlength', '%i' % min_seq_len] run_command(cmd) return (extr_seq_segs, no_matches) diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py index a4d5247..1da9ec2 100644 --- a/rescript/plugin_setup.py +++ b/rescript/plugin_setup.py @@ -1130,6 +1130,7 @@ inclusive_start=False, inclusive_end=True), 'min_seq_len': Int % Range(1, None), + 'max_seq_len': Int % Range(1, None), }, outputs=[('extracted_sequence_segments', FeatureData[Sequence]), ('unmatched_sequences', FeatureData[Sequence])], @@ -1151,10 +1152,12 @@ '\'reference-segment-sequences\' must have ' 'in order to extract matching segments ' 'from \'input-sequences\'.', - 'min_seq_len': 'Minimum length of sequence allowed ' - 'for searching. Any sequence less than ' - 'this will be discarded. If not set, default ' - 'program settings will be used.'}, + 'min_seq_len': 'Minimum length of reference sequence segment ' + 'allowed for searching. Any sequence less than ' + 'this will be discarded.', + 'max_seq_len': 'Maximum length of reference sequence segment ' + 'allowed for searching. Any sequence greater ' + 'than this will be discarded.'}, output_descriptions={ 'extracted_sequence_segments': 'Extracted sequence segments ' 'from \'input-sequences\' ' diff --git a/rescript/tests/test_extract_seq_segments.py b/rescript/tests/test_extract_seq_segments.py index 606b4b2..1622802 100644 --- a/rescript/tests/test_extract_seq_segments.py +++ b/rescript/tests/test_extract_seq_segments.py @@ -134,3 +134,91 @@ def test_high_coverage_extract_seq_segments(self): 'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT'), 'fulllen05': 'AAAATTTTGGGGCCCCAAAATTTTGGGGCCCC'} self.assertEqual(obs_unmat_seqs, exp_unmat_seqs) + + def test_extract_seq_segments_min_max(self): + ext_segs, unmat_seqs, = extract_seq_segments( + input_sequences=self.seqs, + reference_segment_sequences=self.segref, + perc_identity=0.9, + target_coverage=0.7, + min_seq_len=45, + max_seq_len=53) + + obs_ext_segs = {seq.metadata['id']: str(seq) + for seq in ext_segs.view(DNAIterator)} + exp_ext_segs = { + 'fulllen01': ('ATCTTTATGTTTAGAAAAACAAGGGTTTTAATTGAAA' + 'AACTAGAAGAAAAAGG')} + self.assertEqual(obs_ext_segs, exp_ext_segs) + + obs_unmat_seqs = {seq.metadata['id']: str(seq) + for seq in unmat_seqs.view(DNAIterator)} + exp_unmat_seqs = { + 'fulllen02': ( + 'CGCGACATAAAGACTCCTTATTTTCTTTTTCTTTATTTTATTGAAATTTCATAAACC' + 'TAAATTTAAAACTGAACTAAAGGATAAATATGATAAATGAGGCAAAATCTACTGAAG' + 'TATTATACTACAAAAAATACTGATATACTACAAAGAATACTGAGATGGATTGTATCA' + 'ATATCCGGACCTTATTGTATATACACAAATAATGGATATAGAAAAATAAAAAGAAAA' + 'AGGTCCTTTATCTTGTTCTTGATTTGTTCTGCAGAGATTTAACTACTCTAACTTTTA' + 'TTCATAATTTGAAGTTCCTATCGCATAAGAATCGGTGTCCCTTTGAAAGGAAAGAGA' + 'GGAAGAAGCCTTTTCAATATTCTTTAATTTCAAAAATACATTATCAATCATGCAAAA' + 'AAACTCAAACAAATAGAGAAAAGCCAGCTATCGGAGTCGAACCGATGACCATCGCAT' + 'TACAAATGCGATGCTCTAAAAAAAAAAGAAAAGAATCGACCGTTCGAGTATTCCAAA' + 'TTGCACGAGAAAAATGATTCGAAGAGAGACATATATAATAAATGTGGTATATATGCA' + 'TCTATATTGAATTGTGGATACAGAAATGATAGAATAATTTCTGATTAAACCAAATAT' + 'AGGTTTCCAATAGAGATGAAGGGGGGATAGATAAGAAAAAATAAGAGGAAACACTTT' + 'TATATGAATCGGTATCTAATGACTTTAACAGTTCCAGTAGAATATAAATGAAAAGGG' + 'GGGGAATGACGTAATAGTAATAATAAGATCTGAATTTCAAAACACAAAACAAAATAA' + 'AGGGGATATGGCGAAATTGGTAGACGCTACGGACTTAATTGGATTGAGCCTTGGTAT' + 'GGAAACTTACTAAGTGATAACTTTCAAATTCAGAGAAACCCTGGAAAAAGGGGGCAA' + 'TCCTGAGCCAAATCCTGTTTTTCGAAAACAAACAAAGGTTTGTAAAGACAGAAGCAG' + 'AATACAAAAACAAAAGGATAGGTGCAGAGACTCAATGGAAGTTGTTCTAACAATAAC' + 'AAATAGAGTTGACTGCGTTGCATTAGTCAAGTAAAGGAATATTTTTGTTAAAACTCC' + 'AGATCTAGAAAGGATAAAAAAAACTCCAGACCAGAAAAGATAAAGGAAAGTATAACC' + 'GTATACTCTATACATACGTATACGTACTGAAATACTATCTCAAATGATTAATGACAA' + 'CCGACCCGAATCTGTATTTTATTATATTTCTATGGAAAATGAAATAAAATAAATTGT' + 'TTTGAATCGATTCCAAGTTGCCGAAAGGATCGAATATTAATTGATCAAATCATTCAC' + 'CCCGTAGTCTGATAGATAACTAATTAATCGGACGAGAGTAAAGATAGAGCCCCATTC' + 'TACATGTCAATATTCTACATGTCAATATCGACAACAAAGAAATTTATAGTAAGAGGA' + 'AAATCCGTCGACTTTAGAAATCGTGAGGGTTCAAGTCCCTCTATCCCCAAAAAAGGC' + 'CTGCTCGGCTCCCTAATTATTTATCTTATTCTCTCTTTTCTTCTCTCTTTTCGTTAA' + 'CGGTTCAAAATTCGTTATCTTTCTCATTCATTCGATTCTTTCACAAATGCATCTGGG' + 'ATGCATTTTTTTCTTTTCACAAGTCTTGTGGTAGATATGATACACATACAAATGAAC' + 'TTATTTGAGTAAAACAAGGATAAAACAAGGAATCCCTTTTGGAAATTGGAATGATTA' + 'ACAATGATAAAACTTTCAAATACCTTTTGTTTTTTTTTAATTGACATAGACCCAAGT' + 'CATTTAGTGAGATGAGGAAGACGACGCGTTGGAAATGGTCGGGATAGCTCA'), + 'fulllen03': ( + 'ACCATTTCCCATGCATCATCCTGGTGTAGTATTTGTGTCTATGTCAATTAAAGGGA' + 'CTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTG' + 'AATGATTCAATAATAGAGATTATTTGCCCATATATGTTTATTTGTAGAGGTATTGT' + 'ATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGT' + 'GAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGAT' + 'GAAAAAAAAAGGAAGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATA' + 'GAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAA' + 'TTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCG' + 'ATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTG' + 'AATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCT' + 'AATTCTGAATTTTTCATATTTTAATTATCCATATAATATAATGGATTCGAGTCATG' + 'ATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATTTATTAGGTATATAGGA' + 'ACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAA' + 'CTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCGAATTCTAGTTT' + 'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT'), + 'fulllen04': ( + 'ACCATTTCCCATGCATCATCCTGGTGTAGTGTTTGTGTCTATGTCAATTAAAGGGA' + 'CTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTG' + 'AATGATTCAATAATAGAGATTATTTTCCCATATATGCTTATTTGTAGAGGTATTGT' + 'ATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGT' + 'GAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGAT' + 'GAAAAAAAAAGGAGGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATA' + 'GAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAA' + 'TTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCG' + 'ATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTG' + 'AATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCT' + 'AATTCTGAATTTTTCATATTATAATTATCCATATAATATAATGGATTCGAGTCATG' + 'ATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATGTATTAGGTATATAGGA' + 'ACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAA' + 'CTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCTTATTCTAGTTT' + 'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT'), + 'fulllen05': ( + 'AAAATTTTGGGGCCCCAAAATTTTGGGGCCCC')} + self.assertEqual(obs_unmat_seqs, exp_unmat_seqs)