From dac24122873da29d727c9da535d307100151e0dc Mon Sep 17 00:00:00 2001 From: mikerobeson Date: Sun, 3 Nov 2024 20:24:27 -0600 Subject: [PATCH 1/3] added maxseqlen and test code --- rescript/extract_seq_segments.py | 3 + rescript/plugin_setup.py | 11 ++- rescript/tests/test_extract_seq_segments.py | 88 +++++++++++++++++++++ 3 files changed, 99 insertions(+), 3 deletions(-) diff --git a/rescript/extract_seq_segments.py b/rescript/extract_seq_segments.py index bd19f89..0a4ed96 100644 --- a/rescript/extract_seq_segments.py +++ b/rescript/extract_seq_segments.py @@ -16,6 +16,7 @@ def extract_seq_segments(input_sequences: DNAFASTAFormat, perc_identity: float = 0.7, target_coverage: float = 0.9, min_seq_len: int = None, + max_seq_len: int = None, threads: int = 1 ) -> (DNAFASTAFormat, DNAFASTAFormat): # Note: the `--db` is actually comprised of the short amplicon @@ -49,5 +50,7 @@ def extract_seq_segments(input_sequences: DNAFASTAFormat, '--notmatched', no_match_f.name] if min_seq_len: cmd += ['--minseqlength', '%i' % min_seq_len] + if max_seq_len: + cmd += ['--maxseqlength', '%i' % max_seq_len] run_command(cmd) return (extr_seq_segs, no_matches) diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py index a4d5247..88a2828 100644 --- a/rescript/plugin_setup.py +++ b/rescript/plugin_setup.py @@ -1130,6 +1130,7 @@ inclusive_start=False, inclusive_end=True), 'min_seq_len': Int % Range(1, None), + 'max_seq_len': Int % Range(1, None), }, outputs=[('extracted_sequence_segments', FeatureData[Sequence]), ('unmatched_sequences', FeatureData[Sequence])], @@ -1151,10 +1152,14 @@ '\'reference-segment-sequences\' must have ' 'in order to extract matching segments ' 'from \'input-sequences\'.', - 'min_seq_len': 'Minimum length of sequence allowed ' - 'for searching. Any sequence less than ' + 'min_seq_len': 'Minimum length of reference sequence segment ' + 'allowed for searching. Any sequence less than ' 'this will be discarded. If not set, default ' - 'program settings will be used.'}, + 'program settings will be used.', + 'max_seq_len': 'Maximum length of reference sequence segment ' + 'allowed for searching. Any sequence greater ' + 'than this will be discarded. If not set, ' + 'default program settings will be used.'}, output_descriptions={ 'extracted_sequence_segments': 'Extracted sequence segments ' 'from \'input-sequences\' ' diff --git a/rescript/tests/test_extract_seq_segments.py b/rescript/tests/test_extract_seq_segments.py index 606b4b2..1622802 100644 --- a/rescript/tests/test_extract_seq_segments.py +++ b/rescript/tests/test_extract_seq_segments.py @@ -134,3 +134,91 @@ def test_high_coverage_extract_seq_segments(self): 'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT'), 'fulllen05': 'AAAATTTTGGGGCCCCAAAATTTTGGGGCCCC'} self.assertEqual(obs_unmat_seqs, exp_unmat_seqs) + + def test_extract_seq_segments_min_max(self): + ext_segs, unmat_seqs, = extract_seq_segments( + input_sequences=self.seqs, + reference_segment_sequences=self.segref, + perc_identity=0.9, + target_coverage=0.7, + min_seq_len=45, + max_seq_len=53) + + obs_ext_segs = {seq.metadata['id']: str(seq) + for seq in ext_segs.view(DNAIterator)} + exp_ext_segs = { + 'fulllen01': ('ATCTTTATGTTTAGAAAAACAAGGGTTTTAATTGAAA' + 'AACTAGAAGAAAAAGG')} + self.assertEqual(obs_ext_segs, exp_ext_segs) + + obs_unmat_seqs = {seq.metadata['id']: str(seq) + for seq in unmat_seqs.view(DNAIterator)} + exp_unmat_seqs = { + 'fulllen02': ( + 'CGCGACATAAAGACTCCTTATTTTCTTTTTCTTTATTTTATTGAAATTTCATAAACC' + 'TAAATTTAAAACTGAACTAAAGGATAAATATGATAAATGAGGCAAAATCTACTGAAG' + 'TATTATACTACAAAAAATACTGATATACTACAAAGAATACTGAGATGGATTGTATCA' + 'ATATCCGGACCTTATTGTATATACACAAATAATGGATATAGAAAAATAAAAAGAAAA' + 'AGGTCCTTTATCTTGTTCTTGATTTGTTCTGCAGAGATTTAACTACTCTAACTTTTA' + 'TTCATAATTTGAAGTTCCTATCGCATAAGAATCGGTGTCCCTTTGAAAGGAAAGAGA' + 'GGAAGAAGCCTTTTCAATATTCTTTAATTTCAAAAATACATTATCAATCATGCAAAA' + 'AAACTCAAACAAATAGAGAAAAGCCAGCTATCGGAGTCGAACCGATGACCATCGCAT' + 'TACAAATGCGATGCTCTAAAAAAAAAAGAAAAGAATCGACCGTTCGAGTATTCCAAA' + 'TTGCACGAGAAAAATGATTCGAAGAGAGACATATATAATAAATGTGGTATATATGCA' + 'TCTATATTGAATTGTGGATACAGAAATGATAGAATAATTTCTGATTAAACCAAATAT' + 'AGGTTTCCAATAGAGATGAAGGGGGGATAGATAAGAAAAAATAAGAGGAAACACTTT' + 'TATATGAATCGGTATCTAATGACTTTAACAGTTCCAGTAGAATATAAATGAAAAGGG' + 'GGGGAATGACGTAATAGTAATAATAAGATCTGAATTTCAAAACACAAAACAAAATAA' + 'AGGGGATATGGCGAAATTGGTAGACGCTACGGACTTAATTGGATTGAGCCTTGGTAT' + 'GGAAACTTACTAAGTGATAACTTTCAAATTCAGAGAAACCCTGGAAAAAGGGGGCAA' + 'TCCTGAGCCAAATCCTGTTTTTCGAAAACAAACAAAGGTTTGTAAAGACAGAAGCAG' + 'AATACAAAAACAAAAGGATAGGTGCAGAGACTCAATGGAAGTTGTTCTAACAATAAC' + 'AAATAGAGTTGACTGCGTTGCATTAGTCAAGTAAAGGAATATTTTTGTTAAAACTCC' + 'AGATCTAGAAAGGATAAAAAAAACTCCAGACCAGAAAAGATAAAGGAAAGTATAACC' + 'GTATACTCTATACATACGTATACGTACTGAAATACTATCTCAAATGATTAATGACAA' + 'CCGACCCGAATCTGTATTTTATTATATTTCTATGGAAAATGAAATAAAATAAATTGT' + 'TTTGAATCGATTCCAAGTTGCCGAAAGGATCGAATATTAATTGATCAAATCATTCAC' + 'CCCGTAGTCTGATAGATAACTAATTAATCGGACGAGAGTAAAGATAGAGCCCCATTC' + 'TACATGTCAATATTCTACATGTCAATATCGACAACAAAGAAATTTATAGTAAGAGGA' + 'AAATCCGTCGACTTTAGAAATCGTGAGGGTTCAAGTCCCTCTATCCCCAAAAAAGGC' + 'CTGCTCGGCTCCCTAATTATTTATCTTATTCTCTCTTTTCTTCTCTCTTTTCGTTAA' + 'CGGTTCAAAATTCGTTATCTTTCTCATTCATTCGATTCTTTCACAAATGCATCTGGG' + 'ATGCATTTTTTTCTTTTCACAAGTCTTGTGGTAGATATGATACACATACAAATGAAC' + 'TTATTTGAGTAAAACAAGGATAAAACAAGGAATCCCTTTTGGAAATTGGAATGATTA' + 'ACAATGATAAAACTTTCAAATACCTTTTGTTTTTTTTTAATTGACATAGACCCAAGT' + 'CATTTAGTGAGATGAGGAAGACGACGCGTTGGAAATGGTCGGGATAGCTCA'), + 'fulllen03': ( + 'ACCATTTCCCATGCATCATCCTGGTGTAGTATTTGTGTCTATGTCAATTAAAGGGA' + 'CTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTG' + 'AATGATTCAATAATAGAGATTATTTGCCCATATATGTTTATTTGTAGAGGTATTGT' + 'ATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGT' + 'GAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGAT' + 'GAAAAAAAAAGGAAGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATA' + 'GAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAA' + 'TTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCG' + 'ATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTG' + 'AATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCT' + 'AATTCTGAATTTTTCATATTTTAATTATCCATATAATATAATGGATTCGAGTCATG' + 'ATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATTTATTAGGTATATAGGA' + 'ACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAA' + 'CTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCGAATTCTAGTTT' + 'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT'), + 'fulllen04': ( + 'ACCATTTCCCATGCATCATCCTGGTGTAGTGTTTGTGTCTATGTCAATTAAAGGGA' + 'CTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTG' + 'AATGATTCAATAATAGAGATTATTTTCCCATATATGCTTATTTGTAGAGGTATTGT' + 'ATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGT' + 'GAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGAT' + 'GAAAAAAAAAGGAGGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATA' + 'GAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAA' + 'TTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCG' + 'ATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTG' + 'AATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCT' + 'AATTCTGAATTTTTCATATTATAATTATCCATATAATATAATGGATTCGAGTCATG' + 'ATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATGTATTAGGTATATAGGA' + 'ACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAA' + 'CTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCTTATTCTAGTTT' + 'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT'), + 'fulllen05': ( + 'AAAATTTTGGGGCCCCAAAATTTTGGGGCCCC')} + self.assertEqual(obs_unmat_seqs, exp_unmat_seqs) From 4a4030a8009d6cb780851e5dcf970c46073f08c2 Mon Sep 17 00:00:00 2001 From: mikerobeson Date: Tue, 19 Nov 2024 08:42:52 -0600 Subject: [PATCH 2/3] added default length settings as per code review --- rescript/extract_seq_segments.py | 14 ++++++++------ rescript/plugin_setup.py | 6 ++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/rescript/extract_seq_segments.py b/rescript/extract_seq_segments.py index 0a4ed96..315b027 100644 --- a/rescript/extract_seq_segments.py +++ b/rescript/extract_seq_segments.py @@ -15,8 +15,8 @@ def extract_seq_segments(input_sequences: DNAFASTAFormat, reference_segment_sequences: DNAFASTAFormat, perc_identity: float = 0.7, target_coverage: float = 0.9, - min_seq_len: int = None, - max_seq_len: int = None, + min_seq_len: int = 32, + max_seq_len: int = 50000, threads: int = 1 ) -> (DNAFASTAFormat, DNAFASTAFormat): # Note: the `--db` is actually comprised of the short amplicon @@ -44,13 +44,15 @@ def extract_seq_segments(input_sequences: DNAFASTAFormat, '--id', str(perc_identity), '--target_cov', str(target_coverage), '--strand', 'plus', + '--minseqlength', str(min_seq_len), + '--maxseqlength', str(max_seq_len), '--threads', str(threads), '--qmask', 'none', '--qsegout', extr_f.name, '--notmatched', no_match_f.name] - if min_seq_len: - cmd += ['--minseqlength', '%i' % min_seq_len] - if max_seq_len: - cmd += ['--maxseqlength', '%i' % max_seq_len] + # if min_seq_len: + # cmd += ['--minseqlength', '%i' % min_seq_len] + # if max_seq_len: + # cmd += ['--maxseqlength', '%i' % max_seq_len] run_command(cmd) return (extr_seq_segs, no_matches) diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py index 88a2828..1da9ec2 100644 --- a/rescript/plugin_setup.py +++ b/rescript/plugin_setup.py @@ -1154,12 +1154,10 @@ 'from \'input-sequences\'.', 'min_seq_len': 'Minimum length of reference sequence segment ' 'allowed for searching. Any sequence less than ' - 'this will be discarded. If not set, default ' - 'program settings will be used.', + 'this will be discarded.', 'max_seq_len': 'Maximum length of reference sequence segment ' 'allowed for searching. Any sequence greater ' - 'than this will be discarded. If not set, ' - 'default program settings will be used.'}, + 'than this will be discarded.'}, output_descriptions={ 'extracted_sequence_segments': 'Extracted sequence segments ' 'from \'input-sequences\' ' From 2111b2956fca52903d9157690d1bbac435aaf27b Mon Sep 17 00:00:00 2001 From: mikerobeson Date: Tue, 19 Nov 2024 08:45:16 -0600 Subject: [PATCH 3/3] remove commented lines --- rescript/extract_seq_segments.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/rescript/extract_seq_segments.py b/rescript/extract_seq_segments.py index 315b027..5fd3a51 100644 --- a/rescript/extract_seq_segments.py +++ b/rescript/extract_seq_segments.py @@ -50,9 +50,5 @@ def extract_seq_segments(input_sequences: DNAFASTAFormat, '--qmask', 'none', '--qsegout', extr_f.name, '--notmatched', no_match_f.name] - # if min_seq_len: - # cmd += ['--minseqlength', '%i' % min_seq_len] - # if max_seq_len: - # cmd += ['--maxseqlength', '%i' % max_seq_len] run_command(cmd) return (extr_seq_segs, no_matches)