Skip to content

Commit

Permalink
ENH: added --p-max-seq-len to extract-seq-segments and associated…
Browse files Browse the repository at this point in the history
… test code (bokulich-lab#209)

* added maxseqlen and test code

* added default length settings as per code review

* remove commented lines
  • Loading branch information
mikerobeson authored Nov 19, 2024
1 parent 49fc648 commit 83c9c15
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 7 deletions.
7 changes: 4 additions & 3 deletions rescript/extract_seq_segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ def extract_seq_segments(input_sequences: DNAFASTAFormat,
reference_segment_sequences: DNAFASTAFormat,
perc_identity: float = 0.7,
target_coverage: float = 0.9,
min_seq_len: int = None,
min_seq_len: int = 32,
max_seq_len: int = 50000,
threads: int = 1
) -> (DNAFASTAFormat, DNAFASTAFormat):
# Note: the `--db` is actually comprised of the short amplicon
Expand Down Expand Up @@ -43,11 +44,11 @@ def extract_seq_segments(input_sequences: DNAFASTAFormat,
'--id', str(perc_identity),
'--target_cov', str(target_coverage),
'--strand', 'plus',
'--minseqlength', str(min_seq_len),
'--maxseqlength', str(max_seq_len),
'--threads', str(threads),
'--qmask', 'none',
'--qsegout', extr_f.name,
'--notmatched', no_match_f.name]
if min_seq_len:
cmd += ['--minseqlength', '%i' % min_seq_len]
run_command(cmd)
return (extr_seq_segs, no_matches)
11 changes: 7 additions & 4 deletions rescript/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -1130,6 +1130,7 @@
inclusive_start=False,
inclusive_end=True),
'min_seq_len': Int % Range(1, None),
'max_seq_len': Int % Range(1, None),
},
outputs=[('extracted_sequence_segments', FeatureData[Sequence]),
('unmatched_sequences', FeatureData[Sequence])],
Expand All @@ -1151,10 +1152,12 @@
'\'reference-segment-sequences\' must have '
'in order to extract matching segments '
'from \'input-sequences\'.',
'min_seq_len': 'Minimum length of sequence allowed '
'for searching. Any sequence less than '
'this will be discarded. If not set, default '
'program settings will be used.'},
'min_seq_len': 'Minimum length of reference sequence segment '
'allowed for searching. Any sequence less than '
'this will be discarded.',
'max_seq_len': 'Maximum length of reference sequence segment '
'allowed for searching. Any sequence greater '
'than this will be discarded.'},
output_descriptions={
'extracted_sequence_segments': 'Extracted sequence segments '
'from \'input-sequences\' '
Expand Down
88 changes: 88 additions & 0 deletions rescript/tests/test_extract_seq_segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,3 +134,91 @@ def test_high_coverage_extract_seq_segments(self):
'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT'),
'fulllen05': 'AAAATTTTGGGGCCCCAAAATTTTGGGGCCCC'}
self.assertEqual(obs_unmat_seqs, exp_unmat_seqs)

def test_extract_seq_segments_min_max(self):
ext_segs, unmat_seqs, = extract_seq_segments(
input_sequences=self.seqs,
reference_segment_sequences=self.segref,
perc_identity=0.9,
target_coverage=0.7,
min_seq_len=45,
max_seq_len=53)

obs_ext_segs = {seq.metadata['id']: str(seq)
for seq in ext_segs.view(DNAIterator)}
exp_ext_segs = {
'fulllen01': ('ATCTTTATGTTTAGAAAAACAAGGGTTTTAATTGAAA'
'AACTAGAAGAAAAAGG')}
self.assertEqual(obs_ext_segs, exp_ext_segs)

obs_unmat_seqs = {seq.metadata['id']: str(seq)
for seq in unmat_seqs.view(DNAIterator)}
exp_unmat_seqs = {
'fulllen02': (
'CGCGACATAAAGACTCCTTATTTTCTTTTTCTTTATTTTATTGAAATTTCATAAACC'
'TAAATTTAAAACTGAACTAAAGGATAAATATGATAAATGAGGCAAAATCTACTGAAG'
'TATTATACTACAAAAAATACTGATATACTACAAAGAATACTGAGATGGATTGTATCA'
'ATATCCGGACCTTATTGTATATACACAAATAATGGATATAGAAAAATAAAAAGAAAA'
'AGGTCCTTTATCTTGTTCTTGATTTGTTCTGCAGAGATTTAACTACTCTAACTTTTA'
'TTCATAATTTGAAGTTCCTATCGCATAAGAATCGGTGTCCCTTTGAAAGGAAAGAGA'
'GGAAGAAGCCTTTTCAATATTCTTTAATTTCAAAAATACATTATCAATCATGCAAAA'
'AAACTCAAACAAATAGAGAAAAGCCAGCTATCGGAGTCGAACCGATGACCATCGCAT'
'TACAAATGCGATGCTCTAAAAAAAAAAGAAAAGAATCGACCGTTCGAGTATTCCAAA'
'TTGCACGAGAAAAATGATTCGAAGAGAGACATATATAATAAATGTGGTATATATGCA'
'TCTATATTGAATTGTGGATACAGAAATGATAGAATAATTTCTGATTAAACCAAATAT'
'AGGTTTCCAATAGAGATGAAGGGGGGATAGATAAGAAAAAATAAGAGGAAACACTTT'
'TATATGAATCGGTATCTAATGACTTTAACAGTTCCAGTAGAATATAAATGAAAAGGG'
'GGGGAATGACGTAATAGTAATAATAAGATCTGAATTTCAAAACACAAAACAAAATAA'
'AGGGGATATGGCGAAATTGGTAGACGCTACGGACTTAATTGGATTGAGCCTTGGTAT'
'GGAAACTTACTAAGTGATAACTTTCAAATTCAGAGAAACCCTGGAAAAAGGGGGCAA'
'TCCTGAGCCAAATCCTGTTTTTCGAAAACAAACAAAGGTTTGTAAAGACAGAAGCAG'
'AATACAAAAACAAAAGGATAGGTGCAGAGACTCAATGGAAGTTGTTCTAACAATAAC'
'AAATAGAGTTGACTGCGTTGCATTAGTCAAGTAAAGGAATATTTTTGTTAAAACTCC'
'AGATCTAGAAAGGATAAAAAAAACTCCAGACCAGAAAAGATAAAGGAAAGTATAACC'
'GTATACTCTATACATACGTATACGTACTGAAATACTATCTCAAATGATTAATGACAA'
'CCGACCCGAATCTGTATTTTATTATATTTCTATGGAAAATGAAATAAAATAAATTGT'
'TTTGAATCGATTCCAAGTTGCCGAAAGGATCGAATATTAATTGATCAAATCATTCAC'
'CCCGTAGTCTGATAGATAACTAATTAATCGGACGAGAGTAAAGATAGAGCCCCATTC'
'TACATGTCAATATTCTACATGTCAATATCGACAACAAAGAAATTTATAGTAAGAGGA'
'AAATCCGTCGACTTTAGAAATCGTGAGGGTTCAAGTCCCTCTATCCCCAAAAAAGGC'
'CTGCTCGGCTCCCTAATTATTTATCTTATTCTCTCTTTTCTTCTCTCTTTTCGTTAA'
'CGGTTCAAAATTCGTTATCTTTCTCATTCATTCGATTCTTTCACAAATGCATCTGGG'
'ATGCATTTTTTTCTTTTCACAAGTCTTGTGGTAGATATGATACACATACAAATGAAC'
'TTATTTGAGTAAAACAAGGATAAAACAAGGAATCCCTTTTGGAAATTGGAATGATTA'
'ACAATGATAAAACTTTCAAATACCTTTTGTTTTTTTTTAATTGACATAGACCCAAGT'
'CATTTAGTGAGATGAGGAAGACGACGCGTTGGAAATGGTCGGGATAGCTCA'),
'fulllen03': (
'ACCATTTCCCATGCATCATCCTGGTGTAGTATTTGTGTCTATGTCAATTAAAGGGA'
'CTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTG'
'AATGATTCAATAATAGAGATTATTTGCCCATATATGTTTATTTGTAGAGGTATTGT'
'ATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGT'
'GAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGAT'
'GAAAAAAAAAGGAAGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATA'
'GAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAA'
'TTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCG'
'ATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTG'
'AATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCT'
'AATTCTGAATTTTTCATATTTTAATTATCCATATAATATAATGGATTCGAGTCATG'
'ATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATTTATTAGGTATATAGGA'
'ACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAA'
'CTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCGAATTCTAGTTT'
'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT'),
'fulllen04': (
'ACCATTTCCCATGCATCATCCTGGTGTAGTGTTTGTGTCTATGTCAATTAAAGGGA'
'CTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTG'
'AATGATTCAATAATAGAGATTATTTTCCCATATATGCTTATTTGTAGAGGTATTGT'
'ATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGT'
'GAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGAT'
'GAAAAAAAAAGGAGGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATA'
'GAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAA'
'TTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCG'
'ATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTG'
'AATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCT'
'AATTCTGAATTTTTCATATTATAATTATCCATATAATATAATGGATTCGAGTCATG'
'ATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATGTATTAGGTATATAGGA'
'ACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAA'
'CTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCTTATTCTAGTTT'
'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT'),
'fulllen05': (
'AAAATTTTGGGGCCCCAAAATTTTGGGGCCCC')}
self.assertEqual(obs_unmat_seqs, exp_unmat_seqs)

0 comments on commit 83c9c15

Please sign in to comment.