Skip to content

Commit

Permalink
Merge branch 'master' into get_bv_brc_data
Browse files Browse the repository at this point in the history
  • Loading branch information
VinzentRisch committed Aug 21, 2024
2 parents e5ee9fe + 4f7ab4b commit 0199719
Show file tree
Hide file tree
Showing 7 changed files with 82 additions and 18 deletions.
4 changes: 3 additions & 1 deletion rescript/extract_seq_segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
def extract_seq_segments(input_sequences: DNAFASTAFormat,
reference_segment_sequences: DNAFASTAFormat,
perc_identity: float = 0.7,
target_coverage: float = 0.9,
min_seq_len: int = None,
threads: int = 1
) -> (DNAFASTAFormat, DNAFASTAFormat):
Expand All @@ -25,7 +26,7 @@ def extract_seq_segments(input_sequences: DNAFASTAFormat,
# See here for more details:
# https://www.drive5.com/usearch/manual8.1/utax_user_train.html
#
# Warning: I hard set '--strand both', as multiple segments can be
# Warning: I hard set '--strand plus', otherwise multiple segments can be
# extracted from the same sequence, resulting in multiple output
# seqeunces to have the same ID, causing this action to fail.,
#
Expand All @@ -40,6 +41,7 @@ def extract_seq_segments(input_sequences: DNAFASTAFormat,
'--usearch_global', str(input_sequences),
'--db', str(reference_segment_sequences),
'--id', str(perc_identity),
'--target_cov', str(target_coverage),
'--strand', 'plus',
'--threads', str(threads),
'--qmask', 'none',
Expand Down
4 changes: 2 additions & 2 deletions rescript/get_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@


def get_silva_data(ctx,
version='138.1',
version='138.2',
target='SSURef_NR99',
include_species_labels=False,
rank_propagation=True,
Expand Down Expand Up @@ -86,7 +86,7 @@ def _assemble_silva_data_urls(version, target, download_sequences=True):
tax_url = base_url_tax + '.txt'

# add ".gz" for the following versions:
if version in ['138', '138.1']:
if version in ['138', '138.1', '138.2']:
tree_url += '.gz'
tax_url += '.gz'

Expand Down
15 changes: 11 additions & 4 deletions rescript/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -732,15 +732,15 @@
"[default: '" +
"', '".join(DEFAULT_RANKS) + "']")

_SILVA_VERSIONS = ['128', '132', '138', '138.1']
_SILVA_VERSIONS = ['128', '132', '138', '138.1', '138.2']
_SILVA_TARGETS = ['SSURef_NR99', 'SSURef', 'LSURef_NR99', 'LSURef']

version_map, target_map, _ = TypeMap({
(Str % Choices('128', '132'),
Str % Choices('SSURef_NR99', 'SSURef', 'LSURef')): Visualization,
(Str % Choices('138'),
Str % Choices('SSURef_NR99', 'SSURef')): Visualization,
(Str % Choices('138.1'),
(Str % Choices('138.1', '138.2'),
Str % Choices('SSURef_NR99', 'SSURef', 'LSURef_NR99',
'LSURef')): Visualization,
})
Expand Down Expand Up @@ -1126,7 +1126,10 @@
'reference_segment_sequences': FeatureData[Sequence]},
parameters={'threads': VSEARCH_PARAMS['threads'],
'perc_identity': VSEARCH_PARAMS['perc_identity'],
'min_seq_len': Int % Range(1, None)
'target_coverage': Float % Range(0, 1,
inclusive_start=False,
inclusive_end=True),
'min_seq_len': Int % Range(1, None),
},
outputs=[('extracted_sequence_segments', FeatureData[Sequence]),
('unmatched_sequences', FeatureData[Sequence])],
Expand All @@ -1139,11 +1142,15 @@
'reference_segment_sequences': 'Reference sequence segments that '
'will be used to search for and '
'extract matching segments from '
'\'sequences\'.',
'\'input-sequences\'.',
},
parameter_descriptions={
'threads': VSEARCH_PARAM_DESCRIPTIONS['threads'],
'perc_identity': VSEARCH_PARAM_DESCRIPTIONS['perc_identity'],
'target_coverage': 'The minimum fraction of coverage that '
'\'reference-segment-sequences\' must have '
'in order to extract matching segments '
'from \'input-sequences\'.',
'min_seq_len': 'Minimum length of sequence allowed '
'for searching. Any sequence less than '
'this will be discarded. If not set, default '
Expand Down
2 changes: 2 additions & 0 deletions rescript/tests/data/full-length-query.fasta
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ CGCGACATAAAGACTCCTTATTTTCTTTTTCTTTATTTTATTGAAATTTCATAAACCTAAATTTAAAACTGAACTAAAGG
ACCATTTCCCATGCATCATCCTGGTGTAGTATTTGTGTCTATGTCAATTAAAGGGACTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTGAATGATTCAATAATAGAGATTATTTGCCCATATATGTTTATTTGTAGAGGTATTGTATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGTGAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGATGAAAAAAAAAGGAAGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATAGAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAATTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCGATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTGAATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCTAATTCTGAATTTTTCATATTTTAATTATCCATATAATATAATGGATTCGAGTCATGATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATTTATTAGGTATATAGGAACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAACTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCGAATTCTAGTTTGATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT
>fulllen04
ACCATTTCCCATGCATCATCCTGGTGTAGTGTTTGTGTCTATGTCAATTAAAGGGACTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTGAATGATTCAATAATAGAGATTATTTTCCCATATATGCTTATTTGTAGAGGTATTGTATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGTGAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGATGAAAAAAAAAGGAGGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATAGAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAATTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCGATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTGAATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCTAATTCTGAATTTTTCATATTATAATTATCCATATAATATAATGGATTCGAGTCATGATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATGTATTAGGTATATAGGAACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAACTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCTTATTCTAGTTTGATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT
>fulllen05
AAAATTTTGGGGCCCCAAAATTTTGGGGCCCC
3 changes: 2 additions & 1 deletion rescript/tests/data/ref-segs.fasta
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
ATCTTTATGTTTAGAAAAACAAGGGTTTTAATTGAAAAACTAGAAGAAAAAGG
>refsegment02
ATCCTGTTTTTCGAAAACAAACAAAGGTTTGTAAAGACAGAAGCAGAATACAAAAACAAAAG

>refsegment03
AAAATTTTGGGGCCCCAAAATTTTGGGGCCCCAAAATTTTGGGG
63 changes: 60 additions & 3 deletions rescript/tests/test_extract_seq_segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ class TestExtractSeqSegments(TestPluginBase):

def setUp(self):
super().setUp()

# NOTE: sequences must be longer than 32 bases for
# these vsearch tests.
input_seq_fp = self.get_data_path('full-length-query.fasta')
self.seqs = DNAFASTAFormat(input_seq_fp, mode='r')

Expand All @@ -30,15 +31,17 @@ def test_extract_seq_segments(self):
ext_segs, unmat_seqs, = extract_seq_segments(
input_sequences=self.seqs,
reference_segment_sequences=self.segref,
perc_identity=0.9)
perc_identity=0.9,
target_coverage=0.7)

obs_ext_segs = {seq.metadata['id']: str(seq)
for seq in ext_segs.view(DNAIterator)}
exp_ext_segs = {
'fulllen01': ('ATCTTTATGTTTAGAAAAACAAGGGTTTTAATTGAAA'
'AACTAGAAGAAAAAGG'),
'fulllen02': ('ATCCTGTTTTTCGAAAACAAACAAAGGTTTGTAAAGA'
'CAGAAGCAGAATACAAAAACAAAAG')}
'CAGAAGCAGAATACAAAAACAAAAG'),
'fulllen05': 'AAAATTTTGGGGCCCCAAAATTTTGGGGCCCC'}
self.assertEqual(obs_ext_segs, exp_ext_segs)

obs_unmat_seqs = {seq.metadata['id']: str(seq)
Expand Down Expand Up @@ -77,3 +80,57 @@ def test_extract_seq_segments(self):
'CTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCTTATTCTAGTTT'
'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT')}
self.assertEqual(obs_unmat_seqs, exp_unmat_seqs)

def test_high_coverage_extract_seq_segments(self):
ext_segs, unmat_seqs, = extract_seq_segments(
input_sequences=self.seqs,
reference_segment_sequences=self.segref,
perc_identity=0.9,
target_coverage=0.9)

obs_ext_segs = {seq.metadata['id']: str(seq)
for seq in ext_segs.view(DNAIterator)}
exp_ext_segs = {
'fulllen01': ('ATCTTTATGTTTAGAAAAACAAGGGTTTTAATTGAAA'
'AACTAGAAGAAAAAGG'),
'fulllen02': ('ATCCTGTTTTTCGAAAACAAACAAAGGTTTGTAAAGA'
'CAGAAGCAGAATACAAAAACAAAAG')}
self.assertEqual(obs_ext_segs, exp_ext_segs)

obs_unmat_seqs = {seq.metadata['id']: str(seq)
for seq in unmat_seqs.view(DNAIterator)}
exp_unmat_seqs = {
'fulllen03': (
'ACCATTTCCCATGCATCATCCTGGTGTAGTATTTGTGTCTATGTCAATTAAAGGGA'
'CTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTG'
'AATGATTCAATAATAGAGATTATTTGCCCATATATGTTTATTTGTAGAGGTATTGT'
'ATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGT'
'GAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGAT'
'GAAAAAAAAAGGAAGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATA'
'GAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAA'
'TTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCG'
'ATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTG'
'AATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCT'
'AATTCTGAATTTTTCATATTTTAATTATCCATATAATATAATGGATTCGAGTCATG'
'ATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATTTATTAGGTATATAGGA'
'ACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAA'
'CTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCGAATTCTAGTTT'
'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT'),
'fulllen04': (
'ACCATTTCCCATGCATCATCCTGGTGTAGTGTTTGTGTCTATGTCAATTAAAGGGA'
'CTAAAAAGTAGTAAAAAATTTAACTAGTAAGCGTAAGGATAATGATATGGACTGTG'
'AATGATTCAATAATAGAGATTATTTTCCCATATATGCTTATTTGTAGAGGTATTGT'
'ATCTATCGAAATTGGGATAAGATCCAAGGATTTTAGTTCGGATACATTTGTGTTGT'
'GAAAGAGTGGAGTGAATGAGAAAGATAGTGAATTTTGTTTGAACTGAATCGCTGAT'
'GAAAAAAAAAGGAGGATAAATATTTAGGAAGTAAAATTACCTTTTTATTGGGGATA'
'GAGGGACTTGAACCCTCACGATTTCTAAAGTCGACGGATTTTCCTCTTACTATAAA'
'TTTCATTGTTGTCGGTATTGACATGTAGAATGGGACTCTCTCTTTATTCTCGTCCG'
'ATTAATCAGTTTTTCAAAAGATCTATCAAACTCTGGAATGAATGATTTAATAATTG'
'AATTGAATATTCAATTCTTCTTTCAACTTCCATTGGACTGGATTCACAATAACTCT'
'AATTCTGAATTTTTCATATTATAATTATCCATATAATATAATGGATTCGAGTCATG'
'ATTAATCGTTTGATTTGATATGTCAGTATGTATACGTATGTATTAGGTATATAGGA'
'ACATCTTTTCTGTAATTTTGATAGACGGATTCCAACTACCAACGAAACGTAGTCAA'
'CTTCATTCGTTAGAACAGCTTCCATTGAGTCTCTGCACCTATCCTTATTCTAGTTT'
'GATAAAACTAAGGTTTATCAAACTAAGGATTTGGCTCAGGATTGCCCAT'),
'fulllen05': 'AAAATTTTGGGGCCCCAAAATTTTGGGGCCCC'}
self.assertEqual(obs_unmat_seqs, exp_unmat_seqs)
9 changes: 2 additions & 7 deletions rescript/tests/test_get_unite.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,10 +70,7 @@ def test_unite_get_artifacts(self):
"k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Thelephorales;"
"f__Thelephoraceae;g__Tomentella;s__unidentified",
)
self.assertEqual(
str(type(res_two)),
"<class 'q2_types.feature_data._transformer.DNAIterator'>",
)
self.assertTrue(isinstance(res_two, q2_types.feature_data.DNAIterator))
# test no _dev files found
with self.assertRaises(ValueError):
_unite_get_artifacts(self.unitefile_no_dev, cluster_id="97")
Expand All @@ -94,7 +91,5 @@ def test_get_unite_data(self):
self.assertEqual(len(res), 2)
self.assertTrue(isinstance(res[0], pandas.core.frame.DataFrame))
self.assertTrue(
isinstance(
res[1], q2_types.feature_data._transformer.DNAIterator
)
isinstance(res[1], q2_types.feature_data.DNAIterator)
)

0 comments on commit 0199719

Please sign in to comment.