From 51df26be96c89a50ea88981c00a1c1cc3347900d Mon Sep 17 00:00:00 2001 From: Mike Robeson Date: Wed, 10 Jan 2024 16:22:31 -0600 Subject: [PATCH] get-gtdb refactor (#169) --- rescript/get_gtdb.py | 220 ++++++++++++++-------------- rescript/plugin_setup.py | 26 +++- rescript/tests/data/gtdb-seqs.fasta | 9 ++ rescript/tests/data/gtdb-tax.tsv | 6 + rescript/tests/test_get_gtdb.py | 162 +++++++++++--------- 5 files changed, 240 insertions(+), 183 deletions(-) create mode 100644 rescript/tests/data/gtdb-seqs.fasta create mode 100644 rescript/tests/data/gtdb-tax.tsv diff --git a/rescript/get_gtdb.py b/rescript/get_gtdb.py index 537de6d2..61cac2a2 100644 --- a/rescript/get_gtdb.py +++ b/rescript/get_gtdb.py @@ -10,135 +10,139 @@ import tempfile import tarfile import warnings - -import qiime2 +import pandas as pd from urllib.request import urlretrieve from collections import defaultdict from urllib.error import HTTPError -from rescript.get_data import _gzip_decompress + +from q2_types.feature_data import (TSVTaxonomyFormat, DNAFASTAFormat, + DNAIterator) # Different versions may have different file names for archaea and # bacteria. for example 'ar53' and 'bac120' mean that the GTDB phylogeny # is based on 53 and 120 concatenated proteins (cp), respectively. # If this changes we can set up a conditional statemnt below. -VERSION_MAP_DICT = {'214': {'Archaea': 'ar53', 'Bacteria': 'bac120'}, - '207': {'Archaea': 'ar53', 'Bacteria': 'bac120'}, - '202': {'Archaea': 'ar122', 'Bacteria': 'bac120'}} - +VERSION_MAP_DICT = {'214.1': {'Archaea': 'ar53', 'Bacteria': 'bac120'}, + '214.0': {'Archaea': 'ar53', 'Bacteria': 'bac120'}, + '207.0': {'Archaea': 'ar53', 'Bacteria': 'bac120'}, + '202.0': {'Archaea': 'ar122', 'Bacteria': 'bac120'}} -def get_gtdb_data(ctx, version='214', domain='Both'): - ver_dom_dict = defaultdict(lambda: defaultdict(dict)) +def get_gtdb_data( + version: str = '214.1', + domain: str = 'Both', + db_type: str = 'SpeciesReps', + ) -> (TSVTaxonomyFormat, DNAFASTAFormat): - # Subset dict if needed, but keep same structure - # Although we can run the following merge actions on a list of one - # i.e. 'Archaea', we do not want to confuse anyone when looking - # at provenance, by running a merge command for no reason. - if domain == 'Both': - ver_dom_dict[version] = VERSION_MAP_DICT[version] - else: - ver_dom_dict[version][domain] = VERSION_MAP_DICT[version][domain] - - queries = _assemble_queries(ver_dom_dict) + queries = _assemble_queries(version=version, + db_type=db_type, + domain=domain) tax_q, seqs_q = _retrieve_data_from_gtdb(queries) - if domain == 'Both': - merge_gtdb_seqs = ctx.get_action('feature_table', 'merge_seqs') - merge_gtdb_taxonomy = ctx.get_action('feature_table', 'merge_taxa') - print('\n Merging taxonomy data...') - gtdb_tax, = merge_gtdb_taxonomy(data=tax_q) - print('\n Merging sequence data...') - gtdb_seqs, = merge_gtdb_seqs(data=seqs_q) - else: - gtdb_tax = tax_q[0] - gtdb_seqs = seqs_q[0] - print('\n Saving files...\n') - return gtdb_tax, gtdb_seqs - - -def _assemble_queries(ver_dom_dict): - queries = defaultdict(lambda: list()) - - base_seq_url = ('https://data.gtdb.ecogenomic.org/releases/release{ver}/' - '{ver}.0/genomic_files_reps/{cp}_ssu_reps_r{ver}.tar.gz') - base_tax_url = ('https://data.gtdb.ecogenomic.org/releases/release{ver}/' - '{ver}.0/{cp}_taxonomy_r{ver}.tsv.gz') - - for version, dcp in ver_dom_dict.items(): - for dom, cp in dcp.items(): - - queries['Taxonomy'].append( - (dom, - base_tax_url.format(**{'ver': version, 'cp': cp}), - 'FeatureData[Taxonomy]', - 'HeaderlessTSVTaxonomyFormat')) - - queries['Sequence'].append( - (dom, - base_seq_url.format(**{'ver': version, 'cp': cp}), - 'FeatureData[Sequence]', - 'DNAFASTAFormat')) + return tax_q, seqs_q + + +def _assemble_queries(version='214.1', + db_type='SpeciesReps', + domain='Both'): + queries = [] + base_url = 'https://data.gtdb.ecogenomic.org/releases/' + base_version = version.split('.')[0] + # ^^ Set `base_version` variable becuase number after the decimal is + # only used for the directory. GTDB trims this off for the actual + # file names... + + if db_type == 'SpeciesReps': + ver_dom_dict = defaultdict(lambda: defaultdict(dict)) + + if domain == 'Both': + ver_dom_dict[version] = VERSION_MAP_DICT[version] + else: + ver_dom_dict[version][domain] = VERSION_MAP_DICT[version][domain] + + full_url = (base_url + 'release{bver}/{ver}/genomic_files_reps/' + '{cp}_ssu_reps_r{bver}.tar.gz') + + for version, dcp in ver_dom_dict.items(): + for dom, cp in dcp.items(): + queries.append((dom, + full_url.format(**{'ver': version, + 'bver': base_version, + 'cp': cp}))) + elif db_type == 'All': + # Note: GTDB does not maintain separate 'Bacteria' and + # 'Archaea' files for 'All'. This is only done for + # the 'SpeciesReps'. + full_url = (base_url + 'release{bver}/{ver}/genomic_files_all/' + 'ssu_all_r{bver}.tar.gz') + + queries.append((db_type, + full_url.format(**{'ver': version, + 'bver': base_version}))) return queries -def _retrieve_data_from_gtdb(queries): - ''' - Download data from gtdb, given a list of queries. - - queries: {'Taxonomy':(domain, url, type, format), (...), - 'Sequence':(domain, url, type, format), (...)} - ''' +def parse_gtdb_taxonomy(tax_str): + tax = tax_str.split()[0] + return tax - tax_results = [] - seq_results = [] - - print('\nDownloading and processing raw files ... \n') - with tempfile.TemporaryDirectory() as tmpdirname: - for sttype, q_info in queries.items(): - for domain, url, dtype, fmt in q_info: - print('Retrieving {0} for {1} from {2}'.format( - sttype, domain, url)) - # grab url - bn = os.path.basename(url) - destination = os.path.join(tmpdirname, bn) - try: - urlretrieve(url, destination) - except HTTPError: - msg = ("Unable to retrieve the followng file from GTDB:\n " - + url) - warnings.warn(msg, UserWarning) - # seq files are contained within `tar.gz` - if tarfile.is_tarfile(destination): - seq_results.append(get_tar_data(destination, bn, - tmpdirname, dtype, fmt)) - else: - tax_results.append(get_gzipped_data(destination, bn, - dtype, fmt)) - return tax_results, seq_results - - -def get_tar_data(tar_loc, base_fn, tmpdirname, dtype, fmt): +def _get_gtdb_data_path(tmpdirname, url, basename): + destination = os.path.join(tmpdirname, basename) try: - untarred_fn = base_fn.split('.')[0]+'.fna' - print(' Untarring {0}...\n'.format(base_fn)) - with tarfile.open(tar_loc, 'r') as tar: + print('Retrieving data from {0}'.format(url)) + urlretrieve(url, destination) + except HTTPError: + msg = ("Unable to retrieve the followng file from GTDB:\n " + + url) + warnings.warn(msg, UserWarning) + return destination + + +def _extract_seq_tar_file(tmpdirname, untarred_fn, destination): + if tarfile.is_tarfile(destination): + with tarfile.open(destination, 'r') as tar: + print(' Untarring {0}...\n'.format(untarred_fn)) tar.extract(member=untarred_fn, path=tmpdirname) - return qiime2.Artifact.import_data(dtype, os.path.join( - tmpdirname, untarred_fn), fmt) - except OSError: - raise OSError(('{0}: either does not exist or can not ' - 'be untarred!'.format(tar_loc))) + # read through gtdb fasta file + seqs = DNAFASTAFormat(os.path.join( + tmpdirname, untarred_fn), + mode="r").view(DNAIterator) + return seqs -def get_gzipped_data(zipped_loc, base_fn, dtype, fmt): - try: - unzipped_destination = os.path.splitext(zipped_loc)[0] - print(' Unzipping {0}...\n'.format(base_fn)) - _gzip_decompress(zipped_loc, unzipped_destination) - return qiime2.Artifact.import_data(dtype, unzipped_destination, fmt) - except OSError: - raise OSError(('{0}: either does not exist or can not ' - 'be unzipped!'.format(unzipped_destination))) +def _retrieve_data_from_gtdb(queries): + proc_seqs = DNAFASTAFormat() + proc_tax = TSVTaxonomyFormat() + tax_dict = {} + + print('\nDownloading and processing raw files ... \n') + with \ + tempfile.TemporaryDirectory() as tmpdirname, \ + proc_seqs.open() as out_fasta, \ + proc_tax.open() as out_tax: + + for domain, url in queries: + # variable setup + basename = os.path.basename(url) + untarred_fn = basename.split('.')[0]+'.fna' + destination = _get_gtdb_data_path(tmpdirname, url, basename) + seqs = _extract_seq_tar_file(tmpdirname, untarred_fn, destination) + print(' Writing data from \'{0}\'.\n'.format(domain)) + for seq in seqs: + seq.write(out_fasta) # write seq to new fasta file + # add taxonomy to dict: + tax_dict[seq.metadata['id']] = parse_gtdb_taxonomy( + seq.metadata['description']) + # set up final taxonomy dataframe: + print(' Sequences processed.') + print(' Processing taxonomy...') + parsed_taxonomy_df = pd.DataFrame.from_dict(tax_dict, orient='index') + parsed_taxonomy_df.index.name = 'Feature ID' + parsed_taxonomy_df.rename(columns={parsed_taxonomy_df.columns[0]: + 'Taxon'}, inplace=True) + parsed_taxonomy_df.to_csv(out_tax, sep='\t') + print(' Taxonomy processed.') + return proc_tax, proc_seqs diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py index 99bb2824..4ade7cd9 100644 --- a/rescript/plugin_setup.py +++ b/rescript/plugin_setup.py @@ -949,29 +949,39 @@ ) -plugin.pipelines.register_function( +plugin.methods.register_function( function=get_gtdb_data, inputs={}, parameters={ - 'version': Str % Choices(['202', '207', '214']), + 'version': Str % Choices(['202.0', '207.0', '214.0', '214.1']), 'domain': Str % Choices(['Both', 'Bacteria', 'Archaea']), + 'db_type': Str % Choices(['All', 'SpeciesReps']) }, outputs=[('gtdb_taxonomy', FeatureData[Taxonomy]), ('gtdb_sequences', FeatureData[Sequence])], input_descriptions={}, parameter_descriptions={ 'version': 'GTDB database version to download.', - 'domain': 'Sequence and taxonomy data to download from a given ' + 'domain': 'SSU sequence and taxonomy data to download from a given ' 'microbial domain from GTDB. \'Both\' will fetch both ' 'bacterial and archaeal data. \'Bacteria\' will only ' 'fetch bacterial data. \'Archaea\' will only fetch ' - 'archaeal data.'}, + 'archaeal data. This only applies to \'db-type ' + 'SpeciesReps\'.', + 'db_type': '\'All\': All SSU data that pass the quality-control ' + 'of GTDB, but are not clustered into representative ' + 'species. \'SpeciesReps\': SSU gene sequences ' + 'identified within the set of representative ' + 'species. Note: if \'All\' is used, the \'domain\' ' + 'parameter will be ignored as GTDB does not maintain ' + 'separate domain-level files for these non-clustered ' + 'data.'}, output_descriptions={ - 'gtdb_taxonomy': 'GTDB reference taxonomy.', - 'gtdb_sequences': 'GTDB reference sequences.'}, - name='Download, parse, and import GTDB reference data.', + 'gtdb_taxonomy': 'SSU GTDB reference taxonomy.', + 'gtdb_sequences': 'SSU GTDB reference sequences.'}, + name='Download, parse, and import SSU GTDB reference data.', description=( - 'Download, parse, and import GTDB files, given a version ' + 'Download, parse, and import SSU GTDB files, given a version ' 'number. Downloads data directly from GTDB, ' 'parses the taxonomy files, and outputs ready-to-use sequence and ' 'taxonomy artifacts. REQUIRES STABLE INTERNET CONNECTION. ' + diff --git a/rescript/tests/data/gtdb-seqs.fasta b/rescript/tests/data/gtdb-seqs.fasta new file mode 100644 index 00000000..2538451a --- /dev/null +++ b/rescript/tests/data/gtdb-seqs.fasta @@ -0,0 +1,9 @@ +>GB_GCA_000008085.1 +CCCGTTGATCCTGCGGGAGGCCACCGCTATCTCCGTCCGGCTAACCCATGGAAGGCGAGGGTCCCCGGGTAAGGGGGCCCGCCGCACGGCTGAGTAACACGTCGGTAACCTACCCTCGGGACGGGGATAACCCCGGGAAACTGGGGCTAATCCCCGATAGGGGATGGGTGCTGGAAGGCCCCATCCCCGAGAGGGGCTAGCGGTACTTCCCCCGCTAGCCCGCCCGAGGATGGGCCGGCGGCCCATCAGGTAGTTGGCGGGGTAATGGCCCGCCAAGCCGAAGACGGGTAGGGGCCGTGAGAGCGGGAGCCCCCAGATCGGCACTGAGACAAGGGCCGAGGCCCTACGGGGCGCACCAGGGGCGAAACCTCCGCAATGCGGGAAACCGTGACGGGGGGACGGAGAGTGCCGGAGGGCGTTATGCTCTCCGGCTTTTGGGGAGTGTAAGTAGCTCCCCGAATAAGCGGTGGGCAAGAGGGGTGGCAGCCGCCGCGGGAACACCCCCACCGCGAGCGGTGGCCGTGATTATTGGGCCTAAAGGGGCCGTAGCCGGGCCGGTGTGGCTCCGGTGAAATCCTCGGGCTCAACCCGAGGGCGCGCCGGAGCTACTACCGGCCTAGGGACCGGGAGGGGCCGACCGTACTCCCGGGGGAGCGGTGAAATGCTGTAATCCCGGGAGGACGACCCGTGGCGAAAGCGGTCGGCCAGAACGGGTCCGACGGTGAGGGCCGAAGGCCGGGGGCTAGAACGGGATTAGAGACCCCGGTATTCCCGGCTGTCAACGCTGCGGGCTACCTGCTGGGCGGGCTACGAGCCCGCCCAGTGGGGTAGGGAAGCCGTTAAGCCCGCCGCCTGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATAGGCGGGGGAGCACACAAGAGGTGGGGTGCGCGGTTTAATTGGATTCGACGCCGGGAACCTCACCGGGGCTGACAGCACAATGATGGTCGGCCTGAAGGGCCTACCGGAGGCGCTGAGAGGAGGTGCATGGCCGCCGTCAGCCTGTGCCGTGAGGTGCCCTGTTAAGTCAGGAAACAGGCGAGACCCGCGCCCGCAGTTGCGACGGCCGAAAGGCCGGCACACTGCGGGGACTGCCGGGGAAACCCGGAGGAAGGTGCGGGCGACGGCAGGTATGCATGCCCCGAATGCCCCGGGCTACACGCGCGCATCAATGGGCGGGACAGGGGGCCGCGACCCCGAAAGGGGGAGCAAATCCCCAAACCCGCTCTCAGTCCAGATCGAGGGCTGCAACTCGCCCTCGTGACGGCGGAATCTCTAGTAGTCGGACGTCACCAGCGTCCGGCGAATACGTCCCTGCTCCTTGCACTCACCGCCCGTCAAGCCACCCGAGCTGGGGCCTAGCGAGGCCGTGGGGGGTTCGCCCCCCACGGTCGAGCTAGGCCCCGGCGAGGGGGGCTAAGTCGACACAAGGTAGCCGTAGGGGAACCTGCGGCTGGATCACCTCCTA +>GB_GCA_000016605.1 +TCCGGTTGATCCTGCCGGACCCGATCGCTATAGGGGTAGGGCTAAGCCATGGGAGTCGTACGCTCTCGGGAAGAGGGCGTGGCGGACGGCTGAGTAACACGTGGCTAACCTGCCCTTGGGATCTGGATAACCCCGGGAAACTGGGGCTAATCCGGAGCGGGCAAGGGAATCTGGAATGATCTCTTGCCTAAAAGCCTCTCGGCTGATCCCGTCGAGAGGCGCCCAAGGATGGGGCTGCGGCCCATCAGGCTGTTGGGGGAGTAAAGGTCCCCCAAACCGATAACGGGTAGGGGCCGTGGGAGCGGGAGCCCCCAGTTGGGCACTGAGACAAGGGCCCAGGCCCTACGGGGCGCACCAGGCGCGGAACGTCCCCAATGCGGGAAACCGTGAGGGCGTTACCCCTAGTGCCCTCGCAAGAGGGCTTTTCTCCACTCCAGAAAGGTGGAGGAATAAGCGGGGGGCAAGACTGGTGTCAGCCGCCGCGGTAATACCAGCCCCGCGAGTGATCGGGACGTTTATTGGGCTTAAAGCGCCCGTAGCCGGCCTGTAAAGTCACCGTTTAAAGACCCGGGCTCAACTCGGGGAACGGCGGTGATACTTACAGGCTAGGGGGCGGGAGAGGTCGGAGGTACTCCCGGAGTAGGGGCGAAATCCTCAGATCCCGGGAGGACCACCAGTGGCGAAAGCGTCCGGCTAGAACGCGCCCGACGGTGAGGGGCGAAAGCCGGGGTAGCAAATAGGATTAGATACCCTAGTAGTCCCGGCTGTAAACGATGCAGGCTAGGTGTCGCGTAGGCTTTGTGCCTGCGCGGTGCCGCAGGAAAACTGGTAAGCCCGCCGCCTGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAAGGGGTGGAACCTGCGGCTCAATTGGAGTCAACGCCTGGAATCTCACCGGGGGAGACCGCAGGATGACGGCCAGGCTAACGACCTTGCCAGACTCGCGGAGAGGAGGTGCATGGCCGTCGCCAGCTCGTGTTGTGAAATGTCCGGTTAAGTCCGGCAACGAGCGAGACCCCCACTTCTAGTTGGTAACCGTCTCTCCGGAGACGGTCCACACTAGAAGGACTGCCGGTGTTAAACCGGAGGAAGGAGGGGGCCACGGCAGGTCAGCATGCCCCGAAACTTCCGGGCCGCACGCGGGTTACAATGGCAGGGACAGCGGGATCCGACCCCGAGAGGGGAAGGCAATCCCACAAACCCTGCCTCAGTTGGGATCGAGGGCTGAAACTCGCCCTCGTGAACGAGGAATCCCTAGTAACCGCGGGTCAACAACCCGCGGTGAATACGTCCCTGCTCCTTGCACACACCGCCCGTCGCTCCACCCGAGTGGAGGGGAAGTGAGGCCTCTTGCCCCTCGGGGTGGGAGGTCGAGCTTCTCCTCCGCGAGGGGGGAGAAGTCGTAACAAGGTAGCCGTAGGGGAACCTGCGGCTGGATCACCTCA +>GB_GCA_000007325.1 +GAAGAGTTTGATCCTGGCTCAGGATGAACGCTGACAGAATGCTTAACACATGCAAGTCTACTTGAATTTGGGTTTTTTAACTTCGATTTGGGTGGCGGACGGGTGAGTAACGCGTAAAGAACTTGCCTCACAGCTAGGGACAACATTTGGAAACGAATGCTAATACCTGATATTATGATTATAGGGCATCCTAGAATTATGAAAGCTATATGCGCTGTGAGAGAGCTTTGCGTCCCATTAGCTAGTTGGAGAGGTAACGGCTCACCAAGGCGATGATGGGTAGCCGGCCTGAGAGGGTGAACGGCCACAAGGGGACTGAGACACGGCCCTTACTCCTACGGGAGGCAGCAGTGGGGAATATTGGACAATGGACCGAGAGTCTGATCCAGCAATTCTGTGTGCACGATGACGTTTTTCGGAATGTAAAGTGCTTTCAGTTGGGAAGAAAAAAATGACGGTACCAACAGAAGAAGTGACGGCTAAATACGTGCCAGCAGCCGCGGTAATACGTATGTCACGAGCGTTATCCGGATTTATTGGGCGTAAAGCGCGTCTAGGTGGTTATGTAAGTCTGATGTGAAAATGCAGGGCTCAACTCTGTATTGCGTTGGAAACTGTGTAACTAGAGTACTGGAGAGGTAAGCGGAACTACAAGTGTAGAGGTGAAATTCGTAGATATTTGTAGGAATGCCGATGGGGAAGCCAGCTTACTGGACAGATACTGACGCTGAAGCGCGAAAGCGTGGGTAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGATTACTAGGTGTTGGGGGTCGAACCTCAGCGCCCAAGCAAACGCGATAAGTAATCCGCCTGGGGAGTACGTACGCAAGTATGAAACTCAAAGGAATTGACGGGGACCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGACGCAACGCGAGGAACCTTACCAGCGTTTGACATCTTAGGAATGAGACAGAGATGTTTCAGTGTCCCTTCGGGGAAACCTAAAGACAGGTGGTGCATGGCTGTCGTCAGCTCGTGTCGTGAGATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCCTTTCGTATGTTACCATCATTAAGTTGGGGACTCATGCGATACTGCCTACGATGAGTAGGAGGAAGGTGGGGATGACGTCAAGTCATCATGCCCCTTATACGCTGGGCTACACACGTGCTACAATGGGTAGAACAGAGAGTTGCAAAGCCGTGAGGTGGAGCTAATCTCAGAAAACTATTCTTAGTTCGGATTGTACTCTGCAACTCGAGTACATGAAGTTGGAATCGCTAGTAATCGCGAATCAGCAATGTCGCGGTGAATACGTTCTCGGGTCTTGTACACACCGCCCGTCACACCACGAGAGTTGGTTGCACCTGAAGTAGCAGGCCTAACCGTAAGGAGGGATGTTCCGAGGGTGTGATTAGCGATTGGGGTGAAGTCGTAACAAGGTATCCGTACGGGAACGTGCGGATGGATCACCTCCTTTC +>GB_GCA_000008885.1 +GGAGGTGATCCAACCACAGGTTCCCCTACGGTTACCTTGTTACGACTTCACCCCAGTTATGAACCACAAAGTGGTAAGCGCCCTCCGAAAGGTTAAGCTACCTGCTTCTTTTGCAGCTCACTTCCATGGTGTGACGGGCGGTGTGTACAAGGCCCGGGAACGTATTCACCGCGGCATGCTGATCCGCGATTACTAGCGATTCCGACTTCATGGAGTCGAGTTGCAGACTCCAATCCGGACTAAGACGTACTTTATGAGATTAGCTTACTTTCGCAAGTTTGCTGCCCTTTGTATACGCCATTGTAGCACGTGTGTAGCCCTACTCGTAAGGGCCATGATGACTTGACGTCATCCCCACCTTCCTCCGGTTTATCACCGGCAGTCTCCTTTGAGTTCCCGACTTTTTCGCTGGCAAAAAAGGATAGGGGTTGCGCTCGTTGCGGGACTTAACCCAACATTTCACAACACGAGCTGACGACAGCCATGCAGCACCTGTTTTAAAGCTCCCGAAGGCACTAAAGCATCTCTGCTAAATTCTTTAAATGTCAAGAGTAGGTAAGGTTTTTCGCGTTGCATCGAATTAAACCACATGCTCCACCGCTTGTGCGGGCCCCCGTCAATTCATTTGAGTTTTAACCTTGCGATTGTACTCCCCAGGCGGTCGATTTAACGCGTTAGCTTCGAAAACCCCGAGTAAACTCGCAACCTTCAAATCGACATCGTTTACGGCATGGACTACCAGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTACATGAGCGTCAGTTTTCGCCCAGGAGGCCGCCTTCGCCGCTGGTATTCCTCCAGATATCTACGCATTTCACCGCTACACCTGGAATTCTACCTCCCTCTACGATACTCTAGTTTATTAGTTTCAAATGCAGTTCCTAGGTTGAGCCTAGGGATTTCACATCTGACTTAATAAACCGCCTGCGTACGCTTTACGCCCAGTAATTCCGATTAACGCTTGCACCCTCCGTATTACCGCGGCTGCTGGCACGGAGTTAGCCGGTGCTTCTTCTGCAAGTAACGTCACATAAATATGGTATTATCACATTTACTTTCTTCCCTGCTGAAAGTGCTTTACAATCCGAAGACCTTCTTCACACACGCGGCATAGCTGCATCAGGGTTTCCCCCATTGTGCAATATTCCCCACTGCTGCCTCCCGTAGGAGTCTGGACCGTATCTCAGTTCCAGTGTGGCTGGTTATCCTCTCAGACCAGCTAGAGATCGTAGCCTAGGTGAGCATTTACCTCACCTACTAGCTAATCTCATCTGGGTTCATCTAAAAACGCAAGGCTGATATAAAGTATTATATTAGTCCCCTGCTTTGATCTTTCGATATTATGCGGTATTAGCTACCGTTTCCAGTAGTTGTCCCCCTTTTTTAGGCAGATCCCCAGACATTACTCACCCGTTCGCCGCTCGCCGTCAAAGAAAAATCTCTACGCTGCCGCACGACTTGCATGTGTTAGGCTTGCCGCTAGCGTTCAATCTGAGCCATGATCAAACTCTTCAAT + diff --git a/rescript/tests/data/gtdb-tax.tsv b/rescript/tests/data/gtdb-tax.tsv new file mode 100644 index 00000000..19321370 --- /dev/null +++ b/rescript/tests/data/gtdb-tax.tsv @@ -0,0 +1,6 @@ +Feature ID Taxon +GB_GCA_000008085.1 d__Archaea;p__Nanoarchaeota;c__Nanoarchaeia;o__Nanoarchaeales;f__Nanoarchaeaceae;g__Nanoarchaeum;s__Nanoarchaeum equitans +GB_GCA_000016605.1 d__Archaea;p__Thermoproteota;c__Thermoproteia;o__Sulfolobales;f__Sulfolobaceae;g__Metallosphaera;s__Metallosphaera sedula +GB_GCA_000007325.1 d__Bacteria;p__Fusobacteriota;c__Fusobacteriia;o__Fusobacteriales;f__Fusobacteriaceae;g__Fusobacterium;s__Fusobacterium nucleatum +GB_GCA_000008885.1 d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales_A;f__Enterobacteriaceae_A;g__Wigglesworthia;s__Wigglesworthia glossinidia_A + diff --git a/rescript/tests/test_get_gtdb.py b/rescript/tests/test_get_gtdb.py index 6f4bb871..38602db2 100644 --- a/rescript/tests/test_get_gtdb.py +++ b/rescript/tests/test_get_gtdb.py @@ -6,11 +6,11 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- -import qiime2 import pkg_resources from qiime2.plugin.testing import TestPluginBase from qiime2.plugins import rescript -from rescript.get_gtdb import (VERSION_MAP_DICT, _assemble_queries) +from rescript.get_gtdb import _assemble_queries +from q2_types.feature_data import (TSVTaxonomyFormat, DNAFASTAFormat) from urllib.request import urlopen from urllib.error import HTTPError @@ -22,69 +22,91 @@ class TestGetGTDB(TestPluginBase): def setUp(self): super().setUp() - self.arch_tax = qiime2.Artifact.import_data( - 'FeatureData[Taxonomy]', + self.gtdb_tax = TSVTaxonomyFormat( pkg_resources.resource_filename( - 'rescript.tests', - 'data/gtdb-taxa-archaea.tsv')) - self.bact_tax = qiime2.Artifact.import_data( - 'FeatureData[Taxonomy]', + 'rescript.tests', + 'data/gtdb-tax.tsv'), + mode='r') + self.gtdb_seqs = DNAFASTAFormat( pkg_resources.resource_filename( - 'rescript.tests', - 'data/gtdb-taxa-bacteria.tsv')) - self.arch_seqs = qiime2.Artifact.import_data( - 'FeatureData[Sequence]', + 'rescript.tests', + 'data/gtdb-seqs.fasta'), + mode='r') + self.gtdb_arch_tax = TSVTaxonomyFormat( pkg_resources.resource_filename( - 'rescript.tests', - 'data/gtdb-seqs-archaea.fasta')) - self.bact_seqs = qiime2.Artifact.import_data( - 'FeatureData[Sequence]', + 'rescript.tests', + 'data/gtdb-taxa-archaea.tsv'), + mode='r') + self.gtdb_arch_seqs = DNAFASTAFormat( pkg_resources.resource_filename( - 'rescript.tests', - 'data/gtdb-seqs-bacteria.fasta')) + 'rescript.tests', + 'data/gtdb-seqs-archaea.fasta'), + mode='r') + self.gtdb_bact_tax = TSVTaxonomyFormat( + pkg_resources.resource_filename( + 'rescript.tests', + 'data/gtdb-taxa-bacteria.tsv'), + mode='r') + self.gtdb_bact_seqs = DNAFASTAFormat( + pkg_resources.resource_filename( + 'rescript.tests', + 'data/gtdb-seqs-bacteria.fasta'), + mode='r') # test that appropriate URLs are assembled - def test_assemble_queries(self): - queries = _assemble_queries(VERSION_MAP_DICT) - - obs_tax_urls = [q_info[1] for q_info in queries['Taxonomy']] - obs_seq_urls = [q_info[1] for q_info in queries['Sequence']] - - exp_tax_urls = [('https://data.gtdb.ecogenomic.org/releases/' - 'release214/214.0/ar53_taxonomy_r214.tsv.gz'), - ('https://data.gtdb.ecogenomic.org/releases/' - 'release214/214.0/bac120_taxonomy_r214.tsv.gz'), - ('https://data.gtdb.ecogenomic.org/releases/' - 'release207/207.0/ar53_taxonomy_r207.tsv.gz'), - ('https://data.gtdb.ecogenomic.org/releases/' - 'release207/207.0/bac120_taxonomy_r207.tsv.gz'), - ('https://data.gtdb.ecogenomic.org/releases/' - 'release202/202.0/ar122_taxonomy_r202.tsv.gz'), - ('https://data.gtdb.ecogenomic.org/releases' - '/release202/202.0/bac120_taxonomy_r202.tsv.gz')] - exp_seq_urls = [('https://data.gtdb.ecogenomic.org/releases' - '/release214/214.0/genomic_files_reps/' - 'ar53_ssu_reps_r214.tar.gz'), - ('https://data.gtdb.ecogenomic.org/releases/' - 'release214/214.0/genomic_files_reps/' - 'bac120_ssu_reps_r214.tar.gz'), - ('https://data.gtdb.ecogenomic.org/releases' - '/release207/207.0/genomic_files_reps/' - 'ar53_ssu_reps_r207.tar.gz'), - ('https://data.gtdb.ecogenomic.org/releases/' - 'release207/207.0/genomic_files_reps/' - 'bac120_ssu_reps_r207.tar.gz'), - ('https://data.gtdb.ecogenomic.org/releases/' - 'release202/202.0/genomic_files_reps/' - 'ar122_ssu_reps_r202.tar.gz'), - ('https://data.gtdb.ecogenomic.org/releases/' - 'release202/202.0/genomic_files_reps/' - 'bac120_ssu_reps_r202.tar.gz')] - self.assertEqual(obs_tax_urls, exp_tax_urls) - self.assertEqual(obs_seq_urls, exp_seq_urls) + def test_assemble_species_rep_queries(self): + obs_query_urls = _assemble_queries('214.1', 'SpeciesReps', 'Both') + print('obs queries: ', obs_query_urls) + + exp_query_urls = [('Archaea', + 'https://data.gtdb.ecogenomic.org/releases/' + 'release214/214.1/genomic_files_reps/' + 'ar53_ssu_reps_r214.tar.gz'), + ('Bacteria', + 'https://data.gtdb.ecogenomic.org/releases/' + 'release214/214.1/genomic_files_reps/' + 'bac120_ssu_reps_r214.tar.gz')] + print('exp queries: ', exp_query_urls) + self.assertEqual(obs_query_urls, exp_query_urls) + + # test that these URLs work + for _, u in obs_query_urls: + try: + urlopen(u) + except HTTPError: + raise ValueError('Failed to open URL: ' + u) + + def test_assemble_species_rep_queries_archaea(self): + obs_query_urls = _assemble_queries('202.0', 'SpeciesReps', 'Archaea') + print('obs queries: ', obs_query_urls) + + exp_query_urls = [('Archaea', + 'https://data.gtdb.ecogenomic.org/releases/' + 'release202/202.0/genomic_files_reps/' + 'ar122_ssu_reps_r202.tar.gz')] + print('exp queries: ', exp_query_urls) + self.assertEqual(obs_query_urls, exp_query_urls) + + # test that these URLs work + for _, u in obs_query_urls: + try: + urlopen(u) + except HTTPError: + raise ValueError('Failed to open URL: ' + u) + + def test_assemble_queries_all(self): + obs_query_urls = _assemble_queries('207.0', 'All') + print('obs queries: ', obs_query_urls) + + exp_query_urls = [('All', + 'https://data.gtdb.ecogenomic.org/releases/' + 'release207/207.0/genomic_files_all/' + 'ssu_all_r207.tar.gz')] + print('exp queries: ', exp_query_urls) + self.assertEqual(obs_query_urls, exp_query_urls) # test that these URLs work - for u in obs_tax_urls + obs_seq_urls: + for _, u in obs_query_urls: try: urlopen(u) except HTTPError: @@ -92,36 +114,42 @@ def test_assemble_queries(self): def test_get_gtdb(self): def _makey_fakey_both(faking_ignore_this): - return [self.arch_tax, self.bact_tax], [self.arch_seqs, - self.bact_seqs] + return self.gtdb_tax, self.gtdb_seqs def _makey_fakey_arch(faking_ignore_this): - return [self.arch_tax], [self.arch_seqs] + return self.gtdb_arch_tax, self.gtdb_arch_seqs def _makey_fakey_bact(faking_ignore_this): - return [self.bact_tax], [self.bact_seqs] + return self.gtdb_bact_tax, self.gtdb_bact_seqs # default (both domains, version 214) with patch('rescript.get_gtdb._retrieve_data_from_gtdb', new=_makey_fakey_both): res = rescript.actions.get_gtdb_data( - version='214', domain='Both') - self.assertEqual(len(res), 2) + version='214.1', db_type='SpeciesReps', domain='Both') self.assertEqual(str(res[0].type), 'FeatureData[Taxonomy]') self.assertEqual(str(res[1].type), 'FeatureData[Sequence]') + # just grab archaea domain, and version 202 with patch('rescript.get_gtdb._retrieve_data_from_gtdb', new=_makey_fakey_arch): resa = rescript.actions.get_gtdb_data( - version='202', domain='Archaea') - self.assertEqual(len(resa), 2) + version='202.0', domain='Archaea') self.assertEqual(str(resa[0].type), 'FeatureData[Taxonomy]') self.assertEqual(str(resa[1].type), 'FeatureData[Sequence]') + # just grab bacteria domain, and version 207 with patch('rescript.get_gtdb._retrieve_data_from_gtdb', new=_makey_fakey_bact): resb = rescript.actions.get_gtdb_data( - version='207', domain='Bacteria') - self.assertEqual(len(resb), 2) + version='207.0', domain='Bacteria') self.assertEqual(str(resb[0].type), 'FeatureData[Taxonomy]') self.assertEqual(str(resb[1].type), 'FeatureData[Sequence]') + + # Non-species rep version 214.1) + with patch('rescript.get_gtdb._retrieve_data_from_gtdb', + new=_makey_fakey_both): + resc = rescript.actions.get_gtdb_data( + version='214.1', db_type='All') + self.assertEqual(str(resc[0].type), 'FeatureData[Taxonomy]') + self.assertEqual(str(resc[1].type), 'FeatureData[Sequence]')