From 53c68bb4ecbad48de316cc2683ef127e879617e5 Mon Sep 17 00:00:00 2001 From: mikerobeson Date: Thu, 25 Apr 2024 16:29:09 -0500 Subject: [PATCH 1/2] adding gtdb v220 to get-gtdb-data --- rescript/get_gtdb.py | 56 +++++++++++++++++++++++---------- rescript/plugin_setup.py | 3 +- rescript/tests/test_get_gtdb.py | 12 ++++--- 3 files changed, 49 insertions(+), 22 deletions(-) diff --git a/rescript/get_gtdb.py b/rescript/get_gtdb.py index 61cac2a..43ab0a0 100644 --- a/rescript/get_gtdb.py +++ b/rescript/get_gtdb.py @@ -8,7 +8,9 @@ import os import tempfile +import shutil import tarfile +import gzip import warnings import pandas as pd from urllib.request import urlretrieve @@ -22,14 +24,15 @@ # bacteria. for example 'ar53' and 'bac120' mean that the GTDB phylogeny # is based on 53 and 120 concatenated proteins (cp), respectively. # If this changes we can set up a conditional statemnt below. -VERSION_MAP_DICT = {'214.1': {'Archaea': 'ar53', 'Bacteria': 'bac120'}, +VERSION_MAP_DICT = {'220.0': {'Archaea': 'ar53', 'Bacteria': 'bac120'}, + '214.1': {'Archaea': 'ar53', 'Bacteria': 'bac120'}, '214.0': {'Archaea': 'ar53', 'Bacteria': 'bac120'}, '207.0': {'Archaea': 'ar53', 'Bacteria': 'bac120'}, '202.0': {'Archaea': 'ar122', 'Bacteria': 'bac120'}} def get_gtdb_data( - version: str = '214.1', + version: str = '220.0', domain: str = 'Both', db_type: str = 'SpeciesReps', ) -> (TSVTaxonomyFormat, DNAFASTAFormat): @@ -43,7 +46,7 @@ def get_gtdb_data( return tax_q, seqs_q -def _assemble_queries(version='214.1', +def _assemble_queries(version='220.0', db_type='SpeciesReps', domain='Both'): queries = [] @@ -61,8 +64,14 @@ def _assemble_queries(version='214.1', else: ver_dom_dict[version][domain] = VERSION_MAP_DICT[version][domain] - full_url = (base_url + 'release{bver}/{ver}/genomic_files_reps/' - '{cp}_ssu_reps_r{bver}.tar.gz') + # GTDB v220 started storing the ssu_reps FASTA files + # as 'fna.gz' instead of their usual 'tar.gz'. + if version == '220.0': + full_url = (base_url + 'release{bver}/{ver}/genomic_files_reps/' + '{cp}_ssu_reps_r{bver}.fna.gz') + else: + full_url = (base_url + 'release{bver}/{ver}/genomic_files_reps/' + '{cp}_ssu_reps_r{bver}.tar.gz') for version, dcp in ver_dom_dict.items(): for dom, cp in dcp.items(): @@ -73,9 +82,14 @@ def _assemble_queries(version='214.1', elif db_type == 'All': # Note: GTDB does not maintain separate 'Bacteria' and # 'Archaea' files for 'All'. This is only done for - # the 'SpeciesReps'. - full_url = (base_url + 'release{bver}/{ver}/genomic_files_all/' - 'ssu_all_r{bver}.tar.gz') + # the 'SpeciesReps'. Again, account for filename changes + # to 'fna.gz' in v220. + if version == '220.0': + full_url = (base_url + 'release{bver}/{ver}/genomic_files_all/' + 'ssu_all_r{bver}.fna.gz') + else: + full_url = (base_url + 'release{bver}/{ver}/genomic_files_all/' + 'ssu_all_r{bver}.tar.gz') queries.append((db_type, full_url.format(**{'ver': version, @@ -100,16 +114,26 @@ def _get_gtdb_data_path(tmpdirname, url, basename): return destination -def _extract_seq_tar_file(tmpdirname, untarred_fn, destination): +def _extract_seq_file(tmpdirname, uncompressed_fn, destination): + # GTDB v220 updated the file format for the "ssu_rep" FASTA files + # to 'fna.gz' instead of `tar.gz`. We now need to check which + # form the file is in. + out_path = os.path.join(tmpdirname, uncompressed_fn) + if tarfile.is_tarfile(destination): with tarfile.open(destination, 'r') as tar: - print(' Untarring {0}...\n'.format(untarred_fn)) - tar.extract(member=untarred_fn, + print(' Untarring {0}...\n'.format(uncompressed_fn)) + tar.extract(member=uncompressed_fn, path=tmpdirname) - # read through gtdb fasta file - seqs = DNAFASTAFormat(os.path.join( - tmpdirname, untarred_fn), + seqs = DNAFASTAFormat(out_path, mode="r").view(DNAIterator) + if destination.endswith('fna.gz'): + with gzip.open(destination, 'rt') as gz_in: + print(' Unzipping {0}...\n'.format(uncompressed_fn)) + with open(out_path, 'w') as gz_out: + shutil.copyfileobj(gz_in, gz_out) + seqs = DNAFASTAFormat(out_path, + mode="r").view(DNAIterator) return seqs @@ -127,9 +151,9 @@ def _retrieve_data_from_gtdb(queries): for domain, url in queries: # variable setup basename = os.path.basename(url) - untarred_fn = basename.split('.')[0]+'.fna' + uncompressed_fn = basename.split('.')[0]+'.fna' destination = _get_gtdb_data_path(tmpdirname, url, basename) - seqs = _extract_seq_tar_file(tmpdirname, untarred_fn, destination) + seqs = _extract_seq_file(tmpdirname, uncompressed_fn, destination) print(' Writing data from \'{0}\'.\n'.format(domain)) for seq in seqs: seq.write(out_fasta) # write seq to new fasta file diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py index 8eae397..45a7793 100644 --- a/rescript/plugin_setup.py +++ b/rescript/plugin_setup.py @@ -953,7 +953,8 @@ function=get_gtdb_data, inputs={}, parameters={ - 'version': Str % Choices(['202.0', '207.0', '214.0', '214.1']), + 'version': Str % Choices(['202.0', '207.0', '214.0', '214.1', + '220.0']), 'domain': Str % Choices(['Both', 'Bacteria', 'Archaea']), 'db_type': Str % Choices(['All', 'SpeciesReps']) }, diff --git a/rescript/tests/test_get_gtdb.py b/rescript/tests/test_get_gtdb.py index 38602db..ed2f7b8 100644 --- a/rescript/tests/test_get_gtdb.py +++ b/rescript/tests/test_get_gtdb.py @@ -55,17 +55,19 @@ def setUp(self): # test that appropriate URLs are assembled def test_assemble_species_rep_queries(self): - obs_query_urls = _assemble_queries('214.1', 'SpeciesReps', 'Both') + # checking v220 as GTDB updated the file format for the "ssu_rep" + # FASTA files to 'fna.gz' from their usual 'tar.gz'. + obs_query_urls = _assemble_queries('220.0', 'SpeciesReps', 'Both') print('obs queries: ', obs_query_urls) exp_query_urls = [('Archaea', 'https://data.gtdb.ecogenomic.org/releases/' - 'release214/214.1/genomic_files_reps/' - 'ar53_ssu_reps_r214.tar.gz'), + 'release220/220.0/genomic_files_reps/' + 'ar53_ssu_reps_r220.fna.gz'), ('Bacteria', 'https://data.gtdb.ecogenomic.org/releases/' - 'release214/214.1/genomic_files_reps/' - 'bac120_ssu_reps_r214.tar.gz')] + 'release220/220.0/genomic_files_reps/' + 'bac120_ssu_reps_r220.fna.gz')] print('exp queries: ', exp_query_urls) self.assertEqual(obs_query_urls, exp_query_urls) From 9274d35812e81041f1c33851303bd3ad912bd5e8 Mon Sep 17 00:00:00 2001 From: mikerobeson Date: Tue, 30 Apr 2024 16:27:40 -0500 Subject: [PATCH 2/2] condensing some gtdb code --- rescript/get_gtdb.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/rescript/get_gtdb.py b/rescript/get_gtdb.py index 43ab0a0..602911d 100644 --- a/rescript/get_gtdb.py +++ b/rescript/get_gtdb.py @@ -55,6 +55,12 @@ def _assemble_queries(version='220.0', # ^^ Set `base_version` variable becuase number after the decimal is # only used for the directory. GTDB trims this off for the actual # file names... + # GTDB v220 started storing the ssu_reps FASTA files + # as 'fna.gz' instead of their usual 'tar.gz'. + if version == '220.0': + stype = 'fna' + else: + stype = 'tar' if db_type == 'SpeciesReps': ver_dom_dict = defaultdict(lambda: defaultdict(dict)) @@ -64,36 +70,28 @@ def _assemble_queries(version='220.0', else: ver_dom_dict[version][domain] = VERSION_MAP_DICT[version][domain] - # GTDB v220 started storing the ssu_reps FASTA files - # as 'fna.gz' instead of their usual 'tar.gz'. - if version == '220.0': - full_url = (base_url + 'release{bver}/{ver}/genomic_files_reps/' - '{cp}_ssu_reps_r{bver}.fna.gz') - else: - full_url = (base_url + 'release{bver}/{ver}/genomic_files_reps/' - '{cp}_ssu_reps_r{bver}.tar.gz') + full_url = (base_url + 'release{bver}/{ver}/genomic_files_reps/' + '{cp}_ssu_reps_r{bver}.{stype}.gz') for version, dcp in ver_dom_dict.items(): for dom, cp in dcp.items(): queries.append((dom, full_url.format(**{'ver': version, 'bver': base_version, - 'cp': cp}))) + 'cp': cp, + 'stype': stype}))) elif db_type == 'All': # Note: GTDB does not maintain separate 'Bacteria' and # 'Archaea' files for 'All'. This is only done for # the 'SpeciesReps'. Again, account for filename changes - # to 'fna.gz' in v220. - if version == '220.0': - full_url = (base_url + 'release{bver}/{ver}/genomic_files_all/' - 'ssu_all_r{bver}.fna.gz') - else: - full_url = (base_url + 'release{bver}/{ver}/genomic_files_all/' - 'ssu_all_r{bver}.tar.gz') + # to 'fna.gz' in v220, i.e. 'stype'. + full_url = (base_url + 'release{bver}/{ver}/genomic_files_all/' + 'ssu_all_r{bver}.{stype}.gz') queries.append((db_type, full_url.format(**{'ver': version, - 'bver': base_version}))) + 'bver': base_version, + 'stype': stype}))) return queries