Skip to content

Commit

Permalink
ENH: Adds ability to fetch GTDB v220 to get-gtdb-data (#184)
Browse files Browse the repository at this point in the history
  • Loading branch information
mikerobeson authored Jun 5, 2024
1 parent c80286c commit 14dd23a
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 22 deletions.
54 changes: 38 additions & 16 deletions rescript/get_gtdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@

import os
import tempfile
import shutil
import tarfile
import gzip
import warnings
import pandas as pd
from urllib.request import urlretrieve
Expand All @@ -22,14 +24,15 @@
# bacteria. for example 'ar53' and 'bac120' mean that the GTDB phylogeny
# is based on 53 and 120 concatenated proteins (cp), respectively.
# If this changes we can set up a conditional statemnt below.
VERSION_MAP_DICT = {'214.1': {'Archaea': 'ar53', 'Bacteria': 'bac120'},
VERSION_MAP_DICT = {'220.0': {'Archaea': 'ar53', 'Bacteria': 'bac120'},
'214.1': {'Archaea': 'ar53', 'Bacteria': 'bac120'},
'214.0': {'Archaea': 'ar53', 'Bacteria': 'bac120'},
'207.0': {'Archaea': 'ar53', 'Bacteria': 'bac120'},
'202.0': {'Archaea': 'ar122', 'Bacteria': 'bac120'}}


def get_gtdb_data(
version: str = '214.1',
version: str = '220.0',
domain: str = 'Both',
db_type: str = 'SpeciesReps',
) -> (TSVTaxonomyFormat, DNAFASTAFormat):
Expand All @@ -43,7 +46,7 @@ def get_gtdb_data(
return tax_q, seqs_q


def _assemble_queries(version='214.1',
def _assemble_queries(version='220.0',
db_type='SpeciesReps',
domain='Both'):
queries = []
Expand All @@ -52,6 +55,12 @@ def _assemble_queries(version='214.1',
# ^^ Set `base_version` variable becuase number after the decimal is
# only used for the directory. GTDB trims this off for the actual
# file names...
# GTDB v220 started storing the ssu_reps FASTA files
# as 'fna.gz' instead of their usual 'tar.gz'.
if version == '220.0':
stype = 'fna'
else:
stype = 'tar'

if db_type == 'SpeciesReps':
ver_dom_dict = defaultdict(lambda: defaultdict(dict))
Expand All @@ -62,24 +71,27 @@ def _assemble_queries(version='214.1',
ver_dom_dict[version][domain] = VERSION_MAP_DICT[version][domain]

full_url = (base_url + 'release{bver}/{ver}/genomic_files_reps/'
'{cp}_ssu_reps_r{bver}.tar.gz')
'{cp}_ssu_reps_r{bver}.{stype}.gz')

for version, dcp in ver_dom_dict.items():
for dom, cp in dcp.items():
queries.append((dom,
full_url.format(**{'ver': version,
'bver': base_version,
'cp': cp})))
'cp': cp,
'stype': stype})))
elif db_type == 'All':
# Note: GTDB does not maintain separate 'Bacteria' and
# 'Archaea' files for 'All'. This is only done for
# the 'SpeciesReps'.
# the 'SpeciesReps'. Again, account for filename changes
# to 'fna.gz' in v220, i.e. 'stype'.
full_url = (base_url + 'release{bver}/{ver}/genomic_files_all/'
'ssu_all_r{bver}.tar.gz')
'ssu_all_r{bver}.{stype}.gz')

queries.append((db_type,
full_url.format(**{'ver': version,
'bver': base_version})))
'bver': base_version,
'stype': stype})))
return queries


Expand All @@ -100,16 +112,26 @@ def _get_gtdb_data_path(tmpdirname, url, basename):
return destination


def _extract_seq_tar_file(tmpdirname, untarred_fn, destination):
def _extract_seq_file(tmpdirname, uncompressed_fn, destination):
# GTDB v220 updated the file format for the "ssu_rep" FASTA files
# to 'fna.gz' instead of `tar.gz`. We now need to check which
# form the file is in.
out_path = os.path.join(tmpdirname, uncompressed_fn)

if tarfile.is_tarfile(destination):
with tarfile.open(destination, 'r') as tar:
print(' Untarring {0}...\n'.format(untarred_fn))
tar.extract(member=untarred_fn,
print(' Untarring {0}...\n'.format(uncompressed_fn))
tar.extract(member=uncompressed_fn,
path=tmpdirname)
# read through gtdb fasta file
seqs = DNAFASTAFormat(os.path.join(
tmpdirname, untarred_fn),
seqs = DNAFASTAFormat(out_path,
mode="r").view(DNAIterator)
if destination.endswith('fna.gz'):
with gzip.open(destination, 'rt') as gz_in:
print(' Unzipping {0}...\n'.format(uncompressed_fn))
with open(out_path, 'w') as gz_out:
shutil.copyfileobj(gz_in, gz_out)
seqs = DNAFASTAFormat(out_path,
mode="r").view(DNAIterator)
return seqs


Expand All @@ -127,9 +149,9 @@ def _retrieve_data_from_gtdb(queries):
for domain, url in queries:
# variable setup
basename = os.path.basename(url)
untarred_fn = basename.split('.')[0]+'.fna'
uncompressed_fn = basename.split('.')[0]+'.fna'
destination = _get_gtdb_data_path(tmpdirname, url, basename)
seqs = _extract_seq_tar_file(tmpdirname, untarred_fn, destination)
seqs = _extract_seq_file(tmpdirname, uncompressed_fn, destination)
print(' Writing data from \'{0}\'.\n'.format(domain))
for seq in seqs:
seq.write(out_fasta) # write seq to new fasta file
Expand Down
3 changes: 2 additions & 1 deletion rescript/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -953,7 +953,8 @@
function=get_gtdb_data,
inputs={},
parameters={
'version': Str % Choices(['202.0', '207.0', '214.0', '214.1']),
'version': Str % Choices(['202.0', '207.0', '214.0', '214.1',
'220.0']),
'domain': Str % Choices(['Both', 'Bacteria', 'Archaea']),
'db_type': Str % Choices(['All', 'SpeciesReps'])
},
Expand Down
12 changes: 7 additions & 5 deletions rescript/tests/test_get_gtdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,17 +55,19 @@ def setUp(self):

# test that appropriate URLs are assembled
def test_assemble_species_rep_queries(self):
obs_query_urls = _assemble_queries('214.1', 'SpeciesReps', 'Both')
# checking v220 as GTDB updated the file format for the "ssu_rep"
# FASTA files to 'fna.gz' from their usual 'tar.gz'.
obs_query_urls = _assemble_queries('220.0', 'SpeciesReps', 'Both')
print('obs queries: ', obs_query_urls)

exp_query_urls = [('Archaea',
'https://data.gtdb.ecogenomic.org/releases/'
'release214/214.1/genomic_files_reps/'
'ar53_ssu_reps_r214.tar.gz'),
'release220/220.0/genomic_files_reps/'
'ar53_ssu_reps_r220.fna.gz'),
('Bacteria',
'https://data.gtdb.ecogenomic.org/releases/'
'release214/214.1/genomic_files_reps/'
'bac120_ssu_reps_r214.tar.gz')]
'release220/220.0/genomic_files_reps/'
'bac120_ssu_reps_r220.fna.gz')]
print('exp queries: ', exp_query_urls)
self.assertEqual(obs_query_urls, exp_query_urls)

Expand Down

0 comments on commit 14dd23a

Please sign in to comment.