Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Adds ability to fetch GTDB v220 to get-gtdb-data #184

Merged
merged 2 commits into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 38 additions & 16 deletions rescript/get_gtdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@

import os
import tempfile
import shutil
import tarfile
import gzip
import warnings
import pandas as pd
from urllib.request import urlretrieve
Expand All @@ -22,14 +24,15 @@
# bacteria. for example 'ar53' and 'bac120' mean that the GTDB phylogeny
# is based on 53 and 120 concatenated proteins (cp), respectively.
# If this changes we can set up a conditional statemnt below.
VERSION_MAP_DICT = {'214.1': {'Archaea': 'ar53', 'Bacteria': 'bac120'},
VERSION_MAP_DICT = {'220.0': {'Archaea': 'ar53', 'Bacteria': 'bac120'},
'214.1': {'Archaea': 'ar53', 'Bacteria': 'bac120'},
'214.0': {'Archaea': 'ar53', 'Bacteria': 'bac120'},
'207.0': {'Archaea': 'ar53', 'Bacteria': 'bac120'},
'202.0': {'Archaea': 'ar122', 'Bacteria': 'bac120'}}


def get_gtdb_data(
version: str = '214.1',
version: str = '220.0',
domain: str = 'Both',
db_type: str = 'SpeciesReps',
) -> (TSVTaxonomyFormat, DNAFASTAFormat):
Expand All @@ -43,7 +46,7 @@ def get_gtdb_data(
return tax_q, seqs_q


def _assemble_queries(version='214.1',
def _assemble_queries(version='220.0',
db_type='SpeciesReps',
domain='Both'):
queries = []
Expand All @@ -52,6 +55,12 @@ def _assemble_queries(version='214.1',
# ^^ Set `base_version` variable becuase number after the decimal is
# only used for the directory. GTDB trims this off for the actual
# file names...
# GTDB v220 started storing the ssu_reps FASTA files
# as 'fna.gz' instead of their usual 'tar.gz'.
if version == '220.0':
nbokulich marked this conversation as resolved.
Show resolved Hide resolved
stype = 'fna'
else:
stype = 'tar'

if db_type == 'SpeciesReps':
ver_dom_dict = defaultdict(lambda: defaultdict(dict))
Expand All @@ -62,24 +71,27 @@ def _assemble_queries(version='214.1',
ver_dom_dict[version][domain] = VERSION_MAP_DICT[version][domain]

full_url = (base_url + 'release{bver}/{ver}/genomic_files_reps/'
'{cp}_ssu_reps_r{bver}.tar.gz')
'{cp}_ssu_reps_r{bver}.{stype}.gz')

for version, dcp in ver_dom_dict.items():
for dom, cp in dcp.items():
queries.append((dom,
full_url.format(**{'ver': version,
'bver': base_version,
'cp': cp})))
'cp': cp,
'stype': stype})))
elif db_type == 'All':
# Note: GTDB does not maintain separate 'Bacteria' and
# 'Archaea' files for 'All'. This is only done for
# the 'SpeciesReps'.
# the 'SpeciesReps'. Again, account for filename changes
# to 'fna.gz' in v220, i.e. 'stype'.
full_url = (base_url + 'release{bver}/{ver}/genomic_files_all/'
'ssu_all_r{bver}.tar.gz')
'ssu_all_r{bver}.{stype}.gz')

queries.append((db_type,
full_url.format(**{'ver': version,
'bver': base_version})))
'bver': base_version,
'stype': stype})))
return queries


Expand All @@ -100,16 +112,26 @@ def _get_gtdb_data_path(tmpdirname, url, basename):
return destination


def _extract_seq_tar_file(tmpdirname, untarred_fn, destination):
def _extract_seq_file(tmpdirname, uncompressed_fn, destination):
# GTDB v220 updated the file format for the "ssu_rep" FASTA files
# to 'fna.gz' instead of `tar.gz`. We now need to check which
# form the file is in.
out_path = os.path.join(tmpdirname, uncompressed_fn)

if tarfile.is_tarfile(destination):
with tarfile.open(destination, 'r') as tar:
print(' Untarring {0}...\n'.format(untarred_fn))
tar.extract(member=untarred_fn,
print(' Untarring {0}...\n'.format(uncompressed_fn))
tar.extract(member=uncompressed_fn,
path=tmpdirname)
# read through gtdb fasta file
seqs = DNAFASTAFormat(os.path.join(
tmpdirname, untarred_fn),
seqs = DNAFASTAFormat(out_path,
mode="r").view(DNAIterator)
if destination.endswith('fna.gz'):
with gzip.open(destination, 'rt') as gz_in:
print(' Unzipping {0}...\n'.format(uncompressed_fn))
with open(out_path, 'w') as gz_out:
shutil.copyfileobj(gz_in, gz_out)
seqs = DNAFASTAFormat(out_path,
mode="r").view(DNAIterator)
return seqs


Expand All @@ -127,9 +149,9 @@ def _retrieve_data_from_gtdb(queries):
for domain, url in queries:
# variable setup
basename = os.path.basename(url)
untarred_fn = basename.split('.')[0]+'.fna'
uncompressed_fn = basename.split('.')[0]+'.fna'
destination = _get_gtdb_data_path(tmpdirname, url, basename)
seqs = _extract_seq_tar_file(tmpdirname, untarred_fn, destination)
seqs = _extract_seq_file(tmpdirname, uncompressed_fn, destination)
print(' Writing data from \'{0}\'.\n'.format(domain))
for seq in seqs:
seq.write(out_fasta) # write seq to new fasta file
Expand Down
3 changes: 2 additions & 1 deletion rescript/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -953,7 +953,8 @@
function=get_gtdb_data,
inputs={},
parameters={
'version': Str % Choices(['202.0', '207.0', '214.0', '214.1']),
'version': Str % Choices(['202.0', '207.0', '214.0', '214.1',
'220.0']),
'domain': Str % Choices(['Both', 'Bacteria', 'Archaea']),
'db_type': Str % Choices(['All', 'SpeciesReps'])
},
Expand Down
12 changes: 7 additions & 5 deletions rescript/tests/test_get_gtdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,17 +55,19 @@ def setUp(self):

# test that appropriate URLs are assembled
def test_assemble_species_rep_queries(self):
obs_query_urls = _assemble_queries('214.1', 'SpeciesReps', 'Both')
# checking v220 as GTDB updated the file format for the "ssu_rep"
# FASTA files to 'fna.gz' from their usual 'tar.gz'.
obs_query_urls = _assemble_queries('220.0', 'SpeciesReps', 'Both')
print('obs queries: ', obs_query_urls)

exp_query_urls = [('Archaea',
'https://data.gtdb.ecogenomic.org/releases/'
'release214/214.1/genomic_files_reps/'
'ar53_ssu_reps_r214.tar.gz'),
'release220/220.0/genomic_files_reps/'
'ar53_ssu_reps_r220.fna.gz'),
('Bacteria',
'https://data.gtdb.ecogenomic.org/releases/'
'release214/214.1/genomic_files_reps/'
'bac120_ssu_reps_r214.tar.gz')]
'release220/220.0/genomic_files_reps/'
'bac120_ssu_reps_r220.fna.gz')]
print('exp queries: ', exp_query_urls)
self.assertEqual(obs_query_urls, exp_query_urls)

Expand Down
Loading