Skip to content

Commit

Permalink
get-gtdb refactor (bokulich-lab#169)
Browse files Browse the repository at this point in the history
  • Loading branch information
mikerobeson authored Jan 10, 2024
1 parent 92ee605 commit 51df26b
Show file tree
Hide file tree
Showing 5 changed files with 240 additions and 183 deletions.
220 changes: 112 additions & 108 deletions rescript/get_gtdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,135 +10,139 @@
import tempfile
import tarfile
import warnings

import qiime2
import pandas as pd
from urllib.request import urlretrieve
from collections import defaultdict
from urllib.error import HTTPError
from rescript.get_data import _gzip_decompress

from q2_types.feature_data import (TSVTaxonomyFormat, DNAFASTAFormat,
DNAIterator)

# Different versions may have different file names for archaea and
# bacteria. for example 'ar53' and 'bac120' mean that the GTDB phylogeny
# is based on 53 and 120 concatenated proteins (cp), respectively.
# If this changes we can set up a conditional statemnt below.
VERSION_MAP_DICT = {'214': {'Archaea': 'ar53', 'Bacteria': 'bac120'},
'207': {'Archaea': 'ar53', 'Bacteria': 'bac120'},
'202': {'Archaea': 'ar122', 'Bacteria': 'bac120'}}

VERSION_MAP_DICT = {'214.1': {'Archaea': 'ar53', 'Bacteria': 'bac120'},
'214.0': {'Archaea': 'ar53', 'Bacteria': 'bac120'},
'207.0': {'Archaea': 'ar53', 'Bacteria': 'bac120'},
'202.0': {'Archaea': 'ar122', 'Bacteria': 'bac120'}}

def get_gtdb_data(ctx, version='214', domain='Both'):

ver_dom_dict = defaultdict(lambda: defaultdict(dict))
def get_gtdb_data(
version: str = '214.1',
domain: str = 'Both',
db_type: str = 'SpeciesReps',
) -> (TSVTaxonomyFormat, DNAFASTAFormat):

# Subset dict if needed, but keep same structure
# Although we can run the following merge actions on a list of one
# i.e. 'Archaea', we do not want to confuse anyone when looking
# at provenance, by running a merge command for no reason.
if domain == 'Both':
ver_dom_dict[version] = VERSION_MAP_DICT[version]
else:
ver_dom_dict[version][domain] = VERSION_MAP_DICT[version][domain]

queries = _assemble_queries(ver_dom_dict)
queries = _assemble_queries(version=version,
db_type=db_type,
domain=domain)
tax_q, seqs_q = _retrieve_data_from_gtdb(queries)

if domain == 'Both':
merge_gtdb_seqs = ctx.get_action('feature_table', 'merge_seqs')
merge_gtdb_taxonomy = ctx.get_action('feature_table', 'merge_taxa')
print('\n Merging taxonomy data...')
gtdb_tax, = merge_gtdb_taxonomy(data=tax_q)
print('\n Merging sequence data...')
gtdb_seqs, = merge_gtdb_seqs(data=seqs_q)
else:
gtdb_tax = tax_q[0]
gtdb_seqs = seqs_q[0]

print('\n Saving files...\n')
return gtdb_tax, gtdb_seqs


def _assemble_queries(ver_dom_dict):
queries = defaultdict(lambda: list())

base_seq_url = ('https://data.gtdb.ecogenomic.org/releases/release{ver}/'
'{ver}.0/genomic_files_reps/{cp}_ssu_reps_r{ver}.tar.gz')
base_tax_url = ('https://data.gtdb.ecogenomic.org/releases/release{ver}/'
'{ver}.0/{cp}_taxonomy_r{ver}.tsv.gz')

for version, dcp in ver_dom_dict.items():
for dom, cp in dcp.items():

queries['Taxonomy'].append(
(dom,
base_tax_url.format(**{'ver': version, 'cp': cp}),
'FeatureData[Taxonomy]',
'HeaderlessTSVTaxonomyFormat'))

queries['Sequence'].append(
(dom,
base_seq_url.format(**{'ver': version, 'cp': cp}),
'FeatureData[Sequence]',
'DNAFASTAFormat'))
return tax_q, seqs_q


def _assemble_queries(version='214.1',
db_type='SpeciesReps',
domain='Both'):
queries = []
base_url = 'https://data.gtdb.ecogenomic.org/releases/'
base_version = version.split('.')[0]
# ^^ Set `base_version` variable becuase number after the decimal is
# only used for the directory. GTDB trims this off for the actual
# file names...

if db_type == 'SpeciesReps':
ver_dom_dict = defaultdict(lambda: defaultdict(dict))

if domain == 'Both':
ver_dom_dict[version] = VERSION_MAP_DICT[version]
else:
ver_dom_dict[version][domain] = VERSION_MAP_DICT[version][domain]

full_url = (base_url + 'release{bver}/{ver}/genomic_files_reps/'
'{cp}_ssu_reps_r{bver}.tar.gz')

for version, dcp in ver_dom_dict.items():
for dom, cp in dcp.items():
queries.append((dom,
full_url.format(**{'ver': version,
'bver': base_version,
'cp': cp})))
elif db_type == 'All':
# Note: GTDB does not maintain separate 'Bacteria' and
# 'Archaea' files for 'All'. This is only done for
# the 'SpeciesReps'.
full_url = (base_url + 'release{bver}/{ver}/genomic_files_all/'
'ssu_all_r{bver}.tar.gz')

queries.append((db_type,
full_url.format(**{'ver': version,
'bver': base_version})))
return queries


def _retrieve_data_from_gtdb(queries):
'''
Download data from gtdb, given a list of queries.
queries: {'Taxonomy':(domain, url, type, format), (...),
'Sequence':(domain, url, type, format), (...)}
'''
def parse_gtdb_taxonomy(tax_str):
tax = tax_str.split()[0]
return tax

tax_results = []
seq_results = []

print('\nDownloading and processing raw files ... \n')

with tempfile.TemporaryDirectory() as tmpdirname:
for sttype, q_info in queries.items():
for domain, url, dtype, fmt in q_info:
print('Retrieving {0} for {1} from {2}'.format(
sttype, domain, url))
# grab url
bn = os.path.basename(url)
destination = os.path.join(tmpdirname, bn)
try:
urlretrieve(url, destination)
except HTTPError:
msg = ("Unable to retrieve the followng file from GTDB:\n "
+ url)
warnings.warn(msg, UserWarning)
# seq files are contained within `tar.gz`
if tarfile.is_tarfile(destination):
seq_results.append(get_tar_data(destination, bn,
tmpdirname, dtype, fmt))
else:
tax_results.append(get_gzipped_data(destination, bn,
dtype, fmt))
return tax_results, seq_results


def get_tar_data(tar_loc, base_fn, tmpdirname, dtype, fmt):
def _get_gtdb_data_path(tmpdirname, url, basename):
destination = os.path.join(tmpdirname, basename)
try:
untarred_fn = base_fn.split('.')[0]+'.fna'
print(' Untarring {0}...\n'.format(base_fn))
with tarfile.open(tar_loc, 'r') as tar:
print('Retrieving data from {0}'.format(url))
urlretrieve(url, destination)
except HTTPError:
msg = ("Unable to retrieve the followng file from GTDB:\n "
+ url)
warnings.warn(msg, UserWarning)
return destination


def _extract_seq_tar_file(tmpdirname, untarred_fn, destination):
if tarfile.is_tarfile(destination):
with tarfile.open(destination, 'r') as tar:
print(' Untarring {0}...\n'.format(untarred_fn))
tar.extract(member=untarred_fn,
path=tmpdirname)
return qiime2.Artifact.import_data(dtype, os.path.join(
tmpdirname, untarred_fn), fmt)
except OSError:
raise OSError(('{0}: either does not exist or can not '
'be untarred!'.format(tar_loc)))
# read through gtdb fasta file
seqs = DNAFASTAFormat(os.path.join(
tmpdirname, untarred_fn),
mode="r").view(DNAIterator)
return seqs


def get_gzipped_data(zipped_loc, base_fn, dtype, fmt):
try:
unzipped_destination = os.path.splitext(zipped_loc)[0]
print(' Unzipping {0}...\n'.format(base_fn))
_gzip_decompress(zipped_loc, unzipped_destination)
return qiime2.Artifact.import_data(dtype, unzipped_destination, fmt)
except OSError:
raise OSError(('{0}: either does not exist or can not '
'be unzipped!'.format(unzipped_destination)))
def _retrieve_data_from_gtdb(queries):
proc_seqs = DNAFASTAFormat()
proc_tax = TSVTaxonomyFormat()
tax_dict = {}

print('\nDownloading and processing raw files ... \n')
with \
tempfile.TemporaryDirectory() as tmpdirname, \
proc_seqs.open() as out_fasta, \
proc_tax.open() as out_tax:

for domain, url in queries:
# variable setup
basename = os.path.basename(url)
untarred_fn = basename.split('.')[0]+'.fna'
destination = _get_gtdb_data_path(tmpdirname, url, basename)
seqs = _extract_seq_tar_file(tmpdirname, untarred_fn, destination)
print(' Writing data from \'{0}\'.\n'.format(domain))
for seq in seqs:
seq.write(out_fasta) # write seq to new fasta file
# add taxonomy to dict:
tax_dict[seq.metadata['id']] = parse_gtdb_taxonomy(
seq.metadata['description'])
# set up final taxonomy dataframe:
print(' Sequences processed.')
print(' Processing taxonomy...')
parsed_taxonomy_df = pd.DataFrame.from_dict(tax_dict, orient='index')
parsed_taxonomy_df.index.name = 'Feature ID'
parsed_taxonomy_df.rename(columns={parsed_taxonomy_df.columns[0]:
'Taxon'}, inplace=True)
parsed_taxonomy_df.to_csv(out_tax, sep='\t')
print(' Taxonomy processed.')
return proc_tax, proc_seqs
26 changes: 18 additions & 8 deletions rescript/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -949,29 +949,39 @@
)


plugin.pipelines.register_function(
plugin.methods.register_function(
function=get_gtdb_data,
inputs={},
parameters={
'version': Str % Choices(['202', '207', '214']),
'version': Str % Choices(['202.0', '207.0', '214.0', '214.1']),
'domain': Str % Choices(['Both', 'Bacteria', 'Archaea']),
'db_type': Str % Choices(['All', 'SpeciesReps'])
},
outputs=[('gtdb_taxonomy', FeatureData[Taxonomy]),
('gtdb_sequences', FeatureData[Sequence])],
input_descriptions={},
parameter_descriptions={
'version': 'GTDB database version to download.',
'domain': 'Sequence and taxonomy data to download from a given '
'domain': 'SSU sequence and taxonomy data to download from a given '
'microbial domain from GTDB. \'Both\' will fetch both '
'bacterial and archaeal data. \'Bacteria\' will only '
'fetch bacterial data. \'Archaea\' will only fetch '
'archaeal data.'},
'archaeal data. This only applies to \'db-type '
'SpeciesReps\'.',
'db_type': '\'All\': All SSU data that pass the quality-control '
'of GTDB, but are not clustered into representative '
'species. \'SpeciesReps\': SSU gene sequences '
'identified within the set of representative '
'species. Note: if \'All\' is used, the \'domain\' '
'parameter will be ignored as GTDB does not maintain '
'separate domain-level files for these non-clustered '
'data.'},
output_descriptions={
'gtdb_taxonomy': 'GTDB reference taxonomy.',
'gtdb_sequences': 'GTDB reference sequences.'},
name='Download, parse, and import GTDB reference data.',
'gtdb_taxonomy': 'SSU GTDB reference taxonomy.',
'gtdb_sequences': 'SSU GTDB reference sequences.'},
name='Download, parse, and import SSU GTDB reference data.',
description=(
'Download, parse, and import GTDB files, given a version '
'Download, parse, and import SSU GTDB files, given a version '
'number. Downloads data directly from GTDB, '
'parses the taxonomy files, and outputs ready-to-use sequence and '
'taxonomy artifacts. REQUIRES STABLE INTERNET CONNECTION. ' +
Expand Down
9 changes: 9 additions & 0 deletions rescript/tests/data/gtdb-seqs.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
>GB_GCA_000008085.1
CCCGTTGATCCTGCGGGAGGCCACCGCTATCTCCGTCCGGCTAACCCATGGAAGGCGAGGGTCCCCGGGTAAGGGGGCCCGCCGCACGGCTGAGTAACACGTCGGTAACCTACCCTCGGGACGGGGATAACCCCGGGAAACTGGGGCTAATCCCCGATAGGGGATGGGTGCTGGAAGGCCCCATCCCCGAGAGGGGCTAGCGGTACTTCCCCCGCTAGCCCGCCCGAGGATGGGCCGGCGGCCCATCAGGTAGTTGGCGGGGTAATGGCCCGCCAAGCCGAAGACGGGTAGGGGCCGTGAGAGCGGGAGCCCCCAGATCGGCACTGAGACAAGGGCCGAGGCCCTACGGGGCGCACCAGGGGCGAAACCTCCGCAATGCGGGAAACCGTGACGGGGGGACGGAGAGTGCCGGAGGGCGTTATGCTCTCCGGCTTTTGGGGAGTGTAAGTAGCTCCCCGAATAAGCGGTGGGCAAGAGGGGTGGCAGCCGCCGCGGGAACACCCCCACCGCGAGCGGTGGCCGTGATTATTGGGCCTAAAGGGGCCGTAGCCGGGCCGGTGTGGCTCCGGTGAAATCCTCGGGCTCAACCCGAGGGCGCGCCGGAGCTACTACCGGCCTAGGGACCGGGAGGGGCCGACCGTACTCCCGGGGGAGCGGTGAAATGCTGTAATCCCGGGAGGACGACCCGTGGCGAAAGCGGTCGGCCAGAACGGGTCCGACGGTGAGGGCCGAAGGCCGGGGGCTAGAACGGGATTAGAGACCCCGGTATTCCCGGCTGTCAACGCTGCGGGCTACCTGCTGGGCGGGCTACGAGCCCGCCCAGTGGGGTAGGGAAGCCGTTAAGCCCGCCGCCTGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATAGGCGGGGGAGCACACAAGAGGTGGGGTGCGCGGTTTAATTGGATTCGACGCCGGGAACCTCACCGGGGCTGACAGCACAATGATGGTCGGCCTGAAGGGCCTACCGGAGGCGCTGAGAGGAGGTGCATGGCCGCCGTCAGCCTGTGCCGTGAGGTGCCCTGTTAAGTCAGGAAACAGGCGAGACCCGCGCCCGCAGTTGCGACGGCCGAAAGGCCGGCACACTGCGGGGACTGCCGGGGAAACCCGGAGGAAGGTGCGGGCGACGGCAGGTATGCATGCCCCGAATGCCCCGGGCTACACGCGCGCATCAATGGGCGGGACAGGGGGCCGCGACCCCGAAAGGGGGAGCAAATCCCCAAACCCGCTCTCAGTCCAGATCGAGGGCTGCAACTCGCCCTCGTGACGGCGGAATCTCTAGTAGTCGGACGTCACCAGCGTCCGGCGAATACGTCCCTGCTCCTTGCACTCACCGCCCGTCAAGCCACCCGAGCTGGGGCCTAGCGAGGCCGTGGGGGGTTCGCCCCCCACGGTCGAGCTAGGCCCCGGCGAGGGGGGCTAAGTCGACACAAGGTAGCCGTAGGGGAACCTGCGGCTGGATCACCTCCTA
>GB_GCA_000016605.1
TCCGGTTGATCCTGCCGGACCCGATCGCTATAGGGGTAGGGCTAAGCCATGGGAGTCGTACGCTCTCGGGAAGAGGGCGTGGCGGACGGCTGAGTAACACGTGGCTAACCTGCCCTTGGGATCTGGATAACCCCGGGAAACTGGGGCTAATCCGGAGCGGGCAAGGGAATCTGGAATGATCTCTTGCCTAAAAGCCTCTCGGCTGATCCCGTCGAGAGGCGCCCAAGGATGGGGCTGCGGCCCATCAGGCTGTTGGGGGAGTAAAGGTCCCCCAAACCGATAACGGGTAGGGGCCGTGGGAGCGGGAGCCCCCAGTTGGGCACTGAGACAAGGGCCCAGGCCCTACGGGGCGCACCAGGCGCGGAACGTCCCCAATGCGGGAAACCGTGAGGGCGTTACCCCTAGTGCCCTCGCAAGAGGGCTTTTCTCCACTCCAGAAAGGTGGAGGAATAAGCGGGGGGCAAGACTGGTGTCAGCCGCCGCGGTAATACCAGCCCCGCGAGTGATCGGGACGTTTATTGGGCTTAAAGCGCCCGTAGCCGGCCTGTAAAGTCACCGTTTAAAGACCCGGGCTCAACTCGGGGAACGGCGGTGATACTTACAGGCTAGGGGGCGGGAGAGGTCGGAGGTACTCCCGGAGTAGGGGCGAAATCCTCAGATCCCGGGAGGACCACCAGTGGCGAAAGCGTCCGGCTAGAACGCGCCCGACGGTGAGGGGCGAAAGCCGGGGTAGCAAATAGGATTAGATACCCTAGTAGTCCCGGCTGTAAACGATGCAGGCTAGGTGTCGCGTAGGCTTTGTGCCTGCGCGGTGCCGCAGGAAAACTGGTAAGCCCGCCGCCTGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAAGGGGTGGAACCTGCGGCTCAATTGGAGTCAACGCCTGGAATCTCACCGGGGGAGACCGCAGGATGACGGCCAGGCTAACGACCTTGCCAGACTCGCGGAGAGGAGGTGCATGGCCGTCGCCAGCTCGTGTTGTGAAATGTCCGGTTAAGTCCGGCAACGAGCGAGACCCCCACTTCTAGTTGGTAACCGTCTCTCCGGAGACGGTCCACACTAGAAGGACTGCCGGTGTTAAACCGGAGGAAGGAGGGGGCCACGGCAGGTCAGCATGCCCCGAAACTTCCGGGCCGCACGCGGGTTACAATGGCAGGGACAGCGGGATCCGACCCCGAGAGGGGAAGGCAATCCCACAAACCCTGCCTCAGTTGGGATCGAGGGCTGAAACTCGCCCTCGTGAACGAGGAATCCCTAGTAACCGCGGGTCAACAACCCGCGGTGAATACGTCCCTGCTCCTTGCACACACCGCCCGTCGCTCCACCCGAGTGGAGGGGAAGTGAGGCCTCTTGCCCCTCGGGGTGGGAGGTCGAGCTTCTCCTCCGCGAGGGGGGAGAAGTCGTAACAAGGTAGCCGTAGGGGAACCTGCGGCTGGATCACCTCA
>GB_GCA_000007325.1
GAAGAGTTTGATCCTGGCTCAGGATGAACGCTGACAGAATGCTTAACACATGCAAGTCTACTTGAATTTGGGTTTTTTAACTTCGATTTGGGTGGCGGACGGGTGAGTAACGCGTAAAGAACTTGCCTCACAGCTAGGGACAACATTTGGAAACGAATGCTAATACCTGATATTATGATTATAGGGCATCCTAGAATTATGAAAGCTATATGCGCTGTGAGAGAGCTTTGCGTCCCATTAGCTAGTTGGAGAGGTAACGGCTCACCAAGGCGATGATGGGTAGCCGGCCTGAGAGGGTGAACGGCCACAAGGGGACTGAGACACGGCCCTTACTCCTACGGGAGGCAGCAGTGGGGAATATTGGACAATGGACCGAGAGTCTGATCCAGCAATTCTGTGTGCACGATGACGTTTTTCGGAATGTAAAGTGCTTTCAGTTGGGAAGAAAAAAATGACGGTACCAACAGAAGAAGTGACGGCTAAATACGTGCCAGCAGCCGCGGTAATACGTATGTCACGAGCGTTATCCGGATTTATTGGGCGTAAAGCGCGTCTAGGTGGTTATGTAAGTCTGATGTGAAAATGCAGGGCTCAACTCTGTATTGCGTTGGAAACTGTGTAACTAGAGTACTGGAGAGGTAAGCGGAACTACAAGTGTAGAGGTGAAATTCGTAGATATTTGTAGGAATGCCGATGGGGAAGCCAGCTTACTGGACAGATACTGACGCTGAAGCGCGAAAGCGTGGGTAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGATTACTAGGTGTTGGGGGTCGAACCTCAGCGCCCAAGCAAACGCGATAAGTAATCCGCCTGGGGAGTACGTACGCAAGTATGAAACTCAAAGGAATTGACGGGGACCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGACGCAACGCGAGGAACCTTACCAGCGTTTGACATCTTAGGAATGAGACAGAGATGTTTCAGTGTCCCTTCGGGGAAACCTAAAGACAGGTGGTGCATGGCTGTCGTCAGCTCGTGTCGTGAGATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCCTTTCGTATGTTACCATCATTAAGTTGGGGACTCATGCGATACTGCCTACGATGAGTAGGAGGAAGGTGGGGATGACGTCAAGTCATCATGCCCCTTATACGCTGGGCTACACACGTGCTACAATGGGTAGAACAGAGAGTTGCAAAGCCGTGAGGTGGAGCTAATCTCAGAAAACTATTCTTAGTTCGGATTGTACTCTGCAACTCGAGTACATGAAGTTGGAATCGCTAGTAATCGCGAATCAGCAATGTCGCGGTGAATACGTTCTCGGGTCTTGTACACACCGCCCGTCACACCACGAGAGTTGGTTGCACCTGAAGTAGCAGGCCTAACCGTAAGGAGGGATGTTCCGAGGGTGTGATTAGCGATTGGGGTGAAGTCGTAACAAGGTATCCGTACGGGAACGTGCGGATGGATCACCTCCTTTC
>GB_GCA_000008885.1
GGAGGTGATCCAACCACAGGTTCCCCTACGGTTACCTTGTTACGACTTCACCCCAGTTATGAACCACAAAGTGGTAAGCGCCCTCCGAAAGGTTAAGCTACCTGCTTCTTTTGCAGCTCACTTCCATGGTGTGACGGGCGGTGTGTACAAGGCCCGGGAACGTATTCACCGCGGCATGCTGATCCGCGATTACTAGCGATTCCGACTTCATGGAGTCGAGTTGCAGACTCCAATCCGGACTAAGACGTACTTTATGAGATTAGCTTACTTTCGCAAGTTTGCTGCCCTTTGTATACGCCATTGTAGCACGTGTGTAGCCCTACTCGTAAGGGCCATGATGACTTGACGTCATCCCCACCTTCCTCCGGTTTATCACCGGCAGTCTCCTTTGAGTTCCCGACTTTTTCGCTGGCAAAAAAGGATAGGGGTTGCGCTCGTTGCGGGACTTAACCCAACATTTCACAACACGAGCTGACGACAGCCATGCAGCACCTGTTTTAAAGCTCCCGAAGGCACTAAAGCATCTCTGCTAAATTCTTTAAATGTCAAGAGTAGGTAAGGTTTTTCGCGTTGCATCGAATTAAACCACATGCTCCACCGCTTGTGCGGGCCCCCGTCAATTCATTTGAGTTTTAACCTTGCGATTGTACTCCCCAGGCGGTCGATTTAACGCGTTAGCTTCGAAAACCCCGAGTAAACTCGCAACCTTCAAATCGACATCGTTTACGGCATGGACTACCAGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTACATGAGCGTCAGTTTTCGCCCAGGAGGCCGCCTTCGCCGCTGGTATTCCTCCAGATATCTACGCATTTCACCGCTACACCTGGAATTCTACCTCCCTCTACGATACTCTAGTTTATTAGTTTCAAATGCAGTTCCTAGGTTGAGCCTAGGGATTTCACATCTGACTTAATAAACCGCCTGCGTACGCTTTACGCCCAGTAATTCCGATTAACGCTTGCACCCTCCGTATTACCGCGGCTGCTGGCACGGAGTTAGCCGGTGCTTCTTCTGCAAGTAACGTCACATAAATATGGTATTATCACATTTACTTTCTTCCCTGCTGAAAGTGCTTTACAATCCGAAGACCTTCTTCACACACGCGGCATAGCTGCATCAGGGTTTCCCCCATTGTGCAATATTCCCCACTGCTGCCTCCCGTAGGAGTCTGGACCGTATCTCAGTTCCAGTGTGGCTGGTTATCCTCTCAGACCAGCTAGAGATCGTAGCCTAGGTGAGCATTTACCTCACCTACTAGCTAATCTCATCTGGGTTCATCTAAAAACGCAAGGCTGATATAAAGTATTATATTAGTCCCCTGCTTTGATCTTTCGATATTATGCGGTATTAGCTACCGTTTCCAGTAGTTGTCCCCCTTTTTTAGGCAGATCCCCAGACATTACTCACCCGTTCGCCGCTCGCCGTCAAAGAAAAATCTCTACGCTGCCGCACGACTTGCATGTGTTAGGCTTGCCGCTAGCGTTCAATCTGAGCCATGATCAAACTCTTCAAT

6 changes: 6 additions & 0 deletions rescript/tests/data/gtdb-tax.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Feature ID Taxon
GB_GCA_000008085.1 d__Archaea;p__Nanoarchaeota;c__Nanoarchaeia;o__Nanoarchaeales;f__Nanoarchaeaceae;g__Nanoarchaeum;s__Nanoarchaeum equitans
GB_GCA_000016605.1 d__Archaea;p__Thermoproteota;c__Thermoproteia;o__Sulfolobales;f__Sulfolobaceae;g__Metallosphaera;s__Metallosphaera sedula
GB_GCA_000007325.1 d__Bacteria;p__Fusobacteriota;c__Fusobacteriia;o__Fusobacteriales;f__Fusobacteriaceae;g__Fusobacterium;s__Fusobacterium nucleatum
GB_GCA_000008885.1 d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales_A;f__Enterobacteriaceae_A;g__Wigglesworthia;s__Wigglesworthia glossinidia_A

Loading

0 comments on commit 51df26b

Please sign in to comment.