Skip to content

Commit

Permalink
get_unite_data functions (#134)
Browse files Browse the repository at this point in the history
  • Loading branch information
colinbrislawn authored Nov 13, 2023
1 parent 9467284 commit 2bee235
Show file tree
Hide file tree
Showing 6 changed files with 318 additions and 0 deletions.
17 changes: 17 additions & 0 deletions rescript/citations.bib
Original file line number Diff line number Diff line change
Expand Up @@ -157,3 +157,20 @@ @article{schoch2020
pmcid = {PMC7408187},
pmid = {32761142}
}

@article{nilsson2019unite,
author = {Nilsson, Rolf Henrik and Larsson, Karl-Henrik and Taylor, Andy F S and Bengtsson-Palme, Johan and Jeppesen, Thomas S and Schigel, Dmitry and Kennedy, Peter and Picard, Kathryn and Glöckner, Frank Oliver and Tedersoo, Leho and Saar, Irja and Kõljalg, Urmas and Abarenkov, Kessy},
title = "{The UNITE database for molecular identification of fungi: handling dark taxa and parallel taxonomic classifications}",
journal = {Nucleic Acids Research},
volume = {47},
number = {D1},
pages = {D259-D264},
year = {2018},
month = {10},
issn = {0305-1048},
doi = {10.1093/nar/gky1022},
url = {https://doi.org/10.1093/nar/gky1022},
eprint = {https://academic.oup.com/nar/article-pdf/47/D1/D259/27436038/gky1022.pdf},
pmcid = {PMC6324048},
pmid = {30371820},
}
161 changes: 161 additions & 0 deletions rescript/get_unite.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import os
import tempfile
import tarfile
import requests

from pandas import DataFrame
from q2_types.feature_data import (
TaxonomyFormat,
MixedCaseDNAFASTAFormat,
DNAIterator,
)

# Source: https://unite.ut.ee/repository.php
# DOI lookup table, indexed as UNITE_DOIS[version][taxon_group][singletons]:
#   version     -> UNITE release number, as a string (e.g. "9.0")
#   taxon_group -> "fungi" or "eukaryotes"
#   singletons  -> bool, whether the release includes singleton clusters
# Each leaf is the DOI that is resolved to a download URL via the PlutoF API.
UNITE_DOIS = {
    "9.0": {
        "fungi": {
            False: "10.15156/BIO/2938079",
            True: "10.15156/BIO/2938080",
        },
        "eukaryotes": {
            False: "10.15156/BIO/2938081",
            True: "10.15156/BIO/2938082",
        },
    },
    # Old version 9.0 is not listed here
    "8.3": {
        "fungi": {
            False: "10.15156/BIO/1264708",
            True: "10.15156/BIO/1264763",
        },
        "eukaryotes": {
            False: "10.15156/BIO/1264819",
            True: "10.15156/BIO/1264861",
        },
    },
    "8.2": {
        "fungi": {
            False: "10.15156/BIO/786385",
            True: "10.15156/BIO/786387",
        },
        "eukaryotes": {
            False: "10.15156/BIO/786386",
            True: "10.15156/BIO/786388",
        },
    },
}


def _unite_get_url(
    version: str = None, taxon_group: str = None, singletons: bool = None
) -> str:
    """Get DOI from included list, then query plutof API for UNITE url

    Parameters
    ----------
    version : str
        UNITE version, a top-level key of ``UNITE_DOIS``.
    taxon_group : str
        Taxon group, a second-level key of ``UNITE_DOIS``.
    singletons : bool
        Whether to select the release that includes singletons.

    Returns
    -------
    str
        URL of the last (newest) file attached to the matching DOI.

    Raises
    ------
    KeyError
        If the requested combination is not listed in ``UNITE_DOIS``.
    requests.exceptions.RequestException
        If the API request fails, times out, or returns an HTTP error.
    ValueError
        If the API response does not list a downloadable file.
    """
    # Get matching DOI
    doi = UNITE_DOIS[version][taxon_group][singletons]
    # Build URL
    base_url = (
        "https://api.plutof.ut.ee/v1/public/dois/"
        "?format=vnd.api%2Bjson&identifier="
    )
    # Without a timeout an unresponsive server would block forever
    response = requests.get(base_url + doi, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page
    response.raise_for_status()
    query_data = response.json()
    # Updates can be made to files in a DOI, so on the advice of the devs,
    # only return the last (newest) file with this -1 vv
    try:
        url = query_data["data"][0]["attributes"]["media"][-1]["url"]
    except (KeyError, IndexError, TypeError) as err:
        raise ValueError(
            "No downloadable file listed by the PlutoF API for DOI " + doi
        ) from err
    return url


def _unite_get_tgz(
    url: str = None, download_path: str = None, retries: int = 10
) -> str:
    """Download compressed database

    The download is attempted up to ``retries`` times. An attempt counts as
    failed when the request itself fails (connection error, timeout, HTTP
    error status) or when the number of bytes written differs from the
    ``content-length`` header reported by the server.

    Parameters
    ----------
    url : str
        Location of the UNITE archive.
    download_path : str
        Existing directory in which ``unitefile.tar.gz`` is written.
    retries : int
        Maximum number of download attempts.

    Returns
    -------
    str
        Path of the downloaded archive.

    Raises
    ------
    ValueError
        If the last attempt fails.
    """
    # Save .tgz file
    unite_file_path = os.path.join(download_path, "unitefile.tar.gz")
    for retry in range(retries):
        # Track downloaded size
        file_size = 0
        # Prepare error text
        dlfail = "File incomplete on try " + str(retry + 1)
        try:
            # The timeout bounds the connect and each socket read, so a
            # stalled server triggers a retry instead of hanging forever
            response = requests.get(url, stream=True, timeout=60)
            try:
                # Never store an HTTP error page as if it were the archive
                response.raise_for_status()
                with open(unite_file_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            file_size += len(chunk)
                expected_size = int(response.headers.get("content-length", 0))
            finally:
                # A streamed response holds its connection until closed
                response.close()
            # Check if the downloaded size matches the expected size
            if file_size == expected_size:
                return unite_file_path  # done!
            raise ValueError(dlfail)
        except (ValueError, requests.exceptions.RequestException) as err:
            # Network errors are retried as well, not only short downloads
            print(dlfail)
            if retry + 1 == retries:
                raise ValueError(dlfail) from err


def _unite_get_artifacts(
tgz_file: str = None, cluster_id: str = "99"
) -> (DataFrame, DNAIterator):
"""
Find and import files with matching cluster_id from .tgz
Returns: Tuple containing tax_results and seq_results
"""
with tempfile.TemporaryDirectory() as tmpdirname:
# Extract from the .tgz file
with tarfile.open(tgz_file, "r:gz") as tar:
# Keep only _dev files
members = [m for m in tar.getmembers() if "_dev" in m.name]
if not members:
raise ValueError("No '_dev' files found in Unite .tgz file")
for member in members:
# Keep only base name
member.name = os.path.basename(member.name)
tar.extract(member, path=tmpdirname)
# Find and import the raw files...
for root, dirs, files in os.walk(tmpdirname):
# ... with the matching cluster_id
filtered_files = [
f for f in files if f.split("_")[4] == cluster_id
]
if not filtered_files or len(filtered_files) != 2:
raise ValueError(
"Expected 2, but found "
+ str(len(filtered_files))
+ " files found with cluster_id = "
+ cluster_id
)
for file in filtered_files:
fp = os.path.join(root, file)
if file.endswith(".txt"):
taxa = TaxonomyFormat(fp, mode="r").view(DataFrame)
elif file.endswith(".fasta"):
seqs = MixedCaseDNAFASTAFormat(fp, mode="r").view(
DNAIterator
)
return taxa, seqs


def get_unite_data(
    version: str = "9.0",
    taxon_group: str = "eukaryotes",
    cluster_id: str = "99",
    singletons: bool = False,
) -> (DataFrame, DNAIterator):
    """
    Download one UNITE release and import its taxonomy and sequences.

    The download URL is looked up for the requested version, taxon group
    and singletons setting; the archive is then fetched into a temporary
    directory and the files matching ``cluster_id`` are imported from it.

    Returns: Tuple containing tax_results and seq_results
    """
    download_url = _unite_get_url(
        version=version, taxon_group=taxon_group, singletons=singletons
    )
    # Download into a scratch directory that is removed on exit
    with tempfile.TemporaryDirectory() as scratch_dir:
        archive_path = _unite_get_tgz(
            url=download_url, download_path=scratch_dir
        )
        results = _unite_get_artifacts(
            tgz_file=archive_path, cluster_id=cluster_id
        )
    return results
40 changes: 40 additions & 0 deletions rescript/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
from rescript.ncbi import (
get_ncbi_data, _default_ranks, _allowed_ranks, get_ncbi_data_protein)
from .get_gtdb import get_gtdb_data
from .get_unite import get_unite_data

citations = Citations.load('citations.bib', package='rescript')

Expand Down Expand Up @@ -76,6 +77,11 @@
'and be aware that earlier versions may be released under a different '
'license.')

# License notice appended to the description of the UNITE download action.
UNITE_LICENSE_NOTE = (
    'NOTE: THIS ACTION ACQUIRES DATA FROM UNITE, '
    'which is licensed under CC BY-SA 4.0. '
    'To learn more, please visit https://unite.ut.ee/cite.php and '
    'https://creativecommons.org/licenses/by-sa/4.0/.'
)

VOLATILITY_PLOT_XAXIS_INTERPRETATION = (
'The x-axis in these plots represents the taxonomic '
'levels present in the input taxonomies so are labeled numerically '
Expand Down Expand Up @@ -974,6 +980,40 @@
)


# Register the UNITE downloader as a plugin method. The parameter choices
# mirror the versions and taxon groups listed in get_unite.UNITE_DOIS.
plugin.methods.register_function(
    function=get_unite_data,
    inputs={},
    parameters={
        'version': Str % Choices(['9.0', '8.3', '8.2']),
        'taxon_group': Str % Choices(['fungi', 'eukaryotes']),
        'cluster_id': Str % Choices(['99', '97', 'dynamic']),
        'singletons': Bool,
    },
    outputs=[('taxonomy', FeatureData[Taxonomy]),
             ('sequences', FeatureData[Sequence])],
    input_descriptions={},
    parameter_descriptions={
        'version': 'UNITE version to download.',
        'taxon_group': 'Download a database with only \'fungi\' '
                       'or including all \'eukaryotes\'.',
        # Fixed help-text typo: previously read "in the of database"
        'cluster_id': 'Percent similarity at which sequences in '
                      'the database were clustered.',
        'singletons': 'Include singleton clusters in the database.'},
    output_descriptions={
        'taxonomy': 'UNITE reference taxonomy.',
        'sequences': 'UNITE reference sequences.'},
    name='Download and import UNITE reference data.',
    description=(
        'Download and import ITS sequences and taxonomy from the '
        'UNITE database, given a '
        'version number and taxon_group, with the option to select a '
        'cluster_id and include singletons. '
        'Downloads data directly from UNITE\'s PlutoF REST API. ' +
        UNITE_LICENSE_NOTE),
    citations=[citations['nilsson2019unite']]
)


plugin.methods.register_function(
function=filter_taxa,
inputs={'taxonomy': FeatureData[Taxonomy]},
Expand Down
Binary file added rescript/tests/data/unitefile.tgz
Binary file not shown.
Binary file added rescript/tests/data/unitefile_no_dev.tgz
Binary file not shown.
100 changes: 100 additions & 0 deletions rescript/tests/test_get_unite.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2019-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import pkg_resources
import tempfile
import pandas.core.frame
import q2_types.feature_data
from qiime2.plugin.testing import TestPluginBase
from rescript.get_unite import (
UNITE_DOIS,
_unite_get_url,
_unite_get_tgz,
_unite_get_artifacts,
get_unite_data,
)

from urllib.request import urlopen
from unittest.mock import patch, Mock


class TestGetUNITE(TestPluginBase):
    """Tests for the UNITE download and import helpers.

    Only ``test_unite_get_url`` needs internet access; every other test
    replaces the network calls with mocks and uses the toy archives in
    ``rescript/tests/data``.
    """

    package = "rescript.tests"

    def setUp(self):
        super().setUp()
        # Small archive with two '_dev' items inside
        self.unitefile = pkg_resources.resource_filename(
            "rescript.tests", "data/unitefile.tgz"
        )
        # Archive used to exercise the "no '_dev' files" error
        self.unitefile_no_dev = pkg_resources.resource_filename(
            "rescript.tests", "data/unitefile_no_dev.tgz"
        )

    # Requires internet access
    def test_unite_get_url(self):
        # for all combinations...
        for v, taxon_groups in UNITE_DOIS.items():
            for tg, dois in taxon_groups.items():
                for s in dois:
                    with self.subTest(
                        version=v, taxon_group=tg, singletons=s
                    ):
                        # ... try to get the URL
                        url = _unite_get_url(v, tg, s)
                        # Close the response so the socket is not leaked
                        with urlopen(url) as response:
                            self.assertEqual(response.status, 200)

    def test_unite_get_tgz(self):
        with tempfile.TemporaryDirectory() as tmpdirname:
            # mock the response object
            mock_response = Mock()
            mock_response.iter_content.return_value = [b"mock"]
            mock_response.headers.get.return_value = "4"  # matches content
            # mock successful download
            with patch("requests.get", return_value=mock_response):
                path = _unite_get_tgz("fakeURL", tmpdirname)
            with open(path, "rb") as fh:
                self.assertEqual(fh.read(), b"mock")
            # mock failed download: reported size differs from content
            mock_response.headers.get.return_value = "5"
            with patch("requests.get", return_value=mock_response):
                with self.assertRaisesRegex(
                    ValueError, "File incomplete on try"
                ):
                    _unite_get_tgz("fakeURL", tmpdirname, retries=2)

    def test_unite_get_artifacts(self):
        # Test on small data/unitefile.tgz with two items inside
        res_one, res_two = _unite_get_artifacts(
            self.unitefile, cluster_id="97"
        )
        # Column names and one feature from TaxonomyFormat
        self.assertEqual(
            res_one["Taxon"]["SH1140752.08FU_UDB013072_reps"],
            "k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Thelephorales;"
            "f__Thelephoraceae;g__Tomentella;s__unidentified",
        )
        # Check the class itself rather than the text of its module path
        self.assertIsInstance(res_two, q2_types.feature_data.DNAIterator)
        # test no _dev files found
        with self.assertRaises(ValueError):
            _unite_get_artifacts(self.unitefile_no_dev, cluster_id="97")
        # test missing files or misspelled cluster_id
        with self.assertRaises(ValueError):
            _unite_get_artifacts(self.unitefile, "nothing")

    # This tests the function with toy data.
    # All relevant internals are tested elsewhere in this test class.
    # Both the URL lookup and the download are mocked with patch, so this
    # test does not touch the network.
    def test_get_unite_data(self):
        with patch(
            "rescript.get_unite._unite_get_url", return_value="fakeURL"
        ), patch(
            "rescript.get_unite._unite_get_tgz", return_value=self.unitefile
        ):
            res = get_unite_data(
                version="8.3", taxon_group="fungi", cluster_id="97"
            )
            self.assertEqual(len(res), 2)
            self.assertIsInstance(res[0], pandas.core.frame.DataFrame)
            self.assertIsInstance(res[1], q2_types.feature_data.DNAIterator)

0 comments on commit 2bee235

Please sign in to comment.