From 2bee235974ccb88e9b7ac3fcaee5995b5997e851 Mon Sep 17 00:00:00 2001 From: "Colin J. Brislawn" Date: Mon, 13 Nov 2023 16:32:34 -0500 Subject: [PATCH] get_unite_data functions (#134) LGTM Thanks @colinbrislawn! Perhaps you can write up a small Community Contributions tutorial? Similar to the [GTDB](https://forum.qiime2.org/t/how-to-train-a-gtdb-ssu-classifier-using-rescript/25725) and [RDP](https://forum.qiime2.org/t/importing-sequence-data-with-lower-case-nucleotide-characters-constructing-an-rdp-classifier-as-an-example/25158) tutorials? --- rescript/citations.bib | 17 +++ rescript/get_unite.py | 161 +++++++++++++++++++++++ rescript/plugin_setup.py | 40 ++++++ rescript/tests/data/unitefile.tgz | Bin 0 -> 1158 bytes rescript/tests/data/unitefile_no_dev.tgz | Bin 0 -> 285 bytes rescript/tests/test_get_unite.py | 100 ++++++++++++++ 6 files changed, 318 insertions(+) create mode 100644 rescript/get_unite.py create mode 100644 rescript/tests/data/unitefile.tgz create mode 100644 rescript/tests/data/unitefile_no_dev.tgz create mode 100644 rescript/tests/test_get_unite.py diff --git a/rescript/citations.bib b/rescript/citations.bib index 13c21c22..7061da8d 100644 --- a/rescript/citations.bib +++ b/rescript/citations.bib @@ -157,3 +157,20 @@ @article{schoch2020 pmcid = {PMC7408187}, pmid = {32761142} } + +@article{nilsson2019unite, + author = {Nilsson, Rolf Henrik and Larsson, Karl-Henrik and Taylor, Andy F S and Bengtsson-Palme, Johan and Jeppesen, Thomas S and Schigel, Dmitry and Kennedy, Peter and Picard, Kathryn and Glöckner, Frank Oliver and Tedersoo, Leho and Saar, Irja and Kõljalg, Urmas and Abarenkov, Kessy}, + title = "{The UNITE database for molecular identification of fungi: handling dark taxa and parallel taxonomic classifications}", + journal = {Nucleic Acids Research}, + volume = {47}, + number = {D1}, + pages = {D259-D264}, + year = {2018}, + month = {10}, + issn = {0305-1048}, + doi = {10.1093/nar/gky1022}, + url = {https://doi.org/10.1093/nar/gky1022}, 
+ eprint = {https://academic.oup.com/nar/article-pdf/47/D1/D259/27436038/gky1022.pdf}, + pmcid = {PMC6324048}, + pmid = {30371820}, +}
diff --git a/rescript/get_unite.py b/rescript/get_unite.py
new file mode 100644
index 00000000..79365e51
--- /dev/null
+++ b/rescript/get_unite.py
@@ -0,0 +1,223 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+import os
+import tempfile
+import tarfile
+import requests
+
+from pandas import DataFrame
+from q2_types.feature_data import (
+    TaxonomyFormat,
+    MixedCaseDNAFASTAFormat,
+    DNAIterator,
+)
+
+# Source: https://unite.ut.ee/repository.php
+# Nested lookup: version -> taxon_group -> singletons flag -> DOI
+UNITE_DOIS = {
+    "9.0": {
+        "fungi": {
+            False: "10.15156/BIO/2938079",
+            True: "10.15156/BIO/2938080",
+        },
+        "eukaryotes": {
+            False: "10.15156/BIO/2938081",
+            True: "10.15156/BIO/2938082",
+        },
+    },
+    # Old version 9.0 is not listed here
+    "8.3": {
+        "fungi": {
+            False: "10.15156/BIO/1264708",
+            True: "10.15156/BIO/1264763",
+        },
+        "eukaryotes": {
+            False: "10.15156/BIO/1264819",
+            True: "10.15156/BIO/1264861",
+        },
+    },
+    "8.2": {
+        "fungi": {
+            False: "10.15156/BIO/786385",
+            True: "10.15156/BIO/786387",
+        },
+        "eukaryotes": {
+            False: "10.15156/BIO/786386",
+            True: "10.15156/BIO/786388",
+        },
+    },
+}
+
+# Seconds without any data from the server before a request is abandoned
+_REQUEST_TIMEOUT = 60
+
+
+def _unite_get_url(
+    version: str = None, taxon_group: str = None, singletons: bool = None
+) -> str:
+    """
+    Get DOI from included list, then query plutof API for UNITE url
+
+    Returns: URL of the newest file listed for the matching DOI
+    Raises: KeyError if the combination is not in UNITE_DOIS,
+    requests.HTTPError if the API answers with an error status, and
+    ValueError if the API response lists no file for the DOI
+    """
+    # Get matching DOI
+    doi = UNITE_DOIS[version][taxon_group][singletons]
+    # Build URL
+    base_url = (
+        "https://api.plutof.ut.ee/v1/public/dois/"
+        "?format=vnd.api%2Bjson&identifier="
+    )
+    # Time out rather than hang forever, and fail loudly on HTTP errors
+    response = requests.get(base_url + doi, timeout=_REQUEST_TIMEOUT)
+    response.raise_for_status()
+    query_data = response.json()
+    # Updates can be made to files in a DOI, so on the advice of the devs,
+    # only return the last (newest) file with this -1 vv
+    try:
+        url = query_data["data"][0]["attributes"]["media"][-1]["url"]
+    except (KeyError, IndexError, TypeError) as err:
+        raise ValueError("No UNITE file found for DOI " + doi) from err
+    return url
+
+
+def _unite_get_tgz(
+    url: str = None, download_path: str = None, retries: int = 10
+) -> str:
+    """
+    Download compressed database, retrying failed attempts
+
+    Returns: Path of the downloaded unitefile.tar.gz in download_path
+    Raises: ValueError if retries < 1, or if the last attempt got an HTTP
+    error status or a size that differs from the content-length header;
+    other requests errors from the last attempt are re-raised unchanged
+    """
+    if retries < 1:
+        raise ValueError("retries must be at least 1")
+    unite_file_path = os.path.join(download_path, "unitefile.tar.gz")
+    for retry in range(retries):
+        # Track downloaded size
+        file_size = 0
+        # Prepare error text
+        dlfail = "File incomplete on try " + str(retry + 1)
+        response = None
+        try:
+            response = requests.get(
+                url, stream=True, timeout=_REQUEST_TIMEOUT
+            )
+            # An error page must never be saved as if it were the database
+            response.raise_for_status()
+            # Save .tgz file
+            with open(unite_file_path, "wb") as f:
+                for chunk in response.iter_content(chunk_size=8192):
+                    if chunk:
+                        f.write(chunk)
+                        file_size += len(chunk)
+            # Check the downloaded size against the expected size, when the
+            # server reports one (the header is optional)
+            expected_size = response.headers.get("content-length")
+            if expected_size is None or file_size == int(expected_size):
+                return unite_file_path  # done!
+            raise ValueError(dlfail)
+        except (ValueError, requests.HTTPError) as err:
+            print(dlfail)
+            if retry + 1 == retries:
+                raise ValueError(dlfail) from err
+        except requests.RequestException:
+            # Connection problems and timeouts are worth retrying too
+            print(dlfail)
+            if retry + 1 == retries:
+                raise
+        finally:
+            if response is not None:
+                response.close()
+
+
+def _unite_get_artifacts(
+    tgz_file: str = None, cluster_id: str = "99"
+) -> (DataFrame, DNAIterator):
+    """
+    Find and import files with matching cluster_id from .tgz
+
+    Returns: Tuple containing tax_results and seq_results
+    Raises: ValueError if the archive has no '_dev' files, or if they do
+    not include exactly one .txt and one .fasta file for cluster_id
+    """
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        # Extract from the .tgz file
+        with tarfile.open(tgz_file, "r:gz") as tar:
+            # Keep only regular _dev files; directories, links and other
+            # special members are skipped
+            members = [
+                m
+                for m in tar.getmembers()
+                if m.isfile() and "_dev" in m.name
+            ]
+            if not members:
+                raise ValueError("No '_dev' files found in Unite .tgz file")
+            for member in members:
+                # Keep only base name
+                member.name = os.path.basename(member.name)
+                tar.extract(member, path=tmpdirname)
+        # Find the raw files with the matching cluster_id, which is the
+        # fifth "_"-separated field of the file name. Slicing instead of
+        # indexing lets shorter names simply not match.
+        filtered_files = [
+            f
+            for f in os.listdir(tmpdirname)
+            if f.split("_")[4:5] == [cluster_id]
+        ]
+        if len(filtered_files) != 2:
+            raise ValueError(
+                "Expected 2, but found "
+                + str(len(filtered_files))
+                + " files found with cluster_id = "
+                + cluster_id
+            )
+        # ... and import them
+        taxa = seqs = None
+        for file in filtered_files:
+            fp = os.path.join(tmpdirname, file)
+            if file.endswith(".txt"):
+                taxa = TaxonomyFormat(fp, mode="r").view(DataFrame)
+            elif file.endswith(".fasta"):
+                seqs = MixedCaseDNAFASTAFormat(fp, mode="r").view(
+                    DNAIterator
+                )
+        if taxa is None or seqs is None:
+            raise ValueError(
+                "Expected one .txt and one .fasta file with cluster_id = "
+                + cluster_id
+            )
+        # NOTE(review): both views are created while tmpdirname still
+        # exists; if the DNAIterator view reads its file lazily, confirm
+        # it stays usable after the directory is removed on return.
+        return taxa, seqs
+
+
+def get_unite_data(
+    version: str = "9.0",
+    taxon_group: str = "eukaryotes",
+    cluster_id: str = "99",
+    singletons: bool = False,
+) -> (DataFrame, DNAIterator):
+    """
+    Get Qiime2 artifacts for a given version of UNITE
+
+    Resolves the download URL, downloads the archive into a temporary
+    directory, and imports the files that match cluster_id.
+
+    Returns: Tuple containing tax_results and seq_results
+    """
+    url = _unite_get_url(version, taxon_group, singletons)
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        tar_file_path = _unite_get_tgz(url, tmpdirname)
+        return _unite_get_artifacts(tar_file_path, cluster_id)
diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py index 7346c9da..99bb2824 100644 --- a/rescript/plugin_setup.py +++ b/rescript/plugin_setup.py @@ -49,6 +49,7 @@ from rescript.ncbi import ( get_ncbi_data, _default_ranks, _allowed_ranks, get_ncbi_data_protein) from .get_gtdb import get_gtdb_data +from .get_unite import get_unite_data citations = Citations.load('citations.bib', package='rescript') @@ -76,6 +77,11 @@ 'and be aware that earlier versions may be released under a different ' 'license.') +UNITE_LICENSE_NOTE = ( + 'NOTE: THIS ACTION ACQUIRES DATA FROM UNITE, which is licensed under ' + 'CC BY-SA 4.0. 
To learn more, please visit https://unite.ut.ee/cite.php ' + 'and https://creativecommons.org/licenses/by-sa/4.0/.') + VOLATILITY_PLOT_XAXIS_INTERPRETATION = ( 'The x-axis in these plots represents the taxonomic ' 'levels present in the input taxonomies so are labeled numerically ' @@ -974,6 +980,40 @@ ) +plugin.methods.register_function( + function=get_unite_data, + inputs={}, + parameters={ + 'version': Str % Choices(['9.0', '8.3', '8.2']), + 'taxon_group': Str % Choices(['fungi', 'eukaryotes']), + 'cluster_id': Str % Choices(['99', '97', 'dynamic']), + 'singletons': Bool, + }, + outputs=[('taxonomy', FeatureData[Taxonomy]), + ('sequences', FeatureData[Sequence])], + input_descriptions={}, + parameter_descriptions={ + 'version': 'UNITE version to download.', + 'taxon_group': 'Download a database with only \'fungi\' ' + 'or including all \'eukaryotes\'.', + 'cluster_id': 'Percent similarity at which sequences in ' + 'the of database were clustered.', + 'singletons': 'Include singleton clusters in the database.'}, + output_descriptions={ + 'taxonomy': 'UNITE reference taxonomy.', + 'sequences': 'UNITE reference sequences.'}, + name='Download and import UNITE reference data.', + description=( + 'Download and import ITS sequences and taxonomy from the ' + 'UNITE database, given a ' + 'version number and taxon_group, with the option to select a ' + 'cluster_id and include singletons. ' + 'Downloads data directly from UNITE\'s PlutoF REST API. 
' + + UNITE_LICENSE_NOTE), + citations=[citations['nilsson2019unite']] +) + + plugin.methods.register_function( function=filter_taxa, inputs={'taxonomy': FeatureData[Taxonomy]}, diff --git a/rescript/tests/data/unitefile.tgz b/rescript/tests/data/unitefile.tgz new file mode 100644 index 0000000000000000000000000000000000000000..5a19629fc5886bcaef595bca28a5eb4ccd85525b GIT binary patch literal 1158 zcmV;11bO=(iwFP!000001MOGAZsSG_^|kqmd_f@DvbBMM0$jFXPkZU?d7vgrBGidf z%UPs<-+7OtwYNpHn-pnLV3;_Ts2L9V_(&?_ZQS+OWBhFx?)vzt-~IU1-92pjc;3cE zeOAxU=GFOmyy`!lU3cTYo4rD+B&(*ucik?l<8Mf_TGq3=S=P(>s%@96Sv9X0O*K2M zUUS$>P6Mg6D=Ob z6El(Y2`V9oYPt)-(w!uQnc*gwrb|@~s#pxc)1*(Uk``;QAt{v#Tx(uHjRi?-G{7d- zyA52qS=<<(L6yt|cysuH0m12l?KG2o#!tF8wLk~bPQf<+0~;#PDFehtKU8@{(jj)= zXNWv0pv57=_y~pyzta~LzSUjKn-!Loq4*J4jzcq0!~(06UFOHb^>KM^D6U<_WVI~2$Pg_Fp;WN;Ckk&2nO1X!K4M9b5M{F7f~ zEZt

c9gZzyu|tBa?74>I9qcSRD$u0uY^+fgHeQ(jKg#(@r*0Ni8^^lva$$!4YVA zI%!}`gxeyBaMWBU`6yP47b=T2+d-)bzgbxHlkh3+7-Mz%umVhSbqLv|xo&`uu~66r zf*VIrtZ>Ut0d5dgopLxZ1J7)kV~rey%9YXnG!0P!s z^c`L>z>qtXccG9w8EF`%gfc|9CYj?-LsmZ#XBUJWr!V=(LnP-WcpZ2P4O6z5E^joR zZf}3$sIWb#OsOMw%$%?zk%R2p;tkH+dt=r;2c92CAJ_?VPC8=I&6(A91TC`p95p^dH7i#e&I0u^iRW2zZ`y=`)VGWs=2j?JOaFd9)_Rg(Pu}Kk6o6w z-)i#d%O{^6Pd>44n|#XJCbunjBlXE=)8q@EUq6D6tY@Yl8RrLLrFsWAOh49B@U*dZ z*(AwHf4`vf9KreQY4)j?i{@JNZt0~mbrvGx|Jks?Kk6e&`qNRc8% YiWDhQq)3q>MP4m`03~+eUjQfo04*s*2LJ#7 literal 0 HcmV?d00001 diff --git a/rescript/tests/data/unitefile_no_dev.tgz b/rescript/tests/data/unitefile_no_dev.tgz new file mode 100644 index 0000000000000000000000000000000000000000..83546ac52d162168475553de508af89a40bd4b0e GIT binary patch literal 285 zcmb2|=3oE==C_v|y$>6Ruz!gA8vI6hic0SEsqKH2YuERzqQ^Qb=zp~#{%=2+wx}=`F8d!?V5VxvxSf1u8tYH zm%juoJ$EGY?-VWBlO~gAUMgO$yl}~s(;13-(%+32^QT4VEJ}avcK-18@8{^IASe_HV7q zRkw-{|Gme%Y-aP_%z%v#7tIsCUL?LF)8}F5v2%IPuf5aBU*Era&7`M=Q$H!5Or5l3 e|JIW)-{|lD^K~=C^T_0!uZ&JrnR*Nw3=9BmxQX-t literal 0 HcmV?d00001 diff --git a/rescript/tests/test_get_unite.py b/rescript/tests/test_get_unite.py new file mode 100644 index 00000000..9a47d3d0 --- /dev/null +++ b/rescript/tests/test_get_unite.py @@ -0,0 +1,100 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2019-2023, QIIME 2 development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file LICENSE, distributed with this software. 
+# ----------------------------------------------------------------------------
+
+import pkg_resources
+import tempfile
+import pandas.core.frame
+import q2_types.feature_data
+from qiime2.plugin.testing import TestPluginBase
+from rescript.get_unite import (
+    UNITE_DOIS,
+    _unite_get_url,
+    _unite_get_tgz,
+    _unite_get_artifacts,
+    get_unite_data,
+)
+
+from urllib.request import urlopen
+from unittest.mock import patch, Mock
+
+
+class TestGetUNITE(TestPluginBase):
+    """Tests for the UNITE helpers in rescript.get_unite."""
+
+    package = "rescript.tests"
+
+    def setUp(self):
+        super().setUp()
+        self.unitefile = pkg_resources.resource_filename(
+            "rescript.tests", "data/unitefile.tgz"
+        )
+        self.unitefile_no_dev = pkg_resources.resource_filename(
+            "rescript.tests", "data/unitefile_no_dev.tgz"
+        )
+
+    # Requires internet access
+    def test_unite_get_url(self):
+        # for all combinations...
+        for v, taxon_groups in UNITE_DOIS.items():
+            for tg, dois in taxon_groups.items():
+                for s in dois:
+                    # ... try to get the URL, and check that it resolves
+                    url = _unite_get_url(v, tg, s)
+                    with urlopen(url) as response:
+                        self.assertEqual(response.status, 200)
+
+    # The failed-download case requires internet access
+    def test_unite_get_tgz(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            # mock the response object
+            mock_response = Mock()
+            mock_response.iter_content.return_value = [b"mock"]
+            mock_response.headers.get.return_value = "4"  # matches content
+            # mock successful download
+            with patch("requests.get", return_value=mock_response):
+                _unite_get_tgz("fakeURL", tmpdirname)
+            # real failed download
+            with self.assertRaisesRegex(ValueError, "File incomplete on try"):
+                _unite_get_tgz("https://files.plutof.ut.ee/nope", tmpdirname)
+
+    def test_unite_get_artifacts(self):
+        # Test on small data/unitefile.tgz with two items inside
+        res_one, res_two = _unite_get_artifacts(
+            self.unitefile, cluster_id="97"
+        )
+        # Column names and one feature from TaxonomyFormat
+        self.assertEqual(
+            res_one["Taxon"]["SH1140752.08FU_UDB013072_reps"],
+            "k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Thelephorales;"
+            "f__Thelephoraceae;g__Tomentella;s__unidentified",
+        )
+        self.assertIsInstance(
+            res_two, q2_types.feature_data._transformer.DNAIterator
+        )
+        # test no _dev files found
+        with self.assertRaises(ValueError):
+            _unite_get_artifacts(self.unitefile_no_dev, cluster_id="97")
+        # test missing files or misspelled cluster_id
+        with self.assertRaises(ValueError):
+            _unite_get_artifacts(self.unitefile, "nothing")
+
+    # This tests the function with toy data.
+    # All relevant internals are tested elsewhere in this test class.
+    # Downloading is mocked with patch; the URL lookup still needs internet.
+    def test_get_unite_data(self):
+        with patch(
+            "rescript.get_unite._unite_get_tgz", return_value=self.unitefile
+        ):
+            res = get_unite_data(
+                version="8.3", taxon_group="fungi", cluster_id="97"
+            )
+            self.assertEqual(len(res), 2)
+            self.assertIsInstance(res[0], pandas.core.frame.DataFrame)
+            self.assertIsInstance(
+                res[1], q2_types.feature_data._transformer.DNAIterator
+            )