From 2bee235974ccb88e9b7ac3fcaee5995b5997e851 Mon Sep 17 00:00:00 2001
From: "Colin J. Brislawn" <cbrisl@gmail.com>
Date: Mon, 13 Nov 2023 16:32:34 -0500
Subject: [PATCH] get_unite_data functions (#134)

LGTM Thanks @colinbrislawn!

Perhaps you can write up a small Community Contributions tutorial? Similar to the [GTDB](https://forum.qiime2.org/t/how-to-train-a-gtdb-ssu-classifier-using-rescript/25725) and [RDP](https://forum.qiime2.org/t/importing-sequence-data-with-lower-case-nucleotide-characters-constructing-an-rdp-classifier-as-an-example/25158) tutorials?
---
 rescript/citations.bib                   |  17 +++
 rescript/get_unite.py                    | 161 +++++++++++++++++++++++
 rescript/plugin_setup.py                 |  40 ++++++
 rescript/tests/data/unitefile.tgz        | Bin 0 -> 1158 bytes
 rescript/tests/data/unitefile_no_dev.tgz | Bin 0 -> 285 bytes
 rescript/tests/test_get_unite.py         | 100 ++++++++++++++
 6 files changed, 318 insertions(+)
 create mode 100644 rescript/get_unite.py
 create mode 100644 rescript/tests/data/unitefile.tgz
 create mode 100644 rescript/tests/data/unitefile_no_dev.tgz
 create mode 100644 rescript/tests/test_get_unite.py

diff --git a/rescript/citations.bib b/rescript/citations.bib
index 13c21c22..7061da8d 100644
--- a/rescript/citations.bib
+++ b/rescript/citations.bib
@@ -157,3 +157,20 @@ @article{schoch2020
   pmcid = {PMC7408187},
   pmid = {32761142}
 }
+
+@article{nilsson2019unite,
+  author = {Nilsson, Rolf Henrik and Larsson, Karl-Henrik and Taylor, Andy F S and Bengtsson-Palme, Johan and Jeppesen, Thomas S and Schigel, Dmitry and Kennedy, Peter and Picard, Kathryn and Glöckner, Frank Oliver and Tedersoo, Leho and Saar, Irja and Kõljalg, Urmas and Abarenkov, Kessy},
+  title = "{The UNITE database for molecular identification of fungi: handling dark taxa and parallel taxonomic classifications}",
+  journal = {Nucleic Acids Research},
+  volume = {47},
+  number = {D1},
+  pages = {D259-D264},
+  year = {2018},
+  month = {10},
+  issn = {0305-1048},
+  doi = {10.1093/nar/gky1022},
+  url = {https://doi.org/10.1093/nar/gky1022},
+  eprint = {https://academic.oup.com/nar/article-pdf/47/D1/D259/27436038/gky1022.pdf},
+  pmcid = {PMC6324048},
+  pmid = {30371820},
+}
diff --git a/rescript/get_unite.py b/rescript/get_unite.py
new file mode 100644
index 00000000..79365e51
--- /dev/null
+++ b/rescript/get_unite.py
@@ -0,0 +1,161 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+import os
+import tempfile
+import tarfile
+import requests
+
+from pandas import DataFrame
+from q2_types.feature_data import (
+    TaxonomyFormat,
+    MixedCaseDNAFASTAFormat,
+    DNAIterator,
+)
+
+# Source: https://unite.ut.ee/repository.php
+UNITE_DOIS = {
+    "9.0": {
+        "fungi": {
+            False: "10.15156/BIO/2938079",
+            True: "10.15156/BIO/2938080",
+        },
+        "eukaryotes": {
+            False: "10.15156/BIO/2938081",
+            True: "10.15156/BIO/2938082",
+        },
+    },
+    # Old version 9.0 is not listed here
+    "8.3": {
+        "fungi": {
+            False: "10.15156/BIO/1264708",
+            True: "10.15156/BIO/1264763",
+        },
+        "eukaryotes": {
+            False: "10.15156/BIO/1264819",
+            True: "10.15156/BIO/1264861",
+        },
+    },
+    "8.2": {
+        "fungi": {
+            False: "10.15156/BIO/786385",
+            True: "10.15156/BIO/786387",
+        },
+        "eukaryotes": {
+            False: "10.15156/BIO/786386",
+            True: "10.15156/BIO/786388",
+        },
+    },
+}
+
+
+def _unite_get_url(
+    version: str = None, taxon_group: str = None, singletons: bool = None
+) -> str:
+    """Get DOI from included list, then query plutof API for UNITE url"""
+    # Get matching DOI
+    doi = UNITE_DOIS[version][taxon_group][singletons]
+    # Build URL
+    base_url = (
+        "https://api.plutof.ut.ee/v1/public/dois/"
+        "?format=vnd.api%2Bjson&identifier="
+    )
+    query_data = requests.get(base_url + doi).json()
+    # Files under a DOI can be updated, so on the advice of the devs,
+    # return only the last (newest) media entry, hence the [-1] index below.
+    URL = query_data["data"][0]["attributes"]["media"][-1]["url"]
+    return URL
+
+
+def _unite_get_tgz(
+    url: str = None, download_path: str = None, retries: int = 10
+) -> str:
+    """Download compressed database"""
+    for retry in range(retries):
+        # Track downloaded size
+        file_size = 0
+        # Prepare error text
+        dlfail = "File incomplete on try " + str(retry + 1)
+        try:
+            response = requests.get(url, stream=True)
+            # Save .tgz file
+            unite_file_path = os.path.join(download_path, "unitefile.tar.gz")
+            with open(unite_file_path, "wb") as f:
+                for chunk in response.iter_content(chunk_size=8192):
+                    if chunk:
+                        f.write(chunk)
+                        file_size += len(chunk)
+            # Check if the downloaded size matches the expected size
+            if file_size == int(response.headers.get("content-length", 0)):
+                return unite_file_path  # done!
+            else:
+                raise ValueError(dlfail)
+        except ValueError:
+            print(dlfail)
+            if retry + 1 == retries:
+                raise ValueError(dlfail)
+
+
+def _unite_get_artifacts(
+    tgz_file: str = None, cluster_id: str = "99"
+) -> (DataFrame, DNAIterator):
+    """
+    Find and import files with matching cluster_id from .tgz
+
+    Returns: Tuple containing tax_results and seq_results
+    """
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        # Extract from the .tgz file
+        with tarfile.open(tgz_file, "r:gz") as tar:
+            # Keep only _dev files
+            members = [m for m in tar.getmembers() if "_dev" in m.name]
+            if not members:
+                raise ValueError("No '_dev' files found in Unite .tgz file")
+            for member in members:
+                # Keep only base name
+                member.name = os.path.basename(member.name)
+                tar.extract(member, path=tmpdirname)
+        # Find and import the raw files...
+        for root, dirs, files in os.walk(tmpdirname):
+            # ... with the matching cluster_id
+            filtered_files = [
+                f for f in files if f.split("_")[4] == cluster_id
+            ]
+            if not filtered_files or len(filtered_files) != 2:
+                raise ValueError(
+                    "Expected 2, but found "
+                    + str(len(filtered_files))
+                    + " files found with cluster_id = "
+                    + cluster_id
+                )
+            for file in filtered_files:
+                fp = os.path.join(root, file)
+                if file.endswith(".txt"):
+                    taxa = TaxonomyFormat(fp, mode="r").view(DataFrame)
+                elif file.endswith(".fasta"):
+                    seqs = MixedCaseDNAFASTAFormat(fp, mode="r").view(
+                        DNAIterator
+                    )
+    return taxa, seqs
+
+
+def get_unite_data(
+    version: str = "9.0",
+    taxon_group: str = "eukaryotes",
+    cluster_id: str = "99",
+    singletons: bool = False,
+) -> (DataFrame, DNAIterator):
+    """
+    Get Qiime2 artifacts for a given version of UNITE
+
+    Returns: Tuple containing tax_results and seq_results
+    """
+    url = _unite_get_url(version, taxon_group, singletons)
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        tar_file_path = _unite_get_tgz(url, tmpdirname)
+        return _unite_get_artifacts(tar_file_path, cluster_id)
diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py
index 7346c9da..99bb2824 100644
--- a/rescript/plugin_setup.py
+++ b/rescript/plugin_setup.py
@@ -49,6 +49,7 @@
 from rescript.ncbi import (
     get_ncbi_data, _default_ranks, _allowed_ranks, get_ncbi_data_protein)
 from .get_gtdb import get_gtdb_data
+from .get_unite import get_unite_data
 
 citations = Citations.load('citations.bib', package='rescript')
 
@@ -76,6 +77,11 @@
     'and be aware that earlier versions may be released under a different '
     'license.')
 
+UNITE_LICENSE_NOTE = (
+    'NOTE: THIS ACTION ACQUIRES DATA FROM UNITE, which is licensed under '
+    'CC BY-SA 4.0. To learn more, please visit https://unite.ut.ee/cite.php '
+    'and https://creativecommons.org/licenses/by-sa/4.0/.')
+
 VOLATILITY_PLOT_XAXIS_INTERPRETATION = (
     'The x-axis in these plots represents the taxonomic '
     'levels present in the input taxonomies so are labeled numerically '
@@ -974,6 +980,40 @@
 )
 
 
+plugin.methods.register_function(
+    function=get_unite_data,
+    inputs={},
+    parameters={
+        'version': Str % Choices(['9.0', '8.3', '8.2']),
+        'taxon_group': Str % Choices(['fungi', 'eukaryotes']),
+        'cluster_id': Str % Choices(['99', '97', 'dynamic']),
+        'singletons': Bool,
+        },
+    outputs=[('taxonomy', FeatureData[Taxonomy]),
+             ('sequences', FeatureData[Sequence])],
+    input_descriptions={},
+    parameter_descriptions={
+        'version': 'UNITE version to download.',
+        'taxon_group': 'Download a database with only \'fungi\' '
+                       'or including all \'eukaryotes\'.',
+        'cluster_id': 'Percent similarity at which sequences in '
+                      'the of database were clustered.',
+        'singletons': 'Include singleton clusters in the database.'},
+    output_descriptions={
+        'taxonomy': 'UNITE reference taxonomy.',
+        'sequences': 'UNITE reference sequences.'},
+    name='Download and import UNITE reference data.',
+    description=(
+        'Download and import ITS sequences and taxonomy from the '
+        'UNITE database, given a '
+        'version number and taxon_group, with the option to select a '
+        'cluster_id and include singletons. '
+        'Downloads data directly from UNITE\'s PlutoF REST API. ' +
+        UNITE_LICENSE_NOTE),
+    citations=[citations['nilsson2019unite']]
+)
+
+
 plugin.methods.register_function(
     function=filter_taxa,
     inputs={'taxonomy': FeatureData[Taxonomy]},
diff --git a/rescript/tests/data/unitefile.tgz b/rescript/tests/data/unitefile.tgz
new file mode 100644
index 0000000000000000000000000000000000000000..5a19629fc5886bcaef595bca28a5eb4ccd85525b
GIT binary patch
literal 1158
zcmV;11bO=(iwFP!000001MOGAZsSG_^|kqmd_f@DvbBMM0$jFXPkZU?d7vgrBGidf
z%UPs<-+7OtwYNpHn-pnLV3;_Ts2L9V_(&?_ZQS+OWBhFx?)vzt-~IU1-92pjc;3cE
zeOAxU=GFOmyy`!lU3cTYo4rD+B&(*ucik?l<8Mf_TGq3=S=P(>s%@96Sv9X0O*K2M
zUUS$><uL`dJ3XEKGHf5me)qTQzV`foD&PP7Q(ZSzyPTg@?=C*Zk3YPx>P6Mg6D=Ob
z6El(Y2`V9oYPt)-(w!uQnc*gwrb|@~s#pxc)1*(Uk``;QAt{v#Tx(uHjRi?-G{7d-
zyA52qS=<<(L6yt|cysuH0m12l?KG2o#!tF8wLk~bPQf<+0~;#PDFehtKU8@{(jj)=
zXNWv0pv57=_y~pyzta~LzSUjKn-!Loq4*J4jzcq0!~(06UF<h;faoOlBJ$vH;9%iu
z6E|msEGHqO#5foVy{22nMQ~FXDSs7t#YJFXKukR?v?RbtxjQ|Aqd888*laqO*0F?$
zhbdGp67`+*!vmc>OHb^>KM^D6U<_WVI~2$Pg_Fp;WN;Ckk&2nO1X!K4M9b5M{F7f~
zEZt<p>c9gZzyu|tBa?74>I9qcSRD$u0uY^+fgHeQ(jKg#(@r*0Ni8^^lva$$!4YVA
zI%!}`gxeyBaMWBU`6yP47b=T2+d-)bzgbxHlkh3+7-Mz%umVhSbqLv|xo&`uu~66r
zf*VIrtZ>Ut0d5dgopLxZ1J7)kV~re<iJ75n6vz_+!3R{coGmV8&Wz+!?+fGs17-;l
zWuYJDHRmlz*~BvUnQiYIWGKTB8WiTov0!(Tkc}HDMN@Rq_*Ync#L>y%9YXnG!0P!s
z^c`L>z>qtXccG9w8EF`%gfc|9CYj?-LsmZ#XBUJWr!V=(LnP-WcpZ2P4O6z5E^joR
zZf}3$sIWb#OsOMw%$%?zk%R2p;tkH+dt=r;2c92CAJ_?VPC8=I<O2P5p8Mb+hF-yw
z<4U(P`<-RMi!I4X9%b(qD<5SS9~S5JV$tMDM%D%=G+Q`X&&lB}A=d-@xH_@eJ&%ST
z>&6(A91TC`p95p^dH7i#e&I0u^iRW2zZ`y=`)VGWs=2j?JOaFd9)_Rg(Pu}Kk6o6w
z-)i#d%O{^6Pd>44n|#XJCbunjBlXE=)8q@EUq6D6tY@Yl8RrLLrFsWAOh49B@U*dZ
z*(AwHf4`vf9KreQY4)j?i{@JNZt0~mbrvGx<KSrcfo`k;_0G$pVcD@O^O`RTa`Hy+
zf8%Z3cTd~<?cML+@;|`-Y5(#wzViNGHS;#V|F`L`#ruD|Y|8upzah`Q{(tvtj2Dmh
zH^cfN#`oPgTn&jQm)m`}zKl_By4`RIMZfRI^)|+E+i&`Z+wHE~B*pcy;<E4h^-a=j
z@A~_Fzu9!_F~-OH;VRt@*F%4G@?86V15b|6{);dFby|uPDN>|Jks?Kk6e&`qNRc8%
YiWDhQq)3q>MP4m`03~+eUjQfo04*s*2LJ#7

literal 0
HcmV?d00001

diff --git a/rescript/tests/data/unitefile_no_dev.tgz b/rescript/tests/data/unitefile_no_dev.tgz
new file mode 100644
index 0000000000000000000000000000000000000000..83546ac52d162168475553de508af89a40bd4b0e
GIT binary patch
literal 285
zcmb2|=3oE==C_v|y$>6Ruz!gA8vI6hic0SEsq<u_Y$`9#mETzXx+yn!cF3#k+l!|+
zo+waI`cpss_;2+Oaw>KH2YuERzqQ^Qb=zp~#{%=2+wx}=`F8d!?V5VxvxSf1u8tYH
zm%juoJ$EGY?-VWBlO~gAUMgO$yl}~s(;13-(%+32^QT4VEJ}avcK-18@8{<J5AOdS
zw9F}8OKf$K^X%JOCv;xuVte=ZDtDk|*;d`EMO#0XWk0VE+L_SR^<tS>^IASe_HV7q
zRkw-{|Gme%Y-aP_%z%v#7tIsCUL?LF)8}F5v2%IPuf5aBU*Era&7`M=Q$H!5Or5l3
e|JIW)-{|lD^K~=C^T_0!uZ&JrnR*Nw3=9BmxQX-t

literal 0
HcmV?d00001

diff --git a/rescript/tests/test_get_unite.py b/rescript/tests/test_get_unite.py
new file mode 100644
index 00000000..9a47d3d0
--- /dev/null
+++ b/rescript/tests/test_get_unite.py
@@ -0,0 +1,100 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2019-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+import pkg_resources
+import tempfile
+import pandas.core.frame
+import q2_types.feature_data
+from qiime2.plugin.testing import TestPluginBase
+from rescript.get_unite import (
+    UNITE_DOIS,
+    _unite_get_url,
+    _unite_get_tgz,
+    _unite_get_artifacts,
+    get_unite_data,
+)
+
+from urllib.request import urlopen
+from unittest.mock import patch, Mock
+
+
+class TestGetUNITE(TestPluginBase):
+    package = "rescript.tests"
+
+    def setUp(self):
+        super().setUp()
+        self.unitefile = pkg_resources.resource_filename(
+            "rescript.tests", "data/unitefile.tgz"
+        )
+        self.unitefile_no_dev = pkg_resources.resource_filename(
+            "rescript.tests", "data/unitefile_no_dev.tgz"
+        )
+
+    # Requires internet access
+    def test_unite_get_url(self):
+        # for all combinations...
+        for v in UNITE_DOIS.keys():
+            for tg in UNITE_DOIS[v].keys():
+                for s in UNITE_DOIS[v][tg].keys():
+                    # ... try to get the URL
+                    url = _unite_get_url(v, tg, s)
+                    urlopen(url)
+        self.assertTrue(True)
+
+    def test_unite_get_tgz(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            # mock the response object
+            mock_response = Mock()
+            mock_response.iter_content.return_value = [b"mock"]
+            mock_response.headers.get.return_value = "4"  # matches content
+            # mock successful download
+            with patch("requests.get", return_value=mock_response):
+                _unite_get_tgz("fakeURL", tmpdirname)
+            # real failed download
+            with self.assertRaisesRegex(ValueError, "File incomplete on try"):
+                _unite_get_tgz("https://files.plutof.ut.ee/nope", tmpdirname)
+
+    def test_unite_get_artifacts(self):
+        # Test on small data/unitefile.tgz with two items inside
+        res_one, res_two = _unite_get_artifacts(
+            self.unitefile, cluster_id="97"
+        )
+        # Column names and one feature from TaxonomyFormat
+        self.assertEqual(
+            res_one["Taxon"]["SH1140752.08FU_UDB013072_reps"],
+            "k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Thelephorales;"
+            "f__Thelephoraceae;g__Tomentella;s__unidentified",
+        )
+        self.assertEqual(
+            str(type(res_two)),
+            "<class 'q2_types.feature_data._transformer.DNAIterator'>",
+        )
+        # test no _dev files found
+        with self.assertRaises(ValueError):
+            _unite_get_artifacts(self.unitefile_no_dev, cluster_id="97")
+        # test missing files or misspelled cluster_id
+        with self.assertRaises(ValueError):
+            _unite_get_artifacts(self.unitefile, "nothing")
+
+    # This tests the function with toy data.
+    # All relevant internals are tested elsewhere in this test class.
+    # Downloading is mocked with patch.
+    def test_get_unite_data(self):
+        with patch(
+            "rescript.get_unite._unite_get_tgz", return_value=self.unitefile
+        ):
+            res = get_unite_data(
+                version="8.3", taxon_group="fungi", cluster_id="97"
+            )
+            self.assertEqual(len(res), 2)
+            self.assertTrue(isinstance(res[0], pandas.core.frame.DataFrame))
+            self.assertTrue(
+                isinstance(
+                    res[1], q2_types.feature_data._transformer.DNAIterator
+                )
+            )