get_unite_data functions (#134)

LGTM Thanks @colinbrislawn! Perhaps you can write up a small Community Contributions tutorial? Similar to the [GTDB](https://forum.qiime2.org/t/how-to-train-a-gtdb-ssu-classifier-using-rescript/25725) and [RDP](https://forum.qiime2.org/t/importing-sequence-data-with-lower-case-nucleotide-characters-constructing-an-rdp-classifier-as-an-example/25158) tutorials?
bokulich-lab · Nov 13, 2023 · 2bee235 · 2bee235
1 parent 9467284
commit 2bee235
Show file tree

Hide file tree

Showing 6 changed files with 318 additions and 0 deletions.
diff --git a/rescript/citations.bib b/rescript/citations.bib
@@ -157,3 +157,20 @@ @article{schoch2020
   pmcid = {PMC7408187},
   pmid = {32761142}
 }
+
+@article{nilsson2019unite,
+  author = {Nilsson, Rolf Henrik and Larsson, Karl-Henrik and Taylor, Andy F S and Bengtsson-Palme, Johan and Jeppesen, Thomas S and Schigel, Dmitry and Kennedy, Peter and Picard, Kathryn and Glöckner, Frank Oliver and Tedersoo, Leho and Saar, Irja and Kõljalg, Urmas and Abarenkov, Kessy},
+  title = "{The UNITE database for molecular identification of fungi: handling dark taxa and parallel taxonomic classifications}",
+  journal = {Nucleic Acids Research},
+  volume = {47},
+  number = {D1},
+  pages = {D259-D264},
+  year = {2018},
+  month = {10},
+  issn = {0305-1048},
+  doi = {10.1093/nar/gky1022},
+  url = {https://doi.org/10.1093/nar/gky1022},
+  eprint = {https://academic.oup.com/nar/article-pdf/47/D1/D259/27436038/gky1022.pdf},
+  pmcid = {PMC6324048},
+  pmid = {30371820},
+}
diff --git a/rescript/get_unite.py b/rescript/get_unite.py
@@ -0,0 +1,161 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+import os
+import tempfile
+import tarfile
+import requests
+
+from pandas import DataFrame
+from q2_types.feature_data import (
+    TaxonomyFormat,
+    MixedCaseDNAFASTAFormat,
+    DNAIterator,
+)
+
+# Source: https://unite.ut.ee/repository.php
+# DOI lookup table, nested as: version -> taxon group -> singletons flag.
+# The innermost boolean key is the `singletons` argument of _unite_get_url,
+# which indexes this table as UNITE_DOIS[version][taxon_group][singletons].
+UNITE_DOIS = {
+    "9.0": {
+        "fungi": {
+            False: "10.15156/BIO/2938079",
+            True: "10.15156/BIO/2938080",
+        },
+        "eukaryotes": {
+            False: "10.15156/BIO/2938081",
+            True: "10.15156/BIO/2938082",
+        },
+    },
+    # Old version 9.0 is not listed here
+    "8.3": {
+        "fungi": {
+            False: "10.15156/BIO/1264708",
+            True: "10.15156/BIO/1264763",
+        },
+        "eukaryotes": {
+            False: "10.15156/BIO/1264819",
+            True: "10.15156/BIO/1264861",
+        },
+    },
+    "8.2": {
+        "fungi": {
+            False: "10.15156/BIO/786385",
+            True: "10.15156/BIO/786387",
+        },
+        "eukaryotes": {
+            False: "10.15156/BIO/786386",
+            True: "10.15156/BIO/786388",
+        },
+    },
+}
+
+
+def _unite_get_url(
+    version: str = None, taxon_group: str = None, singletons: bool = None
+) -> str:
+    """Get DOI from included list, then query plutof API for UNITE url"""
+    # Get matching DOI
+    doi = UNITE_DOIS[version][taxon_group][singletons]
+    # Build URL
+    base_url = (
+        "https://api.plutof.ut.ee/v1/public/dois/"
+        "?format=vnd.api%2Bjson&identifier="
+    )
+    query_data = requests.get(base_url + doi).json()
+    # Updates can be made to files in a DOI, so on the advice of the devs,
+    # only return the last (newest) file with this -1  vv
+    URL = query_data["data"][0]["attributes"]["media"][-1]["url"]
+    return URL
+
+
+def _unite_get_tgz(
+    url: str = None, download_path: str = None, retries: int = 10
+) -> str:
+    """Download compressed database"""
+    for retry in range(retries):
+        # Track downloaded size
+        file_size = 0
+        # Prepair error text
+        dlfail = "File incomplete on try " + str(retry + 1)
+        try:
+            response = requests.get(url, stream=True)
+            # Save .tgz file
+            unite_file_path = os.path.join(download_path, "unitefile.tar.gz")
+            with open(unite_file_path, "wb") as f:
+                for chunk in response.iter_content(chunk_size=8192):
+                    if chunk:
+                        f.write(chunk)
+                        file_size += len(chunk)
+            # Check if the downloaded size matches the expected size
+            if file_size == int(response.headers.get("content-length", 0)):
+                return unite_file_path  # done!
+            else:
+                raise ValueError(dlfail)
+        except ValueError:
+            print(dlfail)
+            if retry + 1 == retries:
+                raise ValueError(dlfail)
+
+
+def _unite_get_artifacts(
+    tgz_file: str = None, cluster_id: str = "99"
+) -> (DataFrame, DNAIterator):
+    """
+    Find and import files with matching cluster_id from .tgz
+
+    Returns: Tuple containing tax_results and seq_results
+    """
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        # Extract from the .tgz file
+        with tarfile.open(tgz_file, "r:gz") as tar:
+            # Keep only _dev files
+            members = [m for m in tar.getmembers() if "_dev" in m.name]
+            if not members:
+                raise ValueError("No '_dev' files found in Unite .tgz file")
+            for member in members:
+                # Keep only base name
+                member.name = os.path.basename(member.name)
+                tar.extract(member, path=tmpdirname)
+        # Find and import the raw files...
+        for root, dirs, files in os.walk(tmpdirname):
+            # ... with the matching cluster_id
+            filtered_files = [
+                f for f in files if f.split("_")[4] == cluster_id
+            ]
+            if not filtered_files or len(filtered_files) != 2:
+                raise ValueError(
+                    "Expected 2, but found "
+                    + str(len(filtered_files))
+                    + " files found with cluster_id = "
+                    + cluster_id
+                )
+            for file in filtered_files:
+                fp = os.path.join(root, file)
+                if file.endswith(".txt"):
+                    taxa = TaxonomyFormat(fp, mode="r").view(DataFrame)
+                elif file.endswith(".fasta"):
+                    seqs = MixedCaseDNAFASTAFormat(fp, mode="r").view(
+                        DNAIterator
+                    )
+    return taxa, seqs
+
+
+def get_unite_data(
+    version: str = "9.0",
+    taxon_group: str = "eukaryotes",
+    cluster_id: str = "99",
+    singletons: bool = False,
+) -> (DataFrame, DNAIterator):
+    """
+    Get Qiime2 artifacts for a given version of UNITE
+
+    Returns: Tuple containing tax_results and seq_results
+    """
+    url = _unite_get_url(version, taxon_group, singletons)
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        tar_file_path = _unite_get_tgz(url, tmpdirname)
+        return _unite_get_artifacts(tar_file_path, cluster_id)
diff --git a/rescript/plugin_setup.py b/rescript/plugin_setup.py
@@ -49,6 +49,7 @@
 from rescript.ncbi import (
     get_ncbi_data, _default_ranks, _allowed_ranks, get_ncbi_data_protein)
 from .get_gtdb import get_gtdb_data
+from .get_unite import get_unite_data
 
 citations = Citations.load('citations.bib', package='rescript')
 
@@ -76,6 +77,11 @@
     'and be aware that earlier versions may be released under a different '
     'license.')
 
+# License notice appended to the description of the UNITE download action.
+UNITE_LICENSE_NOTE = (
+    'NOTE: THIS ACTION ACQUIRES DATA FROM UNITE, which is licensed under '
+    'CC BY-SA 4.0. To learn more, please visit https://unite.ut.ee/cite.php '
+    'and https://creativecommons.org/licenses/by-sa/4.0/.')
+
 VOLATILITY_PLOT_XAXIS_INTERPRETATION = (
     'The x-axis in these plots represents the taxonomic '
     'levels present in the input taxonomies so are labeled numerically '
@@ -974,6 +980,40 @@
 )
 
 
+plugin.methods.register_function(
+    function=get_unite_data,
+    inputs={},
+    parameters={
+        'version': Str % Choices(['9.0', '8.3', '8.2']),
+        'taxon_group': Str % Choices(['fungi', 'eukaryotes']),
+        'cluster_id': Str % Choices(['99', '97', 'dynamic']),
+        'singletons': Bool,
+        },
+    outputs=[('taxonomy', FeatureData[Taxonomy]),
+             ('sequences', FeatureData[Sequence])],
+    input_descriptions={},
+    parameter_descriptions={
+        'version': 'UNITE version to download.',
+        'taxon_group': 'Download a database with only \'fungi\' '
+                       'or including all \'eukaryotes\'.',
+        'cluster_id': 'Percent similarity at which sequences in '
+                      'the of database were clustered.',
+        'singletons': 'Include singleton clusters in the database.'},
+    output_descriptions={
+        'taxonomy': 'UNITE reference taxonomy.',
+        'sequences': 'UNITE reference sequences.'},
+    name='Download and import UNITE reference data.',
+    description=(
+        'Download and import ITS sequences and taxonomy from the '
+        'UNITE database, given a '
+        'version number and taxon_group, with the option to select a '
+        'cluster_id and include singletons. '
+        'Downloads data directly from UNITE\'s PlutoF REST API. ' +
+        UNITE_LICENSE_NOTE),
+    citations=[citations['nilsson2019unite']]
+)
+
+
 plugin.methods.register_function(
     function=filter_taxa,
     inputs={'taxonomy': FeatureData[Taxonomy]},

diff --git a/rescript/tests/data/unitefile.tgz b/rescript/tests/data/unitefile.tgz
diff --git a/rescript/tests/data/unitefile_no_dev.tgz b/rescript/tests/data/unitefile_no_dev.tgz
diff --git a/rescript/tests/test_get_unite.py b/rescript/tests/test_get_unite.py
@@ -0,0 +1,100 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2019-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+import pkg_resources
+import tempfile
+import pandas.core.frame
+import q2_types.feature_data
+from qiime2.plugin.testing import TestPluginBase
+from rescript.get_unite import (
+    UNITE_DOIS,
+    _unite_get_url,
+    _unite_get_tgz,
+    _unite_get_artifacts,
+    get_unite_data,
+)
+
+from urllib.request import urlopen
+from unittest.mock import patch, Mock
+
+
+class TestGetUNITE(TestPluginBase):
+    package = "rescript.tests"
+
+    def setUp(self):
+        super().setUp()
+        self.unitefile = pkg_resources.resource_filename(
+            "rescript.tests", "data/unitefile.tgz"
+        )
+        self.unitefile_no_dev = pkg_resources.resource_filename(
+            "rescript.tests", "data/unitefile_no_dev.tgz"
+        )
+
+    # Requires internet access
+    def test_unite_get_url(self):
+        # for all combinations...
+        for v in UNITE_DOIS.keys():
+            for tg in UNITE_DOIS[v].keys():
+                for s in UNITE_DOIS[v][tg].keys():
+                    # ... try to get the URL
+                    url = _unite_get_url(v, tg, s)
+                    urlopen(url)
+        self.assertTrue(True)
+
+    def test_unite_get_tgz(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            # mock the response object
+            mock_response = Mock()
+            mock_response.iter_content.return_value = [b"mock"]
+            mock_response.headers.get.return_value = "4"  # matches content
+            # mock successful download
+            with patch("requests.get", return_value=mock_response):
+                _unite_get_tgz("fakeURL", tmpdirname)
+            # real failed download
+            with self.assertRaisesRegex(ValueError, "File incomplete on try"):
+                _unite_get_tgz("https://files.plutof.ut.ee/nope", tmpdirname)
+
+    def test_unite_get_artifacts(self):
+        # Test on small data/unitefile.tgz with two items inside
+        res_one, res_two = _unite_get_artifacts(
+            self.unitefile, cluster_id="97"
+        )
+        # Column names and one feature from TaxonomyFormat
+        self.assertEqual(
+            res_one["Taxon"]["SH1140752.08FU_UDB013072_reps"],
+            "k__Fungi;p__Basidiomycota;c__Agaricomycetes;o__Thelephorales;"
+            "f__Thelephoraceae;g__Tomentella;s__unidentified",
+        )
+        self.assertEqual(
+            str(type(res_two)),
+            "<class 'q2_types.feature_data._transformer.DNAIterator'>",
+        )
+        # test no _dev files found
+        with self.assertRaises(ValueError):
+            _unite_get_artifacts(self.unitefile_no_dev, cluster_id="97")
+        # test missing files or misspelled cluster_id
+        with self.assertRaises(ValueError):
+            _unite_get_artifacts(self.unitefile, "nothing")
+
+    # This tests the function with toy data.
+    # All relevant internals are tested elsewhere in this test class.
+    # Downloading is mock`ed with patch.
+    def test_get_unite_data(self):
+        with patch(
+            "rescript.get_unite._unite_get_tgz", return_value=self.unitefile
+        ):
+            res = get_unite_data(
+                version="8.3", taxon_group="fungi", cluster_id="97"
+            )
+            self.assertEqual(len(res), 2)
+            self.assertTrue(isinstance(res[0], pandas.core.frame.DataFrame))
+            self.assertTrue(
+                isinstance(
+                    res[1], q2_types.feature_data._transformer.DNAIterator
+                )
+            )