
get_unite_data functions #134

Merged
64 commits merged on Nov 13, 2023

Changes from 7 commits (64 commits total)
4e59f17
Add _assemble_unite_data_urls and testing code
colinbrislawn Apr 10, 2022
2d17273
update unite url lookup
colinbrislawn Oct 5, 2023
91de774
add _assemble_unite_data_urls() and DOI dict
colinbrislawn Oct 6, 2023
0c8e1b2
Delete .vscode/settings.json
colinbrislawn Oct 6, 2023
97c23ab
Add draft _unite_download_targz()'
colinbrislawn Oct 6, 2023
98de886
Merge branch 'get_unite' of https://github.com/colinbrislawn/RESCRIPt…
colinbrislawn Oct 6, 2023
78414b1
Add examples for questions
colinbrislawn Oct 7, 2023
d7dc1e2
add imporing tests
colinbrislawn Oct 11, 2023
6a55d8f
revert get_data.py and splint out unite
colinbrislawn Oct 11, 2023
ab53997
simply _unite_doi_to_url
colinbrislawn Oct 11, 2023
8668d23
unmuddle _get_unite_* functions
colinbrislawn Oct 12, 2023
c2dd4cf
rename function, again
colinbrislawn Oct 12, 2023
2d69773
remove old Silva stuff
colinbrislawn Oct 12, 2023
ff30cb9
Merge branch 'bokulich-lab:master' into get_unite
colinbrislawn Oct 12, 2023
da73365
Add working get_unite_data()!
colinbrislawn Oct 12, 2023
f2035c7
add alt method get_unite_data2()
colinbrislawn Oct 12, 2023
4b2688d
major updateto get_unite_data
colinbrislawn Oct 14, 2023
0beac8c
linting with flake8
colinbrislawn Oct 15, 2023
a63dd1f
remove unneeded import
colinbrislawn Oct 16, 2023
d09cd46
remove unitefile.tar.gz and change cluster_id filter
colinbrislawn Oct 16, 2023
537ccdb
rename for consistancy
colinbrislawn Oct 17, 2023
7443eb6
flake8 and remove unite v8.0
colinbrislawn Oct 17, 2023
e0d3c84
match files by _dev and check for unfinished download
colinbrislawn Oct 17, 2023
e674fde
fix labels
colinbrislawn Oct 18, 2023
2607842
fixes to retry download loop
colinbrislawn Oct 18, 2023
3f072ce
major update, rename and reorg functions
colinbrislawn Oct 20, 2023
e405f81
autoformat with black
colinbrislawn Oct 20, 2023
3587f10
add early testing
colinbrislawn Oct 21, 2023
a3589c5
Add alt tests for get_doi and get_url
colinbrislawn Oct 21, 2023
a728d59
add nilsson2019unite
colinbrislawn Oct 23, 2023
3991ed0
add check to _unite_get_artifacts
colinbrislawn Oct 23, 2023
8346ea8
major update to testing code
colinbrislawn Oct 23, 2023
31cb706
add global test_get_unite_data and alt
colinbrislawn Oct 23, 2023
2414dfe
merge _unite_get_doi into _unite_get_url
colinbrislawn Oct 24, 2023
019a40a
remove unused code
colinbrislawn Oct 24, 2023
f637891
return tuple of Artifacts, without lists
colinbrislawn Oct 24, 2023
aed4d06
update unite citation
colinbrislawn Oct 24, 2023
3f9dafc
remove extra print lines
colinbrislawn Oct 24, 2023
1bc5231
update unite description and license
colinbrislawn Oct 24, 2023
657fd2b
fixing types
colinbrislawn Oct 25, 2023
b20c29a
first working build
colinbrislawn Oct 26, 2023
42e3ca1
test bad URL in _unite_get_tgz
colinbrislawn Oct 27, 2023
2781122
set all defaults for get_unite_data()
colinbrislawn Nov 1, 2023
d5cf2d4
removing unneeded / uncommon error handeling in _unite_get_tgz
colinbrislawn Nov 1, 2023
af258d5
tests
colinbrislawn Nov 1, 2023
b371d96
Update defaults
colinbrislawn Nov 2, 2023
3d4e53e
remove unneeded import of HTTPError
colinbrislawn Nov 3, 2023
0ed5f8f
Test output types for get_unite_data()
colinbrislawn Nov 6, 2023
032e220
formating
colinbrislawn Nov 6, 2023
6da7608
remove full unite abstract
colinbrislawn Nov 6, 2023
f90f25d
reword error for wrong number of cluster_id files
colinbrislawn Nov 6, 2023
29a3763
update text
colinbrislawn Nov 8, 2023
d6ed1a3
use one dlfail in _unite_get_tgz
colinbrislawn Nov 8, 2023
76d92b5
use self.assertRaisesRegex
colinbrislawn Nov 8, 2023
4317d77
Add mock download code
colinbrislawn Nov 8, 2023
52a9ed1
roll back mock download
colinbrislawn Nov 8, 2023
79e7384
Merge branch 'get_unite' of https://github.com/colinbrislawn/RESCRIPt…
colinbrislawn Nov 8, 2023
2bc4735
rollback, again
colinbrislawn Nov 8, 2023
4e39434
rollback, again
colinbrislawn Nov 8, 2023
8747b47
add back URL
colinbrislawn Nov 8, 2023
c674b00
add mocked test_unite_get_tgz2
colinbrislawn Nov 8, 2023
4f67f0f
use MixedCaseDNAFASTAFormat
colinbrislawn Nov 10, 2023
592905b
black format
colinbrislawn Nov 10, 2023
55109fc
test get_tgz with mock
colinbrislawn Nov 13, 2023
119 changes: 119 additions & 0 deletions rescript/get_data.py
@@ -12,13 +12,132 @@
import shutil
import gzip
import warnings
import requests
import tarfile

import qiime2
from urllib.request import urlretrieve
from urllib.error import HTTPError
from q2_types.feature_data import RNAFASTAFormat


def _unite_dois_to_urls(DOIs):
'''Generate UNITE urls, given their DOIs.'''
# Make DOIs iterable
DOIs = [DOIs] if isinstance(DOIs, str) else DOIs
print('Get URLs for these DOIs:', DOIs)
base_url = 'https://api.plutof.ut.ee/v1/public/dois/'\
'?format=vnd.api%2Bjson&identifier='
# Eventual output
URLs = set()
# For each DOI, get download URL of file
for DOI in DOIs:
query_data = requests.get(base_url + DOI).json()
# Files attached to a DOI can be updated, so on the advice of the PlutoF devs,
# take only the most recently uploaded file (the -1 index below)
URL = query_data['data'][0]['attributes']['media'][-1]['url']
URLs.add(URL)
return URLs
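
A minimal usage sketch of the helper above (hypothetical interactive session; assumes network access to api.plutof.ut.ee and uses the UNITE 9.0 fungi DOI from the lookup table below):

# Hypothetical usage sketch, not part of this PR.
urls = _unite_dois_to_urls('10.15156/BIO/2938079')
print(urls)  # a set containing one download URL hosted on files.plutof.ut.ee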


def _unite_get_url(version, taxon_group, singletons):
'''Generate UNITE urls, given database version and reference target.'''
# Lookup DOIs for various databases, source: https://unite.ut.ee/repository.php
unite_dois = {
'9.0': {'fungi': {False: '10.15156/BIO/2938079', True: '10.15156/BIO/2938080'},
'eukaryotes': {False: '10.15156/BIO/2938081', True: '10.15156/BIO/2938082'}},
# Old version 9.0 is not listed here
'8.3': {'fungi': {False: '10.15156/BIO/1264708', True: '10.15156/BIO/1264763'},
'eukaryotes': {False: '10.15156/BIO/1264819', True: '10.15156/BIO/1264861'}},
'8.2': {'fungi': {False: '10.15156/BIO/786385', True: '10.15156/BIO/786387'},
'eukaryotes': {False: '10.15156/BIO/786386', True: '10.15156/BIO/786388'}},
'8.0': {'fungi': {False: '', True: '10.15156/BIO/786349'},
'eukaryotes': {False: '', True: ''}}, # All other 8.0 are in zip files
}
# There's got to be a better way! See https://stackoverflow.com/questions/25833613/safe-method-to-get-value-of-nested-dictionary
try:
# Check if we have the DOI requested
target_doi = unite_dois[version][taxon_group][singletons]
except KeyError as ke:
print('Unknown DOI for this value: ' + str(ke))
raise
return _unite_dois_to_urls(target_doi).pop()
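
The StackOverflow link above asks for a cleaner nested lookup; one possible alternative, shown only as a sketch and not part of this PR, is chained dict.get() calls with an explicit check:

# Sketch of an alternative lookup (hypothetical, not in this PR);
# assumes the same unite_dois dict and function parameters as above.
target_doi = unite_dois.get(version, {}).get(taxon_group, {}).get(singletons)
if target_doi is None:
    raise ValueError('No UNITE DOI known for version=%s, taxon_group=%s, '
                     'singletons=%s' % (version, taxon_group, singletons))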

_unite_get_url(version='9.0', taxon_group='fungi', singletons=False)

# with tempfile.TemporaryDirectory() as tmp_dir:
tmp_dir = tempfile.mkdtemp()

def _unite_download_targz(url, download_path):
print('Downloading ' + url)

response = requests.get(url, stream=True)
if response.status_code != 200:
raise ValueError("Failed to download the file from " + url)

tar_file_path = os.path.join(download_path, 'unitefile.tar.gz')
with open(tar_file_path, 'wb') as f:
f.write(response.content)

# Extract only the 'developer' subdirectory
with tarfile.open(tar_file_path, 'r:gz') as tar:
# Ensure that 'developer' exists in the tar file
members = [member for member in tar.getmembers() if member.name.startswith('developer')]
if not members:
raise ValueError("No 'developer' subdirectory found in the .tar.gz file.")

for member in members:
member.name = os.path.basename(member.name) # Strip the 'developer' prefix
tar.extract(member, path=download_path)

return download_path
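
Since the request is opened with stream=True, the archive could also be written in chunks instead of buffering response.content in memory; a sketch only, reusing the response and tar_file_path names from the function above:

# Sketch: chunked write avoids holding the whole .tar.gz in memory.
with open(tar_file_path, 'wb') as f:
    for chunk in response.iter_content(chunk_size=1024 * 1024):
        f.write(chunk)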

# Test it by downloading this file
# _unite_download_targz('https://files.plutof.ut.ee/public/orig/59/12/591225E8985EFC44B595C79AF5F467421B4D9A95093A0811B13CB4CC13A6DA46.tgz', tmp_dir)

# import as artifacts
# results[name] = qiime2.Artifact.import_data(dtype, destination)

def get_unite_data(version, taxon_group, singletons=False):
url = _unite_get_url(version, taxon_group, singletons)
results = {'sequences': [], 'taxonomy': []}

# with tempfile.TemporaryDirectory() as tmp_dir:
tmp_dir = tempfile.mkdtemp()

print('Temporary directory:', tmp_dir)
_unite_download_targz(url, download_path=tmp_dir)

for root, dirs, files in os.walk(tmp_dir):
for file in files:
print(results)
if file.endswith('.fasta'):
fasta_file_name = os.path.join(root, file)
print('found fasta: ' + fasta_file_name)
with open(fasta_file_name, 'r') as fasta_file:
# Read the content of the file and append it as a Python object
fasta_content = fasta_file.read()
results['sequences'].append(qiime2.Artifact.import_data('FeatureData[RNASequence]', fasta_content))
elif file.endswith('.txt'):
txt_file_name = os.path.join(root, file)
print('found txt: ' + txt_file_name)
results['taxonomy'].append(qiime2.Artifact.import_data('FeatureData[Taxonomy]', txt_file_name))
return results

get_unite_data(version='9.0', taxon_group='fungi')
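
A hypothetical follow-on to the test call above, showing how the returned artifacts could be written to disk (output filenames are illustrative only):

# Hypothetical usage: persist the imported artifacts as .qza files.
res = get_unite_data(version='9.0', taxon_group='fungi')
for i, seqs in enumerate(res['sequences']):
    seqs.save('unite-9.0-fungi-seqs-%d.qza' % i)
for i, tax in enumerate(res['taxonomy']):
    tax.save('unite-9.0-fungi-tax-%d.qza' % i)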


# How do I import data?
with open("/tmp/tmpzprzopiu/sh_refs_qiime_ver9_97_25.07.2023_dev.fasta", 'r') as fasta_file:
qiime2.Artifact.import_data('FeatureData[RNASequence]', fasta_file)

with open("/tmp/tmpzprzopiu/sh_refs_qiime_ver9_97_25.07.2023_dev.fasta", 'r') as fasta_file:
# Read the content of the file and append it as a Python object
fasta_content = fasta_file.read()
qiime2.Artifact.import_data('FeatureData[RNASequence]', fasta_content)
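
One answer to the question above, as a sketch: qiime2.Artifact.import_data() also accepts a filesystem path, so the path can be passed directly instead of an open file handle or the file contents (path reused from the example above; this assumes the file validates against the default format for the given semantic type):

# Sketch: import by path rather than by file handle or file contents.
artifact = qiime2.Artifact.import_data(
    'FeatureData[RNASequence]',
    '/tmp/tmpzprzopiu/sh_refs_qiime_ver9_97_25.07.2023_dev.fasta')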



def get_silva_data(ctx,
version='138.1',
target='SSURef_NR99',
Expand Down