bokulich-lab · Sann5 · Jan 29, 2024 · Dec 22, 2023 · Dec 22, 2023 · Jan 11, 2024
diff --git a/q2_moshpit/citations.bib b/q2_moshpit/citations.bib
@@ -123,3 +123,9 @@ @article{buchfink_sensitive_2021
 	keywords = {Computational biology and bioinformatics, Genome informatics, Genomic analysis, Sequencing, Software},
 	pages = {366--368},
 }
+
+@misc{NCBI,
+  title = {National Center for Biotechnology Information (NCBI)},
+  url = {https://www.ncbi.nlm.nih.gov/},
+  note = {Bethesda (MD): National Library of Medicine (US), National Center for Biotechnology Information;},
+}
diff --git a/q2_moshpit/eggnog/__init__.py b/q2_moshpit/eggnog/__init__.py
@@ -8,11 +8,12 @@
 from ._method import eggnog_diamond_search, eggnog_annotate
 from ._dbs import (
     fetch_eggnog_db, fetch_diamond_db, build_custom_diamond_db,
-    fetch_eggnog_proteins
+    fetch_eggnog_proteins, build_eggnog_diamond_db, fetch_ncbi_taxonomy
 )
 
 
 __all__ = [
     'eggnog_diamond_search', 'eggnog_annotate', 'fetch_eggnog_db',
-    'fetch_diamond_db', 'build_custom_diamond_db', 'fetch_eggnog_proteins'
+    'fetch_diamond_db', 'build_custom_diamond_db', 'fetch_eggnog_proteins',
+    'build_eggnog_diamond_db', 'fetch_ncbi_taxonomy'
 ]
diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py
@@ -6,7 +6,10 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 import os
+import datetime
+import pandas as pd
 from q2_types.feature_data import ProteinSequencesDirectoryFormat
+import shutil
 from q2_types_genomics.reference_db import (
     EggnogRefDirFmt, DiamondDatabaseDirFmt, NCBITaxonomyDirFmt,
     EggnogProteinSequencesDirFmt
@@ -174,3 +177,141 @@ def fetch_eggnog_proteins() -> EggnogProteinSequencesDirFmt:
     ))
 
     return eggnog_fa
+
+
+def build_eggnog_diamond_db(
+        eggnog_proteins: EggnogProteinSequencesDirFmt,
+        taxon: int
+) -> DiamondDatabaseDirFmt:
+    """
+    Build a DIAMOND database restricted to the protein sequences of the
+    given taxon.
+
+    The taxon ID is validated first, then `create_dbs.py` is run on the
+    eggnog_proteins directory and the resulting `ref_db.dmnd` file is
+    moved into a new DiamondDatabaseDirFmt, which is returned.
+    """
+    proteins_dir = str(eggnog_proteins)
+    db_file = "ref_db.dmnd"
+
+    # Validate before the output object is instantiated.
+    _validate_taxon_id(eggnog_proteins, taxon)
+    result = DiamondDatabaseDirFmt()
+
+    run_command([
+        "create_dbs.py",
+        "--data_dir", proteins_dir,
+        "--taxids", str(taxon),
+        "--dbname", "ref_db",
+    ])
+
+    # create_dbs.py writes its output into the input directory, so the
+    # database file has to be relocated into the output format directory.
+    target = os.path.join(str(result), db_file)
+    shutil.move(os.path.join(proteins_dir, db_file), target)
+    return result
+
+
+def _validate_taxon_id(eggnog_proteins, taxon):
+    """
+    Raise a ValueError unless `taxon` occurs in at least one lineage
+    listed in the e5.taxid_info.tsv file of the eggnog_proteins directory.
+    """
+    taxid_info = pd.read_csv(
+        os.path.join(str(eggnog_proteins), "e5.taxid_info.tsv"),
+        sep="\t"
+    )
+
+    # Collect every ID mentioned in any of the comma-separated lineages
+    tax_ids = set()
+    for lineage in taxid_info["Taxid Lineage"]:
+        tax_ids.update(lineage.strip().split(","))
+
+    # Compare the ID as one whole string: set(str(taxon)) would split it
+    # into single digits, so an invalid ID passes whenever one of its
+    # digits is itself a listed ID.
+    if str(taxon) not in tax_ids:
+        raise ValueError(
+            f"'{taxon}' is not valid taxon ID. "
+            "To view all valid taxon IDs inspect e5.taxid_info.tsv "
+            "file in the input eggnog_proteins input."
+        )
+
+
+def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
+    """
+    Download the NCBI taxonomy dump files and prot.accession2taxid.gz
+    into a new NCBITaxonomyDirFmt and return it.
+
+    The output directory receives names.dmp, nodes.dmp,
+    prot.accession2taxid.gz and a version.tsv written by
+    _write_version_tsv.
+
+    Downloading, extraction and cleanup are delegated to the external
+    `wget`, `unzip` and `rm` commands through run_command.
+    """
+    ncbi_data = NCBITaxonomyDirFmt()
+    out_dir = str(ncbi_data)
+    archive = os.path.join(out_dir, "taxdmp.zip")
+    accession_table = os.path.join(out_dir, "prot.accession2taxid.gz")
+
+    # Taxonomy dump: fetch the archive, extract only the two *.dmp files
+    # that are needed and delete the archive afterwards.
+    print(colorify("Downloading *.dmp files..."))
+    run_command(cmd=[
+        "wget", "-O", archive,
+        "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip"
+    ])
+    run_command(cmd=[
+        "unzip", "-j", archive, "names.dmp", "nodes.dmp",
+        "-d", out_dir
+    ])
+    run_command(cmd=["rm", archive])
+
+    # Proteins file (prot.accession2taxid.gz)
+    print(colorify("Downloading proteins file (~15 GB)..."))
+    run_command(cmd=[
+        "wget", "-O", accession_table,
+        "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/"
+        "prot.accession2taxid.gz"
+    ])
+
+    # Version file: pass the paths of the three data files plus the
+    # destination of the TSV
+    print(colorify("Constructing version file..."))
+    _write_version_tsv(
+        os.path.join(out_dir, "nodes.dmp"),
+        os.path.join(out_dir, "names.dmp"),
+        accession_table,
+        os.path.join(out_dir, "version.tsv")
+    )
+
+    print(colorify(
+        "Done! Moving data from temporary directory to final location..."
+    ))
+    return ncbi_data
+
+
+def _write_version_tsv(nodes, names, proteins, version):
+    """
+    Write a TSV to `version` with the last modification date and time of
+    the nodes.dmp, names.dmp and prot.accession2taxid.gz files.
+    """
+    # Pair each label with its own file (names/nodes used to be swapped)
+    paths = {
+        'names.dmp': names,
+        'nodes.dmp': nodes,
+        'prot.accession2taxid.gz': proteins,
+    }
+    # datetime.datetime (not datetime.date) keeps the time of day, so
+    # the 'time' column is no longer always 00:00:00.
+    stamps = [
+        datetime.datetime.fromtimestamp(os.path.getmtime(path))
+        for path in paths.values()
+    ]
+    data = {
+        'file_name': list(paths),
+        'date': [stamp.strftime('%d/%m/%Y') for stamp in stamps],
+        'time': [stamp.strftime('%H:%M:%S') for stamp in stamps],
+    }
+    pd.DataFrame(data).to_csv(version, sep='\t', index=False)