bokulich-lab · misialq · Jul 3, 2024 · May 17, 2024 · May 21, 2024 · May 22, 2024
diff --git a/q2_moshpit/citations.bib b/q2_moshpit/citations.bib
@@ -1,3 +1,9 @@
+@misc{noauthor_hmmer_nodate,
+	title = {{HMMER}},
+	url = {http://hmmer.org/},
+	urldate = {2024-05-22},
+}
+
 @InProceedings{ mckinney-proc-scipy-2010,
   author    = { Wes McKinney },
   title     = { Data Structures for Statistical Computing in Python },

diff --git a/q2_moshpit/eggnog/__init__.py b/q2_moshpit/eggnog/__init__.py
@@ -11,13 +11,21 @@
 )
 from ._dbs import (
     fetch_eggnog_db, fetch_diamond_db, build_custom_diamond_db,
-    fetch_eggnog_proteins, build_eggnog_diamond_db, fetch_ncbi_taxonomy
+    fetch_eggnog_proteins, build_eggnog_diamond_db, fetch_ncbi_taxonomy,
+    fetch_eggnog_hmmer_db
+)
+from q2_moshpit.eggnog._type import EggnogHmmerIdmap
+from q2_moshpit.eggnog._format import (
+    EggnogHmmerIdmapDirectoryFmt, EggnogHmmerIdmapFileFmt
 )
 
 
 __all__ = [
     'eggnog_diamond_search', '_eggnog_diamond_search', 'eggnog_annotate',
     '_eggnog_feature_table', 'fetch_eggnog_db', 'fetch_diamond_db',
     'build_custom_diamond_db', 'fetch_eggnog_proteins',
-    'build_eggnog_diamond_db', 'fetch_ncbi_taxonomy', '_eggnog_annotate'
+    'build_eggnog_diamond_db', 'fetch_ncbi_taxonomy', '_eggnog_annotate',
+    # HMMER database fetcher plus the ID-map semantic type and its formats
+    'fetch_eggnog_hmmer_db', 'EggnogHmmerIdmapFileFmt', 'EggnogHmmerIdmap',
+    'EggnogHmmerIdmapDirectoryFmt'
 ]
diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py
@@ -14,11 +14,21 @@
     EggnogRefDirFmt, DiamondDatabaseDirFmt, NCBITaxonomyDirFmt,
     EggnogProteinSequencesDirFmt
 )
-from .._utils import (
+from q2_types.profile_hmms import (
+    ProteinMultipleProfileHmmDirectoryFmt,
+    PressedProfileHmmsDirectoryFmt
+)
+from q2_types.genome_data import ProteinsDirectoryFormat
+from q2_moshpit._utils import (
     run_command, _process_common_input_params, colorify,
     _calculate_md5_from_file
 )
-from ._utils import _parse_build_diamond_db_params
+from q2_moshpit.eggnog._utils import (
+    _parse_build_diamond_db_params, _download_and_build_hmm_db,
+    _download_fastas_into_hmmer_db, _try_wget
+)
+from q2_moshpit.eggnog._format import EggnogHmmerIdmapDirectoryFmt
+import tempfile
 
 
 def fetch_eggnog_db() -> EggnogRefDirFmt:
@@ -233,12 +243,14 @@ def _validate_taxon_id(eggnog_proteins, taxon):
         )
 
     # Check for overlap with provided taxon id
-        if not str(taxon) in tax_ids:
-            raise ValueError(
-                f"'{taxon}' is not valid taxon ID. "
-                "To view all valid taxon IDs inspect e5.taxid_info.tsv "
-                "file in the eggnog_proteins input."
-            )
+    if not str(taxon) in tax_ids:
+        raise ValueError(
+            f"'{taxon}' is not valid taxon ID. "
+            "To view all valid taxon IDs inspect e5.taxid_info.tsv. "
+            "You can download it with this command: "
+            "wget "
+            "http://eggnog5.embl.de/download/eggnog_5.0/e5.taxid_info.tsv"
+        )
 
 
 def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
@@ -321,3 +333,43 @@ def _collect_and_compare_md5(path_to_md5: str, path_to_file: str):
 
     # If no exception is raised, remove md5 file
     os.remove(path_to_md5)
+
+
+def fetch_eggnog_hmmer_db(taxon_id: int) -> (
+    EggnogHmmerIdmapDirectoryFmt,
+    ProteinMultipleProfileHmmDirectoryFmt,
+    PressedProfileHmmsDirectoryFmt,
+    ProteinsDirectoryFormat
+):
+    """Fetch the eggNOG HMMER database artifacts for a single taxon.
+
+    The taxon ID is first checked with ``_validate_taxon_id`` against a
+    freshly downloaded ``e5.taxid_info.tsv``. Afterwards the HMM database
+    is downloaded and built, and the FASTA files are downloaded and
+    processed.
+
+    Returns the ID map, the merged HMM profiles, the pressed HMM profiles
+    and the protein FASTA files, in that order.
+    """
+    def _announce(message):
+        print(colorify(message))
+
+    taxid_info_url = (
+        "http://eggnog5.embl.de/download/eggnog_5.0/e5.taxid_info.tsv"
+    )
+
+    # The taxonomy table is only needed for the validation step, so it is
+    # kept in a throw-away directory.
+    with tempfile.TemporaryDirectory() as scratch_dir:
+        _announce("Validating taxon ID: \nDownloading taxonomy file...")
+        _try_wget(
+            f"{scratch_dir}/e5.taxid_info.tsv",
+            taxid_info_url,
+            "Error during taxon-info-file download"
+        )
+        _validate_taxon_id(scratch_dir, taxon_id)
+
+    # HMM profiles: ID map, merged profiles and pressed profiles
+    _announce(
+        "Valid taxon ID. \n"
+        "Proceeding with HMMER database download and build..."
+    )
+    hmm_artifacts = _download_and_build_hmm_db(taxon_id)
+    idmap, hmmer_db, pressed_hmmer_db = hmm_artifacts
+    _announce(
+        "HMM database built successfully. \n"
+        "Proceeding with FASTA files download and processing..."
+    )
+
+    # Protein sequences belonging to the same taxon
+    fastas = _download_fastas_into_hmmer_db(taxon_id)
+    _announce(
+        "FASTA files processed successfully. \n"
+        "Moving data from temporary to final location..."
+    )
+
+    return idmap, hmmer_db, pressed_hmmer_db, fastas
diff --git a/q2_moshpit/eggnog/_format.py b/q2_moshpit/eggnog/_format.py
@@ -0,0 +1,44 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+import re
+from qiime2.plugin import model
+from qiime2.core.exceptions import ValidationError
+
+
+class EggnogHmmerIdmapFileFmt(model.TextFileFormat):
+    """Text format of an eggNOG ``.hmm.idmap`` file.
+
+    Every line must read ``<index> <code>``: a decimal index equal to the
+    1-based line number, a single space, and a code made of upper-case
+    letters and digits.
+    """
+
+    def _validate_(self, level):
+        """Check the leading lines of the file.
+
+        ``level`` is ``"min"`` (first 100 lines) or ``"max"`` (first
+        10,000,000 lines). Raises ``ValidationError`` on the first line
+        that is malformed or whose index differs from its line number.
+        """
+        with open(str(self), 'r') as file:
+            # Set the number of rows to be parsed
+            max_lines = {"min": 100, "max": 10000000}[level]
+            line_pattern = re.compile(r'^(\d+) ([A-Z0-9]+)$')
+
+            # Iterate the handle lazily: readlines() would load the whole
+            # file even when only the first ``max_lines`` are checked.
+            for i, line in enumerate(file, 1):
+                # Check number of lines parsed so far
+                if i > max_lines:
+                    break
+
+                # Validate line
+                match = line_pattern.match(line)
+                if match is None:
+                    raise ValidationError(
+                        f"Invalid line {i}.\n"
+                        f"{line} \n"
+                        "Expected index and an alphanumeric code separated "
+                        "by a single space."
+                    )
+
+                # Check index is equal to line number
+                idx = match.group(1)
+                if idx != str(i):
+                    raise ValidationError(
+                        f"Invalid line {i}.\n"
+                        f"{line} \n"
+                        f"Expected index {i} but got {idx} instead.\n"
+                    )
+
+
+class EggnogHmmerIdmapDirectoryFmt(model.DirectoryFormat):
+    """Directory format wrapping an eggNOG HMMER ID-map file."""
+    # Member file: its name must match ``.*\.hmm\.idmap`` and its content
+    # is validated as EggnogHmmerIdmapFileFmt.
+    idmap = model.File(r'.*\.hmm\.idmap', format=EggnogHmmerIdmapFileFmt)
diff --git a/q2_moshpit/eggnog/_type.py b/q2_moshpit/eggnog/_type.py
@@ -0,0 +1,10 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+from qiime2.plugin import SemanticType
+
+# Semantic type named 'EggnogHmmerIdmap'.
+# NOTE(review): presumably registered against EggnogHmmerIdmapDirectoryFmt
+# in the plugin setup, which is not shown here - confirm.
+EggnogHmmerIdmap = SemanticType('EggnogHmmerIdmap')
diff --git a/q2_moshpit/eggnog/_utils.py b/q2_moshpit/eggnog/_utils.py
@@ -5,7 +5,20 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
+import re
+import os
+import gzip
+import shutil
+from tqdm import tqdm
+import tempfile
+import subprocess
 from typing import List
+from .._utils import run_command, colorify
+from q2_types.profile_hmms import (
+    ProteinMultipleProfileHmmDirectoryFmt, PressedProfileHmmsDirectoryFmt
+)
+from q2_types.genome_data import ProteinsDirectoryFormat
+from q2_moshpit.eggnog._format import EggnogHmmerIdmapDirectoryFmt
 
 
 def _parse_build_diamond_db_params(arg_key, arg_val) -> List[str]:
@@ -26,3 +39,112 @@ def _parse_build_diamond_db_params(arg_key, arg_val) -> List[str]:
         return [f"--{arg_key}"]
     else:
         return [f"--{arg_key}", str(arg_val)]
+
+
+def _download_and_build_hmm_db(taxon_id):
+    """Download the eggNOG HMMs of a taxon and build a HMMER database.
+
+    Downloads ``{taxon_id}_hmms.tar.gz`` from eggnog5.embl.de, extracts
+    it, appends every ``*.hmm`` profile to one merged file (removing the
+    ``.faa.final_tree[.fa]`` suffix from each ``NAME`` line), writes the
+    matching ``{taxon_id}.hmm.idmap`` and runs ``hmmpress`` on the merged
+    file.
+
+    Parameters
+    ----------
+    taxon_id
+        Taxon ID used to build the download URL and output file names.
+
+    Returns
+    -------
+    tuple
+        ``(idmap_obj, hmm_db_obj, pressed_hmm_db_obj)``: the directory
+        formats holding the ID map, the merged ``.hmm`` file and the
+        files produced by ``hmmpress``, respectively.
+
+    Raises
+    ------
+    Exception
+        Raised by ``_try_wget`` when the download fails.
+    """
+    pressed_hmm_db_obj = PressedProfileHmmsDirectoryFmt()
+    hmm_db_obj = ProteinMultipleProfileHmmDirectoryFmt()
+    idmap_obj = EggnogHmmerIdmapDirectoryFmt()
+
+    # The merged file is first written next to the pressed files, because
+    # hmmpress is run on this path; it is moved to hmm_db_obj at the end.
+    hmms_merged_p = f"{str(pressed_hmm_db_obj)}/{taxon_id}.hmm"
+    idmap_p = f"{str(idmap_obj)}/{taxon_id}.hmm.idmap"
+
+    with tempfile.TemporaryDirectory() as tmp:
+        _try_wget(
+            f"{tmp}/{taxon_id}_hmms.tar.gz",
+            "http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/"
+            f"{taxon_id}/{taxon_id}_hmms.tar.gz",
+            "Error during HMMER database download"
+        )
+
+        # Extracting
+        print(colorify("Decompressing..."))
+        run_command(
+            cmd=["tar", "zxf", f"{taxon_id}_hmms.tar.gz"],
+            cwd=tmp
+        )
+
+        # Merge hmm files + write .hmm.idmap
+        print(colorify("Merging hmm files..."))
+        with open(hmms_merged_p, "a") as hmms, open(idmap_p, "a") as idmap:
+            # 1-based position of each profile inside the merged file.
+            # It must only advance when a profile is actually written:
+            # a per-directory file counter would skip numbers for
+            # non-.hmm files and restart in every sub-directory, giving
+            # an idmap whose indices do not match its line numbers.
+            hmm_index = 0
+
+            # Iterate through all decompressed files
+            for root, _dirnames, files in os.walk(f"{tmp}/{taxon_id}"):
+                for file in tqdm(files):
+                    if not file.endswith(".hmm"):
+                        continue
+
+                    with open(f"{root}/{file}", "r") as hmm_file:
+                        lines = hmm_file.readlines()
+
+                    # Find "NAME" line; profiles without one are skipped
+                    for j, line in enumerate(lines):
+                        if not line.startswith("NAME "):
+                            continue
+
+                        modified_line = re.sub(
+                            r"\.faa\.final_tree(\.fa)?", "", line
+                        )
+
+                        # write modified content to hmms_merged
+                        lines[j] = modified_line
+                        hmms.writelines(lines)
+
+                        # get name (keeps its newline) and write to idmap
+                        hmm_index += 1
+                        hmm_name = modified_line.replace("NAME  ", "", 1)
+                        idmap.write(f"{hmm_index} {hmm_name}")
+                        break
+
+    # prepare an HMM database for faster hmmscan searches
+    print(colorify("Preparing HMM database..."))
+    run_command(cmd=["hmmpress", hmms_merged_p])
+    shutil.move(hmms_merged_p, f"{str(hmm_db_obj)}/{taxon_id}.hmm")
+
+    return idmap_obj, hmm_db_obj, pressed_hmm_db_obj
+
+
+def _download_fastas_into_hmmer_db(taxon_id: int):
+    """Download a taxon's raw alignments and save them as ungapped FASTA.
+
+    Downloads ``{taxon_id}_raw_algs.tar`` from eggnog5.embl.de, extracts
+    it and, for every ``*.gz`` file found, writes a ``.fa`` file with the
+    alignment gap characters (``-``) removed from the sequence lines.
+
+    Parameters
+    ----------
+    taxon_id : int
+        Taxon ID used to build the download URL and locate the files.
+
+    Returns
+    -------
+    ProteinsDirectoryFormat
+        Directory format containing the processed ``.fa`` files.
+
+    Raises
+    ------
+    Exception
+        Raised by ``_try_wget`` when the download fails.
+    """
+    fastas_obj = ProteinsDirectoryFormat()
+    with tempfile.TemporaryDirectory() as tmp:
+        _try_wget(
+            f"{tmp}/{taxon_id}_raw_algs.tar",
+            "http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/"
+            f"{taxon_id}/{taxon_id}_raw_algs.tar",
+            "Error downloading FASTA files"
+        )
+
+        # Extracting
+        print(colorify("Decompressing..."))
+        run_command(
+            cmd=["tar", "xf", f"{taxon_id}_raw_algs.tar"],
+            cwd=tmp
+        )
+
+        files = [
+            f"{tmp}/{taxon_id}/{f}"
+            for f in os.listdir(f"{tmp}/{taxon_id}")
+            if f.endswith(".gz")
+        ]
+
+        # Extract, remove '-' and save to hmmer_db location
+        print(colorify("Processing FASTA files (this can take a while)... "))
+        for fpi in tqdm(files):
+            new_name = os.path.basename(fpi).replace(".raw_alg.faa.gz", ".fa")
+            fpo = os.path.join(str(fastas_obj), new_name)
+            with gzip.open(fpi, "rt") as f_in, open(fpo, "w") as f_out:
+                # Strip gaps from sequence lines only. Header lines are
+                # copied verbatim so sequence IDs containing '-' are not
+                # altered. Streaming avoids loading a whole alignment.
+                f_out.writelines(
+                    line if line.startswith(">") else line.replace("-", "")
+                    for line in f_in
+                )
+
+    return fastas_obj
+
+
+def _try_wget(output_file: str, url: str, exception_msg: str):
+    """Download ``url`` into ``output_file`` using ``wget``.
+
+    Parameters
+    ----------
+    output_file : str
+        Path passed to ``wget -O``.
+    url : str
+        Address of the file to download.
+    exception_msg : str
+        Prefix of the error message raised when the command fails.
+
+    Raises
+    ------
+    Exception
+        If ``run_command`` raises ``subprocess.CalledProcessError``; the
+        message is ``exception_msg`` followed by the return code.
+    """
+    try:
+        run_command(cmd=["wget", "-O", output_file, url])
+    except subprocess.CalledProcessError as e:
+        # Chain the original error so the failing command stays visible
+        # in the traceback.
+        raise Exception(
+            f"{exception_msg}: {e.returncode}"
+        ) from e
diff --git a/q2_moshpit/eggnog/tests/data/hmmer/fastas/1/1FKBE.raw_alg.faa.gz b/q2_moshpit/eggnog/tests/data/hmmer/fastas/1/1FKBE.raw_alg.faa.gz
diff --git a/q2_moshpit/eggnog/tests/data/hmmer/fastas/1/1FKBY.raw_alg.faa.gz b/q2_moshpit/eggnog/tests/data/hmmer/fastas/1/1FKBY.raw_alg.faa.gz
diff --git a/q2_moshpit/eggnog/tests/data/hmmer/fastas/1/1FKDQ.raw_alg.faa.gz b/q2_moshpit/eggnog/tests/data/hmmer/fastas/1/1FKDQ.raw_alg.faa.gz