Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: add fetch-eggnog-hmmer-db action #173

Merged
merged 38 commits into from
Jul 3, 2024
Merged
Show file tree
Hide file tree
Changes from 35 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
2df4471
download and process hmm files
Sann5 May 17, 2024
71913c3
some work on the download implementation (putting stuff into functions)
Sann5 May 21, 2024
9931f50
implement the download and process fastas
Sann5 May 22, 2024
7e8dc72
some details
Sann5 May 22, 2024
16991f0
some adjustments to the regex and tqdm
Sann5 May 22, 2024
e583104
increase space removed for idmap file
Sann5 May 23, 2024
acec4f9
test validate and try_wget
Sann5 May 23, 2024
1e24c3b
test download_and_build_hmm_db
Sann5 May 23, 2024
c1b0cdd
test download fastas
Sann5 May 23, 2024
bfa2521
tests fetch_eggnog_hmmer_db
Sann5 May 23, 2024
f007208
move validate tests to test_dbs.py
Sann5 May 23, 2024
59755d0
put back tests where they belong
Sann5 May 23, 2024
b4bd855
final changes
Sann5 May 23, 2024
5e835cb
Merge branch 'main' into fetch_hmmer_db
Sann5 May 23, 2024
33158ae
add and simplify test data
Sann5 May 23, 2024
22830bd
Update q2_moshpit/eggnog/_dbs.py
Sann5 May 27, 2024
c7eec33
Update q2_moshpit/eggnog/_dbs.py
Sann5 May 27, 2024
366d6f6
increase taxon id range
Sann5 May 27, 2024
9290bf1
single quotes to double quotes
Sann5 May 27, 2024
07ca2b7
Merge branch 'main' into fetch_hmmer_db
Sann5 May 27, 2024
dfc8b1b
adopting new changes from q2_types
Sann5 Jun 11, 2024
3b87776
Merge branch 'main' into fetch_hmmer_db
Sann5 Jun 11, 2024
ccb6ad9
update action description
Sann5 Jun 12, 2024
2fbacc2
fix progress bar for the hmm merging
Sann5 Jun 14, 2024
4111f7d
add properties to fetch eggnogn hmmer outputs
Sann5 Jun 17, 2024
5ef4f18
ignore type check in fetch_eggnog_hmmer_db
Sann5 Jun 20, 2024
b90555f
set EggnogHmmerIdmapFileFmt max validation level to float('inf')
Sann5 Jun 20, 2024
4fb6154
use glob in _download_fastas_into_hmmer_db
Sann5 Jun 20, 2024
9ca2cda
process hmm files one by one (don't load all lines)
Sann5 Jun 20, 2024
2835c40
extract hmm merge logic into a testable function
Sann5 Jun 21, 2024
9fa499b
rename output to seed-alignments
Sann5 Jun 21, 2024
12800e6
change validation + try_wget directly in code
Sann5 Jun 21, 2024
f3d5196
Merge branch 'main' into fetch_hmmer_db
Sann5 Jun 21, 2024
91f1e59
remove print statements and put taxon_id logic inside parser
Sann5 Jun 25, 2024
1895151
seed-alignments to seed_alignments
Sann5 Jun 25, 2024
ad6df3b
ignore return values in test_fetch_eggnog_hmmer_db
Sann5 Jun 26, 2024
6eea056
validate uncompressed and parsed fastas in test_download_fastas_into_…
Sann5 Jun 26, 2024
1d81611
validate output in test_download_and_build_hmm_db
Sann5 Jun 26, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions q2_moshpit/citations.bib
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
@misc{noauthor_hmmer_nodate,
title = {{HMMER}},
url = {http://hmmer.org/},
urldate = {2024-05-22},
}

@InProceedings{ mckinney-proc-scipy-2010,
author = { Wes McKinney },
title = { Data Structures for Statistical Computing in Python },
Expand Down
12 changes: 10 additions & 2 deletions q2_moshpit/eggnog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,21 @@
)
from ._dbs import (
fetch_eggnog_db, fetch_diamond_db, build_custom_diamond_db,
fetch_eggnog_proteins, build_eggnog_diamond_db, fetch_ncbi_taxonomy
fetch_eggnog_proteins, build_eggnog_diamond_db, fetch_ncbi_taxonomy,
fetch_eggnog_hmmer_db
)
from q2_moshpit.eggnog._type import EggnogHmmerIdmap
from q2_moshpit.eggnog._format import (
EggnogHmmerIdmapDirectoryFmt, EggnogHmmerIdmapFileFmt
)


__all__ = [
'eggnog_diamond_search', '_eggnog_diamond_search', 'eggnog_annotate',
'_eggnog_feature_table', 'fetch_eggnog_db', 'fetch_diamond_db',
'build_custom_diamond_db', 'fetch_eggnog_proteins',
'build_eggnog_diamond_db', 'fetch_ncbi_taxonomy', '_eggnog_annotate'
'build_eggnog_diamond_db', 'fetch_ncbi_taxonomy', '_eggnog_annotate',
'fetch_eggnog_hmmer_db', 'EggnogHmmerIdmapDirFmt',
'EggnogHmmerIdmapFileFmt', 'EggnogHmmerIdmap',
'EggnogHmmerIdmapDirectoryFmt'
]
57 changes: 49 additions & 8 deletions q2_moshpit/eggnog/_dbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,21 @@
EggnogRefDirFmt, DiamondDatabaseDirFmt, NCBITaxonomyDirFmt,
EggnogProteinSequencesDirFmt
)
from .._utils import (
from q2_types.profile_hmms import (
ProteinMultipleProfileHmmDirectoryFmt,
PressedProfileHmmsDirectoryFmt
)
from q2_types.genome_data import ProteinsDirectoryFormat
from q2_moshpit._utils import (
run_command, _process_common_input_params, colorify,
_calculate_md5_from_file
)
from ._utils import _parse_build_diamond_db_params
from q2_moshpit.eggnog._utils import (
_parse_build_diamond_db_params, _download_and_build_hmm_db,
_download_fastas_into_hmmer_db,
_validate_eggnog_hmmer_taxon_id
)
from q2_moshpit.eggnog._format import EggnogHmmerIdmapDirectoryFmt


def fetch_eggnog_db() -> EggnogRefDirFmt:
Expand Down Expand Up @@ -233,12 +243,14 @@ def _validate_taxon_id(eggnog_proteins, taxon):
)

# Check for overlap with provided taxon id
if not str(taxon) in tax_ids:
raise ValueError(
f"'{taxon}' is not valid taxon ID. "
"To view all valid taxon IDs inspect e5.taxid_info.tsv "
"file in the eggnog_proteins input."
)
if not str(taxon) in tax_ids:
raise ValueError(
f"'{taxon}' is not valid taxon ID. "
"To view all valid taxon IDs inspect e5.taxid_info.tsv. "
"You can download it with this command: "
"wget "
"http://eggnog5.embl.de/download/eggnog_5.0/e5.taxid_info.tsv"
)


def fetch_ncbi_taxonomy() -> NCBITaxonomyDirFmt:
Expand Down Expand Up @@ -321,3 +333,32 @@ def _collect_and_compare_md5(path_to_md5: str, path_to_file: str):

# If no exception is raised, remove md5 file
os.remove(path_to_md5)


def fetch_eggnog_hmmer_db(taxon_id: int) -> (
EggnogHmmerIdmapDirectoryFmt,
ProteinMultipleProfileHmmDirectoryFmt,
PressedProfileHmmsDirectoryFmt,
ProteinsDirectoryFormat
): # type: ignore
_validate_eggnog_hmmer_taxon_id(taxon_id)

# Download HMMER database
print(colorify(
"Valid taxon ID. \n"
"Proceeding with HMMER database download and build..."
))
idmap, hmmer_db, pressed_hmmer_db = _download_and_build_hmm_db(taxon_id)
print(colorify(
"HMM database built successfully. \n"
"Proceeding with FASTA files download and processing..."
))

# Download fasta sequences
fastas = _download_fastas_into_hmmer_db(taxon_id)
print(colorify(
"FASTA files processed successfully. \n"
"Moving data from temporary to final location..."
))

return idmap, hmmer_db, pressed_hmmer_db, fastas
44 changes: 44 additions & 0 deletions q2_moshpit/eggnog/_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import re
from qiime2.plugin import model
from qiime2.core.exceptions import ValidationError


class EggnogHmmerIdmapFileFmt(model.TextFileFormat):
def _validate_(self, level):
with open(str(self), 'r') as file:
# Set the number of rows to be parsed
max_lines = {"min": 100, "max": float('inf')}[level]
lines = file.readlines()
for i, line in enumerate(lines, 1):
# Check number of lines parsed so far
if i > max_lines:
break

# Validate line
if not re.match(r'^(\d+) ([A-Z0-9]+)$', line):
raise ValidationError(
f"Invalid line {i}.\n"
f"{line} \n"
"Expected index and an alphanumeric code separated "
"by a single space."
)

# Check index is equal to line number
idx, code = line.rstrip("\n").split(sep=" ")
if not idx == str(i):
raise ValidationError(
f"Invalid line {i}.\n"
f"{line} \n"
f"Expected index {i} but got {idx} instead.\n"
)


class EggnogHmmerIdmapDirectoryFmt(model.DirectoryFormat):
idmap = model.File(r'.*\.hmm\.idmap', format=EggnogHmmerIdmapFileFmt)
10 changes: 10 additions & 0 deletions q2_moshpit/eggnog/_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
from qiime2.plugin import SemanticType

EggnogHmmerIdmap = SemanticType('EggnogHmmerIdmap')
152 changes: 152 additions & 0 deletions q2_moshpit/eggnog/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,25 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import re
import os
import gzip
import shutil
import glob
import tempfile
import subprocess
import urllib
from html.parser import HTMLParser
from tqdm import tqdm
from qiime2.core.exceptions import ValidationError
from typing import List
from .._utils import run_command, colorify
from q2_types.profile_hmms import (
ProteinMultipleProfileHmmDirectoryFmt, PressedProfileHmmsDirectoryFmt
)
from q2_types.genome_data import ProteinsDirectoryFormat
from q2_moshpit.eggnog._format import EggnogHmmerIdmapDirectoryFmt
COMMON_URL = "http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level"


def _parse_build_diamond_db_params(arg_key, arg_val) -> List[str]:
Expand All @@ -26,3 +44,137 @@ def _parse_build_diamond_db_params(arg_key, arg_val) -> List[str]:
return [f"--{arg_key}"]
else:
return [f"--{arg_key}", str(arg_val)]


def _download_and_build_hmm_db(taxon_id):
pressed_hmm_db_obj = PressedProfileHmmsDirectoryFmt()
hmm_db_obj = ProteinMultipleProfileHmmDirectoryFmt()
idmap_obj = EggnogHmmerIdmapDirectoryFmt()

with tempfile.TemporaryDirectory() as tmp:
try:
run_command(cmd=[
"wget", "-O", f"{tmp}/{taxon_id}_hmms.tar.gz",
f"{COMMON_URL}/{taxon_id}/{taxon_id}_hmms.tar.gz"
])
except subprocess.CalledProcessError as e:
raise Exception(
"Error during HMMER database download. "
f"wget error code: {e.returncode}"
)

# Extracting
print(colorify("Decompressing..."))
run_command(
cmd=["tar", "zxf", f"{taxon_id}_hmms.tar.gz"],
cwd=tmp
)

# Merge hmm files + write .hmm.idmap
print(colorify("Merging hmm files..."))
hmms_merged_p = f"{str(pressed_hmm_db_obj)}/{taxon_id}.hmm"
idmap_p = f"{str(idmap_obj)}/{taxon_id}.hmm.idmap"
_merge_hmms_and_write_idmap(hmms_merged_p, idmap_p, taxon_id, tmp)

# prepare an HMM database for faster hmmscan searches
print(colorify("Preparing HMM database..."))
run_command(cmd=["hmmpress", hmms_merged_p])
shutil.move(hmms_merged_p, f"{str(hmm_db_obj)}/{taxon_id}.hmm")

return idmap_obj, hmm_db_obj, pressed_hmm_db_obj


def _download_fastas_into_hmmer_db(taxon_id: int):
fastas_obj = ProteinsDirectoryFormat()
with tempfile.TemporaryDirectory() as tmp:
try:
run_command(cmd=[
"wget", "-O", f"{tmp}/{taxon_id}_raw_algs.tar",
f"{COMMON_URL}/{taxon_id}/{taxon_id}_raw_algs.tar",
])
except subprocess.CalledProcessError as e:
raise Exception(
"Error downloading seed-alignments. "
f"wget error code: {e.returncode}"
)

# Extracting
print(colorify("Decompressing..."))
run_command(
cmd=["tar", "xf", f"{taxon_id}_raw_algs.tar"],
cwd=tmp
)

files = glob.glob(f"{tmp}/{taxon_id}/*.gz")

# Extract, remove '-' and save to hmmer_db location
print(colorify("Processing FASTA files (this can take a while)... "))
for fpi in tqdm(files):
new_name = os.path.basename(fpi).replace(".raw_alg.faa.gz", ".fa")
fpo = os.path.join(str(fastas_obj), new_name)
with gzip.open(fpi, "rt") as f_in, open(fpo, "w") as f_out:
content = f_in.read()
content = content.replace("-", "")
f_out.write(content)

return fastas_obj


def _merge_hmms_and_write_idmap(hmms_merged_p, idmap_p, taxon_id, tmp):
# Open output files
with open(hmms_merged_p, "a") as hmms, open(idmap_p, "a") as idmap:

# Iterate through all decompressed files
for root, dirnames, files in os.walk(f"{tmp}/{taxon_id}"):
for i, file in tqdm(enumerate(files, start=1), total=len(files)):
if file.endswith(".hmm"): # process hmm files
with open(f"{root}/{file}", "r") as hmm_file:
for line in hmm_file:
if line.startswith("NAME "):
line = re.sub(
r"\.faa\.final_tree(\.fa)?", "", line
)
id = line.replace("NAME ", "", 1)
idmap.write(f"{i} {id}")
hmms.write(line)


class _EggnogHTMLParser(HTMLParser):
def __init__(self):
super().__init__()
self.links = []

def handle_starttag(self, tag, attrs):
if tag == 'a':
for attr in attrs:
if attr[0] == 'href':
match = re.search(r'(\d+)/', attr[1])
if match:
self.links.append(match.group(1))
misialq marked this conversation as resolved.
Show resolved Hide resolved

def get_taxon_ids(self):
taxon_ids = [int(element) for element in self.links]
taxon_ids.remove(1)
return taxon_ids


def _validate_eggnog_hmmer_taxon_id(taxon_id):
try:
with urllib.request.urlopen(COMMON_URL) as response:
html_content = response.read().decode('utf-8')
except urllib.error.HTTPError as e:
print(f"HTTP Error: {e.code} {e.reason}")
except urllib.error.URLError as e:
print(f"URL Error: {e.reason}")
except Exception as e:
print(f"General Error: {e}")

# Parse the HTML content
parser = _EggnogHTMLParser()
parser.feed(html_content)

if taxon_id not in parser.get_taxon_ids():
raise ValidationError(
f"{taxon_id} is not a valid taxon ID. \n"
f"Check out {COMMON_URL} for the valid IDs."
)
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading
Loading