
ENH: add action to fetch Kaiju DBs #61

Merged (8 commits) on Nov 21, 2023
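This PR adds a fetch_kaiju_db action to q2-moshpit: it scrapes the list of prebuilt indexes from the Kaiju server (https://kaiju.binf.ku.dk/server), downloads the newest archive of the requested type with a progress bar, and unpacks it into a KaijuDBDirectoryFormat. A minimal usage sketch based on the code in this diff (the QIIME 2 action registration itself is not part of the diff, so only the direct Python-level call is shown):

    from q2_moshpit.kaiju.database import fetch_kaiju_db

    # Downloads and extracts the newest "nr_euk" index from the Kaiju
    # server; returns a KaijuDBDirectoryFormat pointing at the files.
    db = fetch_kaiju_db(database_type="nr_euk")
    print(db.path)  # directory containing the extracted database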
11 changes: 6 additions & 5 deletions ci/recipe/meta.yaml
@@ -17,19 +17,20 @@ requirements:
    - setuptools

  run:
    - altair
    - beautifulsoup4
    - bracken
    - busco >=5.0.0
    - diamond
    - eggnog-mapper >=2.1.10
    - kraken2
    - metabat2
    - samtools
    - qiime2 {{ qiime2_epoch }}.*
    - q2-types-genomics {{ qiime2_epoch }}.*
    - q2templates {{ qiime2_epoch }}.*
    - tqdm
    - xmltodict

test:
  requires:
9 changes: 6 additions & 3 deletions q2_moshpit/__init__.py
@@ -7,7 +7,10 @@
# ----------------------------------------------------------------------------

from .dereplication import dereplicate_mags
from .kaiju import classification as kaiju_class, database as kaiju_db
from .kraken2 import (
    classification as kraken_class, database as kraken_db, bracken
)
from .metabat2 import metabat2
from . import eggnog
from . import busco
@@ -18,6 +21,6 @@
del get_versions

__all__ = [
    'metabat2', 'bracken', 'kraken_class', 'kraken_db',
    'kaiju_class', 'kaiju_db', 'dereplicate_mags', 'eggnog', 'busco'
]
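The kraken2 imports are aliased here because both the kraken2 and kaiju subpackages expose modules named classification and database; the aliases keep the two usable side by side from the package namespace. A quick illustration of the resulting API surface:

    # Both database modules are now importable without name clashes:
    from q2_moshpit import kaiju_db, kraken_db

    kaiju_db.fetch_kaiju_db  # the new action added in this PR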
16 changes: 15 additions & 1 deletion q2_moshpit/citations.bib
@@ -51,6 +51,21 @@ @article{kang2019
keywords = {Clustering,Metagenome binning,Metagenomics}
}

@article{menzel2016,
  title = {Fast and Sensitive Taxonomic Classification for Metagenomics with {{Kaiju}}},
  author = {Menzel, Peter and Ng, Kim Lee and Krogh, Anders},
  year = {2016},
  month = apr,
  journal = {Nature Communications},
  volume = {7},
  number = {1},
  pages = {11257},
  publisher = {{Nature Publishing Group}},
  issn = {2041-1723},
  doi = {10.1038/ncomms11257},
  keywords = {Classification and taxonomy,Metagenomics}
}

@article{manni_busco_2021,
  title = {{BUSCO} {Update}: {Novel} and {Streamlined} {Workflows} along with {Broader} and {Deeper} {Phylogenetic} {Coverage} for {Scoring} of {Eukaryotic}, {Prokaryotic}, and {Viral} {Genomes}},
  volume = {38},
@@ -68,5 +83,4 @@ @article{manni_busco_2021
  month = sep,
  year = {2021},
  pages = {4647--4654},
}
12 changes: 12 additions & 0 deletions q2_moshpit/kaiju/__init__.py
@@ -0,0 +1,12 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2022-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

from .database import fetch_kaiju_db
# from .classification import classify_kaiju

__all__ = ["fetch_kaiju_db", ]
7 changes: 7 additions & 0 deletions q2_moshpit/kaiju/classification.py
@@ -0,0 +1,7 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2022-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
110 changes: 110 additions & 0 deletions q2_moshpit/kaiju/database.py
@@ -0,0 +1,110 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2022-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import os
import tarfile
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

from q2_types_genomics.kaiju import KaijuDBDirectoryFormat

CHUNK_SIZE = 8192
KAIJU_SERVER_URL = "https://kaiju.binf.ku.dk/server"
ERR_MSG = (
    "Unable to connect to the Kaiju server. Please try again later. "
    "The error was: {}"
)


def _fetch_and_extract_db(db_uri: str, db_dir: str):
    latest_db = os.path.basename(db_uri)
    db_path = os.path.join(db_dir, latest_db)
    try:
        response = requests.get(db_uri, stream=True)
        response.raise_for_status()
        total_size = int(response.headers.get("content-length", 0))
        if total_size > 0:
            progress_bar = tqdm(
                desc=f'Downloading the "{latest_db}" database',
                total=total_size,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
            )

        with open(db_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                if chunk:
                    file.write(chunk)
                if total_size > 0:
                    progress_bar.update(len(chunk))
        if total_size > 0:
            progress_bar.close()
    except requests.exceptions.ConnectionError as e:
        raise Exception(ERR_MSG.format(e))

    msg = "Download finished. Extracting database files..."
    print(f"{msg}", end="", flush=True)
    with tarfile.open(db_path, "r:gz") as tar:
        tar.extractall(path=db_dir)
    print(f"\r{msg} Done.", flush=True)

    os.remove(db_path)


def _find_latest_db_url(database_type, sidebox_element, url):
    # Extract the databases and their release dates
    df = _find_all_dbs(sidebox_element)

    # Keep only entries matching the requested database type
    filtered_df = df[df.index.str.contains(database_type)]

    # Pick the most recent entry and resolve its download link
    latest_database = filtered_df["Date"].idxmax()
    download_link = sidebox_element.find("a", string=latest_database)["href"]
    download_link = urljoin(url, download_link)

    return download_link


def _find_all_dbs(sidebox_element):
    databases, dates = [], []
    for link in sidebox_element.find_all("a"):
        database = link.get_text()
        date = database.split()[-2]  # second-to-last token is the date
        databases.append(database)
        dates.append(date)
    df = pd.DataFrame({"Database": databases, "Date": dates})
    df.set_index("Database", inplace=True)
    df.loc[:, "Date"] = pd.to_datetime(df.loc[:, "Date"])
    return df


def fetch_kaiju_db(
        database_type: str,
) -> KaijuDBDirectoryFormat:
    try:
        response = requests.get(KAIJU_SERVER_URL)
    except requests.exceptions.RequestException as e:
        raise Exception(ERR_MSG.format(e))
    soup = BeautifulSoup(response.content, "html.parser")
    sidebox_db = soup.find("div", id="sidebox_db")

    download_link = _find_latest_db_url(
        database_type, sidebox_db, KAIJU_SERVER_URL
    )

    db = KaijuDBDirectoryFormat()
    _fetch_and_extract_db(download_link, str(db.path))

    return db
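For reference, the scraper assumes each database is listed as an anchor whose text ends with a date followed by an archive size, e.g. "nr_euk 2022-01-11 (60GB)" (the same fixture format the tests below reproduce). A small sketch of the date extraction that _find_all_dbs relies on:

    import pandas as pd

    # Example entry text, mirroring the test fixtures below.
    entry = "nr_euk 2022-01-11 (60GB)"
    date = entry.split()[-2]       # second-to-last token: "2022-01-11"
    print(pd.to_datetime(date))    # 2022-01-11 00:00:00; idxmax() over
                                   # these dates selects the newest DB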
7 changes: 7 additions & 0 deletions q2_moshpit/kaiju/tests/__init__.py
@@ -0,0 +1,7 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2022-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
151 changes: 151 additions & 0 deletions q2_moshpit/kaiju/tests/test_database.py
@@ -0,0 +1,151 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2022-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import os
import tempfile
import unittest
from unittest.mock import patch, Mock

import pandas as pd
from bs4 import BeautifulSoup
from qiime2.plugin.testing import TestPluginBase

from q2_moshpit.kaiju.database import (
    _fetch_and_extract_db, _find_latest_db_url, _find_all_dbs,
    fetch_kaiju_db, CHUNK_SIZE, ERR_MSG, KAIJU_SERVER_URL
)
from requests.exceptions import ConnectionError, RequestException

from q2_types_genomics.kaiju import KaijuDBDirectoryFormat


class TestDatabaseFunctions(TestPluginBase):
    package = 'q2_moshpit.kaiju.tests'

    @patch("requests.get")
    @patch("q2_moshpit.kaiju.database.tqdm")
    @patch("tarfile.open")
    @patch("os.remove")
    def test_fetch_and_extract_db(
            self, mock_remove, mock_tarfile_open,
            mock_progress, mock_requests
    ):
        response = mock_requests.return_value
        response.headers = {"content-length": 1024}
        response.iter_content.return_value = [b"test"] * 1024
        mock_tar = Mock()
        mock_tarfile_open.return_value.__enter__.return_value = mock_tar

        with tempfile.TemporaryDirectory() as tmpdir:
            _fetch_and_extract_db("http://a/b/db.tar.gz", tmpdir)
            db_path = os.path.join(tmpdir, "db.tar.gz")

            mock_progress.assert_called_with(
                desc='Downloading the "db.tar.gz" database',
                total=1024,
                unit="B",
                unit_scale=True,
                unit_divisor=1024
            )
            response.iter_content.assert_called_with(chunk_size=CHUNK_SIZE)
            mock_tarfile_open.assert_called_with(db_path, "r:gz")
            mock_tar.extractall.assert_called_with(path=tmpdir)
            mock_remove.assert_called_with(db_path)
            mock_requests.assert_called_with(
                "http://a/b/db.tar.gz", stream=True
            )

@patch("requests.get", side_effect=ConnectionError("some error"))
def test_fetch_and_extract_db_exception(
self, mock_requests
):
exp_error = ERR_MSG.format("some error")
with self.assertRaisesRegex(Exception, exp_error):
with tempfile.TemporaryDirectory() as tmpdir:
_fetch_and_extract_db("http://a/b/db.tar.gz", tmpdir)

mock_requests.assert_called_with(

Check warning on line 71 in q2_moshpit/kaiju/tests/test_database.py

View check run for this annotation

Codecov / codecov/patch

q2_moshpit/kaiju/tests/test_database.py#L71

Added line #L71 was not covered by tests
"http://a/b/db.tar.gz", stream=True
)

    def test_find_latest_db_url(self):
        databases = [
            ('nr_euk 2021-02-24 (61GB)',
             'https://hello.com/nr_euk_2021-02-24.tar.gz'),
            ('nr 2021-02-26 (52GB)',
             'https://hello.com/nr_2021-02-26.tar.gz'),
            ('nr_euk 2022-01-11 (60GB)',
             'https://hello.com/nr_euk_2022-01-11.tar.gz')
        ]
        sidebox_element = BeautifulSoup(
            '<html><body>{}</body></html>'.format(
                ''.join('<a href={}>{}</a>'.format(d[1], d[0])
                        for d in databases)
            ), 'html.parser')
        url = _find_latest_db_url(
            database_type='nr_euk',
            sidebox_element=sidebox_element,
            url='https://test.com'
        )
        self.assertEqual(url, 'https://hello.com/nr_euk_2022-01-11.tar.gz')

    def test_find_all_dbs(self):
        databases = ['nr_euk 2021-02-24 (61GB)', 'nr 2021-02-26 (52GB)']
        sidebox_element = BeautifulSoup(
            '<html><body>{}</body></html>'.format(
                ''.join('<a>{}</a>'.format(d) for d in databases)
            ), 'html.parser')
        df = _find_all_dbs(sidebox_element)
        self.assertIsInstance(df, pd.DataFrame)
        self.assertListEqual(
            df.index.tolist(),
            ['nr_euk 2021-02-24 (61GB)', 'nr 2021-02-26 (52GB)']
        )
        self.assertListEqual(
            df['Date'].tolist(),
            [pd.to_datetime('2021-02-24'), pd.to_datetime('2021-02-26')]
        )

@patch("requests.get")
@patch("q2_moshpit.kaiju.database._fetch_and_extract_db")
def test_fetch_kaiju_db(self, mock_fetch, mock_requests):
databases = [
('nr_euk 2021-02-24 (61GB)',
'https://hello.com/nr_euk_2021-02-24.tar.gz'),
('nr 2021-02-26 (52GB)',
'https://hello.com/nr_2021-02-26.tar.gz'),
('nr_euk 2022-01-11 (60GB)',
'https://hello.com/nr_euk_2022-01-11.tar.gz')
]
mock_requests.return_value = Mock(
content='<html><body><div id="sidebox_db">{}</div></body></html>'
.format(
''.join('<a href={}>{}</a>'.format(d[1], d[0])
for d in databases)
)
)

obs_db = fetch_kaiju_db('nr_euk')
self.assertIsInstance(obs_db, KaijuDBDirectoryFormat)
mock_requests.assert_called_with(KAIJU_SERVER_URL)
mock_fetch.assert_called_with(
'https://hello.com/nr_euk_2022-01-11.tar.gz',
str(obs_db.path)
)

@patch("requests.get", side_effect=RequestException("some error"))
def test_fetch_kaiju_db_exception(self, mock_requests):
with self.assertRaisesRegex(
Exception, ERR_MSG.format("some error")
):
fetch_kaiju_db('nr_euk')

mock_requests.assert_called_with(KAIJU_SERVER_URL)


if __name__ == "__main__":
    unittest.main()
