FIX: update how fetch-kaiju-db action discovers DB URLs (#201)
misialq authored Sep 2, 2024
1 parent 1bb3e46 commit 2562a9a
Showing 3 changed files with 96 additions and 113 deletions.
85 changes: 30 additions & 55 deletions q2_moshpit/kaiju/database.py
@@ -7,19 +7,16 @@
# ----------------------------------------------------------------------------
import os
import tarfile
-from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

from q2_types.kaiju import KaijuDBDirectoryFormat


-from bs4 import BeautifulSoup
-import requests
-import pandas as pd

CHUNK_SIZE = 8192
-KAIJU_SERVER_URL = "https://kaiju.binf.ku.dk/server"
+KAIJU_SERVER_URL = ("https://bioinformatics-centre.github.io/"
+                    "kaiju/downloads.html")
ERR_MSG = (
    "Unable to connect to the Kaiju server. Please try again later. "
    "The error was: {}"
@@ -32,8 +29,7 @@ def _fetch_and_extract_db(db_uri: str, db_dir: str):
    Args:
        db_uri (str): The URI of the database to fetch.
-        db_dir (str): The directory where the database will be saved.
        db_dir (str): Path to the final DB directory.
    """
    latest_db = os.path.basename(db_uri)
    db_path = os.path.join(db_dir, latest_db)
@@ -68,56 +64,39 @@ def _fetch_and_extract_db(db_uri: str, db_dir: str):
    os.remove(db_path)


-def _find_latest_db_url(database_type, sidebox_element, url):
+def _find_latest_db_url(response: bytes, database_type: str) -> str:
    """
    Finds the latest database URL based on the database type.
    Args:
+        response (bytes): HTML response containing the table with DB URLs.
        database_type (str): The target database type to filter.
-        sidebox_element (object): The element containing the databases.
-        url (str): The base URL.
    Returns:
        str: The latest database URL.
    """
-    # Extract the databases and dates
-    df = _find_all_dbs(sidebox_element)
-
-    # Filter databases based on target_database type
-    filtered_df = df[df.index.str.contains(database_type)]
-
-    # Find the latest database
-    latest_database = filtered_df["Date"].idxmax()
-    # latest_database = filtered_df.loc[latest_index, "Database"]
-    download_link = sidebox_element.find("a", string=latest_database)["href"]
-    download_link = urljoin(url, download_link)
-
-    return download_link
-
-
-def _find_all_dbs(sidebox_element):
-    """
-    Args:
-        sidebox_element: A BeautifulSoup element containing the sidebox
-            element on the page.
-    Returns:
-        df: A pandas DataFrame with columns "Database" and "Date".
-            The "Database" column contains the names of the databases
-            found in the sidebox_element, while the "Date" column contains
-            the corresponding dates.
-    """
-    databases, dates = [], []
-    for link in sidebox_element.find_all("a"):
-        database = link.get_text()
-        date = database.split()[-2]  # Last element is the date
-        databases.append(database)
-        dates.append(date)
-    df = pd.DataFrame({"Database": databases, "Date": dates})
-    df.set_index("Database", inplace=True)
-    df.loc[:, "Date"] = pd.to_datetime(df.loc[:, "Date"])
-    return df
+    soup = BeautifulSoup(response, 'html.parser')
+    tables = soup.find_all('table')
+
+    for table in tables:
+        # Locate the table header
+        headers = table.find_all('th')
+        if headers and headers[0].get_text().strip() == "Database":
+            rows = table.find_all('tr')
+            for row in rows:
+                cells = row.find_all('td')
+
+                # Check if the first cell contains the required database_type
+                if cells and cells[0].get_text().strip() == database_type:
+                    # The next row contains the desired URLs
+                    next_row = row.find_next_sibling('tr')
+                    if next_row:
+                        url_cell = next_row.find_all('td')[-1]
+                        url = url_cell.find('a')
+                        if url:
+                            return url['href']
+
+    raise ValueError(f"URL for database type '{database_type}' not found.")


def fetch_kaiju_db(
@@ -128,12 +107,8 @@ def fetch_kaiju_db(
        response = requests.get(KAIJU_SERVER_URL)
    except requests.exceptions.RequestException as e:
        raise Exception(ERR_MSG.format(e))
-    soup = BeautifulSoup(response.content, "html.parser")
-    sidebox_db = soup.find("div", id="sidebox_db")

-    download_link = _find_latest_db_url(
-        database_type, sidebox_db, KAIJU_SERVER_URL
-    )
+    download_link = _find_latest_db_url(response.content, database_type)

    db = KaijuDBDirectoryFormat()
    _fetch_and_extract_db(download_link, str(db.path))
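For reference, the new discovery flow can be exercised outside the plugin. The sketch below mirrors the _find_latest_db_url logic committed above; the find_db_url wrapper and the __main__ harness are illustrative only and not part of this commit (assumes requests and beautifulsoup4 are installed).

# Illustrative sketch -- mirrors the committed parser; the find_db_url
# name and this standalone harness are assumptions, not part of the diff.
import requests
from bs4 import BeautifulSoup

DOWNLOADS_URL = ("https://bioinformatics-centre.github.io/"
                 "kaiju/downloads.html")


def find_db_url(html: bytes, database_type: str) -> str:
    soup = BeautifulSoup(html, "html.parser")
    for table in soup.find_all("table"):
        # Only the downloads table starts with a "Database" header
        headers = table.find_all("th")
        if headers and headers[0].get_text().strip() == "Database":
            for row in table.find_all("tr"):
                cells = row.find_all("td")
                # A database name row is followed by a sibling row whose
                # last cell carries the .tgz download link
                if cells and cells[0].get_text().strip() == database_type:
                    next_row = row.find_next_sibling("tr")
                    if next_row:
                        link = next_row.find_all("td")[-1].find("a")
                        if link:
                            return link["href"]
    raise ValueError(f"URL for database type '{database_type}' not found.")


if __name__ == "__main__":
    response = requests.get(DOWNLOADS_URL)
    print(find_db_url(response.content, "nr_euk"))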
116 changes: 61 additions & 55 deletions q2_moshpit/kaiju/tests/test_database.py
@@ -10,12 +10,10 @@
import unittest
from unittest.mock import patch, Mock

-import pandas as pd
-from bs4 import BeautifulSoup
from qiime2.plugin.testing import TestPluginBase

from q2_moshpit.kaiju.database import (
-    _fetch_and_extract_db, _find_latest_db_url, _find_all_dbs,
+    _fetch_and_extract_db, _find_latest_db_url,
    fetch_kaiju_db, CHUNK_SIZE, ERR_MSG, KAIJU_SERVER_URL
)
from requests.exceptions import ConnectionError, RequestException
@@ -26,6 +24,50 @@
class TestDatabaseFunctions(TestPluginBase):
    package = 'q2_moshpit.kaiju.tests'

+    def setUp(self):
+        super().setUp()
+        self.html_content = b''' # noqa: E501
+        <html><body>
+        <table>
+          <thead>
+            <tr>
+              <th>Database</th>
+              <th>Date</th>
+              <th style="text-align: right">Archive size (GB)</th>
+              <th style="text-align: right">RAM needed (GB)</th>
+              <th>HTTPS URL</th>
+            </tr>
+          </thead>
+          <tbody>
+            <tr><td><strong>nr</strong></td><td colspan="4">Subset of NCBI BLAST <a href="https://ftp.ncbi.nlm.nih.gov/blast/db/v5/v5/FASTA/">nr</a> database containing Archaea, bacteria and viruses.</td></tr>
+            <tr>
+              <td></td>
+              <td>2023-05-10</td>
+              <td style="text-align: right">67</td>
+              <td style="text-align: right">177</td>
+              <td><a href="https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/kaiju_db_nr_2023-05-10.tgz">.tgz</a></td>
+            </tr>
+            <tr><td><strong>nr_euk</strong></td><td colspan="4">Like nr, but additionally including fungi and microbial eukaryotes, see <a href="https://github.com/bioinformatics-centre/kaiju/blob/master/util/kaiju-taxonlistEuk.tsv">taxon list</a></td></tr>
+            <tr>
+              <td></td>
+              <td>2023-05-10</td>
+              <td style="text-align: right">82</td>
+              <td style="text-align: right">204</td>
+              <td><a href="https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/kaiju_db_nr_euk_2023-05-10.tgz">.tgz</a></td>
+            </tr>
+            <tr><td><strong>refseq</strong></td><td colspan="4">Protein sequences from genome assemblies of Archaea and bacteria with assembly level "Complete Genome", as well as viral protein sequences from NCBI RefSeq.</td></tr>
+            <tr>
+              <td></td>
+              <td>2023-05-10</td>
+              <td style="text-align: right">30</td>
+              <td style="text-align: right">87</td>
+              <td><a href="https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/kaiju_db_refseq_2023-05-23.tgz">.tgz</a></td>
+            </tr>
+          </tbody>
+        </table>
+        </body></html>
+        '''

@patch("requests.get")
@patch("q2_moshpit.kaiju.database.tqdm")
@patch("tarfile.open")
@@ -72,68 +114,32 @@ def test_fetch_and_extract_db_exception(
            "http://a/b/db.tar.gz", stream=True
        )

-    def test_find_latest_db_url(self):
-        databases = [
-            ('nr_euk 2021-02-24 (61GB)',
-             'https://hello.com/nr_euk_2021-02-24.tar.gz'),
-            ('nr 2021-02-26 (52GB)',
-             'https://hello.com/nr_2021-02-26.tar.gz'),
-            ('nr_euk 2022-01-11 (60GB)',
-             'https://hello.com/nr_euk_2022-01-11.tar.gz')
-        ]
-        sidebox_element = BeautifulSoup(
-            '<html><body>{}</body></html>'.format(
-                ''.join('<a href={}>{}</a>'.format(d[1], d[0])
-                        for d in databases)
-            ), 'html.parser')
-        url = _find_latest_db_url(
-            database_type='nr_euk',
-            sidebox_element=sidebox_element,
-            url='https://test.com'
-        )
-        self.assertEqual(url, 'https://hello.com/nr_euk_2022-01-11.tar.gz')
-
-    def test_find_all_dbs(self):
-        databases = ['nr_euk 2021-02-24 (61GB)', 'nr 2021-02-26 (52GB)']
-        sidebox_element = BeautifulSoup(
-            '<html><body>{}</body></html>'.format(
-                ''.join('<a>{}</a>'.format(d) for d in databases)
-            ), 'html.parser')
-        df = _find_all_dbs(sidebox_element)
-        self.assertIsInstance(df, pd.DataFrame)
-        self.assertListEqual(
-            df.index.tolist(),
-            ['nr_euk 2021-02-24 (61GB)', 'nr 2021-02-26 (52GB)']
+    def test_find_latest_db_url_ok(self):
+        url = _find_latest_db_url(self.html_content, 'nr')
+        self.assertEqual(
+            url,
+            'https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/'
+            'kaiju_db_nr_2023-05-10.tgz'
        )
-        self.assertListEqual(
-            df['Date'].tolist(),
-            [pd.to_datetime('2021-02-24'), pd.to_datetime('2021-02-26')]
-        )

+    def test_find_latest_db_url_not_found(self):
+        with self.assertRaises(ValueError) as context:
+            _find_latest_db_url(self.html_content, 'non_existing_db')
+        self.assertIn(
+            "URL for database type 'non_existing_db' not found.",
+            str(context.exception)
+        )

@patch("requests.get")
@patch("q2_moshpit.kaiju.database._fetch_and_extract_db")
def test_fetch_kaiju_db(self, mock_fetch, mock_requests):
databases = [
('nr_euk 2021-02-24 (61GB)',
'https://hello.com/nr_euk_2021-02-24.tar.gz'),
('nr 2021-02-26 (52GB)',
'https://hello.com/nr_2021-02-26.tar.gz'),
('nr_euk 2022-01-11 (60GB)',
'https://hello.com/nr_euk_2022-01-11.tar.gz')
]
mock_requests.return_value = Mock(
content='<html><body><div id="sidebox_db">{}</div></body></html>'
.format(
''.join('<a href={}>{}</a>'.format(d[1], d[0])
for d in databases)
)
)

mock_requests.return_value = Mock(content=self.html_content)
obs_db = fetch_kaiju_db('nr_euk')
self.assertIsInstance(obs_db, KaijuDBDirectoryFormat)
mock_requests.assert_called_with(KAIJU_SERVER_URL)
mock_fetch.assert_called_with(
'https://hello.com/nr_euk_2022-01-11.tar.gz',
'https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/'
'kaiju_db_nr_euk_2023-05-10.tgz',
str(obs_db.path)
)

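The updated tests hinge on one layout assumption: each database name row is followed by a sibling row whose last cell holds the archive link. A minimal sketch of that contract (the example table and URL below are made up; assumes q2_moshpit and beautifulsoup4 are importable):

from q2_moshpit.kaiju.database import _find_latest_db_url

# Smallest table the parser accepts: a "Database" header, a name row,
# and a sibling row whose last cell contains the link. Hypothetical data.
html = (
    b"<table>"
    b"<tr><th>Database</th><th>HTTPS URL</th></tr>"
    b"<tr><td><strong>nr</strong></td><td>description</td></tr>"
    b"<tr><td></td><td><a href='https://example.com/nr.tgz'>.tgz</a></td></tr>"
    b"</table>"
)

assert _find_latest_db_url(html, "nr") == "https://example.com/nr.tgz"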
8 changes: 5 additions & 3 deletions q2_moshpit/plugin_setup.py
@@ -1345,6 +1345,8 @@
                "nr",
                "nr_euk",
                "refseq",
+                "refseq_ref",
+                "refseq_nr",
                "fungi",
                "viruses",
                "plasmids",
@@ -1359,13 +1361,13 @@
    input_descriptions={},
    parameter_descriptions={
        "database_type": "Type of database to be downloaded. For more "
-        "information on available types please see the list on "
-        "Kaiju's web server: https://kaiju.binf.ku.dk/server",
+        "information on available types please see the list on Kaiju's web "
+        "server: https://bioinformatics-centre.github.io/kaiju/downloads.html",
    },
    output_descriptions={"database": "Kaiju database."},
    name="Fetch Kaiju database.",
    description="This method fetches the latest Kaiju database from "
-    "https://kaiju.binf.ku.dk/server.",
+    "Kaiju's web server.",
    citations=[citations["menzel2016"]],
)

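With the plugin installed, the action registered above can be driven from the QIIME 2 Artifact API roughly as follows. This is a usage sketch only: the qiime2.plugins.moshpit module path follows QIIME 2's standard plugin naming and is an assumption, not something shown in this diff.

# Usage sketch, not part of the commit. Assumes a QIIME 2 environment
# with q2-moshpit installed; the import path below is the conventional
# QIIME 2 plugin-API location and is an assumption.
from qiime2.plugins.moshpit.actions import fetch_kaiju_db

# database_type must be one of the registered choices, including the
# "refseq_ref" and "refseq_nr" options added in this commit.
results = fetch_kaiju_db(database_type="nr_euk")
results.database.save("kaiju-db.qza")  # "database" is the registered output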
