Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX: update how fetch-kaiju-db action discovers DB URLs #201

Merged
merged 2 commits into from
Sep 2, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 30 additions & 55 deletions q2_moshpit/kaiju/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,16 @@
# ----------------------------------------------------------------------------
import os
import tarfile
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

from q2_types.kaiju import KaijuDBDirectoryFormat


from bs4 import BeautifulSoup
import requests
import pandas as pd

CHUNK_SIZE = 8192
KAIJU_SERVER_URL = "https://kaiju.binf.ku.dk/server"
KAIJU_SERVER_URL = ("https://bioinformatics-centre.github.io/"
"kaiju/downloads.html")
ERR_MSG = (
"Unable to connect to the Kaiju server. Please try again later. "
"The error was: {}"
Expand All @@ -32,8 +29,7 @@ def _fetch_and_extract_db(db_uri: str, db_dir: str):

Args:
db_uri (str): The URI of the database to fetch.
db_dir (str): The directory where the database will be saved.

db_dir (str): Path to the final DB directory.
"""
latest_db = os.path.basename(db_uri)
db_path = os.path.join(db_dir, latest_db)
Expand Down Expand Up @@ -68,56 +64,39 @@ def _fetch_and_extract_db(db_uri: str, db_dir: str):
os.remove(db_path)


def _find_latest_db_url(database_type, sidebox_element, url):
def _find_latest_db_url(response: bytes, database_type: str) -> str:
"""
Finds the latest database URL based on the database type.

Args:
response (bytes): HTML response containing the table with DB URLs.
database_type (str): The target database type to filter.
sidebox_element (object): The element containing the databases.
url (str): The base URL.

Returns:
str: The latest database URL.
"""
# Extract the databases and dates
df = _find_all_dbs(sidebox_element)

# Filter databases based on target_database type
filtered_df = df[df.index.str.contains(database_type)]

# Find the latest database
latest_database = filtered_df["Date"].idxmax()
# latest_database = filtered_df.loc[latest_index, "Database"]
download_link = sidebox_element.find("a", string=latest_database)["href"]
download_link = urljoin(url, download_link)

return download_link


def _find_all_dbs(sidebox_element):
"""
Args:
sidebox_element: A BeautifulSoup element containing the sidebox
element on the page.

Returns:
df: A pandas DataFrame indexed by "Database" with a single "Date"
column. The index holds the database names (link texts) found in
the sidebox_element, while the "Date" column contains the
corresponding dates converted to datetimes.

"""
databases, dates = [], []
for link in sidebox_element.find_all("a"):
database = link.get_text()
date = database.split()[-2] # Second-to-last token is the date; the last one is the size
databases.append(database)
dates.append(date)
df = pd.DataFrame({"Database": databases, "Date": dates})
df.set_index("Database", inplace=True)
df.loc[:, "Date"] = pd.to_datetime(df.loc[:, "Date"])
return df
soup = BeautifulSoup(response, 'html.parser')
tables = soup.find_all('table')

for table in tables:
# Locate the table header
headers = table.find_all('th')
if headers and headers[0].get_text().strip() == "Database":
rows = table.find_all('tr')
for row in rows:
cells = row.find_all('td')

# Check if the first cell contains the required database_type
if cells and cells[0].get_text().strip() == database_type:
# The next row contains the desired URLs
next_row = row.find_next_sibling('tr')
if next_row:
url_cell = next_row.find_all('td')[-1]
url = url_cell.find('a')
if url:
return url['href']

raise ValueError(f"URL for database type '{database_type}' not found.")


def fetch_kaiju_db(
Expand All @@ -128,12 +107,8 @@ def fetch_kaiju_db(
response = requests.get(KAIJU_SERVER_URL)
except requests.exceptions.RequestException as e:
raise Exception(ERR_MSG.format(e))
soup = BeautifulSoup(response.content, "html.parser")
sidebox_db = soup.find("div", id="sidebox_db")

download_link = _find_latest_db_url(
database_type, sidebox_db, KAIJU_SERVER_URL
)
download_link = _find_latest_db_url(response.content, database_type)

db = KaijuDBDirectoryFormat()
_fetch_and_extract_db(download_link, str(db.path))
Expand Down
116 changes: 61 additions & 55 deletions q2_moshpit/kaiju/tests/test_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,10 @@
import unittest
from unittest.mock import patch, Mock

import pandas as pd
from bs4 import BeautifulSoup
from qiime2.plugin.testing import TestPluginBase

from q2_moshpit.kaiju.database import (
_fetch_and_extract_db, _find_latest_db_url, _find_all_dbs,
_fetch_and_extract_db, _find_latest_db_url,
fetch_kaiju_db, CHUNK_SIZE, ERR_MSG, KAIJU_SERVER_URL
)
from requests.exceptions import ConnectionError, RequestException
Expand All @@ -26,6 +24,50 @@
class TestDatabaseFunctions(TestPluginBase):
package = 'q2_moshpit.kaiju.tests'

def setUp(self):
super().setUp()
self.html_content = b''' # noqa: E501
<html><body>
<table>
<thead>
<tr>
<th>Database</th>
<th>Date</th>
<th style="text-align: right">Archive size (GB)</th>
<th style="text-align: right">RAM needed (GB)</th>
<th>HTTPS URL</th>
</tr>
</thead>
<tbody>
<tr><td><strong>nr</strong></td><td colspan="4">Subset of NCBI BLAST <a href="https://ftp.ncbi.nlm.nih.gov/blast/db/v5/v5/FASTA/">nr</a> database containing Archaea, bacteria and viruses.</td></tr>
<tr>
<td></td>
<td>2023-05-10</td>
<td style="text-align: right">67</td>
<td style="text-align: right">177</td>
<td><a href="https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/kaiju_db_nr_2023-05-10.tgz">.tgz</a></td>
</tr>
<tr><td><strong>nr_euk</strong></td><td colspan="4">Like nr, but additionally including fungi and microbial eukaryotes, see <a href="https://github.com/bioinformatics-centre/kaiju/blob/master/util/kaiju-taxonlistEuk.tsv">taxon list</a></td></tr>
<tr>
<td></td>
<td>2023-05-10</td>
<td style="text-align: right">82</td>
<td style="text-align: right">204</td>
<td><a href="https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/kaiju_db_nr_euk_2023-05-10.tgz">.tgz</a></td>
</tr>
<tr><td><strong>refseq</strong></td><td colspan="4">Protein sequences from genome assemblies of Archaea and bacteria with assembly level "Complete Genome", as well as viral protein sequences from NCBI RefSeq.</td></tr>
<tr>
<td></td>
<td>2023-05-10</td>
<td style="text-align: right">30</td>
<td style="text-align: right">87</td>
<td><a href="https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/kaiju_db_refseq_2023-05-23.tgz">.tgz</a></td>
</tr>
</tbody>
</table>
</body></html>
'''

@patch("requests.get")
@patch("q2_moshpit.kaiju.database.tqdm")
@patch("tarfile.open")
Expand Down Expand Up @@ -72,68 +114,32 @@ def test_fetch_and_extract_db_exception(
"http://a/b/db.tar.gz", stream=True
)

def test_find_latest_db_url(self):
databases = [
('nr_euk 2021-02-24 (61GB)',
'https://hello.com/nr_euk_2021-02-24.tar.gz'),
('nr 2021-02-26 (52GB)',
'https://hello.com/nr_2021-02-26.tar.gz'),
('nr_euk 2022-01-11 (60GB)',
'https://hello.com/nr_euk_2022-01-11.tar.gz')
]
sidebox_element = BeautifulSoup(
'<html><body>{}</body></html>'.format(
''.join('<a href={}>{}</a>'.format(d[1], d[0])
for d in databases)
), 'html.parser')
url = _find_latest_db_url(
database_type='nr_euk',
sidebox_element=sidebox_element,
url='https://test.com'
)
self.assertEqual(url, 'https://hello.com/nr_euk_2022-01-11.tar.gz')

def test_find_all_dbs(self):
databases = ['nr_euk 2021-02-24 (61GB)', 'nr 2021-02-26 (52GB)']
sidebox_element = BeautifulSoup(
'<html><body>{}</body></html>'.format(
''.join('<a>{}</a>'.format(d) for d in databases)
), 'html.parser')
df = _find_all_dbs(sidebox_element)
self.assertIsInstance(df, pd.DataFrame)
self.assertListEqual(
df.index.tolist(),
['nr_euk 2021-02-24 (61GB)', 'nr 2021-02-26 (52GB)']
def test_find_latest_db_url_ok(self):
url = _find_latest_db_url(self.html_content, 'nr')
self.assertEqual(
url,
'https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/'
'kaiju_db_nr_2023-05-10.tgz'
)
self.assertListEqual(
df['Date'].tolist(),
[pd.to_datetime('2021-02-24'), pd.to_datetime('2021-02-26')]

def test_find_latest_db_url_not_found(self):
with self.assertRaises(ValueError) as context:
_find_latest_db_url(self.html_content, 'non_existing_db')
self.assertIn(
"URL for database type 'non_existing_db' not found.",
str(context.exception)
)

@patch("requests.get")
@patch("q2_moshpit.kaiju.database._fetch_and_extract_db")
def test_fetch_kaiju_db(self, mock_fetch, mock_requests):
databases = [
('nr_euk 2021-02-24 (61GB)',
'https://hello.com/nr_euk_2021-02-24.tar.gz'),
('nr 2021-02-26 (52GB)',
'https://hello.com/nr_2021-02-26.tar.gz'),
('nr_euk 2022-01-11 (60GB)',
'https://hello.com/nr_euk_2022-01-11.tar.gz')
]
mock_requests.return_value = Mock(
content='<html><body><div id="sidebox_db">{}</div></body></html>'
.format(
''.join('<a href={}>{}</a>'.format(d[1], d[0])
for d in databases)
)
)

mock_requests.return_value = Mock(content=self.html_content)
obs_db = fetch_kaiju_db('nr_euk')
self.assertIsInstance(obs_db, KaijuDBDirectoryFormat)
mock_requests.assert_called_with(KAIJU_SERVER_URL)
mock_fetch.assert_called_with(
'https://hello.com/nr_euk_2022-01-11.tar.gz',
'https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/'
'kaiju_db_nr_euk_2023-05-10.tgz',
str(obs_db.path)
)

Expand Down
2 changes: 2 additions & 0 deletions q2_moshpit/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -1345,6 +1345,8 @@
"nr",
"nr_euk",
"refseq",
"refseq_ref",
"refseq_nr",
"fungi",
"viruses",
"plasmids",
Expand Down
Loading