From 2562a9a1f6bb957866cc7a2a9917392225346390 Mon Sep 17 00:00:00 2001
From: Michal Ziemski
Date: Mon, 2 Sep 2024 14:05:37 +0200
Subject: [PATCH] FIX: update how fetch-kaiju-db action discovers DB URLs
 (#201)

---
 q2_moshpit/kaiju/database.py            |  85 ++++++-----------
 q2_moshpit/kaiju/tests/test_database.py | 116 +++++++++++++-----------
 q2_moshpit/plugin_setup.py              |   8 +-
 3 files changed, 96 insertions(+), 113 deletions(-)

diff --git a/q2_moshpit/kaiju/database.py b/q2_moshpit/kaiju/database.py
index bbcd61e6..128bc6c8 100644
--- a/q2_moshpit/kaiju/database.py
+++ b/q2_moshpit/kaiju/database.py
@@ -7,19 +7,16 @@
 # ----------------------------------------------------------------------------
 import os
 import tarfile
-from urllib.parse import urljoin
+import requests
+from bs4 import BeautifulSoup
 from tqdm import tqdm

 from q2_types.kaiju import KaijuDBDirectoryFormat
-
-from bs4 import BeautifulSoup
-import requests
-import pandas as pd
-

 CHUNK_SIZE = 8192
-KAIJU_SERVER_URL = "https://kaiju.binf.ku.dk/server"
+KAIJU_SERVER_URL = ("https://bioinformatics-centre.github.io/"
+                    "kaiju/downloads.html")
 ERR_MSG = (
     "Unable to connect to the Kaiju server. Please try again later. "
     "The error was: {}"
 )
@@ -32,8 +29,7 @@ def _fetch_and_extract_db(db_uri: str, db_dir: str):

     Args:
         db_uri (str): The URI of the database to fetch.
-        db_dir (str): The directory where the database will be saved.
-
+        db_dir (str): Path to the final DB directory.
     """
     latest_db = os.path.basename(db_uri)
     db_path = os.path.join(db_dir, latest_db)
@@ -68,56 +64,39 @@ def _fetch_and_extract_db(db_uri: str, db_dir: str):
     os.remove(db_path)


-def _find_latest_db_url(database_type, sidebox_element, url):
+def _find_latest_db_url(response: bytes, database_type: str) -> str:
     """
     Finds the latest database URL based on the database type.

     Args:
+        response (bytes): HTML response containing the table with DB URLs.
         database_type (str): The target database type to filter.
-        sidebox_element (object): The element containing the databases.
-        url (str): The base URL.

     Returns:
         str: The latest database URL.
     """
-    # Extract the databases and dates
-    df = _find_all_dbs(sidebox_element)
-
-    # Filter databases based on target_database type
-    filtered_df = df[df.index.str.contains(database_type)]
-
-    # Find the latest database
-    latest_database = filtered_df["Date"].idxmax()
-    # latest_database = filtered_df.loc[latest_index, "Database"]
-    download_link = sidebox_element.find("a", string=latest_database)["href"]
-    download_link = urljoin(url, download_link)
-
-    return download_link
-
-
-def _find_all_dbs(sidebox_element):
-    """
-    Args:
-        sidebox_element: A BeautifulSoup element containing the sidebox
-            element on the page.
-
-    Returns:
-        df: A pandas DataFrame with columns "Database" and "Date".
-            The "Database" column contains the names of the databases
-            found in the sidebox_element, while the "Date" column contains
-            the corresponding dates.
- - """ - databases, dates = [], [] - for link in sidebox_element.find_all("a"): - database = link.get_text() - date = database.split()[-2] # Last element is the date - databases.append(database) - dates.append(date) - df = pd.DataFrame({"Database": databases, "Date": dates}) - df.set_index("Database", inplace=True) - df.loc[:, "Date"] = pd.to_datetime(df.loc[:, "Date"]) - return df + soup = BeautifulSoup(response, 'html.parser') + tables = soup.find_all('table') + + for table in tables: + # Locate the table header + headers = table.find_all('th') + if headers and headers[0].get_text().strip() == "Database": + rows = table.find_all('tr') + for row in rows: + cells = row.find_all('td') + + # Check if the first cell contains the required database_type + if cells and cells[0].get_text().strip() == database_type: + # The next row contains the desired URLs + next_row = row.find_next_sibling('tr') + if next_row: + url_cell = next_row.find_all('td')[-1] + url = url_cell.find('a') + if url: + return url['href'] + + raise ValueError(f"URL for database type '{database_type}' not found.") def fetch_kaiju_db( @@ -128,12 +107,8 @@ def fetch_kaiju_db( response = requests.get(KAIJU_SERVER_URL) except requests.exceptions.RequestException as e: raise Exception(ERR_MSG.format(e)) - soup = BeautifulSoup(response.content, "html.parser") - sidebox_db = soup.find("div", id="sidebox_db") - download_link = _find_latest_db_url( - database_type, sidebox_db, KAIJU_SERVER_URL - ) + download_link = _find_latest_db_url(response.content, database_type) db = KaijuDBDirectoryFormat() _fetch_and_extract_db(download_link, str(db.path)) diff --git a/q2_moshpit/kaiju/tests/test_database.py b/q2_moshpit/kaiju/tests/test_database.py index e8a3e800..7bdb4711 100644 --- a/q2_moshpit/kaiju/tests/test_database.py +++ b/q2_moshpit/kaiju/tests/test_database.py @@ -10,12 +10,10 @@ import unittest from unittest.mock import patch, Mock -import pandas as pd -from bs4 import BeautifulSoup from qiime2.plugin.testing import TestPluginBase from q2_moshpit.kaiju.database import ( - _fetch_and_extract_db, _find_latest_db_url, _find_all_dbs, + _fetch_and_extract_db, _find_latest_db_url, fetch_kaiju_db, CHUNK_SIZE, ERR_MSG, KAIJU_SERVER_URL ) from requests.exceptions import ConnectionError, RequestException @@ -26,6 +24,50 @@ class TestDatabaseFunctions(TestPluginBase): package = 'q2_moshpit.kaiju.tests' + def setUp(self): + super().setUp() + self.html_content = b''' # noqa: E501 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+            <tr>
+                <th>Database</th>
+                <th>Date</th>
+                <th>Archive size (GB)</th>
+                <th>RAM needed (GB)</th>
+                <th>HTTPS URL</th>
+            </tr>
+            <tr>
+                <td>nr</td>
+                <td>Subset of NCBI BLAST nr database containing Archaea, bacteria and viruses.</td>
+            </tr>
+            <tr>
+                <td>2023-05-10</td>
+                <td>67</td>
+                <td>177</td>
+                <td><a href="https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/kaiju_db_nr_2023-05-10.tgz">.tgz</a></td>
+            </tr>
+            <tr>
+                <td>nr_euk</td>
+                <td>Like nr, but additionally including fungi and microbial eukaryotes, see taxon list</td>
+            </tr>
+            <tr>
+                <td>2023-05-10</td>
+                <td>82</td>
+                <td>204</td>
+                <td><a href="https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/kaiju_db_nr_euk_2023-05-10.tgz">.tgz</a></td>
+            </tr>
+            <tr>
+                <td>refseq</td>
+                <td>Protein sequences from genome assemblies of Archaea and bacteria with assembly level "Complete Genome", as well as viral protein sequences from NCBI RefSeq.</td>
+            </tr>
+            <tr>
+                <td>2023-05-10</td>
+                <td>30</td>
+                <td>87</td>
+                <td><a href="https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/kaiju_db_refseq_2023-05-10.tgz">.tgz</a></td>
+            </tr>
+        </table>
+        </body>
+        </html>
+        '''

     @patch("requests.get")
     @patch("q2_moshpit.kaiju.database.tqdm")
     @patch("tarfile.open")
@@ -72,68 +114,32 @@ def test_fetch_and_extract_db_exception(
             "http://a/b/db.tar.gz", stream=True
         )

-    def test_find_latest_db_url(self):
-        databases = [
-            ('nr_euk 2021-02-24 (61GB)',
-             'https://hello.com/nr_euk_2021-02-24.tar.gz'),
-            ('nr 2021-02-26 (52GB)',
-             'https://hello.com/nr_2021-02-26.tar.gz'),
-            ('nr_euk 2022-01-11 (60GB)',
-             'https://hello.com/nr_euk_2022-01-11.tar.gz')
-        ]
-        sidebox_element = BeautifulSoup(
-            '<div id="sidebox_db">{}</div>'.format(
-                ''.join('<a href="{}">{}</a>'.format(d[1], d[0])
-                        for d in databases)
-            ), 'html.parser')
-        url = _find_latest_db_url(
-            database_type='nr_euk',
-            sidebox_element=sidebox_element,
-            url='https://test.com'
-        )
-        self.assertEqual(url, 'https://hello.com/nr_euk_2022-01-11.tar.gz')
-
-    def test_find_all_dbs(self):
-        databases = ['nr_euk 2021-02-24 (61GB)', 'nr 2021-02-26 (52GB)']
-        sidebox_element = BeautifulSoup(
-            '<div id="sidebox_db">{}</div>'.format(
-                ''.join('<a>{}</a>'.format(d) for d in databases)
-            ), 'html.parser')
-        df = _find_all_dbs(sidebox_element)
-        self.assertIsInstance(df, pd.DataFrame)
-        self.assertListEqual(
-            df.index.tolist(),
-            ['nr_euk 2021-02-24 (61GB)', 'nr 2021-02-26 (52GB)']
+    def test_find_latest_db_url_ok(self):
+        url = _find_latest_db_url(self.html_content, 'nr')
+        self.assertEqual(
+            url,
+            'https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/'
+            'kaiju_db_nr_2023-05-10.tgz'
         )
-        self.assertListEqual(
-            df['Date'].tolist(),
-            [pd.to_datetime('2021-02-24'), pd.to_datetime('2021-02-26')]
+
+    def test_find_latest_db_url_not_found(self):
+        with self.assertRaises(ValueError) as context:
+            _find_latest_db_url(self.html_content, 'non_existing_db')
+        self.assertIn(
+            "URL for database type 'non_existing_db' not found.",
+            str(context.exception)
         )

     @patch("requests.get")
     @patch("q2_moshpit.kaiju.database._fetch_and_extract_db")
     def test_fetch_kaiju_db(self, mock_fetch, mock_requests):
-        databases = [
-            ('nr_euk 2021-02-24 (61GB)',
-             'https://hello.com/nr_euk_2021-02-24.tar.gz'),
-            ('nr 2021-02-26 (52GB)',
-             'https://hello.com/nr_2021-02-26.tar.gz'),
-            ('nr_euk 2022-01-11 (60GB)',
-             'https://hello.com/nr_euk_2022-01-11.tar.gz')
-        ]
-        mock_requests.return_value = Mock(
-            content='<div id="sidebox_db">{}</div>'
-            .format(
-                ''.join('<a href="{}">{}</a>'.format(d[1], d[0])
-                        for d in databases)
-            )
-        )
-
+        mock_requests.return_value = Mock(content=self.html_content)
         obs_db = fetch_kaiju_db('nr_euk')

         self.assertIsInstance(obs_db, KaijuDBDirectoryFormat)
         mock_requests.assert_called_with(KAIJU_SERVER_URL)
         mock_fetch.assert_called_with(
-            'https://hello.com/nr_euk_2022-01-11.tar.gz',
+            'https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/'
+            'kaiju_db_nr_euk_2023-05-10.tgz',
             str(obs_db.path)
         )

diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py
index 7a5789ce..bca10433 100644
--- a/q2_moshpit/plugin_setup.py
+++ b/q2_moshpit/plugin_setup.py
@@ -1345,6 +1345,8 @@
             "nr",
             "nr_euk",
             "refseq",
+            "refseq_ref",
+            "refseq_nr",
             "fungi",
             "viruses",
             "plasmids",
@@ -1359,13 +1361,13 @@
     input_descriptions={},
     parameter_descriptions={
         "database_type": "Type of database to be downloaded. For more "
For more " - "information on available types please see the list on " - "Kaiju's web server: https://kaiju.binf.ku.dk/server", + "information on available types please see the list on Kaiju's web " + "server: https://bioinformatics-centre.github.io/kaiju/downloads.html", }, output_descriptions={"database": "Kaiju database."}, name="Fetch Kaiju database.", description="This method fetches the latest Kaiju database from " - "https://kaiju.binf.ku.dk/server.", + "Kaiju's web server.", citations=[citations["menzel2016"]], )