FIX: update how fetch-kaiju-db action discovers DB URLs (#201)
misialq authored Sep 2, 2024
1 parent 1bb3e46 commit 2562a9a
Showing 3 changed files with 96 additions and 113 deletions.
85 changes: 30 additions & 55 deletions q2_moshpit/kaiju/database.py
@@ -7,19 +7,16 @@
# ----------------------------------------------------------------------------
import os
import tarfile
-from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

from q2_types.kaiju import KaijuDBDirectoryFormat


-from bs4 import BeautifulSoup
-import requests
-import pandas as pd

CHUNK_SIZE = 8192
-KAIJU_SERVER_URL = "https://kaiju.binf.ku.dk/server"
+KAIJU_SERVER_URL = ("https://bioinformatics-centre.github.io/"
+                    "kaiju/downloads.html")
ERR_MSG = (
    "Unable to connect to the Kaiju server. Please try again later. "
    "The error was: {}"
@@ -32,8 +29,7 @@ def _fetch_and_extract_db(db_uri: str, db_dir: str):
    Args:
        db_uri (str): The URI of the database to fetch.
-        db_dir (str): The directory where the database will be saved.
        db_dir (str): Path to the final DB directory.
    """
    latest_db = os.path.basename(db_uri)
    db_path = os.path.join(db_dir, latest_db)
@@ -68,56 +64,39 @@ def _fetch_and_extract_db(db_uri: str, db_dir: str):
    os.remove(db_path)


-def _find_latest_db_url(database_type, sidebox_element, url):
+def _find_latest_db_url(response: bytes, database_type: str) -> str:
    """
    Finds the latest database URL based on the database type.
    Args:
+        response (bytes): HTML response containing the table with DB URLs.
        database_type (str): The target database type to filter.
-        sidebox_element (object): The element containing the databases.
-        url (str): The base URL.
    Returns:
        str: The latest database URL.
    """
-    # Extract the databases and dates
-    df = _find_all_dbs(sidebox_element)
-
-    # Filter databases based on target_database type
-    filtered_df = df[df.index.str.contains(database_type)]
-
-    # Find the latest database
-    latest_database = filtered_df["Date"].idxmax()
-    # latest_database = filtered_df.loc[latest_index, "Database"]
-    download_link = sidebox_element.find("a", string=latest_database)["href"]
-    download_link = urljoin(url, download_link)
-
-    return download_link
-
-
-def _find_all_dbs(sidebox_element):
-    """
-    Args:
-        sidebox_element: A BeautifulSoup element containing the sidebox
-            element on the page.
-    Returns:
-        df: A pandas DataFrame with columns "Database" and "Date".
-            The "Database" column contains the names of the databases
-            found in the sidebox_element, while the "Date" column contains
-            the corresponding dates.
-    """
-    databases, dates = [], []
-    for link in sidebox_element.find_all("a"):
-        database = link.get_text()
-        date = database.split()[-2]  # Last element is the date
-        databases.append(database)
-        dates.append(date)
-    df = pd.DataFrame({"Database": databases, "Date": dates})
-    df.set_index("Database", inplace=True)
-    df.loc[:, "Date"] = pd.to_datetime(df.loc[:, "Date"])
-    return df
+    soup = BeautifulSoup(response, 'html.parser')
+    tables = soup.find_all('table')
+
+    for table in tables:
+        # Locate the table header
+        headers = table.find_all('th')
+        if headers and headers[0].get_text().strip() == "Database":
+            rows = table.find_all('tr')
+            for row in rows:
+                cells = row.find_all('td')
+
+                # Check if the first cell contains the required database_type
+                if cells and cells[0].get_text().strip() == database_type:
+                    # The next row contains the desired URLs
+                    next_row = row.find_next_sibling('tr')
+                    if next_row:
+                        url_cell = next_row.find_all('td')[-1]
+                        url = url_cell.find('a')
+                        if url:
+                            return url['href']
+
+    raise ValueError(f"URL for database type '{database_type}' not found.")


def fetch_kaiju_db(
@@ -128,12 +107,8 @@ def fetch_kaiju_db(
        response = requests.get(KAIJU_SERVER_URL)
    except requests.exceptions.RequestException as e:
        raise Exception(ERR_MSG.format(e))
-    soup = BeautifulSoup(response.content, "html.parser")
-    sidebox_db = soup.find("div", id="sidebox_db")

-    download_link = _find_latest_db_url(
-        database_type, sidebox_db, KAIJU_SERVER_URL
-    )
+    download_link = _find_latest_db_url(response.content, database_type)

    db = KaijuDBDirectoryFormat()
    _fetch_and_extract_db(download_link, str(db.path))
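For reference, the new discovery flow can be exercised outside the plugin. The sketch below mirrors the _find_latest_db_url logic committed above; the find_db_url wrapper and the __main__ harness are illustrative only and not part of this commit (assumes requests and beautifulsoup4 are installed).

# Illustrative sketch -- mirrors the committed parser; the find_db_url
# name and this standalone harness are assumptions, not part of the diff.
import requests
from bs4 import BeautifulSoup

DOWNLOADS_URL = ("https://bioinformatics-centre.github.io/"
                 "kaiju/downloads.html")


def find_db_url(html: bytes, database_type: str) -> str:
    soup = BeautifulSoup(html, "html.parser")
    for table in soup.find_all("table"):
        # Only the downloads table starts with a "Database" header
        headers = table.find_all("th")
        if headers and headers[0].get_text().strip() == "Database":
            for row in table.find_all("tr"):
                cells = row.find_all("td")
                # A database name row is followed by a sibling row whose
                # last cell carries the .tgz download link
                if cells and cells[0].get_text().strip() == database_type:
                    next_row = row.find_next_sibling("tr")
                    if next_row:
                        link = next_row.find_all("td")[-1].find("a")
                        if link:
                            return link["href"]
    raise ValueError(f"URL for database type '{database_type}' not found.")


if __name__ == "__main__":
    response = requests.get(DOWNLOADS_URL)
    print(find_db_url(response.content, "nr_euk"))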
116 changes: 61 additions & 55 deletions q2_moshpit/kaiju/tests/test_database.py
@@ -10,12 +10,10 @@
import unittest
from unittest.mock import patch, Mock

-import pandas as pd
-from bs4 import BeautifulSoup
from qiime2.plugin.testing import TestPluginBase

from q2_moshpit.kaiju.database import (
-    _fetch_and_extract_db, _find_latest_db_url, _find_all_dbs,
+    _fetch_and_extract_db, _find_latest_db_url,
    fetch_kaiju_db, CHUNK_SIZE, ERR_MSG, KAIJU_SERVER_URL
)
from requests.exceptions import ConnectionError, RequestException
@@ -26,6 +24,50 @@
class TestDatabaseFunctions(TestPluginBase):
    package = 'q2_moshpit.kaiju.tests'

+    def setUp(self):
+        super().setUp()
+        self.html_content = b''' # noqa: E501
+        <html><body>
+        <table>
+          <thead>
+            <tr>
+              <th>Database</th>
+              <th>Date</th>
+              <th style="text-align: right">Archive size (GB)</th>
+              <th style="text-align: right">RAM needed (GB)</th>
+              <th>HTTPS URL</th>
+            </tr>
+          </thead>
+          <tbody>
+            <tr><td><strong>nr</strong></td><td colspan="4">Subset of NCBI BLAST <a href="https://ftp.ncbi.nlm.nih.gov/blast/db/v5/v5/FASTA/">nr</a> database containing Archaea, bacteria and viruses.</td></tr>
+            <tr>
+              <td></td>
+              <td>2023-05-10</td>
+              <td style="text-align: right">67</td>
+              <td style="text-align: right">177</td>
+              <td><a href="https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/kaiju_db_nr_2023-05-10.tgz">.tgz</a></td>
+            </tr>
+            <tr><td><strong>nr_euk</strong></td><td colspan="4">Like nr, but additionally including fungi and microbial eukaryotes, see <a href="https://github.com/bioinformatics-centre/kaiju/blob/master/util/kaiju-taxonlistEuk.tsv">taxon list</a></td></tr>
+            <tr>
+              <td></td>
+              <td>2023-05-10</td>
+              <td style="text-align: right">82</td>
+              <td style="text-align: right">204</td>
+              <td><a href="https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/kaiju_db_nr_euk_2023-05-10.tgz">.tgz</a></td>
+            </tr>
+            <tr><td><strong>refseq</strong></td><td colspan="4">Protein sequences from genome assemblies of Archaea and bacteria with assembly level "Complete Genome", as well as viral protein sequences from NCBI RefSeq.</td></tr>
+            <tr>
+              <td></td>
+              <td>2023-05-10</td>
+              <td style="text-align: right">30</td>
+              <td style="text-align: right">87</td>
+              <td><a href="https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/kaiju_db_refseq_2023-05-23.tgz">.tgz</a></td>
+            </tr>
+          </tbody>
+        </table>
+        </body></html>
+        '''

@patch("requests.get")
@patch("q2_moshpit.kaiju.database.tqdm")
@patch("tarfile.open")
@@ -72,68 +114,32 @@ def test_fetch_and_extract_db_exception(
            "http://a/b/db.tar.gz", stream=True
        )

-    def test_find_latest_db_url(self):
-        databases = [
-            ('nr_euk 2021-02-24 (61GB)',
-             'https://hello.com/nr_euk_2021-02-24.tar.gz'),
-            ('nr 2021-02-26 (52GB)',
-             'https://hello.com/nr_2021-02-26.tar.gz'),
-            ('nr_euk 2022-01-11 (60GB)',
-             'https://hello.com/nr_euk_2022-01-11.tar.gz')
-        ]
-        sidebox_element = BeautifulSoup(
-            '<html><body>{}</body></html>'.format(
-                ''.join('<a href={}>{}</a>'.format(d[1], d[0])
-                        for d in databases)
-            ), 'html.parser')
-        url = _find_latest_db_url(
-            database_type='nr_euk',
-            sidebox_element=sidebox_element,
-            url='https://test.com'
-        )
-        self.assertEqual(url, 'https://hello.com/nr_euk_2022-01-11.tar.gz')
-
-    def test_find_all_dbs(self):
-        databases = ['nr_euk 2021-02-24 (61GB)', 'nr 2021-02-26 (52GB)']
-        sidebox_element = BeautifulSoup(
-            '<html><body>{}</body></html>'.format(
-                ''.join('<a>{}</a>'.format(d) for d in databases)
-            ), 'html.parser')
-        df = _find_all_dbs(sidebox_element)
-        self.assertIsInstance(df, pd.DataFrame)
-        self.assertListEqual(
-            df.index.tolist(),
-            ['nr_euk 2021-02-24 (61GB)', 'nr 2021-02-26 (52GB)']
+    def test_find_latest_db_url_ok(self):
+        url = _find_latest_db_url(self.html_content, 'nr')
+        self.assertEqual(
+            url,
+            'https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/'
+            'kaiju_db_nr_2023-05-10.tgz'
        )
-        self.assertListEqual(
-            df['Date'].tolist(),
-            [pd.to_datetime('2021-02-24'), pd.to_datetime('2021-02-26')]
-        )

+    def test_find_latest_db_url_not_found(self):
+        with self.assertRaises(ValueError) as context:
+            _find_latest_db_url(self.html_content, 'non_existing_db')
+        self.assertIn(
+            "URL for database type 'non_existing_db' not found.",
+            str(context.exception)
+        )

@patch("requests.get")
@patch("q2_moshpit.kaiju.database._fetch_and_extract_db")
def test_fetch_kaiju_db(self, mock_fetch, mock_requests):
databases = [
('nr_euk 2021-02-24 (61GB)',
'https://hello.com/nr_euk_2021-02-24.tar.gz'),
('nr 2021-02-26 (52GB)',
'https://hello.com/nr_2021-02-26.tar.gz'),
('nr_euk 2022-01-11 (60GB)',
'https://hello.com/nr_euk_2022-01-11.tar.gz')
]
mock_requests.return_value = Mock(
content='<html><body><div id="sidebox_db">{}</div></body></html>'
.format(
''.join('<a href={}>{}</a>'.format(d[1], d[0])
for d in databases)
)
)

mock_requests.return_value = Mock(content=self.html_content)
obs_db = fetch_kaiju_db('nr_euk')
self.assertIsInstance(obs_db, KaijuDBDirectoryFormat)
mock_requests.assert_called_with(KAIJU_SERVER_URL)
mock_fetch.assert_called_with(
'https://hello.com/nr_euk_2022-01-11.tar.gz',
'https://kaiju-idx.s3.eu-central-1.amazonaws.com/2023/'
'kaiju_db_nr_euk_2023-05-10.tgz',
str(obs_db.path)
)

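The updated tests hinge on one layout assumption: each database name row is followed by a sibling row whose last cell holds the archive link. A minimal sketch of that contract (the example table and URL below are made up; assumes q2_moshpit and beautifulsoup4 are importable):

from q2_moshpit.kaiju.database import _find_latest_db_url

# Smallest table the parser accepts: a "Database" header, a name row,
# and a sibling row whose last cell contains the link. Hypothetical data.
html = (
    b"<table>"
    b"<tr><th>Database</th><th>HTTPS URL</th></tr>"
    b"<tr><td><strong>nr</strong></td><td>description</td></tr>"
    b"<tr><td></td><td><a href='https://example.com/nr.tgz'>.tgz</a></td></tr>"
    b"</table>"
)

assert _find_latest_db_url(html, "nr") == "https://example.com/nr.tgz"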
8 changes: 5 additions & 3 deletions q2_moshpit/plugin_setup.py
@@ -1345,6 +1345,8 @@
                "nr",
                "nr_euk",
                "refseq",
+                "refseq_ref",
+                "refseq_nr",
                "fungi",
                "viruses",
                "plasmids",
@@ -1359,13 +1361,13 @@
    input_descriptions={},
    parameter_descriptions={
        "database_type": "Type of database to be downloaded. For more "
-        "information on available types please see the list on "
-        "Kaiju's web server: https://kaiju.binf.ku.dk/server",
+        "information on available types please see the list on Kaiju's web "
+        "server: https://bioinformatics-centre.github.io/kaiju/downloads.html",
    },
    output_descriptions={"database": "Kaiju database."},
    name="Fetch Kaiju database.",
    description="This method fetches the latest Kaiju database from "
-    "https://kaiju.binf.ku.dk/server.",
+    "Kaiju's web server.",
    citations=[citations["menzel2016"]],
)

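With the plugin installed, the action registered above can be driven from the QIIME 2 Artifact API roughly as follows. This is a usage sketch only: the qiime2.plugins.moshpit module path follows QIIME 2's standard plugin naming and is an assumption, not something shown in this diff.

# Usage sketch, not part of the commit. Assumes a QIIME 2 environment
# with q2-moshpit installed; the import path below is the conventional
# QIIME 2 plugin-API location and is an assumption.
from qiime2.plugins.moshpit.actions import fetch_kaiju_db

# database_type must be one of the registered choices, including the
# "refseq_ref" and "refseq_nr" options added in this commit.
results = fetch_kaiju_db(database_type="nr_euk")
results.database.save("kaiju-db.qza")  # "database" is the registered output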
