diff --git a/bin/search_index_clear b/bin/search_index_clear deleted file mode 100755 index 143838640c..0000000000 --- a/bin/search_index_clear +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env python -"""Mark all Works as having out-of-date search index entries. - -This guarantees that the search index is periodically rebuilt from scratch, -providing automatic recovery from bugs and major metadata changes. -""" - -from palace.manager.scripts.search import SearchIndexCoverageRemover - -SearchIndexCoverageRemover().run() diff --git a/bin/search_index_refresh b/bin/search_index_refresh deleted file mode 100755 index 9c151f3a18..0000000000 --- a/bin/search_index_refresh +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env python -"""Re-index any Works whose entries in the search index have become -out of date. -""" - -from palace.manager.scripts.coverage_provider import RunWorkCoverageProviderScript -from palace.manager.search.coverage_provider import SearchIndexCoverageProvider - -RunWorkCoverageProviderScript(SearchIndexCoverageProvider).run() diff --git a/docker/services/cron/cron.d/circulation b/docker/services/cron/cron.d/circulation index 76be17bbb7..f6251b8290 100644 --- a/docker/services/cron/cron.d/circulation +++ b/docker/services/cron/cron.d/circulation @@ -11,8 +11,6 @@ HOME=/var/www/circulation # These scripts update internal caches. # -*/30 * * * * root bin/run -d 15 search_index_refresh >> /var/log/cron.log 2>&1 -10 0 * * * root bin/run search_index_clear >> /var/log/cron.log 2>&1 0 0 * * * root bin/run update_custom_list_size >> /var/log/cron.log 2>&1 0 2 * * * root bin/run update_lane_size >> /var/log/cron.log 2>&1 */30 * * * * root bin/run -d 5 equivalent_identifiers_refresh >> /var/log/cron.log 2>&1 diff --git a/pyproject.toml b/pyproject.toml index 2aff6542a7..e4c2575b71 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -98,7 +98,9 @@ module = [ "palace.manager.core.selftest", "palace.manager.feed.*", "palace.manager.integration.*", + "palace.manager.scripts.initialization", "palace.manager.scripts.rotate_jwe_key", + "palace.manager.scripts.search", "palace.manager.search.document", "palace.manager.search.migrator", "palace.manager.search.revision", diff --git a/src/palace/manager/api/admin/controller/custom_lists.py b/src/palace/manager/api/admin/controller/custom_lists.py index 3d24cdb3e3..9ae59a7903 100644 --- a/src/palace/manager/api/admin/controller/custom_lists.py +++ b/src/palace/manager/api/admin/controller/custom_lists.py @@ -289,12 +289,13 @@ def _create_or_update_list( if membership_change: # We need to update the search index entries for works that caused a membership change, # so the upstream counts can be calculated correctly. - documents = self.search_engine.create_search_documents_from_works( - works_to_update_in_search + documents = Work.to_search_documents( + self._db, + [w.id for w in works_to_update_in_search if w.id is not None], ) - index = self.search_engine.start_updating_search_documents() - index.add_documents(documents) - index.finish() + # TODO: Does this need to be done here, or can this be done asynchronously? + self.search_engine.add_documents(documents) + self.search_engine.search_service().refresh() # If this list was used to populate any lanes, those lanes need to have their counts updated. for lane in Lane.affected_by_customlist(list): diff --git a/src/palace/manager/celery/tasks/search.py b/src/palace/manager/celery/tasks/search.py new file mode 100644 index 0000000000..2f30722214 --- /dev/null +++ b/src/palace/manager/celery/tasks/search.py @@ -0,0 +1,150 @@ +from __future__ import annotations + +from collections.abc import Sequence +from typing import Any + +from celery import chain, shared_task +from opensearchpy import OpenSearchException +from sqlalchemy import select +from sqlalchemy.orm import Session + +from palace.manager.celery.task import Task +from palace.manager.core.exceptions import BasePalaceException +from palace.manager.service.celery.celery import QueueNames +from palace.manager.sqlalchemy.model.work import Work +from palace.manager.util.backoff import exponential_backoff +from palace.manager.util.log import elapsed_time_logging + + +def get_work_search_documents( + session: Session, batch_size: int, offset: int +) -> Sequence[dict[str, Any]]: + """ + Get a batch of search documents for works that are presentation ready. + """ + works = [ + w.id + for w in session.execute( + select(Work.id) + .where(Work.presentation_ready == True) + .order_by(Work.id) + .limit(batch_size) + .offset(offset) + ) + ] + return Work.to_search_documents(session, works) + + +class FailedToIndex(BasePalaceException): + ... + + +@shared_task(queue=QueueNames.default, bind=True, max_retries=4) +def search_reindex(task: Task, offset: int = 0, batch_size: int = 500) -> None: + """ + Submit all works that are presentation ready to the search index. + + This is done in batches, with the batch size determined by the batch_size parameter. This + task will do a batch, then requeue itself until all works have been indexed. + """ + index = task.services.search.index() + + task.log.info( + f"Running search reindex at offset {offset} with batch size {batch_size}." + ) + + with ( + task.session() as session, + elapsed_time_logging( + log_method=task.log.info, + message_prefix="Works queried from database", + skip_start=True, + ), + ): + documents = get_work_search_documents(session, batch_size, offset) + + try: + with elapsed_time_logging( + log_method=task.log.info, + message_prefix="Works added to index", + skip_start=True, + ): + failed_documents = index.add_documents(documents=documents) + if failed_documents: + raise FailedToIndex(f"Failed to index {len(failed_documents)} works.") + except (FailedToIndex, OpenSearchException) as e: + wait_time = exponential_backoff(task.request.retries) + task.log.error(f"{e}. Retrying in {wait_time} seconds.") + raise task.retry(countdown=wait_time) + + if len(documents) == batch_size: + # This task is complete, but there are more works waiting to be indexed. Requeue ourselves + # to process the next batch. + raise task.replace( + search_reindex.s(offset=offset + batch_size, batch_size=batch_size) + ) + + task.log.info("Finished search reindex.") + + +@shared_task(queue=QueueNames.default, bind=True, max_retries=4) +def update_read_pointer(task: Task) -> None: + """ + Update the read pointer to the latest revision. + + This is used to indicate that the search index has been updated to a specific version. We + chain this task with search_reindex when doing a migration to ensure that the read pointer is + updated after all works have been indexed. See get_migrate_search_chain. + """ + task.log.info("Updating read pointer.") + service = task.services.search.service() + revision_directory = task.services.search.revision_directory() + revision = revision_directory.highest() + try: + service.read_pointer_set(revision) + except OpenSearchException as e: + wait_time = exponential_backoff(task.request.retries) + task.log.error( + f"Failed to update read pointer: {e}. Retrying in {wait_time} seconds." + ) + raise task.retry(countdown=wait_time) + task.log.info( + f"Updated read pointer ({service.base_revision_name} v{revision.version})." + ) + + +@shared_task(queue=QueueNames.default, bind=True, max_retries=4) +def index_work(task: Task, work_id: int) -> None: + """ + Index a single work into the search index. + """ + index = task.services.search.index() + with task.session() as session: + documents = Work.to_search_documents(session, [work_id]) + + if not documents: + # We were unable to find the work. It could have been deleted or maybe the transaction + # hasn't been committed yet. We'll wait a bit and try again. + wait_time = exponential_backoff(task.request.retries) + task.log.warning( + f"Work {work_id} not found. Unable to index. Retrying in {wait_time} seconds." + ) + raise task.retry(countdown=wait_time) + + try: + index.add_document(document=documents[0]) + except OpenSearchException as e: + wait_time = exponential_backoff(task.request.retries) + task.log.error( + f"Failed to index work {work_id}: {e}. Retrying in {wait_time} seconds." + ) + raise task.retry(countdown=wait_time) + + task.log.info(f"Indexed work {work_id}.") + + +def get_migrate_search_chain() -> chain: + """ + Get the chain of tasks to run when migrating the search index to a new schema. + """ + return chain(search_reindex.si(), update_read_pointer.si()) diff --git a/src/palace/manager/scripts/initialization.py b/src/palace/manager/scripts/initialization.py index 13c101c67d..82a0a700c7 100644 --- a/src/palace/manager/scripts/initialization.py +++ b/src/palace/manager/scripts/initialization.py @@ -1,5 +1,6 @@ from __future__ import annotations +from argparse import ArgumentParser from collections.abc import Callable from pathlib import Path @@ -9,6 +10,9 @@ from sqlalchemy.engine import Connection, Engine from sqlalchemy.orm import Session +from palace.manager.celery.tasks.search import get_migrate_search_chain +from palace.manager.search.revision import SearchSchemaRevision +from palace.manager.search.service import SearchService from palace.manager.service.container import container_instance from palace.manager.sqlalchemy.session import SessionManager from palace.manager.sqlalchemy.util import LOCK_ID_DB_INIT, pg_advisory_lock @@ -56,7 +60,7 @@ def migrate_database(self, connection: Connection) -> None: alembic_conf = self._get_alembic_config(connection, self._config_file) command.upgrade(alembic_conf, "head") - def initialize_database(self, connection: Connection) -> None: + def initialize_database_schema(self, connection: Connection) -> None: """ Initialize the database, creating tables, loading default data and then stamping the most recent migration as the current state of the DB. @@ -71,11 +75,7 @@ def initialize_database(self, connection: Connection) -> None: alembic_conf = self._get_alembic_config(connection, self._config_file) command.stamp(alembic_conf, "head") - def initialize_search_indexes(self) -> bool: - search = self._container.search.index() - return search.initialize_indices() - - def initialize(self, connection: Connection): + def initialize_database(self, connection: Connection) -> None: """Initialize the database if necessary.""" inspector = inspect(connection) if inspector.has_table("alembic_version"): @@ -91,10 +91,73 @@ def initialize(self, connection: Connection): ) else: self.log.info("Database schema does not exist. Initializing.") - self.initialize_database(connection) + self.initialize_database_schema(connection) self.log.info("Initialization complete.") - self.initialize_search_indexes() + @classmethod + def create_search_index( + cls, service: SearchService, revision: SearchSchemaRevision + ) -> None: + # Initialize a new search index by creating the index, setting the mapping, + # and setting the read and write pointers. + service.index_create(revision) + service.index_set_mapping(revision) + service.write_pointer_set(revision) + + @classmethod + def migrate_search( + cls, + service: SearchService, + revision: SearchSchemaRevision, + ) -> None: + # The revision is not the most recent. We need to create a new index. + # and start reindexing our data into it asynchronously. When the reindex + # is complete, we will switch the read pointer to the new index. + cls.logger().info(f"Creating a new index for revision (v{revision.version}).") + cls.create_search_index(service, revision) + task = get_migrate_search_chain().apply_async() + cls.logger().info( + f"Task queued to index data into new search index (Task ID: {task.id})." + ) + + def initialize_search(self) -> None: + service = self._container.search.service() + revision_directory = self._container.search.revision_directory() + revision = revision_directory.highest() + write_pointer = service.write_pointer() + read_pointer = service.read_pointer() + + if write_pointer is None or read_pointer is None: + # Pointers do not exist. This is a fresh index. + self.log.info("Search index does not exist. Creating a new index.") + self.create_search_index(service, revision) + service.read_pointer_set(revision) + elif write_pointer.version < revision.version: + self.log.info( + f"Search index is out-of-date ({service.base_revision_name} v{write_pointer.version})." + ) + self.migrate_search(service, revision) + elif read_pointer.version < revision.version: + self.log.info( + f"Search read pointer is out-of-date (v{read_pointer.version}). Latest is v{revision.version}." + f"This likely means that the reindexing task is in progress. If there is no reindexing task " + f"running, you may need to repair the search index." + ) + elif ( + read_pointer.version > revision.version + or write_pointer.version > revision.version + ): + self.log.error( + f"Search index is in an inconsistent state. Read pointer: v{read_pointer.version}, " + f"Write pointer: v{write_pointer.version}, Latest revision: v{revision.version}. " + f"You may be running an old version of the application against a new search index. " + ) + return + else: + self.log.info( + f"Search index is up-to-date ({service.base_revision_name} v{revision.version})." + ) + self.log.info("Search initialization complete.") def run(self) -> None: """ @@ -105,9 +168,18 @@ def run(self) -> None: instance of the script is running at a time. This prevents multiple instances from trying to initialize the database at the same time. """ + + # This script doesn't take any arguments, but we still call argparse, so that + # we can use the --help option to print out a help message. This avoids the + # surprise of the script actually running when the user just wanted to see the help. + ArgumentParser( + description="Initialize the database and search index for the Palace Manager." + ).parse_args() + engine = self._engine_factory() with engine.begin() as connection: with pg_advisory_lock(connection, LOCK_ID_DB_INIT): - self.initialize(connection) + self.initialize_database(connection) + self.initialize_search() engine.dispose() diff --git a/src/palace/manager/scripts/search.py b/src/palace/manager/scripts/search.py index fd76e392bf..26d06ce478 100644 --- a/src/palace/manager/scripts/search.py +++ b/src/palace/manager/scripts/search.py @@ -1,40 +1,72 @@ -from palace.manager.core.metadata_layer import TimestampData -from palace.manager.scripts.coverage_provider import RunWorkCoverageProviderScript -from palace.manager.scripts.timestamp import TimestampScript -from palace.manager.search.coverage_provider import SearchIndexCoverageProvider -from palace.manager.search.coverage_remover import RemovesSearchCoverage -from palace.manager.search.external_search import ExternalSearchIndex +import argparse +from sqlalchemy.orm import Session -class RebuildSearchIndexScript(RunWorkCoverageProviderScript, RemovesSearchCoverage): - """Completely delete the search index and recreate it.""" +from palace.manager.celery.tasks.search import get_migrate_search_chain, search_reindex +from palace.manager.scripts.base import Script +from palace.manager.search.external_search import ExternalSearchIndex +from palace.manager.service.container import Services - def __init__(self, *args, **kwargs): - search = kwargs.pop("search_index_client", None) - super().__init__(SearchIndexCoverageProvider, *args, **kwargs) - self.search: ExternalSearchIndex = search or self.services.search.index() - def do_run(self): - self.search.clear_search_documents() +class RebuildSearchIndexScript(Script): + """Completely delete the search index and recreate it.""" - # Remove all search coverage records so the - # SearchIndexCoverageProvider will start from scratch. - count = self.remove_search_coverage_records() - self.log.info("Deleted %d search coverage records.", count) + def __init__( + self, + _db: Session | None = None, + services: Services | None = None, + search_index: ExternalSearchIndex | None = None, + cmd_args: list[str] | None = None, + ) -> None: + super().__init__(_db, services) + self.search = search_index or self.services.search.index() + args = self.parse_command_line(self._db, cmd_args=cmd_args) + self.blocking: bool = args.blocking + self.delete: bool = args.delete + self.migration: bool = args.migration - # Now let the SearchIndexCoverageProvider do its thing. - return super().do_run() + @classmethod + def arg_parser(cls) -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Rebuild the search index from scratch." + ) + parser.add_argument( + "-b", + "--blocking", + action="store_true", + help="Block until the search index is rebuilt.", + ) + parser.add_argument( + "-d", + "--delete", + action="store_true", + help="Delete the search index before rebuilding.", + ) + parser.add_argument( + "-m", + "--migration", + action="store_true", + help="Treat as a migration and update the read pointer after the rebuild is complete.", + ) + return parser + def do_run(self) -> None: + """Delete all search documents, then rebuild the search index.""" + if self.delete: + self.log.info("Deleting all search documents.") + self.search.clear_search_documents() -class SearchIndexCoverageRemover(TimestampScript, RemovesSearchCoverage): - """Script that removes search index coverage for all works. + self.log.info("Rebuilding search index.") - This guarantees the SearchIndexCoverageProvider will add - fresh coverage for every Work the next time it runs. - """ + if self.migration: + rebuild_task = get_migrate_search_chain() + else: + rebuild_task = search_reindex.s() - def do_run(self): - count = self.remove_search_coverage_records() - return TimestampData( - achievements="Coverage records deleted: %(deleted)d" % dict(deleted=count) - ) + if self.blocking: + rebuild_task() + else: + task = rebuild_task.delay() + self.log.info( + f"Search index rebuild started (Task ID: {task.id}). The reindex will run in the background." + ) diff --git a/src/palace/manager/search/coverage_provider.py b/src/palace/manager/search/coverage_provider.py deleted file mode 100644 index 087cab4fc9..0000000000 --- a/src/palace/manager/search/coverage_provider.py +++ /dev/null @@ -1,81 +0,0 @@ -from __future__ import annotations - -from palace.manager.core.coverage import CoverageFailure, WorkPresentationProvider -from palace.manager.search.coverage_remover import RemovesSearchCoverage -from palace.manager.search.migrator import ( - SearchDocumentReceiver, - SearchDocumentReceiverType, - SearchMigrationInProgress, -) -from palace.manager.sqlalchemy.model.coverage import WorkCoverageRecord -from palace.manager.sqlalchemy.model.work import Work - - -class SearchIndexCoverageProvider(RemovesSearchCoverage, WorkPresentationProvider): - """Make sure all Works have up-to-date representation in the - search index. - """ - - SERVICE_NAME = "Search index coverage provider" - - DEFAULT_BATCH_SIZE = 500 - - OPERATION = WorkCoverageRecord.UPDATE_SEARCH_INDEX_OPERATION - - def __init__(self, *args, **kwargs): - search_index_client = kwargs.pop("search_index_client", None) - super().__init__(*args, **kwargs) - self.search_index_client = search_index_client or self.services.search.index() - - # - # Try to migrate to the latest schema. If the function returns None, it means - # that no migration is necessary, and we're already at the latest version. If - # we're already at the latest version, then simply upload search documents instead. - # - self.receiver = None - self.migration: None | ( - SearchMigrationInProgress - ) = self.search_index_client.start_migration() - if self.migration is None: - self.receiver: SearchDocumentReceiver = ( - self.search_index_client.start_updating_search_documents() - ) - else: - # We do have a migration, we must clear out the index and repopulate the index - self.remove_search_coverage_records() - - def on_completely_finished(self): - # Tell the search migrator that no more documents are going to show up. - target: SearchDocumentReceiverType = self.migration or self.receiver - target.finish() - - def run_once_and_update_timestamp(self): - # We do not catch exceptions here, so that the on_completely finished should not run - # if there was a runtime error - result = super().run_once_and_update_timestamp() - self.on_completely_finished() - return result - - def process_batch(self, works) -> list[Work | CoverageFailure]: - target: SearchDocumentReceiverType = self.migration or self.receiver - failures = target.add_documents( - documents=self.search_index_client.create_search_documents_from_works(works) - ) - - # Maintain a dictionary of works so that we can efficiently remove failed works later. - work_map: dict[int, Work] = {} - for work in works: - work_map[work.id] = work - - # Remove all the works that failed and create failure records for them. - results: list[Work | CoverageFailure] = [] - for failure in failures: - work = work_map[failure.id] - del work_map[failure.id] - results.append(CoverageFailure(work, repr(failure))) - - # Append all the remaining works that didn't fail. - for work in work_map.values(): - results.append(work) - - return results diff --git a/src/palace/manager/search/coverage_remover.py b/src/palace/manager/search/coverage_remover.py deleted file mode 100644 index 0def882e0b..0000000000 --- a/src/palace/manager/search/coverage_remover.py +++ /dev/null @@ -1,33 +0,0 @@ -from palace.manager.sqlalchemy.model.coverage import WorkCoverageRecord - - -class RemovesSearchCoverage: - """Mix-in class for a script that might remove all coverage records - for the search engine. - """ - - def remove_search_coverage_records(self): - """Delete all search coverage records from the database. - - :return: The number of records deleted. - """ - wcr = WorkCoverageRecord - clause = wcr.operation == wcr.UPDATE_SEARCH_INDEX_OPERATION - count = self._db.query(wcr).filter(clause).count() - - # We want records to be updated in ascending order in order to avoid deadlocks. - # To guarantee lock order, we explicitly acquire locks by using a subquery with FOR UPDATE (with_for_update). - # Please refer for my details to this SO article: - # https://stackoverflow.com/questions/44660368/postgres-update-with-order-by-how-to-do-it - self._db.execute( - wcr.__table__.delete().where( - wcr.id.in_( - self._db.query(wcr.id) - .with_for_update() - .filter(clause) - .order_by(WorkCoverageRecord.id) - ) - ) - ) - - return count diff --git a/src/palace/manager/search/external_search.py b/src/palace/manager/search/external_search.py index 81e6c78895..ba7d31abf5 100644 --- a/src/palace/manager/search/external_search.py +++ b/src/palace/manager/search/external_search.py @@ -5,7 +5,7 @@ import re import time from collections import defaultdict -from collections.abc import Iterable, Sequence +from collections.abc import Sequence from attr import define from flask_babel import lazy_gettext as _ @@ -36,13 +36,12 @@ from palace.manager.core.facets import FacetConstants from palace.manager.core.metadata_layer import IdentifierData from palace.manager.core.problem_details import INVALID_INPUT -from palace.manager.search.migrator import ( - SearchDocumentReceiver, - SearchMigrationInProgress, - SearchMigrator, -) from palace.manager.search.revision_directory import SearchRevisionDirectory -from palace.manager.search.service import SearchDocument, SearchService +from palace.manager.search.service import ( + SearchDocument, + SearchService, + SearchServiceFailedDocument, +) from palace.manager.sqlalchemy.model.contributor import Contributor from palace.manager.sqlalchemy.model.edition import Edition from palace.manager.sqlalchemy.model.identifier import Identifier @@ -64,53 +63,16 @@ class ExternalSearchIndex(LoggerMixin): def __init__( self, service: SearchService, - revision_directory: SearchRevisionDirectory, - version: int | None = None, ) -> None: - """Constructor - - :param revision_directory Override the directory of revisions that will be used. If this isn't provided, - the default directory will be used. - :param version The specific revision that will be used. If not specified, the highest version in the - revision directory will be used. - """ + """Constructor""" self._search_service = service - # Locate the revision of the search index that we're going to use. - # This will fail fast if the requested version isn't available. - self._revision_directory = revision_directory - if version: - self._revision = self._revision_directory.find(version) - else: - self._revision = self._revision_directory.highest() - - # Get references to the read and write pointers. - self._search_read_pointer = self._search_service.read_pointer_name() - self._search_write_pointer = self._search_service.write_pointer_name() - def search_service(self) -> SearchService: """Get the underlying search service.""" return self._search_service - def start_migration(self) -> SearchMigrationInProgress | None: - """Update to the latest schema, indexing the given works.""" - migrator = SearchMigrator( - revisions=self._revision_directory, - service=self._search_service, - ) - return migrator.migrate( - base_name=self._search_service.base_revision_name, - version=self._revision.version, - ) - - def start_updating_search_documents(self) -> SearchDocumentReceiver: - """Start submitting search documents for whatever is the current write pointer.""" - return SearchDocumentReceiver( - pointer=self._search_write_pointer, service=self._search_service - ) - def clear_search_documents(self) -> None: - self._search_service.index_clear_documents(pointer=self._search_write_pointer) + self._search_service.index_clear_documents() def create_search_doc(self, query_string, filter, pagination, debug): if filter and filter.search_type == "json": @@ -118,14 +80,13 @@ def create_search_doc(self, query_string, filter, pagination, debug): else: query = Query(query_string, filter) - search = query.build(self._search_service.search_client(), pagination) + search = query.build(self._search_service.read_search_client(), pagination) if debug: search = search.extra(explain=True) if filter is not None and filter.min_score is not None: search = search.extra(min_score=filter.min_score) - fields = None if debug: # Don't restrict the fields at all -- get everything. # This makes it easy to investigate everything about the @@ -195,7 +156,7 @@ def query_works_multi(self, queries, debug=False): (query string, Filter, Pagination) 3-tuple. """ # Create a MultiSearch. - multi = self._search_service.search_multi_client() + multi = self._search_service.read_search_multi_client() # Give it a Search object for every query definition passed in # as part of `queries`. @@ -253,54 +214,25 @@ def count_works(self, filter): ) return qu.count() - def create_search_documents_from_works( - self, works: Iterable[Work] - ) -> Sequence[SearchDocument]: - """Create search documents for all the given works.""" - if not works: - # There's nothing to do. Don't bother making any requests - # to the search index. - return [] - - time1 = time.time() - needs_add = [] - for work in works: - needs_add.append(work) - - # Add/update any works that need adding/updating. - docs = Work.to_search_documents(needs_add) - time2 = time.time() - - self.log.info( - "Created %i search documents in %.2f seconds" % (len(docs), time2 - time1) - ) - return docs - - def remove_work(self, work): + def remove_work(self, work: Work | int) -> None: """Remove the search document for `work` from the search index.""" - self._search_service.index_remove_document( - pointer=self._search_read_pointer, id=work.id - ) - - def initialize_indices(self) -> bool: - """Attempt to initialize the indices and pointers for a first time run""" - service = self.search_service() - read_pointer = service.read_pointer() - if not read_pointer or service.is_pointer_empty(read_pointer): - # A read pointer does not exist, or points to the empty index - # This means either this is a new deployment or the first time - # the new opensearch code was deployed. - # In both cases doing a migration to the latest version is safe. - migration = self.start_migration() - if migration is not None: - migration.finish() - else: - self.log.warning( - "Read pointer was set to empty, but no migration was available." - ) - return False - - return True + if isinstance(work, Work): + if work.id is None: + self.log.warning("Work has no ID, unable to remove. %r", work) + return + work = work.id + + self._search_service.index_remove_document(doc_id=work) + + def add_document(self, document: SearchDocument) -> None: + """Add a document to the search index.""" + self._search_service.index_submit_document(document=document) + + def add_documents( + self, documents: Sequence[SearchDocument] + ) -> list[SearchServiceFailedDocument]: + """Add multiple documents to the search index.""" + return self._search_service.index_submit_documents(documents=documents) class SearchBase: diff --git a/src/palace/manager/search/migrator.py b/src/palace/manager/search/migrator.py deleted file mode 100644 index a683db86c7..0000000000 --- a/src/palace/manager/search/migrator.py +++ /dev/null @@ -1,178 +0,0 @@ -from abc import ABC, abstractmethod -from collections.abc import Sequence - -from palace.manager.core.exceptions import BasePalaceException -from palace.manager.search.revision import SearchSchemaRevision -from palace.manager.search.revision_directory import SearchRevisionDirectory -from palace.manager.search.service import ( - SearchDocument, - SearchService, - SearchServiceFailedDocument, -) -from palace.manager.util.log import LoggerMixin - - -class SearchMigrationException(BasePalaceException): - """The type of exceptions raised by the search migrator.""" - - def __init__(self, fatal: bool, message: str): - super().__init__(message) - self.fatal = fatal - - -class SearchDocumentReceiverType(ABC): - """A receiver of search documents.""" - - @abstractmethod - def add_documents( - self, documents: Sequence[SearchDocument] - ) -> list[SearchServiceFailedDocument]: - """Submit documents to be indexed.""" - - @abstractmethod - def finish(self) -> None: - """Make sure all changes are committed.""" - - -class SearchDocumentReceiver(SearchDocumentReceiverType, LoggerMixin): - """A receiver of search documents.""" - - def __init__(self, pointer: str, service: SearchService): - self._pointer = pointer - self._service = service - - @property - def pointer(self) -> str: - """The name of the index that will receive search documents.""" - return self._pointer - - def add_documents( - self, documents: Sequence[SearchDocument] - ) -> list[SearchServiceFailedDocument]: - """Submit documents to be indexed.""" - return self._service.index_submit_documents( - pointer=self._pointer, documents=documents - ) - - def finish(self) -> None: - """Make sure all changes are committed.""" - self.log.info("Finishing search documents.") - self._service.refresh() - self.log.info("Finished search documents.") - - -class SearchMigrationInProgress(SearchDocumentReceiverType, LoggerMixin): - """A migration in progress. Documents are being submitted, and the migration must be - explicitly finished or cancelled to take effect (or not!).""" - - def __init__( - self, - base_name: str, - revision: SearchSchemaRevision, - service: SearchService, - ): - self._base_name = base_name - self._revision = revision - self._service = service - self._receiver = SearchDocumentReceiver( - pointer=self._revision.name_for_index(base_name), service=self._service - ) - - def add_documents( - self, documents: Sequence[SearchDocument] - ) -> list[SearchServiceFailedDocument]: - """Submit documents to be indexed.""" - return self._receiver.add_documents(documents) - - def finish(self) -> None: - """Finish the migration.""" - self.log.info(f"Completing migration to {self._revision.version}") - # Make sure all changes are committed. - self._receiver.finish() - # Create the "indexed" alias. - self._service.index_set_populated(self._revision) - # Update the write pointer to point to the now-populated index. - self._service.write_pointer_set(self._revision) - # Set the read pointer to point at the now-populated index - self._service.read_pointer_set(self._revision) - self._service.refresh() - self.log.info(f"Completed migration to {self._revision.version}") - - def cancel(self) -> None: - """Cancel the migration, leaving the read and write pointers untouched.""" - self.log.info(f"Cancelling migration to {self._revision.version}") - return None - - -class SearchMigrator(LoggerMixin): - """A search migrator. This moves a search service to the targeted schema version.""" - - def __init__(self, revisions: SearchRevisionDirectory, service: SearchService): - self._revisions = revisions - self._service = service - - def migrate(self, base_name: str, version: int) -> SearchMigrationInProgress | None: - """ - Migrate to the given version using the given base name (such as 'circulation-works'). The function returns - an object that expects to receive batches of search documents used to populate any new index. When all - the batches of documents have been sent to the object, callers must call 'finish' to indicate to the search - migrator that no more documents are coming. Only at this point will the migrator consider the new index to be - "populated". - - :arg base_name: The base name used for indices (such as 'circulation-works'). - :arg version: The version number to which we are migrating - - :raises SearchMigrationException: On errors, but always leaves the system in a usable state. - """ - - self.log.info(f"starting migration to {base_name} {version}") - - try: - target = self._revisions.available.get(version) - if target is None: - raise SearchMigrationException( - fatal=True, - message=f"No support is available for schema version {version}", - ) - - # Does the empty index exist? Create it if not. - self._service.create_empty_index() - - # Does the read pointer exist? Point it at the empty index if not. - read = self._service.read_pointer() - if read is None: - self.log.info("Read pointer did not exist.") - self._service.read_pointer_set_empty() - - # We're probably going to have to do a migration. We might end up returning - # this instance so that users can submit documents for indexing. - in_progress = SearchMigrationInProgress( - base_name=base_name, revision=target, service=self._service - ) - - # Does the write pointer exist? - write = self._service.write_pointer() - if write is None or (not write.version == version): - self.log.info( - f"Write pointer does not point to the desired version: {write} != {version}." - ) - # Either the write pointer didn't exist, or it's pointing at a version - # other than the one we want. Create a new index for the version we want. - self._service.index_create(target) - self._service.index_set_mapping(target) - - # The index now definitely exists, but it might not be populated. Populate it if necessary. - if not self._service.index_is_populated(target): - self.log.info("Write index is not populated.") - return in_progress - - # If we didn't need to return the migration, finish it here. This will - # update the read and write pointers appropriately. - in_progress.finish() - return None - except SearchMigrationException: - raise - except Exception as e: - raise SearchMigrationException( - fatal=True, message=f"Service raised exception: {repr(e)}" - ) from e diff --git a/src/palace/manager/search/revision.py b/src/palace/manager/search/revision.py index ec2cbc9a49..ee9c9975cf 100644 --- a/src/palace/manager/search/revision.py +++ b/src/palace/manager/search/revision.py @@ -10,32 +10,19 @@ class SearchSchemaRevision(ABC): and revisions are treated as immutable once created. """ - _version: int - # The SEARCH_VERSION variable MUST be populated in the implemented child classes - SEARCH_VERSION: int - - def __init__(self) -> None: - if self.SEARCH_VERSION is None: - raise ValueError("The SEARCH_VERSION must be defined with an integer value") - self._version = self.SEARCH_VERSION - @abstractmethod def mapping_document(self) -> SearchMappingDocument: """Produce a mapping document for this schema revision.""" @property + @abstractmethod def version(self) -> int: - return self._version + """The version number of this schema revision.""" def name_for_index(self, base_name: str) -> str: """Produce the name of the index as it will appear in Opensearch, such as 'circulation-works-v5'.""" return f"{base_name}-v{self.version}" - def name_for_indexed_pointer(self, base_name: str) -> str: - """Produce the name of the "indexed pointer" as it will appear in Opensearch, - such as 'circulation-works-v5-indexed'.""" - return f"{base_name}-v{self.version}-indexed" - def script_name(self, script_name: str) -> str: return f"simplified.{script_name}.v{self.version}" diff --git a/src/palace/manager/search/revision_directory.py b/src/palace/manager/search/revision_directory.py index 4ac41bcd61..ecbfeb2184 100644 --- a/src/palace/manager/search/revision_directory.py +++ b/src/palace/manager/search/revision_directory.py @@ -30,10 +30,6 @@ def __init__(self, available: Mapping[int, SearchSchemaRevision]): def create() -> "SearchRevisionDirectory": return SearchRevisionDirectory(SearchRevisionDirectory._create_revisions()) - @staticmethod - def empty() -> "SearchRevisionDirectory": - return SearchRevisionDirectory({}) - @property def available(self) -> Mapping[int, SearchSchemaRevision]: return self._available diff --git a/src/palace/manager/search/service.py b/src/palace/manager/search/service.py index 6f9aec3b27..5190d0cd8e 100644 --- a/src/palace/manager/search/service.py +++ b/src/palace/manager/search/service.py @@ -9,26 +9,39 @@ import opensearchpy.helpers from opensearch_dsl import MultiSearch, Search from opensearchpy import NotFoundError, OpenSearch, RequestError +from typing_extensions import Self from palace.manager.core.exceptions import BasePalaceException from palace.manager.search.revision import SearchSchemaRevision from palace.manager.util.log import LoggerMixin -@dataclass -class SearchWritePointer: - """The 'write' pointer; the pointer that will be used to populate an index with search documents.""" +@dataclass(frozen=True) +class SearchPointer: + """A search pointer, which is an alias that points to a specific index.""" - base_name: str + alias: str + index: str version: int - @property - def name(self) -> str: - return f"{self.base_name}-search-write" + @classmethod + def from_index(cls, base_name: str, alias: str, index: str) -> Self | None: + version = cls._parse_version(base_name, index) + if version is None: + return None - @property - def target_name(self) -> str: - return f"{self.base_name}-v{self.version}" + return cls( + alias=alias, + index=index, + version=version, + ) + + @classmethod + def _parse_version(cls, base_name: str, index: str) -> int | None: + match = re.search(f"^{base_name}-v([0-9]+)$", string=index) + if match is None: + return None + return int(match.group(1)) class SearchServiceException(BasePalaceException): @@ -93,45 +106,34 @@ def write_pointer_name(self) -> str: """Get the name used for the write pointer.""" @abstractmethod - def read_pointer(self) -> str | None: + def read_pointer(self) -> SearchPointer | None: """Get the read pointer, if it exists.""" @abstractmethod - def write_pointer(self) -> SearchWritePointer | None: + def write_pointer(self) -> SearchPointer | None: """Get the writer pointer, if it exists.""" - @abstractmethod - def create_empty_index(self) -> None: - """Atomically create the empty index for the given base name.""" - @abstractmethod def read_pointer_set(self, revision: SearchSchemaRevision) -> None: """Atomically set the read pointer to the index for the given revision and base name.""" - @abstractmethod - def read_pointer_set_empty(self) -> None: - """Atomically set the read pointer to the empty index for the base name.""" - @abstractmethod def index_create(self, revision: SearchSchemaRevision) -> None: """Atomically create an index for the given base name and revision.""" - @abstractmethod - def index_is_populated(self, revision: SearchSchemaRevision) -> bool: - """Return True if the index for the given base name and revision has been populated.""" - - @abstractmethod - def index_set_populated(self, revision: SearchSchemaRevision) -> None: - """Set an index as populated.""" - @abstractmethod def index_set_mapping(self, revision: SearchSchemaRevision) -> None: """Set the schema mappings for the given index.""" + @abstractmethod + def index_submit_document( + self, document: dict[str, Any], refresh: bool = False + ) -> None: + """Submit a search document to the given index.""" + @abstractmethod def index_submit_documents( self, - pointer: str, documents: Sequence[SearchDocument], ) -> list[SearchServiceFailedDocument]: """Submit search documents to the given index.""" @@ -145,25 +147,21 @@ def refresh(self) -> None: """Synchronously refresh the service and wait for changes to be completed.""" @abstractmethod - def index_clear_documents(self, pointer: str) -> None: + def index_clear_documents(self) -> None: """Clear all search documents in the given index.""" @abstractmethod - def search_client(self, write: bool = False) -> Search: + def read_search_client(self) -> Search: """Return the underlying search client.""" @abstractmethod - def search_multi_client(self, write: bool = False) -> MultiSearch: + def read_search_multi_client(self) -> MultiSearch: """Return the underlying search client.""" @abstractmethod - def index_remove_document(self, pointer: str, id: int) -> None: + def index_remove_document(self, doc_id: int) -> None: """Remove a specific document from the given index.""" - @abstractmethod - def is_pointer_empty(self, pointer: str) -> bool: - """Check to see if a pointer points to an empty index""" - class SearchServiceOpensearch1(SearchService, LoggerMixin): """The real Opensearch 1.x service.""" @@ -184,28 +182,26 @@ def __init__(self, client: OpenSearch, base_revision_name: str): def base_revision_name(self) -> str: return self._base_revision_name - def write_pointer(self) -> SearchWritePointer | None: + def _get_pointer(self, name: str) -> SearchPointer | None: try: - result = self._client.indices.get_alias(name=self.write_pointer_name()) - for name in result.keys(): - match = re.search(f"{self.base_revision_name}-v([0-9]+)", string=name) - if match: - return SearchWritePointer( - self.base_revision_name, int(match.group(1)) - ) - return None + result = self._client.indices.get_alias(name=name) + if len(result) != 1: + # This should never happen, based on my understanding of the API. + self.log.error( + f"unexpected number of indexes for alias {name}: {result}" + ) + return None + index_name = next(iter(result.keys())) + return SearchPointer.from_index( + base_name=self.base_revision_name, + alias=name, + index=index_name, + ) except NotFoundError: return None - def create_empty_index(self) -> None: - try: - index_name = self._empty(self.base_revision_name) - self.log.debug(f"creating empty index {index_name}") - self._client.indices.create(index=index_name) - except RequestError as e: - if e.error == "resource_already_exists_exception": - return - raise e + def write_pointer(self) -> SearchPointer | None: + return self._get_pointer(self.write_pointer_name()) def read_pointer_set(self, revision: SearchSchemaRevision) -> None: alias_name = self.read_pointer_name() @@ -219,34 +215,6 @@ def read_pointer_set(self, revision: SearchSchemaRevision) -> None: self.log.debug(f"setting read pointer {alias_name} to index {target_index}") self._client.indices.update_aliases(body=action) - def index_set_populated(self, revision: SearchSchemaRevision) -> None: - alias_name = revision.name_for_indexed_pointer(self.base_revision_name) - target_index = revision.name_for_index(self.base_revision_name) - action = { - "actions": [ - {"remove": {"index": "*", "alias": alias_name}}, - {"add": {"index": target_index, "alias": alias_name}}, - ] - } - self.log.debug( - f"creating 'indexed' flag alias {alias_name} for index {target_index}" - ) - self._client.indices.update_aliases(body=action) - - def read_pointer_set_empty(self) -> None: - alias_name = self.read_pointer_name() - target_index = self._empty(self.base_revision_name) - action = { - "actions": [ - {"remove": {"index": "*", "alias": alias_name}}, - {"add": {"index": target_index, "alias": alias_name}}, - ] - } - self.log.debug( - f"setting read pointer {alias_name} to empty index {target_index}" - ) - self._client.indices.update_aliases(body=action) - def index_create(self, revision: SearchSchemaRevision) -> None: try: index_name = revision.name_for_index(self.base_revision_name) @@ -260,11 +228,6 @@ def index_create(self, revision: SearchSchemaRevision) -> None: return raise e - def index_is_populated(self, revision: SearchSchemaRevision) -> bool: - return self._client.indices.exists_alias( - name=revision.name_for_indexed_pointer(self.base_revision_name) - ) - def index_set_mapping(self, revision: SearchSchemaRevision) -> None: data = {"properties": revision.mapping_document().serialize_properties()} index_name = revision.name_for_index(self.base_revision_name) @@ -277,11 +240,24 @@ def _ensure_scripts(self, revision: SearchSchemaRevision) -> None: script = dict(script=dict(lang="painless", source=body)) if not name.startswith("simplified"): name = revision.script_name(name) - self._client.put_script(name, script) # type: ignore [misc] ## Seems the types aren't up to date + self._client.put_script(name, body=script) + + def index_submit_document( + self, document: dict[str, Any], refresh: bool = False + ) -> None: + _id = document.pop("_id") + self._client.index( + id=_id, + index=self.write_pointer_name(), + body=document, + require_alias=True, + refresh=refresh, + ) def index_submit_documents( - self, pointer: str, documents: Sequence[SearchDocument] + self, documents: Sequence[SearchDocument] ) -> list[SearchServiceFailedDocument]: + pointer = self.write_pointer_name() self.log.info(f"submitting documents to index {pointer}") # Specifically override the target in all documents to the target pointer @@ -317,9 +293,11 @@ def index_submit_documents( return error_results - def index_clear_documents(self, pointer: str) -> None: + def index_clear_documents(self) -> None: self._client.delete_by_query( - index=pointer, body={"query": {"match_all": {}}}, wait_for_completion=True + index=self.write_pointer_name(), + body={"query": {"match_all": {}}}, + wait_for_completion=True, ) def refresh(self) -> None: @@ -338,27 +316,14 @@ def write_pointer_set(self, revision: SearchSchemaRevision) -> None: self.log.debug(f"setting write pointer {alias_name} to {target_index}") self._client.indices.update_aliases(body=action) - def read_pointer(self) -> str | None: - try: - result: dict[str, Any] = self._client.indices.get_alias( - name=self.read_pointer_name() - ) - for name in result.keys(): - if name.startswith(f"{self.base_revision_name}-"): - return name - return None - except NotFoundError: - return None + def read_pointer(self) -> SearchPointer | None: + return self._get_pointer(self.read_pointer_name()) - def search_client(self, write: bool = False) -> Search: - return self._search.index( - self.read_pointer_name() if not write else self.write_pointer_name() - ) + def read_search_client(self) -> Search: + return self._search.index(self.read_pointer_name()) - def search_multi_client(self, write: bool = False) -> MultiSearch: - return self._multi_search.index( - self.read_pointer_name() if not write else self.write_pointer_name() - ) + def read_search_multi_client(self) -> MultiSearch: + return self._multi_search.index(self.read_pointer_name()) def read_pointer_name(self) -> str: return f"{self.base_revision_name}-search-read" @@ -366,12 +331,5 @@ def read_pointer_name(self) -> str: def write_pointer_name(self) -> str: return f"{self.base_revision_name}-search-write" - @staticmethod - def _empty(base_name: str) -> str: - return f"{base_name}-empty" - - def index_remove_document(self, pointer: str, id: int) -> None: - self._client.delete(index=pointer, id=id, doc_type="_doc") - - def is_pointer_empty(self, pointer: str) -> bool: - return pointer == self._empty(self.base_revision_name) + def index_remove_document(self, doc_id: int) -> None: + self._client.delete(index=self.write_pointer_name(), id=doc_id, doc_type="_doc") diff --git a/src/palace/manager/search/v5.py b/src/palace/manager/search/v5.py index c0237aa391..e95ec14b4c 100644 --- a/src/palace/manager/search/v5.py +++ b/src/palace/manager/search/v5.py @@ -16,7 +16,10 @@ class SearchV5(SearchSchemaRevision): - SEARCH_VERSION = 5 + @property + def version(self) -> int: + return 5 + """ The body of this mapping looks for bibliographic information in the core document, primarily used for matching search diff --git a/src/palace/manager/service/celery/celery.py b/src/palace/manager/service/celery/celery.py index c7caa77efc..6e05f4bcb3 100644 --- a/src/palace/manager/service/celery/celery.py +++ b/src/palace/manager/service/celery/celery.py @@ -32,7 +32,11 @@ def beat_schedule() -> dict[str, Any]: "update_custom_lists": { "task": "custom_list.update_custom_lists", "schedule": crontab(minute="5"), # Run every hour at 5 minutes past - } + }, + "full_search_reindex": { + "task": "search.search_reindex", + "schedule": crontab(hour="0", minute="10"), # Run every day at 12:10 AM + }, } diff --git a/src/palace/manager/service/search/container.py b/src/palace/manager/service/search/container.py index 379dcf3e2d..e972adfb27 100644 --- a/src/palace/manager/service/search/container.py +++ b/src/palace/manager/service/search/container.py @@ -31,5 +31,4 @@ class Search(DeclarativeContainer): index: Provider[ExternalSearchIndex] = providers.Singleton( ExternalSearchIndex, service=service, - revision_directory=revision_directory, ) diff --git a/src/palace/manager/sqlalchemy/model/work.py b/src/palace/manager/sqlalchemy/model/work.py index 5f19627f55..841089005e 100644 --- a/src/palace/manager/sqlalchemy/model/work.py +++ b/src/palace/manager/sqlalchemy/model/work.py @@ -29,10 +29,11 @@ from sqlalchemy.orm import Mapped, contains_eager, joinedload, relationship from sqlalchemy.orm.base import NO_VALUE from sqlalchemy.orm.session import Session -from sqlalchemy.sql.expression import and_, case, join, literal_column, select +from sqlalchemy.sql.expression import and_, case, literal_column, select from sqlalchemy.sql.functions import func from palace.manager.core.classifier import Classifier, WorkClassifier +from palace.manager.core.exceptions import BasePalaceException from palace.manager.search.service import SearchDocument from palace.manager.sqlalchemy.constants import DataSourceConstants from palace.manager.sqlalchemy.model.base import Base @@ -1203,19 +1204,21 @@ def _reset_coverage(self, operation): return record def external_index_needs_updating(self): - """Mark this work as needing to have its search document reindexed. - This is a more efficient alternative to reindexing immediately, - since these WorkCoverageRecords are handled in large batches. + """Mark this work as needing to have its search document reindexed.""" + return self.queue_indexing_task(self.id) + + @staticmethod + def queue_indexing_task(work_id: int | None): """ - return self._reset_coverage(WorkCoverageRecord.UPDATE_SEARCH_INDEX_OPERATION) + Queue a task to index a Work. - def update_external_index(self, client, add_coverage_record=True): - """Create a WorkCoverageRecord so that this work's - entry in the search index can be modified or deleted. - This method is deprecated -- call - external_index_needs_updating() instead. + This is a static method, so it can be easily mocked in tests, + when we don't want to actually queue a task. """ - self.external_index_needs_updating() + from palace.manager.celery.tasks.search import index_work + + if work_id is not None: + index_work.apply_async((work_id,), countdown=2) def needs_full_presentation_recalculation(self): """Mark this work as needing to have its presentation completely @@ -1415,19 +1418,23 @@ def assign_appeals(self, character, language, setting, story, cutoff=0.20): OPENSEARCH_TIME_FORMAT = 'YYYY-MM-DD"T"HH24:MI:SS"."MS' @classmethod - def to_search_documents(cls, works: list[Self]) -> Sequence[SearchDocument]: + def to_search_documents( + cls, session: Session, work_ids: Sequence[int] + ) -> Sequence[SearchDocument]: """In app to search documents needed to ease off the burden of complex queries from the DB cluster No recursive identifier policy is taken here as using the RecursiveEquivalentsCache implicitly has that set """ - _db = Session.object_session(works[0]) + if not work_ids: + return [] - qu = _db.query(Work).filter(Work.id.in_([w.id for w in works])) + qu = session.query(Work).filter(Work.id.in_(work_ids)) qu = qu.options( joinedload(Work.presentation_edition) .joinedload(Edition.contributions) .joinedload(Contribution.contributor), + joinedload(Work.suppressed_for), joinedload(Work.work_genres).joinedload(WorkGenre.genre), joinedload(Work.custom_list_entries), ) @@ -1444,14 +1451,14 @@ def to_search_documents(cls, works: list[Self]) -> Sequence[SearchDocument]: ## Add it to another table so it becomes faster to just query the pre-computed table equivalent_identifiers = ( - _db.query(RecursiveEquivalencyCache) + session.query(RecursiveEquivalencyCache) .join( Edition, Edition.primary_identifier_id == RecursiveEquivalencyCache.parent_identifier_id, ) .join(Work, Work.presentation_edition_id == Edition.id) - .filter(Work.id.in_(w.id for w in works)) + .filter(Work.id.in_(work_ids)) .with_entities( Work.id.label("work_id"), RecursiveEquivalencyCache.identifier_id.label("equivalent_id"), @@ -1459,19 +1466,19 @@ def to_search_documents(cls, works: list[Self]) -> Sequence[SearchDocument]: .cte("equivalent_cte") ) - identifiers_query = ( - select( - [ - equivalent_identifiers.c.work_id, - Identifier.identifier, - Identifier.type, - ] - ) - .select_from(Identifier) - .where(Identifier.id == literal_column("equivalent_cte.equivalent_id")) + identifiers_query = select( + [ + equivalent_identifiers.c.work_id, + Identifier.identifier, + Identifier.type, + ] + ).join_from( + Identifier, + equivalent_identifiers, + Identifier.id == literal_column("equivalent_cte.equivalent_id"), ) - identifiers = list(_db.execute(identifiers_query)) + identifiers = list(session.execute(identifiers_query)) ## IDENTIFIERS END ## CLASSIFICATION START @@ -1515,16 +1522,16 @@ def to_search_documents(cls, works: list[Self]) -> Sequence[SearchDocument]: and_(Subject.type.in_(Subject.TYPES_FOR_SEARCH), term_column != None), ) .group_by(scheme_column, term_column, equivalent_identifiers.c.work_id) - .where( + .join_from( + Classification, + equivalent_identifiers, Classification.identifier_id - == literal_column("equivalent_cte.equivalent_id") - ) - .select_from( - join(Classification, Subject, Classification.subject_id == Subject.id) + == literal_column("equivalent_cte.equivalent_id"), ) + .join_from(Classification, Subject, Classification.subject_id == Subject.id) ) - all_subjects = list(_db.execute(subjects)) + all_subjects = list(session.execute(subjects)) ## CLASSIFICATION END @@ -1545,7 +1552,7 @@ def to_search_documents(cls, works: list[Self]) -> Sequence[SearchDocument]: return results @classmethod - def search_doc_as_dict(cls, doc: Self): + def search_doc_as_dict(cls, doc: Self) -> dict[str, Any]: columns = { "work": [ "fiction", @@ -1732,9 +1739,15 @@ def target_age_query(self, foreign_work_id_field): target_age = select([upper, lower]).where(Work.id == foreign_work_id_field) return target_age - def to_search_document(self): + def to_search_document(self) -> dict[str, Any]: """Generate a search document for this Work.""" - return Work.to_search_documents([self])[0] + db = Session.object_session(self) + if self.id is None: + raise BasePalaceException( + "Work has no ID. Cannot generate search document." + ) + + return Work.to_search_documents(db, [self.id])[0] def mark_licensepools_as_superceded(self): """Make sure that all but the single best open-access LicensePool for diff --git a/src/palace/manager/util/backoff.py b/src/palace/manager/util/backoff.py new file mode 100644 index 0000000000..6d06551c66 --- /dev/null +++ b/src/palace/manager/util/backoff.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from random import randrange +from typing import Literal + + +def exponential_backoff( + retries: int, max_backoff_time: Literal[False] | int = False +) -> int: + """ + Exponential backoff time, based on number of retries. + + The backoff includes some random jitter to prevent thundering herd, if + many items are retrying at the same time. + + :param retries: The number of retries that have already been attempted. + :param max_backoff_time: The maximum number of seconds to wait before the next retry. This is used as + a cap on the backoff time, to prevent the backoff time from growing too large. If False, there is + no cap. It is used to cap the backoff time BEFORE jitter is added, so the actual backoff time may be + up to 30% larger than this value. + :return: The number of seconds to wait before the next retry. + """ + backoff: int = 3 ** (retries + 1) + if max_backoff_time is not False: + backoff = min(backoff, max_backoff_time) + max_jitter = round(backoff * 0.3) + jitter: int = randrange(0, max_jitter) + return backoff + jitter diff --git a/tests/fixtures/api_controller.py b/tests/fixtures/api_controller.py index 7b8c5de333..031a2f4dd6 100644 --- a/tests/fixtures/api_controller.py +++ b/tests/fixtures/api_controller.py @@ -331,9 +331,7 @@ def add_works(self, works: list[WorkSpec]): work.license_pools[0].collection = self.collection self.works.append(work) - self.manager.external_search.search_service().index_submit_documents( - self.manager.external_search._search_write_pointer, [self.works] - ) + self.manager.external_search.search_service().index_submit_documents(self.works) self.manager.external_search.mock_query_works_multi(self.works) diff --git a/tests/fixtures/database.py b/tests/fixtures/database.py index 42d43d5197..5137c680af 100644 --- a/tests/fixtures/database.py +++ b/tests/fixtures/database.py @@ -10,7 +10,7 @@ from contextlib import contextmanager from functools import cached_property from textwrap import dedent -from typing import Any +from typing import TYPE_CHECKING, Any from unittest.mock import patch import pytest @@ -67,6 +67,9 @@ from palace.manager.util.datetime_helpers import utc_now from tests.fixtures.services import ServicesFixture +if TYPE_CHECKING: + from tests.fixtures.search import WorkExternalIndexingFixture + class TestIdFixture: """ @@ -1099,6 +1102,7 @@ def db( database_creation: DatabaseCreationFixture, database: DatabaseFixture, services_fixture: ServicesFixture, + work_external_indexing: WorkExternalIndexingFixture, ) -> Generator[DatabaseTransactionFixture, None, None]: with DatabaseTransactionFixture.fixture(database, services_fixture) as db: with ( diff --git a/tests/fixtures/search.py b/tests/fixtures/search.py index 5626fa7fe6..54194468a0 100644 --- a/tests/fixtures/search.py +++ b/tests/fixtures/search.py @@ -2,13 +2,15 @@ from collections.abc import Generator from contextlib import contextmanager +from unittest.mock import patch import pytest from opensearchpy import OpenSearch from pydantic import AnyHttpUrl -from palace.manager.search.coverage_provider import SearchIndexCoverageProvider +from palace.manager.celery.tasks.search import get_work_search_documents from palace.manager.search.external_search import ExternalSearchIndex +from palace.manager.search.revision import SearchSchemaRevision from palace.manager.search.service import SearchServiceOpensearch1 from palace.manager.service.configuration import ServiceConfiguration from palace.manager.service.container import Services, wire_container @@ -56,6 +58,9 @@ def __init__( self.client: OpenSearch = services.search.client() self.service: SearchServiceOpensearch1 = services.search.service() self.index: ExternalSearchIndex = services.search.index() + self.revision: SearchSchemaRevision = ( + services.search.revision_directory().highest() + ) # Make sure the services container is wired up with the newly created search container wire_container(self.services_container) @@ -116,13 +121,17 @@ def __init__(self, search_fixture: ExternalSearchFixture): self.external_search = search_fixture self.external_search_index = search_fixture.index + # Set up the search indices and mapping + self.external_search.service.index_create(self.external_search.revision) + self.external_search.service.index_set_mapping(self.external_search.revision) + self.external_search.service.write_pointer_set(self.external_search.revision) + self.external_search.service.read_pointer_set(self.external_search.revision) + def populate_search_index(self): """Populate the search index with a set of works. The given callback is passed this fixture instance.""" # Add all the works created in the setup to the search index. - SearchIndexCoverageProvider( - self.external_search.db.session, - search_index_client=self.external_search_index, - ).run() + documents = get_work_search_documents(self.db.session, 1000, 0) + self.external_search_index.add_documents(documents) self.external_search.client.indices.refresh() @staticmethod @@ -288,3 +297,52 @@ def external_search_fake_fixture( """Ask for an external search system that can be populated with data for end-to-end tests.""" with ExternalSearchFixtureFake.fixture(db, services_fixture.services) as fixture: yield fixture + + +class WorkExternalIndexingFixture: + """ + In normal operation, when external_index_needs_updating is called on Work, it + queues a task to index the Work in the external search index. We don't have + the full Celery setup in tests, so we need to mock this behavior. + """ + + def __init__(self): + self.queued_works = set() + self.patch = patch.object(Work, "queue_indexing_task", self.queue) + + def queue(self, work_id: int | None) -> None: + return self.queued_works.add(work_id) + + def clear(self): + self.queued_works.clear() + + def disable_fixture(self): + self.patch.stop() + + def is_queued(self, work: int | Work, *, clear: bool = False) -> bool: + if isinstance(work, Work): + work_id = work.id + else: + work_id = work + queued = work_id in self.queued_works + + if clear: + self.clear() + + return queued + + @classmethod + @contextmanager + def fixture(cls): + fixture = cls() + fixture.patch.start() + try: + yield fixture + finally: + fixture.patch.stop() + + +@pytest.fixture(scope="function") +def work_external_indexing() -> Generator[WorkExternalIndexingFixture, None, None]: + with WorkExternalIndexingFixture.fixture() as fixture: + yield fixture diff --git a/tests/fixtures/services.py b/tests/fixtures/services.py index a38030bb58..06556b1fa1 100644 --- a/tests/fixtures/services.py +++ b/tests/fixtures/services.py @@ -88,7 +88,7 @@ def services_search_fixture() -> ServicesSearchFixture: search_container = Search() client_mock = create_autospec(boto3.client) service_mock = create_autospec(SearchServiceOpensearch1) - revision_directory_mock = create_autospec(SearchRevisionDirectory.create) + revision_directory_mock = create_autospec(SearchRevisionDirectory) index_mock = create_autospec(ExternalSearchIndex) search_container.client.override(client_mock) search_container.service.override(service_mock) diff --git a/tests/manager/celery/tasks/test_search.py b/tests/manager/celery/tasks/test_search.py new file mode 100644 index 0000000000..78f95eb395 --- /dev/null +++ b/tests/manager/celery/tasks/test_search.py @@ -0,0 +1,303 @@ +from unittest.mock import MagicMock, call, patch + +import pytest +from celery.exceptions import MaxRetriesExceededError +from opensearchpy import OpenSearchException + +from palace.manager.celery.tasks.search import ( + get_migrate_search_chain, + get_work_search_documents, + index_work, + search_reindex, + update_read_pointer, +) +from palace.manager.scripts.initialization import InstanceInitializationScript +from tests.fixtures.celery import CeleryFixture +from tests.fixtures.database import DatabaseTransactionFixture +from tests.fixtures.search import EndToEndSearchFixture +from tests.fixtures.services import ServicesFixture +from tests.mocks.search import MockSearchSchemaRevisionLatest + + +def test_get_work_search_documents(db: DatabaseTransactionFixture) -> None: + work1 = db.work(with_open_access_download=True) + work2 = db.work(with_open_access_download=True) + # This work is not presentation ready, because it has no open access download. + work3 = db.work(with_open_access_download=False) + work4 = db.work(with_open_access_download=True) + + documents = get_work_search_documents(db.session, 2, 0) + assert {doc["_id"] for doc in documents} == {work1.id, work2.id} + + documents = get_work_search_documents(db.session, 2, 2) + assert {doc["_id"] for doc in documents} == {work4.id} + + documents = get_work_search_documents(db.session, 2, 4) + assert documents == [] + + +def test_search_reindex( + db: DatabaseTransactionFixture, + celery_fixture: CeleryFixture, + end_to_end_search_fixture: EndToEndSearchFixture, +) -> None: + client = end_to_end_search_fixture.external_search.client + index = end_to_end_search_fixture.external_search_index + + work1 = db.work(with_open_access_download=True) + work2 = db.work(with_open_access_download=True) + work3 = db.work(with_open_access_download=False) + work4 = db.work(with_open_access_download=True) + + # The works are not in the search index. + client.indices.refresh() + end_to_end_search_fixture.expect_results([], "") + + # Index the works, use a small batch size to test the pagination. + search_reindex.delay(batch_size=2).wait() + client.indices.refresh() + + # Check that the works are in the search index. + end_to_end_search_fixture.expect_results([work1, work2, work4], "", ordered=False) + + # Remove work1 from the search index. + index.remove_work(work1) + client.indices.refresh() + end_to_end_search_fixture.expect_results([work2, work4], "", ordered=False) + + # Reindex the works. + search_reindex.delay().wait() + client.indices.refresh() + + # Check that all the works are in the search index. + end_to_end_search_fixture.expect_results([work1, work2, work4], "", ordered=False) + + +@patch("palace.manager.celery.tasks.search.exponential_backoff") +def test_search_reindex_failures( + mock_backoff: MagicMock, + celery_fixture: CeleryFixture, + services_fixture: ServicesFixture, +): + # Make sure our backoff function doesn't delay the test. + mock_backoff.return_value = 0 + + add_documents_mock = services_fixture.search_fixture.index_mock.add_documents + + # If we fail to add documents, we should retry up to 4 times, then fail. + add_documents_mock.return_value = [1, 2, 3] + with pytest.raises(MaxRetriesExceededError): + search_reindex.delay().wait() + assert add_documents_mock.call_count == 5 + mock_backoff.assert_has_calls([call(0), call(1), call(2), call(3), call(4)]) + + add_documents_mock.reset_mock() + add_documents_mock.side_effect = [[1, 2, 3], OpenSearchException(), None] + search_reindex.delay().wait() + assert add_documents_mock.call_count == 3 + + +@patch("palace.manager.celery.tasks.search.exponential_backoff") +@patch("palace.manager.celery.tasks.search.get_work_search_documents") +def test_search_reindex_failures_multiple_batch( + mock_get_work_search_documents: MagicMock, + mock_backoff: MagicMock, + celery_fixture: CeleryFixture, + services_fixture: ServicesFixture, +): + # When a batch succeeds, the retry count is reset. + mock_backoff.return_value = 0 + search_documents = [ + {"_id": 1}, + {"_id": 2}, + {"_id": 3}, + {"_id": 4}, + {"_id": 5}, + {"_id": 6}, + {"_id": 7}, + ] + mock_get_work_search_documents.side_effect = ( + lambda session, batch_size, offset: search_documents[ + offset : offset + batch_size + ] + ) + add_documents_mock = services_fixture.search_fixture.index_mock.add_documents + add_documents_mock.side_effect = [ + # First batch + OpenSearchException(), + OpenSearchException(), + OpenSearchException(), + OpenSearchException(), + None, + # Second batch + OpenSearchException(), + None, + # Third batch + None, + # Fourth batch + OpenSearchException(), + OpenSearchException(), + None, + ] + search_reindex.delay(batch_size=2).wait() + assert add_documents_mock.call_count == 11 + + +def test_index_work( + db: DatabaseTransactionFixture, + celery_fixture: CeleryFixture, + end_to_end_search_fixture: EndToEndSearchFixture, +) -> None: + client = end_to_end_search_fixture.external_search.client + index = end_to_end_search_fixture.external_search_index + + work1_id = db.work(with_open_access_download=True).id + work2_id = db.work(with_open_access_download=True).id + + # The works are not in the search index. + client.indices.refresh() + results = index.query_works("") + assert len(results) == 0 + + # Index work2 + index_work.delay(work2_id).wait() + client.indices.refresh() + + # Check that it made it into the search index. + [result] = index.query_works("") + assert result.work_id == work2_id + + # Index work1 + index_work.delay(work1_id).wait() + client.indices.refresh() + + # Check that both works are in the search index. + results = index.query_works("") + assert {result.work_id for result in results} == {work1_id, work2_id} + + +@patch("palace.manager.celery.tasks.search.exponential_backoff") +def test_index_work_failures( + mock_backoff: MagicMock, + celery_fixture: CeleryFixture, + services_fixture: ServicesFixture, + caplog: pytest.LogCaptureFixture, + db: DatabaseTransactionFixture, +): + # Make sure our backoff function doesn't delay the test. + mock_backoff.return_value = 0 + + # If we try to index a work that doesn't exist, we retry up to 4 times, then fail. + with pytest.raises(MaxRetriesExceededError): + index_work.delay(555).wait() + assert "Work 555 not found" in caplog.text + + # If we fail to add documents, we should retry up to 4 times, then fail. + work = db.work(with_open_access_download=True) + add_document_mock = services_fixture.search_fixture.index_mock.add_document + add_document_mock.side_effect = OpenSearchException() + with pytest.raises(MaxRetriesExceededError): + index_work.delay(work.id).wait() + assert add_document_mock.call_count == 5 + + add_document_mock.reset_mock() + add_document_mock.side_effect = [OpenSearchException(), None] + index_work.delay(work.id).wait() + assert add_document_mock.call_count == 2 + + +def test_update_read_pointer( + db: DatabaseTransactionFixture, + celery_fixture: CeleryFixture, + end_to_end_search_fixture: EndToEndSearchFixture, +): + client = end_to_end_search_fixture.external_search.client + service = end_to_end_search_fixture.external_search.service + + # Remove the read pointer + alias_name = service.read_pointer_name() + action = { + "actions": [ + {"remove": {"index": "*", "alias": alias_name}}, + ] + } + client.indices.update_aliases(body=action) + + # Verify that the read pointer is gone + assert service.read_pointer() is None + + # Update the read pointer + update_read_pointer.delay().wait() + + # Verify that the read pointer is set + assert service.read_pointer() is not None + + +@patch("palace.manager.celery.tasks.search.exponential_backoff") +def test_update_read_pointer_failures( + mock_backoff: MagicMock, + celery_fixture: CeleryFixture, + services_fixture: ServicesFixture, +): + # Make sure our backoff function doesn't delay the test. + mock_backoff.return_value = 0 + + read_pointer_set_mock = ( + services_fixture.search_fixture.service_mock.read_pointer_set + ) + read_pointer_set_mock.side_effect = OpenSearchException() + with pytest.raises(MaxRetriesExceededError): + update_read_pointer.delay().wait() + assert read_pointer_set_mock.call_count == 5 + + read_pointer_set_mock.reset_mock() + read_pointer_set_mock.side_effect = [OpenSearchException(), None] + update_read_pointer.delay().wait() + assert read_pointer_set_mock.call_count == 2 + + +def test_get_migrate_search_chain( + db: DatabaseTransactionFixture, + celery_fixture: CeleryFixture, + end_to_end_search_fixture: EndToEndSearchFixture, +): + client = end_to_end_search_fixture.external_search.client + service = end_to_end_search_fixture.external_search.service + revision = end_to_end_search_fixture.external_search.revision + services = end_to_end_search_fixture.external_search.services_container + revision_directory = services.search.revision_directory() + + works = [ + db.work(title=f"Work {x}", with_open_access_download=True) for x in range(10) + ] + + end_to_end_search_fixture.populate_search_index() + new_revision = MockSearchSchemaRevisionLatest(1010101) + new_revision_index_name = new_revision.name_for_index(service.base_revision_name) + revision_directory.available[new_revision.version] = new_revision + + InstanceInitializationScript.create_search_index(service, new_revision) + + # The write pointer should point to the new revision + write_pointer = service.write_pointer() + assert write_pointer is not None + assert write_pointer.index == new_revision_index_name + assert write_pointer.version == new_revision.version + + # The read pointer should still point to the old revision + read_pointer = service.read_pointer() + assert read_pointer is not None + assert read_pointer.index == revision.name_for_index(service.base_revision_name) + assert read_pointer.version == revision.version + + # Run the migration task + get_migrate_search_chain().delay().wait() + + # The read pointer should now point to the new revision + read_pointer = service.read_pointer() + assert read_pointer is not None + assert read_pointer.index == new_revision_index_name + + # And we should have all the works in the new index + client.indices.refresh() + end_to_end_search_fixture.expect_results(works, "", ordered=False) diff --git a/tests/manager/scripts/test_informational.py b/tests/manager/scripts/test_informational.py index 5bc219f5e1..89feff9240 100644 --- a/tests/manager/scripts/test_informational.py +++ b/tests/manager/scripts/test_informational.py @@ -338,9 +338,9 @@ def test_no_presentation_ready_works( ): # This work is not presentation-ready. work = db.work(with_license_pool=True) - end_to_end_search_fixture.external_search_index.initialize_indices() work.presentation_ready = False - script = MockWhereAreMyBooks( + end_to_end_search_fixture.populate_search_index() + MockWhereAreMyBooks( _db=db.session, search=end_to_end_search_fixture.external_search_index ) self.check_explanation( @@ -357,11 +357,12 @@ def test_no_delivery_mechanisms( ): # This work has a license pool, but no delivery mechanisms. work = db.work(with_license_pool=True) - end_to_end_search_fixture.external_search_index.initialize_indices() for lpdm in work.license_pools[0].delivery_mechanisms: db.session.delete(lpdm) + end_to_end_search_fixture.populate_search_index() self.check_explanation( no_delivery_mechanisms=1, + in_search_index=1, db=db, end_to_end_search_fixture=end_to_end_search_fixture, ) @@ -373,8 +374,8 @@ def test_suppressed_pool( ): # This work has a license pool, but it's suppressed. work = db.work(with_license_pool=True) - end_to_end_search_fixture.external_search_index.initialize_indices() work.license_pools[0].suppressed = True + end_to_end_search_fixture.populate_search_index() self.check_explanation( suppressed=1, db=db, @@ -388,8 +389,8 @@ def test_no_licenses( ): # This work has a license pool, but no licenses owned. work = db.work(with_license_pool=True) - end_to_end_search_fixture.external_search_index.initialize_indices() work.license_pools[0].licenses_owned = 0 + end_to_end_search_fixture.populate_search_index() self.check_explanation( not_owned=1, db=db, @@ -401,14 +402,10 @@ def test_search_engine( db: DatabaseTransactionFixture, end_to_end_search_fixture: EndToEndSearchFixture, ): - search = end_to_end_search_fixture.external_search_index work = db.work(with_license_pool=True) work.presentation_ready = True - docs = search.start_migration() - assert docs is not None - docs.add_documents(search.create_search_documents_from_works([work])) - docs.finish() + end_to_end_search_fixture.populate_search_index() # This search index will always claim there is one result. self.check_explanation( diff --git a/tests/manager/scripts/test_initialization.py b/tests/manager/scripts/test_initialization.py index 0a36bbdc28..f02237218b 100644 --- a/tests/manager/scripts/test_initialization.py +++ b/tests/manager/scripts/test_initialization.py @@ -4,14 +4,20 @@ from pathlib import Path from unittest.mock import MagicMock, patch +import pytest from _pytest.logging import LogCaptureFixture from alembic.util import CommandError +from opensearchpy.exceptions import OpenSearchException from palace.manager.scripts.initialization import InstanceInitializationScript +from palace.manager.search.revision_directory import SearchRevisionDirectory +from palace.manager.service.logging.configuration import LogLevel from palace.manager.sqlalchemy.session import SessionManager from palace.manager.sqlalchemy.util import LOCK_ID_DB_INIT from tests.fixtures.database import DatabaseTransactionFixture -from tests.fixtures.search import EndToEndSearchFixture, ExternalSearchFixtureFake +from tests.fixtures.search import ExternalSearchFixture +from tests.fixtures.services import ServicesFixture +from tests.mocks.search import MockSearchSchemaRevisionLatest class TestInstanceInitializationScript: @@ -19,84 +25,88 @@ class TestInstanceInitializationScript: # more thoroughly as part of the migration tests, since migration tests are able # to test the script's interaction with the database. - def test_run_locks_database(self, db: DatabaseTransactionFixture): + def test_run_locks_database(self, services_fixture: ServicesFixture): # The script locks the database with a PostgreSQL advisory lock with patch( "palace.manager.scripts.initialization.pg_advisory_lock" ) as advisory_lock: mock_engine_factory = MagicMock() script = InstanceInitializationScript(engine_factory=mock_engine_factory) - script.initialize = MagicMock() + script.initialize_database = MagicMock() + script.initialize_search = MagicMock() script.run() - advisory_lock.assert_called_once_with( - mock_engine_factory().begin().__enter__(), - LOCK_ID_DB_INIT, - ) - advisory_lock().__enter__.assert_called_once() - advisory_lock().__exit__.assert_called_once() + advisory_lock.assert_called_once_with( + mock_engine_factory().begin().__enter__(), + LOCK_ID_DB_INIT, + ) + advisory_lock().__enter__.assert_called_once() + advisory_lock().__exit__.assert_called_once() + + # Run called initialize_database and initialize_search while the lock was held + script.initialize_database.assert_called_once() + script.initialize_search.assert_called_once() - def test_initialize(self, db: DatabaseTransactionFixture): + def test_initialize_database(self, services_fixture: ServicesFixture): # Test that the script inspects the database and initializes or migrates the database # as necessary. with patch("palace.manager.scripts.initialization.inspect") as inspect: script = InstanceInitializationScript() - script.migrate_database = MagicMock() # type: ignore[method-assign] - script.initialize_database = MagicMock() # type: ignore[method-assign] - script.initialize_search_indexes = MagicMock() # type: ignore[method-assign] + script.migrate_database = MagicMock() + script.initialize_database_schema = MagicMock() # If the database is uninitialized, initialize_database() is called. inspect().has_table.return_value = False - script.initialize(MagicMock()) - script.initialize_database.assert_called_once() + script.initialize_database(MagicMock()) + script.initialize_database_schema.assert_called_once() script.migrate_database.assert_not_called() # If the database is initialized, migrate_database() is called. - script.initialize_database.reset_mock() + script.initialize_database_schema.reset_mock() script.migrate_database.reset_mock() inspect().has_table.return_value = True - script.initialize(MagicMock()) - script.initialize_database.assert_not_called() + script.initialize_database(MagicMock()) + script.initialize_database_schema.assert_not_called() script.migrate_database.assert_called_once() - def test_initialize_alembic_exception(self, caplog: LogCaptureFixture): + def test_initialize_database_alembic_exception( + self, caplog: LogCaptureFixture, services_fixture: ServicesFixture + ): # Test that we handle a CommandError exception being returned by Alembic. with patch("palace.manager.scripts.initialization.inspect") as inspect: - with patch("palace.manager.scripts.initialization.container_instance"): - script = InstanceInitializationScript() + script = InstanceInitializationScript() caplog.set_level(logging.ERROR) script.migrate_database = MagicMock(side_effect=CommandError("test")) - script.initialize_database = MagicMock() - script.initialize_search_indexes = MagicMock() + script.initialize_database_schema = MagicMock() # If the database is initialized, migrate_database() is called. inspect().has_table.return_value = True - script.initialize(MagicMock()) - script.initialize_database.assert_not_called() + script.initialize_database(MagicMock()) + script.initialize_database_schema.assert_not_called() script.migrate_database.assert_called_once() assert "Error running database migrations" in caplog.text - def test_initialize_database(self, db: DatabaseTransactionFixture): + def test_initialize_database_schema(self, services_fixture: ServicesFixture): # Test that the script initializes the database. script = InstanceInitializationScript() mock_db = MagicMock() - with patch( - "palace.manager.scripts.initialization.SessionManager", - autospec=SessionManager, - ) as session_manager: - with patch( - "palace.manager.scripts.initialization.command" - ) as alemic_command: - script.initialize_database(mock_db) + with ( + patch( + "palace.manager.scripts.initialization.SessionManager", + autospec=SessionManager, + ) as session_manager, + patch("palace.manager.scripts.initialization.command") as alemic_command, + ): + script.initialize_database_schema(mock_db) session_manager.initialize_data.assert_called_once() session_manager.initialize_schema.assert_called_once() alemic_command.stamp.assert_called_once() - def test_migrate_database(self, db: DatabaseTransactionFixture): + def test_migrate_database(self, services_fixture: ServicesFixture): script = InstanceInitializationScript() mock_db = MagicMock() @@ -119,55 +129,206 @@ def test__get_alembic_config(self, db: DatabaseTransactionFixture): ) assert conf.config_file_name == str(test_ini.resolve()) - def test_initialize_search_indexes_mocked( + def test_initialize_search(self, external_search_fixture: ExternalSearchFixture): + service = external_search_fixture.service + index = external_search_fixture.index + revision = external_search_fixture.revision + base_name = service.base_revision_name + + script = InstanceInitializationScript() + script.create_search_index = MagicMock(wraps=script.create_search_index) + script.migrate_search = MagicMock(wraps=script.migrate_search) + + # Initially this should not exist, if the search service hasn't been initialized + assert service.read_pointer() is None + assert service.write_pointer() is None + + # We cannot do make search requests before we initialize the search service + with pytest.raises(OpenSearchException) as raised: + index.query_works("") + assert "index_not_found_exception" in str(raised.value) + + # Do the initial search index creation + script.initialize_search() + + # We should have created the search index, but not migrated it since we + # know it is a new freshly created index. + script.create_search_index.assert_called_once() + script.migrate_search.assert_not_called() + + # Then we have the latest version index + read_pointer = service.read_pointer() + assert read_pointer is not None + assert read_pointer.index == revision.name_for_index(base_name) + write_pointer = service.write_pointer() + assert write_pointer is not None + assert write_pointer.index == revision.name_for_index(base_name) + + # Now we try to initialize the search index again, and we should not create a new index + script.create_search_index.reset_mock() + script.initialize_search() + script.create_search_index.assert_not_called() + script.migrate_search.assert_not_called() + + # And because no new index was created, the read and write pointers should be the same + assert service.read_pointer() == read_pointer + assert service.write_pointer() == write_pointer + + # The same client should work without issue once the pointers are setup + assert index.query_works("").hits == [] + + def test_migrate_search( self, - external_search_fake_fixture: ExternalSearchFixtureFake, - caplog: LogCaptureFixture, + external_search_fixture: ExternalSearchFixture, + db: DatabaseTransactionFixture, + caplog: pytest.LogCaptureFixture, ): - caplog.set_level(logging.WARNING) + service = external_search_fixture.service + index = external_search_fixture.index + client = external_search_fixture.client + revision = external_search_fixture.revision + base_name = service.base_revision_name script = InstanceInitializationScript() - search_service = external_search_fake_fixture.external_search - search_service.start_migration = MagicMock() - search_service.search_service = MagicMock() + work1 = db.work(title="Test work 1", with_open_access_download=True) + work2 = db.work(title="Test work 2", with_open_access_download=True) - # To fake "no migration is available", mock all the values - search_service.start_migration.return_value = None - search_service.search_service().is_pointer_empty.return_value = True + # Do the initial search index creation + script.initialize_search() - # Migration should fail - assert script.initialize_search_indexes() is False + # We should have migrated the search index to the latest revision + read_pointer = service.read_pointer() + assert read_pointer is not None + assert read_pointer.version == revision.version + write_pointer = service.write_pointer() + assert write_pointer is not None + assert write_pointer.version == revision.version - # Logs were emitted - record = caplog.records.pop() - assert "WARNING" in record.levelname - assert "no migration was available" in record.message + new_revision = MockSearchSchemaRevisionLatest(1000001) + new_revision_directory = SearchRevisionDirectory( + {new_revision.version: new_revision} + ) + new_revision_index_name = new_revision.name_for_index(base_name) - search_service.search_service.reset_mock() - search_service.start_migration.reset_mock() + # The new_revision_index_name does not exist yet + with pytest.raises(OpenSearchException) as raised: + client.indices.get(new_revision_index_name) + assert "index_not_found" in str(raised.value) - # In case there is no need for a migration, read pointer exists as a non-empty pointer - search_service.search_service().is_pointer_empty.return_value = False + # The client can work without issue in this state + assert index.query_works("").hits == [] - # Initialization should pass, as a no-op - assert script.initialize_search_indexes() is True - assert search_service.start_migration.call_count == 0 + # We can add documents to the index + index.add_document(work1.to_search_document()) + index.search_service().refresh() + [indexed_work_1] = index.query_works("").hits + assert indexed_work_1.work_id == work1.id - def test_initialize_search_indexes( - self, end_to_end_search_fixture: EndToEndSearchFixture + # Now we migrate to the new revision + script.migrate_search = MagicMock(wraps=script.migrate_search) + script._container.search.revision_directory.override(new_revision_directory) + with patch( + "palace.manager.scripts.initialization.get_migrate_search_chain" + ) as get_migrate_search_chain: + script.initialize_search() + + # We should have created the new search index, and started the migration + script.migrate_search.assert_called_once() + + # The new index should exist + assert client.indices.get(new_revision_index_name) + + # The write pointer should point to the new revision + write_pointer = service.write_pointer() + assert write_pointer is not None + assert write_pointer.index == new_revision_index_name + assert write_pointer.version == new_revision.version + + # The read pointer should still point to the old revision + read_pointer = service.read_pointer() + assert read_pointer is not None + assert read_pointer.version == revision.version + + # We can add more documents to the index + index.add_document(work2.to_search_document()) + index.search_service().refresh() + + # But the documents are not searchable yet, since they are added to the new index + # and the read pointer is still pointing to the old index. So we find work1 but not work2. + [indexed_work_1] = index.query_works("").hits + assert indexed_work_1.work_id == work1.id + + # The migration should have been queued + get_migrate_search_chain.assert_called_once() + get_migrate_search_chain.return_value.apply_async.assert_called_once() + + # If the initialization is run again, the migration should not be run again, but we do log a message + # about the read pointer being out of date + script.migrate_search.reset_mock() + caplog.clear() + caplog.set_level(LogLevel.info) + script.initialize_search() + script.migrate_search.assert_not_called() + assert "Search read pointer is out-of-date" in caplog.text + + # We simulate the migration task completing, by setting the read pointer to the new index + service.read_pointer_set(new_revision) + + # Now work2 is searchable, but work1 is not, since the migration was mocked out and did not actually run + [indexed_work_2] = index.query_works("").hits + assert indexed_work_2.work_id == work2.id + + def test_migrate_downgrade( + self, + external_search_fixture: ExternalSearchFixture, + db: DatabaseTransactionFixture, + caplog: pytest.LogCaptureFixture, ): - search = end_to_end_search_fixture.external_search_index - base_name = end_to_end_search_fixture.external_search.service.base_revision_name - script = InstanceInitializationScript() + service = external_search_fixture.service - # Initially this should not exist, if InstanceInit has not been run - assert search.search_service().read_pointer() is None + script = InstanceInitializationScript() - # Initialization should work now - assert script.initialize_search_indexes() is True - # Then we have the latest version index + old_revision = MockSearchSchemaRevisionLatest(1000000) + new_revision = MockSearchSchemaRevisionLatest(1000001) + revision_directory = SearchRevisionDirectory( + { + new_revision.version: new_revision, + old_revision.version: old_revision, + } + ) + script._container.search.revision_directory.override(revision_directory) + + # Do the initial search index creation + script.initialize_search() + + # We should have migrated the search index to the latest revision + read_pointer = service.read_pointer() + assert read_pointer is not None + assert read_pointer.version == new_revision.version + write_pointer = service.write_pointer() + assert write_pointer is not None + assert write_pointer.version == new_revision.version + + # Now we run the migration script again, but this time the most recent revision is the old revision + revision_directory = SearchRevisionDirectory( + { + old_revision.version: old_revision, + } + ) + script._container.search.revision_directory.override(revision_directory) + script.initialize_search() + + # We should not have touched the read and write pointers, since they are more recent than the latest revision + read_pointer = service.read_pointer() + assert read_pointer is not None + assert read_pointer.version == new_revision.version + write_pointer = service.write_pointer() + assert write_pointer is not None + assert write_pointer.version == new_revision.version + + # And we should have logged a message about the situation assert ( - search.search_service().read_pointer() - == search._revision.name_for_index(base_name) + "You may be running an old version of the application against a new search index" + in caplog.text ) diff --git a/tests/manager/scripts/test_lane.py b/tests/manager/scripts/test_lane.py index 3c469106c8..d90849464c 100644 --- a/tests/manager/scripts/test_lane.py +++ b/tests/manager/scripts/test_lane.py @@ -53,9 +53,7 @@ def process_lane(self, lane): class TestUpdateLaneSizeScript: def test_do_run(self, db, end_to_end_search_fixture: EndToEndSearchFixture): - migration = end_to_end_search_fixture.external_search_index.start_migration() - assert migration is not None - migration.finish() + end_to_end_search_fixture.populate_search_index() lane = db.lane() lane.size = 100 @@ -85,9 +83,7 @@ def test_site_configuration_has_changed( db: DatabaseTransactionFixture, end_to_end_search_fixture: EndToEndSearchFixture, ): - migration = end_to_end_search_fixture.external_search_index.start_migration() - assert migration is not None - migration.finish() + end_to_end_search_fixture.populate_search_index() library = db.default_library() lane1 = db.lane() diff --git a/tests/manager/scripts/test_search.py b/tests/manager/scripts/test_search.py index 2e81daba03..0fdd5c07ee 100644 --- a/tests/manager/scripts/test_search.py +++ b/tests/manager/scripts/test_search.py @@ -1,88 +1,56 @@ from __future__ import annotations -import random +from unittest.mock import MagicMock, patch -from palace.manager.core.metadata_layer import TimestampData -from palace.manager.scripts.search import ( - RebuildSearchIndexScript, - SearchIndexCoverageRemover, -) -from palace.manager.sqlalchemy.model.coverage import WorkCoverageRecord +from palace.manager.scripts.search import RebuildSearchIndexScript from tests.fixtures.database import DatabaseTransactionFixture -from tests.fixtures.search import ExternalSearchFixtureFake +from tests.fixtures.services import ServicesFixture class TestRebuildSearchIndexScript: - def test_do_run( + @patch("palace.manager.scripts.search.search_reindex") + def test_do_run_no_args( self, + mock_search_reindex: MagicMock, db: DatabaseTransactionFixture, - external_search_fake_fixture: ExternalSearchFixtureFake, + services_fixture: ServicesFixture, ): - index = external_search_fake_fixture.external_search - work = db.work(with_license_pool=True) - work2 = db.work(with_license_pool=True) - wcr = WorkCoverageRecord - decoys = [wcr.QUALITY_OPERATION, wcr.SUMMARY_OPERATION] - - # Set up some coverage records. - for operation in decoys + [wcr.UPDATE_SEARCH_INDEX_OPERATION]: - for w in (work, work2): - wcr.add_for(w, operation, status=random.choice(wcr.ALL_STATUSES)) - - coverage_qu = db.session.query(wcr).filter( - wcr.operation == wcr.UPDATE_SEARCH_INDEX_OPERATION - ) - original_coverage = [x.id for x in coverage_qu] - - # Run the script. - script = RebuildSearchIndexScript(db.session, search_index_client=index) - [progress] = script.do_run() - - # The mock methods were called with the values we expect. - assert {work.id, work2.id} == set( - map( - lambda d: d["_id"], external_search_fake_fixture.service.documents_all() - ) - ) - - # The script returned a list containing a single - # CoverageProviderProgress object containing accurate - # information about what happened (from the CoverageProvider's - # point of view). - assert ( - "Items processed: 2. Successes: 2, transient failures: 0, persistent failures: 0" - == progress.achievements - ) - - # The old WorkCoverageRecords for the works were deleted. Then - # the CoverageProvider did its job and new ones were added. - new_coverage = [x.id for x in coverage_qu] - assert 2 == len(new_coverage) - assert set(new_coverage) != set(original_coverage) - - -class TestSearchIndexCoverageRemover: - SERVICE_NAME = "Search Index Coverage Remover" - - def test_do_run(self, db: DatabaseTransactionFixture): - work = db.work() - work2 = db.work() - wcr = WorkCoverageRecord - decoys = [wcr.QUALITY_OPERATION, wcr.SUMMARY_OPERATION] - - # Set up some coverage records. - for operation in decoys + [wcr.UPDATE_SEARCH_INDEX_OPERATION]: - for w in (work, work2): - wcr.add_for(w, operation, status=random.choice(wcr.ALL_STATUSES)) - - # Run the script. - script = SearchIndexCoverageRemover(db.session) - result = script.do_run() - assert isinstance(result, TimestampData) - assert "Coverage records deleted: 2" == result.achievements + # If we are called with no arguments, we default to asynchronously rebuilding the search index. + RebuildSearchIndexScript(db.session).do_run() + mock_search_reindex.s.return_value.delay.assert_called_once_with() + # But we don't delete the index before rebuilding. + services_fixture.search_fixture.index_mock.clear_search_documents.assert_not_called() + + @patch("palace.manager.scripts.search.search_reindex") + def test_do_run_blocking( + self, mock_search_reindex: MagicMock, db: DatabaseTransactionFixture + ): + # If we are called with the --blocking argument, we rebuild the search index synchronously. + RebuildSearchIndexScript(db.session, cmd_args=["--blocking"]).do_run() + mock_search_reindex.s.return_value.assert_called_once_with() - # UPDATE_SEARCH_INDEX_OPERATION records have been removed. - # No other records are affected. - for w in (work, work2): - remaining = [x.operation for x in w.coverage_records] - assert sorted(remaining) == sorted(decoys) + @patch("palace.manager.scripts.search.search_reindex") + def test_do_run_delete( + self, + mock_search_reindex: MagicMock, + db: DatabaseTransactionFixture, + services_fixture: ServicesFixture, + ): + # If we are called with the --delete argument, we clear the index before rebuilding. + RebuildSearchIndexScript(db.session, cmd_args=["--delete"]).do_run() + services_fixture.search_fixture.index_mock.clear_search_documents.assert_called_once_with() + mock_search_reindex.s.return_value.delay.assert_called_once_with() + + @patch("palace.manager.scripts.search.get_migrate_search_chain") + def test_do_run_migration( + self, mock_get_migrate_search_chain: MagicMock, db: DatabaseTransactionFixture + ): + # If we are called with the --migration argument, we treat the reindex as completing a migration. + RebuildSearchIndexScript(db.session, cmd_args=["--migration"]).do_run() + mock_get_migrate_search_chain.return_value.delay.assert_called_once_with() + + # We can also combine --blocking and --migration. + RebuildSearchIndexScript( + db.session, cmd_args=["--migration", "--blocking"] + ).do_run() + mock_get_migrate_search_chain.return_value.assert_called_once_with() diff --git a/tests/manager/search/test_external_search.py b/tests/manager/search/test_external_search.py index 30002794b5..81bda21fe6 100644 --- a/tests/manager/search/test_external_search.py +++ b/tests/manager/search/test_external_search.py @@ -1,11 +1,9 @@ import json import re -import time import uuid from collections.abc import Callable from datetime import datetime from typing import Any -from unittest.mock import MagicMock import pytest from opensearch_dsl import Q @@ -23,15 +21,10 @@ from opensearch_dsl.query import Query as opensearch_dsl_query from opensearch_dsl.query import Range, Term, Terms from psycopg2.extras import NumericRange -from sqlalchemy.sql import Delete as sqlaDelete from palace.manager.core.classifier import Classifier -from palace.manager.core.coverage import CoverageFailure from palace.manager.core.metadata_layer import ContributorData, IdentifierData from palace.manager.core.problem_details import INVALID_INPUT -from palace.manager.scripts.coverage_provider import RunWorkCoverageProviderScript -from palace.manager.search.coverage_provider import SearchIndexCoverageProvider -from palace.manager.search.document import SearchMappingDocument from palace.manager.search.external_search import ( ExternalSearchIndex, Filter, @@ -43,13 +36,11 @@ SortKeyPagination, WorkSearchResult, ) -from palace.manager.search.revision import SearchSchemaRevision from palace.manager.search.revision_directory import SearchRevisionDirectory from palace.manager.search.v5 import SearchV5 -from palace.manager.sqlalchemy.model.classification import Genre, Subject +from palace.manager.sqlalchemy.model.classification import Genre from palace.manager.sqlalchemy.model.collection import Collection from palace.manager.sqlalchemy.model.contributor import Contribution, Contributor -from palace.manager.sqlalchemy.model.coverage import WorkCoverageRecord from palace.manager.sqlalchemy.model.customlist import CustomList from palace.manager.sqlalchemy.model.datasource import DataSource from palace.manager.sqlalchemy.model.edition import Edition @@ -62,7 +53,7 @@ ) from palace.manager.sqlalchemy.model.licensing import LicensePool from palace.manager.sqlalchemy.model.work import Work -from palace.manager.sqlalchemy.util import get_one_or_create, numericrange_to_tuple +from palace.manager.sqlalchemy.util import get_one_or_create from palace.manager.util.cache import CachedData from palace.manager.util.datetime_helpers import datetime_utc, from_timestamp from tests.fixtures.database import DatabaseTransactionFixture @@ -131,6 +122,89 @@ def query_works_multi(self, queries, debug=False): assert pagination.offset == default.offset assert pagination.size == default.size + def test_remove_work( + self, + end_to_end_search_fixture: EndToEndSearchFixture, + db: DatabaseTransactionFixture, + caplog: pytest.LogCaptureFixture, + ): + duck_life = db.work( + title="Moby's life as a Duck", with_open_access_download=True + ) + moby_dick = db.work(title="Moby Dick", with_open_access_download=True) + client = end_to_end_search_fixture.external_search.client + index = end_to_end_search_fixture.external_search_index + end_to_end_search_fixture.populate_search_index() + + end_to_end_search_fixture.expect_results( + [duck_life, moby_dick], "Moby", ordered=False + ) + + index.remove_work(moby_dick) + index.remove_work(duck_life.id) + + # Refresh search index so we can query the changes + client.indices.refresh() + end_to_end_search_fixture.expect_results([], "Moby") + + # If we try to remove a work with no id, we log a warning + work_no_id = Work() + index.remove_work(work_no_id) + assert "Work has no ID" in caplog.text + + def test_add_document( + self, + end_to_end_search_fixture: EndToEndSearchFixture, + db: DatabaseTransactionFixture, + ): + client = end_to_end_search_fixture.external_search.client + index = end_to_end_search_fixture.external_search_index + + butterfly = db.work( + title="Nietzsche's Butterfly", with_open_access_download=True + ) + client.indices.refresh() + end_to_end_search_fixture.expect_results([], "Butterfly") + index.add_document(butterfly.to_search_document()) + client.indices.refresh() + end_to_end_search_fixture.expect_results([butterfly], "Butterfly") + + def test_add_documents( + self, + end_to_end_search_fixture: EndToEndSearchFixture, + db: DatabaseTransactionFixture, + ): + client = end_to_end_search_fixture.external_search.client + index = end_to_end_search_fixture.external_search_index + + butterfly = db.work( + title="Nietzsche's Butterfly", with_open_access_download=True + ) + chaos = db.work(title="Chaos", with_open_access_download=True) + client.indices.refresh() + end_to_end_search_fixture.expect_results([], "") + index.add_documents([w.to_search_document() for w in [butterfly, chaos]]) + client.indices.refresh() + end_to_end_search_fixture.expect_results([butterfly, chaos], "") + + def test_clear_search_documents( + self, + end_to_end_search_fixture: EndToEndSearchFixture, + db: DatabaseTransactionFixture, + ): + client = end_to_end_search_fixture.external_search.client + index = end_to_end_search_fixture.external_search_index + + work = db.work(with_open_access_download=True) + end_to_end_search_fixture.populate_search_index() + client.indices.refresh() + + end_to_end_search_fixture.expect_results([work], "") + + index.clear_search_documents() + client.indices.refresh() + end_to_end_search_fixture.expect_results([], "") + class TestSearchV5: def test_character_filters(self): @@ -423,9 +497,6 @@ def test_query_works( transaction = fixture.external_search.db session = transaction.session - migration = fixture.external_search_index.start_migration() - assert migration is not None - migration.finish() data = self._populate_works(fixture) fixture.populate_search_index() @@ -1008,18 +1079,6 @@ def pages(worklist): # Case-insensitive genre search, genre is saved as 'Fantasy' expect([data.lincoln_vampire], "fantasy") - def test_remove_work(self, end_to_end_search_fixture: EndToEndSearchFixture): - client = end_to_end_search_fixture.external_search.client - data = self._populate_works(end_to_end_search_fixture) - end_to_end_search_fixture.populate_search_index() - end_to_end_search_fixture.external_search_index.remove_work(data.moby_dick) - end_to_end_search_fixture.external_search_index.remove_work(data.moby_duck) - - # Immediately querying never works, the search index needs to refresh its cache/index/data - client.indices.refresh() - - end_to_end_search_fixture.expect_results([], "Moby") - class TestFacetFiltersData: becoming: Work @@ -1065,14 +1124,6 @@ def test_facet_filtering(self, end_to_end_search_fixture: EndToEndSearchFixture) data = self._populate_works(fixture) fixture.populate_search_index() - # Add all the works created in the setup to the search index. - SearchIndexCoverageProvider( - session, search_index_client=fixture.external_search_index - ).run_once_and_update_timestamp() - - # Sleep to give the index time to catch up. - time.sleep(1) - def expect(availability, collection, works): facets = Facets( transaction.default_library(), @@ -4637,11 +4688,9 @@ def test_works_not_presentation_ready_kept_in_index( w3 = db.work() index = external_search_fake_fixture.external_search - docs = index.start_updating_search_documents() - failures = docs.add_documents( - index.create_search_documents_from_works([w1, w2, w3]) + failures = index.add_documents( + Work.to_search_documents(db.session, [w1.id, w2.id, w3.id]) ) - docs.finish() # All three works are regarded as successes, because their # state was successfully mirrored to the index. @@ -4659,11 +4708,9 @@ def test_works_not_presentation_ready_kept_in_index( # If a work stops being presentation-ready, it is kept in the # index. w2.presentation_ready = False - docs = index.start_updating_search_documents() - failures = docs.add_documents( - index.create_search_documents_from_works([w1, w2, w3]) + failures = index.add_documents( + Work.to_search_documents(db.session, [w1.id, w2.id, w3.id]) ) - docs.finish() assert {w1.id, w2.id, w3.id} == set( map( lambda d: d["_id"], external_search_fake_fixture.service.documents_all() @@ -4687,9 +4734,8 @@ def test_search_connection_timeout( work = transaction.work() work.set_presentation_ready() - docs = search.external_search.start_updating_search_documents() - failures = docs.add_documents( - search.external_search.create_search_documents_from_works([work]) + failures = search.external_search.add_documents( + Work.to_search_documents(transaction.session, [work.id]) ) assert 1 == len(failures) assert work.id == failures[0].id @@ -4714,9 +4760,8 @@ def test_search_single_document_error( work = transaction.work() work.set_presentation_ready() - docs = search.external_search.start_updating_search_documents() - failures = docs.add_documents( - search.external_search.create_search_documents_from_works([work]) + failures = search.external_search.add_documents( + Work.to_search_documents(transaction.session, [work.id]) ) assert 1 == len(failures) assert work.id == failures[0].id @@ -4748,288 +4793,6 @@ def test_constructor(self, db: DatabaseTransactionFixture): assert work.sort_title == result.sort_title -class TestSearchIndexCoverageProvider: - def test_operation( - self, - db: DatabaseTransactionFixture, - external_search_fake_fixture: ExternalSearchFixtureFake, - ): - index = external_search_fake_fixture.external_search - provider = SearchIndexCoverageProvider(db.session, search_index_client=index) - assert WorkCoverageRecord.UPDATE_SEARCH_INDEX_OPERATION == provider.operation - - def test_to_search_document(self, db: DatabaseTransactionFixture): - """Test the output of the to_search_document method.""" - customlist, editions = db.customlist() - library = db.library() - - works = [ - db.work( - authors=[db.contributor()], - with_license_pool=True, - genre="history", - ), - editions[0].work, - ] - - work1: Work = works[0] - work2: Work = works[1] - work2.suppressed_for.append(library) - - work1.target_age = NumericRange(lower=18, upper=22, bounds="()") - work2.target_age = NumericRange(lower=18, upper=99, bounds="[]") - - genre1, is_new = Genre.lookup(db.session, "Psychology") - genre2, is_new = Genre.lookup(db.session, "Cooking") - subject1 = db.subject(type=Subject.SIMPLIFIED_GENRE, identifier="subject1") - subject1.genre = genre1 - subject2 = db.subject(type=Subject.SIMPLIFIED_GENRE, identifier="subject2") - subject2.genre = genre2 - - db.session.flush() - - search_docs = Work.to_search_documents(works) - - search_doc_work1 = list( - filter(lambda x: x["work_id"] == work1.id, search_docs) - )[0] - search_doc_work2 = list( - filter(lambda x: x["work_id"] == work2.id, search_docs) - )[0] - - def compare(doc: dict[str, Any], work: Work) -> None: - assert doc["_id"] == work.id - assert doc["work_id"] == work.id - assert doc["title"] == work.title - assert doc["sort_title"] == work.sort_title - assert doc["subtitle"] == work.subtitle - assert doc["series"] == work.series - assert doc["series_position"] == work.series_position - assert doc["language"] == work.language - assert doc["author"] == work.author - assert doc["sort_author"] == work.sort_author - assert doc["medium"] == work.presentation_edition.medium - assert doc["publisher"] == work.publisher - assert doc["imprint"] == work.imprint - assert ( - doc["permanent_work_id"] == work.presentation_edition.permanent_work_id - ) - assert doc["presentation_ready"] == work.presentation_ready - assert doc["last_update_time"] == work.last_update_time - assert doc["fiction"] == "Fiction" if work.fiction else "Nonfiction" - assert doc["audience"] == work.audience - assert doc["summary"] == work.summary_text - assert doc["quality"] == work.quality - assert doc["rating"] == work.rating - assert doc["popularity"] == work.popularity - - if work.suppressed_for: - assert doc["suppressed_for"] == [l.id for l in work.suppressed_for] - else: - assert doc["suppressed_for"] is None - - if work.license_pools: - assert len(doc["licensepools"]) == len(work.license_pools) - for idx, pool in enumerate(work.license_pools): - actual = doc["licensepools"][idx] - assert actual["licensepool_id"] == pool.id - assert actual["data_source_id"] == pool.data_source_id - assert actual["collection_id"] == pool.collection_id - assert actual["open_access"] == pool.open_access - assert actual["suppressed"] == pool.suppressed - else: - assert doc["licensepools"] is None - - if work.custom_list_entries: - assert len(doc["customlists"]) == len(work.custom_list_entries) - for idx, custom_list in enumerate(work.custom_list_entries): - actual = doc["customlists"][idx] - assert actual["list_id"] == custom_list.list_id - assert actual["featured"] == custom_list.featured - else: - assert doc["customlists"] is None - - if work.presentation_edition.contributions: - assert len(doc["contributors"]) is len( - work.presentation_edition.contributions - ) - for idx, contribution in enumerate( - work.presentation_edition.contributions - ): - actual = doc["contributors"][idx] - assert actual["sort_name"] == contribution.contributor.sort_name - assert ( - actual["display_name"] == contribution.contributor.display_name - ) - assert actual["lc"] == contribution.contributor.lc - assert actual["viaf"] == contribution.contributor.viaf - assert actual["role"] == contribution.role - else: - assert doc["contributors"] is None - - if work.identifiers: # type: ignore[attr-defined] - assert len(doc["identifiers"]) == len(work.identifiers) # type: ignore[attr-defined] - for idx, identifier in enumerate(work.identifiers): # type: ignore[attr-defined] - actual = doc["identifiers"][idx] - assert actual["type"] == identifier.type - assert actual["identifier"] == identifier.identifier - else: - assert doc["identifiers"] is None - - if work.classifications: # type: ignore[attr-defined] - assert len(doc["classifications"]) == len(work.classifications) # type: ignore[attr-defined] - else: - assert doc["classifications"] is None - - if work.work_genres: - assert len(doc["genres"]) == len(work.work_genres) - for idx, genre in enumerate(work.work_genres): - actual = doc["genres"][idx] - assert actual["name"] == genre.genre.name - assert actual["term"] == genre.genre.id - assert actual["weight"] == genre.affinity - else: - assert doc["genres"] is None - - lower, upper = numericrange_to_tuple(work.target_age) - assert doc["target_age"]["lower"] == lower - assert doc["target_age"]["upper"] == upper - - compare(search_doc_work1, work1) - compare(search_doc_work2, work2) - - def test_to_search_documents_with_missing_data( - self, db: DatabaseTransactionFixture - ): - # Missing edition relationship - work: Work = db.work(with_license_pool=True) - work.presentation_edition_id = None - [result] = Work.to_search_documents([work]) - assert result["identifiers"] is None - - # Missing just some attributes - work = db.work(with_license_pool=True) - work.presentation_edition.title = None - work.target_age = None - [result] = Work.to_search_documents([work]) - assert result["title"] is None - target_age = result["target_age"] - assert isinstance(target_age, dict) - assert target_age["lower"] is None - - def test_success( - self, - db: DatabaseTransactionFixture, - external_search_fake_fixture: ExternalSearchFixtureFake, - ): - work = db.work() - work.set_presentation_ready() - index = external_search_fake_fixture.external_search - provider = SearchIndexCoverageProvider(db.session, search_index_client=index) - results = provider.process_batch([work]) - - # We got one success and no failures. - assert [work] == results - - # The work was added to the search index. - search_service = external_search_fake_fixture.service - assert 1 == len(search_service.documents_all()) - - def test_failure( - self, - db: DatabaseTransactionFixture, - external_search_fake_fixture: ExternalSearchFixtureFake, - ): - work = db.work() - work.set_presentation_ready() - index = external_search_fake_fixture.external_search - external_search_fake_fixture.service.set_failing_mode( - SearchServiceFailureMode.FAIL_INDEXING_DOCUMENTS - ) - - provider = SearchIndexCoverageProvider(db.session, search_index_client=index) - results = provider.process_batch([work]) - - # We have one transient failure. - [record] = results - assert isinstance(record, CoverageFailure) - assert work == record.obj - assert record.transient is True - assert "There was an error!" in record.exception - - def test_migration_available( - self, external_search_fake_fixture: ExternalSearchFixtureFake - ): - search = external_search_fake_fixture.external_search - directory = search._revision_directory - - # Create a new highest version - available = dict(directory._available) - available[10000] = SearchV10000() - directory._available = available - search._revision = directory._available[10000] - search._search_service.index_is_populated = lambda revision: False - - mock_db = MagicMock() - provider = SearchIndexCoverageProvider(mock_db, search_index_client=search) - - assert provider.migration is not None - assert provider.receiver is None - # Execute is called once with a Delete statement - assert mock_db.execute.call_count == 1 - assert len(mock_db.execute.call_args[0]) == 1 - assert mock_db.execute.call_args[0][0].__class__ == sqlaDelete - - def test_migration_not_available( - self, end_to_end_search_fixture: EndToEndSearchFixture - ): - search = end_to_end_search_fixture.external_search_index - db = end_to_end_search_fixture.db - - migration = search.start_migration() - assert migration is not None - migration.finish() - - provider = SearchIndexCoverageProvider(db.session, search_index_client=search) - assert provider.migration is None - assert provider.receiver is not None - - def test_complete_run_from_script( - self, end_to_end_search_fixture: EndToEndSearchFixture - ): - search = end_to_end_search_fixture.external_search_index - db = end_to_end_search_fixture.db - work = db.work(title="A Test Work", with_license_pool=True) - work.set_presentation_ready(search_index_client=search) - - class _SearchIndexCoverageProvider(SearchIndexCoverageProvider): - _did_call_on_completely_finished = False - - def on_completely_finished(self): - self._did_call_on_completely_finished = True - super().on_completely_finished() - - # Run as the search_index_refresh script would - provider = RunWorkCoverageProviderScript( - _SearchIndexCoverageProvider, _db=db.session, search_index_client=search - ) - provider.run() - - # The run ran till the end - assert provider.providers[0]._did_call_on_completely_finished == True - # The single available work was indexed - results = search.query_works(None) - assert len(results) == 1 - assert results[0]["work_id"] == work.id - - -class SearchV10000(SearchSchemaRevision): - SEARCH_VERSION = 10000 - - def mapping_document(self) -> SearchMappingDocument: - return MagicMock() - - class TestJSONQuery: @staticmethod def _leaf(key, value, op="eq"): diff --git a/tests/manager/search/test_migration_states.py b/tests/manager/search/test_migration_states.py deleted file mode 100644 index 0d77baa1db..0000000000 --- a/tests/manager/search/test_migration_states.py +++ /dev/null @@ -1,112 +0,0 @@ -"""Explicitly test the different states of migration, and ensure we are adhering to the principles set out. -These tests do have some overlap with the unit tests for the search migration, but these are specific to the migration use cases. -Initial Case -- No pointers or indices are available -- The System comes online for the first time and some prep work must be done -- The initial versioned indices and pointers should be prepped by the init_instance script -- The ExternalSearchIndex should not be hindered by this -Migration Case -- Pointers exist, indices exist -- The migration contains a new version for the index -- The search_index_refresh script, when run, should create and populate the indices, and move the red/write pointers -- The ExternalSearchIndex should not be hindered by this, and should continue to work with the pointers, regardless of where they point -""" - -import pytest - -from palace.manager.scripts.coverage_provider import RunWorkCoverageProviderScript -from palace.manager.scripts.initialization import InstanceInitializationScript -from palace.manager.search.coverage_provider import SearchIndexCoverageProvider -from palace.manager.search.document import SearchMappingDocument -from palace.manager.search.external_search import ExternalSearchIndex -from palace.manager.search.revision import SearchSchemaRevision -from palace.manager.search.revision_directory import SearchRevisionDirectory -from tests.fixtures.search import ExternalSearchFixture - - -class TestMigrationStates: - def test_initial_migration_case( - self, external_search_fixture: ExternalSearchFixture - ): - fx = external_search_fixture - index = fx.index - service = fx.service - - # We cannot make any requests before we initialize - with pytest.raises(Exception) as raised: - index.query_works("") - assert "index_not_found" in str(raised.value) - - # When a new sytem comes up the first code to run is the InstanceInitialization script - # This preps the DB and the search indices/pointers - InstanceInitializationScript().initialize_search_indexes() - - # Ensure we have created the index and pointers - new_index_name = index._revision.name_for_index(fx.index_prefix) - empty_index_name = service._empty(fx.index_prefix) - all_indices = fx.client.indices.get("*") - - assert fx.index_prefix in new_index_name - assert new_index_name in all_indices.keys() - assert empty_index_name in all_indices.keys() - assert fx.client.indices.exists_alias( - index._search_read_pointer, index=new_index_name - ) - assert fx.client.indices.exists_alias( - index._search_write_pointer, index=new_index_name - ) - - # The same client should work without issue once the pointers are setup - assert index.query_works("").hits == [] - - def test_migration_case(self, external_search_fixture: ExternalSearchFixture): - fx = external_search_fixture - db = fx.db - - # The initial indices setup - InstanceInitializationScript().initialize_search_indexes() - - MOCK_VERSION = 1000001 - - class MockSchema(SearchSchemaRevision): - def __init__(self, v: int): - self.SEARCH_VERSION = v - super().__init__() - - def mapping_document(self) -> SearchMappingDocument: - return SearchMappingDocument() - - client = ExternalSearchIndex( - fx.search_container.service(), - revision_directory=SearchRevisionDirectory( - {MOCK_VERSION: MockSchema(MOCK_VERSION)} - ), - ) - - # The search client works just fine - assert client.query_works("") is not None - receiver = client.start_updating_search_documents() - receiver.add_documents([{"work_id": 123}]) - receiver.finish() - - mock_index_name = client._revision.name_for_index(fx.service.base_revision_name) - assert str(MOCK_VERSION) in mock_index_name - - # The mock index does not exist yet - with pytest.raises(Exception) as raised: - fx.client.indices.get(mock_index_name) - assert "index_not_found" in str(raised.value) - - # This should run the migration - RunWorkCoverageProviderScript( - SearchIndexCoverageProvider, db.session, search_index_client=client - ).run() - - # The new version is created, and the aliases point to the right index - assert fx.client.indices.get(mock_index_name) is not None - assert mock_index_name in fx.client.indices.get_alias( - name=client._search_read_pointer - ) - assert mock_index_name in fx.client.indices.get_alias( - name=client._search_write_pointer - ) diff --git a/tests/manager/search/test_migrator.py b/tests/manager/search/test_migrator.py deleted file mode 100644 index 0ef67c3aa7..0000000000 --- a/tests/manager/search/test_migrator.py +++ /dev/null @@ -1,250 +0,0 @@ -from unittest.mock import MagicMock, Mock, call - -import pytest - -from palace.manager.search.document import SearchMappingDocument -from palace.manager.search.migrator import SearchMigrationException, SearchMigrator -from palace.manager.search.revision import SearchSchemaRevision -from palace.manager.search.revision_directory import SearchRevisionDirectory -from palace.manager.search.service import SearchWritePointer - - -class EmptyRevision(SearchSchemaRevision): - SEARCH_VERSION = 0 - - def __init__(self, version: int): - self.SEARCH_VERSION = version - super().__init__() - - def mapping_document(self) -> SearchMappingDocument: - return SearchMappingDocument() - - -class TestMigrator: - def test_migrate_no_revisions(self): - """If a revision isn't available, the migration fails fast.""" - service = Mock() - revisions = SearchRevisionDirectory.empty() - migrator = SearchMigrator(revisions, service) - with pytest.raises(SearchMigrationException): - migrator.migrate(base_name="any", version=23) - - def test_migrate_from_empty(self): - """With an empty search state, migrating to a supported version works.""" - service = Mock() - service.read_pointer = MagicMock(return_value=None) - service.write_pointer = MagicMock(return_value=None) - service.index_is_populated = MagicMock(return_value=False) - service.index_set_populated = MagicMock() - - revision = EmptyRevision(3) - revisions = SearchRevisionDirectory({revision.version: revision}) - migrator = SearchMigrator(revisions, service) - - migration = migrator.migrate(base_name="works", version=revision.version) - migration.finish() - - # The sequence of expected calls. - service.create_empty_index.assert_called_with() - service.read_pointer.assert_called_with() - # The read pointer didn't exist, so it's set to the empty index - service.read_pointer_set_empty.assert_called_with() - service.write_pointer.assert_called_with() - # The new index is created and populated. - service.index_create.assert_called_with(revision) - service.populate_index.assert_not_called() - # Both the read and write pointers are set. - service.write_pointer_set.assert_called_with(revision) - service.read_pointer_set.assert_called_with(revision) - service.index_set_populated.assert_called_with(revision) - - def test_migrate_upgrade(self): - """Index 2 exists, and we can migrate to 3.""" - service = Mock() - service.read_pointer = MagicMock(return_value="works-v2") - service.write_pointer = MagicMock(return_value=None) - service.index_is_populated = MagicMock(return_value=False) - service.index_set_mapping = MagicMock() - service.index_submit_documents = MagicMock() - service.index_set_populated = MagicMock() - - revision = EmptyRevision(3) - revisions = SearchRevisionDirectory({revision.version: revision}) - migrator = SearchMigrator(revisions, service) - - docs = migrator.migrate(base_name="works", version=revision.version) - docs.add_documents([{"_id": "1"}, {"_id": "2"}, {"_id": "3"}]) - docs.add_documents([{"_id": "4"}, {"_id": "5"}, {"_id": "6"}]) - docs.add_documents([{"_id": "7"}, {"_id": "8"}]) - docs.finish() - - # The sequence of expected calls. - service.create_empty_index.assert_called_with() - # The read pointer existed, so it's left alone for now. - service.read_pointer.assert_called_with() - service.write_pointer.assert_called_with() - # The index for version 3 is created and populated. - service.index_create.assert_called_with(revision) - service.index_set_mapping.assert_called_with(revision) - service.index_submit_documents.assert_has_calls( - [ - call( - pointer="works-v3", - documents=[{"_id": "1"}, {"_id": "2"}, {"_id": "3"}], - ), - call( - pointer="works-v3", - documents=[{"_id": "4"}, {"_id": "5"}, {"_id": "6"}], - ), - call( - pointer="works-v3", - documents=[{"_id": "7"}, {"_id": "8"}], - ), - ] - ) - # Both the read and write pointers are set. - service.write_pointer_set.assert_called_with(revision) - service.read_pointer_set.assert_called_with(revision) - service.index_set_populated.assert_called_with(revision) - - def test_migrate_upgrade_cancel(self): - """Cancelling a migration leaves the pointers untouched.""" - service = Mock() - service.read_pointer = MagicMock(return_value="works-v2") - service.write_pointer = MagicMock(return_value=None) - service.index_is_populated = MagicMock(return_value=False) - service.index_set_mapping = MagicMock() - service.index_submit_documents = MagicMock() - service.index_set_populated = MagicMock() - - revision = EmptyRevision(3) - revisions = SearchRevisionDirectory({revision.version: revision}) - migrator = SearchMigrator(revisions, service) - - docs = migrator.migrate(base_name="works", version=revision.version) - docs.add_documents([{"_id": "1"}, {"_id": "2"}, {"_id": "3"}]) - docs.add_documents([{"_id": "4"}, {"_id": "5"}, {"_id": "6"}]) - docs.add_documents([{"_id": "7"}, {"_id": "8"}]) - docs.cancel() - - # The sequence of expected calls. - service.create_empty_index.assert_called_with() - # The read pointer existed, so it's left alone for now. - service.read_pointer.assert_called_with() - service.write_pointer.assert_called_with() - # The index for version 3 is created and populated. - service.index_create.assert_called_with(revision) - service.index_set_mapping.assert_called_with(revision) - service.index_submit_documents.assert_has_calls( - [ - call( - pointer="works-v3", - documents=[{"_id": "1"}, {"_id": "2"}, {"_id": "3"}], - ), - call( - pointer="works-v3", - documents=[{"_id": "4"}, {"_id": "5"}, {"_id": "6"}], - ), - call( - pointer="works-v3", - documents=[{"_id": "7"}, {"_id": "8"}], - ), - ] - ) - # Both the read and write pointers are left untouched. - service.write_pointer_set.assert_not_called() - service.read_pointer_set.assert_not_called() - service.index_set_populated.assert_not_called() - - def test_migrate_no_op(self): - """Index 3 already exists, so migrating to 3 is a no-op.""" - service = Mock() - service.read_pointer = MagicMock(return_value="works-v3") - service.write_pointer = MagicMock(return_value=SearchWritePointer("works", 3)) - service.index_is_populated = MagicMock(return_value=True) - service.index_set_populated = MagicMock() - - revision = EmptyRevision(3) - revisions = SearchRevisionDirectory({revision.version: revision}) - migrator = SearchMigrator(revisions, service) - docs = migrator.migrate("works", revision.version) - assert docs is None - - # The sequence of expected calls. - service.create_empty_index.assert_called_with() - # The read pointer existed, so it's left alone for now. - service.read_pointer.assert_called_with() - service.write_pointer.assert_called_with() - # The index for version 3 already exists and is populated, so nothing happens. - service.index_create.assert_not_called() - service.index_set_mapping.assert_not_called() - # The write pointer is set redundantly. - service.write_pointer_set.assert_called_with(revision) - # The read pointer is set redundantly. - service.read_pointer_set.assert_called_with(revision) - # The "indexed" flag is set redundantly. - service.index_set_populated.assert_called_with(revision) - - def test_migrate_from_indexed_2_to_3_unpopulated(self): - """Index 3 exists but is not populated. Migrating involves populating it.""" - service = Mock() - service.read_pointer = MagicMock(return_value="works-v2") - service.write_pointer = MagicMock(return_value=SearchWritePointer("works", 2)) - service.index_is_populated = MagicMock(return_value=False) - service.index_set_populated = MagicMock() - - revision = EmptyRevision(3) - revisions = SearchRevisionDirectory({revision.version: revision}) - migrator = SearchMigrator(revisions, service) - migration = migrator.migrate("works", revision.version) - migration.add_documents([]) - migration.finish() - - # The sequence of expected calls. - service.create_empty_index.assert_called_with() - # The read pointer existed, so it's left alone for now. - service.read_pointer.assert_called_with() - service.write_pointer.assert_called_with() - # The index for version 3 exists but isn't populated, so it is populated. - service.index_create.assert_called_with(revision) - service.index_set_mapping.assert_called_with(revision) - service.index_submit_documents.assert_has_calls( - [ - call( - pointer="works-v3", - documents=[], - ) - ] - ) - # Both the read and write pointers are updated. - service.write_pointer_set.assert_called_with(revision) - service.read_pointer_set.assert_called_with(revision) - service.index_set_populated.assert_called_with(revision) - - def test_migrate_from_indexed_2_to_3_write_unset(self): - """Index 3 exists and is populated, but the write pointer is unset.""" - service = Mock() - service.read_pointer = MagicMock(return_value="works-v2") - service.write_pointer = MagicMock(return_value=None) - service.index_is_populated = MagicMock(return_value=True) - service.index_set_populated = MagicMock() - - revision = EmptyRevision(3) - revisions = SearchRevisionDirectory({revision.version: revision}) - migrator = SearchMigrator(revisions, service) - docs = migrator.migrate("works", revision.version) - assert docs is None - - # The sequence of expected calls. - service.create_empty_index.assert_called_with() - # The read pointer existed, so it's left alone for now. - service.read_pointer.assert_called_with() - # The write pointer is completely unset. - service.write_pointer.assert_called_with() - # The index for version 3 exists and is populated. The create call is redundant but harmless. - service.index_create.assert_called_with(revision) - service.populate_index.assert_not_called() - # Both the read and write pointers are updated. - service.write_pointer_set.assert_called_with(revision) - service.read_pointer_set.assert_called_with(revision) - service.index_set_populated.assert_called_with(revision) diff --git a/tests/manager/search/test_search_revision_directory.py b/tests/manager/search/test_search_revision_directory.py index 2dd4435707..72b4200fe0 100644 --- a/tests/manager/search/test_search_revision_directory.py +++ b/tests/manager/search/test_search_revision_directory.py @@ -2,18 +2,8 @@ import pytest -from palace.manager.search.document import SearchMappingDocument -from palace.manager.search.revision import SearchSchemaRevision from palace.manager.search.revision_directory import SearchRevisionDirectory - - -class AnyNumberRevision(SearchSchemaRevision): - def __init__(self, number): - self.SEARCH_VERSION = number - super().__init__() - - def mapping_document(self) -> SearchMappingDocument: - return SearchMappingDocument() +from tests.mocks.search import MockSearchSchemaRevision class TestSearchRevisionDirectory: @@ -24,13 +14,13 @@ def test_create(self): with mock.patch( "palace.manager.search.revision_directory.REVISIONS", - new=[AnyNumberRevision(1), AnyNumberRevision(2)], + new=[MockSearchSchemaRevision(1), MockSearchSchemaRevision(2)], ): assert list(SearchRevisionDirectory.create().available.keys()) == [1, 2] with mock.patch( "palace.manager.search.revision_directory.REVISIONS", - new=[AnyNumberRevision(1), AnyNumberRevision(1)], + new=[MockSearchSchemaRevision(1), MockSearchSchemaRevision(1)], ): with pytest.raises(ValueError) as raised: SearchRevisionDirectory.create() @@ -39,12 +29,12 @@ def test_create(self): def test_highest(self): with mock.patch( "palace.manager.search.revision_directory.REVISIONS", - new=[AnyNumberRevision(1), AnyNumberRevision(2)], + new=[MockSearchSchemaRevision(1), MockSearchSchemaRevision(2)], ): assert SearchRevisionDirectory.create().highest().version == 2 with mock.patch( "palace.manager.search.revision_directory.REVISIONS", - new=[AnyNumberRevision(17), AnyNumberRevision(2)], + new=[MockSearchSchemaRevision(17), MockSearchSchemaRevision(2)], ): assert SearchRevisionDirectory.create().highest().version == 17 diff --git a/tests/manager/search/test_service.py b/tests/manager/search/test_service.py index ef88411eeb..b82ce79b66 100644 --- a/tests/manager/search/test_service.py +++ b/tests/manager/search/test_service.py @@ -1,19 +1,67 @@ -from palace.manager.search.document import LONG, SearchMappingDocument -from palace.manager.search.revision import SearchSchemaRevision -from palace.manager.search.service import SearchDocument -from tests.fixtures.search import ExternalSearchFixture - +from unittest.mock import MagicMock -class BasicMutableRevision(SearchSchemaRevision): - SEARCH_VERSION = 0 +import pytest - def __init__(self, version: int): - self.SEARCH_VERSION = version - super().__init__() - self.document = SearchMappingDocument() +from palace.manager.search.document import LONG +from palace.manager.search.service import ( + SearchDocument, + SearchPointer, + SearchServiceOpensearch1, +) +from tests.fixtures.search import ExternalSearchFixture +from tests.mocks.search import MockSearchSchemaRevision + + +class TestSearchPointer: + @pytest.mark.parametrize( + "base, index, expected_version", + [ + ("base", "base-v23", 23), + ("base", "base-v42", 42), + ("base", "base-v0", 0), + ("base", "base-v1", 1), + ("base", "base-v99", 99), + ], + ) + def test_from_index(self, base: str, index: str, expected_version: int): + service = SearchServiceOpensearch1(MagicMock(), base) + + write_pointer = SearchPointer.from_index( + base, service.write_pointer_name(), index + ) + assert write_pointer is not None + assert write_pointer.index == index + assert write_pointer.version == expected_version + assert write_pointer.alias == service.write_pointer_name() - def mapping_document(self) -> SearchMappingDocument: - return self.document + read_pointer = SearchPointer.from_index( + base, service.read_pointer_name(), index + ) + assert read_pointer is not None + assert read_pointer.index == index + assert read_pointer.version == expected_version + assert read_pointer.alias == service.read_pointer_name() + + @pytest.mark.parametrize( + "base, index", + [ + ("base", "nbase-v23"), + ("base", "base-42"), + ("base", "basee-42"), + ("base", "base"), + ("base", "basev1"), + ("base", "base-v99abc"), + ], + ) + def test_from_index_errors(self, base: str, index: str): + service = SearchServiceOpensearch1(MagicMock(), base) + + assert ( + SearchPointer.from_index(base, service.write_pointer_name(), index) is None + ) + assert ( + SearchPointer.from_index(base, service.read_pointer_name(), index) is None + ) class TestService: @@ -21,24 +69,12 @@ class TestService: Tests to verify that the Opensearch service implementation has the semantics we expect. """ - def test_create_empty_idempotent( - self, external_search_fixture: ExternalSearchFixture - ): - """Creating the empty index is idempotent.""" - service = external_search_fixture.service - service.create_empty_index() - service.create_empty_index() - - indices = external_search_fixture.client.indices.client.indices - assert indices is not None - assert indices.exists(f"{external_search_fixture.index_prefix}-empty") - def test_create_index_idempotent( self, external_search_fixture: ExternalSearchFixture ): """Creating any index is idempotent.""" service = external_search_fixture.service - revision = BasicMutableRevision(23) + revision = MockSearchSchemaRevision(23) service.index_create(revision) service.index_create(revision) @@ -61,40 +97,32 @@ def test_write_pointer_none(self, external_search_fixture: ExternalSearchFixture def test_read_pointer_set(self, external_search_fixture: ExternalSearchFixture): """Setting the read pointer works.""" service = external_search_fixture.service - revision = BasicMutableRevision(23) + revision = MockSearchSchemaRevision(23) service.index_create(revision) service.read_pointer_set(revision) - assert service.read_pointer() == f"{external_search_fixture.index_prefix}-v23" - - def test_read_pointer_set_empty( - self, external_search_fixture: ExternalSearchFixture - ): - """Setting the read pointer to the empty index works.""" - service = external_search_fixture.service - service.create_empty_index() - - service.read_pointer_set_empty() - assert service.read_pointer() == f"{external_search_fixture.index_prefix}-empty" + pointer = service.read_pointer() + assert pointer is not None + assert pointer.index == f"{external_search_fixture.index_prefix}-v23" def test_write_pointer_set(self, external_search_fixture: ExternalSearchFixture): """Setting the write pointer works.""" service = external_search_fixture.service - revision = BasicMutableRevision(23) + revision = MockSearchSchemaRevision(23) service.index_create(revision) service.write_pointer_set(revision) pointer = service.write_pointer() assert pointer is not None - assert pointer.target_name == f"{external_search_fixture.index_prefix}-v23" + assert pointer.index == f"{external_search_fixture.index_prefix}-v23" def test_populate_index_idempotent( self, external_search_fixture: ExternalSearchFixture ): """Populating an index is idempotent.""" service = external_search_fixture.service - revision = BasicMutableRevision(23) + revision = MockSearchSchemaRevision(23) mappings = revision.mapping_document() mappings.properties["x"] = LONG @@ -126,12 +154,8 @@ def test_populate_index_idempotent( service.index_create(revision) - service.index_submit_documents( - f"{external_search_fixture.index_prefix}-v23", documents - ) - service.index_submit_documents( - f"{external_search_fixture.index_prefix}-v23", documents - ) + service.index_submit_documents(documents) + service.index_submit_documents(documents) indices = external_search_fixture.client.indices.client.indices assert indices is not None @@ -143,3 +167,23 @@ def test_populate_index_idempotent( )[f"{external_search_fixture.index_prefix}-v23"]["mappings"] == { "properties": mappings.serialize_properties() } + + def test__get_pointer(self): + """Getting a pointer works.""" + mock_client = MagicMock() + mock_client.indices.get_alias.return_value = { + "base-v23": {"aliases": {"base-search-read": {}}} + } + service = SearchServiceOpensearch1(mock_client, "base") + + pointer = service._get_pointer("base-search-read") + assert pointer is not None + assert pointer.index == "base-v23" + assert pointer.version == 23 + mock_client.indices.get_alias.assert_called_once_with(name="base-search-read") + + mock_client.indices.get_alias.reset_mock() + mock_client.indices.get_alias.return_value = {"bad": [], "data": []} + pointer = service._get_pointer("base-search-read") + assert pointer is None + mock_client.indices.get_alias.assert_called_once_with(name="base-search-read") diff --git a/tests/manager/sqlalchemy/model/test_customlist.py b/tests/manager/sqlalchemy/model/test_customlist.py index 7d3f60252f..a50d75d76d 100644 --- a/tests/manager/sqlalchemy/model/test_customlist.py +++ b/tests/manager/sqlalchemy/model/test_customlist.py @@ -1,6 +1,5 @@ import pytest -from palace.manager.sqlalchemy.model.coverage import WorkCoverageRecord from palace.manager.sqlalchemy.model.customlist import ( CustomList, CustomListEntry, @@ -10,6 +9,7 @@ from palace.manager.sqlalchemy.util import get_one_or_create from palace.manager.util.datetime_helpers import utc_now from tests.fixtures.database import DatabaseTransactionFixture +from tests.fixtures.search import WorkExternalIndexingFixture class TestCustomList: @@ -48,17 +48,11 @@ def test_find(self, db: DatabaseTransactionFixture): ) assert custom_list == result - def assert_reindexing_scheduled(self, work): - """Assert that the given work has exactly one WorkCoverageRecord, which - indicates that it needs to have its search index updated. - """ - [needs_reindex] = work.coverage_records - assert ( - WorkCoverageRecord.UPDATE_SEARCH_INDEX_OPERATION == needs_reindex.operation - ) - assert WorkCoverageRecord.REGISTERED == needs_reindex.status - - def test_add_entry(self, db: DatabaseTransactionFixture): + def test_add_entry( + self, + db: DatabaseTransactionFixture, + work_external_indexing: WorkExternalIndexingFixture, + ): custom_list = db.customlist(num_entries=0)[0] now = utc_now() @@ -76,7 +70,6 @@ def test_add_entry(self, db: DatabaseTransactionFixture): # An edition with a work can create an entry. work = db.work() - work.coverage_records = [] worked_entry, is_new = custom_list.add_entry(work.presentation_edition) assert True == is_new assert work == worked_entry.work @@ -85,11 +78,10 @@ def test_add_entry(self, db: DatabaseTransactionFixture): assert 2 == custom_list.size # When this happens, the work is scheduled for reindexing. - self.assert_reindexing_scheduled(work) + assert work_external_indexing.is_queued(work) # A work can create an entry. work = db.work(with_open_access_download=True) - work.coverage_records = [] work_entry, is_new = custom_list.add_entry(work) assert True == is_new assert work.presentation_edition == work_entry.edition @@ -98,7 +90,7 @@ def test_add_entry(self, db: DatabaseTransactionFixture): assert 3 == custom_list.size # When this happens, the work is scheduled for reindexing. - self.assert_reindexing_scheduled(work) + assert work_external_indexing.is_queued(work) # Annotations can be passed to the entry. annotated_edition = db.edition() @@ -273,13 +265,16 @@ def test_add_entry_work_equivalent_identifier(self, db: DatabaseTransactionFixtu assert entry3 == entry1 assert False == is_new3 - def test_remove_entry(self, db: DatabaseTransactionFixture): + def test_remove_entry( + self, + db: DatabaseTransactionFixture, + work_external_indexing: WorkExternalIndexingFixture, + ): custom_list, editions = db.customlist(num_entries=3) [first, second, third] = editions now = utc_now() # An entry is removed if its edition is passed in. - first.work.coverage_records = [] custom_list.remove_entry(first) assert 2 == len(custom_list.entries) assert {second, third} == {entry.edition for entry in custom_list.entries} @@ -288,8 +283,7 @@ def test_remove_entry(self, db: DatabaseTransactionFixture): assert 2 == custom_list.size # The editon's work has been scheduled for reindexing. - self.assert_reindexing_scheduled(first.work) - first.work.coverage_records = [] + assert work_external_indexing.is_queued(first.work, clear=True) # An entry is also removed if any of its equivalent editions # are passed in. @@ -321,7 +315,7 @@ def test_remove_entry(self, db: DatabaseTransactionFixture): # The 'removed' edition's work does not need to be reindexed # because it wasn't on the list to begin with. - assert [] == first.work.coverage_records + assert not work_external_indexing.is_queued(first.work) def test_entries_for_work(self, db: DatabaseTransactionFixture): custom_list, editions = db.customlist(num_entries=2) diff --git a/tests/manager/sqlalchemy/model/test_lane.py b/tests/manager/sqlalchemy/model/test_lane.py index 0fe2970649..d321d2cfae 100644 --- a/tests/manager/sqlalchemy/model/test_lane.py +++ b/tests/manager/sqlalchemy/model/test_lane.py @@ -4031,10 +4031,7 @@ def test_search( lane = db.lane() search_client = end_to_end_search_fixture.external_search_index - docs = end_to_end_search_fixture.external_search_index.start_migration() - assert docs is not None - docs.add_documents(search_client.create_search_documents_from_works([work])) - docs.finish() + end_to_end_search_fixture.populate_search_index() pagination = Pagination(offset=0, size=1) diff --git a/tests/manager/sqlalchemy/model/test_work.py b/tests/manager/sqlalchemy/model/test_work.py index 0563534d58..b2307ec65f 100644 --- a/tests/manager/sqlalchemy/model/test_work.py +++ b/tests/manager/sqlalchemy/model/test_work.py @@ -1,6 +1,7 @@ import datetime from contextlib import nullcontext -from unittest.mock import MagicMock +from typing import Any +from unittest.mock import MagicMock, patch import opensearchpy import pytest @@ -13,6 +14,7 @@ from palace.manager.core.equivalents_coverage import ( EquivalentIdentifiersCoverageProvider, ) +from palace.manager.core.exceptions import BasePalaceException from palace.manager.service.logging.configuration import LogLevel from palace.manager.sqlalchemy.model.classification import Genre, Subject from palace.manager.sqlalchemy.model.contributor import Contributor @@ -27,11 +29,19 @@ WorkGenre, work_library_suppressions, ) -from palace.manager.sqlalchemy.util import get_one_or_create, tuple_to_numericrange +from palace.manager.sqlalchemy.util import ( + get_one_or_create, + numericrange_to_tuple, + tuple_to_numericrange, +) from palace.manager.util.datetime_helpers import datetime_utc, from_timestamp, utc_now from tests.fixtures.database import DatabaseTransactionFixture from tests.fixtures.files import SampleCoversFixture -from tests.fixtures.search import EndToEndSearchFixture, ExternalSearchFixtureFake +from tests.fixtures.search import ( + EndToEndSearchFixture, + ExternalSearchFixtureFake, + WorkExternalIndexingFixture, +) class TestWork: @@ -110,6 +120,7 @@ def test_calculate_presentation( self, db: DatabaseTransactionFixture, external_search_fake_fixture: ExternalSearchFixtureFake, + work_external_indexing: WorkExternalIndexingFixture, ): # Test that: # - work coverage records are made on work creation and primary edition selection. @@ -219,12 +230,8 @@ def test_calculate_presentation( assert "Alice Adder, Bob Bitshifter" == work.author # This Work starts out with a single CoverageRecord reflecting - # the work done to choose-edition as a primary edition is set. The - # search index CoverageRecord is a marker for work that must - # be done in the future, and is not tested here. - [choose_edition, update_search_index] = sorted( - work.coverage_records, key=lambda x: x.operation - ) + # the work done to choose-edition as a primary edition is set. + [choose_edition] = sorted(work.coverage_records, key=lambda x: x.operation) assert choose_edition.operation == WorkCoverageRecord.CHOOSE_EDITION_OPERATION # pools aren't yet aware of each other @@ -278,7 +285,7 @@ def test_calculate_presentation( # occurred as part of calculate_presentation(). # # All the work has actually been done, except for the work of - # updating the search index, which has been registered and + # updating the search index, which has been queued and # will be done later. records = work.coverage_records @@ -289,9 +296,9 @@ def test_calculate_presentation( (wcr.CLASSIFY_OPERATION, success), (wcr.SUMMARY_OPERATION, success), (wcr.QUALITY_OPERATION, success), - (wcr.UPDATE_SEARCH_INDEX_OPERATION, wcr.REGISTERED), } assert expect == {(x.operation, x.status) for x in records} + assert work_external_indexing.is_queued(work) # Now mark the pool with the presentation edition as suppressed. # work.calculate_presentation() will call work.mark_licensepools_as_superceded(), @@ -482,6 +489,7 @@ def test_set_presentation_ready_based_on_content( self, db: DatabaseTransactionFixture, external_search_fake_fixture: ExternalSearchFixtureFake, + work_external_indexing: WorkExternalIndexingFixture, ): work = db.work(with_license_pool=True) @@ -493,19 +501,8 @@ def test_set_presentation_ready_based_on_content( # The work has not been added to the search index. assert [] == external_search_fake_fixture.service.documents_all() - # But the work of adding it to the search engine has been - # registered. - def assert_record(): - # Verify the search index WorkCoverageRecord for this work - # is in the REGISTERED state. - [record] = [ - x - for x in work.coverage_records - if x.operation == WorkCoverageRecord.UPDATE_SEARCH_INDEX_OPERATION - ] - assert WorkCoverageRecord.REGISTERED == record.status - - assert_record() + # But the work of adding it to the search engine has been queued. + assert work_external_indexing.is_queued(work, clear=True) # This work is presentation ready because it has a title. # Remove the title, and the work stops being presentation @@ -514,11 +511,10 @@ def assert_record(): work.set_presentation_ready_based_on_content(search_index_client=search) assert False == work.presentation_ready - # The search engine WorkCoverageRecord is still in the - # REGISTERED state, but its meaning has changed -- the work - # will now be _removed_ from the search index, rather than - # updated. - assert_record() + # The work has been queued for reindexing, so that the search + # index will be updated to reflect the fact that the work is no + # longer presentation-ready. + assert work_external_indexing.is_queued(work, clear=True) # Restore the title, and everything is fixed. presentation.title = "foo" @@ -1411,6 +1407,171 @@ def test_to_search_doc_datetime_cases(self, db: DatabaseTransactionFixture): == datetime.datetime(1, 1, 1, tzinfo=pytz.UTC).timestamp() ) + def test_to_search_doc_no_id(self, db: DatabaseTransactionFixture): + work = Work() + with pytest.raises(BasePalaceException) as excinfo: + work.to_search_document() + + assert "Work has no ID. Cannot generate search document." in str(excinfo.value) + + def test_to_search_documents(self, db: DatabaseTransactionFixture): + customlist, editions = db.customlist() + library = db.library() + + works = [ + db.work( + authors=[db.contributor()], + with_license_pool=True, + genre="history", + ), + editions[0].work, + ] + + work1: Work = works[0] + work2: Work = works[1] + work2.suppressed_for.append(library) + + work1.target_age = NumericRange(lower=18, upper=22, bounds="()") + work2.target_age = NumericRange(lower=18, upper=99, bounds="[]") + + genre1, is_new = Genre.lookup(db.session, "Psychology") + genre2, is_new = Genre.lookup(db.session, "Cooking") + subject1 = db.subject(type=Subject.SIMPLIFIED_GENRE, identifier="subject1") + subject1.genre = genre1 + subject2 = db.subject(type=Subject.SIMPLIFIED_GENRE, identifier="subject2") + subject2.genre = genre2 + + db.session.flush() + + search_docs = Work.to_search_documents(db.session, [w.id for w in works]) + + search_doc_work1 = list( + filter(lambda x: x["work_id"] == work1.id, search_docs) + )[0] + search_doc_work2 = list( + filter(lambda x: x["work_id"] == work2.id, search_docs) + )[0] + + def compare(doc: dict[str, Any], work: Work) -> None: + assert doc["_id"] == work.id + assert doc["work_id"] == work.id + assert doc["title"] == work.title + assert doc["sort_title"] == work.sort_title + assert doc["subtitle"] == work.subtitle + assert doc["series"] == work.series + assert doc["series_position"] == work.series_position + assert doc["language"] == work.language + assert doc["author"] == work.author + assert doc["sort_author"] == work.sort_author + assert doc["medium"] == work.presentation_edition.medium + assert doc["publisher"] == work.publisher + assert doc["imprint"] == work.imprint + assert ( + doc["permanent_work_id"] == work.presentation_edition.permanent_work_id + ) + assert doc["presentation_ready"] == work.presentation_ready + assert doc["last_update_time"] == work.last_update_time + assert doc["fiction"] == "Fiction" if work.fiction else "Nonfiction" + assert doc["audience"] == work.audience + assert doc["summary"] == work.summary_text + assert doc["quality"] == work.quality + assert doc["rating"] == work.rating + assert doc["popularity"] == work.popularity + + if work.suppressed_for: + assert doc["suppressed_for"] == [l.id for l in work.suppressed_for] + else: + assert doc["suppressed_for"] is None + + if work.license_pools: + assert len(doc["licensepools"]) == len(work.license_pools) + for idx, pool in enumerate(work.license_pools): + actual = doc["licensepools"][idx] + assert actual["licensepool_id"] == pool.id + assert actual["data_source_id"] == pool.data_source_id + assert actual["collection_id"] == pool.collection_id + assert actual["open_access"] == pool.open_access + assert actual["suppressed"] == pool.suppressed + else: + assert doc["licensepools"] is None + + if work.custom_list_entries: + assert len(doc["customlists"]) == len(work.custom_list_entries) + for idx, custom_list in enumerate(work.custom_list_entries): + actual = doc["customlists"][idx] + assert actual["list_id"] == custom_list.list_id + assert actual["featured"] == custom_list.featured + else: + assert doc["customlists"] is None + + if work.presentation_edition.contributions: + assert len(doc["contributors"]) == len( + work.presentation_edition.contributions + ) + for idx, contribution in enumerate( + work.presentation_edition.contributions + ): + actual = doc["contributors"][idx] + assert actual["sort_name"] == contribution.contributor.sort_name + assert ( + actual["display_name"] == contribution.contributor.display_name + ) + assert actual["lc"] == contribution.contributor.lc + assert actual["viaf"] == contribution.contributor.viaf + assert actual["role"] == contribution.role + else: + assert doc["contributors"] is None + + if work.identifiers: # type: ignore[attr-defined] + assert len(doc["identifiers"]) == len(work.identifiers) # type: ignore[attr-defined] + for idx, identifier in enumerate(work.identifiers): # type: ignore[attr-defined] + actual = doc["identifiers"][idx] + assert actual["type"] == identifier.type + assert actual["identifier"] == identifier.identifier + else: + assert doc["identifiers"] is None + + if work.classifications: # type: ignore[attr-defined] + assert len(doc["classifications"]) == len(work.classifications) # type: ignore[attr-defined] + else: + assert doc["classifications"] is None + + if work.work_genres: + assert len(doc["genres"]) == len(work.work_genres) + for idx, genre in enumerate(work.work_genres): + actual = doc["genres"][idx] + assert actual["name"] == genre.genre.name + assert actual["term"] == genre.genre.id + assert actual["weight"] == genre.affinity + else: + assert doc["genres"] is None + + lower, upper = numericrange_to_tuple(work.target_age) + assert doc["target_age"]["lower"] == lower + assert doc["target_age"]["upper"] == upper + + compare(search_doc_work1, work1) + compare(search_doc_work2, work2) + + def test_to_search_documents_with_missing_data( + self, db: DatabaseTransactionFixture + ): + # Missing edition relationship + work: Work = db.work(with_license_pool=True) + work.presentation_edition_id = None + assert work.id is not None + [result] = Work.to_search_documents(db.session, [work.id]) + assert result["identifiers"] is None + + # Missing just some attributes + work = db.work(with_license_pool=True) + work.presentation_edition.title = None + work.target_age = None + assert work.id is not None + [result] = Work.to_search_documents(db.session, [work.id]) + assert result["title"] is None + assert result["target_age"]["lower"] is None + def test_age_appropriate_for_patron(self, db: DatabaseTransactionFixture): work = db.work() work.audience = Classifier.AUDIENCE_YOUNG_ADULT @@ -1541,99 +1702,72 @@ def test_target_age_string(self, db: DatabaseTransactionFixture): work.target_age = None assert "" == work.target_age_string - def test_reindex_on_availability_change(self, db: DatabaseTransactionFixture): - # A change in a LicensePool's availability creates a - # WorkCoverageRecord indicating that the work needs to be - # re-indexed. - def find_record(work): - """Find the Work's 'update search index operation' - WorkCoverageRecord. - """ - records = [ - x - for x in work.coverage_records - if x.operation.startswith( - WorkCoverageRecord.UPDATE_SEARCH_INDEX_OPERATION - ) - ] - if records: - return records[0] - return None - - registered = WorkCoverageRecord.REGISTERED - success = WorkCoverageRecord.SUCCESS + def test_reindex_on_availability_change( + self, + db: DatabaseTransactionFixture, + work_external_indexing: WorkExternalIndexingFixture, + ): + # A change in a LicensePool's availability queues a task indicating that + # the work needs to be re-indexed. # A Work with no LicensePool isn't registered as needing # indexing. (It will be indexed anyway, but it's not registered # as needing it.) no_licensepool = db.work() - assert None == find_record(no_licensepool) + assert not work_external_indexing.is_queued(no_licensepool) # A Work with a LicensePool starts off in a state where it # needs to be indexed. work = db.work(with_open_access_download=True) [pool] = work.license_pools - record = find_record(work) - assert registered == record.status + assert work_external_indexing.is_queued(work, clear=True) # If it stops being open-access, it needs to be reindexed. - record.status = success pool.open_access = False - record = find_record(work) - assert registered == record.status + assert work_external_indexing.is_queued(work, clear=True) # If it becomes open-access again, it needs to be reindexed. - record.status = success pool.open_access = True - record = find_record(work) - assert registered == record.status + assert work_external_indexing.is_queued(work, clear=True) # If its last_update_time is changed, it needs to be # reindexed. (This happens whenever # LicensePool.update_availability is called, meaning that # patron transactions always trigger a reindex). - record.status = success work.last_update_time = utc_now() - assert registered == record.status + assert work_external_indexing.is_queued(work, clear=True) # If its collection changes (which shouldn't happen), it needs # to be reindexed. - record.status = success collection2 = db.collection() pool.collection_id = collection2.id - assert registered == record.status + assert work_external_indexing.is_queued(work, clear=True) # If a LicensePool is deleted (which also shouldn't happen), # its former Work needs to be reindexed. - record.status = success db.session.delete(pool) work = db.session.query(Work).filter(Work.id == work.id).one() - record = find_record(work) - assert registered == record.status + assert work_external_indexing.is_queued(work, clear=True) # If a LicensePool is moved in from another Work, _both_ Works # need to be reindexed. - record.status = success another_work = db.work(with_license_pool=True) [another_pool] = another_work.license_pools work.license_pools.append(another_pool) assert [] == another_work.license_pools - for work in (work, another_work): - record = find_record(work) - assert registered == record.status + assert work_external_indexing.is_queued(work) + assert work_external_indexing.is_queued(another_work) def test_reset_coverage( self, db: DatabaseTransactionFixture, - external_search_fake_fixture: ExternalSearchFixtureFake, ): # Test the methods that reset coverage for works, indicating # that some task needs to be performed again. WCR = WorkCoverageRecord work = db.work() work.presentation_ready = True - index = external_search_fake_fixture.external_search # Calling _reset_coverage when there is no coverage creates # a new WorkCoverageRecord in the REGISTERED state @@ -1657,16 +1791,10 @@ def mock_reset_coverage(operation): for method, operation in ( (work.needs_full_presentation_recalculation, WCR.CLASSIFY_OPERATION), (work.needs_new_presentation_edition, WCR.CHOOSE_EDITION_OPERATION), - (work.external_index_needs_updating, WCR.UPDATE_SEARCH_INDEX_OPERATION), ): method() assert operation == work.coverage_reset_for - # The work was not added to the search index when we called - # external_index_needs_updating. That happens later, when the - # WorkCoverageRecord is processed. - assert [] == external_search_fake_fixture.service.documents_all() - def test_for_unchecked_subjects(self, db: DatabaseTransactionFixture): w1 = db.work(with_license_pool=True) w2 = db.work() @@ -1885,6 +2013,16 @@ def test_delete_work_not_in_search_end2end( assert db.session.query(Work).filter(Work.id == work.id).all() == [] assert warning_is_present + @patch("palace.manager.celery.tasks.search.index_work") + def test_queue_indexing_tasks(self, index_work_mock: MagicMock): + # Test the method that queues up task to index a work. + Work.queue_indexing_task(555) + index_work_mock.apply_async.assert_called_once_with((555,), countdown=2) + + index_work_mock.reset_mock() + Work.queue_indexing_task(None) + index_work_mock.delay.assert_not_called() + class TestWorkConsolidation: def test_calculate_work_success(self, db: DatabaseTransactionFixture): diff --git a/tests/manager/sqlalchemy/test_listeners.py b/tests/manager/sqlalchemy/test_listeners.py index a5efce084e..0b63fb05ce 100644 --- a/tests/manager/sqlalchemy/test_listeners.py +++ b/tests/manager/sqlalchemy/test_listeners.py @@ -1,15 +1,17 @@ from __future__ import annotations import functools +from collections.abc import Callable from unittest.mock import patch import pytest from palace.manager.core.config import Configuration from palace.manager.sqlalchemy.listeners import site_configuration_has_changed -from palace.manager.sqlalchemy.model.coverage import Timestamp, WorkCoverageRecord +from palace.manager.sqlalchemy.model.coverage import Timestamp from palace.manager.util.datetime_helpers import utc_now from tests.fixtures.database import DatabaseTransactionFixture +from tests.fixtures.search import WorkExternalIndexingFixture class TestSiteConfigurationHasChanged: @@ -122,59 +124,43 @@ def _set_property(object, value, property_name): class TestListeners: @pytest.mark.parametrize( - "name,status_property_setter", + "status_property_setter", [ - ( - "works_when_open_access_property_changes", + pytest.param( functools.partial(_set_property, property_name="open_access"), + id="works_when_open_access_property_changes", ), ], ) - def test_licensepool_storage_status_change( + def test_licensepool_status_change( self, - db, - name, - status_property_setter, + db: DatabaseTransactionFixture, + work_external_indexing: WorkExternalIndexingFixture, + status_property_setter: Callable[..., None], ): - # Arrange work = db.work(with_license_pool=True) [pool] = work.license_pools - # Clear out any WorkCoverageRecords created as the work was initialized. - work.coverage_records = [] - - # Act # Change the field status_property_setter(pool, True) + assert work_external_indexing.is_queued(work, clear=True) # Then verify that if the field is 'set' to its existing value, this doesn't happen. - # pool.self_hosted = True status_property_setter(pool, True) + assert not work_external_indexing.is_queued(work, clear=True) - # Assert - assert 1 == len(work.coverage_records) - assert work.id == work.coverage_records[0].work_id - assert ( - WorkCoverageRecord.UPDATE_SEARCH_INDEX_OPERATION - == work.coverage_records[0].operation - ) - assert WorkCoverageRecord.REGISTERED == work.coverage_records[0].status - - def test_work_suppressed_for_library(self, db: DatabaseTransactionFixture): + def test_work_suppressed_for_library( + self, + db: DatabaseTransactionFixture, + work_external_indexing: WorkExternalIndexingFixture, + ): work = db.work(with_license_pool=True) library = db.library() - # Clear out any WorkCoverageRecords created as the work was initialized. - work.coverage_records = [] - - # Act + # Suppress the work for the library work.suppressed_for.append(library) + assert work_external_indexing.is_queued(work, clear=True) - # Assert - assert 1 == len(work.coverage_records) - assert work.id == work.coverage_records[0].work_id - assert ( - WorkCoverageRecord.UPDATE_SEARCH_INDEX_OPERATION - == work.coverage_records[0].operation - ) - assert WorkCoverageRecord.REGISTERED == work.coverage_records[0].status + # Unsuppress the work for the library + work.suppressed_for.remove(library) + assert work_external_indexing.is_queued(work, clear=True) diff --git a/tests/manager/util/test_backoff.py b/tests/manager/util/test_backoff.py new file mode 100644 index 0000000000..84ebe86bb2 --- /dev/null +++ b/tests/manager/util/test_backoff.py @@ -0,0 +1,35 @@ +from unittest.mock import patch + +import pytest + +from palace.manager.util.backoff import exponential_backoff + + +@pytest.mark.parametrize( + "retries, expected", + [ + (0, 3), + (1, 9), + (2, 27), # ~0.5 minutes + (3, 81), # ~1.3 minutes + (4, 243), # ~4 minutes + (5, 729), # ~12 minutes + (6, 2187), # ~35 minutes + ], +) +def test_exponential_backoff(retries: int, expected: int) -> None: + with patch( + "palace.manager.util.backoff.randrange", return_value=0 + ) as mock_randrange: + assert exponential_backoff(retries) == expected + assert mock_randrange.call_count == 1 + mock_randrange.assert_called_with(0, round(expected * 0.3)) + + +def test_exponential_backoff_max_backoff_time() -> None: + jitter = 2 + with patch( + "palace.manager.util.backoff.randrange", return_value=jitter + ) as mock_randrange: + assert exponential_backoff(0, 6) == 3 + jitter + assert exponential_backoff(12, 6) == 6 + jitter diff --git a/tests/migration/test_instance_init_script.py b/tests/migration/test_instance_init_script.py index 927facee91..ba423db381 100644 --- a/tests/migration/test_instance_init_script.py +++ b/tests/migration/test_instance_init_script.py @@ -29,12 +29,20 @@ def __init__( self.database = function_database self.services = services_fixture self.alembic_config_path = alembic_config_path + self.script = InstanceInitializationScript( + config_file=self.alembic_config_path, + ) - def script(self) -> InstanceInitializationScript: - with self.database.patch_engine(): - return InstanceInitializationScript( - config_file=self.alembic_config_path, - ) + self.initialize_database_schema_mock = Mock( + wraps=self.script.initialize_database_schema + ) + self.script.initialize_database_schema = self.initialize_database_schema_mock + + self.migrate_database_mock = Mock(wraps=self.script.migrate_database) + self.script.migrate_database = self.migrate_database_mock + + def initialize_database(self) -> None: + self.script.initialize_database(self.database.connection) @classmethod @contextmanager @@ -74,6 +82,8 @@ def engine_factory() -> Engine: script = InstanceInitializationScript( config_file=config_path, engine_factory=engine_factory ) + # Mock out the search initialization, this is tested elsewhere + script.initialize_search = MagicMock() script.run() # Set our exit code to the number of upgrades we ran @@ -128,7 +138,9 @@ def test_locking( assert exit_codes[2] == 0 -def test_initialize(instance_init_script_fixture: InstanceInitScriptFixture) -> None: +def test_initialize_database( + instance_init_script_fixture: InstanceInitScriptFixture, +) -> None: # Drop any existing schema instance_init_script_fixture.database.drop_existing_schema() @@ -138,27 +150,24 @@ def test_initialize(instance_init_script_fixture: InstanceInitScriptFixture) -> assert "alembic_version" not in inspector.get_table_names() assert len(inspector.get_table_names()) == 0 - script = instance_init_script_fixture.script() - script.initialize_database = Mock(wraps=script.initialize_database) - script.migrate_database = Mock(wraps=script.migrate_database) - script.run() + instance_init_script_fixture.initialize_database() inspector = inspect(engine) assert "alembic_version" in inspector.get_table_names() assert "libraries" in inspector.get_table_names() assert len(inspector.get_table_names()) > 2 - assert script.initialize_database.call_count == 1 - assert script.migrate_database.call_count == 0 + assert instance_init_script_fixture.initialize_database_schema_mock.call_count == 1 + assert instance_init_script_fixture.migrate_database_mock.call_count == 0 # Run the script again. Ensure we don't call initialize_database again, # but that we do call migrate_database, since the schema already exists. - script.run() - assert script.initialize_database.call_count == 1 - assert script.migrate_database.call_count == 1 + instance_init_script_fixture.initialize_database() + assert instance_init_script_fixture.initialize_database_schema_mock.call_count == 1 + assert instance_init_script_fixture.migrate_database_mock.call_count == 1 -def test_migrate( +def test_migrate_database( alembic_runner: MigrationContext, instance_init_script_fixture: InstanceInitScriptFixture, ) -> None: @@ -170,13 +179,10 @@ def test_migrate( assert alembic_runner.current == "base" assert alembic_runner.current != alembic_runner.heads[0] - script = instance_init_script_fixture.script() - script.initialize_database = Mock(wraps=script.initialize_database) - script.migrate_database = Mock(wraps=script.migrate_database) - script.run() + instance_init_script_fixture.initialize_database() # Make sure we have upgraded assert alembic_runner.current == alembic_runner.heads[0] - assert script.initialize_database.call_count == 0 - assert script.migrate_database.call_count == 1 + assert instance_init_script_fixture.initialize_database_schema_mock.call_count == 0 + assert instance_init_script_fixture.migrate_database_mock.call_count == 1 diff --git a/tests/mocks/search.py b/tests/mocks/search.py index 0284287d9c..c20fab2aa6 100644 --- a/tests/mocks/search.py +++ b/tests/mocks/search.py @@ -1,21 +1,24 @@ from __future__ import annotations +from collections import defaultdict from collections.abc import Iterable from enum import Enum +from typing import Any from unittest.mock import MagicMock from opensearch_dsl import MultiSearch, Search from opensearch_dsl.response.hit import Hit from opensearchpy import OpenSearchException +from palace.manager.search.document import SearchMappingDocument from palace.manager.search.external_search import ExternalSearchIndex from palace.manager.search.revision import SearchSchemaRevision -from palace.manager.search.revision_directory import SearchRevisionDirectory from palace.manager.search.service import ( + SearchPointer, SearchService, SearchServiceFailedDocument, - SearchWritePointer, ) +from palace.manager.search.v5 import SearchV5 from palace.manager.sqlalchemy.model.work import Work @@ -31,18 +34,12 @@ class SearchServiceFailureMode(Enum): class SearchServiceFake(SearchService): """A search service that doesn't speak to a real service.""" - _documents_by_index: dict[str, list[dict]] - _failing: SearchServiceFailureMode - _search_client: Search - _multi_search_client: MultiSearch - _document_submission_attempts: list[dict] - def __init__(self): self.base_name = "test_index" self._failing = SearchServiceFailureMode.NOT_FAILING - self._documents_by_index = {} - self._read_pointer: str | None = None - self._write_pointer: SearchWritePointer | None = None + self._documents_by_index = defaultdict(list) + self._read_pointer: SearchPointer | None = None + self._write_pointer: SearchPointer | None = None self._search_client = Search(using=MagicMock()) self._multi_search_client = MultiSearch(using=MagicMock()) self._document_submission_attempts = [] @@ -65,8 +62,6 @@ def set_failing_mode(self, mode: SearchServiceFailureMode): def documents_for_index(self, index_name: str) -> list[dict]: self._fail_if_necessary() - if not (index_name in self._documents_by_index): - return [] return self._documents_by_index[index_name] def documents_all(self) -> list[dict]: @@ -91,45 +86,36 @@ def write_pointer_name(self) -> str: self._fail_if_necessary() return f"{self.base_name}-search-write" - def read_pointer(self) -> str | None: + def read_pointer(self) -> SearchPointer | None: self._fail_if_necessary() return self._read_pointer - def write_pointer(self) -> SearchWritePointer | None: + def write_pointer(self) -> SearchPointer | None: self._fail_if_necessary() return self._write_pointer - def create_empty_index(self) -> None: - self._fail_if_necessary() - return None - def read_pointer_set(self, revision: SearchSchemaRevision) -> None: self._fail_if_necessary() - self._read_pointer = f"{revision.name_for_indexed_pointer(self.base_name)}" - - def index_set_populated(self, revision: SearchSchemaRevision) -> None: - self._fail_if_necessary() - - def read_pointer_set_empty(self) -> None: - self._fail_if_necessary() - self._read_pointer = f"{self.base_name}-empty" + self._read_pointer = self._pointer_set(revision, self.read_pointer_name()) def index_create(self, revision: SearchSchemaRevision) -> None: self._fail_if_necessary() return None - def index_is_populated(self, revision: SearchSchemaRevision) -> bool: - self._fail_if_necessary() - return True - def index_set_mapping(self, revision: SearchSchemaRevision) -> None: self._fail_if_necessary() + def index_submit_document( + self, document: dict[str, Any], refresh: bool = False + ) -> None: + self.index_submit_documents([document]) + def index_submit_documents( - self, pointer: str, documents: Iterable[dict] + self, documents: Iterable[dict] ) -> list[SearchServiceFailedDocument]: self._fail_if_necessary() + pointer = self.write_pointer_name() _should_fail = False _should_fail = ( _should_fail @@ -162,46 +148,40 @@ def index_submit_documents( return results - if not (pointer in self._documents_by_index): - self._documents_by_index[pointer] = [] - for document in documents: self._documents_by_index[pointer].append(document) return [] + def _pointer_set(self, revision: SearchSchemaRevision, alias: str) -> SearchPointer: + return SearchPointer( + alias=alias, + index=revision.name_for_index(self.base_name), + version=revision.version, + ) + def write_pointer_set(self, revision: SearchSchemaRevision) -> None: self._fail_if_necessary() - self._write_pointer = SearchWritePointer(self.base_name, revision.version) + self._write_pointer = self._pointer_set(revision, self.write_pointer_name()) - def index_clear_documents(self, pointer: str): + def index_clear_documents(self): self._fail_if_necessary() - if pointer in self._documents_by_index: - self._documents_by_index[pointer] = [] + pointer = self.write_pointer_name() + self._documents_by_index[pointer].clear() - def search_client(self, write=False) -> Search: - return self._search_client.index( - self.read_pointer_name() if not write else self.write_pointer_name() - ) + def read_search_client(self) -> Search: + return self._search_client.index(self.read_pointer_name()) - def search_multi_client(self, write=False) -> MultiSearch: - return self._multi_search_client.index( - self.read_pointer_name() if not write else self.write_pointer_name() - ) + def read_search_multi_client(self) -> MultiSearch: + return self._multi_search_client.index(self.read_pointer_name()) - def index_remove_document(self, pointer: str, id: int): + def index_remove_document(self, doc_id: int): self._fail_if_necessary() - if pointer in self._documents_by_index: - items = self._documents_by_index[pointer] - to_remove = [] - for item in items: - if item.get("_id") == id: - to_remove.append(item) - for item in to_remove: - items.remove(item) - - def is_pointer_empty(*args): - return False + pointer = self.write_pointer_name() + items = self._documents_by_index[pointer] + to_remove = [item for item in items if item.get("_id") == doc_id] + for item in to_remove: + items.remove(item) def fake_hits(works: list[Work]): @@ -223,14 +203,9 @@ class ExternalSearchIndexFake(ExternalSearchIndex): def __init__( self, - revision_directory: SearchRevisionDirectory | None = None, - version: int | None = None, ): - revision_directory = revision_directory or SearchRevisionDirectory.create() super().__init__( service=SearchServiceFake(), - revision_directory=revision_directory, - version=version, ) self._mock_multi_works: list[dict] = [] @@ -287,3 +262,23 @@ def count_works(self, filter): def __repr__(self) -> str: return f"Expected Results({id(self)}): {self._mock_multi_works}" + + +class MockSearchSchemaRevision(SearchSchemaRevision): + def __init__(self, version: int) -> None: + super().__init__() + self._version = version + self._document = SearchMappingDocument() + + @property + def version(self) -> int: + return self._version + + def mapping_document(self) -> SearchMappingDocument: + return self._document + + +class MockSearchSchemaRevisionLatest(MockSearchSchemaRevision): + def __init__(self, version: int) -> None: + super().__init__(version) + self._document = SearchV5().mapping_document()