From 379f029abfb3c8c51f57236cd8b82e1622627830 Mon Sep 17 00:00:00 2001 From: Jonathan Green Date: Thu, 15 Feb 2024 12:00:32 -0400 Subject: [PATCH] =?UTF-8?q?Remove=20Work.to=5Fsearch=5Fdocuments=5F=5FDONO?= =?UTF-8?q?TUSE=20function=20=F0=9F=94=A5=20(PP-939)=20(#1676)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Remove old function marked DONOTUSE * Remove other test * fix list_id --- core/model/work.py | 378 +---------------------------- tests/core/test_external_search.py | 173 ++++++++----- 2 files changed, 112 insertions(+), 439 deletions(-) diff --git a/core/model/work.py b/core/model/work.py index 36f59c3107..c343cc9ed8 100644 --- a/core/model/work.py +++ b/core/model/work.py @@ -27,7 +27,7 @@ from sqlalchemy.orm import Mapped, contains_eager, joinedload, relationship from sqlalchemy.orm.base import NO_VALUE from sqlalchemy.orm.session import Session -from sqlalchemy.sql.expression import and_, case, join, literal_column, or_, select +from sqlalchemy.sql.expression import and_, case, join, literal_column, select from sqlalchemy.sql.functions import func from core.classifier import Classifier, WorkClassifier @@ -42,7 +42,7 @@ ) from core.model.classification import Classification, Subject from core.model.constants import DataSourceConstants -from core.model.contributor import Contribution, Contributor +from core.model.contributor import Contribution from core.model.coverage import CoverageRecord, WorkCoverageRecord from core.model.datasource import DataSource from core.model.edition import Edition @@ -1688,380 +1688,6 @@ def _set_value(parent, key, target): return result - @classmethod - def to_search_documents__DONOTUSE(cls, works, policy=None): - """Generate search documents for these Works. - This is done by constructing an extremely complicated - SQL query. The code is ugly, but it's about 100 times - faster than using python to create documents for - each work individually. When working on the search - index, it's very important for this to be fast. - - :param policy: A PresentationCalculationPolicy to use when - deciding how deep to go to find Identifiers equivalent to - these works. - """ - - if not works: - return [] - - _db = Session.object_session(works[0]) - - # If this is a batch of search documents, postgres needs extra working - # memory to process the query quickly. - if len(works) > 50: - _db.execute("set work_mem='200MB'") - - # This query gets relevant columns from Work and Edition for the Works we're - # interested in. The work_id, edition_id, and identifier_id columns are used - # by other subqueries to filter, and the remaining columns are used directly - # to create the json document. 
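        # A rough sketch of the shape each row ultimately takes (field list
        # abridged; an editorial illustration inferred from the labels used
        # below, not actual output):
        #
        #     {
        #         "_id": 42,
        #         "work_id": 42,
        #         "title": "...",
        #         "author": "...",
        #         "fiction": "Fiction",
        #         "licensepools": [{...}],
        #         "customlists": [{...}],
        #         "contributors": [{...}],
        #         "identifiers": [{...}],
        #         "classifications": [{...}],
        #         "genres": [{...}],
        #         "target_age": {"lower": 8, "upper": 12},
        #     }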
- works_alias = ( - select( - [ - Work.id.label("work_id"), - Edition.id.label("edition_id"), - Edition.primary_identifier_id.label("identifier_id"), - Edition.title, - Edition.subtitle, - Edition.series, - Edition.series_position, - Edition.language, - Edition.sort_title, - Edition.author, - Edition.sort_author, - Edition.medium, - Edition.publisher, - Edition.imprint, - Edition.permanent_work_id, - Work.fiction, - Work.audience, - Work.summary_text, - Work.quality, - Work.rating, - Work.popularity, - Work.presentation_ready, - Work.presentation_edition_id, - func.extract( - "EPOCH", - Work.last_update_time, - ).label("last_update_time"), - ], - Work.id.in_(w.id for w in works), - ) - .select_from( - join(Work, Edition, Work.presentation_edition_id == Edition.id) - ) - .alias("works_alias") - ) - - work_id_column = literal_column( - works_alias.name + "." + works_alias.c.work_id.name - ) - - work_presentation_edition_id_column = literal_column( - works_alias.name + "." + works_alias.c.presentation_edition_id.name - ) - - work_quality_column = literal_column( - works_alias.name + "." + works_alias.c.quality.name - ) - - def query_to_json(query): - """Convert the results of a query to a JSON object.""" - return select([func.row_to_json(literal_column(query.name))]).select_from( - query - ) - - def query_to_json_array(query): - """Convert the results of a query into a JSON array.""" - return select( - [ - func.array_to_json( - func.array_agg(func.row_to_json(literal_column(query.name))) - ) - ] - ).select_from(query) - - # This subquery gets Collection IDs for collections - # that own more than zero licenses for this book. - from core.model.classification import Genre, Subject - from core.model.customlist import CustomListEntry - from core.model.licensing import LicensePool - - # We need information about LicensePools for a few reasons: - # - # * We always want to filter out Works that are not available - # in any of the collections associated with a given library - # -- either because no licenses are owned, because the - # LicensePools are suppressed, or (TODO) because there are no - # delivery mechanisms. - # * A patron may want to sort a list of books by availability - # date. - # * A patron may want to show only books currently available, - # or only open-access books. - # - # Whenever LicensePool.open_access is changed, or - # licenses_available moves to zero or away from zero, the - # LicensePool signals that its Work needs reindexing. - # - # The work quality field is stored in the main document, but - # it's also stored here, so that we can apply a nested filter - # that combines quality with other fields found only in the subdocument. - - def explicit_bool(label, t): - # Ensure we always generate True/False instead of - # True/None. Opensearch can't filter on null values. 
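            # For instance (a sketch of the generated SQL, not taken from
            # this patch): explicit_bool("available", t) compiles to
            # something along the lines of
            #
            #     CASE WHEN t THEN true ELSE false END AS available
            #
            # so a NULL condition is stored as false rather than omitted.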
- return case([(t, True)], else_=False).label(label) - - licensepools = ( - select( - [ - LicensePool.id.label("licensepool_id"), - LicensePool.data_source_id.label("data_source_id"), - LicensePool.collection_id.label("collection_id"), - LicensePool.open_access.label("open_access"), - LicensePool.suppressed, - explicit_bool( - "available", - or_( - LicensePool.unlimited_access, - LicensePool.licenses_available > 0, - ), - ), - explicit_bool( - "licensed", - or_( - LicensePool.unlimited_access, - LicensePool.licenses_owned > 0, - ), - ), - work_quality_column, - Edition.medium, - func.extract( - "EPOCH", - LicensePool.availability_time, - ).label("availability_time"), - ] - ) - .where( - and_( - LicensePool.work_id == work_id_column, - work_presentation_edition_id_column == Edition.id, - or_( - LicensePool.open_access, - LicensePool.unlimited_access, - LicensePool.licenses_owned > 0, - ), - ) - ) - .alias("licensepools_subquery") - ) - licensepools_json = query_to_json_array(licensepools) - - # This subquery gets CustomList IDs for all lists - # that contain the work. - # - # We also keep track of whether the work is featured on each - # list. This is used when determining which works should be - # featured for a lane based on CustomLists. - # - # And we keep track of the first time the work appears on the list. - # This is used when generating a crawlable feed for the customlist, - # which is ordered by a work's first appearance on the list. - customlists = ( - select( - [ - CustomListEntry.list_id.label("list_id"), - CustomListEntry.featured.label("featured"), - func.extract( - "EPOCH", - CustomListEntry.first_appearance, - ).label("first_appearance"), - ] - ) - .where(CustomListEntry.work_id == work_id_column) - .alias("listentries_subquery") - ) - customlists_json = query_to_json_array(customlists) - - # This subquery gets Contributors, filtered on edition_id. - contributors = ( - select( - [ - Contributor.sort_name, - Contributor.display_name, - Contributor.family_name, - Contributor.lc, - Contributor.viaf, - Contribution.role, - ] - ) - .where( - Contribution.edition_id - == literal_column( - works_alias.name + "." + works_alias.c.edition_id.name - ) - ) - .select_from( - join( - Contributor, - Contribution, - Contributor.id == Contribution.contributor_id, - ) - ) - .alias("contributors_subquery") - ) - contributors_json = query_to_json_array(contributors) - - # Use a subquery to get recursively equivalent Identifiers - # for the Edition's primary_identifier_id. - # - # NOTE: we don't reliably reindex works when this information - # changes, but it's not critical that this information be - # totally up to date -- we only use it for subject searches - # and recommendations. The index is completely rebuilt once a - # day, and that's good enough. - equivalent_identifiers = Identifier.recursively_equivalent_identifier_ids_query( - literal_column(works_alias.name + "." + works_alias.c.identifier_id.name), - policy=policy, - ).alias("equivalent_identifiers_subquery") - - identifiers = ( - select( - [ - Identifier.identifier.label("identifier"), - Identifier.type.label("type"), - ] - ) - .where(Identifier.id.in_(equivalent_identifiers)) - .alias("identifier_subquery") - ) - identifiers_json = query_to_json_array(identifiers) - - # Map our constants for Subject type to their URIs. 
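        # Sketch of the resulting expression (illustrative only): one CASE
        # arm per entry in Subject.uri_lookup, roughly
        #
        #     CASE WHEN subjects.type = '<type constant>' THEN '<scheme URI>' ... END
        #
        # so documents carry stable scheme URIs instead of internal constants.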
- scheme_column = case( - [ - (Subject.type == key, literal_column("'%s'" % val)) - for key, val in list(Subject.uri_lookup.items()) - ] - ) - - # If the Subject has a name, use that, otherwise use the Subject's identifier. - # Also, 3M's classifications have slashes, e.g. "FICTION/Adventure". Make sure - # we get separated words for search. - term_column = func.replace( - case([(Subject.name != None, Subject.name)], else_=Subject.identifier), - "/", - " ", - ) - - # Normalize by dividing each weight by the sum of the weights for that Identifier's Classifications. - from core.model.classification import Classification - - weight_column = ( - func.sum(Classification.weight) - / func.sum(func.sum(Classification.weight)).over() - ) - - # The subquery for Subjects, with those three columns. The labels will become keys in json objects. - subjects = ( - select( - [ - scheme_column.label("scheme"), - term_column.label("term"), - weight_column.label("weight"), - ], - # Only include Subjects with terms that are useful for search. - and_(Subject.type.in_(Subject.TYPES_FOR_SEARCH), term_column != None), - ) - .group_by(scheme_column, term_column) - .where(Classification.identifier_id.in_(equivalent_identifiers)) - .select_from( - join(Classification, Subject, Classification.subject_id == Subject.id) - ) - .alias("subjects_subquery") - ) - subjects_json = query_to_json_array(subjects) - - # Subquery for genres. - genres = ( - select( - # All Genres have the same scheme - the simplified genre URI. - [ - literal_column("'%s'" % Subject.SIMPLIFIED_GENRE).label("scheme"), - Genre.name, - Genre.id.label("term"), - WorkGenre.affinity.label("weight"), - ] - ) - .where( - WorkGenre.work_id - == literal_column(works_alias.name + "." + works_alias.c.work_id.name) - ) - .select_from(join(WorkGenre, Genre, WorkGenre.genre_id == Genre.id)) - .alias("genres_subquery") - ) - genres_json = query_to_json_array(genres) - - target_age = cls.target_age_query( - literal_column(works_alias.name + "." + works_alias.c.work_id.name) - ).alias("target_age_subquery") - target_age_json = query_to_json(target_age) - - # Now, create a query that brings together everything we need for the final - # search document. - search_data = ( - select( - [ - works_alias.c.work_id.label("_id"), - works_alias.c.work_id.label("work_id"), - works_alias.c.title, - works_alias.c.sort_title, - works_alias.c.subtitle, - works_alias.c.series, - works_alias.c.series_position, - works_alias.c.language, - works_alias.c.author, - works_alias.c.sort_author, - works_alias.c.medium, - works_alias.c.publisher, - works_alias.c.imprint, - works_alias.c.permanent_work_id, - works_alias.c.presentation_ready, - works_alias.c.last_update_time, - # Convert true/false to "Fiction"/"Nonfiction". - case( - [(works_alias.c.fiction == True, literal_column("'Fiction'"))], - else_=literal_column("'Nonfiction'"), - ).label("fiction"), - # Replace "Young Adult" with "YoungAdult" and "Adults Only" with "AdultsOnly". - func.replace(works_alias.c.audience, " ", "").label("audience"), - works_alias.c.summary_text.label("summary"), - works_alias.c.quality, - works_alias.c.rating, - works_alias.c.popularity, - # Here are all the subqueries. 
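                    # Each label below becomes a nested JSON value built by
                    # query_to_json_array above, e.g. (illustration only,
                    # not actual output):
                    #
                    #     "licensepools": [{"licensepool_id": 7, "available": true, ...}]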
-                    licensepools_json.label("licensepools"),
-                    customlists_json.label("customlists"),
-                    contributors_json.label("contributors"),
-                    identifiers_json.label("identifiers"),
-                    subjects_json.label("classifications"),
-                    genres_json.label("genres"),
-                    target_age_json.label("target_age"),
-                ]
-            )
-            .select_from(works_alias)
-            .alias("search_data_subquery")
-        )
-
-        # Finally, convert everything to json.
-        search_json = query_to_json(search_data)
-
-        result = _db.execute(search_json)
-        if result:
-            return [r[0] for r in result]
-
     @classmethod
     def target_age_query(self, foreign_work_id_field):
         # If the upper limit of the target age is inclusive, we leave
diff --git a/tests/core/test_external_search.py b/tests/core/test_external_search.py
index 5e729a3c41..bd60ba9472 100644
--- a/tests/core/test_external_search.py
+++ b/tests/core/test_external_search.py
@@ -4,6 +4,7 @@
 import uuid
 from collections.abc import Callable, Collection
 from datetime import datetime
+from typing import Any
 from unittest.mock import MagicMock
 
 import pytest
@@ -48,6 +49,7 @@
     LicensePool,
     WorkCoverageRecord,
     get_one_or_create,
+    numericrange_to_tuple,
 )
 from core.model.classification import Subject
 from core.model.work import Work
@@ -60,11 +62,7 @@
 from core.search.v5 import SearchV5
 from core.util.cache import CachedData
 from core.util.datetime_helpers import datetime_utc, from_timestamp
-from tests.fixtures.database import (
-    DatabaseTransactionFixture,
-    DBStatementCounter,
-    PerfTimer,
-)
+from tests.fixtures.database import DatabaseTransactionFixture
 from tests.fixtures.library import LibraryFixture
 from tests.fixtures.search import (
     EndToEndSearchFixture,
@@ -4727,9 +4725,7 @@ def test_operation(
         assert WorkCoverageRecord.UPDATE_SEARCH_INDEX_OPERATION == provider.operation
 
     def test_to_search_document(self, db: DatabaseTransactionFixture):
-        """Compare the new and old to_search_document functions
-        TODO: classifications
-        """
+        """Test the output of the to_search_document method."""
         customlist, editions = db.customlist()
         works = [
             db.work(
@@ -4752,62 +4748,113 @@ def test_to_search_document(self, db: DatabaseTransactionFixture):
         subject1.genre = genre1
         subject2 = db.subject(type=Subject.SIMPLIFIED_GENRE, identifier="subject2")
         subject2.genre = genre2
-        source = DataSource.lookup(db.session, DataSource.AXIS_360)
-
-        # works.extend([transaction.work() for i in range(500)])
-
-        result = Work.to_search_documents__DONOTUSE(works)
-        inapp = Work.to_search_documents(works)
-
-        # Top level keys should be the same
-        assert len(result) == len(inapp)
-
-        inapp_work1 = list(filter(lambda x: x["work_id"] == work1.id, inapp))[0]
-        inapp_work2 = list(filter(lambda x: x["work_id"] == work2.id, inapp))[0]
-
-        # target ages
-        assert inapp_work1["target_age"]["lower"] == 19
-        assert inapp_work1["target_age"]["upper"] == 21
-        assert inapp_work2["target_age"]["lower"] == 18
-        assert inapp_work2["target_age"]["upper"] == 99
-
-        assert len(inapp_work1["genres"]) == 1
-        assert inapp_work2["genres"] == None
-
-        assert len(inapp_work1["licensepools"]) == 1
-        assert len(inapp_work2["licensepools"]) == 1  # customlist adds a pool
-
-        assert len(inapp_work2["customlists"]) == 1
-        assert inapp_work1["customlists"] == None
-
-        result_work1 = list(filter(lambda x: x["work_id"] == work1.id, result))[0]
-        result_work2 = list(filter(lambda x: x["work_id"] == work2.id, result))[0]
-        # Every item must be equivalent
-        for result_item, inapp_item in [
-            (result_work1, inapp_work1),
-            (result_work2, inapp_work2),
-        ]:
-            for key in result_item.keys():
-                assert result_item[key] == inapp_item[key]
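        # Note on the replacement assertions below: numericrange_to_tuple
        # (newly imported above) unpacks the NumericRange stored on
        # Work.target_age into plain inclusive bounds. Assuming the usual
        # '[)' storage, NumericRange(18, 100, '[)') yields (18, 99), which
        # is consistent with the 18/99 expectations asserted here.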
-
-    def test_to_search_documents_performance(self, db: DatabaseTransactionFixture):
-        works = [db.work(with_license_pool=True, genre="history") for i in range(20)]
-
-        connection = db.database.connection
-        with DBStatementCounter(connection) as old_counter:
-            with PerfTimer() as t1:
-                result = Work.to_search_documents(works)
-
-        with DBStatementCounter(connection) as new_counter:
-            with PerfTimer() as t2:
-                inapp = Work.to_search_documents(works)
-
-        # Do not be 100x performance
-        assert t2.execution_time < t1.execution_time * 5
-
-        # 4 queries per batch only
-        assert new_counter.get_count() <= 4
+        db.session.flush()
+
+        search_docs = Work.to_search_documents(works)
+
+        search_doc_work1 = list(
+            filter(lambda x: x["work_id"] == work1.id, search_docs)
+        )[0]
+        search_doc_work2 = list(
+            filter(lambda x: x["work_id"] == work2.id, search_docs)
+        )[0]
+
+        def compare(doc: dict[str, Any], work: Work) -> None:
+            assert doc["_id"] == work.id
+            assert doc["work_id"] == work.id
+            assert doc["title"] == work.title
+            assert doc["sort_title"] == work.sort_title
+            assert doc["subtitle"] == work.subtitle
+            assert doc["series"] == work.series
+            assert doc["series_position"] == work.series_position
+            assert doc["language"] == work.language
+            assert doc["author"] == work.author
+            assert doc["sort_author"] == work.sort_author
+            assert doc["medium"] == work.presentation_edition.medium
+            assert doc["publisher"] == work.publisher
+            assert doc["imprint"] == work.imprint
+            assert (
+                doc["permanent_work_id"] == work.presentation_edition.permanent_work_id
+            )
+            assert doc["presentation_ready"] == work.presentation_ready
+            assert doc["last_update_time"] == work.last_update_time
+            assert doc["fiction"] == ("Fiction" if work.fiction else "Nonfiction")
+            assert doc["audience"] == work.audience
+            assert doc["summary"] == work.summary_text
+            assert doc["quality"] == work.quality
+            assert doc["rating"] == work.rating
+            assert doc["popularity"] == work.popularity
+
+            if work.license_pools:
+                assert len(doc["licensepools"]) == len(work.license_pools)
+                for idx, pool in enumerate(work.license_pools):
+                    actual = doc["licensepools"][idx]
+                    assert actual["licensepool_id"] == pool.id
+                    assert actual["data_source_id"] == pool.data_source_id
+                    assert actual["collection_id"] == pool.collection_id
+                    assert actual["open_access"] == pool.open_access
+                    assert actual["suppressed"] == pool.suppressed
+            else:
+                assert doc["licensepools"] is None
+
+            if work.custom_list_entries:
+                assert len(doc["customlists"]) == len(work.custom_list_entries)
+                for idx, custom_list in enumerate(work.custom_list_entries):
+                    actual = doc["customlists"][idx]
+                    assert actual["list_id"] == custom_list.list_id
+                    assert actual["featured"] == custom_list.featured
+            else:
+                assert doc["customlists"] is None
+
+            if work.presentation_edition.contributions:
+                assert len(doc["contributors"]) == len(
+                    work.presentation_edition.contributions
+                )
+                for idx, contribution in enumerate(
+                    work.presentation_edition.contributions
+                ):
+                    actual = doc["contributors"][idx]
+                    assert actual["sort_name"] == contribution.contributor.sort_name
+                    assert (
+                        actual["display_name"] == contribution.contributor.display_name
+                    )
+                    assert actual["lc"] == contribution.contributor.lc
+                    assert actual["viaf"] == contribution.contributor.viaf
+                    assert actual["role"] == contribution.role
+            else:
+                assert doc["contributors"] is None
+
+            if work.identifiers:
+                assert len(doc["identifiers"]) == len(work.identifiers)
+                for idx, identifier in enumerate(work.identifiers):
+                    actual = doc["identifiers"][idx]
+                    assert actual["type"] == identifier.type
+                    assert actual["identifier"] == identifier.identifier
+            else:
+                assert doc["identifiers"] is None
+
+            if work.classifications:
+                assert len(doc["classifications"]) == len(work.classifications)
+            else:
+                assert doc["classifications"] is None
+
+            if work.work_genres:
+                assert len(doc["genres"]) == len(work.work_genres)
+                for idx, genre in enumerate(work.work_genres):
+                    actual = doc["genres"][idx]
+                    assert actual["name"] == genre.genre.name
+                    assert actual["term"] == genre.genre.id
+                    assert actual["weight"] == genre.affinity
+            else:
+                assert doc["genres"] is None
+
+            lower, upper = numericrange_to_tuple(work.target_age)
+            assert doc["target_age"]["lower"] == lower
+            assert doc["target_age"]["upper"] == upper
+
+        compare(search_doc_work1, work1)
+        compare(search_doc_work2, work2)
 
     def test_to_search_documents_with_missing_data(
         self, db: DatabaseTransactionFixture