From 379f029abfb3c8c51f57236cd8b82e1622627830 Mon Sep 17 00:00:00 2001 From: Jonathan Green Date: Thu, 15 Feb 2024 12:00:32 -0400 Subject: [PATCH] =?UTF-8?q?Remove=20Work.to=5Fsearch=5Fdocuments=5F=5FDONO?= =?UTF-8?q?TUSE=20function=20=F0=9F=94=A5=20(PP-939)=20(#1676)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Remove old function marked DONOTUSE * Remove other test * fix list_id --- core/model/work.py | 378 +---------------------------- tests/core/test_external_search.py | 173 ++++++++----- 2 files changed, 112 insertions(+), 439 deletions(-) diff --git a/core/model/work.py b/core/model/work.py index 36f59c3107..c343cc9ed8 100644 --- a/core/model/work.py +++ b/core/model/work.py @@ -27,7 +27,7 @@ from sqlalchemy.orm import Mapped, contains_eager, joinedload, relationship from sqlalchemy.orm.base import NO_VALUE from sqlalchemy.orm.session import Session -from sqlalchemy.sql.expression import and_, case, join, literal_column, or_, select +from sqlalchemy.sql.expression import and_, case, join, literal_column, select from sqlalchemy.sql.functions import func from core.classifier import Classifier, WorkClassifier @@ -42,7 +42,7 @@ ) from core.model.classification import Classification, Subject from core.model.constants import DataSourceConstants -from core.model.contributor import Contribution, Contributor +from core.model.contributor import Contribution from core.model.coverage import CoverageRecord, WorkCoverageRecord from core.model.datasource import DataSource from core.model.edition import Edition @@ -1688,380 +1688,6 @@ def _set_value(parent, key, target): return result - @classmethod - def to_search_documents__DONOTUSE(cls, works, policy=None): - """Generate search documents for these Works. - This is done by constructing an extremely complicated - SQL query. The code is ugly, but it's about 100 times - faster than using python to create documents for - each work individually. When working on the search - index, it's very important for this to be fast. - - :param policy: A PresentationCalculationPolicy to use when - deciding how deep to go to find Identifiers equivalent to - these works. - """ - - if not works: - return [] - - _db = Session.object_session(works[0]) - - # If this is a batch of search documents, postgres needs extra working - # memory to process the query quickly. - if len(works) > 50: - _db.execute("set work_mem='200MB'") - - # This query gets relevant columns from Work and Edition for the Works we're - # interested in. The work_id, edition_id, and identifier_id columns are used - # by other subqueries to filter, and the remaining columns are used directly - # to create the json document. 
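        # A rough sketch of the shape each row ultimately takes (field list
        # abridged; an editorial illustration inferred from the labels used
        # below, not actual output):
        #
        #     {
        #         "_id": 42,
        #         "work_id": 42,
        #         "title": "...",
        #         "author": "...",
        #         "fiction": "Fiction",
        #         "licensepools": [{...}],
        #         "customlists": [{...}],
        #         "contributors": [{...}],
        #         "identifiers": [{...}],
        #         "classifications": [{...}],
        #         "genres": [{...}],
        #         "target_age": {"lower": 8, "upper": 12},
        #     }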
- works_alias = ( - select( - [ - Work.id.label("work_id"), - Edition.id.label("edition_id"), - Edition.primary_identifier_id.label("identifier_id"), - Edition.title, - Edition.subtitle, - Edition.series, - Edition.series_position, - Edition.language, - Edition.sort_title, - Edition.author, - Edition.sort_author, - Edition.medium, - Edition.publisher, - Edition.imprint, - Edition.permanent_work_id, - Work.fiction, - Work.audience, - Work.summary_text, - Work.quality, - Work.rating, - Work.popularity, - Work.presentation_ready, - Work.presentation_edition_id, - func.extract( - "EPOCH", - Work.last_update_time, - ).label("last_update_time"), - ], - Work.id.in_(w.id for w in works), - ) - .select_from( - join(Work, Edition, Work.presentation_edition_id == Edition.id) - ) - .alias("works_alias") - ) - - work_id_column = literal_column( - works_alias.name + "." + works_alias.c.work_id.name - ) - - work_presentation_edition_id_column = literal_column( - works_alias.name + "." + works_alias.c.presentation_edition_id.name - ) - - work_quality_column = literal_column( - works_alias.name + "." + works_alias.c.quality.name - ) - - def query_to_json(query): - """Convert the results of a query to a JSON object.""" - return select([func.row_to_json(literal_column(query.name))]).select_from( - query - ) - - def query_to_json_array(query): - """Convert the results of a query into a JSON array.""" - return select( - [ - func.array_to_json( - func.array_agg(func.row_to_json(literal_column(query.name))) - ) - ] - ).select_from(query) - - # This subquery gets Collection IDs for collections - # that own more than zero licenses for this book. - from core.model.classification import Genre, Subject - from core.model.customlist import CustomListEntry - from core.model.licensing import LicensePool - - # We need information about LicensePools for a few reasons: - # - # * We always want to filter out Works that are not available - # in any of the collections associated with a given library - # -- either because no licenses are owned, because the - # LicensePools are suppressed, or (TODO) because there are no - # delivery mechanisms. - # * A patron may want to sort a list of books by availability - # date. - # * A patron may want to show only books currently available, - # or only open-access books. - # - # Whenever LicensePool.open_access is changed, or - # licenses_available moves to zero or away from zero, the - # LicensePool signals that its Work needs reindexing. - # - # The work quality field is stored in the main document, but - # it's also stored here, so that we can apply a nested filter - # that combines quality with other fields found only in the subdocument. - - def explicit_bool(label, t): - # Ensure we always generate True/False instead of - # True/None. Opensearch can't filter on null values. 
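            # For instance (a sketch of the generated SQL, not taken from
            # this patch): explicit_bool("available", t) compiles to
            # something along the lines of
            #
            #     CASE WHEN t THEN true ELSE false END AS available
            #
            # so a NULL condition is stored as false rather than omitted.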
- return case([(t, True)], else_=False).label(label) - - licensepools = ( - select( - [ - LicensePool.id.label("licensepool_id"), - LicensePool.data_source_id.label("data_source_id"), - LicensePool.collection_id.label("collection_id"), - LicensePool.open_access.label("open_access"), - LicensePool.suppressed, - explicit_bool( - "available", - or_( - LicensePool.unlimited_access, - LicensePool.licenses_available > 0, - ), - ), - explicit_bool( - "licensed", - or_( - LicensePool.unlimited_access, - LicensePool.licenses_owned > 0, - ), - ), - work_quality_column, - Edition.medium, - func.extract( - "EPOCH", - LicensePool.availability_time, - ).label("availability_time"), - ] - ) - .where( - and_( - LicensePool.work_id == work_id_column, - work_presentation_edition_id_column == Edition.id, - or_( - LicensePool.open_access, - LicensePool.unlimited_access, - LicensePool.licenses_owned > 0, - ), - ) - ) - .alias("licensepools_subquery") - ) - licensepools_json = query_to_json_array(licensepools) - - # This subquery gets CustomList IDs for all lists - # that contain the work. - # - # We also keep track of whether the work is featured on each - # list. This is used when determining which works should be - # featured for a lane based on CustomLists. - # - # And we keep track of the first time the work appears on the list. - # This is used when generating a crawlable feed for the customlist, - # which is ordered by a work's first appearance on the list. - customlists = ( - select( - [ - CustomListEntry.list_id.label("list_id"), - CustomListEntry.featured.label("featured"), - func.extract( - "EPOCH", - CustomListEntry.first_appearance, - ).label("first_appearance"), - ] - ) - .where(CustomListEntry.work_id == work_id_column) - .alias("listentries_subquery") - ) - customlists_json = query_to_json_array(customlists) - - # This subquery gets Contributors, filtered on edition_id. - contributors = ( - select( - [ - Contributor.sort_name, - Contributor.display_name, - Contributor.family_name, - Contributor.lc, - Contributor.viaf, - Contribution.role, - ] - ) - .where( - Contribution.edition_id - == literal_column( - works_alias.name + "." + works_alias.c.edition_id.name - ) - ) - .select_from( - join( - Contributor, - Contribution, - Contributor.id == Contribution.contributor_id, - ) - ) - .alias("contributors_subquery") - ) - contributors_json = query_to_json_array(contributors) - - # Use a subquery to get recursively equivalent Identifiers - # for the Edition's primary_identifier_id. - # - # NOTE: we don't reliably reindex works when this information - # changes, but it's not critical that this information be - # totally up to date -- we only use it for subject searches - # and recommendations. The index is completely rebuilt once a - # day, and that's good enough. - equivalent_identifiers = Identifier.recursively_equivalent_identifier_ids_query( - literal_column(works_alias.name + "." + works_alias.c.identifier_id.name), - policy=policy, - ).alias("equivalent_identifiers_subquery") - - identifiers = ( - select( - [ - Identifier.identifier.label("identifier"), - Identifier.type.label("type"), - ] - ) - .where(Identifier.id.in_(equivalent_identifiers)) - .alias("identifier_subquery") - ) - identifiers_json = query_to_json_array(identifiers) - - # Map our constants for Subject type to their URIs. 
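        # Sketch of the resulting expression (illustrative only): one CASE
        # arm per entry in Subject.uri_lookup, roughly
        #
        #     CASE WHEN subjects.type = '<type constant>' THEN '<scheme URI>' ... END
        #
        # so documents carry stable scheme URIs instead of internal constants.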
- scheme_column = case( - [ - (Subject.type == key, literal_column("'%s'" % val)) - for key, val in list(Subject.uri_lookup.items()) - ] - ) - - # If the Subject has a name, use that, otherwise use the Subject's identifier. - # Also, 3M's classifications have slashes, e.g. "FICTION/Adventure". Make sure - # we get separated words for search. - term_column = func.replace( - case([(Subject.name != None, Subject.name)], else_=Subject.identifier), - "/", - " ", - ) - - # Normalize by dividing each weight by the sum of the weights for that Identifier's Classifications. - from core.model.classification import Classification - - weight_column = ( - func.sum(Classification.weight) - / func.sum(func.sum(Classification.weight)).over() - ) - - # The subquery for Subjects, with those three columns. The labels will become keys in json objects. - subjects = ( - select( - [ - scheme_column.label("scheme"), - term_column.label("term"), - weight_column.label("weight"), - ], - # Only include Subjects with terms that are useful for search. - and_(Subject.type.in_(Subject.TYPES_FOR_SEARCH), term_column != None), - ) - .group_by(scheme_column, term_column) - .where(Classification.identifier_id.in_(equivalent_identifiers)) - .select_from( - join(Classification, Subject, Classification.subject_id == Subject.id) - ) - .alias("subjects_subquery") - ) - subjects_json = query_to_json_array(subjects) - - # Subquery for genres. - genres = ( - select( - # All Genres have the same scheme - the simplified genre URI. - [ - literal_column("'%s'" % Subject.SIMPLIFIED_GENRE).label("scheme"), - Genre.name, - Genre.id.label("term"), - WorkGenre.affinity.label("weight"), - ] - ) - .where( - WorkGenre.work_id - == literal_column(works_alias.name + "." + works_alias.c.work_id.name) - ) - .select_from(join(WorkGenre, Genre, WorkGenre.genre_id == Genre.id)) - .alias("genres_subquery") - ) - genres_json = query_to_json_array(genres) - - target_age = cls.target_age_query( - literal_column(works_alias.name + "." + works_alias.c.work_id.name) - ).alias("target_age_subquery") - target_age_json = query_to_json(target_age) - - # Now, create a query that brings together everything we need for the final - # search document. - search_data = ( - select( - [ - works_alias.c.work_id.label("_id"), - works_alias.c.work_id.label("work_id"), - works_alias.c.title, - works_alias.c.sort_title, - works_alias.c.subtitle, - works_alias.c.series, - works_alias.c.series_position, - works_alias.c.language, - works_alias.c.author, - works_alias.c.sort_author, - works_alias.c.medium, - works_alias.c.publisher, - works_alias.c.imprint, - works_alias.c.permanent_work_id, - works_alias.c.presentation_ready, - works_alias.c.last_update_time, - # Convert true/false to "Fiction"/"Nonfiction". - case( - [(works_alias.c.fiction == True, literal_column("'Fiction'"))], - else_=literal_column("'Nonfiction'"), - ).label("fiction"), - # Replace "Young Adult" with "YoungAdult" and "Adults Only" with "AdultsOnly". - func.replace(works_alias.c.audience, " ", "").label("audience"), - works_alias.c.summary_text.label("summary"), - works_alias.c.quality, - works_alias.c.rating, - works_alias.c.popularity, - # Here are all the subqueries. 
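                    # Each label below becomes a nested JSON value built by
                    # query_to_json_array above, e.g. (illustration only,
                    # not actual output):
                    #
                    #     "licensepools": [{"licensepool_id": 7, "available": true, ...}]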
-                    licensepools_json.label("licensepools"),
-                    customlists_json.label("customlists"),
-                    contributors_json.label("contributors"),
-                    identifiers_json.label("identifiers"),
-                    subjects_json.label("classifications"),
-                    genres_json.label("genres"),
-                    target_age_json.label("target_age"),
-                ]
-            )
-            .select_from(works_alias)
-            .alias("search_data_subquery")
-        )
-
-        # Finally, convert everything to json.
-        search_json = query_to_json(search_data)
-
-        result = _db.execute(search_json)
-        if result:
-            return [r[0] for r in result]
-
     @classmethod
     def target_age_query(self, foreign_work_id_field):
         # If the upper limit of the target age is inclusive, we leave
diff --git a/tests/core/test_external_search.py b/tests/core/test_external_search.py
index 5e729a3c41..bd60ba9472 100644
--- a/tests/core/test_external_search.py
+++ b/tests/core/test_external_search.py
@@ -4,6 +4,7 @@
 import uuid
 from collections.abc import Callable, Collection
 from datetime import datetime
+from typing import Any
 from unittest.mock import MagicMock
 
 import pytest
@@ -48,6 +49,7 @@
     LicensePool,
     WorkCoverageRecord,
     get_one_or_create,
+    numericrange_to_tuple,
 )
 from core.model.classification import Subject
 from core.model.work import Work
@@ -60,11 +62,7 @@
 from core.search.v5 import SearchV5
 from core.util.cache import CachedData
 from core.util.datetime_helpers import datetime_utc, from_timestamp
-from tests.fixtures.database import (
-    DatabaseTransactionFixture,
-    DBStatementCounter,
-    PerfTimer,
-)
+from tests.fixtures.database import DatabaseTransactionFixture
 from tests.fixtures.library import LibraryFixture
 from tests.fixtures.search import (
     EndToEndSearchFixture,
@@ -4727,9 +4725,7 @@ def test_operation(
         assert WorkCoverageRecord.UPDATE_SEARCH_INDEX_OPERATION == provider.operation
 
     def test_to_search_document(self, db: DatabaseTransactionFixture):
-        """Compare the new and old to_search_document functions
-        TODO: classifications
-        """
+        """Test the output of the to_search_document method."""
         customlist, editions = db.customlist()
         works = [
             db.work(
@@ -4752,62 +4748,113 @@ def test_to_search_document(self, db: DatabaseTransactionFixture):
         subject1.genre = genre1
         subject2 = db.subject(type=Subject.SIMPLIFIED_GENRE, identifier="subject2")
         subject2.genre = genre2
-        source = DataSource.lookup(db.session, DataSource.AXIS_360)
-
-        # works.extend([transaction.work() for i in range(500)])
-
-        result = Work.to_search_documents__DONOTUSE(works)
-        inapp = Work.to_search_documents(works)
-
-        # Top level keys should be the same
-        assert len(result) == len(inapp)
-
-        inapp_work1 = list(filter(lambda x: x["work_id"] == work1.id, inapp))[0]
-        inapp_work2 = list(filter(lambda x: x["work_id"] == work2.id, inapp))[0]
-
-        # target ages
-        assert inapp_work1["target_age"]["lower"] == 19
-        assert inapp_work1["target_age"]["upper"] == 21
-        assert inapp_work2["target_age"]["lower"] == 18
-        assert inapp_work2["target_age"]["upper"] == 99
-
-        assert len(inapp_work1["genres"]) == 1
-        assert inapp_work2["genres"] == None
-
-        assert len(inapp_work1["licensepools"]) == 1
-        assert len(inapp_work2["licensepools"]) == 1  # customlist adds a pool
-
-        assert len(inapp_work2["customlists"]) == 1
-        assert inapp_work1["customlists"] == None
-
-        result_work1 = list(filter(lambda x: x["work_id"] == work1.id, result))[0]
-        result_work2 = list(filter(lambda x: x["work_id"] == work2.id, result))[0]
-        # Every item must be equivalent
-        for result_item, inapp_item in [
-            (result_work1, inapp_work1),
-            (result_work2, inapp_work2),
-        ]:
-            for key in result_item.keys():
-                assert result_item[key] == inapp_item[key]
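        # Note on the replacement assertions below: numericrange_to_tuple
        # (newly imported above) unpacks the NumericRange stored on
        # Work.target_age into plain inclusive bounds. Assuming the usual
        # '[)' storage, NumericRange(18, 100, '[)') yields (18, 99), which
        # is consistent with the 18/99 expectations asserted here.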
-
-    def test_to_search_documents_performance(self, db: DatabaseTransactionFixture):
-        works = [db.work(with_license_pool=True, genre="history") for i in range(20)]
-
-        connection = db.database.connection
-        with DBStatementCounter(connection) as old_counter:
-            with PerfTimer() as t1:
-                result = Work.to_search_documents(works)
-
-        with DBStatementCounter(connection) as new_counter:
-            with PerfTimer() as t2:
-                inapp = Work.to_search_documents(works)
-
-        # Do not be 100x performance
-        assert t2.execution_time < t1.execution_time * 5
-
-        # 4 queries per batch only
-        assert new_counter.get_count() <= 4
+        db.session.flush()
+
+        search_docs = Work.to_search_documents(works)
+
+        search_doc_work1 = list(
+            filter(lambda x: x["work_id"] == work1.id, search_docs)
+        )[0]
+        search_doc_work2 = list(
+            filter(lambda x: x["work_id"] == work2.id, search_docs)
+        )[0]
+
+        def compare(doc: dict[str, Any], work: Work) -> None:
+            assert doc["_id"] == work.id
+            assert doc["work_id"] == work.id
+            assert doc["title"] == work.title
+            assert doc["sort_title"] == work.sort_title
+            assert doc["subtitle"] == work.subtitle
+            assert doc["series"] == work.series
+            assert doc["series_position"] == work.series_position
+            assert doc["language"] == work.language
+            assert doc["author"] == work.author
+            assert doc["sort_author"] == work.sort_author
+            assert doc["medium"] == work.presentation_edition.medium
+            assert doc["publisher"] == work.publisher
+            assert doc["imprint"] == work.imprint
+            assert (
+                doc["permanent_work_id"] == work.presentation_edition.permanent_work_id
+            )
+            assert doc["presentation_ready"] == work.presentation_ready
+            assert doc["last_update_time"] == work.last_update_time
+            assert doc["fiction"] == ("Fiction" if work.fiction else "Nonfiction")
+            assert doc["audience"] == work.audience
+            assert doc["summary"] == work.summary_text
+            assert doc["quality"] == work.quality
+            assert doc["rating"] == work.rating
+            assert doc["popularity"] == work.popularity
+
+            if work.license_pools:
+                assert len(doc["licensepools"]) == len(work.license_pools)
+                for idx, pool in enumerate(work.license_pools):
+                    actual = doc["licensepools"][idx]
+                    assert actual["licensepool_id"] == pool.id
+                    assert actual["data_source_id"] == pool.data_source_id
+                    assert actual["collection_id"] == pool.collection_id
+                    assert actual["open_access"] == pool.open_access
+                    assert actual["suppressed"] == pool.suppressed
+            else:
+                assert doc["licensepools"] is None
+
+            if work.custom_list_entries:
+                assert len(doc["customlists"]) == len(work.custom_list_entries)
+                for idx, custom_list in enumerate(work.custom_list_entries):
+                    actual = doc["customlists"][idx]
+                    assert actual["list_id"] == custom_list.list_id
+                    assert actual["featured"] == custom_list.featured
+            else:
+                assert doc["customlists"] is None
+
+            if work.presentation_edition.contributions:
+                assert len(doc["contributors"]) == len(
+                    work.presentation_edition.contributions
+                )
+                for idx, contribution in enumerate(
+                    work.presentation_edition.contributions
+                ):
+                    actual = doc["contributors"][idx]
+                    assert actual["sort_name"] == contribution.contributor.sort_name
+                    assert (
+                        actual["display_name"] == contribution.contributor.display_name
+                    )
+                    assert actual["lc"] == contribution.contributor.lc
+                    assert actual["viaf"] == contribution.contributor.viaf
+                    assert actual["role"] == contribution.role
+            else:
+                assert doc["contributors"] is None
+
+            if work.identifiers:
+                assert len(doc["identifiers"]) == len(work.identifiers)
+                for idx, identifier in enumerate(work.identifiers):
+                    actual = doc["identifiers"][idx]
+                    assert actual["type"] == identifier.type
+                    assert actual["identifier"] == identifier.identifier
+            else:
+                assert doc["identifiers"] is None
+
+            if work.classifications:
+                assert len(doc["classifications"]) == len(work.classifications)
+            else:
+                assert doc["classifications"] is None
+
+            if work.work_genres:
+                assert len(doc["genres"]) == len(work.work_genres)
+                for idx, genre in enumerate(work.work_genres):
+                    actual = doc["genres"][idx]
+                    assert actual["name"] == genre.genre.name
+                    assert actual["term"] == genre.genre.id
+                    assert actual["weight"] == genre.affinity
+            else:
+                assert doc["genres"] is None
+
+            lower, upper = numericrange_to_tuple(work.target_age)
+            assert doc["target_age"]["lower"] == lower
+            assert doc["target_age"]["upper"] == upper
+
+        compare(search_doc_work1, work1)
+        compare(search_doc_work2, work2)
 
     def test_to_search_documents_with_missing_data(
         self, db: DatabaseTransactionFixture