Remove Work.to_search_documents__DONOTUSE function 🔥 (PP-939) (#1676)
* Remove old function marked DONOTUSE
* Remove other test
* Fix list_id
jonathangreen authored Feb 15, 2024
1 parent bd88741 commit 379f029
Showing 2 changed files with 112 additions and 439 deletions.
378 changes: 2 additions & 376 deletions core/model/work.py
@@ -27,7 +27,7 @@
from sqlalchemy.orm import Mapped, contains_eager, joinedload, relationship
from sqlalchemy.orm.base import NO_VALUE
from sqlalchemy.orm.session import Session
-from sqlalchemy.sql.expression import and_, case, join, literal_column, or_, select
+from sqlalchemy.sql.expression import and_, case, join, literal_column, select
from sqlalchemy.sql.functions import func

from core.classifier import Classifier, WorkClassifier
@@ -42,7 +42,7 @@
)
from core.model.classification import Classification, Subject
from core.model.constants import DataSourceConstants
-from core.model.contributor import Contribution, Contributor
+from core.model.contributor import Contribution
from core.model.coverage import CoverageRecord, WorkCoverageRecord
from core.model.datasource import DataSource
from core.model.edition import Edition
@@ -1688,380 +1688,6 @@ def _set_value(parent, key, target):

return result

@classmethod
def to_search_documents__DONOTUSE(cls, works, policy=None):
"""Generate search documents for these Works.
This is done by constructing an extremely complicated
SQL query. The code is ugly, but it's about 100 times
faster than using python to create documents for
each work individually. When working on the search
index, it's very important for this to be fast.
:param policy: A PresentationCalculationPolicy to use when
deciding how deep to go to find Identifiers equivalent to
these works.
"""

if not works:
return []

_db = Session.object_session(works[0])

# If this is a batch of search documents, postgres needs extra working
# memory to process the query quickly.
if len(works) > 50:
_db.execute("set work_mem='200MB'")

# This query gets relevant columns from Work and Edition for the Works we're
# interested in. The work_id, edition_id, and identifier_id columns are used
# by other subqueries to filter, and the remaining columns are used directly
# to create the json document.
works_alias = (
select(
[
Work.id.label("work_id"),
Edition.id.label("edition_id"),
Edition.primary_identifier_id.label("identifier_id"),
Edition.title,
Edition.subtitle,
Edition.series,
Edition.series_position,
Edition.language,
Edition.sort_title,
Edition.author,
Edition.sort_author,
Edition.medium,
Edition.publisher,
Edition.imprint,
Edition.permanent_work_id,
Work.fiction,
Work.audience,
Work.summary_text,
Work.quality,
Work.rating,
Work.popularity,
Work.presentation_ready,
Work.presentation_edition_id,
func.extract(
"EPOCH",
Work.last_update_time,
).label("last_update_time"),
],
Work.id.in_(w.id for w in works),
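# (In SQLAlchemy 1.x, the second positional argument to select()
# is the WHERE clause.)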
)
.select_from(
join(Work, Edition, Work.presentation_edition_id == Edition.id)
)
.alias("works_alias")
)
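# The aliased columns are re-derived below as literal_columns
# ("works_alias.<name>") so the correlated subqueries can reference
# the outer query by raw SQL name.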

work_id_column = literal_column(
works_alias.name + "." + works_alias.c.work_id.name
)

work_presentation_edition_id_column = literal_column(
works_alias.name + "." + works_alias.c.presentation_edition_id.name
)

work_quality_column = literal_column(
works_alias.name + "." + works_alias.c.quality.name
)

def query_to_json(query):
"""Convert the results of a query to a JSON object."""
return select([func.row_to_json(literal_column(query.name))]).select_from(
query
)

def query_to_json_array(query):
"""Convert the results of a query into a JSON array."""
return select(
[
func.array_to_json(
func.array_agg(func.row_to_json(literal_column(query.name)))
)
]
).select_from(query)
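# Roughly: SELECT array_to_json(array_agg(row_to_json(q))) FROM q.
# When the subquery returns no rows, array_agg yields NULL, so the
# result is a JSON null rather than an empty array.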

# This subquery gets Collection IDs for collections
# that own more than zero licenses for this book.
from core.model.classification import Genre, Subject
from core.model.customlist import CustomListEntry
from core.model.licensing import LicensePool

# We need information about LicensePools for a few reasons:
#
# * We always want to filter out Works that are not available
# in any of the collections associated with a given library
# -- either because no licenses are owned, because the
# LicensePools are suppressed, or (TODO) because there are no
# delivery mechanisms.
# * A patron may want to sort a list of books by availability
# date.
# * A patron may want to show only books currently available,
# or only open-access books.
#
# Whenever LicensePool.open_access is changed, or
# licenses_available moves to zero or away from zero, the
# LicensePool signals that its Work needs reindexing.
#
# The work quality field is stored in the main document, but
# it's also stored here, so that we can apply a nested filter
# that combines quality with other fields found only in the subdocument.

def explicit_bool(label, t):
# Ensure we always generate True/False instead of
# True/None. Opensearch can't filter on null values.
return case([(t, True)], else_=False).label(label)
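# e.g. explicit_bool("available", <condition>) renders as
# CASE WHEN <condition> THEN true ELSE false END AS available.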

licensepools = (
select(
[
LicensePool.id.label("licensepool_id"),
LicensePool.data_source_id.label("data_source_id"),
LicensePool.collection_id.label("collection_id"),
LicensePool.open_access.label("open_access"),
LicensePool.suppressed,
explicit_bool(
"available",
or_(
LicensePool.unlimited_access,
LicensePool.licenses_available > 0,
),
),
explicit_bool(
"licensed",
or_(
LicensePool.unlimited_access,
LicensePool.licenses_owned > 0,
),
),
work_quality_column,
Edition.medium,
func.extract(
"EPOCH",
LicensePool.availability_time,
).label("availability_time"),
]
)
.where(
and_(
LicensePool.work_id == work_id_column,
work_presentation_edition_id_column == Edition.id,
or_(
LicensePool.open_access,
LicensePool.unlimited_access,
LicensePool.licenses_owned > 0,
),
)
)
.alias("licensepools_subquery")
)
licensepools_json = query_to_json_array(licensepools)

# This subquery gets CustomList IDs for all lists
# that contain the work.
#
# We also keep track of whether the work is featured on each
# list. This is used when determining which works should be
# featured for a lane based on CustomLists.
#
# And we keep track of the first time the work appears on the list.
# This is used when generating a crawlable feed for the customlist,
# which is ordered by a work's first appearance on the list.
customlists = (
select(
[
CustomListEntry.list_id.label("list_id"),
CustomListEntry.featured.label("featured"),
func.extract(
"EPOCH",
CustomListEntry.first_appearance,
).label("first_appearance"),
]
)
.where(CustomListEntry.work_id == work_id_column)
.alias("listentries_subquery")
)
customlists_json = query_to_json_array(customlists)

# This subquery gets Contributors, filtered on edition_id.
contributors = (
select(
[
Contributor.sort_name,
Contributor.display_name,
Contributor.family_name,
Contributor.lc,
Contributor.viaf,
Contribution.role,
]
)
.where(
Contribution.edition_id
== literal_column(
works_alias.name + "." + works_alias.c.edition_id.name
)
)
.select_from(
join(
Contributor,
Contribution,
Contributor.id == Contribution.contributor_id,
)
)
.alias("contributors_subquery")
)
contributors_json = query_to_json_array(contributors)

# Use a subquery to get recursively equivalent Identifiers
# for the Edition's primary_identifier_id.
#
# NOTE: we don't reliably reindex works when this information
# changes, but it's not critical that this information be
# totally up to date -- we only use it for subject searches
# and recommendations. The index is completely rebuilt once a
# day, and that's good enough.
equivalent_identifiers = Identifier.recursively_equivalent_identifier_ids_query(
literal_column(works_alias.name + "." + works_alias.c.identifier_id.name),
policy=policy,
).alias("equivalent_identifiers_subquery")

identifiers = (
select(
[
Identifier.identifier.label("identifier"),
Identifier.type.label("type"),
]
)
.where(Identifier.id.in_(equivalent_identifiers))
.alias("identifier_subquery")
)
identifiers_json = query_to_json_array(identifiers)

# Map our constants for Subject type to their URIs.
scheme_column = case(
[
(Subject.type == key, literal_column("'%s'" % val))
for key, val in list(Subject.uri_lookup.items())
]
)
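# This renders as a CASE expression with one WHEN branch per entry
# in Subject.uri_lookup, mapping each Subject.type constant to the
# corresponding URI string.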

# If the Subject has a name, use that, otherwise use the Subject's identifier.
# Also, 3M's classifications have slashes, e.g. "FICTION/Adventure". Make sure
# we get separated words for search.
term_column = func.replace(
case([(Subject.name != None, Subject.name)], else_=Subject.identifier),
"/",
" ",
)

# Normalize by dividing each weight by the sum of the weights for that Identifier's Classifications.
from core.model.classification import Classification

weight_column = (
func.sum(Classification.weight)
/ func.sum(func.sum(Classification.weight)).over()
)
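# The inner sum() aggregates Classification.weight within each
# (scheme, term) group; wrapping it in sum(...).over() computes a
# window total across all groups, so each group's weight becomes its
# share of the overall weight for this work's identifiers.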

# The subquery for Subjects, with those three columns. The labels will become keys in json objects.
subjects = (
select(
[
scheme_column.label("scheme"),
term_column.label("term"),
weight_column.label("weight"),
],
# Only include Subjects with terms that are useful for search.
and_(Subject.type.in_(Subject.TYPES_FOR_SEARCH), term_column != None),
)
.group_by(scheme_column, term_column)
.where(Classification.identifier_id.in_(equivalent_identifiers))
.select_from(
join(Classification, Subject, Classification.subject_id == Subject.id)
)
.alias("subjects_subquery")
)
subjects_json = query_to_json_array(subjects)

# Subquery for genres.
genres = (
select(
# All Genres have the same scheme - the simplified genre URI.
[
literal_column("'%s'" % Subject.SIMPLIFIED_GENRE).label("scheme"),
Genre.name,
Genre.id.label("term"),
WorkGenre.affinity.label("weight"),
]
)
.where(
WorkGenre.work_id
== literal_column(works_alias.name + "." + works_alias.c.work_id.name)
)
.select_from(join(WorkGenre, Genre, WorkGenre.genre_id == Genre.id))
.alias("genres_subquery")
)
genres_json = query_to_json_array(genres)

target_age = cls.target_age_query(
literal_column(works_alias.name + "." + works_alias.c.work_id.name)
).alias("target_age_subquery")
target_age_json = query_to_json(target_age)

# Now, create a query that brings together everything we need for the final
# search document.
search_data = (
select(
[
works_alias.c.work_id.label("_id"),
works_alias.c.work_id.label("work_id"),
works_alias.c.title,
works_alias.c.sort_title,
works_alias.c.subtitle,
works_alias.c.series,
works_alias.c.series_position,
works_alias.c.language,
works_alias.c.author,
works_alias.c.sort_author,
works_alias.c.medium,
works_alias.c.publisher,
works_alias.c.imprint,
works_alias.c.permanent_work_id,
works_alias.c.presentation_ready,
works_alias.c.last_update_time,
# Convert true/false to "Fiction"/"Nonfiction".
case(
[(works_alias.c.fiction == True, literal_column("'Fiction'"))],
else_=literal_column("'Nonfiction'"),
).label("fiction"),
# Replace "Young Adult" with "YoungAdult" and "Adults Only" with "AdultsOnly".
func.replace(works_alias.c.audience, " ", "").label("audience"),
works_alias.c.summary_text.label("summary"),
works_alias.c.quality,
works_alias.c.rating,
works_alias.c.popularity,
# Here are all the subqueries.
licensepools_json.label("licensepools"),
customlists_json.label("customlists"),
contributors_json.label("contributors"),
identifiers_json.label("identifiers"),
subjects_json.label("classifications"),
genres_json.label("genres"),
target_age_json.label("target_age"),
]
)
.select_from(works_alias)
.alias("search_data_subquery")
)

# Finally, convert everything to json.
search_json = query_to_json(search_data)

result = _db.execute(search_json)
if result:
return [r[0] for r in result]

@classmethod
def target_age_query(cls, foreign_work_id_field):
# If the upper limit of the target age is inclusive, we leave
