Remove Work.to_search_documents__DONOTUSE function 🔥 (PP-939) (#1676)
* Remove old function marked DONOTUSE
* Remove other test
* Fix list_id
jonathangreen authored Feb 15, 2024
1 parent bd88741 commit 379f029
Showing 2 changed files with 112 additions and 439 deletions.
378 changes: 2 additions & 376 deletions core/model/work.py
@@ -27,7 +27,7 @@
from sqlalchemy.orm import Mapped, contains_eager, joinedload, relationship
from sqlalchemy.orm.base import NO_VALUE
from sqlalchemy.orm.session import Session
-from sqlalchemy.sql.expression import and_, case, join, literal_column, or_, select
+from sqlalchemy.sql.expression import and_, case, join, literal_column, select
from sqlalchemy.sql.functions import func

from core.classifier import Classifier, WorkClassifier
@@ -42,7 +42,7 @@
)
from core.model.classification import Classification, Subject
from core.model.constants import DataSourceConstants
-from core.model.contributor import Contribution, Contributor
+from core.model.contributor import Contribution
from core.model.coverage import CoverageRecord, WorkCoverageRecord
from core.model.datasource import DataSource
from core.model.edition import Edition
@@ -1688,380 +1688,6 @@ def _set_value(parent, key, target):

return result

@classmethod
def to_search_documents__DONOTUSE(cls, works, policy=None):
"""Generate search documents for these Works.
This is done by constructing an extremely complicated
SQL query. The code is ugly, but it's about 100 times
faster than using python to create documents for
each work individually. When working on the search
index, it's very important for this to be fast.
:param policy: A PresentationCalculationPolicy to use when
deciding how deep to go to find Identifiers equivalent to
these works.
"""

if not works:
return []

_db = Session.object_session(works[0])

# If this is a batch of search documents, postgres needs extra working
# memory to process the query quickly.
if len(works) > 50:
_db.execute("set work_mem='200MB'")

# This query gets relevant columns from Work and Edition for the Works we're
# interested in. The work_id, edition_id, and identifier_id columns are used
# by other subqueries to filter, and the remaining columns are used directly
# to create the json document.
works_alias = (
select(
[
Work.id.label("work_id"),
Edition.id.label("edition_id"),
Edition.primary_identifier_id.label("identifier_id"),
Edition.title,
Edition.subtitle,
Edition.series,
Edition.series_position,
Edition.language,
Edition.sort_title,
Edition.author,
Edition.sort_author,
Edition.medium,
Edition.publisher,
Edition.imprint,
Edition.permanent_work_id,
Work.fiction,
Work.audience,
Work.summary_text,
Work.quality,
Work.rating,
Work.popularity,
Work.presentation_ready,
Work.presentation_edition_id,
func.extract(
"EPOCH",
Work.last_update_time,
).label("last_update_time"),
],
Work.id.in_(w.id for w in works),
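# (In SQLAlchemy 1.x, the second positional argument to select()
# is the WHERE clause.)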
)
.select_from(
join(Work, Edition, Work.presentation_edition_id == Edition.id)
)
.alias("works_alias")
)
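# The aliased columns are re-derived below as literal_columns
# ("works_alias.<name>") so the correlated subqueries can reference
# the outer query by raw SQL name.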

work_id_column = literal_column(
works_alias.name + "." + works_alias.c.work_id.name
)

work_presentation_edition_id_column = literal_column(
works_alias.name + "." + works_alias.c.presentation_edition_id.name
)

work_quality_column = literal_column(
works_alias.name + "." + works_alias.c.quality.name
)

def query_to_json(query):
"""Convert the results of a query to a JSON object."""
return select([func.row_to_json(literal_column(query.name))]).select_from(
query
)

def query_to_json_array(query):
"""Convert the results of a query into a JSON array."""
return select(
[
func.array_to_json(
func.array_agg(func.row_to_json(literal_column(query.name)))
)
]
).select_from(query)
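# Roughly: SELECT array_to_json(array_agg(row_to_json(q))) FROM q.
# When the subquery returns no rows, array_agg yields NULL, so the
# result is a JSON null rather than an empty array.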

# This subquery gets Collection IDs for collections
# that own more than zero licenses for this book.
from core.model.classification import Genre, Subject
from core.model.customlist import CustomListEntry
from core.model.licensing import LicensePool

# We need information about LicensePools for a few reasons:
#
# * We always want to filter out Works that are not available
# in any of the collections associated with a given library
# -- either because no licenses are owned, because the
# LicensePools are suppressed, or (TODO) because there are no
# delivery mechanisms.
# * A patron may want to sort a list of books by availability
# date.
# * A patron may want to show only books currently available,
# or only open-access books.
#
# Whenever LicensePool.open_access is changed, or
# licenses_available moves to zero or away from zero, the
# LicensePool signals that its Work needs reindexing.
#
# The work quality field is stored in the main document, but
# it's also stored here, so that we can apply a nested filter
# that combines quality with other fields found only in the subdocument.

def explicit_bool(label, t):
# Ensure we always generate True/False instead of
# True/None. Opensearch can't filter on null values.
return case([(t, True)], else_=False).label(label)
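# e.g. explicit_bool("available", <condition>) renders as
# CASE WHEN <condition> THEN true ELSE false END AS available.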

licensepools = (
select(
[
LicensePool.id.label("licensepool_id"),
LicensePool.data_source_id.label("data_source_id"),
LicensePool.collection_id.label("collection_id"),
LicensePool.open_access.label("open_access"),
LicensePool.suppressed,
explicit_bool(
"available",
or_(
LicensePool.unlimited_access,
LicensePool.licenses_available > 0,
),
),
explicit_bool(
"licensed",
or_(
LicensePool.unlimited_access,
LicensePool.licenses_owned > 0,
),
),
work_quality_column,
Edition.medium,
func.extract(
"EPOCH",
LicensePool.availability_time,
).label("availability_time"),
]
)
.where(
and_(
LicensePool.work_id == work_id_column,
work_presentation_edition_id_column == Edition.id,
or_(
LicensePool.open_access,
LicensePool.unlimited_access,
LicensePool.licenses_owned > 0,
),
)
)
.alias("licensepools_subquery")
)
licensepools_json = query_to_json_array(licensepools)

# This subquery gets CustomList IDs for all lists
# that contain the work.
#
# We also keep track of whether the work is featured on each
# list. This is used when determining which works should be
# featured for a lane based on CustomLists.
#
# And we keep track of the first time the work appears on the list.
# This is used when generating a crawlable feed for the customlist,
# which is ordered by a work's first appearance on the list.
customlists = (
select(
[
CustomListEntry.list_id.label("list_id"),
CustomListEntry.featured.label("featured"),
func.extract(
"EPOCH",
CustomListEntry.first_appearance,
).label("first_appearance"),
]
)
.where(CustomListEntry.work_id == work_id_column)
.alias("listentries_subquery")
)
customlists_json = query_to_json_array(customlists)

# This subquery gets Contributors, filtered on edition_id.
contributors = (
select(
[
Contributor.sort_name,
Contributor.display_name,
Contributor.family_name,
Contributor.lc,
Contributor.viaf,
Contribution.role,
]
)
.where(
Contribution.edition_id
== literal_column(
works_alias.name + "." + works_alias.c.edition_id.name
)
)
.select_from(
join(
Contributor,
Contribution,
Contributor.id == Contribution.contributor_id,
)
)
.alias("contributors_subquery")
)
contributors_json = query_to_json_array(contributors)

# Use a subquery to get recursively equivalent Identifiers
# for the Edition's primary_identifier_id.
#
# NOTE: we don't reliably reindex works when this information
# changes, but it's not critical that this information be
# totally up to date -- we only use it for subject searches
# and recommendations. The index is completely rebuilt once a
# day, and that's good enough.
equivalent_identifiers = Identifier.recursively_equivalent_identifier_ids_query(
literal_column(works_alias.name + "." + works_alias.c.identifier_id.name),
policy=policy,
).alias("equivalent_identifiers_subquery")

identifiers = (
select(
[
Identifier.identifier.label("identifier"),
Identifier.type.label("type"),
]
)
.where(Identifier.id.in_(equivalent_identifiers))
.alias("identifier_subquery")
)
identifiers_json = query_to_json_array(identifiers)

# Map our constants for Subject type to their URIs.
scheme_column = case(
[
(Subject.type == key, literal_column("'%s'" % val))
for key, val in list(Subject.uri_lookup.items())
]
)
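# This renders as a CASE expression with one WHEN branch per entry
# in Subject.uri_lookup, mapping each Subject.type constant to the
# corresponding URI string.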

# If the Subject has a name, use that, otherwise use the Subject's identifier.
# Also, 3M's classifications have slashes, e.g. "FICTION/Adventure". Make sure
# we get separated words for search.
term_column = func.replace(
case([(Subject.name != None, Subject.name)], else_=Subject.identifier),
"/",
" ",
)

# Normalize by dividing each weight by the sum of the weights for that Identifier's Classifications.
from core.model.classification import Classification

weight_column = (
func.sum(Classification.weight)
/ func.sum(func.sum(Classification.weight)).over()
)
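# The inner sum() aggregates Classification.weight within each
# (scheme, term) group; wrapping it in sum(...).over() computes a
# window total across all groups, so each group's weight becomes its
# share of the overall weight for this work's identifiers.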

# The subquery for Subjects, with those three columns. The labels will become keys in json objects.
subjects = (
select(
[
scheme_column.label("scheme"),
term_column.label("term"),
weight_column.label("weight"),
],
# Only include Subjects with terms that are useful for search.
and_(Subject.type.in_(Subject.TYPES_FOR_SEARCH), term_column != None),
)
.group_by(scheme_column, term_column)
.where(Classification.identifier_id.in_(equivalent_identifiers))
.select_from(
join(Classification, Subject, Classification.subject_id == Subject.id)
)
.alias("subjects_subquery")
)
subjects_json = query_to_json_array(subjects)

# Subquery for genres.
genres = (
select(
# All Genres have the same scheme - the simplified genre URI.
[
literal_column("'%s'" % Subject.SIMPLIFIED_GENRE).label("scheme"),
Genre.name,
Genre.id.label("term"),
WorkGenre.affinity.label("weight"),
]
)
.where(
WorkGenre.work_id
== literal_column(works_alias.name + "." + works_alias.c.work_id.name)
)
.select_from(join(WorkGenre, Genre, WorkGenre.genre_id == Genre.id))
.alias("genres_subquery")
)
genres_json = query_to_json_array(genres)

target_age = cls.target_age_query(
literal_column(works_alias.name + "." + works_alias.c.work_id.name)
).alias("target_age_subquery")
target_age_json = query_to_json(target_age)

# Now, create a query that brings together everything we need for the final
# search document.
search_data = (
select(
[
works_alias.c.work_id.label("_id"),
works_alias.c.work_id.label("work_id"),
works_alias.c.title,
works_alias.c.sort_title,
works_alias.c.subtitle,
works_alias.c.series,
works_alias.c.series_position,
works_alias.c.language,
works_alias.c.author,
works_alias.c.sort_author,
works_alias.c.medium,
works_alias.c.publisher,
works_alias.c.imprint,
works_alias.c.permanent_work_id,
works_alias.c.presentation_ready,
works_alias.c.last_update_time,
# Convert true/false to "Fiction"/"Nonfiction".
case(
[(works_alias.c.fiction == True, literal_column("'Fiction'"))],
else_=literal_column("'Nonfiction'"),
).label("fiction"),
# Replace "Young Adult" with "YoungAdult" and "Adults Only" with "AdultsOnly".
func.replace(works_alias.c.audience, " ", "").label("audience"),
works_alias.c.summary_text.label("summary"),
works_alias.c.quality,
works_alias.c.rating,
works_alias.c.popularity,
# Here are all the subqueries.
licensepools_json.label("licensepools"),
customlists_json.label("customlists"),
contributors_json.label("contributors"),
identifiers_json.label("identifiers"),
subjects_json.label("classifications"),
genres_json.label("genres"),
target_age_json.label("target_age"),
]
)
.select_from(works_alias)
.alias("search_data_subquery")
)

# Finally, convert everything to json.
search_json = query_to_json(search_data)

result = _db.execute(search_json)
if result:
return [r[0] for r in result]

@classmethod
def target_age_query(cls, foreign_work_id_field):
# If the upper limit of the target age is inclusive, we leave
