From b354d718c409940cba3aa54c7f16337a91bbfa71 Mon Sep 17 00:00:00 2001 From: Daniel Bernstein Date: Mon, 13 May 2024 16:57:11 -0700 Subject: [PATCH] Remove fuzziness from search --- src/palace/manager/search/external_search.py | 48 +++------ tests/manager/search/test_external_search.py | 105 +++---------------- 2 files changed, 28 insertions(+), 125 deletions(-) diff --git a/src/palace/manager/search/external_search.py b/src/palace/manager/search/external_search.py index 81e6c78895..ba8efa4720 100644 --- a/src/palace/manager/search/external_search.py +++ b/src/palace/manager/search/external_search.py @@ -1089,36 +1089,36 @@ class Operators(Values): "fiction": _KEYWORD_ONLY, "genres.name": dict(path="genres"), "genres.scheme": dict(path="genres"), - "genres.term": dict(path="genres", **_LONG_TYPE), - "genres.weight": dict(path="genres", **_LONG_TYPE), + "genres.term": dict(path="genres"), + "genres.weight": dict(path="genres"), "identifiers.identifier": dict(path="identifiers"), "identifiers.type": dict(path="identifiers"), "imprint": _KEYWORD_ONLY, "language": dict( type="_text" ), # Made up keyword type, because we don't want text fuzzyness on this - "licensepools.available": dict(path="licensepools", **_BOOL_TYPE), - "licensepools.availability_time": dict(path="licensepools", **_LONG_TYPE), - "licensepools.collection_id": dict(path="licensepools", **_LONG_TYPE), + "licensepools.available": dict(path="licensepools"), + "licensepools.availability_time": dict(path="licensepools"), + "licensepools.collection_id": dict(path="licensepools"), "licensepools.data_source_id": dict( - path="licensepools", ops=[Operators.EQ, Operators.NEQ], **_LONG_TYPE + path="licensepools", ops=[Operators.EQ, Operators.NEQ] ), - "licensepools.licensed": dict(path="licensepools", **_BOOL_TYPE), + "licensepools.licensed": dict(path="licensepools"), "licensepools.medium": dict(path="licensepools"), - "licensepools.open_access": dict(path="licensepools", **_BOOL_TYPE), - "licensepools.quality": 
dict(path="licensepools", **_LONG_TYPE), - "licensepools.suppressed": dict(path="licensepools", **_BOOL_TYPE), + "licensepools.open_access": dict(path="licensepools"), + "licensepools.quality": dict(path="licensepools"), + "licensepools.suppressed": dict(path="licensepools"), "medium": _KEYWORD_ONLY, "presentation_ready": _BOOL_TYPE, "publisher": _KEYWORD_ONLY, - "quality": _LONG_TYPE, + "quality": dict(), "series": _KEYWORD_ONLY, "sort_author": dict(), "sort_title": dict(), "subtitle": _KEYWORD_ONLY, "target_age": dict(), "title": _KEYWORD_ONLY, - "published": _LONG_TYPE, + "published": dict(), } # From the client, some field names may be abstracted @@ -1130,17 +1130,6 @@ class Operators(Values): "data_source": "licensepools.data_source_id", } - # We are using "match" queries for the "equals" operator - # so we must keep a tight leash on the how much of a spread - # in the matches we want to keep - # The "match" is used instead of "term" in order to have some - # tolerance for spelling mistakes while making a query - MATCH_ARGS = dict( - auto_generate_synonyms_phrase_query=False, - max_expansions=10, - fuzziness="AUTO", - ) - class ValueTransforms: @staticmethod def data_source(value: str) -> int: @@ -1274,19 +1263,10 @@ def _parse_json_leaf(self, query: dict) -> dict: es_query = None - def _match_or_term_query(): - """Only text type mappings get a 'match' search, others use a term search - All variables are used from the function closure - """ - if mapping.get("type", "text") != "text": - return Term(**{key: value}) - else: - return Match(**{key: {"query": value, **self.MATCH_ARGS}}) - if op == self.Operators.EQ: - es_query = _match_or_term_query() + es_query = Term(**{key: value}) elif op == self.Operators.NEQ: - es_query = Bool(must_not=[_match_or_term_query()]) + es_query = Bool(must_not=[Term(**{key: value})]) elif op in { self.Operators.GT, self.Operators.GTE, diff --git a/tests/manager/search/test_external_search.py 
b/tests/manager/search/test_external_search.py index 30002794b5..e4a7e8a219 100644 --- a/tests/manager/search/test_external_search.py +++ b/tests/manager/search/test_external_search.py @@ -5039,26 +5039,18 @@ def _leaf(key, value, op="eq"): def _jq(query): return JSONQuery(dict(query=query)) - match_args = JSONQuery.MATCH_ARGS - def test_search_query(self, external_search_fixture: ExternalSearchFixture): q_dict = {"key": "medium", "value": "Book"} q = self._jq(q_dict) - assert q.search_query.to_dict() == { - "match": {"medium.keyword": {"query": "Book", **self.match_args}} - } + assert q.search_query.to_dict() == {"term": {"medium.keyword": "Book"}} q = {"or": [self._leaf("medium", "Book"), self._leaf("medium", "Audio")]} q = self._jq(q) assert q.search_query.to_dict() == { "bool": { "should": [ - {"match": {"medium.keyword": {"query": "Book", **self.match_args}}}, - { - "match": { - "medium.keyword": {"query": "Audio", **self.match_args} - } - }, + {"term": {"medium.keyword": "Book"}}, + {"term": {"medium.keyword": "Audio"}}, ] } } @@ -5068,12 +5060,8 @@ def test_search_query(self, external_search_fixture: ExternalSearchFixture): assert q.search_query.to_dict() == { "bool": { "must": [ - {"match": {"medium.keyword": {"query": "Book", **self.match_args}}}, - { - "match": { - "medium.keyword": {"query": "Audio", **self.match_args} - } - }, + {"term": {"medium.keyword": "Book"}}, + {"term": {"medium.keyword": "Audio"}}, ] } } @@ -5088,29 +5076,8 @@ def test_search_query(self, external_search_fixture: ExternalSearchFixture): assert q.search_query.to_dict() == { "bool": { "must": [ - {"match": {"title.keyword": {"query": "Title", **self.match_args}}}, - { - "bool": { - "should": [ - { - "match": { - "medium.keyword": { - "query": "Book", - **self.match_args, - } - } - }, - { - "match": { - "medium.keyword": { - "query": "Audio", - **self.match_args, - } - } - }, - ] - } - }, + {"term": {"medium.keyword": "Book"}}, + {"term": {"medium.keyword": "Audio"}}, ] } } @@ 
-5120,21 +5087,8 @@ def test_search_query(self, external_search_fixture: ExternalSearchFixture): assert q.search_query.to_dict() == { "bool": { "should": [ - {"match": {"medium.keyword": {"query": "Book", **self.match_args}}}, - { - "bool": { - "must_not": [ - { - "match": { - "medium.keyword": { - "query": "Audio", - **self.match_args, - } - } - } - ] - } - }, + {"term": {"medium.keyword": "Book"}}, + {"term": {"medium.keyword": "Audio"}}, ] } } @@ -5149,21 +5103,8 @@ def test_search_query(self, external_search_fixture: ExternalSearchFixture): assert q.search_query.to_dict() == { "bool": { "must": [ - {"match": {"title.keyword": {"query": "Title", **self.match_args}}}, - { - "bool": { - "must_not": [ - { - "match": { - "author.keyword": { - "query": "Geoffrey", - **self.match_args, - } - } - } - ] - } - }, + {"term": {"title.keyword": "Title"}}, + {"bool": {"must_not": [{"term": {"author.keyword": "Geoffrey"}}]}}, ] } } @@ -5188,7 +5129,7 @@ def test_search_query_range(self, key, value, op): ("contributors.display_name", "name", True), ("contributors.lc", "name", False), ("genres.name", "name", False), - ("licensepools.medium", "Book", False), + ("licensepools.open_access", True, False), ], ) def test_search_query_nested(self, key, value, is_keyword): @@ -5196,10 +5137,7 @@ def test_search_query_nested(self, key, value, is_keyword): term = key if not is_keyword else f"{key}.keyword" root = key.split(".")[0] assert q.search_query.to_dict() == { - "nested": { - "path": root, - "query": {"match": {term: {"query": value, **self.match_args}}}, - } + "nested": {"path": root, "query": {"term": {term: value}}} } @pytest.mark.parametrize( @@ -5234,9 +5172,7 @@ def test_regex_query(self): def test_field_transforms(self): q = self._jq(self._leaf("classification", "cls")) assert q.search_query.to_dict() == { - "match": { - "classifications.term.keyword": {"query": "cls", **self.match_args} - } + "term": {"classifications.term.keyword": "cls"} } q = 
self._jq(self._leaf("open_access", True)) assert q.search_query.to_dict() == { @@ -5337,19 +5273,6 @@ def test_allowed_operators_for_data_source(self, db: DatabaseTransactionFixture) } } - @pytest.mark.parametrize( - "key,value,is_text", - [ - ("title", "value", True), - ("licensepools.open_access", True, False), - ("published", "1990-01-01", False), - ], - ) - def test_type_queries(self, key, value, is_text): - """Bool and long types are term queries, whereas text is a match query""" - q = self._jq(self._leaf(key, value)) - q.search_query.to_dict().keys() == ["match" if is_text else "term"] - @pytest.mark.parametrize( "value,escaped,contains", [