Skip to content

Commit

Permalink
Remove fuzziness from search
Browse files Browse the repository at this point in the history
  • Loading branch information
dbernstein committed May 14, 2024
1 parent b0518de commit b354d71
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 125 deletions.
48 changes: 14 additions & 34 deletions src/palace/manager/search/external_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -1089,36 +1089,36 @@ class Operators(Values):
"fiction": _KEYWORD_ONLY,
"genres.name": dict(path="genres"),
"genres.scheme": dict(path="genres"),
"genres.term": dict(path="genres", **_LONG_TYPE),
"genres.weight": dict(path="genres", **_LONG_TYPE),
"genres.term": dict(path="genres"),
"genres.weight": dict(path="genres"),
"identifiers.identifier": dict(path="identifiers"),
"identifiers.type": dict(path="identifiers"),
"imprint": _KEYWORD_ONLY,
"language": dict(
type="_text"
), # Made up keyword type, because we don't want text fuzzyness on this
"licensepools.available": dict(path="licensepools", **_BOOL_TYPE),
"licensepools.availability_time": dict(path="licensepools", **_LONG_TYPE),
"licensepools.collection_id": dict(path="licensepools", **_LONG_TYPE),
"licensepools.available": dict(path="licensepools"),
"licensepools.availability_time": dict(path="licensepools"),
"licensepools.collection_id": dict(path="licensepools"),
"licensepools.data_source_id": dict(
path="licensepools", ops=[Operators.EQ, Operators.NEQ], **_LONG_TYPE
path="licensepools", ops=[Operators.EQ, Operators.EQ]
),
"licensepools.licensed": dict(path="licensepools", **_BOOL_TYPE),
"licensepools.licensed": dict(path="licensepools"),
"licensepools.medium": dict(path="licensepools"),
"licensepools.open_access": dict(path="licensepools", **_BOOL_TYPE),
"licensepools.quality": dict(path="licensepools", **_LONG_TYPE),
"licensepools.suppressed": dict(path="licensepools", **_BOOL_TYPE),
"licensepools.open_access": dict(path="licensepools"),
"licensepools.quality": dict(path="licensepools"),
"licensepools.suppressed": dict(path="licensepools"),
"medium": _KEYWORD_ONLY,
"presentation_ready": _BOOL_TYPE,
"publisher": _KEYWORD_ONLY,
"quality": _LONG_TYPE,
"quality": dict(),
"series": _KEYWORD_ONLY,
"sort_author": dict(),
"sort_title": dict(),
"subtitle": _KEYWORD_ONLY,
"target_age": dict(),
"title": _KEYWORD_ONLY,
"published": _LONG_TYPE,
"published": dict(),
}

# From the client, some field names may be abstracted
Expand All @@ -1130,17 +1130,6 @@ class Operators(Values):
"data_source": "licensepools.data_source_id",
}

# We are using "match" queries for the "equals" operator
# so we must keep a tight leash on the how much of a spread
# in the matches we want to keep
# The "match" is used instead of "term" in order to have some
# tolerance for spelling mistakes while making a query
MATCH_ARGS = dict(
auto_generate_synonyms_phrase_query=False,
max_expansions=10,
fuzziness="AUTO",
)

class ValueTransforms:
@staticmethod
def data_source(value: str) -> int:
Expand Down Expand Up @@ -1274,19 +1263,10 @@ def _parse_json_leaf(self, query: dict) -> dict:

es_query = None

def _match_or_term_query():
"""Only text type mappings get a 'match' search, others use a term search
All variables are used from the function closure
"""
if mapping.get("type", "text") != "text":
return Term(**{key: value})
else:
return Match(**{key: {"query": value, **self.MATCH_ARGS}})

if op == self.Operators.EQ:
es_query = _match_or_term_query()
es_query = Term(**{key: value})
elif op == self.Operators.NEQ:
es_query = Bool(must_not=[_match_or_term_query()])
es_query = Bool(must_not=[Term(**{key: value})])
elif op in {
self.Operators.GT,
self.Operators.GTE,
Expand Down
105 changes: 14 additions & 91 deletions tests/manager/search/test_external_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -5039,26 +5039,18 @@ def _leaf(key, value, op="eq"):
def _jq(query):
return JSONQuery(dict(query=query))

match_args = JSONQuery.MATCH_ARGS

def test_search_query(self, external_search_fixture: ExternalSearchFixture):
q_dict = {"key": "medium", "value": "Book"}
q = self._jq(q_dict)
assert q.search_query.to_dict() == {
"match": {"medium.keyword": {"query": "Book", **self.match_args}}
}
assert q.search_query.to_dict() == {"term": {"medium.keyword": "Book"}}

q = {"or": [self._leaf("medium", "Book"), self._leaf("medium", "Audio")]}
q = self._jq(q)
assert q.search_query.to_dict() == {
"bool": {
"should": [
{"match": {"medium.keyword": {"query": "Book", **self.match_args}}},
{
"match": {
"medium.keyword": {"query": "Audio", **self.match_args}
}
},
{"term": {"medium.keyword": "Book"}},
{"term": {"medium.keyword": "Audio"}},
]
}
}
Expand All @@ -5068,12 +5060,8 @@ def test_search_query(self, external_search_fixture: ExternalSearchFixture):
assert q.search_query.to_dict() == {
"bool": {
"must": [
{"match": {"medium.keyword": {"query": "Book", **self.match_args}}},
{
"match": {
"medium.keyword": {"query": "Audio", **self.match_args}
}
},
{"term": {"medium.keyword": "Book"}},
{"term": {"medium.keyword": "Audio"}},
]
}
}
Expand All @@ -5088,29 +5076,8 @@ def test_search_query(self, external_search_fixture: ExternalSearchFixture):
assert q.search_query.to_dict() == {
"bool": {
"must": [
{"match": {"title.keyword": {"query": "Title", **self.match_args}}},
{
"bool": {
"should": [
{
"match": {
"medium.keyword": {
"query": "Book",
**self.match_args,
}
}
},
{
"match": {
"medium.keyword": {
"query": "Audio",
**self.match_args,
}
}
},
]
}
},
{"term": {"medium.keyword": "Book"}},
{"term": {"medium.keyword": "Audio"}},
]
}
}
Expand All @@ -5120,21 +5087,8 @@ def test_search_query(self, external_search_fixture: ExternalSearchFixture):
assert q.search_query.to_dict() == {
"bool": {
"should": [
{"match": {"medium.keyword": {"query": "Book", **self.match_args}}},
{
"bool": {
"must_not": [
{
"match": {
"medium.keyword": {
"query": "Audio",
**self.match_args,
}
}
}
]
}
},
{"term": {"medium.keyword": "Book"}},
{"term": {"medium.keyword": "Audio"}},
]
}
}
Expand All @@ -5149,21 +5103,8 @@ def test_search_query(self, external_search_fixture: ExternalSearchFixture):
assert q.search_query.to_dict() == {
"bool": {
"must": [
{"match": {"title.keyword": {"query": "Title", **self.match_args}}},
{
"bool": {
"must_not": [
{
"match": {
"author.keyword": {
"query": "Geoffrey",
**self.match_args,
}
}
}
]
}
},
{"term": {"title.keyword": "Title"}},
{"bool": {"must_not": [{"term": {"author.keyword": "Geoffrey"}}]}},
]
}
}
Expand All @@ -5188,18 +5129,15 @@ def test_search_query_range(self, key, value, op):
("contributors.display_name", "name", True),
("contributors.lc", "name", False),
("genres.name", "name", False),
("licensepools.medium", "Book", False),
("licensepools.open_access", True, False),
],
)
def test_search_query_nested(self, key, value, is_keyword):
q = self._jq(self._leaf(key, value))
term = key if not is_keyword else f"{key}.keyword"
root = key.split(".")[0]
assert q.search_query.to_dict() == {
"nested": {
"path": root,
"query": {"match": {term: {"query": value, **self.match_args}}},
}
"nested": {"path": root, "query": {"term": {term: value}}}
}

@pytest.mark.parametrize(
Expand Down Expand Up @@ -5234,9 +5172,7 @@ def test_regex_query(self):
def test_field_transforms(self):
q = self._jq(self._leaf("classification", "cls"))
assert q.search_query.to_dict() == {
"match": {
"classifications.term.keyword": {"query": "cls", **self.match_args}
}
"term": {"classifications.term.keyword": "cls"}
}
q = self._jq(self._leaf("open_access", True))
assert q.search_query.to_dict() == {
Expand Down Expand Up @@ -5337,19 +5273,6 @@ def test_allowed_operators_for_data_source(self, db: DatabaseTransactionFixture)
}
}

@pytest.mark.parametrize(
"key,value,is_text",
[
("title", "value", True),
("licensepools.open_access", True, False),
("published", "1990-01-01", False),
],
)
def test_type_queries(self, key, value, is_text):
"""Bool and long types are term queries, whereas text is a match query"""
q = self._jq(self._leaf(key, value))
q.search_query.to_dict().keys() == ["match" if is_text else "term"]

@pytest.mark.parametrize(
"value,escaped,contains",
[
Expand Down

0 comments on commit b354d71

Please sign in to comment.