Skip to content

Commit

Permalink
documents: fix fulltext search
Browse files Browse the repository at this point in the history
* Excludes the fulltext field when the search query is not a "fulltext" search.

Co-Authored-by: Johnny Mariéthoz <[email protected]>
  • Loading branch information
jma committed Jan 8, 2024
1 parent 32a5f4e commit dbcc3e2
Show file tree
Hide file tree
Showing 8 changed files with 225 additions and 135 deletions.
268 changes: 141 additions & 127 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ invenio-records = ">=1.4.0,<1.7.0"
invenio-stats = "^1.0.0a18"
invenio-records-resources = "*"
invenio-previewer = ">=1.3.5"
invenio-userprofiles = "<1.2.5"
invenio = {version = ">=3.4.0,<3.5.0", extras = ["base", "files", "postgresql", "auth", "elasticsearch7", "docs", "tests"]}

uwsgi = ">=2.0"
Expand Down
2 changes: 1 addition & 1 deletion scripts/test
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ fi
# | certifi | 2022.12.7 | >=2015.04.28,<2023.07.22 | 59956 |
# | pillow | 9.5.0 | <10.0.1 | 61489 |
# +==============================================================================+
safety check -i 45183 -i 44501 -i 51668 -i 42194 -i 42852 -i 53325 -i 53326 -i 54456 -i 42498 -i 43738 -i 47833 -i 51457 -i 51358 -i 53812 -i 52495 -i 50792 -i 55261 -i 59062 -i 59473 -i 59956 -i 60223 -i 60224 -i 60225 -i 61489 -i 62019 -i 40459 -i 62451 -i 62452 -i 62556
safety check -i 45183 -i 44501 -i 51668 -i 42194 -i 42852 -i 53325 -i 53326 -i 54456 -i 42498 -i 43738 -i 47833 -i 51457 -i 51358 -i 53812 -i 52495 -i 50792 -i 55261 -i 59062 -i 59473 -i 59956 -i 60223 -i 60224 -i 60225 -i 61489 -i 62019 -i 40459 -i 62451 -i 62452 -i 62556 -i 63073
pydocstyle sonar tests docs
isort --check-only --diff "${SCRIPT_PATH}/.."
autoflake -c -r --remove-all-unused-imports --ignore-init-module-imports . &> /dev/null || {
Expand Down
2 changes: 1 addition & 1 deletion sonar/modules/documents/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,5 +32,5 @@
SONAR_DOCUMENTS_PERMALINK = '{host}{org}/documents/{pid}'
"""Permalink for accessing documents details."""

SONAR_DOCUMENT_QUERY_BOOSTING = ['title.*^3', '*']
SONAR_DOCUMENT_QUERY_BOOSTING = ['title.*^3', 'fulltext^6', '*']
"""Search query boosting parameters for documents."""
9 changes: 7 additions & 2 deletions sonar/modules/documents/ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,13 @@

from __future__ import absolute_import, print_function

from invenio_base.signals import app_loaded
from invenio_oaiharvester.signals import oaiharvest_finished
from invenio_records.signals import before_record_insert, before_record_update

from sonar.modules.documents.receivers import export_json, \
transform_harvested_records, update_oai_property
set_boosting_query_fields, transform_harvested_records, \
update_oai_property

from . import config

Expand All @@ -48,9 +50,12 @@ def init_app(self, app):
# Adds `_oai` property
before_record_insert.connect(update_oai_property)
before_record_update.connect(update_oai_property)
# Expand configuration.
app_loaded.connect(set_boosting_query_fields)


def init_config(self, app):
"""Initialize configuration."""
for k in dir(config):
for k in dir(app.config):
if k.startswith('SONAR_DOCUMENTS_'):
app.config.setdefault(k, getattr(config, k))
5 changes: 3 additions & 2 deletions sonar/modules/documents/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ def documents_query_parser(qstr=None):
"""Custom query parser for documents."""
if not qstr:
return Q()

fields = current_app.config.get(
'SONAR_DOCUMENT_QUERY_BOOSTING', ['*']).copy()

Expand All @@ -42,7 +41,9 @@ def documents_query_parser(qstr=None):
if 'fulltext:' in qstr:
result = re.match(r'^fulltext:(.*)$', qstr)
qstr = result.group(1)
fields.append('fulltext')
else:
# remove the fulltext field
fields = [field for field in fields if not field.startswith('fulltext')]

operator, query_type = get_operator_and_query_type(qstr)

Expand Down
50 changes: 49 additions & 1 deletion sonar/modules/documents/receivers.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"""Signals connections for documents."""

import json
import re
import time
from datetime import datetime
from os import makedirs
Expand All @@ -26,9 +27,10 @@
import click
import pytz
from flask import current_app
from invenio_search import current_search

from sonar.modules.api import SonarRecord
from sonar.modules.documents.api import DocumentRecord
from sonar.modules.documents.api import DocumentRecord, DocumentSearch
from sonar.modules.documents.loaders.schemas.factory import LoaderSchemaFactory
from sonar.modules.utils import chunks
from sonar.webdav import HegClient
Expand Down Expand Up @@ -149,3 +151,49 @@ def export_json(sender=None, records=None, **kwargs):
client.upload_file(file_name, file_path)

click.echo('{count} records exported'.format(count=len(records_to_export)))


def process_boosting(config):
    """Expand the '*' wildcard using the documents mapping file.

    The wildcard is replaced by the top-level fields of the documents
    mapping which are not already listed in the configuration, so that the
    explicitly listed fields keep their own boosting factor.

    :param config: list of es fields, each optionally suffixed by a boosting
        factor (i.e. `title.*^3`).
    :returns: a new list with the first '*' expanded; the given list is left
        untouched.
    :raises RuntimeError: if the documents index does not resolve to exactly
        one mapping file.
    """
    config = config.copy()
    try:
        config.remove('*')
    except ValueError:
        # nothing to replace
        return config
    # Existing fields without the boosting factor. The factor can be an
    # integer or a decimal number (`^3`, `^1.5`); both must be stripped,
    # otherwise the field would be added a second time without boosting.
    existing_fields = {
        re.sub(r'\^\d+(?:\.\d+)?$', '', field) for field in config
    }
    index_name = DocumentSearch.Meta.index
    doc_mappings = list(current_search.aliases[index_name].values())
    # Explicit check instead of `assert`: assertions are stripped when
    # python runs with -O and a wrong mapping would be silently used.
    if len(doc_mappings) != 1:
        raise RuntimeError(
            f'Expected exactly one mapping for index "{index_name}", '
            f'found {len(doc_mappings)}.')
    mapping_path = doc_mappings[0]
    with open(mapping_path, 'r', encoding='utf-8') as body:
        mapping = json.load(body)
    fields = []
    for prop, conf in mapping['mappings']['properties'].items():
        # add .* for field with children
        field = f'{prop}.*' if conf.get('properties') else prop
        # do nothing for existing fields
        if field not in existing_fields:
            fields.append(field)
    return config + fields

def set_boosting_query_fields(sender, app=None, **kwargs):
    """Replace the boosting configuration by its expanded version.

    Reads `SONAR_DOCUMENT_QUERY_BOOSTING` from the application configuration
    (`['*']` when it is not set), passes it to `process_boosting` and stores
    the result back under the same key.

    :param sender: sender of the signal
    :param app: the flask app
    """
    config_key = 'SONAR_DOCUMENT_QUERY_BOOSTING'
    # the application context is required to access to the flask extension
    with app.app_context():
        boosting = app.config.get(config_key, ['*'])
        app.config[config_key] = process_boosting(boosting)

23 changes: 22 additions & 1 deletion tests/api/test_api_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,20 @@
from flask import url_for
from invenio_accounts.testutils import login_user_via_session

from sonar.modules.documents.receivers import process_boosting


def test_boosting_fields(app):
    """Test the boosting configuration."""
    boosting = app.config.get('SONAR_DOCUMENT_QUERY_BOOSTING')
    # the configuration should exist
    assert boosting
    # it should be expanded: the wildcard itself must not remain in the list
    # (the list contains the bare `*` string, not a quoted `'*'`)
    assert '*' not in boosting

    # several cases of configurations
    # without wildcard the configuration is returned unchanged
    assert process_boosting(['title.*']) == ['title.*']
    # the wildcard is replaced by the fields of the mapping
    assert 'title.*' in process_boosting(['*'])
    assert '*' not in process_boosting(['*'])
    # an explicitly boosted field keeps its boosting factor
    assert 'title.*^2' in process_boosting(['title.*^2', '*'])

def test_api_query(client, document_with_file, document_json, make_document,
superuser):
Expand Down Expand Up @@ -95,13 +109,20 @@ def test_api_query(client, document_with_file, document_json, make_document,

# Test search in fulltext
response = client.get(url_for('invenio_records_rest.doc_list',
q='fulltext:the',
q='fulltext:(theoretically study the high-harmonic)',
debug=1),
headers=headers)
assert response.status_code == 200
assert response.json['hits']['total']['value'] == 1
assert response.json['hits']['hits'][0]['explanation']['details']

response = client.get(url_for('invenio_records_rest.doc_list',
q='(theoretically study the high-harmonic)',
debug=1),
headers=headers)
assert response.status_code == 200
assert response.json['hits']['total']['value'] == 0

# Not allowed operator
with pytest.raises(Exception) as exception:
response = client.get(url_for('invenio_records_rest.doc_list',
Expand Down

0 comments on commit dbcc3e2

Please sign in to comment.