Skip to content

Commit

Permalink
Add synonyms filter to the analysis configuration
Browse files Browse the repository at this point in the history
  • Loading branch information
marko-bekhta committed Dec 8, 2023
1 parent ac86060 commit 1a3f256
Show file tree
Hide file tree
Showing 4 changed files with 263 additions and 11 deletions.
2 changes: 2 additions & 0 deletions src/main/java/io/quarkus/search/app/SearchService.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import org.hibernate.Length;
import org.hibernate.search.engine.search.common.BooleanOperator;
import org.hibernate.search.engine.search.predicate.dsl.SimpleQueryFlag;
import org.hibernate.search.mapper.orm.session.SearchSession;

import org.jboss.resteasy.reactive.RestQuery;
Expand Down Expand Up @@ -68,6 +69,7 @@ public SearchResult<GuideSearchHit> search(@RestQuery @DefaultValue(QuarkusVersi
.field("summary_autocomplete").boost(0.5f)
.field("fullContent_autocomplete").boost(0.1f)
.matching(q)
.flags(SimpleQueryFlag.AND, SimpleQueryFlag.OR)
.defaultOperator(BooleanOperator.AND))
.should(f.match().field("origin").matching("quarkus").boost(50.0f))
.should(f.not(f.match().field("topics").matching("compatibility"))
Expand Down
18 changes: 9 additions & 9 deletions src/main/java/io/quarkus/search/app/entity/Guide.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,32 +45,32 @@ public class Guide {
@KeywordField
public String origin;

@FullTextField(highlightable = Highlightable.UNIFIED, termVector = TermVector.WITH_POSITIONS_OFFSETS)
@FullTextField(name = "title_autocomplete", analyzer = AnalysisConfigurer.AUTOCOMPLETE, searchAnalyzer = AnalysisConfigurer.DEFAULT)
@FullTextField(highlightable = Highlightable.UNIFIED, termVector = TermVector.WITH_POSITIONS_OFFSETS, analyzer = AnalysisConfigurer.DEFAULT, searchAnalyzer = AnalysisConfigurer.DEFAULT_SEARCH)
@FullTextField(name = "title_autocomplete", analyzer = AnalysisConfigurer.AUTOCOMPLETE, searchAnalyzer = AnalysisConfigurer.DEFAULT_SEARCH)
@KeywordField(name = "title_sort", normalizer = AnalysisConfigurer.SORT, searchable = Searchable.NO, sortable = Sortable.YES)
@Column(length = Length.LONG)
public String title;

@FullTextField(highlightable = Highlightable.UNIFIED, termVector = TermVector.WITH_POSITIONS_OFFSETS)
@FullTextField(name = "summary_autocomplete", analyzer = AnalysisConfigurer.AUTOCOMPLETE, searchAnalyzer = AnalysisConfigurer.DEFAULT)
@FullTextField(highlightable = Highlightable.UNIFIED, termVector = TermVector.WITH_POSITIONS_OFFSETS, analyzer = AnalysisConfigurer.DEFAULT, searchAnalyzer = AnalysisConfigurer.DEFAULT_SEARCH)
@FullTextField(name = "summary_autocomplete", analyzer = AnalysisConfigurer.AUTOCOMPLETE, searchAnalyzer = AnalysisConfigurer.DEFAULT_SEARCH)
@Column(length = Length.LONG32)
public String summary;

@FullTextField
@FullTextField(name = "keywords_autocomplete", analyzer = AnalysisConfigurer.AUTOCOMPLETE, searchAnalyzer = AnalysisConfigurer.DEFAULT)
@FullTextField(analyzer = AnalysisConfigurer.DEFAULT, searchAnalyzer = AnalysisConfigurer.DEFAULT_SEARCH)
@FullTextField(name = "keywords_autocomplete", analyzer = AnalysisConfigurer.AUTOCOMPLETE, searchAnalyzer = AnalysisConfigurer.DEFAULT_SEARCH)
@Column(length = Length.LONG32)
public String keywords;

@FullTextField(name = "fullContent", valueBridge = @ValueBridgeRef(type = InputProviderHtmlBodyTextBridge.class), highlightable = Highlightable.UNIFIED)
@FullTextField(name = "fullContent_autocomplete", valueBridge = @ValueBridgeRef(type = InputProviderHtmlBodyTextBridge.class), analyzer = AnalysisConfigurer.AUTOCOMPLETE, searchAnalyzer = AnalysisConfigurer.DEFAULT)
@FullTextField(name = "fullContent", valueBridge = @ValueBridgeRef(type = InputProviderHtmlBodyTextBridge.class), highlightable = Highlightable.UNIFIED, analyzer = AnalysisConfigurer.DEFAULT, searchAnalyzer = AnalysisConfigurer.DEFAULT_SEARCH)
@FullTextField(name = "fullContent_autocomplete", valueBridge = @ValueBridgeRef(type = InputProviderHtmlBodyTextBridge.class), analyzer = AnalysisConfigurer.AUTOCOMPLETE, searchAnalyzer = AnalysisConfigurer.DEFAULT_SEARCH)
@Transient
@IndexingDependency(reindexOnUpdate = ReindexOnUpdate.NO)
public InputProvider htmlFullContentProvider;

@KeywordField(name = "categories")
public Set<String> categories = Set.of();

@FullTextField(name = "topics")
@FullTextField(name = "topics", analyzer = AnalysisConfigurer.DEFAULT, searchAnalyzer = AnalysisConfigurer.DEFAULT_SEARCH)
@KeywordField(name = "topics_faceting", searchable = Searchable.YES, projectable = Projectable.YES, aggregable = Aggregable.YES)
public Set<String> topics = Set.of();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,21 @@

import org.hibernate.search.backend.elasticsearch.analysis.ElasticsearchAnalysisConfigurationContext;
import org.hibernate.search.backend.elasticsearch.analysis.ElasticsearchAnalysisConfigurer;
import org.hibernate.search.engine.backend.analysis.AnalyzerNames;

import io.quarkus.hibernate.search.orm.elasticsearch.SearchExtension;

@SearchExtension
public class AnalysisConfigurer implements ElasticsearchAnalysisConfigurer {
public static final String DEFAULT = AnalyzerNames.DEFAULT;

private static final String[] SYNONYMS = new String[] {
"development, dev",
"dev service, devservice, development service",
"resteasy, rest, rest api, rest easy",
"vert.x, vertx, vertex"
};

public static final String DEFAULT = "basic_analyzer";
public static final String DEFAULT_SEARCH = DEFAULT + "_search";
public static final String AUTOCOMPLETE = "autocomplete";
public static final String SORT = "sort";

Expand All @@ -18,6 +26,17 @@ public void configure(ElasticsearchAnalysisConfigurationContext context) {
.tokenizer("standard")
.tokenFilters("lowercase", "asciifolding", "stemmer")
.charFilters("html_strip");

context.analyzer(DEFAULT_SEARCH).custom()
.tokenizer("standard")
.tokenFilters("lowercase", "asciifolding", "stemmer", "synonyms_graph_filter")
.charFilters("html_strip");
context.tokenFilter("synonyms_graph_filter")
// See https://www.elastic.co/guide/en/elasticsearch/reference/8.11/analysis-synonym-graph-tokenfilter.html#analysis-synonym-graph-tokenfilter
// synonym_graph works better with multi-word synonyms
.type("synonym_graph")
.param("synonyms", SYNONYMS);

context.analyzer(AUTOCOMPLETE).custom()
.tokenizer("standard")
.tokenFilters("lowercase", "asciifolding", "stemmer", "autocomplete_edge_ngram")
Expand Down
231 changes: 231 additions & 0 deletions src/test/java/io/quarkus/search/app/SynonymSearchServiceTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
package io.quarkus.search.app;

import static io.restassured.RestAssured.given;
import static io.restassured.RestAssured.when;
import static org.assertj.core.api.Assertions.assertThat;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.TestInstance;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;

import io.quarkus.search.app.dto.GuideSearchHit;
import io.quarkus.search.app.dto.SearchResult;
import io.quarkus.search.app.quarkusio.QuarkusIO;
import io.quarkus.search.app.testsupport.GitTestUtils;
import io.quarkus.search.app.util.CloseableDirectory;
import io.quarkus.test.junit.QuarkusTest;
import io.quarkus.test.junit.QuarkusTestProfile;
import io.quarkus.test.junit.TestProfile;
import io.restassured.RestAssured;
import io.restassured.common.mapper.TypeRef;
import io.restassured.filter.log.LogDetail;
import org.apache.commons.io.file.PathUtils;
import org.awaitility.Awaitility;
import org.eclipse.jgit.api.Git;
import org.eclipse.jgit.api.errors.GitAPIException;
import org.eclipse.jgit.revwalk.RevCommit;

@QuarkusTest
@TestProfile(SynonymSearchServiceTest.Profile.class)
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
class SynonymSearchServiceTest {
// Type token handed to REST Assured's as(...) in searchHitSearchResult so the response body
// is deserialized as SearchResult<GuideSearchHit> rather than a raw SearchResult.
private static final TypeRef<SearchResult<GuideSearchHit>> SEARCH_RESULT_SEARCH_HITS = new TypeRef<>() {
};
// Relative path of the endpoint that searchHitSearchResult issues GET requests against.
private static final String GUIDES_SEARCH = "api/guides/search";

/**
 * Picks the port used to reach the health endpoint in {@code waitForIndexing()}.
 *
 * @return 9000 when the runtime class name ends with "IT", 9001 otherwise
 */
protected int managementPort() {
    boolean nameEndsWithIT = getClass().getName().endsWith("IT");
    return nameEndsWithIT ? 9000 : 9001;
}

// Unfortunately we can't use @TempDir here,
// because we need the path initialized before we create the extension below.
static CloseableDirectory tmpDir;

static {
try {
tmpDir = CloseableDirectory.temp("synonym-service-test");
} catch (IOException e) {
throw new RuntimeException("Could not init temp directory: " + e.getMessage(), e);
}
}

/**
 * Test profile that points the application at the sample git repository
 * created by {@code initOrigin()} inside {@code tmpDir}.
 */
public static class Profile implements QuarkusTestProfile {
    /**
     * Initializes the sample git repository, then overrides {@code quarkusio.git-uri}
     * with the path of the temporary directory that holds it.
     *
     * @return a map containing the single {@code quarkusio.git-uri} override
     * @throws IllegalStateException if the sample repository cannot be created
     */
    @Override
    public Map<String, String> getConfigOverrides() {
        try {
            initOrigin();
        } catch (IOException | GitAPIException e) {
            // Keep the original exception as the cause so the failing git/IO call stays visible.
            throw new IllegalStateException("Unable to initialize sample git repository: " + e.getMessage(), e);
        }
        Map<String, String> config = new HashMap<>();
        config.put("quarkusio.git-uri", tmpDir.path().toString());
        return config;
    }

    /**
     * @return the name of the configuration profile used by this test, {@code synonyms-profile}
     */
    @Override
    public String getConfigProfile() {
        return "synonyms-profile";
    }
}

/**
 * Builds the sample git repository inside {@code tmpDir}.
 * <p>
 * The repository is initialized with {@code QuarkusIO.PAGES_BRANCH} as its initial branch and an
 * empty initial commit. Two commits on that branch then write the guide HTML file (first a
 * placeholder, then the real content). Finally {@code QuarkusIO.SOURCE_BRANCH} is created from
 * the initial commit and receives one commit containing the metadata YAML.
 *
 * @throws IOException if a file or directory cannot be written
 * @throws GitAPIException if a git operation fails
 */
static void initOrigin() throws IOException, GitAPIException {
    Path repoRoot = tmpDir.path();
    Path metadataFile = repoRoot.resolve("_data/versioned/latest/index/quarkus.yaml");
    Path guideHtmlFile = repoRoot.resolve("guides/" + FETCHED_GUIDE_1_NAME + ".html");

    try (Git git = Git.init()
            .setDirectory(repoRoot.toFile())
            .setInitialBranch(QuarkusIO.PAGES_BRANCH)
            .call()) {
        GitTestUtils.cleanGitUserConfig();

        // Empty root commit: serves as the start point of the source branch further down.
        RevCommit rootCommit = git.commit()
                .setMessage("Initial commit")
                .setAllowEmpty(true)
                .call();

        // Pages branch: placeholder content first, then the real guide HTML.
        PathUtils.createParentDirectories(guideHtmlFile);
        writeAndCommitAll(git, guideHtmlFile, "initial", "Pages first commit");
        writeAndCommitAll(git, guideHtmlFile, FETCHED_GUIDE_1_CONTENT_HTML, "Pages second commit");

        // Source branch: forked from the root commit, carries only the metadata file.
        git.checkout()
                .setName(QuarkusIO.SOURCE_BRANCH)
                .setCreateBranch(true)
                .setStartPoint(rootCommit)
                .call();

        PathUtils.createParentDirectories(metadataFile);
        writeAndCommitAll(git, metadataFile, METADATA_YAML, "Source first commit");
    }
}

/** Writes {@code content} to {@code file}, stages everything under "." and commits with {@code message}. */
private static void writeAndCommitAll(Git git, Path file, String content, String message)
        throws IOException, GitAPIException {
    Files.writeString(file, content);
    git.add().addFilepattern(".").call();
    git.commit().setMessage(message).call();
}

/**
 * Polls the readiness endpoint on the management port for up to one minute until it answers
 * with HTTP 200, then turns on REST Assured request/response body logging for failed validations.
 */
@BeforeAll
void waitForIndexing() {
    String readinessUrl = "http://localhost:" + managementPort() + "/q/health/ready";
    Awaitility.await()
            .timeout(Duration.ofMinutes(1))
            .untilAsserted(() -> when().get(readinessUrl).then().statusCode(200));
    RestAssured.enableLoggingOfRequestAndResponseIfValidationFails(LogDetail.BODY);
}

/**
 * Closes the temporary directory once all tests have run; does nothing if it was never created.
 *
 * @throws IOException if closing the directory fails
 */
@AfterAll
void deleteTmpDir() throws IOException {
    if (tmpDir == null) {
        return;
    }
    tmpDir.close();
}

/**
 * Searches for {@code query} and checks that the titles of the returned hits include {@code result}.
 *
 * @param query the search string sent to the endpoint
 * @param result the expected title, including highlight markup
 */
@ParameterizedTest
@MethodSource
void synonymsTitle(String query, String result) {
    var hits = searchHitSearchResult(query).hits();
    assertThat(hits)
            .extracting(GuideSearchHit::title)
            .contains(result);
}

/**
 * Supplies (query, expected highlighted title) pairs for {@code synonymsTitle(String, String)}.
 */
private List<? extends Arguments> synonymsTitle() {
    // "rest easy" and "rest api" are both expected to highlight only RESTEasy in the title.
    String onlyRestEasyHighlighted = "A title with DevServices in it as well as Vert.x and <span class=\"highlighted\">RESTEasy</span>";

    return List.of(
            Arguments.of(
                    "REST Development Service",
                    "A title with <span class=\"highlighted\">DevServices</span> in it as well as Vert.x and <span class=\"highlighted\">RESTEasy</span>"),
            Arguments.of(
                    "rest easy",
                    onlyRestEasyHighlighted),
            Arguments.of(
                    "vertx",
                    "A title with DevServices in it as well as <span class=\"highlighted\">Vert.x</span> and RESTEasy"),
            Arguments.of(
                    "rest api",
                    onlyRestEasyHighlighted));
}

/**
 * Searches for {@code query} and checks that the content of the returned hits includes {@code result}.
 *
 * @param query the search string sent to the endpoint
 * @param result the expected set of highlighted content snippets
 */
@ParameterizedTest
@MethodSource
void synonymsContent(String query, Set<String> result) {
    var hits = searchHitSearchResult(query).hits();
    assertThat(hits)
            .extracting(GuideSearchHit::content)
            .contains(result);
}

/**
 * Supplies (query, expected highlighted content snippets) pairs for
 * {@code synonymsContent(String, Set)}.
 */
private List<? extends Arguments> synonymsContent() {
    // Expected for both "Development Service" and "dev Service".
    Set<String> devServiceSnippets = Set.of(
            "Quarkus supports a feature called <span class=\"highlighted\">DevServices</span> that allows you to start various containers",
            "This page lists all the <span class=\"highlighted\">Dev</span> <span class=\"highlighted\">Services</span> that Quarkus supports.");
    // Expected for both "rest easy" and "rest api".
    Set<String> restSnippets = Set.of(
            "<span class=\"highlighted\">RESTEasy</span> Classic Writing <span class=\"highlighted\">REST</span> Services with <span class=\"highlighted\">RESTEasy</span> Reactive.",
            "reactive <span class=\"highlighted\">REST</span> client.");
    Set<String> vertxSnippets = Set.of(
            "the quarkus-<span class=\"highlighted\">vertx</span> extension to your project.",
            "Migrating to RESTEasy Reactive Access the <span class=\"highlighted\">Vert.x</span> instance To access the managed <span class=\"highlighted\">Vert.x</span> instance, add");

    return List.of(
            Arguments.of("Development Service", devServiceSnippets),
            Arguments.of("dev Service", devServiceSnippets),
            Arguments.of("rest easy", restSnippets),
            Arguments.of("vertx", vertxSnippets),
            Arguments.of("rest api", restSnippets));
}

/**
 * Sends a GET request to the guides search endpoint with {@code q} as the query and
 * {@code contentSnippets=2}, requires an HTTP 200 answer and deserializes the body.
 *
 * @param q the search string
 * @return the deserialized search result
 */
private static SearchResult<GuideSearchHit> searchHitSearchResult(String q) {
    var request = given()
            .queryParam("q", q)
            .queryParam("contentSnippets", 2);
    var response = request.when().get(GUIDES_SEARCH);
    return response.then()
            .statusCode(200)
            .extract()
            .body()
            .as(SEARCH_RESULT_SEARCH_HITS);
}

// Guide metadata that initOrigin() writes to _data/versioned/latest/index/quarkus.yaml and
// commits on QuarkusIO.SOURCE_BRANCH. It declares a single reference guide ("foo") whose title
// mentions DevServices, Vert.x and RESTEasy, while its summary states it contains no synonym words.
private static final String METADATA_YAML = """
# Generated file. Do not edit
---
types:
reference:
- title: A title with DevServices in it as well as Vert.x and RESTEasy
filename: foo.adoc
summary: This is a summary without words that are in synonyms list
categories: "category1, category2"
keywords: keyword1 keyword2
topics:
- topic1
- topic2
extensions:
- io.quarkus:extension1
- io.quarkus:extension2
id: foo
type: reference
url: /guides/foo
""";

// Base name of the guide; initOrigin() uses it to build the path guides/<name>.html.
private static final String FETCHED_GUIDE_1_NAME = "foo";
// HTML that initOrigin() writes to the guide file in the "Pages second commit".
// The body mentions RESTEasy/REST, Vert.x/vertx and Dev Services/DevServices, which are the
// terms the content expectations in synonymsContent() highlight.
private static final String FETCHED_GUIDE_1_CONTENT_HTML = """
<html>
<head></head>
<body>
<h1></h1>
<p>This is the guide body
This guide shows how you can use virtual threads with RESTEasy Reactive and the reactive REST client. Learn more about virtual threads support on.
<h2>RESTEasy Classic</h2>
Writing REST Services with RESTEasy Reactive.
Migrating to RESTEasy Reactive
<h2>Access the Vert.x instance</h2>
To access the managed Vert.x instance, add the quarkus-vertx extension to your project.
<p>
This page lists all the Dev Services that Quarkus supports.
If you need multiple (shared) servers, you can configure the quarkus.elasticsearch.devservices.service-name attribute and indicate the server name.
Quarkus supports a feature called DevServices that allows you to start various containers
""";
}

0 comments on commit 1a3f256

Please sign in to comment.