Skip to content

Commit

Permalink
Merge pull request #183 from marko-bekhta/some-analysis-config-cleanups
Browse files Browse the repository at this point in the history
(hopefully) Better grouping of analyzer configs
  • Loading branch information
yrodiere authored Feb 15, 2024
2 parents dee6145 + 6eadf83 commit c693837
Showing 1 changed file with 193 additions and 83 deletions.
276 changes: 193 additions & 83 deletions src/main/java/io/quarkus/search/app/hibernate/AnalysisConfigurer.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,123 +43,238 @@ public static String autocompleteAnalyzer(Language language) {
return language.addSuffix(AUTOCOMPLETE);
}

private static String stopFilter(Language language) {
return "stop_%s".formatted(language.code);
}

private static String regularStemmerFilter(Language language) {
return "stemmer_%s".formatted(language.code);
}

private static String possessiveStemmerFilter(Language language) {
return "possessive_stemmer_%s".formatted(language.code);
}

private static String autocompleteEdgeNgramFilter(Language language) {
return "autocomplete_edge_ngram_%s".formatted(language.code);
}

private static String synonymsGraphFilter(Language language) {
return "synonyms_graph_filter_%s".formatted(language.code);
}

private static String compoundTechnicalNameFilter(Language language) {
return "compound_technical_name_filter_%s".formatted(language.code);
}

@Override
public void configure(ElasticsearchAnalysisConfigurationContext context) {
// for en/es/pt we are going to use the same english configuration since guides are not translated
EnumSet<Language> englishLanguages = EnumSet.of(Language.ENGLISH, Language.PORTUGUESE, Language.SPANISH);

for (Language language : englishLanguages) {
SharedFilters result = sharedFilters(context, language);

context.analyzer(defaultAnalyzer(language)).custom()
.tokenizer("standard")
.tokenFilters(result.compoundTechnicalNameFilter(), "lowercase", result.possessiveStemmer(), result.stop(),
result.regularStemmer(), "asciifolding")
.charFilters("html_strip");
context.analyzer(defaultSearchAnalyzer(language)).custom()
.tokenizer("standard")
// > In general, synonym filters rewrite their inputs to the tokenizer and filters used in the preceding analysis chain
// Note how the synonym filter is added in the end. According to https://www.elastic.co/blog/boosting-the-power-of-elasticsearch-with-synonyms
// preceding filters should get applied to the synonyms we passed to it, so we don't need to bother about normalizing them in some way:
.tokenFilters("lowercase", result.possessiveStemmer(), result.stop(),
result.regularStemmer(),
"asciifolding",
result.synonymsGraphFilter())
.charFilters("html_strip");
context.analyzer(autocompleteAnalyzer(language)).custom()
.tokenizer("standard")
.tokenFilters(result.compoundTechnicalNameFilter(), "lowercase", result.possessiveStemmer(), result.stop(),
result.regularStemmer(), "asciifolding", result.autocompleteEdgeNgram())
.charFilters("html_strip");
context.normalizer(language.addSuffix(SORT)).custom()
.tokenFilters("lowercase");
configureEnglishLikeLanguage(context, language);
}

// japanese
// Japanese
// https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji-analyzer.html
SharedFilters japanese = sharedFilters(context, Language.JAPANESE);
context.analyzer(defaultAnalyzer(Language.JAPANESE)).custom()
configureJapanese(context);

// Chinese
// https://www.elastic.co/guide/en/elasticsearch/plugins/current/_reimplementing_and_extending_the_analyzers.html
configureChinese(context);
}

void configureEnglishLikeLanguage(ElasticsearchAnalysisConfigurationContext context, Language language) {
configureSharedFilters(context, language);
// The default analyzer to prepare tokens for when the search is performed for the full words.
// To be applied to the indexed text.
context.analyzer(defaultAnalyzer(language)).custom()
.tokenizer("standard")
.tokenFilters(
// A pattern capture filter to address FQCNs
compoundTechnicalNameFilter(language),
// To make all words in lowercase.
"lowercase",
// To remove possessives (trailing 's) from words.
possessiveStemmerFilter(language),
// To remove frequently used words that do not bring much meaning, e.g. a, that, and, are, as, at, with...
stopFilter(language),
// To remove suffixes like -s/-es/-ed etc
regularStemmerFilter(language),
// To convert characters into ascii ones, e.g. à to a or ę to e etc.
"asciifolding")
.charFilters("html_strip");

// The analyzer to be applied to the user-input text.
context.analyzer(defaultSearchAnalyzer(language)).custom()
.tokenizer("standard")
.tokenFilters(
"lowercase",
possessiveStemmerFilter(language),
stopFilter(language),
regularStemmerFilter(language),
"asciifolding",
// > In general, synonym filters rewrite their inputs to the tokenizer and filters used in the preceding analysis chain
// Note how the synonym filter is added in the end. According to https://www.elastic.co/blog/boosting-the-power-of-elasticsearch-with-synonyms
// preceding filters should get applied to the synonyms we passed to it, so we don't need to bother about normalizing them in some way:
synonymsGraphFilter(language))
.charFilters("html_strip");

// The autocomplete analyzer to prepare tokens for when the user types the words,
// and we want to produce results on incompletely typed (unfinished) words.
// To get that we add an edge Ngram filter in the end of the list of filters of the default analyzer
// to make sure that we get the same tokens processed into Ngrams.
// To be applied to the indexed text.
context.analyzer(autocompleteAnalyzer(language)).custom()
.tokenizer("standard")
.tokenFilters(
compoundTechnicalNameFilter(language),
"lowercase",
possessiveStemmerFilter(language),
stopFilter(language),
regularStemmerFilter(language),
"asciifolding",
autocompleteEdgeNgramFilter(language))
.charFilters("html_strip");

context.normalizer(language.addSuffix(SORT)).custom()
.tokenFilters("lowercase");
}

void configureJapanese(ElasticsearchAnalysisConfigurationContext context) {
Language language = Language.JAPANESE;
configureSharedFilters(context, language);

context.analyzer(defaultAnalyzer(language)).custom()
.tokenizer("kuromoji_tokenizer")
.tokenFilters(japanese.compoundTechnicalNameFilter(), "lowercase", "kuromoji_baseform",
"kuromoji_part_of_speech", japanese.possessiveStemmer(), "ja_stop", japanese.stop(), "kuromoji_stemmer",
japanese.regularStemmer(), "asciifolding")
.charFilters("icu_normalizer", "html_strip");
context.analyzer(defaultSearchAnalyzer(Language.JAPANESE)).custom()
.tokenFilters(
compoundTechnicalNameFilter(language),
"lowercase",
"kuromoji_baseform",
"kuromoji_part_of_speech",
possessiveStemmerFilter(language),
"ja_stop",
stopFilter(language),
"kuromoji_stemmer",
regularStemmerFilter(language),
"asciifolding")
.charFilters(
// If a text contains full-width characters, the kuromoji_tokenizer tokenizer can produce unexpected tokens.
// To avoid this, the icu_normalizer character filter is added.
// Requires icu plugin being installed.
"icu_normalizer",
"html_strip");

context.analyzer(defaultSearchAnalyzer(language)).custom()
.tokenizer("kuromoji_tokenizer")
// > In general, synonym filters rewrite their inputs to the tokenizer and filters used in the preceding analysis chain
// Note how the synonym filter is added in the end. According to https://www.elastic.co/blog/boosting-the-power-of-elasticsearch-with-synonyms
// preceding filters should get applied to the synonyms we passed to it, so we don't need to bother about normalizing them in some way:
.tokenFilters("lowercase", "kuromoji_baseform", "kuromoji_part_of_speech", japanese.possessiveStemmer(),
"ja_stop", japanese.stop(), "kuromoji_stemmer", japanese.regularStemmer(),
.tokenFilters(
"lowercase",
"kuromoji_baseform",
"kuromoji_part_of_speech",
possessiveStemmerFilter(language),
"ja_stop",
stopFilter(language),
"kuromoji_stemmer",
regularStemmerFilter(language),
"asciifolding",
japanese.synonymsGraphFilter())
.charFilters("icu_normalizer", "html_strip");
context.analyzer(autocompleteAnalyzer(Language.JAPANESE)).custom()
synonymsGraphFilter(language))
.charFilters(
"icu_normalizer",
"html_strip");

context.analyzer(autocompleteAnalyzer(language)).custom()
.tokenizer("kuromoji_tokenizer")
.tokenFilters(japanese.compoundTechnicalNameFilter(), "lowercase", "kuromoji_baseform",
"kuromoji_part_of_speech", japanese.possessiveStemmer(), "ja_stop", japanese.stop(), "kuromoji_stemmer",
japanese.regularStemmer(), "asciifolding", japanese.autocompleteEdgeNgram())
.charFilters("icu_normalizer", "html_strip");
context.normalizer(Language.JAPANESE.addSuffix(SORT)).custom()
.tokenFilters(
compoundTechnicalNameFilter(language),
"lowercase",
"kuromoji_baseform",
"kuromoji_part_of_speech",
possessiveStemmerFilter(language),
"ja_stop",
stopFilter(language),
"kuromoji_stemmer",
regularStemmerFilter(language),
"asciifolding",
autocompleteEdgeNgramFilter(language))
.charFilters(
"icu_normalizer",
"html_strip");

context.normalizer(language.addSuffix(SORT)).custom()
.tokenFilters("lowercase");
}

// chinese
// https://www.elastic.co/guide/en/elasticsearch/plugins/current/_reimplementing_and_extending_the_analyzers.html
SharedFilters chinese = sharedFilters(context, Language.CHINESE);
context.analyzer(defaultAnalyzer(Language.CHINESE)).custom()
void configureChinese(ElasticsearchAnalysisConfigurationContext context) {
Language language = Language.CHINESE;
configureSharedFilters(context, language);
// The default analyzer to prepare tokens for when the search is performed for the full words.
// To be applied to the indexed text.
context.analyzer(defaultAnalyzer(language)).custom()
.tokenizer("smartcn_tokenizer")
.tokenFilters(chinese.compoundTechnicalNameFilter(), "lowercase", chinese.possessiveStemmer(), "smartcn_stop",
chinese.stop(), chinese.regularStemmer(), "asciifolding")
.tokenFilters(
compoundTechnicalNameFilter(language),
"lowercase",
possessiveStemmerFilter(language),
"smartcn_stop",
stopFilter(language),
regularStemmerFilter(language),
"asciifolding")
.charFilters("html_strip");
context.analyzer(defaultSearchAnalyzer(Language.CHINESE)).custom()

// The analyzer to be applied to the user-input text.
context.analyzer(defaultSearchAnalyzer(language)).custom()
.tokenizer("smartcn_tokenizer")
// > In general, synonym filters rewrite their inputs to the tokenizer and filters used in the preceding analysis chain
// Note how the synonym filter is added in the end. According to https://www.elastic.co/blog/boosting-the-power-of-elasticsearch-with-synonyms
// preceding filters should get applied to the synonyms we passed to it, so we don't need to bother about normalizing them in some way.
//
// NOTE: see how `smartcn_stop` filter goes after the synonyms, placing it next to the regular stop filter leads to
// startup failures, as schema cannot be created.
.tokenFilters("lowercase", chinese.possessiveStemmer(), chinese.stop(),
chinese.regularStemmer(),
.tokenFilters(
"lowercase",
possessiveStemmerFilter(language),
stopFilter(language),
regularStemmerFilter(language),
"asciifolding",
chinese.synonymsGraphFilter(), "smartcn_stop")
synonymsGraphFilter(language),
// must go last as it conflicts wit the synonyms filter.
"smartcn_stop")
.charFilters("html_strip");
context.analyzer(autocompleteAnalyzer(Language.CHINESE)).custom()

context.analyzer(autocompleteAnalyzer(language)).custom()
.tokenizer("smartcn_tokenizer")
.tokenFilters(chinese.compoundTechnicalNameFilter(), "lowercase", chinese.possessiveStemmer(), "smartcn_stop",
chinese.stop(), chinese.regularStemmer(), "asciifolding", chinese.autocompleteEdgeNgram())
.tokenFilters(
compoundTechnicalNameFilter(language),
"lowercase",
possessiveStemmerFilter(language),
"smartcn_stop",
stopFilter(language),
regularStemmerFilter(language),
"asciifolding",
autocompleteEdgeNgramFilter(language))
.charFilters("html_strip");
context.normalizer(Language.CHINESE.addSuffix(SORT)).custom()

context.normalizer(language.addSuffix(SORT)).custom()
.tokenFilters("lowercase");

}

private static SharedFilters sharedFilters(ElasticsearchAnalysisConfigurationContext context, Language language) {
String stop = "stop_%s".formatted(language.code);
String regularStemmer = "stemmer_%s".formatted(language.code);
String possessiveStemmer = "possessive_stemmer_%s".formatted(language.code);
String autocompleteEdgeNgram = "autocomplete_edge_ngram_%s".formatted(language.code);
String synonymsGraphFilter = "synonyms_graph_filter_%s".formatted(language.code);
String compoundTechnicalNameFilter = "compound_technical_name_filter_%s".formatted(language.code);
context.tokenFilter(stop)
private static void configureSharedFilters(ElasticsearchAnalysisConfigurationContext context, Language language) {
context.tokenFilter(stopFilter(language))
.type("stop")
.param("stopwords", "_english_")
.param("ignore_case", "true");
context.tokenFilter(regularStemmer)
context.tokenFilter(regularStemmerFilter(language))
.type("stemmer")
.param("language", "english");
context.tokenFilter(possessiveStemmer)
context.tokenFilter(possessiveStemmerFilter(language))
.type("stemmer")
.param("language", "possessive_english");
context.tokenFilter(autocompleteEdgeNgram)
context.tokenFilter(autocompleteEdgeNgramFilter(language))
.type("edge_ngram")
.param("min_gram", 2)
.param("max_gram", 70);
context.tokenFilter(synonymsGraphFilter)
context.tokenFilter(synonymsGraphFilter(language))
// See https://www.elastic.co/guide/en/elasticsearch/reference/8.11/analysis-synonym-graph-tokenfilter.html#analysis-synonym-graph-tokenfilter
// synonym_graph works better with multi-word synonyms
.type("synonym_graph")
.param("synonyms", SYNONYMS);
context.tokenFilter(compoundTechnicalNameFilter)
context.tokenFilter(compoundTechnicalNameFilter(language))
.type("pattern_capture")
.param("preserve_original", true)
// This decomposes io.quarkus.deployment.builditem.AdditionalIndexedClassesBuildItem into
Expand All @@ -168,11 +283,6 @@ private static SharedFilters sharedFilters(ElasticsearchAnalysisConfigurationCon
// AdditionalIndexedClassesBuildItem
// ]
.param("patterns", SIMPLIFIED_JAVA_CLASS_NAME_CAPTURE_PATTERN.pattern());
return new SharedFilters(stop, regularStemmer, possessiveStemmer, autocompleteEdgeNgram,
synonymsGraphFilter, compoundTechnicalNameFilter);
}

private record SharedFilters(String stop, String regularStemmer, String possessiveStemmer, String autocompleteEdgeNgram,
String synonymsGraphFilter, String compoundTechnicalNameFilter) {
}
}

0 comments on commit c693837

Please sign in to comment.