Merge pull request #183 from marko-bekhta/some-analysis-config-cleanups

(hopefully) Better grouping of analyzer configs
quarkusio · Feb 15, 2024 · c693837 · c693837
2 parents dee6145 + 6eadf83
commit c693837
Showing 1 changed file with 193 additions and 83 deletions.
diff --git a/src/main/java/io/quarkus/search/app/hibernate/AnalysisConfigurer.java b/src/main/java/io/quarkus/search/app/hibernate/AnalysisConfigurer.java
@@ -43,123 +43,238 @@ public static String autocompleteAnalyzer(Language language) {
         return language.addSuffix(AUTOCOMPLETE);
     }
 
+    private static String stopFilter(Language language) {
+        return "stop_%s".formatted(language.code);
+    }
+
+    private static String regularStemmerFilter(Language language) {
+        return "stemmer_%s".formatted(language.code);
+    }
+
+    private static String possessiveStemmerFilter(Language language) {
+        return "possessive_stemmer_%s".formatted(language.code);
+    }
+
+    private static String autocompleteEdgeNgramFilter(Language language) {
+        return "autocomplete_edge_ngram_%s".formatted(language.code);
+    }
+
+    private static String synonymsGraphFilter(Language language) {
+        return "synonyms_graph_filter_%s".formatted(language.code);
+    }
+
+    private static String compoundTechnicalNameFilter(Language language) {
+        return "compound_technical_name_filter_%s".formatted(language.code);
+    }
+
     @Override
     public void configure(ElasticsearchAnalysisConfigurationContext context) {
         // for en/es/pt we are going to use the same english configuration since guides are not translated
         EnumSet<Language> englishLanguages = EnumSet.of(Language.ENGLISH, Language.PORTUGUESE, Language.SPANISH);
 
         for (Language language : englishLanguages) {
-            SharedFilters result = sharedFilters(context, language);
-
-            context.analyzer(defaultAnalyzer(language)).custom()
-                    .tokenizer("standard")
-                    .tokenFilters(result.compoundTechnicalNameFilter(), "lowercase", result.possessiveStemmer(), result.stop(),
-                            result.regularStemmer(), "asciifolding")
-                    .charFilters("html_strip");
-            context.analyzer(defaultSearchAnalyzer(language)).custom()
-                    .tokenizer("standard")
-                    // > In general, synonym filters rewrite their inputs to the tokenizer and filters used in the preceding analysis chain
-                    // Note how the synonym filter is added in the end. According to https://www.elastic.co/blog/boosting-the-power-of-elasticsearch-with-synonyms
-                    // preceding filters should get applied to the synonyms we passed to it, so we don't need to bother about normalizing them in some way:
-                    .tokenFilters("lowercase", result.possessiveStemmer(), result.stop(),
-                            result.regularStemmer(),
-                            "asciifolding",
-                            result.synonymsGraphFilter())
-                    .charFilters("html_strip");
-            context.analyzer(autocompleteAnalyzer(language)).custom()
-                    .tokenizer("standard")
-                    .tokenFilters(result.compoundTechnicalNameFilter(), "lowercase", result.possessiveStemmer(), result.stop(),
-                            result.regularStemmer(), "asciifolding", result.autocompleteEdgeNgram())
-                    .charFilters("html_strip");
-            context.normalizer(language.addSuffix(SORT)).custom()
-                    .tokenFilters("lowercase");
+            configureEnglishLikeLanguage(context, language);
         }
 
-        // japanese
+        // Japanese
         // https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji-analyzer.html
-        SharedFilters japanese = sharedFilters(context, Language.JAPANESE);
-        context.analyzer(defaultAnalyzer(Language.JAPANESE)).custom()
+        configureJapanese(context);
+
+        // Chinese
+        // https://www.elastic.co/guide/en/elasticsearch/plugins/current/_reimplementing_and_extending_the_analyzers.html
+        configureChinese(context);
+    }
+
+    void configureEnglishLikeLanguage(ElasticsearchAnalysisConfigurationContext context, Language language) {
+        configureSharedFilters(context, language);
+        // The default analyzer to prepare tokens for when the search is performed for the full words.
+        // To be applied to the indexed text.
+        context.analyzer(defaultAnalyzer(language)).custom()
+                .tokenizer("standard")
+                .tokenFilters(
+                        // A pattern capture filter to address FQCNs
+                        compoundTechnicalNameFilter(language),
+                        // To make all words in lowercase.
+                        "lowercase",
+                        // To remove possessives (trailing 's) from words.
+                        possessiveStemmerFilter(language),
+                        // To remove frequently used words that do not bring much meaning, e.g. a, that, and, are, as, at, with...
+                        stopFilter(language),
+                        // To remove suffixes like -s/-es/-ed etc
+                        regularStemmerFilter(language),
+                        // To convert characters into ascii ones, e.g. à to a or ę to e etc.
+                        "asciifolding")
+                .charFilters("html_strip");
+
+        // The analyzer to be applied to the user-input text.
+        context.analyzer(defaultSearchAnalyzer(language)).custom()
+                .tokenizer("standard")
+                .tokenFilters(
+                        "lowercase",
+                        possessiveStemmerFilter(language),
+                        stopFilter(language),
+                        regularStemmerFilter(language),
+                        "asciifolding",
+                        // > In general, synonym filters rewrite their inputs to the tokenizer and filters used in the preceding analysis chain
+                        // Note how the synonym filter is added in the end. According to https://www.elastic.co/blog/boosting-the-power-of-elasticsearch-with-synonyms
+                        // preceding filters should get applied to the synonyms we passed to it, so we don't need to bother about normalizing them in some way:
+                        synonymsGraphFilter(language))
+                .charFilters("html_strip");
+
+        // The autocomplete analyzer to prepare tokens for when the user types the words,
+        //  and we want to produce results on incompletely typed (unfinished) words.
+        //  To get that we add an edge Ngram filter in the end of the list of filters of the default analyzer
+        //  to make sure that we get the same tokens processed into Ngrams.
+        //  To be applied to the indexed text.
+        context.analyzer(autocompleteAnalyzer(language)).custom()
+                .tokenizer("standard")
+                .tokenFilters(
+                        compoundTechnicalNameFilter(language),
+                        "lowercase",
+                        possessiveStemmerFilter(language),
+                        stopFilter(language),
+                        regularStemmerFilter(language),
+                        "asciifolding",
+                        autocompleteEdgeNgramFilter(language))
+                .charFilters("html_strip");
+
+        context.normalizer(language.addSuffix(SORT)).custom()
+                .tokenFilters("lowercase");
+    }
+
+    void configureJapanese(ElasticsearchAnalysisConfigurationContext context) {
+        Language language = Language.JAPANESE;
+        configureSharedFilters(context, language);
+
+        context.analyzer(defaultAnalyzer(language)).custom()
                 .tokenizer("kuromoji_tokenizer")
-                .tokenFilters(japanese.compoundTechnicalNameFilter(), "lowercase", "kuromoji_baseform",
-                        "kuromoji_part_of_speech", japanese.possessiveStemmer(), "ja_stop", japanese.stop(), "kuromoji_stemmer",
-                        japanese.regularStemmer(), "asciifolding")
-                .charFilters("icu_normalizer", "html_strip");
-        context.analyzer(defaultSearchAnalyzer(Language.JAPANESE)).custom()
+                .tokenFilters(
+                        compoundTechnicalNameFilter(language),
+                        "lowercase",
+                        "kuromoji_baseform",
+                        "kuromoji_part_of_speech",
+                        possessiveStemmerFilter(language),
+                        "ja_stop",
+                        stopFilter(language),
+                        "kuromoji_stemmer",
+                        regularStemmerFilter(language),
+                        "asciifolding")
+                .charFilters(
+                        // If a text contains full-width characters, the kuromoji_tokenizer tokenizer can produce unexpected tokens.
+                        // To avoid this, the icu_normalizer character filter is added.
+                        // Requires icu plugin being installed.
+                        "icu_normalizer",
+                        "html_strip");
+
+        context.analyzer(defaultSearchAnalyzer(language)).custom()
                 .tokenizer("kuromoji_tokenizer")
-                // > In general, synonym filters rewrite their inputs to the tokenizer and filters used in the preceding analysis chain
-                // Note how the synonym filter is added in the end. According to https://www.elastic.co/blog/boosting-the-power-of-elasticsearch-with-synonyms
-                // preceding filters should get applied to the synonyms we passed to it, so we don't need to bother about normalizing them in some way:
-                .tokenFilters("lowercase", "kuromoji_baseform", "kuromoji_part_of_speech", japanese.possessiveStemmer(),
-                        "ja_stop", japanese.stop(), "kuromoji_stemmer", japanese.regularStemmer(),
+                .tokenFilters(
+                        "lowercase",
+                        "kuromoji_baseform",
+                        "kuromoji_part_of_speech",
+                        possessiveStemmerFilter(language),
+                        "ja_stop",
+                        stopFilter(language),
+                        "kuromoji_stemmer",
+                        regularStemmerFilter(language),
                         "asciifolding",
-                        japanese.synonymsGraphFilter())
-                .charFilters("icu_normalizer", "html_strip");
-        context.analyzer(autocompleteAnalyzer(Language.JAPANESE)).custom()
+                        synonymsGraphFilter(language))
+                .charFilters(
+                        "icu_normalizer",
+                        "html_strip");
+
+        context.analyzer(autocompleteAnalyzer(language)).custom()
                 .tokenizer("kuromoji_tokenizer")
-                .tokenFilters(japanese.compoundTechnicalNameFilter(), "lowercase", "kuromoji_baseform",
-                        "kuromoji_part_of_speech", japanese.possessiveStemmer(), "ja_stop", japanese.stop(), "kuromoji_stemmer",
-                        japanese.regularStemmer(), "asciifolding", japanese.autocompleteEdgeNgram())
-                .charFilters("icu_normalizer", "html_strip");
-        context.normalizer(Language.JAPANESE.addSuffix(SORT)).custom()
+                .tokenFilters(
+                        compoundTechnicalNameFilter(language),
+                        "lowercase",
+                        "kuromoji_baseform",
+                        "kuromoji_part_of_speech",
+                        possessiveStemmerFilter(language),
+                        "ja_stop",
+                        stopFilter(language),
+                        "kuromoji_stemmer",
+                        regularStemmerFilter(language),
+                        "asciifolding",
+                        autocompleteEdgeNgramFilter(language))
+                .charFilters(
+                        "icu_normalizer",
+                        "html_strip");
+
+        context.normalizer(language.addSuffix(SORT)).custom()
                 .tokenFilters("lowercase");
+    }
 
-        // chinese
-        // https://www.elastic.co/guide/en/elasticsearch/plugins/current/_reimplementing_and_extending_the_analyzers.html
-        SharedFilters chinese = sharedFilters(context, Language.CHINESE);
-        context.analyzer(defaultAnalyzer(Language.CHINESE)).custom()
+    void configureChinese(ElasticsearchAnalysisConfigurationContext context) {
+        Language language = Language.CHINESE;
+        configureSharedFilters(context, language);
+        // The default analyzer to prepare tokens for when the search is performed for the full words.
+        // To be applied to the indexed text.
+        context.analyzer(defaultAnalyzer(language)).custom()
                 .tokenizer("smartcn_tokenizer")
-                .tokenFilters(chinese.compoundTechnicalNameFilter(), "lowercase", chinese.possessiveStemmer(), "smartcn_stop",
-                        chinese.stop(), chinese.regularStemmer(), "asciifolding")
+                .tokenFilters(
+                        compoundTechnicalNameFilter(language),
+                        "lowercase",
+                        possessiveStemmerFilter(language),
+                        "smartcn_stop",
+                        stopFilter(language),
+                        regularStemmerFilter(language),
+                        "asciifolding")
                 .charFilters("html_strip");
-        context.analyzer(defaultSearchAnalyzer(Language.CHINESE)).custom()
+
+        // The analyzer to be applied to the user-input text.
+        context.analyzer(defaultSearchAnalyzer(language)).custom()
                 .tokenizer("smartcn_tokenizer")
-                // > In general, synonym filters rewrite their inputs to the tokenizer and filters used in the preceding analysis chain
-                // Note how the synonym filter is added in the end. According to https://www.elastic.co/blog/boosting-the-power-of-elasticsearch-with-synonyms
-                // preceding filters should get applied to the synonyms we passed to it, so we don't need to bother about normalizing them in some way.
-                //
-                // NOTE: see how `smartcn_stop` filter goes after the synonyms, placing it next to the regular stop filter leads to
-                // startup failures, as schema cannot be created.
-                .tokenFilters("lowercase", chinese.possessiveStemmer(), chinese.stop(),
-                        chinese.regularStemmer(),
+                .tokenFilters(
+                        "lowercase",
+                        possessiveStemmerFilter(language),
+                        stopFilter(language),
+                        regularStemmerFilter(language),
                         "asciifolding",
-                        chinese.synonymsGraphFilter(), "smartcn_stop")
+                        synonymsGraphFilter(language),
+                        // must go last as it conflicts wit the synonyms filter.
+                        "smartcn_stop")
                 .charFilters("html_strip");
-        context.analyzer(autocompleteAnalyzer(Language.CHINESE)).custom()
+
+        context.analyzer(autocompleteAnalyzer(language)).custom()
                 .tokenizer("smartcn_tokenizer")
-                .tokenFilters(chinese.compoundTechnicalNameFilter(), "lowercase", chinese.possessiveStemmer(), "smartcn_stop",
-                        chinese.stop(), chinese.regularStemmer(), "asciifolding", chinese.autocompleteEdgeNgram())
+                .tokenFilters(
+                        compoundTechnicalNameFilter(language),
+                        "lowercase",
+                        possessiveStemmerFilter(language),
+                        "smartcn_stop",
+                        stopFilter(language),
+                        regularStemmerFilter(language),
+                        "asciifolding",
+                        autocompleteEdgeNgramFilter(language))
                 .charFilters("html_strip");
-        context.normalizer(Language.CHINESE.addSuffix(SORT)).custom()
+
+        context.normalizer(language.addSuffix(SORT)).custom()
                 .tokenFilters("lowercase");
+
     }
 
-    private static SharedFilters sharedFilters(ElasticsearchAnalysisConfigurationContext context, Language language) {
-        String stop = "stop_%s".formatted(language.code);
-        String regularStemmer = "stemmer_%s".formatted(language.code);
-        String possessiveStemmer = "possessive_stemmer_%s".formatted(language.code);
-        String autocompleteEdgeNgram = "autocomplete_edge_ngram_%s".formatted(language.code);
-        String synonymsGraphFilter = "synonyms_graph_filter_%s".formatted(language.code);
-        String compoundTechnicalNameFilter = "compound_technical_name_filter_%s".formatted(language.code);
-        context.tokenFilter(stop)
+    private static void configureSharedFilters(ElasticsearchAnalysisConfigurationContext context, Language language) {
+        context.tokenFilter(stopFilter(language))
                 .type("stop")
                 .param("stopwords", "_english_")
                 .param("ignore_case", "true");
-        context.tokenFilter(regularStemmer)
+        context.tokenFilter(regularStemmerFilter(language))
                 .type("stemmer")
                 .param("language", "english");
-        context.tokenFilter(possessiveStemmer)
+        context.tokenFilter(possessiveStemmerFilter(language))
                 .type("stemmer")
                 .param("language", "possessive_english");
-        context.tokenFilter(autocompleteEdgeNgram)
+        context.tokenFilter(autocompleteEdgeNgramFilter(language))
                 .type("edge_ngram")
                 .param("min_gram", 2)
                 .param("max_gram", 70);
-        context.tokenFilter(synonymsGraphFilter)
+        context.tokenFilter(synonymsGraphFilter(language))
                 // See https://www.elastic.co/guide/en/elasticsearch/reference/8.11/analysis-synonym-graph-tokenfilter.html#analysis-synonym-graph-tokenfilter
                 // synonym_graph works better with multi-word synonyms
                 .type("synonym_graph")
                 .param("synonyms", SYNONYMS);
-        context.tokenFilter(compoundTechnicalNameFilter)
+        context.tokenFilter(compoundTechnicalNameFilter(language))
                 .type("pattern_capture")
                 .param("preserve_original", true)
                 // This decomposes io.quarkus.deployment.builditem.AdditionalIndexedClassesBuildItem into
@@ -168,11 +283,6 @@ private static SharedFilters sharedFilters(ElasticsearchAnalysisConfigurationCon
                 // AdditionalIndexedClassesBuildItem
                 // ]
                 .param("patterns", SIMPLIFIED_JAVA_CLASS_NAME_CAPTURE_PATTERN.pattern());
-        return new SharedFilters(stop, regularStemmer, possessiveStemmer, autocompleteEdgeNgram,
-                synonymsGraphFilter, compoundTechnicalNameFilter);
     }
 
-    private record SharedFilters(String stop, String regularStemmer, String possessiveStemmer, String autocompleteEdgeNgram,
-            String synonymsGraphFilter, String compoundTechnicalNameFilter) {
-    }
 }