Skip to content

Commit

Permalink
Deprecate CamelCase PathHierarchy tokenizer name (#10894)
Browse files Browse the repository at this point in the history
Deprecate CamelCase PathHierarchy tokenizer name in favor to lowercase path_hierarchy.

Signed-off-by: Lukáš Vlček <[email protected]>
  • Loading branch information
lukas-vlcek authored Nov 8, 2023
1 parent 4d8c228 commit a782f4f
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 4 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Return 409 Conflict HTTP status instead of 503 on failure to concurrently execute snapshots ([#8986](https://github.com/opensearch-project/OpenSearch/pull/5855))
- Add task completion count in search backpressure stats API ([#10028](https://github.com/opensearch-project/OpenSearch/pull/10028/))
- Performance improvement for Datetime field caching ([#4558](https://github.com/opensearch-project/OpenSearch/issues/4558))
- Deprecate CamelCase `PathHierarchy` tokenizer name in favor to lowercase `path_hierarchy` ([#10894](https://github.com/opensearch-project/OpenSearch/pull/10894))


### Deprecated
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,17 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
// TODO deprecate and remove in API
tokenizers.put("lowercase", XLowerCaseTokenizerFactory::new);
tokenizers.put("path_hierarchy", PathHierarchyTokenizerFactory::new);
tokenizers.put("PathHierarchy", PathHierarchyTokenizerFactory::new);
tokenizers.put("PathHierarchy", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
// TODO Remove "PathHierarchy" tokenizer name in 4.0 and throw exception
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_3_0_0)) {
deprecationLogger.deprecate(
"PathHierarchy_tokenizer_deprecation",
"The [PathHierarchy] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [path_hierarchy] instead."
);
}
return new PathHierarchyTokenizerFactory(indexSettings, environment, name, settings);
});
tokenizers.put("pattern", PatternTokenizerFactory::new);
tokenizers.put("uax_url_email", UAX29URLEmailTokenizerFactory::new);
tokenizers.put("whitespace", WhitespaceTokenizerFactory::new);
Expand Down Expand Up @@ -662,8 +672,17 @@ public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
}
return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
}));
tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new));

tokenizers.add(PreConfiguredTokenizer.openSearchVersion("PathHierarchy", (version) -> {
// TODO Remove "PathHierarchy" tokenizer name in 4.0 and throw exception
if (version.onOrAfter(Version.V_3_0_0)) {
deprecationLogger.deprecate(
"PathHierarchy_tokenizer_deprecation",
"The [PathHierarchy] tokenizer name is deprecated and will be removed in a future version. "
+ "Please change the tokenizer name to [path_hierarchy] instead."
);
}
return new PathHierarchyTokenizer();
}));
return tokenizers;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,61 @@
import com.carrotsearch.randomizedtesting.generators.RandomPicks;

import org.apache.lucene.analysis.Tokenizer;
import org.opensearch.Version;
import org.opensearch.cluster.metadata.IndexMetadata;
import org.opensearch.common.settings.Settings;
import org.opensearch.core.index.Index;
import org.opensearch.env.Environment;
import org.opensearch.env.TestEnvironment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.IndexAnalyzers;
import org.opensearch.index.analysis.NamedAnalyzer;
import org.opensearch.indices.analysis.AnalysisModule;
import org.opensearch.test.IndexSettingsModule;
import org.opensearch.test.OpenSearchTokenStreamTestCase;
import org.opensearch.test.VersionUtils;

import java.io.IOException;
import java.io.StringReader;
import java.util.Collections;

public class PathHierarchyTokenizerFactoryTests extends OpenSearchTokenStreamTestCase {

private IndexAnalyzers buildAnalyzers(Version version, String tokenizer) throws IOException {
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
Settings indexSettings = Settings.builder()
.put(IndexMetadata.SETTING_VERSION_CREATED, version)
.put("index.analysis.analyzer.my_analyzer.tokenizer", tokenizer)
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
return new AnalysisModule(TestEnvironment.newEnvironment(settings), Collections.singletonList(new CommonAnalysisModulePlugin()))
.getAnalysisRegistry()
.build(idxSettings);
}

/**
* Test that deprecated "PathHierarchy" tokenizer name is still available via {@link CommonAnalysisModulePlugin} starting in 3.x.
*/
public void testPreConfiguredTokenizer() throws IOException {

{
try (
IndexAnalyzers indexAnalyzers = buildAnalyzers(
VersionUtils.randomVersionBetween(random(), Version.V_3_0_0, Version.CURRENT),
"PathHierarchy"
)
) {
NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
assertNotNull(analyzer);
assertTokenStreamContents(analyzer.tokenStream("dummy", "/a/b/c"), new String[] { "/a", "/a/b", "/a/b/c" });
// Once LUCENE-12750 is fixed we can use the following testing method instead.
// Similar testing approach has been used for deprecation of (Edge)NGrams tokenizers as well.
// assertAnalyzesTo(analyzer, "/a/b/c", new String[] { "/a", "/a/b", "/a/b/c" });

}
}
}

public void testDefaults() throws IOException {
final Index index = new Index("test", "_na_");
final Settings indexSettings = newAnalysisSettingsBuilder().build();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,9 @@

---
"path_hierarchy":
- skip:
features: "allowed_warnings"

- do:
indices.analyze:
body:
Expand All @@ -312,6 +315,8 @@
- match: { detail.tokenizer.tokens.2.token: a/b/c }

- do:
allowed_warnings:
- 'The [PathHierarchy] tokenizer name is deprecated and will be removed in a future version. Please change the tokenizer name to [path_hierarchy] instead.'
indices.analyze:
body:
text: "a/b/c"
Expand All @@ -337,11 +342,13 @@
- match: { detail.tokenizer.tokens.2.token: a/b/c }

- do:
allowed_warnings:
- 'The [PathHierarchy] tokenizer name is deprecated and will be removed in a future version. Please change the tokenizer name to [path_hierarchy] instead.'
indices.analyze:
body:
text: "a/b/c"
explain: true
tokenizer: PathHierarchy
tokenizer: PathHierarchy
- length: { detail.tokenizer.tokens: 3 }
- match: { detail.tokenizer.name: PathHierarchy }
- match: { detail.tokenizer.tokens.0.token: a }
Expand Down

0 comments on commit a782f4f

Please sign in to comment.