diff --git a/README.md b/README.md index 7019441..b5109fd 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,8 @@ Check this post: [How to build Elasticsearch Vietnamese Analysis Plugin](http:// ## Compatible Versions | Vietnamese Analysis Plugin | Elasticsearch | | -------------------------- | ------------- | -| master | 6.5.3 | +| master | 7.3.1 | +| 7.3.1 | 7.3.1 | | 5.6.5 | 5.6.5 | | 5.4.1 | 5.4.1 | | 5.3.1 | 5.3.1 | @@ -47,7 +48,7 @@ Check this post: [How to build Elasticsearch Vietnamese Analysis Plugin](http:// - [JetBrains](https://www.jetbrains.com) has provided a free license for their great tool: [IntelliJ IDEA](https://www.jetbrains.com/idea/) ## License - + This software is licensed under the Apache 2 license, quoted below. Licensed under the Apache License, Version 2.0 (the "License"); you may not diff --git a/pom.xml b/pom.xml index f191389..401f087 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 org.elasticsearch elasticsearch-analysis-vietnamese - 7.0.0 + 7.3.1 jar elasticsearch-analysis-vietnamese https://github.com/duydo/elasticsearch-analysis-vietnamese/ @@ -20,7 +20,7 @@ duydo Duy Do - http://duydo.me + https://duydo.me @@ -31,7 +31,7 @@ UTF-8 1.8 - 7.0.0 + 7.3.1 2.7 diff --git a/src/main/java/org/apache/lucene/analysis/vi/VietnameseTokenizer.java b/src/main/java/org/apache/lucene/analysis/vi/VietnameseTokenizer.java index 6661d52..0e593a2 100644 --- a/src/main/java/org/apache/lucene/analysis/vi/VietnameseTokenizer.java +++ b/src/main/java/org/apache/lucene/analysis/vi/VietnameseTokenizer.java @@ -78,7 +78,7 @@ public final boolean incrementToken() throws IOException { final int length = word.getText().length(); typeAtt.setType(String.format("<%s>", word.getRule().getName().toUpperCase())); termAtt.copyBuffer(word.getText().toCharArray(), 0, length); - final int start = inputText.indexOf(word.getText(), i); + final int start = inputText.indexOf(word.getText(), offset); offsetAtt.setOffset(correctOffset(start), offset = correctOffset(start + length)); return true; } diff --git a/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisIntegrationTest.java b/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisIntegrationTest.java index 313919e..88b200f 100644 --- a/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisIntegrationTest.java +++ b/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisIntegrationTest.java @@ -2,10 +2,9 @@ import org.elasticsearch.action.admin.cluster.node.info.NodeInfo; import org.elasticsearch.action.admin.cluster.node.info.NodesInfoResponse; -import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse; +import org.elasticsearch.action.admin.indices.analyze.AnalyzeAction; import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.common.xcontent.XContentBuilder; -import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.plugin.analysis.vi.AnalysisVietnamesePlugin; import org.elasticsearch.plugins.Plugin; @@ -45,7 +44,8 @@ public void testPluginIsLoaded() throws Exception { } public void testVietnameseAnalyzer() throws ExecutionException, InterruptedException { - AnalyzeResponse response = client().admin().indices() + + AnalyzeAction.Response response = client().admin().indices() .prepareAnalyze("công nghệ thông tin Việt Nam").setAnalyzer("vi_analyzer") .execute().get(); String[] expected = {"công nghệ thông tin", "việt", "nam"}; @@ -60,7 +60,7 @@ public void testVietnameseAnalyzerInMapping() throws ExecutionException, Interru createIndex("test"); ensureGreen("test"); final XContentBuilder mapping = jsonBuilder().startObject() - .startObject("type") + .startObject("_doc") .startObject("properties") .startObject("foo") .field("type", "text") @@ -69,9 +69,9 @@ public void testVietnameseAnalyzerInMapping() throws ExecutionException, Interru .endObject() .endObject() .endObject(); - client().admin().indices().preparePutMapping("test").setType("type").setSource(mapping).get(); + client().admin().indices().preparePutMapping("test").setType("_doc").setSource(mapping).get(); final XContentBuilder source = jsonBuilder().startObject().field("foo", "công nghệ thông tin Việt Nam").endObject(); - index("test", "type", "1", source); + index("test", "_doc", "1", source); refresh(); SearchResponse response = client().prepareSearch("test").setQuery( QueryBuilders.matchQuery("foo", "công nghệ thông tin")).execute().actionGet(); diff --git a/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisTest.java b/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisTest.java index 1f579b9..af07834 100644 --- a/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisTest.java +++ b/src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisTest.java @@ -3,6 +3,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.vi.VietnameseAnalyzer; import org.apache.lucene.analysis.vi.VietnameseTokenizer; import org.elasticsearch.Version; @@ -80,4 +81,25 @@ public TestAnalysis createTestAnalysis() throws IOException { Settings nodeSettings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir()).build(); return createTestAnalysis(new Index("test", "_na_"), nodeSettings, settings, new AnalysisVietnamesePlugin()); } + + public void testTokenOffset() throws IOException { + + TestAnalysis analysis = createTestAnalysis(); + NamedAnalyzer analyzer = analysis.indexAnalyzers.get("vi_analyzer"); + assertNotNull(analyzer); + + TokenStream ts = analyzer.analyzer().tokenStream("test", "Phụ tùng xe Mazda bán tải dưới 7 chỗ: ống dẫn gió tới két làm mát khí nạp- cao su lưu hóa, mới 100%, phục vụ BHBD. Ms:1D0013246A"); + CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); + OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class); + ts.reset(); + String[] expected = new String[]{"phụ tùng", "xe", "mazda", "bán", "tải", "7", "chỗ", "ống", "dẫn", "gió", "tới", "két", "làm", "mát", "khí", "nạp", "cao su", "lưu hóa", "mới", "100%", "phục vụ", "bhbd", "ms", "1", "d0", "013246", "a"}; + int[] expectedOffset = new int[]{0, 9, 12, 18, 22, 31, 33, 38, 42, 46, 50, 54, 58, 62, 66, 70, 75, 82, 91, 95, 101, 109, 115, 118, 119, 121, 127}; + + for (int i = 0; i < expected.length; i++) { + assertThat(ts.incrementToken(), equalTo(true)); + assertThat(term.toString(), equalTo(expected[i])); + assertTrue(offset.startOffset() == expectedOffset[i]); + } + assertThat(ts.incrementToken(), equalTo(false)); + } }