diff --git a/src/main/java/org/elasticsearch/index/analysis/PinyinTokenizer.java b/src/main/java/org/elasticsearch/index/analysis/PinyinTokenizer.java index ce86a65..b0effcf 100644 --- a/src/main/java/org/elasticsearch/index/analysis/PinyinTokenizer.java +++ b/src/main/java/org/elasticsearch/index/analysis/PinyinTokenizer.java @@ -156,7 +156,7 @@ public final boolean incrementToken() throws IOException { //keep original alphabet if (c < 128) { if (buff.length() <= 0) { - buffStartPosition = i; + buffStartPosition = i+1; } if ((c > 96 && c < 123) || (c > 64 && c < 91) || (c > 47 && c < 58)) { if (config.keepNoneChinese) { diff --git a/src/test/java/org/elasticsearch/index/analysis/PinyinAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/PinyinAnalysisTests.java index 4b7d564..45be63b 100644 --- a/src/test/java/org/elasticsearch/index/analysis/PinyinAnalysisTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/PinyinAnalysisTests.java @@ -698,8 +698,8 @@ public void TestOnlyFirstLetterTokenizer() throws IOException { re = result.get("DJ音乐家"); Assert.assertEquals(3, re.size()); Assert.assertEquals("d", re.get(0).term); - Assert.assertEquals("djyyj", re.get(2).term); - Assert.assertEquals("j", re.get(1).term); + Assert.assertEquals("djyyj", re.get(1).term); + Assert.assertEquals("j", re.get(2).term); config = new PinyinConfig(); @@ -1275,4 +1275,66 @@ public void TestPinyinPosition3() throws IOException { Assert.assertEquals(3, re.get(5).position); } + + @Test + public void TestPinyinPosition4() throws IOException { + String[] s ={ "medcl"}; + + PinyinConfig config = new PinyinConfig(); + config.keepFirstLetter = true; + config.keepSeparateFirstLetter = true; + config.keepNoneChinese = true; + config.keepOriginal = true; + config.keepFullPinyin = true; + config.keepNoneChineseTogether = true; + + HashMap> result= getStringArrayListHashMap(s, config); + + ArrayList re = result.get("medcl"); + Assert.assertEquals("me", re.get(0).term); + Assert.assertEquals(0, re.get(0).startOffset); + Assert.assertEquals(2, re.get(0).endOffset); + Assert.assertEquals(1, re.get(0).position); + + Assert.assertEquals("medcl", re.get(1).term); + Assert.assertEquals(0, re.get(1).startOffset); + Assert.assertEquals(5, re.get(1).endOffset); + Assert.assertEquals(1, re.get(1).position); + + config = new PinyinConfig(); + config.keepFirstLetter = true; + config.keepSeparateFirstLetter = true; + config.keepNoneChinese = true; + config.keepOriginal = true; + config.keepFullPinyin = true; + config.keepNoneChineseTogether = false; + config.keepJoinedFullPinyin = true; + + result = getStringArrayListHashMap(s, config); + + re = result.get("medcl"); + Assert.assertEquals("m", re.get(0).term); + Assert.assertEquals(0, re.get(0).startOffset); + Assert.assertEquals(1, re.get(0).endOffset); + Assert.assertEquals(1, re.get(0).position); + + Assert.assertEquals("medcl", re.get(1).term); + Assert.assertEquals(0, re.get(1).startOffset); + Assert.assertEquals(5, re.get(1).endOffset); + Assert.assertEquals(1, re.get(1).position); + + + + Assert.assertEquals("e", re.get(2).term); + Assert.assertEquals(1, re.get(2).startOffset); + Assert.assertEquals(2, re.get(2).endOffset); + Assert.assertEquals(2, re.get(2).position); + + Assert.assertEquals("d", re.get(3).term); + Assert.assertEquals(2, re.get(3).startOffset); + Assert.assertEquals(3, re.get(3).endOffset); + Assert.assertEquals(3, re.get(3).position); + + + } }