From 1bb4bbcc4b9487fa8b0c8864980ac57f8d40aba8 Mon Sep 17 00:00:00 2001 From: medcl Date: Tue, 24 Jan 2017 23:15:13 +0800 Subject: [PATCH] fix incorrect offset, update readme --- README.md | 107 +++++++++--- .../analysis/PinyinAlphabetTokenizer.java | 16 +- .../index/analysis/PinyinTokenFilter.java | 2 +- .../index/analysis/PinyinTokenizer.java | 9 +- .../index/analysis/PinyinAnalysisTests.java | 154 ++++++++++++++++++ 5 files changed, 254 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 07f7c0d..4fc1e40 100644 --- a/README.md +++ b/README.md @@ -231,36 +231,91 @@ curl -XGET http://localhost:9200/medcl1/_analyze?text=%e5%88%98%e5%be%b7%e5%8d%8 7.Used in phrase query -
-PUT /medcl/
-{
-    "index" : {
-        "analysis" : {
-            "analyzer" : {
-                "pinyin_analyzer" : {
-                    "tokenizer" : "my_pinyin"
+- option 1
+    
+    PUT /medcl/
+    {
+        "index" : {
+            "analysis" : {
+                "analyzer" : {
+                    "pinyin_analyzer" : {
+                        "tokenizer" : "my_pinyin"
+                    }
+                },
+                "tokenizer" : {
+                    "my_pinyin" : {
+                        "type" : "pinyin",
+                        "keep_first_letter" : false,
+                        "keep_separate_first_letter" : false,
+                        "keep_full_pinyin" : true,
+                        "keep_original" : false,
+                        "limit_first_letter_length" : 16,
+                        "lowercase" : true
                     }
-            },
-            "tokenizer" : {
-                "my_pinyin" : {
-                    "type" : "pinyin",
-                    "keep_first_letter":false,
-                    "keep_separate_first_letter" : false,
-                    "keep_full_pinyin" : true,
-                    "keep_original" : false,
-                    "limit_first_letter_length" : 16,
-                    "lowercase" : true
                 }
             }
         }
     }
-}
-GET /medcl/folks/_search
-{
-  "query": {"match_phrase": {
-    "name.pinyin": "刘德华"
-  }}
-}
-
+ GET /medcl/folks/_search + { + "query": {"match_phrase": { + "name.pinyin": "刘德华" + }} + } + +
+ +- option 2 +
+
+    PUT /medcl/
+    {
+        "index" : {
+            "analysis" : {
+                "analyzer" : {
+                    "pinyin_analyzer" : {
+                        "tokenizer" : "my_pinyin"
+                    }
+                },
+                "tokenizer" : {
+                    "my_pinyin" : {
+                        "type" : "pinyin",
+                        "keep_first_letter" : false,
+                        "keep_separate_first_letter" : true,
+                        "keep_full_pinyin" : false,
+                        "keep_original" : false,
+                        "limit_first_letter_length" : 16,
+                        "lowercase" : true
+                    }
+                }
+            }
+        }
+    }
+
+    POST /medcl/folks/andy
+    {"name":"刘德华"}
+
+    GET /medcl/folks/_search
+    {
+      "query": {"match_phrase": {
+        "name.pinyin": "刘德h"
+      }}
+    }
+
+    GET /medcl/folks/_search
+    {
+      "query": {"match_phrase": {
+        "name.pinyin": "刘dh"
+      }}
+    }
+
+    GET /medcl/folks/_search
+    {
+      "query": {"match_phrase": {
+        "name.pinyin": "dh"
+      }}
+    }
+
+    
8.That's all, have fun. diff --git a/src/main/java/org/elasticsearch/index/analysis/PinyinAlphabetTokenizer.java b/src/main/java/org/elasticsearch/index/analysis/PinyinAlphabetTokenizer.java index 990fa44..869f0a4 100644 --- a/src/main/java/org/elasticsearch/index/analysis/PinyinAlphabetTokenizer.java +++ b/src/main/java/org/elasticsearch/index/analysis/PinyinAlphabetTokenizer.java @@ -29,7 +29,7 @@ public static List walk(String text) { }else{ //meet non letter if(lastWord){ - parse(candidates, buffer); + parse(candidates, buffer,true); if(buffer.length()>0){ String str = buffer.toString(); buffer.setLength(0); @@ -42,13 +42,13 @@ public static List walk(String text) { //start to check pinyin if(buffer.length()>=maxLength){ - parse(candidates, buffer); + parse(candidates, buffer,false); } } //cleanup if(lastWord){ - parse(candidates,buffer); + parse(candidates,buffer,true); } //final cleanup @@ -59,7 +59,7 @@ public static List walk(String text) { return candidates; } - private static void parse(LinkedList candidates, StringBuffer buffer) { + private static void parse(LinkedList candidates, StringBuffer buffer,Boolean last) { for (int j = 0; j < buffer.length(); j++) { String guess=buffer.substring(0,buffer.length()-j); if(PinyinAlphabetDict.getInstance().match(guess)){ @@ -67,7 +67,13 @@ private static void parse(LinkedList candidates, StringBuffer buffer) { String left=buffer.substring(buffer.length()-j,buffer.length()); buffer.setLength(0); buffer.append(left); - break; + if(!last){ + break; + }else{ + if(left.length()>0){ + parse(candidates,buffer,last); + } + } } } } diff --git a/src/main/java/org/elasticsearch/index/analysis/PinyinTokenFilter.java b/src/main/java/org/elasticsearch/index/analysis/PinyinTokenFilter.java index 33ccc2a..5c74639 100644 --- a/src/main/java/org/elasticsearch/index/analysis/PinyinTokenFilter.java +++ b/src/main/java/org/elasticsearch/index/analysis/PinyinTokenFilter.java @@ -47,7 +47,7 @@ public PinyinTokenFilter(TokenStream in, 
PinyinConfig config) { super(in); this.config = config; //validate config - if (!(config.keepFirstLetter || config.keepFullPinyin|| config.keepJoinedFullPinyin)) { + if (!(config.keepFirstLetter||config.keepSeparateFirstLetter || config.keepFullPinyin|| config.keepJoinedFullPinyin)) { throw new ConfigErrorException("pinyin config error, can't disable first_letter and full_pinyin at the same time."); } candidate = new ArrayList<>(); diff --git a/src/main/java/org/elasticsearch/index/analysis/PinyinTokenizer.java b/src/main/java/org/elasticsearch/index/analysis/PinyinTokenizer.java index 1735b40..2c42b38 100644 --- a/src/main/java/org/elasticsearch/index/analysis/PinyinTokenizer.java +++ b/src/main/java/org/elasticsearch/index/analysis/PinyinTokenizer.java @@ -24,6 +24,7 @@ public class PinyinTokenizer extends Tokenizer { private boolean processedOriginal = false; protected int position = 0; protected int lastPosition = 0; + protected int lastBufferPosition = 0; private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private PinyinConfig config; ArrayList candidate; @@ -38,7 +39,7 @@ public PinyinTokenizer(PinyinConfig config) { this.config = config; //validate config - if (!(config.keepFirstLetter || config.keepFullPinyin || config.keepJoinedFullPinyin)) { + if (!(config.keepFirstLetter||config.keepSeparateFirstLetter || config.keepFullPinyin || config.keepJoinedFullPinyin)) { throw new ConfigErrorException("pinyin config error, can't disable first_letter and full_pinyin at the same time."); } candidate = new ArrayList<>(); @@ -142,6 +143,7 @@ public final boolean incrementToken() throws IOException { } else { //clean previous temp if (buff.length() > 0) { + lastBufferPosition=i; buffSize = parseBuff(buff, buffSize); } @@ -164,6 +166,7 @@ public final boolean incrementToken() throws IOException { //clean previous temp if (buff.length() > 0) { + lastBufferPosition=lastPosition; buffSize = parseBuff(buff, buffSize); } } @@ -215,8 +218,10 @@ private 
int parseBuff(StringBuilder buff, int buffSize) { if (config.keepNoneChinese) { if(config.noneChinesePinyinTokenize){ List result = PinyinAlphabetTokenizer.walk(buff.toString()); + int start=(lastPosition+1)-buffSize; for (int i = 0; i < result.size(); i++) { - addCandidate(new TermItem(result.get(i), lastPosition - buffSize, lastPosition)); + int end=start+i+1; + addCandidate(new TermItem(result.get(i),start+i , end)); } }else{ addCandidate(new TermItem(buff.toString(), lastPosition - buffSize, lastPosition)); diff --git a/src/test/java/org/elasticsearch/index/analysis/PinyinAnalysisTests.java b/src/test/java/org/elasticsearch/index/analysis/PinyinAnalysisTests.java index 31df237..40a2b95 100644 --- a/src/test/java/org/elasticsearch/index/analysis/PinyinAnalysisTests.java +++ b/src/test/java/org/elasticsearch/index/analysis/PinyinAnalysisTests.java @@ -498,8 +498,162 @@ public void TestTokenizer() throws IOException { Assert.assertEquals("ceshi", re.get(2).term); + + } + @Test + public void TestFirstLetters() throws IOException { + String[] s1 = new String[]{"刘德华"}; + PinyinConfig config = new PinyinConfig(); + config.keepFirstLetter = false; + config.keepSeparateFirstLetter = true; + config.keepFullPinyin = false; + config.keepJoinedFullPinyin = false; + config.keepNoneChinese = true; + config.keepNoneChineseTogether = true; + config.keepOriginal = false; + config.LimitFirstLetterLength = 16; + config.noneChinesePinyinTokenize = true; + config.lowercase = true; + + HashMap> result = getStringArrayListHashMap(s1, config); + + ArrayList re = result.get("刘德华"); + Assert.assertEquals("l", re.get(0).term); + Assert.assertEquals("d", re.get(1).term); + Assert.assertEquals("h", re.get(2).term); + + Assert.assertEquals(0, re.get(0).startOffset); + Assert.assertEquals(1, re.get(1).startOffset); + Assert.assertEquals(2, re.get(2).startOffset); + + Assert.assertEquals(1, re.get(0).endOffset); + Assert.assertEquals(2, re.get(1).endOffset); + Assert.assertEquals(3, 
re.get(2).endOffset); + } + @Test + public void TestOnlyLetters() throws IOException { + String[] s1 = new String[]{"ldh"}; + PinyinConfig config = new PinyinConfig(); + config.keepFirstLetter=false; + config.keepSeparateFirstLetter=false; + config.keepFullPinyin=true; + config.keepJoinedFullPinyin =false; + config.keepNoneChinese=true; + config.keepNoneChineseTogether=true; + config.keepOriginal=false; + config.LimitFirstLetterLength=16; + config.noneChinesePinyinTokenize=true; + config.lowercase=true; + + HashMap> result = getStringArrayListHashMap(s1, config); + + ArrayList re = result.get("ldh"); + Assert.assertEquals("l", re.get(0).term); + Assert.assertEquals("d", re.get(1).term); + Assert.assertEquals("h", re.get(2).term); + + Assert.assertEquals(0, re.get(0).startOffset); + Assert.assertEquals(1, re.get(1).startOffset); + Assert.assertEquals(2, re.get(2).startOffset); + + Assert.assertEquals(1, re.get(0).endOffset); + Assert.assertEquals(2, re.get(1).endOffset); + Assert.assertEquals(3, re.get(2).endOffset); + + + s1 = new String[]{"liuldhdehua"}; + config = new PinyinConfig(); + config.keepFirstLetter=false; + config.keepSeparateFirstLetter=false; + config.keepFullPinyin=true; + config.keepJoinedFullPinyin =false; + config.keepNoneChinese=true; + config.keepNoneChineseTogether=true; + config.keepOriginal=false; + config.LimitFirstLetterLength=16; + config.noneChinesePinyinTokenize=true; + config.lowercase=true; + + result = getStringArrayListHashMap(s1, config); + + re = result.get("liuldhdehua"); + Assert.assertEquals("liu", re.get(0).term); + Assert.assertEquals("l", re.get(1).term); + Assert.assertEquals("d", re.get(2).term); + Assert.assertEquals("h", re.get(3).term); + Assert.assertEquals("de", re.get(4).term); + Assert.assertEquals("hua", re.get(5).term); + + s1 = new String[]{"liuldh"}; + config = new PinyinConfig(); + config.keepFirstLetter=false; + config.keepSeparateFirstLetter=false; + config.keepFullPinyin=true; + config.keepJoinedFullPinyin 
=false; + config.keepNoneChinese=true; + config.keepNoneChineseTogether=true; + config.keepOriginal=false; + config.LimitFirstLetterLength=16; + config.noneChinesePinyinTokenize=true; + config.lowercase=true; + + result = getStringArrayListHashMap(s1, config); + + re = result.get("liuldh"); + Assert.assertEquals("liu", re.get(0).term); + Assert.assertEquals("l", re.get(1).term); + Assert.assertEquals("d", re.get(2).term); + Assert.assertEquals("h", re.get(3).term); + + s1 = new String[]{"ldhdehua"}; + config = new PinyinConfig(); + config.keepFirstLetter=false; + config.keepSeparateFirstLetter=false; + config.keepFullPinyin=true; + config.keepJoinedFullPinyin =false; + config.keepNoneChinese=true; + config.keepNoneChineseTogether=true; + config.keepOriginal=false; + config.LimitFirstLetterLength=16; + config.noneChinesePinyinTokenize=true; + config.lowercase=true; + + result = getStringArrayListHashMap(s1, config); + + re = result.get("ldhdehua"); + Assert.assertEquals("l", re.get(0).term); + Assert.assertEquals("d", re.get(1).term); + Assert.assertEquals("h", re.get(2).term); + Assert.assertEquals("de", re.get(3).term); + Assert.assertEquals("hua", re.get(4).term); + + s1 = new String[]{"ldh123dehua"}; + config = new PinyinConfig(); + config.keepFirstLetter=false; + config.keepSeparateFirstLetter=false; + config.keepFullPinyin=true; + config.keepJoinedFullPinyin =false; + config.keepNoneChinese=true; + config.keepNoneChineseTogether=true; + config.keepOriginal=false; + config.LimitFirstLetterLength=16; + config.noneChinesePinyinTokenize=true; + config.lowercase=true; + + result = getStringArrayListHashMap(s1, config); + + re = result.get("ldh123dehua"); + Assert.assertEquals("l", re.get(0).term); + Assert.assertEquals("d", re.get(1).term); + Assert.assertEquals("h", re.get(2).term); + Assert.assertEquals("123", re.get(3).term); + Assert.assertEquals("de", re.get(4).term); + Assert.assertEquals("hua", re.get(5).term); + } + + @Test public void 
TestOnlyFirstLetterTokenizer() throws IOException { String[] s =