backport from master

infinilabs · Oct 13, 2016 · 4d2772e · 4d2772e
1 parent 200e6d8
commit 4d2772e
Show file tree

Hide file tree

Showing 34 changed files with 1,161 additions and 1,583 deletions.
diff --git a/README.md b/README.md
@@ -1,17 +1,17 @@
 Pinyin Analysis for Elasticsearch
 ==================================
 
-The Pinyin Analysis plugin integrates Pinyin4j(http://pinyin4j.sourceforge.net/) module into elasticsearch.
-
-Pinyin4j is a popular Java library supporting conversion between Chinese characters and most popular Pinyin systems. The output format of pinyin could be customized.
+This Pinyin Analysis plugin is used to do conversion between Chinese characters and Pinyin.
 
     --------------------------------------------------
     | Pinyin4j   Analysis Plugin    | Elasticsearch  |
     --------------------------------------------------
-    | master                        | 2.4.x -> master|
+    | master                        | 5.x -> master  |
     --------------------------------------------------
+    | 5.0.0-rc1                     | 5.0.0-rc1      |
+    --------------------------------------------------  
     | 1.8.1                         | 2.4.1          |
-    --------------------------------------------------
+    --------------------------------------------------  
     | 1.7.5                         | 2.3.5          |
     --------------------------------------------------  
     | 1.7.4                         | 2.3.4          |
@@ -29,92 +29,97 @@ Pinyin4j is a popular Java library supporting conversion between Chinese charact
     | 1.2.2                         | 1.0.x          |
     --------------------------------------------------
 
-The plugin includes two analyzers: `pinyin` and  `pinyin_first_letter`  , two tokenizers: `pinyin` and `pinyin_first_letter` and two token-filters:  `pinyin` and  `pinyin_first_letter`.
+The plugin includes an analyzer: `pinyin`, a tokenizer: `pinyin` and a token-filter: `pinyin`.
+
+**Optional Parameters**
+* `remove_duplicated_term` when this option is enabled, duplicated terms will be removed to save index space, eg: `de的`>`de`, default: false, NOTE: position-related queries may be influenced
+* `keep_first_letter` when this option is enabled, eg: `刘德华`>`ldh`, default: true
+* `keep_separate_first_letter` when this option is enabled, the first letters are kept separately, eg: `刘德华`>`l`,`d`,`h`, default: false, NOTE: query results may be too fuzzy because such terms are too frequent
+* `limit_first_letter_length` set max length of the first_letter result, default: 16
+* `keep_full_pinyin` when this option is enabled, eg: `刘德华`> [`liu`,`de`,`hua`], default: true
+* `keep_none_chinese` keep non chinese letter or number in result, default: true
+* `keep_none_chinese_together` keep non chinese letter together, default: true, eg: `DJ音乐家` -> `DJ`,`yin`,`yue`,`jia`, when set to `false`, eg: `DJ音乐家` -> `D`,`J`,`yin`,`yue`,`jia`, NOTE: `keep_none_chinese` should be enabled first
+* `keep_none_chinese_in_first_letter` keep non Chinese letters in first letter, eg: `刘德华AT2016`->`ldhat2016`, default: true
+* `none_chinese_pinyin_tokenize` break non chinese letters into separate pinyin term if they are pinyin, default: true, eg: `liudehuaalibaba13zhuanghan` -> `liu`,`de`,`hua`,`a`,`li`,`ba`,`ba`,`13`,`zhuang`,`han`, NOTE:  `keep_none_chinese` and `keep_none_chinese_together` should be enabled first
+* `keep_original` when this option is enabled, the original input is kept as well, default: false
+* `lowercase`  lowercase non Chinese letters, default: true
+* `trim_whitespace` default: true
+
 
-1.Create a index for doing some tests
+
+1.Create an index with a custom pinyin analyzer
 <pre>
 curl -XPUT http://localhost:9200/medcl/ -d'
 {
     "index" : {
         "analysis" : {
             "analyzer" : {
                 "pinyin_analyzer" : {
-                    "tokenizer" : "my_pinyin",
-                    "filter" : "word_delimiter"
+                    "tokenizer" : "my_pinyin"
                     }
             },
             "tokenizer" : {
                 "my_pinyin" : {
                     "type" : "pinyin",
-                    "first_letter" : "none",
-                    "padding_char" : " "
+                    "keep_separate_first_letter" : false,
+                    "keep_full_pinyin" : true,
+                    "keep_original" : true,
+                    "limit_first_letter_length" : 16,
+                    "lowercase" : true
                 }
             }
         }
     }
 }'
 </pre>
 
-2.Analyzing a chinese name, such as 刘德华
+2.Test Analyzer, analyzing a chinese name, such as 刘德华
 <pre>
 http://localhost:9200/medcl/_analyze?text=%e5%88%98%e5%be%b7%e5%8d%8e&analyzer=pinyin_analyzer
-{"tokens":[{"token":"liu de hua ","start_offset":0,"end_offset":3,"type":"word","position":1}]}
 </pre>
-
-3.That's all, have fun.
-
-optional config:
-the parameter `first_letter` can be set to: `prefix`, `append`, `only` and `none`, default value is `none`
-
-examples:
-`first_letter` set to`prifix` and  `padding_char` is set to `""`
-the analysis result will be:
 <pre>
-{"tokens":[{"token":"ldhliudehua","start_offset":0,"end_offset":3,"type":"word","position":1}]}
-</pre>
-
-and if we set `first_letter`  to `only` ,the result will be:
-<pre>
-{"tokens":[{"token":"ldh","start_offset":0,"end_offset":3,"type":"word","position":1}]}
-</pre>
-also   `first_letter`  to `append`
-<pre>
-{"tokens":[{"token":"liu de hua ldh","start_offset":0,"end_offset":3,"type":"word","position":1}]}
-</pre>
-
-
-
-----------additional----------example-----------------------
-
-if you wanna do a auto-complete with people's name,combining with the magic of pinyin,and it's very easy now,here is the detail instructions:
-
-1.Index setting
-<pre>
-curl -XPOST http://localhost:9200/medcl/_close
-curl -XPUT http://localhost:9200/medcl/_settings -d'
 {
-    "index" : {
-        "analysis" : {
-            "analyzer" : {
-                "pinyin_analyzer" : {
-                    "tokenizer" : "my_pinyin",
-                    "filter" : ["word_delimiter","nGram"]
-                }
-            },
-            "tokenizer" : {
-                "my_pinyin" : {
-                    "type" : "pinyin",
-                    "first_letter" : "prefix",
-                    "padding_char" : " "
-                }
-            }
-        }
+  "tokens" : [
+    {
+      "token" : "liu",
+      "start_offset" : 0,
+      "end_offset" : 1,
+      "type" : "word",
+      "position" : 0
+    },
+    {
+      "token" : "de",
+      "start_offset" : 1,
+      "end_offset" : 2,
+      "type" : "word",
+      "position" : 1
+    },
+    {
+      "token" : "hua",
+      "start_offset" : 2,
+      "end_offset" : 3,
+      "type" : "word",
+      "position" : 2
+    },
+    {
+      "token" : "刘德华",
+      "start_offset" : 0,
+      "end_offset" : 3,
+      "type" : "word",
+      "position" : 3
+    },
+    {
+      "token" : "ldh",
+      "start_offset" : 0,
+      "end_offset" : 3,
+      "type" : "word",
+      "position" : 4
     }
-}'
-curl -XPOST http://localhost:9200/medcl/_open
+  ]
+}
 </pre>
 
-2.Create mapping
+3.Create mapping
 <pre>
 curl -XPOST http://localhost:9200/medcl/folks/_mapping -d'
 {
@@ -142,21 +147,21 @@ curl -XPOST http://localhost:9200/medcl/folks/_mapping -d'
 }'
 </pre>
 
-3.Indexing
+4.Indexing
 <pre>
 curl -XPOST http://localhost:9200/medcl/folks/andy -d'{"name":"刘德华"}'
 </pre>
 
-4.Have a try
+5.Let's search
 <pre>
-curl http://localhost:9200/medcl/folks/_search?q=name:%e5%88%98
+curl http://localhost:9200/medcl/folks/_search?q=name:%E5%88%98%E5%BE%B7%E5%8D%8E
 curl http://localhost:9200/medcl/folks/_search?q=name:%e5%88%98%e5%be%b7
 curl http://localhost:9200/medcl/folks/_search?q=name:liu
 curl http://localhost:9200/medcl/folks/_search?q=name:ldh
-curl http://localhost:9200/medcl/folks/_search?q=name:dehua
+curl http://localhost:9200/medcl/folks/_search?q=name:de+hua
 </pre>
 
-5.Use Pinyin-TokenFilter (contributed by @wangweiwei)
+6.Using Pinyin-TokenFilter
 <pre>
 curl -XPUT http://localhost:9200/medcl1/ -d'
 {
@@ -165,14 +170,20 @@ curl -XPUT http://localhost:9200/medcl1/ -d'
             "analyzer" : {
                 "user_name_analyzer" : {
                     "tokenizer" : "whitespace",
-                    "filter" : "pinyin_filter"
+                    "filter" : "pinyin_first_letter_and_full_pinyin_filter"
                 }
             },
             "filter" : {
-                "pinyin_filter" : {
+                "pinyin_first_letter_and_full_pinyin_filter" : {
                     "type" : "pinyin",
-                    "first_letter" : "only",
-                    "padding_char" : ""
+                    "keep_first_letter" : true,
+                    "keep_full_pinyin" : false,
+                    "keep_none_chinese" : true,
+                    "keep_original" : false,
+                    "limit_first_letter_length" : 16,
+                    "lowercase" : true,
+                    "trim_whitespace" : true,
+                    "keep_none_chinese_in_first_letter" : true
                 }
             }
         }
@@ -182,6 +193,49 @@ curl -XPUT http://localhost:9200/medcl1/ -d'
 
 Token Test:刘德华 张学友 郭富城 黎明 四大天王
 <pre>
-curl -XGET http://localhost:9200/medcl/_analyze?text=%e5%88%98%e5%be%b7%e5%8d%8e+%e5%bc%a0%e5%ad%a6%e5%8f%8b+%e9%83%ad%e5%af%8c%e5%9f%8e+%e9%bb%8e%e6%98%8e+%e5%9b%9b%e5%a4%a7%e5%a4%a9%e7%8e%8b&analyzer=user_name_analyzer
-{"tokens":[{"token":"ldh","start_offset":0,"end_offset":3,"type":"word","position":1},{"token":"zxy","start_offset":4,"end_offset":7,"type":"word","position":2},{"token":"gfc","start_offset":8,"end_offset":11,"type":"word","position":3},{"token":"lm","start_offset":12,"end_offset":14,"type":"word","position":4},{"token":"sdtw","start_offset":15,"end_offset":19,"type":"word","position":5}]}
+curl -XGET http://localhost:9200/medcl1/_analyze?text=%e5%88%98%e5%be%b7%e5%8d%8e+%e5%bc%a0%e5%ad%a6%e5%8f%8b+%e9%83%ad%e5%af%8c%e5%9f%8e+%e9%bb%8e%e6%98%8e+%e5%9b%9b%e5%a4%a7%e5%a4%a9%e7%8e%8b&analyzer=user_name_analyzer
+</pre>
+<pre>
+{
+  "tokens" : [
+    {
+      "token" : "ldh",
+      "start_offset" : 0,
+      "end_offset" : 3,
+      "type" : "word",
+      "position" : 0
+    },
+    {
+      "token" : "zxy",
+      "start_offset" : 4,
+      "end_offset" : 7,
+      "type" : "word",
+      "position" : 1
+    },
+    {
+      "token" : "gfc",
+      "start_offset" : 8,
+      "end_offset" : 11,
+      "type" : "word",
+      "position" : 2
+    },
+    {
+      "token" : "lm",
+      "start_offset" : 12,
+      "end_offset" : 14,
+      "type" : "word",
+      "position" : 3
+    },
+    {
+      "token" : "sdtw",
+      "start_offset" : 15,
+      "end_offset" : 19,
+      "type" : "word",
+      "position" : 4
+    }
+  ]
+}
 </pre>
+
+
+7.That's all, have fun.
diff --git a/lib/nlp-lang-1.7.jar b/lib/nlp-lang-1.7.jar
diff --git a/lib/pinyin4j-2.5.0.jar b/lib/pinyin4j-2.5.0.jar
diff --git a/pom.xml b/pom.xml
@@ -69,10 +69,10 @@
 
     <dependencies>
         <dependency>
-            <groupId>net.sourceforge.pinyin4j</groupId>
-            <artifactId>pinyin4j</artifactId>
-            <version>2.5.0</version>
-            <systemPath>${basedir}/lib/pinyin4j-2.5.0.jar</systemPath>
+            <groupId>nlp-lang</groupId>
+            <artifactId>nlp-lang</artifactId>
+            <version>1.7.0</version>
+            <systemPath>${basedir}/lib/nlp-lang-1.7.jar</systemPath>
             <scope>system</scope>
         </dependency>
 

diff --git a/src/main/java/org/elasticsearch/analysis/PinyinConfig.java b/src/main/java/org/elasticsearch/analysis/PinyinConfig.java
@@ -1,19 +1,38 @@
 package org.elasticsearch.analysis;
 
+import org.elasticsearch.common.settings.Settings;
+
 /**
  * Created by medcl on 15/11/26.
  */
 public class PinyinConfig {
 
-    public enum Mode{
-        onlyFirstLetter(1),
-        fullPinyin(2),
-        fullPinyinWithSpace(3),
-        supportPolyphony(4);
+    public boolean lowercase=true;
+    public boolean trimWhitespace=true;
+    public boolean keepNoneChinese=true;
+    public boolean keepNoneChineseInFirstLetter =true;
+    public boolean keepOriginal=false;
+    public boolean keepFirstLetter=true;
+    public boolean keepSeparateFirstLetter=false;
+    public boolean keepNoneChineseTogether=true;
+    public boolean noneChinesePinyinTokenize =true;
+    public int     LimitFirstLetterLength=16;
+    public boolean keepFullPinyin=true;
+    public boolean removeDuplicateTerm=false;
 
-        private final int value;
-        Mode(int i) {
-            value=i;
-        }
+    public PinyinConfig() {}
+    public PinyinConfig(Settings settings) {
+        this.keepFirstLetter=settings.getAsBoolean("keep_first_letter",true);
+        this.keepSeparateFirstLetter=settings.getAsBoolean("keep_separate_first_letter",false);
+        this.keepFullPinyin=settings.getAsBoolean("keep_full_pinyin", true);
+        this.keepNoneChinese=settings.getAsBoolean("keep_none_chinese",true);
+        this.keepNoneChineseTogether=settings.getAsBoolean("keep_none_chinese_together",true);
+        this.noneChinesePinyinTokenize =settings.getAsBoolean("none_chinese_pinyin_tokenize",true);
+        this.keepOriginal=settings.getAsBoolean("keep_original", false);
+        this.LimitFirstLetterLength=settings.getAsInt("limit_first_letter_length", 16);
+        this.lowercase=settings.getAsBoolean("lowercase", true);
+        this.trimWhitespace=settings.getAsBoolean("trim_whitespace", true);
+        this.keepNoneChineseInFirstLetter =settings.getAsBoolean("keep_none_chinese_in_first_letter", true);
+        this.removeDuplicateTerm =settings.getAsBoolean("remove_duplicated_term", false);
     }
 }