Skip to content

Commit

Permalink
add option to make non chinese stay together
Browse files Browse the repository at this point in the history
  • Loading branch information
medcl committed Sep 28, 2016
1 parent fd1ab74 commit 7472381
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 15 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ The plugin includes analyzer: `pinyin` , tokenizer: `pinyin` and token-filter:
* `keep_none_chinese_in_first_letter` keep non Chinese letters in first letter, eg: `刘德华AT2016`->`ldhat2016`, default: true
* `keep_full_pinyin` when this option is enabled, the full pinyin of each Chinese character is kept, eg: `刘德华` -> [`liu`,`de`,`hua`], default: true
* `keep_none_chinese` keep non-Chinese letters or numbers in the result, default: true
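* `keep_none_chinese_together` keep consecutive non-Chinese letters and numbers together as a single token, default: true, eg: `DJ音乐家` -> `DJ`,`yin`,`yue`,`jia`; when set to false: `DJ音乐家` -> `D`,`J`,`yin`,`yue`,`jia`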
* `keep_original` when this option enabled, will keep original input as well, default: true
* `lowercase` lowercase non Chinese letters, default: true
* `trim_whitespace` default: true
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/org/elasticsearch/analysis/PinyinConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ public class PinyinConfig {
public boolean keepNoneChineseInFirstLetter =true;
public boolean keepOriginal=true;
public boolean keepFirstLetter=true;
public boolean keepNoneChineseTogether=true;
public int LimitFirstLetterLength=16;
public boolean keepFullPinyin=true;

Expand All @@ -21,6 +22,7 @@ public PinyinConfig(Settings settings) {
this.keepFirstLetter=settings.getAsBoolean("keep_first_letter",true);
this.keepFullPinyin=settings.getAsBoolean("keep_full_pinyin", true);
this.keepNoneChinese=settings.getAsBoolean("keep_none_chinese",true);
this.keepNoneChineseTogether=settings.getAsBoolean("keep_none_chinese_together",true);
this.keepOriginal=settings.getAsBoolean("keep_original", true);
this.LimitFirstLetterLength=settings.getAsInt("limit_first_letter_length", 16);
this.lowercase=settings.getAsBoolean("lowercase", true);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,20 +84,31 @@ private boolean readTerm() {

List<String> pinyinList = Pinyin.pinyin(source);

StringBuffer buff=new StringBuffer();

for (int i = 0; i < source.length(); i++) {
char c = source.charAt(i);
//keep original alphabet
if (c < 128) {
if ((c > 96 && c < 123) || (c > 64 && c < 91) || (c > 47 && c < 58)) {
if (config.keepNoneChinese) {
candidate.add(String.valueOf(c));
if(config.keepNoneChineseTogether){
buff.append(c);
}else{
candidate.add(String.valueOf(c));
}
}
if(config.keepNoneChineseInFirstLetter)
{
firstLetters.append(c);
}
}
} else {
//clean previous temp
if(buff.length()>0){
candidate.add(buff.toString());
buff=new StringBuffer();
}

String pinyin = pinyinList.get(i);
if (pinyin != null&&pinyin.length()>0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,17 +78,30 @@ public final boolean incrementToken() throws IOException {

List<String> pinyinList = Pinyin.pinyin(source);

StringBuffer buff=new StringBuffer();

for (int i = 0; i < source.length(); i++) {
char c = source.charAt(i);
//keep original alphabet
if (c < 128) {
if ((c > 96 && c < 123) || (c > 64 && c < 91) || (c > 47 && c < 58)) {
if (config.keepNoneChinese) {
candidate.add(String.valueOf(c));
if (config.keepNoneChinese) {
if(config.keepNoneChineseTogether){
buff.append(c);
}else{
candidate.add(String.valueOf(c));
}
}
firstLetters.append(c);
}
}
} else {
//clean previous temp
if(buff.length()>0){
candidate.add(buff.toString());
buff=new StringBuffer();
}

String pinyin = pinyinList.get(i);
if (pinyin != null&&pinyin.length()>0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -215,14 +215,12 @@ public void TestTokenizer() throws IOException {
Assert.assertEquals("ldh", re.get(4));

re = result.get("刘德华A1");
Assert.assertEquals(7, re.size());
Assert.assertEquals(5, re.size());
Assert.assertEquals("liu", re.get(0));
Assert.assertEquals("de", re.get(1));
Assert.assertEquals("hua", re.get(2));
Assert.assertEquals("a", re.get(3));
Assert.assertEquals("1", re.get(4));
Assert.assertEquals("刘德华A1", re.get(5));
Assert.assertEquals("ldha1", re.get(6));
Assert.assertEquals("刘德华A1", re.get(3));
Assert.assertEquals("ldha1", re.get(4));

re = result.get("讲话频率小,不能发高音");
Assert.assertEquals(12, re.size());
Expand Down Expand Up @@ -261,14 +259,13 @@ public void TestTokenizer() throws IOException {
Assert.assertEquals("ajsn", re.get(5));

re = result.get("DJ音乐家");
Assert.assertEquals(7, re.size());
Assert.assertEquals("d", re.get(0));
Assert.assertEquals("j", re.get(1));
Assert.assertEquals("yin", re.get(2));
Assert.assertEquals("yue", re.get(3));
Assert.assertEquals("jia", re.get(4));
Assert.assertEquals("DJ音乐家", re.get(5));
Assert.assertEquals("djyyj", re.get(6));
Assert.assertEquals(6, re.size());
Assert.assertEquals("dj", re.get(0));
Assert.assertEquals("yin", re.get(1));
Assert.assertEquals("yue", re.get(2));
Assert.assertEquals("jia", re.get(3));
Assert.assertEquals("DJ音乐家", re.get(4));
Assert.assertEquals("djyyj", re.get(5));
}

@Test
Expand All @@ -282,6 +279,8 @@ public void TestOnlyFirstLetterTokenizer() throws IOException {
config.keepNoneChinese = true;
config.keepOriginal = false;
config.keepFullPinyin = false;
config.keepNoneChineseTogether = false;

HashMap<String, ArrayList<String>> result = getStringArrayListHashMap(s, config);

ArrayList<String> re = result.get("刘德华");
Expand All @@ -304,11 +303,25 @@ public void TestOnlyFirstLetterTokenizer() throws IOException {
config.keepNoneChinese = false;
config.keepOriginal = false;
config.keepFullPinyin = false;
config.keepNoneChineseTogether = false;

result = getStringArrayListHashMap(s, config);

re = result.get("DJ音乐家");
Assert.assertEquals(1, re.size());
Assert.assertEquals("yyj", re.get(0));

config = new PinyinConfig();
config.keepFirstLetter = true;
config.keepNoneChineseTogether = true;
config.keepOriginal = false;
config.keepFullPinyin = false;
result = getStringArrayListHashMap(s, config);

re = result.get("DJ音乐家");
Assert.assertEquals(2, re.size());
Assert.assertEquals("dj", re.get(0));
Assert.assertEquals("djyyj", re.get(1));
}

@Test
Expand Down

0 comments on commit 7472381

Please sign in to comment.