Skip to content

Commit

Permalink
fix incorrect offset, update readme
Browse files Browse the repository at this point in the history
  • Loading branch information
medcl committed Jan 24, 2017
1 parent 6a6f71b commit 1bb4bbc
Show file tree
Hide file tree
Showing 5 changed files with 254 additions and 34 deletions.
107 changes: 81 additions & 26 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -231,36 +231,91 @@ curl -XGET http://localhost:9200/medcl1/_analyze?text=%e5%88%98%e5%be%b7%e5%8d%8


7.Used in phrase query
<pre>
PUT /medcl/
{
"index" : {
"analysis" : {
"analyzer" : {
"pinyin_analyzer" : {
"tokenizer" : "my_pinyin"
- option 1
<pre>
PUT /medcl/
{
"index" : {
"analysis" : {
"analyzer" : {
"pinyin_analyzer" : {
"tokenizer" : "my_pinyin"
}
},
"tokenizer" : {
"my_pinyin" : {
"type" : "pinyin",
"keep_first_letter":false,
"keep_separate_first_letter" : false,
"keep_full_pinyin" : true,
"keep_original" : false,
"limit_first_letter_length" : 16,
"lowercase" : true
}
},
"tokenizer" : {
"my_pinyin" : {
"type" : "pinyin",
"keep_first_letter":false,
"keep_separate_first_letter" : false,
"keep_full_pinyin" : true,
"keep_original" : false,
"limit_first_letter_length" : 16,
"lowercase" : true
}
}
}
}
}
GET /medcl/folks/_search
{
"query": {"match_phrase": {
"name.pinyin": "刘德华"
}}
}
</pre>
GET /medcl/folks/_search
{
"query": {"match_phrase": {
"name.pinyin": "刘德华"
}}
}

</pre>

- option 2
<pre>

PUT /medcl/
{
"index" : {
"analysis" : {
"analyzer" : {
"pinyin_analyzer" : {
"tokenizer" : "my_pinyin"
}
},
"tokenizer" : {
"my_pinyin" : {
"type" : "pinyin",
"keep_first_letter":false,
"keep_separate_first_letter" : true,
"keep_full_pinyin" : false,
"keep_original" : false,
"limit_first_letter_length" : 16,
"lowercase" : true
}
}
}
}
}

POST /medcl/folks/andy
{"name":"刘德华"}

GET /medcl/folks/_search
{
"query": {"match_phrase": {
"name.pinyin": "刘德h"
}}
}

GET /medcl/folks/_search
{
"query": {"match_phrase": {
"name.pinyin": "刘dh"
}}
}

GET /medcl/folks/_search
{
"query": {"match_phrase": {
"name.pinyin": "dh"
}}
}

</pre>

8.That's all, have fun.
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ public static List<String> walk(String text) {
}else{
//meet non letter
if(lastWord){
parse(candidates, buffer);
parse(candidates, buffer,true);
if(buffer.length()>0){
String str = buffer.toString();
buffer.setLength(0);
Expand All @@ -42,13 +42,13 @@ public static List<String> walk(String text) {

//start to check pinyin
if(buffer.length()>=maxLength){
parse(candidates, buffer);
parse(candidates, buffer,false);
}
}

//cleanup
if(lastWord){
parse(candidates,buffer);
parse(candidates,buffer,true);
}

//final cleanup
Expand All @@ -59,15 +59,21 @@ public static List<String> walk(String text) {
return candidates;
}

/**
 * Peels pinyin syllables off the front of {@code buffer} and appends them to
 * {@code candidates}.
 *
 * Prefixes of the buffer are tried from longest to shortest; the first prefix
 * accepted by {@code PinyinAlphabetDict.match} is added to the candidates and
 * removed from the buffer, leaving only the unmatched tail behind.
 *
 * @param candidates receives each matched prefix, in order of discovery
 * @param buffer     pending letters; rewritten in place to hold the remainder
 * @param last       when false, stop after the first match; when true, keep
 *                   parsing the remainder until it is empty or nothing matches
 */
// NOTE(review): the two signature lines below look like the pre-change and
// post-change versions rendered together by the diff view — confirm against the repository.
private static void parse(LinkedList<String> candidates, StringBuffer buffer) {
private static void parse(LinkedList<String> candidates, StringBuffer buffer,Boolean last) {
// j is the number of trailing characters dropped from the guess, so j == 0 tries the whole buffer.
for (int j = 0; j < buffer.length(); j++) {
String guess=buffer.substring(0,buffer.length()-j);
if(PinyinAlphabetDict.getInstance().match(guess)){
candidates.add(guess);
// Keep only the part of the buffer that followed the matched prefix.
String left=buffer.substring(buffer.length()-j,buffer.length());
buffer.setLength(0);
buffer.append(left);
// NOTE(review): this bare break presumably is the removed pre-change line; the
// if/else that follows appears to be its replacement — confirm.
break;
if(!last){
break;
}else{
// Final flush: recurse so the remainder is split into further matches as well.
if(left.length()>0){
parse(candidates,buffer,last);
}
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ public PinyinTokenFilter(TokenStream in, PinyinConfig config) {
super(in);
this.config = config;
//validate config
if (!(config.keepFirstLetter || config.keepFullPinyin|| config.keepJoinedFullPinyin)) {
if (!(config.keepFirstLetter||config.keepSeparateFirstLetter || config.keepFullPinyin|| config.keepJoinedFullPinyin)) {
throw new ConfigErrorException("pinyin config error, can't disable first_letter and full_pinyin at the same time.");
}
candidate = new ArrayList<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ public class PinyinTokenizer extends Tokenizer {
private boolean processedOriginal = false;
protected int position = 0;
protected int lastPosition = 0;
protected int lastBufferPosition = 0;
private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private PinyinConfig config;
ArrayList<TermItem> candidate;
Expand All @@ -38,7 +39,7 @@ public PinyinTokenizer(PinyinConfig config) {
this.config = config;

//validate config
if (!(config.keepFirstLetter || config.keepFullPinyin || config.keepJoinedFullPinyin)) {
if (!(config.keepFirstLetter||config.keepSeparateFirstLetter || config.keepFullPinyin || config.keepJoinedFullPinyin)) {
throw new ConfigErrorException("pinyin config error, can't disable first_letter and full_pinyin at the same time.");
}
candidate = new ArrayList<>();
Expand Down Expand Up @@ -142,6 +143,7 @@ public final boolean incrementToken() throws IOException {
} else {
//clean previous temp
if (buff.length() > 0) {
lastBufferPosition=i;
buffSize = parseBuff(buff, buffSize);
}

Expand All @@ -164,6 +166,7 @@ public final boolean incrementToken() throws IOException {

//clean previous temp
if (buff.length() > 0) {
lastBufferPosition=lastPosition;
buffSize = parseBuff(buff, buffSize);
}
}
Expand Down Expand Up @@ -215,8 +218,10 @@ private int parseBuff(StringBuilder buff, int buffSize) {
if (config.keepNoneChinese) {
if(config.noneChinesePinyinTokenize){
List<String> result = PinyinAlphabetTokenizer.walk(buff.toString());
int start=(lastPosition+1)-buffSize;
for (int i = 0; i < result.size(); i++) {
addCandidate(new TermItem(result.get(i), lastPosition - buffSize, lastPosition));
int end=start+i+1;
addCandidate(new TermItem(result.get(i),start+i , end));
}
}else{
addCandidate(new TermItem(buff.toString(), lastPosition - buffSize, lastPosition));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -498,8 +498,162 @@ public void TestTokenizer() throws IOException {
Assert.assertEquals("ceshi", re.get(2).term);




}

@Test
public void TestFirstLetters() throws IOException {
    // With only keepSeparateFirstLetter enabled, each of the three Chinese
    // characters must yield its own first-letter token.
    final String input = "刘德华";

    PinyinConfig cfg = new PinyinConfig();
    cfg.keepFirstLetter = false;
    cfg.keepSeparateFirstLetter = true;
    cfg.keepFullPinyin = false;
    cfg.keepJoinedFullPinyin = false;
    cfg.keepNoneChinese = true;
    cfg.keepNoneChineseTogether = true;
    cfg.keepOriginal = false;
    cfg.LimitFirstLetterLength = 16;
    cfg.noneChinesePinyinTokenize = true;
    cfg.lowercase = true;

    HashMap<String, ArrayList<TermItem>> tokensByInput =
            getStringArrayListHashMap(new String[]{input}, cfg);
    ArrayList<TermItem> tokens = tokensByInput.get(input);

    // Terms first, then start offsets, then end offsets.
    String[] expectedTerms = {"l", "d", "h"};
    for (int idx = 0; idx < expectedTerms.length; idx++) {
        Assert.assertEquals(expectedTerms[idx], tokens.get(idx).term);
    }

    // Token idx is expected to cover exactly the source character at [idx, idx + 1).
    for (int idx = 0; idx < expectedTerms.length; idx++) {
        Assert.assertEquals(idx, tokens.get(idx).startOffset);
    }
    for (int idx = 0; idx < expectedTerms.length; idx++) {
        Assert.assertEquals(idx + 1, tokens.get(idx).endOffset);
    }
}
@Test
public void TestOnlyLetters() throws IOException {
    // A bare run of initials is split into one token per letter, each
    // covering exactly its own character in the input.
    ArrayList<TermItem> re = tokenizeLatinLetters("ldh");
    assertTermsInOrder(re, "l", "d", "h");

    Assert.assertEquals(0, re.get(0).startOffset);
    Assert.assertEquals(1, re.get(1).startOffset);
    Assert.assertEquals(2, re.get(2).startOffset);

    Assert.assertEquals(1, re.get(0).endOffset);
    Assert.assertEquals(2, re.get(1).endOffset);
    Assert.assertEquals(3, re.get(2).endOffset);

    // Initials sandwiched between full syllables.
    assertTermsInOrder(tokenizeLatinLetters("liuldhdehua"),
            "liu", "l", "d", "h", "de", "hua");

    // Initials after a full syllable.
    assertTermsInOrder(tokenizeLatinLetters("liuldh"),
            "liu", "l", "d", "h");

    // Initials before full syllables.
    assertTermsInOrder(tokenizeLatinLetters("ldhdehua"),
            "l", "d", "h", "de", "hua");

    // A digit run separates the two letter runs and is kept as its own token.
    assertTermsInOrder(tokenizeLatinLetters("ldh123dehua"),
            "l", "d", "h", "123", "de", "hua");
}

/**
 * Tokenizes {@code text} with the configuration shared by every case in
 * {@link #TestOnlyLetters()}: full pinyin only, non-Chinese text kept and
 * split into pinyin syllables, lowercased.
 *
 * @param text the input string to analyze
 * @return the tokens produced for {@code text}
 * @throws IOException if tokenization fails
 */
private ArrayList<TermItem> tokenizeLatinLetters(String text) throws IOException {
    // A fresh config per call, exactly as the original inline setup did.
    PinyinConfig config = new PinyinConfig();
    config.keepFirstLetter = false;
    config.keepSeparateFirstLetter = false;
    config.keepFullPinyin = true;
    config.keepJoinedFullPinyin = false;
    config.keepNoneChinese = true;
    config.keepNoneChineseTogether = true;
    config.keepOriginal = false;
    config.LimitFirstLetterLength = 16;
    config.noneChinesePinyinTokenize = true;
    config.lowercase = true;

    HashMap<String, ArrayList<TermItem>> result =
            getStringArrayListHashMap(new String[]{text}, config);
    return result.get(text);
}

/**
 * Asserts that the leading tokens of {@code actual} carry the given terms,
 * in order. Tokens beyond {@code expected.length} are not inspected.
 *
 * @param actual   tokens returned by the tokenizer
 * @param expected expected term text for token 0, 1, 2, ...
 */
private static void assertTermsInOrder(ArrayList<TermItem> actual, String... expected) {
    for (int i = 0; i < expected.length; i++) {
        Assert.assertEquals(expected[i], actual.get(i).term);
    }
}


@Test
public void TestOnlyFirstLetterTokenizer() throws IOException {
String[] s =
Expand Down

0 comments on commit 1bb4bbc

Please sign in to comment.