Skip to content

Commit

Permalink
fix incorrect position, Closes #121
Browse files Browse the repository at this point in the history
  • Loading branch information
medcl committed Jul 2, 2017
1 parent 47d2b2e commit 1089270
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ public final boolean incrementToken() throws IOException {
//keep original alphabet
if (c < 128) {
if (buff.length() <= 0) {
buffStartPosition = i;
buffStartPosition = i+1;
}
if ((c > 96 && c < 123) || (c > 64 && c < 91) || (c > 47 && c < 58)) {
if (config.keepNoneChinese) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -698,8 +698,8 @@ public void TestOnlyFirstLetterTokenizer() throws IOException {
re = result.get("DJ音乐家");
Assert.assertEquals(3, re.size());
Assert.assertEquals("d", re.get(0).term);
Assert.assertEquals("djyyj", re.get(2).term);
Assert.assertEquals("j", re.get(1).term);
Assert.assertEquals("djyyj", re.get(1).term);
Assert.assertEquals("j", re.get(2).term);


config = new PinyinConfig();
Expand Down Expand Up @@ -1275,4 +1275,66 @@ public void TestPinyinPosition3() throws IOException {
Assert.assertEquals(3, re.get(5).position);

}

/**
 * Verifies the terms, offsets and positions emitted for the ASCII-only input
 * "medcl" under two configurations that differ in
 * {@code keepNoneChineseTogether}.
 *
 * <p>With {@code keepNoneChineseTogether = true} the first two terms are expected
 * to be "me" (offsets 0-2) and "medcl" (offsets 0-5), both at position 1.
 *
 * <p>With {@code keepNoneChineseTogether = false} (and
 * {@code keepJoinedFullPinyin = true}) the expected terms are "m" (0-1, position 1),
 * "medcl" (0-5, position 1), "e" (1-2, position 2) and "d" (2-3, position 3).
 *
 * @throws IOException propagated from the tokenizing helper
 */
@Test
public void TestPinyinPosition4() throws IOException {
    final String text = "medcl";
    String[] inputs = {text};

    // Scenario 1: non-Chinese characters are kept together.
    PinyinConfig togetherConfig = new PinyinConfig();
    togetherConfig.keepFirstLetter = true;
    togetherConfig.keepSeparateFirstLetter = true;
    togetherConfig.keepNoneChinese = true;
    togetherConfig.keepOriginal = true;
    togetherConfig.keepFullPinyin = true;
    togetherConfig.keepNoneChineseTogether = true;

    HashMap<String, ArrayList<TermItem>> termsByInput =
            getStringArrayListHashMap(inputs, togetherConfig);
    ArrayList<TermItem> terms = termsByInput.get(text);

    TermItem item = terms.get(0);
    Assert.assertEquals("me", item.term);
    Assert.assertEquals(0, item.startOffset);
    Assert.assertEquals(2, item.endOffset);
    Assert.assertEquals(1, item.position);

    item = terms.get(1);
    Assert.assertEquals("medcl", item.term);
    Assert.assertEquals(0, item.startOffset);
    Assert.assertEquals(5, item.endOffset);
    Assert.assertEquals(1, item.position);

    // Scenario 2: non-Chinese characters are emitted one by one, joined full pinyin kept.
    PinyinConfig separateConfig = new PinyinConfig();
    separateConfig.keepFirstLetter = true;
    separateConfig.keepSeparateFirstLetter = true;
    separateConfig.keepNoneChinese = true;
    separateConfig.keepOriginal = true;
    separateConfig.keepFullPinyin = true;
    separateConfig.keepNoneChineseTogether = false;
    separateConfig.keepJoinedFullPinyin = true;

    termsByInput = getStringArrayListHashMap(inputs, separateConfig);
    terms = termsByInput.get(text);

    item = terms.get(0);
    Assert.assertEquals("m", item.term);
    Assert.assertEquals(0, item.startOffset);
    Assert.assertEquals(1, item.endOffset);
    Assert.assertEquals(1, item.position);

    // The full original term shares position 1 with the first letter.
    item = terms.get(1);
    Assert.assertEquals("medcl", item.term);
    Assert.assertEquals(0, item.startOffset);
    Assert.assertEquals(5, item.endOffset);
    Assert.assertEquals(1, item.position);

    item = terms.get(2);
    Assert.assertEquals("e", item.term);
    Assert.assertEquals(1, item.startOffset);
    Assert.assertEquals(2, item.endOffset);
    Assert.assertEquals(2, item.position);

    item = terms.get(3);
    Assert.assertEquals("d", item.term);
    Assert.assertEquals(2, item.startOffset);
    Assert.assertEquals(3, item.endOffset);
    Assert.assertEquals(3, item.position);
}
}

0 comments on commit 1089270

Please sign in to comment.