add new Fast match phrase #1058

Open
wants to merge 2 commits into base: master
1 change: 1 addition & 0 deletions .gitignore
@@ -8,3 +8,4 @@
*.iml
\.*
!.travis.yml
*/target
123 changes: 123 additions & 0 deletions README.md
@@ -143,6 +143,129 @@ Result
}
```

# Fast Match Phrase

Elasticsearch's match_phrase query is very CPU-intensive, because it has to check the relative positions of the matching terms. To speed up phrase search, this change optimizes the tokenization so that the correct relative position information is stored, which lets match_phrase queries run against analyzed text. In our tests, query time with these analyzers dropped to below 10% of the original. Each analyzer comes in an index variant and a search variant, used for indexing and for querying respectively.

The principle: every emitted term carries the position of its first character, so the relative position information is preserved in the inverted index. The index analyzer emits all token combinations; the search analyzer emits the minimal set of terms, with no overlap and no duplicates.

Usage:

1. Define a text field with `analyzer` set to the index analyzer and `search_analyzer` set to the search analyzer.
2. Index your data.
3. Query with match_phrase.
4. Analyzers that anchor each token to the position of its first character: `fcp_index`, `fcp_search`; to the position of its last character: `lcp_index`, `lcp_search` (see the sketch after the examples below).
5. Limitation: native highlighting does not currently support this tokenization.

How it works:

```json
# fcp_index is the finest-grained tokenization: each token takes the position of its first character, which fixes the convention for position values
POST /_analyze
{
"analyzer": "fcp_index",
"text": "中国平安"
}
# response
{
"tokens": [
{
"token": "中",
"start_offset": 0,
"end_offset": 0,
"type": "<CHAR_CHINESE>",
"position": 0
},
{
"token": "中国",
"start_offset": 0,
"end_offset": 0,
"type": "<COMBINE_WORD>",
"position": 0
},
{
"token": "国",
"start_offset": 0,
"end_offset": 0,
"type": "<CHAR_CHINESE>",
"position": 1
},
{
"token": "平",
"start_offset": 0,
"end_offset": 0,
"type": "<CHAR_CHINESE>",
"position": 2
},
{
"token": "平安",
"start_offset": 0,
"end_offset": 0,
"type": "<COMBINE_WORD>",
"position": 2
},
{
"token": "安",
"start_offset": 0,
"end_offset": 0,
"type": "<CHAR_CHINESE>",
"position": 3
}
]
}
# fcp_search is coarse-grained with no overlapping tokens, but positions still follow each token's first character, so match_phrase works
POST /_analyze
{
"analyzer": "fcp_search",
"text": "中国平安"
}
# response
{
"tokens": [
{
"token": "中国",
"start_offset": 0,
"end_offset": 2,
"type": "<COMBINE_WORD>",
"position": 0
},
{
"token": "平安",
"start_offset": 2,
"end_offset": 4,
"type": "<COMBINE_WORD>",
"position": 2
}
]
}
```
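
For the `lcp_index` / `lcp_search` variants mentioned in step 4, tokens are anchored to the position of their last character instead. A hypothetical sketch of the same input (the output below is derived from the stated rule, not captured from a live node):

```json
POST /_analyze
{
    "analyzer": "lcp_index",
    "text": "中国平安"
}
# Illustrative response: the same tokens as fcp_index, but positioned by
# last character, e.g. "中" at 0, "中国" at 1, "国" at 1, "平" at 2,
# "平安" at 3, "安" at 3
```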

```json
PUT test_index
{
"mappings": {
"properties": {
"content":{
"type": "text",
"analyzer": "fcp_index",
"search_analyzer": "fcp_search"
}
}
}
}

POST test_index/_doc/1
{
"content": "如果需要覆盖原来的配置"
}

GET test_index/_search
{
"query": {
"match_phrase": {
"content": {
"query": "要覆盖"
}
}
}
}
```
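
Why this phrase query matches: both sides anchor each term to a character position, so the relative gaps agree even though the two analyzers cut the text differently. A sketch, assuming the dictionary splits the query into 要 and 覆盖 (the actual tokens depend on the loaded dictionary):

```json
POST test_index/_analyze
{
    "analyzer": "fcp_search",
    "text": "要覆盖"
}
# Assumed tokens: "要" at position 0 and "覆盖" at position 1 (gap of 1).
# In the indexed document "如果需要覆盖原来的配置", "要" is indexed at
# character position 3 and "覆盖" at position 4, the same gap of 1,
# so match_phrase finds the phrase.
```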

# Dictionary Configuration

Config file `IKAnalyzer.cfg.xml` can be located at `{conf}/analysis-ik/config/IKAnalyzer.cfg.xml`
8 changes: 4 additions & 4 deletions core/src/main/java/org/wltea/analyzer/dic/DictSegment.java
@@ -32,7 +32,7 @@
/**
 * Dictionary tree segment, representing one branch of the dictionary trie
*/
class DictSegment implements Comparable<DictSegment>{
public class DictSegment implements Comparable<DictSegment>{

//shared character table, storing Chinese characters
private static final Map<Character , Character> charMap = new ConcurrentHashMap<Character , Character>(16 , 0.95f);
@@ -55,7 +55,7 @@ class DictSegment implements Comparable<DictSegment>{
private int nodeState = 0;


DictSegment(Character nodeChar){
public DictSegment(Character nodeChar){
if(nodeChar == null){
throw new IllegalArgumentException("node char cannot be empty");
}
@@ -78,7 +78,7 @@ boolean hasNextNode(){
* @param charArray
* @return Hit
*/
Hit match(char[] charArray){
public Hit match(char[] charArray){
return this.match(charArray , 0 , charArray.length , null);
}

@@ -166,7 +166,7 @@ Hit match(char[] charArray , int begin , int length , Hit searchHit){
* 加载填充词典片段
* @param charArray
*/
void fillSegment(char[] charArray){
public void fillSegment(char[] charArray){
this.fillSegment(charArray, 0 , charArray.length , 1);
}

4 changes: 4 additions & 0 deletions core/src/main/java/org/wltea/analyzer/dic/Dictionary.java
@@ -126,6 +126,10 @@ private Dictionary(Configuration cfg) {
}
}

// expose the main dictionary trie so code outside the dic package can match against it
public DictSegment get_MainDict() {
return _MainDict;
}

private String getProperty(String key){
if(props!=null){
return props.getProperty(key);
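
Taken together with the `DictSegment` visibility changes above, this opens the loaded main dictionary to code outside the `dic` package. A minimal usage sketch, assuming the plugin's dictionary has already been initialized (a hypothetical caller, not code from this PR):

```java
import org.wltea.analyzer.dic.DictSegment;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;

class MainDictLookupSketch {
    // hypothetical helper: true if the char sequence is a complete dictionary word
    static boolean isWord(String text) {
        DictSegment main = Dictionary.getSingleton().get_MainDict();
        Hit hit = main.match(text.toCharArray());
        return hit.isMatch();
    }
}
```
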
185 changes: 185 additions & 0 deletions core/src/main/java/org/wltea/analyzer/fcp/CombineCharFilter.java
@@ -0,0 +1,185 @@
package org.wltea.analyzer.fcp;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.fcp.util.CharacterUtil;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Set;

/**
 * Combines consecutive English or number characters into single tokens.
 * The filter buffers the whole upstream token stream on the first call to
 * incrementToken(), merges runs of same-type characters, and then replays
 * the merged tokens one at a time. For example, the upstream single-char
 * tokens 3 | . | 1 | 4 come back out as one token "3.14".
 */
public class CombineCharFilter extends TokenFilter {
    public static final int DEFAULT_MAX_WORD_LEN = 255;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

    // buffers the upstream tokens; backed by an ArrayList
    private List<TokenBody> tokenBodies = null;
    private Queue<TokenBody> tokenResults = new LinkedList<>();
    // maximum token length, guards against overly long English runs
    private final int maxTokenLen;

    private static final Set<String> numberDot;
    static {
        Set<String> tmp = new HashSet<>();
        tmp.add("."); // 2.345
        tmp.add(","); // 1,234,567
        numberDot = Collections.unmodifiableSet(tmp);
    }

    public CombineCharFilter(TokenStream input) {
        super(input);
        this.maxTokenLen = DEFAULT_MAX_WORD_LEN;
    }

    /**
     * Construct a token stream filtering the given input.
     *
     * @param input       the upstream token stream
     * @param maxTokenLen maximum number of sub-tokens merged into one token
     */
    public CombineCharFilter(TokenStream input, int maxTokenLen) {
        super(input);
        this.maxTokenLen = maxTokenLen;
    }

    @Override
    public final boolean incrementToken() throws IOException {
        if (tokenBodies == null && input.incrementToken()) {
            // first call: drain the whole upstream stream into the buffer
            tokenBodies = new ArrayList<>();
            do {
                TokenBody tb = new TokenBody(
                        termAtt.toString(),
                        offsetAtt.startOffset(),
                        offsetAtt.endOffset(),
                        typeAtt.type());
                tokenBodies.add(tb);
            } while (input.incrementToken());

            combineCharsByType(tokenBodies);
        }
        if (tokenResults.size() > 0) {
            TokenBody body = tokenResults.poll();
            char[] chars = body.termBuffer.toCharArray();
            clearAttributes();
            termAtt.copyBuffer(chars, 0, chars.length);
            offsetAtt.setOffset(body.startOffset, body.endOffset);
            typeAtt.setType(body.type);
            posIncrAtt.setPositionIncrement(1);
            return true;
        } else {
            tokenBodies = null;
        }
        return false;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        tokenBodies = null;
        tokenResults.clear();
    }

    private void combineCharsByType(List<TokenBody> tokenBodies) {
        if (tokenBodies == null || tokenBodies.size() == 0) {
            return;
        }
        // first pass: merge adjacent tokens of the same type (english / number / useless)
        List<TokenBody> sameType = new ArrayList<>();
        for (int beginI = 0; beginI < tokenBodies.size();) {
            int nextTypeIndex = getNextTypeIndex(tokenBodies, beginI);
            TokenBody body = composeTokens(tokenBodies, beginI, nextTypeIndex, tokenBodies.get(beginI).type);
            sameType.add(body);
            beginI = nextTypeIndex;
        }
        // second pass: merge adjacent english and number runs into alphanumeric tokens
        for (int beginI = 0; beginI < sameType.size();) {
            TokenBody current = sameType.get(beginI);
            int nextI = beginI + 1;
            if (CharacterUtil.CHAR_NUMBER.equals(current.type) || CharacterUtil.CHAR_ENGLISH.equals(current.type)) {
                for (; nextI < sameType.size(); nextI++) {
                    TokenBody next = sameType.get(nextI);
                    if (CharacterUtil.CHAR_NUMBER.equals(next.type)
                            || CharacterUtil.CHAR_ENGLISH.equals(next.type)) {
                        current.type = CharacterUtil.ALPHANUM;
                        current.termBuffer = current.termBuffer + next.termBuffer;
                        current.endOffset = next.endOffset;
                    } else {
                        break;
                    }
                }
            }
            beginI = nextI;
            tokenResults.add(current);
        }
    }

    private TokenBody composeTokens(List<TokenBody> tokenBodies, int beginI, int nextTypeIndex, String type) {
        StringBuffer buffer = new StringBuffer();
        int startOffset = tokenBodies.get(beginI).startOffset;
        int endOffset = tokenBodies.get(nextTypeIndex - 1).endOffset;
        for (int i = beginI; i < nextTypeIndex; i++) {
            buffer.append(tokenBodies.get(i).termBuffer);
        }
        return new TokenBody(buffer.toString(), startOffset, endOffset, type);
    }

    // the type of the first TokenBody determines the type of the whole run;
    // returns the index one past the end of that run
    private int getNextTypeIndex(List<TokenBody> tokenBodies, final int beginI) {
        int currentIndex = beginI;
        // if currentIndex is the last position in tokenBodies, return immediately
        if (currentIndex == tokenBodies.size() - 1) {
            return currentIndex + 1;
        }
        TokenBody current = tokenBodies.get(currentIndex);
        final String currentWordType = current.type;
        int maxIndex = Math.min(currentIndex + maxTokenLen, tokenBodies.size());
        if (CharacterUtil.CHAR_NUMBER.equals(currentWordType)) {
            for (currentIndex++; currentIndex < maxIndex; currentIndex++) {
                current = tokenBodies.get(currentIndex);
                if (CharacterUtil.CHAR_USELESS.equals(current.type) && numberDot.contains(current.termBuffer)) {
                    if (currentIndex + 1 < maxIndex && CharacterUtil.CHAR_NUMBER.equals(tokenBodies.get(currentIndex + 1).type)) {
                        // the run contains a decimal point or thousands separator, so retype the whole run
                        tokenBodies.get(beginI).type = CharacterUtil.CHAR_NUMBER_DOT;
                    } else {
                        break;
                    }
                } else if (!CharacterUtil.CHAR_NUMBER.equals(current.type)) {
                    break;
                }
            }
            return currentIndex;
        } else if (CharacterUtil.CHAR_ENGLISH.equals(currentWordType) || CharacterUtil.CHAR_USELESS.equals(currentWordType)) {
            for (currentIndex++; currentIndex < maxIndex; currentIndex++) {
                current = tokenBodies.get(currentIndex);
                if (!currentWordType.equals(current.type)) {
                    break;
                }
            }
            return currentIndex;
        } else {
            return currentIndex + 1;
        }
    }

    private static class TokenBody {
        String termBuffer;
        int startOffset, endOffset;
        String type;

        TokenBody(String termBuffer, int startOffset, int endOffset, String type) {
            this.termBuffer = termBuffer;
            this.startOffset = startOffset;
            this.endOffset = endOffset;
            this.type = type;
        }
    }
}