add new Fast match phrase #1058

Open
wants to merge 2 commits into base: master
1 change: 1 addition & 0 deletions .gitignore
@@ -8,3 +8,4 @@
*.iml
\.*
!.travis.yml
*/target
123 changes: 123 additions & 0 deletions README.md
@@ -143,6 +143,129 @@ Result
}
```

# Fast Match Phrase

Elasticsearch's match_phrase query is very CPU-intensive, because it has to check the relative positions of the matching terms. To speed up phrase search, this change optimizes the tokenization so that the correct relative position information is stored, which lets match_phrase queries run against analyzed text. In our tests, query time with these analyzers dropped to below 10% of the original. Each analyzer comes in an index variant and a search variant, used for indexing and for querying respectively.

The principle: every emitted term carries the position of its first character, so the relative position information is preserved in the inverted index. The index analyzer emits all token combinations; the search analyzer emits the minimal set of terms, with no overlap and no duplicates.

Usage:

1. Define a text field with `analyzer` set to the index analyzer and `search_analyzer` set to the search analyzer.
2. Index your data.
3. Query with match_phrase.
4. Analyzers that anchor each token to the position of its first character: `fcp_index`, `fcp_search`; to the position of its last character: `lcp_index`, `lcp_search` (see the sketch after the examples below).
5. Limitation: native highlighting does not currently support this tokenization.

How it works:

```json
# fcp_index is the finest-grained tokenization: each token takes the position of its first character, which fixes the convention for position values
POST /_analyze
{
"analyzer": "fcp_index",
"text": "中国平安"
}
# response
{
"tokens": [
{
"token": "中",
"start_offset": 0,
"end_offset": 0,
"type": "<CHAR_CHINESE>",
"position": 0
},
{
"token": "中国",
"start_offset": 0,
"end_offset": 0,
"type": "<COMBINE_WORD>",
"position": 0
},
{
"token": "国",
"start_offset": 0,
"end_offset": 0,
"type": "<CHAR_CHINESE>",
"position": 1
},
{
"token": "平",
"start_offset": 0,
"end_offset": 0,
"type": "<CHAR_CHINESE>",
"position": 2
},
{
"token": "平安",
"start_offset": 0,
"end_offset": 0,
"type": "<COMBINE_WORD>",
"position": 2
},
{
"token": "安",
"start_offset": 0,
"end_offset": 0,
"type": "<CHAR_CHINESE>",
"position": 3
}
]
}
# fcp_search is coarse-grained with no overlapping tokens, but positions still follow each token's first character, so match_phrase works
POST /_analyze
{
"analyzer": "fcp_search",
"text": "中国平安"
}
# response
{
"tokens": [
{
"token": "中国",
"start_offset": 0,
"end_offset": 2,
"type": "<COMBINE_WORD>",
"position": 0
},
{
"token": "平安",
"start_offset": 2,
"end_offset": 4,
"type": "<COMBINE_WORD>",
"position": 2
}
]
}
```
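
For the `lcp_index` / `lcp_search` variants mentioned in step 4, tokens are anchored to the position of their last character instead. A hypothetical sketch of the same input (the output below is derived from the stated rule, not captured from a live node):

```json
POST /_analyze
{
    "analyzer": "lcp_index",
    "text": "中国平安"
}
# Illustrative response: the same tokens as fcp_index, but positioned by
# last character, e.g. "中" at 0, "中国" at 1, "国" at 1, "平" at 2,
# "平安" at 3, "安" at 3
```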

```json
PUT test_index
{
"mappings": {
"properties": {
"content":{
"type": "text",
"analyzer": "fcp_index",
"search_analyzer": "fcp_search"
}
}
}
}

POST test_index/_doc/1
{
"content": "如果需要覆盖原来的配置"
}

GET test_index/_search
{
"query": {
"match_phrase": {
"content": {
"query": "要覆盖"
}
}
}
}
```
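
Why this phrase query matches: both sides anchor each term to a character position, so the relative gaps agree even though the two analyzers cut the text differently. A sketch, assuming the dictionary splits the query into 要 and 覆盖 (the actual tokens depend on the loaded dictionary):

```json
POST test_index/_analyze
{
    "analyzer": "fcp_search",
    "text": "要覆盖"
}
# Assumed tokens: "要" at position 0 and "覆盖" at position 1 (gap of 1).
# In the indexed document "如果需要覆盖原来的配置", "要" is indexed at
# character position 3 and "覆盖" at position 4, the same gap of 1,
# so match_phrase finds the phrase.
```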

# Dictionary Configuration

Config file `IKAnalyzer.cfg.xml` can be located at `{conf}/analysis-ik/config/IKAnalyzer.cfg.xml`
8 changes: 4 additions & 4 deletions core/src/main/java/org/wltea/analyzer/dic/DictSegment.java
@@ -32,7 +32,7 @@
/**
 * Dictionary tree segment, representing one branch of the dictionary trie
*/
class DictSegment implements Comparable<DictSegment>{
public class DictSegment implements Comparable<DictSegment>{

//shared character table, storing Chinese characters
private static final Map<Character , Character> charMap = new ConcurrentHashMap<Character , Character>(16 , 0.95f);
@@ -55,7 +55,7 @@ class DictSegment implements Comparable<DictSegment>{
private int nodeState = 0;


DictSegment(Character nodeChar){
public DictSegment(Character nodeChar){
if(nodeChar == null){
throw new IllegalArgumentException("node char cannot be empty");
}
@@ -78,7 +78,7 @@ boolean hasNextNode(){
* @param charArray
* @return Hit
*/
Hit match(char[] charArray){
public Hit match(char[] charArray){
return this.match(charArray , 0 , charArray.length , null);
}

@@ -166,7 +166,7 @@ Hit match(char[] charArray , int begin , int length , Hit searchHit){
* 加载填充词典片段
* @param charArray
*/
void fillSegment(char[] charArray){
public void fillSegment(char[] charArray){
this.fillSegment(charArray, 0 , charArray.length , 1);
}

4 changes: 4 additions & 0 deletions core/src/main/java/org/wltea/analyzer/dic/Dictionary.java
@@ -126,6 +126,10 @@ private Dictionary(Configuration cfg) {
}
}

// expose the main dictionary trie so code outside the dic package can match against it
public DictSegment get_MainDict() {
return _MainDict;
}

private String getProperty(String key){
if(props!=null){
return props.getProperty(key);
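
Taken together with the `DictSegment` visibility changes above, this opens the loaded main dictionary to code outside the `dic` package. A minimal usage sketch, assuming the plugin's dictionary has already been initialized (a hypothetical caller, not code from this PR):

```java
import org.wltea.analyzer.dic.DictSegment;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;

class MainDictLookupSketch {
    // hypothetical helper: true if the char sequence is a complete dictionary word
    static boolean isWord(String text) {
        DictSegment main = Dictionary.getSingleton().get_MainDict();
        Hit hit = main.match(text.toCharArray());
        return hit.isMatch();
    }
}
```
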
185 changes: 185 additions & 0 deletions core/src/main/java/org/wltea/analyzer/fcp/CombineCharFilter.java
@@ -0,0 +1,185 @@
package org.wltea.analyzer.fcp;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.fcp.util.CharacterUtil;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Set;

/**
 * Combines consecutive English or number characters into single tokens.
 * The filter buffers the whole upstream token stream on the first call to
 * incrementToken(), merges runs of same-type characters, and then replays
 * the merged tokens one at a time. For example, the upstream single-char
 * tokens 3 | . | 1 | 4 come back out as one token "3.14".
 */
public class CombineCharFilter extends TokenFilter {
    public static final int DEFAULT_MAX_WORD_LEN = 255;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

    // buffers the upstream tokens; backed by an ArrayList
    private List<TokenBody> tokenBodies = null;
    private Queue<TokenBody> tokenResults = new LinkedList<>();
    // maximum token length, guards against overly long English runs
    private final int maxTokenLen;

    private static final Set<String> numberDot;
    static {
        Set<String> tmp = new HashSet<>();
        tmp.add("."); // 2.345
        tmp.add(","); // 1,234,567
        numberDot = Collections.unmodifiableSet(tmp);
    }

    public CombineCharFilter(TokenStream input) {
        super(input);
        this.maxTokenLen = DEFAULT_MAX_WORD_LEN;
    }

    /**
     * Construct a token stream filtering the given input.
     *
     * @param input       the upstream token stream
     * @param maxTokenLen maximum number of sub-tokens merged into one token
     */
    public CombineCharFilter(TokenStream input, int maxTokenLen) {
        super(input);
        this.maxTokenLen = maxTokenLen;
    }

    @Override
    public final boolean incrementToken() throws IOException {
        if (tokenBodies == null && input.incrementToken()) {
            // first call: drain the whole upstream stream into the buffer
            tokenBodies = new ArrayList<>();
            do {
                TokenBody tb = new TokenBody(
                        termAtt.toString(),
                        offsetAtt.startOffset(),
                        offsetAtt.endOffset(),
                        typeAtt.type());
                tokenBodies.add(tb);
            } while (input.incrementToken());

            combineCharsByType(tokenBodies);
        }
        if (tokenResults.size() > 0) {
            TokenBody body = tokenResults.poll();
            char[] chars = body.termBuffer.toCharArray();
            clearAttributes();
            termAtt.copyBuffer(chars, 0, chars.length);
            offsetAtt.setOffset(body.startOffset, body.endOffset);
            typeAtt.setType(body.type);
            posIncrAtt.setPositionIncrement(1);
            return true;
        } else {
            tokenBodies = null;
        }
        return false;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        tokenBodies = null;
        tokenResults.clear();
    }

    private void combineCharsByType(List<TokenBody> tokenBodies) {
        if (tokenBodies == null || tokenBodies.size() == 0) {
            return;
        }
        // first pass: merge adjacent tokens of the same type (english / number / useless)
        List<TokenBody> sameType = new ArrayList<>();
        for (int beginI = 0; beginI < tokenBodies.size();) {
            int nextTypeIndex = getNextTypeIndex(tokenBodies, beginI);
            TokenBody body = composeTokens(tokenBodies, beginI, nextTypeIndex, tokenBodies.get(beginI).type);
            sameType.add(body);
            beginI = nextTypeIndex;
        }
        // second pass: merge adjacent english and number runs into alphanumeric tokens
        for (int beginI = 0; beginI < sameType.size();) {
            TokenBody current = sameType.get(beginI);
            int nextI = beginI + 1;
            if (CharacterUtil.CHAR_NUMBER.equals(current.type) || CharacterUtil.CHAR_ENGLISH.equals(current.type)) {
                for (; nextI < sameType.size(); nextI++) {
                    TokenBody next = sameType.get(nextI);
                    if (CharacterUtil.CHAR_NUMBER.equals(next.type)
                            || CharacterUtil.CHAR_ENGLISH.equals(next.type)) {
                        current.type = CharacterUtil.ALPHANUM;
                        current.termBuffer = current.termBuffer + next.termBuffer;
                        current.endOffset = next.endOffset;
                    } else {
                        break;
                    }
                }
            }
            beginI = nextI;
            tokenResults.add(current);
        }
    }

    private TokenBody composeTokens(List<TokenBody> tokenBodies, int beginI, int nextTypeIndex, String type) {
        StringBuffer buffer = new StringBuffer();
        int startOffset = tokenBodies.get(beginI).startOffset;
        int endOffset = tokenBodies.get(nextTypeIndex - 1).endOffset;
        for (int i = beginI; i < nextTypeIndex; i++) {
            buffer.append(tokenBodies.get(i).termBuffer);
        }
        return new TokenBody(buffer.toString(), startOffset, endOffset, type);
    }

    // the type of the first TokenBody determines the type of the whole run;
    // returns the index one past the end of that run
    private int getNextTypeIndex(List<TokenBody> tokenBodies, final int beginI) {
        int currentIndex = beginI;
        // if currentIndex is the last position in tokenBodies, return immediately
        if (currentIndex == tokenBodies.size() - 1) {
            return currentIndex + 1;
        }
        TokenBody current = tokenBodies.get(currentIndex);
        final String currentWordType = current.type;
        int maxIndex = Math.min(currentIndex + maxTokenLen, tokenBodies.size());
        if (CharacterUtil.CHAR_NUMBER.equals(currentWordType)) {
            for (currentIndex++; currentIndex < maxIndex; currentIndex++) {
                current = tokenBodies.get(currentIndex);
                if (CharacterUtil.CHAR_USELESS.equals(current.type) && numberDot.contains(current.termBuffer)) {
                    if (currentIndex + 1 < maxIndex && CharacterUtil.CHAR_NUMBER.equals(tokenBodies.get(currentIndex + 1).type)) {
                        // the run contains a decimal point or thousands separator, so retype the whole run
                        tokenBodies.get(beginI).type = CharacterUtil.CHAR_NUMBER_DOT;
                    } else {
                        break;
                    }
                } else if (!CharacterUtil.CHAR_NUMBER.equals(current.type)) {
                    break;
                }
            }
            return currentIndex;
        } else if (CharacterUtil.CHAR_ENGLISH.equals(currentWordType) || CharacterUtil.CHAR_USELESS.equals(currentWordType)) {
            for (currentIndex++; currentIndex < maxIndex; currentIndex++) {
                current = tokenBodies.get(currentIndex);
                if (!currentWordType.equals(current.type)) {
                    break;
                }
            }
            return currentIndex;
        } else {
            return currentIndex + 1;
        }
    }

    private static class TokenBody {
        String termBuffer;
        int startOffset, endOffset;
        String type;

        TokenBody(String termBuffer, int startOffset, int endOffset, String type) {
            this.termBuffer = termBuffer;
            this.startOffset = startOffset;
            this.endOffset = endOffset;
            this.type = type;
        }
    }
}