From f2e64ebbefc075fddb9c248fbc76b4cd907a3f49 Mon Sep 17 00:00:00 2001
From: "hao.mou" <1967886749@qq.com>
Date: Sat, 20 Apr 2024 19:51:54 +0800
Subject: [PATCH 1/2] add new analyzer to speed up match_phrase query
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                                         |   1 +
 README.md                                          | 123 +++++
 .../org/wltea/analyzer/dic/DictSegment.java        |   8 +-
 .../org/wltea/analyzer/dic/Dictionary.java         |   4 +
 .../wltea/analyzer/fcp/CombineCharFilter.java      | 185 +++++++
 .../org/wltea/analyzer/fcp/ExtendFilter.java       | 474 ++++++++++++++++++
 .../org/wltea/analyzer/fcp/FCPAnalyzer.java        | 131 +++++
 .../org/wltea/analyzer/fcp/FormatFilter.java       |  51 ++
 .../org/wltea/analyzer/fcp/OptionPath.java         | 102 ++++
 .../org/wltea/analyzer/fcp/TokenBody.java          |  47 ++
 .../PositionLengthAttribute.java                   |  33 ++
 .../PositionLengthAttributeImpl.java               |  62 +++
 .../analyzer/fcp/util/CharacterUtil.java           | 124 +++++
 .../ik/elasticsearch/AnalysisIkPlugin.java         |   6 +
 .../ik/elasticsearch/FCPAnalyzerProvider.java      |  89 ++++
 15 files changed, 1436 insertions(+), 4 deletions(-)
 create mode 100644 core/src/main/java/org/wltea/analyzer/fcp/CombineCharFilter.java
 create mode 100644 core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java
 create mode 100644 core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java
 create mode 100644 core/src/main/java/org/wltea/analyzer/fcp/FormatFilter.java
 create mode 100644 core/src/main/java/org/wltea/analyzer/fcp/OptionPath.java
 create mode 100644 core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java
 create mode 100644 core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java
 create mode 100644 core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java
 create mode 100644 core/src/main/java/org/wltea/analyzer/fcp/util/CharacterUtil.java
 create mode 100644 elasticsearch/src/main/java/com/infinilabs/ik/elasticsearch/FCPAnalyzerProvider.java

diff --git a/.gitignore b/.gitignore
index a53ac3d1..90db62f7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@
 *.iml
 \.*
 !.travis.yml
+*/target

diff --git a/README.md b/README.md
index 60eab6b9..8bea7581 100644
--- a/README.md
+++ b/README.md
@@ -143,6 +143,129 @@ Result
 }
 ```
+Elasticsearch's match_phrase query is very CPU-intensive, because it has to check the relative positions of terms. To speed up these searches, this patch optimizes the tokenization so that correct relative position information is stored in the index, which lets match_phrase run against analyzed text; in testing, queries dropped to below 10% of their original cost. The feature ships as a pair of analyzers, an index analyzer and a search analyzer, used for indexing data and for querying respectively.
+The principle: each term produced by the tokenizer carries the position of its first character, so correct relative position information can be stored in the inverted index. The index analyzer emits every combination found in the dictionary; the search analyzer cuts the text into the fewest terms possible, with no overlaps.
+Usage:
+1. Define a text field with `analyzer` set to the index analyzer and `search_analyzer` set to the search analyzer (a settings sketch follows this list).
+2. Index your data.
+3. Query.
+4. Analyzers that anchor a term's position on its first character: fcp_index, fcp_search; on its last character: lcp_index, lcp_search.
+5. Known limitation: the built-in highlighting does not currently support this tokenization.
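+
+The analyzer also accepts index-level settings. The setting names below match what `FCPAnalyzerProvider` reads in this patch (`index_mode`, `split_complete`, `max_token_length`, `useless_mapping`, `ignore_blank`, `use_first_position`, `show_offset`), but the analyzer `type` name `fcp` and the index name `fcp_demo` are assumptions for illustration; check `AnalysisIkPlugin` for the registered names. A minimal sketch:
+
+```json
+PUT fcp_demo
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_fcp": {
+          "type": "fcp",
+          "index_mode": true,
+          "split_complete": false,
+          "max_token_length": 255,
+          "useless_mapping": true,
+          "ignore_blank": true,
+          "use_first_position": true,
+          "show_offset": false
+        }
+      }
+    }
+  }
+}
+```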
+
+Principle
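+
+The two `_analyze` examples below show the position rule. `fcp_index` is the finest-grained form: it emits every character and every dictionary word, and each word takes the position of its first character (so `中国` shares position 0 with `中`). `fcp_search` is the coarse-grained, non-overlapping form, but each word still takes its first character's position, which is why `match_phrase` stays correct between index time and search time.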
+ +```json +# 使用index 分词是,最细粒度的,按照字的position确定词的position,确定了position的取值标准 +POST /_analyze +{ + "analyzer": "fcp_index", + "text": "中国平安" +} +# response +{ + "tokens": [ + { + "token": "中", + "start_offset": 0, + "end_offset": 0, + "type": "", + "position": 0 + }, + { + "token": "中国", + "start_offset": 0, + "end_offset": 0, + "type": "", + "position": 0 + }, + { + "token": "国", + "start_offset": 0, + "end_offset": 0, + "type": "", + "position": 1 + }, + { + "token": "平", + "start_offset": 0, + "end_offset": 0, + "type": "", + "position": 2 + }, + { + "token": "平安", + "start_offset": 0, + "end_offset": 0, + "type": "", + "position": 2 + }, + { + "token": "安", + "start_offset": 0, + "end_offset": 0, + "type": "", + "position": 3 + } + ] +} +# 使用search 分词是粗粒度、无重叠分词,但仍按照字的position确定词的position,所以使用match_phrase有效 +POST /_analyze +{ + "analyzer": "fcp_search", + "text": "中国平安" +} +# response +{ + "tokens": [ + { + "token": "中国", + "start_offset": 0, + "end_offset": 2, + "type": "", + "position": 0 + }, + { + "token": "平安", + "start_offset": 2, + "end_offset": 4, + "type": "", + "position": 2 + } + ] +} +``` + +```json +PUT test_index +{ + "mappings": { + "properties": { + "content":{ + "type": "text", + "analyzer": "fcp_index", + "search_analyzer": "fcp_search" + } + } + } +} + +POST test_index/_doc/1 +{ + "content": "如果需要覆盖原来的配置" +} + +GET test_index/_search +{ + "query": { + "match_phrase": { + "content": { + "query": "要覆盖" + } + } + } +} +``` + # Dictionary Configuration Config file `IKAnalyzer.cfg.xml` can be located at `{conf}/analysis-ik/config/IKAnalyzer.cfg.xml` diff --git a/core/src/main/java/org/wltea/analyzer/dic/DictSegment.java b/core/src/main/java/org/wltea/analyzer/dic/DictSegment.java index 9e7b6fe4..33e60139 100644 --- a/core/src/main/java/org/wltea/analyzer/dic/DictSegment.java +++ b/core/src/main/java/org/wltea/analyzer/dic/DictSegment.java @@ -32,7 +32,7 @@ /** * 词典树分段,表示词典树的一个分枝 */ -class DictSegment implements Comparable{ +public class DictSegment implements Comparable{ //公用字典表,存储汉字 private static final Map charMap = new ConcurrentHashMap(16 , 0.95f); @@ -55,7 +55,7 @@ class DictSegment implements Comparable{ private int nodeState = 0; - DictSegment(Character nodeChar){ + public DictSegment(Character nodeChar){ if(nodeChar == null){ throw new IllegalArgumentException("node char cannot be empty"); } @@ -78,7 +78,7 @@ boolean hasNextNode(){ * @param charArray * @return Hit */ - Hit match(char[] charArray){ + public Hit match(char[] charArray){ return this.match(charArray , 0 , charArray.length , null); } @@ -166,7 +166,7 @@ Hit match(char[] charArray , int begin , int length , Hit searchHit){ * 加载填充词典片段 * @param charArray */ - void fillSegment(char[] charArray){ + public void fillSegment(char[] charArray){ this.fillSegment(charArray, 0 , charArray.length , 1); } diff --git a/core/src/main/java/org/wltea/analyzer/dic/Dictionary.java b/core/src/main/java/org/wltea/analyzer/dic/Dictionary.java index 80a92da1..a6d60278 100755 --- a/core/src/main/java/org/wltea/analyzer/dic/Dictionary.java +++ b/core/src/main/java/org/wltea/analyzer/dic/Dictionary.java @@ -126,6 +126,10 @@ private Dictionary(Configuration cfg) { } } + public DictSegment get_MainDict() { + return _MainDict; + } + private String getProperty(String key){ if(props!=null){ return props.getProperty(key); diff --git a/core/src/main/java/org/wltea/analyzer/fcp/CombineCharFilter.java b/core/src/main/java/org/wltea/analyzer/fcp/CombineCharFilter.java new file mode 100644 index 00000000..e3ad6c4c --- /dev/null +++ 
b/core/src/main/java/org/wltea/analyzer/fcp/CombineCharFilter.java @@ -0,0 +1,185 @@ +package org.wltea.analyzer.fcp; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.wltea.analyzer.fcp.util.CharacterUtil; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Queue; +import java.util.Set; + +/** + * combine continues english or number + */ +public class CombineCharFilter extends TokenFilter { + public static final int DEFAULT_MAX_WORD_LEN = 255; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + + // used for saving upstream tokens , implemented by Arraylist + private List tokenBodies = null; + private Queue tokenResults = new LinkedList(); + // token 最大长度。防止过长English + private final int maxTokenLen; + + private static final Set numberDot; + static { + Set tmp = new HashSet<>(); + tmp.add("."); // 2.345 + tmp.add(","); // 1,234,567 + numberDot = Collections.unmodifiableSet(tmp); + } + + public CombineCharFilter(TokenStream input) { + super(input); + this.maxTokenLen = DEFAULT_MAX_WORD_LEN; + } + /** + * Construct a token stream filtering the given input. 
+ * + * @param input + * @param maxTokenLen + */ + public CombineCharFilter(TokenStream input, int maxTokenLen) { + super(input); + this.maxTokenLen = maxTokenLen; + } + + @Override + public final boolean incrementToken() throws IOException { + if (tokenBodies == null && input.incrementToken()) { + tokenBodies = new ArrayList<>(); + do { + TokenBody tb = new TokenBody( + termAtt.toString(), + offsetAtt.startOffset(), + offsetAtt.endOffset(), + typeAtt.type()); + tokenBodies.add(tb); + } while (input.incrementToken()); + + combineCharsByType(tokenBodies); + } + if (tokenResults.size() > 0) { + TokenBody body = tokenResults.poll(); + char[] chars = body.termBuffer.toCharArray(); + termAtt.copyBuffer(chars, 0, chars.length); + offsetAtt.setOffset(body.startOffset, body.endOffset); + typeAtt.setType(body.type); + posIncrAtt.setPositionIncrement(1); + return true; + } else { + tokenBodies = null; + } + return false; + } + + private void combineCharsByType(List tokenBodies) { + if (tokenBodies == null || tokenBodies.size() == 0) { + return; + } + // 处理合并 english number useless + List sameType = new ArrayList<>(); + for (int beginI = 0; beginI < tokenBodies.size();) { + int nextTypeIndex = getNextTypeIndex(tokenBodies, beginI); + TokenBody body = composeTokens(tokenBodies, beginI, nextTypeIndex, tokenBodies.get(beginI).type); + sameType.add(body); + beginI = nextTypeIndex; + } + // 继续处理 english number + for (int beginI = 0; beginI < sameType.size();) { + TokenBody current = sameType.get(beginI); + int nextI = beginI + 1; + if (CharacterUtil.CHAR_NUMBER.equals(current.type) || CharacterUtil.CHAR_ENGLISH.equals(current.type)) { + for(; nextI < sameType.size(); nextI++) { + TokenBody next = sameType.get(nextI); + if (CharacterUtil.CHAR_NUMBER.equals(next.type) + || CharacterUtil.CHAR_ENGLISH.equals(next.type)) { + current.type = CharacterUtil.ALPHANUM; + current.termBuffer = current.termBuffer + next.termBuffer; + current.endOffset = next.endOffset; + } else { + break; + } + } + } + beginI = nextI; + tokenResults.add(current); + } + + } + + private TokenBody composeTokens(List tokenBodies, int beginI, int nextTypeIndex, String type) { + StringBuffer buffer = new StringBuffer(); + int startOffset = tokenBodies.get(beginI).startOffset; + int endOffset = tokenBodies.get(nextTypeIndex - 1).endOffset; + for(int i = beginI; i < nextTypeIndex; i++) { + buffer.append(tokenBodies.get(i).termBuffer); + } + return new TokenBody(buffer.toString(), startOffset, endOffset, type); + } + + // 首 TokenBody 的 type 作为整体 + private int getNextTypeIndex(List tokenBodies,final int beginI) { + int currentIndex = beginI; + // 如果 currentIndex 为 tokenBodies 的最后一个位置,直接返回 + if (currentIndex == tokenBodies.size() - 1) { + return currentIndex + 1; + } + TokenBody current = tokenBodies.get(currentIndex); + final String currentWordType = current.type; + int maxIndex = Math.min(currentIndex + maxTokenLen, tokenBodies.size()); + if (CharacterUtil.CHAR_NUMBER.equals(currentWordType)) { + for (currentIndex++; currentIndex < maxIndex; currentIndex++) { + current = tokenBodies.get(currentIndex); + if (CharacterUtil.CHAR_USELESS.equals(current.type) && numberDot.contains(current.termBuffer)) { + if (currentIndex+1 < maxIndex && CharacterUtil.CHAR_NUMBER.equals(tokenBodies.get(currentIndex+1).type)) { + // 改变了整体的 type + tokenBodies.get(beginI).type = CharacterUtil.CHAR_NUMBER_DOT; + } else { + break; + } + } else if (!CharacterUtil.CHAR_NUMBER.equals(current.type)) { + break; + } + } + return currentIndex; + } else if 
(CharacterUtil.CHAR_ENGLISH.equals(currentWordType) || CharacterUtil.CHAR_USELESS.equals(currentWordType)) { + for (currentIndex++; currentIndex < maxIndex; currentIndex++) { + current = tokenBodies.get(currentIndex); + if (!currentWordType.equals(current.type)) { + break; + } + } + return currentIndex; + } else { + return currentIndex + 1; + } + } + + + private static class TokenBody { + String termBuffer; + int startOffset, endOffset; + String type; + + TokenBody(String termBuffer, int startOffset, int endOffset, String type){ + this.termBuffer = termBuffer; + this.startOffset = startOffset; + this.endOffset = endOffset; + this.type = type; + } + } +} diff --git a/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java b/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java new file mode 100644 index 00000000..211c09b0 --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java @@ -0,0 +1,474 @@ +package org.wltea.analyzer.fcp; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.wltea.analyzer.dic.DictSegment; +import org.wltea.analyzer.dic.Dictionary; +import org.wltea.analyzer.dic.Hit; +import org.wltea.analyzer.fcp.tokenattributes.PositionLengthAttribute; +import org.wltea.analyzer.fcp.util.CharacterUtil; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.PriorityQueue; + +/** + * use dict to extend terms + */ +public class ExtendFilter extends TokenFilter { + private static final boolean IS_DEBUG = true; + // 默认入库模式 + public static final boolean DEFAULT_INDEX_MODE = true; + // 默认对于特殊字符采用模糊搜索,扩大搜索范围 + public static final boolean DEFAULT_USELESS_MAPPING = true; + // 默认对于句子的空白进行忽略 + public static final boolean DEFAULT_IGNORE_BLANK = true; + // 默认对于句子的空白进行忽略 + public static final boolean DEFAULT_IGNORE_WHITESPACE = true; + // 默认使用 lcp 的模式,使用最后一个char的position + public static final boolean DEFAULT_USE_FIRST_POSITION = false; + // 在高亮的时候使用 offset + public static final boolean DEFAULT_SHOW_OFFSET = false; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + // 用于记录每一个term 的position length + private final PositionLengthAttribute lengthAttribute = addAttribute(PositionLengthAttribute.class); + + // used for saving upstream tokens , implemented by Arraylist + private List tokenBodies = null; + //use to save analyzed tokens ,use priority heap save order + PriorityQueue tokenResults = new PriorityQueue(new Comparator(){ + @Override + public int compare(TokenBody o1, TokenBody o2){ +// return o1.position != o2.position ? 
Integer.compare(o1.position, o2.position) : Integer.compare(o2.startOffset, o1.startOffset); + if(o1.position != o2.position) { + return Integer.compare(o1.position, o2.position); + } else if (o2.startOffset != o1.startOffset) { + return Integer.compare(o2.startOffset, o1.startOffset); + } else { + return Integer.compare(o1.endOffset-o1.startOffset, o2.endOffset-o2.startOffset); + } + } + }); + // 记录上一个 term 的position ,用于计算 positionIncrement + private int prePosition = -1; + + private final boolean indexMode; + // 对于上游的 分词结果 上个 end_offset 和 下一个 token的 start_offset 不相等。 像 “成 功” 之间有空格,该参数决定是否忽略空格组词, 默认为true,忽略之间的 空白 + private boolean ignoreBlank = true; + // 是否使用 first char position ,默认使用,如果为 false,则变为 lcp_analyzer + private boolean useFirstPos = true; + // 特殊字符的映射,默认为 true 表示模糊匹配特殊字符。如果设置为 false ,将会把原始的char放到最终分词结果中。 + private boolean uselessMapping = true; + // 入库模式下不显示,search 模式下显示offset,在 highlight 的时候也开启 + private boolean showOffset = false; + + + public ExtendFilter setIgnoreBlank(boolean ignoreBlank) { + this.ignoreBlank = ignoreBlank; + return this; + } + + + public ExtendFilter setUseFirstPos(boolean useFirstPos) { + this.useFirstPos = useFirstPos; + return this; + } + + public ExtendFilter setUselessMapping(boolean uselessMapping) { + this.uselessMapping = uselessMapping; + return this; + } + + public ExtendFilter setShowOffset(boolean showOffset) { + this.showOffset = showOffset; + return this; + } + + + /** + * Construct a token stream filtering the given input. + * + * @param input + */ + public ExtendFilter(TokenStream input) { + this(input, DEFAULT_INDEX_MODE); + } + + public ExtendFilter(TokenStream input, boolean indexMode) { + super(input); + this.indexMode = indexMode; + } + + @Override + public final boolean incrementToken() throws IOException { + if (tokenBodies == null && input.incrementToken()) { + tokenBodies = new ArrayList<>(); + int position = -1; + do { + TokenBody tb= new TokenBody(); + // TODO lcp analyzer 入库的特殊处理方式(不支持 offset 和 term_vector 存储方式),否则就要改变 lucene源码。 + tb.startOffset = showOffset ? offsetAtt.startOffset() : 0; + tb.endOffset = showOffset ? 
offsetAtt.endOffset() : 0; + // blank 类型会被舍弃,position不变 + tb.termBuffer = termAtt.toString(); + // 下面是处理 position 和 type的赋值,单个 term,没有 startPosition 和 endPosition + if (CharacterUtil.CHAR_USELESS.equals(typeAtt.type())) { + if (isAllBlank(tb.termBuffer) && this.ignoreBlank) { + // 表示沿用上一个 position,下面将会被舍弃掉 + tb.position = position; + tb.type = CharacterUtil.CHAR_BLANK; + tb.termBuffer = ""; + } else { + position += posIncrAtt.getPositionIncrement(); + tb.position = position; + tb.type = typeAtt.type(); + if (uselessMapping) { + tb.termBuffer = "#"; // 无特殊含义,将特殊字符统一映射为 # 方便查询 + } + } + } else { + position += posIncrAtt.getPositionIncrement(); + tb.position = position; + tb.type = typeAtt.type(); + } + tokenBodies.add(tb); + } while (input.incrementToken()); + + extendTerms(tokenBodies, indexMode, ignoreBlank, useFirstPos); + } + if (tokenResults.size() > 0) { + TokenBody body = tokenResults.poll(); + + posIncrAtt.setPositionIncrement(body.position - prePosition); + prePosition = body.position; + char[] chars = body.termBuffer.toCharArray(); + termAtt.copyBuffer(chars, 0, chars.length); + offsetAtt.setOffset(body.startOffset, body.endOffset); + typeAtt.setType(body.type); + if (!indexMode) { + // 计算当前combine term 的跨度,占用了多少个 term + lengthAttribute.setPositionLength(body.endPosition - body.startPosition + 1); + } + return true; + } else { + tokenBodies = null; + prePosition = -1; + } + return false; + } + + + /** + * 判断参数是否全部由空白字符组成 + * @param s + * @return + */ + private boolean isAllBlank(String s) { + return s.trim().length() == 0; + } + + private void extendTerms(List tokenBodies, boolean indexMode, boolean ignoreBlank, boolean useFirstPos) { + if (tokenBodies == null || tokenBodies.size() == 0) { + return; + } + for (int beginI = 0; beginI < tokenBodies.size(); beginI++) { + TokenBody tokenBody = tokenBodies.get(beginI); + if (!tokenBody.type.equals(CharacterUtil.CHAR_BLANK)) { + // 处理当前char, 但要考虑向后扩展,得到以当前位置开始 以 endList 中位置结束的一系列term, + List endList = getCurrentEndList(tokenBodies, beginI, ignoreBlank); + // 默认在 index 模式下,一股脑全部放到倒排中(index 模式对性能敏感,所以必须保证) + if (!indexMode) { + tokenBody.startPosition = tokenBody.position; + tokenBody.endPosition = tokenBody.position; + } + tokenResults.add(tokenBody); + for (Integer endI : endList) { + TokenBody tb= new TokenBody(); + tb.termBuffer = combineTermBuffer(tokenBodies, beginI, endI); + tb.startOffset = tokenBodies.get(beginI).startOffset; + tb.endOffset = tokenBodies.get(endI).endOffset; + // search 模式下需要记录组合 term 前后的 position + if (!indexMode) { + tb.startPosition = tokenBodies.get(beginI).position; + tb.endPosition = tokenBodies.get(endI).position; + } + if (useFirstPos) { + tb.position = tokenBodies.get(beginI).position; + } else { + tb.position = tokenBodies.get(endI).position; + } + tb.type = ""; + tokenResults.add(tb); + } + } + } + // 到这里如果是index 模式的话,已经可以结束了; + // 如果是 search模式,需要做歧义处理(如果有的话, 使用 类型的char 作为天然分割句子) + if (!indexMode && tokenResults.size() > 0) { + // 在search 模式下,采用 ik_smart 的逻辑进行语义分割,一个重大的意义:引入了语义分割 + // 1,ik 使用没有语义重叠的那个 char 作为分割点,只作用于有字符重叠的部分 + // 2,由于和 index 模式使用相同的 向后扩展逻辑,所以search是index 的子集 + // 3,search 模式下,不会涉及mapping的扩展引入 + // 4,search 模式下,使用 startPosition 来进行判断扩是否有歧义 + + // 用于保存多个term的组合形式,逆序。:采用动态编程思想,完成快速组合 + PriorityQueue combineTerms = new PriorityQueue(new Comparator(){ + @Override + public int compare(TokenBody o1, TokenBody o2){ + // 顺序有重要意义 + return o1.startPosition != o2.startPosition ? 
+ Integer.compare(o1.startPosition, o2.startPosition) + : Integer.compare(o2.endPosition, o1.endPosition); + } + }); + // 用于保存单个term的形式(最后将保存全部的结果) + Map singleTerm = new HashMap<>(); + + // 将切分结果重新排序, 并清空之前的处理结果 + int startPosition = Integer.MAX_VALUE; + int endPosition = Integer.MIN_VALUE; + while (tokenResults.size() > 0) { + TokenBody t = tokenResults.poll(); + if (t.startPosition == t.endPosition) { + // 单个 term + singleTerm.put(t.position, t); + startPosition = Math.min(startPosition, t.startPosition); + endPosition = Math.max(endPosition, t.endPosition); + } else { + // 组合出来的term,不参与歧义判断,仅仅用于歧义判断后的填补那些空白的 position + combineTerms.add(t); + } + } + + // 处理分词,没有歧义的直接放到结果中,有歧义的处理完之后放到结果中 + PriorityQueue searchReverseOrder = new PriorityQueue(new Comparator(){ + @Override + public int compare(TokenBody o1, TokenBody o2){ + // 顺序有重要意义 + return o1.startPosition != o2.startPosition ? + Integer.compare(o2.startPosition, o1.startPosition) + : Integer.compare(o1.endPosition, o2.endPosition); + } + }); + + // 在处理一段歧义时,控制前后范围, 第一次就是最开始的范围 + int maxExtend = Integer.MIN_VALUE; // 边界包含 + for (TokenBody tb : combineTerms) { + if (searchReverseOrder.size() == 0) { + searchReverseOrder.add(tb); + maxExtend = tb.endPosition; + continue; + } + + if (maxExtend < tb.startPosition) { + // 表示当前term 与之前的切分没有歧义 + if (searchReverseOrder.size() == 1) { + final TokenBody body = searchReverseOrder.poll(); + singleTerm.put(body.startPosition, body); + } else { + // 这里先处理掉之前有歧义的部分, + final List arbitrator = arbitrator(searchReverseOrder); + for(TokenBody body : arbitrator) { + singleTerm.put(body.startPosition, body); + } + } + } + searchReverseOrder.add(tb); + maxExtend = Math.max(maxExtend, tb.endPosition); + } + // 处理最后的歧义 + if (searchReverseOrder.size() == 1) { + final TokenBody body = searchReverseOrder.poll(); + singleTerm.put(body.startPosition, body); + } else if(searchReverseOrder.size() > 1){ + final List arbitrator = arbitrator(searchReverseOrder); + for(TokenBody body : arbitrator) { + singleTerm.put(body.startPosition, body); + } + } + // endPosition 的用途 + while (startPosition <= endPosition) { + if (singleTerm.containsKey(startPosition)) { + final TokenBody body = singleTerm.get(startPosition); + tokenResults.add(body); + startPosition = body.endPosition + 1; + } else { + startPosition++; + } + } + } + } + + /** + * 处理有歧义的token, + * @param searchReverseOrder 为倒序的token + * @return + */ + private List arbitrator(PriorityQueue searchReverseOrder) { + Map> positionMap = new HashMap<>(); + int maxIndex = -1; + int minIndex = -1; + while (searchReverseOrder.size() > 0) { + final TokenBody body = searchReverseOrder.poll(); + if (searchReverseOrder.size() == 0) { + // 要处理的最开始的位置,也就是 searchReverseOrder 的最后一个 + minIndex = body.startPosition; + } + if (maxIndex == -1) { + // 要处理的最后的位置,也就是 searchReverseOrder 的第一个 + maxIndex = body.startPosition; + } + // 下面给当前的 token 添加 child + int currentMax = maxIndex; + for (int i = body.endPosition + 1; i <= currentMax; i++) { + if (positionMap.containsKey(i)) { + final List bodies = positionMap.get(i); + final TokenBody minLengthBody = bodies.get(0); // 表示取其后紧挨着的最短token作为结束位置 + if (currentMax == maxIndex) { + currentMax = minLengthBody.endPosition; // 表示 minLengthBody 后面的 term 不可以作为 child了 + } + if (body.child == null) { + body.child = new ArrayList<>(); + } + body.child.addAll(positionMap.get(i)); + } + } + // 将 token放到结果中 + if (positionMap.containsKey(body.startPosition) == false) { + positionMap.put(body.startPosition, new ArrayList<>()); + } + 
positionMap.get(body.startPosition).add(body); + +// if (IS_DEBUG) { +// for(int i = 0; i < maxIndex + 10; i++) { +// String s = "- "; +// if (body.startPosition <= i && i <= body.endPosition) { +// s = "# "; +// } +// System.out.print(s); +// } +// System.out.println(); +// } + } + List topOptions = new ArrayList<>(); + + final TokenBody firstMinLength = positionMap.get(minIndex).get(0); + for(int i = firstMinLength.startPosition; i <= firstMinLength.endPosition; i++) { + if (positionMap.containsKey(i)) { + topOptions.addAll(positionMap.get(i)); + } + } + for (TokenBody t : topOptions) { + System.out.println(t); + } + List result = new ArrayList<>(); + final OptionPath bestPath = chooseBestPath(topOptions); + for (int i = 0; i < bestPath.size ; i++) { + int startP = bestPath.getValueByIndex(2 * i); + int endP = bestPath.getValueByIndex(2 * i + 1); + final List bodyList = positionMap.get(startP); + for(TokenBody tb : bodyList) { + if (tb.startPosition == startP && tb.endPosition == endP) { + result.add(tb); + break; + } + } + } + return result; + } + + // options 本身为已经处理好的结构,使用引用指向下级关系 + private OptionPath chooseBestPath(List options) { + // 使用 PriorityQueue,因为只是需要获取最小的那一个,其后的严格有序不是必须的 + PriorityQueue allOptionPath = new PriorityQueue(new Comparator () { + @Override + public int compare(OptionPath o1, OptionPath o2) { + return o2.compareTo(o1); + } + }); + + for(TokenBody tokenBody : options) { + OptionPath path = new OptionPath(); + path.addElement(tokenBody.startPosition, tokenBody.endPosition); + findNextPath(allOptionPath, tokenBody, path); + } + final OptionPath bestPath = allOptionPath.poll(); + return bestPath; + } + + private void findNextPath(PriorityQueue allOptionPath, TokenBody tokenBody, OptionPath parentPath) { + if (tokenBody.child == null) { + // 路径的最后,结束递归 + allOptionPath.add(parentPath); + return; + } + for(TokenBody child : tokenBody.child) { + // 复制parent path + OptionPath childPath = parentPath.copy(); + childPath.addElement(child.startPosition, child.endPosition); + findNextPath(allOptionPath, child, childPath); + } + } + + /** + * 以 begin 开始,但是不包含 begin + * @param tokenBodies + * @param begin + * @param ignoreBlank + * @return + */ + private List getCurrentEndList(List tokenBodies, int begin, boolean ignoreBlank) { + List endList = new ArrayList<>(); + DictSegment dict = Dictionary.getSingleton().get_MainDict(); + StringBuffer sb = new StringBuffer(tokenBodies.get(begin).termBuffer); + for (int j = begin+1; j < tokenBodies.size(); j++) { + TokenBody current = tokenBodies.get(j); + if (current.type.equals(CharacterUtil.CHAR_BLANK)) { + if(ignoreBlank) { + continue; + } else { + break; + } + } + // 处理 中文情况 + sb.append(current.termBuffer); + Hit hit = dict.match(sb.toString().toCharArray()); + if (hit.isUnmatch()) { + break; + } + if (hit.isMatch()) { + endList.add(j); + } + } +// System.out.println(endList); + return endList; + } + + /** + * 拼接 [begin, end] termBuffer + * @param tokenBodies + * @param begin + * @param end + * @return + */ + private String combineTermBuffer(List tokenBodies, int begin, int end) { + StringBuffer sb = new StringBuffer(tokenBodies.get(begin).termBuffer); + for(int i = begin+1; i <= end; i++) { + sb.append(tokenBodies.get(i).termBuffer); + } + return sb.toString(); + } + +} diff --git a/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java b/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java new file mode 100644 index 00000000..e0e28f13 --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java @@ -0,0 
+1,131 @@ +package org.wltea.analyzer.fcp; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.ngram.NGramTokenizer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.synonym.SynonymMap; + +import java.util.Arrays; +import java.util.List; + +public final class FCPAnalyzer extends Analyzer { + /** Default maximum allowed token length */ + public static final boolean DEFAULT_SPLIT_COMPLETE = false; + + // 决定分词时对 英文、数字 是否进行完全切分,默认为 false,表示数字和英文为一个整体,不会继续向下切分,完全切分的话 splitComplete = true + private boolean splitComplete = false; + // 默认为建立 索引模式, 如果为 查询模式 indexMode = false + private final boolean indexMode; + // 特殊字符的映射,默认为 true 表示模糊匹配特殊字符。如果设置为 false ,将会把原始的char放到最终分词结果中。 + private boolean uselessMapping = true; + // 默认文本是正确文本,其中的空白是有意义的,不能忽略空白。如果认为原文中的空白由于ETL错误引入,应该忽略空白。 + private boolean ignoreBlank = false; + // 是否使用 first char position ,默认使用,如果为 false,则变为 lcp_analyzer + private boolean useFirstPos = true; + // 是否显示 offset,默认随着 indexMode 变化 + private boolean showOffset; + + private int maxTokenLength = CombineCharFilter.DEFAULT_MAX_WORD_LEN; + + public FCPAnalyzer() { + this(ExtendFilter.DEFAULT_INDEX_MODE); + } + public FCPAnalyzer(boolean indexMode) { + this.indexMode = indexMode; + // 改变 showOffset 的默认值 + if (indexMode) { + showOffset = false; + } else { + showOffset = true; + } + } + + public FCPAnalyzer setIgnoreBlank(boolean ignoreBlank) { + this.ignoreBlank = ignoreBlank; + return this; + } + + public FCPAnalyzer setUselessMapping(boolean uselessMapping) { + this.uselessMapping = uselessMapping; + return this; + } + + public FCPAnalyzer setSplitComplete(boolean splitComplete) { + this.splitComplete = splitComplete; + return this; + } + + public FCPAnalyzer setShowOffset(boolean showOffset) { + this.showOffset = showOffset; + return this; + } + + public FCPAnalyzer setUseFirstPos(boolean useFirstPos) { + this.useFirstPos = useFirstPos; + return this; + } + + /** + * Set the max allowed token length. Tokens larger than this will be chopped + * up at this token length and emitted as multiple tokens. If you need to + * skip such large tokens, you could increase this max length, and then + * use {@code LengthFilter} to remove long tokens. The default is + * {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}. 
+ */ + public FCPAnalyzer setMaxTokenLength(int length) { + maxTokenLength = length; + return this; + } + + /** Returns the current maximum token length + * + * @see #setMaxTokenLength */ + public int getMaxTokenLength() { + return maxTokenLength; + } + + public boolean isIgnoreBlank() { + return ignoreBlank; + } + + + public boolean isIndexMode() { + return indexMode; + } + + public boolean isUseFirstPos() { + return useFirstPos; + } + + @Override + protected TokenStreamComponents createComponents(final String fieldName) { + final Tokenizer src = new NGramTokenizer(1, 1); + TokenStream tok = new FormatFilter(src); + if (!splitComplete) { + tok = new CombineCharFilter(tok, maxTokenLength); + } + + tok = new ExtendFilter(tok, indexMode) + .setShowOffset(showOffset) + .setIgnoreBlank(ignoreBlank) + .setUseFirstPos(useFirstPos) + .setUselessMapping(uselessMapping); + return new TokenStreamComponents(src, tok); + } + + @Override + public String toString() { + return "FCPAnalyzer{" + + "splitComplete=" + splitComplete + + ", indexMode=" + indexMode + + ", showOffset=" + showOffset + + ", uselessMapping=" + uselessMapping + + ", ignoreBlank=" + ignoreBlank + + ", useFirstPos=" + useFirstPos + + ", maxTokenLength=" + maxTokenLength + + '}'; + } + +} diff --git a/core/src/main/java/org/wltea/analyzer/fcp/FormatFilter.java b/core/src/main/java/org/wltea/analyzer/fcp/FormatFilter.java new file mode 100644 index 00000000..e85d6cb1 --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/FormatFilter.java @@ -0,0 +1,51 @@ +package org.wltea.analyzer.fcp; + +import org.apache.lucene.analysis.CharacterUtils; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.wltea.analyzer.fcp.util.CharacterUtil; + +import java.io.IOException; + +/** + * 英文转小写 + * 字符的类型处理 + */ +public class FormatFilter extends TokenFilter { + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + + /** + * Construct a token stream filtering the given input. 
+ * + * @param input + */ + public FormatFilter(TokenStream input) { + super(input); + } + + @Override + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String s = termAtt.toString(); + // 如果从 ngram 1 的 Tokenizer 得到的 token 应该length 都为 1 + if (s.length() == 1) { + int c = s.codePointAt(0); + typeAtt.setType(CharacterUtil.identifyCharType(c)); + c = CharacterUtil.regularize(c); + char[] chars = Character.toChars(c); + termAtt.copyBuffer(chars, 0, chars.length); + } else { + // 对英文进行 lower case + CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length()); + } + return true; + } else { + return false; + } + } + +} diff --git a/core/src/main/java/org/wltea/analyzer/fcp/OptionPath.java b/core/src/main/java/org/wltea/analyzer/fcp/OptionPath.java new file mode 100644 index 00000000..659469de --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/OptionPath.java @@ -0,0 +1,102 @@ +package org.wltea.analyzer.fcp; + +import java.util.Arrays; + +/** + * present a no conflict path for choose + */ +public class OptionPath implements Comparable { + private static final int DEFAULT_CAPACITY = 10; + int[] groups; + int size = 0; + int payloadLength = 0; + + OptionPath() { + groups = new int[DEFAULT_CAPACITY]; + } + + OptionPath(int capacity) { + assert capacity > 0; + groups = new int[capacity]; + } + + private OptionPath(int size, int[] groups) { + this.size = size; + int newCapacity = Math.max(size * 2, groups.length); + this.groups = Arrays.copyOf(groups, newCapacity); + } + + OptionPath copy() { + return new OptionPath(this.size, this.groups); + } + + void addElement(int startPosition, int endPosition) { + assert endPosition > startPosition; + this.size++; + if (this.size*2 >= this.groups.length) { + this.groups = Arrays.copyOf(this.groups, this.groups.length * 2); + } + this.payloadLength += (endPosition - startPosition + 1); + this.groups[size*2 - 2] = startPosition; + this.groups[size*2 - 1] = endPosition; + } + + int getValueByIndex(int index) { + assert -1 < index && index < this.groups.length; + return this.groups[index]; + } + + int getEndPosition(int startPosition) { + int endPosition = -1; + for(int i = 0; i < size && this.groups[2*i] <= startPosition; i++) { + if (startPosition == this.groups[2*i]) { + endPosition = this.groups[2*i + 1]; + } + } + return endPosition; + } + + int getPathLength() { + return this.groups[this.size*2+1] - this.groups[0]; + } + + int getPathEnd() { + return this.groups[size*2+1]; + } + + int getXWeight() { + int product = 1; + for(int i = 0; i < size; i++) { + product *= (this.groups[2*i+1] - this.groups[2*i]); + } + return product; + } + + int getPWeight() { + int pWeight = 0; + int p = 0; + for(int i = 0; i < size; i++) { + p++; + pWeight += p * (this.groups[2*i+1] - this.groups[2*i]); + } + return pWeight; + } + + // ik_smart 解决歧义问题的实现逻辑 + @Override + public int compareTo(OptionPath o) { + if (this.payloadLength != o.payloadLength) { + return Integer.compare(this.payloadLength, o.payloadLength); + } else if (this.size != o.size) { + return Integer.compare(this.size, o.size); + } else if (this.getPathLength() != o.getPathLength()) { + return Integer.compare(this.getPathLength(), o.getPathLength()); + } else if(this.getPathEnd() != o.getPathEnd()) { + return Integer.compare(this.getPathEnd(), o.getPathEnd()); + } else if (this.getXWeight() != o.getXWeight()) { + return Integer.compare(this.getXWeight(), o.getXWeight()); + } else { + return Integer.compare(this.getPWeight(), o.getPWeight()); + } + } +} diff 
--git a/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java b/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java new file mode 100644 index 00000000..587ca1d9 --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java @@ -0,0 +1,47 @@ +package org.wltea.analyzer.fcp; + +import java.util.List; + +/** + * compose term + */ +class TokenBody { + String termBuffer; + int startOffset, endOffset; + // position 用于表示在 elasticsearch 分词时得到的 position, 通过 curr.position - prev.position 得到 positionIncrement + // startPosition、endPosition 用于收集 那些在 词库中 扩展出来的 token,主要给 ik_smart 使用 + int position, startPosition = -1, endPosition = -1; + String type; + + List child; + + TokenBody(){} + TokenBody(String termBuffer, int startOffset, int endOffset, int position, int startPosition, int endPosition, String type){ + this.termBuffer = termBuffer; + this.startOffset = startOffset; + this.endOffset = endOffset; + this.position = position; + this.startPosition = startPosition; + this.endPosition = endPosition; + this.type = type; + } + + + TokenBody copy() { + return new TokenBody(termBuffer, startOffset, endOffset, position, startPosition, endPosition, ""); + } + + @Override + public String toString() { + return "TokenBody{" + + "termBuffer='" + termBuffer + '\'' + + ", startOffset=" + startOffset + + ", endOffset=" + endOffset + + ", position=" + position + + ", startPosition=" + startPosition + + ", endPosition=" + endPosition + + ", type='" + type + '\'' + + ", child=" + child + + '}'; + } +} diff --git a/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java new file mode 100644 index 00000000..ba9ae2ff --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java @@ -0,0 +1,33 @@ +package org.wltea.analyzer.fcp.tokenattributes; + +import org.apache.lucene.util.Attribute; + +/** Determines how many positions this + * token spans. Very few analyzer components actually + * produce this attribute, and indexing ignores it, but + * it's useful to express the graph structure naturally + * produced by decompounding, word splitting/joining, + * synonym filtering, etc. + * + *
NOTE: this is optional, and most analyzers + * don't change the default value (1). */ + +public interface PositionLengthAttribute extends Attribute { + /** + * Set the position length of this Token. + *
+ * The default value is one. + * @param positionLength how many positions this token + * spans. + * @throws IllegalArgumentException if positionLength + * is zero or negative. + * @see #getPositionLength() + */ + public void setPositionLength(int positionLength); + + /** Returns the position length of this Token. + * @see #setPositionLength + */ + public int getPositionLength(); +} + diff --git a/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java new file mode 100644 index 00000000..5aa230c0 --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java @@ -0,0 +1,62 @@ +package org.wltea.analyzer.fcp.tokenattributes; + + +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.AttributeReflector; + +/** Default implementation of {@link PositionLengthAttribute}. */ +public class PositionLengthAttributeImpl extends AttributeImpl implements PositionLengthAttribute, Cloneable { + private int positionLength = 1; + + /** Initializes this attribute with position length of 1. */ + public PositionLengthAttributeImpl() {} + + @Override + public void setPositionLength(int positionLength) { + if (positionLength < 1) { + throw new IllegalArgumentException("Position length must be 1 or greater; got " + positionLength); + } + this.positionLength = positionLength; + } + + @Override + public int getPositionLength() { + return positionLength; + } + + @Override + public void clear() { + this.positionLength = 1; + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof PositionLengthAttributeImpl) { + PositionLengthAttributeImpl _other = (PositionLengthAttributeImpl) other; + return positionLength == _other.positionLength; + } + + return false; + } + + @Override + public int hashCode() { + return positionLength; + } + + @Override + public void copyTo(AttributeImpl target) { + PositionLengthAttribute t = (PositionLengthAttribute) target; + t.setPositionLength(positionLength); + } + + @Override + public void reflectWith(AttributeReflector reflector) { + reflector.reflect(PositionLengthAttribute.class, "positionLength", positionLength); + } +} + diff --git a/core/src/main/java/org/wltea/analyzer/fcp/util/CharacterUtil.java b/core/src/main/java/org/wltea/analyzer/fcp/util/CharacterUtil.java new file mode 100644 index 00000000..0f618963 --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/util/CharacterUtil.java @@ -0,0 +1,124 @@ +/** + * IK 中文分词 版本 5.0 + * IK Analyzer release 5.0 + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * 源代码由林良益(linliangyi2005@gmail.com)提供 + * 版权声明 2012,乌龙茶工作室 + * provided by Linliangyi and copyright 2012 by Oolong studio + * + * 字符集识别工具类 + */ +package org.wltea.analyzer.fcp.util; + +import java.util.HashMap; +import java.util.Map; + +/** + * 字符集识别工具类 + */ +public class CharacterUtil { + + public static final String CHAR_USELESS = ""; + + public static final String CHAR_ENGLISH = ""; + + public static final String CHAR_NUMBER = ""; + + public static final String CHAR_NUMBER_DOT = ""; + + public static final String ALPHANUM = ""; + + public static final String CHAR_CHINESE = ""; + + public static final String COMBINE_WORD = ""; + + public static final String CHAR_MAPPING = ""; + + public static final String CHAR_BLANK = ""; + + // pinyin + public static final String CHAR_PINYIN = ""; + // pinyin 前缀 + public static final String CHAR_PINYIN_PRE = ""; + + private static Map order; + static { + // value 越小,排序越靠前,用于区分在同一个 position 上的不同 type 之间的排序 + order = new HashMap<>(); + order.put(CHAR_CHINESE, 0); + order.put(CHAR_PINYIN_PRE, 5); + order.put(CHAR_PINYIN, 10); + + order.put(CHAR_USELESS, 0); + order.put(CHAR_MAPPING, 10); + } + + public static int getOrderByType(String type) { + return order.getOrDefault(type, 0); + } + + + + /** + * 识别字符类型 + * @param input + * @return int CharacterUtil定义的字符类型常量 + */ + public static String identifyCharType(int input){ + + if (input >= '0' && input <= '9') { + return CHAR_NUMBER; + } else if ((input >= 'a' && input <= 'z') + || (input >= 'A' && input <= 'Z')) { + return CHAR_ENGLISH; + } else { + Character.UnicodeBlock ub = Character.UnicodeBlock.of(input); + + if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS + || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS + || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A){ + //目前已知的中文字符UTF-8集合 + return CHAR_CHINESE; + + } + } + //其他的不做处理的字符 + return CHAR_USELESS; + + } + + /** + * 进行字符规格化(全角转半角,大写转小写处理) + * @param input + * @return char + */ + public static int regularize(int input){ + if (input == 12288) { + input = 32; + + }else if (input > 65280 && input < 65375) { + input = input - 65248; + + }else if (input >= 'A' && input <= 'Z') { + input += 32; + } + + + return input; + } +} diff --git a/elasticsearch/src/main/java/com/infinilabs/ik/elasticsearch/AnalysisIkPlugin.java b/elasticsearch/src/main/java/com/infinilabs/ik/elasticsearch/AnalysisIkPlugin.java index 54ee735e..f906af6d 100644 --- a/elasticsearch/src/main/java/com/infinilabs/ik/elasticsearch/AnalysisIkPlugin.java +++ b/elasticsearch/src/main/java/com/infinilabs/ik/elasticsearch/AnalysisIkPlugin.java @@ -33,6 +33,12 @@ public Map { + private final FCPAnalyzer analyzer; + + /** + * indexMode 作为重要的参数, + * @param indexSettings + * @param env + * @param name + * @param settings + * @param indexMode + */ + public FCPAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings, boolean indexMode) { + super(name, settings); + boolean splitComplete = settings.getAsBoolean("split_complete", FCPAnalyzer.DEFAULT_SPLIT_COMPLETE); + int maxTokenLength = settings.getAsInt("max_token_length", CombineCharFilter.DEFAULT_MAX_WORD_LEN); + boolean uselessMapping = settings.getAsBoolean("useless_mapping", ExtendFilter.DEFAULT_USELESS_MAPPING); + boolean ignoreBlank = settings.getAsBoolean("ignore_blank", ExtendFilter.DEFAULT_IGNORE_BLANK); + boolean useFirstPos = settings.getAsBoolean("use_first_position", ExtendFilter.DEFAULT_USE_FIRST_POSITION); + Boolean showOffset = 
settings.getAsBoolean("show_offset", null); + analyzer = new FCPAnalyzer(indexMode); + if (showOffset != null) { + analyzer.setShowOffset(showOffset); + } + analyzer.setSplitComplete(splitComplete); + analyzer.setUselessMapping(uselessMapping); + analyzer.setMaxTokenLength(maxTokenLength); + analyzer.setIgnoreBlank(ignoreBlank); + analyzer.setUseFirstPos(useFirstPos); + } + + public static FCPAnalyzerProvider getFCPAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { + boolean indexMode = settings.getAsBoolean("index_mode", ExtendFilter.DEFAULT_INDEX_MODE); + return new FCPAnalyzerProvider(indexSettings, env, name, settings, indexMode); + } + + public static FCPAnalyzerProvider getFCPIndexAnalyzer(IndexSettings indexSettings, Environment env, String name, Settings settings) { + boolean indexMode = true; + boolean useFirstPos = true; + FCPAnalyzerProvider provider = new FCPAnalyzerProvider(indexSettings, env, name, settings, indexMode); + FCPAnalyzer fcpAnalyzer = provider.get(); + fcpAnalyzer.setUseFirstPos(useFirstPos); + return provider; + } + + public static FCPAnalyzerProvider getFCPSearchAnalyzer(IndexSettings indexSettings, Environment env, String name, Settings settings) { + boolean indexMode = false; + boolean useFirstPos = true; + FCPAnalyzerProvider provider = new FCPAnalyzerProvider(indexSettings, env, name, settings, indexMode); + FCPAnalyzer fcpAnalyzer = provider.get(); + fcpAnalyzer.setUseFirstPos(useFirstPos); + return provider; + } + + public static FCPAnalyzerProvider getLCPIndexAnalyzer(IndexSettings indexSettings, Environment env, String name, Settings settings) { + boolean indexMode = true; + FCPAnalyzerProvider provider = new FCPAnalyzerProvider(indexSettings, env, name, settings, indexMode); + FCPAnalyzer fcpAnalyzer = provider.get(); + fcpAnalyzer.setUseFirstPos(false); + return provider; + } + + public static FCPAnalyzerProvider getLCPSearchAnalyzer(IndexSettings indexSettings, Environment env, String name, Settings settings) { + boolean indexMode = false; + FCPAnalyzerProvider provider = new FCPAnalyzerProvider(indexSettings, env, name, settings, indexMode); + FCPAnalyzer fcpAnalyzer = provider.get(); + fcpAnalyzer.setUseFirstPos(false); + return provider; + } + + @Override + public FCPAnalyzer get() { + return analyzer; + } +} \ No newline at end of file From 5e5445f29761de91c3d7a20e1ebc70b2c9daaf56 Mon Sep 17 00:00:00 2001 From: "hao.mou" Date: Sun, 28 Apr 2024 10:06:27 +0800 Subject: [PATCH 2/2] add new analyzer , simple implementation --- .../org/wltea/analyzer/fcp/ExtendFilter.java | 271 +++--------------- .../org/wltea/analyzer/fcp/FCPAnalyzer.java | 5 +- .../org/wltea/analyzer/fcp/TokenBody.java | 5 +- .../PositionLengthAttribute.java | 1 + .../PositionLengthAttributeImpl.java | 1 + .../analyzer/fcp/Configuration4Test.java | 29 ++ .../wltea/analyzer/fcp/FCPAnalyzerTest.java | 80 ++++++ 7 files changed, 148 insertions(+), 244 deletions(-) create mode 100644 core/src/test/java/org/wltea/analyzer/fcp/Configuration4Test.java create mode 100644 core/src/test/java/org/wltea/analyzer/fcp/FCPAnalyzerTest.java diff --git a/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java b/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java index 211c09b0..be2451a1 100644 --- a/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java +++ b/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java @@ -9,30 +9,24 @@ import org.wltea.analyzer.dic.DictSegment; import org.wltea.analyzer.dic.Dictionary; import 
org.wltea.analyzer.dic.Hit; -import org.wltea.analyzer.fcp.tokenattributes.PositionLengthAttribute; import org.wltea.analyzer.fcp.util.CharacterUtil; import java.io.IOException; import java.util.ArrayList; import java.util.Comparator; -import java.util.HashMap; import java.util.List; -import java.util.Map; import java.util.PriorityQueue; /** * use dict to extend terms */ public class ExtendFilter extends TokenFilter { - private static final boolean IS_DEBUG = true; // 默认入库模式 public static final boolean DEFAULT_INDEX_MODE = true; // 默认对于特殊字符采用模糊搜索,扩大搜索范围 public static final boolean DEFAULT_USELESS_MAPPING = true; // 默认对于句子的空白进行忽略 public static final boolean DEFAULT_IGNORE_BLANK = true; - // 默认对于句子的空白进行忽略 - public static final boolean DEFAULT_IGNORE_WHITESPACE = true; // 默认使用 lcp 的模式,使用最后一个char的position public static final boolean DEFAULT_USE_FIRST_POSITION = false; // 在高亮的时候使用 offset @@ -42,8 +36,6 @@ public class ExtendFilter extends TokenFilter { private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); - // 用于记录每一个term 的position length - private final PositionLengthAttribute lengthAttribute = addAttribute(PositionLengthAttribute.class); // used for saving upstream tokens , implemented by Arraylist private List tokenBodies = null; @@ -123,7 +115,7 @@ public final boolean incrementToken() throws IOException { tb.endOffset = showOffset ? offsetAtt.endOffset() : 0; // blank 类型会被舍弃,position不变 tb.termBuffer = termAtt.toString(); - // 下面是处理 position 和 type的赋值,单个 term,没有 startPosition 和 endPosition + // 下面是处理 position 和 type的赋值 if (CharacterUtil.CHAR_USELESS.equals(typeAtt.type())) { if (isAllBlank(tb.termBuffer) && this.ignoreBlank) { // 表示沿用上一个 position,下面将会被舍弃掉 @@ -135,7 +127,7 @@ public final boolean incrementToken() throws IOException { tb.position = position; tb.type = typeAtt.type(); if (uselessMapping) { - tb.termBuffer = "#"; // 无特殊含义,将特殊字符统一映射为 # 方便查询 + tb.termBuffer = "#"; // 无特殊含义,将特殊字符统一映射为 # 方便查询, 否则特殊字符也是需要精准匹配 } } } else { @@ -157,10 +149,6 @@ public final boolean incrementToken() throws IOException { termAtt.copyBuffer(chars, 0, chars.length); offsetAtt.setOffset(body.startOffset, body.endOffset); typeAtt.setType(body.type); - if (!indexMode) { - // 计算当前combine term 的跨度,占用了多少个 term - lengthAttribute.setPositionLength(body.endPosition - body.startPosition + 1); - } return true; } else { tokenBodies = null; @@ -171,7 +159,7 @@ public final boolean incrementToken() throws IOException { /** - * 判断参数是否全部由空白字符组成 + * 判断参数是否全部由空白字符(空格、制表符、换行……)组成 * @param s * @return */ @@ -188,237 +176,44 @@ private void extendTerms(List tokenBodies, boolean indexMode, boolean if (!tokenBody.type.equals(CharacterUtil.CHAR_BLANK)) { // 处理当前char, 但要考虑向后扩展,得到以当前位置开始 以 endList 中位置结束的一系列term, List endList = getCurrentEndList(tokenBodies, beginI, ignoreBlank); - // 默认在 index 模式下,一股脑全部放到倒排中(index 模式对性能敏感,所以必须保证) - if (!indexMode) { - tokenBody.startPosition = tokenBody.position; - tokenBody.endPosition = tokenBody.position; - } - tokenResults.add(tokenBody); - for (Integer endI : endList) { - TokenBody tb= new TokenBody(); - tb.termBuffer = combineTermBuffer(tokenBodies, beginI, endI); - tb.startOffset = tokenBodies.get(beginI).startOffset; - tb.endOffset = tokenBodies.get(endI).endOffset; - // search 模式下需要记录组合 term 前后的 position - if (!indexMode) { - tb.startPosition = tokenBodies.get(beginI).position; - 
tb.endPosition = tokenBodies.get(endI).position; - } - if (useFirstPos) { - tb.position = tokenBodies.get(beginI).position; - } else { - tb.position = tokenBodies.get(endI).position; + if (indexMode) { + tokenResults.add(tokenBody); + for (Integer endI : endList) { + TokenBody tb= new TokenBody(); + tb.termBuffer = combineTermBuffer(tokenBodies, beginI, endI); + tb.startOffset = tokenBodies.get(beginI).startOffset; + tb.endOffset = tokenBodies.get(endI).endOffset; + if (useFirstPos) { + tb.position = tokenBodies.get(beginI).position; + } else { + tb.position = tokenBodies.get(endI).position; + } + tb.type = CharacterUtil.COMBINE_WORD; + tokenResults.add(tb); } - tb.type = ""; - tokenResults.add(tb); - } - } - } - // 到这里如果是index 模式的话,已经可以结束了; - // 如果是 search模式,需要做歧义处理(如果有的话, 使用 类型的char 作为天然分割句子) - if (!indexMode && tokenResults.size() > 0) { - // 在search 模式下,采用 ik_smart 的逻辑进行语义分割,一个重大的意义:引入了语义分割 - // 1,ik 使用没有语义重叠的那个 char 作为分割点,只作用于有字符重叠的部分 - // 2,由于和 index 模式使用相同的 向后扩展逻辑,所以search是index 的子集 - // 3,search 模式下,不会涉及mapping的扩展引入 - // 4,search 模式下,使用 startPosition 来进行判断扩是否有歧义 - - // 用于保存多个term的组合形式,逆序。:采用动态编程思想,完成快速组合 - PriorityQueue combineTerms = new PriorityQueue(new Comparator(){ - @Override - public int compare(TokenBody o1, TokenBody o2){ - // 顺序有重要意义 - return o1.startPosition != o2.startPosition ? - Integer.compare(o1.startPosition, o2.startPosition) - : Integer.compare(o2.endPosition, o1.endPosition); - } - }); - // 用于保存单个term的形式(最后将保存全部的结果) - Map singleTerm = new HashMap<>(); - - // 将切分结果重新排序, 并清空之前的处理结果 - int startPosition = Integer.MAX_VALUE; - int endPosition = Integer.MIN_VALUE; - while (tokenResults.size() > 0) { - TokenBody t = tokenResults.poll(); - if (t.startPosition == t.endPosition) { - // 单个 term - singleTerm.put(t.position, t); - startPosition = Math.min(startPosition, t.startPosition); - endPosition = Math.max(endPosition, t.endPosition); } else { - // 组合出来的term,不参与歧义判断,仅仅用于歧义判断后的填补那些空白的 position - combineTerms.add(t); - } - } - - // 处理分词,没有歧义的直接放到结果中,有歧义的处理完之后放到结果中 - PriorityQueue searchReverseOrder = new PriorityQueue(new Comparator(){ - @Override - public int compare(TokenBody o1, TokenBody o2){ - // 顺序有重要意义 - return o1.startPosition != o2.startPosition ? 
- Integer.compare(o2.startPosition, o1.startPosition) - : Integer.compare(o1.endPosition, o2.endPosition); - } - }); - - // 在处理一段歧义时,控制前后范围, 第一次就是最开始的范围 - int maxExtend = Integer.MIN_VALUE; // 边界包含 - for (TokenBody tb : combineTerms) { - if (searchReverseOrder.size() == 0) { - searchReverseOrder.add(tb); - maxExtend = tb.endPosition; - continue; - } - - if (maxExtend < tb.startPosition) { - // 表示当前term 与之前的切分没有歧义 - if (searchReverseOrder.size() == 1) { - final TokenBody body = searchReverseOrder.poll(); - singleTerm.put(body.startPosition, body); + // 处理search analyzer 结果,贪婪向后匹配 + // 1,只有单字,加入单字 + // 2,有后缀匹配,采用最长的token结果(目的是找到个数最少的组合,非最优,但比较简单) + if (endList.isEmpty()) { + tokenResults.add(tokenBody); // 单字 } else { - // 这里先处理掉之前有歧义的部分, - final List arbitrator = arbitrator(searchReverseOrder); - for(TokenBody body : arbitrator) { - singleTerm.put(body.startPosition, body); + int lastEnd = endList.get(endList.size()-1); // 取最长token + tokenBody.termBuffer = combineTermBuffer(tokenBodies, beginI, lastEnd); + tokenBody.startOffset = tokenBodies.get(beginI).startOffset; + tokenBody.endOffset = tokenBodies.get(lastEnd).endOffset; + if (useFirstPos) { + tokenBody.position = tokenBodies.get(beginI).position; + } else { + tokenBody.position = tokenBodies.get(lastEnd).position; } - } - } - searchReverseOrder.add(tb); - maxExtend = Math.max(maxExtend, tb.endPosition); - } - // 处理最后的歧义 - if (searchReverseOrder.size() == 1) { - final TokenBody body = searchReverseOrder.poll(); - singleTerm.put(body.startPosition, body); - } else if(searchReverseOrder.size() > 1){ - final List arbitrator = arbitrator(searchReverseOrder); - for(TokenBody body : arbitrator) { - singleTerm.put(body.startPosition, body); - } - } - // endPosition 的用途 - while (startPosition <= endPosition) { - if (singleTerm.containsKey(startPosition)) { - final TokenBody body = singleTerm.get(startPosition); - tokenResults.add(body); - startPosition = body.endPosition + 1; - } else { - startPosition++; - } - } - } - } + tokenBody.type = CharacterUtil.COMBINE_WORD; + tokenResults.add(tokenBody); - /** - * 处理有歧义的token, - * @param searchReverseOrder 为倒序的token - * @return - */ - private List arbitrator(PriorityQueue searchReverseOrder) { - Map> positionMap = new HashMap<>(); - int maxIndex = -1; - int minIndex = -1; - while (searchReverseOrder.size() > 0) { - final TokenBody body = searchReverseOrder.poll(); - if (searchReverseOrder.size() == 0) { - // 要处理的最开始的位置,也就是 searchReverseOrder 的最后一个 - minIndex = body.startPosition; - } - if (maxIndex == -1) { - // 要处理的最后的位置,也就是 searchReverseOrder 的第一个 - maxIndex = body.startPosition; - } - // 下面给当前的 token 添加 child - int currentMax = maxIndex; - for (int i = body.endPosition + 1; i <= currentMax; i++) { - if (positionMap.containsKey(i)) { - final List bodies = positionMap.get(i); - final TokenBody minLengthBody = bodies.get(0); // 表示取其后紧挨着的最短token作为结束位置 - if (currentMax == maxIndex) { - currentMax = minLengthBody.endPosition; // 表示 minLengthBody 后面的 term 不可以作为 child了 - } - if (body.child == null) { - body.child = new ArrayList<>(); + beginI = lastEnd; } - body.child.addAll(positionMap.get(i)); } } - // 将 token放到结果中 - if (positionMap.containsKey(body.startPosition) == false) { - positionMap.put(body.startPosition, new ArrayList<>()); - } - positionMap.get(body.startPosition).add(body); - -// if (IS_DEBUG) { -// for(int i = 0; i < maxIndex + 10; i++) { -// String s = "- "; -// if (body.startPosition <= i && i <= body.endPosition) { -// s = "# "; -// } -// System.out.print(s); -// } -// 
diff --git a/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java b/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java
index e0e28f13..ec381546 100644
--- a/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java
+++ b/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java
@@ -5,10 +5,7 @@
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenizer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.synonym.SynonymMap;
-
-import java.util.Arrays;
-import java.util.List;

 public final class FCPAnalyzer extends Analyzer {
     /** Default maximum allowed token length */
@@ -21,7 +18,7 @@ public final class FCPAnalyzer extends Analyzer {
     // special-character mapping; defaults to true (special chars are fuzzy-matched); if false, the original char is kept in the final output
     private boolean uselessMapping = true;
     // by default the text is assumed correct and its whitespace meaningful, so blanks are not ignored; if blanks come from ETL mistakes, they should be ignored
-    private boolean ignoreBlank = false;
+    private boolean ignoreBlank = true;
     // whether to use the first char position (the default); if false, this becomes the lcp_analyzer
     private boolean useFirstPos = true;
     // whether to expose offsets; by default this follows indexMode
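The flags above are what distinguish the four analyzers named in this series' README: indexMode selects exhaustive versus greedy combination, and useFirstPos selects whether a combined token inherits the position of its first char (fcp_*) or of its last char (lcp_*). A construction sketch, assuming only the one-argument constructor that the tests below exercise; any mode setters are hypothetical and therefore omitted:

```java
import org.apache.lucene.analysis.Analyzer;

public class FcpModesSketch {
    public static void main(String[] args) {
        // note: Dictionary.initial(...) must have run first, see the tests below
        // index side: emits every dictionary combination at its first-char position
        Analyzer fcpIndex = new FCPAnalyzer(true);
        // search side: greedy, non-overlapping tokens with the same position
        // convention, so match_phrase can compare positions across both analyzers
        Analyzer fcpSearch = new FCPAnalyzer(false);
    }
}
```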
diff --git a/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java b/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java
index 587ca1d9..6e9bcf4d 100644
--- a/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java
+++ b/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java
@@ -9,8 +9,9 @@ class TokenBody {
     String termBuffer;
     int startOffset, endOffset;
     // position is the position assigned at elasticsearch analysis time; positionIncrement = curr.position - prev.position
-    // startPosition/endPosition collect the tokens expanded from the dictionary, mainly for ik_smart
-    int position, startPosition = -1, endPosition = -1;
+    int position;
+    // todo: in the future, startPosition/endPosition will collect the tokens expanded from the dictionary, mainly for ik_smart
+    int startPosition = -1, endPosition = -1;
     String type;
     List<TokenBody> child;

diff --git a/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java
index ba9ae2ff..2e0f6327 100644
--- a/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java
+++ b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java
@@ -12,6 +12,7 @@
  *
  * NOTE: this is optional, and most analyzers
  * don't change the default value (1). */
+@Deprecated
 public interface PositionLengthAttribute extends Attribute {
   /** Set the position length of this Token.

diff --git a/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java
index 5aa230c0..c4d5dffb 100644
--- a/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java
+++ b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java
@@ -5,6 +5,7 @@
 import org.apache.lucene.util.AttributeReflector;

 /** Default implementation of {@link PositionLengthAttribute}. */
+@Deprecated
 public class PositionLengthAttributeImpl extends AttributeImpl implements PositionLengthAttribute, Cloneable {
   private int positionLength = 1;

diff --git a/core/src/test/java/org/wltea/analyzer/fcp/Configuration4Test.java b/core/src/test/java/org/wltea/analyzer/fcp/Configuration4Test.java
new file mode 100644
index 00000000..34c6d8e8
--- /dev/null
+++ b/core/src/test/java/org/wltea/analyzer/fcp/Configuration4Test.java
@@ -0,0 +1,29 @@
+package org.wltea.analyzer.fcp;
+
+import org.wltea.analyzer.cfg.Configuration;
+
+import java.io.File;
+import java.net.URI;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+/**
+ * @ClassName Configuration4Test
+ * @Description: test configuration that reads the dictionary from ../config
+ */
+public class Configuration4Test extends Configuration {
+    @Override
+    public Path getConfDir() {
+        return Paths.get("../", "config");
+    }
+
+    @Override
+    public Path getConfigInPluginDir() {
+        return Paths.get("../", "config");
+    }
+
+    @Override
+    public Path getPath(String first, String... more) {
+        return Paths.get(first, more);
+    }
+}
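Configuration4Test exists so the core-module tests can load the dictionary from a relative ../config directory instead of an installed elasticsearch plugin directory. A sketch of the bootstrap order it enables; this assumes the repository's config directory holds the dictionary files, which is the same contract the @Before hook in the test below relies on:

```java
public class DictionaryBootstrapSketch {
    public static void main(String[] args) {
        // the dictionary must be loaded once before the first tokenStream() call,
        // otherwise dictionary-based combination presumably finds no words
        org.wltea.analyzer.dic.Dictionary.initial(new Configuration4Test());
        org.apache.lucene.analysis.Analyzer analyzer = new FCPAnalyzer(true);
    }
}
```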
diff --git a/core/src/test/java/org/wltea/analyzer/fcp/FCPAnalyzerTest.java b/core/src/test/java/org/wltea/analyzer/fcp/FCPAnalyzerTest.java
new file mode 100644
index 00000000..b65507d5
--- /dev/null
+++ b/core/src/test/java/org/wltea/analyzer/fcp/FCPAnalyzerTest.java
@@ -0,0 +1,80 @@
+package org.wltea.analyzer.fcp;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.junit.Before;
+import org.junit.Test;
+import org.wltea.analyzer.dic.Dictionary;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+/**
+ * @ClassName FCPAnalyzerTest
+ * @Description: fcp test
+ */
+public class FCPAnalyzerTest {
+
+    @Before
+    public void init() {
+        // initialize the dictionary
+        Dictionary.initial(new Configuration4Test());
+    }
+
+    @Test
+    public void testFcpIndexAnalyzer() {
+        FCPAnalyzer fcpIndex = new FCPAnalyzer(true);
+        String str = "这里是中国, this is china #4.345^";
+        TokenStream stream = null;
+        try {
+            stream = fcpIndex.tokenStream("any", new StringReader(str));
+            PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute.class); // position increment
+            OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class); // start/end offsets of each term
+            CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class); // term text
+            TypeAttribute ta = stream.addAttribute(TypeAttribute.class); // token type
+            stream.reset();
+            int position = -1;
+            while (stream.incrementToken()) {
+                position += pia.getPositionIncrement();
+                System.out.println(position + ":[" + cta.toString() + "]:" + oa.startOffset() + "->" + oa.endOffset() + ":" + ta.type());
+            }
+            stream.end();
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+    @Test
+    public void testFcpSearchAnalyzer() {
+        FCPAnalyzer fcpSearch = new FCPAnalyzer(false);
+        String str = "这里是中国, this is china #4.345^";
+        TokenStream stream = null;
+        try {
+            stream = fcpSearch.tokenStream("any", new StringReader(str));
+            PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute.class); // position increment
+            OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class); // start/end offsets of each term
+            CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class); // term text
+            TypeAttribute ta = stream.addAttribute(TypeAttribute.class); // token type
+            stream.reset();
+            int position = -1;
+            while (stream.incrementToken()) {
+                position += pia.getPositionIncrement();
+                System.out.println(position + ":[" + cta.toString() + "]:" + oa.startOffset() + "->" + oa.endOffset() + ":" + ta.type());
+            }
+            stream.end();
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+    @Test
+    public void test03() {
+        // a whitespace-only string trims to empty
+        String s = " \t \n";
+        System.out.println(s.trim().length() == 0);
+    }
+}
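A possible follow-up to the println-style tests above would be to pin the positions down with assertions, since those positions are exactly what match_phrase depends on. The expected values in this sketch assume the fcp_search behavior documented earlier in this series' README for "中国平安" (中国 at position 0, 平安 at position 2); treat them as illustrative, not authoritative:

```java
@Test
public void testFcpSearchPositions() throws IOException {
    java.util.List<String> actual = new java.util.ArrayList<>();
    // try-with-resources also closes the stream, which the tests above skip
    try (TokenStream stream = new FCPAnalyzer(false).tokenStream("any", new StringReader("中国平安"))) {
        PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        int position = -1;
        while (stream.incrementToken()) {
            position += pia.getPositionIncrement();
            actual.add(cta.toString() + "@" + position);
        }
        stream.end();
    }
    // 平安 sits at position 2, not 1: a combined token keeps its first char's position
    org.junit.Assert.assertEquals(java.util.Arrays.asList("中国@0", "平安@2"), actual);
}
```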