");
+ }
+
+ /** Renders every field as {@code TokenBody{name=value, ...}}, single-quoting termBuffer and type. */
+ @Override
+ public String toString() {
+     final StringBuilder text = new StringBuilder("TokenBody{");
+     text.append("termBuffer='").append(termBuffer).append('\'');
+     text.append(", startOffset=").append(startOffset);
+     text.append(", endOffset=").append(endOffset);
+     text.append(", position=").append(position);
+     text.append(", startPosition=").append(startPosition);
+     text.append(", endPosition=").append(endPosition);
+     text.append(", type='").append(type).append('\'');
+     return text.append(", child=").append(child).append('}').toString();
+ }
+}
diff --git a/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java
new file mode 100644
index 00000000..ba9ae2ff
--- /dev/null
+++ b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java
@@ -0,0 +1,33 @@
+package org.wltea.analyzer.fcp.tokenattributes;
+
+import org.apache.lucene.util.Attribute;
+
+/** Determines how many positions this
+ * token spans. Very few analyzer components actually
+ * produce this attribute, and indexing ignores it, but
+ * it's useful to express the graph structure naturally
+ * produced by decompounding, word splitting/joining,
+ * synonym filtering, etc.
+ *
+ * NOTE: this is optional, and most analyzers
+ * don't change the default value (1). */
+
+public interface PositionLengthAttribute extends Attribute {
+    /**
+     * Set the position length of this Token.
+     *
+     * <p>Tokens start out with a position length of one.
+     * @param positionLength number of positions covered by this token
+     * @throws IllegalArgumentException when positionLength is zero or negative
+     * @see #getPositionLength()
+     */
+    void setPositionLength(int positionLength);
+
+    /**
+     * Reports how many positions this Token spans.
+     * @return the current position length
+     * @see #setPositionLength(int)
+     */
+    int getPositionLength();
+}
+
diff --git a/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java
new file mode 100644
index 00000000..5aa230c0
--- /dev/null
+++ b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java
@@ -0,0 +1,62 @@
+package org.wltea.analyzer.fcp.tokenattributes;
+
+
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeReflector;
+
+/** Default implementation of {@link PositionLengthAttribute}. */
+public class PositionLengthAttributeImpl extends AttributeImpl implements PositionLengthAttribute, Cloneable {
+    private int positionLength = 1;
+
+    /** Creates the attribute holding the default position length of 1. */
+    public PositionLengthAttributeImpl() {}
+
+    /** Stores the new span; values below 1 are rejected with an IllegalArgumentException. */
+    @Override
+    public void setPositionLength(int positionLength) {
+        if (positionLength >= 1) {
+            this.positionLength = positionLength;
+            return;
+        }
+        throw new IllegalArgumentException("Position length must be 1 or greater; got " + positionLength);
+    }
+
+    /** Returns the span last stored, or 1 if none was stored since construction or clear(). */
+    @Override
+    public int getPositionLength() {
+        return this.positionLength;
+    }
+
+    /** Resets the span to the default of 1. */
+    @Override
+    public void clear() {
+        positionLength = 1;
+    }
+
+    /** Two instances are equal exactly when they carry the same position length. */
+    @Override
+    public boolean equals(Object other) {
+        return this == other
+            || (other instanceof PositionLengthAttributeImpl
+                && ((PositionLengthAttributeImpl) other).positionLength == positionLength);
+    }
+
+    /** The hash is the position length itself, which keeps it consistent with equals. */
+    @Override
+    public int hashCode() {
+        return positionLength;
+    }
+
+    /** Copies the span into target through its setter; target must be a PositionLengthAttribute. */
+    @Override
+    public void copyTo(AttributeImpl target) {
+        ((PositionLengthAttribute) target).setPositionLength(positionLength);
+    }
+
+    /** Reports the value to the reflector under the key "positionLength". */
+    @Override
+    public void reflectWith(AttributeReflector reflector) {
+        reflector.reflect(PositionLengthAttribute.class, "positionLength", positionLength);
+    }
+}
+
diff --git a/core/src/main/java/org/wltea/analyzer/fcp/util/CharacterUtil.java b/core/src/main/java/org/wltea/analyzer/fcp/util/CharacterUtil.java
new file mode 100644
index 00000000..0f618963
--- /dev/null
+++ b/core/src/main/java/org/wltea/analyzer/fcp/util/CharacterUtil.java
@@ -0,0 +1,124 @@
+/**
+ * IK 中文分词 版本 5.0
+ * IK Analyzer release 5.0
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * 源代码由林良益(linliangyi2005@gmail.com)提供
+ * 版权声明 2012,乌龙茶工作室
+ * provided by Linliangyi and copyright 2012 by Oolong studio
+ *
+ * 字符集识别工具类
+ */
+package org.wltea.analyzer.fcp.util;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * 字符集识别工具类
+ */
+public class CharacterUtil {
+
+    // Token type tags. They must be pairwise distinct: identifyCharType and the ORDER table
+    // only work if the tags can be told apart. All of them used to be "", which made every
+    // type identical and collapsed ORDER to a single entry.
+    // NOTE(review): spellings follow Lucene's "<NAME>" type convention - confirm with callers.
+    public static final String CHAR_USELESS = "<USELESS>";
+
+    public static final String CHAR_ENGLISH = "<ENGLISH>";
+
+    public static final String CHAR_NUMBER = "<NUMBER>";
+
+    public static final String CHAR_NUMBER_DOT = "<NUMBER_DOT>";
+
+    public static final String ALPHANUM = "<ALPHANUM>";
+
+    public static final String CHAR_CHINESE = "<CHINESE>";
+
+    public static final String COMBINE_WORD = "<COMBINE_WORD>";
+
+    public static final String CHAR_MAPPING = "<MAPPING>";
+
+    public static final String CHAR_BLANK = "<BLANK>";
+
+    // pinyin
+    public static final String CHAR_PINYIN = "<PINYIN>";
+    // pinyin prefix
+    public static final String CHAR_PINYIN_PRE = "<PINYIN_PRE>";
+
+    // Sort rank per type: a smaller value sorts earlier among tokens sharing one position.
+    private static final Map<String, Integer> ORDER = new HashMap<>();
+    static {
+        ORDER.put(CHAR_CHINESE, 0);
+        ORDER.put(CHAR_PINYIN_PRE, 5);
+        ORDER.put(CHAR_PINYIN, 10);
+        ORDER.put(CHAR_USELESS, 0);
+        ORDER.put(CHAR_MAPPING, 10);
+    }
+
+    /** Returns the sort rank registered for {@code type}, or 0 when it has none. */
+    public static int getOrderByType(String type) {
+        return ORDER.getOrDefault(type, 0);
+    }
+
+    /**
+     * Classifies a code point.
+     * @param input the code point to classify
+     * @return CHAR_NUMBER for ASCII digits, CHAR_ENGLISH for ASCII letters, CHAR_CHINESE for
+     *         the three CJK blocks tested below, CHAR_USELESS for anything else
+     */
+    public static String identifyCharType(int input) {
+        if (input >= '0' && input <= '9') {
+            return CHAR_NUMBER;
+        }
+        if ((input >= 'a' && input <= 'z') || (input >= 'A' && input <= 'Z')) {
+            return CHAR_ENGLISH;
+        }
+        // UnicodeBlock.of throws for values that are not code points (e.g. -1 at end of input).
+        if (!Character.isValidCodePoint(input)) {
+            return CHAR_USELESS;
+        }
+        Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
+        if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
+                || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
+                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) {
+            return CHAR_CHINESE;
+        }
+        // Every other character is left unprocessed.
+        return CHAR_USELESS;
+    }
+
+    /**
+     * Normalizes a code point: full-width forms become half-width and ASCII upper case
+     * becomes lower case, including letters that were full-width upper case on input.
+     * @param input the code point to normalize
+     * @return the normalized code point
+     */
+    public static int regularize(int input) {
+        if (input == 12288) {
+            return 32; // U+3000 ideographic space -> ASCII space
+        }
+        if (input > 65280 && input < 65375) {
+            input -= 65248; // U+FF01..U+FF5E -> U+0021..U+007E
+        }
+        // Deliberately not an else-if: a converted full-width letter is lower-cased too.
+        if (input >= 'A' && input <= 'Z') {
+            input += 32;
+        }
+        return input;
+    }
+}
diff --git a/elasticsearch/src/main/java/com/infinilabs/ik/elasticsearch/AnalysisIkPlugin.java b/elasticsearch/src/main/java/com/infinilabs/ik/elasticsearch/AnalysisIkPlugin.java
index 54ee735e..f906af6d 100644
--- a/elasticsearch/src/main/java/com/infinilabs/ik/elasticsearch/AnalysisIkPlugin.java
+++ b/elasticsearch/src/main/java/com/infinilabs/ik/elasticsearch/AnalysisIkPlugin.java
@@ -33,6 +33,12 @@ public Map {
+ private final FCPAnalyzer analyzer;
+
+ /**
+ * Creates the provider and configures its FCPAnalyzer from settings (split_complete, max_token_length, useless_mapping, ignore_blank, use_first_position, show_offset).
+ * @param indexSettings not read by this constructor
+ * @param env not read by this constructor
+ * @param name forwarded to the superclass constructor
+ * @param settings source of the keys above; also forwarded to the superclass constructor
+ * @param indexMode forwarded to the FCPAnalyzer constructor
+ */
+ public FCPAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings, boolean indexMode) {
+ super(name, settings);
+ boolean splitComplete = settings.getAsBoolean("split_complete", FCPAnalyzer.DEFAULT_SPLIT_COMPLETE);
+ int maxTokenLength = settings.getAsInt("max_token_length", CombineCharFilter.DEFAULT_MAX_WORD_LEN);
+ boolean uselessMapping = settings.getAsBoolean("useless_mapping", ExtendFilter.DEFAULT_USELESS_MAPPING);
+ boolean ignoreBlank = settings.getAsBoolean("ignore_blank", ExtendFilter.DEFAULT_IGNORE_BLANK);
+ boolean useFirstPos = settings.getAsBoolean("use_first_position", ExtendFilter.DEFAULT_USE_FIRST_POSITION);
+ Boolean showOffset = settings.getAsBoolean("show_offset", null); // boxed, with null as the fallback value
+ analyzer = new FCPAnalyzer(indexMode);
+ if (showOffset != null) { // leave the analyzer's own show-offset value untouched otherwise
+ analyzer.setShowOffset(showOffset);
+ }
+ analyzer.setSplitComplete(splitComplete);
+ analyzer.setUselessMapping(uselessMapping);
+ analyzer.setMaxTokenLength(maxTokenLength);
+ analyzer.setIgnoreBlank(ignoreBlank);
+ analyzer.setUseFirstPos(useFirstPos);
+ }
+
+ /** Generic factory: the mode comes from the "index_mode" setting, defaulting to ExtendFilter.DEFAULT_INDEX_MODE. */
+ public static FCPAnalyzerProvider getFCPAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+     return new FCPAnalyzerProvider(indexSettings, env, name, settings, settings.getAsBoolean("index_mode", ExtendFilter.DEFAULT_INDEX_MODE));
+ }
+
+ /**
+ * FCP index factory: builds the provider with indexMode = true and forces useFirstPos to true.
+ */
+ public static FCPAnalyzerProvider getFCPIndexAnalyzer(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+     final FCPAnalyzerProvider result = new FCPAnalyzerProvider(indexSettings, env, name, settings, true);
+     result.get().setUseFirstPos(true); // applied after the constructor, so it wins over "use_first_position"
+     return result;
+ }
+
+ /**
+ * FCP search factory: builds the provider with indexMode = false and forces useFirstPos to true.
+ */
+ public static FCPAnalyzerProvider getFCPSearchAnalyzer(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+     final FCPAnalyzerProvider result = new FCPAnalyzerProvider(indexSettings, env, name, settings, false);
+     result.get().setUseFirstPos(true); // applied after the constructor, so it wins over "use_first_position"
+     return result;
+ }
+
+ /** LCP index factory: builds the provider with indexMode = true and forces useFirstPos to false. */
+ public static FCPAnalyzerProvider getLCPIndexAnalyzer(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+     final FCPAnalyzerProvider result = new FCPAnalyzerProvider(indexSettings, env, name, settings, true);
+     // Applied after the constructor, so it wins over the "use_first_position" setting.
+     result.get().setUseFirstPos(false);
+     return result;
+ }
+
+ /** LCP search factory: builds the provider with indexMode = false and forces useFirstPos to false. */
+ public static FCPAnalyzerProvider getLCPSearchAnalyzer(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+     final FCPAnalyzerProvider result = new FCPAnalyzerProvider(indexSettings, env, name, settings, false);
+     // Applied after the constructor, so it wins over the "use_first_position" setting.
+     result.get().setUseFirstPos(false);
+     return result;
+ }
+
+ @Override
+ public FCPAnalyzer get() {
+     return this.analyzer; // the single instance assigned in the constructor
+ }
+}
\ No newline at end of file
From 5e5445f29761de91c3d7a20e1ebc70b2c9daaf56 Mon Sep 17 00:00:00 2001
From: "hao.mou"
Date: Sun, 28 Apr 2024 10:06:27 +0800
Subject: [PATCH 2/2] add new analyzer , simple implementation
---
.../org/wltea/analyzer/fcp/ExtendFilter.java | 271 +++---------------
.../org/wltea/analyzer/fcp/FCPAnalyzer.java | 5 +-
.../org/wltea/analyzer/fcp/TokenBody.java | 5 +-
.../PositionLengthAttribute.java | 1 +
.../PositionLengthAttributeImpl.java | 1 +
.../analyzer/fcp/Configuration4Test.java | 29 ++
.../wltea/analyzer/fcp/FCPAnalyzerTest.java | 80 ++++++
7 files changed, 148 insertions(+), 244 deletions(-)
create mode 100644 core/src/test/java/org/wltea/analyzer/fcp/Configuration4Test.java
create mode 100644 core/src/test/java/org/wltea/analyzer/fcp/FCPAnalyzerTest.java
diff --git a/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java b/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java
index 211c09b0..be2451a1 100644
--- a/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java
+++ b/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java
@@ -9,30 +9,24 @@
import org.wltea.analyzer.dic.DictSegment;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;
-import org.wltea.analyzer.fcp.tokenattributes.PositionLengthAttribute;
import org.wltea.analyzer.fcp.util.CharacterUtil;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
import java.util.PriorityQueue;
/**
* use dict to extend terms
*/
public class ExtendFilter extends TokenFilter {
- private static final boolean IS_DEBUG = true;
// 默认入库模式
public static final boolean DEFAULT_INDEX_MODE = true;
// 默认对于特殊字符采用模糊搜索,扩大搜索范围
public static final boolean DEFAULT_USELESS_MAPPING = true;
// 默认对于句子的空白进行忽略
public static final boolean DEFAULT_IGNORE_BLANK = true;
- // 默认对于句子的空白进行忽略
- public static final boolean DEFAULT_IGNORE_WHITESPACE = true;
// 默认使用 lcp 的模式,使用最后一个char的position
public static final boolean DEFAULT_USE_FIRST_POSITION = false;
// 在高亮的时候使用 offset
@@ -42,8 +36,6 @@ public class ExtendFilter extends TokenFilter {
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
- // 用于记录每一个term 的position length
- private final PositionLengthAttribute lengthAttribute = addAttribute(PositionLengthAttribute.class);
// used for saving upstream tokens , implemented by Arraylist
private List tokenBodies = null;
@@ -123,7 +115,7 @@ public final boolean incrementToken() throws IOException {
tb.endOffset = showOffset ? offsetAtt.endOffset() : 0;
// blank 类型会被舍弃,position不变
tb.termBuffer = termAtt.toString();
- // 下面是处理 position 和 type的赋值,单个 term,没有 startPosition 和 endPosition
+ // 下面是处理 position 和 type的赋值
if (CharacterUtil.CHAR_USELESS.equals(typeAtt.type())) {
if (isAllBlank(tb.termBuffer) && this.ignoreBlank) {
// 表示沿用上一个 position,下面将会被舍弃掉
@@ -135,7 +127,7 @@ public final boolean incrementToken() throws IOException {
tb.position = position;
tb.type = typeAtt.type();
if (uselessMapping) {
- tb.termBuffer = "#"; // 无特殊含义,将特殊字符统一映射为 # 方便查询
+ tb.termBuffer = "#"; // 无特殊含义,将特殊字符统一映射为 # 方便查询, 否则特殊字符也是需要精准匹配
}
}
} else {
@@ -157,10 +149,6 @@ public final boolean incrementToken() throws IOException {
termAtt.copyBuffer(chars, 0, chars.length);
offsetAtt.setOffset(body.startOffset, body.endOffset);
typeAtt.setType(body.type);
- if (!indexMode) {
- // 计算当前combine term 的跨度,占用了多少个 term
- lengthAttribute.setPositionLength(body.endPosition - body.startPosition + 1);
- }
return true;
} else {
tokenBodies = null;
@@ -171,7 +159,7 @@ public final boolean incrementToken() throws IOException {
/**
- * 判断参数是否全部由空白字符组成
+ * 判断参数是否全部由空白字符(空格、制表符、换行……)组成
* @param s
* @return
*/
@@ -188,237 +176,44 @@ private void extendTerms(List tokenBodies, boolean indexMode, boolean
if (!tokenBody.type.equals(CharacterUtil.CHAR_BLANK)) {
// 处理当前char, 但要考虑向后扩展,得到以当前位置开始 以 endList 中位置结束的一系列term,
List endList = getCurrentEndList(tokenBodies, beginI, ignoreBlank);
- // 默认在 index 模式下,一股脑全部放到倒排中(index 模式对性能敏感,所以必须保证)
- if (!indexMode) {
- tokenBody.startPosition = tokenBody.position;
- tokenBody.endPosition = tokenBody.position;
- }
- tokenResults.add(tokenBody);
- for (Integer endI : endList) {
- TokenBody tb= new TokenBody();
- tb.termBuffer = combineTermBuffer(tokenBodies, beginI, endI);
- tb.startOffset = tokenBodies.get(beginI).startOffset;
- tb.endOffset = tokenBodies.get(endI).endOffset;
- // search 模式下需要记录组合 term 前后的 position
- if (!indexMode) {
- tb.startPosition = tokenBodies.get(beginI).position;
- tb.endPosition = tokenBodies.get(endI).position;
- }
- if (useFirstPos) {
- tb.position = tokenBodies.get(beginI).position;
- } else {
- tb.position = tokenBodies.get(endI).position;
+ if (indexMode) {
+ tokenResults.add(tokenBody);
+ for (Integer endI : endList) {
+ TokenBody tb= new TokenBody();
+ tb.termBuffer = combineTermBuffer(tokenBodies, beginI, endI);
+ tb.startOffset = tokenBodies.get(beginI).startOffset;
+ tb.endOffset = tokenBodies.get(endI).endOffset;
+ if (useFirstPos) {
+ tb.position = tokenBodies.get(beginI).position;
+ } else {
+ tb.position = tokenBodies.get(endI).position;
+ }
+ tb.type = CharacterUtil.COMBINE_WORD;
+ tokenResults.add(tb);
}
- tb.type = "";
- tokenResults.add(tb);
- }
- }
- }
- // 到这里如果是index 模式的话,已经可以结束了;
- // 如果是 search模式,需要做歧义处理(如果有的话, 使用 类型的char 作为天然分割句子)
- if (!indexMode && tokenResults.size() > 0) {
- // 在search 模式下,采用 ik_smart 的逻辑进行语义分割,一个重大的意义:引入了语义分割
- // 1,ik 使用没有语义重叠的那个 char 作为分割点,只作用于有字符重叠的部分
- // 2,由于和 index 模式使用相同的 向后扩展逻辑,所以search是index 的子集
- // 3,search 模式下,不会涉及mapping的扩展引入
- // 4,search 模式下,使用 startPosition 来进行判断扩是否有歧义
-
- // 用于保存多个term的组合形式,逆序。:采用动态编程思想,完成快速组合
- PriorityQueue combineTerms = new PriorityQueue(new Comparator(){
- @Override
- public int compare(TokenBody o1, TokenBody o2){
- // 顺序有重要意义
- return o1.startPosition != o2.startPosition ?
- Integer.compare(o1.startPosition, o2.startPosition)
- : Integer.compare(o2.endPosition, o1.endPosition);
- }
- });
- // 用于保存单个term的形式(最后将保存全部的结果)
- Map singleTerm = new HashMap<>();
-
- // 将切分结果重新排序, 并清空之前的处理结果
- int startPosition = Integer.MAX_VALUE;
- int endPosition = Integer.MIN_VALUE;
- while (tokenResults.size() > 0) {
- TokenBody t = tokenResults.poll();
- if (t.startPosition == t.endPosition) {
- // 单个 term
- singleTerm.put(t.position, t);
- startPosition = Math.min(startPosition, t.startPosition);
- endPosition = Math.max(endPosition, t.endPosition);
} else {
- // 组合出来的term,不参与歧义判断,仅仅用于歧义判断后的填补那些空白的 position
- combineTerms.add(t);
- }
- }
-
- // 处理分词,没有歧义的直接放到结果中,有歧义的处理完之后放到结果中
- PriorityQueue searchReverseOrder = new PriorityQueue(new Comparator(){
- @Override
- public int compare(TokenBody o1, TokenBody o2){
- // 顺序有重要意义
- return o1.startPosition != o2.startPosition ?
- Integer.compare(o2.startPosition, o1.startPosition)
- : Integer.compare(o1.endPosition, o2.endPosition);
- }
- });
-
- // 在处理一段歧义时,控制前后范围, 第一次就是最开始的范围
- int maxExtend = Integer.MIN_VALUE; // 边界包含
- for (TokenBody tb : combineTerms) {
- if (searchReverseOrder.size() == 0) {
- searchReverseOrder.add(tb);
- maxExtend = tb.endPosition;
- continue;
- }
-
- if (maxExtend < tb.startPosition) {
- // 表示当前term 与之前的切分没有歧义
- if (searchReverseOrder.size() == 1) {
- final TokenBody body = searchReverseOrder.poll();
- singleTerm.put(body.startPosition, body);
+ // 处理search analyzer 结果,贪婪向后匹配
+ // 1,只有单字,加入单字
+ // 2,有后缀匹配,采用最长的token结果(目的是找到个数最少的组合,非最优,但比较简单)
+ if (endList.isEmpty()) {
+ tokenResults.add(tokenBody); // 单字
} else {
- // 这里先处理掉之前有歧义的部分,
- final List arbitrator = arbitrator(searchReverseOrder);
- for(TokenBody body : arbitrator) {
- singleTerm.put(body.startPosition, body);
+ int lastEnd = endList.get(endList.size()-1); // 取最长token
+ tokenBody.termBuffer = combineTermBuffer(tokenBodies, beginI, lastEnd);
+ tokenBody.startOffset = tokenBodies.get(beginI).startOffset;
+ tokenBody.endOffset = tokenBodies.get(lastEnd).endOffset;
+ if (useFirstPos) {
+ tokenBody.position = tokenBodies.get(beginI).position;
+ } else {
+ tokenBody.position = tokenBodies.get(lastEnd).position;
}
- }
- }
- searchReverseOrder.add(tb);
- maxExtend = Math.max(maxExtend, tb.endPosition);
- }
- // 处理最后的歧义
- if (searchReverseOrder.size() == 1) {
- final TokenBody body = searchReverseOrder.poll();
- singleTerm.put(body.startPosition, body);
- } else if(searchReverseOrder.size() > 1){
- final List arbitrator = arbitrator(searchReverseOrder);
- for(TokenBody body : arbitrator) {
- singleTerm.put(body.startPosition, body);
- }
- }
- // endPosition 的用途
- while (startPosition <= endPosition) {
- if (singleTerm.containsKey(startPosition)) {
- final TokenBody body = singleTerm.get(startPosition);
- tokenResults.add(body);
- startPosition = body.endPosition + 1;
- } else {
- startPosition++;
- }
- }
- }
- }
+ tokenBody.type = CharacterUtil.COMBINE_WORD;
+ tokenResults.add(tokenBody);
- /**
- * 处理有歧义的token,
- * @param searchReverseOrder 为倒序的token
- * @return
- */
- private List arbitrator(PriorityQueue searchReverseOrder) {
- Map> positionMap = new HashMap<>();
- int maxIndex = -1;
- int minIndex = -1;
- while (searchReverseOrder.size() > 0) {
- final TokenBody body = searchReverseOrder.poll();
- if (searchReverseOrder.size() == 0) {
- // 要处理的最开始的位置,也就是 searchReverseOrder 的最后一个
- minIndex = body.startPosition;
- }
- if (maxIndex == -1) {
- // 要处理的最后的位置,也就是 searchReverseOrder 的第一个
- maxIndex = body.startPosition;
- }
- // 下面给当前的 token 添加 child
- int currentMax = maxIndex;
- for (int i = body.endPosition + 1; i <= currentMax; i++) {
- if (positionMap.containsKey(i)) {
- final List bodies = positionMap.get(i);
- final TokenBody minLengthBody = bodies.get(0); // 表示取其后紧挨着的最短token作为结束位置
- if (currentMax == maxIndex) {
- currentMax = minLengthBody.endPosition; // 表示 minLengthBody 后面的 term 不可以作为 child了
- }
- if (body.child == null) {
- body.child = new ArrayList<>();
+ beginI = lastEnd;
}
- body.child.addAll(positionMap.get(i));
}
}
- // 将 token放到结果中
- if (positionMap.containsKey(body.startPosition) == false) {
- positionMap.put(body.startPosition, new ArrayList<>());
- }
- positionMap.get(body.startPosition).add(body);
-
-// if (IS_DEBUG) {
-// for(int i = 0; i < maxIndex + 10; i++) {
-// String s = "- ";
-// if (body.startPosition <= i && i <= body.endPosition) {
-// s = "# ";
-// }
-// System.out.print(s);
-// }
-// System.out.println();
-// }
- }
- List topOptions = new ArrayList<>();
-
- final TokenBody firstMinLength = positionMap.get(minIndex).get(0);
- for(int i = firstMinLength.startPosition; i <= firstMinLength.endPosition; i++) {
- if (positionMap.containsKey(i)) {
- topOptions.addAll(positionMap.get(i));
- }
- }
- for (TokenBody t : topOptions) {
- System.out.println(t);
- }
- List result = new ArrayList<>();
- final OptionPath bestPath = chooseBestPath(topOptions);
- for (int i = 0; i < bestPath.size ; i++) {
- int startP = bestPath.getValueByIndex(2 * i);
- int endP = bestPath.getValueByIndex(2 * i + 1);
- final List bodyList = positionMap.get(startP);
- for(TokenBody tb : bodyList) {
- if (tb.startPosition == startP && tb.endPosition == endP) {
- result.add(tb);
- break;
- }
- }
- }
- return result;
- }
-
- // options 本身为已经处理好的结构,使用引用指向下级关系
- private OptionPath chooseBestPath(List options) {
- // 使用 PriorityQueue,因为只是需要获取最小的那一个,其后的严格有序不是必须的
- PriorityQueue allOptionPath = new PriorityQueue(new Comparator () {
- @Override
- public int compare(OptionPath o1, OptionPath o2) {
- return o2.compareTo(o1);
- }
- });
-
- for(TokenBody tokenBody : options) {
- OptionPath path = new OptionPath();
- path.addElement(tokenBody.startPosition, tokenBody.endPosition);
- findNextPath(allOptionPath, tokenBody, path);
- }
- final OptionPath bestPath = allOptionPath.poll();
- return bestPath;
- }
-
- private void findNextPath(PriorityQueue allOptionPath, TokenBody tokenBody, OptionPath parentPath) {
- if (tokenBody.child == null) {
- // 路径的最后,结束递归
- allOptionPath.add(parentPath);
- return;
- }
- for(TokenBody child : tokenBody.child) {
- // 复制parent path
- OptionPath childPath = parentPath.copy();
- childPath.addElement(child.startPosition, child.endPosition);
- findNextPath(allOptionPath, child, childPath);
}
}
diff --git a/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java b/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java
index e0e28f13..ec381546 100644
--- a/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java
+++ b/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java
@@ -5,10 +5,7 @@
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.synonym.SynonymMap;
-import java.util.Arrays;
-import java.util.List;
public final class FCPAnalyzer extends Analyzer {
/** Default maximum allowed token length */
@@ -21,7 +18,7 @@ public final class FCPAnalyzer extends Analyzer {
// 特殊字符的映射,默认为 true 表示模糊匹配特殊字符。如果设置为 false ,将会把原始的char放到最终分词结果中。
private boolean uselessMapping = true;
// 默认文本是正确文本,其中的空白是有意义的,不能忽略空白。如果认为原文中的空白由于ETL错误引入,应该忽略空白。
- private boolean ignoreBlank = false;
+ private boolean ignoreBlank = true;
// 是否使用 first char position ,默认使用,如果为 false,则变为 lcp_analyzer
private boolean useFirstPos = true;
// 是否显示 offset,默认随着 indexMode 变化
diff --git a/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java b/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java
index 587ca1d9..6e9bcf4d 100644
--- a/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java
+++ b/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java
@@ -9,8 +9,9 @@ class TokenBody {
String termBuffer;
int startOffset, endOffset;
// position 用于表示在 elasticsearch 分词时得到的 position, 通过 curr.position - prev.position 得到 positionIncrement
- // startPosition、endPosition 用于收集 那些在 词库中 扩展出来的 token,主要给 ik_smart 使用
- int position, startPosition = -1, endPosition = -1;
+ int position;
+ // todo 未来startPosition、endPosition 用于收集 那些在 词库中 扩展出来的 token,主要给 ik_smart 使用
+ int startPosition = -1, endPosition = -1;
String type;
List child;
diff --git a/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java
index ba9ae2ff..2e0f6327 100644
--- a/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java
+++ b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java
@@ -12,6 +12,7 @@
* NOTE: this is optional, and most analyzers
* don't change the default value (1). */
+@Deprecated
public interface PositionLengthAttribute extends Attribute {
/**
* Set the position length of this Token.
diff --git a/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java
index 5aa230c0..c4d5dffb 100644
--- a/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java
+++ b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java
@@ -5,6 +5,7 @@
import org.apache.lucene.util.AttributeReflector;
/** Default implementation of {@link PositionLengthAttribute}. */
+@Deprecated
public class PositionLengthAttributeImpl extends AttributeImpl implements PositionLengthAttribute, Cloneable {
private int positionLength = 1;
diff --git a/core/src/test/java/org/wltea/analyzer/fcp/Configuration4Test.java b/core/src/test/java/org/wltea/analyzer/fcp/Configuration4Test.java
new file mode 100644
index 00000000..34c6d8e8
--- /dev/null
+++ b/core/src/test/java/org/wltea/analyzer/fcp/Configuration4Test.java
@@ -0,0 +1,29 @@
+package org.wltea.analyzer.fcp;
+
+import org.wltea.analyzer.cfg.Configuration;
+
+import java.io.File;
+import java.net.URI;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+/**
+ * Configuration used by the tests in this package: both configuration
+ * directories resolve to the relative path ../config.
+ */
+public class Configuration4Test extends Configuration {
+ @Override
+ public Path getConfDir() {
+ return Paths.get("../", "config");
+ }
+
+ @Override
+ public Path getConfigInPluginDir() {
+ return Paths.get("../", "config");
+ }
+
+ @Override
+ public Path getPath(String first, String... more) {
+ return Paths.get(first, more);
+ }
+}
diff --git a/core/src/test/java/org/wltea/analyzer/fcp/FCPAnalyzerTest.java b/core/src/test/java/org/wltea/analyzer/fcp/FCPAnalyzerTest.java
new file mode 100644
index 00000000..b65507d5
--- /dev/null
+++ b/core/src/test/java/org/wltea/analyzer/fcp/FCPAnalyzerTest.java
@@ -0,0 +1,80 @@
+package org.wltea.analyzer.fcp;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.junit.Before;
+import org.junit.Test;
+import org.wltea.analyzer.dic.Dictionary;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+/**
+ * Smoke tests for {@link FCPAnalyzer}: each mode analyzes one mixed sample and prints
+ * the emitted tokens.
+ * An IOException fails the test rather than being printed and swallowed.
+ */
+public class FCPAnalyzerTest {
+
+    /** Sample mixing Chinese, English, punctuation and a decimal number. */
+    private static final String SAMPLE = "这里是中国, this is china #4.345^";
+
+    @Before
+    public void init() {
+        // Initialize the dictionary before any analyzer is created.
+        Dictionary.initial(new Configuration4Test());
+    }
+
+    /** Runs the analyzer constructed with {@code true} (index mode) over the sample. */
+    @Test
+    public void testFcpIndexAnalyzer() throws IOException {
+        printTokens(true, SAMPLE);
+    }
+
+    /** Runs the analyzer constructed with {@code false} (search mode) over the sample. */
+    @Test
+    public void testFcpSearchAnalyzer() throws IOException {
+        printTokens(false, SAMPLE);
+    }
+
+    /** A string made only of spaces, tabs and newlines trims to the empty string. */
+    @Test
+    public void test03() {
+        String s = " \t \n";
+        org.junit.Assert.assertTrue(s.trim().length() == 0);
+    }
+
+    /**
+     * Analyzes {@code text} and prints one line per token in the form
+     * {@code position:[term]:startOffset->endOffset:type}.
+     *
+     * <p>The stream is driven through reset / incrementToken / end, and both the stream
+     * and the analyzer are closed by try-with-resources even when analysis fails.
+     *
+     * @param indexMode value passed to the {@link FCPAnalyzer} constructor
+     * @param text the text to analyze
+     * @throws IOException if the token stream fails
+     */
+    private static void printTokens(boolean indexMode, String text) throws IOException {
+        try (FCPAnalyzer analyzer = new FCPAnalyzer(indexMode);
+             TokenStream stream = analyzer.tokenStream("any", new StringReader(text))) {
+            // Attribute views are fetched once; they are updated by each incrementToken().
+            PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute.class);
+            OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class);
+            CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
+            TypeAttribute ta = stream.addAttribute(TypeAttribute.class);
+            stream.reset();
+            // Increments are relative, so start one before the first position.
+            int position = -1;
+            while (stream.incrementToken()) {
+                position += pia.getPositionIncrement();
+                System.out.println(position + ":[" + cta.toString() + "]:"
+                        + oa.startOffset() + "->" + oa.endOffset() + ":" + ta.type());
+            }
+            stream.end();
+        }
+    }
+}