From f2e64ebbefc075fddb9c248fbc76b4cd907a3f49 Mon Sep 17 00:00:00 2001
From: "hao.mou" <1967886749@qq.com>
Date: Sat, 20 Apr 2024 19:51:54 +0800
Subject: [PATCH 1/2] add new analyzer to speed up match_phrase query
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                                         |   1 +
 README.md                                          | 123 +++++
 .../org/wltea/analyzer/dic/DictSegment.java        |   8 +-
 .../org/wltea/analyzer/dic/Dictionary.java         |   4 +
 .../wltea/analyzer/fcp/CombineCharFilter.java      | 185 +++++++
 .../org/wltea/analyzer/fcp/ExtendFilter.java       | 474 ++++++++++++++++++
 .../org/wltea/analyzer/fcp/FCPAnalyzer.java        | 131 +++++
 .../org/wltea/analyzer/fcp/FormatFilter.java       |  51 ++
 .../org/wltea/analyzer/fcp/OptionPath.java         | 102 ++++
 .../org/wltea/analyzer/fcp/TokenBody.java          |  47 ++
 .../PositionLengthAttribute.java                   |  33 ++
 .../PositionLengthAttributeImpl.java               |  62 +++
 .../analyzer/fcp/util/CharacterUtil.java           | 124 +++++
 .../ik/elasticsearch/AnalysisIkPlugin.java         |   6 +
 .../ik/elasticsearch/FCPAnalyzerProvider.java      |  89 ++++
 15 files changed, 1436 insertions(+), 4 deletions(-)
 create mode 100644 core/src/main/java/org/wltea/analyzer/fcp/CombineCharFilter.java
 create mode 100644 core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java
 create mode 100644 core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java
 create mode 100644 core/src/main/java/org/wltea/analyzer/fcp/FormatFilter.java
 create mode 100644 core/src/main/java/org/wltea/analyzer/fcp/OptionPath.java
 create mode 100644 core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java
 create mode 100644 core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java
 create mode 100644 core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java
 create mode 100644 core/src/main/java/org/wltea/analyzer/fcp/util/CharacterUtil.java
 create mode 100644 elasticsearch/src/main/java/com/infinilabs/ik/elasticsearch/FCPAnalyzerProvider.java

diff --git a/.gitignore b/.gitignore
index a53ac3d1..90db62f7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@
 *.iml
 \.*
 !.travis.yml
+*/target

diff --git a/README.md b/README.md
index 60eab6b9..8bea7581 100644
--- a/README.md
+++ b/README.md
@@ -143,6 +143,129 @@ Result
 }
 ```
+Elasticsearch's match_phrase query is very CPU-intensive, because it has to check the relative positions of terms. To speed up these searches, this patch optimizes the tokenization so that correct relative position information is stored in the index, which lets match_phrase run against analyzed text; in testing, queries dropped to below 10% of their original cost. The feature ships as a pair of analyzers, an index analyzer and a search analyzer, used for indexing data and for querying respectively.
+The principle: each term produced by the tokenizer carries the position of its first character, so correct relative position information can be stored in the inverted index. The index analyzer emits every combination found in the dictionary; the search analyzer cuts the text into the fewest terms possible, with no overlaps.
+Usage:
+1. Define a text field with `analyzer` set to the index analyzer and `search_analyzer` set to the search analyzer (a settings sketch follows this list).
+2. Index your data.
+3. Query.
+4. Analyzers that anchor a term's position on its first character: fcp_index, fcp_search; on its last character: lcp_index, lcp_search.
+5. Known limitation: the built-in highlighting does not currently support this tokenization.
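+
+The analyzer also accepts index-level settings. The setting names below match what `FCPAnalyzerProvider` reads in this patch (`index_mode`, `split_complete`, `max_token_length`, `useless_mapping`, `ignore_blank`, `use_first_position`, `show_offset`), but the analyzer `type` name `fcp` and the index name `fcp_demo` are assumptions for illustration; check `AnalysisIkPlugin` for the registered names. A minimal sketch:
+
+```json
+PUT fcp_demo
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_fcp": {
+          "type": "fcp",
+          "index_mode": true,
+          "split_complete": false,
+          "max_token_length": 255,
+          "useless_mapping": true,
+          "ignore_blank": true,
+          "use_first_position": true,
+          "show_offset": false
+        }
+      }
+    }
+  }
+}
+```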
+
+Principle
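+
+The two `_analyze` examples below show the position rule. `fcp_index` is the finest-grained form: it emits every character and every dictionary word, and each word takes the position of its first character (so `中国` shares position 0 with `中`). `fcp_search` is the coarse-grained, non-overlapping form, but each word still takes its first character's position, which is why `match_phrase` stays correct between index time and search time.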
+ +```json +# 使用index 分词是,最细粒度的,按照字的position确定词的position,确定了position的取值标准 +POST /_analyze +{ + "analyzer": "fcp_index", + "text": "中国平安" +} +# response +{ + "tokens": [ + { + "token": "中", + "start_offset": 0, + "end_offset": 0, + "type": "", + "position": 0 + }, + { + "token": "中国", + "start_offset": 0, + "end_offset": 0, + "type": "", + "position": 0 + }, + { + "token": "国", + "start_offset": 0, + "end_offset": 0, + "type": "", + "position": 1 + }, + { + "token": "平", + "start_offset": 0, + "end_offset": 0, + "type": "", + "position": 2 + }, + { + "token": "平安", + "start_offset": 0, + "end_offset": 0, + "type": "", + "position": 2 + }, + { + "token": "安", + "start_offset": 0, + "end_offset": 0, + "type": "", + "position": 3 + } + ] +} +# 使用search 分词是粗粒度、无重叠分词,但仍按照字的position确定词的position,所以使用match_phrase有效 +POST /_analyze +{ + "analyzer": "fcp_search", + "text": "中国平安" +} +# response +{ + "tokens": [ + { + "token": "中国", + "start_offset": 0, + "end_offset": 2, + "type": "", + "position": 0 + }, + { + "token": "平安", + "start_offset": 2, + "end_offset": 4, + "type": "", + "position": 2 + } + ] +} +``` + +```json +PUT test_index +{ + "mappings": { + "properties": { + "content":{ + "type": "text", + "analyzer": "fcp_index", + "search_analyzer": "fcp_search" + } + } + } +} + +POST test_index/_doc/1 +{ + "content": "如果需要覆盖原来的配置" +} + +GET test_index/_search +{ + "query": { + "match_phrase": { + "content": { + "query": "要覆盖" + } + } + } +} +``` + # Dictionary Configuration Config file `IKAnalyzer.cfg.xml` can be located at `{conf}/analysis-ik/config/IKAnalyzer.cfg.xml` diff --git a/core/src/main/java/org/wltea/analyzer/dic/DictSegment.java b/core/src/main/java/org/wltea/analyzer/dic/DictSegment.java index 9e7b6fe4..33e60139 100644 --- a/core/src/main/java/org/wltea/analyzer/dic/DictSegment.java +++ b/core/src/main/java/org/wltea/analyzer/dic/DictSegment.java @@ -32,7 +32,7 @@ /** * 词典树分段,表示词典树的一个分枝 */ -class DictSegment implements Comparable{ +public class DictSegment implements Comparable{ //公用字典表,存储汉字 private static final Map charMap = new ConcurrentHashMap(16 , 0.95f); @@ -55,7 +55,7 @@ class DictSegment implements Comparable{ private int nodeState = 0; - DictSegment(Character nodeChar){ + public DictSegment(Character nodeChar){ if(nodeChar == null){ throw new IllegalArgumentException("node char cannot be empty"); } @@ -78,7 +78,7 @@ boolean hasNextNode(){ * @param charArray * @return Hit */ - Hit match(char[] charArray){ + public Hit match(char[] charArray){ return this.match(charArray , 0 , charArray.length , null); } @@ -166,7 +166,7 @@ Hit match(char[] charArray , int begin , int length , Hit searchHit){ * 加载填充词典片段 * @param charArray */ - void fillSegment(char[] charArray){ + public void fillSegment(char[] charArray){ this.fillSegment(charArray, 0 , charArray.length , 1); } diff --git a/core/src/main/java/org/wltea/analyzer/dic/Dictionary.java b/core/src/main/java/org/wltea/analyzer/dic/Dictionary.java index 80a92da1..a6d60278 100755 --- a/core/src/main/java/org/wltea/analyzer/dic/Dictionary.java +++ b/core/src/main/java/org/wltea/analyzer/dic/Dictionary.java @@ -126,6 +126,10 @@ private Dictionary(Configuration cfg) { } } + public DictSegment get_MainDict() { + return _MainDict; + } + private String getProperty(String key){ if(props!=null){ return props.getProperty(key); diff --git a/core/src/main/java/org/wltea/analyzer/fcp/CombineCharFilter.java b/core/src/main/java/org/wltea/analyzer/fcp/CombineCharFilter.java new file mode 100644 index 00000000..e3ad6c4c --- /dev/null +++ 
b/core/src/main/java/org/wltea/analyzer/fcp/CombineCharFilter.java @@ -0,0 +1,185 @@ +package org.wltea.analyzer.fcp; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.wltea.analyzer.fcp.util.CharacterUtil; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Queue; +import java.util.Set; + +/** + * combine continues english or number + */ +public class CombineCharFilter extends TokenFilter { + public static final int DEFAULT_MAX_WORD_LEN = 255; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + + // used for saving upstream tokens , implemented by Arraylist + private List tokenBodies = null; + private Queue tokenResults = new LinkedList(); + // token 最大长度。防止过长English + private final int maxTokenLen; + + private static final Set numberDot; + static { + Set tmp = new HashSet<>(); + tmp.add("."); // 2.345 + tmp.add(","); // 1,234,567 + numberDot = Collections.unmodifiableSet(tmp); + } + + public CombineCharFilter(TokenStream input) { + super(input); + this.maxTokenLen = DEFAULT_MAX_WORD_LEN; + } + /** + * Construct a token stream filtering the given input. 
+ * + * @param input + * @param maxTokenLen + */ + public CombineCharFilter(TokenStream input, int maxTokenLen) { + super(input); + this.maxTokenLen = maxTokenLen; + } + + @Override + public final boolean incrementToken() throws IOException { + if (tokenBodies == null && input.incrementToken()) { + tokenBodies = new ArrayList<>(); + do { + TokenBody tb = new TokenBody( + termAtt.toString(), + offsetAtt.startOffset(), + offsetAtt.endOffset(), + typeAtt.type()); + tokenBodies.add(tb); + } while (input.incrementToken()); + + combineCharsByType(tokenBodies); + } + if (tokenResults.size() > 0) { + TokenBody body = tokenResults.poll(); + char[] chars = body.termBuffer.toCharArray(); + termAtt.copyBuffer(chars, 0, chars.length); + offsetAtt.setOffset(body.startOffset, body.endOffset); + typeAtt.setType(body.type); + posIncrAtt.setPositionIncrement(1); + return true; + } else { + tokenBodies = null; + } + return false; + } + + private void combineCharsByType(List tokenBodies) { + if (tokenBodies == null || tokenBodies.size() == 0) { + return; + } + // 处理合并 english number useless + List sameType = new ArrayList<>(); + for (int beginI = 0; beginI < tokenBodies.size();) { + int nextTypeIndex = getNextTypeIndex(tokenBodies, beginI); + TokenBody body = composeTokens(tokenBodies, beginI, nextTypeIndex, tokenBodies.get(beginI).type); + sameType.add(body); + beginI = nextTypeIndex; + } + // 继续处理 english number + for (int beginI = 0; beginI < sameType.size();) { + TokenBody current = sameType.get(beginI); + int nextI = beginI + 1; + if (CharacterUtil.CHAR_NUMBER.equals(current.type) || CharacterUtil.CHAR_ENGLISH.equals(current.type)) { + for(; nextI < sameType.size(); nextI++) { + TokenBody next = sameType.get(nextI); + if (CharacterUtil.CHAR_NUMBER.equals(next.type) + || CharacterUtil.CHAR_ENGLISH.equals(next.type)) { + current.type = CharacterUtil.ALPHANUM; + current.termBuffer = current.termBuffer + next.termBuffer; + current.endOffset = next.endOffset; + } else { + break; + } + } + } + beginI = nextI; + tokenResults.add(current); + } + + } + + private TokenBody composeTokens(List tokenBodies, int beginI, int nextTypeIndex, String type) { + StringBuffer buffer = new StringBuffer(); + int startOffset = tokenBodies.get(beginI).startOffset; + int endOffset = tokenBodies.get(nextTypeIndex - 1).endOffset; + for(int i = beginI; i < nextTypeIndex; i++) { + buffer.append(tokenBodies.get(i).termBuffer); + } + return new TokenBody(buffer.toString(), startOffset, endOffset, type); + } + + // 首 TokenBody 的 type 作为整体 + private int getNextTypeIndex(List tokenBodies,final int beginI) { + int currentIndex = beginI; + // 如果 currentIndex 为 tokenBodies 的最后一个位置,直接返回 + if (currentIndex == tokenBodies.size() - 1) { + return currentIndex + 1; + } + TokenBody current = tokenBodies.get(currentIndex); + final String currentWordType = current.type; + int maxIndex = Math.min(currentIndex + maxTokenLen, tokenBodies.size()); + if (CharacterUtil.CHAR_NUMBER.equals(currentWordType)) { + for (currentIndex++; currentIndex < maxIndex; currentIndex++) { + current = tokenBodies.get(currentIndex); + if (CharacterUtil.CHAR_USELESS.equals(current.type) && numberDot.contains(current.termBuffer)) { + if (currentIndex+1 < maxIndex && CharacterUtil.CHAR_NUMBER.equals(tokenBodies.get(currentIndex+1).type)) { + // 改变了整体的 type + tokenBodies.get(beginI).type = CharacterUtil.CHAR_NUMBER_DOT; + } else { + break; + } + } else if (!CharacterUtil.CHAR_NUMBER.equals(current.type)) { + break; + } + } + return currentIndex; + } else if 
(CharacterUtil.CHAR_ENGLISH.equals(currentWordType) || CharacterUtil.CHAR_USELESS.equals(currentWordType)) { + for (currentIndex++; currentIndex < maxIndex; currentIndex++) { + current = tokenBodies.get(currentIndex); + if (!currentWordType.equals(current.type)) { + break; + } + } + return currentIndex; + } else { + return currentIndex + 1; + } + } + + + private static class TokenBody { + String termBuffer; + int startOffset, endOffset; + String type; + + TokenBody(String termBuffer, int startOffset, int endOffset, String type){ + this.termBuffer = termBuffer; + this.startOffset = startOffset; + this.endOffset = endOffset; + this.type = type; + } + } +} diff --git a/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java b/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java new file mode 100644 index 00000000..211c09b0 --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java @@ -0,0 +1,474 @@ +package org.wltea.analyzer.fcp; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.wltea.analyzer.dic.DictSegment; +import org.wltea.analyzer.dic.Dictionary; +import org.wltea.analyzer.dic.Hit; +import org.wltea.analyzer.fcp.tokenattributes.PositionLengthAttribute; +import org.wltea.analyzer.fcp.util.CharacterUtil; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.PriorityQueue; + +/** + * use dict to extend terms + */ +public class ExtendFilter extends TokenFilter { + private static final boolean IS_DEBUG = true; + // 默认入库模式 + public static final boolean DEFAULT_INDEX_MODE = true; + // 默认对于特殊字符采用模糊搜索,扩大搜索范围 + public static final boolean DEFAULT_USELESS_MAPPING = true; + // 默认对于句子的空白进行忽略 + public static final boolean DEFAULT_IGNORE_BLANK = true; + // 默认对于句子的空白进行忽略 + public static final boolean DEFAULT_IGNORE_WHITESPACE = true; + // 默认使用 lcp 的模式,使用最后一个char的position + public static final boolean DEFAULT_USE_FIRST_POSITION = false; + // 在高亮的时候使用 offset + public static final boolean DEFAULT_SHOW_OFFSET = false; + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + // 用于记录每一个term 的position length + private final PositionLengthAttribute lengthAttribute = addAttribute(PositionLengthAttribute.class); + + // used for saving upstream tokens , implemented by Arraylist + private List tokenBodies = null; + //use to save analyzed tokens ,use priority heap save order + PriorityQueue tokenResults = new PriorityQueue(new Comparator(){ + @Override + public int compare(TokenBody o1, TokenBody o2){ +// return o1.position != o2.position ? 
Integer.compare(o1.position, o2.position) : Integer.compare(o2.startOffset, o1.startOffset); + if(o1.position != o2.position) { + return Integer.compare(o1.position, o2.position); + } else if (o2.startOffset != o1.startOffset) { + return Integer.compare(o2.startOffset, o1.startOffset); + } else { + return Integer.compare(o1.endOffset-o1.startOffset, o2.endOffset-o2.startOffset); + } + } + }); + // 记录上一个 term 的position ,用于计算 positionIncrement + private int prePosition = -1; + + private final boolean indexMode; + // 对于上游的 分词结果 上个 end_offset 和 下一个 token的 start_offset 不相等。 像 “成 功” 之间有空格,该参数决定是否忽略空格组词, 默认为true,忽略之间的 空白 + private boolean ignoreBlank = true; + // 是否使用 first char position ,默认使用,如果为 false,则变为 lcp_analyzer + private boolean useFirstPos = true; + // 特殊字符的映射,默认为 true 表示模糊匹配特殊字符。如果设置为 false ,将会把原始的char放到最终分词结果中。 + private boolean uselessMapping = true; + // 入库模式下不显示,search 模式下显示offset,在 highlight 的时候也开启 + private boolean showOffset = false; + + + public ExtendFilter setIgnoreBlank(boolean ignoreBlank) { + this.ignoreBlank = ignoreBlank; + return this; + } + + + public ExtendFilter setUseFirstPos(boolean useFirstPos) { + this.useFirstPos = useFirstPos; + return this; + } + + public ExtendFilter setUselessMapping(boolean uselessMapping) { + this.uselessMapping = uselessMapping; + return this; + } + + public ExtendFilter setShowOffset(boolean showOffset) { + this.showOffset = showOffset; + return this; + } + + + /** + * Construct a token stream filtering the given input. + * + * @param input + */ + public ExtendFilter(TokenStream input) { + this(input, DEFAULT_INDEX_MODE); + } + + public ExtendFilter(TokenStream input, boolean indexMode) { + super(input); + this.indexMode = indexMode; + } + + @Override + public final boolean incrementToken() throws IOException { + if (tokenBodies == null && input.incrementToken()) { + tokenBodies = new ArrayList<>(); + int position = -1; + do { + TokenBody tb= new TokenBody(); + // TODO lcp analyzer 入库的特殊处理方式(不支持 offset 和 term_vector 存储方式),否则就要改变 lucene源码。 + tb.startOffset = showOffset ? offsetAtt.startOffset() : 0; + tb.endOffset = showOffset ? 
offsetAtt.endOffset() : 0; + // blank 类型会被舍弃,position不变 + tb.termBuffer = termAtt.toString(); + // 下面是处理 position 和 type的赋值,单个 term,没有 startPosition 和 endPosition + if (CharacterUtil.CHAR_USELESS.equals(typeAtt.type())) { + if (isAllBlank(tb.termBuffer) && this.ignoreBlank) { + // 表示沿用上一个 position,下面将会被舍弃掉 + tb.position = position; + tb.type = CharacterUtil.CHAR_BLANK; + tb.termBuffer = ""; + } else { + position += posIncrAtt.getPositionIncrement(); + tb.position = position; + tb.type = typeAtt.type(); + if (uselessMapping) { + tb.termBuffer = "#"; // 无特殊含义,将特殊字符统一映射为 # 方便查询 + } + } + } else { + position += posIncrAtt.getPositionIncrement(); + tb.position = position; + tb.type = typeAtt.type(); + } + tokenBodies.add(tb); + } while (input.incrementToken()); + + extendTerms(tokenBodies, indexMode, ignoreBlank, useFirstPos); + } + if (tokenResults.size() > 0) { + TokenBody body = tokenResults.poll(); + + posIncrAtt.setPositionIncrement(body.position - prePosition); + prePosition = body.position; + char[] chars = body.termBuffer.toCharArray(); + termAtt.copyBuffer(chars, 0, chars.length); + offsetAtt.setOffset(body.startOffset, body.endOffset); + typeAtt.setType(body.type); + if (!indexMode) { + // 计算当前combine term 的跨度,占用了多少个 term + lengthAttribute.setPositionLength(body.endPosition - body.startPosition + 1); + } + return true; + } else { + tokenBodies = null; + prePosition = -1; + } + return false; + } + + + /** + * 判断参数是否全部由空白字符组成 + * @param s + * @return + */ + private boolean isAllBlank(String s) { + return s.trim().length() == 0; + } + + private void extendTerms(List tokenBodies, boolean indexMode, boolean ignoreBlank, boolean useFirstPos) { + if (tokenBodies == null || tokenBodies.size() == 0) { + return; + } + for (int beginI = 0; beginI < tokenBodies.size(); beginI++) { + TokenBody tokenBody = tokenBodies.get(beginI); + if (!tokenBody.type.equals(CharacterUtil.CHAR_BLANK)) { + // 处理当前char, 但要考虑向后扩展,得到以当前位置开始 以 endList 中位置结束的一系列term, + List endList = getCurrentEndList(tokenBodies, beginI, ignoreBlank); + // 默认在 index 模式下,一股脑全部放到倒排中(index 模式对性能敏感,所以必须保证) + if (!indexMode) { + tokenBody.startPosition = tokenBody.position; + tokenBody.endPosition = tokenBody.position; + } + tokenResults.add(tokenBody); + for (Integer endI : endList) { + TokenBody tb= new TokenBody(); + tb.termBuffer = combineTermBuffer(tokenBodies, beginI, endI); + tb.startOffset = tokenBodies.get(beginI).startOffset; + tb.endOffset = tokenBodies.get(endI).endOffset; + // search 模式下需要记录组合 term 前后的 position + if (!indexMode) { + tb.startPosition = tokenBodies.get(beginI).position; + tb.endPosition = tokenBodies.get(endI).position; + } + if (useFirstPos) { + tb.position = tokenBodies.get(beginI).position; + } else { + tb.position = tokenBodies.get(endI).position; + } + tb.type = ""; + tokenResults.add(tb); + } + } + } + // 到这里如果是index 模式的话,已经可以结束了; + // 如果是 search模式,需要做歧义处理(如果有的话, 使用 类型的char 作为天然分割句子) + if (!indexMode && tokenResults.size() > 0) { + // 在search 模式下,采用 ik_smart 的逻辑进行语义分割,一个重大的意义:引入了语义分割 + // 1,ik 使用没有语义重叠的那个 char 作为分割点,只作用于有字符重叠的部分 + // 2,由于和 index 模式使用相同的 向后扩展逻辑,所以search是index 的子集 + // 3,search 模式下,不会涉及mapping的扩展引入 + // 4,search 模式下,使用 startPosition 来进行判断扩是否有歧义 + + // 用于保存多个term的组合形式,逆序。:采用动态编程思想,完成快速组合 + PriorityQueue combineTerms = new PriorityQueue(new Comparator(){ + @Override + public int compare(TokenBody o1, TokenBody o2){ + // 顺序有重要意义 + return o1.startPosition != o2.startPosition ? 
+ Integer.compare(o1.startPosition, o2.startPosition) + : Integer.compare(o2.endPosition, o1.endPosition); + } + }); + // 用于保存单个term的形式(最后将保存全部的结果) + Map singleTerm = new HashMap<>(); + + // 将切分结果重新排序, 并清空之前的处理结果 + int startPosition = Integer.MAX_VALUE; + int endPosition = Integer.MIN_VALUE; + while (tokenResults.size() > 0) { + TokenBody t = tokenResults.poll(); + if (t.startPosition == t.endPosition) { + // 单个 term + singleTerm.put(t.position, t); + startPosition = Math.min(startPosition, t.startPosition); + endPosition = Math.max(endPosition, t.endPosition); + } else { + // 组合出来的term,不参与歧义判断,仅仅用于歧义判断后的填补那些空白的 position + combineTerms.add(t); + } + } + + // 处理分词,没有歧义的直接放到结果中,有歧义的处理完之后放到结果中 + PriorityQueue searchReverseOrder = new PriorityQueue(new Comparator(){ + @Override + public int compare(TokenBody o1, TokenBody o2){ + // 顺序有重要意义 + return o1.startPosition != o2.startPosition ? + Integer.compare(o2.startPosition, o1.startPosition) + : Integer.compare(o1.endPosition, o2.endPosition); + } + }); + + // 在处理一段歧义时,控制前后范围, 第一次就是最开始的范围 + int maxExtend = Integer.MIN_VALUE; // 边界包含 + for (TokenBody tb : combineTerms) { + if (searchReverseOrder.size() == 0) { + searchReverseOrder.add(tb); + maxExtend = tb.endPosition; + continue; + } + + if (maxExtend < tb.startPosition) { + // 表示当前term 与之前的切分没有歧义 + if (searchReverseOrder.size() == 1) { + final TokenBody body = searchReverseOrder.poll(); + singleTerm.put(body.startPosition, body); + } else { + // 这里先处理掉之前有歧义的部分, + final List arbitrator = arbitrator(searchReverseOrder); + for(TokenBody body : arbitrator) { + singleTerm.put(body.startPosition, body); + } + } + } + searchReverseOrder.add(tb); + maxExtend = Math.max(maxExtend, tb.endPosition); + } + // 处理最后的歧义 + if (searchReverseOrder.size() == 1) { + final TokenBody body = searchReverseOrder.poll(); + singleTerm.put(body.startPosition, body); + } else if(searchReverseOrder.size() > 1){ + final List arbitrator = arbitrator(searchReverseOrder); + for(TokenBody body : arbitrator) { + singleTerm.put(body.startPosition, body); + } + } + // endPosition 的用途 + while (startPosition <= endPosition) { + if (singleTerm.containsKey(startPosition)) { + final TokenBody body = singleTerm.get(startPosition); + tokenResults.add(body); + startPosition = body.endPosition + 1; + } else { + startPosition++; + } + } + } + } + + /** + * 处理有歧义的token, + * @param searchReverseOrder 为倒序的token + * @return + */ + private List arbitrator(PriorityQueue searchReverseOrder) { + Map> positionMap = new HashMap<>(); + int maxIndex = -1; + int minIndex = -1; + while (searchReverseOrder.size() > 0) { + final TokenBody body = searchReverseOrder.poll(); + if (searchReverseOrder.size() == 0) { + // 要处理的最开始的位置,也就是 searchReverseOrder 的最后一个 + minIndex = body.startPosition; + } + if (maxIndex == -1) { + // 要处理的最后的位置,也就是 searchReverseOrder 的第一个 + maxIndex = body.startPosition; + } + // 下面给当前的 token 添加 child + int currentMax = maxIndex; + for (int i = body.endPosition + 1; i <= currentMax; i++) { + if (positionMap.containsKey(i)) { + final List bodies = positionMap.get(i); + final TokenBody minLengthBody = bodies.get(0); // 表示取其后紧挨着的最短token作为结束位置 + if (currentMax == maxIndex) { + currentMax = minLengthBody.endPosition; // 表示 minLengthBody 后面的 term 不可以作为 child了 + } + if (body.child == null) { + body.child = new ArrayList<>(); + } + body.child.addAll(positionMap.get(i)); + } + } + // 将 token放到结果中 + if (positionMap.containsKey(body.startPosition) == false) { + positionMap.put(body.startPosition, new ArrayList<>()); + } + 
positionMap.get(body.startPosition).add(body); + +// if (IS_DEBUG) { +// for(int i = 0; i < maxIndex + 10; i++) { +// String s = "- "; +// if (body.startPosition <= i && i <= body.endPosition) { +// s = "# "; +// } +// System.out.print(s); +// } +// System.out.println(); +// } + } + List topOptions = new ArrayList<>(); + + final TokenBody firstMinLength = positionMap.get(minIndex).get(0); + for(int i = firstMinLength.startPosition; i <= firstMinLength.endPosition; i++) { + if (positionMap.containsKey(i)) { + topOptions.addAll(positionMap.get(i)); + } + } + for (TokenBody t : topOptions) { + System.out.println(t); + } + List result = new ArrayList<>(); + final OptionPath bestPath = chooseBestPath(topOptions); + for (int i = 0; i < bestPath.size ; i++) { + int startP = bestPath.getValueByIndex(2 * i); + int endP = bestPath.getValueByIndex(2 * i + 1); + final List bodyList = positionMap.get(startP); + for(TokenBody tb : bodyList) { + if (tb.startPosition == startP && tb.endPosition == endP) { + result.add(tb); + break; + } + } + } + return result; + } + + // options 本身为已经处理好的结构,使用引用指向下级关系 + private OptionPath chooseBestPath(List options) { + // 使用 PriorityQueue,因为只是需要获取最小的那一个,其后的严格有序不是必须的 + PriorityQueue allOptionPath = new PriorityQueue(new Comparator () { + @Override + public int compare(OptionPath o1, OptionPath o2) { + return o2.compareTo(o1); + } + }); + + for(TokenBody tokenBody : options) { + OptionPath path = new OptionPath(); + path.addElement(tokenBody.startPosition, tokenBody.endPosition); + findNextPath(allOptionPath, tokenBody, path); + } + final OptionPath bestPath = allOptionPath.poll(); + return bestPath; + } + + private void findNextPath(PriorityQueue allOptionPath, TokenBody tokenBody, OptionPath parentPath) { + if (tokenBody.child == null) { + // 路径的最后,结束递归 + allOptionPath.add(parentPath); + return; + } + for(TokenBody child : tokenBody.child) { + // 复制parent path + OptionPath childPath = parentPath.copy(); + childPath.addElement(child.startPosition, child.endPosition); + findNextPath(allOptionPath, child, childPath); + } + } + + /** + * 以 begin 开始,但是不包含 begin + * @param tokenBodies + * @param begin + * @param ignoreBlank + * @return + */ + private List getCurrentEndList(List tokenBodies, int begin, boolean ignoreBlank) { + List endList = new ArrayList<>(); + DictSegment dict = Dictionary.getSingleton().get_MainDict(); + StringBuffer sb = new StringBuffer(tokenBodies.get(begin).termBuffer); + for (int j = begin+1; j < tokenBodies.size(); j++) { + TokenBody current = tokenBodies.get(j); + if (current.type.equals(CharacterUtil.CHAR_BLANK)) { + if(ignoreBlank) { + continue; + } else { + break; + } + } + // 处理 中文情况 + sb.append(current.termBuffer); + Hit hit = dict.match(sb.toString().toCharArray()); + if (hit.isUnmatch()) { + break; + } + if (hit.isMatch()) { + endList.add(j); + } + } +// System.out.println(endList); + return endList; + } + + /** + * 拼接 [begin, end] termBuffer + * @param tokenBodies + * @param begin + * @param end + * @return + */ + private String combineTermBuffer(List tokenBodies, int begin, int end) { + StringBuffer sb = new StringBuffer(tokenBodies.get(begin).termBuffer); + for(int i = begin+1; i <= end; i++) { + sb.append(tokenBodies.get(i).termBuffer); + } + return sb.toString(); + } + +} diff --git a/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java b/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java new file mode 100644 index 00000000..e0e28f13 --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java @@ -0,0 
+1,131 @@ +package org.wltea.analyzer.fcp; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.ngram.NGramTokenizer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.synonym.SynonymMap; + +import java.util.Arrays; +import java.util.List; + +public final class FCPAnalyzer extends Analyzer { + /** Default maximum allowed token length */ + public static final boolean DEFAULT_SPLIT_COMPLETE = false; + + // 决定分词时对 英文、数字 是否进行完全切分,默认为 false,表示数字和英文为一个整体,不会继续向下切分,完全切分的话 splitComplete = true + private boolean splitComplete = false; + // 默认为建立 索引模式, 如果为 查询模式 indexMode = false + private final boolean indexMode; + // 特殊字符的映射,默认为 true 表示模糊匹配特殊字符。如果设置为 false ,将会把原始的char放到最终分词结果中。 + private boolean uselessMapping = true; + // 默认文本是正确文本,其中的空白是有意义的,不能忽略空白。如果认为原文中的空白由于ETL错误引入,应该忽略空白。 + private boolean ignoreBlank = false; + // 是否使用 first char position ,默认使用,如果为 false,则变为 lcp_analyzer + private boolean useFirstPos = true; + // 是否显示 offset,默认随着 indexMode 变化 + private boolean showOffset; + + private int maxTokenLength = CombineCharFilter.DEFAULT_MAX_WORD_LEN; + + public FCPAnalyzer() { + this(ExtendFilter.DEFAULT_INDEX_MODE); + } + public FCPAnalyzer(boolean indexMode) { + this.indexMode = indexMode; + // 改变 showOffset 的默认值 + if (indexMode) { + showOffset = false; + } else { + showOffset = true; + } + } + + public FCPAnalyzer setIgnoreBlank(boolean ignoreBlank) { + this.ignoreBlank = ignoreBlank; + return this; + } + + public FCPAnalyzer setUselessMapping(boolean uselessMapping) { + this.uselessMapping = uselessMapping; + return this; + } + + public FCPAnalyzer setSplitComplete(boolean splitComplete) { + this.splitComplete = splitComplete; + return this; + } + + public FCPAnalyzer setShowOffset(boolean showOffset) { + this.showOffset = showOffset; + return this; + } + + public FCPAnalyzer setUseFirstPos(boolean useFirstPos) { + this.useFirstPos = useFirstPos; + return this; + } + + /** + * Set the max allowed token length. Tokens larger than this will be chopped + * up at this token length and emitted as multiple tokens. If you need to + * skip such large tokens, you could increase this max length, and then + * use {@code LengthFilter} to remove long tokens. The default is + * {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}. 
+ */ + public FCPAnalyzer setMaxTokenLength(int length) { + maxTokenLength = length; + return this; + } + + /** Returns the current maximum token length + * + * @see #setMaxTokenLength */ + public int getMaxTokenLength() { + return maxTokenLength; + } + + public boolean isIgnoreBlank() { + return ignoreBlank; + } + + + public boolean isIndexMode() { + return indexMode; + } + + public boolean isUseFirstPos() { + return useFirstPos; + } + + @Override + protected TokenStreamComponents createComponents(final String fieldName) { + final Tokenizer src = new NGramTokenizer(1, 1); + TokenStream tok = new FormatFilter(src); + if (!splitComplete) { + tok = new CombineCharFilter(tok, maxTokenLength); + } + + tok = new ExtendFilter(tok, indexMode) + .setShowOffset(showOffset) + .setIgnoreBlank(ignoreBlank) + .setUseFirstPos(useFirstPos) + .setUselessMapping(uselessMapping); + return new TokenStreamComponents(src, tok); + } + + @Override + public String toString() { + return "FCPAnalyzer{" + + "splitComplete=" + splitComplete + + ", indexMode=" + indexMode + + ", showOffset=" + showOffset + + ", uselessMapping=" + uselessMapping + + ", ignoreBlank=" + ignoreBlank + + ", useFirstPos=" + useFirstPos + + ", maxTokenLength=" + maxTokenLength + + '}'; + } + +} diff --git a/core/src/main/java/org/wltea/analyzer/fcp/FormatFilter.java b/core/src/main/java/org/wltea/analyzer/fcp/FormatFilter.java new file mode 100644 index 00000000..e85d6cb1 --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/FormatFilter.java @@ -0,0 +1,51 @@ +package org.wltea.analyzer.fcp; + +import org.apache.lucene.analysis.CharacterUtils; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.wltea.analyzer.fcp.util.CharacterUtil; + +import java.io.IOException; + +/** + * 英文转小写 + * 字符的类型处理 + */ +public class FormatFilter extends TokenFilter { + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + + /** + * Construct a token stream filtering the given input. 
+ * + * @param input + */ + public FormatFilter(TokenStream input) { + super(input); + } + + @Override + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + String s = termAtt.toString(); + // 如果从 ngram 1 的 Tokenizer 得到的 token 应该length 都为 1 + if (s.length() == 1) { + int c = s.codePointAt(0); + typeAtt.setType(CharacterUtil.identifyCharType(c)); + c = CharacterUtil.regularize(c); + char[] chars = Character.toChars(c); + termAtt.copyBuffer(chars, 0, chars.length); + } else { + // 对英文进行 lower case + CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length()); + } + return true; + } else { + return false; + } + } + +} diff --git a/core/src/main/java/org/wltea/analyzer/fcp/OptionPath.java b/core/src/main/java/org/wltea/analyzer/fcp/OptionPath.java new file mode 100644 index 00000000..659469de --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/OptionPath.java @@ -0,0 +1,102 @@ +package org.wltea.analyzer.fcp; + +import java.util.Arrays; + +/** + * present a no conflict path for choose + */ +public class OptionPath implements Comparable { + private static final int DEFAULT_CAPACITY = 10; + int[] groups; + int size = 0; + int payloadLength = 0; + + OptionPath() { + groups = new int[DEFAULT_CAPACITY]; + } + + OptionPath(int capacity) { + assert capacity > 0; + groups = new int[capacity]; + } + + private OptionPath(int size, int[] groups) { + this.size = size; + int newCapacity = Math.max(size * 2, groups.length); + this.groups = Arrays.copyOf(groups, newCapacity); + } + + OptionPath copy() { + return new OptionPath(this.size, this.groups); + } + + void addElement(int startPosition, int endPosition) { + assert endPosition > startPosition; + this.size++; + if (this.size*2 >= this.groups.length) { + this.groups = Arrays.copyOf(this.groups, this.groups.length * 2); + } + this.payloadLength += (endPosition - startPosition + 1); + this.groups[size*2 - 2] = startPosition; + this.groups[size*2 - 1] = endPosition; + } + + int getValueByIndex(int index) { + assert -1 < index && index < this.groups.length; + return this.groups[index]; + } + + int getEndPosition(int startPosition) { + int endPosition = -1; + for(int i = 0; i < size && this.groups[2*i] <= startPosition; i++) { + if (startPosition == this.groups[2*i]) { + endPosition = this.groups[2*i + 1]; + } + } + return endPosition; + } + + int getPathLength() { + return this.groups[this.size*2+1] - this.groups[0]; + } + + int getPathEnd() { + return this.groups[size*2+1]; + } + + int getXWeight() { + int product = 1; + for(int i = 0; i < size; i++) { + product *= (this.groups[2*i+1] - this.groups[2*i]); + } + return product; + } + + int getPWeight() { + int pWeight = 0; + int p = 0; + for(int i = 0; i < size; i++) { + p++; + pWeight += p * (this.groups[2*i+1] - this.groups[2*i]); + } + return pWeight; + } + + // ik_smart 解决歧义问题的实现逻辑 + @Override + public int compareTo(OptionPath o) { + if (this.payloadLength != o.payloadLength) { + return Integer.compare(this.payloadLength, o.payloadLength); + } else if (this.size != o.size) { + return Integer.compare(this.size, o.size); + } else if (this.getPathLength() != o.getPathLength()) { + return Integer.compare(this.getPathLength(), o.getPathLength()); + } else if(this.getPathEnd() != o.getPathEnd()) { + return Integer.compare(this.getPathEnd(), o.getPathEnd()); + } else if (this.getXWeight() != o.getXWeight()) { + return Integer.compare(this.getXWeight(), o.getXWeight()); + } else { + return Integer.compare(this.getPWeight(), o.getPWeight()); + } + } +} diff 
--git a/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java b/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java new file mode 100644 index 00000000..587ca1d9 --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java @@ -0,0 +1,47 @@ +package org.wltea.analyzer.fcp; + +import java.util.List; + +/** + * compose term + */ +class TokenBody { + String termBuffer; + int startOffset, endOffset; + // position 用于表示在 elasticsearch 分词时得到的 position, 通过 curr.position - prev.position 得到 positionIncrement + // startPosition、endPosition 用于收集 那些在 词库中 扩展出来的 token,主要给 ik_smart 使用 + int position, startPosition = -1, endPosition = -1; + String type; + + List child; + + TokenBody(){} + TokenBody(String termBuffer, int startOffset, int endOffset, int position, int startPosition, int endPosition, String type){ + this.termBuffer = termBuffer; + this.startOffset = startOffset; + this.endOffset = endOffset; + this.position = position; + this.startPosition = startPosition; + this.endPosition = endPosition; + this.type = type; + } + + + TokenBody copy() { + return new TokenBody(termBuffer, startOffset, endOffset, position, startPosition, endPosition, ""); + } + + @Override + public String toString() { + return "TokenBody{" + + "termBuffer='" + termBuffer + '\'' + + ", startOffset=" + startOffset + + ", endOffset=" + endOffset + + ", position=" + position + + ", startPosition=" + startPosition + + ", endPosition=" + endPosition + + ", type='" + type + '\'' + + ", child=" + child + + '}'; + } +} diff --git a/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java new file mode 100644 index 00000000..ba9ae2ff --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java @@ -0,0 +1,33 @@ +package org.wltea.analyzer.fcp.tokenattributes; + +import org.apache.lucene.util.Attribute; + +/** Determines how many positions this + * token spans. Very few analyzer components actually + * produce this attribute, and indexing ignores it, but + * it's useful to express the graph structure naturally + * produced by decompounding, word splitting/joining, + * synonym filtering, etc. + * + *
NOTE: this is optional, and most analyzers + * don't change the default value (1). */ + +public interface PositionLengthAttribute extends Attribute { + /** + * Set the position length of this Token. + *
+ * The default value is one. + * @param positionLength how many positions this token + * spans. + * @throws IllegalArgumentException if positionLength + * is zero or negative. + * @see #getPositionLength() + */ + public void setPositionLength(int positionLength); + + /** Returns the position length of this Token. + * @see #setPositionLength + */ + public int getPositionLength(); +} + diff --git a/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java new file mode 100644 index 00000000..5aa230c0 --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java @@ -0,0 +1,62 @@ +package org.wltea.analyzer.fcp.tokenattributes; + + +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.AttributeReflector; + +/** Default implementation of {@link PositionLengthAttribute}. */ +public class PositionLengthAttributeImpl extends AttributeImpl implements PositionLengthAttribute, Cloneable { + private int positionLength = 1; + + /** Initializes this attribute with position length of 1. */ + public PositionLengthAttributeImpl() {} + + @Override + public void setPositionLength(int positionLength) { + if (positionLength < 1) { + throw new IllegalArgumentException("Position length must be 1 or greater; got " + positionLength); + } + this.positionLength = positionLength; + } + + @Override + public int getPositionLength() { + return positionLength; + } + + @Override + public void clear() { + this.positionLength = 1; + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } + + if (other instanceof PositionLengthAttributeImpl) { + PositionLengthAttributeImpl _other = (PositionLengthAttributeImpl) other; + return positionLength == _other.positionLength; + } + + return false; + } + + @Override + public int hashCode() { + return positionLength; + } + + @Override + public void copyTo(AttributeImpl target) { + PositionLengthAttribute t = (PositionLengthAttribute) target; + t.setPositionLength(positionLength); + } + + @Override + public void reflectWith(AttributeReflector reflector) { + reflector.reflect(PositionLengthAttribute.class, "positionLength", positionLength); + } +} + diff --git a/core/src/main/java/org/wltea/analyzer/fcp/util/CharacterUtil.java b/core/src/main/java/org/wltea/analyzer/fcp/util/CharacterUtil.java new file mode 100644 index 00000000..0f618963 --- /dev/null +++ b/core/src/main/java/org/wltea/analyzer/fcp/util/CharacterUtil.java @@ -0,0 +1,124 @@ +/** + * IK 中文分词 版本 5.0 + * IK Analyzer release 5.0 + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * 源代码由林良益(linliangyi2005@gmail.com)提供 + * 版权声明 2012,乌龙茶工作室 + * provided by Linliangyi and copyright 2012 by Oolong studio + * + * 字符集识别工具类 + */ +package org.wltea.analyzer.fcp.util; + +import java.util.HashMap; +import java.util.Map; + +/** + * 字符集识别工具类 + */ +public class CharacterUtil { + + public static final String CHAR_USELESS = ""; + + public static final String CHAR_ENGLISH = ""; + + public static final String CHAR_NUMBER = ""; + + public static final String CHAR_NUMBER_DOT = ""; + + public static final String ALPHANUM = ""; + + public static final String CHAR_CHINESE = ""; + + public static final String COMBINE_WORD = ""; + + public static final String CHAR_MAPPING = ""; + + public static final String CHAR_BLANK = ""; + + // pinyin + public static final String CHAR_PINYIN = ""; + // pinyin 前缀 + public static final String CHAR_PINYIN_PRE = ""; + + private static Map order; + static { + // value 越小,排序越靠前,用于区分在同一个 position 上的不同 type 之间的排序 + order = new HashMap<>(); + order.put(CHAR_CHINESE, 0); + order.put(CHAR_PINYIN_PRE, 5); + order.put(CHAR_PINYIN, 10); + + order.put(CHAR_USELESS, 0); + order.put(CHAR_MAPPING, 10); + } + + public static int getOrderByType(String type) { + return order.getOrDefault(type, 0); + } + + + + /** + * 识别字符类型 + * @param input + * @return int CharacterUtil定义的字符类型常量 + */ + public static String identifyCharType(int input){ + + if (input >= '0' && input <= '9') { + return CHAR_NUMBER; + } else if ((input >= 'a' && input <= 'z') + || (input >= 'A' && input <= 'Z')) { + return CHAR_ENGLISH; + } else { + Character.UnicodeBlock ub = Character.UnicodeBlock.of(input); + + if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS + || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS + || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A){ + //目前已知的中文字符UTF-8集合 + return CHAR_CHINESE; + + } + } + //其他的不做处理的字符 + return CHAR_USELESS; + + } + + /** + * 进行字符规格化(全角转半角,大写转小写处理) + * @param input + * @return char + */ + public static int regularize(int input){ + if (input == 12288) { + input = 32; + + }else if (input > 65280 && input < 65375) { + input = input - 65248; + + }else if (input >= 'A' && input <= 'Z') { + input += 32; + } + + + return input; + } +} diff --git a/elasticsearch/src/main/java/com/infinilabs/ik/elasticsearch/AnalysisIkPlugin.java b/elasticsearch/src/main/java/com/infinilabs/ik/elasticsearch/AnalysisIkPlugin.java index 54ee735e..f906af6d 100644 --- a/elasticsearch/src/main/java/com/infinilabs/ik/elasticsearch/AnalysisIkPlugin.java +++ b/elasticsearch/src/main/java/com/infinilabs/ik/elasticsearch/AnalysisIkPlugin.java @@ -33,6 +33,12 @@ public Map { + private final FCPAnalyzer analyzer; + + /** + * indexMode 作为重要的参数, + * @param indexSettings + * @param env + * @param name + * @param settings + * @param indexMode + */ + public FCPAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings, boolean indexMode) { + super(name, settings); + boolean splitComplete = settings.getAsBoolean("split_complete", FCPAnalyzer.DEFAULT_SPLIT_COMPLETE); + int maxTokenLength = settings.getAsInt("max_token_length", CombineCharFilter.DEFAULT_MAX_WORD_LEN); + boolean uselessMapping = settings.getAsBoolean("useless_mapping", ExtendFilter.DEFAULT_USELESS_MAPPING); + boolean ignoreBlank = settings.getAsBoolean("ignore_blank", ExtendFilter.DEFAULT_IGNORE_BLANK); + boolean useFirstPos = settings.getAsBoolean("use_first_position", ExtendFilter.DEFAULT_USE_FIRST_POSITION); + Boolean showOffset = 
settings.getAsBoolean("show_offset", null); + analyzer = new FCPAnalyzer(indexMode); + if (showOffset != null) { + analyzer.setShowOffset(showOffset); + } + analyzer.setSplitComplete(splitComplete); + analyzer.setUselessMapping(uselessMapping); + analyzer.setMaxTokenLength(maxTokenLength); + analyzer.setIgnoreBlank(ignoreBlank); + analyzer.setUseFirstPos(useFirstPos); + } + + public static FCPAnalyzerProvider getFCPAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { + boolean indexMode = settings.getAsBoolean("index_mode", ExtendFilter.DEFAULT_INDEX_MODE); + return new FCPAnalyzerProvider(indexSettings, env, name, settings, indexMode); + } + + public static FCPAnalyzerProvider getFCPIndexAnalyzer(IndexSettings indexSettings, Environment env, String name, Settings settings) { + boolean indexMode = true; + boolean useFirstPos = true; + FCPAnalyzerProvider provider = new FCPAnalyzerProvider(indexSettings, env, name, settings, indexMode); + FCPAnalyzer fcpAnalyzer = provider.get(); + fcpAnalyzer.setUseFirstPos(useFirstPos); + return provider; + } + + public static FCPAnalyzerProvider getFCPSearchAnalyzer(IndexSettings indexSettings, Environment env, String name, Settings settings) { + boolean indexMode = false; + boolean useFirstPos = true; + FCPAnalyzerProvider provider = new FCPAnalyzerProvider(indexSettings, env, name, settings, indexMode); + FCPAnalyzer fcpAnalyzer = provider.get(); + fcpAnalyzer.setUseFirstPos(useFirstPos); + return provider; + } + + public static FCPAnalyzerProvider getLCPIndexAnalyzer(IndexSettings indexSettings, Environment env, String name, Settings settings) { + boolean indexMode = true; + FCPAnalyzerProvider provider = new FCPAnalyzerProvider(indexSettings, env, name, settings, indexMode); + FCPAnalyzer fcpAnalyzer = provider.get(); + fcpAnalyzer.setUseFirstPos(false); + return provider; + } + + public static FCPAnalyzerProvider getLCPSearchAnalyzer(IndexSettings indexSettings, Environment env, String name, Settings settings) { + boolean indexMode = false; + FCPAnalyzerProvider provider = new FCPAnalyzerProvider(indexSettings, env, name, settings, indexMode); + FCPAnalyzer fcpAnalyzer = provider.get(); + fcpAnalyzer.setUseFirstPos(false); + return provider; + } + + @Override + public FCPAnalyzer get() { + return analyzer; + } +} \ No newline at end of file From 5e5445f29761de91c3d7a20e1ebc70b2c9daaf56 Mon Sep 17 00:00:00 2001 From: "hao.mou" Date: Sun, 28 Apr 2024 10:06:27 +0800 Subject: [PATCH 2/2] add new analyzer , simple implementation --- .../org/wltea/analyzer/fcp/ExtendFilter.java | 271 +++--------------- .../org/wltea/analyzer/fcp/FCPAnalyzer.java | 5 +- .../org/wltea/analyzer/fcp/TokenBody.java | 5 +- .../PositionLengthAttribute.java | 1 + .../PositionLengthAttributeImpl.java | 1 + .../analyzer/fcp/Configuration4Test.java | 29 ++ .../wltea/analyzer/fcp/FCPAnalyzerTest.java | 80 ++++++ 7 files changed, 148 insertions(+), 244 deletions(-) create mode 100644 core/src/test/java/org/wltea/analyzer/fcp/Configuration4Test.java create mode 100644 core/src/test/java/org/wltea/analyzer/fcp/FCPAnalyzerTest.java diff --git a/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java b/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java index 211c09b0..be2451a1 100644 --- a/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java +++ b/core/src/main/java/org/wltea/analyzer/fcp/ExtendFilter.java @@ -9,30 +9,24 @@ import org.wltea.analyzer.dic.DictSegment; import org.wltea.analyzer.dic.Dictionary; import 
org.wltea.analyzer.dic.Hit; -import org.wltea.analyzer.fcp.tokenattributes.PositionLengthAttribute; import org.wltea.analyzer.fcp.util.CharacterUtil; import java.io.IOException; import java.util.ArrayList; import java.util.Comparator; -import java.util.HashMap; import java.util.List; -import java.util.Map; import java.util.PriorityQueue; /** * use dict to extend terms */ public class ExtendFilter extends TokenFilter { - private static final boolean IS_DEBUG = true; // 默认入库模式 public static final boolean DEFAULT_INDEX_MODE = true; // 默认对于特殊字符采用模糊搜索,扩大搜索范围 public static final boolean DEFAULT_USELESS_MAPPING = true; // 默认对于句子的空白进行忽略 public static final boolean DEFAULT_IGNORE_BLANK = true; - // 默认对于句子的空白进行忽略 - public static final boolean DEFAULT_IGNORE_WHITESPACE = true; // 默认使用 lcp 的模式,使用最后一个char的position public static final boolean DEFAULT_USE_FIRST_POSITION = false; // 在高亮的时候使用 offset @@ -42,8 +36,6 @@ public class ExtendFilter extends TokenFilter { private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); - // 用于记录每一个term 的position length - private final PositionLengthAttribute lengthAttribute = addAttribute(PositionLengthAttribute.class); // used for saving upstream tokens , implemented by Arraylist private List tokenBodies = null; @@ -123,7 +115,7 @@ public final boolean incrementToken() throws IOException { tb.endOffset = showOffset ? offsetAtt.endOffset() : 0; // blank 类型会被舍弃,position不变 tb.termBuffer = termAtt.toString(); - // 下面是处理 position 和 type的赋值,单个 term,没有 startPosition 和 endPosition + // 下面是处理 position 和 type的赋值 if (CharacterUtil.CHAR_USELESS.equals(typeAtt.type())) { if (isAllBlank(tb.termBuffer) && this.ignoreBlank) { // 表示沿用上一个 position,下面将会被舍弃掉 @@ -135,7 +127,7 @@ public final boolean incrementToken() throws IOException { tb.position = position; tb.type = typeAtt.type(); if (uselessMapping) { - tb.termBuffer = "#"; // 无特殊含义,将特殊字符统一映射为 # 方便查询 + tb.termBuffer = "#"; // 无特殊含义,将特殊字符统一映射为 # 方便查询, 否则特殊字符也是需要精准匹配 } } } else { @@ -157,10 +149,6 @@ public final boolean incrementToken() throws IOException { termAtt.copyBuffer(chars, 0, chars.length); offsetAtt.setOffset(body.startOffset, body.endOffset); typeAtt.setType(body.type); - if (!indexMode) { - // 计算当前combine term 的跨度,占用了多少个 term - lengthAttribute.setPositionLength(body.endPosition - body.startPosition + 1); - } return true; } else { tokenBodies = null; @@ -171,7 +159,7 @@ public final boolean incrementToken() throws IOException { /** - * 判断参数是否全部由空白字符组成 + * 判断参数是否全部由空白字符(空格、制表符、换行……)组成 * @param s * @return */ @@ -188,237 +176,44 @@ private void extendTerms(List tokenBodies, boolean indexMode, boolean if (!tokenBody.type.equals(CharacterUtil.CHAR_BLANK)) { // 处理当前char, 但要考虑向后扩展,得到以当前位置开始 以 endList 中位置结束的一系列term, List endList = getCurrentEndList(tokenBodies, beginI, ignoreBlank); - // 默认在 index 模式下,一股脑全部放到倒排中(index 模式对性能敏感,所以必须保证) - if (!indexMode) { - tokenBody.startPosition = tokenBody.position; - tokenBody.endPosition = tokenBody.position; - } - tokenResults.add(tokenBody); - for (Integer endI : endList) { - TokenBody tb= new TokenBody(); - tb.termBuffer = combineTermBuffer(tokenBodies, beginI, endI); - tb.startOffset = tokenBodies.get(beginI).startOffset; - tb.endOffset = tokenBodies.get(endI).endOffset; - // search 模式下需要记录组合 term 前后的 position - if (!indexMode) { - tb.startPosition = tokenBodies.get(beginI).position; - 
tb.endPosition = tokenBodies.get(endI).position; - } - if (useFirstPos) { - tb.position = tokenBodies.get(beginI).position; - } else { - tb.position = tokenBodies.get(endI).position; + if (indexMode) { + tokenResults.add(tokenBody); + for (Integer endI : endList) { + TokenBody tb= new TokenBody(); + tb.termBuffer = combineTermBuffer(tokenBodies, beginI, endI); + tb.startOffset = tokenBodies.get(beginI).startOffset; + tb.endOffset = tokenBodies.get(endI).endOffset; + if (useFirstPos) { + tb.position = tokenBodies.get(beginI).position; + } else { + tb.position = tokenBodies.get(endI).position; + } + tb.type = CharacterUtil.COMBINE_WORD; + tokenResults.add(tb); } - tb.type = ""; - tokenResults.add(tb); - } - } - } - // 到这里如果是index 模式的话,已经可以结束了; - // 如果是 search模式,需要做歧义处理(如果有的话, 使用 类型的char 作为天然分割句子) - if (!indexMode && tokenResults.size() > 0) { - // 在search 模式下,采用 ik_smart 的逻辑进行语义分割,一个重大的意义:引入了语义分割 - // 1,ik 使用没有语义重叠的那个 char 作为分割点,只作用于有字符重叠的部分 - // 2,由于和 index 模式使用相同的 向后扩展逻辑,所以search是index 的子集 - // 3,search 模式下,不会涉及mapping的扩展引入 - // 4,search 模式下,使用 startPosition 来进行判断扩是否有歧义 - - // 用于保存多个term的组合形式,逆序。:采用动态编程思想,完成快速组合 - PriorityQueue combineTerms = new PriorityQueue(new Comparator(){ - @Override - public int compare(TokenBody o1, TokenBody o2){ - // 顺序有重要意义 - return o1.startPosition != o2.startPosition ? - Integer.compare(o1.startPosition, o2.startPosition) - : Integer.compare(o2.endPosition, o1.endPosition); - } - }); - // 用于保存单个term的形式(最后将保存全部的结果) - Map singleTerm = new HashMap<>(); - - // 将切分结果重新排序, 并清空之前的处理结果 - int startPosition = Integer.MAX_VALUE; - int endPosition = Integer.MIN_VALUE; - while (tokenResults.size() > 0) { - TokenBody t = tokenResults.poll(); - if (t.startPosition == t.endPosition) { - // 单个 term - singleTerm.put(t.position, t); - startPosition = Math.min(startPosition, t.startPosition); - endPosition = Math.max(endPosition, t.endPosition); } else { - // 组合出来的term,不参与歧义判断,仅仅用于歧义判断后的填补那些空白的 position - combineTerms.add(t); - } - } - - // 处理分词,没有歧义的直接放到结果中,有歧义的处理完之后放到结果中 - PriorityQueue searchReverseOrder = new PriorityQueue(new Comparator(){ - @Override - public int compare(TokenBody o1, TokenBody o2){ - // 顺序有重要意义 - return o1.startPosition != o2.startPosition ? 
- Integer.compare(o2.startPosition, o1.startPosition) - : Integer.compare(o1.endPosition, o2.endPosition); - } - }); - - // 在处理一段歧义时,控制前后范围, 第一次就是最开始的范围 - int maxExtend = Integer.MIN_VALUE; // 边界包含 - for (TokenBody tb : combineTerms) { - if (searchReverseOrder.size() == 0) { - searchReverseOrder.add(tb); - maxExtend = tb.endPosition; - continue; - } - - if (maxExtend < tb.startPosition) { - // 表示当前term 与之前的切分没有歧义 - if (searchReverseOrder.size() == 1) { - final TokenBody body = searchReverseOrder.poll(); - singleTerm.put(body.startPosition, body); + // 处理search analyzer 结果,贪婪向后匹配 + // 1,只有单字,加入单字 + // 2,有后缀匹配,采用最长的token结果(目的是找到个数最少的组合,非最优,但比较简单) + if (endList.isEmpty()) { + tokenResults.add(tokenBody); // 单字 } else { - // 这里先处理掉之前有歧义的部分, - final List arbitrator = arbitrator(searchReverseOrder); - for(TokenBody body : arbitrator) { - singleTerm.put(body.startPosition, body); + int lastEnd = endList.get(endList.size()-1); // 取最长token + tokenBody.termBuffer = combineTermBuffer(tokenBodies, beginI, lastEnd); + tokenBody.startOffset = tokenBodies.get(beginI).startOffset; + tokenBody.endOffset = tokenBodies.get(lastEnd).endOffset; + if (useFirstPos) { + tokenBody.position = tokenBodies.get(beginI).position; + } else { + tokenBody.position = tokenBodies.get(lastEnd).position; } - } - } - searchReverseOrder.add(tb); - maxExtend = Math.max(maxExtend, tb.endPosition); - } - // 处理最后的歧义 - if (searchReverseOrder.size() == 1) { - final TokenBody body = searchReverseOrder.poll(); - singleTerm.put(body.startPosition, body); - } else if(searchReverseOrder.size() > 1){ - final List arbitrator = arbitrator(searchReverseOrder); - for(TokenBody body : arbitrator) { - singleTerm.put(body.startPosition, body); - } - } - // endPosition 的用途 - while (startPosition <= endPosition) { - if (singleTerm.containsKey(startPosition)) { - final TokenBody body = singleTerm.get(startPosition); - tokenResults.add(body); - startPosition = body.endPosition + 1; - } else { - startPosition++; - } - } - } - } + tokenBody.type = CharacterUtil.COMBINE_WORD; + tokenResults.add(tokenBody); - /** - * 处理有歧义的token, - * @param searchReverseOrder 为倒序的token - * @return - */ - private List arbitrator(PriorityQueue searchReverseOrder) { - Map> positionMap = new HashMap<>(); - int maxIndex = -1; - int minIndex = -1; - while (searchReverseOrder.size() > 0) { - final TokenBody body = searchReverseOrder.poll(); - if (searchReverseOrder.size() == 0) { - // 要处理的最开始的位置,也就是 searchReverseOrder 的最后一个 - minIndex = body.startPosition; - } - if (maxIndex == -1) { - // 要处理的最后的位置,也就是 searchReverseOrder 的第一个 - maxIndex = body.startPosition; - } - // 下面给当前的 token 添加 child - int currentMax = maxIndex; - for (int i = body.endPosition + 1; i <= currentMax; i++) { - if (positionMap.containsKey(i)) { - final List bodies = positionMap.get(i); - final TokenBody minLengthBody = bodies.get(0); // 表示取其后紧挨着的最短token作为结束位置 - if (currentMax == maxIndex) { - currentMax = minLengthBody.endPosition; // 表示 minLengthBody 后面的 term 不可以作为 child了 - } - if (body.child == null) { - body.child = new ArrayList<>(); + beginI = lastEnd; } - body.child.addAll(positionMap.get(i)); } } - // 将 token放到结果中 - if (positionMap.containsKey(body.startPosition) == false) { - positionMap.put(body.startPosition, new ArrayList<>()); - } - positionMap.get(body.startPosition).add(body); - -// if (IS_DEBUG) { -// for(int i = 0; i < maxIndex + 10; i++) { -// String s = "- "; -// if (body.startPosition <= i && i <= body.endPosition) { -// s = "# "; -// } -// System.out.print(s); -// } -// 
diff --git a/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java b/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java
index e0e28f13..ec381546 100644
--- a/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java
+++ b/core/src/main/java/org/wltea/analyzer/fcp/FCPAnalyzer.java
@@ -5,10 +5,7 @@
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenizer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.synonym.SynonymMap;
-
-import java.util.Arrays;
-import java.util.List;

 public final class FCPAnalyzer extends Analyzer {
     /** Default maximum allowed token length */
@@ -21,7 +18,7 @@ public final class FCPAnalyzer extends Analyzer {
     // special-character mapping; defaults to true (special chars are fuzzy-matched); if false, the original char is kept in the final output
     private boolean uselessMapping = true;
     // by default the text is assumed correct and its whitespace meaningful, so blanks are not ignored; if blanks come from ETL mistakes, they should be ignored
-    private boolean ignoreBlank = false;
+    private boolean ignoreBlank = true;
     // whether to use the first char position (the default); if false, this becomes the lcp_analyzer
     private boolean useFirstPos = true;
     // whether to expose offsets; by default this follows indexMode
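The flags above are what distinguish the four analyzers named in this series' README: indexMode selects exhaustive versus greedy combination, and useFirstPos selects whether a combined token inherits the position of its first char (fcp_*) or of its last char (lcp_*). A construction sketch, assuming only the one-argument constructor that the tests below exercise; any mode setters are hypothetical and therefore omitted:

```java
import org.apache.lucene.analysis.Analyzer;

public class FcpModesSketch {
    public static void main(String[] args) {
        // note: Dictionary.initial(...) must have run first, see the tests below
        // index side: emits every dictionary combination at its first-char position
        Analyzer fcpIndex = new FCPAnalyzer(true);
        // search side: greedy, non-overlapping tokens with the same position
        // convention, so match_phrase can compare positions across both analyzers
        Analyzer fcpSearch = new FCPAnalyzer(false);
    }
}
```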
diff --git a/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java b/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java
index 587ca1d9..6e9bcf4d 100644
--- a/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java
+++ b/core/src/main/java/org/wltea/analyzer/fcp/TokenBody.java
@@ -9,8 +9,9 @@ class TokenBody {
     String termBuffer;
     int startOffset, endOffset;
     // position is the position assigned at elasticsearch analysis time; positionIncrement = curr.position - prev.position
-    // startPosition/endPosition collect the tokens expanded from the dictionary, mainly for ik_smart
-    int position, startPosition = -1, endPosition = -1;
+    int position;
+    // todo: in the future, startPosition/endPosition will collect the tokens expanded from the dictionary, mainly for ik_smart
+    int startPosition = -1, endPosition = -1;
     String type;
     List<TokenBody> child;

diff --git a/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java
index ba9ae2ff..2e0f6327 100644
--- a/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java
+++ b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttribute.java
@@ -12,6 +12,7 @@
  *
  * NOTE: this is optional, and most analyzers
  * don't change the default value (1). */
+@Deprecated
 public interface PositionLengthAttribute extends Attribute {
   /** Set the position length of this Token.

diff --git a/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java
index 5aa230c0..c4d5dffb 100644
--- a/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java
+++ b/core/src/main/java/org/wltea/analyzer/fcp/tokenattributes/PositionLengthAttributeImpl.java
@@ -5,6 +5,7 @@
 import org.apache.lucene.util.AttributeReflector;

 /** Default implementation of {@link PositionLengthAttribute}. */
+@Deprecated
 public class PositionLengthAttributeImpl extends AttributeImpl implements PositionLengthAttribute, Cloneable {
   private int positionLength = 1;

diff --git a/core/src/test/java/org/wltea/analyzer/fcp/Configuration4Test.java b/core/src/test/java/org/wltea/analyzer/fcp/Configuration4Test.java
new file mode 100644
index 00000000..34c6d8e8
--- /dev/null
+++ b/core/src/test/java/org/wltea/analyzer/fcp/Configuration4Test.java
@@ -0,0 +1,29 @@
+package org.wltea.analyzer.fcp;
+
+import org.wltea.analyzer.cfg.Configuration;
+
+import java.io.File;
+import java.net.URI;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+/**
+ * @ClassName Configuration4Test
+ * @Description: test configuration that reads the dictionary from ../config
+ */
+public class Configuration4Test extends Configuration {
+    @Override
+    public Path getConfDir() {
+        return Paths.get("../", "config");
+    }
+
+    @Override
+    public Path getConfigInPluginDir() {
+        return Paths.get("../", "config");
+    }
+
+    @Override
+    public Path getPath(String first, String... more) {
+        return Paths.get(first, more);
+    }
+}
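Configuration4Test exists so the core-module tests can load the dictionary from a relative ../config directory instead of an installed elasticsearch plugin directory. A sketch of the bootstrap order it enables; this assumes the repository's config directory holds the dictionary files, which is the same contract the @Before hook in the test below relies on:

```java
public class DictionaryBootstrapSketch {
    public static void main(String[] args) {
        // the dictionary must be loaded once before the first tokenStream() call,
        // otherwise dictionary-based combination presumably finds no words
        org.wltea.analyzer.dic.Dictionary.initial(new Configuration4Test());
        org.apache.lucene.analysis.Analyzer analyzer = new FCPAnalyzer(true);
    }
}
```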
diff --git a/core/src/test/java/org/wltea/analyzer/fcp/FCPAnalyzerTest.java b/core/src/test/java/org/wltea/analyzer/fcp/FCPAnalyzerTest.java
new file mode 100644
index 00000000..b65507d5
--- /dev/null
+++ b/core/src/test/java/org/wltea/analyzer/fcp/FCPAnalyzerTest.java
@@ -0,0 +1,80 @@
+package org.wltea.analyzer.fcp;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.junit.Before;
+import org.junit.Test;
+import org.wltea.analyzer.dic.Dictionary;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+/**
+ * @ClassName FCPAnalyzerTest
+ * @Description: fcp test
+ */
+public class FCPAnalyzerTest {
+
+    @Before
+    public void init() {
+        // initialize the dictionary
+        Dictionary.initial(new Configuration4Test());
+    }
+
+    @Test
+    public void testFcpIndexAnalyzer() {
+        FCPAnalyzer fcpIndex = new FCPAnalyzer(true);
+        String str = "这里是中国, this is china #4.345^";
+        TokenStream stream = null;
+        try {
+            stream = fcpIndex.tokenStream("any", new StringReader(str));
+            PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute.class); // position increment
+            OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class); // start/end offsets of each term
+            CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class); // term text
+            TypeAttribute ta = stream.addAttribute(TypeAttribute.class); // token type
+            stream.reset();
+            int position = -1;
+            while (stream.incrementToken()) {
+                position += pia.getPositionIncrement();
+                System.out.println(position + ":[" + cta.toString() + "]:" + oa.startOffset() + "->" + oa.endOffset() + ":" + ta.type());
+            }
+            stream.end();
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+    @Test
+    public void testFcpSearchAnalyzer() {
+        FCPAnalyzer fcpSearch = new FCPAnalyzer(false);
+        String str = "这里是中国, this is china #4.345^";
+        TokenStream stream = null;
+        try {
+            stream = fcpSearch.tokenStream("any", new StringReader(str));
+            PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute.class); // position increment
+            OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class); // start/end offsets of each term
+            CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class); // term text
+            TypeAttribute ta = stream.addAttribute(TypeAttribute.class); // token type
+            stream.reset();
+            int position = -1;
+            while (stream.incrementToken()) {
+                position += pia.getPositionIncrement();
+                System.out.println(position + ":[" + cta.toString() + "]:" + oa.startOffset() + "->" + oa.endOffset() + ":" + ta.type());
+            }
+            stream.end();
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+    @Test
+    public void test03() {
+        // a whitespace-only string trims to empty
+        String s = " \t \n";
+        System.out.println(s.trim().length() == 0);
+    }
+}
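A possible follow-up to the println-style tests above would be to pin the positions down with assertions, since those positions are exactly what match_phrase depends on. The expected values in this sketch assume the fcp_search behavior documented earlier in this series' README for "中国平安" (中国 at position 0, 平安 at position 2); treat them as illustrative, not authoritative:

```java
@Test
public void testFcpSearchPositions() throws IOException {
    java.util.List<String> actual = new java.util.ArrayList<>();
    // try-with-resources also closes the stream, which the tests above skip
    try (TokenStream stream = new FCPAnalyzer(false).tokenStream("any", new StringReader("中国平安"))) {
        PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        int position = -1;
        while (stream.incrementToken()) {
            position += pia.getPositionIncrement();
            actual.add(cta.toString() + "@" + position);
        }
        stream.end();
    }
    // 平安 sits at position 2, not 1: a combined token keeps its first char's position
    org.junit.Assert.assertEquals(java.util.Arrays.asList("中国@0", "平安@2"), actual);
}
```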