Merge pull request #62 from duydo/feature/v7.0.0
Feature/v7.0.0
duydo authored Aug 2, 2019
2 parents 50c8f30 + 65e662e commit eabdbfe
Showing 6 changed files with 56 additions and 101 deletions.
16 changes: 2 additions & 14 deletions pom.xml

@@ -3,7 +3,7 @@
     <modelVersion>4.0.0</modelVersion>
     <groupId>org.elasticsearch</groupId>
     <artifactId>elasticsearch-analysis-vietnamese</artifactId>
-    <version>6.5.3</version>
+    <version>7.0.0</version>
     <packaging>jar</packaging>
     <name>elasticsearch-analysis-vietnamese</name>
     <url>https://github.com/duydo/elasticsearch-analysis-vietnamese/</url>
@@ -31,11 +31,7 @@
     <properties>
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
         <project.build.java.version>1.8</project.build.java.version>
-        <elasticsearch.version>6.5.3</elasticsearch.version>
-        <!--
-        <lucene.version>6.6.1</lucene.version>
-        <jna.version>4.1.0</jna.version>
-        -->
+        <elasticsearch.version>7.0.0</elasticsearch.version>
         <log4j.version>2.7</log4j.version>
     </properties>
     <dependencies>
@@ -66,14 +62,6 @@
             <version>${elasticsearch.version}</version>
             <scope>test</scope>
         </dependency>
-        <!--
-        <dependency>
-            <groupId>net.java.dev.jna</groupId>
-            <artifactId>jna</artifactId>
-            <version>${jna.version}</version>
-            <scope>test</scope>
-        </dependency>
-        -->
         <dependency>
             <groupId>org.apache.logging.log4j</groupId>
             <artifactId>log4j-api</artifactId>
src/main/java/org/apache/lucene/analysis/vi/VietnameseAnalyzer.java

@@ -16,6 +16,8 @@
 
 import org.apache.lucene.analysis.*;
 
+import java.security.AccessController;
+import java.security.PrivilegedAction;
 import java.util.Arrays;
 import java.util.List;
 
@@ -25,6 +27,7 @@
 public class VietnameseAnalyzer extends StopwordAnalyzerBase {
 
     public static final CharArraySet VIETNAMESE_STOP_WORDS_SET;
+    private final me.duydo.vi.Tokenizer tokenizer;
 
     static {
         final List<String> stopWords = Arrays.asList(
@@ -69,11 +72,12 @@ public VietnameseAnalyzer() {
      */
     public VietnameseAnalyzer(CharArraySet stopWords) {
         super(stopWords);
+        tokenizer = AccessController.doPrivileged((PrivilegedAction<me.duydo.vi.Tokenizer>) () -> new me.duydo.vi.Tokenizer());
     }
 
     @Override
     protected TokenStreamComponents createComponents(String fieldName) {
-        final Tokenizer tokenizer = new VietnameseTokenizer();
+        final Tokenizer tokenizer = new VietnameseTokenizer(this.tokenizer);
         TokenStream tokenStream = new LowerCaseFilter(tokenizer);
         tokenStream = new StopFilter(tokenStream, stopwords);
         return new TokenStreamComponents(tokenizer, tokenStream);
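
For reference, the rewired chain (VietnameseTokenizer, then LowerCaseFilter, then StopFilter) can be driven directly through the Lucene TokenStream API. A minimal sketch, not part of this commit: the demo class name is illustrative, and the expected terms follow the updated tests later in this diff.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.vi.VietnameseAnalyzer;

public class VietnameseAnalyzerDemo {
    public static void main(String[] args) throws Exception {
        // The no-arg constructor uses VIETNAMESE_STOP_WORDS_SET and the shared segmenter.
        try (VietnameseAnalyzer analyzer = new VietnameseAnalyzer()) {
            TokenStream ts = analyzer.tokenStream("field", "công nghệ thông tin Việt Nam");
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // Prints "công nghệ thông tin", "việt", "nam" per the updated tests.
                System.out.println(term.toString());
            }
            ts.end();
            ts.close();
        }
    }
}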
102 changes: 32 additions & 70 deletions src/main/java/org/apache/lucene/analysis/vi/VietnameseTokenizer.java

@@ -14,25 +14,18 @@
 
 package org.apache.lucene.analysis.vi;
 
+import org.apache.commons.io.IOUtils;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import vn.hus.nlp.sd.IConstants;
-import vn.hus.nlp.sd.SentenceDetector;
-import vn.hus.nlp.sd.SentenceDetectorFactory;
-import vn.hus.nlp.tokenizer.TokenizerProvider;
 import vn.hus.nlp.tokenizer.tokens.TaggedWord;
 
 import java.io.IOException;
-import java.io.Reader;
 import java.io.StringReader;
-import java.security.AccessController;
-import java.security.PrivilegedAction;
-import java.util.ArrayList;
-import java.util.Iterator;
 import java.util.List;
+import java.util.concurrent.CopyOnWriteArrayList;
 
 
 /**
@@ -42,75 +35,53 @@
  */
 public class VietnameseTokenizer extends Tokenizer {
 
-    private Iterator<TaggedWord> taggedWords;
-
+    private List<TaggedWord> pending = new CopyOnWriteArrayList<>();
     private int offset = 0;
-    private int skippedPositions;
+
+    private int pos = 0;
 
     private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
     private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
     private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 
-    private vn.hus.nlp.tokenizer.Tokenizer tokenizer;
-    private SentenceDetector sentenceDetector;
-
-    private boolean sentenceDetectorEnabled;
-    private boolean ambiguitiesResolved;
-
-    public VietnameseTokenizer() {
-        this(true, false);
-    }
+    private final me.duydo.vi.Tokenizer tokenizer;
+    private String inputText;
 
-    public VietnameseTokenizer(boolean sentenceDetectorEnabled, boolean ambiguitiesResolved) {
+    public VietnameseTokenizer(me.duydo.vi.Tokenizer tokenizer) {
         super();
-        this.sentenceDetectorEnabled = sentenceDetectorEnabled;
-        this.ambiguitiesResolved = ambiguitiesResolved;
-
-        if (this.sentenceDetectorEnabled) {
-            sentenceDetector = SentenceDetectorFactory.create(IConstants.LANG_VIETNAMESE);
-        }
-        tokenizer = AccessController.doPrivileged(new PrivilegedAction<vn.hus.nlp.tokenizer.Tokenizer>() {
-            @Override
-            public vn.hus.nlp.tokenizer.Tokenizer run() {
-                vn.hus.nlp.tokenizer.Tokenizer vnTokenizer = TokenizerProvider.getInstance().getTokenizer();
-                vnTokenizer.setAmbiguitiesResolved(ambiguitiesResolved);
-                return vnTokenizer;
-            }
-        });
+        this.tokenizer = tokenizer;
     }
 
-    private void tokenize(Reader input) throws IOException {
-        if (isSentenceDetectorEnabled()) {
-            final List<TaggedWord> words = new ArrayList<TaggedWord>();
-            final String[] sentences = sentenceDetector.detectSentences(input);
-            for (String s : sentences) {
-                tokenizer.tokenize(new StringReader(s));
-                words.addAll(tokenizer.getResult());
-            }
-            taggedWords = words.iterator();
-        } else {
-            tokenizer.tokenize(input);
-            taggedWords = tokenizer.getResult().iterator();
-        }
+    private void tokenize() throws IOException {
+        inputText = IOUtils.toString(input);
+        final List<TaggedWord> result = tokenizer.tokenize(new StringReader(inputText));
+        if (result != null) {
+            pending.addAll(result);
+        }
     }
 
     @Override
     public final boolean incrementToken() throws IOException {
+        while (pending.size() == 0) {
+            tokenize();
+            if (pending.size() == 0) {
+                return false;
+            }
+        }
         clearAttributes();
-        while (taggedWords.hasNext()) {
-            final TaggedWord word = taggedWords.next();
-
+        for (int i = pos; i < pending.size(); i++) {
+            pos++;
+            final TaggedWord word = pending.get(i);
             if (accept(word)) {
-                posIncrAtt.setPositionIncrement(skippedPositions + 1);
-                typeAtt.setType(word.getRule().getName());
+                posIncrAtt.setPositionIncrement(1);
                 final int length = word.getText().length();
+                typeAtt.setType(String.format("<%s>", word.getRule().getName().toUpperCase()));
                 termAtt.copyBuffer(word.getText().toCharArray(), 0, length);
-                offsetAtt.setOffset(correctOffset(offset), offset = correctOffset(offset + length));
-                offset++;
+                final int start = inputText.indexOf(word.getText(), i);
+                offsetAtt.setOffset(correctOffset(start), offset = correctOffset(start + length));
                 return true;
             }
-            skippedPositions++;
         }
        return false;
    }
@@ -119,9 +90,9 @@ public final boolean incrementToken() throws IOException {
      * Only accept the word characters.
      */
     private final boolean accept(TaggedWord word) {
-        final String token = word.getText();
-        if (token.length() == 1) {
-            return Character.isLetterOrDigit(token.charAt(0));
+        final String type = word.getRule().getName().toLowerCase();
+        if ("punctuation".equals(type) || "special".equals(type)) {
+            return false;
         }
         return true;
     }
@@ -131,22 +102,13 @@ public final void end() throws IOException {
         super.end();
         final int finalOffset = correctOffset(offset);
         offsetAtt.setOffset(finalOffset, finalOffset);
-        posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
     }
 
     @Override
     public void reset() throws IOException {
         super.reset();
+        pos = 0;
         offset = 0;
-        skippedPositions = 0;
-        tokenize(input);
-    }
-
-    public boolean isSentenceDetectorEnabled() {
-        return sentenceDetectorEnabled;
-    }
-
-    public boolean isAmbiguitiesResolved() {
-        return ambiguitiesResolved;
+        pending.clear();
     }
 }
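
Note the new flow: reset() now only clears pos, offset and pending, and the first incrementToken() call reads the entire input with IOUtils.toString(input) before buffering the segmenter output in pending. A minimal consumer sketch under the standard Lucene workflow, assuming me.duydo.vi.Tokenizer exposes the no-arg constructor used elsewhere in this commit (the doPrivileged wrapper matters only under the plugin security manager, so it is omitted here):

import java.io.StringReader;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.vi.VietnameseTokenizer;

public class VietnameseTokenizerDemo {
    public static void main(String[] args) throws Exception {
        // Assumption: me.duydo.vi.Tokenizer() as constructed in the analyzer and factory above.
        VietnameseTokenizer tokenizer = new VietnameseTokenizer(new me.duydo.vi.Tokenizer());
        tokenizer.setReader(new StringReader("công nghệ thông tin Việt Nam"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
        tokenizer.reset();                    // clears pos, offset and pending
        while (tokenizer.incrementToken()) {  // first call triggers tokenize()
            System.out.println(term + " [" + offset.startOffset() + ", " + offset.endOffset() + "]");
        }
        tokenizer.end();
        tokenizer.close();
    }
}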
src/main/java/org/elasticsearch/index/analysis/VietnameseTokenizerFactory.java

@@ -20,22 +20,22 @@
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 
+import java.security.AccessController;
+import java.security.PrivilegedAction;
+
 /**
  * @author duydo
  */
 public class VietnameseTokenizerFactory extends AbstractTokenizerFactory {
 
-    private final boolean sentenceDetectorEnabled;
-    private final boolean ambiguitiesResolved;
+    private final me.duydo.vi.Tokenizer tokenizer;
 
     public VietnameseTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
-        super(indexSettings, name, settings);
-        sentenceDetectorEnabled = settings.getAsBoolean("sentence_detector", Boolean.FALSE);
-        ambiguitiesResolved = settings.getAsBoolean("ambiguities_resolved", Boolean.FALSE);
+        super(indexSettings, settings);
+        tokenizer = AccessController.doPrivileged((PrivilegedAction<me.duydo.vi.Tokenizer>) () -> new me.duydo.vi.Tokenizer());
     }
 
     @Override
     public Tokenizer create() {
-        return new VietnameseTokenizer(sentenceDetectorEnabled, ambiguitiesResolved);
+        return new VietnameseTokenizer(tokenizer);
     }
 }
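
A factory like this is typically exposed through AnalysisPlugin#getTokenizers. AnalysisVietnamesePlugin is not touched by this diff, so the wiring below is a hedged sketch of the usual pattern; the "vi_tokenizer" registration name is assumed from the plugin's README, not from this commit.

import java.util.Map;

import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.index.analysis.VietnameseTokenizerFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

import static java.util.Collections.singletonMap;

public class AnalysisVietnamesePlugin extends Plugin implements AnalysisPlugin {
    @Override
    public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
        // The 4-arg factory constructor matches
        // AnalysisProvider#get(IndexSettings, Environment, String, Settings).
        return singletonMap("vi_tokenizer", VietnameseTokenizerFactory::new);
    }
}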
src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisIntegrationTest.java

@@ -5,6 +5,7 @@
 import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
 import org.elasticsearch.action.search.SearchResponse;
 import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.index.query.QueryBuilder;
 import org.elasticsearch.index.query.QueryBuilders;
 import org.elasticsearch.plugin.analysis.vi.AnalysisVietnamesePlugin;
 import org.elasticsearch.plugins.Plugin;
@@ -47,9 +48,9 @@ public void testVietnameseAnalyzer() throws ExecutionException, InterruptedException {
         AnalyzeResponse response = client().admin().indices()
                 .prepareAnalyze("công nghệ thông tin Việt Nam").setAnalyzer("vi_analyzer")
                 .execute().get();
-        String[] expected = {"công nghệ thông tin", "việt nam"};
+        String[] expected = {"công nghệ thông tin", "việt", "nam"};
         assertThat(response, notNullValue());
-        assertThat(response.getTokens().size(), is(2));
+        assertThat(response.getTokens().size(), is(3));
         for (int i = 0; i < expected.length; i++) {
             assertThat(response.getTokens().get(i).getTerm(), is(expected[i]));
         }
@@ -69,11 +70,11 @@ public void testVietnameseAnalyzerInMapping() throws ExecutionException, InterruptedException {
                 .endObject()
                 .endObject();
         client().admin().indices().preparePutMapping("test").setType("type").setSource(mapping).get();
-        index("test", "type", "1", "foo", "công nghệ thông tin Việt Nam");
+        final XContentBuilder source = jsonBuilder().startObject().field("foo", "công nghệ thông tin Việt Nam").endObject();
+        index("test", "type", "1", source);
         refresh();
         SearchResponse response = client().prepareSearch("test").setQuery(
-                QueryBuilders.matchQuery("foo", "Việt Nam")
-        ).execute().actionGet();
+                QueryBuilders.matchQuery("foo", "công nghệ thông tin")).execute().actionGet();
         assertThat(response.getHits().getTotalHits(), is(1L));
     }
 }
src/test/java/org/elasticsearch/index/analysis/VietnameseAnalysisTests.java

@@ -52,19 +52,19 @@ public void testVietnameseTokenizer() throws IOException {
         Tokenizer tokenizer = tokenizerFactory.create();
         assertNotNull(tokenizer);
 
-        tokenizer.setReader(new StringReader("Công nghệ thông tin Việt Nam"));
-        assertTokenStreamContents(tokenizer, new String[]{"Công nghệ thông tin", "Việt Nam"});
+        tokenizer.setReader(new StringReader("công nghệ thông tin Việt Nam"));
+        assertTokenStreamContents(tokenizer, new String[]{"công nghệ thông tin", "Việt", "Nam"});
     }
 
     public void testVietnameseAnalyzer() throws IOException {
         TestAnalysis analysis = createTestAnalysis();
         NamedAnalyzer analyzer = analysis.indexAnalyzers.get("vi_analyzer");
         assertNotNull(analyzer);
 
-        TokenStream ts = analyzer.analyzer().tokenStream("test", "Công nghệ thông tin Việt Nam");
+        TokenStream ts = analyzer.analyzer().tokenStream("test", "công nghệ thông tin Việt Nam");
         CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
         ts.reset();
-        for (String expected : new String[]{"công nghệ thông tin", "việt nam"}) {
+        for (String expected : new String[]{"công nghệ thông tin", "việt", "nam"}) {
             assertThat(ts.incrementToken(), equalTo(true));
             assertThat(term.toString(), equalTo(expected));
         }
