Use Lindera Analyzer instead of Lindera Tokenizer (#68)
* Use Lindera Analyzer instead of Lindera Tokenizer

* Format

* Update CHANGES.md
mosuka authored Jan 23, 2023
1 parent 6da8f49 commit c10023e
Showing 9 changed files with 232 additions and 138 deletions.
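In short, registration no longer goes through `LinderaTokenizer::from_config`: callers either take the default tokenizer or build a `lindera::tokenizer::Tokenizer` themselves and hand it to the new analyzer-style constructor, which also accepts character filters and token filters (hence "Analyzer"). A minimal sketch of both registration styles, using only the calls that appear in the diffs below; that the two `Vec::new()` arguments are character filters and token filters is inferred from the commit title, not confirmed by the diff:

```rust
use lindera::tokenizer::Tokenizer;
use lindera_tantivy::{
    mode::Mode,
    tokenizer::{DictionaryConfig, DictionaryKind, LinderaTokenizer, TokenizerConfig},
};
use tantivy::{schema::Schema, Index};

fn main() {
    let index = Index::create_in_ram(Schema::builder().build());

    // Style 1: the default pipeline (used by the benchmark and the ipadic
    // example below; presumably requires a bundled-dictionary feature such
    // as "ipadic").
    index
        .tokenizers()
        .register("lang_ja", LinderaTokenizer::default());

    // Style 2: build a lindera Tokenizer from an explicit config and wrap it,
    // passing empty filter lists (used by the cc-cedict, ko-dic, and unidic
    // examples below).
    let config = TokenizerConfig {
        dictionary: DictionaryConfig {
            kind: Some(DictionaryKind::IPADIC),
            path: None,
        },
        user_dictionary: None,
        mode: Mode::Normal,
    };
    let tokenizer = Tokenizer::from_config(config).unwrap();
    index.tokenizers().register(
        "lang_ja_explicit",
        LinderaTokenizer::new(Vec::new(), tokenizer, Vec::new()),
    );
}
```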
CHANGES.md (3 additions, 0 deletions)
```diff
@@ -2,6 +2,9 @@
 All notable changes to this project will be documented in this file.
 This project adheres to [Semantic Versioning](http://semver.org/).
 
+## Unreleased
+- Use Lindera Analyzer instead of Lindera Tokenizer #68 @mosuka
+
 ## 0.20.0 (2023-01-16)
 - Update dependencies #67 @mosuka
 
```
Cargo.toml (1 addition, 1 deletion)
```diff
@@ -24,7 +24,7 @@ compress = ["lindera/compress"]
 [dependencies]
 tantivy = "0.19.1"
 
-lindera = "0.20.0"
+lindera = "0.21.0"
 
 [dev-dependencies]
 criterion = { version = "0.4.0", features = ["html_reports"] }
```
benches/bench.rs (1 addition, 14 deletions)
```diff
@@ -7,9 +7,7 @@ fn bench_indexing(c: &mut Criterion) {
     use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
     use tantivy::Index;
 
-    use lindera_tantivy::mode::Mode;
     use lindera_tantivy::tokenizer::LinderaTokenizer;
-    use lindera_tantivy::tokenizer::{DictionaryConfig, DictionaryKind, TokenizerConfig};
 
     // create schema builder
     let mut schema_builder = Schema::builder();
@@ -44,17 +42,6 @@ fn bench_indexing(c: &mut Criterion) {
     // create index on memory
     let index = Index::create_in_ram(schema.clone());
 
-    let dictionary = DictionaryConfig {
-        kind: Some(DictionaryKind::IPADIC),
-        path: None,
-    };
-
-    let config = TokenizerConfig {
-        dictionary,
-        user_dictionary: None,
-        mode: Mode::Normal,
-    };
-
     // Test document set.
     let mut docs = Vec::new();
     for i in 0..1000 {
@@ -68,7 +55,7 @@ fn bench_indexing(c: &mut Criterion) {
     // register Lindera tokenizer
     index
         .tokenizers()
-        .register("lang_ja", LinderaTokenizer::from_config(config).unwrap());
+        .register("lang_ja", LinderaTokenizer::default());
 
     // create index writer
     let mut index_writer = index.writer(50_000_000).unwrap();
```
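The benchmark now relies entirely on the default pipeline. A quick, hedged way to check what a registered analyzer emits, using tantivy 0.19's tokenizer-manager API (`get` returns the registered `TextAnalyzer`); the sample string is illustrative:

```rust
use tantivy::tokenizer::TokenStream;
use tantivy::Index;

// Print what a registered analyzer produces for a sample string (sketch).
fn dump_tokens(index: &Index, name: &str, text: &str) {
    let analyzer = index.tokenizers().get(name).expect("tokenizer registered");
    let mut stream = analyzer.token_stream(text);
    while stream.advance() {
        let token = stream.token();
        println!("{:?} ({}..{})", token.text, token.offset_from, token.offset_to);
    }
}

// e.g. dump_tokens(&index, "lang_ja", "関西国際空港限定トートバッグ");
```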
examples/cc-cedict_example.rs (18 additions, 12 deletions)
```diff
@@ -1,14 +1,17 @@
 #[cfg(feature = "cc-cedict")]
 fn main() -> tantivy::Result<()> {
-    use tantivy::collector::TopDocs;
-    use tantivy::doc;
-    use tantivy::query::QueryParser;
-    use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
-    use tantivy::Index;
-
-    use lindera_tantivy::mode::Mode;
-    use lindera_tantivy::tokenizer::{
-        DictionaryConfig, DictionaryKind, LinderaTokenizer, TokenizerConfig,
+    use tantivy::{
+        collector::TopDocs,
+        doc,
+        query::QueryParser,
+        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
+        Index,
+    };
+
+    use lindera::tokenizer::Tokenizer;
+    use lindera_tantivy::{
+        mode::Mode,
+        tokenizer::{DictionaryConfig, DictionaryKind, LinderaTokenizer, TokenizerConfig},
     };
 
     // create schema builder
@@ -67,10 +70,13 @@ fn main() -> tantivy::Result<()> {
         mode: Mode::Normal,
     };
 
+    let tokenizer = Tokenizer::from_config(config).unwrap();
+
     // register Lindera tokenizer
-    index
-        .tokenizers()
-        .register("lang_zh", LinderaTokenizer::from_config(config).unwrap());
+    index.tokenizers().register(
+        "lang_zh",
+        LinderaTokenizer::new(Vec::new(), tokenizer, Vec::new()),
+    );
 
     // create index writer
     let mut index_writer = index.writer(50_000_000)?;
```
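The dictionary-specific pieces of this pattern are just the feature gate, the registered name, and the `DictionaryKind`. A hedged sketch of the Chinese dictionary selection, assuming lindera spells the CC-CEDICT variant `CcCedict`:

```rust
use lindera_tantivy::tokenizer::{DictionaryConfig, DictionaryKind};

// Dictionary selection for the Chinese example (sketch; the CcCedict variant
// name is an assumption about lindera's DictionaryKind enum).
fn cc_cedict_dictionary() -> DictionaryConfig {
    DictionaryConfig {
        kind: Some(DictionaryKind::CcCedict),
        path: None, // None: use the dictionary embedded via the "cc-cedict" feature
    }
}
```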
examples/ipadic_example.rs (9 additions, 21 deletions)
```diff
@@ -1,16 +1,15 @@
 #[cfg(feature = "ipadic")]
 fn main() -> tantivy::Result<()> {
-    use tantivy::collector::TopDocs;
-    use tantivy::doc;
-    use tantivy::query::QueryParser;
-    use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
-    use tantivy::Index;
-
-    use lindera_tantivy::mode::Mode;
-    use lindera_tantivy::tokenizer::{
-        DictionaryConfig, DictionaryKind, LinderaTokenizer, TokenizerConfig,
-    };
+    use tantivy::{
+        collector::TopDocs,
+        doc,
+        query::QueryParser,
+        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
+        Index,
+    };
+
+    use lindera_tantivy::tokenizer::LinderaTokenizer;
 
     // create schema builder
     let mut schema_builder = Schema::builder();
 
@@ -56,21 +55,10 @@ fn main() -> tantivy::Result<()> {
     // create index on memory
     let index = Index::create_in_ram(schema.clone());
 
-    let dictionary = DictionaryConfig {
-        kind: Some(DictionaryKind::IPADIC),
-        path: None,
-    };
-
-    let config = TokenizerConfig {
-        dictionary,
-        user_dictionary: None,
-        mode: Mode::Normal,
-    };
-
     // register Lindera tokenizer
     index
         .tokenizers()
-        .register("lang_ja", LinderaTokenizer::from_config(config).unwrap());
+        .register("lang_ja", LinderaTokenizer::default());
 
     // create index writer
     let mut index_writer = index.writer(50_000_000)?;
```
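The `lang_ja` name only matters because the elided top of the example binds it to the schema's text fields. That wiring presumably looks like the following standard tantivy 0.19 schema setup (the `title` field name is illustrative):

```rust
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};

// Build a schema whose text field analyzes through the registered "lang_ja"
// tokenizer (sketch).
fn build_schema() -> Schema {
    let mut schema_builder = Schema::builder();
    let text_indexing = TextFieldIndexing::default()
        .set_tokenizer("lang_ja")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    let text_options = TextOptions::default()
        .set_indexing_options(text_indexing)
        .set_stored();
    schema_builder.add_text_field("title", text_options);
    schema_builder.build()
}
```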
examples/ko-dic_example.rs (18 additions, 12 deletions)
```diff
@@ -1,14 +1,17 @@
 #[cfg(feature = "ko-dic")]
 fn main() -> tantivy::Result<()> {
-    use tantivy::collector::TopDocs;
-    use tantivy::doc;
-    use tantivy::query::QueryParser;
-    use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
-    use tantivy::Index;
-
-    use lindera_tantivy::mode::Mode;
-    use lindera_tantivy::tokenizer::{
-        DictionaryConfig, DictionaryKind, LinderaTokenizer, TokenizerConfig,
+    use tantivy::{
+        collector::TopDocs,
+        doc,
+        query::QueryParser,
+        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
+        Index,
+    };
+
+    use lindera::tokenizer::Tokenizer;
+    use lindera_tantivy::{
+        mode::Mode,
+        tokenizer::{DictionaryConfig, DictionaryKind, LinderaTokenizer, TokenizerConfig},
     };
 
     // create schema builder
@@ -67,10 +70,13 @@ fn main() -> tantivy::Result<()> {
         mode: Mode::Normal,
     };
 
+    let tokenizer = Tokenizer::from_config(config).unwrap();
+
     // register Lindera tokenizer
-    index
-        .tokenizers()
-        .register("lang_ko", LinderaTokenizer::from_config(config).unwrap());
+    index.tokenizers().register(
+        "lang_ko",
+        LinderaTokenizer::new(Vec::new(), tokenizer, Vec::new()),
+    );
 
     // create index writer
     let mut index_writer = index.writer(50_000_000)?;
```
examples/unidic_example.rs (18 additions, 12 deletions)
```diff
@@ -1,14 +1,17 @@
 #[cfg(feature = "unidic")]
 fn main() -> tantivy::Result<()> {
-    use tantivy::collector::TopDocs;
-    use tantivy::doc;
-    use tantivy::query::QueryParser;
-    use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
-    use tantivy::Index;
-
-    use lindera_tantivy::mode::Mode;
-    use lindera_tantivy::tokenizer::{
-        DictionaryConfig, DictionaryKind, LinderaTokenizer, TokenizerConfig,
+    use tantivy::{
+        collector::TopDocs,
+        doc,
+        query::QueryParser,
+        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
+        Index,
+    };
+
+    use lindera::tokenizer::Tokenizer;
+    use lindera_tantivy::{
+        mode::Mode,
+        tokenizer::{DictionaryConfig, DictionaryKind, LinderaTokenizer, TokenizerConfig},
     };
 
     // create schema builder
@@ -67,10 +70,13 @@ fn main() -> tantivy::Result<()> {
         mode: Mode::Normal,
     };
 
+    let tokenizer = Tokenizer::from_config(config).unwrap();
+
     // register Lindera tokenizer
-    index
-        .tokenizers()
-        .register("lang_ja", LinderaTokenizer::from_config(config).unwrap());
+    index.tokenizers().register(
+        "lang_ja",
+        LinderaTokenizer::new(Vec::new(), tokenizer, Vec::new()),
+    );
 
     // create index writer
     let mut index_writer = index.writer(50_000_000)?;
```
src/stream.rs (19 additions, 21 deletions)
```diff
@@ -1,31 +1,29 @@
+use std::collections::VecDeque;
+
 use tantivy::tokenizer::{Token, TokenStream};
 
 pub struct LinderaTokenStream {
-    pub result: Vec<String>,
-    pub index: usize,
-    pub offset_from: usize,
-    pub token: Token,
+    tokens: VecDeque<Token>,
+    token: Token,
 }
 
+impl LinderaTokenStream {
+    pub fn new(tokens: VecDeque<Token>) -> Self {
+        Self {
+            tokens,
+            token: Default::default(),
+        }
+    }
+}
+
 impl TokenStream for LinderaTokenStream {
     fn advance(&mut self) -> bool {
-        if self.index < self.result.len() {
-            let token = self.result.get(self.index).unwrap();
-
-            self.token = Token {
-                offset_from: self.offset_from,
-                offset_to: self.offset_from + token.len(),
-                position: self.index,
-                text: token.to_string(),
-                position_length: self.result.len(),
-            };
-
-            self.offset_from += token.len();
-            self.index += 1;
-
-            true
-        } else {
-            false
+        match self.tokens.pop_front() {
+            Some(token) => {
+                self.token = token;
+                true
+            }
+            None => false,
         }
     }
 
```
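The stream no longer tracks offsets and positions itself: the analyzer hands over fully populated tantivy `Token`s in a `VecDeque`, and `advance` simply pops the queue. A self-contained sketch of that contract, assuming `LinderaTokenStream` is exposed at `lindera_tantivy::stream` and that tantivy's `Token` supports struct-update from `Default`:

```rust
use std::collections::VecDeque;

use lindera_tantivy::stream::LinderaTokenStream; // module path assumed
use tantivy::tokenizer::{Token, TokenStream};

fn main() {
    // Two precomputed tokens, as the analyzer would hand them over
    // (offsets are byte offsets into the original text).
    let tokens = VecDeque::from(vec![
        Token {
            offset_from: 0,
            offset_to: 6,
            position: 0,
            text: "東京".to_string(),
            ..Default::default()
        },
        Token {
            offset_from: 6,
            offset_to: 15,
            position: 1,
            text: "タワー".to_string(),
            ..Default::default()
        },
    ]);

    let mut stream = LinderaTokenStream::new(tokens);
    while stream.advance() {
        let token = stream.token();
        println!("{} ({}..{})", token.text, token.offset_from, token.offset_to);
    }
}
```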
