From c10023ed92ec69188ed3338d6d263f1e2be979ac Mon Sep 17 00:00:00 2001
From: Minoru Osuka
Date: Mon, 23 Jan 2023 22:30:26 +0900
Subject: [PATCH] Use Lindera Analyzer instead of Lindera Tokenizer (#68)

* Use Lindera Analyzer instead of Lindera Tokenizer

* Format

* Update CHANGES.md
---
 CHANGES.md                    |   3 +
 Cargo.toml                    |   2 +-
 benches/bench.rs              |  15 +--
 examples/cc-cedict_example.rs |  30 +++---
 examples/ipadic_example.rs    |  30 ++----
 examples/ko-dic_example.rs    |  30 +++---
 examples/unidic_example.rs    |  30 +++---
 src/stream.rs                 |  40 ++++---
 src/tokenizer.rs              | 190 ++++++++++++++++++++++++++--------
 9 files changed, 232 insertions(+), 138 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index edf87b9..cbf2d5f 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -2,6 +2,9 @@
 All notable changes to this project will be documented in this file.
 This project adheres to [Semantic Versioning](http://semver.org/).
 
+## Unreleased
+- Use Lindera Analyzer instead of Lindera Tokenizer #68 @mosuka
+
 ## 0.20.0 (2023-01-16)
 - Update dependencies #67 @mosuka
 
diff --git a/Cargo.toml b/Cargo.toml
index eb39d94..9252749 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -24,7 +24,7 @@ compress = ["lindera/compress"]
 
 [dependencies]
 tantivy = "0.19.1"
-lindera = "0.20.0"
+lindera = "0.21.0"
 
 [dev-dependencies]
 criterion = { version = "0.4.0", features = ["html_reports"] }
diff --git a/benches/bench.rs b/benches/bench.rs
index 6b289a0..e33e394 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -7,9 +7,7 @@ fn bench_indexing(c: &mut Criterion) {
     use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
     use tantivy::Index;
 
-    use lindera_tantivy::mode::Mode;
     use lindera_tantivy::tokenizer::LinderaTokenizer;
-    use lindera_tantivy::tokenizer::{DictionaryConfig, DictionaryKind, TokenizerConfig};
 
     // create schema builder
     let mut schema_builder = Schema::builder();
@@ -44,17 +42,6 @@ fn bench_indexing(c: &mut Criterion) {
     // create index on memory
     let index = Index::create_in_ram(schema.clone());
 
-    let dictionary = DictionaryConfig {
-        kind: Some(DictionaryKind::IPADIC),
-        path: None,
-    };
-
-    let config = TokenizerConfig {
-        dictionary,
-        user_dictionary: None,
-        mode: Mode::Normal,
-    };
-
     // Test document set.
     let mut docs = Vec::new();
     for i in 0..1000 {
@@ -68,7 +55,7 @@ fn bench_indexing(c: &mut Criterion) {
     // register Lindera tokenizer
     index
         .tokenizers()
-        .register("lang_ja", LinderaTokenizer::from_config(config).unwrap());
+        .register("lang_ja", LinderaTokenizer::default());
 
     // create index writer
     let mut index_writer = index.writer(50_000_000).unwrap();
diff --git a/examples/cc-cedict_example.rs b/examples/cc-cedict_example.rs
index c2bf7d2..25caa97 100644
--- a/examples/cc-cedict_example.rs
+++ b/examples/cc-cedict_example.rs
@@ -1,14 +1,17 @@
 #[cfg(feature = "cc-cedict")]
 fn main() -> tantivy::Result<()> {
-    use tantivy::collector::TopDocs;
-    use tantivy::doc;
-    use tantivy::query::QueryParser;
-    use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
-    use tantivy::Index;
-
-    use lindera_tantivy::mode::Mode;
-    use lindera_tantivy::tokenizer::{
-        DictionaryConfig, DictionaryKind, LinderaTokenizer, TokenizerConfig,
+    use tantivy::{
+        collector::TopDocs,
+        doc,
+        query::QueryParser,
+        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
+        Index,
+    };
+
+    use lindera::tokenizer::Tokenizer;
+    use lindera_tantivy::{
+        mode::Mode,
+        tokenizer::{DictionaryConfig, DictionaryKind, LinderaTokenizer, TokenizerConfig},
     };
 
     // create schema builder
@@ -67,10 +70,13 @@ fn main() -> tantivy::Result<()> {
         mode: Mode::Normal,
     };
 
+    let tokenizer = Tokenizer::from_config(config).unwrap();
+
     // register Lindera tokenizer
-    index
-        .tokenizers()
-        .register("lang_zh", LinderaTokenizer::from_config(config).unwrap());
+    index.tokenizers().register(
+        "lang_zh",
+        LinderaTokenizer::new(Vec::new(), tokenizer, Vec::new()),
+    );
 
     // create index writer
     let mut index_writer = index.writer(50_000_000)?;
diff --git a/examples/ipadic_example.rs b/examples/ipadic_example.rs
index 658baaf..7e0cb88 100644
--- a/examples/ipadic_example.rs
+++ b/examples/ipadic_example.rs
@@ -1,16 +1,15 @@
 #[cfg(feature = "ipadic")]
 fn main() -> tantivy::Result<()> {
-    use tantivy::collector::TopDocs;
-    use tantivy::doc;
-    use tantivy::query::QueryParser;
-    use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
-    use tantivy::Index;
-
-    use lindera_tantivy::mode::Mode;
-    use lindera_tantivy::tokenizer::{
-        DictionaryConfig, DictionaryKind, LinderaTokenizer, TokenizerConfig,
+    use tantivy::{
+        collector::TopDocs,
+        doc,
+        query::QueryParser,
+        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
+        Index,
     };
 
+    use lindera_tantivy::tokenizer::LinderaTokenizer;
+
     // create schema builder
     let mut schema_builder = Schema::builder();
@@ -56,21 +55,10 @@ fn main() -> tantivy::Result<()> {
     // create index on memory
     let index = Index::create_in_ram(schema.clone());
 
-    let dictionary = DictionaryConfig {
-        kind: Some(DictionaryKind::IPADIC),
-        path: None,
-    };
-
-    let config = TokenizerConfig {
-        dictionary,
-        user_dictionary: None,
-        mode: Mode::Normal,
-    };
-
     // register Lindera tokenizer
     index
         .tokenizers()
-        .register("lang_ja", LinderaTokenizer::from_config(config).unwrap());
+        .register("lang_ja", LinderaTokenizer::default());
 
     // create index writer
     let mut index_writer = index.writer(50_000_000)?;
diff --git a/examples/ko-dic_example.rs b/examples/ko-dic_example.rs
index 24aa407..ac3eb41 100644
--- a/examples/ko-dic_example.rs
+++ b/examples/ko-dic_example.rs
@@ -1,14 +1,17 @@
 #[cfg(feature = "ko-dic")]
 fn main() -> tantivy::Result<()> {
-    use tantivy::collector::TopDocs;
-    use tantivy::doc;
-    use tantivy::query::QueryParser;
-    use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
-    use tantivy::Index;
-
-    use lindera_tantivy::mode::Mode;
-    use lindera_tantivy::tokenizer::{
-        DictionaryConfig, DictionaryKind, LinderaTokenizer, TokenizerConfig,
+    use tantivy::{
+        collector::TopDocs,
+        doc,
+        query::QueryParser,
+        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
+        Index,
+    };
+
+    use lindera::tokenizer::Tokenizer;
+    use lindera_tantivy::{
+        mode::Mode,
+        tokenizer::{DictionaryConfig, DictionaryKind, LinderaTokenizer, TokenizerConfig},
     };
 
     // create schema builder
@@ -67,10 +70,13 @@ fn main() -> tantivy::Result<()> {
         mode: Mode::Normal,
     };
 
+    let tokenizer = Tokenizer::from_config(config).unwrap();
+
     // register Lindera tokenizer
-    index
-        .tokenizers()
-        .register("lang_ko", LinderaTokenizer::from_config(config).unwrap());
+    index.tokenizers().register(
+        "lang_ko",
+        LinderaTokenizer::new(Vec::new(), tokenizer, Vec::new()),
+    );
 
     // create index writer
     let mut index_writer = index.writer(50_000_000)?;
diff --git a/examples/unidic_example.rs b/examples/unidic_example.rs
index 5c951cb..99d8d3b 100644
--- a/examples/unidic_example.rs
+++ b/examples/unidic_example.rs
@@ -1,14 +1,17 @@
 #[cfg(feature = "unidic")]
 fn main() -> tantivy::Result<()> {
-    use tantivy::collector::TopDocs;
-    use tantivy::doc;
-    use tantivy::query::QueryParser;
-    use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
-    use tantivy::Index;
-
-    use lindera_tantivy::mode::Mode;
-    use lindera_tantivy::tokenizer::{
-        DictionaryConfig, DictionaryKind, LinderaTokenizer, TokenizerConfig,
+    use tantivy::{
+        collector::TopDocs,
+        doc,
+        query::QueryParser,
+        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
+        Index,
+    };
+
+    use lindera::tokenizer::Tokenizer;
+    use lindera_tantivy::{
+        mode::Mode,
+        tokenizer::{DictionaryConfig, DictionaryKind, LinderaTokenizer, TokenizerConfig},
     };
 
     // create schema builder
@@ -67,10 +70,13 @@ fn main() -> tantivy::Result<()> {
         mode: Mode::Normal,
     };
 
+    let tokenizer = Tokenizer::from_config(config).unwrap();
+
     // register Lindera tokenizer
-    index
-        .tokenizers()
-        .register("lang_ja", LinderaTokenizer::from_config(config).unwrap());
+    index.tokenizers().register(
+        "lang_ja",
+        LinderaTokenizer::new(Vec::new(), tokenizer, Vec::new()),
+    );
 
     // create index writer
     let mut index_writer = index.writer(50_000_000)?;
diff --git a/src/stream.rs b/src/stream.rs
index 79b9438..98c6611 100644
--- a/src/stream.rs
+++ b/src/stream.rs
@@ -1,31 +1,29 @@
+use std::collections::VecDeque;
+
 use tantivy::tokenizer::{Token, TokenStream};
 
 pub struct LinderaTokenStream {
-    pub result: Vec<String>,
-    pub index: usize,
-    pub offset_from: usize,
-    pub token: Token,
+    tokens: VecDeque<Token>,
+    token: Token,
+}
+
+impl LinderaTokenStream {
+    pub fn new(tokens: VecDeque<Token>) -> Self {
+        Self {
+            tokens,
+            token: Default::default(),
+        }
+    }
 }
 
 impl TokenStream for LinderaTokenStream {
     fn advance(&mut self) -> bool {
-        if self.index < self.result.len() {
-            let token = self.result.get(self.index).unwrap();
-
-            self.token = Token {
-                offset_from: self.offset_from,
-                offset_to: self.offset_from + token.len(),
-                position: self.index,
-                text: token.to_string(),
-                position_length: self.result.len(),
-            };
-
-            self.offset_from += token.len();
-            self.index += 1;
-
-            true
-        } else {
-            false
+        match self.tokens.pop_front() {
+            Some(token) => {
+                self.token = token;
+                true
+            }
+            None => false,
         }
     }
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 3b3589e..cb360a1 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -1,15 +1,28 @@
-use tantivy::tokenizer::{BoxTokenStream, Tokenizer as TTokenizer};
+use std::collections::{HashSet, VecDeque};
+
+use tantivy::tokenizer::{BoxTokenStream, Token as TToken, Tokenizer as TTokenizer};
 
 use lindera::{
+    analyzer::Analyzer,
     builder,
+    character_filter::unicode_normalize::{
+        UnicodeNormalizeCharacterFilter, UnicodeNormalizeCharacterFilterConfig,
+        UnicodeNormalizeKind,
+    },
+    token_filter::{
+        japanese_compound_word::{
+            JapaneseCompoundWordTokenFilter, JapaneseCompoundWordTokenFilterConfig,
+        },
+        japanese_number::{JapaneseNumberTokenFilter, JapaneseNumberTokenFilterConfig},
+    },
     tokenizer::{
         DictionaryConfig as LDictionaryConfig, Tokenizer as LTokenizer,
         TokenizerConfig as LTokenizerConfig,
     },
-    DictionaryKind as LDictionaryKind, Token as LToken,
+    BoxCharacterFilter, BoxTokenFilter, DictionaryKind as LDictionaryKind, Token as LToken,
 };
 
-use crate::LinderaResult;
+// use crate::LinderaResult;
 use crate::{mode::Mode, stream::LinderaTokenStream};
 
 pub type DictionaryConfig = LDictionaryConfig;
@@ -18,49 +31,80 @@ pub type TokenizerConfig = LTokenizerConfig;
 pub type Token<'a> = LToken<'a>;
 
 pub struct LinderaTokenizer {
-    pub tokenizer: LTokenizer,
+    pub analyzer: Analyzer,
 }
 
 impl Clone for LinderaTokenizer {
     fn clone(&self) -> Self {
         Self {
-            tokenizer: self.tokenizer.clone(),
+            analyzer: self.analyzer.clone(),
         }
     }
 }
 
 impl LinderaTokenizer {
-    pub fn new() -> LinderaResult<LinderaTokenizer> {
-        let dictionary = builder::load_dictionary_from_kind(DictionaryKind::IPADIC)?;
-
-        Ok(LinderaTokenizer {
-            tokenizer: LTokenizer::new(dictionary, None, Mode::Normal),
-        })
+    pub fn new(
+        character_filters: Vec<BoxCharacterFilter>,
+        tokenizer: LTokenizer,
+        token_filters: Vec<BoxTokenFilter>,
+    ) -> LinderaTokenizer {
+        LinderaTokenizer {
+            analyzer: Analyzer::new(character_filters, tokenizer, token_filters),
+        }
     }
+}
 
-    pub fn from_config(config: TokenizerConfig) -> LinderaResult<LinderaTokenizer> {
-        Ok(LinderaTokenizer {
-            tokenizer: LTokenizer::from_config(config)?,
-        })
+impl Default for LinderaTokenizer {
+    fn default() -> Self {
+        // Add character filters.
+        let mut character_filters: Vec<BoxCharacterFilter> = Vec::new();
+        // Unicode normalize character filter
+        character_filters.push(BoxCharacterFilter::from(
+            UnicodeNormalizeCharacterFilter::new(UnicodeNormalizeCharacterFilterConfig::new(
+                UnicodeNormalizeKind::NFKC,
+            )),
+        ));
+
+        // Tokenizer with IPADIC
+        let dictionary = builder::load_dictionary_from_kind(DictionaryKind::IPADIC).unwrap();
+        let tokenizer = LTokenizer::new(dictionary, None, Mode::Normal);
+
+        // Add token filters.
+        let mut token_filters: Vec<BoxTokenFilter> = Vec::new();
+        // Japanese compound word token filter
+        token_filters.push(BoxTokenFilter::from(JapaneseCompoundWordTokenFilter::new(
+            JapaneseCompoundWordTokenFilterConfig::new(
+                DictionaryKind::IPADIC,
+                HashSet::from(["名詞,数".to_string()]),
+                Some("名詞,数".to_string()),
+            ),
+        )));
+        // Japanese number token filter
+        token_filters.push(BoxTokenFilter::from(JapaneseNumberTokenFilter::new(
+            JapaneseNumberTokenFilterConfig::new(Some(HashSet::from(["名詞,数".to_string()]))),
+        )));
+
+        Self::new(character_filters, tokenizer, token_filters)
     }
 }
 
 impl TTokenizer for LinderaTokenizer {
     fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
-        let result = match self.tokenizer.tokenize(text) {
-            Ok(result) => result
+        let tokens = match self.analyzer.analyze(&mut text.to_string()) {
+            Ok(lindera_tokens) => lindera_tokens
                 .iter()
-                .map(|token| token.get_text().to_string())
-                .collect(),
-            Err(_err) => Vec::new(),
+                .map(|lindera_token| TToken {
+                    offset_from: lindera_token.byte_start,
+                    offset_to: lindera_token.byte_end,
+                    position: lindera_token.position,
+                    text: lindera_token.get_text().to_string(),
+                    position_length: lindera_token.position_length,
+                })
+                .collect::<VecDeque<_>>(),
+            Err(_err) => VecDeque::new(),
         };
 
-        BoxTokenStream::from(LinderaTokenStream {
-            result,
-            token: Default::default(),
-            index: 0,
-            offset_from: 0,
-        })
+        BoxTokenStream::from(LinderaTokenStream::new(tokens))
     }
 }
@@ -69,8 +113,7 @@ impl TTokenizer for LinderaTokenizer {
 mod tests {
     use tantivy::tokenizer::{BoxTokenStream, Token, Tokenizer};
 
-    use crate::mode::Mode;
-    use crate::tokenizer::{DictionaryConfig, DictionaryKind, LinderaTokenizer, TokenizerConfig};
+    use crate::tokenizer::LinderaTokenizer;
 
     fn test_helper(mut tokenizer: BoxTokenStream) -> Vec<Token> {
         let mut tokens: Vec<Token> = vec![];
@@ -79,23 +122,9 @@ mod tests {
     }
 
     #[test]
-    fn test_tokenizer_equal() {
-        let dictionary = DictionaryConfig {
-            kind: Some(DictionaryKind::IPADIC),
-            path: None,
-        };
-
-        let config = TokenizerConfig {
-            dictionary,
-            user_dictionary: None,
-            mode: Mode::Normal,
-        };
-
-        let tokens = test_helper(
-            LinderaTokenizer::from_config(config)
-                .unwrap()
-                .token_stream("すもももももももものうち"),
-        );
+    fn test_tokenizer() {
+        let tokens =
+            test_helper(LinderaTokenizer::default().token_stream("すもももももももものうち"));
         assert_eq!(tokens.len(), 7);
         {
             let token = &tokens[0];
@@ -103,6 +132,7 @@ mod tests {
             assert_eq!(token.offset_from, 0);
             assert_eq!(token.offset_to, 9);
             assert_eq!(token.position, 0);
+            assert_eq!(token.position_length, 1);
         }
         {
             let token = &tokens[1];
@@ -110,6 +140,7 @@ mod tests {
             assert_eq!(token.offset_from, 9);
             assert_eq!(token.offset_to, 12);
             assert_eq!(token.position, 1);
+            assert_eq!(token.position_length, 1);
         }
         {
             let token = &tokens[2];
@@ -117,6 +148,7 @@ mod tests {
             assert_eq!(token.offset_from, 12);
             assert_eq!(token.offset_to, 18);
             assert_eq!(token.position, 2);
+            assert_eq!(token.position_length, 1);
         }
         {
             let token = &tokens[3];
@@ -124,6 +156,7 @@ mod tests {
             assert_eq!(token.offset_from, 18);
             assert_eq!(token.offset_to, 21);
             assert_eq!(token.position, 3);
+            assert_eq!(token.position_length, 1);
         }
         {
             let token = &tokens[4];
@@ -131,6 +164,7 @@ mod tests {
             assert_eq!(token.offset_from, 21);
             assert_eq!(token.offset_to, 27);
             assert_eq!(token.position, 4);
+            assert_eq!(token.position_length, 1);
         }
         {
             let token = &tokens[5];
@@ -138,6 +172,7 @@ mod tests {
             assert_eq!(token.offset_from, 27);
             assert_eq!(token.offset_to, 30);
             assert_eq!(token.position, 5);
+            assert_eq!(token.position_length, 1);
         }
         {
             let token = &tokens[6];
@@ -145,6 +180,71 @@ mod tests {
             assert_eq!(token.offset_from, 30);
             assert_eq!(token.offset_to, 36);
             assert_eq!(token.position, 6);
+            assert_eq!(token.position_length, 1);
+        }
+    }
+
+    #[test]
+    fn test_tokenizer_lindera() {
+        let tokens = test_helper(
+            LinderaTokenizer::default().token_stream("Linderaは形態素解析エンジンです。"),
+        );
+        assert_eq!(tokens.len(), 7);
+        {
+            let token = &tokens[0];
+            assert_eq!(token.text, "Lindera");
+            assert_eq!(token.offset_from, 0);
+            assert_eq!(token.offset_to, 21);
+            assert_eq!(token.position, 0);
+            assert_eq!(token.position_length, 1);
+        }
+        {
+            let token = &tokens[1];
+            assert_eq!(token.text, "は");
+            assert_eq!(token.offset_from, 21);
+            assert_eq!(token.offset_to, 24);
+            assert_eq!(token.position, 1);
+            assert_eq!(token.position_length, 1);
+        }
+        {
+            let token = &tokens[2];
+            assert_eq!(token.text, "形態素");
+            assert_eq!(token.offset_from, 24);
+            assert_eq!(token.offset_to, 33);
+            assert_eq!(token.position, 2);
+            assert_eq!(token.position_length, 1);
+        }
+        {
+            let token = &tokens[3];
+            assert_eq!(token.text, "解析");
+            assert_eq!(token.offset_from, 33);
+            assert_eq!(token.offset_to, 39);
+            assert_eq!(token.position, 3);
+            assert_eq!(token.position_length, 1);
+        }
+        {
+            let token = &tokens[4];
+            assert_eq!(token.text, "エンジン");
+            assert_eq!(token.offset_from, 39);
+            assert_eq!(token.offset_to, 54);
+            assert_eq!(token.position, 4);
+            assert_eq!(token.position_length, 1);
+        }
+        {
+            let token = &tokens[5];
+            assert_eq!(token.text, "です");
+            assert_eq!(token.offset_from, 54);
+            assert_eq!(token.offset_to, 60);
+            assert_eq!(token.position, 5);
+            assert_eq!(token.position_length, 1);
+        }
+        {
+            let token = &tokens[6];
+            assert_eq!(token.text, "。");
+            assert_eq!(token.offset_from, 60);
+            assert_eq!(token.offset_to, 63);
+            assert_eq!(token.position, 6);
+            assert_eq!(token.position_length, 1);
         }
     }
 }
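
Note on usage: after this change, LinderaTokenizer no longer exposes from_config(). Callers either take the preconfigured pipeline via LinderaTokenizer::default() (NFKC Unicode normalization, IPADIC tokenization, and the Japanese compound-word and number token filters, as wired up in the new Default impl above) or build a lindera Tokenizer themselves and wrap it with LinderaTokenizer::new(character_filters, tokenizer, token_filters). The sketch below mirrors the updated cc-cedict/ko-dic/unidic examples for a Japanese index; it is illustrative only (the helper name register_ja_tokenizer is not part of the patch), and it assumes the crate is built with the ipadic feature.

use tantivy::Index;

use lindera::tokenizer::Tokenizer;
use lindera_tantivy::{
    mode::Mode,
    tokenizer::{DictionaryConfig, DictionaryKind, LinderaTokenizer, TokenizerConfig},
};

// Hypothetical helper (not part of the patch): register a Japanese tokenizer on an index.
fn register_ja_tokenizer(index: &Index) {
    // Dictionary and tokenizer configuration, as in the updated examples.
    let dictionary = DictionaryConfig {
        kind: Some(DictionaryKind::IPADIC),
        path: None,
    };
    let config = TokenizerConfig {
        dictionary,
        user_dictionary: None,
        mode: Mode::Normal,
    };

    // Build the Lindera tokenizer, then wrap it without any character or token filters.
    let tokenizer = Tokenizer::from_config(config).unwrap();
    index.tokenizers().register(
        "lang_ja",
        LinderaTokenizer::new(Vec::new(), tokenizer, Vec::new()),
    );
}

Passing empty filter vectors reproduces the old tokenizer-only behaviour; extra character or token filters can be pushed into those vectors to customize the analysis pipeline, as the Default impl in the patch does.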