Update Lindera to 0.8 and Tantivy to 0.15. (#22)
* Update Lindera to 0.8 and Tantivy to 0.15.
mosuka authored Aug 22, 2021
1 parent 802e11c commit b19d4d5
Showing 4 changed files with 40 additions and 24 deletions.
3 changes: 3 additions & 0 deletions CHANGES.md
@@ -2,6 +2,9 @@
 All notable changes to this project will be documented in this file.
 This project adheres to [Semantic Versioning](http://semver.org/).
 
+## Unreleased
+- Update Lindera to 0.8 and Tantivy to 0.15. #22 @mosuka
+
 ## 0.7.2 (2021-02-08)
 - Upgrade Tantivy to 0.14.0 #19 @mosuka
 - Bump up version to 0.7.2 #21 @mosuka
6 changes: 3 additions & 3 deletions Cargo.toml
@@ -17,7 +17,7 @@ categories = ["text-processing"]
 license = "MIT"
 
 [dependencies]
-tantivy = "0.14"
+tantivy = "0.15"
 
-lindera = "0.7"
-lindera-core = "0.7"
+lindera = "0.8.0"
+lindera-core = "0.8.0"
13 changes: 11 additions & 2 deletions examples/basic_example.rs
@@ -1,9 +1,12 @@
-use lindera_tantivy::tokenizer::LinderaTokenizer;
 use tantivy::collector::TopDocs;
 use tantivy::query::QueryParser;
 use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
 use tantivy::{doc, Index};
 
+use lindera::tokenizer::TokenizerConfig;
+use lindera_core::viterbi::{Mode, Penalty};
+use lindera_tantivy::tokenizer::LinderaTokenizer;
+
 fn main() -> tantivy::Result<()> {
     // create schema builder
     let mut schema_builder = Schema::builder();
@@ -50,10 +53,16 @@ fn main() -> tantivy::Result<()> {
     // create index on memory
     let index = Index::create_in_ram(schema.clone());
 
+    let config = TokenizerConfig {
+        dict_path: None,
+        user_dict_path: None,
+        mode: Mode::Decompose(Penalty::default()),
+    };
+
     // register Lindera tokenizer
     index
         .tokenizers()
-        .register("lang_ja", LinderaTokenizer::new("decompose", ""));
+        .register("lang_ja", LinderaTokenizer::with_config(config).unwrap());
 
     // create index writer
     let mut index_writer = index.writer(50_000_000)?;
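For reference, here is the new registration step from this example as a self-contained sketch. It assumes the lindera 0.8 API exactly as it appears in the diff above; the empty schema, the `expect` message, and the comments are illustrative additions, not part of the commit:

```rust
use lindera::tokenizer::TokenizerConfig;
use lindera_core::viterbi::{Mode, Penalty};
use lindera_tantivy::tokenizer::LinderaTokenizer;
use tantivy::schema::Schema;
use tantivy::Index;

fn main() {
    // Any schema works for registration; an empty one keeps the sketch short.
    let schema = Schema::builder().build();
    let index = Index::create_in_ram(schema);

    // `None` for both dictionary paths presumably selects the dictionary
    // bundled with lindera; Decompose mode splits compound nouns apart.
    let config = TokenizerConfig {
        dict_path: None,
        user_dict_path: None,
        mode: Mode::Decompose(Penalty::default()),
    };

    // `with_config` is fallible in lindera 0.8, so surface the error with
    // a message rather than a bare `unwrap`.
    let tokenizer =
        LinderaTokenizer::with_config(config).expect("failed to build Lindera tokenizer");
    index.tokenizers().register("lang_ja", tokenizer);
}
```

In the example itself a bare `unwrap()` is defensible, since a missing dictionary should abort the program immediately.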
42 changes: 23 additions & 19 deletions src/tokenizer.rs
@@ -1,8 +1,10 @@
-use crate::stream::LinderaTokenStream;
-use lindera::tokenizer::Tokenizer as LTokenizer;
-use lindera_core::core::viterbi::{Mode, Penalty};
 use tantivy::tokenizer::{BoxTokenStream, Tokenizer};
+
+use lindera::tokenizer::{Tokenizer as LTokenizer, TokenizerConfig};
+use lindera_core::LinderaResult;
+
+use crate::stream::LinderaTokenStream;
 
 /// Tokenize text with the specified mode and dictionary.
 ///
 /// Example: `すもももももももものうち` would be tokenized as (mode: "normal", dict: "")
@@ -18,7 +20,7 @@ use tantivy::tokenizer::{BoxTokenStream, Tokenizer};
 /// use lindera_tantivy::tokenizer::*;
 /// use tantivy::tokenizer::Tokenizer;
 ///
-/// let tokenizer = LinderaTokenizer::new("normal", "");
+/// let tokenizer = LinderaTokenizer::new().unwrap();
 /// let mut stream = tokenizer.token_stream("すもももももももものうち");
 /// {
 ///     let token = stream.next().unwrap();
@@ -70,26 +72,26 @@ pub struct LinderaTokenizer {
 }
 
 impl LinderaTokenizer {
-    pub fn new(mode: &str, dict: &str) -> LinderaTokenizer {
-        let mode = match mode {
-            "normal" => Mode::Normal,
-            "decompose" => Mode::Decompose(Penalty::default()),
-            _ => {
-                // show error message
-                println!("unsupported mode: {}", mode);
-                Mode::Normal
-            }
-        };
-        LinderaTokenizer {
-            tokenizer: LTokenizer::new(mode, dict),
-        }
+    pub fn new() -> LinderaResult<LinderaTokenizer> {
+        Ok(LinderaTokenizer {
+            tokenizer: LTokenizer::new()?,
+        })
+    }
+
+    pub fn with_config(config: TokenizerConfig) -> LinderaResult<LinderaTokenizer> {
+        Ok(LinderaTokenizer {
+            tokenizer: LTokenizer::with_config(config)?,
+        })
     }
 }
 
 impl Tokenizer for LinderaTokenizer {
     fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
         let mut tokenizer = self.tokenizer.clone();
-        let result = tokenizer.tokenize(text);
+        let result = match tokenizer.tokenize(text) {
+            Ok(result) => result,
+            Err(_err) => Vec::new(),
+        };
 
         BoxTokenStream::from(LinderaTokenStream {
             result,
@@ -114,7 +116,9 @@ mod tests {
     #[test]
     fn test_tokenizer_equal() {
         let tokens = test_helper(
-            LinderaTokenizer::new("normal", "").token_stream("すもももももももものうち"),
+            LinderaTokenizer::new()
+                .unwrap()
+                .token_stream("すもももももももものうち"),
         );
         assert_eq!(tokens.len(), 7);
         {
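Taken together, the `src/tokenizer.rs` changes make both constructors fallible and drop the stringly-typed mode argument. Below is a hedged before/after sketch for downstream callers, using only the signatures visible in this diff; the `build` helper is a hypothetical name:

```rust
use lindera::tokenizer::TokenizerConfig;
use lindera_core::viterbi::{Mode, Penalty};
use lindera_core::LinderaResult;
use lindera_tantivy::tokenizer::LinderaTokenizer;

fn build() -> LinderaResult<()> {
    // 0.7: infallible, and an unrecognized mode string silently fell
    // back to Mode::Normal after printing a message to stdout.
    // let tokenizer = LinderaTokenizer::new("decompose", "");

    // 0.8: `new()` returns a LinderaResult, so dictionary-loading
    // errors propagate with `?` instead of being ignored.
    let _default = LinderaTokenizer::new()?;

    // The old string arguments become an explicit config struct.
    let _decompose = LinderaTokenizer::with_config(TokenizerConfig {
        dict_path: None,
        user_dict_path: None,
        mode: Mode::Decompose(Penalty::default()),
    })?;

    Ok(())
}
```

Note also that `token_stream` now swallows tokenization errors and yields an empty token stream, since Tantivy's `Tokenizer` trait offers no way to report failure.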
