Commit ae04c92

Update dependencies (#72)

mosuka authored Feb 23, 2023
1 parent e0389b2 commit ae04c92
Showing 10 changed files with 107 additions and 246 deletions.
11 changes: 6 additions & 5 deletions Cargo.toml
@@ -13,18 +13,19 @@ license = "MIT"
 
 [features]
 default = ["ipadic"] # Japanese dictionary
-cjk = ["cc-cedict", "ipadic", "ko-dic"]
-all-dictionaries = ["ipadic", "unidic", "ko-dic", "cc-cedict"]
 ipadic = ["lindera/ipadic"] # Japanese dictionary
 unidic = ["lindera/unidic"] # Japanese dictionary
 ko-dic = ["lindera/ko-dic"] # Korean dictionary
 cc-cedict = ["lindera/cc-cedict"] # Chinese dictionary
-compress = ["lindera/compress"]
+ipadic-compress = ["lindera/ipadic-compress"]
+unidic-compress = ["lindera/unidic-compress"]
+ko-dic-compress = ["lindera/ko-dic-compress"]
+cc-cedict-compress = ["lindera/cc-cedict-compress"]
 
 [dependencies]
-tantivy = "0.19.1"
+tantivy = "0.19.2"
 
-lindera = "0.21.0"
+lindera = "0.23.0"
 
 [dev-dependencies]
 criterion = { version = "0.4.0", features = ["html_reports"] }
61 changes: 10 additions & 51 deletions README.md
@@ -24,8 +24,6 @@ lindera-tantivy = { version = "0.12.0", features = ["ipadic"] }
 ### Basic example
 
 ```rust
-use std::collections::HashSet;
-
 use tantivy::{
     collector::TopDocs,
     doc,
@@ -34,23 +32,10 @@ use tantivy::{
     Index,
 };
 
-use lindera::{
-    builder,
-    character_filter::unicode_normalize::{
-        UnicodeNormalizeCharacterFilter, UnicodeNormalizeCharacterFilterConfig,
-        UnicodeNormalizeKind,
-    },
-    mode::Mode,
-    token_filter::{
-        japanese_compound_word::{
-            JapaneseCompoundWordTokenFilter, JapaneseCompoundWordTokenFilterConfig,
-        },
-        japanese_number::{JapaneseNumberTokenFilter, JapaneseNumberTokenFilterConfig},
-    },
-    tokenizer::Tokenizer,
-    BoxCharacterFilter, BoxTokenFilter, DictionaryKind,
+use lindera_tantivy::{
+    dictionary::load_dictionary, tokenizer::LinderaTokenizer, DictionaryConfig, DictionaryKind,
+    Mode,
 };
-use lindera_tantivy::tokenizer::LinderaTokenizer;
 
 fn main() -> tantivy::Result<()> {
     // create schema builder
@@ -98,42 +83,16 @@ fn main() -> tantivy::Result<()> {
     // create index on memory
     let index = Index::create_in_ram(schema.clone());
 
-    // Character filters.
-    // Filters are executed in the order in which they are added.
-    // Character filters are performed before the text is tokenized in the tokenizer.
-    let mut character_filters: Vec<BoxCharacterFilter> = Vec::new();
-    // Unicode normalize character filter
-    character_filters.push(BoxCharacterFilter::from(
-        UnicodeNormalizeCharacterFilter::new(UnicodeNormalizeCharacterFilterConfig::new(
-            UnicodeNormalizeKind::NFKC,
-        )),
-    ));
-
     // Tokenizer with IPADIC
-    let dictionary = builder::load_dictionary_from_kind(DictionaryKind::IPADIC).unwrap();
-    let tokenizer = LTokenizer::new(dictionary, None, Mode::Normal);
-
-    // Token filters.
-    // Filters are executed in the order in which they are added.
-    // Token filters are performed after the text is tokenized in the tokenizer.
-    let mut token_filters: Vec<BoxTokenFilter> = Vec::new();
-    // Japanese compound word token filter
-    token_filters.push(BoxTokenFilter::from(JapaneseCompoundWordTokenFilter::new(
-        JapaneseCompoundWordTokenFilterConfig::new(
-            DictionaryKind::IPADIC,
-            HashSet::from(["名詞,数".to_string()]),
-            Some("名詞,数".to_string()),
-        ),
-    )));
-    // Japanese number token filter
-    token_filters.push(BoxTokenFilter::from(JapaneseNumberTokenFilter::new(
-        JapaneseNumberTokenFilterConfig::new(Some(HashSet::from(["名詞,数".to_string()]))),
-    )));
+    let dictionary_config = DictionaryConfig {
+        kind: Some(DictionaryKind::IPADIC),
+        path: None,
+    };
+    let dictionary = load_dictionary(dictionary_config).unwrap();
+    let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
 
     // register Lindera tokenizer
-    index
-        .tokenizers()
-        .register("lang_ja", LinderaTokenizer::new(character_filters, tokenizer, token_filters));
+    index.tokenizers().register("lang_ja", tokenizer);
 
     // create index writer
     let mut index_writer = index.writer(50_000_000)?;
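Taken together, the README's new example reduces to: build a DictionaryConfig, load a dictionary, construct a LinderaTokenizer, and register it with the index. For reference, here is a hedged, self-contained sketch assembled from the added lines above; the schema setup and sample document are illustrative stand-ins for the README's unchanged context, not verbatim from it:

```rust
use tantivy::{
    doc,
    schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
    Index,
};

use lindera_tantivy::{
    dictionary::load_dictionary, tokenizer::LinderaTokenizer, DictionaryConfig, DictionaryKind,
    Mode,
};

fn main() -> tantivy::Result<()> {
    // Minimal schema: one stored text field analyzed by the "lang_ja" tokenizer.
    let mut schema_builder = Schema::builder();
    let indexing = TextFieldIndexing::default()
        .set_tokenizer("lang_ja") // must match the name registered below
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    let options = TextOptions::default()
        .set_indexing_options(indexing)
        .set_stored();
    let text = schema_builder.add_text_field("text", options);
    let schema = schema_builder.build();

    // Index in RAM, as in the README example.
    let index = Index::create_in_ram(schema);

    // Tokenizer with IPADIC, via the new DictionaryConfig / load_dictionary API.
    let dictionary_config = DictionaryConfig {
        kind: Some(DictionaryKind::IPADIC),
        path: None,
    };
    let dictionary = load_dictionary(dictionary_config).unwrap();
    let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
    index.tokenizers().register("lang_ja", tokenizer);

    // Index one (illustrative) document to show the analyzer is wired up.
    let mut index_writer = index.writer(50_000_000)?;
    index_writer.add_document(doc!(text => "羽田空港限定トートバッグ"))?;
    index_writer.commit()?;

    Ok(())
}
```

The design point of the change is visible here: the character-filter and token-filter plumbing moved out of the tokenizer constructor, which now takes only a dictionary, an optional user dictionary, and a segmentation mode.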
23 changes: 7 additions & 16 deletions examples/cc-cedict_example.rs
@@ -8,10 +8,9 @@ fn main() -> tantivy::Result<()> {
         Index,
     };
 
-    use lindera::tokenizer::Tokenizer;
     use lindera_tantivy::{
-        mode::Mode,
-        tokenizer::{DictionaryConfig, DictionaryKind, LinderaTokenizer, TokenizerConfig},
+        dictionary::load_dictionary, tokenizer::LinderaTokenizer, DictionaryConfig, DictionaryKind,
+        Mode,
     };
 
     // create schema builder
@@ -59,24 +58,16 @@ fn main() -> tantivy::Result<()> {
     // create index on memory
     let index = Index::create_in_ram(schema.clone());
 
-    let dictionary = DictionaryConfig {
+    // Tokenizer with CC-CEDICT
+    let dictionary_config = DictionaryConfig {
         kind: Some(DictionaryKind::CcCedict),
         path: None,
     };
-
-    let config = TokenizerConfig {
-        dictionary,
-        user_dictionary: None,
-        mode: Mode::Normal,
-    };
-
-    let tokenizer = Tokenizer::from_config(config).unwrap();
+    let dictionary = load_dictionary(dictionary_config).unwrap();
+    let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
 
     // register Lindera tokenizer
-    index.tokenizers().register(
-        "lang_zh",
-        LinderaTokenizer::new(Vec::new(), tokenizer, Vec::new()),
-    );
+    index.tokenizers().register("lang_zh", tokenizer);
 
     // create index writer
     let mut index_writer = index.writer(50_000_000)?;
62 changes: 10 additions & 52 deletions examples/ipadic_example.rs
@@ -1,7 +1,5 @@
 #[cfg(feature = "ipadic")]
 fn main() -> tantivy::Result<()> {
-    use std::collections::HashSet;
-
     use tantivy::{
         collector::TopDocs,
         doc,
@@ -10,23 +8,10 @@ fn main() -> tantivy::Result<()> {
         Index,
     };
 
-    use lindera::{
-        builder,
-        character_filter::unicode_normalize::{
-            UnicodeNormalizeCharacterFilter, UnicodeNormalizeCharacterFilterConfig,
-            UnicodeNormalizeKind,
-        },
-        mode::Mode,
-        token_filter::{
-            japanese_compound_word::{
-                JapaneseCompoundWordTokenFilter, JapaneseCompoundWordTokenFilterConfig,
-            },
-            japanese_number::{JapaneseNumberTokenFilter, JapaneseNumberTokenFilterConfig},
-        },
-        tokenizer::Tokenizer,
-        BoxCharacterFilter, BoxTokenFilter, DictionaryKind,
+    use lindera_tantivy::{
+        dictionary::load_dictionary, tokenizer::LinderaTokenizer, DictionaryConfig, DictionaryKind,
+        Mode,
     };
-    use lindera_tantivy::tokenizer::LinderaTokenizer;
 
     // create schema builder
     let mut schema_builder = Schema::builder();
@@ -73,43 +58,16 @@ fn main() -> tantivy::Result<()> {
     // create index on memory
     let index = Index::create_in_ram(schema.clone());
 
-    // Character filters.
-    // Filters are executed in the order in which they are added.
-    // Character filters are performed before the text is tokenized in the tokenizer.
-    let mut character_filters: Vec<BoxCharacterFilter> = Vec::new();
-    // Unicode normalize character filter
-    character_filters.push(BoxCharacterFilter::from(
-        UnicodeNormalizeCharacterFilter::new(UnicodeNormalizeCharacterFilterConfig::new(
-            UnicodeNormalizeKind::NFKC,
-        )),
-    ));
-
     // Tokenizer with IPADIC
-    let dictionary = builder::load_dictionary_from_kind(DictionaryKind::IPADIC).unwrap();
-    let tokenizer = Tokenizer::new(dictionary, None, Mode::Normal);
-
-    // Token filters.
-    // Filters are executed in the order in which they are added.
-    // Token filters are performed after the text is tokenized in the tokenizer.
-    let mut token_filters: Vec<BoxTokenFilter> = Vec::new();
-    // Japanese compound word token filter
-    token_filters.push(BoxTokenFilter::from(JapaneseCompoundWordTokenFilter::new(
-        JapaneseCompoundWordTokenFilterConfig::new(
-            DictionaryKind::IPADIC,
-            HashSet::from(["名詞,数".to_string()]),
-            Some("名詞,数".to_string()),
-        ),
-    )));
-    // Japanese number token filter
-    token_filters.push(BoxTokenFilter::from(JapaneseNumberTokenFilter::new(
-        JapaneseNumberTokenFilterConfig::new(Some(HashSet::from(["名詞,数".to_string()]))),
-    )));
+    let dictionary_config = DictionaryConfig {
+        kind: Some(DictionaryKind::IPADIC),
+        path: None,
+    };
+    let dictionary = load_dictionary(dictionary_config).unwrap();
+    let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
 
     // register Lindera tokenizer
-    index.tokenizers().register(
-        "lang_ja",
-        LinderaTokenizer::new(character_filters, tokenizer, token_filters),
-    );
+    index.tokenizers().register("lang_ja", tokenizer);
 
     // create index writer
     let mut index_writer = index.writer(50_000_000)?;
23 changes: 7 additions & 16 deletions examples/ko-dic_example.rs
@@ -8,10 +8,9 @@ fn main() -> tantivy::Result<()> {
         Index,
     };
 
-    use lindera::tokenizer::Tokenizer;
     use lindera_tantivy::{
-        mode::Mode,
-        tokenizer::{DictionaryConfig, DictionaryKind, LinderaTokenizer, TokenizerConfig},
+        dictionary::load_dictionary, tokenizer::LinderaTokenizer, DictionaryConfig, DictionaryKind,
+        Mode,
     };
 
     // create schema builder
@@ -59,24 +58,16 @@ fn main() -> tantivy::Result<()> {
     // create index on memory
     let index = Index::create_in_ram(schema.clone());
 
-    let dictionary = DictionaryConfig {
+    // Tokenizer with ko-dic
+    let dictionary_config = DictionaryConfig {
         kind: Some(DictionaryKind::KoDic),
         path: None,
     };
-
-    let config = TokenizerConfig {
-        dictionary,
-        user_dictionary: None,
-        mode: Mode::Normal,
-    };
-
-    let tokenizer = Tokenizer::from_config(config).unwrap();
+    let dictionary = load_dictionary(dictionary_config).unwrap();
+    let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
 
     // register Lindera tokenizer
-    index.tokenizers().register(
-        "lang_ko",
-        LinderaTokenizer::new(Vec::new(), tokenizer, Vec::new()),
-    );
+    index.tokenizers().register("lang_ko", tokenizer);
 
     // create index writer
     let mut index_writer = index.writer(50_000_000)?;
23 changes: 7 additions & 16 deletions examples/unidic_example.rs
@@ -8,10 +8,9 @@ fn main() -> tantivy::Result<()> {
         Index,
     };
 
-    use lindera::tokenizer::Tokenizer;
     use lindera_tantivy::{
-        mode::Mode,
-        tokenizer::{DictionaryConfig, DictionaryKind, LinderaTokenizer, TokenizerConfig},
+        dictionary::load_dictionary, tokenizer::LinderaTokenizer, DictionaryConfig, DictionaryKind,
+        Mode,
    };
 
     // create schema builder
@@ -59,24 +58,16 @@ fn main() -> tantivy::Result<()> {
     // create index on memory
     let index = Index::create_in_ram(schema.clone());
 
-    let dictionary = DictionaryConfig {
+    // Tokenizer with UniDic
+    let dictionary_config = DictionaryConfig {
         kind: Some(DictionaryKind::UniDic),
         path: None,
     };
-
-    let config = TokenizerConfig {
-        dictionary,
-        user_dictionary: None,
-        mode: Mode::Normal,
-    };
-
-    let tokenizer = Tokenizer::from_config(config).unwrap();
+    let dictionary = load_dictionary(dictionary_config).unwrap();
+    let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
 
     // register Lindera tokenizer
-    index.tokenizers().register(
-        "lang_ja",
-        LinderaTokenizer::new(Vec::new(), tokenizer, Vec::new()),
-    );
+    index.tokenizers().register("lang_ja", tokenizer);
 
     // create index writer
     let mut index_writer = index.writer(50_000_000)?;
5 changes: 5 additions & 0 deletions src/dictionary.rs
@@ -0,0 +1,5 @@
+use crate::{Dictionary, DictionaryConfig, LinderaResult};
+
+pub fn load_dictionary(dictionary_config: DictionaryConfig) -> LinderaResult<Dictionary> {
+    lindera::dictionary::load_dictionary(dictionary_config)
+}
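This new module simply forwards to lindera::dictionary::load_dictionary, so dependents can build dictionaries without importing lindera directly. A minimal usage sketch, relying only on the types re-exported from src/lib.rs below (the build_ipadic helper is hypothetical, for illustration; it requires the "ipadic" feature):

```rust
use lindera_tantivy::{
    dictionary::load_dictionary, Dictionary, DictionaryConfig, DictionaryKind, LinderaResult,
};

// Hypothetical helper: load the bundled IPADIC dictionary.
fn build_ipadic() -> LinderaResult<Dictionary> {
    let config = DictionaryConfig {
        kind: Some(DictionaryKind::IPADIC),
        path: None, // or Some(path) to point at a self-built dictionary directory
    };
    load_dictionary(config)
}
```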
13 changes: 9 additions & 4 deletions src/lib.rs
@@ -1,7 +1,12 @@
-pub mod mode;
+pub mod dictionary;
 pub mod stream;
 pub mod tokenizer;
 
-use lindera::LinderaResult as LLinderaResult;
-
-pub type LinderaResult<T> = LLinderaResult<T>;
+pub type LinderaResult<T> = lindera::LinderaResult<T>;
+pub type Penalty = lindera::mode::Penalty;
+pub type Mode = lindera::mode::Mode;
+pub type DictionaryConfig = lindera::dictionary::DictionaryConfig;
+pub type UserDictionryConfig = lindera::dictionary::UserDictionaryConfig;
+pub type DictionaryKind = lindera::DictionaryKind;
+pub type Dictionary = lindera::Dictionary;
+pub type UserDictionary = lindera::UserDictionary;
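These aliases replace the deleted src/mode.rs wrapper (below) and let callers stay on lindera_tantivy for every core type. A small sketch of what that looks like downstream, assuming lindera 0.23's Mode and Penalty shapes behind these aliases:

```rust
use lindera_tantivy::{Mode, Penalty};

// In lindera, Mode::Decompose carries a Penalty that controls how
// aggressively long words are split; Mode::Normal performs no splitting.
fn segmentation_mode(decompose: bool) -> Mode {
    if decompose {
        Mode::Decompose(Penalty::default())
    } else {
        Mode::Normal
    }
}
```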
4 changes: 0 additions & 4 deletions src/mode.rs

This file was deleted.
