diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 428c495..b1a196d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -19,19 +19,19 @@ jobs: - runner: ubuntu-latest target: x86_64-unknown-linux-gnu archive: .zip - extension: ".a" + extension: ".so" - runner: macOS-latest target: x86_64-apple-darwin archive: .zip - extension: ".a" + extension: ".dylib" - runner: macOS-latest target: aarch64-apple-darwin archive: .zip - extension: ".a" + extension: ".dylib" - runner: windows-latest target: x86_64-pc-windows-msvc archive: .zip - extension: ".ilb" + extension: ".dll" toolchain: [stable] features: ["ipadic", "ko-dic", "cc-cedict"] runs-on: ${{ matrix.platform.runner }} diff --git a/.gitignore b/.gitignore index bfeeed4..95bc0e9 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ target Cargo.lock + +*.db diff --git a/Cargo.toml b/Cargo.toml index dc9d6a6..59fa71f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lindera-sqlite" -version = "0.35.0" +version = "0.38.0" edition = "2021" description = "Lindera tokenizer for SQLite FTS5 extention" documentation = "https://docs.rs/lindera-sqlite" @@ -22,20 +22,20 @@ compress = ["lindera/compress"] # Compress dictionaries extension = [] [lib] -crate-type = ["rlib", "staticlib"] +# crate-type = ["rlib", "staticlib"] +crate-type = ["cdylib"] [profile.release] lto = true [dependencies] -dotenv = "0.15.0" # libc without `std` -libc = { version = "0.2.161", "default-features" = false, features = [] } -serde_json = "1.0.132" +libc = { version = "0.2.164", "default-features" = false, features = [] } +serde_json = "1.0.133" unicode-segmentation = "1.12.0" unicode-normalization = "0.1.22" -lindera = "0.35.0" +lindera = "0.38.0" [dev-dependencies] criterion = "0.5" diff --git a/README.md b/README.md index 3370d35..e12d582 100644 --- a/README.md +++ b/README.md @@ -4,22 +4,44 @@ lindera-sqlite is a C ABI library which exposes a 
[FTS5](https://www.sqlite.org/ When used as a custom FTS5 tokenizer this enables application to support Chinese, Japanese and Korean in full-text search. -## Extension Build/Usage Example +## Build extension ```sh -cargo rustc --features extension -- --crate-type=cdylib +% cargo build --features=ipadic,ko-dic,cc-cedict,compress,extension ``` -Load extension from `./target/release/liblindera_tokenizer.dylib`. +## Set environment variable for Lindera configuration -```sql -CREATE VIRTUAL TABLE -fts -USING fts5(content, tokenize='lindera_tokenizer') +```sh +% export LINDERA_CONFIG_PATH=./resources/lindera.yml ``` -## Generating headers +## Then start SQLite ```sh -cbindgen --profile release . -o target/release/fts5-tokenizer.h +% sqlite3 example.db +``` + +## Load extension + +```sql +sqlite> .load ./target/debug/liblindera_sqlite lindera_fts5_tokenizer_init +``` + +## Create table using FTS5 with Lindera tokenizer + +```sql +sqlite> CREATE VIRTUAL TABLE example USING fts5(content, tokenize='lindera_tokenizer'); +``` + +## Insert data + +```sql +sqlite> INSERT INTO example(content) VALUES ("Linderaは形態素解析エンジンです。ユーザー辞書も利用可能です。"); +``` + +## Search data + +```sql +sqlite> SELECT * FROM example WHERE content MATCH "Lindera" ORDER BY bm25(example) LIMIT 10; ``` diff --git a/resources/lindera.json b/resources/lindera.json deleted file mode 100644 index 0977f68..0000000 --- a/resources/lindera.json +++ /dev/null @@ -1,97 +0,0 @@ -{ - "segmenter": { - "mode": "normal", - "dictionary": { - "kind": "ipadic" - }, - "user_dictionary": { - "path": "./resources/ipadic_simple.csv", - "kind": "ipadic" - } - }, - "character_filters": [ - { - "kind": "unicode_normalize", - "args": { - "kind": "nfkc" - } - }, - { - "kind": "japanese_iteration_mark", - "args": { - "normalize_kanji": true, - "normalize_kana": true - } - } - ], - "token_filters": [ - { - "kind": "japanese_base_form", - "args": { - "kind": "ipadic" - } - }, - { - "kind": "japanese_compound_word", - "args": { - "kind": 
"ipadic", - "tags": [ - "名詞,数" - ], - "new_tag": "名詞,数" - } - }, - { - "kind": "japanese_number", - "args": { - "tags": [ - "名詞,数" - ] - } - }, - { - "kind": "japanese_stop_tags", - "args": { - "tags": [ - "接続詞", - "助詞", - "助詞,格助詞", - "助詞,格助詞,一般", - "助詞,格助詞,引用", - "助詞,格助詞,連語", - "助詞,係助詞", - "助詞,副助詞", - "助詞,間投助詞", - "助詞,並立助詞", - "助詞,終助詞", - "助詞,副助詞/並立助詞/終助詞", - "助詞,連体化", - "助詞,副詞化", - "助詞,特殊", - "助動詞", - "記号", - "記号,一般", - "記号,読点", - "記号,句点", - "記号,空白", - "記号,括弧閉", - "その他,間投", - "フィラー", - "非言語音" - ] - } - }, - { - "kind": "japanese_katakana_stem", - "args": { - "min": 3 - } - }, - { - "kind": "remove_diacritical_mark", - "args": { - "japanese": false - } - } - ] -} diff --git a/resources/lindera.yml b/resources/lindera.yml new file mode 100644 index 0000000..df3f713 --- /dev/null +++ b/resources/lindera.yml @@ -0,0 +1,67 @@ +segmenter: + mode: "normal" + dictionary: + kind: "ipadic" + # user_dictionary: + # path: "./resources/ipadic_simple.csv" + # kind: "ipadic" + +character_filters: + - kind: "unicode_normalize" + args: + kind: "nfkc" + - kind: "japanese_iteration_mark" + args: + normalize_kanji: true + normalize_kana: true + - kind: mapping + args: + mapping: + リンデラ: Lindera + +token_filters: + - kind: "japanese_compound_word" + args: + kind: "ipadic" + tags: + - "名詞,数" + - "名詞,接尾,助数詞" + new_tag: "名詞,数" + - kind: "japanese_number" + args: + tags: + - "名詞,数" + - kind: "japanese_stop_tags" + args: + tags: + - "接続詞" + - "助詞" + - "助詞,格助詞" + - "助詞,格助詞,一般" + - "助詞,格助詞,引用" + - "助詞,格助詞,連語" + - "助詞,係助詞" + - "助詞,副助詞" + - "助詞,間投助詞" + - "助詞,並立助詞" + - "助詞,終助詞" + - "助詞,副助詞/並立助詞/終助詞" + - "助詞,連体化" + - "助詞,副詞化" + - "助詞,特殊" + - "助動詞" + - "記号" + - "記号,一般" + - "記号,読点" + - "記号,句点" + - "記号,空白" + - "記号,括弧閉" + - "その他,間投" + - "フィラー" + - "非言語音" + - kind: "japanese_katakana_stem" + args: + min: 3 + - kind: "remove_diacritical_mark" + args: + japanese: false diff --git a/src/common.rs b/src/common.rs index 46602ad..b1ddd09 100644 --- a/src/common.rs +++ b/src/common.rs @@ -1,15 +1,16 @@ use 
libc::{c_char, c_int, c_void}; -use lindera::tokenizer::Tokenizer; -pub struct Fts5Tokenizer { - pub tokenizer: Tokenizer, -} +use lindera::tokenizer::Tokenizer; // sqlite3.h pub const SQLITE_OK: c_int = 0; pub const SQLITE_INTERNAL: c_int = 2; pub const SQLITE_MISUSE: c_int = 21; +pub struct Fts5Tokenizer { + pub tokenizer: Tokenizer, +} + pub type TokenFunction = extern "C" fn( p_ctx: *mut c_void, t_flags: c_int, diff --git a/src/extension.rs b/src/extension.rs index 6597971..e8d1639 100644 --- a/src/extension.rs +++ b/src/extension.rs @@ -5,11 +5,11 @@ use crate::common::*; use crate::lindera_fts5_tokenize; use crate::load_tokenizer; +pub const FTS5_API_VERSION: c_int = 2; + pub struct Sqlite3 {} struct Sqlite3Stmt {} -pub const FTS5_API_VERSION: c_int = 2; - // fts5.h #[repr(C)] struct Fts5TokenizerApi { diff --git a/src/lib.rs b/src/lib.rs index 955eff9..e2c056d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,32 +4,18 @@ mod common; #[cfg(feature = "extension")] mod extension; -use std::env; -use std::fs::File; -use std::io::BufReader; - -use dotenv::dotenv; use libc::{c_char, c_int, c_uchar, c_void}; -use lindera::tokenizer::{Tokenizer, TokenizerConfig}; +use lindera::tokenizer::{Tokenizer, TokenizerBuilder}; pub use crate::common::*; pub fn load_tokenizer() -> Result { - dotenv().ok(); - - let config_path = - env::var("LINDERA_CONFIG_PATH").unwrap_or_else(|_| "./lindera.json".to_string()); - let config_file = File::open(config_path).map_err(|e| { - eprintln!("Failed to create tokenizer: {}", e); - SQLITE_INTERNAL - })?; - let config_reader = BufReader::new(config_file); - let config: TokenizerConfig = serde_json::from_reader(config_reader).map_err(|e| { - eprintln!("Failed to create tokenizer: {}", e); + let builder = TokenizerBuilder::new().map_err(|e| { + eprintln!("Failed to create tokenizer builder: {}", e); SQLITE_INTERNAL })?; - let tokenizer = Tokenizer::from_config(&config).map_err(|e| { + let tokenizer = builder.build().map_err(|e| { 
eprintln!("Failed to create tokenizer: {}", e); SQLITE_INTERNAL })?;