Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update dependencies #7

Merged
merged 1 commit into from
Nov 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,19 @@ jobs:
- runner: ubuntu-latest
target: x86_64-unknown-linux-gnu
archive: .zip
extension: ".a"
extension: ".so"
- runner: macOS-latest
target: x86_64-apple-darwin
archive: .zip
extension: ".a"
extension: ".dylib"
- runner: macOS-latest
target: aarch64-apple-darwin
archive: .zip
extension: ".a"
extension: ".dylib"
- runner: windows-latest
target: x86_64-pc-windows-msvc
archive: .zip
extension: ".ilb"
extension: ".dll"
toolchain: [stable]
features: ["ipadic", "ko-dic", "cc-cedict"]
runs-on: ${{ matrix.platform.runner }}
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@
target

Cargo.lock

*.db
12 changes: 6 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "lindera-sqlite"
version = "0.35.0"
version = "0.38.0"
edition = "2021"
description = "Lindera tokenizer for SQLite FTS5 extension"
documentation = "https://docs.rs/lindera-sqlite"
Expand All @@ -22,20 +22,20 @@ compress = ["lindera/compress"] # Compress dictionaries
extension = []

[lib]
crate-type = ["rlib", "staticlib"]
# crate-type = ["rlib", "staticlib"]
crate-type = ["cdylib"]

[profile.release]
lto = true

[dependencies]
dotenv = "0.15.0"
# libc without `std`
libc = { version = "0.2.161", "default-features" = false, features = [] }
serde_json = "1.0.132"
libc = { version = "0.2.164", "default-features" = false, features = [] }
serde_json = "1.0.133"
unicode-segmentation = "1.12.0"
unicode-normalization = "0.1.22"

lindera = "0.35.0"
lindera = "0.38.0"

[dev-dependencies]
criterion = "0.5"
Expand Down
40 changes: 31 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,44 @@ lindera-sqlite is a C ABI library which exposes a [FTS5](https://www.sqlite.org/

When used as a custom FTS5 tokenizer this enables applications to support Chinese, Japanese and Korean in full-text search.

## Extension Build/Usage Example
## Build extension

```sh
cargo rustc --features extension -- --crate-type=cdylib
% cargo build --features=ipadic,ko-dic,cc-cedict,compress,extension
```

Load extension from `./target/release/liblindera_tokenizer.dylib`.
## Set environment variable for Lindera configuration

```sql
CREATE VIRTUAL TABLE
fts
USING fts5(content, tokenize='lindera_tokenizer')
```sh
% export LINDERA_CONFIG_PATH=./resources/lindera.yml
```

## Generating headers
## Then start SQLite

```sh
cbindgen --profile release . -o target/release/fts5-tokenizer.h
% sqlite3 example.db
```

## Load extension

```sql
sqlite> .load ./target/debug/liblindera_sqlite lindera_fts5_tokenizer_init
```

## Create table using FTS5 with Lindera tokenizer

```sql
sqlite> CREATE VIRTUAL TABLE example USING fts5(content, tokenize='lindera_tokenizer');
```

## Insert data

```sql
sqlite> INSERT INTO example(content) VALUES ("Linderaは形態素解析エンジンです。ユーザー辞書も利用可能です。");
```

## Search data

```sql
sqlite> SELECT * FROM example WHERE content MATCH "Lindera" ORDER BY bm25(example) LIMIT 10;
```
97 changes: 0 additions & 97 deletions resources/lindera.json

This file was deleted.

67 changes: 67 additions & 0 deletions resources/lindera.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
segmenter:
mode: "normal"
dictionary:
kind: "ipadic"
# user_dictionary:
# path: "./resources/ipadic_simple.csv"
# kind: "ipadic"

character_filters:
- kind: "unicode_normalize"
args:
kind: "nfkc"
- kind: "japanese_iteration_mark"
args:
normalize_kanji: true
normalize_kana: true
- kind: mapping
args:
mapping:
リンデラ: Lindera

token_filters:
- kind: "japanese_compound_word"
args:
kind: "ipadic"
tags:
- "名詞,数"
- "名詞,接尾,助数詞"
new_tag: "名詞,数"
- kind: "japanese_number"
args:
tags:
- "名詞,数"
- kind: "japanese_stop_tags"
args:
tags:
- "接続詞"
- "助詞"
- "助詞,格助詞"
- "助詞,格助詞,一般"
- "助詞,格助詞,引用"
- "助詞,格助詞,連語"
- "助詞,係助詞"
- "助詞,副助詞"
- "助詞,間投助詞"
- "助詞,並立助詞"
- "助詞,終助詞"
- "助詞,副助詞/並立助詞/終助詞"
- "助詞,連体化"
- "助詞,副詞化"
- "助詞,特殊"
- "助動詞"
- "記号"
- "記号,一般"
- "記号,読点"
- "記号,句点"
- "記号,空白"
- "記号,括弧閉"
- "その他,間投"
- "フィラー"
- "非言語音"
- kind: "japanese_katakana_stem"
args:
min: 3
- kind: "remove_diacritical_mark"
args:
japanese: false
9 changes: 5 additions & 4 deletions src/common.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
use libc::{c_char, c_int, c_void};
use lindera::tokenizer::Tokenizer;

pub struct Fts5Tokenizer {
pub tokenizer: Tokenizer,
}
use lindera::tokenizer::Tokenizer;

// sqlite3.h
pub const SQLITE_OK: c_int = 0;
pub const SQLITE_INTERNAL: c_int = 2;
pub const SQLITE_MISUSE: c_int = 21;

pub struct Fts5Tokenizer {
pub tokenizer: Tokenizer,
}

pub type TokenFunction = extern "C" fn(
p_ctx: *mut c_void,
t_flags: c_int,
Expand Down
4 changes: 2 additions & 2 deletions src/extension.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ use crate::common::*;
use crate::lindera_fts5_tokenize;
use crate::load_tokenizer;

pub const FTS5_API_VERSION: c_int = 2;

pub struct Sqlite3 {}
struct Sqlite3Stmt {}

pub const FTS5_API_VERSION: c_int = 2;

// fts5.h
#[repr(C)]
struct Fts5TokenizerApi {
Expand Down
22 changes: 4 additions & 18 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,18 @@ mod common;
#[cfg(feature = "extension")]
mod extension;

use std::env;
use std::fs::File;
use std::io::BufReader;

use dotenv::dotenv;
use libc::{c_char, c_int, c_uchar, c_void};

use lindera::tokenizer::{Tokenizer, TokenizerConfig};
use lindera::tokenizer::{Tokenizer, TokenizerBuilder};

pub use crate::common::*;

pub fn load_tokenizer() -> Result<Tokenizer, c_int> {
dotenv().ok();

let config_path =
env::var("LINDERA_CONFIG_PATH").unwrap_or_else(|_| "./lindera.json".to_string());
let config_file = File::open(config_path).map_err(|e| {
eprintln!("Failed to create tokenizer: {}", e);
SQLITE_INTERNAL
})?;
let config_reader = BufReader::new(config_file);
let config: TokenizerConfig = serde_json::from_reader(config_reader).map_err(|e| {
eprintln!("Failed to create tokenizer: {}", e);
let builder = TokenizerBuilder::new().map_err(|e| {
eprintln!("Failed to create tokenizer builder: {}", e);
SQLITE_INTERNAL
})?;
let tokenizer = Tokenizer::from_config(&config).map_err(|e| {
let tokenizer = builder.build().map_err(|e| {
eprintln!("Failed to create tokenizer: {}", e);
SQLITE_INTERNAL
})?;
Expand Down
Loading