Merge pull request #7 from lindera/update_deps
Update dependencies
mosuka authored Nov 17, 2024
2 parents bd35c3e + 34cfc58 commit 1516744
Showing 9 changed files with 121 additions and 140 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/release.yml
@@ -19,19 +19,19 @@ jobs:
           - runner: ubuntu-latest
             target: x86_64-unknown-linux-gnu
             archive: .zip
-            extension: ".a"
+            extension: ".so"
           - runner: macOS-latest
             target: x86_64-apple-darwin
             archive: .zip
-            extension: ".a"
+            extension: ".dylib"
           - runner: macOS-latest
             target: aarch64-apple-darwin
             archive: .zip
-            extension: ".a"
+            extension: ".dylib"
           - runner: windows-latest
             target: x86_64-pc-windows-msvc
             archive: .zip
-            extension: ".ilb"
+            extension: ".dll"
         toolchain: [stable]
         features: ["ipadic", "ko-dic", "cc-cedict"]
     runs-on: ${{ matrix.platform.runner }}
2 changes: 2 additions & 0 deletions .gitignore
@@ -7,3 +7,5 @@
 target
 
 Cargo.lock
+
+*.db
12 changes: 6 additions & 6 deletions Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "lindera-sqlite"
-version = "0.35.0"
+version = "0.38.0"
 edition = "2021"
 description = "Lindera tokenizer for SQLite FTS5 extension"
 documentation = "https://docs.rs/lindera-sqlite"
@@ -22,20 +22,20 @@ compress = ["lindera/compress"] # Compress dictionaries
 extension = []
 
 [lib]
-crate-type = ["rlib", "staticlib"]
+# crate-type = ["rlib", "staticlib"]
+crate-type = ["cdylib"]
 
 [profile.release]
 lto = true
 
 [dependencies]
 dotenv = "0.15.0"
 # libc without `std`
-libc = { version = "0.2.161", default-features = false, features = [] }
-serde_json = "1.0.132"
+libc = { version = "0.2.164", default-features = false, features = [] }
+serde_json = "1.0.133"
 unicode-segmentation = "1.12.0"
 unicode-normalization = "0.1.22"
 
-lindera = "0.35.0"
+lindera = "0.38.0"
 
 [dev-dependencies]
 criterion = "0.5"
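The `crate-type` switch is the substantive change in this file: SQLite's `.load` command requires a platform shared library, so the crate now builds as a `cdylib` (producing the `.so`/`.dylib`/`.dll` artifacts the release workflow above now expects) instead of a static library.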
40 changes: 31 additions & 9 deletions README.md
@@ -4,22 +4,44 @@ lindera-sqlite is a C ABI library which exposes a [FTS5](https://www.sqlite.org/
 
 When used as a custom FTS5 tokenizer this enables applications to support Chinese, Japanese and Korean in full-text search.
 
-## Extension Build/Usage Example
+## Build extension
 
 ```sh
-cargo rustc --features extension -- --crate-type=cdylib
+% cargo build --features=ipadic,ko-dic,cc-cedict,compress,extension
 ```
 
-Load extension from `./target/release/liblindera_tokenizer.dylib`.
+## Set environment variable for Lindera configuration
 
-```sql
-CREATE VIRTUAL TABLE
-  fts
-USING fts5(content, tokenize='lindera_tokenizer')
+```sh
+% export LINDERA_CONFIG_PATH=./resources/lindera.yml
 ```
 
-## Generating headers
+## Then start SQLite
 
 ```sh
-cbindgen --profile release . -o target/release/fts5-tokenizer.h
+% sqlite3 example.db
 ```
 
+## Load extension
+
+```sql
+sqlite> .load ./target/debug/liblindera_sqlite lindera_fts5_tokenizer_init
+```
+
+## Create table using FTS5 with Lindera tokenizer
+
+```sql
+sqlite> CREATE VIRTUAL TABLE example USING fts5(content, tokenize='lindera_tokenizer');
+```
+
+## Insert data
+
+```sql
+sqlite> INSERT INTO example(content) VALUES ("Linderaは形態素解析エンジンです。ユーザー辞書も利用可能です。");
+```
+
+## Search data
+
+```sql
+sqlite> SELECT * FROM example WHERE content MATCH "Lindera" ORDER BY bm25(example) LIMIT 10;
+```
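The custom tokenizer is what makes the `MATCH` query useful for CJK text: FTS5's built-in `unicode61` tokenizer only splits on whitespace and punctuation, so an unsegmented Japanese sentence like the one inserted above would otherwise be indexed as a single long token. With Lindera, `content` is split into morphemes at index time, so a query such as `形態素解析` can match inside the sentence.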
97 changes: 0 additions & 97 deletions resources/lindera.json

This file was deleted.

67 changes: 67 additions & 0 deletions resources/lindera.yml
@@ -0,0 +1,67 @@
+segmenter:
+  mode: "normal"
+  dictionary:
+    kind: "ipadic"
+  # user_dictionary:
+  #   path: "./resources/ipadic_simple.csv"
+  #   kind: "ipadic"
+
+character_filters:
+  - kind: "unicode_normalize"
+    args:
+      kind: "nfkc"
+  - kind: "japanese_iteration_mark"
+    args:
+      normalize_kanji: true
+      normalize_kana: true
+  - kind: mapping
+    args:
+      mapping:
+        リンデラ: Lindera
+
+token_filters:
+  - kind: "japanese_compound_word"
+    args:
+      kind: "ipadic"
+      tags:
+        - "名詞,数"
+        - "名詞,接尾,助数詞"
+      new_tag: "名詞,数"
+  - kind: "japanese_number"
+    args:
+      tags:
+        - "名詞,数"
+  - kind: "japanese_stop_tags"
+    args:
+      tags:
+        - "接続詞"
+        - "助詞"
+        - "助詞,格助詞"
+        - "助詞,格助詞,一般"
+        - "助詞,格助詞,引用"
+        - "助詞,格助詞,連語"
+        - "助詞,係助詞"
+        - "助詞,副助詞"
+        - "助詞,間投助詞"
+        - "助詞,並立助詞"
+        - "助詞,終助詞"
+        - "助詞,副助詞/並立助詞/終助詞"
+        - "助詞,連体化"
+        - "助詞,副詞化"
+        - "助詞,特殊"
+        - "助動詞"
+        - "記号"
+        - "記号,一般"
+        - "記号,読点"
+        - "記号,句点"
+        - "記号,空白"
+        - "記号,括弧閉"
+        - "その他,間投"
+        - "フィラー"
+        - "非言語音"
+  - kind: "japanese_katakana_stem"
+    args:
+      min: 3
+  - kind: "remove_diacritical_mark"
+    args:
+      japanese: false
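A note on how this configuration is applied, per Lindera's config model: the `character_filters` run on the raw text before segmentation, the `segmenter` then tokenizes with the chosen dictionary, and the `token_filters` rewrite or drop the resulting tokens. For example, the `mapping` filter above rewrites リンデラ to Lindera before segmentation, and `japanese_stop_tags` then drops particles and punctuation by part-of-speech tag, so they never reach the FTS5 index.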
9 changes: 5 additions & 4 deletions src/common.rs
@@ -1,15 +1,16 @@
 use libc::{c_char, c_int, c_void};
-use lindera::tokenizer::Tokenizer;
 
-pub struct Fts5Tokenizer {
-    pub tokenizer: Tokenizer,
-}
+use lindera::tokenizer::Tokenizer;
 
 // sqlite3.h
 pub const SQLITE_OK: c_int = 0;
 pub const SQLITE_INTERNAL: c_int = 2;
 pub const SQLITE_MISUSE: c_int = 21;
 
+pub struct Fts5Tokenizer {
+    pub tokenizer: Tokenizer,
+}
+
 pub type TokenFunction = extern "C" fn(
     p_ctx: *mut c_void,
     t_flags: c_int,
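For orientation: `TokenFunction` mirrors the `xToken` callback type from SQLite's `fts5.h`; FTS5 hands this function to the tokenizer, which must invoke it once per emitted token. The constants above are the subset of `sqlite3.h` result codes the crate returns across the C ABI.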
4 changes: 2 additions & 2 deletions src/extension.rs
@@ -5,11 +5,11 @@ use crate::common::*;
 use crate::lindera_fts5_tokenize;
 use crate::load_tokenizer;
 
-pub const FTS5_API_VERSION: c_int = 2;
-
 pub struct Sqlite3 {}
 struct Sqlite3Stmt {}
 
+pub const FTS5_API_VERSION: c_int = 2;
+
 // fts5.h
 #[repr(C)]
 struct Fts5TokenizerApi {
22 changes: 4 additions & 18 deletions src/lib.rs
@@ -4,32 +4,18 @@ mod common;
 #[cfg(feature = "extension")]
 mod extension;
 
-use std::env;
-use std::fs::File;
-use std::io::BufReader;
-
 use dotenv::dotenv;
 use libc::{c_char, c_int, c_uchar, c_void};
 
-use lindera::tokenizer::{Tokenizer, TokenizerConfig};
+use lindera::tokenizer::{Tokenizer, TokenizerBuilder};
 
 pub use crate::common::*;
 
 pub fn load_tokenizer() -> Result<Tokenizer, c_int> {
     dotenv().ok();
 
-    let config_path =
-        env::var("LINDERA_CONFIG_PATH").unwrap_or_else(|_| "./lindera.json".to_string());
-    let config_file = File::open(config_path).map_err(|e| {
-        eprintln!("Failed to create tokenizer: {}", e);
-        SQLITE_INTERNAL
-    })?;
-    let config_reader = BufReader::new(config_file);
-    let config: TokenizerConfig = serde_json::from_reader(config_reader).map_err(|e| {
-        eprintln!("Failed to create tokenizer: {}", e);
+    let builder = TokenizerBuilder::new().map_err(|e| {
+        eprintln!("Failed to create tokenizer builder: {}", e);
         SQLITE_INTERNAL
     })?;
-    let tokenizer = Tokenizer::from_config(&config).map_err(|e| {
+    let tokenizer = builder.build().map_err(|e| {
         eprintln!("Failed to create tokenizer: {}", e);
         SQLITE_INTERNAL
     })?;
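The net effect is that configuration now flows through `TokenizerBuilder`, which picks up `LINDERA_CONFIG_PATH` (see the README changes above) instead of the hand-rolled JSON loading. A minimal sketch of exercising the new loader from Rust follows; the `tokenize` call, its error mapping, and the `token.text` field are assumed from recent lindera releases, not confirmed by this diff:

```rust
use lindera_sqlite::load_tokenizer;

fn main() -> Result<(), i32> {
    // Point the builder at the YAML config added in this commit.
    std::env::set_var("LINDERA_CONFIG_PATH", "./resources/lindera.yml");

    // load_tokenizer() -> Result<Tokenizer, c_int>, built via TokenizerBuilder::new().
    let tokenizer = load_tokenizer()?;

    // Tokenize the README's sample sentence and print each surface form.
    let tokens = tokenizer
        .tokenize("Linderaは形態素解析エンジンです。")
        .map_err(|_| 2)?; // 2 == SQLITE_INTERNAL
    for token in tokens {
        println!("{}", token.text);
    }
    Ok(())
}
```

This assumes the crate was built with a dictionary feature such as `ipadic`, matching the build command in the README.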
