Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update dependencies #7

Merged
merged 1 commit into from
Nov 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,19 @@ jobs:
- runner: ubuntu-latest
target: x86_64-unknown-linux-gnu
archive: .zip
extension: ".a"
extension: ".so"
- runner: macOS-latest
target: x86_64-apple-darwin
archive: .zip
extension: ".a"
extension: ".dylib"
- runner: macOS-latest
target: aarch64-apple-darwin
archive: .zip
extension: ".a"
extension: ".dylib"
- runner: windows-latest
target: x86_64-pc-windows-msvc
archive: .zip
extension: ".ilb"
extension: ".dll"
toolchain: [stable]
features: ["ipadic", "ko-dic", "cc-cedict"]
runs-on: ${{ matrix.platform.runner }}
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@
target

Cargo.lock

*.db
12 changes: 6 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "lindera-sqlite"
version = "0.35.0"
version = "0.38.0"
edition = "2021"
description = "Lindera tokenizer for SQLite FTS5 extension"
documentation = "https://docs.rs/lindera-sqlite"
Expand All @@ -22,20 +22,20 @@ compress = ["lindera/compress"] # Compress dictionaries
extension = []

[lib]
crate-type = ["rlib", "staticlib"]
# crate-type = ["rlib", "staticlib"]
crate-type = ["cdylib"]

[profile.release]
lto = true

[dependencies]
dotenv = "0.15.0"
# libc without `std`
libc = { version = "0.2.161", "default-features" = false, features = [] }
serde_json = "1.0.132"
libc = { version = "0.2.164", "default-features" = false, features = [] }
serde_json = "1.0.133"
unicode-segmentation = "1.12.0"
unicode-normalization = "0.1.22"

lindera = "0.35.0"
lindera = "0.38.0"

[dev-dependencies]
criterion = "0.5"
Expand Down
40 changes: 31 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,44 @@ lindera-sqlite is a C ABI library which exposes a [FTS5](https://www.sqlite.org/

When used as a custom FTS5 tokenizer this enables applications to support Chinese, Japanese and Korean in full-text search.

## Extension Build/Usage Example
## Build extension

```sh
cargo rustc --features extension -- --crate-type=cdylib
% cargo build --features=ipadic,ko-dic,cc-cedict,compress,extension
```

Load extension from `./target/release/liblindera_tokenizer.dylib`.
## Set environment variable for Lindera configuration

```sql
CREATE VIRTUAL TABLE
fts
USING fts5(content, tokenize='lindera_tokenizer')
```sh
% export LINDERA_CONFIG_PATH=./resources/lindera.yml
```

## Generating headers
## Then start SQLite

```sh
cbindgen --profile release . -o target/release/fts5-tokenizer.h
% sqlite3 example.db
```

## Load extension

```sql
sqlite> .load ./target/debug/liblindera_sqlite lindera_fts5_tokenizer_init
```

## Create table using FTS5 with Lindera tokenizer

```sql
sqlite> CREATE VIRTUAL TABLE example USING fts5(content, tokenize='lindera_tokenizer');
```

## Insert data

```sql
sqlite> INSERT INTO example(content) VALUES ("Linderaは形態素解析エンジンです。ユーザー辞書も利用可能です。");
```

## Search data

```sql
sqlite> SELECT * FROM example WHERE content MATCH "Lindera" ORDER BY bm25(example) LIMIT 10;
```
97 changes: 0 additions & 97 deletions resources/lindera.json

This file was deleted.

67 changes: 67 additions & 0 deletions resources/lindera.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
segmenter:
mode: "normal"
dictionary:
kind: "ipadic"
# user_dictionary:
# path: "./resources/ipadic_simple.csv"
# kind: "ipadic"

character_filters:
- kind: "unicode_normalize"
args:
kind: "nfkc"
- kind: "japanese_iteration_mark"
args:
normalize_kanji: true
normalize_kana: true
- kind: mapping
args:
mapping:
リンデラ: Lindera

token_filters:
- kind: "japanese_compound_word"
args:
kind: "ipadic"
tags:
- "名詞,数"
- "名詞,接尾,助数詞"
new_tag: "名詞,数"
- kind: "japanese_number"
args:
tags:
- "名詞,数"
- kind: "japanese_stop_tags"
args:
tags:
- "接続詞"
- "助詞"
- "助詞,格助詞"
- "助詞,格助詞,一般"
- "助詞,格助詞,引用"
- "助詞,格助詞,連語"
- "助詞,係助詞"
- "助詞,副助詞"
- "助詞,間投助詞"
- "助詞,並立助詞"
- "助詞,終助詞"
- "助詞,副助詞/並立助詞/終助詞"
- "助詞,連体化"
- "助詞,副詞化"
- "助詞,特殊"
- "助動詞"
- "記号"
- "記号,一般"
- "記号,読点"
- "記号,句点"
- "記号,空白"
- "記号,括弧閉"
- "その他,間投"
- "フィラー"
- "非言語音"
- kind: "japanese_katakana_stem"
args:
min: 3
- kind: "remove_diacritical_mark"
args:
japanese: false
9 changes: 5 additions & 4 deletions src/common.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
use libc::{c_char, c_int, c_void};
use lindera::tokenizer::Tokenizer;

pub struct Fts5Tokenizer {
pub tokenizer: Tokenizer,
}
use lindera::tokenizer::Tokenizer;

// sqlite3.h
pub const SQLITE_OK: c_int = 0;
pub const SQLITE_INTERNAL: c_int = 2;
pub const SQLITE_MISUSE: c_int = 21;

pub struct Fts5Tokenizer {
pub tokenizer: Tokenizer,
}

pub type TokenFunction = extern "C" fn(
p_ctx: *mut c_void,
t_flags: c_int,
Expand Down
4 changes: 2 additions & 2 deletions src/extension.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ use crate::common::*;
use crate::lindera_fts5_tokenize;
use crate::load_tokenizer;

pub const FTS5_API_VERSION: c_int = 2;

pub struct Sqlite3 {}
struct Sqlite3Stmt {}

pub const FTS5_API_VERSION: c_int = 2;

// fts5.h
#[repr(C)]
struct Fts5TokenizerApi {
Expand Down
22 changes: 4 additions & 18 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,18 @@ mod common;
#[cfg(feature = "extension")]
mod extension;

use std::env;
use std::fs::File;
use std::io::BufReader;

use dotenv::dotenv;
use libc::{c_char, c_int, c_uchar, c_void};

use lindera::tokenizer::{Tokenizer, TokenizerConfig};
use lindera::tokenizer::{Tokenizer, TokenizerBuilder};

pub use crate::common::*;

pub fn load_tokenizer() -> Result<Tokenizer, c_int> {
dotenv().ok();

let config_path =
env::var("LINDERA_CONFIG_PATH").unwrap_or_else(|_| "./lindera.json".to_string());
let config_file = File::open(config_path).map_err(|e| {
eprintln!("Failed to create tokenizer: {}", e);
SQLITE_INTERNAL
})?;
let config_reader = BufReader::new(config_file);
let config: TokenizerConfig = serde_json::from_reader(config_reader).map_err(|e| {
eprintln!("Failed to create tokenizer: {}", e);
let builder = TokenizerBuilder::new().map_err(|e| {
eprintln!("Failed to create tokenizer builder: {}", e);
SQLITE_INTERNAL
})?;
let tokenizer = Tokenizer::from_config(&config).map_err(|e| {
let tokenizer = builder.build().map_err(|e| {
eprintln!("Failed to create tokenizer: {}", e);
SQLITE_INTERNAL
})?;
Expand Down
Loading