From 55f07abfaf45b4eed4025516680161d34deb49f7 Mon Sep 17 00:00:00 2001 From: Minoru Osuka Date: Sun, 20 Oct 2024 22:31:44 +0900 Subject: [PATCH 1/2] Restructure --- .github/workflows/{CI.yml => release.yml} | 2 +- Cargo.lock | 1160 +++++++-------------- Cargo.toml | 19 +- Makefile | 51 +- README.ja.md | 21 - README.md | 14 + examples/analyze_example.py | 21 - examples/tokenize_cc-cedict_example.py | 25 - examples/tokenize_ipadic.py | 20 + examples/tokenize_ipadic_example.py | 25 - examples/tokenize_ko-dic_example.py | 25 - examples/tokenize_unidic_example.py | 25 - poetry.lock | 370 +++++++ pyproject.toml | 42 +- pyproject.toml.bak | 20 + src/analyzer.rs | 41 - src/dictionary.rs | 133 ++- src/lib.rs | 72 +- src/token.rs | 17 + src/tokenizer.rs | 129 +-- src/util.rs | 131 +++ tests/test_analyze.py | 23 - tests/test_tokenize_cc-cedict.py | 34 - tests/test_tokenize_ipadic.py | 39 +- tests/test_tokenize_ko-dic.py | 35 - tests/test_tokenize_unidic.py | 35 - 26 files changed, 1149 insertions(+), 1380 deletions(-) rename .github/workflows/{CI.yml => release.yml} (99%) delete mode 100644 README.ja.md delete mode 100644 examples/analyze_example.py delete mode 100644 examples/tokenize_cc-cedict_example.py create mode 100644 examples/tokenize_ipadic.py delete mode 100644 examples/tokenize_ipadic_example.py delete mode 100644 examples/tokenize_ko-dic_example.py delete mode 100644 examples/tokenize_unidic_example.py create mode 100644 poetry.lock create mode 100644 pyproject.toml.bak delete mode 100644 src/analyzer.rs create mode 100644 src/token.rs create mode 100644 src/util.rs delete mode 100644 tests/test_analyze.py delete mode 100644 tests/test_tokenize_cc-cedict.py delete mode 100644 tests/test_tokenize_ko-dic.py delete mode 100644 tests/test_tokenize_unidic.py diff --git a/.github/workflows/CI.yml b/.github/workflows/release.yml similarity index 99% rename from .github/workflows/CI.yml rename to .github/workflows/release.yml index 6d039f5..9fd8e07 100644 --- 
a/.github/workflows/CI.yml +++ b/.github/workflows/release.yml @@ -1,4 +1,4 @@ -name: CI +name: Release on: push: diff --git a/Cargo.lock b/Cargo.lock index 0325f22..7d3cd3d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3,37 +3,25 @@ version = 3 [[package]] -name = "adler" -version = "1.0.2" +name = "adler2" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" - -[[package]] -name = "aes" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e8b47f52ea9bae42228d07ec09eb676433d7c4ed1ebdf0f1d1c29ed446f1ab8" -dependencies = [ - "cfg-if", - "cipher", - "cpufeatures", - "opaque-debug", -] +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" [[package]] name = "aho-corasick" -version = "0.7.20" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" dependencies = [ "memchr", ] [[package]] name = "anyhow" -version = "1.0.70" +version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7de8ce5e0f9f8d88245311066a578d72b7af3e7088f32783804676302df237e4" +checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6" [[package]] name = "autocfg" @@ -43,15 +31,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "base64" -version = "0.13.1" +version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" - -[[package]] -name = "base64ct" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" +checksum = 
"72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "bincode" @@ -69,66 +51,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] -name = "block-buffer" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" -dependencies = [ - "generic-array", -] - -[[package]] -name = "bstr" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" -dependencies = [ - "lazy_static", - "memchr", - "regex-automata", - "serde", -] - -[[package]] -name = "bumpalo" -version = "3.12.0" +name = "bitflags" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" [[package]] name = "byteorder" -version = "1.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" - -[[package]] -name = "bzip2" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" -dependencies = [ - "bzip2-sys", - "libc", -] - -[[package]] -name = "bzip2-sys" -version = "0.1.11+1.0.8" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" -dependencies = [ - "cc", - "libc", - "pkg-config", -] +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "cc" -version = "1.0.77" +version = "1.1.21" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9f73505338f7d905b19d18738976aae232eb46b8efc15554ffc56deb5d9ebe4" +checksum = "07b1695e2c7e8fc85310cde85aeaab7e3097f593c91d209d3f9df76c928100f0" dependencies = [ - "jobserver", + "shlex", ] [[package]] @@ -138,88 +78,99 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] -name = "cipher" -version = "0.3.0" +name = "crc32fast" +version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ee52072ec15386f770805afd189a01c8841be8696bed250fa2f13c4c0d6dfb7" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" dependencies = [ - "generic-array", + "cfg-if", ] [[package]] -name = "constant_time_eq" -version = "0.1.5" +name = "csv" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" +checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] [[package]] -name = "cpufeatures" -version = "0.2.6" +name = "csv-core" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "280a9f2d8b3a38871a3c8a46fb80db65e5e5ed97da80c4d08bf27fb63e35e181" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" dependencies = [ - "libc", + "memchr", ] [[package]] -name = "crc32fast" -version = "1.3.2" +name = "darling" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" dependencies = [ - "cfg-if", + "darling_core", + "darling_macro", ] [[package]] -name = "crossbeam-utils" -version = "0.8.15" +name = "darling_core" 
+version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" dependencies = [ - "cfg-if", + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", ] [[package]] -name = "crypto-common" -version = "0.1.6" +name = "darling_macro" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ - "generic-array", - "typenum", + "darling_core", + "quote", + "syn", ] [[package]] -name = "csv" -version = "1.1.6" +name = "derive_builder" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +checksum = "cd33f37ee6a119146a1781d3356a7c26028f83d779b2e04ecd45fdc75c76877b" dependencies = [ - "bstr", - "csv-core", - "itoa 0.4.8", - "ryu", - "serde", + "derive_builder_macro", ] [[package]] -name = "csv-core" -version = "0.1.10" +name = "derive_builder_core" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +checksum = "7431fa049613920234f22c47fdc33e6cf3ee83067091ea4277a3f8c4587aae38" dependencies = [ - "memchr", + "darling", + "proc-macro2", + "quote", + "syn", ] [[package]] -name = "digest" -version = "0.10.6" +name = "derive_builder_macro" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" +checksum = "4abae7035bf79b9877b779505d8cf3749285b80c43941eda66604841889451dc" dependencies = [ - "block-buffer", - "crypto-common", - "subtle", + "derive_builder_core", + 
"syn", ] [[package]] @@ -288,9 +239,9 @@ checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" [[package]] name = "encoding_rs" -version = "0.8.31" +version = "0.8.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9852635589dc9f9ea1b6fe9f05b50ef208c85c834a562f0c6abb1c475736ec2b" +checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" dependencies = [ "cfg-if", ] @@ -304,38 +255,14 @@ dependencies = [ "encoding_rs", ] -[[package]] -name = "env_logger" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0" -dependencies = [ - "humantime", - "is-terminal", - "log", - "regex", - "termcolor", -] - [[package]] name = "errno" -version = "0.2.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" dependencies = [ - "errno-dragonfly", - "libc", - "winapi", -] - -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", "libc", + "windows-sys 0.52.0", ] [[package]] @@ -347,19 +274,25 @@ dependencies = [ "cfg-if", "libc", "redox_syscall", - "windows-sys", + "windows-sys 0.42.0", ] [[package]] name = "flate2" -version = "1.0.25" +version = "1.0.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8a2db397cb1c8772f31494cb8917e48cd1e64f0fa7efac59fbd741a0a8ce841" +checksum = "324a1be68054ef05ad64b861cc9eaf1d623d2d8cb25b4bf2cb9cdd902b4bf253" dependencies = [ "crc32fast", "miniz_oxide", ] +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "form_urlencoded" version = "1.1.0" @@ -370,13 +303,14 @@ dependencies = [ ] [[package]] -name = "generic-array" -version = "0.14.7" +name = "getrandom" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ - "typenum", - "version_check", + "cfg-if", + "libc", + "wasi", ] [[package]] @@ -386,28 +320,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] -name = "hermit-abi" -version = "0.2.6" +name = "heck" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" -dependencies = [ - "libc", -] +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] -name = "hmac" -version = "0.12.1" +name = "ident_case" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" -dependencies = [ - "digest", -] - -[[package]] -name = "humantime" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "idna" @@ -421,37 +343,9 @@ dependencies = [ [[package]] name = "indoc" -version = "1.0.7" +version = "2.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adab1eaa3408fb7f0c777a73e7465fd5656136fc93b670eb6df3c88c2c1344e3" - -[[package]] -name = "io-lifetimes" -version = "1.0.3" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "46112a93252b123d31a119a8d1a1ac19deac4fac6e0e8b0df58f0d4e5870e63c" -dependencies = [ - "libc", - "windows-sys", -] - -[[package]] -name = "is-terminal" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "927609f78c2913a6f6ac3c27a4fe87f43e2a35367c0c4b0f8265e8f49a104330" -dependencies = [ - "hermit-abi", - "io-lifetimes", - "rustix", - "windows-sys", -] - -[[package]] -name = "itoa" -version = "0.4.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" +checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" [[package]] name = "itoa" @@ -459,260 +353,109 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4217ad341ebadf8d8e724e264f13e593e0648f5b3e94b3896a5df283be015ecc" -[[package]] -name = "jobserver" -version = "0.1.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" -dependencies = [ - "libc", -] - -[[package]] -name = "js-sys" -version = "0.3.61" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" -dependencies = [ - "wasm-bindgen", -] - [[package]] name = "kanaria" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0f9d9652540055ac4fded998a73aca97d965899077ab1212587437da44196ff" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - [[package]] name = "libc" -version = "0.2.138" +version = "0.2.158" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "db6d7e329c562c5dfab7a46a2afabc8b987ab9a4834c9d1ca04dc54c1546cef8" +checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" [[package]] name = "lindera" -version = "0.23.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72be283281bec2768687b1784be03a678609b51f2f90f6f9d9b4f07953e6dd25" +version = "0.33.0" dependencies = [ "anyhow", "bincode", "byteorder", - "encoding", + "csv", "kanaria", - "lindera-cc-cedict-builder", - "lindera-core", + "lindera-cc-cedict", "lindera-dictionary", - "lindera-filter", - "lindera-ipadic-builder", - "lindera-ko-dic-builder", - "lindera-unidic-builder", + "lindera-ipadic", + "lindera-ipadic-neologd", + "lindera-ko-dic", + "lindera-unidic", + "once_cell", "regex", "serde", "serde_json", - "thiserror", + "strum", + "strum_macros", "unicode-blocks", "unicode-normalization", + "unicode-segmentation", "yada", ] [[package]] name = "lindera-cc-cedict" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e000928dd52091a70de6992e765b1955b7c3356699077d1ac4cc84d0d12183f0" +version = "0.33.0" dependencies = [ "bincode", "byteorder", - "encoding", - "lindera-cc-cedict-builder", - "lindera-core", - "lindera-decompress", + "lindera-dictionary", "once_cell", - "zip", ] [[package]] -name = "lindera-cc-cedict-builder" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10fbafd37adab44ccc2668a40fba2dbc4e665cb3c36018c15dfe2e2b830e28ce" +name = "lindera-dictionary" +version = "0.33.0" dependencies = [ "anyhow", "bincode", "byteorder", "csv", + "derive_builder", "encoding", - "env_logger", - "glob", - "lindera-compress", - "lindera-core", - "lindera-decompress", - "log", - "yada", -] - -[[package]] -name = "lindera-compress" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed9196bf5995503f6878a090dfee6114ba86430c72f67ef3624246b564869937" -dependencies = 
[ - "anyhow", - "flate2", - "lindera-decompress", -] - -[[package]] -name = "lindera-core" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5f0baa9932f682e9c5b388897330f155d3c40de80016e60125897fde5e0e246" -dependencies = [ - "anyhow", - "bincode", - "byteorder", "encoding_rs", + "encoding_rs_io", + "flate2", + "glob", "log", "once_cell", "serde", + "tar", "thiserror", + "ureq", "yada", ] [[package]] -name = "lindera-decompress" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6e63fa6ef0bc3ce2c26d372aa6185b7a316194494a84f81678f5da2893bf4a2" -dependencies = [ - "anyhow", - "flate2", - "serde", -] - -[[package]] -name = "lindera-dictionary" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd765c36166016de87a1f447ea971573e4c63e334836c46ad0020f0408c88bfc" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "lindera-cc-cedict", - "lindera-core", - "lindera-ipadic", - "lindera-ko-dic", - "lindera-unidic", - "serde", -] - -[[package]] -name = "lindera-filter" -version = "0.23.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5345e37fb9521ab3cee19283bed135d46b3521dc1fd13a49fa0992379056203" +name = "lindera-ipadic" +version = "0.33.0" dependencies = [ - "anyhow", "bincode", "byteorder", - "kanaria", - "lindera-core", "lindera-dictionary", "once_cell", - "regex", - "serde", - "serde_json", - "unicode-blocks", - "unicode-normalization", - "unicode-segmentation", - "yada", ] [[package]] -name = "lindera-ipadic" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60eeb356295f784e7db4cfd2c6772f2bd059e565a7744e246642a07bc333a88a" +name = "lindera-ipadic-neologd" +version = "0.33.0" dependencies = [ "bincode", "byteorder", - "encoding", - "flate2", - "lindera-core", - "lindera-decompress", - "lindera-ipadic-builder", + "lindera-dictionary", 
"once_cell", - "tar", -] - -[[package]] -name = "lindera-ipadic-builder" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a16a2a88db9d956f5086bc976deb9951ca2dbbfef41a002df0a7bfb2c845aab" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding_rs", - "encoding_rs_io", - "env_logger", - "glob", - "lindera-compress", - "lindera-core", - "lindera-decompress", - "log", - "serde", - "yada", ] [[package]] name = "lindera-ko-dic" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abb479b170a841b8cfbe602d772e30849ffe0562b219190a378368968b8c8f66" +version = "0.33.0" dependencies = [ "bincode", "byteorder", - "encoding", - "flate2", - "lindera-core", - "lindera-decompress", - "lindera-ko-dic-builder", + "lindera-dictionary", "once_cell", - "tar", -] - -[[package]] -name = "lindera-ko-dic-builder" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b9b58213552560717c48e7833444a20d2d7fe26a6e565f7ce0cbbf85784c7cf" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-compress", - "lindera-core", - "lindera-decompress", - "log", - "yada", ] [[package]] @@ -721,151 +464,61 @@ version = "0.23.1" dependencies = [ "lindera", "pyo3", + "serde", + "serde_json", ] [[package]] name = "lindera-unidic" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbac9202c81f6a350cdfee30bad297178b132f499b79c229383e6f7bc48296b0" +version = "0.33.0" dependencies = [ "bincode", "byteorder", - "encoding", - "lindera-core", - "lindera-decompress", - "lindera-unidic-builder", + "lindera-dictionary", "once_cell", - "ureq", - "zip", -] - -[[package]] -name = "lindera-unidic-builder" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6858147cdaf4a7b564c08a247449d3aca38e9b4812499651af08afbf85324596" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-compress", - "lindera-core", - "lindera-decompress", - "log", - "yada", ] [[package]] name = "linux-raw-sys" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f9f08d8963a6c613f4b1a78f4f4a4dbfadf8e6545b2d72861731e4858b8b47f" - -[[package]] -name = "lock_api" -version = "0.4.9" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" -dependencies = [ - "autocfg", - "scopeguard", -] +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "log" -version = "0.4.17" +version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" -dependencies = [ - "cfg-if", -] +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" [[package]] name = "memchr" -version = "2.5.0" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "memoffset" -version = "0.8.0" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" dependencies = [ "autocfg", ] [[package]] name = "miniz_oxide" -version = "0.6.2" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa" +checksum = 
"e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" dependencies = [ - "adler", + "adler2", ] [[package]] name = "once_cell" -version = "1.17.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" - -[[package]] -name = "opaque-debug" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" - -[[package]] -name = "parking_lot" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" -dependencies = [ - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ff9f3fef3968a3ec5945535ed654cb38ff72d7495a25619e2247fb15a2ed9ba" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-sys", -] - -[[package]] -name = "password-hash" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7676374caaee8a325c9e7a2ae557f216c5563a171d6997b0ef8a65af35147700" -dependencies = [ - "base64ct", - "rand_core", - "subtle", -] - -[[package]] -name = "pbkdf2" -version = "0.11.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83a0692ec44e4cf1ef28ca317f14f8f07da2d95ec3fa01f86e4467b725e60917" -dependencies = [ - "digest", - "hmac", - "password-hash", - "sha2", -] +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "percent-encoding" @@ -874,31 +527,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" [[package]] -name = "pkg-config" -version = "0.3.26" +name = "portable-atomic" +version = "1.8.0" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" +checksum = "d30538d42559de6b034bc76fd6dd4c38961b1ee5c6c56e3808c50128fdbc22ce" [[package]] name = "proc-macro2" -version = "1.0.52" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d0e1ae9e836cc3beddd63db0df682593d7e2d3d891ae8c9083d2113e1744224" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" dependencies = [ "unicode-ident", ] [[package]] name = "pyo3" -version = "0.18.2" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfb848f80438f926a9ebddf0a539ed6065434fd7aae03a89312a9821f81b8501" +checksum = "3d922163ba1f79c04bc49073ba7b32fd5a8d3b76a87c955921234b8e77333c51" dependencies = [ "cfg-if", "indoc", "libc", "memoffset", - "parking_lot", + "once_cell", + "portable-atomic", "pyo3-build-config", "pyo3-ffi", "pyo3-macros", @@ -907,9 +561,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.18.2" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98a42e7f42e917ce6664c832d5eee481ad514c98250c49e0b03b20593e2c7ed0" +checksum = "bc38c5feeb496c8321091edf3d63e9a6829eab4b863b4a6a65f26f3e9cc6b179" dependencies = [ "once_cell", "target-lexicon", @@ -917,9 +571,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.18.2" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0707f0ab26826fe4ccd59b69106e9df5e12d097457c7b8f9c0fd1d2743eec4d" +checksum = "94845622d88ae274d2729fcefc850e63d7a3ddff5e3ce11bd88486db9f1d357d" dependencies = [ "libc", "pyo3-build-config", @@ -927,224 +581,231 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.18.2" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"978d18e61465ecd389e1f235ff5a467146dc4e3c3968b90d274fe73a5dd4a438" +checksum = "e655aad15e09b94ffdb3ce3d217acf652e26bbc37697ef012f5e5e348c716e5e" dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 1.0.105", + "syn", ] [[package]] name = "pyo3-macros-backend" -version = "0.18.2" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e0e1128f85ce3fca66e435e08aa2089a2689c1c48ce97803e13f63124058462" +checksum = "ae1e3f09eecd94618f60a455a23def79f79eba4dc561a97324bf9ac8c6df30ce" dependencies = [ + "heck", "proc-macro2", + "pyo3-build-config", "quote", - "syn 1.0.105", + "syn", ] [[package]] name = "quote" -version = "1.0.26" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" - [[package]] name = "redox_syscall" version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] name = "regex" -version = "1.7.1" +version = "1.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733" +checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" dependencies = [ "aho-corasick", "memchr", + "regex-automata", "regex-syntax", ] [[package]] name = "regex-automata" -version = "0.1.10" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] [[package]] name = "regex-syntax" -version = "0.6.28" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" +checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" [[package]] name = "ring" -version = "0.16.20" +version = "0.17.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" +checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" dependencies = [ "cc", + "cfg-if", + "getrandom", "libc", - "once_cell", "spin", "untrusted", - "web-sys", - "winapi", + "windows-sys 0.52.0", ] [[package]] name = "rustix" -version = "0.36.5" +version = "0.38.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3807b5d10909833d3e9acd1eb5fb988f79376ff10fce42937de71a449c4c588" +checksum = "8acb788b847c24f28525660c4d7758620a7210875711f79e7f663cc152726811" dependencies = [ - "bitflags", + "bitflags 2.6.0", "errno", - "io-lifetimes", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "rustls" -version = "0.20.8" +version = "0.23.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fff78fc74d175294f4e83b28343315ffcfb114b156f0185e9741cb5570f50e2f" +checksum = "f2dabaac7466917e566adb06783a81ca48944c6898a1b08b9374106dd671f4c8" dependencies = [ "log", + "once_cell", "ring", - "sct", - "webpki", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", ] [[package]] -name = "ryu" -version = "1.0.11" +name = "rustls-pki-types" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09" +checksum = "fc0a2ce646f8655401bb81e7927b812614bd5d91dbc968696be50603510fcaf0" [[package]] -name = "scopeguard" -version = "1.1.0" +name = "rustls-webpki" +version = "0.102.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" - -[[package]] -name = "sct" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d53dcdb7c9f8158937a7981b48accfd39a43af418591a5d008c7b22b5e1b7ca4" +checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" dependencies = [ "ring", + "rustls-pki-types", "untrusted", ] +[[package]] +name = "rustversion" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6" + +[[package]] +name = "ryu" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09" + [[package]] name = "serde" -version = "1.0.157" +version = "1.0.210" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "707de5fcf5df2b5788fca98dd7eab490bc2fd9b7ef1404defc462833b83f25ca" +checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.157" +version = "1.0.210" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78997f4555c22a7971214540c4a661291970619afd56de19f77e0de86296e1e5" +checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.2", + "syn", ] [[package]] name = "serde_json" -version = "1.0.94" +version = "1.0.131" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"1c533a59c9d8a93a09c6ab31f0fd5e5f4dd1b8fc9434804029839884765d04ea" +checksum = "67d42a0bd4ac281beff598909bb56a86acaf979b84483e1c79c10dcaf98f8cf3" dependencies = [ - "itoa 1.0.4", + "itoa", + "memchr", "ryu", "serde", ] [[package]] -name = "sha1" -version = "0.10.5" +name = "shlex" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - -[[package]] -name = "sha2" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] -name = "smallvec" -version = "1.10.0" +name = "spin" +version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" [[package]] -name = "spin" -version = "0.5.2" +name = "strsim" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] -name = "subtle" -version = "2.4.1" +name = "strum" +version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +dependencies = [ + "strum_macros", +] [[package]] -name = "syn" -version = "1.0.105" +name = "strum_macros" +version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"60b9b43d45702de4c839cb9b51d9f529c5dd26a4aff255b42b1ebc03e88ee908" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" dependencies = [ + "heck", "proc-macro2", "quote", - "unicode-ident", + "rustversion", + "syn", ] +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "syn" -version = "2.0.2" +version = "2.0.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59d3276aee1fa0c33612917969b5172b5be2db051232a6e4826f1a1a9191b045" +checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" dependencies = [ "proc-macro2", "quote", @@ -1153,9 +814,9 @@ dependencies = [ [[package]] name = "tar" -version = "0.4.38" +version = "0.4.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b55807c0344e1e6c04d7c965f5289c39a8d94ae23ed5c0b57aabac549f871c6" +checksum = "cb797dad5fb5b76fcf519e702f4a589483b5ef06567f160c392832c1f5e44909" dependencies = [ "filetime", "libc", @@ -1164,55 +825,30 @@ dependencies = [ [[package]] name = "target-lexicon" -version = "0.12.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9410d0f6853b1d94f0e519fb95df60f29d2c1eff2d921ffdf01a4c8a3b54f12d" - -[[package]] -name = "termcolor" -version = "1.1.3" +version = "0.12.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" -dependencies = [ - "winapi-util", -] +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "thiserror" -version = "1.0.40" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac" +checksum = 
"c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.40" +version = "1.0.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" +checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.2", + "syn", ] -[[package]] -name = "time" -version = "0.3.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd0cbfecb4d19b5ea75bb31ad904eb5b9fa13f21079c3b92017ebdf4999a5890" -dependencies = [ - "serde", - "time-core", -] - -[[package]] -name = "time-core" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" - [[package]] name = "tinyvec" version = "1.6.0" @@ -1228,12 +864,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" -[[package]] -name = "typenum" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" - [[package]] name = "unicode-bidi" version = "0.3.13" @@ -1242,9 +872,9 @@ checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" [[package]] name = "unicode-blocks" -version = "0.1.5" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9de2be6bad6f56ce8373d377e611cbb2265de3a656138065609ce82e217aad70" +checksum = "6b12e05d9e06373163a9bb6bb8c263c261b396643a99445fe6b9811fd376581b" [[package]] name = "unicode-ident" @@ -1254,43 +884,43 @@ checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" [[package]] name = "unicode-normalization" -version = "0.1.22" +version = 
"0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" dependencies = [ "tinyvec", ] [[package]] name = "unicode-segmentation" -version = "1.10.1" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unindent" -version = "0.1.10" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58ee9362deb4a96cef4d437d1ad49cffc9b9e92d202b6995674e928ce684f112" +checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" [[package]] name = "untrusted" -version = "0.7.1" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "ureq" -version = "2.6.2" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "338b31dd1314f68f3aabf3ed57ab922df95ffcd902476ca7ba3c4ce7b908c46d" +checksum = "b74fc6b57825be3373f7054754755f03ac3a8f5d70015ccad699ba2029956f4a" dependencies = [ "base64", "log", "once_cell", "rustls", + "rustls-pki-types", "url", - "webpki", "webpki-roots", ] @@ -1306,243 +936,169 @@ dependencies = [ ] [[package]] -name = "version_check" -version = "0.9.4" +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] -name = "wasm-bindgen" -version = "0.2.84" +name = 
"webpki-roots" +version = "0.26.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" +checksum = "841c67bff177718f1d4dfefde8d8f0e78f9b6589319ba88312f567fc5841a958" dependencies = [ - "cfg-if", - "wasm-bindgen-macro", + "rustls-pki-types", ] [[package]] -name = "wasm-bindgen-backend" -version = "0.2.84" +name = "windows-sys" +version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9" +checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" dependencies = [ - "bumpalo", - "log", - "once_cell", - "proc-macro2", - "quote", - "syn 1.0.105", - "wasm-bindgen-shared", + "windows_aarch64_gnullvm 0.42.0", + "windows_aarch64_msvc 0.42.0", + "windows_i686_gnu 0.42.0", + "windows_i686_msvc 0.42.0", + "windows_x86_64_gnu 0.42.0", + "windows_x86_64_gnullvm 0.42.0", + "windows_x86_64_msvc 0.42.0", ] [[package]] -name = "wasm-bindgen-macro" -version = "0.2.84" +name = "windows-sys" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "quote", - "wasm-bindgen-macro-support", + "windows-targets", ] [[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.84" +name = "windows-targets" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.105", - "wasm-bindgen-backend", - "wasm-bindgen-shared", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm", 
+ "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] -name = "wasm-bindgen-shared" -version = "0.2.84" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" - -[[package]] -name = "web-sys" -version = "0.3.61" +name = "windows_aarch64_gnullvm" +version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e33b99f4b23ba3eec1a53ac264e35a755f00e966e0065077d6027c0f575b0b97" -dependencies = [ - "js-sys", - "wasm-bindgen", -] +checksum = "41d2aa71f6f0cbe00ae5167d90ef3cfe66527d6f613ca78ac8024c3ccab9a19e" [[package]] -name = "webpki" -version = "0.22.0" +name = "windows_aarch64_gnullvm" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f095d78192e208183081cc07bc5515ef55216397af48b873e5edcd72637fa1bd" -dependencies = [ - "ring", - "untrusted", -] +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] -name = "webpki-roots" -version = "0.22.6" +name = "windows_aarch64_msvc" +version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6c71e40d7d2c34a5106301fb632274ca37242cd0c9d3e64dbece371a40a2d87" -dependencies = [ - "webpki", -] +checksum = "dd0f252f5a35cac83d6311b2e795981f5ee6e67eb1f9a7f64eb4500fbc4dcdb4" [[package]] -name = "winapi" -version = "0.3.9" +name = "windows_aarch64_msvc" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" +name = "windows_i686_gnu" +version = "0.42.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +checksum = "fbeae19f6716841636c28d695375df17562ca208b2b7d0dc47635a50ae6c5de7" [[package]] -name = "winapi-util" -version = "0.1.5" +name = "windows_i686_gnu" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" -dependencies = [ - "winapi", -] +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" +name = "windows_i686_gnullvm" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] -name = "windows-sys" +name = "windows_i686_msvc" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" -dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", -] +checksum = "84c12f65daa39dd2babe6e442988fc329d6243fdce47d7d2d155b8d874862246" [[package]] -name = "windows_aarch64_gnullvm" -version = "0.42.0" +name = "windows_i686_msvc" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41d2aa71f6f0cbe00ae5167d90ef3cfe66527d6f613ca78ac8024c3ccab9a19e" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] -name = "windows_aarch64_msvc" +name = "windows_x86_64_gnu" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd0f252f5a35cac83d6311b2e795981f5ee6e67eb1f9a7f64eb4500fbc4dcdb4" +checksum = 
"bf7b1b21b5362cbc318f686150e5bcea75ecedc74dd157d874d754a2ca44b0ed" [[package]] -name = "windows_i686_gnu" -version = "0.42.0" +name = "windows_x86_64_gnu" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbeae19f6716841636c28d695375df17562ca208b2b7d0dc47635a50ae6c5de7" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] -name = "windows_i686_msvc" +name = "windows_x86_64_gnullvm" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84c12f65daa39dd2babe6e442988fc329d6243fdce47d7d2d155b8d874862246" +checksum = "09d525d2ba30eeb3297665bd434a54297e4170c7f1a44cad4ef58095b4cd2028" [[package]] -name = "windows_x86_64_gnu" -version = "0.42.0" +name = "windows_x86_64_gnullvm" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf7b1b21b5362cbc318f686150e5bcea75ecedc74dd157d874d754a2ca44b0ed" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] -name = "windows_x86_64_gnullvm" +name = "windows_x86_64_msvc" version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09d525d2ba30eeb3297665bd434a54297e4170c7f1a44cad4ef58095b4cd2028" +checksum = "f40009d85759725a34da6d89a94e63d7bdc50a862acf0dbc7c8e488f1edcb6f5" [[package]] name = "windows_x86_64_msvc" -version = "0.42.0" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f40009d85759725a34da6d89a94e63d7bdc50a862acf0dbc7c8e488f1edcb6f5" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "xattr" -version = "0.2.3" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d1526bbe5aaeb5eb06885f4d987bcdfa5e23187055de9b83fe00156a821fabc" +checksum = "8da84f1a25939b27f6820d92aed108f83ff920fdf11a7b19366c27c4cda81d4f" dependencies = [ "libc", + 
"linux-raw-sys", + "rustix", ] [[package]] name = "yada" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6d12cb7a57bbf2ab670ed9545bae3648048547f9039279a89ce000208e585c1" +checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" [[package]] -name = "zip" -version = "0.6.4" +name = "zeroize" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0445d0fbc924bb93539b4316c11afb121ea39296f99a3c4c9edad09e3658cdef" -dependencies = [ - "aes", - "byteorder", - "bzip2", - "constant_time_eq", - "crc32fast", - "crossbeam-utils", - "flate2", - "hmac", - "pbkdf2", - "sha1", - "time", - "zstd", -] - -[[package]] -name = "zstd" -version = "0.11.2+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" -dependencies = [ - "zstd-safe", -] - -[[package]] -name = "zstd-safe" -version = "5.0.2+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" -dependencies = [ - "libc", - "zstd-sys", -] - -[[package]] -name = "zstd-sys" -version = "2.0.8+zstd.1.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" -dependencies = [ - "cc", - "libc", - "pkg-config", -] +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" diff --git a/Cargo.toml b/Cargo.toml index 7d95fd2..94a75b0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,25 +12,22 @@ categories = ["text-processing"] license = "MIT" [lib] -name = "lindera_py" +name = "lindera" crate-type = ["cdylib"] [features] default = [] # No directories included ipadic = ["lindera/ipadic"] # Include IPADIC dictionary (Japanese) +ipadic-neologd = ["lindera/ipadic-neologd"] # Include IPADIC NEologd dictionary 
(Japanese) unidic = ["lindera/unidic"] # Include UniDic dictionary (Japanese) ko-dic = ["lindera/ko-dic"] # Include ko-dic dictionary (Korean) cc-cedict = ["lindera/cc-cedict"] # Include CC-CEDICT dictionary (Chinese) -ipadic-filter = ["lindera/ipadic-filter"] # Include filters for IPADIC -unidic-filter = ["lindera/unidic-filter"] # Include filters for UniDic -ko-dic-filter = ["lindera/ko-dic-filter"] # Include filters for ko-dic -cc-cedict-filter = ["lindera/cc-cedict-filter"] # Include filters for CC-CEDICT -ipadic-compress = ["lindera/ipadic-compress"] # Compress IPADIC -unidic-compress = ["lindera/unidic-compress"] # Compress UniDic -ko-dic-compress = ["lindera/ko-dic-compress"] # Compress ko-dic -cc-cedict-compress = ["lindera/cc-cedict-compress"] # Compress CC-CEDICT +compress = ["lindera/compress"] # Compress dictionaries [dependencies] -pyo3 = { version = "0.18.2", features = ["extension-module"] } +pyo3 = { version = "0.22.5", features = ["extension-module"] } +serde = { version = "1.0.210", features = ["derive"] } +serde_json = "1.0.131" -lindera = "0.23.1" +# lindera = "0.33.0" +lindera = { path = "../lindera/lindera" } diff --git a/Makefile b/Makefile index 77d9a25..4b688cb 100644 --- a/Makefile +++ b/Makefile @@ -1,30 +1,43 @@ +.DEFAULT_GOAL := build + +VERSION := $(shell poetry version -s) + +init: + poetry self add poetry-plugin-export + poetry config warnings.export false + poetry config virtualenvs.in-project true + poetry install --no-root + +update: + poetry update + clean: cargo clean - find . | grep -E "(__pycache__|\.pyc|\.pyo$$)" | xargs rm -rf - rm -rf .pytest_cache + find . 
| grep -E "(__pycache__|.pytest_cache|.mypy_cache|\.pyc|\.pyo$$)" | xargs rm -rf + +format: + cargo fmt + poetry run isort ./docs ./examples ./tests + poetry run black ./docs ./examples ./tests lint: - black --check examples tests + cargo clippy --all-features + poetry run isort --check-only --diff ./docs ./examples ./tests + poetry run black --check ./docs ./examples ./tests + poetry run flake8 ./docs ./examples ./tests -fmt: - cargo fmt - black examples/* tests/* +typecheck: + poetry run mypy ./examples ./tests .PHONY: tests test: - cargo test - -develop: - maturin develop --release --features=ipadic,unidic,ko-dic,cc-cedict,ipadic-filter,unidic-filter,ko-dic-filter,cc-cedict-filter + cargo test --all-features + maturin develop --release --all-features + poetry run pytest -v ./tests build: - maturin build -i python --release --features=ipadic,unidic,ko-dic,cc-cedict,ipadic-filter,unidic-filter,ko-dic-filter,cc-cedict-filter - -install: - pip install . lindera_py - -uninstall: - pip uninstall -y lindera_py + maturin build -i python --release --all-features -pytest: - python -m pytest tests +tag: + git tag v$(VERSION) + git push origin v$(VERSION) diff --git a/README.ja.md b/README.ja.md deleted file mode 100644 index cd08f30..0000000 --- a/README.ja.md +++ /dev/null @@ -1,21 +0,0 @@ -# lindera-py - -日本語の形態素解析エンジン[Lindera](https://github.com/lindera-morphology/lindera)のPython bindingです. - -## Usage - -このライブラリは現時点では実験的な実装のため、PyPIでは公開していません. - -利用したい場合は自分の利用しているインタプリタ環境上で以下の手順でビルドしてください. - -```shell -git clone https://github.com/lindera-morphology/lindera-py.git -maturin develop --release -``` - -[Maturin](https://github.com/PyO3/maturin)のインストール方法は[here](https://github.com/PyO3/maturin)を参照してください. - -## Config file - -設定ファイルの仕様は[Lindera]()と共有しています. -[こちら](https://github.com/lindera-morphology/lindera/blob/main/resources/lindera_ipadic_conf.json)を参考にして設定を行ってください. 
diff --git a/README.md b/README.md index d83ab34..5b5b14c 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,20 @@ Python binding for [Lindera](https://github.com/lindera-morphology/lindera), a Japanese morphological analysis engine. + +```shell +$ pyenv local 3.12.3 + +$ python -m venv .venv + +$ source .venv/bin/activate + +$ poetry update + +$ poetry run maturin develop +``` + + ## Usage This library is experimental at this time and is not available on PyPI. diff --git a/examples/analyze_example.py b/examples/analyze_example.py deleted file mode 100644 index 93b53f6..0000000 --- a/examples/analyze_example.py +++ /dev/null @@ -1,21 +0,0 @@ -from pathlib import Path -from lindera_py import Analyzer - - -def main(): - lindera_conf_path = Path("resources") / "lindera_ipadic_conf.json" - analyzer = Analyzer(str(lindera_conf_path)) - - text = "Linderaは形態素解析エンジンです。ユーザー辞書も利用可能です。" - print(text) - - # tokenize the text - tokens = analyzer.analyze(text) - - # output the tokens - for token in tokens: - print(f"token: {token.text}, details: {token.details}") - - -if __name__ == "__main__": - main() diff --git a/examples/tokenize_cc-cedict_example.py b/examples/tokenize_cc-cedict_example.py deleted file mode 100644 index cfe2700..0000000 --- a/examples/tokenize_cc-cedict_example.py +++ /dev/null @@ -1,25 +0,0 @@ -from pathlib import Path -from lindera_py import Tokenizer -from lindera_py import DictionaryConfig -from lindera_py import TokenizerConfig - - -def main(): - dictionary_config = DictionaryConfig("cc-cedict") - tokenizer_config = TokenizerConfig(dictionary_config, "normal") - - tokenizer = Tokenizer(tokenizer_config) - - text = "Lindera是一个词法分析引擎。用户词典也可用。" - print(text) - - # tokenize the text - tokens = tokenizer.tokenize(text) - - # output the tokens - for token in tokens: - print(f"token: {token.text}, details: {token.details}") - - -if __name__ == "__main__": - main() diff --git a/examples/tokenize_ipadic.py b/examples/tokenize_ipadic.py new file mode 100644 index
0000000..d8440d5 --- /dev/null +++ b/examples/tokenize_ipadic.py @@ -0,0 +1,20 @@ +from lindera import load_dictionary # type: ignore +from lindera import Tokenizer + + +def main(): + dictionary = load_dictionary("ipadic") + tokenizer = Tokenizer("normal", dictionary) + + text = "すもももももももものうち" + print(f"text: {text}\n") + + # tokenize the text + tokens = tokenizer.tokenize(text) + + for token in tokens: + print(token.text) + + +if __name__ == "__main__": + main() diff --git a/examples/tokenize_ipadic_example.py b/examples/tokenize_ipadic_example.py deleted file mode 100644 index d3a62ee..0000000 --- a/examples/tokenize_ipadic_example.py +++ /dev/null @@ -1,25 +0,0 @@ -from pathlib import Path -from lindera_py import Tokenizer -from lindera_py import DictionaryConfig -from lindera_py import TokenizerConfig - - -def main(): - dictionary_config = DictionaryConfig("ipadic") - tokenizer_config = TokenizerConfig(dictionary_config, "normal") - - tokenizer = Tokenizer(tokenizer_config) - - text = "Linderaは形態素解析エンジンです。ユーザー辞書も利用可能です。" - print(text) - - # tokenize the text - tokens = tokenizer.tokenize(text) - - # output the tokens - for token in tokens: - print(f"token: {token.text}, details: {token.details}") - - -if __name__ == "__main__": - main() diff --git a/examples/tokenize_ko-dic_example.py b/examples/tokenize_ko-dic_example.py deleted file mode 100644 index 9871562..0000000 --- a/examples/tokenize_ko-dic_example.py +++ /dev/null @@ -1,25 +0,0 @@ -from pathlib import Path -from lindera_py import Tokenizer -from lindera_py import DictionaryConfig -from lindera_py import TokenizerConfig - - -def main(): - dictionary_config = DictionaryConfig("ko-dic") - tokenizer_config = TokenizerConfig(dictionary_config, "normal") - - tokenizer = Tokenizer(tokenizer_config) - - text = "Lindera는형태소해석엔진입니다.사용자사전도사용할수있습니다." 
- print(text) - - # tokenize the text - tokens = tokenizer.tokenize(text) - - # output the tokens - for token in tokens: - print(f"token: {token.text}, details: {token.details}") - - -if __name__ == "__main__": - main() diff --git a/examples/tokenize_unidic_example.py b/examples/tokenize_unidic_example.py deleted file mode 100644 index 5fc7897..0000000 --- a/examples/tokenize_unidic_example.py +++ /dev/null @@ -1,25 +0,0 @@ -from pathlib import Path -from lindera_py import Tokenizer -from lindera_py import DictionaryConfig -from lindera_py import TokenizerConfig - - -def main(): - dictionary_config = DictionaryConfig("unidic") - tokenizer_config = TokenizerConfig(dictionary_config, "normal") - - tokenizer = Tokenizer(tokenizer_config) - - text = "Linderaは形態素解析エンジンです。ユーザー辞書も利用可能です。" - print(text) - - # tokenize the text - tokens = tokenizer.tokenize(text) - - # output the tokens - for token in tokens: - print(f"token: {token.text}, details: {token.details}") - - -if __name__ == "__main__": - main() diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..a0a82c2 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,370 @@ +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. + +[[package]] +name = "autopep8" +version = "2.3.1" +description = "A tool that automatically formats Python code to conform to the PEP 8 style guide" +optional = false +python-versions = ">=3.8" +files = [ + {file = "autopep8-2.3.1-py2.py3-none-any.whl", hash = "sha256:a203fe0fcad7939987422140ab17a930f684763bf7335bdb6709991dd7ef6c2d"}, + {file = "autopep8-2.3.1.tar.gz", hash = "sha256:8d6c87eba648fdcfc83e29b788910b8643171c395d9c4bcf115ece035b9c9dda"}, +] + +[package.dependencies] +pycodestyle = ">=2.12.0" + +[[package]] +name = "black" +version = "24.10.0" +description = "The uncompromising code formatter." 
+optional = false +python-versions = ">=3.9" +files = [ + {file = "black-24.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e6668650ea4b685440857138e5fe40cde4d652633b1bdffc62933d0db4ed9812"}, + {file = "black-24.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1c536fcf674217e87b8cc3657b81809d3c085d7bf3ef262ead700da345bfa6ea"}, + {file = "black-24.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:649fff99a20bd06c6f727d2a27f401331dc0cc861fb69cde910fe95b01b5928f"}, + {file = "black-24.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:fe4d6476887de70546212c99ac9bd803d90b42fc4767f058a0baa895013fbb3e"}, + {file = "black-24.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5a2221696a8224e335c28816a9d331a6c2ae15a2ee34ec857dcf3e45dbfa99ad"}, + {file = "black-24.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f9da3333530dbcecc1be13e69c250ed8dfa67f43c4005fb537bb426e19200d50"}, + {file = "black-24.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4007b1393d902b48b36958a216c20c4482f601569d19ed1df294a496eb366392"}, + {file = "black-24.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:394d4ddc64782e51153eadcaaca95144ac4c35e27ef9b0a42e121ae7e57a9175"}, + {file = "black-24.10.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b5e39e0fae001df40f95bd8cc36b9165c5e2ea88900167bddf258bacef9bbdc3"}, + {file = "black-24.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d37d422772111794b26757c5b55a3eade028aa3fde43121ab7b673d050949d65"}, + {file = "black-24.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:14b3502784f09ce2443830e3133dacf2c0110d45191ed470ecb04d0f5f6fcb0f"}, + {file = "black-24.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:30d2c30dc5139211dda799758559d1b049f7f14c580c409d6ad925b74a4208a8"}, + {file = "black-24.10.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:1cbacacb19e922a1d75ef2b6ccaefcd6e93a2c05ede32f06a21386a04cedb981"}, + {file = "black-24.10.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1f93102e0c5bb3907451063e08b9876dbeac810e7da5a8bfb7aeb5a9ef89066b"}, + {file = "black-24.10.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ddacb691cdcdf77b96f549cf9591701d8db36b2f19519373d60d31746068dbf2"}, + {file = "black-24.10.0-cp313-cp313-win_amd64.whl", hash = "sha256:680359d932801c76d2e9c9068d05c6b107f2584b2a5b88831c83962eb9984c1b"}, + {file = "black-24.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:17374989640fbca88b6a448129cd1745c5eb8d9547b464f281b251dd00155ccd"}, + {file = "black-24.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:63f626344343083322233f175aaf372d326de8436f5928c042639a4afbbf1d3f"}, + {file = "black-24.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccfa1d0cb6200857f1923b602f978386a3a2758a65b52e0950299ea014be6800"}, + {file = "black-24.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:2cd9c95431d94adc56600710f8813ee27eea544dd118d45896bb734e9d7a0dc7"}, + {file = "black-24.10.0-py3-none-any.whl", hash = "sha256:3bb2b7a1f7b685f85b11fed1ef10f8a9148bceb49853e47a294a3dd963c1dd7d"}, + {file = "black-24.10.0.tar.gz", hash = "sha256:846ea64c97afe3bc677b761787993be4991810ecc7a4a937816dd6bddedc4875"}, +] + +[package.dependencies] +click = ">=8.0.0" +mypy-extensions = ">=0.4.3" +packaging = ">=22.0" +pathspec = ">=0.9.0" +platformdirs = ">=2" + +[package.extras] +colorama = ["colorama (>=0.4.3)"] +d = ["aiohttp (>=3.10)"] +jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] +uvloop = ["uvloop (>=0.15.2)"] + +[[package]] +name = "click" +version = "8.1.7" +description = "Composable command line interface toolkit" +optional = false +python-versions = ">=3.7" +files = [ + {file = "click-8.1.7-py3-none-any.whl", hash = 
"sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, + {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "flake8" +version = "7.1.1" +description = "the modular source code checker: pep8 pyflakes and co" +optional = false +python-versions = ">=3.8.1" +files = [ + {file = "flake8-7.1.1-py2.py3-none-any.whl", hash = "sha256:597477df7860daa5aa0fdd84bf5208a043ab96b8e96ab708770ae0364dd03213"}, + {file = "flake8-7.1.1.tar.gz", hash = "sha256:049d058491e228e03e67b390f311bbf88fce2dbaa8fa673e7aea87b7198b8d38"}, +] + +[package.dependencies] +mccabe = ">=0.7.0,<0.8.0" +pycodestyle = ">=2.12.0,<2.13.0" +pyflakes = ">=3.2.0,<3.3.0" + +[[package]] +name = "flake8-pyproject" +version = "1.2.3" +description = "Flake8 plug-in loading the configuration from pyproject.toml" +optional = false +python-versions = ">= 3.6" +files = [ + {file = "flake8_pyproject-1.2.3-py3-none-any.whl", hash = "sha256:6249fe53545205af5e76837644dc80b4c10037e73a0e5db87ff562d75fb5bd4a"}, +] + +[package.dependencies] +Flake8 = ">=5" + +[package.extras] +dev = ["pyTest", "pyTest-cov"] + +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = 
"sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + +[[package]] +name = "isort" +version = "5.13.2" +description = "A Python utility / library to sort Python imports." +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "isort-5.13.2-py3-none-any.whl", hash = "sha256:8ca5e72a8d85860d5a3fa69b8745237f2939afe12dbf656afbcb47fe72d947a6"}, + {file = "isort-5.13.2.tar.gz", hash = "sha256:48fdfcb9face5d58a4f6dde2e72a1fb8dcaf8ab26f95ab49fab84c2ddefb0109"}, +] + +[package.extras] +colors = ["colorama (>=0.4.6)"] + +[[package]] +name = "maturin" +version = "1.7.4" +description = "Build and publish crates with pyo3, cffi and uniffi bindings as well as rust binaries as python packages" +optional = false +python-versions = ">=3.7" +files = [ + {file = "maturin-1.7.4-py3-none-linux_armv6l.whl", hash = "sha256:eb7b7753b733ae302c08f80bca7b0c3fda1eea665c2b1922c58795f35a54c833"}, + {file = "maturin-1.7.4-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0182a9638399c8835afd39d2aeacf56908e37cba3f7abb15816b9df6774fab81"}, + {file = "maturin-1.7.4-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:41a29c5b23f3ebdfe7633637e3de256579a1b2700c04cd68c16ed46934440c5a"}, + {file = "maturin-1.7.4-py3-none-manylinux_2_12_i686.manylinux2010_i686.musllinux_1_1_i686.whl", hash = "sha256:23fae44e345a2da5cb391ae878726fb793394826e2f97febe41710bd4099460e"}, + {file = "maturin-1.7.4-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.musllinux_1_1_x86_64.whl", hash = "sha256:8b441521c151f0dbe70ed06fb1feb29b855d787bda038ff4330ca962e5d56641"}, + {file = "maturin-1.7.4-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:7ccb66d0c5297cf06652c5f72cb398f447d3a332eccf5d1e73b3fe14dbc9498c"}, + {file = 
"maturin-1.7.4-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:71f668f19e719048605dbca6a1f4d0dc03b987c922ad9c4bf5be03b9b278e4c3"}, + {file = "maturin-1.7.4-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.musllinux_1_1_ppc64le.whl", hash = "sha256:c179fcb2b494f19186781b667320e43d95b3e71fcb1c98fffad9ef6bd6e276b3"}, + {file = "maturin-1.7.4-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fd5b4b95286f2f376437340f8a4908f4761587212170263084455be8099099a7"}, + {file = "maturin-1.7.4-py3-none-win32.whl", hash = "sha256:35487a424467d1fda4567cbb02d21f09febb10eda22f5fd647b130bc0767dc61"}, + {file = "maturin-1.7.4-py3-none-win_amd64.whl", hash = "sha256:f70c1c8ec9bd4749a53c0f3ae8fdbb326ce45be4f1c5551985ee25a6d7150328"}, + {file = "maturin-1.7.4-py3-none-win_arm64.whl", hash = "sha256:f3d38a6d0c7fd7b04bec30dd470b2173cf9bd184ab6220c1acaf49df6b48faf5"}, + {file = "maturin-1.7.4.tar.gz", hash = "sha256:2b349d742a07527d236f0b4b6cab26f53ebecad0ceabfc09ec4c6a396e3176f9"}, +] + +[package.extras] +patchelf = ["patchelf"] +zig = ["ziglang (>=0.10.0,<0.13.0)"] + +[[package]] +name = "mccabe" +version = "0.7.0" +description = "McCabe checker, plugin for flake8" +optional = false +python-versions = ">=3.6" +files = [ + {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, + {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, +] + +[[package]] +name = "mypy" +version = "1.12.1" +description = "Optional static typing for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "mypy-1.12.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3d7d4371829184e22fda4015278fbfdef0327a4b955a483012bd2d423a788801"}, + {file = "mypy-1.12.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f59f1dfbf497d473201356966e353ef09d4daec48caeacc0254db8ef633a28a5"}, + {file = 
"mypy-1.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b947097fae68004b8328c55161ac9db7d3566abfef72d9d41b47a021c2fba6b1"}, + {file = "mypy-1.12.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:96af62050971c5241afb4701c15189ea9507db89ad07794a4ee7b4e092dc0627"}, + {file = "mypy-1.12.1-cp310-cp310-win_amd64.whl", hash = "sha256:d90da248f4c2dba6c44ddcfea94bb361e491962f05f41990ff24dbd09969ce20"}, + {file = "mypy-1.12.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1230048fec1380faf240be6385e709c8570604d2d27ec6ca7e573e3bc09c3735"}, + {file = "mypy-1.12.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:02dcfe270c6ea13338210908f8cadc8d31af0f04cee8ca996438fe6a97b4ec66"}, + {file = "mypy-1.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a5a437c9102a6a252d9e3a63edc191a3aed5f2fcb786d614722ee3f4472e33f6"}, + {file = "mypy-1.12.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:186e0c8346efc027ee1f9acf5ca734425fc4f7dc2b60144f0fbe27cc19dc7931"}, + {file = "mypy-1.12.1-cp311-cp311-win_amd64.whl", hash = "sha256:673ba1140a478b50e6d265c03391702fa11a5c5aff3f54d69a62a48da32cb811"}, + {file = "mypy-1.12.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9fb83a7be97c498176fb7486cafbb81decccaef1ac339d837c377b0ce3743a7f"}, + {file = "mypy-1.12.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:389e307e333879c571029d5b93932cf838b811d3f5395ed1ad05086b52148fb0"}, + {file = "mypy-1.12.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:94b2048a95a21f7a9ebc9fbd075a4fcd310410d078aa0228dbbad7f71335e042"}, + {file = "mypy-1.12.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ee5932370ccf7ebf83f79d1c157a5929d7ea36313027b0d70a488493dc1b179"}, + {file = "mypy-1.12.1-cp312-cp312-win_amd64.whl", hash = "sha256:19bf51f87a295e7ab2894f1d8167622b063492d754e69c3c2fed6563268cb42a"}, + {file = 
"mypy-1.12.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d34167d43613ffb1d6c6cdc0cc043bb106cac0aa5d6a4171f77ab92a3c758bcc"}, + {file = "mypy-1.12.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:427878aa54f2e2c5d8db31fa9010c599ed9f994b3b49e64ae9cd9990c40bd635"}, + {file = "mypy-1.12.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5fcde63ea2c9f69d6be859a1e6dd35955e87fa81de95bc240143cf00de1f7f81"}, + {file = "mypy-1.12.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d54d840f6c052929f4a3d2aab2066af0f45a020b085fe0e40d4583db52aab4e4"}, + {file = "mypy-1.12.1-cp313-cp313-win_amd64.whl", hash = "sha256:20db6eb1ca3d1de8ece00033b12f793f1ea9da767334b7e8c626a4872090cf02"}, + {file = "mypy-1.12.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b16fe09f9c741d85a2e3b14a5257a27a4f4886c171d562bc5a5e90d8591906b8"}, + {file = "mypy-1.12.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0dcc1e843d58f444fce19da4cce5bd35c282d4bde232acdeca8279523087088a"}, + {file = "mypy-1.12.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e10ba7de5c616e44ad21005fa13450cd0de7caaa303a626147d45307492e4f2d"}, + {file = "mypy-1.12.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:0e6fe449223fa59fbee351db32283838a8fee8059e0028e9e6494a03802b4004"}, + {file = "mypy-1.12.1-cp38-cp38-win_amd64.whl", hash = "sha256:dc6e2a2195a290a7fd5bac3e60b586d77fc88e986eba7feced8b778c373f9afe"}, + {file = "mypy-1.12.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:de5b2a8988b4e1269a98beaf0e7cc71b510d050dce80c343b53b4955fff45f19"}, + {file = "mypy-1.12.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:843826966f1d65925e8b50d2b483065c51fc16dc5d72647e0236aae51dc8d77e"}, + {file = "mypy-1.12.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9fe20f89da41a95e14c34b1ddb09c80262edcc295ad891f22cc4b60013e8f78d"}, + {file = 
"mypy-1.12.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8135ffec02121a75f75dc97c81af7c14aa4ae0dda277132cfcd6abcd21551bfd"}, + {file = "mypy-1.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:a7b76fa83260824300cc4834a3ab93180db19876bce59af921467fd03e692810"}, + {file = "mypy-1.12.1-py3-none-any.whl", hash = "sha256:ce561a09e3bb9863ab77edf29ae3a50e65685ad74bba1431278185b7e5d5486e"}, + {file = "mypy-1.12.1.tar.gz", hash = "sha256:f5b3936f7a6d0e8280c9bdef94c7ce4847f5cdfc258fbb2c29a8c1711e8bb96d"}, +] + +[package.dependencies] +mypy-extensions = ">=1.0.0" +typing-extensions = ">=4.6.0" + +[package.extras] +dmypy = ["psutil (>=4.0)"] +install-types = ["pip"] +mypyc = ["setuptools (>=50)"] +reports = ["lxml"] + +[[package]] +name = "mypy-extensions" +version = "1.0.0" +description = "Type system extensions for programs checked with the mypy type checker." +optional = false +python-versions = ">=3.5" +files = [ + {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, + {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, +] + +[[package]] +name = "packaging" +version = "24.1" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, + {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, +] + +[[package]] +name = "patchelf" +version = "0.17.2.1" +description = "A small utility to modify the dynamic linker and RPATH of ELF executables." 
+optional = false +python-versions = "*" +files = [ + {file = "patchelf-0.17.2.1-py2.py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:fc329da0e8f628bd836dfb8eaf523547e342351fa8f739bf2b3fe4a6db5a297c"}, + {file = "patchelf-0.17.2.1-py2.py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:ccb266a94edf016efe80151172c26cff8c2ec120a57a1665d257b0442784195d"}, + {file = "patchelf-0.17.2.1-py2.py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.musllinux_1_1_ppc64le.whl", hash = "sha256:f47b5bdd6885cfb20abdd14c707d26eb6f499a7f52e911865548d4aa43385502"}, + {file = "patchelf-0.17.2.1-py2.py3-none-manylinux_2_17_s390x.manylinux2014_s390x.musllinux_1_1_s390x.whl", hash = "sha256:a9e6ebb0874a11f7ed56d2380bfaa95f00612b23b15f896583da30c2059fcfa8"}, + {file = "patchelf-0.17.2.1-py2.py3-none-manylinux_2_5_i686.manylinux1_i686.musllinux_1_1_i686.whl", hash = "sha256:3c8d58f0e4c1929b1c7c45ba8da5a84a8f1aa6a82a46e1cfb2e44a4d40f350e5"}, + {file = "patchelf-0.17.2.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.musllinux_1_1_x86_64.whl", hash = "sha256:d1a9bc0d4fd80c038523ebdc451a1cce75237cfcc52dbd1aca224578001d5927"}, + {file = "patchelf-0.17.2.1.tar.gz", hash = "sha256:a6eb0dd452ce4127d0d5e1eb26515e39186fa609364274bc1b0b77539cfa7031"}, +] + +[package.extras] +test = ["importlib-metadata", "pytest"] + +[[package]] +name = "pathspec" +version = "0.12.1" +description = "Utility library for gitignore style pattern matching of file paths." +optional = false +python-versions = ">=3.8" +files = [ + {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"}, + {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, +] + +[[package]] +name = "platformdirs" +version = "4.3.6" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. 
a `user data dir`." +optional = false +python-versions = ">=3.8" +files = [ + {file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"}, + {file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"}, +] + +[package.extras] +docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"] +type = ["mypy (>=1.11.2)"] + +[[package]] +name = "pluggy" +version = "1.5.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "pycodestyle" +version = "2.12.1" +description = "Python style guide checker" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pycodestyle-2.12.1-py2.py3-none-any.whl", hash = "sha256:46f0fb92069a7c28ab7bb558f05bfc0110dac69a0cd23c61ea0040283a9d78b3"}, + {file = "pycodestyle-2.12.1.tar.gz", hash = "sha256:6838eae08bbce4f6accd5d5572075c63626a15ee3e6f842df996bf62f6d73521"}, +] + +[[package]] +name = "pyflakes" +version = "3.2.0" +description = "passive checker of Python programs" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyflakes-3.2.0-py2.py3-none-any.whl", hash = "sha256:84b5be138a2dfbb40689ca07e2152deb896a65c3a3e24c251c5c62489568074a"}, + {file = "pyflakes-3.2.0.tar.gz", hash = "sha256:1c61603ff154621fb2a9172037d84dca3500def8c8b630657d1701f026f8af3f"}, +] + +[[package]] +name = "pytest" +version = "8.3.3" 
+description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest-8.3.3-py3-none-any.whl", hash = "sha256:a6853c7375b2663155079443d2e45de913a911a11d669df02a50814944db57b2"}, + {file = "pytest-8.3.3.tar.gz", hash = "sha256:70b98107bd648308a7952b06e6ca9a50bc660be218d53c257cc1fc94fda10181"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=1.5,<2" + +[package.extras] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + +[[package]] +name = "typing-extensions" +version = "4.12.2" +description = "Backported and Experimental Type Hints for Python 3.8+" +optional = false +python-versions = ">=3.8" +files = [ + {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, + {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, +] + +[metadata] +lock-version = "2.0" +python-versions = "^3.12" +content-hash = "00bb513e85056b8de5fa3ecabff697e591bea824474579850cbbbe80c46fc2f3" diff --git a/pyproject.toml b/pyproject.toml index 2c0d8cd..b278a4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,20 +1,26 @@ -[build-system] -requires = ["maturin>=0.14,<0.15"] -build-backend = "maturin" - -[project] -name = "lindera-py" -description = "Python binding for Lindera." 
+[tool.poetry] +name = "lindera" +version = "0.23.1" +description = "" +authors = ["Minoru Osuka "] +license = "MIT" readme = "README.md" -requires-python = ">=3.7" -classifiers = [ - "Programming Language :: Rust", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", -] -[project.optional-dependencies] -test = [ - "pytest >= 7.2.0", - "black >= 23.1.0" -] +[tool.poetry.dependencies] +python = "^3.12" +maturin = "^1.7.1" +patchelf = "^0.18.0.0" + + +[tool.poetry.group.dev.dependencies] +pytest = "^8.3.3" +black = "^24.10.0" +isort = "^5.13.2" +autopep8 = "^2.3.1" +flake8 = "^7.1.1" +flake8-pyproject = "^1.2.3" +mypy = "^1.12.1" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/pyproject.toml.bak b/pyproject.toml.bak new file mode 100644 index 0000000..2c0d8cd --- /dev/null +++ b/pyproject.toml.bak @@ -0,0 +1,20 @@ +[build-system] +requires = ["maturin>=0.14,<0.15"] +build-backend = "maturin" + +[project] +name = "lindera-py" +description = "Python binding for Lindera." 
+readme = "README.md" +requires-python = ">=3.7" +classifiers = [ + "Programming Language :: Rust", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] + +[project.optional-dependencies] +test = [ + "pytest >= 7.2.0", + "black >= 23.1.0" +] diff --git a/src/analyzer.rs b/src/analyzer.rs deleted file mode 100644 index eddb9bc..0000000 --- a/src/analyzer.rs +++ /dev/null @@ -1,41 +0,0 @@ -use std::path::Path; - -use pyo3::{exceptions::PyValueError, prelude::*}; - -use lindera::analyzer::Analyzer; - -use crate::PyToken; - -#[pyclass(name = "Analyzer")] -pub struct PyAnalyzer { - inner: Analyzer, -} - -#[pymethods] -impl PyAnalyzer { - #[new] - fn new(config_path: &str) -> PyResult { - Ok(Self { - inner: Analyzer::from_file(Path::new(config_path)).unwrap(), - }) - } - - fn analyze(&self, text: &str) -> PyResult> { - let mut text = text.to_string(); - let tokens = self - .inner - .analyze(&mut text) - .map_err(|e| PyValueError::new_err(format!("{:?}", e)))? 
- .into_iter() - .map(|x| PyToken::from(x)) - .collect(); - - Ok(tokens) - } -} - -#[cfg(test)] -mod tests { - #[test] - fn it_works() {} -} diff --git a/src/dictionary.rs b/src/dictionary.rs index 5ba90dd..5ba359b 100644 --- a/src/dictionary.rs +++ b/src/dictionary.rs @@ -1,91 +1,78 @@ -use std::{path::PathBuf, str::FromStr}; +use std::path::PathBuf; +use std::str::FromStr; use pyo3::{exceptions::PyValueError, prelude::*}; -use lindera::{ - dictionary::{ - build_dictionary as lindera_build_dictionary, - build_user_dictionary as lindera_build_user_dictionary, DictionaryConfig, - UserDictionaryConfig, - }, - DictionaryKind, +use lindera::dictionary::{ + load_dictionary_from_kind, load_dictionary_from_path, load_user_dictionary_from_bin, + load_user_dictionary_from_csv, Dictionary, DictionaryKind, UserDictionary, }; - +#[pyclass(name = "Dictionary")] #[derive(Clone)] -#[pyclass(name = "DictionaryConfig")] -pub struct PyDictionaryConfig { - pub inner: DictionaryConfig, -} - -#[pymethods] -impl PyDictionaryConfig { - #[new] - fn new(kind: Option<&str>, path: Option<&str>) -> PyResult { - let k = match kind { - Some(kind_str) => Some( - DictionaryKind::from_str(kind_str) - .map_err(|_err| PyValueError::new_err("Invalid kind"))?, - ), - None => None, - }; - let p = match path { - Some(path_str) => Some(PathBuf::from(path_str)), - None => None, - }; - - Ok(Self { - inner: DictionaryConfig { kind: k, path: p }, - }) - } +pub struct PyDictionary { + pub inner: Dictionary, } +#[pyclass(name = "UserDictionary")] #[derive(Clone)] -#[pyclass(name = "UserDictionaryConfig")] -pub struct PyUserDictionaryConfig { - pub inner: UserDictionaryConfig, +pub struct PyUserDictionary { + pub inner: UserDictionary, } -#[pymethods] -impl PyUserDictionaryConfig { - #[new] - fn new(path: &str, kind: Option<&str>) -> PyResult { - let p = PathBuf::from(path); - let k = match kind { - Some(kind_str) => Some( - DictionaryKind::from_str(kind_str) - .map_err(|_err| PyValueError::new_err("Invalid 
kind"))?, - ), - None => None, - }; +#[pyfunction] +#[pyo3(signature = (kind=None, path=None))] +pub fn load_dictionary(kind: Option<&str>, path: Option<&str>) -> PyResult { + match (kind, path) { + (Some(kind_str), None) => { + let k = DictionaryKind::from_str(kind_str) + .map_err(|_err| PyValueError::new_err("Invalid kind"))?; + let dictionary = load_dictionary_from_kind(k).map_err(|err| { + PyValueError::new_err(format!("Failed to load dictionary: {}", err)) + })?; - Ok(Self { - inner: UserDictionaryConfig { path: p, kind: k }, - }) + Ok(PyDictionary { inner: dictionary }) + } + (None, Some(path_str)) => { + let p = PathBuf::from(path_str); + let dictionary = load_dictionary_from_path(p.as_path()).map_err(|err| { + PyValueError::new_err(format!("Failed to load dictionary: {}", err)) + })?; + + Ok(PyDictionary { inner: dictionary }) + } + _ => Err(PyValueError::new_err("Invalid arguments")), } } #[pyfunction] -pub fn build_dictionary(kind: &str, input_dir: &str, output_dir: &str) -> PyResult<()> { - lindera_build_dictionary( - DictionaryKind::from_str(kind).map_err(|_err| PyValueError::new_err("Invalid kind"))?, - &PathBuf::from(input_dir), - &PathBuf::from(output_dir), - ) - .map_err(|_err| PyValueError::new_err("Failed to build dictionary")) -} +#[pyo3(signature = (path, kind=None))] +pub fn load_user_dictionary(path: &str, kind: Option<&str>) -> PyResult { + let p = PathBuf::from(path); + let ext = p + .extension() + .and_then(|ext| ext.to_str()) + .ok_or_else(|| PyValueError::new_err("Invalid file path"))?; + match (ext, kind) { + ("csv", Some(kind_str)) => { + let k = DictionaryKind::from_str(kind_str) + .map_err(|_err| PyValueError::new_err("Invalid kind"))?; + let user_dictionary = load_user_dictionary_from_csv(k, p).map_err(|err| { + PyValueError::new_err(format!("Failed to load user dictionary: {}", err)) + })?; -#[pyfunction] -pub fn build_user_dictionary(kind: &str, input_file: &str, output_dir: &str) -> PyResult<()> { - lindera_build_user_dictionary( 
- DictionaryKind::from_str(kind).map_err(|_err| PyValueError::new_err("Invalid kind"))?, - &PathBuf::from(input_file), - &PathBuf::from(output_dir), - ) - .map_err(|_err| PyValueError::new_err("Failed to build user dictionary")) -} + Ok(PyUserDictionary { + inner: user_dictionary, + }) + } + ("bin", None) => { + let user_dictionary = load_user_dictionary_from_bin(p).map_err(|err| { + PyValueError::new_err(format!("Failed to load user dictionary: {}", err)) + })?; -#[cfg(test)] -mod tests { - #[test] - fn it_works() {} + Ok(PyUserDictionary { + inner: user_dictionary, + }) + } + _ => Err(PyValueError::new_err("Invalid arguments")), + } } diff --git a/src/lib.rs b/src/lib.rs index be4911b..d2cabeb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,69 +1,23 @@ -#[cfg(any( - feature = "ipadic-filter", - feature = "unidic-filter", - feature = "ko-dic-filter", - feature = "cc-cedict-filter", -))] -pub mod analyzer; pub mod dictionary; +pub mod token; pub mod tokenizer; +pub mod util; -#[cfg(any( - feature = "ipadic-filter", - feature = "unidic-filter", - feature = "ko-dic-filter", - feature = "cc-cedict-filter", -))] -use analyzer::PyAnalyzer; -use dictionary::{PyDictionaryConfig, PyUserDictionaryConfig}; use pyo3::prelude::*; -use lindera::FilteredToken; +use crate::dictionary::{load_dictionary, load_user_dictionary, PyDictionary, PyUserDictionary}; +use crate::token::PyToken; +use crate::tokenizer::PyTokenizer; -use crate::{ - dictionary::{build_dictionary, build_user_dictionary}, - tokenizer::{PyTokenizer, PyTokenizerConfig}, -}; - -#[pyclass(name = "Token")] -struct PyToken { - #[pyo3(get)] - text: String, - #[pyo3(get)] - details: Vec, -} - -impl From for PyToken { - fn from(token: FilteredToken) -> Self { - PyToken { - text: token.text.to_string(), - details: token.details, - } - } -} - -/// A Python module implemented in Rust. 
#[pymodule] -fn lindera_py(_py: Python, m: &PyModule) -> PyResult<()> { - #[cfg(any( - feature = "ipadic-filter", - feature = "unidic-filter", - feature = "ko-dic-filter", - feature = "cc-cedict-filter", - ))] - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_function(wrap_pyfunction!(build_dictionary, m)?)?; - m.add_function(wrap_pyfunction!(build_user_dictionary, m)?)?; +fn lindera(module: &Bound<'_, PyModule>) -> PyResult<()> { + module.add_class::()?; + module.add_class::()?; + module.add_class::()?; + module.add_class::()?; - Ok(()) -} + module.add_function(wrap_pyfunction!(load_dictionary, module)?)?; + module.add_function(wrap_pyfunction!(load_user_dictionary, module)?)?; -#[cfg(test)] -mod tests { - #[test] - fn it_works() {} + Ok(()) } diff --git a/src/token.rs b/src/token.rs new file mode 100644 index 0000000..5c9b8e0 --- /dev/null +++ b/src/token.rs @@ -0,0 +1,17 @@ +use pyo3::prelude::*; + +#[pyclass(name = "Token")] +pub struct PyToken { + #[pyo3(get)] + pub text: String, + #[pyo3(get)] + pub byte_start: usize, + #[pyo3(get)] + pub byte_end: usize, + #[pyo3(get)] + pub position: usize, + #[pyo3(get)] + pub position_length: usize, + #[pyo3(get)] + pub details: Vec, +} diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 05465fb..90229fa 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1,88 +1,93 @@ use std::str::FromStr; -use pyo3::{exceptions::PyValueError, prelude::*}; +use lindera::token_filter::TokenFilterLoader; +use pyo3::exceptions::PyValueError; +use pyo3::prelude::*; +use pyo3::types::PyDict; +use serde_json::json; -use lindera::{ - mode::Mode, - tokenizer::{Tokenizer, TokenizerConfig}, - FilteredToken, -}; +use lindera::character_filter::CharacterFilterLoader; +use lindera::mode::Mode; +use lindera::tokenizer::Tokenizer; -use crate::{ - dictionary::{PyDictionaryConfig, PyUserDictionaryConfig}, - PyToken, -}; +use crate::dictionary::{PyDictionary, PyUserDictionary}; +use 
crate::token::PyToken; +use crate::util::pydict_to_value; -#[derive(Clone)] -#[pyclass(name = "TokenizerConfig")] -pub struct PyTokenizerConfig { - inner: TokenizerConfig, +#[pyclass(name = "Tokenizer")] +pub struct PyTokenizer { + inner: Tokenizer, } #[pymethods] -impl PyTokenizerConfig { +impl PyTokenizer { #[new] + #[pyo3(signature = (mode, dictionary, user_dictionary=None))] fn new( - dic_config: PyDictionaryConfig, mode: &str, - user_dic_config: Option, + dictionary: PyDictionary, + user_dictionary: Option, ) -> PyResult { let m = Mode::from_str(mode).map_err(|_err| PyValueError::new_err("Invalid mode"))?; - + let u = user_dictionary.map(|d| d.inner); Ok(Self { - inner: TokenizerConfig { - dictionary: dic_config.inner, - mode: m, - user_dictionary: user_dic_config.map(|x| x.inner), - }, + inner: Tokenizer::new(m, dictionary.inner, u), }) } -} -#[pyclass(name = "Tokenizer")] -pub struct PyTokenizer { - inner: Tokenizer, -} + #[pyo3(signature = (name, **args))] + fn append_character_filter( + &mut self, + name: &str, + args: Option<&Bound<'_, PyDict>>, + ) -> PyResult<()> { + let character_filter_args = match args { + Some(a) => pydict_to_value(a)?, + None => json!({}), + }; -#[pymethods] -impl PyTokenizer { - #[new] - fn new(config: PyTokenizerConfig) -> PyResult { - Ok(Self { - inner: Tokenizer::from_config(config.inner) - .map_err(|_err| PyValueError::new_err("Invalid config"))?, - }) + let filter = CharacterFilterLoader::load_from_value(name, &character_filter_args) + .map_err(|_err| PyValueError::new_err("Invalid character filter"))?; + self.inner.append_character_filter(filter); + + Ok(()) } + #[pyo3(signature = (name, **args))] + fn append_token_filter( + &mut self, + name: &str, + args: Option<&Bound<'_, PyDict>>, + ) -> PyResult<()> { + let token_filter_args = match args { + Some(a) => pydict_to_value(a)?, + None => json!({}), + }; + + let filter = TokenFilterLoader::load_from_value(name, &token_filter_args) + .map_err(|_err| 
PyValueError::new_err("Invalid token filter"))?; + self.inner.append_token_filter(filter); + + Ok(()) + } + + #[pyo3(signature = (text))] fn tokenize(&self, text: &str) -> PyResult> { - let tokens = self + let mut tokens = self .inner .tokenize(text) - .map_err(|_err| PyValueError::new_err("Tokenize error"))?; - - let mut py_tokens = Vec::new(); - for token in tokens.clone().iter_mut() { - py_tokens.push(PyToken::from(FilteredToken { - text: token.text.to_string(), - byte_start: token.byte_start, - byte_end: token.byte_end, - position: token.position, - position_length: token.position_length, - details: token - .get_details() - .ok_or_else(|| PyValueError::new_err("Invalid token details"))? - .iter() - .map(|s| s.to_string()) - .collect::>(), - })); - } + .map_err(|_err| PyValueError::new_err("Invalid token filter"))?; - Ok(py_tokens) + Ok(tokens + .iter_mut() + .map(|t| PyToken { + text: t.text.to_owned().to_string(), + byte_start: t.byte_start, + byte_end: t.byte_end, + position: t.position, + position_length: t.position_length, + details: t.details().iter().map(|d| d.to_string()).collect(), + }) + .collect()) } } - -#[cfg(test)] -mod tests { - #[test] - fn it_works() {} -} diff --git a/src/util.rs b/src/util.rs new file mode 100644 index 0000000..2c1525b --- /dev/null +++ b/src/util.rs @@ -0,0 +1,131 @@ +use pyo3::prelude::*; +use pyo3::types::{PyDict, PyList}; +use serde_json::Value; + +pub fn pydict_to_value(py_dict: &Bound<'_, PyDict>) -> PyResult { + let mut map = serde_json::Map::new(); + for (key, value) in py_dict.iter() { + let key_str: String = key.extract()?; // Convert the key to a String + let value_json: Value = python_to_json(&value)?; // Convert the value to a JSON Value + map.insert(key_str, value_json); + } + Ok(Value::Object(map)) +} + +fn python_to_json(obj: &Bound<'_, PyAny>) -> PyResult { + if let Ok(py_dict) = obj.downcast::() { + pydict_to_value(py_dict) + } else if let Ok(py_list) = obj.downcast::() { + let mut list = Vec::new(); + for 
elem in py_list.iter() { + list.push(python_to_json(&elem)?); + } + Ok(Value::Array(list)) + } else if let Ok(py_str) = obj.extract::() { + Ok(Value::String(py_str)) + } else if let Ok(py_int) = obj.extract::() { + Ok(Value::Number(serde_json::Number::from(py_int))) + } else if let Ok(py_float) = obj.extract::() { + Ok(Value::Number( + serde_json::Number::from_f64(py_float).unwrap(), + )) + } else if obj.is_none() { + Ok(Value::Null) + } else { + Err(PyErr::new::( + "Unsupported Python object", + )) + } +} + +#[cfg(test)] +mod tests { + // use pyo3::types::IntoPyDict; + // use serde_json::json; + + // use super::*; + + // #[test] + // fn test_pydict_to_value() { + // Python::with_gil(|py| { + // let py_dict = [("key1", "value1"), ("key2", "value2")].into_py_dict_bound(py); + // let value = pydict_to_value(&py_dict).unwrap(); + // let expected = json!({"key1": "value1", "key2": "value2"}); + // assert_eq!(value, expected); + // }); + // } + + // #[test] + // fn test_python_to_json_with_dict() { + // Python::with_gil(|py| { + // let py_dict = [("key1", "value1"), ("key2", "value2")].into_py_dict_bound(py); + // let value = python_to_json(&py_dict).unwrap(); + // let expected = json!({"key1": "value1", "key2": "value2"}); + // assert_eq!(value, expected); + // }); + // } + + // #[test] + // fn test_python_to_json_with_list() { + // Python::with_gil(|py| { + // let binding = vec!["value1", "value2"].into_py(py); + // let py_list = binding.downcast_bound::(py).unwrap(); + // let value = python_to_json(py_list).unwrap(); + // let expected = json!(["value1", "value2"]); + // assert_eq!(value, expected); + // }); + // } + + // #[test] + // fn test_python_to_json_with_string() { + // Python::with_gil(|py| { + // let binding = "value1".to_string().into_py(py); + // let py_str = binding.downcast_bound::(py).unwrap(); + // let value = python_to_json(py_str).unwrap(); + // let expected = json!("value"); + // assert_eq!(value, expected); + // }); + // } + + // #[test] + // fn 
test_python_to_json_with_int() { + // Python::with_gil(|py| { + // let binding = 42_i64.into_py(py); + // let py_int = binding.downcast_bound::(py).unwrap(); + // let value = python_to_json(py_int).unwrap(); + // let expected = json!(42); + // assert_eq!(value, expected); + // }); + // } + + // #[test] + // fn test_python_to_json_with_float() { + // Python::with_gil(|py| { + // let binding = 3.14_f64.into_py(py); + // let py_float = binding.downcast_bound::(py).unwrap(); + // let value = python_to_json(py_float).unwrap(); + // let expected = json!(3.14); + // assert_eq!(value, expected); + // }); + // } + + // #[test] + // fn test_python_to_json_with_none() { + // Python::with_gil(|py| { + // let binding = py.None(); + // let py_none = binding.downcast_bound::(py).unwrap(); + // let value = python_to_json(py_none).unwrap(); + // let expected = json!(null); + // assert_eq!(value, expected); + // }); + // } + + // #[test] + // fn test_python_to_json_with_unsupported_type() { + // Python::with_gil(|py| { + // let py_tuple = (1, 2).into_py(py); + // let result = python_to_json(py_tuple); + // assert!(result.is_err()); + // }); + // } +} diff --git a/tests/test_analyze.py b/tests/test_analyze.py deleted file mode 100644 index 7b1a6d3..0000000 --- a/tests/test_analyze.py +++ /dev/null @@ -1,23 +0,0 @@ -from pathlib import Path -from lindera_py import Analyzer - - -def test_analyze(): - lindera_conf_path = Path("resources") / "lindera_ipadic_conf.json" - analyzer = Analyzer(str(lindera_conf_path)) - - text = "Linderaは形態素解析エンジンです。ユーザー辞書も利用可能です。" - - # tokenize the text - tokens = analyzer.analyze(text) - - assert tokens[0].text == "Lindera" - assert tokens[1].text == "形態素" - assert tokens[2].text == "解析" - assert tokens[3].text == "エンジン" - assert tokens[4].text == "ユーザ" - assert tokens[5].text == "辞書" - assert tokens[6].text == "利用" - assert tokens[7].text == "可能" - - assert len(tokens) == 8 diff --git a/tests/test_tokenize_cc-cedict.py b/tests/test_tokenize_cc-cedict.py 
deleted file mode 100644 index 31f150f..0000000 --- a/tests/test_tokenize_cc-cedict.py +++ /dev/null @@ -1,34 +0,0 @@ -from pathlib import Path -from lindera_py import Tokenizer -from lindera_py import DictionaryConfig -from lindera_py import TokenizerConfig - - -def test_analyze(): - dictionary_config = DictionaryConfig("cc-cedict") - tokenizer_config = TokenizerConfig(dictionary_config, "normal") - - tokenizer = Tokenizer(tokenizer_config) - - text = "Lindera是一个词法分析引擎。用户词典也可用。" - print(text) - - # tokenize the text - tokens = tokenizer.tokenize(text) - - assert tokens[0].text == "Lindera" - assert tokens[1].text == "是" - assert tokens[2].text == "一" - assert tokens[3].text == "个" - assert tokens[4].text == "词法" - assert tokens[5].text == "分析" - assert tokens[6].text == "引擎" - assert tokens[7].text == "。" - assert tokens[8].text == "用户" - assert tokens[9].text == "词典" - assert tokens[10].text == "也" - assert tokens[11].text == "可" - assert tokens[12].text == "用" - assert tokens[13].text == "。" - - assert len(tokens) == 14 diff --git a/tests/test_tokenize_ipadic.py b/tests/test_tokenize_ipadic.py index f00ce48..a9128bf 100644 --- a/tests/test_tokenize_ipadic.py +++ b/tests/test_tokenize_ipadic.py @@ -1,34 +1,23 @@ -from pathlib import Path -from lindera_py import Tokenizer -from lindera_py import DictionaryConfig -from lindera_py import TokenizerConfig +from lindera import load_dictionary # type: ignore +from lindera import Tokenizer -def test_analyze(): - dictionary_config = DictionaryConfig("ipadic") - tokenizer_config = TokenizerConfig(dictionary_config, "normal") +def test_tokenize_with_ipadic(): + dictionary = load_dictionary("ipadic") + tokenizer = Tokenizer("normal", dictionary) - tokenizer = Tokenizer(tokenizer_config) - - text = "Linderaは形態素解析エンジンです。ユーザー辞書も利用可能です。" + text = "すもももももももものうち" print(text) # tokenize the text tokens = tokenizer.tokenize(text) - assert tokens[0].text == "Lindera" - assert tokens[1].text == "は" - assert tokens[2].text == "形態素" - 
assert tokens[3].text == "解析" - assert tokens[4].text == "エンジン" - assert tokens[5].text == "です" - assert tokens[6].text == "。" - assert tokens[7].text == "ユーザー" - assert tokens[8].text == "辞書" - assert tokens[9].text == "も" - assert tokens[10].text == "利用" - assert tokens[11].text == "可能" - assert tokens[12].text == "です" - assert tokens[13].text == "。" + assert tokens[0].text == "すもも" + assert tokens[1].text == "も" + assert tokens[2].text == "もも" + assert tokens[3].text == "も" + assert tokens[4].text == "もも" + assert tokens[5].text == "の" + assert tokens[6].text == "うち" - assert len(tokens) == 14 + assert len(tokens) == 7 diff --git a/tests/test_tokenize_ko-dic.py b/tests/test_tokenize_ko-dic.py deleted file mode 100644 index 185a743..0000000 --- a/tests/test_tokenize_ko-dic.py +++ /dev/null @@ -1,35 +0,0 @@ -from pathlib import Path -from lindera_py import Tokenizer -from lindera_py import DictionaryConfig -from lindera_py import TokenizerConfig - - -def test_analyze(): - dictionary_config = DictionaryConfig("ko-dic") - tokenizer_config = TokenizerConfig(dictionary_config, "normal") - - tokenizer = Tokenizer(tokenizer_config) - - text = "Lindera는형태소해석엔진입니다.사용자사전도사용할수있습니다." - print(text) - - # tokenize the text - tokens = tokenizer.tokenize(text) - - assert tokens[0].text == "Lindera" - assert tokens[1].text == "는" - assert tokens[2].text == "형태소" - assert tokens[3].text == "해석" - assert tokens[4].text == "엔진" - assert tokens[5].text == "입니다" - assert tokens[6].text == "." - assert tokens[7].text == "사용자" - assert tokens[8].text == "사전도" - assert tokens[9].text == "사용" - assert tokens[10].text == "할" - assert tokens[11].text == "수" - assert tokens[12].text == "있" - assert tokens[13].text == "습니다" - assert tokens[14].text == "." 
- - assert len(tokens) == 15 diff --git a/tests/test_tokenize_unidic.py b/tests/test_tokenize_unidic.py deleted file mode 100644 index 11225b0..0000000 --- a/tests/test_tokenize_unidic.py +++ /dev/null @@ -1,35 +0,0 @@ -from pathlib import Path -from lindera_py import Tokenizer -from lindera_py import DictionaryConfig -from lindera_py import TokenizerConfig - - -def test_analyze(): - dictionary_config = DictionaryConfig("unidic") - tokenizer_config = TokenizerConfig(dictionary_config, "normal") - - tokenizer = Tokenizer(tokenizer_config) - - text = "Linderaは形態素解析エンジンです。ユーザー辞書も利用可能です。" - print(text) - - # tokenize the text - tokens = tokenizer.tokenize(text) - - assert tokens[0].text == "Lindera" - assert tokens[1].text == "は" - assert tokens[2].text == "形態" - assert tokens[3].text == "素" - assert tokens[4].text == "解析" - assert tokens[5].text == "エンジン" - assert tokens[6].text == "です" - assert tokens[7].text == "。" - assert tokens[8].text == "ユーザー" - assert tokens[9].text == "辞書" - assert tokens[10].text == "も" - assert tokens[11].text == "利用" - assert tokens[12].text == "可能" - assert tokens[13].text == "です" - assert tokens[14].text == "。" - - assert len(tokens) == 15 From e79ebbb9d1e172e411118379b28e7fa0cbff1d3f Mon Sep 17 00:00:00 2001 From: Minoru Osuka Date: Sun, 20 Oct 2024 22:36:06 +0900 Subject: [PATCH 2/2] Update release.yml --- .github/workflows/release.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 9fd8e07..ab3914f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,13 +1,12 @@ name: Release on: - push: - branches: - - main - - master - pull_request: workflow_dispatch: + push: + tags: + - "v*.*.*" + jobs: linux: runs-on: ubuntu-latest