From e933d5866124b14fcf53b92f48ea8920ae055fbf Mon Sep 17 00:00:00 2001 From: Hiroshi Matsuda Date: Thu, 26 Aug 2021 10:31:55 +0900 Subject: [PATCH 1/4] add a link for Japanese page --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 3fcf6f7..0f6ac40 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,8 @@ An Open Source Japanese NLP Library, based on Universal Dependencies ***Please read the [Important changes](#ginza-500) before you upgrade GiNZA.*** +[日本語ページはこちら](https://megagonlabs.github.io/ginza/) + ## License GiNZA NLP Library and GiNZA Japanese Universal Dependencies Models are distributed under the [MIT License](https://github.com/megagonlabs/ginza/blob/master/LICENSE). From f501fc58f28ac7b7e829e05c9789c8dee49f84df Mon Sep 17 00:00:00 2001 From: Hiroshi Matsuda Date: Thu, 26 Aug 2021 12:25:15 +0900 Subject: [PATCH 2/4] debug ginzame #179 --- ginza/command_line.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ginza/command_line.py b/ginza/command_line.py index fef1f2c..6c2a5c8 100644 --- a/ginza/command_line.py +++ b/ginza/command_line.py @@ -8,7 +8,7 @@ import spacy from spacy.tokens import Span -from spacy.lang.ja import JapaneseDefaults +from spacy.lang.ja import Japanese, JapaneseTokenizer from . import set_split_mode, inflection, reading_form, ent_label_ene, ent_label_ontonotes,\ bunsetu_bi_label, bunsetu_position_type, force_using_normalized_form_as_lemma @@ -178,9 +178,7 @@ def set_nlp(self): spacy.require_gpu() if self.output_format in ["2", "mecab"]: - nlp = JapaneseDefaults.create_tokenizer(config={ - "split_mode": self.split_mode - }).tokenizer + nlp = JapaneseTokenizer(nlp=Japanese(), split_mode=self.split_mode).tokenizer else: # Work-around for pickle error. Need to share model data. if self.model_path: From 65eb63524a7e9b100ca6e942375ae947351571f2 Mon Sep 17 00:00:00 2001 From: Hiroshi Matsuda Date: Thu, 26 Aug 2021 12:51:52 +0900 Subject: [PATCH 3/4] Debug disable_sentencizer #178 --- ginza/__init__.py | 18 ++++++++++- ginza/command_line.py | 14 +++----- ginza/disable_sentencizer.py | 63 ++++++++++++++++++++++++++++++++++++ setup.py | 3 +- 4 files changed, 87 insertions(+), 11 deletions(-) create mode 100644 ginza/disable_sentencizer.py diff --git a/ginza/__init__.py b/ginza/__init__.py index 29bff7f..d2503fa 100644 --- a/ginza/__init__.py +++ b/ginza/__init__.py @@ -9,11 +9,12 @@ from .bunsetu_recognizer import * from .compound_splitter import * +from .disable_sentencizer import * from .ene_ontonotes_mapper import ENE_ONTONOTES_MAPPING __all__ = [ - "make_compound_splitter", "make_bunsetu_recognizer", + "make_compound_splitter", "make_bunsetu_recognizer", "make_disable_sentencizer", "force_using_normalized_form_as_lemma", "set_split_mode", "token_i", "text", "text_with_ws", "orth", "orth_", "ent_type", "ent_type_", "ent_iob", "ent_iob_", @@ -79,6 +80,21 @@ def make_bunsetu_recognizer( remain_bunsetu_suffix, ) +@Language.factory( + "disable_sentencizer", + requires=[], + assigns=[], + retokenizes=False, + default_config={}, +) +def make_disable_sentencizer( + nlp: Language, + name: str, +): + return DisableSentencizer( + nlp.vocab, + ) + _morpheme_dictionary_form = None diff --git a/ginza/command_line.py b/ginza/command_line.py index 6c2a5c8..290586e 100644 --- a/ginza/command_line.py +++ b/ginza/command_line.py @@ -11,7 +11,7 @@ from spacy.lang.ja import Japanese, JapaneseTokenizer from . 
import set_split_mode, inflection, reading_form, ent_label_ene, ent_label_ontonotes,\ - bunsetu_bi_label, bunsetu_position_type, force_using_normalized_form_as_lemma + bunsetu_bi_label, bunsetu_position_type, force_using_normalized_form_as_lemma, make_disable_sentencizer from .bunsetu_recognizer import bunsetu_available, bunsetu_head_list, bunsetu_phrase_span MINI_BATCH_SIZE = 100 @@ -184,7 +184,7 @@ def set_nlp(self): if self.model_path: nlp = spacy.load(self.model_path) elif self.ensure_model: - nlp = spacy.load(self.ensure_model) + nlp = spacy.load(self.ensure_model.replace("-", "_")) else: try: nlp = spacy.load("ja_ginza_electra") @@ -192,15 +192,11 @@ def set_nlp(self): try: nlp = spacy.load("ja_ginza") except IOError as e: - print('Could not find the model. You need to install "ja_ginza_electra" or "ja_ginza" by executing pip like `pip install ja_ginza_electra`.', file=sys.stderr) + print('Could not find the model. You need to install "ja-ginza-electra" or "ja-ginza" by executing pip like `pip install ja-ginza-electra`.', file=sys.stderr) raise e if self.disable_sentencizer: - def disable_sentencizer(doc): - for t in doc: - t.is_sent_start = False - return doc - nlp.add_pipe(disable_sentencizer, before="parser") + nlp.add_pipe("disable_sentencizer", before="parser") if self.split_mode: set_split_mode(nlp, self.split_mode) @@ -444,7 +440,7 @@ def main_ginzame(): @plac.annotations( model_path=("model directory path", "option", "b", str), - ensure_model=("select model either ja_ginza or ja_ginza_electra", "option", "m", str, ["ja_ginza", "ja_ginza_electra", None]), + ensure_model=("select model either ja_ginza or ja_ginza_electra", "option", "m", str, ["ja_ginza", "ja-ginza", "ja_ginza_electra", "ja-ginza-electra", None]), split_mode=("split mode", "option", "s", str, ["A", "B", "C", None]), hash_comment=("hash comment", "option", "c", str, ["print", "skip", "analyze"]), output_path=("output path", "option", "o", Path), diff --git a/ginza/disable_sentencizer.py b/ginza/disable_sentencizer.py new file mode 100644 index 0000000..4bcf747 --- /dev/null +++ b/ginza/disable_sentencizer.py @@ -0,0 +1,63 @@ +# encoding: utf8 +from collections import OrderedDict + +import srsly + +from spacy import util + + +__all__ = [ + "DisableSentencizer", +] + + + +class DisableSentencizer: + def __init__(self, nlp): + self.nlp = nlp + + def __call__(self, doc): + for t in doc: + t.is_sent_start = False + return doc + + def _get_config(self): + return {} + + def _set_config(self, config=None): + pass + + def to_bytes(self, **_kwargs): + serializers = OrderedDict( + ( + ("cfg", lambda: srsly.json_dumps(self._get_config())), + ) + ) + return util.to_bytes(serializers, []) + + def from_bytes(self, data, **_kwargs): + deserializers = OrderedDict( + ( + ("cfg", lambda b: self._set_config(srsly.json_loads(b))), + ) + ) + util.from_bytes(data, deserializers, []) + return self + + def to_disk(self, path, **_kwargs): + path = util.ensure_path(path) + serializers = OrderedDict( + ( + ("cfg", lambda p: srsly.write_json(p, self._get_config())), + ) + ) + return util.to_disk(path, serializers, []) + + def from_disk(self, path, **_kwargs): + path = util.ensure_path(path) + serializers = OrderedDict( + ( + ("cfg", lambda p: self._set_config(srsly.read_json(p))), + ) + ) + util.from_disk(path, serializers, []) diff --git a/setup.py b/setup.py index db19e60..8db0f55 100644 --- a/setup.py +++ b/setup.py @@ -9,6 +9,7 @@ "spacy_factories": [ "bunsetu_recognizer = ginza:make_bunsetu_recognizer", "compound_splitter = 
ginza:make_compound_splitter",
+        "disable_sentencizer = ginza:make_disable_sentencizer",
     ],
     "console_scripts": [
         "ginza = ginza.command_line:main_ginza",
@@ -25,5 +26,5 @@
     name="ginza",
     packages=find_packages(include=["ginza"]),
     url="https://github.com/megagonlabs/ginza",
-    version='5.0.0',
+    version='5.0.1',
 )

From 95d8aaf55be20978407d13fcd20004344d60ff23 Mon Sep 17 00:00:00 2001
From: Hiroshi Matsuda
Date: Thu, 26 Aug 2021 12:52:07 +0900
Subject: [PATCH 4/4] readme

---
 README.md     | 10 +++++++++-
 docs/index.md | 12 ++++++++++--
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 0f6ac40..fedf09c 100644
--- a/README.md
+++ b/README.md
@@ -221,6 +221,14 @@ Please read the official documents to compile user dictionaries with `sudachipy`

 ### version 5.x

+#### ginza-5.0.1
+- 2021-08-26
+- Bug fixes
+  - `ginzame not working in ginza ver. 5` #179
+  - `Command Line -d option not working in v5.0.0` #178
+- Improvement
+  - accept `ja-ginza` and `ja-ginza-electra` for the `-m` option of the `ginza` command
+
 #### ginza-5.0.0
 - 2021-08-26, Demantoid
 - Important changes
@@ -389,7 +397,7 @@ with open('sample2.pickle', 'wb') as f:
 - upgrade `sudachipy` to v0.4.2

 ### version 2.x
-### version 2.x
+
 #### ginza-2.2.1
 - 2019-10-28
 - Improvements
diff --git a/docs/index.md b/docs/index.md
index 230ae31..bc6aa5a 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -41,7 +41,7 @@ GiNZA v5の解析精度は以前のバージョンから飛躍的な向上を遂
 | --- | --- | --- | --- | --- |
 | ja_ginza_electra | 92.3 | 93.7 | 98.1 | 61.3 |
 | ja_ginza (v5) | 89.2 | 91.1 | 97.0 | 53.9 |
-| ja_ginza (v4相当) | 89.0 | 91.0 | 95.1 | 53.3 |
+| ja_ginza (v4相当) | 89.0 | 91.0 | 95.1 | 53.1 |

 `ja_ginza_electra`は`ja_ginza`に対して、5万ステップ学習時の依存関係ラベリング・単語依存構造解析の誤りを、以前のバージョンと比較して25%以上低減できました。

@@ -49,7 +49,7 @@

 [関根の拡張固有表現階層](http://liat-aip.sakura.ne.jp/ene/ene8/definition_jp/html/enedetail.html)を用いた拡張固有表現抽出精度(ENE)においても`ja_ginza_electra`は大幅な精度向上が得られています。GiNZAは関根の拡張固有表現階層にもとづく固有表現抽出結果を、spaCyで標準的に用いられる[OntoNotes5](https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf)にマッピング(変換表を適用)して出力しています。OntoNotes5は関根の拡張固有表現階層よりカテゴリ数が非常に少ない(粗い)ため、拡張固有表現をOntoNotes5体系にマッピングした場合の固有表現抽出精度は、拡張固有表現での数値より一般に高くなります。

-※各モデルの学習と解析精度評価にはUD_Japanese-BCCWJ r2.8から新聞記事系のテキストを除外したものをSudachi辞書mode C(長単位))で再解析(retokenize)した上で、文節主辞情報を依存関係ラベルに組み合わせた状態のコーパスを用いています。
+※各モデルの学習と解析精度評価にはUD_Japanese-BCCWJ r2.8から新聞記事系のテキストを除外したものをSudachi辞書mode C(長単位)で再解析(retokenize)した上で、文節主辞情報を依存関係ラベルに組み合わせた状態のコーパスを用いています。

 ## 実行環境
@@ -225,6 +225,14 @@ Contains information from mC4 which is made available under the ODC Attribution License.

 ### version 5.x

+#### ginza-5.0.1
+- 2021-08-26
+- Bug fixes
+  - `ginzame not working in ginza ver. 5` #179
+  - `Command Line -d option not working in v5.0.0` #178
+- Improvement
+  - accept `ja-ginza` and `ja-ginza-electra` for the `-m` option of the `ginza` command
+
 #### ginza-5.0.0
 - 2021-08-26, Demantoid
 - 重要な変更
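
For reference, a minimal usage sketch of the behavior this patch series enables — not part of the patches themselves. It assumes ginza 5.0.1 and the `ja_ginza` model are installed via pip; the sample text and the expected sentence count are illustrative:

```python
import spacy

# The -m fix in PATCH 3/4 normalizes hyphenated model names, so both
# "ja-ginza" and "ja_ginza" resolve to the same installed package:
nlp = spacy.load("ja-ginza".replace("-", "_"))

# PATCH 3/4 registers "disable_sentencizer" as a spaCy factory (entry
# point ginza:make_disable_sentencizer), so the former ad-hoc closure
# can be added by name — this is what the repaired -d option does:
nlp.add_pipe("disable_sentencizer", before="parser")

# DisableSentencizer marks every token is_sent_start=False before the
# parser runs, so the two Japanese sentences below come out as a single
# sentence span.
doc = nlp("銀座でランチをご一緒しましょう。今度の日曜日はどうですか。")
print(len(list(doc.sents)))  # expected: 1
```

Registering the component as a named factory, rather than passing a bare function to `nlp.add_pipe`, is what spaCy v3 requires; the v2-style closure in the old code is the root cause of the `-d` failure reported in #178.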