Skip to content

Commit

Permalink
Merge pull request #181 from megagonlabs/develop
Browse files Browse the repository at this point in the history
Release v5.0.1
  • Loading branch information
hiroshi-matsuda-rit authored Aug 26, 2021
2 parents 79f27f8 + 3910b5e commit 8b10169
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 18 deletions.
12 changes: 11 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ An Open Source Japanese NLP Library, based on Universal Dependencies

***Please read the [Important changes](#ginza-500) before you upgrade GiNZA.***

[日本語ページはこちら](https://megagonlabs.github.io/ginza/)

## License
GiNZA NLP Library and GiNZA Japanese Universal Dependencies Models are distributed under the
[MIT License](https://github.com/megagonlabs/ginza/blob/master/LICENSE).
Expand Down Expand Up @@ -219,6 +221,14 @@ Please read the official documents to compile user dictionaries with `sudachipy`

### version 5.x

#### ginza-5.0.1
- 2021-08-26
- Bug fix
- `ginzame not woriking in ginza ver. 5` #179
- `Command Line -d option not working in v5.0.0` #178
- Improvement
- accept `ja-ginza` and `ja-ginza-electra` for `-m` option of `ginza` command

#### ginza-5.0.0
- 2021-08-26, Demantoid
- Important changes
Expand Down Expand Up @@ -387,7 +397,7 @@ with open('sample2.pickle', 'wb') as f:
- upgrade `sudachipy` to v0.4.2

### version 2.x
### version 2.x

#### ginza-2.2.1
- 2019-10-28
- Improvements
Expand Down
12 changes: 10 additions & 2 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,15 @@ GiNZA v5の解析精度は以前のバージョンから飛躍的な向上を遂
| --- | --- | --- | --- | --- |
| ja_ginza_electra | 92.3 | 93.7 | 98.1 | 61.3 |
| ja_ginza (v5) | 89.2 | 91.1 | 97.0 | 53.9 |
| ja_ginza (v4相当) | 89.0 | 91.0 | 95.1 | 53.3 |
| ja_ginza (v4相当) | 89.0 | 91.0 | 95.1 | 53.1 |

`ja_ginza_electra`は`ja_ginza`に対して、5万ステップ学習時の依存関係ラベリング・単語依存構造解析の誤りを、以前のバージョンと比較して25%以上低減できました。

また従来型モデルの`ja_ginza`においても、処理パイプラインに`morphologizer`を追加することにより、以前のバージョンと比較してUD品詞推定の誤りを4割低減できました。

[関根の拡張固有表現階層](http://liat-aip.sakura.ne.jp/ene/ene8/definition_jp/html/enedetail.html)を用いた拡張固有表現抽出精度(ENE)においても`ja_ginza_electra`は大幅な精度向上が得られています。GiNZAは関根の拡張固有表現階層にもとづく固有表現抽出結果を、spaCyで標準的に用いられる[OntoNotes5](https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf)にマッピング(変換表を適用)して出力しています。OntoNotes5は関根の拡張固有表現階層よりカテゴリ数が非常に少ない(粗い)ため、拡張固有表現をOntoNotes5体系にマッピングした場合の固有表現抽出精度は、拡張固有表現での数値より一般に高くなります。

※各モデルの学習と解析精度評価にはUD_Japanese-BCCWJ r2.8から新聞記事系のテキストを除外したものをSudachi辞書mode C(長単位))で再解析(retokenize)した上で、文節主辞情報を依存関係ラベルに組み合わせた状態のコーパスを用いています。
※各モデルの学習と解析精度評価にはUD_Japanese-BCCWJ r2.8から新聞記事系のテキストを除外したものをSudachi辞書mode C(長単位)で再解析(retokenize)した上で、文節主辞情報を依存関係ラベルに組み合わせた状態のコーパスを用いています。

## 実行環境

Expand Down Expand Up @@ -225,6 +225,14 @@ Contains information from mC4 which is made available under the ODC Attribution

### version 5.x

#### ginza-5.0.1
- 2021-08-26
- Bug fix
- `ginzame not woriking in ginza ver. 5` #179
- `Command Line -d option not working in v5.0.0` #178
- Improvement
- accept `ja-ginza` and `ja-ginza-electra` for `-m` option of `ginza` command

#### ginza-5.0.0
- 2021-08-26, Demantoid
- 重要な変更
Expand Down
18 changes: 17 additions & 1 deletion ginza/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@

from .bunsetu_recognizer import *
from .compound_splitter import *
from .disable_sentencizer import *
from .ene_ontonotes_mapper import ENE_ONTONOTES_MAPPING


__all__ = [
"make_compound_splitter", "make_bunsetu_recognizer",
"make_compound_splitter", "make_bunsetu_recognizer", "make_disable_sentencizer",
"force_using_normalized_form_as_lemma", "set_split_mode",
"token_i", "text", "text_with_ws", "orth", "orth_",
"ent_type", "ent_type_", "ent_iob", "ent_iob_",
Expand Down Expand Up @@ -79,6 +80,21 @@ def make_bunsetu_recognizer(
remain_bunsetu_suffix,
)

@Language.factory(
    "disable_sentencizer",
    requires=[],
    assigns=[],
    retokenizes=False,
    default_config={},
)
def make_disable_sentencizer(
    nlp: Language,
    name: str,
):
    """Factory for the ``disable_sentencizer`` pipeline component.

    Registered under the name ``"disable_sentencizer"`` so it can be added
    with ``nlp.add_pipe("disable_sentencizer")``.  Note that the component
    is handed the pipeline's vocab, not the ``Language`` object itself.
    """
    component = DisableSentencizer(
        nlp.vocab,
    )
    return component


_morpheme_dictionary_form = None

Expand Down
20 changes: 7 additions & 13 deletions ginza/command_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
import spacy
from spacy.tokens import Span

from spacy.lang.ja import JapaneseDefaults
from spacy.lang.ja import Japanese, JapaneseTokenizer

from . import set_split_mode, inflection, reading_form, ent_label_ene, ent_label_ontonotes,\
bunsetu_bi_label, bunsetu_position_type, force_using_normalized_form_as_lemma
bunsetu_bi_label, bunsetu_position_type, force_using_normalized_form_as_lemma, make_disable_sentencizer
from .bunsetu_recognizer import bunsetu_available, bunsetu_head_list, bunsetu_phrase_span

MINI_BATCH_SIZE = 100
Expand Down Expand Up @@ -178,31 +178,25 @@ def set_nlp(self):
spacy.require_gpu()

if self.output_format in ["2", "mecab"]:
nlp = JapaneseDefaults.create_tokenizer(config={
"split_mode": self.split_mode
}).tokenizer
nlp = JapaneseTokenizer(nlp=Japanese(), split_mode=self.split_mode).tokenizer
else:
# Work-around for pickle error. Need to share model data.
if self.model_path:
nlp = spacy.load(self.model_path)
elif self.ensure_model:
nlp = spacy.load(self.ensure_model)
nlp = spacy.load(self.ensure_model.replace("-", "_"))
else:
try:
nlp = spacy.load("ja_ginza_electra")
except IOError as e:
try:
nlp = spacy.load("ja_ginza")
except IOError as e:
print('Could not find the model. You need to install "ja_ginza_electra" or "ja_ginza" by executing pip like `pip install ja_ginza_electra`.', file=sys.stderr)
print('Could not find the model. You need to install "ja-ginza-electra" or "ja-ginza" by executing pip like `pip install ja-ginza-electra`.', file=sys.stderr)
raise e

if self.disable_sentencizer:
def disable_sentencizer(doc):
for t in doc:
t.is_sent_start = False
return doc
nlp.add_pipe(disable_sentencizer, before="parser")
nlp.add_pipe("disable_sentencizer", before="parser")

if self.split_mode:
set_split_mode(nlp, self.split_mode)
Expand Down Expand Up @@ -446,7 +440,7 @@ def main_ginzame():

@plac.annotations(
model_path=("model directory path", "option", "b", str),
ensure_model=("select model either ja_ginza or ja_ginza_electra", "option", "m", str, ["ja_ginza", "ja_ginza_electra", None]),
ensure_model=("select model either ja_ginza or ja_ginza_electra", "option", "m", str, ["ja_ginza", "ja-ginza", "ja_ginza_electra", "ja-ginza-electra", None]),
split_mode=("split mode", "option", "s", str, ["A", "B", "C", None]),
hash_comment=("hash comment", "option", "c", str, ["print", "skip", "analyze"]),
output_path=("output path", "option", "o", Path),
Expand Down
63 changes: 63 additions & 0 deletions ginza/disable_sentencizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# encoding: utf8
from collections import OrderedDict

import srsly

from spacy import util


__all__ = [
"DisableSentencizer",
]



class DisableSentencizer:
    """Pipeline component that suppresses sentence segmentation.

    Clears the sentence-start flag on every token so downstream components
    treat the whole document as a single sentence.  NOTE(review): the
    factory in ``ginza/__init__.py`` passes ``nlp.vocab`` as the single
    constructor argument, so ``self.nlp`` actually holds a vocab; it is
    stored but never read.
    """

    def __init__(self, nlp):
        # Kept only to satisfy the component interface; unused otherwise.
        self.nlp = nlp

    def __call__(self, doc):
        """Mark every token as *not* starting a sentence and return the doc."""
        for token in doc:
            token.is_sent_start = False
        return doc

    def _get_config(self):
        # No tunable settings yet; the serialized config is an empty mapping.
        return {}

    def _set_config(self, config=None):
        # Nothing to restore — present only for serialization symmetry.
        pass

    def to_bytes(self, **_kwargs):
        """Serialize the (empty) config to bytes via spaCy's helper."""
        writers = OrderedDict()
        writers["cfg"] = lambda: srsly.json_dumps(self._get_config())
        return util.to_bytes(writers, [])

    def from_bytes(self, data, **_kwargs):
        """Restore state from a blob produced by ``to_bytes``; returns self."""
        readers = OrderedDict()
        readers["cfg"] = lambda b: self._set_config(srsly.json_loads(b))
        util.from_bytes(data, readers, [])
        return self

    def to_disk(self, path, **_kwargs):
        """Write the (empty) config to ``path`` as JSON."""
        target = util.ensure_path(path)
        writers = OrderedDict()
        writers["cfg"] = lambda p: srsly.write_json(p, self._get_config())
        return util.to_disk(target, writers, [])

    def from_disk(self, path, **_kwargs):
        """Load the config from ``path``.

        Unlike ``from_bytes`` this returns None, matching the original
        implementation.
        """
        source = util.ensure_path(path)
        readers = OrderedDict()
        readers["cfg"] = lambda p: self._set_config(srsly.read_json(p))
        util.from_disk(source, readers, [])
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"spacy_factories": [
"bunsetu_recognizer = ginza:make_bunsetu_recognizer",
"compound_splitter = ginza:make_compound_splitter",
"disable_sentencizer = ginza:disable_sentencizer",
],
"console_scripts": [
"ginza = ginza.command_line:main_ginza",
Expand All @@ -25,5 +26,5 @@
name="ginza",
packages=find_packages(include=["ginza"]),
url="https://github.com/megagonlabs/ginza",
version='5.0.0',
version='5.0.1',
)

0 comments on commit 8b10169

Please sign in to comment.