Skip to content

Commit

Permalink
Merge pull request #181 from megagonlabs/develop
Browse files Browse the repository at this point in the history
Release v5.0.1
  • Loading branch information
hiroshi-matsuda-rit authored Aug 26, 2021
2 parents 79f27f8 + 3910b5e commit 8b10169
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 18 deletions.
12 changes: 11 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ An Open Source Japanese NLP Library, based on Universal Dependencies

***Please read the [Important changes](#ginza-500) before you upgrade GiNZA.***

[日本語ページはこちら](https://megagonlabs.github.io/ginza/)

## License
GiNZA NLP Library and GiNZA Japanese Universal Dependencies Models are distributed under the
[MIT License](https://github.com/megagonlabs/ginza/blob/master/LICENSE).
Expand Down Expand Up @@ -219,6 +221,14 @@ Please read the official documents to compile user dictionaries with `sudachipy`

### version 5.x

#### ginza-5.0.1
- 2021-08-26
- Bug fix
- `ginzame not woriking in ginza ver. 5` #179
- `Command Line -d option not working in v5.0.0` #178
- Improvement
- accept `ja-ginza` and `ja-ginza-electra` for `-m` option of `ginza` command

#### ginza-5.0.0
- 2021-08-26, Demantoid
- Important changes
Expand Down Expand Up @@ -387,7 +397,7 @@ with open('sample2.pickle', 'wb') as f:
- upgrade `sudachipy` to v0.4.2

### version 2.x
### version 2.x

#### ginza-2.2.1
- 2019-10-28
- Improvements
Expand Down
12 changes: 10 additions & 2 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,15 @@ GiNZA v5の解析精度は以前のバージョンから飛躍的な向上を遂
| --- | --- | --- | --- | --- |
| ja_ginza_electra | 92.3 | 93.7 | 98.1 | 61.3 |
| ja_ginza (v5) | 89.2 | 91.1 | 97.0 | 53.9 |
| ja_ginza (v4相当) | 89.0 | 91.0 | 95.1 | 53.3 |
| ja_ginza (v4相当) | 89.0 | 91.0 | 95.1 | 53.1 |

`ja_ginza_electra`は`ja_ginza`に対して、5万ステップ学習時の依存関係ラベリング・単語依存構造解析の誤りを、以前のバージョンと比較して25%以上低減できました。

また従来型モデルの`ja_ginza`においても、処理パイプラインに`morphologizer`を追加することにより、以前のバージョンと比較してUD品詞推定の誤りを4割低減できました。

[関根の拡張固有表現階層](http://liat-aip.sakura.ne.jp/ene/ene8/definition_jp/html/enedetail.html)を用いた拡張固有表現抽出精度(ENE)においても`ja_ginza_electra`は大幅な精度向上が得られています。GiNZAは関根の拡張固有表現階層にもとづく固有表現抽出結果を、spaCyで標準的に用いられる[OntoNotes5](https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf)にマッピング(変換表を適用)して出力しています。OntoNotes5は関根の拡張固有表現階層よりカテゴリ数が非常に少ない(粗い)ため、拡張固有表現をOntoNotes5体系にマッピングした場合の固有表現抽出精度は、拡張固有表現での数値より一般に高くなります。

※各モデルの学習と解析精度評価にはUD_Japanese-BCCWJ r2.8から新聞記事系のテキストを除外したものをSudachi辞書mode C(長単位))で再解析(retokenize)した上で、文節主辞情報を依存関係ラベルに組み合わせた状態のコーパスを用いています。
※各モデルの学習と解析精度評価にはUD_Japanese-BCCWJ r2.8から新聞記事系のテキストを除外したものをSudachi辞書mode C(長単位)で再解析(retokenize)した上で、文節主辞情報を依存関係ラベルに組み合わせた状態のコーパスを用いています。

## 実行環境

Expand Down Expand Up @@ -225,6 +225,14 @@ Contains information from mC4 which is made available under the ODC Attribution

### version 5.x

#### ginza-5.0.1
- 2021-08-26
- Bug fix
- `ginzame not woriking in ginza ver. 5` #179
- `Command Line -d option not working in v5.0.0` #178
- Improvement
- accept `ja-ginza` and `ja-ginza-electra` for `-m` option of `ginza` command

#### ginza-5.0.0
- 2021-08-26, Demantoid
- 重要な変更
Expand Down
18 changes: 17 additions & 1 deletion ginza/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@

from .bunsetu_recognizer import *
from .compound_splitter import *
from .disable_sentencizer import *
from .ene_ontonotes_mapper import ENE_ONTONOTES_MAPPING


__all__ = [
"make_compound_splitter", "make_bunsetu_recognizer",
"make_compound_splitter", "make_bunsetu_recognizer", "make_disable_sentencizer",
"force_using_normalized_form_as_lemma", "set_split_mode",
"token_i", "text", "text_with_ws", "orth", "orth_",
"ent_type", "ent_type_", "ent_iob", "ent_iob_",
Expand Down Expand Up @@ -79,6 +80,21 @@ def make_bunsetu_recognizer(
remain_bunsetu_suffix,
)

@Language.factory(
    "disable_sentencizer",
    requires=[],
    assigns=[],
    retokenizes=False,
    default_config={},
)
def make_disable_sentencizer(
    nlp: Language,
    name: str,
):
    """Factory for the ``disable_sentencizer`` pipeline component.

    Registered under the name ``"disable_sentencizer"`` so it can be added
    with ``nlp.add_pipe("disable_sentencizer")``.  Note that the component
    is handed the pipeline's vocab, not the ``Language`` object itself.
    """
    component = DisableSentencizer(
        nlp.vocab,
    )
    return component


_morpheme_dictionary_form = None

Expand Down
20 changes: 7 additions & 13 deletions ginza/command_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
import spacy
from spacy.tokens import Span

from spacy.lang.ja import JapaneseDefaults
from spacy.lang.ja import Japanese, JapaneseTokenizer

from . import set_split_mode, inflection, reading_form, ent_label_ene, ent_label_ontonotes,\
bunsetu_bi_label, bunsetu_position_type, force_using_normalized_form_as_lemma
bunsetu_bi_label, bunsetu_position_type, force_using_normalized_form_as_lemma, make_disable_sentencizer
from .bunsetu_recognizer import bunsetu_available, bunsetu_head_list, bunsetu_phrase_span

MINI_BATCH_SIZE = 100
Expand Down Expand Up @@ -178,31 +178,25 @@ def set_nlp(self):
spacy.require_gpu()

if self.output_format in ["2", "mecab"]:
nlp = JapaneseDefaults.create_tokenizer(config={
"split_mode": self.split_mode
}).tokenizer
nlp = JapaneseTokenizer(nlp=Japanese(), split_mode=self.split_mode).tokenizer
else:
# Work-around for pickle error. Need to share model data.
if self.model_path:
nlp = spacy.load(self.model_path)
elif self.ensure_model:
nlp = spacy.load(self.ensure_model)
nlp = spacy.load(self.ensure_model.replace("-", "_"))
else:
try:
nlp = spacy.load("ja_ginza_electra")
except IOError as e:
try:
nlp = spacy.load("ja_ginza")
except IOError as e:
print('Could not find the model. You need to install "ja_ginza_electra" or "ja_ginza" by executing pip like `pip install ja_ginza_electra`.', file=sys.stderr)
print('Could not find the model. You need to install "ja-ginza-electra" or "ja-ginza" by executing pip like `pip install ja-ginza-electra`.', file=sys.stderr)
raise e

if self.disable_sentencizer:
def disable_sentencizer(doc):
for t in doc:
t.is_sent_start = False
return doc
nlp.add_pipe(disable_sentencizer, before="parser")
nlp.add_pipe("disable_sentencizer", before="parser")

if self.split_mode:
set_split_mode(nlp, self.split_mode)
Expand Down Expand Up @@ -446,7 +440,7 @@ def main_ginzame():

@plac.annotations(
model_path=("model directory path", "option", "b", str),
ensure_model=("select model either ja_ginza or ja_ginza_electra", "option", "m", str, ["ja_ginza", "ja_ginza_electra", None]),
ensure_model=("select model either ja_ginza or ja_ginza_electra", "option", "m", str, ["ja_ginza", "ja-ginza", "ja_ginza_electra", "ja-ginza-electra", None]),
split_mode=("split mode", "option", "s", str, ["A", "B", "C", None]),
hash_comment=("hash comment", "option", "c", str, ["print", "skip", "analyze"]),
output_path=("output path", "option", "o", Path),
Expand Down
63 changes: 63 additions & 0 deletions ginza/disable_sentencizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# encoding: utf8
from collections import OrderedDict

import srsly

from spacy import util


__all__ = [
"DisableSentencizer",
]



class DisableSentencizer:
    """Pipeline component that suppresses sentence segmentation.

    Clears the sentence-start flag on every token so downstream components
    treat the whole document as a single sentence.  NOTE(review): the
    factory in ``ginza/__init__.py`` passes ``nlp.vocab`` as the single
    constructor argument, so ``self.nlp`` actually holds a vocab; it is
    stored but never read.
    """

    def __init__(self, nlp):
        # Kept only to satisfy the component interface; unused otherwise.
        self.nlp = nlp

    def __call__(self, doc):
        """Mark every token as *not* starting a sentence and return the doc."""
        for token in doc:
            token.is_sent_start = False
        return doc

    def _get_config(self):
        # No tunable settings yet; the serialized config is an empty mapping.
        return {}

    def _set_config(self, config=None):
        # Nothing to restore — present only for serialization symmetry.
        pass

    def to_bytes(self, **_kwargs):
        """Serialize the (empty) config to bytes via spaCy's helper."""
        writers = OrderedDict()
        writers["cfg"] = lambda: srsly.json_dumps(self._get_config())
        return util.to_bytes(writers, [])

    def from_bytes(self, data, **_kwargs):
        """Restore state from a blob produced by ``to_bytes``; returns self."""
        readers = OrderedDict()
        readers["cfg"] = lambda b: self._set_config(srsly.json_loads(b))
        util.from_bytes(data, readers, [])
        return self

    def to_disk(self, path, **_kwargs):
        """Write the (empty) config to ``path`` as JSON."""
        target = util.ensure_path(path)
        writers = OrderedDict()
        writers["cfg"] = lambda p: srsly.write_json(p, self._get_config())
        return util.to_disk(target, writers, [])

    def from_disk(self, path, **_kwargs):
        """Load the config from ``path``.

        Unlike ``from_bytes`` this returns None, matching the original
        implementation.
        """
        source = util.ensure_path(path)
        readers = OrderedDict()
        readers["cfg"] = lambda p: self._set_config(srsly.read_json(p))
        util.from_disk(source, readers, [])
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"spacy_factories": [
"bunsetu_recognizer = ginza:make_bunsetu_recognizer",
"compound_splitter = ginza:make_compound_splitter",
"disable_sentencizer = ginza:disable_sentencizer",
],
"console_scripts": [
"ginza = ginza.command_line:main_ginza",
Expand All @@ -25,5 +26,5 @@
name="ginza",
packages=find_packages(include=["ginza"]),
url="https://github.com/megagonlabs/ginza",
version='5.0.0',
version='5.0.1',
)

0 comments on commit 8b10169

Please sign in to comment.