diff --git a/README.md b/README.md
index 3797772..b5a05f5 100644
--- a/README.md
+++ b/README.md
@@ -93,7 +93,7 @@ $ pip install -U ginza https://github.com/megagonlabs/ginza/releases/download/la
 If you want to accelerate the transformers-based models with CUDA-enabled GPUs, you can install `spacy` by specifying the CUDA version as follows:
 
 ```console
-pip install -U "spacy[cuda110]"
+pip install -U "spacy[cuda117]"
 ```
 
 You also need to install a version of pytorch that is consistent with the CUDA version.
@@ -108,6 +108,11 @@ Then, install the latest version of `ginza` and `ja_ginza`:
 $ pip install -U ginza ja_ginza
 ```
 
+When using Apple Silicon such as M1 or M2, you can accelerate the analysis process by installing `thinc-apple-ops`:
+```console
+$ pip install torch thinc-apple-ops
+```
+
 ### Execute ginza command
 
 Run the `ginza` command from the console, then input some Japanese text. After pressing the Enter key, you will get the parsed results in [CoNLL-U Syntactic Annotation](https://universaldependencies.org/format.html#syntactic-annotation) format.
@@ -234,6 +239,11 @@ Please read the official documents to compile user dictionaries with `sudachipy`
 ### version 5.x
 
+#### ginza-5.1.3
+- 2023-09-25
+- Migrate to spaCy v3.6
+- Beta release of `ja_ginza_bert_large`
+
 #### ginza-5.1.2
 - 2022-03-12
 - Migrate to spaCy v3.4
diff --git a/config/ja_ginza.meta.json b/config/ja_ginza.meta.json
index 343c229..8266fc0 100644
--- a/config/ja_ginza.meta.json
+++ b/config/ja_ginza.meta.json
@@ -1,7 +1,7 @@
 {
     "lang":"ja",
     "name":"ginza",
-    "version":"5.1.2",
+    "version":"5.1.3",
     "description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019). Assigns word2vec token vectors. Components: tok2vec, parser, ner, morphologizer, atteribute_ruler, compound_splitter, bunsetu_recognizer.",
     "author":"Megagon Labs Tokyo.",
     "email":"ginza@megagon.ai",
@@ -34,7 +34,14 @@
         }
     ],
     "parent_package":"spacy",
-    "spacy_version":">=3.2.0,<3.5.0",
+    "spacy_version":">=3.2.0,<3.7.0",
+    "spacy_git_version":"0fc3dee77",
+    "vectors":{
+        "width":300,
+        "vectors":20000,
+        "keys":480443,
+        "name":"ja_vectors"
+    },
     "pipeline":[
         "tok2vec",
         "parser",
@@ -53,7 +60,7 @@
         "compound_splitter",
         "bunsetu_recognizer"
     ],
-    "disabled": [
+    "disabled":[
         "attribute_ruler"
     ],
     "requirements":[
diff --git a/config/ja_ginza_bert_char_v2_basic.analysis.cfg b/config/ja_ginza_bert_char_v2_basic.analysis.cfg
deleted file mode 100644
index e3733be..0000000
--- a/config/ja_ginza_bert_char_v2_basic.analysis.cfg
+++ /dev/null
@@ -1,201 +0,0 @@
-[paths]
-train = "corpus/ja_ginza-ud-train.ne.rea.random_sents.spacy"
-dev = "corpus/ja_ginza-ud-dev.ne.rea.random_sents.spacy"
-vectors = null
-init_tok2vec = null
-
-[system]
-gpu_allocator = "pytorch"
-seed = 0
-
-[nlp]
-lang = "ja"
-pipeline = ["transformer","parser","attribute_ruler","ner","morphologizer","compound_splitter","bunsetu_recognizer"]
-batch_size = 128
-disabled = ["attribute_ruler"]
-before_creation = null
-after_creation = null
-after_pipeline_creation = null
-
-[nlp.tokenizer]
-@tokenizers = "spacy.ja.JapaneseTokenizer"
-split_mode = "C"
-
-[components]
-
-[components.attribute_ruler]
-factory = "attribute_ruler"
-validate = false
-
-[components.bunsetu_recognizer]
-factory = "bunsetu_recognizer"
-remain_bunsetu_suffix = false
-
-[components.compound_splitter]
-factory = "compound_splitter"
-split_mode = null
-
-[components.morphologizer]
-factory = "morphologizer"
-
-[components.morphologizer.model]
-@architectures = "spacy.Tagger.v1"
-nO = null
-
-[components.morphologizer.model.tok2vec]
-@architectures = "spacy-transformers.TransformerListener.v1"
-grad_factor = 1.0
-pooling = {"@layers":"reduce_mean.v1"}
-upstream = "*"
-
-[components.ner]
-factory = "ner"
-moves = null
-update_with_oracle_cut_size = 100
-
-[components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
-state_type = "ner"
-extra_state_tokens = false
-hidden_width = 64
-maxout_pieces = 2
-use_upper = false
-nO = null
-
-[components.ner.model.tok2vec]
-@architectures = "spacy-transformers.TransformerListener.v1"
-grad_factor = 1.0
-pooling = {"@layers":"reduce_mean.v1"}
-upstream = "*"
-
-[components.parser]
-factory = "parser"
-learn_tokens = false
-min_action_freq = 30
-moves = null
-update_with_oracle_cut_size = 100
-
-[components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
-state_type = "parser"
-extra_state_tokens = false
-hidden_width = 128
-maxout_pieces = 3
-use_upper = false
-nO = null
-
-[components.parser.model.tok2vec]
-@architectures = "spacy-transformers.TransformerListener.v1"
-grad_factor = 1.0
-pooling = {"@layers":"reduce_mean.v1"}
-upstream = "*"
-
-[components.transformer]
-factory = "transformer_custom"
-max_batch_items = 4096
-set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
-
-[components.transformer.model]
-@architectures = "ginza-transformers.TransformerModel.v1"
-name = "megagonlabs/bert-base-japanese-char-v2-ginza"
-
-[components.transformer.model.get_spans]
-@span_getters = "spacy-transformers.strided_spans.v1"
-window = 128
-stride = 96
-
-[components.transformer.model.tokenizer_config]
-use_fast = false
-tokenizer_class = "BertJapaneseTokenizer"
-word_tokenizer_type = basic
-subword_tokenizer_type = character
-
-[corpora]
-
-[corpora.dev]
-@readers = "spacy.Corpus.v1"
-path = ${paths.dev}
-max_length = 0
-gold_preproc = false
-limit = 0
-augmenter = null
-
-[corpora.train]
-@readers = "spacy.Corpus.v1"
-path = ${paths.train}
-max_length = 500
-gold_preproc = false
-limit = 0
-augmenter = null
-
-[training]
-accumulate_gradient = 3
-dev_corpus = "corpora.dev"
-train_corpus = "corpora.train"
-seed = ${system.seed}
-gpu_allocator = ${system.gpu_allocator}
-dropout = 0.1
-patience = 0
-max_epochs = 0
-max_steps = 50000
-eval_frequency = 200
-frozen_components = []
-before_to_disk = null
-annotating_components = []
-
-[training.batcher]
-@batchers = "spacy.batch_by_padded.v1"
-discard_oversize = true
-size = 2000
-buffer = 256
-get_length = null
-
-[training.logger]
-@loggers = "spacy.ConsoleLogger.v1"
-progress_bar = false
-
-[training.optimizer]
-@optimizers = "Adam.v1"
-beta1 = 0.9
-beta2 = 0.999
-L2_is_weight_decay = true
-L2 = 0.01
-grad_clip = 1.0
-use_averages = false
-eps = 0.00000001
-
-[training.optimizer.learn_rate]
-@schedules = "warmup_linear.v1"
-warmup_steps = 250
-total_steps = 50000
-initial_rate = 0.00005
-
-[training.score_weights]
-dep_uas = 0.25
-dep_las = 0.25
-dep_las_per_type = null
-sents_p = null
-sents_r = null
-sents_f = 0.1
-ents_f = 0.25
-ents_p = 0.0
-ents_r = 0.0
-ents_per_type = null
-pos_acc = 0.15
-morph_acc = 0.0
-morph_per_feat = null
-tag_acc = 0.0
-
-[pretraining]
-
-[initialize]
-vectors = null
-init_tok2vec = ${paths.init_tok2vec}
-vocab_data = null
-lookups = null
-before_init = null
-after_init = null
-
-[initialize.components]
-
-[initialize.tokenizer]
\ No newline at end of file
diff --git a/config/ja_ginza_bert_char_v2_basic.cfg b/config/ja_ginza_bert_char_v2_basic.cfg
deleted file mode 100644
index 31cf1ab..0000000
--- a/config/ja_ginza_bert_char_v2_basic.cfg
+++ /dev/null
@@ -1,201 +0,0 @@
-[paths]
-train = "corpus/ja_ginza-ud-train.ne.rea.random_sents.spacy"
-dev = "corpus/ja_ginza-ud-dev.ne.rea.random_sents.spacy"
-vectors = null
-init_tok2vec = null
-
-[system]
-gpu_allocator = "pytorch"
-seed = 0
-
-[nlp]
-lang = "ja"
-pipeline = ["transformer","parser","attribute_ruler","ner","morphologizer","compound_splitter","bunsetu_recognizer"]
-batch_size = 128
-disabled = ["attribute_ruler"]
-before_creation = null
-after_creation = null
-after_pipeline_creation = null
-
-[nlp.tokenizer]
-@tokenizers = "spacy.ja.JapaneseTokenizer"
-split_mode = "C"
-
-[components]
-
-[components.attribute_ruler]
-factory = "attribute_ruler"
-validate = false
-
-[components.bunsetu_recognizer]
-factory = "bunsetu_recognizer"
-remain_bunsetu_suffix = true
-
-[components.compound_splitter]
-factory = "compound_splitter"
-split_mode = null
-
-[components.morphologizer]
-factory = "morphologizer"
-
-[components.morphologizer.model]
-@architectures = "spacy.Tagger.v1"
-nO = null
-
-[components.morphologizer.model.tok2vec]
-@architectures = "spacy-transformers.TransformerListener.v1"
-grad_factor = 1.0
-pooling = {"@layers":"reduce_mean.v1"}
-upstream = "*"
-
-[components.ner]
-factory = "ner"
-moves = null
-update_with_oracle_cut_size = 100
-
-[components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
-state_type = "ner"
-extra_state_tokens = false
-hidden_width = 64
-maxout_pieces = 2
-use_upper = false
-nO = null
-
-[components.ner.model.tok2vec]
-@architectures = "spacy-transformers.TransformerListener.v1"
-grad_factor = 1.0
-pooling = {"@layers":"reduce_mean.v1"}
-upstream = "*"
-
-[components.parser]
-factory = "parser"
-learn_tokens = false
-min_action_freq = 30
-moves = null
-update_with_oracle_cut_size = 100
-
-[components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
-state_type = "parser"
-extra_state_tokens = false
-hidden_width = 128
-maxout_pieces = 3
-use_upper = false
-nO = null
-
-[components.parser.model.tok2vec]
-@architectures = "spacy-transformers.TransformerListener.v1"
-grad_factor = 1.0
-pooling = {"@layers":"reduce_mean.v1"}
-upstream = "*"
-
-[components.transformer]
-factory = "transformer"
-max_batch_items = 4096
-set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
-
-[components.transformer.model]
-@architectures = "ginza-transformers.TransformerModel.v1"
-name = "cl-tohoku/bert-base-japanese-char-v2"
-
-[components.transformer.model.get_spans]
-@span_getters = "spacy-transformers.strided_spans.v1"
-window = 128
-stride = 96
-
-[components.transformer.model.tokenizer_config]
-use_fast = false
-tokenizer_class = "BertJapaneseTokenizer"
-word_tokenizer_type = basic
-subword_tokenizer_type = character
-
-[corpora]
-
-[corpora.dev]
-@readers = "spacy.Corpus.v1"
-path = ${paths.dev}
-max_length = 0
-gold_preproc = false
-limit = 0
-augmenter = null
-
-[corpora.train]
-@readers = "spacy.Corpus.v1"
-path = ${paths.train}
-max_length = 500
-gold_preproc = false
-limit = 0
-augmenter = null
-
-[training]
-accumulate_gradient = 3
-dev_corpus = "corpora.dev"
-train_corpus = "corpora.train"
-seed = ${system.seed}
-gpu_allocator = ${system.gpu_allocator}
-dropout = 0.1
-patience = 0
-max_epochs = 0
-max_steps = 50000
-eval_frequency = 200
-frozen_components = []
-before_to_disk = null
-annotating_components = []
-
-[training.batcher]
-@batchers = "spacy.batch_by_padded.v1"
-discard_oversize = true
-size = 2000
-buffer = 256
-get_length = null
-
-[training.logger]
-@loggers = "spacy.ConsoleLogger.v1"
-progress_bar = false
-
-[training.optimizer]
-@optimizers = "Adam.v1"
-beta1 = 0.9
-beta2 = 0.999
-L2_is_weight_decay = true
-L2 = 0.01
-grad_clip = 1.0
-use_averages = false
-eps = 0.00000001
-
-[training.optimizer.learn_rate]
-@schedules = "warmup_linear.v1"
-warmup_steps = 250
-total_steps = 50000
-initial_rate = 0.00005
-
-[training.score_weights]
-dep_uas = 0.25
-dep_las = 0.25
-dep_las_per_type = null
-sents_p = null
-sents_r = null
-sents_f = 0.1
-ents_f = 0.25
-ents_p = 0.0
-ents_r = 0.0
-ents_per_type = null
-pos_acc = 0.15
-morph_acc = 0.0
-morph_per_feat = null
-tag_acc = 0.0
-
-[pretraining]
-
-[initialize]
-vectors = null
-init_tok2vec = ${paths.init_tok2vec}
-vocab_data = null
-lookups = null
-before_init = null
-after_init = null
-
-[initialize.components]
-
-[initialize.tokenizer]
\ No newline at end of file
diff --git a/config/ja_ginza_bert_char_v2_basic.meta.json b/config/ja_ginza_bert_char_v2_basic.meta.json
deleted file mode 100644
index 48bc1ec..0000000
--- a/config/ja_ginza_bert_char_v2_basic.meta.json
+++ /dev/null
@@ -1,65 +0,0 @@
-{
-    "lang":"ja",
-    "name":"ginza_bert_v2",
-    "version":"5.0.0b1",
-    "description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019) + cl-tohoku/bert-base-japanese-char-v2(BasicTokenizer). Components: transformer, parser, atteribute_ruler, ner, morphologizer, compound_splitter, bunsetu_recognizer.",
-    "author":"Megagon Labs Tokyo.",
-    "email":"ginza@megagon.ai",
-    "url":"https://github.com/megagonlabs/ginza",
-    "license":"MIT License",
-    "sources":[
-        {
-            "name":"UD_Japanese-BCCWJ r2.8",
-            "url":"https://github.com/UniversalDependencies/UD_Japanese-BCCWJ",
-            "license":"CC BY-NC-SA 4.0",
-            "author":"Asahara, M., Kanayama, H., Tanaka, T., Miyao, Y., Uematsu, S., Mori, S., Matsumoto, Y., Omura, M., & Murawaki, Y."
-        },
-        {
-            "name":"GSK2014-A(2019)",
-            "url":"https://www.gsk.or.jp/catalog/gsk2014-a/",
-            "license":"Individually defined commercial license",
-            "author":"Tokyo Institute of Technology"
-        },
-        {
-            "name":"SudachiDict_core",
-            "url":"https://github.com/WorksApplications/SudachiDict",
-            "license":"Apache License 2.0",
-            "author":"Works Applications Enterprise Co., Ltd."
-        },
-        {
-            "name":"cl-tohoku/bert-base-japanese-char-v2",
-            "url":"https://huggingface.co/cl-tohoku/bert-base-japanese-char-v2",
-            "license":"CC BY-SA 3.0",
-            "author":"Masatoshi Suzuki (Tohoku University)"
-        }
-    ],
-    "parent_package":"spacy",
-    "spacy_version":">=3.0.6,<3.2.0",
-    "pipeline":[
-        "transformer",
-        "parser",
-        "attribute_ruler",
-        "ner",
-        "morphologizer",
-        "compound_splitter",
-        "bunsetu_recognizer"
-    ],
-    "components":[
-        "transformer",
-        "parser",
-        "attribute_ruler",
-        "ner",
-        "morphologizer",
-        "compound_splitter",
-        "bunsetu_recognizer"
-    ],
-    "disabled": [
-        "attribute_ruler"
-    ],
-    "requirements":[
-        "sudachipy>=0.5.2,<0.6.0",
-        "sudachidict_core>=20210608",
-        "ginza-transformers>=0.3.1,<1.0.0",
-        "ginza>=5.0.0,<5.1.0"
-    ]
-}
diff --git a/config/ja_ginza_bert_v2.cfg b/config/ja_ginza_bert_large.cfg
similarity index 87%
rename from config/ja_ginza_bert_v2.cfg
rename to config/ja_ginza_bert_large.cfg
index 1619266..65035b8 100644
--- a/config/ja_ginza_bert_v2.cfg
+++ b/config/ja_ginza_bert_large.cfg
@@ -37,10 +37,14 @@ split_mode = null
 
 [components.morphologizer]
 factory = "morphologizer"
+extend = true
+overwrite = true
+scorer = {"@scorers":"spacy.morphologizer_scorer.v1"}
 
 [components.morphologizer.model]
-@architectures = "spacy.Tagger.v1"
+@architectures = "spacy.Tagger.v2"
 nO = null
+normalize = false
 
 [components.morphologizer.model.tok2vec]
 @architectures = "spacy-transformers.TransformerListener.v1"
@@ -50,7 +54,9 @@ upstream = "*"
 
 [components.ner]
 factory = "ner"
+incorrect_spans_key = null
 moves = null
+scorer = {"@scorers":"spacy.ner_scorer.v1"}
 update_with_oracle_cut_size = 100
 
 [components.ner.model]
@@ -73,6 +79,7 @@ factory = "parser"
 learn_tokens = false
 min_action_freq = 30
 moves = null
+scorer = {"@scorers":"spacy.parser_scorer.v1"}
 update_with_oracle_cut_size = 100
 
 [components.parser.model]
@@ -96,14 +103,17 @@ max_batch_items = 4096
 set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
 
 [components.transformer.model]
-@architectures = "ginza-transformers.TransformerModel.v1"
-name = "cl-tohoku/bert-base-japanese-v2"
+@architectures = "spacy-transformers.TransformerModel.v3"
+name = "cl-tohoku/bert-large-japanese-v2"
+mixed_precision = false
 
 [components.transformer.model.get_spans]
 @span_getters = "spacy-transformers.strided_spans.v1"
 window = 128
 stride = 96
 
+[components.transformer.model.grad_scaler_config]
+
 [components.transformer.model.tokenizer_config]
 use_fast = false
 tokenizer_class = "BertJapaneseTokenizer"
@@ -112,6 +122,8 @@ word_tokenizer_type = mecab
 subword_tokenizer_type = wordpiece
 mecab_kwargs = {"mecab_dic":"unidic_lite"}
 
+[components.transformer.model.transformer_config]
+
 [corpora]
 
 [corpora.dev]
@@ -125,7 +137,7 @@ augmenter = null
 
 [corpora.train]
 @readers = "spacy.Corpus.v1"
 path = ${paths.train}
-max_length = 500
+max_length = 0
 gold_preproc = false
 limit = 0
 augmenter = null
@@ -139,11 +151,12 @@ gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 patience = 0
 max_epochs = 0
-max_steps = 50000
+max_steps = 20000
 eval_frequency = 200
 frozen_components = []
-before_to_disk = null
 annotating_components = []
+before_to_disk = null
+before_update = null
 
 [training.batcher]
 @batchers = "spacy.batch_by_padded.v1"
@@ -169,10 +182,13 @@ eps = 0.00000001
 
 [training.optimizer.learn_rate]
 @schedules = "warmup_linear.v1"
 warmup_steps = 250
-total_steps = 50000
+total_steps = 20000
 initial_rate = 0.00005
 
 [training.score_weights]
+pos_acc = 0.15
+morph_micro_f = 0.0
+morph_per_feat = null
 dep_uas = 0.25
 dep_las = 0.25
 dep_las_per_type = null
@@ -183,9 +199,6 @@ ents_f = 0.25
 ents_p = 0.0
 ents_r = 0.0
 ents_per_type = null
-pos_acc = 0.15
-morph_acc = 0.0
-morph_per_feat = null
 tag_acc = 0.0
 
 [pretraining]
diff --git a/config/ja_ginza_bert_v2.meta.json b/config/ja_ginza_bert_large.meta.json
similarity index 62%
rename from config/ja_ginza_bert_v2.meta.json
rename to config/ja_ginza_bert_large.meta.json
index e830c45..4777627 100644
--- a/config/ja_ginza_bert_v2.meta.json
+++ b/config/ja_ginza_bert_large.meta.json
@@ -1,8 +1,8 @@
 {
     "lang":"ja",
-    "name":"ginza_bert_v2",
-    "version":"5.0.0b1",
-    "description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019) + cl-tohoku/bert-base-japanese-v2. Components: transformer, parser, atteribute_ruler, ner, morphologizer, compound_splitter, bunsetu_recognizer.",
+    "name":"ginza_bert_large",
+    "version":"5.1.3b1",
+    "description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019) + cl-tohoku/bert-large-japanese-v2. Components: transformer, parser, atteribute_ruler, ner, morphologizer, compound_splitter, bunsetu_recognizer.",
     "author":"Megagon Labs Tokyo.",
     "email":"ginza@megagon.ai",
     "url":"https://github.com/megagonlabs/ginza",
@@ -27,18 +27,17 @@
             "author":"Works Applications Enterprise Co., Ltd."
         },
         {
-            "name":"cl-tohoku/bert-base-japanese-v2",
-            "url":"https://huggingface.co/cl-tohoku/bert-base-japanese-v2",
-            "license":"CC BY-SA 3.0",
-            "author":"Masatoshi Suzuki (Tohoku University)"
+            "name":"cl-tohoku/bert-large-japanese-v2",
+            "url":"https://huggingface.co/cl-tohoku/bert-large-japanese-v2",
+            "license":"Apache License 2.0",
+            "author":"Tohoku University"
         }
     ],
-    "parent_package":"spacy",
-    "spacy_version":">=3.0.6,<3.2.0",
+    "spacy_version":">=3.6.1,<3.7.0",
+    "spacy_git_version":"458bc5f45",
     "pipeline":[
        "transformer",
        "parser",
-       "attribute_ruler",
        "ner",
        "morphologizer",
        "compound_splitter",
@@ -53,15 +52,23 @@
        "compound_splitter",
        "bunsetu_recognizer"
     ],
-    "disabled": [
+    "disabled":[
        "attribute_ruler"
     ],
+    "vectors":{
+        "width":0,
+        "vectors":0,
+        "keys":0,
+        "name":null,
+        "mode":"default"
+    },
     "requirements":[
-        "sudachipy>=0.5.2,<0.6.0",
-        "sudachidict_core>=20210608",
-        "fugashi>=1.1.1",
+        "sudachipy>=0.6.7,<0.7.0",
+        "sudachidict_core>=20230711",
+        "spacy>=3.6.1,<3.7.0",
+        "spacy-transformers>=1.2.5,<1.3.0",
+        "fugashi>=1.3.0",
         "unidic-lite>=1.0.8",
-        "ginza-transformers>=0.3.1,<1.0.0",
-        "ginza>=5.0.0,<5.1.0"
+        "ginza>=5.1.3,<5.2.0"
     ]
-}
+}
\ No newline at end of file
diff --git a/config/ja_ginza_bert_v2.analysis.cfg b/config/ja_ginza_bert_large_analysis.cfg
similarity index 86%
rename from config/ja_ginza_bert_v2.analysis.cfg
rename to config/ja_ginza_bert_large_analysis.cfg
index fe00716..8ad31bb 100644
--- a/config/ja_ginza_bert_v2.analysis.cfg
+++ b/config/ja_ginza_bert_large_analysis.cfg
@@ -37,10 +37,14 @@ split_mode = null
 
 [components.morphologizer]
 factory = "morphologizer"
+extend = true
+overwrite = true
+scorer = {"@scorers":"spacy.morphologizer_scorer.v1"}
 
 [components.morphologizer.model]
-@architectures = "spacy.Tagger.v1"
+@architectures = "spacy.Tagger.v2"
 nO = null
+normalize = false
 
 [components.morphologizer.model.tok2vec]
 @architectures = "spacy-transformers.TransformerListener.v1"
@@ -50,7 +54,9 @@ upstream = "*"
 
 [components.ner]
 factory = "ner"
+incorrect_spans_key = null
 moves = null
+scorer = {"@scorers":"spacy.ner_scorer.v1"}
 update_with_oracle_cut_size = 100
 
 [components.ner.model]
@@ -73,6 +79,7 @@ factory = "parser"
 learn_tokens = false
 min_action_freq = 30
 moves = null
+scorer = {"@scorers":"spacy.parser_scorer.v1"}
 update_with_oracle_cut_size = 100
 
 [components.parser.model]
@@ -91,19 +98,22 @@ pooling = {"@layers":"reduce_mean.v1"}
 upstream = "*"
 
 [components.transformer]
-factory = "transformer_custom"
+factory = "transformer"
 max_batch_items = 4096
 set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
 
 [components.transformer.model]
-@architectures = "ginza-transformers.TransformerModel.v1"
-name = "megagonlabs/bert-base-japanese-v2-ginza"
+@architectures = "spacy-transformers.TransformerModel.v3"
+name = "cl-tohoku/bert-large-japanese-v2"
+mixed_precision = false
 
 [components.transformer.model.get_spans]
 @span_getters = "spacy-transformers.strided_spans.v1"
 window = 128
 stride = 96
 
+[components.transformer.model.grad_scaler_config]
+
 [components.transformer.model.tokenizer_config]
 use_fast = false
 tokenizer_class = "BertJapaneseTokenizer"
@@ -112,6 +122,8 @@ word_tokenizer_type = mecab
 subword_tokenizer_type = wordpiece
 mecab_kwargs = {"mecab_dic":"unidic_lite"}
 
+[components.transformer.model.transformer_config]
+
 [corpora]
 
 [corpora.dev]
@@ -125,7 +137,7 @@ augmenter = null
 
 [corpora.train]
 @readers = "spacy.Corpus.v1"
 path = ${paths.train}
-max_length = 500
+max_length = 0
 gold_preproc = false
 limit = 0
 augmenter = null
@@ -139,11 +151,12 @@ gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 patience = 0
 max_epochs = 0
-max_steps = 50000
+max_steps = 20000
 eval_frequency = 200
 frozen_components = []
-before_to_disk = null
 annotating_components = []
+before_to_disk = null
+before_update = null
 
 [training.batcher]
 @batchers = "spacy.batch_by_padded.v1"
@@ -169,10 +182,13 @@ eps = 0.00000001
 
 [training.optimizer.learn_rate]
 @schedules = "warmup_linear.v1"
 warmup_steps = 250
-total_steps = 50000
+total_steps = 20000
 initial_rate = 0.00005
 
 [training.score_weights]
+pos_acc = 0.15
+morph_micro_f = 0.0
+morph_per_feat = null
 dep_uas = 0.25
 dep_las = 0.25
 dep_las_per_type = null
@@ -183,9 +199,6 @@ ents_f = 0.25
 ents_p = 0.0
 ents_r = 0.0
 ents_per_type = null
-pos_acc = 0.15
-morph_acc = 0.0
-morph_per_feat = null
 tag_acc = 0.0
 
 [pretraining]
diff --git a/config/ja_ginza_electra.meta.json b/config/ja_ginza_electra.meta.json
index 50390f2..c800715 100644
--- a/config/ja_ginza_electra.meta.json
+++ b/config/ja_ginza_electra.meta.json
@@ -1,7 +1,7 @@
 {
     "lang":"ja",
     "name":"ginza_electra",
-    "version":"5.1.2",
+    "version":"5.1.3",
     "description":"Japanese multi-task CNN trained on UD-Japanese BCCWJ r2.8 + GSK2014-A(2019) + transformers-ud-japanese-electra--base. Components: transformer, parser, atteribute_ruler, ner, morphologizer, compound_splitter, bunsetu_recognizer.",
     "author":"Megagon Labs Tokyo.",
     "email":"ginza@megagon.ai",
@@ -34,14 +34,15 @@
         "author":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, & Peter J. Liu"
         },
         {
-            "name":"megagonlabs/transformers-ud-japanese-electra-base-ginza",
-            "url":"https://huggingface.co/megagonlabs/transformers-ud-japanese-electra-base-ginza",
+            "name":"megagonlabs/transformers-ud-japanese-electra-base-ginza-5.1.0",
+            "url":"https://huggingface.co/megagonlabs/transformers-ud-japanese-electra-base-ginza-5.1.0",
             "license":"MIT Licence",
             "author":"Hiroshi Matsuda (Megagon Labs Tokyo, Recruit Co., Ltd.)"
         }
     ],
     "parent_package":"spacy",
-    "spacy_version":">=3.2.0,<3.5.0",
+    "spacy_version":">=3.2.0,<3.7.0",
+    "spacy_git_version":"0fc3dee77",
     "pipeline":[
        "transformer",
        "parser",
@@ -63,11 +64,18 @@
     "disabled": [
        "attribute_ruler"
     ],
-    "requirements":[
+    "vectors":{
+        "width":0,
+        "vectors":0,
+        "keys":0,
+        "name":null
+    },
+    "requirements":[
        "sudachipy>=0.6.2,<0.7.0",
        "sudachidict_core>=20210802",
        "sudachitra>=0.1.6,<0.2.0",
        "ginza-transformers>=0.4.0,<0.5.0",
-       "ginza>=5.1.0,<5.2.0"
+       "ginza>=5.1.0,<5.2.0",
+       "spacy-transformers>=1.1.2,<1.2.0"
     ]
 }
diff --git a/docs/index.md b/docs/index.md
index 3a59c5d..524c77d 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -8,7 +8,14 @@ ## What's new!
 
-- Released `GiNZA v5.1`, compatible with `spaCy v3.2` and `Sudachi.rs (SudachiPy v0.6.2)`
+- A beta version of `ja_ginza_bert_large` is now available
+  - Uses [cl-tohoku/bert-large-japanese-v2](https://huggingface.co/cl-tohoku/bert-large-japanese-v2) as the base model
+  - Greatly improved accuracy (LAS=0.938, UAS=0.949, UPOS=0.983, ENE=0.708)
+  - A CUDA-capable GPU with 8GB or more RAM, or an Apple Silicon environment such as M1 or M2, is recommended
+- `GiNZA v5.1.3`
+  - Compatible with `spaCy v3.2` through `v3.6`
+- `GiNZA v5.1`
+  - Compatible with `spaCy v3.2` and `Sudachi.rs (SudachiPy v0.6.2)`
 - Batch analysis is 50-60% faster on GPU and 10-40% faster on CPU
 - Improved processing efficiency of the ginza command's parallel execution options (`ginza -p {n_process}` and `ginzame`)
 - The ginza command now works with all spaCy models, including non-Japanese ones
@@ -47,9 +54,10 @@ The analysis accuracy of GiNZA v5 has improved dramatically over earlier versions.
 
 | Model | LAS | UAS | UPOS | ENE |
 | --- | --- | --- | --- | --- |
-| ja_ginza_electra | 92.3 | 93.7 | 98.1 | 61.3 |
-| ja_ginza (v5) | 89.2 | 91.1 | 97.0 | 53.9 |
-| ja_ginza (v4 equivalent) | 89.0 | 91.0 | 95.1 | 53.1 |
+| *ja_ginza_bert_large (beta)* | *93.8* | *94.9* | *98.3* | *70.8* |
+| ja_ginza_electra | 92.3 | 93.7 | 98.1 | 61.3 |
+| ja_ginza (v5) | 89.2 | 91.1 | 97.0 | 53.9 |
+| ja_ginza (v4 equivalent) | 89.0 | 91.0 | 95.1 | 53.1 |
 
 Compared with `ja_ginza`, `ja_ginza_electra` reduced the dependency labeling and attachment errors measured at 50,000 training steps by more than 25%.
@@ -72,7 +80,7 @@ Set up a Python runtime environment before installing GiNZA.
 If an older version of GiNZA is installed, uninstall it with the following command:
 
 ```console
-$ pip uninstall ginza ja_ginza_electra
+$ pip uninstall ginza
 ```
 
 If an older version of `ja_ginza` is installed, uninstall it with the following command:
@@ -97,14 +105,13 @@ $ pip install -U ginza ja_ginza_electra
 $ pip install -U ginza https://github.com/megagonlabs/ginza/releases/download/latest/ja_ginza_electra-latest-with-model.tar.gz
 ```
 
-To run the transformers model fast on a GPU, overwrite-install spacy by specifying the CUDA version of your environment.
+To run the transformers model fast on a GPU, overwrite-install spacy with the options that match your environment.
 
-When using CUDA 11.0:
+When using CUDA 11.7:
 ```console
-pip install -U "spacy[cuda110]"
+$ pip install -U spacy[cuda117]
 ```
-
-You also need to install a version of pytorch that is consistent with the CUDA version.
+Note: You also need to install a version of pytorch that is consistent with the CUDA version.
 
 #### 2. GiNZA + conventional model
@@ -118,6 +125,11 @@ $ pip uninstall ginza ja_ginza
 $ pip install -U ginza ja_ginza
 ```
 
+On Apple Silicon environments with MPS support, such as M1 and M2, installing `thinc-apple-ops` improves analysis speed:
+```console
+$ pip install torch thinc-apple-ops
+```
+
 ### Running analysis with the ginza command
 
 Run the `ginza` command and enter a Japanese sentence followed by Enter; the analysis results are output in [CoNLL-U Syntactic Annotation](https://universaldependencies.org/format.html#syntactic-annotation) format.
@@ -261,6 +273,11 @@ Contains information from mC4 which is made available under the ODC Attribution
 ### version 5.x
 
+#### ginza-5.1.3
+- 2023-09-25
+- Migrate to spaCy v3.6
+- Beta release of `ja_ginza_bert_large`
+
 #### ginza-5.1.2
 - 2022-03-12
 - Migrate to spaCy v3.4
diff --git a/requirements.txt b/requirements.txt
index 2f6952d..9e8a3d9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-spacy>=3.2.0,<3.5.0
+spacy>=3.2.0,<3.7.0
 plac>=1.3.3
 SudachiPy>=0.6.2,<0.7.0
 SudachiDict-core>=20210802
diff --git a/setup.py b/setup.py
index 14a5a3a..d8caa3d 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@
     },
     python_requires=">=3.6",
     install_requires=[
-        "spacy>=3.2.0,<3.5.0",
+        "spacy>=3.2.0,<3.7.0",
         "plac>=1.3.3",
         "SudachiPy>=0.6.2,<0.7.0",
         "SudachiDict-core>=20210802",
@@ -29,5 +29,5 @@
     name="ginza",
     packages=find_packages(include=["ginza"]),
     url="https://github.com/megagonlabs/ginza",
-    version='5.1.2',
+    version='5.1.3',
 )
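
For anyone trying out the packages updated by this patch, here is a minimal usage sketch (not part of the diff). It assumes `ginza` and `ja_ginza` were installed as described in the README changes above; swap in `ja_ginza_electra` or the `ja_ginza_bert_large` beta if those are installed instead. The `spacy.prefer_gpu()` call is optional and only takes effect when `spacy[cuda117]` and a CUDA-consistent `torch` are present.

```python
# Minimal usage sketch (assumption: `pip install -U ginza ja_ginza` has been run).
import spacy

# Optional: use the GPU when a CUDA-enabled spacy and matching torch are installed; no-op on CPU-only machines.
spacy.prefer_gpu()

# Swap in "ja_ginza_electra" or "ja_ginza_bert_large" (beta) if installed.
nlp = spacy.load("ja_ginza")

doc = nlp("銀座でランチをご一緒しましょう。")
for sent in doc.sents:
    for token in sent:
        # Roughly the CoNLL-U columns: ID, FORM, LEMMA, UPOS, HEAD, DEPREL
        print(token.i, token.orth_, token.lemma_, token.pos_, token.head.i, token.dep_)
```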