From 019b033bc2f5a2ab5d70cc96ebbe883db45db30e Mon Sep 17 00:00:00 2001
From: Mark Neumann
Date: Wed, 3 Feb 2021 12:41:55 -0800
Subject: [PATCH 01/15] add configs for a roberta model

---
 configs/base_ner_roberta.cfg           | 133 +++++++++++++++++++
 configs/base_parser_tagger_roberta.cfg | 175 +++++++++++++++++++++++++
 data/meta_roberta.json                 |  10 ++
 project.yml                            |  90 ++++++++++++-
 4 files changed, 406 insertions(+), 2 deletions(-)
 create mode 100644 configs/base_ner_roberta.cfg
 create mode 100644 configs/base_parser_tagger_roberta.cfg
 create mode 100644 data/meta_roberta.json

diff --git a/configs/base_ner_roberta.cfg b/configs/base_ner_roberta.cfg
new file mode 100644
index 00000000..4cc576a2
--- /dev/null
+++ b/configs/base_ner_roberta.cfg
@@ -0,0 +1,133 @@
+[paths]
+vectors = null
+init_tok2vec = null
+parser_tagger_path = null
+vocab_path = null
+
+[system]
+gpu_allocator = null
+seed = 0
+
+[nlp]
+lang = "en"
+pipeline = ["transformer","tagger","attribute_ruler","lemmatizer","parser","ner"]
+batch_size = 128
+tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+disabled = []
+before_creation = null
+after_creation = null
+after_pipeline_creation = null
+
+[components]
+
+[components.ner]
+factory = "ner"
+moves = null
+update_with_oracle_cut_size = 100
+
+[components.ner.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+state_type = "ner"
+extra_state_tokens = false
+hidden_width = 64
+maxout_pieces = 2
+use_upper = true
+nO = null
+
+[components.ner.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+pooling = {"@layers":"reduce_mean.v1"}
+upstream = "*"
+
+[components.transformer]
+source = ${paths.parser_tagger_path}
+
+[components.parser]
+source = ${paths.parser_tagger_path}
+
+[components.tagger]
+source = ${paths.parser_tagger_path}
+
+[components.tok2vec]
+source = ${paths.parser_tagger_path}
+
+[corpora]
+
+[corpora.dev]
+@readers = "med_mentions_reader"
+directory_path = "assets/"
+split = "dev"
+
+[corpora.train]
+@readers = "med_mentions_reader"
+directory_path = "assets/"
+split = "train"
+
+[training]
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
+dropout = 0.2
+accumulate_gradient = 1
+patience = 0
+max_epochs = 7
+max_steps = 0
+eval_frequency = 500
+frozen_components = ["transformer", "parser", "tagger", "attribute_ruler", "lemmatizer"]
+before_to_disk = null
+
+[training.batcher]
+@batchers = "spacy.batch_by_sequence.v1"
+get_length = null
+
+[training.batcher.size]
+@schedules = "compounding.v1"
+start = 1
+stop = 32
+compound = 1.001
+t = 0.0
+
+[training.logger]
+@loggers = "spacy.ConsoleLogger.v1"
+progress_bar = true
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 0.00000001
+learn_rate = 0.001
+
+[training.score_weights]
+dep_las_per_type = null
+sents_p = null
+sents_r = null
+ents_per_type = null
+tag_acc = null
+dep_uas = null
+dep_las = null
+sents_f = null
+ents_f = 1.0
+ents_p = 0.0
+ents_r = 0.0
+
+[pretraining]
+
+[initialize]
+vectors = ${paths.vectors}
+init_tok2vec = ${paths.init_tok2vec}
+vocab_data = ${paths.vocab_path}
+lookups = null
+
+[initialize.components]
+
+[initialize.tokenizer]
+
+[initialize.before_init]
+@callbacks = "replace_tokenizer"
\ No newline at end of file
diff --git a/configs/base_parser_tagger_roberta.cfg b/configs/base_parser_tagger_roberta.cfg
new file mode 100644
index 00000000..7532035e
---
/dev/null +++ b/configs/base_parser_tagger_roberta.cfg @@ -0,0 +1,175 @@ +[paths] +genia_train = "project_data/genia_train.spacy" +genia_dev = "project_data/genia_dev.spacy" +onto_train = "project_data/train" +vectors = null +init_tok2vec = null +vocab_path = null + +[system] +gpu_allocator = "pytorch" +seed = 0 + +[nlp] +lang = "en" +pipeline = ["transformer","tagger","attribute_ruler","lemmatizer","parser"] +batch_size = 128 +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null + +[components] + +[components.attribute_ruler] +source = "en_core_web_sm" + +[components.lemmatizer] +source = "en_core_web_sm" + +[components.parser] +factory = "parser" +learn_tokens = false +min_action_freq = 30 +moves = null +update_with_oracle_cut_size = 100 + +[components.parser.model] +@architectures = "spacy.TransitionBasedParser.v1" +state_type = "parser" +extra_state_tokens = false +hidden_width = 128 +maxout_pieces = 3 +use_upper = true +nO = null + +[components.parser.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 +pooling = {"@layers":"reduce_mean.v1"} +upstream = "*" + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 +pooling = {"@layers":"reduce_mean.v1"} +upstream = "*" + +[components.transformer] +factory = "transformer" +max_batch_items = 4096 +set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} + +[components.transformer.model] +@architectures = "spacy-transformers.TransformerModel.v1" +name = "allenai/biomed_roberta_base" +tokenizer_config = {"use_fast": true} + +[components.transformer.model.get_spans] +@span_getters = "spacy-transformers.strided_spans.v1" +window = 128 +stride = 96 + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.genia_dev} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.train] +@readers = "parser_tagger_data" +path = ${paths.genia_train} +mixin_data_path = ${paths.onto_train} +mixin_data_percent = 0.2 +max_length = 2000 +gold_preproc = false +limit = 0 +augmenter = null +seed = ${system.seed} + +[training] +dev_corpus = "corpora.dev" +train_corpus = "corpora.train" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.2 +accumulate_gradient = 1 +patience = 0 +max_epochs = 20 +max_steps = 0 +eval_frequency = 2300 +frozen_components = ["attribute_ruler", "lemmatizer"] +before_to_disk = null + +[training.batcher] +@batchers = "spacy.batch_by_sequence.v1" +get_length = null + +[training.batcher.size] +@schedules = "compounding.v1" +start = 1 +stop = 16 +compound = 1.001 +t = 0.0 + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = true + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 0.00000001 +learn_rate = 0.001 + +[training.optimizer.learn_rate] +@schedules = "warmup_linear.v1" +warmup_steps = 250 +total_steps = 20000 +initial_rate = 0.00005 + + +[training.score_weights] +dep_las_per_type = null +sents_p = null +sents_r = null +ents_per_type = null +tag_acc = 0.33 +dep_uas = 0.33 +dep_las = 0.33 +sents_f = 0.0 +ents_f = 0.0 +ents_p = 0.0 +ents_r = 0.0 + +[pretraining] + +[initialize] +vectors = 
${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = ${paths.vocab_path} +lookups = null + +[initialize.components] + +[initialize.tokenizer] + +[initialize.before_init] +@callbacks = "replace_tokenizer" \ No newline at end of file diff --git a/data/meta_roberta.json b/data/meta_roberta.json new file mode 100644 index 00000000..9380d766 --- /dev/null +++ b/data/meta_roberta.json @@ -0,0 +1,10 @@ +{ + "lang":"en", + "name":"core_sci_roberta", + "sources": ["OntoNotes 5", "Common Crawl", "GENIA 1.0"], + "description":"Spacy Models for Biomedical Text.", + "author":"Allen Institute for Artificial Intelligence", + "email": "ai2-info@allenai.org", + "url":"https://allenai.github.io/SciSpaCy/", + "license":"CC BY-SA 3.0" +} diff --git a/project.yml b/project.yml index 8dc0b7c3..075aa59b 100644 --- a/project.yml +++ b/project.yml @@ -34,20 +34,25 @@ vars: parser_tagger_sm_loc: "output/en_core_sci_sm_parser_tagger" parser_tagger_md_loc: "output/en_core_sci_md_parser_tagger" parser_tagger_lg_loc: "output/en_core_sci_lg_parser_tagger" + parser_tagger_roberta_loc: "output/en_core_sci_roberta_parser_tagger" ner_sm_loc: "output/en_core_sci_sm_ner" ner_md_loc: "output/en_core_sci_md_ner" ner_lg_loc: "output/en_core_sci_lg_ner" + ner_roberta_loc: "output/en_core_sci_roberta_ner" bc5cdr_md_loc: "output/en_ner_bc5cdr_md" jnlpba_md_loc: "output/en_ner_jnlpba_md" craft_md_loc: "output/en_ner_craft_md" bionlp13cg_md_loc: "output/en_ner_bionlp13cg_md" parser_tagger_config_loc: "configs/base_parser_tagger.cfg" + parser_tagger_roberta_config_loc: "configs/base_parser_tagger.cfg" ner_config_loc: "configs/base_ner.cfg" + ner_roberta_config_loc: "configs/base_ner.cfg" specialized_ner_config_loc: "configs/base_specialized_ner.cfg" code_loc: "scispacy/base_project_code.py" meta_sm_loc: "data/meta_small.json" meta_md_loc: "data/meta_medium.json" meta_lg_loc: "data/meta_large.json" + meta_roberta_loc: "data/meta_roberta.json" meta_bc5cdr_loc: "data/bc5cdr_ner.json" meta_bionlp13cg_loc: "data/bionlp13cg_ner.json" meta_craft_loc: "data/craft_ner.json" @@ -55,6 +60,7 @@ vars: package_sm_loc: "packages/en_core_sci_sm-${vars.version_string}/en_core_sci_sm/en_core_sci_sm-${vars.version_string}" package_md_loc: "packages/en_core_sci_md-${vars.version_string}/en_core_sci_md/en_core_sci_md-${vars.version_string}" package_lg_loc: "packages/en_core_sci_lg-${vars.version_string}/en_core_sci_lg/en_core_sci_lg-${vars.version_string}" + package_roberta_loc: "packages/en_core_sci_roberta-${vars.version_string}/en_core_sci_roberta/en_core_sci_roberta-${vars.version_string}" package_bc5cdr_loc: "packages/en_ner_bc5cdr_md-${vars.version_string}/en_ner_bc5cdr_md/en_ner_bc5cdr_md-${vars.version_string}" package_bionlp13cg_loc: "packages/en_ner_bionlp13cg_md-${vars.version_string}/en_ner_bionlp13cg_md/en_ner_bionlp13cg_md-${vars.version_string}" package_craft_loc: "packages/en_ner_craft_md-${vars.version_string}/en_ner_craft_md/en_ner_craft_md-${vars.version_string}" @@ -98,6 +104,15 @@ workflows: - evaluate-ner-lg - package-lg - evaluate-package-lg + roberta: + - download + - convert-shared + - parser-tagger-train-roberta + - evaluate-parser-tagger-roberta + - ner-train-roberta + - evaluate-ner-roberta + - package-roberta + - evaluate-package-roberta specialized-ner: - ner-train-specialized - evaluate-specialized-ner @@ -280,6 +295,19 @@ commands: - "${vars.vectors_lg_loc}" outputs: - "${vars.parser_tagger_lg_loc}/model-best" + + - name: parser-tagger-train-roberta + help: "Train the roberta transformer model" + script: + - 
"spacy train ${vars.parser_tagger_roberta_config_loc} --output ${vars.parser_tagger_roberta_loc} --code ${vars.code_loc}" + deps: + - "${vars.parser_tagger_config_loc}" + - "${vars.genia_train_spacy_loc}" + - "${vars.genia_dev_spacy_loc}" + - "${vars.genia_test_spacy_loc}" + - "${vars.ontonotes_train_spacy_loc}" + outputs: + - "${vars.parser_tagger_roberta_loc}/model-best" - name: ner-train-sm help: "Train the main ner" @@ -316,6 +344,17 @@ commands: outputs: - "${vars.ner_lg_loc}/model-best" + - name: ner-train-roberta + help: "Train the roberta ner model." + script: + - "spacy train ${vars.ner_roberta_config_loc} --output ${vars.ner_roberta_loc} --code ${vars.code_loc} --paths.parser_tagger_path ${vars.parser_tagger_roberta_loc}/model-best" + deps: + - "${vars.ner_config_loc}" + - "${vars.parser_tagger_roberta_loc}/model-best" + - "${vars.corpus_pubtator_loc_local}" + outputs: + - "${vars.ner_roberta_loc}/model-best + - name: ner-train-specialized help: "Train the specialized NER models" script: @@ -384,6 +423,19 @@ commands: - "${vars.parser_tagger_lg_loc}/model_best_results.json" - "${vars.parser_tagger_lg_loc}/model_best_results_onto.json" + - name: evaluate-parser-tagger-roberta + help: "Evaluate the parser and tagger roberta model" + script: + - "spacy evaluate ${vars.parser_tagger_roberta_loc}/model-best ${vars.genia_test_spacy_loc} --output ${vars.parser_tagger_roberta_loc}/model_best_results.json" + - "spacy evaluate ${vars.parser_tagger_roberta_loc}/model-best ${vars.ontonotes_test_spacy_loc} --output ${vars.parser_tagger_roberta_loc}/model_best_results_onto.json" + deps: + - "${vars.parser_tagger_roberta_loc}/model-best" + - "${vars.genia_test_spacy_loc}" + - "${vars.ontonotes_test_spacy_loc}" + outputs: + - "${vars.parser_tagger_roberta_loc}/model_best_results.json" + - "${vars.parser_tagger_roberta_loc}/model_best_results_onto.json" + - name: evaluate-ner-sm help: "Evaluate NER" script: @@ -413,6 +465,16 @@ commands: - "${vars.corpus_pubtator_loc_local}" outputs: - "${vars.ner_lg_loc}/model_best_results.json" + + - name: evaluate-ner-roberta + help: "Evaluate NER roberta" + script: + - "python scripts/evaluate_ner.py --model_path ${vars.ner_roberta_loc}/model-best --dataset medmentions-test --output ${vars.ner_roberta_loc}/model_best_results.json --med_mentions_folder_path assets/" + deps: + - "${vars.ner_roberta_loc}" + - "${vars.corpus_pubtator_loc_local}" + outputs: + - "${vars.ner_roberta_loc}/model_best_results.json" - name: evaluate-specialized-ner help: "Evaluate specialize NER" @@ -475,7 +537,16 @@ commands: - "${vars.ner_lg_loc}/model-best" outputs: - "${vars.package_lg_loc}" - + + - name: package-roberta + help: "Package the roberta model" + script: + - "spacy package ${vars.ner_roberta_loc}/model-best packages/ --meta-path ${vars.meta_roberta_loc} --version ${vars.version_string}" + deps: + - "${vars.ner_roberta_loc}/model-best" + outputs: + - "${vars.package_roberta_loc}" + - name: evaluate-package-md help: "Evaluate the packaged models" script: @@ -501,7 +572,22 @@ commands: - "packages/lg_genia_results.json" - "packages/lg_onto_results.json" - "packages/lg_mm_results.json" - + + - name: evaluate-package-roberta + help: "Evaluate the packaged roberta model" + script: + - "spacy evaluate ${vars.package_roberta_loc} ${vars.genia_test_spacy_loc} --output packages/roberta_genia_results.json" + - "spacy evaluate ${vars.package_lg_loc} ${vars.ontonotes_test_spacy_loc} --output packages/roberta_onto_results.json" + - "python scripts/evaluate_ner.py --model_path 
${vars.package_roberta_loc} --dataset medmentions-test --output packages/roberta_mm_results.json --med_mentions_folder_path assets/" + deps: + - "${vars.package_roberta_loc}" + outputs: + - "packages/roberta_genia_results.json" + - "packages/roberta_onto_results.json" + - "packages/roberta_mm_results.json" + + + - name: package-ner help: "Package the models" script: From 3c43395f4f258a3722fa548e3b87e95dc0e05580 Mon Sep 17 00:00:00 2001 From: Mark Neuman Date: Wed, 3 Feb 2021 15:43:17 -0800 Subject: [PATCH 02/15] switch to bert --- configs/base_parser_tagger_roberta.cfg | 4 +- project.yml | 98 +++++++++++++------------- 2 files changed, 51 insertions(+), 51 deletions(-) diff --git a/configs/base_parser_tagger_roberta.cfg b/configs/base_parser_tagger_roberta.cfg index 7532035e..0d4f4f87 100644 --- a/configs/base_parser_tagger_roberta.cfg +++ b/configs/base_parser_tagger_roberta.cfg @@ -70,7 +70,7 @@ set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotati [components.transformer.model] @architectures = "spacy-transformers.TransformerModel.v1" -name = "allenai/biomed_roberta_base" +name = "allenai/scibert_scivocab_uncased" tokenizer_config = {"use_fast": true} [components.transformer.model.get_spans] @@ -172,4 +172,4 @@ lookups = null [initialize.tokenizer] [initialize.before_init] -@callbacks = "replace_tokenizer" \ No newline at end of file +@callbacks = "replace_tokenizer" diff --git a/project.yml b/project.yml index 075aa59b..8378948f 100644 --- a/project.yml +++ b/project.yml @@ -34,25 +34,25 @@ vars: parser_tagger_sm_loc: "output/en_core_sci_sm_parser_tagger" parser_tagger_md_loc: "output/en_core_sci_md_parser_tagger" parser_tagger_lg_loc: "output/en_core_sci_lg_parser_tagger" - parser_tagger_roberta_loc: "output/en_core_sci_roberta_parser_tagger" + parser_tagger_scibert_loc: "output/en_core_sci_scibert_parser_tagger" ner_sm_loc: "output/en_core_sci_sm_ner" ner_md_loc: "output/en_core_sci_md_ner" ner_lg_loc: "output/en_core_sci_lg_ner" - ner_roberta_loc: "output/en_core_sci_roberta_ner" + ner_scibert_loc: "output/en_core_sci_scibert_ner" bc5cdr_md_loc: "output/en_ner_bc5cdr_md" jnlpba_md_loc: "output/en_ner_jnlpba_md" craft_md_loc: "output/en_ner_craft_md" bionlp13cg_md_loc: "output/en_ner_bionlp13cg_md" parser_tagger_config_loc: "configs/base_parser_tagger.cfg" - parser_tagger_roberta_config_loc: "configs/base_parser_tagger.cfg" + parser_tagger_scibert_config_loc: "configs/base_parser_tagger.cfg" ner_config_loc: "configs/base_ner.cfg" - ner_roberta_config_loc: "configs/base_ner.cfg" + ner_scibert_config_loc: "configs/base_ner.cfg" specialized_ner_config_loc: "configs/base_specialized_ner.cfg" code_loc: "scispacy/base_project_code.py" meta_sm_loc: "data/meta_small.json" meta_md_loc: "data/meta_medium.json" meta_lg_loc: "data/meta_large.json" - meta_roberta_loc: "data/meta_roberta.json" + meta_scibert_loc: "data/meta_scibert.json" meta_bc5cdr_loc: "data/bc5cdr_ner.json" meta_bionlp13cg_loc: "data/bionlp13cg_ner.json" meta_craft_loc: "data/craft_ner.json" @@ -60,7 +60,7 @@ vars: package_sm_loc: "packages/en_core_sci_sm-${vars.version_string}/en_core_sci_sm/en_core_sci_sm-${vars.version_string}" package_md_loc: "packages/en_core_sci_md-${vars.version_string}/en_core_sci_md/en_core_sci_md-${vars.version_string}" package_lg_loc: "packages/en_core_sci_lg-${vars.version_string}/en_core_sci_lg/en_core_sci_lg-${vars.version_string}" - package_roberta_loc: "packages/en_core_sci_roberta-${vars.version_string}/en_core_sci_roberta/en_core_sci_roberta-${vars.version_string}" + 
package_scibert_loc: "packages/en_core_sci_scibert-${vars.version_string}/en_core_sci_scibert/en_core_sci_scibert-${vars.version_string}" package_bc5cdr_loc: "packages/en_ner_bc5cdr_md-${vars.version_string}/en_ner_bc5cdr_md/en_ner_bc5cdr_md-${vars.version_string}" package_bionlp13cg_loc: "packages/en_ner_bionlp13cg_md-${vars.version_string}/en_ner_bionlp13cg_md/en_ner_bionlp13cg_md-${vars.version_string}" package_craft_loc: "packages/en_ner_craft_md-${vars.version_string}/en_ner_craft_md/en_ner_craft_md-${vars.version_string}" @@ -104,15 +104,15 @@ workflows: - evaluate-ner-lg - package-lg - evaluate-package-lg - roberta: + scibert: - download - convert-shared - - parser-tagger-train-roberta - - evaluate-parser-tagger-roberta - - ner-train-roberta - - evaluate-ner-roberta - - package-roberta - - evaluate-package-roberta + - parser-tagger-train-scibert + - evaluate-parser-tagger-scibert + - ner-train-scibert + - evaluate-ner-scibert + - package-scibert + - evaluate-package-scibert specialized-ner: - ner-train-specialized - evaluate-specialized-ner @@ -296,10 +296,10 @@ commands: outputs: - "${vars.parser_tagger_lg_loc}/model-best" - - name: parser-tagger-train-roberta - help: "Train the roberta transformer model" + - name: parser-tagger-train-scibert + help: "Train the scibert transformer model" script: - - "spacy train ${vars.parser_tagger_roberta_config_loc} --output ${vars.parser_tagger_roberta_loc} --code ${vars.code_loc}" + - "spacy train ${vars.parser_tagger_scibert_config_loc} --output ${vars.parser_tagger_scibert_loc} --code ${vars.code_loc}" deps: - "${vars.parser_tagger_config_loc}" - "${vars.genia_train_spacy_loc}" @@ -307,7 +307,7 @@ commands: - "${vars.genia_test_spacy_loc}" - "${vars.ontonotes_train_spacy_loc}" outputs: - - "${vars.parser_tagger_roberta_loc}/model-best" + - "${vars.parser_tagger_scibert_loc}/model-best" - name: ner-train-sm help: "Train the main ner" @@ -344,16 +344,16 @@ commands: outputs: - "${vars.ner_lg_loc}/model-best" - - name: ner-train-roberta - help: "Train the roberta ner model." + - name: ner-train-scibert + help: "Train the scibert ner model." 
script: - - "spacy train ${vars.ner_roberta_config_loc} --output ${vars.ner_roberta_loc} --code ${vars.code_loc} --paths.parser_tagger_path ${vars.parser_tagger_roberta_loc}/model-best" + - "spacy train ${vars.ner_scibert_config_loc} --output ${vars.ner_scibert_loc} --code ${vars.code_loc} --paths.parser_tagger_path ${vars.parser_tagger_scibert_loc}/model-best" deps: - "${vars.ner_config_loc}" - - "${vars.parser_tagger_roberta_loc}/model-best" + - "${vars.parser_tagger_scibert_loc}/model-best" - "${vars.corpus_pubtator_loc_local}" outputs: - - "${vars.ner_roberta_loc}/model-best + - "${vars.ner_scibert_loc}/model-best" - name: ner-train-specialized help: "Train the specialized NER models" @@ -423,18 +423,18 @@ commands: - "${vars.parser_tagger_lg_loc}/model_best_results.json" - "${vars.parser_tagger_lg_loc}/model_best_results_onto.json" - - name: evaluate-parser-tagger-roberta - help: "Evaluate the parser and tagger roberta model" + - name: evaluate-parser-tagger-scibert + help: "Evaluate the parser and tagger scibert model" script: - - "spacy evaluate ${vars.parser_tagger_roberta_loc}/model-best ${vars.genia_test_spacy_loc} --output ${vars.parser_tagger_roberta_loc}/model_best_results.json" - - "spacy evaluate ${vars.parser_tagger_roberta_loc}/model-best ${vars.ontonotes_test_spacy_loc} --output ${vars.parser_tagger_roberta_loc}/model_best_results_onto.json" + - "spacy evaluate ${vars.parser_tagger_scibert_loc}/model-best ${vars.genia_test_spacy_loc} --output ${vars.parser_tagger_scibert_loc}/model_best_results.json" + - "spacy evaluate ${vars.parser_tagger_scibert_loc}/model-best ${vars.ontonotes_test_spacy_loc} --output ${vars.parser_tagger_scibert_loc}/model_best_results_onto.json" deps: - - "${vars.parser_tagger_roberta_loc}/model-best" + - "${vars.parser_tagger_scibert_loc}/model-best" - "${vars.genia_test_spacy_loc}" - "${vars.ontonotes_test_spacy_loc}" outputs: - - "${vars.parser_tagger_roberta_loc}/model_best_results.json" - - "${vars.parser_tagger_roberta_loc}/model_best_results_onto.json" + - "${vars.parser_tagger_scibert_loc}/model_best_results.json" + - "${vars.parser_tagger_scibert_loc}/model_best_results_onto.json" - name: evaluate-ner-sm help: "Evaluate NER" @@ -466,15 +466,15 @@ commands: outputs: - "${vars.ner_lg_loc}/model_best_results.json" - - name: evaluate-ner-roberta - help: "Evaluate NER roberta" + - name: evaluate-ner-scibert + help: "Evaluate NER scibert" script: - - "python scripts/evaluate_ner.py --model_path ${vars.ner_roberta_loc}/model-best --dataset medmentions-test --output ${vars.ner_roberta_loc}/model_best_results.json --med_mentions_folder_path assets/" + - "python scripts/evaluate_ner.py --model_path ${vars.ner_scibert_loc}/model-best --dataset medmentions-test --output ${vars.ner_scibert_loc}/model_best_results.json --med_mentions_folder_path assets/" deps: - - "${vars.ner_roberta_loc}" + - "${vars.ner_scibert_loc}" - "${vars.corpus_pubtator_loc_local}" outputs: - - "${vars.ner_roberta_loc}/model_best_results.json" + - "${vars.ner_scibert_loc}/model_best_results.json" - name: evaluate-specialized-ner help: "Evaluate specialize NER" @@ -538,14 +538,14 @@ commands: outputs: - "${vars.package_lg_loc}" - - name: package-roberta - help: "Package the roberta model" + - name: package-scibert + help: "Package the scibert model" script: - - "spacy package ${vars.ner_roberta_loc}/model-best packages/ --meta-path ${vars.meta_roberta_loc} --version ${vars.version_string}" + - "spacy package ${vars.ner_scibert_loc}/model-best packages/ --meta-path 
${vars.meta_scibert_loc} --version ${vars.version_string}" deps: - - "${vars.ner_roberta_loc}/model-best" + - "${vars.ner_scibert_loc}/model-best" outputs: - - "${vars.package_roberta_loc}" + - "${vars.package_scibert_loc}" - name: evaluate-package-md help: "Evaluate the packaged models" @@ -573,18 +573,18 @@ commands: - "packages/lg_onto_results.json" - "packages/lg_mm_results.json" - - name: evaluate-package-roberta - help: "Evaluate the packaged roberta model" + - name: evaluate-package-scibert + help: "Evaluate the packaged scibert model" script: - - "spacy evaluate ${vars.package_roberta_loc} ${vars.genia_test_spacy_loc} --output packages/roberta_genia_results.json" - - "spacy evaluate ${vars.package_lg_loc} ${vars.ontonotes_test_spacy_loc} --output packages/roberta_onto_results.json" - - "python scripts/evaluate_ner.py --model_path ${vars.package_roberta_loc} --dataset medmentions-test --output packages/roberta_mm_results.json --med_mentions_folder_path assets/" + - "spacy evaluate ${vars.package_scibert_loc} ${vars.genia_test_spacy_loc} --output packages/scibert_genia_results.json" + - "spacy evaluate ${vars.package_lg_loc} ${vars.ontonotes_test_spacy_loc} --output packages/scibert_onto_results.json" + - "python scripts/evaluate_ner.py --model_path ${vars.package_scibert_loc} --dataset medmentions-test --output packages/scibert_mm_results.json --med_mentions_folder_path assets/" deps: - - "${vars.package_roberta_loc}" + - "${vars.package_scibert_loc}" outputs: - - "packages/roberta_genia_results.json" - - "packages/roberta_onto_results.json" - - "packages/roberta_mm_results.json" + - "packages/scibert_genia_results.json" + - "packages/scibert_onto_results.json" + - "packages/scibert_mm_results.json" @@ -626,4 +626,4 @@ commands: - # TODOs: evaluate everything, package command, another evaluate of the package, twiddle params, maybe add a command for uploading models and releasing? \ No newline at end of file + # TODOs: evaluate everything, package command, another evaluate of the package, twiddle params, maybe add a command for uploading models and releasing? 
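
The ner-train-scibert command above hands the trained parser-tagger pipeline to the NER config through --paths.parser_tagger_path, and that config's source = ${paths.parser_tagger_path} and frozen_components settings reuse those components without updating them. Below is a short sketch for sanity-checking the intermediate pipeline before the NER step, assuming it was written to the project.yml output location; the example sentence is illustrative and not part of this patch series.

import spacy

# Path comes from project.yml (${vars.parser_tagger_scibert_loc}/model-best);
# it is the same directory the NER training command passes as
# --paths.parser_tagger_path. Adjust it to wherever your run wrote model-best.
parser_tagger = spacy.load("output/en_core_sci_scibert_parser_tagger/model-best")

# Should list the components the NER config later sources and freezes:
# ['transformer', 'tagger', 'attribute_ruler', 'lemmatizer', 'parser']
print(parser_tagger.pipe_names)

# Run the tagger/parser on a biomedical-style sentence (illustrative input).
doc = parser_tagger("Induction of cytokine expression in leukocytes.")
print([(token.text, token.tag_, token.dep_) for token in doc])
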
From b9ac9c3da9f1df73aae2d9142f172c153dbd853b Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Wed, 3 Feb 2021 15:48:18 -0800 Subject: [PATCH 03/15] fix spacing yaml :rage: --- project.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/project.yml b/project.yml index 075aa59b..b6d861f2 100644 --- a/project.yml +++ b/project.yml @@ -353,7 +353,7 @@ commands: - "${vars.parser_tagger_roberta_loc}/model-best" - "${vars.corpus_pubtator_loc_local}" outputs: - - "${vars.ner_roberta_loc}/model-best + - "${vars.ner_roberta_loc}/model-best" - name: ner-train-specialized help: "Train the specialized NER models" @@ -538,7 +538,7 @@ commands: outputs: - "${vars.package_lg_loc}" - - name: package-roberta + - name: package-roberta help: "Package the roberta model" script: - "spacy package ${vars.ner_roberta_loc}/model-best packages/ --meta-path ${vars.meta_roberta_loc} --version ${vars.version_string}" @@ -573,7 +573,7 @@ commands: - "packages/lg_onto_results.json" - "packages/lg_mm_results.json" - - name: evaluate-package-roberta + - name: evaluate-package-roberta help: "Evaluate the packaged roberta model" script: - "spacy evaluate ${vars.package_roberta_loc} ${vars.genia_test_spacy_loc} --output packages/roberta_genia_results.json" From 5f33645b8220dd5934ac98665bc20707d19b72ee Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Wed, 3 Feb 2021 15:49:04 -0800 Subject: [PATCH 04/15] Revert "fix spacing yaml :rage:" This reverts commit b9ac9c3da9f1df73aae2d9142f172c153dbd853b. --- project.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/project.yml b/project.yml index b6d861f2..075aa59b 100644 --- a/project.yml +++ b/project.yml @@ -353,7 +353,7 @@ commands: - "${vars.parser_tagger_roberta_loc}/model-best" - "${vars.corpus_pubtator_loc_local}" outputs: - - "${vars.ner_roberta_loc}/model-best" + - "${vars.ner_roberta_loc}/model-best - name: ner-train-specialized help: "Train the specialized NER models" @@ -538,7 +538,7 @@ commands: outputs: - "${vars.package_lg_loc}" - - name: package-roberta + - name: package-roberta help: "Package the roberta model" script: - "spacy package ${vars.ner_roberta_loc}/model-best packages/ --meta-path ${vars.meta_roberta_loc} --version ${vars.version_string}" @@ -573,7 +573,7 @@ commands: - "packages/lg_onto_results.json" - "packages/lg_mm_results.json" - - name: evaluate-package-roberta + - name: evaluate-package-roberta help: "Evaluate the packaged roberta model" script: - "spacy evaluate ${vars.package_roberta_loc} ${vars.genia_test_spacy_loc} --output packages/roberta_genia_results.json" From 8319bc5f831b0a5f62e8611996ab265557912355 Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Wed, 3 Feb 2021 15:52:08 -0800 Subject: [PATCH 05/15] fix yaml and paths :rage: --- data/{meta_roberta.json => meta_scibert.json} | 2 +- project.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) rename data/{meta_roberta.json => meta_scibert.json} (91%) diff --git a/data/meta_roberta.json b/data/meta_scibert.json similarity index 91% rename from data/meta_roberta.json rename to data/meta_scibert.json index 9380d766..6364e49d 100644 --- a/data/meta_roberta.json +++ b/data/meta_scibert.json @@ -1,6 +1,6 @@ { "lang":"en", - "name":"core_sci_roberta", + "name":"core_sci_scibert", "sources": ["OntoNotes 5", "Common Crawl", "GENIA 1.0"], "description":"Spacy Models for Biomedical Text.", "author":"Allen Institute for Artificial Intelligence", diff --git a/project.yml b/project.yml index 8378948f..13bfac49 100644 --- a/project.yml +++ 
b/project.yml @@ -538,7 +538,7 @@ commands: outputs: - "${vars.package_lg_loc}" - - name: package-scibert + - name: package-scibert help: "Package the scibert model" script: - "spacy package ${vars.ner_scibert_loc}/model-best packages/ --meta-path ${vars.meta_scibert_loc} --version ${vars.version_string}" @@ -573,7 +573,7 @@ commands: - "packages/lg_onto_results.json" - "packages/lg_mm_results.json" - - name: evaluate-package-scibert + - name: evaluate-package-scibert help: "Evaluate the packaged scibert model" script: - "spacy evaluate ${vars.package_scibert_loc} ${vars.genia_test_spacy_loc} --output packages/scibert_genia_results.json" From e731b9183e4658f7afb45f61d5a0b16da992f9d5 Mon Sep 17 00:00:00 2001 From: Mark Neuman Date: Fri, 5 Feb 2021 11:57:07 -0800 Subject: [PATCH 06/15] updates to config --- ...e_ner_roberta.cfg => base_ner_scibert.cfg} | 21 ++++++++++++------- ...rta.cfg => base_parser_tagger_scibert.cfg} | 16 +++++--------- project.yml | 10 ++++----- 3 files changed, 23 insertions(+), 24 deletions(-) rename configs/{base_ner_roberta.cfg => base_ner_scibert.cfg} (87%) rename configs/{base_parser_tagger_roberta.cfg => base_parser_tagger_scibert.cfg} (94%) diff --git a/configs/base_ner_roberta.cfg b/configs/base_ner_scibert.cfg similarity index 87% rename from configs/base_ner_roberta.cfg rename to configs/base_ner_scibert.cfg index 4cc576a2..42830282 100644 --- a/configs/base_ner_roberta.cfg +++ b/configs/base_ner_scibert.cfg @@ -11,7 +11,7 @@ seed = 0 [nlp] lang = "en" pipeline = ["transformer","tagger","attribute_ruler","lemmatizer","parser","ner"] -batch_size = 128 +batch_size = 256 tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} disabled = [] before_creation = null @@ -20,6 +20,12 @@ after_pipeline_creation = null [components] +[components.attribute_ruler] +source = "en_core_web_sm" + +[components.lemmatizer] +source = "en_core_web_sm" + [components.ner] factory = "ner" moves = null @@ -38,10 +44,11 @@ nO = null @architectures = "spacy-transformers.TransformerListener.v1" grad_factor = 1.0 pooling = {"@layers":"reduce_mean.v1"} -upstream = "*"e +upstream = "*" [components.transformer] source = ${paths.parser_tagger_path} +replace_listeners = ["model.transformer"] [components.parser] source = ${paths.parser_tagger_path} @@ -49,8 +56,6 @@ source = ${paths.parser_tagger_path} [components.tagger] source = ${paths.parser_tagger_path} -[components.tok2vec] -source = ${paths.parser_tagger_path} [corpora] @@ -75,7 +80,7 @@ patience = 0 max_epochs = 7 max_steps = 0 eval_frequency = 500 -frozen_components = ["transformer", "parser", "tagger", "attribute_ruler", "lemmatizer"] +frozen_components = ["parser", "tagger", "attribute_ruler", "lemmatizer"] before_to_disk = null [training.batcher] @@ -84,7 +89,7 @@ get_length = null [training.batcher.size] @schedules = "compounding.v1" -start = 1 +start = 16 stop = 32 compound = 1.001 t = 0.0 @@ -102,7 +107,7 @@ L2 = 0.01 grad_clip = 1.0 use_averages = false eps = 0.00000001 -learn_rate = 0.001 +learn_rate = 0.0001 [training.score_weights] dep_las_per_type = null @@ -130,4 +135,4 @@ lookups = null [initialize.tokenizer] [initialize.before_init] -@callbacks = "replace_tokenizer" \ No newline at end of file +@callbacks = "replace_tokenizer" diff --git a/configs/base_parser_tagger_roberta.cfg b/configs/base_parser_tagger_scibert.cfg similarity index 94% rename from configs/base_parser_tagger_roberta.cfg rename to configs/base_parser_tagger_scibert.cfg index 0d4f4f87..15be9beb 100644 --- a/configs/base_parser_tagger_roberta.cfg +++ 
b/configs/base_parser_tagger_scibert.cfg @@ -13,7 +13,7 @@ seed = 0 [nlp] lang = "en" pipeline = ["transformer","tagger","attribute_ruler","lemmatizer","parser"] -batch_size = 128 +batch_size = 256 tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} disabled = [] before_creation = null @@ -107,7 +107,7 @@ gpu_allocator = ${system.gpu_allocator} dropout = 0.2 accumulate_gradient = 1 patience = 0 -max_epochs = 20 +max_epochs = 8 max_steps = 0 eval_frequency = 2300 frozen_components = ["attribute_ruler", "lemmatizer"] @@ -119,8 +119,8 @@ get_length = null [training.batcher.size] @schedules = "compounding.v1" -start = 1 -stop = 16 +start = 16 +stop = 64 compound = 1.001 t = 0.0 @@ -137,13 +137,7 @@ L2 = 0.01 grad_clip = 1.0 use_averages = false eps = 0.00000001 -learn_rate = 0.001 - -[training.optimizer.learn_rate] -@schedules = "warmup_linear.v1" -warmup_steps = 250 -total_steps = 20000 -initial_rate = 0.00005 +learn_rate = 0.00005 [training.score_weights] diff --git a/project.yml b/project.yml index 13bfac49..a8619009 100644 --- a/project.yml +++ b/project.yml @@ -44,9 +44,9 @@ vars: craft_md_loc: "output/en_ner_craft_md" bionlp13cg_md_loc: "output/en_ner_bionlp13cg_md" parser_tagger_config_loc: "configs/base_parser_tagger.cfg" - parser_tagger_scibert_config_loc: "configs/base_parser_tagger.cfg" + parser_tagger_scibert_config_loc: "configs/base_parser_tagger_scibert.cfg" ner_config_loc: "configs/base_ner.cfg" - ner_scibert_config_loc: "configs/base_ner.cfg" + ner_scibert_config_loc: "configs/base_ner_scibert.cfg" specialized_ner_config_loc: "configs/base_specialized_ner.cfg" code_loc: "scispacy/base_project_code.py" meta_sm_loc: "data/meta_small.json" @@ -299,7 +299,7 @@ commands: - name: parser-tagger-train-scibert help: "Train the scibert transformer model" script: - - "spacy train ${vars.parser_tagger_scibert_config_loc} --output ${vars.parser_tagger_scibert_loc} --code ${vars.code_loc}" + - "spacy train ${vars.parser_tagger_scibert_config_loc} --output ${vars.parser_tagger_scibert_loc} --code ${vars.code_loc} --gpu-id 0" deps: - "${vars.parser_tagger_config_loc}" - "${vars.genia_train_spacy_loc}" @@ -347,7 +347,7 @@ commands: - name: ner-train-scibert help: "Train the scibert ner model." 
script: - - "spacy train ${vars.ner_scibert_config_loc} --output ${vars.ner_scibert_loc} --code ${vars.code_loc} --paths.parser_tagger_path ${vars.parser_tagger_scibert_loc}/model-best" + - "spacy train ${vars.ner_scibert_config_loc} --output ${vars.ner_scibert_loc} --code ${vars.code_loc} --paths.parser_tagger_path ${vars.parser_tagger_scibert_loc}/model-best --gpu-id 0" deps: - "${vars.ner_config_loc}" - "${vars.parser_tagger_scibert_loc}/model-best" @@ -577,7 +577,7 @@ commands: help: "Evaluate the packaged scibert model" script: - "spacy evaluate ${vars.package_scibert_loc} ${vars.genia_test_spacy_loc} --output packages/scibert_genia_results.json" - - "spacy evaluate ${vars.package_lg_loc} ${vars.ontonotes_test_spacy_loc} --output packages/scibert_onto_results.json" + - "spacy evaluate ${vars.package_scibert_loc} ${vars.ontonotes_test_spacy_loc} --output packages/scibert_onto_results.json" - "python scripts/evaluate_ner.py --model_path ${vars.package_scibert_loc} --dataset medmentions-test --output packages/scibert_mm_results.json --med_mentions_folder_path assets/" deps: - "${vars.package_scibert_loc}" From c2d966592f8768b4192ec4b5fb3f7b2e137af7db Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Mon, 8 Feb 2021 12:20:13 -0800 Subject: [PATCH 07/15] try independent tranformers --- configs/base_ner_scibert.cfg | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/configs/base_ner_scibert.cfg b/configs/base_ner_scibert.cfg index 42830282..a40f7823 100644 --- a/configs/base_ner_scibert.cfg +++ b/configs/base_ner_scibert.cfg @@ -41,14 +41,9 @@ use_upper = true nO = null [components.ner.model.tok2vec] -@architectures = "spacy-transformers.TransformerListener.v1" -grad_factor = 1.0 -pooling = {"@layers":"reduce_mean.v1"} -upstream = "*" - -[components.transformer] -source = ${paths.parser_tagger_path} -replace_listeners = ["model.transformer"] +@architectures = "spacy-transformers.TransformerModel.v1" +name = "allenai/scibert_scivocab_uncased" +tokenizer_config = {"use_fast": true} [components.parser] source = ${paths.parser_tagger_path} @@ -80,7 +75,7 @@ patience = 0 max_epochs = 7 max_steps = 0 eval_frequency = 500 -frozen_components = ["parser", "tagger", "attribute_ruler", "lemmatizer"] +frozen_components = ["transformer", "parser", "tagger", "attribute_ruler", "lemmatizer"] before_to_disk = null [training.batcher] From c03e14a551b6a911b65bb9b648230e1977d33dce Mon Sep 17 00:00:00 2001 From: Mark Neuman Date: Mon, 8 Feb 2021 12:28:01 -0800 Subject: [PATCH 08/15] updates which break training.. 
--- configs/base_ner_scibert.cfg | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/configs/base_ner_scibert.cfg b/configs/base_ner_scibert.cfg index a40f7823..4fbd6f16 100644 --- a/configs/base_ner_scibert.cfg +++ b/configs/base_ner_scibert.cfg @@ -10,7 +10,7 @@ seed = 0 [nlp] lang = "en" -pipeline = ["transformer","tagger","attribute_ruler","lemmatizer","parser","ner"] +pipeline = ["tagger","attribute_ruler","lemmatizer","parser","ner"] batch_size = 256 tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} disabled = [] @@ -45,6 +45,11 @@ nO = null name = "allenai/scibert_scivocab_uncased" tokenizer_config = {"use_fast": true} +[components.ner.model.tok2vec.get_spans] +@span_getters = "spacy-transformers.strided_spans.v1" +window = 128 +stride = 96 + [components.parser] source = ${paths.parser_tagger_path} @@ -75,7 +80,7 @@ patience = 0 max_epochs = 7 max_steps = 0 eval_frequency = 500 -frozen_components = ["transformer", "parser", "tagger", "attribute_ruler", "lemmatizer"] +frozen_components = ["parser", "tagger", "attribute_ruler", "lemmatizer"] before_to_disk = null [training.batcher] From bb95219d851993f37a2cd96f773d9bcf3173ac1c Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Mon, 8 Feb 2021 12:54:19 -0800 Subject: [PATCH 09/15] try way using upstream --- configs/base_ner_scibert.cfg | 10 +++++++--- configs/base_parser_tagger_scibert.cfg | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/configs/base_ner_scibert.cfg b/configs/base_ner_scibert.cfg index a40f7823..4cd31aae 100644 --- a/configs/base_ner_scibert.cfg +++ b/configs/base_ner_scibert.cfg @@ -41,9 +41,10 @@ use_upper = true nO = null [components.ner.model.tok2vec] -@architectures = "spacy-transformers.TransformerModel.v1" -name = "allenai/scibert_scivocab_uncased" -tokenizer_config = {"use_fast": true} +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 +pooling = {"@layers":"reduce_mean.v1"} +upstream = "components.ner_transformer" [components.parser] source = ${paths.parser_tagger_path} @@ -51,6 +52,9 @@ source = ${paths.parser_tagger_path} [components.tagger] source = ${paths.parser_tagger_path} +[components.transformer] +source = ${paths.parser_tagger_path} + [corpora] diff --git a/configs/base_parser_tagger_scibert.cfg b/configs/base_parser_tagger_scibert.cfg index 15be9beb..f104bba8 100644 --- a/configs/base_parser_tagger_scibert.cfg +++ b/configs/base_parser_tagger_scibert.cfg @@ -78,6 +78,22 @@ tokenizer_config = {"use_fast": true} window = 128 stride = 96 +[components.ner_transformer] +factory = "transformer" +max_batch_items = 4096 +set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} + +[components.ner_transformer.model] +@architectures = "spacy-transformers.TransformerModel.v1" +name = "allenai/scibert_scivocab_uncased" +tokenizer_config = {"use_fast": true} + +[components.ner_transformer.model.get_spans] +@span_getters = "spacy-transformers.strided_spans.v1" +window = 128 +stride = 96 + + [corpora] [corpora.dev] From c2dfd3a553cf3cd1f9ddf1f962a9219ffbe9ccb4 Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Thu, 11 Feb 2021 13:46:36 -0800 Subject: [PATCH 10/15] back to first way --- configs/base_ner_scibert.cfg | 6 +++--- configs/base_parser_tagger_scibert.cfg | 15 --------------- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/configs/base_ner_scibert.cfg b/configs/base_ner_scibert.cfg index 4cd31aae..b73243bf 100644 --- a/configs/base_ner_scibert.cfg +++ 
b/configs/base_ner_scibert.cfg @@ -42,9 +42,9 @@ nO = null [components.ner.model.tok2vec] @architectures = "spacy-transformers.TransformerListener.v1" -grad_factor = 1.0 +grad_factor = 0.0 pooling = {"@layers":"reduce_mean.v1"} -upstream = "components.ner_transformer" +upstream = "*" [components.parser] source = ${paths.parser_tagger_path} @@ -79,7 +79,7 @@ patience = 0 max_epochs = 7 max_steps = 0 eval_frequency = 500 -frozen_components = ["transformer", "parser", "tagger", "attribute_ruler", "lemmatizer"] +frozen_components = ["parser", "tagger", "attribute_ruler", "lemmatizer"] before_to_disk = null [training.batcher] diff --git a/configs/base_parser_tagger_scibert.cfg b/configs/base_parser_tagger_scibert.cfg index f104bba8..9e59d4e2 100644 --- a/configs/base_parser_tagger_scibert.cfg +++ b/configs/base_parser_tagger_scibert.cfg @@ -78,21 +78,6 @@ tokenizer_config = {"use_fast": true} window = 128 stride = 96 -[components.ner_transformer] -factory = "transformer" -max_batch_items = 4096 -set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} - -[components.ner_transformer.model] -@architectures = "spacy-transformers.TransformerModel.v1" -name = "allenai/scibert_scivocab_uncased" -tokenizer_config = {"use_fast": true} - -[components.ner_transformer.model.get_spans] -@span_getters = "spacy-transformers.strided_spans.v1" -window = 128 -stride = 96 - [corpora] From cac9e8a250d333ca75c6928b4dddf835ea100d45 Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Fri, 12 Feb 2021 10:29:39 -0800 Subject: [PATCH 11/15] revert to standard embedding for NER --- configs/base_ner_scibert.cfg | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/configs/base_ner_scibert.cfg b/configs/base_ner_scibert.cfg index b73243bf..3c141219 100644 --- a/configs/base_ner_scibert.cfg +++ b/configs/base_ner_scibert.cfg @@ -26,11 +26,6 @@ source = "en_core_web_sm" [components.lemmatizer] source = "en_core_web_sm" -[components.ner] -factory = "ner" -moves = null -update_with_oracle_cut_size = 100 - [components.ner.model] @architectures = "spacy.TransitionBasedParser.v1" state_type = "ner" @@ -41,10 +36,21 @@ use_upper = true nO = null [components.ner.model.tok2vec] -@architectures = "spacy-transformers.TransformerListener.v1" -grad_factor = 0.0 -pooling = {"@layers":"reduce_mean.v1"} -upstream = "*" +@architectures = "spacy.Tok2Vec.v1" + +[components.ner.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = 96 +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] +rows = [5000, 2500, 2500, 2500] +include_static_vectors = false + +[components.ner.model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 [components.parser] source = ${paths.parser_tagger_path} From 896f34baf5170111170cf75f8e29cd5788260031 Mon Sep 17 00:00:00 2001 From: Mark Neuman Date: Fri, 12 Feb 2021 12:12:03 -0800 Subject: [PATCH 12/15] updates --- configs/base_ner_scibert.cfg | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/configs/base_ner_scibert.cfg b/configs/base_ner_scibert.cfg index c036b7f3..916a1eb1 100644 --- a/configs/base_ner_scibert.cfg +++ b/configs/base_ner_scibert.cfg @@ -10,7 +10,7 @@ seed = 0 [nlp] lang = "en" -pipeline = ["tagger","attribute_ruler","lemmatizer","parser","ner"] +pipeline = ["transformer", "tagger","attribute_ruler","lemmatizer","parser","ner"] batch_size = 256 tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} disabled = [] @@ 
-26,6 +26,11 @@ source = "en_core_web_sm" [components.lemmatizer] source = "en_core_web_sm" +[components.ner] +factory = "ner" +moves = null +update_with_oracle_cut_size = 100 + [components.ner.model] @architectures = "spacy.TransitionBasedParser.v1" state_type = "ner" @@ -52,11 +57,6 @@ depth = 4 window_size = 1 maxout_pieces = 3 -[components.ner.model.tok2vec.get_spans] -@span_getters = "spacy-transformers.strided_spans.v1" -window = 128 -stride = 96 - [components.parser] source = ${paths.parser_tagger_path} @@ -90,7 +90,7 @@ patience = 0 max_epochs = 7 max_steps = 0 eval_frequency = 500 -frozen_components = ["parser", "tagger", "attribute_ruler", "lemmatizer"] +frozen_components = ["transformer", "parser", "tagger", "attribute_ruler", "lemmatizer"] before_to_disk = null [training.batcher] From 1930eb1495896f198a0d20ce7b0a1e107db8ec04 Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Fri, 12 Feb 2021 12:19:48 -0800 Subject: [PATCH 13/15] match config, add vocab --- configs/base_ner_scibert.cfg | 5 ++--- project.yml | 3 ++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/base_ner_scibert.cfg b/configs/base_ner_scibert.cfg index 916a1eb1..9e80b53f 100644 --- a/configs/base_ner_scibert.cfg +++ b/configs/base_ner_scibert.cfg @@ -11,7 +11,6 @@ seed = 0 [nlp] lang = "en" pipeline = ["transformer", "tagger","attribute_ruler","lemmatizer","parser","ner"] -batch_size = 256 tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} disabled = [] before_creation = null @@ -99,7 +98,7 @@ get_length = null [training.batcher.size] @schedules = "compounding.v1" -start = 16 +start = 1 stop = 32 compound = 1.001 t = 0.0 @@ -117,7 +116,7 @@ L2 = 0.01 grad_clip = 1.0 use_averages = false eps = 0.00000001 -learn_rate = 0.0001 +learn_rate = 0.001 [training.score_weights] dep_las_per_type = null diff --git a/project.yml b/project.yml index a8619009..5f7e63c3 100644 --- a/project.yml +++ b/project.yml @@ -107,6 +107,7 @@ workflows: scibert: - download - convert-shared + - convert-lg # use the large vocab, but not the vectors - parser-tagger-train-scibert - evaluate-parser-tagger-scibert - ner-train-scibert @@ -299,7 +300,7 @@ commands: - name: parser-tagger-train-scibert help: "Train the scibert transformer model" script: - - "spacy train ${vars.parser_tagger_scibert_config_loc} --output ${vars.parser_tagger_scibert_loc} --code ${vars.code_loc} --gpu-id 0" + - "spacy train ${vars.parser_tagger_scibert_config_loc} --output ${vars.parser_tagger_scibert_loc} --code ${vars.code_loc} --paths.vocab_path ${vars.vocab_lg_loc} --gpu-id 0" deps: - "${vars.parser_tagger_config_loc}" - "${vars.genia_train_spacy_loc}" From 7b94fafcc3e025c8d54f05f84a41a7ccbd9df5ab Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Fri, 12 Feb 2021 14:27:23 -0800 Subject: [PATCH 14/15] release --- README.md | 1 + docs/index.md | 2 ++ 2 files changed, 3 insertions(+) diff --git a/README.md b/README.md index 1df047f9..ccfe1fea 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,7 @@ pip install CMD-V(to paste the copied URL) | en_core_sci_sm | A full spaCy pipeline for biomedical data with a ~100k vocabulary. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz)| | en_core_sci_md | A full spaCy pipeline for biomedical data with a ~360k vocabulary and 50k word vectors. 
|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz)| | en_core_sci_lg | A full spaCy pipeline for biomedical data with a ~785k vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz)| +| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_scibert-0.4.0.tar.gz)| | en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_craft_md-0.4.0.tar.gz)| | en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_jnlpba_md-0.4.0.tar.gz)| | en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bc5cdr_md-0.4.0.tar.gz)| diff --git a/docs/index.md b/docs/index.md index 8edb987f..f165b749 100644 --- a/docs/index.md +++ b/docs/index.md @@ -19,6 +19,7 @@ pip install |:---------------|:------------------|:----------| | en_core_sci_sm | A full spaCy pipeline for biomedical data. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_sm-0.3.0.tar.gz)| | en_core_sci_md | A full spaCy pipeline for biomedical data with a larger vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_md-0.3.0.tar.gz)| +| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_scibert-0.4.0.tar.gz)| | en_core_sci_lg | A full spaCy pipeline for biomedical data with a larger vocabulary and 600k word vectors. 
|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_lg-0.3.0.tar.gz)| | en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_ner_craft_md-0.3.0.tar.gz)| | en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_ner_jnlpba_md-0.3.0.tar.gz)| @@ -37,6 +38,7 @@ Our models achieve performance within 3% of published state of the art dependenc | en_core_sci_sm | 89.75| 87.80 | 98.41 | 67.92 | 88.18 | | en_core_sci_md | 90.36| 88.53 | 98.49 | 68.58 | 88.27 | | en_core_sci_lg | 90.30| 88.48 | 98.53 | 69.21 | 88.67 | +| en_core_sci_lg | 92.03| 90.25 | 98.91 | 67.91 | 92.21 | | model | F1 | Entity Types| From 76de264564be4cb0310ad484ad514eb39058831c Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Fri, 12 Feb 2021 14:34:23 -0800 Subject: [PATCH 15/15] PR --- docs/index.md | 2 +- project.yml | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/index.md b/docs/index.md index f165b749..7156a6be 100644 --- a/docs/index.md +++ b/docs/index.md @@ -38,7 +38,7 @@ Our models achieve performance within 3% of published state of the art dependenc | en_core_sci_sm | 89.75| 87.80 | 98.41 | 67.92 | 88.18 | | en_core_sci_md | 90.36| 88.53 | 98.49 | 68.58 | 88.27 | | en_core_sci_lg | 90.30| 88.48 | 98.53 | 69.21 | 88.67 | -| en_core_sci_lg | 92.03| 90.25 | 98.91 | 67.91 | 92.21 | +| en_core_sci_scibert | 92.03| 90.25 | 98.91 | 67.91 | 92.21 | | model | F1 | Entity Types| diff --git a/project.yml b/project.yml index 5f7e63c3..5a0ca6d5 100644 --- a/project.yml +++ b/project.yml @@ -624,7 +624,3 @@ commands: - "packages/bionlp13cg_results.json" - "packages/craft_results.json" - "packages/jnlpba_results.json" - - - - # TODOs: evaluate everything, package command, another evaluate of the package, twiddle params, maybe add a command for uploading models and releasing?
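
The rows added to README.md and docs/index.md above describe the released en_core_sci_scibert package. Here is a minimal usage sketch, assuming the tarball linked in those tables has been pip-installed alongside spacy>=3.0 and spacy-transformers; the GPU hint and the example sentence are illustrative assumptions rather than part of the patches.

import spacy

# Optional: the transformer pipeline runs on CPU but is much faster on GPU.
# spacy.require_gpu()

nlp = spacy.load("en_core_sci_scibert")
doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an inherited motor neuron disease.")

# Entities from the MedMentions-trained NER head.
print([(ent.text, ent.label_) for ent in doc.ents])

# Tags and dependencies from the GENIA/OntoNotes-trained tagger and parser.
print([(token.text, token.tag_, token.dep_) for token in doc])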