From bdf2ba61577a0aed3d5b259c249dccc2c81b6761 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Wed, 17 Aug 2022 14:03:12 -0700
Subject: [PATCH 01/17] update spacy and scispacy versions

---
 requirements.in     | 2 +-
 scispacy/version.py | 2 +-
 setup.py            | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/requirements.in b/requirements.in
index 8351a3a..3cc4a36 100644
--- a/requirements.in
+++ b/requirements.in
@@ -1,5 +1,5 @@
 numpy
-spacy>=3.2.0,<3.3.0
+spacy>=3.4.0,<3.5.0
 spacy-lookups-data
 pandas
 requests>=2.0.0,<3.0.0
diff --git a/scispacy/version.py b/scispacy/version.py
index c8b11da..b7139a8 100644
--- a/scispacy/version.py
+++ b/scispacy/version.py
@@ -1,6 +1,6 @@
 _MAJOR = "0"
 _MINOR = "5"
-_REVISION = "0"
+_REVISION = "1"

 VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
 VERSION = "{0}.{1}.{2}".format(_MAJOR, _MINOR, _REVISION)
diff --git a/setup.py b/setup.py
index c972d5c..91373dc 100644
--- a/setup.py
+++ b/setup.py
@@ -41,7 +41,7 @@ packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
     license="Apache",
     install_requires=[
-        "spacy>=3.2.0,<3.3.0",
+        "spacy>=3.4.0,<3.5.0",
         "requests>=2.0.0,<3.0.0",
         "conllu",
         "numpy",

From bed5d79325fa7f0fa520c741a1e6fefacaf619e2 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Wed, 17 Aug 2022 14:04:55 -0700
Subject: [PATCH 02/17] Update configs with new tagger version

---
 configs/base_parser_tagger.cfg         | 3 ++-
 configs/base_parser_tagger_scibert.cfg | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/configs/base_parser_tagger.cfg b/configs/base_parser_tagger.cfg
index cec9e47..738c85c 100644
--- a/configs/base_parser_tagger.cfg
+++ b/configs/base_parser_tagger.cfg
@@ -55,8 +55,9 @@ upstream = "*"
 factory = "tagger"

 [components.tagger.model]
-@architectures = "spacy.Tagger.v1"
+@architectures = "spacy.Tagger.v2"
 nO = null
+normalize = false

 [components.tagger.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"
diff --git a/configs/base_parser_tagger_scibert.cfg b/configs/base_parser_tagger_scibert.cfg
index 016a395..65b8f2f 100644
--- a/configs/base_parser_tagger_scibert.cfg
+++ b/configs/base_parser_tagger_scibert.cfg
@@ -54,8 +54,9 @@ upstream = "*"
 factory = "tagger"

 [components.tagger.model]
-@architectures = "spacy.Tagger.v1"
+@architectures = "spacy.Tagger.v2"
 nO = null
+normalize = false

 [components.tagger.model.tok2vec]
 @architectures = "spacy-transformers.TransformerListener.v1"

From a0d54b149cc66cdfbc8ba6e00c9cb9d9decda4eb Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Wed, 17 Aug 2022 14:13:45 -0700
Subject: [PATCH 03/17] Add gpu args everywhere

---
 project.yml | 84 ++++++++++++++++++++++++++---------------------------
 1 file changed, 41 insertions(+), 43 deletions(-)

diff --git a/project.yml b/project.yml
index 2f11d1a..42eae35 100644
--- a/project.yml
+++ b/project.yml
@@ -2,8 +2,8 @@
 title: "scispaCy pipeline"
 description: "All the steps needed in the scispaCy pipeline"
 vars:
-  version_string: "0.5.0"
-  gpu_id: "0"
+  version_string: "0.5.1"
+  gpu_id: 0
   freqs_loc_s3: "s3://ai2-s2-scispacy/data/gorc_subset.freqs"
   freqs_loc_local: "assets/gorc_subset.freqs"
   vectors_loc_s3: "s3://ai2-s2-scispacy/data/pubmed_with_header.txt.gz"
@@ -260,7 +260,7 @@ commands:
  - name: parser-tagger-train-sm
    help: "Train the base models"
    script:
      - "spacy train ${vars.parser_tagger_config_loc} --output ${vars.parser_tagger_sm_loc} --code ${vars.code_loc} --paths.vocab_path ${vars.vocab_sm_loc} --vars.include_static_vectors False --gpu-id ${vars.gpu_id}"
    deps:
      - "${vars.parser_tagger_config_loc}"
      - "${vars.genia_train_spacy_loc}"
      - "${vars.genia_dev_spacy_loc}"
      - "${vars.genia_test_spacy_loc}"
      - "${vars.vocab_sm_loc}"
    outputs:
      - "${vars.parser_tagger_sm_loc}"

  - name: parser-tagger-train-md
    help: "Train the base models"
    script:
      - "spacy train ${vars.parser_tagger_config_loc} --output ${vars.parser_tagger_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.vocab_path ${vars.vocab_md_loc} --vars.include_static_vectors True --gpu-id ${vars.gpu_id}"
    deps:
      - "${vars.parser_tagger_config_loc}"
      - "${vars.genia_train_spacy_loc}"
      - "${vars.genia_dev_spacy_loc}"
      - "${vars.genia_test_spacy_loc}"
      - "${vars.vectors_md_loc}"
      - "${vars.vocab_md_loc}"
    outputs:
      - "${vars.parser_tagger_md_loc}"

  - name: parser-tagger-train-lg
    help: "Train the base models"
    script:
      - "spacy train ${vars.parser_tagger_config_loc} --output ${vars.parser_tagger_lg_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_lg_loc} --paths.vocab_path ${vars.vocab_lg_loc} --vars.include_static_vectors True --gpu-id ${vars.gpu_id}"
    deps:
      - "${vars.parser_tagger_config_loc}"
      - "${vars.genia_train_spacy_loc}"
      - "${vars.genia_dev_spacy_loc}"
      - "${vars.genia_test_spacy_loc}"
      - "${vars.vectors_lg_loc}"
      - "${vars.vocab_lg_loc}"
    outputs:
      - "${vars.parser_tagger_lg_loc}"

  - name: ner-train-sm
    help: "Train the main ner"
    script:
      - "spacy train ${vars.ner_config_loc} --output ${vars.ner_sm_loc} --code ${vars.code_loc} --paths.parser_tagger_path ${vars.parser_tagger_sm_loc}/model-best --paths.vocab_path ${vars.vocab_sm_loc} --vars.include_static_vectors False --gpu-id ${vars.gpu_id}"
    deps:
      - "${vars.ner_config_loc}"
      - "${vars.parser_tagger_sm_loc}/model-best"
      - "${vars.corpus_pubtator_loc_local}"
      - "${vars.vocab_sm_loc}"
    outputs:
      - "${vars.ner_sm_loc}"

  - name: ner-train-md
    help: "Train the main ner"
    script:
      - "spacy train ${vars.ner_config_loc} --output ${vars.ner_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.vocab_path ${vars.vocab_md_loc} --vars.include_static_vectors True --gpu-id ${vars.gpu_id}"
    deps:
      - "${vars.ner_config_loc}"
      - "${vars.parser_tagger_md_loc}/model-best"
      - "${vars.corpus_pubtator_loc_local}"
      - "${vars.vectors_md_loc}"
      - "${vars.vocab_md_loc}"
    outputs:
      - "${vars.ner_md_loc}"

  - name: ner-train-lg
    help: "Train the main ner"
    script:
      - "spacy train ${vars.ner_config_loc} --output ${vars.ner_lg_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_lg_loc} --paths.parser_tagger_path ${vars.parser_tagger_lg_loc}/model-best --paths.vocab_path ${vars.vocab_lg_loc} --vars.include_static_vectors True --gpu-id ${vars.gpu_id}"
    deps:
      - "${vars.ner_config_loc}"
      - "${vars.parser_tagger_lg_loc}/model-best"
      - "${vars.corpus_pubtator_loc_local}"
      - "${vars.vectors_lg_loc}"
      - "${vars.vocab_lg_loc}"
    outputs:
      - "${vars.ner_lg_loc}"

  - name: ner-train-specialized
    help: "Train the specialized NER models"
    script:
      - "spacy train ${vars.specialized_ner_config_loc} --output ${vars.bc5cdr_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.train_path ${vars.bc5cdr_loc_local}/train.tsv --paths.dev_path ${vars.bc5cdr_loc_local}/devel.tsv --paths.vocab_path ${vars.vocab_md_loc} --vars.include_static_vectors True --gpu-id ${vars.gpu_id}"
      - "spacy train ${vars.specialized_ner_config_loc} --output ${vars.bionlp13cg_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.train_path ${vars.bionlp13cg_loc_local}/train.tsv --paths.dev_path ${vars.bionlp13cg_loc_local}/devel.tsv --paths.vocab_path ${vars.vocab_md_loc} --vars.include_static_vectors True --gpu-id ${vars.gpu_id}"
      - "spacy train ${vars.specialized_ner_config_loc} --output ${vars.craft_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.train_path ${vars.craft_loc_local}/train.tsv --paths.dev_path ${vars.craft_loc_local}/devel.tsv --paths.vocab_path ${vars.vocab_md_loc} --vars.include_static_vectors True --gpu-id ${vars.gpu_id}"
      - "spacy train ${vars.specialized_ner_config_loc} --output ${vars.jnlpba_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.train_path ${vars.jnlpba_loc_local}/train.tsv --paths.dev_path ${vars.jnlpba_loc_local}/devel.tsv --paths.vocab_path ${vars.vocab_md_loc} --vars.include_static_vectors True --gpu-id ${vars.gpu_id}"
    deps:
"${vars.corpus_pubtator_loc_local}" - "${vars.bc5cdr_loc_local}/train.tsv" @@ -389,8 +389,8 @@ commands: - name: evaluate-parser-tagger-sm help: "Evaluate the parser and tagger" script: - - "spacy evaluate ${vars.parser_tagger_sm_loc}/model-best ${vars.genia_test_spacy_loc} --output ${vars.parser_tagger_sm_loc}/model_best_results.json" - - "spacy evaluate ${vars.parser_tagger_sm_loc}/model-best ${vars.ontonotes_test_spacy_loc} --output ${vars.parser_tagger_sm_loc}/model_best_results_onto.json" + - "spacy evaluate ${vars.parser_tagger_sm_loc}/model-best ${vars.genia_test_spacy_loc} --output ${vars.parser_tagger_sm_loc}/model_best_results.json --gpu-id ${vars.gpu_id}" + - "spacy evaluate ${vars.parser_tagger_sm_loc}/model-best ${vars.ontonotes_test_spacy_loc} --output ${vars.parser_tagger_sm_loc}/model_best_results_onto.json --gpu-id ${vars.gpu_id}" deps: - "${vars.parser_tagger_sm_loc}/model-best" - "${vars.genia_test_spacy_loc}" @@ -402,8 +402,8 @@ commands: - name: evaluate-parser-tagger-md help: "Evaluate the parser and tagger" script: - - "spacy evaluate ${vars.parser_tagger_md_loc}/model-best ${vars.genia_test_spacy_loc} --output ${vars.parser_tagger_md_loc}/model_best_results.json" - - "spacy evaluate ${vars.parser_tagger_md_loc}/model-best ${vars.ontonotes_test_spacy_loc} --output ${vars.parser_tagger_md_loc}/model_best_results_onto.json" + - "spacy evaluate ${vars.parser_tagger_md_loc}/model-best ${vars.genia_test_spacy_loc} --output ${vars.parser_tagger_md_loc}/model_best_results.json --gpu-id ${vars.gpu_id}" + - "spacy evaluate ${vars.parser_tagger_md_loc}/model-best ${vars.ontonotes_test_spacy_loc} --output ${vars.parser_tagger_md_loc}/model_best_results_onto.json --gpu-id ${vars.gpu_id}" deps: - "${vars.parser_tagger_md_loc}/model-best" - "${vars.genia_test_spacy_loc}" @@ -415,8 +415,8 @@ commands: - name: evaluate-parser-tagger-lg help: "Evaluate the parser and tagger" script: - - "spacy evaluate ${vars.parser_tagger_lg_loc}/model-best ${vars.genia_test_spacy_loc} --output ${vars.parser_tagger_lg_loc}/model_best_results.json" - - "spacy evaluate ${vars.parser_tagger_lg_loc}/model-best ${vars.ontonotes_test_spacy_loc} --output ${vars.parser_tagger_lg_loc}/model_best_results_onto.json" + - "spacy evaluate ${vars.parser_tagger_lg_loc}/model-best ${vars.genia_test_spacy_loc} --output ${vars.parser_tagger_lg_loc}/model_best_results.json --gpu-id ${vars.gpu_id}" + - "spacy evaluate ${vars.parser_tagger_lg_loc}/model-best ${vars.ontonotes_test_spacy_loc} --output ${vars.parser_tagger_lg_loc}/model_best_results_onto.json --gpu-id ${vars.gpu_id}" deps: - "${vars.parser_tagger_lg_loc}/model-best" - "${vars.genia_test_spacy_loc}" @@ -428,8 +428,8 @@ commands: - name: evaluate-parser-tagger-scibert help: "Evaluate the parser and tagger scibert model" script: - - "spacy evaluate ${vars.parser_tagger_scibert_loc}/model-best ${vars.genia_test_spacy_loc} --output ${vars.parser_tagger_scibert_loc}/model_best_results.json --gpu-id ${vars.gpu_id}" - - "spacy evaluate ${vars.parser_tagger_scibert_loc}/model-best ${vars.ontonotes_test_spacy_loc} --output ${vars.parser_tagger_scibert_loc}/model_best_results_onto.json --gpu-id ${vars.gpu_id}" + - "spacy evaluate ${vars.parser_tagger_scibert_loc}/model-best ${vars.genia_test_spacy_loc} --output ${vars.parser_tagger_scibert_loc}/model_best_results.json --gpu-id ${vars.gpu_id} --gpu-id ${vars.gpu_id}" + - "spacy evaluate ${vars.parser_tagger_scibert_loc}/model-best ${vars.ontonotes_test_spacy_loc} --output 
      - "spacy evaluate ${vars.parser_tagger_scibert_loc}/model-best ${vars.ontonotes_test_spacy_loc} --output ${vars.parser_tagger_scibert_loc}/model_best_results_onto.json --gpu-id ${vars.gpu_id}"
    deps:
      - "${vars.parser_tagger_scibert_loc}/model-best"
      - "${vars.genia_test_spacy_loc}"

  - name: evaluate-ner-sm
    help: "Evaluate NER"
    script:
      - "python scripts/evaluate_ner.py --model_path ${vars.ner_sm_loc}/model-best --dataset medmentions-test --output ${vars.ner_sm_loc}/model_best_results.json --med_mentions_folder_path assets/ --gpu_id ${vars.gpu_id}"
    deps:
      - "${vars.ner_sm_loc}"
      - "${vars.corpus_pubtator_loc_local}"

  - name: evaluate-ner-md
    help: "Evaluate NER"
    script:
      - "python scripts/evaluate_ner.py --model_path ${vars.ner_md_loc}/model-best --dataset medmentions-test --output ${vars.ner_md_loc}/model_best_results.json --med_mentions_folder_path assets/ --gpu_id ${vars.gpu_id}"
    deps:
      - "${vars.ner_md_loc}"
      - "${vars.corpus_pubtator_loc_local}"

  - name: evaluate-ner-lg
    help: "Evaluate NER"
    script:
      - "python scripts/evaluate_ner.py --model_path ${vars.ner_lg_loc}/model-best --dataset medmentions-test --output ${vars.ner_lg_loc}/model_best_results.json --med_mentions_folder_path assets/ --gpu_id ${vars.gpu_id}"
    deps:
      - "${vars.ner_lg_loc}"
      - "${vars.corpus_pubtator_loc_local}"

  - name: evaluate-specialized-ner
    help: "Evaluate specialized NER"
    script:
      - "python scripts/evaluate_ner.py --model_path ${vars.bc5cdr_md_loc}/model-best --dataset ${vars.bc5cdr_loc_local}/test.tsv --output ${vars.bc5cdr_md_loc}/model_best_results.json --gpu_id ${vars.gpu_id}"
      - "python scripts/evaluate_ner.py --model_path ${vars.bionlp13cg_md_loc}/model-best --dataset ${vars.bionlp13cg_loc_local}/test.tsv --output ${vars.bionlp13cg_md_loc}/model_best_results.json --gpu_id ${vars.gpu_id}"
      - "python scripts/evaluate_ner.py --model_path ${vars.craft_md_loc}/model-best --dataset ${vars.craft_loc_local}/test.tsv --output ${vars.craft_md_loc}/model_best_results.json --gpu_id ${vars.gpu_id}"
${vars.gpu_id}" deps: - "${vars.bc5cdr_md_loc}/model-best" - "${vars.bionlp13cg_md_loc}/model-best" @@ -512,9 +512,9 @@ commands: - name: evaluate-package-sm help: "Evaluate the packaged models" script: - - "spacy evaluate ${vars.package_sm_loc} ${vars.genia_test_spacy_loc} --output packages/sm_genia_results.json" - - "spacy evaluate ${vars.package_sm_loc} ${vars.ontonotes_test_spacy_loc} --output packages/sm_onto_results.json" - - "python scripts/evaluate_ner.py --model_path ${vars.package_sm_loc} --dataset medmentions-test --output packages/sm_mm_results.json --med_mentions_folder_path assets/" + - "spacy evaluate ${vars.package_sm_loc} ${vars.genia_test_spacy_loc} --output packages/sm_genia_results.json --gpu-id ${vars.gpu_id}" + - "spacy evaluate ${vars.package_sm_loc} ${vars.ontonotes_test_spacy_loc} --output packages/sm_onto_results.json --gpu-id ${vars.gpu_id}" + - "python scripts/evaluate_ner.py --model_path ${vars.package_sm_loc} --dataset medmentions-test --output packages/sm_mm_results.json --med_mentions_folder_path assets/ --gpu_id ${vars.gpu_id}" deps: - "${vars.package_sm_loc}" outputs: @@ -553,9 +553,9 @@ commands: - name: evaluate-package-md help: "Evaluate the packaged models" script: - - "spacy evaluate ${vars.package_md_loc} ${vars.genia_test_spacy_loc} --output packages/md_genia_results.json" - - "spacy evaluate ${vars.package_md_loc} ${vars.ontonotes_test_spacy_loc} --output packages/md_onto_results.json" - - "python scripts/evaluate_ner.py --model_path ${vars.package_md_loc} --dataset medmentions-test --output packages/md_mm_results.json --med_mentions_folder_path assets/" + - "spacy evaluate ${vars.package_md_loc} ${vars.genia_test_spacy_loc} --output packages/md_genia_results.json --gpu-id ${vars.gpu_id}" + - "spacy evaluate ${vars.package_md_loc} ${vars.ontonotes_test_spacy_loc} --output packages/md_onto_results.json --gpu-id ${vars.gpu_id}" + - "python scripts/evaluate_ner.py --model_path ${vars.package_md_loc} --dataset medmentions-test --output packages/md_mm_results.json --med_mentions_folder_path assets/ --gpu_id ${vars.gpu_id}" deps: - "${vars.package_md_loc}" outputs: @@ -566,9 +566,9 @@ commands: - name: evaluate-package-lg help: "Evaluate the packaged models" script: - - "spacy evaluate ${vars.package_lg_loc} ${vars.genia_test_spacy_loc} --output packages/lg_genia_results.json" - - "spacy evaluate ${vars.package_lg_loc} ${vars.ontonotes_test_spacy_loc} --output packages/lg_onto_results.json" - - "python scripts/evaluate_ner.py --model_path ${vars.package_lg_loc} --dataset medmentions-test --output packages/lg_mm_results.json --med_mentions_folder_path assets/" + - "spacy evaluate ${vars.package_lg_loc} ${vars.genia_test_spacy_loc} --output packages/lg_genia_results.json --gpu-id ${vars.gpu_id}" + - "spacy evaluate ${vars.package_lg_loc} ${vars.ontonotes_test_spacy_loc} --output packages/lg_onto_results.json --gpu-id ${vars.gpu_id}" + - "python scripts/evaluate_ner.py --model_path ${vars.package_lg_loc} --dataset medmentions-test --output packages/lg_mm_results.json --med_mentions_folder_path assets/ --gpu_id ${vars.gpu_id}" deps: - "${vars.package_lg_loc}" outputs: @@ -580,7 +580,7 @@ commands: help: "Evaluate the packaged scibert model" script: - "spacy evaluate ${vars.package_scibert_loc} ${vars.genia_test_spacy_loc} --output packages/scibert_genia_results.json --gpu-id ${vars.gpu_id}" - - "spacy evaluate ${vars.package_scibert_loc} ${vars.ontonotes_test_spacy_loc} --output packages/scibert_onto_results.json --gpu-id ${var.gpu_id}" + - "spacy evaluate 
From f3e25283ccfead9784a204d0c1aee6c600f7e71c Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Wed, 17 Aug 2022 14:15:19 -0700
Subject: [PATCH 04/17] Fix numpy warning in candidate generation

---
 scispacy/candidate_generation.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/scispacy/candidate_generation.py b/scispacy/candidate_generation.py
index 7bcf934..e7f3981 100644
--- a/scispacy/candidate_generation.py
+++ b/scispacy/candidate_generation.py
@@ -281,8 +281,12 @@ def nmslib_knn_with_zero_vectors(
             neighbors.append([])
             distances.append([])
         # interleave `neighbors` and Nones in `extended_neighbors`
-        extended_neighbors[empty_vectors_boolean_flags] = numpy.array(neighbors)[:-1]
-        extended_distances[empty_vectors_boolean_flags] = numpy.array(distances)[:-1]
+        extended_neighbors[empty_vectors_boolean_flags] = numpy.array(
+            neighbors, dtype=object
+        )[:-1]
+        extended_distances[empty_vectors_boolean_flags] = numpy.array(
+            distances, dtype=object
+        )[:-1]

         return extended_neighbors, extended_distances
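The `dtype=object` change in patch 04 is what recent NumPy versions require when building an array from ragged (unequal-length) lists: without it, NumPy ~1.20-1.23 emits a `VisibleDeprecationWarning` and NumPy 1.24+ raises a `ValueError`. A small standalone repro, offered as an illustration rather than as scispacy's code:

```python
import numpy

ragged = [[1, 2, 3], [4], []]

# numpy.array(ragged) warns on NumPy ~1.20-1.23 and raises on NumPy >= 1.24;
# an explicit object dtype keeps the old "1-D array of Python lists" behavior
arr = numpy.array(ragged, dtype=object)
print(arr.shape)  # (3,)
```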
From a5a826cb89cff12ca756309af7159c9830763e1f Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Wed, 17 Aug 2022 14:20:46 -0700
Subject: [PATCH 05/17] Add scibert to all workflow

---
 project.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/project.yml b/project.yml
index 42eae35..75cf02f 100644
--- a/project.yml
+++ b/project.yml
@@ -131,25 +131,31 @@ workflows:
   all:
     - parser-tagger-train-sm
     - parser-tagger-train-md
     - parser-tagger-train-lg
+    - parser-tagger-train-scibert
     - ner-train-sm
     - ner-train-md
     - ner-train-lg
     - ner-train-specialized
+    - ner-train-scibert
     - evaluate-parser-tagger-sm
     - evaluate-parser-tagger-md
     - evaluate-parser-tagger-lg
+    - evaluate-parser-tagger-scibert
     - evaluate-ner-sm
     - evaluate-ner-md
     - evaluate-ner-lg
     - evaluate-specialized-ner
+    - evaluate-ner-scibert
     - package-sm
     - package-md
     - package-lg
     - package-ner
+    - package-scibert
     - evaluate-package-sm
     - evaluate-package-md
     - evaluate-package-lg
     - evaluate-package-ner
+    - evaluate-package-scibert

 commands:
   - name: download

From 483a6a954b32ee4b2020de723ae34833d4b1809d Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Sat, 27 Aug 2022 21:35:25 -0700
Subject: [PATCH 06/17] Add convenience scripts for testing models before release

---
 scripts/install_local_packages.py   | 33 +++++++++++
 scripts/smoke_test.py               | 86 +++++++++++++++++++++++++++++
 scripts/uninstall_local_packages.py | 23 ++++++++
 3 files changed, 142 insertions(+)
 create mode 100644 scripts/install_local_packages.py
 create mode 100644 scripts/smoke_test.py
 create mode 100644 scripts/uninstall_local_packages.py

diff --git a/scripts/install_local_packages.py b/scripts/install_local_packages.py
new file mode 100644
index 0000000..6f1e293
--- /dev/null
+++ b/scripts/install_local_packages.py
@@ -0,0 +1,33 @@
import os

from scispacy.version import VERSION


def main():
    model_names = [
        "en_core_sci_sm",
        "en_core_sci_md",
        "en_core_sci_lg",
        "en_core_sci_scibert",
        "en_ner_bc5cdr_md",
        "en_ner_craft_md",
        "en_ner_bionlp13cg_md",
        "en_ner_jnlpba_md",
    ]

    full_package_paths = [
        os.path.join(
            "packages",
            f"{model_name}-{VERSION}",
            "dist",
            f"{model_name}-{VERSION}.tar.gz",
        )
        for model_name in model_names
    ]

    for package_path in full_package_paths:
        os.system(f"pip install {package_path}")


if __name__ == "__main__":
    main()
diff --git a/scripts/smoke_test.py b/scripts/smoke_test.py
new file mode 100644
index 0000000..cd488d6
--- /dev/null
+++ b/scripts/smoke_test.py
@@ -0,0 +1,86 @@
import spacy
from tqdm import tqdm

from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker


def main():
    print("Testing core models...")
    print()
    model_names = [
        "en_core_sci_sm",
        "en_core_sci_md",
        "en_core_sci_lg",
        "en_core_sci_scibert",
        "en_ner_bc5cdr_md",
        "en_ner_craft_md",
        "en_ner_bionlp13cg_md",
        "en_ner_jnlpba_md",
    ]

    models = [
        spacy.load(model_name)
        for model_name in tqdm(model_names, desc="Loading core models")
    ]

    text = (
        "This first sentence mentions John. "
        "John uses IL gene and interleukin-2 to treat diabetes and "
        "aspirin as proteins for arms and legs on lemurs and humans."
    )

    for model_name, model in zip(model_names, models):
        print(f"Testing {model_name}")
        doc = model(text)
        for sentence in doc.sents:
            print([t.text for t in sentence])
            print([t.lemma_ for t in sentence])
            print([t.pos_ for t in sentence])
            print([t.tag_ for t in sentence])
            print([t.dep_ for t in sentence])
            print([t.ent_type_ for t in sentence])
            print()
        print()

    print("Testing abbreviation detector...")
    abbreviation_nlp = spacy.load("en_core_sci_sm")
    abbreviation_nlp.add_pipe("abbreviation_detector")
    abbreviation_text = (
        "Spinal and bulbar muscular atrophy (SBMA) is an inherited "
        "motor neuron disease caused by the expansion of a polyglutamine "
        "tract within the androgen receptor (AR). SBMA can be caused by this easily."
    )
    abbreviation_doc = abbreviation_nlp(abbreviation_text)
    for abbreviation in abbreviation_doc._.abbreviations:
        print(
            f"{abbreviation} \t ({abbreviation.start}, {abbreviation.end}) {abbreviation._.long_form}"
        )
    print()

    print("Testing entity linkers...")
    print()
    ontology_names = ["umls", "mesh", "rxnorm", "go", "hpo"]
    ontology_models = [spacy.load("en_core_sci_sm") for _ in ontology_names]
    for ontology_name, ontology_model in tqdm(
        zip(ontology_names, ontology_models), desc="Adding entity linker pipes"
    ):
        ontology_model.add_pipe(
            "scispacy_linker",
            config={"resolve_abbreviations": False, "linker_name": ontology_name},
        )

    linking_text = "Diabetes is a disease that affects humans."
    for ontology_name, ontology_model in zip(ontology_names, ontology_models):
        print(f"Testing {ontology_name} linker...")
        linker_pipe = ontology_model.get_pipe("scispacy_linker")
        doc = ontology_model(linking_text)
        for entity in doc.ents:
            print("Entity name: ", entity)
            for ontology_entity in entity._.kb_ents[:1]:
                print(linker_pipe.kb.cui_to_entity[ontology_entity[0]])
        print()


if __name__ == "__main__":
    main()
diff --git a/scripts/uninstall_local_packages.py b/scripts/uninstall_local_packages.py
new file mode 100644
index 0000000..d14d69f
--- /dev/null
+++ b/scripts/uninstall_local_packages.py
@@ -0,0 +1,23 @@
import os

from scispacy.version import VERSION


def main():
    model_names = [
        "en_core_sci_sm",
        "en_core_sci_md",
        "en_core_sci_lg",
        "en_core_sci_scibert",
        "en_ner_bc5cdr_md",
        "en_ner_craft_md",
        "en_ner_bionlp13cg_md",
        "en_ner_jnlpba_md",
    ]

    for package_name in model_names:
        os.system(f"pip uninstall {package_name}")


if __name__ == "__main__":
    main()
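The three scripts in patch 06 are intended to run in sequence around a release: install the freshly built local packages, smoke-test every pipeline interactively, then uninstall. A possible wrapper for that sequence, assuming it is run from the repository root; the chaining itself is an editorial sketch, not part of the patch:

```python
import subprocess

# install the locally built models, exercise every pipeline, then clean up
for script in (
    "scripts/install_local_packages.py",
    "scripts/smoke_test.py",
    "scripts/uninstall_local_packages.py",
):
    subprocess.run(["python", script], check=True)
```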
From bfe27f5cd366104dd5eaba678b579735b08a480f Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Sat, 27 Aug 2022 21:36:02 -0700
Subject: [PATCH 07/17] Fix a couple of file dependencies in project.yml

---
 project.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/project.yml b/project.yml
index 75cf02f..58a8788 100644
--- a/project.yml
+++ b/project.yml
@@ -309,7 +309,7 @@ commands:
    script:
      - "spacy train ${vars.parser_tagger_scibert_config_loc} --output ${vars.parser_tagger_scibert_loc} --code ${vars.code_loc} --paths.vocab_path ${vars.vocab_lg_loc} --gpu-id ${vars.gpu_id}"
    deps:
-     - "${vars.parser_tagger_config_loc}"
+     - "${vars.parser_tagger_scibert_config_loc}"
      - "${vars.genia_train_spacy_loc}"
      - "${vars.genia_dev_spacy_loc}"
      - "${vars.genia_test_spacy_loc}"
@@ -357,7 +357,7 @@ commands:
    script:
      - "spacy train ${vars.ner_scibert_config_loc} --output ${vars.ner_scibert_loc} --code ${vars.code_loc} --paths.parser_tagger_path ${vars.parser_tagger_scibert_loc}/model-best --gpu-id ${vars.gpu_id}"
    deps:
-     - "${vars.ner_config_loc}"
+     - "${vars.ner_scibert_config_loc}"
      - "${vars.parser_tagger_scibert_loc}/model-best"
      - "${vars.corpus_pubtator_loc_local}"
    outputs:
From ffe630de68de9bcd0cc7a5fef15994fe78e38cfe Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Sat, 27 Aug 2022 21:36:42 -0700
Subject: [PATCH 08/17] Update scibert config to better match spacy's
 transformer configuration. Attempting to fix scibert sentence splitting

---
 configs/base_parser_tagger_scibert.cfg | 42 ++++++++++++++------------
 1 file changed, 22 insertions(+), 20 deletions(-)

diff --git a/configs/base_parser_tagger_scibert.cfg b/configs/base_parser_tagger_scibert.cfg
index 65b8f2f..fd13a56 100644
--- a/configs/base_parser_tagger_scibert.cfg
+++ b/configs/base_parser_tagger_scibert.cfg
@@ -13,7 +13,6 @@ seed = 0
 [nlp]
 lang = "en"
 pipeline = ["transformer","tagger","attribute_ruler","lemmatizer","parser"]
-batch_size = 256
 tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
 disabled = []
 before_creation = null

 update_with_oracle_cut_size = 100
 @architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
-hidden_width = 128
-maxout_pieces = 3
+hidden_width = 64
+maxout_pieces = 2
 use_upper = false
 nO = null

 @architectures = "spacy-transformers.TransformerListener.v1"
 grad_factor = 1.0
 pooling = {"@layers":"reduce_mean.v1"}
-upstream = "*"
+upstream = "transformer"

 [components.tagger]
 factory = "tagger"
+neg_prefix = "!"
+overwrite = false

 [components.tagger.model]
 @architectures = "spacy.Tagger.v2"
 nO = null
 normalize = false

 [components.tagger.model.tok2vec]
 @architectures = "spacy-transformers.TransformerListener.v1"
 grad_factor = 1.0
 pooling = {"@layers":"reduce_mean.v1"}
-upstream = "*"
+upstream = "transformer"

 [components.transformer]
 factory = "transformer"

 train_corpus = "corpora.train"
 seed = ${system.seed}
 gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
-accumulate_gradient = 1
-patience = 0
-max_epochs = 8
-max_steps = 0
-eval_frequency = 2300
+accumulate_gradient = 3
+patience = 5000
+max_epochs = 0
+max_steps = 20000
+eval_frequency = 1000
 frozen_components = ["attribute_ruler", "lemmatizer"]
 before_to_disk = null

 [training.batcher]
-@batchers = "spacy.batch_by_sequence.v1"
+@batchers = "spacy.batch_by_padded.v1"
 get_length = null
-
-[training.batcher.size]
-@schedules = "compounding.v1"
-start = 4
-stop = 12
-compound = 1.001
-t = 0.0
+discard_oversize = true
+size = 2000
+buffer = 256

 [training.logger]
 @loggers = "spacy.ConsoleLogger.v1"

 beta2 = 0.999
 L2_is_weight_decay = true
 L2 = 0.01
 grad_clip = 1.0
-use_averages = false
+use_averages = true
 eps = 0.00000001
-learn_rate = 0.00005
+
+[training.optimizer.learn_rate]
+@schedules = "warmup_linear.v1"
+warmup_steps = 250
+total_steps = 20000
+initial_rate = 0.00005

 [training.score_weights]

From 00aa1a4f973810a550770e49d7d542da3bfa9850 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Sun, 28 Aug 2022 01:05:14 -0700
Subject: [PATCH 09/17] Tweak text in smoke test

---
 scripts/smoke_test.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/scripts/smoke_test.py b/scripts/smoke_test.py
index cd488d6..0cb1efc 100644
--- a/scripts/smoke_test.py
+++ b/scripts/smoke_test.py
@@ -25,7 +25,7 @@ def main():
     ]

     text = (
-        "This first sentence mentions John. "
+        "DNA is a very important part of the cellular structure of the body. "
         "John uses IL gene and interleukin-2 to treat diabetes and "
         "aspirin as proteins for arms and legs on lemurs and humans."
     )

             print([t.ent_type_ for t in sentence])
             print()
         print()
+        input("Continue?")

     print("Testing abbreviation detector...")
     abbreviation_nlp = spacy.load("en_core_sci_sm")

             f"{abbreviation} \t ({abbreviation.start}, {abbreviation.end}) {abbreviation._.long_form}"
         )
     print()
+    input("Continue?")

     print("Testing entity linkers...")
     print()

         ontology_model.add_pipe(
             "scispacy_linker",
             config={"resolve_abbreviations": False, "linker_name": ontology_name},
         )

-    linking_text = "Diabetes is a disease that affects humans."
+    linking_text = "Diabetes is a disease that affects humans and is treated with aspirin via a metabolic process."
     for ontology_name, ontology_model in zip(ontology_names, ontology_models):
         print(f"Testing {ontology_name} linker...")
         linker_pipe = ontology_model.get_pipe("scispacy_linker")

             for ontology_entity in entity._.kb_ents[:1]:
                 print(linker_pipe.kb.cui_to_entity[ontology_entity[0]])
         print()
+        input("Continue?")


 if __name__ == "__main__":
     main()

From 0fe3d79602f3917aaad4e3eeb45b07c2e278d3c3 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Sun, 28 Aug 2022 14:14:19 -0700
Subject: [PATCH 10/17] Update version strings in docs

---
 Dockerfile    |  2 +-
 README.md     | 18 +++++++++---------
 docs/index.md | 16 ++++++++--------
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 396daba..8417070 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,7 +18,7 @@ WORKDIR /work
 COPY requirements.in .
 RUN pip install -r requirements.in

-RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz
+RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz
 RUN python -m spacy download en_core_web_sm
 RUN python -m spacy download en_core_web_md
diff --git a/README.md b/README.md
index 210b36c..5bd7394 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ pip install scispacy
 to install a model (see our full selection of available models below), run a command like the following:

-pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz
+pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz

 Note: We strongly recommend that you use an isolated Python environment (such as virtualenv or conda) to install scispacy.
@@ -76,14 +76,14 @@ pip install CMD-V(to paste the copied URL)

 | Model | Description | Install URL
 |:---------------|:------------------|:----------|
-| en_core_sci_sm | A full spaCy pipeline for biomedical data with a ~100k vocabulary. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz)|
-| en_core_sci_md | A full spaCy pipeline for biomedical data with a ~360k vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_md-0.5.0.tar.gz)|
-| en_core_sci_lg | A full spaCy pipeline for biomedical data with a ~785k vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_lg-0.5.0.tar.gz)|
-| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. You may want to [use a GPU](https://spacy.io/usage#gpu) with this model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_scibert-0.5.0.tar.gz)|
-| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_craft_md-0.5.0.tar.gz)|
-| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_jnlpba_md-0.5.0.tar.gz)|
-| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz)|
-| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bionlp13cg_md-0.5.0.tar.gz)|
+| en_core_sci_sm | A full spaCy pipeline for biomedical data with a ~100k vocabulary. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz)|
+| en_core_sci_md | A full spaCy pipeline for biomedical data with a ~360k vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_md-0.5.1.tar.gz)|
+| en_core_sci_lg | A full spaCy pipeline for biomedical data with a ~785k vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_lg-0.5.1.tar.gz)|
+| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. You may want to [use a GPU](https://spacy.io/usage#gpu) with this model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_scibert-0.5.1.tar.gz)|
+| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_craft_md-0.5.1.tar.gz)|
+| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_jnlpba_md-0.5.1.tar.gz)|
+| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz)|
+| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bionlp13cg_md-0.5.1.tar.gz)|

 ## Additional Pipeline Components

diff --git a/docs/index.md b/docs/index.md
index 8579f9e..25b7150 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -17,14 +17,14 @@ pip install

 | Model | Description | Install URL
 |:---------------|:------------------|:----------|
-| en_core_sci_sm | A full spaCy pipeline for biomedical data. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz)|
-| en_core_sci_md | A full spaCy pipeline for biomedical data with a larger vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_md-0.5.0.tar.gz)|
-| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_scibert-0.5.0.tar.gz)|
-| en_core_sci_lg | A full spaCy pipeline for biomedical data with a larger vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_lg-0.5.0.tar.gz)|
-| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_craft_md-0.5.0.tar.gz)|
-| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_jnlpba_md-0.5.0.tar.gz)|
-| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz)|
-| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bionlp13cg_md-0.5.0.tar.gz)|
+| en_core_sci_sm | A full spaCy pipeline for biomedical data. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz)|
+| en_core_sci_md | A full spaCy pipeline for biomedical data with a larger vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_md-0.5.1.tar.gz)|
+| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_scibert-0.5.1.tar.gz)|
+| en_core_sci_lg | A full spaCy pipeline for biomedical data with a larger vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_lg-0.5.1.tar.gz)|
+| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_craft_md-0.5.1.tar.gz)|
+| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_jnlpba_md-0.5.1.tar.gz)|
+| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz)|
+| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bionlp13cg_md-0.5.1.tar.gz)|

From b0e69473b8ad63b5cb2eacd46c55f369f3e4b6b8 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Sun, 28 Aug 2022 22:52:07 -0700
Subject: [PATCH 11/17] Put transformer config back the way it was

---
 configs/base_parser_tagger_scibert.cfg | 47 ++++++++++++--------------
 1 file changed, 22 insertions(+), 25 deletions(-)

diff --git a/configs/base_parser_tagger_scibert.cfg b/configs/base_parser_tagger_scibert.cfg
index fd13a56..3615299 100644
--- a/configs/base_parser_tagger_scibert.cfg
+++ b/configs/base_parser_tagger_scibert.cfg
@@ -13,6 +13,7 @@ seed = 0
 [nlp]
 lang = "en"
 pipeline = ["transformer","tagger","attribute_ruler","lemmatizer","parser"]
+batch_size = 256
 tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
 disabled = []
 before_creation = null

 update_with_oracle_cut_size = 100
 @architectures = "spacy.TransitionBasedParser.v2"
 state_type = "parser"
 extra_state_tokens = false
-hidden_width = 64
-maxout_pieces = 2
+hidden_width = 128
+maxout_pieces = 3
 use_upper = false
 nO = null

 @architectures = "spacy-transformers.TransformerListener.v1"
 grad_factor = 1.0
 pooling = {"@layers":"reduce_mean.v1"}
-upstream = "transformer"
+upstream = "*"

 [components.tagger]
 factory = "tagger"
-neg_prefix = "!"
-overwrite = false

 [components.tagger.model]
-@architectures = "spacy.Tagger.v2"
+@architectures = "spacy.Tagger.v1"
 nO = null
-normalize = false

 [components.tagger.model.tok2vec]
 @architectures = "spacy-transformers.TransformerListener.v1"
 grad_factor = 1.0
 pooling = {"@layers":"reduce_mean.v1"}
-upstream = "transformer"
+upstream = "*"

 [components.transformer]
 factory = "transformer"

 train_corpus = "corpora.train"
 seed = ${system.seed}
 gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
-accumulate_gradient = 3
-patience = 5000
-max_epochs = 0
-max_steps = 20000
-eval_frequency = 1000
+accumulate_gradient = 1
+patience = 0
+max_epochs = 8
+max_steps = 0
+eval_frequency = 2300
 frozen_components = ["attribute_ruler", "lemmatizer"]
 before_to_disk = null

 [training.batcher]
-@batchers = "spacy.batch_by_padded.v1"
+@batchers = "spacy.batch_by_sequence.v1"
 get_length = null
-discard_oversize = true
-size = 2000
-buffer = 256
+
+[training.batcher.size]
+@schedules = "compounding.v1"
+start = 4
+stop = 12
+compound = 1.001
+t = 0.0

 [training.logger]
 @loggers = "spacy.ConsoleLogger.v1"

 beta2 = 0.999
 L2_is_weight_decay = true
 L2 = 0.01
 grad_clip = 1.0
-use_averages = true
+use_averages = false
 eps = 0.00000001
-
-[training.optimizer.learn_rate]
-@schedules = "warmup_linear.v1"
-warmup_steps = 250
-total_steps = 20000
-initial_rate = 0.00005
+learn_rate = 0.00005

 [training.score_weights]

 [initialize]

 [initialize.before_init]
-@callbacks = "replace_tokenizer"
+@callbacks = "replace_tokenizer"
\ No newline at end of file

From 3825235e6d689186d3c87cef6c09e1a48e943223 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Sun, 28 Aug 2022 22:52:16 -0700
Subject: [PATCH 12/17] Update metrics

---
 docs/index.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/index.md b/docs/index.md
index 25b7150..b1aab56 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -34,18 +34,18 @@ Our models achieve performance within 3% of published state of the art dependency parsers

 | model | UAS | LAS | POS | Mentions (F1) | Web UAS |
 |:---------------|:----|:------|:------|:---|:---|
-| en_core_sci_sm | 89.27| 87.33 | 98.29 | 68.05 | 87.61 |
-| en_core_sci_md | 89.86| 87.92 | 98.43 | 69.32 | 88.05 |
-| en_core_sci_lg | 89.54| 87.66 | 98.29 | 69.52 | 87.68 |
-| en_core_sci_scibert | 92.28| 90.83 | 98.93 | 67.84 | 92.63 |
+| en_core_sci_sm | 89.03| 87.00 | 98.13 | 67.87 | 87.42 |
+| en_core_sci_md | 89.73| 87.85 | 98.40 | 69.53 | 87.79 |
+| en_core_sci_lg | 89.75| 87.79 | 98.49 | 69.69 | 87.74 |
+| en_core_sci_scibert | 92.21| 90.65 | 98.86 | 68.01 | 92.58 |

 | model | F1 | Entity Types|
 |:---------------|:-----|:--------|
-| en_ner_craft_md | 78.35|GGP, SO, TAXON, CHEBI, GO, CL|
-| en_ner_jnlpba_md | 70.89| DNA, CELL_TYPE, CELL_LINE, RNA, PROTEIN |
-| en_ner_bc5cdr_md | 84.70| DISEASE, CHEMICAL|
-| en_ner_bionlp13cg_md | 76.79| AMINO_ACID, ANATOMICAL_SYSTEM, CANCER, CELL, CELLULAR_COMPONENT, DEVELOPING_ANATOMICAL_STRUCTURE, GENE_OR_GENE_PRODUCT, IMMATERIAL_ANATOMICAL_ENTITY, MULTI-TISSUE_STRUCTURE, ORGAN, ORGANISM, ORGANISM_SUBDIVISION, ORGANISM_SUBSTANCE, PATHOLOGICAL_FORMATION, SIMPLE_CHEMICAL, TISSUE |
+| en_ner_craft_md | 76.75|GGP, SO, TAXON, CHEBI, GO, CL|
+| en_ner_jnlpba_md | 72.28| DNA, CELL_TYPE, CELL_LINE, RNA, PROTEIN |
+| en_ner_bc5cdr_md | 84.53| DISEASE, CHEMICAL|
+| en_ner_bionlp13cg_md | 76.57| AMINO_ACID, ANATOMICAL_SYSTEM, CANCER, CELL, CELLULAR_COMPONENT, DEVELOPING_ANATOMICAL_STRUCTURE, GENE_OR_GENE_PRODUCT, IMMATERIAL_ANATOMICAL_ENTITY, MULTI-TISSUE_STRUCTURE, ORGAN, ORGANISM, ORGANISM_SUBDIVISION, ORGANISM_SUBSTANCE, PATHOLOGICAL_FORMATION, SIMPLE_CHEMICAL, TISSUE |

 ### Example Usage

From bd8ab1ad6b7fbb3abc804f2a6fc64bb6e22f3d3f Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Sun, 28 Aug 2022 22:53:01 -0700
Subject: [PATCH 13/17] Add new line back

---
 configs/base_parser_tagger_scibert.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/base_parser_tagger_scibert.cfg b/configs/base_parser_tagger_scibert.cfg
index 3615299..016a395 100644
--- a/configs/base_parser_tagger_scibert.cfg
+++ b/configs/base_parser_tagger_scibert.cfg
@@ -160,4 +160,4 @@ ents_r = 0.0

 [initialize]

 [initialize.before_init]
-@callbacks = "replace_tokenizer"
\ No newline at end of file
+@callbacks = "replace_tokenizer"

From 82c51e0ce75cf72ec9ea340a32121681c6090a53 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Tue, 6 Sep 2022 17:07:27 -0700
Subject: [PATCH 14/17] Convenience script for installing remote packages

---
 scripts/install_remote_packages.py | 28 ++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 scripts/install_remote_packages.py

diff --git a/scripts/install_remote_packages.py b/scripts/install_remote_packages.py
new file mode 100644
index 0000000..60ff0f5
--- /dev/null
+++ b/scripts/install_remote_packages.py
@@ -0,0 +1,28 @@
import os

from scispacy.version import VERSION


def main():
    s3_prefix = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/"
    model_names = [
        "en_core_sci_sm",
        "en_core_sci_md",
        "en_core_sci_lg",
        "en_core_sci_scibert",
        "en_ner_bc5cdr_md",
        "en_ner_craft_md",
        "en_ner_bionlp13cg_md",
        "en_ner_jnlpba_md",
    ]

    full_package_paths = [
        f"{s3_prefix}{model_name}-{VERSION}.tar.gz" for model_name in model_names
    ]

    for package_path in full_package_paths:
        os.system(f"pip install {package_path}")


if __name__ == "__main__":
    main()
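One thing to watch in `install_remote_packages.py` above: the S3 prefix pins `v0.5.1` as a string literal while the filenames are built from the imported `VERSION`, so the two can drift apart on the next version bump. A hedged alternative, assuming release folders always follow the `v{VERSION}` convention, derives both from the one constant:

```python
from scispacy.version import VERSION

# keep the release folder and the package filename in lockstep with scispacy/version.py
s3_prefix = f"https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v{VERSION}/"
package_url = f"{s3_prefix}en_core_sci_sm-{VERSION}.tar.gz"
print(package_url)
```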
From 2369bdf7941cbcbfdd6ad1c31cbb769049fe2d12 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Tue, 6 Sep 2022 17:07:45 -0700
Subject: [PATCH 15/17] Convenience script for getting the metrics to report

---
 scripts/print_out_metrics.py | 45 ++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 scripts/print_out_metrics.py

diff --git a/scripts/print_out_metrics.py b/scripts/print_out_metrics.py
new file mode 100644
index 0000000..4c634b5
--- /dev/null
+++ b/scripts/print_out_metrics.py
@@ -0,0 +1,45 @@
import os
import json


def main():
    core_model_names = ["lg", "md", "sm", "scibert"]
    ner_model_names = ["bc5cdr", "bionlp13cg", "craft", "jnlpba"]

    base_path = "packages"
    for core_model_name in core_model_names:
        print(f"Printing results for {core_model_name}")
        with open(
            os.path.join(base_path, f"{core_model_name}_genia_results.json")
        ) as _genia_results_file:
            genia_results = json.load(_genia_results_file)

        with open(
            os.path.join(base_path, f"{core_model_name}_onto_results.json")
        ) as _onto_results_file:
            onto_results = json.load(_onto_results_file)

        with open(
            os.path.join(base_path, f"{core_model_name}_mm_results.json")
        ) as _mm_results_file:
            mm_results = json.load(_mm_results_file)

        print(f"Genia tag accuracy: {genia_results['tag_acc']}")
        print(f"Genia uas: {genia_results['dep_uas']}")
        print(f"Genia las: {genia_results['dep_las']}")
        print(f"Ontonotes uas: {onto_results['dep_uas']}")
        print(f"MedMentions F1: {mm_results['f1-measure-untyped']}")
        print()

    for ner_model_name in ner_model_names:
        print(f"Printing results for {ner_model_name}")
        with open(
            os.path.join(base_path, f"{ner_model_name}_results.json")
        ) as _ner_results_file:
            ner_results = json.load(_ner_results_file)

        print(f"NER F1: {ner_results['f1-measure-overall']}")


if __name__ == "__main__":
    main()

From 05e67037b85b80067c1ed11e0ca18d62a78cdf78 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Tue, 6 Sep 2022 17:07:54 -0700
Subject: [PATCH 16/17] Fix hyponym test

---
 tests/test_hyponym_detector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_hyponym_detector.py b/tests/test_hyponym_detector.py
index e8ab8f6..ff41c54 100644
--- a/tests/test_hyponym_detector.py
+++ b/tests/test_hyponym_detector.py
@@ -20,7 +20,7 @@ def test_sentences(self):
         )
         doc = self.nlp(text)
         fig_trees = doc[21:23]
-        plant_species = doc[17:19]
+        plant_species = doc[18:19]
         assert doc._.hearst_patterns == [("such_as", plant_species, fig_trees)]

         doc = self.nlp("SARS, or other coronaviruses, are bad.")

From 40ded231f25b781a9bdf89db932ff675c5156ed2 Mon Sep 17 00:00:00 2001
From: Daniel King
Date: Tue, 6 Sep 2022 17:14:01 -0700
Subject: [PATCH 17/17] Actually fix hyponym test

---
 tests/test_hyponym_detector.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_hyponym_detector.py b/tests/test_hyponym_detector.py
index ff41c54..f89f8e3 100644
--- a/tests/test_hyponym_detector.py
+++ b/tests/test_hyponym_detector.py
@@ -20,7 +20,7 @@ def test_sentences(self):
         )
         doc = self.nlp(text)
         fig_trees = doc[21:23]
-        plant_species = doc[18:19]
+        plant_species = doc[16:19]
         assert doc._.hearst_patterns == [("such_as", plant_species, fig_trees)]

         doc = self.nlp("SARS, or other coronaviruses, are bad.")
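The two hyponym-test fixes above (patches 16 and 17) adjust the token span asserted for `plant_species`, apparently tracking parse changes from the spaCy upgrade at the top of this series. For context, the component under test is used roughly as follows; this is a sketch based on the scispacy documentation, and the example sentence is illustrative rather than the exact test fixture:

```python
import spacy
from scispacy.hyponym_detector import HyponymDetector

nlp = spacy.load("en_core_sci_sm")
nlp.add_pipe("hyponym_detector", last=True, config={"extended": False})

doc = nlp("Keystone plant species such as fig trees are good for the soil.")
# each match is (hearst_pattern_name, general_span, specific_span),
# which is why the test asserts ("such_as", plant_species, fig_trees)
print(doc._.hearst_patterns)
```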