From 94bfa82ae9d0a2fa4c65caf5265690fd9d5e7c52 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 3 Mar 2022 23:01:02 -0800 Subject: [PATCH 01/16] typo --- project.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/project.yml b/project.yml index b2983a1..936d85b 100644 --- a/project.yml +++ b/project.yml @@ -130,25 +130,31 @@ workflows: - parser-tagger-train-sm - parser-tagger-train-md - parser-tagger-train-lg + - parser-tagger-train-scibert - ner-train-sm - ner-train-md - ner-train-lg - ner-train-specialized + - ner-train-scibert - evaluate-parser-tagger-sm - evaluate-parser-tagger-md - evaluate-parser-tagger-lg + - evaluate-parser-tagger-scibert - evaluate-ner-sm - evaluate-ner-md - evaluate-ner-lg - evaluate-specialized-ner + - evaluate-ner-scibert - package-sm - package-md - package-lg - package-ner + - package-scibert - evaluate-package-sm - evaluate-package-md - evaluate-package-lg - evaluate-package-ner + - evaluate-package-scibert commands: - name: download @@ -159,7 +165,7 @@ commands: - "aws s3 cp ${vars.genia_loc_s3}/train.json ${vars.genia_train_loc_local} --no-sign-request" - "aws s3 cp ${vars.genia_loc_s3}/dev.json ${vars.genia_dev_loc_local} --no-sign-request" - "aws s3 cp ${vars.genia_loc_s3}/test.json ${vars.genia_test_loc_local} --no-sign-request" - - "aws s3 cp ${vars.ontonotes_loc_s3} ${vars.ontonotes_loc_local}.tar.gz" + # - "aws s3 cp ${vars.ontonotes_loc_s3} ${vars.ontonotes_loc_local}.tar.gz" - "tar -xzvf ${vars.ontonotes_loc_local}.tar.gz -C assets/" - "rm ${vars.ontonotes_loc_local}.tar.gz" - "aws s3 cp ${vars.med_mentions_loc_s3} assets/med_mentions.tar.gz --no-sign-request" From 6b53c1f246ba7a2b5ad8ae0182adc0f684092abf Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 4 Mar 2022 00:07:51 -0800 Subject: [PATCH 02/16] Fix static vectors var --- configs/base_ner.cfg | 5 ++++- configs/base_parser_tagger.cfg | 5 ++++- configs/base_specialized_ner.cfg | 5 ++++- project.yml | 20 ++++++++++---------- 4 files 
changed, 22 insertions(+), 13 deletions(-) diff --git a/configs/base_ner.cfg b/configs/base_ner.cfg index 84b2ca5..60c2153 100644 --- a/configs/base_ner.cfg +++ b/configs/base_ner.cfg @@ -1,3 +1,6 @@ +[vars] +include_static_vectors = null + [paths] vectors = null init_tok2vec = null @@ -47,7 +50,7 @@ nO = null width = 96 attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] rows = [5000, 2500, 2500, 2500] -include_static_vectors = true +include_static_vectors = ${vars.include_static_vectors} [components.ner.model.tok2vec.encode] @architectures = "spacy.MaxoutWindowEncoder.v1" diff --git a/configs/base_parser_tagger.cfg b/configs/base_parser_tagger.cfg index 437170c..70d38f1 100644 --- a/configs/base_parser_tagger.cfg +++ b/configs/base_parser_tagger.cfg @@ -1,3 +1,6 @@ +[vars] +include_static_vectors = null + [paths] genia_train = "project_data/genia_train.spacy" genia_dev = "project_data/genia_dev.spacy" @@ -71,7 +74,7 @@ factory = "tok2vec" width = ${components.tok2vec.model.encode.width} attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] rows = [5000, 2500, 2500, 2500] -include_static_vectors = true +include_static_vectors = ${vars.include_static_vectors} [components.tok2vec.model.encode] @architectures = "spacy.MaxoutWindowEncoder.v1" diff --git a/configs/base_specialized_ner.cfg b/configs/base_specialized_ner.cfg index c6274b7..1c22d17 100644 --- a/configs/base_specialized_ner.cfg +++ b/configs/base_specialized_ner.cfg @@ -1,3 +1,6 @@ +[vars] +include_static_vectors = null + [paths] vectors = null init_tok2vec = null @@ -49,7 +52,7 @@ nO = null width = 96 attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] rows = [5000, 2500, 2500, 2500] -include_static_vectors = true +include_static_vectors = ${vars.include_static_vectors} [components.ner.model.tok2vec.encode] @architectures = "spacy.MaxoutWindowEncoder.v1" diff --git a/project.yml b/project.yml index 936d85b..7a1f85f 100644 --- a/project.yml +++ b/project.yml @@ -265,7 +265,7 @@ commands: - name: parser-tagger-train-sm help: 
"Train the base models" script: - - "spacy train ${vars.parser_tagger_config_loc} --output ${vars.parser_tagger_sm_loc} --code ${vars.code_loc} --paths.vocab_path ${vars.vocab_sm_loc}" + - "spacy train ${vars.parser_tagger_config_loc} --output ${vars.parser_tagger_sm_loc} --code ${vars.code_loc} --paths.vocab_path ${vars.vocab_sm_loc} --vars.include_static_vectors False" deps: - "${vars.parser_tagger_config_loc}" - "${vars.genia_train_spacy_loc}" @@ -278,7 +278,7 @@ commands: - name: parser-tagger-train-md help: "Train the base models" script: - - "spacy train ${vars.parser_tagger_config_loc} --output ${vars.parser_tagger_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.vocab_path ${vars.vocab_md_loc}" + - "spacy train ${vars.parser_tagger_config_loc} --output ${vars.parser_tagger_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.vocab_path ${vars.vocab_md_loc} --vars.include_static_vectors True" deps: - "${vars.parser_tagger_config_loc}" - "${vars.genia_train_spacy_loc}" @@ -292,7 +292,7 @@ commands: - name: parser-tagger-train-lg help: "Train the base models" script: - - "spacy train ${vars.parser_tagger_config_loc} --output ${vars.parser_tagger_lg_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_lg_loc} --paths.vocab_path ${vars.vocab_lg_loc}" + - "spacy train ${vars.parser_tagger_config_loc} --output ${vars.parser_tagger_lg_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_lg_loc} --paths.vocab_path ${vars.vocab_lg_loc} --vars.include_static_vectors True" deps: - "${vars.parser_tagger_config_loc}" - "${vars.genia_train_spacy_loc}" @@ -319,7 +319,7 @@ commands: - name: ner-train-sm help: "Train the main ner" script: - - "spacy train ${vars.ner_config_loc} --output ${vars.ner_sm_loc} --code ${vars.code_loc} --paths.parser_tagger_path ${vars.parser_tagger_sm_loc}/model-best --paths.vocab_path ${vars.vocab_sm_loc}" + - "spacy train ${vars.ner_config_loc} --output ${vars.ner_sm_loc} 
--code ${vars.code_loc} --paths.parser_tagger_path ${vars.parser_tagger_sm_loc}/model-best --paths.vocab_path ${vars.vocab_sm_loc} --vars.include_static_vectors False" deps: - "${vars.ner_config_loc}" - "${vars.parser_tagger_sm_loc}/model-best" @@ -330,7 +330,7 @@ commands: - name: ner-train-md help: "Train the main ner" script: - - "spacy train ${vars.ner_config_loc} --output ${vars.ner_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.vocab_path ${vars.vocab_md_loc}" + - "spacy train ${vars.ner_config_loc} --output ${vars.ner_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.vocab_path ${vars.vocab_md_loc} --vars.include_static_vectors True" deps: - "${vars.ner_config_loc}" - "${vars.parser_tagger_md_loc}/model-best" @@ -342,7 +342,7 @@ commands: - name: ner-train-lg help: "Train the main ner" script: - - "spacy train ${vars.ner_config_loc} --output ${vars.ner_lg_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_lg_loc} --paths.parser_tagger_path ${vars.parser_tagger_lg_loc}/model-best --paths.vocab_path ${vars.vocab_lg_loc}" + - "spacy train ${vars.ner_config_loc} --output ${vars.ner_lg_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_lg_loc} --paths.parser_tagger_path ${vars.parser_tagger_lg_loc}/model-best --paths.vocab_path ${vars.vocab_lg_loc} --vars.include_static_vectors True" deps: - "${vars.ner_config_loc}" - "${vars.parser_tagger_lg_loc}/model-best" @@ -365,10 +365,10 @@ commands: - name: ner-train-specialized help: "Train the specialized NER models" script: - - "spacy train ${vars.specialized_ner_config_loc} --output ${vars.bc5cdr_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.train_path ${vars.bc5cdr_loc_local}/train.tsv --paths.dev_path 
${vars.bc5cdr_loc_local}/devel.tsv --paths.vocab_path ${vars.vocab_md_loc}" - - "spacy train ${vars.specialized_ner_config_loc} --output ${vars.bionlp13cg_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.train_path ${vars.bionlp13cg_loc_local}/train.tsv --paths.dev_path ${vars.bionlp13cg_loc_local}/devel.tsv --paths.vocab_path ${vars.vocab_md_loc}" - - "spacy train ${vars.specialized_ner_config_loc} --output ${vars.craft_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.train_path ${vars.craft_loc_local}/train.tsv --paths.dev_path ${vars.craft_loc_local}/devel.tsv --paths.vocab_path ${vars.vocab_md_loc}" - - "spacy train ${vars.specialized_ner_config_loc} --output ${vars.jnlpba_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.train_path ${vars.jnlpba_loc_local}/train.tsv --paths.dev_path ${vars.jnlpba_loc_local}/devel.tsv --paths.vocab_path ${vars.vocab_md_loc}" + - "spacy train ${vars.specialized_ner_config_loc} --output ${vars.bc5cdr_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.train_path ${vars.bc5cdr_loc_local}/train.tsv --paths.dev_path ${vars.bc5cdr_loc_local}/devel.tsv --paths.vocab_path ${vars.vocab_md_loc} --vars.include_static_vectors True" + - "spacy train ${vars.specialized_ner_config_loc} --output ${vars.bionlp13cg_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.train_path ${vars.bionlp13cg_loc_local}/train.tsv --paths.dev_path ${vars.bionlp13cg_loc_local}/devel.tsv --paths.vocab_path ${vars.vocab_md_loc} --vars.include_static_vectors True" + - "spacy train 
${vars.specialized_ner_config_loc} --output ${vars.craft_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.train_path ${vars.craft_loc_local}/train.tsv --paths.dev_path ${vars.craft_loc_local}/devel.tsv --paths.vocab_path ${vars.vocab_md_loc} --vars.include_static_vectors True" + - "spacy train ${vars.specialized_ner_config_loc} --output ${vars.jnlpba_md_loc} --code ${vars.code_loc} --paths.vectors ${vars.vectors_md_loc} --paths.parser_tagger_path ${vars.parser_tagger_md_loc}/model-best --paths.train_path ${vars.jnlpba_loc_local}/train.tsv --paths.dev_path ${vars.jnlpba_loc_local}/devel.tsv --paths.vocab_path ${vars.vocab_md_loc} --vars.include_static_vectors True" deps: - "${vars.corpus_pubtator_loc_local}" - "${vars.bc5cdr_loc_local}/train.tsv" From abc7c88146acd3e3b1385aa2c008e388ac931efb Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 4 Mar 2022 00:08:07 -0800 Subject: [PATCH 03/16] Adjust requirements --- requirements.in | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.in b/requirements.in index b5970a4..8351a3a 100644 --- a/requirements.in +++ b/requirements.in @@ -1,5 +1,5 @@ numpy -spacy>=3.0.0,<3.1.0 +spacy>=3.2.0,<3.3.0 spacy-lookups-data pandas requests>=2.0.0,<3.0.0 @@ -15,7 +15,7 @@ pytest pytest-cov flake8 # black currently pinned because of a dependency issue with spacy, typer, and click -black<=21.12b0 +black mypy types-requests diff --git a/setup.py b/setup.py index b9c0d5d..c972d5c 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), license="Apache", install_requires=[ - "spacy>=3.0.0,<3.1.0", + "spacy>=3.2.0,<3.3.0", "requests>=2.0.0,<3.0.0", "conllu", "numpy", From d7c6c416c843ada5c1ef8f0bb556b396bdd87848 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Fri, 4 Mar 2022 00:08:23 -0800 Subject: [PATCH 04/16] update 
version num --- scispacy/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scispacy/version.py b/scispacy/version.py index 65ece88..c8b11da 100644 --- a/scispacy/version.py +++ b/scispacy/version.py @@ -1,5 +1,5 @@ _MAJOR = "0" -_MINOR = "4" +_MINOR = "5" _REVISION = "0" VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR) From 247695f4bdbc1951cca1a919498ac3742d2b79ee Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 5 Mar 2022 21:44:41 -0800 Subject: [PATCH 05/16] changes to configs for new version and smaller gpu --- configs/base_ner.cfg | 18 +++++++++--------- configs/base_ner_scibert.cfg | 16 ++++++++-------- configs/base_parser_tagger.cfg | 14 +++++++------- configs/base_parser_tagger_scibert.cfg | 23 ++++++++--------------- configs/base_specialized_ner.cfg | 18 +++++++++--------- 5 files changed, 41 insertions(+), 48 deletions(-) diff --git a/configs/base_ner.cfg b/configs/base_ner.cfg index 60c2153..00b0506 100644 --- a/configs/base_ner.cfg +++ b/configs/base_ner.cfg @@ -34,26 +34,26 @@ moves = null update_with_oracle_cut_size = 100 [components.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" +@architectures = "spacy.TransitionBasedParser.v2" state_type = "ner" extra_state_tokens = false -hidden_width = 64 -maxout_pieces = 2 +hidden_width = 128 +maxout_pieces = 3 use_upper = true nO = null [components.ner.model.tok2vec] -@architectures = "spacy.Tok2Vec.v1" +@architectures = "spacy.Tok2Vec.v2" [components.ner.model.tok2vec.embed] -@architectures = "spacy.MultiHashEmbed.v1" +@architectures = "spacy.MultiHashEmbed.v2" width = 96 -attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] -rows = [5000, 2500, 2500, 2500] +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "SPACY"] +rows = [5000, 2500, 2500, 2500, 100] include_static_vectors = ${vars.include_static_vectors} [components.ner.model.tok2vec.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" +@architectures = "spacy.MaxoutWindowEncoder.v2" width = 96 depth = 4 window_size = 
1 @@ -85,7 +85,7 @@ dev_corpus = "corpora.dev" train_corpus = "corpora.train" seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} -dropout = 0.2 +dropout = 0.1 accumulate_gradient = 1 patience = 0 max_epochs = 7 diff --git a/configs/base_ner_scibert.cfg b/configs/base_ner_scibert.cfg index 9e80b53..c8b7371 100644 --- a/configs/base_ner_scibert.cfg +++ b/configs/base_ner_scibert.cfg @@ -5,7 +5,7 @@ parser_tagger_path = null vocab_path = null [system] -gpu_allocator = null +gpu_allocator = "pytorch" seed = 0 [nlp] @@ -31,7 +31,7 @@ moves = null update_with_oracle_cut_size = 100 [components.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" +@architectures = "spacy.TransitionBasedParser.v2" state_type = "ner" extra_state_tokens = false hidden_width = 64 @@ -40,17 +40,17 @@ use_upper = true nO = null [components.ner.model.tok2vec] -@architectures = "spacy.Tok2Vec.v1" +@architectures = "spacy.Tok2Vec.v2" [components.ner.model.tok2vec.embed] -@architectures = "spacy.MultiHashEmbed.v1" +@architectures = "spacy.MultiHashEmbed.v2" width = 96 -attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] -rows = [5000, 2500, 2500, 2500] +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "SPACY"] +rows = [5000, 2500, 2500, 2500, 100] include_static_vectors = false [components.ner.model.tok2vec.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" +@architectures = "spacy.MaxoutWindowEncoder.v2" width = 96 depth = 4 window_size = 1 @@ -83,7 +83,7 @@ dev_corpus = "corpora.dev" train_corpus = "corpora.train" seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} -dropout = 0.2 +dropout = 0.1 accumulate_gradient = 1 patience = 0 max_epochs = 7 diff --git a/configs/base_parser_tagger.cfg b/configs/base_parser_tagger.cfg index 70d38f1..cec9e47 100644 --- a/configs/base_parser_tagger.cfg +++ b/configs/base_parser_tagger.cfg @@ -38,7 +38,7 @@ moves = null update_with_oracle_cut_size = 100 [components.parser.model] -@architectures = "spacy.TransitionBasedParser.v1" 
+@architectures = "spacy.TransitionBasedParser.v2" state_type = "parser" extra_state_tokens = false hidden_width = 128 @@ -67,17 +67,17 @@ upstream = "*" factory = "tok2vec" [components.tok2vec.model] -@architectures = "spacy.Tok2Vec.v1" +@architectures = "spacy.Tok2Vec.v2" [components.tok2vec.model.embed] -@architectures = "spacy.MultiHashEmbed.v1" +@architectures = "spacy.MultiHashEmbed.v2" width = ${components.tok2vec.model.encode.width} -attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] -rows = [5000, 2500, 2500, 2500] +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "SPACY"] +rows = [5000, 2500, 2500, 2500, 100] include_static_vectors = ${vars.include_static_vectors} [components.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v1" +@architectures = "spacy.MaxoutWindowEncoder.v2" width = 96 depth = 4 window_size = 1 @@ -109,7 +109,7 @@ dev_corpus = "corpora.dev" train_corpus = "corpora.train" seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} -dropout = 0.2 +dropout = 0.1 accumulate_gradient = 1 patience = 0 max_epochs = 20 diff --git a/configs/base_parser_tagger_scibert.cfg b/configs/base_parser_tagger_scibert.cfg index 9e59d4e..016a395 100644 --- a/configs/base_parser_tagger_scibert.cfg +++ b/configs/base_parser_tagger_scibert.cfg @@ -7,7 +7,7 @@ init_tok2vec = null vocab_path = null [system] -gpu_allocator = "pytorch" +gpu_allocator = null seed = 0 [nlp] @@ -36,12 +36,12 @@ moves = null update_with_oracle_cut_size = 100 [components.parser.model] -@architectures = "spacy.TransitionBasedParser.v1" +@architectures = "spacy.TransitionBasedParser.v2" state_type = "parser" extra_state_tokens = false hidden_width = 128 maxout_pieces = 3 -use_upper = true +use_upper = false nO = null [components.parser.model.tok2vec] @@ -69,9 +69,10 @@ max_batch_items = 4096 set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} [components.transformer.model] -@architectures = "spacy-transformers.TransformerModel.v1" 
+@architectures = "spacy-transformers.TransformerModel.v3" name = "allenai/scibert_scivocab_uncased" tokenizer_config = {"use_fast": true} +mixed_precision = true [components.transformer.model.get_spans] @span_getters = "spacy-transformers.strided_spans.v1" @@ -105,7 +106,7 @@ dev_corpus = "corpora.dev" train_corpus = "corpora.train" seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} -dropout = 0.2 +dropout = 0.1 accumulate_gradient = 1 patience = 0 max_epochs = 8 @@ -120,8 +121,8 @@ get_length = null [training.batcher.size] @schedules = "compounding.v1" -start = 16 -stop = 64 +start = 4 +stop = 12 compound = 1.001 t = 0.0 @@ -157,14 +158,6 @@ ents_r = 0.0 [pretraining] [initialize] -vectors = ${paths.vectors} -init_tok2vec = ${paths.init_tok2vec} -vocab_data = ${paths.vocab_path} -lookups = null - -[initialize.components] - -[initialize.tokenizer] [initialize.before_init] @callbacks = "replace_tokenizer" diff --git a/configs/base_specialized_ner.cfg b/configs/base_specialized_ner.cfg index 1c22d17..f083838 100644 --- a/configs/base_specialized_ner.cfg +++ b/configs/base_specialized_ner.cfg @@ -36,26 +36,26 @@ moves = null update_with_oracle_cut_size = 100 [components.ner.model] -@architectures = "spacy.TransitionBasedParser.v1" +@architectures = "spacy.TransitionBasedParser.v2" state_type = "ner" extra_state_tokens = false -hidden_width = 64 -maxout_pieces = 2 +hidden_width = 128 +maxout_pieces = 3 use_upper = true nO = null [components.ner.model.tok2vec] -@architectures = "spacy.Tok2Vec.v1" +@architectures = "spacy.Tok2Vec.v2" [components.ner.model.tok2vec.embed] -@architectures = "spacy.MultiHashEmbed.v1" +@architectures = "spacy.MultiHashEmbed.v2" width = 96 -attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] -rows = [5000, 2500, 2500, 2500] +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "SPACY"] +rows = [5000, 2500, 2500, 2500, 100] include_static_vectors = ${vars.include_static_vectors} [components.ner.model.tok2vec.encode] -@architectures = 
"spacy.MaxoutWindowEncoder.v1" +@architectures = "spacy.MaxoutWindowEncoder.v2" width = 96 depth = 4 window_size = 1 @@ -85,7 +85,7 @@ dev_corpus = "corpora.dev" train_corpus = "corpora.train" seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} -dropout = 0.2 +dropout = 0.1 accumulate_gradient = 1 patience = 0 max_epochs = 7 From 3242c3065035a35cb3ebd8fa4a8f7a0faea136cc Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 5 Mar 2022 21:45:13 -0800 Subject: [PATCH 06/16] Un xfail sentence splitter test --- .../custom_tests/test_custom_segmentation.py | 289 +++++++++++++++--- 1 file changed, 249 insertions(+), 40 deletions(-) diff --git a/tests/custom_tests/test_custom_segmentation.py b/tests/custom_tests/test_custom_segmentation.py index 83e9e02..4d4838d 100644 --- a/tests/custom_tests/test_custom_segmentation.py +++ b/tests/custom_tests/test_custom_segmentation.py @@ -1,51 +1,260 @@ import pytest TEST_CASES = [ - ("LSTM networks, which we preview in Sec. 2, have been successfully", ["LSTM networks, which we preview in Sec. 2, have been successfully"]), - ("When the tree is simply a chain, both Eqs. 2–8 and Eqs. 9–14 reduce to the standard LSTM transitions, Eqs. 1.", ["When the tree is simply a chain, both Eqs. 2–8 and Eqs. 9–14 reduce to the standard LSTM transitions, Eqs. 1."]), - ("We used fluorescence time-lapse microscopy (Fig. 1D; fig. S1 and movies S1 and S2) and computational", ["We used fluorescence time-lapse microscopy (Fig. 1D; fig. S1 and movies S1 and S2) and computational"]), - ("Hill functions indeed fit the data well (Fig. 3A and Table 1).", ["Hill functions indeed fit the data well (Fig. 3A and Table 1)."]), - ('In order to produce sentence representations that fully capture the semantics of natural language, order-insensitive models are insufficient due to their inability to account for differences in meaning as a result of differences in word order or syntactic structure (e.g., “cats climb trees” vs. 
“trees climb cats”).', ['In order to produce sentence representations that fully capture the semantics of natural language, order-insensitive models are insufficient due to their inability to account for differences in meaning as a result of differences in word order or syntactic structure (e.g., “cats climb trees” vs. “trees climb cats”).']), - ("There is an average exact sparsity (fraction of zeros) of the hidden layers of 83.40% on MNIST and 72.00% on CIFAR10. Figure 3 (left) provides a better understanding of the influence of sparsity.", ["There is an average exact sparsity (fraction of zeros) of the hidden layers of 83.40% on MNIST and 72.00% on CIFAR10.", "Figure 3 (left) provides a better understanding of the influence of sparsity."]), - ("Sparsity has become a concept of interest, not only in computational neuroscience and machine learning but also in statistics and signal processing (Candes and Tao, 2005). It was first introduced in computational neuroscience in the context of sparse coding in the visual system (Olshausen and Field, 1997).", ["Sparsity has become a concept of interest, not only in computational neuroscience and machine learning but also in statistics and signal processing (Candes and Tao, 2005).", "It was first introduced in computational neuroscience in the context of sparse coding in the visual system (Olshausen and Field, 1997)."]), - ("1) The first item. 
2) The second item.", ["1) The first item.", "2) The second item."]), - ("two of these stages (in areas V1 and V2 of visual cortex) (Lee et al., 2008), and that they", ["two of these stages (in areas V1 and V2 of visual cortex) (Lee et al., 2008), and that they"]), - pytest.param("all neu-\nrons fire at", ["all neu-\nrons fire at"], marks=pytest.mark.xfail), - ("the support of the Defense Advanced Resarch Projects Agency (DARPA) Deep Exploration and Filtering of Text (DEFT) Program under Air Force Research Laboratory (AFRL) contract", ["the support of the Defense Advanced Resarch Projects Agency (DARPA) Deep Exploration and Filtering of Text (DEFT) Program under Air Force Research Laboratory (AFRL) contract"]), - ("While proprietary environments such as Microsoft Robotics Studio [9] and Webots [10] have many commendable attributes, we feel there is no substitute for a fully open platform.", ["While proprietary environments such as Microsoft Robotics Studio [9] and Webots [10] have many commendable attributes, we feel there is no substitute for a fully open platform."]), - ("We first produce sentence representations hL and hR for each sentence in the pair using a Tree-LSTM model over each sentence’s parse tree.", ["We first produce sentence representations hL and hR for each sentence in the pair using a Tree-LSTM model over each sentence’s parse tree."]), - ("LSTM networks, which we review in Sec. 2, have been successfully applied to a variety of sequence modeling and prediction tasks, notably machine translation (Bahdanau et al., 2014; Sutskever et al., 2014), speech recognition (Graves et al., 2013), image caption generation (Vinyals et al., 2014), and program execution (Zaremba and Sutskever, 2014).", ["LSTM networks, which we review in Sec. 
2, have been successfully applied to a variety of sequence modeling and prediction tasks, notably machine translation (Bahdanau et al., 2014; Sutskever et al., 2014), speech recognition (Graves et al., 2013), image caption generation (Vinyals et al., 2014), and program execution (Zaremba and Sutskever, 2014)."]), - ("1 Introduction\n\nMost models for distributed representations of phrases and sentences—that is, models where realvalued vectors are used to represent meaning—fall into one of three classes: bag-of-words models, sequence models, and tree-structured models.", ["1 Introduction\n\n", "Most models for distributed representations of phrases and sentences—that is, models where realvalued vectors are used to represent meaning—fall into one of three classes: bag-of-words models, sequence models, and tree-structured models."]), - ("In this section, we will elaborate these philosophies and shows how they influenced the design and implementation of ROS.\n\nA. Peer-to-Peer\n\nA system built using ROS consists of a number of processes, potentially on a number of different", ["In this section, we will elaborate these philosophies and shows how they influenced the design and implementation of ROS.\n\n", "A. Peer-to-Peer\n\n", "A system built using ROS consists of a number of processes, potentially on a number of different"]), - ("\n\n2 Long Short-Term Memory Networks\n\n\n\n2.1 Overview\n\nRecurrent neural networks (RNNs) are able to process input sequences of arbitrary length via the recursive application of a transition function on a hidden state vector ht.", ["\n\n2 Long Short-Term Memory Networks\n\n\n\n", "2.1 Overview\n\n", "Recurrent neural networks (RNNs) are able to process input sequences of arbitrary length via the recursive application of a transition function on a hidden state vector ht."]), - ("In order to address all three aspects, it is necessary to observe gene regulation in individual cells over time. 
Therefore, we built Bl-cascade[ strains of Escherichia coli, containing the l repressor and a downstream gene, such that both the amount of the repressor protein and the rate of expression of its target gene could be monitored simultaneously in individual cells (Fig. 1B). These strains incorporate a yellow fluorescent repressor fusion protein (cI-yfp) and a chromosomally integrated target promoter (P R ) controlling cyan fluorescent protein (cfp).", ["In order to address all three aspects, it is necessary to observe gene regulation in individual cells over time.", "Therefore, we built Bl-cascade[ strains of Escherichia coli, containing the l repressor and a downstream gene, such that both the amount of the repressor protein and the rate of expression of its target gene could be monitored simultaneously in individual cells (Fig. 1B).", "These strains incorporate a yellow fluorescent repressor fusion protein (cI-yfp) and a chromosomally integrated target promoter (P R ) controlling cyan fluorescent protein (cfp)."]), - ("This is a sentence. (This is an interjected sentence.) This is also a sentence.", ["This is a sentence.", "(This is an interjected sentence.)", "This is also a sentence."]), - ("Thus, we first compute EMC 3 's response time-i.e., the duration from the initial of a call (from/to a participant in the target region) to the time when the decision of task assignment is made; and then, based on the computed response time, we estimate EMC 3 maximum throughput [28]-i.e., the maximum number of mobile users allowed in the MCS system. 
EMC 3 algorithm is implemented with the Java SE platform and is running on a Java HotSpot(TM) 64-Bit Server VM; and the implementation details are given in Appendix, available in the online supplemental material.", ["Thus, we first compute EMC 3 's response time-i.e., the duration from the initial of a call (from/to a participant in the target region) to the time when the decision of task assignment is made; and then, based on the computed response time, we estimate EMC 3 maximum throughput [28]-i.e., the maximum number of mobile users allowed in the MCS system.", "EMC 3 algorithm is implemented with the Java SE platform and is running on a Java HotSpot(TM) 64-Bit Server VM; and the implementation details are given in Appendix, available in the online supplemental material."]), - ("Random walk models (Skellam, 1951;Turchin, 1998) received a lot of attention and were then extended to several more mathematically and statistically sophisticated approaches to interpret movement data such as State-Space Models (SSM) (Jonsen et al., 2003(Jonsen et al., , 2005 and Brownian Bridge Movement Model (BBMM) (Horne et al., 2007). Nevertheless, these models require heavy computational resources (Patterson et al., 2008) and unrealistic structural a priori hypotheses about movement, such as homogeneous movement behavior. 
A fundamental property of animal movements is behavioral heterogeneity (Gurarie et al., 2009) and these models poorly performed in highlighting behavioral changes in animal movements through space and time (Kranstauber et al., 2012).", ["Random walk models (Skellam, 1951;Turchin, 1998) received a lot of attention and were then extended to several more mathematically and statistically sophisticated approaches to interpret movement data such as State-Space Models (SSM) (Jonsen et al., 2003(Jonsen et al., , 2005 and Brownian Bridge Movement Model (BBMM) (Horne et al., 2007).", "Nevertheless, these models require heavy computational resources (Patterson et al., 2008) and unrealistic structural a priori hypotheses about movement, such as homogeneous movement behavior.", "A fundamental property of animal movements is behavioral heterogeneity (Gurarie et al., 2009) and these models poorly performed in highlighting behavioral changes in animal movements through space and time (Kranstauber et al., 2012)."]), - ('. . .', ['. . .']), - ("IF condition and goalCondition THEN action condition relates to the current state and goalCondition to the goal state. If variable bindings exist such that predicates in condition match with the current state, and predicates in goalCondition match with the goal state then the action may be performed. Note that the action's precondition as specified in the domain model must also be satisfied. Figure 5 presents an outline of the system. Each iteration starts with a population of policies (line(2)). 
Current L2Plan settings are such that the individuals comprising the (1) Create initial population (2) WHILE termination criterion false (3) Determine n% fittest polices (4) Perform local search on policies (5) Insert improved policies in new generation (6) WHILE new generation not full (7) SET Pol to empty policy (8) Select two parents (9) IF crossover (10) Perform crossover (11) Pol := fittest of parents & offspring (12) ELSE (13) Pol := fittest of parents (14) ENDIF (15) IF mutation (16) Perform mutation on Pol (17) ENDIF (18) Perform local search on Pol (19) Insert Pol in new generation (20) ENDWHILE (21) (5)). Note that the evaluation of policies is implied when the fittest policy or policies is/are required.", ["IF condition and goalCondition THEN action condition relates to the current state and goalCondition to the goal state.", "If variable bindings exist such that predicates in condition match with the current state, and predicates in goalCondition match with the goal state then the action may be performed.", "Note that the action's precondition as specified in the domain model must also be satisfied.", "Figure 5 presents an outline of the system.", "Each iteration starts with a population of policies (line(2)).", "Current L2Plan settings are such that the individuals comprising the (1) Create initial population (2) WHILE termination criterion false (3) Determine n% fittest polices (4) Perform local search on policies (5) Insert improved policies in new generation (6) WHILE new generation not full (7) SET Pol to empty policy (8) Select two parents (9) IF crossover (10) Perform crossover (11) Pol := fittest of parents & offspring (12) ELSE (13) Pol := fittest of parents (14) ENDIF (15) IF mutation (16) Perform mutation on Pol (17) ENDIF (18) Perform local search on Pol (19) Insert Pol in new generation (20) ENDWHILE (21) (5)).", "Note that the evaluation of policies is implied when the fittest policy or policies is/are required."]), - ("MCC summarizes 
these four quantities into one score and is regarded as a balanced measure; it takes values between -1 and 1, with higher values indicating better performance (see e.g. Baldi et al. (2000) for further details). Since the convergence threshold in the glasso algorithm is 10 −4 , we take entriesω ij in estimated precision matrices to be non-zero if |ω ij | > 10 −3 . Since cluster assignments can only be identified up to permutation, in all cases labels were permuted to maximize agreement with true cluster assignments before calculating these quantities. Figure 2 shows MCC plotted against per-cluster sample size n k and Supplementary Figure S1 shows corresponding plots for TPR and FPR. Due to selection of smaller tuning parameter values, BIC discovers fewer non-zeroes in the precision matrices than train/test, resulting in both fewer true positives and false positives. Under MCC, BIC, with either the γ = 1 mixture model (B1) or the non-mixture approach (Bh), leads to the best network reconstruction (except at small sample sizes with p = 25) and outperforms all other regimes at larger sample sizes.", ["MCC summarizes these four quantities into one score and is regarded as a balanced measure; it takes values between -1 and 1, with higher values indicating better performance (see e.g. Baldi et al. 
(2000) for further details).", "Since the convergence threshold in the glasso algorithm is 10 −4 , we take entriesω ij in estimated precision matrices to be non-zero if |ω ij | > 10 −3 .", "Since cluster assignments can only be identified up to permutation, in all cases labels were permuted to maximize agreement with true cluster assignments before calculating these quantities.", "Figure 2 shows MCC plotted against per-cluster sample size n k and Supplementary Figure S1 shows corresponding plots for TPR and FPR.", "Due to selection of smaller tuning parameter values, BIC discovers fewer non-zeroes in the precision matrices than train/test, resulting in both fewer true positives and false positives.", "Under MCC, BIC, with either the γ = 1 mixture model (B1) or the non-mixture approach (Bh), leads to the best network reconstruction (except at small sample sizes with p = 25) and outperforms all other regimes at larger sample sizes."]), - ("Societal impact measurements are mostly commissioned by governments which argue that measuring the impact on science little says about real-world benefits of research (Cohen et al., 2015). Nightingale and Scott (2007) summarize this argumentation in the following pointedly sentence: \"Research that is highly cited or published in top journals may be good for the academic discipline but not for society\" (p. 547). Governments are interested to know the importance of public-funded research (1) for the private and public sectors (e.g. health care), (2) to tackle societal challenges (e.g. climate change), and (3) for education and training of the next generations (ERiC, 2010;Grimson, 2014). The impact model of Cleary, Siegfried, Jackson, and Hunt (2013) additionally highlights the policy enactment of research, in which the impact on policies, laws, and regulations is of special interest. 
The current study seizes upon this additional issue by investigating a possible source for measuring policy enactment of research.", ["Societal impact measurements are mostly commissioned by governments which argue that measuring the impact on science little says about real-world benefits of research (Cohen et al., 2015).", "Nightingale and Scott (2007) summarize this argumentation in the following pointedly sentence: \"Research that is highly cited or published in top journals may be good for the academic discipline but not for society\" (p. 547).", "Governments are interested to know the importance of public-funded research (1) for the private and public sectors (e.g. health care), (2) to tackle societal challenges (e.g. climate change), and (3) for education and training of the next generations (ERiC, 2010;Grimson, 2014).", "The impact model of Cleary, Siegfried, Jackson, and Hunt (2013) additionally highlights the policy enactment of research, in which the impact on policies, laws, and regulations is of special interest.", "The current study seizes upon this additional issue by investigating a possible source for measuring policy enactment of research."]), - ("CONCLUSIONS: This study demonstrates that TF activation, occurring in mononuclear cells of cardiac transplant recipients, is inhibited by treatment with CsA. 
Inhibition of monocyte TF induction by CsA may contribute to its successful use in cardiac transplant medicine and might be useful in managing further settings of vascular pathology also known to involve TF expression and NF-kappaB activation.", ["CONCLUSIONS: This study demonstrates that TF activation, occurring in mononuclear cells of cardiac transplant recipients, is inhibited by treatment with CsA.", "Inhibition of monocyte TF induction by CsA may contribute to its successful use in cardiac transplant medicine and might be useful in managing further settings of vascular pathology also known to involve TF expression and NF-kappaB activation."]), - ("In contrast, anti-AIM mAb did not induce any change in the binding activity of NF-kappa B, a transcription factor whose activity is also regulated by protein kinase C. The increase in AP-1-binding activity was accompanied by the marked stimulation of the transcription of c-fos but not that of c-jun.", ["In contrast, anti-AIM mAb did not induce any change in the binding activity of NF-kappa B, a transcription factor whose activity is also regulated by protein kinase C. The increase in AP-1-binding activity was accompanied by the marked stimulation of the transcription of c-fos but not that of c-jun."]), - ("A mutant Tax protein deficient in transactivation of genes by the nuclear factor (NF)-kappaB pathway was unable to induce transcriptional activity of IL-1alpha promoter-CAT constructs, but was rescued by exogenous provision of p65/p50 NF-kappaB. 
We found that two IL-1alpha kappaB-like sites (positions -1,065 to -1,056 and +646 to +655) specifically formed a complex with NF-kappaB-containing nuclear extract from MT-2 cells and that NF-kappaB bound with higher affinity to the 3' NF-kappaB binding site than to the 5' NF-kappaB site.", ["A mutant Tax protein deficient in transactivation of genes by the nuclear factor (NF)-kappaB pathway was unable to induce transcriptional activity of IL-1alpha promoter-CAT constructs, but was rescued by exogenous provision of p65/p50 NF-kappaB.", "We found that two IL-1alpha kappaB-like sites (positions -1,065 to -1,056 and +646 to +655) specifically formed a complex with NF-kappaB-containing nuclear extract from MT-2 cells and that NF-kappaB bound with higher affinity to the 3' NF-kappaB binding site than to the 5' NF-kappaB site."]), - pytest.param("Protein kinase C inhibitor staurosporine, but not cyclic nucleotide-dependent protein kinase inhibitor HA-1004, also dramatically reduced constitutive levels of nuclear NF kappa B. 
Finally, TPA addition to monocytes infected with HIV-1 inhibited HIV-1 replication, as determined by reverse transcriptase assays, in a concentration-dependent manner.", ["Protein kinase C inhibitor staurosporine, but not cyclic nucleotide-dependent protein kinase inhibitor HA-1004, also dramatically reduced constitutive levels of nuclear NF kappa B.", "Finally, TPA addition to monocytes infected with HIV-1 inhibited HIV-1 replication, as determined by reverse transcriptase assays, in a concentration-dependent manner."], marks=pytest.mark.xfail), - pytest.param("There are p50.c-rel heterodimers were also detected bound to this sequence at early time points (7-16 h; early), and both remained active at later time points (40 h; late) after activation.", ["There are p50.c-rel heterodimers were also detected bound to this sequence at early time points (7-16 h; early), and both remained active at later time points (40 h; late) after activation."], marks=pytest.mark.xfail), - ("This sentence mentions Eqs. 1-4 and should not be split.", ["This sentence mentions Eqs. 1-4 and should not be split."]), - ("This sentence ends with part an abbreviation that is part of a word material. It also has another sentence after it.", ["This sentence ends with part an abbreviation that is part of a word material.", "It also has another sentence after it."]), - ("It also has a sentence before it. This sentence mentions Eqs. 1-4 and should not be split. It also has another sentence after it.", ["It also has a sentence before it.", "This sentence mentions Eqs. 
1-4 and should not be split.", "It also has another sentence after it."]), - ("This sentence is the last segment and ends with an abbreviation that is part of a word material.", ["This sentence is the last segment and ends with an abbreviation that is part of a word material."]), - ("PDBu + iono induced equally high IL-2 levels in both groups and, when stimulated with plate-bound anti-CD3 monoclonal antibody (mAb), the IL-2 secretion by neonatal cells was undetectable and adult cells produced low amounts of IL-2 (mean 331 +/- 86 pg/ml).", ["PDBu + iono induced equally high IL-2 levels in both groups and, when stimulated with plate-bound anti-CD3 monoclonal antibody (mAb), the IL-2 secretion by neonatal cells was undetectable and adult cells produced low amounts of IL-2 (mean 331 +/- 86 pg/ml)."]), - (" This document starts with whitespaces. Next sentence.", [" ", "This document starts with whitespaces.", "Next sentence."]), - pytest.param("How about tomorrow?We can meet at eden garden.", ["How about tomorrow?", "We can meet at eden garden."], marks=pytest.mark.xfail) - ] + ( + "LSTM networks, which we preview in Sec. 2, have been successfully", + ["LSTM networks, which we preview in Sec. 2, have been successfully"], + ), + ( + "When the tree is simply a chain, both Eqs. 2–8 and Eqs. 9–14 reduce to the standard LSTM transitions, Eqs. 1.", + [ + "When the tree is simply a chain, both Eqs. 2–8 and Eqs. 9–14 reduce to the standard LSTM transitions, Eqs. 1." + ], + ), + ( + "We used fluorescence time-lapse microscopy (Fig. 1D; fig. S1 and movies S1 and S2) and computational", + [ + "We used fluorescence time-lapse microscopy (Fig. 1D; fig. S1 and movies S1 and S2) and computational" + ], + ), + ( + "Hill functions indeed fit the data well (Fig. 3A and Table 1).", + ["Hill functions indeed fit the data well (Fig. 
3A and Table 1)."], + ), + ( + "In order to produce sentence representations that fully capture the semantics of natural language, order-insensitive models are insufficient due to their inability to account for differences in meaning as a result of differences in word order or syntactic structure (e.g., “cats climb trees” vs. “trees climb cats”).", + [ + "In order to produce sentence representations that fully capture the semantics of natural language, order-insensitive models are insufficient due to their inability to account for differences in meaning as a result of differences in word order or syntactic structure (e.g., “cats climb trees” vs. “trees climb cats”)." + ], + ), + ( + "There is an average exact sparsity (fraction of zeros) of the hidden layers of 83.40% on MNIST and 72.00% on CIFAR10. Figure 3 (left) provides a better understanding of the influence of sparsity.", + [ + "There is an average exact sparsity (fraction of zeros) of the hidden layers of 83.40% on MNIST and 72.00% on CIFAR10.", + "Figure 3 (left) provides a better understanding of the influence of sparsity.", + ], + ), + ( + "Sparsity has become a concept of interest, not only in computational neuroscience and machine learning but also in statistics and signal processing (Candes and Tao, 2005). It was first introduced in computational neuroscience in the context of sparse coding in the visual system (Olshausen and Field, 1997).", + [ + "Sparsity has become a concept of interest, not only in computational neuroscience and machine learning but also in statistics and signal processing (Candes and Tao, 2005).", + "It was first introduced in computational neuroscience in the context of sparse coding in the visual system (Olshausen and Field, 1997).", + ], + ), + ( + "1) The first item. 
2) The second item.", + ["1) The first item.", "2) The second item."], + ), + ( + "two of these stages (in areas V1 and V2 of visual cortex) (Lee et al., 2008), and that they", + [ + "two of these stages (in areas V1 and V2 of visual cortex) (Lee et al., 2008), and that they" + ], + ), + pytest.param( + "all neu-\nrons fire at", ["all neu-\nrons fire at"], marks=pytest.mark.xfail + ), + ( + "the support of the Defense Advanced Resarch Projects Agency (DARPA) Deep Exploration and Filtering of Text (DEFT) Program under Air Force Research Laboratory (AFRL) contract", + [ + "the support of the Defense Advanced Resarch Projects Agency (DARPA) Deep Exploration and Filtering of Text (DEFT) Program under Air Force Research Laboratory (AFRL) contract" + ], + ), + ( + "While proprietary environments such as Microsoft Robotics Studio [9] and Webots [10] have many commendable attributes, we feel there is no substitute for a fully open platform.", + [ + "While proprietary environments such as Microsoft Robotics Studio [9] and Webots [10] have many commendable attributes, we feel there is no substitute for a fully open platform." + ], + ), + ( + "We first produce sentence representations hL and hR for each sentence in the pair using a Tree-LSTM model over each sentence’s parse tree.", + [ + "We first produce sentence representations hL and hR for each sentence in the pair using a Tree-LSTM model over each sentence’s parse tree." + ], + ), + ( + "LSTM networks, which we review in Sec. 2, have been successfully applied to a variety of sequence modeling and prediction tasks, notably machine translation (Bahdanau et al., 2014; Sutskever et al., 2014), speech recognition (Graves et al., 2013), image caption generation (Vinyals et al., 2014), and program execution (Zaremba and Sutskever, 2014).", + [ + "LSTM networks, which we review in Sec. 
2, have been successfully applied to a variety of sequence modeling and prediction tasks, notably machine translation (Bahdanau et al., 2014; Sutskever et al., 2014), speech recognition (Graves et al., 2013), image caption generation (Vinyals et al., 2014), and program execution (Zaremba and Sutskever, 2014)." + ], + ), + ( + "1 Introduction\n\nMost models for distributed representations of phrases and sentences—that is, models where realvalued vectors are used to represent meaning—fall into one of three classes: bag-of-words models, sequence models, and tree-structured models.", + [ + "1 Introduction\n\n", + "Most models for distributed representations of phrases and sentences—that is, models where realvalued vectors are used to represent meaning—fall into one of three classes: bag-of-words models, sequence models, and tree-structured models.", + ], + ), + ( + "In this section, we will elaborate these philosophies and shows how they influenced the design and implementation of ROS.\n\nA. Peer-to-Peer\n\nA system built using ROS consists of a number of processes, potentially on a number of different", + [ + "In this section, we will elaborate these philosophies and shows how they influenced the design and implementation of ROS.\n\n", + "A. 
Peer-to-Peer\n\n", + "A system built using ROS consists of a number of processes, potentially on a number of different", + ], + ), + ( + "\n\n2 Long Short-Term Memory Networks\n\n\n\n2.1 Overview\n\nRecurrent neural networks (RNNs) are able to process input sequences of arbitrary length via the recursive application of a transition function on a hidden state vector ht.", + [ + "\n\n2 Long Short-Term Memory Networks\n\n\n\n", + "2.1 Overview\n\n", + "Recurrent neural networks (RNNs) are able to process input sequences of arbitrary length via the recursive application of a transition function on a hidden state vector ht.", + ], + ), + ( + "In order to address all three aspects, it is necessary to observe gene regulation in individual cells over time. Therefore, we built Bl-cascade[ strains of Escherichia coli, containing the l repressor and a downstream gene, such that both the amount of the repressor protein and the rate of expression of its target gene could be monitored simultaneously in individual cells (Fig. 1B). These strains incorporate a yellow fluorescent repressor fusion protein (cI-yfp) and a chromosomally integrated target promoter (P R ) controlling cyan fluorescent protein (cfp).", + [ + "In order to address all three aspects, it is necessary to observe gene regulation in individual cells over time.", + "Therefore, we built Bl-cascade[ strains of Escherichia coli, containing the l repressor and a downstream gene, such that both the amount of the repressor protein and the rate of expression of its target gene could be monitored simultaneously in individual cells (Fig. 1B).", + "These strains incorporate a yellow fluorescent repressor fusion protein (cI-yfp) and a chromosomally integrated target promoter (P R ) controlling cyan fluorescent protein (cfp).", + ], + ), + ( + "This is a sentence. (This is an interjected sentence.) 
This is also a sentence.", + [ + "This is a sentence.", + "(This is an interjected sentence.)", + "This is also a sentence.", + ], + ), + ( + "Thus, we first compute EMC 3 's response time-i.e., the duration from the initial of a call (from/to a participant in the target region) to the time when the decision of task assignment is made; and then, based on the computed response time, we estimate EMC 3 maximum throughput [28]-i.e., the maximum number of mobile users allowed in the MCS system. EMC 3 algorithm is implemented with the Java SE platform and is running on a Java HotSpot(TM) 64-Bit Server VM; and the implementation details are given in Appendix, available in the online supplemental material.", + [ + "Thus, we first compute EMC 3 's response time-i.e., the duration from the initial of a call (from/to a participant in the target region) to the time when the decision of task assignment is made; and then, based on the computed response time, we estimate EMC 3 maximum throughput [28]-i.e., the maximum number of mobile users allowed in the MCS system.", + "EMC 3 algorithm is implemented with the Java SE platform and is running on a Java HotSpot(TM) 64-Bit Server VM; and the implementation details are given in Appendix, available in the online supplemental material.", + ], + ), + ( + "Random walk models (Skellam, 1951;Turchin, 1998) received a lot of attention and were then extended to several more mathematically and statistically sophisticated approaches to interpret movement data such as State-Space Models (SSM) (Jonsen et al., 2003(Jonsen et al., , 2005 and Brownian Bridge Movement Model (BBMM) (Horne et al., 2007). Nevertheless, these models require heavy computational resources (Patterson et al., 2008) and unrealistic structural a priori hypotheses about movement, such as homogeneous movement behavior. 
A fundamental property of animal movements is behavioral heterogeneity (Gurarie et al., 2009) and these models poorly performed in highlighting behavioral changes in animal movements through space and time (Kranstauber et al., 2012).", + [ + "Random walk models (Skellam, 1951;Turchin, 1998) received a lot of attention and were then extended to several more mathematically and statistically sophisticated approaches to interpret movement data such as State-Space Models (SSM) (Jonsen et al., 2003(Jonsen et al., , 2005 and Brownian Bridge Movement Model (BBMM) (Horne et al., 2007).", + "Nevertheless, these models require heavy computational resources (Patterson et al., 2008) and unrealistic structural a priori hypotheses about movement, such as homogeneous movement behavior.", + "A fundamental property of animal movements is behavioral heterogeneity (Gurarie et al., 2009) and these models poorly performed in highlighting behavioral changes in animal movements through space and time (Kranstauber et al., 2012).", + ], + ), + (". . .", [". . ."]), + ( + "IF condition and goalCondition THEN action condition relates to the current state and goalCondition to the goal state. If variable bindings exist such that predicates in condition match with the current state, and predicates in goalCondition match with the goal state then the action may be performed. Note that the action's precondition as specified in the domain model must also be satisfied. Figure 5 presents an outline of the system. Each iteration starts with a population of policies (line(2)). 
Current L2Plan settings are such that the individuals comprising the (1) Create initial population (2) WHILE termination criterion false (3) Determine n% fittest polices (4) Perform local search on policies (5) Insert improved policies in new generation (6) WHILE new generation not full (7) SET Pol to empty policy (8) Select two parents (9) IF crossover (10) Perform crossover (11) Pol := fittest of parents & offspring (12) ELSE (13) Pol := fittest of parents (14) ENDIF (15) IF mutation (16) Perform mutation on Pol (17) ENDIF (18) Perform local search on Pol (19) Insert Pol in new generation (20) ENDWHILE (21) (5)). Note that the evaluation of policies is implied when the fittest policy or policies is/are required.", + [ + "IF condition and goalCondition THEN action condition relates to the current state and goalCondition to the goal state.", + "If variable bindings exist such that predicates in condition match with the current state, and predicates in goalCondition match with the goal state then the action may be performed.", + "Note that the action's precondition as specified in the domain model must also be satisfied.", + "Figure 5 presents an outline of the system.", + "Each iteration starts with a population of policies (line(2)).", + "Current L2Plan settings are such that the individuals comprising the (1) Create initial population (2) WHILE termination criterion false (3) Determine n% fittest polices (4) Perform local search on policies (5) Insert improved policies in new generation (6) WHILE new generation not full (7) SET Pol to empty policy (8) Select two parents (9) IF crossover (10) Perform crossover (11) Pol := fittest of parents & offspring (12) ELSE (13) Pol := fittest of parents (14) ENDIF (15) IF mutation (16) Perform mutation on Pol (17) ENDIF (18) Perform local search on Pol (19) Insert Pol in new generation (20) ENDWHILE (21) (5)).", + "Note that the evaluation of policies is implied when the fittest policy or policies is/are required.", + ], + 
), + ( + "MCC summarizes these four quantities into one score and is regarded as a balanced measure; it takes values between -1 and 1, with higher values indicating better performance (see e.g. Baldi et al. (2000) for further details). Since the convergence threshold in the glasso algorithm is 10 −4 , we take entriesω ij in estimated precision matrices to be non-zero if |ω ij | > 10 −3 . Since cluster assignments can only be identified up to permutation, in all cases labels were permuted to maximize agreement with true cluster assignments before calculating these quantities. Figure 2 shows MCC plotted against per-cluster sample size n k and Supplementary Figure S1 shows corresponding plots for TPR and FPR. Due to selection of smaller tuning parameter values, BIC discovers fewer non-zeroes in the precision matrices than train/test, resulting in both fewer true positives and false positives. Under MCC, BIC, with either the γ = 1 mixture model (B1) or the non-mixture approach (Bh), leads to the best network reconstruction (except at small sample sizes with p = 25) and outperforms all other regimes at larger sample sizes.", + [ + "MCC summarizes these four quantities into one score and is regarded as a balanced measure; it takes values between -1 and 1, with higher values indicating better performance (see e.g. Baldi et al. 
(2000) for further details).", + "Since the convergence threshold in the glasso algorithm is 10 −4 , we take entriesω ij in estimated precision matrices to be non-zero if |ω ij | > 10 −3 .", + "Since cluster assignments can only be identified up to permutation, in all cases labels were permuted to maximize agreement with true cluster assignments before calculating these quantities.", + "Figure 2 shows MCC plotted against per-cluster sample size n k and Supplementary Figure S1 shows corresponding plots for TPR and FPR.", + "Due to selection of smaller tuning parameter values, BIC discovers fewer non-zeroes in the precision matrices than train/test, resulting in both fewer true positives and false positives.", + "Under MCC, BIC, with either the γ = 1 mixture model (B1) or the non-mixture approach (Bh), leads to the best network reconstruction (except at small sample sizes with p = 25) and outperforms all other regimes at larger sample sizes.", + ], + ), + ( + 'Societal impact measurements are mostly commissioned by governments which argue that measuring the impact on science little says about real-world benefits of research (Cohen et al., 2015). Nightingale and Scott (2007) summarize this argumentation in the following pointedly sentence: "Research that is highly cited or published in top journals may be good for the academic discipline but not for society" (p. 547). Governments are interested to know the importance of public-funded research (1) for the private and public sectors (e.g. health care), (2) to tackle societal challenges (e.g. climate change), and (3) for education and training of the next generations (ERiC, 2010;Grimson, 2014). The impact model of Cleary, Siegfried, Jackson, and Hunt (2013) additionally highlights the policy enactment of research, in which the impact on policies, laws, and regulations is of special interest. 
The current study seizes upon this additional issue by investigating a possible source for measuring policy enactment of research.', + [ + "Societal impact measurements are mostly commissioned by governments which argue that measuring the impact on science little says about real-world benefits of research (Cohen et al., 2015).", + 'Nightingale and Scott (2007) summarize this argumentation in the following pointedly sentence: "Research that is highly cited or published in top journals may be good for the academic discipline but not for society" (p. 547).', + "Governments are interested to know the importance of public-funded research (1) for the private and public sectors (e.g. health care), (2) to tackle societal challenges (e.g. climate change), and (3) for education and training of the next generations (ERiC, 2010;Grimson, 2014).", + "The impact model of Cleary, Siegfried, Jackson, and Hunt (2013) additionally highlights the policy enactment of research, in which the impact on policies, laws, and regulations is of special interest.", + "The current study seizes upon this additional issue by investigating a possible source for measuring policy enactment of research.", + ], + ), + ( + "CONCLUSIONS: This study demonstrates that TF activation, occurring in mononuclear cells of cardiac transplant recipients, is inhibited by treatment with CsA. 
Inhibition of monocyte TF induction by CsA may contribute to its successful use in cardiac transplant medicine and might be useful in managing further settings of vascular pathology also known to involve TF expression and NF-kappaB activation.", + [ + "CONCLUSIONS: This study demonstrates that TF activation, occurring in mononuclear cells of cardiac transplant recipients, is inhibited by treatment with CsA.", + "Inhibition of monocyte TF induction by CsA may contribute to its successful use in cardiac transplant medicine and might be useful in managing further settings of vascular pathology also known to involve TF expression and NF-kappaB activation.", + ], + ), + ( + "In contrast, anti-AIM mAb did not induce any change in the binding activity of NF-kappa B, a transcription factor whose activity is also regulated by protein kinase C. The increase in AP-1-binding activity was accompanied by the marked stimulation of the transcription of c-fos but not that of c-jun.", + [ + "In contrast, anti-AIM mAb did not induce any change in the binding activity of NF-kappa B, a transcription factor whose activity is also regulated by protein kinase C. The increase in AP-1-binding activity was accompanied by the marked stimulation of the transcription of c-fos but not that of c-jun." + ], + ), + ( + "A mutant Tax protein deficient in transactivation of genes by the nuclear factor (NF)-kappaB pathway was unable to induce transcriptional activity of IL-1alpha promoter-CAT constructs, but was rescued by exogenous provision of p65/p50 NF-kappaB. 
We found that two IL-1alpha kappaB-like sites (positions -1,065 to -1,056 and +646 to +655) specifically formed a complex with NF-kappaB-containing nuclear extract from MT-2 cells and that NF-kappaB bound with higher affinity to the 3' NF-kappaB binding site than to the 5' NF-kappaB site.", + [ + "A mutant Tax protein deficient in transactivation of genes by the nuclear factor (NF)-kappaB pathway was unable to induce transcriptional activity of IL-1alpha promoter-CAT constructs, but was rescued by exogenous provision of p65/p50 NF-kappaB.", + "We found that two IL-1alpha kappaB-like sites (positions -1,065 to -1,056 and +646 to +655) specifically formed a complex with NF-kappaB-containing nuclear extract from MT-2 cells and that NF-kappaB bound with higher affinity to the 3' NF-kappaB binding site than to the 5' NF-kappaB site.", + ], + ), + pytest.param( + "Protein kinase C inhibitor staurosporine, but not cyclic nucleotide-dependent protein kinase inhibitor HA-1004, also dramatically reduced constitutive levels of nuclear NF kappa B. 
Finally, TPA addition to monocytes infected with HIV-1 inhibited HIV-1 replication, as determined by reverse transcriptase assays, in a concentration-dependent manner.", + [ + "Protein kinase C inhibitor staurosporine, but not cyclic nucleotide-dependent protein kinase inhibitor HA-1004, also dramatically reduced constitutive levels of nuclear NF kappa B.", + "Finally, TPA addition to monocytes infected with HIV-1 inhibited HIV-1 replication, as determined by reverse transcriptase assays, in a concentration-dependent manner.", + ], + marks=pytest.mark.xfail, + ), + ( + "There are p50.c-rel heterodimers were also detected bound to this sequence at early time points (7-16 h; early), and both remained active at later time points (40 h; late) after activation.", + [ + "There are p50.c-rel heterodimers were also detected bound to this sequence at early time points (7-16 h; early), and both remained active at later time points (40 h; late) after activation." + ], + ), + ( + "This sentence mentions Eqs. 1-4 and should not be split.", + ["This sentence mentions Eqs. 1-4 and should not be split."], + ), + ( + "This sentence ends with part an abbreviation that is part of a word material. It also has another sentence after it.", + [ + "This sentence ends with part an abbreviation that is part of a word material.", + "It also has another sentence after it.", + ], + ), + ( + "It also has a sentence before it. This sentence mentions Eqs. 1-4 and should not be split. It also has another sentence after it.", + [ + "It also has a sentence before it.", + "This sentence mentions Eqs. 1-4 and should not be split.", + "It also has another sentence after it.", + ], + ), + ( + "This sentence is the last segment and ends with an abbreviation that is part of a word material.", + [ + "This sentence is the last segment and ends with an abbreviation that is part of a word material." 
+ ], + ), + ( + "PDBu + iono induced equally high IL-2 levels in both groups and, when stimulated with plate-bound anti-CD3 monoclonal antibody (mAb), the IL-2 secretion by neonatal cells was undetectable and adult cells produced low amounts of IL-2 (mean 331 +/- 86 pg/ml).", + [ + "PDBu + iono induced equally high IL-2 levels in both groups and, when stimulated with plate-bound anti-CD3 monoclonal antibody (mAb), the IL-2 secretion by neonatal cells was undetectable and adult cells produced low amounts of IL-2 (mean 331 +/- 86 pg/ml)." + ], + ), + ( + " This document starts with whitespaces. Next sentence.", + [" ", "This document starts with whitespaces.", "Next sentence."], + ), + pytest.param( + "How about tomorrow?We can meet at eden garden.", + ["How about tomorrow?", "We can meet at eden garden."], + marks=pytest.mark.xfail, + ), +] -@pytest.mark.parametrize('text,expected_sents', TEST_CASES) -def test_custom_segmentation(en_with_combined_rule_tokenizer_and_segmenter_fixture, remove_new_lines_fixture, text, expected_sents): + +@pytest.mark.parametrize("text,expected_sents", TEST_CASES) +def test_custom_segmentation( + en_with_combined_rule_tokenizer_and_segmenter_fixture, + remove_new_lines_fixture, + text, + expected_sents, +): doc = en_with_combined_rule_tokenizer_and_segmenter_fixture(text) sents = [s.text for s in doc.sents] assert sents == expected_sents + def test_segmenter(en_with_combined_rule_tokenizer_and_segmenter_fixture): # this text used to crash pysbd text = r"Then, (S\{ℓ 1 , ℓ 2 }) ∪ {v} is a smaller power dominating set than S, which is a contradiction. Now consider the case in which v ∈ V is incident to exactly two leaves, ℓ 1 and ℓ 2 , and suppose there is a minimum power dominating set S of G such that {v, ℓ 1 , ℓ 2 } ∩ S = ∅." 
From c9c50ed556960281fb734a980321fa12185e8e87 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 5 Mar 2022 21:45:33 -0800 Subject: [PATCH 07/16] pos tag difference in hyponym detector test --- tests/test_hyponym_detector.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_hyponym_detector.py b/tests/test_hyponym_detector.py index d1f14dd..7dcdab5 100644 --- a/tests/test_hyponym_detector.py +++ b/tests/test_hyponym_detector.py @@ -20,8 +20,10 @@ def test_sentences(self): ) doc = self.nlp(text) fig_trees = doc[21:23] - keystone_plant_species = doc[16:19] - assert doc._.hearst_patterns == [("such_as", keystone_plant_species, fig_trees)] + plant_species = doc[17:19] + print([t.pos_ for t in doc]) + print([chunk.text for chunk in doc.noun_chunks]) + assert doc._.hearst_patterns == [("such_as", plant_species, fig_trees)] doc = self.nlp("SARS, or other coronaviruses, are bad.") assert doc._.hearst_patterns == [("other", doc[4:5], doc[0:1])] From dbbb66114da38f89783e694db38441fa6fbe0fa3 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 5 Mar 2022 21:45:44 -0800 Subject: [PATCH 08/16] Adjustments to project.yml --- project.yml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/project.yml b/project.yml index 7a1f85f..d1d2c94 100644 --- a/project.yml +++ b/project.yml @@ -2,7 +2,7 @@ title: "scispaCy pipeline" description: "All the steps needed in the scispaCy pipeline" vars: - version_string: "0.4.0" + version_string: "0.5.0" freqs_loc_s3: "s3://ai2-s2-scispacy/data/gorc_subset.freqs" freqs_loc_local: "assets/gorc_subset.freqs" vectors_loc_s3: "s3://ai2-s2-scispacy/data/pubmed_with_header.txt.gz" @@ -130,31 +130,25 @@ workflows: - parser-tagger-train-sm - parser-tagger-train-md - parser-tagger-train-lg - - parser-tagger-train-scibert - ner-train-sm - ner-train-md - ner-train-lg - ner-train-specialized - - ner-train-scibert - evaluate-parser-tagger-sm - evaluate-parser-tagger-md - evaluate-parser-tagger-lg - - 
evaluate-parser-tagger-scibert - evaluate-ner-sm - evaluate-ner-md - evaluate-ner-lg - evaluate-specialized-ner - - evaluate-ner-scibert - package-sm - package-md - package-lg - package-ner - - package-scibert - evaluate-package-sm - evaluate-package-md - evaluate-package-lg - evaluate-package-ner - - evaluate-package-scibert commands: - name: download From 5edd6c1a68e3f76bca044ab803e788a648fd99de Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 5 Mar 2022 21:51:23 -0800 Subject: [PATCH 09/16] coupld comments and print statements --- project.yml | 2 +- tests/test_hyponym_detector.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/project.yml b/project.yml index d1d2c94..59739bb 100644 --- a/project.yml +++ b/project.yml @@ -159,7 +159,7 @@ commands: - "aws s3 cp ${vars.genia_loc_s3}/train.json ${vars.genia_train_loc_local} --no-sign-request" - "aws s3 cp ${vars.genia_loc_s3}/dev.json ${vars.genia_dev_loc_local} --no-sign-request" - "aws s3 cp ${vars.genia_loc_s3}/test.json ${vars.genia_test_loc_local} --no-sign-request" - # - "aws s3 cp ${vars.ontonotes_loc_s3} ${vars.ontonotes_loc_local}.tar.gz" + - "aws s3 cp ${vars.ontonotes_loc_s3} ${vars.ontonotes_loc_local}.tar.gz" - "tar -xzvf ${vars.ontonotes_loc_local}.tar.gz -C assets/" - "rm ${vars.ontonotes_loc_local}.tar.gz" - "aws s3 cp ${vars.med_mentions_loc_s3} assets/med_mentions.tar.gz --no-sign-request" diff --git a/tests/test_hyponym_detector.py b/tests/test_hyponym_detector.py index 7dcdab5..e8ab8f6 100644 --- a/tests/test_hyponym_detector.py +++ b/tests/test_hyponym_detector.py @@ -21,8 +21,6 @@ def test_sentences(self): doc = self.nlp(text) fig_trees = doc[21:23] plant_species = doc[17:19] - print([t.pos_ for t in doc]) - print([chunk.text for chunk in doc.noun_chunks]) assert doc._.hearst_patterns == [("such_as", plant_species, fig_trees)] doc = self.nlp("SARS, or other coronaviruses, are bad.") From 25a69fcaa2bdaa8f0c76a4c2240c65e783f9a84b Mon Sep 17 00:00:00 2001 From: Daniel King 
Date: Sun, 6 Mar 2022 14:49:24 -0800 Subject: [PATCH 10/16] Fix up some gpu stuff --- project.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/project.yml b/project.yml index 59739bb..dd2cb41 100644 --- a/project.yml +++ b/project.yml @@ -3,6 +3,7 @@ description: "All the steps needed in the scispaCy pipeline" vars: version_string: "0.5.0" + gpu_id: "0" freqs_loc_s3: "s3://ai2-s2-scispacy/data/gorc_subset.freqs" freqs_loc_local: "assets/gorc_subset.freqs" vectors_loc_s3: "s3://ai2-s2-scispacy/data/pubmed_with_header.txt.gz" @@ -300,7 +301,7 @@ commands: - name: parser-tagger-train-scibert help: "Train the scibert transformer model" script: - - "spacy train ${vars.parser_tagger_scibert_config_loc} --output ${vars.parser_tagger_scibert_loc} --code ${vars.code_loc} --paths.vocab_path ${vars.vocab_lg_loc} --gpu-id 0" + - "spacy train ${vars.parser_tagger_scibert_config_loc} --output ${vars.parser_tagger_scibert_loc} --code ${vars.code_loc} --paths.vocab_path ${vars.vocab_lg_loc} --gpu-id ${vars.gpu_id}" deps: - "${vars.parser_tagger_config_loc}" - "${vars.genia_train_spacy_loc}" @@ -348,7 +349,7 @@ commands: - name: ner-train-scibert help: "Train the scibert ner model." 
script: - - "spacy train ${vars.ner_scibert_config_loc} --output ${vars.ner_scibert_loc} --code ${vars.code_loc} --paths.parser_tagger_path ${vars.parser_tagger_scibert_loc}/model-best --gpu-id 0" + - "spacy train ${vars.ner_scibert_config_loc} --output ${vars.ner_scibert_loc} --code ${vars.code_loc} --paths.parser_tagger_path ${vars.parser_tagger_scibert_loc}/model-best --gpu-id ${vars.gpu_id}" deps: - "${vars.ner_config_loc}" - "${vars.parser_tagger_scibert_loc}/model-best" @@ -427,8 +428,8 @@ commands: - name: evaluate-parser-tagger-scibert help: "Evaluate the parser and tagger scibert model" script: - - "spacy evaluate ${vars.parser_tagger_scibert_loc}/model-best ${vars.genia_test_spacy_loc} --output ${vars.parser_tagger_scibert_loc}/model_best_results.json" - - "spacy evaluate ${vars.parser_tagger_scibert_loc}/model-best ${vars.ontonotes_test_spacy_loc} --output ${vars.parser_tagger_scibert_loc}/model_best_results_onto.json" + - "spacy evaluate ${vars.parser_tagger_scibert_loc}/model-best ${vars.genia_test_spacy_loc} --output ${vars.parser_tagger_scibert_loc}/model_best_results.json --gpu-id ${vars.gpu_id}" + - "spacy evaluate ${vars.parser_tagger_scibert_loc}/model-best ${vars.ontonotes_test_spacy_loc} --output ${vars.parser_tagger_scibert_loc}/model_best_results_onto.json --gpu-id ${vars.gpu_id}" deps: - "${vars.parser_tagger_scibert_loc}/model-best" - "${vars.genia_test_spacy_loc}" @@ -470,7 +471,7 @@ commands: - name: evaluate-ner-scibert help: "Evaluate NER scibert" script: - - "python scripts/evaluate_ner.py --model_path ${vars.ner_scibert_loc}/model-best --dataset medmentions-test --output ${vars.ner_scibert_loc}/model_best_results.json --med_mentions_folder_path assets/" + - "python scripts/evaluate_ner.py --model_path ${vars.ner_scibert_loc}/model-best --dataset medmentions-test --output ${vars.ner_scibert_loc}/model_best_results.json --med_mentions_folder_path assets/ --gpu_id ${vars.gpu_id}" deps: - "${vars.ner_scibert_loc}" - 
"${vars.corpus_pubtator_loc_local}" From ad4fb512896f7cd0386a292923a50266b4a09e59 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 6 Mar 2022 14:49:44 -0800 Subject: [PATCH 11/16] add gpu to ner evaluation script and fix code import --- scripts/evaluate_ner.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/scripts/evaluate_ner.py b/scripts/evaluate_ner.py index 707b50d..cbe6add 100644 --- a/scripts/evaluate_ner.py +++ b/scripts/evaluate_ner.py @@ -4,14 +4,19 @@ import spacy import importlib +from thinc.api import require_gpu + from scispacy.data_util import read_full_med_mentions, read_ner_from_tsv from scispacy.train_utils import evaluate_ner -def main(model_path: str, dataset: str, output_path: str, code: Optional[str], med_mentions_folder_path: Optional[str]): +def main(model_path: str, dataset: str, output_path: str, code: Optional[str], med_mentions_folder_path: Optional[str], gpu_id: Optional[int]): + if gpu_id is not None and gpu_id >= 0: + require_gpu(gpu_id) + if code is not None: # need to import code before loading a spacy model - spec = importlib.util.spec_from_file_location(name, str(loc)) + spec = importlib.util.spec_from_file_location("python_code", str(code)) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) @@ -40,6 +45,7 @@ def main(model_path: str, dataset: str, output_path: str, code: Optional[str], m parser.add_argument("--output_path", type=str, help="Path to write results to") parser.add_argument("--code", type=str, default=None, help="Path to code to import before loading spacy model") parser.add_argument("--med_mentions_folder_path", type=str, default=None, help="Path to the med mentions folder") + parser.add_argument("--gpu_id", type=int, default=-1, help="GPU id to use") args = parser.parse_args() - main(args.model_path, args.dataset, args.output_path, args.code, args.med_mentions_folder_path) \ No newline at end of file + main(args.model_path, args.dataset, 
args.output_path, args.code, args.med_mentions_folder_path, args.gpu_id) \ No newline at end of file From 0d7bcb5c7cfd32f54df1e735fd09d037e462737e Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 6 Mar 2022 15:03:13 -0800 Subject: [PATCH 12/16] Add a couple more gpu args --- project.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/project.yml b/project.yml index dd2cb41..2f11d1a 100644 --- a/project.yml +++ b/project.yml @@ -579,9 +579,9 @@ commands: - name: evaluate-package-scibert help: "Evaluate the packaged scibert model" script: - - "spacy evaluate ${vars.package_scibert_loc} ${vars.genia_test_spacy_loc} --output packages/scibert_genia_results.json" - - "spacy evaluate ${vars.package_scibert_loc} ${vars.ontonotes_test_spacy_loc} --output packages/scibert_onto_results.json" - - "python scripts/evaluate_ner.py --model_path ${vars.package_scibert_loc} --dataset medmentions-test --output packages/scibert_mm_results.json --med_mentions_folder_path assets/" + - "spacy evaluate ${vars.package_scibert_loc} ${vars.genia_test_spacy_loc} --output packages/scibert_genia_results.json --gpu-id ${vars.gpu_id}" + - "spacy evaluate ${vars.package_scibert_loc} ${vars.ontonotes_test_spacy_loc} --output packages/scibert_onto_results.json --gpu-id ${vars.gpu_id}" + - "python scripts/evaluate_ner.py --model_path ${vars.package_scibert_loc} --dataset medmentions-test --output packages/scibert_mm_results.json --med_mentions_folder_path assets/ --gpu_id ${vars.gpu_id}" deps: - "${vars.package_scibert_loc}" outputs: From 136aed077fe4b6a08efc30b9e7a50ff424765576 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 6 Mar 2022 15:03:25 -0800 Subject: [PATCH 13/16] update metrics in docs --- docs/index.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/index.md b/docs/index.md index 63ab93d..8b4ccf3 100644 --- a/docs/index.md +++ b/docs/index.md @@ -35,18 +35,18 @@ Our models achieve performance within 3% of published
state of the art dependenc | model | UAS | LAS | POS | Mentions (F1) | Web UAS | |:---------------|:----|:------|:------|:---|:---| -| en_core_sci_sm | 89.54| 87.62 | 98.32 | 68.15 | 87.62 | -| en_core_sci_md | 89.61| 87.77 | 98.56 | 69.64 | 88.05 | -| en_core_sci_lg | 89.63| 87.81 | 98.56 | 69.61 | 88.08 | -| en_core_sci_scibert | 92.03| 90.25 | 98.91 | 67.91 | 92.21 | +| en_core_sci_sm | 89.27| 87.33 | 98.29 | 68.05 | 87.61 | +| en_core_sci_md | 89.86| 87.92 | 98.43 | 69.32 | 88.05 | +| en_core_sci_lg | 89.54| 87.66 | 98.29 | 69.52 | 87.68 | +| en_core_sci_scibert | 92.28| 90.83 | 98.93 | 67.84 | 92.63 | | model | F1 | Entity Types| |:---------------|:-----|:--------| -| en_ner_craft_md | 76.11|GGP, SO, TAXON, CHEBI, GO, CL| -| en_ner_jnlpba_md | 71.62| DNA, CELL_TYPE, CELL_LINE, RNA, PROTEIN | -| en_ner_bc5cdr_md | 84.49| DISEASE, CHEMICAL| -| en_ner_bionlp13cg_md | 77.75| AMINO_ACID, ANATOMICAL_SYSTEM, CANCER, CELL, CELLULAR_COMPONENT, DEVELOPING_ANATOMICAL_STRUCTURE, GENE_OR_GENE_PRODUCT, IMMATERIAL_ANATOMICAL_ENTITY, MULTI-TISSUE_STRUCTURE, ORGAN, ORGANISM, ORGANISM_SUBDIVISION, ORGANISM_SUBSTANCE, PATHOLOGICAL_FORMATION, SIMPLE_CHEMICAL, TISSUE | +| en_ner_craft_md | 78.35|GGP, SO, TAXON, CHEBI, GO, CL| +| en_ner_jnlpba_md | 70.89| DNA, CELL_TYPE, CELL_LINE, RNA, PROTEIN | +| en_ner_bc5cdr_md | 84.70| DISEASE, CHEMICAL| +| en_ner_bionlp13cg_md | 76.79| AMINO_ACID, ANATOMICAL_SYSTEM, CANCER, CELL, CELLULAR_COMPONENT, DEVELOPING_ANATOMICAL_STRUCTURE, GENE_OR_GENE_PRODUCT, IMMATERIAL_ANATOMICAL_ENTITY, MULTI-TISSUE_STRUCTURE, ORGAN, ORGANISM, ORGANISM_SUBDIVISION, ORGANISM_SUBSTANCE, PATHOLOGICAL_FORMATION, SIMPLE_CHEMICAL, TISSUE | ### Example Usage From 6d33093f319d12b04da28727c3258d26c76d0c79 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 6 Mar 2022 15:05:40 -0800 Subject: [PATCH 14/16] Update version numbers everywhere --- Dockerfile | 2 +- README.md | 18 +++++++++--------- docs/index.md | 16 ++++++++-------- 3 files changed, 18 insertions(+), 18 
deletions(-) diff --git a/Dockerfile b/Dockerfile index 36c3e9a..396daba 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,7 +18,7 @@ WORKDIR /work COPY requirements.in . RUN pip install -r requirements.in -RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz +RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz RUN python -m spacy download en_core_web_sm RUN python -m spacy download en_core_web_md diff --git a/README.md b/README.md index 8501b41..210b36c 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ pip install scispacy to install a model (see our full selection of available models below), run a command like the following: ```bash -pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz +pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz ``` Note: We strongly recommend that you use an isolated Python environment (such as virtualenv or conda) to install scispacy. @@ -76,14 +76,14 @@ pip install CMD-V(to paste the copied URL) | Model | Description | Install URL |:---------------|:------------------|:----------| -| en_core_sci_sm | A full spaCy pipeline for biomedical data with a ~100k vocabulary. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz)| -| en_core_sci_md | A full spaCy pipeline for biomedical data with a ~360k vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz)| -| en_core_sci_lg | A full spaCy pipeline for biomedical data with a ~785k vocabulary and 600k word vectors. 
|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz)| -| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. You may want to [use a GPU](https://spacy.io/usage#gpu) with this model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_scibert-0.4.0.tar.gz)| -| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_craft_md-0.4.0.tar.gz)| -| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_jnlpba_md-0.4.0.tar.gz)| -| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bc5cdr_md-0.4.0.tar.gz)| -| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bionlp13cg_md-0.4.0.tar.gz)| +| en_core_sci_sm | A full spaCy pipeline for biomedical data with a ~100k vocabulary. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz)| +| en_core_sci_md | A full spaCy pipeline for biomedical data with a ~360k vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_md-0.5.0.tar.gz)| +| en_core_sci_lg | A full spaCy pipeline for biomedical data with a ~785k vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_lg-0.5.0.tar.gz)| +| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. 
You may want to [use a GPU](https://spacy.io/usage#gpu) with this model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_scibert-0.5.0.tar.gz)| +| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_craft_md-0.5.0.tar.gz)| +| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_jnlpba_md-0.5.0.tar.gz)| +| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz)| +| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bionlp13cg_md-0.5.0.tar.gz)| ## Additional Pipeline Components diff --git a/docs/index.md b/docs/index.md index 8b4ccf3..ea5b945 100644 --- a/docs/index.md +++ b/docs/index.md @@ -17,14 +17,14 @@ pip install | Model | Description | Install URL |:---------------|:------------------|:----------| -| en_core_sci_sm | A full spaCy pipeline for biomedical data. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz)| -| en_core_sci_md | A full spaCy pipeline for biomedical data with a larger vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz)| -| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_scibert-0.4.0.tar.gz)| -| en_core_sci_lg | A full spaCy pipeline for biomedical data with a larger vocabulary and 600k word vectors. 
|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz)| -| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_craft_md-0.4.0.tar.gz)| -| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_jnlpba_md-0.4.0.tar.gz)| -| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bc5cdr_md-0.4.0.tar.gz)| -| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bionlp13cg_md-0.4.0.tar.gz)| +| en_core_sci_sm | A full spaCy pipeline for biomedical data. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz)| +| en_core_sci_md | A full spaCy pipeline for biomedical data with a larger vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_md-0.5.0.tar.gz)| +| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_scibert-0.5.0.tar.gz)| +| en_core_sci_lg | A full spaCy pipeline for biomedical data with a larger vocabulary and 600k word vectors. 
|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_lg-0.5.0.tar.gz)| +| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_craft_md-0.5.0.tar.gz)| +| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_jnlpba_md-0.5.0.tar.gz)| +| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bc5cdr_md-0.5.0.tar.gz)| +| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bionlp13cg_md-0.5.0.tar.gz)| From c1d9bab895bcb93c4edc9090aebc8356eb205015 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 9 Mar 2022 20:23:39 -0800 Subject: [PATCH 15/16] dummy change --- docs/index.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index ea5b945..8579f9e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -28,7 +28,6 @@ pip install - ### Performance Our models achieve performance within 3% of published state of the art dependency parsers and within 0.4% accuracy of state of the art biomedical POS taggers. 
From 8ff659f4414999cc2ab4b2be39b199aa0cb53d76 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 9 Mar 2022 22:51:49 -0800 Subject: [PATCH 16/16] mypy --- scispacy/abbreviation.py | 2 +- scispacy/hearst_patterns.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/scispacy/abbreviation.py b/scispacy/abbreviation.py index cf5e4aa..e2ac595 100644 --- a/scispacy/abbreviation.py +++ b/scispacy/abbreviation.py @@ -229,7 +229,7 @@ def find_matches_for( to_remove = set() global_matches = self.global_matcher(doc) for match, start, end in global_matches: - string_key = self.global_matcher.vocab.strings[match] + string_key = self.global_matcher.vocab.strings[match] # type: ignore to_remove.add(string_key) all_occurences[rules[string_key]].add(doc[start:end]) for key in to_remove: diff --git a/scispacy/hearst_patterns.py b/scispacy/hearst_patterns.py index c13446f..992b7a4 100644 --- a/scispacy/hearst_patterns.py +++ b/scispacy/hearst_patterns.py @@ -1,3 +1,5 @@ +from typing import List, Dict, Any + """ BSD 3-Clause License @@ -35,7 +37,7 @@ punct = {"IS_PUNCT": True, "OP": "?"} det = {"ORTH": "*", "OP": "*"} -BASE_PATTERNS = [ +BASE_PATTERNS: List[Dict[str, Any]] = [ # '(NP_\\w+ (, )?such as (NP_\\w+ ?(, )?(and |or )?)+)', 'first' { "label": "such_as",