From 36528a96ba5844a0024cfc4ed2ece61a4f52cb5b Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 26 Aug 2023 00:03:43 -0700 Subject: [PATCH 01/11] update requirements --- requirements.in | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.in b/requirements.in index dc2f1f2..cb24b3e 100644 --- a/requirements.in +++ b/requirements.in @@ -1,5 +1,5 @@ numpy -spacy>=3.4.0,<3.5.0 +spacy>=3.6.0,<3.7.0 spacy-lookups-data pandas requests>=2.0.0,<3.0.0 diff --git a/setup.py b/setup.py index 91373dc..da52d2c 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), license="Apache", install_requires=[ - "spacy>=3.4.0,<3.5.0", + "spacy>=3.6.0,<3.7.0", "requests>=2.0.0,<3.0.0", "conllu", "numpy", From 2474bd6b1d703e6f681af351e2638087a701de83 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 26 Aug 2023 00:13:35 -0700 Subject: [PATCH 02/11] minor config changes based on latest spacy --- configs/base_ner.cfg | 4 ++-- configs/base_ner_scibert.cfg | 4 ++-- configs/base_parser_tagger.cfg | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/configs/base_ner.cfg b/configs/base_ner.cfg index 00b0506..bdf0ff7 100644 --- a/configs/base_ner.cfg +++ b/configs/base_ner.cfg @@ -48,8 +48,8 @@ nO = null [components.ner.model.tok2vec.embed] @architectures = "spacy.MultiHashEmbed.v2" width = 96 -attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "SPACY"] -rows = [5000, 2500, 2500, 2500, 100] +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] +rows = [5000, 1000, 2500, 2500] include_static_vectors = ${vars.include_static_vectors} [components.ner.model.tok2vec.encode] diff --git a/configs/base_ner_scibert.cfg b/configs/base_ner_scibert.cfg index c8b7371..7e3a839 100644 --- a/configs/base_ner_scibert.cfg +++ b/configs/base_ner_scibert.cfg @@ -45,8 +45,8 @@ nO = null [components.ner.model.tok2vec.embed] @architectures = "spacy.MultiHashEmbed.v2" width = 96 -attrs = 
["NORM", "PREFIX", "SUFFIX", "SHAPE", "SPACY"] -rows = [5000, 2500, 2500, 2500, 100] +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] +rows = [5000, 1000, 2500, 2500] include_static_vectors = false [components.ner.model.tok2vec.encode] diff --git a/configs/base_parser_tagger.cfg b/configs/base_parser_tagger.cfg index 738c85c..6801de5 100644 --- a/configs/base_parser_tagger.cfg +++ b/configs/base_parser_tagger.cfg @@ -73,8 +73,8 @@ factory = "tok2vec" [components.tok2vec.model.embed] @architectures = "spacy.MultiHashEmbed.v2" width = ${components.tok2vec.model.encode.width} -attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "SPACY"] -rows = [5000, 2500, 2500, 2500, 100] +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "SPACY", "IS_SPACE"] +rows = [5000, 1000, 2500, 2500, 50, 50] include_static_vectors = ${vars.include_static_vectors} [components.tok2vec.model.encode] From 8e72a496de0987ba1e17c903d2828ac4395481ba Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 26 Aug 2023 00:55:39 -0700 Subject: [PATCH 03/11] pin scipy --- requirements.in | 1 + setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/requirements.in b/requirements.in index cb24b3e..df702bf 100644 --- a/requirements.in +++ b/requirements.in @@ -1,4 +1,5 @@ numpy +scipy<1.11 spacy>=3.6.0,<3.7.0 spacy-lookups-data pandas diff --git a/setup.py b/setup.py index da52d2c..c43dca7 100644 --- a/setup.py +++ b/setup.py @@ -42,6 +42,7 @@ license="Apache", install_requires=[ "spacy>=3.6.0,<3.7.0", + "scipy<1.11", "requests>=2.0.0,<3.0.0", "conllu", "numpy", From 81df1dde2a653ff431cc04e2837ab3d1617f9e05 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 27 Aug 2023 12:08:08 -0700 Subject: [PATCH 04/11] update release instructions --- RELEASE.md | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 65b2e95..d59211c 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -15,16 +15,11 @@ Update the version in version.py. 
#### Training new models -For the release, new models should be trained using the `scripts/pipeline.sh` and `scripts/ner_pipeline.sh` scripts, for the small, medium and large models, and specialized NER models. Remember to export the `ONTONOTES_PATH` and `ONTONOTES_PERCENT` environment variables to mix in the ontonotes training data. +The entire pipeline can be run using `spacy project run all`. This will train and package all the models. -``` -bash scripts/pipeline.sh small -bash scripts/pipeline.sh medium -bash scripts/pipeline.sh large -bash scripts/ner_pipeline.sh -``` +The packages should then be uploaded to the `https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/{VERSION}` S3 bucket, and references to previous models (e.g. in the readme and in the docs) should be updated. You can find all these places using `git grep <previous version>`. -these should then be uploaded to the `https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/{VERSION}` S3 bucket, and references to previous models (e.g in the readme and in the docs) should be updated. You can find all these places using `git grep `. +The scripts `install_local_packages.py`, `install_remote_packages.py`, `print_out_metrics.py`, `smoke_test.py`, and `uninstall_local_packages.py` are useful for testing at each step of the process. Before uploading, `install_local_packages.py` and `smoke_test.py` can be used to make sure the packages are installable and do a quick check of output. `print_out_metrics.py` can then be used to easily get the metrics that need to be updated in the README. Once the packages have been uploaded, `uninstall_local_packages.py`, `install_remote_packages.py`, and `smoke_test.py` can be used to ensure everything was uploaded correctly. #### Merge a PR with the above changes Merge a PR with the above changes, and publish a release with a tag corresponding to the commit from the merged PR. 
This should trigger the publish github action, which will create the `scispacy` package and publish it to pypi. From cd7dfcde871f39df63fca768f0cf937ee22f4624 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sun, 27 Aug 2023 12:08:17 -0700 Subject: [PATCH 05/11] update version --- scispacy/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scispacy/version.py b/scispacy/version.py index 7b6fdbf..cb96d45 100644 --- a/scispacy/version.py +++ b/scispacy/version.py @@ -1,6 +1,6 @@ _MAJOR = "0" _MINOR = "5" -_REVISION = "2" +_REVISION = "3" VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR) VERSION = "{0}.{1}.{2}".format(_MAJOR, _MINOR, _REVISION) From ccf9e9b511955915b9244a47e0ff9de0e3d363cd Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 4 Sep 2023 00:20:23 -0700 Subject: [PATCH 06/11] update version again --- project.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/project.yml b/project.yml index 91eae4e..ce7190b 100644 --- a/project.yml +++ b/project.yml @@ -2,7 +2,7 @@ title: "scispaCy pipeline" description: "All the steps needed in the scispaCy pipeline" vars: - version_string: "0.5.2" + version_string: "0.5.3" gpu_id: 0 freqs_loc_s3: "s3://ai2-s2-scispacy/data/gorc_subset.freqs" freqs_loc_local: "assets/gorc_subset.freqs" @@ -166,9 +166,9 @@ commands: - "aws s3 cp ${vars.genia_loc_s3}/train.json ${vars.genia_train_loc_local} --no-sign-request" - "aws s3 cp ${vars.genia_loc_s3}/dev.json ${vars.genia_dev_loc_local} --no-sign-request" - "aws s3 cp ${vars.genia_loc_s3}/test.json ${vars.genia_test_loc_local} --no-sign-request" - - "aws s3 cp ${vars.ontonotes_loc_s3} ${vars.ontonotes_loc_local}.tar.gz" - - "tar -xzvf ${vars.ontonotes_loc_local}.tar.gz -C assets/" - - "rm ${vars.ontonotes_loc_local}.tar.gz" + # - "aws s3 cp ${vars.ontonotes_loc_s3} ${vars.ontonotes_loc_local}.tar.gz" + # - "tar -xzvf ${vars.ontonotes_loc_local}.tar.gz -C assets/" + # - "rm ${vars.ontonotes_loc_local}.tar.gz" - "aws s3 cp 
${vars.med_mentions_loc_s3} assets/med_mentions.tar.gz --no-sign-request" - "tar -xzvf assets/med_mentions.tar.gz -C assets/" - "rm assets/med_mentions.tar.gz" From adeaba4769b071f4e42dc1589ba0c9996adfdfc8 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Wed, 6 Sep 2023 23:23:10 -0700 Subject: [PATCH 07/11] update numbers --- docs/index.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/index.md b/docs/index.md index 65b94cd..7f72404 100644 --- a/docs/index.md +++ b/docs/index.md @@ -34,18 +34,18 @@ Our models achieve performance within 3% of published state of the art dependenc | model | UAS | LAS | POS | Mentions (F1) | Web UAS | |:---------------|:----|:------|:------|:---|:---| -| en_core_sci_sm | 89.03| 87.00 | 98.13 | 67.87 | 87.42 | -| en_core_sci_md | 89.73| 87.85 | 98.40 | 69.53 | 87.79 | -| en_core_sci_lg | 89.75| 87.79 | 98.49 | 69.69 | 87.74 | -| en_core_sci_scibert | 92.21| 90.65 | 98.86 | 68.01 | 92.58 | +| en_core_sci_sm | 89.39| 87.41 | 98.32 | 68.00 | 87.65 | +| en_core_sci_md | 90.23| 88.39 | 98.39 | 68.95 | 87.63 | +| en_core_sci_lg | 89.98| 88.15 | 98.50 | 68.67 | 88.21 | +| en_core_sci_scibert | 92.54| 91.02 | 98.89 | 67.90 | 92.85 | | model | F1 | Entity Types| |:---------------|:-----|:--------| -| en_ner_craft_md | 76.75|GGP, SO, TAXON, CHEBI, GO, CL| -| en_ner_jnlpba_md | 72.28| DNA, CELL_TYPE, CELL_LINE, RNA, PROTEIN | -| en_ner_bc5cdr_md | 84.53| DISEASE, CHEMICAL| -| en_ner_bionlp13cg_md | 76.57| AMINO_ACID, ANATOMICAL_SYSTEM, CANCER, CELL, CELLULAR_COMPONENT, DEVELOPING_ANATOMICAL_STRUCTURE, GENE_OR_GENE_PRODUCT, IMMATERIAL_ANATOMICAL_ENTITY, MULTI-TISSUE_STRUCTURE, ORGAN, ORGANISM, ORGANISM_SUBDIVISION, ORGANISM_SUBSTANCE, PATHOLOGICAL_FORMATION, SIMPLE_CHEMICAL, TISSUE | +| en_ner_craft_md | 77.56|GGP, SO, TAXON, CHEBI, GO, CL| +| en_ner_jnlpba_md | 72.98| DNA, CELL_TYPE, CELL_LINE, RNA, PROTEIN | +| en_ner_bc5cdr_md | 84.23| DISEASE, CHEMICAL| +| en_ner_bionlp13cg_md | 77.36| AMINO_ACID, 
ANATOMICAL_SYSTEM, CANCER, CELL, CELLULAR_COMPONENT, DEVELOPING_ANATOMICAL_STRUCTURE, GENE_OR_GENE_PRODUCT, IMMATERIAL_ANATOMICAL_ENTITY, MULTI-TISSUE_STRUCTURE, ORGAN, ORGANISM, ORGANISM_SUBDIVISION, ORGANISM_SUBSTANCE, PATHOLOGICAL_FORMATION, SIMPLE_CHEMICAL, TISSUE | ### Example Usage From 3a87799074e14f9a9839c899cbfdd9996d67e2c5 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Mon, 11 Sep 2023 00:16:01 -0700 Subject: [PATCH 08/11] undo comment --- project.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/project.yml b/project.yml index ce7190b..4c75c93 100644 --- a/project.yml +++ b/project.yml @@ -166,9 +166,9 @@ commands: - "aws s3 cp ${vars.genia_loc_s3}/train.json ${vars.genia_train_loc_local} --no-sign-request" - "aws s3 cp ${vars.genia_loc_s3}/dev.json ${vars.genia_dev_loc_local} --no-sign-request" - "aws s3 cp ${vars.genia_loc_s3}/test.json ${vars.genia_test_loc_local} --no-sign-request" - # - "aws s3 cp ${vars.ontonotes_loc_s3} ${vars.ontonotes_loc_local}.tar.gz" - # - "tar -xzvf ${vars.ontonotes_loc_local}.tar.gz -C assets/" - # - "rm ${vars.ontonotes_loc_local}.tar.gz" + - "aws s3 cp ${vars.ontonotes_loc_s3} ${vars.ontonotes_loc_local}.tar.gz" + - "tar -xzvf ${vars.ontonotes_loc_local}.tar.gz -C assets/" + - "rm ${vars.ontonotes_loc_local}.tar.gz" - "aws s3 cp ${vars.med_mentions_loc_s3} assets/med_mentions.tar.gz --no-sign-request" - "tar -xzvf assets/med_mentions.tar.gz -C assets/" - "rm assets/med_mentions.tar.gz" From 2c5c6a53a411df68d78138297e820003a1e9b0bf Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 30 Sep 2023 12:40:17 -0700 Subject: [PATCH 09/11] update links --- docs/index.md | 16 ++++++++-------- scripts/install_remote_packages.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/index.md b/docs/index.md index 7f72404..5313b2f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -17,14 +17,14 @@ pip install | Model | Description | Install URL 
|:---------------|:------------------|:----------| -| en_core_sci_sm | A full spaCy pipeline for biomedical data. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz)| -| en_core_sci_md | A full spaCy pipeline for biomedical data with a larger vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_md-0.5.1.tar.gz)| -| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_scibert-0.5.1.tar.gz)| -| en_core_sci_lg | A full spaCy pipeline for biomedical data with a larger vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_lg-0.5.1.tar.gz)| -| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_craft_md-0.5.1.tar.gz)| -| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_jnlpba_md-0.5.1.tar.gz)| -| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz)| -| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bionlp13cg_md-0.5.1.tar.gz)| +| en_core_sci_sm | A full spaCy pipeline for biomedical data. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_sm-0.5.3.tar.gz)| +| en_core_sci_md | A full spaCy pipeline for biomedical data with a larger vocabulary and 50k word vectors. 
|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_md-0.5.3.tar.gz)| +| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_scibert-0.5.3.tar.gz)| +| en_core_sci_lg | A full spaCy pipeline for biomedical data with a larger vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_lg-0.5.3.tar.gz)| +| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_craft_md-0.5.3.tar.gz)| +| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_jnlpba_md-0.5.3.tar.gz)| +| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_bc5cdr_md-0.5.3.tar.gz)| +| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. 
| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_bionlp13cg_md-0.5.3.tar.gz)| diff --git a/scripts/install_remote_packages.py b/scripts/install_remote_packages.py index 60ff0f5..6232a5c 100644 --- a/scripts/install_remote_packages.py +++ b/scripts/install_remote_packages.py @@ -4,7 +4,7 @@ def main(): - s3_prefix = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/" + s3_prefix = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/" model_names = [ "en_core_sci_sm", "en_core_sci_md", From c6c35be951cd55dca660f60e78dbadefc2577d77 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 30 Sep 2023 12:42:26 -0700 Subject: [PATCH 10/11] update readme links --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index c74af2c..bd8b1f7 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ pip install scispacy to install a model (see our full selection of available models below), run a command like the following: ```bash -pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz +pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_sm-0.5.3.tar.gz ``` Note: We strongly recommend that you use an isolated Python environment (such as virtualenv or conda) to install scispacy. @@ -76,14 +76,14 @@ pip install CMD-V(to paste the copied URL) | Model | Description | Install URL |:---------------|:------------------|:----------| -| en_core_sci_sm | A full spaCy pipeline for biomedical data with a ~100k vocabulary. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz)| -| en_core_sci_md | A full spaCy pipeline for biomedical data with a ~360k vocabulary and 50k word vectors. 
|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_md-0.5.1.tar.gz)| -| en_core_sci_lg | A full spaCy pipeline for biomedical data with a ~785k vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_lg-0.5.1.tar.gz)| -| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. You may want to [use a GPU](https://spacy.io/usage#gpu) with this model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_scibert-0.5.1.tar.gz)| -| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_craft_md-0.5.1.tar.gz)| -| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_jnlpba_md-0.5.1.tar.gz)| -| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz)| -| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bionlp13cg_md-0.5.1.tar.gz)| +| en_core_sci_sm | A full spaCy pipeline for biomedical data with a ~100k vocabulary. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_sm-0.5.3.tar.gz)| +| en_core_sci_md | A full spaCy pipeline for biomedical data with a ~360k vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_md-0.5.3.tar.gz)| +| en_core_sci_lg | A full spaCy pipeline for biomedical data with a ~785k vocabulary and 600k word vectors. 
|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_lg-0.5.3.tar.gz)| +| en_core_sci_scibert | A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. You may want to [use a GPU](https://spacy.io/usage#gpu) with this model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_scibert-0.5.3.tar.gz)| +| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_craft_md-0.5.3.tar.gz)| +| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_jnlpba_md-0.5.3.tar.gz)| +| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_bc5cdr_md-0.5.3.tar.gz)| +| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_bionlp13cg_md-0.5.3.tar.gz)| ## Additional Pipeline Components From b4cef3d61a8ef0f181ca988e1977f5fb81940de7 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Sat, 30 Sep 2023 12:43:22 -0700 Subject: [PATCH 11/11] update version in dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 8417070..42d8f6e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,7 +18,7 @@ WORKDIR /work COPY requirements.in . RUN pip install -r requirements.in -RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz +RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_sm-0.5.3.tar.gz RUN python -m spacy download en_core_web_sm RUN python -m spacy download en_core_web_md