diff --git a/Dockerfile b/Dockerfile
index a3f69d14..662bd814 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -26,7 +26,7 @@ COPY tests/ tests/
 COPY .flake8 .flake8
 
 RUN pip install -r requirements.in
-RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz
+RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz
 RUN python -m spacy download en_core_web_sm
 RUN python -m spacy download en_core_web_md
diff --git a/README.md b/README.md
index acae4fa8..63cc2d77 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ pip install scispacy
 to install a model (see our full selection of available models below), run a command like the following:
 
 ```bash
-pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz
+pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz
 ```
 
 Note: We strongly recommend that you use an isolated Python environment (such as virtualenv or conda) to install scispacy.
@@ -77,13 +77,13 @@ pip install CMD-V(to paste the copied URL)
 
 | Model | Description | Install URL
 |:---------------|:------------------|:----------|
-| en_core_sci_sm | A full spaCy pipeline for biomedical data with a ~100k vocabulary. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz)|
-| en_core_sci_md | A full spaCy pipeline for biomedical data with a ~360k vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_md-0.2.4.tar.gz)|
-| en_core_sci_lg | A full spaCy pipeline for biomedical data with a ~785k vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz)|
-| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_craft_md-0.2.4.tar.gz)|
-| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_jnlpba_md-0.2.4.tar.gz)|
-| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_bc5cdr_md-0.2.4.tar.gz)|
-| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_bionlp13cg_md-0.2.4.tar.gz)|
+| en_core_sci_sm | A full spaCy pipeline for biomedical data with a ~100k vocabulary. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz)|
+| en_core_sci_md | A full spaCy pipeline for biomedical data with a ~360k vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_md-0.2.5.tar.gz)|
+| en_core_sci_lg | A full spaCy pipeline for biomedical data with a ~785k vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_lg-0.2.5.tar.gz)|
+| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_craft_md-0.2.5.tar.gz)|
+| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_jnlpba_md-0.2.5.tar.gz)|
+| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_bc5cdr_md-0.2.5.tar.gz)|
+| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_bionlp13cg_md-0.2.5.tar.gz)|
 
 ## Additional Pipeline Components
diff --git a/docs/index.md b/docs/index.md
index e56067df..9f97758e 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -17,13 +17,13 @@ pip install
 
 | Model | Description | Install URL
 |:---------------|:------------------|:----------|
-| en_core_sci_sm | A full spaCy pipeline for biomedical data. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_sm-0.2.4.tar.gz)|
-| en_core_sci_md | A full spaCy pipeline for biomedical data with a larger vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_md-0.2.4.tar.gz)|
-| en_core_sci_lg | A full spaCy pipeline for biomedical data with a larger vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz)|
-| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_craft_md-0.2.4.tar.gz)|
-| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_jnlpba_md-0.2.4.tar.gz)|
-| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_bc5cdr_md-0.2.4.tar.gz)|
-| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_bionlp13cg_md-0.2.4.tar.gz)|
+| en_core_sci_sm | A full spaCy pipeline for biomedical data. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz)|
+| en_core_sci_md | A full spaCy pipeline for biomedical data with a larger vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_md-0.2.5.tar.gz)|
+| en_core_sci_lg | A full spaCy pipeline for biomedical data with a larger vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_lg-0.2.5.tar.gz)|
+| en_ner_craft_md| A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_craft_md-0.2.5.tar.gz)|
+| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_jnlpba_md-0.2.5.tar.gz)|
+| en_ner_bc5cdr_md | A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_bc5cdr_md-0.2.5.tar.gz)|
+| en_ner_bionlp13cg_md | A spaCy NER model trained on the BIONLP13CG corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_bionlp13cg_md-0.2.5.tar.gz)|
@@ -34,17 +34,17 @@ Our models achieve performance within 3% of published state of the art dependenc
 
 | model | UAS | LAS | POS | Mentions (F1) | Web UAS |
 |:---------------|:----|:------|:------|:---|:---|
-| en_core_sci_sm | 89.36| 87.41 | 98.30 | 67.12 | 85.46 |
-| en_core_sci_md | 90.08| 88.26 | 98.51 | 69.17 | 86.88 |
-| en_core_sci_lg | 90.11| 88.31 | 98.52 | 69.08 | 85.16 |
+| en_core_sci_sm | 89.26| 87.38 | 98.38 | 67.14 | 87.18 |
+| en_core_sci_md | 89.92| 88.01 | 98.54 | 69.46 | 88.20 |
+| en_core_sci_lg | 89.81| 88.02 | 98.57 | 69.29 | 88.11 |
 
 | model | F1 | Entity Types|
 |:---------------|:-----|:--------|
-| en_ner_craft_md | 76.60|GGP, SO, TAXON, CHEBI, GO, CL|
-| en_ner_jnlpba_md | 74.26| DNA, CELL_TYPE, CELL_LINE, RNA, PROTEIN |
-| en_ner_bc5cdr_md | 85.02| DISEASE, CHEMICAL|
-| en_ner_bionlp13cg_md | 78.28|CANCER, ORGAN, TISSUE, ORGANISM, CELL, AMINO_ACID, GENE_OR_GENE_PRODUCT, SIMPLE_CHEMICAL, ANATOMICAL_SYSTEM, IMMATERIAL_ANATOMICAL_ENTITY, MULTI-TISSUE_STRUCTURE, DEVELOPING_ANATOMICAL_STRUCTURE, ORGANISM_SUBDIVISION, CELLULAR_COMPONENT|
+| en_ner_craft_md | 75.02|GGP, SO, TAXON, CHEBI, GO, CL|
+| en_ner_jnlpba_md | 73.56| DNA, CELL_TYPE, CELL_LINE, RNA, PROTEIN |
+| en_ner_bc5cdr_md | 84.94| DISEASE, CHEMICAL|
+| en_ner_bionlp13cg_md | 78.09|CANCER, ORGAN, TISSUE, ORGANISM, CELL, AMINO_ACID, GENE_OR_GENE_PRODUCT, SIMPLE_CHEMICAL, ANATOMICAL_SYSTEM, IMMATERIAL_ANATOMICAL_ENTITY, MULTI-TISSUE_STRUCTURE, DEVELOPING_ANATOMICAL_STRUCTURE, ORGANISM_SUBDIVISION, CELLULAR_COMPONENT|
 
 ### Example Usage
diff --git a/requirements.in b/requirements.in
index 79559a37..fede57e6 100644
--- a/requirements.in
+++ b/requirements.in
@@ -1,5 +1,5 @@
 numpy
-spacy>=2.2.1
+spacy>=2.3.0,<3.0.0
 spacy-lookups-data
 pandas
 requests>=2.0.0,<3.0.0
diff --git a/scispacy/file_cache.py b/scispacy/file_cache.py
index 5a519da0..744c6d23 100644
--- a/scispacy/file_cache.py
+++ b/scispacy/file_cache.py
@@ -126,7 +126,7 @@ def get_from_cache(url: str, cache_dir: str = None) -> str:
     if not os.path.exists(cache_path):
         # Download to temporary file, then copy to cache dir once finished.
         # Otherwise you get corrupt cache entries if the download gets interrupted.
-        with tempfile.NamedTemporaryFile() as temp_file:
+        with tempfile.NamedTemporaryFile() as temp_file:  # type: IO
            print(f"{url} not found in cache, downloading to {temp_file.name}")
 
            # GET file object
diff --git a/scispacy/version.py b/scispacy/version.py
index 53df94b1..07e77d87 100644
--- a/scispacy/version.py
+++ b/scispacy/version.py
@@ -1,6 +1,6 @@
 _MAJOR = "0"
 _MINOR = "2"
-_REVISION = "4-unreleased"
+_REVISION = "5-unreleased"
 
 VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
 VERSION = "{0}.{1}.{2}".format(_MAJOR, _MINOR, _REVISION)
diff --git a/scripts/init_model.py b/scripts/init_model.py
index 9bea83be..1bd71a23 100644
--- a/scripts/init_model.py
+++ b/scripts/init_model.py
@@ -105,7 +105,6 @@ def create_model(lang, probs, oov_prob, vectors_data, vector_keys, expand_vector
         lexeme = nlp.vocab[word]
         lexeme.rank = i
         lexeme.prob = prob
-        lexeme.is_oov = False
         # Decode as a little-endian string, so that we can do & 15 to get
         # the first 4 bits. See _parse_features.pyx
         lexeme.cluster = 0
@@ -117,7 +116,6 @@
     for i, word in enumerate(vector_keys):
         if word not in nlp.vocab and expand_vectors:
             lexeme = nlp.vocab[word]
-            lexeme.is_oov = False
             lex_added += 1
         elif word in nlp.vocab and not expand_vectors:
             new_keys.append(word)
diff --git a/scripts/parser.sh b/scripts/parser.sh
index 26dd350e..90cec465 100644
--- a/scripts/parser.sh
+++ b/scripts/parser.sh
@@ -1,4 +1,4 @@
-#!/user/bin/env bash
+#!/usr/bin/env bash
 
 set -e
 
diff --git a/scripts/train_parser_and_tagger.py b/scripts/train_parser_and_tagger.py
index 3591212c..d8faf8c4 100644
--- a/scripts/train_parser_and_tagger.py
+++ b/scripts/train_parser_and_tagger.py
@@ -102,13 +102,13 @@ def train_parser_and_tagger(train_json_path: str,
     train_docs = train_corpus.train_docs(nlp)
     train_docs = list(train_docs)
-    train_mixture = train_docs
     if ontonotes_path:
-        onto_train_docs = onto_train_corpus.train_docs(nlp)
+        # Ignoring misaligned because the ontonotes raw text does not always match the tokenized text
+        onto_train_docs = onto_train_corpus.train_docs(nlp, ignore_misaligned=True)
         onto_train_docs = list([doc for doc in onto_train_docs if len(doc[0]) > 0])
         num_onto_docs = int(float(ontonotes_train_percent)*len(onto_train_docs))
         randomly_sampled_onto = random.sample(onto_train_docs, num_onto_docs)
-        train_mixture += randomly_sampled_onto
+        train_docs += randomly_sampled_onto
 
     row_head, output_stats = _configure_training_output(nlp.pipe_names, -1, False)
     row_widths = [len(w) for w in row_head]
@@ -121,7 +121,7 @@
     best_epoch = 0
     best_epoch_uas = 0.0
     for i in range(20):
-        random.shuffle(train_mixture)
+        random.shuffle(train_docs)
         with nlp.disable_pipes(*other_pipes):
             with tqdm(total=n_train_words, leave=False) as pbar:
                 losses = {}
@@ -152,7 +152,8 @@
         cpu_wps = nwords/(end_time-start_time)
 
         if ontonotes_path:
-            onto_dev_docs = list([doc for doc in onto_train_corpus.dev_docs(nlp_loaded) if len(doc[0]) > 0])
+            # Ignoring misaligned docs because the ontonotes raw text does not always match the tokenized text
+            onto_dev_docs = list([doc for doc in onto_train_corpus.dev_docs(nlp_loaded, ignore_misaligned=True) if len(doc[0]) > 0])
 
             onto_scorer = nlp_loaded.evaluate(onto_dev_docs)
@@ -200,7 +201,8 @@
         meta_fp.write(json.dumps(meta))
 
     if ontonotes_path:
-        onto_test_docs = list([doc for doc in onto_test_corpus.dev_docs(nlp_loaded) if len(doc[0]) > 0])
+        # Ignoring misaligned docs because the ontonotes raw text does not always match the tokenized text
+        onto_test_docs = list([doc for doc in onto_test_corpus.dev_docs(nlp_loaded, ignore_misaligned=True) if len(doc[0]) > 0])
         print("Retrained ontonotes evaluation")
         scorer_onto_retrained = nlp_loaded.evaluate(onto_test_docs)
         print("Test results:")
diff --git a/scripts/train_specialised_ner.py b/scripts/train_specialised_ner.py
index 9f5c93f4..a37d2d22 100644
--- a/scripts/train_specialised_ner.py
+++ b/scripts/train_specialised_ner.py
@@ -90,7 +90,8 @@ def train(model, train_data, dev_data, test_data, output_dir, n_iter, meta_overr
                                 util.env_opt('batch_to', 32),
                                 util.env_opt('batch_compound', 1.001))
 
-    optimizer = nlp.begin_training()
+    with nlp.disable_pipes(*other_pipes):
+        optimizer = nlp.begin_training()
     best_epoch = 0
     best_f1 = 0
     for i in range(n_iter):
diff --git a/setup.py b/setup.py
index 979df6f9..adc99867 100644
--- a/setup.py
+++ b/setup.py
@@ -21,41 +21,34 @@ exec(version_file.read(), VERSION)
 
 setup(
-    name = 'scispacy',
-    version = VERSION["VERSION"],
-    url = 'https://allenai.github.io/SciSpaCy/',
-    author = 'Allen Institute for Artificial Intelligence',
-    author_email = 'ai2-info@allenai.org',
-    description = 'A full SpaCy pipeline and models for scientific/biomedical documents.',
+    name="scispacy",
+    version=VERSION["VERSION"],
+    url="https://allenai.github.io/SciSpaCy/",
+    author="Allen Institute for Artificial Intelligence",
+    author_email="ai2-info@allenai.org",
+    description="A full SpaCy pipeline and models for scientific/biomedical documents.",
     long_description=open("README.md").read(),
     long_description_content_type="text/markdown",
-    keywords = ["bioinformatics nlp spacy SpaCy biomedical"],
+    keywords=["bioinformatics nlp spacy SpaCy biomedical"],
     classifiers=[
-        'Intended Audience :: Science/Research',
-        'Development Status :: 3 - Alpha',
-        'License :: OSI Approved :: Apache Software License',
-        'Programming Language :: Python :: 3.6',
-        'Topic :: Scientific/Engineering :: Artificial Intelligence',
-        'Topic :: Scientific/Engineering :: Bio-Informatics',
+        "Intended Audience :: Science/Research",
+        "Development Status :: 3 - Alpha",
+        "License :: OSI Approved :: Apache Software License",
+        "Programming Language :: Python :: 3.6",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Topic :: Scientific/Engineering :: Bio-Informatics",
     ],
-    packages = find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
+    packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
     license="Apache",
     install_requires=[
-        "spacy>=2.2.1",
-        "requests>=2.0.0,<3.0.0"
-        "conllu",
+        "spacy>=2.3.0,<3.0.0",
+        "requests>=2.0.0,<3.0.0" "conllu",
         "numpy",
         "joblib",
         "nmslib>=1.7.3.6",
         "scikit-learn>=0.20.3",
-        "pysbd"
-    ],
-    tests_require=[
-        "pytest",
-        "pytest-cov",
-        "flake8",
-        "black",
-        "mypy"
-    ],
-    python_requires='>=3.6.0',
+        "pysbd",
+    ],
+    tests_require=["pytest", "pytest-cov", "flake8", "black", "mypy"],
+    python_requires=">=3.6.0",
 )
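
For readers unfamiliar with the two spaCy 2.x idioms used in the training-script changes above (`ignore_misaligned=True` on `GoldCorpus.train_docs` and wrapping `begin_training()` in `disable_pipes`), here is a minimal sketch of how they fit together. This is illustrative only and is not part of the diff; the corpus paths and the pipe name are hypothetical placeholders.

```python
# Sketch only -- not code from this repository. Illustrates the spaCy 2.x APIs
# touched by the training-script changes above; paths and pipe names are hypothetical.
import random

import spacy
from spacy.gold import GoldCorpus

nlp = spacy.blank("en")
nlp.add_pipe(nlp.create_pipe("ner"))

# ignore_misaligned=True tells GoldCorpus to skip examples whose raw text cannot
# be aligned with the gold tokenization, rather than raising an error.
corpus = GoldCorpus("train.json", "dev.json")  # hypothetical paths
train_docs = list(corpus.train_docs(nlp, ignore_misaligned=True))

# Disabling the pipes that are not being trained before begin_training() keeps
# their existing weights from being (re)initialised.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()

random.shuffle(train_docs)
```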