From c135a6afa61e3ab88025142b7a0cd7d046f5c689 Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Tue, 7 Jul 2020 15:13:12 -0700 Subject: [PATCH 1/7] add source filtering, remove old scripts --- scispacy/umls_utils.py | 6 +- ...export_uml_json.py => export_umls_json.py} | 13 +++- scripts/train_linker.py | 66 ------------------- 3 files changed, 15 insertions(+), 70 deletions(-) rename scripts/{export_uml_json.py => export_umls_json.py} (92%) delete mode 100644 scripts/train_linker.py diff --git a/scispacy/umls_utils.py b/scispacy/umls_utils.py index 61ba732d..b5bde877 100644 --- a/scispacy/umls_utils.py +++ b/scispacy/umls_utils.py @@ -37,7 +37,7 @@ def read_umls_file_headers(meta_path: str, filename: str) -> List[str]: return None -def read_umls_concepts(meta_path: str, concept_details: Dict): +def read_umls_concepts(meta_path: str, concept_details: Dict, source: str = None): """ Read the concepts file MRCONSO.RRF from a UMLS release and store it in concept_details dictionary. Each concept is represented with @@ -65,6 +65,10 @@ def read_umls_concepts(meta_path: str, concept_details: Dict): if concept["LAT"] != "ENG" or concept["SUPPRESS"] != "N": continue # Keep English non-suppressed concepts only + if source is not None: + if concept["SAB"] != source: + continue + concept_id = concept["CUI"] if concept_id not in concept_details: # a new concept # add it to the dictionary with an empty list of aliases and types diff --git a/scripts/export_uml_json.py b/scripts/export_umls_json.py similarity index 92% rename from scripts/export_uml_json.py rename to scripts/export_umls_json.py index 80361fce..322744a6 100644 --- a/scripts/export_uml_json.py +++ b/scripts/export_umls_json.py @@ -7,7 +7,7 @@ import argparse from scispacy import umls_utils -def main(meta_path, output_path): +def main(meta_path: str, output_path: str, source: str = None): concept_details = {} # dictionary of concept_id -> { # 'concept_id': str, @@ -18,7 +18,7 @@ def main(meta_path, output_path): # } 
print('Reading concepts ... ') - umls_utils.read_umls_concepts(meta_path, concept_details) + umls_utils.read_umls_concepts(meta_path, concept_details, source) print('Reading types ... ') umls_utils.read_umls_types(meta_path, concept_details) @@ -79,6 +79,7 @@ def main(meta_path, output_path): print('DONE.') + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -89,5 +90,11 @@ def main(meta_path, output_path): '--output_path', help="Path to the output json file" ) + parser.add_argument( + '--source', + type=str, + default=None, + help="Path to the output json file" + ) args = parser.parse_args() - main(args.meta_path, args.output_path) + main(args.meta_path, args.output_path, args.source) diff --git a/scripts/train_linker.py b/scripts/train_linker.py deleted file mode 100644 index be61d79d..00000000 --- a/scripts/train_linker.py +++ /dev/null @@ -1,66 +0,0 @@ -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.metrics import accuracy_score, classification_report -from joblib import dump, load -import datetime -from scispacy import umls_semantic_type_tree -from linking import Linker -import argparse -from tqdm import tqdm -import json - -def read_file(filename, limit): - x = [] - y = [] - with open(filename) as f: - for line in tqdm(f, total=limit): - d = json.loads(line) - x.append(Linker.featurizer(d)) - y.append(d['label']) - if len(x) >= limit: - break - return x, y - -def main(data_path: str): - start_time = datetime.datetime.now() - - x_train, y_train = read_file(f'{data_path}/train.jsonl', 5000000) # the full set is unnecessarily large - x_dev, y_dev = read_file(f'{data_path}/dev.jsonl', 1) # the full set is unnecessarily large - x_test, y_test = read_file(f'{data_path}/test.jsonl', 5000000) - - # sklearn classifier already splits the training set into train and dev, so we don't need separate sets - x_train.extend(x_dev) - y_train.extend(y_dev) - - classifier = GradientBoostingClassifier(verbose=1) - - 
classifier.fit(x_train, y_train) - linking_classifier_path = f'{data_path}/linking_classifier.joblib' - dump(classifier, linking_classifier_path) - classifier = load(linking_classifier_path) - pred = classifier.predict(x_train) - accuracy = accuracy_score(y_train, pred) - report = classification_report(y_train, pred) - - print('Train+Dev results:') - print(accuracy) - print(report) - - pred = classifier.predict(x_test) - accuracy = accuracy_score(y_test, pred) - report = classification_report(y_test, pred) - print('Test results:') - print(accuracy) - print(report) - - end_time = datetime.datetime.now() - total_time = end_time - start_time - print(f'Time: {total_time.total_seconds()} seconds') - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - '--data_path', - help='Path to a directory with training set.' - ) - args = parser.parse_args() - main(args.data_path) From 47d7c2eb9a71034f19b17597de79053132192ac0 Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Tue, 7 Jul 2020 15:28:27 -0700 Subject: [PATCH 2/7] add linker creation script --- scripts/create_linker.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 scripts/create_linker.py diff --git a/scripts/create_linker.py b/scripts/create_linker.py new file mode 100644 index 00000000..e3e3cbef --- /dev/null +++ b/scripts/create_linker.py @@ -0,0 +1,27 @@ +import argparse +import os + +from scispacy.candidate_generation import create_tfidf_ann_index +from scispacy.linking_utils import KnowledgeBase + + +def main(kb_path: str, output_path: str): + + os.makedirs(output_path, exist_ok=True) + kb = KnowledgeBase(kb_path) + create_tfidf_ann_index(output_path, kb) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--kb_path', + help="Path to the KB file." + ) + parser.add_argument( + '--output_path', + help="Path to the output directory." 
+ ) + + args = parser.parse_args() + main(args.kb_path, args.output_path) From 93310207e017919e6691a73836022fc2e1880884 Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Tue, 7 Jul 2020 15:58:45 -0700 Subject: [PATCH 3/7] add new paths --- scispacy/candidate_generation.py | 46 +++++++++++++++++++++++++++----- scispacy/linking_utils.py | 28 +++++++++++++++++-- scripts/export_umls_json.py | 11 ++++---- 3 files changed, 72 insertions(+), 13 deletions(-) diff --git a/scispacy/candidate_generation.py b/scispacy/candidate_generation.py index 7b0a0df7..b6957024 100644 --- a/scispacy/candidate_generation.py +++ b/scispacy/candidate_generation.py @@ -11,7 +11,14 @@ from nmslib.dist import FloatIndex from scispacy.file_cache import cached_path -from scispacy.linking_utils import KnowledgeBase, UmlsKnowledgeBase, MeshKnowledgeBase +from scispacy.linking_utils import ( + KnowledgeBase, + UmlsKnowledgeBase, + Mesh, + GeneOntology, + RxNorm, + HumanPhenotypeOntology, +) class LinkerPaths(NamedTuple): @@ -34,10 +41,10 @@ class LinkerPaths(NamedTuple): UmlsLinkerPaths = LinkerPaths( - ann_index="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/nmslib_index.bin", - tfidf_vectorizer="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/tfidf_vectorizer.joblib", # noqa - tfidf_vectors="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/tfidf_vectors_sparse.npz", # noqa - concept_aliases_list="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/concept_aliases.json", # noqa + ann_index="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls/nmslib_index.bin", + tfidf_vectorizer="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls/tfidf_vectorizer.joblib", # noqa + tfidf_vectors="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls/tfidf_vectors_sparse.npz", # noqa + concept_aliases_list="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls/concept_aliases.json", # noqa ) 
MeshLinkerPaths = LinkerPaths( @@ -47,15 +54,42 @@ class LinkerPaths(NamedTuple): concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/mesh_linking_model/concept_aliases.json", # noqa ) +GeneOntologyLinkerPaths = LinkerPaths( + ann_index="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/gene_ontology/nmslib_index.bin", + tfidf_vectorizer="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/gene_ontology/tfidf_vectorizer.joblib", # noqa + tfidf_vectors="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/gene_ontology/tfidf_vectors_sparse.npz", # noqa + concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/gene_ontology/concept_aliases.json", # noqa +) + +HumanPhenotypeOntologyLinkerPaths = LinkerPaths( + ann_index="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/human_phenotype_ontology/nmslib_index.bin", + tfidf_vectorizer="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/human_phenotype_ontology/tfidf_vectorizer.joblib", # noqa + tfidf_vectors="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/human_phenotype_ontology/tfidf_vectors_sparse.npz", # noqa + concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/human_phenotype_ontology/concept_aliases.json", # noqa +) + +RxNormLinkerPaths = LinkerPaths( + ann_index="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/rxnorm/nmslib_index.bin", + tfidf_vectorizer="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/rxnorm/tfidf_vectorizer.joblib", # noqa + tfidf_vectors="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/rxnorm/tfidf_vectors_sparse.npz", # noqa + concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/rxnorm/concept_aliases.json", # noqa +) + DEFAULT_PATHS: Dict[str, LinkerPaths] = { "umls": UmlsLinkerPaths, "mesh": MeshLinkerPaths, + "go": GeneOntologyLinkerPaths, + "hpo": HumanPhenotypeOntologyLinkerPaths, + "rxnorm": RxNormLinkerPaths, } 
DEFAULT_KNOWLEDGE_BASES: Dict[str, Type[KnowledgeBase]] = { "umls": UmlsKnowledgeBase, - "mesh": MeshKnowledgeBase, + "mesh": Mesh, + "go": GeneOntology, + "hpo": HumanPhenotypeOntology, + "rxnorm": RxNorm, } diff --git a/scispacy/linking_utils.py b/scispacy/linking_utils.py index 947f18e6..6fa0eb24 100644 --- a/scispacy/linking_utils.py +++ b/scispacy/linking_utils.py @@ -37,7 +37,7 @@ def __repr__(self): DEFAULT_UMLS_PATH = ( - "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_2017_aa_cat0129.json" + "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_2020_aa_cat0129.jsonl" ) DEFAULT_UMLS_TYPES_PATH = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_semantic_type_tree.tsv" @@ -94,9 +94,33 @@ def __init__( ) -class MeshKnowledgeBase(KnowledgeBase): +class Mesh(KnowledgeBase): def __init__( self, file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/mesh_2020.jsonl", ): super().__init__(file_path) + + +class GeneOntology(KnowledgeBase): + def __init__( + self, + file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/umls_2020_gene_ontology.jsonl", + ): + super().__init__(file_path) + + +class HumanPhenotypeOntology(KnowledgeBase): + def __init__( + self, + file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/umls_2020_human_phenotype_ontology.jsonl", + ): + super().__init__(file_path) + + +class RxNorm(KnowledgeBase): + def __init__( + self, + file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/umls_2020_human_phenotype_ontology.jsonl", + ): + super().__init__(file_path) diff --git a/scripts/export_umls_json.py b/scripts/export_umls_json.py index 322744a6..b9cf01c3 100644 --- a/scripts/export_umls_json.py +++ b/scripts/export_umls_json.py @@ -1,6 +1,6 @@ """ -Convert a umls release to a json file of concepts. +Convert a umls release to a jsonl file of concepts. 
 """
 
 import json
@@ -73,10 +73,11 @@ def main(meta_path: str, output_path: str, source: str = None):
         if 'is_from_preferred_source' in concept:
             del concept['is_from_preferred_source']
 
-    print('Exporting to the a json file {} ...'.format(output_path))
+    print('Exporting to a jsonl file {} ...'.format(output_path))
 
     with open(output_path, 'w') as fout:
-        json.dump(list(concept_details.values()), fout)
+        for value in concept_details.values():
+            fout.write(json.dumps(value) + "\n")
 
     print('DONE.')
 
@@ -88,13 +89,13 @@ def main(meta_path: str, output_path: str, source: str = None):
     )
     parser.add_argument(
         '--output_path',
-        help="Path to the output json file"
+        help="Path to the output jsonl file"
    )
     parser.add_argument(
         '--source',
         type=str,
         default=None,
-        help="Path to the output json file"
+        help="Whether to filter for only a single UMLS source."
     )
     args = parser.parse_args()
     main(args.meta_path, args.output_path, args.source)

From d7412ba6404aa196813bbcda5d0912bcccff27c5 Mon Sep 17 00:00:00 2001
From: Mark Neumann
Date: Tue, 7 Jul 2020 16:14:48 -0700
Subject: [PATCH 4/7] update readme

---
 README.md | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index acae4fa8..7c8a98a2 100644
--- a/README.md
+++ b/README.md
@@ -127,10 +127,17 @@ for abrv in doc._.abbreviations:
 ```
 
 ### EntityLinker
 
-The `EntityLinker` is a SpaCy component which performs linking to a knowledge base. Currently, the
-Unified Medical Language System and the Medical Subject Headings (MESH) are supported.
-The linker simply performs a string overlap search on named entities,
-comparing them with a knowledge base of 2.7 million concepts using an approximate nearest neighbours search.
+The `EntityLinker` is a SpaCy component which performs linking to a knowledge base. The linker simply performs
+a string overlap based search on named entities, comparing them with a knowledge base of 2.7 million concepts
+using an approximate nearest neighbours search.
+ +Currently (v2.5.0), there are 5 supported linkers: + +- `umls`: Links to the Unified Medical Language System, levels 0,1,2 and 9. This has ~3M concepts. +- `mesh`: Links to the Medical Subject Headings. This contains a smaller set of higher quality entities, which are used for indexing in Pubmed. MeSH contains ~30k entities. +- `rxnorm`: Links to the RxNorm ontology. RxNorm contains ~100k concepts focused on normalized names for clinical drugs. It is comprised of several other drug vocabularies commonly used in pharmacy management and drug interaction, including First Databank, Micromedex, and the Gold Standard Drug Database. +- `go`: Links to the Gene Ontology. The Gene Ontology contains ~67k concepts focused on the functions of genes. +- `hpo`: Links to the Human Phenotype ontology. The Human Phenotype Ontology contains 16k concepts focused on phenotypic abnormalities encountered in human disease. You may want to play around with some of the parameters below to adapt to your use case (higher precision, higher recall etc). 

From 6ddd910d2a95db45f2b8f3527821b21423b5944c Mon Sep 17 00:00:00 2001
From: Mark Neumann
Date: Tue, 7 Jul 2020 16:15:57 -0700
Subject: [PATCH 5/7] update readme, fix lint

---
 scispacy/linking_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scispacy/linking_utils.py b/scispacy/linking_utils.py
index 6fa0eb24..ecb964a0 100644
--- a/scispacy/linking_utils.py
+++ b/scispacy/linking_utils.py
@@ -113,7 +113,7 @@ def __init__(
 class HumanPhenotypeOntology(KnowledgeBase):
     def __init__(
         self,
-        file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/umls_2020_human_phenotype_ontology.jsonl",
+        file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/umls_2020_human_phenotype_ontology.jsonl",  # noqa
     ):
         super().__init__(file_path)
 
@@ -121,6 +121,6 @@ def __init__(
 class RxNorm(KnowledgeBase):
     def __init__(
         self,
-        file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/umls_2020_human_phenotype_ontology.jsonl",
+        file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/umls_2020_rxnorm.jsonl",  # noqa
     ):
         super().__init__(file_path)

From 231f98266ac63ddd514b83937f3c008f2052d3ad Mon Sep 17 00:00:00 2001
From: Mark Neumann
Date: Wed, 8 Jul 2020 08:33:45 -0700
Subject: [PATCH 6/7] add links, update readme

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 7c8a98a2..afc8b3b3 100644
--- a/README.md
+++ b/README.md
@@ -128,16 +128,16 @@ for abrv in doc._.abbreviations:
 ### EntityLinker
 
 The `EntityLinker` is a SpaCy component which performs linking to a knowledge base. The linker simply performs
-a string overlap based search on named entities, comparing them with a knowledge base of 2.7 million concepts
+a string overlap-based search (char-3grams) on named entities, comparing them with the concepts in a knowledge base
 using an approximate nearest neighbours search.
 
 Currently (v2.5.0), there are 5 supported linkers:
 
-- `umls`: Links to the Unified Medical Language System, levels 0,1,2 and 9. This has ~3M concepts.
-- `mesh`: Links to the Medical Subject Headings. This contains a smaller set of higher quality entities, which are used for indexing in Pubmed. MeSH contains ~30k entities.
-- `rxnorm`: Links to the RxNorm ontology. RxNorm contains ~100k concepts focused on normalized names for clinical drugs. It is comprised of several other drug vocabularies commonly used in pharmacy management and drug interaction, including First Databank, Micromedex, and the Gold Standard Drug Database.
-- `go`: Links to the Gene Ontology. The Gene Ontology contains ~67k concepts focused on the functions of genes.
-- `hpo`: Links to the Human Phenotype ontology. The Human Phenotype Ontology contains 16k concepts focused on phenotypic abnormalities encountered in human disease.
+- `umls`: Links to the [Unified Medical Language System](https://www.nlm.nih.gov/research/umls/index.html), levels 0,1,2 and 9. This has ~3M concepts.
+- `mesh`: Links to the [Medical Subject Headings](https://www.nlm.nih.gov/mesh/meshhome.html). This contains a smaller set of higher quality entities, which are used for indexing in Pubmed. MeSH contains ~30k entities. NOTE: The MeSH KB is derived directly from MeSH itself, and as such uses different unique identifiers than the other KBs.
+- `rxnorm`: Links to the [RxNorm](https://www.nlm.nih.gov/research/umls/rxnorm/index.html) ontology. RxNorm contains ~100k concepts focused on normalized names for clinical drugs. It is comprised of several other drug vocabularies commonly used in pharmacy management and drug interaction, including First Databank, Micromedex, and the Gold Standard Drug Database.
+- `go`: Links to the [Gene Ontology](http://geneontology.org/). The Gene Ontology contains ~67k concepts focused on the functions of genes.
+- `hpo`: Links to the [Human Phenotype Ontology](https://hpo.jax.org/app/). 
The Human Phenotype Ontology contains 16k concepts focused on phenotypic abnormalities encountered in human disease. You may want to play around with some of the parameters below to adapt to your use case (higher precision, higher recall etc). From 60da6056f5481119048577164a907bdb361ef5e0 Mon Sep 17 00:00:00 2001 From: Mark Neumann Date: Wed, 8 Jul 2020 08:37:06 -0700 Subject: [PATCH 7/7] add docstring --- scispacy/umls_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scispacy/umls_utils.py b/scispacy/umls_utils.py index b5bde877..73a448f0 100644 --- a/scispacy/umls_utils.py +++ b/scispacy/umls_utils.py @@ -54,6 +54,8 @@ def read_umls_concepts(meta_path: str, concept_details: Dict, source: str = None Args: meta_path: path to the META directory of an UMLS release concept_details: a dictionary to be filled with concept informations + source: An optional source identifier, used as a filter to extract only a + specific source from UMLS. """ concepts_filename = "MRCONSO.RRF" headers = read_umls_file_headers(meta_path, concepts_filename)