diff --git a/README.md b/README.md index 63cc2d77..653eb143 100644 --- a/README.md +++ b/README.md @@ -127,10 +127,17 @@ for abrv in doc._.abbreviations: ``` ### EntityLinker -The `EntityLinker` is a SpaCy component which performs linking to a knowledge base. Currently, the -Unified Medical Language System and the Medical Subject Headings (MESH) are supported. -The linker simply performs a string overlap search on named entities, -comparing them with a knowledge base of 2.7 million concepts using an approximate nearest neighbours search. +The `EntityLinker` is a SpaCy component which performs linking to a knowledge base. The linker simply performs +a string-overlap-based search (char-3grams) on named entities, comparing them with the concepts in a knowledge base +using an approximate nearest neighbours search. + +Currently (v0.2.5), there are 5 supported linkers: + +- `umls`: Links to the [Unified Medical Language System](https://www.nlm.nih.gov/research/umls/index.html), levels 0, 1, 2 and 9. This has ~3M concepts. +- `mesh`: Links to the [Medical Subject Headings](https://www.nlm.nih.gov/mesh/meshhome.html). This contains a smaller set of higher-quality entities, which are used for indexing in PubMed. MeSH contains ~30k entities. NOTE: The MeSH KB is derived directly from MeSH itself, and as such uses different unique identifiers than the other KBs. +- `rxnorm`: Links to the [RxNorm](https://www.nlm.nih.gov/research/umls/rxnorm/index.html) ontology. RxNorm contains ~100k concepts focused on normalized names for clinical drugs. It combines several drug vocabularies commonly used in pharmacy management and drug interaction software, including First Databank, Micromedex, and the Gold Standard Drug Database. +- `go`: Links to the [Gene Ontology](http://geneontology.org/). The Gene Ontology contains ~67k concepts focused on the functions of genes. +- `hpo`: Links to the [Human Phenotype Ontology](https://hpo.jax.org/app/). The Human Phenotype Ontology contains ~16k concepts focused on phenotypic abnormalities encountered in human disease. You may want to play around with some of the parameters below to adapt to your use case (higher precision, higher recall etc). 
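To make the knowledge-base choice concrete, here is a rough usage sketch. It assumes the spaCy 2.x `nlp.add_pipe` API, the `en_core_sci_sm` model, and that `EntityLinker` accepts `resolve_abbreviations` and a `name` argument naming one of the linkers above; treat it as an illustration and defer to the parameter documentation below for the exact interface.

```python
import spacy
from scispacy.linking import EntityLinker  # import path assumed

nlp = spacy.load("en_core_sci_sm")  # any scispacy model with an NER component

# "name" selects the knowledge base: "umls", "mesh", "rxnorm", "go" or "hpo".
linker = EntityLinker(resolve_abbreviations=True, name="rxnorm")
nlp.add_pipe(linker)

doc = nlp("The patient was given 50mg of atenolol.")
for entity in doc.ents:
    # Candidate concepts and their scores are attached to each entity via a
    # spaCy extension attribute; see the documentation below for the exact
    # attribute name and for how to look the candidates up in the KB.
    print(entity.text)
```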
diff --git a/scispacy/candidate_generation.py b/scispacy/candidate_generation.py index 7b0a0df7..b6957024 100644 --- a/scispacy/candidate_generation.py +++ b/scispacy/candidate_generation.py @@ -11,7 +11,14 @@ from nmslib.dist import FloatIndex from scispacy.file_cache import cached_path -from scispacy.linking_utils import KnowledgeBase, UmlsKnowledgeBase, MeshKnowledgeBase +from scispacy.linking_utils import ( + KnowledgeBase, + UmlsKnowledgeBase, + Mesh, + GeneOntology, + RxNorm, + HumanPhenotypeOntology, +) class LinkerPaths(NamedTuple): @@ -34,10 +41,10 @@ class LinkerPaths(NamedTuple): UmlsLinkerPaths = LinkerPaths( - ann_index="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/nmslib_index.bin", - tfidf_vectorizer="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/tfidf_vectorizer.joblib", # noqa - tfidf_vectors="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/tfidf_vectors_sparse.npz", # noqa - concept_aliases_list="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/concept_aliases.json", # noqa + ann_index="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls/nmslib_index.bin", + tfidf_vectorizer="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls/tfidf_vectorizer.joblib", # noqa + tfidf_vectors="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls/tfidf_vectors_sparse.npz", # noqa + concept_aliases_list="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls/concept_aliases.json", # noqa ) MeshLinkerPaths = LinkerPaths( @@ -47,15 +54,42 @@ class LinkerPaths(NamedTuple): concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/mesh_linking_model/concept_aliases.json", # noqa ) +GeneOntologyLinkerPaths = LinkerPaths( + ann_index="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/gene_ontology/nmslib_index.bin", + tfidf_vectorizer="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/gene_ontology/tfidf_vectorizer.joblib", # noqa + tfidf_vectors="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/gene_ontology/tfidf_vectors_sparse.npz", # noqa + concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/gene_ontology/concept_aliases.json", # noqa +) + +HumanPhenotypeOntologyLinkerPaths = LinkerPaths( + ann_index="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/human_phenotype_ontology/nmslib_index.bin", + tfidf_vectorizer="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/human_phenotype_ontology/tfidf_vectorizer.joblib", # noqa + tfidf_vectors="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/human_phenotype_ontology/tfidf_vectors_sparse.npz", # noqa + concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/human_phenotype_ontology/concept_aliases.json", # noqa +) + +RxNormLinkerPaths = LinkerPaths( + ann_index="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/rxnorm/nmslib_index.bin", + tfidf_vectorizer="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/rxnorm/tfidf_vectorizer.joblib", # noqa + tfidf_vectors="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/rxnorm/tfidf_vectors_sparse.npz", # noqa + concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/rxnorm/concept_aliases.json", # noqa +) + DEFAULT_PATHS: Dict[str, LinkerPaths] = { "umls": UmlsLinkerPaths, "mesh": MeshLinkerPaths, + "go": GeneOntologyLinkerPaths, + "hpo": HumanPhenotypeOntologyLinkerPaths, + "rxnorm": RxNormLinkerPaths, } DEFAULT_KNOWLEDGE_BASES: Dict[str, 
Type[KnowledgeBase]] = { "umls": UmlsKnowledgeBase, - "mesh": MeshKnowledgeBase, + "mesh": Mesh, + "go": GeneOntology, + "hpo": HumanPhenotypeOntology, + "rxnorm": RxNorm, } diff --git a/scispacy/linking_utils.py b/scispacy/linking_utils.py index 947f18e6..ecb964a0 100644 --- a/scispacy/linking_utils.py +++ b/scispacy/linking_utils.py @@ -37,7 +37,7 @@ def __repr__(self): DEFAULT_UMLS_PATH = ( - "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_2017_aa_cat0129.json" + "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_2020_aa_cat0129.jsonl" ) DEFAULT_UMLS_TYPES_PATH = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_semantic_type_tree.tsv" @@ -94,9 +94,33 @@ def __init__( ) -class MeshKnowledgeBase(KnowledgeBase): +class Mesh(KnowledgeBase): def __init__( self, file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/mesh_2020.jsonl", ): super().__init__(file_path) + + +class GeneOntology(KnowledgeBase): + def __init__( + self, + file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/umls_2020_gene_ontology.jsonl", + ): + super().__init__(file_path) + + +class HumanPhenotypeOntology(KnowledgeBase): + def __init__( + self, + file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/umls_2020_human_phenotype_ontology.jsonl", # noqa + ): + super().__init__(file_path) + + +class RxNorm(KnowledgeBase): + def __init__( + self, + file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/umls_2020_rxnorm.jsonl", # noqa + ): + super().__init__(file_path) diff --git a/scispacy/umls_utils.py b/scispacy/umls_utils.py index 61ba732d..73a448f0 100644 --- a/scispacy/umls_utils.py +++ b/scispacy/umls_utils.py @@ -37,7 +37,7 @@ def read_umls_file_headers(meta_path: str, filename: str) -> List[str]: return None -def read_umls_concepts(meta_path: str, concept_details: Dict): +def read_umls_concepts(meta_path: str, concept_details: Dict, source: str = None): """ Read the concepts file MRCONSO.RRF from a UMLS release and store it in concept_details dictionary. Each concept is represented with @@ -54,6 +54,8 @@ def read_umls_concepts(meta_path: str, concept_details: Dict): Args: meta_path: path to the META directory of an UMLS release concept_details: a dictionary to be filled with concept informations + source: An optional source identifier, used as a filter to extract only a + specific source from UMLS. 
""" concepts_filename = "MRCONSO.RRF" headers = read_umls_file_headers(meta_path, concepts_filename) @@ -65,6 +67,10 @@ def read_umls_concepts(meta_path: str, concept_details: Dict): if concept["LAT"] != "ENG" or concept["SUPPRESS"] != "N": continue # Keep English non-suppressed concepts only + if source is not None: + if concept["SAB"] != source: + continue + concept_id = concept["CUI"] if concept_id not in concept_details: # a new concept # add it to the dictionary with an empty list of aliases and types diff --git a/scripts/create_linker.py b/scripts/create_linker.py new file mode 100644 index 00000000..e3e3cbef --- /dev/null +++ b/scripts/create_linker.py @@ -0,0 +1,27 @@ +import argparse +import os + +from scispacy.candidate_generation import create_tfidf_ann_index +from scispacy.linking_utils import KnowledgeBase + + +def main(kb_path: str, output_path: str): + + os.makedirs(output_path, exist_ok=True) + kb = KnowledgeBase(kb_path) + create_tfidf_ann_index(output_path, kb) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--kb_path', + help="Path to the KB file." + ) + parser.add_argument( + '--output_path', + help="Path to the output directory." + ) + + args = parser.parse_args() + main(args.kb_path, args.output_path) diff --git a/scripts/export_uml_json.py b/scripts/export_umls_json.py similarity index 85% rename from scripts/export_uml_json.py rename to scripts/export_umls_json.py index 80361fce..b9cf01c3 100644 --- a/scripts/export_uml_json.py +++ b/scripts/export_umls_json.py @@ -1,13 +1,13 @@ """ -Convert a umls release to a json file of concepts. +Convert a umls release to a jsonl file of concepts. """ import json import argparse from scispacy import umls_utils -def main(meta_path, output_path): +def main(meta_path: str, output_path: str, source: str = None): concept_details = {} # dictionary of concept_id -> { # 'concept_id': str, @@ -18,7 +18,7 @@ def main(meta_path, output_path): # } print('Reading concepts ... ') - umls_utils.read_umls_concepts(meta_path, concept_details) + umls_utils.read_umls_concepts(meta_path, concept_details, source) print('Reading types ... ') umls_utils.read_umls_types(meta_path, concept_details) @@ -73,12 +73,14 @@ def main(meta_path, output_path): if 'is_from_preferred_source' in concept: del concept['is_from_preferred_source'] - print('Exporting to the a json file {} ...'.format(output_path)) + print('Exporting to the a jsonl file {} ...'.format(output_path)) with open(output_path, 'w') as fout: - json.dump(list(concept_details.values()), fout) + for value in concept_details.values(): + fout.write(json.dumps(value) + "\n") print('DONE.') + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -87,7 +89,13 @@ def main(meta_path, output_path): ) parser.add_argument( '--output_path', - help="Path to the output json file" + help="Path to the output jsonl file" + ) + parser.add_argument( + '--source', + type=str, + default=None, + help="Whether to filter for a only a single UMLS source." 
) args = parser.parse_args() - main(args.meta_path, args.output_path) + main(args.meta_path, args.output_path, args.source) diff --git a/scripts/train_linker.py b/scripts/train_linker.py deleted file mode 100644 index be61d79d..00000000 --- a/scripts/train_linker.py +++ /dev/null @@ -1,66 +0,0 @@ -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.metrics import accuracy_score, classification_report -from joblib import dump, load -import datetime -from scispacy import umls_semantic_type_tree -from linking import Linker -import argparse -from tqdm import tqdm -import json - -def read_file(filename, limit): - x = [] - y = [] - with open(filename) as f: - for line in tqdm(f, total=limit): - d = json.loads(line) - x.append(Linker.featurizer(d)) - y.append(d['label']) - if len(x) >= limit: - break - return x, y - -def main(data_path: str): - start_time = datetime.datetime.now() - - x_train, y_train = read_file(f'{data_path}/train.jsonl', 5000000) # the full set is unnecessarily large - x_dev, y_dev = read_file(f'{data_path}/dev.jsonl', 1) # the full set is unnecessarily large - x_test, y_test = read_file(f'{data_path}/test.jsonl', 5000000) - - # sklearn classifier already splits the training set into train and dev, so we don't need separate sets - x_train.extend(x_dev) - y_train.extend(y_dev) - - classifier = GradientBoostingClassifier(verbose=1) - - classifier.fit(x_train, y_train) - linking_classifier_path = f'{data_path}/linking_classifier.joblib' - dump(classifier, linking_classifier_path) - classifier = load(linking_classifier_path) - pred = classifier.predict(x_train) - accuracy = accuracy_score(y_train, pred) - report = classification_report(y_train, pred) - - print('Train+Dev results:') - print(accuracy) - print(report) - - pred = classifier.predict(x_test) - accuracy = accuracy_score(y_test, pred) - report = classification_report(y_test, pred) - print('Test results:') - print(accuracy) - print(report) - - end_time = datetime.datetime.now() - total_time = end_time - start_time - print(f'Time: {total_time.total_seconds()} seconds') - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - '--data_path', - help='Path to a directory with training set.' - ) - args = parser.parse_args() - main(args.data_path)
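For context on how the new `export_umls_json.py` and `create_linker.py` scripts fit together, below is a simplified Python sketch of the export-then-index workflow they enable. The META path and the `RXNORM` source abbreviation are illustrative, and the sketch skips the extra concept cleanup the real export script performs before writing, so prefer the CLI scripts for actual use.

```python
import json
import os

from scispacy import umls_utils
from scispacy.candidate_generation import create_tfidf_ann_index
from scispacy.linking_utils import KnowledgeBase

meta_path = "/path/to/umls/2020AA/META"  # hypothetical UMLS release location
concept_details = {}

# Keep only concepts whose source vocabulary (the SAB column) is RXNORM.
umls_utils.read_umls_concepts(meta_path, concept_details, source="RXNORM")
umls_utils.read_umls_types(meta_path, concept_details)

# Write one concept per line, as export_umls_json.py now does.
with open("rxnorm.jsonl", "w") as fout:
    for value in concept_details.values():
        fout.write(json.dumps(value) + "\n")

# Build the char-3gram tf-idf vectorizer and nmslib ANN index for the new KB,
# mirroring scripts/create_linker.py.
os.makedirs("rxnorm_linker", exist_ok=True)
create_tfidf_ann_index("rxnorm_linker", KnowledgeBase("rxnorm.jsonl"))
```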