Skip to content

Commit

Permalink
Merge pull request #246 from DeNeutoy/new-linkers
Browse files Browse the repository at this point in the history
New linkers
  • Loading branch information
DeNeutoy authored Jul 8, 2020
2 parents e3e9f0f + 60da605 commit 8994934
Show file tree
Hide file tree
Showing 7 changed files with 126 additions and 86 deletions.
15 changes: 11 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,10 +127,17 @@ for abrv in doc._.abbreviations:
```
### EntityLinker
The `EntityLinker` is a SpaCy component which performs linking to a knowledge base. Currently, the
Unified Medical Language System and the Medical Subject Headings (MESH) are supported.
The linker simply performs a string overlap search on named entities,
comparing them with a knowledge base of 2.7 million concepts using an approximate nearest neighbours search.
The `EntityLinker` is a SpaCy component which performs linking to a knowledge base. The linker simply performs
a string overlap-based search (char-3grams) on named entities, comparing them with the concepts in a knowledge base
using an approximate nearest neighbours search.
Currently (v2.5.0), there are 5 supported linkers:
- `umls`: Links to the [Unified Medical Language System](https://www.nlm.nih.gov/research/umls/index.html), levels 0,1,2 and 9. This has ~3M concepts.
- `mesh`: Links to the [Medical Subject Headings](https://www.nlm.nih.gov/mesh/meshhome.html). This contains a smaller set of higher quality entities, which are used for indexing in Pubmed. MeSH contains ~30k entities. NOTE: The MeSH KB is derived directly from MeSH itself, and as such uses different unique identifiers than the other KBs.
- `rxnorm`: Links to the [RxNorm](https://www.nlm.nih.gov/research/umls/rxnorm/index.html) ontology. RxNorm contains ~100k concepts focused on normalized names for clinical drugs. It comprises several other drug vocabularies commonly used in pharmacy management and drug interaction, including First Databank, Micromedex, and the Gold Standard Drug Database.
- `go`: Links to the [Gene Ontology](http://geneontology.org/). The Gene Ontology contains ~67k concepts focused on the functions of genes.
- `hpo`: Links to the [Human Phenotype Ontology](https://hpo.jax.org/app/). The Human Phenotype Ontology contains 16k concepts focused on phenotypic abnormalities encountered in human disease.
You may want to play around with some of the parameters
below to adapt to your use case (higher precision, higher recall etc).
Expand Down
46 changes: 40 additions & 6 deletions scispacy/candidate_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,14 @@
from nmslib.dist import FloatIndex

from scispacy.file_cache import cached_path
from scispacy.linking_utils import KnowledgeBase, UmlsKnowledgeBase, MeshKnowledgeBase
from scispacy.linking_utils import (
KnowledgeBase,
UmlsKnowledgeBase,
Mesh,
GeneOntology,
RxNorm,
HumanPhenotypeOntology,
)


class LinkerPaths(NamedTuple):
Expand All @@ -34,10 +41,10 @@ class LinkerPaths(NamedTuple):


UmlsLinkerPaths = LinkerPaths(
ann_index="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/nmslib_index.bin",
tfidf_vectorizer="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/tfidf_vectorizer.joblib", # noqa
tfidf_vectors="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/tfidf_vectors_sparse.npz", # noqa
concept_aliases_list="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linking_model/concept_aliases.json", # noqa
ann_index="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls/nmslib_index.bin",
tfidf_vectorizer="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls/tfidf_vectorizer.joblib", # noqa
tfidf_vectors="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls/tfidf_vectors_sparse.npz", # noqa
concept_aliases_list="https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls/concept_aliases.json", # noqa
)

MeshLinkerPaths = LinkerPaths(
Expand All @@ -47,15 +54,42 @@ class LinkerPaths(NamedTuple):
concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/mesh_linking_model/concept_aliases.json", # noqa
)

# Default file locations for the Gene Ontology linker, one URL per
# LinkerPaths field. Registered under the "go" key of DEFAULT_PATHS.
GeneOntologyLinkerPaths = LinkerPaths(
ann_index="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/gene_ontology/nmslib_index.bin",
tfidf_vectorizer="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/gene_ontology/tfidf_vectorizer.joblib", # noqa
tfidf_vectors="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/gene_ontology/tfidf_vectors_sparse.npz", # noqa
concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/gene_ontology/concept_aliases.json", # noqa
)

# Default file locations for the Human Phenotype Ontology linker, one URL per
# LinkerPaths field. Registered under the "hpo" key of DEFAULT_PATHS.
HumanPhenotypeOntologyLinkerPaths = LinkerPaths(
ann_index="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/human_phenotype_ontology/nmslib_index.bin",
tfidf_vectorizer="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/human_phenotype_ontology/tfidf_vectorizer.joblib", # noqa
tfidf_vectors="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/human_phenotype_ontology/tfidf_vectors_sparse.npz", # noqa
concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/human_phenotype_ontology/concept_aliases.json", # noqa
)

# Default file locations for the RxNorm linker, one URL per LinkerPaths
# field. Registered under the "rxnorm" key of DEFAULT_PATHS.
RxNormLinkerPaths = LinkerPaths(
ann_index="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/rxnorm/nmslib_index.bin",
tfidf_vectorizer="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/rxnorm/tfidf_vectorizer.joblib", # noqa
tfidf_vectors="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/rxnorm/tfidf_vectors_sparse.npz", # noqa
concept_aliases_list="https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/rxnorm/concept_aliases.json", # noqa
)


# Maps each supported linker name to the LinkerPaths instance holding that
# linker's default file locations.
DEFAULT_PATHS: Dict[str, LinkerPaths] = {
"umls": UmlsLinkerPaths,
"mesh": MeshLinkerPaths,
"go": GeneOntologyLinkerPaths,
"hpo": HumanPhenotypeOntologyLinkerPaths,
"rxnorm": RxNormLinkerPaths,
}

DEFAULT_KNOWLEDGE_BASES: Dict[str, Type[KnowledgeBase]] = {
"umls": UmlsKnowledgeBase,
"mesh": MeshKnowledgeBase,
"mesh": Mesh,
"go": GeneOntology,
"hpo": HumanPhenotypeOntology,
"rxnorm": RxNorm,
}


Expand Down
28 changes: 26 additions & 2 deletions scispacy/linking_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def __repr__(self):


DEFAULT_UMLS_PATH = (
"https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_2017_aa_cat0129.json"
"https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_2020_aa_cat0129.jsonl"
)
DEFAULT_UMLS_TYPES_PATH = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_semantic_type_tree.tsv"

Expand Down Expand Up @@ -94,9 +94,33 @@ def __init__(
)


class MeshKnowledgeBase(KnowledgeBase):
class Mesh(KnowledgeBase):
    """KnowledgeBase whose default source is the MeSH 2020 jsonl file."""

    def __init__(
        self,
        file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/mesh_2020.jsonl",
    ):
        # This subclass only supplies the default path; all other work is
        # delegated to the parent constructor.
        super().__init__(file_path)


class GeneOntology(KnowledgeBase):
    """KnowledgeBase whose default source is the Gene Ontology jsonl file."""

    def __init__(
        self,
        file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/umls_2020_gene_ontology.jsonl",
    ):
        # This subclass only supplies the default path; all other work is
        # delegated to the parent constructor.
        super().__init__(file_path)


class HumanPhenotypeOntology(KnowledgeBase):
    """KnowledgeBase whose default source is the Human Phenotype Ontology jsonl file."""

    def __init__(
        self,
        file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/umls_2020_human_phenotype_ontology.jsonl",  # noqa
    ):
        # This subclass only supplies the default path; all other work is
        # delegated to the parent constructor.
        super().__init__(file_path)


class RxNorm(KnowledgeBase):
    """KnowledgeBase whose default source is the RxNorm jsonl file.

    Args:
        file_path: Location of the RxNorm concepts file. Callers may pass
            their own path to override the default.
    """

    def __init__(
        self,
        # The default must point at the RxNorm export, not the Human Phenotype
        # Ontology one; otherwise the "rxnorm" linker index is paired with
        # the wrong concepts.
        # NOTE(review): filename follows the sibling `umls_2020_<source>.jsonl`
        # pattern; confirm the object exists in the bucket.
        file_path: str = "https://ai2-s2-scispacy.s3-us-west-2.amazonaws.com/data/umls_2020_rxnorm.jsonl",  # noqa
    ):
        # This subclass only supplies the default path; all other work is
        # delegated to the parent constructor.
        super().__init__(file_path)
8 changes: 7 additions & 1 deletion scispacy/umls_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def read_umls_file_headers(meta_path: str, filename: str) -> List[str]:
return None


def read_umls_concepts(meta_path: str, concept_details: Dict):
def read_umls_concepts(meta_path: str, concept_details: Dict, source: str = None):
"""
Read the concepts file MRCONSO.RRF from a UMLS release and store it in
concept_details dictionary. Each concept is represented with
Expand All @@ -54,6 +54,8 @@ def read_umls_concepts(meta_path: str, concept_details: Dict):
Args:
meta_path: path to the META directory of an UMLS release
concept_details: a dictionary to be filled with concept informations
source: An optional source identifier, used as a filter to extract only a
specific source from UMLS.
"""
concepts_filename = "MRCONSO.RRF"
headers = read_umls_file_headers(meta_path, concepts_filename)
Expand All @@ -65,6 +67,10 @@ def read_umls_concepts(meta_path: str, concept_details: Dict):
if concept["LAT"] != "ENG" or concept["SUPPRESS"] != "N":
continue # Keep English non-suppressed concepts only

if source is not None:
if concept["SAB"] != source:
continue

concept_id = concept["CUI"]
if concept_id not in concept_details: # a new concept
# add it to the dictionary with an empty list of aliases and types
Expand Down
27 changes: 27 additions & 0 deletions scripts/create_linker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import argparse
import os

from scispacy.candidate_generation import create_tfidf_ann_index
from scispacy.linking_utils import KnowledgeBase


def main(kb_path: str, output_path: str):
    """Build linker index files for the knowledge base at `kb_path`.

    Args:
        kb_path: Path to the KB file.
        output_path: Path to the output directory; created if it is missing.
    """
    # Make sure the destination exists before anything is written into it.
    os.makedirs(output_path, exist_ok=True)

    knowledge_base = KnowledgeBase(kb_path)
    create_tfidf_ann_index(output_path, knowledge_base)


if __name__ == "__main__":
    # Command-line entry point: parse the two paths and hand them to main().
    parser = argparse.ArgumentParser()
    # Both options are mandatory. Without `required=True` a missing option
    # reaches main() as None and fails there, instead of producing a clear
    # usage message from argparse.
    parser.add_argument(
        '--kb_path',
        required=True,
        help="Path to the KB file."
    )
    parser.add_argument(
        '--output_path',
        required=True,
        help="Path to the output directory."
    )

    args = parser.parse_args()
    main(args.kb_path, args.output_path)
22 changes: 15 additions & 7 deletions scripts/export_uml_json.py → scripts/export_umls_json.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
"""
Convert a umls release to a json file of concepts.
Convert a umls release to a jsonl file of concepts.
"""
import json
import argparse
from scispacy import umls_utils

def main(meta_path, output_path):
def main(meta_path: str, output_path: str, source: str = None):

concept_details = {} # dictionary of concept_id -> {
# 'concept_id': str,
Expand All @@ -18,7 +18,7 @@ def main(meta_path, output_path):
# }

print('Reading concepts ... ')
umls_utils.read_umls_concepts(meta_path, concept_details)
umls_utils.read_umls_concepts(meta_path, concept_details, source)

print('Reading types ... ')
umls_utils.read_umls_types(meta_path, concept_details)
Expand Down Expand Up @@ -73,12 +73,14 @@ def main(meta_path, output_path):
if 'is_from_preferred_source' in concept:
del concept['is_from_preferred_source']

print('Exporting to the a json file {} ...'.format(output_path))
print('Exporting to the a jsonl file {} ...'.format(output_path))
with open(output_path, 'w') as fout:
json.dump(list(concept_details.values()), fout)

for value in concept_details.values():
fout.write(json.dumps(value) + "\n")
print('DONE.')


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
Expand All @@ -87,7 +89,13 @@ def main(meta_path, output_path):
)
parser.add_argument(
'--output_path',
help="Path to the output json file"
help="Path to the output jsonl file"
)
parser.add_argument(
'--source',
type=str,
default=None,
help="Whether to filter for a only a single UMLS source."
)
args = parser.parse_args()
main(args.meta_path, args.output_path)
main(args.meta_path, args.output_path, args.source)
66 changes: 0 additions & 66 deletions scripts/train_linker.py

This file was deleted.

0 comments on commit 8994934

Please sign in to comment.