Skip to content

Commit

Permalink
Merge pull request #478 from nachollorca/multilang
Browse files Browse the repository at this point in the history
Support UMLS filtering by language (Solves #477)
  • Loading branch information
dakinggg authored May 16, 2023
2 parents 80920ef + a4a9141 commit a5276f1
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 7 deletions.
14 changes: 11 additions & 3 deletions scispacy/umls_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,11 @@ def read_umls_file_headers(meta_path: str, filename: str) -> List[str]:


def read_umls_concepts(
meta_path: str, concept_details: Dict, source: Optional[str] = None
meta_path: str,
concept_details: Dict,
source: Optional[str] = None,
lang: str = "ENG",
non_suppressed: bool = True,
):
"""
Read the concepts file MRCONSO.RRF from a UMLS release and store it in
Expand All @@ -58,6 +62,8 @@ def read_umls_concepts(
concept_details: a dictionary to be filled with concept information
source: An optional source identifier, used as a filter to extract only a
specific source from UMLS.
lang: An optional language identifier matched against the LAT column (default "ENG"); pass None to keep terms in all languages
non_suppressed: flag to indicate whether only non-suppressed concepts should be kept
"""
concepts_filename = "MRCONSO.RRF"
headers = read_umls_file_headers(meta_path, concepts_filename)
Expand All @@ -66,8 +72,10 @@ def read_umls_concepts(
splits = line.strip().split("|")
assert len(headers) == len(splits), (headers, splits)
concept = dict(zip(headers, splits))
if concept["LAT"] != "ENG" or concept["SUPPRESS"] != "N":
continue # Keep English non-suppressed concepts only
if (lang is not None and concept["LAT"] != lang) or (
non_suppressed and concept["SUPPRESS"] != "N"
):
continue # Keep non-suppressed concepts in target language only

if source is not None:
if concept["SAB"] != source:
Expand Down
18 changes: 14 additions & 4 deletions scripts/export_umls_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import argparse
from scispacy import umls_utils

def main(meta_path: str, output_path: str, source: str = None):
def main(meta_path: str, output_path: str, lang: str = None, source: str = None):

concept_details = {} # dictionary of concept_id -> {
# 'concept_id': str,
Expand All @@ -18,7 +18,7 @@ def main(meta_path: str, output_path: str, source: str = None):
# }

print('Reading concepts ... ')
umls_utils.read_umls_concepts(meta_path, concept_details, source)
umls_utils.read_umls_concepts(meta_path, concept_details, lang, source)

print('Reading types ... ')
umls_utils.read_umls_types(meta_path, concept_details)
Expand Down Expand Up @@ -93,13 +93,23 @@ def main(meta_path: str, output_path: str, source: str = None):
)
parser.add_argument(
'--output_path',
help="Path to the output jsonl file"
help="Path to the output jsonl file."
)
parser.add_argument(
'--lang',
default="ENG",
help="Language subset of UMLS."
)
parser.add_argument(
'--source',
type=str,
default=None,
help="Whether to filter for a only a single UMLS source."
)
parser.add_argument(
'--non_suppressed',
default=True,
help="Whether to include non supressed terms."
)
args = parser.parse_args()
main(args.meta_path, args.output_path, args.source)
main(args.meta_path, args.output_path, args.lang, args.source, args.non_suppressed)

0 comments on commit a5276f1

Please sign in to comment.