diff --git a/scispacy/umls_utils.py b/scispacy/umls_utils.py
index 07be1b9..1506bc9 100644
--- a/scispacy/umls_utils.py
+++ b/scispacy/umls_utils.py
@@ -38,7 +38,11 @@ def read_umls_file_headers(meta_path: str, filename: str) -> List[str]:
 
 
 def read_umls_concepts(
-    meta_path: str, concept_details: Dict, source: Optional[str] = None
+    meta_path: str,
+    concept_details: Dict,
+    source: Optional[str] = None,
+    lang: Optional[str] = "ENG",
+    non_suppressed: bool = True,
 ):
     """
     Read the concepts file MRCONSO.RRF from a UMLS release and store it in
@@ -58,6 +62,8 @@ def read_umls_concepts(
         concept_details: a dictionary to be filled with concept informations
         source: An optional source identifier, used as a filter to extract only a
                 specific source from UMLS.
+        lang: An optional language identifier, used to filter terms by language
+        non_suppressed: flag to indicate whether only non-suppressed concepts should be kept
     """
     concepts_filename = "MRCONSO.RRF"
     headers = read_umls_file_headers(meta_path, concepts_filename)
@@ -66,8 +72,10 @@ def read_umls_concepts(
             splits = line.strip().split("|")
             assert len(headers) == len(splits), (headers, splits)
             concept = dict(zip(headers, splits))
-            if concept["LAT"] != "ENG" or concept["SUPPRESS"] != "N":
-                continue  # Keep English non-suppressed concepts only
+            if (lang is not None and concept["LAT"] != lang) or (
+                non_suppressed and concept["SUPPRESS"] != "N"
+            ):
+                continue  # Keep non-suppressed concepts in target language only
 
             if source is not None:
                 if concept["SAB"] != source:
diff --git a/scripts/export_umls_json.py b/scripts/export_umls_json.py
index 5b1192e..e3acd74 100644
--- a/scripts/export_umls_json.py
+++ b/scripts/export_umls_json.py
@@ -7,7 +7,7 @@
 import argparse
 from scispacy import umls_utils
 
-def main(meta_path: str, output_path: str, source: str = None):
+def main(meta_path: str, output_path: str, source: str = None, lang: str = "ENG", non_suppressed: bool = True):
 
     concept_details = {}  # dictionary of concept_id -> {
                           #     'concept_id': str,
@@ -18,7 +18,7 @@ def main(meta_path: str, output_path: str, source: str = None):
                           # }
 
     print('Reading concepts ... ')
-    umls_utils.read_umls_concepts(meta_path, concept_details, source)
+    umls_utils.read_umls_concepts(meta_path, concept_details, source=source, lang=lang, non_suppressed=non_suppressed)
 
     print('Reading types ... ')
     umls_utils.read_umls_types(meta_path, concept_details)
@@ -93,7 +93,12 @@ def main(meta_path: str, output_path: str, source: str = None):
     )
     parser.add_argument(
         '--output_path',
-        help="Path to the output jsonl file"
+        help="Path to the output jsonl file."
+    )
+    parser.add_argument(
+        '--lang',
+        default="ENG",
+        help="Language subset of UMLS."
     )
     parser.add_argument(
         '--source',
@@ -101,5 +106,17 @@ def main(meta_path: str, output_path: str, source: str = None):
         default=None,
         help="Whether to filter for a only a single UMLS source."
     )
+    parser.add_argument(
+        '--non_suppressed',
+        type=lambda value: value.lower() not in ("false", "0", "no"),
+        default=True,
+        help="Whether to keep only non-suppressed terms (pass 'false' to also include suppressed terms)."
+    )
     args = parser.parse_args()
-    main(args.meta_path, args.output_path, args.source)
+    main(
+        args.meta_path,
+        args.output_path,
+        source=args.source,
+        lang=args.lang,
+        non_suppressed=args.non_suppressed,
+    )