From d16887bb716483af74789ce89bd3761df5a0be5d Mon Sep 17 00:00:00 2001 From: nachollorca Date: Sun, 7 May 2023 14:09:00 +0200 Subject: [PATCH 1/9] Add lang and non_suppressed options to UMLS reader --- scispacy/umls_utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scispacy/umls_utils.py b/scispacy/umls_utils.py index 07be1b9..c037288 100644 --- a/scispacy/umls_utils.py +++ b/scispacy/umls_utils.py @@ -38,7 +38,7 @@ def read_umls_file_headers(meta_path: str, filename: str) -> List[str]: def read_umls_concepts( - meta_path: str, concept_details: Dict, source: Optional[str] = None + meta_path: str, concept_details: Dict, source: str = None, lang: str = None, non_suppressed : bool= True ): """ Read the concepts file MRCONSO.RRF from a UMLS release and store it in @@ -58,6 +58,8 @@ def read_umls_concepts( concept_details: a dictionary to be filled with concept informations source: An optional source identifier, used as a filter to extract only a specific source from UMLS. 
+ lang: An optional language identifier, used to filter terms by language + non_suppressed: flag to indicate whether only non-suppressed concepts should be kept """ concepts_filename = "MRCONSO.RRF" headers = read_umls_file_headers(meta_path, concepts_filename) @@ -66,8 +68,8 @@ def read_umls_concepts( splits = line.strip().split("|") assert len(headers) == len(splits), (headers, splits) concept = dict(zip(headers, splits)) - if concept["LAT"] != "ENG" or concept["SUPPRESS"] != "N": - continue # Keep English non-suppressed concepts only + if (lang is not None and concept["LAT"] != lang) or (non_suppressed and concept["SUPPRESS"] != "N"): + continue # Keep non-suppressed concepts in target language only if source is not None: if concept["SAB"] != source: From 8a72058ce12727eb5dfd1e3d4e3fda2ca2bfc4ac Mon Sep 17 00:00:00 2001 From: nachollorca Date: Sun, 7 May 2023 14:12:29 +0200 Subject: [PATCH 2/9] Support multiple lang exporting UMLS .jsons --- scripts/export_umls_json.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scripts/export_umls_json.py b/scripts/export_umls_json.py index 5b1192e..19c558a 100644 --- a/scripts/export_umls_json.py +++ b/scripts/export_umls_json.py @@ -7,7 +7,7 @@ import argparse from scispacy import umls_utils -def main(meta_path: str, output_path: str, source: str = None): +def main(meta_path: str, output_path: str, lang: str, source: str = None): concept_details = {} # dictionary of concept_id -> { # 'concept_id': str, @@ -18,7 +18,7 @@ def main(meta_path: str, output_path: str, source: str = None): # } print('Reading concepts ... ') - umls_utils.read_umls_concepts(meta_path, concept_details, source) + umls_utils.read_umls_concepts(meta_path, concept_details, source=source, lang=lang) print('Reading types ... 
') umls_utils.read_umls_types(meta_path, concept_details) @@ -95,6 +95,10 @@ def main(meta_path: str, output_path: str, source: str = None): '--output_path', help="Path to the output jsonl file" ) + parser.add_argument( + '--lang', + help="Language subset of UMLS" + ) parser.add_argument( '--source', type=str, @@ -102,4 +106,4 @@ def main(meta_path: str, output_path: str, source: str = None): help="Whether to filter for a only a single UMLS source." ) args = parser.parse_args() - main(args.meta_path, args.output_path, args.source) + main(args.meta_path, args.output_path, args.lang, args.source) From 5f25b3d33a0b2e81a0ec50c6e5502267bf20fb6a Mon Sep 17 00:00:00 2001 From: nachollorca Date: Sun, 7 May 2023 14:15:36 +0200 Subject: [PATCH 3/9] Add default to lang To avoid breaking current implementations --- scripts/export_umls_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/export_umls_json.py b/scripts/export_umls_json.py index 19c558a..976c382 100644 --- a/scripts/export_umls_json.py +++ b/scripts/export_umls_json.py @@ -7,7 +7,7 @@ import argparse from scispacy import umls_utils -def main(meta_path: str, output_path: str, lang: str, source: str = None): +def main(meta_path: str, output_path: str, lang: str = None, source: str = None): concept_details = {} # dictionary of concept_id -> { # 'concept_id': str, From 8c130ea37460ac9fcd960b85c50013752c32c2f8 Mon Sep 17 00:00:00 2001 From: nachollorca Date: Sun, 7 May 2023 14:24:20 +0200 Subject: [PATCH 4/9] Correct source type to Optional[str] --- scispacy/umls_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scispacy/umls_utils.py b/scispacy/umls_utils.py index c037288..3f26c7b 100644 --- a/scispacy/umls_utils.py +++ b/scispacy/umls_utils.py @@ -38,7 +38,7 @@ def read_umls_file_headers(meta_path: str, filename: str) -> List[str]: def read_umls_concepts( - meta_path: str, concept_details: Dict, source: str = None, lang: str = None, non_suppressed : bool= True + 
meta_path: str, concept_details: Dict, source: Optional[str] = None, lang: str = None, non_suppressed : bool= True ): """ Read the concepts file MRCONSO.RRF from a UMLS release and store it in From 1fd7b99cbe37073317ef16f053ef1360035b75fd Mon Sep 17 00:00:00 2001 From: nachollorca Date: Sun, 7 May 2023 15:19:51 +0200 Subject: [PATCH 5/9] Set English as default language to maintain default behavior --- scispacy/umls_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scispacy/umls_utils.py b/scispacy/umls_utils.py index 3f26c7b..f3f90c6 100644 --- a/scispacy/umls_utils.py +++ b/scispacy/umls_utils.py @@ -38,7 +38,7 @@ def read_umls_file_headers(meta_path: str, filename: str) -> List[str]: def read_umls_concepts( - meta_path: str, concept_details: Dict, source: Optional[str] = None, lang: str = None, non_suppressed : bool= True + meta_path: str, concept_details: Dict, source: Optional[str] = None, lang: str = "ENG", non_suppressed : bool= True ): """ Read the concepts file MRCONSO.RRF from a UMLS release and store it in From abdd0fe874ad6912f0c64740ad7fcfbdacaef3c0 Mon Sep 17 00:00:00 2001 From: nachollorca Date: Sun, 7 May 2023 15:23:48 +0200 Subject: [PATCH 6/9] Include --non_supressed And correct lang default to "ENG" --- scripts/export_umls_json.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/export_umls_json.py b/scripts/export_umls_json.py index 976c382..e98ff18 100644 --- a/scripts/export_umls_json.py +++ b/scripts/export_umls_json.py @@ -93,11 +93,12 @@ def main(meta_path: str, output_path: str, lang: str = None, source: str = None) ) parser.add_argument( '--output_path', - help="Path to the output jsonl file" + help="Path to the output jsonl file." ) parser.add_argument( '--lang', - help="Language subset of UMLS" + default="ENG", + help="Language subset of UMLS." 
) parser.add_argument( '--source', @@ -105,5 +106,10 @@ def main(meta_path: str, output_path: str, lang: str = None, source: str = None) default=None, help="Whether to filter for a only a single UMLS source." ) + parser.add_argument( + '--non_supressed', + default=True, + help="Whether to include non supressed terms." + ) args = parser.parse_args() main(args.meta_path, args.output_path, args.lang, args.source) From ea1fcc11c847d31b60e924a9ea92371750e98480 Mon Sep 17 00:00:00 2001 From: nachollorca Date: Sun, 7 May 2023 16:50:45 +0200 Subject: [PATCH 7/9] Include `args.non_suppressed` in main call and correct a typo --- scripts/export_umls_json.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/export_umls_json.py b/scripts/export_umls_json.py index e98ff18..e3acd74 100644 --- a/scripts/export_umls_json.py +++ b/scripts/export_umls_json.py @@ -107,9 +107,9 @@ def main(meta_path: str, output_path: str, lang: str = None, source: str = None) help="Whether to filter for a only a single UMLS source." ) parser.add_argument( - '--non_supressed', + '--non_suppressed', default=True, help="Whether to include non supressed terms." 
) args = parser.parse_args() - main(args.meta_path, args.output_path, args.lang, args.source) + main(args.meta_path, args.output_path, args.lang, args.source, args.non_suppressed) From 93a77264a1ed6a554a185a50b4a5b6aca8a41d26 Mon Sep 17 00:00:00 2001 From: nachollorca Date: Mon, 8 May 2023 18:08:30 +0200 Subject: [PATCH 8/9] Correct linting errors in `read_umls_concepts()` --- scispacy/umls_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scispacy/umls_utils.py b/scispacy/umls_utils.py index f3f90c6..943faa5 100644 --- a/scispacy/umls_utils.py +++ b/scispacy/umls_utils.py @@ -38,7 +38,11 @@ def read_umls_file_headers(meta_path: str, filename: str) -> List[str]: def read_umls_concepts( - meta_path: str, concept_details: Dict, source: Optional[str] = None, lang: str = "ENG", non_suppressed : bool= True + meta_path: str, + concept_details: Dict, + source: Optional[str] = None, + lang: str = "ENG", + non_suppressed: bool = True ): """ Read the concepts file MRCONSO.RRF from a UMLS release and store it in From a4a9141b38c60ddcb569a5fe3af2bbaec699dda9 Mon Sep 17 00:00:00 2001 From: illorca Date: Thu, 11 May 2023 18:54:45 +0200 Subject: [PATCH 9/9] Line length 88 --- scispacy/umls_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scispacy/umls_utils.py b/scispacy/umls_utils.py index 943faa5..1506bc9 100644 --- a/scispacy/umls_utils.py +++ b/scispacy/umls_utils.py @@ -42,7 +42,7 @@ def read_umls_concepts( concept_details: Dict, source: Optional[str] = None, lang: str = "ENG", - non_suppressed: bool = True + non_suppressed: bool = True, ): """ Read the concepts file MRCONSO.RRF from a UMLS release and store it in @@ -72,7 +72,9 @@ def read_umls_concepts( splits = line.strip().split("|") assert len(headers) == len(splits), (headers, splits) concept = dict(zip(headers, splits)) - if (lang is not None and concept["LAT"] != lang) or (non_suppressed and concept["SUPPRESS"] != "N"): + if (lang is not None and 
concept["LAT"] != lang) or ( + non_suppressed and concept["SUPPRESS"] != "N" + ): continue # Keep non-suppressed concepts in target language only if source is not None: