From fed781e0a45aadb3e6695bfcde39cc975a78d678 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 4 Nov 2024 17:30:59 -0500 Subject: [PATCH 01/24] Updated NodeNorm preferred label to match Babel's. --- config.json | 3 ++- node_normalizer/normalizer.py | 35 ++++++++++++++++++++++++++++------- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/config.json b/config.json index 616334c..e003005 100644 --- a/config.json +++ b/config.json @@ -52,5 +52,6 @@ "HMDB", "PUBCHEM.COMPOUND" ] - } + }, + "demote_labels_longer_than": 15 } diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index ab0ea83..bc4cdf5 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -712,21 +712,42 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # Note that types[canonical_id] goes from most specific to least specific, so we # need to reverse it in order to apply preferred_name_boost_prefixes for the most # specific type. + possible_labels = [] for typ in types[canonical_id][::-1]: if typ in config['preferred_name_boost_prefixes']: - # This is the most specific matching type, so we use this. - labels = map(lambda identifier: identifier.get('l', ''), + # This is the most specific matching type, so we use this and then break. + possible_labels = map(lambda identifier: identifier.get('l', ''), sort_identifiers_with_boosted_prefixes( eids, config['preferred_name_boost_prefixes'][typ] )) + + # Add in all the other labels -- we'd still like to consider them, but at a lower priority. + for eid in eids: + label = eid.get('l', '') + if label not in possible_labels: + possible_labels.append(label) + + # Since this is the most specific matching type, we shouldn't do other (presumably higher-level) + # categories: so let's break here. break - # Filter out unsuitable labels. - labels = [l for l in labels if - l and # Ignore blank or empty names. - not l.startswith('CHEMBL') # Some CHEMBL names are just the identifier again. - ] + # Step 1.2. If we didn't have a preferred_name_boost_prefixes, just use the identifiers in their + # Biolink prefix order. + if not possible_labels: + possible_labels = map(lambda eid: eid.get('l', ''), eids) + + # Step 2. Filter out any suspicious labels. + filtered_possible_labels = [l for l in possible_labels if + l and # Ignore blank or empty names. + not l.startswith('CHEMBL') # Some CHEMBL names are just the identifier again. + ] + + # Step 3. Filter out labels longer than config['demote_labels_longer_than'], but only if there is at + # least one label shorter than this limit. + labels_shorter_than_limit = [l for l in filtered_possible_labels if l and len(l) <= config['demote_labels_longer_than']] + if labels_shorter_than_limit: + labels = labels_shorter_than_limit # Note that the id will be from the equivalent ids, not the canonical_id. This is to handle conflation if len(labels) > 0: From 0d1545fb87e103d727608213927d10c1b69490ff Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 4 Nov 2024 18:09:01 -0500 Subject: [PATCH 02/24] Added on:push trigger for testing. 
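
For the record, the label selection that the previous patch sets up boils down to
the standalone sketch below. The inputs are hypothetical, the boost list is
trimmed, and sort_identifiers_with_boosted_prefixes is reduced here to a stable
sort by boosted-prefix rank, standing in for the real helper:

    def sort_identifiers_with_boosted_prefixes(identifiers, prefixes):
        # Boosted prefixes come first, in the order listed; everything else
        # keeps its original (Biolink prefix) order. sorted() is stable.
        def rank(identifier):
            prefix = identifier['i'].split(':', 1)[0]
            return prefixes.index(prefix) if prefix in prefixes else len(prefixes)
        return sorted(identifiers, key=rank)

    def choose_label(eids, boost_prefixes, demote_labels_longer_than=15):
        possible_labels = [eid.get('l', '') for eid in
                           sort_identifiers_with_boosted_prefixes(eids, boost_prefixes)]
        # Drop blank labels and CHEMBL names that are just the identifier again.
        filtered = [l for l in possible_labels if l and not l.startswith('CHEMBL')]
        # Demote overlong labels, but only if a shorter one exists.
        shorter = [l for l in filtered if len(l) <= demote_labels_longer_than]
        candidates = shorter or filtered
        return candidates[0] if candidates else None

    eids = [
        {'i': 'PUBCHEM.COMPOUND:2244', 'l': '2-acetyloxybenzoic acid'},
        {'i': 'CHEBI:15365', 'l': 'acetylsalicylic acid'},
        {'i': 'DRUGBANK:DB00945', 'l': 'Aspirin'},
    ]
    print(choose_label(eids, ['DRUGBANK', 'CHEBI']))  # -> 'Aspirin'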
--- .github/workflows/release.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 800d57b..e9d8f8f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,6 +1,7 @@ name: 'Publish to GitHub Packages' on: + push: release: types: [published] From c7751e1361affd0cd1295daa52c3f82a480755cc Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 4 Nov 2024 18:29:22 -0500 Subject: [PATCH 03/24] Wrapped a map() in a list(). --- node_normalizer/normalizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index bc4cdf5..d3b25d8 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -716,11 +716,11 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ for typ in types[canonical_id][::-1]: if typ in config['preferred_name_boost_prefixes']: # This is the most specific matching type, so we use this and then break. - possible_labels = map(lambda identifier: identifier.get('l', ''), + possible_labels = list(map(lambda identifier: identifier.get('l', ''), sort_identifiers_with_boosted_prefixes( eids, config['preferred_name_boost_prefixes'][typ] - )) + ))) # Add in all the other labels -- we'd still like to consider them, but at a lower priority. for eid in eids: From 4f4517deecdf4db00947f54f6148425b425cf230 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 4 Nov 2024 18:34:00 -0500 Subject: [PATCH 04/24] Deleted on:push trigger after testing. --- .github/workflows/release.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e9d8f8f..800d57b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,7 +1,6 @@ name: 'Publish to GitHub Packages' on: - push: release: types: [published] From b3590f84ec2cfc3319a77572e97c3849153967bb Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 00:39:10 -0500 Subject: [PATCH 05/24] Updated preferred_name_boost_prefixes to sync with Babel. We use the order in https://github.com/TranslatorSRI/Babel/pull/330 --- config.json | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/config.json b/config.json index e003005..e618bf0 100644 --- a/config.json +++ b/config.json @@ -44,12 +44,13 @@ "preferred_name_boost_prefixes": { "biolink:ChemicalEntity": [ "DRUGBANK", - "GTOPDB", "DrugCentral", - "CHEMBL.COMPOUND", - "RXCUI", "CHEBI", + "MESH", + "CHEMBL.COMPOUND", + "GTOPDB", "HMDB", + "RXCUI", "PUBCHEM.COMPOUND" ] }, From 31bc57fde490ba5fb26aada1a3fa0c05f9734cac Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 02:16:34 -0500 Subject: [PATCH 06/24] Added on:push trigger for testing. --- .github/workflows/release.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 800d57b..e9d8f8f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,6 +1,7 @@ name: 'Publish to GitHub Packages' on: + push: release: types: [published] From 8f64ec3e683ce75308410126d30890e522f0633b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 10:28:53 -0500 Subject: [PATCH 07/24] Fixed possible bug in label choosing. 
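
The demotion step is only meant to narrow the filtered list, and when nothing
fits under the length limit the pick has to fall back to that same filtered
list -- before this fix it could still read the stale, unfiltered labels
variable from the line this patch deletes. A minimal sketch of the intended
behaviour, with hypothetical labels:

    def pick_label(filtered_possible_labels, demote_labels_longer_than=15):
        shorter = [l for l in filtered_possible_labels
                   if len(l) <= demote_labels_longer_than]
        if shorter:
            # Demote the overlong labels by narrowing to the short ones.
            filtered_possible_labels = shorter
        return filtered_possible_labels[0] if filtered_possible_labels else None

    print(pick_label(['N-acetyl-D-glucosamine 6-phosphate']))
    # -> 'N-acetyl-D-glucosamine 6-phosphate' (falls back: nothing is short enough)
    print(pick_label(['N-acetyl-D-glucosamine 6-phosphate', 'GlcNAc-6-P']))
    # -> 'GlcNAc-6-P'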
--- node_normalizer/normalizer.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index d3b25d8..a00a0b5 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -707,8 +707,7 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # As per https://github.com/TranslatorSRI/Babel/issues/158, we select the first label from any # identifier _except_ where one of the types is in preferred_name_boost_prefixes, in which case # we prefer the prefixes listed there. - labels = list(filter(lambda x: len(x) > 0, [eid['l'] for eid in eids if 'l' in eid])) - + # # Note that types[canonical_id] goes from most specific to least specific, so we # need to reverse it in order to apply preferred_name_boost_prefixes for the most # specific type. @@ -747,11 +746,11 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # least one label shorter than this limit. labels_shorter_than_limit = [l for l in filtered_possible_labels if l and len(l) <= config['demote_labels_longer_than']] if labels_shorter_than_limit: - labels = labels_shorter_than_limit + filtered_possible_labels = labels_shorter_than_limit # Note that the id will be from the equivalent ids, not the canonical_id. This is to handle conflation - if len(labels) > 0: - node = {"id": {"identifier": eids[0]['i'], "label": labels[0]}} + if len(filtered_possible_labels) > 0: + node = {"id": {"identifier": eids[0]['i'], "label": filtered_possible_labels[0]}} else: # Sometimes, nothing has a label :( node = {"id": {"identifier": eids[0]['i']}} From 67a46c9ab8daace14dabbd166535fb721f30679b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 10:48:46 -0500 Subject: [PATCH 08/24] Get rid of trying to sync preferred label algorithm. --- node_normalizer/normalizer.py | 48 +++++++++++++++-------------------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index a00a0b5..ff70603 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -708,33 +708,27 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # identifier _except_ where one of the types is in preferred_name_boost_prefixes, in which case # we prefer the prefixes listed there. # - # Note that types[canonical_id] goes from most specific to least specific, so we - # need to reverse it in order to apply preferred_name_boost_prefixes for the most - # specific type. - possible_labels = [] - for typ in types[canonical_id][::-1]: - if typ in config['preferred_name_boost_prefixes']: - # This is the most specific matching type, so we use this and then break. - possible_labels = list(map(lambda identifier: identifier.get('l', ''), - sort_identifiers_with_boosted_prefixes( - eids, - config['preferred_name_boost_prefixes'][typ] - ))) - - # Add in all the other labels -- we'd still like to consider them, but at a lower priority. - for eid in eids: - label = eid.get('l', '') - if label not in possible_labels: - possible_labels.append(label) - - # Since this is the most specific matching type, we shouldn't do other (presumably higher-level) - # categories: so let's break here. - break - - # Step 1.2. If we didn't have a preferred_name_boost_prefixes, just use the identifiers in their - # Biolink prefix order. 
- if not possible_labels: - possible_labels = map(lambda eid: eid.get('l', ''), eids) + # HOWEVER, there are three reasons not to do that here: + # 1. For NameRes, it makes sense that we're trying to come up with the best label for a clique + # so we can autocomplete to it. But for NodeNorm, users would be expecting the label that + # goes with the identifier we've normalized to, so we should probably go with that label + # unless that would be annoying (e.g. if it's very long). + # 2. It will be impossible to keep this in sync with NameRes for conflated names, since NameRes + # conflation in Babel doesn't pick the preferred label across all possible labels within the + # conflated clique, but instead picks the preferred label for each subclique, and then chooses + # the first preferred label in order of conflation. Which is what we should be doing, but by + # this point we've lost track of each subclique that went into this conflated clique. + # 3. Even in a best case scenario, we'd just be trying to replicate some pretty complicated code + # in Babel -- the ideal solution here would be to use the preferred_name being generated by + # Babel, but that will require some large changes to NodeNorm. + # + # For these reasons, I'm going to try to replace this with a simplified algorithm: + # - Order labels in clique identifier order. + # - Filter out blank or suspicious identifiers (e.g. `CHEMBL...`) identifiers. + # - Filter out labels longer than demote_labels_longer_than unless there are no labels under that size. + # + # Step 1. Get all possible labels. + possible_labels = map(lambda eid: eid.get('l', ''), eids) # Step 2. Filter out any suspicious labels. filtered_possible_labels = [l for l in possible_labels if From d09c560846e478f645902d2e6c7b857c3f81274f Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 11:14:42 -0500 Subject: [PATCH 09/24] Revert "Get rid of trying to sync preferred label algorithm." This reverts commit 67a46c9ab8daace14dabbd166535fb721f30679b. --- node_normalizer/normalizer.py | 48 ++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index ff70603..a00a0b5 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -708,27 +708,33 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # identifier _except_ where one of the types is in preferred_name_boost_prefixes, in which case # we prefer the prefixes listed there. # - # HOWEVER, there are three reasons not to do that here: - # 1. For NameRes, it makes sense that we're trying to come up with the best label for a clique - # so we can autocomplete to it. But for NodeNorm, users would be expecting the label that - # goes with the identifier we've normalized to, so we should probably go with that label - # unless that would be annoying (e.g. if it's very long). - # 2. It will be impossible to keep this in sync with NameRes for conflated names, since NameRes - # conflation in Babel doesn't pick the preferred label across all possible labels within the - # conflated clique, but instead picks the preferred label for each subclique, and then chooses - # the first preferred label in order of conflation. Which is what we should be doing, but by - # this point we've lost track of each subclique that went into this conflated clique. - # 3. 
Even in a best case scenario, we'd just be trying to replicate some pretty complicated code - # in Babel -- the ideal solution here would be to use the preferred_name being generated by - # Babel, but that will require some large changes to NodeNorm. - # - # For these reasons, I'm going to try to replace this with a simplified algorithm: - # - Order labels in clique identifier order. - # - Filter out blank or suspicious identifiers (e.g. `CHEMBL...`) identifiers. - # - Filter out labels longer than demote_labels_longer_than unless there are no labels under that size. - # - # Step 1. Get all possible labels. - possible_labels = map(lambda eid: eid.get('l', ''), eids) + # Note that types[canonical_id] goes from most specific to least specific, so we + # need to reverse it in order to apply preferred_name_boost_prefixes for the most + # specific type. + possible_labels = [] + for typ in types[canonical_id][::-1]: + if typ in config['preferred_name_boost_prefixes']: + # This is the most specific matching type, so we use this and then break. + possible_labels = list(map(lambda identifier: identifier.get('l', ''), + sort_identifiers_with_boosted_prefixes( + eids, + config['preferred_name_boost_prefixes'][typ] + ))) + + # Add in all the other labels -- we'd still like to consider them, but at a lower priority. + for eid in eids: + label = eid.get('l', '') + if label not in possible_labels: + possible_labels.append(label) + + # Since this is the most specific matching type, we shouldn't do other (presumably higher-level) + # categories: so let's break here. + break + + # Step 1.2. If we didn't have a preferred_name_boost_prefixes, just use the identifiers in their + # Biolink prefix order. + if not possible_labels: + possible_labels = map(lambda eid: eid.get('l', ''), eids) # Step 2. Filter out any suspicious labels. filtered_possible_labels = [l for l in possible_labels if From 5b2e20f067c884296088758c34109d980ca43d4a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 11:18:41 -0500 Subject: [PATCH 10/24] Improve documentation. --- node_normalizer/normalizer.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index a00a0b5..9b68b87 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -708,6 +708,13 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # identifier _except_ where one of the types is in preferred_name_boost_prefixes, in which case # we prefer the prefixes listed there. # + # This should perfectly replicate NameRes labels for non-conflated cliques, but it WON'T perfectly + # match conflated cliques. When Babel conflates synonyms, it actually picks the first preferred name + # it can among the cliques being conflated -- which means it applies the preferred label algorithm + # to just the first clique being conflated, then the next clique, and so on. But by this place in + # NodeNorm we've lost track of what the subcliques within the conflated cliques are, so all we can + # do is apply the preferred label algorithm across all possible labels and hope for the best. + # # Note that types[canonical_id] goes from most specific to least specific, so we # need to reverse it in order to apply preferred_name_boost_prefixes for the most # specific type. From 6e8d1fa395c8670a12d7921831ae70855e29e6e0 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 15:16:20 -0500 Subject: [PATCH 11/24] Unreverted the simpler algorithm. 
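
In full, the simplified selection reads like the sketch below (assumptions:
eids already arrive in clique identifier order, i.e. Biolink prefix order,
with labels under the 'l' key):

    def simple_preferred_label(eids, demote_labels_longer_than=15):
        possible_labels = [eid.get('l', '') for eid in eids]   # clique order
        # Drop blank or suspicious labels (e.g. CHEMBL identifier-like names).
        filtered = [l for l in possible_labels
                    if l and not l.startswith('CHEMBL')]
        shorter = [l for l in filtered if len(l) <= demote_labels_longer_than]
        candidates = shorter or filtered                       # demote, don't discard
        return candidates[0] if candidates else None

    eids = [{'i': 'CHEMBL.COMPOUND:CHEMBL25', 'l': 'CHEMBL25'},
            {'i': 'MESH:D001241', 'l': 'Aspirin'}]
    print(simple_preferred_label(eids))  # -> 'Aspirin'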
--- node_normalizer/normalizer.py | 53 +++++++++++++---------------------- 1 file changed, 20 insertions(+), 33 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 9b68b87..ff70603 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -708,40 +708,27 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # identifier _except_ where one of the types is in preferred_name_boost_prefixes, in which case # we prefer the prefixes listed there. # - # This should perfectly replicate NameRes labels for non-conflated cliques, but it WON'T perfectly - # match conflated cliques. When Babel conflates synonyms, it actually picks the first preferred name - # it can among the cliques being conflated -- which means it applies the preferred label algorithm - # to just the first clique being conflated, then the next clique, and so on. But by this place in - # NodeNorm we've lost track of what the subcliques within the conflated cliques are, so all we can - # do is apply the preferred label algorithm across all possible labels and hope for the best. + # HOWEVER, there are three reasons not to do that here: + # 1. For NameRes, it makes sense that we're trying to come up with the best label for a clique + # so we can autocomplete to it. But for NodeNorm, users would be expecting the label that + # goes with the identifier we've normalized to, so we should probably go with that label + # unless that would be annoying (e.g. if it's very long). + # 2. It will be impossible to keep this in sync with NameRes for conflated names, since NameRes + # conflation in Babel doesn't pick the preferred label across all possible labels within the + # conflated clique, but instead picks the preferred label for each subclique, and then chooses + # the first preferred label in order of conflation. Which is what we should be doing, but by + # this point we've lost track of each subclique that went into this conflated clique. + # 3. Even in a best case scenario, we'd just be trying to replicate some pretty complicated code + # in Babel -- the ideal solution here would be to use the preferred_name being generated by + # Babel, but that will require some large changes to NodeNorm. # - # Note that types[canonical_id] goes from most specific to least specific, so we - # need to reverse it in order to apply preferred_name_boost_prefixes for the most - # specific type. - possible_labels = [] - for typ in types[canonical_id][::-1]: - if typ in config['preferred_name_boost_prefixes']: - # This is the most specific matching type, so we use this and then break. - possible_labels = list(map(lambda identifier: identifier.get('l', ''), - sort_identifiers_with_boosted_prefixes( - eids, - config['preferred_name_boost_prefixes'][typ] - ))) - - # Add in all the other labels -- we'd still like to consider them, but at a lower priority. - for eid in eids: - label = eid.get('l', '') - if label not in possible_labels: - possible_labels.append(label) - - # Since this is the most specific matching type, we shouldn't do other (presumably higher-level) - # categories: so let's break here. - break - - # Step 1.2. If we didn't have a preferred_name_boost_prefixes, just use the identifiers in their - # Biolink prefix order. - if not possible_labels: - possible_labels = map(lambda eid: eid.get('l', ''), eids) + # For these reasons, I'm going to try to replace this with a simplified algorithm: + # - Order labels in clique identifier order. 
+ # - Filter out blank or suspicious identifiers (e.g. `CHEMBL...`) identifiers. + # - Filter out labels longer than demote_labels_longer_than unless there are no labels under that size. + # + # Step 1. Get all possible labels. + possible_labels = map(lambda eid: eid.get('l', ''), eids) # Step 2. Filter out any suspicious labels. filtered_possible_labels = [l for l in possible_labels if From ab4404d99159ad772cde65de0338f89e4ea189fb Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 15:34:00 -0500 Subject: [PATCH 12/24] Increased demote_labels_longer_than to 40. --- config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.json b/config.json index e618bf0..416eeca 100644 --- a/config.json +++ b/config.json @@ -54,5 +54,5 @@ "PUBCHEM.COMPOUND" ] }, - "demote_labels_longer_than": 15 + "demote_labels_longer_than": 40 } From 7faf8aa0d13a6575fd0a934f81a2187521e49851 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 15:46:30 -0500 Subject: [PATCH 13/24] Reduced demote_labels_longer_than to 20. --- config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.json b/config.json index 416eeca..d6dc205 100644 --- a/config.json +++ b/config.json @@ -54,5 +54,5 @@ "PUBCHEM.COMPOUND" ] }, - "demote_labels_longer_than": 40 + "demote_labels_longer_than": 20 } From 78cfb42d7ca4f5ba3ff14274c913be5a99b9a24e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 18:15:20 -0500 Subject: [PATCH 14/24] Revert "Unreverted the simpler algorithm." This reverts commit 6e8d1fa395c8670a12d7921831ae70855e29e6e0. --- node_normalizer/normalizer.py | 53 ++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index ff70603..9b68b87 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -708,27 +708,40 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # identifier _except_ where one of the types is in preferred_name_boost_prefixes, in which case # we prefer the prefixes listed there. # - # HOWEVER, there are three reasons not to do that here: - # 1. For NameRes, it makes sense that we're trying to come up with the best label for a clique - # so we can autocomplete to it. But for NodeNorm, users would be expecting the label that - # goes with the identifier we've normalized to, so we should probably go with that label - # unless that would be annoying (e.g. if it's very long). - # 2. It will be impossible to keep this in sync with NameRes for conflated names, since NameRes - # conflation in Babel doesn't pick the preferred label across all possible labels within the - # conflated clique, but instead picks the preferred label for each subclique, and then chooses - # the first preferred label in order of conflation. Which is what we should be doing, but by - # this point we've lost track of each subclique that went into this conflated clique. - # 3. Even in a best case scenario, we'd just be trying to replicate some pretty complicated code - # in Babel -- the ideal solution here would be to use the preferred_name being generated by - # Babel, but that will require some large changes to NodeNorm. + # This should perfectly replicate NameRes labels for non-conflated cliques, but it WON'T perfectly + # match conflated cliques. 
When Babel conflates synonyms, it actually picks the first preferred name + # it can among the cliques being conflated -- which means it applies the preferred label algorithm + # to just the first clique being conflated, then the next clique, and so on. But by this place in + # NodeNorm we've lost track of what the subcliques within the conflated cliques are, so all we can + # do is apply the preferred label algorithm across all possible labels and hope for the best. # - # For these reasons, I'm going to try to replace this with a simplified algorithm: - # - Order labels in clique identifier order. - # - Filter out blank or suspicious identifiers (e.g. `CHEMBL...`) identifiers. - # - Filter out labels longer than demote_labels_longer_than unless there are no labels under that size. - # - # Step 1. Get all possible labels. - possible_labels = map(lambda eid: eid.get('l', ''), eids) + # Note that types[canonical_id] goes from most specific to least specific, so we + # need to reverse it in order to apply preferred_name_boost_prefixes for the most + # specific type. + possible_labels = [] + for typ in types[canonical_id][::-1]: + if typ in config['preferred_name_boost_prefixes']: + # This is the most specific matching type, so we use this and then break. + possible_labels = list(map(lambda identifier: identifier.get('l', ''), + sort_identifiers_with_boosted_prefixes( + eids, + config['preferred_name_boost_prefixes'][typ] + ))) + + # Add in all the other labels -- we'd still like to consider them, but at a lower priority. + for eid in eids: + label = eid.get('l', '') + if label not in possible_labels: + possible_labels.append(label) + + # Since this is the most specific matching type, we shouldn't do other (presumably higher-level) + # categories: so let's break here. + break + + # Step 1.2. If we didn't have a preferred_name_boost_prefixes, just use the identifiers in their + # Biolink prefix order. + if not possible_labels: + possible_labels = map(lambda eid: eid.get('l', ''), eids) # Step 2. Filter out any suspicious labels. filtered_possible_labels = [l for l in possible_labels if From a867dea351e63af8d1d3227262f3e40bb67d970c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 18:31:19 -0500 Subject: [PATCH 15/24] Support Babel's preferred labels for conflated cliques. 
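
The heart of this patch is a scan for the first subclique that carries any
label at all. Sketched below with the Redis-backed get_eqids_and_types stubbed
out over hypothetical data (the list-argument call shape is the one the next
few patches settle on):

    import asyncio

    SUBCLIQUES = {  # hypothetical: each CURIE -> its own, unconflated clique
        'UniProtKB:P24941': [{'i': 'UniProtKB:P24941', 'l': ''}],
        'NCBIGene:1017': [{'i': 'NCBIGene:1017', 'l': 'CDK2'}],
    }

    async def get_eqids_and_types(app, curies):
        return [SUBCLIQUES[c] for c in curies], None

    async def first_labelled_subclique(app, eids):
        # Walk the conflated clique in order and return the first subclique
        # that has at least one non-empty label.
        for identifier in eids:
            results, _ = await get_eqids_and_types(app, [identifier.get('i', '')])
            identifiers_with_labels = results[0]
            if any(ident.get('l', '') for ident in identifiers_with_labels):
                return identifiers_with_labels
        return []  # nothing had a label; at least we tried

    eids = [{'i': 'UniProtKB:P24941'}, {'i': 'NCBIGene:1017'}]
    print(asyncio.run(first_labelled_subclique(None, eids)))
    # -> [{'i': 'NCBIGene:1017', 'l': 'CDK2'}]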
---
 node_normalizer/normalizer.py | 59 ++++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 14 deletions(-)

diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py
index 9b68b87..8ee34ba 100644
--- a/node_normalizer/normalizer.py
+++ b/node_normalizer/normalizer.py
@@ -643,9 +643,13 @@ async def get_normalized_nodes(
     # output the final result
     normal_nodes = {
-        input_curie: await create_node(canonical_id, dereference_ids, dereference_types, info_contents,
+        input_curie: await create_node(app, canonical_id, dereference_ids, dereference_types, info_contents,
                                        include_descriptions=include_descriptions,
-                                       include_individual_types=include_individual_types)
+                                       include_individual_types=include_individual_types,
+                                       conflations={
+                                           'GeneProtein': conflate_gene_protein,
+                                           'DrugChemical': conflate_chemical_drug,
+                                       })
         for input_curie, canonical_id in zip(curies, canonical_ids)
     }
@@ -680,13 +684,17 @@ async def get_info_content_attribute(app, canonical_nonan) -> dict:
     return new_attrib


-async def create_node(canonical_id, equivalent_ids, types, info_contents, include_descriptions=True,
-                      include_individual_types=False):
+async def create_node(app, canonical_id, equivalent_ids, types, info_contents, include_descriptions=True,
+                      include_individual_types=False, conflations=None):
     """Construct the output format given the compressed redis data"""
     # It's possible that we didn't find a canonical_id
     if canonical_id is None:
         return None

+    # If no conflation information was provided, assume it's empty.
+    if conflations is None:
+        conflations = {}
+
     # If we have 'None' in the equivalent IDs, skip it so we don't confuse things further down the line.
     if None in equivalent_ids[canonical_id]:
         logging.warning(f"Skipping None in canonical ID {canonical_id} among eqids: {equivalent_ids}")
@@ -709,12 +717,32 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ
     # we prefer the prefixes listed there.
     #
     # This should perfectly replicate NameRes labels for non-conflated cliques, but it WON'T perfectly
-    # match conflated cliques. When Babel conflates synonyms, it actually picks the first preferred name
-    # it can among the cliques being conflated -- which means it applies the preferred label algorithm
-    # to just the first clique being conflated, then the next clique, and so on. But by this place in
-    # NodeNorm we've lost track of what the subcliques within the conflated cliques are, so all we can
-    # do is apply the preferred label algorithm across all possible labels and hope for the best.
-    #
+    # match conflated cliques. To do that, we need to run the preferred label algorithm on ONLY the labels
+    # for the FIRST clique of the conflated cliques with labels.
+    any_conflation = any(conflations.values())
+    if not any_conflation:
+        # No conflation. We just use the identifiers we've been given.
+        identifiers_with_labels = equivalent_ids[canonical_id]
+    else:
+        # We have a conflation going on! To replicate Babel's behavior, we need to run the algorithm
+        # on the list of labels corresponding to the first subclique with labels.
+        # So we need to run the algorithm on the first set of identifiers that have any
+        # label whatsoever.
+ identifiers_with_labels = [] + for identifier in equivalent_ids[canonical_id]: + curie = identifier.get('i', '') + identifiers_with_labels, types = await get_eqids_and_types(app, curie) + labels = map(lambda ident: ident.get('l', ''), identifiers_with_labels[curie]) + if any(map(lambda l: l != '', labels)): + break + + # We might get here without any labels, which is fine. At least we tried. + + # At this point: + # - eids will be the full list of all identifiers and labels in this clique. + # - identifiers_with_labels is the list of identifiers and labels for the first subclique that has at least + # one label. + # Note that types[canonical_id] goes from most specific to least specific, so we # need to reverse it in order to apply preferred_name_boost_prefixes for the most # specific type. @@ -722,14 +750,14 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ for typ in types[canonical_id][::-1]: if typ in config['preferred_name_boost_prefixes']: # This is the most specific matching type, so we use this and then break. - possible_labels = list(map(lambda identifier: identifier.get('l', ''), + possible_labels = list(map(lambda ident: ident.get('l', ''), sort_identifiers_with_boosted_prefixes( - eids, + identifiers_with_labels, config['preferred_name_boost_prefixes'][typ] ))) # Add in all the other labels -- we'd still like to consider them, but at a lower priority. - for eid in eids: + for eid in identifiers_with_labels: label = eid.get('l', '') if label not in possible_labels: possible_labels.append(label) @@ -741,7 +769,7 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # Step 1.2. If we didn't have a preferred_name_boost_prefixes, just use the identifiers in their # Biolink prefix order. if not possible_labels: - possible_labels = map(lambda eid: eid.get('l', ''), eids) + possible_labels = map(lambda eid: eid.get('l', ''), identifiers_with_labels) # Step 2. Filter out any suspicious labels. filtered_possible_labels = [l for l in possible_labels if @@ -762,6 +790,9 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # Sometimes, nothing has a label :( node = {"id": {"identifier": eids[0]['i']}} + # Now that we've determined a label for this clique, we should never use identifiers_with_labels, possible_labels, + # or filtered_possible_labels after this point. + # if descriptions are enabled look for the first available description and use that if include_descriptions: descriptions = list( From b0021f17938a7a922630b896ef1cfc517589205f Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 18:40:27 -0500 Subject: [PATCH 16/24] Improved code, maybe fixed bug. --- node_normalizer/normalizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 8ee34ba..7216525 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -722,16 +722,16 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i any_conflation = any(conflations.values()) if not any_conflation: # No conflation. We just use the identifiers we've been given. - identifiers_with_labels = equivalent_ids[canonical_id] + identifiers_with_labels = eids else: # We have a conflation going on! 
To replicate Babel's behavior, we need to run the algorithm
         # on the list of labels corresponding to the first subclique with labels.
         # So we need to run the algorithm on the first set of identifiers that have any
         # label whatsoever.
         identifiers_with_labels = []
-        for identifier in equivalent_ids[canonical_id]:
+        for identifier in eids:
             curie = identifier.get('i', '')
-            identifiers_with_labels, types = await get_eqids_and_types(app, curie)
+            identifiers_with_labels, types = await get_eqids_and_types(app, [curie])
             labels = map(lambda ident: ident.get('l', ''), identifiers_with_labels[curie])
             if any(map(lambda l: l != '', labels)):
                 break

From 3e1319670137d85e8972b8e31b465e4b2b69ccd3 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Thu, 7 Nov 2024 18:47:03 -0500
Subject: [PATCH 17/24] More bugfixes.

---
 node_normalizer/normalizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py
index 7216525..989466f 100644
--- a/node_normalizer/normalizer.py
+++ b/node_normalizer/normalizer.py
@@ -732,7 +732,7 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i
         for identifier in eids:
             curie = identifier.get('i', '')
             identifiers_with_labels, types = await get_eqids_and_types(app, [curie])
-            labels = map(lambda ident: ident.get('l', ''), identifiers_with_labels[curie])
+            labels = map(lambda ident: ident.get('l', ''), identifiers_with_labels[0])
             if any(map(lambda l: l != '', labels)):
                 break

From 3312137314ad0ade73bfe02d049138623edf0ff2 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Thu, 7 Nov 2024 18:51:34 -0500
Subject: [PATCH 18/24] Oops.

---
 node_normalizer/normalizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py
index 989466f..7d47215 100644
--- a/node_normalizer/normalizer.py
+++ b/node_normalizer/normalizer.py
@@ -731,7 +731,7 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i
         for identifier in eids:
             curie = identifier.get('i', '')
-            identifiers_with_labels, types = await get_eqids_and_types(app, [curie])
+            identifiers_with_labels, _ = await get_eqids_and_types(app, [curie])
             labels = map(lambda ident: ident.get('l', ''), identifiers_with_labels[0])
             if any(map(lambda l: l != '', labels)):
                 break

From 20d1fd266e3d88b682d7e6827b58f8fcc8e1a7e9 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Thu, 7 Nov 2024 18:55:47 -0500
Subject: [PATCH 19/24] More bugfixes.

---
 node_normalizer/normalizer.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py
index 7d47215..2e03c20 100644
--- a/node_normalizer/normalizer.py
+++ b/node_normalizer/normalizer.py
@@ -731,8 +731,9 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i
         identifiers_with_labels = []
         for identifier in eids:
             curie = identifier.get('i', '')
-            identifiers_with_labels, _ = await get_eqids_and_types(app, [curie])
-            labels = map(lambda ident: ident.get('l', ''), identifiers_with_labels[0])
+            results, _ = await get_eqids_and_types(app, [curie])
+            identifiers_with_labels = results[0]
+            labels = map(lambda ident: ident.get('l', ''), identifiers_with_labels)
             if any(map(lambda l: l != '', labels)):
                 break

From 4cf160d1f08f937f338ac01db8c0ab4de92e3ddb Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Thu, 7 Nov 2024 19:03:32 -0500
Subject: [PATCH 20/24] Reduced demote_labels_longer_than to 15.

This will bring it in line with Babel. --- config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.json b/config.json index d6dc205..e618bf0 100644 --- a/config.json +++ b/config.json @@ -54,5 +54,5 @@ "PUBCHEM.COMPOUND" ] }, - "demote_labels_longer_than": 20 + "demote_labels_longer_than": 15 } From 0ea99197b7ccaea4690c238bce1324b14d0177b1 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 19:15:02 -0500 Subject: [PATCH 21/24] Removed on:push trigger after testing. --- .github/workflows/release.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e9d8f8f..800d57b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,7 +1,6 @@ name: 'Publish to GitHub Packages' on: - push: release: types: [published] From fa32307b4a633ecbb2ebb109ba8009028be916e1 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 8 Nov 2024 00:20:29 -0500 Subject: [PATCH 22/24] Slightly improved algorithm to avoid unnecessary queries to Redis. --- node_normalizer/normalizer.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 2e03c20..fa1ed61 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -729,14 +729,22 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i # So we need to run the algorithm on the first set of identifiers that have any # label whatsoever. identifiers_with_labels = [] + curies_already_checked = set() for identifier in eids: curie = identifier.get('i', '') + if curie in curies_already_checked: + continue results, _ = await get_eqids_and_types(app, [curie]) + identifiers_with_labels = results[0] labels = map(lambda ident: ident.get('l', ''), identifiers_with_labels) if any(map(lambda l: l != '', labels)): break + # Since we didn't get any matches here, add it to the list of CURIEs already checked so + # we don't make redundant queries to the database. + curies_already_checked.update(set(map(lambda x: x.get('i', ''), identifiers_with_labels))) + # We might get here without any labels, which is fine. At least we tried. # At this point: From 4800f1d6b1bed9cfa5de4258472c178aa9b8870b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 8 Nov 2024 00:21:03 -0500 Subject: [PATCH 23/24] Added on:push for testing. --- .github/workflows/release.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 800d57b..e9d8f8f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,6 +1,7 @@ name: 'Publish to GitHub Packages' on: + push: release: types: [published] From 6fb76758596fc6c7d55bc55a0d4258ed04c52166 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 8 Nov 2024 00:30:12 -0500 Subject: [PATCH 24/24] Removed on:push trigger. --- .github/workflows/release.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e9d8f8f..800d57b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,7 +1,6 @@ name: 'Publish to GitHub Packages' on: - push: release: types: [published]
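
Taken together, the lookup loop the series converges on (the PATCH 19
unpacking plus the PATCH 22 de-duplication) behaves like the following
runnable sketch, with get_eqids_and_types again stubbed out over hypothetical
subcliques; members of an already-fetched, label-less subclique no longer
trigger another Redis round-trip:

    import asyncio

    SUBCLIQUES = {  # hypothetical: each CURIE -> its own, unconflated clique
        'A:1': [{'i': 'A:1', 'l': ''}, {'i': 'A:2', 'l': ''}],
        'A:2': [{'i': 'A:1', 'l': ''}, {'i': 'A:2', 'l': ''}],
        'B:1': [{'i': 'B:1', 'l': 'a label'}],
    }
    QUERIES = 0

    async def get_eqids_and_types(app, curies):
        global QUERIES
        QUERIES += 1
        return [SUBCLIQUES[c] for c in curies], None

    async def scan(eids):
        curies_already_checked = set()
        identifiers_with_labels = []
        for identifier in eids:
            curie = identifier.get('i', '')
            if curie in curies_already_checked:
                continue  # this subclique was already fetched and had no labels
            results, _ = await get_eqids_and_types(None, [curie])
            identifiers_with_labels = results[0]
            if any(ident.get('l', '') for ident in identifiers_with_labels):
                break  # found a labelled subclique
            # No labels here: remember every member so we skip them later.
            curies_already_checked.update(
                ident.get('i', '') for ident in identifiers_with_labels)
        return identifiers_with_labels

    eids = [{'i': 'A:1'}, {'i': 'A:2'}, {'i': 'B:1'}]
    print(asyncio.run(scan(eids)), QUERIES)
    # -> [{'i': 'B:1', 'l': 'a label'}] 2  (A:2 is never re-queried)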