From fed781e0a45aadb3e6695bfcde39cc975a78d678 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 4 Nov 2024 17:30:59 -0500 Subject: [PATCH 01/24] Updated NodeNorm preferred label to match Babel's. --- config.json | 3 ++- node_normalizer/normalizer.py | 35 ++++++++++++++++++++++++++++------- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/config.json b/config.json index 616334c..e003005 100644 --- a/config.json +++ b/config.json @@ -52,5 +52,6 @@ "HMDB", "PUBCHEM.COMPOUND" ] - } + }, + "demote_labels_longer_than": 15 } diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index ab0ea83..bc4cdf5 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -712,21 +712,42 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # Note that types[canonical_id] goes from most specific to least specific, so we # need to reverse it in order to apply preferred_name_boost_prefixes for the most # specific type. + possible_labels = [] for typ in types[canonical_id][::-1]: if typ in config['preferred_name_boost_prefixes']: - # This is the most specific matching type, so we use this. - labels = map(lambda identifier: identifier.get('l', ''), + # This is the most specific matching type, so we use this and then break. + possible_labels = map(lambda identifier: identifier.get('l', ''), sort_identifiers_with_boosted_prefixes( eids, config['preferred_name_boost_prefixes'][typ] )) + + # Add in all the other labels -- we'd still like to consider them, but at a lower priority. + for eid in eids: + label = eid.get('l', '') + if label not in possible_labels: + possible_labels.append(label) + + # Since this is the most specific matching type, we shouldn't do other (presumably higher-level) + # categories: so let's break here. break - # Filter out unsuitable labels. - labels = [l for l in labels if - l and # Ignore blank or empty names. - not l.startswith('CHEMBL') # Some CHEMBL names are just the identifier again. - ] + # Step 1.2. If we didn't have a preferred_name_boost_prefixes, just use the identifiers in their + # Biolink prefix order. + if not possible_labels: + possible_labels = map(lambda eid: eid.get('l', ''), eids) + + # Step 2. Filter out any suspicious labels. + filtered_possible_labels = [l for l in possible_labels if + l and # Ignore blank or empty names. + not l.startswith('CHEMBL') # Some CHEMBL names are just the identifier again. + ] + + # Step 3. Filter out labels longer than config['demote_labels_longer_than'], but only if there is at + # least one label shorter than this limit. + labels_shorter_than_limit = [l for l in filtered_possible_labels if l and len(l) <= config['demote_labels_longer_than']] + if labels_shorter_than_limit: + labels = labels_shorter_than_limit # Note that the id will be from the equivalent ids, not the canonical_id. This is to handle conflation if len(labels) > 0: From 0d1545fb87e103d727608213927d10c1b69490ff Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 4 Nov 2024 18:09:01 -0500 Subject: [PATCH 02/24] Added on:push trigger for testing. 
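
For the record, the label selection that the previous patch sets up boils down to
the standalone sketch below. The inputs are hypothetical, the boost list is
trimmed, and sort_identifiers_with_boosted_prefixes is reduced here to a stable
sort by boosted-prefix rank, standing in for the real helper:

    def sort_identifiers_with_boosted_prefixes(identifiers, prefixes):
        # Boosted prefixes come first, in the order listed; everything else
        # keeps its original (Biolink prefix) order. sorted() is stable.
        def rank(identifier):
            prefix = identifier['i'].split(':', 1)[0]
            return prefixes.index(prefix) if prefix in prefixes else len(prefixes)
        return sorted(identifiers, key=rank)

    def choose_label(eids, boost_prefixes, demote_labels_longer_than=15):
        possible_labels = [eid.get('l', '') for eid in
                           sort_identifiers_with_boosted_prefixes(eids, boost_prefixes)]
        # Drop blank labels and CHEMBL names that are just the identifier again.
        filtered = [l for l in possible_labels if l and not l.startswith('CHEMBL')]
        # Demote overlong labels, but only if a shorter one exists.
        shorter = [l for l in filtered if len(l) <= demote_labels_longer_than]
        candidates = shorter or filtered
        return candidates[0] if candidates else None

    eids = [
        {'i': 'PUBCHEM.COMPOUND:2244', 'l': '2-acetyloxybenzoic acid'},
        {'i': 'CHEBI:15365', 'l': 'acetylsalicylic acid'},
        {'i': 'DRUGBANK:DB00945', 'l': 'Aspirin'},
    ]
    print(choose_label(eids, ['DRUGBANK', 'CHEBI']))  # -> 'Aspirin'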
--- .github/workflows/release.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 800d57b..e9d8f8f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,6 +1,7 @@ name: 'Publish to GitHub Packages' on: + push: release: types: [published] From c7751e1361affd0cd1295daa52c3f82a480755cc Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 4 Nov 2024 18:29:22 -0500 Subject: [PATCH 03/24] Wrapped a map() in a list(). --- node_normalizer/normalizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index bc4cdf5..d3b25d8 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -716,11 +716,11 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ for typ in types[canonical_id][::-1]: if typ in config['preferred_name_boost_prefixes']: # This is the most specific matching type, so we use this and then break. - possible_labels = map(lambda identifier: identifier.get('l', ''), + possible_labels = list(map(lambda identifier: identifier.get('l', ''), sort_identifiers_with_boosted_prefixes( eids, config['preferred_name_boost_prefixes'][typ] - )) + ))) # Add in all the other labels -- we'd still like to consider them, but at a lower priority. for eid in eids: From 4f4517deecdf4db00947f54f6148425b425cf230 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Mon, 4 Nov 2024 18:34:00 -0500 Subject: [PATCH 04/24] Deleted on:push trigger after testing. --- .github/workflows/release.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e9d8f8f..800d57b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,7 +1,6 @@ name: 'Publish to GitHub Packages' on: - push: release: types: [published] From b3590f84ec2cfc3319a77572e97c3849153967bb Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 00:39:10 -0500 Subject: [PATCH 05/24] Updated preferred_name_boost_prefixes to sync with Babel. We use the order in https://github.com/TranslatorSRI/Babel/pull/330 --- config.json | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/config.json b/config.json index e003005..e618bf0 100644 --- a/config.json +++ b/config.json @@ -44,12 +44,13 @@ "preferred_name_boost_prefixes": { "biolink:ChemicalEntity": [ "DRUGBANK", - "GTOPDB", "DrugCentral", - "CHEMBL.COMPOUND", - "RXCUI", "CHEBI", + "MESH", + "CHEMBL.COMPOUND", + "GTOPDB", "HMDB", + "RXCUI", "PUBCHEM.COMPOUND" ] }, From 31bc57fde490ba5fb26aada1a3fa0c05f9734cac Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 02:16:34 -0500 Subject: [PATCH 06/24] Added on:push trigger for testing. --- .github/workflows/release.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 800d57b..e9d8f8f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,6 +1,7 @@ name: 'Publish to GitHub Packages' on: + push: release: types: [published] From 8f64ec3e683ce75308410126d30890e522f0633b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 10:28:53 -0500 Subject: [PATCH 07/24] Fixed possible bug in label choosing. 
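
The demotion step is only meant to narrow the filtered list, and when nothing
fits under the length limit the pick has to fall back to that same filtered
list -- before this fix it could still read the stale, unfiltered labels
variable from the line this patch deletes. A minimal sketch of the intended
behaviour, with hypothetical labels:

    def pick_label(filtered_possible_labels, demote_labels_longer_than=15):
        shorter = [l for l in filtered_possible_labels
                   if len(l) <= demote_labels_longer_than]
        if shorter:
            # Demote the overlong labels by narrowing to the short ones.
            filtered_possible_labels = shorter
        return filtered_possible_labels[0] if filtered_possible_labels else None

    print(pick_label(['N-acetyl-D-glucosamine 6-phosphate']))
    # -> 'N-acetyl-D-glucosamine 6-phosphate' (falls back: nothing is short enough)
    print(pick_label(['N-acetyl-D-glucosamine 6-phosphate', 'GlcNAc-6-P']))
    # -> 'GlcNAc-6-P'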
--- node_normalizer/normalizer.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index d3b25d8..a00a0b5 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -707,8 +707,7 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # As per https://github.com/TranslatorSRI/Babel/issues/158, we select the first label from any # identifier _except_ where one of the types is in preferred_name_boost_prefixes, in which case # we prefer the prefixes listed there. - labels = list(filter(lambda x: len(x) > 0, [eid['l'] for eid in eids if 'l' in eid])) - + # # Note that types[canonical_id] goes from most specific to least specific, so we # need to reverse it in order to apply preferred_name_boost_prefixes for the most # specific type. @@ -747,11 +746,11 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # least one label shorter than this limit. labels_shorter_than_limit = [l for l in filtered_possible_labels if l and len(l) <= config['demote_labels_longer_than']] if labels_shorter_than_limit: - labels = labels_shorter_than_limit + filtered_possible_labels = labels_shorter_than_limit # Note that the id will be from the equivalent ids, not the canonical_id. This is to handle conflation - if len(labels) > 0: - node = {"id": {"identifier": eids[0]['i'], "label": labels[0]}} + if len(filtered_possible_labels) > 0: + node = {"id": {"identifier": eids[0]['i'], "label": filtered_possible_labels[0]}} else: # Sometimes, nothing has a label :( node = {"id": {"identifier": eids[0]['i']}} From 67a46c9ab8daace14dabbd166535fb721f30679b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 10:48:46 -0500 Subject: [PATCH 08/24] Get rid of trying to sync preferred label algorithm. --- node_normalizer/normalizer.py | 48 +++++++++++++++-------------------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index a00a0b5..ff70603 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -708,33 +708,27 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # identifier _except_ where one of the types is in preferred_name_boost_prefixes, in which case # we prefer the prefixes listed there. # - # Note that types[canonical_id] goes from most specific to least specific, so we - # need to reverse it in order to apply preferred_name_boost_prefixes for the most - # specific type. - possible_labels = [] - for typ in types[canonical_id][::-1]: - if typ in config['preferred_name_boost_prefixes']: - # This is the most specific matching type, so we use this and then break. - possible_labels = list(map(lambda identifier: identifier.get('l', ''), - sort_identifiers_with_boosted_prefixes( - eids, - config['preferred_name_boost_prefixes'][typ] - ))) - - # Add in all the other labels -- we'd still like to consider them, but at a lower priority. - for eid in eids: - label = eid.get('l', '') - if label not in possible_labels: - possible_labels.append(label) - - # Since this is the most specific matching type, we shouldn't do other (presumably higher-level) - # categories: so let's break here. - break - - # Step 1.2. If we didn't have a preferred_name_boost_prefixes, just use the identifiers in their - # Biolink prefix order. 
- if not possible_labels: - possible_labels = map(lambda eid: eid.get('l', ''), eids) + # HOWEVER, there are three reasons not to do that here: + # 1. For NameRes, it makes sense that we're trying to come up with the best label for a clique + # so we can autocomplete to it. But for NodeNorm, users would be expecting the label that + # goes with the identifier we've normalized to, so we should probably go with that label + # unless that would be annoying (e.g. if it's very long). + # 2. It will be impossible to keep this in sync with NameRes for conflated names, since NameRes + # conflation in Babel doesn't pick the preferred label across all possible labels within the + # conflated clique, but instead picks the preferred label for each subclique, and then chooses + # the first preferred label in order of conflation. Which is what we should be doing, but by + # this point we've lost track of each subclique that went into this conflated clique. + # 3. Even in a best case scenario, we'd just be trying to replicate some pretty complicated code + # in Babel -- the ideal solution here would be to use the preferred_name being generated by + # Babel, but that will require some large changes to NodeNorm. + # + # For these reasons, I'm going to try to replace this with a simplified algorithm: + # - Order labels in clique identifier order. + # - Filter out blank or suspicious identifiers (e.g. `CHEMBL...`) identifiers. + # - Filter out labels longer than demote_labels_longer_than unless there are no labels under that size. + # + # Step 1. Get all possible labels. + possible_labels = map(lambda eid: eid.get('l', ''), eids) # Step 2. Filter out any suspicious labels. filtered_possible_labels = [l for l in possible_labels if From d09c560846e478f645902d2e6c7b857c3f81274f Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 11:14:42 -0500 Subject: [PATCH 09/24] Revert "Get rid of trying to sync preferred label algorithm." This reverts commit 67a46c9ab8daace14dabbd166535fb721f30679b. --- node_normalizer/normalizer.py | 48 ++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index ff70603..a00a0b5 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -708,27 +708,33 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # identifier _except_ where one of the types is in preferred_name_boost_prefixes, in which case # we prefer the prefixes listed there. # - # HOWEVER, there are three reasons not to do that here: - # 1. For NameRes, it makes sense that we're trying to come up with the best label for a clique - # so we can autocomplete to it. But for NodeNorm, users would be expecting the label that - # goes with the identifier we've normalized to, so we should probably go with that label - # unless that would be annoying (e.g. if it's very long). - # 2. It will be impossible to keep this in sync with NameRes for conflated names, since NameRes - # conflation in Babel doesn't pick the preferred label across all possible labels within the - # conflated clique, but instead picks the preferred label for each subclique, and then chooses - # the first preferred label in order of conflation. Which is what we should be doing, but by - # this point we've lost track of each subclique that went into this conflated clique. - # 3. 
Even in a best case scenario, we'd just be trying to replicate some pretty complicated code - # in Babel -- the ideal solution here would be to use the preferred_name being generated by - # Babel, but that will require some large changes to NodeNorm. - # - # For these reasons, I'm going to try to replace this with a simplified algorithm: - # - Order labels in clique identifier order. - # - Filter out blank or suspicious identifiers (e.g. `CHEMBL...`) identifiers. - # - Filter out labels longer than demote_labels_longer_than unless there are no labels under that size. - # - # Step 1. Get all possible labels. - possible_labels = map(lambda eid: eid.get('l', ''), eids) + # Note that types[canonical_id] goes from most specific to least specific, so we + # need to reverse it in order to apply preferred_name_boost_prefixes for the most + # specific type. + possible_labels = [] + for typ in types[canonical_id][::-1]: + if typ in config['preferred_name_boost_prefixes']: + # This is the most specific matching type, so we use this and then break. + possible_labels = list(map(lambda identifier: identifier.get('l', ''), + sort_identifiers_with_boosted_prefixes( + eids, + config['preferred_name_boost_prefixes'][typ] + ))) + + # Add in all the other labels -- we'd still like to consider them, but at a lower priority. + for eid in eids: + label = eid.get('l', '') + if label not in possible_labels: + possible_labels.append(label) + + # Since this is the most specific matching type, we shouldn't do other (presumably higher-level) + # categories: so let's break here. + break + + # Step 1.2. If we didn't have a preferred_name_boost_prefixes, just use the identifiers in their + # Biolink prefix order. + if not possible_labels: + possible_labels = map(lambda eid: eid.get('l', ''), eids) # Step 2. Filter out any suspicious labels. filtered_possible_labels = [l for l in possible_labels if From 5b2e20f067c884296088758c34109d980ca43d4a Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 11:18:41 -0500 Subject: [PATCH 10/24] Improve documentation. --- node_normalizer/normalizer.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index a00a0b5..9b68b87 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -708,6 +708,13 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # identifier _except_ where one of the types is in preferred_name_boost_prefixes, in which case # we prefer the prefixes listed there. # + # This should perfectly replicate NameRes labels for non-conflated cliques, but it WON'T perfectly + # match conflated cliques. When Babel conflates synonyms, it actually picks the first preferred name + # it can among the cliques being conflated -- which means it applies the preferred label algorithm + # to just the first clique being conflated, then the next clique, and so on. But by this place in + # NodeNorm we've lost track of what the subcliques within the conflated cliques are, so all we can + # do is apply the preferred label algorithm across all possible labels and hope for the best. + # # Note that types[canonical_id] goes from most specific to least specific, so we # need to reverse it in order to apply preferred_name_boost_prefixes for the most # specific type. From 6e8d1fa395c8670a12d7921831ae70855e29e6e0 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 15:16:20 -0500 Subject: [PATCH 11/24] Unreverted the simpler algorithm. 
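
In full, the simplified selection reads like the sketch below (assumptions:
eids already arrive in clique identifier order, i.e. Biolink prefix order,
with labels under the 'l' key):

    def simple_preferred_label(eids, demote_labels_longer_than=15):
        possible_labels = [eid.get('l', '') for eid in eids]   # clique order
        # Drop blank or suspicious labels (e.g. CHEMBL identifier-like names).
        filtered = [l for l in possible_labels
                    if l and not l.startswith('CHEMBL')]
        shorter = [l for l in filtered if len(l) <= demote_labels_longer_than]
        candidates = shorter or filtered                       # demote, don't discard
        return candidates[0] if candidates else None

    eids = [{'i': 'CHEMBL.COMPOUND:CHEMBL25', 'l': 'CHEMBL25'},
            {'i': 'MESH:D001241', 'l': 'Aspirin'}]
    print(simple_preferred_label(eids))  # -> 'Aspirin'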
--- node_normalizer/normalizer.py | 53 +++++++++++++---------------------- 1 file changed, 20 insertions(+), 33 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 9b68b87..ff70603 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -708,40 +708,27 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # identifier _except_ where one of the types is in preferred_name_boost_prefixes, in which case # we prefer the prefixes listed there. # - # This should perfectly replicate NameRes labels for non-conflated cliques, but it WON'T perfectly - # match conflated cliques. When Babel conflates synonyms, it actually picks the first preferred name - # it can among the cliques being conflated -- which means it applies the preferred label algorithm - # to just the first clique being conflated, then the next clique, and so on. But by this place in - # NodeNorm we've lost track of what the subcliques within the conflated cliques are, so all we can - # do is apply the preferred label algorithm across all possible labels and hope for the best. + # HOWEVER, there are three reasons not to do that here: + # 1. For NameRes, it makes sense that we're trying to come up with the best label for a clique + # so we can autocomplete to it. But for NodeNorm, users would be expecting the label that + # goes with the identifier we've normalized to, so we should probably go with that label + # unless that would be annoying (e.g. if it's very long). + # 2. It will be impossible to keep this in sync with NameRes for conflated names, since NameRes + # conflation in Babel doesn't pick the preferred label across all possible labels within the + # conflated clique, but instead picks the preferred label for each subclique, and then chooses + # the first preferred label in order of conflation. Which is what we should be doing, but by + # this point we've lost track of each subclique that went into this conflated clique. + # 3. Even in a best case scenario, we'd just be trying to replicate some pretty complicated code + # in Babel -- the ideal solution here would be to use the preferred_name being generated by + # Babel, but that will require some large changes to NodeNorm. # - # Note that types[canonical_id] goes from most specific to least specific, so we - # need to reverse it in order to apply preferred_name_boost_prefixes for the most - # specific type. - possible_labels = [] - for typ in types[canonical_id][::-1]: - if typ in config['preferred_name_boost_prefixes']: - # This is the most specific matching type, so we use this and then break. - possible_labels = list(map(lambda identifier: identifier.get('l', ''), - sort_identifiers_with_boosted_prefixes( - eids, - config['preferred_name_boost_prefixes'][typ] - ))) - - # Add in all the other labels -- we'd still like to consider them, but at a lower priority. - for eid in eids: - label = eid.get('l', '') - if label not in possible_labels: - possible_labels.append(label) - - # Since this is the most specific matching type, we shouldn't do other (presumably higher-level) - # categories: so let's break here. - break - - # Step 1.2. If we didn't have a preferred_name_boost_prefixes, just use the identifiers in their - # Biolink prefix order. - if not possible_labels: - possible_labels = map(lambda eid: eid.get('l', ''), eids) + # For these reasons, I'm going to try to replace this with a simplified algorithm: + # - Order labels in clique identifier order. 
+ # - Filter out blank or suspicious identifiers (e.g. `CHEMBL...`) identifiers. + # - Filter out labels longer than demote_labels_longer_than unless there are no labels under that size. + # + # Step 1. Get all possible labels. + possible_labels = map(lambda eid: eid.get('l', ''), eids) # Step 2. Filter out any suspicious labels. filtered_possible_labels = [l for l in possible_labels if From ab4404d99159ad772cde65de0338f89e4ea189fb Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 15:34:00 -0500 Subject: [PATCH 12/24] Increased demote_labels_longer_than to 40. --- config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.json b/config.json index e618bf0..416eeca 100644 --- a/config.json +++ b/config.json @@ -54,5 +54,5 @@ "PUBCHEM.COMPOUND" ] }, - "demote_labels_longer_than": 15 + "demote_labels_longer_than": 40 } From 7faf8aa0d13a6575fd0a934f81a2187521e49851 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 15:46:30 -0500 Subject: [PATCH 13/24] Reduced demote_labels_longer_than to 20. --- config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.json b/config.json index 416eeca..d6dc205 100644 --- a/config.json +++ b/config.json @@ -54,5 +54,5 @@ "PUBCHEM.COMPOUND" ] }, - "demote_labels_longer_than": 40 + "demote_labels_longer_than": 20 } From 78cfb42d7ca4f5ba3ff14274c913be5a99b9a24e Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 18:15:20 -0500 Subject: [PATCH 14/24] Revert "Unreverted the simpler algorithm." This reverts commit 6e8d1fa395c8670a12d7921831ae70855e29e6e0. --- node_normalizer/normalizer.py | 53 ++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index ff70603..9b68b87 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -708,27 +708,40 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # identifier _except_ where one of the types is in preferred_name_boost_prefixes, in which case # we prefer the prefixes listed there. # - # HOWEVER, there are three reasons not to do that here: - # 1. For NameRes, it makes sense that we're trying to come up with the best label for a clique - # so we can autocomplete to it. But for NodeNorm, users would be expecting the label that - # goes with the identifier we've normalized to, so we should probably go with that label - # unless that would be annoying (e.g. if it's very long). - # 2. It will be impossible to keep this in sync with NameRes for conflated names, since NameRes - # conflation in Babel doesn't pick the preferred label across all possible labels within the - # conflated clique, but instead picks the preferred label for each subclique, and then chooses - # the first preferred label in order of conflation. Which is what we should be doing, but by - # this point we've lost track of each subclique that went into this conflated clique. - # 3. Even in a best case scenario, we'd just be trying to replicate some pretty complicated code - # in Babel -- the ideal solution here would be to use the preferred_name being generated by - # Babel, but that will require some large changes to NodeNorm. + # This should perfectly replicate NameRes labels for non-conflated cliques, but it WON'T perfectly + # match conflated cliques. 
When Babel conflates synonyms, it actually picks the first preferred name + # it can among the cliques being conflated -- which means it applies the preferred label algorithm + # to just the first clique being conflated, then the next clique, and so on. But by this place in + # NodeNorm we've lost track of what the subcliques within the conflated cliques are, so all we can + # do is apply the preferred label algorithm across all possible labels and hope for the best. # - # For these reasons, I'm going to try to replace this with a simplified algorithm: - # - Order labels in clique identifier order. - # - Filter out blank or suspicious identifiers (e.g. `CHEMBL...`) identifiers. - # - Filter out labels longer than demote_labels_longer_than unless there are no labels under that size. - # - # Step 1. Get all possible labels. - possible_labels = map(lambda eid: eid.get('l', ''), eids) + # Note that types[canonical_id] goes from most specific to least specific, so we + # need to reverse it in order to apply preferred_name_boost_prefixes for the most + # specific type. + possible_labels = [] + for typ in types[canonical_id][::-1]: + if typ in config['preferred_name_boost_prefixes']: + # This is the most specific matching type, so we use this and then break. + possible_labels = list(map(lambda identifier: identifier.get('l', ''), + sort_identifiers_with_boosted_prefixes( + eids, + config['preferred_name_boost_prefixes'][typ] + ))) + + # Add in all the other labels -- we'd still like to consider them, but at a lower priority. + for eid in eids: + label = eid.get('l', '') + if label not in possible_labels: + possible_labels.append(label) + + # Since this is the most specific matching type, we shouldn't do other (presumably higher-level) + # categories: so let's break here. + break + + # Step 1.2. If we didn't have a preferred_name_boost_prefixes, just use the identifiers in their + # Biolink prefix order. + if not possible_labels: + possible_labels = map(lambda eid: eid.get('l', ''), eids) # Step 2. Filter out any suspicious labels. filtered_possible_labels = [l for l in possible_labels if From a867dea351e63af8d1d3227262f3e40bb67d970c Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 18:31:19 -0500 Subject: [PATCH 15/24] Support Babel's preferred labels for conflated cliques. 
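
The heart of this patch is a scan for the first subclique that carries any
label at all. Sketched below with the Redis-backed get_eqids_and_types stubbed
out over hypothetical data (the list-argument call shape is the one the next
few patches settle on):

    import asyncio

    SUBCLIQUES = {  # hypothetical: each CURIE -> its own, unconflated clique
        'UniProtKB:P24941': [{'i': 'UniProtKB:P24941', 'l': ''}],
        'NCBIGene:1017': [{'i': 'NCBIGene:1017', 'l': 'CDK2'}],
    }

    async def get_eqids_and_types(app, curies):
        return [SUBCLIQUES[c] for c in curies], None

    async def first_labelled_subclique(app, eids):
        # Walk the conflated clique in order and return the first subclique
        # that has at least one non-empty label.
        for identifier in eids:
            results, _ = await get_eqids_and_types(app, [identifier.get('i', '')])
            identifiers_with_labels = results[0]
            if any(ident.get('l', '') for ident in identifiers_with_labels):
                return identifiers_with_labels
        return []  # nothing had a label; at least we tried

    eids = [{'i': 'UniProtKB:P24941'}, {'i': 'NCBIGene:1017'}]
    print(asyncio.run(first_labelled_subclique(None, eids)))
    # -> [{'i': 'NCBIGene:1017', 'l': 'CDK2'}]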
---
 node_normalizer/normalizer.py | 59 ++++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 14 deletions(-)

diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py
index 9b68b87..8ee34ba 100644
--- a/node_normalizer/normalizer.py
+++ b/node_normalizer/normalizer.py
@@ -643,9 +643,13 @@ async def get_normalized_nodes(
     # output the final result
     normal_nodes = {
-        input_curie: await create_node(canonical_id, dereference_ids, dereference_types, info_contents,
+        input_curie: await create_node(app, canonical_id, dereference_ids, dereference_types, info_contents,
                                        include_descriptions=include_descriptions,
-                                       include_individual_types=include_individual_types)
+                                       include_individual_types=include_individual_types,
+                                       conflations={
+                                           'GeneProtein': conflate_gene_protein,
+                                           'DrugChemical': conflate_chemical_drug,
+                                       })
         for input_curie, canonical_id in zip(curies, canonical_ids)
     }
@@ -680,13 +684,17 @@ async def get_info_content_attribute(app, canonical_nonan) -> dict:
     return new_attrib


-async def create_node(canonical_id, equivalent_ids, types, info_contents, include_descriptions=True,
-                      include_individual_types=False):
+async def create_node(app, canonical_id, equivalent_ids, types, info_contents, include_descriptions=True,
+                      include_individual_types=False, conflations=None):
     """Construct the output format given the compressed redis data"""
     # It's possible that we didn't find a canonical_id
     if canonical_id is None:
         return None

+    # If no conflation information was provided, assume it's empty.
+    if conflations is None:
+        conflations = {}
+
     # If we have 'None' in the equivalent IDs, skip it so we don't confuse things further down the line.
     if None in equivalent_ids[canonical_id]:
         logging.warning(f"Skipping None in canonical ID {canonical_id} among eqids: {equivalent_ids}")
@@ -709,12 +717,32 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ
     # we prefer the prefixes listed there.
     #
     # This should perfectly replicate NameRes labels for non-conflated cliques, but it WON'T perfectly
-    # match conflated cliques. When Babel conflates synonyms, it actually picks the first preferred name
-    # it can among the cliques being conflated -- which means it applies the preferred label algorithm
-    # to just the first clique being conflated, then the next clique, and so on. But by this place in
-    # NodeNorm we've lost track of what the subcliques within the conflated cliques are, so all we can
-    # do is apply the preferred label algorithm across all possible labels and hope for the best.
-    #
+    # match conflated cliques. To do that, we need to run the preferred label algorithm on ONLY the labels
+    # for the FIRST clique of the conflated cliques with labels.
+    any_conflation = any(conflations.values())
+    if not any_conflation:
+        # No conflation. We just use the identifiers we've been given.
+        identifiers_with_labels = equivalent_ids[canonical_id]
+    else:
+        # We have a conflation going on! To replicate Babel's behavior, we need to run the algorithm
+        # on the list of labels corresponding to the first subclique with labels.
+        # So we need to run the algorithm on the first set of identifiers that have any
+        # label whatsoever.
+ identifiers_with_labels = [] + for identifier in equivalent_ids[canonical_id]: + curie = identifier.get('i', '') + identifiers_with_labels, types = await get_eqids_and_types(app, curie) + labels = map(lambda ident: ident.get('l', ''), identifiers_with_labels[curie]) + if any(map(lambda l: l != '', labels)): + break + + # We might get here without any labels, which is fine. At least we tried. + + # At this point: + # - eids will be the full list of all identifiers and labels in this clique. + # - identifiers_with_labels is the list of identifiers and labels for the first subclique that has at least + # one label. + # Note that types[canonical_id] goes from most specific to least specific, so we # need to reverse it in order to apply preferred_name_boost_prefixes for the most # specific type. @@ -722,14 +750,14 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ for typ in types[canonical_id][::-1]: if typ in config['preferred_name_boost_prefixes']: # This is the most specific matching type, so we use this and then break. - possible_labels = list(map(lambda identifier: identifier.get('l', ''), + possible_labels = list(map(lambda ident: ident.get('l', ''), sort_identifiers_with_boosted_prefixes( - eids, + identifiers_with_labels, config['preferred_name_boost_prefixes'][typ] ))) # Add in all the other labels -- we'd still like to consider them, but at a lower priority. - for eid in eids: + for eid in identifiers_with_labels: label = eid.get('l', '') if label not in possible_labels: possible_labels.append(label) @@ -741,7 +769,7 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # Step 1.2. If we didn't have a preferred_name_boost_prefixes, just use the identifiers in their # Biolink prefix order. if not possible_labels: - possible_labels = map(lambda eid: eid.get('l', ''), eids) + possible_labels = map(lambda eid: eid.get('l', ''), identifiers_with_labels) # Step 2. Filter out any suspicious labels. filtered_possible_labels = [l for l in possible_labels if @@ -762,6 +790,9 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ # Sometimes, nothing has a label :( node = {"id": {"identifier": eids[0]['i']}} + # Now that we've determined a label for this clique, we should never use identifiers_with_labels, possible_labels, + # or filtered_possible_labels after this point. + # if descriptions are enabled look for the first available description and use that if include_descriptions: descriptions = list( From b0021f17938a7a922630b896ef1cfc517589205f Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 18:40:27 -0500 Subject: [PATCH 16/24] Improved code, maybe fixed bug. --- node_normalizer/normalizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 8ee34ba..7216525 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -722,16 +722,16 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i any_conflation = any(conflations.values()) if not any_conflation: # No conflation. We just use the identifiers we've been given. - identifiers_with_labels = equivalent_ids[canonical_id] + identifiers_with_labels = eids else: # We have a conflation going on! 
To replicate Babel's behavior, we need to run the algorithm
         # on the list of labels corresponding to the first subclique with labels.
         # So we need to run the algorithm on the first set of identifiers that have any
         # label whatsoever.
         identifiers_with_labels = []
-        for identifier in equivalent_ids[canonical_id]:
+        for identifier in eids:
             curie = identifier.get('i', '')
-            identifiers_with_labels, types = await get_eqids_and_types(app, curie)
+            identifiers_with_labels, types = await get_eqids_and_types(app, [curie])
             labels = map(lambda ident: ident.get('l', ''), identifiers_with_labels[curie])
             if any(map(lambda l: l != '', labels)):
                 break

From 3e1319670137d85e8972b8e31b465e4b2b69ccd3 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Thu, 7 Nov 2024 18:47:03 -0500
Subject: [PATCH 17/24] More bugfixes.

---
 node_normalizer/normalizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py
index 7216525..989466f 100644
--- a/node_normalizer/normalizer.py
+++ b/node_normalizer/normalizer.py
@@ -732,7 +732,7 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i
         for identifier in eids:
             curie = identifier.get('i', '')
             identifiers_with_labels, types = await get_eqids_and_types(app, [curie])
-            labels = map(lambda ident: ident.get('l', ''), identifiers_with_labels[curie])
+            labels = map(lambda ident: ident.get('l', ''), identifiers_with_labels[0])
             if any(map(lambda l: l != '', labels)):
                 break

From 3312137314ad0ade73bfe02d049138623edf0ff2 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Thu, 7 Nov 2024 18:51:34 -0500
Subject: [PATCH 18/24] Oops.

---
 node_normalizer/normalizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py
index 989466f..7d47215 100644
--- a/node_normalizer/normalizer.py
+++ b/node_normalizer/normalizer.py
@@ -731,7 +731,7 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i
         for identifier in eids:
             curie = identifier.get('i', '')
-            identifiers_with_labels, types = await get_eqids_and_types(app, [curie])
+            identifiers_with_labels, _ = await get_eqids_and_types(app, [curie])
             labels = map(lambda ident: ident.get('l', ''), identifiers_with_labels[0])
             if any(map(lambda l: l != '', labels)):
                 break

From 20d1fd266e3d88b682d7e6827b58f8fcc8e1a7e9 Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Thu, 7 Nov 2024 18:55:47 -0500
Subject: [PATCH 19/24] More bugfixes.

---
 node_normalizer/normalizer.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py
index 7d47215..2e03c20 100644
--- a/node_normalizer/normalizer.py
+++ b/node_normalizer/normalizer.py
@@ -731,8 +731,9 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i
         identifiers_with_labels = []
         for identifier in eids:
             curie = identifier.get('i', '')
-            identifiers_with_labels, _ = await get_eqids_and_types(app, [curie])
-            labels = map(lambda ident: ident.get('l', ''), identifiers_with_labels[0])
+            results, _ = await get_eqids_and_types(app, [curie])
+            identifiers_with_labels = results[0]
+            labels = map(lambda ident: ident.get('l', ''), identifiers_with_labels)
             if any(map(lambda l: l != '', labels)):
                 break

From 4cf160d1f08f937f338ac01db8c0ab4de92e3ddb Mon Sep 17 00:00:00 2001
From: Gaurav Vaidya
Date: Thu, 7 Nov 2024 19:03:32 -0500
Subject: [PATCH 20/24] Reduced demote_labels_longer_than to 15.

This will bring it in line with Babel. --- config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.json b/config.json index d6dc205..e618bf0 100644 --- a/config.json +++ b/config.json @@ -54,5 +54,5 @@ "PUBCHEM.COMPOUND" ] }, - "demote_labels_longer_than": 20 + "demote_labels_longer_than": 15 } From 0ea99197b7ccaea4690c238bce1324b14d0177b1 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Thu, 7 Nov 2024 19:15:02 -0500 Subject: [PATCH 21/24] Removed on:push trigger after testing. --- .github/workflows/release.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e9d8f8f..800d57b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,7 +1,6 @@ name: 'Publish to GitHub Packages' on: - push: release: types: [published] From fa32307b4a633ecbb2ebb109ba8009028be916e1 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 8 Nov 2024 00:20:29 -0500 Subject: [PATCH 22/24] Slightly improved algorithm to avoid unnecessary queries to Redis. --- node_normalizer/normalizer.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py index 2e03c20..fa1ed61 100644 --- a/node_normalizer/normalizer.py +++ b/node_normalizer/normalizer.py @@ -729,14 +729,22 @@ async def create_node(app, canonical_id, equivalent_ids, types, info_contents, i # So we need to run the algorithm on the first set of identifiers that have any # label whatsoever. identifiers_with_labels = [] + curies_already_checked = set() for identifier in eids: curie = identifier.get('i', '') + if curie in curies_already_checked: + continue results, _ = await get_eqids_and_types(app, [curie]) + identifiers_with_labels = results[0] labels = map(lambda ident: ident.get('l', ''), identifiers_with_labels) if any(map(lambda l: l != '', labels)): break + # Since we didn't get any matches here, add it to the list of CURIEs already checked so + # we don't make redundant queries to the database. + curies_already_checked.update(set(map(lambda x: x.get('i', ''), identifiers_with_labels))) + # We might get here without any labels, which is fine. At least we tried. # At this point: From 4800f1d6b1bed9cfa5de4258472c178aa9b8870b Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 8 Nov 2024 00:21:03 -0500 Subject: [PATCH 23/24] Added on:push for testing. --- .github/workflows/release.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 800d57b..e9d8f8f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,6 +1,7 @@ name: 'Publish to GitHub Packages' on: + push: release: types: [published] From 6fb76758596fc6c7d55bc55a0d4258ed04c52166 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Fri, 8 Nov 2024 00:30:12 -0500 Subject: [PATCH 24/24] Removed on:push trigger. --- .github/workflows/release.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e9d8f8f..800d57b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,7 +1,6 @@ name: 'Publish to GitHub Packages' on: - push: release: types: [published]
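
Taken together, the lookup loop the series converges on (the PATCH 19
unpacking plus the PATCH 22 de-duplication) behaves like the following
runnable sketch, with get_eqids_and_types again stubbed out over hypothetical
subcliques; members of an already-fetched, label-less subclique no longer
trigger another Redis round-trip:

    import asyncio

    SUBCLIQUES = {  # hypothetical: each CURIE -> its own, unconflated clique
        'A:1': [{'i': 'A:1', 'l': ''}, {'i': 'A:2', 'l': ''}],
        'A:2': [{'i': 'A:1', 'l': ''}, {'i': 'A:2', 'l': ''}],
        'B:1': [{'i': 'B:1', 'l': 'a label'}],
    }
    QUERIES = 0

    async def get_eqids_and_types(app, curies):
        global QUERIES
        QUERIES += 1
        return [SUBCLIQUES[c] for c in curies], None

    async def scan(eids):
        curies_already_checked = set()
        identifiers_with_labels = []
        for identifier in eids:
            curie = identifier.get('i', '')
            if curie in curies_already_checked:
                continue  # this subclique was already fetched and had no labels
            results, _ = await get_eqids_and_types(None, [curie])
            identifiers_with_labels = results[0]
            if any(ident.get('l', '') for ident in identifiers_with_labels):
                break  # found a labelled subclique
            # No labels here: remember every member so we skip them later.
            curies_already_checked.update(
                ident.get('i', '') for ident in identifiers_with_labels)
        return identifiers_with_labels

    eids = [{'i': 'A:1'}, {'i': 'A:2'}, {'i': 'B:1'}]
    print(asyncio.run(scan(eids)), QUERIES)
    # -> [{'i': 'B:1', 'l': 'a label'}] 2  (A:2 is never re-queried)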