Refactor converter generation code (#974)

This PR gets rid of code that focuses on lists of `curies.Record` objects and instead works directly with `curies.Converter` objects. Along the way, this also identified data integrity issues in MIRIAM, N2T, and Prefix Commons with respect to the TAIR resources (`tair.gene` and `tair.protein`), which all used non-specific, overlapping URLs. Therefore, these needed to be cleaned out before being imported. Why do this? If we work directly with converters, we can make use of the CURIE prefix reconciliation tooling to more cleanly refactor the Bioregistry-to-Converter pipeline (which is causing issues when adding prefix casing variants in the related PR #969).
biopragmatics · Nov 2, 2023 · 6eeeaf2 · 6eeeaf2
1 parent 1f12bae
commit 6eeeaf2
Show file tree

Hide file tree

Showing 17 changed files with 103 additions and 136 deletions.
diff --git a/exports/contexts/bioregistry.epm.json b/exports/contexts/bioregistry.epm.json
@@ -26168,14 +26168,12 @@
         "uri_prefix": "http://arabidopsis.org/servlets/TairObject?accession=AASequence:",
         "uri_prefix_synonyms": [
             "TAIR.PROTEIN:",
-            "http://arabidopsis.org/servlets/TairObject?accession=",
             "http://bio2rdf.org/tair.protein:",
             "http://bioregistry.io/tair.protein:",
             "http://identifiers.org/tair.protein/",
             "http://identifiers.org/tair.protein/AASequence:",
             "http://identifiers.org/tair.protein:",
             "http://n2t.net/tair.protein:",
-            "https://arabidopsis.org/servlets/TairObject?accession=",
             "https://arabidopsis.org/servlets/TairObject?accession=AASequence:",
             "https://bio2rdf.org/tair.protein:",
             "https://bioregistry.io/tair.protein:",

diff --git a/exports/contexts/bioregistry.rpm.json b/exports/contexts/bioregistry.rpm.json
@@ -3167,7 +3167,6 @@
     "http://aps.unmc.edu/AP/database/query_output.php?ID=": "apd",
     "http://ar5iv.org/abs/": "arxiv",
     "http://arabidopsis.info/StockInfo?NASC_id=": "nasc",
-    "http://arabidopsis.org/servlets/TairObject?accession=": "tair.protein",
     "http://arabidopsis.org/servlets/TairObject?accession=AASequence:": "tair.protein",
     "http://arabidopsis.org/servlets/TairObject?accession=Gene:": "tair.gene",
     "http://arabidopsis.org/servlets/TairObject?type=locus&name=": "tair.locus",
@@ -10733,7 +10732,6 @@
     "https://aps.unmc.edu/AP/database/query_output.php?ID=": "apd",
     "https://ar5iv.org/abs/": "arxiv",
     "https://arabidopsis.info/StockInfo?NASC_id=": "nasc",
-    "https://arabidopsis.org/servlets/TairObject?accession=": "tair.protein",
     "https://arabidopsis.org/servlets/TairObject?accession=AASequence:": "tair.protein",
     "https://arabidopsis.org/servlets/TairObject?accession=Gene:": "tair.gene",
     "https://arabidopsis.org/servlets/TairObject?type=locus&name=": "tair.locus",

diff --git a/exports/contexts/obo.epm.json b/exports/contexts/obo.epm.json
@@ -22771,14 +22771,12 @@
         "prefix": "tair.protein",
         "uri_prefix": "http://arabidopsis.org/servlets/TairObject?accession=AASequence:",
         "uri_prefix_synonyms": [
-            "http://arabidopsis.org/servlets/TairObject?accession=",
             "http://bio2rdf.org/tair.protein:",
             "http://bioregistry.io/tair.protein:",
             "http://identifiers.org/tair.protein/",
             "http://identifiers.org/tair.protein/AASequence:",
             "http://identifiers.org/tair.protein:",
             "http://n2t.net/tair.protein:",
-            "https://arabidopsis.org/servlets/TairObject?accession=",
             "https://arabidopsis.org/servlets/TairObject?accession=AASequence:",
             "https://bio2rdf.org/tair.protein:",
             "https://bioregistry.io/tair.protein:",

diff --git a/src/bioregistry/__init__.py b/src/bioregistry/__init__.py
@@ -127,7 +127,6 @@
     write_registry,
 )
 from .uri_format import (  # noqa:F401
-    get_extended_prefix_map,
     get_pattern_map,
     get_prefix_map,
     get_uri_format,

diff --git a/src/bioregistry/data/bioregistry.json b/src/bioregistry/data/bioregistry.json
@@ -95986,8 +95986,7 @@
       "namespaceEmbeddedInLui": false,
       "pattern": "^Gene:\\d{7}$",
       "prefix": "tair.gene",
-      "sampleId": "Gene:2200934",
-      "uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
+      "sampleId": "Gene:2200934"
     },
     "n2t": {
       "description": "The Arabidopsis Information Resource (TAIR) maintains a database of genetic and molecular biology data for the model higher plant Arabidopsis thaliana. This is the reference gene model for a given locus.",
@@ -95996,8 +95995,7 @@
       "name": "The Arabidopsis Information Resource (TAIR) Gene",
       "namespaceEmbeddedInLui": false,
       "pattern": "^Gene:\\d{7}$",
-      "prefix": "tair.gene",
-      "uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
+      "prefix": "tair.gene"
     },
     "part_of": "tair",
     "pattern": "^\\d{7}$",
@@ -96012,8 +96010,7 @@
       "prefix": "tair.gene",
       "pubmed_ids": [
         "12444417"
-      ],
-      "uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
+      ]
     },
     "publications": [
       {
@@ -96133,8 +96130,7 @@
       "namespaceEmbeddedInLui": false,
       "pattern": "^AASequence:\\d{10}$",
       "prefix": "tair.protein",
-      "sampleId": "AASequence:1009107926",
-      "uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
+      "sampleId": "AASequence:1009107926"
     },
     "n2t": {
       "description": "The Arabidopsis Information Resource (TAIR) maintains a database of genetic and molecular biology data for the model higher plant Arabidopsis thaliana. This provides protein information for a given gene model and provides links to other sources such as UniProtKB and GenPept",
@@ -96143,8 +96139,7 @@
       "name": "The Arabidopsis Information Resource (TAIR) Protein",
       "namespaceEmbeddedInLui": false,
       "pattern": "^AASequence:\\d{10}$",
-      "prefix": "tair.protein",
-      "uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
+      "prefix": "tair.protein"
     },
     "pattern": "^\\d{10}$",
     "prefixcommons": {
@@ -96158,8 +96153,7 @@
       "prefix": "tair.protein",
       "pubmed_ids": [
         "12444417"
-      ],
-      "uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
+      ]
     },
     "publications": [
       {

diff --git a/src/bioregistry/data/external/miriam/processed.json b/src/bioregistry/data/external/miriam/processed.json
@@ -9440,8 +9440,7 @@
     "namespaceEmbeddedInLui": false,
     "pattern": "^Gene:\\d{7}$",
     "prefix": "tair.gene",
-    "sampleId": "Gene:2200934",
-    "uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
+    "sampleId": "Gene:2200934"
   },
   "tair.locus": {
     "deprecated": false,
@@ -9476,8 +9475,7 @@
     "namespaceEmbeddedInLui": false,
     "pattern": "^AASequence:\\d{10}$",
     "prefix": "tair.protein",
-    "sampleId": "AASequence:1009107926",
-    "uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
+    "sampleId": "AASequence:1009107926"
   },
   "tarbase": {
     "deprecated": false,

diff --git a/src/bioregistry/data/external/n2t/processed.json b/src/bioregistry/data/external/n2t/processed.json
@@ -5460,8 +5460,7 @@
     "homepage": "http://arabidopsis.org/index.jsp",
     "name": "The Arabidopsis Information Resource (TAIR) Gene",
     "namespaceEmbeddedInLui": false,
-    "pattern": "^Gene:\\d{7}$",
-    "uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
+    "pattern": "^Gene:\\d{7}$"
   },
   "tair.locus": {
     "description": "The Arabidopsis Information Resource (TAIR) maintains a database of genetic and molecular biology data for the model higher plant Arabidopsis thaliana. The name of a Locus is unique and used by TAIR, TIGR, and MIPS.",
@@ -5478,8 +5477,7 @@
     "homepage": "http://arabidopsis.org/index.jsp",
     "name": "The Arabidopsis Information Resource (TAIR) Protein",
     "namespaceEmbeddedInLui": false,
-    "pattern": "^AASequence:\\d{10}$",
-    "uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
+    "pattern": "^AASequence:\\d{10}$"
   },
   "tarbase": {
     "description": "TarBase stores microRNA (miRNA) information for miRNA&#x2013;gene interactions, as well as miRNA- and gene-related facts to information specific to the interaction and the experimental validation methodologies used.",

diff --git a/src/bioregistry/data/external/prefixcommons/processed.json b/src/bioregistry/data/external/prefixcommons/processed.json
@@ -10795,8 +10795,7 @@
     "prefix": "tair.gene",
     "pubmed_ids": [
       "12444417"
-    ],
-    "uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
+    ]
   },
   "tair.locus": {
     "description": "The locus name",
@@ -10823,8 +10822,7 @@
     "prefix": "tair.protein",
     "pubmed_ids": [
       "12444417"
-    ],
-    "uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
+    ]
   },
   "tao": {
     "bioportal": "1110",

diff --git a/src/bioregistry/external/miriam.py b/src/bioregistry/external/miriam.py
@@ -27,6 +27,9 @@
     "4503",
     "6vts",
 }
+SKIP_URI_FORMATS = {
+    "http://arabidopsis.org/servlets/TairObject?accession=$1",
+}
 
 
 def get_miriam(force_download: bool = False, force_process: bool = False):
@@ -91,7 +94,8 @@ def _process(record):
     else:
         primary, *rest = resources
     rv["homepage"] = primary["homepage"]
-    rv[URI_FORMAT_KEY] = primary[URI_FORMAT_KEY]
+    if URI_FORMAT_KEY in primary:
+        rv[URI_FORMAT_KEY] = primary[URI_FORMAT_KEY]
 
     extras = []
     for provider in rest:
@@ -113,14 +117,17 @@ def _process(record):
 
 
 def _preprocess_resource(resource):
-    return {
+    rv = {
         "official": resource["official"],
         "homepage": resource["resourceHomeUrl"],
         "code": resource["providerCode"],
-        URI_FORMAT_KEY: resource["urlPattern"].replace("{$id}", "$1"),
         "name": resource["name"],
         "description": resource["description"],
     }
+    uri_format = resource["urlPattern"].replace("{$id}", "$1")
+    if uri_format not in SKIP_URI_FORMATS:
+        rv[URI_FORMAT_KEY] = uri_format
+    return rv
 
 
 @click.command()

diff --git a/src/bioregistry/external/n2t.py b/src/bioregistry/external/n2t.py
@@ -26,6 +26,9 @@
     "merops": "issue with miriam having duplicate prefixes for this resource",  # FIXME
     "hgnc.family": "issue with miriam having duplicate prefixes for this resource",  # FIXME
 }
+SKIP_URI_FORMATS = {
+    "http://arabidopsis.org/servlets/TairObject?accession=$1",
+}
 
 
 def get_n2t(force_download: bool = False):
@@ -53,7 +56,7 @@ def get_n2t(force_download: bool = False):
 def _process(record):
     rv = {
         "name": record.get("name"),
-        URI_FORMAT_KEY: record["redirect"].replace("$id", "$1") if "redirect" in record else None,
+        URI_FORMAT_KEY: _get_uri_format(record),
         "description": record.get("description"),
         "homepage": record.get("more"),
         "pattern": record.get("pattern"),
@@ -63,6 +66,16 @@ def _process(record):
     return {k: v for k, v in rv.items() if v is not None}
 
 
+def _get_uri_format(record):
+    raw_redirect = record.get("redirect")
+    if raw_redirect is None:
+        return None
+    uri_format = raw_redirect.replace("$id", "$1")
+    if uri_format in SKIP_URI_FORMATS:
+        return None
+    return uri_format
+
+
 @click.command()
 def main():
     """Reload the N2T data."""

diff --git a/src/bioregistry/external/prefixcommons.py b/src/bioregistry/external/prefixcommons.py
@@ -85,6 +85,11 @@
 }
 #: These contain synonyms with mismatches
 DISCARD_SYNONYMS = {"biogrid", "cath", "zfa"}
+SKIP_URI_FORMATS = {
+    "http://purl.obolibrary.org/obo/$1",
+    "http://www.ebi.ac.uk/ontology-lookup/?termId=$1",
+    "http://arabidopsis.org/servlets/TairObject?accession=$1",
+}
 
 
 def get_prefixcommons(force_download: bool = False, force_process: bool = False):
@@ -147,10 +152,7 @@ def _process_row(line: str):
     uri_format = rv.pop("uri_format", None)
     if uri_format:
         uri_format = uri_format.replace("$id", "$1").replace("[?id]", "$1").replace("$d", "$1")
-        if uri_format not in {
-            "http://purl.obolibrary.org/obo/$1",
-            "http://www.ebi.ac.uk/ontology-lookup/?termId=$1",
-        }:
+        if uri_format not in SKIP_URI_FORMATS:
             rv["uri_format"] = uri_format
 
     uri_rdf_formats = _get_uri_formats(rv, "rdf_uri_prefix")
@@ -191,9 +193,12 @@ def _get_uri_formats(rv, key) -> List[str]:
             continue
         if "$1" in uri_format or "[?id]" in uri_format:  # FIXME check if these come at the end
             continue
-        rv.append(f"{uri_format}$1")
+        uri_format = f"{uri_format}$1"
+        if uri_format in SKIP_URI_FORMATS:
+            continue
+        rv.append(uri_format)
     return rv
 
 
 if __name__ == "__main__":
-    print(len(get_prefixcommons(force_process=True)))  # noqa:T201
+    print(len(get_prefixcommons(force_process=True, force_download=True)))  # noqa:T201
diff --git a/src/bioregistry/record_accumulator.py b/src/bioregistry/record_accumulator.py
@@ -18,9 +18,14 @@
 )
 
 import curies
+from curies import Converter
 
 from bioregistry import Resource
 
+__all__ = [
+    "get_converter",
+]
+
 logger = logging.getLogger(__name__)
 prefix_blacklist = {"bgee.gene"}
 uri_prefix_blacklist = {
@@ -33,6 +38,7 @@
     "http://www.ncbi.nlm.nih.gov/nuccore/",
     "https://www.ebi.ac.uk/ena/data/view/",
     "http://www.ebi.ac.uk/ena/data/view/",
+    "http://arabidopsis.org/servlets/TairObject?accession=",
 }
 prefix_resource_blacklist = {
     ("orphanet", "http://www.orpha.net/ORDO/Orphanet_"),  # biocontext is wrong
@@ -103,6 +109,28 @@ def _iterate_prefix_prefix(resource: Resource, *extras: str):
 #  (e.g., uniprot.isoform and uniprot)
 
 
+def get_converter(
+    resources: List[Resource],
+    prefix_priority: Optional[Sequence[str]] = None,
+    uri_prefix_priority: Optional[Sequence[str]] = None,
+    include_prefixes: bool = False,
+    strict: bool = False,
+    blacklist: Optional[Collection[str]] = None,
+    remapping: Optional[Mapping[str, str]] = None,
+) -> Converter:
+    """Generate a converter from resources."""
+    records = get_records(
+        resources,
+        prefix_priority=prefix_priority,
+        uri_prefix_priority=uri_prefix_priority,
+        include_prefixes=include_prefixes,
+        strict=strict,
+        blacklist=blacklist,
+        remapping=remapping,
+    )
+    return curies.Converter(records)
+
+
 def get_records(  # noqa: C901
     resources: List[Resource],
     prefix_priority: Optional[Sequence[str]] = None,