Skip to content

Commit

Permalink
Refactor converter generation code (#974)
Browse files Browse the repository at this point in the history
This PR gets rid of code that focuses on lists of `curies.Record`
objects and instead works directly with `curies.Converter` objects.

Along the way, this also identified issues with the data integrity on
MIRIAM, N2T, and Prefix Commons with respect to the TAIR resources
(`tair.gene` and `tair.protein`) which all used non-specific,
overlapping URLs. Therefore, these needed to be cleaned out before
being imported.

Why do this? If we work directly with converters, we can make use of the
CURIE prefix reconciliation tooling to more cleanly refactor the
Bioregistry-to-Converter pipeline (which is causing issues when adding
prefix casing variants in the related PR #969).
  • Loading branch information
cthoyt authored Nov 2, 2023
1 parent 1f12bae commit 6eeeaf2
Show file tree
Hide file tree
Showing 17 changed files with 103 additions and 136 deletions.
2 changes: 0 additions & 2 deletions exports/contexts/bioregistry.epm.json
Original file line number Diff line number Diff line change
Expand Up @@ -26168,14 +26168,12 @@
"uri_prefix": "http://arabidopsis.org/servlets/TairObject?accession=AASequence:",
"uri_prefix_synonyms": [
"TAIR.PROTEIN:",
"http://arabidopsis.org/servlets/TairObject?accession=",
"http://bio2rdf.org/tair.protein:",
"http://bioregistry.io/tair.protein:",
"http://identifiers.org/tair.protein/",
"http://identifiers.org/tair.protein/AASequence:",
"http://identifiers.org/tair.protein:",
"http://n2t.net/tair.protein:",
"https://arabidopsis.org/servlets/TairObject?accession=",
"https://arabidopsis.org/servlets/TairObject?accession=AASequence:",
"https://bio2rdf.org/tair.protein:",
"https://bioregistry.io/tair.protein:",
Expand Down
2 changes: 0 additions & 2 deletions exports/contexts/bioregistry.rpm.json
Original file line number Diff line number Diff line change
Expand Up @@ -3167,7 +3167,6 @@
"http://aps.unmc.edu/AP/database/query_output.php?ID=": "apd",
"http://ar5iv.org/abs/": "arxiv",
"http://arabidopsis.info/StockInfo?NASC_id=": "nasc",
"http://arabidopsis.org/servlets/TairObject?accession=": "tair.protein",
"http://arabidopsis.org/servlets/TairObject?accession=AASequence:": "tair.protein",
"http://arabidopsis.org/servlets/TairObject?accession=Gene:": "tair.gene",
"http://arabidopsis.org/servlets/TairObject?type=locus&name=": "tair.locus",
Expand Down Expand Up @@ -10733,7 +10732,6 @@
"https://aps.unmc.edu/AP/database/query_output.php?ID=": "apd",
"https://ar5iv.org/abs/": "arxiv",
"https://arabidopsis.info/StockInfo?NASC_id=": "nasc",
"https://arabidopsis.org/servlets/TairObject?accession=": "tair.protein",
"https://arabidopsis.org/servlets/TairObject?accession=AASequence:": "tair.protein",
"https://arabidopsis.org/servlets/TairObject?accession=Gene:": "tair.gene",
"https://arabidopsis.org/servlets/TairObject?type=locus&name=": "tair.locus",
Expand Down
2 changes: 0 additions & 2 deletions exports/contexts/obo.epm.json
Original file line number Diff line number Diff line change
Expand Up @@ -22771,14 +22771,12 @@
"prefix": "tair.protein",
"uri_prefix": "http://arabidopsis.org/servlets/TairObject?accession=AASequence:",
"uri_prefix_synonyms": [
"http://arabidopsis.org/servlets/TairObject?accession=",
"http://bio2rdf.org/tair.protein:",
"http://bioregistry.io/tair.protein:",
"http://identifiers.org/tair.protein/",
"http://identifiers.org/tair.protein/AASequence:",
"http://identifiers.org/tair.protein:",
"http://n2t.net/tair.protein:",
"https://arabidopsis.org/servlets/TairObject?accession=",
"https://arabidopsis.org/servlets/TairObject?accession=AASequence:",
"https://bio2rdf.org/tair.protein:",
"https://bioregistry.io/tair.protein:",
Expand Down
1 change: 0 additions & 1 deletion src/bioregistry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@
write_registry,
)
from .uri_format import ( # noqa:F401
get_extended_prefix_map,
get_pattern_map,
get_prefix_map,
get_uri_format,
Expand Down
18 changes: 6 additions & 12 deletions src/bioregistry/data/bioregistry.json
Original file line number Diff line number Diff line change
Expand Up @@ -95986,8 +95986,7 @@
"namespaceEmbeddedInLui": false,
"pattern": "^Gene:\\d{7}$",
"prefix": "tair.gene",
"sampleId": "Gene:2200934",
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
"sampleId": "Gene:2200934"
},
"n2t": {
"description": "The Arabidopsis Information Resource (TAIR) maintains a database of genetic and molecular biology data for the model higher plant Arabidopsis thaliana. This is the reference gene model for a given locus.",
Expand All @@ -95996,8 +95995,7 @@
"name": "The Arabidopsis Information Resource (TAIR) Gene",
"namespaceEmbeddedInLui": false,
"pattern": "^Gene:\\d{7}$",
"prefix": "tair.gene",
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
"prefix": "tair.gene"
},
"part_of": "tair",
"pattern": "^\\d{7}$",
Expand All @@ -96012,8 +96010,7 @@
"prefix": "tair.gene",
"pubmed_ids": [
"12444417"
],
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
]
},
"publications": [
{
Expand Down Expand Up @@ -96133,8 +96130,7 @@
"namespaceEmbeddedInLui": false,
"pattern": "^AASequence:\\d{10}$",
"prefix": "tair.protein",
"sampleId": "AASequence:1009107926",
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
"sampleId": "AASequence:1009107926"
},
"n2t": {
"description": "The Arabidopsis Information Resource (TAIR) maintains a database of genetic and molecular biology data for the model higher plant Arabidopsis thaliana. This provides protein information for a given gene model and provides links to other sources such as UniProtKB and GenPept",
Expand All @@ -96143,8 +96139,7 @@
"name": "The Arabidopsis Information Resource (TAIR) Protein",
"namespaceEmbeddedInLui": false,
"pattern": "^AASequence:\\d{10}$",
"prefix": "tair.protein",
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
"prefix": "tair.protein"
},
"pattern": "^\\d{10}$",
"prefixcommons": {
Expand All @@ -96158,8 +96153,7 @@
"prefix": "tair.protein",
"pubmed_ids": [
"12444417"
],
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
]
},
"publications": [
{
Expand Down
6 changes: 2 additions & 4 deletions src/bioregistry/data/external/miriam/processed.json
Original file line number Diff line number Diff line change
Expand Up @@ -9440,8 +9440,7 @@
"namespaceEmbeddedInLui": false,
"pattern": "^Gene:\\d{7}$",
"prefix": "tair.gene",
"sampleId": "Gene:2200934",
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
"sampleId": "Gene:2200934"
},
"tair.locus": {
"deprecated": false,
Expand Down Expand Up @@ -9476,8 +9475,7 @@
"namespaceEmbeddedInLui": false,
"pattern": "^AASequence:\\d{10}$",
"prefix": "tair.protein",
"sampleId": "AASequence:1009107926",
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
"sampleId": "AASequence:1009107926"
},
"tarbase": {
"deprecated": false,
Expand Down
6 changes: 2 additions & 4 deletions src/bioregistry/data/external/n2t/processed.json
Original file line number Diff line number Diff line change
Expand Up @@ -5460,8 +5460,7 @@
"homepage": "http://arabidopsis.org/index.jsp",
"name": "The Arabidopsis Information Resource (TAIR) Gene",
"namespaceEmbeddedInLui": false,
"pattern": "^Gene:\\d{7}$",
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
"pattern": "^Gene:\\d{7}$"
},
"tair.locus": {
"description": "The Arabidopsis Information Resource (TAIR) maintains a database of genetic and molecular biology data for the model higher plant Arabidopsis thaliana. The name of a Locus is unique and used by TAIR, TIGR, and MIPS.",
Expand All @@ -5478,8 +5477,7 @@
"homepage": "http://arabidopsis.org/index.jsp",
"name": "The Arabidopsis Information Resource (TAIR) Protein",
"namespaceEmbeddedInLui": false,
"pattern": "^AASequence:\\d{10}$",
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
"pattern": "^AASequence:\\d{10}$"
},
"tarbase": {
"description": "TarBase stores microRNA (miRNA) information for miRNA–gene interactions, as well as miRNA- and gene-related facts to information specific to the interaction and the experimental validation methodologies used.",
Expand Down
6 changes: 2 additions & 4 deletions src/bioregistry/data/external/prefixcommons/processed.json
Original file line number Diff line number Diff line change
Expand Up @@ -10795,8 +10795,7 @@
"prefix": "tair.gene",
"pubmed_ids": [
"12444417"
],
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
]
},
"tair.locus": {
"description": "The locus name",
Expand All @@ -10823,8 +10822,7 @@
"prefix": "tair.protein",
"pubmed_ids": [
"12444417"
],
"uri_format": "http://arabidopsis.org/servlets/TairObject?accession=$1"
]
},
"tao": {
"bioportal": "1110",
Expand Down
13 changes: 10 additions & 3 deletions src/bioregistry/external/miriam.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
"4503",
"6vts",
}
SKIP_URI_FORMATS = {
"http://arabidopsis.org/servlets/TairObject?accession=$1",
}


def get_miriam(force_download: bool = False, force_process: bool = False):
Expand Down Expand Up @@ -91,7 +94,8 @@ def _process(record):
else:
primary, *rest = resources
rv["homepage"] = primary["homepage"]
rv[URI_FORMAT_KEY] = primary[URI_FORMAT_KEY]
if URI_FORMAT_KEY in primary:
rv[URI_FORMAT_KEY] = primary[URI_FORMAT_KEY]

extras = []
for provider in rest:
Expand All @@ -113,14 +117,17 @@ def _process(record):


def _preprocess_resource(resource):
return {
rv = {
"official": resource["official"],
"homepage": resource["resourceHomeUrl"],
"code": resource["providerCode"],
URI_FORMAT_KEY: resource["urlPattern"].replace("{$id}", "$1"),
"name": resource["name"],
"description": resource["description"],
}
uri_format = resource["urlPattern"].replace("{$id}", "$1")
if uri_format not in SKIP_URI_FORMATS:
rv[URI_FORMAT_KEY] = uri_format
return rv


@click.command()
Expand Down
15 changes: 14 additions & 1 deletion src/bioregistry/external/n2t.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@
"merops": "issue with miriam having duplicate prefixes for this resource", # FIXME
"hgnc.family": "issue with miriam having duplicate prefixes for this resource", # FIXME
}
SKIP_URI_FORMATS = {
"http://arabidopsis.org/servlets/TairObject?accession=$1",
}


def get_n2t(force_download: bool = False):
Expand Down Expand Up @@ -53,7 +56,7 @@ def get_n2t(force_download: bool = False):
def _process(record):
rv = {
"name": record.get("name"),
URI_FORMAT_KEY: record["redirect"].replace("$id", "$1") if "redirect" in record else None,
URI_FORMAT_KEY: _get_uri_format(record),
"description": record.get("description"),
"homepage": record.get("more"),
"pattern": record.get("pattern"),
Expand All @@ -63,6 +66,16 @@ def _process(record):
return {k: v for k, v in rv.items() if v is not None}


def _get_uri_format(record):
raw_redirect = record.get("redirect")
if raw_redirect is None:
return None
uri_format = raw_redirect.replace("$id", "$1")
if uri_format in SKIP_URI_FORMATS:
return None
return uri_format


@click.command()
def main():
"""Reload the N2T data."""
Expand Down
17 changes: 11 additions & 6 deletions src/bioregistry/external/prefixcommons.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@
}
#: These contain synonyms with mismatches
DISCARD_SYNONYMS = {"biogrid", "cath", "zfa"}
SKIP_URI_FORMATS = {
"http://purl.obolibrary.org/obo/$1",
"http://www.ebi.ac.uk/ontology-lookup/?termId=$1",
"http://arabidopsis.org/servlets/TairObject?accession=$1",
}


def get_prefixcommons(force_download: bool = False, force_process: bool = False):
Expand Down Expand Up @@ -147,10 +152,7 @@ def _process_row(line: str):
uri_format = rv.pop("uri_format", None)
if uri_format:
uri_format = uri_format.replace("$id", "$1").replace("[?id]", "$1").replace("$d", "$1")
if uri_format not in {
"http://purl.obolibrary.org/obo/$1",
"http://www.ebi.ac.uk/ontology-lookup/?termId=$1",
}:
if uri_format not in SKIP_URI_FORMATS:
rv["uri_format"] = uri_format

uri_rdf_formats = _get_uri_formats(rv, "rdf_uri_prefix")
Expand Down Expand Up @@ -191,9 +193,12 @@ def _get_uri_formats(rv, key) -> List[str]:
continue
if "$1" in uri_format or "[?id]" in uri_format: # FIXME check if these come at the end
continue
rv.append(f"{uri_format}$1")
uri_format = f"{uri_format}$1"
if uri_format in SKIP_URI_FORMATS:
continue
rv.append(uri_format)
return rv


if __name__ == "__main__":
print(len(get_prefixcommons(force_process=True))) # noqa:T201
print(len(get_prefixcommons(force_process=True, force_download=True))) # noqa:T201
28 changes: 28 additions & 0 deletions src/bioregistry/record_accumulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,14 @@
)

import curies
from curies import Converter

from bioregistry import Resource

__all__ = [
"get_converter",
]

logger = logging.getLogger(__name__)
prefix_blacklist = {"bgee.gene"}
uri_prefix_blacklist = {
Expand All @@ -33,6 +38,7 @@
"http://www.ncbi.nlm.nih.gov/nuccore/",
"https://www.ebi.ac.uk/ena/data/view/",
"http://www.ebi.ac.uk/ena/data/view/",
"http://arabidopsis.org/servlets/TairObject?accession=",
}
prefix_resource_blacklist = {
("orphanet", "http://www.orpha.net/ORDO/Orphanet_"), # biocontext is wrong
Expand Down Expand Up @@ -103,6 +109,28 @@ def _iterate_prefix_prefix(resource: Resource, *extras: str):
# (e.g., uniprot.isoform and uniprot)


def get_converter(
    resources: List[Resource],
    prefix_priority: Optional[Sequence[str]] = None,
    uri_prefix_priority: Optional[Sequence[str]] = None,
    include_prefixes: bool = False,
    strict: bool = False,
    blacklist: Optional[Collection[str]] = None,
    remapping: Optional[Mapping[str, str]] = None,
) -> Converter:
    """Build a :class:`curies.Converter` from a list of resources.

    All keyword arguments are forwarded unchanged to :func:`get_records`;
    the records it produces are then wrapped in a converter.

    :param resources: The resources to generate records from.
    :param prefix_priority: Passed through to :func:`get_records`.
    :param uri_prefix_priority: Passed through to :func:`get_records`.
    :param include_prefixes: Passed through to :func:`get_records`.
    :param strict: Passed through to :func:`get_records`.
    :param blacklist: Passed through to :func:`get_records`.
    :param remapping: Passed through to :func:`get_records`.
    :returns: A converter constructed from the generated records.
    """
    record_options = {
        "prefix_priority": prefix_priority,
        "uri_prefix_priority": uri_prefix_priority,
        "include_prefixes": include_prefixes,
        "strict": strict,
        "blacklist": blacklist,
        "remapping": remapping,
    }
    return curies.Converter(get_records(resources, **record_options))


def get_records( # noqa: C901
resources: List[Resource],
prefix_priority: Optional[Sequence[str]] = None,
Expand Down
Loading

0 comments on commit 6eeeaf2

Please sign in to comment.