Skip to content

Commit

Permalink
Add types for Prefix and PrefixMap (#134)
Browse files Browse the repository at this point in the history
  • Loading branch information
cthoyt authored Dec 19, 2024
1 parent 34b4663 commit 1df957b
Show file tree
Hide file tree
Showing 5 changed files with 384 additions and 6 deletions.
1 change: 1 addition & 0 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,4 @@ for updating your code.
struct
api
services/index
typing
50 changes: 50 additions & 0 deletions docs/source/typing.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
Typing
======
This package comes with utilities for better typing other resources.

Let's say you have a table like this:

====== ========== ======== ======
prefix identifier name smiles
====== ========== ======== ======
CHEBI 16236 ethanol CCO
CHEBI 28831 propanol CCCO
CHOBI 44884 pentanol CCCCCO
====== ========== ======== ======

Note that there's a typo in the prefix on the fourth row of the table (the third
data row), which uses ``CHOBI`` instead of ``CHEBI``. In the following code, we
simulate reading that file and show where the error shows up:

.. code-block:: python

    import csv

    from pydantic import BaseModel, ValidationError
    from curies import Converter, Prefix

    converter = Converter.from_prefix_map({
        "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
    })

    class Row(BaseModel):
        prefix: Prefix
        identifier: str
        name: str
        smiles: str

    records = [
        {"prefix": "CHEBI", "identifier": "16236", "name": "ethanol", "smiles": "CCO"},
        {"prefix": "CHEBI", "identifier": "28831", "name": "propanol", "smiles": "CCCO"},
        {"prefix": "CHOBI", "identifier": "44884", "name": "pentanol", "smiles": "CCCCCO"},
    ]

    for record in records:
        try:
            model = Row.model_validate(record, context=converter)
        except ValidationError as e:
            print(f"Issue parsing record {record}: {e}")
            continue

Note that :meth:`pydantic.BaseModel.model_validate` allows for passing a "context".
The :class:`curies.Prefix` class implements custom context handling, so if you pass
a converter as the context, each prefix is checked against (and standardized with
respect to) the prefixes in that converter.
4 changes: 4 additions & 0 deletions src/curies/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
DuplicateURIPrefixes,
DuplicateValueError,
NamedReference,
Prefix,
PrefixMap,
Record,
Records,
Reference,
Expand Down Expand Up @@ -38,6 +40,8 @@
"DuplicateURIPrefixes",
"DuplicateValueError",
"NamedReference",
"Prefix",
"PrefixMap",
"Record",
"Records",
"Reference",
Expand Down
222 changes: 216 additions & 6 deletions src/curies/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@
overload,
)

from pydantic import BaseModel, ConfigDict, Field, RootModel, field_validator
from pydantic import BaseModel, ConfigDict, Field, GetCoreSchemaHandler, RootModel, field_validator
from pydantic_core import core_schema
from pytrie import StringTrie
from typing_extensions import Self

if TYPE_CHECKING: # pragma: no cover
import pandas
Expand All @@ -36,6 +38,8 @@
"DuplicateURIPrefixes",
"DuplicateValueError",
"NamedReference",
"Prefix",
"PrefixMap",
"Record",
"Records",
"Reference",
Expand Down Expand Up @@ -148,6 +152,194 @@ def from_curie(cls, curie: str, *, sep: str = ":") -> ReferenceTuple:
return cls(prefix, identifier)


class Prefix(str):
    """A string that Pydantic validates as a CURIE prefix.

    This class is a subclass of Python's built-in string class,
    so you can wrap any string with it:

    .. code-block:: python

        from curies import Prefix

        prefix = Prefix("CHEBI")

    You can implicitly type annotate data with this class:

    .. code-block:: python

        from curies import Prefix

        prefix_map: dict[Prefix, str] = {
            "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
        }

    You can more explicitly type annotate data with this class using
    Pydantic's `root model <https://docs.pydantic.dev/2.3/usage/models/#rootmodel-and-custom-root-types>`_:

    .. code-block:: python

        from pydantic import RootModel
        from curies import Prefix

        PrefixMap = RootModel[dict[Prefix, str]]

        prefix_map = PrefixMap.model_validate(
            {
                "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
            }
        ).root

    This pattern is common enough that it's included in :class:`curies.PrefixMap`.

    When used inside a Pydantic model, the value is validated as a
    (non-strict) string and then wrapped in this class. Here's an
    example usage with Pydantic:

    .. code-block:: python

        from curies import Prefix
        from pydantic import BaseModel


        class ResourceInfo(BaseModel):
            prefix: Prefix
            name: str


        model = ResourceInfo.model_validate(
            {
                "prefix": "CHEBI",
                "name": "Chemical Entities of Biological Interest",
            }
        )

    .. note::

        The schema does not (yet) enforce the XSD NCName pattern, so when
        no converter is given as context, a value such as ``"$nope"`` is
        accepted as-is. Pass a converter (see below) to get checking
        against a known set of prefixes.

    This class implements a hook that uses Pydantic's "context"
    for validation that lets you pass a :class:`Converter` to check
    for existence and standardization with respect to the context
    in the converter:

    .. code-block:: python

        from curies import Prefix, get_obo_converter
        from pydantic import BaseModel


        class ResourceInfo(BaseModel):
            prefix: Prefix
            name: str


        converter = get_obo_converter()
        model = ResourceInfo.model_validate(
            {
                "prefix": "CHEBI",
                "name": "Chemical Entities of Biological Interest",
            },
            context=converter,
        )

        # raises a pydantic.ValidationError, because the prefix
        # is not registered in the OBO Foundry, and is therefore
        # not part of the OBO converter
        ResourceInfo.model_validate(
            {
                "prefix": "efo",
                "name": "Experimental Factor Ontology",
            },
            context=converter,
        )

        # In case you need to pass more arbitrary
        # context, you can also use a dict with the key
        # "converter"
        ResourceInfo.model_validate(
            {
                "prefix": "CHEBI",
                "name": "Chemical Entities of Biological Interest",
            },
            context={
                "converter": converter,
                # ... any other context entries go here
            },
        )
    """

    @classmethod
    def __get_pydantic_core_schema__(
        cls, source: type[Any], handler: GetCoreSchemaHandler
    ) -> core_schema.AfterValidatorFunctionSchema:
        """Build the Pydantic core schema used to validate a prefix.

        :param source: The annotated source type (not used here)
        :param handler: Pydantic's schema handler (not used here)
        :return: A schema that first validates the input as a non-strict
            string, then passes it (with the validation info) to
            :meth:`_validate`
        """
        return core_schema.with_info_after_validator_function(
            cls._validate,
            # TODO consider if we should use strict NCNAME pattern
            #  here like ^$|^[a-zA-Z_][\w.-]*$. See also
            #  https://cthoyt.com/2023/01/11/bioregistry-w3c-compliance.html
            core_schema.str_schema(strict=False),
        )

    @classmethod
    def _validate(cls, __input_value: str, info: core_schema.ValidationInfo) -> Self:
        """Wrap a validated string, standardizing it if a converter is in the context.

        :param __input_value: The string that passed the inner string schema
        :param info: Pydantic's validation info, whose context may carry a converter
        :return: The input wrapped as-is when no converter is available, otherwise
            the result of the converter's ``standardize_prefix`` wrapped in this class
        """
        converter = _converter_from_validation_info(info)
        if converter is None:
            # no context to check against, so accept the string unchanged
            return cls(__input_value)
        # NOTE(review): with strict=True the converter is expected to raise for
        # prefixes it does not know - confirm against Converter.standardize_prefix
        return cls(converter.standardize_prefix(__input_value, strict=True))


class PrefixMap(RootModel[dict[Prefix, str]]):
    """A Pydantic root model wrapping a dictionary from prefixes to strings.

    Use it to validate a plain dictionary:

    .. code-block:: python

        from curies import PrefixMap

        prefix_map_model = PrefixMap.model_validate(
            {
                "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
            }
        )

        # the validated dictionary lives on the ``root`` attribute
        prefix_map_dict = prefix_map_model.root

    It can also serve as the type of a field in a larger Pydantic model:

    .. code-block:: python

        from pydantic import BaseModel
        from curies import PrefixMap


        class RDFContent(BaseModel):
            prefix_map: PrefixMap
            triples: list[tuple[str, str, str]]


        rdf_content = RDFContent.model_validate(
            {
                "prefix_map": {
                    "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_",
                },
                "triples": [
                    ("CHEBI:1234", "RO:0000001", "CHEBI:5678"),
                ],
            }
        )

        # again, unpack the field to get at the underlying dictionary
        prefix_map = rdf_content.prefix_map.root
    """


class Reference(BaseModel):
"""A reference to an entity in a given identifier space.
Expand Down Expand Up @@ -192,7 +384,7 @@ class notion of parsed reference (instead of merely stringified
ReferenceTuple(prefix='chebi', identifier='1234')
"""

prefix: str = Field(
prefix: Prefix = Field(
...,
description="The prefix used in a compact URI (CURIE).",
)
Expand Down Expand Up @@ -234,18 +426,21 @@ def pair(self) -> ReferenceTuple:
return ReferenceTuple(self.prefix, self.identifier)

@classmethod
def from_curie(cls, curie: str, *, sep: str = ":") -> Reference:
def from_curie(
cls, curie: str, *, sep: str = ":", converter: Converter | None = None
) -> Reference:
"""Parse a CURIE string and populate a reference.
:param curie: A string representation of a compact URI (CURIE)
:param sep: The separator
:param converter: The converter to use as context when parsing
:return: A reference object
>>> Reference.from_curie("chebi:1234")
Reference(prefix='chebi', identifier='1234')
"""
prefix, identifier = _split(curie, sep=sep)
return cls(prefix=prefix, identifier=identifier)
return cls.model_validate({"prefix": prefix, "identifier": identifier}, context=converter)


class NamedReference(Reference):
Expand All @@ -258,19 +453,24 @@ class NamedReference(Reference):
model_config = ConfigDict(frozen=True)

@classmethod
def from_curie(cls, curie: str, name: str, *, sep: str = ":") -> NamedReference: # type:ignore
def from_curie( # type:ignore
cls, curie: str, name: str, *, sep: str = ":", converter: Converter | None = None
) -> NamedReference:
"""Parse a CURIE string and populate a reference.
:param curie: A string representation of a compact URI (CURIE)
:param name: The name of the reference
:param sep: The separator
:param converter: The converter to use as context when parsing
:return: A reference object
>>> NamedReference.from_curie("chebi:1234", "6-methoxy-2-octaprenyl-1,4-benzoquinone")
NamedReference(prefix='chebi', identifier='1234', name='6-methoxy-2-octaprenyl-1,4-benzoquinone')
"""
prefix, identifier = _split(curie, sep=sep)
return cls(prefix=prefix, identifier=identifier, name=name)
return cls.model_validate(
{"prefix": prefix, "identifier": identifier, "name": name}, context=converter
)


RecordKey = tuple[str, str, str, str]
Expand Down Expand Up @@ -2512,3 +2712,13 @@ def upgrade_prefix_map(prefix_map: Mapping[str, str]) -> list[Record]:
Record(prefix=prefix, prefix_synonyms=prefix_synonyms, uri_prefix=uri_prefix)
for uri_prefix, (prefix, *prefix_synonyms) in sorted(priority_prefix_map.items())
]


def _converter_from_validation_info(info: core_schema.ValidationInfo) -> Converter | None:
    """Extract the converter, if any, from a Pydantic validation context.

    :param info: The validation info that Pydantic hands to a validator; its
        ``context`` attribute is inspected
    :return: The context itself if it is a :class:`Converter`, the value stored
        under the ``"converter"`` key if the context is a dict (``None`` when
        that key is absent), or ``None`` when the context is missing or falsy
    :raises TypeError: If the context is neither a converter nor a dict
    """
    # a missing (or otherwise falsy) context is treated like an empty dict
    context = info.context or {}
    if isinstance(context, Converter):
        return context
    if isinstance(context, dict):
        return context.get("converter")
    raise TypeError(
        "validation context must be a Converter or a dict with an optional "
        f'"converter" key, got {type(context).__name__}'
    )
Loading

0 comments on commit 1df957b

Please sign in to comment.