Skip to content

Commit

Permalink
Add w3c validation module (#143)
Browse files Browse the repository at this point in the history
This PR splits out some of #104 to implement regular expressions for
testing prefix and CURIE validity against the W3C standard
  • Loading branch information
cthoyt authored Dec 19, 2024
1 parent 79c1c3a commit 13e4ca1
Show file tree
Hide file tree
Showing 8 changed files with 259 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,4 @@ for updating your code.
api
services/index
typing
w3c
6 changes: 6 additions & 0 deletions docs/source/w3c.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
W3C Validation
==============
.. automodapi:: curies.w3c
:no-inheritance-diagram:
:no-heading:
:include-all-objects:
170 changes: 170 additions & 0 deletions src/curies/w3c.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
"""Validation based on W3C standards.
The World Wide Web Consortium (W3C) provides standards for
`prefixes <https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName>`_ (i.e., ``NCName``),
`CURIEs <https://www.w3.org/TR/2010/NOTE-curie-20101216/>`_, and
`IRIs <https://www.ietf.org/rfc/rfc3987.txt>`_, but they are
highly obfuscated and spread across many documents.
This module attempts to operationalize these standards, along with best practices
of documentation and testing.
.. seealso::
Some other work towards operationalizing these standards:
- https://github.com/linkml/linkml-runtime/blob/main/linkml_runtime/utils/uri_validator.py
- https://github.com/dgerber/rfc3987/blob/gh-archived/rfc3987.py
"""

import re

# Public names of this module: the three regular expression pattern strings
# and the two validation functions.
__all__ = [
"CURIE_PATTERN",
"LOCAL_UNIQUE_IDENTIFIER_PATTERN",
"NCNAME_PATTERN",
"is_w3c_curie",
"is_w3c_prefix",
]

NCNAME_PATTERN = r"[A-Za-z_][A-Za-z0-9\.\-_]*"
"""A regex for prefixes, from https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName.

.. code-block::

    prefix := NCName
    NCName := (Letter | '_') (NCNameChar)*
    NCNameChar ::= Letter | Digit | '.' | '-' | '_'

This regular expression only admits ASCII letters and digits for ``Letter`` and ``Digit``.
"""

# Anchored at both ends so that a prefix must be an NCName and nothing else.
NCNAME_RE = re.compile(f"^{NCNAME_PATTERN}$")

LOCAL_UNIQUE_IDENTIFIER_PATTERN = r"(/[^\s/][^\s]*|[^\s/][^\s]*|[^\s]?)"
"""A regex for local unique identifiers in CURIEs, based on https://www.ietf.org/rfc/rfc3987.txt

This pattern was adapted from https://gist.github.com/niklasl/2506955, which sort of
implements RFC3987.
"""

# NOTE: compiled without anchors, and the last alternative can match the empty
# string. Use ``fullmatch`` to validate an entire string with this object.
LOCAL_UNIQUE_IDENTIFIER_RE = re.compile(LOCAL_UNIQUE_IDENTIFIER_PATTERN)

# The NCName is wrapped in a non-capturing group before ``?`` is applied.
# Writing ``{NCNAME_PATTERN}?`` would attach the ``?`` to the trailing ``*`` of
# the interpolated pattern (making it lazy) instead of making the prefix
# optional. Group numbering is unaffected: group 1 is the prefix including its
# colon, group 2 is the reference.
CURIE_PATTERN = rf"^((?:{NCNAME_PATTERN})?:)?{LOCAL_UNIQUE_IDENTIFIER_PATTERN}$"
"""A regex for CURIEs, based on https://www.w3.org/TR/2010/NOTE-curie-20101216.

.. code-block::

    curie := [ [ prefix ] ':' ] reference
    prefix := NCName
    reference := irelative-ref (as defined in `IRI <https://www.ietf.org/rfc/rfc3987.txt>`_)

`irelative-ref` is defined/documented in :data:`curies.w3c.LOCAL_UNIQUE_IDENTIFIER_PATTERN`.
"""

CURIE_RE = re.compile(CURIE_PATTERN)


def is_w3c_prefix(prefix: str) -> bool:
    """Return if the string is a valid prefix under the W3C specification.

    :param prefix: A string
    :return: If the string is a valid prefix under the W3C specification.

    Validation is implemented as a regular expression match against
    :data:`curies.w3c.NCNAME_PATTERN`, as defined by the W3C
    `here <https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName>`_.
    The whole string has to match, so a prefix followed by a line break
    is rejected as well.

    Examples
    --------
    Strings containing numbers, letters, and underscores are valid prefixes.

    >>> is_w3c_prefix("GO")
    True

    The W3C specification states that the prefix '_' is reserved for use
    by languages that support RDF. For this reason, the prefix '_' SHOULD
    be avoided by authors.

    >>> is_w3c_prefix("_")
    True

    Strings starting with a number are not
    valid prefixes.

    >>> is_w3c_prefix("3dmet")
    False

    Strings containing a colon or other
    characters are invalid

    >>> is_w3c_prefix("GO:")
    False
    """
    # ``fullmatch`` instead of ``match``: with ``match``, the ``$`` in the
    # compiled pattern also matches just before a trailing newline, which
    # would let a string such as "GO" followed by a line break pass.
    return NCNAME_RE.fullmatch(prefix) is not None


def _is_w3c_luid(luid: str) -> bool:
    """Return if the entire string matches the local unique identifier pattern.

    :param luid: The string to check, e.g., the part of a CURIE after its first colon.
    :return: True if the complete string matches
        :data:`curies.w3c.LOCAL_UNIQUE_IDENTIFIER_PATTERN`.
    """
    # ``fullmatch`` instead of ``match``: the compiled pattern is not anchored
    # and its last alternative can match the empty string, so ``match`` finds
    # a (possibly empty) match at the start of every input and the check would
    # always pass, even for strings containing whitespace.
    return LOCAL_UNIQUE_IDENTIFIER_RE.fullmatch(luid) is not None


def is_w3c_curie(curie: str) -> bool:
    """Check whether a string is a valid CURIE under the W3C specification.

    :param curie: The string to test.
    :return: True if the string is a valid CURIE under the W3C specification,
        otherwise False.

    .. warning::

        This is not the same check as :meth:`curies.Converter.is_curie`,
        which tests whether a given CURIE is valid with respect to the extended
        prefix map contained within the converter.

        In addition, the base converter is by default slightly more lenient than
        the W3C specification so that CURIEs such as the SMILES string
        ``smiles:CC(=O)NC([H])(C)C(=O)O`` can be included. These are useful, but
        not technically valid because they contain brackets.

    Examples
    --------
    If no prefix is given, the host language chooses how to assign a default
    prefix.

    >>> is_w3c_curie(":test")
    True

    From the specification, regarding using an underscore as the prefix:
    the CURIE prefix '_' is reserved for use by languages that support RDF.
    For this reason, the prefix '_' SHOULD be avoided by authors.

    >>> is_w3c_curie("_:test")
    True

    This is invalid because a CURIE prefix isn't allowed to start with
    a number. It has to start with either a letter, or an underscore.

    >>> is_w3c_curie("4cdn:test")
    False

    Empty strings are explicitly noted as being invalid.

    >>> is_w3c_curie("")
    False
    """
    # Square brackets anywhere in the string rule it out immediately.
    if any(bracket in curie for bracket in ("[", "]")):
        return False

    # An empty (or whitespace-only) string is invalid (for now).
    if curie.strip() == "":
        return False

    head, colon, tail = curie.partition(":")

    # Without a colon there is no prefix part, so the whole string is the
    # reference and is validated against the LUID pattern.
    if colon == "":
        return _is_w3c_luid(curie)

    # An empty prefix is okay in a CURIE, even though the NCName definition
    # itself may not be empty, so only a non-empty prefix gets checked.
    if head and not is_w3c_prefix(head):
        return False

    return _is_w3c_luid(tail)
5 changes: 5 additions & 0 deletions tests/resources/invalid_curies.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@

smiles:CC(=O)NC([H])(C)C(=O)O
4cdn:test
#nope:nope
?nope:nope
9 changes: 9 additions & 0 deletions tests/resources/invalid_prefixes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@

4dn
GO:GO:
GO:
#nope
$
#
:
:luid
16 changes: 16 additions & 0 deletions tests/resources/valid_curies.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
pfx:abc
:
pfx:
abc
:abc
_:abc
pfx:/abc
pfx:/
:/
bioregistry:bioregistry
GO:0000012
go:0123456
home:#start
joseki:
google:xforms+or+'xml+forms'
isbn:0321154991
9 changes: 9 additions & 0 deletions tests/resources/valid_prefixes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
go
GO
NCBITaxon
ncbi.taxon
ncbi_taxon
_
_secret
secret_
_secret
43 changes: 43 additions & 0 deletions tests/test_w3c.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""Test W3C validation."""

import unittest
from pathlib import Path

from curies.w3c import is_w3c_curie, is_w3c_prefix

HERE = Path(__file__).parent.resolve()
DIRECTORY = HERE.joinpath("resources")
VALID_CURIES_PATH = DIRECTORY.joinpath("valid_curies.txt")
INVALID_CURIES_PATH = DIRECTORY.joinpath("invalid_curies.txt")
VALID_PREFIXES_PATH = DIRECTORY.joinpath("valid_prefixes.txt")
INVALID_PREFIXES_PATH = DIRECTORY.joinpath("invalid_prefixes.txt")


def _read(path: Path) -> list[str]:
return path.read_text().splitlines()


class TestValidators(unittest.TestCase):
    """Run the W3C validators over the example files in the resources directory."""

    def test_prefixes(self) -> None:
        """Check every listed prefix against the prefix validator."""
        # Valid examples first, then invalid ones; each line is its own sub-test.
        expectations = (
            (VALID_PREFIXES_PATH, self.assertTrue),
            (INVALID_PREFIXES_PATH, self.assertFalse),
        )
        for path, check in expectations:
            for prefix in _read(path):
                with self.subTest(prefix=prefix):
                    check(is_w3c_prefix(prefix))

    def test_curies(self) -> None:
        """Check every listed CURIE against the CURIE validator."""
        # Valid examples first, then invalid ones; each line is its own sub-test.
        expectations = (
            (
                VALID_CURIES_PATH,
                self.assertTrue,
                "CURIE should test as valid, but did not",
            ),
            (
                INVALID_CURIES_PATH,
                self.assertFalse,
                "CURIE should test as invalid, but did not",
            ),
        )
        for path, check, message in expectations:
            for curie in _read(path):
                with self.subTest(curie=curie):
                    check(is_w3c_curie(curie), msg=message)

0 comments on commit 13e4ca1

Please sign in to comment.