-
-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This PR splits out some of #104 to implement regular expressions for testing prefix and CURIE validity against the W3C standard
- Loading branch information
Showing
8 changed files
with
259 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -68,3 +68,4 @@ for updating your code. | |
api | ||
services/index | ||
typing | ||
w3c |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
W3C Validation | ||
============== | ||
.. automodapi:: curies.w3c | ||
:no-inheritance-diagram: | ||
:no-heading: | ||
:include-all-objects: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
"""Validation based on W3C standards. | ||
The World Wide Web Consortium (W3C) provides standards for | ||
`prefixes <https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName>`_ (i.e., ``NCName``), | ||
`CURIEs <https://www.w3.org/TR/2010/NOTE-curie-20101216/>`_, and | ||
`IRIs <https://www.ietf.org/rfc/rfc3987.txt>`_, but they are | ||
highly obfuscated and spread across many documents. | ||
This module attempts to operationalize these standards, along with best practices | ||
of documentation and testing. | ||
.. seealso:: | ||
Some other work towards operationalizing these standards: | ||
- https://github.com/linkml/linkml-runtime/blob/main/linkml_runtime/utils/uri_validator.py | ||
- https://github.com/dgerber/rfc3987/blob/gh-archived/rfc3987.py | ||
""" | ||
|
||
import re | ||
|
||
__all__ = [ | ||
"CURIE_PATTERN", | ||
"LOCAL_UNIQUE_IDENTIFIER_PATTERN", | ||
"NCNAME_PATTERN", | ||
"is_w3c_curie", | ||
"is_w3c_prefix", | ||
] | ||
|
||
NCNAME_PATTERN = r"[A-Za-z_][A-Za-z0-9\.\-_]*"
"""A regex for prefixes, from https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName.

.. code-block::

    prefix := NCName
    NCName := (Letter | '_') (NCNameChar)*
    NCNameChar ::= Letter | Digit | '.' | '-' | '_'

The pattern is not anchored and only admits ASCII letters and digits, whereas
the ``Letter`` and ``Digit`` productions of the XML specification also cover
non-ASCII characters.
"""

# Anchored, pre-compiled form of NCNAME_PATTERN.
NCNAME_RE = re.compile(f"^{NCNAME_PATTERN}$")

LOCAL_UNIQUE_IDENTIFIER_PATTERN = r"(/[^\s/][^\s]*|[^\s/][^\s]*|[^\s]?)"
"""A regex for local unique identifiers in CURIEs, based on https://www.ietf.org/rfc/rfc3987.txt

This pattern was adapted from https://gist.github.com/niklasl/2506955, which sort of
implements RFC3987.

The pattern is not anchored and its last alternative can match the empty string,
so it has to be applied to a whole string (e.g., with ``fullmatch``) to be
meaningful as a validator.
"""

# Pre-compiled (still unanchored) form of LOCAL_UNIQUE_IDENTIFIER_PATTERN.
LOCAL_UNIQUE_IDENTIFIER_RE = re.compile(LOCAL_UNIQUE_IDENTIFIER_PATTERN)

# The NCName is wrapped in a non-capturing group so that the ``?`` makes the
# *prefix* optional. Appending ``?`` directly to NCNAME_PATTERN would instead
# bind to its trailing ``*`` and merely make that quantifier lazy.
CURIE_PATTERN = rf"^((?:{NCNAME_PATTERN})?:)?{LOCAL_UNIQUE_IDENTIFIER_PATTERN}$"
"""A regex for CURIEs, based on https://www.w3.org/TR/2010/NOTE-curie-20101216.

.. code-block::

    curie := [ [ prefix ] ':' ] reference
    prefix := NCName
    reference := irelative-ref (as defined in `IRI <https://www.ietf.org/rfc/rfc3987.txt>`_)

`irelative-ref` is defined/documented in :data:`curies.w3c.LOCAL_UNIQUE_IDENTIFIER_PATTERN`.

Group 1 captures the optional prefix together with its colon (e.g., ``GO:`` or ``:``)
and group 2 captures the reference.

.. note::

    Because the reference pattern itself admits colons, a string whose prefix is
    not a valid NCName (e.g., ``4cdn:test``) still matches this pattern as a bare
    reference. Use :func:`curies.w3c.is_w3c_curie` for strict validation.
"""

# Pre-compiled form of CURIE_PATTERN.
CURIE_RE = re.compile(CURIE_PATTERN)
|
||
|
||
def is_w3c_prefix(prefix: str) -> bool:
    """Return if the string is a valid prefix under the W3C specification.

    :param prefix: A string
    :return: If the string is a valid prefix under the W3C specification.

    Validation is implemented as a regular expression match of the whole string
    against :data:`curies.w3c.NCNAME_PATTERN`, as defined by the W3C
    `here <https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-NCName>`_.

    Examples
    --------
    Strings containing numbers, letters, and underscores are valid prefixes.

    >>> is_w3c_prefix("GO")
    True

    The W3C specification states that the prefix '_' is reserved for use
    by languages that support RDF. For this reason, the prefix '_' SHOULD
    be avoided by authors.

    >>> is_w3c_prefix("_")
    True

    Strings starting with a number are not
    valid prefixes.

    >>> is_w3c_prefix("3dmet")
    False

    Strings containing a colon or other
    characters are invalid

    >>> is_w3c_prefix("GO:")
    False
    """
    # ``fullmatch`` rather than ``match``: the ``$`` at the end of the compiled
    # pattern also matches just before a final newline, so ``match`` would
    # wrongly accept a prefix such as "GO\n".
    return NCNAME_RE.fullmatch(prefix) is not None
|
||
|
||
def _is_w3c_luid(luid: str) -> bool:
    """Return if the whole string is a valid local unique identifier.

    :param luid: The reference (local unique identifier) part of a CURIE.
    :return: True if the entire string matches
        :data:`curies.w3c.LOCAL_UNIQUE_IDENTIFIER_PATTERN`.
    """
    # The pattern is unanchored and its last alternative can match the empty
    # string, so ``match`` would succeed for *every* input (including ones
    # containing whitespace). ``fullmatch`` forces the whole string to conform.
    return LOCAL_UNIQUE_IDENTIFIER_RE.fullmatch(luid) is not None
|
||
|
||
def is_w3c_curie(curie: str) -> bool:
    """Check whether a string is a valid CURIE according to the W3C specification.

    :param curie: The string to validate.
    :return: True if the string is a valid CURIE under the W3C specification,
        otherwise False.

    .. warning::

        This is not the same check as :meth:`curies.Converter.is_curie`, which
        tests whether a CURIE is valid with respect to the extended prefix map
        held by a converter.

        The base converter is also, by default, slightly more lenient than the
        W3C specification so that CURIEs such as the SMILES string
        ``smiles:CC(=O)NC([H])(C)C(=O)O`` can be used. Those are useful, but not
        technically valid because they contain brackets.

    Examples
    --------
    When the prefix is omitted, the host language decides which default prefix
    to assign.

    >>> is_w3c_curie(":test")
    True

    Regarding an underscore as the prefix, the specification says that the
    CURIE prefix '_' is reserved for use by languages that support RDF and
    SHOULD therefore be avoided by authors. It is nevertheless valid.

    >>> is_w3c_curie("_:test")
    True

    A prefix must begin with a letter or an underscore, so a prefix that starts
    with a digit makes the CURIE invalid.

    >>> is_w3c_curie("4cdn:test")
    False

    Empty strings are explicitly noted as being invalid.

    >>> is_w3c_curie("")
    False
    """
    # brackets are never allowed anywhere in the CURIE
    if any(bracket in curie for bracket in ("[", "]")):
        return False

    # an empty (or whitespace-only) CURIE is invalid (for now)
    if curie.strip() == "":
        return False

    head, colon, reference = curie.partition(":")

    # without a colon there is no prefix part at all, so the whole string
    # (which ``partition`` returned unchanged as ``head``) is the reference
    if colon == "":
        return _is_w3c_luid(head)

    # an omitted prefix is fine in a CURIE, even though an NCName on its own
    # may not be empty, so only the reference needs to be checked
    if head == "":
        return _is_w3c_luid(reference)

    if not is_w3c_prefix(head):
        return False
    return _is_w3c_luid(reference)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
|
||
smiles:CC(=O)NC([H])(C)C(=O)O | ||
4cdn:test | ||
#nope:nope | ||
?nope:nope |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
|
||
4dn | ||
GO:GO: | ||
GO: | ||
#nope | ||
$ | ||
# | ||
: | ||
:luid |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
pfx:abc | ||
: | ||
pfx: | ||
abc | ||
:abc | ||
_:abc | ||
pfx:/abc | ||
pfx:/ | ||
:/ | ||
bioregistry:bioregistry | ||
GO:0000012 | ||
go:0123456 | ||
home:#start | ||
joseki: | ||
google:xforms+or+'xml+forms' | ||
isbn:0321154991 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
go | ||
GO | ||
NCBITaxon | ||
ncbi.taxon | ||
ncbi_taxon | ||
_ | ||
_secret | ||
secret_ | ||
_secret |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
"""Test W3C validation.""" | ||
|
||
import unittest | ||
from pathlib import Path | ||
|
||
from curies.w3c import is_w3c_curie, is_w3c_prefix | ||
|
||
# Directory containing this test module.
HERE = Path(__file__).parent.resolve()
# Folder with the line-oriented fixture files, one prefix/CURIE per line.
DIRECTORY = HERE / "resources"
VALID_CURIES_PATH = DIRECTORY / "valid_curies.txt"
INVALID_CURIES_PATH = DIRECTORY / "invalid_curies.txt"
VALID_PREFIXES_PATH = DIRECTORY / "valid_prefixes.txt"
INVALID_PREFIXES_PATH = DIRECTORY / "invalid_prefixes.txt"
|
||
|
||
def _read(path: Path) -> list[str]:
    """Return the lines of the text file at ``path``, without line terminators."""
    contents = path.read_text()
    return contents.splitlines()
|
||
|
||
class TestValidators(unittest.TestCase):
    """Check the W3C validators against the fixture files in the resources folder."""

    def test_prefixes(self) -> None:
        """Valid prefixes must be accepted and invalid prefixes must be rejected."""
        should_pass = _read(VALID_PREFIXES_PATH)
        for candidate in should_pass:
            with self.subTest(prefix=candidate):
                accepted = is_w3c_prefix(candidate)
                self.assertTrue(accepted)

        should_fail = _read(INVALID_PREFIXES_PATH)
        for candidate in should_fail:
            with self.subTest(prefix=candidate):
                accepted = is_w3c_prefix(candidate)
                self.assertFalse(accepted)

    def test_curies(self) -> None:
        """Valid CURIEs must be accepted and invalid CURIEs must be rejected."""
        should_pass = _read(VALID_CURIES_PATH)
        for candidate in should_pass:
            with self.subTest(curie=candidate):
                accepted = is_w3c_curie(candidate)
                self.assertTrue(accepted, msg="CURIE should test as valid, but did not")

        should_fail = _read(INVALID_CURIES_PATH)
        for candidate in should_fail:
            with self.subTest(curie=candidate):
                accepted = is_w3c_curie(candidate)
                self.assertFalse(accepted, msg="CURIE should test as invalid, but did not")