Validate URI, URI-reference and CURIE values with regular expressions
derived from RFC 3986 and W3C CURIE Syntax 1.0.

Reviewer notes:
- Line breaks and indentation were restored from a whitespace-collapsed copy
  of this patch; verify context-line indentation against the target files
  before applying.
- The "index" lines are carried over unchanged from the source patch.

diff --git a/linkml_runtime/utils/metamodelcore.py b/linkml_runtime/utils/metamodelcore.py
index a4c60f41..957a1269 100644
--- a/linkml_runtime/utils/metamodelcore.py
+++ b/linkml_runtime/utils/metamodelcore.py
@@ -13,6 +13,10 @@
 from linkml_runtime.utils.namespaces import Namespaces
 from linkml_runtime.utils.strictness import is_strict
 
+from linkml_runtime.utils.uri_validator import validate_uri
+from linkml_runtime.utils.uri_validator import validate_uri_reference
+from linkml_runtime.utils.uri_validator import validate_curie
+
 # Reference Decimal to make sure it stays in the imports
 _z = Decimal(1)
 
@@ -105,10 +109,12 @@ def is_valid(cls, v: Union[str, URIRef, "Curie", "URIorCURIE"]) -> bool:
         if not isinstance(v, (str, URIRef, Curie, URIorCURIE)):
             return False
         v = str(v)
-        if ':' in v and '://' not in v:
-            return URIorCURIE.is_curie(v)
+        if validate_uri(v):
+            return True
+        elif validate_uri_reference(v):
+            return True
         else:
-            return URI.is_valid(v)
+            return URIorCURIE.is_curie(v)
 
     @staticmethod
     def is_absolute(v: str) -> bool:
@@ -116,6 +122,8 @@ def is_absolute(v: str) -> bool:
 
     @staticmethod
     def is_curie(v: str, nsm: Optional[Namespaces] = None) -> bool:
+        if not validate_curie(v):
+            return False
         if ':' in v and '://' not in v:
             ns, ln = v.split(':', 1)
             return len(ns) == 0 or (NCName.is_valid(ns) and
@@ -136,13 +144,14 @@ def __init__(self, v: str) -> None:
             raise ValueError(f"'{v}': is not a valid URI")
         super().__init__(v)
 
-    # this is more inclusive than the W3C specification
-    #uri_re = re.compile("^[A-Za-z]\\S*$")
-    uri_re = re.compile("^\\S+$")
-
     @classmethod
     def is_valid(cls, v: str) -> bool:
-        return v is not None and not URIorCURIE.is_curie(v) and cls.uri_re.match(v)
+        # The replaced check returned a falsy value for None; keep doing so
+        # (for any non-string) instead of letting the regex raise TypeError.
+        if not isinstance(v, str):
+            return False
+        # URI-reference = URI / relative-ref, so one call covers both cases.
+        return validate_uri_reference(v) is not None
 
 
 class Curie(URIorCURIE):
@@ -174,6 +183,8 @@ def ns_ln(cls, v: str) -> Optional[Tuple[str, str]]:
 
     @classmethod
     def is_valid(cls, v: str) -> bool:
+        if not validate_curie(v):
+            return False
         pnln = cls.ns_ln(v)
         #return pnln is not None and (not pnln[0] or isinstance(pnln[0], PN_PREFIX))
         return pnln is not None
diff --git a/linkml_runtime/utils/uri_validator.py b/linkml_runtime/utils/uri_validator.py
new file mode 100644
index 00000000..70203d27
--- /dev/null
+++ b/linkml_runtime/utils/uri_validator.py
@@ -0,0 +1,365 @@
+# Copyright Siemens 2023
+# SPDX-License-Identifier: CC0-1.0
+
+"""
+Regular-expression-based URI and CURIE validation functions
+
+These regex are directly derived from the official sources mentioned in each
+section.
+
+They should be processed with re.VERBOSE.
+
+Python named regular expression groups are being used to better understand the
+URI/CURIE parsing.
+"""
+
+import re
+
+
+# -----------------------------------------------------------------------------
+#
+### BASICS
+
+# Define DIGIT according RFC2234 section 3.4:
+# https://datatracker.ietf.org/doc/html/rfc2234/#section-3.4
+DIGIT = r"[0-9]"
+
+# Define ALPHA according RFC2234 section 6.1:
+# https://datatracker.ietf.org/doc/html/rfc2234/#section-6.1
+ALPHA = r"[A-Za-z]"
+
+# Define HEXDIG according RFC2234 section 6.1:
+# https://datatracker.ietf.org/doc/html/rfc2234/#section-6.1
+HEXDIG = r"[0-9A-Fa-f]"
+
+# pct-encoded = "%" HEXDIG HEXDIG
+pct_encoded = rf"% {HEXDIG} {HEXDIG}"
+
+# unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
+unreserved = rf"(?: {ALPHA} | {DIGIT} | \- | \. | _ | ~ )"
+
+# gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+gen_delims = r"(?: : | / | \? | \# | \[ | \] | @ )"
+
+# sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
+#            / "*" / "+" / "," / ";" / "="
+sub_delims = r"(?: ! | \$ | & | ' | \( | \) | \* | \+ | , | ; | = )"
+
+# pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
+pchar = rf"(?: {unreserved} | {pct_encoded} | {sub_delims} | : | @ )"
+
+# reserved = gen-delims / sub-delims
+reserved = rf"(?: {gen_delims} | {sub_delims} )"
+
+
+### required for Authority
+
+# dec-octet = DIGIT                 ; 0-9
+#           / %x31-39 DIGIT         ; 10-99
+#           / "1" 2DIGIT            ; 100-199
+#           / "2" %x30-34 DIGIT     ; 200-249
+#           / "25" %x30-35          ; 250-255
+dec_octet = rf"""(?: {DIGIT} |
+                     [1-9] {DIGIT} |
+                     1 {DIGIT}{{2}} |
+                     2 [0-4] {DIGIT} |
+                     25 [0-5]
+                 )
+"""
+
+# IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
+IPv4address = rf"{dec_octet} \. {dec_octet} \. {dec_octet} \. {dec_octet}"
+
+# h16 = 1*4HEXDIG
+h16 = rf"(?: {HEXDIG} ){{1,4}}"
+
+# ls32 = ( h16 ":" h16 ) / IPv4address
+ls32 = rf"(?: (?: {h16} : {h16} ) | {IPv4address} )"
+
+# IPv6address =                            6( h16 ":" ) ls32
+#             /                       "::" 5( h16 ":" ) ls32
+#             / [               h16 ] "::" 4( h16 ":" ) ls32
+#             / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
+#             / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
+#             / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
+#             / [ *4( h16 ":" ) h16 ] "::"              ls32
+#             / [ *5( h16 ":" ) h16 ] "::"              h16
+#             / [ *6( h16 ":" ) h16 ] "::"
+# "*n( h16 ":" )" allows zero to n repetitions, so the optional left-hand
+# groups use {0,n}; {1,n} would reject valid literals such as "1::7".
+IPv6address = rf"""(?:
+    (?: {h16} : ){{6}} {ls32} |
+    :: (?: {h16} : ){{5}} {ls32} |
+    (?: {h16} )? :: (?: {h16} : ){{4}} {ls32} |
+    (?: (?: {h16} : ){{0,1}} {h16} )? :: (?: {h16} : ){{3}} {ls32} |
+    (?: (?: {h16} : ){{0,2}} {h16} )? :: (?: {h16} : ){{2}} {ls32} |
+    (?: (?: {h16} : ){{0,3}} {h16} )? :: {h16} : {ls32} |
+    (?: (?: {h16} : ){{0,4}} {h16} )? :: {ls32} |
+    (?: (?: {h16} : ){{0,5}} {h16} )? :: {h16} |
+    (?: (?: {h16} : ){{0,6}} {h16} )? ::
+  )
+"""
+
+# IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
+IPvFuture = rf"v {HEXDIG}+ \. (?: {unreserved} | {sub_delims} | : )+"
+
+# IP-literal = "[" ( IPv6address / IPvFuture ) "]"
+IP_literal = rf"\[ (?: {IPv6address} | {IPvFuture} ) \]"
+
+# reg-name = *( unreserved / pct-encoded / sub-delims )
+reg_name = rf"(?: {unreserved} | {pct_encoded} | {sub_delims} )*"
+
+
+### required for Path
+
+# segment = *pchar
+segment = rf"{pchar}*"
+
+# segment-nz = 1*pchar
+segment_nz = rf"{pchar}+"
+
+# segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
+segment_nz_nc = rf"(?: {unreserved} | {pct_encoded} | {sub_delims} | @ )+"
+
+# -----------------------------------------------------------------------------
+#
+# Define SCHEME according RFC3986 section 3.1:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.1
+#
+
+# scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+scheme = rf"(?P<scheme> {ALPHA} (?: {ALPHA} | {DIGIT} | \+ | \- | \. )* )"
+
+
+# -----------------------------------------------------------------------------
+#
+# Define AUTHORITY according RFC3986 section 3.2:
+
+# Define USER INFORMATION according RFC3986 section 3.2.1:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.1
+
+# userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
+userinfo = rf"""(?P<userinfo>
+    (?: {unreserved} | {pct_encoded} | {sub_delims} | : )*
+  )
+"""
+
+# Define HOST according RFC3986 section 3.2.2:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
+
+# host = IP-literal / IPv4address / reg-name
+host = rf"(?P<host> {IP_literal} | {IPv4address} | {reg_name} )"
+
+# Define PORT according RFC3986 section 3.2.3:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.3
+
+# port = *DIGIT
+port = rf"(?P<port> ( {DIGIT} )* )"
+
+# Define AUTHORITY according RFC3986 section 3.2:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2
+#
+
+# authority = [ userinfo "@" ] host [ ":" port ]
+#authority = rf"""(?: (?P<userinfo> {userinfo} ) @)?
+authority = rf"""(?P<authority>
+    (?: {userinfo} @)?
+    {host}
+    (?: : {port} )?
+  )
+"""
+
+
+# -----------------------------------------------------------------------------
+#
+# Define different PATHs according RFC3986 section 3.3:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.3
+#
+
+# path-abempty = *( "/" segment )
+path_abempty = rf"( / {segment} )*"
+
+# path-absolute = "/" [ segment-nz *( "/" segment ) ]
+path_absolute = rf"( / (?: {segment_nz} (?: / {segment} )* )? )"
+
+# path-noscheme = segment-nz-nc *( "/" segment )
+path_noscheme = rf"( {segment_nz_nc} (?: / {segment} )* )"
+
+# path-rootless = segment-nz *( "/" segment )
+path_rootless = rf"( {segment_nz} (?: / {segment} )* )"
+
+# path-empty = 0<pchar>
+path_empty = r""
+
+# path = path-abempty    ; begins with "/" or is empty
+#      / path-absolute   ; begins with "/" but not "//"
+#      / path-noscheme   ; begins with a non-colon segment
+#      / path-rootless   ; begins with a segment
+#      / path-empty      ; zero characters
+path = rf"""(?:
+    {path_abempty} |
+    {path_absolute} |
+    {path_noscheme} |
+    {path_rootless} |
+    {path_empty}
+  )
+"""
+
+
+# -----------------------------------------------------------------------------
+#
+# Define QUERY according RFC3986 section 3.4:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.4
+#
+
+# query = *( pchar / "/" / "?" )
+query = rf"(?P<query> (?: {pchar} | / | \? )* )"
+
+
+# -----------------------------------------------------------------------------
+#
+# Define FRAGMENT according RFC3986 section 3.5:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-3.5
+#
+
+# fragment = *( pchar / "/" / "?" )
+fragment = rf"(?P<fragment> (?: {pchar} | / | \? )* )"
+
+
+# -----------------------------------------------------------------------------
+#
+# Define URI and HIERARCHICAL PATH according RFC3986 section 3:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-3
+#
+
+# hier-part = "//" authority path-abempty
+#           / path-absolute
+#           / path-rootless
+#           / path-empty
+hier_part = rf"""(?P<hier_part>
+    (?: // {authority} {path_abempty} ) |
+    {path_absolute} |
+    {path_rootless} |
+    {path_empty}
+  )
+"""
+
+
+# URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
+URI = rf"""(?P<uri>
+    {scheme} : {hier_part} (?: \? {query} )? (?: \# {fragment} )?
+  )
+"""
+
+
+# -----------------------------------------------------------------------------
+#
+# Define RELATIVE REFERENCE according RFC3986 section 4.2:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-4.2
+#
+
+# relative-part = "//" authority path-abempty
+#               / path-absolute
+#               / path-noscheme
+#               / path-empty
+# relative-ref = relative-part [ "?" query ] [ "#" fragment ]
+relative_ref = rf"""(?P<relative_ref>
+    (?:
+        (?: //
+            {authority}
+            (?P<path_abempty> {path_abempty} )
+        ) |
+        (?P<path_absolute> {path_absolute} ) |
+        (?P<path_noscheme> {path_noscheme} ) |
+        (?P<path_empty> {path_empty} )
+    )
+    (?: \? {query} )?
+    (?: \# {fragment} )?
+  )
+"""
+
+# -----------------------------------------------------------------------------
+#
+# Define ABSOLUTE URI according RFC3986 section 4.3:
+# https://datatracker.ietf.org/doc/html/rfc3986/#section-4.3
+#
+
+# absolute-URI = scheme ":" hier-part [ "?" query ]
+absolute_URI = rf"(?P<absolute_uri> {scheme} : {hier_part} (?: \? {query} )? )"
+
+
+# -----------------------------------------------------------------------------
+#
+# Define CURIE according W3C CURIE Syntax 1.0
+# https://www.w3.org/TR/curie/#s_syntax
+#
+
+# NCNameChar ::= Letter | Digit | '.' | '-' | '_' | CombiningChar | Extender
+# !! IMPORTANT NOTE !!
+# As of now this module doesn't support the full (IRI-based) NCNameChar,
+# only the ASCII subset:
+# NCNameChar ::= Letter | Digit | '.' | '-' | '_'
+NCNameChar = rf"(?: {ALPHA} | {DIGIT} | \. | \- | _ )"
+
+# prefix := NCName
+# NCName := (Letter | '_') (NCNameChar)*
+prefix = rf"(?: {ALPHA} | _ ) (?: {NCNameChar} )*"
+
+# reference := irelative-ref (as defined in IRI)
+# !! IMPORTANT NOTE !!
+# As of now this module doesn't support irelative-refs as defined in IRI, but
+# relative-refs as defined in URI
+# curie := [ [ prefix ] ':' ] reference
+# reference := relative-ref (as defined in URI)
+CURIE = rf"""(?P<curie>
+    (?: (?P<prefix> {prefix} )? : )?
+    {relative_ref}
+  )
+"""
+
+# safe_curie := '[' curie ']'
+safe_CURIE = rf"""(?P<safe_curie>
+    \[ {CURIE} \]
+  )
+"""
+
+
+# -----------------------------------------------------------------------------
+#
+### Compile the regular expressions for better performance
+
+uri_validator = re.compile("^{}$".format(URI), re.VERBOSE)
+
+#uri_ref_validator = re.compile("^{}$".format(URI_reference), re.VERBOSE)
+
+uri_relative_ref_validator = re.compile("^{}$".format(relative_ref), re.VERBOSE)
+
+abs_uri_validator = re.compile("^{}$".format(absolute_URI), re.VERBOSE)
+
+curie_validator = re.compile("^{}$".format(CURIE), re.VERBOSE)
+
+safe_curie_validator = re.compile("^{}$".format(safe_CURIE), re.VERBOSE)
+
+# -----------------------------------------------------------------------------
+#
+### FUNCTIONS
+
+
+def validate_uri(input):
+    """Return a match object if *input* is an RFC 3986 URI, else None."""
+    # fullmatch: "$" alone would also accept a trailing newline
+    return uri_validator.fullmatch(input)
+
+
+def validate_uri_reference(input):
+    """Return a match object if *input* is a URI reference, else None.
+
+    URI-reference = URI / relative-ref (RFC 3986 section 4.1):
+    https://datatracker.ietf.org/doc/html/rfc3986/#section-4.1
+    """
+    return uri_validator.fullmatch(input) or uri_relative_ref_validator.fullmatch(input)
+
+
+def validate_curie(input):
+    """Return a match object if *input* is a CURIE, else None."""
+    return curie_validator.fullmatch(input)
diff --git a/tests/test_loaders_dumpers/input/phenopackets/constants.yaml b/tests/test_loaders_dumpers/input/phenopackets/constants.yaml
index b17077a3..e5fb97a4 100644
--- a/tests/test_loaders_dumpers/input/phenopackets/constants.yaml
+++ b/tests/test_loaders_dumpers/input/phenopackets/constants.yaml
@@ -322,7 +322,7 @@ enums:
         meaning: UCUM:degree
       DIOPTER:
         description: diopter
-        meaning: UCUM:[diop]
+        meaning: UCUM:%5Bdiop%5D
       GRAM:
         description: gram
         meaning: UCUM:g
@@ -373,7 +373,7 @@ enums:
         meaning: UCUM:mm
       MILLIMETRES_OF_MERCURY:
         description: millimetres of mercury
-        meaning: UCUM:mm[Hg]
+        meaning: UCUM:mm%5BHg%5D
       MILLIMOLE:
         description: millimole
         meaning: UCUM:mmol
diff --git a/tests/test_loaders_dumpers/input/phenopackets/cv_terms.yaml b/tests/test_loaders_dumpers/input/phenopackets/cv_terms.yaml
index 28c8bc93..a539d7df 100644
--- a/tests/test_loaders_dumpers/input/phenopackets/cv_terms.yaml
+++ b/tests/test_loaders_dumpers/input/phenopackets/cv_terms.yaml
@@ -110,7 +110,7 @@ enums:
     reachable_from:
       source_ontology: bioregistry:uo
       source_nodes:
-        - UO:0000000 ! unit
+        - UO:0000000
       is_direct: false
       include_self: false
       relationship_types:
diff --git a/tests/test_loaders_dumpers/models/phenopackets.py b/tests/test_loaders_dumpers/models/phenopackets.py
index 5826c9d6..255a4578 100644
--- a/tests/test_loaders_dumpers/models/phenopackets.py
+++ b/tests/test_loaders_dumpers/models/phenopackets.py
@@ -3006,7 +3006,7 @@ class UnitTerms(EnumDefinitionImpl):
                               meaning=UCUM.degree)
     DIOPTER = PermissibleValue(text="DIOPTER",
                                description="diopter",
-                               meaning=UCUM["[diop]"])
+                               meaning=UCUM["%5Bdiop%5D"])
     GRAM = PermissibleValue(text="GRAM",
                             description="gram",
                             meaning=UCUM.g)
@@ -3057,7 +3057,7 @@ class UnitTerms(EnumDefinitionImpl):
                                   meaning=UCUM.mm)
     MILLIMETRES_OF_MERCURY = PermissibleValue(text="MILLIMETRES_OF_MERCURY",
                                               description="millimetres of mercury",
-                                              meaning=UCUM["mm[Hg]"])
+                                              meaning=UCUM["mm%5BHg%5D"])
     MILLIMOLE = PermissibleValue(text="MILLIMOLE",
                                  description="millimole",
                                  meaning=UCUM.mmol)
diff --git a/tests/test_processing/test_referencevalidator.py b/tests/test_processing/test_referencevalidator.py
index 5c541959..7f813799 100644
--- a/tests/test_processing/test_referencevalidator.py
+++ b/tests/test_processing/test_referencevalidator.py
@@ -1062,7 +1062,7 @@ def test_08_normalize_types(self):
             "uriorcurie": [
                 ("X:1", [], [], "X:1"),
                 ("http://example.org", [], [], "http://example.org"),
-                ("", [], [ConstraintType.TypeConstraint], ""),
+                ("", [], [], ""),
                 ("a b", [], [ConstraintType.TypeConstraint], "a b"),
                 (None, [], [], None),
             ],
diff --git a/tests/test_utils/test_metamodelcore.py b/tests/test_utils/test_metamodelcore.py
index cad39f86..2d74e6b1 100644
--- a/tests/test_utils/test_metamodelcore.py
+++ b/tests/test_utils/test_metamodelcore.py
@@ -31,29 +31,30 @@ def test_ncname(self):
         with self.assertRaises(ValueError):
             NCName('A12!')
 
-    def test_uris(self):
+    def test_uriorcuries(self):
         """ Test the URI and URIorCURIE types """
         str1 = "https://google.com/test#file?abc=1&def=4"
         self.assertEqual(str1, URIorCURIE(str1))
-        self.assertEqual(str1, URI(str1))
         str2 = "abc:123"
         self.assertEqual(str2, URIorCURIE(str2))
         str3 = ":123"
         self.assertEqual(str3, URIorCURIE(str3))
         with self.assertRaises(ValueError):
-            URI(str2)
+            URIorCURIE("abc:[def]")
         with self.assertRaises(ValueError):
             URIorCURIE("1abc:def")
         with self.assertRaises(ValueError):
             URIorCURIE("1:def")
         with self.assertRaises(ValueError):
             URIorCURIE(" ")
-        #with self.assertRaises(ValueError):
-        #    URIorCURIE("_")
+        with self.assertRaises(ValueError):
+            URIorCURIE("[")
         lax()
-        URI(str2)
         URIorCURIE("1abc:def")
         URIorCURIE("1:def")
+        self.assertFalse(URIorCURIE.is_valid(123))
+        URIorCURIE.is_curie("abc:123")
+        self.assertFalse(URIorCURIE.is_curie("http://example.org/path"))
 
     def test_curie(self):
         """ Test the CURIE type """
@@ -79,22 +80,36 @@ def test_curie(self):
 
     def test_uri(self):
         """ Test the URI data type """
+        str1 = "https://google.com/test#file?abc=1&def=4"
+        self.assertEqual(str1, URI(str1))
         self.assertEqual("http://foo.org/bargles", URI("http://foo.org/bargles"))
-        with self.assertRaises(ValueError):
-            URI("rdf:type")
         with self.assertRaises(ValueError):
             URI(":")
+        with self.assertRaises(ValueError):
+            URI(":123")
         # imports range is uriorcurie, so we allow file paths
         #URI("1")
-        URI("foo.bar")
-        URI("../a/b")
+        self.assertTrue(URI.is_valid("foo.bar"))
+        self.assertTrue(URI.is_valid("../a/b"))
+        self.assertTrue(URI.is_valid("abc:123"))
         #with self.assertRaises(ValueError):
         #    URI("x1")
-        with self.assertRaises(ValueError):
-            URI("")
-        lax()
+        # an empty URI is a valid same-document URI reference
+        self.assertTrue(URI.is_valid(""))
         x = URI("rdf:type")
-        self.assertFalse(URI.is_valid(x))
+        self.assertTrue(URI.is_valid(x))
+        self.assertTrue(URI.is_valid("urn:abc:123"))
+        self.assertTrue(URI.is_valid("https://john.doe@www.example.com:123/forum/questions/?tag=networking&order=newest#top"))
+        self.assertTrue(URI.is_valid("ldap://[2001:db8::7]/c=GB?objectClass?one"))
+        self.assertTrue(URI.is_valid("http://[1::7]/"))
+        self.assertTrue(URI.is_valid("mailto:John.Doe@example.com"))
+        self.assertTrue(URI.is_valid("news:comp.infosystems.www.servers.unix"))
+        self.assertTrue(URI.is_valid("tel:+1-816-555-1212"))
+        self.assertTrue(URI.is_valid("telnet://192.0.2.16:80/"))
+        self.assertTrue(URI.is_valid("urn:oasis:names:specification:docbook:dtd:xml:4.1.2"))
+        self.assertTrue(URI.is_valid("file:///home/user/"))
+        self.assertFalse(URI.is_valid(None))
+        self.assertFalse(URI.is_valid("http://example.org\n"))
 
     def test_bool(self):
         self.assertTrue(Bool(True))