From c8dbbbddf5eb419a24f21159d662d20d3734b295 Mon Sep 17 00:00:00 2001 From: muhtasham Date: Sat, 21 Sep 2024 04:32:30 +0200 Subject: [PATCH 1/5] Add URI to json_schema.py --- outlines/fsm/json_schema.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/outlines/fsm/json_schema.py b/outlines/fsm/json_schema.py index 98d2de59c..1db360067 100644 --- a/outlines/fsm/json_schema.py +++ b/outlines/fsm/json_schema.py @@ -32,12 +32,14 @@ DATE = r'"(?:\d{4})-(?:0[1-9]|1[0-2])-(?:0[1-9]|[1-2][0-9]|3[0-1])"' TIME = r'"(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\.[0-9]+)?(Z)?"' UUID = r'"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"' +URI = r'"(https?:\/\/)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"' format_to_regex = { "uuid": UUID, "date-time": DATE_TIME, "date": DATE, "time": TIME, + "uri": URI, } @@ -350,14 +352,8 @@ def to_regex( return rf'("{pattern}")' elif "format" in instance: format = instance["format"] - if format == "date-time": - return format_to_regex["date-time"] - elif format == "uuid": - return format_to_regex["uuid"] - elif format == "date": - return format_to_regex["date"] - elif format == "time": - return format_to_regex["time"] + if format in format_to_regex: + return format_to_regex[format] else: raise NotImplementedError( f"Format {format} is not supported by Outlines" From 91fa56b9b7b35ec8ed6ef3437a345d5f64ccdc16 Mon Sep 17 00:00:00 2001 From: muhtasham Date: Sat, 21 Sep 2024 17:26:15 +0200 Subject: [PATCH 2/5] Add tests for URI test_json_schema.py --- tests/fsm/test_json_schema.py | 37 +++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/fsm/test_json_schema.py b/tests/fsm/test_json_schema.py index 7565ff642..233aff7f5 100644 --- a/tests/fsm/test_json_schema.py +++ b/tests/fsm/test_json_schema.py @@ -17,6 +17,7 @@ STRING_INNER, TIME, UUID, + URI, WHITESPACE, build_regex_from_schema, get_schema_from_signature, @@ -825,6 +826,42 @@ def test_match(schema, regex, examples): ('"15:30:00+01:00"', False), # incorrect separator ], ), + # URI + ( + {"title": "Foo", "type": "string", "format": "uri"}, + URI, + [ + ('"https://www.example.com"', True), + ('"http://example.com"', True), + ('"https://subdomain.example.co.uk/path?query=value#fragment"', True), + ('"https://example.com:8080"', True), # With port + ('"http://123.45.67.89"', True), # IP address + ('"https://example.com/path/to/resource.html"', True), # With file extension + ('"https://user:pass@example.com"', True), # With basic auth + ('"https://example.com/?q=test&r=123"', True), # With multiple query parameters + ('"https://example.co.uk"', True), # Different TLD + ('"https://xn--bcher-kva.example"', True), # Punycode domain + ('"https://example.com/path%20with%20spaces"', True), # Encoded spaces + ('"ftp://example.com"', False), # FTP protocol + ('"not a uri"', False), + ('"https://"', False), # Incomplete URI + ('""', False), # Empty string + ('https://www.example.com', False), # Missing quotes + ('"http:/example.com"', False), # Missing slash after protocol + ('"https://example.com:abc"', False), # Invalid port + ('"https://exa mple.com"', False), # Space in domain + ('"https://.example.com"', False), # Domain starting with dot + ('"https://example..com"', False), # Consecutive dots in domain + ('"https://exam ple.com/path"', False), # Space in domain (but valid path) + ('"https://example.com/path "', False), # Space at end of path + ('"https://example.com#frag ment"', False), # Space in fragment + ('"https://example.com/?q=va lue"', False), # Space in query + ('"https://exa\nmple.com"', False), # Newline in domain + ('"https://example.com/pat\nh"', False), # Newline in path + ('"https://example.com#frag\nment"', False), # Newline in fragment + ('"https://example.com/?q=val\nue"', False), # Newline in query + ] + ), ], ) def test_format(schema, regex, examples): From 1908514cacb73a875d2e63f478c3c06bbd0f1d92 Mon Sep 17 00:00:00 2001 From: muhtasham Date: Mon, 23 Sep 2024 11:21:47 -0700 Subject: [PATCH 3/5] Refine URI regex Update URI regex pattern to remove unnecessary `www.` and restrict `@` to the auth portion. Comment added to clarify support for a subset of RFC 3986 (https URLs with optional auth details). --- outlines/fsm/json_schema.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/outlines/fsm/json_schema.py b/outlines/fsm/json_schema.py index 1db360067..f4ca55f7e 100644 --- a/outlines/fsm/json_schema.py +++ b/outlines/fsm/json_schema.py @@ -32,7 +32,8 @@ DATE = r'"(?:\d{4})-(?:0[1-9]|1[0-2])-(?:0[1-9]|[1-2][0-9]|3[0-1])"' TIME = r'"(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\.[0-9]+)?(Z)?"' UUID = r'"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"' -URI = r'"(https?:\/\/)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"' +# URI only supports a subset of https://datatracker.ietf.org/doc/html/rfc3986, specifically https:// URLs with optional auth details +URI = r'"(https?:\/\/)?([-a-zA-Z0-9:%._\+~#=]+@)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}([-a-zA-Z0-9@:%_\+.~#?&//=]*)"' format_to_regex = { "uuid": UUID, From ca2055c89901e11c2a2cd5c9ade578629991212a Mon Sep 17 00:00:00 2001 From: muhtasham Date: Mon, 30 Sep 2024 01:17:16 +0200 Subject: [PATCH 4/5] pre-commit --- tests/fsm/test_json_schema.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/tests/fsm/test_json_schema.py b/tests/fsm/test_json_schema.py index 233aff7f5..f77d0fa88 100644 --- a/tests/fsm/test_json_schema.py +++ b/tests/fsm/test_json_schema.py @@ -16,8 +16,8 @@ STRING, STRING_INNER, TIME, - UUID, URI, + UUID, WHITESPACE, build_regex_from_schema, get_schema_from_signature, @@ -836,9 +836,15 @@ def test_match(schema, regex, examples): ('"https://subdomain.example.co.uk/path?query=value#fragment"', True), ('"https://example.com:8080"', True), # With port ('"http://123.45.67.89"', True), # IP address - ('"https://example.com/path/to/resource.html"', True), # With file extension + ( + '"https://example.com/path/to/resource.html"', + True, + ), # With file extension ('"https://user:pass@example.com"', True), # With basic auth - ('"https://example.com/?q=test&r=123"', True), # With multiple query parameters + ( + '"https://example.com/?q=test&r=123"', + True, + ), # With multiple query parameters ('"https://example.co.uk"', True), # Different TLD ('"https://xn--bcher-kva.example"', True), # Punycode domain ('"https://example.com/path%20with%20spaces"', True), # Encoded spaces @@ -846,13 +852,16 @@ def test_match(schema, regex, examples): ('"not a uri"', False), ('"https://"', False), # Incomplete URI ('""', False), # Empty string - ('https://www.example.com', False), # Missing quotes + ("https://www.example.com", False), # Missing quotes ('"http:/example.com"', False), # Missing slash after protocol ('"https://example.com:abc"', False), # Invalid port ('"https://exa mple.com"', False), # Space in domain ('"https://.example.com"', False), # Domain starting with dot ('"https://example..com"', False), # Consecutive dots in domain - ('"https://exam ple.com/path"', False), # Space in domain (but valid path) + ( + '"https://exam ple.com/path"', + False, + ), # Space in domain (but valid path) ('"https://example.com/path "', False), # Space at end of path ('"https://example.com#frag ment"', False), # Space in fragment ('"https://example.com/?q=va lue"', False), # Space in query @@ -860,7 +869,7 @@ def test_match(schema, regex, examples): ('"https://example.com/pat\nh"', False), # Newline in path ('"https://example.com#frag\nment"', False), # Newline in fragment ('"https://example.com/?q=val\nue"', False), # Newline in query - ] + ], ), ], ) From 94f0ea63ffc30ba1b2095d0a21a9a21d021e46f2 Mon Sep 17 00:00:00 2001 From: muhtasham Date: Mon, 30 Sep 2024 01:23:44 +0200 Subject: [PATCH 5/5] refactor test_json_schema.py --- tests/fsm/test_json_schema.py | 90 +++++++++++++++++------------------ 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/tests/fsm/test_json_schema.py b/tests/fsm/test_json_schema.py index f77d0fa88..c3f481642 100644 --- a/tests/fsm/test_json_schema.py +++ b/tests/fsm/test_json_schema.py @@ -747,6 +747,51 @@ def test_match_number(pattern, does_match): ("{ }", True), ], ), + # URI + ( + {"title": "Foo", "type": "string", "format": "uri"}, + URI, + [ + ('"https://www.example.com"', True), + ('"http://example.com"', True), + ('"https://subdomain.example.co.uk/path?query=value#fragment"', True), + ('"https://example.com:8080"', True), # With port + ('"http://123.45.67.89"', True), # IP address + ( + '"https://example.com/path/to/resource.html"', + True, + ), # With file extension + ('"https://user:pass@example.com"', True), # With basic auth + ( + '"https://example.com/?q=test&r=123"', + True, + ), # With multiple query parameters + ('"https://example.co.uk"', True), # Different TLD + ('"https://xn--bcher-kva.example"', True), # Punycode domain + ('"https://example.com/path%20with%20spaces"', True), # Encoded spaces + ('"ftp://example.com"', False), # FTP protocol + ('"not a uri"', False), + ('"https://"', False), # Incomplete URI + ('""', False), # Empty string + ("https://www.example.com", False), # Missing quotes + ('"http:/example.com"', False), # Missing slash after protocol + ('"https://example.com:abc"', False), # Invalid port + ('"https://exa mple.com"', False), # Space in domain + ('"https://.example.com"', False), # Domain starting with dot + ('"https://example..com"', False), # Consecutive dots in domain + ( + '"https://exam ple.com/path"', + False, + ), # Space in domain (but valid path) + ('"https://example.com/path "', False), # Space at end of path + ('"https://example.com#frag ment"', False), # Space in fragment + ('"https://example.com/?q=va lue"', False), # Space in query + ('"https://exa\nmple.com"', False), # Newline in domain + ('"https://example.com/pat\nh"', False), # Newline in path + ('"https://example.com#frag\nment"', False), # Newline in fragment + ('"https://example.com/?q=val\nue"', False), # Newline in query + ], + ), ], ) def test_match(schema, regex, examples): @@ -826,51 +871,6 @@ def test_match(schema, regex, examples): ('"15:30:00+01:00"', False), # incorrect separator ], ), - # URI - ( - {"title": "Foo", "type": "string", "format": "uri"}, - URI, - [ - ('"https://www.example.com"', True), - ('"http://example.com"', True), - ('"https://subdomain.example.co.uk/path?query=value#fragment"', True), - ('"https://example.com:8080"', True), # With port - ('"http://123.45.67.89"', True), # IP address - ( - '"https://example.com/path/to/resource.html"', - True, - ), # With file extension - ('"https://user:pass@example.com"', True), # With basic auth - ( - '"https://example.com/?q=test&r=123"', - True, - ), # With multiple query parameters - ('"https://example.co.uk"', True), # Different TLD - ('"https://xn--bcher-kva.example"', True), # Punycode domain - ('"https://example.com/path%20with%20spaces"', True), # Encoded spaces - ('"ftp://example.com"', False), # FTP protocol - ('"not a uri"', False), - ('"https://"', False), # Incomplete URI - ('""', False), # Empty string - ("https://www.example.com", False), # Missing quotes - ('"http:/example.com"', False), # Missing slash after protocol - ('"https://example.com:abc"', False), # Invalid port - ('"https://exa mple.com"', False), # Space in domain - ('"https://.example.com"', False), # Domain starting with dot - ('"https://example..com"', False), # Consecutive dots in domain - ( - '"https://exam ple.com/path"', - False, - ), # Space in domain (but valid path) - ('"https://example.com/path "', False), # Space at end of path - ('"https://example.com#frag ment"', False), # Space in fragment - ('"https://example.com/?q=va lue"', False), # Space in query - ('"https://exa\nmple.com"', False), # Newline in domain - ('"https://example.com/pat\nh"', False), # Newline in path - ('"https://example.com#frag\nment"', False), # Newline in fragment - ('"https://example.com/?q=val\nue"', False), # Newline in query - ], - ), ], ) def test_format(schema, regex, examples):