From e8cd43a9bce7f721076a67daa9a9d522c060f541 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Louf?= Date: Thu, 17 Aug 2023 00:32:35 +0200 Subject: [PATCH 1/2] Escape special characters in JSON structure --- outlines/text/json_schema.py | 9 +++++---- tests/text/test_json_schema.py | 17 +++++++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/outlines/text/json_schema.py b/outlines/text/json_schema.py index 8b96c9dca..ae917a322 100644 --- a/outlines/text/json_schema.py +++ b/outlines/text/json_schema.py @@ -1,5 +1,6 @@ import itertools import json +import re from typing import Dict STRING = r'".*"' @@ -192,7 +193,7 @@ def match_step_to_regex(step): Parameters ---------- step: - A string that represents the schema's structure, or a dictionnary + A string that represents the schema's structure, or a dictionary that represents a field in the schema. Returns @@ -203,13 +204,13 @@ def match_step_to_regex(step): """ match step: case str() as step: - return step + return re.escape(step) case {"enum": choices, "type": "string"}: - choices = [f'"{choice}"' for choice in choices] + choices = [f'"{re.escape(choice)}"' for choice in choices] return f"({'|'.join(choices)})" case {"enum": choices}: - choices = [str(choice) for choice in choices] + choices = [re.escape(str(choice)) for choice in choices] return f"({'|'.join(choices)})" case {"type": "array", "items": items}: diff --git a/tests/text/test_json_schema.py b/tests/text/test_json_schema.py index 4e3018e0d..1d46cdf6a 100644 --- a/tests/text/test_json_schema.py +++ b/tests/text/test_json_schema.py @@ -299,6 +299,11 @@ def test_match_number(pattern, does_match): '("Marc"|"Jean")', [('"Marc"', True), ('"Jean"', True), ('"John"', False)], ), + ( + {"title": "Foo", "enum": [".*", r"\s*"], "type": "string"}, + r'("\.\*"|"\\s\*")', + [('".*"', True), (r'"\s*"', True), (r'"\.\*"', False)], + ), ( {"title": "Foo", "enum": [0, 1], "type": "integer"}, "(0|1)", @@ -310,7 +315,7 @@ def test_match_number(pattern, does_match): "type": "object", "properties": {"count": {"title": "Count", "type": "integer"}}, }, - '{\n "count": ' + INTEGER + "\n}", + '\\{\\\n\\ \\ "count":\\ ' + INTEGER + "\\\n\\}", [('{\n "count": 100\n}', True)], ), ( @@ -339,16 +344,20 @@ def test_match_number(pattern, does_match): } }, }, - '{\n "fuzz": {\n "spam": ' + INTEGER + "\n }\n}", + '\\{\\\n\\ \\ "fuzz":\\ \\{\\\n\\ \\ \\ \\ "spam":\\ ' + + INTEGER + + "\\\n\\ \\ \\}\\\n\\}", [('{\n "fuzz": {\n "spam": 100\n }\n}', True)], ), ], ) def test_match(step, regex, examples): - assert match_step_to_regex(step) == regex + test_regex = match_step_to_regex(step) + + assert test_regex == regex for string, does_match in examples: - match = re.fullmatch(regex, string) + match = re.fullmatch(test_regex, string) if does_match: assert match[0] == string assert match.span() == (0, len(string)) From f1e04ac271eb208f3e4cfd56713a8037001d85dd Mon Sep 17 00:00:00 2001 From: "Brandon T. Willard" Date: Sat, 9 Sep 2023 20:23:12 -0500 Subject: [PATCH 2/2] Fix unescaped string case in JSON string regex --- outlines/text/json_schema.py | 7 ++++--- tests/text/test_json_schema.py | 10 ++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/outlines/text/json_schema.py b/outlines/text/json_schema.py index ae917a322..c076a2e4e 100644 --- a/outlines/text/json_schema.py +++ b/outlines/text/json_schema.py @@ -3,7 +3,8 @@ import re from typing import Dict -STRING = r'".*"' +STRING_INNER = r'(?:[^"\\]|\\.)' +STRING = f'"{STRING_INNER}*"' INTEGER = r"(0|[1-9][0-9]*)" NUMBER = rf"(-)?({INTEGER})(\.[0-9]+)?([eE][+-][0-9]+)?" BOOLEAN = r"(true|false)" @@ -225,9 +226,9 @@ def match_step_to_regex(step): return regex_str case {"type": "string", "maxLength": max_length}: - return f'".{{,{max_length}}}"' + return f'"{STRING_INNER}{{,{max_length}}}"' case {"type": "string", "minLength": min_length}: - return f'".{{{min_length},}}"' + return f'"{STRING_INNER}{{{min_length},}}"' case {"type": field_type}: return type_to_regex[field_type] diff --git a/tests/text/test_json_schema.py b/tests/text/test_json_schema.py index 1d46cdf6a..f8814aeba 100644 --- a/tests/text/test_json_schema.py +++ b/tests/text/test_json_schema.py @@ -12,6 +12,7 @@ NULL, NUMBER, STRING, + STRING_INNER, build_schedule_from_schema, match_step_to_regex, ) @@ -258,13 +259,13 @@ def test_match_number(pattern, does_match): ), ( {"title": "Foo", "type": "string", "maxLength": 3}, - '".{,3}"', - [('"ab"', True), ('"abcd"', False)], + f'"{STRING_INNER}{{,3}}"', + [('"ab"', True), ('"a""', False), ('"abcd"', False)], ), ( {"title": "Foo", "type": "string", "minLength": 3}, - '".{3,}"', - [('"ab"', False), ('"abcd"', True)], + f'"{STRING_INNER}{{3,}}"', + [('"ab"', False), ('"abcd"', True), ('"abc""', False)], ), ( {"title": "Foo", "type": "boolean"}, @@ -290,6 +291,7 @@ def test_match_number(pattern, does_match): f"({STRING}|{NUMBER})", [ ('"string"', True), + ('"st"ring"', False), ("1000", True), ("true", False), ],