Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix invalid regex in unconstrained arrays for json_schema.py #919

Merged
merged 1 commit into from
May 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 15 additions & 8 deletions outlines/fsm/json_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,15 +297,22 @@ def to_regex(
# Here we need to make the choice to exclude generating list of objects
# if the specification of the object is not given, even though a JSON
# object that contains an object here would be valid under the specification.
types = [
legal_types = [
{"type": "boolean"},
{"type": "null"},
{"type": "number"},
{"type": "integer"},
{"type": "string"},
]
regexes = [to_regex(resolver, t, whitespace_pattern) for t in types]
return rf"\[{whitespace_pattern}({'|'.join(regexes)})(,{whitespace_pattern}({'|'.join(regexes)})){num_repeats}){allow_empty}{whitespace_pattern}\]"
depth = instance.get("depth", 2)
if depth > 0:
legal_types.append({"type": "object", "depth": depth - 1})
legal_types.append({"type": "array", "depth": depth - 1})

regexes = [
to_regex(resolver, t, whitespace_pattern) for t in legal_types
]
return rf"\[{whitespace_pattern}({'|'.join(regexes)})(,{whitespace_pattern}({'|'.join(regexes)})){num_repeats}{allow_empty}{whitespace_pattern}\]"
Copy link

@ekagra-ranjan ekagra-ranjan Jun 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@lapp0 - Right now the array will have at least one element. I think there is missing a parentheses to denote that we want the entire array to be optional and not just the elements which start with , i.e.

rf"\[({whitespace_pattern}({'|'.join(regexes)})(,{whitespace_pattern}({'|'.join(regexes)})){num_repeats}){allow_empty}{whitespace_pattern}\]"

Note the starting bracket after rf"\[ and closing bracket before {allow_empty}.
Wdyt?

Copy link
Contributor Author

@lapp0 lapp0 Jun 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've refactored a lot of this code so we can do constrained yaml. It's a lot more liberal with parenthesis use. Do you think this makes sense to fix the problem?

    def format_object_with_additional_properties(self, value_pattern, min_properties=None, max_properties=None):
        if max_properties == 0:
            return rf"\{{{self.ws}\}}"

        inner = self._regex_repeat_elem(
            elem_pattern=f"{STRING}{self.ws}:{self.ws}{value_pattern}",
            separator_pattern=f"{self.ws},{self.ws}",
            min_elem=min_properties,
            max_elem=max_properties,
            pad=self.ws
        )
        return rf'{{{inner}}}'
    def _regex_repeat_elem(self, elem_pattern, separator_pattern, min_elem=None, max_elem=None, pad=""):
        """Creates a pattern allowing between min_elem and max_elem occurrences of elem_pattern"""
        if max_elem == 0:
            return ""

        base_pattern = f"{elem_pattern}"
        suffix_pattern = f"(?:{separator_pattern}{elem_pattern})"

        min_suffix_repeats = "" if min_elem is None else max(0, int(min_elem) - 1)
        max_suffix_repeats = "" if max_elem is None else max_elem - 1

        if max_suffix_repeats == 0:
            pattern = base_pattern
        else:
            pattern = f"{base_pattern}({suffix_pattern}){{{min_suffix_repeats},{max_suffix_repeats}}}"

        padded_pattern = f"({pad}{pattern}{pad})"

        if not min_elem:
            return f"({padded_pattern}|{pad})"
        else:
            return padded_pattern

Copy link

@ekagra-ranjan ekagra-ranjan Jun 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I didnt get you. I just meant that right now the regex is this:

rf"\[{whitespace_pattern}({'|'.join(regexes)})(,{whitespace_pattern}({'|'.join(regexes)})){num_repeats}{allow_empty}{whitespace_pattern}\]"

and I think it should be this:

rf"\[({whitespace_pattern}({'|'.join(regexes)})(,{whitespace_pattern}({'|'.join(regexes)})){num_repeats}){allow_empty}{whitespace_pattern}\]"

For e.g its this [(true|false)(,(true|false)){0,}?] vs [((true|false)(,(true|false)){0,})?]. The former wont allow [] whereas the latter will allow it

Does this make sense?


elif instance_type == "object":
# pattern for json object with values defined by instance["additionalProperties"]
Expand All @@ -328,20 +335,20 @@ def to_regex(
# unset or True, it is unconstrained object.
# We handle this by setting additionalProperties to anyOf: {all types}

legal_values = [
legal_types = [
{"type": "string"},
{"type": "number"},
{"type": "boolean"},
{"type": "null"}
# { "type": "array" }, # TODO: enable arrays within object-types
{"type": "null"},
]

# We set the object depth to 2 to keep the expression finite, but the "depth"
# key is not a true component of the JSON Schema specification.
depth = instance.get("depth", 2)
if depth > 0:
legal_values.append({"type": "object", "depth": depth - 1})
additional_properties = {"anyOf": legal_values}
legal_types.append({"type": "object", "depth": depth - 1})
legal_types.append({"type": "array", "depth": depth - 1})
additional_properties = {"anyOf": legal_types}

value_pattern = to_regex(
resolver, additional_properties, whitespace_pattern
Expand Down
40 changes: 40 additions & 0 deletions tests/fsm/test_json_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -719,6 +719,46 @@ def test_format(schema, regex, examples):
('{"time":20:20:39Z}', False), # missing quotes for value
],
),
# Unconstrained Object
(
{
"title": "Foo",
"type": "object",
},
[
("{}", True),
('{"a": 1, "b": null}', True),
('{"a": {"z": {"g": 4}}, "b": null}', True),
("1234", False), # not an object
('["a", "a"]', False), # not an array

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@lapp0 are we missing a test case for having arrays inside object?

Copy link
Contributor Author

@lapp0 lapp0 Jun 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the comments! It's important that we create patterns that follow json-schema spec expectations. I'll ensure the two conditions you've referenced are tested in the refactor here lapp0#36

],
),
# Unconstrained Array
(
{
"type": "array",
},
[
("[1, {}, false]", True),
("[{}]", True),
('[{"a": {"z": "q"}, "b": null}]', True),
('[{"a": [1, 2, true], "b": null}]', True),
('[{"a": [1, 2, true], "b": {"a": "b"}}, 1, true, [1, [2]]]', True),
# too deep, default unconstrained depth limit = 2
(
'[{"a": [1, 2, true], "b": {"a": "b"}}, 1, true, [1, [2, [3]]]]',
False,
),
('[{"a": {"z": {"g": 4}}, "b": null}]', False),
("[[[[1]]]]", False),
# not an array
("{}", False),
('{"a": 1, "b": null}', False),
('{"a": {"z": {"g": 4}}, "b": null}', False),
("1234", False), # not an array
('{"a": "a"}', False), # not an array
],
),
],
)
def test_format_without_regex(schema, examples):
Expand Down
Loading