diff --git a/outlines/fsm/json_schema.py b/outlines/fsm/json_schema.py index 095e41115..647c95a22 100644 --- a/outlines/fsm/json_schema.py +++ b/outlines/fsm/json_schema.py @@ -25,10 +25,10 @@ "null": NULL, } -DATE_TIME = r"(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])T(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\.[0-9]{3})?(Z)?" -DATE = r"(?:\d{4})-(?:0[1-9]|1[0-2])-(?:0[1-9]|[1-2][0-9]|3[0-1])" -TIME = r"(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\.[0-9]+)?(Z)?" -UUID = r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}" +DATE_TIME = r'"(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])T(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\.[0-9]{3})?(Z)?"' +DATE = r'"(?:\d{4})-(?:0[1-9]|1[0-2])-(?:0[1-9]|[1-2][0-9]|3[0-1])"' +TIME = r'"(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\.[0-9]+)?(Z)?"' +UUID = r'"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"' format_to_regex = { "uuid": UUID, diff --git a/tests/fsm/test_json_schema.py b/tests/fsm/test_json_schema.py index f5bdd8565..f61c6a3ba 100644 --- a/tests/fsm/test_json_schema.py +++ b/tests/fsm/test_json_schema.py @@ -561,11 +561,12 @@ def test_match(schema, regex, examples): {"title": "Foo", "type": "string", "format": "uuid"}, UUID, [ - ("123e4567-e89b-12d3-a456-426614174000", True), - ("123e4567-e89b-12d3-a456-42661417400", False), - ("123e4567-e89b-12d3-a456-42661417400g", False), - ("123e4567-e89b-12d3-a456-42661417400-", False), - ("", False), + ('123e4567-e89b-12d3-a456-426614174000', False), + ('"123e4567-e89b-12d3-a456-426614174000"', True), + ('"123e4567-e89b-12d3-a456-42661417400"', False), + ('"123e4567-e89b-12d3-a456-42661417400g"', False), + ('"123e4567-e89b-12d3-a456-42661417400-"', False), + ('""', False), ], ), # DATE-TIME @@ -573,13 +574,14 @@ def test_match(schema, regex, examples): {"title": "Foo", "type": "string", "format": "date-time"}, DATE_TIME, [ - ("2018-11-13T20:20:39Z", True), - ("2016-09-18T17:34:02.666Z", True), -
("2008-05-11T15:30:00Z", True), - ("2021-01-01T00:00:00", True), - ("2022-01-10 07:19:30", False), # missing T - ("2022-12-10T10-04-29", False), # incorrect separator - ("2023-01-01", False), + ('2018-11-13T20:20:39Z', False), + ('"2018-11-13T20:20:39Z"', True), + ('"2016-09-18T17:34:02.666Z"', True), + ('"2008-05-11T15:30:00Z"', True), + ('"2021-01-01T00:00:00"', True), + ('"2022-01-10 07:19:30"', False), # missing T + ('"2022-12-10T10-04-29"', False), # incorrect separator + ('"2023-01-01"', False), ], ), # DATE @@ -587,12 +589,13 @@ def test_match(schema, regex, examples): {"title": "Foo", "type": "string", "format": "date"}, DATE, [ - ("2018-11-13", True), - ("2016-09-18", True), - ("2008-05-11", True), - ("2015-13-01", False), # incorrect month - ("2022-01", False), # missing day - ("2022/12/01", False), # incorrect separator" + ('2018-11-13', False), + ('"2018-11-13"', True), + ('"2016-09-18"', True), + ('"2008-05-11"', True), + ('"2015-13-01"', False), # incorrect month + ('"2022-01"', False), # missing day + ('"2022/12/01"', False), # incorrect separator ], ), # TIME @@ -600,13 +603,14 @@ def test_match(schema, regex, examples): {"title": "Foo", "type": "string", "format": "time"}, TIME, [ - ("20:20:39Z", True), - ("15:30:00Z", True), - ("25:30:00", False), # incorrect hour - ("15:30", False), # missing seconds - ("15:30:00.000", False), # missing Z - ("15-30-00", False), # incorrect separator - ("15:30:00+01:00", False), # incorrect separator + ('20:20:39Z', False), + ('"20:20:39Z"', True), + ('"15:30:00Z"', True), + ('"25:30:00"', False), # incorrect hour + ('"15:30"', False), # missing seconds + ('"15:30:00.000"', True), # fractional seconds; trailing Z is optional + ('"15-30-00"', False), # incorrect separator + ('"15:30:00+01:00"', False), # UTC offset not supported ], ), ], @@ -625,6 +629,98 @@ def test_format(schema, regex, examples): assert match is None +@pytest.mark.parametrize( + "schema,examples", + [ + # NESTED UUID + ( + { + "title": "Foo", + "type": "object", + "properties":
{"uuid": {"type": "string", "format": "uuid"}}, + }, + [ + ('{"uuid": "123e4567-e89b-12d3-a456-426614174000"}', True), + ('{"uuid":"123e4567-e89b-12d3-a456-42661417400"}', False), + ('{"uuid":"123e4567-e89b-12d3-a456-42661417400g"}', False), + ('{"uuid":"123e4567-e89b-12d3-a456-42661417400-"}', False), + ( + '{"uuid":123e4567-e89b-12d3-a456-426614174000}', + False, + ), # missing quotes for value + ('{"uuid":""}', False), + ], + ), + # NESTED DATE-TIME + ( + { + "title": "Foo", + "type": "object", + "properties": {"dateTime": {"type": "string", "format": "date-time"}}, + }, + [ + ('{"dateTime": "2018-11-13T20:20:39Z"}', True), + ('{"dateTime":"2016-09-18T17:34:02.666Z"}', True), + ('{"dateTime":"2008-05-11T15:30:00Z"}', True), + ('{"dateTime":"2021-01-01T00:00:00"}', True), + ('{"dateTime":"2022-01-10 07:19:30"}', False), # missing T + ('{"dateTime":"2022-12-10T10-04-29"}', False), # incorrect separator + ( + '{"dateTime":2018-11-13T20:20:39Z}', + False, + ), # missing quotes for value + ('{"dateTime":"2023-01-01"}', False), + ], + ), + # NESTED DATE + ( + { + "title": "Foo", + "type": "object", + "properties": {"date": {"type": "string", "format": "date"}}, + }, + [ + ('{"date": "2018-11-13"}', True), + ('{"date":"2016-09-18"}', True), + ('{"date":"2008-05-11"}', True), + ('{"date":"2015-13-01"}', False), # incorrect month + ('{"date":"2022-01"}', False), # missing day + ('{"date":"2022/12/01"}', False), # incorrect separator + ('{"date":2018-11-13}', False), # missing quotes for value + ], + ), + # NESTED TIME + ( + { + "title": "Foo", + "type": "object", + "properties": {"time": {"type": "string", "format": "time"}}, + }, + [ + ('{"time": "20:20:39Z"}', True), + ('{"time":"15:30:00Z"}', True), + ('{"time":"25:30:00"}', False), # incorrect hour + ('{"time":"15:30"}', False), # missing seconds + ('{"time":"15:30:00.000"}', True), # fractional seconds; trailing Z is optional + ('{"time":"15-30-00"}', False), # incorrect separator + ('{"time":"15:30:00+01:00"}', False), # UTC offset not supported +
('{"time":20:20:39Z}', False), # missing quotes for value + ], + ), + ], +) +def test_format_without_regex(schema, examples): + schema = json.dumps(schema) + test_regex = build_regex_from_schema(schema) + for string, does_match in examples: + match = re.fullmatch(test_regex, string) + if does_match: + assert match[0] == string + assert match.span() == (0, len(string)) + else: + assert match is None + + @pytest.mark.parametrize("whitespace_pattern", [None, r"[\n ]?", "abc"]) def test_json_schema_custom_whitespace_pattern(whitespace_pattern): """assert whitespace_pattern setting respected"""