From 99fa1ec674b224e57936393dc5e7c5ba95a6f663 Mon Sep 17 00:00:00 2001 From: Andrew Lapp Date: Tue, 1 Oct 2024 18:38:16 -0400 Subject: [PATCH] Refactor json-to-regex, subclass for yaml generation --- .pre-commit-config.yaml | 2 +- outlines/fsm/json_schema.py | 1099 +++++++++++++++++++--------- outlines/generate/json.py | 26 +- pyproject.toml | 2 + tests/fsm/test_json_schema.py | 258 ++++++- tests/fsm/test_json_schema_full.py | 130 ++++ tests/generate/test_generate.py | 19 +- 7 files changed, 1128 insertions(+), 408 deletions(-) create mode 100644 tests/fsm/test_json_schema_full.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a9039b605..de10b27cd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,4 +30,4 @@ repos: - id: mypy args: [--allow-redefinition] exclude: ^examples/ - additional_dependencies: [types-tqdm, types-Pillow] + additional_dependencies: [types-tqdm, types-Pillow, types-PyYAML] diff --git a/outlines/fsm/json_schema.py b/outlines/fsm/json_schema.py index 98d2de59c..a33c9c924 100644 --- a/outlines/fsm/json_schema.py +++ b/outlines/fsm/json_schema.py @@ -1,9 +1,12 @@ +import dataclasses import inspect +import itertools import json import re import warnings -from typing import Callable, Optional, Tuple, Type, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union +import yaml from jsonschema.protocols import Validator from pydantic import BaseModel, create_model from referencing import Registry, Resource @@ -20,28 +23,23 @@ NULL = r"null" WHITESPACE = r"[ ]?" -type_to_regex = { - "string": STRING, - "integer": INTEGER, - "number": NUMBER, - "boolean": BOOLEAN, - "null": NULL, -} - DATE_TIME = r'"(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])T(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\.[0-9]{3})?(Z)?"' DATE = r'"(?:\d{4})-(?:0[1-9]|1[0-2])-(?:0[1-9]|[1-2][0-9]|3[0-1])"' TIME = r'"(2[0-3]|[01][0-9]):([0-5][0-9]):([0-5][0-9])(\\.[0-9]+)?(Z)?"' UUID = r'"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"' -format_to_regex = { - "uuid": UUID, - "date-time": DATE_TIME, - "date": DATE, - "time": TIME, -} + +def load_yaml(yaml_str: str) -> Any: + """Parse a YAML string and return the corresponding Python object.""" + return yaml.safe_load(yaml_str) -def build_regex_from_schema(schema: str, whitespace_pattern: Optional[str] = None): +def build_regex_from_schema( + schema: str, + whitespace_pattern: Optional[str] = None, + mode: str = "json", + strict_json_schema_subset: bool = True, +): """Turn a JSON schema into a regex that matches any JSON object that follows this schema. @@ -60,6 +58,12 @@ def build_regex_from_schema(schema: str, whitespace_pattern: Optional[str] = Non whitespace_pattern Pattern to use for JSON syntactic whitespace (doesn't impact string literals) Example: allow only a single space or newline with `whitespace_pattern=r"[\n ]?"` + mode + Either `json` or `yaml`, determines the structure of the generated output + strict_json_schema_subset + For `items` and `properties`, the JSON Schema spec by default allows these to be unconstrained + if not set. This is usually undesired behavior, so by default strict_json_schema_subset is True. 
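+
+    For example, a boolean schema compiles to the same core pattern in both
+    modes (illustrative; exact patterns may differ between versions):
+
+        build_regex_from_schema('{"type": "boolean"}')               # r"(true|false)"
+        build_regex_from_schema('{"type": "boolean"}', mode="yaml")  # r"(true|false)"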
+ Returns ------- @@ -83,7 +87,21 @@ def build_regex_from_schema(schema: str, whitespace_pattern: Optional[str] = Non resolver = registry.resolver() content = schema.contents - return to_regex(resolver, content, whitespace_pattern) + + if mode == "json": + return JSONSchemaRegexGenerator( + resolver, + whitespace_pattern, + strict_json_schema_subset=strict_json_schema_subset, + ).to_regex(content) + elif mode == "yaml": + return YAMLRegexGenerator( + resolver, + whitespace_pattern, + strict_json_schema_subset=strict_json_schema_subset, + ).to_regex(content) + else: + raise ValueError(f"invalid mode: {mode}") def convert_json_schema_to_str(json_schema: Union[dict, str, Type[BaseModel]]) -> str: @@ -119,18 +137,6 @@ def convert_json_schema_to_str(json_schema: Union[dict, str, Type[BaseModel]]) - return schema_str -def _get_num_items_pattern(min_items, max_items, whitespace_pattern): - # Helper function for arrays and objects - min_items = int(min_items or 0) - if max_items is None: - return rf"{{{max(min_items - 1, 0)},}}" - else: - max_items = int(max_items) - if max_items < 1: - return None - return rf"{{{max(min_items - 1, 0)},{max_items - 1}}}" - - def validate_quantifiers( min_bound: Optional[str], max_bound: Optional[str], start_offset: int = 0 ) -> Tuple[str, str]: @@ -172,16 +178,58 @@ def validate_quantifiers( return min_bound, max_bound -def to_regex( - resolver: Resolver, instance: dict, whitespace_pattern: Optional[str] = None -): +def get_schema_from_signature(fn: Callable) -> str: + """Turn a function signature into a JSON schema. + + Every JSON object valid to the output JSON Schema can be passed + to `fn` using the ** unpacking syntax. + + """ + signature = inspect.signature(fn) + arguments = {} + for name, arg in signature.parameters.items(): + if arg.annotation == inspect._empty: + raise ValueError("Each argument must have a type annotation") + else: + arguments[name] = (arg.annotation, ...) + + try: + fn_name = fn.__name__ + except Exception as e: + fn_name = "Arguments" + warnings.warn( + f"The function name could not be determined. Using default name 'Arguments' instead. For debugging, here is exact error:\n{e}", + category=UserWarning, + ) + model = create_model(fn_name, **arguments) + + return model.model_json_schema() + + +@dataclasses.dataclass +class Context: + """Context for json schema rule application""" + + recursion_depth: int = 0 + nesting_level: int = 0 + + def __init__(self, **kwargs): + self.__dict__.update(kwargs) + + def increment(self, attr: str, value: int = 1) -> "Context": + """Return a **new** Context with the specified attribute incremented by `value`""" + return dataclasses.replace(self, **{attr: getattr(self, attr) + value}) + + def __repr__(self): + return f"Context({self.__dict__})" + + +class JSONSchemaRegexGenerator: """Translate a JSON Schema instance into a regex that validates the schema. Note ---- Many features of JSON schema are missing: - - Handle `additionalProperties` keyword - - Handle types defined as a list - Handle constraints on numbers - Handle special patterns: `date`, `uri`, etc. 
@@ -191,362 +239,707 @@ def to_regex(
     ----------
     resolver
         An object that resolves references to other instances within a schema
-    instance
-        The instance to translate
     whitespace_pattern
         Pattern to use for JSON syntactic whitespace (doesn't impact string literals)
         Example: allow only a single space or newline with `whitespace_pattern=r"[\n ]?"`
+    max_nesting_level
+        How many levels deep unconstrained objects and arrays may nest within the
+        generated pattern.
+    strict_json_schema_subset
+        For `items` and `properties`, the JSON Schema spec by default allows these to be unconstrained
+        if not set. This is usually undesired behavior, so by default strict_json_schema_subset is True.
     """
-    # set whitespace pattern
-    if whitespace_pattern is None:
-        whitespace_pattern = WHITESPACE
+    # Primitives whose patterns are never affected by parameters
+    STATIC_PRIMATIVES = {"boolean", "null"}
+    # Default patterns for primitives (when no parameters are provided)
+    FORMAT_PRIMATIVE = {
+        "null": NULL,
+        "boolean": BOOLEAN,
+        "number": NUMBER,
+        "integer": INTEGER,
+        "string": STRING,
+    }
+    FORMAT_STRING = {
+        "uuid": UUID,
+        "date-time": DATE_TIME,
+        "date": DATE,
+        "time": TIME,
+    }
+
+    def __init__(
+        self,
+        resolver: Resolver,
+        whitespace_pattern: Optional[str] = None,
+        max_nesting_level: int = 2,
+        strict_json_schema_subset: bool = True,
+    ):
+        self.resolver = resolver
+        self.ws = WHITESPACE if whitespace_pattern is None else whitespace_pattern
+        self.max_nesting_level = max_nesting_level
+        self.strict_json_schema_subset = strict_json_schema_subset
+
+    def _validate_node(self, node: Any, ctx: Context):
+        """Validate the JSON Schema node for unsupported features and recursion limits."""
+        if ctx.recursion_depth > 256:
+            raise NotImplementedError(
+                "Recursive schemas aren't currently available with Outlines."
+            )
 
-    if instance == {}:
-        # JSON Schema Spec: Empty object means unconstrained, any json type is legal
-        types = [
-            {"type": "boolean"},
-            {"type": "null"},
-            {"type": "number"},
-            {"type": "integer"},
-            {"type": "string"},
-            {"type": "array"},
-            {"type": "object"},
+        if node is True:
+            return
+
+        if node is False:
+            # this should be implemented alongside `visit_not(...)`
+            raise NotImplementedError("schema = False isn't available with Outlines.")
+
+        # keys with no handling
+        not_implemented_keys = [
+            "dependentSchemas",
+            "unevaluatedItems",
+            "unevaluatedProperties",
+            "contains",
+            "patternProperties",
+            "maximum",
+            "default",
+            "__proto__",
         ]
-        regexes = [to_regex(resolver, t, whitespace_pattern) for t in types]
-        regexes = [rf"({r})" for r in regexes]
-        return rf"{'|'.join(regexes)}"
-
-    elif "properties" in instance:
-        regex = ""
-        regex += r"\{"
-        properties = instance["properties"]
-        required_properties = instance.get("required", [])
-        is_required = [item in required_properties for item in properties]
-        # If at least one property is required, we include the one in the lastest position
-        # without any comma.
-        # For each property before it (optional or required), we add with a comma after the property.
-        # For each property after it (optional), we add with a comma before the property.
- if any(is_required): - last_required_pos = max([i for i, value in enumerate(is_required) if value]) - for i, (name, value) in enumerate(properties.items()): - subregex = f'{whitespace_pattern}"{re.escape(name)}"{whitespace_pattern}:{whitespace_pattern}' - subregex += to_regex(resolver, value, whitespace_pattern) - if i < last_required_pos: - subregex = f"{subregex}{whitespace_pattern}," - elif i > last_required_pos: - subregex = f"{whitespace_pattern},{subregex}" - regex += subregex if is_required[i] else f"({subregex})?" - # If no property is required, we have to create a possible pattern for each property in which - # it's the last one necessarilly present. Then, we add the others as optional before and after - # following the same strategy as described above. - # The whole block is made optional to allow the case in which no property is returned. - else: - property_subregexes = [] - for i, (name, value) in enumerate(properties.items()): - subregex = f'{whitespace_pattern}"{name}"{whitespace_pattern}:{whitespace_pattern}' - subregex += to_regex(resolver, value, whitespace_pattern) - property_subregexes.append(subregex) - possible_patterns = [] - for i in range(len(property_subregexes)): - pattern = "" - for subregex in property_subregexes[:i]: - pattern += f"({subregex}{whitespace_pattern},)?" - pattern += property_subregexes[i] - for subregex in property_subregexes[i + 1 :]: - pattern += f"({whitespace_pattern},{subregex})?" - possible_patterns.append(pattern) - regex += f"({'|'.join(possible_patterns)})?" - - regex += f"{whitespace_pattern}" + r"\}" - - return regex - - # To validate against allOf, the given data must be valid against all of the - # given subschemas. - elif "allOf" in instance: - subregexes = [ - to_regex(resolver, t, whitespace_pattern) for t in instance["allOf"] - ] - subregexes_str = [f"{subregex}" for subregex in subregexes] - return rf"({''.join(subregexes_str)})" - - # To validate against `anyOf`, the given data must be valid against - # any (one or more) of the given subschemas. - elif "anyOf" in instance: - subregexes = [ - to_regex(resolver, t, whitespace_pattern) for t in instance["anyOf"] + # keys coinciding within same object not handled + not_implemented_key_pairs = [ + ("allOf", "anyOf"), + ("properties", "anyOf"), ] - return rf"({'|'.join(subregexes)})" - # To validate against oneOf, the given data must be valid against exactly - # one of the given subschemas. - elif "oneOf" in instance: - subregexes = [ - to_regex(resolver, t, whitespace_pattern) for t in instance["oneOf"] - ] + node_invalid_keys = set(node) & set(not_implemented_keys) + if node_invalid_keys: + raise NotImplementedError( + f"Cannot handle the keys: {node_invalid_keys}. Please open an Outlines issue." + ) + for k in not_implemented_key_pairs: + if not (set(k) - set(node.keys())): + raise NotImplementedError( + f"Cannot simultaneously use the keys: {k}. Please open an Outlines issue." 
+ ) - xor_patterns = [f"(?:{subregex})" for subregex in subregexes] + def to_regex(self, node: Any, ctx: Optional[Context] = None): + """Convert a JSON Schema node into a regular expression pattern.""" + ctx = ( + ctx.increment("recursion_depth") + if ctx + else Context(nesting_level=0, recursion_depth=0) + ) + self._validate_node(node, ctx) + + # Handle unconstrained nodes + if node in ({}, True): + return self.visit_unconstrained({}, ctx) + + # Handle multiple types (via anyOf) + if isinstance(node.get("type"), list): + subpatterns = [self.to_regex({"type": t}, ctx) for t in node["type"]] + return self.format_anyOf(subpatterns) + + # Visit based on node attributes + node_attr_to_visitor = { + "$ref": self.visit_ref, + "allOf": self.visit_allOf, + "anyOf": self.visit_anyOf, + "oneOf": self.visit_oneOf, + "enum": self.visit_enum, + "prefixItems": self.visit_array, + "items": self.visit_array, + "properties": self.visit_object, + "const": self.visit_string, + "pattern": self.visit_string, + } + for attr, visitor in node_attr_to_visitor.items(): + if attr in node: + return ( + f"({visitor(node, ctx)})" + if ctx.nesting_level > 0 + else visitor(node, ctx) + ) - return rf"({'|'.join(xor_patterns)})" + # Visit based on type + type_to_visitor = { + "number": self.visit_number, + "integer": self.visit_integer, + "string": self.visit_string, + "object": self.visit_object, + "array": self.visit_array, + } + if node.get("type") in self.STATIC_PRIMATIVES: + return self.FORMAT_PRIMATIVE[node["type"]] + if node.get("type") in type_to_visitor: + return type_to_visitor[node["type"]](node, ctx) + + return self.visit_notimplemented(node, ctx) + + ########## + # VISITORS + ########## + def visit_ref(self, node: Any, ctx: Context): + path = node["$ref"] + if path == "#": + raise NotImplementedError("Recursive schemas aren't supported") + new_node = self.resolver.lookup(path).contents + return self.to_regex(new_node, ctx) + + def visit_object(self, node: Any, ctx: Context): + """ + Handle JSON Schema `object` rules + + additionalProperties handling: + pattern for json object with values defined by instance["additionalProperties"] + enforces value type constraints recursively, "minProperties", and "maxProperties" + doesn't enforce "required", "dependencies", "propertyNames" "any/all/on Of" + + TODO: the json-schema compliant implementation is as follows: + - properties and additionalProperties can both be set simultaneously + - min/maxProperties can be specified even if properties has constraints set + """ + value_ctx = ctx.increment("nesting_level") + + # TODO: handling for node["unevaluatedProperties"] + properties = node.get("properties", not self.strict_json_schema_subset) + properties = {} if properties is True else properties + required_properties = node.get("required", []) + additional_properties = node.get("additionalProperties") + + if properties and additional_properties: + raise NotImplementedError( + "`properties` & `additionalProperties != False` not implemented. Please open an Outlines issue." 
+ ) - # Create pattern for Tuples, per JSON Schema spec, `prefixItems` determines types at each idx - elif "prefixItems" in instance: - element_patterns = [ - to_regex(resolver, t, whitespace_pattern) for t in instance["prefixItems"] - ] - comma_split_pattern = rf"{whitespace_pattern},{whitespace_pattern}" - tuple_inner = comma_split_pattern.join(element_patterns) - return rf"\[{whitespace_pattern}{tuple_inner}{whitespace_pattern}\]" - - # The enum keyword is used to restrict a value to a fixed set of values. It - # must be an array with at least one element, where each element is unique. - elif "enum" in instance: - choices = [] - for choice in instance["enum"]: - if type(choice) in [int, float, bool, type(None), str]: - choices.append(re.escape(json.dumps(choice))) + elif properties and "minProperties" in node or "maxProperties" in node: + raise NotImplementedError( + "properties and minProperties / maxProperties not implemented. Please open an Outlines issue." + ) + + elif properties: + property_details = [ + { + "key_pattern": self.format_literal(name), + "value_pattern": self.to_regex(value, value_ctx), + "is_required": name in required_properties, + } + for name, value in properties.items() + ] + if any(pd["is_required"] for pd in property_details): + return self.format_object_with_required_properties( + property_details, ctx + ) else: - raise TypeError(f"Unsupported data type in enum: {type(choice)}") - return f"({'|'.join(choices)})" + return self.format_object_properties_all_optional(property_details, ctx) + + elif additional_properties is False: + return self.format_empty_object() - elif "const" in instance: - const = instance["const"] - if type(const) in [int, float, bool, type(None), str]: - const = re.escape(json.dumps(const)) else: - raise TypeError(f"Unsupported data type in const: {type(const)}") - return const - - elif "$ref" in instance: - path = f"{instance['$ref']}" - instance = resolver.lookup(path).contents - return to_regex(resolver, instance, whitespace_pattern) - - # The type keyword may either be a string or an array: - # - If it's a string, it is the name of one of the basic types. - # - If it is an array, it must be an array of strings, where each string is - # the name of one of the basic types, and each element is unique. In this - # case, the JSON snippet is valid if it matches any of the given types. 
- elif "type" in instance: - instance_type = instance["type"] - if instance_type == "string": - if "maxLength" in instance or "minLength" in instance: - max_items = instance.get("maxLength", "") - min_items = instance.get("minLength", "") - try: - if int(max_items) < int(min_items): - raise ValueError( - "maxLength must be greater than or equal to minLength" - ) # FIXME this raises an error but is caught right away by the except (meant for int("") I assume) - except ValueError: - pass - return f'"{STRING_INNER}{{{min_items},{max_items}}}"' - elif "pattern" in instance: - pattern = instance["pattern"] - if pattern[0] == "^" and pattern[-1] == "$": - return rf'("{pattern[1:-1]}")' - else: - return rf'("{pattern}")' - elif "format" in instance: - format = instance["format"] - if format == "date-time": - return format_to_regex["date-time"] - elif format == "uuid": - return format_to_regex["uuid"] - elif format == "date": - return format_to_regex["date"] - elif format == "time": - return format_to_regex["time"] - else: - raise NotImplementedError( - f"Format {format} is not supported by Outlines" - ) + if additional_properties in (True, None): + value_pattern = self.visit_unconstrained(node, value_ctx) else: - return type_to_regex["string"] - - elif instance_type == "number": - bounds = { - "minDigitsInteger", - "maxDigitsInteger", - "minDigitsFraction", - "maxDigitsFraction", - "minDigitsExponent", - "maxDigitsExponent", - } - if bounds.intersection(set(instance.keys())): - min_digits_integer, max_digits_integer = validate_quantifiers( - instance.get("minDigitsInteger"), - instance.get("maxDigitsInteger"), - start_offset=1, - ) - min_digits_fraction, max_digits_fraction = validate_quantifiers( - instance.get("minDigitsFraction"), instance.get("maxDigitsFraction") - ) - min_digits_exponent, max_digits_exponent = validate_quantifiers( - instance.get("minDigitsExponent"), instance.get("maxDigitsExponent") - ) - integers_quantifier = ( - f"{{{min_digits_integer},{max_digits_integer}}}" - if min_digits_integer or max_digits_integer - else "*" - ) - fraction_quantifier = ( - f"{{{min_digits_fraction},{max_digits_fraction}}}" - if min_digits_fraction or max_digits_fraction - else "+" - ) - exponent_quantifier = ( - f"{{{min_digits_exponent},{max_digits_exponent}}}" - if min_digits_exponent or max_digits_exponent - else "+" - ) - return rf"((-)?(0|[1-9][0-9]{integers_quantifier}))(\.[0-9]{fraction_quantifier})?([eE][+-][0-9]{exponent_quantifier})?" 
- return type_to_regex["number"] + # Object with arbitrary key name, constrained value + value_pattern = self.to_regex(additional_properties, value_ctx) + + return self.format_object_with_additional_properties( + value_pattern, + ctx, + min_properties=node.get("minProperties"), + max_properties=node.get("maxProperties"), + ) - elif instance_type == "integer": - if "minDigits" in instance or "maxDigits" in instance: - min_digits, max_digits = validate_quantifiers( - instance.get("minDigits"), instance.get("maxDigits"), start_offset=1 - ) - return rf"(-)?(0|[1-9][0-9]{{{min_digits},{max_digits}}})" - return type_to_regex["integer"] + def visit_array(self, node: Any, ctx: Context): + """Handle JSON Schema `array` rules with optional item constraints.""" - elif instance_type == "array": - num_repeats = _get_num_items_pattern( - instance.get("minItems"), instance.get("maxItems"), whitespace_pattern + # TODO: handling for node["unevaluatedItems"] + # TODO: handling for node["additionalItems"] + # TODO: handling for node["uniqueItems"] + if "uniqueItems" in node: + raise NotImplementedError( + "uniqueItems is not implemented. Please open an Outlines issue." ) - if num_repeats is None: - return rf"\[{whitespace_pattern}\]" - allow_empty = "?" if int(instance.get("minItems", 0)) == 0 else "" + elem_ctx = ctx.increment("nesting_level") + + items = node.get("items", not self.strict_json_schema_subset) - if "items" in instance: - items_regex = to_regex(resolver, instance["items"], whitespace_pattern) - return rf"\[{whitespace_pattern}(({items_regex})(,{whitespace_pattern}({items_regex})){num_repeats}){allow_empty}{whitespace_pattern}\]" + if node.get("prefixItems") is not None: + # `prefixItems` determines types at each idx, which precedes `items` rules + if items in (True, None): + suffix_elem_pattern = self.visit_unconstrained(node, elem_ctx) + elif items is False: + suffix_elem_pattern = None else: - # Here we need to make the choice to exclude generating list of objects - # if the specification of the object is not given, even though a JSON - # object that contains an object here would be valid under the specification. 
- legal_types = [ - {"type": "boolean"}, - {"type": "null"}, - {"type": "number"}, - {"type": "integer"}, - {"type": "string"}, - ] - depth = instance.get("depth", 2) - if depth > 0: - legal_types.append({"type": "object", "depth": depth - 1}) - legal_types.append({"type": "array", "depth": depth - 1}) - - regexes = [ - to_regex(resolver, t, whitespace_pattern) for t in legal_types - ] - return rf"\[{whitespace_pattern}({'|'.join(regexes)})(,{whitespace_pattern}({'|'.join(regexes)})){num_repeats}{allow_empty}{whitespace_pattern}\]" - - elif instance_type == "object": - # pattern for json object with values defined by instance["additionalProperties"] - # enforces value type constraints recursively, "minProperties", and "maxProperties" - # doesn't enforce "required", "dependencies", "propertyNames" "any/all/on Of" - num_repeats = _get_num_items_pattern( - instance.get("minProperties"), - instance.get("maxProperties"), - whitespace_pattern, + suffix_elem_pattern = self.to_regex(items, elem_ctx) + + prefix_subpatterns = [ + self.to_regex(item, elem_ctx) for item in node["prefixItems"] + ] + return self.format_prefixItems(prefix_subpatterns, ctx, suffix_elem_pattern) + + else: + # handle simple case: no prefix items + if node.get("items") in (True, None): # noqa + items_regex = self.visit_unconstrained(node, elem_ctx) + else: + items_regex = self.to_regex(node["items"], elem_ctx) + return self.format_array( + items_regex, ctx, node.get("minItems"), node.get("maxItems") ) - if num_repeats is None: - return rf"\{{{whitespace_pattern}\}}" - - allow_empty = "?" if int(instance.get("minProperties", 0)) == 0 else "" - - additional_properties = instance.get("additionalProperties") - - if additional_properties is None or additional_properties is True: - # JSON Schema behavior: If the additionalProperties of an object is - # unset or True, it is unconstrained object. - # We handle this by setting additionalProperties to anyOf: {all types} - - legal_types = [ - {"type": "string"}, - {"type": "number"}, - {"type": "boolean"}, - {"type": "null"}, - ] - - # We set the object depth to 2 to keep the expression finite, but the "depth" - # key is not a true component of the JSON Schema specification. 
- depth = instance.get("depth", 2) - if depth > 0: - legal_types.append({"type": "object", "depth": depth - 1}) - legal_types.append({"type": "array", "depth": depth - 1}) - additional_properties = {"anyOf": legal_types} - - value_pattern = to_regex( - resolver, additional_properties, whitespace_pattern + + def visit_number(self, node: Any, ctx: Context): + quantifier_keys = [ + "minDigitsInteger", + "maxDigitsInteger", + "minDigitsFraction", + "maxDigitsFraction", + "minDigitsExponent", + "maxDigitsExponent", + ] + if any([qk in node for qk in quantifier_keys]): + min_digits_integer, max_digits_integer = validate_quantifiers( + node.get("minDigitsInteger"), + node.get("maxDigitsInteger"), + start_offset=1, ) - key_value_pattern = ( - f"{STRING}{whitespace_pattern}:{whitespace_pattern}{value_pattern}" + min_digits_fraction, max_digits_fraction = validate_quantifiers( + node.get("minDigitsFraction"), node.get("maxDigitsFraction") ) - key_value_successor_pattern = ( - f"{whitespace_pattern},{whitespace_pattern}{key_value_pattern}" + min_digits_exponent, max_digits_exponent = validate_quantifiers( + node.get("minDigitsExponent"), node.get("maxDigitsExponent") ) - multiple_key_value_pattern = f"({key_value_pattern}({key_value_successor_pattern}){num_repeats}){allow_empty}" - - return ( - r"\{" - + whitespace_pattern - + multiple_key_value_pattern - + whitespace_pattern - + r"\}" + return self.format_number_range( + min_digits_integer, + max_digits_integer, + min_digits_fraction, + max_digits_fraction, + min_digits_exponent, + max_digits_exponent, ) + else: + return self.FORMAT_PRIMATIVE["number"] - elif instance_type == "boolean": - return type_to_regex["boolean"] + def visit_integer(self, node: Any, ctx: Context): + if "maxDigits" in node or "minDigits" in node: + min_digits, max_digits = validate_quantifiers( + node.get("minDigits"), node.get("maxDigits"), start_offset=1 + ) + return self.format_integer_range(min_digits, max_digits) + else: + return self.FORMAT_PRIMATIVE["integer"] + + def visit_string(self, node: Any, ctx: Context): + if "const" in node: + return self.format_literal(node["const"]) + if "maxLength" in node or "minLength" in node: + min_length, max_length = validate_quantifiers( + node.get("minLength"), node.get("maxLength") + ) + return self.format_string_length(min_length, max_length) + elif "pattern" in node: + return self.format_string_pattern(node["pattern"]) + elif "format" in node: + return self.format_string_format(node["format"]) + return self.FORMAT_PRIMATIVE["string"] + + def visit_enum(self, node: Any, ctx: Context): + """ + The enum keyword is used to restrict a value to a fixed set of values. It + must be an array with at least one element, where each element is unique. + """ + choices = [self.format_literal(choice) for choice in node["enum"]] + return self.format_anyOf(choices) + + def visit_allOf(self, node: Any, ctx: Context): + subpatterns = [self.to_regex(subschema, ctx) for subschema in node["allOf"]] + return self.format_allOf(subpatterns) + + def visit_anyOf(self, node: Any, ctx: Context): + subpatterns = [self.to_regex(subschema, ctx) for subschema in node["anyOf"]] + return self.format_anyOf(subpatterns) + + def visit_oneOf(self, node: Any, ctx: Context): + subpatterns = [self.to_regex(subschema, ctx) for subschema in node["oneOf"]] + return self.format_oneOf(subpatterns) + + def visit_notimplemented(self, node: Any, ctx: Context): + raise NotImplementedError( + f"Handler for node `{node}` is not implemented. Please open an Outlines issue." 
+ ) - elif instance_type == "null": - return type_to_regex["null"] + ############ + # FORMATTERS + ############ + def format_number_range( + self, + min_digits_integer, + max_digits_integer, + min_digits_fraction, + max_digits_fraction, + min_digits_exponent, + max_digits_exponent, + ): + integers_quantifier = ( + f"{{{min_digits_integer},{max_digits_integer}}}" + if min_digits_integer or max_digits_integer + else "*" + ) + fraction_quantifier = ( + f"{{{min_digits_fraction},{max_digits_fraction}}}" + if min_digits_fraction or max_digits_fraction + else "+" + ) + exponent_quantifier = ( + f"{{{min_digits_exponent},{max_digits_exponent}}}" + if min_digits_exponent or max_digits_exponent + else "+" + ) + return rf"((-)?(0|[1-9][0-9]{integers_quantifier}))(\.[0-9]{fraction_quantifier})?([eE][+-][0-9]{exponent_quantifier})?" - elif isinstance(instance_type, list): - # Here we need to make the choice to exclude generating an object - # if the specification of the object is not give, even though a JSON - # object that contains an object here would be valid under the specification. - regexes = [ - to_regex(resolver, {"type": t}, whitespace_pattern) - for t in instance_type - if t != "object" - ] - return rf"({'|'.join(regexes)})" + def format_integer_range(self, min_digits=None, max_digits=None): + if min_digits or max_digits: + num_items_pattern = f"{{{min_digits},{max_digits}}}" + else: + num_items_pattern = "*" - raise NotImplementedError( - f"""Could not translate the instance {instance} to a - regular expression. Make sure it is valid to the JSON Schema specification. If - it is, please open an issue on the Outlines repository""" - ) + return rf"(-)?(0|[1-9][0-9]{num_items_pattern})" + def format_string_length(self, min_length, max_length): + return f'"{STRING_INNER}{{{min_length},{max_length}}}"' -def get_schema_from_signature(fn: Callable) -> str: - """Turn a function signature into a JSON schema. + def format_string_pattern(self, pattern: str): + if pattern[0] == "^" and pattern[-1] == "$": + pattern_string_inner = pattern[1:-1] + else: + pattern_string_inner = pattern + return f'"{pattern_string_inner}"' + + def format_string_format(self, fmt: str): + format_regex = self.FORMAT_STRING.get(fmt) + if format_regex: + return format_regex + raise NotImplementedError( + f"Format {fmt} is not supported. Please open an Outlines issue." + ) - Every JSON object valid to the output JSON Schema can be passed - to `fn` using the ** unpacking syntax. + def format_property_kv( + self, key_pattern: str, value_pattern: str, ctx: Context + ) -> str: + return f"{self.ws}{key_pattern}{self.ws}(:){self.ws}{value_pattern}" - """ - signature = inspect.signature(fn) - arguments = {} - for name, arg in signature.parameters.items(): - if arg.annotation == inspect._empty: - raise ValueError("Each argument must have a type annotation") + def format_empty_object(self): + return r"\{" + self.ws + r"\}" + + def format_object_properties_all_optional( + self, property_details: List[Dict], ctx: Context + ): + property_subregexes = [ + self.format_property_kv(pd["key_pattern"], pd["value_pattern"], ctx) + for pd in property_details + ] + possible_patterns = [ + f"{self.ws},".join(combination) + for i in range(1, len(property_subregexes) + 1) + for combination in itertools.combinations(property_subregexes, i) + ] + inner = f"({'|'.join(possible_patterns)})?" 
+ return r"\{" + inner + self.ws + r"\}" + + def format_object_with_required_properties( + self, property_details: List[Dict], ctx: Context + ): + is_required = [prop["is_required"] for prop in property_details] + last_required_pos = max(i for i, value in enumerate(is_required) if value) + inner = "" + for i, pd in enumerate(property_details): + subregex = self.format_property_kv( + pd["key_pattern"], pd["value_pattern"], ctx + ) + if i < last_required_pos: + subregex = f"{subregex}{self.ws}," + elif i > last_required_pos: + subregex = f"{self.ws},{subregex}" + inner += subregex if is_required[i] else f"({subregex})?" + return r"\{" + inner + self.ws + r"\}" + + def format_object_with_additional_properties( + self, value_pattern: str, ctx: Context, min_properties=None, max_properties=None + ): + inner = self._regex_repeat_elem( + elem_pattern=f"({STRING}){self.ws}(:){self.ws}({value_pattern})", + separator_pattern=f"{self.ws},{self.ws}", + min_elem=min_properties, + max_elem=max_properties, + pad=self.ws, + ) + return r"\{" + inner + r"\}" + + def format_array( + self, elem_pattern: str, ctx: Context, min_items=None, max_items=None + ): + inner = self._regex_repeat_elem( + elem_pattern=elem_pattern, + separator_pattern=f"{self.ws},{self.ws}", + min_elem=min_items, + max_elem=max_items, + pad=self.ws, + ) + return rf"\[{inner}\]" + + def format_prefixItems( + self, + prefix_patterns: List[str], + ctx: Context, + suffix_elem_pattern: Optional[str] = None, + ): + comma_split_pattern = rf"{self.ws},{self.ws}" + prefix_pattern = f"{self.ws}{comma_split_pattern.join(prefix_patterns)}" + if suffix_elem_pattern: + suffix_pattern = self._regex_repeat_elem( + elem_pattern=suffix_elem_pattern, + separator_pattern=f"{self.ws},{self.ws}", + min_elem=1, + pad=self.ws, + ) + suffix_pattern = f"((,{suffix_pattern})|)" + inner = f"{prefix_pattern}{suffix_pattern}" else: - arguments[name] = (arg.annotation, ...) + inner = prefix_pattern + self.ws + return rf"\[{inner}\]" + + def format_literal(self, literal: Any): + if isinstance(literal, str): + return f"{re.escape(json.dumps(literal))}" + if type(literal) in [int, bool, type(None)]: + return re.escape(json.dumps(literal)) + elif isinstance(literal, float): + if float(literal) == int(literal): + int_literal = re.escape(json.dumps(int(literal))) + float_literal = re.escape(json.dumps(float(literal))) + return f"({int_literal}|{float_literal})" + else: + return re.escape(json.dumps(literal)) + else: + raise NotImplementedError( + f"Unsupported data type in literal: {type(literal)}. Please open an Outlines issue." + ) - try: - fn_name = fn.__name__ - except Exception as e: - fn_name = "Arguments" + def format_allOf(self, patterns: List[str]): + return ( + "(" + "".join([f"(?={pat})" for pat in patterns[:-1]]) + patterns[-1] + ")" + ) + + def format_anyOf(self, patterns: List[str]): + return "(" + "|".join([f"({pat})" for pat in patterns]) + ")" + + def format_oneOf(self, patterns: List[str]): + # If you're searching "NotImplementedError", this method also needs to be properly implemented! warnings.warn( - f"The function name could not be determined. Using default name 'Arguments' instead. For debugging, here is exact error:\n{e}", - category=UserWarning, + "JSON Schema `oneOf` not implemented. Using `anyOf` instead. Please open an Outlines Issue." 
         )
-    model = create_model(fn_name, **arguments)
+        return self.format_anyOf(patterns)
-
-    return model.model_json_schema()
+
+    def visit_unconstrained(self, node: Any, ctx: Context):
+        legal_types = [
+            {"type": "boolean"},
+            {"type": "null"},
+            {"type": "number"},
+            {"type": "integer"},
+            {"type": "string"},
+        ]
+        allowed_nesting = node.get(
+            "_allowed_nesting", ctx.nesting_level + self.max_nesting_level
+        )
+        # We limit the nesting depth to keep the expression finite, but the
+        # "_allowed_nesting" key is not a true component of the JSON Schema specification.
+        if ctx.nesting_level < allowed_nesting:
+            legal_types.append({"type": "object", "_allowed_nesting": allowed_nesting})
+            legal_types.append({"type": "array", "_allowed_nesting": allowed_nesting})
+
+        subpatterns = [self.to_regex(t, ctx) for t in legal_types]
+        return self.format_anyOf(subpatterns)
+
+    def _regex_repeat_elem(
+        self,
+        elem_pattern: str,
+        separator_pattern: str,
+        min_elem=None,
+        max_elem=None,
+        pad="",
+    ):
+        """
+        Create a pattern allowing between min_elem and max_elem occurrences of elem_pattern,
+        with each occurrence separated by separator_pattern.
+        The result is surrounded with `pad`.
+        """
+        if str(max_elem) == "0":
+            return pad
+
+        base_pattern = f"({elem_pattern})"
+        suffix_pattern = f"(({separator_pattern})({elem_pattern}))"
+
+        min_suffix_repeats = "" if min_elem is None else max(0, int(min_elem) - 1)
+        max_suffix_repeats = "" if max_elem is None else int(max_elem) - 1
+
+        if str(max_suffix_repeats) == "0":
+            pattern = base_pattern
+        else:
+            pattern = f"{base_pattern}({suffix_pattern}){{{min_suffix_repeats},{max_suffix_repeats}}}"
+
+        padded_pattern = f"({pad}{pattern}{pad})"
+
+        if not min_elem:
+            return f"({padded_pattern}|{pad})"
+        else:
+            return padded_pattern
+
+
+class YAMLRegexGenerator(JSONSchemaRegexGenerator):
+    """
+    Core differences between JSON and YAML
+    --------------------------------------
+
+    For most types, including `boolean`, `null`, `number`, and `integer`,
+    YAML supports a superset of the JSON representation. For example, `boolean`
+    can be `true` / `false` as in JSON, but it can also be `yes` / `no`. For
+    these types we limit generation to the valid JSON-representation subset.
+
+    ```
+    string:
+      - Equivalent to JSON, but doesn't use quotes
+
+    array:
+      - In YAML arrays are represented
+      - by newline separated
+      - dash-prefixed array elements
+
+    object:
+      - An object is represented as a newline separated list of key: value pairs
+    ```
+    """
+
+    FORMAT_PRIMATIVE = {
+        # yaml allows for more escape types
+        "string": r'([^"\\\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]|\\["\\])',
+        **JSONSchemaRegexGenerator.FORMAT_PRIMATIVE,
+    }
+
+    @staticmethod
+    def _indentation(nesting_level: int):
+        return r"(\n)" + (f"[ ]{{{nesting_level * 2}}}" if nesting_level else "")
+
+    def format_property_kv(
+        self, key_pattern: str, value_pattern: str, ctx: Context
+    ) -> str:
+        """
+        Similar to the JSON property kv, but with changes to accommodate yaml rules:
+        - leading spaces are not allowed, as the spaces are syntactic
+        - `foo:bar` isn't a legal kv; you need either
+          - a single space, e.g. `foo: bar`
+          - or an indented newline, e.g.
`foo:\n bar` + """ + child_indentation = self._indentation(ctx.nesting_level + 1) + return f"({key_pattern}{self.ws}(:){child_indentation}{value_pattern})" + + def format_object_properties_all_optional( + self, property_details: List[Dict], ctx: Context + ): + property_subregexes = [ + self.format_property_kv(pd["key_pattern"], pd["value_pattern"], ctx) + for pd in property_details + ] + indentation = self._indentation(ctx.nesting_level) + possible_patterns = [ + indentation.join(combination) # first indent is optional + for i in range(1, len(property_subregexes) + 1) + for combination in itertools.combinations(property_subregexes, i) + ] + one_or_more_pattern = "|".join(possible_patterns) + return f"({one_or_more_pattern}|{self.format_empty_object()})" + + def format_object_with_required_properties( + self, property_details: List[Dict], ctx: Context + ): + is_required = [prop["is_required"] for prop in property_details] + last_required_pos = max(i for i, value in enumerate(is_required) if value) + + indentation = self._indentation(ctx.nesting_level) + + inner = "" + for i, pd in enumerate(property_details): + subregex = self.format_property_kv( + pd["key_pattern"], pd["value_pattern"], ctx + ) + if i < last_required_pos: + subregex = f"{subregex}{indentation}" + elif i > last_required_pos: + subregex = f"{indentation}{subregex}" + inner += subregex if is_required[i] else f"({subregex})?" + + return inner + + def format_object_with_additional_properties( + self, value_pattern: str, ctx: Context, min_properties=None, max_properties=None + ): + if min_properties in (0, "0", "", None): + min_properties = 0 + + inner = self._regex_repeat_elem( + elem_pattern=self.format_property_kv(STRING, value_pattern, ctx), + separator_pattern=self._indentation(ctx.nesting_level), + min_elem=max(1, min_properties), + max_elem=max_properties, + ) + if min_properties == 0: + empty_obj_pattern = self.format_empty_object() + return f"({inner})|({empty_obj_pattern})" + + return inner + + def format_array( + self, elem_pattern: str, ctx: Context, min_items=None, max_items=None + ): + self_indentation = self._indentation(ctx.nesting_level) + + child_indentation = self._indentation(ctx.nesting_level + 1) + child_separator = f"([ ]|({child_indentation}))" + + if min_items in (0, "0", "", None): + min_items = 0 + + inner = self._regex_repeat_elem( + elem_pattern=f"(-){child_separator}{elem_pattern}", + separator_pattern=self_indentation, + min_elem=max(1, min_items), + max_elem=max_items, + ) + if min_items == 0: + empty_list_pattern = r"(\[\])" + return f"({inner})|({empty_list_pattern})" + return inner + + def format_prefixItems( + self, + prefix_patterns: List[str], + ctx: Context, + suffix_elem_pattern: Optional[str] = None, + ): + self_indentation = self._indentation(ctx.nesting_level) + + child_indentation = self._indentation(ctx.nesting_level + 1) + child_separator = f"([ ]|({child_indentation}))" + + prefix_pattern = self_indentation.join( + [f"(-){child_separator}{pat}" for pat in prefix_patterns] + ) + + if suffix_elem_pattern: + suffix_pattern = self._regex_repeat_elem( + elem_pattern=f"(-){child_separator}{suffix_elem_pattern}", + separator_pattern=self_indentation, + min_elem=1, + ) + suffix_pattern = f"({self_indentation}{suffix_pattern})?" 
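+            # the fixed prefix elements are mandatory; elements matching `items`
+            # may optionally follow, each on its own dash-prefixed line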
+ return f"{prefix_pattern}{suffix_pattern}" + else: + return prefix_pattern diff --git a/outlines/generate/json.py b/outlines/generate/json.py index f75878d29..9cd063531 100644 --- a/outlines/generate/json.py +++ b/outlines/generate/json.py @@ -4,7 +4,11 @@ from pydantic import BaseModel -from outlines.fsm.json_schema import build_regex_from_schema, get_schema_from_signature +from outlines.fsm.json_schema import ( + build_regex_from_schema, + get_schema_from_signature, + load_yaml, +) from outlines.generate.api import SequenceGeneratorAdapter from outlines.models import OpenAI from outlines.samplers import Sampler, multinomial @@ -18,6 +22,7 @@ def json( schema_object: Union[str, object, Callable], sampler: Sampler = multinomial(), whitespace_pattern: Optional[str] = None, + mode="json", ) -> SequenceGeneratorAdapter: """ Generate structured JSON data with a `Transformer` model based on a specified JSON Schema. @@ -36,6 +41,8 @@ def json( whitespace_pattern Pattern to use for JSON syntactic whitespace (doesn't impact string literals) Example: allow only a single space or newline with `whitespace_pattern=r"[\n ]?"` + mode + Either `json` or `yaml`, determines the structure of the generated output Returns ------- @@ -43,21 +50,26 @@ def json( transforms the result if BaseModel is used. """ + if mode == "yaml": + to_json = lambda x: pyjson.dumps(load_yaml(x)) + else: + to_json = lambda x: x + if isinstance(schema_object, type(BaseModel)): schema = pyjson.dumps(schema_object.model_json_schema()) - regex_str = build_regex_from_schema(schema, whitespace_pattern) + regex_str = build_regex_from_schema(schema, whitespace_pattern, mode=mode) generator = regex(model, regex_str, sampler) - generator.format_sequence = lambda x: schema_object.parse_raw(x) + generator.format_sequence = lambda x: schema_object.parse_raw(to_json(x)) elif callable(schema_object): schema = pyjson.dumps(get_schema_from_signature(schema_object)) - regex_str = build_regex_from_schema(schema, whitespace_pattern) + regex_str = build_regex_from_schema(schema, whitespace_pattern, mode=mode) generator = regex(model, regex_str, sampler) - generator.format_sequence = lambda x: pyjson.loads(x) + generator.format_sequence = lambda x: pyjson.loads(to_json(x)) elif isinstance(schema_object, str): schema = schema_object - regex_str = build_regex_from_schema(schema, whitespace_pattern) + regex_str = build_regex_from_schema(schema, whitespace_pattern, mode=mode) generator = regex(model, regex_str, sampler) - generator.format_sequence = lambda x: pyjson.loads(x) + generator.format_sequence = lambda x: pyjson.loads(to_json(x)) else: raise ValueError( f"Cannot parse schema {schema_object}. 
The schema must be either " diff --git a/pyproject.toml b/pyproject.toml index ab3ecd775..ed0773bc1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,7 @@ test = [ "torch", "transformers", "pillow", + "requests_cache", ] serve = [ "vllm>=0.3.0", @@ -136,6 +137,7 @@ module = [ "pycountry.*", "airportsdata.*", "outlines_core.*", + "requests_cache.*", ] ignore_missing_imports = true diff --git a/tests/fsm/test_json_schema.py b/tests/fsm/test_json_schema.py index 7565ff642..627ef59cc 100644 --- a/tests/fsm/test_json_schema.py +++ b/tests/fsm/test_json_schema.py @@ -1,9 +1,11 @@ +import collections import json import re from typing import List, Literal, Union import interegular import pytest +import yaml from pydantic import BaseModel, Field, constr from outlines.fsm.json_schema import ( @@ -20,10 +22,129 @@ WHITESPACE, build_regex_from_schema, get_schema_from_signature, - to_regex, ) +def assert_patterns_equivalent( + generated_pattern, expected_pattern, n_diff=0, allow_both=False +): + gen_fsm = interegular.parse_pattern(generated_pattern).to_fsm() + expect_fsm = interegular.parse_pattern(expected_pattern).to_fsm() + if gen_fsm.reduce() != expect_fsm.reduce(): + if n_diff: + to_str = lambda s: "".join([c if isinstance(c, str) else "{*}" for c in s]) + only_generated = [ + to_str(s) + for _, s in zip(range(n_diff), gen_fsm.difference(expect_fsm).strings()) + ] + only_expected = [ + to_str(s) + for _, s in zip(range(n_diff), expect_fsm.difference(gen_fsm).strings()) + ] + additional_details = ( + f"Accepted only by generated pattern (max {n_diff}): {only_generated}\n" + f"Accepted only by expected pattern (max {n_diff}): {only_expected}\n" + ) + if allow_both: + both = [ + to_str(s) + for _, s in zip(range(n_diff), (gen_fsm & expect_fsm).strings()) + ] + additional_details += ( + f"Accepted by both patterns (max {n_diff}): {both}\n" + ) + else: + additional_details = "" + + raise ValueError( + "Patterns Not Equivalent:\n" + f"generated_pattern = {generated_pattern}\n" + f" expected_pattern = {expected_pattern}\n" + f"{additional_details}" + ) + + +def dump_yaml_normalized(data): + """ + yaml can represent the same data in many different ways. 
+ + This function creates a normalized yaml dump which ensures + - strings are always represented with quotes + - OrderedDict is represented without !!python/object/apply:collections.OrderedDict + - End of document signifier "\n...\n" is removed + - Standardize Indentation Behavior + """ + + # handle confusion in yaml dumper + if isinstance(data, str): + return json.dumps(data) + + class NormalizedDumper(yaml.Dumper): + def increase_indent(self, flow=False, indentless=False): + return super().increase_indent(flow, False) + + def quoted_str_presenter(dumper, data): + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style='"') + + def dict_representer(dumper, data): + return dumper.represent_dict(data.items()) + + # Ensure strings are always quoted + NormalizedDumper.add_representer(str, quoted_str_presenter) + # Ensure OrderedDict is represented without !!python/object/apply + NormalizedDumper.add_representer(collections.OrderedDict, dict_representer) + + dumped = yaml.dump( + data, Dumper=NormalizedDumper, default_flow_style=False, sort_keys=False + ).rstrip("\n...\n") + + # hack to normalize formatting to our yaml subset + return re.sub( + r"^([ \t-]*)([^:\s]+): (\S.*)", + lambda m: f"{m.group(1)}{m.group(2)}:\n{' ' * (len(m.group(1)) + 2)}{m.group(3)}", + dumped, + flags=re.MULTILINE, + ) + + +def assert_match_expectation(json_sample, pattern, does_match, schema, mode="json"): + """ + Ensure sample conforms to `does_match` expectation + - check sample normally if in json mode + - convert sample to normalized yaml if in yaml mode + """ + # if yaml mode, convert to yaml if possible, otherwise succeed the test + if mode == "yaml": + try: + if json.dumps(json.loads(json_sample)) != json_sample: + return + except json.decoder.JSONDecodeError: + return + + sample = dump_yaml_normalized( + json.loads(json_sample, object_pairs_hook=collections.OrderedDict) + ) + + # ensure yaml wasn't corrupted by rstrip + assert yaml.safe_load(sample) == json.loads( + json_sample + ), "invalid test, json -> yaml inconsistent" + + else: + sample = json_sample + + match = re.fullmatch(pattern, sample) + if does_match: + if not match: + raise ValueError( + f"Expected match for sample:\n{sample}\n\n" + f"Schema: {json.dumps(json.loads(schema), indent=4)}\n" + f"Generated Pattern: {repr(pattern)}\n" + ) + else: + assert match is None + + def test_function_basic(): def test_function(foo: str, bar: List[int]): pass @@ -71,7 +192,7 @@ class User(BaseModel): ) def test_match_integer(pattern, does_match): step = {"title": "Foo", "type": "integer"} - regex = to_regex(None, step) + regex = build_regex_from_schema(json.dumps(step)) assert regex == INTEGER value = pattern["integer"] @@ -98,7 +219,7 @@ def test_match_integer(pattern, does_match): ) def test_match_number(pattern, does_match): step = {"title": "Foo", "type": "number"} - regex = to_regex(None, step) + regex = build_regex_from_schema(json.dumps(step)) assert regex == NUMBER value = pattern["number"] @@ -420,7 +541,7 @@ def test_match_number(pattern, does_match): # array ( {"title": "Foo", "type": "array", "items": {"type": "number"}}, - rf"\[{WHITESPACE}(({NUMBER})(,{WHITESPACE}({NUMBER})){{0,}})?{WHITESPACE}\]", + rf"\[(({WHITESPACE}({NUMBER})((?:{WHITESPACE},{WHITESPACE}({NUMBER}))){{,}}{WHITESPACE})|{WHITESPACE})\]", [("[1e+9,1.3]", True), ("[]", True), ("[1", False)], ), # array with a set length of 1 @@ -444,7 +565,7 @@ def test_match_number(pattern, does_match): "minItems": 3, "maxItems": 3, }, - 
rf"\[{WHITESPACE}(({INTEGER})(,{WHITESPACE}({INTEGER})){{2,2}}){WHITESPACE}\]", + rf"\[({WHITESPACE}({INTEGER})((?:{WHITESPACE},{WHITESPACE}({INTEGER}))){{2,2}}{WHITESPACE})\]", [("[1]", False), ("[]", False), ("[1,2,3]", True), ("[1,2,3,4]", False)], ), # array with length 0 @@ -473,7 +594,7 @@ def test_match_number(pattern, does_match): }, "required": ["test_dict"], }, - rf"""\{{{WHITESPACE}"test_dict"{WHITESPACE}:{WHITESPACE}\{{{WHITESPACE}({STRING}{WHITESPACE}:{WHITESPACE}{STRING}({WHITESPACE},{WHITESPACE}{STRING}{WHITESPACE}:{WHITESPACE}{STRING}){{0,}})?{WHITESPACE}\}}{WHITESPACE}\}}""", + rf"""\{{{WHITESPACE}"test_dict"{WHITESPACE}:{WHITESPACE}(\{{({WHITESPACE}{STRING}{WHITESPACE}:{WHITESPACE}({STRING})({WHITESPACE},{WHITESPACE}{STRING}{WHITESPACE}:{WHITESPACE}({STRING})){{0,}})?{WHITESPACE}\}}){WHITESPACE}\}}""", [ ("""{ "test_dict":{"foo":"bar","baz": "bif"}}""", True), ("""{ "test_dict":{"foo":"bar" }}""", True), @@ -499,7 +620,7 @@ def test_match_number(pattern, does_match): }, "required": ["test_dict"], }, - rf"""\{{{WHITESPACE}"test_dict"{WHITESPACE}:{WHITESPACE}\{{{WHITESPACE}({STRING}{WHITESPACE}:{WHITESPACE}\{{{WHITESPACE}({STRING}{WHITESPACE}:{WHITESPACE}{INTEGER}({WHITESPACE},{WHITESPACE}{STRING}{WHITESPACE}:{WHITESPACE}{INTEGER}){{0,}})?{WHITESPACE}\}}({WHITESPACE},{WHITESPACE}{STRING}{WHITESPACE}:{WHITESPACE}\{{{WHITESPACE}({STRING}{WHITESPACE}:{WHITESPACE}{INTEGER}({WHITESPACE},{WHITESPACE}{STRING}{WHITESPACE}:{WHITESPACE}{INTEGER}){{0,}})?{WHITESPACE}\}}){{0,}})?{WHITESPACE}\}}{WHITESPACE}\}}""", + rf"""\{{{WHITESPACE}"test_dict"{WHITESPACE}:{WHITESPACE}(\{{({WHITESPACE}{STRING}{WHITESPACE}:{WHITESPACE}\{{({WHITESPACE}{STRING}{WHITESPACE}:{WHITESPACE}{INTEGER}({WHITESPACE},{WHITESPACE}{STRING}{WHITESPACE}:{WHITESPACE}{INTEGER}){{0,}})?{WHITESPACE}\}}({WHITESPACE},{WHITESPACE}{STRING}{WHITESPACE}:{WHITESPACE}\{{({WHITESPACE}{STRING}{WHITESPACE}:{WHITESPACE}{INTEGER}({WHITESPACE},{WHITESPACE}{STRING}{WHITESPACE}:{WHITESPACE}{INTEGER}){{0,}})?{WHITESPACE}\}}){{0,}})?{WHITESPACE}\}}){WHITESPACE}\}}""", [ ( """{"test_dict": {"foo": {"bar": 123, "apple": 99}, "baz": {"bif": 456}}}""", @@ -544,14 +665,48 @@ def test_match_number(pattern, does_match): rf"({STRING}|{INTEGER})", [("12", True), ('"a"', True), ('1"a"', False)], ), + # oneOf: TODO: currently implemented as anyOf, uncomment when proper oneOf is implemented + # ( + # { + # "title": "Foo", + # "oneOf": [{"type": "string", "format": "date"}, {"type": "string", "pattern": "2024.*"}], + # }, + # rf"TODO", + # [('"2024-01-07"', False), ('"2024-01-01"', True), ('"2024foobar7"', True), ('"2024-neither"', False)], + # ), + # anyOf + ( + { + "title": "Foo", + "anyOf": [ + {"type": "string", "format": "date"}, + {"type": "string", "pattern": "2024.*7"}, + ], + }, + rf'({DATE})|("2024.*7")', + [ + ('"2024-01-07"', True), + ('"2024-01-01"', True), + ('"2024foobar7"', True), + ('"2024-neither"', False), + ], + ), # allOf ( { "title": "Foo", - "allOf": [{"type": "string"}, {"type": "integer"}], + "allOf": [ + {"type": "string", "format": "date"}, + {"type": "string", "pattern": "2024.*7"}, + ], }, - rf"({STRING}{INTEGER})", - [('"a"1', True), ('"a"', False), ('"1"', False)], + rf'(?=({DATE}))("2024.*7")', + [ + ('"2024-01-07"', True), + ('"2024-01-01"', False), + ('"2024foobar7"', False), + ('"2024-neither"', False), + ], ), # Tuple / prefixItems ( @@ -748,21 +903,24 @@ def test_match_number(pattern, does_match): ), ], ) -def test_match(schema, regex, examples): - interegular.parse_pattern(regex) +@pytest.mark.parametrize("mode", 
["json", "yaml"]) +def test_match(schema, regex, examples, mode): schema = json.dumps(schema) - test_regex = build_regex_from_schema(schema) - assert test_regex == regex + generated_pattern = build_regex_from_schema(schema, mode=mode) + + if mode == "json": + # patterns assert equivalence of pattern behavior to expectation + assert_patterns_equivalent( + generated_pattern=generated_pattern, expected_pattern=regex + ) + + # ensure pattern can be parsed by interegular + interegular.parse_pattern(regex) for string, does_match in examples: - match = re.fullmatch(test_regex, string) - if does_match: - if match is None: - raise ValueError(f"Expected match for '{string}'") - assert match[0] == string - assert match.span() == (0, len(string)) - else: - assert match is None + assert_match_expectation( + string, generated_pattern, does_match, schema, mode=mode + ) @pytest.mark.parametrize( @@ -827,19 +985,17 @@ def test_match(schema, regex, examples): ), ], ) -def test_format(schema, regex, examples): +@pytest.mark.parametrize("mode", ["json", "yaml"]) +def test_format(schema, regex, examples, mode): interegular.parse_pattern(regex) schema = json.dumps(schema) - test_regex = build_regex_from_schema(schema) - assert test_regex == regex + generated_pattern = build_regex_from_schema(schema, mode=mode) + assert generated_pattern == regex for string, does_match in examples: - match = re.fullmatch(test_regex, string) - if does_match: - assert match[0] == string - assert match.span() == (0, len(string)) - else: - assert match is None + assert_match_expectation( + string, generated_pattern, does_match, schema, mode=mode + ) @pytest.mark.parametrize( @@ -976,16 +1132,14 @@ def test_format(schema, regex, examples): ), ], ) -def test_format_without_regex(schema, examples): +@pytest.mark.parametrize("mode", ["json", "yaml"]) +def test_format_without_regex(schema, examples, mode): schema = json.dumps(schema) - test_regex = build_regex_from_schema(schema) + generated_pattern = build_regex_from_schema(schema, mode=mode) for string, does_match in examples: - match = re.fullmatch(test_regex, string) - if does_match: - assert match[0] == string - assert match.span() == (0, len(string)) - else: - assert match is None + assert_match_expectation( + string, generated_pattern, does_match, schema, mode=mode + ) @pytest.mark.parametrize("whitespace_pattern", [None, r"[\n ]*", "abc"]) @@ -1017,6 +1171,7 @@ class MockModel(BaseModel): assert re.fullmatch(pattern, mock_result_mult_ws) +@pytest.mark.skip("oneOf not implemented") def test_one_of_doesnt_produce_illegal_lookaround(): """Reproduces failure in https://github.com/dottxt-ai/outlines/issues/823""" @@ -1039,3 +1194,28 @@ class Model(BaseModel): # check if the pattern uses lookarounds incompatible with interegular.Pattern.to_fsm() interegular.parse_pattern(pattern).to_fsm() + + +def test_all_generations_legal(): + """ + # Array of literal {"k": "v"} + ( + { + "type": "array", + "items": { + "type": "object", + "properties": { + "k": {"const": "v"} + } + }, + "required": ["k"], + "additionalProperties": False + }, + [ + ("1", True), + ] + ), + """ + # TODO: check all fsm.strings() matches the schema + # patch STRING, INTEGER, and NUMBER so they have limited length + pass diff --git a/tests/fsm/test_json_schema_full.py b/tests/fsm/test_json_schema_full.py new file mode 100644 index 000000000..eecb09d38 --- /dev/null +++ b/tests/fsm/test_json_schema_full.py @@ -0,0 +1,130 @@ +# LOCAL IMPORT HACK +import importlib +import json +import re + +import pytest +import requests 
+import requests_cache
+import yaml
+from referencing.exceptions import Unresolvable
+
+from outlines.fsm.json_schema import build_regex_from_schema
+
+dump_yaml_normalized = importlib.import_module("test_json_schema").dump_yaml_normalized
+
+
+requests_cache.install_cache("test_request_cache", expire_after=3600)
+
+
+def get_json_schema_tests_from_repo(
+    repo="json-schema-org/JSON-Schema-Test-Suite", configs_dir="tests/draft2020-12"
+):
+    api_url = f"https://api.github.com/repos/{repo}/contents/{configs_dir}"
+    headers = {"Accept": "application/vnd.github.v3+json"}
+    response = requests.get(api_url, headers=headers)
+    response.raise_for_status()
+    contents = response.json()
+
+    results = []
+    for item in contents:
+        if item["type"] == "file" and item["name"].endswith(".json"):
+            file_url = item["download_url"]
+            file_response = requests.get(file_url)
+            file_response.raise_for_status()
+            json_data = file_response.json()
+
+            for entry in json_data:
+                for test in entry["tests"]:
+                    results.append(
+                        {
+                            "file": item["name"],
+                            "schema": json.dumps(entry["schema"]),
+                            "data": json.dumps(test["data"]),
+                            "is_valid": test["valid"],
+                        }
+                    )
+
+    return results
+
+
+@pytest.mark.skip("Utility for improving compliance with json schema spec")
+@pytest.mark.parametrize("sample", get_json_schema_tests_from_repo())
+def test_json_schema_to_json_compliance(sample):
+    """
+    Assert that we either correctly handle a schema, or skip if NotImplementedError
+    """
+    try:
+        pattern = build_regex_from_schema(
+            sample["schema"], strict_json_schema_subset=False
+        )
+    except NotImplementedError as e:
+        pytest.skip(f"{e}")
+    except Unresolvable:
+        pytest.xfail()
+
+    if sample["is_valid"]:
+        assert (
+            re.fullmatch(pattern, sample["data"]) is not None
+        ), "Failed to match valid schema"
+    else:
+        assert (
+            re.fullmatch(pattern, sample["data"]) is None
+        ), "Incorrectly matched invalid schema"
+
+
+@pytest.mark.parametrize("sample", get_json_schema_tests_from_repo())
+def test_json_schema_to_yaml_compliance(sample):
+    """
+    Skip cases where the pattern cannot be built; that is covered by
+    `test_json_schema_to_json_compliance`.
+
+    Here we are purely testing whether the yaml is valid IFF the json is valid.
+    """
+    # skip if the test fails to construct the pattern
+    try:
+        json_pattern = build_regex_from_schema(
+            sample["schema"], strict_json_schema_subset=False
+        )
+    except Exception:
+        pytest.skip()
+
+    # skip if json validity doesn't match the expectation
+    json_valid = re.fullmatch(json_pattern, sample["data"]) is not None
+    if sample["is_valid"] != json_valid:
+        pytest.skip()
+
+    # validate yaml generation for all samples where json is valid
+    try:
+        yaml_pattern = build_regex_from_schema(
+            sample["schema"], mode="yaml", strict_json_schema_subset=False
+        )
+    except NotImplementedError as e:
+        pytest.skip(f"{e}")
+    except Unresolvable:
+        pytest.xfail()
+
+    yaml_sample = dump_yaml_normalized(json.loads(sample["data"]))
+
+    # xfail complex mappings
+    if any(line.startswith("? ") for line in yaml_sample.split("\n")):
+        pytest.xfail()
+
+    if sample["is_valid"]:
+        assert (
+            re.fullmatch(yaml_pattern, yaml_sample) is not None
+        ), "Failed to match valid schema"
+    else:
+        assert (
+            re.fullmatch(yaml_pattern, yaml_sample) is None
+        ), "Incorrectly matched invalid schema"
+
+
+@pytest.mark.skip()
+@pytest.mark.parametrize("sample", get_json_schema_tests_from_repo())
+def test_yaml_dumper_consistency(sample):
+    """Validate that the output yaml is equivalent to the input json."""
+    sample_from_json = json.loads(sample["data"])
+    sample_from_yaml = yaml.safe_load(dump_yaml_normalized(sample_from_json))
+    assert sample_from_yaml == sample_from_json
+    assert json.dumps(sample_from_yaml) == sample["data"]
diff --git a/tests/generate/test_generate.py b/tests/generate/test_generate.py
index a96ce8673..4ab30383d 100644
--- a/tests/generate/test_generate.py
+++ b/tests/generate/test_generate.py
@@ -116,12 +116,15 @@ def model_t5(tmp_path_factory):
 
 @pytest.fixture()
 def sample_schema():
-    from pydantic import BaseModel, conint, conlist, constr
+    from typing import Tuple
+
+    from pydantic import BaseModel, constr
+
+    from outlines.types import countries
 
     class SampleSchema(BaseModel):
         title: constr(max_length=10)
-        numbers: conlist(conint(strict=True), min_length=3, max_length=3)
-        labels: conlist(constr(min_length=1, max_length=5), min_length=3, max_length=3)
+        tup: Tuple[constr(min_length=1, max_length=5), countries.Name, countries.Flag]
 
     return SampleSchema
 
@@ -234,13 +237,13 @@ def test_generate_fsm(request, model_fixture, pattern):
     assert re.fullmatch(pattern, res) is not None, res
 
 
-@pytest.mark.skip(
-    "Fix issues with JSON, some models fail this test https://github.com/dottxt-ai/outlines/issues/985"
-)
 @pytest.mark.parametrize("model_fixture", ALL_MODEL_FIXTURES)
-def test_generate_json(request, model_fixture, sample_schema):
+@pytest.mark.parametrize("mode", ["json", "yaml"])
+def test_generate_json(request, model_fixture, sample_schema, mode):
+    if model_fixture in ("model_transformers_random", "model_bart"):
+        pytest.skip("model vocabulary insufficient for test")
     model = request.getfixturevalue(model_fixture)
-    generator = generate.json(model, sample_schema)
+    generator = generate.json(model, sample_schema, mode=mode)
     # asserts valid within call
     generator(**get_inputs(model_fixture), max_tokens=100)