From c1fae8aca517d1289e3f934750cb6ea52804ad6d Mon Sep 17 00:00:00 2001
From: Cameron Pfiffer
Date: Thu, 10 Oct 2024 14:34:17 -0700
Subject: [PATCH 1/6] Create lmstudio.md

---
 docs/reference/serve/lmstudio.md | 89 ++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 docs/reference/serve/lmstudio.md

diff --git a/docs/reference/serve/lmstudio.md b/docs/reference/serve/lmstudio.md
new file mode 100644
index 000000000..e8c336137
--- /dev/null
+++ b/docs/reference/serve/lmstudio.md
@@ -0,0 +1,89 @@
# Serve with LM Studio

!!! tip "Would rather not self-host?"

    If you want to get started quickly with JSON-structured generation, you can instead call [.json](https://h1xbpbfsf0w.typeform.com/to/ZgBCvJHF), a [.txt](http://dottxt.co) API that guarantees valid JSON.

[LM Studio](https://lmstudio.ai/) is an application that runs local LLMs. It flexibly mixes GPU and CPU compute in hardware-constrained environments.

As of [LM Studio 0.3.4](https://lmstudio.ai/blog/lmstudio-v0.3.4), it natively supports Outlines for structured text generation, using an OpenAI-compatible endpoint.

## Setup

1. Install LM Studio by visiting their [downloads page](https://lmstudio.ai/download).
2. Enable the LM Studio [server functionality](https://lmstudio.ai/docs/basics/server).
3. Download [a model](https://lmstudio.ai/docs/basics#1-download-an-llm-to-your-computer).
4. Install Python dependencies.
```bash
pip install pydantic openai
```

## Calling the server

By default, LM Studio serves from `http://localhost:1234`. If you are serving on a different port or host, make sure to change the `base_url` argument in `OpenAI` to the relevant location.

```python
import openai
from pydantic import BaseModel


class Testing(BaseModel):
    """
    A class representing a testing schema.
    """
    name: str
    age: int


openai_client = openai.OpenAI(
    base_url="http://0.0.0.0:1234/v1",
    api_key="dopeness"
)

# Make a request to the local LM Studio server
response = openai_client.beta.chat.completions.parse(
    model="hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF",
    messages=[
        {"role": "system", "content": "You are like so good at whatever you do."},
        {"role": "user", "content": "My name is Cameron and I am 28 years old. What's my name and age?"}
    ],
    response_format=Testing
)
```

You should receive a `ParsedChatCompletion[Testing]` object back:

```python
ParsedChatCompletion[Testing](
    id='chatcmpl-3hykyf0fxus7jc90k6gwlw',
    choices=[
        ParsedChoice[Testing](
            finish_reason='stop',
            index=0,
            logprobs=None,
            message=ParsedChatCompletionMessage[Testing](
                content='{ "age": 28, "name": "Cameron" }',
                refusal=None,
                role='assistant',
                function_call=None,
                tool_calls=[],
                parsed=Testing(name='Cameron', age=28)
            )
        )
    ],
    created=1728595622,
    model='lmstudio-community/Phi-3.1-mini-128k-instruct-GGUF/Phi-3.1-mini-128k-instruct-Q4_K_M.gguf',
    object='chat.completion',
    service_tier=None,
    system_fingerprint='lmstudio-community/Phi-3.1-mini-128k-instruct-GGUF/Phi-3.1-mini-128k-instruct-
Q4_K_M.gguf',
    usage=CompletionUsage(
        completion_tokens=17,
        prompt_tokens=47,
        total_tokens=64,
        completion_tokens_details=None,
        prompt_tokens_details=None
    )
)
```

You can retrieve your `Testing` object with

```python
response.choices[0].message.parsed
```
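The `parsed` field holds the validated Pydantic object. If you would rather validate the raw JSON yourself (for example, to handle malformed output explicitly), the message `content` carries the JSON string. A minimal sketch, assuming Pydantic v2's `model_validate_json`:

```python
from pydantic import ValidationError

raw_json = response.choices[0].message.content

try:
    testing = Testing.model_validate_json(raw_json)
except ValidationError:
    # The model produced JSON that does not match the Testing schema
    testing = None
```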
From 64fb30faae5201e4c9957770fbc19863c7d3750c Mon Sep 17 00:00:00 2001
From: Cameron Pfiffer
Date: Mon, 14 Oct 2024 18:43:42 -0700
Subject: [PATCH 2/6] use pre-commit

---
 docs/reference/serve/lmstudio.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/reference/serve/lmstudio.md b/docs/reference/serve/lmstudio.md
index e8c336137..db383186a 100644
--- a/docs/reference/serve/lmstudio.md
+++ b/docs/reference/serve/lmstudio.md
@@ -82,7 +82,7 @@ Q4_K_M.gguf',
 )
 ```
 
-You can retrieve your `Testing` object with 
+You can retrieve your `Testing` object with
 
 ```python
 response.choices[0].message.parsed
 ```

From 866b9a3c25b88a1790228632063583701990e364 Mon Sep 17 00:00:00 2001
From: Andrew Lapp
Date: Thu, 10 Oct 2024 15:03:04 -0400
Subject: [PATCH 3/6] recover fsm_union, get_sub_fsms_from_seq, walk_fsm. Add
 to fsm/parser.py

---
 outlines/fsm/parsing.py | 215 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 210 insertions(+), 5 deletions(-)

diff --git a/outlines/fsm/parsing.py b/outlines/fsm/parsing.py
index 92d3cc166..e48fb69e4 100644
--- a/outlines/fsm/parsing.py
+++ b/outlines/fsm/parsing.py
@@ -1,10 +1,22 @@
 from copy import copy, deepcopy
 from dataclasses import dataclass
 from functools import lru_cache
-from typing import Any, Dict, FrozenSet, Iterator, Optional, Set, Tuple, Union
+from typing import (
+    Any,
+    Dict,
+    FrozenSet,
+    Generator,
+    Iterator,
+    List,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Union,
+)
 
 import interegular
-from interegular.fsm import FSM
+from interegular.fsm import FSM, Alphabet, OblivionError
 from interegular.patterns import Unsupported
 from lark import Lark, Token
 from lark.common import LexerConf, ParserConf
@@ -35,11 +47,9 @@
 from lark.parsers.lalr_interactive_parser import InteractiveParser
 from lark.parsers.lalr_parser import LALR_Parser, ParseConf, ParserState, _Parser
 from outlines_core.fsm.regex import (
-    fsm_union,
-    get_sub_fsms_from_seq,
+    BetterFSM,
     get_token_transition_keys,
     make_deterministic_fsm,
-    walk_fsm,
 )
 
 PartialParseState = Tuple[str, int]
@@ -920,3 +930,198 @@ def terminals_to_fsms(lp: PartialLark) -> Dict[str, FSM]:
        symbol_names_and_fsms[terminal.name] = fsm

    return symbol_names_and_fsms


def fsm_union(
    fsms: Sequence[FSM],
) -> Tuple[FSM, Dict[int, Tuple[Set[Tuple[int, int]], Set[int], Dict[int, Set[int]]]]]:
    """Construct an FSM representing the union of the FSMs in `fsms`.

    This is an updated version of `interegular.fsm.FSM.union` made to return an
    extra map of component FSMs to the sets of state transitions that
    correspond to them in the new FSM.

    """

    alphabet, new_to_old = Alphabet.union(*[fsm.alphabet for fsm in fsms])

    indexed_fsms = tuple(enumerate(fsms))

    initial = {i: fsm.initial for (i, fsm) in indexed_fsms}

    # Dedicated function accepting a "superset" and returning the next
    # "superset" obtained by following this transition in the new FSM
    def follow(current_state, new_transition: int):
        next = {}
        for i, f in indexed_fsms:
            old_transition = new_to_old[i][new_transition]
            if (
                i in current_state
                and current_state[i] in f.map
                and old_transition in f.map[current_state[i]]
            ):
                next[i] = f.map[current_state[i]][old_transition]
        if not next:
            raise OblivionError
        return next

    states = [initial]
    finals: Set[int] = set()
    map: Dict[int, Dict[int, int]] = {}

    # Map component FSMs to their new state-to-state transitions, finals, and a
    # map translating component FSM states to aggregate FSM states
    fsms_to_trans_finals: Dict[
        int, Tuple[Set[Tuple[int, int]], Set[int], Dict[int, Set[int]]]
    ] = {}

    i = 0
    while i < len(states):
        state = states[i]

        # Add to the finals of the aggregate FSM whenever we hit a final in a
        # component FSM
        if any(state.get(j, -1) in fsm.finals for (j, fsm) in indexed_fsms):
            finals.add(i)

        # Compute the map for this state
        map[i] = {}
        for transition in alphabet.by_transition:
            try:
                next = follow(state, transition)
            except OblivionError:
                # Reached an oblivion state; don't list it
                continue
            else:
                try:
                    # TODO: Seems like this could--and should--be avoided
                    j = states.index(next)
                except ValueError:
                    j = len(states)
                    states.append(next)

                map[i][transition] = j

                for fsm_id, fsm_state in next.items():
                    (
                        fsm_transitions,
                        fsm_finals,
                        fsm_old_to_new,
                    ) = fsms_to_trans_finals.setdefault(fsm_id, (set(), set(), {}))
                    old_from = state[fsm_id]
                    old_to = fsm_state
                    fsm_old_to_new.setdefault(old_from, set()).add(i)
                    fsm_old_to_new.setdefault(old_to, set()).add(j)
                    fsm_transitions.add((i, j))
                    if fsm_state in fsms[fsm_id].finals:
                        fsm_finals.add(j)

        i += 1

    fsm = FSM(
        alphabet=alphabet,
        states=range(len(states)),
        initial=0,
        finals=finals,
        map=map,
        __no_validation__=True,
    )

    fsm, old_to_new_states = make_deterministic_fsm(fsm)
    _fsms_to_trans_finals = {
        fsm_id: (
            {(old_to_new_states[s1], old_to_new_states[s2]) for s1, s2 in transitions},
            {old_to_new_states[s] for s in finals},
            {
                old_state: {old_to_new_states[new_state] for new_state in new_states}
                for old_state, new_states in old_to_new.items()
            },
        )
        for fsm_id, (transitions, finals, old_to_new) in sorted(
            fsms_to_trans_finals.items(), key=lambda x: x[0]
        )
    }

    return (
        fsm,
        _fsms_to_trans_finals,
    )


def get_sub_fsms_from_seq(
    state_seq: Sequence[int],
    fsms_to_trans_finals: Dict[
        int, Tuple[Set[Tuple[int, int]], Set[int], Dict[int, Set[int]]]
    ],
) -> Generator[Tuple[int, bool, bool], None, None]:
    """Get the indices of the sub-FSMs in `fsm` that could have matched the state sequence `state_seq`.

    Parameters
    ----------
    state_seq
        A state sequence.
    fsms_to_trans_finals
        A map from FSM indices to tuples containing sets of their state
        transitions, sets of their final/accept states, and maps from their
        original states to the states of the union FSM.

    Returns
    -------
    A generator returning tuples containing each sub-FSM index (in the order
    they were union-ed to construct `fsm`) and booleans indicating whether or
    not there is another valid transition from the last state in the sequence
    for the associated sub-FSM (i.e. if the FSM can continue
    accepting/matching) and whether or not the sequence ends in a final state
    of the sub-FSM.
    """
    state_seq_transitions = set(zip(state_seq[:-1], state_seq[1:]))
    last_fsm_state = state_seq[-1]
    yield from (
        (
            # The sub-FSM index
            fsm_idx,
            # Is there another possible transition in this sub-FSM?
            any(last_fsm_state == from_s for (from_s, to_s) in transitions),
            # Is this sub-FSM in a final state?
            state_seq[-1] in finals,
        )
        for fsm_idx, (transitions, finals, _) in fsms_to_trans_finals.items()
        if state_seq_transitions.issubset(transitions)
    )


def walk_fsm(
    fsm: BetterFSM,
    token_transition_keys: Sequence[int],
    start_state: int,
    full_match: bool = True,
) -> List[int]:
    fsm_finals = fsm.finals

    state = start_state
    accepted_states: List[int] = []
    last_final_idx: int = 0

    fsm_transitions = fsm.flat_transition_map

    # Iterate over the token transition key sequence. The transition key
    # sequence represents the FSM traversal rules of the token's symbols.
    for i, trans_key in enumerate(token_transition_keys):
        new_state = fsm_transitions.get((state, trans_key))

        if new_state is None:
            if not full_match and last_final_idx > 0:
                return accepted_states[:last_final_idx]

            return []

        state = new_state

        if state in fsm_finals:
            last_final_idx = i + 1

        accepted_states.append(state)

    if full_match and last_final_idx - 1 != i:
        return []

    return accepted_states
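# A usage sketch for the functions above (illustrative only, not part of the
# original module): `fsm_union` merges component FSMs while remembering where
# each one lives in the result, and `walk_fsm` follows a sequence of
# transition keys from a start state.
#
#   fsm_a = interegular.parse_pattern(r"abc").to_fsm()
#   fsm_b = interegular.parse_pattern(r"ab").to_fsm()
#   fsm, fsms_to_trans_finals = fsm_union([fsm_a, fsm_b])
#
#   keys = [fsm.alphabet[c] for c in "ab"]
#   states = walk_fsm(fsm, keys, fsm.initial, full_match=False)
#   # `states` lists the state reached after each key, or [] if the walk
#   # cannot succeed under the requested matching mode.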
From eabca69013298145acd09dcda2eb4fc2d58d9c2d Mon Sep 17 00:00:00 2001
From: Andrew Lapp
Date: Thu, 10 Oct 2024 15:03:16 -0400
Subject: [PATCH 4/6] update dependencies: torch is required, pin
 outlines-core==0.1.14

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index ac94ecf57..fa7005afd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,7 +40,8 @@ dependencies = [
     "typing_extensions",
     "pycountry",
     "airportsdata",
-    "outlines_core==0.1.0",
+    "torch",
+    "outlines_core==0.1.14",
 ]
 dynamic = ["version"]
 
@@ -61,7 +62,6 @@ test = [
     "huggingface_hub",
     "openai>=1.0.0",
     "vllm; sys_platform != 'darwin'",
-    "torch",
     "transformers",
     "pillow",
     "exllamav2",

From 6f36b71e995cec4ff17d6d4d4fca28dcf5122cef Mon Sep 17 00:00:00 2001
From: Andrew Lapp
Date: Fri, 11 Oct 2024 10:13:40 -0400
Subject: [PATCH 5/6] update RegexGuide to conform with outlines-core

---
 benchmarks/bench_json_schema.py                |  2 +-
 benchmarks/bench_regex_guide.py                |  4 +--
 outlines/fsm/guide.py                          | 26 +++++++++++--------
 outlines/processors/structured.py              |  2 +-
 tests/fsm/test_guide.py                        | 10 +++----
 tests/generate/test_integration_llamacpp.py    |  4 +--
 .../generate/test_integration_transformers.py  |  4 +--
 7 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/benchmarks/bench_json_schema.py b/benchmarks/bench_json_schema.py
index 8990b015c..62d9b3c1d 100644
--- a/benchmarks/bench_json_schema.py
+++ b/benchmarks/bench_json_schema.py
@@ -77,4 +77,4 @@ def time_json_schema_to_regex(self, schema_name):
     @cache_disabled()
     def time_json_schema_to_fsm(self, schema_name):
         regex = build_regex_from_schema(self.schema)
-        RegexGuide(regex, self.tokenizer)
+        RegexGuide.from_regex(regex, self.tokenizer)

diff --git a/benchmarks/bench_regex_guide.py b/benchmarks/bench_regex_guide.py
index 7aaef6bac..fa23a724f 100644
--- a/benchmarks/bench_regex_guide.py
+++ b/benchmarks/bench_regex_guide.py
@@ -25,7 +25,7 @@ def setup(self, pattern_name):
 
     @cache_disabled()
     def time_regex_to_guide(self, pattern_name):
-        RegexGuide(self.pattern, self.tokenizer)
+        RegexGuide.from_regex(self.pattern, self.tokenizer)
 
 
 class MemoryRegexGuideBenchmark:
@@ -37,4 +37,4 @@ def setup(self, pattern_name):
 
     @cache_disabled()
     def peakmem_regex_to_guide(self, pattern_name):
-        RegexGuide(self.pattern, self.tokenizer)
+        RegexGuide.from_regex(self.pattern, self.tokenizer)

diff --git a/outlines/fsm/guide.py b/outlines/fsm/guide.py
index 697597234..d46228fe9 100644
--- a/outlines/fsm/guide.py
+++ b/outlines/fsm/guide.py
@@ -74,8 +74,8 @@ def copy(self):
 
 
 @cache()
-def create_states_mapping(regex_string, tokenizer):
-    return uncached_create_states_mapping(regex_string, tokenizer)
+def cached_create_states_mapping(regex_string, tokenizer, *args, **kwargs):
+    return uncached_create_states_mapping(regex_string, tokenizer, *args, **kwargs)
 
 
 class RegexGuide(CoreRegexGuide):
@@ -84,15 +84,19 @@ class RegexGuide(CoreRegexGuide):
     CoreRegexGuide with outlines cache
     """
 
-    def __init__(self, regex_string: str, tokenizer: "Tokenizer"):
-        (
-            self.states_to_token_maps,
-            self.empty_token_ids,
-            fsm_finals,
-        ) = create_states_mapping(regex_string, tokenizer)
-        self.eos_token_id = tokenizer.eos_token_id
-        self.final_states = fsm_finals | {-1}
-        self._cache_state_to_token_tensor()
+    @classmethod
+    def from_regex(
+        cls,
+        regex_string: str,
+        tokenizer,
+        **kwargs,
+    ):
+        return super().from_regex(
+            regex_string,
+            tokenizer,
+            _create_states_mapping=cached_create_states_mapping,
+            **kwargs,
+        )
 
 
 CFGState = collections.namedtuple("CFGState", ["parser_state", "prev_token"])

diff --git a/outlines/processors/structured.py b/outlines/processors/structured.py
index e3b9e60d3..d2bc15f77 100644
--- a/outlines/processors/structured.py
+++ b/outlines/processors/structured.py
@@ -149,7 +149,7 @@ def __init__(self, regex_string: str, tokenizer: "Tokenizer"):
         tokenizer
             An Outlines tokenizer
         """
-        guide = RegexGuide(regex_string, tokenizer)
+        guide = RegexGuide.from_regex(regex_string, tokenizer)
 
         super().__init__(tokenizer=tokenizer, guide=guide)

diff --git a/tests/fsm/test_guide.py b/tests/fsm/test_guide.py
index 67b4e0dd8..510faf4b0 100644
--- a/tests/fsm/test_guide.py
+++ b/tests/fsm/test_guide.py
@@ -43,7 +43,7 @@ def convert_token_to_string(self, token):
 
     regex_str = "[1-9]"
     with pytest.raises(ValueError, match="The vocabulary"):
-        RegexGuide(regex_str, MockTokenizer())
+        RegexGuide.from_regex(regex_str, MockTokenizer())
 
 
 def test_regex():
@@ -57,7 +57,7 @@ def convert_token_to_string(self, token):
 
     regex_str = "[1-9]"
     tokenizer = MockTokenizer()
-    fsm = RegexGuide(regex_str, tokenizer)
+    fsm = RegexGuide.from_regex(regex_str, tokenizer)
 
     assert fsm.states_to_token_maps == {0: {1: 1}}
@@ -98,7 +98,7 @@ def convert_token_to_string(self, token):
 
     regex_str = "[😁-😎]"
     tokenizer = MockTokenizer()
-    fsm = RegexGuide(regex_str, tokenizer)
+    fsm = RegexGuide.from_regex(regex_str, tokenizer)
 
     assert fsm.states_to_token_maps == {
         0: {5: 1, 4: 2},
@@ -145,7 +145,7 @@ def convert_token_to_string(self, token):
 
     regex_str = " [😁-😎]"
     tokenizer = MockTokenizer()
-    fsm = RegexGuide(regex_str, tokenizer)
+    fsm = RegexGuide.from_regex(regex_str, tokenizer)
 
     assert fsm.states_to_token_maps == {
         0: {5: 1, 10: 2},
@@ -180,7 +180,7 @@ def convert_token_to_string(self, token):
 
     regex_str = r"`\n(\.\n)?`\n"
     tokenizer = MockTokenizer()
-    fsm = RegexGuide(regex_str, tokenizer)
+    fsm = RegexGuide.from_regex(regex_str, tokenizer)
 
     state = fsm.get_next_state(state=4, token_id=103)
     assert state == 5

diff --git a/tests/generate/test_integration_llamacpp.py b/tests/generate/test_integration_llamacpp.py
index 08521c672..8d4596d60 100644
--- a/tests/generate/test_integration_llamacpp.py
+++ b/tests/generate/test_integration_llamacpp.py
@@ -278,7 +278,7 @@ def test_RegexGuide_caching(model, temp_cache_dir):
     import llama_cpp
 
     import outlines.caching
-    from outlines.fsm.guide import create_states_mapping
+    from outlines.fsm.guide import cached_create_states_mapping
 
     assert outlines.caching._caching_enabled
 
@@ -291,7 +291,7 @@ def test_RegexGuide_caching(model, temp_cache_dir):
     _ = cache.stats(enable=True)
     assert cache.statistics
 
-    assert create_states_mapping.__memory__ is cache
+    assert cached_create_states_mapping.__memory__ is cache
 
     generator = generate.regex(model, regex, sampler=samplers.greedy())
     assert cache.stats() == (0, 1)

diff --git a/tests/generate/test_integration_transformers.py b/tests/generate/test_integration_transformers.py
index 1d26a9ee4..2462d9fcf 100644
--- a/tests/generate/test_integration_transformers.py
+++ b/tests/generate/test_integration_transformers.py
@@ -494,7 +494,7 @@ def test_transformers_use_existing_model_and_tokenizer():
 
 def test_RegexGuide_caching(temp_cache_dir):
     import outlines.caching
-    from outlines.fsm.guide import create_states_mapping
+    from outlines.fsm.guide import cached_create_states_mapping
 
     assert outlines.caching._caching_enabled
 
@@ -507,7 +507,7 @@ def test_RegexGuide_caching(temp_cache_dir):
     _ = cache.stats(enable=True)
     assert cache.statistics
 
-    assert create_states_mapping.__memory__ is cache
+    assert cached_create_states_mapping.__memory__ is cache
 
     model = models.transformers(
         "hf-internal-testing/tiny-random-XLMRobertaXLForCausalLM", device="cpu"
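For downstream code, the change in this patch amounts to swapping the direct constructor call for the new classmethod, and renaming `create_states_mapping` to `cached_create_states_mapping` where it is imported. A minimal migration sketch (the regex is illustrative, and `tokenizer` stands in for whatever Outlines tokenizer you already pass):

```python
from outlines.fsm.guide import RegexGuide

regex_string = r"[0-9]+"  # illustrative pattern

# Before this patch series:
# guide = RegexGuide(regex_string, tokenizer)

# After this patch series:
guide = RegexGuide.from_regex(regex_string, tokenizer)
```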
From 969887ee3b626380921fa0cb8f6360cb11ff3ed9 Mon Sep 17 00:00:00 2001
From: Andrew Lapp
Date: Mon, 14 Oct 2024 13:03:26 -0400
Subject: [PATCH 6/6] test fsm_union and walk_fsm

---
 tests/fsm/test_parsing.py | 101 ++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/tests/fsm/test_parsing.py b/tests/fsm/test_parsing.py
index 3f4c1ba42..b7446fa0c 100644
--- a/tests/fsm/test_parsing.py
+++ b/tests/fsm/test_parsing.py
@@ -204,3 +204,104 @@ def test_sequential_parse_example(cleanup_lark_import):

        if i + 1 == len(input_tokens):
            assert all(tk in next_vocab for tk in ["\n", "\nde", " ", " + 1"])


# TODO: Remove once fsm_union and walk_fsm are implemented in Outlines-Core
import interegular  # noqa

from outlines.fsm.parsing import fsm_union, walk_fsm  # noqa


def test_outlines_interegular_union_consistency():
    fsm0 = interegular.parse_pattern(r"abc").to_fsm()
    fsm1 = interegular.parse_pattern(r"WXYZ").to_fsm()
    fsm2 = interegular.parse_pattern(r"12345").to_fsm()

    interegular_unioned_fsm = fsm0 | fsm1 | fsm2
    outlines_unioned_fsm, _ = fsm_union([fsm0, fsm1, fsm2])

    assert list(outlines_unioned_fsm.strings()) == list(
        interegular_unioned_fsm.strings()
    )


def _reconstruct_fsms(fsm, fsms_to_trans_finals):
    """Reconstruct the original FSMs, for testing purposes."""
    reconstructed_fsms = []
    for transitions, finals, state_map in fsms_to_trans_finals.values():
        inv_state_map = {new: orig for orig, news in state_map.items() for new in news}
        states = set(inv_state_map.values())
        initial = inv_state_map.get(fsm.initial) or next(
            (orig for orig, news in state_map.items() if fsm.initial in news), None
        )
        finals = {inv_state_map[s] for s in finals}

        transition_map = {}
        alphabet = {}
        for trans_id, (from_state, to_state) in enumerate(transitions):
            orig_from, orig_to = inv_state_map[from_state], inv_state_map[to_state]
            # Collect symbols associated with the transition
            symbols = {
                symbol
                for trans, dest in fsm.map.get(from_state, {}).items()
                if dest == to_state
                for symbol in fsm.alphabet.by_transition.get(trans, [])
            }
            if symbols:
                # NOTE: this reconstruction doesn't work for more than one
                # transition per symbol
                assert len(symbols) == 1
                symbol = list(symbols)[0]
                alphabet[symbol] = trans_id
                transition_map.setdefault(orig_from, {})[trans_id] = orig_to

        reconstructed_fsms.append(
            interegular.fsm.FSM(
                alphabet=interegular.fsm.Alphabet(alphabet),
                states=frozenset(states),
                initial=initial,
                finals=frozenset(finals),
                map=transition_map,
                __no_validation__=True,
            )
        )
    return reconstructed_fsms


def test_fsm_to_trans_finals_reconstruction():
    """Assert that _fsms_to_trans_finals is correct by reconstructing the original FSMs"""
    fsm0 = interegular.parse_pattern(r"abc").to_fsm()
    fsm1 = interegular.parse_pattern(r"XYZ").to_fsm()
    fsm2 = interegular.parse_pattern(r"12345").to_fsm()

    fsm, _fsms_to_trans_finals = fsm_union([fsm0, fsm1, fsm2])

    reconstructed = _reconstruct_fsms(fsm, _fsms_to_trans_finals)

    # assert the reconstruction is equivalent to the original FSMs
    assert list(fsm0.strings()) == list(reconstructed[0].strings())
    assert list(fsm1.strings()) == list(reconstructed[1].strings())
    assert list(fsm2.strings()) == list(reconstructed[2].strings())


def test_walk_fsm():
    fsm = interegular.parse_pattern(r"abc*d").to_fsm()
    # convert to BetterFSM
    fsm = fsm_union([fsm])[0]

    # on a match, produce an equivalent number of states and assert the walk
    # ends in a final state
    transitions = [fsm.alphabet[letter] for letter in "abcccd"]
    accepted_states = walk_fsm(fsm, transitions, fsm.initial, full_match=True)
    assert len(accepted_states) == len(transitions)
    assert accepted_states[-1] in fsm.finals

    # on no match, assert the result is empty
    accepted_states = walk_fsm(
        fsm, [fsm.alphabet[letter] for letter in "b"], fsm.initial, full_match=True
    )
    assert accepted_states == []

    # with full_match, if the walk does not end in a final state, assert the
    # result is empty
    accepted_states = walk_fsm(
        fsm, [fsm.alphabet[letter] for letter in "abc"], fsm.initial, full_match=True
    )
    assert accepted_states == []
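# A companion sketch for `get_sub_fsms_from_seq` (illustrative only, not part
# of the patch): because the function inspects consecutive state pairs, the
# state sequence should include the start state in front of walk_fsm's output.
#
#   fsm, fsms_to_trans_finals = fsm_union([fsm0, fsm1])
#   keys = [fsm.alphabet[c] for c in "abc"]
#   state_seq = [fsm.initial] + walk_fsm(fsm, keys, fsm.initial)
#   for fsm_idx, can_continue, is_final in get_sub_fsms_from_seq(
#       state_seq, fsms_to_trans_finals
#   ):
#       ...  # component FSMs that could have matched the walk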