From 6ec4ac596a5e004e8dd4fa79adbc3fe1a9bcf521 Mon Sep 17 00:00:00 2001
From: Chris McMaster
Date: Mon, 11 Nov 2024 16:46:06 +1100
Subject: [PATCH] Update `mlx-lm` kvcache creation

---
 outlines/models/mlxlm.py   |   7 +--
 pyproject.toml             |   2 +-
 tests/models/test_mlxlm.py | 100 +++++++++++++++++++++++++++++++++++++
 3 files changed, 102 insertions(+), 7 deletions(-)
 create mode 100644 tests/models/test_mlxlm.py

diff --git a/outlines/models/mlxlm.py b/outlines/models/mlxlm.py
index 6e63ef5b6..d8b7e032c 100644
--- a/outlines/models/mlxlm.py
+++ b/outlines/models/mlxlm.py
@@ -167,12 +167,7 @@ def sample(logits: "mx.array") -> Tuple["mx.array", float]:
             prob = softmax_logits[0, token]
             return token, prob
 
-        kv_heads = (
-            [self.model.n_kv_heads] * len(self.model.layers)
-            if isinstance(self.model.n_kv_heads, int)
-            else self.model.n_kv_heads
-        )
-        cache = [mlx_lm.models.base.KVCache(self.model.head_dim, n) for n in kv_heads]
+        cache = mlx_lm.models.cache.make_prompt_cache(self.model)
 
         # kv cache contains processed input IDs, we pass the unprocessed inputs and cache to model()
         unprocessed_input_ids = prompt
diff --git a/pyproject.toml b/pyproject.toml
index 1fd2897aa..e4f12f76c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -58,7 +58,7 @@ test = [
     "beartype<0.16.0",
     "responses",
     "llama-cpp-python",
-    "mlx-lm; platform_machine == 'arm64' and sys_platform == 'darwin'",
+    "mlx-lm>=0.19.2; platform_machine == 'arm64' and sys_platform == 'darwin'",
     "huggingface_hub",
     "openai>=1.0.0",
     "vllm; sys_platform != 'darwin'",
diff --git a/tests/models/test_mlxlm.py b/tests/models/test_mlxlm.py
new file mode 100644
index 000000000..20e59da81
--- /dev/null
+++ b/tests/models/test_mlxlm.py
@@ -0,0 +1,100 @@
+import pytest
+
+from outlines.models.mlxlm import mlxlm
+from outlines.models.transformers import TransformerTokenizer
+
+try:
+    import mlx.core as mx
+
+    HAS_MLX = mx.metal.is_available()
+except ImportError:
+    HAS_MLX = False
+
+
+TEST_MODEL = "mlx-community/SmolLM-135M-Instruct-4bit"
+
+
+@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
+def test_mlxlm_model():
+    model = mlxlm(TEST_MODEL)
+    assert hasattr(model, "model")
+    assert hasattr(model, "tokenizer")
+    assert isinstance(model.tokenizer, TransformerTokenizer)
+
+
+@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
+def test_mlxlm_tokenizer():
+    model = mlxlm(TEST_MODEL)
+
+    # Test single string encoding/decoding
+    test_text = "Hello, world!"
+    token_ids = mx.array(model.mlx_tokenizer.encode(test_text))
+    assert isinstance(token_ids, mx.array)
+
+
+@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
+def test_mlxlm_generate():
+    from outlines.generate.api import GenerationParameters, SamplingParameters
+
+    model = mlxlm(TEST_MODEL)
+    prompt = "Write a haiku about programming:"
+
+    # Test with basic generation parameters
+    gen_params = GenerationParameters(max_tokens=50, stop_at=None, seed=None)
+
+    # Test with different sampling parameters
+    sampling_params = SamplingParameters(
+        sampler="multinomial", num_samples=1, top_p=0.9, top_k=None, temperature=0.7
+    )
+
+    # Test generation
+    output = model.generate(prompt, gen_params, None, sampling_params)
+    assert isinstance(output, str)
+    assert len(output) > 0
+
+
+@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
+def test_mlxlm_stream():
+    from outlines.generate.api import GenerationParameters, SamplingParameters
+
+    model = mlxlm(TEST_MODEL)
+    prompt = "Count from 1 to 5:"
+
+    gen_params = GenerationParameters(max_tokens=20, stop_at=None, seed=None)
+
+    sampling_params = SamplingParameters(
+        sampler="greedy",  # Use greedy sampling for deterministic output
+        num_samples=1,
+        top_p=None,
+        top_k=None,
+        temperature=0.0,
+    )
+
+    # Test streaming
+    stream = model.stream(prompt, gen_params, None, sampling_params)
+    tokens = list(stream)
+    assert len(tokens) > 0
+    assert all(isinstance(token, str) for token in tokens)
+
+    # Test that concatenated streaming output matches generate output
+    streamed_text = "".join(tokens)
+    generated_text = model.generate(prompt, gen_params, None, sampling_params)
+    assert streamed_text == generated_text
+
+
+@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
+def test_mlxlm_errors():
+    model = mlxlm(TEST_MODEL)
+
+    # Test batch inference (should raise NotImplementedError)
+    with pytest.raises(NotImplementedError):
+        from outlines.generate.api import GenerationParameters, SamplingParameters
+
+        gen_params = GenerationParameters(max_tokens=10, stop_at=None, seed=None)
+        sampling_params = SamplingParameters("multinomial", 1, None, None, 1.0)
+        model.generate(["prompt1", "prompt2"], gen_params, None, sampling_params)
+
+    # Test beam search (should raise NotImplementedError)
+    with pytest.raises(NotImplementedError):
+        sampling_params = SamplingParameters("beam_search", 1, None, None, 1.0)
+        model.generate("test prompt", gen_params, None, sampling_params)
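
Note (not part of the patch): a minimal sketch of the mlx-lm cache API this change moves to, assuming mlx-lm >= 0.19.2 as pinned in pyproject.toml above. mlx_lm.load is the library's standard loader and is not part of this diff; the model name is the one used by the new tests, and the commented-out block mirrors the construction removed from outlines/models/mlxlm.py.

# Sketch only: how make_prompt_cache() replaces the manual per-layer
# KVCache construction (assumes mlx-lm >= 0.19.2).
import mlx_lm
from mlx_lm.models.cache import make_prompt_cache

# Same model the new tests load.
model, tokenizer = mlx_lm.load("mlx-community/SmolLM-135M-Instruct-4bit")

# Removed approach: build one KVCache per layer by hand from the model's
# head_dim / n_kv_heads attributes (the code deleted in this patch):
#   kv_heads = (
#       [model.n_kv_heads] * len(model.layers)
#       if isinstance(model.n_kv_heads, int)
#       else model.n_kv_heads
#   )
#   cache = [mlx_lm.models.base.KVCache(model.head_dim, n) for n in kv_heads]

# New approach: the library builds the per-layer cache list itself, so the
# caller no longer needs head_dim / n_kv_heads bookkeeping.
cache = make_prompt_cache(model)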