From 04fc5594a5276469a5c9304668941cc0c8ff826f Mon Sep 17 00:00:00 2001
From: Andrew Lapp
Date: Fri, 17 May 2024 09:33:45 -0500
Subject: [PATCH] circumvent broken llama.cpp pre-tokenizer

---
 outlines/integrations/llamacpp.py           | 16 ++++++++---
 outlines/models/llamacpp.py                 | 11 +++++++
 tests/generate/test_integration_llamacpp.py | 32 +++++++++++++++++++++
 3 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/outlines/integrations/llamacpp.py b/outlines/integrations/llamacpp.py
index 8c000a6e5..4041c54fb 100644
--- a/outlines/integrations/llamacpp.py
+++ b/outlines/integrations/llamacpp.py
@@ -49,11 +49,19 @@ def __init__(self, model: "Llama"):
         self.special_tokens: Set[int] = set()
 
         self.vocabulary: Dict[str, int] = dict()
-        for t in range(model.n_vocab()):
-            token_piece = model.tokenizer().decode([t])
-            self.vocabulary[token_piece] = t
 
-        self.decode = model.tokenizer().decode
+        tokenizer = model.tokenizer()
+
+        self.decode = tokenizer.decode
+
+        # TODO: Remove when https://github.com/ggerganov/llama.cpp/pull/5613 is resolved
+        try:
+            self.vocabulary = model.tokenizer_.hf_tokenizer.get_vocab()
+        except AttributeError:
+            # no HF tokenizer attached: fall back to decoding each token id
+            for t in range(model.n_vocab()):
+                token_piece = model.tokenizer().decode([t])
+                self.vocabulary[token_piece] = t
 
     def convert_token_to_string(self, token: str) -> str:
         return token
diff --git a/outlines/models/llamacpp.py b/outlines/models/llamacpp.py
index 8a6a53a27..5920f08d6 100644
--- a/outlines/models/llamacpp.py
+++ b/outlines/models/llamacpp.py
@@ -1,4 +1,5 @@
 import dataclasses
+import warnings
 from typing import TYPE_CHECKING, Iterator, List, Optional, TypedDict, Union
 
 from typing_extensions import Unpack
@@ -288,6 +289,16 @@ def llamacpp(
     if "verbose" not in llamacpp_model_params:
         llamacpp_model_params["verbose"] = False
 
+    # TODO: Remove when https://github.com/ggerganov/llama.cpp/pull/5613 is resolved
+    if "tokenizer" not in llamacpp_model_params:
+        warnings.warn(
+            "The pre-tokenizer in `llama.cpp` handles unicode improperly "
+            + "(https://github.com/ggerganov/llama.cpp/pull/5613)\n"
+            + "Outlines may raise a `RuntimeError` when building the regex index.\n"
+            + "To circumvent this error when using `models.llamacpp()` you may pass the argument "
+            + "`tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained()`\n"
+        )
+
     model = Llama.from_pretrained(repo_id, filename, **llamacpp_model_params)
 
     return LlamaCpp(model)
diff --git a/tests/generate/test_integration_llamacpp.py b/tests/generate/test_integration_llamacpp.py
index d036b560f..75d0e4cef 100644
--- a/tests/generate/test_integration_llamacpp.py
+++ b/tests/generate/test_integration_llamacpp.py
@@ -247,3 +247,35 @@ def test_llamacpp_cfg(model):
     prompt = "<|im_start|>user\nOutput a short and valid JSON object with two keys.<|im_end|>\n><|im_start|>assistant\n"
     result = generate.cfg(model, grammars.arithmetic)(prompt, seed=11)
     assert isinstance(result, str)
+
+
+@pytest.mark.parametrize(
+    "repo,model_path,hf_tokenizer_uri",
+    [
+        ("Qwen/Qwen1.5-0.5B-Chat-GGUF", "*q2*.gguf", "Qwen/Qwen1.5-0.5B-Chat"),
+        ("TheBloke/phi-2-GGUF", "*Q2*.gguf", "microsoft/phi-2"),
+    ],
+)
+def test_byte_tokenizer_regression(repo, model_path, hf_tokenizer_uri):
+    """Reproduce https://github.com/outlines-dev/outlines/issues/820"""
+    import llama_cpp
+
+    model = models.llamacpp(
+        repo,
+        model_path,
+        tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
+            hf_tokenizer_uri
+        ),
+    )
+    generator = generate.choice(model, ["skirt", "dress", "pen", "jacket"])
+    generator("Pick the odd word out: skirt, dress, pen, jacket")
+
+
+def test_llama_cpp_pre_tokenizer_remains_broken():
+    """If this fails, the llama.cpp pre-tokenizer is fixed: revert #892 and remove `with pytest.raises`"""
+    repo = "Qwen/Qwen1.5-0.5B-Chat-GGUF"
+    model_path = "*q2*.gguf"
+
+    model = models.llamacpp(repo, model_path)
+    with pytest.raises(RuntimeError):
+        generate.choice(model, ["skirt", "dress", "pen", "jacket"])
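
---

Usage sketch of the workaround the new warning points users to, for reviewers
who want to try it locally. This is not part of the patch; it mirrors
`test_byte_tokenizer_regression` above and assumes a recent `llama-cpp-python`
that exposes `llama_tokenizer.LlamaHFTokenizer`:

    import llama_cpp
    from outlines import generate, models

    # Load the GGUF weights, but tokenize with the original HF tokenizer,
    # sidestepping the broken llama.cpp pre-tokenizer
    # (https://github.com/ggerganov/llama.cpp/pull/5613).
    model = models.llamacpp(
        "Qwen/Qwen1.5-0.5B-Chat-GGUF",
        "*q2*.gguf",
        tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
            "Qwen/Qwen1.5-0.5B-Chat"
        ),
    )

    # Building the regex index now succeeds instead of raising a RuntimeError.
    generator = generate.choice(model, ["skirt", "dress", "pen", "jacket"])
    print(generator("Pick the odd word out: skirt, dress, pen, jacket"))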