circumvent broken llama.cpp pre-tokenizer
lapp0 committed May 17, 2024
commit 04fc559 (1 parent: 499d19d)
Showing 3 changed files with 55 additions and 4 deletions.
outlines/integrations/llamacpp.py: 12 additions & 4 deletions
@@ -49,11 +49,19 @@ def __init__(self, model: "Llama"):
         self.special_tokens: Set[int] = set()
 
         self.vocabulary: Dict[str, int] = dict()
-        for t in range(model.n_vocab()):
-            token_piece = model.tokenizer().decode([t])
-            self.vocabulary[token_piece] = t
 
-        self.decode = model.tokenizer().decode
+        tokenizer = model.tokenizer()
+
+        self.decode = tokenizer.decode
+
+        # TODO: Remove when https://github.com/ggerganov/llama.cpp/pull/5613 is resolved
+        try:
+            self.vocabulary = model.tokenizer_.hf_tokenizer.get_vocab()
+        except AttributeError:
+            # Fall back to building the vocabulary token by token via decode
+            for t in range(model.n_vocab()):
+                token_piece = model.tokenizer().decode([t])
+                self.vocabulary[token_piece] = t
 
     def convert_token_to_string(self, token: str) -> str:
         return token
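For context on the `try` branch above: when the model is constructed with a Hugging Face tokenizer, `llama-cpp-python` exposes it as `model.tokenizer_`, and `hf_tokenizer.get_vocab()` returns the whole vocabulary as a `Dict[str, int]` in one call, sidestepping llama.cpp's per-token `decode`. A minimal sketch contrasting the two paths (not part of the diff; repo ids are borrowed from the tests added below):

import llama_cpp

# Attach the original HF tokenizer so the vocabulary can be read directly.
model = llama_cpp.Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
    filename="*q2*.gguf",
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
        "Qwen/Qwen1.5-0.5B-Chat"
    ),
)

# Preferred path: the HF tokenizer knows the full vocabulary.
vocab = model.tokenizer_.hf_tokenizer.get_vocab()  # Dict[str, int]

# Fallback path: decode each token id individually; llama.cpp's broken
# pre-tokenizer can mangle multi-byte unicode pieces here.
pieces = {model.tokenizer().decode([t]): t for t in range(model.n_vocab())}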
outlines/models/llamacpp.py: 11 additions & 0 deletions
@@ -1,4 +1,5 @@
 import dataclasses
+import warnings
 from typing import TYPE_CHECKING, Iterator, List, Optional, TypedDict, Union
 
 from typing_extensions import Unpack
@@ -288,6 +289,16 @@ def llamacpp(
     if "verbose" not in llamacpp_model_params:
         llamacpp_model_params["verbose"] = False
 
+    # TODO: Remove when https://github.com/ggerganov/llama.cpp/pull/5613 is resolved
+    if "tokenizer" not in llamacpp_model_params:
+        warnings.warn(
+            "The pre-tokenizer in `llama.cpp` handles unicode improperly "
+            + "(https://github.com/ggerganov/llama.cpp/pull/5613)\n"
+            + "Outlines may raise a `RuntimeError` when building the regex index.\n"
+            + "To circumvent this error when using `models.llamacpp()` you may pass the argument "
+            + "`tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(<hf_repo_id>)`\n"
+        )
+
     model = Llama.from_pretrained(repo_id, filename, **llamacpp_model_params)
 
     return LlamaCpp(model)
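Downstream, the workaround the warning recommends looks like this; it mirrors the new regression test below (a sketch, reusing that test's Qwen repo ids):

import llama_cpp
from outlines import models

# Passing `tokenizer` through llamacpp_model_params suppresses the warning
# and avoids the broken pre-tokenizer entirely.
model = models.llamacpp(
    "Qwen/Qwen1.5-0.5B-Chat-GGUF",
    "*q2*.gguf",
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
        "Qwen/Qwen1.5-0.5B-Chat"
    ),
)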
tests/generate/test_integration_llamacpp.py: 32 additions & 0 deletions
@@ -247,3 +247,35 @@ def test_llamacpp_cfg(model):
     prompt = "<|im_start|>user\nOutput a short and valid JSON object with two keys.<|im_end|>\n><|im_start|>assistant\n"
     result = generate.cfg(model, grammars.arithmetic)(prompt, seed=11)
     assert isinstance(result, str)
+
+
+@pytest.mark.parametrize(
+    "repo,model_path,hf_tokenizer_uri",
+    [
+        ("Qwen/Qwen1.5-0.5B-Chat-GGUF", "*q2*.gguf", "Qwen/Qwen1.5-0.5B-Chat"),
+        ("TheBloke/phi-2-GGUF", "*Q2*.gguf", "microsoft/phi-2"),
+    ],
+)
+def test_byte_tokenizer_regression(repo, model_path, hf_tokenizer_uri):
+    """Reproduce https://github.com/outlines-dev/outlines/issues/820"""
+    import llama_cpp
+
+    model = models.llamacpp(
+        repo,
+        model_path,
+        tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
+            hf_tokenizer_uri
+        ),
+    )
+    generator = generate.choice(model, ["skirt", "dress", "pen", "jacket"])
+    generator("Pick the odd word out: skirt, dress, pen, jacket")
+
+
+def test_llama_cpp_pre_tokenizer_remains_broken():
+    """If this fails, the llama.cpp pre-tokenizer is fixed -> revert #892 and remove `with pytest.raises`."""
+    repo = "Qwen/Qwen1.5-0.5B-Chat-GGUF"
+    model_path = "*q2*.gguf"
+
+    model = models.llamacpp(repo, model_path)
+    with pytest.raises(RuntimeError):
+        generate.choice(model, ["skirt", "dress", "pen", "jacket"])
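A quick way to check outside pytest whether the upstream fix has landed, mirroring the second test above (a sketch; it downloads the quantized Qwen model):

from outlines import generate, models

model = models.llamacpp("Qwen/Qwen1.5-0.5B-Chat-GGUF", "*q2*.gguf")
try:
    # Building the index for structured generation trips over the broken
    # pre-tokenizer when no HF tokenizer is attached.
    generate.choice(model, ["skirt", "dress", "pen", "jacket"])
except RuntimeError as err:
    print(f"llama.cpp pre-tokenizer still broken: {err}")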
