From ff76c6209b697880eee231e74f42e016167da794 Mon Sep 17 00:00:00 2001
From: Charles Goddard
Date: Sat, 5 Oct 2024 12:43:33 -0700
Subject: [PATCH] Handle merges stored as list instead of space-separated string

---
 mergekit/tokenizer/build.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/mergekit/tokenizer/build.py b/mergekit/tokenizer/build.py
index fb9f9d9c..3cefed91 100644
--- a/mergekit/tokenizer/build.py
+++ b/mergekit/tokenizer/build.py
@@ -90,7 +90,12 @@ def get_stripped_tokenizer(
             del tok_dict["model"]["vocab"][tok]
 
     def _keep_merge(m):
-        toks = m.split(" ")
+        if isinstance(m, str) and m.count(" ") == 1:
+            toks = m.split(" ")
+        elif isinstance(m, list):
+            toks = m
+        else:
+            raise RuntimeError(f"Unexpected merge format: {repr(m)} ({type(m)})")
         for tok in toks:
             if tok in unused_toks:
                 return False