Skip to content

Commit

Permalink
Handle merges stored as list instead of space-separated string
Browse files (browse the repository at this point in the history)
  • Loading branch information
cg123 committed Oct 5, 2024
1 parent 8522917 commit ff76c62
Showing 1 changed file with 6 additions and 1 deletion.
7 changes (6 additions & 1 deletion) in mergekit/tokenizer/build.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -90,7 +90,12 @@ def get_stripped_tokenizer(
del tok_dict["model"]["vocab"][tok]

def _keep_merge(m):
toks = m.split(" ")
if isinstance(m, str) and m.count(" ") == 1:
toks = m.split(" ")
elif isinstance(m, list):
toks = m
else:
raise RuntimeError(f"Unexpected merge format: {repr(m)} ({type(m)})")
for tok in toks:
if tok in unused_toks:
return False
Expand Down

0 comments on commit ff76c62

Please sign in to comment.