Skip to content

Commit

Permalink
ensure null bytes added to empty_token_ids
Browse files: browse the repository at this point in the history
  • Loading branch information
Andrew Lapp committed May 15, 2024
1 parent d2066ad commit 884f921
Showing 1 changed file with 4 additions and 7 deletions.
11 changes: 4 additions & 7 deletions outlines/fsm/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -789,7 +789,10 @@ def reduced_vocabulary(
token
)

if token_str:
# numpy doesn't track null bytes in arrays
# np.fromiter('\x00' ...) results in an empty string [""]
# https://github.com/numpy/numpy/issues/26275
if token_str and token_str != "\x00":
# invalid utf-8 sequences are replaced with � (\ufffd), but there
# might also be tokens specifically for �, ��, ���, etc.
if "\ufffd" in token_str and not re_replacement_seq.match(token):
Expand Down Expand Up @@ -825,12 +828,6 @@ def reduced_vocabulary(
)
)
for token_tuple, token_ids in vocabulary.items():
# numpy doesn't track null bytes in arrays
# np.fromiter('\x00' ...) results in an empty string [""]
# https://github.com/numpy/numpy/issues/26275
if token_tuple == "\x00":
continue

token_tuple_np = np.fromiter(token_tuple, dtype=np.dtype("U2"))
token_ids_np = np.fromiter(token_ids, dtype=np.dtype("int64"))
vocabulary_nb.append((token_tuple_np, token_ids_np))
Expand Down

0 comments on commit 884f921

Please sign in to comment.