diff --git a/outlines/fsm/regex.py b/outlines/fsm/regex.py index 8dbbd79b1..b83d1f11b 100644 --- a/outlines/fsm/regex.py +++ b/outlines/fsm/regex.py @@ -825,6 +825,12 @@ def reduced_vocabulary( ) ) for token_tuple, token_ids in vocabulary.items(): + # numpy doesn't track null bytes in arrays + # np.fromiter('\x00' ...) results in an empty string [""] + # https://github.com/numpy/numpy/issues/26275 + if token_tuple == "\x00": + continue + token_tuple_np = np.fromiter(token_tuple, dtype=np.dtype("U2")) token_ids_np = np.fromiter(token_ids, dtype=np.dtype("int64")) vocabulary_nb.append((token_tuple_np, token_ids_np))