Ensure null bytes are added to empty_token_ids

lapp0 · May 15, 2024 · 884f921
1 parent d2066ad
commit 884f921
Showing 1 changed file with 4 additions and 7 deletions.
diff --git a/outlines/fsm/regex.py b/outlines/fsm/regex.py
@@ -789,7 +789,10 @@ def reduced_vocabulary(
             token
         )
 
-        if token_str:
+        # numpy doesn't track null bytes in arrays
+        # np.fromiter('\x00' ...) results in an empty string [""]
+        # https://github.com/numpy/numpy/issues/26275
+        if token_str and token_str != "\x00":
             # invalid utf-8 sequences are replaced with � (\ufffd), but there
             # might also be tokens specifically for �, ��, ���, etc.
             if "\ufffd" in token_str and not re_replacement_seq.match(token):
@@ -825,12 +828,6 @@ def reduced_vocabulary(
         )
     )
     for token_tuple, token_ids in vocabulary.items():
-        # numpy doesn't track null bytes in arrays
-        # np.fromiter('\x00' ...) results in an empty string [""]
-        # https://github.com/numpy/numpy/issues/26275
-        if token_tuple == "\x00":
-            continue
-
         token_tuple_np = np.fromiter(token_tuple, dtype=np.dtype("U2"))
         token_ids_np = np.fromiter(token_ids, dtype=np.dtype("int64"))
         vocabulary_nb.append((token_tuple_np, token_ids_np))