Skip to content

Commit

Permalink
Add SP Space handling for decoder
Browse files (browse the repository at this point in the history)
  • Loading branch information
apaniukov committed Nov 20, 2023
1 parent 6c3bae3 commit 5c3b656
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,9 @@ def get_sp_decoder(sp_model_node: Node, streaming_decoder: bool = False) -> Mode
[sp_model_node, token_ids],
).outputs()

if streaming_decoder:
decoder = RegexDecodingStep.replace_sp_spaces().get_ov_subgraph(decoder)

string_output = factory.create("StringTensorPack", decoder).outputs()
string_output[0].tensor.add_names({STRING_OUTPUT_NAME})
tokenizer_decoder = Model(string_output, [token_ids], TOKENIZER_DECODER_NAME)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -692,6 +692,13 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
)
return factory.create("RegexNormalization", input_nodes).outputs()

@classmethod
def replace_sp_spaces(cls) -> "RegexDecodingStep":
    """Alternate constructor: a step that rewrites every "▁" as a plain space.

    The step is built with ``regex_search_pattern="▁"`` and
    ``replace_term=" "``; all other fields keep the class defaults.
    NOTE(review): "▁" is presumably the SentencePiece whitespace marker
    (the "sp" in the method name) - confirm against the tokenizer docs.
    """
    sp_space_rule = {
        "regex_search_pattern": "▁",
        "replace_term": " ",
    }
    return cls(**sp_space_rule)


@dataclass
class TokenizerPipeline:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
"tokenizers_test.py::test_hf_bpe_tokenizers_outputs": 0.88,
"tokenizers_test.py::test_bpe_detokenizer": 0.9529411764705882,
"tokenizers_test.py::test_tiktoken_tokenizers": 0.9,
"tokenizers_test.py::test_": 0.8124118476727785
"tokenizers_test.py::test_": 0.825187969924812
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# os.environ["OV_TOKENIZER_PREBUILD_EXTENSION_PATH"] = "path/to/libuser_ov_extensions.so"

import numpy as np
import openvino
import pytest
from openvino import Core
from transformers import AutoTokenizer
Expand Down Expand Up @@ -321,3 +322,20 @@ def test_tiktoken_detokenizer(tiktoken_detokenizers, test_string):
ov_output = unpack_strings(ov_detokenizer(token_ids.astype("int32"))["string_output"])

assert ov_output == hf_output


def test_streaming_detokenizer():
    """Check that decoding token-by-token reproduces the HF decode output.

    Each token id is fed to the compiled streaming detokenizer on its own,
    and the concatenation of the per-token strings is compared with
    ``hf_tokenizer.decode`` applied to the whole id sequence.
    """
    hf_tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_3b_v2")
    _, detokenizer_model = convert_tokenizer(hf_tokenizer, with_decoder=True, streaming_decoder=True)
    compiled_detokenizer = core.compile_model(detokenizer_model)

    test_string = "this is a test string"
    token_ids = hf_tokenizer(test_string).input_ids
    expected = hf_tokenizer.decode(token_ids)

    # One inference call per token, in order, mimicking a streaming consumer.
    streamed = "".join(
        unpack_strings(compiled_detokenizer(np.atleast_2d(token_id))["string_output"])[0]
        for token_id in token_ids
    )

    assert streamed == expected

0 comments on commit 5c3b656

Please sign in to comment.