diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py
index 53c647d7ce..401c8ea2b9 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py
@@ -437,6 +437,9 @@ def get_sp_decoder(sp_model_node: Node, streaming_decoder: bool = False) -> Mode
         [sp_model_node, token_ids],
     ).outputs()
 
+    if streaming_decoder:
+        decoder = RegexDecodingStep.replace_sp_spaces().get_ov_subgraph(decoder)
+
     string_output = factory.create("StringTensorPack", decoder).outputs()
     string_output[0].tensor.add_names({STRING_OUTPUT_NAME})
     tokenizer_decoder = Model(string_output, [token_ids], TOKENIZER_DECODER_NAME)
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py
index d155bf93ce..74654344ae 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/tokenizer_pipeline.py
@@ -692,6 +692,13 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         )
         return factory.create("RegexNormalization", input_nodes).outputs()
 
+    @classmethod
+    def replace_sp_spaces(cls) -> "RegexDecodingStep":
+        return cls(
+            regex_search_pattern="▁",
+            replace_term=" ",
+        )
+
 
 @dataclass
 class TokenizerPipeline:
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json
index 1ec7a932d4..6d8440fc9e 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/pass_rates.json
@@ -6,5 +6,5 @@
     "tokenizers_test.py::test_hf_bpe_tokenizers_outputs": 0.88,
     "tokenizers_test.py::test_bpe_detokenizer": 0.9529411764705882,
     "tokenizers_test.py::test_tiktoken_tokenizers": 0.9,
-    "tokenizers_test.py::test_": 0.8124118476727785
+    "tokenizers_test.py::test_": 0.825187969924812
 }
\ No newline at end of file
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py
index ca2611a445..57e723babe 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py
@@ -7,6 +7,7 @@
 # os.environ["OV_TOKENIZER_PREBUILD_EXTENSION_PATH"] = "path/to/libuser_ov_extensions.so"
 
 import numpy as np
+import openvino
 import pytest
 from openvino import Core
 from transformers import AutoTokenizer
@@ -321,3 +322,20 @@ def test_tiktoken_detokenizer(tiktoken_detokenizers, test_string):
     ov_output = unpack_strings(ov_detokenizer(token_ids.astype("int32"))["string_output"])
 
     assert ov_output == hf_output
+
+
+def test_streaming_detokenizer():
+    hf_tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_3b_v2")
+    _, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_decoder=True, streaming_decoder=True)
+    ov_detokenizer = core.compile_model(ov_detokenizer)
+
+    test_string = "this is a test string"
+    tokenized_string = hf_tokenizer(test_string).input_ids
+    hf_detokenized = hf_tokenizer.decode(tokenized_string)
+
+    detokenized_string = ""
+    for token in tokenized_string:
+        ov_output = unpack_strings(ov_detokenizer(np.atleast_2d(token))["string_output"])[0]
+        detokenized_string += ov_output
+
+    assert detokenized_string == hf_detokenized