From 138b8184d75f339bc1dc0d0343aec3501827f31d Mon Sep 17 00:00:00 2001
From: Artur Paniukov
Date: Fri, 8 Dec 2023 21:42:45 +0000
Subject: [PATCH] [Tokenizers] Revise Parameters (#777)

* Refactoring

Del node_factory.py
Move from os to Path
Support -e install for _ext_libs_path
Wrap Core.__init__

* Rename with_decoder to with_detokenizer
---
 .../tokenizer/python/README.md                | 31 +++++++++----------
 .../tokenizer/python/ov_tokenizer/__init__.py | 29 +++++++++--------
 .../python/ov_tokenizer/convert_tokenizer.py  |  8 ++---
 .../python/ov_tokenizer/hf_parser.py          | 12 +++----
 .../python/ov_tokenizer/node_factory.py       | 25 ---------------
 .../tokenizer/python/tests/tokenizers_test.py |  6 ++--
 6 files changed, 44 insertions(+), 67 deletions(-)
 delete mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/node_factory.py

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
index fe91c8a9f..ba42683a0 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md
@@ -121,14 +121,13 @@
 import numpy as np
 from openvino import compile_model, convert_model
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from ov_tokenizer import (
-    add_greedy_decoding,
-    convert_tokenizer,
-    init_extension,
-    pack_strings,
-    unpack_strings,
+    add_greedy_decoding,
+    convert_tokenizer,
+    init_extension,
+    pack_strings,
+    unpack_strings,
 )
-
 init_extension("path/to/libuser_ov_extensions.so")

 # Use different repo for the tokenizer because the original repo doesn't have .model file
@@ -140,7 +139,7 @@ hf_model = AutoModelForCausalLM.from_pretrained(model_checkpoint, use_cache=Fals

 # convert hf tokenizer
 text_input = ["Quick brown fox was"]
-ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_decoder=True)
+ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True)
 compiled_tokenizer = compile_model(ov_tokenizer)

 # transform input text into tokens
@@ -156,20 +155,20 @@ compiled_model = compile_model(ov_model_with_greedy_decoding)
 new_tokens_size = 10
 prompt_size = ov_input["input_ids"].shape[-1]
 input_dict = {
-    output.any_name: np.hstack([tensor, np.zeros(shape=(1, new_tokens_size), dtype=np.int_)])
-    for output, tensor in ov_input.items()
+    output.any_name: np.hstack([tensor, np.zeros(shape=(1, new_tokens_size), dtype=np.int_)])
+    for output, tensor in ov_input.items()
 }
 for idx in range(prompt_size, prompt_size + new_tokens_size):
-    output = compiled_model(input_dict)["token_ids"]
-    input_dict["input_ids"][:, idx] = output[:, idx - 1]
-    input_dict["attention_mask"][:, idx] = 1
+    output = compiled_model(input_dict)["token_ids"]
+    input_dict["input_ids"][:, idx] = output[:, idx - 1]
+    input_dict["attention_mask"][:, idx] = 1
 ov_token_ids = input_dict["input_ids"]

 hf_token_ids = hf_model.generate(
-    **hf_input,
-    min_new_tokens=new_tokens_size,
-    max_new_tokens=new_tokens_size,
-    temperature=0,  # greedy decoding
+    **hf_input,
+    min_new_tokens=new_tokens_size,
+    max_new_tokens=new_tokens_size,
+    temperature=0,  # greedy decoding
 )

 # decode model output
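The README hunk above cuts off right at the `# decode model output` comment. For orientation, a minimal sketch of the step that follows, assuming the detokenizer is compiled the same way as the tokenizer; the `"string_output"` output name is an assumption, not confirmed by the hunk:

```python
# Sketch only: compile the detokenizer returned by
# convert_tokenizer(..., with_detokenizer=True), as shown earlier in the README
compiled_detokenizer = compile_model(ov_detokenizer)

# unpack_strings turns the packed OpenVINO string tensor back into Python strings;
# the "string_output" key is assumed here
ov_output = unpack_strings(compiled_detokenizer(ov_token_ids)["string_output"])
hf_output = hf_tokenizer.batch_decode(hf_token_ids, skip_special_tokens=True)
```
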
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py
index a0e5a30a7..f4571c9ae 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/__init__.py
@@ -1,48 +1,51 @@
 # -*- coding: utf-8 -*-
 # Copyright (C) 2018-2023 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-
+import functools
 import os
 import sys
+import sysconfig
+from pathlib import Path

 import openvino
 from openvino.runtime.utils.node_factory import NodeFactory

 from .convert_tokenizer import convert_tokenizer
-from .node_factory import _extension_path, init_extension
 from .str_pack import pack_strings, unpack_strings
 from .utils import add_greedy_decoding, connect_models


+_extension_path = os.environ.get("OV_TOKENIZER_PREBUILD_EXTENSION_PATH")
 _ext_name = "user_ov_extensions"
 if _extension_path:
     # when the path to the extension is set manually
-    _ext_libs_path = os.path.dirname(_extension_path)
+    _ext_libs_path = Path(_extension_path).parent
 else:
     # python installation case
-    _ext_libs_path = os.path.join(os.path.dirname(__file__), "libs")
+    _ext_libs_path = Path(sysconfig.get_paths()["purelib"]) / __name__ / "libs"

 if sys.platform == "win32":
-    _ext_path = os.path.join(_ext_libs_path, f"{_ext_name}.dll")
-    if os.path.isdir(_ext_libs_path):
+    _ext_path = _ext_libs_path / f"{_ext_name}.dll"
+    if _ext_libs_path.is_dir():
         # On Windows, with Python >= 3.8, DLLs are no longer imported from the PATH.
-        os.add_dll_directory(os.path.abspath(_ext_libs_path))
+        os.add_dll_directory(str(_ext_libs_path.absolute()))
     else:
         sys.exit(f"Error: extension library path {_ext_libs_path} not found")
 elif sys.platform == "darwin":
-    _ext_path = os.path.join(_ext_libs_path, f"lib{_ext_name}.dylib")
+    _ext_path = _ext_libs_path / f"lib{_ext_name}.dylib"
 elif sys.platform == "linux":
-    _ext_path = os.path.join(_ext_libs_path, f"lib{_ext_name}.so")
+    _ext_path = _ext_libs_path / f"lib{_ext_name}.so"
 else:
-    sys.exit(f"Error: extention does not support platform {sys.platform}")
+    sys.exit(f"Error: extension does not support platform {sys.platform}")

 # patching openvino
 old_core_init = openvino.runtime.Core.__init__


-def new_core_init(self, *k, **kw):
-    old_core_init(self, *k, **kw)
-    self.add_extension(_ext_path)
+@functools.wraps(old_core_init)
+def new_core_init(self, *args, **kwargs):
+    old_core_init(self, *args, **kwargs)
+    self.add_extension(str(_ext_path))  # Core.add_extension doesn't support Path object


 openvino.runtime.Core.__init__ = new_core_init
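The `__init__.py` change above replaces the explicit `init_extension()` call with a wrap of `Core.__init__`, so the extension is registered the moment any `Core` is constructed. A self-contained sketch of that wrap-and-patch pattern, using a stand-in class rather than the real `openvino.runtime.Core` and a placeholder extension path:

```python
import functools


class Core:
    """Stand-in for openvino.runtime.Core, for illustration only."""

    def __init__(self) -> None:
        self.extensions = []

    def add_extension(self, path: str) -> None:
        self.extensions.append(path)


_old_init = Core.__init__


@functools.wraps(_old_init)  # preserve the wrapped __init__'s metadata for introspection
def _new_init(self, *args, **kwargs):
    _old_init(self, *args, **kwargs)
    self.add_extension("/path/to/libuser_ov_extensions.so")  # placeholder path


Core.__init__ = _new_init

core = Core()
assert core.extensions == ["/path/to/libuser_ov_extensions.so"]  # added automatically
```

Note that the patched version passes `str(_ext_path)` because the extension path is now a `pathlib.Path`, while `Core.add_extension` expects a string.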
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py
index 1d107a1ce..e693da913 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/convert_tokenizer.py
@@ -18,7 +18,7 @@
 def convert_tokenizer(
     tokenizer_object: Any,
     number_of_inputs: int = 1,
-    with_decoder: bool = False,
+    with_detokenizer: bool = False,
     streaming_decoder: bool = False,
     tokenizer_output_type: Type = Type.i64,
     detokenizer_input_type: Type = Type.i64,
@@ -46,21 +46,21 @@
         ov_tokenizers = convert_sentencepiece_model_tokenizer(
             tokenizer_object,
             add_attention_mask=True,
-            with_decoder=with_decoder,
+            with_detokenizer=with_detokenizer,
             streaming_decoder=streaming_decoder,
         )
     elif is_tiktoken_model(tokenizer_object):
         logger.info("Convert tiktoken-based tokenizer")
         ov_tokenizers = convert_tiktoken_model_tokenizer(
             tokenizer_object,
-            with_decoder=with_decoder,
+            with_detokenizer=with_detokenizer,
         )
     elif isinstance(tokenizer_object, PreTrainedTokenizerFast):
         logger.info("Convert Huggingface Fast tokenizer pipeline.")
         ov_tokenizers = convert_fast_tokenizer(
             tokenizer_object,
             number_of_inputs=number_of_inputs,
-            with_decoder=with_decoder,
+            with_detokenizer=with_detokenizer,
         )

     if ov_tokenizers is None:
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py
index b6419849f..ba3f847c8 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/hf_parser.py
@@ -275,7 +275,7 @@ def decoding(self) -> None:
 def convert_fast_tokenizer(
     hf_tokenizer: "PreTrainedTokenizerBase",
     number_of_inputs: int = 1,
-    with_decoder: bool = False,
+    with_detokenizer: bool = False,
 ) -> Union[Model, Tuple[Model, Model]]:
     pipeline = TransformersTokenizerPipelineParser(hf_tokenizer).parse(number_of_inputs=number_of_inputs)
     ov_tokenizer = pipeline.get_encoder_ov_subgraph()
@@ -300,7 +300,7 @@ def convert_fast_tokenizer(
             filtered_outputs.append(ov_tokenizer.output(i))

     tokenizer_model = Model(filtered_outputs, ov_tokenizer.get_parameters(), TOKENIZER_ENCODER_NAME)
-    if with_decoder:
+    if with_detokenizer:
         return tokenizer_model, pipeline.get_decoder_ov_subgraph()

     return tokenizer_model
@@ -329,7 +329,7 @@ def add_tokens_to_sentencepiece_model(sp_model_path: Path, hf_tokenizer: "PreTra
 def convert_sentencepiece_model_tokenizer(
     hf_tokenizer: "PreTrainedTokenizerBase",
     add_attention_mask: bool = True,
-    with_decoder: bool = False,
+    with_detokenizer: bool = False,
     streaming_decoder: bool = False,
 ) -> Union[Model, Tuple[Model, Model]]:
     if not is_sentencepiece_model(hf_tokenizer):
@@ -423,7 +423,7 @@ def convert_sentencepiece_model_tokenizer(
     tokenizer_encoder = Model(outputs, [input_node], TOKENIZER_ENCODER_NAME)
     tokenizer_encoder.validate_nodes_and_infer_types()

-    if not with_decoder:
+    if not with_detokenizer:
         return tokenizer_encoder

     return tokenizer_encoder, get_sp_decoder(sp_model_node, streaming_decoder=streaming_decoder)
@@ -460,7 +460,7 @@ def is_tiktoken_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool:

 def convert_tiktoken_model_tokenizer(
     hf_tokenizer: "PreTrainedTokenizerBase",
-    with_decoder: bool = False,
+    with_detokenizer: bool = False,
 ) -> Union[Model, Tuple[Model, Model]]:
     encoding = getattr(hf_tokenizer, "tokenizer", None) or hf_tokenizer.encoder
     split_pattern = encoding._pat_str
@@ -480,7 +480,7 @@ def convert_tiktoken_model_tokenizer(
             CharsToBytesStep(),
         ]
     )
-    if not with_decoder:
+    if not with_detokenizer:
         return pipeline.get_encoder_ov_subgraph()

     return pipeline.get_encoder_ov_subgraph(), pipeline.get_decoder_ov_subgraph()
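With the rename applied consistently across `convert_tokenizer` and all three backend converters, the return shape is unchanged: a single model without a detokenizer, a tuple with one. A short usage sketch of the two call shapes (the checkpoint name is illustrative):

```python
from transformers import AutoTokenizer
from ov_tokenizer import convert_tokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any Hugging Face checkpoint

# tokenizer only: a single openvino Model is returned
ov_tokenizer = convert_tokenizer(hf_tokenizer)

# tokenizer + detokenizer: a (Model, Model) tuple is returned
ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True)
```
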
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/node_factory.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/node_factory.py
deleted file mode 100644
index e2b6ed63c..000000000
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/ov_tokenizer/node_factory.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright (C) 2018-2023 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-from pathlib import Path
-from typing import Union
-
-from openvino.runtime.utils.node_factory import NodeFactory
-
-
-factory = NodeFactory()
-
-
-def init_extension(extension_path: Union[str, Path]) -> None:
-    """
-    Initialize factory with compiled tokenizer extension.
-
-    :param extension_path: path to prebuilt C++ tokenizer library.
-    """
-    factory.add_extension(extension_path)
-
-
-if _extension_path := os.environ.get("OV_TOKENIZER_PREBUILD_EXTENSION_PATH"):
-    init_extension(_extension_path)
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py
index 35b42e1a2..c7217f694 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/python/tests/tokenizers_test.py
@@ -124,7 +124,7 @@ def get_tokenizer(request, fast_tokenizer=True, trust_remote_code=False):
     hf_tokenizer = AutoTokenizer.from_pretrained(
         request.param, use_fast=fast_tokenizer, trust_remote_code=trust_remote_code
     )
-    ov_tokenizer = convert_tokenizer(hf_tokenizer, with_decoder=False)
+    ov_tokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=False)
     compiled_tokenizer = core.compile_model(ov_tokenizer)
     return hf_tokenizer, compiled_tokenizer

@@ -133,7 +133,7 @@ def get_tokenizer_detokenizer(request, fast_tokenizer=True, trust_remote_code=Fa
     hf_tokenizer = AutoTokenizer.from_pretrained(
         request.param, use_fast=fast_tokenizer, trust_remote_code=trust_remote_code
     )
-    ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_decoder=True)
+    ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True)
     compiled_tokenizer = core.compile_model(ov_tokenizer)
     compiled_detokenizer = core.compile_model(ov_detokenizer)
     return hf_tokenizer, compiled_tokenizer, compiled_detokenizer

@@ -326,7 +326,7 @@ def test_tiktoken_detokenizer(tiktoken_tokenizers, test_string):

 def test_streaming_detokenizer():
     hf_tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_3b_v2")
-    _, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_decoder=True, streaming_decoder=True)
+    _, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True, streaming_decoder=True)
     ov_detokenizer = core.compile_model(ov_detokenizer)

     test_string = "this is a test string"
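With `node_factory.py` deleted, the `OV_TOKENIZER_PREBUILD_EXTENSION_PATH` environment variable is now read in `ov_tokenizer/__init__.py` itself, so a custom-built extension has to be configured before the package is imported. A minimal sketch of that flow, with a placeholder library path:

```python
import os

# must be set before importing ov_tokenizer: the path is read at import time
os.environ["OV_TOKENIZER_PREBUILD_EXTENSION_PATH"] = "/path/to/libuser_ov_extensions.so"

import openvino
import ov_tokenizer  # noqa: F401  # importing patches openvino.runtime.Core.__init__

core = openvino.runtime.Core()  # the tokenizer extension is attached automatically
```
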