From 8e230b7cd3b33389a30a03782e4245d6e2b05a98 Mon Sep 17 00:00:00 2001 From: Mikhail Ryzhov Date: Thu, 23 Nov 2023 15:26:57 +0100 Subject: [PATCH] changed tests --- .../user_ie_extensions/CMakeLists.txt | 21 +++++++++---------- .../src/tokenizer/CMakeLists.txt | 20 ++++++++++++------ .../tokenizer/python/ov_tokenizer/__init__.py | 11 ++++++---- .../python/ov_tokenizer/hf_parser.py | 18 ++++++++-------- .../src/tokenizer/python/tests/conftest.py | 18 ++++++++-------- .../src/tokenizer/python/tests/test.py | 2 +- 6 files changed, 50 insertions(+), 40 deletions(-) diff --git a/modules/custom_operations/user_ie_extensions/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/CMakeLists.txt index ace716d92..05a2d2469 100644 --- a/modules/custom_operations/user_ie_extensions/CMakeLists.txt +++ b/modules/custom_operations/user_ie_extensions/CMakeLists.txt @@ -2,11 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 # cmake_policy(SET CMP0057 NEW) - -if(POLICY CMP0079) - cmake_policy(SET CMP0079 NEW) -endif() - +cmake_policy(SET CMP0079 NEW) if(POLICY CMP0057) cmake_policy(SET CMP0057 NEW) @@ -92,9 +88,9 @@ endforeach() add_library(${TARGET_NAME} SHARED ${SRC}) set_target_properties(${TARGET_NAME} PROPERTIES - RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin # .exe and .dll - ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib # .lib and .a - LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib # .so and .dylib + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} # .exe and .dll + ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/lib # .lib and .a + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} # .so and .dylib ) if(OpenCV_FOUND) @@ -108,18 +104,21 @@ endif() # Left sentence_piece for backward compatibility if("tokenizer" IN_LIST CUSTOM_OPERATIONS) add_subdirectory(${SOURCES}/tokenizer) + if(extra_dlls) + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${extra_dlls} $) + install(FILES 
${extra_dlls} DESTINATION .) + endif() endif() target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime) -#target_include_directories(${TARGET_NAME} PRIVATE $) target_compile_definitions(${TARGET_NAME} PRIVATE ${CUSTOM_OPERATIONS}) target_include_directories(${TARGET_NAME} PUBLIC ./include/) if(DEFINED SKBUILD) # Installing the extension module to the root of the package - install(TARGETS ${TARGET_NAME} LIBRARY DESTINATION . ) - + install(TARGETS ${TARGET_NAME} RUNTIME DESTINATION .) if(APPLE) set_target_properties( ${TARGET_NAME} PROPERTIES INSTALL_RPATH "@loader_path") diff --git a/modules/custom_operations/user_ie_extensions/src/tokenizer/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/src/tokenizer/CMakeLists.txt index c3182c2d2..4937702a0 100644 --- a/modules/custom_operations/user_ie_extensions/src/tokenizer/CMakeLists.txt +++ b/modules/custom_operations/user_ie_extensions/src/tokenizer/CMakeLists.txt @@ -1,7 +1,6 @@ # Copyright (C) 2018-2023 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # - if(POLICY CMP0135) cmake_policy(SET CMP0135 NEW) endif() @@ -109,6 +108,8 @@ else() FetchContent_MakeAvailable(fast_tokenizer) include("${fast_tokenizer_SOURCE_DIR}/FastTokenizer.cmake") + set(fast_tokenizer_SOURCE_DIR "${fast_tokenizer_SOURCE_DIR}" PARENT_SCOPE) + if(WIN32 AND X86_64) # we use re2 library in regex_normalization operation, so have to add to this list # because prebuilt fast_tokenizers package does not provide this library @@ -147,9 +148,16 @@ target_link_libraries(${TARGET_NAME} PRIVATE ${FAST_TOKENIZER_LIBS} ${RE2_LIBS} string(REPLACE " " ";" cxx_flags "${cxx_flags}") set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17 COMPILE_OPTIONS "${cxx_flags}") -if(WIN32) - file(GLOB DEPS_LIBS ${fast_tokenizer_SOURCE_DIR}/lib/*.dll ${fast_tokenizer_SOURCE_DIR}/third_party/lib/*.dll) - file(COPY ${DEPS_LIBS} DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${CMAKE_BUILD_TYPE}) -endif() +# +# Post build steps to copy 
core_tokenizers dependencies +# -install(FILES ${DEPS_LIBS} DESTINATION .) \ No newline at end of file +if(WIN32 AND X86_64) + if(BUILD_FAST_TOKENIZERS) + # TODO + else() + set(extra_dlls "${fast_tokenizer_SOURCE_DIR}/lib/core_tokenizers.dll" + "${fast_tokenizer_SOURCE_DIR}/third_party/lib/icudt70.dll" + "${fast_tokenizer_SOURCE_DIR}/third_party/lib/icuuc70.dll" PARENT_SCOPE) + endif() +endif() \ No newline at end of file diff --git a/modules/custom_operations/user_ie_extensions/src/tokenizer/python/ov_tokenizer/__init__.py b/modules/custom_operations/user_ie_extensions/src/tokenizer/python/ov_tokenizer/__init__.py index 623965673..2b237f8eb 100644 --- a/modules/custom_operations/user_ie_extensions/src/tokenizer/python/ov_tokenizer/__init__.py +++ b/modules/custom_operations/user_ie_extensions/src/tokenizer/python/ov_tokenizer/__init__.py @@ -11,19 +11,22 @@ from .str_pack import pack_strings, unpack_strings from .utils import add_greedy_decoding, connect_models +_ext_name = "user_ov_extensions" _ext_libs_path = os.path.join(os.path.dirname(__file__), "libs") -_ext_path = os.path.join(_ext_libs_path, "libuser_ov_extensions.so") if sys.platform == "win32": - _ext_libs_path = os.path.join(_ext_libs_path, "bin") - _ext_path = os.path.join(_ext_libs_path, "user_ov_extensions.dll") + _ext_path = os.path.join(_ext_libs_path, f'{_ext_name}.dll') if os.path.isdir(_ext_libs_path): # On Windows, with Python >= 3.8, DLLs are no longer imported from the PATH. 
os.add_dll_directory(os.path.abspath(_ext_path)) else: sys.exit(f'Error: extention libriary path {_ext_libs_path} not found') elif sys.platform == "darwin": - _ext_path = os.path.join(_ext_libs_path, "libuser_ov_extensions.dylib") + _ext_path = os.path.join(_ext_libs_path, f'lib{_ext_name}.dylib') +elif sys.platform == "linux": + _ext_path = os.path.join(_ext_libs_path, f'lib{_ext_name}.so') +else: + sys.exit(f'Error: extention does not support platform {sys.platform}') old_core_init = openvino.runtime.Core.__init__ def new_core_init(self, *k, **kw): diff --git a/modules/custom_operations/user_ie_extensions/src/tokenizer/python/ov_tokenizer/hf_parser.py b/modules/custom_operations/user_ie_extensions/src/tokenizer/python/ov_tokenizer/hf_parser.py index 401c8ea2b..27f8bc261 100644 --- a/modules/custom_operations/user_ie_extensions/src/tokenizer/python/ov_tokenizer/hf_parser.py +++ b/modules/custom_operations/user_ie_extensions/src/tokenizer/python/ov_tokenizer/hf_parser.py @@ -25,7 +25,7 @@ TOKENIZER_DECODER_NAME, TOKENIZER_ENCODER_NAME, ) -from .node_factory import factory +from . 
import _factory from .tokenizer_pipeline import ( BPETokenizationStep, BytesToCharsStep, @@ -116,7 +116,7 @@ def __init__(self, tokenizer_object: Any, number_of_inputs: int = 1) -> None: self.original_tokenizer = tokenizer_object with TemporaryDirectory() as tmpdir: tokenizer_object.save_pretrained(tmpdir) - with open(Path(tmpdir) / "tokenizer.json") as tj: + with open(Path(tmpdir) / "tokenizer.json", encoding="utf8") as tj: self.tokenizer_json = json.load(tj) self.pipeline = TokenizerPipeline() self.number_of_inputs = number_of_inputs @@ -313,7 +313,7 @@ def is_sentencepiece_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool: def add_tokens_to_sentencepiece_model(sp_model_path: Path, hf_tokenizer: "PreTrainedTokenizerBase") -> None: model_pb = import_protobuf() model = model_pb.ModelProto() - with open(sp_model_path, "rb") as model_file: + with open(sp_model_path, "rb") as model_file: model.ParseFromString(model_file.read()) add_token_dict = hf_tokenizer.tokenizer.index_special_tokens @@ -322,7 +322,7 @@ def add_tokens_to_sentencepiece_model(sp_model_path: Path, hf_tokenizer: "PreTra new_piece.piece = token model.pieces.append(new_piece) - with open(sp_model_path, "wb") as model_file: + with open(sp_model_path, "wb") as model_file: model_file.write(model.SerializeToString()) @@ -365,7 +365,7 @@ def convert_sentencepiece_model_tokenizer( ) add_bos_token = getattr(hf_tokenizer, "add_bos_token", add_eos_token) or False - tokenizer_node = factory.create( + tokenizer_node = _factory.create( "SentencepieceTokenizer", [sp_model_node, input_node], { @@ -383,7 +383,7 @@ def convert_sentencepiece_model_tokenizer( default_value = make_constant_node(hf_tokenizer.pad_token_id or 0, values.element_type) broadcast = opset.broadcast(default_value, dense_shape) - scatternd_input_ids = factory.create( + scatternd_input_ids = _factory.create( "ScatterNDUpdate", [broadcast, indices, values], # FIXME: pad left side instead of right ) @@ -399,7 
+399,7 @@ def convert_sentencepiece_model_tokenizer( outputs = scatternd_input_ids.outputs() if add_attention_mask: - attention_mask = factory.create( + attention_mask = _factory.create( "ScatterNDUpdate", [ broadcast, @@ -432,7 +432,7 @@ def convert_sentencepiece_model_tokenizer( def get_sp_decoder(sp_model_node: Node, streaming_decoder: bool = False) -> Model: token_ids = op.Parameter(Type.i32, PartialShape(["?", "?"])) # (batch, sequence) - decoder = factory.create( + decoder = _factory.create( "SentencepieceStreamDetokenizer" if streaming_decoder else "SentencepieceDetokenizer", [sp_model_node, token_ids], ).outputs() @@ -440,7 +440,7 @@ def get_sp_decoder(sp_model_node: Node, streaming_decoder: bool = False) -> Mode if streaming_decoder: decoder = RegexDecodingStep.replace_sp_spaces().get_ov_subgraph(decoder) - string_output = factory.create("StringTensorPack", decoder).outputs() + string_output = _factory.create("StringTensorPack", decoder).outputs() string_output[0].tensor.add_names({STRING_OUTPUT_NAME}) tokenizer_decoder = Model(string_output, [token_ids], TOKENIZER_DECODER_NAME) tokenizer_decoder.validate_nodes_and_infer_types() diff --git a/modules/custom_operations/user_ie_extensions/src/tokenizer/python/tests/conftest.py b/modules/custom_operations/user_ie_extensions/src/tokenizer/python/tests/conftest.py index 054388410..074efa64d 100644 --- a/modules/custom_operations/user_ie_extensions/src/tokenizer/python/tests/conftest.py +++ b/modules/custom_operations/user_ie_extensions/src/tokenizer/python/tests/conftest.py @@ -6,17 +6,17 @@ import pytest -def prebuild_extenson_path(): - ext_path = os.getenv("CUSTOM_OP_LIB") or os.getenv("OV_TOKENIZER_PREBUILD_EXTENSION_PATH") - if not ext_path: - raise EnvironmentError( - "No extension path found in the environment. " - "Export path to libuser_ov_extensions.so to CUSTOM_OP_LIB or OV_TOKENIZER_PREBUILD_EXTENSION_PATH variable." 
- ) - return ext_path +# def prebuild_extenson_path(): +# ext_path = os.getenv("CUSTOM_OP_LIB") or os.getenv("OV_TOKENIZER_PREBUILD_EXTENSION_PATH") +# if not ext_path: +# raise EnvironmentError( +# "No extension path found in the environment. " +# "Export path to libuser_ov_extensions.so to CUSTOM_OP_LIB or OV_TOKENIZER_PREBUILD_EXTENSION_PATH variable." +# ) +# return ext_path -os.environ["OV_TOKENIZER_PREBUILD_EXTENSION_PATH"] = prebuild_extenson_path() +# os.environ["OV_TOKENIZER_PREBUILD_EXTENSION_PATH"] = prebuild_extenson_path() PASS_RATES_FILE = Path(__file__).parent / "pass_rates.json" diff --git a/modules/custom_operations/user_ie_extensions/src/tokenizer/python/tests/test.py b/modules/custom_operations/user_ie_extensions/src/tokenizer/python/tests/test.py index 1a9d5b71a..34d0fb3af 100644 --- a/modules/custom_operations/user_ie_extensions/src/tokenizer/python/tests/test.py +++ b/modules/custom_operations/user_ie_extensions/src/tokenizer/python/tests/test.py @@ -1,5 +1,5 @@ import os -import openvino +# import openvino import ov_tokenizer from transformers import AutoTokenizer, AutoModelForSequenceClassification