changed tests
mryzhov committed Nov 23, 2023
1 parent 2159b91 commit 8e230b7
Showing 6 changed files with 50 additions and 40 deletions.
21 changes: 10 additions & 11 deletions modules/custom_operations/user_ie_extensions/CMakeLists.txt
@@ -2,11 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
#
cmake_policy(SET CMP0057 NEW)

if(POLICY CMP0079)
cmake_policy(SET CMP0079 NEW)
endif()

cmake_policy(SET CMP0079 NEW)

if(POLICY CMP0057)
cmake_policy(SET CMP0057 NEW)
@@ -92,9 +88,9 @@ endforeach()
add_library(${TARGET_NAME} SHARED ${SRC})

set_target_properties(${TARGET_NAME} PROPERTIES
RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin # .exe and .dll
ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib # .lib and .a
LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib # .so and .dylib
RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} # .exe and .dll
ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/lib # .lib and .a
LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} # .so and .dylib
)

if(OpenCV_FOUND)
@@ -108,18 +104,21 @@ endif()
# Left sentence_piece for backward compatibility
if("tokenizer" IN_LIST CUSTOM_OPERATIONS)
add_subdirectory(${SOURCES}/tokenizer)
if(extra_dlls)
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${extra_dlls} $<TARGET_FILE_DIR:${TARGET_NAME}>)
install(FILES ${extra_dlls} DESTINATION .)
endif()
endif()

target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime)
#target_include_directories(${TARGET_NAME} PRIVATE $<TARGET_PROPERTY:openvino::runtime,INTERFACE_INCLUDE_DIRECTORIES>)

target_compile_definitions(${TARGET_NAME} PRIVATE ${CUSTOM_OPERATIONS})
target_include_directories(${TARGET_NAME} PUBLIC ./include/)

if(DEFINED SKBUILD)
# Installing the extension module to the root of the package
install(TARGETS ${TARGET_NAME} LIBRARY DESTINATION . )

install(TARGETS ${TARGET_NAME} RUNTIME DESTINATION .)
if(APPLE)
set_target_properties(
${TARGET_NAME} PROPERTIES INSTALL_RPATH "@loader_path")
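When built through scikit-build (which defines SKBUILD), the install rule above drops the extension library into the package root. A minimal scikit-build setup.py that would produce that layout might look like the sketch below; the distribution name, version, and paths are illustrative assumptions, not taken from this repository.

```python
# Hypothetical scikit-build configuration (illustrative only; names and paths
# are assumptions).  scikit-build defines SKBUILD while it drives CMake, so the
# install(TARGETS ... DESTINATION .) rule above places the built extension
# inside cmake_install_dir of the resulting wheel.
from skbuild import setup  # pip install scikit-build

setup(
    name="ov-tokenizer",                           # assumed distribution name
    version="0.0.1",
    packages=["ov_tokenizer"],
    package_dir={"": "python"},                    # assumed layout: python/ov_tokenizer/...
    cmake_source_dir="user_ie_extensions",         # directory holding the CMakeLists.txt above
    cmake_install_dir="python/ov_tokenizer/libs",  # matches the "libs" dir the package loads from
)
```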
@@ -1,7 +1,6 @@
# Copyright (C) 2018-2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

if(POLICY CMP0135)
cmake_policy(SET CMP0135 NEW)
endif()
@@ -109,6 +108,8 @@ else()
FetchContent_MakeAvailable(fast_tokenizer)
include("${fast_tokenizer_SOURCE_DIR}/FastTokenizer.cmake")

set(fast_tokenizer_SOURCE_DIR "${fast_tokenizer_SOURCE_DIR}" PARENT_SCOPE)

if(WIN32 AND X86_64)
# we use re2 library in regex_normalization operation, so have to add to this list
# because prebuilt fast_tokenizers package does not provide this library
@@ -147,9 +148,16 @@ target_link_libraries(${TARGET_NAME} PRIVATE ${FAST_TOKENIZER_LIBS} ${RE2_LIBS}
string(REPLACE " " ";" cxx_flags "${cxx_flags}")
set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17
COMPILE_OPTIONS "${cxx_flags}")
if(WIN32)
file(GLOB DEPS_LIBS ${fast_tokenizer_SOURCE_DIR}/lib/*.dll ${fast_tokenizer_SOURCE_DIR}/third_party/lib/*.dll)
file(COPY ${DEPS_LIBS} DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${CMAKE_BUILD_TYPE})
endif()
#
# Post build steps to copy core_tokenizers dependencies
#

install(FILES ${DEPS_LIBS} DESTINATION .)
if(WIN32 AND X86_64)
if(BUILD_FAST_TOKENIZERS)
# TODO
else()
set(extra_dlls "${fast_tokenizer_SOURCE_DIR}/lib/core_tokenizers.dll"
"${fast_tokenizer_SOURCE_DIR}/third_party/lib/icudt70.dll"
"${fast_tokenizer_SOURCE_DIR}/third_party/lib/icuuc70.dll" PARENT_SCOPE)
endif()
endif()
@@ -11,19 +11,22 @@
from .str_pack import pack_strings, unpack_strings
from .utils import add_greedy_decoding, connect_models

_ext_name = "user_ov_extensions"
_ext_libs_path = os.path.join(os.path.dirname(__file__), "libs")
_ext_path = os.path.join(_ext_libs_path, "libuser_ov_extensions.so")

if sys.platform == "win32":
_ext_libs_path = os.path.join(_ext_libs_path, "bin")
_ext_path = os.path.join(_ext_libs_path, "user_ov_extensions.dll")
_ext_path = os.path.join(_ext_libs_path, f'{_ext_name}.dll')
if os.path.isdir(_ext_libs_path):
# On Windows, with Python >= 3.8, DLLs are no longer imported from the PATH.
os.add_dll_directory(os.path.abspath(_ext_libs_path))
else:
sys.exit(f'Error: extension library path {_ext_libs_path} not found')
elif sys.platform == "darwin":
_ext_path = os.path.join(_ext_libs_path, "libuser_ov_extensions.dylib")
_ext_path = os.path.join(_ext_libs_path, f'lib{_ext_name}.dylib')
elif sys.platform == "linux":
_ext_path = os.path.join(_ext_libs_path, f'lib{_ext_name}.so')
else:
sys.exit(f'Error: extension does not support platform {sys.platform}')
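The comment in the Windows branch above points at the underlying mechanism: starting with Python 3.8, dependent DLLs are no longer resolved through PATH, so the directory holding them must be registered explicitly. A minimal self-contained sketch of that pattern (the "libs" directory name is illustrative):

```python
# Minimal sketch of the Windows DLL-search registration used above.
import os
import sys

if sys.platform == "win32" and hasattr(os, "add_dll_directory"):  # Python >= 3.8
    dll_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "libs"))
    if os.path.isdir(dll_dir):
        # Register the directory, not the DLL itself, so that dependencies of
        # user_ov_extensions.dll (core_tokenizers.dll, icuuc70.dll, ...) resolve.
        os.add_dll_directory(dll_dir)
```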

old_core_init = openvino.runtime.Core.__init__
def new_core_init(self, *k, **kw):
@@ -25,7 +25,7 @@
TOKENIZER_DECODER_NAME,
TOKENIZER_ENCODER_NAME,
)
from .node_factory import factory
from . import _factory
from .tokenizer_pipeline import (
BPETokenizationStep,
BytesToCharsStep,
@@ -116,7 +116,7 @@ def __init__(self, tokenizer_object: Any, number_of_inputs: int = 1) -> None:
self.original_tokenizer = tokenizer_object
with TemporaryDirectory() as tmpdir:
tokenizer_object.save_pretrained(tmpdir)
with open(Path(tmpdir) / "tokenizer.json") as tj:
with open(Path(tmpdir) / "tokenizer.json", encoding="utf8") as tj:
self.tokenizer_json = json.load(tj)
self.pipeline = TokenizerPipeline()
self.number_of_inputs = number_of_inputs
@@ -313,7 +313,7 @@ def is_sentencepiece_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool:
def add_tokens_to_sentencepiece_model(sp_model_path: Path, hf_tokenizer: "PreTrainedTokenizerBase") -> None:
model_pb = import_protobuf()
model = model_pb.ModelProto()
with open(sp_model_path, "rb") as model_file:
with open(sp_model_path, "rb", encoding="utf8") as model_file:
model.ParseFromString(model_file.read())

add_token_dict = hf_tokenizer.tokenizer.index_special_tokens
@@ -322,7 +322,7 @@ def add_tokens_to_sentencepiece_model(sp_model_path: Path, hf_tokenizer: "PreTra
new_piece.piece = token
model.pieces.append(new_piece)

with open(sp_model_path, "wb") as model_file:
with open(sp_model_path, "wb", encoding="utf8") as model_file:
model_file.write(model.SerializeToString())
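For context on `add_tokens_to_sentencepiece_model` above: it round-trips the binary sentencepiece model through its protobuf definition and appends the tokenizer's extra tokens as new pieces. A standalone sketch of the same round-trip, assuming `import_protobuf()` resolves to the `sentencepiece_model_pb2` bindings shipped with the sentencepiece package:

```python
# Illustrative sketch, not the repository's implementation.  Assumes
# `pip install sentencepiece` provides the sentencepiece_model_pb2 bindings.
from sentencepiece import sentencepiece_model_pb2 as sp_pb2

def append_user_defined_pieces(model_path, new_tokens):
    model = sp_pb2.ModelProto()
    with open(model_path, "rb") as f:  # protobuf is binary; no text encoding applies
        model.ParseFromString(f.read())

    for token in new_tokens:
        piece = sp_pb2.ModelProto.SentencePiece()
        piece.piece = token
        piece.type = sp_pb2.ModelProto.SentencePiece.USER_DEFINED
        model.pieces.append(piece)

    with open(model_path, "wb") as f:
        f.write(model.SerializeToString())
```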


@@ -365,7 +365,7 @@ def convert_sentencepiece_model_tokenizer(
)
add_bos_token = getattr(hf_tokenizer, "add_bos_token", add_eos_token) or False

tokenizer_node = factory.create(
tokenizer_node = _factory.create(
"SentencepieceTokenizer",
[sp_model_node, input_node],
{
@@ -383,7 +383,7 @@

default_value = make_constant_node(hf_tokenizer.pad_token_id or 0, values.element_type)
broadcast = opset.broadcast(default_value, dense_shape)
scatternd_input_ids = factory.create(
scatternd_input_ids = _factory.create(
"ScatterNDUpdate",
[broadcast, indices, values], # FIXME: pad left side instead of right
)
@@ -399,7 +399,7 @@
outputs = scatternd_input_ids.outputs()

if add_attention_mask:
attention_mask = factory.create(
attention_mask = _factory.create(
"ScatterNDUpdate",
[
broadcast,
@@ -432,15 +432,15 @@
def get_sp_decoder(sp_model_node: Node, streaming_decoder: bool = False) -> Model:
token_ids = op.Parameter(Type.i32, PartialShape(["?", "?"])) # (batch, sequence)

decoder = factory.create(
decoder = _factory.create(
"SentencepieceStreamDetokenizer" if streaming_decoder else "SentencepieceDetokenizer",
[sp_model_node, token_ids],
).outputs()

if streaming_decoder:
decoder = RegexDecodingStep.replace_sp_spaces().get_ov_subgraph(decoder)

string_output = factory.create("StringTensorPack", decoder).outputs()
string_output = _factory.create("StringTensorPack", decoder).outputs()
string_output[0].tensor.add_names({STRING_OUTPUT_NAME})
tokenizer_decoder = Model(string_output, [token_ids], TOKENIZER_DECODER_NAME)
tokenizer_decoder.validate_nodes_and_infer_types()
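The `factory` to `_factory` rename above suggests the shared node-factory instance is now module-private. For orientation, a minimal sketch of creating a node through OpenVINO's `NodeFactory`, the same `create()` call shape used throughout this file (standard opset only; this is not the repository's `node_factory` module):

```python
# Illustrative sketch (assumed setup, not the repository's node_factory module).
from openvino.runtime import PartialShape, Type, op
from openvino.runtime.utils.node_factory import NodeFactory

_factory = NodeFactory()  # defaults to the latest standard opset

param = op.Parameter(Type.f32, PartialShape([1, 3]))
relu = _factory.create("Relu", [param.output(0)])  # op name, inputs, optional attributes dict

# The custom ops used in this file (e.g. SentencepieceTokenizer, StringTensorPack)
# additionally require the user_ov_extensions library to be loaded; a plain
# NodeFactory only knows the standard opsets.
```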
@@ -6,17 +6,17 @@
import pytest


def prebuild_extenson_path():
ext_path = os.getenv("CUSTOM_OP_LIB") or os.getenv("OV_TOKENIZER_PREBUILD_EXTENSION_PATH")
if not ext_path:
raise EnvironmentError(
"No extension path found in the environment. "
"Export path to libuser_ov_extensions.so to CUSTOM_OP_LIB or OV_TOKENIZER_PREBUILD_EXTENSION_PATH variable."
)
return ext_path
# def prebuild_extenson_path():
# ext_path = os.getenv("CUSTOM_OP_LIB") or os.getenv("OV_TOKENIZER_PREBUILD_EXTENSION_PATH")
# if not ext_path:
# raise EnvironmentError(
# "No extension path found in the environment. "
# "Export path to libuser_ov_extensions.so to CUSTOM_OP_LIB or OV_TOKENIZER_PREBUILD_EXTENSION_PATH variable."
# )
# return ext_path


os.environ["OV_TOKENIZER_PREBUILD_EXTENSION_PATH"] = prebuild_extenson_path()
# os.environ["OV_TOKENIZER_PREBUILD_EXTENSION_PATH"] = prebuild_extenson_path()
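The fixture code above, which hard-required CUSTOM_OP_LIB or OV_TOKENIZER_PREBUILD_EXTENSION_PATH to be exported before running the tests, is commented out by this commit. A hypothetical way to keep the override optional instead (not what the commit does):

```python
# Hypothetical alternative (illustrative only): honour a prebuilt-extension path
# when the environment provides one, but do not fail when it is absent.
import os

_prebuilt_ext = os.getenv("CUSTOM_OP_LIB") or os.getenv("OV_TOKENIZER_PREBUILD_EXTENSION_PATH")
if _prebuilt_ext:
    os.environ["OV_TOKENIZER_PREBUILD_EXTENSION_PATH"] = _prebuilt_ext
```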
PASS_RATES_FILE = Path(__file__).parent / "pass_rates.json"


@@ -1,5 +1,5 @@
import os
import openvino
# import openvino
import ov_tokenizer

from transformers import AutoTokenizer, AutoModelForSequenceClassification