changed tests
mryzhov committed Nov 23, 2023
1 parent 2159b91 commit 8e230b7
Showing 6 changed files with 50 additions and 40 deletions.
21 changes: 10 additions & 11 deletions modules/custom_operations/user_ie_extensions/CMakeLists.txt
@@ -2,11 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
#
cmake_policy(SET CMP0057 NEW)

if(POLICY CMP0079)
cmake_policy(SET CMP0079 NEW)
endif()

cmake_policy(SET CMP0079 NEW)

if(POLICY CMP0057)
cmake_policy(SET CMP0057 NEW)
@@ -92,9 +88,9 @@ endforeach()
add_library(${TARGET_NAME} SHARED ${SRC})

set_target_properties(${TARGET_NAME} PROPERTIES
RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin # .exe and .dll
ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib # .lib and .a
LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib # .so and .dylib
RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} # .exe and .dll
ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/lib # .lib and .a
LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} # .so and .dylib
)

if(OpenCV_FOUND)
@@ -108,18 +104,21 @@ endif()
# Left sentence_piece for backward compatibility
if("tokenizer" IN_LIST CUSTOM_OPERATIONS)
add_subdirectory(${SOURCES}/tokenizer)
if(extra_dlls)
add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${extra_dlls} $<TARGET_FILE_DIR:${TARGET_NAME}>)
install(FILES ${extra_dlls} DESTINATION .)
endif()
endif()

target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime)
#target_include_directories(${TARGET_NAME} PRIVATE $<TARGET_PROPERTY:openvino::runtime,INTERFACE_INCLUDE_DIRECTORIES>)

target_compile_definitions(${TARGET_NAME} PRIVATE ${CUSTOM_OPERATIONS})
target_include_directories(${TARGET_NAME} PUBLIC ./include/)

if(DEFINED SKBUILD)
# Installing the extension module to the root of the package
install(TARGETS ${TARGET_NAME} LIBRARY DESTINATION . )

install(TARGETS ${TARGET_NAME} RUNTIME DESTINATION .)
if(APPLE)
set_target_properties(
${TARGET_NAME} PROPERTIES INSTALL_RPATH "@loader_path")
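When built through scikit-build (which defines SKBUILD), the install rule above drops the extension library into the package root. A minimal scikit-build setup.py that would produce that layout might look like the sketch below; the distribution name, version, and paths are illustrative assumptions, not taken from this repository.

```python
# Hypothetical scikit-build configuration (illustrative only; names and paths
# are assumptions).  scikit-build defines SKBUILD while it drives CMake, so the
# install(TARGETS ... DESTINATION .) rule above places the built extension
# inside cmake_install_dir of the resulting wheel.
from skbuild import setup  # pip install scikit-build

setup(
    name="ov-tokenizer",                           # assumed distribution name
    version="0.0.1",
    packages=["ov_tokenizer"],
    package_dir={"": "python"},                    # assumed layout: python/ov_tokenizer/...
    cmake_source_dir="user_ie_extensions",         # directory holding the CMakeLists.txt above
    cmake_install_dir="python/ov_tokenizer/libs",  # matches the "libs" dir the package loads from
)
```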
@@ -1,7 +1,6 @@
# Copyright (C) 2018-2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#

if(POLICY CMP0135)
cmake_policy(SET CMP0135 NEW)
endif()
@@ -109,6 +108,8 @@ else()
FetchContent_MakeAvailable(fast_tokenizer)
include("${fast_tokenizer_SOURCE_DIR}/FastTokenizer.cmake")

set(fast_tokenizer_SOURCE_DIR "${fast_tokenizer_SOURCE_DIR}" PARENT_SCOPE)

if(WIN32 AND X86_64)
# we use re2 library in regex_normalization operation, so have to add to this list
# because prebuilt fast_tokenizers package does not provide this library
@@ -147,9 +148,16 @@ target_link_libraries(${TARGET_NAME} PRIVATE ${FAST_TOKENIZER_LIBS} ${RE2_LIBS}
string(REPLACE " " ";" cxx_flags "${cxx_flags}")
set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 17
COMPILE_OPTIONS "${cxx_flags}")
if(WIN32)
file(GLOB DEPS_LIBS ${fast_tokenizer_SOURCE_DIR}/lib/*.dll ${fast_tokenizer_SOURCE_DIR}/third_party/lib/*.dll)
file(COPY ${DEPS_LIBS} DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${CMAKE_BUILD_TYPE})
endif()
#
# Post build steps to copy core_tokenizers dependencies
#

install(FILES ${DEPS_LIBS} DESTINATION .)
if(WIN32 AND X86_64)
if(BUILD_FAST_TOKENIZERS)
# TODO
else()
set(extra_dlls "${fast_tokenizer_SOURCE_DIR}/lib/core_tokenizers.dll"
"${fast_tokenizer_SOURCE_DIR}/third_party/lib/icudt70.dll"
"${fast_tokenizer_SOURCE_DIR}/third_party/lib/icuuc70.dll" PARENT_SCOPE)
endif()
endif()
@@ -11,19 +11,22 @@
from .str_pack import pack_strings, unpack_strings
from .utils import add_greedy_decoding, connect_models

_ext_name = "user_ov_extensions"
_ext_libs_path = os.path.join(os.path.dirname(__file__), "libs")
_ext_path = os.path.join(_ext_libs_path, "libuser_ov_extensions.so")

if sys.platform == "win32":
_ext_libs_path = os.path.join(_ext_libs_path, "bin")
_ext_path = os.path.join(_ext_libs_path, "user_ov_extensions.dll")
_ext_path = os.path.join(_ext_libs_path, f'{_ext_name}.dll')
if os.path.isdir(_ext_libs_path):
# On Windows, with Python >= 3.8, DLLs are no longer imported from the PATH.
os.add_dll_directory(os.path.abspath(_ext_libs_path))
else:
sys.exit(f'Error: extension library path {_ext_libs_path} not found')
elif sys.platform == "darwin":
_ext_path = os.path.join(_ext_libs_path, "libuser_ov_extensions.dylib")
_ext_path = os.path.join(_ext_libs_path, f'lib{_ext_name}.dylib')
elif sys.platform == "linux":
_ext_path = os.path.join(_ext_libs_path, f'lib{_ext_name}.so')
else:
sys.exit(f'Error: extension does not support platform {sys.platform}')
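The comment in the Windows branch above points at the underlying mechanism: starting with Python 3.8, dependent DLLs are no longer resolved through PATH, so the directory holding them must be registered explicitly. A minimal self-contained sketch of that pattern (the "libs" directory name is illustrative):

```python
# Minimal sketch of the Windows DLL-search registration used above.
import os
import sys

if sys.platform == "win32" and hasattr(os, "add_dll_directory"):  # Python >= 3.8
    dll_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "libs"))
    if os.path.isdir(dll_dir):
        # Register the directory, not the DLL itself, so that dependencies of
        # user_ov_extensions.dll (core_tokenizers.dll, icuuc70.dll, ...) resolve.
        os.add_dll_directory(dll_dir)
```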

old_core_init = openvino.runtime.Core.__init__
def new_core_init(self, *k, **kw):
@@ -25,7 +25,7 @@
TOKENIZER_DECODER_NAME,
TOKENIZER_ENCODER_NAME,
)
from .node_factory import factory
from . import _factory
from .tokenizer_pipeline import (
BPETokenizationStep,
BytesToCharsStep,
@@ -116,7 +116,7 @@ def __init__(self, tokenizer_object: Any, number_of_inputs: int = 1) -> None:
self.original_tokenizer = tokenizer_object
with TemporaryDirectory() as tmpdir:
tokenizer_object.save_pretrained(tmpdir)
with open(Path(tmpdir) / "tokenizer.json") as tj:
with open(Path(tmpdir) / "tokenizer.json", encoding="utf8") as tj:
self.tokenizer_json = json.load(tj)
self.pipeline = TokenizerPipeline()
self.number_of_inputs = number_of_inputs
@@ -313,7 +313,7 @@ def is_sentencepiece_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool:
def add_tokens_to_sentencepiece_model(sp_model_path: Path, hf_tokenizer: "PreTrainedTokenizerBase") -> None:
model_pb = import_protobuf()
model = model_pb.ModelProto()
with open(sp_model_path, "rb") as model_file:
with open(sp_model_path, "rb", encoding="utf8") as model_file:
model.ParseFromString(model_file.read())

add_token_dict = hf_tokenizer.tokenizer.index_special_tokens
@@ -322,7 +322,7 @@ def add_tokens_to_sentencepiece_model(sp_model_path: Path, hf_tokenizer: "PreTra
new_piece.piece = token
model.pieces.append(new_piece)

with open(sp_model_path, "wb") as model_file:
with open(sp_model_path, "wb", encoding="utf8") as model_file:
model_file.write(model.SerializeToString())
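For context on `add_tokens_to_sentencepiece_model` above: it round-trips the binary sentencepiece model through its protobuf definition and appends the tokenizer's extra tokens as new pieces. A standalone sketch of the same round-trip, assuming `import_protobuf()` resolves to the `sentencepiece_model_pb2` bindings shipped with the sentencepiece package:

```python
# Illustrative sketch, not the repository's implementation.  Assumes
# `pip install sentencepiece` provides the sentencepiece_model_pb2 bindings.
from sentencepiece import sentencepiece_model_pb2 as sp_pb2

def append_user_defined_pieces(model_path, new_tokens):
    model = sp_pb2.ModelProto()
    with open(model_path, "rb") as f:  # protobuf is binary; no text encoding applies
        model.ParseFromString(f.read())

    for token in new_tokens:
        piece = sp_pb2.ModelProto.SentencePiece()
        piece.piece = token
        piece.type = sp_pb2.ModelProto.SentencePiece.USER_DEFINED
        model.pieces.append(piece)

    with open(model_path, "wb") as f:
        f.write(model.SerializeToString())
```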


@@ -365,7 +365,7 @@ def convert_sentencepiece_model_tokenizer(
)
add_bos_token = getattr(hf_tokenizer, "add_bos_token", add_eos_token) or False

tokenizer_node = factory.create(
tokenizer_node = _factory.create(
"SentencepieceTokenizer",
[sp_model_node, input_node],
{
@@ -383,7 +383,7 @@

default_value = make_constant_node(hf_tokenizer.pad_token_id or 0, values.element_type)
broadcast = opset.broadcast(default_value, dense_shape)
scatternd_input_ids = factory.create(
scatternd_input_ids = _factory.create(
"ScatterNDUpdate",
[broadcast, indices, values], # FIXME: pad left side instead of right
)
@@ -399,7 +399,7 @@
outputs = scatternd_input_ids.outputs()

if add_attention_mask:
attention_mask = factory.create(
attention_mask = _factory.create(
"ScatterNDUpdate",
[
broadcast,
@@ -432,15 +432,15 @@
def get_sp_decoder(sp_model_node: Node, streaming_decoder: bool = False) -> Model:
token_ids = op.Parameter(Type.i32, PartialShape(["?", "?"])) # (batch, sequence)

decoder = factory.create(
decoder = _factory.create(
"SentencepieceStreamDetokenizer" if streaming_decoder else "SentencepieceDetokenizer",
[sp_model_node, token_ids],
).outputs()

if streaming_decoder:
decoder = RegexDecodingStep.replace_sp_spaces().get_ov_subgraph(decoder)

string_output = factory.create("StringTensorPack", decoder).outputs()
string_output = _factory.create("StringTensorPack", decoder).outputs()
string_output[0].tensor.add_names({STRING_OUTPUT_NAME})
tokenizer_decoder = Model(string_output, [token_ids], TOKENIZER_DECODER_NAME)
tokenizer_decoder.validate_nodes_and_infer_types()
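The `factory` to `_factory` rename above suggests the shared node-factory instance is now module-private. For orientation, a minimal sketch of creating a node through OpenVINO's `NodeFactory`, the same `create()` call shape used throughout this file (standard opset only; this is not the repository's `node_factory` module):

```python
# Illustrative sketch (assumed setup, not the repository's node_factory module).
from openvino.runtime import PartialShape, Type, op
from openvino.runtime.utils.node_factory import NodeFactory

_factory = NodeFactory()  # defaults to the latest standard opset

param = op.Parameter(Type.f32, PartialShape([1, 3]))
relu = _factory.create("Relu", [param.output(0)])  # op name, inputs, optional attributes dict

# The custom ops used in this file (e.g. SentencepieceTokenizer, StringTensorPack)
# additionally require the user_ov_extensions library to be loaded; a plain
# NodeFactory only knows the standard opsets.
```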
@@ -6,17 +6,17 @@
import pytest


def prebuild_extenson_path():
ext_path = os.getenv("CUSTOM_OP_LIB") or os.getenv("OV_TOKENIZER_PREBUILD_EXTENSION_PATH")
if not ext_path:
raise EnvironmentError(
"No extension path found in the environment. "
"Export path to libuser_ov_extensions.so to CUSTOM_OP_LIB or OV_TOKENIZER_PREBUILD_EXTENSION_PATH variable."
)
return ext_path
# def prebuild_extenson_path():
# ext_path = os.getenv("CUSTOM_OP_LIB") or os.getenv("OV_TOKENIZER_PREBUILD_EXTENSION_PATH")
# if not ext_path:
# raise EnvironmentError(
# "No extension path found in the environment. "
# "Export path to libuser_ov_extensions.so to CUSTOM_OP_LIB or OV_TOKENIZER_PREBUILD_EXTENSION_PATH variable."
# )
# return ext_path


os.environ["OV_TOKENIZER_PREBUILD_EXTENSION_PATH"] = prebuild_extenson_path()
# os.environ["OV_TOKENIZER_PREBUILD_EXTENSION_PATH"] = prebuild_extenson_path()
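The fixture code above, which hard-required CUSTOM_OP_LIB or OV_TOKENIZER_PREBUILD_EXTENSION_PATH to be exported before running the tests, is commented out by this commit. A hypothetical way to keep the override optional instead (not what the commit does):

```python
# Hypothetical alternative (illustrative only): honour a prebuilt-extension path
# when the environment provides one, but do not fail when it is absent.
import os

_prebuilt_ext = os.getenv("CUSTOM_OP_LIB") or os.getenv("OV_TOKENIZER_PREBUILD_EXTENSION_PATH")
if _prebuilt_ext:
    os.environ["OV_TOKENIZER_PREBUILD_EXTENSION_PATH"] = _prebuilt_ext
```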
PASS_RATES_FILE = Path(__file__).parent / "pass_rates.json"


@@ -1,5 +1,5 @@
import os
import openvino
# import openvino
import ov_tokenizer

from transformers import AutoTokenizer, AutoModelForSequenceClassification