From 70f1177f75debbf901f812c46a187eaf12f362f8 Mon Sep 17 00:00:00 2001 From: Zlobin Vladimir Date: Thu, 23 May 2024 20:35:40 +0400 Subject: [PATCH 1/5] Fix archive (#8) * enable * libtbb-dev * move * slash * install * core_genai_dev * remove export * rreorganaise components * add SOVERSION, and requirements-build.txt * repalce SKBUILD with EXCLUDE_FROM_ALL because the effect is the same * fix NAMELINK_COMPONENT * remove extraline * add soft restrictions * Fix build to unblock packaging * improve naming * install samples * remove quotes * use main target name because an alias can't be specified in cmake --target * define CMAKE_BUILD_PARALLEL_LEVEL * Ensure ./requirements-build.txt won't outdate * Use ./requirements-build.txt in python lib build * Add missing && * Test Debug * add matrix for windows_genai_package * openvino_tokenizers from form * update openvino_tokenizers * update openvino_tokenizers * update openvino_tokenizers * revert openvino_tokenizers * tokenizers from fork * update tokenizers * centos7_2024.2.0.dev * copy target * revert tokenizers * reapply useful changes * copy so only * fix CMAKE_BUILD_PARALLEL_LEVEL --- .github/dependabot.yml | 4 ++ .github/workflows/genai_package.yml | 38 ++++++++++++----- .github/workflows/genai_python_lib.yml | 14 ++++--- CMakeLists.txt | 5 ++- pyproject.toml | 2 +- requirements-build.txt | 2 + src/cpp/CMakeLists.txt | 42 ++++++++++++------- ....cmake.in => OpenVINOGenAIConfig.cmake.in} | 2 +- src/python/CMakeLists.txt | 13 +++--- text_generation/causal_lm/cpp/CMakeLists.txt | 7 +++- 10 files changed, 87 insertions(+), 42 deletions(-) create mode 100644 requirements-build.txt rename src/cpp/{openvino_genaiConfig.cmake.in => OpenVINOGenAIConfig.cmake.in} (70%) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 9ab4587c2a..789167949f 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -1,5 +1,9 @@ version: 2 updates: + - package-ecosystem: "pip" + directory: "./" + schedule: + interval: "weekly" - package-ecosystem: "pip" directory: "image_generation/stable_diffusion_1_5/cpp/scripts/" schedule: diff --git a/.github/workflows/genai_package.yml b/.github/workflows/genai_package.yml index b6f1647c7a..42ef1da025 100644 --- a/.github/workflows/genai_package.yml +++ b/.github/workflows/genai_package.yml @@ -2,7 +2,9 @@ name: genai_package on: pull_request jobs: ubuntu_genai_package: - if: false + strategy: + matrix: + build-type: [Release, Debug] runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 @@ -14,16 +16,25 @@ jobs: - run: mkdir ./ov/ - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release --target package -j - - run: source ./ov/setupvars.sh && cmake --install ./build/ --config Release --prefix ov - - run: ov/samples/cpp/build_samples.sh -b "${{ github.workspace }}/s pace" + - run: sudo apt-get install libtbb-dev + - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ + - run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j + - run: source ./ov/setupvars.sh && cmake --install ./build/ --config ${{ matrix.build-type }} 
--prefix ov + - run: ov/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace + if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt + if: ${{ 'Release' == matrix.build-type }} - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + if: ${{ 'Release' == matrix.build-type }} - run: source ./ov/setupvars.sh && optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - - run: source ./ov/setupvars.sh && timeout 50s "${{ github.workspace }}/s pace/intel64/Release/greedy_causal_lm" .\TinyLlama-1.1B-Chat-v1.0\ "" + if: ${{ 'Release' == matrix.build-type }} + - run: source ./ov/setupvars.sh && timeout 50s ${{ github.workspace }}/s\ pace/samples_bin/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "" + if: ${{ 'Release' == matrix.build-type }} windows_genai_package: + strategy: + matrix: + build-type: [Release, Debug] runs-on: windows-latest defaults: run: @@ -37,11 +48,16 @@ jobs: python-version: 3.8 - run: curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.2.0-15349-765302e0de1/w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64.zip - run: unzip ov.zip - - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --build ./build/ --config Release --target package -j - - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --install ./build/ --config Release --prefix w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64 - - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\samples\cpp\build_samples_msvc.bat -b "${{ github.workspace }}/samples_build" + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64 + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\samples\cpp\build_samples_msvc.bat -i "${{ github.workspace }}/samples_install" + if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt + if: ${{ 'Release' == matrix.build-type }} - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + if: ${{ 'Release' == matrix.build-type }} - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && "${{ github.workspace 
}}/samples_build/intel64/Release/greedy_causal_lm" .\TinyLlama-1.1B-Chat-v1.0\ "" + if: ${{ 'Release' == matrix.build-type }} + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && "${{ github.workspace }}/samples_install/samples_bin/greedy_causal_lm" .\TinyLlama-1.1B-Chat-v1.0\ "" + if: ${{ 'Release' == matrix.build-type }} diff --git a/.github/workflows/genai_python_lib.yml b/.github/workflows/genai_python_lib.yml index 29f537858a..db6bb9fa65 100644 --- a/.github/workflows/genai_python_lib.yml +++ b/.github/workflows/genai_python_lib.yml @@ -2,7 +2,7 @@ name: genai_python_lib on: pull_request jobs: ubuntu_genai_python_lib: - runs-on: ubuntu-20.04-16-cores + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 with: @@ -16,9 +16,11 @@ jobs: - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j - run: python -m pip install --pre openvino --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly # Can't load CentOS libraries from the archive - - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + # GitHub Actions already provides what is listed in ./requirements-build.txt but the internal + # build system doesn't. Install ./requirements-build.txt to detect possible conflicts. + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt - run: PYTHONPATH=./src/python/ python -c "from openvino_genai import LLMPipeline" - - run: source ./ov/setupvars.sh && python -m pip install --pre . --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + - run: source ./ov/setupvars.sh && CMAKE_BUILD_PARALLEL_LEVEL="" python -m pip install --pre . --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - run: python -c "from openvino_genai import LLMPipeline" - name: GenAI Python API tests run: | @@ -48,7 +50,9 @@ jobs: - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --build ./build/ --config Release -j - run: python -m pip install "numpy<1.27" - - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + # GitHub Actions already provides what is listed in ./requirements-build.txt but the internal + # build system doesn't. Install ./requirements-build.txt to detect possible conflicts. + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt - run: set "PYTHONPATH=./src/python;" && call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -c "from openvino_genai import LLMPipeline" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. - - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install . + - run: set CMAKE_BUILD_PARALLEL_LEVEL=&& call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install . 
- run: python -c "from openvino_genai import LLMPipeline" diff --git a/CMakeLists.txt b/CMakeLists.txt index ac392233a6..6c01b378c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,13 +13,14 @@ if(NOT GENERATOR_IS_MULTI_CONFIG_VAR AND NOT DEFINED CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel ...") endif() -project(openvino_genai VERSION 2024.2.0.0) +project(OpenVINOGenAI VERSION 2024.2.0.0) add_subdirectory(./thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") add_subdirectory(src) add_subdirectory(text_generation/causal_lm/cpp) install(DIRECTORY text_generation/causal_lm/cpp/ DESTINATION samples/cpp/causal_lm COMPONENT cpp_samples_genai) -install(FILES LICENSE third-party-programs.txt DESTINATION licensing_genai COMPONENT licensing_genai) # TODO: how to merge with OPenvino +install(FILES LICENSE DESTINATION licensing COMPONENT licensing_genai RENAME LICENSE-GENAI) +install(FILES third-party-programs.txt DESTINATION licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt) set(CPACK_GENERATOR "ZIP") include(CPack) diff --git a/pyproject.toml b/pyproject.toml index cb373e12c8..f9707988bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ cmake.source-dir = "./" cmake.build-type = "Release" cmake.targets = ["py_generate_pipeline", "genai"] -install.components = ["core_genai", "pygenai"] +install.components = ["wheel_genai"] sdist.cmake = true wheel.packages = ["src/python/openvino_genai"] wheel.install-dir = "openvino_genai" diff --git a/requirements-build.txt b/requirements-build.txt new file mode 100644 index 0000000000..aaaf7148ec --- /dev/null +++ b/requirements-build.txt @@ -0,0 +1,2 @@ +cmake~=3.23 +build~=1.2.1 diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index e0151376b4..30d95d3553 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -57,6 +57,11 @@ target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$" COMMENT "Copy ${TARGET_NAME} to src/python/openvino_genai") -# Copy libcore_tokenizers.so to build_dir/openvino_tokenizers/src/ -add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND "${CMAKE_COMMAND}" -E copy - "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/lib/libcore_tokenizers.so" - "${CMAKE_BINARY_DIR}/openvino_tokenizers/src/" - COMMENT "Copy libcore_tokenizers.so to build_dir/openvino_tokenizers/src/") +find_package(Python3 REQUIRED COMPONENTS Interpreter Development) +install(TARGETS ${TARGET_NAME} + LIBRARY DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR} + RUNTIME DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}) -install(TARGETS ${TARGET_NAME} LIBRARY DESTINATION . COMPONENT core_genai RUNTIME DESTINATION . 
COMPONENT core_genai) +# Copy libcore_tokenizers.so to build_dir/openvino_tokenizers/src/ +if(NOT MSVC) + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND "${CMAKE_COMMAND}" -E copy + "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/lib/libcore_tokenizers.so" + "${CMAKE_BINARY_DIR}/openvino_tokenizers/src/" + COMMENT "Copy libcore_tokenizers.so to build_dir/openvino_tokenizers/src/") +endif() # - Windows: `\runtime\bin\intel64\Release\` # - MacOS_x86: `/runtime/lib/intel64/Release` @@ -94,17 +104,17 @@ endif() if(MSVC OR APPLE) set(ARCH_DIR ${ARCH_DIR}/${CMAKE_BUILD_TYPE}) endif() -install(TARGETS ${TARGET_NAME} EXPORT openvino_genaiTargets - LIBRARY DESTINATION runtime/lib/${ARCH_DIR} COMPONENT core_genai_dev +install(TARGETS ${TARGET_NAME} EXPORT OpenVINOGenAITargets + LIBRARY DESTINATION runtime/lib/${ARCH_DIR} COMPONENT core_genai + NAMELINK_COMPONENT core_genai_dev ARCHIVE DESTINATION runtime/lib/${ARCH_DIR} COMPONENT core_genai_dev - RUNTIME DESTINATION runtime/bin/${ARCH_DIR} COMPONENT core_genai_dev + RUNTIME DESTINATION runtime/bin/${ARCH_DIR} COMPONENT core_genai INCLUDES DESTINATION runtime/include) install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION runtime/include COMPONENT core_genai_dev) -install(EXPORT openvino_genaiTargets FILE openvino_genaiTargets.cmake NAMESPACE openvino:: DESTINATION runtime/cmake) +install(EXPORT OpenVINOGenAITargets FILE OpenVINOGenAITargets.cmake NAMESPACE openvino:: DESTINATION runtime/cmake) include(CMakePackageConfigHelpers) -configure_package_config_file(openvino_genaiConfig.cmake.in "${CMAKE_BINARY_DIR}/openvino_genaiConfig.cmake" INSTALL_DESTINATION runtime/cmake) -install(FILES "${CMAKE_BINARY_DIR}/openvino_genaiConfig.cmake" "${CMAKE_BINARY_DIR}/openvino_genaiConfigVersion.cmake" DESTINATION runtime/cmake COMPONENT core_genai_dev) +configure_package_config_file(OpenVINOGenAIConfig.cmake.in "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfig.cmake" INSTALL_DESTINATION runtime/cmake) +install(FILES "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfig.cmake" "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfig.cmake" DESTINATION runtime/cmake COMPONENT core_genai_dev) include(CMakePackageConfigHelpers) -write_basic_package_version_file("${CMAKE_BINARY_DIR}/openvino_genaiConfigVersion.cmake" VERSION ${CMAKE_PROJECT_VERSION} COMPATIBILITY AnyNewerVersion) -export(EXPORT openvino_genaiTargets FILE "${CMAKE_BINARY_DIR}/openvino_genaiTargets.cmake" NAMESPACE openvino::) -# export(TARGETS ${TARGET_NAME} NAMESPACE openvino:: FILE "${CMAKE_BINARY_DIR}/openvino_genaiConfig.cmake") TODO +write_basic_package_version_file("${CMAKE_BINARY_DIR}/OpenVINOGenAIConfigVersion.cmake" VERSION ${CMAKE_PROJECT_VERSION} COMPATIBILITY AnyNewerVersion) +export(EXPORT OpenVINOGenAITargets FILE "${CMAKE_BINARY_DIR}/OpenVINOGenAITargets.cmake" NAMESPACE openvino::) diff --git a/src/cpp/openvino_genaiConfig.cmake.in b/src/cpp/OpenVINOGenAIConfig.cmake.in similarity index 70% rename from src/cpp/openvino_genaiConfig.cmake.in rename to src/cpp/OpenVINOGenAIConfig.cmake.in index abfd33b524..18c0bb4e48 100644 --- a/src/cpp/openvino_genaiConfig.cmake.in +++ b/src/cpp/OpenVINOGenAIConfig.cmake.in @@ -4,7 +4,7 @@ include(CMakeFindDependencyMacro) find_dependency(OpenVINO COMPONENTS Runtime) if(NOT TARGET genai) - include("${CMAKE_CURRENT_LIST_DIR}/openvino_genaiTargets.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/OpenVINOGenAITargets.cmake") endif() check_required_components(openvino_genai) diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 62f26f3215..00722b6fff 
100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -5,8 +5,8 @@ include(FetchContent) FetchContent_Declare( pybind11 - GIT_REPOSITORY https://github.com/pybind/pybind11 - GIT_TAG v2.12.0 + URL https://github.com/pybind/pybind11/archive/3e9dfa2866941655c56877882565e7577de6fc7b.tar.gz + URL_HASH SHA256=9a7d245f405f470798b9d2a48912cc97230658024775299eac203f7c9c9ae37c ) set(CMAKE_POSITION_INDEPENDENT_CODE ON) FetchContent_GetProperties(pybind11) @@ -16,9 +16,7 @@ if(NOT pybind11_POPULATED) endif() pybind11_add_module(py_generate_pipeline py_generate_pipeline.cpp) -target_link_libraries(py_generate_pipeline PRIVATE genai) - -install(TARGETS py_generate_pipeline LIBRARY DESTINATION . COMPONENT pygenai) +target_link_libraries(py_generate_pipeline PRIVATE openvino::genai) # setting RPATH / LC_RPATH depending on platform if(LINUX) @@ -46,3 +44,8 @@ add_custom_command(TARGET py_generate_pipeline POST_BUILD find_package(Python3 REQUIRED COMPONENTS Interpreter Development) install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/ DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}) +install(TARGETS py_generate_pipeline LIBRARY DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}) + +# wheel_genai component is used for wheel generation in pyproject.toml. +# Exclude wheel_genai from normal packaging process. +install(TARGETS genai py_generate_pipeline LIBRARY DESTINATION . COMPONENT wheel_genai RUNTIME DESTINATION . COMPONENT wheel_genai EXCLUDE_FROM_ALL) diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt index 1998c3ccb6..7e3ec23fde 100644 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ b/text_generation/causal_lm/cpp/CMakeLists.txt @@ -10,7 +10,7 @@ else() set(OPENVINO_TOKENIZERS_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../bin/openvino_tokenizers.dll) # TODO: I'll go away after the generate() gets a way to find openvino_tokenizers endif() -find_package(openvino_genai REQUIRED PATHS +find_package(OpenVINOGenAI REQUIRED PATHS "${CMAKE_BINARY_DIR}" # Reuse the package from the build. ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
) @@ -51,3 +51,8 @@ target_link_libraries(chat_sample PRIVATE openvino::genai) target_include_directories(chat_sample PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") set_target_properties(chat_sample PROPERTIES CXX_STANDARD 17) set_target_properties(chat_sample PROPERTIES CXX_STANDARD_REQUIRED ON) + +install(TARGETS greedy_causal_lm beam_search_causal_lm speculative_decoding_lm prompt_lookup_decoding_lm chat_sample + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) From da729ba1d61035347d0864fa856a743fb433416a Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 24 May 2024 08:29:35 +0200 Subject: [PATCH 2/5] Apply suggestions from code review Co-authored-by: Yaroslav Tarkan Co-authored-by: Xiake Sun Co-authored-by: Ilya Lavrenov --- src/README.md | 12 ++++++------ src/cpp/include/openvino/genai/generation_config.hpp | 3 +-- src/cpp/include/openvino/genai/llm_pipeline.hpp | 6 +++--- src/cpp/include/openvino/genai/streamer_base.hpp | 2 +- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/README.md b/src/README.md index ad21250989..2f729b8b3c 100644 --- a/src/README.md +++ b/src/README.md @@ -8,7 +8,7 @@ optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weigh pip install openvino-genai ``` -LLMPipeline is the main object used for decoding. You can initiliza it straigh away from the folder with the converted model. It will automanically load the main model, tokenizer, detokenizer and default generation configuration. +`LLMPipeline` is the main object used for decoding. You can initialize it straight away from the folder with the converted model. It will automatically load the main model, tokenizer, detokenizer and default generation configuration. ### Python @@ -129,7 +129,7 @@ int main(int argc, char* argv[]) { for (size_t i = 0; i < questions.size(); i++) { std::cout << "question:\n"; - cout << prompt << endl; + std::cout << prompt << std::endl; auto answer = pipe(prompt, config, streamer); // no need to print answer, streamer will do that @@ -138,7 +138,7 @@ int main(int argc, char* argv[]) { } ``` -Streaming exapmle with lambda function +Streaming example with lambda function ``` cpp @@ -156,11 +156,11 @@ int main(int argc, char* argv[]) { Streaming with custom class ``` cpp -#include +#include "openvino/genai/streamer_base.hpp" #include "openvino/genai/llm_pipeline.hpp" #include -class CustomStreamer: publict StreamerBase { +class CustomStreamer: public ov::StreamerBase { public: void put(int64_t token) { /* custom decoding/tokens processing code @@ -180,6 +180,6 @@ int main(int argc, char* argv[]) { std::string model_path = argv[1]; ov::LLMPipeline pipe(model_path, "CPU"); - cout << pipe.generate("The Sun is yellow bacause", custom_streamer); + std::cout << pipe.generate("The Sun is yellow bacause", custom_streamer); } ``` diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 837fae21ad..879f802ae7 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -53,12 +53,11 @@ enum class StopCriteria { early, heuristic, never }; * @param eos_token_id id of token * @param bos_token token string representation * @param eos_token token string representation - * @param draft_model draft model for assitive decoding */ class OPENVINO_GENAI_EXPORTS GenerationConfig { public: GenerationConfig() = default; - GenerationConfig(std::string json_path); + explicit GenerationConfig(std::string json_path); 
// Generic size_t max_new_tokens = SIZE_MAX; diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 3bc8453d4e..48f9292b02 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -6,7 +6,7 @@ #include #include -#include +#include "openvino/core/any.hpp" #include "openvino/genai/generation_config.hpp" #include "openvino/genai/tokenizer.hpp" #include "openvino/genai/streamer_base.hpp" @@ -174,10 +174,10 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { }; /* - * utils that allow to use generate and operarator() in the folllowing way: + * utils that allow to use generate and operator() in the following way: * pipe.generate(input_ids, ov::max_new_tokens(200), ov::temperature(1.0f),...) * pipe(text, ov::max_new_tokens(200), ov::temperature(1.0f),...) - * All names match to names in cofnig except streamer. + * All names match to names in config except streamer. */ static constexpr ov::Property max_new_tokens{"max_new_tokens"}; static constexpr ov::Property max_length{"max_length"}; diff --git a/src/cpp/include/openvino/genai/streamer_base.hpp b/src/cpp/include/openvino/genai/streamer_base.hpp index 3f0879d702..385cb2bf1e 100644 --- a/src/cpp/include/openvino/genai/streamer_base.hpp +++ b/src/cpp/include/openvino/genai/streamer_base.hpp @@ -15,7 +15,7 @@ namespace ov { class StreamerBase { public: Tokenizer m_tokenizer; - StreamerBase(Tokenizer tokenizer): m_tokenizer(tokenizer) {}; + explicit StreamerBase(Tokenizer tokenizer): m_tokenizer(tokenizer) {} StreamerBase() = default; /// @brief put is called every time new token is decoded From 28c313be5b48cb287ad631a2817999f26ee6dd16 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 24 May 2024 09:43:33 +0200 Subject: [PATCH 3/5] add groups to GenerationConfig docstring remove generation_config_helper fix windows failing define operator() alias in hpp add GENAI_MODELS_PATH_PREFIX env to test_generate_api improve LLMPipeline constructor --- .github/workflows/causal_lm_cpp.yml | 1 + src/README.md | 2 +- .../openvino/genai/generation_config.hpp | 36 ++-- .../include/openvino/genai/llm_pipeline.hpp | 25 +-- src/cpp/src/generation_config.cpp | 28 ++-- src/cpp/src/generation_config_helper.hpp | 31 ---- src/cpp/src/greedy_decoding.cpp | 15 +- src/cpp/src/group_beam_searcher.cpp | 10 +- src/cpp/src/llm_pipeline.cpp | 158 +++++++++--------- src/cpp/src/tokenizer.cpp | 1 - tests/python_tests/list_test_models.py | 5 +- tests/python_tests/test_generate_api.py | 10 +- 12 files changed, 159 insertions(+), 163 deletions(-) delete mode 100644 src/cpp/src/generation_config_helper.hpp diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index a07dacac30..a0687e231d 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -194,6 +194,7 @@ jobs: shell: cmd run: | call w_openvino_toolkit_windows_2024.1.0.15008.f4afc983258_x86_64\setupvars.bat + set PATH=.\build\src\cpp\Release;%PATH% .\build\text_generation\causal_lm\cpp\Release\beam_search_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ "69" > .\pred.txt echo import transformers > ref.py diff --git a/src/README.md b/src/README.md index 2f729b8b3c..be51fb10b2 100644 --- a/src/README.md +++ b/src/README.md @@ -8,7 +8,7 @@ optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weigh pip install openvino-genai ``` -`LLMPipeline` is the main object used for decoding. 
You can initialize it straight away from the folder with the converted model. It will automatically load the main model, tokenizer, detokenizer and default generation configuration. +`LLMPipeline` is the main object used for decoding. You can construct it straight away from the folder with the converted model. It will automatically load the main model, tokenizer, detokenizer and default generation configuration. ### Python diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 879f802ae7..529f5ac8f3 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -22,37 +22,43 @@ namespace ov { enum class StopCriteria { early, heuristic, never }; /** - * @brief structure to keep generation config parameters. + * @brief Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group + * and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will + * be used while greedy and beam search parameters will not affect decoding at all. * + * Generic parameters: * @param max_length the maximum length the generated tokens can have. Corresponds to the length of the input prompt + * `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. * @param max_new_tokens the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. * @param ignore_eos if set to true, then generation will not stop even if token is met. + * @param pad_token_id token_id of (padding) + * @param bos_token_id token_id of (beggining of sentence) + * @param eos_token_id token_id of (end of sentence) + * @param bos_token token string representation + * @param eos_token token string representation + * + * Beam search specific parameters: * @param num_beams number of beams for beam search. 1 disables beam search. * @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. * @param diversity_penalty this value is subtracted from a beam's score if it generates the same token as any beam from other group at a - * particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled. + * particular time. * @param length_penalty exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to * the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log * likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while * `length_penalty` < 0.0 encourages shorter sequences. - * @param num_return_sequences the number of sequences to return for grouped beam search decoding + * @param num_return_sequences the number of sequences to return for grouped beam search decoding. * @param no_repeat_ngram_size if set to int > 0, all ngrams of that size can only occur once. * @param stop_criteria controls the stopping condition for grouped beam search. 
It accepts the following values: * "early", where the generation stops as soon as there are `num_beams` complete candidates; "heuristic", where an * heuristic is applied and the generation stops when is it very unlikely to find better candidates; * "never", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). - * @param temperature the value used to modulate token probabilities for random sampling + * + * Random sampling parameters: + * @param temperature the value used to modulate token probabilities for random sampling. * @param top_p - if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. * @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering. - * @param do_sample whether or not to use multinomial random sampling - * that add up to `top_p` or higher are kept. - * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. See https://arxiv.org/pdf/1909.05858. - * @param pad_token_id id of padding token - * @param bos_token_id id of token - * @param eos_token_id id of token - * @param bos_token token string representation - * @param eos_token token string representation + * @param do_sample whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. + * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. */ class OPENVINO_GENAI_EXPORTS GenerationConfig { public: @@ -88,6 +94,12 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { // used for chat scenario std::string bos_token = ""; std::string eos_token = ""; + + size_t get_max_new_tokens(size_t prompt_length = 0) const; + bool is_greedy_decoding() const; + bool is_beam_search() const; + bool is_multimomial() const; + static GenerationConfig anymap_to_generation_config(const ov::AnyMap& config_map = {}); }; } // namespace ov diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 48f9292b02..7d9d1ea9b3 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -71,7 +71,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { * @param device optional device * @param plugin_config optional plugin_config */ - LLMPipeline(std::string& path, std::string device="CPU", + LLMPipeline(const std::string& path, const std::string& device="CPU", const ov::AnyMap& plugin_config={}, const std::string& ov_tokenizers_path=""); @@ -84,9 +84,9 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { * @param plugin_config optional plugin_config */ LLMPipeline( - const std::string model_path, + const std::string& model_path, const ov::Tokenizer& tokenizer, - const std::string device="CPU", + const std::string& device="CPU", const ov::AnyMap& plugin_config = {}, const std::string& ov_tokenizers_path="" ); @@ -127,8 +127,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { * @param generation_config optional GenerationConfig * @return DecodedResults a structure with resulting texts & scores */ - DecodedResults generate(std::vector texts, OptionalGenerationConfig generation_config); - DecodedResults generate(std::initializer_list text, OptionalGenerationConfig generation_config); + DecodedResults generate(const std::vector& texts, OptionalGenerationConfig generation_config); /** * @brief Low level generate to be called with already encoded input_ids tokens. 
@@ -153,12 +152,17 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { return generate(text, AnyMap{std::forward(properties)...}); } - DecodedResults operator()(std::vector text, OptionalGenerationConfig generation_config=std::nullopt); - DecodedResults operator()(std::initializer_list text, OptionalGenerationConfig generation_config=std::nullopt); + DecodedResults operator()(const std::vector& text, OptionalGenerationConfig generation_config=std::nullopt) { + return generate(text, generation_config); + } - // generate with streamers - std::string operator()(std::string text, OptionalGenerationConfig generation_config=std::nullopt, OptionalStreamerVariant streamer=std::nullopt); - std::string operator()(std::string text, OptionalStreamerVariant streamer); + std::string operator()( + std::string text, + OptionalGenerationConfig generation_config=std::nullopt, + OptionalStreamerVariant streamer=std::nullopt + ) { + return generate(text, generation_config, streamer); + } ov::Tokenizer get_tokenizer(); GenerationConfig get_generation_config() const; @@ -177,7 +181,6 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { * utils that allow to use generate and operator() in the following way: * pipe.generate(input_ids, ov::max_new_tokens(200), ov::temperature(1.0f),...) * pipe(text, ov::max_new_tokens(200), ov::temperature(1.0f),...) - * All names match to names in config except streamer. */ static constexpr ov::Property max_new_tokens{"max_new_tokens"}; static constexpr ov::Property max_length{"max_length"}; diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index 14fc370c59..d9f98837da 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -6,12 +6,10 @@ #include #include - #include "openvino/genai/generation_config.hpp" - -#include "generation_config_helper.hpp" #include "utils.hpp" + namespace { @@ -62,10 +60,10 @@ GenerationConfig::GenerationConfig(std::string json_path) { } -GenerationConfig GenerationConfigHelper::anymap_to_generation_config(const ov::AnyMap& config_map) { +GenerationConfig GenerationConfig::anymap_to_generation_config(const ov::AnyMap& config_map) { using ov::generate_utils::read_anymap_param; - GenerationConfig config = m_config; + GenerationConfig config; read_anymap_param(config_map, "max_new_tokens", config.max_new_tokens); read_anymap_param(config_map, "max_length", config.max_length); read_anymap_param(config_map, "ignore_eos", config.ignore_eos); @@ -90,25 +88,25 @@ GenerationConfig GenerationConfigHelper::anymap_to_generation_config(const ov::A return config; } -size_t GenerationConfigHelper::get_max_new_tokens(size_t prompt_length) { +size_t GenerationConfig::get_max_new_tokens(size_t prompt_length) const { // max_new_tokens has priority over max_length, only if max_new_tokens was not specified use max_length - if (m_config.max_new_tokens != SIZE_MAX) { - return m_config.max_new_tokens; + if (max_new_tokens != SIZE_MAX) { + return max_new_tokens; } else { - return m_config.max_length - prompt_length; + return max_length - prompt_length; } } -bool GenerationConfigHelper::is_greedy_decoding() const { - return !m_config.do_sample && !is_beam_search(); +bool GenerationConfig::is_greedy_decoding() const { + return !do_sample && !is_beam_search(); } -bool GenerationConfigHelper::is_beam_search() const { - return m_config.num_beams > 1; +bool GenerationConfig::is_beam_search() const { + return num_beams > 1; } -bool GenerationConfigHelper::is_multimomial() const { - return m_config.do_sample; +bool 
GenerationConfig::is_multimomial() const { + return do_sample; } } // namespace ov diff --git a/src/cpp/src/generation_config_helper.hpp b/src/cpp/src/generation_config_helper.hpp deleted file mode 100644 index f4e5839990..0000000000 --- a/src/cpp/src/generation_config_helper.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include "openvino/genai/generation_config.hpp" - -namespace ov { - - -class GenerationConfigHelper { -public: - GenerationConfig m_config; - - GenerationConfigHelper() = default; - - GenerationConfigHelper(const GenerationConfig& config): m_config(config) {}; - - size_t get_max_new_tokens(size_t prompt_length = 0); - - bool is_greedy_decoding() const; - - bool is_beam_search() const; - - bool is_multimomial() const; - - GenerationConfig anymap_to_generation_config(const ov::AnyMap& config_map = {}); - -}; - -} // namespace ov diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index 3298553a76..4d785077a7 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -1,17 +1,20 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include "generation_config_helper.hpp" #include "openvino/genai/llm_pipeline.hpp" #include "utils.hpp" namespace ov { -ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner, - ov::Tensor input_ids, ov::Tensor attention_mask, ov::GenerationConfig generation_config, - std::shared_ptr streamer, bool is_chat_conversation) { +ov::EncodedResults greedy_decoding( + ov::InferRequest& m_model_runner, + ov::Tensor input_ids, + ov::Tensor attention_mask, + const ov::GenerationConfig generation_config, + const std::shared_ptr streamer, + const bool is_chat_conversation +) { - ov::GenerationConfigHelper config_helper = generation_config; ov::Shape prompts_shape = input_ids.get_shape(); size_t batch_size = prompts_shape[0]; size_t prompt_len = prompts_shape[1]; @@ -58,7 +61,7 @@ ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner, auto beam_data = m_model_runner.get_tensor("beam_idx").data(); std::iota(beam_data, beam_data + batch_size, 0); - size_t max_tokens = config_helper.get_max_new_tokens(prompt_len); + size_t max_tokens = generation_config.get_max_new_tokens(prompt_len); m_model_runner.infer(); auto logits = m_model_runner.get_tensor("logits"); diff --git a/src/cpp/src/group_beam_searcher.cpp b/src/cpp/src/group_beam_searcher.cpp index 312671c8f0..d801d2674e 100644 --- a/src/cpp/src/group_beam_searcher.cpp +++ b/src/cpp/src/group_beam_searcher.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include "generation_config_helper.hpp" #include "openvino/genai/llm_pipeline.hpp" #include "utils.hpp" @@ -87,7 +86,7 @@ bool greater(const Beam& left, const Beam& right) { struct Parameters { std::vector> prompts; - int64_t eos_token; + int64_t eos_token_id; size_t n_groups = 3; size_t group_size = 5; float diversity_penalty = 1.0; @@ -110,7 +109,7 @@ struct Group { beam.score /= std::pow(float(beam.tokens.size()), parameters.length_penalty); // HF implementation counts eos_token for length penalty calculation - if (beam.tokens.back() == parameters.eos_token) { + if (beam.tokens.back() == parameters.eos_token_id) { beam.tokens.pop_back(); } @@ -270,7 +269,7 @@ struct GroupBeamSearcher { std::partial_sort(candidates.begin(), to_sort, candidates.end(), greater); group->ongoing.clear(); for (size_t cand_idx = 0; cand_idx < 
candidates.size(); ++cand_idx) { - if (parameters.eos_token == candidates.at(cand_idx).tokens.back()) { + if (parameters.eos_token_id == candidates.at(cand_idx).tokens.back()) { // If beam_token does not belong to top num_beams tokens, it should not be added if (cand_idx >= parameters.group_size) { continue; @@ -370,7 +369,6 @@ void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention namespace ov { EncodedResults beam_search(ov::InferRequest& lm, ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig config) { - GenerationConfigHelper config_helper = config; OPENVINO_ASSERT(config.num_beams % config.num_beam_groups == 0, "number of beams should be divisible by number of groups"); // Initialize beam search @@ -388,7 +386,7 @@ EncodedResults beam_search(ov::InferRequest& lm, ov::Tensor input_ids, ov::Tenso Parameters parameters{std::move(prompts)}; parameters.max_new_tokens = config.max_new_tokens; - parameters.eos_token = config.eos_token_id; + parameters.eos_token_id = config.eos_token_id; parameters.n_groups = config.num_beam_groups; parameters.group_size = config.num_beams / config.num_beam_groups; parameters.diversity_penalty = config.diversity_penalty; diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 9ea685e583..49a7fde38b 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -13,9 +13,51 @@ #include "openvino/genai/generation_config.hpp" #include "openvino/genai/llm_pipeline.hpp" #include "utils.hpp" -#include "generation_config_helper.hpp" #include "text_callback_streamer.hpp" +namespace { + +ov::GenerationConfig from_config_json_if_exists(const std::string& path) { + constexpr char generation_config_fname[] = "generation_config.json"; + constexpr char config_fname[] = "config.json"; + if (std::filesystem::exists(path + "/" + generation_config_fname)) { + return ov::GenerationConfig(path + "/" + generation_config_fname); + } else if (std::filesystem::exists(path + "/" + config_fname)) { + // some models (e.g. google/gemma-*) do not have generation_config.json, but have config.json + // and special tokens are stored there. 
+ std::ifstream file(path + "/" + config_fname); + if (!file.is_open()) + return ov::GenerationConfig{}; + + nlohmann::json data = nlohmann::json::parse(file); + using ov::generate_utils::read_json_param; + ov:: GenerationConfig config; + + read_json_param(data, "pad_token_id", config.pad_token_id); + read_json_param(data, "bos_token_id", config.bos_token_id); + read_json_param(data, "eos_token_id", config.eos_token_id); + return config; + + } + return ov::GenerationConfig{}; +} + +std::string from_tokenizer_json_if_exists(const std::string& path) { + std::string res = ""; + + if (!std::filesystem::exists(path)) + return res; + + std::ifstream file(path + "/tokenizer_config.json"); + if (!file.is_open()) + return res; + + ov::generate_utils::read_json_param(nlohmann::json::parse(file), "chat_template", res); + return res; +} + +} + namespace ov { @@ -23,9 +65,9 @@ ov::EncodedResults greedy_decoding( ov::InferRequest& model_runner, ov::Tensor prompts, ov::Tensor attentin_mask, - GenerationConfig sampling_params, - std::shared_ptr streamer, - bool is_chat_conversation = false + const GenerationConfig sampling_params, + const std::shared_ptr streamer, + const bool is_chat_conversation = false ); EncodedResults beam_search(ov::InferRequest& lm, ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig config); @@ -36,20 +78,23 @@ class LLMPipeline::LLMPipelineImpl { ov::InferRequest m_model_runner; Tokenizer m_tokenizer; GenerationConfig m_generation_config; - std::string m_device; - ov::AnyMap m_plugin_config; std::string m_chat_template = ""; bool is_chat_conversation = false; LLMPipelineImpl( - const std::string model_path, + const std::string& model_path, const ov::Tokenizer& tokenizer, - const std::string device, + const std::string& device, const ov::AnyMap& plugin_config, const std::string& ov_tokenizers_path="" ); - LLMPipelineImpl(std::string& path, std::string device, const ov::AnyMap& config, const std::string& ov_tokenizers_path=""); + LLMPipelineImpl( + const std::string& path, + const std::string& device, + const ov::AnyMap& config, + const std::string& ov_tokenizers_path="" + ); GenerationConfig generation_config() const; @@ -65,11 +110,10 @@ class LLMPipeline::LLMPipelineImpl { using namespace std; - ov::LLMPipeline::LLMPipeline( - const std::string model_path, + const std::string& model_path, const ov::Tokenizer& tokenizer, - const std::string device, + const std::string& device, const ov::AnyMap& plugin_config, const std::string& ov_tokenizers_path ) { @@ -77,12 +121,12 @@ ov::LLMPipeline::LLMPipeline( } ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( - const std::string model_path, + const std::string& model_path, const ov::Tokenizer& tokenizer, - std::string device, + const std::string& device, const ov::AnyMap& plugin_config, const std::string& ov_tokenizers_path -): m_tokenizer(tokenizer), m_device(device), m_plugin_config(plugin_config) { +): m_tokenizer(tokenizer) { ov::Core core; std::string full_path = model_path; @@ -95,43 +139,26 @@ ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( } } -ov::LLMPipeline::LLMPipeline(std::string& path, std::string device, const ov::AnyMap& config, const std::string& ov_tokenizers_path) { +ov::LLMPipeline::LLMPipeline( + const std::string& path, + const std::string& device, + const ov::AnyMap& config, + const std::string& ov_tokenizers_path +) { m_pimpl = make_unique(path, device, config, ov_tokenizers_path); } -ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl(std::string& path, std::string device, - const ov::AnyMap& config, 
const std::string& ov_tokenizers_path) { - std::string config_path = path + "/" + "config.json"; - std::string tokenizer_config_path = path + "/" +"tokenizer_config.json"; - std::string generation_config_path = path + "/" +"generation_config.json"; - - if (std::filesystem::exists(generation_config_path)) { - m_generation_config = GenerationConfig(generation_config_path); - } else if (std::filesystem::exists(config_path)) { - // some models (e.g. google/gemma-*) do not have generation_config.json, but have config.json - // and special tokens are stored there. - - std::ifstream f(config_path); - OPENVINO_ASSERT(f.is_open(), "Failed to open '" + config_path + "' with config.json"); - - nlohmann::json data = nlohmann::json::parse(f); - using ov::generate_utils::read_json_param; - read_json_param(data, "pad_token_id", m_generation_config.pad_token_id); - read_json_param(data, "bos_token_id", m_generation_config.bos_token_id); - read_json_param(data, "eos_token_id", m_generation_config.eos_token_id); - } - - if (std::filesystem::exists(tokenizer_config_path)) { - std::ifstream f(tokenizer_config_path); - ov::generate_utils::read_json_param(nlohmann::json::parse(f), "chat_template", m_chat_template); - } - - m_device = device; - - ov::Core core; - m_model_runner = core.compile_model(path + "/openvino_model.xml", device, config).create_infer_request(); - m_tokenizer = Tokenizer(path, device, ov_tokenizers_path); -} +ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( + const std::string& path, + const std::string& device, + const ov::AnyMap& config, + const std::string& ov_tokenizers_path +): + m_model_runner{ov::Core{}.compile_model(path + "/openvino_model.xml", device, config).create_infer_request()}, + m_tokenizer{Tokenizer(path, device, ov_tokenizers_path)}, + m_generation_config{from_config_json_if_exists(path)}, + m_chat_template{from_tokenizer_json_if_exists(path)} + {} ov::GenerationConfig ov::LLMPipeline::LLMPipelineImpl::generation_config() const { return m_generation_config; @@ -191,14 +218,10 @@ std::string ov::LLMPipeline::LLMPipelineImpl::generate( return m_tokenizer.decode(generate_results.tokens)[0]; } -ov::DecodedResults ov::LLMPipeline::generate(std::vector texts, OptionalGenerationConfig generation_config) { +ov::DecodedResults ov::LLMPipeline::generate(const std::vector& texts, OptionalGenerationConfig generation_config) { return m_pimpl->generate(texts, generation_config); } -ov::DecodedResults ov::LLMPipeline::generate(std::initializer_list text, OptionalGenerationConfig generation_config) { - return m_pimpl->generate(text, generation_config); -} - ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::generate(std::vector texts, OptionalGenerationConfig generation_config) { auto [input_ids, attention_mask] = m_tokenizer.encode(texts); @@ -207,14 +230,6 @@ ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::generate(std::vector texts, OptionalGenerationConfig generation_config) { - return m_pimpl-> generate(texts, generation_config); -} - -ov::DecodedResults ov::LLMPipeline::operator()(std::initializer_list text, OptionalGenerationConfig generation_config) { - return m_pimpl->generate(text, generation_config); -} - ov::EncodedResults ov::LLMPipeline::LLMPipeline::generate(ov::Tensor input_ids, std::optional attention_mask, OptionalGenerationConfig generation_config, @@ -229,7 +244,6 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate( ) { ov::EncodedResults result; GenerationConfig config = (generation_config.has_value()) ? 
*generation_config : m_generation_config; - GenerationConfigHelper config_helper = config; std::shared_ptr streamer_ptr; if (!streamer.has_value()){ @@ -240,15 +254,15 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate( streamer_ptr = std::make_shared(m_tokenizer, *callback); } auto batch_size = input_ids.get_shape().at(0); - if ((batch_size != 1 || !config_helper.is_greedy_decoding()) && streamer_ptr) { + if ((batch_size != 1 || !config.is_greedy_decoding()) && streamer_ptr) { OPENVINO_THROW("Currently streaming is possible only with batch size=1 and greedy decoding"); } auto attention_mask_data = attention_mask.has_value() ? *attention_mask : ov::generate_utils::init_attention_mask(input_ids); - if (config_helper.is_greedy_decoding()) { + if (config.is_greedy_decoding()) { result = ov::greedy_decoding(m_model_runner, input_ids, attention_mask_data, config, streamer_ptr, is_chat_conversation); - } else if (config_helper.is_beam_search()) { + } else if (config.is_beam_search()) { result = beam_search(m_model_runner, input_ids, attention_mask_data, config); } else { // todo: implement multinomial sampling @@ -267,7 +281,7 @@ std::string ov::LLMPipeline::generate(std::string text, OptionalGenerationConfig std::string ov::LLMPipeline::generate(std::string text, const ov::AnyMap& config_map) { OptionalStreamerVariant streamer; - auto config = GenerationConfigHelper(get_generation_config()).anymap_to_generation_config(config_map); + auto config = GenerationConfig::anymap_to_generation_config(config_map); if (config_map.count("streamer")) { streamer = config_map.at("streamer").as>(); } @@ -277,7 +291,7 @@ std::string ov::LLMPipeline::generate(std::string text, const ov::AnyMap& config ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids, const ov::AnyMap& config_map) { OptionalStreamerVariant streamer; - auto config = GenerationConfigHelper(get_generation_config()).anymap_to_generation_config(config_map); + auto config = GenerationConfig::anymap_to_generation_config(config_map); if (config_map.count("streamer")) { streamer = config_map.at("streamer").as>(); } @@ -286,14 +300,6 @@ ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids, const ov::Any return m_pimpl->generate(input_ids, attention_mask, config, streamer); } -std::string ov::LLMPipeline::operator()(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer) { - return m_pimpl->generate(text, generation_config, streamer); -} - -std::string ov::LLMPipeline::operator()(std::string text, OptionalStreamerVariant streamer) { - return m_pimpl->generate(text, m_pimpl->m_generation_config, streamer); -} - ov::Tokenizer ov::LLMPipeline::get_tokenizer() { return m_pimpl->m_tokenizer; } diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 778778faec..321597b5b7 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -47,7 +47,6 @@ class Tokenizer::TokenizerImpl { public: ov::InferRequest m_tokenize_request; ov::InferRequest m_detokenizer_request; - std::string m_device; int64_t m_pad_token_id = 0; int64_t m_bos_token_id = 1; int64_t m_eos_token_id = 2; diff --git a/tests/python_tests/list_test_models.py b/tests/python_tests/list_test_models.py index 09addcfaba..a24a4fd13d 100644 --- a/tests/python_tests/list_test_models.py +++ b/tests/python_tests/list_test_models.py @@ -14,7 +14,10 @@ def models_list(): # ("microsoft/phi-1_5", "phi-1_5/"), # ("Qwen/Qwen1.5-7B-Chat", "Qwen1.5-7B-Chat"), ] - return model_ids + import os + prefix = 
os.getenv('GENAI_MODELS_PATH_PREFIX', '') + return [(model_id, os.path.join(prefix, model_path)) for model_id, model_path in model_ids] + if __name__ == "__main__": for model_id, model_path in models_list(): diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 1d46e227c9..e7f9adf5d5 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -14,6 +14,7 @@ def model_fixture(request): return model_id, path, tokenizer, model def run_hf_ov_genai_comparison(model_fixture, generation_config, prompt): + import openvino_genai as ov_genai model_id, path, tokenizer, model = model_fixture generation_config_hf = generation_config.copy() @@ -28,10 +29,13 @@ def run_hf_ov_genai_comparison(model_fixture, generation_config, prompt): hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:]) device = 'CPU' - ov_tokenizers_path = '../../build/openvino_tokenizers/src/' - import openvino_genai as ov_genai + # pipe = ov_genai.LLMPipeline(path, device) + import os + build_dir = os.getenv('GENAI_BUILD_DIR', 'build') + ov_tokenizers_path = f'{build_dir}/openvino_tokenizers/src/' pipe = ov_genai.LLMPipeline(path, device, {}, ov_tokenizers_path) + ov_output = pipe.generate(prompt, **generation_config) if hf_output != ov_output: @@ -46,7 +50,7 @@ def stop_criteria_map(): test_cases = [ (dict(max_new_tokens=20, do_sample=False), 'table is made of'), # generation_config, prompt - # (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'), + (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'), # (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=20, diversity_penalty=1.0), 'Alan Turing was a'), # (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=30, diversity_penalty=1.0), 'Alan Turing was a'), # (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'), From c395a8d4232a7ed7293f2b6af9847618f449aa58 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 24 May 2024 11:26:34 +0200 Subject: [PATCH 4/5] refactor namespace ov::* -> ov::genai::* --- src/README.md | 57 +++------ .../openvino/genai/generation_config.hpp | 4 +- .../include/openvino/genai/llm_pipeline.hpp | 10 +- .../include/openvino/genai/streamer_base.hpp | 4 +- src/cpp/include/openvino/genai/tokenizer.hpp | 4 +- src/cpp/src/generation_config.cpp | 12 +- src/cpp/src/greedy_decoding.cpp | 20 +-- src/cpp/src/group_beam_searcher.cpp | 16 +-- src/cpp/src/llm_pipeline.cpp | 79 ++++++------ src/cpp/src/text_callback_streamer.cpp | 4 +- src/cpp/src/text_callback_streamer.hpp | 4 +- src/cpp/src/tokenizer.cpp | 6 +- src/cpp/src/utils.cpp | 6 +- src/cpp/src/utils.hpp | 6 +- src/python/py_generate_pipeline.cpp | 114 +++++++++--------- .../causal_lm/cpp/beam_search_causal_lm.cpp | 4 +- text_generation/causal_lm/cpp/chat_sample.cpp | 6 +- .../causal_lm/cpp/greedy_causal_lm.cpp | 4 +- 18 files changed, 181 insertions(+), 179 deletions(-) diff --git a/src/README.md b/src/README.md index be51fb10b2..250bf4105b 100644 --- a/src/README.md +++ b/src/README.md @@ -63,7 +63,7 @@ Minimalistc example int main(int argc, char* argv[]) { std::string model_path = argv[1]; - ov::LLMPipeline pipe(model_path, "CPU"); + ov::genai::LLMPipeline pipe(model_path, "CPU"); std::cout << pipe.generate("The Sun is yellow 
bacause"); } ``` @@ -75,9 +75,9 @@ Using Group Beam Search Decoding int main(int argc, char* argv[]) { std::string model_path = argv[1]; - ov::LLMPipeline pipe(model_path, "CPU"); + ov::genai::LLMPipeline pipe(model_path, "CPU"); - ov::GenerationConfig config = pipe.get_generation_config(); + ov::genai::GenerationConfig config = pipe.get_generation_config(); config.max_new_tokens = 256; config.num_groups = 3; config.group_size = 5; @@ -87,59 +87,36 @@ int main(int argc, char* argv[]) { } ``` -A simplest chat in C++ +A simple chat in C++ using grouped beam search decoding ``` cpp -#include "openvino/genai/llm_pipeline.hpp" -#include - int main(int argc, char* argv[]) { std::string prompt; std::string model_path = argv[1]; - ov::LLMPipeline pipe(model_path, "CPU"); - - pipe.start_chat(); - for (size_t i = 0; i < questions.size(); i++) { - std::cout << "question:\n"; - std::getline(std::cin, prompt); - - std::cout << pipe(prompt) << std::endl>>; - } - pipe.finish_chat(); -} -``` - -Specifying generation_config to use grouped beam search -``` cpp -int main(int argc, char* argv[]) { - std::string prompt; - - std::string model_path = argv[1]; - ov::LLMPipeline pipe(model_path, "CPU"); + ov::genai::LLMPipeline pipe(model_path, "CPU"); - ov::GenerationConfig config = pipe.get_generation_config(); + ov::genai::GenerationConfig config = pipe.get_generation_config(); config.max_new_tokens = 256; config.num_groups = 3; config.group_size = 5; config.diversity_penalty = 1.0f; - auto streamer = [](std::string word) { std::cout << word << std::flush; }; - pipe.start_chat(); - for (size_t i = 0; i < questions.size(); i++) { - + for (;;;) { std::cout << "question:\n"; - std::cout << prompt << std::endl; + std::getline(std::cin, prompt); + if (prompts == "Stop!") + break; - auto answer = pipe(prompt, config, streamer); - // no need to print answer, streamer will do that + std::cout << "answer:\n"; + auto answer = pipe(prompt, config); + std::cout << answer << std::endl; } pipe.finish_chat(); } ``` Streaming example with lambda function - ``` cpp #include "openvino/genai/llm_pipeline.hpp" @@ -147,20 +124,20 @@ Streaming example with lambda function int main(int argc, char* argv[]) { std::string model_path = argv[1]; - ov::LLMPipeline pipe(model_path, "CPU"); + ov::genai::LLMPipeline pipe(model_path, "CPU"); auto streamer = [](std::string word) { std::cout << word << std::flush; }; std::cout << pipe.generate("The Sun is yellow bacause", streamer); } ``` -Streaming with custom class +Streaming with a custom class ``` cpp #include "openvino/genai/streamer_base.hpp" #include "openvino/genai/llm_pipeline.hpp" #include -class CustomStreamer: public ov::StreamerBase { +class CustomStreamer: public ov::genai::StreamerBase { public: void put(int64_t token) { /* custom decoding/tokens processing code @@ -179,7 +156,7 @@ int main(int argc, char* argv[]) { CustomStreamer custom_streamer; std::string model_path = argv[1]; - ov::LLMPipeline pipe(model_path, "CPU"); + ov::genai::LLMPipeline pipe(model_path, "CPU"); std::cout << pipe.generate("The Sun is yellow bacause", custom_streamer); } ``` diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 529f5ac8f3..9a922549a1 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -12,6 +12,7 @@ #include "openvino/genai/tokenizer.hpp" namespace ov { +namespace genai { /** * @brief controls the stopping condition for grouped beam search. 
The following values are possible: @@ -102,4 +103,5 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { static GenerationConfig anymap_to_generation_config(const ov::AnyMap& config_map = {}); }; -} // namespace ov +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 7d9d1ea9b3..911a5a237a 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -12,6 +12,7 @@ #include "openvino/genai/streamer_base.hpp" namespace ov { +namespace genai { using StreamerVariant = std::variant, std::shared_ptr>; using OptionalGenerationConfig = std::optional; @@ -85,7 +86,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { */ LLMPipeline( const std::string& model_path, - const ov::Tokenizer& tokenizer, + const ov::genai::Tokenizer& tokenizer, const std::string& device="CPU", const ov::AnyMap& plugin_config = {}, const std::string& ov_tokenizers_path="" @@ -164,7 +165,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { return generate(text, generation_config, streamer); } - ov::Tokenizer get_tokenizer(); + ov::genai::Tokenizer get_tokenizer(); GenerationConfig get_generation_config() const; void set_generation_config(const GenerationConfig& generation_config); @@ -210,6 +211,7 @@ static constexpr ov::Property eos_token{"eos_token"}; // only lambda streamer can be set via ov::streamer(),... syntaxic sugar, // because std::variant> can not be stored in AnyMap -static constexpr ov::Property> streamer_lambda{"streamer"}; +static constexpr ov::Property> streamer{"streamer"}; -} // namespace ov +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/streamer_base.hpp b/src/cpp/include/openvino/genai/streamer_base.hpp index 385cb2bf1e..7731b51c1c 100644 --- a/src/cpp/include/openvino/genai/streamer_base.hpp +++ b/src/cpp/include/openvino/genai/streamer_base.hpp @@ -6,6 +6,7 @@ #include "openvino/genai/tokenizer.hpp" namespace ov { +namespace genai { /** * @brief base class for streamers. 
In order to use inherit from from this class and inplement put, and methods @@ -25,4 +26,5 @@ class StreamerBase { virtual void end() = 0; }; -} // namespace ov +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index 03c0cd64f7..e0214fcfbb 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -10,6 +10,7 @@ #include "openvino/genai/visibility.hpp" namespace ov { +namespace genai { /** * @brief class is used to encode prompts and decode resulting tokens @@ -78,4 +79,5 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { std::shared_ptr m_pimpl; }; -} // namespace ov +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index d9f98837da..66f31f7ffd 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -10,16 +10,11 @@ #include "utils.hpp" -namespace { - - -} // namespace - - namespace ov { +namespace genai { GenerationConfig::GenerationConfig(std::string json_path) { - using ov::generate_utils::read_json_param; + using ov::genai::utils::read_json_param; std::ifstream f(json_path); OPENVINO_ASSERT(f.is_open(), "Failed to open '" + json_path + "' with generation config"); @@ -61,7 +56,7 @@ GenerationConfig::GenerationConfig(std::string json_path) { } GenerationConfig GenerationConfig::anymap_to_generation_config(const ov::AnyMap& config_map) { - using ov::generate_utils::read_anymap_param; + using ov::genai::utils::read_anymap_param; GenerationConfig config; read_anymap_param(config_map, "max_new_tokens", config.max_new_tokens); @@ -109,4 +104,5 @@ bool GenerationConfig::is_multimomial() const { return do_sample; } +} // namespace genai } // namespace ov diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index 4d785077a7..51e8023b42 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -5,12 +5,13 @@ #include "utils.hpp" namespace ov { +namespace genai { -ov::EncodedResults greedy_decoding( +EncodedResults greedy_decoding( ov::InferRequest& m_model_runner, ov::Tensor input_ids, ov::Tensor attention_mask, - const ov::GenerationConfig generation_config, + const ov::genai::GenerationConfig generation_config, const std::shared_ptr streamer, const bool is_chat_conversation ) { @@ -23,9 +24,9 @@ ov::EncodedResults greedy_decoding( // todo: make this work even if position_ids are not specified auto position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; - generate_utils::initialize_position_ids(position_ids, attention_mask, kv_cache_len); + utils::initialize_position_ids(position_ids, attention_mask, kv_cache_len); - ov::EncodedResults results; + EncodedResults results; results.scores.resize(batch_size); results.tokens.resize(batch_size); std::fill(results.scores.begin(), results.scores.end(), 0); @@ -72,7 +73,7 @@ ov::EncodedResults greedy_decoding( std::vector token_iter_results(batch_size); // results of a single infer request std::vector eos_met(batch_size, 0); // use int because can not use std::all_of with vector for (size_t batch = 0; batch < batch_size; ++batch) { - auto res = generate_utils::softmax(logits, batch); + auto res = utils::softmax(logits, batch); auto out_token = res.first; results.tokens[batch].emplace_back(res.first); results.scores[batch] += res.second; @@ -89,8 +90,8 @@ ov::EncodedResults greedy_decoding( return results; for (size_t i = 0; i < max_tokens - 1; 
++i) { - generate_utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask")); - m_model_runner.set_tensor("attention_mask", generate_utils::extend_attention(m_model_runner.get_tensor("attention_mask"))); + utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask")); + m_model_runner.set_tensor("attention_mask", utils::extend_attention(m_model_runner.get_tensor("attention_mask"))); // todo: consider replacing with start_async and run callback right after that m_model_runner.infer(); @@ -102,7 +103,7 @@ ov::EncodedResults greedy_decoding( std::vector eos_met(batch_size, 0); // use int because can not use std::all_of with vector for (size_t batch = 0; batch < batch_size; ++batch) { - auto res = ov::generate_utils::softmax(logits, batch); + auto res = ov::genai::utils::softmax(logits, batch); auto out_token = res.first; results.tokens[batch].emplace_back(res.first); results.scores[batch] += res.second; @@ -125,4 +126,5 @@ ov::EncodedResults greedy_decoding( return results; } -} \ No newline at end of file +} //namespace genai +} //namespace ov \ No newline at end of file diff --git a/src/cpp/src/group_beam_searcher.cpp b/src/cpp/src/group_beam_searcher.cpp index d801d2674e..96138cec62 100644 --- a/src/cpp/src/group_beam_searcher.cpp +++ b/src/cpp/src/group_beam_searcher.cpp @@ -91,7 +91,7 @@ struct Parameters { size_t group_size = 5; float diversity_penalty = 1.0; size_t max_new_tokens = 20; - ov::StopCriteria stop_criteria = ov::StopCriteria::heuristic; + ov::genai::StopCriteria stop_criteria = ov::genai::StopCriteria::heuristic; float length_penalty = 1.0; size_t no_repeat_ngram_size = std::numeric_limits::max(); @@ -128,15 +128,15 @@ struct Group { float best_sum_logprobs = ongoing.front().score; float worst_score = min_heap.front().score; switch (parameters.stop_criteria) { - case ov::StopCriteria::early: + case ov::genai::StopCriteria::early: done = true; return; - case ov::StopCriteria::heuristic: { + case ov::genai::StopCriteria::heuristic: { float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty); done = worst_score >= highest_attainable_score; return; } - case ov::StopCriteria::never: { + case ov::genai::StopCriteria::never: { size_t length = parameters.length_penalty > 0.0 ? 
parameters.max_new_tokens : cur_len; float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty); done = worst_score >= highest_attainable_score; @@ -324,7 +324,7 @@ void initialize_inputs(const ov::Tensor& input_ids, const ov::Tensor& attention_ ov::Tensor position_ids = request.get_tensor("position_ids"); position_ids.set_shape(input_shape); - ov::generate_utils::initialize_position_ids(position_ids, attention_mask); + ov::genai::utils::initialize_position_ids(position_ids, attention_mask); ov::Tensor beam_idx = request.get_tensor("beam_idx"); beam_idx.set_shape({input_shape.at(0)}); @@ -367,6 +367,7 @@ void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention namespace ov { +namespace genai { EncodedResults beam_search(ov::InferRequest& lm, ov::Tensor input_ids, ov::Tensor attention_mask, GenerationConfig config) { OPENVINO_ASSERT(config.num_beams % config.num_beam_groups == 0, "number of beams should be divisible by number of groups"); @@ -427,7 +428,7 @@ EncodedResults beam_search(ov::InferRequest& lm, ov::Tensor input_ids, ov::Tenso auto compare_scores = [](Beam left, Beam right) { return (left.score > right.score); }; std::sort(beams.begin(), beams.end(), compare_scores); - ov::EncodedResults results; + ov::genai::EncodedResults results; for (auto beam = beams.begin(); beam != beams.begin() + config.num_return_sequences; ++beam) { results.scores.emplace_back(beam->score); results.tokens.emplace_back(beam->tokens); @@ -435,4 +436,5 @@ EncodedResults beam_search(ov::InferRequest& lm, ov::Tensor input_ids, ov::Tenso return results; } -} // namespace ov +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 49a7fde38b..6f8dc675a0 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -17,21 +17,21 @@ namespace { -ov::GenerationConfig from_config_json_if_exists(const std::string& path) { +ov::genai::GenerationConfig from_config_json_if_exists(const std::string& path) { constexpr char generation_config_fname[] = "generation_config.json"; constexpr char config_fname[] = "config.json"; if (std::filesystem::exists(path + "/" + generation_config_fname)) { - return ov::GenerationConfig(path + "/" + generation_config_fname); + return ov::genai::GenerationConfig(path + "/" + generation_config_fname); } else if (std::filesystem::exists(path + "/" + config_fname)) { // some models (e.g. google/gemma-*) do not have generation_config.json, but have config.json // and special tokens are stored there. 
std::ifstream file(path + "/" + config_fname); if (!file.is_open()) - return ov::GenerationConfig{}; + return ov::genai::GenerationConfig{}; nlohmann::json data = nlohmann::json::parse(file); - using ov::generate_utils::read_json_param; - ov:: GenerationConfig config; + using ov::genai::utils::read_json_param; + ov::genai::GenerationConfig config; read_json_param(data, "pad_token_id", config.pad_token_id); read_json_param(data, "bos_token_id", config.bos_token_id); @@ -39,7 +39,7 @@ ov::GenerationConfig from_config_json_if_exists(const std::string& path) { return config; } - return ov::GenerationConfig{}; + return ov::genai::GenerationConfig{}; } std::string from_tokenizer_json_if_exists(const std::string& path) { @@ -52,16 +52,16 @@ std::string from_tokenizer_json_if_exists(const std::string& path) { if (!file.is_open()) return res; - ov::generate_utils::read_json_param(nlohmann::json::parse(file), "chat_template", res); + ov::genai::utils::read_json_param(nlohmann::json::parse(file), "chat_template", res); return res; } } - namespace ov { +namespace genai { -ov::EncodedResults greedy_decoding( +ov::genai::EncodedResults greedy_decoding( ov::InferRequest& model_runner, ov::Tensor prompts, ov::Tensor attentin_mask, @@ -83,7 +83,7 @@ class LLMPipeline::LLMPipelineImpl { LLMPipelineImpl( const std::string& model_path, - const ov::Tokenizer& tokenizer, + const ov::genai::Tokenizer& tokenizer, const std::string& device, const ov::AnyMap& plugin_config, const std::string& ov_tokenizers_path="" @@ -105,14 +105,15 @@ class LLMPipeline::LLMPipelineImpl { std::string apply_chat_template(std::string prompt, std::string role = "user") const; }; -} // namespace ov +} // namespace genai +} // namespace ov using namespace std; -ov::LLMPipeline::LLMPipeline( +ov::genai::LLMPipeline::LLMPipeline( const std::string& model_path, - const ov::Tokenizer& tokenizer, + const ov::genai::Tokenizer& tokenizer, const std::string& device, const ov::AnyMap& plugin_config, const std::string& ov_tokenizers_path @@ -120,9 +121,9 @@ ov::LLMPipeline::LLMPipeline( m_pimpl = make_unique(model_path, tokenizer, device, plugin_config, ov_tokenizers_path); } -ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( +ov::genai::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( const std::string& model_path, - const ov::Tokenizer& tokenizer, + const ov::genai::Tokenizer& tokenizer, const std::string& device, const ov::AnyMap& plugin_config, const std::string& ov_tokenizers_path @@ -130,7 +131,7 @@ ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( ov::Core core; std::string full_path = model_path; - if (!ov::generate_utils::is_xml(full_path)) + if (!ov::genai::utils::is_xml(full_path)) full_path += "/openvino_model.xml"; try { m_model_runner = core.compile_model(full_path, device, plugin_config).create_infer_request(); @@ -139,7 +140,7 @@ ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( } } -ov::LLMPipeline::LLMPipeline( +ov::genai::LLMPipeline::LLMPipeline( const std::string& path, const std::string& device, const ov::AnyMap& config, @@ -148,7 +149,7 @@ ov::LLMPipeline::LLMPipeline( m_pimpl = make_unique(path, device, config, ov_tokenizers_path); } -ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( +ov::genai::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( const std::string& path, const std::string& device, const ov::AnyMap& config, @@ -160,15 +161,15 @@ ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( m_chat_template{from_tokenizer_json_if_exists(path)} {} -ov::GenerationConfig ov::LLMPipeline::LLMPipelineImpl::generation_config() 
const { +ov::genai::GenerationConfig ov::genai::LLMPipeline::LLMPipelineImpl::generation_config() const { return m_generation_config; } -ov::GenerationConfig ov::LLMPipeline::get_generation_config() const { +ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { return m_pimpl->generation_config(); } -std::string ov::LLMPipeline::LLMPipelineImpl::generate( +std::string ov::genai::LLMPipeline::LLMPipelineImpl::generate( std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer @@ -218,11 +219,11 @@ std::string ov::LLMPipeline::LLMPipelineImpl::generate( return m_tokenizer.decode(generate_results.tokens)[0]; } -ov::DecodedResults ov::LLMPipeline::generate(const std::vector& texts, OptionalGenerationConfig generation_config) { +ov::genai::DecodedResults ov::genai::LLMPipeline::generate(const std::vector& texts, OptionalGenerationConfig generation_config) { return m_pimpl->generate(texts, generation_config); } -ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::generate(std::vector texts, OptionalGenerationConfig generation_config) { +ov::genai::DecodedResults ov::genai::LLMPipeline::LLMPipelineImpl::generate(std::vector texts, OptionalGenerationConfig generation_config) { auto [input_ids, attention_mask] = m_tokenizer.encode(texts); auto generate_results = generate(input_ids, attention_mask, generation_config, {}); @@ -230,19 +231,19 @@ ov::DecodedResults ov::LLMPipeline::LLMPipelineImpl::generate(std::vector attention_mask, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer) { return m_pimpl->generate(input_ids, attention_mask, generation_config, streamer); } -ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate( +ov::genai::EncodedResults ov::genai::LLMPipeline::LLMPipelineImpl::generate( ov::Tensor input_ids, std::optional attention_mask, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer ) { - ov::EncodedResults result; + ov::genai::EncodedResults result; GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; std::shared_ptr streamer_ptr; @@ -258,10 +259,10 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate( OPENVINO_THROW("Currently streaming is possible only with batch size=1 and greedy decoding"); } - auto attention_mask_data = attention_mask.has_value() ? *attention_mask : ov::generate_utils::init_attention_mask(input_ids); + auto attention_mask_data = attention_mask.has_value() ? 
*attention_mask : ov::genai::utils::init_attention_mask(input_ids); if (config.is_greedy_decoding()) { - result = ov::greedy_decoding(m_model_runner, input_ids, attention_mask_data, config, streamer_ptr, is_chat_conversation); + result = ov::genai::greedy_decoding(m_model_runner, input_ids, attention_mask_data, config, streamer_ptr, is_chat_conversation); } else if (config.is_beam_search()) { result = beam_search(m_model_runner, input_ids, attention_mask_data, config); } else { @@ -275,11 +276,11 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate( return result; } -std::string ov::LLMPipeline::generate(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer) { +std::string ov::genai::LLMPipeline::generate(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer) { return m_pimpl->generate(text, generation_config, streamer); } -std::string ov::LLMPipeline::generate(std::string text, const ov::AnyMap& config_map) { +std::string ov::genai::LLMPipeline::generate(std::string text, const ov::AnyMap& config_map) { OptionalStreamerVariant streamer; auto config = GenerationConfig::anymap_to_generation_config(config_map); if (config_map.count("streamer")) { @@ -289,7 +290,7 @@ std::string ov::LLMPipeline::generate(std::string text, const ov::AnyMap& config return m_pimpl->generate(text, config, streamer); } -ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids, const ov::AnyMap& config_map) { +ov::genai::EncodedResults ov::genai::LLMPipeline::generate(ov::Tensor input_ids, const ov::AnyMap& config_map) { OptionalStreamerVariant streamer; auto config = GenerationConfig::anymap_to_generation_config(config_map); if (config_map.count("streamer")) { @@ -300,15 +301,15 @@ ov::EncodedResults ov::LLMPipeline::generate(ov::Tensor input_ids, const ov::Any return m_pimpl->generate(input_ids, attention_mask, config, streamer); } -ov::Tokenizer ov::LLMPipeline::get_tokenizer() { +ov::genai::Tokenizer ov::genai::LLMPipeline::get_tokenizer() { return m_pimpl->m_tokenizer; } -std::string ov::LLMPipeline::apply_chat_template(std::string prompt, std::string role) const { +std::string ov::genai::LLMPipeline::apply_chat_template(std::string prompt, std::string role) const { return m_pimpl->apply_chat_template(prompt, role); } -std::string ov::LLMPipeline::LLMPipelineImpl::apply_chat_template(std::string prompt, std::string role) const { +std::string ov::genai::LLMPipeline::LLMPipelineImpl::apply_chat_template(std::string prompt, std::string role) const { jinja2::TemplateEnv env; env.GetSettings().lstripBlocks = true; env.GetSettings().trimBlocks = true; @@ -326,21 +327,21 @@ std::string ov::LLMPipeline::LLMPipelineImpl::apply_chat_template(std::string pr return tpl.RenderAsString(params).value(); } -void ov::LLMPipeline::start_chat() { +void ov::genai::LLMPipeline::start_chat() { m_pimpl->is_chat_conversation = true; } -void ov::LLMPipeline::finish_chat() { +void ov::genai::LLMPipeline::finish_chat() { m_pimpl->is_chat_conversation = false; reset_state(); } -void ov::LLMPipeline::reset_state() { +void ov::genai::LLMPipeline::reset_state() { m_pimpl->m_model_runner.reset_state(); } -void ov::LLMPipeline::set_generation_config(const GenerationConfig& generation_config) { +void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& generation_config) { m_pimpl->m_generation_config = generation_config; } -ov::LLMPipeline::~LLMPipeline() = default; +ov::genai::LLMPipeline::~LLMPipeline() = default; diff 
--git a/src/cpp/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp index f9b3ad8ccd..bb2bec09d9 100644 --- a/src/cpp/src/text_callback_streamer.cpp +++ b/src/cpp/src/text_callback_streamer.cpp @@ -1,6 +1,7 @@ #include "text_callback_streamer.hpp" namespace ov { +namespace genai { TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function callback, bool print_eos_token) { m_tokenizer = tokenizer; @@ -70,4 +71,5 @@ void TextCallbackStreamer::on_finalized_text(const std::string& subword) { } } -} // namespace ov +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text_callback_streamer.hpp b/src/cpp/src/text_callback_streamer.hpp index d9c1ba3ee5..3834dd01ba 100644 --- a/src/cpp/src/text_callback_streamer.hpp +++ b/src/cpp/src/text_callback_streamer.hpp @@ -6,6 +6,7 @@ #include "openvino/genai/tokenizer.hpp" namespace ov { +namespace genai { class TextCallbackStreamer: public StreamerBase { public: @@ -32,4 +33,5 @@ class TextCallbackStreamer: public StreamerBase { void on_finalized_text(const std::string& subword); }; -} // namespace ov +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 321597b5b7..2cecdad22a 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -42,6 +42,7 @@ std::pair pad_left(ov::Tensor&& input_ids, ov::Tensor&& } namespace ov { +namespace genai { class Tokenizer::TokenizerImpl { public: @@ -55,7 +56,7 @@ class Tokenizer::TokenizerImpl { TokenizerImpl(std::string tokenizers_path, const std::string device, const std::string& ov_tokenizers_path) { ov::Core core; - if (ov::generate_utils::is_xml(tokenizers_path)) + if (ov::genai::utils::is_xml(tokenizers_path)) OPENVINO_THROW("tokenizers_path should be a path to a dir not a xml file"); if (ov_tokenizers_path.empty()) { @@ -201,4 +202,5 @@ void Tokenizer::set_eos_token_id(int64_t eos_token_id) { Tokenizer::~Tokenizer() = default; -} // namespace ov +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index dbd18cf3f3..477a6efd54 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -4,7 +4,8 @@ #include "utils.hpp" namespace ov { -namespace generate_utils { +namespace genai { +namespace utils { Tensor init_attention_mask(Tensor& position_ids) { auto shape = position_ids.get_shape(); @@ -135,5 +136,6 @@ ov::Tensor extend_attention(ov::Tensor attention_mask) { return new_atten_mask; } -} // namespace generate_utils +} // namespace utils +} // namespace genai } // namespace ov diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index d7998a9594..4559a8962f 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -7,7 +7,8 @@ #include namespace ov { -namespace generate_utils { +namespace genai { +namespace utils { Tensor init_attention_mask(Tensor& position_ids); @@ -58,6 +59,7 @@ void read_anymap_param(const ov::AnyMap& config_map, const std::string& name, T& } } -} // namespace generate_utils +} // namespace utils +} // namespace genai } // namespace ov diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 2aee67593c..0a5cf98d02 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -7,9 +7,15 @@ #include "openvino/genai/llm_pipeline.hpp" namespace py = pybind11; -using namespace ov; - -void str_to_stop_criteria(ov::GenerationConfig& config, const std::string& stop_criteria_str){ +using ov::genai::LLMPipeline; +using 
ov::genai::Tokenizer; +using ov::genai::GenerationConfig; +using ov::genai::EncodedResults; +using ov::genai::DecodedResults; +using ov::genai::StopCriteria; +using ov::genai::StreamerBase; + +void str_to_stop_criteria(GenerationConfig& config, const std::string& stop_criteria_str){ if (stop_criteria_str == "early") config.stop_criteria = StopCriteria::early; else if (stop_criteria_str == "never") config.stop_criteria = StopCriteria::never; else if (stop_criteria_str == "heuristic") config.stop_criteria = StopCriteria::heuristic; @@ -17,16 +23,16 @@ void str_to_stop_criteria(ov::GenerationConfig& config, const std::string& stop_ "Allowed values are: \"early\", \"never\", \"heuristic\". "); } -std::string stop_criteria_to_str(const ov::GenerationConfig& config) { +std::string stop_criteria_to_str(const GenerationConfig& config) { switch (config.stop_criteria) { - case ov::StopCriteria::early: return "early"; - case ov::StopCriteria::heuristic: return "heuristic"; - case ov::StopCriteria::never: return "never"; + case StopCriteria::early: return "early"; + case StopCriteria::heuristic: return "heuristic"; + case StopCriteria::never: return "never"; default: throw std::runtime_error("Incorrect stop_criteria"); } } -void update_config_from_kwargs(ov::GenerationConfig& config, const py::kwargs& kwargs) { +void update_config_from_kwargs(GenerationConfig& config, const py::kwargs& kwargs) { if (kwargs.contains("max_new_tokens")) config.max_new_tokens = kwargs["max_new_tokens"].cast(); if (kwargs.contains("max_length")) config.max_length = kwargs["max_length"].cast(); if (kwargs.contains("ignore_eos")) config.ignore_eos = kwargs["ignore_eos"].cast(); @@ -50,14 +56,14 @@ void update_config_from_kwargs(ov::GenerationConfig& config, const py::kwargs& k } // operator() and generate methods are identical, operator() is just an alias for generate -std::string call_with_kwargs(ov::LLMPipeline& pipeline, const std::string& text, const py::kwargs& kwargs) { +std::string call_with_kwargs(LLMPipeline& pipeline, const std::string& text, const py::kwargs& kwargs) { // Create a new GenerationConfig instance and initialize from kwargs - ov::GenerationConfig config = pipeline.get_generation_config(); + GenerationConfig config = pipeline.get_generation_config(); update_config_from_kwargs(config, kwargs); return pipeline(text, config); } -std::string call_with_config(ov::LLMPipeline& pipe, const std::string& text, const ov::GenerationConfig& config) { +std::string call_with_config(LLMPipeline& pipe, const std::string& text, const GenerationConfig& config) { std::shared_ptr streamer; return pipe(text, config); } @@ -72,15 +78,15 @@ PYBIND11_MODULE(py_generate_pipeline, m) { m.doc() = "Pybind11 binding for LLM Pipeline"; py::class_(m, "LLMPipeline") - .def(py::init(), + .def(py::init(), py::arg("model_path"), py::arg("tokenizer"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}, py::arg("ov_tokenizers_path") = ov_tokenizers_module_path()) .def(py::init(), py::arg("path"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}, py::arg("ov_tokenizers_path") = ov_tokenizers_module_path()) - .def("__call__", py::overload_cast(&call_with_kwargs)) - .def("__call__", py::overload_cast(&call_with_config)) - .def("generate", py::overload_cast(&call_with_kwargs)) - .def("generate", py::overload_cast(&call_with_config)) + .def("__call__", py::overload_cast(&call_with_kwargs)) + .def("__call__", py::overload_cast(&call_with_config)) + .def("generate", py::overload_cast(&call_with_kwargs)) + 
.def("generate", py::overload_cast(&call_with_config)) // todo: if input_ids is a ov::Tensor/numpy tensor // todo: implement calling generate/operator() with StreamerBase or lambda streamer @@ -92,15 +98,15 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def("get_tokenizer", &LLMPipeline::get_tokenizer) - .def("start_chat", &ov::LLMPipeline::start_chat) - .def("finish_chat", &ov::LLMPipeline::finish_chat) - .def("reset_state", &ov::LLMPipeline::reset_state) - .def("get_generation_config", &ov::LLMPipeline::get_generation_config, py::return_value_policy::copy) - .def("set_generation_config", &ov::LLMPipeline::set_generation_config) + .def("start_chat", &LLMPipeline::start_chat) + .def("finish_chat", &LLMPipeline::finish_chat) + .def("reset_state", &LLMPipeline::reset_state) + .def("get_generation_config", &LLMPipeline::get_generation_config, py::return_value_policy::copy) + .def("set_generation_config", &LLMPipeline::set_generation_config) .def("apply_chat_template", &LLMPipeline::apply_chat_template); // Binding for Tokenizer - py::class_(m, "Tokenizer") + py::class_(m, "Tokenizer") .def(py::init<>()) .def(py::init(), py::arg("tokenizers_path"), @@ -108,46 +114,46 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::arg("ov_tokenizers_path") = py::str(ov_tokenizers_module_path())) // todo: implement encode/decode when for numpy inputs and outputs - .def("encode", py::overload_cast(&ov::Tokenizer::encode), "Encode a single prompt") + .def("encode", py::overload_cast(&Tokenizer::encode), "Encode a single prompt") // TODO: common.h(1106...) template argument deduction/substitution failed: - // .def("encode", py::overload_cast&>(&ov::Tokenizer::encode), "Encode multiple prompts") - .def("decode", py::overload_cast>(&ov::Tokenizer::decode), "Decode a list of tokens") - .def("decode", py::overload_cast(&ov::Tokenizer::decode), "Decode a tensor of tokens") - .def("decode", py::overload_cast>>(&ov::Tokenizer::decode), "Decode multiple lines of tokens"); + // .def("encode", py::overload_cast&>(&Tokenizer::encode), "Encode multiple prompts") + .def("decode", py::overload_cast>(&Tokenizer::decode), "Decode a list of tokens") + .def("decode", py::overload_cast(&Tokenizer::decode), "Decode a tensor of tokens") + .def("decode", py::overload_cast>>(&Tokenizer::decode), "Decode multiple lines of tokens"); // Binding for GenerationConfig - py::class_(m, "GenerationConfig") + py::class_(m, "GenerationConfig") .def(py::init<>()) .def(py::init()) - .def_readwrite("max_new_tokens", &ov::GenerationConfig::max_new_tokens) - .def_readwrite("max_length", &ov::GenerationConfig::max_length) - .def_readwrite("ignore_eos", &ov::GenerationConfig::ignore_eos) - .def_readwrite("num_beam_groups", &ov::GenerationConfig::num_beam_groups) - .def_readwrite("num_beams", &ov::GenerationConfig::num_beams) - .def_readwrite("diversity_penalty", &ov::GenerationConfig::diversity_penalty) - .def_readwrite("length_penalty", &ov::GenerationConfig::length_penalty) - .def_readwrite("num_return_sequences", &ov::GenerationConfig::num_return_sequences) - .def_readwrite("no_repeat_ngram_size", &ov::GenerationConfig::no_repeat_ngram_size) + .def_readwrite("max_new_tokens", &GenerationConfig::max_new_tokens) + .def_readwrite("max_length", &GenerationConfig::max_length) + .def_readwrite("ignore_eos", &GenerationConfig::ignore_eos) + .def_readwrite("num_beam_groups", &GenerationConfig::num_beam_groups) + .def_readwrite("num_beams", &GenerationConfig::num_beams) + .def_readwrite("diversity_penalty", &GenerationConfig::diversity_penalty) + 
.def_readwrite("length_penalty", &GenerationConfig::length_penalty) + .def_readwrite("num_return_sequences", &GenerationConfig::num_return_sequences) + .def_readwrite("no_repeat_ngram_size", &GenerationConfig::no_repeat_ngram_size) .def_property("stop_criteria", &stop_criteria_to_str, &str_to_stop_criteria) - .def_readwrite("temperature", &ov::GenerationConfig::temperature) - .def_readwrite("top_p", &ov::GenerationConfig::top_p) - .def_readwrite("top_k", &ov::GenerationConfig::top_k) - .def_readwrite("do_sample", &ov::GenerationConfig::do_sample) - .def_readwrite("repetition_penalty", &ov::GenerationConfig::repetition_penalty) - .def_readwrite("pad_token_id", &ov::GenerationConfig::pad_token_id) - .def_readwrite("bos_token_id", &ov::GenerationConfig::bos_token_id) - .def_readwrite("eos_token_id", &ov::GenerationConfig::eos_token_id) - .def_readwrite("eos_token", &ov::GenerationConfig::eos_token) - .def_readwrite("bos_token", &ov::GenerationConfig::bos_token); - - py::class_(m, "DecodedResults") + .def_readwrite("temperature", &GenerationConfig::temperature) + .def_readwrite("top_p", &GenerationConfig::top_p) + .def_readwrite("top_k", &GenerationConfig::top_k) + .def_readwrite("do_sample", &GenerationConfig::do_sample) + .def_readwrite("repetition_penalty", &GenerationConfig::repetition_penalty) + .def_readwrite("pad_token_id", &GenerationConfig::pad_token_id) + .def_readwrite("bos_token_id", &GenerationConfig::bos_token_id) + .def_readwrite("eos_token_id", &GenerationConfig::eos_token_id) + .def_readwrite("eos_token", &GenerationConfig::eos_token) + .def_readwrite("bos_token", &GenerationConfig::bos_token); + + py::class_(m, "DecodedResults") .def(py::init<>()) - .def_readwrite("texts", &ov::DecodedResults::texts) - .def_readwrite("scores", &ov::DecodedResults::scores); + .def_readwrite("texts", &DecodedResults::texts) + .def_readwrite("scores", &DecodedResults::scores); - py::class_(m, "EncodedResults") + py::class_(m, "EncodedResults") .def(py::init<>()) - .def_readwrite("tokens", &ov::EncodedResults::tokens) - .def_readwrite("scores", &ov::EncodedResults::scores); + .def_readwrite("tokens", &EncodedResults::tokens) + .def_readwrite("scores", &EncodedResults::scores); } diff --git a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp index 1afc5f93ed..474537de17 100644 --- a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp +++ b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp @@ -16,8 +16,8 @@ int main(int argc, char* argv[]) try { std::string model_path = argv[1]; std::string device = "CPU"; // GPU can be used as well - ov::LLMPipeline pipe(model_path, device); - ov::GenerationConfig config = pipe.get_generation_config(); + ov::genai::LLMPipeline pipe(model_path, device); + ov::genai::GenerationConfig config = pipe.get_generation_config(); config.max_new_tokens = 20; config.num_beam_groups = 3; config.num_beams = 15; diff --git a/text_generation/causal_lm/cpp/chat_sample.cpp b/text_generation/causal_lm/cpp/chat_sample.cpp index b1ecb5f5f4..3e215e5208 100644 --- a/text_generation/causal_lm/cpp/chat_sample.cpp +++ b/text_generation/causal_lm/cpp/chat_sample.cpp @@ -20,9 +20,9 @@ int main(int argc, char* argv[]) try { std::string accumulated_str = ""; std::string model_path = argv[1]; - ov::LLMPipeline pipe(model_path, "CPU"); + ov::genai::LLMPipeline pipe(model_path, "CPU"); - ov::GenerationConfig config = pipe.get_generation_config(); + ov::genai::GenerationConfig config = pipe.get_generation_config(); 
config.max_new_tokens = 10000; auto streamer = [](std::string word) { std::cout << word << std::flush; }; @@ -35,7 +35,7 @@ int main(int argc, char* argv[]) try { cout << prompt << endl; // auto answer_str = pipe(prompt, config, streamer); - auto answer_str = pipe.generate(prompt, ov::max_new_tokens(10000), ov::streamer_lambda(streamer)); + auto answer_str = pipe.generate(prompt, ov::genai::max_new_tokens(10000), ov::genai::streamer(streamer)); accumulated_str += answer_str; cout << "\n----------\n"; diff --git a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp b/text_generation/causal_lm/cpp/greedy_causal_lm.cpp index e410d170ca..0fea9b36d3 100644 --- a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp +++ b/text_generation/causal_lm/cpp/greedy_causal_lm.cpp @@ -14,8 +14,8 @@ int main(int argc, char* argv[]) try { std::string device = "CPU"; if (argc > 3) device = argv[3]; - ov::LLMPipeline pipe(model_path, device); - ov::GenerationConfig config = pipe.get_generation_config(); + ov::genai::LLMPipeline pipe(model_path, device); + ov::genai::GenerationConfig config = pipe.get_generation_config(); config.max_new_tokens = 100; config.do_sample = false; auto streamer = [](std::string subword){std::cout << subword << std::flush;}; From bbc8c25502436afd1216748d0d3776f7a37a776a Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 24 May 2024 12:23:24 +0200 Subject: [PATCH 5/5] removed ov_tokenizers_path when ov::genai::Tokenizer is passed to LLMPipeline --- src/README.md | 13 +++++++------ .../include/openvino/genai/generation_config.hpp | 2 +- src/cpp/include/openvino/genai/llm_pipeline.hpp | 3 +-- src/cpp/src/llm_pipeline.cpp | 11 ++++------- src/python/py_generate_pipeline.cpp | 4 ++-- 5 files changed, 15 insertions(+), 18 deletions(-) diff --git a/src/README.md b/src/README.md index 250bf4105b..06a649a752 100644 --- a/src/README.md +++ b/src/README.md @@ -24,8 +24,8 @@ Calling generate with custom generation config parameters, e.g. 
config for group import openvino_genai as ov_genai pipe = ov_genai.LLMPipeline(model_path, "CPU") -res = pipe.generate("The Sun is yellow bacause", max_new_tokens=30, num_groups=3, group_size=5) -print(res) +result = pipe.generate("The Sun is yellow bacause", max_new_tokens=30, num_groups=3, group_size=5, diversity_penalty=1.5) +print(result) ``` output: @@ -38,7 +38,7 @@ A simple chat in Python: import openvino_genai as ov_genai pipe = ov_genai.LLMPipeline(model_path) -config = {'num_groups': 3, 'group_size': 5, 'diversity_penalty': 1.1} +config = {'num_groups': 3, 'group_size': 5, 'diversity_penalty': 1.5} pipe.set_generation_config(config) pipe.start_chat() @@ -49,7 +49,6 @@ while True:         break     print(pipe(prompt)) pipe.finish_chat() - ``` Test to compare with Huggingface outputs @@ -89,6 +88,9 @@ int main(int argc, char* argv[]) { A simple chat in C++ using grouped beam search decoding ``` cpp +#include "openvino/genai/llm_pipeline.hpp" +#include + int main(int argc, char* argv[]) { std::string prompt; @@ -105,7 +107,7 @@ int main(int argc, char* argv[]) { for (;;) { std::cout << "question:\n"; std::getline(std::cin, prompt); - if (prompts == "Stop!") + if (prompt == "Stop!") break; std::cout << "answer:\n"; @@ -118,7 +120,6 @@ int main(int argc, char* argv[]) { Streaming example with lambda function ``` cpp - #include "openvino/genai/llm_pipeline.hpp" #include diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 9a922549a1..4c43f880d9 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -42,7 +42,7 @@ enum class StopCriteria { early, heuristic, never }; * @param num_beams number of beams for beam search. 1 disables beam search. * @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. * @param diversity_penalty this value is subtracted from a beam's score if it generates the same token as any beam from other group at a - * particular time. + * particular time. See https://arxiv.org/pdf/1909.05858. * @param length_penalty exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to * the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log * likelihood of the sequence (i.e. 
negative), `length_penalty` > 0.0 promotes longer sequences, while diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 911a5a237a..7501058ca9 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -88,8 +88,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { const std::string& model_path, const ov::genai::Tokenizer& tokenizer, const std::string& device="CPU", - const ov::AnyMap& plugin_config = {}, - const std::string& ov_tokenizers_path="" + const ov::AnyMap& plugin_config = {} ); ~LLMPipeline(); diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 6f8dc675a0..4a3683bbd7 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -85,8 +85,7 @@ class LLMPipeline::LLMPipelineImpl { const std::string& model_path, const ov::genai::Tokenizer& tokenizer, const std::string& device, - const ov::AnyMap& plugin_config, - const std::string& ov_tokenizers_path="" + const ov::AnyMap& plugin_config ); LLMPipelineImpl( @@ -115,18 +114,16 @@ ov::genai::LLMPipeline::LLMPipeline( const std::string& model_path, const ov::genai::Tokenizer& tokenizer, const std::string& device, - const ov::AnyMap& plugin_config, - const std::string& ov_tokenizers_path + const ov::AnyMap& plugin_config ) { - m_pimpl = make_unique(model_path, tokenizer, device, plugin_config, ov_tokenizers_path); + m_pimpl = make_unique(model_path, tokenizer, device, plugin_config); } ov::genai::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( const std::string& model_path, const ov::genai::Tokenizer& tokenizer, const std::string& device, - const ov::AnyMap& plugin_config, - const std::string& ov_tokenizers_path + const ov::AnyMap& plugin_config ): m_tokenizer(tokenizer) { ov::Core core; diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 0a5cf98d02..fa944bb4eb 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -78,9 +78,9 @@ PYBIND11_MODULE(py_generate_pipeline, m) { m.doc() = "Pybind11 binding for LLM Pipeline"; py::class_(m, "LLMPipeline") - .def(py::init(), + .def(py::init(), py::arg("model_path"), py::arg("tokenizer"), py::arg("device") = "CPU", - py::arg("plugin_config") = ov::AnyMap{}, py::arg("ov_tokenizers_path") = ov_tokenizers_module_path()) + py::arg("plugin_config") = ov::AnyMap{}) .def(py::init(), py::arg("path"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}, py::arg("ov_tokenizers_path") = ov_tokenizers_module_path()) .def("__call__", py::overload_cast(&call_with_kwargs))