From 2c2a34a3ff821446c7b258b01429300a9722bf95 Mon Sep 17 00:00:00 2001
From: Zlobin Vladimir
Date: Tue, 4 Jun 2024 11:51:34 +0400
Subject: [PATCH] Cache a model, rename genai target, fix Windows (#14)

* Fix noise images generated for '--num' > 1 in Stable Diffusion sample (#441)

  Fixes #405

* update optimum intel commit in llm bench (#444)
* Fix an attempt to add a string value to a numerical value (#447)
* output no hook data warning when it is text gen model (#449)
* Fix md5 hash for env that does not support usedforsecurity arg (#445)

  I got an error running benchmarking on my working machine (Python 3.8, Ubuntu 20) because its hashlib build does not support the `usedforsecurity` argument:

  ```
  [ ERROR ] An exception occurred
  [ INFO ] Traceback (most recent call last):
    File "benchmark.py", line 532, in main
      iter_data_list, pretrain_time = CASE_TO_BENCH[model_args['use_case']](model_path, framework, args.device, model_args, args.num_iters)
    File "benchmark.py", line 194, in run_text_generation_benchmark
      run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, warmup_md5, prompt_idx, bench_hook, model_precision, proc_id)
    File "benchmark.py", line 131, in run_text_generation
      result_md5_list.append(hashlib.md5(result_text.encode(), usedforsecurity=False).hexdigest())
  TypeError: openssl_md5() takes at most 1 argument (2 given)
  ```

  Based on this [StackOverflow issue](https://stackoverflow.com/questions/54717862/how-do-i-know-if-the-usedforsecurity-flag-is-supported-by-hashlib-md5), not all environments support this argument, and `hashlib.new("md5")` works whether or not `hashlib.md5` accepts it, so it is safe to use in both cases (see the sketch after this group of commits).

* fix path based configuration (#456)
* Revert "Force to generate "inference count" tokens" (#455)

  Reverts openvinotoolkit/openvino.genai#289 to unblock the release, since it causes a performance regression for some models (the root cause is still under investigation).

* enable
* libtbb-dev
* move
* slash
* install
* core_genai_dev
* remove export
* reorganise components
* add SOVERSION, and requirements-build.txt
* replace SKBUILD with EXCLUDE_FROM_ALL because the effect is the same
* fix NAMELINK_COMPONENT
* remove extra line
* add soft restrictions
* Fix build to unblock packaging
* verify beam search 1st token optimization (#426)

  The minimum transformers version needed to get 1st- and 2nd-token latency is the v4.40 release.
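  The `usedforsecurity` workaround described in the md5 fix above can be expressed as a small compatibility helper. The sketch below is illustrative only (the helper name is not part of this patch); it assumes only the standard-library `hashlib` behaviour quoted in the traceback.

  ```python
  import hashlib

  def md5_hexdigest(text: str) -> str:
      """Illustrative helper: compute an MD5 hex digest whether or not the
      interpreter's hashlib accepts the `usedforsecurity` keyword."""
      data = text.encode()
      try:
          # Python 3.9+ (and some patched builds) accept the flag and skip FIPS checks.
          return hashlib.md5(data, usedforsecurity=False).hexdigest()
      except TypeError:
          # Older OpenSSL-backed builds raise the TypeError quoted above;
          # hashlib.new("md5") works in both environments.
          return hashlib.new("md5", data).hexdigest()
  ```

  With such a helper, a caller like benchmark.py could compute `md5_hexdigest(result_text)` without caring which hashlib build is installed.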
* Output median min and avg values to csv (#450) Co-authored-by: Chen Peter
* improve naming
* install samples
* remove quotes
* use main target name because an alias can't be specified in cmake --target
* define CMAKE_BUILD_PARALLEL_LEVEL
* Ensure ./requirements-build.txt won't outdate
* Use ./requirements-build.txt in python lib build
* Add missing &&
* Test Debug
* add matrix for windows_genai_package
* openvino_tokenizers from fork
* update openvino_tokenizers
* update openvino_tokenizers
* update openvino_tokenizers
* revert openvino_tokenizers
* tokenizers from fork
* update tokenizers
* centos7_2024.2.0.dev
* copy target
* revert tokenizers
* reapply useful changes
* copy so only
* Update tokenizers, centos7_2024.2.0.dev
* single thread
* ubuntu22
* nightly
* --pre --extra-index-url
* update tokenizers
* space
* move --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
* release tokenizers
* merge
* downgrade tokenizers
* downgrade
* two steps
* downgrade tokenizers
* don't setupvars
* source
* fix
* submodule
* releases/2024/2 tokenizers
* fix-2
* rebase
* use make
* comment
* CMAKE_GENERATOR=Unix Makefiles
* update openvino
* space
* optimum-cli from fork
* different commit
* from branch
* remove extra-index for SD
* reorder pip install
* revert unwanted changes
* Ubuntu-22
* openvino_tokenizers~=2024.2.0.0
* remove -pre . --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
* upgrade to prerelease
* revert requirements.txt
* remove --pre, setupvars
* get openvino_tokenizers._ext_path
* take release pybind, fix soversion, and tokenizers folder
* spelling
* don't copy libs
* put ov_tokenizers_path back
* GENAI_BUILD_DIR=../../build
* Add extension near to genai library
* include openvino/util/file_util.hpp
* get_absolute_file_path
* remove namespace
* # include
* more than one .
* till next dot
* _ext_path
* -1
* +1
* +1
* path
* ext name
* with_openvino_tokenizers
* char
* revert test
* tokenizers from fork
* update fork
* lib
* fix cherry-pick
* update fork
* don't spoil source dir
* Generator expressions to disable appending a per-configuration subdirectory
* remove versions
* fix path
* try
* try
* verbose
* spelling
* rename file
* remove build.tool-args
* Release
* don't specify targets
* revert 81ec069
* Update tests
* No rule to make target package
* skip step
* test tokenizers are loaded
* CPU
* don't test Debug
* retrigger
* minor
* 16-cores
* retrigger
* retrigger
* retrigger
* -x
* str
* less verbose
* less verbose
* less
* less
* more
* no cache
* conflicts
* cache
* export
* cached save
* rename
* rename
* 16-cores
* no larg
* save memory
* retrigger
* predownload
* comment
* export name
* exports
* suppress
* revert
* test_operator_wit_callback_batch_fail
* run test_beam_search_decoding only
* test_decoding
* remove Phi
* all
* add return bool to streamer to stop generation
* add return bool to streamer to stop generation
* add return bool to streamer to stop generation
* add return bool to streamer to stop generation
* don't test StopCriteria.EARLY because it fails
* update
* remove sudo apt-get install libtbb-dev
* submodule from fork
* update submodule
* update submodule
* update submodule
* update submodule
* set upstream submodule, add copyright headers, shorten commands
* space
* dir link
* retrigger
* update
* skip
* test
* put optimum-intel[openvino] back
* flake8
* flake8
* optimum[openvino]==1.20.0
* update tests/python_tests/requirements.txt

---------

Co-authored-by: Yaroslav Tarkan
Co-authored-by: Ekaterina Aidova
Co-authored-by: guozhong wang
Co-authored-by: Chen Peter
Co-authored-by: Pavel Esir
---
 .github/dependabot.yml                        |   4 +
 .github/workflows/causal_lm_cpp.yml           |  10 -
 .github/workflows/genai_package.yml           |  23 +-
 .github/workflows/genai_python_lib.yml        |  58 ++-
 .github/workflows/llm_bench-python.yml        |   4 +-
 .gitignore                                    |   2 -
 CMakeLists.txt                                |  41 +-
 .../lcm_dreamshaper_v7/cpp/requirements.txt   |   2 +-
 .../stable_diffusion_1_5/cpp/requirements.txt |   2 +-
 .../stable_diffusion_1_5/cpp/src/main.cpp     |   8 +-
 llm_bench/python/benchmark.py                 |  64 ++-
 llm_bench/python/requirements.txt             |   6 +-
 llm_bench/python/utils/hook_beam_search.py    |  55 ++-
 .../python/utils/hook_beam_search_old.py      | 374 ------------------
 llm_bench/python/utils/hook_common.py         | 165 +-------
 llm_bench/python/utils/hook_greedy_search.py  |  54 +--
 .../python/utils/hook_greedy_search_old.py    | 302 --------------
 llm_bench/python/utils/metrics_print.py       |  38 +-
 llm_bench/python/utils/output_csv.py          | 176 ++++-----
 llm_bench/python/utils/ov_utils.py            |  25 +-
 llm_bench/python/utils/pt_utils.py            |  12 +-
 pyproject.toml                                |  13 +-
 requirements-build.txt                        |   1 +
 src/cpp/CMakeLists.txt                        |   4 +-
 src/cpp/OpenVINOGenAIConfig.cmake.in          |   4 +-
 src/cpp/include/openvino/genai/tokenizer.hpp  |  24 ++
 src/cpp/include/openvino/genai/visibility.hpp |   6 +-
 src/cpp/src/tokenizer.cpp                     |  33 +-
 src/cpp/src/utils.cpp                         |  31 --
 src/cpp/src/utils.hpp                         |  14 -
 src/python/CMakeLists.txt                     |   5 +-
 src/python/openvino_genai/__version__.py      |   3 +
 src/python/py_generate_pipeline.cpp           |  85 +---
 tests/python_tests/list_test_models.py        |  12 +-
 tests/python_tests/requirements.txt           |   5 +-
 tests/python_tests/test_generate_api.py       | 152 ++++---
 text_generation/causal_lm/cpp/README.md       |   9 -
 .../causal_lm/cpp/requirements.txt            |   3 +-
 thirdparty/CMakeLists.txt                     |  34 ++
 39 files changed, 503 insertions(+), 1360 deletions(-)
 delete mode 100644
llm_bench/python/utils/hook_beam_search_old.py delete mode 100644 llm_bench/python/utils/hook_greedy_search_old.py create mode 100644 thirdparty/CMakeLists.txt diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 789167949f..85614b7032 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -12,6 +12,10 @@ updates: directory: "image_generation/lcm_dreamshaper_v7/cpp/scripts/" schedule: interval: "weekly" + - package-ecosystem: "pip" + directory: "./tests/python_tests/" + schedule: + interval: "weekly" - package-ecosystem: "pip" directory: "text_generation/causal_lm/cpp/" schedule: diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index b86f49af35..d78a574e7e 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -30,7 +30,6 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -58,7 +57,6 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -228,7 +226,6 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -256,7 +253,6 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -284,7 +280,6 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j 15 @@ -312,7 
+307,6 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -340,7 +334,6 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ @@ -379,7 +372,6 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -424,7 +416,6 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j 15 @@ -469,7 +460,6 @@ jobs: source ./ov/setupvars.sh python -m pip install --upgrade-strategy eager -r ./text_generation/causal_lm/cpp/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - sudo apt-get install libtbb-dev optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j diff --git a/.github/workflows/genai_package.yml b/.github/workflows/genai_package.yml index 384ed5643f..32395e79af 100644 --- a/.github/workflows/genai_package.yml +++ b/.github/workflows/genai_package.yml @@ -9,6 +9,8 @@ jobs: matrix: build-type: [Release, Debug] runs-on: ubuntu-20.04 + env: + CMAKE_BUILD_PARALLEL_LEVEL: null steps: - uses: actions/checkout@v4 with: @@ -19,20 +21,17 @@ jobs: - run: mkdir ./ov/ - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - run: sudo apt-get install libtbb-dev - run: source 
./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j - run: source ./ov/setupvars.sh && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov - run: ov/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build - - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt - if: ${{ 'Release' == matrix.build-type }} + - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ov/samples/cpp/ -B ./samples\ build/ && cmake --build ./samples\ build/ --config ${{ matrix.build-type }} -j && cmake --install ./samples\ build/ --config ${{ matrix.build-type }} --component samples_bin --prefix s\ pace + if: ${{ 'Release' != matrix.build-type }} - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - if: ${{ 'Release' == matrix.build-type }} + - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt - run: source ./ov/setupvars.sh && optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - if: ${{ 'Release' == matrix.build-type }} - run: source ./ov/setupvars.sh && timeout 50s ${{ github.workspace }}/s\ pace/samples_bin/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "" - if: ${{ 'Release' == matrix.build-type }} macos_genai_package: strategy: @@ -48,16 +47,16 @@ jobs: python-version: 3.8 - run: mkdir ./ov/ - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/macos/m_openvino_toolkit_macos_12_6_2024.2.0.dev20240529_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz - - run: brew install coreutils ninja scons + - run: brew install coreutils scons - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j - run: source ./ov/setupvars.sh && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov - run: ov/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build - - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt - if: ${{ 'Release' == matrix.build-type }} - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release if: ${{ 'Release' == matrix.build-type }} + - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt + if: ${{ 'Release' == matrix.build-type }} - run: source ./ov/setupvars.sh && optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 if: ${{ 'Release' == matrix.build-type }} - run: source ./ov/setupvars.sh && timeout 50s ${{ github.workspace }}/s\ pace/samples_bin/greedy_causal_lm 
./TinyLlama-1.1B-Chat-v1.0/ "" @@ -68,6 +67,8 @@ jobs: matrix: build-type: [Release, Debug] runs-on: windows-latest + env: + CMAKE_BUILD_PARALLEL_LEVEL: null defaults: run: shell: cmd @@ -85,10 +86,10 @@ jobs: - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64 - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\samples\cpp\build_samples_msvc.bat -i "${{ github.workspace }}/samples_install" if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build - - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt - if: ${{ 'Release' == matrix.build-type }} - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release if: ${{ 'Release' == matrix.build-type }} + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt + if: ${{ 'Release' == matrix.build-type }} - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 if: ${{ 'Release' == matrix.build-type }} - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && "${{ github.workspace }}/samples_install/samples_bin/greedy_causal_lm" .\TinyLlama-1.1B-Chat-v1.0\ "" diff --git a/.github/workflows/genai_python_lib.yml b/.github/workflows/genai_python_lib.yml index 9f8382b2b3..9e950e63d7 100644 --- a/.github/workflows/genai_python_lib.yml +++ b/.github/workflows/genai_python_lib.yml @@ -5,10 +5,10 @@ concurrency: cancel-in-progress: true jobs: ubuntu_genai_python_lib: - # A tokenizers' dependency fails to compile on ubuntu-20 n CenOS7 env + # A tokenizers' dependency fails to compile on ubuntu-20 n CenOS7 env. runs-on: ubuntu-22.04 env: - # A tokenizers' dependency fails to compile with Ninja in CenOS7 env + # A tokenizers' dependency fails to compile with Ninja in CenOS7 env. CMAKE_GENERATOR: Unix Makefiles CMAKE_BUILD_PARALLEL_LEVEL: null steps: @@ -19,30 +19,25 @@ jobs: with: python-version: 3.8 - run: mkdir ./ov/ - - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_centos7_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz # Install CentOS7 instead of Ubuntu to match PyPI distribution ABI + # Install CentOS7 instead of Ubuntu to match PyPI distribution ABI. + - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_centos7_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j # GitHub Actions already provides what is listed in ./requirements-build.txt but the internal # build system doesn't. 
Install ./requirements-build.txt to detect possible conflicts. - - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --verbose - - run: source ./ov/setupvars.sh && PYTHONPATH=./build/ python -c "from openvino_genai import LLMPipeline" + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt -r ./tests/python_tests/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + # --exitfirst to exit instantly on first error because tests are slow and produce lots of logs slowing down GitHub Actions logs view. + - run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_generate_api.py --exitfirst -m precommit - run: source ./ov/setupvars.sh && python -m pip install . --config-settings=build-dir="build" --verbose - - run: python -c "from openvino_genai import LLMPipeline" - - name: GenAI Python API tests - run: | - cd ./tests/python_tests/ - python -m pip install -r requirements.txt - models=$(python list_test_models.py) - echo "$models" | while read -r model_name model_path; do - optimum-cli export openvino --trust-remote-code --weight-format fp16 --model "$model_name" "$model_path" - done - GENAI_BUILD_DIR=../../build python -m pytest test_generate_api.py -v -m precommit + - run: python -m pytest ./tests/python_tests/test_generate_api.py --exitfirst -m precommit macos_genai_python_lib: runs-on: macos-12 env: + # A tokenizers' dependency fails to compile with Ninja. CMAKE_GENERATOR: Unix Makefiles + CMAKE_BUILD_PARALLEL_LEVEL: null steps: - uses: actions/checkout@v4 with: @@ -52,24 +47,16 @@ jobs: python-version: 3.8 - run: mkdir ./ov/ - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/macos/m_openvino_toolkit_macos_12_6_2024.2.0.dev20240529_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz - - run: brew install coreutils ninja scons + - run: brew install coreutils scons - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j # GitHub Actions already provides what is listed in ./requirements-build.txt but the internal # build system doesn't. Install ./requirements-build.txt to detect possible conflicts. - - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --verbose - - run: source ./ov/setupvars.sh && PYTHONPATH=./build/ python -c "from openvino_genai import LLMPipeline" + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt -r ./tests/python_tests/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --verbose + - run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_generate_api.py --exitfirst -m precommit - run: source ./ov/setupvars.sh && python -m pip install . 
--config-settings=build-dir="build" --verbose - run: python -c "from openvino_genai import LLMPipeline" - - name: GenAI Python API tests - run: | - cd ./tests/python_tests/ - python -m pip install -r requirements.txt - models=$(python list_test_models.py) - echo "$models" | while read -r model_name model_path; do - optimum-cli export openvino --trust-remote-code --weight-format fp16 --model "$model_name" "$model_path" - done - GENAI_BUILD_DIR=../../build python -m pytest test_generate_api.py + - run: python -m pytest ./tests/python_tests/test_generate_api.py --exitfirst -m precommit windows_genai_python_lib: runs-on: windows-latest @@ -87,11 +74,12 @@ jobs: python-version: 3.8 - run: curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/windows/w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64.zip - run: unzip ov.zip - # GitHub Actions already provides what is listed in ./requirements-build.txt but the internal - # build system doesn't. Install ./requirements-build.txt to detect possible conflicts. - - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --verbose - - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && python -m pip install . --verbose - - run: python -c "from openvino_genai import LLMPipeline" - - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - - run: call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && cmake --build ./build/ --config Release -j - - run: set "PYTHONPATH=./build/" && call w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64\setupvars.bat && python -c "from openvino_genai import LLMPipeline" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. + # Shorten the next setupvars calls. + - run: mklink /D ov w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64 + - run: call ./ov/setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + - run: call ./ov/setupvars.bat && cmake --build ./build/ --config Release -j + - run: call ./ov/setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt -r ./tests/python_tests/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. + - run: set "PYTHONPATH=./build/" && call ./ov/setupvars.bat && python -m pytest ./tests/python_tests/test_generate_api.py --exitfirst -m precommit + - run: call ./ov/setupvars.bat && python -m pip install . 
--config-settings=build-dir="build" --verbose + - run: python -m pytest ./tests/python_tests/test_generate_api.py --exitfirst -m precommit diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml index 0731458182..b0ba14430d 100644 --- a/.github/workflows/llm_bench-python.yml +++ b/.github/workflows/llm_bench-python.yml @@ -39,7 +39,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install flake8 pytest black - pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt + GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt pip install openvino-nightly - name: Lint with flake8 @@ -73,7 +73,7 @@ jobs: python-version: 3.8 - name: Test stateful run: | - python -m pip install -r llm_bench/python/requirements.txt + GIT_CLONE_PROTECTION_ACTIVE=false python -m pip install -r llm_bench/python/requirements.txt python -m pip uninstall --yes openvino python -m pip install openvino-nightly python llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir . --stateful diff --git a/.gitignore b/.gitignore index 5c88a00fdc..2e39ce5394 100644 --- a/.gitignore +++ b/.gitignore @@ -39,5 +39,3 @@ CMakeUserPresets.json *.?env* *.pyc __pycache__ - -*.so \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 0148ca6dd1..896cf67a81 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,11 @@ cmake_minimum_required(VERSION 3.15) # Multi config generators such as Visual Studio ignore CMAKE_BUILD_TYPE. Multi config generators are configured with # CMAKE_CONFIGURATION_TYPES, but limiting options in it completely removes such build options get_property(GENERATOR_IS_MULTI_CONFIG_VAR GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) -if(NOT GENERATOR_IS_MULTI_CONFIG_VAR AND NOT DEFINED CMAKE_BUILD_TYPE) +if(CMAKE_GENERATOR STREQUAL "Ninja Multi-Config") + # 'Ninja Multi-Config' specific, see: + # https://cmake.org/cmake/help/latest/variable/CMAKE_DEFAULT_BUILD_TYPE.html + set(CMAKE_DEFAULT_BUILD_TYPE "Release" CACHE STRING "CMake default build type") +elseif(NOT GENERATOR_IS_MULTI_CONFIG_VAR AND NOT DEFINED CMAKE_BUILD_TYPE) message(STATUS "CMAKE_BUILD_TYPE is not defined, 'Release' will be used") # Setting CMAKE_BUILD_TYPE as CACHE must go before project(). Otherwise project() sets its value and set() doesn't take an effect set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel ...") @@ -15,41 +19,14 @@ endif() project(OpenVINOGenAI VERSION 2024.2.0.0) -add_subdirectory(./thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") -# Put binaries to a single dir to mimic package structure. -set_target_properties(openvino_tokenizers PROPERTIES - # Generator expressions to disable appending a per-configuration subdirectory (Release, Debug). - # ARCHIVE_OUTPUT is irrelevant. It's here just to keep all the artifacts in one place. 
- ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" - LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" - RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" -) -if(TARGET core_tokenizers) - set_target_properties(core_tokenizers PROPERTIES - ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" - LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" - RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" - ) -else() - # Prebuilt dependencies - if(WIN32) - set(extra_libs "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/lib/core_tokenizers.dll" - "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/third_party/lib/icudt70.dll" - "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/third_party/lib/icuuc70.dll") - elseif(LINUX) - set(extra_libs "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/lib/libcore_tokenizers.so") - elseif(APPLE) - set(extra_libs "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/lib/libcore_tokenizers.dylib") - endif() - add_custom_command(OUTPUT "${extra_libs}" - COMMAND "${CMAKE_COMMAND}" -E copy "${extra_libs}" "${CMAKE_BINARY_DIR}/openvino_genai/" - DEPENDS openvino_tokenizers) -endif() +add_subdirectory(./thirdparty/) add_subdirectory(src) add_subdirectory(text_generation/causal_lm/cpp) install(DIRECTORY text_generation/causal_lm/cpp/ DESTINATION samples/cpp/causal_lm COMPONENT cpp_samples_genai) install(FILES LICENSE DESTINATION licensing COMPONENT licensing_genai RENAME LICENSE-GENAI) install(FILES third-party-programs.txt DESTINATION licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt) -set(CPACK_GENERATOR "ZIP") +if(MSVC AND NOT DEFINED CPACK_GENERATOR) + set(CPACK_GENERATOR "ZIP") +endif() include(CPack) diff --git a/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt b/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt index 047e0d826f..e86e1c2eb1 100644 --- a/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt +++ b/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt @@ -1,4 +1,4 @@ --extra-index-url https://download.pytorch.org/whl/cpu torch==2.2.2+cpu diffusers==0.27.2 -optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel.git@fb1b35bef23242d65b2fb057c4a7ac78a7cfd4c3 +optimum-intel[openvino]==1.17.0 diff --git a/image_generation/stable_diffusion_1_5/cpp/requirements.txt b/image_generation/stable_diffusion_1_5/cpp/requirements.txt index 29b40d70c4..dd5faeb7de 100644 --- a/image_generation/stable_diffusion_1_5/cpp/requirements.txt +++ b/image_generation/stable_diffusion_1_5/cpp/requirements.txt @@ -2,5 +2,5 @@ torch==2.2.2+cpu diffusers==0.27.2 transformers==4.39.3 -optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel.git@fb1b35bef23242d65b2fb057c4a7ac78a7cfd4c3 +optimum-intel[openvino]==1.17.0 huggingface_hub[cli]==0.22.2 diff --git a/image_generation/stable_diffusion_1_5/cpp/src/main.cpp b/image_generation/stable_diffusion_1_5/cpp/src/main.cpp index 7f9f9afc3b..d5ea333ef0 100644 --- a/image_generation/stable_diffusion_1_5/cpp/src/main.cpp +++ b/image_generation/stable_diffusion_1_5/cpp/src/main.cpp @@ -368,11 +368,11 @@ int32_t main(int32_t argc, char* argv[]) try { ov::Tensor text_embeddings = text_encoder(models, positive_prompt, negative_prompt); - std::shared_ptr scheduler = std::make_shared(); - scheduler->set_timesteps(num_inference_steps); - std::vector timesteps = scheduler->get_timesteps(); - for (uint32_t n = 0; n < num_images; n++) { + std::shared_ptr scheduler = std::make_shared(); + 
scheduler->set_timesteps(num_inference_steps); + std::vector timesteps = scheduler->get_timesteps(); + std::uint32_t seed = num_images == 1 ? user_seed : user_seed + n; const size_t unet_in_channels = static_cast(sample_shape[1].get_length()); diff --git a/llm_bench/python/benchmark.py b/llm_bench/python/benchmark.py index 79537315cd..b95c2395aa 100644 --- a/llm_bench/python/benchmark.py +++ b/llm_bench/python/benchmark.py @@ -34,6 +34,7 @@ DEFAULT_SUPER_RESOLUTION_STEPS = 50 DEFAULT_SUPER_RESOLUTION_WIDTH = 128 DEFAULT_SUPER_RESOLUTION_HEIGHT = 128 +DEFAULT_OUTPUT_TOKEN_SIZE = 512 MAX_OUTPUT_TOKEN_SIZE = 64 * 1024 mem_consumption = MemConsumption() @@ -87,22 +88,22 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, # Remove `token_type_ids` from inputs input_tokens = input_data['input_ids'] if 'input_ids' in input_data else input_data input_token_size = input_tokens[0].numel() + + max_output_token_size = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] + max_output_token_size = MAX_OUTPUT_TOKEN_SIZE if max_output_token_size > MAX_OUTPUT_TOKEN_SIZE else max_output_token_size if args['batch_size'] > 1: out_str = '[warm-up]' if num == 0 else '[{}]'.format(num) out_str += " Batch_size={}, ".format(args['batch_size']) out_str += 'all input token size after padding: {} * {}, '.format(input_token_size, args['batch_size']) - if args['infer_count'] is not None: - out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size']) + out_str += 'all max_output_token_size: {} * {}'.format(max_output_token_size, args['batch_size']) log.info(out_str) max_rss_mem_consumption = '' max_shared_mem_consumption = '' if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: mem_consumption.start_collect_memory_consumption() - min_gen_tokens = 0 if args['infer_count'] is None else args['infer_count'] - max_gen_tokens = MAX_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] start = time.perf_counter() - result = model.generate(**input_data, min_new_tokens=int(min_gen_tokens), max_new_tokens=int(max_gen_tokens), num_beams=args['num_beams'], use_cache=True) + result = model.generate(**input_data, max_new_tokens=int(max_output_token_size), num_beams=args['num_beams'], use_cache=True) end = time.perf_counter() if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: mem_consumption.end_collect_momory_consumption() @@ -123,7 +124,7 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, else: generated_text_len = len(result[bs_idx]) num_tokens += generated_text_len - if generated_text_len > max_gen_tokens: + if generated_text_len > max_output_token_size: log.error('Output token size is over max output token size!') result_text = generated_text[bs_idx] if args["output_dir"] is not None: @@ -132,14 +133,15 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, if num == 0: warmup_md5[prompt_index] = result_md5_list per_token_time = generation_time * 1000 / (num_tokens / args['batch_size']) - tm_list = bench_hook.get_time_list() - log.debug('latency of all tokens:') - [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)] - tm_infer_list = bench_hook.get_time_infer_list() + tm_list = [] + tm_infer_list = [] + if bench_hook is not None: + tm_list = bench_hook.get_time_list() + tm_infer_list = bench_hook.get_time_infer_list() iter_data = gen_iterate_data( num, input_token_size * 
args['batch_size'], - len(tm_infer_list), + max_output_token_size, num_tokens, generation_time, per_token_time, @@ -168,8 +170,9 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) else: utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) - bench_hook.clear_time_list() - bench_hook.clear_time_infer_list() + if bench_hook is not None: + bench_hook.clear_time_list() + bench_hook.clear_time_infer_list() def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data_list, warmup_md5, prompt_index, streamer, model_precision, proc_id): @@ -197,7 +200,6 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data max_shared_mem_consumption = '' if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: mem_consumption.start_collect_memory_consumption() - min_gen_tokens = 0 if args['infer_count'] is None else args['infer_count'] max_gen_tokens = MAX_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] streamer.reset() start = time.perf_counter() @@ -269,12 +271,11 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data streamer.reset() - def run_text_generation_benchmark(model_path, framework, device, args, num_iters): model, tokenizer, pretrain_time, bench_hook, use_genai = FW_UTILS[framework].create_text_gen_model(model_path, device, **args) text_gen_fn = run_text_generation if not use_genai else run_text_generation_genai - - model_precision = utils.model_utils.get_model_precision(model_path.parents._parts) + + model_precision = utils.model_utils.get_model_precision(model_path.parts) iter_data_list = [] warmup_md5 = {} input_text_list = utils.model_utils.get_prompts(args) @@ -304,7 +305,7 @@ def run_text_generation_benchmark(model_path, framework, device, args, num_iters def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, proc_id): - if args.genai: + if args['genai']: log.warning("GenAI pipeline is not supported for this task. Switched on default benchmarking") set_seed(args['seed']) input_text = image_param['prompt'] @@ -397,7 +398,7 @@ def run_image_generation_benchmark(model_path, framework, device, args, num_iter def run_image_classification(model_path, framework, device, args, num_iters=10): - if args.genai: + if args['genai']: log.warning("GenAI pipeline is not supported for this task. Switched on default benchmarking") model, input_size = FW_UTILS[framework].create_image_classification_model(model_path, device, **args) @@ -469,7 +470,7 @@ def run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, im def run_ldm_super_resolution_benchmark(model_path, framework, device, args, num_iters): - if args.genai: + if args['genai']: log.warning("GenAI pipeline is not supported for this task. 
Switched on default benchmarking") pipe, pretrain_time = FW_UTILS[framework].create_ldm_super_resolution_model(model_path, device, **args) iter_data_list = [] @@ -497,6 +498,7 @@ def run_ldm_super_resolution_benchmark(model_path, framework, device, args, num_ # if num_iters == 0, just output warm-up data proc_id = os.getpid() + prompt_idx_list = [image_id for image_id, image_param in enumerate(images)] for num in range(num_iters + 1): image_id = 0 for img in images: @@ -507,7 +509,7 @@ def run_ldm_super_resolution_benchmark(model_path, framework, device, args, num_ run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, image_id, tm_list, proc_id) tm_list.clear() image_id = image_id + 1 - utils.metrics_print.print_average(iter_data_list, [], 0, False) + utils.metrics_print.print_average(iter_data_list, prompt_idx_list, 1, False) return iter_data_list, pretrain_time @@ -519,15 +521,6 @@ def num_iters_type(x): return x -def num_infer_count_type(x): - x = int(x) - if x < 1: - raise argparse.ArgumentTypeError('Minimum input value is 1') - elif x > MAX_OUTPUT_TOKEN_SIZE: - raise argparse.ArgumentTypeError(f'Max input value is {MAX_OUTPUT_TOKEN_SIZE}') - return x - - def get_argprser(): parser = argparse.ArgumentParser('LLM benchmarking tool', add_help=True, formatter_class=argparse.RawTextHelpFormatter) parser.add_argument('-m', '--model', help='model folder including IR files or Pytorch files', required=TabError) @@ -541,8 +534,9 @@ def get_argprser(): '-ic', '--infer_count', default=None, - type=num_infer_count_type, - help='set the output token size, the value must be greater than 0.' + type=int, + help='limit the output token size ' + f'(default {DEFAULT_OUTPUT_TOKEN_SIZE}) of text_gen and code_gen models.', ) parser.add_argument( '-n', @@ -617,7 +611,7 @@ def get_argprser(): def main(): - log.basicConfig(format='[ %(levelname)s ] %(message)s', level=os.environ.get("LOGLEVEL", log.INFO), stream=sys.stdout) + log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout) args = get_argprser() model_path, framework, model_args, model_name = utils.model_utils.analyze_args(args) @@ -640,10 +634,10 @@ def main(): if args.report is not None or args.report_json is not None: model_precision = '' if framework == 'ov': - ir_conversion_frontend = utils.model_utils.get_ir_conversion_frontend(model_name, model_path.parents._parts) + ir_conversion_frontend = utils.model_utils.get_ir_conversion_frontend(model_name, model_path.parts) if ir_conversion_frontend != '': framework = framework + '(' + ir_conversion_frontend + ')' - model_precision = utils.model_utils.get_model_precision(model_path.parents._parts) + model_precision = utils.model_utils.get_model_precision(model_path.parts) if args.report is not None: utils.output_csv.write_result( args.report, diff --git a/llm_bench/python/requirements.txt b/llm_bench/python/requirements.txt index 87224e5d85..fb59aa5d15 100644 --- a/llm_bench/python/requirements.txt +++ b/llm_bench/python/requirements.txt @@ -4,11 +4,11 @@ openvino>=2024.1.0 auto-gptq>=0.5.1 # for gptq pillow torch -transformers>=4.33.0 +transformers>=4.39.0 diffusers>=0.22.0 #optimum is in dependency list of optimum-intel -git+https://github.com/huggingface/optimum-intel.git@e1b6a59c55157d0feb4d53945cbbe191e5c0f243#egg=optimum-intel -git+https://github.com/openvinotoolkit/nncf.git#egg=nncf +git+https://github.com/huggingface/optimum-intel.git@8c2b787cc75a45ae4670d37970a5394eba90eedc#egg=optimum-intel 
+git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf packaging psutil timm diff --git a/llm_bench/python/utils/hook_beam_search.py b/llm_bench/python/utils/hook_beam_search.py index 0463fd9008..99b0a9e5c3 100644 --- a/llm_bench/python/utils/hook_beam_search.py +++ b/llm_bench/python/utils/hook_beam_search.py @@ -5,20 +5,24 @@ import time import torch import warnings -import transformers import logging as log -import utils.hook_common as hook_common from torch import nn -from packaging import version from typing import Optional, Tuple, Union, List from transformers.generation.stopping_criteria import ( + EosTokenCriteria, StoppingCriteriaList, validate_stopping_criteria, ) from transformers.generation.logits_process import LogitsProcessorList from transformers.generation.beam_search import BeamScorer +from transformers.generation.utils import ( + _split_model_inputs, + stack_model_outputs, +) from transformers.utils import ModelOutput -import utils.hook_beam_search_old as hook_old_beam + + +logger = log.getLogger(__name__) class GenerateBeamDecoderOnlyOutput(ModelOutput): @@ -52,8 +56,8 @@ class GenerateBeamEncoderDecoderOutput(ModelOutput): tm_infer_list = [] -# Transformers version: Release/v4.39.2 97c00cdfe132164dbd793447a088432fa359fd36 -# Copied from https://github.com/huggingface/transformers/blob/v4.39-release/src/transformers/generation/utils.py#L2823 +# Transformers version: v4.40-release 4fdf58afb72b0754da30037fc800b6044e7d9c99 +# Copied from https://github.com/huggingface/transformers/blob/4fdf58afb72b0754da30037fc800b6044e7d9c99/src/transformers/generation/utils.py#L2911 # Add the function of collecting latency def new_beam_search( self, @@ -200,7 +204,25 @@ def new_beam_search( if len(stopping_criteria) == 0: warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning) pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if eos_token_id is not None: + logger.warning_once( + "`eos_token_id` is deprecated in this function and will be removed in v4.41, use" + " `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead." + " Otherwise make sure to set `model.generation_config.eos_token_id`", + FutureWarning, + ) + stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) + else: + # TODO remove when the method is totally private and beam scorer refactored + # need to get `eos_token_id` and add stopping criteria, so that generation does not go forever + eos_token_id = [ + criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id") + ] + eos_token_id = eos_token_id[0] if eos_token_id else None + if eos_token_id is None and self.generation_config.eos_token_id is not None: + eos_token_id = self.generation_config.eos_token_id + stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) + if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] output_scores = output_scores if output_scores is not None else self.generation_config.output_scores @@ -275,6 +297,7 @@ def new_beam_search( "transo_xl", "xlnet", "cpm", + "jamba", ] ): raise RuntimeError( @@ -282,7 +305,7 @@ def new_beam_search( f"for `low_memory beam_search`. Please open an issue on GitHub if you need this feature." 
) - inputs_per_sub_batches = hook_common._split_model_inputs( + inputs_per_sub_batches = _split_model_inputs( model_inputs, split_size=batch_size, full_batch_size=batch_beam_size ) outputs_per_sub_batch = [ @@ -295,7 +318,7 @@ def new_beam_search( for inputs_per_sub_batch in inputs_per_sub_batches ] - outputs = hook_common.stack_model_outputs(outputs_per_sub_batch) + outputs = stack_model_outputs(outputs_per_sub_batch) else: # Unchanged original behavior outputs = self( @@ -305,7 +328,6 @@ def new_beam_search( output_hidden_states=output_hidden_states, ) tm_infer_list.append(time.perf_counter() - tic_infer) - if synced_gpus and this_peer_finished: cur_len = cur_len + 1 continue # don't waste resources running the code we don't need @@ -461,15 +483,6 @@ def get_time_infer_list(self): global tm_infer_list return tm_infer_list - def new_forward(self, model, model_type=None): + def new_forward(self, model): """Define a new beam search function.""" - min_version = version.parse(hook_common.TRANS_MIN_VERSION) - trans_version = version.parse(transformers.__version__) - if trans_version < min_version: - log.warning(f'The function of getting latency of beam search will not be available with current transformers version:{trans_version}') - else: - min_second_version = version.parse(hook_common.TRANS_SENCOND_VERSION) - if trans_version >= min_second_version: - model._beam_search = new_beam_search.__get__(model, model.__class__) - else: - model.beam_search = hook_old_beam.old_beam_search.__get__(model, model.__class__) \ No newline at end of file + model._beam_search = new_beam_search.__get__(model, model.__class__) \ No newline at end of file diff --git a/llm_bench/python/utils/hook_beam_search_old.py b/llm_bench/python/utils/hook_beam_search_old.py deleted file mode 100644 index a1b1845f1e..0000000000 --- a/llm_bench/python/utils/hook_beam_search_old.py +++ /dev/null @@ -1,374 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# flake8: noqa -import time -import torch -import warnings -import transformers -import torch.distributed as dist -import logging as log -from torch import nn -from packaging import version -from typing import Optional, Tuple, Union, List -from transformers.generation.stopping_criteria import ( - StoppingCriteriaList, - validate_stopping_criteria, -) -from transformers.generation.logits_process import LogitsProcessorList -from transformers.generation.beam_search import BeamScorer -from transformers.utils import ModelOutput -import utils.hook_beam_search as hook_beam - - -class BeamSearchEncoderDecoderOutput(ModelOutput): - sequences: torch.LongTensor = None - sequences_scores: Optional[torch.FloatTensor] = None - scores: Optional[Tuple[torch.FloatTensor]] = None - beam_indices: Optional[torch.LongTensor] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - - -class BeamSearchDecoderOnlyOutput(ModelOutput): - sequences: torch.LongTensor = None - sequences_scores: Optional[torch.FloatTensor] = None - scores: Optional[Tuple[torch.FloatTensor]] = None - beam_indices: Optional[torch.LongTensor] = None - attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] 
= None - - -BeamSearchOutput = Union[BeamSearchEncoderDecoderOutput, BeamSearchDecoderOnlyOutput] - - -# Transformers version: Release/v4.35.2 514de24abfd4416aeba6a6455ad5920f57f3567d -# Copied from https://github.com/huggingface/transformers/blob/514de24abfd4416aeba6a6455ad5920f57f3567d/src/transformers/generation/utils.py#L2894 -# Add the function of collecting latency -def old_beam_search( - self, - input_ids: torch.LongTensor, - beam_scorer: BeamScorer, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: bool = False, - **model_kwargs, - ) -> Union[BeamSearchOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **beam search decoding** and - can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - - - In most cases, you do not need to call [`~generation.GenerationMixin.beam_search`] directly. Use generate() - instead. For an overview of generation strategies and code examples, check the [following - guide](../generation_strategies). - - - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - beam_scorer (`BeamScorer`): - An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and - sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - max_length (`int`, *optional*, defaults to 20): - **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated - tokens. The maximum length of the sequence to be generated. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`Union[int, List[int]]`, *optional*): - The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
- synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - model_kwargs: - Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is - an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`generation.BeamSearchDecoderOnlyOutput`], [`~generation.BeamSearchEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.BeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.BeamSearchEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForSeq2SeqLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... BeamSearchScorer, - ... ) - >>> import torch - - >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") - - >>> encoder_input_str = "translate English to German: How old are you?" - >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids - - - >>> # lets run beam search using 3 beams - >>> num_beams = 3 - >>> # define decoder start token ids - >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) - >>> input_ids = input_ids * model.config.decoder_start_token_id - - >>> # add encoder_outputs to model keyword arguments - >>> model_kwargs = { - ... "encoder_outputs": model.get_encoder()( - ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True - ... ) - ... } - - >>> # instantiate beam scorer - >>> beam_scorer = BeamSearchScorer( - ... batch_size=1, - ... num_beams=num_beams, - ... device=model.device, - ... ) - - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [ - ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), - ... ] - ... 
) - - >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs) - - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ['Wie alt bist du?'] - ```""" - # init values - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - if max_length is not None: - warnings.warn( - "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.", - UserWarning, - ) - stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) - if len(stopping_criteria) == 0: - warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning) - pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_attentions = ( - output_attentions if output_attentions is not None else self.generation_config.output_attentions - ) - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.generation_config.return_dict_in_generate - ) - - batch_size = len(beam_scorer._beam_hyps) - num_beams = beam_scorer.num_beams - - batch_beam_size, cur_len = input_ids.shape - - if num_beams * batch_size != batch_beam_size: - raise ValueError( - f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." - ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - beam_indices = ( - tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None - ) - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) - - # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens - # of the first beam are considered to avoid sampling the exact same tokens across all beams. - beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) - beam_scores[:, 1:] = -1e9 - beam_scores = beam_scores.view((batch_size * num_beams,)) - - this_peer_finished = False # used by synced_gpus only - while True: - tic = time.perf_counter() - if synced_gpus: - # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. 
- # The following logic allows an early break if all peers finished generating their sequence - this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) - # send 0.0 if we finished, 1.0 otherwise - dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) - # did all peers finish? the reduced sum will be 0.0 then - if this_peer_finished_flag.item() == 0.0: - break - - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - tic_infer = time.perf_counter() - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - hook_beam.tm_infer_list.append(time.perf_counter() - tic_infer) - - if synced_gpus and this_peer_finished: - cur_len = cur_len + 1 - continue # don't waste resources running the code we don't need - - next_token_logits = outputs.logits[:, -1, :] - next_token_scores = nn.functional.log_softmax( - next_token_logits, dim=-1 - ) # (batch_size * num_beams, vocab_size) - - next_token_scores_processed = logits_processor(input_ids, next_token_scores) - next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as( - next_token_scores_processed - ) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_token_scores_processed,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # reshape for beam search - vocab_size = next_token_scores.shape[-1] - next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) - - # Sample 1 + len(eos_token_id) next tokens for each beam so we have at least 1 non eos token per beam. 
- n_eos_tokens = len(eos_token_id) if eos_token_id else 0 - next_token_scores, next_tokens = torch.topk( - next_token_scores, max(2, 1 + n_eos_tokens) * num_beams, dim=1, largest=True, sorted=True - ) - - next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") - next_tokens = next_tokens % vocab_size - - # stateless - beam_outputs = beam_scorer.process( - input_ids, - next_token_scores, - next_tokens, - next_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - beam_indices=beam_indices, - ) - - beam_scores = beam_outputs["next_beam_scores"] - beam_next_tokens = beam_outputs["next_beam_tokens"] - beam_idx = beam_outputs["next_beam_indices"] - - input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) - - model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder, - ) - if model_kwargs["past_key_values"] is not None: - model_kwargs["past_key_values"] = self._reorder_cache(model_kwargs["past_key_values"], beam_idx) - - if return_dict_in_generate and output_scores: - beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) - - # increase cur_len - cur_len = cur_len + 1 - hook_beam.tm_list.append(time.perf_counter() - tic) - if beam_scorer.is_done or stopping_criteria(input_ids, scores): - if not synced_gpus: - break - else: - this_peer_finished = True - - sequence_outputs = beam_scorer.finalize( - input_ids, - beam_scores, - next_tokens, - next_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - max_length=stopping_criteria.max_length, - beam_indices=beam_indices, - ) - - if return_dict_in_generate: - if not output_scores: - sequence_outputs["sequence_scores"] = None - - if self.config.is_encoder_decoder: - return BeamSearchEncoderDecoderOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - beam_indices=sequence_outputs["beam_indices"], - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - ) - else: - return BeamSearchDecoderOnlyOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - beam_indices=sequence_outputs["beam_indices"], - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - ) - else: - return sequence_outputs["sequences"] \ No newline at end of file diff --git a/llm_bench/python/utils/hook_common.py b/llm_bench/python/utils/hook_common.py index 5e93385d45..3ff78c9f68 100644 --- a/llm_bench/python/utils/hook_common.py +++ b/llm_bench/python/utils/hook_common.py @@ -2,149 +2,26 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # flake8: noqa -import torch -from typing import Union, List, Dict -from transformers.utils import ModelOutput - -TRANS_MIN_VERSION = '4.36.0' -TRANS_SENCOND_VERSION = '4.39.0' - - -# Copied from https://github.com/huggingface/transformers/blob/v4.39-release/src/transformers/generation/utils.py#L4783 -def _split(data, full_batch_size: int, split_size: int = None): - """ - Takes care of three cases: - 1. data is a tensor: e.g. last_hidden_state, pooler_output etc. split them on the batch_size dim - 2. data is a tuple: e.g. hidden_states, attentions etc. 
Keep the tuple as it is and split each tensor in it and - return a list of tuples - 3. data is a tuple of tuples, e.g. past_key_values. Keep the tuple as it is and split each tuple in it and - return a list of tuples of tuples - (see documentation of ModelOutput) - """ - if data is None: - return [None] * (full_batch_size // split_size) - if isinstance(data, torch.Tensor): - return [data[i : i + split_size] for i in range(0, full_batch_size, split_size)] - elif isinstance(data, tuple): - # If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example) - if isinstance(data[0], tuple): - return [ - tuple(tuple(tensor[i : i + split_size] for tensor in inner_tuple) for inner_tuple in data) - for i in range(0, full_batch_size, split_size) - ] - +import logging as log +import transformers +from packaging import version + +TRANS_MIN_VERSION = '4.40.0' + + +def get_bench_hook(num_beams, ov_model): + min_version = version.parse(TRANS_MIN_VERSION) + trans_version = version.parse(transformers.__version__) + search_type = 'beam search' if num_beams > 1 else 'greedy search' + if trans_version >= min_version: + import utils.hook_greedy_search + import utils.hook_beam_search + if num_beams > 1: + bench_hook = utils.hook_beam_search.BeamSearchHook() else: - return [ - tuple(sub_tensor[i : i + split_size] for sub_tensor in data) - for i in range(0, full_batch_size, split_size) - ] + bench_hook = utils.hook_greedy_search.GreedySearchHook() + bench_hook.new_forward(ov_model) else: - raise ValueError(f"Unexpected attribute type: {type(data)}") - - -# Copied from https://github.com/huggingface/transformers/blob/v4.39-release/src/transformers/generation/utils.py#L4814 -def _split_model_inputs( - model_input: Union[ModelOutput, Dict], split_size: int, full_batch_size: int -) -> List[Union[ModelOutput, Dict]]: - """ - Split a ModelOutput object (or its subclasses) or Dict into a list of same-class objects based on a specified split - size. The input object is dict when it was prepared for forward pass and ModelOutput when it was returned from - previous forward pass. - """ - # Edge case: if model_input is None, return a list of Nones - # this happens with Whisper where encoder_outputs is None - if model_input is None: - return [model_input] * (full_batch_size // split_size) - # Infer the class from the object - model_output_cls = type(model_input) - if (full_batch_size % split_size) != 0: - raise ValueError("`full_batch_size` must be divisible by `split_size`") - - if split_size > full_batch_size: - raise ValueError("`split_size` must be smaller or equal to `full_batch_size`") - - # Helper function to split tensors or tuples of tensors - - # Find all the dataclass fields (e.g., last_hidden_state, pooler_output etc.) and split them - keys = ( - model_input.__dataclass_fields__.keys() if hasattr(model_input, "__dataclass_fields__") else model_input.keys() - ) - # We only keep keys that are in the model_input - keys = [k for k in keys if k in model_input] - # Here we can have four types of values: tensors, tuples of tensors and booleans, and encoder_outputs which is a - # ModelOutput object. 
- # bool should not be split but replicated for each split - bool_keys = [k for k in keys if isinstance(model_input[k], bool) or k == "cache_position"] - keys_to_ignore = ["cache_position", "encoder_outputs"] - non_bool_keys = [k for k in keys if not isinstance(model_input[k], bool) and k not in keys_to_ignore] - - # we split the tensors and tuples of tensors - data_split_list = [ - {k: _split(model_input[k], full_batch_size, split_size)[i] for k in non_bool_keys} - for i in range(full_batch_size // split_size) - ] - # bool values are the same and replicated for each split - bool_data = {k: model_input[k] for k in bool_keys} - # encoder_outputs is a ModelOutput object and should be split by its own - if "encoder_outputs" in model_input: - encoder_outputs_split = _split_model_inputs(model_input["encoder_outputs"], split_size, full_batch_size) - data_split_list = [ - {**data_split, "encoder_outputs": encoder_outputs_split[i]} for i, data_split in enumerate(data_split_list) - ] - - # Convert each dictionary in the list to an object of the inferred class - split_model_inputs: List[Union[ModelOutput, Dict]] = [ - model_output_cls(**data_split, **bool_data) for data_split in data_split_list - ] - - return split_model_inputs - - -# Copied from https://github.com/huggingface/transformers/blob/v4.39-release/src/transformers/generation/utils.py#L4871 -def stack_model_outputs(model_outputs: List[ModelOutput]) -> ModelOutput: - """ - Stack a list of ModelOutput objects (or its subclasses) along the batch_size dimension. The function infers the - specific ModelOutput subclass from the list provided. - """ - if not model_outputs: - raise ValueError("Input list is empty.") - - # Infer the class from the first object in the list - model_output_cls = type(model_outputs[0]) - - # Ensure all objects are of the same type - if not all(isinstance(obj, model_output_cls) for obj in model_outputs): - raise ValueError("All elements in the list should be of the same type.") - - # Helper function to concat tensors or tuples of tensors - def _concat(data): - """ - Reverse of `_split` function above. 
- """ - if any(data is None for data in data): - return None - if isinstance(data[0], torch.Tensor): - return torch.cat(data, dim=0) - elif isinstance(data[0], tuple): - # If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example) - if isinstance(data[0][0], tuple): - return tuple( - tuple(torch.cat([attr[i][j] for attr in data], dim=0) for j in range(len(data[0][0]))) - for i in range(len(data[0])) - ) - else: - return tuple(torch.cat([attr[i] for attr in data], dim=0) for i in range(len(data[0]))) - elif isinstance(data[0], (int, float)): - # If the elements are integers or floats, return a tensor - return torch.tensor(data) - else: - raise ValueError(f"Unexpected attribute type: {type(data[0])}") - - # Use a dictionary comprehension to gather attributes from all objects and concatenate them - concatenated_data = { - k: _concat([getattr(model_output, k) for model_output in model_outputs]) - for k in model_output_cls.__dataclass_fields__.keys() - } - - # Return a new object of the inferred class with the concatenated attributes - return model_output_cls(**concatenated_data) \ No newline at end of file + log.warning(f'The minimum version of transformers to get 1st and 2nd tokens latency of {search_type} is: {min_version}') + bench_hook = None + return bench_hook \ No newline at end of file diff --git a/llm_bench/python/utils/hook_greedy_search.py b/llm_bench/python/utils/hook_greedy_search.py index 7fcbff2fd7..a3912726d7 100644 --- a/llm_bench/python/utils/hook_greedy_search.py +++ b/llm_bench/python/utils/hook_greedy_search.py @@ -5,20 +5,19 @@ import time import torch import warnings -import transformers -import torch.distributed as dist import logging as log -import utils.hook_common as hook_common -from packaging import version from typing import Optional, Tuple, Union, List from transformers.generation.stopping_criteria import ( + EosTokenCriteria, StoppingCriteriaList, validate_stopping_criteria, ) from transformers.generation.logits_process import LogitsProcessorList from transformers.generation.streamers import BaseStreamer from transformers.utils import ModelOutput -import utils.hook_greedy_search_old as hook_old_greedy + + +logger = log.getLogger(__name__) class GenerateDecoderOnlyOutput(ModelOutput): @@ -47,8 +46,8 @@ class GenerateEncoderDecoderOutput(ModelOutput): tm_list = [] tm_infer_list = [] -# Transformers version: Release/v4.39.2 97c00cdfe132164dbd793447a088432fa359fd36 -# Copied from https://github.com/huggingface/transformers/blob/v4.39-release/src/transformers/generation/utils.py#L2244 +# Transformers version: v4.40-release 4fdf58afb72b0754da30037fc800b6044e7d9c99 +# Copied from https://github.com/huggingface/transformers/blob/4fdf58afb72b0754da30037fc800b6044e7d9c99/src/transformers/generation/utils.py#L2310 # Add the function of collecting latency def new_greedy_search( self, @@ -173,10 +172,27 @@ def new_greedy_search( ) stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if eos_token_id is not None: + logger.warning_once( + "`eos_token_id` is deprecated in this function and will be removed in v4.41, use" + " `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead." 
+ " Otherwise make sure to set `model.generation_config.eos_token_id`", + FutureWarning, + ) + stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) + else: + # TODO remove when the method is totally private + # need to get `eos_token_id` and add stopping criteria, so that generation does not go forever + eos_token_id = [ + criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id") + ] + eos_token_id = eos_token_id[0] if eos_token_id else None + if eos_token_id is None and self.generation_config.eos_token_id is not None: + eos_token_id = self.generation_config.eos_token_id + stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) + if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] - eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None output_scores = output_scores if output_scores is not None else self.generation_config.output_scores output_attentions = ( output_attentions if output_attentions is not None else self.generation_config.output_attentions @@ -274,12 +290,6 @@ def new_greedy_search( is_encoder_decoder=self.config.is_encoder_decoder, ) - # if eos_token was found in one sentence, set sentence to finished - if eos_token_id_tensor is not None: - unfinished_sequences = unfinished_sequences.mul( - next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) - ) - unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores) this_peer_finished = unfinished_sequences.max() == 0 tm_list.append(time.perf_counter() - tic) @@ -340,15 +350,7 @@ def get_time_infer_list(self): global tm_infer_list return tm_infer_list - def new_forward(self, model, model_type=None): + def new_forward(self, model): """Define a new greedy search function.""" - min_version = version.parse(hook_common.TRANS_MIN_VERSION) - trans_version = version.parse(transformers.__version__) - if trans_version < min_version: - log.warning(f'The function of getting latency of greedy search will not be available with current transformers version:{trans_version}') - else: - min_second_version = version.parse(hook_common.TRANS_SENCOND_VERSION) - if trans_version >= min_second_version: - model._greedy_search = new_greedy_search.__get__(model, model.__class__) - else: - model.greedy_search = hook_old_greedy.old_greedy_search.__get__(model, model.__class__) + model._greedy_search = new_greedy_search.__get__(model, model.__class__) + diff --git a/llm_bench/python/utils/hook_greedy_search_old.py b/llm_bench/python/utils/hook_greedy_search_old.py deleted file mode 100644 index 595aa596da..0000000000 --- a/llm_bench/python/utils/hook_greedy_search_old.py +++ /dev/null @@ -1,302 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# flake8: noqa -import time -import torch -import warnings -import torch.distributed as dist -from typing import Optional, Tuple, Union, List -from transformers.generation.stopping_criteria import ( - StoppingCriteriaList, - validate_stopping_criteria, -) -from transformers.generation.logits_process import LogitsProcessorList -from transformers.generation.streamers import BaseStreamer -from transformers.utils import ModelOutput -import utils.hook_greedy_search as hook_greedy - - -class GreedySearchDecoderOnlyOutput(ModelOutput): - sequences: torch.LongTensor = None - scores: Optional[Tuple[torch.FloatTensor]] = None - attentions: 
Optional[Tuple[Tuple[torch.FloatTensor]]] = None - hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - - -class GreedySearchEncoderDecoderOutput(ModelOutput): - sequences: torch.LongTensor = None - scores: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - - -GreedySearchOutput = Union[GreedySearchEncoderDecoderOutput, GreedySearchDecoderOnlyOutput] - -# Transformers version: Release/v4.35.2 514de24abfd4416aeba6a6455ad5920f57f3567d -# Copied from https://github.com/huggingface/transformers/blob/514de24abfd4416aeba6a6455ad5920f57f3567d/src/transformers/generation/utils.py#L2353 -# Add the function of collecting latency -def old_greedy_search( - self, - input_ids: torch.LongTensor, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[Union[int, List[int]]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: bool = False, - streamer: Optional["BaseStreamer"] = None, - **model_kwargs, - ) -> Union[GreedySearchOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be - used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - - - In most cases, you do not need to call [`~generation.GenerationMixin.greedy_search`] directly. Use generate() - instead. For an overview of generation strategies and code examples, check the [following - guide](../generation_strategies). - - - - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - - max_length (`int`, *optional*, defaults to 20): - **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated - tokens. The maximum length of the sequence to be generated. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`Union[int, List[int]]`, *optional*): - The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. 
- output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - streamer (`BaseStreamer`, *optional*): - Streamer object that will be used to stream the generated sequences. Generated tokens are passed - through `streamer.put(token_ids)` and the streamer is responsible for any further processing. - model_kwargs: - Additional model specific keyword arguments will be forwarded to the `forward` function of the model. - If model is an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`~generation.GreedySearchDecoderOnlyOutput`], [`~generation.GreedySearchEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.GreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.GreedySearchEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForCausalLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... StoppingCriteriaList, - ... MaxLengthCriteria, - ... ) - - >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") - >>> model = AutoModelForCausalLM.from_pretrained("gpt2") - - >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token - >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id - - >>> input_prompt = "It might be possible to" - >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids - - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [ - ... MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id), - ... ] - ... ) - >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)]) - - >>> outputs = model.greedy_search( - ... input_ids, logits_processor=logits_processor, stopping_criteria=stopping_criteria - ... 
) - - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ["It might be possible to get a better understanding of the nature of the problem, but it's not"] - ```""" - # init values - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - if max_length is not None: - warnings.warn( - "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", - UserWarning, - ) - stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) - pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None - output_scores = output_scores if output_scores is not None else self.generation_config.output_scores - output_attentions = ( - output_attentions if output_attentions is not None else self.generation_config.output_attentions - ) - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.generation_config.return_dict_in_generate - ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - decoder_attentions = () if (return_dict_in_generate and output_attentions) else None - cross_attentions = () if (return_dict_in_generate and output_attentions) else None - decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None - ) - - # keep track of which sequences are already finished - unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device) - - this_peer_finished = False # used by synced_gpus only - while True: - tic = time.perf_counter() - if synced_gpus: - # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. - # The following logic allows an early break if all peers finished generating their sequence - this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) - # send 0.0 if we finished, 1.0 otherwise - dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) - # did all peers finish? 
the reduced sum will be 0.0 then - if this_peer_finished_flag.item() == 0.0: - break - - # prepare model inputs - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - # forward pass to get next token - tic_infer = time.perf_counter() - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - hook_greedy.tm_infer_list.append(time.perf_counter() - tic_infer) - - if synced_gpus and this_peer_finished: - continue # don't waste resources running the code we don't need - - next_token_logits = outputs.logits[:, -1, :] - - # pre-process distribution - next_tokens_scores = logits_processor(input_ids, next_token_logits) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_tokens_scores,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # argmax - next_tokens = torch.argmax(next_tokens_scores, dim=-1) - - # finished sentences should have their next token be a padding token - if eos_token_id is not None: - if pad_token_id is None: - raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") - next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) - - # update generated ids, model inputs, and length for next step - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - if streamer is not None: - streamer.put(next_tokens.cpu()) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder - ) - - # if eos_token was found in one sentence, set sentence to finished - if eos_token_id_tensor is not None: - unfinished_sequences = unfinished_sequences.mul( - next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) - ) - - # stop when each sentence is finished - if unfinished_sequences.max() == 0: - this_peer_finished = True - - # stop if we exceed the maximum length - if stopping_criteria(input_ids, scores): - this_peer_finished = True - hook_greedy.tm_list.append(time.perf_counter() - tic) - if this_peer_finished and not synced_gpus: - break - - if streamer is not None: - streamer.end() - - if return_dict_in_generate: - if self.config.is_encoder_decoder: - return GreedySearchEncoderDecoderOutput( - sequences=input_ids, - scores=scores, - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - ) - else: - return GreedySearchDecoderOnlyOutput( - sequences=input_ids, - scores=scores, - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - ) - else: - return input_ids \ No newline at end of file diff --git a/llm_bench/python/utils/metrics_print.py b/llm_bench/python/utils/metrics_print.py index 650a0e4d28..9e82a95128 100644 --- a/llm_bench/python/utils/metrics_print.py +++ b/llm_bench/python/utils/metrics_print.py @@ -43,6 +43,9 @@ def print_metrics( f"[{iter_str}] First token latency: 
{iter_data['first_token_latency']:.2f} ms/{latency_unit}, " f"other tokens latency: {iter_data['other_tokens_avg_latency']:.2f} ms/{latency_unit}, len of tokens: {len(tms)} * {batch_size}", ) + else: + if tokenization_time: + log.warning(f'[{iter_str}] No hook data output for first token latency and other tokens latency') if len(tms_infer) > 0: iter_data['first_token_infer_latency'] = tms_infer[0] * 1000 if len(tms_infer) > 0 else -1 iter_data['other_tokens_infer_avg_latency'] = sum(tms_infer[1:]) / (len(tms_infer) - 1) * 1000 if len(tms_infer) > 1 else -1 @@ -50,6 +53,9 @@ def print_metrics( f"[{iter_str}] First infer latency: {iter_data['first_token_infer_latency']:.2f} ms/infer, " f"other infers latency: {iter_data['other_tokens_infer_avg_latency']:.2f} ms/infer, inference count: {len(tms_infer)}", ) + else: + if tokenization_time: + log.warning(f'[{iter_str}] No hook data output for first infer latency and other infers latency') if stable_diffusion is not None: print_stable_diffusion_infer_latency(iter_str, iter_data, stable_diffusion) output_str = '' @@ -106,7 +112,7 @@ def print_ldm_unet_vqvae_infer_latency(iter_num, iter_data, tms=None, warm_up=Fa f"vqvae decoder step count: 1",) -def output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch_size): +def output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch_size, is_text_gen): for p_idx in prompt_idx_list: avg_1st_token_latency = 0 avg_2nd_tokens_latency = 0 @@ -118,21 +124,30 @@ def output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch if iter_data['iteration'] == 0: continue if iter_data['prompt_idx'] == p_idx: - avg_1st_token_latency += iter_data['first_token_latency'] - avg_2nd_tokens_latency += iter_data['other_tokens_avg_latency'] - avg_input_size += iter_data['input_size'] + avg_1st_token_latency += iter_data['first_token_latency'] if iter_data['first_token_latency'] != '' else 0 + avg_2nd_tokens_latency += iter_data['other_tokens_avg_latency'] if iter_data['other_tokens_avg_latency'] != '' else 0 + avg_input_size += iter_data['input_size'] if iter_data['input_size'] != '' else 0 index_num = index_num + 1 if index_num > 0: avg_1st_token_latency = avg_1st_token_latency / index_num avg_2nd_tokens_latency = avg_2nd_tokens_latency / index_num avg_input_size = int(avg_input_size / index_num) - avg_2nd_token_tput = (1 / avg_2nd_tokens_latency) * batch_size * 1000 - latency_unit = 'token' + if avg_2nd_tokens_latency > 0: + avg_2nd_token_tput = (1 / avg_2nd_tokens_latency) * batch_size * 1000 + latency_unit = 'token' if is_text_gen is True else 'step' if batch_size > 1: - latency_unit = '{}tokens'.format(batch_size) - prompt_dict[p_idx] = '\n[ INFO ] [Average] Prompt[{}] Input token size: {}, 1st token lantency: {:.2f} ms/{}, ' \ - '2nd tokens latency: {:.2f} ms/{}, 2nd tokens throughput: {:.2f} tokens/s' \ - .format(p_idx, avg_input_size, avg_1st_token_latency, latency_unit, avg_2nd_tokens_latency, latency_unit, avg_2nd_token_tput) + if is_text_gen is True: + latency_unit = '{}tokens'.format(batch_size) + else: + latency_unit = '{}steps'.format(batch_size) + if is_text_gen is True: + prompt_dict[p_idx] = '\n[ INFO ] [Average] Prompt[{}] Input token size: {}, 1st token lantency: {:.2f} ms/{}, ' \ + '2nd tokens latency: {:.2f} ms/{}, 2nd tokens throughput: {:.2f} tokens/s' \ + .format(p_idx, avg_input_size, avg_1st_token_latency, latency_unit, avg_2nd_tokens_latency, latency_unit, avg_2nd_token_tput) + else: + prompt_dict[p_idx] = '\n[ INFO ] [Average] Prompt[{}] 1st step 
of unet latency {:.2f} ms/{}, ' \ + '2nd steps of unet latency: {:.2f} ms/{}, 2nd steps throughput: {:.2f} steps/s' \ + .format(p_idx, avg_1st_token_latency, latency_unit, avg_2nd_tokens_latency, latency_unit, avg_2nd_token_tput) def print_average(iter_data_list, prompt_idx_list, batch_size, is_text_gen=False): @@ -156,8 +171,7 @@ def print_average(iter_data_list, prompt_idx_list, batch_size, is_text_gen=False if total_iters > 0: prompt_dict = {} - if is_text_gen is True: - output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch_size) + output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch_size, is_text_gen) log.info('<<< Warm-up iteration is excluded. >>>') out_str = '[Total] Iterations: {}'.format(total_iters) for prompt_key in prompt_dict: diff --git a/llm_bench/python/utils/output_csv.py b/llm_bench/python/utils/output_csv.py index 17b2b983e1..72a26d8c89 100644 --- a/llm_bench/python/utils/output_csv.py +++ b/llm_bench/python/utils/output_csv.py @@ -2,6 +2,8 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 import csv +import numpy as np +import copy from pathlib import Path @@ -40,13 +42,13 @@ def output_comments(result, use_case, writer): comment_list.append('detokenization_time: Tokenizer decode time') comment_list.append('pretrain_time: Total time of load model and compile model') comment_list.append('generation_time: Time for one interaction. (e.g. The duration of answering one question or generating one picture)') - comment_list.append('iteration=0: warm-up; iteration=-1: average (exclude warm-up)') + comment_list.append('iteration=0: warm-up; iteration=avg: average (exclude warm-up);iteration=mini: minimum value (exclude warm-up);' + 'iteration=median: median value (exclude warm-up);') comment_list.append( - 'max_rss_mem: max rss memory consumption;' 'the value in -1 iteration row is the maximum value of all available RSS memory numbers in iterations', + 'max_rss_mem: max rss memory consumption;' ) comment_list.append( 'max_shared_mem: max shared memory consumption;' - 'the value in -1 iteration row is the maximum value of all available shared memory numbers in iterations', ) for comments in comment_list: @@ -54,6 +56,66 @@ def output_comments(result, use_case, writer): writer.writerow(result) +def output_avg_min_median(iter_data_list): + prompt_idxs = [] + for iter_data in iter_data_list: + prompt_idxs.append(iter_data['prompt_idx']) + prompt_idxs = list(set(prompt_idxs)) + result = {} + for prompt_idx in prompt_idxs: + same_prompt_datas = [] + for iter_data in iter_data_list: + if iter_data['prompt_idx'] == prompt_idx and iter_data['iteration'] > 0: + same_prompt_datas.append(iter_data) + key_word = ['input_size', 'infer_count', 'generation_time', 'output_size', 'latency', 'first_token_latency', 'other_tokens_avg_latency', + 'first_token_infer_latency', 'other_tokens_infer_avg_latency', 'tokenization_time', 'detokenization_time'] + if len(same_prompt_datas) > 0: + iters_idx = ['avg', 'mini', 'median'] + result[prompt_idx] = [copy.deepcopy(same_prompt_datas[0]) for i in range(3)] + for i in range(len(iters_idx)): + result[prompt_idx][i]['iteration'] = iters_idx[i] + for key in key_word: + values = [] + for prompt in same_prompt_datas: + if prompt[key] != '': + values.append(prompt[key]) + if len(values) > 0: + result[prompt_idx][0][key] = np.mean(values) + result[prompt_idx][1][key] = np.min(values) + result[prompt_idx][2][key] = np.median(values) + return result + + +def gen_data_to_csv(result, 
iter_data, pretrain_time): + generation_time = iter_data['generation_time'] + latency = iter_data['latency'] + first_latency = iter_data['first_token_latency'] + other_latency = iter_data['other_tokens_avg_latency'] + first_token_infer_latency = iter_data['first_token_infer_latency'] + other_token_infer_latency = iter_data['other_tokens_infer_avg_latency'] + rss_mem = iter_data['max_rss_mem_consumption'] + shared_mem = iter_data['max_shared_mem_consumption'] + token_time = iter_data['tokenization_time'] + detoken_time = iter_data['detokenization_time'] + result['iteration'] = str(iter_data['iteration']) + result['pretrain_time(s)'] = pretrain_time + result['input_size'] = iter_data['input_size'] + result['infer_count'] = iter_data['infer_count'] + result['generation_time(s)'] = round(generation_time, 5) if generation_time != '' else generation_time + result['output_size'] = iter_data['output_size'] + result['latency(ms)'] = round(latency, 5) if latency != '' else latency + result['result_md5'] = iter_data['result_md5'] + result['1st_latency(ms)'] = round(first_latency, 5) if first_latency != '' else first_latency + result['2nd_avg_latency(ms)'] = round(other_latency, 5) if other_latency != '' else other_latency + result['1st_infer_latency(ms)'] = round(first_token_infer_latency, 5) if first_token_infer_latency != '' else first_token_infer_latency + result['2nd_infer_avg_latency(ms)'] = round(other_token_infer_latency, 5) if other_token_infer_latency != '' else other_token_infer_latency + result['max_rss_mem(MB)'] = round(rss_mem, 5) if rss_mem != '' else rss_mem + result['max_shared_mem(MB)'] = round(shared_mem, 5) if shared_mem != '' else shared_mem + result['prompt_idx'] = iter_data['prompt_idx'] + result['tokenization_time'] = round(token_time, 5) if token_time != '' else token_time + result['detokenization_time'] = round(detoken_time, 5) if detoken_time != '' else detoken_time + + def write_result(report_file, model, framework, device, model_args, iter_data_list, pretrain_time, model_precision): header = [ 'iteration', @@ -86,17 +148,6 @@ def write_result(report_file, model, framework, device, model_args, iter_data_li with open(out_file, 'w+', newline='') as f: writer = csv.DictWriter(f, header) writer.writeheader() - - total_generation_time = 0 - total_num_tokens = 0 - total_input_size = 0 - total_infer_count = 0 - total_first_token_latency = 0 - total_other_tokens_avg_latency = 0 - total_first_token_infer_latency = 0 - total_other_tokens_infer_avg_latency = 0 - total_max_rss_mem_consumption = 0 - total_max_shared_mem_consumption = 0 result = {} result['model'] = model result['framework'] = framework @@ -105,97 +156,16 @@ def write_result(report_file, model, framework, device, model_args, iter_data_li result['precision'] = model_precision result['num_beams'] = model_args['num_beams'] result['batch_size'] = model_args['batch_size'] - total_iters = len(iter_data_list) - - skip_iter_nums = 0 - for i in range(total_iters): + for i in range(len(iter_data_list)): iter_data = iter_data_list[i] - generation_time = iter_data['generation_time'] - latency = iter_data['latency'] - first_latency = iter_data['first_token_latency'] - other_latency = iter_data['other_tokens_avg_latency'] - first_token_infer_latency = iter_data['first_token_infer_latency'] - other_token_infer_latency = iter_data['other_tokens_infer_avg_latency'] - rss_mem = iter_data['max_rss_mem_consumption'] - shared_mem = iter_data['max_shared_mem_consumption'] - token_time = iter_data['tokenization_time'] - detoken_time = 
iter_data['detokenization_time'] - result['iteration'] = str(iter_data['iteration']) - if i > 0: - result['pretrain_time(s)'] = '' - - result['input_size'] = iter_data['input_size'] - result['infer_count'] = iter_data['infer_count'] - result['generation_time(s)'] = round(generation_time, 5) if generation_time != '' else generation_time - result['output_size'] = iter_data['output_size'] - result['latency(ms)'] = round(latency, 5) if latency != '' else latency - result['result_md5'] = iter_data['result_md5'] - result['1st_latency(ms)'] = round(first_latency, 5) if first_latency != '' else first_latency - result['2nd_avg_latency(ms)'] = round(other_latency, 5) if other_latency != '' else other_latency - result['1st_infer_latency(ms)'] = round(first_token_infer_latency, 5) if first_token_infer_latency != '' else first_token_infer_latency - result['2nd_infer_avg_latency(ms)'] = round(other_token_infer_latency, 5) if other_token_infer_latency != '' else other_token_infer_latency - result['max_rss_mem(MB)'] = round(rss_mem, 5) if rss_mem != '' else rss_mem - result['max_shared_mem(MB)'] = round(shared_mem, 5) if shared_mem != '' else shared_mem - result['prompt_idx'] = iter_data['prompt_idx'] - result['tokenization_time'] = round(token_time, 5) if token_time != '' else token_time - result['detokenization_time'] = round(detoken_time, 5) if detoken_time != '' else detoken_time + pre_time = '' if i > 0 else result['pretrain_time(s)'] + gen_data_to_csv(result, iter_data, pre_time) writer.writerow(result) - # Skip the warm-up iteration - if iter_data['iteration'] > 0: - if iter_data['generation_time'] != '': - total_generation_time += iter_data['generation_time'] - if iter_data['output_size'] != '': - total_num_tokens += iter_data['output_size'] - if iter_data['input_size'] != '': - total_input_size += iter_data['input_size'] - if iter_data['first_token_latency'] != '': - total_first_token_latency += iter_data['first_token_latency'] - if iter_data['other_tokens_avg_latency'] != '': - total_other_tokens_avg_latency += iter_data['other_tokens_avg_latency'] - if iter_data['first_token_infer_latency'] != '': - total_first_token_infer_latency += iter_data['first_token_infer_latency'] - if iter_data['other_tokens_infer_avg_latency'] != '': - total_other_tokens_infer_avg_latency += iter_data['other_tokens_infer_avg_latency'] - if iter_data['infer_count'] != '': - total_infer_count += iter_data['infer_count'] - else: - skip_iter_nums = skip_iter_nums + 1 - if iter_data['max_rss_mem_consumption'] != '': - if iter_data['max_rss_mem_consumption'] > total_max_rss_mem_consumption: - total_max_rss_mem_consumption = iter_data['max_rss_mem_consumption'] - if iter_data['max_shared_mem_consumption'] != '': - if iter_data['max_shared_mem_consumption'] > total_max_shared_mem_consumption: - total_max_shared_mem_consumption = iter_data['max_shared_mem_consumption'] - total_iters -= skip_iter_nums - if total_iters > 0: - result['iteration'] = str('-1') - result['pretrain_time(s)'] = '' - if total_input_size > 0: - result['input_size'] = round(total_input_size / total_iters, 5) - if total_infer_count > 0: - result['infer_count'] = round(total_infer_count / total_iters, 5) - if total_generation_time > 0: - result['generation_time(s)'] = round(total_generation_time / total_iters, 5) - if total_num_tokens > 0: - avg_per_token_time = total_generation_time * 1000 / total_num_tokens - result['output_size'] = round(total_num_tokens / total_iters, 5) - result['latency(ms)'] = round(avg_per_token_time, 5) - else: - 
result['output_size'] = '' - result['latency(ms)'] = '' - if total_first_token_latency > 0: - result['1st_latency(ms)'] = round(total_first_token_latency / total_iters, 5) - if total_other_tokens_avg_latency > 0: - result['2nd_avg_latency(ms)'] = round(total_other_tokens_avg_latency / total_iters, 5) - if total_first_token_infer_latency > 0: - result['1st_infer_latency(ms)'] = round(total_first_token_infer_latency / total_iters, 5) - if total_other_tokens_infer_avg_latency > 0: - result['2nd_infer_avg_latency(ms)'] = round(total_other_tokens_infer_avg_latency / total_iters, 5) - if total_max_rss_mem_consumption > 0: - result['max_rss_mem(MB)'] = total_max_rss_mem_consumption - if total_max_shared_mem_consumption > 0: - result['max_shared_mem(MB)'] = total_max_shared_mem_consumption - writer.writerow(result) + res_data = output_avg_min_median(iter_data_list) + for key in res_data.keys(): + for data in res_data[key]: + gen_data_to_csv(result, data, '') + writer.writerow(result) output_comments(result, model_args['use_case'], writer) diff --git a/llm_bench/python/utils/ov_utils.py b/llm_bench/python/utils/ov_utils.py index 1ec9f58195..6aa0e1e403 100644 --- a/llm_bench/python/utils/ov_utils.py +++ b/llm_bench/python/utils/ov_utils.py @@ -9,9 +9,7 @@ import torch import time import types -import utils.hook_greedy_search -import utils.hook_beam_search - +import utils.hook_common as hook_common from utils.config_class import OV_MODEL_CLASSES_MAPPING, TOKENIZE_CLASSES_MAPPING, DEFAULT_MODEL_CLASSES import openvino.runtime.opset13 as opset @@ -121,6 +119,7 @@ def decode_ov_tokenizer(self, token_ids, *args, **kwargs): hf_tokenizer.decode = types.MethodType(decode_ov_tokenizer, hf_tokenizer) return hf_tokenizer + def create_text_gen_model(model_path, device, **kwargs): """Create text generation model. @@ -148,7 +147,7 @@ def create_text_gen_model(model_path, device, **kwargs): if kwargs["batch_size"] > 1 or kwargs["num_beams"] > 1: log.warning("OpenVINO GenAI based benchmarking implmented only for batch_size == 1 and num_beams == 1") elif model_class not in [OV_MODEL_CLASSES_MAPPING[default_model_type], OV_MODEL_CLASSES_MAPPING["mpt"]]: - log.warning("OpenVINO GenAI based benchmarking is not available for {model_type}. Will be switched to default bencmarking") + log.warning("OpenVINO GenAI based benchmarking is not available for {model_type}. 
Will be switched to default bencmarking") else: return create_genai_text_gen_model(model_path, device, ov_config, **kwargs) remote_code = False @@ -169,11 +168,7 @@ def create_text_gen_model(model_path, device, **kwargs): if not isinstance(ov_model, OV_MODEL_CLASSES_MAPPING['t5']): patch_inter_processing_and_compile(ov_model, **kwargs) end = time.perf_counter() - if kwargs['num_beams'] > 1: - bench_hook = utils.hook_beam_search.BeamSearchHook() - else: - bench_hook = utils.hook_greedy_search.GreedySearchHook() - bench_hook.new_forward(ov_model, model_type) + bench_hook = hook_common.get_bench_hook(kwargs['num_beams'], ov_model) from_pretrained_time = end - start log.info(f'From pretrained time: {from_pretrained_time:.2f}s') # load token @@ -185,7 +180,6 @@ def create_text_gen_model(model_path, device, **kwargs): def create_genai_text_gen_model(model_path, device, ov_config, **kwargs): import openvino_genai - import openvino_tokenizers from transformers import AutoTokenizer class TokenStreamer(openvino_genai.StreamerBase): @@ -214,16 +208,16 @@ def get_tokens(self): def get_time_list(self): return self.token_generation_time - + if not (model_path / "openvino_tokenizer.xml").exists() or not (model_path / "openvino_detokenizer.xml").exists(): convert_ov_tokenizer(model_path) - + core = Core() hf_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) ov_tok = core.read_model(model_path / "openvino_tokenizer.xml") ov_detok = core.read_model(model_path / "openvino_detokenizer.xml") hf_tokenizer = build_ov_tokenizer_wrapper(hf_tokenizer, ov_tok, ov_detok) - + start = time.perf_counter() # TO DO: add plugin config @@ -234,6 +228,7 @@ def get_time_list(self): return llm_pipe, hf_tokenizer, end - start, streamer, True + def convert_ov_tokenizer(tokenizer_path): from optimum.exporters.openvino.convert import export_tokenizer from transformers import AutoTokenizer @@ -277,12 +272,12 @@ def create_ldm_super_resolution_model(model_path, device, **kwargs): def is_genai_available(log=False): + import importlib try: - import openvino_genai + importlib.import_module('openvino_genai') except ImportError as ex: if log: log.warning("Attempt to load OpenVINO GenaAI package failed. Please install openvino_genai package. 
Full error message available in debug mode") log.debug(ex) return False return True - \ No newline at end of file diff --git a/llm_bench/python/utils/pt_utils.py b/llm_bench/python/utils/pt_utils.py index d703f4bb1a..ccf401330c 100644 --- a/llm_bench/python/utils/pt_utils.py +++ b/llm_bench/python/utils/pt_utils.py @@ -7,11 +7,7 @@ import os import time import logging as log -import openvino.torch # noqa: F401 -import utils.hook_greedy_search -import utils.hook_beam_search - -MAX_CONNECT_TIME = 50 +import utils.hook_common as hook_common def set_bf16(model, device, **kwargs): @@ -95,11 +91,7 @@ def create_text_gen_model(model_path, device, **kwargs): else: raise RuntimeError('==Failure ==: no device to load') - if kwargs['num_beams'] > 1: - bench_hook = utils.hook_beam_search.BeamSearchHook() - else: - bench_hook = utils.hook_greedy_search.GreedySearchHook() - bench_hook.new_forward(model, model_type) + bench_hook = hook_common.get_bench_hook(kwargs['num_beams'], model) if kwargs['torch_compile_backend']: backend = kwargs['torch_compile_backend'] diff --git a/pyproject.toml b/pyproject.toml index dbab155062..1531e39fd2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "openvino_genai" version = "2024.2.0.0" description = "Python bindings for https://github.com/openvinotoolkit/openvino.genai" requires-python = ">=3.8" -readme = {file = "text_generation/causal_lm/cpp/README.md", content-type="text/markdown"} +readme = {file = "src/README.md", content-type="text/markdown"} license = {text = "OSI Approved :: Apache Software License"} authors = [ { name = "OpenVINO Developers", email = "openvino@intel.com" }, @@ -20,8 +20,9 @@ dependencies = [ ] [tool.scikit-build] -cmake.source-dir = "./" cmake.build-type = "Release" +cmake.source-dir = "./" +cmake.targets = ["py_generate_pipeline"] # Adding genai would trigger a Release build and Debug build after it. py_generate_pipeline depends on genai and genai will be built anyway. It's not been investigated why both build types are triggered. install.components = ["wheel_genai"] sdist.cmake = true wheel.packages = ["src/python/openvino_genai"] @@ -37,5 +38,11 @@ __version__ = "${version}" [build-system] # TODO: add build.tool-args = ["--parallel"] after scikit-build-core is updated to 0.9.4+. -requires = ["scikit-build-core~=0.8.0", "cmake~=3.23"] # See https://github.com/openvinotoolkit/openvino_tokenizers/pull/123 +requires = ["scikit-build-core~=0.8.0"] # See https://github.com/openvinotoolkit/openvino_tokenizers/pull/123 build-backend = "scikit_build_core.build" + +[tool.pytest.ini_options] +markers = [ + "nightly", + "precommit: (deselect with '-m \"precommit\"')", +] diff --git a/requirements-build.txt b/requirements-build.txt index d75687fa2a..81be222a8b 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1 +1,2 @@ build~=1.2.1 +cmake~=3.23 diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index ec909de271..399ce29084 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -39,7 +39,7 @@ ov_genai_build_jinja2cpp() file(GLOB SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp") -set(TARGET_NAME genai) +set(TARGET_NAME openvino_genai) add_library(${TARGET_NAME} SHARED ${SOURCE_FILES}) add_library(openvino::genai ALIAS ${TARGET_NAME}) @@ -53,7 +53,7 @@ target_compile_features(${TARGET_NAME} PUBLIC cxx_std_17) # Extract two last digits from CMAKE_PROJECT_VERSION_MAJOR because SOVERSION can only contain up to 4 symbols. 
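 # (Illustrative note, assuming the project version 2024.2.0.0 used elsewhere in this patch:
 #  the regex below yields MAJOR_SUFFIX "24", so SOVERSION resolves to "2420" - four symbols.)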
string(REGEX MATCH [=[[0-9][0-9]$]=] MAJOR_SUFFIX ${CMAKE_PROJECT_VERSION_MAJOR}) set_target_properties(${TARGET_NAME} PROPERTIES - OUTPUT_NAME openvino_genai + EXPORT_NAME genai VERSION ${CMAKE_PROJECT_VERSION} SOVERSION ${MAJOR_SUFFIX}${CMAKE_PROJECT_VERSION_MINOR}${CMAKE_PROJECT_VERSION_PATCH} ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" diff --git a/src/cpp/OpenVINOGenAIConfig.cmake.in b/src/cpp/OpenVINOGenAIConfig.cmake.in index 18c0bb4e48..c1f9c86c52 100644 --- a/src/cpp/OpenVINOGenAIConfig.cmake.in +++ b/src/cpp/OpenVINOGenAIConfig.cmake.in @@ -3,8 +3,8 @@ include(CMakeFindDependencyMacro) find_dependency(OpenVINO COMPONENTS Runtime) -if(NOT TARGET genai) +if(NOT TARGET openvino_genai) include("${CMAKE_CURRENT_LIST_DIR}/OpenVINOGenAITargets.cmake") endif() -check_required_components(openvino_genai) +check_required_components(OpenVINOGenAI) diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index 5dcc1a2670..002b0cb4ca 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -3,6 +3,7 @@ #pragma once +#include #include #include #include @@ -84,5 +85,28 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { std::shared_ptr m_pimpl; }; +/** +* @brief Returns an absolute path. The path is this library's directory + * concatenated with openvino_tokenizers OS specific + * * name (.so, .dll, .dylib, lib prefix). This is part of the interface + * because it's reused in Python bindings. + * tokenizers_relative_to_genai() and ScopedVar allow passing a path to + * openvino_tokenizers through env var removing one argument from + * Tokenizer's constructor. +*/ +OPENVINO_GENAI_EXPORTS std::filesystem::path tokenizers_relative_to_genai(); + +/** +* @brief Sets ENVIRONMENT_VARIABLE_NAME to environment_variable_value + * and unsets in destructor. Does nothing if ENVIRONMENT_VARIABLE_NAME + * was already defined. 
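+ * Illustrative usage, mirroring tokenizer.cpp in this patch:
+ *   ov::genai::ScopedVar env_manager(tokenizers_relative_to_genai().string());
+ *   // OPENVINO_TOKENIZERS_PATH_GENAI stays set until env_manager goes out of scope.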
+*/ +class OPENVINO_GENAI_EXPORTS ScopedVar { +public: + explicit ScopedVar(const std::string& environment_variable_value); + ~ScopedVar(); + bool was_already_set; + static constexpr char ENVIRONMENT_VARIABLE_NAME[] = "OPENVINO_TOKENIZERS_PATH_GENAI"; +}; } // namespace genai } // namespace ov diff --git a/src/cpp/include/openvino/genai/visibility.hpp b/src/cpp/include/openvino/genai/visibility.hpp index 6a8cf756e0..4a1a60bb61 100644 --- a/src/cpp/include/openvino/genai/visibility.hpp +++ b/src/cpp/include/openvino/genai/visibility.hpp @@ -1,10 +1,12 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#pragma once + #include "openvino/core/visibility.hpp" -#ifdef genai_EXPORTS +#ifdef openvino_genai_EXPORTS # define OPENVINO_GENAI_EXPORTS OPENVINO_CORE_EXPORTS #else # define OPENVINO_GENAI_EXPORTS OPENVINO_CORE_IMPORTS -#endif // genai_EXPORTS +#endif // openvino_genai_EXPORTS diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index a380b401a1..7cf4f973ea 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -97,7 +97,6 @@ std::filesystem::path with_openvino_tokenizers(const std::filesystem::path& path #endif return path.parent_path() / tokenizers; } - } // namespace namespace ov { @@ -118,7 +117,7 @@ class Tokenizer::TokenizerImpl { if (ov::genai::utils::is_xml(tokenizers_path)) OPENVINO_THROW("tokenizers_path should be a path to a dir not a xml file"); - const char* ov_tokenizers_path = getenv(ov::genai::utils::get_tokenizers_env_name()); + const char* ov_tokenizers_path = getenv(ScopedVar::ENVIRONMENT_VARIABLE_NAME); if (ov_tokenizers_path) { core.add_extension(ov_tokenizers_path); } else { @@ -203,7 +202,7 @@ class Tokenizer::TokenizerImpl { }; Tokenizer::Tokenizer(const std::string& tokenizers_path, const std::string& device) { - ov::genai::utils::GenAIEnvManager env_manager(with_openvino_tokenizers(get_ov_genai_library_path()).string()); + ov::genai::ScopedVar env_manager(tokenizers_relative_to_genai().string()); m_pimpl = std::make_shared(tokenizers_path, device); } @@ -261,5 +260,33 @@ void Tokenizer::set_eos_token_id(int64_t eos_token_id) { Tokenizer::~Tokenizer() = default; +std::filesystem::path tokenizers_relative_to_genai() { + return with_openvino_tokenizers(get_ov_genai_library_path()); +} + +ScopedVar::ScopedVar(const std::string& environment_variable_value) { +#ifdef _WIN32 + char* value = nullptr; + size_t len = 0; + _dupenv_s(&value, &len, ENVIRONMENT_VARIABLE_NAME); + if (value == nullptr) + _putenv_s(ENVIRONMENT_VARIABLE_NAME, environment_variable_value.c_str()); +#else + if (!getenv(ENVIRONMENT_VARIABLE_NAME)) + setenv(ENVIRONMENT_VARIABLE_NAME, environment_variable_value.c_str(), 1); +#endif + else + was_already_set = true; +} + +ScopedVar::~ScopedVar() { + if (!was_already_set) { +#ifdef _WIN32 + _putenv_s(ENVIRONMENT_VARIABLE_NAME, ""); +#else + unsetenv(ENVIRONMENT_VARIABLE_NAME); +#endif + } +} } // namespace genai } // namespace ov diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index 7dac6571dc..3acb2c28c0 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -141,37 +141,6 @@ ov::Tensor extend_attention(ov::Tensor attention_mask) { } return new_atten_mask; } - -GenAIEnvManager::GenAIEnvManager(const std::string& path) { - #ifdef _WIN32 - char* value = nullptr; - size_t len = 0; - _dupenv_s(&value, &len, ov::genai::utils::get_tokenizers_env_name()); - if (value == nullptr) - _putenv_s(ov::genai::utils::get_tokenizers_env_name(), path.c_str()); - #else - if 
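Editor's note: `ScopedVar` is an RAII guard: it exports `OPENVINO_TOKENIZERS_PATH_GENAI` only if the caller has not already set it, and clears it again in the destructor. For readers more at home in Python, a context-manager analogue of the same set-if-unset / restore-on-exit behaviour is sketched below; it is not part of the patch, only a restatement of the C++ logic shown above.

```python
import os
from contextlib import contextmanager

ENVIRONMENT_VARIABLE_NAME = "OPENVINO_TOKENIZERS_PATH_GENAI"


@contextmanager
def scoped_var(value: str):
    """Set the variable only if the caller has not set it already; undo on exit."""
    was_already_set = ENVIRONMENT_VARIABLE_NAME in os.environ
    if not was_already_set:
        os.environ[ENVIRONMENT_VARIABLE_NAME] = value
    try:
        yield
    finally:
        if not was_already_set:
            os.environ.pop(ENVIRONMENT_VARIABLE_NAME, None)
```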
(!getenv(ov::genai::utils::get_tokenizers_env_name())) - setenv(ov::genai::utils::get_tokenizers_env_name(), path.c_str(), 1); - #endif - else - was_already_set = true; -} - -GenAIEnvManager::~GenAIEnvManager() { - if (!was_already_set){ - #ifdef _WIN32 - _putenv_s(ov::genai::utils::get_tokenizers_env_name(), ""); - #else - unsetenv(ov::genai::utils::get_tokenizers_env_name()); - #endif - } -} - -const char* get_tokenizers_env_name() { - return "OPENVINO_TOKENIZERS_PATH_GENAI"; -} - - } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 84ee8b711f..55b510f81f 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -58,20 +58,6 @@ void read_anymap_param(const ov::AnyMap& config_map, const std::string& name, T& param = config_map.at(name).as(); } } - -const char* get_tokenizers_env_name(); - -// const char* OV_TOKENIZERS_ENV_NAME = "OPENVINO_TOKENIZERS_PATH_GENAI"; - -class GenAIEnvManager { -public: - GenAIEnvManager(const std::string& path); - ~GenAIEnvManager(); -private: - bool was_already_set; -}; - } // namespace utils } // namespace genai } // namespace ov - diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index e53ba6ca02..1adeee111f 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -8,7 +8,6 @@ FetchContent_Declare( URL https://github.com/pybind/pybind11/archive/refs/tags/v2.12.0.tar.gz URL_HASH SHA256=bf8f242abd1abcd375d516a7067490fb71abd79519a282d22b6e4d19282185a7 ) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) FetchContent_GetProperties(pybind11) if(NOT pybind11_POPULATED) FetchContent_Populate(pybind11) @@ -42,11 +41,11 @@ endif() find_package(Python3 REQUIRED COMPONENTS Interpreter Development) install(FILES "${CMAKE_BINARY_DIR}/openvino_genai/__init__.py" "${CMAKE_BINARY_DIR}/openvino_genai/__version__.py" DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}) -install(TARGETS genai py_generate_pipeline LIBRARY DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}) +install(TARGETS openvino_genai py_generate_pipeline LIBRARY DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}) # wheel_genai component is used for wheel generation in pyproject.toml. # Exclude wheel_genai from normal packaging because there's pygenai_X_Y component for that. -install(TARGETS genai py_generate_pipeline +install(TARGETS openvino_genai py_generate_pipeline LIBRARY DESTINATION . COMPONENT wheel_genai RUNTIME DESTINATION . COMPONENT wheel_genai EXCLUDE_FROM_ALL) diff --git a/src/python/openvino_genai/__version__.py b/src/python/openvino_genai/__version__.py index 79da913d68..472f83a46f 100644 --- a/src/python/openvino_genai/__version__.py +++ b/src/python/openvino_genai/__version__.py @@ -1,2 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + # Will be overwritten by pyproject.toml or cmake. 
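Editor's note: with the target rename, the Python install rules above now place the `openvino_genai` library and the `py_generate_pipeline` bindings inside the `openvino_genai` package directory. A quick post-install check along the lines below (not part of the patch; exact library file names vary by platform and version) can confirm the layout:

```python
# Assumed verification snippet: list the shared libraries packaged next to the bindings.
import pathlib
import openvino_genai

package_dir = pathlib.Path(openvino_genai.__file__).parent
libs = sorted(package_dir.glob("*openvino_genai*")) + sorted(package_dir.glob("*py_generate_pipeline*"))
for lib in libs:
    print(lib.name)
```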
__version__ = "0.0.0.0" diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 12cc3136bb..6644f3a35d 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -6,6 +6,7 @@ #include #include #include "openvino/genai/llm_pipeline.hpp" +#include "openvino/genai/tokenizer.hpp" #ifdef _WIN32 # include @@ -34,47 +35,6 @@ std::string get_absolute_file_path(const std::string& path) { } #endif -namespace { - -// dublicates GenAIEnvManager from ov::genai::utils, since -// it was problematic getting access to that on Win - -const char* get_tokenizers_env_name() { return "OPENVINO_TOKENIZERS_PATH_GENAI"; } - -class GenAIEnvManager { -public: - GenAIEnvManager(const std::string& path) { - #ifdef _WIN32 - char* value = nullptr; - size_t len = 0; - _dupenv_s(&value, &len, ::get_tokenizers_env_name()); - if (value == nullptr) - _putenv_s(::get_tokenizers_env_name(), path.c_str()); - #else - if (!getenv(::get_tokenizers_env_name())) - setenv(::get_tokenizers_env_name(), path.c_str(), 1); - #endif - else - was_already_set = true; - } - - ~GenAIEnvManager() { - if (!was_already_set){ - #ifdef _WIN32 - _putenv_s(::get_tokenizers_env_name(), ""); - #else - unsetenv(::get_tokenizers_env_name()); - #endif - } - } - -private: - bool was_already_set; -}; - -} - - namespace py = pybind11; using ov::genai::LLMPipeline; using ov::genai::Tokenizer; @@ -136,44 +96,11 @@ py::object call_with_kwargs(LLMPipeline& pipeline, const std::string& text, cons return call_with_config(pipeline, text, config, kwargs.contains("streamer") ? kwargs["streamer"].cast() : std::monostate()); } -std::filesystem::path with_openvino_tokenizers(const std::filesystem::path& path) { -#ifdef _WIN32 - constexpr char tokenizers[] = "openvino_tokenizers.dll"; -#elif __linux__ - constexpr char tokenizers[] = "libopenvino_tokenizers.so"; -#elif __APPLE__ - constexpr char tokenizers[] = "libopenvino_tokenizers.dylib"; -#endif - return path.parent_path() / tokenizers; -} - -std::string get_ov_genai_bindings_path() { -#ifdef _WIN32 - CHAR genai_library_path[MAX_PATH]; - HMODULE hm = NULL; - if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, - reinterpret_cast(get_ov_genai_bindings_path), - &hm)) { - std::stringstream ss; - ss << "GetModuleHandle returned " << GetLastError(); - throw std::runtime_error(ss.str()); - } - GetModuleFileNameA(hm, (LPSTR)genai_library_path, sizeof(genai_library_path)); - return std::string(genai_library_path); -#elif defined(__APPLE__) || defined(__linux__) || defined(__EMSCRIPTEN__) - Dl_info info; - dladdr(reinterpret_cast(get_ov_genai_bindings_path), &info); - return get_absolute_file_path(info.dli_fname).c_str(); -#else -# error "Unsupported OS" -#endif // _WIN32 -} - std::string ov_tokenizers_module_path() { // Try a path relative to build artifacts folder first. 
- std::filesystem::path from_library = with_openvino_tokenizers(get_ov_genai_bindings_path()); - if (std::filesystem::exists(from_library)) { - return from_library.string(); + std::filesystem::path from_relative = ov::genai::tokenizers_relative_to_genai(); + if (std::filesystem::exists(from_relative)) { + return from_relative.string(); } return py::str(py::module_::import("openvino_tokenizers").attr("_ext_path")); } @@ -217,7 +144,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { py::class_(m, "LLMPipeline") .def(py::init([](const std::string& model_path, const std::string& device) { - ::GenAIEnvManager env_manager(ov_tokenizers_module_path()); + ov::genai::ScopedVar env_manager(ov_tokenizers_module_path()); return std::make_unique(model_path, device);}), py::arg("model_path"), "path to the model path", py::arg("device") = "CPU", "device on which inference will be done", @@ -241,7 +168,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def(py::init([](py::object infer_request, const Tokenizer& tokenizer, OptionalGenerationConfig config) { - ::GenAIEnvManager env_manager(ov_tokenizers_module_path()); + ov::genai::ScopedVar env_manager(ov_tokenizers_module_path()); return std::make_unique(get_request_from_pyobj(infer_request), tokenizer, config); }), py::arg("infer_request"), "infer_request", diff --git a/tests/python_tests/list_test_models.py b/tests/python_tests/list_test_models.py index b45844f2a7..514b2e5326 100644 --- a/tests/python_tests/list_test_models.py +++ b/tests/python_tests/list_test_models.py @@ -1,6 +1,12 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import pathlib + def models_list(): model_ids = [ - ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "TinyLlama-1.1B-Chat-v1.0"), + ("katuni4ka/tiny-random-phi3", "tiny-random-phi3"), + # ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "TinyLlama-1.1B-Chat-v1.0"), # ("microsoft/phi-1_5", "phi-1_5/"), # ("google/gemma-2b-it", "gemma-2b-it"), @@ -13,8 +19,8 @@ def models_list(): # ("databricks/dolly-v2-12b", "dolly-v2-12b"), ] import os - prefix = os.getenv('GENAI_MODELS_PATH_PREFIX', '') - return [(model_id, os.path.join(prefix, model_path)) for model_id, model_path in model_ids] + prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) + return [(model_id, prefix / model_path) for model_id, model_path in model_ids] if __name__ == "__main__": diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt index e536fd531e..fa7db3f2e8 100644 --- a/tests/python_tests/requirements.txt +++ b/tests/python_tests/requirements.txt @@ -1,4 +1,3 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +optimum[openvino]==1.20.0 pytest -transformers -torch -optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel.git@fb1b35bef23242d65b2fb057c4a7ac78a7cfd4c3 \ No newline at end of file diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 953059fcaa..4b9a8c5bca 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -1,30 +1,45 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import functools +import openvino import openvino_genai +import openvino_tokenizers +import optimum.intel from openvino_genai import StopCriteria import pytest +import transformers from list_test_models import models_list from typing import Union, List, Dict -@pytest.fixture(scope="module", params=models_list(), - ids=lambda param: param[0].split('/', 1)[1] if '/' in param[0] 
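Editor's note: `ov_tokenizers_module_path()` now prefers the tokenizers extension copied next to the genai library and only then falls back to the installed openvino_tokenizers wheel, which exposes its extension location as `_ext_path`. The lookup order, restated in Python (the function name and `path_next_to_genai` parameter are placeholders standing in for the result of `tokenizers_relative_to_genai()`):

```python
# Rough restatement of the fallback implemented in ov_tokenizers_module_path().
import os
import openvino_tokenizers


def resolve_tokenizers_extension(path_next_to_genai: str) -> str:
    # Prefer the extension placed next to the genai binaries at build time.
    if os.path.exists(path_next_to_genai):
        return path_next_to_genai
    # Otherwise fall back to the openvino_tokenizers wheel.
    return openvino_tokenizers._ext_path
```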
else param[0]) -def model_fixture(request): - model_id, path = request.param - from transformers import AutoTokenizer, AutoModelForCausalLM - tokenizer = AutoTokenizer.from_pretrained(model_id) - model = AutoModelForCausalLM.from_pretrained(model_id) - yield model_id, path, tokenizer, model - - import gc - del tokenizer - del model - gc.collect() +@functools.lru_cache(1) +def read_model(params): + model_id, path = params + tokenizer = transformers.AutoTokenizer.from_pretrained(model_id) + if not (path / 'openvino_model.xml').is_file(): + ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(tokenizer, with_detokenizer=True) + openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml") + openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml") + optimum.intel.openvino.OVModelForCausalLM.from_pretrained( + model_id, export=True, trust_remote_code=True, + compile=False, device='CPU', load_in_8bit=False + ).save_pretrained(path) + # Return AutoModelForCausalLM instead of OVModelForCausalLM because + # there's no way to disable mmap for now. That prohibits the same + # model from being opened twice at the same time. + return ( + model_id, + path, + tokenizer, + transformers.AutoModelForCausalLM.from_pretrained( + model_id, trust_remote_code=True), + openvino_genai.LLMPipeline(str(path)), + ) -def run_hf_ov_genai_comparison_batched(model_fixture, generation_config: Dict, prompts: Union[str, List[str]]): - model_id, path, tokenizer, model = model_fixture +def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, prompts: Union[str, List[str]]): + model_id, path, tokenizer, model, pipe = model_descr device = 'CPU' config = generation_config.copy() # to avoid side effects @@ -58,7 +73,7 @@ def run_hf_ov_genai_comparison_batched(model_fixture, generation_config: Dict, p hf_outputs.append(tokenizer.decode(hf_encoded_out[prompt_ids[prompt_count].shape[0]:], skip_special_tokens=True)) import openvino_genai as ov_genai - pipe = ov_genai.LLMPipeline(path, device) + pipe = ov_genai.LLMPipeline(str(path), device) config['num_return_sequences'] = num_beams * len(prompts) ov_outputs = pipe.generate(prompts, **config) @@ -71,10 +86,9 @@ def run_hf_ov_genai_comparison_batched(model_fixture, generation_config: Dict, p print(f'ov_output: {ov_output}') assert hf_output == ov_output - -def run_hf_ov_genai_comparison(model_fixture, generation_config: Dict, prompt): +def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt): device = 'CPU' - model_id, path, tokenizer, model = model_fixture + model_id, path, tokenizer, model, pipe = model_descr config = generation_config.copy() # to avoid side effects @@ -93,7 +107,7 @@ def run_hf_ov_genai_comparison(model_fixture, generation_config: Dict, prompt): hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:]) import openvino_genai as ov_genai - pipe = ov_genai.LLMPipeline(path, device) + pipe = ov_genai.LLMPipeline(str(path), device) ov_output = pipe.generate(prompt, **config) if config.get('num_return_sequences', 1) > 1: @@ -127,24 +141,30 @@ def stop_criteria_map(): (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.5), 'The Sun is yellow because'), ] @pytest.mark.parametrize("generation_config,prompt", test_cases) +@pytest.mark.parametrize("model_descr", models_list()) @pytest.mark.precommit -def test_decoding(model_fixture, generation_config, prompt): - run_hf_ov_genai_comparison(model_fixture, generation_config, prompt) +def 
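Editor's note: `read_model()` replaces the per-module fixture. It converts the tokenizer, exports the OV model once per `(model_id, path)` tuple, and `functools.lru_cache(1)` makes later calls with the same tuple reuse the already-built objects instead of re-exporting. A usage sketch, assuming it runs inside test_generate_api.py where `read_model` and `models_list` are defined:

```python
# Usage sketch: export and tokenizer conversion run once; the second call hits the cache.
first = read_model(models_list()[0])
second = read_model(models_list()[0])
assert first is second  # functools.lru_cache(1) returns the identical cached tuple
model_id, path, tokenizer, hf_model, pipe = first
```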
test_decoding(model_descr, generation_config, prompt): + run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) test_configs = [ dict(max_new_tokens=20), - dict( max_new_tokens=20, num_beam_groups=3, num_beams=15,diversity_penalty=1.0) + dict(max_new_tokens=20, num_beam_groups=3, num_beams=15, diversity_penalty=1.0) ] batched_prompts = [['table is made of', 'They sky is blue because', 'Difference between Jupiter and Mars is that'], ['hello', 'Here is the longest nowel ever: '], ['Alan Turing was a', 'return 0', '你好! 你好嗎?']] @pytest.mark.parametrize("generation_config", test_configs) @pytest.mark.parametrize("prompts", batched_prompts) +@pytest.mark.parametrize("model_descr", models_list()) @pytest.mark.precommit -def test_multibatch(model_fixture, generation_config, prompts): +@pytest.mark.xfail( + raises=AssertionError, reason="assert hf_output == ov_output fails", + strict=True, +) +def test_multibatch(model_descr, generation_config, prompts): generation_config['pad_token_id'] = 2 - run_hf_ov_genai_comparison_batched(model_fixture, generation_config, prompts) + run_hf_ov_genai_comparison_batched(read_model(model_descr), generation_config, prompts) prompts = ['The Sun is yellow because', 'Difference between Jupiter and Mars is that', 'table is made of'] @@ -153,8 +173,9 @@ def test_multibatch(model_fixture, generation_config, prompts): @pytest.mark.parametrize("max_new_tokens", [20, 15]) @pytest.mark.parametrize("diversity_penalty", [1.0 , 1.5]) @pytest.mark.parametrize("prompt", prompts) +@pytest.mark.parametrize("model_descr", models_list()) @pytest.mark.precommit -def test_beam_search_decoding(model_fixture, num_beam_groups, group_size, +def test_beam_search_decoding(model_descr, num_beam_groups, group_size, max_new_tokens, diversity_penalty, prompt): generation_config = dict( num_beam_groups=num_beam_groups, @@ -163,14 +184,15 @@ def test_beam_search_decoding(model_fixture, num_beam_groups, group_size, num_return_sequences=num_beam_groups * group_size, max_new_tokens=max_new_tokens, ) - run_hf_ov_genai_comparison(model_fixture, generation_config, prompt) + run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) @pytest.mark.parametrize("stop_criteria", [StopCriteria.NEVER, StopCriteria.EARLY, StopCriteria.HEURISTIC]) @pytest.mark.parametrize("prompt", prompts) @pytest.mark.parametrize("max_new_tokens", [10, 80]) +@pytest.mark.parametrize("model_descr", models_list()) @pytest.mark.precommit -def test_stop_criteria(model_fixture, stop_criteria, prompt, max_new_tokens): +def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens): # todo: with EARLY stop_criteria looks like HF return unvalid out with sentence # while genai ends sentence with if (stop_criteria == StopCriteria.EARLY): @@ -183,7 +205,7 @@ def test_stop_criteria(model_fixture, stop_criteria, prompt, max_new_tokens): max_new_tokens=max_new_tokens, stop_criteria=stop_criteria, ) - run_hf_ov_genai_comparison(model_fixture, generation_config, prompt) + run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) # test long sequences @@ -191,8 +213,10 @@ def test_stop_criteria(model_fixture, stop_criteria, prompt, max_new_tokens): @pytest.mark.parametrize("group_size", [5]) @pytest.mark.parametrize("max_new_tokens", [800, 2000]) @pytest.mark.parametrize("prompt", prompts) +@pytest.mark.parametrize("model_descr", models_list()) +@pytest.mark.skip(reason="Will be enabled in nightly since the test are computationally expensive") @pytest.mark.nightly 
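Editor's note: the comparison helpers feed one configuration dict to both backends, which is why the test cases above are plain dicts. A condensed sketch of that pattern (assuming the test module scope; the real helpers additionally strip the prompt tokens from the HF output and compare the decoded texts one by one):

```python
# Condensed sketch of the HF-vs-GenAI comparison driven by a single config dict.
config = dict(num_beam_groups=3, num_beams=15, diversity_penalty=1.0,
              num_return_sequences=15, max_new_tokens=20)
model_id, path, tokenizer, hf_model, pipe = read_model(models_list()[0])

prompt = 'The Sun is yellow because'
encoded_prompt = tokenizer.encode(prompt, return_tensors='pt')
hf_encoded_output = hf_model.generate(encoded_prompt, **config)
ov_output = pipe.generate(prompt, **config)
```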
-def test_beam_search_long_sentences(model_fixture, num_beam_groups, group_size, +def test_beam_search_long_sentences(model_descr, num_beam_groups, group_size, max_new_tokens, prompt): generation_config = dict( num_beam_groups=num_beam_groups, @@ -201,7 +225,7 @@ def test_beam_search_long_sentences(model_fixture, num_beam_groups, group_size, num_return_sequences=num_beam_groups * group_size, max_new_tokens=max_new_tokens, ) - run_hf_ov_genai_comparison(model_fixture, generation_config, prompt) + run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) def user_defined_callback(subword): @@ -210,30 +234,32 @@ def user_defined_callback(subword): @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit -def test_callback_one_string(model_fixture, callback): - pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU') - pipe.generate('', openvino_genai.GenerationConfig(), callback) +def test_callback_one_string(callback): + pipe = read_model(models_list()[0])[4] + generation_config = pipe.get_generation_config() + generation_config.max_new_tokens = 10 + pipe.generate('', generation_config, callback) @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit -def test_callback_batch_fail(model_fixture, callback): - pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU') +def test_callback_batch_fail(callback): + pipe = read_model(models_list()[0])[4] with pytest.raises(RuntimeError): pipe.generate(['1', '2'], openvino_genai.GenerationConfig(), callback) @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit -def test_callback_kwargs_one_string(model_fixture, callback): - pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU') +def test_callback_kwargs_one_string(callback): + pipe = read_model(models_list()[0])[4] pipe.generate('', max_new_tokens=10, streamer=callback) @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit -def test_callback_kwargs_batch_fail(model_fixture, callback): - pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU') +def test_callback_kwargs_batch_fail(callback): + pipe = read_model(models_list()[0])[4] with pytest.raises(RuntimeError): pipe.generate(['1', '2'], max_new_tokens=10, streamer=callback) @@ -250,30 +276,32 @@ def end(self): @pytest.mark.precommit -def test_streamer_one_string(model_fixture): - pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU') +def test_streamer_one_string(): + pipe = read_model(models_list()[0])[4] + generation_config = pipe.get_generation_config() + generation_config.max_new_tokens = 10 printer = Printer(pipe.get_tokenizer()) - pipe.generate('', openvino_genai.GenerationConfig(), printer) + pipe.generate('', generation_config, printer) @pytest.mark.precommit -def test_streamer_batch_fail(model_fixture): - pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU') +def test_streamer_batch_fail(): + pipe = read_model(models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) with pytest.raises(RuntimeError): pipe.generate(['1', '2'], openvino_genai.GenerationConfig(), printer) @pytest.mark.precommit -def test_streamer_kwargs_one_string(model_fixture): - pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU') +def test_streamer_kwargs_one_string(): + pipe = read_model(models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) - 
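Editor's note: the callback tests now obtain their pipeline from `read_model()` (index 4 of the returned tuple) and cap generation via `get_generation_config()`. The streaming hook itself is unchanged: any callable accepting the next decoded subword can be passed as the streamer. A usage sketch based on the calls exercised in these tests:

```python
# Usage sketch: stream tokens from the cached pipeline with a plain callable.
pipe = read_model(models_list()[0])[4]   # the openvino_genai.LLMPipeline element
config = pipe.get_generation_config()
config.max_new_tokens = 10
pipe.generate('The Sun is yellow because', config,
              lambda subword: print(subword, end='', flush=True))
```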
pipe.generate('', do_sample=True, streamer=printer) + pipe.generate('', max_new_tokens=10, do_sample=True, streamer=printer) @pytest.mark.precommit -def test_streamer_kwargs_batch_fail(model_fixture): - pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU') +def test_streamer_kwargs_batch_fail(): + pipe = read_model(models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) with pytest.raises(RuntimeError): pipe.generate('', num_beams=2, streamer=printer) @@ -281,29 +309,31 @@ def test_streamer_kwargs_batch_fail(model_fixture): @pytest.mark.precommit @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) -def test_operator_with_callback_one_string(model_fixture, callback): - pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU') - pipe('', openvino_genai.GenerationConfig(), callback) +def test_operator_with_callback_one_string(callback): + pipe = read_model(models_list()[0])[4] + ten_tokens = pipe.get_generation_config() + ten_tokens.max_new_tokens = 10 + pipe('', ten_tokens, callback) @pytest.mark.precommit @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) -def test_operator_with_callback_batch_fail(model_fixture, callback): - pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU') - with pytest.raises(Exception): +def test_operator_with_callback_batch_fail(callback): + pipe = read_model(models_list()[0])[4] + with pytest.raises(TypeError): pipe(['1', '2'], openvino_genai.GenerationConfig(), callback) @pytest.mark.precommit -def test_operator_with_streamer_kwargs_one_string(model_fixture): - pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU') +def test_operator_with_streamer_kwargs_one_string(): + pipe = read_model(models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) - pipe('', do_sample=True, streamer=printer) + pipe('', max_new_tokens=10, do_sample=True, streamer=printer) @pytest.mark.precommit -def test_operator_with_streamer_kwargs_batch_fail(model_fixture): - pipe = openvino_genai.LLMPipeline(model_fixture[1], 'CPU') +def test_operator_with_streamer_kwargs_batch_fail(): + pipe = read_model(models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) with pytest.raises(RuntimeError): pipe('', num_beams=2, streamer=printer) diff --git a/text_generation/causal_lm/cpp/README.md b/text_generation/causal_lm/cpp/README.md index 08b91ab70e..21f3a066a4 100644 --- a/text_generation/causal_lm/cpp/README.md +++ b/text_generation/causal_lm/cpp/README.md @@ -55,15 +55,6 @@ This approach reduces the need for multiple infer requests to the main model, en Install [OpenVINO Archives >= 2024.1](docs.openvino.ai/install). `master` and possibly the latest `releases/*` branch correspond to not yet released OpenVINO versions. https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/ can be used for these branches early testing. `` below refers to the extraction location. -## Install `libtbb-dev` on Linux - -> [!NOTE] -> `tbb` development files are installed with OpenVINO Archive on Windows and macOS. 
- -```sh -sudo apt-get install libtbb-dev -``` - ## Build `greedy_causal_lm`, `beam_search_causal_lm` and `openvino_tokenizers` ### Linux/macOS diff --git a/text_generation/causal_lm/cpp/requirements.txt b/text_generation/causal_lm/cpp/requirements.txt index e1c10930ad..d16301ad3e 100644 --- a/text_generation/causal_lm/cpp/requirements.txt +++ b/text_generation/causal_lm/cpp/requirements.txt @@ -1,5 +1,4 @@ --extra-index-url https://download.pytorch.org/whl/cpu -optimum[openvino]==1.19.2 -optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel.git@fb1b35bef23242d65b2fb057c4a7ac78a7cfd4c3 +optimum[openvino]==1.20.0 einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen diff --git a/thirdparty/CMakeLists.txt b/thirdparty/CMakeLists.txt new file mode 100644 index 0000000000..3e2f7deaf2 --- /dev/null +++ b/thirdparty/CMakeLists.txt @@ -0,0 +1,34 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +add_subdirectory(./openvino_tokenizers/ "${CMAKE_BINARY_DIR}/openvino_tokenizers/") +# Put binaries to a single dir to mimic package structure. +set_target_properties(openvino_tokenizers PROPERTIES + # Generator expressions to disable appending a per-configuration subdirectory (Release, Debug). + # ARCHIVE_OUTPUT is irrelevant. It's here just to keep all the artifacts in one place. + ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" +) +if(TARGET core_tokenizers) + set_target_properties(core_tokenizers PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + ) +else() + # Prebuilt dependencies + if(WIN32) + set(extra_libs "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/lib/core_tokenizers.dll" + "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/third_party/lib/icudt70.dll" + "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/third_party/lib/icuuc70.dll") + elseif(LINUX) + set(extra_libs "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/lib/libcore_tokenizers.so") + elseif(APPLE) + set(extra_libs "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/lib/libcore_tokenizers.dylib") + endif() + add_custom_command(OUTPUT "${extra_libs}" + COMMAND "${CMAKE_COMMAND}" -E copy "${extra_libs}" "${CMAKE_BINARY_DIR}/openvino_genai/" + DEPENDS openvino_tokenizers) +endif()