diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 13ea4d754c..9856294340 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -16,10 +16,10 @@ concurrency: cancel-in-progress: true env: - l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-17130-65a8f393ba0/l_openvino_toolkit_ubuntu20_2024.5.0.dev20241024_x86_64.tgz - l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-17130-65a8f393ba0/l_openvino_toolkit_ubuntu22_2024.5.0.dev20241024_x86_64.tgz - m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16993-9c432a3641a/m_openvino_toolkit_macos_12_6_2024.5.0.dev20241014_x86_64.tgz - w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-17130-65a8f393ba0/w_openvino_toolkit_windows_2024.5.0.dev20241024_x86_64.zip + l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-17180-08365f6fddc/l_openvino_toolkit_ubuntu20_2024.5.0.dev20241028_x86_64.tgz + l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-17180-08365f6fddc/l_openvino_toolkit_ubuntu22_2024.5.0.dev20241028_x86_64.tgz + m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-17180-08365f6fddc/m_openvino_toolkit_macos_12_6_2024.5.0.dev20241028_x86_64.tgz + w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-17180-08365f6fddc/w_openvino_toolkit_windows_2024.5.0.dev20241028_x86_64.zip jobs: cpp-multinomial-greedy_causal_lm-ubuntu: runs-on: ubuntu-20.04-8-cores @@ -46,7 +46,7 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 optimum-cli export openvino -m TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T @@ -105,7 +105,7 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - name: Compare @@ -241,7 +241,7 @@ jobs: - name: Download and convert model run: | call .\ov\setupvars.bat - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] 
--pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 optimum-cli export openvino -m TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T @@ -299,7 +299,7 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat - run: > @@ -333,7 +333,7 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat - run: > @@ -368,7 +368,7 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 - run: > @@ -403,7 +403,7 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 - run: > @@ -438,7 +438,7 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b @@ -488,7 +488,7 @@ jobs: - name: Download and convert and model 
run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat --task text-generation-with-past @@ -560,7 +560,7 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5 - name: Run Generation @@ -615,7 +615,7 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat - name: Run Generation @@ -670,7 +670,7 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - name: Compare @@ -736,7 +736,7 @@ jobs: run: | source ./ov/setupvars.sh python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt opencv-python --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt opencv-python - name: Download and convert MiniCPM-V-2_6 model and an image run: | python -m pip install git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv @@ -860,7 +860,7 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r 
./samples/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - name: Run gtests @@ -906,7 +906,7 @@ jobs: - name: Download and convert and model run: | call .\ov\setupvars.bat - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - name: Run gtests @@ -951,7 +951,7 @@ jobs: - name: Download and convert and model run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - name: Run gtests diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml index 1598cf1597..d806bf4d79 100644 --- a/.github/workflows/lcm_dreamshaper_cpp.yml +++ b/.github/workflows/lcm_dreamshaper_cpp.yml @@ -18,8 +18,8 @@ concurrency: env: PYTHON_VERSION: '3.9' - LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-17130-65a8f393ba0/l_openvino_toolkit_ubuntu22_2024.5.0.dev20241024_x86_64.tgz - WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-17130-65a8f393ba0/w_openvino_toolkit_windows_2024.5.0.dev20241024_x86_64.zip + LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-17180-08365f6fddc/l_openvino_toolkit_ubuntu22_2024.5.0.dev20241028_x86_64.tgz + WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-17180-08365f6fddc/w_openvino_toolkit_windows_2024.5.0.dev20241028_x86_64.zip OV_INSTALL_DIR: ${{ github.workspace }}/ov jobs: @@ -60,7 +60,7 @@ jobs: run: | source openvino_lcm_cpp/bin/activate python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt - name: Download and convert models and tokenizer run: | @@ -120,7 +120,7 @@ jobs: run: | . 
"./openvino_lcm_cpp/Scripts/Activate.ps1" python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt - name: Download and convert models and tokenizer run: | diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml index 2a52a0e839..db7dd91a16 100644 --- a/.github/workflows/stable_diffusion_1_5_cpp.yml +++ b/.github/workflows/stable_diffusion_1_5_cpp.yml @@ -18,8 +18,8 @@ concurrency: env: PYTHON_VERSION: '3.10' - LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-17130-65a8f393ba0/l_openvino_toolkit_ubuntu20_2024.5.0.dev20241024_x86_64.tgz - WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-17130-65a8f393ba0/w_openvino_toolkit_windows_2024.5.0.dev20241024_x86_64.zip + LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-17180-08365f6fddc/l_openvino_toolkit_ubuntu20_2024.5.0.dev20241028_x86_64.tgz + WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-17180-08365f6fddc/w_openvino_toolkit_windows_2024.5.0.dev20241028_x86_64.zip OV_INSTALL_DIR: ${{ github.workspace }}/ov jobs: @@ -60,7 +60,7 @@ jobs: run: | source openvino_sd_cpp/bin/activate python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt - name: Download and convert models and tokenizer run: | @@ -134,7 +134,7 @@ jobs: run: | . "./openvino_sd_cpp/Scripts/Activate.ps1" python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt - name: Download and convert models and tokenizer run: | diff --git a/samples/cpp/beam_search_causal_lm/README.md b/samples/cpp/beam_search_causal_lm/README.md index 41bfee3942..947160e092 100644 --- a/samples/cpp/beam_search_causal_lm/README.md +++ b/samples/cpp/beam_search_causal_lm/README.md @@ -6,15 +6,17 @@ This example showcases inference of text-generation Large Language Models (LLMs) The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. +Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. 
```sh -pip install --upgrade-strategy eager -r ../../requirements.txt +pip install --upgrade-strategy eager -r ../../export-requirements.txt optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 ``` ## Run + Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. + `beam_search_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` diff --git a/samples/cpp/benchmark_genai/README.md b/samples/cpp/benchmark_genai/README.md index 1a46db05d9..d7b3f6ac21 100644 --- a/samples/cpp/benchmark_genai/README.md +++ b/samples/cpp/benchmark_genai/README.md @@ -6,7 +6,7 @@ This sample script demonstrates how to benchmark an LLMs in OpenVINO GenAI. The The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. +It's not required to install [../../export-requirements.txt](../../export-requirements.txt) for deployment if the model has already been exported. ```sh pip install --upgrade-strategy eager -r ../../requirements.txt @@ -15,6 +15,8 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B ## Usage + Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. + ```sh benchmark_genai [OPTIONS] ``` diff --git a/samples/cpp/chat_sample/README.md b/samples/cpp/chat_sample/README.md index 3f736985c2..bdc1d294ee 100644 --- a/samples/cpp/chat_sample/README.md +++ b/samples/cpp/chat_sample/README.md @@ -6,7 +6,7 @@ This example showcases inference of text-generation Large Language Models (LLMs) The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. +It's not required to install [../../export-requirements.txt](../../export-requirements.txt) for deployment if the model has already been exported. ```sh pip install --upgrade-strategy eager -r ../../requirements.txt @@ -15,6 +15,8 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B ## Run: + Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. + `chat_sample TinyLlama-1.1B-Chat-v1.0` diff --git a/samples/cpp/greedy_causal_lm/README.md b/samples/cpp/greedy_causal_lm/README.md index 79852e0d10..2f3a7751bf 100644 --- a/samples/cpp/greedy_causal_lm/README.md +++ b/samples/cpp/greedy_causal_lm/README.md @@ -6,7 +6,7 @@ This example showcases inference of text-generation Large Language Models (LLMs) The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. +It's not required to install [../../export-requirements.txt](../../export-requirements.txt) for deployment if the model has already been exported.
```sh pip install --upgrade-strategy eager -r ../../requirements.txt @@ -15,6 +15,8 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B ## Run + Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. + `greedy_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` diff --git a/samples/cpp/multinomial_causal_lm/README.md b/samples/cpp/multinomial_causal_lm/README.md index 21c9a07e77..35ca054fdd 100644 --- a/samples/cpp/multinomial_causal_lm/README.md +++ b/samples/cpp/multinomial_causal_lm/README.md @@ -6,7 +6,7 @@ This example showcases inference of text-generation Large Language Models (LLMs) The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. +It's not required to install [../../export-requirements.txt](../../export-requirements.txt) for deployment if the model has already been exported. ```sh pip install --upgrade-strategy eager -r ../../requirements.txt @@ -15,6 +15,8 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B ## Run + Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. + `multinomial_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` diff --git a/samples/cpp/prompt_lookup_decoding_lm/README.md b/samples/cpp/prompt_lookup_decoding_lm/README.md index c5517c5bf6..2057ff2c6f 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/README.md +++ b/samples/cpp/prompt_lookup_decoding_lm/README.md @@ -8,7 +8,7 @@ This example showcases inference of text-generation Large Language Models (LLMs) The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. +It's not required to install [../../export-requirements.txt](../../export-requirements.txt) for deployment if the model has already been exported. ```sh source /setupvars.sh @@ -18,6 +18,8 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B ## Run + Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. + `prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "return 0;"` diff --git a/samples/cpp/speculative_decoding_lm/README.md b/samples/cpp/speculative_decoding_lm/README.md index 48d8e50c92..c8e52c75ab 100644 --- a/samples/cpp/speculative_decoding_lm/README.md +++ b/samples/cpp/speculative_decoding_lm/README.md @@ -12,7 +12,7 @@ This example showcases inference of text-generation Large Language Models (LLMs) The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. +It's not required to install [../../export-requirements.txt](../../export-requirements.txt) for deployment if the model has already been exported.
```sh pip install --upgrade-strategy eager -r ../../requirements.txt @@ -22,6 +22,8 @@ optimum-cli export openvino --trust-remote-code --model meta-llama/Llama-2-7b-ch ## Run + Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. + `speculative_decoding_lm TinyLlama-1.1B-Chat-v1.0 Llama-2-7b-chat-hf "Why is the Sun yellow?"` diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index 12f51df0eb..66758531da 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -12,7 +12,7 @@ int main(int argc, char* argv[]) try { ov::genai::GenerationConfig config; config.max_new_tokens = 100; - // Speculative decoding generation parameters are mutually excluded + // Speculative decoding generation parameters like `num_assistant_tokens` and `assistant_confidence_threshold` are mutually exclusive // add parameter to enable speculative decoding to generate `num_assistant_tokens` candidates by draft_model per iteration config.num_assistant_tokens = 5; // add parameter to enable speculative decoding to generate candidates by draft_model while candidate probability is higher than `assistant_confidence_threshold` diff --git a/samples/cpp/text2image/README.md b/samples/cpp/text2image/README.md index fa58838dad..5f1388683f 100644 --- a/samples/cpp/text2image/README.md +++ b/samples/cpp/text2image/README.md @@ -19,7 +19,7 @@ Users can change the sample code and play with the following generation paramete The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. +It's not required to install [../../export-requirements.txt](../../export-requirements.txt) for deployment if the model has already been exported. ```sh pip install --upgrade-strategy eager -r ../../requirements.txt @@ -28,6 +28,8 @@ optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task sta ## Run + Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. + `stable_diffusion ./dreamlike_anime_1_0_ov/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'` ### Examples diff --git a/samples/cpp/visual_language_chat/README.md b/samples/cpp/visual_language_chat/README.md index 99ba417baf..96b1c78ec0 100644 --- a/samples/cpp/visual_language_chat/README.md +++ b/samples/cpp/visual_language_chat/README.md @@ -6,7 +6,7 @@ This example showcases inference of Visual language models (VLMs): [`openbmb/Min The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. +It's not required to install [../../export-requirements.txt](../../export-requirements.txt) for deployment if the model has already been exported.
```sh pip install --upgrade-strategy eager -r ../../requirements.txt @@ -15,6 +15,8 @@ optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code Mi ## Run + Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. + [This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image. `visual_language_chat miniCPM-V-2_6 319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg` diff --git a/samples/cpp/whisper_speech_recognition/README.md b/samples/cpp/whisper_speech_recognition/README.md index af779aab9e..773135b648 100644 --- a/samples/cpp/whisper_speech_recognition/README.md +++ b/samples/cpp/whisper_speech_recognition/README.md @@ -6,7 +6,7 @@ This example showcases inference of speech recognition Whisper Models. The appli The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. +It's not required to install [../../export-requirements.txt](../../export-requirements.txt) for deployment if the model has already been exported. ```sh pip install --upgrade-strategy eager -r ../../requirements.txt @@ -21,6 +21,8 @@ You can download example audio file: https://storage.openvinotoolkit.org/models_ ## Run + Follow [Get Started with Samples](https://docs.openvino.ai/2024/learn-openvino/openvino-samples/get-started-demos.html) to run the sample. + `whisper_speech_recognition whisper-base how_are_you_doing_today.wav` Output: diff --git a/samples/deployment-requirements.txt b/samples/deployment-requirements.txt new file mode 100644 index 0000000000..c94f8d6a13 --- /dev/null +++ b/samples/deployment-requirements.txt @@ -0,0 +1,5 @@ +--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly +--pre +openvino_genai~=2024.5.0.0.dev +librosa==0.10.2 # For Whisper +pillow==11.0.0 # Image processing diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt new file mode 100644 index 0000000000..9850dfd6e0 --- /dev/null +++ b/samples/export-requirements.txt @@ -0,0 +1,11 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly +--pre +openvino-tokenizers~=2024.5.0.0.dev +optimum-intel @ git+https://github.com/huggingface/optimum-intel.git +numpy<2.0.0; sys_platform == 'darwin' +einops==0.8.0 # For Qwen +transformers_stream_generator==0.0.5 # For Qwen +diffusers==0.31.0 +timm==1.0.11 # For exporting InternVL2 +torchvision # For visual language models diff --git a/samples/python/beam_search_causal_lm/README.md b/samples/python/beam_search_causal_lm/README.md index 8dd06f264c..fac6a26e8e 100644 --- a/samples/python/beam_search_causal_lm/README.md +++ b/samples/python/beam_search_causal_lm/README.md @@ -6,16 +6,18 @@ This example showcases inference of text-generation Large Language Models (LLMs) The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. +Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model.
```sh -pip install --upgrade-strategy eager -r ../../requirements.txt +pip install --upgrade-strategy eager -r ../../export-requirements.txt optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 ``` ## Run -`beam_search_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` +Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: + +`python beam_search_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. diff --git a/samples/python/benchmark_genai/README.md b/samples/python/benchmark_genai/README.md index 9baf17c4d7..95f24b6eca 100644 --- a/samples/python/benchmark_genai/README.md +++ b/samples/python/benchmark_genai/README.md @@ -6,15 +6,18 @@ This sample script demonstrates how to benchmark an LLMs in OpenVINO GenAI. The The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. +Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. ```sh -pip install --upgrade-strategy eager -r ../../requirements.txt +pip install --upgrade-strategy eager -r ../../export-requirements.txt optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 ``` + ## Usage +Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: + ```sh python benchmark_genai.py [OPTIONS] ``` diff --git a/samples/python/chat_sample/README.md b/samples/python/chat_sample/README.md index dc2c39b3a5..7e3c206431 100644 --- a/samples/python/chat_sample/README.md +++ b/samples/python/chat_sample/README.md @@ -6,16 +6,18 @@ This example showcases inference of text-generation Large Language Models (LLMs) The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. +Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. ```sh -pip install --upgrade-strategy eager -r ../../requirements.txt +pip install --upgrade-strategy eager -r ../../export-requirements.txt optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 ``` ## Run: -`chat_sample.py TinyLlama-1.1B-Chat-v1.0` +Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: + +`python chat_sample.py TinyLlama-1.1B-Chat-v1.0` Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. 
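As a point of reference for the Python sample READMEs above: once `deployment-requirements.txt` is installed and a model directory has been produced by `optimum-cli`, the samples reduce to a few `openvino_genai` calls. Below is a minimal sketch, assuming the converted `TinyLlama-1.1B-Chat-v1.0` directory from the commands above and the CPU device; the real samples add argument parsing and a streamer, so treat this as an illustration rather than the shipped code.

```python
import openvino_genai

# Assumes the openvino_genai wheel pulled in by deployment-requirements.txt and a
# model folder exported with optimum-cli, as in the README snippets above.
pipe = openvino_genai.LLMPipeline("TinyLlama-1.1B-Chat-v1.0", "CPU")

config = openvino_genai.GenerationConfig()
config.max_new_tokens = 100

pipe.start_chat()
try:
    # generate() returns the decoded completion for a single string prompt.
    print(pipe.generate("Why is the Sun yellow?", config))
finally:
    pipe.finish_chat()
```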
diff --git a/samples/python/greedy_causal_lm/README.md b/samples/python/greedy_causal_lm/README.md index 7aeabe9ac2..a634e21cb0 100644 --- a/samples/python/greedy_causal_lm/README.md +++ b/samples/python/greedy_causal_lm/README.md @@ -10,16 +10,18 @@ There are two sample files: The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. +Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. ```sh -pip install --upgrade-strategy eager -r ../../requirements.txt +pip install --upgrade-strategy eager -r ../../export-requirements.txt optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 ``` ## Run -`greedy_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` +Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: + +`python greedy_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. diff --git a/samples/python/multinomial_causal_lm/README.md b/samples/python/multinomial_causal_lm/README.md index 351773ec0d..69a3cd4008 100644 --- a/samples/python/multinomial_causal_lm/README.md +++ b/samples/python/multinomial_causal_lm/README.md @@ -8,16 +8,18 @@ This sample also contains example implementation of an iterable streamer with bu The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. +Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. ```sh -pip install --upgrade-strategy eager -r ../../requirements.txt +pip install --upgrade-strategy eager -r ../../export-requirements.txt optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 ``` ## Run -`multinomial_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` +Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: + +`python multinomial_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. diff --git a/samples/python/speculative_decoding_lm/README.md b/samples/python/speculative_decoding_lm/README.md index c0e38706f9..22df151af3 100644 --- a/samples/python/speculative_decoding_lm/README.md +++ b/samples/python/speculative_decoding_lm/README.md @@ -12,17 +12,20 @@ This example showcases inference of text-generation Large Language Models (LLMs) The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. 
-It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. +Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. + Download assisting and main model to run speculative decoding sample. ```sh -pip install --upgrade-strategy eager -r ../../requirements.txt +pip install --upgrade-strategy eager -r ../../export-requirements.txt optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b ``` ## Run + Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: + `python speculative_decoding_lm.py ./dolly-v2-7b ./dolly-v2-3b "Why is the Sun yellow?"` diff --git a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py index 38dccbf1bb..857941d45b 100755 --- a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py +++ b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py @@ -36,7 +36,11 @@ def main(): config = openvino_genai.GenerationConfig() config.max_new_tokens = 100 + # Speculative decoding generation parameters like `num_assistant_tokens` and `assistant_confidence_threshold` are mutually exclusive + # add parameter to enable speculative decoding to generate `num_assistant_tokens` candidates by draft_model per iteration config.num_assistant_tokens = 5 + # add parameter to enable speculative decoding to generate candidates by draft_model while candidate probability is higher than `assistant_confidence_threshold` + # config.assistant_confidence_threshold = 0.4 # Since the streamer is set, the results will be printed # every time a new token is generated and put into the streamer queue. diff --git a/samples/python/text2image/README.md b/samples/python/text2image/README.md index 3be523b4a8..1a59107e85 100644 --- a/samples/python/text2image/README.md +++ b/samples/python/text2image/README.md @@ -19,15 +19,17 @@ Users can change the sample code and play with the following generation paramete The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. +Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. ```sh -pip install --upgrade-strategy eager -r ../../requirements.txt +pip install --upgrade-strategy eager -r ../../export-requirements.txt optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 dreamlike_anime_1_0_ov/FP16 ``` ## Run + Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: + `python main.py ./dreamlike_anime_1_0_ov/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting"` ### Examples @@ -47,7 +49,7 @@ Here is an example how to run the sample with a single adapter.
First download a Then run `lora.py`: -`python lora.py ./lora_stable_diffusion dreamlike_anime_1_0_ov/FP16 "curly-haired unicorn in the forest, anime, line" soulcard.safetensors 0.7` +`python lora.py ./dreamlike_anime_1_0_ov/FP16 "curly-haired unicorn in the forest, anime, line" soulcard.safetensors 0.7` The sample generates two images with and without adapters applied using the same prompt: - `lora.bmp` with adapters applied diff --git a/samples/python/visual_language_chat/README.md b/samples/python/visual_language_chat/README.md index 06355d9ee5..e8744a8c8f 100644 --- a/samples/python/visual_language_chat/README.md +++ b/samples/python/visual_language_chat/README.md @@ -6,17 +6,20 @@ This example showcases inference of text-generation Vision Language Models (VLMs The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. +Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. ```sh -pip install --upgrade-strategy eager -r ../../requirements.txt +pip install --upgrade-strategy eager -r ../../export-requirements.txt optimum-cli export openvino --model openbmb/MiniCPM-V-2_6 --trust-remote-code MiniCPM-V-2_6 ``` ## Run: + [This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image. -`visual_language_chat.py ./miniCPM-V-2_6/ 319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg` +Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: + +`python visual_language_chat.py ./miniCPM-V-2_6/ 319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg` Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. # TODO: examples of larger models diff --git a/samples/python/whisper_speech_recognition/README.md b/samples/python/whisper_speech_recognition/README.md index e324abfb67..158bd18311 100644 --- a/samples/python/whisper_speech_recognition/README.md +++ b/samples/python/whisper_speech_recognition/README.md @@ -6,10 +6,10 @@ This example showcases inference of speech recognition Whisper Models. The appli The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. +Install [../../export-requirements.txt](../../export-requirements.txt) to convert a model. 
```sh -pip install --upgrade-strategy eager -r ../../requirements.txt +pip install --upgrade-strategy eager -r ../../export-requirements.txt optimum-cli export openvino --trust-remote-code --model openai/whisper-base whisper-base ``` @@ -28,7 +28,9 @@ python recorder.py ## Run the Whisper model -`whisper_speech_recognition whisper-base how_are_you_doing_today.wav` +Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample: + +`python whisper_speech_recognition.py whisper-base how_are_you_doing_today.wav` Output: ``` diff --git a/samples/requirements.txt b/samples/requirements.txt index 2ccd59e609..ae7f6ebe43 100644 --- a/samples/requirements.txt +++ b/samples/requirements.txt @@ -1,9 +1,2 @@ ---extra-index-url https://download.pytorch.org/whl/cpu -optimum-intel @ git+https://github.com/huggingface/optimum-intel.git -numpy<2.0.0; sys_platform == 'darwin' -einops==0.8.0 # For Qwen -transformers_stream_generator==0.0.5 # For Qwen -diffusers==0.30.3 -librosa # For Whisper -torchvision # For visual language models -timm # For exporting InternVL2 +-r ./deployment-requirements.txt +-r ./export-requirements.txt diff --git a/src/cpp/include/openvino/genai/text2image/autoencoder_kl.hpp b/src/cpp/include/openvino/genai/text2image/autoencoder_kl.hpp index b1088f7448..1bb9bf97b4 100644 --- a/src/cpp/include/openvino/genai/text2image/autoencoder_kl.hpp +++ b/src/cpp/include/openvino/genai/text2image/autoencoder_kl.hpp @@ -24,6 +24,7 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL { size_t latent_channels = 4; size_t out_channels = 3; float scaling_factor = 0.18215f; + float shift_factor = 0.0609f; std::vector block_out_channels = { 64 }; explicit Config(const std::filesystem::path& config_path); @@ -48,6 +49,8 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL { AutoencoderKL& compile(const std::string& device, const ov::AnyMap& properties = {}); + const Config& get_config() const; + template ov::util::EnableIfAllStringAny compile( const std::string& device, diff --git a/src/cpp/include/openvino/genai/text2image/pipeline.hpp b/src/cpp/include/openvino/genai/text2image/pipeline.hpp index a8201cf6c9..54d540179b 100644 --- a/src/cpp/include/openvino/genai/text2image/pipeline.hpp +++ b/src/cpp/include/openvino/genai/text2image/pipeline.hpp @@ -19,6 +19,7 @@ #include "openvino/genai/text2image/clip_text_model_with_projection.hpp" #include "openvino/genai/text2image/unet2d_condition_model.hpp" #include "openvino/genai/text2image/autoencoder_kl.hpp" +#include "openvino/genai/text2image/sd3_transformer_2d_model.hpp" namespace ov { namespace genai { @@ -57,7 +58,8 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { LCM, LMS_DISCRETE, DDIM, - EULER_DISCRETE + EULER_DISCRETE, + FLOW_MATCH_EULER_DISCRETE }; static std::shared_ptr from_config(const std::filesystem::path& scheduler_config_path, @@ -67,12 +69,13 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { }; struct OPENVINO_GENAI_EXPORTS GenerationConfig { - // LCM: prompt only w/o negative prompt - // SD XL: prompt2 and negative_prompt2 - // FLUX: prompt2 (prompt if prompt2 is not defined explicitly) - // SD 3: prompt2, prompt3 (with fallback to prompt) and negative_prompt2, negative_prompt3 + // LCM: prompt only w/o negative_prompt + // SD XL: prompt_2 and negative_prompt2 + // FLUX: prompt_2 (prompt if prompt_2 is not defined explicitly) + // SD 3: prompt_2, prompt3 (with fallback to prompt) and negative_prompt_2, negative_prompt_3 std::optional prompt_2 = std::nullopt, 
prompt_3 = std::nullopt; - std::string negative_prompt, negative_prompt_2, negative_prompt_3; + std::string negative_prompt; + std::optional negative_prompt_2 = std::nullopt, negative_prompt_3 = std::nullopt; size_t num_images_per_prompt = 1; @@ -131,6 +134,14 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { const UNet2DConditionModel& unet, const AutoencoderKL& vae_decoder); + // creates SD3 pipeline from building blocks + static Text2ImagePipeline stable_diffusion_3( + const std::shared_ptr& scheduler_type, + const CLIPTextModelWithProjection& clip_text_model_1, + const CLIPTextModelWithProjection& clip_text_model_2, + const SD3Transformer2DModel& transformer, + const AutoencoderKL& vae_decoder); + GenerationConfig get_generation_config() const; void set_generation_config(const GenerationConfig& generation_config); @@ -160,6 +171,7 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { class StableDiffusionPipeline; class StableDiffusionXLPipeline; + class StableDiffusion3Pipeline; }; // diff --git a/src/cpp/include/openvino/genai/text2image/sd3_transformer_2d_model.hpp b/src/cpp/include/openvino/genai/text2image/sd3_transformer_2d_model.hpp new file mode 100644 index 0000000000..674f29168a --- /dev/null +++ b/src/cpp/include/openvino/genai/text2image/sd3_transformer_2d_model.hpp @@ -0,0 +1,77 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include + +#include "openvino/core/any.hpp" +#include "openvino/runtime/infer_request.hpp" +#include "openvino/runtime/properties.hpp" +#include "openvino/runtime/tensor.hpp" + +#include "openvino/genai/visibility.hpp" + +namespace ov { +namespace genai { + +class OPENVINO_GENAI_EXPORTS SD3Transformer2DModel { +public: + struct Config { + size_t sample_size = 128; + size_t patch_size = 2; + size_t in_channels = 16; + size_t num_layers = 18; + size_t attention_head_dim = 64; + size_t num_attention_heads = 18; + size_t joint_attention_dim = 4096; + size_t caption_projection_dim = 1152; + size_t pooled_projection_dim = 2048; + size_t out_channels = 16; + size_t pos_embed_max_size = 96; + std::vector block_out_channels = { 128, 256, 512, 512 }; + + explicit Config(const std::filesystem::path& config_path); + }; + + explicit SD3Transformer2DModel(const std::filesystem::path& root_dir); + + SD3Transformer2DModel(const std::filesystem::path& root_dir, + const std::string& device, + const ov::AnyMap& properties = {}); + + template ::value, bool>::type = true> + SD3Transformer2DModel(const std::filesystem::path& root_dir, const std::string& device, Properties&&... properties) + : SD3Transformer2DModel(root_dir, device, ov::AnyMap{std::forward(properties)...}) {} + + SD3Transformer2DModel(const SD3Transformer2DModel&); + + const Config& get_config() const; + + SD3Transformer2DModel& reshape(int batch_size, int height, int width, int tokenizer_model_max_length); + + SD3Transformer2DModel& compile(const std::string& device, const ov::AnyMap& properties = {}); + + template + ov::util::EnableIfAllStringAny compile(const std::string& device, + Properties&&... 
properties) { + return compile(device, ov::AnyMap{std::forward(properties)...}); + } + + void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states); + + ov::Tensor infer(const ov::Tensor latent, const ov::Tensor timestep); + + size_t get_vae_scale_factor() const; + +private: + Config m_config; + ov::InferRequest m_request; + std::shared_ptr m_model; + size_t m_vae_scale_factor; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 09abbe29ab..c56d02afef 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -123,7 +123,6 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() { static ManualTimer step_timer("step()"); step_timer.start(); - // Pull awaiting requests _pull_awaiting_requests(); m_pipeline_metrics.requests = m_requests.size(); @@ -148,8 +147,10 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() { if (scheduler_output.m_total_num_scheduled_tokens == 0) { for (size_t i = 0; i < m_requests.size(); ++i) { SequenceGroup::Ptr sequence_group = m_requests[i]; - sequence_group->set_out_of_memory(); - sequence_group->notify_handle(); + if (!sequence_group->is_waiting()) { + sequence_group->set_out_of_memory(); + sequence_group->notify_handle(); + } } _free_non_running_requests(); return; diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp index 78e92d6c76..8276edb36b 100644 --- a/src/cpp/src/continuous_batching_impl.hpp +++ b/src/cpp/src/continuous_batching_impl.hpp @@ -49,7 +49,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc const DeviceConfig& device_config, ov::Core& core); - void _pull_awaiting_requests(); + virtual void _pull_awaiting_requests(); void _fill_prompt_log_probs(std::vector& sequence_groups, ov::Tensor& logits); public: diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index cc7236af42..7a9653cd85 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -120,7 +120,7 @@ bool GenerationConfig::is_multinomial() const { } bool GenerationConfig::is_speculative_decoding() const { - return assistant_confidence_threshold > 0 || num_assistant_tokens > 0; + return (assistant_confidence_threshold > 0 || num_assistant_tokens > 0); } void GenerationConfig::validate() const { diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 2657c79df7..38deb74186 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -630,8 +630,6 @@ stop_sample_tokens(Sequence::Ptr running_sequence, size_t& max_removed_tokens_per_request) { running_sequence->remove_last_tokens(token_idx); max_removed_tokens_per_request = std::max(max_removed_tokens_per_request, token_idx); - running_sequence->set_status(SequenceStatus::FINISHED); - running_sequence->set_finish_reason(GenerationFinishReason::STOP); } void @@ -652,26 +650,48 @@ align_all_sequence_len(SequenceGroup::Ptr& sequence_group, logit_processor.update_generated_len(min_generated_tokens); } -bool -validate_candidate(Sequence::Ptr running_sequence, - size_t& token_idx, - Token& sampled_token, - bool& is_extend_sequence, - size_t& max_removed_tokens) { - if (token_idx > 0) { - const auto& generated_tokens = running_sequence->get_generated_ids(); - auto it = generated_tokens.rbegin(); - std::advance(it, token_idx - 1); - // to validate candidates from assisting model and remove 
incorrect ones from generated sequence - if (*it != sampled_token.m_index) { - running_sequence->remove_last_tokens(token_idx); - max_removed_tokens = std::max(max_removed_tokens, token_idx); - is_extend_sequence = true; - return false; - } else { - sampled_token.m_index = *it; - } +bool Sampler::validate_candidate( + Sequence::Ptr running_sequence, + size_t& token_idx, + Token& sampled_token, + bool& is_extend_sequence, + size_t& max_removed_tokens, + bool do_sample) { + OPENVINO_ASSERT(token_idx > 0); + const auto& generated_tokens = running_sequence->get_generated_ids(); + auto it_token_id = generated_tokens.rbegin(); + std::advance(it_token_id, token_idx - 1); + + bool is_candidate_accepted = false; + // first tokens in case of speculative decoding should be generated by main model + if (do_sample && + running_sequence->get_generated_len() != running_sequence->get_sequence_group_ptr()->get_num_tokens_to_validate()) { + const auto& generated_log_probs = running_sequence->get_generated_log_probs(); + auto it_log_prob = generated_log_probs.rbegin(); + std::advance(it_log_prob, token_idx - 1); + + float p_i = std::exp(*it_log_prob), + q_i = std::exp(sampled_token.m_log_prob), + probability_ratio = p_i / q_i; + + auto dist = std::uniform_int_distribution<>(0, 100); // equivalent to multinomial with number of trials == 1 + float r_i = dist(rng_engine); + r_i /= 100; + is_candidate_accepted = r_i <= probability_ratio; + } else { + is_candidate_accepted = *it_token_id == sampled_token.m_index; } + + // to validate candidates from assisting model and remove incorrect ones from generated sequence + if (!is_candidate_accepted) { + running_sequence->remove_last_tokens(token_idx); + max_removed_tokens = std::max(max_removed_tokens, token_idx); + is_extend_sequence = true; + return false; + } else { + sampled_token.m_index = *it_token_id; + } + return true; } @@ -759,8 +779,9 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, // flag to add sampled token to generated sequence or extend logit processors only bool is_extend_sequence = token_offset == 0 || is_generate_n_tokens, is_validation_passed = true; - if (is_validation_mode_enabled && !is_generate_n_tokens) { - is_validation_passed = validate_candidate(running_sequences[running_sequence_id], token_offset, sampled_token_id, is_extend_sequence, max_removed_tokens_per_request); + if (is_validation_mode_enabled && !is_extend_sequence) { + is_validation_passed = validate_candidate(running_sequences[running_sequence_id], token_offset, sampled_token_id, + is_extend_sequence, max_removed_tokens_per_request, sampling_params.do_sample); // update log prob just while validation process if (!is_extend_sequence) { OPENVINO_ASSERT(generated_and_verified_len < running_sequences[running_sequence_id]->get_generated_len()); @@ -775,6 +796,7 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, } min_generated_len = std::min(min_generated_len, running_sequence->get_generated_len()); } + align_all_sequence_len(sequence_group, min_generated_len, logit_processor); for (const auto& dropped_seq_id : _try_finish_generation(sequence_group)) { sampler_output.m_dropped_sequences.push_back(dropped_seq_id); } @@ -799,7 +821,9 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, } // Notify handle after sampling is done. // For non-streaming this is effective only when the generation is finished. 
- sequence_group->notify_handle(); + OPENVINO_ASSERT(num_tokens_to_process >= max_removed_tokens_per_request); + size_t num_output_token_to_push = num_tokens_to_process - max_removed_tokens_per_request + 1; + sequence_group->notify_handle(num_output_token_to_push); } else { // we are in prompt processing phase when prompt is split into chunks and processed step by step } @@ -810,7 +834,6 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, sequence_group->finish_iteration(); // decrease sequence_group context in case of candidates generated by draft_model were not accepted by main_model if (max_removed_tokens_per_request) { - align_all_sequence_len(sequence_group, min_generated_len, logit_processor); auto min_processed_tokens = sequence_group->get_prompt_len() + min_generated_len - 1; sequence_group->update_processed_tokens_num(min_processed_tokens); logit_processor.update_generated_len(min_processed_tokens); diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 83b2ddb692..dd7d7d4eb9 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -48,6 +48,9 @@ class Sampler { std::vector _multinomial_sample(const Logits& logits, size_t num_tokens_per_sequence); std::vector _try_finish_generation(SequenceGroup::Ptr & sequence_group); + bool validate_candidate(Sequence::Ptr running_sequence, size_t& token_idx, Token& sampled_token, + bool& is_extend_sequence, size_t& max_removed_tokens, bool do_sample); + // request ID => beam search tracking information std::map m_beam_search_info; diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index b2532b220c..c5be82f0f2 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -609,7 +609,7 @@ class SequenceGroup { m_generation_stream->push(std::move(outputs)); } - void notify_handle() { + void notify_handle(size_t num_output_token_to_push = 0) { if (out_of_memory()) { set_generation_status(GenerationStatus::IGNORED); } else if (has_finished()) { @@ -625,12 +625,8 @@ class SequenceGroup { // (after stop string is detected its tokens are already sent) if (num_total_seqs() == 1 && (m_sampling_params.stop_strings.empty() || m_sampling_params.include_stop_str_in_output)) { - auto previous_step_gen_len = get_num_processed_tokens() > 0 ? 
get_num_processed_tokens() - get_prompt_len() + 1 : 0; - auto generation_len = m_sequences.front()->get_generated_len(); - if (previous_step_gen_len < generation_len) { - auto token_to_print = generation_len - previous_step_gen_len; - push_partial_outputs(token_to_print); - } + if (num_output_token_to_push) + push_partial_outputs(num_output_token_to_push); } else if (has_finished() || out_of_memory()) { push_outputs(); } diff --git a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp index fd30e9f608..c649c544a6 100644 --- a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp @@ -20,23 +20,16 @@ ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::Contin void ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::finish_request(SequenceGroup::Ptr request) { - - for (const auto& sequence : request->get_sequences()) { - m_scheduler->free_sequence(sequence->get_id()); + for (const auto& sequence: request->get_sequences()) { + if (m_scheduler->has_block_table(sequence->get_id())) { + m_scheduler->free_sequence(sequence->get_id()); + } } m_sampler->clear_request_info(request->get_request_id()); + request->set_generation_status(GenerationStatus::DROPPED_BY_HANDLE); } void ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::finish_request(int64_t request_id) { - // finish all request s in case of -1 - if (request_id == -1) { - while (!m_requests.empty()) { - const auto& request = *m_requests.rbegin(); - finish_request(request); - m_requests.pop_back(); - } - return; - } for (size_t i = 0; i < m_requests.size(); ++i) { auto& request = m_requests[i]; if (request->get_request_id() != request_id) { @@ -50,8 +43,6 @@ void ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::f GeneratedRequests ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::get_generated_requests() { - _pull_awaiting_requests(); - GeneratedRequests result; for (const auto& request : m_requests) { const auto& request_id = request->get_request_id(); @@ -197,8 +188,6 @@ UpdateRequestResult ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::init_request_by_candidate( uint64_t request_id, const GeneratedSequences& candidates) { - _pull_awaiting_requests(); - for (auto& request : m_requests) { if (request->get_request_id() != request_id) { continue; @@ -218,8 +207,6 @@ UpdateRequestResult ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::update_request(uint64_t request_id, const GeneratedSequences& candidates, bool is_update_logit_processor) { - _pull_awaiting_requests(); - UpdateRequestResult result{0, 0}; for (auto& request : m_requests) { if (request_id != request->get_request_id()) { @@ -227,14 +214,9 @@ ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::update } std::vector running_sequences = request->get_running_sequences(); + OPENVINO_ASSERT(running_sequences.size() > 0); size_t min_generated_tokens, min_candidate_len; - if (request->get_context_len() == 0 && !request->get_num_tokens_to_validate()) { - if (candidates.begin()->second.log_probs.empty()) { - // lock generation in case on empty generation - request->pause_generation(true); - return result; - } - // init request by sequences in case the pipeline 
was not started + if (running_sequences.front()->get_generated_len() == 0 && !request->get_num_tokens_to_validate()) { m_sampler->create_logit_processor(request_id, request->get_sampling_parameters(), request->get_prompt_ids()); auto& logit_processor = m_sampler->get_logit_processor(request_id); result.inserted_tokens_cnt = init_request(request, candidates, logit_processor, is_update_logit_processor); @@ -270,11 +252,21 @@ ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::update // update request context information to provide correct scheduling phase const size_t num_processed_tokens = request->get_num_processed_tokens(), prompt_len = request->get_prompt_len(), - updated_context_len = min_candidate_len + prompt_len; - if (num_processed_tokens > 0) + updated_context_len = min_candidate_len + prompt_len, + max_new_tokens = request->get_sampling_parameters().max_new_tokens; + size_t generated_len = request->get_context_len() - request->get_prompt_len(); + if (num_processed_tokens > 0) { request->update_processed_tokens_num(num_processed_tokens - result.removed_tokens_cnt); + generated_len -= result.removed_tokens_cnt; + } request->set_num_validated_tokens(result.inserted_tokens_cnt); request->pause_generation(false); + generated_len += result.inserted_tokens_cnt; + + // to pause `draft_model` generation in case of `generated_len >= max_new_tokens - 1` to generate last token by `main_model` + if (!m_is_validation_mode_enabled && (generated_len >= max_new_tokens - 1 || result.inserted_tokens_cnt == 0)) { + request->pause_generation(true); + } break; } @@ -282,13 +274,8 @@ ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::update } void -ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::unlock_next_request_generation() { - for (auto& request : m_requests) { - if (!request->has_finished() && !request->can_generate_tokens()) { - request->pause_generation(false); - return; - } - } +ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::pull_awaiting_requests() { + ContinuousBatchingImpl::_pull_awaiting_requests(); } void ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::multistep() { @@ -308,13 +295,16 @@ void ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::m request->pause_generation(true); } else if (request->get_num_processed_tokens() == 0 && sampling_params.num_return_sequences > 1) { request->pause_generation(true); - } else if (sampling_params.num_assistant_tokens <= generated_tokens_cnt) { + } else if (sampling_params.num_assistant_tokens <= generated_tokens_cnt && sampling_params.assistant_confidence_threshold == 0.f) { request->pause_generation(true); - } else if (request->get_num_processed_tokens() - request->get_prompt_len() + 1 >= sampling_params.max_new_tokens - 1) { + } else if (request->get_context_len() >= request->get_prompt_len() && + (request->get_context_len() - request->get_prompt_len()) >= sampling_params.max_new_tokens - 1) { + request->pause_generation(true); + } else if (sampling_params.max_new_tokens == 0) { request->pause_generation(true); } to_generate |= request->can_generate_tokens(); } } } -} \ No newline at end of file +} diff --git a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp index a75a160f14..0040708b4b 100644 --- 
a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp +++ b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp @@ -23,9 +23,9 @@ class ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl : bool is_validation_mode_enabled); void multistep(); - void finish_request(int64_t request_id = -1); - void unlock_next_request_generation(); + void finish_request(int64_t request_id = -1); + void pull_awaiting_requests(); GeneratedRequests get_generated_requests(); UpdateRequestResult update_request(uint64_t request_id, const GeneratedSequences& candidates, bool is_update_logit_processor); @@ -33,5 +33,6 @@ class ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl : protected: void finish_request(SequenceGroup::Ptr request); + void _pull_awaiting_requests() override {}; }; } \ No newline at end of file diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp index 2008f1fb9a..864646d5cd 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp @@ -82,7 +82,8 @@ GenerationHandle ContinuousBatchingPipeline::SpeculativeDecodingImpl::add_request(uint64_t request_id, const ov::Tensor& input_ids, ov::genai::GenerationConfig sampling_params) { - m_draft_pipeline->add_request(request_id, input_ids, sampling_params); + std::lock_guard lock(m_draft_generations_mutex); + m_draft_generations.insert({request_id, m_draft_pipeline->add_request(request_id, input_ids, sampling_params)}); return m_main_pipeline->add_request(request_id, input_ids, sampling_params); }; @@ -90,7 +91,8 @@ GenerationHandle ContinuousBatchingPipeline::SpeculativeDecodingImpl::add_request(uint64_t request_id, const std::string& prompt, ov::genai::GenerationConfig sampling_params) { - m_draft_pipeline->add_request(request_id, prompt, sampling_params); + std::lock_guard lock(m_draft_generations_mutex); + m_draft_generations.insert({request_id, m_draft_pipeline->add_request(request_id, prompt, sampling_params)}); return m_main_pipeline->add_request(request_id, prompt, sampling_params); } @@ -112,12 +114,18 @@ void print_generated_request(const ov::genai::GeneratedRequests& requests) { } void ContinuousBatchingPipeline::SpeculativeDecodingImpl::step() { + // this blocks adding new requests during step as it may break coherence between main and draft models + std::lock_guard lock{m_draft_generations_mutex}; + m_draft_pipeline->pull_awaiting_requests(); + m_main_pipeline->pull_awaiting_requests(); + // generate candidates by draft model ManualTimer draft_timer("speculative_decoding: draft_model: multistep()"); draft_timer.start(); m_draft_pipeline->multistep(); draft_timer.end(); m_sd_metrics.draft_duration += draft_timer.get_duration(); + m_pipeline_metrics = m_main_pipeline->get_metrics(); // to generate num_matches statistic std::map update_sequence_info; @@ -133,6 +141,7 @@ void ContinuousBatchingPipeline::SpeculativeDecodingImpl::step() { m_main_pipeline->step(); main_timer.end(); m_sd_metrics.main_duration += main_timer.get_duration(); + m_pipeline_metrics = m_main_pipeline->get_metrics(); auto main_generated_requests = m_main_pipeline->get_generated_requests(); for (const auto& checked_sequence : main_generated_requests) { @@ -145,8 +154,8 @@ void ContinuousBatchingPipeline::SpeculativeDecodingImpl::step() { auto request_id = draft_request.first; if 
(!main_generated_requests.count(request_id)) { m_draft_pipeline->finish_request(request_id); - // in case of some requests not to started, unlock generation of next request - m_draft_pipeline->unlock_next_request_generation(); + // remove draft_generation_handle from queue + m_draft_generations.erase(request_id); } auto updated_seq_info = update_sequence_info[request_id]; float acceptance_rate = 1 - static_cast(updated_seq_info.removed_tokens_cnt) / updated_seq_info.inserted_tokens_cnt; @@ -175,18 +184,16 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector< } }, streamer); - std::vector main_generations, draft_generations; + std::vector main_generations; for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) { OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch."); main_generations.push_back(m_main_pipeline->add_request(request_id, input_ids[request_id], sampling_params[request_id])); auto draft_sampling_params = sampling_params[request_id]; // set the parameters do not stop draft generation without stopping of the same request for main pipeline - draft_sampling_params.max_new_tokens = draft_sampling_params.max_new_tokens + 1; - draft_sampling_params.min_new_tokens = draft_sampling_params.min_new_tokens + 1; draft_sampling_params.ignore_eos = true; - draft_generations.push_back(m_draft_pipeline->add_request(request_id, input_ids[request_id], draft_sampling_params)); - // decrease generation len to generate last token by main model + std::lock_guard lock(m_draft_generations_mutex); + m_draft_generations.insert({request_id, m_draft_pipeline->add_request(request_id, input_ids[request_id], draft_sampling_params)}); } std::vector results; @@ -210,7 +217,6 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector< if (streamer_ptr) { streamer_ptr->end(); } - draft_generations.clear(); for (size_t generation_idx = 0; generation_idx < main_generations.size(); ++generation_idx) { const auto& generation = main_generations[generation_idx]; diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp index b427e311b4..f854713b5e 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp @@ -30,6 +30,9 @@ class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBat protected: std::shared_ptr m_main_pipeline, m_draft_pipeline; SpeculativeDecodingMetrics m_sd_metrics; + // Mutex protecting access to m_draft_generations, so add_request and step methods can be called from different threads + std::mutex m_draft_generations_mutex; + std::map m_draft_generations; public: SpeculativeDecodingImpl(const std::filesystem::path& main_models_path, diff --git a/src/cpp/src/text2image/models/autoencoder_kl.cpp b/src/cpp/src/text2image/models/autoencoder_kl.cpp index fca9c21050..c9d51cb844 100644 --- a/src/cpp/src/text2image/models/autoencoder_kl.cpp +++ b/src/cpp/src/text2image/models/autoencoder_kl.cpp @@ -32,6 +32,7 @@ AutoencoderKL::Config::Config(const std::filesystem::path& config_path) { read_json_param(data, "latent_channels", latent_channels); read_json_param(data, "out_channels", out_channels); read_json_param(data, "scaling_factor", scaling_factor); + read_json_param(data, "shift_factor", shift_factor); read_json_param(data, "block_out_channels", block_out_channels); } @@ -52,6 +53,10 @@ 
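The m_draft_generations_mutex added above serializes add_request() against step() so the map of draft generation handles stays coherent while a step rebalances it. A minimal standalone sketch of that locking pattern, with placeholder names (DraftHandleBook, handles) that are not part of the patch:

    #include <cstdint>
    #include <map>
    #include <mutex>

    // 'int' stands in for a generation handle.
    struct DraftHandleBook {
        std::mutex mutex;                 // plays the role of m_draft_generations_mutex
        std::map<uint64_t, int> handles;  // plays the role of m_draft_generations

        void add_request(uint64_t request_id, int handle) {
            std::lock_guard<std::mutex> lock(mutex);
            handles.emplace(request_id, handle);
        }

        void step() {
            std::lock_guard<std::mutex> lock(mutex);  // blocks add_request() for the whole step
            // ... pull awaiting requests for the draft and main pipelines here ...
        }
    };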
AutoencoderKL::AutoencoderKL(const std::filesystem::path& root_dir, AutoencoderKL::AutoencoderKL(const AutoencoderKL&) = default; +const AutoencoderKL::Config& AutoencoderKL::get_config() const { + return m_config; +} + AutoencoderKL& AutoencoderKL::reshape(int batch_size, int height, int width) { OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot reshape already compiled model"); diff --git a/src/cpp/src/text2image/models/sd3_transformer_2d_model.cpp b/src/cpp/src/text2image/models/sd3_transformer_2d_model.cpp new file mode 100644 index 0000000000..7db52f5e8b --- /dev/null +++ b/src/cpp/src/text2image/models/sd3_transformer_2d_model.cpp @@ -0,0 +1,132 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/text2image/sd3_transformer_2d_model.hpp" + +#include + +#include "json_utils.hpp" +#include "utils.hpp" + +namespace ov { +namespace genai { + +SD3Transformer2DModel::Config::Config(const std::filesystem::path& config_path) { + std::ifstream file(config_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", config_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + read_json_param(data, "sample_size", sample_size); + read_json_param(data, "patch_size", patch_size); + read_json_param(data, "in_channels", in_channels); + read_json_param(data, "num_layers", num_layers); + read_json_param(data, "attention_head_dim", attention_head_dim); + read_json_param(data, "num_attention_heads", num_attention_heads); + read_json_param(data, "joint_attention_dim", joint_attention_dim); + read_json_param(data, "caption_projection_dim", caption_projection_dim); + read_json_param(data, "pooled_projection_dim", pooled_projection_dim); + read_json_param(data, "out_channels", out_channels); + read_json_param(data, "pos_embed_max_size", pos_embed_max_size); + + file.close(); + + // block_out_channels should be read from VAE encoder / decoder config to compute proper m_vae_scale_factor + std::filesystem::path vae_config_path = config_path.parent_path().parent_path() / "vae_decoder" / "config.json"; + file.open(vae_config_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", vae_config_path); + data = nlohmann::json::parse(file); + read_json_param(data, "block_out_channels", block_out_channels); +} + +SD3Transformer2DModel::SD3Transformer2DModel(const std::filesystem::path& root_dir) + : m_config(root_dir / "config.json") { + m_model = utils::singleton_core().read_model((root_dir / "openvino_model.xml").string()); + + // compute VAE scale factor + m_vae_scale_factor = std::pow(2, m_config.block_out_channels.size() - 1); +} + +SD3Transformer2DModel::SD3Transformer2DModel(const std::filesystem::path& root_dir, + const std::string& device, + const ov::AnyMap& properties) + : SD3Transformer2DModel(root_dir) { + compile(device, properties); +} + +SD3Transformer2DModel::SD3Transformer2DModel(const SD3Transformer2DModel&) = default; + +const SD3Transformer2DModel::Config& SD3Transformer2DModel::get_config() const { + return m_config; +} + +SD3Transformer2DModel& SD3Transformer2DModel::reshape(int batch_size, + int height, + int width, + int tokenizer_model_max_length) { + OPENVINO_ASSERT(m_model, "Model has been already compiled. 
Cannot reshape already compiled model"); + + // hidden_states=latent_model_input, + // timestep=timestep, + // encoder_hidden_states=prompt_embeds, + // pooled_projections=pooled_prompt_embeds, + + height /= m_vae_scale_factor; + width /= m_vae_scale_factor; + + std::map name_to_shape; + + for (auto&& input : m_model->inputs()) { + std::string input_name = input.get_any_name(); + name_to_shape[input_name] = input.get_partial_shape(); + if (input_name == "timestep") { + name_to_shape[input_name][0] = batch_size; + } else if (input_name == "hidden_states") { + name_to_shape[input_name] = {batch_size, name_to_shape[input_name][1], height, width}; + } else if (input_name == "encoder_hidden_states") { + name_to_shape[input_name][0] = batch_size; + name_to_shape[input_name][1] = + tokenizer_model_max_length * + 2; // x2 is necessary because of the concatenation of prompt_embeds and t5_prompt_embeds + } else if (input_name == "pooled_projections") { + name_to_shape[input_name][0] = batch_size; + } + } + + m_model->reshape(name_to_shape); + + return *this; +} + +SD3Transformer2DModel& SD3Transformer2DModel::compile(const std::string& device, const ov::AnyMap& properties) { + OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model"); + ov::CompiledModel compiled_model = utils::singleton_core().compile_model(m_model, device, properties); + m_request = compiled_model.create_infer_request(); + // release the original model + m_model.reset(); + + return *this; +} + +void SD3Transformer2DModel::set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) { + OPENVINO_ASSERT(m_request, "Transformer model must be compiled first"); + m_request.set_tensor(tensor_name, encoder_hidden_states); +} + +size_t SD3Transformer2DModel::get_vae_scale_factor() const { + return m_vae_scale_factor; +} + +ov::Tensor SD3Transformer2DModel::infer(const ov::Tensor latent_model_input, const ov::Tensor timestep) { + OPENVINO_ASSERT(m_request, "Transformer model must be compiled first. 
Cannot infer non-compiled model"); + + m_request.set_tensor("hidden_states", latent_model_input); + m_request.set_tensor("timestep", timestep); + m_request.infer(); + + return m_request.get_output_tensor(); +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/numpy_utils.cpp b/src/cpp/src/text2image/numpy_utils.cpp index 9554681820..b263573e47 100644 --- a/src/cpp/src/text2image/numpy_utils.cpp +++ b/src/cpp/src/text2image/numpy_utils.cpp @@ -74,6 +74,85 @@ std::vector interp(const std::vector& x, const std::vector< return interp_res; } +void concat_3d_by_rows(const float* data_1, const float* data_2, float* res, const ov::Shape shape_1, const ov::Shape shape_2) { + OPENVINO_ASSERT( + shape_1[0] == shape_2[0] && shape_1[1] == shape_2[1], + "Tensors for concatenation must have the same dimensions"); + + for (size_t i = 0; i < shape_1[0]; ++i) { + for (size_t j = 0; j < shape_1[1]; ++j) { + size_t offset_1 = (i * shape_1[1] + j) * shape_1[2]; + size_t offset_2 = (i * shape_2[1] + j) * shape_2[2]; + + size_t step = (i * shape_1[1] + j) * (shape_1[2] + shape_2[2]); + + std::memcpy(res + step, data_1 + offset_1, shape_1[2] * sizeof(float)); + std::memcpy(res + step + shape_1[2], + data_2 + offset_2, + shape_2[2] * sizeof(float)); + } + } +} + +void concat_2d_by_rows(const float* data_1, const float* data_2, float* res, const ov::Shape shape_1, const ov::Shape shape_2) { + OPENVINO_ASSERT( + shape_1[0] == shape_2[0], + "Tensors for concatenation must have the same dimensions"); + + for (size_t i = 0; i < shape_1[0]; ++i) { + size_t offset_1 = i * shape_1[1]; + size_t offset_2 = i * shape_2[1]; + + size_t step = i * (shape_1[1] + shape_2[1]); + + std::memcpy(res + step, data_1 + offset_1, shape_1[1] * sizeof(float)); + std::memcpy(res + step + shape_1[1], + data_2 + offset_2, + shape_2[1] * sizeof(float)); + } +} + +void concat_3d_by_cols(const float* data_1, const float* data_2, float* res, const ov::Shape shape_1, const ov::Shape shape_2) { + OPENVINO_ASSERT( + shape_1[0] == shape_2[0] && shape_1[2] == shape_2[2], + "Tensors for concatenation must have the same dimensions"); + + for (size_t i = 0; i < shape_1[0]; ++i) { + size_t shift_1 = i * shape_1[1] * shape_1[2]; + size_t shift_2 = i * shape_2[1] * shape_2[2]; + + size_t step = shift_1 + shift_2; + + std::memcpy(res + step, data_1 + shift_1, shape_1[1] * shape_1[2] * sizeof(float)); + std::memcpy(res + step + shape_1[1] * shape_1[2], data_2 + shift_2, shape_2[1] * shape_2[2] * sizeof(float)); + } +} + +void concat_3d_by_channels(const float* data_1, const float* data_2, float* res, const ov::Shape shape_1, const ov::Shape shape_2) { + OPENVINO_ASSERT( + shape_1[1] == shape_2[1] && shape_1[2] == shape_2[2], + "Tensors for concatenation must have the same dimensions"); + + size_t size_1 = shape_1[0] * shape_1[1] * shape_1[2]; + size_t size_2 = shape_2[0] * shape_2[1] * shape_2[2]; + + std::memcpy(res, data_1, size_1 * sizeof(float)); + std::memcpy(res + size_1, data_2, size_2 * sizeof(float)); +} + +void concat_2d_by_channels(const float* data_1, const float* data_2, float* res, const ov::Shape shape_1, const ov::Shape shape_2) { + OPENVINO_ASSERT( + shape_1[1] == shape_2[1], + "Tensors for concatenation must have the same dimensions"); + + size_t size_1 = shape_1[0] * shape_1[1]; + size_t size_2 = shape_2[0] * shape_2[1]; + + std::memcpy(res, data_1, size_1 * sizeof(float)); + std::memcpy(res + size_1, data_2, size_2 * sizeof(float)); +} + + } // namespace ov } // namespace genai } // namespace numpy_utils diff 
--git a/src/cpp/src/text2image/numpy_utils.hpp b/src/cpp/src/text2image/numpy_utils.hpp index d6144eeb99..6c8c6da5ad 100644 --- a/src/cpp/src/text2image/numpy_utils.hpp +++ b/src/cpp/src/text2image/numpy_utils.hpp @@ -6,10 +6,13 @@ #include #include #include +#include #include #include #include +#include "openvino/core/shape.hpp" + namespace ov { namespace genai { namespace numpy_utils { @@ -42,6 +45,12 @@ void rescale_zero_terminal_snr(std::vector& betas); // np.interp(...) implementation std::vector interp(const std::vector& x, const std::vector& xp, const std::vector& fp); +void concat_3d_by_rows(const float* data_1, const float* data_2, float* res, const ov::Shape shape_1, const ov::Shape shape_2); +void concat_3d_by_cols(const float* data_1, const float* data_2, float* res, const ov::Shape shape_1, const ov::Shape shape_2); +void concat_3d_by_channels(const float* data_1, const float* data_2, float* res, const ov::Shape shape_1, const ov::Shape shape_2); +void concat_2d_by_rows(const float* data_1, const float* data_2, float* res, const ov::Shape shape_1, const ov::Shape shape_2); +void concat_2d_by_channels(const float* data_1, const float* data_2, float* res, const ov::Shape shape_1, const ov::Shape shape_2); + } // namespace ov } // namespace genai } // namespace numpy_utils diff --git a/src/cpp/src/text2image/schedulers/ddim.cpp b/src/cpp/src/text2image/schedulers/ddim.cpp index eaeb210cd1..9b1367d84f 100644 --- a/src/cpp/src/text2image/schedulers/ddim.cpp +++ b/src/cpp/src/text2image/schedulers/ddim.cpp @@ -189,6 +189,10 @@ std::vector DDIMScheduler::get_timesteps() const { return m_timesteps; } +std::vector DDIMScheduler::get_float_timesteps() const { + OPENVINO_THROW("DDIMScheduler doesn't support float timesteps"); +} + float DDIMScheduler::get_init_noise_sigma() const { return 1.0f; } diff --git a/src/cpp/src/text2image/schedulers/ddim.hpp b/src/cpp/src/text2image/schedulers/ddim.hpp index d0ab53d0f5..a3be88f9f3 100644 --- a/src/cpp/src/text2image/schedulers/ddim.hpp +++ b/src/cpp/src/text2image/schedulers/ddim.hpp @@ -39,6 +39,8 @@ class DDIMScheduler : public IScheduler { std::vector get_timesteps() const override; + std::vector get_float_timesteps() const override; + float get_init_noise_sigma() const override; void scale_model_input(ov::Tensor sample, size_t inference_step) override; diff --git a/src/cpp/src/text2image/schedulers/euler_discrete.cpp b/src/cpp/src/text2image/schedulers/euler_discrete.cpp index 6ac65177d8..ac4406ad7f 100644 --- a/src/cpp/src/text2image/schedulers/euler_discrete.cpp +++ b/src/cpp/src/text2image/schedulers/euler_discrete.cpp @@ -258,6 +258,10 @@ std::vector EulerDiscreteScheduler::get_timesteps() const { return m_timesteps; } +std::vector EulerDiscreteScheduler::get_float_timesteps() const { + OPENVINO_THROW("EulerDiscreteScheduler doesn't support float timesteps"); +} + float EulerDiscreteScheduler::get_init_noise_sigma() const { float max_sigma = *std::max_element(m_sigmas.begin(), m_sigmas.end()); diff --git a/src/cpp/src/text2image/schedulers/euler_discrete.hpp b/src/cpp/src/text2image/schedulers/euler_discrete.hpp index e6c826f739..21d0778479 100644 --- a/src/cpp/src/text2image/schedulers/euler_discrete.hpp +++ b/src/cpp/src/text2image/schedulers/euler_discrete.hpp @@ -41,6 +41,8 @@ class EulerDiscreteScheduler : public IScheduler { std::vector get_timesteps() const override; + std::vector get_float_timesteps() const override; + float get_init_noise_sigma() const override; void scale_model_input(ov::Tensor sample, size_t inference_step) 
override; diff --git a/src/cpp/src/text2image/schedulers/flow_match_euler_discrete.cpp b/src/cpp/src/text2image/schedulers/flow_match_euler_discrete.cpp new file mode 100644 index 0000000000..cc19ef490f --- /dev/null +++ b/src/cpp/src/text2image/schedulers/flow_match_euler_discrete.cpp @@ -0,0 +1,149 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "text2image/schedulers/flow_match_euler_discrete.hpp" + +#include +#include +#include +#include + +#include "text2image/numpy_utils.hpp" +#include "utils.hpp" + +namespace ov { +namespace genai { + +FlowMatchEulerDiscreteScheduler::Config::Config(const std::filesystem::path& scheduler_config_path) { + std::ifstream file(scheduler_config_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", scheduler_config_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + read_json_param(data, "num_train_timesteps", num_train_timesteps); + read_json_param(data, "shift", shift); + read_json_param(data, "use_dynamic_shifting", use_dynamic_shifting); + read_json_param(data, "base_shift", base_shift); + read_json_param(data, "max_shift", max_shift); + read_json_param(data, "base_image_seq_len", base_image_seq_len); + read_json_param(data, "max_image_seq_len", max_image_seq_len); +} + +FlowMatchEulerDiscreteScheduler::FlowMatchEulerDiscreteScheduler(const std::filesystem::path& scheduler_config_path) + : FlowMatchEulerDiscreteScheduler(Config(scheduler_config_path)) {} + +FlowMatchEulerDiscreteScheduler::FlowMatchEulerDiscreteScheduler(const Config& scheduler_config) + : m_config(scheduler_config) { + using numpy_utils::linspace; + + int32_t num_train_timesteps = m_config.num_train_timesteps; + float shift = m_config.shift; + + auto linspaced = linspace(1.0f, static_cast(num_train_timesteps), num_train_timesteps, true); + for (auto it = linspaced.rbegin(); it != linspaced.rend(); ++it) { + m_timesteps.push_back(*it); + } + + std::transform(m_timesteps.begin(), + m_timesteps.end(), + std::back_inserter(m_sigmas), + [num_train_timesteps](float x) { + return x / num_train_timesteps; + }); + + if (!m_config.use_dynamic_shifting) { + std::transform(m_sigmas.begin(), m_sigmas.end(), m_sigmas.begin(), [shift](float x) { + return shift * x / (1 + (shift - 1) * x); + }); + } + + for (size_t i = 0; i < m_timesteps.size(); ++i) { + m_timesteps[i] = m_sigmas[i] * num_train_timesteps; + } + + m_step_index = -1, m_begin_index = -1; + m_sigma_max = m_sigmas[0], m_sigma_min = m_sigmas.back(); +} + +float FlowMatchEulerDiscreteScheduler::sigma_to_t(float sigma) { + return sigma * m_config.num_train_timesteps; +} + +void FlowMatchEulerDiscreteScheduler::set_timesteps(size_t num_inference_steps) { + m_timesteps.clear(); + m_sigmas.clear(); + + m_num_inference_steps = num_inference_steps; + int32_t num_train_timesteps = m_config.num_train_timesteps; + float shift = m_config.shift; + + using numpy_utils::linspace; + m_timesteps = linspace(sigma_to_t(m_sigma_max), sigma_to_t(m_sigma_min), m_num_inference_steps, true); + + for (const float& i : m_timesteps) { + m_sigmas.push_back(i / num_train_timesteps); + } + + OPENVINO_ASSERT(!m_config.use_dynamic_shifting, + "Parameter 'use_dynamic_shifting' is not supported. 
Please, add support."); + + for (size_t i = 0; i < m_sigmas.size(); ++i) { + m_sigmas[i] = shift * m_sigmas[i] / (1 + (shift - 1) * m_sigmas[i]); + m_timesteps[i] = m_sigmas[i] * num_train_timesteps; + } + m_sigmas.push_back(0); + + m_step_index = -1, m_begin_index = -1; +} + +std::map FlowMatchEulerDiscreteScheduler::step(ov::Tensor noise_pred, + ov::Tensor latents, + size_t inference_step) { + // noise_pred - model_output + // latents - sample + // inference_step + + float* model_output_data = noise_pred.data(); + float* sample_data = latents.data(); + + if (m_step_index == -1) + init_step_index(); + + ov::Tensor prev_sample(latents.get_element_type(), latents.get_shape()); + float* prev_sample_data = prev_sample.data(); + + float sigma_diff = m_sigmas[m_step_index + 1] - m_sigmas[m_step_index]; + + for (size_t i = 0; i < prev_sample.get_size(); ++i) { + prev_sample_data[i] = sample_data[i] + sigma_diff * model_output_data[i]; + } + + m_step_index++; + + return {{"latent", prev_sample}}; +} + +std::vector FlowMatchEulerDiscreteScheduler::get_timesteps() const { + OPENVINO_THROW("FlowMatchEulerDiscreteScheduler doesn't support int timesteps"); +} + +std::vector FlowMatchEulerDiscreteScheduler::get_float_timesteps() const { + return m_timesteps; +} + +float FlowMatchEulerDiscreteScheduler::get_init_noise_sigma() const { + return 1.0f; +} + +void FlowMatchEulerDiscreteScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) { + return; +} + +void FlowMatchEulerDiscreteScheduler::init_step_index() { + // TODO: support index_for_timestep method + m_step_index = (m_begin_index == -1) ? 0 : m_begin_index; +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/schedulers/flow_match_euler_discrete.hpp b/src/cpp/src/text2image/schedulers/flow_match_euler_discrete.hpp new file mode 100644 index 0000000000..98a068bf01 --- /dev/null +++ b/src/cpp/src/text2image/schedulers/flow_match_euler_discrete.hpp @@ -0,0 +1,60 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include + +#include "text2image/schedulers/types.hpp" +#include "text2image/schedulers/ischeduler.hpp" + +namespace ov { +namespace genai { + +class FlowMatchEulerDiscreteScheduler : public IScheduler { +public: + struct Config { + int32_t num_train_timesteps = 1000; + float shift = 1.0f; + bool use_dynamic_shifting = false; + float base_shift = 0.5f, max_shift = 1.15f; + int32_t base_image_seq_len = 256, max_image_seq_len = 4096; + + Config() = default; + explicit Config(const std::filesystem::path& scheduler_config_path); + }; + + explicit FlowMatchEulerDiscreteScheduler(const std::filesystem::path& scheduler_config_path); + explicit FlowMatchEulerDiscreteScheduler(const Config& scheduler_config); + + void set_timesteps(size_t num_inference_steps) override; + + std::vector get_timesteps() const override; + + std::vector get_float_timesteps() const override; + + float get_init_noise_sigma() const override; + + void scale_model_input(ov::Tensor sample, size_t inference_step) override; + + std::map step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) override; + + void set_begin_index(size_t begin_index); + +private: + Config m_config; + + std::vector m_sigmas; + std::vector m_timesteps; + + float m_sigma_min, m_sigma_max; + size_t m_step_index, m_begin_index; + size_t m_num_inference_steps; + + void init_step_index(); + float sigma_to_t(float simga); +}; + +} // namespace genai +} // namespace ov diff --git 
a/src/cpp/src/text2image/schedulers/ischeduler.hpp b/src/cpp/src/text2image/schedulers/ischeduler.hpp index 51039765bf..d625265df2 100644 --- a/src/cpp/src/text2image/schedulers/ischeduler.hpp +++ b/src/cpp/src/text2image/schedulers/ischeduler.hpp @@ -17,6 +17,8 @@ class IScheduler : public Text2ImagePipeline::Scheduler { virtual std::vector get_timesteps() const = 0; + virtual std::vector get_float_timesteps() const = 0; + virtual float get_init_noise_sigma() const = 0; virtual void scale_model_input(ov::Tensor sample, size_t inference_step) = 0; diff --git a/src/cpp/src/text2image/schedulers/lcm.cpp b/src/cpp/src/text2image/schedulers/lcm.cpp index c4f0f072a1..3dcc10d6fd 100644 --- a/src/cpp/src/text2image/schedulers/lcm.cpp +++ b/src/cpp/src/text2image/schedulers/lcm.cpp @@ -208,6 +208,10 @@ std::vector LCMScheduler::get_timesteps() const { return m_timesteps; } +std::vector LCMScheduler::get_float_timesteps() const { + OPENVINO_THROW("LCMScheduler doesn't support float timesteps"); +} + float LCMScheduler::get_init_noise_sigma() const { return 1.0f; } diff --git a/src/cpp/src/text2image/schedulers/lcm.hpp b/src/cpp/src/text2image/schedulers/lcm.hpp index 13b9d9406c..0353d1af0c 100644 --- a/src/cpp/src/text2image/schedulers/lcm.hpp +++ b/src/cpp/src/text2image/schedulers/lcm.hpp @@ -46,6 +46,8 @@ class LCMScheduler : public IScheduler { std::vector get_timesteps() const override; + std::vector get_float_timesteps() const override; + float get_init_noise_sigma() const override; void scale_model_input(ov::Tensor sample, size_t inference_step) override; diff --git a/src/cpp/src/text2image/schedulers/lms_discrete.cpp b/src/cpp/src/text2image/schedulers/lms_discrete.cpp index dbb1358373..e9f1fb5ae7 100644 --- a/src/cpp/src/text2image/schedulers/lms_discrete.cpp +++ b/src/cpp/src/text2image/schedulers/lms_discrete.cpp @@ -187,6 +187,10 @@ std::vector LMSDiscreteScheduler::get_timesteps() const { return m_timesteps; } +std::vector LMSDiscreteScheduler::get_float_timesteps() const { + OPENVINO_THROW("LMSDiscreteScheduler doesn't support float timesteps"); +} + std::map LMSDiscreteScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) { const float sigma = m_sigmas[inference_step]; diff --git a/src/cpp/src/text2image/schedulers/lms_discrete.hpp b/src/cpp/src/text2image/schedulers/lms_discrete.hpp index 6c0a61a777..f87f1d8a91 100644 --- a/src/cpp/src/text2image/schedulers/lms_discrete.hpp +++ b/src/cpp/src/text2image/schedulers/lms_discrete.hpp @@ -35,6 +35,8 @@ class LMSDiscreteScheduler : public IScheduler { std::vector get_timesteps() const override; + std::vector get_float_timesteps() const override; + float get_init_noise_sigma() const override; void scale_model_input(ov::Tensor sample, size_t inference_step) override; diff --git a/src/cpp/src/text2image/schedulers/scheduler.cpp b/src/cpp/src/text2image/schedulers/scheduler.cpp index 2ee4c2adac..c15aad1626 100644 --- a/src/cpp/src/text2image/schedulers/scheduler.cpp +++ b/src/cpp/src/text2image/schedulers/scheduler.cpp @@ -11,6 +11,7 @@ #include "text2image/schedulers/lms_discrete.hpp" #include "text2image/schedulers/ddim.hpp" #include "text2image/schedulers/euler_discrete.hpp" +#include "text2image/schedulers/flow_match_euler_discrete.hpp" namespace ov { namespace genai { @@ -38,6 +39,8 @@ std::shared_ptr Text2ImagePipeline::Scheduler::fr scheduler = std::make_shared(scheduler_config_path); } else if (scheduler_type == Scheduler::Type::EULER_DISCRETE) { scheduler = std::make_shared(scheduler_config_path); + } else 
if (scheduler_type == Scheduler::Type::FLOW_MATCH_EULER_DISCRETE) { + scheduler = std::make_shared(scheduler_config_path); } else { OPENVINO_THROW("Unsupported scheduler type '", scheduler_type, ". Please, manually create scheduler via supported one"); } diff --git a/src/cpp/src/text2image/schedulers/types.cpp b/src/cpp/src/text2image/schedulers/types.cpp index 0ca970f359..aed46e7d70 100644 --- a/src/cpp/src/text2image/schedulers/types.cpp +++ b/src/cpp/src/text2image/schedulers/types.cpp @@ -51,6 +51,8 @@ void read_json_param(const nlohmann::json& data, const std::string& name, Text2I param = Text2ImagePipeline::Scheduler::LMS_DISCRETE; else if (scheduler_type_str == "EulerDiscreteScheduler") param = Text2ImagePipeline::Scheduler::EULER_DISCRETE; + else if (scheduler_type_str == "FlowMatchEulerDiscreteScheduler") + param = Text2ImagePipeline::Scheduler::FLOW_MATCH_EULER_DISCRETE; else if (!scheduler_type_str.empty()) { OPENVINO_THROW("Unsupported value for 'prediction_type' ", scheduler_type_str); } diff --git a/src/cpp/src/text2image/stable_diffusion_3_pipeline.hpp b/src/cpp/src/text2image/stable_diffusion_3_pipeline.hpp new file mode 100644 index 0000000000..8999f95306 --- /dev/null +++ b/src/cpp/src/text2image/stable_diffusion_3_pipeline.hpp @@ -0,0 +1,618 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include "text2image/diffusion_pipeline.hpp" +#include "text2image/numpy_utils.hpp" +#include "utils.hpp" + +namespace { + +// src - input tensor with data for padding +// res - zeros tonsor with target shape +void padding_right(const float* src, float* res, const ov::Shape src_size, const ov::Shape res_size) { + OPENVINO_ASSERT(src_size[0] == res_size[0] && src_size[1] == res_size[1], + "Tensors for padding_right must have the same dimensions"); + + for (size_t i = 0; i < res_size[0]; ++i) { + for (size_t j = 0; j < res_size[1]; ++j) { + size_t offset_1 = (i * res_size[1] + j) * res_size[2]; + size_t offset_2 = (i * src_size[1] + j) * src_size[2]; + + std::memcpy(res + offset_1, src + offset_2, src_size[2] * sizeof(float)); + } + } +} + +ov::Tensor tensor_batch_copy(const ov::Tensor input, const size_t num_images_per_prompt, size_t batch_size_multiplier) { + ov::Shape repeated_shape = input.get_shape(); + repeated_shape[0] *= num_images_per_prompt; + ov::Tensor tensor_repeated(input.get_element_type(), repeated_shape); + + for (size_t n = 0; n < num_images_per_prompt; ++n) { + batch_copy(input, tensor_repeated, 0, n); + } + + return tensor_repeated; +} + +ov::Tensor split_2d_by_batch(const ov::Tensor input, size_t batch_num) { + ov::Tensor result(input.get_element_type(), {1, input.get_shape()[1]}); + + size_t shift = batch_num * input.get_shape()[1]; + std::memcpy(result.data(), input.data() + shift, result.get_shape()[1] * sizeof(float)); + + return result; +} + +ov::Tensor split_3d_by_batch(const ov::Tensor input, size_t batch_num) { + ov::Tensor result(input.get_element_type(), {1, input.get_shape()[1], input.get_shape()[2]}); + + size_t shift = batch_num * input.get_shape()[1] * input.get_shape()[2]; + std::memcpy(result.data(), + input.data() + shift, + result.get_shape()[1] * input.get_shape()[2] * sizeof(float)); + + return result; +} + +} // namespace + +namespace ov { +namespace genai { + +class Text2ImagePipeline::StableDiffusion3Pipeline : public Text2ImagePipeline::DiffusionPipeline { +public: + explicit StableDiffusion3Pipeline(const std::filesystem::path& root_dir) { + const std::filesystem::path 
model_index_path = root_dir / "model_index.json"; + std::ifstream file(model_index_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + set_scheduler(Scheduler::from_config(root_dir / "scheduler/scheduler_config.json")); + + const std::string text_encoder = data["text_encoder"][1].get(); + if (text_encoder == "CLIPTextModelWithProjection") { + m_clip_text_encoder_1 = std::make_shared(root_dir / "text_encoder"); + } else { + OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); + } + + const std::string text_encoder_2 = data["text_encoder_2"][1].get(); + if (text_encoder_2 == "CLIPTextModelWithProjection") { + m_clip_text_encoder_2 = std::make_shared(root_dir / "text_encoder_2"); + } else { + OPENVINO_THROW("Unsupported '", text_encoder_2, "' text encoder type"); + } + + // TODO: + // const std::string text_encoder_3 = data["text_encoder_3"][1].get(); + // if (text_encoder_2 == "T5EncoderModel") { + // m_t5_encoder_model = std::make_shared(root_dir + "/text_encoder_3"); + // } else { + // OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); + // } + + const std::string vae = data["vae"][1].get(); + if (vae == "AutoencoderKL") { + m_vae_decoder = std::make_shared(root_dir / "vae_decoder"); + } else { + OPENVINO_THROW("Unsupported '", vae, "' VAE decoder type"); + } + + const std::string transformer = data["transformer"][1].get(); + if (transformer == "SD3Transformer2DModel") { + m_transformer = std::make_shared(root_dir / "transformer"); + } else { + OPENVINO_THROW("Unsupported '", transformer, "'Transformer type"); + } + + // initialize generation config + initialize_generation_config(data["_class_name"].get()); + } + + StableDiffusion3Pipeline(const std::filesystem::path& root_dir, + const std::string& device, + const ov::AnyMap& properties) { + const std::filesystem::path model_index_path = root_dir / "model_index.json"; + std::ifstream file(model_index_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + set_scheduler(Scheduler::from_config(root_dir / "scheduler/scheduler_config.json")); + + const std::string text_encoder = data["text_encoder"][1].get(); + if (text_encoder == "CLIPTextModelWithProjection") { + m_clip_text_encoder_1 = + std::make_shared(root_dir / "text_encoder", device, properties); + } else { + OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); + } + + const std::string text_encoder_2 = data["text_encoder_2"][1].get(); + if (text_encoder_2 == "CLIPTextModelWithProjection") { + m_clip_text_encoder_2 = + std::make_shared(root_dir / "text_encoder_2", device, properties); + } else { + OPENVINO_THROW("Unsupported '", text_encoder_2, "' text encoder type"); + } + + // TODO: text_encoder_3 + + const std::string vae = data["vae"][1].get(); + if (vae == "AutoencoderKL") { + m_vae_decoder = std::make_shared(root_dir / "vae_decoder", device, properties); + } else { + OPENVINO_THROW("Unsupported '", vae, "' VAE decoder type"); + } + + const std::string transformer = data["transformer"][1].get(); + if (transformer == "SD3Transformer2DModel") { + m_transformer = std::make_shared(root_dir / "transformer", device, properties); + } else { + OPENVINO_THROW("Unsupported '", transformer, "'Transformer type"); + } + + // initialize generation config + initialize_generation_config(data["_class_name"].get()); + 
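The model_index.json lookups in both constructors read the second element of each entry because, in the diffusers layout this pipeline appears to follow, every component maps to a [library, class] pair. The fragment below is an assumed illustration only; the class names come from the checks above, while the library names ("transformers", "diffusers") are conventional placeholders:

    #include <iostream>
    #include <nlohmann/json.hpp>

    int main() {
        // Assumed model_index.json fragment, shown to motivate the data["..."][1] indexing.
        nlohmann::json data = nlohmann::json::parse(R"({
            "_class_name": "StableDiffusion3Pipeline",
            "text_encoder": ["transformers", "CLIPTextModelWithProjection"],
            "transformer": ["diffusers", "SD3Transformer2DModel"],
            "vae": ["diffusers", "AutoencoderKL"]
        })");
        std::cout << data["text_encoder"][1].get<std::string>() << std::endl;  // CLIPTextModelWithProjection
        return 0;
    }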
} + + StableDiffusion3Pipeline(const CLIPTextModelWithProjection& clip_text_model_1, + const CLIPTextModelWithProjection& clip_text_model_2, + const SD3Transformer2DModel& transformer, + const AutoencoderKL& vae_decoder) + : m_clip_text_encoder_1(std::make_shared(clip_text_model_1)), + m_clip_text_encoder_2(std::make_shared(clip_text_model_2)), + m_vae_decoder(std::make_shared(vae_decoder)), + m_transformer(std::make_shared(transformer)) {} + + void reshape(const int num_images_per_prompt, + const int height, + const int width, + const float guidance_scale) override { + check_image_size(height, width); + + const size_t batch_size_multiplier = + do_classifier_free_guidance(guidance_scale) ? 2 : 1; // Transformer accepts 2x batch in case of CFG + m_clip_text_encoder_1->reshape(batch_size_multiplier); + m_clip_text_encoder_2->reshape(batch_size_multiplier); + m_transformer->reshape(num_images_per_prompt * batch_size_multiplier, + height, + width, + m_clip_text_encoder_1->get_config().max_position_embeddings); + m_vae_decoder->reshape(num_images_per_prompt, height, width); + } + + void compile(const std::string& device, const ov::AnyMap& properties) override { + m_clip_text_encoder_1->compile(device, properties); + m_clip_text_encoder_2->compile(device, properties); + m_vae_decoder->compile(device, properties); + m_transformer->compile(device, properties); + } + + ov::Tensor generate(const std::string& positive_prompt, const ov::AnyMap& properties) override { + using namespace numpy_utils; + GenerationConfig generation_config = m_generation_config; + generation_config.update_generation_config(properties); + + const auto& transformer_config = m_transformer->get_config(); + const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale) + ? 2 + : 1; // Transformer accepts 2x batch in case of CFG + + const size_t vae_scale_factor = m_transformer->get_vae_scale_factor(); + + if (generation_config.height < 0) + generation_config.height = transformer_config.sample_size * vae_scale_factor; + if (generation_config.width < 0) + generation_config.width = transformer_config.sample_size * vae_scale_factor; + + check_inputs(generation_config); + + if (generation_config.random_generator == nullptr) { + uint32_t seed = time(NULL); + generation_config.random_generator = std::make_shared(seed); + } + + // Input tensors for transformer model + ov::Tensor prompt_embeds_inp, pooled_prompt_embeds_inp; + + // 1. Encode positive prompt: + std::string prompt_2_str = + generation_config.prompt_2 != std::nullopt ? *generation_config.prompt_2 : positive_prompt; + std::string prompt_3_str = + generation_config.prompt_3 != std::nullopt ? *generation_config.prompt_3 : positive_prompt; + + std::string negative_prompt_1_str = generation_config.negative_prompt; + std::string negative_prompt_2_str = generation_config.negative_prompt_2 != std::nullopt + ? *generation_config.negative_prompt_2 + : negative_prompt_1_str; + std::string negative_prompt_3_str = generation_config.negative_prompt_3 != std::nullopt + ? 
*generation_config.negative_prompt_3 + : negative_prompt_1_str; + + // text_encoder_1_output - stores positive and negative pooled_prompt_embeds + ov::Tensor text_encoder_1_output = + m_clip_text_encoder_1->infer(positive_prompt, + negative_prompt_1_str, + do_classifier_free_guidance(generation_config.guidance_scale)); + + // get positive pooled_prompt_embed_out + ov::Tensor pooled_prompt_embed_out = split_2d_by_batch(text_encoder_1_output, 1); + + // text_encoder_1_hidden_state - stores positive and negative prompt_embeds + size_t idx_hidden_state_1 = m_clip_text_encoder_1->get_config().num_hidden_layers + 1; + ov::Tensor text_encoder_1_hidden_state = m_clip_text_encoder_1->get_output_tensor(idx_hidden_state_1); + // get positive prompt_embed_out + ov::Tensor prompt_embed_out = split_3d_by_batch(text_encoder_1_hidden_state, 1); + + // text_encoder_2_output - stores positive and negative pooled_prompt_2_embeds + ov::Tensor text_encoder_2_output = + m_clip_text_encoder_2->infer(prompt_2_str, + negative_prompt_2_str, + do_classifier_free_guidance(generation_config.guidance_scale)); + + // get positive pooled_prompt_2_embed_out + ov::Tensor pooled_prompt_2_embed_out = split_2d_by_batch(text_encoder_2_output, 1); + + // text_encoder_2_hidden_state - stores positive and negative prompt_2_embeds + size_t idx_hidden_state_2 = m_clip_text_encoder_2->get_config().num_hidden_layers + 1; + ov::Tensor text_encoder_2_hidden_state = m_clip_text_encoder_2->get_output_tensor(idx_hidden_state_2); + // get positive prompt_2_embed_out + ov::Tensor prompt_2_embed_out = split_3d_by_batch(text_encoder_2_hidden_state, 1); + + ov::Tensor pooled_prompt_embed, prompt_embed, pooled_prompt_2_embed, prompt_2_embed; + if (generation_config.num_images_per_prompt == 1) { + pooled_prompt_embed = pooled_prompt_embed_out; + prompt_embed = prompt_embed_out; + pooled_prompt_2_embed = pooled_prompt_2_embed_out; + prompt_2_embed = prompt_2_embed_out; + } else { + pooled_prompt_embed = tensor_batch_copy(pooled_prompt_embed_out, + generation_config.num_images_per_prompt, + batch_size_multiplier); + prompt_embed = + tensor_batch_copy(prompt_embed_out, generation_config.num_images_per_prompt, batch_size_multiplier); + pooled_prompt_2_embed = tensor_batch_copy(pooled_prompt_2_embed_out, + generation_config.num_images_per_prompt, + batch_size_multiplier); + prompt_2_embed = + tensor_batch_copy(prompt_2_embed_out, generation_config.num_images_per_prompt, batch_size_multiplier); + } + + // concatenate hidden_states from two encoders + ov::Shape pr_emb_shape = prompt_embed.get_shape(); + ov::Shape pr_emb_2_shape = prompt_2_embed.get_shape(); + + ov::Shape clip_prompt_embeds_shape = {pr_emb_shape[0], pr_emb_shape[1], pr_emb_shape[2] + pr_emb_2_shape[2]}; + ov::Tensor clip_prompt_embeds(prompt_embed.get_element_type(), clip_prompt_embeds_shape); + + const float* pr_emb_1_data = prompt_embed.data(); + const float* pr_emb_2_data = prompt_2_embed.data(); + float* clip_prompt_embeds_data = clip_prompt_embeds.data(); + + concat_3d_by_rows(pr_emb_1_data, pr_emb_2_data, clip_prompt_embeds_data, pr_emb_shape, pr_emb_2_shape); + + // TODO: text_encoder_3 + ov::Shape t5_prompt_embed_shape = {generation_config.num_images_per_prompt, + m_clip_text_encoder_1->get_config().max_position_embeddings, + transformer_config.joint_attention_dim}; + + std::vector t5_prompt_embed( + t5_prompt_embed_shape[0] * t5_prompt_embed_shape[1] * t5_prompt_embed_shape[2], + 0.0f); + + // padding for clip_prompt_embeds + ov::Shape pad_embeds_shape = 
{clip_prompt_embeds_shape[0], + clip_prompt_embeds_shape[1], + t5_prompt_embed_shape[2]}; + + std::vector pad_embeds(pad_embeds_shape[0] * pad_embeds_shape[1] * pad_embeds_shape[2], 0.0f); + padding_right(clip_prompt_embeds_data, pad_embeds.data(), clip_prompt_embeds_shape, pad_embeds_shape); + + // prompt_embeds = torch.cat([pad_embeds, t5_prompt_embed], dim=-2) + ov::Shape prompt_embeds_shape = {pad_embeds_shape[0], + pad_embeds_shape[1] + t5_prompt_embed_shape[1], + pad_embeds_shape[2]}; + ov::Tensor prompt_embeds(ov::element::f32, prompt_embeds_shape); + float* prompt_embeds_data = prompt_embeds.data(); + concat_3d_by_cols(pad_embeds.data(), + t5_prompt_embed.data(), + prompt_embeds_data, + pad_embeds_shape, + t5_prompt_embed_shape); + + // pooled_prompt_embeds = torch.cat([pooled_prompt_embed, pooled_prompt_2_embed], dim=-1) + ov::Shape p_pr_emb_shape = pooled_prompt_embed.get_shape(); + ov::Shape p_pr_emb_2_shape = pooled_prompt_2_embed.get_shape(); + + const float* pooled_prompt_embed_data = pooled_prompt_embed.data(); + const float* pooled_prompt_2_embed_data = pooled_prompt_2_embed.data(); + + ov::Shape pooled_prompt_embeds_shape = {p_pr_emb_shape[0], p_pr_emb_shape[1] + p_pr_emb_2_shape[1]}; + ov::Tensor pooled_prompt_embeds(ov::element::f32, pooled_prompt_embeds_shape); + float* pooled_prompt_embeds_data = pooled_prompt_embeds.data(); + + concat_2d_by_rows(pooled_prompt_embed_data, + pooled_prompt_2_embed_data, + pooled_prompt_embeds_data, + p_pr_emb_shape, + p_pr_emb_2_shape); + // From steps above we'll use prompt_embeds and pooled_prompt_embeds tensors + + if (do_classifier_free_guidance(generation_config.guidance_scale)) { + // 2. Encode negative prompt: + + ov::Tensor negative_pooled_prompt_embed_out = split_2d_by_batch(text_encoder_1_output, 0); + ov::Tensor negative_prompt_embed_out = split_3d_by_batch(text_encoder_1_hidden_state, 0); + ov::Tensor negative_pooled_prompt_2_embed_out = split_2d_by_batch(text_encoder_2_output, 0); + ov::Tensor negative_prompt_2_embed_out = split_3d_by_batch(text_encoder_2_hidden_state, 0); + + ov::Tensor negative_pooled_prompt_embed, negative_prompt_embed, negative_pooled_prompt_2_embed, + negative_prompt_2_embed; + if (generation_config.num_images_per_prompt == 1) { + negative_pooled_prompt_embed = negative_pooled_prompt_embed_out; + negative_prompt_embed = negative_prompt_embed_out; + negative_pooled_prompt_2_embed = negative_pooled_prompt_2_embed_out; + negative_prompt_2_embed = negative_prompt_2_embed_out; + } else { + negative_pooled_prompt_embed = tensor_batch_copy(negative_pooled_prompt_embed_out, + generation_config.num_images_per_prompt, + batch_size_multiplier); + negative_prompt_embed = tensor_batch_copy(negative_prompt_embed_out, + generation_config.num_images_per_prompt, + batch_size_multiplier); + negative_pooled_prompt_2_embed = tensor_batch_copy(negative_pooled_prompt_2_embed_out, + generation_config.num_images_per_prompt, + batch_size_multiplier); + negative_prompt_2_embed = tensor_batch_copy(negative_prompt_2_embed_out, + generation_config.num_images_per_prompt, + batch_size_multiplier); + } + + // concatenate hidden_states from two encoders + ov::Shape n_pr_emb_1_shape = negative_prompt_embed.get_shape(); + ov::Shape n_pr_emb_2_shape = negative_prompt_2_embed.get_shape(); + + ov::Shape neg_clip_prompt_embeds_shape = {n_pr_emb_1_shape[0], + n_pr_emb_1_shape[1], + n_pr_emb_1_shape[2] + n_pr_emb_2_shape[2]}; + ov::Tensor neg_clip_prompt_embeds(prompt_embed.get_element_type(), neg_clip_prompt_embeds_shape); + + const float* 
neg_pr_emb_1_data = negative_prompt_embed.data(); + const float* neg_pr_emb_2_data = negative_prompt_2_embed.data(); + float* neg_clip_prompt_embeds_data = neg_clip_prompt_embeds.data(); + + concat_3d_by_rows(neg_pr_emb_1_data, + neg_pr_emb_2_data, + neg_clip_prompt_embeds_data, + n_pr_emb_1_shape, + n_pr_emb_2_shape); + + std::vector t5_neg_prompt_embed( + t5_prompt_embed_shape[0] * t5_prompt_embed_shape[1] * t5_prompt_embed_shape[2], + 0.0f); + + // padding for neg_clip_prompt_embeds + ov::Shape neg_pad_embeds_shape = {neg_clip_prompt_embeds_shape[0], + neg_clip_prompt_embeds_shape[1], + t5_prompt_embed_shape[2]}; + + std::vector neg_pad_embeds( + neg_pad_embeds_shape[0] * neg_pad_embeds_shape[1] * neg_pad_embeds_shape[2], + 0.0f); + + padding_right(neg_clip_prompt_embeds_data, + neg_pad_embeds.data(), + neg_clip_prompt_embeds_shape, + neg_pad_embeds_shape); + + // negative_prompt_embeds = torch.cat([negative_clip_prompt_embeds, t5_negative_prompt_embed], dim=-2) + ov::Shape neg_prompt_embeds_shape = {neg_pad_embeds_shape[0], + neg_pad_embeds_shape[1] + t5_prompt_embed_shape[1], + neg_pad_embeds_shape[2]}; + ov::Tensor neg_prompt_embeds(ov::element::f32, neg_prompt_embeds_shape); + float* neg_prompt_embeds_data = neg_prompt_embeds.data(); + + concat_3d_by_cols(neg_pad_embeds.data(), + t5_neg_prompt_embed.data(), + neg_prompt_embeds_data, + neg_pad_embeds_shape, + t5_prompt_embed_shape); + + // neg_pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], + // dim=-1) + ov::Shape neg_pooled_pr_emb_shape = negative_pooled_prompt_embed.get_shape(); + ov::Shape neg_pooled_pr_2_emb_shape = negative_pooled_prompt_2_embed.get_shape(); + + const float* neg_pooled_pr_emb_data = negative_pooled_prompt_embed.data(); + const float* neg_pooled_pr_2_emb_data = negative_pooled_prompt_2_embed.data(); + + ov::Shape neg_pooled_prompt_embeds_shape = {neg_pooled_pr_emb_shape[0], + neg_pooled_pr_emb_shape[1] + neg_pooled_pr_2_emb_shape[1]}; + ov::Tensor neg_pooled_prompt_embeds(ov::element::f32, neg_pooled_prompt_embeds_shape); + float* neg_pooled_prompt_embeds_data = neg_pooled_prompt_embeds.data(); + + concat_2d_by_rows(neg_pooled_pr_emb_data, + neg_pooled_pr_2_emb_data, + neg_pooled_prompt_embeds_data, + neg_pooled_pr_emb_shape, + neg_pooled_pr_2_emb_shape); + // From steps above we'll use neg_prompt_embeds and neg_pooled_prompt_embeds tensors + + // Fill in transformer inputs: concat positive and negative prompt_embeds + ov::Shape prompt_embeds_inp_shape = {prompt_embeds_shape[0] + neg_prompt_embeds_shape[0], + prompt_embeds_shape[1], + prompt_embeds_shape[2]}; + prompt_embeds_inp = ov::Tensor(ov::element::f32, prompt_embeds_inp_shape); + float* prompt_embeds_inp_data = prompt_embeds_inp.data(); + concat_3d_by_channels(neg_prompt_embeds_data, + prompt_embeds_data, + prompt_embeds_inp_data, + neg_prompt_embeds_shape, + prompt_embeds_shape); + + ov::Shape pooled_prompt_embeds_inp_shape = { + neg_pooled_prompt_embeds_shape[0] + pooled_prompt_embeds_shape[0], + pooled_prompt_embeds_shape[1]}; + + pooled_prompt_embeds_inp = ov::Tensor(ov::element::f32, pooled_prompt_embeds_inp_shape); + float* pooled_prompt_embeds_input_data = pooled_prompt_embeds_inp.data(); + concat_2d_by_channels(neg_pooled_prompt_embeds_data, + pooled_prompt_embeds_data, + pooled_prompt_embeds_input_data, + neg_pooled_prompt_embeds_shape, + pooled_prompt_embeds_shape); + } else { + // Fill in transformer inputs + prompt_embeds_inp = prompt_embeds; + pooled_prompt_embeds_inp = pooled_prompt_embeds; + 
} + + // 3. Prepare timesteps + m_scheduler->set_timesteps(generation_config.num_inference_steps); + std::vector<float> timesteps = m_scheduler->get_float_timesteps(); + + // 4. Set model inputs + m_transformer->set_hidden_states("encoder_hidden_states", prompt_embeds_inp); + m_transformer->set_hidden_states("pooled_projections", pooled_prompt_embeds_inp); + + // 5. Prepare latent variables + size_t num_channels_latents = m_transformer->get_config().in_channels; + ov::Shape latent_shape{generation_config.num_images_per_prompt, + num_channels_latents, + generation_config.height / vae_scale_factor, + generation_config.width / vae_scale_factor}; + + ov::Shape latent_shape_cfg = latent_shape; + latent_shape_cfg[0] *= batch_size_multiplier; + + ov::Tensor latent(ov::element::f32, latent_shape), latent_cfg(ov::element::f32, latent_shape_cfg); + std::generate_n(latent.data<float>(), latent.get_size(), [&]() -> float { + return generation_config.random_generator->next() * m_scheduler->get_init_noise_sigma(); + }); + + // 6. Denoising loop + ov::Tensor noisy_residual_tensor(ov::element::f32, {}); + ov::Tensor timestep; + + for (size_t inference_step = 0; inference_step < generation_config.num_inference_steps; ++inference_step) { + // concat the same latent twice along a batch dimension in case of CFG + if (batch_size_multiplier > 1) { + batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt); + batch_copy(latent, + latent_cfg, + 0, + generation_config.num_images_per_prompt, + generation_config.num_images_per_prompt); + + size_t timestep_size = generation_config.num_images_per_prompt * batch_size_multiplier; + timestep = ov::Tensor(ov::element::f32, {timestep_size}); + float* timestep_data = timestep.data<float>(); + std::fill_n(timestep_data, timestep_size, timesteps[inference_step]); + } else { + // just assign to save memory copy + latent_cfg = latent; + timestep = ov::Tensor(ov::element::f32, {1}, &timesteps[inference_step]); + } + + ov::Tensor noise_pred_tensor = m_transformer->infer(latent_cfg, timestep); + + ov::Shape noise_pred_shape = noise_pred_tensor.get_shape(); + noise_pred_shape[0] /= batch_size_multiplier; + noisy_residual_tensor.set_shape(noise_pred_shape); + + if (batch_size_multiplier > 1) { + // perform guidance + float* noisy_residual = noisy_residual_tensor.data<float>(); + const float* noise_pred_uncond = noise_pred_tensor.data<float>(); + const float* noise_pred_text = noise_pred_uncond + noisy_residual_tensor.get_size(); + + for (size_t i = 0; i < noisy_residual_tensor.get_size(); ++i) { + noisy_residual[i] = noise_pred_uncond[i] + + generation_config.guidance_scale * (noise_pred_text[i] - noise_pred_uncond[i]); + } + } else { + noisy_residual_tensor = noise_pred_tensor; + } + + auto scheduler_step_result = m_scheduler->step(noisy_residual_tensor, latent, inference_step); + latent = scheduler_step_result["latent"]; + } + + float* latent_data = latent.data<float>(); + for (size_t i = 0; i < latent.get_size(); ++i) { + latent_data[i] = (latent_data[i] / m_vae_decoder->get_config().scaling_factor) + + m_vae_decoder->get_config().shift_factor; + } + + return m_vae_decoder->infer(latent); + } + +private: + bool do_classifier_free_guidance(float guidance_scale) const { + return guidance_scale >= 1.0; + } + + void initialize_generation_config(const std::string& class_name) override { + assert(m_transformer != nullptr); + assert(m_vae_decoder != nullptr); + + const auto& transformer_config = m_transformer->get_config(); + const size_t vae_scale_factor = m_transformer->get_vae_scale_factor(); + +
m_generation_config.height = transformer_config.sample_size * vae_scale_factor; + m_generation_config.width = transformer_config.sample_size * vae_scale_factor; + + if (class_name == "StableDiffusion3Pipeline") { + m_generation_config.guidance_scale = 7.0f; + m_generation_config.num_inference_steps = 28; + } else { + OPENVINO_THROW("Unsupported class_name '", class_name, "'. Please contact OpenVINO GenAI developers"); + } + } + + void check_image_size(const int height, const int width) const override { + assert(m_transformer != nullptr); + const size_t vae_scale_factor = m_transformer->get_vae_scale_factor(); + const size_t patch_size = m_transformer->get_config().patch_size; + OPENVINO_ASSERT((height % (vae_scale_factor * patch_size) == 0 || height < 0) && + (width % (vae_scale_factor * patch_size) == 0 || width < 0), + "Both 'width' and 'height' must be divisible by ", + vae_scale_factor * patch_size); + } + + void check_inputs(const GenerationConfig& generation_config) const override { + check_image_size(generation_config.width, generation_config.height); + + const bool is_classifier_free_guidance = do_classifier_free_guidance(generation_config.guidance_scale); + const char* const pipeline_name = "Stable Diffusion 3"; + + OPENVINO_ASSERT( + generation_config.prompt_3 == std::nullopt || generation_config.negative_prompt_3 == std::nullopt, + "T5Encoder is not currently supported, so 'prompt_3' and 'negative_prompt_3' cannot be used."); + OPENVINO_ASSERT(is_classifier_free_guidance || generation_config.negative_prompt.empty(), + "Negative prompt is not used when guidance scale < 1.0"); + OPENVINO_ASSERT(is_classifier_free_guidance || generation_config.negative_prompt_2 == std::nullopt, + "Negative prompt 2 is not used when guidance scale < 1.0"); + OPENVINO_ASSERT(is_classifier_free_guidance || generation_config.negative_prompt_3 == std::nullopt, + "Negative prompt 3 is not used when guidance scale < 1.0"); + } + + std::shared_ptr<SD3Transformer2DModel> m_transformer; + std::shared_ptr<CLIPTextModelWithProjection> m_clip_text_encoder_1; + std::shared_ptr<CLIPTextModelWithProjection> m_clip_text_encoder_2; + // TODO: + // std::shared_ptr m_t5_encoder_model; + std::shared_ptr<AutoencoderKL> m_vae_decoder; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/stable_diffusion_pipeline.hpp b/src/cpp/src/text2image/stable_diffusion_pipeline.hpp index ed1508534f..6d0624adce 100644 --- a/src/cpp/src/text2image/stable_diffusion_pipeline.hpp +++ b/src/cpp/src/text2image/stable_diffusion_pipeline.hpp @@ -295,8 +295,8 @@ class Text2ImagePipeline::StableDiffusionPipeline : public Text2ImagePipeline::D } else if (!is_classifier_free_guidance) { OPENVINO_ASSERT(generation_config.negative_prompt.empty(), "Negative prompt is not used when guidance scale < 1.0"); } - OPENVINO_ASSERT(generation_config.negative_prompt_2.empty(), "Negative prompt 2 is not used by ", pipeline_name); - OPENVINO_ASSERT(generation_config.negative_prompt_3.empty(), "Negative prompt 3 is not used by ", pipeline_name); + OPENVINO_ASSERT(generation_config.negative_prompt_2 == std::nullopt, "Negative prompt 2 is not used by ", pipeline_name); + OPENVINO_ASSERT(generation_config.negative_prompt_3 == std::nullopt, "Negative prompt 3 is not used by ", pipeline_name); } std::shared_ptr<CLIPTextModel> m_clip_text_encoder; diff --git a/src/cpp/src/text2image/stable_diffusion_xl_pipeline.hpp b/src/cpp/src/text2image/stable_diffusion_xl_pipeline.hpp index 8a239f418f..8f8af97e52 100644 --- a/src/cpp/src/text2image/stable_diffusion_xl_pipeline.hpp +++ b/src/cpp/src/text2image/stable_diffusion_xl_pipeline.hpp @@
-358,8 +358,8 @@ class Text2ImagePipeline::StableDiffusionXLPipeline : public Text2ImagePipeline: OPENVINO_ASSERT(generation_config.prompt_3 == std::nullopt, "Prompt 3 is not used by ", pipeline_name); OPENVINO_ASSERT(is_classifier_free_guidance || generation_config.negative_prompt.empty(), "Negative prompt is not used when guidance scale < 1.0"); - OPENVINO_ASSERT(is_classifier_free_guidance || generation_config.negative_prompt_2.empty(), "Negative prompt 2 is not used when guidance scale < 1.0"); - OPENVINO_ASSERT(generation_config.negative_prompt_3.empty(), "Negative prompt 3 is not used by ", pipeline_name); + OPENVINO_ASSERT(is_classifier_free_guidance || generation_config.negative_prompt_2 == std::nullopt, "Negative prompt 2 is not used when guidance scale < 1.0"); + OPENVINO_ASSERT(generation_config.negative_prompt_3 == std::nullopt, "Negative prompt 3 is not used by ", pipeline_name); } ov::AnyMap properties_for_text_encoder(ov::AnyMap properties, const std::string& tensor_name_prefix) { diff --git a/src/cpp/src/text2image/text2image_pipeline.cpp b/src/cpp/src/text2image/text2image_pipeline.cpp index a50ca564af..7c92166171 100644 --- a/src/cpp/src/text2image/text2image_pipeline.cpp +++ b/src/cpp/src/text2image/text2image_pipeline.cpp @@ -3,6 +3,7 @@ #include "text2image/stable_diffusion_pipeline.hpp" #include "text2image/stable_diffusion_xl_pipeline.hpp" +#include "text2image/stable_diffusion_3_pipeline.hpp" #include #include @@ -57,6 +58,8 @@ void Text2ImagePipeline::GenerationConfig::update_generation_config(const ov::An void Text2ImagePipeline::GenerationConfig::validate() const { OPENVINO_ASSERT(guidance_scale >= 1.0f || negative_prompt.empty(), "Guidance scale < 1.0 ignores negative prompt"); + OPENVINO_ASSERT(guidance_scale >= 1.0f || negative_prompt_2 == std::nullopt, "Guidance scale < 1.0 ignores negative prompt"); + OPENVINO_ASSERT(guidance_scale >= 1.0f || negative_prompt_3 == std::nullopt, "Guidance scale < 1.0 ignores negative prompt"); } // @@ -71,6 +74,8 @@ Text2ImagePipeline::Text2ImagePipeline(const std::filesystem::path& root_dir) { m_impl = std::make_shared(root_dir); } else if (class_name == "StableDiffusionXLPipeline") { m_impl = std::make_shared(root_dir); + } else if (class_name == "StableDiffusion3Pipeline") { + m_impl = std::make_shared(root_dir); } else { OPENVINO_THROW("Unsupported text to image generation pipeline '", class_name, "'"); } @@ -84,6 +89,8 @@ Text2ImagePipeline::Text2ImagePipeline(const std::filesystem::path& root_dir, co m_impl = std::make_shared(root_dir, device, properties); } else if (class_name == "StableDiffusionXLPipeline") { m_impl = std::make_shared(root_dir, device, properties); + } else if (class_name == "StableDiffusion3Pipeline") { + m_impl = std::make_shared(root_dir, device, properties); } else { OPENVINO_THROW("Unsupported text to image generation pipeline '", class_name, "'"); } @@ -117,10 +124,10 @@ Text2ImagePipeline Text2ImagePipeline::latent_consistency_model( Text2ImagePipeline Text2ImagePipeline::stable_diffusion_xl( const std::shared_ptr& scheduler, - const CLIPTextModel& clip_text_model, - const CLIPTextModelWithProjection& clip_text_model_with_projection, - const UNet2DConditionModel& unet, - const AutoencoderKL& vae_decoder) { + const CLIPTextModel& clip_text_model, + const CLIPTextModelWithProjection& clip_text_model_with_projection, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae_decoder) { auto impl = std::make_shared(clip_text_model, clip_text_model_with_projection, unet, vae_decoder); 
assert(scheduler != nullptr); @@ -129,6 +136,20 @@ Text2ImagePipeline Text2ImagePipeline::stable_diffusion_xl( return Text2ImagePipeline(impl); } +Text2ImagePipeline Text2ImagePipeline::stable_diffusion_3( + const std::shared_ptr& scheduler, + const CLIPTextModelWithProjection& clip_text_model_1, + const CLIPTextModelWithProjection& clip_text_model_2, + const SD3Transformer2DModel& transformer, + const AutoencoderKL& vae_decoder){ + auto impl = std::make_shared(clip_text_model_1, clip_text_model_2, transformer, vae_decoder); + + assert(scheduler != nullptr); + impl->set_scheduler(scheduler); + + return Text2ImagePipeline(impl); +} + Text2ImagePipeline::GenerationConfig Text2ImagePipeline::get_generation_config() const { return m_impl->get_generation_config(); } diff --git a/src/cpp/src/visual_language/input_embedder.cpp b/src/cpp/src/visual_language/input_embedder.cpp index 28f3e8661a..2f1924fd84 100644 --- a/src/cpp/src/visual_language/input_embedder.cpp +++ b/src/cpp/src/visual_language/input_embedder.cpp @@ -130,6 +130,46 @@ class InputsEmbedder::IInputsEmbedder { } return encoded_input_ids; } + + /** + * @brief Unpads an image tensor of a padded and resized image. + * Used for packing image features of llava_next models. + * + * @param tensor An image tensor with a shape (embed_dim, height, width) + * @param original_size A size of original image + * @return An unpadded image tensor with a shape (embed_dim, new_height, new_width) + */ + + /** + * @brief Converts a vector of batched images ([NHWC]) into a vector of individual image tensors ([1HWC]). + * + * @param images A vector of tensors representing the images. Each tensor can have a shape of either [NHWC] or [HWC]. + * @return A vector of tensors where each tensor represents a single image with a shape of [1, H, W, C]. 
+ */ + std::vector to_single_image_tensors(const std::vector& images) { + std::vector single_image_tensors; + for (const auto& image : images) { + ov::Tensor reshaped_image = image; + ov::Shape image_shape = image.get_shape(); + switch (image_shape.size()) { + case 3: + reshaped_image.set_shape({1, image_shape.at(0), image_shape.at(1), image_shape.at(2)}); + break; + case 4: break; + default: OPENVINO_THROW("Input image must have [NHWC] or [HWC] layout"); + } + ov::Shape reshaped_image_shape = reshaped_image.get_shape(); + for (size_t batch_idx = 0; batch_idx < reshaped_image_shape.at(0); ++batch_idx) { + ov::Tensor single_image{ + ov::element::u8, + {1, reshaped_image_shape.at(1), reshaped_image_shape.at(2), reshaped_image_shape.at(3)}, + reshaped_image.data() + batch_idx * reshaped_image_shape.at(1) * reshaped_image_shape.at(2) * reshaped_image_shape.at(3) + }; + single_image_tensors.push_back(std::move(single_image)); + } + } + return single_image_tensors; + } }; class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { @@ -161,49 +201,35 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images) override { std::string images_prompt; std::vector embeds; - for (const ov::Tensor& rgb : images) { - ov::Tensor reshaped = rgb; - ov::Shape rgb_shape = rgb.get_shape(); - switch (rgb_shape.size()) { - case 3: - reshaped.set_shape({1, rgb_shape.at(0), rgb_shape.at(1), rgb_shape.at(2)}); - break; - case 4: break; - default: OPENVINO_THROW("Input image must have [NHWC] or [HWC] layout"); + + std::vector single_images = to_single_image_tensors(images); + + for (const ov::Tensor& image : single_images) { + EncodedImage encoded_image = m_vision_encoder.encode(image); + if (m_vlm_config.use_image_id) { + images_prompt += m_vlm_config.im_id_start + std::to_string(m_image_id) + m_vlm_config.im_id_end; + ++m_image_id; } - ov::Shape reshaped_shape = reshaped.get_shape(); - for (size_t batch_idx = 0; batch_idx < reshaped_shape.at(0); ++batch_idx) { - ov::Tensor single_image{ - ov::element::u8, - {1, reshaped_shape.at(1), reshaped_shape.at(2), reshaped_shape.at(3)}, - reshaped.data() + batch_idx * reshaped_shape.at(1) * reshaped_shape.at(1) * reshaped_shape.at(1) - }; - EncodedImage encoded_image = m_vision_encoder.encode(single_image); - if (m_vlm_config.use_image_id) { - images_prompt += m_vlm_config.im_id_start + std::to_string(m_image_id) + m_vlm_config.im_id_end; - ++m_image_id; - } - std::string unk64; - for (size_t idx = 0; idx < m_vlm_config.query_num; ++idx) { - unk64 += m_vlm_config.unk; - } - images_prompt += m_vlm_config.im_start + unk64 + m_vlm_config.im_end; - if (encoded_image.slices) { - ov::Shape slices_shape = encoded_image.slices.get_shape(); - for (size_t row_idx = 0; row_idx < slices_shape.at(0); ++row_idx) { - for (size_t col_idx = 0; col_idx < slices_shape.at(1); ++col_idx) { - images_prompt += m_vlm_config.slice_start + unk64 + m_vlm_config.slice_end; - } - images_prompt += '\n'; + std::string unk64; + for (size_t idx = 0; idx < m_vlm_config.query_num; ++idx) { + unk64 += m_vlm_config.unk; + } + images_prompt += m_vlm_config.im_start + unk64 + m_vlm_config.im_end; + if (encoded_image.slices) { + ov::Shape slices_shape = encoded_image.slices.get_shape(); + for (size_t row_idx = 0; row_idx < slices_shape.at(0); ++row_idx) { + for (size_t col_idx = 0; col_idx < slices_shape.at(1); ++col_idx) { + images_prompt += m_vlm_config.slice_start + unk64 + m_vlm_config.slice_end; 
} - } - if ('\n' != *(images_prompt.end() - 1)) { - // Image wasn't sliced, add \n to the end of image anyway. - // Strangely, \n isn't placed between . images_prompt += '\n'; } - embeds.push_back(std::move(encoded_image)); } + if ('\n' != *(images_prompt.end() - 1)) { + // Image wasn't sliced, add \n to the end of image anyway. + // Strangely, \n isn't placed between . + images_prompt += '\n'; + } + embeds.push_back(std::move(encoded_image)); } images_prompt += prompt; @@ -461,69 +487,86 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images) override { std::string image_token = m_vlm_config.im_start; - std::string formatted_prompt = images.empty() ? prompt : image_token + "\n" + prompt; - - // std::string chat_template_fallback = m_templated_chat_history + " USER: " + formatted_prompt + " ASSISTANT: "; - // chat_template_fallback = chat_template_fallback.erase(0, chat_template_fallback.find_first_not_of(' ')); - // Adapted from llava-1.5-7b-hf chat_template.json std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; - ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, chat_template_fallback); + + std::vector single_images = to_single_image_tensors(images); - if (images.empty()) { - return m_embedding.infer(input_ids); - } else { - OPENVINO_ASSERT(1 == images.size(), "Only a single image allowed"); - EncodedImage encoded_image = m_vision_encoder.encode(images.at(0)); - ov::Tensor image_embeds = encoded_image.resized_source; + std::string formatted_prompt; + std::vector image_embeds; + image_embeds.reserve(single_images.size()); - ov::Tensor text_embeds = m_embedding.infer(input_ids); + for (const auto& image : single_images) { + EncodedImage encoded_image = m_vision_encoder.encode(image); + image_embeds.push_back(std::move(encoded_image.resized_source)); + formatted_prompt += image_token + "\n"; + } + formatted_prompt += prompt; - ov::Tensor encoded_image_token = m_tokenizer.encode(image_token, ov::genai::add_special_tokens(false)).input_ids; - int64_t image_token_id = encoded_image_token.data()[encoded_image_token.get_size() - 1]; + ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, chat_template_fallback); + ov::Tensor text_embeds = m_embedding.infer(input_ids); - return merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, image_token_id); + if (images.empty()) { + return text_embeds; } + + ov::Tensor encoded_image_token = m_tokenizer.encode(m_vlm_config.im_start, ov::genai::add_special_tokens(false)).input_ids; + int64_t image_token_id = encoded_image_token.data()[encoded_image_token.get_size() - 1]; + + return merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, image_token_id); } protected: ov::Tensor merge_text_and_image_embeddings_llava( const ov::Tensor& input_ids, const ov::Tensor& text_embeds, - const ov::Tensor& image_embeds, + const std::vector& image_embeds, int64_t image_token_id ) { auto text_embeds_shape = text_embeds.get_shape(); - auto image_embeds_shape = image_embeds.get_shape(); + size_t text_embeds_seq_length = text_embeds_shape[1]; + size_t hidden_size = text_embeds_shape[2]; + const int64_t* input_ids_data = input_ids.data(); + const float* text_embeds_data = 
text_embeds.data(); + + size_t num_image_tokens = 0; + for (size_t s = 0; s < text_embeds_seq_length; ++s) { + if (input_ids_data[s] == image_token_id) { + num_image_tokens++; + } + } + auto num_images = image_embeds.size(); OPENVINO_ASSERT( - text_embeds_shape[2] == image_embeds_shape[2], - "Incompatible shapes between text_embeds and image_embeds" + num_image_tokens == num_images, + "Number of image tokens in input_ids different from num_images." ); - size_t text_embeds_seq_length = text_embeds_shape[1]; - size_t hidden_size = text_embeds_shape[2]; - size_t image_embeds_seq_length = image_embeds_shape[1]; - - size_t merged_seq_length = text_embeds_seq_length + (image_embeds_seq_length - 1); + size_t total_image_seq_length = 0; + for (const auto& single_image_embeds : image_embeds) { + OPENVINO_ASSERT( + text_embeds_shape[2] == single_image_embeds.get_shape().at(2), + "Incompatible shapes between text_embeds and image_embeds" + ); + total_image_seq_length += single_image_embeds.get_shape().at(1); + } + size_t merged_seq_length = text_embeds_seq_length + total_image_seq_length - num_image_tokens; ov::Tensor merged_embeds(text_embeds.get_element_type(), {BATCH_SIZE, merged_seq_length, hidden_size}); - - const int64_t* input_ids_data = input_ids.data(); - const float* text_embeds_data = text_embeds.data(); - const float* image_embeds_data = image_embeds.data(); float* merged_data = merged_embeds.data(); - size_t merged_idx = 0; + size_t image_idx = 0; for (size_t s = 0; s < text_embeds_seq_length; ++s) { if (input_ids_data[s] == image_token_id) { - for (size_t i = 0; i < image_embeds_seq_length; ++i) { - std::copy_n(image_embeds_data + i * hidden_size, - hidden_size, - merged_data + merged_idx * hidden_size); - merged_idx++; - } + const float* image_embeds_data = image_embeds[image_idx].data(); + size_t image_seq_length = image_embeds[image_idx].get_shape()[1]; + + std::copy_n(image_embeds_data, + image_seq_length * hidden_size, + merged_data + merged_idx * hidden_size); + merged_idx += image_seq_length; + image_idx++; } else { std::copy_n(text_embeds_data + s * hidden_size, hidden_size, @@ -547,35 +590,47 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images) override { std::string image_token = m_vlm_config.im_start; - std::string formatted_prompt = images.empty() ? 
prompt : image_token + "\n" + prompt; - // Adapted from llava-1.5-7b-hf chat_template.json std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; - ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, chat_template_fallback); - if (images.empty()) { - return m_embedding.infer(input_ids); - } else { - OPENVINO_ASSERT(1 == images.size(), "Only a single image allowed"); - EncodedImage encoded_image = m_vision_encoder.encode(images.at(0)); + std::vector single_images = to_single_image_tensors(images); - // Create image_newline tensor with data from config - size_t embed_dim = encoded_image.resized_source.get_shape().at(2); - ov::Tensor image_newline(encoded_image.resized_source.get_element_type(), {embed_dim}); - float* image_newline_data = image_newline.data(); - std::copy(m_vlm_config.image_newline.begin(), m_vlm_config.image_newline.end(), image_newline_data); + std::string formatted_prompt; + std::vector image_embeds; + image_embeds.reserve(single_images.size()); + + ov::Tensor image_newline; - ImageSize original_image_size{images.at(0).get_shape().at(1), images.at(0).get_shape().at(2)}; // [height, width] + for (const auto& image : single_images) { + EncodedImage encoded_image = m_vision_encoder.encode(image); - ov::Tensor image_features = pack_image_features_llava_next(encoded_image, original_image_size, image_newline); + if (!image_newline) { + size_t embed_dim = encoded_image.resized_source.get_shape().at(2); + image_newline = ov::Tensor(encoded_image.resized_source.get_element_type(), {embed_dim}); + float* image_newline_data = image_newline.data(); + std::copy(m_vlm_config.image_newline.begin(), m_vlm_config.image_newline.end(), image_newline_data); + } - ov::Tensor text_embeds = m_embedding.infer(input_ids); + ImageSize original_image_size{image.get_shape().at(1), image.get_shape().at(2)}; // [height, width] - ov::Tensor encoded_image_token = m_tokenizer.encode(image_token, ov::genai::add_special_tokens(false)).input_ids; - int64_t image_token_id = encoded_image_token.data()[encoded_image_token.get_size() - 1]; + ov::Tensor packed_features = pack_image_features_llava_next(encoded_image, original_image_size, image_newline); - return merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_features, image_token_id); + image_embeds.push_back(std::move(packed_features)); + formatted_prompt += image_token + "\n"; } + formatted_prompt += prompt; + + ov::Tensor input_ids = get_encoded_input_ids(formatted_prompt, chat_template_fallback); + ov::Tensor text_embeds = m_embedding.infer(input_ids); + + if (images.empty()) { + return text_embeds; + } + + ov::Tensor encoded_image_token = m_tokenizer.encode(m_vlm_config.im_start, ov::genai::add_special_tokens(false)).input_ids; + int64_t image_token_id = encoded_image_token.data()[encoded_image_token.get_size() - 1]; + + return merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, image_token_id); } private: diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 5f2b9232a8..0b6b169f18 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -657,10 +657,13 @@ EncodedImage VisionEncoder::encode_llava(const ov::Tensor& image, const Processo 
m_vision_encoder.set_tensor("pixel_values", pixel_values); m_vision_encoder.infer(); - ov::Tensor image_features = m_vision_encoder.get_output_tensor(); + const ov::Tensor& infer_output = m_vision_encoder.get_output_tensor(); + ov::Tensor image_features(infer_output.get_element_type(), infer_output.get_shape()); + std::memcpy(image_features.data(), infer_output.data(), infer_output.get_byte_size()); + ImageSize resized_source_size{config.crop_size_height / config.patch_size, config.crop_size_width / config.patch_size}; - return {image_features, resized_source_size}; + return {std::move(image_features), resized_source_size}; } EncodedImage VisionEncoder::encode_llava_next(const ov::Tensor& image, const ProcessorConfig& config) { @@ -669,7 +672,10 @@ EncodedImage VisionEncoder::encode_llava_next(const ov::Tensor& image, const Pro m_vision_encoder.set_tensor("pixel_values", pixel_values); m_vision_encoder.infer(); - ov::Tensor image_features = m_vision_encoder.get_output_tensor(); + const ov::Tensor& infer_output = m_vision_encoder.get_output_tensor(); + ov::Tensor image_features(infer_output.get_element_type(), infer_output.get_shape()); + std::memcpy(image_features.data(), infer_output.data(), infer_output.get_byte_size()); + ImageSize resized_source_size{config.crop_size_height / config.patch_size, config.crop_size_width / config.patch_size}; // Gen number of patches @@ -679,7 +685,7 @@ EncodedImage VisionEncoder::encode_llava_next(const ov::Tensor& image, const Pro int num_patches_h = best_resolution.second / config.size_shortest_edge; EncodedImage encoded_image; - encoded_image.resized_source = image_features; + encoded_image.resized_source = std::move(image_features); encoded_image.resized_source_size = resized_source_size; encoded_image.patches_grid = {num_patches_h, num_patches_w}; return encoded_image; @@ -691,8 +697,11 @@ EncodedImage VisionEncoder::encode_internvl(const ov::Tensor& image, const Proce m_vision_encoder.set_tensor("pixel_values", pixel_values); m_vision_encoder.infer(); - ov::Tensor image_features = m_vision_encoder.get_output_tensor(); + const ov::Tensor& infer_output = m_vision_encoder.get_output_tensor(); + ov::Tensor image_features(infer_output.get_element_type(), infer_output.get_shape()); + std::memcpy(image_features.data(), infer_output.data(), infer_output.get_byte_size()); + ImageSize resized_source_size{config.crop_size_height / config.patch_size, config.crop_size_width / config.patch_size}; - return {image_features, resized_source_size}; + return {std::move(image_features), resized_source_size}; } diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md index 6412ec39d5..e1f2483d8e 100644 --- a/src/docs/SUPPORTED_MODELS.md +++ b/src/docs/SUPPORTED_MODELS.md @@ -193,6 +193,14 @@ The pipeline can work with other similar topologies produced by `optimum-intel` + + Stable Diffusion 3 + + + + diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py index 527dccf55a..774175dd95 100644 --- a/src/python/openvino_genai/__init__.py +++ b/src/python/openvino_genai/__init__.py @@ -40,5 +40,4 @@ Generator, CppStdGenerator, draft_model - ) diff --git a/tests/cpp/speculative_decoding.cpp b/tests/cpp/speculative_decoding.cpp index 08ce6aaf66..bb10c2cc8f 100644 --- a/tests/cpp/speculative_decoding.cpp +++ b/tests/cpp/speculative_decoding.cpp @@ -28,6 +28,7 @@ class CBForSDTest : public testing::Test, public ov::genai::ContinuousBatchingPi std::lock_guard lock{m_awaiting_requests_mutex}; 
m_awaiting_requests.push_back(sequence_group); } + pull_awaiting_requests(); return std::make_shared(sequence_group->get_generation_stream(), sampling_params); }; diff --git a/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp b/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp index 7c3e75eafa..27c64d04a8 100644 --- a/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp +++ b/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp @@ -256,8 +256,15 @@ class GenerationInfoCollector { this->start_time = start_time; } - void add_generation(ov::genai::ContinuousBatchingPipeline* pipe, Dataset* dataset, size_t request_id) { - ov::genai::GenerationHandle generation_handle = pipe->add_request(request_id, dataset->m_prompts[request_id], dataset->m_sampling_params[request_id]); + void add_generation(ov::genai::ContinuousBatchingPipeline* pipe, Dataset* dataset, size_t request_id, bool is_speculative_decoding_enabled) { + auto sampling_params = dataset->m_sampling_params[request_id]; + if (is_speculative_decoding_enabled) { + // to enable static speculative decoding + sampling_params.num_assistant_tokens = 5; + // to enable dynamic speculative decoding + // sampling_params.assistant_confidence_threshold = 0.4f; + } + ov::genai::GenerationHandle generation_handle = pipe->add_request(request_id, dataset->m_prompts[request_id], sampling_params); std::lock_guard lock(mutex); generations_info.emplace_back(std::move(generation_handle), dataset->m_input_lens[request_id]); } @@ -306,7 +313,7 @@ class GenerationInfoCollector { } }; -void trafficSimulator(ov::genai::ContinuousBatchingPipeline* pipe, Dataset* dataset, std::string request_rate, GenerationInfoCollector* generation_info_collector) { +void trafficSimulator(ov::genai::ContinuousBatchingPipeline* pipe, Dataset* dataset, std::string request_rate, GenerationInfoCollector* generation_info_collector, bool is_speculative_decoding_enabled) { double numeric_request_rate; std::random_device rd; std::mt19937 gen(rd()); @@ -333,7 +340,7 @@ void trafficSimulator(ov::genai::ContinuousBatchingPipeline* pipe, Dataset* data generation_info_collector->set_start_time(std::chrono::steady_clock::now()); for (size_t request_id = 0; request_id < dataset->size(); ++request_id) { std::cout << "Traffic thread adding request to the queue..." 
<< std::endl; - generation_info_collector->add_generation(pipe, dataset, request_id); + generation_info_collector->add_generation(pipe, dataset, request_id, is_speculative_decoding_enabled); if (numeric_request_rate > 0) std::this_thread::sleep_for(std::chrono::milliseconds(int(distribution(gen) * 1000))); } @@ -434,6 +441,7 @@ int main(int argc, char* argv[]) try { ("b,max_batch_size", "A maximum number of batched tokens", cxxopts::value()->default_value("256")) ("dynamic_split_fuse", "Whether to use dynamic split-fuse or vLLM scheduling", cxxopts::value()->default_value("true")) ("m,model", "Path to model and tokenizers base directory", cxxopts::value()->default_value(".")) + ("draft_model", "Path to assistant model directory", cxxopts::value()->default_value("")) ("dataset", "Path to dataset .json file", cxxopts::value()->default_value("./ShareGPT_V3_unfiltered_cleaned_split.json")) ("max_input_len", "Max input length take from dataset", cxxopts::value()->default_value("1024")) ("max_output_len", "Max output length", cxxopts::value()->default_value("2048")) @@ -462,6 +470,7 @@ int main(int argc, char* argv[]) try { const size_t max_batch_size = result["max_batch_size"].as(); const bool dynamic_split_fuse = result["dynamic_split_fuse"].as(); const std::string models_path = result["model"].as(); + const std::string draft_model_path = result["draft_model"].as(); const std::string dataset_path = result["dataset"].as(); const size_t max_input_len = result["max_input_len"].as(); const size_t max_output_len = result["max_output_len"].as(); @@ -471,6 +480,8 @@ int main(int argc, char* argv[]) try { const size_t cache_size = result["cache_size"].as(); const bool use_cache_eviction = result["use_cache_eviction"].as(); + bool is_speculative_decoding_enabled = !draft_model_path.empty(); + // Create requests for generation Dataset dataset = filtered_dataset(models_path, dataset_path, num_prompts, max_input_len, max_output_len); @@ -509,6 +520,9 @@ int main(int argc, char* argv[]) try { std::cout << "\tPlugin configuration JSON: " << device_config << std::endl; ov::AnyMap device_config_map = {}; + if (is_speculative_decoding_enabled) { + device_config_map.insert({ ov::genai::draft_model(draft_model_path) }); + } if (!parse_plugin_config_string(device_config, device_config_map)) { std::cout << "ERROR: Wrong json parameter in device_config." 
<< std::endl; return EXIT_FAILURE; @@ -524,14 +538,14 @@ std::atomic<bool> finishGenerationThread{false}; if (request_rate == "inf") { - std::thread trafficSimulatorThread(trafficSimulator, &pipe, &dataset, request_rate, &generation_info_collector); + std::thread trafficSimulatorThread(trafficSimulator, &pipe, &dataset, request_rate, &generation_info_collector, is_speculative_decoding_enabled); trafficSimulatorThread.join(); } std::thread lmmEngineThread(llmEngineLoop, &pipe, &dataset, &finishGenerationThread); std::thread statisticsReporterThread(statisticsReporter, &generation_info_collector, num_prompts); if (request_rate != "inf") { - std::thread trafficSimulatorThread(trafficSimulator, &pipe, &dataset, request_rate, &generation_info_collector); + std::thread trafficSimulatorThread(trafficSimulator, &pipe, &dataset, request_rate, &generation_info_collector, is_speculative_decoding_enabled); trafficSimulatorThread.join(); } statisticsReporterThread.join(); diff --git a/tools/who_what_benchmark/README.md b/tools/who_what_benchmark/README.md index b5cad666c8..012782bad3 100644 --- a/tools/who_what_benchmark/README.md +++ b/tools/who_what_benchmark/README.md @@ -1,7 +1,12 @@ -# Simple Accuracy Benchmark for Generative AI models +# Who What Benchmark (WWB) - Simple Accuracy Benchmarking Tool for Generative AI models +The main idea of the benchmark is to estimate a similarity score between embeddings computed for data generated by two models, for example, a baseline model and its optimized version. In general, the data can also be produced by the same model run with different inference tools, so the similarity score shows how much the outputs differ overall. + +WWB provides default datasets for the supported use cases. However, it is relatively easy to plug in and use custom datasets. + ## Features +* Command-line interface for Hugging Face and OpenVINO models, plus a Python API to support broader inference backends. * Simple and quick accuracy test for compressed, quantized, pruned, distilled LLMs. It works with any model that supports HuggingFace Transformers text generation API including: * HuggingFace Transformers compressed models via [Bitsandbytes](https://huggingface.co/docs/transformers/main_classes/quantization#transformers.BitsAndBytesConfig) * [GPTQ](https://huggingface.co/docs/transformers/main_classes/quantization#transformers.GPTQConfig) via HuggingFace API @@ -11,8 +16,46 @@ * Validation of text-to-image pipelines. Computes similarity score between generated images: * Supports Diffusers library and Optimum-Intel via `Text2ImageEvaluator` class. -The main idea is to compare similarity of text generation between baseline and optimized LLMs. +## Installation +Install WWB and its requirements from source using `pip` or any other package manager.
For example, + +* `python -m venv eval_env` +* `source eval_env/bin/activate` +* `pip install -r requirements.txt` +* `pip install openvino-genai` to validate with OpenVINO GenAI API +* `pip install .` + +## Usage +### Compare Text-generation Models (LLMs) +```sh +# Collect ground truth from the baseline Hugging Face Transformers model +wwb --base-model microsoft/Phi-3-mini-4k-instruct --gt-data gt.csv --model-type text --hf + +# Convert model to Optimum-Intel (quantized to 8-bit by default) +optimum-cli export openvino -m microsoft/Phi-3-mini-4k-instruct phi-3-openvino + +# Measure similarity metric for Optimum-OpenVINO inference backend +wwb --target-model phi-3-openvino --gt-data gt.csv --model-type text +# Measure similarity metric for OpenVINO GenAI inference backend +wwb --target-model phi-3-openvino --gt-data gt.csv --model-type text --genai +``` + +### Compare Text-to-image Models (Diffusers) +```sh +# Export FP16 model to OpenVINO +optimum-cli export openvino -m SimianLuo/LCM_Dreamshaper_v7 --weight-format fp16 sd-lcm-fp16 +# Export model with 8-bit quantized weights to OpenVINO +optimum-cli export openvino -m SimianLuo/LCM_Dreamshaper_v7 --weight-format int8 sd-lcm-int8 +# Collect the references and save the mapping in the .json file. +# Reference images will be stored in the "reference" subfolder next to the .json file. +wwb --base-model sd-lcm-fp16 --gt-data lcm_test/sd_xl.json --model-type text-to-image +# Compute the metric +# Target images will be stored in the "target" subfolder next to the .json file. +wwb --target-model sd-lcm-int8 --gt-data lcm_test/sd_xl.json --model-type text-to-image +``` + +### API The API provides a way to investigate the worst generated text examples. ```python @@ -49,13 +92,7 @@ prompts = val["text"] metrics_per_prompt, metrics = evaluator.score(optimized_model, test_data=prompts) ``` -### Installing - -* python -m venv eval_env -* source eval_env/bin/activate -* pip install -r requirements.txt - -### CLI example for text-generation models +### Advanced CLI usage ```sh wwb --help @@ -92,18 +129,6 @@ wwb --base-model meta-llama/Llama-2-7b-chat-hf --gt-data llama_2_7b_wwb_gt.csv - wwb --base-model meta-llama/Llama-2-7b-chat-hf --gt-data llama_2_7b_wwb_gt.csv --hf ``` -### Example of Stable Diffusion comparison -```sh -# Export FP16 model -optimum-cli export openvino -m SimianLuo/LCM_Dreamshaper_v7 --weight-format fp16 sd-lcm-fp16 -# Export INT8 WOQ model -optimum-cli export openvino -m SimianLuo/LCM_Dreamshaper_v7 --weight-format int8 sd-lcm-int8 -# Collect the references -wwb --base-model sd-lcm-fp16 --gt-data lcm_test/sd_xl.json --model-type text-to-image -# Compute the metric -wwb --target-model sd-lcm-int8 --gt-data lcm_test/sd_xl.json --model-type text-to-image -``` - ### Supported metrics * `similarity` - averaged similarity measured by a neural network trained for sentence embeddings. The best is 1.0, the minimum is 0.0, higher-better.
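The snippet below is an illustrative sketch of the idea behind the `similarity` metric, not WWB's internal implementation: both generations are embedded with the default `--data-encoder` model and compared by cosine similarity. The example strings are invented for the illustration.

```python
# Hypothetical illustration of the `similarity` metric: embed two generations
# with a sentence-embedding model and compare them by cosine similarity.
from sentence_transformers import SentenceTransformer, util

# Same encoder as the default --data-encoder value
encoder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

base_output = "The Eiffel Tower is located in Paris, France."      # e.g. from base_model
target_output = "The Eiffel Tower can be found in Paris, France."  # e.g. from target_model

embeddings = encoder.encode([base_output, target_output], convert_to_tensor=True)
score = util.cos_sim(embeddings[0], embeddings[1]).item()
print(f"similarity: {score:.3f}")  # close to 1.0 means the two outputs agree
```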
diff --git a/tools/who_what_benchmark/requirements.txt b/tools/who_what_benchmark/requirements.txt index 0ffa906756..bea6453c6b 100644 --- a/tools/who_what_benchmark/requirements.txt +++ b/tools/who_what_benchmark/requirements.txt @@ -7,3 +7,4 @@ openvino-tokenizers pandas>=2.0.3 numpy>=1.23.5 tqdm>=4.66.1 +diffusers diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py index 6e65e477b9..4d8c52fe21 100644 --- a/tools/who_what_benchmark/whowhatbench/wwb.py +++ b/tools/who_what_benchmark/whowhatbench/wwb.py @@ -251,61 +251,61 @@ def parse_args(): parser.add_argument( "--base-model", default=None, - help="Model to ground truth generation.", + help="Model for ground truth generation.", ) parser.add_argument( "--target-model", default=None, - help="Model to comparison with base_model. Usually it is compressed, quantized version of base_model.", + help="Model to compare against the base_model. Usually it is a compressed or quantized version of base_model.", ) parser.add_argument( "--tokenizer", default=None, - help="Tokenizer for divergency metric. If not defined then will be load from base_model or target_model.", + help="Tokenizer for divergency metric. If not provided, it will be loaded from base_model or target_model.", ) parser.add_argument( "--gt-data", default=None, - help="CSV file with base_model generation. If defined and exists then base_model will not used." - "I defined and not exists them will be generated by base_model evaluation.", + help="CSV file containing ground-truth outputs from base_model. If the file is defined and exists, base_model will not be used." + " If the file does not exist, it will be generated by base_model evaluation.", ) parser.add_argument( "--model-type", type=str, choices=["text", "text-to-image", "visual-text"], default="text", - help="Indicated the model type, e.g. 'text' - for causal text generation, 'text-to-image' - for image generation.", + help="Indicates the model type: 'text' for causal text generation, 'text-to-image' for image generation.", ) parser.add_argument( "--data-encoder", type=str, default="sentence-transformers/all-mpnet-base-v2", help="Model for measurement of similarity between base_model and target_model." - "By default it is sentence-transformers/all-mpnet-base-v2," - "but for Chinese LLMs better to use sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2.", + " By default it is sentence-transformers/all-mpnet-base-v2," + " but for Chinese LLMs it is better to use sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2.", ) parser.add_argument( "--dataset", type=str, default=None, help="Name of the dataset with prompts. The interface for dataset is load_dataset from datasets library." - "Please provide this argument in format path,name (for example wikitext,wikitext-2-v1)." - "If None then internal list of prompts will be used.", + " Please provide this argument in format path,name (for example wikitext,wikitext-2-v1)." + " If None, the internal list of prompts will be used.", ) parser.add_argument( "--dataset-field", type=str, default="text", help="The name of field in dataset for prompts. For example question or context in squad." - "Will be used only if dataset is defined.", + " Will be used only if dataset is defined.", ) parser.add_argument( "--split", type=str, default=None, help="Split of prompts from dataset (for example train, validation, train[:32])."
- "Will be used only if dataset is defined.", + " Will be used only if dataset is defined.", ) parser.add_argument( "--output", @@ -377,8 +377,12 @@ def parse_args(): def check_args(args): - assert not (args.base_model is None and args.target_model is None) - assert not (args.base_model is None and args.gt_data is None) + if args.base_model is None and args.target_model is None: + raise ValueError("Wether --base-model or --target-model should be provided") + if args.base_model is None and args.gt_data is None: + raise ValueError("Wether --base-model or --gt-data should be provided") + if args.target_model is None and args.gt_data is None: + raise ValueError("Wether --target-model or --gt-data should be provided") def load_tokenizer(args):