diff --git a/.github/ISSUE_TEMPLATE/good_first_issue.yml b/.github/ISSUE_TEMPLATE/good_first_issue.yml deleted file mode 100644 index f0192d1598..0000000000 --- a/.github/ISSUE_TEMPLATE/good_first_issue.yml +++ /dev/null @@ -1,67 +0,0 @@ -name: Good First Issue -description: Create a Good First Issue for new contributors. -title: "[Good First Issue]: " -labels: ["good first issue"] -body: - - type: textarea - id: context - attributes: - label: Context - description: | - Let the contributors know what your component is responsible for, - what's the importance of the change and why it's needed. - Keep in mind the Good First Issue is for new contributors. - placeholder: What is it and why is it important? - validations: - required: true - - - type: textarea - id: todo_list - attributes: - label: What needs to be done? - description: | - Be as verbose as possible, provide a TODO list if viable. - validations: - required: true - - - type: textarea - id: example_prs - attributes: - label: Example Pull Requests - description: | - Provide example Pull requests, if there are any. - validations: - required: false - - - type: textarea - id: resources - attributes: - label: Resources - description: | - Any materials related to the task, such as operator specifications, - discussions, guides. - value: | - - [Contribution guide - start here!](https://github.com/openvinotoolkit/openvino/blob/master/CONTRIBUTING.md) - - [Intel DevHub Discord channel](https://discord.gg/7pVRxUwdWG) - engage in discussions, ask questions and talk to OpenVINO developers - - [How to link your Pull Request to an issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue#manually-linking-a-pull-request-to-an-issue-using-the-pull-request-sidebar) - validations: - required: true - - - type: textarea - id: contact_points - attributes: - label: Contact points - description: | - People who can be asked questions about the task. 
- placeholder: GitHub users - validations: - required: true - - - type: textarea - id: ticket - attributes: - label: Ticket - description: | - Provide the ticket number, if available. - validations: - required: false diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 85614b7032..f908b5aceb 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -17,6 +17,6 @@ updates: schedule: interval: "weekly" - package-ecosystem: "pip" - directory: "text_generation/causal_lm/cpp/" + directory: "samples/" schedule: interval: "weekly" diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 76fea83d4b..2263277b68 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -7,11 +7,15 @@ on: - samples/** - thirdparty/openvino_tokenizers - "!**.md" -permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions +permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +env: + l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16161-d253f4fd89c/l_openvino_toolkit_ubuntu20_2024.4.0.dev20240730_x86_64.tgz + m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16161-d253f4fd89c/m_openvino_toolkit_macos_12_6_2024.4.0.dev20240730_x86_64.tgz + w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16161-d253f4fd89c/w_openvino_toolkit_windows_2024.4.0.dev20240730_x86_64.zip jobs: cpp-multinomial-greedy_causal_lm-ubuntu: runs-on: ubuntu-20.04-8-cores @@ -25,13 +29,13 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl 
https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240529_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -52,7 +56,11 @@ jobs: cpp-beam_search_causal_lm-ubuntu: strategy: matrix: - executable: [./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm, python ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py] + executable: + [ + ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm, + python ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py, + ] runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 @@ -64,13 +72,13 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240529_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + 
curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -180,9 +188,11 @@ jobs: predictions = predictions[:idx] + predictions[idx + len(ref):] " echo "Multi prompt" passed + cpp-greedy_causal_lm-windows: - if: false runs-on: windows-latest + env: + PYTHONIOENCODING: "utf8" defaults: run: shell: cmd @@ -193,17 +203,15 @@ jobs: - uses: actions/setup-python@v4 with: python-version: 3.8 - - name: Install OpenVINO + - run: curl --output ov.zip ${{ env.w_ov_link }} + - run: unzip -d ov ov.zip + - run: dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}" shell: bash - run: | - curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/windows/w_openvino_toolkit_windows_2024.2.0.dev20240529_x86_64.zip - unzip ov.zip - - run: mklink /D ov w_openvino_toolkit_windows_2024.2.0.dev20240529_x86_64 - name: Download, convert and build run: | call .\ov\setupvars.bat - python -m pip install --upgrade-strategy eager -r 
./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -224,10 +232,10 @@ jobs: echo predictions = predictions[:idx] + predictions[idx + len(ref):] >> ref.py - run: python ref.py - run: > - set PATH=".\build\openvino_genai\;%PATH%" + set PATH=.\build\openvino_genai\;%PATH% && set "PYTHONPATH=./build/" && call .\ov\setupvars.bat - && samples\python\greedy_causal_lm\greedy_causal_lm.py .\TinyLlama-1.1B-Chat-v1.0\ 69 > .\py.txt + && python samples\python\greedy_causal_lm\greedy_causal_lm.py .\TinyLlama-1.1B-Chat-v1.0\ 69 > .\py.txt - run: fc .\cpp.txt .\py.txt cpp-beam_search_causal_lm-Qwen-7B-Chat: @@ -242,13 +250,13 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240529_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url 
https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -269,13 +277,13 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240529_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat cmake -DCMAKE_BUILD_TYPE=Release 
-S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -297,13 +305,13 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240529_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j 15 @@ -325,13 +333,13 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240529_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url 
https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -353,13 +361,13 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240529_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b optimum-cli export openvino 
--trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ @@ -378,7 +386,6 @@ jobs: " echo "Alan Turing was a" passed - cpp-prompt_lookup_decoding_lm-ubuntu: runs-on: ubuntu-20.04-16-cores steps: @@ -391,13 +398,13 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240529_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -435,13 +442,13 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240529_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo 
./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5 cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j 15 @@ -470,7 +477,7 @@ jobs: && export PYTHONPATH=./build/:$PYTHONPATH && timeout 50s samples/python/greedy_causal_lm/greedy_causal_lm.py ./phi-1_5/ "Alan Turing was a" | diff ./pred_greedy.txt - - + cpp-greedy_causal_lm-redpajama-3b-chat: runs-on: ubuntu-20.04-4-cores steps: @@ -483,13 +490,13 @@ jobs: - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240529_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - name: Download, convert and build run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url 
https://storage.openvinotoolkit.org/simple/wheels/pre-release + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j @@ -520,3 +527,187 @@ jobs: && export PYTHONPATH=./build/:$PYTHONPATH && timeout 50s samples/python/greedy_causal_lm/greedy_causal_lm.py ./redpajama-3b-chat/ "Alan Turing was a" | diff ./pred_greedy.txt - + + cpp-chat_sample-ubuntu: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + - name: Install OpenVINO + run: | + mkdir ./ov/ + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz + sudo ./ov/install_dependencies/install_openvino_dependencies.sh + - name: Download, convert and build + run: | + source ./ov/setupvars.sh + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Compare + run: | + source ./ov/setupvars.sh + printf 'What is 2 + 2?\nWhat is the previous answer?\nAdd 1 to it.\nSubtract 5 from it.\nWhy is the sun yellow?\nWhat was my first 
question?\nStop!\n' > ./input.txt + timeout 30s ./build/samples/cpp/chat_sample/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred.txt + python -c " + from transformers import LlamaTokenizer, AutoModelForCausalLM + model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0' + tokenizer = LlamaTokenizer.from_pretrained(model_id) + model = AutoModelForCausalLM.from_pretrained(model_id) + prompts = ['What is 2 + 2?', 'What is the previous answer?', 'Add 1 to it.', 'Subtract 5 from it.', 'Why is the sun yellow?', 'What was my first question?'] + def gen_prompt(prompt): + return {'role': 'user', 'content': prompt} + def gen_answer(answer): + return {'role': 'assistant', 'content': answer} + chat_history = [] + chat_prompt = '' + output = open('ref.txt', 'w') + for prompt in prompts: + output.write('question:\n') + chat_history.append(gen_prompt(prompt)) + chat_prompt = tokenizer.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(chat_prompt, return_tensors='pt') + answer = model.generate(**tokenized, max_length=1000, do_sample=False) + answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) + chat_history.append(gen_answer(answer_str)) + output.write(answer_str) + output.write('\n----------\n') + output.write('question:\n') + output.close() + " + diff pred.txt ref.txt + echo "Chat sample cpp" passed + export PYTHONPATH=./build/:$PYTHONPATH + timeout 30s ./samples/python/chat_sample/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt + diff pred2.txt ref.txt + echo "Chat sample python" passed + + cpp-continuous-batching-ubuntu: + runs-on: ubuntu-20.04-8-cores + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + - name: Install OpenVINO + run: | + mkdir ./ov/ + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz + sudo 
./ov/install_dependencies/install_openvino_dependencies.sh + - name: Download, convert and build + run: | + source ./ov/setupvars.sh + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Run gtests + run: | + source ./ov/setupvars.sh + ./build/tests/cpp/tests_continuous_batching + - name: Run accuracy_sample + run: | + source ./ov/setupvars.sh + timeout 50s ./build/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy -m ./TinyLlama-1.1B-Chat-v1.0/ -n 5 + if: False # Fails with nightly ov + - name: Run throughput_benchmark + run: | + wget -q https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + source ./ov/setupvars.sh + timeout 200s ./build/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark -n 10 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1 + timeout 200s ./build/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark -n 10 --dynamic_split_fuse --max_batch_size 256 --max_input_len 256 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1 + if: False # Fails with nightly ov + + cpp-continuous-batching-windows: + runs-on: windows-latest + env: + PYTHONIOENCODING: "utf8" + defaults: + run: + shell: cmd + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + - 
name: Install OpenVINO + run: | + curl --output ov.zip ${{ env.w_ov_link }} + unzip -d ov ov.zip + dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}" + shell: bash + - name: Install dependencies and build + run: | + call .\ov\setupvars.bat + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Run gtests + run: | + set PATH=.\build\openvino_genai\;%PATH% + call .\ov\setupvars.bat + .\build\tests\cpp\Release\tests_continuous_batching.exe + - name: Run accuracy_sample + run: | + set PATH=.\build\openvino_genai\;%PATH% + call .\ov\setupvars.bat + .\build\samples\cpp\continuous_batching_accuracy\Release\continuous_batching_accuracy.exe -m .\TinyLlama-1.1B-Chat-v1.0\ -n 5 + if: False # Fails with nightly ov + - name: Run throughput_benchmark + run: | + curl -o .\ShareGPT_V3_unfiltered_cleaned_split.json -s -L "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json" + set PATH=.\build\openvino_genai\;%PATH% + call .\ov\setupvars.bat + .\build\samples\cpp\continuous_batching_benchmark\Release\continuous_batching_benchmark.exe -n 2 -m .\TinyLlama-1.1B-Chat-v1.0\ --dataset .\ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1 + if: False # Fails with nightly ov + + cpp-continuous-batching-macos: + runs-on: macos-12 + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.8 + - name: Install OpenVINO + run: | + mkdir ./ov/ + 
curl ${{ env.m_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz + brew install coreutils scons + - name: Download, convert and build + run: | + source ./ov/setupvars.sh + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Run gtests + run: | + source ./ov/setupvars.sh + ./build/tests/cpp/tests_continuous_batching + - name: Run accuracy_sample + run: | + source ./ov/setupvars.sh + timeout 120s ./build/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy -m ./TinyLlama-1.1B-Chat-v1.0/ -n 5 + if: False # Fails with nightly ov + - name: Run throughput_benchmark + run: | + wget -q https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + source ./ov/setupvars.sh + ./build/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark -n 5 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1 + if: False # Fails with nightly ov diff --git a/.github/workflows/genai_package.yml b/.github/workflows/genai_package.yml index 93e5128860..d89ad2097b 100644 --- a/.github/workflows/genai_package.yml +++ b/.github/workflows/genai_package.yml @@ -1,9 +1,13 @@ name: genai_package on: pull_request -permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions +permissions: read-all # Required by 
https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }} cancel-in-progress: true +env: + l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16161-d253f4fd89c/l_openvino_toolkit_ubuntu20_2024.4.0.dev20240730_x86_64.tgz + m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16161-d253f4fd89c/m_openvino_toolkit_macos_12_6_2024.4.0.dev20240730_x86_64.tgz + w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16161-d253f4fd89c/w_openvino_toolkit_windows_2024.4.0.dev20240730_x86_64.zip jobs: ubuntu_genai_package: strategy: @@ -20,21 +24,21 @@ jobs: with: python-version: 3.8 - run: mkdir ./ov/ - - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240529_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + - run: curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j - - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url 
https://storage.openvinotoolkit.org/simple/wheels/nightly + - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - run: source ./ov/setupvars.sh && optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - run: source ./ov/setupvars.sh && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov - run: ov/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace - if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build + if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ov/samples/cpp/ -B ./samples\ build/ && cmake --build ./samples\ build/ --config ${{ matrix.build-type }} -j && cmake --install ./samples\ build/ --config ${{ matrix.build-type }} --component samples_bin --prefix s\ pace if: ${{ 'Release' != matrix.build-type }} - run: source ./ov/setupvars.sh && timeout 25s ${{ github.workspace }}/s\ pace/samples_bin/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "" - run: source ./ov/setupvars.sh && timeout 25s ./ov/samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./TinyLlama-1.1B-Chat-v1.0/ 0 - if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only + if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only macos_genai_package: strategy: @@ -49,34 +53,34 @@ jobs: with: python-version: 3.8 - run: mkdir ./ov/ - - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/macos/m_openvino_toolkit_macos_12_6_2024.2.0.dev20240529_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + - run: curl ${{ env.m_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz - run: brew install 
coreutils scons - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j - - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - run: source ./ov/setupvars.sh && optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - run: source ./ov/setupvars.sh && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov - run: ov/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace - if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build + if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build - run: > source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ov/samples/cpp/ -B ./samples\ build/ && cmake --build ./samples\ build/ --config ${{ matrix.build-type }} -j && cmake --install ./samples\ build/ --config ${{ matrix.build-type }} --component samples_bin --prefix s\ pace if: ${{ 'Release' != matrix.build-type }} - - run: source ./ov/setupvars.sh && timeout 25s ${{ github.workspace }}/s\ pace/samples_bin/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "" + - run: 
source ./ov/setupvars.sh && timeout 30s ${{ github.workspace }}/s\ pace/samples_bin/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "" - run: source ./ov/setupvars.sh && timeout 25s ./ov/samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./TinyLlama-1.1B-Chat-v1.0/ 0 - if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only + if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only windows_genai_package: - if: false strategy: matrix: build-type: [Release, Debug] runs-on: windows-latest env: CMAKE_BUILD_PARALLEL_LEVEL: null + PYTHONIOENCODING: "utf8" defaults: run: shell: cmd @@ -87,18 +91,21 @@ jobs: - uses: actions/setup-python@v4 with: python-version: 3.8 - - run: curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/windows/w_openvino_toolkit_windows_2024.2.0.dev20240529_x86_64.zip - - run: unzip ov.zip - # Shorten the next setupvars calls. - - run: mklink /D ov w_openvino_toolkit_windows_2024.2.0.dev20240529_x86_64 + - run: > + curl --output ov.zip ${{ env.w_ov_link }} + && unzip -d ov ov.zip + && dirs=(ov/*) + && mv ov/*/* ov + && rmdir "${dirs[@]}" + shell: bash - run: call ov\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ - run: call ov\setupvars.bat && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j - - run: call ov\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - - run: call ov\setupvars.bat && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release + - run: call ov\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + - run: call 
ov\setupvars.bat && python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - run: call ov\setupvars.bat && optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - run: call ov\setupvars.bat && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov - run: call ov\samples\cpp\build_samples_msvc.bat -i "${{ github.workspace }}/samples_install" - if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build + if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build - run: > call ov\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ov/samples/cpp/ -B "samples build" @@ -106,5 +113,6 @@ jobs: && cmake --install "samples build" --config ${{ matrix.build-type }} --component samples_bin --prefix samples_install if: ${{ 'Release' != matrix.build-type }} - run: call ov\setupvars.bat && "${{ github.workspace }}/samples_install/samples_bin/greedy_causal_lm" .\TinyLlama-1.1B-Chat-v1.0\ "" - - run: call ov\setupvars.bat && ./ov/samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./TinyLlama-1.1B-Chat-v1.0/ 0 - if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only + if: ${{ 'Release' == matrix.build-type }} # Tokenizers don't work in debug + - run: call ov\setupvars.bat && python .\ov\samples\python\multinomial_causal_lm\multinomial_causal_lm.py .\TinyLlama-1.1B-Chat-v1.0\ 0 + if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only diff --git a/.github/workflows/genai_python_lib.yml b/.github/workflows/genai_python_lib.yml index 0fb808a881..58e340a5b9 100644 --- a/.github/workflows/genai_python_lib.yml +++ b/.github/workflows/genai_python_lib.yml @@ -1,13 +1,17 @@ name: genai_python_lib on: pull_request -permissions: read-all # Required by 
https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions +permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.ref_name }} cancel-in-progress: true +env: + l_ov_centos_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16161-d253f4fd89c/l_openvino_toolkit_centos7_2024.4.0.dev20240730_x86_64.tgz + m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16161-d253f4fd89c/m_openvino_toolkit_macos_12_6_2024.4.0.dev20240730_x86_64.tgz + w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.4.0-16161-d253f4fd89c/w_openvino_toolkit_windows_2024.4.0.dev20240730_x86_64.zip jobs: ubuntu_genai_python_lib: # A tokenizers' dependency fails to compile on ubuntu-20 n CenOS7 env. - runs-on: ubuntu-22.04 + runs-on: ubuntu-22.04-16-cores env: # A tokenizers' dependency fails to compile with Ninja in CenOS7 env. CMAKE_GENERATOR: Unix Makefiles @@ -21,14 +25,14 @@ jobs: python-version: 3.8 - run: mkdir ./ov/ # Install CentOS7 instead of Ubuntu to match PyPI distribution ABI. 
- - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/linux/l_openvino_toolkit_centos7_2024.2.0.dev20240529_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + - run: curl ${{ env.l_ov_centos_link }} | tar --directory ./ov/ --strip-components 1 -xz - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j - - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --upgrade-strategy eager - - run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_generate_api.py -m precommit + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager + - run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/ - run: source ./ov/setupvars.sh && python -m pip install . 
--verbose - - run: python -m pytest ./tests/python_tests/test_generate_api.py -m precommit + - run: python -m pytest ./tests/python_tests/ macos_genai_python_lib: runs-on: macos-12 @@ -44,21 +48,21 @@ jobs: with: python-version: 3.8 - run: mkdir ./ov/ - - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/macos/m_openvino_toolkit_macos_12_6_2024.2.0.dev20240529_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + - run: curl ${{ env.m_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz - run: brew install coreutils scons - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j - - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --upgrade-strategy eager - - run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/test_generate_api.py -m precommit + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager + - run: source ./ov/setupvars.sh && PYTHONPATH=./build/:$PYTHONPATH python -m pytest ./tests/python_tests/ - run: source ./ov/setupvars.sh && python -m pip install . 
--verbose - run: python -c "from openvino_genai import LLMPipeline" - - run: python -m pytest ./tests/python_tests/test_generate_api.py -m precommit + - run: python -m pytest ./tests/python_tests/ windows_genai_python_lib: - if: false runs-on: windows-latest env: CMAKE_BUILD_PARALLEL_LEVEL: null + PYTHONIOENCODING: "utf8" defaults: run: shell: cmd @@ -69,14 +73,16 @@ jobs: - uses: actions/setup-python@v4 with: python-version: 3.8 - - run: curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/windows/w_openvino_toolkit_windows_2024.2.0.dev20240529_x86_64.zip - - run: unzip ov.zip - # Shorten the next setupvars calls. - - run: mklink /D ov w_openvino_toolkit_windows_2024.2.0.dev20240529_x86_64 + - name: Install OpenVINO + run: | + curl --output ov.zip ${{ env.w_ov_link }} + unzip -d ov ov.zip + dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}" + shell: bash - run: call ./ov/setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: call ./ov/setupvars.bat && cmake --build ./build/ --config Release -j - - run: call ./ov/setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --upgrade-strategy eager + - run: call ./ov/setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --upgrade-strategy eager # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. 
- - run: set "PYTHONPATH=./build/" && call ./ov/setupvars.bat && python -m pytest ./tests/python_tests/test_generate_api.py -m precommit + - run: set "PYTHONPATH=./build/" && call ./ov/setupvars.bat && python -m pytest ./tests/python_tests/ - run: call ./ov/setupvars.bat && python -m pip install . --verbose - - run: python -m pytest ./tests/python_tests/test_generate_api.py -m precommit + - run: python -m pytest ./tests/python_tests/ diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml index 4f4f02974a..8d6398027b 100644 --- a/.github/workflows/lcm_dreamshaper_cpp.yml +++ b/.github/workflows/lcm_dreamshaper_cpp.yml @@ -5,6 +5,7 @@ on: paths: - image_generation/lcm_dreamshaper_v7/cpp/** - image_generation/common/** + - samples/requirements.txt - .github/workflows/lcm_dreamshaper_cpp.yml - thirdparty/openvino_tokenizers - "!**.md" @@ -42,15 +43,15 @@ jobs: run: | conda activate openvino_lcm_cpp conda update -c conda-forge --all - conda install -c conda-forge -c conda-forge/label/openvino_dev openvino==2024.2.0.dev20240513 c-compiler cxx-compiler git make cmake + conda install -c conda-forge -c conda-forge/label/openvino_dev openvino==2024.4.0.dev20240726 c-compiler cxx-compiler git make cmake conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH - name: Install python dependencies working-directory: ${{ env.working_directory }} run: | conda activate openvino_lcm_cpp - python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] - python -m pip install -r requirements.txt + python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ../../requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - name: Download and convert model and tokenizer working-directory: ${{ env.working_directory }} @@ -87,15 +88,15 @@ jobs: run: 
| conda activate openvino_lcm_cpp conda update -c conda-forge --all - conda install -c conda-forge -c conda-forge/label/openvino_dev openvino==2024.2.0.dev20240513 c-compiler cxx-compiler git make cmake + conda install -c conda-forge -c conda-forge/label/openvino_dev openvino==2024.4.0.dev20240726 c-compiler cxx-compiler git make cmake conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH - name: Install python dependencies working-directory: ${{ env.working_directory }} run: | conda activate openvino_lcm_cpp - python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] - python -m pip install -r requirements.txt + python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ../../requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - name: Download and convert model and tokenizer working-directory: ${{ env.working_directory }} diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml index 830d6bcfe6..c947bdb4b0 100644 --- a/.github/workflows/stable_diffusion_1_5_cpp.yml +++ b/.github/workflows/stable_diffusion_1_5_cpp.yml @@ -5,6 +5,7 @@ on: paths: - image_generation/stable_diffusion_1_5/cpp/** - image_generation/common/** + - samples/requirements.txt - .github/workflows/stable_diffusion_1_5_cpp.yml - thirdparty/openvino_tokenizers - "!**.md" @@ -41,15 +42,15 @@ jobs: - name: Install OpenVINO and other conda dependencies run: | conda activate openvino_sd_cpp - conda install -c conda-forge -c conda-forge/label/openvino_dev openvino==2024.2.0.dev20240513 c-compiler cxx-compiler git make cmake + conda install -c conda-forge -c conda-forge/label/openvino_dev openvino==2024.4.0.dev20240726 c-compiler cxx-compiler git make cmake conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH - name: 
Install python dependencies working-directory: ${{ env.working_directory }} run: | conda activate openvino_sd_cpp - python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] - python -m pip install -r requirements.txt + python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ../../requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - name: Download and convert model and tokenizer working-directory: ${{ env.working_directory }} @@ -85,14 +86,14 @@ jobs: - name: Install OpenVINO and other conda dependencies run: | conda activate openvino_sd_cpp - conda install -c conda-forge -c conda-forge/label/openvino_dev openvino==2024.2.0.dev20240513 c-compiler cxx-compiler git make cmake + conda install -c conda-forge -c conda-forge/label/openvino_dev openvino==2024.4.0.dev20240726 c-compiler cxx-compiler git make cmake - name: Install python dependencies working-directory: ${{ env.working_directory }} run: | conda activate openvino_sd_cpp python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] - python -m pip install -r requirements.txt + python -m pip install -r ../../requirements.txt - name: Download and convert model and tokenizer working-directory: ${{ env.working_directory }} diff --git a/.gitignore b/.gitignore index da1d717331..83f354d57a 100644 --- a/.gitignore +++ b/.gitignore @@ -33,5 +33,5 @@ CMakeUserPresets.json # Python-specific *.?env* *.pyc -.env __pycache__ +.py-build-cmake_cache diff --git a/.gitmodules b/.gitmodules index f545d4e872..f72fd83489 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,3 @@ [submodule "thirdparty/openvino_tokenizers"] path = thirdparty/openvino_tokenizers url = https://github.com/openvinotoolkit/openvino_tokenizers.git - branch = master diff --git a/CMakeLists.txt b/CMakeLists.txt index 2cc8179a3c..6029398c0c 100644 
--- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,12 +17,23 @@ elseif(NOT GENERATOR_IS_MULTI_CONFIG_VAR AND NOT DEFINED CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel ...") endif() +if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + +if(POLICY CMP0169) + cmake_policy(SET CMP0169 OLD) +endif() + project(OpenVINOGenAI - VERSION 2024.2.0.0 + VERSION 2024.4.0.0 DESCRIPTION "OpenVINO GenAI" HOMEPAGE_URL "https://github.com/openvinotoolkit/openvino.genai" LANGUAGES CXX) +option(INSTALL_GTEST "Enable installation of googletest. (Projects embedding googletest may want to turn this OFF.)" OFF) +option(RAPIDJSON_BUILD_DOC "Build rapidjson documentation." OFF) + # Find OpenVINODeveloperPackage first to compile with SDL flags find_package(OpenVINODeveloperPackage QUIET PATHS "${OpenVINO_DIR}") @@ -32,16 +43,34 @@ endif() include(cmake/features.cmake) +if(ENABLE_PYTHON) + # the following two calls are required for cross-compilation + if(OpenVINODeveloperPackage_DIR) + ov_find_python3(REQUIRED) + ov_detect_python_module_extension() + else() + if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) + find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module) + else() + find_package(Python3 REQUIRED COMPONENTS Interpreter Development) + endif() + endif() +endif() + add_subdirectory(thirdparty) add_subdirectory(src) add_subdirectory(samples) +add_subdirectory(tests/cpp) -add_subdirectory(text_generation/causal_lm/cpp/continuous_batching) - -install(FILES LICENSE DESTINATION licensing COMPONENT licensing_genai RENAME LICENSE-GENAI) -install(FILES third-party-programs.txt DESTINATION licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt) +install(FILES LICENSE DESTINATION docs/licensing COMPONENT licensing_genai RENAME LICENSE-GENAI) +install(FILES third-party-programs.txt DESTINATION docs/licensing COMPONENT licensing_genai RENAME 
third-party-programs-genai.txt) if(NOT DEFINED CPACK_ARCHIVE_COMPONENT_INSTALL) set(CPACK_ARCHIVE_COMPONENT_INSTALL ON) endif() set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF) +# Workaround https://gitlab.kitware.com/cmake/cmake/-/issues/2614 +set(CPACK_COMPONENTS_ALL core_genai core_genai_dev cpp_samples_genai licensing_genai openvino_tokenizers openvino_tokenizers_docs) +if(ENABLE_PYTHON) + list(APPEND CPACK_COMPONENTS_ALL pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR}) +endif() include(CPack) diff --git a/README.md b/README.md index f31e64db36..2d9c04513b 100644 --- a/README.md +++ b/README.md @@ -26,10 +26,12 @@ It includes the following pipelines: - C++: 1. [beam_search_causal_lm](./samples/cpp/beam_search_causal_lm/README.md) 2. [chat_sample](./samples/cpp/chat_sample/README.md) - 3. [greedy_causal_lm](./samples/cpp/greedy_causal_lm/README.md) - 4. [multinomial_causal_lm](./samples/cpp/multinomial_causal_lm/README.md) - 5. [prompt_lookup_decoding_lm](./samples/cpp/prompt_lookup_decoding_lm/README.md) - 6. [speculative_decoding_lm](./samples/cpp/speculative_decoding_lm/README.md) + 3. [continuous_batching_accuracy](./samples/cpp/continuous_batching_accuracy) + 4. [continuous_batching_benchmark](./samples/cpp/continuous_batching_benchmark) + 5. [greedy_causal_lm](./samples/cpp/greedy_causal_lm/README.md) + 6. [multinomial_causal_lm](./samples/cpp/multinomial_causal_lm/README.md) + 7. [prompt_lookup_decoding_lm](./samples/cpp/prompt_lookup_decoding_lm/README.md) + 8. [speculative_decoding_lm](./samples/cpp/speculative_decoding_lm/README.md) 3. [Stable Diffuison (with LoRA) C++ image generation pipeline](./image_generation/stable_diffusion_1_5/cpp/README.md) 4. 
[Latent Consistency Model (with LoRA) C++ image generation pipeline](./image_generation/lcm_dreamshaper_v7/cpp/README.md) diff --git a/assets/style.css b/assets/style.css deleted file mode 100644 index 561524c691..0000000000 --- a/assets/style.css +++ /dev/null @@ -1,319 +0,0 @@ -body { - font-family: Helvetica, Arial, sans-serif; - font-size: 12px; - /* do not increase min-width as some may use split screens */ - min-width: 800px; - color: #999; -} - -h1 { - font-size: 24px; - color: black; -} - -h2 { - font-size: 16px; - color: black; -} - -p { - color: black; -} - -a { - color: #999; -} - -table { - border-collapse: collapse; -} - -/****************************** - * SUMMARY INFORMATION - ******************************/ -#environment td { - padding: 5px; - border: 1px solid #e6e6e6; - vertical-align: top; -} -#environment tr:nth-child(odd) { - background-color: #f6f6f6; -} -#environment ul { - margin: 0; - padding: 0 20px; -} - -/****************************** - * TEST RESULT COLORS - ******************************/ -span.passed, -.passed .col-result { - color: green; -} - -span.skipped, -span.xfailed, -span.rerun, -.skipped .col-result, -.xfailed .col-result, -.rerun .col-result { - color: orange; -} - -span.error, -span.failed, -span.xpassed, -.error .col-result, -.failed .col-result, -.xpassed .col-result { - color: red; -} - -.col-links__extra { - margin-right: 3px; -} - -/****************************** - * RESULTS TABLE - * - * 1. Table Layout - * 2. Extra - * 3. Sorting items - * - ******************************/ -/*------------------ - * 1. Table Layout - *------------------*/ -#results-table { - border: 1px solid #e6e6e6; - color: #999; - font-size: 12px; - width: 100%; -} -#results-table th, -#results-table td { - padding: 5px; - border: 1px solid #e6e6e6; - text-align: left; -} -#results-table th { - font-weight: bold; -} - -/*------------------ - * 2. 
Extra - *------------------*/ -.logwrapper { - max-height: 230px; - overflow-y: scroll; - background-color: #e6e6e6; -} -.logwrapper.expanded { - max-height: none; -} -.logwrapper.expanded .logexpander:after { - content: "collapse [-]"; -} -.logwrapper .logexpander { - z-index: 1; - position: sticky; - top: 10px; - width: max-content; - border: 1px solid; - border-radius: 3px; - padding: 5px 7px; - margin: 10px 0 10px calc(100% - 80px); - cursor: pointer; - background-color: #e6e6e6; -} -.logwrapper .logexpander:after { - content: "expand [+]"; -} -.logwrapper .logexpander:hover { - color: #000; - border-color: #000; -} -.logwrapper .log { - min-height: 40px; - position: relative; - top: -50px; - height: calc(100% + 50px); - border: 1px solid #e6e6e6; - color: black; - display: block; - font-family: "Courier New", Courier, monospace; - padding: 5px; - padding-right: 80px; - white-space: pre-wrap; -} - -div.media { - border: 1px solid #e6e6e6; - float: right; - height: 240px; - margin: 0 5px; - overflow: hidden; - width: 320px; -} - -.media-container { - display: grid; - grid-template-columns: 25px auto 25px; - align-items: center; - flex: 1 1; - overflow: hidden; - height: 200px; -} - -.media-container--fullscreen { - grid-template-columns: 0px auto 0px; -} - -.media-container__nav--right, -.media-container__nav--left { - text-align: center; - cursor: pointer; -} - -.media-container__viewport { - cursor: pointer; - text-align: center; - height: inherit; -} -.media-container__viewport img, -.media-container__viewport video { - object-fit: cover; - width: 100%; - max-height: 100%; -} - -.media__name, -.media__counter { - display: flex; - flex-direction: row; - justify-content: space-around; - flex: 0 0 25px; - align-items: center; -} - -.collapsible td:not(.col-links) { - cursor: pointer; -} -.collapsible td:not(.col-links):hover::after { - color: #bbb; - font-style: italic; - cursor: pointer; -} - -.col-result { - width: 130px; -} -.col-result:hover::after { - 
content: " (hide details)"; -} - -.col-result.collapsed:hover::after { - content: " (show details)"; -} - -#environment-header h2:hover::after { - content: " (hide details)"; - color: #bbb; - font-style: italic; - cursor: pointer; - font-size: 12px; -} - -#environment-header.collapsed h2:hover::after { - content: " (show details)"; - color: #bbb; - font-style: italic; - cursor: pointer; - font-size: 12px; -} - -/*------------------ - * 3. Sorting items - *------------------*/ -.sortable { - cursor: pointer; -} -.sortable.desc:after { - content: " "; - position: relative; - left: 5px; - bottom: -12.5px; - border: 10px solid #4caf50; - border-bottom: 0; - border-left-color: transparent; - border-right-color: transparent; -} -.sortable.asc:after { - content: " "; - position: relative; - left: 5px; - bottom: 12.5px; - border: 10px solid #4caf50; - border-top: 0; - border-left-color: transparent; - border-right-color: transparent; -} - -.hidden, .summary__reload__button.hidden { - display: none; -} - -.summary__data { - flex: 0 0 550px; -} -.summary__reload { - flex: 1 1; - display: flex; - justify-content: center; -} -.summary__reload__button { - flex: 0 0 300px; - display: flex; - color: white; - font-weight: bold; - background-color: #4caf50; - text-align: center; - justify-content: center; - align-items: center; - border-radius: 3px; - cursor: pointer; -} -.summary__reload__button:hover { - background-color: #46a049; -} -.summary__spacer { - flex: 0 0 550px; -} - -.controls { - display: flex; - justify-content: space-between; -} - -.filters, -.collapse { - display: flex; - align-items: center; -} -.filters button, -.collapse button { - color: #999; - border: none; - background: none; - cursor: pointer; - text-decoration: underline; -} -.filters button:hover, -.collapse button:hover { - color: #ccc; -} - -.filter__label { - margin-right: 10px; -} diff --git a/image_generation/README.md b/image_generation/README.md index d6163e4a3d..5098877342 100644 --- 
a/image_generation/README.md +++ b/image_generation/README.md @@ -1,4 +1,4 @@ -## Image generation +## Image Generation The current folder contains: - Common folder with: @@ -6,3 +6,4 @@ The current folder contains: - [imwrite](./common/imwrite) library to dump `ov::Tensor` to `.bmp` image - Image generation samples: - [Stable Diffuison (with LoRA) C++ image generation pipeline](./stable_diffusion_1_5/cpp) + - [OpenVINO Latent Consistency Model C++ image generation pipeline](./lcm_dreamshaper_v7/cpp) diff --git a/image_generation/common/diffusers/CMakeLists.txt b/image_generation/common/diffusers/CMakeLists.txt index 3e6b6fa0f3..6aee1d2173 100644 --- a/image_generation/common/diffusers/CMakeLists.txt +++ b/image_generation/common/diffusers/CMakeLists.txt @@ -21,6 +21,10 @@ target_link_libraries(diffusers PUBLIC openvino::runtime) include(FetchContent) +if(POLICY CMP0169) + cmake_policy(SET CMP0169 OLD) +endif() + FetchContent_Declare(eigen3) FetchContent_GetProperties(eigen3) if(NOT eigen3_POPULATED) diff --git a/image_generation/common/diffusers/src/scheduler_lms_discrete.cpp b/image_generation/common/diffusers/src/scheduler_lms_discrete.cpp index 92d545eb03..35c86c82ea 100644 --- a/image_generation/common/diffusers/src/scheduler_lms_discrete.cpp +++ b/image_generation/common/diffusers/src/scheduler_lms_discrete.cpp @@ -161,6 +161,10 @@ std::vector LMSDiscreteScheduler::get_timesteps() const { } std::map LMSDiscreteScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) { + if (inference_step == 0) { + m_derivative_list.clear(); + } + // LMS step function: std::vector derivative; derivative.reserve(latents.get_size()); diff --git a/image_generation/lcm_dreamshaper_v7/cpp/README.md b/image_generation/lcm_dreamshaper_v7/cpp/README.md index 24008f3988..c5c0b08cd9 100644 --- a/image_generation/lcm_dreamshaper_v7/cpp/README.md +++ b/image_generation/lcm_dreamshaper_v7/cpp/README.md @@ -1,10 +1,11 @@ -# OpenVINO Latent Consistency Model C++ 
image generation pipeline -The pure C++ text-to-image pipeline, driven by the OpenVINO native API for SD v1.5 Latent Consistency Model with LCM Scheduler. It includes advanced features like LoRA integration with safetensors and [OpenVINO Tokenizers](https://github.com/openvinotoolkit/openvino_tokenizers). Loading `openvino_tokenizers` to `ov::Core` enables tokenization. [The common folder](../../common/) contains schedulers for image generation and `imwrite()` for saving `bmp` images. This demo has been tested for Linux platform only. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/latent-consistency-models-image-generation/lcm-lora-controlnet.ipynb) which provides an example of image generaztion in Python. +# OpenVINO Latent Consistency Model C++ Image Generation Pipeline + +The pure C++ text-to-image pipeline, driven by the OpenVINO native API for SD v1.5 Latent Consistency Model with LCM Scheduler. It includes advanced features like [LoRA](https://huggingface.co/docs/peft/main/en/conceptual_guides/lora#lora) integration with [safetensors](https://huggingface.co/docs/safetensors/index#format) and [OpenVINO Tokenizers](https://github.com/openvinotoolkit/openvino_tokenizers). Loading `openvino_tokenizers` to `ov::Core` enables tokenization. [The common folder](../../common/) contains schedulers for image generation and `imwrite()` for saving `bmp` images. This demo has been tested for Linux platform only. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/latent-consistency-models-image-generation/lcm-lora-controlnet.ipynb) which provides an example of image generation in Python. > [!NOTE] > This tutorial assumes that the current working directory is `/image_generation/lcm_dreamshaper_v7/cpp/` and all paths are relative to this folder. 
-## Step 1: Prepare build environment +## Step 1: Prepare Build Environment Prerequisites: - Conda ([installation guide](https://conda.io/projects/conda/en/latest/user-guide/install/index.html)) @@ -14,6 +15,7 @@ C++ Packages: * [OpenVINO](https://docs.openvino.ai/2024/get-started/install-openvino.html): Model inference Prepare a python environment and install dependencies: + ```shell conda create -n openvino_lcm_cpp python==3.10 conda activate openvino_lcm_cpp @@ -23,38 +25,44 @@ conda install -c conda-forge openvino=2024.2.0 c-compiler cxx-compiler git make conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH ``` -## Step 2: Latent Consistency Model and Tokenizer models - -### Latent Consistency Model model +## Step 2: Obtain Latent Consistency Model 1. Install dependencies to import models from HuggingFace: ```shell git submodule update --init conda activate openvino_lcm_cpp - python -m pip install -r requirements.txt + python -m pip install -r ../../requirements.txt python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] ``` -2. Download the model from Huggingface and convert it to OpenVINO IR via [optimum-intel CLI](https://github.com/huggingface/optimum-intel). Example command for downloading and exporting FP16 model: +2. Download the model from Huggingface and convert it to OpenVINO IR via [optimum-intel CLI](https://github.com/huggingface/optimum-intel). + + Example command for downloading [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) model and exporting it with FP16 precision: `optimum-cli export openvino --model SimianLuo/LCM_Dreamshaper_v7 --weight-format fp16 models/lcm_dreamshaper_v7/FP16` -If https://huggingface.co/ is down, the script won't be able to download the model. + You can also choose other precision and export FP32 or INT8 model. -> [!NOTE] -> Only static model is currently supported for this sample. 
+ Please, refer to the official website for [🤗 Optimum](https://huggingface.co/docs/optimum/main/en/index) and [optimum-intel](https://github.com/huggingface/optimum-intel) to read more details. + + If https://huggingface.co/ is down, the script won't be able to download the model. + +### (Optional) Enable LoRA Weights with Safetensors + +Low-Rank Adaptation (LoRA) is a technique introduced to deal with the problem of fine-tuning Diffusers and Large Language Models (LLMs). In the case of Stable Diffusion fine-tuning, LoRA can be applied to the cross-attention layers for the image representations with the latent described. -### LoRA enabling with safetensors +LoRA weights can be enabled for Unet model of Stable Diffusion pipeline to generate images with different styles. -Refer to [python pipeline blog](https://blog.openvino.ai/blog-posts/enable-lora-weights-with-stable-diffusion-controlnet-pipeline). -The safetensor model is loaded via [safetensors.h](https://github.com/hsnyder/safetensors.h). The layer name and weight are modified with `Eigen Lib` and inserted into the LCM model with `ov::pass::MatcherPass` in the file [common/diffusers/src/lora.cpp](https://github.com/openvinotoolkit/openvino.genai/blob/master/image_generation/common/diffusers/src/lora.cpp). +In this sample LoRA weights are used in [safetensors](https://huggingface.co/docs/safetensors/index#format) format. +Safetensors is a serialization format developed by Hugging Face that is specifically designed for efficiently storing and loading large tensors. It provides a lightweight and efficient way to serialize tensors, making it easier to store and load machine learning models. -LCM model [lcm_dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) and Lora [soulcard](https://civitai.com/models/67927?modelVersionId=72591) are tested in this pipeline. +The LoRA safetensors model is loaded via [safetensors.h](https://github.com/hsnyder/safetensors.h). 
The layer name and weight are modified with `Eigen` library and inserted into the SD models with `ov::pass::MatcherPass` in the file [common/diffusers/src/lora.cpp](https://github.com/openvinotoolkit/openvino.genai/blob/master/image_generation/common/diffusers/src/lora.cpp). -Download and put safetensors and model IR into the models folder. +There are various LoRA models on https://civitai.com/tag/lora and on HuggingFace, you can consider to choose your own LoRA model in safetensor format. For example, you can use LoRA [soulcard model](https://civitai.com/models/67927?modelVersionId=72591). +Download and put LoRA safetensors model into the models directory. When running the built sample provide the path to the LoRA model with `-l, --loraPath arg` argument. -## Step 3: Build the LCM application +## Step 3: Build the LCM Application ```shell conda activate openvino_lcm_cpp @@ -64,16 +72,17 @@ cmake --build build --config Release --parallel ## Step 4: Run Pipeline ```shell -./build/lcm_dreamshaper [-p ] [-s ] [--height ] [--width ] [-d ] [-r ] [-a ] [-h ] [-m ] [-t ] +./build/lcm_dreamshaper [-p ] [-s ] [--height ] [--width ] [-d ] [-r ] [-a ] [-h ] [-m ] [-t ] [--guidanceScale ] [--dynamic] Usage: lcm_dreamshaper [OPTION...] ``` -* `-p, --posPrompt arg` Initial positive prompt for LCM (default: a beautiful pink unicorn) +* `-p, --posPrompt arg` Initial positive prompt for LCM (default: "a beautiful pink unicorn") * `-d, --device arg` AUTO, CPU, or GPU. 
Doesn't apply to Tokenizer model, OpenVINO Tokenizers can be inferred on a CPU device only (default: CPU) * `--step arg` Number of diffusion step (default: 4) * `-s, --seed arg` Number of random seed to generate latent (default: 42) +* `--guidanceScale arg` A higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality (default: 8.0) * `--num arg` Number of image output (default: 1) * `--height arg` Height of output image (default: 512) * `--width arg` Width of output image (default: 512) @@ -93,7 +102,7 @@ Example: Positive prompt: a beautiful pink unicorn -Read the numpy latent input and noise for scheduler instead of C++ std lib for the alignment with Python pipeline. +To read the numpy latent input and noise for scheduler instead of C++ std lib for the alignment with Python pipeline, use `-r, --readNPLatent` argument. * Generate image with random data generated by Python: `./build/lcm_dreamshaper -r` @@ -103,10 +112,27 @@ Read the numpy latent input and noise for scheduler instead of C++ std lib for t ![image](./cpp_random.bmp) -* Generate image with soulcard lora and C++ generated latent and noise: `./stable_diffusion -r -l path/to/soulcard.safetensors` +* Generate image with soulcard lora and C++ generated latent and noise: `./build/lcm_dreamshaper -l path/to/soulcard.safetensors` ![image](./lora_cpp_random.bmp) ## Benchmark: -For the generation quality, C++ random generation with MT19937 results is differ from `numpy.random.randn()` and `diffusers.utils.randn_tensor`. Hence, please use `-r, --readNPLatent` for the alignment with Python (this latent file is for output image 512X512 only) +For the generation quality, C++ random generation with MT19937 results differ from `numpy.random.randn()` and `diffusers.utils.randn_tensor`. 
Hence, please use `-r, --readNPLatent` for the alignment with Python (this latent file is for output image 512X512 only) + +## Notes + +#### Guidance Scale + +Guidance scale controls how similar the generated image will be to the prompt. A higher guidance scale means the model will try to generate an image that follows the prompt more strictly. A lower guidance scale means the model will have more creativity. +`guidance_scale` is a way to increase the adherence to the conditional signal that guides the generation (text, in this case) as well as overall sample quality. It is also known as [classifier-free guidance](https://arxiv.org/abs/2207.12598). + +#### Negative Prompt + +Negative prompts don't work with LCM because they don't have any effect on the denoising process. +When an LCM is distilled from an LDM via latent consistency distillation (Algorithm 1) with guided distillation, the forward pass of the LCM learns to approximate sampling from the LDM using CFG with the unconditional prompt "" (the empty string). +Due to this, LCMs currently do not support negative prompts. + +#### LoRA Weights Enabling + +Refer to the [OpenVINO blog](https://blog.openvino.ai/blog-posts/enable-lora-weights-with-stable-diffusion-controlnet-pipeline) to get more information on enabling LoRA weights. 
diff --git a/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt b/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt deleted file mode 100644 index e86e1c2eb1..0000000000 --- a/image_generation/lcm_dreamshaper_v7/cpp/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ ---extra-index-url https://download.pytorch.org/whl/cpu -torch==2.2.2+cpu -diffusers==0.27.2 -optimum-intel[openvino]==1.17.0 diff --git a/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp b/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp index 0b06d22067..e79082f547 100644 --- a/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp +++ b/image_generation/lcm_dreamshaper_v7/cpp/src/main.cpp @@ -24,6 +24,20 @@ const size_t TOKENIZER_MODEL_MAX_LENGTH = 77; // 'model_max_length' parameter from 'tokenizer_config.json' const size_t VAE_SCALE_FACTOR = 8; +class Timer { + const decltype(std::chrono::steady_clock::now()) m_start; + +public: + Timer(const std::string& scope) : m_start(std::chrono::steady_clock::now()) { + (std::cout << scope << ": ").flush(); + } + + ~Timer() { + auto m_end = std::chrono::steady_clock::now(); + std::cout << std::chrono::duration(m_end - m_start).count() << " ms" << std::endl; + } +}; + ov::Tensor randn_tensor(ov::Shape shape, bool use_np_latents, uint32_t seed = 42) { ov::Tensor noise(ov::element::f32, shape); if (use_np_latents) { @@ -129,11 +143,13 @@ StableDiffusionModels compile_models(const std::string& model_path, // read LoRA weights std::map lora_weights; if (!lora_path.empty()) { + Timer t("Loading and multiplying LoRA weights"); lora_weights = read_lora_adapters(lora_path, alpha); } // Text encoder { + Timer t("Loading and compiling text encoder"); auto text_encoder_model = core.read_model(model_path + "/text_encoder/openvino_model.xml"); if (!use_dynamic_shapes) { reshape_text_encoder(text_encoder_model, batch_size, TOKENIZER_MODEL_MAX_LENGTH); @@ -144,6 +160,7 @@ StableDiffusionModels compile_models(const std::string& model_path, // UNet { + Timer 
t("Loading and compiling UNet"); auto unet_model = core.read_model(model_path + "/unet/openvino_model.xml"); if (!use_dynamic_shapes) { reshape_unet(unet_model, batch_size, height, width, TOKENIZER_MODEL_MAX_LENGTH); @@ -154,6 +171,7 @@ StableDiffusionModels compile_models(const std::string& model_path, // VAE decoder { + Timer t("Loading and compiling VAE decoder"); auto vae_decoder_model = core.read_model(model_path + "/vae_decoder/openvino_model.xml"); if (!use_dynamic_shapes) { reshape_vae_decoder(vae_decoder_model, height, width); @@ -166,6 +184,7 @@ StableDiffusionModels compile_models(const std::string& model_path, // Tokenizer { + Timer t("Loading and compiling tokenizer"); // Tokenizer model wil be loaded to CPU: OpenVINO Tokenizers can be inferred on a CPU device only. models.tokenizer = core.compile_model(model_path + "/tokenizer/openvino_tokenizer.xml", "CPU"); } @@ -264,6 +283,7 @@ int32_t main(int32_t argc, char* argv[]) try { ("d,device", "AUTO, CPU, or GPU.\nDoesn't apply to Tokenizer model, OpenVINO Tokenizers can be inferred on a CPU device only", cxxopts::value()->default_value("CPU")) ("step", "Number of diffusion steps", cxxopts::value()->default_value("4")) ("s,seed", "Number of random seed to generate latent for one image output", cxxopts::value()->default_value("42")) + ("guidanceScale", "A higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality", cxxopts::value()->default_value("8.0")) ("num", "Number of image output", cxxopts::value()->default_value("1")) ("height","Height of output image",cxxopts::value()->default_value("512")) ("width", "Width of output image", cxxopts::value()->default_value("512")) @@ -294,6 +314,7 @@ int32_t main(int32_t argc, char* argv[]) try { const std::string device = result["device"].as(); const uint32_t num_inference_steps = result["step"].as(); const uint32_t user_seed = result["seed"].as(); + const float guidance_scale = 
result["guidanceScale"].as(); const uint32_t num_images = result["num"].as(); const uint32_t height = result["height"].as(); const uint32_t width = result["width"].as(); @@ -336,43 +357,51 @@ int32_t main(int32_t argc, char* argv[]) try { (sample_shape[2] * VAE_SCALE_FACTOR == height && sample_shape[3] * VAE_SCALE_FACTOR == width), "UNet model has static shapes [1, 4, H/8, W/8] or dynamic shapes [?, 4, ?, ?]"); - // no negative prompt for LCM model: - // https://huggingface.co/docs/diffusers/api/pipelines/latent_consistency_models#diffusers.LatentConsistencyModelPipeline - ov::Tensor text_embeddings = text_encoder(models, positive_prompt); + std::string result_image_path; - std::shared_ptr scheduler = std::make_shared(LCMScheduler( - 1000, 0.00085f, 0.012f, BetaSchedule::SCALED_LINEAR, - PredictionType::EPSILON, {}, 50, true, 10.0f, false, - false, 1.0f, 0.995f, 1.0f, read_np_latent, user_seed)); - scheduler->set_timesteps(num_inference_steps); - std::vector timesteps = scheduler->get_timesteps(); + // Stable Diffusion pipeline + { + Timer t("Running Stable Diffusion pipeline"); - float guidance_scale = 8.0; - const size_t unet_time_cond_proj_dim = static_cast(models.unet.input("timestep_cond").get_partial_shape()[1].get_length()); - ov::Tensor guidance_scale_embedding = get_w_embedding(guidance_scale, unet_time_cond_proj_dim); + // no negative prompt for LCM model: + // https://huggingface.co/docs/diffusers/api/pipelines/latent_consistency_models#diffusers.LatentConsistencyModelPipeline + ov::Tensor text_embeddings = text_encoder(models, positive_prompt); - const size_t unet_in_channels = static_cast(sample_shape[1].get_length()); - ov::Shape latent_model_input_shape = ov::Shape({1, unet_in_channels, height / VAE_SCALE_FACTOR, width / VAE_SCALE_FACTOR}); + std::shared_ptr scheduler = std::make_shared(LCMScheduler( + 1000, 0.00085f, 0.012f, BetaSchedule::SCALED_LINEAR, + PredictionType::EPSILON, {}, 50, true, 10.0f, false, + false, 1.0f, 0.995f, 1.0f, 
read_np_latent, user_seed)); + scheduler->set_timesteps(num_inference_steps); + std::vector timesteps = scheduler->get_timesteps(); - ov::Tensor denoised(ov::element::f32, latent_model_input_shape); + const size_t unet_time_cond_proj_dim = static_cast(models.unet.input("timestep_cond").get_partial_shape()[1].get_length()); + ov::Tensor guidance_scale_embedding = get_w_embedding(guidance_scale, unet_time_cond_proj_dim); - for (uint32_t n = 0; n < num_images; n++) { - std::uint32_t seed = num_images == 1 ? user_seed: user_seed + n; - ov::Tensor latent_model_input = randn_tensor(latent_model_input_shape, read_np_latent, seed); + const size_t unet_in_channels = static_cast(sample_shape[1].get_length()); + ov::Shape latent_model_input_shape = ov::Shape({1, unet_in_channels, height / VAE_SCALE_FACTOR, width / VAE_SCALE_FACTOR}); - for (size_t inference_step = 0; inference_step < num_inference_steps; inference_step++) { - ov::Tensor timestep(ov::element::i64, {1}, ×teps[inference_step]); - ov::Tensor noisy_residual = unet(unet_infer_request, latent_model_input, timestep, text_embeddings, guidance_scale_embedding); + ov::Tensor denoised(ov::element::f32, latent_model_input_shape); - auto step_res = scheduler->step(noisy_residual, latent_model_input, inference_step); - latent_model_input = step_res["latent"], denoised = step_res["denoised"]; - } + for (uint32_t n = 0; n < num_images; n++) { + std::uint32_t seed = num_images == 1 ? 
user_seed: user_seed + n; + ov::Tensor latent_model_input = randn_tensor(latent_model_input_shape, read_np_latent, seed); + + for (size_t inference_step = 0; inference_step < num_inference_steps; inference_step++) { + ov::Tensor timestep(ov::element::i64, {1}, ×teps[inference_step]); + ov::Tensor noisy_residual = unet(unet_infer_request, latent_model_input, timestep, text_embeddings, guidance_scale_embedding); - ov::Tensor decoded_image = vae_decoder(models.vae_decoder, denoised); - imwrite(std::string("./images/seed_") + std::to_string(seed) + ".bmp", postprocess_image(decoded_image), true); - std::cout << "Result image saved to: " << std::string("./images/seed_") + std::to_string(seed) + ".bmp" << std::endl; + auto step_res = scheduler->step(noisy_residual, latent_model_input, inference_step); + latent_model_input = step_res["latent"], denoised = step_res["denoised"]; + } + + ov::Tensor decoded_image = vae_decoder(models.vae_decoder, denoised); + result_image_path = std::string("./images/seed_") + std::to_string(seed) + ".bmp"; + imwrite(result_image_path, postprocess_image(decoded_image), true); + } } + std::cout << "Result image is saved to: " << result_image_path << std::endl; + return EXIT_SUCCESS; } catch (const std::exception& error) { std::cerr << error.what() << '\n'; diff --git a/image_generation/requirements.txt b/image_generation/requirements.txt new file mode 100644 index 0000000000..5c346f3844 --- /dev/null +++ b/image_generation/requirements.txt @@ -0,0 +1,2 @@ +-r ../samples/requirements.txt +diffusers==0.29.2 diff --git a/image_generation/stable_diffusion_1_5/cpp/README.md b/image_generation/stable_diffusion_1_5/cpp/README.md index b448b618bc..d8fa0cd736 100644 --- a/image_generation/stable_diffusion_1_5/cpp/README.md +++ b/image_generation/stable_diffusion_1_5/cpp/README.md @@ -1,20 +1,21 @@ -# OpenVINO Stable Diffusion (with LoRA) C++ image generation pipeline -The pure C++ text-to-image pipeline, driven by the OpenVINO native C++ API for Stable 
Diffusion v1.5 with LMS Discrete Scheduler, supports both static and dynamic model inference. It includes advanced features like [LoRA](https://huggingface.co/docs/peft/conceptual_guides/lora) integration with [safetensors](https://huggingface.co/docs/safetensors/index#format) and [OpenVINO Tokenizers](https://github.com/openvinotoolkit/openvino_tokenizers). Loading `openvino_tokenizers` to `ov::Core` enables tokenization. The sample uses [diffusers](../../common/diffusers) for image generation and [imwrite](../../common/imwrite) for saving `.bmp` images. This demo has been tested on Windows and Unix platforms. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/stable-diffusion-text-to-image) which provides an example of image generation in Python. +# OpenVINO Stable Diffusion (with LoRA) C++ Image Generation Pipeline + +The pure C++ text-to-image pipeline, driven by the OpenVINO native C++ API for Stable Diffusion v1.5 with LMS Discrete Scheduler, supports both static and dynamic model inference. It includes advanced features like [LoRA](https://huggingface.co/docs/peft/main/en/conceptual_guides/lora#lora) integration with [safetensors](https://huggingface.co/docs/safetensors/index#format) and [OpenVINO Tokenizers](https://github.com/openvinotoolkit/openvino_tokenizers). Loading `openvino_tokenizers` to `ov::Core` enables tokenization. The sample uses [diffusers](../../common/diffusers) for image generation and [imwrite](../../common/imwrite) for saving `.bmp` images. This demo has been tested on Windows and Unix platforms. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/stable-diffusion-text-to-image) which provides an example of image generation in Python. > [!NOTE] >This tutorial assumes that the current working directory is `/image_generation/stable_diffusion_1_5/cpp/` and all paths are relative to this folder. 
-## Step 1: Prepare build environment +## Step 1: Prepare Build Environment Prerequisites: - Conda ([installation guide](https://conda.io/projects/conda/en/latest/user-guide/install/index.html)) - C++ Packages: * [CMake](https://cmake.org/download/): Cross-platform build tool * [OpenVINO](https://docs.openvino.ai/install): Model inference. `master` and possibly the latest `releases/*` branch correspond to not yet released OpenVINO versions. https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/ can be used for these branches early testing. Prepare a python environment and install dependencies: + ```shell conda create -n openvino_sd_cpp python==3.10 conda activate openvino_sd_cpp @@ -23,45 +24,52 @@ conda install -c conda-forge openvino=2024.2.0 c-compiler cxx-compiler git make conda env config vars set LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH ``` -## Step 2: Convert Stable Diffusion v1.5 and Tokenizer models - -### Stable Diffusion v1.5 model: +## Step 2: Obtain Stable Diffusion Model 1. Install dependencies to import models from HuggingFace: -```shell -git submodule update --init -# Reactivate Conda environment after installing dependencies and setting env vars -conda activate openvino_sd_cpp -python -m pip install -r requirements.txt -python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] -``` -2. Download a huggingface SD v1.5 model like: -- [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) -- [dreamlike-anime-1.0](https://huggingface.co/dreamlike-art/dreamlike-anime-1.0) to run Stable Diffusion with LoRA adapters. - Example command for downloading and exporting FP16 model: + ```shell + git submodule update --init + # Reactivate Conda environment after installing dependencies and setting env vars + conda activate openvino_sd_cpp + python -m pip install -r ../../requirements.txt + python -m pip install ../../../thirdparty/openvino_tokenizers/[transformers] + ``` + +2. 
Download the model from Huggingface and convert it to OpenVINO IR via [optimum-intel CLI](https://github.com/huggingface/optimum-intel). + + Example models to download: + - [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) + - [dreamlike-art/dreamlike-anime-1.0](https://huggingface.co/dreamlike-art/dreamlike-anime-1.0) + + Example command for downloading [dreamlike-art/dreamlike-anime-1.0](https://huggingface.co/dreamlike-art/dreamlike-anime-1.0) model and exporting it with FP16 precision: - `optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 models/dreamlike_anime_1_0_ov/FP16` + `optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 models/dreamlike_anime_1_0_ov/FP16` - You can also choose other precision and export FP32 or INT8 model. + You can also choose other precision and export FP32 or INT8 model. - Please, refer to the official website for [🤗 Optimum](https://huggingface.co/docs/optimum/main/en/index) and [optimum-intel](https://github.com/huggingface/optimum-intel) to read more details. + Please, refer to the official website for [🤗 Optimum](https://huggingface.co/docs/optimum/main/en/index) and [optimum-intel](https://github.com/huggingface/optimum-intel) to read more details. - If https://huggingface.co/ is down, the script won't be able to download the model. + If https://huggingface.co/ is down, the script won't be able to download the model. > [!NOTE] > Now the pipeline support batch size = 1 only, i.e. static model `(1, 3, 512, 512)` -### LoRA enabling with safetensors +### (Optional) Enable LoRA Weights with Safetensors -Refer to [python pipeline blog](https://blog.openvino.ai/blog-posts/enable-lora-weights-with-stable-diffusion-controlnet-pipeline). -The safetensor model is loaded via [safetensors.h](https://github.com/hsnyder/safetensors.h). 
The layer name and weight are modified with `Eigen` library and inserted into the SD models with `ov::pass::MatcherPass` in the file [common/diffusers/src/lora.cpp](https://github.com/openvinotoolkit/openvino.genai/blob/master/image_generation/common/diffusers/src/lora.cpp). +Low-Rank Adaptation (LoRA) is a technique introduced to deal with the problem of fine-tuning Diffusers and Large Language Models (LLMs). In the case of Stable Diffusion fine-tuning, LoRA can be applied to the cross-attention layers for the image representations with the latent described. -SD model [dreamlike-anime-1.0](https://huggingface.co/dreamlike-art/dreamlike-anime-1.0) and LoRA [soulcard](https://civitai.com/models/67927?modelVersionId=72591) are tested in this pipeline. +LoRA weights can be enabled for Unet model of Stable Diffusion pipeline to generate images with different styles. -Download and put safetensors and model IR into the models folder. +In this sample LoRA weights are used in [safetensors](https://huggingface.co/docs/safetensors/index#format) format. +Safetensors is a serialization format developed by Hugging Face that is specifically designed for efficiently storing and loading large tensors. It provides a lightweight and efficient way to serialize tensors, making it easier to store and load machine learning models. -## Step 3: Build the SD application +The LoRA safetensors model is loaded via [safetensors.h](https://github.com/hsnyder/safetensors.h). The layer name and weight are modified with `Eigen` library and inserted into the SD models with `ov::pass::MatcherPass` in the file [common/diffusers/src/lora.cpp](https://github.com/openvinotoolkit/openvino.genai/blob/master/image_generation/common/diffusers/src/lora.cpp). + +There are various LoRA models on https://civitai.com/tag/lora and on HuggingFace, you can consider to choose your own LoRA model in safetensor format. 
For example, you can use LoRA [soulcard model](https://civitai.com/models/67927?modelVersionId=72591). +Download and put LoRA safetensors model into the models directory. When running the built sample provide the path to the LoRA model with `-l, --loraPath arg` argument. + +## Step 3: Build the SD Application ```shell conda activate openvino_sd_cpp @@ -71,17 +79,18 @@ cmake --build build --parallel ## Step 4: Run Pipeline ```shell -./build/stable_diffusion [-p ] [-n ] [-s ] [--height ] [--width ] [-d ] [-r ] [-l ] [-a ] [-h ] [-m ] [-t ] [--dynamic] +./build/stable_diffusion [-p ] [-n ] [-s ] [--height ] [--width ] [-d ] [-r ] [-l ] [-a ] [-h ] [-m ] [-t ] [--guidanceScale ] [--dynamic] Usage: stable_diffusion [OPTION...] ``` -* `-p, --posPrompt arg` Initial positive prompt for SD (default: cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting) -* `-n, --negPrompt arg` Default is empty with space (default: ) +* `-p, --posPrompt arg` Initial positive prompt for SD (default: "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting") +* `-n, --negPrompt arg` The prompt to guide the image generation away from. Ignored when not using guidance (`--guidanceScale` is less than `1`) (default: "") * `-d, --device arg` AUTO, CPU, or GPU. 
Doesn't apply to Tokenizer model, OpenVINO Tokenizers can be inferred on a CPU device only (default: CPU) * `--step arg` Number of diffusion step ( default: 20) * `-s, --seed arg` Number of random seed to generate latent (default: 42) +* `--guidanceScale arg` A higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality (default: 7.5) * `--num arg` Number of image output(default: 1) * `--height arg` Height of output image (default: 512) * `--width arg` Width of output image (default: 512) @@ -101,15 +110,15 @@ Usage: Positive prompt: cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting -Negative prompt: (empty, here couldn't use OV tokenizer, check the issues for details) +Negative prompt: (empty, check the [Notes](#negative-prompt) for details) -Read the numpy latent instead of C++ std lib for the alignment with Python pipeline +To read the numpy latent instead of C++ std lib for the alignment with Python pipeline, use `-r, --readNPLatent` argument. * Generate image without lora `./build/stable_diffusion -r` ![](./without_lora.bmp) -* Generate image with soulcard lora `./build/stable_diffusion -r` +* Generate image with soulcard lora `./build/stable_diffusion -r -l path/to/soulcard.safetensors` ![](./soulcard_lora.bmp) @@ -117,6 +126,24 @@ Read the numpy latent instead of C++ std lib for the alignment with Python pipel ![](./704x448.bmp) -## Notes: +## Notes + +For the generation quality, be careful with the negative prompt and random latent generation. C++ random generation with MT19937 results differ from `numpy.random.randn()`. Hence, please use `-r, --readNPLatent` for the alignment with Python (this latent file is for output image 512X512 only). + +#### Guidance Scale + +Guidance scale controls how similar the generated image will be to the prompt. 
A higher guidance scale means the model will try to generate an image that follows the prompt more strictly. A lower guidance scale means the model will have more creativity. +`guidance_scale` is a way to increase the adherence to the conditional signal that guides the generation (text, in this case) as well as overall sample quality. It is also known as [classifier-free guidance](https://arxiv.org/abs/2207.12598). + +#### Negative Prompt + +To improve image generation quality, the model supports negative prompting. Technically, the positive prompt steers the diffusion toward the images associated with it, while the negative prompt steers the diffusion away from it. +In other words, the negative prompt declares concepts that are undesired in the generated image, e.g. if we want a colorful and bright image, a gray scale image is a result we want to avoid, so in this case gray scale can be used as the negative prompt. +The positive and negative prompts are on equal footing. You can always use one with or without the other. More explanation of how it works can be found in this [article](https://stable-diffusion-art.com/how-negative-prompt-work/). + +> [!NOTE] +> Negative prompting is applicable only for high guidance scale (greater than 1). + +#### LoRA Weights Enabling -For the generation quality, be careful with the negative prompt and random latent generation. C++ random generation with MT19937 results is differ from `numpy.random.randn()`. Hence, please use `-r, --readNPLatent` for the alignment with Python (this latent file is for output image 512X512 only) +Refer to the [OpenVINO blog](https://blog.openvino.ai/blog-posts/enable-lora-weights-with-stable-diffusion-controlnet-pipeline) to get more information on enabling LoRA weights. 
diff --git a/image_generation/stable_diffusion_1_5/cpp/requirements.txt b/image_generation/stable_diffusion_1_5/cpp/requirements.txt deleted file mode 100644 index dd5faeb7de..0000000000 --- a/image_generation/stable_diffusion_1_5/cpp/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ ---extra-index-url https://download.pytorch.org/whl/cpu -torch==2.2.2+cpu -diffusers==0.27.2 -transformers==4.39.3 -optimum-intel[openvino]==1.17.0 -huggingface_hub[cli]==0.22.2 diff --git a/image_generation/stable_diffusion_1_5/cpp/src/main.cpp b/image_generation/stable_diffusion_1_5/cpp/src/main.cpp index d5ea333ef0..8fbac91375 100644 --- a/image_generation/stable_diffusion_1_5/cpp/src/main.cpp +++ b/image_generation/stable_diffusion_1_5/cpp/src/main.cpp @@ -85,11 +85,11 @@ void reshape_text_encoder(std::shared_ptr model, size_t batch_size, s model->reshape(idx_to_shape); } -void reshape_unet_encoder(std::shared_ptr model, - int64_t batch_size, - int64_t height, - int64_t width, - int64_t tokenizer_model_max_length) { +void reshape_unet(std::shared_ptr model, + int64_t batch_size, + int64_t height, + int64_t width, + int64_t tokenizer_model_max_length) { // The factor of 2 comes from the guidance scale > 1 for (auto input : model->inputs()) { if (input.get_any_name().find("timestep_cond") == std::string::npos) { @@ -170,7 +170,7 @@ StableDiffusionModels compile_models(const std::string& model_path, Timer t("Loading and compiling UNet"); auto unet_model = core.read_model(model_path + "/unet/openvino_model.xml"); if (!use_dynamic_shapes) { - reshape_unet_encoder(unet_model, batch_size, height, width, TOKENIZER_MODEL_MAX_LENGTH); + reshape_unet(unet_model, batch_size, height, width, TOKENIZER_MODEL_MAX_LENGTH); } apply_lora(unet_model, lora_weights["unet"]); models.unet = core.compile_model(unet_model, device); @@ -199,7 +199,7 @@ StableDiffusionModels compile_models(const std::string& model_path, return models; } -ov::Tensor text_encoder(StableDiffusionModels models, std::string& 
pos_prompt, std::string& neg_prompt) { +ov::Tensor text_encoder(StableDiffusionModels models, std::string& pos_prompt, std::string& neg_prompt, bool do_classifier_free_guidance) { const size_t HIDDEN_SIZE = static_cast(models.text_encoder.output(0).get_partial_shape()[2].get_length()); const int32_t EOS_TOKEN_ID = 49407, PAD_TOKEN_ID = EOS_TOKEN_ID; const ov::Shape input_ids_shape({1, TOKENIZER_MODEL_MAX_LENGTH}); @@ -225,6 +225,10 @@ ov::Tensor text_encoder(StableDiffusionModels models, std::string& pos_prompt, s ov::Tensor text_embeddings(ov::element::f32, {2, TOKENIZER_MODEL_MAX_LENGTH, HIDDEN_SIZE}); + if (!do_classifier_free_guidance && neg_prompt != "") { + throw std::invalid_argument("Negative prompt is ignored when --guidanceScale < 1.0. Please remove --negPrompt argument."); + } + compute_text_embeddings(neg_prompt, ov::Tensor(text_embeddings, {0, 0, 0}, {1, TOKENIZER_MODEL_MAX_LENGTH, HIDDEN_SIZE})); compute_text_embeddings(pos_prompt, @@ -240,21 +244,7 @@ ov::Tensor unet(ov::InferRequest req, ov::Tensor sample, ov::Tensor timestep, ov req.infer(); - ov::Tensor noise_pred_tensor = req.get_output_tensor(); - ov::Shape noise_pred_shape = noise_pred_tensor.get_shape(); - noise_pred_shape[0] = 1; - - // perform guidance - const float guidance_scale = 7.5f; - const float* noise_pred_uncond = noise_pred_tensor.data(); - const float* noise_pred_text = noise_pred_uncond + ov::shape_size(noise_pred_shape); - - ov::Tensor noisy_residual(noise_pred_tensor.get_element_type(), noise_pred_shape); - for (size_t i = 0; i < ov::shape_size(noise_pred_shape); ++i) - noisy_residual.data()[i] = - noise_pred_uncond[i] + guidance_scale * (noise_pred_text[i] - noise_pred_uncond[i]); - - return noisy_residual; + return req.get_output_tensor(); } ov::Tensor vae_decoder(ov::CompiledModel& decoder_compiled_model, ov::Tensor sample) { @@ -286,11 +276,12 @@ int32_t main(int32_t argc, char* argv[]) try { cxxopts::Options options("stable_diffusion", "Stable Diffusion implementation in 
C++ using OpenVINO\n"); options.add_options() - ("p,posPrompt", "Initial positive prompt for SD ", cxxopts::value()->default_value("cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting")) - ("n,negPrompt", "Defaut is empty with space", cxxopts::value()->default_value(" ")) + ("p,posPrompt", "Initial positive prompt for SD", cxxopts::value()->default_value("cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting")) + ("n,negPrompt", "The prompt to guide the image generation away from. Ignored when not using guidance (`--guidanceScale` is less than `1`)", cxxopts::value()->default_value("")) ("d,device", "AUTO, CPU, or GPU.\nDoesn't apply to Tokenizer model, OpenVINO Tokenizers can be inferred on a CPU device only", cxxopts::value()->default_value("CPU")) ("step", "Number of diffusion steps", cxxopts::value()->default_value("20")) ("s,seed", "Number of random seed to generate latent for one image output", cxxopts::value()->default_value("42")) + ("guidanceScale", "A higher guidance scale value encourages the model to generate images closely linked to the text prompt at the expense of lower image quality", cxxopts::value()->default_value("7.5")) ("num", "Number of image output", cxxopts::value()->default_value("1")) ("height", "Destination image height", cxxopts::value()->default_value("512")) ("width", "Destination image width", cxxopts::value()->default_value("512")) @@ -321,6 +312,7 @@ int32_t main(int32_t argc, char* argv[]) try { const std::string device = result["device"].as(); const uint32_t num_inference_steps = result["step"].as(); const uint32_t user_seed = result["seed"].as(); + const float guidance_scale = result["guidanceScale"].as(); const uint32_t num_images = result["num"].as(); const uint32_t height = result["height"].as(); const uint32_t width = result["width"].as(); @@ -353,8 +345,9 @@ int32_t main(int32_t argc, char* argv[]) try { return EXIT_FAILURE; } - // 
Stable Diffusion pipeline const size_t batch_size = 1; + const bool do_classifier_free_guidance = guidance_scale > 1.0; + StableDiffusionModels models = compile_models(model_path, device, lora_path, alpha, use_cache, use_dynamic_shapes, batch_size, height, width); ov::InferRequest unet_infer_request = models.unet.create_infer_request(); @@ -364,49 +357,69 @@ int32_t main(int32_t argc, char* argv[]) try { (sample_shape[2] * VAE_SCALE_FACTOR == height && sample_shape[3] * VAE_SCALE_FACTOR == width), "UNet model has static shapes [1, 4, H/8, W/8] or dynamic shapes [?, 4, ?, ?]"); - Timer t("Running Stable Diffusion pipeline"); + std::string result_image_path; + + // Stable Diffusion pipeline + { + Timer t("Running Stable Diffusion pipeline"); - ov::Tensor text_embeddings = text_encoder(models, positive_prompt, negative_prompt); + ov::Tensor text_embeddings = text_encoder(models, positive_prompt, negative_prompt, do_classifier_free_guidance); - for (uint32_t n = 0; n < num_images; n++) { std::shared_ptr scheduler = std::make_shared(); scheduler->set_timesteps(num_inference_steps); std::vector timesteps = scheduler->get_timesteps(); - std::uint32_t seed = num_images == 1 ? user_seed : user_seed + n; + for (uint32_t n = 0; n < num_images; n++) { + std::uint32_t seed = num_images == 1 ? 
user_seed : user_seed + n; - const size_t unet_in_channels = static_cast(sample_shape[1].get_length()); + const size_t unet_in_channels = static_cast(sample_shape[1].get_length()); - // latents are multiplied by 'init_noise_sigma' - ov::Shape latent_shape = ov::Shape({batch_size, unet_in_channels, height / VAE_SCALE_FACTOR, width / VAE_SCALE_FACTOR}); - ov::Shape latent_model_input_shape = latent_shape; - ov::Tensor noise = randn_tensor(latent_shape, read_np_latent, seed); - latent_model_input_shape[0] = 2; // Unet accepts batch 2 - ov::Tensor latent(ov::element::f32, latent_shape), - latent_model_input(ov::element::f32, latent_model_input_shape); - for (size_t i = 0; i < noise.get_size(); ++i) { - latent.data()[i] = noise.data()[i] * scheduler->get_init_noise_sigma(); - } + // latents are multiplied by 'init_noise_sigma' + ov::Shape latent_shape = ov::Shape({batch_size, unet_in_channels, height / VAE_SCALE_FACTOR, width / VAE_SCALE_FACTOR}); + ov::Shape latent_model_input_shape = latent_shape; + ov::Tensor noise = randn_tensor(latent_shape, read_np_latent, seed); + latent_model_input_shape[0] = 2; // Unet accepts batch 2 + ov::Tensor latent(ov::element::f32, latent_shape), + latent_model_input(ov::element::f32, latent_model_input_shape); + for (size_t i = 0; i < noise.get_size(); ++i) { + latent.data()[i] = noise.data()[i] * scheduler->get_init_noise_sigma(); + } - for (size_t inference_step = 0; inference_step < num_inference_steps; inference_step++) { - // concat the same latent twice along a batch dimension - latent.copy_to( - ov::Tensor(latent_model_input, {0, 0, 0, 0}, {1, latent_shape[1], latent_shape[2], latent_shape[3]})); - latent.copy_to( - ov::Tensor(latent_model_input, {1, 0, 0, 0}, {2, latent_shape[1], latent_shape[2], latent_shape[3]})); + for (size_t inference_step = 0; inference_step < num_inference_steps; inference_step++) { + // concat the same latent twice along a batch dimension + latent.copy_to( + ov::Tensor(latent_model_input, {0, 0, 0, 0}, 
{1, latent_shape[1], latent_shape[2], latent_shape[3]})); + latent.copy_to( + ov::Tensor(latent_model_input, {1, 0, 0, 0}, {2, latent_shape[1], latent_shape[2], latent_shape[3]})); - scheduler->scale_model_input(latent_model_input, inference_step); + scheduler->scale_model_input(latent_model_input, inference_step); - ov::Tensor timestep(ov::element::i64, {1}, ×teps[inference_step]); - ov::Tensor noisy_residual = unet(unet_infer_request, latent_model_input, timestep, text_embeddings); + ov::Tensor timestep(ov::element::i64, {1}, ×teps[inference_step]); + ov::Tensor noise_pred_tensor = unet(unet_infer_request, latent_model_input, timestep, text_embeddings); - latent = scheduler->step(noisy_residual, latent, inference_step)["latent"]; - } + ov::Shape noise_pred_shape = noise_pred_tensor.get_shape(); + noise_pred_shape[0] = 1; + + ov::Tensor noisy_residual(noise_pred_tensor.get_element_type(), noise_pred_shape); - ov::Tensor decoded_image = vae_decoder(models.vae_decoder, latent); - imwrite(std::string("./images/seed_") + std::to_string(seed) + ".bmp", postprocess_image(decoded_image), true); + // perform guidance + const float* noise_pred_uncond = noise_pred_tensor.data(); + const float* noise_pred_text = noise_pred_uncond + ov::shape_size(noise_pred_shape); + for (size_t i = 0; i < ov::shape_size(noise_pred_shape); ++i) + noisy_residual.data()[i] = + noise_pred_uncond[i] + guidance_scale * (noise_pred_text[i] - noise_pred_uncond[i]); + + latent = scheduler->step(noisy_residual, latent, inference_step)["latent"]; + } + + ov::Tensor decoded_image = vae_decoder(models.vae_decoder, latent); + result_image_path = std::string("./images/seed_") + std::to_string(seed) + ".bmp"; + imwrite(result_image_path, postprocess_image(decoded_image), true); + } } + std::cout << "Result image is saved to: " << result_image_path << std::endl; + return EXIT_SUCCESS; } catch (const std::exception& error) { std::cerr << error.what() << '\n'; diff --git a/llm_bench/python/benchmark.py 
b/llm_bench/python/benchmark.py index 7d3d2cad9d..2e4a23da41 100644 --- a/llm_bench/python/benchmark.py +++ b/llm_bench/python/benchmark.py @@ -73,7 +73,7 @@ def gen_iterate_data( return iter_data -def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, warmup_md5, prompt_index, bench_hook, model_precision, proc_id): +def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, md5_list, prompt_index, bench_hook, model_precision, proc_id): set_seed(args['seed']) input_text_list = [input_text] * args['batch_size'] if args["output_dir"] is not None and num == 0: @@ -101,7 +101,7 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, mem_consumption.start_collect_memory_consumption() max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] start = time.perf_counter() - if args['infer_count'] is not None: + if args['infer_count'] is not None and args['end_token_stopping'] is False: model.generation_config.eos_token_id = None model.config.eos_token_id = None result = model.generate( @@ -150,8 +150,10 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, if args["output_dir"] is not None: utils.output_file.output_gen_text(result_text, args, model_precision, prompt_index, num, bs_idx, proc_id) result_md5_list.append(hashlib.new("md5", result_text.encode(), usedforsecurity=False).hexdigest()) - if num == 0: - warmup_md5[prompt_index] = result_md5_list + if len(md5_list[num]) == 0: + md5_list[num] = {prompt_index : result_md5_list} + else: + md5_list[num][prompt_index] = result_md5_list per_token_time = generation_time * 1000 / (num_tokens / args['batch_size']) tm_list = [] tm_infer_list = [] @@ -190,10 +192,18 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, batch_size=args['batch_size'] ) if num > 0: - warmup_md5_list = warmup_md5[prompt_index] - if result_md5_list != warmup_md5_list: - 
log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} is different from warm-up's md5 {warmup_md5_list}") + prev_md5 = md5_list[num - 1][prompt_index] + if result_md5_list != prev_md5: + log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} " + f"is different from md5 of the {num - 1} iteration {prev_md5}") utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) + if num == 1: + # if the device is CPU, throw exception + if args['devices'].lower().startswith('cpu') is True: + assert (result_md5_list == prev_md5) + else: + # throw exception + assert (result_md5_list == prev_md5) else: utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) if bench_hook is not None: @@ -201,7 +211,7 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, bench_hook.clear_time_infer_list() -def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data_list, warmup_md5, prompt_index, streamer, model_precision, proc_id): +def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data_list, md5_list, prompt_index, streamer, model_precision, proc_id): set_seed(args['seed']) input_text_list = [input_text] * args['batch_size'] if args["output_dir"] is not None and num == 0: @@ -254,8 +264,10 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data if args["output_dir"] is not None: utils.output_file.output_gen_text(result_text, args, model_precision, prompt_index, num, bs_idx, proc_id) result_md5_list.append(hashlib.new("md5", result_text.encode(), usedforsecurity=False).hexdigest()) - if num == 0: - warmup_md5[prompt_index] = result_md5_list + if len(md5_list[num]) == 0: + md5_list[num] = {prompt_index : result_md5_list} + else: + md5_list[num][prompt_index] = result_md5_list per_token_time = generation_time * 1000 / (num_tokens / args['batch_size']) tm_list = streamer.get_time_list() log.debug('latency 
of all tokens:') @@ -286,10 +298,18 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data batch_size=args['batch_size'] ) if num > 0: - warmup_md5_list = warmup_md5[prompt_index] - if result_md5_list != warmup_md5_list: - log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} is different from warm-up's md5 {warmup_md5_list}") + prev_md5 = md5_list[num - 1][prompt_index] + if result_md5_list != prev_md5: + log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} " + f"is different from md5 of the {num - 1} iteration {prev_md5}") utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) + if num == 1: + # if the device is CPU, throw exception + if args['devices'].lower().startswith('cpu') is True: + assert (result_md5_list == prev_md5) + else: + # throw exception + assert (result_md5_list == prev_md5) else: utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) streamer.reset() @@ -299,9 +319,8 @@ def run_text_generation_benchmark(model_path, framework, device, args, num_iters model, tokenizer, pretrain_time, bench_hook, use_genai = FW_UTILS[framework].create_text_gen_model(model_path, device, **args) model_precision = utils.model_utils.get_model_precision(model_path.parts) iter_data_list = [] - warmup_md5 = {} + md5_list = {num : {} for num in range(num_iters + 1)} input_text_list = utils.model_utils.get_prompts(args) - text_gen_fn = run_text_generation if not use_genai else run_text_generation_genai if args['prompt_index'] is None: prompt_idx_list = [prompt_idx for prompt_idx, input_text in enumerate(input_text_list)] text_list = input_text_list @@ -325,13 +344,13 @@ def run_text_generation_benchmark(model_path, framework, device, args, num_iters for idx, input_text in enumerate(text_list): if num == 0: log.info(f'[warm-up] Input text: {input_text}') - text_gen_fn(input_text, num, model, tokenizer, args, iter_data_list, warmup_md5, 
prompt_idx_list[idx], bench_hook, model_precision, proc_id) + text_gen_fn(input_text, num, model, tokenizer, args, iter_data_list, md5_list, prompt_idx_list[idx], bench_hook, model_precision, proc_id) else: for idx, input_text in enumerate(text_list): for num in range(num_iters + 1): if num == 0: log.info(f'[warm-up] Input text: {input_text}') - text_gen_fn(input_text, num, model, tokenizer, args, iter_data_list, warmup_md5, prompt_idx_list[idx], bench_hook, model_precision, proc_id) + text_gen_fn(input_text, num, model, tokenizer, args, iter_data_list, md5_list, prompt_idx_list[idx], bench_hook, model_precision, proc_id) utils.metrics_print.print_average(iter_data_list, prompt_idx_list, args['batch_size'], True) return iter_data_list, pretrain_time @@ -674,6 +693,11 @@ def get_argprser(): parser.add_argument('-od', '--output_dir', help='Save the input text and generated text, images to files') utils.model_utils.add_stateful_model_arguments(parser) parser.add_argument("--genai", action="store_true") + parser.add_argument( + '--end_token_stopping', + action='store_true', + help='Stop the generation even output token size does not achieve infer_count or max token size ({DEFAULT_OUTPUT_TOKEN_SIZE}}).' 
+ ) return parser.parse_args() diff --git a/llm_bench/python/convert.py b/llm_bench/python/convert.py index eed79c7948..d0627d3cf2 100644 --- a/llm_bench/python/convert.py +++ b/llm_bench/python/convert.py @@ -1310,13 +1310,22 @@ def convert_codegen2(args): if config.model_type == "codegen": config.model_type = "codegen2" cuda, post_init = patch_gptq(config) - pt_model = AutoModelForCausalLM.from_pretrained( - args.model_id, - trust_remote_code=True, - config=AutoConfig.from_pretrained(args.model_id, trust_remote_code=True), + precision = args.precision + compression_only = ( + args.compress_weights + and not args.force_convert + and not is_torch_compression(args) + and is_ov_model_provided(args.model_id, args.output_dir, precision) ) - pt_model.config = config - convert_optimum_causallm_base(pt_model, args, model_config=config) + pt_model = None + if not compression_only: + pt_model = AutoModelForCausalLM.from_pretrained( + args.model_id, + trust_remote_code=True, + config=AutoConfig.from_pretrained(args.model_id, trust_remote_code=True), + ) + pt_model.config = config + convert_optimum_causallm_base(pt_model, args, config, compression_only) if post_init is not None: unpatch_gptq(cuda, post_init) @@ -1399,13 +1408,14 @@ def main(): "-c", "--compress_weights", type=str, - choices=["INT8", "INT8_ASYM", "INT8_SYM", "4BIT_DEFAULT", "4BIT_MAXIMUM", "INT4_SYM", "INT4_ASYM"], + choices=["INT8", "INT8_ASYM", "INT8_SYM", "4BIT_DEFAULT", "4BIT_MAXIMUM", "INT4_SYM", "INT4_ASYM", "E2M1"], nargs="+", help=( "The weight compression option, e.g. INT8 - INT8 weights (deprecated, please use INT8_ASYM instead), " "4BIT_DEFAULT - for 4-bit compression with predefined configs with performance-accuracy trade-off, " "4BIT_MAXIMUM - for 4-bit compression with predefined configs for the best performance, " - "INT4_* - for INT4 compressed weights." + "INT4_* - for INT4 compressed weights, " + "E2M1 - for fp4 compression with fp8 (e8m0) scales." 
), ) compression_group.add_argument( @@ -1449,6 +1459,11 @@ def main(): action="store_true", help="Apply AWQ algorithm during compression", ) + compression_group.add_argument( + "--scale_estimation", + action="store_true", + help="Apply scale estimation algorithm during compression", + ) add_stateful_model_arguments(parser) args = parser.parse_args() diff --git a/llm_bench/python/requirements.txt b/llm_bench/python/requirements.txt index 7e3c2ef31c..bbeb5de89e 100644 --- a/llm_bench/python/requirements.txt +++ b/llm_bench/python/requirements.txt @@ -1,15 +1,16 @@ --extra-index-url https://download.pytorch.org/whl/cpu numpy -openvino>=2024.2.0 -openvino_tokenizers>=2024.2.0 -openvino_genai>=2024.2.0 +--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly +openvino +openvino-tokenizers +openvino_genai auto-gptq>=0.5.1 # for gptq pillow torch transformers>=4.40.0 diffusers>=0.22.0 #optimum is in dependency list of optimum-intel -git+https://github.com/huggingface/optimum-intel.git@0a6075b44e2a6c721d6fbd7795b7804a0ce41d02#egg=optimum-intel +git+https://github.com/huggingface/optimum-intel.git@6388aeb8738b63e28fc594af84df94590e77cb9a#egg=optimum-intel git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf packaging psutil diff --git a/llm_bench/python/utils/config_class.py b/llm_bench/python/utils/config_class.py index ac50158f41..1bf5cfda27 100644 --- a/llm_bench/python/utils/config_class.py +++ b/llm_bench/python/utils/config_class.py @@ -63,6 +63,7 @@ 'decoder', 't5', 'falcon', + "glm", 'gpt-', 'gpt2', 'aquila', @@ -102,7 +103,8 @@ "internlm", "olmo", "phi3", - "starcoder" + "starcoder", + "instruct-gpt" ], 'ldm_super_resolution': ['ldm-super-resolution'], } diff --git a/llm_bench/python/utils/conversion_utils/helpers.py b/llm_bench/python/utils/conversion_utils/helpers.py index d7545950d8..578f473c08 100644 --- a/llm_bench/python/utils/conversion_utils/helpers.py +++ b/llm_bench/python/utils/conversion_utils/helpers.py @@ -160,10 
+160,14 @@ def get_data_aware_args(ov_model, tokenizer, config, compression_args, args): res['mode'] = dataset_args['sensitivity_metric'] if 'awq' in dataset_args: res['awq'] = dataset_args['awq'] + if 'scale_estimation' in dataset_args: + res['scale_estimation'] = dataset_args['scale_estimation'] elif args.dataset is not None: dataset_params = args.dataset if args.awq: res['awq'] = args.awq + if args.scale_estimation: + res['scale_estimation'] = args.scale_estimation if dataset_params is not None: # for example "wikitext,wikitext-2-v1,train[:1000],text" @@ -185,7 +189,11 @@ def compress_ov_model_weights_helper(ov_model, tok, config, out_path, compress_w if "INT8" in compress_weights_format and "INT8_ASYM" in COMPRESSION_OPTIONS: warnings.warn("Usage INT8 mode is deprecated and will be removed soon. Please use INT8_ASYM instead", DeprecationWarning) if "4BIT_DEFAULT" in compress_weights_format: - compression_args = _check_default_4bit_configs(config) + try: + # TODO: remove this path when support of an older version optimum-intel is deprecated + compression_args = _check_default_4bit_configs(config) + except TypeError: + compression_args = _check_default_4bit_configs(config.name_or_path) if compression_args: sym = compression_args.pop("sym", False) compression_args.pop("bits", 4) @@ -195,7 +203,7 @@ def compress_ov_model_weights_helper(ov_model, tok, config, out_path, compress_w if model_id in INT4_MODEL_CONFIGURATION: compression_args = INT4_MODEL_CONFIGURATION[model_id] else: - compression_args = COMPRESSION_OPTIONS["INT4_SYM"] + compression_args = COMPRESSION_OPTIONS["INT4_ASYM"] if compression_args is None: compression_args = COMPRESSION_OPTIONS[compress_weights_format] diff --git a/llm_bench/python/utils/model_utils.py b/llm_bench/python/utils/model_utils.py index abd9ac5598..19b53f3ad6 100644 --- a/llm_bench/python/utils/model_utils.py +++ b/llm_bench/python/utils/model_utils.py @@ -134,10 +134,12 @@ def analyze_args(args): model_args['subsequent'] = 
args.subsequent model_args['output_dir'] = args.output_dir model_args['genai'] = args.genai + model_args['devices'] = args.device model_args['prompt_index'] = [] if args.prompt_index is not None else None if model_args['prompt_index'] is not None: # Deduplication [model_args['prompt_index'].append(i) for i in args.prompt_index if i not in model_args['prompt_index']] + model_args['end_token_stopping'] = args.end_token_stopping model_framework = args.framework model_path = Path(args.model) diff --git a/llm_bench/python/utils/nncf_utils.py b/llm_bench/python/utils/nncf_utils.py index b0d0d93aa1..01d0dd95b3 100644 --- a/llm_bench/python/utils/nncf_utils.py +++ b/llm_bench/python/utils/nncf_utils.py @@ -19,6 +19,11 @@ "ratio": 1, "all_layers": True, }, + "E2M1": { + "mode": nncf.CompressWeightsMode.E2M1, + "group_size": 32, + "all_layers": True, + }, } if "INT8_ASYM" in nncf.CompressWeightsMode.__members__: @@ -36,7 +41,6 @@ def get_compressed_path(output_dir: str, base_precision, option: str): "dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, "gpt-j-6b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64}, "opt-6.7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8}, - "bloomz-7b1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.6}, "red-pajama-incite-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128}, "zephyr-7b-beta": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8, "dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": True}}, @@ -53,7 +57,7 @@ def get_compressed_path(output_dir: str, base_precision, option: str): "rocket-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8}, "chatglm2-6b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.72}, "qwen-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6}, 
- "open-llama-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True}, + "open-llama-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 1.0, "all_layers": True}, "falcon-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True}, "orca-mini-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "all_layers": True, "dataset": {"name": "wikitext,wikitext-2-v1,train[:1000],text", "awq": False}}, @@ -68,4 +72,8 @@ def get_compressed_path(output_dir: str, base_precision, option: str): "red-pajama-incite-chat-3b-v1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, "vicuna-7b-v1.5": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 1.0}, "stablelm-tuned-alpha-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, + "longchat-b7": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9}, + "starcoder2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9}, + "tiny-llama-1.1b-chat": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.8}, + "phi-2": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128, "ratio": 0.9}, } diff --git a/llm_bench/python/utils/ov_model_classes.py b/llm_bench/python/utils/ov_model_classes.py index abc4c89aa8..0ade0f1299 100644 --- a/llm_bench/python/utils/ov_model_classes.py +++ b/llm_bench/python/utils/ov_model_classes.py @@ -288,8 +288,11 @@ def __init__( **kwargs, ): super().__init__(model, config, device, dynamic_shapes, ov_config, model_save_dir, **kwargs) - self.key_value_input_names = ['past_key_values'] - self.key_value_output_names = [o.any_name for o in self.model.outputs[1:]] + self.is_v1 = False + if not self.stateful and not self.key_value_input_names: + self.is_v1 = True + self.key_value_input_names = ['past_key_values'] + self.key_value_output_names = [o.any_name for 
o in self.model.outputs[1:]] def prepare_inputs_for_generation( self, @@ -300,6 +303,13 @@ def prepare_inputs_for_generation( past: Optional[torch.Tensor] = None, **kwargs, ) -> dict: + if not self.is_v1: + return super().prepare_inputs_for_generation( + input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, + position_ids=position_ids, + past=past, + **kwargs + ) batch_size, seq_length = input_ids.shape mask = self.mask_token_id g_mask = self.gmask_token_id @@ -430,6 +440,9 @@ def forward( past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, **kwargs, ) -> CausalLMOutputWithPast: + + if not self.is_v1: + return super().forward(input_ids=input_ids, attention_mask=attention_mask, past_key_values=past_key_values, **kwargs) self.compile() inputs = {} diff --git a/pyproject.toml b/pyproject.toml index 6cc7440046..1ea9c9b85f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "openvino_genai" -version = "2024.2.0.0" +version = "2024.4.0.0" description = "Python bindings for https://github.com/openvinotoolkit/openvino.genai" requires-python = ">=3.8" readme = {file = "src/README.md", content-type="text/markdown"} @@ -16,7 +16,7 @@ classifiers = [ "Programming Language :: Python :: 3.12", ] dependencies = [ - "openvino_tokenizers~=2024.2.0.0" + "openvino_tokenizers~=2024.4.0.0.dev" ] [tool.py-build-cmake.module] diff --git a/requirements-build.txt b/requirements-build.txt index 8885e223ea..2611a89b08 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1 +1 @@ -cmake~=3.29 \ No newline at end of file +cmake~=3.30 \ No newline at end of file diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 9e35946472..5339817c1f 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -4,10 +4,13 @@ add_subdirectory(cpp/beam_search_causal_lm) add_subdirectory(cpp/chat_sample) +add_subdirectory(cpp/continuous_batching_accuracy) 
+add_subdirectory(cpp/continuous_batching_benchmark) add_subdirectory(cpp/greedy_causal_lm) add_subdirectory(cpp/multinomial_causal_lm) add_subdirectory(cpp/prompt_lookup_decoding_lm) add_subdirectory(cpp/speculative_decoding_lm) +add_subdirectory(cpp/benchmark_genai) install(FILES requirements.txt DESTINATION samples COMPONENT cpp_samples_genai) @@ -18,6 +21,7 @@ install(DIRECTORY cpp/greedy_causal_lm cpp/multinomial_causal_lm # Don't install prompt_lookup_decoding_lm and speculative_decoding_lm because they don't use openvino_genai library and arent verifyed yet. + # Don't install continuous_batching_accuracy and continuous_batching_benchmark because they depend on json. DESTINATION samples/cpp COMPONENT cpp_samples_genai) install(DIRECTORY diff --git a/samples/cpp/beam_search_causal_lm/README.md b/samples/cpp/beam_search_causal_lm/README.md index a104288911..0d2ee83bfc 100644 --- a/samples/cpp/beam_search_causal_lm/README.md +++ b/samples/cpp/beam_search_causal_lm/README.md @@ -1,4 +1,4 @@ -# Text generation C++ sample that supports most popular models like LLaMA 2 +# Text generation C++ sample that supports most popular models like LLaMA 3 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. It's only possible to change the device for inference to a differnt one, GPU for example, from the command line interface. The sample fearures `ov::genai::LLMPipeline` and configures it to use multiple beam grops. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. 
@@ -17,8 +17,20 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `beam_search_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/cpp/benchmark_genai/CMakeLists.txt b/samples/cpp/benchmark_genai/CMakeLists.txt new file mode 100644 index 0000000000..5178c528ab --- /dev/null +++ b/samples/cpp/benchmark_genai/CMakeLists.txt @@ -0,0 +1,30 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +find_package(OpenVINOGenAI REQUIRED PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. 
+ ${OpenVINO_DIR} # GenAI may be installed alongside OpenVINO. +) + +include(FetchContent) + +if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + +FetchContent_Declare(cxxopts + URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz + URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08) +FetchContent_MakeAvailable(cxxopts) + +add_executable(benchmark_genai benchmark_genai.cpp) +target_link_libraries(benchmark_genai PRIVATE openvino::genai cxxopts::cxxopts) +set_target_properties(benchmark_genai PROPERTIES + COMPILE_PDB_NAME benchmark_genai + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) + +install(TARGETS benchmark_genai + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/benchmark_genai/README.md b/samples/cpp/benchmark_genai/README.md new file mode 100644 index 0000000000..616bb6a36d --- /dev/null +++ b/samples/cpp/benchmark_genai/README.md @@ -0,0 +1,47 @@ +# LLMs benchmarking sample + +This sample script demonstrates how to benchmark LLMs in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. + +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Usage + +```sh +benchmark_genai [OPTIONS] +``` + +### Options + +- `-m, --model`: Path to the model and tokenizers base directory. +- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. 
+- `-nw, --num_warmup` (default: `1`): Number of warmup iterations. +- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens. +- `-n, --num_iter` (default: `3`): Number of iterations. +- `-d, --device` (default: `"CPU"`): Device to run the model on. + +### Output: + +``` +benchmark_genai -m TinyLlama-1.1B-Chat-v1.0 -n 10 +``` + +``` +Load time: 3405.69 ms +Generate time: 1430.77 ± 3.04 ms +Tokenization time: 0.51 ± 0.02 ms +Detokenization time: 0.37 ± 0.01 ms +TTFT: 81.60 ± 0.54 ms +TPOT: 71.52 ± 2.72 ms +Throughput tokens/s: 13.98 ± 0.53 +``` + +For more information on how performance metrics are calculated, see the [performance-metrics tutorial](../../../src/README.md#performance-metrics). diff --git a/samples/cpp/benchmark_genai/benchmark_genai.cpp b/samples/cpp/benchmark_genai/benchmark_genai.cpp new file mode 100644 index 0000000000..287d6b379a --- /dev/null +++ b/samples/cpp/benchmark_genai/benchmark_genai.cpp @@ -0,0 +1,70 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/llm_pipeline.hpp" +#include + +int main(int argc, char* argv[]) try { + cxxopts::Options options("benchmark_vanilla_genai", "Help command"); + + options.add_options() + ("m,model", "Path to model and tokenizers base directory", cxxopts::value()->default_value(".")) + ("p,prompt", "Prompt", cxxopts::value()->default_value("The Sky is blue because")) + ("nw,num_warmup", "Number of warmup iterations", cxxopts::value()->default_value(std::to_string(1))) + ("n,num_iter", "Number of iterations", cxxopts::value()->default_value(std::to_string(3))) + ("mt,max_new_tokens", "Maximal number of new tokens", cxxopts::value()->default_value(std::to_string(20))) + ("d,device", "device", cxxopts::value()->default_value("CPU")) + ("h,help", "Print usage"); + + cxxopts::ParseResult result; + try { + result = options.parse(argc, argv); + } catch (const cxxopts::exceptions::exception& e) { + std::cout << e.what() 
<< "\n\n"; + std::cout << options.help() << std::endl; + return EXIT_FAILURE; + } + + if (result.count("help")) { + std::cout << options.help() << std::endl; + return EXIT_SUCCESS; + } + + std::string prompt = result["prompt"].as(); + const std::string model_path = result["model"].as(); + std::string device = result["device"].as(); + size_t num_warmup = result["num_warmup"].as(); + size_t num_iter = result["num_iter"].as(); + + ov::genai::GenerationConfig config; + config.max_new_tokens = result["max_new_tokens"].as(); + + ov::genai::LLMPipeline pipe(model_path, device); + + for (size_t i = 0; i < num_warmup; i++) + pipe.generate(prompt, config); + + ov::genai::DecodedResults res = pipe.generate(prompt, config); + ov::genai::PerfMetrics metrics = res.perf_metrics; + for (size_t i = 0; i < num_iter - 1; i++) { + res = pipe.generate(prompt, config); + metrics = metrics + res.perf_metrics; + } + + std::cout << std::fixed << std::setprecision(2); + std::cout << "Load time: " << metrics.get_load_time() << " ms" << std::endl; + std::cout << "Generate time: " << metrics.get_generate_duration().mean << " Β± " << metrics.get_generate_duration().std << " ms" << std::endl; + std::cout << "Tokenization time: " << metrics.get_tokenization_duration().mean << " Β± " << metrics.get_tokenization_duration().std << " ms" << std::endl; + std::cout << "Detokenization time: " << metrics.get_detokenization_duration().mean << " Β± " << metrics.get_detokenization_duration().std << " ms" << std::endl; + std::cout << "TTFT: " << metrics.get_ttft().mean << " Β± " << metrics.get_ttft().std << " ms" << std::endl; + std::cout << "TPOT: " << metrics.get_tpot().mean << " Β± " << metrics.get_tpot().std << " ms/token " << std::endl; + std::cout << "Throughput: " << metrics.get_throughput().mean << " Β± " << metrics.get_throughput().std << " tokens/s" << std::endl; + + return 0; +} catch (const std::exception& error) { + std::cerr << error.what() << '\n'; + return EXIT_FAILURE; +} catch (...) 
{ + std::cerr << "Non-exception object thrown\n"; + return EXIT_FAILURE; +} diff --git a/samples/cpp/chat_sample/README.md b/samples/cpp/chat_sample/README.md index 4baa8385ef..3f736985c2 100644 --- a/samples/cpp/chat_sample/README.md +++ b/samples/cpp/chat_sample/README.md @@ -1,4 +1,4 @@ -# C++ chat_sample that supports most popular models like LLaMA 2 +# C++ chat_sample that supports most popular models like LLaMA 3 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. @@ -17,8 +17,28 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `chat_sample TinyLlama-1.1B-Chat-v1.0` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. 
+ +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined> +``` + +If you encounter the error described in the example when the sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting the environment variable `PYTHONIOENCODING="utf8"`. + +#### Missing chat template + +If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work around this, manually add the chat template to the `tokenizer_config.json` of your model. 
+The following template can be used as a default, but it may not work properly with every model: +``` +"chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}", +``` diff --git a/samples/cpp/chat_sample/chat_sample.cpp b/samples/cpp/chat_sample/chat_sample.cpp index fa0442d415..ae4dad88a2 100644 --- a/samples/cpp/chat_sample/chat_sample.cpp +++ b/samples/cpp/chat_sample/chat_sample.cpp @@ -10,14 +10,14 @@ int main(int argc, char* argv[]) try { std::string prompt; std::string model_path = argv[1]; - std::string device = "CPU"; // GPU can be used as well + std::string device = "CPU"; // GPU, NPU can be used as well ov::genai::LLMPipeline pipe(model_path, "CPU"); ov::genai::GenerationConfig config; config.max_new_tokens = 100; std::function streamer = [](std::string word) { std::cout << word << std::flush; - // Return flag correspods whether generation should be stopped. + // Return flag corresponds whether generation should be stopped. // false means continue generation. 
return false; }; diff --git a/samples/cpp/continuous_batching_accuracy/CMakeLists.txt b/samples/cpp/continuous_batching_accuracy/CMakeLists.txt new file mode 100644 index 0000000000..26dc9bc7b8 --- /dev/null +++ b/samples/cpp/continuous_batching_accuracy/CMakeLists.txt @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# start of dependencies + +include(FetchContent) + +if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + +FetchContent_Declare(cxxopts + URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz + URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08) +FetchContent_MakeAvailable(cxxopts) + +if(NOT TARGET nlohmann_json) + FetchContent_Declare(nlohmann_json + URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz + URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) + FetchContent_MakeAvailable(nlohmann_json) +endif() + +find_package(OpenVINO REQUIRED COMPONENTS Runtime) + +# end of dependencies + +set(TARGET_NAME continuous_batching_accuracy) +add_executable(${TARGET_NAME} ${TARGET_NAME}.cpp) +target_link_libraries(${TARGET_NAME} PRIVATE openvino::genai cxxopts::cxxopts) diff --git a/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp b/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp similarity index 70% rename from text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp rename to samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp index ac3b9cb548..77485e36db 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/apps/accuracy_sample.cpp +++ b/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp @@ -4,10 +4,9 @@ #include #include -#include "continuous_batching_pipeline.hpp" -#include "tokenizer.hpp" +#include "openvino/genai/continuous_batching_pipeline.hpp" -void 
print_generation_result(const GenerationResult& generation_result) { +void print_generation_result(const ov::genai::GenerationResult& generation_result) { for (size_t output_id = 0; output_id < generation_result.m_generation_ids.size(); ++output_id) { std::cout << "Answer " << output_id << " (" << generation_result.m_scores[output_id] << ") : " << generation_result.m_generation_ids[output_id] << std::endl; } @@ -52,14 +51,14 @@ int main(int argc, char* argv[]) try { "What is OpenVINO?", }; - std::vector sampling_params_examples { - GenerationConfig::beam_search(), - GenerationConfig::greedy(), - GenerationConfig::multinomial(), + std::vector sampling_params_examples { + ov::genai::beam_search(), + ov::genai::greedy(), + ov::genai::multinomial(), }; std::vector prompts(num_prompts); - std::vector sampling_params(num_prompts); + std::vector sampling_params(num_prompts); for (size_t request_id = 0; request_id < num_prompts; ++request_id) { prompts[request_id] = prompt_examples[request_id % prompt_examples.size()]; @@ -68,37 +67,38 @@ int main(int argc, char* argv[]) try { // Perform the inference - SchedulerConfig scheduler_config { - // batch size - .max_num_batched_tokens = 32, - // cache params - .num_kv_blocks = 364, - .block_size = 32, - // mode - vLLM or dynamic_split_fuse - .dynamic_split_fuse = dynamic_split_fuse, - // vLLM specific params - .max_num_seqs = 2, - }; - - ContinuousBatchingPipeline pipe(models_path, scheduler_config); - std::vector generation_results = pipe.generate(prompts, sampling_params); + ov::genai::SchedulerConfig scheduler_config; + // batch size + scheduler_config.max_num_batched_tokens = 32; + // cache params + scheduler_config.num_kv_blocks = 364; + scheduler_config.block_size = 32; + // mode - vLLM or dynamic_split_fuse + scheduler_config.dynamic_split_fuse = dynamic_split_fuse; + // vLLM specific params + scheduler_config.max_num_seqs = 2; + + // It's possible to construct a Tokenizer from a different path. 
+ // If the Tokenizer isn't specified, it's loaded from the same folder. + ov::genai::ContinuousBatchingPipeline pipe(models_path, ov::genai::Tokenizer{models_path}, scheduler_config); + std::vector generation_results = pipe.generate(prompts, sampling_params); for (size_t request_id = 0; request_id < generation_results.size(); ++request_id) { - const GenerationResult & generation_result = generation_results[request_id]; + const ov::genai::GenerationResult & generation_result = generation_results[request_id]; std::cout << "Question: " << prompts[request_id] << std::endl; switch (generation_result.m_status) { - case GenerationStatus::FINISHED: + case ov::genai::GenerationStatus::FINISHED: print_generation_result(generation_result); break; - case GenerationStatus::IGNORED: + case ov::genai::GenerationStatus::IGNORED: std::cout << "Request was ignored due to lack of memory." < 0) { std::cout << "Partial result:" << std::endl; print_generation_result(generation_result); } break; - case GenerationStatus::DROPPED_BY_PIPELINE: + case ov::genai::GenerationStatus::DROPPED_BY_PIPELINE: std::cout << "Request was aborted." 
< 0) { std::cout << "Partial result:" << std::endl; diff --git a/text_generation/causal_lm/cpp/continuous_batching/apps/CMakeLists.txt b/samples/cpp/continuous_batching_benchmark/CMakeLists.txt similarity index 63% rename from text_generation/causal_lm/cpp/continuous_batching/apps/CMakeLists.txt rename to samples/cpp/continuous_batching_benchmark/CMakeLists.txt index 86c7046605..34a15f58d7 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/apps/CMakeLists.txt +++ b/samples/cpp/continuous_batching_benchmark/CMakeLists.txt @@ -5,6 +5,10 @@ include(FetchContent) +if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + FetchContent_Declare(cxxopts URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08) @@ -22,12 +26,6 @@ find_package(Threads REQUIRED) # end of dependencies -set(TARGET_NAME accuracy_sample) -add_executable(${TARGET_NAME} ${TARGET_NAME}.cpp) -target_link_libraries(${TARGET_NAME} PRIVATE openvino::continuous_batching cxxopts::cxxopts) -target_compile_features(${TARGET_NAME} PRIVATE cxx_std_20) - -set(TARGET_NAME throughput_benchmark) +set(TARGET_NAME continuous_batching_benchmark) add_executable(${TARGET_NAME} ${TARGET_NAME}.cpp) -target_link_libraries(${TARGET_NAME} PRIVATE openvino::continuous_batching nlohmann_json::nlohmann_json cxxopts::cxxopts Threads::Threads) -target_compile_features(${TARGET_NAME} PRIVATE cxx_std_20) +target_link_libraries(${TARGET_NAME} PRIVATE openvino::genai nlohmann_json::nlohmann_json cxxopts::cxxopts Threads::Threads) diff --git a/text_generation/causal_lm/cpp/continuous_batching/apps/throughput_benchmark.cpp b/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark.cpp similarity index 90% rename from text_generation/causal_lm/cpp/continuous_batching/apps/throughput_benchmark.cpp rename to samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark.cpp index e409d796a0..a687bd61d1 
100644 --- a/text_generation/causal_lm/cpp/continuous_batching/apps/throughput_benchmark.cpp +++ b/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark.cpp @@ -11,14 +11,12 @@ #include #include - -#include #include #include -#include "tokenizer.hpp" -#include "continuous_batching_pipeline.hpp" -#include "generation_handle.hpp" +#include "openvino/genai/tokenizer.hpp" +#include "openvino/genai/continuous_batching_pipeline.hpp" +#include "openvino/genai/generation_handle.hpp" namespace { @@ -37,7 +35,7 @@ class AutoStartTimer { struct Dataset { std::vector m_prompts; - std::vector m_sampling_params; + std::vector m_sampling_params; std::vector m_input_lens, m_output_lens; size_t m_total_input_len = 0; @@ -50,7 +48,7 @@ struct Dataset { m_output_lens.reserve(size); } - void push_data(std::string prompt, GenerationConfig sampling_params) { + void push_data(std::string prompt, ov::genai::GenerationConfig sampling_params) { m_prompts.push_back(prompt); m_sampling_params.push_back(sampling_params); } @@ -95,7 +93,7 @@ Dataset filtered_dataset(const std::string& models_path, const std::string& data sampled_dataset.reserve(num_prompt_candidates); dataset.reserve(num_prompt_candidates); - Tokenizer tokenizer(models_path); + ov::genai::Tokenizer tokenizer(models_path); for (auto json_data_iterator = json_dataset.begin(); json_data_iterator != json_dataset.end() && dataset.size() < num_prompt_candidates; ++json_data_iterator) { auto & json_data = *json_data_iterator; @@ -108,10 +106,10 @@ Dataset filtered_dataset(const std::string& models_path, const std::string& data std::string human_question = json_data["conversations"][0]["value"]; std::string gpt_answer = json_data["conversations"][1]["value"]; - ov::Tensor _input_ids_prompt = tokenizer.encode(human_question); + ov::Tensor _input_ids_prompt = tokenizer.encode(human_question).input_ids; size_t input_len = _input_ids_prompt.get_size(); - ov::Tensor _input_ids_answer = tokenizer.encode(gpt_answer); + 
ov::Tensor _input_ids_answer = tokenizer.encode(gpt_answer).input_ids; size_t output_len = _input_ids_answer.get_size(); // Prune too short sequences. @@ -121,8 +119,13 @@ Dataset filtered_dataset(const std::string& models_path, const std::string& data if (input_len > max_input_len || (input_len + output_len) > 2048) continue; - GenerationConfig greedy_search = GenerationConfig::greedy(); + ov::genai::GenerationConfig greedy_search = ov::genai::greedy(); greedy_search.max_new_tokens = std::min(max_output_len, output_len); + greedy_search.repetition_penalty = 1.0; + greedy_search.frequency_penalty = 0.0; + greedy_search.presence_penalty = 0.0; + greedy_search.diversity_penalty = 0.0; + greedy_search.length_penalty = 0.0; dataset.push_data(human_question, greedy_search); dataset.push_lens(input_len, output_len); @@ -178,14 +181,14 @@ class GenerationInfo { size_t num_input_tokens; }; - GenerationHandle generation_handle; + ov::genai::GenerationHandle generation_handle; std::chrono::steady_clock::time_point start_time; std::unordered_map sequences_info; bool active = true; size_t input_len; public: - GenerationInfo(GenerationHandle generation_handle, size_t input_len) : input_len(input_len) + GenerationInfo(ov::genai::GenerationHandle generation_handle, size_t input_len) : input_len(input_len) { this->generation_handle = std::move(generation_handle); start_time = std::chrono::steady_clock::now(); @@ -197,13 +200,13 @@ class GenerationInfo { sequences_info.at(sequence_id).update(); } - void update(GenerationOutputs& outputs){ + void update(ov::genai::GenerationOutputs& outputs){ for (auto const& output: outputs) { update_sequence(output.first); } } - GenerationOutputs read() { + ov::genai::GenerationOutputs read() { return generation_handle->read(); } @@ -212,7 +215,7 @@ class GenerationInfo { } bool is_finished() { - return generation_handle->get_status() == GenerationStatus::FINISHED; + return generation_handle->get_status() == ov::genai::GenerationStatus::FINISHED; 
} void set_inactive() { @@ -249,13 +252,13 @@ class GenerationInfoCollector { this->start_time = start_time; } - void add_generation(ContinuousBatchingPipeline* pipe, Dataset* dataset, size_t request_id) { - GenerationHandle generation_handle = pipe->add_request(request_id, dataset->m_prompts[request_id], dataset->m_sampling_params[request_id]); + void add_generation(ov::genai::ContinuousBatchingPipeline* pipe, Dataset* dataset, size_t request_id) { + ov::genai::GenerationHandle generation_handle = pipe->add_request(request_id, dataset->m_prompts[request_id], dataset->m_sampling_params[request_id]); std::lock_guard lock(mutex); generations_info.emplace_back(std::move(generation_handle), dataset->m_input_lens[request_id]); } - int run() { + size_t run() { std::lock_guard lock(mutex); for (GenerationInfo& generation_info : generations_info) { if (!generation_info.is_active()) @@ -299,7 +302,7 @@ class GenerationInfoCollector { } }; -void trafficSimulator(ContinuousBatchingPipeline* pipe, Dataset* dataset, std::string request_rate, GenerationInfoCollector* generation_info_collector) { +void trafficSimulator(ov::genai::ContinuousBatchingPipeline* pipe, Dataset* dataset, std::string request_rate, GenerationInfoCollector* generation_info_collector) { double numeric_request_rate; std::random_device rd; std::mt19937 gen(rd()); @@ -333,7 +336,7 @@ void trafficSimulator(ContinuousBatchingPipeline* pipe, Dataset* dataset, std::s std::cout << "All requests sent, traffic simulation finished. Exiting thread." 
<< std::endl; } -void llmEngineLoop(ContinuousBatchingPipeline* pipe, Dataset* dataset, std::atomic* finishThread) { +void llmEngineLoop(ov::genai::ContinuousBatchingPipeline* pipe, Dataset* dataset, std::atomic* finishThread) { std::cout << "Launching LLM engine thread" << std::endl; size_t num_finished = 0; @@ -425,7 +428,7 @@ int main(int argc, char* argv[]) try { options.add_options() ("n,num_prompts", "A number of prompts", cxxopts::value()->default_value("1000")) ("b,max_batch_size", "A maximum number of batched tokens", cxxopts::value()->default_value("256")) - ("dynamic_split_fuse", "Whether to use dynamic split-fuse or vLLM scheduling", cxxopts::value()->default_value("false")) + ("dynamic_split_fuse", "Whether to use dynamic split-fuse or vLLM scheduling", cxxopts::value()->default_value("true")) ("m,model", "Path to model and tokenizers base directory", cxxopts::value()->default_value(".")) ("dataset", "Path to dataset .json file", cxxopts::value()->default_value("./ShareGPT_V3_unfiltered_cleaned_split.json")) ("max_input_len", "Max input length take from dataset", cxxopts::value()->default_value("1024")) @@ -466,13 +469,12 @@ int main(int argc, char* argv[]) try { Dataset dataset = filtered_dataset(models_path, dataset_path, num_prompts, max_input_len, max_output_len); // Perform the first inference - SchedulerConfig scheduler_config { - .max_num_batched_tokens = max_batch_size, - .cache_size = cache_size, - .block_size = 32, - .dynamic_split_fuse = dynamic_split_fuse, - .max_num_seqs = 256, // not used if dynamic_split_fuse=True - }; + ov::genai::SchedulerConfig scheduler_config; + scheduler_config.max_num_batched_tokens = max_batch_size, + scheduler_config.cache_size = cache_size, + scheduler_config.block_size = 32, + scheduler_config.dynamic_split_fuse = dynamic_split_fuse, + scheduler_config.max_num_seqs = 256, // not used if dynamic_split_fuse=True std::cout << "Benchmarking parameters: " << std::endl; std::cout << "\tMax number of batched tokens: 
" << scheduler_config.max_num_batched_tokens << std::endl; @@ -495,7 +497,7 @@ int main(int argc, char* argv[]) try { // Benchmarking std::cout << "Loading models, creating pipelines, preparing environment..." << std::endl; - ContinuousBatchingPipeline pipe(models_path, scheduler_config, device, device_config_map); + ov::genai::ContinuousBatchingPipeline pipe(models_path, scheduler_config, device, device_config_map); std::cout << "Setup finished, launching LLM executor, traffic simulation and statistics reporter threads" << std::endl; diff --git a/samples/cpp/greedy_causal_lm/README.md b/samples/cpp/greedy_causal_lm/README.md index 3c0758ee6b..79852e0d10 100644 --- a/samples/cpp/greedy_causal_lm/README.md +++ b/samples/cpp/greedy_causal_lm/README.md @@ -1,4 +1,4 @@ -# Text generation C++ greedy_causal_lm that supports most popular models like LLaMA 2 +# Text generation C++ greedy_causal_lm that supports most popular models like LLaMA 3 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it to run the simplest deterministic greedy sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. @@ -17,8 +17,20 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `greedy_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. 
Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. 
diff --git a/samples/cpp/multinomial_causal_lm/CMakeLists.txt b/samples/cpp/multinomial_causal_lm/CMakeLists.txt index efcac50f09..98bc76ee3c 100644 --- a/samples/cpp/multinomial_causal_lm/CMakeLists.txt +++ b/samples/cpp/multinomial_causal_lm/CMakeLists.txt @@ -11,7 +11,7 @@ set_target_properties(multinomial_causal_lm PROPERTIES COMPILE_PDB_NAME multinomial_causal_lm # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_features(greedy_causal_lm PRIVATE cxx_std_11) +target_compile_features(multinomial_causal_lm PRIVATE cxx_std_11) install(TARGETS multinomial_causal_lm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin diff --git a/samples/cpp/multinomial_causal_lm/README.md b/samples/cpp/multinomial_causal_lm/README.md index 731d03e3c1..21c9a07e77 100644 --- a/samples/cpp/multinomial_causal_lm/README.md +++ b/samples/cpp/multinomial_causal_lm/README.md @@ -1,4 +1,4 @@ -# Text generation C++ multinomial_causal_lm that supports most popular models like LLaMA 2 +# Text generation C++ multinomial_causal_lm that supports most popular models like LLaMA 3 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it to run random sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. @@ -17,8 +17,20 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `multinomial_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. 
`Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. 
diff --git a/samples/cpp/prompt_lookup_decoding_lm/README.md b/samples/cpp/prompt_lookup_decoding_lm/README.md index 980c0cd19c..c5517c5bf6 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/README.md +++ b/samples/cpp/prompt_lookup_decoding_lm/README.md @@ -1,4 +1,4 @@ -# prompt_lookup_decoding_lm C++ sample that supports most popular models like LLaMA 2 +# prompt_lookup_decoding_lm C++ sample that supports most popular models like LLaMA 3 [Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is [assested-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching the prompt to generate candidate token sequences. This method highly effective for input grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between LLM input (prompt) and LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality. @@ -20,8 +20,20 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "return 0;"` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. 
See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp index cd6de37753..51ac654aac 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp +++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp @@ -37,7 +37,7 @@ struct TextStreamer { void put(int64_t token) { token_cache.push_back(token); std::string text = detokenize(detokenizer, token_cache); - if (!text.empty() && '\n' == text.back()) { + if (!text.empty() && '\n' == text.back() && text.size() > print_len) { // Flush the cache after the new line symbol std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; token_cache.clear(); @@ -47,13 +47,18 @@ struct TextStreamer { if (text.size() >= 3 && text.compare(text.size() - 3, 3, "οΏ½") == 0) { // Don't print incomplete text return; + } else if (text.size() > print_len) { + // It is possible to have a shorter text after adding new token. + // Print to output only if text lengh is increaeseds. 
+ std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + print_len = text.size(); } - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; - print_len = text.size(); } void end() { std::string text = detokenize(detokenizer, token_cache); + if (text.size() <= print_len) + return ; std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; token_cache.clear(); print_len = 0; diff --git a/samples/cpp/speculative_decoding_lm/README.md b/samples/cpp/speculative_decoding_lm/README.md index 7abcb6782a..644ebd2c94 100644 --- a/samples/cpp/speculative_decoding_lm/README.md +++ b/samples/cpp/speculative_decoding_lm/README.md @@ -1,4 +1,4 @@ -# speculative_decoding_lm C++ sample that supports most popular models like LLaMA 2 +# speculative_decoding_lm C++ sample that supports most popular models like LLaMA 3 Speculative decoding (or [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) in HF terminology) is a recent technique, that allows to speed up token generation when an additional smaller draft model is used alonside with the main model. @@ -24,8 +24,20 @@ optimum-cli export openvino --trust-remote-code --model meta-llama/Llama-2-7b-ch `speculative_decoding_lm TinyLlama-1.1B-Chat-v1.0 Llama-2-7b-chat-hf "Why is the Sun yellow?"` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. 
See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index b0c40a7a9f..4927b7d795 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -37,7 +37,7 @@ struct TextStreamer { void put(int64_t token) { token_cache.push_back(token); std::string text = detokenize(detokenizer, token_cache); - if (!text.empty() && '\n' == text.back()) { + if (!text.empty() && '\n' == text.back() && text.size() > print_len) { // Flush the cache after the new line symbol std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; token_cache.clear(); @@ -47,13 +47,18 @@ struct TextStreamer { if (text.size() >= 3 && text.compare(text.size() - 3, 3, "οΏ½") == 0) { // Don't print incomplete text return; + } else if (text.size() > print_len) { + // It is possible to have a shorter text after adding new token. + // Print to output only if text lengh is increaesed. 
+ std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + print_len = text.size(); } - std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; - print_len = text.size(); } void end() { std::string text = detokenize(detokenizer, token_cache); + if (text.size() <= print_len) + return ; std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; token_cache.clear(); print_len = 0; diff --git a/samples/python/beam_search_causal_lm/README.md b/samples/python/beam_search_causal_lm/README.md index ff5286d010..7e412db379 100644 --- a/samples/python/beam_search_causal_lm/README.md +++ b/samples/python/beam_search_causal_lm/README.md @@ -1,4 +1,4 @@ -# Text generation Python sample that supports most popular models like LLaMA 2 +# Text generation Python sample that supports most popular models like LLaMA 3 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. It's only possible to change the device for inference to a differnt one, GPU for example, from the command line interface. The sample fearures `openvino_genai.LLMPipeline` and configures it to use multiple beam grops. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. @@ -17,8 +17,20 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `beam_search_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. 
Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined> +``` + +If you encounter the error described in the example when the sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/benchmark_genai/README.md b/samples/python/benchmark_genai/README.md new file mode 100644 index 0000000000..9baf17c4d7 --- /dev/null +++ b/samples/python/benchmark_genai/README.md @@ -0,0 +1,47 @@ +# LLMs benchmarking sample + +This sample script demonstrates how to benchmark LLMs in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported.
+ +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Usage + +```sh +python benchmark_genai.py [OPTIONS] +``` + +### Options + +- `-m, --model`: Path to the model and tokenizers base directory. +- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. +- `-nw, --num_warmup` (default: `1`): Number of warmup iterations. +- `-n, --num_iter` (default: `2`): Number of iterations. +- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate. +- `-d, --device` (default: `"CPU"`): Device to run the model on. + +### Output: + +``` +python benchmark_genai.py -m TinyLlama-1.1B-Chat-v1.0 -n 10 +``` + +``` +Load time: 3405.69 ms +Generate time: 1430.77 ± 3.04 ms +Tokenization time: 0.51 ± 0.02 ms +Detokenization time: 0.37 ± 0.01 ms +TTFT: 81.60 ± 0.54 ms +TPOT: 71.52 ± 2.72 ms +Throughput : 13.98 ± 0.53 tokens/s +``` + +For more information on how performance metrics are calculated, see [performance metrics readme](../../../src/README.md#performance-metrics).
diff --git a/samples/python/benchmark_genai/benchmark_genai.py b/samples/python/benchmark_genai/benchmark_genai.py new file mode 100755 index 0000000000..9851483880 --- /dev/null +++ b/samples/python/benchmark_genai/benchmark_genai.py @@ -0,0 +1,49 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import openvino_genai as ov_genai + +def main(): + parser = argparse.ArgumentParser(description="Help command") + parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory") + parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt") + parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations") + parser.add_argument("-n", "--num_iter", type=int, default=2, help="Number of iterations") + parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens") + parser.add_argument("-d", "--device", type=str, default="CPU", help="Device") + + args = parser.parse_args() + + # Perf metrics is stored in DecodedResults. + # In order to get DecodedResults instead of a string input should be a list. 
+ prompt = [args.prompt] + model_path = args.model + device = args.device + num_warmup = args.num_warmup + num_iter = args.num_iter + + config = ov_genai.GenerationConfig() + config.max_new_tokens = args.max_new_tokens + + pipe = ov_genai.LLMPipeline(model_path, device) + + for _ in range(num_warmup): + pipe.generate(prompt, config) + + res = pipe.generate(prompt, config) + perf_metrics = res.perf_metrics + for _ in range(num_iter - 1): + res = pipe.generate(prompt, config) + perf_metrics += res.perf_metrics + + print(f"Load time: {perf_metrics.get_load_time():.2f} ms") + print(f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} Β± {perf_metrics.get_generate_duration().std:.2f} ms") + print(f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} Β± {perf_metrics.get_tokenization_duration().std:.2f} ms") + print(f"Detokenization time: {perf_metrics.get_detokenization_duration().mean:.2f} Β± {perf_metrics.get_detokenization_duration().std:.2f} ms") + print(f"TTFT: {perf_metrics.get_ttft().mean:.2f} Β± {perf_metrics.get_ttft().std:.2f} ms") + print(f"TPOT: {perf_metrics.get_tpot().mean:.2f} Β± {perf_metrics.get_tpot().std:.2f} ms") + print(f"Throughput : {perf_metrics.get_throughput().mean:.2f} Β± {perf_metrics.get_throughput().std:.2f} tokens/s") + +if __name__ == "__main__": + main() diff --git a/samples/python/chat_sample/README.md b/samples/python/chat_sample/README.md index 34d71fab8a..66fe4b0d93 100644 --- a/samples/python/chat_sample/README.md +++ b/samples/python/chat_sample/README.md @@ -1,4 +1,4 @@ -# Python chat_sample that supports most popular models like LLaMA 2 +# Python chat_sample that supports most popular models like LLaMA 3 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. 
For example, change the device for inference to GPU. The sample fearures `openvino_genai.LLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. @@ -17,8 +17,28 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `chat_sample.py TinyLlama-1.1B-Chat-v1.0` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined> +``` + +If you encounter the error described in the example when the sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`.
+ +#### Missing chat template + +If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model. +The following template can be used as a default, but it may not work properly with every model: +``` +"chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}", +``` \ No newline at end of file diff --git a/samples/python/chat_sample/chat_sample.py b/samples/python/chat_sample/chat_sample.py index eb51692436..29e81026d6 100755 --- a/samples/python/chat_sample/chat_sample.py +++ b/samples/python/chat_sample/chat_sample.py @@ -8,7 +8,7 @@ def streamer(subword): print(subword, end='', flush=True) - # Return flag correspods whether generation should be stopped. + # Return flag corresponds whether generation should be stopped. # False means continue generation. return False diff --git a/samples/python/greedy_causal_lm/README.md b/samples/python/greedy_causal_lm/README.md index 7c87b04aad..1f0eb333ea 100644 --- a/samples/python/greedy_causal_lm/README.md +++ b/samples/python/greedy_causal_lm/README.md @@ -1,4 +1,4 @@ -# Text generation Python greedy_causal_lm that supports most popular models like LLaMA 2 +# Text generation Python greedy_causal_lm that supports most popular models like LLaMA 3 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. 
The sample fearures `openvino_genai.LLMPipeline` and configures it to run the simplest deterministic greedy sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. @@ -17,8 +17,20 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `greedy_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined> +``` + +If you encounter the error described in the example when the sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`.
diff --git a/samples/python/multinomial_causal_lm/README.md b/samples/python/multinomial_causal_lm/README.md index d76b933663..0778868e6a 100644 --- a/samples/python/multinomial_causal_lm/README.md +++ b/samples/python/multinomial_causal_lm/README.md @@ -1,4 +1,4 @@ -# Text generation Python multinomial_causal_lm that supports most popular models like LLaMA 2 +# Text generation Python multinomial_causal_lm that supports most popular models like LLaMA 3 This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it to run random sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. @@ -17,8 +17,20 @@ optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B `multinomial_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` -To enable Unicode characters for Windows cmd open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. 
+ +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined> +``` + +If you encounter the error described in the example when the sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/requirements.txt b/samples/requirements.txt index d16301ad3e..7c0ffb05e9 100644 --- a/samples/requirements.txt +++ b/samples/requirements.txt @@ -1,4 +1,4 @@ --extra-index-url https://download.pytorch.org/whl/cpu -optimum[openvino]==1.20.0 +optimum[openvino]==1.21.2 einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen diff --git a/src/README.md b/src/README.md index 2d044b8519..893ffb5ea9 100644 --- a/src/README.md +++ b/src/README.md @@ -5,10 +5,24 @@ It hides the complexity of the generation process and minimizes the amount of co ## Install OpenVINO™ GenAI +> **NOTE**: Please make sure that you are following the versions compatibility rules, refer to the [OpenVINO™ GenAI Dependencies](#openvino-genai-dependencies) for more information. + The OpenVINO™ GenAI flavor is available for installation via Archive and PyPI distributions. To install OpenVINO™ GenAI, refer to the [Install Guide](https://docs.openvino.ai/2024/get-started/install-openvino.html). -To build OpenVINO™ GenAI library from source, refer to the [Build Instructions](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/2/src/docs/BUILD.md).
+To build OpenVINO™ GenAI library from source, refer to the [Build Instructions](./docs/BUILD.md). + +### OpenVINO™ GenAI Dependencies + +OpenVINO™ GenAI depends on [OpenVINO](https://github.com/openvinotoolkit/openvino) and [OpenVINO Tokenizers](https://github.com/openvinotoolkit/openvino_tokenizers). + +When installing OpenVINO™ GenAI from PyPI, the same versions of OpenVINO and OpenVINO Tokenizers are used (e.g. `openvino==2024.3.0` and `openvino-tokenizers==2024.3.0.0` are installed for `openvino-genai==2024.3.0`). +If you update one of the dependency packages (e.g. `pip install openvino --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly`), versions might be incompatible due to different ABI and running OpenVINO GenAI can result in errors (e.g. `ImportError: libopenvino.so.2430: cannot open shared object file: No such file or directory`). +With package versions in the format `<MAJOR>.<MINOR>.<PATCH>.<REVISION>`, only the `<REVISION>` part of the full version can be varied while preserving ABI compatibility; changing the `<MAJOR>`, `<MINOR>` or `<PATCH>` parts of the version might break ABI. + +GenAI, Tokenizers, and OpenVINO wheels for Linux on PyPI are compiled with `_GLIBCXX_USE_CXX11_ABI=0` to cover a wider range of platforms. In contrast, C++ archive distributions for Ubuntu are compiled with `_GLIBCXX_USE_CXX11_ABI=1`. It is not possible to mix different Application Binary Interfaces (ABIs) because doing so results in a link error. This incompatibility prevents the use of, for example, OpenVINO from C++ archive distributions alongside GenAI from PyPI. + +If you want to try OpenVINO GenAI with different dependencies versions (**not** prebuilt packages as archives or python wheels), build OpenVINO GenAI library from source. ## Usage @@ -16,16 +30,16 @@ To build OpenVINO™ GenAI library from source, refer to the [Build Instructions 1.
Installed OpenVINOβ„’ GenAI - > If OpenVINO GenAI is installed via archive distribution or built from source, you will need to install additional python dependencies (e.g. `optimum-cli` for simplified model downloading and exporting, it's not required to install [./samples/cpp/requirements.txt](./samples/cpp/requirements.txt) for deployment if the model has already been exported): - > - > ```sh - > # (Optional) Clone OpenVINO GenAI repository if it does not exist - > git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git - > cd openvino.genai - > # Install python dependencies - > python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release - > python -m pip install --upgrade-strategy eager -r ./samples/cpp/requirements.txt - > ``` + > To use OpenVINO GenAI with models that are already in OpenVINO format, no additional python dependencies are needed. To + > convert models with optimum-cli and to run the examples, install the dependencies in [./samples/requirements.txt](./samples/requirements.txt): + ```sh + # (Optional) Clone OpenVINO GenAI repository if it does not exist + git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git + cd openvino.genai + # Install python dependencies + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt + ``` 2. A model in OpenVINO IR format @@ -42,7 +56,7 @@ A simple example: ```python import openvino_genai as ov_genai pipe = ov_genai.LLMPipeline(model_path, "CPU") -print(pipe.generate("The Sun is yellow because")) +print(pipe.generate("The Sun is yellow because", max_new_tokens=100)) ``` Calling generate with custom generation config parameters, e.g. 
config for grouped beam search: @@ -50,7 +64,7 @@ Calling generate with custom generation config parameters, e.g. config for group import openvino_genai as ov_genai pipe = ov_genai.LLMPipeline(model_path, "CPU") -result = pipe.generate("The Sun is yellow because", max_new_tokens=30, num_groups=3, group_size=5, diversity_penalty=1.5) +result = pipe.generate("The Sun is yellow because", max_new_tokens=100, num_beam_groups=3, num_beams=15, diversity_penalty=1.5) print(result) ``` @@ -64,7 +78,7 @@ A simple chat in Python: import openvino_genai as ov_genai pipe = ov_genai.LLMPipeline(model_path) -config = {'max_new_tokens': 100, 'num_groups': 3, 'group_size': 5, 'diversity_penalty': 1.5} +config = {'max_new_tokens': 100, 'num_beam_groups': 3, 'num_beams': 15, 'diversity_penalty': 1.5} pipe.set_generation_config(config) pipe.start_chat() @@ -73,7 +87,7 @@ while True: Β  Β  prompt = input() if prompt == 'Stop!': Β  Β  Β  Β  break -Β  Β  print(pipe(prompt)) +Β  Β  print(pipe(prompt, max_new_tokens=200)) pipe.finish_chat() ``` @@ -89,7 +103,7 @@ A simple example: int main(int argc, char* argv[]) { std::string model_path = argv[1]; ov::genai::LLMPipeline pipe(model_path, "CPU"); - std::cout << pipe.generate("The Sun is yellow because"); + std::cout << pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(256)); } ``` @@ -104,8 +118,8 @@ int main(int argc, char* argv[]) { ov::genai::GenerationConfig config; config.max_new_tokens = 256; - config.num_groups = 3; - config.group_size = 5; + config.num_beam_groups = 3; + config.num_beams = 15; config.diversity_penalty = 1.0f; std::cout << pipe.generate("The Sun is yellow because", config); @@ -125,8 +139,8 @@ int main(int argc, char* argv[]) { ov::genai::GenerationConfig config; config.max_new_tokens = 100; - config.num_groups = 3; - config.group_size = 5; + config.num_beam_groups = 3; + config.num_beams = 15; config.diversity_penalty = 1.0f; pipe.start_chat(); @@ -155,11 +169,11 @@ int main(int argc, char* argv[]) 
{ auto streamer = [](std::string word) { std::cout << word << std::flush; - // Return flag correspods whether generation should be stopped. + // Return flag corresponds whether generation should be stopped. // false means continue generation. return false; }; - std::cout << pipe.generate("The Sun is yellow bacause", streamer); + std::cout << pipe.generate("The Sun is yellow because", ov::genai::streamer(streamer), ov::genai::max_new_tokens(200)); } ``` @@ -192,14 +206,105 @@ int main(int argc, char* argv[]) { std::string model_path = argv[1]; ov::genai::LLMPipeline pipe(model_path, "CPU"); - std::cout << pipe.generate("The Sun is yellow because", custom_streamer); + std::cout << pipe.generate("The Sun is yellow because", ov::genai::streamer(custom_streamer), ov::genai::max_new_tokens(200)); } ``` +### Performance Metrics + +`openvino_genai.PerfMetrics` (referred to as `PerfMetrics` for simplicity) is a structure that holds performance metrics for each generate call. `PerfMetrics` holds fields with mean and standard deviation values for the following metrics: +- Time To the First Token (TTFT), ms +- Time per Output Token (TPOT), ms/token +- Generate total duration, ms +- Tokenization duration, ms +- Detokenization duration, ms +- Throughput, tokens/s + +and: +- Load time, ms +- Number of generated tokens +- Number of tokens in the input prompt + +Performance metrics are stored in the `perf_metrics` field of either `DecodedResults` or `EncodedResults`. In addition to the fields mentioned above, `PerfMetrics` has a member `raw_metrics` of type `openvino_genai.RawPerfMetrics` (referred to as `RawPerfMetrics` for simplicity) that contains raw values for the durations of each batch of new token generation, tokenization durations, detokenization durations, and more. These raw metrics are accessible if you wish to calculate your own statistical values such as median or percentiles.
However, since mean and standard deviation values are usually sufficient, we will focus on `PerfMetrics`. + +```python +import openvino_genai as ov_genai +pipe = ov_genai.LLMPipeline(model_path, "CPU") +result = pipe.generate(["The Sun is yellow because"], max_new_tokens=20) +perf_metrics = result.perf_metrics + +print(f'Generate duration: {perf_metrics.get_generate_duration().mean:.2f} ms') +print(f'TTFT: {perf_metrics.get_ttft().mean:.2f} ms') +print(f'TPOT: {perf_metrics.get_tpot().mean:.2f} ms/token') +print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s') +``` + +```cpp +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + +int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + auto result = pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(20)); + auto perf_metrics = result.perf_metrics; + + std::cout << std::fixed << std::setprecision(2); + std::cout << "Generate duration: " << perf_metrics.get_generate_duration().mean << " ms" << std::endl; + std::cout << "TTFT: " << perf_metrics.get_ttft().mean << " ms" << std::endl; + std::cout << "TPOT: " << perf_metrics.get_tpot().mean << " ms/token " << std::endl; + std::cout << "Throughput: " << perf_metrics.get_throughput().mean << " tokens/s" << std::endl; +} +``` +output: +```sh +mean_generate_duration: 76.28 +mean_ttft: 42.58 +mean_tpot: 3.80 +``` + +>**Note**: If the input prompt is just a string, the generate function returns only a string without perf_metrics. To obtain perf_metrics, provide the prompt as a list with at least one element or call generate with encoded inputs. + +Several `perf_metrics` can be added to each other. In that case `raw_metrics` are concatenated and mean/std values are recalculated. 
This accumulates statistics from several `generate()` calls. + +```cpp +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + +int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + auto result_1 = pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(20)); + auto result_2 = pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(20)); + auto perf_metrics = result_1.perf_metrics + result_2.perf_metrics; + + std::cout << std::fixed << std::setprecision(2); + std::cout << "Generate duration: " << perf_metrics.get_generate_duration().mean << " ms" << std::endl; + std::cout << "TTFT: " << perf_metrics.get_ttft().mean << " ms" << std::endl; + std::cout << "TPOT: " << perf_metrics.get_tpot().mean << " ms/token " << std::endl; + std::cout << "Throughput: " << perf_metrics.get_throughput().mean << " tokens/s" << std::endl; +} +``` + +```python +import openvino_genai as ov_genai +pipe = ov_genai.LLMPipeline(model_path, "CPU") +res_1 = pipe.generate(["The Sun is yellow because"], max_new_tokens=20) +res_2 = pipe.generate(["The sky is blue because"], max_new_tokens=20) +perf_metrics = res_1.perf_metrics + res_2.perf_metrics + +print(f'Generate duration: {perf_metrics.get_generate_duration().mean:.2f} ms') +print(f'TTFT: {perf_metrics.get_ttft().mean:.2f} ms') +print(f'TPOT: {perf_metrics.get_tpot().mean:.2f} ms/token') +print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s') +``` + +For more examples of how metrics are used, please refer to the Python [benchmark_genai.py](../samples/python/benchmark_genai/README.md) and C++ [benchmark_genai](../samples/cpp/benchmark_genai/README.md) samples. + ## How It Works -For information on how OpenVINO™ GenAI works, refer to the [How It Works Section](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/2/src/docs/HOW_IT_WORKS.md). 
+For information on how OpenVINO™ GenAI works, refer to the [How It Works Section](./docs/HOW_IT_WORKS.md). ## Supported Models -For a list of supported models, refer to the [Supported Models Section](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/2/src/docs/SUPPORTED_MODELS.md). +For a list of supported models, refer to the [Supported Models Section](./docs/SUPPORTED_MODELS.md). diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index 524ddad425..33db207518 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -58,7 +58,7 @@ set_target_properties(${TARGET_NAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" ) -# Extract two last digits from OpenVINOGenAI_PROJECT_VERSION_MAJOR because SOVERSION can only contain up to 4 symbols. +# Extract two last digits from OpenVINOGenAI_VERSION_MAJOR because SOVERSION can only contain up to 4 symbols. string(REGEX MATCH [=[[0-9][0-9]$]=] MAJOR_SUFFIX ${OpenVINOGenAI_VERSION_MAJOR}) if(DEFINED PY_BUILD_CMAKE_PACKAGE_NAME AND LINUX) # Don't pack symlinks but append version hash to the name for wheel @@ -69,7 +69,7 @@ elseif(DEFINED PY_BUILD_CMAKE_PACKAGE_NAME AND APPELE) SUFFIX .${MAJOR_SUFFIX}${OpenVINOGenAI_VERSION_MINOR}${OpenVINOGenAI_VERSION_PATCH}${CMAKE_SHARED_LIBRARY_SUFFIX}) else() set_target_properties(${TARGET_NAME} PROPERTIES - VERSION ${OpenVINOGenAI_PROJECT_VERSION} + VERSION ${OpenVINOGenAI_VERSION} SOVERSION ${MAJOR_SUFFIX}${OpenVINOGenAI_VERSION_MINOR}${OpenVINOGenAI_VERSION_PATCH}) endif() @@ -105,7 +105,8 @@ install(TARGETS ${TARGET_NAME} EXPORT OpenVINOGenAITargets install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION runtime/include COMPONENT core_genai_dev) install(EXPORT OpenVINOGenAITargets FILE OpenVINOGenAITargets.cmake - NAMESPACE openvino:: DESTINATION runtime/cmake) + NAMESPACE openvino:: DESTINATION runtime/cmake + COMPONENT core_genai_dev) 
include(CMakePackageConfigHelpers) configure_package_config_file("${OpenVINOGenAI_SOURCE_DIR}/cmake/templates/OpenVINOGenAIConfig.cmake.in" diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp new file mode 100644 index 0000000000..626a51c5da --- /dev/null +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -0,0 +1,84 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include + +#include "openvino/genai/scheduler_config.hpp" +#include "openvino/genai/tokenizer.hpp" +#include "openvino/genai/generation_config.hpp" +#include "openvino/genai/generation_handle.hpp" +#include "openvino/genai/llm_pipeline.hpp" +#include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/visibility.hpp" + +namespace ov::genai { +struct PipelineMetrics { + // All requests as viewed by the pipeline + size_t requests = 0; + // Requests scheduled for processing + size_t scheduled_requests = 0; + // Percentage of KV cache usage + float cache_usage = 0.0; +}; + +class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { + class Impl; + std::shared_ptr m_impl; + +public: + ContinuousBatchingPipeline(const std::string& models_path, + const SchedulerConfig& scheduler_config, + const std::string& device = "CPU", + const ov::AnyMap& llm_plugin_config = {}, + const ov::AnyMap& tokenizer_plugin_config = {}); + + /** + * @brief Constructs a ContinuousBatchingPipeline when ov::genai::Tokenizer is initialized manually using file from the different dirs. 
+ * + * @param model_path Path to the dir with model, tokenizer .xml/.bin files, and generation_configs.json + * @param scheduler_config + * @param tokenizer manually initialized ov::genai::Tokenizer + * @param device optional device + * @param plugin_config optional plugin_config + */ + ContinuousBatchingPipeline( + const std::string& model_path, + const ov::genai::Tokenizer& tokenizer, + const SchedulerConfig& scheduler_config, + const std::string& device="CPU", + const ov::AnyMap& plugin_config={} + ); + + ov::genai::Tokenizer get_tokenizer(); + + ov::genai::GenerationConfig get_config() const; + + PipelineMetrics get_metrics() const; + + GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, const ov::genai::GenerationConfig& sampling_params); + GenerationHandle add_request(uint64_t request_id, const std::string& prompt, const ov::genai::GenerationConfig& sampling_params); + + void step(); + + bool has_non_finished_requests(); + + // more high level interface, which can process multiple prompts in continuous batching manner + std::vector generate(const std::vector& input_ids, const std::vector& sampling_params, const ov::genai::StreamerVariant& streamer=std::monostate{}); + std::vector generate(const std::vector& prompts, const std::vector& sampling_params, const ov::genai::StreamerVariant& streamer=std::monostate{}); + + /** + * @brief start chat with keeping history in kv cache. + * + * @param system_message optional system message. + */ + void start_chat(const std::string& system_message = ""); + + /** + * @brief finish chat and clear kv cache. 
+ */ + void finish_chat(); +}; +} diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 99a461deda..c74349fd4f 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -33,6 +33,7 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER }; * @param max_new_tokens the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. * @param ignore_eos if set to true, then generation will not stop even if token is met. * @param eos_token_id token_id of (end of sentence) + * @param min_new_tokens set 0 probability for eos_token_id for the first min_new_tokens generated tokens. Ignored for non continuous batching. * * Beam search specific parameters: * @param num_beams number of beams for beam search. 1 disables beam search. @@ -56,6 +57,9 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER }; * @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering. * @param do_sample whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. + * @param presence_penalty reduces absolute log prob if the token was generated at least once. Ignored for non continuous batching. + * @param frequency_penalty reduces absolute log prob as many times as the token was generated. Ignored for non continuous batching. + * @param rng_seed initializes random generator. Ignored for non continuous batching. 
*/ class OPENVINO_GENAI_EXPORTS GenerationConfig { public: @@ -66,6 +70,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { size_t max_new_tokens = SIZE_MAX; size_t max_length = SIZE_MAX; bool ignore_eos = false; + size_t min_new_tokens = 0; // Beam search specific size_t num_beam_groups = 1; @@ -79,13 +84,20 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { // Multinomial float temperature = 1.0f; float top_p = 1.0f; - size_t top_k = 50; + size_t top_k = std::numeric_limits::max(); bool do_sample = false; float repetition_penalty = 1.0f; + float presence_penalty = 0.0; + float frequency_penalty = 0.0f; + size_t rng_seed = 0; // EOS special token int64_t eos_token_id = -1; + /** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0. + * Otherwise verifies eos_token_id == tokenizer_eos_token_id. + */ + void set_eos_token_id(size_t tokenizer_eos_token_id); size_t get_max_new_tokens(size_t prompt_length = 0) const; bool is_greedy_decoding() const; bool is_beam_search() const; @@ -110,6 +122,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { static constexpr ov::Property max_new_tokens{"max_new_tokens"}; static constexpr ov::Property max_length{"max_length"}; static constexpr ov::Property ignore_eos{"ignore_eos"}; +static constexpr ov::Property min_new_tokens{"min_new_tokens"}; static constexpr ov::Property num_beam_groups{"num_beam_groups"}; static constexpr ov::Property num_beams{"num_beams"}; @@ -125,6 +138,13 @@ static constexpr ov::Property top_k{"top_k"}; static constexpr ov::Property do_sample{"do_sample"}; static constexpr ov::Property repetition_penalty{"repetition_penalty"}; static constexpr ov::Property eos_token_id{"eos_token_id"}; +static constexpr ov::Property presence_penalty{"presence_penalty"}; +static constexpr ov::Property frequency_penalty{"frequency_penalty"}; +static constexpr ov::Property rng_seed{"rng_seed"}; +// Predefined Configs +OPENVINO_GENAI_EXPORTS GenerationConfig beam_search(); +OPENVINO_GENAI_EXPORTS 
GenerationConfig greedy(); +OPENVINO_GENAI_EXPORTS GenerationConfig multinomial(); } // namespace genai } // namespace ov diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/include/generation_handle.hpp b/src/cpp/include/openvino/genai/generation_handle.hpp similarity index 70% rename from text_generation/causal_lm/cpp/continuous_batching/library/include/generation_handle.hpp rename to src/cpp/include/openvino/genai/generation_handle.hpp index 63d40ca935..8d00ae0e9b 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/include/generation_handle.hpp +++ b/src/cpp/include/openvino/genai/generation_handle.hpp @@ -6,9 +6,10 @@ #include #include -#include "generation_config.hpp" - +#include "openvino/genai/generation_config.hpp" +#include "openvino/genai/visibility.hpp" +namespace ov::genai { enum class GenerationStatus { RUNNING = 0, // Default status for ongoing generation FINISHED = 1, // Status set when generation has been finished @@ -17,6 +18,20 @@ enum class GenerationStatus { DROPPED_BY_HANDLE = 4 // Status set when generation handle is dropped }; +struct EncodedGenerationResult { + // request ID - obsolete when handle API is approved as handle will connect results with prompts. + uint64_t m_request_id; + + // in a generic case we have multiple generation results per initial prompt + // depending on sampling parameters (e.g. beam search or parallel sampling) + std::vector> m_generation_ids; + // scores + std::vector m_scores; + + // Status of generation + GenerationStatus m_status = GenerationStatus::RUNNING; +}; + struct GenerationResult { // request ID - obsolete when handle API is approved as handle will connect results with prompts. 
uint64_t m_request_id; @@ -40,12 +55,12 @@ using GenerationOutputs = std::unordered_map; class GenerationStream; -class GenerationHandleImpl { +class OPENVINO_GENAI_EXPORTS GenerationHandleImpl { std::shared_ptr m_generation_stream; - GenerationConfig m_sampling_params; + ov::genai::GenerationConfig m_sampling_params; public: - GenerationHandleImpl(std::shared_ptr generation_stream, const GenerationConfig& sampling_params) : + GenerationHandleImpl(std::shared_ptr generation_stream, const ov::genai::GenerationConfig& sampling_params) : m_generation_stream(generation_stream), m_sampling_params(sampling_params) {}; @@ -59,6 +74,7 @@ class GenerationHandleImpl { bool can_read(); + GenerationOutputs back(); // Reads result of a generation for single iteration GenerationOutputs read(); // Reads all generated tokens for all sequences @@ -66,3 +82,4 @@ class GenerationHandleImpl { }; using GenerationHandle = std::unique_ptr; +} diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index 034f2e7433..4be298128e 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -5,16 +5,18 @@ #include #include +#include #include "openvino/core/any.hpp" #include "openvino/genai/generation_config.hpp" #include "openvino/genai/tokenizer.hpp" #include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/perf_metrics.hpp" namespace ov { namespace genai { -// Return flag correspods whether generation should be stopped: false means continue generation, true means stop. +// Return flag corresponds whether generation should be stopped: false means continue generation, true means stop. 
using StreamerVariant = std::variant, std::shared_ptr, std::monostate>; using OptionalGenerationConfig = std::optional; using EncodedInputs = std::variant; @@ -29,11 +31,13 @@ using StringInputs = std::variant>; * * @param tokens sequence of resulting tokens * @param scores sum of logarithmic probabilities of all tokens in the sequence +* @param metrics performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics */ class EncodedResults { public: std::vector> tokens; std::vector scores; + PerfMetrics perf_metrics; }; /** @@ -42,11 +46,13 @@ class EncodedResults { * * @param texts vector of resulting sequences * @param scores scores for each sequence +* @param metrics performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics */ class DecodedResults { public: std::vector texts; std::vector scores; + PerfMetrics perf_metrics; // @brief Convert DecodedResults to a string. operator std::string() const { @@ -82,6 +88,8 @@ class DecodedResults { } }; +class LLMPipelineImplBase; + /** * @brief This class is used for generation with LLMs. */ @@ -114,10 +122,10 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { ); /** - * @brief Constructs a LLMPipeline when ov::Tokenizer is initialized manually using file from the different dirs. + * @brief Constructs a LLMPipeline when ov::genai::Tokenizer is initialized manually using file from the different dirs. * * @param model_path Path to the dir with model, tokenizer .xml/.bin files, and generation_configs.json - * @param tokenizer manually initialized ov::Tokenizer + * @param tokenizer manually initialized ov::genai::Tokenizer * @param device optional device * @param plugin_config optional plugin_config */ @@ -213,11 +221,23 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { GenerationConfig get_generation_config() const; void set_generation_config(const GenerationConfig& config); - void start_chat(); + + /** + * @brief start chat with keeping history in kv cache. 
+ * Turns on keeping KV cache between generate calls and automatic applying of chat templates. + * If beam search is used, KV cache is kept for the generated sequence with maximal scores. + * + * @param system_message optional system message. + */ + void start_chat(const std::string& system_message = ""); + + /** + * @brief finish chat and clear kv cache. + * Turns off keeping KV cache between generate calls. + */ void finish_chat(); private: - class LLMPipelineImpl; - std::unique_ptr m_pimpl; + std::unique_ptr m_pimpl; }; std::pair streamer(StreamerVariant func); diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp new file mode 100644 index 0000000000..ad53d8d941 --- /dev/null +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -0,0 +1,149 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include "openvino/genai/visibility.hpp" +#include +#include +#include + +namespace ov { +namespace genai { + +using TimePoint = std::chrono::steady_clock::time_point; +using MicroSeconds = std::chrono::duration>; + +/** + * @brief Structure with raw performance metrics for each generation before any statistics are calculated. + * + * @param generate_durations Durations for each generate call in microseconds. + * @param tokenization_durations Durations for the tokenization process in microseconds. + * @param detokenization_durations Durations for the detokenization process in microseconds. + * @param m_times_to_first_token Times to the first token for each call in microseconds. + * @param m_new_token_times Time points for each new token generated. + * @param m_batch_sizes Batch sizes for each generate call. + * @param m_durations Total durations for each generate call in microseconds. + * @param num_generated_tokens Total number of tokens generated. + * @param num_input_tokens Total number of tokens in the input prompt. 
+ */ +struct OPENVINO_GENAI_EXPORTS RawPerfMetrics { + std::vector generate_durations; + std::vector tokenization_durations; + std::vector detokenization_durations; + + std::vector m_times_to_first_token; + std::vector m_new_token_times; + std::vector m_batch_sizes; + std::vector m_durations; + + size_t num_generated_tokens; + size_t num_input_tokens; +}; + +/** +* @brief Structure to store mean and standard deviation values. +*/ +struct OPENVINO_GENAI_EXPORTS MeanStdPair { + float mean; + float std; +}; + +/** + * @brief Holds performance metrics for each generate call. + * + * PerfMetrics holds fields with mean and standard deviations for the following metrics: + * - Time To the First Token (TTFT), ms + * - Time per Output Token (TPOT), ms/token + * - Generate total duration, ms + * - Tokenization duration, ms + * - Detokenization duration, ms + * - Throughput, tokens/s + * + * Additional fields include: + * - Load time, ms + * - Number of generated tokens + * - Number of tokens in the input prompt + * + * Preferable way to access values is via get functions. Getters calculate mean and std values from raw_metrics and return pairs. + * If mean and std were already calculated getters return cached values. + * @param get_load_time Returns the load time in milliseconds. + * @param get_num_generated_tokens Returns the number of generated tokens. + * @param get_num_input_tokens Returns the number of tokens in the input prompt. + * @param get_ttft Returns the mean and standard deviation of TTFT. + * @param get_tpot Returns the mean and standard deviation of TPOT. + * @param get_throughput Returns the mean and standard deviation of throughput. + * @param get_generate_duration Returns the mean and standard deviation of generate duration. + * @param get_tokenization_duration Returns the mean and standard deviation of tokenization duration. + * @param get_detokenization_duration Returns the mean and standard deviation of detokenization duration. 
+ * @param get_microsec Converts a duration to microseconds. + * @param m_evaluated Flag indicating if raw metrics were evaluated. + * If false, current mean/std TTFT, TPOT, etc. are not actual and evaluate_statistics() should recalculate them. + * @param evaluate_statistics Calculates mean and standard deviation values from raw_metrics. + * Optional start_time can be provided to update durations. + * @param operator+ Adds two PerfMetrics objects. + * @param operator+= Adds and assigns the right-hand PerfMetrics to the current object. + * @param raw_metrics A structure of RawPerfMetrics type that holds raw metrics. + * @param load_time Load time in milliseconds. + * + * Cached mean and standard deviations. + * @param ttft Mean and standard deviation of Time to the First Token (TTFT) in milliseconds. + * @param tpot Mean and standard deviation of Time per Output Token (TPOT) in milliseconds per token. + * @param throughput Mean and standard deviation of tokens per second. + * @param generate_duration Mean and standard deviation of the total duration of generate calls in milliseconds. + * @param tokenization_duration Mean and standard deviation of the tokenization duration in milliseconds. + * @param detokenization_duration Mean and standard deviation of the detokenization duration in milliseconds. + * @param num_generated_tokens Number of generated tokens. + * @param num_input_tokens Number of tokens in the input prompt. + */ +struct OPENVINO_GENAI_EXPORTS PerfMetrics { + float load_time; // Load time in ms. + MeanStdPair ttft; // Time to the first token (in ms) (TTFT). + MeanStdPair tpot; // Time (in ms) per output token (TPOT). + MeanStdPair throughput; // Tokens per second. + + MeanStdPair generate_duration; + MeanStdPair tokenization_duration = {-1, -1}; + MeanStdPair detokenization_duration = {-1, -1}; + + size_t num_generated_tokens; + size_t num_input_tokens; + + float get_load_time(); // Load time in ms. 
+ float get_num_generated_tokens(); + float get_num_input_tokens(); + MeanStdPair get_ttft(); // Time to the first token (in ms) (TTFT). + MeanStdPair get_tpot(); // Time (in ms) per output token (TPOT). + MeanStdPair get_throughput(); // Tokens per second. + + MeanStdPair get_generate_duration(); + MeanStdPair get_tokenization_duration(); + MeanStdPair get_detokenization_duration(); + + // Flag indicating if raw metrics were evaluated. + // If false means current mean/std ttft, tpot, etc. are not actual + // and evaluate_statistics() should recalculate them. + bool m_evaluated = false; + + /** + * @brief calculates mean/std values from raw_metrics. + * + * @param start_time optional start_time in case if duration needs to be updated. + */ + void evaluate_statistics(std::optional start_time = std::nullopt); + + /** + * @brief convert duration to microseconds + * + * @param duration duration in + */ + static float get_microsec(std::chrono::steady_clock::duration duration); + PerfMetrics operator+(const PerfMetrics& metrics) const; + PerfMetrics& operator+=(const PerfMetrics& right); + + RawPerfMetrics raw_metrics; +}; + +} // namespace genai +} // namespace ov diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/include/scheduler_config.hpp b/src/cpp/include/openvino/genai/scheduler_config.hpp similarity index 61% rename from text_generation/causal_lm/cpp/continuous_batching/library/include/scheduler_config.hpp rename to src/cpp/include/openvino/genai/scheduler_config.hpp index d468a84460..aca823fa63 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/include/scheduler_config.hpp +++ b/src/cpp/include/openvino/genai/scheduler_config.hpp @@ -5,6 +5,7 @@ #include +namespace ov::genai { struct SchedulerConfig { // a maximum number of tokens to batch // (in constrast to max_batch_size which combines independent sequences, we consider total amount of tokens in a batch) @@ -15,7 +16,7 @@ struct SchedulerConfig { std::size_t 
num_kv_blocks = 0; // total size of KV cache in GB - std::size_t cache_size = 0; + std::size_t cache_size = 1; // block size for KV cache std::size_t block_size = 32; @@ -29,4 +30,13 @@ struct SchedulerConfig { // max number of scheduled sequences (you can think of it as "max batch size") std::size_t max_num_seqs = 256; + + // Enable caching of KV-blocks. + // When turned on all previously calculated KV-caches are kept in memory for future usages. + // KV-caches can be rewritten if KV-cache limit is reached, but blocks are not released. + // This results in more RAM usage, maximum RAM usage is determined by cache_size or num_kv_blocks parameters. + // When turned off only KV-cache required for batch calculation is kept in memory and + // when a sequence has finished generation its cache is released. + bool enable_prefix_caching = false; }; +} diff --git a/src/cpp/include/openvino/genai/streamer_base.hpp b/src/cpp/include/openvino/genai/streamer_base.hpp index 04d350cc5d..dc42f047f9 100644 --- a/src/cpp/include/openvino/genai/streamer_base.hpp +++ b/src/cpp/include/openvino/genai/streamer_base.hpp @@ -21,6 +21,8 @@ class StreamerBase { /// @brief end is called at the end of generation. It can be used to flush cache if your own streamer has one virtual void end() = 0; + + virtual ~StreamerBase() = default; }; diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index a9f3e112b8..425c30128b 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -26,10 +26,10 @@ struct TokenizedInputs { class OPENVINO_GENAI_EXPORTS Tokenizer { public: /** - * @brief ov::Tokenizer constructor. + * @brief ov::genai::Tokenizer constructor. 
* @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path */ - Tokenizer(const std::string& tokenizer_path); + Tokenizer(const std::string& tokenizer_path, const ov::AnyMap& plugin_config = {}); /** * @brief encode a single prompt @@ -79,7 +79,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { * @return A string with the transformed and concatenated prompts from the chat history. * @throws Exception if the chat template was unable to parse the input history. */ - std::string apply_chat_template(const ChatHistory& history, + std::string apply_chat_template(ChatHistory history, bool add_generation_prompt, const std::string& chat_template="") const; diff --git a/src/cpp/src/block_manager.hpp b/src/cpp/src/block_manager.hpp new file mode 100644 index 0000000000..d9815610c5 --- /dev/null +++ b/src/cpp/src/block_manager.hpp @@ -0,0 +1,577 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include + +#include "sequence_group.hpp" + +namespace ov::genai { +class KVCacheBlock { + int m_ref_count; + int m_index; + size_t m_hash; + size_t m_num_hashed_tokens; + std::chrono::time_point m_timestamp; +public: + using Ptr = std::shared_ptr; + using CPtr = std::shared_ptr; + + explicit KVCacheBlock(int index) + : m_ref_count(0), + m_index(index), + m_timestamp(std::chrono::system_clock::now()) { } + + int get_index() const { + return m_index; + } + + bool is_free() const { + return m_ref_count == 0; + } + + void increment() { + ++m_ref_count; + } + + void release() { + OPENVINO_ASSERT(m_ref_count > 0); + --m_ref_count; + } + + bool copy_on_write() const { + return m_ref_count > 1; + } + + int get_references_count() const { + return m_ref_count; + } + + size_t get_hash() const { + return m_hash; + } + + size_t get_num_hashed_tokens() const { + return m_num_hashed_tokens; + } + + void set_hash(size_t hash, size_t num_hashed_tokens) { + m_hash = 
hash; + m_num_hashed_tokens = num_hashed_tokens; + } + + void set_timestamp(const std::chrono::time_point& timestamp) { + m_timestamp = timestamp; + } + + std::chrono::time_point get_timestamp() { + return m_timestamp; + } +}; + + +class Evictor { + std::map blocks; +public: + void add(size_t hash, KVCacheBlock::Ptr block) { + blocks[hash] = block; + } + + static bool block_is_less(const std::pair& lhs, const std::pair& rhs) { + return lhs.second->get_timestamp() < rhs.second->get_timestamp(); + } + + KVCacheBlock::Ptr get_block(size_t hash) { + if (blocks.find(hash)== blocks.end()) + { + return nullptr; + } + KVCacheBlock::Ptr block = blocks[hash]; + block->set_timestamp(std::chrono::system_clock::now()); + block->increment(); + blocks.erase(hash); + return block; + } + + KVCacheBlock::Ptr get_lru_block() { + if (!blocks.size()) { + return nullptr; + } + auto hash_block = std::min_element(std::begin(blocks), std::end(blocks), block_is_less); + auto block = hash_block->second; + block->set_timestamp(std::chrono::system_clock::now()); + block->increment(); + blocks.erase(hash_block->first); + return block; + } + + size_t num_blocks() const { + return blocks.size(); + } +}; + + +class BlockAllocator { + std::list m_free_blocks; + ov::genai::Evictor m_evictor; + int m_total_num_blocks; + bool m_enable_prefix_caching; +public: + BlockAllocator(int num_blocks, bool enable_prefix_caching) : + m_total_num_blocks(num_blocks), m_enable_prefix_caching(enable_prefix_caching) { + for (int block_id = 0; block_id < m_total_num_blocks; ++block_id) { + m_free_blocks.push_back(std::make_shared(block_id)); + } + } + + ~BlockAllocator() { + // sanity check to validate that all blocks are freed + // OPENVINO_ASSERT(m_total_num_blocks == m_free_blocks.size()); + } + + size_t num_free_blocks() const { + return m_free_blocks.size() + m_evictor.num_blocks(); + } + + bool can_allocate_blocks(size_t num_blocks) const { + return num_blocks <= num_free_blocks(); + } + + void 
free(KVCacheBlock::Ptr block) { + block->release(); + if (block->is_free()) { + if (m_enable_prefix_caching) + { + m_evictor.add(block->get_hash(), block); + } + else { + m_free_blocks.push_back(block); + } + } + } + + KVCacheBlock::Ptr allocate_block() { + OPENVINO_ASSERT(!m_enable_prefix_caching); + OPENVINO_ASSERT(can_allocate_blocks(1)); + KVCacheBlock::Ptr allocated_block = m_free_blocks.front(); + allocated_block->increment(); + m_free_blocks.pop_front(); + return allocated_block; + } + + KVCacheBlock::Ptr allocate_block(size_t hash, size_t num_hashed_tokens, std::map& cached_blocks) { + OPENVINO_ASSERT(m_enable_prefix_caching); + OPENVINO_ASSERT(can_allocate_blocks(1)); + auto block = m_evictor.get_block(hash); + if (block != nullptr) { + // use cached block from evictor + cached_blocks[hash] = block; + return block; + } + // TODO: Currently we cache all allocated blocks which might be redundant for beam search, + // where blocks of non-used candidates are not needed in cache. + // This part can be improved if we cache only blocks for prompt. 
+ if (cached_blocks.find(hash) != cached_blocks.end()) { + // use cashed block from cached_blocks + block = cached_blocks[hash]; + cached_blocks[hash]->increment(); + return block; + } + if (m_free_blocks.size() > 0) { + // allocate new empty block + KVCacheBlock::Ptr allocated_block = m_free_blocks.front(); + allocated_block->increment(); + allocated_block->set_hash(hash, num_hashed_tokens); + cached_blocks[hash] = allocated_block; + + m_free_blocks.pop_front(); + return allocated_block; + } + if (m_evictor.num_blocks() > 0) { + // get least resently used block from evictor and reuse it + KVCacheBlock::Ptr block = m_evictor.get_lru_block(); + cached_blocks.erase(block->get_hash()); + + // update block with new hash + block->set_hash(hash, num_hashed_tokens); + cached_blocks[hash] = block; + return block; + } + // out of memory + return nullptr; + } + + KVCacheBlock::Ptr get_cached_block(size_t hash, std::map& cached_blocks) { + auto block = m_evictor.get_block(hash); + if (block != nullptr) { + // use cashed block from evictor + cached_blocks[hash] = block; + return block; + } + if (cached_blocks.find(hash) != cached_blocks.end()) { + // use cashed block from cached_blocks + // TODO: add tokens validation in case of hash collision + block = cached_blocks[hash]; + cached_blocks[hash]->increment(); + return block; + } + return nullptr; + } + + float get_used_percentage() const { + return static_cast(m_total_num_blocks - num_free_blocks()) / m_total_num_blocks; + } +}; + +class BlockManager { + BlockAllocator m_allocator; + bool m_enable_prefix_caching; + size_t m_block_size; + // TODO: caching time can probably be improved if we use the prefix tree + std::map cached_blocks; + + // stores blocks for each sequence (not sequence group) + // the same block can be seen in multiple block_tables for different sequences + std::map> m_block_table; +public: + BlockManager(int num_blocks, bool enable_prefix_caching, size_t block_size) + : m_allocator(num_blocks, 
enable_prefix_caching), m_enable_prefix_caching(enable_prefix_caching), m_block_size(block_size) { } + + ~BlockManager() { + // sanity check that all sequences are freed + // OPENVINO_ASSERT(m_block_table.empty()); + } + + const std::vector& get_block_table(uint64_t seq_id) { + OPENVINO_ASSERT(m_block_table.count(seq_id) == 1); + return m_block_table[seq_id]; + } + + const size_t free_rightest_blocks(SequenceGroup::Ptr sequence_group) { + size_t blocks_released = 0; + auto running_sequences = sequence_group->get_not_finished_sequences(); + std::set blocks_released_indices; + for (size_t idx = 0; idx < running_sequences.size(); ++idx) { + auto seq_id = running_sequences[idx]->get_id(); + OPENVINO_ASSERT(m_block_table.count(seq_id) > 0, "Invalid sequence group."); + auto block_table = m_block_table[seq_id]; + if (free_last_block(seq_id)) { + blocks_released++; + } + } + return blocks_released; + } + + const bool free_group_partially_multiple_runnning_sequence(SequenceGroup::Ptr sequence_group, size_t num_required_blocks, size_t& phisical_blocks_released, size_t& logical_blocks_released) { + phisical_blocks_released = 0; + logical_blocks_released = 0; + while (num_required_blocks > phisical_blocks_released) { + size_t released_count = free_rightest_blocks(sequence_group); + logical_blocks_released += 1; + if (get_number_of_blocks_occupied_by_sequence(sequence_group) == 0) { + break; + } + phisical_blocks_released += released_count; + } + return num_required_blocks <= phisical_blocks_released; + } + + const bool free_group_partially_single_runnning_sequence(SequenceGroup::Ptr sequence_group, size_t num_required_blocks, size_t& phisical_blocks_released) { + auto sequences = sequence_group->get_not_finished_sequences(); + OPENVINO_ASSERT(sequences.size() == 1); + auto running_sequence = sequences[0]; + auto seq_id = running_sequence->get_id(); + if (!has_block_table(seq_id)) { + // no blocks are allocated for this sequence, so it can't be preempted + return false; + } + 
auto block_table = get_block_table(seq_id); + auto prev_blocks_count = num_free_blocks(); + free_sequence_partially_single_runnning_sequence(seq_id, num_required_blocks); + + // calculate the number of released blocks + phisical_blocks_released = num_free_blocks() - prev_blocks_count; + + return num_required_blocks <= phisical_blocks_released; + } + + const size_t get_number_of_blocks_occupied_by_sequence(SequenceGroup::Ptr sequence_group) { + auto running_sequences = sequence_group->get_not_finished_sequences(); + size_t num_blocks = 0; + std::set indices; + for (size_t idx = 0; idx < running_sequences.size(); ++idx) { + auto seq_id = running_sequences[idx]->get_id(); + if (m_block_table.count(seq_id) == 0) { + continue; + } + // OPENVINO_ASSERT(m_block_table.count(seq_id) > 0, "Invalid sequence group."); + auto block_table = m_block_table[seq_id]; + size_t last_idx = block_table.back()->get_index(); + if (indices.find(last_idx) != indices.end()) { + continue; + } + else { + indices.insert(last_idx); + num_blocks += block_table.size(); + } + } + return num_blocks; + } + + const bool has_block_table(uint64_t seq_id) { + return m_block_table.count(seq_id) > 0; + } + + size_t num_free_blocks() const { + return m_allocator.num_free_blocks(); + } + + bool can_allocate_blocks(size_t num_blocks) const { + return m_allocator.can_allocate_blocks(num_blocks); + } + + void allocate(ov::genai::Sequence::CPtr sequence, size_t num_blocks, const ov::genai::TokenIds& prompt_ids = {}) { + OPENVINO_ASSERT(num_blocks > 0 && can_allocate_blocks(num_blocks)); + if (m_enable_prefix_caching) { + OPENVINO_ASSERT(prompt_ids.size() > 0, "prompt_ids should be set for hash calculation."); + } + auto sequence_id = sequence->get_id(); + auto block_table = m_block_table[sequence_id]; + auto content_length = sequence->get_generated_len() + prompt_ids.size(); + size_t num_hashed_tokens = block_table.size() * m_block_size; + + for (size_t i = 0; i < num_blocks; ++i) { + + 
ov::genai::KVCacheBlock::Ptr block = nullptr; + if (m_enable_prefix_caching) { + num_hashed_tokens += m_block_size; + if (num_hashed_tokens > content_length) { + num_hashed_tokens = content_length; + } + auto hash = sequence->get_hash(num_hashed_tokens, prompt_ids); + block = m_allocator.allocate_block(hash, num_hashed_tokens, cached_blocks); + } + else { + block = m_allocator.allocate_block(); + } + OPENVINO_ASSERT(block != nullptr); + m_block_table[sequence_id].push_back(block); + } + } + + float get_used_percentage() const { + return m_allocator.get_used_percentage(); + } + + void fork_sequence(uint64_t parent_id, uint64_t child_id) { + OPENVINO_ASSERT(m_block_table.count(child_id) == 0); + m_block_table[child_id].reserve(m_block_table[parent_id].size()); + for (KVCacheBlock::Ptr & block : m_block_table[parent_id]) { + block->increment(); + m_block_table[child_id].push_back(block); + } + } + + void free_sequence(size_t seq_id) { + auto block_table = m_block_table[seq_id]; + + for (KVCacheBlock::Ptr& block : block_table) { + m_allocator.free(block); + } + + OPENVINO_ASSERT(m_block_table.erase(seq_id) == 1); + } + + bool free_last_block(size_t seq_id) { + auto block_table = m_block_table[seq_id]; + OPENVINO_ASSERT(block_table.size() >= 1); + size_t block_idx = m_block_table[seq_id].size() - 1; + m_allocator.free(block_table[block_idx]); + m_block_table[seq_id].resize(m_block_table[seq_id].size() - 1); + + if (m_block_table[seq_id].size() == 0) { + OPENVINO_ASSERT(m_block_table.erase(seq_id) == 1); + } + return block_table[block_idx]->is_free(); + } + + void free_sequence_partially_single_runnning_sequence(size_t seq_id, size_t block_num) { + // this method is applicable only for groups with single sequences + + auto block_table = m_block_table[seq_id]; + OPENVINO_ASSERT(block_table.size() >= block_num); + for (size_t idx = 0; idx < block_num; idx++) { + size_t block_idx = m_block_table[seq_id].size() - idx - 1; + m_allocator.free(block_table[block_idx]); + 
OPENVINO_ASSERT(block_table[block_idx]->is_free()); + } + m_block_table[seq_id].resize(m_block_table[seq_id].size() - block_num); + + if (m_block_table[seq_id].size() == 0) { + OPENVINO_ASSERT(m_block_table.erase(seq_id) == 1); + } + } + + bool can_append_slots(SequenceGroup::CPtr seq_group) { + return required_blocks_count(seq_group) <= m_allocator.num_free_blocks(); + } + + size_t required_blocks_count(SequenceGroup::CPtr seq_group) { + std::vector running_sequences = seq_group->get_running_sequences(); + size_t blocks_count= 0; // totat number of needed blocks for sequence group + std::set last_block_ids; // unique last block indices + + for (auto seq: running_sequences) { + auto seq_id = seq->get_id(); + if (m_block_table.find(seq_id) == m_block_table.end()) { + // the block table is empty, so we need to allocate the number of blocks equal to number of logical blocks + blocks_count += seq_group->get_num_logical_blocks(); + continue; + } + auto& block_table = m_block_table[seq_id]; + size_t num_physical_blocks = block_table.size(); + OPENVINO_ASSERT(num_physical_blocks > 0); + + if (num_physical_blocks > seq_group->get_num_logical_blocks()) + // new blocks are not required + continue; + + size_t last_block_id = block_table.back()->get_index(); + + if (last_block_ids.find(last_block_id) != last_block_ids.end()) + // this block was already processed + continue; + last_block_ids.insert(last_block_id); + + size_t needed_blocks_per_sequence = seq_group->get_num_logical_blocks() - num_physical_blocks; + + KVCacheBlock::Ptr last_block = block_table.back(); + if (last_block->copy_on_write()) { + // block is used only by multiple sequences + auto references_count = last_block->get_references_count(); + + if (needed_blocks_per_sequence == 0) { + // case when last block is not completely filled and needs to be copied n - 1 times, where n - references count + blocks_count += references_count - 1; + } + else { + blocks_count += needed_blocks_per_sequence * references_count; 
+ } + } + else { + // block is used only by one sequence + blocks_count += needed_blocks_per_sequence; + } + } + return blocks_count; + } + + std::map> append_slots(SequenceGroup::CPtr seq_group) { + + size_t num_logical_blocks = seq_group->get_num_logical_blocks(); + std::vector running_sequences = seq_group->get_running_sequences(); + + std::map> copy_blocks_map; + for (size_t i = 0; i < running_sequences.size(); ++i) { + Sequence::CPtr sequence = running_sequences[i]; + auto seq_id = sequence->get_id(); + auto& block_table = m_block_table[seq_id]; + size_t num_physical_blocks = block_table.size(); + + if (num_logical_blocks > num_physical_blocks) { + OPENVINO_ASSERT(can_allocate_blocks(num_logical_blocks - num_physical_blocks)); + allocate(sequence, num_logical_blocks - num_physical_blocks, seq_group->get_prompt_ids()); + } else { + OPENVINO_ASSERT(num_logical_blocks == num_physical_blocks, "A number of physical and logic blocks must be the same in this code path"); + KVCacheBlock::Ptr last_block = block_table.back(); + if (last_block->copy_on_write()) { + // we need to fork current block, because reference counter is more than 1 + KVCacheBlock::Ptr new_block = nullptr; + if (m_enable_prefix_caching) { + auto hash = sequence->get_hash(seq_group->get_context_len(), seq_group->get_prompt_ids()); + new_block = m_allocator.allocate_block(hash, seq_group->get_context_len(), cached_blocks); + cached_blocks[hash] = new_block; + } + else { + new_block = m_allocator.allocate_block(); + } + block_table[num_physical_blocks - 1] = new_block; + // write information about block forking for later usage in CacheManager + copy_blocks_map[last_block->get_index()].push_back(new_block->get_index()); + // release `last_block` usage + m_allocator.free(last_block); + } else { + // we are the only users of this block + if (m_enable_prefix_caching) { + // update hash of block + auto prev_hash = last_block->get_hash(); + auto hash = sequence->get_hash(seq_group->get_context_len(), 
seq_group->get_prompt_ids()); + last_block->set_hash(hash, seq_group->get_context_len()); + cached_blocks.erase(prev_hash); + cached_blocks[hash] = last_block; + } + } + } + } + + // it returns information which blocks should be forked by CacheManager + return copy_blocks_map; + } + + + void _restore_cached_blocks(SequenceGroup::Ptr group, size_t block_size) { + auto prompt_ids = group->get_prompt_ids(); + auto sequences = group->get_not_finished_sequences(); + OPENVINO_ASSERT(sequences.size() == 1); + auto sequence = sequences[0]; + auto seq_id = sequence->get_id(); + auto& block_table = m_block_table[seq_id]; + + size_t content_len = 0; + while (content_len < prompt_ids.size()) { + size_t prev_iteration_content_len = content_len; + content_len += block_size; + if (content_len > prompt_ids.size()) { + content_len = prompt_ids.size(); + } + // restore fully filled blocks + auto hash = sequence->get_hash(content_len, prompt_ids); + auto block = m_allocator.get_cached_block(hash, cached_blocks); + if (block != nullptr) { + block->set_timestamp(std::chrono::system_clock::now()); + m_block_table[seq_id].push_back(block); + group->update_processed_tokens_num(content_len); + } + else { + // restore partially filled block + for (size_t i = 1; i < block_size; i++) { + if (prev_iteration_content_len + i > prompt_ids.size()) { + break; + } + auto hash = sequence->get_hash(prev_iteration_content_len + i, prompt_ids); + auto block = m_allocator.get_cached_block(hash, cached_blocks); + if (block != nullptr) { + block->set_timestamp(std::chrono::system_clock::now()); + m_block_table[seq_id].push_back(block); + group->update_processed_tokens_num(prev_iteration_content_len + i); + + size_t new_tokens_count_in_block = std::min(content_len, prev_iteration_content_len + block_size); + if (new_tokens_count_in_block > prev_iteration_content_len + i) { + cached_blocks.erase(hash); + auto new_hash = sequence->get_hash(new_tokens_count_in_block, prompt_ids); + cached_blocks[new_hash] = 
block; + } + + break; + } + } + break; + } + } + } +}; +} diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp similarity index 99% rename from text_generation/causal_lm/cpp/continuous_batching/library/src/cache_manager.hpp rename to src/cpp/src/cache_manager.hpp index 11e4dbb380..7553fe36ab 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/cache_manager.hpp +++ b/src/cpp/src/cache_manager.hpp @@ -9,6 +9,7 @@ #include "device_config.hpp" +namespace ov::genai { class CacheManager { DeviceConfig m_device_config; std::vector m_key_cache; @@ -82,3 +83,4 @@ class CacheManager { } } }; +} diff --git a/src/cpp/src/circular_buffer_queue.hpp b/src/cpp/src/circular_buffer_queue.hpp new file mode 100644 index 0000000000..086854e68e --- /dev/null +++ b/src/cpp/src/circular_buffer_queue.hpp @@ -0,0 +1,100 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace ov::genai { + +// From OVMS: +// https://github.com/openvinotoolkit/model_server/blob/d73e85cbb8ac1d761754cb2064a00551a9ffc655/src/queue.hpp#L34 +template +class CircularBufferQueue +{ + int m_front_idx; + std::atomic m_back_idx; + std::vector m_values; + std::queue> m_promises; + std::vector m_data; + std::mutex m_front_mut; + std::mutex m_queue_mutex; + +public: + + CircularBufferQueue(size_t length, const std::function& create_fn) : + m_values(length), + m_front_idx{0}, + m_back_idx{0} { + std::iota(m_values.begin(), m_values.end(), 0); + m_data.reserve(length); + for (size_t i = 0; i < length; i++) { + m_data.emplace_back(std::move(create_fn())); + } + } + + CircularBufferQueue(const CircularBufferQueue&) = delete; + CircularBufferQueue(const CircularBufferQueue&&) = delete; + CircularBufferQueue& operator=(const CircularBufferQueue&) = delete; + + T& get(int value) { + return m_data[value]; + } + + std::future get_idle() { + int value; + std::promise idle_promise; + std::future idle_future = 
idle_promise.get_future(); + std::unique_lock lk(m_front_mut); + if (m_values[m_front_idx] < 0) { + std::unique_lock queueLock(m_queue_mutex); + m_promises.push(std::move(idle_promise)); + } else { + value = m_values[m_front_idx]; + m_values[m_front_idx] = -1; + m_front_idx = (m_front_idx + 1) % m_values.size(); + lk.unlock(); + idle_promise.set_value(value); + } + return idle_future; + } + + void return_to(int value) { + std::unique_lock lk(m_queue_mutex); + if (m_promises.size()) { + std::promise promise = std::move(m_promises.front()); + m_promises.pop(); + lk.unlock(); + promise.set_value(value); + return; + } + int old_back = m_back_idx.load(); + while (!m_back_idx.compare_exchange_weak( + old_back, + (old_back + 1) % m_values.size(), + std::memory_order_relaxed)) { + } + m_values[old_back] = value; + } +}; + +template +class CircularBufferQueueElementGuard { + CircularBufferQueue* m_queue; + int m_value; +public: + CircularBufferQueueElementGuard(CircularBufferQueue* queue) : m_queue(queue) { + m_value = m_queue->get_idle().get(); // blocking until we get the element + } + + T& get() { + return m_queue->get(m_value); + } + + ~CircularBufferQueueElementGuard() { + m_queue->return_to(m_value); + } +}; + +} diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp similarity index 55% rename from text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp rename to src/cpp/src/continuous_batching_pipeline.cpp index 3d22644782..a66a88cad4 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/continuous_batching_pipeline.cpp +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -5,26 +5,36 @@ #include #include -#include "continuous_batching_pipeline.hpp" +#include "openvino/genai/continuous_batching_pipeline.hpp" +#include "openvino/genai/generation_handle.hpp" +#include "openvino/genai/tokenizer.hpp" #include 
"cache_manager.hpp" #include "sampler.hpp" #include "model_runner.hpp" #include "scheduler.hpp" +#include "text_callback_streamer.hpp" #include "timer.hpp" -#include "tokenizer.hpp" - #include "debug_utils.hpp" +using namespace ov::genai; + +template struct overloaded : Ts... {using Ts::operator()...;}; +template overloaded(Ts...) -> overloaded; + void apply_paged_attention_transformations(std::shared_ptr model, DeviceConfig& device_config); class ContinuousBatchingPipeline::Impl { - std::shared_ptr m_tokenizer; + ov::genai::Tokenizer m_tokenizer; std::shared_ptr m_scheduler; std::shared_ptr m_cache_manager; std::shared_ptr m_model_runner; std::shared_ptr m_sampler; - GenerationConfig m_generation_config; + // TODO (mzegla): GenerationConfig is request specific object + // and pipeline only uses default rng_seed. + ov::genai::GenerationConfig m_generation_config; + + PipelineMetrics m_pipeline_metrics; struct PerfTime { float m_paged_attention_time_ms = 0.0f; @@ -46,6 +56,8 @@ class ContinuousBatchingPipeline::Impl { std::vector m_awaiting_requests; // Mutex protecting access to m_awaiting_requests, so add_request and step methods can be called from different threads std::mutex m_awaiting_requests_mutex; + bool m_is_chat_conversation = false; + ChatHistory m_history; void _free_non_running_requests() { @@ -56,6 +68,7 @@ class ContinuousBatchingPipeline::Impl { for (const auto& sequence: request->get_sequences()) { m_scheduler->free_sequence(sequence->get_id()); } + m_sampler->clear_beam_search_info(request->get_request_id()); requests_iterator = m_requests.erase(requests_iterator); } else { requests_iterator++; @@ -64,9 +77,9 @@ class ContinuousBatchingPipeline::Impl { } public: - Impl(const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string device, const ov::AnyMap& plugin_config) { + Impl(const std::string& models_path, const Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const 
ov::AnyMap& plugin_config) : + m_tokenizer{tokenizer} { ov::Core core; - m_tokenizer = std::make_shared(models_path); // The model can be compiled for GPU as well std::shared_ptr model = core.read_model(models_path + "/openvino_model.xml"); @@ -99,26 +112,24 @@ class ContinuousBatchingPipeline::Impl { // read default generation config } - GenerationConfig get_config() const { + Impl(const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const ov::AnyMap& llm_plugin_config, const ov::AnyMap& tokenizer_plugin_config) + : Impl{models_path, Tokenizer(models_path, tokenizer_plugin_config), scheduler_config, device, llm_plugin_config} {} + + ov::genai::GenerationConfig get_config() const { return m_generation_config; } - std::shared_ptr get_tokenizer() { + PipelineMetrics get_metrics() const { + return m_pipeline_metrics; + } + + ov::genai::Tokenizer get_tokenizer() { return m_tokenizer; } - GenerationHandle add_request(uint64_t request_id, std::string prompt, GenerationConfig sampling_params) { - sampling_params.set_eos_token_id(m_tokenizer->get_eos_token_id()); + GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, ov::genai::GenerationConfig sampling_params) { + sampling_params.set_eos_token_id(m_tokenizer.get_eos_token_id()); sampling_params.validate(); - - ov::Tensor input_ids; - { - static ManualTimer timer("tokenize"); - timer.start(); - input_ids = m_tokenizer->encode(prompt); - timer.end(); - } - SequenceGroup::Ptr sequence_group = std::make_shared(request_id, input_ids, sampling_params, m_scheduler->get_config().block_size); { @@ -128,6 +139,14 @@ class ContinuousBatchingPipeline::Impl { return std::make_unique(sequence_group->get_generation_stream(), sampling_params); } + GenerationHandle add_request(uint64_t request_id, const std::string& prompt, ov::genai::GenerationConfig sampling_params) { + static ManualTimer timer("tokenize"); + timer.start(); + ov::Tensor input_ids = 
m_tokenizer.encode(prompt).input_ids; + timer.end(); + return add_request(request_id, input_ids, sampling_params); + } + void step() { static ManualTimer step_timer("step()"); step_timer.start(); @@ -139,11 +158,14 @@ class ContinuousBatchingPipeline::Impl { m_awaiting_requests.clear(); } + m_pipeline_metrics.requests = m_requests.size(); Scheduler::Output scheduler_output; { static ManualTimer timer("scheduling"); timer.start(); scheduler_output = m_scheduler->schedule(m_requests); + m_pipeline_metrics.scheduled_requests = scheduler_output.m_scheduled_sequence_groups_ids.size(); + m_pipeline_metrics.cache_usage = scheduler_output.m_cache_usage; m_cache_manager->copy_blocks(scheduler_output.m_block_copy_map); timer.end(); } @@ -222,25 +244,47 @@ class ContinuousBatchingPipeline::Impl { return !m_awaiting_requests.empty() || !m_requests.empty(); } - std::vector generate(const std::vector prompts, std::vector sampling_params) { + std::vector generate(const std::vector& input_ids, const std::vector& sampling_params, const StreamerVariant& streamer) { OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. 
Use ContinuousBatchingPipeline::add_request"); - OPENVINO_ASSERT(prompts.size() == sampling_params.size()); + OPENVINO_ASSERT(input_ids.size() == sampling_params.size()); + const std::shared_ptr& streamer_ptr = std::visit(overloaded{ + [](std::monostate) -> std::shared_ptr { + return nullptr; + }, + [](const std::shared_ptr& streamer) { + return streamer; + }, + [this](const std::function& streamer) -> std::shared_ptr { + return std::make_unique(m_tokenizer, streamer); + } + }, streamer); std::vector generations; - for (size_t request_id = 0; request_id < prompts.size(); ++request_id) { - generations.push_back(add_request(request_id, prompts[request_id], sampling_params[request_id])); + for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) { + OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch."); + generations.push_back(add_request(request_id, input_ids[request_id], sampling_params[request_id])); } - std::vector results; + std::vector results; results.reserve(m_awaiting_requests.size()); - while (has_non_finished_requests()) { + bool continue_generation = true; + while (has_non_finished_requests() && continue_generation) { step(); + if (streamer_ptr) { + std::unordered_map token = generations.at(0).get()->back(); + OPENVINO_ASSERT(1 == token.size()); + OPENVINO_ASSERT(1 == token.begin()->second.generated_token_ids.size()); + continue_generation = !streamer_ptr->put(token.begin()->second.generated_token_ids.at(0)); + } + } + if (streamer_ptr) { + streamer_ptr->end(); } for (size_t generation_idx = 0; generation_idx < generations.size(); ++generation_idx) { const auto& generation = generations[generation_idx]; - GenerationResult result; + EncodedGenerationResult result; result.m_request_id = 1; std::vector generation_outputs = generation->read_all(); std::sort(generation_outputs.begin(), generation_outputs.end(), [=] (GenerationOutput& r1, GenerationOutput& r2) { @@ -250,38 +294,107 @@ class 
ContinuousBatchingPipeline::Impl { auto num_outputs = std::min(sampling_params[generation_idx].num_return_sequences, generation_outputs.size()); for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) { const auto& generation_output = generation_outputs[generation_output_idx]; - std::string output_text = m_tokenizer->decode(generation_output.generated_token_ids); - result.m_generation_ids.push_back(output_text); + result.m_generation_ids.push_back(std::move(generation_output.generated_token_ids)); result.m_scores.push_back(generation_output.score); } result.m_status = generation->get_status(); - results.push_back(result); + results.push_back(std::move(result)); } - OPENVINO_ASSERT(results.size() == prompts.size()); + OPENVINO_ASSERT(results.size() == input_ids.size()); return results; } + + std::vector generate(const std::vector& prompts, std::vector sampling_params, const StreamerVariant& streamer) { + std::vector input_ids; + static ManualTimer timer("tokenize"); + if (m_is_chat_conversation) { + OPENVINO_ASSERT(1 == prompts.size(), "Can't chat with multiple prompts"); + m_history.push_back({{"role", "user"}, {"content", prompts.at(0)}}); + constexpr bool add_generation_prompt = true; + std::string history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + timer.start(); + input_ids.push_back(m_tokenizer.encode(history).input_ids); + timer.end(); + } else { + input_ids.reserve(prompts.size()); + for (const std::string& prompt : prompts) { + timer.start(); + input_ids.push_back(m_tokenizer.encode(prompt).input_ids); + timer.end(); + } + } + std::vector encoded = generate(input_ids, sampling_params, streamer); + std::vector decoded; + decoded.reserve(encoded.size()); + for (EncodedGenerationResult& res : encoded) { + std::vector generated; + generated.reserve(res.m_generation_ids.size()); + for (size_t idx = 0; idx < res.m_generation_ids.size(); ++idx) { + 
generated.push_back(m_tokenizer.decode(res.m_generation_ids.at(idx))); + if (m_is_chat_conversation && 0 == idx) { + m_history.push_back({{"role", "assistant"}, {"content", generated.back()}}); + } + } + decoded.push_back(GenerationResult{ + res.m_request_id, + std::move(generated), + std::move(res.m_scores), + res.m_status + }); + } + return decoded; + } + + void start_chat(const std::string& system_message) { + if (!system_message.empty()) { + m_history.push_back({{"role", "system"}, {"content", system_message}}); + } + m_is_chat_conversation = true; + }; + + void finish_chat() { + m_is_chat_conversation = false; + m_history.clear(); + }; }; ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::string& models_path, const SchedulerConfig& scheduler_config, const std::string& device, - const ov::AnyMap& plugin_config ) { - m_impl = std::make_shared(models_path, scheduler_config, device, plugin_config); + const ov::AnyMap& llm_plugin_config, + const ov::AnyMap& tokenizer_plugin_config) { + m_impl = std::make_shared(models_path, scheduler_config, device, llm_plugin_config, tokenizer_plugin_config); } -std::shared_ptr ContinuousBatchingPipeline::get_tokenizer() { +ContinuousBatchingPipeline::ContinuousBatchingPipeline( + const std::string& model_path, + const Tokenizer& tokenizer, + const SchedulerConfig& scheduler_config, + const std::string& device, + const ov::AnyMap& plugin_config +) : m_impl{std::make_shared(model_path, tokenizer, scheduler_config, device, plugin_config)} {} + +ov::genai::Tokenizer ContinuousBatchingPipeline::get_tokenizer() { return m_impl->get_tokenizer(); } -GenerationConfig ContinuousBatchingPipeline::get_config() const{ +ov::genai::GenerationConfig ContinuousBatchingPipeline::get_config() const{ return m_impl->get_config(); } -GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, std::string prompt, GenerationConfig sampling_params) { +PipelineMetrics ContinuousBatchingPipeline::get_metrics() const{ + 
return m_impl->get_metrics(); +} + +GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, const std::string& prompt, const ov::genai::GenerationConfig& sampling_params) { return m_impl->add_request(request_id, prompt, sampling_params); } +GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, const ov::Tensor& input_ids, const ov::genai::GenerationConfig& sampling_params) { + return m_impl->add_request(request_id, input_ids, sampling_params); +} + void ContinuousBatchingPipeline::step() { m_impl->step(); } @@ -290,6 +403,18 @@ bool ContinuousBatchingPipeline::has_non_finished_requests() { return m_impl->has_non_finished_requests(); } -std::vector ContinuousBatchingPipeline::generate(const std::vector& prompts, std::vector sampling_params) { - return m_impl->generate(prompts, sampling_params); -} \ No newline at end of file +std::vector ContinuousBatchingPipeline::generate(const std::vector& input_ids, const std::vector& sampling_params, const StreamerVariant& streamer) { + return m_impl->generate(input_ids, sampling_params, streamer); +} + +std::vector ContinuousBatchingPipeline::generate(const std::vector& prompts, const std::vector& sampling_params, const StreamerVariant& streamer) { + return m_impl->generate(prompts, sampling_params, streamer); +} + +void ContinuousBatchingPipeline::start_chat(const std::string& system_message) { + m_impl->start_chat(system_message); +}; + +void ContinuousBatchingPipeline::finish_chat() { + m_impl->finish_chat(); +}; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/debug_utils.hpp b/src/cpp/src/debug_utils.hpp similarity index 100% rename from text_generation/causal_lm/cpp/continuous_batching/library/src/debug_utils.hpp rename to src/cpp/src/debug_utils.hpp diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/device_config.hpp b/src/cpp/src/device_config.hpp similarity index 97% rename from 
text_generation/causal_lm/cpp/continuous_batching/library/src/device_config.hpp rename to src/cpp/src/device_config.hpp index 010d9b2ba2..f2ed5d424b 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/device_config.hpp +++ b/src/cpp/src/device_config.hpp @@ -7,8 +7,9 @@ #include "openvino/core/shape.hpp" #include "openvino/core/type/element_type.hpp" -#include "scheduler_config.hpp" +#include "openvino/genai/scheduler_config.hpp" +namespace ov::genai { class DeviceConfig { ov::element::Type m_kv_cache_type; ov::Shape m_key_cache_shape, m_value_cache_shape; @@ -87,3 +88,4 @@ class DeviceConfig { return m_num_kv_blocks; } }; +} diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index ce313de1c3..6578a6bd08 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -49,6 +49,16 @@ GenerationConfig::GenerationConfig(const std::string& json_path) { } } +void GenerationConfig::set_eos_token_id(size_t tokenizer_eos_token_id) { + if (eos_token_id < 0) { + eos_token_id = tokenizer_eos_token_id; + } else { + OPENVINO_ASSERT(eos_token_id == tokenizer_eos_token_id, + "EOS token ID is different in generation config (", eos_token_id, ") and tokenizer (", + tokenizer_eos_token_id, ")"); + } +} + void GenerationConfig::update_generation_config(const ov::AnyMap& config_map) { using ov::genai::utils::read_anymap_param; @@ -96,8 +106,9 @@ void GenerationConfig::validate() const { "Beam search with sampling is not supported yet. 
" "Please either set do_sample=false to use beam search " "or set num_beams=1 if you with to use multinomial sampling."); - OPENVINO_ASSERT(num_return_sequences <= num_beams, "num_return_sequences must be less or equal to num_beams"); + OPENVINO_ASSERT(num_return_sequences > 0, "num_return_sequences must be greater than 0"); OPENVINO_ASSERT(max_new_tokens > 0, "'max_new_tokens' must be greater than 0"); + OPENVINO_ASSERT(min_new_tokens <= max_new_tokens, "min_new_tokens must be less or equal max_new_tokens"); // max_new_tokens has priority over max_length // if max_new_tokens is defined no need to check max_length @@ -123,7 +134,48 @@ void GenerationConfig::validate() const { OPENVINO_ASSERT(eos_token_id != -1 || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX, "Either 'eos_token_id', or 'max_new_tokens', or 'max_length' should be defined."); + if (is_beam_search()) { + OPENVINO_ASSERT(no_repeat_ngram_size > 0, "no_repeat_ngram_size must be positive"); + } else { + OPENVINO_ASSERT(frequency_penalty >= -2.0f && frequency_penalty <= 2.0f, "frequence_penalty penalty must be a [-2; +2]"); + OPENVINO_ASSERT(presence_penalty >= -2.0f && presence_penalty <= 2.0f, "presence_penalty penalty must be a [-2; +2]"); + } } +GenerationConfig beam_search() { + GenerationConfig beam_search_config; + beam_search_config.num_beams = 4; + beam_search_config.num_return_sequences = 3; + beam_search_config.num_beam_groups = 2; + beam_search_config.max_new_tokens = 100; + beam_search_config.diversity_penalty = 2.0f; + return beam_search_config; +} + +GenerationConfig greedy() { + GenerationConfig greedy_config; + greedy_config.temperature = 0.0f; + greedy_config.ignore_eos = true; + greedy_config.num_return_sequences = 1; + greedy_config.repetition_penalty = 3.0f; + greedy_config.presence_penalty = 0.1f; + greedy_config.frequency_penalty = 0.01f; + greedy_config.max_new_tokens = 30; + return greedy_config; +} + +GenerationConfig multinomial() { + GenerationConfig multinomial_config; + 
multinomial_config.do_sample = true; + multinomial_config.temperature = 0.9f; + multinomial_config.top_p = 0.9f; + multinomial_config.top_k = 20; + multinomial_config.num_return_sequences = 3; + multinomial_config.presence_penalty = 0.01f; + multinomial_config.frequency_penalty = 0.1f; + multinomial_config.min_new_tokens = 15; + multinomial_config.max_new_tokens = 30; + return multinomial_config; +} } // namespace genai } // namespace ov diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/generation_handle.cpp b/src/cpp/src/generation_handle.cpp similarity index 90% rename from text_generation/causal_lm/cpp/continuous_batching/library/src/generation_handle.cpp rename to src/cpp/src/generation_handle.cpp index ddd591c207..26cc12604f 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/generation_handle.cpp +++ b/src/cpp/src/generation_handle.cpp @@ -3,9 +3,11 @@ #include -#include "generation_handle.hpp" +#include "openvino/genai/generation_handle.hpp" #include "generation_stream.hpp" +using namespace ov::genai; + GenerationHandleImpl::~GenerationHandleImpl() { m_generation_stream->drop(); } @@ -18,6 +20,10 @@ bool GenerationHandleImpl::can_read() { return m_generation_stream->can_read(); } +std::unordered_map GenerationHandleImpl::back() { + return m_generation_stream->back(); +} + std::unordered_map GenerationHandleImpl::read() { return m_generation_stream->read(); } diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/generation_stream.hpp b/src/cpp/src/generation_stream.hpp similarity index 86% rename from text_generation/causal_lm/cpp/continuous_batching/library/src/generation_stream.hpp rename to src/cpp/src/generation_stream.hpp index f750ac9798..1ac2eefef9 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/generation_stream.hpp +++ b/src/cpp/src/generation_stream.hpp @@ -4,11 +4,11 @@ #pragma once #include #include -#include "continuous_batching_pipeline.hpp" +#include 
"openvino/genai/continuous_batching_pipeline.hpp" +#include "openvino/genai/generation_handle.hpp" #include "synchronized_queue.hpp" -#include "generation_handle.hpp" - +namespace ov::genai { class GenerationStream { std::mutex m_mutex; GenerationStatus m_status = GenerationStatus::RUNNING; @@ -31,6 +31,9 @@ class GenerationStream { } // Retriving vector of pairs as we can generate multiple outputs for a single prompt + GenerationOutputs back() { + return m_output_queue.back(); + } GenerationOutputs read() { return m_output_queue.pull(); } @@ -54,3 +57,4 @@ class GenerationStream { m_status = GenerationStatus::DROPPED_BY_HANDLE; } }; +} diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp index 48125b7ab8..8dc56b4ba8 100644 --- a/src/cpp/src/greedy_decoding.cpp +++ b/src/cpp/src/greedy_decoding.cpp @@ -1,7 +1,7 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include "openvino/genai/llm_pipeline.hpp" +#include "openvino/genai/perf_metrics.hpp" #include "utils.hpp" namespace ov { @@ -12,56 +12,27 @@ EncodedResults greedy_decoding( ov::Tensor input_ids, ov::Tensor attention_mask, const ov::genai::GenerationConfig generation_config, - const std::shared_ptr streamer, - const bool is_chat_conversation, - const bool is_cache_empty + const std::shared_ptr streamer, + std::optional position_ids ) { ov::Shape prompts_shape = input_ids.get_shape(); const size_t batch_size = prompts_shape[0]; size_t running_batch_size = batch_size; size_t prompt_len = prompts_shape[1]; - - auto num_inputs = m_model_runner.get_compiled_model().inputs().size(); - bool position_ids_available = num_inputs == 4; - ov::Tensor position_ids; + size_t max_new_tokens = generation_config.get_max_new_tokens(prompt_len); + // Initialize results and performance metrics. 
EncodedResults results; + auto& raw_perf_counters = results.perf_metrics.raw_metrics; + results.scores.resize(running_batch_size); results.tokens.resize(running_batch_size); std::fill(results.scores.begin(), results.scores.end(), 0); - - int64_t kv_cache_len = 0; - if (is_chat_conversation && !is_cache_empty) { - OPENVINO_ASSERT(batch_size == 1, "continuation of generation is possible only for batch 1"); - - // between subsequent runs attention_mask should not be modified - auto atten_mask_history = m_model_runner.get_tensor("attention_mask"); - kv_cache_len = atten_mask_history.get_shape()[1]; - - size_t prompt_len = attention_mask.get_shape()[1]; - ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}}; - - std::copy(atten_mask_history.data(), atten_mask_history.data() + kv_cache_len, - new_atten_mask.data()); - std::copy(attention_mask.data(), attention_mask.data() + prompt_len, - new_atten_mask.data() + kv_cache_len); - - m_model_runner.set_tensor("attention_mask", new_atten_mask); - } else if (!is_cache_empty) { - OPENVINO_THROW("KV cache contains initial values but generate is run not in chat scenario. 
" - "Initial KV cache can contain values only if start_chat() is called."); - } else { - m_model_runner.set_tensor("attention_mask", attention_mask); - } - if (position_ids_available) { - position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; - utils::initialize_position_ids(position_ids, attention_mask, kv_cache_len); - } - m_model_runner.set_tensor("input_ids", input_ids); - if (position_ids_available) - m_model_runner.set_tensor("position_ids", position_ids); + m_model_runner.set_tensor("attention_mask", attention_mask); + if (position_ids.has_value()) + m_model_runner.set_tensor("position_ids", *position_ids); m_model_runner.get_tensor("beam_idx").set_shape({running_batch_size}); auto beam_data = m_model_runner.get_tensor("beam_idx").data(); @@ -83,6 +54,9 @@ EncodedResults greedy_decoding( eos_met[batch] = (out_token == generation_config.eos_token_id); m_model_runner.get_tensor("input_ids").data()[batch] = out_token; } + raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); + if (streamer && streamer->put(token_iter_results[0])) { return results; } @@ -91,9 +65,9 @@ EncodedResults greedy_decoding( if (!generation_config.ignore_eos && all_are_eos) return results; - size_t max_tokens = generation_config.get_max_new_tokens(prompt_len); - for (size_t i = 0; i < max_tokens - 1; ++i) { - if (position_ids_available) + + for (size_t i = 0; i < max_new_tokens - 1; ++i) { + if (position_ids.has_value()) utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask")); m_model_runner.set_tensor("attention_mask", utils::extend_attention(m_model_runner.get_tensor("attention_mask"))); @@ -113,6 +87,8 @@ EncodedResults greedy_decoding( m_model_runner.get_tensor("input_ids").data()[batch] = out_token; } + raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + 
raw_perf_counters.m_batch_sizes.emplace_back(batch_size); if (streamer && streamer->put(token_iter_results[0])) return results; @@ -139,8 +115,9 @@ EncodedResults greedy_decoding( if (streamer) { streamer->end(); } + return results; } } //namespace genai -} //namespace ov \ No newline at end of file +} //namespace ov diff --git a/src/cpp/src/group_beam_searcher.cpp b/src/cpp/src/group_beam_searcher.cpp index f907156125..1b9729b2f6 100644 --- a/src/cpp/src/group_beam_searcher.cpp +++ b/src/cpp/src/group_beam_searcher.cpp @@ -312,24 +312,6 @@ std::vector>> finalize(GroupBeamSearcher&& group_b return finalized; } -void initialize_inputs(const ov::Tensor& input_ids, const ov::Tensor& attention_mask, ov::InferRequest& request) { - request.set_tensor("input_ids", input_ids); - request.set_tensor("attention_mask", attention_mask); - - ov::Shape input_shape = input_ids.get_shape(); - auto num_inputs = request.get_compiled_model().inputs().size(); - bool position_ids_available = num_inputs == 4; - if (position_ids_available){ - ov::Tensor position_ids = request.get_tensor("position_ids"); - position_ids.set_shape(input_shape); - ov::genai::utils::initialize_position_ids(position_ids, attention_mask); - } - - ov::Tensor beam_idx = request.get_tensor("beam_idx"); - beam_idx.set_shape({input_shape.at(0)}); - std::fill_n(beam_idx.data(), input_shape.at(0), 0); -} - void update_attention_mask_with_beams(ov::Tensor&& attention_mask, std::vector next_beams) { ov::Tensor original_mask{ov::element::i64, attention_mask.get_shape()}; ov::Shape original_shape = original_mask.get_shape(); @@ -361,34 +343,54 @@ void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention } } +void reset_all_inputs_to_empty_tensors(ov::InferRequest& request) { + request.set_tensor("input_ids", ov::Tensor(ov::element::i64, {0, 0})); + request.set_tensor("beam_idx", ov::Tensor(ov::element::i32, {0})); + if (request.get_compiled_model().inputs().size() == 4) + 
request.set_tensor("position_ids", ov::Tensor(ov::element::i64, {0, 0})); +} } // namespace namespace ov { namespace genai { -EncodedResults beam_search(ov::InferRequest& lm, +std::pair beam_search(ov::InferRequest& lm, ov::Tensor input_ids, ov::Tensor attention_mask, - GenerationConfig config) { + GenerationConfig config, + std::optional position_ids, + std::optional selected_beam_idx) { OPENVINO_ASSERT(config.num_beams % config.num_beam_groups == 0, "number of beams should be divisible by number of groups"); - - // Initialize beam search + auto batch_size = input_ids.get_shape().at(0); + auto sequence_length = input_ids.get_shape().at(1); + + // Initialize beam search. const int64_t* prompt_data = input_ids.data(); std::vector> prompts; prompts.reserve(batch_size); for (size_t batch = 0; batch < batch_size; batch++) { - size_t sequence_length = input_ids.get_shape().at(1); size_t batch_offset = batch * sequence_length; const int64_t* prompt_start = prompt_data + batch_offset; prompts.push_back(std::vector{prompt_start, prompt_start + sequence_length}); } - initialize_inputs(input_ids, attention_mask, lm); + lm.set_tensor("input_ids", input_ids); + lm.set_tensor("attention_mask", attention_mask); + if (position_ids.has_value()) + lm.set_tensor("position_ids", *position_ids); + + ov::Tensor beam_idx = ov::Tensor(ov::element::i32, {batch_size}); + auto beam_data = beam_idx.data(); + if (selected_beam_idx.has_value()) + beam_data[0] = *selected_beam_idx; + else + std::fill_n(beam_data, batch_size, 0); + lm.set_tensor("beam_idx", beam_idx); Parameters parameters{std::move(prompts)}; - parameters.max_new_tokens = config.max_new_tokens; + parameters.max_new_tokens = config.get_max_new_tokens(sequence_length); parameters.eos_token_id = config.eos_token_id; parameters.n_groups = config.num_beam_groups; parameters.group_size = config.num_beams / config.num_beam_groups; @@ -400,34 +402,52 @@ EncodedResults beam_search(ov::InferRequest& lm, std::vector next_tokens; 
std::vector next_beams; - auto num_inputs = lm.get_compiled_model().inputs().size(); - bool position_ids_available = num_inputs == 4; - - for (size_t length_count = 0; length_count < parameters.max_new_tokens; ++length_count) { + + // Reserve for performance counters. + std::vector new_token_times; + std::vector batch_sizes; + new_token_times.reserve(parameters.max_new_tokens); + batch_sizes.reserve(parameters.max_new_tokens); + + for (size_t length_count = 0; ; ++length_count) { lm.infer(); std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits")); - if (next_tokens.empty()) { + new_token_times.emplace_back(std::chrono::steady_clock::now()); + batch_sizes.emplace_back(batch_size); + + if (next_tokens.empty() || length_count == parameters.max_new_tokens - 1) { + // Break the cycle before masks are extended in update_attention_mask_with_beams. + // If generation is continued, attention_mask length should be equal to KV cache size. break; } - size_t batch_size = next_tokens.size(); + + size_t running_batch_size = next_tokens.size(); // Set pointers - lm.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()}); - lm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()}); + lm.set_tensor("input_ids", ov::Tensor{ov::element::i64, {running_batch_size, 1}, next_tokens.data()}); + lm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {running_batch_size}, next_beams.data()}); + // Set auxiliary inputs update_attention_mask_with_beams(lm.get_tensor("attention_mask"), next_beams); - if (position_ids_available) + if (position_ids.has_value()) update_position_ids(lm.get_tensor("position_ids"), lm.get_tensor("attention_mask")); } + reset_all_inputs_to_empty_tensors(lm); + auto scores_comparator = [](Beam& left, Beam& right) { return (left.score > right.score); }; auto result = finalize(std::move(group_beam_searcher)); ov::genai::EncodedResults results; + int32_t 
res_selected_beam_idx = 0; results.scores.reserve(config.num_return_sequences * result.size()); results.tokens.reserve(config.num_return_sequences * result.size()); + auto& raw_perf_counters = results.perf_metrics.raw_metrics; + raw_perf_counters.m_new_token_times = new_token_times; + raw_perf_counters.m_batch_sizes = batch_sizes; + // align output with HF for (size_t prompt_id = 0; prompt_id < result.size(); prompt_id++) { auto prompt_group = result.at(prompt_id); @@ -445,6 +465,7 @@ EncodedResults beam_search(ov::InferRequest& lm, plain_beams.end(), scores_comparator ); + res_selected_beam_idx = plain_beams.at(0).get().global_beam_idx; for ( auto beam = plain_beams.begin(); beam != plain_beams.begin() + config.num_return_sequences; @@ -454,7 +475,8 @@ EncodedResults beam_search(ov::InferRequest& lm, results.tokens.push_back(std::move(beam->get().tokens)); } } - return results; + + return {results, res_selected_beam_idx}; } } // namespace genai diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 39b0840074..b121fe9e6d 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -7,59 +7,43 @@ #include #include #include +#include "openvino/genai/continuous_batching_pipeline.hpp" #include "openvino/genai/generation_config.hpp" #include "openvino/genai/llm_pipeline.hpp" +#include "openvino/genai/perf_metrics.hpp" +#include "llm_pipeline_base.hpp" +#include "llm_pipeline_static.hpp" #include "utils.hpp" #include "text_callback_streamer.hpp" namespace { -const std::string STREAMER_ARG_NAME = "streamer"; -const std::string CONFIG_ARG_NAME = "generation_config"; +ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& fisrt, const ov::genai::TokenizedInputs& second){ + auto first_size = fisrt.input_ids.get_size(); + auto second_size = second.input_ids.get_size(); + ov::Shape new_shape{1, first_size - second_size}; -ov::genai::GenerationConfig from_config_json_if_exists(const 
std::filesystem::path& model_path) { - auto config_file_path = model_path / "generation_config.json"; - if (std::filesystem::exists(config_file_path)) { - return ov::genai::GenerationConfig((config_file_path).string()); - } else { - return ov::genai::GenerationConfig{}; - } -} + ov::Tensor new_input_ids(ov::element::i64, new_shape); + auto data_ptr = fisrt.input_ids.data(); + std::copy(data_ptr + second_size, data_ptr + first_size, new_input_ids.data()); -ov::genai::StreamerVariant get_streamer_from_map(const ov::AnyMap& config_map) { - ov::genai::StreamerVariant streamer = std::monostate(); - - if (config_map.count(STREAMER_ARG_NAME)) { - auto any_val = config_map.at(STREAMER_ARG_NAME); - if (any_val.is>()) { - streamer = any_val.as>(); - } else if (any_val.is>()) { - streamer = any_val.as>(); - } - } - return streamer; -} + ov::Tensor new_attention_mask(ov::element::i64, new_shape); + std::fill_n(new_attention_mask.data(), new_shape[1], 1); -ov::genai::OptionalGenerationConfig get_config_from_map(const ov::AnyMap& config_map) { - if (config_map.count(CONFIG_ARG_NAME)) - return config_map.at(CONFIG_ARG_NAME).as(); - else - return std::nullopt; + return {new_input_ids, new_attention_mask}; } - } namespace ov { namespace genai { ov::genai::EncodedResults greedy_decoding( - ov::InferRequest& model_runner, - ov::Tensor prompts, - ov::Tensor attention_mask, - const GenerationConfig sampling_params, - const std::shared_ptr streamer, - const bool is_chat_conversation = false, - const bool is_cache_empty = true + ov::InferRequest& model_runner, + ov::Tensor prompts, + ov::Tensor attention_mask, + const GenerationConfig sampling_params, + const std::shared_ptr streamer, + std::optional position_ids ); ov::genai::EncodedResults multinominal_decoding( @@ -67,46 +51,46 @@ ov::genai::EncodedResults multinominal_decoding( ov::Tensor prompts, ov::Tensor attention_mask, GenerationConfig sampling_params, - std::shared_ptr streamer + std::shared_ptr streamer, + std::optional 
position_ids ); -EncodedResults beam_search( +std::pair beam_search( ov::InferRequest& lm, ov::Tensor prompts, ov::Tensor attention_mask, - GenerationConfig config + GenerationConfig config, + std::optional position_ids, + std::optional selected_beam_idx ); - -class LLMPipeline::LLMPipelineImpl { +class StatefulLLMPipeline final : public LLMPipelineImplBase { public: ov::InferRequest m_model_runner; - Tokenizer m_tokenizer; - GenerationConfig m_generation_config; bool is_chat_conversation = false; bool m_is_cache_empty = true; + std::optional m_selected_beam = std::nullopt; ChatHistory m_history; std::string m_templated_chat_history = ""; - LLMPipelineImpl( - const ov::InferRequest& request, - const ov::genai::Tokenizer& tokenizer, + StatefulLLMPipeline( + const ov::InferRequest& request, + const ov::genai::Tokenizer& tokenizer, OptionalGenerationConfig generation_config=std::nullopt - ): m_model_runner(request), - m_tokenizer(tokenizer) { - GenerationConfig default_config; - m_generation_config = (generation_config.has_value()) ? *generation_config : default_config; + ): LLMPipelineImplBase(tokenizer), + m_model_runner(request) { + GenerationConfig default_config; + m_generation_config = (generation_config.has_value()) ? 
*generation_config : default_config; } - LLMPipelineImpl( + StatefulLLMPipeline( const std::filesystem::path& model_path, const ov::genai::Tokenizer& tokenizer, const std::string& device, const ov::AnyMap& plugin_config ): - m_tokenizer(tokenizer), - m_generation_config{from_config_json_if_exists(model_path)} + LLMPipelineImplBase(tokenizer, utils::from_config_json_if_exists(model_path)) { ov::Core core; core.set_property(device, plugin_config); @@ -114,42 +98,61 @@ class LLMPipeline::LLMPipelineImpl { // If eos_token_id was not provided, take value if (m_generation_config.eos_token_id == -1) - m_generation_config.eos_token_id = m_tokenizer.get_eos_token_id(); + m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id()); } - LLMPipelineImpl( + StatefulLLMPipeline( const std::filesystem::path& model_path, const std::string& device, const ov::AnyMap& plugin_config - ): LLMPipelineImpl{model_path, Tokenizer(model_path.string()), device, plugin_config} {} + ): StatefulLLMPipeline{model_path, Tokenizer(model_path.string()), device, plugin_config} {} DecodedResults generate( - StringInputs inputs, - OptionalGenerationConfig generation_config, + StringInputs inputs, + OptionalGenerationConfig generation_config, StreamerVariant streamer - ) { + ) override { + auto start_time = std::chrono::steady_clock::now(); GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; - EncodedInputs encoded_input; - + TokenizedInputs encoded_input; + if (auto input_vector = std::get_if>(&inputs)) { + OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts"); encoded_input = m_tokenizer.encode(*input_vector); } else if (auto input_prompt = std::get_if(&inputs)) { std::string& prompt = *input_prompt; if (is_chat_conversation) { + // KV cache in model already contains prompts and answers from previous iterations. + // So only new prompt wrapped into chat template to be sent into model. 
Tokenizer always returns + // token_ids = {, ...}. So if tokenizer applies only to the new prompt, + // will be inserted on every iteration. + // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt + // and takes only the difference between them. + // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but + // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. + m_history.push_back({{"role", "user"}, {"content", prompt}}); constexpr bool add_generation_prompt = true; auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); - - prompt = new_templated_chat_history.substr(m_templated_chat_history.size()); + auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history); + if (m_is_cache_empty) { + encoded_input = new_chat_tokens; + } else { + auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history); + encoded_input = subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens); + } m_templated_chat_history = new_templated_chat_history; + } else { + encoded_input = m_tokenizer.encode(prompt); } - - encoded_input = m_tokenizer.encode(prompt); } + auto encode_stop_time = std::chrono::steady_clock::now(); + auto encoded_results = generate(encoded_input, config, streamer); - auto encoded_results = generate(encoded_input, config, streamer); + auto decode_start_time = std::chrono::steady_clock::now(); DecodedResults decoded_results = {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores}; + auto decode_stop_time = std::chrono::steady_clock::now(); if (is_chat_conversation) { // Tail of chat template is missing in KV cache. 
@@ -159,17 +162,28 @@ class LLMPipeline::LLMPipelineImpl { m_history.push_back({{"role", "assistant"}, {"content", answer}}); } + // generate_durations + decoded_results.perf_metrics = encoded_results.perf_metrics; + + auto& raw_counters = decoded_results.perf_metrics.raw_metrics; + auto stop_time = std::chrono::steady_clock::now(); + raw_counters.generate_durations = std::vector(); + raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time)); + raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_microsec(encode_stop_time - start_time)); + raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_stop_time - decode_start_time)); + + decoded_results.perf_metrics.evaluate_statistics(start_time); return decoded_results; } EncodedResults generate( const EncodedInputs& inputs, - OptionalGenerationConfig generation_config, + OptionalGenerationConfig generation_config, StreamerVariant streamer - ) { + ) override { + auto start_time = std::chrono::steady_clock::now(); ov::Tensor input_ids; ov::Tensor attention_mask; - if (auto data = std::get_if(&inputs)) { input_ids = *data; attention_mask = ov::genai::utils::init_attention_mask(input_ids); @@ -177,14 +191,14 @@ class LLMPipeline::LLMPipelineImpl { input_ids = data->input_ids; attention_mask = data->attention_mask; } - + GenerationConfig config = (generation_config.has_value()) ? 
*generation_config : m_generation_config; - + // If eos_token_id was not provided, take value from default m_generation_config if (config.eos_token_id == -1) config.eos_token_id = m_generation_config.eos_token_id; config.validate(); - + std::shared_ptr streamer_ptr; if (auto streamer_obj = std::get_if(&streamer)) { streamer_ptr = nullptr; @@ -206,47 +220,116 @@ class LLMPipeline::LLMPipelineImpl { "(input_ids, attention_mask, position_ids, beam_idx) " "but you have '" + std::to_string(num_inputs) + "' inputs"); + + size_t kv_cache_len = 0; + ov::Tensor concatenated_attention_mask; + if (is_chat_conversation && !m_is_cache_empty) { + OPENVINO_ASSERT(batch_size == 1, "continuation of generation is possible only for batch 1"); + // If history is saved in KV cache, concatenate new attention_mask with the already existing. + // Between subsequent runs attention_mask should not be modified. + auto atten_mask_history = m_model_runner.get_tensor("attention_mask"); + auto prompt_len = attention_mask.get_shape()[1]; + kv_cache_len = atten_mask_history.get_shape()[1]; + + ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}}; + auto start_atten_hst = atten_mask_history.data() + kv_cache_len * (*m_selected_beam); + std::copy(start_atten_hst, start_atten_hst + kv_cache_len, + new_atten_mask.data()); + std::copy(attention_mask.data(), attention_mask.data() + prompt_len, + new_atten_mask.data() + kv_cache_len); + concatenated_attention_mask = new_atten_mask; + } else { + concatenated_attention_mask = attention_mask; + } + + bool position_ids_available = (num_inputs == 4); + std::optional position_ids = std::nullopt; + if (position_ids_available) { + position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; + utils::initialize_position_ids(*position_ids, attention_mask, kv_cache_len); + } + ov::genai::EncodedResults result; if (config.is_greedy_decoding()) { - result = ov::genai::greedy_decoding(m_model_runner, input_ids, 
attention_mask, - config, streamer_ptr, - is_chat_conversation, m_is_cache_empty); + result = ov::genai::greedy_decoding(m_model_runner, input_ids, concatenated_attention_mask, + config, streamer_ptr, position_ids); + m_selected_beam = 0; } else if (config.is_beam_search()) { - result = beam_search(m_model_runner, input_ids, attention_mask, config); + std::tie(result, m_selected_beam) = beam_search(m_model_runner, input_ids, concatenated_attention_mask, + config, position_ids, m_selected_beam); } else if (config.is_multinomial()) { - result = multinominal_decoding(m_model_runner, input_ids, attention_mask, config, streamer_ptr); + result = multinominal_decoding(m_model_runner, input_ids, concatenated_attention_mask, + config, streamer_ptr, position_ids); + m_selected_beam = 0; } else { OPENVINO_THROW("No decoding algorithm found for provided configuration parameters."); } if (!is_chat_conversation) { m_model_runner.reset_state(); + m_selected_beam = std::nullopt; } else { m_is_cache_empty = false; } - - return result; + auto stop_time = std::chrono::steady_clock::now(); + + // If is called without tokenization then that stat will not be reported. 
+ auto& metrics = result.perf_metrics; + metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1); + metrics.load_time = this->m_load_time_ms; + metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time)); + metrics.evaluate_statistics(start_time); + return result; + } + + void start_chat(const std::string& system_message) override { + is_chat_conversation = true; + m_selected_beam = std::nullopt; + if (!m_is_cache_empty) { + m_model_runner.reset_state(); + m_is_cache_empty = true; + m_history = {}; + m_templated_chat_history = ""; + } + if (system_message.empty()) + return; + + m_history.push_back({{"role", "system"}, {"content", system_message}}); + constexpr bool add_generation_prompt = false; + + m_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + } + + void finish_chat() override { + is_chat_conversation = false; + m_selected_beam = std::nullopt; + if (!m_is_cache_empty) { + m_model_runner.reset_state(); + m_is_cache_empty = true; + m_history = {}; + m_templated_chat_history = ""; + } } }; DecodedResults LLMPipeline::generate( - StringInputs inputs, - OptionalGenerationConfig generation_config, + StringInputs inputs, + OptionalGenerationConfig generation_config, StreamerVariant streamer ) { return m_pimpl->generate(inputs, generation_config, streamer); } DecodedResults LLMPipeline::generate(StringInputs text, const ov::AnyMap& config_map) { - auto config_arg = get_config_from_map(config_map); + auto config_arg = utils::get_config_from_map(config_map); GenerationConfig config = (config_arg.has_value()) ? 
*config_arg : get_generation_config(); config.update_generation_config(config_map); - return m_pimpl->generate(text, config, get_streamer_from_map(config_map)); + return m_pimpl->generate(text, config, utils::get_streamer_from_map(config_map)); } EncodedResults LLMPipeline::generate( - const EncodedInputs& inputs, + const EncodedInputs& inputs, OptionalGenerationConfig generation_config, StreamerVariant streamer ) { @@ -254,37 +337,184 @@ EncodedResults LLMPipeline::generate( } EncodedResults LLMPipeline::generate(const EncodedInputs& inputs, const ov::AnyMap& config_map) { - auto config_arg = get_config_from_map(config_map); + auto config_arg = utils::get_config_from_map(config_map); GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config(); config.update_generation_config(config_map); - return m_pimpl->generate(inputs, config, get_streamer_from_map(config_map)); + return m_pimpl->generate(inputs, config, utils::get_streamer_from_map(config_map)); } std::pair streamer(StreamerVariant func) { if (auto streamer_obj = std::get_if>(&func)) { - return {STREAMER_ARG_NAME, Any::make>(*streamer_obj)}; + return {utils::STREAMER_ARG_NAME, Any::make>(*streamer_obj)}; } else { auto callback = std::get>(func); - return {STREAMER_ARG_NAME, Any::make>(callback)}; - } + return {utils::STREAMER_ARG_NAME, Any::make>(callback)}; + } } std::pair generation_config(const GenerationConfig& config) { - return {CONFIG_ARG_NAME, Any::make(config)}; + return {utils::CONFIG_ARG_NAME, Any::make(config)}; } } // namespace genai } // namespace ov -using namespace std; +namespace { +using namespace ov::genai; + +template struct overloaded : Ts... {using Ts::operator()...;}; +template overloaded(Ts...) 
-> overloaded; + +Tokenizer dont_construct() { + OPENVINO_THROW("Continuous Batching backend can't be constructed" + "from ireq because the model must be transformed"); +} + +class ContinuousBatchingAdapter final : public LLMPipelineImplBase { +public: + ContinuousBatchingPipeline m_impl; + + ContinuousBatchingAdapter( + const ov::InferRequest& request, + const Tokenizer& tokenizer, + OptionalGenerationConfig generation_config + ): LLMPipelineImplBase{dont_construct()}, m_impl{"", {}} {} + + ContinuousBatchingAdapter( + const std::filesystem::path& model_path, + const Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& plugin_config + ): LLMPipelineImplBase{tokenizer}, m_impl{ + model_path.string(), + tokenizer, + SchedulerConfig{}, + device, + plugin_config + } {} + + ContinuousBatchingAdapter( + const std::filesystem::path& model_path, + const std::string& device, + const ov::AnyMap& plugin_config + ): LLMPipelineImplBase{Tokenizer(model_path.string())}, m_impl{ + model_path.string(), + m_tokenizer, + SchedulerConfig{}, + device, + plugin_config + } {} + + DecodedResults generate( + StringInputs inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) override { + std::vector prompts = std::visit(overloaded{ + [](const std::string& prompt) { + return std::vector{prompt}; + }, + [](std::vector& prompts) { + return prompts; + } + }, inputs); + const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config; + // -1 == config.eos_token_id and config.validate() are handled in m_impl. 
+ std::vector generated = m_impl.generate( + prompts, + std::vector{prompts.size(), config}, + streamer + ); + std::vector plain_replies; + std::vector plain_scores; + for (GenerationResult& res : generated) { + if (GenerationStatus::FINISHED != res.m_status) { + OPENVINO_THROW("Got unfinished GenerationStatus"); + } + std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_replies)); + std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores)); + } + return {std::move(plain_replies), std::move(plain_scores)}; + } + + EncodedResults generate( + const EncodedInputs& inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) override { + std::vector input_ids = std::visit(overloaded{ + [](const ov::Tensor& inp) { + size_t batch_size = inp.get_shape().at(0); + if (1 == batch_size) { + return std::vector{inp}; + } + std::vector input_ids; + input_ids.reserve(batch_size); + size_t max_len = inp.get_shape().at(1); + const int64_t* const source = inp.data(); + for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) { + input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len)); + int64_t* destination = input_ids.back().data(); + std::copy_n(source + batch_id * max_len, max_len, destination); + } + return input_ids; + }, + [](const TokenizedInputs& inp) { + size_t batch_size = inp.input_ids.get_shape().at(0); + std::vector input_ids; + input_ids.reserve(batch_size); + size_t max_len = inp.input_ids.get_shape().at(1); + const int64_t* const source = inp.input_ids.data(); + const int64_t* const attention_mask = inp.attention_mask.data(); + for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) { + input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len)); + int64_t* destination = input_ids.back().data(); + size_t copy_count = 0; + for (size_t idx = 0; idx < max_len; ++idx) { + if (1 == attention_mask[batch_id * max_len + idx]) { + destination[copy_count++] = 
source[batch_id * max_len + idx]; + } + } + input_ids.back().set_shape({1, copy_count}); + } + return input_ids; + } + }, inputs); + const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config; + // -1 == config.eos_token_id and config.validate() are handled in m_impl. + std::vector generated = m_impl.generate(input_ids, std::vector{input_ids.size(), config}, streamer); + std::vector> plain_tokens; + std::vector plain_scores; + for (EncodedGenerationResult& res : generated) { + if (GenerationStatus::FINISHED != res.m_status) { + OPENVINO_THROW("Got unfinished GenerationStatus"); + } + std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_tokens)); + std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores)); + } + return {std::move(plain_tokens), std::move(plain_scores)}; + } + + void start_chat(const std::string& system_message) override { + m_impl.start_chat(); + }; + + void finish_chat() override { + m_impl.finish_chat(); + }; +}; +} ov::genai::LLMPipeline::LLMPipeline( - const ov::InferRequest& request, - const ov::genai::Tokenizer& tokenizer, + const ov::InferRequest& request, + const ov::genai::Tokenizer& tokenizer, OptionalGenerationConfig generation_config ) { - m_pimpl = std::make_unique(request, tokenizer, generation_config); + auto start_time = std::chrono::steady_clock::now(); + m_pimpl = std::make_unique(request, tokenizer, generation_config); + auto stop_time = std::chrono::steady_clock::now(); + m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); } ov::genai::LLMPipeline::LLMPipeline( @@ -292,16 +522,34 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::genai::Tokenizer& tokenizer, const std::string& device, const ov::AnyMap& plugin_config -) { - m_pimpl = make_unique(std::filesystem::path(model_path), tokenizer, device, plugin_config); +){ + auto start_time = std::chrono::steady_clock::now(); + if ("CB" == 
device) { + m_pimpl = std::make_unique(model_path, tokenizer, "CPU", plugin_config); + } else if ("NPU" == device) { + m_pimpl = std::make_unique(model_path, tokenizer, device, plugin_config); + } else { + m_pimpl = std::make_unique(model_path, tokenizer, device, plugin_config); + } + auto stop_time = std::chrono::steady_clock::now(); + m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); } ov::genai::LLMPipeline::LLMPipeline( - const std::string& path, - const std::string& device, + const std::string& path, + const std::string& device, const ov::AnyMap& config -) { - m_pimpl = make_unique(std::filesystem::path(path), device, config); +){ + auto start_time = std::chrono::steady_clock::now(); + if ("CB" == device) { + m_pimpl = std::make_unique(path, "CPU", config); + } else if ("NPU" == device) { + m_pimpl = std::make_unique(path, device, config); + } else { + m_pimpl = std::make_unique(path, device, config); + } + auto stop_time = std::chrono::steady_clock::now(); + m_pimpl->m_load_time_ms = std::chrono::duration_cast(stop_time - start_time).count(); } ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { @@ -312,29 +560,21 @@ ov::genai::Tokenizer ov::genai::LLMPipeline::get_tokenizer() { return m_pimpl->m_tokenizer; } -void ov::genai::LLMPipeline::start_chat() { - m_pimpl->is_chat_conversation = true; - if (!m_pimpl->m_is_cache_empty) { - m_pimpl->m_model_runner.reset_state(); - m_pimpl->m_is_cache_empty = true; - } +void ov::genai::LLMPipeline::start_chat(const std::string& system_message) { + m_pimpl->start_chat(system_message); } void ov::genai::LLMPipeline::finish_chat() { - m_pimpl->is_chat_conversation = false; - if (!m_pimpl->m_is_cache_empty) { - m_pimpl->m_model_runner.reset_state(); - m_pimpl->m_is_cache_empty = true; - } + m_pimpl->finish_chat(); } void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& config) { - int64_t default_eos_token_id = 
m_pimpl->m_generation_config.eos_token_id;; + int64_t default_eos_token_id = m_pimpl->m_generation_config.eos_token_id; m_pimpl->m_generation_config = config; // if eos_token_id was not provided in config forward from default config if (config.eos_token_id == -1) m_pimpl->m_generation_config.eos_token_id = default_eos_token_id; - + m_pimpl->m_generation_config.validate(); } diff --git a/src/cpp/src/llm_pipeline_base.hpp b/src/cpp/src/llm_pipeline_base.hpp new file mode 100644 index 0000000000..7e58cd3b37 --- /dev/null +++ b/src/cpp/src/llm_pipeline_base.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/genai/llm_pipeline.hpp" +#include "openvino/genai/tokenizer.hpp" +#include "openvino/genai/streamer_base.hpp" + +namespace ov { +namespace genai { + +class LLMPipelineImplBase { +public: + LLMPipelineImplBase(const Tokenizer& tokenizer, + const GenerationConfig& config = {}) + : m_tokenizer(tokenizer), m_generation_config(config) { + } + + virtual DecodedResults generate( + StringInputs inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) = 0; + + virtual EncodedResults generate( + const EncodedInputs& inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) = 0; + + virtual void start_chat(const std::string& system_message) = 0; + virtual void finish_chat() = 0; + + virtual ~LLMPipelineImplBase() = default; + + Tokenizer m_tokenizer; + GenerationConfig m_generation_config; + + float m_load_time_ms = 0; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp new file mode 100644 index 0000000000..d05d928df6 --- /dev/null +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -0,0 +1,373 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "llm_pipeline_static.hpp" + +#include "openvino/opsets/opset13.hpp" + 
+#include "text_callback_streamer.hpp" +#include "utils.hpp" + +#include + +namespace { + +void align_u4_zp_constants(const std::shared_ptr& model) { + for (auto op : model->get_ops()) { + if (ov::op::util::is_constant(op)) { + auto cst_op = std::dynamic_pointer_cast(op); + const auto cst_op_out = cst_op->output(0); + if (cst_op_out.get_element_type() == ov::element::u4 && ov::shape_size(cst_op_out.get_shape()) == 1u) { + ov::Tensor cst_tensor(ov::element::u4, cst_op_out.get_shape()); + *static_cast(cst_tensor.data()) = cst_op->get_vector()[0] & 0x0f; + auto new_cst_op = std::make_shared(cst_tensor); + for (auto target_input : cst_op_out.get_target_inputs()) { + target_input.replace_source_output(new_cst_op); + } + } + } + } +} + +std::shared_ptr add_slices_to_kvcache_inputs(const std::shared_ptr& model) { + const auto kvcache_name_pattern = "past_key_values"; + std::vector> new_params; + for (auto param : model->get_parameters()) { + auto tensor_name = param->get_output_tensor(0).get_any_name(); + if (tensor_name.find(kvcache_name_pattern) == std::string::npos) { + new_params.push_back(param); + continue; + } + auto shape = param->get_output_shape(0); + shape[2] += 1; + + auto new_param = std::make_shared(param->get_element_type(), shape); + new_param->set_friendly_name(tensor_name); + new_param->outputs().begin()->get_tensor().set_names(param->outputs().begin()->get_tensor().get_names()); + + auto slice_start = std::make_shared( + ov::element::Type_t::i32, ov::Shape{1}, std::vector{1} + ); + auto slice_stop = std::make_shared( + ov::element::Type_t::i32, ov::Shape{1}, std::vector{static_cast(shape[2])} + ); + auto slice_step = std::make_shared( + ov::element::Type_t::i32, ov::Shape{1}, std::vector{1} + ); + auto slice_axes = std::make_shared( + ov::element::Type_t::i32, ov::Shape{1}, std::vector{2} + ); + auto slice_node = std::make_shared( + new_param, slice_start->output(0), slice_stop->output(0), slice_step->output(0), slice_axes->output(0) + ); + 
slice_node->set_friendly_name(tensor_name + "_Slice"); + for (auto target_input : param->output(0).get_target_inputs()) { + target_input.replace_source_output(slice_node->output(0)); + } + new_params.push_back(new_param); + } + return std::make_shared(model->get_results(), ov::SinkVector{}, new_params); +} + +void reshape_to_static(std::shared_ptr model, + const uint32_t input_size, + const uint32_t kvcache_size) { + std::map new_shapes; + for (auto input : model->inputs()) { + const auto& input_name = input.get_any_name(); + ov::PartialShape new_shape; + if (input_name.find("input_ids") != std::string::npos) { + new_shape = ov::PartialShape({1, input_size}); + } else if (input_name.find("attention_mask") != std::string::npos) { + new_shape = ov::PartialShape({1, kvcache_size}); + } else if (input_name.find("position_ids") != std::string::npos) { + new_shape = ov::PartialShape({1, input_size}); + } else { + const auto& partial_shape = input.get_partial_shape(); + new_shape = ov::PartialShape({1, + partial_shape[1].get_length(), + kvcache_size-input_size, + partial_shape[3].get_length()}); + } + new_shapes.emplace(input_name, new_shape); + } + model->reshape(new_shapes); +} + +void fill_tensor(ov::Tensor tensor, int64_t fill_val, size_t offset = 0u) { + int64_t* tensor_data = tensor.data(); + std::fill(tensor_data + offset, tensor_data + tensor.get_size(), fill_val); +} + +void copy_with_offset(const ov::Tensor& orig, const int32_t offset, ov::Tensor& padded) { + int64_t* orig_data = orig.data(); + int64_t* padded_data = padded.data(); + std::copy(orig_data, orig_data + orig.get_size(), padded_data + offset); +} + +ov::AnyMap extract_config_or_default(const ov::AnyMap& config, const std::string& config_name) { + ov::AnyMap stage_cfg; + if (auto it = config.find(config_name); it != config.end()) { + const auto& map = it->second.as>(); + stage_cfg = { map.begin(), map.end() }; + } else if (config_name == "PREFILL_CONFIG") { + std::map prefill_config = { + { 
"NPU_USE_NPUW", "YES" }, + { "NPUW_FOLD", "YES" }, + { "NPUW_DCOFF_TYPE", "f16" }, + { "NPUW_DCOFF_SCALE", "YES" }, + { "NPUW_ONLINE_AVOID", "P:RMSNorm/NPU" } + }; + stage_cfg.insert(prefill_config.begin(), prefill_config.end()); + } else if (config_name == "GENERATE_CONFIG") { + std::map generate_config = { + { "NPU_USE_NPUW", "YES" }, + { "NPUW_FOLD", "YES" }, + { "NPUW_DCOFF_TYPE", "f16" }, + { "NPUW_DCOFF_SCALE", "YES" }, + { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add" }, + { "NPUW_PARALLEL_COMPILE", "YES" }, + { "NPUW_FUNCALL_ASYNC", "YES" } + }; + stage_cfg.insert(generate_config.begin(), generate_config.end()); + } + return stage_cfg; +} + +} // anonymous namespace + +namespace ov { +namespace genai { + +StaticLLMPipeline::StaticLLMPipeline( + const std::filesystem::path& path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& config +) : LLMPipelineImplBase(tokenizer, + utils::from_config_json_if_exists(path)) { + /* NB: Static LLM pipeline consists of two models, + first to process the input prompt (prefill), second to use in generation loop (kvcache) + + Initialization assumes multiple steps: + 1) Read the template model - this will be kvcache model + 2) Expose KV-cache input and output layers from kvcache model + 3) Clone the model - this will be prefill + 3) Reshape both models to static shape + 4) Add slices to KV-cache inputs for kvcache model, this will make input and output KV-cache + layers to have the same shape and allow outputs writes directly to inputs for the next iteration. 
+ 5) Compile both models + 6) Initialize input tensors for kvcache and prefill models + */ + ov::Core core; + // (1) Read the template model - this will be kvcache model + auto kvcache_model = core.read_model(path / "openvino_model.xml"); + // (2) TODO: Expose KV-cache input and output layers from kvcache model + // (3) Clone the model - this will be prefill + m_prefill_model = m_kvcache_model->clone(); + m_prefill_model->set_friendly_name(m_kvcache_model->get_friendly_name() + "_prefill"); + // (4) Reshape both models to static shape + m_kvcache_desc = KVCacheDesc { 1024u, 0u }; + const uint32_t max_prompt_size = m_kvcache_desc.total_size; + const uint32_t max_kvcache_size = m_kvcache_desc.total_size; + reshape_to_static(m_prefill_model, max_prompt_size, max_kvcache_size); + reshape_to_static(m_kvcache_model, 1u, max_kvcache_size); + // (5) Add slices to kvcache model + m_kvcache_model = add_slices_to_kvcache_inputs(m_kvcache_model); + // (6) Compile both model + m_prefill_request = core.compile_model( + m_prefill_model, device, extract_config_or_default(config, "PREFILL_CONFIG") + ).create_infer_request(); + m_kvcache_request = core.compile_model( + kvcache_model, device, extract_config_or_default(config, "GENERATE_CONFIG") + ).create_infer_request(); + // (7) Initialize tensors + prepare_for_new_conversation(); +}; + +StaticLLMPipeline::StaticLLMPipeline( + const std::filesystem::path& path, + const std::string& device, + const ov::AnyMap& config +) : StaticLLMPipeline(path, path.string(), device, config) { +} + +void StaticLLMPipeline::start_chat(const std::string& system_message) { + if (!system_message.empty()) { + m_history.push_back({{"role", "system"}, {"content", system_message}}); + } + m_is_chat_conversation = true; +}; + +void StaticLLMPipeline::finish_chat() { + m_is_chat_conversation = false; + m_history.clear(); +}; + +void StaticLLMPipeline::prepare_for_new_conversation() { + fill_tensor(m_prefill_request.get_tensor("input_ids"), 
m_tokenizer.get_pad_token_id()); + fill_tensor(m_prefill_request.get_tensor("position_ids"), 0u); + fill_tensor(m_prefill_request.get_tensor("attention_mask"), 0u); + fill_tensor(m_kvcache_request.get_tensor("attention_mask"), 0u); + m_kvcache_desc.num_stored_tokens = 0u; +} + +DecodedResults StaticLLMPipeline::generate( + StringInputs inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer +) { + GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; + if (std::holds_alternative>(inputs)) { + OPENVINO_THROW("Currently only batch size=1 is supported"); + } + + OPENVINO_ASSERT(std::holds_alternative(inputs)); + auto& prompt = std::get(inputs); + + if (m_is_chat_conversation) { + m_history.push_back({{"role", "user"}, {"content", prompt}}); + constexpr bool add_generation_prompt = true; + prompt = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + } + + auto tokenized_input = m_tokenizer.encode(prompt); + auto encoded_results = generate(tokenized_input, config, streamer); + DecodedResults decoded_results = {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores}; + + if (m_is_chat_conversation) { + auto answer = decoded_results.texts[0]; + m_history.push_back({{"role", "assistant"}, {"content", answer}}); + } + return decoded_results; +} + +EncodedResults StaticLLMPipeline::generate( + const EncodedInputs& inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer +) { + ov::Tensor input_ids; + ov::Tensor attention_mask; + + if (auto data = std::get_if(&inputs)) { + input_ids = *data; + attention_mask = ov::genai::utils::init_attention_mask(input_ids); + } else if (auto data = std::get_if(&inputs)) { + input_ids = data->input_ids; + attention_mask = data->attention_mask; + } + + if (input_ids.get_shape().at(0) > 1u) { + OPENVINO_THROW("Currently only batch size=1 is supported"); + } + + GenerationConfig config = (generation_config.has_value()) ? 
*generation_config : m_generation_config; + // If eos_token_id was not provided, take value from default m_generation_config + if (config.eos_token_id == -1) + config.set_eos_token_id(m_generation_config.eos_token_id); + config.validate(); + + std::shared_ptr streamer_ptr; + if (auto streamer_obj = std::get_if(&streamer)) { + streamer_ptr = nullptr; + } else if (auto streamer_obj = std::get_if>(&streamer)) { + streamer_ptr = *streamer_obj; + } else if (auto callback = std::get_if>(&streamer)) { + streamer_ptr = std::make_shared(m_tokenizer, *callback); + } + + if (!config.is_greedy_decoding()) { + OPENVINO_THROW("Currently only greedy decoding is supported"); + } + + ov::genai::EncodedResults results; + // NB: Only batch=1 is supported now + results.scores.resize(1u); + results.scores[0] = 0u; + results.tokens.resize(1u); + + // NB: Check if there is enough space in KV-cache to process input prompt + auto prompt_len = input_ids.get_size(); + if (prompt_len > m_kvcache_desc.total_size) { + OPENVINO_THROW("Currently static pipeline only process up to " + std::to_string(m_kvcache_desc.total_size) + " tokens"); + } + + // NB: From the "generate" perspective, every call is treated as start of new conversation, + // but if continuation is needed, prompt contains information about the entire conversation. 
+ prepare_for_new_conversation(); + + auto padded_input_ids = m_prefill_request.get_tensor("input_ids"); + const size_t offset = padded_input_ids.get_size() - input_ids.get_size(); + copy_with_offset(input_ids, offset, padded_input_ids); + + auto padded_attention_mask = m_prefill_request.get_tensor("attention_mask"); + fill_tensor(padded_attention_mask, 1u, offset); + + auto padded_position_ids = m_prefill_request.get_tensor("position_ids"); + auto* padded_pos_data = padded_position_ids.data(); + std::iota(padded_pos_data + (m_kvcache_desc.total_size - prompt_len + 1), padded_pos_data + padded_position_ids.get_size(), 0u); + + m_prefill_request.infer(); + + // NB: Now there are prompt_len tokens in KV-cache + m_kvcache_desc.num_stored_tokens += prompt_len; + int64_t last_token = utils::argmax(m_prefill_request.get_tensor("logits"), 0); + results.tokens[0].push_back(last_token); + if (streamer_ptr && streamer_ptr->put(last_token)) { + return results; + } + + padded_attention_mask.copy_to(m_kvcache_request.get_tensor("attention_mask")); + + // Inputs: input_ids, attention_mask, position_ids, ... + // Outputs: logits, ... 
+ const auto kStartInputKVCacheLayers = 3u; + const auto kStartOutputKVCacheLayers = 1u; + + const auto& kvcache_compiled = m_kvcache_request.get_compiled_model(); + for (int i = 0; i < kvcache_compiled.outputs().size() - 1; ++i) { + const auto& input_name = kvcache_compiled.inputs()[kStartInputKVCacheLayers + i].get_any_name(); + const auto& output_name = kvcache_compiled.outputs()[kStartOutputKVCacheLayers + i].get_any_name(); + auto kvcache_out_tensor = m_kvcache_request.get_tensor(output_name); + m_kvcache_request.set_tensor(input_name, kvcache_out_tensor); + auto prefill_tensor = m_prefill_request.get_tensor(output_name); + auto kvcache_tensor = m_kvcache_request.get_tensor(input_name); + prefill_tensor.copy_to(kvcache_tensor); + } + + auto* input_ids_data = m_kvcache_request.get_tensor("input_ids").data(); + auto* position_ids_data = m_kvcache_request.get_tensor("position_ids").data(); + auto* attention_mask_data = m_kvcache_request.get_tensor("attention_mask").data(); + + const size_t max_tokens = config.get_max_new_tokens(prompt_len); + for (int i = 0; i < max_tokens - 1; ++i) { + input_ids_data[0] = last_token; + position_ids_data[0] = m_kvcache_desc.num_stored_tokens; + attention_mask_data[m_kvcache_desc.total_size - m_kvcache_desc.num_stored_tokens - 1] = 1u; + + m_kvcache_request.infer(); + m_kvcache_desc.num_stored_tokens += 1; + + last_token = utils::argmax(m_kvcache_request.get_tensor("logits"), 0); + results.tokens[0].push_back(last_token); + + if (streamer_ptr && streamer_ptr->put(last_token)) { + break; + } + + if (last_token == config.eos_token_id && !config.ignore_eos) { + break; + } + + // NB: KV-cache is full, further generation is impossible + if (m_kvcache_desc.num_stored_tokens == m_kvcache_desc.total_size) { + break; + } + + } + return results; +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp new file mode 100644 index 0000000000..7560b7e336 --- /dev/null 
+++ b/src/cpp/src/llm_pipeline_static.hpp @@ -0,0 +1,62 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "llm_pipeline_base.hpp" + +namespace ov { +namespace genai { + +class StaticLLMPipeline final : public LLMPipelineImplBase { +public: + StaticLLMPipeline( + const std::filesystem::path& path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& config + ); + + StaticLLMPipeline( + const std::filesystem::path& path, + const std::string& device, + const ov::AnyMap& config + ); + + DecodedResults generate( + StringInputs inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) override; + + EncodedResults generate( + const EncodedInputs& inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) override; + + void start_chat(const std::string& system_message) override; + void finish_chat() override; +private: + void prepare_for_new_conversation(); + +private: + struct KVCacheDesc { + uint32_t total_size; + uint32_t num_stored_tokens; + }; + + // FIXME: Ideally, we don't need to keep those + std::shared_ptr m_kvcache_model; + std::shared_ptr m_prefill_model; + + KVCacheDesc m_kvcache_desc; + ov::InferRequest m_kvcache_request; + ov::InferRequest m_prefill_request; + + bool m_is_chat_conversation = false; + ChatHistory m_history; +}; + +} // namespace genai +} // namespace ov diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/logit_processor.hpp b/src/cpp/src/logit_processor.hpp similarity index 64% rename from text_generation/causal_lm/cpp/continuous_batching/library/src/logit_processor.hpp rename to src/cpp/src/logit_processor.hpp index ab151e55aa..cb3ffb37c0 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/logit_processor.hpp +++ b/src/cpp/src/logit_processor.hpp @@ -6,7 +6,7 @@ #include #include -#include "generation_config.hpp" +#include 
"openvino/genai/generation_config.hpp" struct Token { float m_log_prob = 0.; @@ -21,7 +21,7 @@ using TokenIds = std::vector; class ILogitTransformer { public: - virtual std::vector apply(const std::vector& input_logits) = 0; + virtual void apply(std::vector& logits) = 0; virtual bool is_applicable(size_t generated_tokens_cnt = 0) { return true; @@ -32,18 +32,16 @@ class TopPFilter : public ILogitTransformer { public: TopPFilter(double top_p) : m_top_p(top_p) {} - std::vector apply(const std::vector& input_probs) override { - std::vector tmp(input_probs); - std::sort(tmp.begin(), tmp.end(), [](const Token& lhs, const Token& rhs) {return lhs.m_log_prob > rhs.m_log_prob; }); + void apply(std::vector& logits) override { + std::sort(logits.begin(), logits.end(), [](const Token& lhs, const Token& rhs) {return lhs.m_log_prob > rhs.m_log_prob; }); float probability_sum = 0.0f; size_t nucleus_size = 0; - for (const auto& probability : tmp) { + for (const auto& probability : logits) { probability_sum += probability.m_log_prob; nucleus_size += 1; if (probability_sum > m_top_p) break; } - tmp.resize(nucleus_size); - return tmp; + logits.resize(nucleus_size); } protected: @@ -54,12 +52,10 @@ class TopKFilter : public ILogitTransformer { public: TopKFilter(size_t top_k) : m_top_k(top_k) {} - std::vector apply(const std::vector& input_probs) override { - std::vector tmp(input_probs); - std::sort(tmp.begin(), tmp.end(), [](const Token& lhs, const Token& rhs) {return lhs.m_log_prob > rhs.m_log_prob; }); - size_t top_k = input_probs.size() >= m_top_k ? m_top_k : input_probs.size(); - tmp.resize(top_k); - return tmp; + void apply(std::vector& logits) override { + std::sort(logits.begin(), logits.end(), [](const Token& lhs, const Token& rhs) {return lhs.m_log_prob > rhs.m_log_prob; }); + size_t top_k = logits.size() >= m_top_k ? 
m_top_k : logits.size(); + logits.resize(top_k); } protected: @@ -70,20 +66,18 @@ class TemperatureLogitTransform : public ILogitTransformer { public: TemperatureLogitTransform(double temperature) : m_temperature(temperature) {}; - std::vector apply(const std::vector& input_logits) override { - std::vector output(input_logits.begin(), input_logits.end()); - std::sort(output.begin(), output.end(), [](const Token& lhs, const Token& rhs) {return lhs.m_log_prob > rhs.m_log_prob; }); - float max_logit = output[0].m_log_prob; + void apply(std::vector& logits) override { + auto max_prob_token = std::max_element(logits.begin(), logits.end(), [](const Token& lhs, const Token& rhs) { return lhs.m_log_prob < rhs.m_log_prob; }); + float max_logit = max_prob_token->m_log_prob; - std::for_each(output.begin(), output.end(), [max_logit, this](Token& val) {val.m_log_prob = expf((val.m_log_prob - max_logit) / this->m_temperature);}); + std::for_each(logits.begin(), logits.end(), [max_logit, this](Token& val) {val.m_log_prob = expf((val.m_log_prob - max_logit) / this->m_temperature);}); float norm_sum = 0.0; - for (const auto& val : output) { + for (const auto& val : logits) { norm_sum += val.m_log_prob; } - std::for_each(output.begin(), output.end(), [norm_sum](Token& val) {val.m_log_prob /= norm_sum;}); - return output; + std::for_each(logits.begin(), logits.end(), [norm_sum](Token& val) {val.m_log_prob /= norm_sum;}); } protected: @@ -124,37 +118,35 @@ class RepetitionPenaltyTransform : public IPenaltyTransformer { m_penalty = repetition_penalty; }; - std::vector apply(const std::vector& input_logits) override { - std::vector output(input_logits.begin(), input_logits.end()); - size_t vocab_size = input_logits.size(); + void apply(std::vector& logits) override { + size_t vocab_size = logits.size(); for (const auto& prompt_id : *m_unique_prompt_token_ids) { OPENVINO_ASSERT((prompt_id >= 0) && (prompt_id < vocab_size), "input_ids token out of bounds"); - 
OPENVINO_ASSERT(input_logits[prompt_id].m_index == prompt_id, "input_logits must have original index order"); - auto logit_value = output[prompt_id].m_log_prob; + OPENVINO_ASSERT(logits[prompt_id].m_index == prompt_id, "input_logits must have original index order"); + auto logit_value = logits[prompt_id].m_log_prob; if (logit_value >= 0) { - output[prompt_id].m_log_prob /= m_penalty; + logits[prompt_id].m_log_prob /= m_penalty; } else { - output[prompt_id].m_log_prob *= m_penalty; + logits[prompt_id].m_log_prob *= m_penalty; }; } for (const auto& input_id_pair : *m_unique_generated_token_ids) { const auto& input_id = input_id_pair.first; OPENVINO_ASSERT((input_id >= 0) && (input_id < vocab_size), "input_ids token out of bounds"); - OPENVINO_ASSERT(input_logits[input_id].m_index == input_id, "input_logits must have original index order"); - auto logit_value = output[input_id].m_log_prob; + OPENVINO_ASSERT(logits[input_id].m_index == input_id, "input_logits must have original index order"); + auto logit_value = logits[input_id].m_log_prob; if (logit_value >= 0) { - output[input_id].m_log_prob /= m_penalty; + logits[input_id].m_log_prob /= m_penalty; } else { - output[input_id].m_log_prob *= m_penalty; + logits[input_id].m_log_prob *= m_penalty; }; } - return output; } - std::vector apply(const std::vector& input_logits, const TokenIds& input_ids) { + void apply(std::vector& logits, const TokenIds& input_ids) { set_unique_prompt_token_ids(nullptr); extract_generated_tokens(input_ids); - return apply(input_logits); + apply(logits); } void set_unique_prompt_token_ids(const std::shared_ptr>& unique_prompt_token_ids) { @@ -174,14 +166,10 @@ class EOSPenaltyTransform : public ILogitTransformer { EOSPenaltyTransform(size_t eos_token_id, size_t min_generated_tokens) : m_eos_token_id(eos_token_id), m_applicable_tensor_len(min_generated_tokens) {} - std::vector apply(const std::vector& input_logits) { - std::vector output(input_logits.begin(), input_logits.end()); - for (auto& 
token_id : output) { - if (token_id.m_index == m_eos_token_id) { - token_id.m_log_prob = 0.f; - } - } - return output; + void apply(std::vector& logits) override { + // Since EOS penalty is applied early, the token vector is not sorted + // and we can assume element order match token ids. + logits[m_eos_token_id].m_log_prob = 0.f; } @@ -200,26 +188,24 @@ class FrequencyPenaltyTransform : public IPenaltyTransformer { m_penalty = value; }; - std::vector apply(const std::vector& input_logits) override { - std::vector output(input_logits.begin(), input_logits.end()); - size_t vocab_size = input_logits.size(); + void apply(std::vector& logits) override { + size_t vocab_size = logits.size(); for (const auto& input_id_pair : *m_unique_generated_token_ids) { const auto& input_id = input_id_pair.first; OPENVINO_ASSERT((input_id >= 0) && (input_id < vocab_size), "input_ids token out of bounds"); - OPENVINO_ASSERT(input_logits[input_id].m_index == input_id, "input_logits must have original index order"); - auto logit_value = output[input_id].m_log_prob; + OPENVINO_ASSERT(logits[input_id].m_index == input_id, "input_logits must have original index order"); + auto logit_value = logits[input_id].m_log_prob; if (logit_value >= 0) { - output[input_id].m_log_prob -= m_penalty * input_id_pair.second; + logits[input_id].m_log_prob -= m_penalty * input_id_pair.second; } else { - output[input_id].m_log_prob += m_penalty * input_id_pair.second; + logits[input_id].m_log_prob += m_penalty * input_id_pair.second; }; } - return output; } - std::vector apply(const std::vector& input_logits, const TokenIds& input_ids) { + void apply(std::vector& logits, const TokenIds& input_ids) { extract_generated_tokens(input_ids); - return apply(input_logits); + apply(logits); } }; @@ -229,40 +215,24 @@ class PresencePenaltyTransform : public IPenaltyTransformer { m_penalty = value; }; - std::vector apply(const std::vector& input_logits) override { - std::vector output(input_logits.begin(), 
input_logits.end()); - size_t vocab_size = input_logits.size(); + void apply(std::vector& logits) override { + size_t vocab_size = logits.size(); for (const auto& input_id_pair : *m_unique_generated_token_ids) { const auto& input_id = input_id_pair.first; OPENVINO_ASSERT((input_id >= 0) && (input_id < vocab_size), "input_ids token out of bounds"); - OPENVINO_ASSERT(input_logits[input_id].m_index == input_id, "input_logits must have original index order"); - auto logit_value = output[input_id].m_log_prob; + OPENVINO_ASSERT(logits[input_id].m_index == input_id, "input_logits must have original index order"); + auto logit_value = logits[input_id].m_log_prob; if (logit_value >= 0) { - output[input_id].m_log_prob -= m_penalty; + logits[input_id].m_log_prob -= m_penalty; } else { - output[input_id].m_log_prob += m_penalty; + logits[input_id].m_log_prob += m_penalty; }; } - return output; } - std::vector apply(const std::vector& input_logits, const TokenIds& input_ids) { + void apply(std::vector& logits, const TokenIds& input_ids) { extract_generated_tokens(input_ids); - return apply(input_logits); - } -}; - - -class ProbabilityNormalizeTransform : public ILogitTransformer { -public: - ProbabilityNormalizeTransform() = default; - - std::vector apply(const std::vector& input_probs) override { - std::vector output(input_probs); - float norm_sum = 0.0; - for (const auto& val : output) norm_sum += val.m_log_prob; - for (auto& val : output) val.m_log_prob /= norm_sum; - return output; + apply(logits); } }; @@ -277,7 +247,7 @@ class LogitProcessor { size_t m_generated_tokens = 0; public: - LogitProcessor(const GenerationConfig& sampling_params, + LogitProcessor(const ov::genai::GenerationConfig& sampling_params, const LogitTransformers::TokenIds& input_ids) { for (const auto& input_id : input_ids) { m_unique_prompt_token_ids->insert(input_id); @@ -289,7 +259,7 @@ class LogitProcessor { ); } - if (sampling_params.is_multinomial() || sampling_params.is_greedy_sampling()) { + if 
(sampling_params.is_multinomial() || sampling_params.is_greedy_decoding()) { if (sampling_params.repetition_penalty != 1.0f) { std::shared_ptr transformer = std::shared_ptr(new LogitTransformers::RepetitionPenaltyTransform(sampling_params.repetition_penalty)); @@ -304,34 +274,31 @@ class LogitProcessor { m_logit_transformers.push_back(transformer); } - if (sampling_params.frequence_penalty != 0.0f) { + if (sampling_params.frequency_penalty != 0.0f) { std::shared_ptr transformer = - std::shared_ptr(new LogitTransformers::FrequencyPenaltyTransform(sampling_params.frequence_penalty)); + std::shared_ptr(new LogitTransformers::FrequencyPenaltyTransform(sampling_params.frequency_penalty)); transformer->set_unique_generated_token_ids(m_unique_generated_token_ids); m_logit_transformers.push_back(transformer); } if (sampling_params.is_multinomial()) { m_logit_transformers.emplace_back(new LogitTransformers::TemperatureLogitTransform(sampling_params.temperature)); - if (sampling_params.top_p != 0.0f) { + if (sampling_params.top_p != 1.0f) { m_logit_transformers.emplace_back(new LogitTransformers::TopPFilter(sampling_params.top_p)); } if (sampling_params.top_k > 0) { m_logit_transformers.emplace_back(new LogitTransformers::TopKFilter(sampling_params.top_k)); } - m_logit_transformers.emplace_back(new LogitTransformers::ProbabilityNormalizeTransform()); } } } - std::vector apply(const std::vector& logits) { - std::vector outputs(logits.begin(), logits.end()); + void apply(std::vector& logits) { for (const auto& transformer : m_logit_transformers) { if (transformer->is_applicable(m_generated_tokens)) { - outputs = transformer->apply(outputs); + transformer->apply(logits); } } - return outputs; } void increment_gen_tokens() { diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/model_runner.hpp b/src/cpp/src/model_runner.hpp similarity index 99% rename from text_generation/causal_lm/cpp/continuous_batching/library/src/model_runner.hpp rename to 
src/cpp/src/model_runner.hpp index 46c5777a84..5fb2e0f524 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/model_runner.hpp +++ b/src/cpp/src/model_runner.hpp @@ -13,6 +13,7 @@ #include "scheduler.hpp" #include "timer.hpp" +namespace ov::genai { class ModelRunner { ov::InferRequest m_request; SchedulerConfig m_scheduler_config; @@ -141,3 +142,4 @@ class ModelRunner { return m_request.get_output_tensor(); } }; +} diff --git a/src/cpp/src/multinomial_decoding.cpp b/src/cpp/src/multinomial_decoding.cpp index 7457153859..b00c62aed7 100644 --- a/src/cpp/src/multinomial_decoding.cpp +++ b/src/cpp/src/multinomial_decoding.cpp @@ -153,7 +153,8 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner ov::Tensor input_ids, ov::Tensor attention_mask, ov::genai::GenerationConfig config, - std::shared_ptr streamer) { + std::shared_ptr streamer, + std::optional position_ids) { ov::Shape prompts_shape = input_ids.get_shape(); size_t batch_size = prompts_shape[0]; @@ -161,21 +162,18 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner size_t prompt_len = prompts_shape[1]; - ov::genai::EncodedResults results; + // Initialize results and performance metrics. 
+ EncodedResults results; + auto& raw_perf_counters = results.perf_metrics.raw_metrics; results.scores.resize(batch_size, 0); results.tokens.resize(batch_size); // Initialize inputs m_model_runner.set_tensor("input_ids", input_ids); m_model_runner.set_tensor("attention_mask", attention_mask); - - auto num_inputs = m_model_runner.get_compiled_model().inputs().size(); - bool position_ids_available = num_inputs == 4; - if (position_ids_available) { - ov::Tensor position_ids = m_model_runner.get_tensor("position_ids"); - position_ids.set_shape(input_ids.get_shape()); - std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); - } + + if (position_ids.has_value()) + m_model_runner.set_tensor("position_ids", *position_ids); // Input values are persistent between inference calls. // That allows to set values, which aren't going to change, only once @@ -183,6 +181,8 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner m_model_runner.get_tensor("beam_idx").data()[0] = 0; m_model_runner.infer(); + raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); auto logits_tensor = m_model_runner.get_tensor("logits"); @@ -212,13 +212,11 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner } m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1}); - if (position_ids_available) - m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1}); size_t max_new_tokens = config.get_max_new_tokens(prompt_len); for (size_t i = 0; i < max_new_tokens - 1; i++) { - if (position_ids_available) { + if (position_ids.has_value()) { ov::genai::utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask")); } @@ -228,6 +226,8 @@ ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner m_model_runner.get_tensor("input_ids").data()[0] = 
out_token.id; m_model_runner.infer(); + raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); logits = m_model_runner.get_tensor("logits").data(); out_token = sampling.get_out_token(logits, vocab_size, tokens); diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/paged_attention_transformations.cpp b/src/cpp/src/paged_attention_transformations.cpp similarity index 80% rename from text_generation/causal_lm/cpp/continuous_batching/library/src/paged_attention_transformations.cpp rename to src/cpp/src/paged_attention_transformations.cpp index 887cdbd381..3f343048ea 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/paged_attention_transformations.cpp +++ b/src/cpp/src/paged_attention_transformations.cpp @@ -8,6 +8,14 @@ #include "device_config.hpp" +using namespace ov::genai; + +inline ov::PartialShape to_partial_with_dyn_0_dim(const ov::Shape& static_shape) { + ov::PartialShape partial_shape = static_shape; + partial_shape[0] = ov::Dimension::dynamic(); + return partial_shape; +} + void apply_paged_attention_transformations(std::shared_ptr model, DeviceConfig& device_config) { const ov::op::util::VariableVector& variables = model->get_variables(); OPENVINO_ASSERT(!variables.empty(), "Model is supposed to be stateful"); @@ -31,8 +39,9 @@ void apply_paged_attention_transformations(std::shared_ptr model, Dev for (size_t decoder_layer_id = 0; decoder_layer_id < num_layers; ++decoder_layer_id) { parameters[kv_caches_inputs_offset + 2 * decoder_layer_id]->set_element_type(device_config.get_cache_precision()); parameters[kv_caches_inputs_offset + 2 * decoder_layer_id + 1]->set_element_type(device_config.get_cache_precision()); - parameters[kv_caches_inputs_offset + 2 * decoder_layer_id]->set_partial_shape(device_config.get_key_cache_shape()); - parameters[kv_caches_inputs_offset + 2 * decoder_layer_id + 
1]->set_partial_shape(device_config.get_value_cache_shape()); + // TODO: CVS-145270 + parameters[kv_caches_inputs_offset + 2 * decoder_layer_id]->set_partial_shape(to_partial_with_dyn_0_dim(device_config.get_key_cache_shape())); + parameters[kv_caches_inputs_offset + 2 * decoder_layer_id + 1]->set_partial_shape(to_partial_with_dyn_0_dim(device_config.get_value_cache_shape())); } model->validate_nodes_and_infer_types(); } diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp new file mode 100644 index 0000000000..2f378ab302 --- /dev/null +++ b/src/cpp/src/perf_metrics.cpp @@ -0,0 +1,164 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/perf_metrics.hpp" +#include "openvino/openvino.hpp" +#include +#include +#include + +namespace { + +ov::genai::MeanStdPair calc_mean_and_std(const std::vector& durations) { + // Accepts time durations in microseconds and returns standard deviation and mean in milliseconds. 
+ float mean = std::accumulate(durations.begin(), durations.end(), 0.0f, + [](const float& acc, const ov::genai::MicroSeconds& duration) -> float { + return acc + duration.count() / 1000.0f; + }); + mean /= durations.size(); + + float sum_square_durations = std::accumulate(durations.begin(), durations.end(), 0.0f, + [](const float& acc, const ov::genai::MicroSeconds& duration) -> float { + auto d = duration.count() / 1000.0f; + return acc + d * d; + }); + float std = std::sqrt(sum_square_durations / durations.size() - mean * mean); + return {mean, std}; +} + + +} // namespace + +namespace ov { +namespace genai { + +float PerfMetrics::get_load_time() { + return load_time; +} + +float PerfMetrics::get_num_generated_tokens() { + evaluate_statistics(); + return num_generated_tokens; +} + +float PerfMetrics::get_num_input_tokens() { + evaluate_statistics(); + return num_generated_tokens; +} + +MeanStdPair PerfMetrics::get_ttft() { + evaluate_statistics(); + return ttft; +} + +MeanStdPair PerfMetrics::get_tpot() { + evaluate_statistics(); + return tpot; +} + +MeanStdPair PerfMetrics::get_throughput() { + evaluate_statistics(); + return throughput; +} + +MeanStdPair PerfMetrics::get_generate_duration() { + evaluate_statistics(); + return generate_duration; +} + +MeanStdPair PerfMetrics::get_tokenization_duration() { + evaluate_statistics(); + return tokenization_duration; +} + +MeanStdPair PerfMetrics::get_detokenization_duration() { + evaluate_statistics(); + return detokenization_duration; +} + +float PerfMetrics::get_microsec(std::chrono::steady_clock::duration duration) { + return std::chrono::duration_cast(duration).count(); +} + +void PerfMetrics::evaluate_statistics(std::optional start_time) { + if (m_evaluated){ + return; + } + // If start_tiem is specified then recalcualte durations according to start times and calculate statistics only after that. 
+ if (start_time.has_value()) { + auto start_time_val = *start_time; + auto& tok_times = raw_metrics.m_new_token_times; + auto& batch_sizes = raw_metrics.m_batch_sizes; + raw_metrics.m_durations = std::vector(tok_times.size()); + + auto ttft = tok_times[0] - start_time_val; + raw_metrics.m_times_to_first_token = std::vector(); + raw_metrics.m_times_to_first_token.emplace_back(ttft); + num_generated_tokens = 0; + for (size_t i = 0; i < tok_times.size(); ++i) { + raw_metrics.m_durations[i] = tok_times[i] - start_time_val; + + // If in 10 ms a batch of 5 new tokens is generated then TPOT is 10 / 5 = 2 tok/ms. + raw_metrics.m_durations[i] /= batch_sizes[i]; + num_generated_tokens += batch_sizes[i]; + start_time_val = tok_times[i]; + } + } + + // calc_mean_and_std will convert microsecond to milliseconds. + tpot = calc_mean_and_std(raw_metrics.m_durations); + ttft = calc_mean_and_std(raw_metrics.m_times_to_first_token); + + generate_duration = calc_mean_and_std(raw_metrics.generate_durations); + tokenization_duration = calc_mean_and_std(raw_metrics.tokenization_durations); + detokenization_duration = calc_mean_and_std(raw_metrics.detokenization_durations); + + // tokens per second + throughput = {1000.0f / tpot.mean, (tpot.std * 1000.0f) / (tpot.mean * tpot.mean)}; + m_evaluated = true; +} + +PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const { + OPENVINO_ASSERT(right.load_time == load_time, "generation metrics can be accumulated only for the same pipeline"); + + // Copy left value to res. + PerfMetrics res = *this; + + // Concatenate durations, batch_sizes first token times. 
+ auto& new_durations = res.raw_metrics.m_durations; + auto& new_batch_sizes = res.raw_metrics.m_batch_sizes; + auto& new_times_to_first_token = res.raw_metrics.m_times_to_first_token; + auto& right_durations = right.raw_metrics.m_durations; + auto& right_batch_sizes = right.raw_metrics.m_batch_sizes; + auto& right_times_to_first_token = right.raw_metrics.m_times_to_first_token; + + new_durations.insert(new_durations.end(), right_durations.begin(), right_durations.end()); + new_times_to_first_token.insert(new_times_to_first_token.end(), right_times_to_first_token.begin(), right_times_to_first_token.end()); + new_batch_sizes.insert(new_batch_sizes.end(), right_batch_sizes.begin(), right_batch_sizes.end()); + + // Concatenate tokenization/detokenization and total generation times. + auto& new_tok_durations = res.raw_metrics.tokenization_durations; + auto& new_detok_durations = res.raw_metrics.detokenization_durations; + auto& new_gen_durations = res.raw_metrics.generate_durations; + auto& right_tok_durations = right.raw_metrics.tokenization_durations; + auto& right_detok_durations = right.raw_metrics.detokenization_durations; + auto& right_gen_durations = right.raw_metrics.generate_durations; + + new_tok_durations.insert(new_tok_durations.end(), right_tok_durations.begin(), right_tok_durations.end()); + new_detok_durations.insert(new_detok_durations.end(), right_detok_durations.begin(), right_detok_durations.end()); + new_gen_durations.insert(new_gen_durations.end(), right_gen_durations.begin(), right_gen_durations.end()); + + res.num_generated_tokens = num_generated_tokens + right.num_generated_tokens; + res.num_input_tokens = num_generated_tokens + right.num_input_tokens; + res.load_time = load_time; + res.m_evaluated = false; + return res; +} + +PerfMetrics& PerfMetrics::operator+=(const PerfMetrics& right) { + *this = *this + right; + return *this; +} + +} // namespace genai +} // namespace ov diff --git 
a/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp b/src/cpp/src/sampler.hpp similarity index 89% rename from text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp rename to src/cpp/src/sampler.hpp index 322c447435..6390fc8725 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -19,6 +19,7 @@ #include "scheduler.hpp" #include "sequence_group.hpp" +namespace ov::genai { // Modifyed Knuth–Morris–Pratt algorithm which returns tokens following after every needle occurance in haystack std::vector kmp_search(const std::vector& haystack, const std::vector& needle) { if (needle.empty()) { // no_repeat_ngram_size == 1, ban every token @@ -110,14 +111,17 @@ struct Group { std::vector min_heap; // The worst of the best completed beams is the first bool done = false; - int64_t finish(Beam beam, const GenerationConfig& sampling_params) { + int64_t finish(Beam beam, const ov::genai::GenerationConfig& sampling_params) { int64_t preeempted_sequence_id = -1; float generated_len = beam.get_generated_len() + (beam.m_token_id == sampling_params.eos_token_id ? 
1 : 0); // HF counts EOS token in generation length beam.m_score /= std::pow(generated_len, sampling_params.length_penalty); min_heap.push_back(beam); std::push_heap(min_heap.begin(), min_heap.end(), greater); - if (min_heap.size() > sampling_params.group_size) { + OPENVINO_ASSERT(sampling_params.num_beams % sampling_params.num_beam_groups == 0, + "number of beams should be divisible by number of groups"); + size_t group_size = sampling_params.num_beams / sampling_params.num_beam_groups; + if (min_heap.size() > group_size) { std::pop_heap(min_heap.begin(), min_heap.end(), greater); preeempted_sequence_id = min_heap.back().m_sequence->get_id(); min_heap.pop_back(); @@ -126,8 +130,11 @@ struct Group { return preeempted_sequence_id; } - void is_done(const GenerationConfig& sampling_params) { - if (min_heap.size() < sampling_params.group_size) + void is_done(const ov::genai::GenerationConfig& sampling_params) { + OPENVINO_ASSERT(sampling_params.num_beams % sampling_params.num_beam_groups == 0, + "number of beams should be divisible by number of groups"); + size_t group_size = sampling_params.num_beams / sampling_params.num_beam_groups; + if (min_heap.size() < group_size) return; const Beam& best_running_sequence = ongoing.front(), & worst_finished_sequence = min_heap.front(); @@ -135,15 +142,15 @@ struct Group { float best_sum_logprobs = best_running_sequence.m_score; float worst_score = worst_finished_sequence.m_score; switch (sampling_params.stop_criteria) { - case StopCriteria::EARLY: + case ov::genai::StopCriteria::EARLY: done = true; return; - case StopCriteria::HEURISTIC: { + case ov::genai::StopCriteria::HEURISTIC: { float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), sampling_params.length_penalty); done = worst_score >= highest_attainable_score; return; } - case StopCriteria::NEVER: { + case ov::genai::StopCriteria::NEVER: { size_t length = sampling_params.length_penalty > 0.0 ? 
sampling_params.max_new_tokens : cur_len; float highest_attainable_score = best_sum_logprobs / std::pow(float(length), sampling_params.length_penalty); done = worst_score >= highest_attainable_score; @@ -165,7 +172,7 @@ struct SamplerOutput { class GroupBeamSearcher { SequenceGroup::Ptr m_sequence_group; - GenerationConfig m_parameters; + ov::genai::GenerationConfig m_parameters; std::vector m_groups; public: explicit GroupBeamSearcher(SequenceGroup::Ptr sequence_group); @@ -212,8 +219,13 @@ class Sampler { } Token _greedy_sample(const std::vector& logit_vector) const { - auto out_token = std::max_element(logit_vector.begin(), logit_vector.end(), [](const Token& lhs, const Token& rhs) { return lhs.m_log_prob < rhs.m_log_prob; }); - return *out_token; + Token max_token{-std::numeric_limits::infinity() , 0}; + for (const auto& logit : logit_vector) { + if (logit.m_log_prob > max_token.m_log_prob) { + max_token = logit; + } + } + return max_token; } std::vector _multinomial_sample(const std::vector& logit_vector, size_t num_tokens_per_sequence) { @@ -240,6 +252,8 @@ class Sampler { SamplerOutput sample(std::vector & sequence_groups, ov::Tensor logits); void set_seed(size_t seed) { rng_engine.seed(seed); } + + void clear_beam_search_info(uint64_t request_id); }; SamplerOutput Sampler::sample(std::vector & sequence_groups, ov::Tensor logits) { @@ -258,7 +272,7 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, size_t num_running_sequences = sequence_group->num_running_seqs(); size_t actual_seq_len = sequence_group->get_num_scheduled_tokens(); // points to a token which needs to be sampled size_t padded_amount_of_processed_tokens = std::max(actual_seq_len, batch_seq_len); - const GenerationConfig& sampling_params = sequence_group->get_sampling_parameters(); + const ov::genai::GenerationConfig& sampling_params = sequence_group->get_sampling_parameters(); const auto request_id = sequence_group->get_request_id(); if (!m_logit_processors.count(request_id)) { @@ 
-270,9 +284,9 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, ov::Tensor sequence_group_logits(ov::element::f32, ov::Shape{num_running_sequences, actual_seq_len, vocab_size}, (void *)sequence_group_logits_data); if (sequence_group->requires_sampling()) { - if (sampling_params.is_greedy_sampling() || sampling_params.is_multinomial()) { + if (sampling_params.is_greedy_decoding() || sampling_params.is_multinomial()) { std::vector running_sequences = sequence_group->get_running_sequences(); - if (sampling_params.is_greedy_sampling()) { + if (sampling_params.is_greedy_decoding()) { OPENVINO_ASSERT(num_running_sequences == 1); } auto register_new_token = [&](const Token& sampled_token_id, Sequence::Ptr running_sequence) { @@ -281,10 +295,10 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, }; for (size_t running_sequence_id = 0; running_sequence_id < num_running_sequences; ++running_sequence_id) { auto logit_vector = _get_logit_vector(sequence_group_logits, running_sequence_id); - logit_vector = logit_processor.apply(logit_vector); + logit_processor.apply(logit_vector); Token sampled_token_id; - if (sampling_params.is_greedy_sampling()) { + if (sampling_params.is_greedy_decoding()) { sampled_token_id = _greedy_sample(logit_vector); } else { // is_multinomial() @@ -318,15 +332,6 @@ SamplerOutput Sampler::sample(std::vector & sequence_groups, if (m_beam_search_info.find(request_id) == m_beam_search_info.end()) { m_beam_search_info.emplace(request_id, GroupBeamSearcher(sequence_group)); } - else { - // sequence group can be empty if returned after preemption - if (sequence_group->is_empty()) { - // clear beam search info - m_beam_search_info.erase(request_id); - m_beam_search_info.emplace(request_id, GroupBeamSearcher(sequence_group)); - } - } - // current algorithm already adds new tokens to running sequences and m_beam_search_info.at(request_id).select_next_tokens(sequence_group_logits, sampler_output); @@ -360,13 +365,16 @@ SamplerOutput 
Sampler::sample(std::vector & sequence_groups, GroupBeamSearcher::GroupBeamSearcher(SequenceGroup::Ptr sequence_group) : m_sequence_group(sequence_group), m_parameters{m_sequence_group->get_sampling_parameters()}, - m_groups{m_parameters.num_groups} { + m_groups{m_parameters.num_beam_groups} { OPENVINO_ASSERT(m_sequence_group->num_running_seqs() == 1); + OPENVINO_ASSERT(m_parameters.num_beams % m_parameters.num_beam_groups == 0, + "number of beams should be divisible by number of groups"); + size_t group_size = m_parameters.num_beams / m_parameters.num_beam_groups; for (Group& group : m_groups) { - group.ongoing.reserve(m_parameters.group_size); + group.ongoing.reserve(group_size); // initially we just add our "base" sequence to beams inside each group - for (size_t i = 0; i < m_parameters.group_size; ++i) + for (size_t i = 0; i < group_size; ++i) group.ongoing.push_back(Beam((*sequence_group)[0])); // to avoid selecting the same tokens for beams within group, let's just initialize score // for the front one @@ -375,10 +383,13 @@ GroupBeamSearcher::GroupBeamSearcher(SequenceGroup::Ptr sequence_group) } void GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output) { + OPENVINO_ASSERT(m_parameters.num_beams % m_parameters.num_beam_groups == 0, + "number of beams should be divisible by number of groups"); + size_t group_size = m_parameters.num_beams / m_parameters.num_beam_groups; std::vector next_tokens; std::vector next_beams; - next_tokens.reserve(m_parameters.num_groups * m_parameters.group_size); - next_beams.reserve(m_parameters.num_groups * m_parameters.group_size); + next_tokens.reserve(m_parameters.num_beams); + next_beams.reserve(m_parameters.num_beams); // parent sequence ID -> number of child sequences std::map parent_2_num_childs_map; @@ -447,7 +458,7 @@ void GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutp continue; std::vector candidates; - candidates.reserve(m_parameters.group_size * 2 * 
m_parameters.group_size); + candidates.reserve(group_size * 2 * group_size); for (const Beam& beam : group.ongoing) { std::vector tokens = log_softmax(logits, beam.m_global_beam_idx); @@ -486,7 +497,7 @@ void GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutp try_to_finish_candidate(group, new_candidate); } else { candidates.push_back(new_candidate); - if (++add_count == 2 * m_parameters.group_size) { + if (++add_count == 2 * group_size) { break; } } @@ -494,16 +505,16 @@ void GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutp } // Sample 2 * group_size highest score tokens to get at least 1 non EOS token per beam - OPENVINO_ASSERT(candidates.size() >= 2 * m_parameters.group_size, "No beams left to search"); + OPENVINO_ASSERT(candidates.size() >= 2 * group_size, "No beams left to search"); - auto to_sort = candidates.begin() + ptrdiff_t(2 * m_parameters.group_size); + auto to_sort = candidates.begin() + ptrdiff_t(2 * group_size); std::partial_sort(candidates.begin(), to_sort, candidates.end(), greater); for (size_t cand_idx = 0; cand_idx < candidates.size(); ++cand_idx) { Beam & candidate = candidates[cand_idx]; if (m_parameters.eos_token_id == candidate.m_token_id) { // If beam_token does not belong to top num_beams tokens, it should not be added - if (cand_idx >= m_parameters.group_size) + if (cand_idx >= group_size) continue; // try to finish candidate @@ -513,7 +524,7 @@ void GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutp child_beams_per_group[group_id].push_back(candidate); // if num childs are enough - if (child_beams_per_group[group_id].size() == m_parameters.group_size) { + if (child_beams_per_group[group_id].size() == group_size) { break; } } @@ -573,4 +584,9 @@ void GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutp group.ongoing = child_beams_per_group[group_id]; } } -} \ No newline at end of file +} + +void Sampler::clear_beam_search_info(uint64_t 
request_id) { + m_beam_search_info.erase(request_id); +} +} diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp b/src/cpp/src/scheduler.hpp similarity index 86% rename from text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp rename to src/cpp/src/scheduler.hpp index 2fd3d7b175..cbd6668f90 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -7,11 +7,11 @@ #include #include +#include "openvino/genai/scheduler_config.hpp" #include "block_manager.hpp" #include "sequence_group.hpp" -#include "block_manager.hpp" -#include "scheduler_config.hpp" +namespace ov::genai { class Scheduler { SchedulerConfig m_config; BlockManager m_block_manager; @@ -28,14 +28,19 @@ class Scheduler { size_t m_total_num_scheduled_tokens = 0; // dedicated prompt phase bool is_prompt = false; + // current cache usage + float m_cache_usage = 0.0; }; explicit Scheduler(const SchedulerConfig & config = {}) : - m_config(config), m_block_manager(m_config.num_kv_blocks) { } + m_config(config), m_block_manager(m_config.num_kv_blocks, m_config.enable_prefix_caching, m_config.block_size) { } Output schedule(std::vector& sequence_groups) { Output scheduler_output; + if (m_config.enable_prefix_caching) + _restore_cached_blocks(sequence_groups); + if (m_config.dynamic_split_fuse) { // deepspeed-mii case // generation phase is always scheduled first @@ -56,6 +61,7 @@ class Scheduler { } _clear_waiting_sequences(sequence_groups); + scheduler_output.m_cache_usage = m_block_manager.get_used_percentage(); return scheduler_output; } @@ -98,52 +104,45 @@ class Scheduler { size_t prev_blocks_count = m_block_manager.num_free_blocks(); size_t num_running_sequences = sequence_group->num_running_seqs(); size_t preempted_tokens = 0; + size_t num_blocks_occupied_by_sequence = m_block_manager.get_number_of_blocks_occupied_by_sequence(sequence_group); - if (num_running_sequences > 1) { - for 
(size_t s = 0; s < sequence_group->num_running_seqs(); ++s) { - auto seq_id = (*sequence_group)[s]->get_id(); + if (num_blocks_occupied_by_sequence <= blocks_needed) { + auto sequences = sequence_group->get_not_finished_sequences(); + for (size_t s = 0; s < sequences.size(); ++s) { + auto seq_id = sequences[s]->get_id(); m_block_manager.free_sequence(seq_id); } - sequence_group->reset(); + sequence_group->preempt_tokens(processed_tokens); sequence_group->set_waiting(); return m_block_manager.num_free_blocks() > prev_blocks_count; } - // currently partial preemtion is enabled only for single running sequence case - // TODO: implement partial preemption for case with muliple sequences in group - for (size_t s = 0; s < num_running_sequences; ++s) { - auto seq_id = (*sequence_group)[s]->get_id(); - if (!m_block_manager.has_block_table(seq_id)) { - // no blocks are allocated for this sequence, so it can't be preempted - return false; - } - auto block_table = m_block_manager.get_block_table(seq_id); - size_t required_blocks = blocks_needed - total_num_released_blocks; - if (required_blocks >= block_table.size()) { - // fully drop a sequence(s) from block_manager - m_block_manager.free_sequence(seq_id); - } - else { - m_block_manager.free_sequence_partially(seq_id, required_blocks); - } - - // calculate the number of released blocks - auto released_blocks = m_block_manager.num_free_blocks() - prev_blocks_count; - total_num_released_blocks += released_blocks; - prev_blocks_count = m_block_manager.num_free_blocks(); - + if (num_running_sequences > 1) { + size_t phisycal_blocks_released; + size_t logical_blocks_released; + m_block_manager.free_group_partially_multiple_runnning_sequence(sequence_group, blocks_needed, phisycal_blocks_released, logical_blocks_released); // calculate the number of preempted tokens auto tokens_in_last_block = processed_tokens % block_size; if (tokens_in_last_block == 0) { tokens_in_last_block = block_size; } + preempted_tokens = 
tokens_in_last_block + std::max((int)logical_blocks_released - 1, 0) * block_size; - preempted_tokens += tokens_in_last_block + std::max((int)released_blocks - 1, 0) * block_size; - if (m_block_manager.num_free_blocks() >= blocks_needed) { - break; + } + else { + OPENVINO_ASSERT(num_running_sequences == 1); + size_t phisycal_blocks_released; + m_block_manager.free_group_partially_single_runnning_sequence(sequence_group, blocks_needed, phisycal_blocks_released); + + // calculate the number of preempted tokens + auto tokens_in_last_block = processed_tokens % block_size; + if (tokens_in_last_block == 0) { + tokens_in_last_block = block_size; } + preempted_tokens = tokens_in_last_block + std::max((int)phisycal_blocks_released - 1, 0) * block_size; } + // case when preemption requires preempt prompt tokens if (!m_config.dynamic_split_fuse && processed_tokens - preempted_tokens < sequence_group->get_prompt_len()) { // preempt prompt fully to not leave partially generated prompt @@ -170,6 +169,15 @@ class Scheduler { return std::numeric_limits::max(); } + void _restore_cached_blocks(const std::vector& sequence_groups) { + for (size_t sequence_group_id = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) { + SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id]; + if (sequence_group->can_generate_tokens() || sequence_group->num_running_seqs() != 1) + continue; + m_block_manager._restore_cached_blocks(sequence_group, m_config.block_size); + } + } + void _apply_preemption(size_t sequence_group_id, const std::vector& sequence_groups) { SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id]; @@ -225,7 +233,7 @@ class Scheduler { if (num_scheduled_tokens > 0) { // allocate KV blocks if required if (num_scheduled_blocks > 0) - m_block_manager.allocate(seq_id, num_scheduled_blocks); + m_block_manager.allocate(sequence, num_scheduled_blocks, sequence_group->get_prompt_ids()); // and schedule tokens 
sequence_group->schedule_tokens(num_scheduled_tokens); @@ -329,7 +337,8 @@ class Scheduler { // prompt phases can have a single running sequence OPENVINO_ASSERT(num_running_seqs == 1); // here we also assume that sequence must be scheduler in a single shot and has no already generated context - OPENVINO_ASSERT(sequence_group->get_context_len() == 0); + if (!m_config.enable_prefix_caching) + OPENVINO_ASSERT(sequence_group->get_context_len() == 0); size_t num_available_tokens_in_megabatch = m_config.max_num_batched_tokens - scheduler_output.m_total_num_scheduled_tokens; size_t sequence_len = sequence_group->get_num_available_tokens_for_batching(); @@ -357,11 +366,15 @@ class Scheduler { Sequence::Ptr sequence = (*sequence_group)[0]; uint64_t seq_id = sequence->get_id(); - // allocate KV blocks - m_block_manager.allocate(seq_id, num_required_blocks); // and schedule tokens sequence_group->schedule_tokens(sequence_len); + // allocate KV blocks + if (sequence_group->get_num_processed_tokens() == 0) + m_block_manager.allocate(sequence, num_required_blocks, sequence_group->get_prompt_ids()); + else + m_block_manager.append_slots(sequence_group); + // add information to scheduler_output { scheduler_output.m_scheduled_sequence_groups_ids.push_back(sequence_group_id); @@ -385,3 +398,4 @@ class Scheduler { } } }; +} diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp similarity index 80% rename from text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp rename to src/cpp/src/sequence_group.hpp index 8f680af616..d5b9506b2c 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -6,11 +6,13 @@ #include #include #include +#include -#include "generation_handle.hpp" -#include "generation_config.hpp" +#include "openvino/genai/generation_handle.hpp" +#include "openvino/genai/generation_config.hpp" #include 
"generation_stream.hpp" +namespace ov::genai { enum class SequenceStatus { RUNNING = 0, FINISHED = 1, @@ -115,11 +117,26 @@ class Sequence { return m_cumulative_log_prob; } - float get_beam_search_score(const GenerationConfig& sampling_params) const { + float get_beam_search_score(const ov::genai::GenerationConfig& sampling_params) const { float cumulative_log_prob = get_cumulative_log_probs(), current_length = get_generated_len(); float score = cumulative_log_prob / std::pow(current_length, sampling_params.length_penalty); return score; } + + // Each KV block can be uniquely identified by + // the tokens within the block and the tokens in the prefix before the block. + // hash(prefix tokens + block tokens) <--> KV Block + size_t get_hash(size_t content_length, const ov::genai::TokenIds& prompt_ids) const { + std::vector content; + OPENVINO_ASSERT(content_length <= prompt_ids.size() + m_generated_ids.size()); + content.insert( content.end(), prompt_ids.begin(), prompt_ids.begin() + std::min(prompt_ids.size(), content_length)); + if (content_length > prompt_ids.size()) { + content.insert(content.end(), m_generated_ids.begin(), m_generated_ids.begin() + content_length - prompt_ids.size()); + } + const char* data = reinterpret_cast(content.data()); + std::size_t size = content.size() * sizeof(content[0]); + return std::hash{}(std::string_view(data, size)); + } }; // contains a list of Sequences in generic case (beam search or parallel sampling) @@ -129,7 +146,7 @@ class Sequence { class SequenceGroup { uint64_t m_request_id; std::vector m_sequences; - GenerationConfig m_sampling_params; + ov::genai::GenerationConfig m_sampling_params; std::size_t m_block_size; TokenIds m_prompt_ids; GenerationStream::Ptr m_generation_stream; @@ -146,7 +163,7 @@ class SequenceGroup { // context length of longest sequence within a group size_t m_max_content_len = 0; - SequenceGroup(uint64_t request_id, const GenerationConfig& sampling_params, std::size_t block_size) + 
SequenceGroup(uint64_t request_id, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size) : m_request_id(request_id), m_sampling_params(sampling_params), m_block_size(block_size) { @@ -156,11 +173,11 @@ class SequenceGroup { using Ptr = std::shared_ptr; using CPtr = std::shared_ptr; - SequenceGroup(uint64_t request_id, const TokenIds& input_ids, const GenerationConfig& sampling_params, std::size_t block_size) + SequenceGroup(uint64_t request_id, const TokenIds& input_ids, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size) : SequenceGroup(request_id, ov::Tensor(ov::element::i64, ov::Shape{input_ids.size()}, (void *)input_ids.data()), sampling_params, block_size) { } - SequenceGroup(uint64_t request_id, const ov::Tensor input_ids, const GenerationConfig& sampling_params, std::size_t block_size) + SequenceGroup(uint64_t request_id, const ov::Tensor input_ids, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size) : SequenceGroup(request_id, sampling_params, block_size) { add_sequence(Sequence::create(m_next_sequence_id++)); @@ -266,6 +283,17 @@ class SequenceGroup { return running_seqs; } + std::vector get_not_finished_sequences() { + std::vector running_seqs; + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (!m_sequences[seq_id]->has_finished()) { + running_seqs.emplace_back(m_sequences[seq_id]); + } + } + + return running_seqs; + } + std::vector get_running_sequences() const { std::vector running_seqs; for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { @@ -333,6 +361,11 @@ class SequenceGroup { clear_scheduled_tokens(); } + void update_processed_tokens_num(size_t processed_tokens) { + m_num_processed_tokens = processed_tokens; + m_max_content_len = processed_tokens; + } + void clear_waiting_sequences() { for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { if (m_sequences[seq_id]->is_waiting()) { @@ -363,28 +396,10 @@ class SequenceGroup { return 
m_sequences.back(); } - const GenerationConfig& get_sampling_parameters() const { + const ov::genai::GenerationConfig& get_sampling_parameters() const { return m_sampling_params; } - void reset() { - m_sequences.clear(); - m_next_sequence_id = 0; - add_sequence(Sequence::create(m_next_sequence_id++)); - clear_scheduled_tokens(); - m_num_processed_tokens = 0; - m_max_content_len = 0; - } - - bool is_empty() { - if (m_sequences.size() > 1) - return false; - OPENVINO_ASSERT(m_sequences.size() == 1); - if (m_sequences[0]->get_generated_len() > 0 || m_sequences[0]->get_cumulative_log_probs() != 0.0f) - return false; - return true; - } - void set_out_of_memory() { for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { if (m_sequences[seq_id]->is_running()) { @@ -431,59 +446,48 @@ class SequenceGroup { return m_generation_stream->get_status() == GenerationStatus::DROPPED_BY_HANDLE; } - void notify_handle() { + void push_outputs() { GenerationOutputs outputs; + for (auto& sequence: m_sequences) { + GenerationOutput output; + output.generated_token_ids = sequence->get_generated_ids(); + output.score = sequence->get_beam_search_score(m_sampling_params); + outputs.emplace(sequence->get_grouped_id(), output); + } + m_generation_stream->push(outputs); + } - // For beam search streaming is not available, so we notify only upon finishing - if(m_sampling_params.is_beam_search()) { - if (has_finished()) { - std::vector finished_sequences = get_finished_sequences(); - - OPENVINO_ASSERT(finished_sequences.size() == num_total_seqs() && has_finished()); - for (auto& sequence: finished_sequences) { - GenerationOutput output; - output.generated_token_ids = sequence->get_generated_ids(); - output.score = sequence->get_beam_search_score(m_sampling_params); - outputs.emplace(sequence->get_grouped_id(), output); - } - - if (outputs.size()) { - m_generation_stream->push(outputs); - } - } - // For greedy or multinomial sampling we decide whever to stream partial results depending on 
the user parameter - } else if (m_sampling_params.is_greedy_sampling() || m_sampling_params.is_multinomial()) { - // TO DO: Now we always stream for greedy search for the sake of benchmarking - if (num_total_seqs() == 1 /* m_sampling_params.stream */) { - // TODO: support streamimg for n seqs - for (auto& sequence : m_sequences) { - // todo: check seq.is_finished() to generate without several - // or is it ok to use padding? - const auto last_gen_token = sequence->get_last_generation_output(); - outputs.emplace(sequence->get_grouped_id(), last_gen_token); - } - m_generation_stream->push(outputs); - } else if (has_finished()) { - std::vector finished_sequences = get_finished_sequences(); - - OPENVINO_ASSERT(finished_sequences.size() == num_total_seqs() && has_finished()); - for (auto& sequence: finished_sequences) { - GenerationOutput output; - output.generated_token_ids = sequence->get_generated_ids(); - output.score = sequence->get_cumulative_log_probs(); - outputs.emplace(sequence->get_grouped_id(), output); - } - - if (outputs.size()) { - m_generation_stream->push(outputs); - } - } + void push_partial_outputs() { + GenerationOutputs outputs; + // TODO: support streamimg for n seqs + for (auto& sequence : m_sequences) { + // todo: check seq.is_finished() to generate without several + // or is it ok to use padding? 
+ const auto last_gen_token = sequence->get_last_generation_output(); + outputs.emplace(sequence->get_grouped_id(), last_gen_token); } + m_generation_stream->push(outputs); + } + void notify_handle() { if (out_of_memory()) { set_generation_status(GenerationStatus::IGNORED); } else if (has_finished()) { set_generation_status(GenerationStatus::FINISHED); } + // For beam search streaming is not available, so we notify only upon finishing + if(m_sampling_params.is_beam_search()) { + if (has_finished() || out_of_memory()) { + push_outputs(); + } + } else if (m_sampling_params.is_greedy_decoding() || m_sampling_params.is_multinomial()) { + // TO DO: Now we always stream for greedy search for the sake of benchmarking + if (num_total_seqs() == 1) { + push_partial_outputs(); + } else if (has_finished() || out_of_memory()) { + push_outputs(); + } + } } }; +} diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/synchronized_queue.hpp b/src/cpp/src/synchronized_queue.hpp similarity index 84% rename from text_generation/causal_lm/cpp/continuous_batching/library/src/synchronized_queue.hpp rename to src/cpp/src/synchronized_queue.hpp index 0c2cd3180d..bd025f1b7d 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/synchronized_queue.hpp +++ b/src/cpp/src/synchronized_queue.hpp @@ -17,6 +17,12 @@ class SynchronizedQueue SynchronizedQueue(const SynchronizedQueue&&) = delete; SynchronizedQueue& operator=(const SynchronizedQueue&) = delete; + T back() { + std::unique_lock lock(m_mutex); + m_cv.wait(lock, [this]{return !m_queue.empty();}); + return m_queue.back(); + } + T pull() { std::unique_lock lock(m_mutex); m_cv.wait(lock, [this]{return !m_queue.empty();}); diff --git a/src/cpp/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp index 8302594655..b2b5c9a463 100644 --- a/src/cpp/src/text_callback_streamer.cpp +++ b/src/cpp/src/text_callback_streamer.cpp @@ -15,25 +15,32 @@ bool TextCallbackStreamer::put(int64_t token) 
{ std::stringstream res; m_tokens_cache.push_back(token); std::string text = m_tokenizer.decode(m_tokens_cache); - if (!text.empty() && '\n' == text.back()) { + if (!text.empty() && '\n' == text.back() && text.size() > print_len) { // Flush the cache after the new line symbol res << std::string_view{text.data() + print_len, text.size() - print_len}; m_tokens_cache.clear(); print_len = 0; return on_finalized_subword_callback(res.str()); } + if (text.size() >= 3 && text.compare(text.size() - 3, 3, "οΏ½") == 0) { // Don't print incomplete text return on_finalized_subword_callback(res.str()); + } else if (text.size() > print_len) { + // It is possible to have a shorter text after adding new token. + // Print to output only if text lengh is increaesed. + res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + print_len = text.size(); } - res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; - print_len = text.size(); + return on_finalized_subword_callback(res.str()); } void TextCallbackStreamer::end() { std::stringstream res; std::string text = m_tokenizer.decode(m_tokens_cache); + if (text.size() <= print_len) + return ; res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; m_tokens_cache.clear(); print_len = 0; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/timer.hpp b/src/cpp/src/timer.hpp similarity index 100% rename from text_generation/causal_lm/cpp/continuous_batching/library/src/timer.hpp rename to src/cpp/src/timer.hpp diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index ef9235b298..44b6b30d49 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -6,8 +6,11 @@ #include "utils.hpp" #include #include +#include #include "tokenizers_path.hpp" +#include "circular_buffer_queue.hpp" #include +#include namespace { @@ -55,8 +58,12 @@ namespace genai { class Tokenizer::TokenizerImpl { public: - 
ov::InferRequest m_tokenize_request; - ov::InferRequest m_detokenizer_request; + ov::CompiledModel m_tokenizer; + ov::CompiledModel m_detokenizer; + + std::unique_ptr> m_ireq_queue_tokenizer; + std::unique_ptr> m_ireq_queue_detokenizer; + int64_t m_pad_token_id = -1; int64_t m_bos_token_id = -1; int64_t m_eos_token_id = -1; @@ -69,7 +76,7 @@ class Tokenizer::TokenizerImpl { TokenizerImpl() = default; - TokenizerImpl(std::filesystem::path tokenizer_path) + TokenizerImpl(std::filesystem::path tokenizer_path, const ov::AnyMap& plugin_config) : m_chat_template{chat_template_from_tokenizer_json_if_exists(tokenizer_path)} { ov::Core core; @@ -90,14 +97,30 @@ class Tokenizer::TokenizerImpl { read_tokenizer_config_if_necessary(tokenizer_path); auto device = "CPU"; // currently openvino_tokenizer supports only CPU - m_tokenize_request = core.compile_model(tokenizer_path / "openvino_tokenizer.xml", - device).create_infer_request(); - m_detokenizer_request = core.compile_model(tokenizer_path / "openvino_detokenizer.xml", - device).create_infer_request(); + m_tokenizer = core.compile_model(tokenizer_path / "openvino_tokenizer.xml", + device, plugin_config); + m_detokenizer = core.compile_model(tokenizer_path / "openvino_detokenizer.xml", + device, plugin_config); + + + const size_t INFER_REQUEST_QUEUE_SIZE = m_tokenizer.get_property(ov::optimal_number_of_infer_requests); + m_ireq_queue_tokenizer = std::make_unique>( + INFER_REQUEST_QUEUE_SIZE, + [this]() -> ov::InferRequest { + return std::move(this->m_tokenizer.create_infer_request()); + }); + m_ireq_queue_detokenizer = std::make_unique>( + INFER_REQUEST_QUEUE_SIZE, + [this]() -> ov::InferRequest { + return std::move(this->m_detokenizer.create_infer_request()); + }); // Get special token ids by inference if they are not defined. - // todo: do not call until CVS-143410 is resolved - // infer_special_tokens_if_necessary(); + infer_special_tokens_if_necessary(); + // Initialize tokenizer's cache to save time later. 
+ // infer_special_tokens_if_necessary() already could do that + // but it didn't run decode() for sure. + decode(encode("").input_ids); } // load special tokens ids from config.json @@ -138,9 +161,9 @@ class Tokenizer::TokenizerImpl { read_token_content_str(eos_token_key_name, m_eos_token); } - // Read string representation of special tokens if they exists. + // Read string representation of special tokens if they exist. // Also tries to load special token ids from added_tokens_decoder if they exist. - // Will not override special token strings or ids if they already exist + // Will not override special token strings or ids if they already exist. void read_tokenizer_config_if_necessary(const std::filesystem::path& tokenizer_path) { if (m_pad_token_id != -1 && m_bos_token_id != -1 && m_eos_token_id != -1 && !m_pad_token.empty() && !m_bos_token.empty() && !m_eos_token.empty()) { @@ -226,25 +249,35 @@ class Tokenizer::TokenizerImpl { } TokenizedInputs encode(std::string prompt) { + CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_tokenizer.get()); size_t batch_size = 1; - m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt}); - m_tokenize_request.infer(); - return get_copied_results(); + infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt}); + infer_request_guard.get().start_async(); + infer_request_guard.get().wait(); + return get_copied_results( + infer_request_guard.get().get_tensor("input_ids"), + infer_request_guard.get().get_tensor("attention_mask") + ); } TokenizedInputs encode(std::vector& prompts) { - m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); - auto size_ = m_tokenize_request.get_input_tensor().get_shape(); - m_tokenize_request.infer(); - - auto res = get_copied_results(); - pad_left(res.input_ids, res.attention_mask); - return {res.input_ids, res.attention_mask}; + TokenizedInputs unpadded; + 
{ + CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_tokenizer.get()); + infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); + auto size_ = infer_request_guard.get().get_input_tensor().get_shape(); + infer_request_guard.get().start_async(); + infer_request_guard.get().wait(); + + unpadded = get_copied_results( + infer_request_guard.get().get_tensor("input_ids"), + infer_request_guard.get().get_tensor("attention_mask") + ); + } + return pad_left(unpadded.input_ids, unpadded.attention_mask); } - TokenizedInputs get_copied_results() { - auto input_ids = m_tokenize_request.get_tensor("input_ids"); - auto attention_mask = m_tokenize_request.get_tensor("attention_mask"); + TokenizedInputs get_copied_results(ov::Tensor input_ids, ov::Tensor attention_mask) { ov::Tensor input_ids_ = ov::Tensor(input_ids.get_element_type(), input_ids.get_shape()); ov::Tensor attention_mask_ = ov::Tensor(attention_mask.get_element_type(), attention_mask.get_shape()); input_ids.copy_to(input_ids_); @@ -254,20 +287,24 @@ class Tokenizer::TokenizerImpl { } std::string decode(std::vector tokens) { + CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_detokenizer.get()); size_t batch_size = 1; - m_detokenizer_request.set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()}); - m_detokenizer_request.infer(); - return m_detokenizer_request.get_output_tensor().data()[0]; + infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()}); + infer_request_guard.get().start_async(); + infer_request_guard.get().wait(); + return infer_request_guard.get().get_output_tensor().data()[0]; } std::vector decode(ov::Tensor tokens) { OPENVINO_ASSERT(tokens.get_element_type() == ov::element::i64, "tokens tensor element type should be an i64"); OPENVINO_ASSERT(tokens.get_shape().size() == 2, "tokens tensor should of rank 2 with 
shape [batch_size, seq_len]"); - m_detokenizer_request.set_input_tensor(tokens); - m_detokenizer_request.infer(); + CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_detokenizer.get()); + infer_request_guard.get().set_input_tensor(tokens); + infer_request_guard.get().start_async(); + infer_request_guard.get().wait(); - auto res = m_detokenizer_request.get_output_tensor(); + auto res = infer_request_guard.get().get_output_tensor(); auto res_data = res.data(); return std::vector(res_data, res_data + res.get_shape()[0]); } @@ -288,9 +325,11 @@ class Tokenizer::TokenizerImpl { std::fill(tokens_data + i * max_len + line_len, tokens_data + (i + 1) * max_len, m_pad_token_id); } - m_detokenizer_request.set_input_tensor(tokens); - m_detokenizer_request.infer(); - auto res = m_detokenizer_request.get_output_tensor(); + CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_detokenizer.get()); + infer_request_guard.get().set_input_tensor(tokens); + infer_request_guard.get().start_async(); + infer_request_guard.get().wait(); + auto res = infer_request_guard.get().get_output_tensor(); auto res_data = res.data(); return std::vector(res_data, res_data + res.get_shape()[0]); } @@ -306,32 +345,66 @@ class Tokenizer::TokenizerImpl { std::string res = ""; ov::genai::utils::read_json_param(nlohmann::json::parse(file), "chat_template", res); - + if (res.empty()) + return res; + // Replace what jinja2cpp doesn't support std::pair replace_str_map[] = { - {"\n'}", "\n' }"}, - {".strip()", "\"\""} + {"'}", "' }"}, + {"{'", "{ '"}, + {".strip()", ""} }; - if (!res.empty()) { - for (const auto& [from, to] : replace_str_map) { - size_t pos = 0; - while ((pos = res.find(from, pos)) != std::string::npos) { - res.replace(pos, from.size(), to); - pos += to.size(); - } + + for (const auto& [from, to] : replace_str_map) { + size_t pos = 0; + while ((pos = res.find(from, pos)) != std::string::npos) { + res.replace(pos, from.size(), to); + pos += to.size(); } } 
return res; - } + } - std::string apply_chat_template(const ChatHistory& history, + std::string apply_chat_template(ChatHistory history, bool add_generation_prompt, const std::string& chat_template) const { + auto chat_tpl = chat_template.empty() ? m_chat_template : chat_template; + OPENVINO_ASSERT(!chat_tpl.empty(), + "Chat template wasn't found. This may indicate that the model wasn't trained for chat scenario." + " Please add 'chat_template' to tokenizer_config.json to use the model in chat scenario." + " For more information see the section Troubleshooting in README.md"); + + // Jinja2Cpp does not support Python-style slicing, e.g. [1:]. + // If chat template contains such slicing, we replace it with custom function `slice()` (user-defined callable) + // that is defined below and does the same list slicing logic. + std::string slice_string = "messages[1:]"; + std::string replacement_slice_string = "slice(messages, 1)"; + size_t slice_pos = chat_tpl.find(slice_string); + if (slice_pos != std::string::npos) { + chat_tpl.replace(slice_pos, slice_string.length(), replacement_slice_string); + } + jinja2::UserCallable slice_callable = jinja2::MakeCallable( + [](const jinja2::ValuesList& list, const int64_t start) { + if (list.empty()) + return jinja2::Value(); + jinja2::ValuesList result; + int64_t stop = list.size(); + int64_t step = 1; + for (int64_t i = start; i < stop && i < list.size(); i += step) + { + result.push_back(list.at(i)); + } + + return jinja2::Value(result); + }, + jinja2::ArgInfo{"list"}, jinja2::ArgInfo{"start"} + ); + jinja2::TemplateEnv env; env.GetSettings().lstripBlocks = true; env.GetSettings().trimBlocks = true; jinja2::Template tpl(&env); - tpl.Load(chat_template.empty() ? 
m_chat_template : chat_template); + tpl.Load(chat_tpl); jinja2::ValuesList jinja_messages; jinja2::ValuesMap jinja_message; @@ -346,16 +419,22 @@ class Tokenizer::TokenizerImpl { {"eos_token", m_eos_token}, {"pad_token", m_pad_token}, {"add_generation_prompt", add_generation_prompt}, + {"slice", slice_callable}, }; - return tpl.RenderAsString(params).value(); - } - + try { + return tpl.RenderAsString(params).value(); + } catch (const std::exception& error) { + OPENVINO_THROW("Chat template for the current model is not supported by Jinja2Cpp. " + "Please apply template manually to your prompt before calling generate. " + "For exmaple: user{user_prompt}model"); + } + } }; -Tokenizer::Tokenizer(const std::string& tokenizer_path) { +Tokenizer::Tokenizer(const std::string& tokenizer_path, const ov::AnyMap& plugin_config) { ScopedVar env_manager(tokenizers_relative_to_genai().string()); - m_pimpl = std::make_shared(tokenizer_path); + m_pimpl = std::make_shared(tokenizer_path, plugin_config); } TokenizedInputs Tokenizer::encode(const std::string prompt) { @@ -410,7 +489,7 @@ std::string Tokenizer::get_eos_token() const { return m_pimpl->m_eos_token; } -std::string Tokenizer::apply_chat_template(const ChatHistory& history, +std::string Tokenizer::apply_chat_template(ChatHistory history, bool add_generation_prompt, const std::string& chat_template) const { return m_pimpl->apply_chat_template(history, add_generation_prompt, chat_template); diff --git a/src/cpp/src/tokenizers_path.hpp b/src/cpp/src/tokenizers_path.hpp index d2c3ef3b5e..4899daccc4 100644 --- a/src/cpp/src/tokenizers_path.hpp +++ b/src/cpp/src/tokenizers_path.hpp @@ -86,7 +86,7 @@ std::filesystem::path tokenizers_relative_to_genai() { // was already defined. 
class ScopedVar { public: - bool was_already_set; + bool was_already_set{false}; static constexpr char ENVIRONMENT_VARIABLE_NAME[] = "OPENVINO_TOKENIZERS_PATH_GENAI"; explicit ScopedVar(const std::string& environment_variable_value) { #ifdef _WIN32 diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index 410d311d84..2bc20186be 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -155,6 +155,36 @@ ov::Tensor extend_attention(ov::Tensor attention_mask) { return new_atten_mask; } +ov::genai::GenerationConfig from_config_json_if_exists(const std::filesystem::path& model_path) { + auto config_file_path = model_path / "generation_config.json"; + if (std::filesystem::exists(config_file_path)) { + return ov::genai::GenerationConfig((config_file_path).string()); + } else { + return ov::genai::GenerationConfig{}; + } +} + +ov::genai::StreamerVariant get_streamer_from_map(const ov::AnyMap& config_map) { + ov::genai::StreamerVariant streamer = std::monostate(); + + if (config_map.count(STREAMER_ARG_NAME)) { + auto any_val = config_map.at(STREAMER_ARG_NAME); + if (any_val.is>()) { + streamer = any_val.as>(); + } else if (any_val.is>()) { + streamer = any_val.as>(); + } + } + return streamer; +} + +ov::genai::OptionalGenerationConfig get_config_from_map(const ov::AnyMap& config_map) { + if (config_map.count(CONFIG_ARG_NAME)) + return config_map.at(CONFIG_ARG_NAME).as(); + else + return std::nullopt; +} + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 452dc451f9..25acc1c87f 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -3,6 +3,8 @@ #pragma once +#include "openvino/genai/llm_pipeline.hpp" + #include #include @@ -65,6 +67,15 @@ void read_anymap_param(const ov::AnyMap& config_map, const std::string& name, T& } } +const std::string STREAMER_ARG_NAME = "streamer"; +const std::string CONFIG_ARG_NAME = "generation_config"; + +ov::genai::GenerationConfig 
from_config_json_if_exists(const std::filesystem::path& model_path); + +ov::genai::StreamerVariant get_streamer_from_map(const ov::AnyMap& config_map); + +ov::genai::OptionalGenerationConfig get_config_from_map(const ov::AnyMap& config_map); + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/docs/BUILD.md b/src/docs/BUILD.md index 472b2e8842..548309b7d7 100644 --- a/src/docs/BUILD.md +++ b/src/docs/BUILD.md @@ -1,99 +1,207 @@ # How to Build OpenVINO™ GenAI -## Build for Linux Systems +> **NOTE**: There is a known Python API issue with `ov::Tensor`. The issue is reproduced when building OpenVINO GenAI from sources while using OpenVINO from archives. Using `ov::Tensor` with OpenVINO GenAI fails. Possible errors: `TypeError: generate(): incompatible function arguments.`, `TypeError: __init__(): incompatible constructor arguments.`, `TypeError: Unregistered type : ov::Tensor`. +The preferred approach is to build both OpenVINO and OpenVINO GenAI from sources using the same build environment. Or to install prebuilt OpenVINO GenAI from [distribution channels](https://docs.openvino.ai/2024/get-started/install-openvino.html). -### Software Requirements +## Software Requirements + +### Linux - [CMake](https://cmake.org/download/) 3.23 or higher - GCC 7.5 or higher - Python 3.8 or higher +- Git -### Build Instructions +### Windows -1. 
Clone OpenVINO GenAI repository and init submodules: +- [CMake](https://cmake.org/download/) 3.23 or higher +- Microsoft Visual Studio 2019 or higher, version 16.3 or later +- Python 3.8 or higher +- Git for Windows +- [NSIS](https://sourceforge.net/projects/nsis/) + +### macOS + +- [CMake](https://cmake.org/download/) 3.23 or higher +- [brew](https://brew.sh/) package manager to install additional dependencies: + ```sh + brew install coreutils scons + ``` +- Clang compiler and other command line tools from Xcode 10.1 or higher: + ```sh + xcode-select --install + ``` +- Python 3.8 or higher +- Git + + +## Build Instructions + +### Build OpenVINO, OpenVINO Tokenizers, and OpenVINO GenAI From Source + +1. Build and install OpenVINO from sources following the [instructions](https://github.com/openvinotoolkit/openvino/wiki#how-to-build). +The path to the OpenVINO install directory is referred as `` throughout the document. +2. Clone OpenVINO GenAI repository and init submodules: ```sh git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git cd openvino.genai ``` -2. Download OpenVINO archive and install dependencies: - +3. Set up the environment: + + #### Option 1 - using OpenVINO `setupvars` script: + + Linux and macOS: + ```sh + source /setupvars.sh + ``` + + Windows Command Prompt: + ```cmd + call \setupvars.bat + ``` + + Windows PowerShell: + ```cmd + . 
/setupvars.ps1 + ``` + + #### Option 2 - setting environment variables manually: + + Linux: + ```sh + export OpenVINO_DIR=/runtime + export PYTHONPATH=/python:./build/:$PYTHONPATH + export LD_LIBRARY_PATH=/runtime/lib/intel64:$LD_LIBRARY_PATH + ``` + + macOS: ```sh - mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/linux/l_openvino_toolkit_ubuntu20_2024.2.0.dev20240524_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz - sudo ./ov/install_dependencies/install_openvino_dependencies.sh + export OpenVINO_DIR=/runtime + export PYTHONPATH=/python:./build/:$PYTHONPATH + export DYLD_LIBRARY_PATH=/runtime/lib/intel64:$DYLD_LIBRARY_PATH + ``` + + Windows Command Prompt: + ```cmd + set OpenVINO_DIR=\runtime + set PYTHONPATH=\python;%CD%\build;%PYTHONPATH% + set OPENVINO_LIB_PATHS=\bin\intel64\Release;%OPENVINO_LIB_PATHS% + set PATH=%OPENVINO_LIB_PATHS%;%PATH% ``` -3. Build the project: + + Windows PowerShell: + ```sh + $env:OpenVINO_DIR = "\runtime" + $env:PYTHONPATH = "\python;$PWD\build;$env:PYTHONPATH" + $env:OPENVINO_LIB_PATHS = "\bin\intel64\Release;$env:OPENVINO_LIB_PATHS" + $env:PATH = "$env:OPENVINO_LIB_PATHS;$env:PATH" + ``` + +4. Build the project: ```sh - source ./ov/setupvars.sh cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --target package -j - cmake --install ./build/ --config Release --prefix ov + cmake --build ./build/ --config Release -j ``` -## Build for Windows Systems +5. 
Install OpenVINO GenAI: -### Software Requirements + #### Option 1 - using cmake: + + The following command will store built OpenVINO GenAI artifacts along with OpenVINO in ``: -- [CMake](https://cmake.org/download/) 3.23 or higher -- Microsoft Visual Studio 2019 or higher, version 16.3 or later -- Python 3.8 or higher -- Git for Windows + ```sh + cmake --install ./build/ --config Release --prefix + ``` -### Build Instructions + #### Option 2 - setting paths to built OpenVINO GenAI artifacts manually: -1. Clone OpenVINO GenAI repository and init submodules: + The path to the OpenVINO GenAI root directory is referred as `` throughout the document. + + Linux: ```sh - git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git - cd openvino.genai + export PYTHONPATH=/build/:$PYTHONPATH + export LD_LIBRARY_PATH=/build/openvino_genai/:$LD_LIBRARY_PATH ``` -2. Download OpenVINO archive and install dependencies: - + + macOS: ```sh - mkdir ./ov/ - curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc1/windows/w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64.zip - unzip ov.zip - mklink /D ov w_openvino_toolkit_windows_2024.2.0.dev20240524_x86_64 + export PYTHONPATH=/build:$PYTHONPATH + export DYLD_LIBRARY_PATH=/build/openvino_genai:$DYLD_LIBRARY_PATH ``` -3. 
Build the project: + + Windows Command Prompt: + ```cmd + set PYTHONPATH=\build;%PYTHONPATH% + set PATH=\build\openvino_genai;%PATH% + ``` + + Windows PowerShell: ```sh - call ov\setupvars.bat - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --target package -j - cmake --install ./build/ --config Release --prefix ov + $env:PYTHONPATH = "\build;$env:PYTHONPATH" + $env:PATH = "\build\openvino_genai;$env:PATH" ``` -## Build for macOS Systems +To optimize the package size, you can reduce the ICU (International Components for Unicode) data size when OpenVINO Tokenizers are built as a submodule of OpenVINO GenAI. +For more information please refer to the [OpenVINO Tokenizers instructions](https://github.com/openvinotoolkit/openvino_tokenizers?tab=readme-ov-file#reducing-the-icu-data-size). -### Software Requirements -- [CMake](https://cmake.org/download/) 3.23 or higher -- [brew](https://brew.sh/) package manager to install additional dependencies: +### Build OpenVINO GenAI Wheel + +1. Build and install OpenVINO from sources following the [instructions](https://github.com/openvinotoolkit/openvino/wiki#how-to-build) +The path to the openvino install directory is referred as throughout the document. +2. Clone OpenVINO GenAI repository and init submodules: ```sh - brew install coreutils scons + git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git + cd openvino.genai ``` -- Clang compiler and other command line tools from Xcode 10.1 or higher: +2. Set up the environment: + - Option 1 - using OpenVINO `setupvars.sh` script: + ```sh + source /setupvars.sh + ``` + - Option 2 - setting environment variables manually: + ```sh + export OpenVINO_DIR=/runtime + export PYTHONPATH=/python:./build/:$PYTHONPATH + export LD_LIBRARY_PATH=/runtime/lib/intel64:$LD_LIBRARY_PATH + ``` +3. Upgrade pip to ensure you have the latest version: ```sh - xcode-select --install + python -m pip install --upgrade pip + ``` +4. 
Build the wheel in the `dist` directory: + ```sh + python -m pip wheel . -w dist/ --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release ``` -- Python 3.8 or higher -### Build Instructions +### Install OpenVINO GenAI From Source 1. Clone OpenVINO GenAI repository and init submodules: ```sh git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git cd openvino.genai ``` -2. Download OpenVINO archive and install dependencies: - +2. Set up the environment: + - Option 1 - using OpenVINO `setupvars.sh` script: + ```sh + source <INSTALL_DIR>/setupvars.sh + ``` + - Option 2 - setting environment variables manually: + ```sh + export OpenVINO_DIR=<INSTALL_DIR>/runtime + export PYTHONPATH=<INSTALL_DIR>/python:./build/:$PYTHONPATH + export LD_LIBRARY_PATH=<INSTALL_DIR>/runtime/lib/intel64:$LD_LIBRARY_PATH + ``` +3. Upgrade pip to ensure you have the latest version: ```sh - mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/pre-release/2024.2.0rc2/macos/m_openvino_toolkit_macos_12_6_2024.2.0.dev20240529_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + python -m pip install --upgrade pip ``` -3. Build the project: +4. Install the package directly from source: ```sh - source ./ov/setupvars.sh - cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - cmake --build ./build/ --config Release --target package -j - cmake --install ./build/ --config Release --prefix ov + python -m pip install . + ``` +5. 
To verify the installation, run a simple Python script: + ```python + import openvino_genai + print(openvino_genai.__version__) ``` diff --git a/text_generation/causal_lm/cpp/continuous_batching/README.md b/src/docs/DOCKER.md similarity index 94% rename from text_generation/causal_lm/cpp/continuous_batching/README.md rename to src/docs/DOCKER.md index 12a41aff28..38764864ad 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/README.md +++ b/src/docs/DOCKER.md @@ -33,7 +33,7 @@ cd /workspace/openvino.genai/ cd /path/to/openvino mkdir build cd build -cmake -DENABLE_PYTHON=ON -DCMAKE_BUILD_TYPE={ov_build_type} .. +cmake -DCMAKE_BUILD_TYPE={ov_build_type} .. make -j24 ``` 2. Set PYTHONPATH, LD_LIBRARY_PATH and OpenVINO_DIR environment variables: @@ -47,7 +47,7 @@ export OpenVINO_DIR=/path/to/openvino/{ov_build_type} cd /path/to/openvino.genai/thirdparty/openvino_tokenizers mkdir build cd build -cmake -DENABLE_PYTHON=ON -DCMAKE_BUILD_TYPE={ov_build_type} .. +cmake -DCMAKE_BUILD_TYPE={ov_build_type} .. make -j24 ``` 4. Create virtual environment to generate models and run python tests: @@ -71,7 +71,7 @@ mkdir /path/to/openvino.genai/text_generation/causal_lm/cpp/continuous_batching/ 7. Generate cmake project: ``` cd build -cmake -DCMAKE_BUILD_TYPE=Debug -DOpenVINO_DIR=/path/to/openvino/build -DENABLE_APPS=ON -DENABLE_PYTHON=ON .. +cmake -DCMAKE_BUILD_TYPE=Debug -DOpenVINO_DIR=/path/to/openvino/build .. ``` 8. Build the project ``` diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md index 0e6099db03..3eb2af17b4 100644 --- a/src/docs/SUPPORTED_MODELS.md +++ b/src/docs/SUPPORTED_MODELS.md @@ -45,7 +45,19 @@ - LlamaForCausalLM + LlamaForCausalLM + Llama 3 + + + + + + Llama 2
    diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 0350ff5bb0..39ecce8989 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -3,33 +3,25 @@ # include(FetchContent) + FetchContent_Declare( pybind11 URL https://github.com/pybind/pybind11/archive/refs/tags/v2.12.0.tar.gz URL_HASH SHA256=bf8f242abd1abcd375d516a7067490fb71abd79519a282d22b6e4d19282185a7 ) FetchContent_GetProperties(pybind11) +# search for FindPython3.cmake instead of legacy modules +set(PYBIND11_FINDPYTHON ON) + if(NOT pybind11_POPULATED) FetchContent_Populate(pybind11) - # search for FindPython3.cmake instead of legacy modules - set(PYBIND11_FINDPYTHON ON) - # the following two calls are required for cross-compilation - if(OpenVINODeveloperPackage_DIR) - ov_find_python3(REQUIRED) - ov_detect_python_module_extension() - else() - if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) - find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module) - else() - find_package(Python3 REQUIRED COMPONENTS Interpreter Development) - endif() - endif() add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) endif() pybind11_add_module(py_generate_pipeline py_generate_pipeline.cpp) target_link_libraries(py_generate_pipeline PRIVATE openvino::genai) set_target_properties(py_generate_pipeline PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" ) file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/__init__.py" DESTINATION "${CMAKE_BINARY_DIR}/openvino_genai/") @@ -64,10 +56,10 @@ endif() install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/__init__.py" "${CMAKE_BINARY_DIR}/openvino_genai/__version__.py" DESTINATION python/openvino_genai - COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}) + COMPONENT pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR}) install(TARGETS py_generate_pipeline LIBRARY DESTINATION python/openvino_genai - 
COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}) + COMPONENT pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR}) install(FILES "${CMAKE_BINARY_DIR}/openvino_genai/__version__.py" DESTINATION openvino_genai diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py index deeabb0399..da4ec24529 100644 --- a/src/python/openvino_genai/__init__.py +++ b/src/python/openvino_genai/__init__.py @@ -17,16 +17,8 @@ DecodedResults, EncodedResults, StreamerBase, - StopCriteria + StopCriteria, + ContinuousBatchingPipeline, + GenerationResult, + SchedulerConfig, ) - -__all__ = [ - 'LLMPipeline', - 'Tokenizer', - 'GenerationConfig', - 'TokenizedInputs', - 'DecodedResults', - 'EncodedResults', - 'StreamerBase', - 'StopCriteria' -] diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 3b93be9c49..a429fc4801 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -6,18 +6,25 @@ #include #include #include +#include "openvino/genai/continuous_batching_pipeline.hpp" #include "openvino/genai/llm_pipeline.hpp" #include #include "../cpp/src/tokenizers_path.hpp" namespace py = pybind11; using ov::genai::ChatHistory; +using ov::genai::ContinuousBatchingPipeline; using ov::genai::DecodedResults; using ov::genai::EncodedInputs; using ov::genai::EncodedResults; using ov::genai::GenerationConfig; +using ov::genai::GenerationResult; using ov::genai::LLMPipeline; +using ov::genai::MeanStdPair; using ov::genai::OptionalGenerationConfig; +using ov::genai::PerfMetrics; +using ov::genai::RawPerfMetrics; +using ov::genai::SchedulerConfig; using ov::genai::StopCriteria; using ov::genai::StreamerBase; using ov::genai::StreamerVariant; @@ -25,8 +32,23 @@ using ov::genai::StringInputs; using ov::genai::TokenizedInputs; using ov::genai::Tokenizer; - -PYBIND11_MAKE_OPAQUE(std::vector); +// When StreamerVariant is used utf-8 decoding is done by pybind and can lead to exception 
on incomplete texts. +// Therefore strings decoding should be handled with PyUnicode_DecodeUTF8(..., "replace") to not throw errors. +using PyBindStreamerVariant = std::variant, std::shared_ptr, std::monostate>; + +template struct overloaded : Ts... { using Ts::operator()...; }; +template overloaded(Ts...) -> overloaded; + +template +std::vector get_ms(const T& instance, U T::*member) { + // Converts c++ duration to float so that it can be used in Python. + std::vector res; + const auto& durations = instance.*member; + res.reserve(durations.size()); + std::transform(durations.begin(), durations.end(), std::back_inserter(res), + [](const auto& duration) { return duration.count(); }); + return res; +} namespace { @@ -80,11 +102,94 @@ auto generation_config_docstring = R"( repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. )"; +auto raw_perf_metrics_docstring = R"( + Structure with raw performance metrics for each generation before any statistics are calculated. + + :param generate_durations: Durations for each generate call in microseconds. + :type generate_durations: List[MicroSeconds] + + :param tokenization_durations: Durations for the tokenization process in microseconds. + :type tokenization_durations: List[MicroSeconds] + + :param detokenization_durations: Durations for the detokenization process in microseconds. + :type detokenization_durations: List[MicroSeconds] + + :param m_times_to_first_token: Times to the first token for each call in microseconds. + :type m_times_to_first_token: List[MicroSeconds] + + :param m_new_token_times: Time points for each new token generated. + :type m_new_token_times: List[TimePoint] + + :param m_batch_sizes: Batch sizes for each generate call. + :type m_batch_sizes: List[int] + + :param m_durations: Total durations for each generate call in microseconds. + :type m_durations: List[MicroSeconds] + + :param num_generated_tokens: Total number of tokens generated. 
+ :type num_generated_tokens: int + + :param num_input_tokens: Total number of tokens in the input prompt. + :type num_input_tokens: int +)"; + +auto perf_metrics_docstring = R"( + Holds performance metrics for each generate call. + + PerfMetrics holds fields with mean and standard deviations for the following metrics: + - Time To the First Token (TTFT), ms + - Time per Output Token (TPOT), ms/token + - Generate total duration, ms + - Tokenization duration, ms + - Detokenization duration, ms + - Throughput, tokens/s + + Additional fields include: + - Load time, ms + - Number of generated tokens + - Number of tokens in the input prompt + + Preferable way to access values is via get functions. Getters calculate mean and std values from raw_metrics and return pairs. + If mean and std were already calculated, getters return cached values. + + :param get_load_time: Returns the load time in milliseconds. + :type get_load_time: float + + :param get_num_generated_tokens: Returns the number of generated tokens. + :type get_num_generated_tokens: int + + :param get_num_input_tokens: Returns the number of tokens in the input prompt. + :type get_num_input_tokens: int + + :param get_ttft: Returns the mean and standard deviation of TTFT. + :type get_ttft: MeanStdPair + + :param get_tpot: Returns the mean and standard deviation of TPOT. + :type get_tpot: MeanStdPair + + :param get_throughput: Returns the mean and standard deviation of throughput. + :type get_throughput: MeanStdPair + + :param get_generate_duration: Returns the mean and standard deviation of generate duration. + :type get_generate_duration: MeanStdPair + + :param get_tokenization_duration: Returns the mean and standard deviation of tokenization duration. + :type get_tokenization_duration: MeanStdPair + + :param get_detokenization_duration: Returns the mean and standard deviation of detokenization duration. 
+ :type get_detokenization_duration: MeanStdPair + + :param raw_metrics: A structure of RawPerfMetrics type that holds raw metrics. + :type raw_metrics: RawPerfMetrics +)"; + +OptionalGenerationConfig update_config_from_kwargs(const OptionalGenerationConfig& config, const py::kwargs& kwargs) { + if(!config.has_value() && kwargs.empty()) + return std::nullopt; -GenerationConfig update_config_from_kwargs(const OptionalGenerationConfig& config_, const py::kwargs& kwargs) { - GenerationConfig config; - if(config_.has_value()) - config = *config_; + GenerationConfig res_config; + if(config.has_value()) + res_config = *config; for (const auto& item : kwargs) { std::string key = py::cast(item.first); @@ -96,48 +201,75 @@ GenerationConfig update_config_from_kwargs(const OptionalGenerationConfig& confi // Some HF configs can have parameters for methods currenly unsupported in ov_genai // but if their values are not set / None, then this should not block // us from reading such configs, e.g. {"typical_p": None, 'top_p': 1.0,...} - return config; + return res_config; } if (key == "max_new_tokens") { - config.max_new_tokens = py::cast(item.second); + res_config.max_new_tokens = py::cast(item.second); } else if (key == "max_length") { - config.max_length = py::cast(item.second); + res_config.max_length = py::cast(item.second); } else if (key == "ignore_eos") { - config.ignore_eos = py::cast(item.second); + res_config.ignore_eos = py::cast(item.second); } else if (key == "num_beam_groups") { - config.num_beam_groups = py::cast(item.second); + res_config.num_beam_groups = py::cast(item.second); } else if (key == "num_beams") { - config.num_beams = py::cast(item.second); + res_config.num_beams = py::cast(item.second); } else if (key == "diversity_penalty") { - config.diversity_penalty = py::cast(item.second); + res_config.diversity_penalty = py::cast(item.second); } else if (key == "length_penalty") { - config.length_penalty = py::cast(item.second); + res_config.length_penalty = 
py::cast(item.second); } else if (key == "num_return_sequences") { - config.num_return_sequences = py::cast(item.second); + res_config.num_return_sequences = py::cast(item.second); } else if (key == "no_repeat_ngram_size") { - config.no_repeat_ngram_size = py::cast(item.second); + res_config.no_repeat_ngram_size = py::cast(item.second); } else if (key == "stop_criteria") { - config.stop_criteria = py::cast(item.second); + res_config.stop_criteria = py::cast(item.second); } else if (key == "temperature") { - config.temperature = py::cast(item.second); + res_config.temperature = py::cast(item.second); } else if (key == "top_p") { - config.top_p = py::cast(item.second); + res_config.top_p = py::cast(item.second); } else if (key == "top_k") { - config.top_k = py::cast(item.second); + res_config.top_k = py::cast(item.second); } else if (key == "do_sample") { - config.do_sample = py::cast(item.second); + res_config.do_sample = py::cast(item.second); } else if (key == "repetition_penalty") { - config.repetition_penalty = py::cast(item.second); + res_config.repetition_penalty = py::cast(item.second); } else if (key == "eos_token_id") { - config.eos_token_id = py::cast(item.second); + res_config.set_eos_token_id(py::cast(item.second)); } else { throw(std::invalid_argument("'" + key + "' is incorrect GenerationConfig parameter name. 
" "Use help(openvino_genai.GenerationConfig) to get list of acceptable parameters.")); } } - return config; + return res_config; +} + +ov::Any py_object_to_any(const py::object& py_obj); + +bool py_object_is_any_map(const py::object& py_obj) { + if (!py::isinstance(py_obj)) { + return false; + } + auto dict = py::cast(py_obj); + return std::all_of(dict.begin(), dict.end(), [&](const std::pair& elem) { + return py::isinstance(elem.first); + }); +} + +ov::AnyMap py_object_to_any_map(const py::object& py_obj) { + OPENVINO_ASSERT(py_object_is_any_map(py_obj), "Unsupported attribute type."); + ov::AnyMap return_value = {}; + for (auto& item : py::cast(py_obj)) { + std::string key = py::cast(item.first); + py::object value = py::cast(item.second); + if (py_object_is_any_map(value)) { + return_value[key] = py_object_to_any_map(value); + } else { + return_value[key] = py_object_to_any(value); + } + } + return return_value; } ov::Any py_object_to_any(const py::object& py_obj) { @@ -202,6 +334,8 @@ ov::Any py_object_to_any(const py::object& py_obj) { } // OV types + } else if (py_object_is_any_map(py_obj)) { + return py_object_to_any_map(py_obj); } else if (py::isinstance(py_obj)) { return py::cast(py_obj); } else if (py::isinstance(py_obj)) { @@ -254,7 +388,7 @@ py::list handle_utf8_results(const std::vector& decoded_res) { py::list res; for (const auto s: decoded_res) { PyObject* py_s = PyUnicode_DecodeUTF8(s.data(), s.length(), "replace"); - res.append(py_s); + res.append(py::reinterpret_steal(py_s)); } return res; } @@ -263,30 +397,54 @@ py::object call_common_generate( LLMPipeline& pipe, const std::variant>& inputs, const OptionalGenerationConfig& config, - const StreamerVariant& streamer, + const PyBindStreamerVariant& py_streamer, const py::kwargs& kwargs ) { auto updated_config = update_config_from_kwargs(config, kwargs); + py::object results; EncodedInputs tensor_data; - - if (auto data = std::get_if(&inputs)) { - return py::cast(pipe.generate(*data, updated_config, 
streamer)); - } else if (auto data = std::get_if(&inputs)) { - return py::cast(pipe.generate(*data, updated_config, streamer)); - } else if (auto data = std::get_if(&inputs)) { - DecodedResults res = pipe.generate(*data, updated_config, streamer); + StreamerVariant streamer = std::monostate(); + + std::visit(overloaded { + [&streamer](const std::function& py_callback){ + // Wrap python streamer with manual utf-8 decoding. Do not rely + // on pybind automatic decoding since it raises exceptions on incomplete strings. + auto callback_wrapped = [&py_callback](std::string subword) -> bool { + auto py_str = PyUnicode_DecodeUTF8(subword.data(), subword.length(), "replace"); + return py_callback(py::reinterpret_borrow(py_str)); + }; + streamer = callback_wrapped; + }, + [&streamer](std::shared_ptr streamer_cls){ + streamer = streamer_cls; + }, + [](std::monostate none){ /*streamer is already a monostate */ } + }, py_streamer); + + // Call suitable generate overload for each type of input. + std::visit(overloaded { + [&](ov::Tensor ov_tensor) { + results = py::cast(pipe.generate(ov_tensor, updated_config, streamer)); + }, + [&](TokenizedInputs tokenized_input) { + results = py::cast(pipe.generate(tokenized_input, updated_config, streamer)); + }, + [&](std::string string_input) { + DecodedResults res = pipe.generate(string_input, updated_config, streamer); // If input was a string return a single string otherwise return DecodedResults. - if (updated_config.num_return_sequences == 1) { - return handle_utf8_results(res.texts)[0]; + if (updated_config.has_value() && (*updated_config).num_return_sequences == 1) { + results = py::cast(handle_utf8_results(res.texts)[0]); } else { - return py::cast(res); + results = py::cast(res); } - } else if (auto data = std::get_if>(&inputs)) { + }, + [&](std::vector string_input) { // For DecodedResults texts getter already handles utf8 decoding. 
- return py::cast(pipe.generate(*data, updated_config, streamer)); - } else { - throw std::invalid_argument("Provided input is neither encoded tokens, neither string"); - } + results = py::cast(pipe.generate(string_input, updated_config, streamer)); + }}, + inputs); + + return results; } std::string ov_tokenizers_module_path() { @@ -312,6 +470,17 @@ class ConstructableStreamer: public StreamerBase { } }; +std::ostream& operator << (std::ostream& stream, const GenerationResult& generation_result) { + stream << generation_result.m_request_id << std::endl; + const bool has_scores = !generation_result.m_scores.empty(); + for (size_t i = 0; i < generation_result.m_generation_ids.size(); ++i) { + stream << "{ "; + if (has_scores) + stream << generation_result.m_scores[i] << ", "; + stream << generation_result.m_generation_ids[i] << " }" << std::endl; + } + return stream << std::endl; +} } // namespace @@ -352,7 +521,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { [](LLMPipeline& pipe, const std::variant>& inputs, const OptionalGenerationConfig& generation_config, - const StreamerVariant& streamer, + const PyBindStreamerVariant& streamer, const py::kwargs& kwargs ) { return call_common_generate(pipe, inputs, generation_config, streamer, kwargs); @@ -368,7 +537,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { [](LLMPipeline& pipe, const std::variant>& inputs, const OptionalGenerationConfig& generation_config, - const StreamerVariant& streamer, + const PyBindStreamerVariant& streamer, const py::kwargs& kwargs ) { return call_common_generate(pipe, inputs, generation_config, streamer, kwargs); @@ -380,22 +549,22 @@ PYBIND11_MODULE(py_generate_pipeline, m) { ) .def("get_tokenizer", &LLMPipeline::get_tokenizer) - .def("start_chat", &LLMPipeline::start_chat) + .def("start_chat", &LLMPipeline::start_chat, py::arg("system_message") = "") .def("finish_chat", &LLMPipeline::finish_chat) .def("get_generation_config", &LLMPipeline::get_generation_config, py::return_value_policy::copy) 
.def("set_generation_config", &LLMPipeline::set_generation_config); // Binding for Tokenizer - py::class_(m, "Tokenizer", + py::class_(m, "Tokenizer", R"(openvino_genai.Tokenizer object is used to initialize Tokenizer if it's located in a different path than the main model.)") - .def(py::init([](const std::string& tokenizer_path) { + .def(py::init([](const std::string& tokenizer_path, const std::map& plugin_config) { ScopedVar env_manager(ov_tokenizers_module_path()); - return std::make_unique(tokenizer_path); - }), py::arg("tokenizer_path")) + return std::make_unique(tokenizer_path, properties_to_any_map(plugin_config)); + }), py::arg("tokenizer_path"), py::arg("plugin_config") = ov::AnyMap({})) - .def("encode", [](Tokenizer& tok, std::vector& prompts){ return tok.encode(prompts); }, + .def("encode", [](Tokenizer& tok, std::vector& prompts) { return tok.encode(prompts); }, py::arg("prompts"), R"(Encodes a list of prompts into tokenized inputs.)") @@ -405,8 +574,8 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def( "decode", - [](Tokenizer& tok, std::vector& tokens){ - return handle_utf8_results({tok.decode(tokens)})[0]; + [](Tokenizer& tok, std::vector& tokens) -> py::str { + return handle_utf8_results({tok.decode(tokens)})[0]; }, py::arg("tokens"), R"(Decode a sequence into a string prompt.)" @@ -414,7 +583,7 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def( "decode", - [](Tokenizer& tok, ov::Tensor& tokens){ + [](Tokenizer& tok, ov::Tensor& tokens) -> py::list { return handle_utf8_results(tok.decode(tokens)); }, py::arg("tokens"), @@ -422,14 +591,14 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def( "decode", - [](Tokenizer& tok, std::vector>& tokens){ + [](Tokenizer& tok, std::vector>& tokens) -> py::list{ return handle_utf8_results(tok.decode(tokens)); }, py::arg("tokens"), R"(Decode a batch of tokens into a list of string prompt.)") .def("apply_chat_template", [](Tokenizer& tok, - const ChatHistory& history, + ChatHistory history, bool 
add_generation_prompt, const std::string& chat_template) { return tok.apply_chat_template(history, add_generation_prompt, chat_template); @@ -460,10 +629,11 @@ PYBIND11_MODULE(py_generate_pipeline, m) { // Binding for GenerationConfig py::class_(m, "GenerationConfig", generation_config_docstring) .def(py::init(), py::arg("json_path"), "path where generation_config.json is stored") - .def(py::init([](py::kwargs kwargs) { return update_config_from_kwargs(GenerationConfig(), kwargs); })) + .def(py::init([](py::kwargs kwargs) { return *update_config_from_kwargs(GenerationConfig(), kwargs); })) .def_readwrite("max_new_tokens", &GenerationConfig::max_new_tokens) .def_readwrite("max_length", &GenerationConfig::max_length) .def_readwrite("ignore_eos", &GenerationConfig::ignore_eos) + .def_readwrite("min_new_tokens", &GenerationConfig::min_new_tokens) .def_readwrite("num_beam_groups", &GenerationConfig::num_beam_groups) .def_readwrite("num_beams", &GenerationConfig::num_beams) .def_readwrite("diversity_penalty", &GenerationConfig::diversity_penalty) @@ -476,13 +646,56 @@ PYBIND11_MODULE(py_generate_pipeline, m) { .def_readwrite("top_k", &GenerationConfig::top_k) .def_readwrite("do_sample", &GenerationConfig::do_sample) .def_readwrite("repetition_penalty", &GenerationConfig::repetition_penalty) - .def_readwrite("eos_token_id", &GenerationConfig::eos_token_id); + .def_readwrite("eos_token_id", &GenerationConfig::eos_token_id) + .def_readwrite("presence_penalty", &GenerationConfig::presence_penalty) + .def_readwrite("frequency_penalty", &GenerationConfig::frequency_penalty) + .def_readwrite("rng_seed", &GenerationConfig::rng_seed) + .def("set_eos_token_id", &GenerationConfig::set_eos_token_id) + .def("is_beam_search", &GenerationConfig::is_beam_search); py::class_(m, "DecodedResults") .def(py::init<>()) .def_property_readonly("texts", [](const DecodedResults &dr) { return handle_utf8_results(dr); }) .def_readonly("scores", &DecodedResults::scores) - .def("__str__", 
&DecodedResults::operator std::string);; + .def_readonly("perf_metrics", &DecodedResults::perf_metrics) + .def("__str__", &DecodedResults::operator std::string); + + py::class_(m, "RawPerfMetrics", raw_perf_metrics_docstring) + .def(py::init<>()) + .def_readonly("generate_durations", &RawPerfMetrics::generate_durations) + .def_property_readonly("tokenization_durations", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::tokenization_durations); + }) + .def_property_readonly("detokenization_durations", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::detokenization_durations); + }) + .def_property_readonly("m_times_to_first_token", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::m_times_to_first_token); + }) + .def_property_readonly("m_durations", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::m_durations); + }) + .def_readonly("m_batch_sizes", &RawPerfMetrics::m_batch_sizes) + .def_readonly("num_generated_tokens", &RawPerfMetrics::num_generated_tokens) + .def_readonly("num_input_tokens", &RawPerfMetrics::num_input_tokens); + + py::class_(m, "MeanStdPair") + .def(py::init<>()) + .def_readonly("mean", &MeanStdPair::mean) + .def_readonly("std", &MeanStdPair::std); + + py::class_(m, "PerfMetrics", perf_metrics_docstring) + .def(py::init<>()) + .def("get_generate_duration", &PerfMetrics::get_generate_duration) + .def("get_tokenization_duration", &PerfMetrics::get_tokenization_duration) + .def("get_detokenization_duration", &PerfMetrics::get_detokenization_duration) + .def("get_throughput", &PerfMetrics::get_throughput) + .def("get_tpot", &PerfMetrics::get_tpot) + .def("get_ttft", &PerfMetrics::get_ttft) + .def("get_load_time", &PerfMetrics::get_load_time) + .def("__add__", &PerfMetrics::operator+) + .def("__iadd__", &PerfMetrics::operator+=) + .def_readonly("raw_metrics", &PerfMetrics::raw_metrics); py::class_(m, "TokenizedInputs") .def(py::init()) @@ -491,10 +704,86 @@ 
PYBIND11_MODULE(py_generate_pipeline, m) { py::class_(m, "EncodedResults") .def_readonly("tokens", &EncodedResults::tokens) - .def_readonly("scores", &EncodedResults::scores); + .def_readonly("scores", &EncodedResults::scores) + .def_readonly("perf_metrics", &EncodedResults::perf_metrics); py::class_>(m, "StreamerBase") // Change the holder form unique_ptr to shared_ptr .def(py::init<>()) .def("put", &StreamerBase::put) .def("end", &StreamerBase::end); + + py::class_(m, "GenerationResult") + .def(py::init<>()) + .def_readonly("m_request_id", &GenerationResult::m_request_id) + .def_property("m_generation_ids", + [](GenerationResult &r) -> py::list { + py::list res; + for (auto s: r.m_generation_ids) { + PyObject* py_s = PyUnicode_DecodeUTF8(s.data(), s.length(), "replace"); + res.append(py_s); + } + return res; + }, + [](GenerationResult &r, std::vector &generation_ids) { + r.m_generation_ids = generation_ids; + }) + .def_readwrite("m_scores", &GenerationResult::m_scores) + .def("__repr__", + [](const GenerationResult &r) -> py::str{ + std::stringstream stream; + stream << ""; + std::string str = stream.str(); + PyObject* py_s = PyUnicode_DecodeUTF8(str.data(), str.length(), "replace"); + return py::reinterpret_steal(py_s); + } + ) + .def("get_generation_ids", + [](GenerationResult &r) -> py::list { + py::list res; + for (auto s: r.m_generation_ids) { + PyObject* py_s = PyUnicode_DecodeUTF8(s.data(), s.length(), "replace"); + res.append(py_s); + } + return res; + }); + + py::class_(m, "SchedulerConfig") + .def(py::init<>()) + .def_readwrite("max_num_batched_tokens", &SchedulerConfig::max_num_batched_tokens) + .def_readwrite("num_kv_blocks", &SchedulerConfig::num_kv_blocks) + .def_readwrite("cache_size", &SchedulerConfig::cache_size) + .def_readwrite("block_size", &SchedulerConfig::block_size) + .def_readwrite("dynamic_split_fuse", &SchedulerConfig::dynamic_split_fuse) + .def_readwrite("max_num_seqs", &SchedulerConfig::max_num_seqs) + 
.def_readwrite("enable_prefix_caching", &SchedulerConfig::enable_prefix_caching); + + py::class_(m, "ContinuousBatchingPipeline") + .def(py::init([](const std::string& model_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& llm_plugin_config, const std::map& tokenizer_plugin_config) { + ScopedVar env_manager(ov_tokenizers_module_path()); + return std::make_unique(model_path, scheduler_config, device, properties_to_any_map(llm_plugin_config), properties_to_any_map(tokenizer_plugin_config)); + }), py::arg("model_path"), py::arg("scheduler_config"), py::arg("device") = "CPU", py::arg("llm_plugin_config") = ov::AnyMap({}), py::arg("tokenizer_plugin_config") = ov::AnyMap({})) + .def(py::init([](const std::string& model_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& plugin_config) { + ScopedVar env_manager(ov_tokenizers_module_path()); + return std::make_unique(model_path, tokenizer, scheduler_config, device, properties_to_any_map(plugin_config)); + }), py::arg("model_path"), py::arg("tokenizer"), py::arg("scheduler_config"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap({})) + .def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer) + .def("get_config", &ContinuousBatchingPipeline::get_config) + .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request)) + .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request)) + .def("step", &ContinuousBatchingPipeline::step) + .def("has_non_finished_requests", &ContinuousBatchingPipeline::has_non_finished_requests) + .def( + "generate", + py::overload_cast&, const std::vector&, const ov::genai::StreamerVariant&>(&ContinuousBatchingPipeline::generate), + py::arg("input_ids"), + py::arg("sampling_params"), + py::arg("streamer") = std::monostate{} + ) + .def( + "generate", + py::overload_cast&, const std::vector&, const 
ov::genai::StreamerVariant&>(&ContinuousBatchingPipeline::generate), + py::arg("prompts"), + py::arg("sampling_params"), + py::arg("streamer") = std::monostate{} + ); } diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt new file mode 100644 index 0000000000..083b911416 --- /dev/null +++ b/tests/cpp/CMakeLists.txt @@ -0,0 +1,12 @@ +FetchContent_Declare( + googletest + URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip +) +FetchContent_MakeAvailable(googletest) +set(TEST_TARGET_NAME "tests_continuous_batching") +file(GLOB tests_src + "*.cpp" +) +add_executable(${TEST_TARGET_NAME} ${tests_src}) +target_link_libraries(${TEST_TARGET_NAME} PUBLIC openvino::genai gtest_main) +target_include_directories(${TEST_TARGET_NAME} PRIVATE "${PROJECT_SOURCE_DIR}/src/cpp/src") diff --git a/tests/cpp/block_manager.cpp b/tests/cpp/block_manager.cpp new file mode 100644 index 0000000000..4621c184f5 --- /dev/null +++ b/tests/cpp/block_manager.cpp @@ -0,0 +1,79 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "openvino/runtime/core.hpp" +#include "openvino/genai/continuous_batching_pipeline.hpp" +#include "openvino/genai/generation_config.hpp" +#include "sequence_group.hpp" +#include "scheduler.hpp" + +TEST(TestBlockManager, general_test) { + ov::genai::BlockManager bm = ov::genai::BlockManager(6, false, 4); + ov::genai::TokenIds prompt_ids; + + ov::genai::SequenceGroup::Ptr sequence_group = std::make_shared( + 0, + ov::Tensor(ov::element::i64, { + prompt_ids.size()}, prompt_ids.data()), + ov::genai::beam_search(), + 4); + auto sequence = sequence_group->get_not_finished_sequences()[0]; + bm.allocate(sequence, 6); + auto seq_id = sequence->get_id(); + EXPECT_TRUE(bm.has_block_table(seq_id)); + EXPECT_EQ(bm.get_block_table(seq_id).size(), 6); + EXPECT_EQ(bm.num_free_blocks(), 0); + + bm.free_sequence_partially_single_runnning_sequence(seq_id, 4); + 
EXPECT_EQ(bm.get_block_table(seq_id).size(), 2); + EXPECT_EQ(bm.num_free_blocks(), 4); + + bm.free_sequence(seq_id); + EXPECT_FALSE(bm.has_block_table(seq_id)); + EXPECT_EQ(bm.num_free_blocks(), 6); + + bm.allocate(sequence, 2); + bm.fork_sequence(seq_id, 1); + EXPECT_TRUE(bm.has_block_table(1)); + EXPECT_EQ(bm.get_block_table(1).back()->get_references_count(), 2); + +} + +TEST(TestBlockManager, required_blocks_count) { + ov::genai::BlockManager bm = ov::genai::BlockManager(8, false, 4); + + std::vector tokens = {0,1,2,3,4}; + ov::genai::SequenceGroup::Ptr sequence_group = std::make_shared( + 0, + ov::Tensor(ov::element::i64, { + tokens.size()}, tokens.data()), + ov::genai::beam_search(), + 4); + sequence_group->schedule_tokens(5); + auto required_blocks = bm.required_blocks_count(sequence_group); + EXPECT_EQ(required_blocks, 2); + EXPECT_TRUE(bm.can_append_slots(sequence_group)); + bm.append_slots(sequence_group); + EXPECT_EQ(bm.num_free_blocks(), 6); + + sequence_group->finish_iteration(); + auto sequence_to_fork = sequence_group->get_running_sequences()[0]; + for (size_t i = 0; i < 4; ++i) { + const auto forked_sequence = sequence_group->fork_sequence(sequence_to_fork); + bm.fork_sequence(sequence_to_fork->get_id(), forked_sequence->get_id()); + } + sequence_group->schedule_tokens(1); + required_blocks = bm.required_blocks_count(sequence_group); + EXPECT_EQ(required_blocks, 4); + EXPECT_TRUE(bm.can_append_slots(sequence_group)); + bm.append_slots(sequence_group); + EXPECT_EQ(bm.num_free_blocks(), 2); + sequence_group->finish_iteration(); + + sequence_group->schedule_tokens(3); + required_blocks = bm.required_blocks_count(sequence_group); + EXPECT_EQ(required_blocks, 5); + EXPECT_FALSE(bm.can_append_slots(sequence_group)); +} \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/cache_manager.cpp b/tests/cpp/cache_manager.cpp similarity index 65% rename from 
text_generation/causal_lm/cpp/continuous_batching/library/src/tests/cache_manager.cpp rename to tests/cpp/cache_manager.cpp index 2fa4790933..edfa483eda 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/cache_manager.cpp +++ b/tests/cpp/cache_manager.cpp @@ -10,20 +10,19 @@ TEST(TestCacheManager, general_test) { ov::Core core; - SchedulerConfig scheduler_config = { - .max_num_batched_tokens = 32, - .num_kv_blocks = 0, - .cache_size = 2, - .block_size = 32, - .max_num_seqs = 2, - }; + ov::genai::SchedulerConfig scheduler_config; + scheduler_config.max_num_batched_tokens = 32; + scheduler_config.num_kv_blocks = 0; + scheduler_config.cache_size = 2; + scheduler_config.block_size = 32; + scheduler_config.max_num_seqs = 2; const std::string device = "CPU"; - DeviceConfig device_config(core, scheduler_config, "CPU"); + ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); size_t num_decoder_layers = 12; device_config.set_model_params(12, 64, num_decoder_layers); - auto cache_manager = std::make_shared(device_config); + auto cache_manager = std::make_shared(device_config); size_t allocated_bytes = 0; for (size_t i = 0; i < num_decoder_layers; i++) { diff --git a/tests/cpp/evictor.cpp b/tests/cpp/evictor.cpp new file mode 100644 index 0000000000..9867dfa2b5 --- /dev/null +++ b/tests/cpp/evictor.cpp @@ -0,0 +1,54 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "openvino/runtime/core.hpp" +#include "scheduler.hpp" +#include +#include + +TEST(TestEvictor, general_test) { + ov::genai::Evictor evictor; + auto block0 = std::make_shared(0); + block0->set_hash(77, 1); + std::this_thread::sleep_until(std::chrono::system_clock::now() + std::chrono::seconds(1)); + auto block1 = std::make_shared(1); + block1->set_hash(56, 2); + std::this_thread::sleep_until(std::chrono::system_clock::now() + std::chrono::seconds(1)); + auto block2 = std::make_shared(2); + 
block2->set_hash(23, 3); + std::this_thread::sleep_until(std::chrono::system_clock::now() + std::chrono::seconds(1)); + evictor.add(block0->get_hash(), block0); + evictor.add(block1->get_hash(), block1); + evictor.add(block2->get_hash(), block2); + EXPECT_EQ(evictor.num_blocks(), 3); + + auto block = evictor.get_block(56); + EXPECT_EQ(block->get_index(), 1); + EXPECT_EQ(block->get_hash(), 56); + EXPECT_EQ(block->get_references_count(), 1); + EXPECT_EQ(evictor.num_blocks(), 2); + + EXPECT_EQ(evictor.get_block(44), nullptr); + EXPECT_EQ(evictor.num_blocks(), 2); + + EXPECT_EQ(evictor.get_lru_block()->get_index(), 0); + EXPECT_EQ(evictor.num_blocks(), 1); + + auto block3 = std::make_shared(7); + block3->set_hash(12, 4); + std::this_thread::sleep_until(std::chrono::system_clock::now() + std::chrono::seconds(1)); + auto block4 = std::make_shared(10); + block4->set_hash(99, 5); + std::this_thread::sleep_until(std::chrono::system_clock::now() + std::chrono::seconds(1)); + evictor.add(block3->get_hash(), block3); + evictor.add(block4->get_hash(), block4); + block2->set_timestamp(std::chrono::system_clock::now()); + + EXPECT_EQ(evictor.get_lru_block()->get_index(), 7); + EXPECT_EQ(evictor.get_lru_block()->get_index(), 10); + EXPECT_EQ(evictor.get_lru_block()->get_index(), 2); + EXPECT_EQ(evictor.get_lru_block(), nullptr); + EXPECT_EQ(evictor.num_blocks(), 0); +} diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/generate_config.cpp b/tests/cpp/generate_config.cpp similarity index 65% rename from text_generation/causal_lm/cpp/continuous_batching/library/src/tests/generate_config.cpp rename to tests/cpp/generate_config.cpp index 1774553313..05180fb1a4 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/generate_config.cpp +++ b/tests/cpp/generate_config.cpp @@ -3,24 +3,27 @@ #include #include -#include "generation_config.hpp" +#include "openvino/genai/generation_config.hpp" TEST(GenerationConfigTest, 
invalid_temperature) { - GenerationConfig config; + ov::genai::GenerationConfig config; + config.max_new_tokens = 20; config.temperature = -0.1; config.do_sample = true; EXPECT_THROW(config.validate(), ov::Exception); } TEST(GenerationConfigTest, valid_temperature) { - GenerationConfig config; + ov::genai::GenerationConfig config; + config.max_new_tokens = 20; config.do_sample = true; config.temperature = 0.1; EXPECT_NO_THROW(config.validate()); } TEST(GenerationConfigTest, invalid_top_p) { - GenerationConfig config; + ov::genai::GenerationConfig config; + config.max_new_tokens = 20; config.do_sample = true; config.top_p = -0.5; EXPECT_THROW(config.validate(), ov::Exception); @@ -29,14 +32,16 @@ TEST(GenerationConfigTest, invalid_top_p) { } TEST(GenerationConfigTest, valid_top_p) { - GenerationConfig config; + ov::genai::GenerationConfig config; + config.max_new_tokens = 20; config.do_sample = true; config.top_p = 0.1; EXPECT_NO_THROW(config.validate()); } TEST(GenerationConfigTest, invalid_repeatition_penalty) { - GenerationConfig config; + ov::genai::GenerationConfig config; + config.max_new_tokens = 20; config.do_sample = true; config.repetition_penalty = -3.0; EXPECT_THROW(config.validate(), ov::Exception); @@ -45,16 +50,18 @@ TEST(GenerationConfigTest, invalid_repeatition_penalty) { } TEST(GenerationConfigTest, valid_repeatition_penalty) { - GenerationConfig config; + ov::genai::GenerationConfig config; + config.max_new_tokens = 20; config.do_sample = true; config.repetition_penalty = 1.8; EXPECT_NO_THROW(config.validate()); - config.repetition_penalty = 0.0; + config.repetition_penalty = 0.1; EXPECT_NO_THROW(config.validate()); } TEST(GenerationConfigTest, invalid_presence_penalty) { - GenerationConfig config; + ov::genai::GenerationConfig config; + config.max_new_tokens = 20; config.do_sample = true; config.presence_penalty = 3.0; EXPECT_THROW(config.validate(), ov::Exception); @@ -63,7 +70,8 @@ TEST(GenerationConfigTest, invalid_presence_penalty) { } 
TEST(GenerationConfigTest, valid_presence_penalty) { - GenerationConfig config; + ov::genai::GenerationConfig config; + config.max_new_tokens = 20; config.do_sample = true; config.presence_penalty = 1.8; EXPECT_NO_THROW(config.validate()); @@ -71,20 +79,22 @@ TEST(GenerationConfigTest, valid_presence_penalty) { EXPECT_NO_THROW(config.validate()); } -TEST(GenerationConfigTest, invalid_frequence_penalty) { - GenerationConfig config; +TEST(GenerationConfigTest, invalid_frequency_penalty) { + ov::genai::GenerationConfig config; + config.max_new_tokens = 20; config.do_sample = true; - config.frequence_penalty = 3.0; + config.frequency_penalty = 3.0; EXPECT_THROW(config.validate(), ov::Exception); - config.frequence_penalty = -3.1; + config.frequency_penalty = -3.1; EXPECT_THROW(config.validate(), ov::Exception); } -TEST(GenerationConfigTest, valid_frequence_penalty) { - GenerationConfig config; +TEST(GenerationConfigTest, valid_frequency_penalty) { + ov::genai::GenerationConfig config; + config.max_new_tokens = 20; config.do_sample = true; - config.frequence_penalty = 1.8; + config.frequency_penalty = 1.8; EXPECT_NO_THROW(config.validate()); - config.frequence_penalty = -2.0; + config.frequency_penalty = -2.0; EXPECT_NO_THROW(config.validate()); } diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/logit_filtering.cpp b/tests/cpp/logit_filtering.cpp similarity index 63% rename from text_generation/causal_lm/cpp/continuous_batching/library/src/tests/logit_filtering.cpp rename to tests/cpp/logit_filtering.cpp index d3696a01e9..afedfe6685 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/logit_filtering.cpp +++ b/tests/cpp/logit_filtering.cpp @@ -18,13 +18,14 @@ using TemperatureTransformTest = testing::TestWithParam rhs.m_log_prob; }); - for (size_t i = 0; i < test_result.size(); i++) { - EXPECT_NEAR(test_result[i].m_log_prob, test_struct.expected_output[i].m_log_prob, 1e-6); - EXPECT_EQ(test_result[i].m_index, 
test_struct.expected_output[i].m_index); + transform.apply(logits); + ASSERT_EQ(logits.size(), test_struct.expected_output.size()); + std::sort(logits.begin(), logits.end(), [](const Token& lhs, const Token& rhs) {return lhs.m_log_prob > rhs.m_log_prob; }); + for (size_t i = 0; i < logits.size(); i++) { + EXPECT_NEAR(logits[i].m_log_prob, test_struct.expected_output[i].m_log_prob, 1e-6); + EXPECT_EQ(logits[i].m_index, test_struct.expected_output[i].m_index); } } @@ -51,12 +52,13 @@ using TopPFilteringTest = testing::TestWithParam; TEST_P(TopPFilteringTest, FilterResultEqualToReference) { auto test_struct = GetParam(); + auto logits = test_struct.input; auto transform = TopPFilter(test_struct.top_p); - auto test_result = transform.apply(test_struct.input); - ASSERT_EQ(test_result.size(), test_struct.expected_output.size()); - for (size_t i = 0; i < test_result.size(); i++) { - EXPECT_NEAR(test_result[i].m_log_prob, test_struct.expected_output[i].m_log_prob, 1e-6); - EXPECT_EQ(test_result[i].m_index, test_struct.expected_output[i].m_index); + transform.apply(logits); + ASSERT_EQ(logits.size(), test_struct.expected_output.size()); + for (size_t i = 0; i < logits.size(); i++) { + EXPECT_NEAR(logits[i].m_log_prob, test_struct.expected_output[i].m_log_prob, 1e-6); + EXPECT_EQ(logits[i].m_index, test_struct.expected_output[i].m_index); } } @@ -83,12 +85,13 @@ using TopKFilteringTest = testing::TestWithParam; TEST_P(TopKFilteringTest, FilterResultEqualToReference) { auto test_struct = GetParam(); + auto logits = test_struct.input; auto transform = TopKFilter(test_struct.top_k); - auto test_result = transform.apply(test_struct.input); - ASSERT_EQ(test_result.size(), test_struct.expected_output.size()); - for (size_t i = 0; i < test_result.size(); i++) { - EXPECT_NEAR(test_result[i].m_log_prob, test_struct.expected_output[i].m_log_prob, 1e-6); - EXPECT_EQ(test_result[i].m_index, test_struct.expected_output[i].m_index); + transform.apply(logits); + ASSERT_EQ(logits.size(), 
test_struct.expected_output.size()); + for (size_t i = 0; i < logits.size(); i++) { + EXPECT_NEAR(logits[i].m_log_prob, test_struct.expected_output[i].m_log_prob, 1e-6); + EXPECT_EQ(logits[i].m_index, test_struct.expected_output[i].m_index); } } @@ -103,38 +106,9 @@ INSTANTIATE_TEST_SUITE_P(VariousInputs, TopKFilteringTest, testing::ValuesIn(TOP_K_TRANSFORM_TEST_CASES)); - -struct ProbabilityNormalizeTransformTestStruct { - std::vector input; - std::vector expected_output; -}; - -using ProbabilityNormalizeTransformTest = testing::TestWithParam; - -TEST_P(ProbabilityNormalizeTransformTest, TransformResultEqualToReference) { - auto test_struct = GetParam(); - auto transform = ProbabilityNormalizeTransform(); - auto test_result = transform.apply(test_struct.input); - ASSERT_EQ(test_result.size(), test_struct.expected_output.size()); - for (size_t i = 0; i < test_result.size(); i++) { - EXPECT_NEAR(test_result[i].m_log_prob, test_struct.expected_output[i].m_log_prob, 1e-6); - EXPECT_EQ(test_result[i].m_index, test_struct.expected_output[i].m_index); - } -} - - -const std::vector NORMALIZE_TRANSFORM_TEST_CASES = { - { { {0.090031, 2}, {0.244728, 0}, {0.665241, 1} }, { {0.090031, 2}, {0.244728, 0}, {0.665241, 1} } }, - { { {0.05, 0}, {0.03, 1}, {0.02, 2} }, { {0.5, 0}, {0.3, 1}, {0.2, 2} } }, -}; - -INSTANTIATE_TEST_SUITE_P(VariousInputs, - ProbabilityNormalizeTransformTest, - testing::ValuesIn(NORMALIZE_TRANSFORM_TEST_CASES)); - struct RepetitionPenaltyTransformTestStruct { float penalty; - std::vector input_logits; + std::vector input; TokenIds input_ids; std::vector expected_output; }; @@ -143,12 +117,13 @@ using RepetitionPenaltyTransformTest = testing::TestWithParam input {{43.0f, 0}}; + EXPECT_THROW(transform.apply(input, {1337}), ov::Exception); + input = {{18.0f, 0}}; + EXPECT_THROW(transform.apply(input, {0, -1}), ov::Exception); } struct FrequencyPenaltyTransformTestStruct { float penalty; - std::vector input_logits; + std::vector input; TokenIds input_ids; 
std::vector expected_output; }; @@ -195,12 +172,13 @@ using FrequencyPenaltyTransformTest = testing::TestWithParam input {{43.0f, 0}}; + EXPECT_THROW(transform.apply(input, {1337}), ov::Exception); + input = {{18.0f, 0}}; + EXPECT_THROW(transform.apply(input, {0, -1}), ov::Exception); } struct PresencePenaltyTransformTestStruct { float penalty; - std::vector input_logits; + std::vector input; TokenIds input_ids; std::vector expected_output; }; @@ -248,12 +228,13 @@ using PresencePenaltyTransformTest = testing::TestWithParam input {{43.0f, 0}}; + EXPECT_THROW(transform.apply(input, {1337}), ov::Exception); + input = {{18.0f, 0}}; + EXPECT_THROW(transform.apply(input, {0, -1}), ov::Exception); } struct EOSPenaltyTransformTestStruct { size_t eos_token_id; - std::vector input_logits; + std::vector input; std::vector expected_output; }; @@ -299,12 +282,13 @@ using EOSPenaltyTransformTest = testing::TestWithParam::max()); - auto test_result = transform.apply(test_struct.input_logits); - ASSERT_EQ(test_result.size(), test_struct.expected_output.size()); - for (size_t i = 0; i < test_result.size(); i++) { - EXPECT_NEAR(test_result[i].m_log_prob, test_struct.expected_output[i].m_log_prob, 1e-6); - EXPECT_EQ(test_result[i].m_index, test_struct.expected_output[i].m_index); + transform.apply(logits); + ASSERT_EQ(logits.size(), test_struct.expected_output.size()); + for (size_t i = 0; i < logits.size(); i++) { + EXPECT_NEAR(logits[i].m_log_prob, test_struct.expected_output[i].m_log_prob, 1e-6); + EXPECT_EQ(logits[i].m_index, test_struct.expected_output[i].m_index); } } diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/scheduler.cpp b/tests/cpp/scheduler.cpp similarity index 72% rename from text_generation/causal_lm/cpp/continuous_batching/library/src/tests/scheduler.cpp rename to tests/cpp/scheduler.cpp index 73186f34e0..0a4b04f880 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/scheduler.cpp +++ 
b/tests/cpp/scheduler.cpp @@ -4,10 +4,12 @@ #include #include "openvino/runtime/core.hpp" -#include "continuous_batching_pipeline.hpp" +#include "openvino/genai/continuous_batching_pipeline.hpp" +#include "openvino/genai/generation_config.hpp" #include "sequence_group.hpp" #include "scheduler.hpp" -#include "generation_config.hpp" + +using namespace ov::genai; void clear_finished_sequences(std::vector& requests) { auto new_end = std::remove_if(requests.begin(), requests.end(), [] (SequenceGroup::CPtr seq_group) -> bool { @@ -16,34 +18,28 @@ void clear_finished_sequences(std::vector& requests) { requests.erase(new_end, requests.end()); } - TEST(TestScheduler, general_test) { - std::vector configs{ - SchedulerConfig { - .max_num_batched_tokens = 32, - .num_kv_blocks = 6, - .block_size = 4, - .dynamic_split_fuse = false, - .max_num_seqs = 5, - }, - SchedulerConfig { - .max_num_batched_tokens = 32, - .num_kv_blocks = 6, - .block_size = 4, - .dynamic_split_fuse = true, - .max_num_seqs = 5, - } - }; + std::array configs = {SchedulerConfig(), SchedulerConfig()}; + configs.at(0).max_num_batched_tokens = 32; + configs.at(0).num_kv_blocks = 6; + configs.at(0).block_size = 4; + configs.at(0).dynamic_split_fuse = false; + configs.at(0).max_num_seqs = 5; + configs.at(1).max_num_batched_tokens = 32; + configs.at(1).num_kv_blocks = 6; + configs.at(1).block_size = 4; + configs.at(1).dynamic_split_fuse = true; + configs.at(1).max_num_seqs = 5; for (auto scheduler_config: configs) { std::vector tokens = {0,1,2,3,4,5,6,7}; SequenceGroup::Ptr sequence_group1 = std::make_shared(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), - GenerationConfig::greedy(), scheduler_config.block_size); + ov::genai::greedy(), scheduler_config.block_size); auto idx0 = (*sequence_group1)[0]->get_id(); SequenceGroup::Ptr sequence_group2 = std::make_shared(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), - GenerationConfig::greedy(), scheduler_config.block_size); + 
ov::genai::greedy(), scheduler_config.block_size); auto idx1 = (*sequence_group2)[0]->get_id(); SequenceGroup::Ptr sequence_group3 = std::make_shared(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), - GenerationConfig::greedy(), scheduler_config.block_size); + ov::genai::greedy(), scheduler_config.block_size); auto idx2 = (*sequence_group3)[0]->get_id(); std::vector requests = {sequence_group1, sequence_group2, sequence_group3}; @@ -114,29 +110,24 @@ TEST(TestScheduler, general_test) { } TEST(TestScheduler, test_append_slots_considers_all_sequences) { - std::vector configs{ - SchedulerConfig { - .max_num_batched_tokens = 32, - .num_kv_blocks = 5, - .block_size = 4, - .dynamic_split_fuse = false, - .max_num_seqs = 5, - }, - SchedulerConfig { - .max_num_batched_tokens = 32, - .num_kv_blocks = 5, - .block_size = 4, - .dynamic_split_fuse = true, - .max_num_seqs = 5, - } - }; + std::array configs = {SchedulerConfig(), SchedulerConfig()}; + configs.at(0).max_num_batched_tokens = 32; + configs.at(0).num_kv_blocks = 5; + configs.at(0).block_size = 4; + configs.at(0).dynamic_split_fuse = false; + configs.at(0).max_num_seqs = 5; + configs.at(1).max_num_batched_tokens = 32; + configs.at(1).num_kv_blocks = 5; + configs.at(1).block_size = 4; + configs.at(1).dynamic_split_fuse = true; + configs.at(1).max_num_seqs = 5; for (auto scheduler_config: configs) { std::vector tokens = {0,1,2,3,4,5,6,7}; SequenceGroup::Ptr sequence_group1 = std::make_shared(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), - GenerationConfig::greedy(), scheduler_config.block_size); + ov::genai::greedy(), scheduler_config.block_size); auto idx0 = (*sequence_group1)[0]->get_id(); SequenceGroup::Ptr sequence_group2 = std::make_shared(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), - GenerationConfig::greedy(), scheduler_config.block_size); + ov::genai::greedy(), scheduler_config.block_size); auto idx1 = (*sequence_group2)[0]->get_id(); std::vector requests = 
{sequence_group1, sequence_group2}; @@ -184,30 +175,25 @@ TEST(TestScheduler, test_append_slots_considers_all_sequences) { TEST(TestScheduler, test_partial_preemption) { - std::vector configs{ - SchedulerConfig { - .max_num_batched_tokens = 32, - .num_kv_blocks = 6, - .block_size = 4, - .dynamic_split_fuse = false, - .max_num_seqs = 5, - }, - SchedulerConfig { - .max_num_batched_tokens = 32, - .num_kv_blocks = 6, - .block_size = 4, - .dynamic_split_fuse = true, - .max_num_seqs = 5, - } - }; + std::array configs = {SchedulerConfig(), SchedulerConfig()}; + configs.at(0).max_num_batched_tokens = 32; + configs.at(0).num_kv_blocks = 6; + configs.at(0).block_size = 4; + configs.at(0).dynamic_split_fuse = false; + configs.at(0).max_num_seqs = 5; + configs.at(1).max_num_batched_tokens = 32; + configs.at(1).num_kv_blocks = 6; + configs.at(1).block_size = 4; + configs.at(1).dynamic_split_fuse = true; + configs.at(1).max_num_seqs = 5; for (auto scheduler_config: configs) { std::vector tokens1 = {0,1,2,3,4,5,6,7,8,9,10}; SequenceGroup::Ptr sequence_group1 = std::make_shared(0, ov::Tensor(ov::element::i64, {tokens1.size()}, tokens1.data()), - GenerationConfig::greedy(), scheduler_config.block_size); + ov::genai::greedy(), scheduler_config.block_size); std::vector tokens2 = {0,1,2,3,4,5,6,7}; auto idx0 = (*sequence_group1)[0]->get_id(); SequenceGroup::Ptr sequence_group2 = std::make_shared(1, ov::Tensor(ov::element::i64, {tokens2.size()}, tokens2.data()), - GenerationConfig::greedy(), scheduler_config.block_size); + ov::genai::greedy(), scheduler_config.block_size); auto idx1 = (*sequence_group2)[0]->get_id(); std::vector requests = {sequence_group1, sequence_group2}; @@ -281,29 +267,24 @@ TEST(TestScheduler, test_partial_preemption) { } TEST(TestScheduler, test_partially_preempted_prompt) { - std::vector configs{ - SchedulerConfig { - .max_num_batched_tokens = 32, - .num_kv_blocks = 6, - .block_size = 4, - .dynamic_split_fuse = false, - .max_num_seqs = 5, - }, - SchedulerConfig 
{ - .max_num_batched_tokens = 32, - .num_kv_blocks = 6, - .block_size = 4, - .dynamic_split_fuse = true, - .max_num_seqs = 5, - } - }; + std::array configs = {SchedulerConfig(), SchedulerConfig()}; + configs.at(0).max_num_batched_tokens = 32; + configs.at(0).num_kv_blocks = 6; + configs.at(0).block_size = 4; + configs.at(0).dynamic_split_fuse = false; + configs.at(0).max_num_seqs = 5; + configs.at(1).max_num_batched_tokens = 32; + configs.at(1).num_kv_blocks = 6; + configs.at(1).block_size = 4; + configs.at(1).dynamic_split_fuse = true; + configs.at(1).max_num_seqs = 5; for (auto scheduler_config: configs) { std::vector tokens = {0,1,2,3,4,5,6,7,8,9,10,11}; SequenceGroup::Ptr sequence_group1 = std::make_shared(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), - GenerationConfig::greedy(), scheduler_config.block_size); + ov::genai::greedy(), scheduler_config.block_size); auto idx0 = (*sequence_group1)[0]->get_id(); SequenceGroup::Ptr sequence_group2 = std::make_shared(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), - GenerationConfig::greedy(), scheduler_config.block_size); + ov::genai::greedy(), scheduler_config.block_size); auto idx1 = (*sequence_group2)[0]->get_id(); std::vector requests = {sequence_group1, sequence_group2}; @@ -385,3 +366,69 @@ TEST(TestScheduler, test_partially_preempted_prompt) { EXPECT_FALSE(scheduler.has_block_table(idx0)); } } + +TEST(TestScheduler, prefix_caching_test) { + std::array configs = {SchedulerConfig(), SchedulerConfig()}; + configs.at(0).max_num_batched_tokens = 32; + configs.at(0).num_kv_blocks = 100; + configs.at(0).block_size = 4; + configs.at(0).dynamic_split_fuse = false; + configs.at(0).max_num_seqs = 5; + configs.at(0).enable_prefix_caching = true; + configs.at(1).max_num_batched_tokens = 32; + configs.at(1).num_kv_blocks = 100; + configs.at(1).block_size = 4; + configs.at(1).dynamic_split_fuse = true; + configs.at(1).max_num_seqs = 5; + configs.at(1).enable_prefix_caching = true; + for 
(auto scheduler_config: configs) { + std::vector prompt_tokens = {0,1,2,3,4,5,6,7}; + std::vector histrory_tokens = {}; + // schedule prompt + Scheduler scheduler = Scheduler(scheduler_config); + + size_t chat_iterations = 10; + + for (size_t chat_iteration = 0; chat_iteration < chat_iterations; chat_iteration++) { + std::vector tokens = histrory_tokens; + tokens.insert(tokens.end(), prompt_tokens.begin(), prompt_tokens.end()); + SequenceGroup::Ptr sequence_group = std::make_shared(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), + ov::genai::greedy(), scheduler_config.block_size); + std::vector requests = {sequence_group}; + + auto out1 = scheduler.schedule(requests); + if (chat_iteration == 0) + EXPECT_EQ(out1.m_total_num_scheduled_tokens, prompt_tokens.size()); + else + EXPECT_EQ(out1.m_total_num_scheduled_tokens, prompt_tokens.size() + 1); + for (auto seq: requests) { + std::vector running_sequences = seq->get_running_sequences(); + running_sequences[0]->append_token(23, 0.7); + seq->finish_iteration(); + } + + // schedule generate + size_t num_generate_tokens = 10; + for (size_t i = 0; i < num_generate_tokens; i++) { + auto out2 = scheduler.schedule(requests); + EXPECT_EQ(out2.m_total_num_scheduled_tokens, 1); + for (auto seq: requests) { + std::vector running_sequences = seq->get_running_sequences(); + running_sequences[0]->append_token(16, 0.9); + seq->finish_iteration(); + } + } + + // finish sequence + auto sequence = requests[0]->get_running_sequences()[0]; + sequence->set_status(SequenceStatus::FINISHED); + auto idx0 = sequence->get_id(); + scheduler.free_sequence(idx0); + auto generated_ids = sequence->get_generated_ids(); + + histrory_tokens.insert(histrory_tokens.end(), prompt_tokens.begin(), prompt_tokens.end()); + histrory_tokens.insert(histrory_tokens.end(), generated_ids.begin(), generated_ids.end()); + } + } + +} diff --git a/tests/python_tests/README.md b/tests/python_tests/README.md new file mode 100644 index 
0000000000..e5381708de --- /dev/null +++ b/tests/python_tests/README.md @@ -0,0 +1,47 @@ +# OpenVINO™ GenAI Tests + +These tests aim to validate support for vanilla and continuous batching GenAI APIs. + +## Setup environment + +In order to run tests first of all build or install OpenVINO GenAI library, follow instructions [GenAI Library README](../../src/README.md). + +Then install requirements for tests: +```sh +pip install -r tests/python_tests/requirements.txt +``` + +## Run Tests + +```sh +python -m pytest tests/python_tests/ -m precommit +``` + +During the test run, downloaded HuggingFace (HF) models will be saved into the current directory. If you wish to place them somewhere else you can specify `GENAI_MODELS_PATH_PREFIX` environment variable, e.g. +```sh +GENAI_MODELS_PATH_PREFIX=$HOME/test_models python -m pytest tests/python_tests/ -m precommit +``` + +If you have built GenAI library by yourself instead of using wheel please set `PYTHONPATH` so that the tests can find the library, e.g. +```sh +PYTHONPATH=$PYTHONPATH:.../openvino.genai/build-Release/ python -m pytest tests/python_tests/ -m precommit +``` + +## Customise tests run + +Tests have `precommit` and `nightly` sets of models. `precommit` contains lightweight models which can be quickly inferred, `nightly` models are heavier and require more time for inference. If you wish to run specific tests only for nightly models, you can use `-k` option, for example to run only multibatch and chat tests: +```sh +python -m pytest tests/python_tests/ -m nightly -k "test_multibatch and test_chat" +``` + +If you wish to run all tests except beam search do the following: +```sh +python -m pytest tests/python_tests/ -m precommit -k "not test_beam_search" +``` + +Argument `--model_ids` can be used to run tests selectively only for specific models. 
HF model ids should be separated by space, e.g: +```sh +python -m pytest tests/python_tests/ -m nightly -k "test_multibatch" --model_ids "TinyLlama/TinyLlama-1.1B-Chat-v1.0 Qwen/Qwen2-0.5B-Instruct" +``` + +List of currently supported `nightly` and `precommit` models can be found in tests/python_tests/ov_genai_test_utils.py:get_models_list diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py b/tests/python_tests/common.py similarity index 94% rename from text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py rename to tests/python_tests/common.py index 10cfa5d4d2..0a94558274 100644 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/common.py +++ b/tests/python_tests/common.py @@ -7,7 +7,7 @@ from optimum.intel import OVModelForCausalLM from pathlib import Path -from py_continuous_batching import ContinuousBatchingPipeline, GenerationConfig, SchedulerConfig, GenerationResult +from openvino_genai import ContinuousBatchingPipeline, SchedulerConfig, GenerationResult, GenerationConfig from transformers import AutoTokenizer, AutoModelForCausalLM from transformers import GenerationConfig as HFGenerationConfig from typing import List, Tuple @@ -37,7 +37,7 @@ def get_greedy_with_penalties() -> GenerationConfig: generation_config = GenerationConfig() generation_config.num_return_sequences = 1 generation_config.presence_penalty = 2.0 - generation_config.frequence_penalty = 0.2 + generation_config.frequency_penalty = 0.2 generation_config.max_new_tokens = 30 return generation_config @@ -51,21 +51,21 @@ def get_greedy_with_min_and_max_tokens() -> GenerationConfig: def get_beam_search() -> GenerationConfig: generation_config = GenerationConfig() - generation_config.num_groups = 3 - generation_config.group_size = 2 + generation_config.num_beam_groups = 3 + generation_config.num_beams = 6 generation_config.max_new_tokens = 30 generation_config.num_return_sequences = 3 - generation_config.num_return_sequences = 
generation_config.num_groups * generation_config.group_size + generation_config.num_return_sequences = generation_config.num_beams return generation_config def get_beam_search_min_and_max_tokens() -> GenerationConfig: generation_config = GenerationConfig() - generation_config.num_groups = 3 - generation_config.group_size = 2 + generation_config.num_beam_groups = 3 + generation_config.num_beams = 6 generation_config.min_new_tokens = 15 generation_config.max_new_tokens = 30 generation_config.num_return_sequences = 3 - generation_config.num_return_sequences = generation_config.num_groups * generation_config.group_size + generation_config.num_return_sequences = generation_config.num_beams return generation_config def get_multinomial_temperature() -> GenerationConfig: @@ -79,7 +79,7 @@ def get_multinomial_temperature() -> GenerationConfig: def get_multinomial_temperature_and_num_return_sequence() -> GenerationConfig: generation_config = GenerationConfig() generation_config.do_sample = True - generation_config.temperature = 0.9 + generation_config.temperature = 0.7 generation_config.num_return_sequences = 3 generation_config.max_new_tokens = 30 return generation_config @@ -136,7 +136,7 @@ def get_multinomial_temperature_and_frequence_penalty() -> GenerationConfig: generation_config = GenerationConfig() generation_config.do_sample = True generation_config.temperature = 0.8 - generation_config.frequence_penalty = 0.5 + generation_config.frequency_penalty = 0.5 generation_config.num_return_sequences = 1 generation_config.max_new_tokens = 30 return generation_config @@ -158,7 +158,7 @@ def get_multinomial_max_and_min_token() -> GenerationConfig: multinomial.top_k = 20 multinomial.num_return_sequences = 3 multinomial.presence_penalty = 0.01 - multinomial.frequence_penalty = 0.1 + multinomial.frequency_penalty = 0.1 multinomial.min_new_tokens = 15 multinomial.max_new_tokens = 30 return multinomial @@ -218,10 +218,10 @@ def convert_to_hf( kwargs['pad_token_id'] = 
default_generation_config.pad_token_id kwargs['repetition_penalty'] = generation_config.repetition_penalty - if generation_config.num_groups * generation_config.group_size > 1: + if generation_config.num_beams > 1: # beam search case - kwargs['num_beam_groups'] = generation_config.num_groups - kwargs['num_beams'] = generation_config.num_groups * generation_config.group_size + kwargs['num_beam_groups'] = generation_config.num_beam_groups + kwargs['num_beams'] = generation_config.num_beams kwargs['diversity_penalty'] = generation_config.diversity_penalty kwargs['length_penalty'] = generation_config.length_penalty kwargs['no_repeat_ngram_size'] = generation_config.no_repeat_ngram_size @@ -257,7 +257,7 @@ def run_hugging_face( generation_result = GenerationResult() generation_result.m_generation_ids = all_text_batch # sequences_scores are available only for beam search case - if generation_config.is_beam_search: + if generation_config.is_beam_search(): generation_result.m_scores = [score for score in generate_outputs.sequences_scores] generation_results.append(generation_result) @@ -273,7 +273,7 @@ def run_continuous_batching( prompts: List[str], generation_configs : List[GenerationConfig] ) -> List[GenerationResult]: - pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config) + pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config, "CPU", {}, {}) output = pipe.generate(prompts, generation_configs) del pipe shutil.rmtree(model_path) @@ -293,7 +293,7 @@ def get_models_list(file_name: str): def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, generation_config: GenerationConfig): - if generation_config.is_beam_search: + if generation_config.is_beam_search(): assert len(hf_result.m_scores) == len(ov_result.m_scores) for hf_score, ov_score in zip(hf_result.m_scores, ov_result.m_scores): # Note, that for fp32 / fp16 models scores are different less than 0.001 diff --git 
a/tests/python_tests/conftest.py b/tests/python_tests/conftest.py index c97c231b7d..f98f47ecf3 100644 --- a/tests/python_tests/conftest.py +++ b/tests/python_tests/conftest.py @@ -1,10 +1,24 @@ +import pytest + + def pytest_make_parametrize_id(config, val, argname): if argname in ['prompt', 'prompts', 'batched_prompts']: return f'{val}' elif argname == 'model_descr': return f"{val[0]}" + elif argname == 'chat_config': + return f"{val[0]}" elif argname in ['stop_criteria', 'generation_config']: return str(val) elif isinstance(val, (int, float, str)): return f'{argname}={val}' return None + +def pytest_addoption(parser): + parser.addoption("--model_ids", help="Select models to run") + +def pytest_configure(config: pytest.Config): + marker = 'precommit' if config.getoption('-m') == 'precommit' else 'nightly' + pytest.run_marker = marker + pytest.selected_model_ids = config.getoption('--model_ids', default=None) + diff --git a/tests/python_tests/list_test_models.py b/tests/python_tests/list_test_models.py deleted file mode 100644 index d2d749446e..0000000000 --- a/tests/python_tests/list_test_models.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import pathlib -import os - -def models_list(): - model_ids = [ - "katuni4ka/tiny-random-phi3", - # "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - # "facebook/opt-125m", - - # "microsoft/phi-1_5", - # "microsoft/phi-2", - # "THUDM/chatglm2-6b", - # "Qwen/Qwen2-0.5B-Instruct", - # "Qwen/Qwen-7B-Chat", - # "Qwen/Qwen1.5-7B-Chat", - # "argilla/notus-7b-v1", - # "HuggingFaceH4/zephyr-7b-beta", - # "ikala/redpajama-3b-chat", - # "mistralai/Mistral-7B-v0.1", - - # "meta-llama/Llama-2-7b-chat-hf", - # "google/gemma-2b-it", - # "meta-llama/Llama-2-13b-chat-hf", - # "meta-llama/Meta-Llama-3-8B-Instruct", - # "openlm-research/open_llama_3b", - # "openlm-research/open_llama_3b_v2", - # "openlm-research/open_llama_7b", - # "databricks/dolly-v2-12b", - # "databricks/dolly-v2-3b", 
- ] - - prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) - return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids] - - -def chat_models_list(): - model_ids = [ - "Qwen/Qwen2-0.5B-Instruct", - # "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - # "meta-llama/Meta-Llama-3-8B-Instruct", - # "meta-llama/Llama-2-7b-chat-hf", - # "google/gemma-2b-it", - # "google/gemma-7b-it", - ] - - prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) - return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids] - - -if __name__ == "__main__": - for model_id, model_path in models_list(): - print(model_id, model_path) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/nightly b/tests/python_tests/models/nightly similarity index 100% rename from text_generation/causal_lm/cpp/continuous_batching/python/tests/models/nightly rename to tests/python_tests/models/nightly diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/precommit b/tests/python_tests/models/precommit similarity index 100% rename from text_generation/causal_lm/cpp/continuous_batching/python/tests/models/precommit rename to tests/python_tests/models/precommit diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models b/tests/python_tests/models/real_models similarity index 100% rename from text_generation/causal_lm/cpp/continuous_batching/python/tests/models/real_models rename to tests/python_tests/models/real_models diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py new file mode 100644 index 0000000000..98b791443b --- /dev/null +++ b/tests/python_tests/ov_genai_test_utils.py @@ -0,0 +1,226 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import pathlib +import os +import pytest +import functools +import openvino +import openvino_tokenizers +import openvino_genai as ov_genai +from typing 
import List, Tuple +from pathlib import Path +import shutil +import json + + +def get_models_list(): + precommit_models = [ + "katuni4ka/tiny-random-phi3", + ] + + nightly_models = [ + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "facebook/opt-125m", + "microsoft/phi-1_5", + "microsoft/phi-2", + "THUDM/chatglm2-6b", + "Qwen/Qwen2-0.5B-Instruct", + "Qwen/Qwen-7B-Chat", + "Qwen/Qwen1.5-7B-Chat", + "argilla/notus-7b-v1", + "HuggingFaceH4/zephyr-7b-beta", + "ikala/redpajama-3b-chat", + "mistralai/Mistral-7B-v0.1", + + # "meta-llama/Llama-2-7b-chat-hf", # Cannot be downloaded without access token + # "google/gemma-2b-it", # Cannot be downloaded without access token. + # "google/gemma-7b-it", # Cannot be downloaded without access token. + "meta-llama/Llama-2-13b-chat-hf", + "meta-llama/Meta-Llama-3-8B-Instruct", + "openlm-research/open_llama_3b", + "openlm-research/open_llama_3b_v2", + "openlm-research/open_llama_7b", + "databricks/dolly-v2-12b", + "databricks/dolly-v2-3b", + ] + + if pytest.run_marker == "precommit": + model_ids = precommit_models + else: + model_ids = nightly_models + + if pytest.selected_model_ids: + model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')] + # pytest.set_trace() + prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) + return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids] + + +def get_chat_models_list(): + precommit_models = [ + "Qwen/Qwen2-0.5B-Instruct", + ] + + nightly_models = [ + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "meta-llama/Meta-Llama-3-8B-Instruct", + "meta-llama/Llama-2-7b-chat-hf", + # "google/gemma-2b-it", # Cannot be downloaded without access token + # "google/gemma-7b-it", # Cannot be downloaded without access token + ] + + if pytest.run_marker == "precommit": + model_ids = precommit_models + else: + model_ids = nightly_models + + prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) + return [(model_id, prefix / 
model_id.split('/')[1]) for model_id in model_ids] + + +def get_chat_templates(): + # Returns chat templates saved in tokenizer_configs.py, + # but skips some models that currently are not processed correctly. + + skipped_models = { + # TODO: openchat/openchat_3.5 and berkeley-nest/Starling-LM-7B-alpha have the same template. + # Need to enable and unskip, since it's preset in continious batching and has >100 000 downloads. + "openchat/openchat-3.5-0106", + + # These models fail even on HF so no need to check if applying chat matches. + "vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy", + "codellama/CodeLlama-34b-Instruct-hf", + "deepseek-ai/deepseek-math-7b-rl", + "allenai/tulu-2-7b", + "alexsobolev/IcaroLM", + "tokyotech-llm/Swallow-7b-instruct-v0.1", + "bofenghuang/vigogne-2-7b-chat", + "OpenBuddy/openbuddy-mistral2-7b-v20.3-32k", + "AliAbdelrasheed/maqa_llama_4bit", + "stephenlzc/Mistral-7B-v0.3-Chinese-Chat-uncensored", + + # TODO: Need to support chat templates in more models: CVS-145963 + # Either ov_genai is unable to parse chat_template or results do not match with HF. 
+ "meta-llama/Meta-Llama-3-8B-Instruct", + "databricks/dbrx-instruct", # Chat template is not supported by Jinja2Cpp + "mosaicml/mpt-30b-chat", + "deepseek-ai/deepseek-coder-6.7b-instruct", # Chat template is not supported by Jinja2Cpp + "maldv/winter-garden-7b-alpha", # Chat template is not supported by Jinja2Cpp + "ishorn5/RTLCoder-Deepseek-v1.1", # Chat template is not supported by Jinja2Cpp + "openchat/openchat-3.5-0106", + "casperhansen/llama-3-70b-instruct-awq", + "TheBloke/deepseek-coder-33B-instruct-GPTQ", + "AI-Sweden-Models/gpt-sw3-356m-instruct", + "google/gemma-7b-it", + "THUDM/cogvlm2-llama3-chat-19B", + "KnutJaegersberg/internlm-20b-llama", + "maywell/Synatra-Mixtral-8x7B", + "MediaTek-Research/Breeze-7B-Instruct-v1_0", + "bofenghuang/vigostral-7b-chat", + "meetkai/functionary-small-v2.5", # Chat template is not supported by Jinja2Cpp + "openchat/openchat-3.6-8b-20240522", + "tenyx/TenyxChat-7B-v1", + "LoneStriker/TinyLlama-1.1B-32k-Instruct-3.0bpw-h6-exl2", + "yam-peleg/Hebrew-Gemma-11B-V2", + "shenzhi-wang/Llama3-8B-Chinese-Chat", # AssertionError + "nlpai-lab/KULLM3", + "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1", + "MediaTek-Research/Breeze-7B-Instruct-v0_1", + "shanchen/llama3-8B-slerp-biomed-chat-chinese", # AssertionError + "MLP-KTLim/llama-3-Korean-Bllossom-8B", + "aloobun/CosmicBun-8B", # Chat template is not supported by Jinja2Cpp + "codellama/CodeLlama-70b-Instruct-hf", + "gorilla-llm/gorilla-openfunctions-v2", # Chat template is not supported by Jinja2Cpp + "BramVanroy/Llama-2-13b-chat-dutch" + } + from tokenizer_configs import get_tokenizer_configs + return [(k, v) for k, v in get_tokenizer_configs().items() if k not in skipped_models] + + +@functools.lru_cache(1) +def read_model(params, **tokenizer_kwargs): + model_id, path = params + + from optimum.intel.openvino import OVModelForCausalLM + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + + if path.exists(): + opt_model 
= OVModelForCausalLM.from_pretrained(path, trust_remote_code=True, + compile=False, device='CPU') + else: + ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(tokenizer, + with_detokenizer=True, + **tokenizer_kwargs) + openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml") + openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml") + + # to store tokenizer config jsons with special tokens + tokenizer.save_pretrained(path) + + opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, + compile=False, device='CPU', load_in_8bit=False) + opt_model.generation_config.save_pretrained(path) + opt_model.config.save_pretrained(path) + opt_model.save_pretrained(path) + + return ( + model_id, + path, + tokenizer, + opt_model, + ov_genai.LLMPipeline(str(path), device='CPU', config={"ENABLE_MMAP": False}), + ) + + +# in OpenVINO GenAI this parameter is called stop_criteria, +# while in HF it's called early_stopping. +# HF values True, False and "never" correspond to OV GenAI values "EARLY", "HEURISTIC" and "NEVER" +STOP_CRITERIA_MAP = { + ov_genai.StopCriteria.NEVER: "never", + ov_genai.StopCriteria.EARLY: True, + ov_genai.StopCriteria.HEURISTIC: False +} + + +@pytest.fixture(scope="module") +def model_tmp_path(tmpdir_factory): + model_id, path, _, _, _ = read_model(get_models_list()[0]) + temp_path = tmpdir_factory.mktemp(model_id.replace('/', '_')) + + # copy openvino converted model and tokenizers + for pattern in ['*.xml', '*.bin']: + for src_file in path.glob(pattern): + if src_file.is_file(): + shutil.copy(src_file, temp_path / src_file.name) + yield model_id, Path(temp_path) + + +def load_tok(configs: List[Tuple], temp_path): + # load Tokenizer where all configs are cleared. 
+ # remove existing jsons from previous tests + for json_file in temp_path.glob("*.json"): + json_file.unlink() + + for config_json, config_name in configs: + with (temp_path / config_name).open('w') as f: + json.dump(config_json, f) + return ov_genai.Tokenizer(str(temp_path), {}) + + +def load_pipe(configs: List[Tuple], temp_path): + # Load LLMPipline where all configs are cleared. + # remove existing jsons from previous tests + for json_file in temp_path.glob("*.json"): + json_file.unlink() + + for config_json, config_name in configs: + with (temp_path / config_name).open('w') as f: + json.dump(config_json, f) + return ov_genai.LLMPipeline(str(temp_path)) + + +@functools.lru_cache(1) +def get_continuous_batching(path): + return ov_genai.LLMPipeline(str(path), ov_genai.Tokenizer(str(path)), 'CB') diff --git a/tests/python_tests/pytest.ini b/tests/python_tests/pytest.ini index 38a6279b5d..541e59c7e3 100644 --- a/tests/python_tests/pytest.ini +++ b/tests/python_tests/pytest.ini @@ -3,5 +3,6 @@ markers = precommit nightly + real_models addopts = -m precommit diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt index fa7db3f2e8..23358486d1 100644 --- a/tests/python_tests/requirements.txt +++ b/tests/python_tests/requirements.txt @@ -1,3 +1,27 @@ --extra-index-url https://download.pytorch.org/whl/cpu -optimum[openvino]==1.20.0 +optimum[openvino]==1.21.2 pytest +# requirements for specific models +# - hf-tiny-model-private/tiny-random-RoFormerForCausalLM +rjieba +# - baichuan-inc/Baichuan2-7B-Chat +bitsandbytes +# - nomic-ai/gpt4all-falcon +# - Qwen/Qwen-7B +# - Qwen/Qwen-7B-Chat +# - mosaicml/mpt-7b +# - internlm/internlm2-7b +einops +# - Qwen/Qwen-7B +# - Qwen/Qwen-7B-Chat +transformers_stream_generator +# - openbmb/MiniCPM-V-2 +torchvision +# - openbmb/MiniCPM-V-2 +timm +# - Qwen/Qwen-7B +# - Qwen/Qwen-7B-Chat +# - Salesforce/xgen-7b-8k-base +tiktoken +# - microsoft/biogpt +sacremoses diff --git 
a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py new file mode 100644 index 0000000000..295674e101 --- /dev/null +++ b/tests/python_tests/test_chat_generate_api.py @@ -0,0 +1,188 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import math +import openvino +import openvino_tokenizers +import openvino_genai as ov_genai +import pytest +from typing import Dict, Tuple +from ov_genai_test_utils import ( + get_models_list, + get_chat_models_list, + read_model, + load_tok, + model_tmp_path, + get_chat_templates, + get_continuous_batching, +) + + +configs = [ + dict(max_new_tokens=20), + dict(num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0) +] + + +quenstions = [ + '1+1=', + 'What is the previous answer?', + 'Why is the Sun yellow?', + 'What was my first question?' +] + + +@pytest.mark.parametrize("generation_config", configs) +@pytest.mark.parametrize("model_descr", get_chat_models_list()) +@pytest.mark.precommit +@pytest.mark.nightly +def test_chat_compare_with_HF(model_descr, generation_config: Dict): + device = 'CPU' + chat_history_hf = [] + chat_history_ov = [] + chat_prompt = '' + + # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True. + # Need to regenerate openvino_tokenizer/detokenizer. 
+ model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False) + + pipe.start_chat() + for prompt in quenstions: + chat_history_hf.append({'role': 'user', 'content': prompt}) + chat_history_ov.append({'role': 'user', 'content': prompt}) + + chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) + + answer = model_opt.generate(**tokenized, **generation_config, do_sample=False, repetition_penalty = None) + answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) + chat_history_hf.append({'role': 'assistant', 'content': answer_str}) + + answer_ov = pipe.generate(prompt, **generation_config) + chat_history_ov.append({'role': 'assistant', 'content': answer_ov}) + + pipe.finish_chat() + + if chat_history_ov != chat_history_hf: + print(f'hf_output: {chat_history_hf}') + print(f'ov_output: {chat_history_ov}') + assert chat_history_ov == chat_history_hf + + +@pytest.mark.parametrize("generation_config", configs) +@pytest.mark.parametrize("model_descr", get_chat_models_list()) +@pytest.mark.precommit +@pytest.mark.nightly +def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict): + # compares with HF when history in ov_genai is save as a text + device = 'CPU' + chat_history_hf = [] + chat_history_ov = [] + chat_prompt = '' + + # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True. + # Need to regenerate openvino_tokenizer/detokenizer. 
+ model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False) + + for prompt in quenstions: + chat_history_hf.append({'role': 'user', 'content': prompt}) + chat_history_ov.append({'role': 'user', 'content': prompt}) + + chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) + + answer = model_opt.generate(**tokenized, **generation_config, do_sample=False, repetition_penalty = None) + answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) + chat_history_hf.append({'role': 'assistant', 'content': answer_str}) + + chat_prompt = pipe.get_tokenizer().apply_chat_template(chat_history_ov, add_generation_prompt=True) + answer_ov = pipe.generate(chat_prompt, **generation_config) + chat_history_ov.append({'role': 'assistant', 'content': answer_ov}) + + if chat_history_ov != chat_history_hf: + print(f'hf_output: {chat_history_hf}') + print(f'ov_output: {chat_history_ov}') + assert chat_history_ov == chat_history_hf + + +@pytest.mark.parametrize("generation_config", configs) +@pytest.mark.parametrize("model_descr", get_chat_models_list()) +@pytest.mark.precommit +@pytest.mark.nightly +def test_chat_compare_statefull_vs_text_history(model_descr, generation_config: Dict): + # Check that when history is stored in KV cache results are the same as when history stored in a text. + device ='CPU' + + chat_history_with_kv_cache = [] + chat_history_ov = [] + + # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True. + # Need to regenerate openvino_tokenizer/detokenizer. 
+ model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False) + pipe_with_kv_cache = ov_genai.LLMPipeline(str(path), device, config={"ENABLE_MMAP": False}) + + pipe_with_kv_cache.start_chat() + for question in quenstions: + chat_history_with_kv_cache.append({'role': 'user', 'content': question}) + answer = pipe_with_kv_cache.generate(question, **generation_config) + chat_history_with_kv_cache.append({'role': 'assistant', 'content': answer}) + + chat_history_ov.append({'role': 'user', 'content': question}) + prompt = pipe.get_tokenizer().apply_chat_template(chat_history_ov, add_generation_prompt=True) + answer = pipe.generate(prompt, **generation_config) + chat_history_ov.append({'role': 'assistant', 'content': answer}) + pipe_with_kv_cache.finish_chat() + + if chat_history_ov != chat_history_with_kv_cache: + print(f'kvcache_hist: {chat_history_with_kv_cache}') + print(f'text_history: {chat_history_ov}') + assert chat_history_ov == chat_history_with_kv_cache + + +conversation = [ + {'role': 'user', 'content': '1+1='}, + {'role': 'assistant', 'content': '1 + 1 = 2'}, + {'role': 'user', 'content': 'What is the previous answer?'}, + {'role': 'assistant', 'content': 'The previous answer was: 1 + 1 = 2. \n Please ask me your next question.'}, + {'role': 'user', 'content': 'Why is the sun yellow?'}, + {'role': 'assistant', 'content': 'Because it emits yeloow light.'}, + {'role': 'user', 'content': 'What was my first question?'}, +] +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.parametrize('chat_config', get_chat_templates()) +def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]): + tokenizer_config = chat_config[1] + + # Will load openvino_model for tiny-random-phi as a placeholder + # but indeed only Tokenizer and apply_chat_template will be tested. 
+ model_id, path, tokenizer, opt_model, pipe = read_model(get_models_list()[0]) + + full_history_str_hf = tokenizer.apply_chat_template(conversation, + add_generation_prompt=False, + tokenize=False, + **tokenizer_config) + + tok = load_tok([(tokenizer_config, "tokenizer_config.json")], model_tmp_path[1]) + full_history_str = tok.apply_chat_template(conversation, add_generation_prompt=False) + if full_history_str != full_history_str_hf: + print(f'hf reference: {full_history_str_hf}') + print(f'ov_genai out: {full_history_str}') + assert full_history_str == full_history_str_hf + + +@pytest.mark.parametrize("generation_config", configs[1:]) +@pytest.mark.parametrize("model_descr", get_chat_models_list()) +@pytest.mark.precommit +@pytest.mark.skip("continuous_batching seg faults with nightly ov. Ticket 147793") +def test_chat_continuous_batching_vs_stateful(model_descr, generation_config: Dict): + model_id, path, tokenizer, model, stateful = read_model((model_descr[0], model_descr[1] / '_test_chat')) + cb = get_continuous_batching(path) + stateful.start_chat() + cb.start_chat() + for question in quenstions: + generated = cb.generate(question, **generation_config) + reference = stateful.generate(question, **generation_config) + assert generated == reference + # Test that finish_chat() doesn't fail just in case. 
+ cb.finish_chat() diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py index 6788f62edd..fe306e2a37 100644 --- a/tests/python_tests/test_generate_api.py +++ b/tests/python_tests/test_generate_api.py @@ -1,63 +1,29 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import functools -import openvino -import openvino_tokenizers -import optimum.intel -from openvino_genai import StopCriteria import openvino_genai as ov_genai +from openvino_genai import StopCriteria import pytest import transformers -from list_test_models import models_list, chat_models_list -from typing import Union, List, Dict, Tuple, Optional +from typing import Union, List, Dict, Optional import numpy as np import openvino as ov import sys from pathlib import Path -import shutil -import json import torch - - -@functools.lru_cache(1) -def read_model(params): - model_id, path = params - - from optimum.intel.openvino import OVModelForCausalLM - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - - if path.exists(): - opt_model = OVModelForCausalLM.from_pretrained(path, trust_remote_code=True, - compile=False, device='CPU') - else: - ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(tokenizer, - add_special_tokens=True, - with_detokenizer=True) - openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml") - openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml") - - # to store tokenizer config jsons with special tokens - tokenizer.save_pretrained(path) - - opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, - compile=False, device='CPU', load_in_8bit=False) - opt_model.generation_config.save_pretrained(path) - opt_model.config.save_pretrained(path) - opt_model.save_pretrained(path) - - return ( - model_id, - path, - tokenizer, - opt_model, - ov_genai.LLMPipeline(str(path), 
device='CPU', config={"ENABLE_MMAP": False}), - ) +import math +from ov_genai_test_utils import ( + get_models_list, + read_model, + load_pipe, + load_tok, + model_tmp_path, + STOP_CRITERIA_MAP, + get_continuous_batching, +) def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, prompts: Union[str, List[str]]): - device = 'CPU' model_id, path, tokenizer, model, pipe = model_descr config = generation_config.copy() # to avoid side effects num_beams = config['num_beams'] if 'num_beams' in config else 1 @@ -76,7 +42,7 @@ def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, pro generation_config_hf = config.copy() if generation_config_hf.get('stop_criteria'): - generation_config_hf['early_stopping'] = stop_criteria_map()[generation_config_hf.pop('stop_criteria')] + generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] generation_config_hf.pop('ignore_eos', None) # Encode the batch of prompts @@ -91,8 +57,6 @@ def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, pro prompt_count = idx // num_beams hf_outputs.append(tokenizer.decode(hf_encoded_out[prompt_ids[prompt_count].shape[0]:], skip_special_tokens=True)) - pipe = ov_genai.LLMPipeline(str(path), device) - ov_outputs = pipe.generate(prompts, **config).texts hf_outputs.sort() @@ -104,7 +68,6 @@ def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, pro assert hf_output == ov_output def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str): - device = 'CPU' model_id, path, tokenizer, model, pipe = model_descr config = generation_config.copy() # to avoid side effects @@ -112,22 +75,20 @@ def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str if 'do_sample' not in config: # Some HF models have default do_sample = True, and if we set beam search generation config # it conflicts with `diversity_penalty` and/or `num_beam_groups`. 
- # Need to set exlicitly to False, but only if test arguments omitted this arg. + # Need to set explicitly to False, but only if test arguments omitted this arg. # Do not apply 'repetition_penalty' if sampling is not used. config['do_sample'] = False config['repetition_penalty'] = None generation_config_hf = config.copy() if generation_config_hf.get('stop_criteria'): - generation_config_hf['early_stopping'] = stop_criteria_map()[generation_config_hf.pop('stop_criteria')] + generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] generation_config_hf.pop('ignore_eos', None) encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=True) hf_encoded_output = model.generate(encoded_prompt, **generation_config_hf) hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:], skip_special_tokens=True) - pipe = ov_genai.LLMPipeline(str(path), device) - ov_output = pipe.generate(prompt, **config) if config.get('num_return_sequences', 1) > 1: assert hf_output in ov_output.texts @@ -159,7 +120,7 @@ def hf_ov_genai_tensors_comparison( generation_config_hf = config.copy() if generation_config_hf.get('stop_criteria'): - generation_config_hf['early_stopping'] = stop_criteria_map()[generation_config_hf.pop('stop_criteria')] + generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] generation_config_hf.pop('ignore_eos', None) if attention_mask is not None: @@ -179,17 +140,6 @@ def hf_ov_genai_tensors_comparison( assert np.all(ov_res == hf_res) -def stop_criteria_map(): - # in OpenVINO GenAI this parameter is called stop_criteria, - # while in HF it's called early_stopping. 
- # HF values True, False and "never" correspond to OV GenAI values "EARLY", "HEURISTIC" and "NEVER" - return { - StopCriteria.NEVER: "never", - StopCriteria.EARLY: True, - StopCriteria.HEURISTIC: False - } - - test_cases = [ (dict(max_new_tokens=20), 'table is made of'), (dict(max_new_tokens=20), 'δ½ ε₯½οΌ δ½ ε₯½ε—ŽοΌŸ'), @@ -199,8 +149,9 @@ def stop_criteria_map(): (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.5), 'The Sun is yellow because'), ] @pytest.mark.parametrize("generation_config,prompt", test_cases) -@pytest.mark.parametrize("model_descr", models_list()) +@pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit +@pytest.mark.nightly def test_decoding(model_descr, generation_config, prompt): run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) @@ -210,18 +161,86 @@ def test_decoding(model_descr, generation_config, prompt): (np.array([[1, 4, 42]], dtype=np.int64), np.array([[1, 1, 1]], dtype=np.int64)), ] @pytest.mark.parametrize("inputs", input_tensors_list) -@pytest.mark.parametrize("model_descr", models_list()) +@pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.xfail( raises=TypeError, - reason="pybind was unable to find overloads with tensor inputs on Linux", + reason="pybind was unable to find ov::Tensor from openvino yet", strict=False, - condition=sys.platform == "linux" + condition=sys.platform in ["linux", "win32"] ) @pytest.mark.precommit +@pytest.mark.nightly def test_ov_tensors(model_descr, inputs): hf_ov_genai_tensors_comparison(read_model(model_descr), dict(max_new_tokens=20), *inputs) +prompts = [ + 'table is made of', + 'δ½ ε₯½οΌ δ½ ε₯½ε—ŽοΌŸ', + 'Alan Turing was a', + 'The Sun is yellow because', + ['The Sun is yellow because', 'Alan Turing was a', 'Alan Turing was a'] +] +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.precommit 
+@pytest.mark.nightly +@pytest.mark.xfail( + raises=TypeError, + reason="pybind was unable to find ov::Tensor from openvino yet", + strict=False, + condition=sys.platform in ["linux", "win32"] +) +def test_genai_tokenizer_encode(model_descr, prompt): + model_id, path, tokenizer, model, pipe = read_model(model_descr) + tok = pipe.get_tokenizer() + + encoded_ov = tok.encode(prompt).input_ids.data + if isinstance(prompt, list): + encoded_hf = tokenizer.batch_encode_plus(prompt)['input_ids'] + for tokens_ov, tokens_hf in zip(encoded_ov, encoded_hf): + assert np.all(tokens_ov == tokens_hf) + else: + encoded_hf = tokenizer.encode(prompt) + assert np.all(encoded_hf == encoded_ov[0]) + +encoded_prompts = [ + [1, 1591, 338, 1754, 310], + [1, 17102, 323, 3864, 471, 263], + + # chineze characters + [1, 29871, 30919, 31076, 30584, 29871, 30919, 31076, 232, 154, 145, 30882], + + # On meta-llama/Meta-Llama-3-8B-Instruct this becomes longer after removing the last token + [3113, 264, 364, 267], + + # batched tokens + [[1, 1591, 338, 1754, 310], [1, 1591, 338, 1754, 310], [1, 17102, 323, 3864, 471, 263]] +] +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.parametrize("encoded_prompt", encoded_prompts) +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.xfail( + raises=TypeError, + reason="pybind was unable to find ov::Tensor from openvino yet", + strict=False, + condition=sys.platform in ["linux", "win32"] +) +def test_genai_tokenizer_decode(model_descr, encoded_prompt): + model_id, path, tokenizer, model, pipe = read_model(model_descr) + tok = pipe.get_tokenizer() + decoded_ov = tok.decode(encoded_prompt) + + if isinstance(encoded_prompt[0], list): + decoded_hf = tokenizer.batch_decode(encoded_prompt, skip_special_tokens=True) + for tokens_ov, tokens_hf in zip(decoded_ov, decoded_hf): + assert np.all(tokens_ov == tokens_hf) + else: + decoded_hf = tokenizer.decode(encoded_prompt, skip_special_tokens=True) + assert decoded_hf == decoded_ov + + 
test_configs = [ dict(max_new_tokens=20), dict(max_new_tokens=200, ignore_eos=True), @@ -235,8 +254,9 @@ def test_ov_tensors(model_descr, inputs): ] @pytest.mark.parametrize("generation_config", test_configs) @pytest.mark.parametrize("prompts", batched_prompts) -@pytest.mark.parametrize("model_descr", models_list()) +@pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit +@pytest.mark.nightly def test_multibatch(model_descr, generation_config, prompts): run_hf_ov_genai_comparison_batched(read_model(model_descr), generation_config, prompts) @@ -247,8 +267,9 @@ def test_multibatch(model_descr, generation_config, prompts): @pytest.mark.parametrize("max_new_tokens", [20, 15]) @pytest.mark.parametrize("diversity_penalty", [1.0 , 1.5]) @pytest.mark.parametrize("prompt", prompts) -@pytest.mark.parametrize("model_descr", models_list()) +@pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit +@pytest.mark.nightly def test_beam_search_decoding(model_descr, num_beam_groups, group_size, max_new_tokens, diversity_penalty, prompt): generation_config = dict( @@ -264,8 +285,9 @@ def test_beam_search_decoding(model_descr, num_beam_groups, group_size, @pytest.mark.parametrize("stop_criteria", [StopCriteria.NEVER, StopCriteria.EARLY, StopCriteria.HEURISTIC]) @pytest.mark.parametrize("prompt", prompts) @pytest.mark.parametrize("max_new_tokens", [10, 80]) -@pytest.mark.parametrize("model_descr", models_list()) +@pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.precommit +@pytest.mark.nightly def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens): # todo: with EARLY stop_criteria looks like HF return unvalid out with sentence # while genai ends sentence with @@ -287,7 +309,7 @@ def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens): @pytest.mark.parametrize("group_size", [5]) @pytest.mark.parametrize("max_new_tokens", [800, 2000]) @pytest.mark.parametrize("prompt", 
prompts) -@pytest.mark.parametrize("model_descr", models_list()) +@pytest.mark.parametrize("model_descr", get_models_list()) @pytest.mark.skip(reason="Will be enabled in nightly since the test are computationally expensive") @pytest.mark.nightly def test_beam_search_long_sentences(model_descr, num_beam_groups, group_size, @@ -308,8 +330,9 @@ def user_defined_callback(subword): @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit +@pytest.mark.nightly def test_callback_one_string(callback): - pipe = read_model(models_list()[0])[4] + pipe = read_model(get_models_list()[0])[4] generation_config = pipe.get_generation_config() generation_config.max_new_tokens = 10 pipe.generate('table is made of', generation_config, callback) @@ -317,23 +340,39 @@ def test_callback_one_string(callback): @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit +@pytest.mark.nightly def test_callback_batch_fail(callback): - pipe = read_model(models_list()[0])[4] + pipe = read_model(get_models_list()[0])[4] with pytest.raises(RuntimeError): pipe.generate(['1', '2'], ov_genai.GenerationConfig(), callback) @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit +@pytest.mark.nightly def test_callback_kwargs_one_string(callback): - pipe = read_model(models_list()[0])[4] + pipe = read_model(get_models_list()[0])[4] pipe.generate('table is made of', max_new_tokens=10, streamer=callback) +@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.parametrize("model_descr", get_models_list()) +def test_callback_decoding_metallama(model_descr, callback): + # On metallam this prompt generates output which can shorten after adding new tokens. + # Test that streamer correctly handles such cases. 
+ prompt = 'I have an interview about product speccing with the company Weekend Health. Give me an example of a question they might ask with regards about a new feature' + if model_descr[0] != 'meta-llama/Meta-Llama-3-8B-Instruct': + pytest.skip() + pipe = read_model(model_descr)[4] + pipe.generate(prompt, max_new_tokens=300, streamer=callback) + @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) @pytest.mark.precommit +@pytest.mark.nightly def test_callback_kwargs_batch_fail(callback): - pipe = read_model(models_list()[0])[4] + pipe = read_model(get_models_list()[0])[4] with pytest.raises(RuntimeError): pipe.generate(['1', '2'], max_new_tokens=10, streamer=callback) @@ -353,8 +392,9 @@ def end(self): @pytest.mark.precommit +@pytest.mark.nightly def test_streamer_one_string(): - pipe = read_model(models_list()[0])[4] + pipe = read_model(get_models_list()[0])[4] generation_config = pipe.get_generation_config() generation_config.max_new_tokens = 10 printer = Printer(pipe.get_tokenizer()) @@ -362,97 +402,69 @@ def test_streamer_one_string(): @pytest.mark.precommit +@pytest.mark.nightly def test_streamer_batch_fail(): - pipe = read_model(models_list()[0])[4] + pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) with pytest.raises(RuntimeError): pipe.generate(['1', '2'], ov_genai.GenerationConfig(), printer) @pytest.mark.precommit +@pytest.mark.nightly def test_streamer_kwargs_one_string(): - pipe = read_model(models_list()[0])[4] + pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) pipe.generate('table is made of', max_new_tokens=10, do_sample=False, streamer=printer) @pytest.mark.precommit +@pytest.mark.nightly def test_streamer_kwargs_batch_fail(): - pipe = read_model(models_list()[0])[4] + pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) with pytest.raises(RuntimeError): pipe.generate('', num_beams=2, streamer=printer) 
@pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) def test_operator_with_callback_one_string(callback): - pipe = read_model(models_list()[0])[4] + pipe = read_model(get_models_list()[0])[4] ten_tokens = pipe.get_generation_config() ten_tokens.max_new_tokens = 10 pipe('talbe is made of', ten_tokens, callback) @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) def test_operator_with_callback_batch_fail(callback): - pipe = read_model(models_list()[0])[4] + pipe = read_model(get_models_list()[0])[4] with pytest.raises(RuntimeError): pipe(['1', '2'], ov_genai.GenerationConfig(), callback) @pytest.mark.precommit +@pytest.mark.nightly def test_operator_with_streamer_kwargs_one_string(): - pipe = read_model(models_list()[0])[4] + pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) pipe('hi', max_new_tokens=10, do_sample=True, streamer=printer) @pytest.mark.precommit +@pytest.mark.nightly def test_operator_with_streamer_kwargs_batch_fail(): - pipe = read_model(models_list()[0])[4] + pipe = read_model(get_models_list()[0])[4] printer = Printer(pipe.get_tokenizer()) with pytest.raises(RuntimeError): pipe('', num_beams=2, streamer=printer) -@pytest.fixture(scope="module") -def model_tmp_path(tmpdir_factory): - model_id, path, _, _, _ = read_model(models_list()[0]) - temp_path = tmpdir_factory.mktemp(model_id.replace('/', '_')) - - # copy openvino converted model and tokenizers - for pattern in ['*.xml', '*.bin']: - for src_file in path.glob(pattern): - if src_file.is_file(): - shutil.copy(src_file, temp_path / src_file.name) - yield model_id, Path(temp_path) - - -# load Tokenizer where all configs are cleared -def load_tok(configs: List[Tuple], temp_path): - # remove existing jsons from previous tests - for json_file in temp_path.glob("*.json"): - 
json_file.unlink() - - for config_json, config_name in configs: - with (temp_path / config_name).open('w') as f: - json.dump(config_json, f) - return ov_genai.Tokenizer(str(temp_path)) - - -# load LLMPipline where all configs are cleared -def load_pipe(configs: List[Tuple], temp_path): - # remove existing jsons from previous tests - for json_file in temp_path.glob("*.json"): - json_file.unlink() - - for config_json, config_name in configs: - with (temp_path / config_name).open('w') as f: - json.dump(config_json, f) - return ov_genai.LLMPipeline(str(temp_path)) - @pytest.mark.precommit +@pytest.mark.nightly def test_load_special_tokens_ids_1(model_tmp_path): # test when there is an available config.json config_json = { @@ -467,6 +479,7 @@ def test_load_special_tokens_ids_1(model_tmp_path): @pytest.mark.precommit +@pytest.mark.nightly def test_load_special_tokens_str_2(model_tmp_path): # test with special_tokens_map special_tokens_map_json = { @@ -481,6 +494,7 @@ def test_load_special_tokens_str_2(model_tmp_path): @pytest.mark.precommit +@pytest.mark.nightly def test_load_special_tokens_3_(model_tmp_path): # special_tokens_map is not available # but tokenize_config.json exists @@ -507,6 +521,7 @@ def test_load_special_tokens_3_(model_tmp_path): @pytest.mark.precommit +@pytest.mark.nightly def test_load_special_tokens_3(model_tmp_path): # both config.json is availabel and tokenizer_config.json available # check that it does not read int values from tokenizer_config.json if they are in config.json @@ -541,6 +556,7 @@ def test_load_special_tokens_3(model_tmp_path): @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.xfail( raises=AssertionError, reason="CVS-143410 ov tokenizer should be aligned with hf", @@ -584,6 +600,7 @@ def test_load_special_tokens_4(model_tmp_path): ] @pytest.mark.parametrize("generation_config", invalid_configs) @pytest.mark.precommit +@pytest.mark.nightly def test_invalid_configs(model_tmp_path, generation_config): model_id, temp_path = 
model_tmp_path config_json = {} @@ -593,6 +610,7 @@ def test_invalid_configs(model_tmp_path, generation_config): @pytest.mark.precommit +@pytest.mark.nightly def test_valid_configs(model_tmp_path): model_id, temp_path = model_tmp_path pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path) @@ -611,6 +629,7 @@ def test_valid_configs(model_tmp_path): dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k ] @pytest.mark.precommit +@pytest.mark.nightly @pytest.mark.parametrize("generation_config", invalid_py_configs) def test_python_generation_config_validation(model_tmp_path, generation_config): model_id, temp_path = model_tmp_path @@ -624,95 +643,44 @@ def test_python_generation_config_validation(model_tmp_path, generation_config): @pytest.mark.precommit -@pytest.mark.skipif(sys.platform.startswith("win"), reason="probably not enough space for this model on Win") +@pytest.mark.nightly def test_unicode_pybind_decoding_1(): # On this model this prompt generates unfinished utf string. # Test that pybind will not fail. - model_id, path = ("microsoft/phi-1_5", Path("phi-1_5/")) + model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') pipe = read_model((model_id, path))[4] - res_str = pipe.generate('δ½ ε₯½οΌ δ½ ε₯½ε—ŽοΌŸ', max_new_tokens=20) - assert isinstance(res_str, str) - assert len(res_str) > 0 + res_str = pipe.generate(',', max_new_tokens=4) + assert 'οΏ½' == res_str[-1] + + @pytest.mark.precommit -@pytest.mark.skipif(sys.platform.startswith("win"), reason="probably not enough space for this model on Win") +@pytest.mark.nightly def test_unicode_pybind_decoding_2(): # On this model this prompt generates unfinished utf string. # Test that pybind will not fail. 
- model_id, path = ("microsoft/phi-1_5", Path("phi-1_5/")) + model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') pipe = read_model((model_id, path))[4] - decoded_results = pipe.generate(['δ½ ε₯½οΌ δ½ ε₯½ε—ŽοΌŸ'], max_new_tokens=20) - assert isinstance(decoded_results, ov_genai.DecodedResults) - assert len(decoded_results.texts[0]) > 0 + res_str = pipe.generate([","], max_new_tokens=4) + assert 'οΏ½' == res_str.texts[0][-1] -quenstions = [ - '1+1=', - 'What is the previous answer?', - 'Why is the sun yellow?', - 'What was my first question?' -] - -configs = [ - dict(max_new_tokens=500), - # dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=20, diversity_penalty=1.0) -] -@pytest.mark.parametrize("generation_config", configs) -@pytest.mark.parametrize("model_descr", chat_models_list()) @pytest.mark.precommit -@pytest.mark.skipif(sys.platform == "linux", reason="no space left on linux device for chat models") -def test_chat_1(model_descr, generation_config): - config = generation_config.copy() # to avoid side effects - - if 'do_sample' not in config: - # Some HF models have default do_sample = True, and if we set beam search generation config - # it conflicts with `diversity_penalty` and/or `num_beam_groups`. - # Need to set exlicitly to False, but only if test arguments omitted this arg. - # Do not apply 'repetition_penalty' if sampling is not used. 
- config['do_sample'] = False - config['repetition_penalty'] = None - - config_hf = config.copy() - if config_hf.get('stop_criteria'): - config_hf['early_stopping'] = stop_criteria_map()[config_hf.pop('stop_criteria')] - config_hf.pop('ignore_eos', None) - - chat_history_hf = [] - chat_history_ov = [] - chat_prompt = '' - model_id, path, tokenizer, model_opt, pipe = read_model(model_descr) - ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(tokenizer, add_special_tokens=False, with_detokenizer=True) - openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml") - openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml") - ov_genai.LLMPipeline(str(path), device='CPU', config={"ENABLE_MMAP": False}) - - pipe.start_chat() - for prompt in quenstions: - chat_history_hf.append({'role': 'user', 'content': prompt}) - chat_history_ov.append({'role': 'user', 'content': prompt}) - - chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) - tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) - - answer = model_opt.generate(**tokenized, **config_hf) - answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) - chat_history_hf.append({'role': 'assistant', 'content': answer_str}) - - answer_ov = pipe.generate(prompt, **config) - chat_history_ov.append({'role': 'assistant', 'content': answer_ov}) - - pipe.finish_chat() - - if chat_history_ov != chat_history_hf: - print(f'hf_output: {chat_history_hf}') - print(f'ov_output: {chat_history_ov}') - assert chat_history_ov == chat_history_hf - pipe.generate('δ½ ε₯½οΌ δ½ ε₯½ε—ŽοΌŸ', max_new_tokens=20) +@pytest.mark.nightly +def test_unicode_pybind_decoding_3(): + # On this model this prompt generates unfinished utf-8 string + # and streams it. Test that pybind will not fail while we pass string to python. 
+ model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') + pipe = read_model((model_id, path))[4] + res_str = [] + pipe.generate(",", max_new_tokens=4, streamer=lambda x: res_str.append(x)) + assert 'οΏ½' == res_str[-1] @pytest.mark.skip(reason="probably both models ov + hf doesn't fit to memory") @pytest.mark.precommit -@pytest.mark.skipif(sys.platform.startswith("win"), reason="probably not enough space for this model on Win") +@pytest.mark.nightly +@pytest.mark.skipif(sys.platform.startswith("win"), reason="not enough space for this model on Win") def test_left_pad(): # test left pad tokenizer post processing implementation prompts = [ @@ -737,3 +705,41 @@ def test_left_pad(): models[2].pad_token = models[2].eos_token run_hf_ov_genai_comparison_batched(models, config, prompts) + + +@pytest.mark.parametrize("generation_config", test_configs) +@pytest.mark.parametrize("prompt", batched_prompts) +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.precommit +@pytest.mark.skip("continuous_batching seg faults with nightly ov. Ticket 147793") +def test_continuous_batching_vs_stateful(model_descr, prompt, generation_config): + model_id, path, tokenizer, model, stateful = read_model(( + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + Path("TinyLlama-1.1B-Chat-v1.0") + )) + config = ov_genai.GenerationConfig() + config.max_new_tokens = 100 + cb = get_continuous_batching(path) + generated = cb.generate(prompt, **generation_config) + reference = stateful.generate(prompt, **generation_config) + assert generated.texts == reference.texts + if 1 != generation_config.get("num_return_sequences", 1): + # Stateful puts zeroes to generated.scores. Don't compare them. 
+ for gen, ref in zip(generated.scores, reference.scores): + assert math.isclose(gen, ref, abs_tol=0.0003) + +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.precommit +@pytest.mark.skip("continuous_batching seg faults with nightly ov. Ticket 147793") +def test_cb_streamer_vs_return_vs_stateful(model_descr, prompt): + model_id, path, tokenizer, model, stateful = read_model(( + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + Path("TinyLlama-1.1B-Chat-v1.0") + )) + cb = get_continuous_batching(path) + streamed = [] + generated = cb.generate(prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword)) + reference = stateful.generate(prompt, max_new_tokens=20) + assert generated == "".join(streamed) + assert "".join(streamed) == reference diff --git a/tests/python_tests/test_preemption.py b/tests/python_tests/test_preemption.py new file mode 100644 index 0000000000..4f0f656ca4 --- /dev/null +++ b/tests/python_tests/test_preemption.py @@ -0,0 +1,181 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import sys +import pytest + +from openvino_genai import GenerationConfig +from common import get_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \ + DEFAULT_SCHEDULER_CONFIG, get_scheduler_config, run_test_pipeline, get_models_list, get_beam_search, get_greedy, \ + get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ + get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p +from test_sampling import RandomSamplingTestStruct, get_current_plarform_ref_texts + + +pytest.skip("continuous_batching fails with nightly ov", allow_module_level=True) + + +def get_greedy_seq_len_300() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_return_sequences = 3 + generation_config.max_new_tokens = 300 + return 
generation_config + +def get_beam_search_seq_len_300() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_beam_groups = 3 + generation_config.num_beams = 6 + generation_config.max_new_tokens = 300 + generation_config.num_return_sequences = generation_config.num_beams + return generation_config + +scheduler_params_list = [({"num_kv_blocks": 2, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), + ({"num_kv_blocks": 2, "block_size": 32, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), + ({"num_kv_blocks": 10, "block_size": 32, "dynamic_split_fuse": True}, get_greedy_seq_len_300()), + ({"num_kv_blocks": 10, "block_size": 32, "dynamic_split_fuse": False}, get_greedy_seq_len_300()), + ({"num_kv_blocks": 34, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), + ({"num_kv_blocks": 34, "block_size": 32, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), + ({"num_kv_blocks": 100, "block_size": 32, "dynamic_split_fuse": True}, get_beam_search_seq_len_300()), + ({"num_kv_blocks": 100, "block_size": 32, "dynamic_split_fuse": False}, get_beam_search_seq_len_300())] +@pytest.mark.parametrize("params", scheduler_params_list) +@pytest.mark.precommit +def test_preemption(tmp_path, params): + run_test_pipeline(tmp_path, "facebook/opt-125m", params[0], params[1]) + + +multinomial_params = RandomSamplingTestStruct( + generation_config=[ + get_multinomial_temperature(), + get_multinomial_temperature_and_top_p(), + get_multinomial_temperature_and_top_k(), + ], + prompts=[ + "What is OpenVINO?", + "How are you?", + "Tell me something about Canada?", + ], + ref_texts=get_current_plarform_ref_texts({ + "linux": [ + [ + "\n\nOpenVINO is a live platform that allows users to create and manage a new library for open source 
applications.\n\nOpenVINO is" + ], + [ + " You're getting much better results from doing this, than you are by not doing this. I have a BH and I was so far" + ], + [ + "\nI'm from Canada, and I'm from the US, so I'm not sure.\nI think you mean the Canadian version." + ], + ], + "win32": [ + [ + "\n\nOpenVINO is a live platform that allows users to create and manage a new library of applications on the Virtuoso server, which can" + ], + [ + " You're getting much better results from doing this, than you are by not doing this. If you are truly trying to do something good," + ], + [ + "\nI'm from Canada, and I'm from the US, so I'm not sure what you're talking about.\nI'm Canadian and I" + ], + ], + }), +) + + +# todo: Anastasiia Pnevskaya: fix the test because it is hanging according max_new_tokens = std::numeric_limits::max() +@pytest.mark.parametrize("dynamic_split_fuse", [True, False]) +@pytest.mark.precommit +def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse): + generation_configs = multinomial_params.generation_config + for config in generation_configs: + config.rng_seed = 0 + config.max_new_tokens = 30 + model_id : str = "facebook/opt-125m" + model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + + model_path : Path = tmp_path / model_id + save_ov_model_from_optimum(model, hf_tokenizer, model_path) + + scheduler_config = get_scheduler_config({"num_kv_blocks": 3, "block_size": 32, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256}) + generate_and_compare_with_reference_text(model_path, multinomial_params.prompts, multinomial_params.ref_texts, generation_configs, scheduler_config) + + +multinomial_params_n_seq = RandomSamplingTestStruct( + generation_config=[ + get_multinomial_temperature(), + get_multinomial_temperature_and_num_return_sequence(), + get_multinomial_all_parameters(), + ], + prompts=[ + "Artificial intelligence ", + "What is the current", + "Tell me something about 
UAE?", + ], + ref_texts=get_current_plarform_ref_texts({ + "linux": [ + [ + "\nI've seen this expression used too many times without making sense.\nAs an AI engineer, and as a scientist, we should make everything easier" + ], + [ + " position of the Z-shaped groove?\n0.41\nWhat is the current position of the Z-shaped groove?\n0.11\n", + " status of all of this? I can't stop thinking about it.\nIt's been a while since I've seen it. I found it a", + " status of your blog? Do you accept feedback?\nYes, I’m happy to accept feedback at this time (I’m a" + ], + [ + "\nIt's in the middle of nowhere if you haven’t seen one yet! It might be more convenient there than anywhere else.. maybe take", + "\nUAE is a country with some great culture that has been living under Islamic oppression for almost 60 years now (including 20 years as part of Arab", + "\nNope, just wanted to say how awesome and beautiful it was when my brother came back from an adventure trip across Asia - our 2nd year", + "\nI don't know anything. I'm not sure what kind this sub wants though... but apparently they are pretty bad at making videos/photos", + ], + ], + "win32": [ + [ + "\nI've had a friend with the capacity to test this in his own words.\nThe big problem with real-world results is the economics of" + ], + [ + " position of the patent application number of the present invention?\n\nIn the present invention, the present invention relates to an improved method for manufacturing a semic", + " status of your town? How many houses do you have?\nThere are about three houses in our town. The closest place to us is about 25", + " status of all the other passengers?\nWe're the only ones left, so no...\nI don't think they'll really leave.\nThey" + ], + [ + "\nI don't have any knowledge on them. 
We are based out near Dubai so hopefully they will take care of us soon enough :) thanks though :", + "\nUAE is not one of the richest countries in Asia but definitely among those most corrupt nations because this corruption (and its own endemic practices) still", + "\nNope, I'm just going through my first semester there right now and it was nice to see some people who were doing well haha - we", + "\nIt's a country where your parents can never give you anything at all! It also has an extremely low education system for many years... You", + ], + ], + "darwin": [ + [ + "\nI've had a friend with the capacity to test this in his own words.\nThe big problem with real-world results is the rigidity" + ], + [ + " position of the patent application number of the present invention?\n\nIn the present invention, the present invention relates to an improved method for manufacturing a semic", + " status of your town? How many houses do you have?\nThere are about three houses in our town. The closest place to us is about 25", + " status of all the other passengers?\nWe're the only ones left, so no...\nI don't think they'll really leave.\nThey" + ], + [ + "\nI don't have any knowledge on them. We are based out near Dubai so hopefully they will take care of us soon enough :) thanks though :", + "\nUAE is not one of the richest countries in Asia but definitely among those most corrupt nations because this corruption (and its own endemic practices) still", + "\nNope, I'm just going through my first semester there right now and it was nice to see some people who were doing well haha - we", + "\nIt's a country where your parents can never give you anything at all! It also has an extremely low education system for many years... 
You", + ], + ], + }), +) + + +@pytest.mark.parametrize("dynamic_split_fuse", [True, False]) +@pytest.mark.precommit +@pytest.mark.skip(reason="Random sampling results are non deterministic due to: discrete_distribution impl depends on platform, model inference results may depend on CPU. Test passes on CI but fails locally.") +def test_preemption_with_multinomial_n_seq(tmp_path, dynamic_split_fuse): + generation_configs = multinomial_params_n_seq.generation_config + for config in generation_configs: + config.rng_seed = 0 + model_id : str = "facebook/opt-125m" + model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + + model_path : Path = tmp_path / model_id + save_ov_model_from_optimum(model, hf_tokenizer, model_path) + + # needed kv_blocks - 16 (2 blocks per sequence (30 tokens to generated text + prompt (> 2 tokens)) * (1 + 3 + 4) seq ) + scheduler_config = get_scheduler_config({"num_kv_blocks": 8, "block_size": 32, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256}) + generate_and_compare_with_reference_text(model_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, generation_configs, scheduler_config) \ No newline at end of file diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py new file mode 100644 index 0000000000..f44a81885b --- /dev/null +++ b/tests/python_tests/test_sampling.py @@ -0,0 +1,324 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +import os +import sys +import pytest +import shutil +import sys +from dataclasses import dataclass +from pathlib import Path +from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer +from typing import List, TypedDict + +from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, \ + generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, \ + 
get_greedy_with_penalties, get_multinomial_temperature, \ + get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, \ + get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG, get_greedy_with_repetition_penalty, \ + get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ + generate_and_compare_with_reference_text, get_greedy, get_greedy_with_min_and_max_tokens, \ + get_beam_search, get_beam_search_min_and_max_tokens, get_multinomial_max_and_min_token, \ + get_multinomial_temperature_and_frequence_penalty, get_multinomial_temperature_and_presence_penalty, \ + generate_and_compare_with_hf, get_multinomial_temperature_and_repetition_penalty, get_scheduler_config + + +pytest.skip("continuous_batching fails with nightly ov", allow_module_level=True) + + +@pytest.mark.precommit +@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit"))) +@pytest.mark.xfail( + raises=RuntimeError, + reason="Test fails with error: CPU: head size must be multiple of 16, current: X. 
CVS-145986.", + strict=True, +) +def test_sampling_precommit(tmp_path, model_id): + run_test_pipeline(tmp_path, model_id) + + +@pytest.mark.nightly +@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly"))) +def test_sampling_nightly(tmp_path, model_id): + run_test_pipeline(tmp_path, model_id) + +@pytest.mark.real_models +@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models"))) +def test_real_models(tmp_path, model_id): + run_test_pipeline(tmp_path, model_id) + + +@pytest.mark.precommit +def test_eos_beam_search(tmp_path): + ''' + Current test checks that in case of beam search, some generation results + explicitly have EOS token at the end, which is aligned with HF + + Example of current output: + { -1.23264, that I don't know about. + I don't know what you're talking about, but I'm pretty sure it's a Canadian thing. } + ''' + model_id = "facebook/opt-125m" + prompts = ["Tell me something about Canada"] + generation_configs = [get_beam_search()] + scheduler_config = get_scheduler_config() + generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) + + +@pytest.mark.precommit +def test_eos_greedy(tmp_path): + ''' + Current test checks that in case of gready, some generation results + explicitly have EOS token at the end, which is aligned with HF: + + Example of current output: + { a software program } + ''' + model_id = "bigscience/bloomz-560m" + prompts = ["What is OpenVINO?"] + generation_configs = [get_greedy()] + scheduler_config = get_scheduler_config() + generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) + +@pytest.mark.precommit +@pytest.mark.parametrize("generation_config", [get_greedy(), get_greedy_with_min_and_max_tokens(), get_greedy_with_repetition_penalty(), get_beam_search(), get_beam_search_min_and_max_tokens()], + 
ids=[ + "greedy", + "greedy_with_min_and_max_tokens", + "greedy_with_repetition_penalty", + "beam", + "beam_search_min_and_max_tokens" + ]) +def test_individual_generation_configs_deterministic(tmp_path, generation_config): + prompts = [ + "What is OpenVINO?", + ] + generation_configs = [generation_config] + model_id : str = "facebook/opt-125m" + generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path) + + +class PlatformsRefTexts(TypedDict, total=False): + linux: List[List[str]] + win32: List[List[str]] + darwin: List[List[str]] + + +def get_current_plarform_ref_texts(ref_texts: PlatformsRefTexts) -> List[List[str]]: + # mac and win often have identical results + # to avoid duplication, use win32 ref_text if no mac ref_texts were found + if sys.platform == "darwin": + result = ref_texts.get("darwin") or ref_texts.get("win32") + else: + result = ref_texts.get(sys.platform) + if not result: + raise RuntimeError("No ref_texts were provided") + return result + + +@dataclass +class RandomSamplingTestStruct: + generation_config: GenerationConfig + prompts: List[str] + ref_texts: List[List[str]] + + +RANDOM_SAMPLING_TEST_CASES = [ + RandomSamplingTestStruct( + generation_config=get_multinomial_temperature(), + prompts=["What is OpenVINO?"], + ref_texts=[ + [ + "\n\nOpenVINO is a software development platform developed by OpenVINO, a set of technology companies and startups that enables developers to use the most" + ] + ], + ), + RandomSamplingTestStruct( + generation_config=get_multinomial_temperature_and_top_p(), + prompts=["What is OpenVINO?"], + ref_texts=get_current_plarform_ref_texts({ + "linux": [ + [ + "\nOpenVINO is an online application that allows users to create, test, and analyze their own software using a collection of software packages. 
The application" + ] + ], + "win32": [ + [ + "\n\nOpenVINO is a software development platform designed to allow developers to develop and commercialize the most important software products on the web. OpenV" + ] + ], + }) + ), + RandomSamplingTestStruct( + generation_config=get_multinomial_temperature_and_top_k(), + prompts=["What is OpenVINO?"], + ref_texts=[ + [ + "\n\nOpenVINO is a software that allows users to create a virtual machine with the ability to create a virtual machine in a virtual environment. Open" + ] + ], + ), + RandomSamplingTestStruct( + generation_config=get_multinomial_temperature_top_p_and_top_k(), + prompts=["What is OpenVINO?"], + ref_texts=get_current_plarform_ref_texts({ + "linux": [ + [ + "\nOpenVINO is an open source software that allows developers to create, manage, and distribute software. It is an open source project that allows developers" + ] + ], + "win32": [ + [ + "\n\nOpenVINO is a software that allows users to create a virtual machine with the ability to create a virtual machine in a virtual environment. Open" + ] + ], + }), + ), + RandomSamplingTestStruct( + generation_config=get_multinomial_temperature_and_repetition_penalty(), + prompts=["What is OpenVINO?"], + ref_texts=[ + [ + "\nOpen Vino's are a new and improved way to find cheap, fast-investment frozen vegetables that have no waste or calories. They're" + ] + ], + ), + RandomSamplingTestStruct( + generation_config=get_multinomial_temperature_and_num_return_sequence(), + prompts=["What is location of"], + ref_texts=[ + [ + " the exact same image?\nI've tried multiple times to find it, but I'm still not sure. I am sure it's the exact same", + " your new house?\nAnywhere that has a GPS. It will be up to you.", + " your cat? 
He is more likely to be on the floor with him.\nTalduck" + ] + ], + ), + RandomSamplingTestStruct( + generation_config=get_multinomial_all_parameters(), + prompts=["Tell me something about UAE"], + ref_texts=get_current_plarform_ref_texts({ + "linux": [ + [ + " and how it's not like we're all in the same boat right now lol (or even close) πŸ˜‚πŸ˜! Just curious :) If", + "? You are my country... so what does our military do here?? What am i missing out on?? And why don't u tell us?", + "?\nThe U.S government has been doing quite well with foreign-made aircraft for many years under US administration....and they have very good reasons", + "? I think that is a bit of an anomaly, but you might want to ask yourself this question: Where can some young people from Dubai or Bahrain", + ] + ], + "win32": [ + [ + "? I think that is a bit of an anomaly, especially since there aren't many Americans living here (like us). What makes you say they've", + "? You are my country... so what does our future have to do with your problems?? \U0001f609\U0001f608\U0001f495 \U0001f5a4\ufffd", + "?\nThe U.S government has been doing quite well for decades now when compared strictly directly or indirectly as regards security issues.. They even made some", + " and how it's not like we're all in the same boat either! We had such fun meeting each other at different times this past summer :) It", + ] + ], + }), + ), + RandomSamplingTestStruct( + generation_config=get_multinomial_temperature_and_presence_penalty(), + prompts=["What is OpenVINO?"], + ref_texts=[ + [ + "\n\nOpenVINO is a software development platform developed by OpenVINO, Inc., which uses a RESTful API for server-side web applications" + ] + ], + ), + RandomSamplingTestStruct( + generation_config=get_multinomial_temperature_and_frequence_penalty(), + prompts=["What is OpenVINO?"], + ref_texts=[ + [ + "\n\nOpenVINO is a software development platform developed by OpenVINO, Inc., which offers the Linux-based platform. 
OpenVINO's" + ] + ], + ), + RandomSamplingTestStruct( + generation_config=get_greedy_with_penalties(), + prompts=["What is OpenVINO?"], + ref_texts=[ + [ + "\nOpenVINO is a software that allows users to create and manage their own virtual machines. It's designed for use with Windows, Mac OS X" + ] + ], + ), + RandomSamplingTestStruct( + generation_config=get_multinomial_max_and_min_token(), + prompts=["What is OpenVINO?"], + ref_texts=get_current_plarform_ref_texts({ + "linux": [ + [ + "\nOpenVINO is a Linux distro. It's not as simple as using the Linux distro itself. OpenVINO is essentially a dist", + "\nOpenVINO is an open-source open-source software that allows anyone to work with a virtual machine, from a smartphone to an iPhone,", + "\n\nOpenVINO is a social networking tool. OpenVINO is a free virtualization service that works at scale. The tool provides the ability", + ] + ], + "win32": [ + [ + "\nOpenVINO is the latest addition to the OpenVINO series of platforms. OpenVINO is an open source software development framework for all platforms", + "\nOpenVINO is a browser-based virtual assistant that enables developers and developers to quickly communicate with their own virtual machines. Using this virtual assistant,", + "\n\nOpenVINO is a program designed to help you find the best open source open source software. 
The program, which is a lightweight package and", + ] + ], + }), + ), +] + + +@pytest.mark.precommit +@pytest.mark.parametrize("test_struct", RANDOM_SAMPLING_TEST_CASES, + ids=["multinomial_temperature", + "multinomial_temperature_and_top_p", + "multinomial_temperature_and_top_k", + "multinomial_temperature_top_p_and_top_k", + "multinomial_temperature_and_repetition_penalty", + "multinomial_temperature_and_num_return_sequence", + "multinomial_all_parameters", + "multinomial_temperature_and_presence_penalty", + "multinomial_temperature_and_frequence_penalty", + "greedy_with_penalties", + "multinomial_max_and_min_token"]) +def test_individual_generation_configs_random(tmp_path, test_struct: RandomSamplingTestStruct): + generation_config = test_struct.generation_config + + prompts = test_struct.prompts + generation_config.rng_seed = 0 + generation_configs = [generation_config] + model_id : str = "facebook/opt-125m" + model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + + model_path : Path = tmp_path / model_id + save_ov_model_from_optimum(model, hf_tokenizer, model_path) + + generate_and_compare_with_reference_text(model_path, prompts, test_struct.ref_texts, generation_configs, DEFAULT_SCHEDULER_CONFIG) + + + +@pytest.mark.precommit +@pytest.mark.parametrize("sampling_config", [get_greedy(), get_beam_search(), get_multinomial_all_parameters()]) +def test_post_oom_health(tmp_path, sampling_config): + generation_config = sampling_config + generation_config.ignore_eos = True + generation_config.max_new_tokens = 1000000 + + scheduler_config = get_scheduler_config() + # Low cache size to trigger OOM quickly + scheduler_config.num_kv_blocks = 10 + generation_configs = [generation_config] + model_id : str = "facebook/opt-125m" + model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + + model_path : Path = tmp_path / model_id + save_ov_model_from_optimum(model, hf_tokenizer, model_path) + + pipe = 
ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix(), {}), scheduler_config, "CPU", {}) + # First run should return incomplete response + output = pipe.generate(["What is OpenVINO?"], generation_configs) + assert (len(output)) + assert(len(output[0].m_generation_ids)) + # Same for the second run, here we want to make sure the cleanup works and we have free blocks after recent OOM + output = pipe.generate(["What is OpenVINO?"], generation_configs) + assert (len(output)) + assert(len(output[0].m_generation_ids)) + del pipe + shutil.rmtree(model_path) \ No newline at end of file diff --git a/tests/python_tests/tokenizer_configs.py b/tests/python_tests/tokenizer_configs.py new file mode 100644 index 0000000000..d8a21946cc --- /dev/null +++ b/tests/python_tests/tokenizer_configs.py @@ -0,0 +1,1005 @@ + +def get_tokenizer_configs(): + return { + "meta-llama/Meta-Llama-3-8B-Instruct": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": None, + "unk_token": None, + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + }, + "TheBloke/Mistral-7B-OpenOrca-GPTQ": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|im_end|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "TinyLlama/TinyLlama-1.1B-Chat-v1.0": { + 
"bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + }, + "upstage/SOLAR-10.7B-Instruct-v1.0": { + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'### System:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'### User:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'### Assistant:\n' + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ '### Assistant:\n' }}{% endif %}{% endfor %}" + }, + "Nondzu/zephyr-speakleash-010-pl-3072-32-16-0.01": { + "bos_token": "", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful assistant.' 
%}{% endif %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{{'<|im_start|>system\n' + system_message + '<|im_end|>\n'}}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ": { + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + }, + "vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy": { + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% 
if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content + ' ' + eos_token }}{% endif %}{% endfor %}" + }, + "Qwen/Qwen1.5-0.5B": { + "bos_token": None, + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ": { + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "<|endoftext|>", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + }, + "Felladrin/Llama-68M-Chat-v1": { + "bos_token": "<|im_start|>", + "eos_token": "<|im_end|>", + "pad_token": "<|im_end|>", + "unk_token": "<|endoftext|>", + "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "databricks/dbrx-instruct": { + "bos_token": "<|endoftext|>", + 
"eos_token": "<|endoftext|>", + "pad_token": "<|pad|>", + "unk_token": "<|endoftext|>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif 'system' not in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks \u2014 remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER\\'S QUERY.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message | trim + '<|im_end|>\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% endif %}{% endfor %}" + }, + "speakleash/Bielik-7B-Instruct-v0.1": { + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + eos_token }}{% endif %}{% endfor %}" + }, + "internlm/internlm2-chat-7b": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "Qwen/Qwen2-7B-Instruct": { + "bos_token": None, + "eos_token": "<|im_end|>", + "pad_token": "<|endoftext|>", + 
"unk_token": None, + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "codellama/CodeLlama-34b-Instruct-hf": { + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}" + }, + "OpenBuddy/openbuddy-llama3-8b-v21.1-8k": { + "bos_token": None, + "eos_token": "<|end|>", + "pad_token": "<|pad|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{{'<|role|>' + message['role'] + '<|says|>' + message['content'] + '<|end|>' + '\n'}}{% 
endfor %}{% if add_generation_prompt %}{{ '<|role|>assistant<|says|>' }}{% endif %}" + }, + "mosaicml/mpt-30b-chat": { + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": None, + "unk_token": "<|endoftext|>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not 'system' in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message.strip() + '\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% elif (message['role'] == 'assistant') %}{% endif %}{% endfor %}" + }, + "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO": { + "bos_token": "", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{{bos_token}}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "deepseek-ai/deepseek-coder-6.7b-instruct": { + "bos_token": { + "__type": "AddedToken", + "content": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "<|EOT|>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "pad_token": { + 
"__type": "AddedToken", + "content": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}" + }, + "deepseek-ai/deepseek-math-7b-rl": { + "bos_token": { + "__type": "AddedToken", + "content": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "pad_token": { + "__type": "AddedToken", + "content": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token 
}}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}" + }, + "FINGU-AI/FinguAI-Chat-v1": { + "bos_token": None, + "eos_token": "<|im_end|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "allenai/tulu-2-7b": { + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + }, + "maldv/winter-garden-7b-alpha": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{bos_token}}{% for message in messages %}{% if 'name' in 
message %}{{message['name'] + ('' if 'to' not in message else ' (to ' + message['to'] + ')') + ': ' + message['content'] + '\n\n'}}{% else %}{{message['content'] + '\n\n '}}{% endif %}{% endfor %}" + }, + "mlabonne/NeuralMonarch-7B": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{{bos_token + message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}" + }, + "meta-llama/Llama-2-7b-chat-hf": { + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" + }, + "GritLM/GritLM-7B": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + }, + "ishorn5/RTLCoder-Deepseek-v1.1": { + "bos_token": 
"<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "eos_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "unk_token": None, + "chat_template": "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n" + }, + "jondurbin/bagel-34b-v0.2": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <>\\n' + messages[idx]['content'] + '\\n<>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}" + }, + "openchat/openchat-3.5-0106": { + "bos_token": "", + "eos_token": "<|end_of_turn|>", + "pad_token": None, + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] 
+ '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}" + }, + "mobiuslabsgmbh/aanaphi2-v0.1": { + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "[PAD]", + "unk_token": "<|endoftext|>", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'### Human: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{'### Assistant: ' + message['content'].strip() + '\n'}}{% endif %}{% endfor %}" + }, + "typeof/mistral-60m": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{bos_token + message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}" + }, + "turboderp/Cat-Llama-3-70B-instruct": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|im_end|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nBelow is a conversation between a curious user and a helpful AI assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "saltlux/Ko-Llama3-Luxia-8B": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in 
messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ content }}{% elif message['role'] == 'assistant' %}{{ content + '\\n' }}{% endif %}{% endfor %}" + }, + "h2oai/h2o-danube2-1.8b-chat": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>' + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}" + }, + "abhishek/autotrain-llama3-70b-orpo-v1": { + "bos_token": "", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": None, + "chat_template": "{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}" + }, + "casperhansen/llama-3-70b-instruct-awq": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}" + }, + "01-ai/Yi-1.5-34B-Chat": { + "bos_token": "<|startoftext|>", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = 
message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}" + }, + "allenai/OLMo-7B-Instruct": { + "bos_token": None, + "eos_token": "<|endoftext|>", + "pad_token": "<|padding|>", + "unk_token": None, + "chat_template": "{{ eos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + }, + "TheBloke/deepseek-coder-33B-instruct-GPTQ": { + "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "eos_token": "<|EOT|>", + "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "unk_token": None, + "chat_template": "{%- set found_item = false -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set found_item = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not found_item -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. 
For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n" + }, + "cognitivecomputations/dolphin-2.8-mistral-7b-v02": { + "bos_token": "", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "alexsobolev/IcaroLM": { + "bos_token": "", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['from'] == 'human' %}{{'<|im_start|>user\n' + message['value'] + '<|im_end|>\n'}}{% elif message['from'] == 'gpt' %}{{'<|im_start|>assistant\n' + message['value'] + '<|im_end|>\n' }}{% else %}{{ '<|im_start|>system\n' + message['value'] + '<|im_end|>\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "tokyotech-llm/Swallow-7b-instruct-v0.1": { + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + 
"rstrip": False, + "single_word": False + }, + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true and not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = '\u3042\u306a\u305f\u306f\u8aa0\u5b9f\u3067\u512a\u79c0\u306a\u65e5\u672c\u4eba\u306e\u30a2\u30b7\u30b9\u30bf\u30f3\u30c8\u3067\u3059\u3002' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{{ bos_token }}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST] ' }}{% elif message['role'] == 'system' %}{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ '' + content.strip() + '' + eos_token }}{% endif %}{% endfor %}" + }, + "instructlab/merlinite-7b-lab": { + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|pad|>", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>'+ '\n' + message['content'] + '\n'}}{% elif message['role'] == 'user' %}{{'<|user|>' + '\n' + message['content'] + '\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}{% endif %}{% endfor %}" + }, + "microsoft/Phi-3-medium-128k-instruct": { + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|placeholder6|>", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + 
'\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" + }, + "katuni4ka/tiny-random-phi3": { + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" + }, + "microsoft/Phi-3-mini-128k-instruct": { + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|placeholder6|>", + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" + }, + "VAGOsolutions/SauerkrautLM-Qwen-32b": { + "bos_token": None, + "eos_token": "<|im_end|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{% set system_message = 'Du bist ein freundlicher und hilfsbereiter KI-Assistent.' 
%}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}" + }, + "AI-Sweden-Models/gpt-sw3-356m-instruct": { + "bos_token": None, + "eos_token": None, + "pad_token": None, + "unk_token": None, + "chat_template": "{{ eos_token }}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}{% else %}{{ 'Bot: ' + message['content']}}{% endif %}{{ message['text'] }}{{ bos_token }}{% endfor %}Bot:" + }, + "google/gemma-7b-it": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}" + }, + "ise-uiuc/Magicoder-S-DS-6.7B": { + "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "eos_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "unk_token": None, + "chat_template": "{{bos_token}}{{'You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n'}}\n{%- for message in 
messages %}\n {%- if message['role'] == 'system' %}\n {{ raise_exception('System messages are not allowed in this template.') }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'@@ Instruction\n' + message['content'] + '\n\n'}}\n {%- else %}\n{{'@@ Response\n' + message['content'] + eos_token + '\n\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'@@ Response\n'}}" + }, + "Deci/DeciLM-7B": { + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '### User:\n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ '### System:\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '### Assistant:\n' + message['content'] }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '### Assistant:' }}\n{% endif %}\n{% endfor %}" + }, + "katuni4ka/tiny-random-minicpm": { + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<\u7528\u6237>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}" + }, + "UnicomLLM/Unichat-llama3-Chinese-8B-28K": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = message['content'] %}{% if loop.index0 == 0 %}{% set content =bos_token + content %}{% endif %}{% if loop.index0 ==1 %}{% set content = 'Human:' + content %}{% endif %}{% if loop.index0 %2!=0 and loop.index0 !=1 %}{% set content = bos_token+'Human:' + content %}{% endif %}{% if loop.index0 !=0 and loop.index0 %2==0 and not loop.last %}{% set content = 'Assistant:'+content+ eos_token %}{% endif %}{{ content+'\n' }}{% endfor %}{{ 'Assistant:' }}" + }, + "RLHFlow/LLaMA3-SFT": { + "bos_token": 
"<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|start_header_id|>' + message['role'] + '<|end_header_id|>' + '\n' + message['content'] + '<|eot_id|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n' }}{% endif %}" + }, + "bofenghuang/vigogne-2-7b-chat": { + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true %}{% set loop_messages = messages %}{% set system_message = 'Vous \u00eates Vigogne, un assistant IA cr\u00e9\u00e9 par Zaion Lab. Vous suivez extr\u00eamement bien les instructions. Aidez autant que vous le pouvez.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|system|>: ' + system_message + '\\n' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>: ' + message['content'].strip() + '\\n' }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>: ' + message['content'].strip() + eos_token + '\\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>:' }}{% endif %}" + }, + "aisingapore/sea-lion-7b-instruct": { + "bos_token": None, + "eos_token": "<|endoftext|>", + "pad_token": "<|padding|>", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}### USER:\n{{ message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}### RESPONSE:\n{{ message['content'] + '\n\n' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}{% if add_generation_prompt %}### RESPONSE:\n{% endif %}" + }, + "microsoft/Phi-3-small-8k-instruct": { + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}" + }, + "THUDM/cogvlm2-llama3-chat-19B": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": None, + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}{% set 
add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ eos_token }}{% endif %}" + }, + "tiiuae/falcon-11B": { + "bos_token": ">>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'User: \n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ 'System: ' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ 'Falcon:\n' + message['content']}}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Falcon:' }}\n{% endif %}\n{% endfor %}" + }, + "Mihaiii/Pallas-0.5": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'SYSTEM:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'USER:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'ASSISTANT:\n' + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ 'ASSISTANT:\n' }}{% endif %}{% endfor %}" + }, + "prithivida/Asimov-7B-v2": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'### ' + message['role'] + ': ' + message['content'] }}{% endfor %}{% if add_generation_prompt %}{{ '### Assistant: ' }}{% endif %}" + }, + "dreamgen/opus-v1.2-7b": { + "bos_token": "", + "eos_token": "", + "pad_token": 
None, + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>'}}{% if message['role']=='assistant' %}{{'text'}}{% else %}{{message['role']}}{% endif %}{{'\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>text\n' }}{% endif %}" + }, + "KnutJaegersberg/internlm-20b-llama": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.last and message['role'] != 'user' %}{{ raise_exception('Most recent message must come from user!') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|User|>:' + message['content'] + '\n'}}{% elif message['role'] == 'assistant' %}{{ '<|Bot|>:' + message['content'] + '\n'}}{% else %}{{ raise_exception('Only user and assistant roles are supported in this model!') }}{% endif %}{% endfor %}{{ '<|Bot|>:' }}" + }, + "alpindale/WizardLM-2-8x22B": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{{ messages[0]['content'].strip() }}{% else %}{% set loop_messages = messages %}{{ 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\\'s questions.' 
}}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ ' USER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% else %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ '\nUSER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' ASSISTANT:' }}{% endif %}" + }, + "yentinglin/Taiwan-LLM-7B-v2.0-base": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = '\u4f60\u662f\u4eba\u5de5\u667a\u6167\u52a9\u7406\uff0c\u4ee5\u4e0b\u662f\u7528\u6236\u548c\u4eba\u5de5\u667a\u80fd\u52a9\u7406\u4e4b\u9593\u7684\u5c0d\u8a71\u3002\u4f60\u8981\u5c0d\u7528\u6236\u7684\u554f\u984c\u63d0\u4f9b\u6709\u7528\u3001\u5b89\u5168\u3001\u8a73\u7d30\u548c\u79ae\u8c8c\u7684\u56de\u7b54\u3002' %}{% endif %}{{system_message + eos_token}}{% for message in loop_messages %}{% if message['role'] == 'user' %}USER: {{ message['content'].strip() + eos_token }}{% elif message['role'] == 'system' %}{{message['content'].strip() + eos_token}}{% elif message['role'] == 'assistant' %}ASSISTANT: {{ message['content'].strip() + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'ASSISTANT:'}}{% endif %}" + }, + "maywell/Synatra-Mixtral-8x7B": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n\n{% for message in messages %}{% if message['role'] == 'user' %}### Instruction:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'assistant' %}### Response:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'system' %}{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}\n### Response:\n{% endif %}" + }, + "MediaTek-Research/Breeze-7B-Instruct-v1_0": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }} {{ system_message }} {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... 
or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + }, + "MTSAIR/multi_verse_model": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '### Instruction:\n' + message['content'] + '\n### Response:\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% elif message['role'] == 'system' %}{{ '### System:\n' + message['content'] + '\n' }}{% endif %}{% endfor %}" + }, + "bofenghuang/vigostral-7b-chat": { + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous \u00eates Vigogne, un assistant IA cr\u00e9\u00e9 par Zaion Lab. Vous suivez extr\u00eamement bien les instructions. Aidez autant que vous le pouvez.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" + }, + "SeaLLMs/SeaLLM-7B-v2.5": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "qnguyen3/Master-Yi-9B": { + "bos_token": "<|startoftext|>", + "eos_token": "<|im_end|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}" + }, + "meetkai/functionary-small-v2.5": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": 
None, + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% elif message['role'] == 'tool' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + 'name=' + message['name'] + '\n' + message['content'] + '<|eot_id|>' }}{% else %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'}}{% if message['content'] is not none %}\n{{ message['content'] }}{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{{ '<|reserved_special_token_249|>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments'] }}{% endfor %}\n{% endif %}\n{{ '<|eot_id|>' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + }, + "h2oai/h2o-danube-1.8b-chat": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>' + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}" + }, + "TheBloke/CodeLlama-70B-Instruct-AWQ": { + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '' 
}}{% endif %}{% set content = 'Source: ' + message['role'] + '\n\n ' + message['content'].strip() %}{{ content + ' ' }}{% endfor %}{{'Source: assistant\nDestination: user\n\n '}}" + }, + "FairMind/Phi-3-mini-4k-instruct-bnb-4bit-Ita": { + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] in ['user', 'system']) %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" + }, + "ibm-granite/granite-8b-code-instruct": { + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "<|endoftext|>", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'Question:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'system' %}\n{{ 'System:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Answer:\n' + message['content'] + '\n\n' }}{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Answer:\n' }}{% endif %}{% endfor %}" + }, + "dicta-il/dictalm2.0-instruct": { + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + }, + "nvidia/Llama3-ChatQA-1.5-8B": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": None, + "unk_token": 
None, + "chat_template": "{{ bos_token }}{%- if messages[0]['role'] == 'system' -%}{% set loop_messages = messages[1:] %}{%- else -%}{% set loop_messages = messages %}{% endif %}System: This is a chat between a user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions based on the context. The assistant should also indicate when the answer cannot be found in the context.\n\n{% for message in loop_messages %}{%- if message['role'] == 'user' -%}User: {{ message['content'].strip() + '\n\n' }}{%- else -%}Assistant: {{ message['content'].strip() + '\n\n' }}{%- endif %}{% if loop.last and message['role'] == 'user' %}Assistant:{% endif %}{% endfor %}" + }, + "openchat/openchat-3.6-8b-20240522": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": None, + "unk_token": None, + "chat_template": "{{ bos_token }}{% for message in messages %}{% if message['role'] in ['user', 'assistant'] %}{% set content = '<|start_header_id|>GPT4 Correct ' + message['role'].title() + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% elif message['role'] == 'system' %}{% set content = '<|start_header_id|>System<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% else %}{{ raise_exception('Only user, assistant and system roles are supported!') }}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>GPT4 Correct Assistant<|end_header_id|>\n\n' }}{% endif %}" + }, + "OpenBuddy/openbuddy-mistral2-7b-v20.3-32k": { + "bos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "", + "lstrip": False, + "normalized": True, + 
"rstrip": False, + "single_word": False + }, + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n' }}{% elif message['role'] == 'assistant' %}{% if loop.last %}{{ 'Assistant: ' + message['content']}}{% else %}{{ 'Assistant: ' + message['content'] + eos_token + '\n' }}{% endif %}{% elif message['role'] == 'system' %}{{ message['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ 'Assistant:' }}{% endif %}" + }, + "tenyx/TenyxChat-7B-v1": { + "bos_token": "", + "eos_token": "<|end_of_turn|>", + "pad_token": "<|end_of_turn|>", + "unk_token": "", + "chat_template": "{{ bos_token }} {% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'User:' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ 'System:' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ 'Assistant:' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Assistant:' }}{% endif %}\n{% endfor %}" + }, + "LoneStriker/TinyLlama-1.1B-32k-Instruct-3.0bpw-h6-exl2": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{ bos_token }}{% if bos_token|length > 0 %}{{ '\n' }}{% endif %}{% if messages[0]['role'] != 'system' %}{{ '### Instruction:\nYou are an unbiased, uncensored, helpful assistant.' 
}}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{ '### Instruction:\n' + message['content'] }}{% elif message['role'] == 'user' %}{{ '\n\n### Input:\n' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '\n\n### Response:\n' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, assistant, and system roles are supported!') }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\n### Response:\n' }}{% endif %}" + }, + "SeaLLMs/SeaLLM-7B-v2": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + ''}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "cognitivecomputations/dolphin-2.6-mistral-7b-dpo-laser": { + "bos_token": "", + "eos_token": "<|im_end|>", + "pad_token": "<|im_end|>", + "unk_token": "", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|im_start|>user\n' + message['content'] + '<|im_end|>' }}\n{% elif message['role'] == 'system' %}\n{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>' }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|im_start|>assistant\n' + message['content'] + '<|im_end|>' }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|im_start|>assistant' }}\n{% endif %}\n{% endfor %}" + }, + "vaiv/llamion-14b-chat": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\n\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}" + }, + 
"yam-peleg/Hebrew-Gemma-11B-V2": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}" + }, + "shenzhi-wang/Llama3-8B-Chinese-Chat": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>", + "unk_token": None, + "chat_template": "{{ '<|begin_of_text|>' }}{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message | trim + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + }, + "ericzzz/falcon-rw-1b-chat": { + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": None, + "unk_token": "<|endoftext|>", + "chat_template": "{% for message in messages %}{% if loop.index > 1 and loop.previtem['role'] != 'assistant' %}{{ ' ' }}{% endif %}{% if message['role'] == 'system' %}{{ '[SYS] ' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'].strip() }}{% elif message['role'] == 'assistant' 
%}{{ '[RESP] ' + message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' [RESP] ' }}{% endif %}" + }, + "NLPark/AnFeng_v3_Avocet": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}" + }, + "microsoft/Phi-3-vision-128k-instruct": { + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "", + "chat_template": "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}" + }, + "jphme/em_german_leo_mistral": { + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{message['content'] + ' '}}{% elif message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' '}}{% elif message['role'] == 'assistant' %}{{ 'ASSISTANT: ' + message['content'] + ' '}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT: '}}{% endif %}" + }, + "nlpai-lab/KULLM3": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = \"\ub2f9\uc2e0\uc740 \uace0\ub824\ub300\ud559\uad50 NLP&AI \uc5f0\uad6c\uc2e4\uc5d0\uc11c \ub9cc\ub4e0 AI \ucc57\ubd07\uc785\ub2c8\ub2e4. \ub2f9\uc2e0\uc758 \uc774\ub984\uc740 'KULLM'\uc73c\ub85c, \ud55c\uad6d\uc5b4\ub85c\ub294 '\uad6c\ub984'\uc744 \ub73b\ud569\ub2c8\ub2e4. 
\ub2f9\uc2e0\uc740 \ube44\ub3c4\ub355\uc801\uc774\uac70\ub098, \uc131\uc801\uc774\uac70\ub098, \ubd88\ubc95\uc801\uc774\uac70\ub098 \ub610\ub294 \uc0ac\ud68c \ud1b5\ub150\uc801\uc73c\ub85c \ud5c8\uc6a9\ub418\uc9c0 \uc54a\ub294 \ubc1c\uc5b8\uc740 \ud558\uc9c0 \uc54a\uc2b5\ub2c8\ub2e4. \uc0ac\uc6a9\uc790\uc640 \uc990\uac81\uac8c \ub300\ud654\ud558\uba70, \uc0ac\uc6a9\uc790\uc758 \uc751\ub2f5\uc5d0 \uac00\ub2a5\ud55c \uc815\ud655\ud558\uace0 \uce5c\uc808\ud558\uac8c \uc751\ub2f5\ud568\uc73c\ub85c\uc368 \ucd5c\ub300\ud55c \ub3c4\uc640\uc8fc\ub824\uace0 \ub178\ub825\ud569\ub2c8\ub2e4. \uc9c8\ubb38\uc774 \uc774\uc0c1\ud558\ub2e4\uba74, \uc5b4\ub5a4 \ubd80\ubd84\uc774 \uc774\uc0c1\ud55c\uc9c0 \uc124\uba85\ud569\ub2c8\ub2e4. \uac70\uc9d3 \uc815\ubcf4\ub97c \ubc1c\uc5b8\ud558\uc9c0 \uc54a\ub3c4\ub85d \uc8fc\uc758\ud569\ub2c8\ub2e4.\" %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]'}}{% elif message['role'] == 'system' %}{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" + }, + "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% elif messages[-1]['role'] == 
'assistant' %}{{ eos_token }}{% endif %}" + }, + "MediaTek-Research/Breeze-7B-Instruct-v0_1": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }}{{ system_message }} {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + }, + "microsoft/DialoGPT-large": { + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": None, + "unk_token": "<|endoftext|>", + "chat_template": "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}" + }, + "meta-llama/Meta-Llama-Guard-2-8B": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": None, + "unk_token": None, + "chat_template": "{% if messages|length % 2 == 0 %}{% set role = 'Agent' %}{% else %}{% set role = 'User' %}{% endif %}{{ \"[INST] Task: Check if there is unsafe content in '\" + role + \"' messages in conversations according our safety policy with the below categories.\n\n\nS1: Violent Crimes. \nS2: Non-Violent Crimes. \nS3: Sex Crimes. \nS4: Child Exploitation. \nS5: Specialized Advice. \nS6: Privacy. 
\nS7: Intellectual Property. \nS8: Indiscriminate Weapons. \nS9: Hate. \nS10: Self-Harm. \nS11: Sexual Content. \n\n\n\n\n\" }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{% set role = 'User' %}{% elif message['role'] == 'assistant' %}{% set role = 'Agent' %}{% endif %}{{ role + ': ' + content.strip() + '\n\n' }}{% endfor %}{{ \"\n\nProvide your safety assessment for ONLY THE LAST \" + role + \" message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories. [/INST]\" }}" + }, + "chinoll/Yi-6b-200k-dpo": { + "bos_token": "<|startoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|Human|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|System|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|Assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|Assistant|>' }}\n{% endif %}\n{% endfor %}" + }, + "shanchen/llama3-8B-slerp-biomed-chat-chinese": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>", + "unk_token": None, + "chat_template": "{{ '<|begin_of_text|>' }}{% set system_message = 'You are Llama3-8B-Chinese-Chat-v2, finetuned from Llama3-8B-Instruct on Chinese-English dataset using the ORPO algorithm. You are a helpful assistant.' 
%}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message | trim + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + }, + "MLP-KTLim/llama-3-Korean-Bllossom-8B": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + }, + "UnfilteredAI/UNfilteredAI-1B": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|user|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>' + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|assistant|>' }}{% endif %}{% endfor %}" + }, + "abacusai/Smaug-Mixtral-v0.1": { + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] 
== 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{%if message['content'][0] == '$' %} {% endif %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + }, + "ProbeMedicalYonseiMAILab/medllama3-v20": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{ message['content'] }}{% elif message['role'] == 'user' %}{{ '\n\nHuman: ' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '\n\nAssistant: ' + message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\nAssistant: ' }}{% endif %}" + }, + "vinai/PhoGPT-4B-Chat": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if message['role'] == 'user' and loop.first %}{{ '### C\u00e2u h\u1ecfi: ' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '\n### C\u00e2u h\u1ecfi: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '\n### Tr\u1ea3 l\u1eddi: ' + message['content'] + eos_token }}{% endif %}{% if loop.last %}{% if message['role'] == 'user' and add_generation_prompt %}{{ '\n### Tr\u1ea3 l\u1eddi:' }}{% endif %}{% endif %}{% endfor %}" + }, + "lucyknada/microsoft_WizardLM-2-7B": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{{ bos_token + (messages[0]['content'].strip() + '\n\n' if messages[0]['role'] == 'system' else '') }}{% for message in 
(messages[1:] if messages[0]['role'] == 'system' else messages) %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\n' }}{% endif %}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}{% endfor %}" + }, + "bigcode/starcoder2-15b-instruct-v0.1": { + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": None, + "unk_token": "<|endoftext|>", + "chat_template": "{{bos_token}}{{'You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n'}}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n {{ raise_exception('System messages are not allowed in this template.') }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction\n' + message['content'] + '\n\n'}}\n {%- else %}\n{{'### Response\n' + message['content'] + eos_token + '\n\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'### Response\n'}}" + }, + "AliAbdelrasheed/maqa_llama_4bit": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|reserved_special_token_250|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if message['from'] == 'human' %}{{ '<|start_header_id|>user<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% elif message['from'] == 'gpt' %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% else %}{{ '<|start_header_id|>' + message['from'] + '<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 
'<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + }, + "lightonai/alfred-40b-1023": { + "bos_token": None, + "eos_token": "", + "pad_token": None, + "unk_token": None, + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '' + message['content'].strip() + '' }}{% elif message['role'] == 'system' %}{{ '' + message['content'].strip() + '' }}{% elif message['role'] == 'assistant' %}{{ '' + message['content'] + '' }}{% else %}{{ raise_exception('Only system, user and assistant roles are supported.') }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '' }}{% endif %}{% endfor %}" + }, + "aloobun/CosmicBun-8B": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{%- set ns = namespace(found=false) -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{%- set ns.found = true -%}{%- endif -%}{%- endfor -%}{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'].rstrip() + '<|im_end|>\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'<|im_start|>user\n' + message['content'].rstrip() + '<|im_end|>\n'-}}{%- else -%}{{-'<|im_start|>assistant\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'<|im_start|>assistant\n'-}}{%- endif -%}" + }, + "Undi95/Mixtral-8x7B-MoE-RP-Story": { + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <>\\n' + messages[idx]['content'] + '\\n<>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 
'assistant' -%}\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}\n" + }, + "TIGER-Lab/MAmmoTH2-8B-Plus": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>", + "unk_token": None, + "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|begin_of_text|>' + '<|start_header_id|>system<|end_header_id|>\\n\\n' + system_message + '<|eot_id|>' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|start_header_id|>user<|end_header_id|>\\n\\n' + content + '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|eot_id|>' }}{% endif %}{% endfor %}" + }, + "codellama/CodeLlama-70b-Instruct-hf": { + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\n\n ' + message['content'] | trim %}{{ content + ' ' }}{% endfor %}{{'Source: assistant\nDestination: user\n\n '}}" + }, + "stephenlzc/Mistral-7B-v0.3-Chinese-Chat-uncensored": { + "bos_token": "", + "eos_token": "", + "pad_token": "[control_768]", + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{{ '' + system_message }}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ ' [INST] 
' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}" + }, + "gorilla-llm/gorilla-openfunctions-v2": { + "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "eos_token": "<|EOT|>", + "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Gorilla LLM model, developed by Gorilla LLM, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}" + }, + "ghost-x/ghost-7b-alpha": { + "bos_token": "", + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'plugins' %}\n{{ '<|plugins|>\n' + message['content'] + '\n\nStandards for using the tool must comply with the following syntax:\n[execute]({\"type\": string, \"function\": string, \"arguments\": object})' + eos_token }}\n{% elif message['role'] == 
'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'execute' %}\n{{ '<|assistant|>\n[execute](' + message['content'] + ')' + eos_token }}\n{% elif message['role'] == 'response' %}\n{{ '<|tool|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + }, + "winninghealth/WiNGPT2-Llama-3-8B-Chat": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}System\uff1a{% endif %}{% if message['role'] == 'user' %}User\uff1a{% endif %}{% if message['role'] == 'assistant' %}Assistant\uff1a{% endif %}{{ message['content'] }}<|end_of_text|>\n {% endfor %}Assistant\uff1a" + }, + "BramVanroy/Llama-2-13b-chat-dutch": { + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{%set system_message = 'Je bent een behulpzame, respectvolle en eerlijke assistent. Antwoord altijd zo behulpzaam mogelijk. Je antwoorden mogen geen schadelijke, onethische, racistische, seksistische, gevaarlijke of illegale inhoud bevatten. Zorg ervoor dat je antwoorden sociaal onbevooroordeeld en positief van aard zijn.\n\nAls een vraag nergens op slaat of feitelijk niet coherent is, leg dan uit waarom in plaats van iets niet correct te antwoorden. Als je het antwoord op een vraag niet weet, deel dan geen onjuiste informatie.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\n' + system_message + '\n<>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\n' + content.strip() + '\n<>\n\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" + }, + "THUDM/chatglm3-6b": { + "bos_token": None, + "eos_token": "", + "pad_token": "", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}" + }, + "microsoft/Phi-3-mini-4k-instruct": { + "bos_token": "", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}" + }, + "mistralai/Mistral-7B-Instruct-v0.1": { + "bos_token": "", + "eos_token": "", + "pad_token": None, + "unk_token": "", + "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = 
messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + eos_token}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n" + } + } diff --git a/text_generation/causal_lm/cpp/continuous_batching/CMakeLists.txt b/text_generation/causal_lm/cpp/continuous_batching/CMakeLists.txt deleted file mode 100644 index ca275d09b7..0000000000 --- a/text_generation/causal_lm/cpp/continuous_batching/CMakeLists.txt +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -cmake_minimum_required(VERSION 3.15) - -project(continuous_batching) - -if(POLICY CMP0135) - cmake_policy(SET CMP0135 NEW) -endif() - -set(CMAKE_POSITION_INDEPENDENT_CODE ON) - -include(CMakeDependentOption) - -option(ENABLE_APPS "Enable C++ apps" ON) -option(ENABLE_PYTHON "Enable Python API" ON) - -add_subdirectory(library) - -if(ENABLE_APPS) - add_subdirectory(apps) -endif() - -if(ENABLE_PYTHON) - add_subdirectory(python) -endif() diff --git a/text_generation/causal_lm/cpp/continuous_batching/Dockerfile b/text_generation/causal_lm/cpp/continuous_batching/Dockerfile deleted file mode 100644 index c5576673f3..0000000000 --- a/text_generation/causal_lm/cpp/continuous_batching/Dockerfile +++ /dev/null @@ -1,35 +0,0 @@ 
-FROM ubuntu:22.04 - -ARG JOBS -WORKDIR /workspace -RUN apt-get update -y && apt-get install -y python3-pip python3-venv git - -# Install OpenVINO -RUN git clone --branch master https://github.com/openvinotoolkit/openvino.git && \ - cd /workspace/openvino && \ - git submodule update --init -- /workspace/openvino/thirdparty/xbyak /workspace/openvino/thirdparty/pugixml /workspace/openvino/thirdparty/open_model_zoo \ - /workspace/openvino/thirdparty/protobuf /workspace/openvino/thirdparty/snappy /workspace/openvino/thirdparty/telemetry /workspace/openvino/src/plugins/intel_cpu/thirdparty/mlas \ - /workspace/openvino/src/plugins/intel_cpu/thirdparty/onednn /workspace/openvino/src/bindings/python/thirdparty/pybind11 && cd - - -RUN /workspace/openvino/install_build_dependencies.sh -RUN python3 -m pip install -r /workspace/openvino/src/bindings/python/wheel/requirements-dev.txt -RUN cmake -DENABLE_PYTHON=ON -DENABLE_PYTHON_PACKAGING=ON -DENABLE_WHEEL=ON -DENABLE_CPPLINT=OFF -DENABLE_SAMPLES=OFF -DENABLE_INTEL_GPU=OFF \ - -DENABLE_INTEL_NPU=OFF -DENABLE_TEMPLATE=OFF -DENABLE_AUTO=OFF -DENABLE_HETERO=OFF -DENABLE_AUTO_BATCH=OFF -DENABLE_OV_TF_FRONTEND=ON -DENABLE_OV_ONNX_FRONTEND=OFF \ - -DENABLE_OV_TF_LITE_FRONTEND=OFF -DENABLE_OV_PADDLE_FRONTEND=OFF -S /workspace/openvino -B /workspace/openvino_build -RUN cmake --build /workspace/openvino_build --parallel $JOBS -RUN cmake -P /workspace/openvino_build/cmake_install.cmake -RUN python3 -m pip install /workspace/openvino_build/wheels/openvino-2024* -ENV OpenVINO_DIR=/workspace/openvino_build - -# Download dataset -RUN wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - -# Build continuous batching library -RUN git clone --branch ct-beam-search https://github.com/ilya-lavrenov/openvino.genai.git && cd /workspace/openvino.genai/text_generation/causal_lm/cpp/continuous_batching && \ - git submodule update --remote --init && cmake 
-DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ && cmake --build ./build/ -j $JOBS - -# Install test dependencies -RUN python3 -m pip install --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly/ /workspace/openvino.genai/thirdparty/openvino_tokenizers -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/openvino.genai/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt -ENV PYTHONPATH=/workspace/openvino.genai/text_generation/causal_lm/cpp/continuous_batching/build/python -ENV LD_LIBRARY_PATH=/workspace/openvino.genai/build/openvino_genai/ \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/continuous_batching/Makefile b/text_generation/causal_lm/cpp/continuous_batching/Makefile deleted file mode 100644 index 10df90c0f0..0000000000 --- a/text_generation/causal_lm/cpp/continuous_batching/Makefile +++ /dev/null @@ -1,40 +0,0 @@ -# -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -GENAI_CPP_DOCKER_IMAGE ?= openvino_llm -GENAI_CPP_IMAGE_TAG ?= latest -HTTP_PROXY := "$(http_proxy)" -HTTPS_PROXY := "$(https_proxy)" -NO_PROXY := "$(no_proxy)" - -ifeq ($(shell uname),Darwin) - # MacOS - CORES_TOTAL := $(shell sysctl -n hw.physicalcpu) -else - # Ubuntu & Redhat - CORES_PER_SOCKET := $(shell lscpu | awk '/^Core\(s\) per socket:/ {print $$NF}') - SOCKETS := $(shell lscpu | awk '/^Socket\(s\):/ {print $$NF}') - CORES_TOTAL := $$(($(SOCKETS) * $(CORES_PER_SOCKET))) -endif -JOBS ?= $(CORES_TOTAL) - -.PHONY: default docker_build \ - -default: docker_build - -.PHONY: docker_build -docker_build: - docker build --build-arg http_proxy="$(http_proxy)" --build-arg no_proxy="$(no_proxy)" --build-arg https_proxy="$(https_proxy)" --build-arg JOBS=$(JOBS) -t $(GENAI_CPP_DOCKER_IMAGE):$(GENAI_CPP_IMAGE_TAG) . \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/CMakeLists.txt b/text_generation/causal_lm/cpp/continuous_batching/library/CMakeLists.txt deleted file mode 100644 index d8e594734e..0000000000 --- a/text_generation/causal_lm/cpp/continuous_batching/library/CMakeLists.txt +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -cmake_minimum_required(VERSION 3.15) - -# start of dependencies - -include(FetchContent) - -if(NOT TARGET nlohmann_json) - FetchContent_Declare(nlohmann_json - URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz - URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) - FetchContent_MakeAvailable(nlohmann_json) -endif() - -find_package(OpenVINO REQUIRED COMPONENTS Runtime) - -# check that SDPA to PA transformtion exists -get_target_property(ov_include_dirs openvino::runtime INTERFACE_INCLUDE_DIRECTORIES) -find_file(spda_to_pa_header sdpa_to_paged_attention.hpp - PATHS ${ov_include_dirs} - PATH_SUFFIXES openvino/pass - DOC "Path to sdpa_to_paged_attention.hpp header" - NO_CACHE 
REQUIRED NO_DEFAULT_PATH) - -# end of dependencies - -set(TARGET_NAME openvino_continuous_batching) - -add_library(${TARGET_NAME} STATIC - src/tokenizer.cpp - src/generation_config.cpp - src/generation_handle.cpp - src/continuous_batching_pipeline.cpp - src/paged_attention_transformations.cpp) - -add_library(openvino::continuous_batching ALIAS openvino_continuous_batching) - -target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src" - PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") -if(TARGET openvino_tokenizers) - set(OPENVINO_TOKENIZERS_PATH $) -else() - message(FATAL_ERROR "${TEST_TARGET_NAME} must be compiled as part of OpenVIINOGenAI to have the path to openvino_tokenizers hardcoded.") -endif() -target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}") -set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 14 CXX_STANDARD_REQUIRED ON) - -target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE nlohmann_json::nlohmann_json) - -# -# Installation -# - -include(GNUInstallDirs) - -install(TARGETS ${TARGET_NAME} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT openvino_continuous_batching - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT openvino_continuous_batching - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT openvino_continuous_batching) - -install(DIRECTORY include/ - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} - COMPONENT openvino_continuous_batching - FILES_MATCHING PATTERN "*.hpp") - - -# gtest -FetchContent_Declare( - googletest - URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip -) -FetchContent_MakeAvailable(googletest) - - -set(TEST_TARGET_NAME "tests_continuous_batching") -add_executable(${TEST_TARGET_NAME} "src/tests/scheduler.cpp" "src/tests/block_manager.cpp" "src/tests/logit_filtering.cpp" "src/tests/cache_manager.cpp" "src/tests/generate_config.cpp") -target_link_libraries(${TEST_TARGET_NAME} 
PUBLIC ${TARGET_NAME} openvino::runtime gtest_main) -target_include_directories(${TEST_TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/" - PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") -target_compile_features(${TEST_TARGET_NAME} PRIVATE cxx_std_20) diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/include/continuous_batching_pipeline.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/include/continuous_batching_pipeline.hpp deleted file mode 100644 index 33dc168375..0000000000 --- a/text_generation/causal_lm/cpp/continuous_batching/library/include/continuous_batching_pipeline.hpp +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include - -#include "scheduler_config.hpp" -#include "tokenizer.hpp" -#include "generation_config.hpp" -#include "generation_handle.hpp" - -class ContinuousBatchingPipeline { - class Impl; - std::shared_ptr m_impl; - -public: - ContinuousBatchingPipeline(const std::string& models_path, - const SchedulerConfig& scheduler_config, - const std::string& device = "CPU", - const ov::AnyMap& plugin_config = {}); - - std::shared_ptr get_tokenizer(); - - GenerationConfig get_config() const; - - GenerationHandle add_request(uint64_t request_id, std::string prompt, GenerationConfig sampling_params); - - void step(); - - bool has_non_finished_requests(); - - // more high level interface, which can process multiple prompts in continuous batching manner - std::vector generate(const std::vector& prompts, std::vector sampling_params); -}; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/include/generation_config.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/include/generation_config.hpp deleted file mode 100644 index e53cce86a7..0000000000 --- a/text_generation/causal_lm/cpp/continuous_batching/library/include/generation_config.hpp +++ /dev/null @@ -1,78 +0,0 @@ - -// Copyright (C) 
2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include -#include - -enum class StopCriteria { - EARLY, - HEURISTIC, - NEVER -}; - -// TODO: implement better interface, because currently sequence is not available to public API -class Sequence; - -struct GenerationConfig { - // Generic - size_t max_new_tokens = std::numeric_limits::max(); - size_t min_new_tokens = 0; - size_t max_length = std::numeric_limits::max(); // m_max_new_tokens should have priority over m_max_length - bool ignore_eos = false; - - // Beam search specific - size_t num_groups = 1; - size_t group_size = 1; // beam_width - float diversity_penalty = 1.0f; // 0.0 means no diversity - StopCriteria stop_criteria = StopCriteria::HEURISTIC; - size_t num_return_sequences = 3; // is used by beam search, in other case is equal to batch size - - float repetition_penalty = 1.0f; // based on token repetition in prompt and generated tests - float presence_penalty = 0.0f; // based on token repetition and generated tests - float frequence_penalty = 0.0f; // based on quantity token repetition and generated tests - float length_penalty = 1.0f; - size_t no_repeat_ngram_size = std::numeric_limits::max(); - std::function early_finish = [] (const Sequence&) { return false; }; - - // Multinomial - float temperature = 0.0f; // by default we use greedy sampling - int top_k = 0; // HF transformers uses a value of 0 or `None` to disable top-K logit warping - float top_p = 1.0f; // by default convsider all tokens - bool do_sample = false; - size_t rng_seed = 0; - - // special tokens IDs - int64_t bos_token_id = -1; - int64_t pad_token_id = -1; - int64_t eos_token_id = -1; - - // reads generation config from HF generation_config.json - static GenerationConfig from_file(const std::string& generation_config_json); - - static GenerationConfig greedy(); - - static GenerationConfig beam_search(); - - static GenerationConfig multinomial(); - - bool is_greedy_sampling() 
const { - return temperature == 0.0f && !is_beam_search(); - } - - bool is_beam_search() const { - return num_groups * group_size > 1; - } - - bool is_multinomial() const { - return do_sample; - } - - void set_eos_token_id(size_t tokenizer_eos_token_id); - - void validate() const; -}; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/include/tokenizer.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/include/tokenizer.hpp deleted file mode 100644 index 028f05bc76..0000000000 --- a/text_generation/causal_lm/cpp/continuous_batching/library/include/tokenizer.hpp +++ /dev/null @@ -1,27 +0,0 @@ - -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include - -#include "openvino/runtime/tensor.hpp" - -class Tokenizer { - class Impl; - std::shared_ptr m_impl; - -public: - explicit Tokenizer(const std::string& models_path); - - // note, that returned tensor is shared with internal state of InferRequest - // so, it can be changed. 
Please, copy values - ov::Tensor encode(std::string prompt); - - std::string decode(std::vector tokens); - - size_t get_eos_token_id() const; -}; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/block_manager.hpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/block_manager.hpp deleted file mode 100644 index b0c3055bce..0000000000 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/block_manager.hpp +++ /dev/null @@ -1,259 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include -#include -#include - -#include "sequence_group.hpp" - -class KVCacheBlock { - int m_ref_count; - int m_index; -public: - using Ptr = std::shared_ptr; - using CPtr = std::shared_ptr; - - explicit KVCacheBlock(int index) - : m_ref_count(0), - m_index(index) { } - - int get_index() const { - return m_index; - } - - bool is_free() const { - return m_ref_count == 0; - } - - void increment() { - ++m_ref_count; - } - - void release() { - --m_ref_count; - } - - bool copy_on_write() const { - return m_ref_count > 1; - } - - int get_references_count() const { - return m_ref_count; - } -}; - - -class BlockAllocator { - std::list m_free_blocks; - int m_total_num_blocks; -public: - BlockAllocator(int num_blocks) : - m_total_num_blocks(num_blocks) { - for (int block_id = 0; block_id < m_total_num_blocks; ++block_id) { - m_free_blocks.push_back(std::make_shared(block_id)); - } - } - - ~BlockAllocator() { - // sanity check to validate that all blocks are freed - // OPENVINO_ASSERT(m_total_num_blocks == m_free_blocks.size()); - } - - size_t num_free_blocks() const { - return m_free_blocks.size(); - } - - bool can_allocate_blocks(size_t num_blocks) const { - return num_blocks <= m_free_blocks.size(); - } - - void free(KVCacheBlock::Ptr block) { - block->release(); - if (block->is_free()) { - m_free_blocks.push_back(block); - } - } - - KVCacheBlock::Ptr allocate_block() { - 
OPENVINO_ASSERT(can_allocate_blocks(1)); - KVCacheBlock::Ptr allocated_block = m_free_blocks.front(); - allocated_block->increment(); - m_free_blocks.pop_front(); - return allocated_block; - } - - float get_used_percentage() const { - return static_cast(m_total_num_blocks - m_free_blocks.size()) / m_total_num_blocks; - } -}; - -class BlockManager { - BlockAllocator m_allocator; - - // stores blocks for each sequence (not sequence group) - // the same block can be seen in multiple block_tables for different sequences - std::map> m_block_table; -public: - BlockManager(int num_blocks) - : m_allocator(num_blocks) { } - - ~BlockManager() { - // sanity check that all sequences are freed - // OPENVINO_ASSERT(m_block_table.empty()); - } - - const std::vector& get_block_table(uint64_t seq_id) { - OPENVINO_ASSERT(m_block_table.count(seq_id) == 1); - return m_block_table[seq_id]; - } - - const bool has_block_table(uint64_t seq_id) { - return m_block_table.count(seq_id) > 0; - } - - size_t num_free_blocks() const { - return m_allocator.num_free_blocks(); - } - - bool can_allocate_blocks(size_t num_blocks) const { - return m_allocator.can_allocate_blocks(num_blocks); - } - - void allocate(uint64_t sequence_id, size_t num_blocks) { - OPENVINO_ASSERT(num_blocks > 0 && can_allocate_blocks(num_blocks)); - - for (size_t i = 0; i < num_blocks; ++i) { - m_block_table[sequence_id].push_back(m_allocator.allocate_block()); - } - } - - void fork_sequence(uint64_t parent_id, uint64_t child_id) { - OPENVINO_ASSERT(m_block_table.count(child_id) == 0); - m_block_table[child_id].reserve(m_block_table[parent_id].size()); - for (KVCacheBlock::Ptr & block : m_block_table[parent_id]) { - block->increment(); - m_block_table[child_id].push_back(block); - } - } - - void free_sequence(size_t seq_id) { - auto block_table = m_block_table[seq_id]; - - for (KVCacheBlock::Ptr& block : block_table) { - m_allocator.free(block); - } - - OPENVINO_ASSERT(m_block_table.erase(seq_id) == 1); - } - - void 
free_sequence_partially(size_t seq_id, size_t block_num) { - // currently this method is applicable only for groups with single sequences - // TODO: support for groups with multiple sequences - auto block_table = m_block_table[seq_id]; - - OPENVINO_ASSERT(block_table.size() >= block_num); - for (size_t idx = 0; idx < block_num; idx++) { - size_t block_idx = m_block_table[seq_id].size() - idx - 1; - m_allocator.free(block_table[block_idx]); - OPENVINO_ASSERT(block_table[block_idx]->is_free()); - } - m_block_table[seq_id].resize(m_block_table[seq_id].size() - block_num); - - if (m_block_table.size() == 0) { - OPENVINO_ASSERT(m_block_table.erase(seq_id) == 1); - } - } - - bool can_append_slots(SequenceGroup::CPtr seq_group) { - return required_blocks_count(seq_group) <= m_allocator.num_free_blocks(); - } - - size_t required_blocks_count(SequenceGroup::CPtr seq_group) { - std::vector running_sequences = seq_group->get_running_sequences(); - size_t blocks_count= 0; // totat number of needed blocks for sequence group - std::set last_block_ids; // unique last block indices - - for (auto seq: running_sequences) { - auto seq_id = seq->get_id(); - if (m_block_table.find(seq_id) == m_block_table.end()) { - // the block table is empty, so we need to allocate the number of blocks equal to number of logical blocks - blocks_count += seq_group->get_num_logical_blocks(); - continue; - } - auto& block_table = m_block_table[seq_id]; - size_t num_physical_blocks = block_table.size(); - OPENVINO_ASSERT(num_physical_blocks > 0); - - if (num_physical_blocks > seq_group->get_num_logical_blocks()) - // new blocks are not required - continue; - - size_t last_block_id = block_table.back()->get_index(); - - if (last_block_ids.find(last_block_id) != last_block_ids.end()) - // this block was already processed - continue; - - size_t needed_blocks_per_sequence = seq_group->get_num_logical_blocks() - num_physical_blocks; - - KVCacheBlock::Ptr last_block = block_table.back(); - if 
(last_block->copy_on_write()) { - // block is used only by multiple sequences - auto references_count = last_block->get_references_count(); - - if (needed_blocks_per_sequence == 0) { - // case when last block is not completely filled and needs to be copied n - 1 times, where n - references count - blocks_count += references_count - 1; - } - else { - blocks_count += needed_blocks_per_sequence * references_count; - } - } - else { - // block is used only by one sequence - blocks_count += needed_blocks_per_sequence; - } - } - return blocks_count; - } - - std::map> append_slots(SequenceGroup::CPtr seq_group) { - - size_t num_logical_blocks = seq_group->get_num_logical_blocks(); - std::vector running_sequences = seq_group->get_running_sequences(); - - std::map> copy_blocks_map; - for (size_t i = 0; i < running_sequences.size(); ++i) { - Sequence::CPtr sequence = running_sequences[i]; - auto seq_id = sequence->get_id(); - auto& block_table = m_block_table[seq_id]; - size_t num_physical_blocks = block_table.size(); - - if (num_logical_blocks > num_physical_blocks) { - OPENVINO_ASSERT(can_allocate_blocks(num_logical_blocks - num_physical_blocks)); - allocate(seq_id, num_logical_blocks - num_physical_blocks); - } else { - OPENVINO_ASSERT(num_logical_blocks == num_physical_blocks, "A number of physical and logic blocks must be the same in this code path"); - KVCacheBlock::Ptr last_block = block_table.back(); - - if (last_block->copy_on_write()) { - // we need to fork current block, because reference counter is more than 1 - KVCacheBlock::Ptr new_block = m_allocator.allocate_block(); - block_table[num_physical_blocks - 1] = new_block; - // write information about block forking for later usage in CacheManager - copy_blocks_map[last_block->get_index()].push_back(new_block->get_index()); - // release `last_block` usage - m_allocator.free(last_block); - } else { - // nothing to do, because we are the only users of this block - } - } - } - - // it returns information which blocks 
should be forked by CacheManager - return copy_blocks_map; - } -}; diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/generation_config.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/generation_config.cpp deleted file mode 100644 index 54e3f045f6..0000000000 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/generation_config.cpp +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "nlohmann/json.hpp" - -#include "generation_config.hpp" - -#include "openvino/core/except.hpp" - -void GenerationConfig::set_eos_token_id(size_t tokenizer_eos_token_id) { - if (eos_token_id < 0) { - eos_token_id = tokenizer_eos_token_id; - } else { - OPENVINO_ASSERT(eos_token_id == tokenizer_eos_token_id, - "EOS token ID is different in generation config (", eos_token_id, ") and tokenizer (", - tokenizer_eos_token_id, ")"); - } -} - -void GenerationConfig::validate() const { - OPENVINO_ASSERT(min_new_tokens <= max_new_tokens, "min_new_tokens must be less or equal max_new_tokens"); - OPENVINO_ASSERT(min_new_tokens >= 0, "min_new_tokens must be greater 0"); - OPENVINO_ASSERT(max_new_tokens >= 0, "max_new_tokens must be greater 0"); - if (is_beam_search()) { - OPENVINO_ASSERT(no_repeat_ngram_size > 0, "no_repeat_ngram_size must be positive"); - } else { - OPENVINO_ASSERT(repetition_penalty >= 0.0f, "repetition penalty must be a positive value"); - OPENVINO_ASSERT(frequence_penalty >= -2.0f && frequence_penalty <= 2.0f, "frequence_penalty penalty must be a [-2; +2]"); - OPENVINO_ASSERT(presence_penalty >= -2.0f && presence_penalty <= 2.0f, "presence_penalty penalty must be a [-2; +2]"); - if (is_multinomial()) { - OPENVINO_ASSERT(top_p > 0.0f && top_p <= 1.0f, "top_p must be in the interval (0, 1]"); - OPENVINO_ASSERT(temperature >= 0.0f, "temperature must be a positive value"); - } - } -} - -GenerationConfig GenerationConfig::from_file(const 
std::string& generation_config_json) { - std::ifstream f(generation_config_json); - nlohmann::json json_data = nlohmann::json::parse(f); - - GenerationConfig config; - - config.bos_token_id = json_data.value("bos_token_id", -1); - config.eos_token_id = json_data.value("eos_token_id", -1); - config.pad_token_id = json_data.value("pad_token_id", -1); - - config.num_return_sequences = json_data.value("num_return_sequences", 1); - - config.max_new_tokens = json_data.value("max_new_tokens", std::numeric_limits::max()); - config.min_new_tokens = json_data.value("min_new_tokens", 0); - config.max_length = json_data.value("max_length", std::numeric_limits::max()); - - config.temperature = json_data.value("temperature", 0.0f); - config.do_sample = json_data.value("do_sample", false); - config.top_p = json_data.value("top_p", 0.0f); - - // beam_search_params - config.num_groups = json_data.value("num_beam_groups", 1); - config.diversity_penalty = json_data.value("diversity_penalty", 1.0f); - config.repetition_penalty = json_data.value("repetition_penalty", 1.0f); - config.frequence_penalty = json_data.value("frequence_penalty", 0.0f); - config.presence_penalty = json_data.value("presence_penalty", 0.0f); - const int num_beams = json_data.value("num_beams", 1); - config.group_size = num_beams / config.num_groups; - - return config; -} - -GenerationConfig GenerationConfig::greedy() { - GenerationConfig greedy_params; - greedy_params.temperature = 0.0f; - greedy_params.ignore_eos = true; - greedy_params.num_return_sequences = 1; - greedy_params.repetition_penalty = 3.0f; - greedy_params.presence_penalty = 0.1f; - greedy_params.frequence_penalty = 0.01f; - greedy_params.max_new_tokens = 30; - return greedy_params; -} - -GenerationConfig GenerationConfig::beam_search() { - GenerationConfig beam_search; - beam_search.num_groups = 2; - beam_search.num_return_sequences = 3; - beam_search.group_size = 2; - beam_search.max_new_tokens = 100; - beam_search.diversity_penalty = 2.0f; - 
return beam_search; -} - -GenerationConfig GenerationConfig::multinomial() { - GenerationConfig multinomial; - multinomial.do_sample = true; - multinomial.temperature = 0.9f; - multinomial.top_p = 0.9f; - multinomial.top_k = 20; - multinomial.num_return_sequences = 3; - multinomial.presence_penalty = 0.01f; - multinomial.frequence_penalty = 0.1f; - multinomial.min_new_tokens = 15; - multinomial.max_new_tokens = 30; - return multinomial; -} diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/block_manager.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/block_manager.cpp deleted file mode 100644 index 79762318c9..0000000000 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/tests/block_manager.cpp +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include -#include "openvino/runtime/core.hpp" -#include "continuous_batching_pipeline.hpp" -#include "sequence_group.hpp" -#include "scheduler.hpp" -#include "generation_config.hpp" - -TEST(TestBlockManager, general_test) { - BlockManager bm = BlockManager(6); - - bm.allocate(0, 6); - EXPECT_TRUE(bm.has_block_table(0)); - EXPECT_EQ(bm.get_block_table(0).size(), 6); - EXPECT_EQ(bm.num_free_blocks(), 0); - - bm.free_sequence_partially(0, 4); - EXPECT_EQ(bm.get_block_table(0).size(), 2); - EXPECT_EQ(bm.num_free_blocks(), 4); - - bm.free_sequence(0); - EXPECT_FALSE(bm.has_block_table(0)); - EXPECT_EQ(bm.num_free_blocks(), 6); - - bm.allocate(0, 2); - bm.fork_sequence(0, 1); - EXPECT_TRUE(bm.has_block_table(1)); - EXPECT_EQ(bm.get_block_table(1).back()->get_references_count(), 2); -} diff --git a/text_generation/causal_lm/cpp/continuous_batching/library/src/tokenizer.cpp b/text_generation/causal_lm/cpp/continuous_batching/library/src/tokenizer.cpp deleted file mode 100644 index 1153151060..0000000000 --- a/text_generation/causal_lm/cpp/continuous_batching/library/src/tokenizer.cpp +++ 
/dev/null @@ -1,73 +0,0 @@ - -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include -#include "openvino/runtime/core.hpp" - -#include "tokenizer.hpp" - -class Tokenizer::Impl { - const size_t TOKENIZER_BATCH_SIZE = 1; - ov::InferRequest m_tokenizer; - ov::InferRequest m_detokenizer; - std::size_t m_eos_token_id; - //Using multiple infer requests hangs. For now we synchronize entire execution on a single infer request. - std::mutex m_tokenizer_mutex; - std::mutex m_detokenizer_mutex; - -public: - explicit Impl(const std::string& models_path) - { - ov::Core core; - core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - - std::shared_ptr tokenizer_model = core.read_model(models_path + "/openvino_tokenizer.xml"); - const ov::AnyMap& rt_info = tokenizer_model->get_rt_info(); - OPENVINO_ASSERT(rt_info.find("eos_token_id") != rt_info.end(), "Failed to detect \"eos_token_id\" in openvino_tokenizer.xml runtime information"); - m_eos_token_id = rt_info.at("eos_token_id").as(); - - // tokenizer and detokenizer work on CPU only - m_tokenizer = core.compile_model( - tokenizer_model, "CPU").create_infer_request(); - m_detokenizer = core.compile_model( - models_path + "/openvino_detokenizer.xml", "CPU").create_infer_request(); - } - - ov::Tensor encode(std::string prompt) { - std::unique_lock lock(m_tokenizer_mutex); - m_tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {TOKENIZER_BATCH_SIZE}, &prompt}); - m_tokenizer.infer(); - ov::Tensor tmp_tensor = m_tokenizer.get_tensor("input_ids"); - ov::Tensor output_tensor(tmp_tensor.get_element_type(), tmp_tensor.get_shape()); - tmp_tensor.copy_to(output_tensor); - return output_tensor; - } - - std::string decode(std::vector tokens) { - std::unique_lock lock(m_detokenizer_mutex); - m_detokenizer.set_input_tensor(ov::Tensor{ov::element::i64, {TOKENIZER_BATCH_SIZE, tokens.size()}, tokens.data()}); - m_detokenizer.infer(); - return 
m_detokenizer.get_output_tensor().data()[0]; - } - - size_t get_eos_token_id() const { - return m_eos_token_id; - } -}; - -Tokenizer::Tokenizer(const std::string& models_path) { - m_impl = std::make_shared(models_path); -} - -ov::Tensor Tokenizer::encode(std::string prompt) { - return m_impl->encode(prompt); -} - -std::string Tokenizer::decode(std::vector tokens) { - return m_impl->decode(tokens); -} - -size_t Tokenizer::get_eos_token_id() const { - return m_impl->get_eos_token_id(); -} diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/CMakeLists.txt b/text_generation/causal_lm/cpp/continuous_batching/python/CMakeLists.txt deleted file mode 100644 index 1a73aa33c8..0000000000 --- a/text_generation/causal_lm/cpp/continuous_batching/python/CMakeLists.txt +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -include(FetchContent) -FetchContent_Declare( - pybind11 - GIT_REPOSITORY https://github.com/pybind/pybind11 - GIT_TAG v2.12.0 -) - -FetchContent_GetProperties(pybind11) -# search for FindPython3.cmake instead of legacy modules -set(PYBIND11_FINDPYTHON ON) -# the following two calls are required for cross-compilation -if(OpenVINODeveloperPackage_DIR) - ov_find_python3(REQUIRED) - ov_detect_python_module_extension() -else() - if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) - find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module) - else() - find_package(Python3 REQUIRED COMPONENTS Interpreter Development) - endif() -endif() -if(NOT pybind11_POPULATED) - FetchContent_Populate(pybind11) - add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) -endif() - -pybind11_add_module(py_continuous_batching python.cpp) - -target_link_libraries(py_continuous_batching PRIVATE openvino::continuous_batching) diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/python.cpp b/text_generation/causal_lm/cpp/continuous_batching/python/python.cpp deleted file mode 
100644 index 4ea34ad9f7..0000000000 --- a/text_generation/causal_lm/cpp/continuous_batching/python/python.cpp +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "pybind11/pybind11.h" -#include - -#include "continuous_batching_pipeline.hpp" - -namespace py = pybind11; - -std::ostream& operator << (std::ostream& stream, const GenerationResult& generation_result) { - stream << generation_result.m_request_id << std::endl; - const bool has_scores = !generation_result.m_scores.empty(); - for (size_t i = 0; i < generation_result.m_generation_ids.size(); ++i) { - stream << "{ "; - if (has_scores) - stream << generation_result.m_scores[i] << ", "; - stream << generation_result.m_generation_ids[i] << " }" << std::endl; - } - return stream << std::endl; -} - -PYBIND11_MODULE(py_continuous_batching, m) { - py::class_(m, "GenerationResult") - .def(py::init<>()) - .def_readonly("m_request_id", &GenerationResult::m_request_id) - .def_property("m_generation_ids", - [](GenerationResult &r) -> py::list { - py::list res; - for (auto s: r.m_generation_ids) { - - PyObject* py_s = PyUnicode_DecodeUTF8(s.data(), s.length(), "replace"); - res.append(py_s); - } - return res; - }, - [](GenerationResult &r, std::vector &generation_ids) { - r.m_generation_ids = generation_ids; - }) - .def_readwrite("m_scores", &GenerationResult::m_scores) - .def("__repr__", - [](const GenerationResult &r) -> py::str{ - std::stringstream stream; - stream << ""; - std::string str = stream.str(); - PyObject* py_s = PyUnicode_DecodeUTF8(str.data(), str.length(), "replace"); - return py::reinterpret_steal(py_s); - } - ) - .def("get_generation_ids", - [](GenerationResult &r) -> py::list { - py::list res; - for (auto s: r.m_generation_ids) { - PyObject* py_s = PyUnicode_DecodeUTF8(s.data(), s.length(), "replace"); - res.append(py_s); - } - return res; - }); - - py::enum_(m, "StopCriteria") - .value("EARLY", StopCriteria::EARLY) - 
.value("HEURISTIC", StopCriteria::HEURISTIC) - .value("NEVER", StopCriteria::NEVER) - .export_values(); - - py::class_(m, "GenerationConfig") - .def(py::init<>()) - .def_readwrite("max_new_tokens", &GenerationConfig::max_new_tokens) - .def_readwrite("min_new_tokens", &GenerationConfig::min_new_tokens) - .def_readwrite("max_length", &GenerationConfig::max_length) - .def_readwrite("ignore_eos", &GenerationConfig::ignore_eos) - .def_readwrite("num_groups", &GenerationConfig::num_groups) - .def_readwrite("group_size", &GenerationConfig::group_size) - .def_readwrite("diversity_penalty", &GenerationConfig::diversity_penalty) - .def_readwrite("stop_criteria", &GenerationConfig::stop_criteria) - .def_readwrite("num_return_sequences", &GenerationConfig::num_return_sequences) - .def_readwrite("repetition_penalty", &GenerationConfig::repetition_penalty) - .def_readwrite("presence_penalty", &GenerationConfig::presence_penalty) - .def_readwrite("frequence_penalty", &GenerationConfig::frequence_penalty) - .def_readwrite("length_penalty", &GenerationConfig::length_penalty) - .def_readwrite("no_repeat_ngram_size", &GenerationConfig::no_repeat_ngram_size) - .def_readwrite("temperature", &GenerationConfig::temperature) - .def_readwrite("top_k", &GenerationConfig::top_k) - .def_readwrite("top_p", &GenerationConfig::top_p) - .def_readwrite("do_sample", &GenerationConfig::do_sample) - .def_readwrite("rng_seed", &GenerationConfig::rng_seed) - .def_property_readonly("is_greedy_sampling", &GenerationConfig::is_greedy_sampling) - .def_property_readonly("is_beam_search", &GenerationConfig::is_beam_search); - - py::class_(m, "SchedulerConfig") - .def(py::init<>()) - .def_readwrite("max_num_batched_tokens", &SchedulerConfig::max_num_batched_tokens) - .def_readwrite("num_kv_blocks", &SchedulerConfig::num_kv_blocks) - .def_readwrite("cache_size", &SchedulerConfig::cache_size) - .def_readwrite("block_size", &SchedulerConfig::block_size) - .def_readwrite("cache_size", 
&SchedulerConfig::cache_size) - .def_readwrite("dynamic_split_fuse", &SchedulerConfig::dynamic_split_fuse) - .def_readwrite("max_num_seqs", &SchedulerConfig::max_num_seqs); - - py::class_(m, "ContinuousBatchingPipeline") - .def(py::init()) - .def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer) - .def("get_config", &ContinuousBatchingPipeline::get_config) - .def("add_request", &ContinuousBatchingPipeline::add_request) - .def("step", &ContinuousBatchingPipeline::step) - .def("has_non_finished_requests", &ContinuousBatchingPipeline::has_non_finished_requests) - .def("generate", &ContinuousBatchingPipeline::generate); - - py::class_>(m, "Tokenizer") - .def(py::init()) - .def("encode", &Tokenizer::encode) - .def("decode", &Tokenizer::decode) - .def("get_eos_token_id", &Tokenizer::get_eos_token_id); -} diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/.pytest.ini b/text_generation/causal_lm/cpp/continuous_batching/python/tests/.pytest.ini deleted file mode 100644 index 7bc73fe855..0000000000 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/.pytest.ini +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -[pytest] -addopts = -m precommit \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt b/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt deleted file mode 100644 index 568b6886bf..0000000000 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt +++ /dev/null @@ -1,39 +0,0 @@ ---extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly/ ---extra-index-url https://download.pytorch.org/whl/cpu -# we need at least openvino 2024.2 ---pre -openvino -openvino-tokenizers -# use latest released version once it's available -git+https://github.com/huggingface/optimum-intel.git@main -pytest -pytest-html -# set 'export 
HF_HUB_ENABLE_HF_TRANSFER=1' to benefits from hf_transfer -hf_transfer - -# requirements for specific models -# - hf-tiny-model-private/tiny-random-RoFormerForCausalLM -rjieba -# - baichuan-inc/Baichuan2-7B-Chat -bitsandbytes -# - nomic-ai/gpt4all-falcon -# - Qwen/Qwen-7B -# - Qwen/Qwen-7B-Chat -# - mosaicml/mpt-7b -# - internlm/internlm2-7b -einops -# - Qwen/Qwen-7B -# - Qwen/Qwen-7B-Chat -transformers_stream_generator -# - openbmb/MiniCPM-V-2 -torchvision -# - openbmb/MiniCPM-V-2 -timm -# - Qwen/Qwen-7B -# - Qwen/Qwen-7B-Chat -# - Salesforce/xgen-7b-8k-base -tiktoken -# - microsoft/biogpt -sacremoses -# - TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ -auto-gptq \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py deleted file mode 100644 index 14749e565f..0000000000 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_preemption.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (C) 2018-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import pytest -from dataclasses import dataclass -from py_continuous_batching import GenerationConfig, GenerationResult -from typing import List - -from common import get_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \ - DEFAULT_SCHEDULER_CONFIG, get_scheduler_config, run_test_pipeline, get_models_list, get_beam_search, get_greedy, \ - get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ - get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p -from test_sampling import RandomSamplingTestStruct - -scheduler_params_list = [({"num_kv_blocks": 2, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), - ({"num_kv_blocks": 2, "block_size": 32, "dynamic_split_fuse": 
False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), - ({"num_kv_blocks": 34, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), - ({"num_kv_blocks": 34, "block_size": 32, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search())] -@pytest.mark.parametrize("params", scheduler_params_list) -@pytest.mark.precommit -def test_preemption(tmp_path, params): - run_test_pipeline(tmp_path, "facebook/opt-125m", params[0], params[1]) - -multinomial_params = RandomSamplingTestStruct(generation_config=[get_multinomial_temperature(), - get_multinomial_temperature_and_top_p(), - get_multinomial_temperature_and_top_k()], - prompts=["What is OpenVINO?", - "How are you?", - "Tell me something about Canada?", - ], - ref_texts=[ ["\n\nOpenVINO is a live platform that allows users to create and manage a new library for open source applications.\n\nOpenVINO is"], - [" You're getting much better results from doing this, than you are by not doing this. 
I have a BH and I was so far"], - ["\nI'm from Canada, and I'm from the US, so I'm not sure.\nI think you mean the Canadian version."]]) - - - -# todo: Anastasiia Pnevskaya: fix the test because it is hanging according max_new_tokens = std::numeric_limits::max() -@pytest.mark.parametrize("dynamic_split_fuse", [True, False]) -@pytest.mark.precommit -def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse): - generation_configs = multinomial_params.generation_config - for config in generation_configs: - config.rng_seed = 0 - config.max_new_tokens = 30 - model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) - - model_path : Path = tmp_path / model_id - save_ov_model_from_optimum(model, hf_tokenizer, model_path) - - scheduler_config = get_scheduler_config({"num_kv_blocks": 3, "block_size": 32, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256}) - generate_and_compare_with_reference_text(model_path, multinomial_params.prompts, multinomial_params.ref_texts, generation_configs, scheduler_config) - -multinomial_params_n_seq = RandomSamplingTestStruct(generation_config=[ - get_multinomial_temperature(), - get_multinomial_temperature_and_num_return_sequence(), - get_multinomial_all_parameters(), - ], - prompts=[ - "Artificial intelligence ", - "What is the current", - "Tell me something about UAE?", - ], - ref_texts=[ - [ - "\nI've seen this expression used too many times without making sense.\nAs an AI engineer, and as a scientist, we should all be looking" - ], - [ - ' significance of 3862?\n3829\nWhat is the greatest common divisor of 15 and 7763?\n9\nCalculate the', - ' third derivative of 939*v**3*r**2 + 133*v**3*r**2 + v**3 - 77*', - " climate in the future? Do we have things to catch on fire, and if so does that mean we'll have a new climate before we have" - ], - [ - "\nIt's in the middle of nowhere if you haven’t seen one yet! 
It might be more convenient there than anywhere else 😊 we", - '\nUAE is a country with some great culture that has been living under Islamic oppression for almost 60 years now (including 20 years before) so no', - "\nI don't know anything. I'm not sure what kind this sub wants though... but apparently they are pretty bad at taking selfies too..", - '\nNope, just wanted to say how awesome and beautiful it was when my brother came back from an adventure trip across Asia - very much alive on' - ], - ]) - -@pytest.mark.skip(reason="should be fixed by support of n seqs in preemption") -@pytest.mark.parametrize("dynamic_split_fuse", [True, False]) -@pytest.mark.precommit -def test_preemption_with_multinomial_n_seq(tmp_path, dynamic_split_fuse): - generation_configs = multinomial_params_n_seq.generation_config - for config in generation_configs: - config.rng_seed = 0 - model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) - - model_path : Path = tmp_path / model_id - save_ov_model_from_optimum(model, hf_tokenizer, model_path) - - # needed kv_blocks - 16 (2 blocks per sequence (30 tokens to generated text + prompt (> 2 tokens)) * (1 + 3 + 4) seq ) - scheduler_config = get_scheduler_config({"num_kv_blocks": 8, "block_size": 32, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256}) - generate_and_compare_with_reference_text(model_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, generation_configs, scheduler_config) \ No newline at end of file diff --git a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py b/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py deleted file mode 100644 index c51fb9c61e..0000000000 --- a/text_generation/causal_lm/cpp/continuous_batching/python/tests/test_sampling.py +++ /dev/null @@ -1,205 +0,0 @@ -# Copyright (C) 2018-2024 Intel Corporation -# 
SPDX-License-Identifier: Apache-2.0 -import os -import pytest -import shutil -from dataclasses import dataclass -from pathlib import Path -from py_continuous_batching import GenerationConfig, ContinuousBatchingPipeline -from typing import List - -from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, \ - generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, \ - get_greedy_with_penalties, get_multinomial_temperature, \ - get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, \ - get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG, get_greedy_with_repetition_penalty, \ - get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ - generate_and_compare_with_reference_text, get_greedy, get_greedy_with_min_and_max_tokens, \ - get_beam_search, get_beam_search_min_and_max_tokens, get_multinomial_max_and_min_token, \ - get_multinomial_temperature_and_frequence_penalty, get_multinomial_temperature_and_presence_penalty, \ - generate_and_compare_with_hf, get_multinomial_temperature_and_repetition_penalty, get_scheduler_config - - -@pytest.mark.precommit -@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit"))) -def test_sampling_precommit(tmp_path, model_id): - run_test_pipeline(tmp_path, model_id) - - -@pytest.mark.nightly -@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly"))) -def test_sampling_nightly(tmp_path, model_id): - run_test_pipeline(tmp_path, model_id) - -@pytest.mark.real_models -@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models"))) -def test_real_models(tmp_path, model_id): - run_test_pipeline(tmp_path, model_id) - - -@pytest.mark.precommit -def 
test_eos_beam_search(tmp_path): - ''' - Current test checks that in case of beam search, some generation results - explicitly have EOS token at the end, which is aligned with HF - - Example of current output: - { -1.23264, that I don't know about. - I don't know what you're talking about, but I'm pretty sure it's a Canadian thing. } - ''' - model_id = "facebook/opt-125m" - prompts = ["Tell me something about Canada"] - generation_configs = [get_beam_search()] - scheduler_config = get_scheduler_config() - generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) - - -@pytest.mark.precommit -def test_eos_greedy(tmp_path): - ''' - Current test checks that in case of gready, some generation results - explicitly have EOS token at the end, which is aligned with HF: - - Example of current output: - { a software program } - ''' - model_id = "bigscience/bloomz-560m" - prompts = ["What is OpenVINO?"] - generation_configs = [get_greedy()] - scheduler_config = get_scheduler_config() - generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) - -@pytest.mark.precommit -@pytest.mark.parametrize("generation_config", [get_greedy(), get_greedy_with_min_and_max_tokens(), get_greedy_with_repetition_penalty(), get_beam_search(), get_beam_search_min_and_max_tokens()], - ids=[ - "greedy", - "greedy_with_min_and_max_tokens", - "greedy_with_repetition_penalty", - "beam", - "beam_search_min_and_max_tokens" - ]) -def test_individual_generation_configs_deterministic(tmp_path, generation_config): - prompts = [ - "What is OpenVINO?", - ] - generation_configs = [generation_config] - model_id : str = "facebook/opt-125m" - generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path) - - -@dataclass -class RandomSamplingTestStruct: - generation_config: GenerationConfig - prompts: List[str] - ref_texts: List[List[str]] - -RANDOM_SAMPLING_TEST_CASES = [ - 
RandomSamplingTestStruct(generation_config=get_multinomial_temperature(), - prompts=["What is OpenVINO?"], - ref_texts=[ ["\n\nOpenVINO is a software development platform developed by OpenVINO, a set of technology companies and startups that enables developers to use the most"] ]), - RandomSamplingTestStruct(generation_config=get_multinomial_temperature_and_top_p(), - prompts=["What is OpenVINO?"], - ref_texts=[ ["\nOpenVINO is an online application that allows users to create, test, and analyze their own software using a collection of software packages. The application"] ]), - RandomSamplingTestStruct(generation_config=get_multinomial_temperature_and_top_k(), - prompts=["What is OpenVINO?"], - ref_texts=[ ["\n\nOpenVINO is a software that allows users to create a virtual machine with the ability to create a virtual machine in a virtual environment. Open"] ]), - RandomSamplingTestStruct(generation_config=get_multinomial_temperature_top_p_and_top_k(), - prompts=["What is OpenVINO?"], - ref_texts=[ ["\nOpenVINO is an open source software that allows developers to create, manage, and distribute software. It is an open source project that allows developers"] ]), - RandomSamplingTestStruct(generation_config=get_multinomial_temperature_and_repetition_penalty(), - prompts=["What is OpenVINO?"], - ref_texts=[ ["\nOpen Vino's are a new and improved way to find cheap, fast-investment frozen vegetables that have no waste or calories. They're"] ]), - RandomSamplingTestStruct(generation_config=get_multinomial_temperature_and_num_return_sequence(), - prompts=["What is location of"], - ref_texts=[ - [ - ' your instruments? Are they in an armpit? Is it warm? Are your instruments clear? Are there any cuts and scratches', - ' map and where does the game player base base? 
I tend to like to do all draws on a specific spot (sometimes wide area,', - ' them?\nJust the Mario Maker App, the location is they' - ] - ]), - RandomSamplingTestStruct(generation_config=get_multinomial_all_parameters(), - prompts=["Tell me something about UAE"], - ref_texts=[ - [ - " and how it's not like we're all in the same boat right now lol (or even close) πŸ˜‚πŸ˜! Just curious :) If", - "? You are my country... so what does our military do here?? What am i missing out on?? And why don't u tell us?", - '?\nThe U.S government has been doing quite well with foreign-made aircraft for many years under US administration....and they have very good reasons', - '? I think that is a bit of an anomaly, but you might want to ask yourself this question: Where can some young people from Dubai or Bahrain' - ] - ]), - RandomSamplingTestStruct(generation_config=get_multinomial_temperature_and_presence_penalty(), - prompts=["What is OpenVINO?"], - ref_texts=[ ["\n\nOpenVINO is a software development platform developed by OpenVINO, Inc., which uses a RESTful API for server-side web applications"] ]), - RandomSamplingTestStruct(generation_config=get_multinomial_temperature_and_frequence_penalty(), - prompts=["What is OpenVINO?"], - ref_texts=[ ["\n\nOpenVINO is a software development platform developed by OpenVINO, Inc., which offers the Linux-based platform. OpenVINO's"] ]), - RandomSamplingTestStruct(generation_config=get_greedy_with_penalties(), - prompts=["What is OpenVINO?"], - ref_texts=[ ["\nOpenVINO is a software that allows users to create and manage their own virtual machines. It's designed for use with Windows, Mac OS X"] ]), - RandomSamplingTestStruct(generation_config=get_multinomial_max_and_min_token(), - prompts=["What is OpenVINO?"], - ref_texts=[ - [ - "\nOpenVINO is a Linux distro. It's not as simple as using the Linux distro itself. 
OpenVINO is essentially a dist", - '\nOpenVINO is an open-source open-source software that allows anyone to work with a virtual machine, from a smartphone to an iPhone,', - '\n\nOpenVINO is a social networking tool. OpenVINO is a free virtualization service that works at scale. The tool provides the ability' - ] - ]), -] - - -@pytest.mark.precommit -@pytest.mark.parametrize("test_struct", RANDOM_SAMPLING_TEST_CASES, - ids=["multinomial_temperature", - "multinomial_temperature_and_top_p", - "multinomial_temperature_and_top_k", - "multinomial_temperature_top_p_and_top_k", - "multinomial_temperature_and_repetition_penalty", - "multinomial_temperature_and_num_return_sequence", - "multinomial_all_parameters", - "multinomial_temperature_and_presence_penalty", - "multinomial_temperature_and_frequence_penalty", - "greedy_with_penalties", - "multinomial_max_and_min_token"]) -def test_individual_generation_configs_random(tmp_path, test_struct: RandomSamplingTestStruct): - generation_config = test_struct.generation_config - - prompts = test_struct.prompts - generation_config.rng_seed = 0 - generation_configs = [generation_config] - model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) - - model_path : Path = tmp_path / model_id - save_ov_model_from_optimum(model, hf_tokenizer, model_path) - - generate_and_compare_with_reference_text(model_path, prompts, test_struct.ref_texts, generation_configs, DEFAULT_SCHEDULER_CONFIG) - - - -@pytest.mark.precommit -def test_post_oom_health(tmp_path): - generation_config = get_greedy() - generation_config.ignore_eos = True - generation_config.max_new_tokens = 1000000 - - scheduler_config = get_scheduler_config() - # Low cache size to trigger OOM quickly - scheduler_config.num_kv_blocks = 10 - generation_configs = [generation_config] - model_id : str = "facebook/opt-125m" - model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) - - model_path : Path = tmp_path / 
model_id - save_ov_model_from_optimum(model, hf_tokenizer, model_path) - - pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config) - # First run should return incomplete response - output = pipe.generate(["What is OpenVINO?"], generation_configs) - assert(len(output)) - # Same for the second run, here we want to make sure the cleanup works and we have free blocks after recent OOM - output = pipe.generate(["What is OpenVINO?"], generation_configs) - assert(len(output)) - del pipe - shutil.rmtree(model_path) \ No newline at end of file diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index e5cb83bc4f..b89d05b757 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit e5cb83bc4fd246014f5d4cb0dfb6e2a3d1343dc3 +Subproject commit b89d05b757e45df056b86f1041f6bfeb70d863b6