diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index a89957850f..aac55015c0 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2024-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "llm_pipeline_static.hpp" @@ -635,6 +635,31 @@ void copy_columns_by_row_chunks(const ov::Tensor& src, ov::Tensor& dst) { } } +void stream_generated_tokens(std::shared_ptr<StreamerBase> streamer_ptr, + GenerationHandle& handle) { + if (streamer_ptr && handle->can_read()) { + std::unordered_map<uint64_t, GenerationOutput> token = handle->back(); + for (const auto& gen_token : token.begin()->second.generated_ids) { + if (streamer_ptr->put(gen_token)) { + handle->drop(); + break; + } + } + } +} + +int64_t get_last_token(SequenceGroup::Ptr sequence_group) { + const auto running_sequences = sequence_group->get_running_sequences(); + OPENVINO_ASSERT(running_sequences.size() == 1u); + const auto sequence = running_sequences.front(); + + size_t num_scheduled_tokens = sequence_group->get_num_scheduled_tokens(); + OPENVINO_ASSERT(num_scheduled_tokens == 1u); + + const auto num_processed_tokens = sequence_group->get_num_processed_tokens(); + return sequence->get_generated_ids()[num_processed_tokens - sequence_group->get_prompt_len()]; +} + } // anonymous namespace namespace ov { @@ -945,31 +970,6 @@ DecodedResults StaticLLMPipeline::generate( return decoded_results; } -void stream_generated_tokens(std::shared_ptr<StreamerBase> streamer_ptr, - GenerationHandle& handle) { - if (streamer_ptr && handle->can_read()) { - std::unordered_map<uint64_t, GenerationOutput> token = handle->back(); - for (const auto& gen_token : token.begin()->second.generated_ids) { - if (streamer_ptr->put(gen_token)) { - handle->drop(); - break; - } - } - } -} - -int64_t get_last_token(SequenceGroup::Ptr sequence_group) { - const auto running_sequences = sequence_group->get_running_sequences(); - OPENVINO_ASSERT(running_sequences.size() 
== 1u); - const auto sequence = running_sequences.front(); - - size_t num_scheduled_tokens = sequence_group->get_num_scheduled_tokens(); - OPENVINO_ASSERT(num_scheduled_tokens == 1u); - - const auto num_processed_tokens = sequence_group->get_num_processed_tokens(); - return sequence->get_generated_ids()[num_processed_tokens - sequence_group->get_prompt_len()]; -} - EncodedResults StaticLLMPipeline::generate( const EncodedInputs& inputs, OptionalGenerationConfig generation_config, diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp index 13d7752e2e..8dc7ef49a1 100644 --- a/src/cpp/src/llm_pipeline_static.hpp +++ b/src/cpp/src/llm_pipeline_static.hpp @@ -1,10 +1,9 @@ -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2024-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include -#include #include "llm_pipeline_base.hpp" #include "sampler.hpp" diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index e2a3238676..73a406c695 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2023-2024 Intel Corporation +// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "sampler.hpp" diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 271d209f75..df0c406749 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -1,5 +1,5 @@ -// Copyright (C) 2023-2024 Intel Corporation +// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index 2df8a1f200..6a17cf59b8 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2023-2024 Intel Corporation +// Copyright (C) 2023-2025 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once diff --git a/tests/python_tests/test_llm_pipeline_static.py 
b/tests/python_tests/test_llm_pipeline_static.py index 326386fe31..10e7255309 100644 --- a/tests/python_tests/test_llm_pipeline_static.py +++ b/tests/python_tests/test_llm_pipeline_static.py @@ -99,9 +99,9 @@ def test_generation_compare_with_stateful(generation_config): @pytest.mark.nightly @pytest.mark.parametrize("generation_config", generation_configs) def test_multinomial_sampling(generation_config): - # Multinomial sampling is highly sensitive to raw logits values. For fair comparison, - # a reference implementation producing identical logits (e.g., from StaticLLMPipeline) - # would be necessary. However, the CPU in StatefulPipeline and StaticLLMPipeline may apply + # Multinomial sampling is highly sensitive to raw logits values. For fair comparison, + # a reference implementation producing identical logits (e.g., from StaticLLMPipeline) + # would be necessary. However, the CPU in StatefulPipeline and StaticLLMPipeline may apply # different optimizations due to differences in provided topologies, leading to slight # variations in raw logits. Therefore, there is no reliable reference for validation, # so only ensure that no exceptions are raised. @@ -163,13 +163,13 @@ def test_batch_raise_error(): # TODO: For the further sampling support -generation_config = [ +generation_configs = [ get_beam_search(), # NB: Only num_return_sequences=1 is supported! get_multinomial_all_parameters() ] @pytest.mark.skipif(sys.platform in ["darwin", "linux"], reason="Not supposed to work on mac. Segfault on linux CI") -@pytest.mark.parametrize("generation_config", generation_config) +@pytest.mark.parametrize("generation_config", generation_configs) @pytest.mark.precommit @pytest.mark.nightly def test_unsupported_sampling_raise_error(generation_config):