From 50a33a09ffc65b237815766eb0a194108f4b8e02 Mon Sep 17 00:00:00 2001 From: mzegla Date: Thu, 1 Aug 2024 11:23:55 +0200 Subject: [PATCH 1/4] introduce finish reason --- .../openvino/genai/generation_handle.hpp | 7 +++++++ src/cpp/src/sequence_group.hpp | 19 +++++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/cpp/include/openvino/genai/generation_handle.hpp b/src/cpp/include/openvino/genai/generation_handle.hpp index 8d00ae0e9b..c18f11f3e8 100644 --- a/src/cpp/include/openvino/genai/generation_handle.hpp +++ b/src/cpp/include/openvino/genai/generation_handle.hpp @@ -32,6 +32,12 @@ struct EncodedGenerationResult { GenerationStatus m_status = GenerationStatus::RUNNING; }; +enum class GenerationFinishReason { + NONE = 0, // Default value, when generation is not yet finished + STOP = 1, // Generation finished naturally, by reaching end of sequence token + LENGTH = 2 // Generation finished by reaching max_new_tokens limit +}; + struct GenerationResult { // request ID - obsolete when handle API is approved as handle will connect results with prompts. uint64_t m_request_id; @@ -49,6 +55,7 @@ struct GenerationResult { struct GenerationOutput { std::vector generated_token_ids; float score; + GenerationFinishReason finish_reason; }; using GenerationOutputs = std::unordered_map; diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index d5b9506b2c..db227a3436 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -33,6 +33,7 @@ class Sequence { uint64_t m_grouped_id; uint64_t m_id = _get_next_global_sequence_id(); SequenceStatus m_status = SequenceStatus::RUNNING; + GenerationFinishReason m_finish_reason = GenerationFinishReason::NONE; float m_cumulative_log_prob = 0.0f; public: @@ -91,6 +92,14 @@ class Sequence { m_status = status; } + GenerationFinishReason get_finish_reason() const { + return m_finish_reason; + } + + void set_finish_reason(GenerationFinishReason finish_reason) { + m_finish_reason = finish_reason; + } + // appends new tokens to a generated part void append_token(int64_t token_id, float log_prob) { m_cumulative_log_prob += log_prob; @@ -205,6 +214,12 @@ class SequenceGroup { running_sequence->get_generated_ids().back() == m_sampling_params.eos_token_id && !m_sampling_params.ignore_eos) { // stop sequence by max_new_tokens or EOS token running_sequence->set_status(SequenceStatus::FINISHED); + + if (running_sequence->get_generated_ids().back() == m_sampling_params.eos_token_id && !m_sampling_params.ignore_eos) + running_sequence->set_finish_reason(GenerationFinishReason::STOP); + else if (m_sampling_params.max_new_tokens == generated_len) + running_sequence->set_finish_reason(GenerationFinishReason::LENGTH); + dropped_seq_ids.push_back(running_sequence->get_id()); } } @@ -451,7 +466,8 @@ class SequenceGroup { for (auto& sequence: m_sequences) { GenerationOutput output; output.generated_token_ids = sequence->get_generated_ids(); - output.score = sequence->get_beam_search_score(m_sampling_params); + output.score = m_sampling_params.is_beam_search() ? sequence->get_beam_search_score(m_sampling_params) : sequence->get_cumulative_log_probs(); + output.finish_reason = sequence->get_finish_reason(); outputs.emplace(sequence->get_grouped_id(), output); } m_generation_stream->push(outputs); @@ -459,7 +475,6 @@ class SequenceGroup { void push_partial_outputs() { GenerationOutputs outputs; - // TODO: support streamimg for n seqs for (auto& sequence : m_sequences) { // todo: check seq.is_finished() to generate without several // or is it ok to use padding? From 21e680e53203a0e1f269b58692ce20b465d2b2a6 Mon Sep 17 00:00:00 2001 From: mzegla Date: Thu, 1 Aug 2024 16:04:40 +0200 Subject: [PATCH 2/4] set reason for partial push also --- src/cpp/src/generation_handle.cpp | 1 + src/cpp/src/sequence_group.hpp | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/generation_handle.cpp b/src/cpp/src/generation_handle.cpp index 26cc12604f..f8e88bfecb 100644 --- a/src/cpp/src/generation_handle.cpp +++ b/src/cpp/src/generation_handle.cpp @@ -36,6 +36,7 @@ void add_partial_result(std::unordered_map& partial_ } else { partial_result_iter->second.generated_token_ids.push_back(iteration_result.second.generated_token_ids[0]); partial_result_iter->second.score = iteration_result.second.score; + partial_result_iter->second.finish_reason = iteration_result.second.finish_reason; } } } diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index db227a3436..a8fb528554 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -111,6 +111,7 @@ class Sequence { OPENVINO_ASSERT(m_generated_ids.size()); output.score = get_cumulative_log_probs(); output.generated_token_ids = std::vector {m_generated_ids.back()}; + output.finish_reason = get_finish_reason(); return output; } @@ -215,10 +216,11 @@ class SequenceGroup { // stop sequence by max_new_tokens or EOS token running_sequence->set_status(SequenceStatus::FINISHED); - if (running_sequence->get_generated_ids().back() == m_sampling_params.eos_token_id && !m_sampling_params.ignore_eos) + if (running_sequence->get_generated_ids().back() == m_sampling_params.eos_token_id && !m_sampling_params.ignore_eos) { running_sequence->set_finish_reason(GenerationFinishReason::STOP); - else if (m_sampling_params.max_new_tokens == generated_len) + } else if (m_sampling_params.max_new_tokens == generated_len) { running_sequence->set_finish_reason(GenerationFinishReason::LENGTH); + } dropped_seq_ids.push_back(running_sequence->get_id()); } From 5b3c1857c631c11de545e50f1ed2f5141f952d27 Mon Sep 17 00:00:00 2001 From: mzegla Date: Thu, 1 Aug 2024 16:51:12 +0200 Subject: [PATCH 3/4] beam search --- src/cpp/src/sampler.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 6390fc8725..2ce87531ff 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -193,6 +193,8 @@ class GroupBeamSearcher { // mark current sequence as finished beam.m_sequence->set_status(SequenceStatus::FINISHED); + // Setting length since this function is used when sequence generated tokens number reaches max_new_tokens + beam.m_sequence->set_finish_reason(GenerationFinishReason::LENGTH); // we also need to drop add ongoing / forked sequences from scheduler sampler_output.m_dropped_sequences.push_back(sequence_id); } @@ -432,6 +434,8 @@ void GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutp Sequence::Ptr forked_sequence = m_sequence_group->fork_sequence(candidate.m_sequence); // and finish immidiately forked_sequence->set_status(SequenceStatus::FINISHED); + // Setting length since this function is used when sequence generated eos token + forked_sequence->set_finish_reason(GenerationFinishReason::STOP); // TODO: make it more simplier // currently, we finish sequence and then fork it in current code From 14c7fa6de09a2779314f52ae9780faae34211767 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mi=C5=82osz=20=C5=BBeglarski?= Date: Fri, 2 Aug 2024 11:16:10 +0200 Subject: [PATCH 4/4] Update src/cpp/src/sampler.hpp --- src/cpp/src/sampler.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 2ce87531ff..7aa17ad1c6 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -434,7 +434,7 @@ void GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutp Sequence::Ptr forked_sequence = m_sequence_group->fork_sequence(candidate.m_sequence); // and finish immidiately forked_sequence->set_status(SequenceStatus::FINISHED); - // Setting length since this function is used when sequence generated eos token + // Setting stop since this function is used when sequence generated eos token forked_sequence->set_finish_reason(GenerationFinishReason::STOP); // TODO: make it more simplier