Skip to content

Commit

Permalink
Speculative decoding
Browse files Browse the repository at this point in the history
  • Loading branch information
iefode committed Oct 13, 2024
1 parent 0e40ec5 commit cc458e2
Show file tree
Hide file tree
Showing 5 changed files with 164 additions and 72 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,9 @@ ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::get_pr
min_candidate_len = std::numeric_limits<size_t>::max();
for (const auto& running_sequence : running_sequences) {
const auto& sequence_id = running_sequence->get_grouped_id();
OPENVINO_ASSERT(candidates.count(sequence_id));
if (!candidates.count(sequence_id)) {
continue;
}

const auto& candidate_sequence = candidates.at(sequence_id);

Expand Down Expand Up @@ -209,7 +211,9 @@ ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::update
std::tie(min_generated_tokens, min_candidate_len) = get_prefix_len(running_sequences, candidates);

for (auto& running_sequence : running_sequences) {
OPENVINO_ASSERT(candidates.count(running_sequence->get_grouped_id()));
if (!candidates.count(running_sequence->get_grouped_id())) {
continue;
}

result.removed_tokens_cnt = remove_tokens_from_sequence(running_sequence, min_generated_tokens, logit_processor);

Expand Down
52 changes: 28 additions & 24 deletions src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,54 +95,54 @@ bool ContinuousBatchingPipeline::SpeculativeDecodingImpl::has_non_finished_reque

void ContinuousBatchingPipeline::SpeculativeDecodingImpl::step() {
// generate candidates by draft model
Timer draft_timer;
draft_timer.start();
m_draft_pipeline->multistep();
draft_timer.end();
m_sd_metrics.draft_duration += draft_timer.get_duration_ms();

// to generate num_matches statistic
std::map<int64_t, ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::UpdateRequestResult> update_sequence_info;
// put candidates to model KV cache
auto draft_generated_requests = m_draft_pipeline->get_generated_requests();
for (const auto& candidate : m_draft_pipeline->get_generated_requests()) {
auto update_result = m_main_pipeline->update_request(candidate.first, candidate.second, false);
update_sequence_info.insert({{candidate.first, update_result}});
}

Timer main_timer;
main_timer.start();
m_main_pipeline->step();
main_timer.end();
m_sd_metrics.main_duration += main_timer.get_duration_ms();

m_main_pipeline->align_all_sequence_len_in_request();

for (const auto& checked_sequence : m_main_pipeline->get_generated_requests()) {
auto main_generated_requests = m_main_pipeline->get_generated_requests();
for (const auto& checked_sequence : main_generated_requests) {
auto update_result = m_draft_pipeline->update_request(checked_sequence.first, checked_sequence.second, true);
update_sequence_info[checked_sequence.first].removed_tokens_cnt = update_result.removed_tokens_cnt;
}

// update to left generation len
std::vector<int64_t> draft_requests_to_drop;
for (auto& request : m_left_gen_len) {
auto updated_seq_info = update_sequence_info[request.first];
if (updated_seq_info.inserted_tokens_cnt == 0 && updated_seq_info.removed_tokens_cnt == 0) {
draft_requests_to_drop.push_back(request.first);
// finish draft request if the generation was completed
for (const auto& draft_request : draft_generated_requests) {
auto request_id = draft_request.first;
if (!main_generated_requests.count(request_id)) {
m_draft_pipeline->finish_request(request_id);
}
OPENVINO_ASSERT(updated_seq_info.inserted_tokens_cnt >= updated_seq_info.removed_tokens_cnt);
auto updated_seq_info = update_sequence_info[request_id];
float acceptance_rate = 1 - static_cast<float>(updated_seq_info.removed_tokens_cnt) / updated_seq_info.inserted_tokens_cnt;
m_sd_metrics.update_acceptance_rate(request.first, acceptance_rate);
auto num_matches = updated_seq_info.inserted_tokens_cnt - updated_seq_info.removed_tokens_cnt;
// std::cout << "num_matches: " << (updated_seq_info.inserted_tokens_cnt - updated_seq_info.removed_tokens_cnt) << " left_len: " << request.second << std::endl;
// std::cout << "insert: " << updated_seq_info.inserted_tokens_cnt << " remove: " << updated_seq_info.removed_tokens_cnt << std::endl;
request.second--;
num_matches = std::min(num_matches, request.second);
request.second -= (num_matches);
if (request.second == 0) {
draft_requests_to_drop.push_back(request.first);
}
}
for (const auto& request_id : draft_requests_to_drop) {
m_draft_pipeline->finish_request(request_id);
m_left_gen_len.erase(request_id);
m_sd_metrics.update_acceptance_rate(request_id, acceptance_rate);
m_sd_metrics.update_draft_accepted_tokens(request_id, (updated_seq_info.inserted_tokens_cnt - updated_seq_info.removed_tokens_cnt));
}
}

std::vector<EncodedGenerationResult>
ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<ov::Tensor>& input_ids,
const std::vector<GenerationConfig>& sampling_params,
const StreamerVariant& streamer) {
const StreamerVariant& streamer) {
Timer timer;
timer.start();
OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. Use ContinuousBatchingPipeline::add_request");
OPENVINO_ASSERT(input_ids.size() == sampling_params.size());
const std::shared_ptr<StreamerBase>& streamer_ptr = std::visit(overloaded{
Expand All @@ -163,12 +163,13 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<
main_generations.push_back(m_main_pipeline->add_request(request_id, input_ids[request_id], sampling_params[request_id]));

auto draft_sampling_params = sampling_params[request_id];
// set parameters so that draft generation is not stopped before the same request is stopped in the main pipeline
draft_sampling_params.max_new_tokens = SIZE_MAX - 1;
draft_sampling_params.min_new_tokens = SIZE_MAX - 1;
draft_sampling_params.ignore_eos = true;
draft_generations.push_back(m_draft_pipeline->add_request(request_id, input_ids[request_id], draft_sampling_params));
// decrease generation len to generate last token by main model
m_left_gen_len.insert({ request_id, sampling_params[request_id].max_new_tokens - 1 });
// m_left_gen_len.insert({ request_id, sampling_params[request_id].max_new_tokens - 1 });
}

std::vector<EncodedGenerationResult> results;
Expand Down Expand Up @@ -201,6 +202,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<
auto num_outputs = std::min(sampling_params[generation_idx].num_return_sequences, generation_outputs.size());
for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) {
const auto& generation_output = generation_outputs[generation_output_idx];
m_sd_metrics.set_generated_len(generation_idx, generation_outputs[generation_output_idx].generated_ids.size());
result.m_generation_ids.push_back(std::move(generation_output.generated_ids));
result.m_scores.push_back(generation_output.score);
}
Expand All @@ -209,6 +211,8 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<
}

OPENVINO_ASSERT(results.size() == input_ids.size());
timer.end();
m_sd_metrics.total_duration = timer.get_duration_ms();
return results;
}
}
47 changes: 1 addition & 46 deletions src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,59 +6,14 @@
#include "openvino/genai/continuous_batching_pipeline.hpp"
#include "continuous_batching_impl.hpp"
#include "continuous_batching_for_speculative_decoding_impl.hpp"
#include "speculative_decoding/speculative_decoding_metrics.hpp"

namespace ov::genai {

class SpeculativeDecodingMetrics {
// percent of draft model using time + draft model gen tokens
using AcceptanceRate = std::vector<float>;
// { request_id, acceptance_rate }
std::map<int64_t, AcceptanceRate> m_acceptance_rate;

public:
float get_avg_acceptance_rate(int64_t request_id = 1) {
float avg_acceptance_rate = 0.f;
if (request_id != -1) {
size_t total_iteration_cnt = 0;
for (const auto& acceptance_rate : m_acceptance_rate) {
avg_acceptance_rate += std::accumulate(acceptance_rate.second.begin(), acceptance_rate.second.end(), 0);
total_iteration_cnt += acceptance_rate.second.size();
}
avg_acceptance_rate /= total_iteration_cnt;
} else {
OPENVINO_ASSERT(m_acceptance_rate.count(request_id));
const auto& acceptance_rate = m_acceptance_rate[request_id];
avg_acceptance_rate = std::accumulate(acceptance_rate.begin(), acceptance_rate.end(), 0);
avg_acceptance_rate /= acceptance_rate.size();
}
return avg_acceptance_rate;
}

void update_acceptance_rate(int64_t request_id, float acceptance_rate) {
if (m_acceptance_rate.count(request_id)) {
m_acceptance_rate[request_id].push_back(acceptance_rate);
} else {
m_acceptance_rate.insert({{ request_id, std::vector<float>{acceptance_rate} }});
}
}

size_t get_iteration_number(int64_t request_id) {
OPENVINO_ASSERT(m_acceptance_rate.count(request_id));
return m_acceptance_rate[request_id].size();
}

};

class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBatchingPipeline::ImplInterface {
protected:
std::shared_ptr<ContinuousBatchingForSpeculativeDecodingImpl> m_main_pipeline, m_draft_pipeline;
// left generation length per request {request_id, len}
std::map<int64_t, size_t> m_left_gen_len;
SpeculativeDecodingMetrics m_sd_metrics;

bool m_first_infer = false;

void first_step();

public:
SpeculativeDecodingImpl(const std::string& main_models_path,
Expand Down
66 changes: 66 additions & 0 deletions src/cpp/src/speculative_decoding/speculative_decoding_metrics.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <numeric>

#include "speculative_decoding/speculative_decoding_metrics.hpp"
#include "openvino/runtime/exception.hpp"

namespace ov::genai {

float SpeculativeDecodingMetrics::get_avg_acceptance_rate(int64_t request_id) {
    // Mean per-iteration acceptance rate, either across all tracked requests
    // (request_id == -1) or for one specific request.
    float avg_acceptance_rate = 0.f;
    if (request_id == -1) {
        // Aggregate branch: average over every request's recorded history.
        // NOTE: the condition was previously inverted (`!= -1`), which sent a
        // concrete request_id into this all-requests branch and `-1` into the
        // per-request branch, where the assert would always fail.
        size_t total_iteration_cnt = 0;
        for (const auto& request_rates : m_acceptance_rate) {
            // Init value must be 0.f: an int init makes std::accumulate sum in
            // int and truncate each fractional rate to 0.
            avg_acceptance_rate += std::accumulate(request_rates.second.begin(), request_rates.second.end(), 0.f);
            total_iteration_cnt += request_rates.second.size();
        }
        if (total_iteration_cnt > 0) {
            avg_acceptance_rate /= total_iteration_cnt;
        }
    } else {
        OPENVINO_ASSERT(m_acceptance_rate.count(request_id));
        const auto& acceptance_rate = m_acceptance_rate[request_id];
        avg_acceptance_rate = std::accumulate(acceptance_rate.begin(), acceptance_rate.end(), 0.f);
        if (!acceptance_rate.empty()) {
            avg_acceptance_rate /= acceptance_rate.size();
        }
    }
    return avg_acceptance_rate;
}

void SpeculativeDecodingMetrics::update_acceptance_rate(int64_t request_id, float acceptance_rate) {
    // Append this iteration's rate, creating the per-request history on first use.
    auto it = m_acceptance_rate.find(request_id);
    if (it == m_acceptance_rate.end()) {
        m_acceptance_rate.insert({ request_id, std::vector<float>{ acceptance_rate } });
    } else {
        it->second.push_back(acceptance_rate);
    }
}

size_t SpeculativeDecodingMetrics::get_iteration_number(int64_t request_id) {
    // Number of acceptance-rate samples recorded for this request.
    const auto it = m_acceptance_rate.find(request_id);
    OPENVINO_ASSERT(it != m_acceptance_rate.end());
    return it->second.size();
}

float SpeculativeDecodingMetrics::get_draft_duration_percentage() {
    // Share of total generate() time spent in the draft model, in percent.
    // Guard: total_duration stays 0 until generate() has completed once.
    if (total_duration == 0.f) {
        return 0.f;
    }
    return draft_duration / total_duration * 100;
}

float SpeculativeDecodingMetrics::get_main_duration_percentage() {
    // Share of total generate() time spent in the main model, in percent.
    if (total_duration == 0.f) {
        return 0.f;
    }
    return main_duration / total_duration * 100;
}

float SpeculativeDecodingMetrics::get_inference_duration_percentage() {
    // Combined draft + main inference share of total time, in percent.
    // Previously these returned raw 0-1 ratios despite the "percentage" names.
    if (total_duration == 0.f) {
        return 0.f;
    }
    return (draft_duration + main_duration) / total_duration * 100;
}

float SpeculativeDecodingMetrics::get_draft_accepted_tokens_percentage(int64_t request_id) {
return 0.f;
}

void SpeculativeDecodingMetrics::update_draft_accepted_tokens(int64_t request_id, size_t num_matches) {
    // operator[] zero-initializes the counter on the first update for a request.
    auto& accepted_cnt = m_draft_accepted_tokens[request_id];
    accepted_cnt += num_matches;
}

void SpeculativeDecodingMetrics::set_generated_len(int64_t request_id, size_t generated_len) {
    // Keep the largest generated length reported for this request so far.
    auto& stored_len = m_generated_len[request_id];
    if (stored_len < generated_len) {
        stored_len = generated_len;
    }
}

}
63 changes: 63 additions & 0 deletions src/cpp/src/speculative_decoding/speculative_decoding_metrics.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <vector>
#include <chrono>
#include <map>

namespace ov::genai {

class Timer {
    // Accumulated elapsed time across start()/end() cycles, in milliseconds.
    double m_total;
    decltype(std::chrono::steady_clock::now()) m_start;

public:
    Timer() :
        m_total(0.) {
    }

    // Begins (or resumes) a measurement interval.
    void start() {
        m_start = std::chrono::steady_clock::now();
    }

    // Ends the current interval and adds its length to the running total.
    void end() {
        const auto now = std::chrono::steady_clock::now();
        m_total += std::chrono::duration<double, std::milli>(now - m_start).count();
    }

    // Total accumulated time in milliseconds.
    // NOTE: m_total is already milliseconds (duration<double, std::milli>), so
    // no scaling is needed; the previous "/ 1000." actually returned seconds,
    // contradicting the function name.
    float get_duration_ms() {
        return static_cast<float>(m_total);
    }
};

// Collects per-request speculative-decoding statistics: draft-token acceptance
// rates, accepted-token counts, generated lengths, and stage durations.
class SpeculativeDecodingMetrics {
// One acceptance-rate sample per iteration for a single request.
using AcceptanceRate = std::vector<float>;
// { request_id, acceptance-rate history }
std::map<int64_t, AcceptanceRate> m_acceptance_rate;

// { request_id, running total of draft tokens accepted by the main model }
std::map<int64_t, size_t> m_draft_accepted_tokens;
// { request_id, maximum generated sequence length observed }
std::map<int64_t, size_t> m_generated_len;

public:
// Accumulated wall-clock durations, in the unit of Timer::get_duration_ms().
float draft_duration = 0, main_duration = 0, total_duration = 0;

// Mean acceptance rate for one request, or across all requests when request_id == -1.
float get_avg_acceptance_rate(int64_t request_id);
// Records one iteration's acceptance rate for the request.
void update_acceptance_rate(int64_t request_id, float acceptance_rate);

// Accepted draft tokens relative to the request's generated length.
float get_draft_accepted_tokens_percentage(int64_t request_id);
// Adds num_matches accepted draft tokens to the request's running total.
void update_draft_accepted_tokens(int64_t request_id, size_t num_matches);

// Stores the generated length, keeping the maximum seen for the request.
void set_generated_len(int64_t request_id, size_t generated_len);

// Number of recorded iterations (acceptance-rate samples) for the request.
size_t get_iteration_number(int64_t request_id);

// Duration shares relative to total_duration.
float get_draft_duration_percentage();
float get_main_duration_percentage();
float get_inference_duration_percentage();

};

}

0 comments on commit cc458e2

Please sign in to comment.