-
Notifications
You must be signed in to change notification settings - Fork 198
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
164 additions
and
72 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
66 changes: 66 additions & 0 deletions
66
src/cpp/src/speculative_decoding/speculative_decoding_metrics.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
// Copyright (C) 2023-2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
#include <numeric> | ||
|
||
#include "speculative_decoding/speculative_decoding_metrics.hpp" | ||
#include "openvino/runtime/exception.hpp" | ||
|
||
namespace ov::genai { | ||
|
||
// Returns the mean of the recorded acceptance-rate samples.
//
// request_id: request whose samples are averaged. The sentinel -1 averages
//             over every sample of every tracked request.
// Returns 0 when there are no samples to average.
// For a specific request, OPENVINO_ASSERT checks that the request has a
// recorded history.
float SpeculativeDecodingMetrics::get_avg_acceptance_rate(int64_t request_id) {
    float rate_sum = 0.f;
    size_t sample_cnt = 0;
    if (request_id == -1) {
        // Aggregate path: fold every request's samples into one running sum.
        for (const auto& entry : m_acceptance_rate) {
            const auto& samples = entry.second;
            // rate_sum is a float seed, so the accumulation stays in float
            // (an int seed would truncate every partial sum).
            rate_sum = std::accumulate(samples.begin(), samples.end(), rate_sum);
            sample_cnt += samples.size();
        }
    } else {
        OPENVINO_ASSERT(m_acceptance_rate.count(request_id));
        const auto& samples = m_acceptance_rate[request_id];
        rate_sum = std::accumulate(samples.begin(), samples.end(), 0.f);
        sample_cnt = samples.size();
    }
    // Avoid dividing by zero when nothing has been recorded yet.
    if (sample_cnt == 0) {
        return 0.f;
    }
    return rate_sum / static_cast<float>(sample_cnt);
}
|
||
// Appends one acceptance-rate sample to the history of `request_id`.
//
// request_id:      request the sample belongs to.
// acceptance_rate: sample value to record.
void SpeculativeDecodingMetrics::update_acceptance_rate(int64_t request_id, float acceptance_rate) {
    // operator[] default-constructs an empty history for an unseen request,
    // so one lookup covers both the first and every later sample.
    m_acceptance_rate[request_id].push_back(acceptance_rate);
}
|
||
// Reports how many acceptance-rate samples have been recorded for
// `request_id`. OPENVINO_ASSERT checks that the request has a history.
size_t SpeculativeDecodingMetrics::get_iteration_number(int64_t request_id) {
    OPENVINO_ASSERT(m_acceptance_rate.count(request_id));
    const auto& history = m_acceptance_rate.at(request_id);
    const size_t recorded_samples = history.size();
    return recorded_samples;
}
|
||
float SpeculativeDecodingMetrics::get_draft_duration_percentage() { | ||
return (draft_duration / total_duration); | ||
} | ||
|
||
float SpeculativeDecodingMetrics::get_main_duration_percentage() { | ||
return (main_duration / total_duration); | ||
} | ||
|
||
float SpeculativeDecodingMetrics::get_inference_duration_percentage() { | ||
return ((draft_duration + main_duration) / total_duration); | ||
} | ||
|
||
// Placeholder: the result does not depend on `request_id` or on any recorded
// counter yet; the function always reports 0.
float SpeculativeDecodingMetrics::get_draft_accepted_tokens_percentage(int64_t request_id) {
    static_cast<void>(request_id);  // intentionally not consulted
    const float placeholder_percentage = 0.f;
    return placeholder_percentage;
}
|
||
// Adds `num_matches` to the running accepted-token counter of `request_id`.
// The counter starts from zero the first time a request is seen, because
// operator[] value-initializes a missing entry.
void SpeculativeDecodingMetrics::update_draft_accepted_tokens(int64_t request_id, size_t num_matches) {
    size_t& accepted_total = m_draft_accepted_tokens[request_id];
    accepted_total = accepted_total + num_matches;
}
|
||
// Records the generated length of `request_id`, keeping the largest value
// seen so far: a smaller `generated_len` never lowers the stored one.
void SpeculativeDecodingMetrics::set_generated_len(int64_t request_id, size_t generated_len) {
    // operator[] creates a zero entry for an unseen request.
    size_t& longest_so_far = m_generated_len[request_id];
    if (longest_so_far < generated_len) {
        longest_so_far = generated_len;
    }
}
|
||
} |
63 changes: 63 additions & 0 deletions
63
src/cpp/src/speculative_decoding/speculative_decoding_metrics.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
// Copyright (C) 2023-2024 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
#pragma once | ||
|
||
#include <vector> | ||
#include <chrono> | ||
#include <map> | ||
|
||
namespace ov::genai { | ||
|
||
// Accumulating stopwatch on top of std::chrono::steady_clock.
//
// Each start()/end() pair adds the elapsed interval to a running total.
// end() without a matching start() is ignored, so a stray or repeated end()
// cannot add an interval measured from an unset or stale start point.
class Timer {
    // Sum of all completed intervals, in milliseconds.
    double m_total;
    // Time point captured by the most recent start().
    decltype(std::chrono::steady_clock::now()) m_start;
    // True between start() and the matching end().
    bool m_running;

public:
    Timer() :
        m_total(0.),
        m_running(false) {
    }

    // Begins (or restarts) the current interval.
    void start() {
        m_start = std::chrono::steady_clock::now();
        m_running = true;
    }

    // Closes the current interval and adds its length to the total.
    void end() {
        if (!m_running) {
            return;
        }
        const auto stop = std::chrono::steady_clock::now();
        m_total += std::chrono::duration<double, std::milli>(stop - m_start).count();
        m_running = false;
    }

    // Returns the accumulated total divided by 1000.
    // NOTE(review): m_total is accumulated in milliseconds, so this value is
    // in seconds despite the `_ms` suffix. The scale is kept unchanged because
    // existing callers may rely on it; confirm and rename or rescale together
    // with the call sites.
    float get_duration_ms() const {
        return static_cast<float>(m_total / 1000.);
    }
};
|
||
// Collects speculative-decoding statistics, keyed by request id, together
// with three publicly writable duration counters.
class SpeculativeDecodingMetrics {
    // Acceptance-rate samples recorded for a single request.
    using AcceptanceRate = std::vector<float>;

    // request_id -> acceptance-rate samples
    std::map<int64_t, AcceptanceRate> m_acceptance_rate;
    // request_id -> accumulated accepted-token count
    std::map<int64_t, size_t> m_draft_accepted_tokens;
    // request_id -> generated length
    std::map<int64_t, size_t> m_generated_len;

public:
    // Duration counters, written directly by the owner of this object.
    // The getters below report them relative to total_duration.
    float draft_duration = 0;
    float main_duration = 0;
    float total_duration = 0;

    // Acceptance-rate samples.
    void update_acceptance_rate(int64_t request_id, float acceptance_rate);
    float get_avg_acceptance_rate(int64_t request_id);
    size_t get_iteration_number(int64_t request_id);

    // Accepted draft tokens and generated length.
    void update_draft_accepted_tokens(int64_t request_id, size_t num_matches);
    float get_draft_accepted_tokens_percentage(int64_t request_id);
    void set_generated_len(int64_t request_id, size_t generated_len);

    // Shares of total_duration.
    float get_draft_duration_percentage();
    float get_main_duration_percentage();
    float get_inference_duration_percentage();
};
|
||
} |