Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CB] SpeculativeDecoding impl C++ #907

Merged
merged 48 commits into from
Oct 18, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
ae67be5
Init version
iefode Sep 30, 2024
1d7d31d
Dirty version
iefode Oct 1, 2024
8204128
Extend accuracy sample
iefode Oct 3, 2024
380309c
Update generation config for speculative decoding, extend step in CBI…
iefode Oct 8, 2024
0d812ac
Merge remote-tracking branch 'upstream/master' into sd_pipe_impl
iefode Oct 8, 2024
93143e5
Metric + fix win build?
iefode Oct 8, 2024
9724723
Update config by model_desc
iefode Oct 9, 2024
ed2f807
tests
iefode Oct 9, 2024
d0ae653
Split cb_sd to separated sample, class and sir
iefode Oct 9, 2024
acd65c5
Remove extra functions
iefode Oct 9, 2024
fc286a3
small update
iefode Oct 9, 2024
99e4695
Update sample
iefode Oct 10, 2024
ab4fb1c
if (!sequence->get_generated_len()) {
iefode Oct 10, 2024
99e0ca9
Several req with SD
iefode Oct 12, 2024
b53ea95
multiseq
iefode Oct 13, 2024
9bcf115
1 seq
iefode Oct 13, 2024
0e40ec5
multiseq
iefode Oct 13, 2024
cc458e2
Speculative decoding
iefode Oct 13, 2024
5c31966
CI
iefode Oct 13, 2024
81fe9b1
Fix typos
iefode Oct 13, 2024
6b7c9a5
Scheduler one more
iefode Oct 13, 2024
867d030
Split some classes to hpp + ci fix
iefode Oct 14, 2024
a0665af
Different configs for sd + tests
iefode Oct 14, 2024
3e64fc7
Win link
iefode Oct 14, 2024
dd6db79
schedul
iefode Oct 14, 2024
054c5dd
Merge remote-tracking branch 'upstream/master' into sd_pipe_impl
iefode Oct 14, 2024
dc25c6f
Merge remote-tracking branch 'upstream/master' into sd_pipe_impl
iefode Oct 14, 2024
b6700e5
Merge conflict
iefode Oct 14, 2024
60ce854
fix test linking
iefode Oct 15, 2024
3729001
enable cb
iefode Oct 15, 2024
51d2077
review. part 1
iefode Oct 15, 2024
b667a78
Apply review
iefode Oct 15, 2024
0afd2c0
remove old sample + comments
iefode Oct 15, 2024
c2a88e3
Fixed tests + optimization of multiseq
iefode Oct 16, 2024
f916821
Merge remote-tracking branch 'upstream/master' into sd_pipe_impl
iefode Oct 16, 2024
16b39e9
remove extra
iefode Oct 16, 2024
4a14de9
Fix compilation
iefode Oct 16, 2024
43eac4c
Revert draft_model to default constructor
iefode Oct 16, 2024
699f14b
Sync main and draft request generation
iefode Oct 16, 2024
4fe15d6
ci
iefode Oct 16, 2024
3b4a269
remove streaming to file
iefode Oct 16, 2024
b7cf039
Merge remote-tracking branch 'upstream/master' into sd_pipe_impl
iefode Oct 16, 2024
dbb8c07
Fix for CI
iefode Oct 16, 2024
330aa72
Merge branch 'master' into sd_pipe_impl
ilya-lavrenov Oct 17, 2024
d1f203a
Merge remote-tracking branch 'upstream/master' into sd_pipe_impl
iefode Oct 17, 2024
d33c8b8
Merge branch 'sd_pipe_impl' of github.com:iefode/openvino.genai into …
iefode Oct 17, 2024
2f6a2aa
Merge remote-tracking branch 'upstream/master' into sd_pipe_impl
iefode Oct 17, 2024
e8cc09a
Merge branch 'master' into sd_pipe_impl
ilya-lavrenov Oct 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ struct PipelineMetrics {
class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
class ImplInterface;
class ContinuousBatchingImpl;
class SpeculativeDecodingImpl;
iefode marked this conversation as resolved.
Show resolved Hide resolved

friend class SpeculativeDecodingImpl;

std::shared_ptr<ImplInterface> m_impl;

public:
Expand Down
7 changes: 7 additions & 0 deletions src/cpp/include/openvino/genai/llm_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@ using StringInputs = std::variant<std::string, std::vector<std::string>>;
*/
static constexpr ov::Property<SchedulerConfig> scheduler_config{"scheduler_config"};

/**
 * @brief draft_model_path property serves to activate speculative decoding in the continuous batching pipeline.
 * Set it in plugin_config to the filesystem path of the draft model directory and create the
 * LLMPipeline / ContinuousBatchingPipeline instance with this config.
 */
static constexpr ov::Property<std::string> draft_model_path{"draft_model_path"};
ilya-lavrenov marked this conversation as resolved.
Show resolved Hide resolved

/**
* @brief Structure to store resulting batched tokens and scores for each batch sequence.
* The first num_return_sequences elements correspond to the first batch element.
Expand Down
4 changes: 3 additions & 1 deletion src/cpp/src/continuous_batching_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl(
const Tokenizer& tokenizer,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& plugin_config) {
const ov::AnyMap& plugin_config,
bool is_validation_mode_enabled) {
m_tokenizer = tokenizer;
m_is_validation_mode_enabled = is_validation_mode_enabled;
ov::Core core;

// The model can be compiled for GPU as well
Expand Down
10 changes: 7 additions & 3 deletions src/cpp/src/continuous_batching_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
size_t step_count = 0;
#endif

bool m_is_validation_mode_enabled = false;

void _free_non_running_requests();
void _notify_requests_dropped_by_handle();
void _register_step_cache_usage(float step_cache_usage);
Expand All @@ -43,14 +45,16 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
const Tokenizer& tokenizer,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& plugin_config);
const ov::AnyMap& plugin_config,
bool is_validation_mode_enabled = false);

ContinuousBatchingImpl(const std::string& models_path,
const SchedulerConfig& scheduler_config,
const std::string& device,
const ov::AnyMap& llm_plugin_config,
const ov::AnyMap& tokenizer_plugin_config)
: ContinuousBatchingImpl{models_path, Tokenizer(models_path, tokenizer_plugin_config), scheduler_config, device, llm_plugin_config} {};
const ov::AnyMap& tokenizer_plugin_config,
bool is_validation_mode_enabled = false)
: ContinuousBatchingImpl{models_path, Tokenizer(models_path, tokenizer_plugin_config), scheduler_config, device, llm_plugin_config, is_validation_mode_enabled} {};
ilya-lavrenov marked this conversation as resolved.
Show resolved Hide resolved


GenerationHandle add_request(uint64_t request_id,
Expand Down
10 changes: 9 additions & 1 deletion src/cpp/src/continuous_batching_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "openvino/genai/generation_handle.hpp"
#include "openvino/genai/tokenizer.hpp"
#include "continuous_batching_impl.hpp"
#include "speculative_decoding_impl.hpp"
#include "timer.hpp"
#include "debug_utils.hpp"
#include "cache_state_dumper.hpp"
Expand All @@ -21,7 +22,14 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::string& model
const std::string& device,
const ov::AnyMap& llm_plugin_config,
const ov::AnyMap& tokenizer_plugin_config) {
m_impl = std::make_shared<ContinuousBatchingImpl>(models_path, scheduler_config, device, llm_plugin_config, tokenizer_plugin_config);
if (llm_plugin_config.find(ov::genai::draft_model_path.name()) == llm_plugin_config.end()) {
m_impl = std::make_shared<ContinuousBatchingImpl>(models_path, scheduler_config, device, llm_plugin_config, tokenizer_plugin_config);
} else {
std::string draft_model_path = llm_plugin_config.at(ov::genai::draft_model_path.name()).as<std::string>();
auto llm_plugin_config_without_draft_model = llm_plugin_config;
llm_plugin_config_without_draft_model.erase(ov::genai::draft_model_path.name());
m_impl = std::make_shared<SpeculativeDecodingImpl>(models_path, draft_model_path, scheduler_config, device, llm_plugin_config_without_draft_model, tokenizer_plugin_config);
}
}

ContinuousBatchingPipeline::ContinuousBatchingPipeline(
Expand Down
53 changes: 53 additions & 0 deletions src/cpp/src/speculative_decoding_impl.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "speculative_decoding_impl.hpp"

namespace ov::genai {
ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(
    const std::string& main_models_path,
    const std::string& draft_models_path,
    const Tokenizer& tokenizer,
    const SchedulerConfig& scheduler_config,
    const std::string& device,
    const ov::AnyMap& plugin_config) {
    // Both pipelines share the same tokenizer, scheduler config, device and
    // plugin config. The main pipeline is constructed with validation mode
    // enabled; the draft pipeline with it disabled.
    m_draft_pipeline = std::make_shared<ContinuousBatchingImpl>(
        draft_models_path, tokenizer, scheduler_config, device, plugin_config, /*is_validation_mode_enabled=*/false);
    m_main_pipeline = std::make_shared<ContinuousBatchingImpl>(
        main_models_path, tokenizer, scheduler_config, device, plugin_config, /*is_validation_mode_enabled=*/true);
}

/// Adds a pre-tokenized request and returns its generation handle.
/// The request is forwarded to the main pipeline only.
/// NOTE(review): the draft pipeline never receives the request here — confirm
/// it is populated elsewhere before speculative steps are expected to run.
GenerationHandle
ContinuousBatchingPipeline::SpeculativeDecodingImpl::add_request(uint64_t request_id,
                                                                 const ov::Tensor& input_ids,
                                                                 ov::genai::GenerationConfig sampling_params) {
    return m_main_pipeline->add_request(request_id, input_ids, sampling_params);
}

/// Adds a text-prompt request and returns its generation handle.
GenerationHandle
ContinuousBatchingPipeline::SpeculativeDecodingImpl::add_request(uint64_t request_id,
                                                                 const std::string& prompt,
                                                                 ov::genai::GenerationConfig sampling_params) {
    // Text prompts are forwarded to the main pipeline as-is.
    auto handle = m_main_pipeline->add_request(request_id, prompt, sampling_params);
    return handle;
}

/// Returns true while any queued request has not finished generating.
bool ContinuousBatchingPipeline::SpeculativeDecodingImpl::has_non_finished_requests() {
    // Request state is tracked by the main pipeline only.
    const bool pending = m_main_pipeline->has_non_finished_requests();
    return pending;
}

// Performs one scheduling/inference iteration of the pipeline.
// TODO: not implemented yet — this is a no-op, so step()-driven generation
// makes no progress. Presumably it will run the draft pipeline to propose
// candidate tokens and the main pipeline (validation mode) to verify them —
// confirm against the final design.
void ContinuousBatchingPipeline::SpeculativeDecodingImpl::step() {
}

/// Runs batched generation on pre-tokenized inputs and returns the encoded results.
std::vector<EncodedGenerationResult>
ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<ov::Tensor>& input_ids,
                                                              const std::vector<GenerationConfig>& sampling_params,
                                                              const StreamerVariant& streamer) {
    // Delegate to the main pipeline's synchronous generate loop.
    auto results = m_main_pipeline->generate(input_ids, sampling_params, streamer);
    return results;
}

/// Runs batched generation on text prompts and returns the decoded results.
std::vector<GenerationResult>
ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<std::string>& prompts,
                                                              std::vector<ov::genai::GenerationConfig> sampling_params,
                                                              const StreamerVariant& streamer) {
    // Delegate to the main pipeline's synchronous generate loop.
    auto results = m_main_pipeline->generate(prompts, sampling_params, streamer);
    return results;
}

}
51 changes: 51 additions & 0 deletions src/cpp/src/speculative_decoding_impl.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "openvino/genai/continuous_batching_pipeline.hpp"
#include "continuous_batching_impl.hpp"

namespace ov::genai {
/// Continuous-batching implementation that pairs a main model with a smaller
/// draft model for speculative decoding. Requests and generation are currently
/// routed through the main pipeline.
class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBatchingPipeline::ImplInterface {
protected:
    // Main model pipeline; constructed with validation mode enabled.
    std::shared_ptr<ContinuousBatchingImpl> m_main_pipeline;
    // Draft model pipeline; constructed with validation mode disabled.
    std::shared_ptr<ContinuousBatchingImpl> m_draft_pipeline;

public:
    SpeculativeDecodingImpl(const std::string& main_models_path,
                            const std::string& draft_models_path,
                            const Tokenizer& tokenizer,
                            const SchedulerConfig& scheduler_config,
                            const std::string& device,
                            const ov::AnyMap& plugin_config);

    // Convenience overload: builds the Tokenizer from the main model directory.
    SpeculativeDecodingImpl(const std::string& main_models_path,
                            const std::string& draft_models_path,
                            const SchedulerConfig& scheduler_config,
                            const std::string& device,
                            const ov::AnyMap& llm_plugin_config,
                            const ov::AnyMap& tokenizer_plugin_config)
        : SpeculativeDecodingImpl{main_models_path, draft_models_path, Tokenizer(main_models_path, tokenizer_plugin_config), scheduler_config, device, llm_plugin_config} {}

    GenerationHandle add_request(uint64_t request_id,
                                 const ov::Tensor& input_ids,
                                 ov::genai::GenerationConfig sampling_params) override;
    GenerationHandle add_request(uint64_t request_id,
                                 const std::string& prompt,
                                 ov::genai::GenerationConfig sampling_params) override;

    bool has_non_finished_requests() override;

    void step() override;

    std::vector<EncodedGenerationResult>
    generate(const std::vector<ov::Tensor>& input_ids,
             const std::vector<GenerationConfig>& sampling_params,
             const StreamerVariant& streamer) override;
    std::vector<GenerationResult>
    generate(const std::vector<std::string>& prompts,
             std::vector<ov::genai::GenerationConfig> sampling_params,
             const StreamerVariant& streamer) override;
};
}
Loading