
Commit

test
sbalandi committed Jan 27, 2025
1 parent 5924b23 commit 5eb01b6
Showing 10 changed files with 46 additions and 37 deletions.
2 changes: 1 addition & 1 deletion src/cpp/src/debug_utils.hpp
@@ -12,7 +12,7 @@
template <typename T>
void print_array(T * array, size_t size) {
std::cout << " => [ ";
-for (size_t i = 0; i < std::min(size, size_t(10)); ++i) {
+for (size_t i = 0; i < size; ++i) {
std::cout << array[i] << " ";
}
std::cout << " ] " << std::endl;
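The change above removes the 10-element cap in print_array, so the helper now dumps every element of the buffer instead of only the first ten. A minimal Python sketch, not part of the commit, just an analogue of the helper, showing what that means for the debug output:

def print_array(values, cap=None):
    # Analogue of the C++ helper: optionally truncate, then print on one line.
    shown = values if cap is None else values[:cap]
    print(" => [ " + " ".join(str(v) for v in shown) + " ]")

token_ids = list(range(25))      # stand-in for generated token ids
print_array(token_ids, cap=10)   # old behaviour: only the first 10 ids are visible
print_array(token_ids)           # new behaviour: the full sequence is printed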
4 changes: 4 additions & 0 deletions src/cpp/src/icontinuous_batching.cpp
@@ -2,6 +2,7 @@
// SPDX-License-Identifier: Apache-2.0

#include "icontinuous_batching.hpp"
#include "debug_utils.hpp"

namespace ov::genai {

@@ -66,6 +67,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
// in case when chat_template was not found in tokenizer_config.json or set
encoded_inputs = m_tokenizer.encode(prompt).input_ids;
}
+print_tensor("encoded_inputs", encoded_inputs);
input_ids.push_back(encoded_inputs);
tokenization_durations.emplace_back(PerfMetrics::get_microsec(std::chrono::steady_clock::now() - encode_start));
}
@@ -82,6 +84,8 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
auto& raw_counters = perf_metrics.raw_metrics;
raw_counters.tokenization_durations.emplace_back(tokenization_durations[i]);

+print_array(res.m_generation_ids.at(0).data(), res.m_generation_ids.at(0).size());

std::vector<std::string> generated;
generated.reserve(res.m_generation_ids.size());
for (size_t idx = 0; idx < res.m_generation_ids.size(); ++idx) {
40 changes: 20 additions & 20 deletions src/cpp/src/llm_pipeline_stateful.cpp
@@ -90,18 +90,18 @@ DecodedResults StatefulLLMPipeline::generate(

if (auto input_vector = std::get_if<std::vector<std::string>>(&inputs)) {
OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts");
-if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
-std::vector<std::string> templated_input_vector;
-for (auto& input : *input_vector) {
-ChatHistory history({{{"role", "user"}, {"content", input}}});
-constexpr bool add_generation_prompt = true;
-auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
-templated_input_vector.push_back(templated_prompt);
-}
-encoded_input = m_tokenizer.encode(templated_input_vector, ov::genai::add_special_tokens(false));
-} else {
-encoded_input = m_tokenizer.encode(*input_vector);
-}
+// if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
+// std::vector<std::string> templated_input_vector;
+// for (auto& input : *input_vector) {
+// ChatHistory history({{{"role", "user"}, {"content", input}}});
+// constexpr bool add_generation_prompt = true;
+// auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
+// templated_input_vector.push_back(templated_prompt);
+// }
+// encoded_input = m_tokenizer.encode(templated_input_vector, ov::genai::add_special_tokens(false));
+// } else {
+encoded_input = m_tokenizer.encode(*input_vector);
+// }
} else if (auto input_prompt = std::get_if<std::string>(&inputs)) {
std::string& prompt = *input_prompt;

@@ -170,16 +170,16 @@ DecodedResults StatefulLLMPipeline::generate(

// TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied
} else {
-std::string& prompt = *input_prompt;
-if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
-ChatHistory history({{{"role", "user"}, {"content", prompt}}});
-constexpr bool add_generation_prompt = true;
-auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
-encoded_input = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false));
-} else {
+// std::string& prompt = *input_prompt;
+// if (config.apply_chat_template && !m_tokenizer.get_chat_template().empty()) {
+// ChatHistory history({{{"role", "user"}, {"content", prompt}}});
+// constexpr bool add_generation_prompt = true;
+// auto templated_prompt = m_tokenizer.apply_chat_template(history, add_generation_prompt);
+// encoded_input = m_tokenizer.encode(templated_prompt, ov::genai::add_special_tokens(false));
+// } else {
// in case when chat_template was not found in tokenizer_config.json or set
encoded_input = m_tokenizer.encode(prompt);
-}
+// }
}
}

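With these branches commented out, the paths shown above encode the prompt as-is, regardless of the apply_chat_template setting or the tokenizer's chat template. The flag itself is still exercised from Python through the public API; a minimal sketch of the two modes, assuming openvino_genai is installed and tiny-random-phi3 has already been exported to a local directory as in the tests below:

import openvino_genai

# Assumed local path to a converted model (e.g. an optimum-cli export of
# katuni4ka/tiny-random-phi3); adjust to your own location.
pipe = openvino_genai.LLMPipeline("tiny-random-phi3", "CPU")

# The prompt is wrapped in the tokenizer's chat template before encoding.
templated = pipe.generate("What is OpenVINO?", max_new_tokens=16, apply_chat_template=True)

# The prompt is tokenized as-is, with no role markers or generation prompt.
raw = pipe.generate("What is OpenVINO?", max_new_tokens=16, apply_chat_template=False)

print(templated)
print(raw)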
17 changes: 15 additions & 2 deletions tests/python_tests/common.py
@@ -253,13 +253,17 @@ def run_hugging_face(
for prompt, generation_config in zip(prompts, generation_configs):
hf_generation_config = convert_to_hf(opt_model.generation_config, generation_config)
inputs = {}
-if hf_tokenizer.chat_template:
+if hf_tokenizer.chat_template and generation_config.apply_chat_template:
prompt = hf_tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
inputs = hf_tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
else:
inputs = hf_tokenizer(prompt, return_tensors="pt")
input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']
prompt_len = 0 if generation_config.echo else input_ids.numel()

+if (not generation_config.apply_chat_template):
+    print("prompt: ", prompt)
+    print("inputs: ", inputs)

generate_outputs = opt_model.generate(input_ids=input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer)
all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True)
@@ -272,14 +276,19 @@ def run_hugging_face(
generation_results.append(generation_result)
else:
inputs = {}
-if hf_tokenizer.chat_template:
+if hf_tokenizer.chat_template and generation_configs.apply_chat_template:
processed_prompts = []
for prompt in prompts:
processed_prompts.append(hf_tokenizer.apply_chat_template([{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True))
# process all prompts as a single batch as we have a single generation config for all prompts
inputs = hf_tokenizer(processed_prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=False, padding_side='left')
else:
inputs = hf_tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, padding_side='left')

+if (not generation_configs.apply_chat_template):
+    print("prompt: ", prompt)
+    print("inputs: ", inputs['input_ids'])

input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']
hf_generation_config = convert_to_hf(opt_model.generation_config, generation_configs)
hf_encoded_outputs = opt_model.generate(input_ids, attention_mask=attention_mask, generation_config=hf_generation_config, tokenizer=hf_tokenizer)
@@ -487,6 +496,10 @@ def run_llm_pipeline_with_ref(model_id: str,
ov_results = run_llm_pipeline(models_path, prompts, generation_config, use_cb, streamer=streamer.accumulate if isinstance(streamer, StreamerWithResults) else streamer)
hf_results = run_hugging_face(opt_model, hf_tokenizer, prompts, generation_config)

+if (not generation_config.apply_chat_template):
+    print("ov_results ", ov_results)
+    print("hf_results: ", hf_results)

compare_generation_results(prompts, hf_results, ov_results, generation_config)


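The conditions added above make the Hugging Face reference apply the chat template only when the GenAI generation config asks for it, keeping the reference tokenization aligned with what the pipeline under test encodes, and the new prints expose exactly what went into the model. A standalone sketch of the two encoding paths used in run_hugging_face, assuming the tokenizer ships a chat_template (the tiny-random-phi3 checkpoint used by these tests is one example):

from transformers import AutoTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("katuni4ka/tiny-random-phi3")
prompt = "What is OpenVINO?"

# apply_chat_template=True path: wrap the prompt in role markers plus the
# generation prompt, then tokenize without adding extra special tokens.
templated = hf_tokenizer.apply_chat_template(
    [{'role': 'user', 'content': prompt}], tokenize=False, add_generation_prompt=True)
templated_inputs = hf_tokenizer(templated, return_tensors="pt", add_special_tokens=False)

# apply_chat_template=False path: tokenize the raw prompt with the default
# special-token handling.
raw_inputs = hf_tokenizer(prompt, return_tensors="pt")

print("templated ids:", templated_inputs["input_ids"])
print("raw ids:", raw_inputs["input_ids"])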
6 changes: 4 additions & 2 deletions tests/python_tests/test_llm_pipeline.py
@@ -339,7 +339,8 @@ def test_unicode_pybind_decoding_one_string():
# Test that pybind will not fail.
model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')
ov_pipe = read_model((model_id, path))[4]
-res_str = ov_pipe.generate(',', max_new_tokens=4, apply_chat_template=False)
+res_str = ov_pipe.generate(',', max_new_tokens=4, apply_chat_template=True)
+print(res_str)
assert '�' == res_str[-1]


@@ -350,7 +351,8 @@ def test_unicode_pybind_decoding_batched():
# Test that pybind will not fail.
model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3')
ov_pipe = read_model((model_id, path))[4]
res_str = ov_pipe.generate([","], max_new_tokens=4, apply_chat_template=False)
res_str = ov_pipe.generate([","], max_new_tokens=4, apply_chat_template=True)
print(res_str.texts)
assert '�' == res_str.texts[0][-1]


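Both unicode tests now run with apply_chat_template=True and still assert that the output ends in the replacement character '�', which appears when decoding stops part-way through a multi-byte UTF-8 sequence, something a tiny random model is very likely to produce within 4 new tokens. A small sketch of that decoding behaviour, independent of the pipeline:

# Why the assertion expects U+FFFD: decoding a truncated multi-byte UTF-8
# sequence with errors="replace" yields the replacement character.
snowman = "☃".encode("utf-8")                       # b'\xe2\x98\x83', three bytes
truncated = snowman[:2]                             # cut off before the last byte
text = truncated.decode("utf-8", errors="replace")
print(text)                                         # prints the replacement character
assert text[-1] == "\ufffd"                         # same check the tests make on res_str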
2 changes: 2 additions & 0 deletions tests/python_tests/test_sampling.py
@@ -30,6 +30,8 @@
])
def test_basic_stop_criteria(tmp_path, generation_config, prompt):
model_id : str = "katuni4ka/tiny-random-phi3"
+if 'apply_chat_template' in generation_config:
+    print("apply_chat_template ", generation_config['apply_chat_template'])
run_llm_pipeline_with_ref(model_id, [prompt], generation_config, tmp_path)



4 files were deleted (file names and contents not shown).
