diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index c853e194cc..a934c2cb71 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -170,6 +170,27 @@ jobs:
         "
         echo "你好! 你好嗎?" passed
+        timeout 1m ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt
+        python -c "
+        import transformers
+        with open('pred.txt', 'r') as file:
+            predictions = file.read()
+        tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
+        prompts = [
+            'Alan Turing was a',
+            'return 0',
+            '你好! 你好嗎?'
+        ]
+        for prompt in prompts:
+            tokenized = tokenizer(prompt, return_tensors='pt')
+            for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
+                ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+                idx = predictions.find(ref)
+                if -1 == idx:
+                    raise RuntimeError(f'Missing "{ref=}" from predictions')
+                predictions = predictions[:idx] + predictions[idx + len(ref):]
+        "
+        echo "Multi prompt" passed

   cpp-beam_search_causal_lm-windows:
     runs-on: windows-latest
     steps:
@@ -291,7 +312,6 @@ jobs:
         source ./ov/setupvars.sh
         convert_tokenizer ./Phi-2/pytorch/dldt/FP16/ --output ./Phi-2/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code
         timeout 50s ./build/beam_search_causal_lm ./Phi-2/pytorch/dldt/FP16/ 69 > ./pred.txt
-
   cpp-beam_search_causal_lm-notus-7b-v1:
     runs-on: ubuntu-20.04-16-cores
     steps:
@@ -331,7 +351,7 @@ jobs:
     - name: Install OpenVINO
       run: |
         mkdir ./ov/
-        curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2023.3/linux/l_openvino_toolkit_ubuntu20_2023.3.0.13775.ceeafaf64f3_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz
+        curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.1.0-14645-e6dc0865128/l_openvino_toolkit_ubuntu20_2024.1.0.dev20240304_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz
         sudo ./ov/install_dependencies/install_openvino_dependencies.sh
     - name: Download, convert and build
       run: |
diff --git a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp
index 218b0af9ec..eb90b17b5e 100644
--- a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp
+++ b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp
@@ -5,12 +5,8 @@
 #include <openvino/openvino.hpp>

 namespace {
-std::pair<ov::Tensor, ov::Tensor> tokenize(ov::InferRequest& tokenizer, std::string&& prompt) {
-    constexpr size_t BATCH_SIZE = 1;
-    tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt});
-    tokenizer.infer();
-    return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")};
-}
+
+enum SPECIAL_TOKEN { PAD_TOKEN = 2 };

 std::string detokenize(ov::InferRequest& detokenizer, const std::vector<int64_t>& tokens) {
     constexpr size_t BATCH_SIZE = 1;
@@ -22,52 +18,187 @@ std::string detokenize(ov::InferRequest& detokenizer, const std::vector<int64_t>& tokens) {
     detokenizer.infer();
     return detokenizer.get_output_tensor().data<std::string>()[0];
 }
+
+std::pair<ov::Tensor, ov::Tensor> pad_left(ov::Tensor&& input_ids, ov::Tensor&& attention_mask) {
+    const size_t batch_size = input_ids.get_shape().at(0);
+    const size_t sequence_length = input_ids.get_shape().at(1);
+    int64_t* inputs_data = input_ids.data<int64_t>();
+    int64_t* attention_mask_data = attention_mask.data<int64_t>();
+
+    for (size_t batch = 0; batch < batch_size; batch++) {
+        const size_t batch_offset = batch * sequence_length;
+
+        // last token in the sequence is not a PAD_TOKEN, skipping
+        if (inputs_data[batch_offset + sequence_length - 1] != SPECIAL_TOKEN::PAD_TOKEN) {
+            continue;
+        }
+
+        size_t pad_tokens_number = 0;
+        for (int i = sequence_length - 1; i >= 0; i--) {
+            const size_t token_offset = batch_offset + i;
+
+            if (inputs_data[token_offset] == SPECIAL_TOKEN::PAD_TOKEN) {
+                continue;
+            }
+
+            if (pad_tokens_number == 0) {
+                pad_tokens_number = sequence_length - i - 1;
+            }
+
+            std::swap(inputs_data[token_offset], inputs_data[token_offset + pad_tokens_number]);
+            std::swap(attention_mask_data[token_offset], attention_mask_data[token_offset + pad_tokens_number]);
+        }
+    }
+
+    return {input_ids, attention_mask};
+}
+
+std::pair<ov::Tensor, ov::Tensor> tokenize(ov::InferRequest& tokenizer, std::vector<std::string> prompts) {
+    tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()});
+
+    tokenizer.infer();
+
+    pad_left(tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask"));
+
+    // fix mask filled with '2' instead of '0'
+    ov::Tensor attention_mask = tokenizer.get_tensor("attention_mask");
+    int64_t* attention_mask_data = attention_mask.data<int64_t>();
+    std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0);
+
+    return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")};
+}
+
+void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask) {
+    const size_t batch_size = attention_mask.get_shape().at(0);
+    const size_t sequence_length = attention_mask.get_shape().at(1);
+
+    const int64_t* attention_mask_data = attention_mask.data<int64_t>();
+    int64_t* position_ids_data = position_ids.data<int64_t>();
+
+    for (size_t batch = 0; batch < batch_size; batch++) {
+        const size_t batch_offset = batch * sequence_length;
+        size_t sum = 0;
+
+        for (size_t i = 0; i < sequence_length; i++) {
+            const size_t element_offset = batch_offset + i;
+            position_ids_data[element_offset] = sum;
+            if (attention_mask_data[element_offset] == 1) {
+                sum += 1;
+            }
+        }
+    }
+}
+
+void initialize_inputs(const ov::Tensor& input_ids, const ov::Tensor& attention_mask, ov::InferRequest& request) {
+    request.set_tensor("input_ids", input_ids);
+    request.set_tensor("attention_mask", attention_mask);
+
+    ov::Shape input_shape = input_ids.get_shape();
+
+    ov::Tensor position_ids = request.get_tensor("position_ids");
+    position_ids.set_shape(input_shape);
+    initialize_position_ids(position_ids, attention_mask);
+
+    ov::Tensor beam_idx = request.get_tensor("beam_idx");
+    beam_idx.set_shape({input_shape.at(0)});
+    std::fill_n(beam_idx.data<int32_t>(), input_shape.at(0), 0);
+}
+
+void set_attention_mask(ov::Tensor&& attention_mask, std::vector<int32_t> next_beams) {
+    ov::Tensor original_mask{ov::element::i64, attention_mask.get_shape()};
+    ov::Shape original_shape = original_mask.get_shape();
+    attention_mask.copy_to(original_mask);
+
+    ov::Shape new_shape{next_beams.size(), original_mask.get_shape().at(1) + 1};
+    attention_mask.set_shape(new_shape);
+
+    for (size_t beam_id = 0; beam_id < next_beams.size(); beam_id++) {
+        const size_t original_prompt_offset = next_beams.at(beam_id) * original_shape.at(1);
+        const size_t result_prompt_offset = beam_id * new_shape.at(1);
+
+        int64_t* dest = attention_mask.data<int64_t>() + result_prompt_offset;
+        const int64_t* src = original_mask.data<int64_t>() + original_prompt_offset;
+
+        std::memcpy(dest, src, original_shape.at(1) * sizeof(int64_t));
+        attention_mask.data<int64_t>()[result_prompt_offset + new_shape.at(1) - 1] = 1;
+    }
+}
+
+void set_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask) {
+    const size_t batch_size = attention_mask.get_shape().at(0);
+    const size_t sequence_length = attention_mask.get_shape().at(1);
+    position_ids.set_shape({batch_size, 1});
+
+    for (size_t batch = 0; batch < batch_size; batch++) {
+        int64_t* mask_start = attention_mask.data<int64_t>() + batch * sequence_length;
+        position_ids.data<int64_t>()[batch] = std::accumulate(mask_start, mask_start + sequence_length - 1, 0);
+    }
+}
+
+std::vector<std::string> prompts_arguments_to_vector(int argc, char* argv[]) {
+    std::vector<std::string> prompts;
+    prompts.reserve(argc - 2);
+    for (size_t i = 2; i < argc; i++) {
+        prompts.push_back(std::string{argv[i]});
+    }
+    return prompts;
 }
+}  // namespace
+
 int main(int argc, char* argv[]) try {
-    if (argc != 3) {
-        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> '<PROMPT>'");
+    if (argc < 3) {
+        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> '<PROMPT 1>' ['<PROMPT 2>' ...]");
     }
+
+    // Compile models
     ov::Core core;
     core.add_extension(OPENVINO_TOKENIZERS_PATH);  // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
-    //Read the tokenizer model information from the file to later get the runtime information
+    // Read the tokenizer model information from the file to later get the runtime information
     auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml");
     // tokenizer and detokenizer work on CPU only
-    ov::InferRequest tokenizer = core.compile_model(
-        tokenizer_model, "CPU").create_infer_request();
-    auto [input_ids, attention_mask] = tokenize(tokenizer, argv[2]);
-    ov::InferRequest detokenizer = core.compile_model(
-        std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request();
+    ov::InferRequest tokenizer = core.compile_model(tokenizer_model, "CPU").create_infer_request();
+    ov::InferRequest detokenizer =
+        core.compile_model(std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request();
     // The model can be compiled for GPU as well
-    ov::InferRequest lm = core.compile_model(
-        std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request();
-    // Initialize inputs
-    lm.set_tensor("input_ids", input_ids);
-    lm.set_tensor("attention_mask", attention_mask);
-    ov::Tensor position_ids = lm.get_tensor("position_ids");
-    position_ids.set_shape(input_ids.get_shape());
-    std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0);
-    lm.get_tensor("beam_idx").set_shape({1});
-    lm.get_tensor("beam_idx").data<int32_t>()[0] = 0;
+    ov::InferRequest lm =
+        core.compile_model(std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request();
+
+    auto [input_ids, attention_mask] = tokenize(tokenizer, prompts_arguments_to_vector(argc, argv));
+
+    // Initialize beam search
+    const int64_t* prompt_data = input_ids.data<const int64_t>();
+    std::vector<std::vector<int64_t>> prompts;
+    prompts.reserve(input_ids.get_shape().at(0));
+    for (size_t batch = 0; batch < input_ids.get_shape().at(0); batch++) {
+        size_t sequence_length = input_ids.get_shape().at(1);
+        size_t batch_offset = batch * sequence_length;
+        const int64_t* prompt_start = prompt_data + batch_offset;
+        prompts.push_back(std::vector<int64_t>{prompt_start, prompt_start + sequence_length});
+    }
     // Get the runtime info from the tokenizer model that we read earlier
-    auto rt_info = tokenizer_model->get_rt_info(); //Get the runtime info for the model
+    auto rt_info = tokenizer_model->get_rt_info();  // Get the runtime info for the model
     int64_t SPECIAL_EOS_TOKEN;
-    if (rt_info.count("eos_token_id") > 0) { //check if the runtime information has a valid EOS token ID
+    if (rt_info.count("eos_token_id") > 0) {  // check if the runtime information has a valid EOS token ID
         SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as<int64_t>();
-
+
     } else {
         throw std::runtime_error("EOS token ID not found in model's runtime information.");
     }
-    const int64_t* prompt_data = input_ids.data<const int64_t>();
-    Parameters parameters{std::vector<int64_t>{prompt_data, prompt_data + input_ids.get_size()}, SPECIAL_EOS_TOKEN};
+
+    Parameters parameters{std::move(prompts), SPECIAL_EOS_TOKEN};
     GroupBeamSearcher group_beam_searcher{parameters};
+
+    initialize_inputs(input_ids, attention_mask, lm);
+
     std::vector<int64_t> next_tokens;
     std::vector<int32_t> next_beams;
+
     for (size_t length_count = 0; length_count < parameters.max_new_tokens; ++length_count) {
         lm.infer();
+
         std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits"));
         if (next_tokens.empty()) {
             break;
@@ -77,17 +208,17 @@ int main(int argc, char* argv[]) try {
         lm.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()});
         lm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()});
         // Set auxiliary inputs
-        ov::Tensor attention_mask = lm.get_tensor("attention_mask");
-        ov::Shape mask_shape{batch_size, attention_mask.get_shape().at(1) + 1};
-        attention_mask.set_shape(mask_shape);
-        std::fill_n(attention_mask.data<int64_t>(), ov::shape_size(mask_shape), 1);
-        lm.get_tensor("position_ids").set_shape({batch_size, 1});
-        std::fill_n(lm.get_tensor("position_ids").data<int64_t>(), batch_size, mask_shape.at(1) - 1);
+        set_attention_mask(lm.get_tensor("attention_mask"), next_beams);
+        set_position_ids(lm.get_tensor("position_ids"), lm.get_tensor("attention_mask"));
     }
-    for (const std::vector<Beam>& group : finalize(std::move(group_beam_searcher))) {
-        std::cout << "Group:\n";
-        for (const Beam& beam : group) {
-            std::cout << beam.score << ": " << detokenize(detokenizer, beam.tokens) << '\n';
+
+    for (const std::vector<std::vector<Beam>>& prompt_group : finalize(std::move(group_beam_searcher))) {
+        std::cout << "Prompt:\n";
+        for (const std::vector<Beam> group : prompt_group) {
+            std::cout << "Group:\n";
+            for (const Beam& beam : group) {
+                std::cout << beam.score << ": " << detokenize(detokenizer, beam.tokens) << '\n';
+            }
         }
     }
     // Model is stateful which means that context (kv-cache) which belongs to a particular
diff --git a/text_generation/causal_lm/cpp/group_beam_searcher.hpp b/text_generation/causal_lm/cpp/group_beam_searcher.hpp
index 1703ac79b8..6c97c869a3 100644
--- a/text_generation/causal_lm/cpp/group_beam_searcher.hpp
+++ b/text_generation/causal_lm/cpp/group_beam_searcher.hpp
@@ -44,9 +44,12 @@ std::vector<int64_t> kmp_search(const std::vector<int64_t>& haystack, const std:
     return res;
 }

-struct Token {float log_prob; int64_t idx;};
+struct Token {
+    float log_prob;
+    int64_t idx;
+};

-std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx) {
+std::vector<Token> log_softmax(const ov::Tensor& logits, const size_t batch_idx) {
     if (logits.get_shape().at(0) <= batch_idx) {
         throw std::runtime_error("logits batch size doesn't match the number of beams");
     }
@@ -55,10 +58,10 @@ std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx) {
     size_t sequence_offset = (logits.get_shape().at(1) - 1) * vocab_size;
     const float* beam_logits = logits.data<const float>() + batch_offset + sequence_offset;
     float max_logit = *std::max_element(beam_logits, beam_logits + vocab_size);
-    float log_sum = std::log(std::accumulate(
-        beam_logits, beam_logits + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) {
+    float log_sum = std::log(
+        std::accumulate(beam_logits, beam_logits + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) {
             return accumulated + std::exp(to_add - max_logit);
-    }));
+        }));
     std::vector<Token> tokens;
     tokens.reserve(vocab_size);
     for (size_t idx = 0; idx < vocab_size; ++idx) {
@@ -77,10 +80,10 @@ bool greater(const Beam& left, const Beam& right) {
     return left.score > right.score;
 }

-enum class StopCriteria {early, heuristic, never};
+enum class StopCriteria { early, heuristic, never };

 struct Parameters {
-    std::vector<int64_t> prompt;
+    std::vector<std::vector<int64_t>> prompts;
     int64_t eos_token;
     size_t n_groups = 3;
     size_t group_size = 5;
@@ -89,12 +92,14 @@ struct Parameters {
     StopCriteria stop_criteria = StopCriteria::heuristic;
     float length_penalty = 1.0;
     size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();
-
-    std::function<bool(const Beam&)> early_finish = [](const Beam&){return false;};
+
+    std::function<bool(const Beam&)> early_finish = [](const Beam&) {
+        return false;
+    };
 };

 struct Group {
-    std::vector<Beam> ongoing;  // Best beams in front
+    std::vector<Beam> ongoing;   // Best beams in front
     std::vector<Beam> min_heap;  // The worst of the best completed beams is the first
     bool done = false;

@@ -121,60 +126,97 @@ struct Group {
         float best_sum_logprobs = ongoing.front().score;
         float worst_score = min_heap.front().score;
         switch (parameters.stop_criteria) {
-            case StopCriteria::early:
-                done = true;
-                return;
-            case StopCriteria::heuristic: {
-                float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty);
-                done = worst_score >= highest_attainable_score;
-                return;
-            }
-            case StopCriteria::never: {
-                size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len;
-                float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty);
-                done = worst_score >= highest_attainable_score;
-                return;
-            }
-            default: throw std::runtime_error("Never reached");
+        case StopCriteria::early:
+            done = true;
+            return;
+        case StopCriteria::heuristic: {
+            float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty);
+            done = worst_score >= highest_attainable_score;
+            return;
+        }
+        case StopCriteria::never: {
+            size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len;
+            float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty);
+            done = worst_score >= highest_attainable_score;
+            return;
+        }
+        default:
+            throw std::runtime_error("Never reached");
         }
     }
 };

-struct TokenToBeam {int64_t token_idx; int32_t beam_idx;};
-
 // GroupBeamSearcher processes logits prduced by a language model and accumulates beams using group beam search
 // algorithm. select_next_tokens() returns token ids selected by the algorithm and corresponding beam ids. These values
 // are used for next inference. select_next_tokens() returns empty, if all groups are completed
 struct GroupBeamSearcher {
     Parameters parameters;
-    std::vector<Group> groups;
-    GroupBeamSearcher(Parameters parameters) : parameters{std::move(parameters)}, groups{parameters.n_groups} {
+    std::vector<std::vector<Group>> prompts_groups;
+
+    GroupBeamSearcher(Parameters parameters) : parameters{parameters}, prompts_groups{parameters.prompts.size()} {
         if (parameters.no_repeat_ngram_size == 0) {
             throw std::runtime_error("no_repeat_ngram_size must be positive");
         }
-        for (Group& group : groups) {
-            group.ongoing.resize(parameters.group_size);
-            group.ongoing.front().score = 0.0;
+
+        for (std::vector<Group>& prompts_groups : prompts_groups) {
+            prompts_groups.resize(parameters.n_groups);
+            for (Group& group : prompts_groups) {
+                group.ongoing.resize(parameters.group_size);
+                group.ongoing.front().score = 0.0;
+            }
         }
     }
+
     std::pair<std::vector<int64_t>, std::vector<int32_t>> select_next_tokens(const ov::Tensor& logits) {
         std::vector<int64_t> next_tokens;
         std::vector<int32_t> next_beams;
-        next_tokens.reserve(parameters.n_groups * parameters.group_size);
-        next_beams.reserve(parameters.n_groups * parameters.group_size);
+
+        const size_t promts_size = parameters.prompts.size();
+
+        next_tokens.reserve(promts_size * parameters.n_groups * parameters.group_size);
+        next_beams.reserve(promts_size * parameters.n_groups * parameters.group_size);
+
         size_t beam_count = 0;
-        for (Group& group : groups) {
-            if (!group.done) {
+        size_t prompt_id = 0;
+        for (std::vector<Group>& groups : prompts_groups) {
+            for (Group& group : groups) {
+                if (group.done) {
+                    continue;
+                }
                 for (Beam& beam : group.ongoing) {
-                    beam.global_beam_idx = beam_count;
                     // beam.tokens.empty() holds for the first select_next_tokens() call.
                     // Every beam is constructed from the single batch at first call
-                    if (!beam.tokens.empty()) {
+                    if (beam.tokens.empty()) {
+                        beam.global_beam_idx = prompt_id;
+                    } else {
+                        beam.global_beam_idx = beam_count;
                         ++beam_count;
                     }
                 }
             }
+
+            prompt_id += 1;
+        }
+
+        for (int prompt_id = 0; prompt_id < promts_size; prompt_id++) {
+            const std::vector<int64_t> prompt = parameters.prompts[prompt_id];
+            std::vector<Group>& groups = prompts_groups[prompt_id];
+            auto [prompt_next_tokens, prompt_next_beams] = select_prompt_next_tokens(logits, prompt, groups);
+
+            next_tokens.insert(next_tokens.end(), prompt_next_tokens.begin(), prompt_next_tokens.end());
+            next_beams.insert(next_beams.end(), prompt_next_beams.begin(), prompt_next_beams.end());
         }
+
+        return {next_tokens, next_beams};
+    }
+
+    std::pair<std::vector<int64_t>, std::vector<int32_t>> select_prompt_next_tokens(const ov::Tensor& logits,
+                                                                                    const std::vector<int64_t>& prompt,
+                                                                                    std::vector<Group>& groups) {
+        std::vector<int64_t> next_tokens;
+        std::vector<int32_t> next_beams;
+        next_tokens.reserve(parameters.n_groups * parameters.group_size);
+        next_beams.reserve(parameters.n_groups * parameters.group_size);
+
         for (auto group = groups.begin(); group != groups.end(); ++group) {
             if (group->done) {
                 continue;
@@ -190,7 +232,7 @@ struct GroupBeamSearcher {
                     }
                 }
             }
-            std::vector<int64_t> full_text{parameters.prompt};
+            std::vector<int64_t> full_text{prompt};
             full_text.insert(full_text.end(), beam.tokens.begin(), beam.tokens.end());
             if (full_text.size() > 1 && full_text.size() >= parameters.no_repeat_ngram_size) {
                 auto tail_start = full_text.end() - ptrdiff_t(parameters.no_repeat_ngram_size) + 1;
@@ -251,16 +293,23 @@
 };

 // Consume group_beam_searcher because beams are consumed
-std::vector<std::vector<Beam>> finalize(GroupBeamSearcher&& group_beam_searcher) {
-    std::vector<std::vector<Beam>> finalized;
-    finalized.reserve(group_beam_searcher.groups.size());
-    for (Group& group : group_beam_searcher.groups) {
-        if (!group.done) {
-            for (Beam& beam : group.ongoing) {
-                group.finish(std::move(beam), group_beam_searcher.parameters);
+std::vector<std::vector<std::vector<Beam>>> finalize(GroupBeamSearcher&& group_beam_searcher) {
+    std::vector<std::vector<std::vector<Beam>>> finalized;
+    finalized.resize(group_beam_searcher.prompts_groups.size());
+
+    for (size_t prompt_id = 0; prompt_id < group_beam_searcher.prompts_groups.size(); prompt_id++) {
+        std::vector<Group>& groups = group_beam_searcher.prompts_groups.at(prompt_id);
+        finalized.at(prompt_id).reserve(groups.size());
+
+        for (Group& group : groups) {
+            if (!group.done) {
+                for (Beam& beam : group.ongoing) {
+                    group.finish(std::move(beam), group_beam_searcher.parameters);
+                }
             }
+            finalized.at(prompt_id).push_back(std::move(group.min_heap));
         }
-        finalized.push_back(std::move(group.min_heap));
     }
+
     return finalized;
 }
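
Note: pad_left() in the patch realigns the batch that the tokenizer returns right-padded, moving each row's trailing PAD tokens to the front so that every prompt ends at the last position of the sequence; this is what lets several prompts share one stateful LM request and one beam_idx per prompt. After this change the sample accepts any number of prompts, e.g. ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/ "Alan Turing was a" "return 0", as exercised by the workflow job above. Below is a minimal standalone sketch of the same left-padding idea on plain std::vector data; the pad id of 2 and the sample batch are assumptions for illustration, and it uses std::stable_partition instead of the swap loop from the patch, so it is not the sample's implementation.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Assumed pad id, mirroring SPECIAL_TOKEN::PAD_TOKEN in the sample above.
constexpr int64_t PAD = 2;

// Move a row's trailing PAD tokens to the front while keeping the real tokens
// in order, so the last element of the row is always a real token.
void pad_row_left(std::vector<int64_t>& ids, std::vector<int64_t>& mask) {
    if (ids.empty() || ids.back() != PAD) {
        return;  // row already ends with a real token, nothing to do
    }
    // stable_partition keeps relative order within each group: PADs first, then real tokens.
    std::stable_partition(ids.begin(), ids.end(), [](int64_t t) { return t == PAD; });
    std::stable_partition(mask.begin(), mask.end(), [](int64_t m) { return m == 0; });
}

int main() {
    // Hypothetical batch of two tokenized prompts padded to the same length.
    std::vector<std::vector<int64_t>> input_ids = {{5, 6, 7, PAD, PAD}, {8, 9, 10, 11, 12}};
    std::vector<std::vector<int64_t>> attention_mask = {{1, 1, 1, 0, 0}, {1, 1, 1, 1, 1}};

    for (size_t batch = 0; batch < input_ids.size(); ++batch) {
        pad_row_left(input_ids[batch], attention_mask[batch]);
    }

    for (const std::vector<int64_t>& row : input_ids) {
        for (int64_t token : row) {
            std::cout << token << ' ';
        }
        std::cout << '\n';  // prints "2 2 5 6 7" and "8 9 10 11 12"
    }
    return 0;
}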