diff --git a/llm_blender/blender/blender.py b/llm_blender/blender/blender.py
index e54869e..816b685 100755
--- a/llm_blender/blender/blender.py
+++ b/llm_blender/blender/blender.py
@@ -490,8 +490,8 @@ def fuse(
             _generations = self.fuser_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
             generations.extend(_generations)
         return generations
-
-    def best_of_n_generate(
+
+    def n_generate(
         self,
         model, # Union[transformers.PreTrainedModel, vllm.LLM]
         model_tokenizer:transformers.PreTrainedTokenizer,
@@ -500,14 +500,9 @@ def best_of_n_generate(
         n:int=5,
         sampling_mode:str="top_p_sampling",
         batch_size:int=4,
-        pairrm_cmp_type:str="bubble",
-        return_all:bool=False,
         **generate_kwargs:dict,
     ):
-        """Decoding enhance generate.
-        In this process, we will generate multiple generations for each input,
-        Then we will rank these generations and only return the top-k generations,
-        thus enhancing the quality of generations.
+        """Generate n candidates for each input by sampling, without ranking them.
 
         Args:
             model: Union[transformers.PreTrainedModel, vllm.LLM]
@@ -525,12 +520,6 @@ def best_of_n_generate(
                 if None, will use custom sampling strategy by generate_kwargs
             batch_size int:
                 batch size for generation
-            pairrm_cmp_type str: one of ['bubble', 'full']
-                - 'bubble': use a single run of bubble sort to get the best of n for quicker speed. Time complexity: O(n)
-                - 'full': use full pairwise comparison matrix to get the best of n. Time complexity: O(n^2)
-            return_all bool:
-                If True, will return all candidates instead of the best of n candidates
-                The returned candidates will be sorted by the ranker, where the first candidate is the best
             generate_kwargs:
                 kwargs for model.generate()
                 recommended kwargs:
@@ -541,9 +530,8 @@
                 - max_new_tokens: max length of the generation. If not specified, will use model_tokenizer.model_max_length
                 - top_k: if mode is "top_k_sampling", will use this top_k. if not specified, will use 50
                 - top_p: if mode is "top_p_sampling", will use this top_p. if not specified, will use 1.0
                 - temperature: temperature for sampling. if not specified, will use 0.7
                 Note that num_return_sequences will be set to n, so you don't need to specify it
 
         Returns:
-            best_candidates
-                - List[str]: Best candidates against the ranker for each input
-                - List[List[str]]: All candidates against the ranker for each input, when return_all is True
+            sampled_candidates
+                - List[List[str]]: The n sampled candidates for each input
         """
         assert len(inputs) == len(instructions) if instructions is not None else True, "Number of inputs and instructions must be the same if instructions is not None"
         if sampling_mode == "top_k_sampling":
@@ -595,6 +583,64 @@
             bz_outputs = model_tokenizer.batch_decode(bz_output_ids, skip_special_tokens=True)
             bz_sampled_candidates = [bz_outputs[i: i+n] for i in range(0, len(bz_outputs), n)]
             sampled_candidates.extend(bz_sampled_candidates)
+        return sampled_candidates
+
+    def best_of_n_generate(
+        self,
+        model, # Union[transformers.PreTrainedModel, vllm.LLM]
+        model_tokenizer:transformers.PreTrainedTokenizer,
+        inputs:List[str],
+        instructions:List[str]=None,
+        n:int=5,
+        sampling_mode:str="top_p_sampling",
+        batch_size:int=4,
+        pairrm_cmp_type:str="bubble",
+        return_all:bool=False,
+        **generate_kwargs:dict,
+    ):
+        """Decoding-enhanced generation (best-of-n).
+        In this process, we generate multiple candidates for each input,
+        then rank them with the ranker and return only the best one,
+        thus enhancing the quality of generations.
+
+        Args:
+            model: Union[transformers.PreTrainedModel, vllm.LLM]
+                Huggingface model that can generate with .generate(**generate_kwargs)
+            model_tokenizer:
+                Huggingface tokenizer that can tokenize with .__call__(**generate_kwargs)
+            inputs List[str]:
+                List of input texts
+            instructions List[str]:
+                List of instructions. If not None, will be prepended to the corresponding input
+            n int:
+                the n parameter in best-of-n. That is, how many samples to generate and rank for each input
+            sampling_mode:
+                "top_k_sampling" or "top_p_sampling"
+                if None, will use custom sampling strategy by generate_kwargs
+            batch_size int:
+                batch size for generation
+            pairrm_cmp_type str: one of ['bubble', 'full']
+                - 'bubble': use a single run of bubble sort to get the best of n for quicker speed. Time complexity: O(n)
+                - 'full': use the full pairwise comparison matrix to get the best of n. Time complexity: O(n^2)
+            return_all bool:
+                If True, will return all candidates instead of only the best one
+                The returned candidates will be sorted by the ranker, where the first candidate is the best
+            generate_kwargs:
+                kwargs for model.generate()
+                recommended kwargs:
+                - max_new_tokens: max length of the generation. If not specified, will use model_tokenizer.model_max_length
+                - top_k: if mode is "top_k_sampling", will use this top_k. If not specified, will use 50
+                - top_p: if mode is "top_p_sampling", will use this top_p. If not specified, will use 1.0
+                - temperature: temperature for sampling. If not specified, will use 0.7
+                Note that num_return_sequences will be set to n, so you don't need to specify it
+
+        Returns:
+            best_candidates
+                - List[str]: Best candidate against the ranker for each input
+                - List[List[str]]: All candidates sorted by the ranker for each input, when return_all is True
+        """
+        sampled_candidates = self.n_generate(model, model_tokenizer, inputs,
+            instructions=instructions, n=n, sampling_mode=sampling_mode, batch_size=batch_size, **generate_kwargs)
         best_of_n_outputs = self.get_best_of_n(inputs, sampled_candidates, instructions=instructions, batch_size=min(batch_size, 32),
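This patch splits sampling from ranking: `n_generate` only samples n candidates per input, while `best_of_n_generate` now delegates to it and then ranks via `get_best_of_n`. Below is a minimal usage sketch of the split API, not part of the patch: the `Blender()`/`loadranker("llm-blender/PairRM")` setup follows the repo README, while the base model name and generation settings are arbitrary placeholders.

```python
# Usage sketch (assumptions: llm-blender installed, PairRM as the ranker per
# the repo README; "gpt2" is just a placeholder base model).
import llm_blender
from transformers import AutoModelForCausalLM, AutoTokenizer

blender = llm_blender.Blender()
blender.loadranker("llm-blender/PairRM")  # ranker used by best_of_n_generate

model = AutoModelForCausalLM.from_pretrained("gpt2")  # placeholder model
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # gpt2 defines no pad token

instructions = ["Translate the sentence to French."]
inputs = ["The cat sits on the mat."]

# New method: sampling only. Returns List[List[str]], n candidates per input.
candidates = blender.n_generate(model, tokenizer, inputs,
                                instructions=instructions, n=4,
                                sampling_mode="top_p_sampling", max_new_tokens=32)

# Existing behaviour, now built on n_generate: sample n candidates, rank them
# with PairRM, and return the best one per input (List[str]).
best = blender.best_of_n_generate(model, tokenizer, inputs,
                                  instructions=instructions, n=4,
                                  pairrm_cmp_type="bubble", max_new_tokens=32)
```

A side benefit of the split is that callers can now keep the `n_generate` output and re-rank it later (e.g. with a different `pairrm_cmp_type`) without paying for generation again.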