Test CausalLM generate & pipeline (#110)
* add tests & standardize generate

* fix bugs & pipeline

* clean engine before test

* cleanup

* style

* test pipelines

* Quality

* fix tests

* style

* run nvidia-smi

* fix CI - can't compile for 32000 generated tokens on A10G

* fewer new tokens

* still fewer tokens or tests won't pass

* lets test 7b gemma as well

---------

Co-authored-by: Morgan Funtowicz <[email protected]>
fxmarty and mfuntowicz authored Apr 11, 2024
1 parent 746c445 commit 1065a8e
Showing 12 changed files with 354 additions and 92 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/pr_functional_tests.yml
@@ -55,6 +55,10 @@ jobs:
run: |
python3 -m pip install --upgrade -e .[quality,tests]
- name: Run nvidia-smi
run: |
nvidia-smi
- name: Run optimum-nvidia functional test-suite
run: |
pytest -n 4 -s -vvvvv -p no:warnings -o log_cli=true --ignore=tests/integration/ tests/
4 changes: 4 additions & 0 deletions .github/workflows/pr_integration_tests.yml
@@ -56,6 +56,10 @@ jobs:
run: |
python3 -m pip install --upgrade -e .[quality,tests]
- name: Run nvidia-smi
run: |
nvidia-smi
- name: Run optimum-nvidia integration test-suite
run: |
pytest -s -vvvvv -n 1 -p no:warnings -o log_cli=true tests/integration/
4 changes: 1 addition & 3 deletions examples/text-generation.py
@@ -79,7 +79,5 @@
max_new_tokens=args.max_new_tokens,
)

generated_text = tokenizer.batch_decode(
generated.flatten(0, 1), skip_special_tokens=True
)
generated_text = tokenizer.batch_decode(generated, skip_special_tokens=True)
print(generated_text)
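The example now decodes the returned batch directly. A minimal sketch of the updated flow, assuming the `AutoModelForCausalLM` entry point from the optimum-nvidia README and a hypothetical Gemma checkpoint; the real script builds the model, tokenizer, and generation arguments from its CLI flags.

```python
# Hedged sketch of the updated example flow; the checkpoint and token budget are
# illustrative, not taken from the script's actual CLI arguments.
from transformers import AutoTokenizer
from optimum.nvidia import AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")     # hypothetical checkpoint
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b")

prompt = "Describe a real-world application of AI."
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
generated = model.generate(**inputs, max_new_tokens=64)

# With the standardized generate, the output decodes as a (batch, sequence_length)
# batch, so the previous flatten(0, 1) over a beam dimension is no longer needed.
print(tokenizer.batch_decode(generated, skip_special_tokens=True))
```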
2 changes: 1 addition & 1 deletion src/optimum/nvidia/hub.py
@@ -403,7 +403,7 @@ def _save_pretrained(self, save_directory: Path) -> None:
"Please open-up an issue at https://github.com/huggingface/optimum-nvidia"
)

self.transformers_config.save_pretrained(save_directory)
self.config.save_pretrained(save_directory)
if self.generation_config is not None:
self.generation_config.save_pretrained(save_directory)

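The rename above exposes the Transformers configuration under the standard `config` attribute, which is what `_save_pretrained` serializes. A hedged sketch of what saving then looks like, with a hypothetical checkpoint and output path:

```python
# Hedged sketch; checkpoint name and output directory are illustrative.
from optimum.nvidia import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("google/gemma-2b")  # hypothetical checkpoint
model.save_pretrained("./gemma-2b-trt")                          # hypothetical output path

# `model.config` is the Transformers config written to config.json, alongside
# generation_config.json whenever a generation config is attached.
print(type(model.config).__name__)
```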
1 change: 1 addition & 0 deletions src/optimum/nvidia/models/gemma.py
@@ -75,6 +75,7 @@ def from_config(config: TransformersPretrainedConfig) -> "TensorRTConfig":
share_embedding_table=False,
max_lora_rank=64,
quantization=qconfig,
rotary_base=config.rope_theta,
)

trt_config.mapping.gpus_per_node = min(trt_config.mapping.world_size, 8)
1 change: 1 addition & 0 deletions src/optimum/nvidia/models/mistral.py
@@ -74,6 +74,7 @@ def from_config(config: TransformersPretrainedConfig) -> "TensorRTConfig":
max_lora_rank=64,
head_size=config.hidden_size / config.num_attention_heads,
quantization=qconfig,
rotary_base=config.rope_theta,
)

trt_config.mapping.gpus_per_node = min(trt_config.mapping.world_size, 8)
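Both the Gemma and Mistral conversions above now forward the RoPE base frequency from the Hugging Face config to TensorRT-LLM's `rotary_base` field. A minimal sketch of where that value comes from, using a hypothetical checkpoint:

```python
# Hedged sketch: `rope_theta` from the Transformers config is the value handed to
# TensorRT-LLM as `rotary_base`; the checkpoint name is illustrative.
from transformers import AutoConfig

hf_config = AutoConfig.from_pretrained("google/gemma-2b")  # hypothetical checkpoint
rotary_base = getattr(hf_config, "rope_theta", 10000.0)    # fall back to the common default
print(f"rotary_base={rotary_base}")
```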
18 changes: 8 additions & 10 deletions src/optimum/nvidia/models/whisper.py
@@ -689,7 +689,7 @@ def __init__(
generation_config = GenerationConfig()
self.generation_config = generation_config

self.transformers_config = transformers_config
self.config = transformers_config

# Encoder.
serialize_path = engines_folders[0] / "rank0.engine"
@@ -1078,7 +1078,7 @@ def generate(
def raise_unsupported(value: Any, name: str, default: Any = None):
if value != default:
raise ValueError(
f"TensorRTForSpeechSeq2Seq.generate does not support {name} (got {value}). Please open an issue at https://github.com/huggingface/optimum-nvidia/issues."
f"TensorRTForSpeechSeq2Seq.generate does not support the argument {name} (got {name}={value}). Please open an issue at https://github.com/huggingface/optimum-nvidia/issues."
)

raise_unsupported(stopping_criteria, name="stopping_criteria")
@@ -1109,7 +1109,7 @@ def raise_unsupported(value: Any, name: str, default: Any = None):
)
self._set_token_ids(
generation_config=generation_config,
config=self.transformers_config,
config=self.config,
kwargs=kwargs,
)
self._set_thresholds_and_condition(
@@ -1126,9 +1126,7 @@ def raise_unsupported(value: Any, name: str, default: Any = None):
batch_size, total_input_frames = self._retrieve_total_input_frames(
input_features=inputs, input_stride=input_stride, kwargs=kwargs
)
num_segment_frames = (
input_stride * self.transformers_config.max_source_positions
)
num_segment_frames = input_stride * self.config.max_source_positions
is_shortform = total_input_frames <= num_segment_frames
if not is_shortform:
raise ValueError(
@@ -1138,7 +1136,7 @@ def raise_unsupported(value: Any, name: str, default: Any = None):
init_tokens = self._retrieve_init_tokens(
inputs,
generation_config=generation_config,
config=self.transformers_config,
config=self.config,
num_segment_frames=num_segment_frames,
kwargs=kwargs,
)
@@ -1171,16 +1169,16 @@ def raise_unsupported(value: Any, name: str, default: Any = None):

if (
max_new_tokens + decoder_input_ids.shape[-1]
> self.transformers_config.max_target_positions
> self.config.max_target_positions
):
max_new_tokens = kwargs.get("max_new_tokens", 0)
raise ValueError(
f"The length of `decoder_input_ids` equal `prompt_ids` plus special start tokens is {decoder_input_ids.shape[-1]}, and the `max_new_tokens` "
f"is {max_new_tokens}. Thus, the combined length of "
f"`decoder_input_ids` and `max_new_tokens` is: {max_new_tokens + decoder_input_ids.shape[-1]}. This exceeds the "
f"`max_target_positions` of the Whisper model: {self.transformers_config.max_target_positions}. "
f"`max_target_positions` of the Whisper model: {self.config.max_target_positions}. "
"You should either reduce the length of your prompt, or reduce the value of `max_new_tokens`, "
f"so that their combined length is less than {self.transformers_config.max_target_positions}."
f"so that their combined length is less than {self.config.max_target_positions}."
)

encoder_input_lengths = torch.tensor(
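The error message above guards the Whisper decoder's context budget. A small worked illustration of that bound, with made-up numbers (448 is the usual `max_target_positions` for Whisper checkpoints):

```python
# Hedged illustration of the check enforced above; all numbers are examples only.
max_target_positions = 448   # decoder context length read from the model config
decoder_input_ids_len = 4    # prompt ids plus special start tokens
max_new_tokens = 440

combined = decoder_input_ids_len + max_new_tokens
if combined > max_target_positions:
    raise ValueError(
        f"Combined length {combined} exceeds max_target_positions={max_target_positions}; "
        "reduce the prompt or max_new_tokens."
    )
print(f"OK: {combined} <= {max_target_positions}")
```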
23 changes: 8 additions & 15 deletions src/optimum/nvidia/pipelines/text_generation.py
@@ -38,9 +38,6 @@ class TextGenerationPipeline(Pipeline):
__slots__ = (
"tokenizer",
"_runtime",
"_bos_token_id",
"_eos_token_id",
"_pad_token_id",
)

def __init__(self, model: CausalLM, tokenizer: PreTrainedTokenizer):
@@ -52,16 +49,15 @@ def __init__(self, model: CausalLM, tokenizer: PreTrainedTokenizer):
self.tokenizer = tokenizer
self._runtime = model

self._bos_token_id = tokenizer.bos_token_id
self._eos_token_id = tokenizer.eos_token_id
self._pad_token_id = tokenizer.pad_token_id

def __call__(self, inputs: Union[str, List[str]], **kwargs):
def __call__(
self, inputs: Union[str, List[str]], add_special_tokens: bool = True, **kwargs
):
(
preprocess_params,
forward_params,
postprocess_params,
) = self._sanitize_parameters(**kwargs)
) = self._sanitize_parameters(add_special_tokens=add_special_tokens, **kwargs)

model_inputs = self.preprocess(inputs, **preprocess_params)
model_outputs = self._forward(model_inputs, **forward_params)
outputs = self.postprocess(model_outputs, **postprocess_params)
@@ -147,7 +143,7 @@ def _forward(self, model_inputs, **generate_kwargs):
prompt_text = model_inputs.pop("prompt_text")
attention_mask = model_inputs.get("attention_mask", None)

max_new_tokens = generate_kwargs.pop("max_new_tokens", -1)
max_new_tokens = generate_kwargs.pop("max_new_tokens", None)
min_length = generate_kwargs.pop("min_length", -1)
num_beams = generate_kwargs.pop("num_beams", 1)
temperature = generate_kwargs.pop("temperature", 1.0)
@@ -188,9 +184,6 @@ def _forward(self, model_inputs, **generate_kwargs):
repetition_penalty=repetition_penalty,
length_penalty=length_penalty,
seed=seed,
bos_token_id=self._bos_token_id,
eos_token_id=self._eos_token_id,
pad_token_id=self._pad_token_id,
)

return {
@@ -243,13 +236,13 @@ def postprocess(

for sequence in generated_sequence:
# Decode text
beam_text = self.tokenizer.batch_decode(
text = self.tokenizer.decode(
sequence,
skip_special_tokens=True,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
)

record = {"generated_text": beam_text}
record = {"generated_text": text}
records.append(record)

return records
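With `add_special_tokens` now exposed on `__call__` and the special token ids resolved inside the model's `generate`, a hedged usage sketch of the reworked pipeline; the factory entry point and checkpoint name are assumptions based on the optimum-nvidia README:

```python
# Hedged usage sketch; the pipeline factory signature and checkpoint are assumptions.
from optimum.nvidia.pipelines import pipeline

pipe = pipeline("text-generation", "google/gemma-2b")  # hypothetical checkpoint
outputs = pipe(
    "Tell me about TensorRT-LLM.",
    add_special_tokens=True,  # now an explicit __call__ argument
    max_new_tokens=64,
)

# postprocess decodes each sequence with tokenizer.decode, so each record carries
# a plain string under "generated_text" instead of a nested list of beams.
print(outputs)
```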
