Test CausalLM generate & pipeline (#110)
* add tests & standardize generate

* fix bugs & pipeline

* clean engine before test

* cleanup

* style

* test pipelines

* Quality

* fix tests

* style

* run nvidia-smi

* fix CI - can't compile for 32000 generated tokens on A10G

* fewer new tokens

* still fewer tokens or tests won't pass

* lets test 7b gemma as well

---------

Co-authored-by: Morgan Funtowicz <[email protected]>
fxmarty and mfuntowicz authored Apr 11, 2024
1 parent 746c445 commit 1065a8e
Showing 12 changed files with 354 additions and 92 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/pr_functional_tests.yml
@@ -55,6 +55,10 @@ jobs:
run: |
python3 -m pip install --upgrade -e .[quality,tests]
- name: Run nvidia-smi
run: |
nvidia-smi
- name: Run optimum-nvidia functional test-suite
run: |
pytest -n 4 -s -vvvvv -p no:warnings -o log_cli=true --ignore=tests/integration/ tests/
4 changes: 4 additions & 0 deletions .github/workflows/pr_integration_tests.yml
@@ -56,6 +56,10 @@ jobs:
run: |
python3 -m pip install --upgrade -e .[quality,tests]
- name: Run nvidia-smi
run: |
nvidia-smi
- name: Run optimum-nvidia integration test-suite
run: |
pytest -s -vvvvv -n 1 -p no:warnings -o log_cli=true tests/integration/
4 changes: 1 addition & 3 deletions examples/text-generation.py
@@ -79,7 +79,5 @@
max_new_tokens=args.max_new_tokens,
)

generated_text = tokenizer.batch_decode(
generated.flatten(0, 1), skip_special_tokens=True
)
generated_text = tokenizer.batch_decode(generated, skip_special_tokens=True)
print(generated_text)
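The example now decodes the returned batch directly. A minimal sketch of the updated flow, assuming the `AutoModelForCausalLM` entry point from the optimum-nvidia README and a hypothetical Gemma checkpoint; the real script builds the model, tokenizer, and generation arguments from its CLI flags.

```python
# Hedged sketch of the updated example flow; the checkpoint and token budget are
# illustrative, not taken from the script's actual CLI arguments.
from transformers import AutoTokenizer
from optimum.nvidia import AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")     # hypothetical checkpoint
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b")

prompt = "Describe a real-world application of AI."
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
generated = model.generate(**inputs, max_new_tokens=64)

# With the standardized generate, the output decodes as a (batch, sequence_length)
# batch, so the previous flatten(0, 1) over a beam dimension is no longer needed.
print(tokenizer.batch_decode(generated, skip_special_tokens=True))
```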
2 changes: 1 addition & 1 deletion src/optimum/nvidia/hub.py
@@ -403,7 +403,7 @@ def _save_pretrained(self, save_directory: Path) -> None:
"Please open-up an issue at https://github.com/huggingface/optimum-nvidia"
)

self.transformers_config.save_pretrained(save_directory)
self.config.save_pretrained(save_directory)
if self.generation_config is not None:
self.generation_config.save_pretrained(save_directory)

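The rename above exposes the Transformers configuration under the standard `config` attribute, which is what `_save_pretrained` serializes. A hedged sketch of what saving then looks like, with a hypothetical checkpoint and output path:

```python
# Hedged sketch; checkpoint name and output directory are illustrative.
from optimum.nvidia import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("google/gemma-2b")  # hypothetical checkpoint
model.save_pretrained("./gemma-2b-trt")                          # hypothetical output path

# `model.config` is the Transformers config written to config.json, alongside
# generation_config.json whenever a generation config is attached.
print(type(model.config).__name__)
```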
1 change: 1 addition & 0 deletions src/optimum/nvidia/models/gemma.py
@@ -75,6 +75,7 @@ def from_config(config: TransformersPretrainedConfig) -> "TensorRTConfig":
share_embedding_table=False,
max_lora_rank=64,
quantization=qconfig,
rotary_base=config.rope_theta,
)

trt_config.mapping.gpus_per_node = min(trt_config.mapping.world_size, 8)
1 change: 1 addition & 0 deletions src/optimum/nvidia/models/mistral.py
@@ -74,6 +74,7 @@ def from_config(config: TransformersPretrainedConfig) -> "TensorRTConfig":
max_lora_rank=64,
head_size=config.hidden_size / config.num_attention_heads,
quantization=qconfig,
rotary_base=config.rope_theta,
)

trt_config.mapping.gpus_per_node = min(trt_config.mapping.world_size, 8)
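Both the Gemma and Mistral conversions above now forward the RoPE base frequency from the Hugging Face config to TensorRT-LLM's `rotary_base` field. A minimal sketch of where that value comes from, using a hypothetical checkpoint:

```python
# Hedged sketch: `rope_theta` from the Transformers config is the value handed to
# TensorRT-LLM as `rotary_base`; the checkpoint name is illustrative.
from transformers import AutoConfig

hf_config = AutoConfig.from_pretrained("google/gemma-2b")  # hypothetical checkpoint
rotary_base = getattr(hf_config, "rope_theta", 10000.0)    # fall back to the common default
print(f"rotary_base={rotary_base}")
```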
18 changes: 8 additions & 10 deletions src/optimum/nvidia/models/whisper.py
@@ -689,7 +689,7 @@ def __init__(
generation_config = GenerationConfig()
self.generation_config = generation_config

self.transformers_config = transformers_config
self.config = transformers_config

# Encoder.
serialize_path = engines_folders[0] / "rank0.engine"
@@ -1078,7 +1078,7 @@ def generate(
def raise_unsupported(value: Any, name: str, default: Any = None):
if value != default:
raise ValueError(
f"TensorRTForSpeechSeq2Seq.generate does not support {name} (got {value}). Please open an issue at https://github.com/huggingface/optimum-nvidia/issues."
f"TensorRTForSpeechSeq2Seq.generate does not support the argument {name} (got {name}={value}). Please open an issue at https://github.com/huggingface/optimum-nvidia/issues."
)

raise_unsupported(stopping_criteria, name="stopping_criteria")
@@ -1109,7 +1109,7 @@ def raise_unsupported(value: Any, name: str, default: Any = None):
)
self._set_token_ids(
generation_config=generation_config,
config=self.transformers_config,
config=self.config,
kwargs=kwargs,
)
self._set_thresholds_and_condition(
@@ -1126,9 +1126,7 @@ def raise_unsupported(value: Any, name: str, default: Any = None):
batch_size, total_input_frames = self._retrieve_total_input_frames(
input_features=inputs, input_stride=input_stride, kwargs=kwargs
)
num_segment_frames = (
input_stride * self.transformers_config.max_source_positions
)
num_segment_frames = input_stride * self.config.max_source_positions
is_shortform = total_input_frames <= num_segment_frames
if not is_shortform:
raise ValueError(
@@ -1138,7 +1136,7 @@ def raise_unsupported(value: Any, name: str, default: Any = None):
init_tokens = self._retrieve_init_tokens(
inputs,
generation_config=generation_config,
config=self.transformers_config,
config=self.config,
num_segment_frames=num_segment_frames,
kwargs=kwargs,
)
@@ -1171,16 +1169,16 @@ def raise_unsupported(value: Any, name: str, default: Any = None):

if (
max_new_tokens + decoder_input_ids.shape[-1]
> self.transformers_config.max_target_positions
> self.config.max_target_positions
):
max_new_tokens = kwargs.get("max_new_tokens", 0)
raise ValueError(
f"The length of `decoder_input_ids` equal `prompt_ids` plus special start tokens is {decoder_input_ids.shape[-1]}, and the `max_new_tokens` "
f"is {max_new_tokens}. Thus, the combined length of "
f"`decoder_input_ids` and `max_new_tokens` is: {max_new_tokens + decoder_input_ids.shape[-1]}. This exceeds the "
f"`max_target_positions` of the Whisper model: {self.transformers_config.max_target_positions}. "
f"`max_target_positions` of the Whisper model: {self.config.max_target_positions}. "
"You should either reduce the length of your prompt, or reduce the value of `max_new_tokens`, "
f"so that their combined length is less than {self.transformers_config.max_target_positions}."
f"so that their combined length is less than {self.config.max_target_positions}."
)

encoder_input_lengths = torch.tensor(
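The error message above guards the Whisper decoder's context budget. A small worked illustration of that bound, with made-up numbers (448 is the usual `max_target_positions` for Whisper checkpoints):

```python
# Hedged illustration of the check enforced above; all numbers are examples only.
max_target_positions = 448   # decoder context length read from the model config
decoder_input_ids_len = 4    # prompt ids plus special start tokens
max_new_tokens = 440

combined = decoder_input_ids_len + max_new_tokens
if combined > max_target_positions:
    raise ValueError(
        f"Combined length {combined} exceeds max_target_positions={max_target_positions}; "
        "reduce the prompt or max_new_tokens."
    )
print(f"OK: {combined} <= {max_target_positions}")
```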
23 changes: 8 additions & 15 deletions src/optimum/nvidia/pipelines/text_generation.py
@@ -38,9 +38,6 @@ class TextGenerationPipeline(Pipeline):
__slots__ = (
"tokenizer",
"_runtime",
"_bos_token_id",
"_eos_token_id",
"_pad_token_id",
)

def __init__(self, model: CausalLM, tokenizer: PreTrainedTokenizer):
@@ -52,16 +49,15 @@ def __init__(self, model: CausalLM, tokenizer: PreTrainedTokenizer):
self.tokenizer = tokenizer
self._runtime = model

self._bos_token_id = tokenizer.bos_token_id
self._eos_token_id = tokenizer.eos_token_id
self._pad_token_id = tokenizer.pad_token_id

def __call__(self, inputs: Union[str, List[str]], **kwargs):
def __call__(
self, inputs: Union[str, List[str]], add_special_tokens: bool = True, **kwargs
):
(
preprocess_params,
forward_params,
postprocess_params,
) = self._sanitize_parameters(**kwargs)
) = self._sanitize_parameters(add_special_tokens=add_special_tokens, **kwargs)

model_inputs = self.preprocess(inputs, **preprocess_params)
model_outputs = self._forward(model_inputs, **forward_params)
outputs = self.postprocess(model_outputs, **postprocess_params)
@@ -147,7 +143,7 @@ def _forward(self, model_inputs, **generate_kwargs):
prompt_text = model_inputs.pop("prompt_text")
attention_mask = model_inputs.get("attention_mask", None)

max_new_tokens = generate_kwargs.pop("max_new_tokens", -1)
max_new_tokens = generate_kwargs.pop("max_new_tokens", None)
min_length = generate_kwargs.pop("min_length", -1)
num_beams = generate_kwargs.pop("num_beams", 1)
temperature = generate_kwargs.pop("temperature", 1.0)
@@ -188,9 +184,6 @@ def _forward(self, model_inputs, **generate_kwargs):
repetition_penalty=repetition_penalty,
length_penalty=length_penalty,
seed=seed,
bos_token_id=self._bos_token_id,
eos_token_id=self._eos_token_id,
pad_token_id=self._pad_token_id,
)

return {
@@ -243,13 +236,13 @@ def postprocess(

for sequence in generated_sequence:
# Decode text
beam_text = self.tokenizer.batch_decode(
text = self.tokenizer.decode(
sequence,
skip_special_tokens=True,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
)

record = {"generated_text": beam_text}
record = {"generated_text": text}
records.append(record)

return records
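With `add_special_tokens` now exposed on `__call__` and the special token ids resolved inside the model's `generate`, a hedged usage sketch of the reworked pipeline; the factory entry point and checkpoint name are assumptions based on the optimum-nvidia README:

```python
# Hedged usage sketch; the pipeline factory signature and checkpoint are assumptions.
from optimum.nvidia.pipelines import pipeline

pipe = pipeline("text-generation", "google/gemma-2b")  # hypothetical checkpoint
outputs = pipe(
    "Tell me about TensorRT-LLM.",
    add_special_tokens=True,  # now an explicit __call__ argument
    max_new_tokens=64,
)

# postprocess decodes each sequence with tokenizer.decode, so each record carries
# a plain string under "generated_text" instead of a nested list of beams.
print(outputs)
```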
