[BugFix] Stop silent failures on compressed-tensors parsing (vllm-project#9381)
ZeroYuJie · Oct 18, 2024 · 48138a8 · 48138a8
1 parent 343f8e0
commit 48138a8
Show file tree

Hide file tree

Showing 2 changed files with 23 additions and 13 deletions.
diff --git a/requirements-common.txt b/requirements-common.txt
@@ -31,4 +31,4 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.6.0 # required for compressed-tensors
+compressed-tensors == 0.7.1 # required for compressed-tensors
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -100,12 +100,21 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig":
                 target_scheme_map[target][
                     "weights"] = QuantizationArgs.parse_obj(
                         quant_config.get("weights"))
-                try:
-                    target_scheme_map[target][
-                        "input_activations"] = QuantizationArgs.parse_obj(
-                            quant_config.get("input_activations"))
-                except Exception:
-                    target_scheme_map[target]["input_activations"] = None
+
+                target_scheme_map[target]["input_activations"] = None
+                if is_activation_quantization_format(quant_format):
+                    input_activations = quant_config.get("input_activations")
+                    # The only case where we have activation quant supported
+                    # but no input_activations provided in the config
+                    # should be w8a16fp8. w8a16fp8 can also run for cases where
+                    # there is an input_quant, but it is ignored.
+                    if not input_activations:
+                        assert target_scheme_map[target][
+                            "weights"].type == QuantizationType.FLOAT
+                    else:
+                        target_scheme_map[target][
+                            "input_activations"] = QuantizationArgs.parse_obj(
+                                quant_config.get("input_activations"))
 
         return cls(target_scheme_map=target_scheme_map,
                    ignore=ignore,
@@ -244,8 +253,6 @@ def _get_scheme_from_parts(
                     group_size=weight_quant.group_size,
                     actorder=weight_quant.actorder)
 
-        # Detect If Activation Quantization.
-        # TODO @dsikka: clean-up conditions
         if is_activation_quantization_format(self.quant_format):
             if self._is_fp8_w8a8(weight_quant, input_quant):
                 is_fp8_w8a8_supported = self._check_scheme_supported(
@@ -256,16 +263,19 @@ def _get_scheme_from_parts(
                         is_static_input_scheme=(input_quant
                                                 and not input_quant.dynamic))
                 else:
+                    # note: input_quant will be present for converted models;
+                    # will be ignored during inference post loading
                     return CompressedTensorsW8A16Fp8(
                         strategy=weight_quant.strategy,
-                        is_static_input_scheme=(input_quant
-                                                and not input_quant.dynamic))
+                        is_static_input_scheme=not input_quant.dynamic)
 
+            # note: input_quant can be None
             if self._is_fp8_w8a16(weight_quant, input_quant):
+                is_static_input_scheme = (input_quant
+                                          and not input_quant.dynamic)
                 return CompressedTensorsW8A16Fp8(
                     strategy=weight_quant.strategy,
-                    is_static_input_scheme=(input_quant
-                                            and not input_quant.dynamic))
+                    is_static_input_scheme=is_static_input_scheme)
 
             if self._is_static_tensor_w8a8(weight_quant, input_quant):
                 return CompressedTensorsW8A8Int8(