diff --git a/requirements-common.txt b/requirements-common.txt
index aa165ff6d6a5e..1178143409e2e 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -31,3 +31,4 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
+compressed-tensors == 0.6.0 # required for compressed-tensors
diff --git a/requirements-test.txt b/requirements-test.txt
index 997df9afac763..9787fa2a4a486 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -17,7 +17,6 @@ requests
 ray[adag]==2.35
 sentence-transformers # required for embedding
 soundfile # required for audio test
-compressed-tensors==0.4.0 # required for compressed-tensors
 timm # required for internvl test
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 5cdb8a8e82280..03097569b2b3b 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -6,13 +6,12 @@
 
 import pytest
 import torch
+from compressed_tensors.quantization import QuantizationType
 
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
     CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
     CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
     CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    QuantizationType)
 
 
 @pytest.mark.parametrize(
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index abb18d31b5a82..a371f1f4ad2cb 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -1,6 +1,10 @@
 from typing import Any, Dict, List, Optional, cast
 
 import torch
+from compressed_tensors.config import CompressionFormat
+from compressed_tensors.quantization import (QuantizationArgs,
+                                             QuantizationStrategy,
+                                             QuantizationType)
 from pydantic import BaseModel
 
 from vllm.model_executor.layers.fused_moe import FusedMoE
@@ -16,8 +20,7 @@
     CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
     CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    CompressionFormat, QuantizationArgs, QuantizationStrategy,
-    QuantizationType, find_matched_target, is_activation_quantization_format,
+    find_matched_target, is_activation_quantization_format,
     should_ignore_layer)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import current_platform
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index af04d725159f9..733eece4b5fa6 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -3,14 +3,14 @@
 from typing import Callable, List, Optional
 
 import torch
+from compressed_tensors import CompressionFormat
+from compressed_tensors.quantization import QuantizationStrategy
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
                                                   FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     WNA16_SUPPORTED_BITS)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    CompressionFormat, QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
 from vllm.model_executor.utils import set_weight_attrs
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
index 3d55d55cc390d..1671a23d77c63 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
@@ -1,11 +1,10 @@
 from typing import Callable, List, Optional
 
 import torch
+from compressed_tensors.quantization import QuantizationStrategy
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index 5931ec36c97d5..7270b302ef965 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -1,12 +1,11 @@
 from typing import Callable, List, Optional
 
 import torch
+from compressed_tensors.quantization import QuantizationStrategy
 from torch.nn import Parameter
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     apply_fp8_linear, cutlass_fp8_supported, normalize_e4m3fn_to_e4m3fnuz,
     requantize_with_max_scale)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index 245a35c8783a2..15d9cdbcbb86b 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -1,13 +1,12 @@
 from typing import Callable, List, Optional
 
 import torch
+from compressed_tensors.quantization import QuantizationStrategy
 from torch.nn import Parameter
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     apply_int8_linear, convert_to_channelwise)
 from vllm.model_executor.parameter import (BasevLLMParameter,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
index cb65557be8f90..a515738017781 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -1,12 +1,11 @@
 from typing import Callable, List, Optional, Set
 
 import torch
+from compressed_tensors.quantization import ActivationOrdering
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    ActivationOrdering)
 from vllm.model_executor.layers.quantization.kernels import (
     MPLinearLayerConfig, choose_mp_linear_kernel)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index fc531b9d666e3..a74eaef5efdee 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -1,111 +1,13 @@
 import re
-from enum import Enum
-from typing import Any, Dict, Iterable, Optional, Union
+from typing import Iterable, Optional
 
-from pydantic import BaseModel, Field, field_validator
+from compressed_tensors import CompressionFormat
 from torch.nn import Module
 
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     FUSED_LAYER_NAME_MAPPING)
 
 
-class CompressionFormat(Enum):
-    dense = "dense"
-    sparse_bitmask = "sparse-bitmask"
-    naive_quantized = "naive-quantized"
-    float_quantized = "float-quantized"
-    int_quantized = "int-quantized"
-    pack_quantized = "pack-quantized"
-    marlin_24 = "marlin-24"
-
-
-class QuantizationType(str, Enum):
-    """
-    Enum storing quantization type options
-    """
-
-    INT = "int"
-    FLOAT = "float"
-
-
-class QuantizationStrategy(str, Enum):
-    """
-    Enum storing quantization strategy options
-    """
-
-    TENSOR = "tensor"
-    CHANNEL = "channel"
-    GROUP = "group"
-    BLOCK = "block"
-    TOKEN = "token"
-
-
-class ActivationOrdering(str, Enum):
-    """
-    Enum storing strategies for activation ordering
-
-    Group: reorder groups and weight\n
-    Weight: only reorder weight, not groups. Slightly lower latency and
-    accuracy compared to group actorder\n
-    """
-
-    GROUP = "group"
-    WEIGHT = "weight"
-
-
-class QuantizationArgs(BaseModel):
-    """
-    User facing arguments used to define a quantization config
-    for weights or activations
-
-    :param num_bits: quantization bit depth
-    :param type: dtype to quantized to, either int or float
-    :param symmetric: whether or not quantization scale is symmetric
-    :param strategy: string determining the scope of scale/zero-point to apply
-    :param group_size: group length to use for the group strategy
-    :param block_structure: 2d block structure to use for the block
-        strategy, must be of the format "2x4", "8x16", etc.
-    :param dynamic: set True to perform dynamic quantization -
-        values will not be calibrated during calibration phase,
-        instead during inference new quantization ranges will be
-        observed with every sample. Defaults to False for static
-        quantization. Note that enabling dynamic quantization
-        will change the default observer to a memoryless one
-    :param actorder: whether to apply group quantization in decreasing order of
-        activation. Defaults to None for arbitrary ordering
-    """
-
-    num_bits: int = 8
-    type: QuantizationType = QuantizationType.INT
-    symmetric: bool = True
-    group_size: Optional[int] = None
-    strategy: Optional[QuantizationStrategy] = None
-    block_structure: Optional[str] = None
-    dynamic: bool = False
-    actorder: Union[ActivationOrdering, bool, None] = None
-    observer: str = Field(
-        default="minmax",
-        description=("The class to use to compute the quantization param - "
-                     "scale and zero-point'"),
-    )
-    observer_kwargs: Dict[str, Any] = Field(
-        default_factory=dict,
-        description=
-        ("optional dict of kwargs to be passed directly to torch quantization "
-         "Observers constructor excluding quantization range or symmetry"),
-    )
-
-    @field_validator("actorder", mode="before")
-    def validate_actorder(cls, value) -> Optional[ActivationOrdering]:
-        if isinstance(value, bool):
-            return ActivationOrdering.GROUP if value else None
-
-        if isinstance(value, str):
-            return ActivationOrdering(value.lower())
-
-        return value
-
-
 def is_activation_quantization_format(format: str) -> bool:
     _ACTIVATION_QUANTIZATION_FORMATS = [
         CompressionFormat.naive_quantized.value,