Fix warning message for fp32_cpu_offloading in bitsandbytes configs (#…

…34079) * change cpu offload warning for fp8 quantization * change cpu offload warning for fp4 quantization * change cpu offload variable name for fp8 and fp4 quantization
huggingface · Oct 17, 2024 · aa3e35a · aa3e35a
1 parent 6d2b203
commit aa3e35a
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 6 deletions.
diff --git a/src/transformers/quantizers/quantizer_bnb_4bit.py b/src/transformers/quantizers/quantizer_bnb_4bit.py
@@ -102,7 +102,7 @@ def validate_environment(self, *args, **kwargs):
                 raise ValueError(
                     "Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the "
                     "quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules "
-                    "in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to "
+                    "in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to "
                     "`from_pretrained`. Check "
                     "https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu "
                     "for more details. "
@@ -285,7 +285,7 @@ def _process_model_before_weight_loading(
     ):
         from ..integrations import get_keys_to_not_convert, replace_with_bnb_linear
 
-        load_in_8bit_fp32_cpu_offload = self.quantization_config.llm_int8_enable_fp32_cpu_offload
+        llm_int8_enable_fp32_cpu_offload = self.quantization_config.llm_int8_enable_fp32_cpu_offload
 
         # We keep some modules such as the lm_head in their original dtype for numerical stability reasons
         if self.quantization_config.llm_int8_skip_modules is None:
@@ -302,7 +302,7 @@ def _process_model_before_weight_loading(
         if isinstance(device_map, dict) and len(device_map.keys()) > 1:
             keys_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]]
 
-            if len(keys_on_cpu) > 0 and not load_in_8bit_fp32_cpu_offload:
+            if len(keys_on_cpu) > 0 and not llm_int8_enable_fp32_cpu_offload:
                 raise ValueError(
                     "If you want to offload some keys to `cpu` or `disk`, you need to set "
                     "`llm_int8_enable_fp32_cpu_offload=True`. Note that these modules will not be "

diff --git a/src/transformers/quantizers/quantizer_bnb_8bit.py b/src/transformers/quantizers/quantizer_bnb_8bit.py
@@ -101,7 +101,7 @@ def validate_environment(self, *args, **kwargs):
                 raise ValueError(
                     "Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the "
                     "quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules "
-                    "in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to "
+                    "in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to "
                     "`from_pretrained`. Check "
                     "https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu "
                     "for more details. "
@@ -250,7 +250,7 @@ def _process_model_before_weight_loading(
     ):
         from ..integrations import get_keys_to_not_convert, replace_with_bnb_linear
 
-        load_in_8bit_fp32_cpu_offload = self.quantization_config.llm_int8_enable_fp32_cpu_offload
+        llm_int8_enable_fp32_cpu_offload = self.quantization_config.llm_int8_enable_fp32_cpu_offload
 
         # We keep some modules such as the lm_head in their original dtype for numerical stability reasons
         if self.quantization_config.llm_int8_skip_modules is None:
@@ -267,7 +267,7 @@ def _process_model_before_weight_loading(
         if isinstance(device_map, dict) and len(device_map.keys()) > 1:
             keys_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]]
 
-            if len(keys_on_cpu) > 0 and not load_in_8bit_fp32_cpu_offload:
+            if len(keys_on_cpu) > 0 and not llm_int8_enable_fp32_cpu_offload:
                 raise ValueError(
                     "If you want to offload some keys to `cpu` or `disk`, you need to set "
                     "`llm_int8_enable_fp32_cpu_offload=True`. Note that these modules will not be "