diff --git a/src/transformers/quantizers/quantizer_bnb_4bit.py b/src/transformers/quantizers/quantizer_bnb_4bit.py
index eed45192e7ad9c..98d57e22524902 100644
--- a/src/transformers/quantizers/quantizer_bnb_4bit.py
+++ b/src/transformers/quantizers/quantizer_bnb_4bit.py
@@ -102,7 +102,7 @@ def validate_environment(self, *args, **kwargs):
                 raise ValueError(
                     "Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the "
                     "quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules "
-                    "in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to "
+                    "in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to "
                     "`from_pretrained`. Check "
                     "https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu "
                     "for more details. "
@@ -285,7 +285,7 @@ def _process_model_before_weight_loading(
     ):
         from ..integrations import get_keys_to_not_convert, replace_with_bnb_linear
 
-        load_in_8bit_fp32_cpu_offload = self.quantization_config.llm_int8_enable_fp32_cpu_offload
+        llm_int8_enable_fp32_cpu_offload = self.quantization_config.llm_int8_enable_fp32_cpu_offload
 
         # We keep some modules such as the lm_head in their original dtype for numerical stability reasons
         if self.quantization_config.llm_int8_skip_modules is None:
@@ -302,7 +302,7 @@ def _process_model_before_weight_loading(
         if isinstance(device_map, dict) and len(device_map.keys()) > 1:
             keys_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]]
 
-            if len(keys_on_cpu) > 0 and not load_in_8bit_fp32_cpu_offload:
+            if len(keys_on_cpu) > 0 and not llm_int8_enable_fp32_cpu_offload:
                 raise ValueError(
                     "If you want to offload some keys to `cpu` or `disk`, you need to set "
                     "`llm_int8_enable_fp32_cpu_offload=True`. Note that these modules will not be "
diff --git a/src/transformers/quantizers/quantizer_bnb_8bit.py b/src/transformers/quantizers/quantizer_bnb_8bit.py
index 020ff7cc6214a7..093d612b914cef 100644
--- a/src/transformers/quantizers/quantizer_bnb_8bit.py
+++ b/src/transformers/quantizers/quantizer_bnb_8bit.py
@@ -101,7 +101,7 @@ def validate_environment(self, *args, **kwargs):
                 raise ValueError(
                     "Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the "
                     "quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules "
-                    "in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to "
+                    "in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to "
                     "`from_pretrained`. Check "
                     "https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu "
                     "for more details. "
@@ -250,7 +250,7 @@ def _process_model_before_weight_loading(
     ):
         from ..integrations import get_keys_to_not_convert, replace_with_bnb_linear
 
-        load_in_8bit_fp32_cpu_offload = self.quantization_config.llm_int8_enable_fp32_cpu_offload
+        llm_int8_enable_fp32_cpu_offload = self.quantization_config.llm_int8_enable_fp32_cpu_offload
 
         # We keep some modules such as the lm_head in their original dtype for numerical stability reasons
         if self.quantization_config.llm_int8_skip_modules is None:
@@ -267,7 +267,7 @@ def _process_model_before_weight_loading(
         if isinstance(device_map, dict) and len(device_map.keys()) > 1:
             keys_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]]
 
-            if len(keys_on_cpu) > 0 and not load_in_8bit_fp32_cpu_offload:
+            if len(keys_on_cpu) > 0 and not llm_int8_enable_fp32_cpu_offload:
                 raise ValueError(
                     "If you want to offload some keys to `cpu` or `disk`, you need to set "
                     "`llm_int8_enable_fp32_cpu_offload=True`. Note that these modules will not be "