From 32918eb94b23e18df256f45238f4f3bd988c5414 Mon Sep 17 00:00:00 2001 From: achew010 <165894159+achew010@users.noreply.github.com> Date: Tue, 27 Aug 2024 14:56:25 +0800 Subject: [PATCH] Fix formatter (#74) * formatted accelerated-peft Signed-off-by: 1000850000 user * formatted foak Signed-off-by: 1000850000 user --------- Signed-off-by: 1000850000 user --- .github/workflows/format.yml | 2 +- .../fms_acceleration_peft/autogptq_utils.py | 22 ++++++---- .../framework_plugin_autogptq.py | 44 +++++++++++++------ .../accelerated-peft/tests/test_gptqmodel.py | 10 ++--- .../tests/test_peft_plugins.py | 32 +++++++++----- .../accelerated-peft/tests/test_q4_triton.py | 3 ++ plugins/accelerated-peft/tests/test_triton.py | 4 +- .../README.md | 4 +- .../framework_plugin_fast_quantized_peft.py | 19 ++++++-- .../src/fms_acceleration_foak/models/llama.py | 13 +++--- .../fms_acceleration_foak/models/mistral.py | 14 +++--- .../fms_acceleration_foak/models/mixtral.py | 12 ++--- .../src/fms_acceleration_foak/models/utils.py | 2 +- .../tests/test_fused_ops.py | 4 +- 14 files changed, 116 insertions(+), 69 deletions(-) diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index 90f7210a..d2f9aea6 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -48,7 +48,7 @@ jobs: - name: Run formatter run: | cd plugins/${{ matrix.plugin_name }} - tox -e fmt + tox -e fmt -- . --check - name: Run pytest run: | cd plugins/${{ matrix.plugin_name }} diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/autogptq_utils.py b/plugins/accelerated-peft/src/fms_acceleration_peft/autogptq_utils.py index a62d0543..beda1a14 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/autogptq_utils.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/autogptq_utils.py @@ -16,20 +16,24 @@ # https://spdx.dev/learn/handling-license-info/ # Standard +from functools import partial from typing import Callable, List # Third Party +from fms_acceleration.model_patcher import ( + ModelPatcher, + ModelPatcherRule, + ModelPatcherTrigger, +) from peft import LoraConfig from peft.tuners.lora.gptq import QuantLinear as LoraLinearGPTQ import torch -from fms_acceleration.model_patcher import ModelPatcher, ModelPatcherRule, ModelPatcherTrigger -from functools import partial - # these parameters are to be patched for triton v2 # consider making a map if patching more kernels PATCH_FOR_FSDP_TRITON_V2 = ["qweight", "qzeros"] + def build_patch_to_view_tensor_to_parameter_for_fsdp_gptq( module, torch_dtype, @@ -38,9 +42,7 @@ def build_patch_to_view_tensor_to_parameter_for_fsdp_gptq( # so FSDP can shard them for attr_name in PATCH_FOR_FSDP_TRITON_V2: attr = getattr(module, attr_name) - attr = torch.nn.Parameter( - attr.view(torch_dtype), requires_grad=False - ) + attr = torch.nn.Parameter(attr.view(torch_dtype), requires_grad=False) setattr(module, attr_name, attr) # this patches the forward to convert them back to original @@ -51,18 +53,21 @@ def build_patch_to_view_tensor_to_parameter_for_fsdp_gptq( torch_dtype=torch.int32, # patch it back to ) + def register_tensors_as_parameters_patch_rule(target_module, torch_dtype): # Register patch ModelPatcher.register( ModelPatcherRule( rule_id="autogptq_patch_tensors_as_float_parameters", trigger=ModelPatcherTrigger(check=target_module), - forward_builder = partial( - build_patch_to_view_tensor_to_parameter_for_fsdp_gptq, torch_dtype=torch_dtype + forward_builder=partial( + build_patch_to_view_tensor_to_parameter_for_fsdp_gptq, + 
torch_dtype=torch_dtype, ), ) ) + def make_sure_no_tensor_in_meta_device( model, use_triton: bool, @@ -133,6 +138,7 @@ def create_new_module_peft( # if module cannot be found, return None which results in a raise in the call-stack return new_module + # consider to move this somewhere more general def patch_forward_to_view_attributes_before_call( old_forward: Callable, diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py index 8e95751d..be2404fe 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py @@ -35,6 +35,7 @@ # Local from .autogptq_utils import register_tensors_as_parameters_patch_rule + class AutoGPTQAccelerationPlugin(AccelerationPlugin): require_packages = [] @@ -57,11 +58,13 @@ def __init__(self, configurations: Dict[str, Dict]): ) if self.use_external_lib: - from transformers.utils.import_utils import _is_package_available # pylint: disable=import-outside-toplevel - assert ( - _is_package_available("auto_gptq") is True - ), ( - "Unable to use external library, auto_gptq module not found. " + # Third Party + from transformers.utils.import_utils import ( # pylint: disable=import-outside-toplevel + _is_package_available, + ) + + assert _is_package_available("auto_gptq") is True, ( + "Unable to use external library, auto_gptq module not found. " "Refer to README for installation instructions " "as a specific version might be required." ) @@ -71,19 +74,28 @@ def model_loader(self, model_name: str, **kwargs): # Third Party if self.use_external_lib: # Third Party - from auto_gptq import ( # pylint: disable=import-outside-toplevel,import-error - AutoGPTQForCausalLM as GPTQModel, - ) - from auto_gptq import BaseQuantizeConfig as QuantizeConfig # pylint: disable=import-outside-toplevel,import-error from auto_gptq.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error QuantLinear, ) + + from auto_gptq import ( # isort:skip pylint: disable=import-outside-toplevel,import-error + AutoGPTQForCausalLM as GPTQModel, + ) + from auto_gptq import ( # isort:skip pylint: disable=import-outside-toplevel,import-error + BaseQuantizeConfig as QuantizeConfig, + ) else: - from .gptqmodel import GPTQModel, QuantizeConfig # pylint: disable=import-outside-toplevel,import-error - from .gptqmodel.utils import Backend # pylint: disable=import-outside-toplevel,import-error - from .gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error + # Local + from .gptqmodel import ( # pylint: disable=import-outside-toplevel,import-error + GPTQModel, + QuantizeConfig, + ) + from .gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( # pylint: disable=import-outside-toplevel,import-error QuantLinear, ) + from .gptqmodel.utils import ( # pylint: disable=import-outside-toplevel,import-error + Backend, + ) # Currently we allow only a quantized checkpoint to be loaded, we do not # implement the quantization process here. 
@@ -141,7 +153,9 @@ def model_loader(self, model_name: str, **kwargs): kwargs["low_cpu_mem_usage"] = True if self.use_external_lib: # Local - from .autogptq_utils import make_sure_no_tensor_in_meta_device # pylint: disable=import-outside-toplevel + from .autogptq_utils import ( # pylint: disable=import-outside-toplevel + make_sure_no_tensor_in_meta_device, + ) # We patch `make_sure_no_tensor_in_meta_device` # from autogptq to avoid errors on models without bias @@ -250,7 +264,9 @@ def augmentation( ) else: # Local - from .gptqmodel.utils.peft import get_gptq_peft_model # pylint: disable=import-outside-toplevel,import-error + from .gptqmodel.utils.peft import ( # pylint: disable=import-outside-toplevel,import-error + get_gptq_peft_model, + ) (peft_config,) = modifiable_args # unpack modifiable args diff --git a/plugins/accelerated-peft/tests/test_gptqmodel.py b/plugins/accelerated-peft/tests/test_gptqmodel.py index 05fdadbc..e20db946 100644 --- a/plugins/accelerated-peft/tests/test_gptqmodel.py +++ b/plugins/accelerated-peft/tests/test_gptqmodel.py @@ -64,10 +64,10 @@ def load_autogptq_plugin_model( "peft": { "quantization": { "auto_gptq": { - "kernel": "triton_v2", - "from_quantized": True, + "kernel": "triton_v2", + "from_quantized": True, "use_external_lib": use_external_lib, - } + } } } }, @@ -292,10 +292,10 @@ def test_quantizing_pretrained_model_outputs_match( loss_fn = torch.nn.KLDivLoss(reduction="sum") # input should be a distribution in the log space input = torch.nn.functional.log_softmax(refactored_logits, dim=-1) - input = input.view(BS*SEQLEN, -1) + input = input.view(BS * SEQLEN, -1) # target must be prob distribution target = torch.nn.functional.softmax(original_logits, dim=-1) - target = target.view(BS*SEQLEN, -1) + target = target.view(BS * SEQLEN, -1) error = loss_fn(input, target) assert error.lt( LOSS_TOLERANCE diff --git a/plugins/accelerated-peft/tests/test_peft_plugins.py b/plugins/accelerated-peft/tests/test_peft_plugins.py index 6a5176ad..38534d5d 100644 --- a/plugins/accelerated-peft/tests/test_peft_plugins.py +++ b/plugins/accelerated-peft/tests/test_peft_plugins.py @@ -16,6 +16,7 @@ # https://spdx.dev/learn/handling-license-info/ # Standard +from unittest.mock import patch import os # Third Party @@ -26,7 +27,6 @@ update_configuration_contents, ) import pytest -from unittest.mock import patch MODEL_NAME_AUTO_GPTQ = "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ" @@ -89,8 +89,10 @@ def test_configure_gptq_plugin(): e.match(f"AutoGPTQAccelerationPlugin: Value at '{key}'") + def test_autogptq_loading(): "Test for correctness of autogptq loading logic" + def autogptq_unavailable(package_name: str): return False @@ -100,13 +102,12 @@ def autogptq_unavailable(package_name: str): # 3. check when using external package and it is not available, an AssertionError is thrown with pytest.raises( AssertionError, - match = "Unable to use external library, auto_gptq module not found. " - "Refer to README for installation instructions as a specific version might be required." + match="Unable to use external library, auto_gptq module not found. " + "Refer to README for installation instructions as a specific version might be required.", ): with patch( - "transformers.utils.import_utils." 
- "_is_package_available", - autogptq_unavailable, + "transformers.utils.import_utils._is_package_available", + autogptq_unavailable, ): with instantiate_framework( update_configuration_contents( @@ -118,7 +119,11 @@ def autogptq_unavailable(package_name: str): ) as framework: pass - from fms_acceleration_peft.framework_plugin_autogptq import AutoGPTQAccelerationPlugin # pylint: disable=import-outside-toplevel + # First Party + from fms_acceleration_peft.framework_plugin_autogptq import ( # pylint: disable=import-outside-toplevel + AutoGPTQAccelerationPlugin, + ) + # - Test that plugin attribute is set when config field `use_external_lib` is False # When plugin attribute is set correctly, it will route to correct package on model loading with instantiate_framework( @@ -131,21 +136,24 @@ def autogptq_unavailable(package_name: str): ) as framework: for _, plugin in framework.active_plugins: if isinstance(plugin, AutoGPTQAccelerationPlugin): - assert plugin.use_external_lib is False, \ - "Plugin attribute not correctly set from config field" + assert ( + plugin.use_external_lib is False + ), "Plugin attribute not correctly set from config field" # - Test that plugin attribute is set when config field `use_external_lib` is None # When plugin attribute is set correctly, it will route to correct package on model loading default_config = read_configuration(CONFIG_PATH_AUTO_GPTQ) - default_config['peft']['quantization']['auto_gptq'].pop('use_external_lib') + default_config["peft"]["quantization"]["auto_gptq"].pop("use_external_lib") with instantiate_framework( default_config, require_packages_check=False, ) as framework: for _, plugin in framework.active_plugins: if isinstance(plugin, AutoGPTQAccelerationPlugin): - assert plugin.use_external_lib is False, \ - "Plugin attribute not correctly set from config field" + assert ( + plugin.use_external_lib is False + ), "Plugin attribute not correctly set from config field" + # We do not enable the skip since this test does not actually require the packages # installed diff --git a/plugins/accelerated-peft/tests/test_q4_triton.py b/plugins/accelerated-peft/tests/test_q4_triton.py index 1201f342..9c68893e 100644 --- a/plugins/accelerated-peft/tests/test_q4_triton.py +++ b/plugins/accelerated-peft/tests/test_q4_triton.py @@ -29,15 +29,18 @@ CUDA_AVAILABLE = False if torch.cuda.is_available(): + # First Party from fms_acceleration_peft.gptqmodel import Backend, GPTQModel # noqa: E402 from fms_acceleration_peft.gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import ( # noqa: E402 QuantLinear as TritonV2QuantLinear, ) + CUDA_AVAILABLE = True GENERATE_EVAL_SIZE = 100 + class TestsQ4Triton(unittest.TestCase): @unittest.skipIf( CUDA_AVAILABLE is False, diff --git a/plugins/accelerated-peft/tests/test_triton.py b/plugins/accelerated-peft/tests/test_triton.py index aed7af12..0f02745d 100644 --- a/plugins/accelerated-peft/tests/test_triton.py +++ b/plugins/accelerated-peft/tests/test_triton.py @@ -31,8 +31,10 @@ CUDA_AVAILABLE = False if torch.cuda.is_available(): + # First Party from fms_acceleration_peft.gptqmodel import Backend, GPTQModel # noqa: E402 - CUDA_AVAILABLE = True + + CUDA_AVAILABLE = True MODEL_ID = "TheBloke/Llama-7B-GPTQ" DATASET_ID = "timdettmers/openassistant-guanaco" diff --git a/plugins/attention-and-distributed-packing/README.md b/plugins/attention-and-distributed-packing/README.md index 0ff6f2e4..1da9b612 100644 --- a/plugins/attention-and-distributed-packing/README.md +++ b/plugins/attention-and-distributed-packing/README.md @@ -23,10 +23,10 
@@ otherwise if `transformers < v4.44.0` the plugin will use an internal implementa To reproduce the benchmarks, simply run the following commands, Reproduce [Padding Free on A100 80GB](scripts/benchmarks/refs_orca/a100_80gb_pf.csv) -`bash scripts/run_benchmarks.sh "1 2" "4 8" benchmark_outputs scenarios-orca.yaml "none"` +`tox -e run-benches -- "1 2" "4 8" benchmark_outputs scenarios-orca.yaml "none"` Reproduce [MultiPack on A100 80GB](scripts/benchmarks/refs_orca/a100_80gb_mp.csv) -`bash scripts/run_benchmarks.sh "2 4 8" "16 32 64" benchmark_outputs scenarios-orca.yaml "padding-free"` +`tox -e run-benches -- "2 4 8" "16 32 64" benchmark_outputs scenarios-orca.yaml "padding-free"` ## Known Issues diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_quantized_peft.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_quantized_peft.py index d2abd5b1..ff67229c 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_quantized_peft.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_quantized_peft.py @@ -24,6 +24,7 @@ import torch import torch.distributed as dist + # consider moving this somewhere else later def lora_adapters_switch_ddp_from_fsdp(modules, fsdp_plugin): """ @@ -58,9 +59,20 @@ def _all_reduce_hook(grad): if not B.weight.is_cuda: set_module_tensor_to_device(B, "weight", "cuda") + def register_foak_model_patch_rules(base_type): - from fms_acceleration.model_patcher import ModelPatcher # pylint: disable=import-outside-toplevel - from .models import llama, mistral, mixtral # pylint: disable=import-outside-toplevel + # Third Party + from fms_acceleration.model_patcher import ( # pylint: disable=import-outside-toplevel + ModelPatcher, + ) + + # Local + from .models import ( # pylint: disable=import-outside-toplevel + llama, + mistral, + mixtral, + ) + rules = [ *llama.get_mp_rules(base_type), *mistral.get_mp_rules(base_type), @@ -69,6 +81,7 @@ def register_foak_model_patch_rules(base_type): for _rule in rules: ModelPatcher.register(_rule) + class FastQuantizedPeftAccelerationPlugin(AccelerationPlugin): # NOTE: may remove this when we have generic model rules @@ -122,7 +135,7 @@ def augmentation( ), "need to run in fp16 mixed precision or load model in fp16" # wrapper function to register foak patches - register_foak_model_patch_rules(base_type = self._base_layer) + register_foak_model_patch_rules(base_type=self._base_layer) return model, modifiable_args def get_callbacks_and_ready_for_train( diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/llama.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/llama.py index a934fc1e..58bb456f 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/llama.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/llama.py @@ -16,17 +16,17 @@ from functools import partial # Third Party -from transformers.models.llama.modeling_llama import ( - LlamaAttention, - LlamaMLP, - LlamaRMSNorm, -) from fms_acceleration.model_patcher import ( ModelPatcherRule, ModelPatcherTrigger, combine_functions, combine_triggers, ) +from transformers.models.llama.modeling_llama import ( + LlamaAttention, + LlamaMLP, + LlamaRMSNorm, +) # Local from ..kernels.unsloth.cross_entropy_loss import FastCrossEntropyLoss @@ -34,6 +34,7 @@ from ..kernels.unsloth.rope_embedding import fast_rope_embedding from .utils import KEY_MLP, KEY_O, KEY_QKV, build_lora_fused_ops, 
trigger_fused_ops + def get_mp_rules(base_type: str): """ Function to access all patch rules in this module. @@ -125,5 +126,5 @@ def get_mp_rules(base_type: str): fast_rope_embedding, None, ), - ) + ), ] diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mistral.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mistral.py index d090da5f..8e773a24 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mistral.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mistral.py @@ -16,18 +16,17 @@ from functools import partial # Third Party -from transformers.models.mistral.modeling_mistral import ( - MistralAttention, - MistralMLP, - MistralRMSNorm, -) from fms_acceleration.model_patcher import ( ModelPatcherRule, ModelPatcherTrigger, combine_functions, combine_triggers, ) - +from transformers.models.mistral.modeling_mistral import ( + MistralAttention, + MistralMLP, + MistralRMSNorm, +) # Local from ..kernels.unsloth.cross_entropy_loss import FastCrossEntropyLoss @@ -35,6 +34,7 @@ from ..kernels.unsloth.rope_embedding import fast_rope_embedding from .utils import KEY_MLP, KEY_O, KEY_QKV, build_lora_fused_ops, trigger_fused_ops + def get_mp_rules(base_type): """ Function to access all patch rules in this module. @@ -117,5 +117,5 @@ def get_mp_rules(base_type): fast_rope_embedding, None, ), - ) + ), ] diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mixtral.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mixtral.py index 7c0c58ab..67eada1c 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mixtral.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/mixtral.py @@ -16,24 +16,24 @@ from functools import partial # Third Party -from transformers.models.mixtral.modeling_mixtral import ( - MixtralAttention, - MixtralRMSNorm, -) from fms_acceleration.model_patcher import ( ModelPatcherRule, ModelPatcherTrigger, combine_functions, combine_triggers, ) +from transformers.models.mixtral.modeling_mixtral import ( + MixtralAttention, + MixtralRMSNorm, +) # Local from ..kernels.unsloth.cross_entropy_loss import FastCrossEntropyLoss from ..kernels.unsloth.rms_layernorm import fast_rms_layernorm from ..kernels.unsloth.rope_embedding import fast_rope_embedding - from .utils import KEY_O, KEY_QKV, build_lora_fused_ops, trigger_fused_ops + def get_mp_rules(base_type): """ Function to access all patch rules in this module. 
@@ -100,5 +100,5 @@ def get_mp_rules(base_type): fast_rope_embedding, None, ), - ) + ), ] diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/utils.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/utils.py index 9d624277..3653dc06 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/utils.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/models/utils.py @@ -4,6 +4,7 @@ import os # Third Party +from fms_acceleration.model_patcher import ModelPatcherTrigger import torch # Local @@ -16,7 +17,6 @@ from ..fused_ops.unsloth_lora.gptq.fast_lora import apply_lora_mlp as fused_op_mlp_gptq from ..fused_ops.unsloth_lora.gptq.fast_lora import apply_lora_o_v2 as fused_op_o_gptq from ..fused_ops.unsloth_lora.gptq.fast_lora import apply_lora_qkv as fused_op_qkv_gptq -from fms_acceleration.model_patcher import ModelPatcherTrigger KEY_QKV = "qkv" KEY_O = "o" diff --git a/plugins/fused-ops-and-kernels/tests/test_fused_ops.py b/plugins/fused-ops-and-kernels/tests/test_fused_ops.py index 237a3a6f..356c00b3 100644 --- a/plugins/fused-ops-and-kernels/tests/test_fused_ops.py +++ b/plugins/fused-ops-and-kernels/tests/test_fused_ops.py @@ -3,6 +3,7 @@ from itertools import product # Third Party +from fms_acceleration.model_patcher import patch_model from peft import LoraConfig from transformers import AutoConfig from transformers.models.llama.modeling_llama import LlamaAttention @@ -10,9 +11,6 @@ import pytest # pylint: disable=import-error import torch -# First Party -from fms_acceleration.model_patcher import patch_model - BNB = "bitsandbytes" GPTQ = "auto_gptq"
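
Note on the FSDP workaround whose formatting changes appear in the autogptq_utils.py hunks above: the plugin re-registers the Triton v2 GPTQ buffers `qweight` and `qzeros` (stored as int32) as non-trainable float-view `nn.Parameter`s so that FSDP will shard them, and wraps `forward` to view them back to int32 before the kernel runs. The sketch below is illustrative only — a simplified rendering of that pattern, not part of this patch, and the helper names used here are not part of the plugin's API.

# Illustrative sketch only: simplified from the pattern in autogptq_utils.py.
# Third Party
import torch

# int32 buffers used by the Triton v2 GPTQ kernel; FSDP shards nn.Parameters,
# not plain buffers, so these are temporarily exposed as float-view parameters.
PATCH_FOR_FSDP_TRITON_V2 = ["qweight", "qzeros"]


def view_quant_buffers_as_parameters(module: torch.nn.Module, torch_dtype: torch.dtype):
    # Reinterpret the int32 storage as torch_dtype (e.g. torch.float16) and
    # wrap it in a frozen Parameter so FSDP treats it like a regular weight.
    for attr_name in PATCH_FOR_FSDP_TRITON_V2:
        attr = getattr(module, attr_name)
        setattr(
            module,
            attr_name,
            torch.nn.Parameter(attr.view(torch_dtype), requires_grad=False),
        )


def patch_forward_to_view_back(module: torch.nn.Module, original_dtype: torch.dtype = torch.int32):
    # Wrap forward so the quantized kernel still sees int32 qweight/qzeros.
    old_forward = module.forward

    def new_forward(*args, **kwargs):
        for attr_name in PATCH_FOR_FSDP_TRITON_V2:
            param = getattr(module, attr_name)
            param.data = param.data.view(original_dtype)
        return old_forward(*args, **kwargs)

    module.forward = new_forward

In the plugin itself this is wired up via ModelPatcher.register(ModelPatcherRule(..., forward_builder=partial(build_patch_to_view_tensor_to_parameter_for_fsdp_gptq, torch_dtype=torch_dtype))), as shown in the register_tensors_as_parameters_patch_rule hunk above.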