diff --git a/tests/test_accelerator.py b/tests/test_accelerator.py
index e66f19c171e..054b96214bf 100644
--- a/tests/test_accelerator.py
+++ b/tests/test_accelerator.py
@@ -11,8 +11,8 @@
 from accelerate import DistributedType, infer_auto_device_map, init_empty_weights, load_checkpoint_and_dispatch
 from accelerate.accelerator import Accelerator
 from accelerate.state import GradientState, PartialState
-from accelerate.test_utils import require_bnb, require_multi_gpu, slow
-from accelerate.test_utils.testing import AccelerateTestCase, require_cuda
+from accelerate.test_utils import require_bnb, require_multi_device, require_non_cpu, slow, torch_device
+from accelerate.test_utils.testing import AccelerateTestCase
 from accelerate.utils import patch_environment
 from accelerate.utils.modeling import load_checkpoint_in_model
 
@@ -55,11 +55,11 @@ def parameterized_custom_name_func(func, param_num, param):
 
 
 class AcceleratorTester(AccelerateTestCase):
-    @require_cuda
+    @require_non_cpu
     def test_accelerator_can_be_reinstantiated(self):
         _ = Accelerator()
         assert PartialState._shared_state["_cpu"] is False
-        assert PartialState._shared_state["device"].type == "cuda"
+        assert PartialState._shared_state["device"].type in ["cuda", "mps", "npu", "xpu"]
         with self.assertRaises(ValueError):
             _ = Accelerator(cpu=True)
 
@@ -326,12 +326,17 @@ def test_accelerator_bnb_cpu_error(self):
 
     @slow
     @require_bnb
-    @require_multi_gpu
-    def test_accelerator_bnb_multi_gpu(self):
+    @require_multi_device
+    def test_accelerator_bnb_multi_device(self):
         """Tests that the accelerator can be used with the BNB library."""
         from transformers import AutoModelForCausalLM
 
-        PartialState._shared_state = {"distributed_type": DistributedType.MULTI_GPU}
+        if torch_device == "cuda":
+            PartialState._shared_state = {"distributed_type": DistributedType.MULTI_GPU}
+        elif torch_device == "npu":
+            PartialState._shared_state = {"distributed_type": DistributedType.MULTI_NPU}
+        else:
+            raise ValueError(f"{torch_device} is not supported in test_accelerator_bnb_multi_device.")
 
         with init_empty_weights():
             model = AutoModelForCausalLM.from_pretrained(
@@ -356,8 +361,8 @@ def test_accelerator_bnb_multi_gpu(self):
 
     @slow
     @require_bnb
-    @require_multi_gpu
-    def test_accelerator_bnb_multi_gpu_no_distributed(self):
+    @require_multi_device
+    def test_accelerator_bnb_multi_device_no_distributed(self):
         """Tests that the accelerator can be used with the BNB library."""
         from transformers import AutoModelForCausalLM
 
@@ -378,21 +383,21 @@ def test_accelerator_bnb_multi_gpu_no_distributed(self):
         # This should work
         _ = accelerator.prepare(model)
 
-    @require_cuda
+    @require_non_cpu
     def test_accelerator_cpu_flag_prepare(self):
         model = torch.nn.Linear(10, 10)
         sgd = torch.optim.SGD(model.parameters(), lr=0.01)
         accelerator = Accelerator(cpu=True)
         _ = accelerator.prepare(sgd)
 
-    @require_cuda
+    @require_non_cpu
     def test_can_unwrap_model_fp16(self):
         # test for a regression introduced in #872
         # before the fix, after unwrapping with keep_fp32_wrapper=False, there would be the following error:
         # Linear.forward() missing 1 required positional argument: 'input'
         model = create_components()[0]
         accelerator = Accelerator(mixed_precision="fp16")
-        inputs = torch.randn(10, 2).cuda()
+        inputs = torch.randn(10, 2).to(torch_device)
         model = accelerator.prepare(model)
         model(inputs)  # sanity check that this works
 
diff --git a/tests/test_multigpu.py b/tests/test_multigpu.py
index 20a9e5a87cb..140ed7f8247 100644
--- a/tests/test_multigpu.py
+++ b/tests/test_multigpu.py
@@ -21,11 +21,11 @@
 import accelerate
 from accelerate import Accelerator
 from accelerate.big_modeling import dispatch_model
-from accelerate.test_utils import assert_exception, execute_subprocess_async, require_multi_gpu
+from accelerate.test_utils import assert_exception, device_count, execute_subprocess_async, require_multi_device
 from accelerate.utils import patch_environment
 
 
-class MultiGPUTester(unittest.TestCase):
+class MultiDeviceTester(unittest.TestCase):
     def setUp(self):
         mod_file = inspect.getfile(accelerate.test_utils)
         self.test_file_path = os.path.sep.join(mod_file.split(os.path.sep)[:-1] + ["scripts", "test_script.py"])
@@ -34,35 +34,36 @@ def setUp(self):
         )
         self.operation_file_path = os.path.sep.join(mod_file.split(os.path.sep)[:-1] + ["scripts", "test_ops.py"])
 
-    @require_multi_gpu
-    def test_multi_gpu(self):
-        print(f"Found {torch.cuda.device_count()} devices.")
-        cmd = ["torchrun", f"--nproc_per_node={torch.cuda.device_count()}", self.test_file_path]
+    @require_multi_device
+    def test_multi_device(self):
+        print(f"Found {device_count} devices.")
+        cmd = ["torchrun", f"--nproc_per_node={device_count}", self.test_file_path]
         with patch_environment(omp_num_threads=1):
             execute_subprocess_async(cmd, env=os.environ.copy())
 
-    @require_multi_gpu
-    def test_multi_gpu_ops(self):
-        print(f"Found {torch.cuda.device_count()} devices.")
-        cmd = ["torchrun", f"--nproc_per_node={torch.cuda.device_count()}", self.operation_file_path]
+    @require_multi_device
+    def test_multi_device_ops(self):
+        print(f"Found {device_count} devices.")
+        cmd = ["torchrun", f"--nproc_per_node={device_count}", self.operation_file_path]
         print(f"Command: {cmd}")
         with patch_environment(omp_num_threads=1):
             execute_subprocess_async(cmd, env=os.environ.copy())
 
-    @require_multi_gpu
+    @require_multi_device
     def test_pad_across_processes(self):
-        cmd = ["torchrun", f"--nproc_per_node={torch.cuda.device_count()}", inspect.getfile(self.__class__)]
+        cmd = ["torchrun", f"--nproc_per_node={device_count}", inspect.getfile(self.__class__)]
         with patch_environment(omp_num_threads=1):
             execute_subprocess_async(cmd, env=os.environ.copy())
 
-    @require_multi_gpu
+    @require_multi_device
     def test_distributed_data_loop(self):
         """
         This TestCase checks the behaviour that occurs during distributed training or evaluation,
         when the batch size does not evenly divide the dataset size.
         """
-        print(f"Found {torch.cuda.device_count()} devices, using 2 devices only")
-        cmd = ["torchrun", f"--nproc_per_node={torch.cuda.device_count()}", self.data_loop_file_path]
+        print(f"Found {device_count} devices, using 2 devices only")
+        # cuda_visible_devices="0,1" below exposes two devices, so launch exactly two workers.
+        cmd = ["torchrun", "--nproc_per_node=2", self.data_loop_file_path]
         with patch_environment(omp_num_threads=1, cuda_visible_devices="0,1"):
             execute_subprocess_async(cmd, env=os.environ.copy())
 