From 9454f437b0ba87e29cae9706cee6be6bb88d28d6 Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Mon, 17 Jun 2024 22:42:57 +0800 Subject: [PATCH] [tests] make `TestDeepSpeedModelZoo` device-agnostic (#31402) * fix * use accelerator device count * ci fix --- src/transformers/testing_utils.py | 4 ++++ tests/deepspeed/test_model_zoo.py | 9 +++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index b972d285aed3e5..8dda057f1b9d76 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -2432,6 +2432,10 @@ def _device_agnostic_dispatch(device: str, dispatch_table: Dict[str, Callable], BACKEND_MANUAL_SEED = {"cuda": torch.cuda.manual_seed, "cpu": torch.manual_seed, "default": torch.manual_seed} BACKEND_EMPTY_CACHE = {"cuda": torch.cuda.empty_cache, "cpu": None, "default": None} BACKEND_DEVICE_COUNT = {"cuda": torch.cuda.device_count, "cpu": lambda: 0, "default": lambda: 1} +else: + BACKEND_MANUAL_SEED = {"default": None} + BACKEND_EMPTY_CACHE = {"default": None} + BACKEND_DEVICE_COUNT = {"default": lambda: 0} def backend_manual_seed(device: str, seed: int): diff --git a/tests/deepspeed/test_model_zoo.py b/tests/deepspeed/test_model_zoo.py index ea002f5ddf291e..043b25d693782e 100644 --- a/tests/deepspeed/test_model_zoo.py +++ b/tests/deepspeed/test_model_zoo.py @@ -23,12 +23,13 @@ from transformers import is_torch_available from transformers.testing_utils import ( TestCasePlus, + backend_device_count, execute_subprocess_async, - get_gpu_count, get_tests_dir, require_deepspeed, - require_torch_gpu, + require_torch_accelerator, slow, + torch_device, ) from transformers.trainer_utils import set_seed @@ -143,7 +144,7 @@ def get_launcher(distributed=False): # - it won't be able to handle that # 2. for now testing with just 2 gpus max (since some quality tests may give different # results with mode gpus because we use very little data) - num_gpus = min(2, get_gpu_count()) if distributed else 1 + num_gpus = min(2, backend_device_count(torch_device)) if distributed else 1 master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT) return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split() @@ -326,7 +327,7 @@ def parameterized_custom_name_func(func, param_num, param): @slow @require_deepspeed -@require_torch_gpu +@require_torch_accelerator class TestDeepSpeedModelZoo(TestCasePlus): """This class is for testing via an external script - can do multiple gpus"""