Skip to content

Commit

Permalink
device agnostic test_accelerator/test_multigpu (#2343)
Browse files (browse the repository at this point in the history)
  • Loading branch information
wangshuai09 authored Jan 18, 2024
1 parent f5c01ee commit ec4f01a
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 27 deletions.
29 changes: 17 additions & 12 deletions tests/test_accelerator.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -11,8 +11,8 @@
from accelerate import DistributedType, infer_auto_device_map, init_empty_weights, load_checkpoint_and_dispatch
from accelerate.accelerator import Accelerator
from accelerate.state import GradientState, PartialState
from accelerate.test_utils import require_bnb, require_multi_gpu, slow
from accelerate.test_utils.testing import AccelerateTestCase, require_cuda
from accelerate.test_utils import require_bnb, require_multi_device, require_non_cpu, slow, torch_device
from accelerate.test_utils.testing import AccelerateTestCase
from accelerate.utils import patch_environment
from accelerate.utils.modeling import load_checkpoint_in_model

Expand Down Expand Up @@ -55,11 +55,11 @@ def parameterized_custom_name_func(func, param_num, param):


class AcceleratorTester(AccelerateTestCase):
@require_cuda
@require_non_cpu
def test_accelerator_can_be_reinstantiated(self):
_ = Accelerator()
assert PartialState._shared_state["_cpu"] is False
assert PartialState._shared_state["device"].type == "cuda"
assert PartialState._shared_state["device"].type in ["cuda", "mps", "npu", "xpu"]
with self.assertRaises(ValueError):
_ = Accelerator(cpu=True)

Expand Down Expand Up @@ -326,12 +326,17 @@ def test_accelerator_bnb_cpu_error(self):

@slow
@require_bnb
@require_multi_gpu
def test_accelerator_bnb_multi_gpu(self):
@require_multi_device
def test_accelerator_bnb_multi_device(self):
"""Tests that the accelerator can be used with the BNB library."""
from transformers import AutoModelForCausalLM

PartialState._shared_state = {"distributed_type": DistributedType.MULTI_GPU}
if torch_device == "cuda":
PartialState._shared_state = {"distributed_type": DistributedType.MULTI_GPU}
elif torch_device == "npu":
PartialState._shared_state = {"distributed_type": DistributedType.MULTI_NPU}
else:
raise ValueError(f"{torch_device} is not supported in test_accelerator_bnb_multi_device.")

with init_empty_weights():
model = AutoModelForCausalLM.from_pretrained(
Expand All @@ -356,8 +361,8 @@ def test_accelerator_bnb_multi_gpu(self):

@slow
@require_bnb
@require_multi_gpu
def test_accelerator_bnb_multi_gpu_no_distributed(self):
@require_multi_device
def test_accelerator_bnb_multi_device_no_distributed(self):
"""Tests that the accelerator can be used with the BNB library."""
from transformers import AutoModelForCausalLM

Expand All @@ -378,21 +383,21 @@ def test_accelerator_bnb_multi_gpu_no_distributed(self):
# This should work
_ = accelerator.prepare(model)

@require_cuda
@require_non_cpu
def test_accelerator_cpu_flag_prepare(self):
model = torch.nn.Linear(10, 10)
sgd = torch.optim.SGD(model.parameters(), lr=0.01)
accelerator = Accelerator(cpu=True)
_ = accelerator.prepare(sgd)

@require_cuda
@require_non_cpu
def test_can_unwrap_model_fp16(self):
# test for a regression introduced in #872
# before the fix, after unwrapping with keep_fp32_wrapper=False, there would be the following error:
# Linear.forward() missing 1 required positional argument: 'input'
model = create_components()[0]
accelerator = Accelerator(mixed_precision="fp16")
inputs = torch.randn(10, 2).cuda()
inputs = torch.randn(10, 2).to(torch_device)
model = accelerator.prepare(model)
model(inputs) # sanity check that this works

Expand Down
30 changes: 15 additions & 15 deletions tests/test_multigpu.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -21,11 +21,11 @@
import accelerate
from accelerate import Accelerator
from accelerate.big_modeling import dispatch_model
from accelerate.test_utils import assert_exception, execute_subprocess_async, require_multi_gpu
from accelerate.test_utils import assert_exception, device_count, execute_subprocess_async, require_multi_device
from accelerate.utils import patch_environment


class MultiGPUTester(unittest.TestCase):
class MultiDeviceTester(unittest.TestCase):
def setUp(self):
mod_file = inspect.getfile(accelerate.test_utils)
self.test_file_path = os.path.sep.join(mod_file.split(os.path.sep)[:-1] + ["scripts", "test_script.py"])
Expand All @@ -34,35 +34,35 @@ def setUp(self):
)
self.operation_file_path = os.path.sep.join(mod_file.split(os.path.sep)[:-1] + ["scripts", "test_ops.py"])

@require_multi_gpu
def test_multi_gpu(self):
print(f"Found {torch.cuda.device_count()} devices.")
cmd = ["torchrun", f"--nproc_per_node={torch.cuda.device_count()}", self.test_file_path]
@require_multi_device
def test_multi_device(self):
print(f"Found {device_count} devices.")
cmd = ["torchrun", f"--nproc_per_node={device_count}", self.test_file_path]
with patch_environment(omp_num_threads=1):
execute_subprocess_async(cmd, env=os.environ.copy())

@require_multi_gpu
def test_multi_gpu_ops(self):
print(f"Found {torch.cuda.device_count()} devices.")
cmd = ["torchrun", f"--nproc_per_node={torch.cuda.device_count()}", self.operation_file_path]
@require_multi_device
def test_multi_device_ops(self):
print(f"Found {device_count} devices.")
cmd = ["torchrun", f"--nproc_per_node={device_count}", self.operation_file_path]
print(f"Command: {cmd}")
with patch_environment(omp_num_threads=1):
execute_subprocess_async(cmd, env=os.environ.copy())

@require_multi_gpu
@require_multi_device
def test_pad_across_processes(self):
cmd = ["torchrun", f"--nproc_per_node={torch.cuda.device_count()}", inspect.getfile(self.__class__)]
cmd = ["torchrun", f"--nproc_per_node={device_count}", inspect.getfile(self.__class__)]
with patch_environment(omp_num_threads=1):
execute_subprocess_async(cmd, env=os.environ.copy())

@require_multi_gpu
@require_multi_device
def test_distributed_data_loop(self):
"""
This TestCase checks the behaviour that occurs during distributed training or evaluation,
when the batch size does not evenly divide the dataset size.
"""
print(f"Found {torch.cuda.device_count()} devices, using 2 devices only")
cmd = ["torchrun", f"--nproc_per_node={torch.cuda.device_count()}", self.data_loop_file_path]
print(f"Found {device_count} devices, using 2 devices only")
cmd = ["torchrun", f"--nproc_per_node={device_count}", self.data_loop_file_path]
with patch_environment(omp_num_threads=1, cuda_visible_devices="0,1"):
execute_subprocess_async(cmd, env=os.environ.copy())

Expand Down

0 comments on commit ec4f01a

Please sign in to comment.