Replace DistributedType.TPU with DistributedType.XLA
anw90 committed Dec 4, 2023
1 parent 9c12351 commit 45f44d6
Showing 31 changed files with 89 additions and 72 deletions.
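Downstream code that branches on the backend sees this rename through `Accelerator.distributed_type`. The snippet below is a hypothetical compatibility sketch (the `getattr` fallback is not part of this commit) for scripts that must run against accelerate versions on either side of the change:

```python
from accelerate import Accelerator, DistributedType

accelerator = Accelerator()

# DistributedType.XLA is the new name introduced here; older accelerate
# releases only expose DistributedType.TPU, so fall back to it if needed.
XLA_TYPE = getattr(DistributedType, "XLA", getattr(DistributedType, "TPU", None))

if accelerator.distributed_type == XLA_TYPE:
    # Keep tensor shapes static so the XLA compiler does not recompile every step.
    pass
else:
    # Dynamic shapes are fine on GPU/CPU backends.
    pass
```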
2 changes: 1 addition & 1 deletion docs/source/quicktour.md
@@ -258,7 +258,7 @@ To introduce special behavior in your script for TPUs you can check the `distrib
```python docstyle-ignore
from accelerate import DistributedType

-if accelerator.distributed_type == DistributedType.TPU:
+if accelerator.distributed_type == DistributedType.XLA:
    # do something of static shape
else:
    # go crazy and be dynamic
4 changes: 2 additions & 2 deletions examples/by_feature/checkpointing.py
@@ -86,7 +86,7 @@ def tokenize_function(examples):

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
@@ -154,7 +154,7 @@ def training_function(config, args):

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
-    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
+    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

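The batch-size guard above splits an oversized batch into micro-batches on non-XLA backends. A quick standalone illustration with hypothetical numbers:

```python
# Hypothetical values mirroring the pattern in the hunk above.
MAX_GPU_BATCH_SIZE = 16
batch_size = 64
on_xla = False  # assumption: not running on an XLA backend

gradient_accumulation_steps = 1
if batch_size > MAX_GPU_BATCH_SIZE and not on_xla:
    gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE  # 64 // 16 == 4
    batch_size = MAX_GPU_BATCH_SIZE  # each optimizer step now accumulates 4 micro-batches of 16

print(gradient_accumulation_steps, batch_size)  # 4 16
```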
4 changes: 2 additions & 2 deletions examples/by_feature/cross_validation.py
@@ -106,7 +106,7 @@ def tokenize_function(examples):

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
@@ -157,7 +157,7 @@ def training_function(config, args):

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
-    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
+    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

2 changes: 1 addition & 1 deletion examples/by_feature/deepspeed_with_config_support.py
@@ -512,7 +512,7 @@ def group_texts(examples):
    optimizer = optimizer_cls(optimizer_grouped_parameters, lr=args.learning_rate)

    # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
-    if accelerator.distributed_type == DistributedType.TPU:
+    if accelerator.distributed_type == DistributedType.XLA:
        model.tie_weights()

    # Scheduler and math around the number of training steps.
4 changes: 2 additions & 2 deletions examples/by_feature/early_stopping.py
@@ -81,7 +81,7 @@ def tokenize_function(examples):

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
@@ -151,7 +151,7 @@ def training_function(config, args):

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
-    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
+    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

4 changes: 2 additions & 2 deletions examples/by_feature/fsdp_with_peak_mem_tracking.py
@@ -209,13 +209,13 @@ def tokenize_function(examples):

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
-    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
+    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
4 changes: 2 additions & 2 deletions examples/by_feature/gradient_accumulation.py
@@ -81,7 +81,7 @@ def tokenize_function(examples):

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
@@ -126,7 +126,7 @@ def training_function(config, args):
    accelerator = Accelerator(
        cpu=args.cpu, mixed_precision=args.mixed_precision, gradient_accumulation_steps=gradient_accumulation_steps
    )
-    if accelerator.distributed_type == DistributedType.TPU and gradient_accumulation_steps > 1:
+    if accelerator.distributed_type == DistributedType.XLA and gradient_accumulation_steps > 1:
        raise NotImplementedError(
            "Gradient accumulation on TPUs is currently not supported. Pass `gradient_accumulation_steps=1`"
        )
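Since the hunk above still rejects `gradient_accumulation_steps > 1` on XLA, a caller can probe the backend first and fall back to plain stepping there. This is a hypothetical caller-side sketch, not part of the commit:

```python
from accelerate import Accelerator, DistributedType, PartialState

requested_steps = 4

# Peek at the backend before building the Accelerator: on XLA the example
# above raises NotImplementedError when gradient_accumulation_steps > 1.
state = PartialState()
steps = 1 if state.distributed_type == DistributedType.XLA else requested_steps

accelerator = Accelerator(gradient_accumulation_steps=steps)
```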
2 changes: 1 addition & 1 deletion examples/by_feature/local_sgd.py
@@ -84,7 +84,7 @@ def tokenize_function(examples):

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
2 changes: 1 addition & 1 deletion examples/by_feature/megatron_lm_gpt_pretraining.py
@@ -506,7 +506,7 @@ def group_texts(examples):
    )

    # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
-    if accelerator.distributed_type == DistributedType.TPU:
+    if accelerator.distributed_type == DistributedType.XLA:
        model.tie_weights()

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
2 changes: 1 addition & 1 deletion examples/by_feature/memory.py
@@ -86,7 +86,7 @@ def tokenize_function(examples):

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
4 changes: 2 additions & 2 deletions examples/by_feature/multi_process_metrics.py
@@ -88,7 +88,7 @@ def tokenize_function(examples):

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
@@ -139,7 +139,7 @@ def training_function(config, args):

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
-    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
+    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

4 changes: 2 additions & 2 deletions examples/by_feature/tracking.py
@@ -86,7 +86,7 @@ def tokenize_function(examples):

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
@@ -149,7 +149,7 @@ def training_function(config, args):

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
-    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
+    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

4 changes: 2 additions & 2 deletions examples/complete_nlp_example.py
@@ -103,13 +103,13 @@ def tokenize_function(examples):

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
-    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
+    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
4 changes: 2 additions & 2 deletions examples/nlp_example.py
@@ -79,7 +79,7 @@ def tokenize_function(examples):

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
@@ -124,7 +124,7 @@ def training_function(config, args):

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
-    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
+    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

16 changes: 8 additions & 8 deletions src/accelerate/accelerator.py
@@ -386,7 +386,7 @@ def __init__(
        if (
            (mixed_precision != "bf16")
            and getattr(self.state, "downcast_bfloat", False)
-            and (self.state.distributed_type != DistributedType.TPU)
+            and (self.state.distributed_type != DistributedType.XLA)
        ):
            raise ValueError("Can only use `downcast_bf16` when using `mixed_precision='bf16'` and on a TPU")

@@ -403,7 +403,7 @@ def __init__(
        self.gradient_state = GradientState(
            gradient_accumulation_plugin=gradient_accumulation_plugin,
        )
-        if self.state.distributed_type == DistributedType.TPU:
+        if self.state.distributed_type == DistributedType.XLA:
            if self.gradient_state.num_steps != 1:
                raise ValueError(
                    "Gradient accumulation is not supported on TPU. Please set `gradient_accumulation_steps` to 1 and don't pass in a `GradientAccumulationPlugin` object."
@@ -1261,7 +1261,7 @@ def prepare(self, *args, device_placement=None):
        # On TPUs, putting the model on the XLA device will create new parameters, so the corresponding optimizer will
        # have parameters disconnected from the model (so no training :-( ).
        # If the model and optimizer have parameters on different devices we raise an error.
-        if self.distributed_type == DistributedType.TPU:
+        if self.distributed_type == DistributedType.XLA:
            model_device, optimizer_device = self._get_devices()
            if model_device is not None and optimizer_device is not None and model_device != optimizer_device:
                raise ValueError(
@@ -1273,7 +1273,7 @@ def prepare(self, *args, device_placement=None):
                )

        # If we're dealing with device placement, this deals with that by...
-        tpu_should_fix_optimizer = self.device_placement and self.distributed_type == DistributedType.TPU
+        tpu_should_fix_optimizer = self.device_placement and self.distributed_type == DistributedType.XLA
        if tpu_should_fix_optimizer or self.mixed_precision == "fp8":
            # 1. grabbing old model parameters
            old_named_params = self._get_named_parameters(*args)
@@ -1486,7 +1486,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
        elif self.distributed_type == DistributedType.MULTI_CPU:
            kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
            model = torch.nn.parallel.DistributedDataParallel(model, **kwargs)
-        elif self.distributed_type == DistributedType.TPU and self.state.fork_launched:
+        elif self.distributed_type == DistributedType.XLA and self.state.fork_launched:
            model = xmp.MpModelWrapper(model).to(self.device)
        # torch.compile should be called last and only if the model isn't already compiled.
        if self.state.dynamo_plugin.backend != DynamoBackend.NO and not is_compiled_module(model):
@@ -1870,7 +1870,7 @@ def prepare_data_loader(
                self._dataloaders.append(data_loader)
            return data_loader
        if device_placement is None:
-            device_placement = self.device_placement if self.distributed_type != DistributedType.TPU else False
+            device_placement = self.device_placement if self.distributed_type != DistributedType.XLA else False
        prepared_data_loader = prepare_data_loader(
            data_loader,
            self.device,
@@ -2119,7 +2119,7 @@ def clip_grad_norm_(self, parameters, max_norm, norm_type=2):
            # `accelerator.backward(loss)` is doing that automatically. Therefore, its implementation is not needed
            # We cannot return the gradient norm because DeepSpeed does it.
            return None
-        elif self.distributed_type == DistributedType.TPU:
+        elif self.distributed_type == DistributedType.XLA:
            # Reduce gradients first for XLA
            for acc_opt in self._optimizers:
                opt = acc_opt
@@ -2740,7 +2740,7 @@ def _inner(folder):
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Saving current state to {output_dir}")

-        if self.distributed_type == DistributedType.TPU:
+        if self.distributed_type == DistributedType.XLA:
            # Finish running the previous step before checkpointing
            xm.mark_step()

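The save-state hunk above flushes pending XLA work with `xm.mark_step()` before serializing. The same pattern in standalone form, assuming `torch_xla` is installed and an XLA device is available:

```python
import torch
import torch_xla.core.xla_model as xm  # assumption: torch_xla is installed

device = xm.xla_device()
model = torch.nn.Linear(4, 2).to(device)

# Flush lazily recorded XLA operations before writing the weights out,
# mirroring the xm.mark_step() call added ahead of checkpointing above.
xm.mark_step()
xm.save(model.state_dict(), "model.pt")  # xm.save gathers tensors to CPU on the master process
```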
8 changes: 4 additions & 4 deletions src/accelerate/commands/config/cluster.py
@@ -126,7 +126,7 @@ def get_cluster_input():
    if (
        not use_cpu
        and is_xpu_available()
-        and distributed_type not in [DistributedType.MULTI_GPU, DistributedType.MULTI_NPU, DistributedType.TPU]
+        and distributed_type not in [DistributedType.MULTI_GPU, DistributedType.MULTI_NPU, DistributedType.XLA]
    ):
        ipex_config["use_xpu"] = _ask_field(
            "Do you want to use XPU plugin to speed up training on XPU? [yes/NO]:",
@@ -478,7 +478,7 @@ def get_cluster_input():
        DistributedType.MULTI_XPU,
        DistributedType.MULTI_GPU,
        DistributedType.MULTI_NPU,
-        DistributedType.TPU,
+        DistributedType.XLA,
    ]:
        machine_type = str(distributed_type).split(".")[1].replace("MULTI_", "")
        if machine_type == "TPU":
@@ -526,7 +526,7 @@ def get_cluster_input():
            default="all",
        )

-    if distributed_type == DistributedType.TPU:
+    if distributed_type == DistributedType.XLA:
        mixed_precision = "no"
        main_training_function = _ask_field(
            "What is the name of the function in your script that should be launched in all parallel scripts? [main]: ",
@@ -617,7 +617,7 @@ def get_cluster_input():
            "Torch dynamo used without mixed precision requires TF32 to be efficient. Accelerate will enable it by default when launching your scripts."
        )

-    if distributed_type == DistributedType.TPU and mixed_precision == "bf16":
+    if distributed_type == DistributedType.XLA and mixed_precision == "bf16":
        tpu_downcast_bf16 = _ask_field(
            "Should `torch.float` be cast as `bfloat16` and `torch.double` remain `float32` on TPUs?", default="no"
        )
2 changes: 1 addition & 1 deletion src/accelerate/commands/launch.py
@@ -842,7 +842,7 @@ def _validate_launch_command(args):
            in (DistributedType.MULTI_GPU, DistributedType.MULTI_NPU, DistributedType.MULTI_XPU)
            else False
        )
-        args.tpu = defaults.distributed_type == DistributedType.TPU
+        args.tpu = defaults.distributed_type == DistributedType.XLA
        args.use_fsdp = defaults.distributed_type == DistributedType.FSDP
        args.use_megatron_lm = defaults.distributed_type == DistributedType.MEGATRON_LM
        args.tpu_use_cluster = defaults.tpu_use_cluster if args.tpu else False
6 changes: 3 additions & 3 deletions src/accelerate/data_loader.py
@@ -910,7 +910,7 @@ def prepare_data_loader(
    elif sampler_is_batch_sampler:
        dataloader = DataLoaderShard(
            new_dataset,
-            device=device if put_on_device and state.distributed_type != DistributedType.TPU else None,
+            device=device if put_on_device and state.distributed_type != DistributedType.XLA else None,
            sampler=new_batch_sampler,
            batch_size=dataloader.batch_size,
            rng_types=rng_types,
@@ -921,15 +921,15 @@
    else:
        dataloader = DataLoaderShard(
            new_dataset,
-            device=device if put_on_device and state.distributed_type != DistributedType.TPU else None,
+            device=device if put_on_device and state.distributed_type != DistributedType.XLA else None,
            batch_sampler=new_batch_sampler,
            rng_types=rng_types,
            synchronized_generator=synchronized_generator,
            _drop_last=dataloader.drop_last,
            **kwargs,
        )

-    if state.distributed_type == DistributedType.TPU:
+    if state.distributed_type == DistributedType.XLA:
        return MpDeviceLoaderWrapper(dataloader, device)
    return dataloader

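On XLA the prepared loader is returned wrapped in `MpDeviceLoaderWrapper`, which builds on torch_xla's `MpDeviceLoader`. A rough standalone sketch of that underlying pattern, assuming `torch_xla` is available:

```python
import torch
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl  # assumption: torch_xla is installed

device = xm.xla_device()
dataset = torch.utils.data.TensorDataset(torch.randn(32, 4))
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)

# MpDeviceLoader prefetches batches onto the XLA device in the background;
# accelerate's MpDeviceLoaderWrapper wraps the prepared loader the same way.
xla_loader = pl.MpDeviceLoader(dataloader, device)
for (batch,) in xla_loader:
    print(batch.shape, batch.device)
```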