Replace DistributedType.TPU with DistributedType.XLA
anw90 committed Dec 4, 2023
1 parent 9c12351 commit 45f44d6
Showing 31 changed files with 89 additions and 72 deletions.
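Downstream code that branches on the backend sees this rename through `Accelerator.distributed_type`. The snippet below is a hypothetical compatibility sketch (the `getattr` fallback is not part of this commit) for scripts that must run against accelerate versions on either side of the change:

```python
from accelerate import Accelerator, DistributedType

accelerator = Accelerator()

# DistributedType.XLA is the new name introduced here; older accelerate
# releases only expose DistributedType.TPU, so fall back to it if needed.
XLA_TYPE = getattr(DistributedType, "XLA", getattr(DistributedType, "TPU", None))

if accelerator.distributed_type == XLA_TYPE:
    # Keep tensor shapes static so the XLA compiler does not recompile every step.
    pass
else:
    # Dynamic shapes are fine on GPU/CPU backends.
    pass
```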
2 changes: 1 addition & 1 deletion docs/source/quicktour.md
@@ -258,7 +258,7 @@ To introduce special behavior in your script for TPUs you can check the `distrib
```python docstyle-ignore
from accelerate import DistributedType

-if accelerator.distributed_type == DistributedType.TPU:
+if accelerator.distributed_type == DistributedType.XLA:
    # do something of static shape
else:
    # go crazy and be dynamic
4 changes: 2 additions & 2 deletions examples/by_feature/checkpointing.py
@@ -86,7 +86,7 @@ def tokenize_function(examples):

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
@@ -154,7 +154,7 @@ def training_function(config, args):

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
-    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
+    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

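The batch-size guard above splits an oversized batch into micro-batches on non-XLA backends. A quick standalone illustration with hypothetical numbers:

```python
# Hypothetical values mirroring the pattern in the hunk above.
MAX_GPU_BATCH_SIZE = 16
batch_size = 64
on_xla = False  # assumption: not running on an XLA backend

gradient_accumulation_steps = 1
if batch_size > MAX_GPU_BATCH_SIZE and not on_xla:
    gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE  # 64 // 16 == 4
    batch_size = MAX_GPU_BATCH_SIZE  # each optimizer step now accumulates 4 micro-batches of 16

print(gradient_accumulation_steps, batch_size)  # 4 16
```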
4 changes: 2 additions & 2 deletions examples/by_feature/cross_validation.py
@@ -106,7 +106,7 @@ def tokenize_function(examples):

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
@@ -157,7 +157,7 @@ def training_function(config, args):

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
-    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
+    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

2 changes: 1 addition & 1 deletion examples/by_feature/deepspeed_with_config_support.py
@@ -512,7 +512,7 @@ def group_texts(examples):
    optimizer = optimizer_cls(optimizer_grouped_parameters, lr=args.learning_rate)

    # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
-    if accelerator.distributed_type == DistributedType.TPU:
+    if accelerator.distributed_type == DistributedType.XLA:
        model.tie_weights()

    # Scheduler and math around the number of training steps.
4 changes: 2 additions & 2 deletions examples/by_feature/early_stopping.py
@@ -81,7 +81,7 @@ def tokenize_function(examples):

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
@@ -151,7 +151,7 @@ def training_function(config, args):

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
-    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
+    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

4 changes: 2 additions & 2 deletions examples/by_feature/fsdp_with_peak_mem_tracking.py
@@ -209,13 +209,13 @@ def tokenize_function(examples):

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
-    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
+    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
4 changes: 2 additions & 2 deletions examples/by_feature/gradient_accumulation.py
@@ -81,7 +81,7 @@ def tokenize_function(examples):

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
@@ -126,7 +126,7 @@ def training_function(config, args):
    accelerator = Accelerator(
        cpu=args.cpu, mixed_precision=args.mixed_precision, gradient_accumulation_steps=gradient_accumulation_steps
    )
-    if accelerator.distributed_type == DistributedType.TPU and gradient_accumulation_steps > 1:
+    if accelerator.distributed_type == DistributedType.XLA and gradient_accumulation_steps > 1:
        raise NotImplementedError(
            "Gradient accumulation on TPUs is currently not supported. Pass `gradient_accumulation_steps=1`"
        )
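Since the hunk above still rejects `gradient_accumulation_steps > 1` on XLA, a caller can probe the backend first and fall back to plain stepping there. This is a hypothetical caller-side sketch, not part of the commit:

```python
from accelerate import Accelerator, DistributedType, PartialState

requested_steps = 4

# Peek at the backend before building the Accelerator: on XLA the example
# above raises NotImplementedError when gradient_accumulation_steps > 1.
state = PartialState()
steps = 1 if state.distributed_type == DistributedType.XLA else requested_steps

accelerator = Accelerator(gradient_accumulation_steps=steps)
```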
2 changes: 1 addition & 1 deletion examples/by_feature/local_sgd.py
@@ -84,7 +84,7 @@ def tokenize_function(examples):

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
2 changes: 1 addition & 1 deletion examples/by_feature/megatron_lm_gpt_pretraining.py
@@ -506,7 +506,7 @@ def group_texts(examples):
    )

    # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
-    if accelerator.distributed_type == DistributedType.TPU:
+    if accelerator.distributed_type == DistributedType.XLA:
        model.tie_weights()

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
2 changes: 1 addition & 1 deletion examples/by_feature/memory.py
@@ -86,7 +86,7 @@ def tokenize_function(examples):

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
4 changes: 2 additions & 2 deletions examples/by_feature/multi_process_metrics.py
@@ -88,7 +88,7 @@ def tokenize_function(examples):

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
@@ -139,7 +139,7 @@ def training_function(config, args):

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
-    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
+    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

4 changes: 2 additions & 2 deletions examples/by_feature/tracking.py
@@ -86,7 +86,7 @@ def tokenize_function(examples):

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
@@ -149,7 +149,7 @@ def training_function(config, args):

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
-    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
+    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

4 changes: 2 additions & 2 deletions examples/complete_nlp_example.py
@@ -103,13 +103,13 @@ def tokenize_function(examples):

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
-    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
+    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
4 changes: 2 additions & 2 deletions examples/nlp_example.py
@@ -79,7 +79,7 @@ def tokenize_function(examples):

    def collate_fn(examples):
        # On TPU it's best to pad everything to the same length or training will be very slow.
-        max_length = 128 if accelerator.distributed_type == DistributedType.TPU else None
+        max_length = 128 if accelerator.distributed_type == DistributedType.XLA else None
        # When using mixed precision we want round multiples of 8/16
        if accelerator.mixed_precision == "fp8":
            pad_to_multiple_of = 16
@@ -124,7 +124,7 @@ def training_function(config, args):

    # If the batch size is too big we use gradient accumulation
    gradient_accumulation_steps = 1
-    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.TPU:
+    if batch_size > MAX_GPU_BATCH_SIZE and accelerator.distributed_type != DistributedType.XLA:
        gradient_accumulation_steps = batch_size // MAX_GPU_BATCH_SIZE
        batch_size = MAX_GPU_BATCH_SIZE

16 changes: 8 additions & 8 deletions src/accelerate/accelerator.py
@@ -386,7 +386,7 @@ def __init__(
        if (
            (mixed_precision != "bf16")
            and getattr(self.state, "downcast_bfloat", False)
-            and (self.state.distributed_type != DistributedType.TPU)
+            and (self.state.distributed_type != DistributedType.XLA)
        ):
            raise ValueError("Can only use `downcast_bf16` when using `mixed_precision='bf16'` and on a TPU")

@@ -403,7 +403,7 @@ def __init__(
        self.gradient_state = GradientState(
            gradient_accumulation_plugin=gradient_accumulation_plugin,
        )
-        if self.state.distributed_type == DistributedType.TPU:
+        if self.state.distributed_type == DistributedType.XLA:
            if self.gradient_state.num_steps != 1:
                raise ValueError(
                    "Gradient accumulation is not supported on TPU. Please set `gradient_accumulation_steps` to 1 and don't pass in a `GradientAccumulationPlugin` object."
@@ -1261,7 +1261,7 @@ def prepare(self, *args, device_placement=None):
        # On TPUs, putting the model on the XLA device will create new parameters, so the corresponding optimizer will
        # have parameters disconnected from the model (so no training :-( ).
        # If the model and optimizer have parameters on different devices we raise an error.
-        if self.distributed_type == DistributedType.TPU:
+        if self.distributed_type == DistributedType.XLA:
            model_device, optimizer_device = self._get_devices()
            if model_device is not None and optimizer_device is not None and model_device != optimizer_device:
                raise ValueError(
@@ -1273,7 +1273,7 @@ def prepare(self, *args, device_placement=None):
                )

        # If we're dealing with device placement, this deals with that by...
-        tpu_should_fix_optimizer = self.device_placement and self.distributed_type == DistributedType.TPU
+        tpu_should_fix_optimizer = self.device_placement and self.distributed_type == DistributedType.XLA
        if tpu_should_fix_optimizer or self.mixed_precision == "fp8":
            # 1. grabbing old model parameters
            old_named_params = self._get_named_parameters(*args)
@@ -1486,7 +1486,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
        elif self.distributed_type == DistributedType.MULTI_CPU:
            kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
            model = torch.nn.parallel.DistributedDataParallel(model, **kwargs)
-        elif self.distributed_type == DistributedType.TPU and self.state.fork_launched:
+        elif self.distributed_type == DistributedType.XLA and self.state.fork_launched:
            model = xmp.MpModelWrapper(model).to(self.device)
        # torch.compile should be called last and only if the model isn't already compiled.
        if self.state.dynamo_plugin.backend != DynamoBackend.NO and not is_compiled_module(model):
@@ -1870,7 +1870,7 @@ def prepare_data_loader(
                self._dataloaders.append(data_loader)
            return data_loader
        if device_placement is None:
-            device_placement = self.device_placement if self.distributed_type != DistributedType.TPU else False
+            device_placement = self.device_placement if self.distributed_type != DistributedType.XLA else False
        prepared_data_loader = prepare_data_loader(
            data_loader,
            self.device,
@@ -2119,7 +2119,7 @@ def clip_grad_norm_(self, parameters, max_norm, norm_type=2):
            # `accelerator.backward(loss)` is doing that automatically. Therefore, its implementation is not needed
            # We cannot return the gradient norm because DeepSpeed does it.
            return None
-        elif self.distributed_type == DistributedType.TPU:
+        elif self.distributed_type == DistributedType.XLA:
            # Reduce gradients first for XLA
            for acc_opt in self._optimizers:
                opt = acc_opt
@@ -2740,7 +2740,7 @@ def _inner(folder):
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Saving current state to {output_dir}")

-        if self.distributed_type == DistributedType.TPU:
+        if self.distributed_type == DistributedType.XLA:
            # Finish running the previous step before checkpointing
            xm.mark_step()

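The save-state hunk above flushes pending XLA work with `xm.mark_step()` before serializing. The same pattern in standalone form, assuming `torch_xla` is installed and an XLA device is available:

```python
import torch
import torch_xla.core.xla_model as xm  # assumption: torch_xla is installed

device = xm.xla_device()
model = torch.nn.Linear(4, 2).to(device)

# Flush lazily recorded XLA operations before writing the weights out,
# mirroring the xm.mark_step() call added ahead of checkpointing above.
xm.mark_step()
xm.save(model.state_dict(), "model.pt")  # xm.save gathers tensors to CPU on the master process
```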
8 changes: 4 additions & 4 deletions src/accelerate/commands/config/cluster.py
@@ -126,7 +126,7 @@ def get_cluster_input():
    if (
        not use_cpu
        and is_xpu_available()
-        and distributed_type not in [DistributedType.MULTI_GPU, DistributedType.MULTI_NPU, DistributedType.TPU]
+        and distributed_type not in [DistributedType.MULTI_GPU, DistributedType.MULTI_NPU, DistributedType.XLA]
    ):
        ipex_config["use_xpu"] = _ask_field(
            "Do you want to use XPU plugin to speed up training on XPU? [yes/NO]:",
@@ -478,7 +478,7 @@ def get_cluster_input():
        DistributedType.MULTI_XPU,
        DistributedType.MULTI_GPU,
        DistributedType.MULTI_NPU,
-        DistributedType.TPU,
+        DistributedType.XLA,
    ]:
        machine_type = str(distributed_type).split(".")[1].replace("MULTI_", "")
        if machine_type == "TPU":
@@ -526,7 +526,7 @@ def get_cluster_input():
            default="all",
        )

-    if distributed_type == DistributedType.TPU:
+    if distributed_type == DistributedType.XLA:
        mixed_precision = "no"
        main_training_function = _ask_field(
            "What is the name of the function in your script that should be launched in all parallel scripts? [main]: ",
@@ -617,7 +617,7 @@ def get_cluster_input():
            "Torch dynamo used without mixed precision requires TF32 to be efficient. Accelerate will enable it by default when launching your scripts."
        )

-    if distributed_type == DistributedType.TPU and mixed_precision == "bf16":
+    if distributed_type == DistributedType.XLA and mixed_precision == "bf16":
        tpu_downcast_bf16 = _ask_field(
            "Should `torch.float` be cast as `bfloat16` and `torch.double` remain `float32` on TPUs?", default="no"
        )
2 changes: 1 addition & 1 deletion src/accelerate/commands/launch.py
@@ -842,7 +842,7 @@ def _validate_launch_command(args):
            in (DistributedType.MULTI_GPU, DistributedType.MULTI_NPU, DistributedType.MULTI_XPU)
            else False
        )
-        args.tpu = defaults.distributed_type == DistributedType.TPU
+        args.tpu = defaults.distributed_type == DistributedType.XLA
        args.use_fsdp = defaults.distributed_type == DistributedType.FSDP
        args.use_megatron_lm = defaults.distributed_type == DistributedType.MEGATRON_LM
        args.tpu_use_cluster = defaults.tpu_use_cluster if args.tpu else False
6 changes: 3 additions & 3 deletions src/accelerate/data_loader.py
@@ -910,7 +910,7 @@ def prepare_data_loader(
    elif sampler_is_batch_sampler:
        dataloader = DataLoaderShard(
            new_dataset,
-            device=device if put_on_device and state.distributed_type != DistributedType.TPU else None,
+            device=device if put_on_device and state.distributed_type != DistributedType.XLA else None,
            sampler=new_batch_sampler,
            batch_size=dataloader.batch_size,
            rng_types=rng_types,
@@ -921,15 +921,15 @@
    else:
        dataloader = DataLoaderShard(
            new_dataset,
-            device=device if put_on_device and state.distributed_type != DistributedType.TPU else None,
+            device=device if put_on_device and state.distributed_type != DistributedType.XLA else None,
            batch_sampler=new_batch_sampler,
            rng_types=rng_types,
            synchronized_generator=synchronized_generator,
            _drop_last=dataloader.drop_last,
            **kwargs,
        )

-    if state.distributed_type == DistributedType.TPU:
+    if state.distributed_type == DistributedType.XLA:
        return MpDeviceLoaderWrapper(dataloader, device)
    return dataloader

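On XLA the prepared loader is returned wrapped in `MpDeviceLoaderWrapper`, which builds on torch_xla's `MpDeviceLoader`. A rough standalone sketch of that underlying pattern, assuming `torch_xla` is available:

```python
import torch
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl  # assumption: torch_xla is installed

device = xm.xla_device()
dataset = torch.utils.data.TensorDataset(torch.randn(32, 4))
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)

# MpDeviceLoader prefetches batches onto the XLA device in the background;
# accelerate's MpDeviceLoaderWrapper wraps the prepared loader the same way.
xla_loader = pl.MpDeviceLoader(dataloader, device)
for (batch,) in xla_loader:
    print(batch.shape, batch.device)
```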