[example] update vit example for hybrid parallel plugin (#4641)

* update vit example for hybrid plugin
* reset tp/pp size
* fix dataloader iteration bug
* update optimizer passing in evaluation/add grad_accum
* change criterion
* wrap tqdm
* change grad_accum to grad_checkpoint
* fix pbar
hpcaitech · Sep 7, 2023 · 295b38f · 295b38f
1 parent 660eed9
commit 295b38f
Show file tree

Hide file tree

Showing 10 changed files with 248 additions and 194 deletions.
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
@@ -884,6 +884,7 @@ def forward(
 
         if self.gradient_checkpointing and self.training:
             if use_cache:
+                logger = logging.get_logger(__name__)
                 logger.warning_once(
                     "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
                 use_cache = False

diff --git a/colossalai/shardformer/modeling/vit.py b/colossalai/shardformer/modeling/vit.py
@@ -1,9 +1,9 @@
-import logging
 import math
 from typing import Dict, List, Optional, Set, Tuple, Union
 
 import torch
 from transformers.models.vit.modeling_vit import BaseModelOutput, ViTEncoder
+from transformers.utils import logging
 
 from colossalai.pipeline.stage_manager import PipelineStageManager
 
@@ -72,18 +72,17 @@ def pp_forward(
             bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
                 Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
             """
-
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (output_hidden_states
-                                if output_hidden_states is not None else self.config.output_hidden_states)
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        if output_attentions is not None:
-            logging.warning('Non-empty output_attentions is not supported for pipeline models at the moment.')
-            output_attentions = None
-        if output_hidden_states is not None:
-            logging.warning('Non-empty output_hidden_states is not supported for pipeline models at the moment.')
-            output_hidden_states = None
+        logger = logging.get_logger(__name__)
+
+        # Preprocess passed in arguments
+        if output_attentions:
+            logger.warning_once('output_attentions=True is not supported for pipeline models at the moment.')
+            output_attentions = False
+        if output_hidden_states:
+            logger.warning_once('output_hidden_states=True is not supported for pipeline models at the moment.')
+            output_hidden_states = False
 
         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head

diff --git a/examples/images/vit/README.md b/examples/images/vit/README.md
@@ -3,7 +3,7 @@
 Vision Transformer is a class of Transformer model tailored for computer vision tasks. It was first proposed in paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) and achieved SOTA results on various tasks at that time.
 
 In our example, we are using pretrained weights of ViT loaded from HuggingFace.
-We adapt the ViT training code to ColossalAI by leveraging [Boosting API](https://colossalai.org/docs/basics/booster_api) loaded with a chosen plugin, where each plugin corresponds to a specific kind of training strategy. This example supports plugins including TorchDDPPlugin, LowLevelZeroPlugin, and GeminiPlugin.
+We adapt the ViT training code to ColossalAI by leveraging the [Booster API](https://colossalai.org/docs/basics/booster_api) loaded with a chosen plugin, where each plugin corresponds to a specific kind of training strategy. This example supports plugins including TorchDDPPlugin (DDP), LowLevelZeroPlugin (Zero1/Zero2), GeminiPlugin (Gemini), and HybridParallelPlugin (any combination of tensor/pipeline/data parallelism).
 
 ## Run Demo
 
@@ -25,4 +25,4 @@ You can run benchmark for ViT model by running the following script:
 ```bash
 bash run_benchmark.sh
 ```
-The script will test performance (throughput & peak memory usage) for each combination of hyperparameters. You can also play with this script to configure your own set of hyperparameters for testing.
+The script will test performance (throughput & peak memory usage) for each combination of hyperparameters. You can also play with this script to configure your own set of hyperparameters for testing.
diff --git a/examples/images/vit/args.py b/examples/images/vit/args.py
@@ -1,124 +1,82 @@
 from colossalai import get_default_parser
 
+
 def parse_demo_args():
 
     parser = get_default_parser()
-    parser.add_argument(
-        "--model_name_or_path",
-        type=str,
-        default="google/vit-base-patch16-224",
-        help="Path to pretrained model or model identifier from huggingface.co/models."
-    )
-    parser.add_argument(
-        "--output_path",
-        type=str,
-        default="./output_model.bin",
-        help="The path of your saved model after finetuning."
-    )
+    parser.add_argument("--model_name_or_path",
+                        type=str,
+                        default="google/vit-base-patch16-224",
+                        help="Path to pretrained model or model identifier from huggingface.co/models.")
+    parser.add_argument("--output_path",
+                        type=str,
+                        default="./output_model",
+                        help="The path of your saved model after finetuning.")
     parser.add_argument(
         "--plugin",
         type=str,
         default="gemini",
-        help="Plugin to use. Valid plugins include 'torch_ddp','torch_ddp_fp16','gemini','low_level_zero'."
-    )
-    parser.add_argument(
-        "--num_epoch",
-        type=int,
-        default=3,
-        help="Number of epochs."
-    )
-    parser.add_argument(
-        "--batch_size",
-        type=int,
-        default=32,
-        help="Batch size (per dp group) for the training dataloader."
-    )
-    parser.add_argument(
-        "--learning_rate",
-        type=float,
-        default=3e-4,
-        help="Initial learning rate (after the potential warmup period) to use."
-    )
-    parser.add_argument(
-        "--warmup_ratio",
-        type=float,
-        default=0.3,
-        help="Ratio of warmup steps against total training steps."
-    )
-    parser.add_argument(
-        "--weight_decay", 
-        type=float, 
-        default=0.1, 
-        help="Weight decay to use."
-    )
-    parser.add_argument(
-        "--seed", 
-        type=int, 
-        default=42, 
-        help="A seed for reproducible training."
-    )
+        help=
+        "Plugin to use. Valid plugins include 'torch_ddp','torch_ddp_fp16','gemini','low_level_zero', 'hybrid_parallel'."
+    )
+    parser.add_argument("--num_epoch", type=int, default=3, help="Number of epochs.")
+    parser.add_argument("--batch_size",
+                        type=int,
+                        default=32,
+                        help="Batch size (per dp group) for the training dataloader.")
+    parser.add_argument("--tp_size",
+                        type=int,
+                        default=1,
+                        help="The size along tensor parallel dimension, only be used when enabling hybrid parallel.")
+    parser.add_argument("--pp_size",
+                        type=int,
+                        default=1,
+                        help="The size along pipeline parallel dimension, only be used when enabling hybrid parallel.")
+    parser.add_argument("--learning_rate",
+                        type=float,
+                        default=3e-4,
+                        help="Initial learning rate (after the potential warmup period) to use.")
+    parser.add_argument("--warmup_ratio",
+                        type=float,
+                        default=0.3,
+                        help="Ratio of warmup steps against total training steps.")
+    parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay to use.")
+    parser.add_argument("--grad_checkpoint", type=bool, default=True, help="Whether to use gradient checkpointing.")
+    parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.")
 
     args = parser.parse_args()
     return args
 
+
 def parse_benchmark_args():
 
     parser = get_default_parser()
 
-    parser.add_argument(
-        "--model_name_or_path",
-        type=str,
-        default="google/vit-base-patch16-224",
-        help="Path to a pretrained model or model identifier from huggingface.co/models."
-    )
+    parser.add_argument("--model_name_or_path",
+                        type=str,
+                        default="google/vit-base-patch16-224",
+                        help="Path to a pretrained model or model identifier from huggingface.co/models.")
     parser.add_argument(
         "--plugin",
         type=str,
         default="gemini",
-        help="Plugin to use. Valid plugins include 'torch_ddp','torch_ddp_fp16','gemini','low_level_zero'."
-    )
-    parser.add_argument(
-        "--batch_size",
-        type=int,
-        default=8,
-        help="Batch size (per dp group) for the training dataloader."
-    )
-    parser.add_argument(
-        "--num_labels",
-        type=int,
-        default=10,
-        help="Number of labels for classification."
-    )
-    parser.add_argument(
-        "--learning_rate",
-        type=float,
-        default=5e-5,
-        help="Initial learning rate (after the potential warmup period) to use."
-    )
-    parser.add_argument(
-        "--weight_decay", 
-        type=float, 
-        default=0.0, 
-        help="Weight decay to use."
-    )
-    parser.add_argument(
-        "--max_train_steps",
-        type=int,
-        default=20,
-        help="Total number of training steps to perform."
-    )
-    parser.add_argument(
-        "--seed", 
-        type=int, 
-        default=42, 
-        help="A seed for reproducible training."
-    )
-    parser.add_argument(
-        "--mem_cap", 
-        type=int, 
-        default=0, 
-        help="Limit on the usage of space for each GPU (in GB)."
-    )
+        help=
+        "Plugin to use. Valid plugins include 'torch_ddp','torch_ddp_fp16','gemini','low_level_zero', 'hybrid_parallel'."
+    )
+    parser.add_argument("--batch_size",
+                        type=int,
+                        default=8,
+                        help="Batch size (per dp group) for the training dataloader.")
+    parser.add_argument("--num_labels", type=int, default=10, help="Number of labels for classification.")
+    parser.add_argument("--learning_rate",
+                        type=float,
+                        default=5e-5,
+                        help="Initial learning rate (after the potential warmup period) to use.")
+    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
+    parser.add_argument("--grad_checkpoint", type=bool, default=True, help="Whether to use gradient checkpointing.")
+    parser.add_argument("--max_train_steps", type=int, default=20, help="Total number of training steps to perform.")
+    parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.")
+    parser.add_argument("--mem_cap", type=int, default=0, help="Limit on the usage of space for each GPU (in GB).")
     args = parser.parse_args()
 
-    return args
+    return args
diff --git a/examples/images/vit/data.py b/examples/images/vit/data.py
@@ -1,32 +1,38 @@
 import torch
-from torch.utils.data import Dataset
 from datasets import load_dataset
+from torch.utils.data import Dataset
+
 
 class BeansDataset(Dataset):
-    
-    def __init__(self, image_processor, split='train'):
+
+    def __init__(self, image_processor, tp_size=1, split='train'):
 
         super().__init__()
         self.image_processor = image_processor
         self.ds = load_dataset('beans')[split]
         self.label_names = self.ds.features['labels'].names
+        while len(self.label_names) % tp_size != 0:
+            # ensure that the number of labels is multiple of tp_size
+            self.label_names.append(f"pad_label_{len(self.label_names)}")
         self.num_labels = len(self.label_names)
         self.inputs = []
         for example in self.ds:
             self.inputs.append(self.process_example(example))
-    
+
     def __len__(self):
         return len(self.inputs)
 
     def __getitem__(self, idx):
         return self.inputs[idx]
-    
+
     def process_example(self, example):
         input = self.image_processor(example['image'], return_tensors='pt')
         input['labels'] = example['labels']
         return input
-    
+
 
 def beans_collator(batch):
-    return {'pixel_values': torch.cat([data['pixel_values'] for data in batch], dim=0),
-            'labels': torch.tensor([data['labels'] for data in batch], dtype=torch.int64)}
+    return {
+        'pixel_values': torch.cat([data['pixel_values'] for data in batch], dim=0),
+        'labels': torch.tensor([data['labels'] for data in batch], dtype=torch.int64)
+    }
diff --git a/examples/images/vit/run_benchmark.sh b/examples/images/vit/run_benchmark.sh
@@ -5,23 +5,20 @@ export BS=8
 export MEMCAP=0
 export GPUNUM=1
 
-for BS in 8 32 128
+for BS in 8 32
 do
-for PLUGIN in "torch_ddp" "torch_ddp_fp16" "low_level_zero" "gemini"
-do
-for GPUNUM in 1 4
+for PLUGIN in "torch_ddp" "torch_ddp_fp16" "low_level_zero" "gemini" "hybrid_parallel"
 do
 
 MODEL_PATH="google/vit-base-patch16-224"
 torchrun \
   --standalone \
-  --nproc_per_node ${GPUNUM} \
+  --nproc_per_node 4 \
   vit_benchmark.py \
   --model_name_or_path ${MODEL_PATH} \
   --mem_cap ${MEMCAP} \
   --plugin ${PLUGIN} \
   --batch_size ${BS}
-
-done
+
 done
 done
diff --git a/examples/images/vit/run_demo.sh b/examples/images/vit/run_demo.sh
@@ -5,16 +5,21 @@ pip install -r requirements.txt
 MODEL="google/vit-base-patch16-224"
 
 # path for saving model
-OUTPUT_PATH="./output_model.bin"
+OUTPUT_PATH="./output_model"
 
 # plugin(training strategy)
-# can only be one of "torch_ddp"/"torch_ddp_fp16"/"low_level_zero"/"gemini"
+# can only be one of "torch_ddp"/"torch_ddp_fp16"/"low_level_zero"/"gemini"/"hybrid_parallel"
 PLUGIN="gemini"
+#PLUGIN="hybrid_parallel"
+
+# configuration of parallel group sizes, only used when setting PLUGIN to "hybrid_parallel"
+TP_SIZE=2
+PP_SIZE=2
 
 # number of gpus to use
 GPUNUM=4
 
-# batch size per gpu
+# batch size per data parallel group
 BS=16
 
 # learning rate
@@ -38,6 +43,8 @@ torchrun \
   --output_path ${OUTPUT_PATH} \
   --plugin ${PLUGIN} \
   --batch_size ${BS} \
+  --tp_size ${TP_SIZE} \
+  --pp_size ${PP_SIZE} \
   --num_epoch ${EPOCH} \
   --learning_rate ${LR} \
   --weight_decay ${WEIGHT_DECAY} \

diff --git a/examples/images/vit/test_ci.sh b/examples/images/vit/test_ci.sh
@@ -2,18 +2,15 @@ set -xe
 pip install -r requirements.txt
 
 BS=8
-for PLUGIN in "torch_ddp" "torch_ddp_fp16" "low_level_zero" "gemini"
-do
-for GPUNUM in 1 4
+for PLUGIN in "torch_ddp" "torch_ddp_fp16" "low_level_zero" "gemini" "hybrid_parallel"
 do
 
 torchrun \
   --standalone \
-  --nproc_per_node ${GPUNUM} \
+  --nproc_per_node 4 \
   vit_benchmark.py \
   --model_name_or_path "google/vit-base-patch16-224" \
   --plugin ${PLUGIN} \
   --batch_size ${BS}
 
 done
-done