Skip to content

Commit

Permalink
[example] update vit example for hybrid parallel plugin (#4641)
Browse files Browse the repository at this point in the history
* update vit example for hybrid plugin

* reset tp/pp size

* fix dataloader iteration bug

* update optimizer passing in evaluation/add grad_accum

* change criterion

* wrap tqdm

* change grad_accum to grad_checkpoint

* fix pbar
  • Loading branch information
Fridge003 authored Sep 7, 2023
1 parent 660eed9 commit 295b38f
Show file tree
Hide file tree
Showing 10 changed files with 248 additions and 194 deletions.
1 change: 1 addition & 0 deletions colossalai/shardformer/modeling/gpt2.py
Original file line number Diff line number Diff line change
Expand Up @@ -884,6 +884,7 @@ def forward(

if self.gradient_checkpointing and self.training:
if use_cache:
logger = logging.get_logger(__name__)
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
use_cache = False
Expand Down
21 changes: 10 additions & 11 deletions colossalai/shardformer/modeling/vit.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import logging
import math
from typing import Dict, List, Optional, Set, Tuple, Union

import torch
from transformers.models.vit.modeling_vit import BaseModelOutput, ViTEncoder
from transformers.utils import logging

from colossalai.pipeline.stage_manager import PipelineStageManager

Expand Down Expand Up @@ -72,18 +72,17 @@ def pp_forward(
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
"""

output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (output_hidden_states
if output_hidden_states is not None else self.config.output_hidden_states)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

if output_attentions is not None:
logging.warning('Non-empty output_attentions is not supported for pipeline models at the moment.')
output_attentions = None
if output_hidden_states is not None:
logging.warning('Non-empty output_hidden_states is not supported for pipeline models at the moment.')
output_hidden_states = None
logger = logging.get_logger(__name__)

# Preprocess passed in arguments
if output_attentions:
logger.warning_once('output_attentions=True is not supported for pipeline models at the moment.')
output_attentions = False
if output_hidden_states:
logger.warning_once('output_hidden_states=True is not supported for pipeline models at the moment.')
output_hidden_states = False

# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
Expand Down
4 changes: 2 additions & 2 deletions examples/images/vit/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Vision Transformer is a class of Transformer model tailored for computer vision tasks. It was first proposed in paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) and achieved SOTA results on various tasks at that time.

In our example, we are using pretrained weights of ViT loaded from HuggingFace.
We adapt the ViT training code to ColossalAI by leveraging the [Booster API](https://colossalai.org/docs/basics/booster_api) loaded with a chosen plugin, where each plugin corresponds to a specific kind of training strategy. This example supports plugins including TorchDDPPlugin, LowLevelZeroPlugin, and GeminiPlugin.
We adapt the ViT training code to ColossalAI by leveraging the [Booster API](https://colossalai.org/docs/basics/booster_api) loaded with a chosen plugin, where each plugin corresponds to a specific kind of training strategy. This example supports plugins including TorchDDPPlugin (DDP), LowLevelZeroPlugin (Zero1/Zero2), GeminiPlugin (Gemini), and HybridParallelPlugin (any combination of tensor/pipeline/data parallelism).

## Run Demo

Expand All @@ -25,4 +25,4 @@ You can run benchmark for ViT model by running the following script:
```bash
bash run_benchmark.sh
```
The script will test performance (throughput & peak memory usage) for each combination of hyperparameters. You can also play with this script to configure your own set of hyperparameters for testing.
The script will test performance (throughput & peak memory usage) for each combination of hyperparameters. You can also play with this script to configure your own set of hyperparameters for testing.
160 changes: 59 additions & 101 deletions examples/images/vit/args.py
Original file line number Diff line number Diff line change
@@ -1,124 +1,82 @@
from colossalai import get_default_parser


def parse_demo_args():
    """Parse the command-line arguments of the ViT fine-tuning demo.

    The parser is obtained from ``get_default_parser()`` and extended with the
    demo-specific options registered below.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    # Local import: only the boolean converter below needs it.
    import argparse

    def parse_bool(value):
        """Convert a command-line string into a real boolean.

        ``type=bool`` must not be used for this: ``bool("False")`` is ``True``
        because every non-empty string is truthy, so the flag could never be
        switched off in the obvious way.
        """
        text = value.strip().lower()
        if text in ("true", "t", "yes", "y", "1", "on"):
            return True
        # The empty string stays accepted as false: it was the only value that
        # disabled the option under the previous ``type=bool`` behaviour.
        if text in ("false", "f", "no", "n", "0", "off", ""):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")

    parser = get_default_parser()
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        default="google/vit-base-patch16-224",
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        default="./output_model",
        help="The path of your saved model after finetuning.",
    )
    parser.add_argument(
        "--plugin",
        type=str,
        default="gemini",
        help="Plugin to use. Valid plugins include 'torch_ddp','torch_ddp_fp16','gemini','low_level_zero', 'hybrid_parallel'.",
    )
    parser.add_argument("--num_epoch", type=int, default=3, help="Number of epochs.")
    parser.add_argument(
        "--batch_size",
        type=int,
        default=32,
        help="Batch size (per dp group) for the training dataloader.",
    )
    parser.add_argument(
        "--tp_size",
        type=int,
        default=1,
        help="The size along tensor parallel dimension, only be used when enabling hybrid parallel.",
    )
    parser.add_argument(
        "--pp_size",
        type=int,
        default=1,
        help="The size along pipeline parallel dimension, only be used when enabling hybrid parallel.",
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=3e-4,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--warmup_ratio",
        type=float,
        default=0.3,
        help="Ratio of warmup steps against total training steps.",
    )
    parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay to use.")
    parser.add_argument(
        "--grad_checkpoint",
        type=parse_bool,
        default=True,
        help="Whether to use gradient checkpointing.",
    )
    parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.")

    args = parser.parse_args()
    return args


def parse_benchmark_args():
    """Parse the command-line arguments of the ViT benchmark script.

    The parser is obtained from ``get_default_parser()`` and extended with the
    benchmark-specific options registered below.

    Returns:
        The namespace returned by ``parser.parse_args()``.
    """
    # Local import: only the boolean converter below needs it.
    import argparse

    def parse_bool(value):
        """Convert a command-line string into a real boolean.

        ``type=bool`` must not be used for this: ``bool("False")`` is ``True``
        because every non-empty string is truthy, so the flag could never be
        switched off in the obvious way.
        """
        text = value.strip().lower()
        if text in ("true", "t", "yes", "y", "1", "on"):
            return True
        # The empty string stays accepted as false: it was the only value that
        # disabled the option under the previous ``type=bool`` behaviour.
        if text in ("false", "f", "no", "n", "0", "off", ""):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")

    parser = get_default_parser()
    parser.add_argument(
        "--model_name_or_path",
        type=str,
        default="google/vit-base-patch16-224",
        help="Path to a pretrained model or model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--plugin",
        type=str,
        default="gemini",
        help="Plugin to use. Valid plugins include 'torch_ddp','torch_ddp_fp16','gemini','low_level_zero', 'hybrid_parallel'.",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=8,
        help="Batch size (per dp group) for the training dataloader.",
    )
    parser.add_argument("--num_labels", type=int, default=10, help="Number of labels for classification.")
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=5e-5,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
    parser.add_argument(
        "--grad_checkpoint",
        type=parse_bool,
        default=True,
        help="Whether to use gradient checkpointing.",
    )
    parser.add_argument("--max_train_steps", type=int, default=20, help="Total number of training steps to perform.")
    parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.")
    parser.add_argument("--mem_cap", type=int, default=0, help="Limit on the usage of space for each GPU (in GB).")

    args = parser.parse_args()
    return args
22 changes: 14 additions & 8 deletions examples/images/vit/data.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,38 @@
import torch
from torch.utils.data import Dataset
from datasets import load_dataset
from torch.utils.data import Dataset


class BeansDataset(Dataset):
    """Map-style dataset over one split of the ``beans`` dataset.

    Every example of the split is run through ``image_processor`` once, in
    ``__init__``, and the processed results are kept in ``self.inputs``.

    Args:
        image_processor: callable invoked as
            ``image_processor(image, return_tensors='pt')``; its result must
            support item assignment (a ``'labels'`` entry is added to it).
        tp_size: the label list is padded with placeholder names until its
            length is a multiple of this value. Must be at least 1.
        split: name of the dataset split to load.
    """

    def __init__(self, image_processor, tp_size=1, split='train'):
        super().__init__()
        self.image_processor = image_processor
        self.ds = load_dataset('beans')[split]
        # Pad a copy: appending to ``features['labels'].names`` directly would
        # silently alter the loaded dataset's own label metadata.
        self.label_names = self._pad_label_names(self.ds.features['labels'].names, tp_size)
        self.num_labels = len(self.label_names)
        self.inputs = [self.process_example(example) for example in self.ds]

    @staticmethod
    def _pad_label_names(label_names, tp_size):
        """Return a new list of label names padded to a multiple of ``tp_size``.

        Placeholders are named ``pad_label_<index>`` where ``<index>`` is the
        position the placeholder takes in the list. The input is not modified.

        Raises:
            ValueError: if ``tp_size`` is smaller than 1.
        """
        if tp_size < 1:
            raise ValueError(f"tp_size must be a positive integer, got {tp_size}")
        padded = list(label_names)
        first_pad = len(padded)
        shortfall = -first_pad % tp_size
        padded.extend(f"pad_label_{idx}" for idx in range(first_pad, first_pad + shortfall))
        return padded

    def __len__(self):
        """Return the number of preprocessed examples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the preprocessed example stored at position ``idx``."""
        return self.inputs[idx]

    def process_example(self, example):
        """Run the image processor on ``example['image']`` and attach its label.

        Returns:
            The image processor's output with ``example['labels']`` stored
            under the ``'labels'`` key.
        """
        processed = self.image_processor(example['image'], return_tensors='pt')
        processed['labels'] = example['labels']
        return processed


def beans_collator(batch):
    """Merge a list of processed examples into a single batch dictionary.

    Args:
        batch: sequence of mappings, each holding a ``'pixel_values'`` tensor
            and an integer ``'labels'`` entry.

    Returns:
        A dict with ``'pixel_values'`` (the per-example tensors concatenated
        along dimension 0) and ``'labels'`` (an ``int64`` tensor with one entry
        per example).
    """
    # NOTE(review): concatenation along dim 0 presumably relies on each example
    # carrying a leading batch dimension of size 1 - confirm against the image processor.
    pixel_values = torch.cat([example['pixel_values'] for example in batch], dim=0)
    labels = torch.tensor([example['labels'] for example in batch], dtype=torch.int64)
    return {'pixel_values': pixel_values, 'labels': labels}
11 changes: 4 additions & 7 deletions examples/images/vit/run_benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,20 @@ export BS=8
export MEMCAP=0
export GPUNUM=1

# Benchmark every batch size / plugin combination.
# Each run launches 4 processes on this node.
MODEL_PATH="google/vit-base-patch16-224"

for BS in 8 32
do
    for PLUGIN in "torch_ddp" "torch_ddp_fp16" "low_level_zero" "gemini" "hybrid_parallel"
    do
        torchrun \
          --standalone \
          --nproc_per_node 4 \
          vit_benchmark.py \
          --model_name_or_path "${MODEL_PATH}" \
          --mem_cap "${MEMCAP}" \
          --plugin "${PLUGIN}" \
          --batch_size "${BS}"
    done
done
13 changes: 10 additions & 3 deletions examples/images/vit/run_demo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,21 @@ pip install -r requirements.txt
MODEL="google/vit-base-patch16-224"

# path for saving model
OUTPUT_PATH="./output_model.bin"
OUTPUT_PATH="./output_model"

# plugin(training strategy)
# can only be one of "torch_ddp"/"torch_ddp_fp16"/"low_level_zero"/"gemini"
# can only be one of "torch_ddp"/"torch_ddp_fp16"/"low_level_zero"/"gemini"/"hybrid_parallel"
PLUGIN="gemini"
#PLUGIN="hybrid_parallel"

# configuration of parallel group sizes, only used when setting PLUGIN to "hybrid_parallel"
TP_SIZE=2
PP_SIZE=2

# number of gpus to use
GPUNUM=4

# batch size per gpu
# batch size per data parallel group
BS=16

# learning rate
Expand All @@ -38,6 +43,8 @@ torchrun \
--output_path ${OUTPUT_PATH} \
--plugin ${PLUGIN} \
--batch_size ${BS} \
--tp_size ${TP_SIZE} \
--pp_size ${PP_SIZE} \
--num_epoch ${EPOCH} \
--learning_rate ${LR} \
--weight_decay ${WEIGHT_DECAY} \
Expand Down
7 changes: 2 additions & 5 deletions examples/images/vit/test_ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,15 @@ set -xe
pip install -r requirements.txt

# Run the benchmark script once per plugin, with 4 processes on this node.
BS=8
for PLUGIN in "torch_ddp" "torch_ddp_fp16" "low_level_zero" "gemini" "hybrid_parallel"
do
    torchrun \
      --standalone \
      --nproc_per_node 4 \
      vit_benchmark.py \
      --model_name_or_path "google/vit-base-patch16-224" \
      --plugin "${PLUGIN}" \
      --batch_size "${BS}"
done
Loading

0 comments on commit 295b38f

Please sign in to comment.