Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enabled high-performance Automatic Tensor Parallelism (auto TP) for the Qwen2-MoE and DeepSeek-V2 models on multiple GPUs/HPUs #6964

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 33 additions & 16 deletions deepspeed/module_inject/auto_tp.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,8 @@ def __init__(self,
self.keep_module_on_host = keep_module_on_host

def in_module_list(module, module_list):
if 'DeepseekV2' in str(type(module)):
return False
for item in module_list:
if type(item).__name__ == type(module).__name__:
return True
Expand Down Expand Up @@ -258,7 +260,7 @@ def update_policy_list(policy_list, new_module, new_gems):
for i, policy in enumerate(policy_list):
# if module already exists in policy, combine gems and remove duplicates
if policy[0] == type(new_module):
new_gems = set(new_gems + policy[1])
new_gems = list(set(new_gems + policy[1]))
policy_list[i] = tuple([type(new_module), new_gems])
return policy_list
policy_list.append(tuple([type(new_module), new_gems]))
Expand Down Expand Up @@ -306,7 +308,13 @@ def tp_parser(model):
gem_list = gem_list + [layer]
elif 'o_proj' in layer:
gem_list = gem_list + [layer]
elif 'down_proj' in layer:
elif 'down_proj' in layer and not (('DeepseekV2' in str(type(module))) or
('qwen2_moe' in str(type(module)))):
gem_list = gem_list + [layer]
elif 'shared_experts.down_proj' in layer and (('DeepseekV2' in str(type(module))) or
('qwen2_moe' in str(type(module)))):
gem_list = gem_list + [layer]
elif 'mlp.down_proj' in layer and ('DeepseekV2' in str(type(module))):
gem_list = gem_list + [layer]
elif 'attention.dense' in layer and 'GPTNeoX' in str(model):
gem_list = gem_list + [layer]
Expand Down Expand Up @@ -365,7 +373,8 @@ def _replace(self, child, name, conv_linear_layer):
arctic_w2_all_reduce_linear = True
# For MoE MLP model, e.g., deepseek and jamba
down_proj = False
if 'down_proj' in name:
#Deepseek processes different down_proj in different ways.
if 'down_proj' in name and 'DeepseekV2' not in str(type(self.module)):
down_proj = True
# For MLP including chunk layer.
if 'gate_up_proj' in name or ('dense_h_to_4h' in name and 'GLM' in str(self.module)):
Expand Down Expand Up @@ -409,20 +418,28 @@ def _replace(self, child, name, conv_linear_layer):
prepare_tp_fused_qkvw(self.module, child.bias.data, self.mp_size, mp_replace.gpu_index),
device_name, return_new_copy)
else:
data = child.weight.data.split(get_shard_size_list(weight_shape[0], self.mp_size, name),
dim=1 if self.conv_linear_layer else 0)
data_dc = move(data[mp_replace.gpu_index], device_name, return_new_copy).detach()
del data

if child.bias is not None:
bias_data = child.bias.data.split(get_shard_size_list(
weight_shape[1] if self.conv_linear_layer else weight_shape[0], self.mp_size, name),
dim=0)
bias_data = move(bias_data[mp_replace.gpu_index], device_name, return_new_copy)
bias_data_dc = torch.nn.parameter.Parameter(bias_data, requires_grad=False)
del bias_data
if ('shared_experts.down_proj' not in name and 'mlp.down_proj' not in name and 'down_proj' in name \
and ('DeepseekV2' in str(type(self.module)) or 'qwen2_moe' in str(type(self.module)))):
data = child.weight.data.split(get_shard_size_list(weight_shape[1], self.mp_size), dim=1)
data_dc = move(data[mp_replace.gpu_index], get_accelerator().current_device_name()).detach()
del data
bias_data_dc = None if child.bias is None else \
torch.nn.parameter.Parameter(move(child.bias, get_accelerator().current_device_name()))
else:
bias_data_dc = None
data = child.weight.data.split(get_shard_size_list(weight_shape[0], self.mp_size, name),
dim=1 if self.conv_linear_layer else 0)
data_dc = move(data[mp_replace.gpu_index], device_name, return_new_copy).detach()
del data

if child.bias is not None:
bias_data = child.bias.data.split(get_shard_size_list(
weight_shape[1] if self.conv_linear_layer else weight_shape[0], self.mp_size, name),
dim=0)
bias_data = move(bias_data[mp_replace.gpu_index], device_name, return_new_copy)
bias_data_dc = torch.nn.parameter.Parameter(bias_data, requires_grad=False)
del bias_data
else:
bias_data_dc = None

setattr(child, "replaced", True)
return LinearLayer(weight=torch.nn.parameter.Parameter(data_dc, requires_grad=False), bias=bias_data_dc)
Expand Down