diff --git a/vllm/config.py b/vllm/config.py
index 3e736c990abd7..0b8f0b0b72e1f 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -251,7 +251,8 @@ def get_hidden_size(self) -> int:
 
     def get_head_size(self) -> int:
         # TODO remove hard code
-        if hasattr(self.hf_text_config, "model_type") and self.hf_text_config.model_type=='deepseek_v2':
+        if hasattr(self.hf_text_config, "model_type"
+                   ) and self.hf_text_config.model_type == 'deepseek_v2':
             # FlashAttention suports only head_size 32, 64, 128, 256, we need to pad head_size 192 to 256
             return 256
         if hasattr(self.hf_text_config, "head_dim"):
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 9075596adf221..27a5df67ef18c 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -314,7 +314,7 @@ def fused_topk(
     topk: int,
     renormalize: bool,
     num_expert_group: int = 0,
-    topk_group: int = 0, 
+    topk_group: int = 0,
 ):
     assert hidden_states.shape[0] == gating_output.shape[0], (
         "Number of tokens mismatch")
@@ -332,13 +332,13 @@ def fused_topk(
         import vllm._moe_C as moe_kernels
 
         topk_weights = torch.empty(M,
-                               topk,
-                               dtype=torch.float32,
-                               device=hidden_states.device)
+                                   topk,
+                                   dtype=torch.float32,
+                                   device=hidden_states.device)
         topk_ids = torch.empty(M,
-                           topk,
-                           dtype=torch.int32,
-                           device=hidden_states.device)
+                               topk,
+                               dtype=torch.int32,
+                               device=hidden_states.device)
         token_expert_indicies = torch.empty(M,
                                             topk,
                                             dtype=torch.int32,
@@ -351,15 +351,25 @@ def fused_topk(
         )
         del token_expert_indicies  # Not used. Will be used in the future.
     else:
-        scores = torch.softmax(gating_output, dim = -1)
+        scores = torch.softmax(gating_output, dim=-1)
         num_token = scores.shape[0]
-        group_scores = scores.view(num_token, num_expert_group, -1).max(dim=-1).values # [n, n_group]
-        group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[1] # [n, top_k_group]
+        group_scores = scores.view(num_token, num_expert_group,
+                                   -1).max(dim=-1).values  # [n, n_group]
+        group_idx = torch.topk(group_scores,
+                               k=topk_group,
+                               dim=-1,
+                               sorted=False)[1]  # [n, top_k_group]
         group_mask = torch.zeros_like(group_scores)  # [n, n_group]
         group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
-        score_mask = group_mask.unsqueeze(-1).expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group).reshape(num_token, -1) # [n, e]
+        score_mask = group_mask.unsqueeze(-1).expand(
+            num_token, num_expert_group,
+            scores.shape[-1] // num_expert_group).reshape(num_token,
+                                                          -1)  # [n, e]
         tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
-        topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
+        topk_weights, topk_ids = torch.topk(tmp_scores,
+                                            k=topk,
+                                            dim=-1,
+                                            sorted=False)
 
     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
@@ -523,7 +533,8 @@ def fused_moe(
     assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch"
 
     topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
-                                        renormalize, num_expert_group, topk_group)
+                                        renormalize, num_expert_group,
+                                        topk_group)
     return fused_experts(hidden_states,
                          w1,
                          w2,
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 0b2f90402d383..711658d649e33 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -488,8 +488,9 @@ def __init__(
         self.beta_slow = beta_slow
         # Get n-d magnitude scaling corrected for interpolation.
         self.mscale = float(
-            yarn_get_mscale(self.scaling_factor, float(mscale))
-            / yarn_get_mscale(self.scaling_factor, float(mscale_all_dim)) * attn_factor)
+            yarn_get_mscale(self.scaling_factor, float(mscale)) /
+            yarn_get_mscale(self.scaling_factor, float(mscale_all_dim)) *
+            attn_factor)
 
         super().__init__(head_size, rotary_dim, max_position_embeddings, base,
                          is_neox_style)
@@ -505,7 +506,8 @@ def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
                                                 self.max_position_embeddings)
         # Get n-d rotational scaling corrected for extrapolation
         inv_freq_mask = (1 - _yarn_linear_ramp_mask(
-            low, high, self.rotary_dim // 2, dtype=torch.float)) * self.extrapolation_factor
+            low, high, self.rotary_dim // 2,
+            dtype=torch.float)) * self.extrapolation_factor
         inv_freq = inv_freq_interpolation * (
             1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask
         return inv_freq
@@ -522,7 +524,6 @@ def _compute_cos_sin_cache(self) -> torch.Tensor:
         print("Cache shape", cache.shape)
         return cache
 
-
     def forward(
         self,
         positions: torch.Tensor,
@@ -562,7 +563,8 @@ def forward(
             query = query_rot
             key = key_rot
         return query, key
-    
+
+
 _ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {}
 
 
@@ -627,11 +629,9 @@ def get_rope(
                 if k in ("extrapolation_factor", "attn_factor", "beta_fast",
                          "beta_slow", "mscale", "mscale_all_dim")
             }
-            rotary_emb = DeepseekScalingRotaryEmbedding(head_size, rotary_dim,
-                                                        original_max_position,
-                                                        base, is_neox_style,
-                                                        scaling_factor,
-                                                        **extra_kwargs)
+            rotary_emb = DeepseekScalingRotaryEmbedding(
+                head_size, rotary_dim, original_max_position, base,
+                is_neox_style, scaling_factor, **extra_kwargs)
         elif scaling_type == "su":
             short_factor = rope_scaling["short_factor"]
             long_factor = rope_scaling["long_factor"]
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 9070c01a37a28..05cd7a135f14e 100755
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -20,10 +20,8 @@
     "CohereForCausalLM": ("commandr", "CohereForCausalLM"),
     "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"),
     "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),
-    "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
     "DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"),
-    "FalconForCausalLM": ("falcon", "FalconForCausalLM"),
     "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
     "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 705bc9b118d1d..e3bee8b478f78 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -103,10 +103,10 @@ def __init__(
 
         self.experts = nn.ModuleList([
             DeepseekV2MLP(hidden_size=config.hidden_size,
-                        intermediate_size=config.moe_intermediate_size,
-                        hidden_act=config.hidden_act,
-                        quant_config=quant_config,
-                        reduce_results=False)
+                          intermediate_size=config.moe_intermediate_size,
+                          hidden_act=config.hidden_act,
+                          quant_config=quant_config,
+                          reduce_results=False)
             for idx in range(self.n_routed_experts)
         ])
         self.pack_params()
@@ -153,15 +153,16 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
             shared_output = self.shared_experts(hidden_states)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = fused_moe(hidden_states,
-                                        self.w1,
-                                        self.w2,
-                                        router_logits,
-                                        self.top_k,
-                                        renormalize=self.config.norm_topk_prob,
-                                        inplace=True,
-                                        num_expert_group = self.config.n_group,
-                                        topk_group=self.config.topk_group) * self.routed_scaling_factor
+        final_hidden_states = fused_moe(
+            hidden_states,
+            self.w1,
+            self.w2,
+            router_logits,
+            self.top_k,
+            renormalize=self.config.norm_topk_prob,
+            inplace=True,
+            num_expert_group=self.config.n_group,
+            topk_group=self.config.topk_group) * self.routed_scaling_factor
         if self.config.n_shared_experts is not None:
             final_hidden_states = final_hidden_states + shared_output
         final_hidden_states = tensor_model_parallel_all_reduce(
@@ -169,24 +170,31 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return final_hidden_states.view(num_tokens, hidden_dim)
 
+
 def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
     import math
     if scale <= 1:
         return 1.0
     return 0.1 * mscale * math.log(scale) + 1.0
 
+
 class DeepseekV2Attention(nn.Module):
+
     def __init__(
         self,
         config: PretrainedConfig,
-        hidden_size: int, num_heads: int,
-        qk_nope_head_dim: int, qk_rope_head_dim: int, v_head_dim: int,
-        q_lora_rank: int, kv_lora_rank: int,
+        hidden_size: int,
+        num_heads: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: int,
+        kv_lora_rank: int,
         rope_theta: float = 10000,
         rope_scaling: Optional[Dict[str, Any]] = None,
         max_position_embeddings: int = 8192,
         quant_config: Optional[QuantizationConfig] = None,
-        layer_idx = None,
+        layer_idx=None,
     ) -> None:
         super().__init__()
         self.layer_idx = layer_idx
@@ -206,44 +214,48 @@ def __init__(
         self.max_position_embeddings = max_position_embeddings
 
         if self.q_lora_rank is not None:
-            self.q_a_proj = ReplicatedLinear(
-                self.hidden_size, self.q_lora_rank,
-                bias=False, quant_config=quant_config
-            )
-            self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
-            self.q_b_proj = ColumnParallelLinear(
-                q_lora_rank, self.num_heads * self.qk_head_dim,
-                bias=False, quant_config=quant_config
-            )
+            self.q_a_proj = ReplicatedLinear(self.hidden_size,
+                                             self.q_lora_rank,
+                                             bias=False,
+                                             quant_config=quant_config)
+            self.q_a_layernorm = RMSNorm(self.q_lora_rank,
+                                         eps=config.rms_norm_eps)
+            self.q_b_proj = ColumnParallelLinear(q_lora_rank,
+                                                 self.num_heads *
+                                                 self.qk_head_dim,
+                                                 bias=False,
+                                                 quant_config=quant_config)
         else:
-            self.q_proj = ColumnParallelLinear(
-                self.hidden_size, self.num_heads * self.qk_head_dim,
-                bias=False, quant_config=quant_config
-            )
-
-        self.kv_a_proj_with_mqa = ReplicatedLinear(
-            self.hidden_size, self.kv_lora_rank + self.qk_rope_head_dim,
-            bias=False, quant_config=quant_config
-        )
-        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
+            self.q_proj = ColumnParallelLinear(self.hidden_size,
+                                               self.num_heads *
+                                               self.qk_head_dim,
+                                               bias=False,
+                                               quant_config=quant_config)
+
+        self.kv_a_proj_with_mqa = ReplicatedLinear(self.hidden_size,
+                                                   self.kv_lora_rank +
+                                                   self.qk_rope_head_dim,
+                                                   bias=False,
+                                                   quant_config=quant_config)
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank,
+                                      eps=config.rms_norm_eps)
         self.kv_b_proj = ColumnParallelLinear(
-            self.kv_lora_rank, self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
-            bias=False, quant_config=quant_config
-        )
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            quant_config=quant_config)
         # O projection.
-        self.o_proj = RowParallelLinear(
-            self.num_heads * self.v_head_dim, self.hidden_size,
-            bias=False, quant_config=quant_config
-        )
+        self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim,
+                                        self.hidden_size,
+                                        bias=False,
+                                        quant_config=quant_config)
         rope_scaling['type'] = 'deepseek_yarn'
-        self.rotary_emb = get_rope(
-            qk_rope_head_dim,
-            rotary_dim=qk_rope_head_dim,
-            max_position=max_position_embeddings,
-            base=rope_theta,
-            rope_scaling=rope_scaling,
-            is_neox_style=False
-        )
+        self.rotary_emb = get_rope(qk_rope_head_dim,
+                                   rotary_dim=qk_rope_head_dim,
+                                   max_position=max_position_embeddings,
+                                   base=rope_theta,
+                                   rope_scaling=rope_scaling,
+                                   is_neox_style=False)
 
         if rope_scaling:
             mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
@@ -255,7 +267,7 @@ def __init__(
         # self.qk_head_dim,
         # self.scaling,
         # num_kv_heads=self.num_heads)
-        
+
         # TODO, support head_size 192
         self.attn = Attention(self.num_local_heads,
                               256,
@@ -272,16 +284,21 @@ def forward(
         if self.q_lora_rank is not None:
             q = self.q_a_proj(hidden_states)[0]
             q = self.q_a_layernorm(q)
-            q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim)
+            q = self.q_b_proj(q)[0].view(-1, self.num_local_heads,
+                                         self.qk_head_dim)
         else:
-            q = self.q_proj(hidden_states)[0].view(-1, self.num_local_heads, self.qk_head_dim)
-        q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+            q = self.q_proj(hidden_states)[0].view(-1, self.num_local_heads,
+                                                   self.qk_head_dim)
+        q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim],
+                               dim=-1)
         latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0]
-        kv_a, _ = latent_cache.split([self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+        kv_a, _ = latent_cache.split(
+            [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
         latent_cache = latent_cache.unsqueeze(1)
         kv_a = self.kv_a_layernorm(kv_a.contiguous())
         kv = self.kv_b_proj(kv_a)[0]
-        kv = kv.view(-1, self.num_local_heads, self.qk_nope_head_dim + self.v_head_dim)
+        kv = kv.view(-1, self.num_local_heads,
+                     self.qk_nope_head_dim + self.v_head_dim)
         k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)
         k_pe = latent_cache[:, :, self.kv_lora_rank:]
         q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
@@ -289,11 +306,19 @@ def forward(
         k = torch.empty_like(q)
         k[..., :self.qk_nope_head_dim] = k_nope
         k[..., self.qk_nope_head_dim:] = k_pe
-        q = torch.nn.functional.pad(q, [0, 256 - self.qk_head_dim], value = 0).view(-1, self.num_local_heads * 256)
-        k = torch.nn.functional.pad(k, [0, 256 - self.qk_head_dim], value = 0).view(-1, self.num_local_heads * 256)
-        v = torch.nn.functional.pad(v, [0, 256 - self.v_head_dim], value = 0).view(-1, self.num_local_heads * 256)
+        q = torch.nn.functional.pad(q, [0, 256 - self.qk_head_dim],
+                                    value=0).view(-1,
+                                                  self.num_local_heads * 256)
+        k = torch.nn.functional.pad(k, [0, 256 - self.qk_head_dim],
+                                    value=0).view(-1,
+                                                  self.num_local_heads * 256)
+        v = torch.nn.functional.pad(v, [0, 256 - self.v_head_dim],
+                                    value=0).view(-1,
+                                                  self.num_local_heads * 256)
         attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
-        attn_output = attn_output.view(-1, self.num_local_heads, 256)[..., :self.v_head_dim].reshape(-1, self.num_local_heads * self.v_head_dim)
+        attn_output = attn_output.view(
+            -1, self.num_local_heads, 256)[..., :self.v_head_dim].reshape(
+                -1, self.num_local_heads * self.v_head_dim)
         output, _ = self.o_proj(attn_output)
         return output
 
@@ -319,13 +344,14 @@ def __init__(
             qk_nope_head_dim=config.qk_nope_head_dim,
             qk_rope_head_dim=config.qk_rope_head_dim,
             v_head_dim=config.v_head_dim,
-            q_lora_rank=config.q_lora_rank if hasattr(config, "q_lora_rank") else None,
+            q_lora_rank=config.q_lora_rank
+            if hasattr(config, "q_lora_rank") else None,
             kv_lora_rank=config.kv_lora_rank,
             rope_theta=rope_theta,
             rope_scaling=rope_scaling,
             max_position_embeddings=max_position_embeddings,
             quant_config=quant_config,
-            layer_idx = layer_idx,
+            layer_idx=layer_idx,
         )
         if (config.n_routed_experts is not None
                 and layer_idx >= config.first_k_dense_replace
@@ -390,7 +416,9 @@ def __init__(
             config.hidden_size,
         )
         self.layers = nn.ModuleList([
-            DeepseekV2DecoderLayer(config, layer_idx, quant_config=quant_config)
+            DeepseekV2DecoderLayer(config,
+                                   layer_idx,
+                                   quant_config=quant_config)
             for layer_idx in range(config.num_hidden_layers)
         ])
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)