Skip to content

Commit

Permalink
add fp8 kv cache for rocm
Browse files (browse the repository at this point in the history)
  • Loading branch information
mht-sharma committed Dec 18, 2024
1 parent 8f66d32 commit fa14d71
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 46 deletions.
52 changes: 42 additions & 10 deletions server/text_generation_server/layers/attention/kv_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,22 @@ def __init__(
device: torch.device,
):
"""Construct the key-value cache for a layer."""

if dtype in {torch.float8_e5m2, torch.float8_e4m3fn} and (
ATTENTION != "flashinfer" or SYSTEM != "cuda"
):
raise ValueError(
"FP8 KV cache is currently only supported for flashinfer on CUDA"
)
if dtype in {torch.float8_e5m2, torch.float8_e4m3fn}:
if (ATTENTION == "flashinfer" and SYSTEM == "cuda") or not (
ATTENTION == "paged" and SYSTEM == "rocm"
):
raise ValueError(
"FP8 KV cache is currently only supported for flashinfer on CUDA and paged attention on ROCM"
)
if SYSTEM == "rocm" and dtype == torch.float8_e5m2:
raise ValueError(
"float8_e5m2 FP8 KV cache is not supported on AMD Rocm"
)

self.kv_cache_dtype_str = "auto"
if SYSTEM == "rocm" and dtype == torch.float8_e4m3fn:
self.kv_cache_dtype_str = "fp8"
dtype = torch.uint8

element_size = torch.tensor([], dtype=dtype).element_size()
if SYSTEM == "ipex" and device.type == "xpu":
Expand Down Expand Up @@ -120,6 +129,16 @@ def can_scale(self, kv_scales: KVScales) -> bool:
"Using FP8 KV cache scales",
)
return True
elif (
self.kv_cache_dtype_str == "fp8"
and ATTENTION == "paged"
and SYSTEM == "rocm"
):
log_once(
logger.info,
"Using FP8 KV cache scales",
)
return True
else:
# We have scales, but not the correct FP8 cache type, so warn once.
log_once(
Expand Down Expand Up @@ -158,7 +177,7 @@ def store(
key_cache = self.kv_cache[0]
value_cache = self.kv_cache[1]

if self.can_scale(kv_scales):
if self.can_scale(kv_scales) and SYSTEM == "cuda":
if kv_scales.key_scale_cpu != 1.0:
key = fp8_quantize(
key.float(),
Expand Down Expand Up @@ -188,7 +207,16 @@ def store(
key_cache.view(-1, shape[-2], shape[-1])[slots] = key
value_cache.view(-1, shape[-2], shape[-1])[slots] = value
else:
paged_reshape_and_cache(key, value, key_cache, value_cache, slots)
paged_reshape_and_cache(
key,
value,
key_cache,
value_cache,
slots,
self.kv_cache_dtype_str,
kv_scales.key_scale_cpu,
kv_scales.value_scale_cpu,
)


def paged_reshape_and_cache(
Expand All @@ -197,7 +225,11 @@ def paged_reshape_and_cache(
key_cache: torch.Tensor,
value_cache: torch.Tensor,
slots: torch.Tensor,
kv_cache_dtype: str = "auto",
k_scale: float = 1.0,
v_scale: float = 1.0,
):

if SYSTEM == "cuda":
try:
import attention_kernels
Expand All @@ -216,7 +248,7 @@ def paged_reshape_and_cache(
f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
)
ops.reshape_and_cache(
key, value, key_cache, value_cache, slots, "auto", 1.0, 1.0
key, value, key_cache, value_cache, slots, kv_cache_dtype, k_scale, v_scale
)
elif SYSTEM == "ipex":
import intel_extension_for_pytorch as ipex
Expand Down
18 changes: 9 additions & 9 deletions server/text_generation_server/layers/attention/rocm.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,9 @@ def paged_attention(
block_size,
max_s,
None,
"auto",
1.0,
1.0,
kv_cache.kv_cache_dtype_str,
kv_scales.key_scale_cpu,
kv_scales.value_scale_cpu,
)
else:
# Run PagedAttention V2.
Expand Down Expand Up @@ -154,9 +154,9 @@ def paged_attention(
block_size,
max_s,
None,
"auto",
1.0,
1.0,
kv_cache.kv_cache_dtype_str,
kv_scales.key_scale_cpu,
kv_scales.value_scale_cpu,
)
else:
ops.paged_attention_rocm(
Expand All @@ -174,9 +174,9 @@ def paged_attention(
block_size,
max_s,
None,
"auto",
1.0,
1.0,
kv_cache.kv_cache_dtype_str,
kv_scales.key_scale_cpu,
kv_scales.value_scale_cpu,
None,
_PARTITION_SIZE,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -398,10 +398,16 @@ def forward(self, hidden_states, adapter_data):
return self.down_proj(out, adapter_data)
else:
gate_up_states = self.gate_up_proj(hidden_states, adapter_data)
gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
return self.down_proj(
self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data
output_shape = gate_up_states.shape[:-1] + (self.intermediate_size,)
out = torch.empty(
output_shape, dtype=gate_up_states.dtype, device=gate_up_states.device
)
ops.silu_and_mul(out, gate_up_states)
return self.down_proj(out, adapter_data)
# gate_up_states = gate_up_states.view(-1, 2, self.intermediate_size)
# return self.down_proj(
# self.act(gate_up_states[:, 0]) * gate_up_states[:, 1], adapter_data
# )


class FlashLlamaLayer(nn.Module):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -520,28 +520,68 @@ def forward(
lm_head_indices: Optional[torch.Tensor] = None,
adapter_data: Optional[torch.Tensor] = None,
) -> torch.Tensor:
true_max_s = max_s
if prefill_cache_indices is not None:
# Slots also need to be sliced as it has the same size as the whole kv tensor
slots = slots[prefill_cache_indices]
elif self.max_past is not None:
# Clamp in decode mode as paged attention requires clamped values whereas the flash attention
# kernel requires the true values
seqlen = seqlen.clamp(max=self.max_past_tensor)

hidden_states = self.model(
input_ids,
position_ids,
cu_seqlen_prefill,
kv_cache,
block_tables,
slots,
seqlen,
max_s,
true_max_s,
prefill_cache_indices,
)
if lm_head_indices is not None:
hidden_states = hidden_states[lm_head_indices]
logits = self.lm_head(hidden_states)

if (
torch.distributed.get_rank() == 0
and input_ids.shape[0] == 262144
and cu_seqlen_prefill is not None
):
with torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
record_shapes=True,
) as prof:
true_max_s = max_s
if prefill_cache_indices is not None:
# Slots also need to be sliced as it has the same size as the whole kv tensor
slots = slots[prefill_cache_indices]
elif self.max_past is not None:
# Clamp in decode mode as paged attention requires clamped values whereas the flash attention
# kernel requires the true values
seqlen = seqlen.clamp(max=self.max_past_tensor)

hidden_states = self.model(
input_ids,
position_ids,
cu_seqlen_prefill,
kv_cache,
block_tables,
slots,
seqlen,
max_s,
true_max_s,
prefill_cache_indices,
)
if lm_head_indices is not None:
hidden_states = hidden_states[lm_head_indices]
logits = self.lm_head(hidden_states)

prof.export_chrome_trace("/tgi/trace_mistral_prefill.json")
else:
true_max_s = max_s
if prefill_cache_indices is not None:
# Slots also need to be sliced as it has the same size as the whole kv tensor
slots = slots[prefill_cache_indices]
elif self.max_past is not None:
# Clamp in decode mode as paged attention requires clamped values whereas the flash attention
# kernel requires the true values
seqlen = seqlen.clamp(max=self.max_past_tensor)

hidden_states = self.model(
input_ids,
position_ids,
cu_seqlen_prefill,
kv_cache,
block_tables,
slots,
seqlen,
max_s,
true_max_s,
prefill_cache_indices,
)
if lm_head_indices is not None:
hidden_states = hidden_states[lm_head_indices]
logits = self.lm_head(hidden_states)
return logits

0 comments on commit fa14d71

Please sign in to comment.