diff --git a/docs/source/reference/launcher.md b/docs/source/reference/launcher.md index b1abd1ee19b..68e487d0a73 100644 --- a/docs/source/reference/launcher.md +++ b/docs/source/reference/launcher.md @@ -93,10 +93,10 @@ Options: ## KV_CACHE_DTYPE ```shell --kv-cache-dtype - Specify the dtype for the key-value cache. When this option is not provided, the dtype of the model is used (typically `float16` or `bfloat16`). Currently the only supported value is `fp8_e5m2` on CUDA + Specify the dtype for the key-value cache. When this option is not provided, the dtype of the model is used (typically `float16` or `bfloat16`). Currently the only supported values are `fp8_e4m3fn` and `fp8_e5m2` on CUDA [env: KV_CACHE_DTYPE=] - [possible values: fp8_e5m2] + [possible values: fp8_e4m3fn, fp8_e5m2] ``` ## TRUST_REMOTE_CODE diff --git a/launcher/src/main.rs b/launcher/src/main.rs index ee259e4341d..55c7a74381f 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -303,6 +303,9 @@ impl std::fmt::Display for Dtype { #[derive(Clone, Copy, Debug, ValueEnum)] enum KVCacheDtype { + #[clap(name = "fp8_e4m3fn")] + Fp8e4m3fn, + #[clap(name = "fp8_e5m2")] Fp8e5m2, } @@ -310,6 +313,9 @@ enum KVCacheDtype { impl std::fmt::Display for KVCacheDtype { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { + KVCacheDtype::Fp8e4m3fn => { + write!(f, "fp8_e4m3fn") + } KVCacheDtype::Fp8e5m2 => { write!(f, "fp8_e5m2") } @@ -420,7 +426,7 @@ struct Args { /// Specify the dtype for the key-value cache. When this option is not provided, /// the dtype of the model is used (typically `float16` or `bfloat16`). Currently - /// the only supported value is `fp8_e5m2` on CUDA. + /// the only supported values are `fp8_e4m3fn` and `fp8_e5m2` on CUDA. 
#[clap(long, env, value_enum)] kv_cache_dtype: Option, diff --git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py index db390234e43..a363b33a89a 100644 --- a/server/text_generation_server/cli.py +++ b/server/text_generation_server/cli.py @@ -31,6 +31,7 @@ class Dtype(str, Enum): class KVCacheDtype(str, Enum): + fp8_e4m3fn = "fp8_e4m3fn" fp8_e5m2 = "fp8_e5m2" diff --git a/server/text_generation_server/layers/attention/kv_cache.py b/server/text_generation_server/layers/attention/kv_cache.py index 3960c954985..f1f9ecce01a 100644 --- a/server/text_generation_server/layers/attention/kv_cache.py +++ b/server/text_generation_server/layers/attention/kv_cache.py @@ -24,11 +24,13 @@ def __init__( ): """Construct the key-value cache for a layer.""" - if dtype == torch.float8_e5m2 and ( - ATTENTION != "flashinfer" or SYSTEM != "cuda" + if ( + dtype.itemsize == 1 + and dtype.is_floating_point + and (ATTENTION != "flashinfer" or SYSTEM != "cuda") ): raise ValueError( - "float8_e5m2 KV cache is currently only supported for flashinfer on CUDA" + "FP8 KV cache is currently only supported for flashinfer on CUDA" ) element_size = torch.tensor([], dtype=dtype).element_size() @@ -105,8 +107,8 @@ def store( # TODO: add scale key = key.to(key_cache.dtype) value = value.to(value_cache.dtype) - if key_cache.dtype == torch.float8_e5m2: - # Torch index_put does not support float8_e5m2 yet, so + if key_cache.dtype in {torch.float8_e5m2, torch.float8_e4m3fn}: + # Torch index_put does not support float8_{e5m2,e4m3fn} yet, so # put as raw data instead. 
key_cache = key_cache.view(torch.uint8) value_cache = value_cache.view(torch.uint8) diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 17eed976467..427ffe6d3b7 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -406,6 +406,8 @@ def get_model( if kv_cache_dtype is None: kv_cache_dtype = dtype + elif kv_cache_dtype == "fp8_e4m3fn": + kv_cache_dtype = torch.float8_e4m3fn elif kv_cache_dtype == "fp8_e5m2": kv_cache_dtype = torch.float8_e5m2 else: