Skip to content

Commit

Permalink
revert f32 KQ to bring speed back
Browse files Browse the repository at this point in the history
  • Loading branch information
YellowRoseCx committed Nov 6, 2024
1 parent 8d3449d commit 5ac2de7
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 3 deletions.
2 changes: 1 addition & 1 deletion koboldcpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
modelbusy = threading.Lock()
requestsinqueue = 0
defaultport = 5001
-KcppVersion = "1.77.yr0-ROCm"
+KcppVersion = "1.77.yr1-ROCm"
showdebug = True
guimode = False
showsamplerwarning = True
Expand Down
4 changes: 2 additions & 2 deletions src/llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9670,7 +9670,7 @@ static struct ggml_tensor * llm_build_kqv(
cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);

-ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
+// ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);

cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
} else {
Expand All @@ -9679,7 +9679,7 @@ static struct ggml_tensor * llm_build_kqv(

// note: this op tends to require high floating point range
// while for some models F16 is enough, for others it is not, so we default to F32 here
-ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
+// ggml_mul_mat_set_prec(kq, GGML_PREC_F32);

if (model.arch == LLM_ARCH_GROK) {
// need to do the following:
Expand Down

0 comments on commit 5ac2de7

Please sign in to comment.