diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index e48372d92ebe23..a98bd49c37e0cf 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -1,7 +1,28 @@ +/* + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + #include "aclnn_ops.h" #include -#include #include #include #include @@ -11,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -28,14 +50,24 @@ #include #include -#include #include +#include + #include "kernels/ascendc_kernels.h" +/** + * @brief Repeats elements of a tensor along each dimension according to the + * specified repeat array. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor to be repeated. + * @param acl_dst The destination tensor after repeating. + * @param repeat_array The array specifying the number of repetitions along each + * dimension. + */ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst, int64_t* repeat_array) { + aclTensor* acl_dst, int64_t* repeat_array) { // repeat tensor along each dim with repeat_array - aclIntArray* repeats = aclCreateIntArray(repeat_array, GGML_MAX_DIMS); uint64_t workspaceSize = 0; @@ -46,15 +78,15 @@ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src, &workspaceSize, &executor)); if (workspaceSize > 0) { - // Memory from allocator will "free" immediately, but this memory - // will be distribute to other pointers, but it won't access before - // this async task end. - // All tasks in same stream will execute in queue. + // Memory from allocator will "free" immediately, and this memory + // will be alloced to other pointers, but it won't access before + // this async task end because all tasks in same stream will execute + // in queue. ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); workspaceAddr = workspace_allocator.get(); } - aclrtStream stream = ctx.stream(); - ACL_CHECK(aclnnRepeat(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK( + aclnnRepeat(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyIntArray(repeats)); } @@ -62,11 +94,6 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src = dst->src[0]; GGML_ASSERT(ggml_can_repeat(src, dst)); - size_t nbytes = ggml_nbytes(dst); - aclrtStream main_stream = ctx.stream(); - // Set dst to a zero tensor. 
- ACL_CHECK(aclrtMemsetAsync(dst->data, nbytes, 0, nbytes, main_stream)); - aclTensor* acl_src = create_acl_tensor(src); aclTensor* acl_dst = create_acl_tensor(dst); @@ -78,10 +105,21 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclDestroyTensor(acl_dst)); } +/** + * @brief Adds two tensors element-wise and stores the result in a destination + * tensor. + * + * This function performs the operation: dst = acl_src0 + alpha * acl_src1 + * where alpha is a scalar value. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src0 The first source tensor. + * @param acl_src1 The second source tensor. + * @param acl_dst The destination tensor where the result will be stored. + */ static void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0, - aclTensor* acl_src1, aclTensor* acl_dst) { + aclTensor* acl_src1, aclTensor* acl_dst) { // add: dst = acl_src0 + alpha*acl_src1 - aclScalar* alpha = nullptr; float alphaValue = 1.0f; alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); @@ -97,8 +135,7 @@ static void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0, workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); - ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, main_stream)); + ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyScalar(alpha)); } @@ -156,22 +193,31 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) { workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); ACL_CHECK( - aclnnLeakyRelu(workspaceAddr, workspaceSize, executor, main_stream)); + aclnnLeakyRelu(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyScalar(acl_negative_slope)); ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_dst)); } -static void aclnn_concat(ggml_backend_cann_context& ctx, aclTensorList* tensorList, - aclTensor* acl_dst, int64_t concat_dim) { +/** + * @brief Concatenates a list of tensors along a specified dimension and stores + * the result in a destination tensor. + * + * @param ctx The context for the CANN backend operations. + * @param tensorList The list of tensors to be concatenated. + * @param acl_dst The destination tensor where the concatenated result will be + * stored. + * @param concat_dim The dimension along which the tensors will be concatenated. + */ +static void aclnn_concat(ggml_backend_cann_context& ctx, + aclTensorList* tensorList, aclTensor* acl_dst, + int64_t concat_dim) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; - // dims in llama.cpp is reversed. ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, concat_dim, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { @@ -179,8 +225,7 @@ static void aclnn_concat(ggml_backend_cann_context& ctx, aclTensorList* tensorLi workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); - ACL_CHECK(aclnnCat(workspaceAddr, workspaceSize, executor, main_stream)); + ACL_CHECK(aclnnCat(workspaceAddr, workspaceSize, executor, ctx.stream())); } void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { @@ -199,10 +244,23 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclDestroyTensor(acl_dst)); } +/** + * @brief Creates a tensor with values starting from `start`, incremented by + * `step`, and ending before `stop`. 
+ * + * This function performs the operation: [start, stop), out(i+1) = out(i) + + * step. + * + * @param ctx The context for the CANN backend operations. + * @param acl_dst The destination tensor where the values will be stored. + * @param start The starting value of the range. + * @param stop The ending value of the range (exclusive). + * @param step The step size between consecutive values. + * @param n_elements The number of elements in the destination tensor. + */ static void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst, - float start, float stop, float step, int64_t n_elements) { - // arange: [start, stop), out(i+1) = out(i) + step. - + float start, float stop, float step, + int64_t n_elements) { int64_t steps = (int64_t)std::ceil((stop - start) / step); GGML_ASSERT(n_elements == steps); @@ -221,8 +279,8 @@ static void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst, workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); - ACL_CHECK(aclnnArange(workspaceAddr, workspaceSize, executor, main_stream)); + ACL_CHECK( + aclnnArange(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyScalar(acl_start)); ACL_CHECK(aclDestroyScalar(acl_end)); @@ -252,8 +310,6 @@ void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - // y = max(min(x, max_value), min_value). - ggml_tensor* src = dst->src[0]; GGML_ASSERT(src->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); @@ -280,8 +336,7 @@ void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) { workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); - ACL_CHECK(aclnnClamp(workspaceAddr, workspaceSize, executor, main_stream)); + ACL_CHECK(aclnnClamp(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyScalar(acl_min)); ACL_CHECK(aclDestroyScalar(acl_max)); @@ -290,8 +345,6 @@ void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - // acl_dst = acl_src * scale. 
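For orientation while reading the hunks above and below: nearly every operator in this file repeats the same two-phase aclnn calling convention, and the recurring refactor is simply dropping the local `aclrtStream`/`main_stream` temporaries in favour of passing `ctx.stream()` directly. The sketch below is illustrative only and is not part of this patch; `launch_muls_example` is a hypothetical name, while `aclnnMulsGetWorkspaceSize`/`aclnnMuls`, `ACL_CHECK` and `ggml_cann_pool_alloc` are the same calls already used in this file.

static void launch_muls_example(ggml_backend_cann_context& ctx,
                                aclTensor* acl_src, aclScalar* acl_scale,
                                aclTensor* acl_dst) {
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;

    // Phase 1: query the scratch size and build the executor.
    ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, acl_scale, acl_dst,
                                        &workspaceSize, &executor));

    // Phase 2: take scratch memory from the per-context pool and launch on
    // the context stream. The allocation may be handed back to the pool as
    // soon as this scope ends; that is safe because tasks on one stream run
    // in submission order, so no later task can reuse the buffer before
    // this kernel has read it.
    if (workspaceSize > 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }
    ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, ctx.stream()));
}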
- ggml_tensor* src = dst->src[0]; // scale factor @@ -313,8 +366,7 @@ void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) { workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); - ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, main_stream)); + ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyScalar(scale)); ACL_CHECK(aclDestroyTensor(acl_src)); @@ -327,7 +379,8 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclTensor* acl_src = create_acl_tensor(src); aclTensor* acl_dst = create_acl_tensor(dst); - ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(int64_t)); + ggml_cann_pool_alloc temp_buffer_allocator( + ctx.pool(), ggml_nelements(dst) * sizeof(int64_t)); void* buffer = temp_buffer_allocator.get(); aclTensor* tmp_tensor = create_acl_tensor(buffer, ACL_INT64, ggml_type_size(dst->type), dst->ne, @@ -345,9 +398,8 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) { workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); ACL_CHECK( - aclnnArgsort(workspaceAddr, workspaceSize, executor, main_stream)); + aclnnArgsort(workspaceAddr, workspaceSize, executor, ctx.stream())); workspaceSize = 0; ACL_CHECK(aclnnCastGetWorkspaceSize(tmp_tensor, type_mapping(dst->type), @@ -357,7 +409,7 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) { workspaceAddr = workspace_allocator.get(); } - ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, main_stream)); + ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(tmp_tensor)); @@ -365,8 +417,6 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - // layer_norm for one layer. 
- ggml_tensor* src = dst->src[0]; aclTensor* acl_src = create_acl_tensor(src); @@ -390,9 +440,8 @@ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { workspaceAddr = workspace_allocator.get(); } - aclrtStream stream = ctx.stream(); - - ACL_CHECK(aclnnLayerNorm(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK( + aclnnLayerNorm(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyIntArray(norm)); ACL_CHECK(aclDestroyTensor(acl_src)); @@ -420,6 +469,7 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { int64_t ne[] = {n_groups, N}; size_t nb[] = {type_size, type_size * n_groups}; size_t n_bytes = N * n_groups; + ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes * 2); void* buffer = temp_buffer_allocator.get(); aclTensor* acl_mean_out = @@ -436,9 +486,8 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { workspaceAddr = workspace_allocator.get(); } - aclrtStream stream = ctx.stream(); - - ACL_CHECK(aclnnGroupNorm(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK( + aclnnGroupNorm(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_dst)); @@ -447,9 +496,6 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - // if inplace: dst = dst + alpha * src1 - // else: dst = src0 + alpha * src1 - ggml_tensor* src0 = dst->src[0]; ggml_tensor* src1 = dst->src[1]; @@ -473,12 +519,10 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclOpExecutor* executor; void* workspaceAddr = nullptr; - aclrtStream stream = ctx.stream(); - if (!inplace) { size_t cpy_size = ggml_nbytes(dst); ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src0->data, cpy_size, - ACL_MEMCPY_DEVICE_TO_DEVICE, stream)); + ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); aclTensor* acl_src0 = create_acl_tensor( src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset); ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst, @@ -487,7 +531,8 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); workspaceAddr = workspace_allocator.get(); } - ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK( + aclnnAdd(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyTensor(acl_src0)); } else { ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src1, alpha, @@ -496,8 +541,8 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); workspaceAddr = workspace_allocator.get(); } - ACL_CHECK( - aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, + ctx.stream())); } ACL_CHECK(aclDestroyTensor(acl_src1)); @@ -505,8 +550,6 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - // reducesum along last dim. 
- ggml_tensor* src = dst->src[0]; aclTensor* acl_src = create_acl_tensor(src); @@ -529,8 +572,8 @@ void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { workspaceAddr = workspace_allocator.get(); } - aclrtStream stream = ctx.stream(); - ACL_CHECK(aclnnReduceSum(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK( + aclnnReduceSum(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_dst)); @@ -539,7 +582,6 @@ void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src = dst->src[0]; - aclTensor* acl_src = create_acl_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW); aclTensor* acl_dst = @@ -552,8 +594,6 @@ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx, aclOpExecutor* executor; void* workspaceAddr = nullptr; - aclrtStream stream = ctx.stream(); - ACL_CHECK(aclnnUpsampleNearest2dGetWorkspaceSize( acl_src, output_size_array, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { @@ -561,14 +601,28 @@ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx, workspaceAddr = workspace_allocator.get(); } - ACL_CHECK( - aclnnUpsampleNearest2d(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclnnUpsampleNearest2d(workspaceAddr, workspaceSize, executor, + ctx.stream())); ACL_CHECK(aclDestroyIntArray(output_size_array)); ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_dst)); } +/** + * @brief Pads a tensor with a specified value along each dimension. + * + * This function performs padding of the source tensor `acl_src` and stores the + * result in the destination tensor `acl_dst`. The padding values for each + * dimension are specified in the `paddings` array. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor to be padded. + * @param acl_dst The destination tensor where the padded result will be stored. + * @param paddings An array specifying the padding values for each dimension. + * The size of the array should be twice the number of dimensions of the tensor. + * @param value The value to be used for padding. The default value is 0.0. + */ static void aclnn_pad(ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst, int64_t* paddings, float value = 0.0f) { @@ -587,9 +641,8 @@ static void aclnn_pad(ggml_backend_cann_context& ctx, aclTensor* acl_src, workspaceAddr = workspace_allocator.get(); } - aclrtStream stream = ctx.stream(); - ACL_CHECK( - aclnnConstantPadNd(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclnnConstantPadNd(workspaceAddr, workspaceSize, executor, + ctx.stream())); ACL_CHECK(aclDestroyIntArray(acl_pad)); ACL_CHECK(aclDestroyScalar(acl_value)); @@ -613,22 +666,18 @@ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclDestroyTensor(acl_src)); } -void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - const int32_t* opts = (const int32_t*)dst->op_params; - enum ggml_op_pool op = static_cast(opts[0]); - switch (op) { - case GGML_OP_POOL_AVG: - ggml_cann_avg_pool2d(ctx, dst); - break; - case GGML_OP_POOL_MAX: - ggml_cann_max_pool2d(ctx, dst); - break; - case GGML_OP_POOL_COUNT: - GGML_ASSERT(false); - break; - } -} - +/** + * @brief Performs 2D average pooling on the input tensor and stores the result + * in the destination tensor. 
+ * + * This function performs average pooling on the source tensor and stores the + * result in the destination tensor. The pooling parameters (kernel size, + * strides, padding) are specified in the `op_params` of the destination tensor. + * + * @param ctx The context for the CANN backend operations. + * @param dst The destination tensor where the result will be stored. The source + * tensor is referenced by `dst->src[0]`. + */ static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src = dst->src[0]; @@ -640,7 +689,6 @@ static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, aclTensor* acl_dst = create_acl_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW); - // params const int32_t* opts = (const int32_t*)dst->op_params; const int k0 = opts[1]; const int k1 = opts[2]; @@ -657,17 +705,15 @@ static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, auto* strides = aclCreateIntArray(stride_dims.data(), 2); auto* paddings_avg = aclCreateIntArray(padding_avg_dims.data(), 2); - bool ceil_mode = false; // + bool ceil_mode = false; bool count_include_pad = true; int64_t divisor_override = 0; int8_t cube_math_type = 0; - // execute op api uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; - aclrtStream stream = ctx.stream(); ACL_CHECK(aclnnAvgPool2dGetWorkspaceSize( acl_src, kernel_size, strides, paddings_avg, ceil_mode, count_include_pad, divisor_override, cube_math_type, acl_dst, @@ -677,9 +723,8 @@ static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); workspaceAddr = workspace_allocator.get(); } - ACL_CHECK(aclnnAvgPool2d(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclnnAvgPool2d(workspaceAddr, workspaceSize, executor, ctx.stream())); - // release ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_dst)); ACL_CHECK(aclDestroyIntArray(kernel_size)); @@ -687,6 +732,18 @@ static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, ACL_CHECK(aclDestroyIntArray(paddings_avg)); } +/** + * @brief Performs 2D max pooling on the input tensor and stores the result in + * the destination tensor. + * + * This function performs max pooling on the source tensor and stores the result + * in the destination tensor. The pooling parameters (kernel size, strides, + * padding) are specified in the `op_params` of the destination tensor. + * + * @param ctx The context for the CANN backend operations. + * @param dst The destination tensor where the result will be stored. The source + * tensor is referenced by `dst->src[0]`. 
+ */ static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src = dst->src[0]; @@ -697,7 +754,7 @@ static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, create_acl_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW); aclTensor* acl_dst = create_acl_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW); - // params + const int32_t* opts = (const int32_t*)dst->op_params; const int k0 = opts[1]; const int k1 = opts[2]; @@ -715,7 +772,8 @@ static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, temp_nb[i] = temp_nb[i - 1] * temp_ne[i - 1]; } - ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]); + ggml_cann_pool_alloc temp_buffer_allocator( + ctx.pool(), ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]); void* buffer = temp_buffer_allocator.get(); aclTensor* tmp_tensor = create_acl_tensor(buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, @@ -743,7 +801,6 @@ static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; - aclrtStream stream = ctx.stream(); ACL_CHECK(aclnnMaxPoolGetWorkspaceSize( tmp_tensor, kernel_size, strides, auto_pads, paddings_max, dilations, @@ -753,9 +810,8 @@ static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, workspaceAddr = workspace_allocator.get(); } - ACL_CHECK(aclnnMaxPool(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclnnMaxPool(workspaceAddr, workspaceSize, executor, ctx.stream())); - // release ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_dst)); ACL_CHECK(aclDestroyTensor(tmp_tensor)); @@ -765,7 +821,34 @@ static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, ACL_CHECK(aclDestroyIntArray(dilations)); } -static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) { +void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + const int32_t* opts = (const int32_t*)dst->op_params; + enum ggml_op_pool op = static_cast(opts[0]); + switch (op) { + case GGML_OP_POOL_AVG: + ggml_cann_avg_pool2d(ctx, dst); + break; + case GGML_OP_POOL_MAX: + ggml_cann_max_pool2d(ctx, dst); + break; + case GGML_OP_POOL_COUNT: + GGML_ASSERT(false); + break; + } +} + +/** + * @brief Copies data from the source tensor to the destination tensor. + * + * This function copies data from the source tensor `acl_src` to the destination + * tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor from which data will be copied. + * @param acl_dst The destination tensor where the data will be copied to. + */ +static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -778,11 +861,15 @@ static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTen workspaceAddr = workspace_allocator.get(); } - aclrtStream stream = ctx.stream(); - ACL_CHECK(aclnnInplaceCopy(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclnnInplaceCopy(workspaceAddr, workspaceSize, executor, + ctx.stream())); } void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + // Support F16/F32/Q8_0 dtype, and Not support situation of src and dst have + // different shape and dst is non-contiguous or src is non-contiguous in + // first dim. 
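The comment above states the contract of ggml_cann_dup, and the hunks that follow hoist the same-shape fast path out of the per-type branches. A condensed view of the resulting dispatch order, as a reading aid only (not part of the patch):

// 1. dst is F16/F32 and src/dst share the same shape
//    -> plain device copy via cann_copy().
// 2. dst is Q8_0 (src F16 or F32)
//    -> AscendC quantize kernels (aclrtlaunch_ascendc_quantize_f16_q8_0 /
//       aclrtlaunch_ascendc_quantize_f32_q8_0).
// 3. dst is F16/F32 and contiguous, src contiguous in its first dim
//    -> row-wise dup kernels (aclrtlaunch_ascendc_dup_by_rows_*).
// 4. Remaining combinations either fall back to cann_copy() when the shapes
//    match (final else branch) or trip GGML_ASSERT(false).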
+ ggml_tensor* src = dst->src[0]; aclTensor* acl_src = create_acl_tensor(src); @@ -793,26 +880,29 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { src->extra = src_extra_allocator.get(); dst->extra = dst_extra_allocator.get(); ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream())); + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, + ctx.stream())); ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream())); + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, + ctx.stream())); + if ((dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32) && + ggml_are_same_shape(src, dst)) { + cann_copy(ctx, acl_src, acl_dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + return; + } // TODO: simplefify - if (src->type==GGML_TYPE_F16) { - if (dst->type==GGML_TYPE_Q8_0) { + if (src->type == GGML_TYPE_F16) { + if (dst->type == GGML_TYPE_Q8_0) { aclrtlaunch_ascendc_quantize_f16_q8_0( 24, ctx.stream(), src->data, dst->data, ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, ((ggml_tensor*)dst->extra)->ne); return; } - if (dst->type==GGML_TYPE_F16) { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } + if (dst->type == GGML_TYPE_F16) { if (ggml_is_contiguous(dst)) { const size_t src_type_size = ggml_type_size(src->type); if (src->nb[0] == src_type_size) { @@ -820,112 +910,84 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { int64_t rows_num = ggml_nrows(src); aclrtlaunch_ascendc_dup_by_rows_fp16( - rows_num, ctx.stream(), - src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, - ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); + rows_num, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, + ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); return; } GGML_ASSERT(false); } GGML_ASSERT(false); } - if (dst->type==GGML_TYPE_F32) { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } + if (dst->type == GGML_TYPE_F32) { if (ggml_is_contiguous(dst)) { const size_t src_type_size = ggml_type_size(src->type); if (src->nb[0] == src_type_size) { // src0 is contigous on first dimension, copy by rows int64_t rows_num = ggml_nrows(src); aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32( - rows_num, ctx.stream(), - src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, - ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); + rows_num, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, + ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); return; } GGML_ASSERT(false); } GGML_ASSERT(false); } - // TODO + // TODO: other dtype. 
GGML_ASSERT(false); - } - else if (src->type==GGML_TYPE_F32) { - //TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size - // && nb0 == type_size) - if (dst->type==GGML_TYPE_Q8_0) { + } else if (src->type == GGML_TYPE_F32) { + if (dst->type == GGML_TYPE_Q8_0) { aclrtlaunch_ascendc_quantize_f32_q8_0( 24, ctx.stream(), src->data, dst->data, ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, ((ggml_tensor*)dst->extra)->ne); return; } - if (dst->type==GGML_TYPE_F32) { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } + if (dst->type == GGML_TYPE_F32) { if (ggml_is_contiguous(dst)) { const size_t src_type_size = ggml_type_size(src->type); if (src->nb[0] == src_type_size) { // src0 is contigous on first dimension, copy by rows int64_t rows_num = ggml_nrows(src); aclrtlaunch_ascendc_dup_by_rows_fp32( - rows_num, ctx.stream(), - src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, - ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); + rows_num, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, + ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); return; } GGML_ASSERT(false); - } - else { - //TODO: dst not contiguous + } else { + // TODO: dst not contiguous GGML_ASSERT(false); } } - if (dst->type==GGML_TYPE_F16) { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } + if (dst->type == GGML_TYPE_F16) { if (ggml_is_contiguous(dst)) { const size_t src_type_size = ggml_type_size(src->type); if (src->nb[0] == src_type_size) { // src0 is contigous on first dimension, copy by rows int64_t rows_num = ggml_nrows(src); aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16( - rows_num, ctx.stream(), - src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, - ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); + rows_num, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, + ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); return; } GGML_ASSERT(false); } } - // TODO GGML_ASSERT(false); - } - else { + } else { if (ggml_are_same_shape(src, dst)) { cann_copy(ctx, acl_src, acl_dst); ACL_CHECK(aclDestroyTensor(acl_src)); @@ -951,9 +1013,25 @@ aclnnStatus aclnnRmsNorm(void* workspace, uint64_t workspaceSize, } #endif -static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer, size_t n_bytes, - int64_t* ne, int64_t dims, aclDataType type, - size_t type_size) { +/** + * @brief Creates an ACL tensor initialized with zeros using a provided buffer. + * + * This function initializes a tensor with zeros using the specified buffer and + * tensor parameters. + * + * @param ctx The context for the CANN backend operations. + * @param buffer The buffer to be used for the tensor data. + * @param n_bytes The size of the buffer in bytes. + * @param ne An array specifying the extents (sizes) of each dimension of the + * tensor. + * @param dims The number of dimensions of the tensor. + * @param type The data type of the tensor. + * @param type_size The size of each element in the tensor data type. + * @return An ACL tensor initialized with zeros. 
+ */ +static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer, + size_t n_bytes, int64_t* ne, int64_t dims, + aclDataType type, size_t type_size) { size_t nb[GGML_MAX_DIMS]; nb[0] = type_size; for (int i = 1; i < dims; i++) { @@ -965,10 +1043,30 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer, size_ return zero; } -static aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, void* buffer, size_t n_bytes, - int64_t* ne, int64_t dims, aclDataType type, - size_t type_size, float value = 1.0f) { - aclTensor* acl_tensor = aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size); +/** + * @brief Creates an ACL tensor initialized with ones using a provided buffer. + * + * This function initializes a tensor with ones using the specified buffer and + * tensor parameters. + * + * @param ctx The context for the CANN backend operations. + * @param buffer The buffer to be used for the tensor data. + * @param n_bytes The size of the buffer in bytes. + * @param ne An array specifying the extents (sizes) of each dimension of the + * tensor. + * @param dims The number of dimensions of the tensor. + * @param type The data type of the tensor. + * @param type_size The size of each element in the tensor data type. + * @param value The value to be used for initializing the tensor (default + * is 1.0). + * @return An ACL tensor initialized with ones. + */ +static aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, void* buffer, + size_t n_bytes, int64_t* ne, int64_t dims, + aclDataType type, size_t type_size, + float value = 1.0f) { + aclTensor* acl_tensor = + aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size); float alpha_host = 1.0f; aclScalar* alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT); aclScalar* other = aclCreateScalar(&value, aclDataType::ACL_FLOAT); @@ -1008,14 +1106,16 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src); ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes); - aclTensor* acl_gamma = aclnn_ones( - ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1, type_mapping(src->type), ggml_element_size(src)); + aclTensor* acl_gamma = + aclnn_ones(ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, + 1, type_mapping(src->type), ggml_element_size(src)); - size_t zero_tensor_n_bytes = src->ne[1] * src->ne[2] * src->ne[3] * ggml_element_size(src); + size_t zero_tensor_n_bytes = + src->ne[1] * src->ne[2] * src->ne[3] * ggml_element_size(src); ggml_cann_pool_alloc zero_tensor_allocator(ctx.pool(), zero_tensor_n_bytes); - aclTensor* acl_rstd = - aclnn_zero(ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes, src->ne, GGML_MAX_DIMS, type_mapping(src->type), - ggml_element_size(src)); + aclTensor* acl_rstd = aclnn_zero( + ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes, src->ne, + GGML_MAX_DIMS, type_mapping(src->type), ggml_element_size(src)); ACL_CHECK(aclnnRmsNormGetWorkspaceSize( acl_src, acl_gamma, eps, acl_dst, acl_rstd, &workspaceSize, &executor)); @@ -1044,12 +1144,13 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, const int n_past = ((int32_t*)dst->op_params)[0]; - size_t one_tensor_n_bytes = src->ne[0] * src->ne[1] * src->ne[2] *src->ne[3] * ggml_element_size(src); + size_t one_tensor_n_bytes = src->ne[0] * src->ne[1] * src->ne[2] * + src->ne[3] * ggml_element_size(src); ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes); - 
aclTensor* mask_tensor = - aclnn_ones(ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, GGML_MAX_DIMS, type_mapping(src->type), - ggml_element_size(src), value); + aclTensor* mask_tensor = aclnn_ones( + ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, + GGML_MAX_DIMS, type_mapping(src->type), ggml_element_size(src), value); uint64_t workspaceSize = 0; aclOpExecutor* executor; @@ -1093,12 +1194,24 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, ACL_CHECK(aclDestroyTensor(acl_dst)); } +/** + * @brief Casts the data type of a source tensor to a destination tensor. + * + * This function casts the data type of the source tensor `acl_src` to the + * specified data type `cast_data_type` and stores the result in the destination + * tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor whose data type will be casted. + * @param acl_dst The destination tensor where the casted result will be stored. + * @param cast_data_type The target data type to which the source tensor will be + * casted. + */ static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst, aclDataType cast_data_type) { + aclTensor* acl_dst, aclDataType cast_data_type) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; - aclrtStream stream = ctx.stream(); ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, cast_data_type, acl_dst, &workspaceSize, &executor)); @@ -1107,11 +1220,26 @@ static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src, workspaceAddr = workspace_allocator.get(); } - ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream())); } +/** + * @brief Permutes the dimensions of a tensor according to a specified order. + * + * This function permutes the dimensions of the source tensor `acl_src` + * according to the order specified in the `new_dim` array and stores the result + * in the destination tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor whose dimensions will be permuted. + * @param acl_dst The destination tensor where the permuted result will be + * stored. + * @param new_dim An array specifying the new order of dimensions for the + * tensor. + * @param dims The number of dimensions in the tensor. + */ static void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst, int64_t* new_dim, uint64_t dims) { + aclTensor* acl_dst, int64_t* new_dim, uint64_t dims) { aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims); uint64_t workspaceSize = 0; @@ -1176,7 +1304,7 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); - // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH] + // im2col: [N, C, H, W] -> [N, IC*KH*KW, OW*OH] aclTensor* acl_src1 = create_acl_tensor(src1); int64_t tmp_im2col_ne[] = {OW * OH, IC * KH * KW, N}; size_t tmp_im2col_nb[GGML_MAX_DIMS - 1]; @@ -1189,7 +1317,8 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { // Calculate im2col. // If dst is f16, tmp_buffer is f32, we need alloc src.typesize * // dst.elemcount. 
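A quick sanity check on the scratch size chosen just below, with hypothetical shapes that are not taken from this patch: the im2col result is always materialised as f32 first, one element per destination element, which is why the allocation is ggml_nelements(dst) * ggml_element_size(src1) rather than ggml_nbytes(dst).

// Assumed example shapes (illustration only):
//   src1 (input, f32):  N = 1, IC = 3, IH = IW = 32
//   kernel:             KH = KW = 3, stride 1, padding 1  ->  OH = OW = 32
// dst then holds N * (OH*OW) * (IC*KH*KW) = 1 * 1024 * 27 = 27648 elements.
// The f32 scratch below therefore needs
//   27648 * sizeof(float) = 110592 bytes,
// even when dst itself is f16 and only occupies 55296 bytes.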
- ggml_cann_pool_alloc im2col_allocator(ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1)); + ggml_cann_pool_alloc im2col_allocator( + ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1)); void* tmp_im2col_buffer = im2col_allocator.get(); aclTensor* tmp_im2col_tensor = create_acl_tensor( tmp_im2col_buffer, type_mapping(src1->type), ggml_type_size(src1->type), @@ -1207,7 +1336,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; - aclrtStream stream = ctx.stream(); ACL_CHECK(aclnnIm2colGetWorkspaceSize(acl_src1, kernel_size, dilations, paddings, strides, tmp_im2col_tensor, @@ -1218,7 +1346,7 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { workspaceAddr = workspace_allocator.get(); } - ACL_CHECK(aclnnIm2col(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclnnIm2col(workspaceAddr, workspaceSize, executor, ctx.stream())); // Cast if dst is f16. aclTensor* tmp_cast_tensor = nullptr; @@ -1239,7 +1367,7 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { type_mapping(dst->type)); } - // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW] + // Permute: [N, IC*KH*KW, OW*OH] -> [N, OW*OH, IC*KH*KW] int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]}; size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]}; aclTensor* acl_dst = @@ -1263,6 +1391,15 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclDestroyIntArray(strides)); } +/** + * @brief Applies element-wise exponential function to the elements of a tensor. + * + * This function computes the exponential of each element in the source tensor + * `acl_src` and stores the result back into the same tensor. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The tensor on which the exponential function will be applied. + */ static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) { uint64_t workspaceSize = 0; aclOpExecutor* executor; @@ -1279,6 +1416,23 @@ static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) { aclnnInplaceExp(workspaceAddr, workspaceSize, executor, ctx.stream())); } +/** + * @brief Multiplies elements of a tensor by a scalar value, optionally + * in-place. + * + * This function multiplies each element of the source tensor `acl_src` by the + * scalar `scale` and stores the result in the destination tensor `acl_dst`. If + * `inplace` is true, the operation is performed in-place on `acl_src`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor whose elements will be multiplied. + * @param scale The scalar value by which each element of `acl_src` will be + * multiplied. + * @param acl_dst The destination tensor where the result will be stored if + * `inplace` is false. + * @param inplace Flag indicating whether to perform the operation in-place on + * `acl_src`. 
+ */ static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src, float scale, aclTensor* acl_dst, bool inplace) { aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT); @@ -1297,8 +1451,7 @@ static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src, ACL_CHECK(aclnnInplaceMuls(workspaceAddr, workspaceSize, executor, ctx.stream())); - } - else { + } else { ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, acl_scale, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { @@ -1306,16 +1459,26 @@ static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src, workspaceAddr = workspace_allocator.get(); } - ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, - ctx.stream())); + ACL_CHECK( + aclnnMuls(workspaceAddr, workspaceSize, executor, ctx.stream())); } - ACL_CHECK(aclDestroyScalar(acl_scale)); } -static void aclnn_inplace_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_other) { +/** + * @brief Performs an in-place element-wise multiplication of two tensors. + * + * This function performs an element-wise multiplication of the tensors + * `acl_src` and `acl_other` and stores the result in `acl_src`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor where the multiplication result will be + * stored. + * @param acl_other The tensor whose elements will be multiplied with `acl_src`. + */ +static void aclnn_inplace_mul(ggml_backend_cann_context& ctx, + aclTensor* acl_src, aclTensor* acl_other) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -1331,8 +1494,21 @@ static void aclnn_inplace_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src aclnnInplaceMul(workspaceAddr, workspaceSize, executor, ctx.stream())); } -static void aclnn_noinplcace_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_other, aclTensor* acl_dst) { +/** + * @brief Performs element-wise multiplication of two tensors and stores the + * result in a destination tensor. + * + * This function performs element-wise multiplication of the tensors `acl_src` + * and `acl_other` and stores the result in the destination tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The first tensor for element-wise multiplication. + * @param acl_other The second tensor for element-wise multiplication. + * @param acl_dst The destination tensor where the result will be stored. + */ +static void aclnn_mul(ggml_backend_cann_context& ctx, + aclTensor* acl_src, aclTensor* acl_other, + aclTensor* acl_dst) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -1347,8 +1523,18 @@ static void aclnn_noinplcace_mul(ggml_backend_cann_context& ctx, aclTensor* acl_ ACL_CHECK(aclnnMul(workspaceAddr, workspaceSize, executor, ctx.stream())); } +/** + * @brief Applies element-wise cosine function to the elements of a tensor. + * + * This function computes the cosine of each element in the source tensor `acl_src` + * and stores the result in the destination tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor on which the cosine function will be applied. + * @param acl_dst The destination tensor where the cosine results will be stored. 
+ */ static void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst) { + aclTensor* acl_dst) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -1363,8 +1549,18 @@ static void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src, ACL_CHECK(aclnnCos(workspaceAddr, workspaceSize, executor, ctx.stream())); } +/** + * @brief Applies element-wise sine function to the elements of a tensor. + * + * This function computes the sine of each element in the source tensor `acl_src` + * and stores the result in the destination tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor on which the sine function will be applied. + * @param acl_dst The destination tensor where the sine results will be stored. + */ static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst) { + aclTensor* acl_dst) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -1422,7 +1618,7 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1]; } - ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src)); + ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src)); void* tmp_permute_buffer = permute_allocator.get(); aclTensor* tmp_permute_tenosr = create_acl_tensor( tmp_permute_buffer, type_mapping(src->type), ggml_type_size(src->type), @@ -1443,16 +1639,18 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, int mul_nelements = src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3]; - ggml_cann_pool_alloc mul_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type)); + ggml_cann_pool_alloc mul_allocator( + ctx.pool(), mul_nelements * ggml_type_size(src->type)); void* tmp_mul_buffer = mul_allocator.get(); aclTensor* tmp_mul_tensor = create_acl_tensor( tmp_mul_buffer, type_mapping(src->type), ggml_type_size(src->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); - aclnn_noinplcace_mul(ctx, tmp_permute_tenosr, tmp_arange_tensor, + aclnn_mul(ctx, tmp_permute_tenosr, tmp_arange_tensor, tmp_mul_tensor); // cos - ggml_cann_pool_alloc cos_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type)); + ggml_cann_pool_alloc cos_allocator( + ctx.pool(), mul_nelements * ggml_type_size(src->type)); void* tmp_cos_buffer = cos_allocator.get(); aclTensor* tmp_cos_tensor = create_acl_tensor( tmp_cos_buffer, type_mapping(dst->type), ggml_type_size(dst->type), @@ -1461,7 +1659,8 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, aclnn_cos(ctx, tmp_mul_tensor, tmp_cos_tensor); // sin - ggml_cann_pool_alloc sin_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type)); + ggml_cann_pool_alloc sin_allocator( + ctx.pool(), mul_nelements * ggml_type_size(src->type)); void* tmp_sin_buffer = sin_allocator.get(); aclTensor* tmp_sin_tensor = create_acl_tensor( tmp_sin_buffer, type_mapping(dst->type), ggml_type_size(dst->type), @@ -1486,10 +1685,18 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ACL_CHECK(aclDestroyTensor(acl_dst)); } +/** + * @brief Fills a tensor with a scalar value. + * + * This function fills the destination tensor `acl_dst` with the scalar value + * `scalar`. + * + * @param ctx The context for the CANN backend operations. + * @param scalar The scalar value used to fill the tensor. 
+ * @param acl_dst The destination tensor to be filled with the scalar value. + */ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, - aclTensor* acl_dst) { - // fill acl_dst with scalar value. - + aclTensor* acl_dst) { auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT); uint64_t workspaceSize = 0; @@ -1508,10 +1715,20 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, ACL_CHECK(aclDestroyScalar(acl_scalar)); } +/** + * @brief Raises each element of a tensor to the power of the corresponding + * element in another tensor. + * + * This function computes the element-wise power of the destination tensor + * `acl_dst` raised to the power of the exponent tensor `acl_exp`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_dst The destination tensor, which also serves as the base tensor. + * @param acl_exp The exponent tensor, each element of which is used to raise + * the corresponding element in the destination tensor. + */ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx, aclTensor* acl_dst, aclTensor* acl_exp) { - // acl_dst = acl_dst^acl_exp - uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -1527,10 +1744,37 @@ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx, executor, ctx.stream())); } +/** + * @brief Applies the Alibi (Attention with Linear Biases) mechanism to the + * input tensor. + * + * @details This function implements the Alibi mechanism, which introduces + * learnable biases into the attention scores to simulate relative + * position encoding without the need for explicit positional + * embeddings. + * + * @param ctx The backend CANN context for executing operations. + * @param acl_src The source tensor representing the query or key. + * @param acl_position The position tensor containing relative positions. + * @param acl_dst The destination tensor where the result will be stored. + * @param n_head The number of attention heads. + * @param src_ne The dimensions of the source tensor. + * @param src_nb0 The byte size of the first dimension of the source tensor. + * @param max_bias The maximum bias value used in the Alibi mechanism. + * @param dst The destination tensor object for additional metadata. + * + * The function performs the following steps: + * 1. Calculates the logarithm floor of the number of heads to determine the base for bias calculation. + * 2. Initializes arrays with arithmetic sequences and fills them with bias values. + * 3. Computes the bias tensor based on the calculated biases and arithmetic sequences. + * 4. Reshapes the bias tensor to match the dimensions of the input tensors. + * 5. Multiplies the position tensor by the bias tensor. + * 6. Adds the result of the multiplication to the source tensor to produce the final output. 
+ */ static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_position, aclTensor* acl_dst, const int n_head, - int64_t* src_ne, const size_t src_nb0, float max_bias, - ggml_tensor* dst) { + aclTensor* acl_position, aclTensor* acl_dst, + const int n_head, int64_t* src_ne, const size_t src_nb0, + float max_bias, ggml_tensor* dst) { GGML_UNUSED(src_ne[1]); const int64_t ne2_ne3 = src_ne[2] * src_ne[3]; GGML_ASSERT(src_nb0 == sizeof(float)); @@ -1542,8 +1786,9 @@ static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src, float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); // init arange - ggml_cann_pool_alloc arange_allocator(ctx.pool(), ne2_ne3 * ggml_type_size(dst->type)); - void* tmp_arange_buffer = arange_allocator.get(); + ggml_cann_pool_alloc arange_allocator(ctx.pool(), + ne2_ne3 * ggml_type_size(dst->type)); + void* arange_buffer = arange_allocator.get(); // arange1: [1, ..., n_heads_log2_floor+1) float start = 1; @@ -1551,109 +1796,130 @@ static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src, float step = 1; int64_t n_elements_arange = n_heads_log2_floor; - int64_t tmp_arange1_ne[] = {n_heads_log2_floor}; - size_t tmp_arange1_nb[] = {sizeof(dst->type)}; - aclTensor* tmp_arange1_tensor = create_acl_tensor( - tmp_arange_buffer, type_mapping(dst->type), ggml_type_size(dst->type), - tmp_arange1_ne, tmp_arange1_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + int64_t arange_ne[] = {n_heads_log2_floor}; + size_t arange_nb[] = {sizeof(dst->type)}; + aclTensor* arange_tensor = create_acl_tensor( + arange_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + arange_ne, arange_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); - aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange); + aclnn_arange(ctx, arange_tensor, start, stop, step, n_elements_arange); - aclTensor* tmp_arange2_tensor = nullptr; + // if n_heads_log2_floor smaller than ne2_ne3, need arange_tail_tensor + // which is [1, ..., 2 * (k - n_heads_log2_floor) + 1)]. 
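To make the constants above concrete, a worked example with assumed values that are not taken from the patch: n_head = 12, max_bias = 8.0f, and a single sequence so ne2_ne3 == 12.

// n_heads_log2_floor = 1 << floor(log2(12)) = 8
// m0 = 2^(-8.0 / 8)       = 0.5
// m1 = 2^(-(8.0/2) / 8)   = 2^-0.5 ≈ 0.7071
// arange head (this tensor): [1, 2, ..., 8]
// arange tail (below):       [1, 3, 5, 7]        (start 1, stop 9, step 2)
// resulting ALiBi slopes:    m0^1 ... m0^8 for the first 8 heads, then
//                            m1^1, m1^3, m1^5, m1^7 ≈ 0.707, 0.354, 0.177,
//                            0.088 for the remaining 4 heads.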
+ aclTensor* arange_tail_tensor = nullptr; if (n_heads_log2_floor < ne2_ne3) { - // arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1) start = 1; stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1; step = 2; n_elements_arange = ne2_ne3 - n_heads_log2_floor; - int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor}; - size_t tmp_arange2_nb[] = {sizeof(dst->type)}; - - aclTensor* tmp_arange2_tensor = create_acl_tensor( - (char*)tmp_arange_buffer + n_heads_log2_floor * ggml_type_size(dst->type), - type_mapping(dst->type), ggml_type_size(dst->type), tmp_arange2_ne, - tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); - aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step, + int64_t arange_tail_ne[] = {ne2_ne3 - n_heads_log2_floor}; + size_t arange_tail_nb[] = {sizeof(dst->type)}; + + aclTensor* arange_tail_tensor = create_acl_tensor( + (char*)arange_buffer + + n_heads_log2_floor * ggml_type_size(dst->type), + type_mapping(dst->type), ggml_type_size(dst->type), arange_tail_ne, + arange_tail_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + aclnn_arange(ctx, arange_tail_tensor, start, stop, step, n_elements_arange); } - // init mk_base - ggml_cann_pool_alloc mk_base_allocator(ctx.pool(), ne2_ne3 * ggml_type_size(dst->type)); - void* tmp_mk_base_buffer = mk_base_allocator.get(); - int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor}; - size_t tmp_mk_base1_nb[] = {sizeof(dst->type)}; - aclTensor* tmp_mk_base1_tensor = create_acl_tensor( - tmp_mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type), - tmp_mk_base1_ne, tmp_mk_base1_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + // init mk_base, tensor with n_heads_log2_floor length and set value with m0. + ggml_cann_pool_alloc mk_base_allocator(ctx.pool(), + ne2_ne3 * ggml_type_size(dst->type)); + void* mk_base_buffer = mk_base_allocator.get(); + int64_t mk_base_ne[] = {n_heads_log2_floor}; + size_t mk_base_nb[] = {sizeof(dst->type)}; + aclTensor* mk_base_tensor = create_acl_tensor( + mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + mk_base_ne, mk_base_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); - aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor); + aclnn_fill_scalar(ctx, m0, mk_base_tensor); - aclTensor* tmp_mk_base2_tensor = nullptr; + // if n_heads_log2_floor smaller than ne2_ne3, need mk_base_tail which has + // ne2_ne3 - n_heads_log2_floor and set value with m1. 
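With both halves of arange and mk_base filled in, the remainder of the function reduces to three tensor ops; the sketch below is only a summary of the calls used further down, not new code:

// slopes = mk_base ^ arange                       // aclnn_pow_tensor_tensor (in place)
// slopes = reshape(slopes, {1, 1, src_ne[2], src_ne[3]})
// bias   = position * slopes                      // aclnn_mul (broadcast multiply)
// dst    = bias + src                             // aclnn_add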
+ aclTensor* mk_base_tail_tensor = nullptr; if (n_heads_log2_floor < ne2_ne3) { - int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor}; - size_t tmp_mk_base2_nb[] = {sizeof(dst->type)}; - aclTensor* tmp_mk_base2_tensor = create_acl_tensor( - (char*)tmp_mk_base_buffer + n_heads_log2_floor * ggml_type_size(dst->type), - type_mapping(dst->type), ggml_type_size(dst->type), tmp_mk_base2_ne, - tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); - aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor); - } - - // init mk - int64_t tmp_mk_base_ne[] = {ne2_ne3}; - size_t tmp_mk_base_nb[] = {sizeof(dst->type)}; - aclTensor* tmp_mk_base_tensor = create_acl_tensor( - tmp_mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type), - tmp_mk_base_ne, tmp_mk_base_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); - aclTensor* tmp_arange_tensor = create_acl_tensor( - tmp_arange_buffer, type_mapping(dst->type), ggml_type_size(dst->type), - tmp_mk_base_ne, tmp_mk_base_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); - aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor); + int64_t mk_base_tail_ne[] = {ne2_ne3 - n_heads_log2_floor}; + size_t mk_base_tail_nb[] = {sizeof(dst->type)}; + aclTensor* mk_base_tail_tensor = create_acl_tensor( + (char*)mk_base_buffer + + n_heads_log2_floor * ggml_type_size(dst->type), + type_mapping(dst->type), ggml_type_size(dst->type), mk_base_tail_ne, + mk_base_tail_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + aclnn_fill_scalar(ctx, m1, mk_base_tail_tensor); + } + + // init mk, mk_base_with_tail_tensor=mk_base_with_tail_tensor^arange_tensor + int64_t mk_base_with_tail_ne[] = {ne2_ne3}; + size_t mk_base_with_tail_nb[] = {sizeof(dst->type)}; + aclTensor* mk_base_with_tail_tensor = create_acl_tensor( + mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + mk_base_with_tail_ne, mk_base_with_tail_nb, GGML_MAX_DIMS - 3, + ACL_FORMAT_ND); + aclTensor* arange_with_tail_tensor = create_acl_tensor( + arange_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + mk_base_with_tail_ne, mk_base_with_tail_nb, GGML_MAX_DIMS - 3, + ACL_FORMAT_ND); + aclnn_pow_tensor_tensor(ctx, mk_base_with_tail_tensor, + arange_with_tail_tensor); // reshape mk - int64_t tmp_mk_ne[] = {1, 1, src_ne[2], src_ne[3]}; - size_t tmp_mk_nb[GGML_MAX_DIMS]; - tmp_mk_nb[0] = ggml_type_size(dst->type); + int64_t mk_ne[] = {1, 1, src_ne[2], src_ne[3]}; + size_t mk_nb[GGML_MAX_DIMS]; + mk_nb[0] = ggml_type_size(dst->type); for (int i = 1; i < GGML_MAX_DIMS; i++) { - tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1]; + mk_nb[i] = mk_nb[i - 1] * mk_ne[i - 1]; } - aclTensor* tmp_mk_tensor = create_acl_tensor( - tmp_mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type), - tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + aclTensor* mk_tensor = create_acl_tensor( + mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + mk_ne, mk_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); // acl_position * mk - int64_t tmp_output_ne[] = {src_ne[0], src_ne[1], src_ne[2], src_ne[3]}; - size_t tmp_output_nb[GGML_MAX_DIMS]; - tmp_output_nb[0] = ggml_type_size(dst->type); + int64_t output_ne[] = {src_ne[0], src_ne[1], src_ne[2], src_ne[3]}; + size_t output_nb[GGML_MAX_DIMS]; + output_nb[0] = ggml_type_size(dst->type); for (int i = 1; i < GGML_MAX_DIMS; i++) { - tmp_output_nb[i] = tmp_output_nb[i - 1] * tmp_output_ne[i - 1]; + output_nb[i] = output_nb[i - 1] * output_ne[i - 1]; } ggml_cann_pool_alloc output_allocator(ctx.pool(), ggml_nbytes(dst)); - void* tmp_output_buffer = output_allocator.get(); - aclTensor* 
tmp_output_tensor = create_acl_tensor( - tmp_output_buffer, type_mapping(dst->type), ggml_type_size(dst->type), - tmp_output_ne, tmp_output_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); - aclnn_noinplcace_mul(ctx, acl_position, tmp_mk_tensor, tmp_output_tensor); + void* output_buffer = output_allocator.get(); + aclTensor* output_tensor = create_acl_tensor( + output_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + output_ne, output_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + aclnn_mul(ctx, acl_position, mk_tensor, output_tensor); // add - aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst); - - ACL_CHECK(aclDestroyTensor(tmp_arange1_tensor)); - ACL_CHECK(aclDestroyTensor(tmp_arange2_tensor)); - ACL_CHECK(aclDestroyTensor(tmp_mk_base1_tensor)); - ACL_CHECK(aclDestroyTensor(tmp_mk_base2_tensor)); - ACL_CHECK(aclDestroyTensor(tmp_mk_base_tensor)); - ACL_CHECK(aclDestroyTensor(tmp_arange_tensor)); - ACL_CHECK(aclDestroyTensor(tmp_mk_tensor)); - ACL_CHECK(aclDestroyTensor(tmp_output_tensor)); + aclnn_add(ctx, output_tensor, acl_src, acl_dst); + + ACL_CHECK(aclDestroyTensor(arange_tensor)); + ACL_CHECK(aclDestroyTensor(arange_tail_tensor)); + ACL_CHECK(aclDestroyTensor(mk_base_tensor)); + ACL_CHECK(aclDestroyTensor(mk_base_tail_tensor)); + ACL_CHECK(aclDestroyTensor(mk_base_with_tail_tensor)); + ACL_CHECK(aclDestroyTensor(arange_with_tail_tensor)); + ACL_CHECK(aclDestroyTensor(mk_tensor)); + ACL_CHECK(aclDestroyTensor(output_tensor)); } void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_dup(ctx, dst); } -static void aclnn_inplace_add(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst) { +/** + * @brief Performs element-wise addition of two tensors in place. + * + * This function adds the source tensor `acl_src` to the destination tensor + * `acl_dst` element-wise and stores the result in the destination tensor + * `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor to be added. + * @param acl_dst The destination tensor which will hold the result of the + * addition. + */ +static void aclnn_inplace_add(ggml_backend_cann_context& ctx, + aclTensor* acl_src, aclTensor* acl_dst) { aclScalar* alpha = nullptr; float alphaValue = 1.0f; alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); @@ -1669,16 +1935,28 @@ static void aclnn_inplace_add(ggml_backend_cann_context& ctx, aclTensor* acl_src workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); ACL_CHECK( - aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, main_stream)); + aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyScalar(alpha)); } +/** + * @brief Applies the softmax function to a tensor along a specified dimension. + * + * This function computes the softmax of the source tensor `acl_src` along the + * specified dimension `dim` and stores the result in the destination tensor + * `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor on which the softmax function will be + * applied. + * @param dim The dimension along which the softmax function will be computed. + * @param acl_dst The destination tensor where the softmax results will be + * stored. 
+ */ static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src, int64_t dim, aclTensor* acl_dst) { - uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -1715,11 +1993,8 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_pool_alloc mul_scale_allocator(ctx.pool(), n_bytes); void* input_mul_scale_buffer = mul_scale_allocator.get(); aclTensor* acl_input_mul_scale_tensor = create_acl_tensor( - input_mul_scale_buffer, - ACL_FLOAT, - ggml_type_size(src0->type), - src0->ne, src0->nb, - GGML_MAX_DIMS); + input_mul_scale_buffer, ACL_FLOAT, ggml_type_size(src0->type), src0->ne, + src0->nb, GGML_MAX_DIMS); bool inplace = false; aclnn_muls(ctx, acl_src0, scale, acl_input_mul_scale_tensor, inplace); @@ -1740,18 +2015,14 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } src1_fp32_allocator.alloc(n_bytes); void* src1_fp32_buffer = src1_fp32_allocator.get(); - acl_src1_fp32_tensor = create_acl_tensor(src1_fp32_buffer, - ACL_FLOAT, - sizeof(float), - src1->ne, - src1_fp32_nb, - GGML_MAX_DIMS); + acl_src1_fp32_tensor = + create_acl_tensor(src1_fp32_buffer, ACL_FLOAT, sizeof(float), + src1->ne, src1_fp32_nb, GGML_MAX_DIMS); aclTensor* acl_src1 = create_acl_tensor(src1); aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT); ACL_CHECK(aclDestroyTensor(acl_src1)); - } - else { + } else { acl_src1_fp32_tensor = create_acl_tensor(src1); } @@ -1765,8 +2036,8 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { tmp_mask_nb[i] = tmp_mask_nb[i - 1] * tmp_mask_ne[i - 1]; } tmp_mask_tensor = create_acl_tensor( - src1->data, ACL_FLOAT, sizeof(float), - tmp_mask_ne, tmp_mask_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + src1->data, ACL_FLOAT, sizeof(float), tmp_mask_ne, tmp_mask_nb, + GGML_MAX_DIMS, ACL_FORMAT_ND); } // alibi @@ -1777,41 +2048,34 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_pool_alloc output_allocator(ctx.pool(), n_bytes); void* output_buffer = output_allocator.get(); aclTensor* alibi_output_tensor = create_acl_tensor( - output_buffer, - ACL_FLOAT, - ggml_type_size(dst->type), - dst->ne, dst->nb, - GGML_MAX_DIMS); - if (max_bias <=0.0f) { + output_buffer, ACL_FLOAT, ggml_type_size(dst->type), dst->ne, + dst->nb, GGML_MAX_DIMS); + if (max_bias <= 0.0f) { // slope = 1.0 if (tmp_mask_tensor) { aclnn_add(ctx, tmp_mask_tensor, acl_input_mul_scale_tensor, - alibi_output_tensor); - } - else { + alibi_output_tensor); + } else { aclnn_add(ctx, acl_src1_fp32_tensor, acl_input_mul_scale_tensor, - alibi_output_tensor); + alibi_output_tensor); } - } - else { + } else { // slope != 1.0 if (tmp_mask_tensor) { aclnn_alibi(ctx, acl_input_mul_scale_tensor, tmp_mask_tensor, - alibi_output_tensor, n_head, src0->ne, src_nb0, max_bias, - dst); - } - else { - aclnn_alibi(ctx, acl_input_mul_scale_tensor, acl_src1_fp32_tensor, - alibi_output_tensor, n_head, src0->ne, src_nb0, max_bias, - dst); + alibi_output_tensor, n_head, src0->ne, src_nb0, + max_bias, dst); + } else { + aclnn_alibi(ctx, acl_input_mul_scale_tensor, + acl_src1_fp32_tensor, alibi_output_tensor, n_head, + src0->ne, src_nb0, max_bias, dst); } } // softmax aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst); ACL_CHECK(aclDestroyTensor(alibi_output_tensor)); - } - else { + } else { aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst); } @@ -1834,11 +2098,14 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { src1->extra = src1_extra_allocator.get(); 
dst->extra = dst_extra_allocator.get(); ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream())); + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, + ctx.stream())); ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream())); + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, + ctx.stream())); ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream())); + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, + ctx.stream())); switch (src0->type) { case GGML_TYPE_F32: @@ -1881,37 +2148,58 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } } -static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst, int64_t dim, int64_t repeats, - int64_t output_size) { - // each elem in acl_src will repeat. repeat number is `repeats`, repeats dim - // is `dim`. - +/** + * @brief Repeats elements of a tensor along a specified dimension. + * + * This function repeats each element of the source tensor `acl_src` a specified + * number of times (`repeats`) along the specified dimension `dim` and stores + * the result in the destination tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor whose elements will be repeated. + * @param acl_dst The destination tensor where the repeated elements will be + * stored. + * @param dim The dimension along which the elements will be repeated. + * @param repeats The number of times each element will be repeated. + * @param output_size The size of the output tensor. + */ +static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx, + aclTensor* acl_src, aclTensor* acl_dst, + int64_t dim, int64_t repeats, + int64_t output_size) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; - ACL_CHECK(aclnnRepeatInterleaveIntWithDimGetWorkspaceSize(acl_src, repeats, - dim, output_size, - acl_dst, - &workspaceSize, - &executor)); + ACL_CHECK(aclnnRepeatInterleaveIntWithDimGetWorkspaceSize( + acl_src, repeats, dim, output_size, acl_dst, &workspaceSize, + &executor)); if (workspaceSize > 0) { ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); - ACL_CHECK( - aclnnRepeatInterleaveIntWithDim(workspaceAddr, workspaceSize, executor, - main_stream)); - + ACL_CHECK(aclnnRepeatInterleaveIntWithDim(workspaceAddr, workspaceSize, + executor, ctx.stream())); } +/** + * @brief Performs matrix multiplication of two tensors. + * + * This function computes the matrix multiplication of the input tensor + * `acl_input` and the weight tensor `acl_weight`, and stores the result in the + * destination tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_input The input tensor for the matrix multiplication. + * @param acl_weight The weight tensor for the matrix multiplication. + * @param acl_dst The destination tensor where the result of the matrix + * multiplication will be stored. + */ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input, - aclTensor* acl_weight, aclTensor* acl_dst) { - int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is fp32, - // atlas a2 will transpose it to HFLOAT32. 
+ aclTensor* acl_weight, aclTensor* acl_dst) { + int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is + // fp32, atlas a2 will transpose it to HFLOAT32. uint64_t workspaceSize = 0; aclOpExecutor* executor; @@ -1926,28 +2214,42 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input, workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); - ACL_CHECK(aclnnMatmul(workspaceAddr, workspaceSize, executor, - main_stream)); + ACL_CHECK(aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream())); } -static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, ggml_tensor* dst) { +/** + * @brief Performs matrix multiplication with floating-point precision on + * tensors using the CANN backend. + * + * This function performs matrix multiplication of the input tensor and the + * weight tensor, handling broadcasting and transposing as needed, and stores + * the result in the destination tensor `dst`. + * + * @param ctx The context for the CANN backend operations. + * @param dst The destination tensor where the result of the matrix + * multiplication will be stored. + */ +static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, + ggml_tensor* dst) { ggml_tensor* weight = dst->src[0]; // weight - ggml_tensor* input = dst->src[1]; // input + ggml_tensor* input = dst->src[1]; // input - // when weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize will auto broadcast, - // when weight ne2 or ne3 is not 1, weight need repeat. + // when weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize will auto + // broadcast, when weight ne2 or ne3 is not 1, weight need repeat. BCAST_MUL_MAT_SHAPE(input, weight, dst); // transpose weight: [1,2,3,4] -> [1,2,4,3] int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0], - bcast_weight_ne[2], bcast_weight_ne[3], bcast_weight_ne[4], bcast_weight_ne[5]}; + bcast_weight_ne[2], bcast_weight_ne[3], + bcast_weight_ne[4], bcast_weight_ne[5]}; size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0], - bcast_weight_nb[2], bcast_weight_nb[3], bcast_weight_nb[4], bcast_weight_nb[5]}; - + bcast_weight_nb[2], bcast_weight_nb[3], + bcast_weight_nb[4], bcast_weight_nb[5]}; - aclTensor* acl_weight_tensor = create_acl_tensor(weight, transpose_ne, transpose_nb, bcast_dims); - aclTensor* acl_input_tensor = create_acl_tensor(input, BCAST_MUL_MAT_PARAM(input)); + aclTensor* acl_weight_tensor = + create_acl_tensor(weight, transpose_ne, transpose_nb, bcast_dims); + aclTensor* acl_input_tensor = + create_acl_tensor(input, BCAST_MUL_MAT_PARAM(input)); aclTensor* acl_dst = create_acl_tensor(dst, BCAST_MUL_MAT_PARAM(dst)); aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); @@ -1956,7 +2258,21 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, ggml_tensor* ds ACL_CHECK(aclDestroyTensor(acl_dst)); } -static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, ggml_tensor* dst) { +/** + * @brief Performs matrix multiplication with quantized weights and + * floating-point inputs using the CANN backend. + * + * This function performs matrix multiplication of the input tensor `src1` and + * the weight tensor `src0`, handling broadcasting, transposing, and + * quantization as needed, and stores the result in the destination tensor + * `dst`. + * + * @param ctx The context for the CANN backend operations. + * @param dst The destination tensor where the result of the matrix + * multiplication will be stored. 
+ */ +static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, + ggml_tensor* dst) { ggml_tensor* src0 = dst->src[0]; // weight ggml_tensor* src1 = dst->src[1]; // input @@ -1986,7 +2302,8 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, ggml_tensor* if (src1->type != GGML_TYPE_F16) { aclTensor* acl_src1_tensor = create_acl_tensor(src1); - ggml_cann_pool_alloc input_alloctor(ctx.pool(), ggml_nelements(src1) * input_elem_size); + ggml_cann_pool_alloc input_alloctor( + ctx.pool(), ggml_nelements(src1) * input_elem_size); input_buffer = input_alloctor.get(); int64_t* input_cast_ne = src1->ne; @@ -2010,7 +2327,8 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, ggml_tensor* size_t output_elem_size = sizeof(uint16_t); int64_t output_ne[] = {dst->ne[0], dst->ne[1]}; size_t output_nb[] = {output_elem_size, output_elem_size * dst->ne[0]}; - ggml_cann_pool_alloc output_alloctor(ctx.pool(), ggml_nelements(dst) * output_elem_size); + ggml_cann_pool_alloc output_alloctor( + ctx.pool(), ggml_nelements(dst) * output_elem_size); void* output_buffer = output_alloctor.get(); size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1]; @@ -2046,7 +2364,8 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, ggml_tensor* &workspaceSize, &executor)); if (workspaceSize > 0 && workspaceAddr == nullptr) { - ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), + workspaceSize); workspaceAddr = workspace_allocator.get(); } @@ -2097,9 +2416,24 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } } +/** + * @brief Rolls the elements of a tensor along a specified dimension. + * + * This function rolls the elements of the source tensor `acl_src` by the + * specified shifts `shifts` along the specified dimensions `dims`, and stores + * the result in the destination tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor whose elements will be rolled. + * @param acl_dst The destination tensor where the rolled elements will be + * stored. + * @param shifts An array specifying the number of positions by which elements + * are shifted. + * @param dims An array specifying the dimensions along which elements are + * shifted. + */ static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst, int64_t* shifts, int64_t* dims) { - + aclTensor* acl_dst, int64_t* shifts, int64_t* dims) { aclIntArray* acl_shifts = aclCreateIntArray(shifts, 1); aclIntArray* acl_dims = aclCreateIntArray(dims, 1); @@ -2114,20 +2448,29 @@ static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src, workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); - ACL_CHECK( - aclnnRoll(workspaceAddr, workspaceSize, executor, main_stream)); + ACL_CHECK(aclnnRoll(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyIntArray(acl_shifts)); ACL_CHECK(aclDestroyIntArray(acl_dims)); } +/** + * @brief Fills specified positions of a tensor with a scalar value. + * + * This function fills the positions in the source tensor `acl_src` specified by + * `index` along the dimension `dim` with the scalar value `value`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor where the positions will be filled. + * @param dim The dimension along which the positions are specified. 
+ * @param index An array specifying the positions to be filled. + * @param index_num The number of positions specified in the index array. + * @param value The scalar value used to fill the specified positions. + */ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx, aclTensor* acl_src, int64_t dim, int64_t* index, int64_t index_num, float value) { - // position in the @param.index along @param.dim will be filled with @param.value - aclIntArray* acl_index = aclCreateIntArray(index, index_num); aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT); @@ -2135,46 +2478,61 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx, aclOpExecutor* executor; void* workspaceAddr = nullptr; - ACL_CHECK(aclnnInplaceIndexFillTensorGetWorkspaceSize(acl_src, dim, - acl_index, acl_value, - &workspaceSize, - &executor)); + ACL_CHECK(aclnnInplaceIndexFillTensorGetWorkspaceSize( + acl_src, dim, acl_index, acl_value, &workspaceSize, &executor)); if (workspaceSize > 0) { ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); - ACL_CHECK( - aclnnInplaceIndexFillTensor(workspaceAddr, workspaceSize, executor, - main_stream)); + ACL_CHECK(aclnnInplaceIndexFillTensor(workspaceAddr, workspaceSize, + executor, ctx.stream())); ACL_CHECK(aclDestroyIntArray(acl_index)); ACL_CHECK(aclDestroyScalar(acl_value)); } +/** + * @brief Initializes the cache for sine and cosine values. + * + * @details This function prepares a cache for sine and cosine values which can + * be reused in subsequent computations, potentially improving + * performance by avoiding redundant calculations. + * + * @param ctx The context for the CANN backend operations. + * @param dst Pointer to the destination tensor where the + * final result will be stored. + * @param acl_cos_repeat_tensor Pointer to the ACL tensor where repeated cosine + * values will be cached. + * @param acl_sin_repeat_tensor Pointer to the ACL tensor where repeated sine + * values will be cached. + * @param theta_scale A scaling factor applied to the theta values + * before calculating sine and cosine. + * @param is_neox Boolean flag indicating whether to use the Neox + * repeat method for caching. 
+ */ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclTensor* acl_cos_repeat_tensor, aclTensor* acl_sin_repeat_tensor, float theta_scale, bool is_neox) { - // int sin/cos cache, cache has different repeat method depond on @param.is_neox + // int sin/cos cache, cache has different repeat method depond on + // @param.is_neox - ggml_tensor* src0 = dst->src[0]; // input - ggml_tensor* src1 = dst->src[1]; // position + ggml_tensor* src0 = dst->src[0]; // input + ggml_tensor* src1 = dst->src[1]; // position // arange, [0,1,...,ne0/2] int64_t arange_length = src0->ne[0] / 2; ggml_cann_pool_alloc arange_allocator(ctx.pool(), - arange_length*sizeof(float_t)); + arange_length * sizeof(float_t)); void* arange_buffer = arange_allocator.get(); int64_t arange_ne[] = {arange_length, 1, 1, 1}; size_t arange_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t), - arange_length*sizeof(float_t)}; + arange_length * sizeof(float_t)}; - aclTensor* acl_arange_tensor = create_acl_tensor(arange_buffer, ACL_FLOAT, - sizeof(float_t), - arange_ne, arange_nb, - GGML_MAX_DIMS); + aclTensor* acl_arange_tensor = + create_acl_tensor(arange_buffer, ACL_FLOAT, sizeof(float_t), arange_ne, + arange_nb, GGML_MAX_DIMS); float start = 0; float step = 1; float stop = src0->ne[0] / 2; @@ -2182,36 +2540,34 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclnn_arange(ctx, acl_arange_tensor, start, stop, step, n_elements); // power - // aclnnPowScalarTensor(): @param self is tensor which should be scalar, so use aclnn_pow_tensor_tensor() until fixed. - // aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT); - // aclnn_power_scalar_tensor(ctx, acl_theta_scale, acl_arange_tensor, acl_power_tensor); + // aclnnPowScalarTensor(): @param self is tensor which should be scalar, so + // use aclnn_pow_tensor_tensor() until fixed. 
aclScalar* acl_theta_scale = + // aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT); + // aclnn_power_scalar_tensor(ctx, acl_theta_scale, acl_arange_tensor, + // acl_power_tensor); ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(), - arange_length*sizeof(float_t)); + arange_length * sizeof(float_t)); void* theta_scale_buffer = theta_scale_allocator.get(); - aclTensor* acl_theta_scale_tensor = aclnn_ones(ctx, theta_scale_buffer, - arange_length*sizeof(float_t), - arange_ne, GGML_MAX_DIMS, - ACL_FLOAT, sizeof(float_t), - theta_scale); + aclTensor* acl_theta_scale_tensor = aclnn_ones( + ctx, theta_scale_buffer, arange_length * sizeof(float_t), arange_ne, + GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), theta_scale); aclnn_pow_tensor_tensor(ctx, acl_theta_scale_tensor, acl_arange_tensor); // position - GGML_ASSERT(src1->type==GGML_TYPE_I32); + GGML_ASSERT(src1->type == GGML_TYPE_I32); int64_t position_length = src1->ne[0]; int64_t position_ne[] = {1, position_length, 1, 1}; size_t position_nb[] = {sizeof(int32_t), sizeof(int32_t), - sizeof(int32_t)*position_length, - sizeof(int32_t)*position_length}; - aclTensor* acl_position_tensor = create_acl_tensor(src1->data, - type_mapping(src1->type), - ggml_type_size(src1->type), - position_ne, position_nb, - GGML_MAX_DIMS); + sizeof(int32_t) * position_length, + sizeof(int32_t) * position_length}; + aclTensor* acl_position_tensor = create_acl_tensor( + src1->data, type_mapping(src1->type), ggml_type_size(src1->type), + position_ne, position_nb, GGML_MAX_DIMS); // power * position int64_t theta_length = arange_length * position_length; ggml_cann_pool_alloc theta_allocator(ctx.pool(), - theta_length*sizeof(float_t)); + theta_length * sizeof(float_t)); void* theta_buffer = theta_allocator.get(); int64_t theta_ne[] = {arange_length, position_length, 1, 1}; size_t theta_nb[GGML_MAX_DIMS]; @@ -2219,10 +2575,10 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, for (int i = 1; i < GGML_MAX_DIMS; i++) { theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1]; } - aclTensor* acl_theta_tensor = create_acl_tensor(theta_buffer, ACL_FLOAT, - sizeof(float_t), theta_ne, - theta_nb, GGML_MAX_DIMS); - aclnn_noinplcace_mul(ctx, acl_position_tensor, acl_theta_scale_tensor, + aclTensor* acl_theta_tensor = + create_acl_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, + theta_nb, GGML_MAX_DIMS); + aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor, acl_theta_tensor); // permute: [0,1,2,3]->[0,2,1,3] @@ -2233,47 +2589,46 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, permute_nb[i] = permute_nb[i - 1] * permute_ne[i - 1]; } ggml_cann_pool_alloc permute_allocator(ctx.pool(), - theta_length*sizeof(float_t)); + theta_length * sizeof(float_t)); void* permute_buffer = permute_allocator.get(); - aclTensor* acl_permute_tensor = create_acl_tensor( - permute_buffer, ACL_FLOAT, sizeof(float_t), - permute_ne, permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + aclTensor* acl_permute_tensor = + create_acl_tensor(permute_buffer, ACL_FLOAT, sizeof(float_t), + permute_ne, permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); int64_t permute_dim[] = {0, 2, 1, 3}; int64_t num_dims = 4; aclnn_permute(ctx, acl_theta_tensor, acl_permute_tensor, permute_dim, num_dims); // sin/cos - ggml_cann_pool_alloc sin_allocator(ctx.pool(), theta_length*sizeof(float_t)); + ggml_cann_pool_alloc sin_allocator(ctx.pool(), + theta_length * sizeof(float_t)); void* sin_buffer = sin_allocator.get(); - aclTensor* acl_sin_tensor = create_acl_tensor( - 
sin_buffer, ACL_FLOAT, sizeof(float_t), - permute_ne, permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + aclTensor* acl_sin_tensor = + create_acl_tensor(sin_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, + permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); aclnn_sin(ctx, acl_permute_tensor, acl_sin_tensor); - ggml_cann_pool_alloc cos_allocator(ctx.pool(), theta_length*sizeof(float_t)); + ggml_cann_pool_alloc cos_allocator(ctx.pool(), + theta_length * sizeof(float_t)); void* cos_buffer = cos_allocator.get(); - aclTensor* acl_cos_tensor = create_acl_tensor( - cos_buffer, ACL_FLOAT, sizeof(float_t), - permute_ne, permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + aclTensor* acl_cos_tensor = + create_acl_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, + permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); aclnn_cos(ctx, acl_permute_tensor, acl_cos_tensor); // repeat if (is_neox) { - int64_t repeatsArray[] = {1,1,1,2}; + int64_t repeatsArray[] = {1, 1, 1, 2}; aclnn_repeat(ctx, acl_sin_tensor, acl_sin_repeat_tensor, repeatsArray); aclnn_repeat(ctx, acl_cos_tensor, acl_cos_repeat_tensor, repeatsArray); - } - else { + } else { int64_t num_repeats = 2; int64_t dim = 3; - int64_t output_size = arange_length*num_repeats; - aclnn_repeat_interleave(ctx, acl_sin_tensor, - acl_sin_repeat_tensor, dim, num_repeats, - output_size); - aclnn_repeat_interleave(ctx, acl_cos_tensor, - acl_cos_repeat_tensor, dim, num_repeats, - output_size); + int64_t output_size = arange_length * num_repeats; + aclnn_repeat_interleave(ctx, acl_sin_tensor, acl_sin_repeat_tensor, dim, + num_repeats, output_size); + aclnn_repeat_interleave(ctx, acl_cos_tensor, acl_cos_repeat_tensor, dim, + num_repeats, output_size); } // release @@ -2289,38 +2644,35 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { // TODO: use ascendc // Only test with LLAMA model. 
- ggml_tensor* src0 = dst->src[0]; // input - ggml_tensor* src2 = dst->src[2]; // freq_factors - - // TODO: with freq_factors - GGML_ASSERT(src2 == NULL); + ggml_tensor* src0 = dst->src[0]; // input + ggml_tensor* src2 = dst->src[2]; // freq_factors // param float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - //const int n_ctx = ((int32_t *) dst->op_params)[3]; - const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; + // const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t*)dst->op_params)[1]; + const int mode = ((int32_t*)dst->op_params)[2]; + // const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_ctx_orig = ((int32_t*)dst->op_params)[4]; GGML_TENSOR_UNARY_OP_LOCALS - memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + memcpy(&freq_base, (int32_t*)dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t*)dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t*)dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t*)dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float)); GGML_ASSERT(n_dims <= ne0); GGML_ASSERT(n_dims % 2 == 0); - // TODO: ext_factor != 0 + // TODO: with freq_factors, ext_factor != 0, freq_scale != 1 + GGML_ASSERT(src2 == NULL); GGML_ASSERT(ext_factor == 0); - // TODO: freq_scale != 1 GGML_ASSERT(freq_scale == 1); - const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f / n_dims); float corr_dims[2]; ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, @@ -2329,10 +2681,10 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { const bool is_neox = mode & 2; // init cos/sin cache - ggml_cann_pool_alloc sin_allocator(ctx.pool(), src0->ne[0] * src0->ne[2] - * sizeof(float_t)); - ggml_cann_pool_alloc cos_allocator(ctx.pool(), src0->ne[0] * src0->ne[2] - * sizeof(float_t)); + ggml_cann_pool_alloc sin_allocator( + ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t)); + ggml_cann_pool_alloc cos_allocator( + ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t)); void* sin_buffer = sin_allocator.get(); void* cos_buffer = cos_allocator.get(); @@ -2342,16 +2694,12 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { for (int i = 1; i < GGML_MAX_DIMS; i++) { sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1]; } - aclTensor* acl_sin_reshape_tensor = create_acl_tensor(sin_buffer, ACL_FLOAT, - sizeof(float_t), - sin_reshape_ne, - sin_reshape_nb, - GGML_MAX_DIMS); - aclTensor* acl_cos_reshape_tensor = create_acl_tensor(cos_buffer, ACL_FLOAT, - sizeof(float_t), - sin_reshape_ne, - sin_reshape_nb, - GGML_MAX_DIMS); + aclTensor* acl_sin_reshape_tensor = + create_acl_tensor(sin_buffer, ACL_FLOAT, sizeof(float_t), + sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); + aclTensor* acl_cos_reshape_tensor = + create_acl_tensor(cos_buffer, 
ACL_FLOAT, sizeof(float_t), + sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor, theta_scale, is_neox); @@ -2360,32 +2708,25 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclTensor* acl_minus_one_tensor; void* minus_one_scale_buffer = nullptr; ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0)); - ggml_cann_pool_alloc minus_one_scale_allocator(ctx.pool(), - sizeof(float_t) * src0->ne[0]); + ggml_cann_pool_alloc minus_one_scale_allocator( + ctx.pool(), sizeof(float_t) * src0->ne[0]); if (!is_neox) { // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...] input_roll_buffer = roll_allocator.get(); - int64_t input_roll_ne[4] = {2, src0->ne[1]*(src0->ne[0]/2), src0->ne[2], - src0->ne[3]}; + int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2), + src0->ne[2], src0->ne[3]}; size_t input_roll_nb[GGML_MAX_DIMS]; input_roll_nb[0] = ggml_type_size(src0->type); for (int i = 1; i < GGML_MAX_DIMS; i++) { input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1]; } - aclTensor* acl_input_roll_tensor = create_acl_tensor( - input_roll_buffer, - type_mapping(src0->type), - ggml_type_size(src0->type), - input_roll_ne, - input_roll_nb, - GGML_MAX_DIMS); + aclTensor* acl_input_roll_tensor = + create_acl_tensor(input_roll_buffer, type_mapping(src0->type), + ggml_type_size(src0->type), input_roll_ne, + input_roll_nb, GGML_MAX_DIMS); aclTensor* acl_input_tensor = create_acl_tensor( - src0->data, - type_mapping(src0->type), - ggml_type_size(src0->type), - input_roll_ne, - input_roll_nb, - GGML_MAX_DIMS); + src0->data, type_mapping(src0->type), ggml_type_size(src0->type), + input_roll_ne, input_roll_nb, GGML_MAX_DIMS); int64_t shifts[] = {1}; int64_t dims[] = {3}; @@ -2402,28 +2743,25 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { for (int i = 1; i < GGML_MAX_DIMS; i++) { minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; } - acl_minus_one_tensor = aclnn_ones(ctx, minus_one_scale_buffer, - sizeof(float_t) * src0->ne[0], - minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, - sizeof(float_t), 1); + acl_minus_one_tensor = aclnn_ones( + ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0], + minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1); int64_t dim = 3; int64_t* index = new int64_t[src0->ne[0]]; - for (int i=0; ine[0]; i++) { - index[i] = i/2*2; + for (int i = 0; i < src0->ne[0]; i++) { + index[i] = i / 2 * 2; } int64_t index_num = src0->ne[0]; float value = -1; - aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim ,index, index_num, value); - } - else { - // roll input: [q0,q1,q2,...] -> [q_half,q_half+1,...,q_end,q0,q1,...q_half-1] + aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index, + index_num, value); + } else { + // roll input: [q0,q1,q2,...] 
-> + // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1] input_roll_buffer = roll_allocator.get(); aclTensor* acl_input_roll_tensor = create_acl_tensor( - input_roll_buffer, - type_mapping(src0->type), - ggml_type_size(src0->type), - src0->ne, src0->nb, - GGML_MAX_DIMS); + input_roll_buffer, type_mapping(src0->type), + ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS); aclTensor* acl_input_tensor = create_acl_tensor(src0); int64_t shifts[] = {src0->ne[0] / 2}; @@ -2442,19 +2780,19 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { for (int i = 1; i < GGML_MAX_DIMS; i++) { minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; } - acl_minus_one_tensor = aclnn_ones(ctx, minus_one_scale_buffer, - sizeof(float_t) * src0->ne[0], - minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, - sizeof(float_t), 1); + acl_minus_one_tensor = aclnn_ones( + ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0], + minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1); // -1 * first half - int64_t first_half_ne[4] = {src0->ne[0]/2, 1, 1, 1}; + int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1}; size_t first_half_nb[GGML_MAX_DIMS]; first_half_nb[0] = sizeof(float_t); for (int i = 1; i < GGML_MAX_DIMS; i++) { first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1]; } - aclTensor* acl_first_half_tensor = create_acl_tensor(minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), - first_half_ne, first_half_nb, GGML_MAX_DIMS); + aclTensor* acl_first_half_tensor = create_acl_tensor( + minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne, + first_half_nb, GGML_MAX_DIMS); bool inplace = true; float scale = -1; aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace); @@ -2462,10 +2800,11 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } // TODO: n_dims < ne0 - GGML_ASSERT(n_dims==src0->ne[0]); + GGML_ASSERT(n_dims == src0->ne[0]); // input * scale - ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(), ggml_nbytes(src0)); + ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(), + ggml_nbytes(src0)); void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get(); size_t input_nb[GGML_MAX_DIMS]; input_nb[0] = ggml_type_size(src0->type); @@ -2473,19 +2812,13 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { input_nb[i] = input_nb[i - 1] * src0->ne[i - 1]; } aclTensor* acl_input_roll_mul_scale_tensor = create_acl_tensor( - input_roll_mul_scale_buffer, - type_mapping(src0->type), - ggml_type_size(src0->type), - src0->ne, input_nb, - GGML_MAX_DIMS); + input_roll_mul_scale_buffer, type_mapping(src0->type), + ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS); aclTensor* acl_input_roll_reshape_tensor = create_acl_tensor( - input_roll_buffer, - type_mapping(src0->type), - ggml_type_size(src0->type), - src0->ne, input_nb, - GGML_MAX_DIMS); + input_roll_buffer, type_mapping(src0->type), ggml_type_size(src0->type), + src0->ne, input_nb, GGML_MAX_DIMS); - aclnn_noinplcace_mul(ctx, acl_input_roll_reshape_tensor, + aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor, acl_input_roll_mul_scale_tensor); // output @@ -2493,56 +2826,49 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclTensor* acl_dst = create_acl_tensor(dst); void* output_fp32_buffer; if (src0->type == GGML_TYPE_F32) { + // dst=src0*cos+input_roll_mul_scale*sin. 
aclnn_inplace_mul(ctx, acl_src0, acl_cos_reshape_tensor); aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor); aclnn_add(ctx, acl_src0, acl_input_roll_mul_scale_tensor, acl_dst); - // TODO: ne0 != n_dims in mode2 - } - else if (src0->type == GGML_TYPE_F16) { - size_t input_fp32_nb[GGML_MAX_DIMS]; - input_fp32_nb[0] = sizeof(float_t); + } else if (src0->type == GGML_TYPE_F16) { + // dst=src0_mul_cos + input_roll_mul_cos. + size_t output_fp32_nb[GGML_MAX_DIMS]; + output_fp32_nb[0] = sizeof(float_t); for (int i = 1; i < GGML_MAX_DIMS; i++) { - input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1]; + output_fp32_nb[i] = output_fp32_nb[i - 1] * dst->ne[i - 1]; } - ggml_cann_pool_alloc fp32_allocator1(ctx.pool(), - ggml_nelements(dst)*sizeof(float_t)); - void* input_fp32_buffer1 = fp32_allocator1.get(); - aclTensor* input_fp32_tensor1 = create_acl_tensor(input_fp32_buffer1, - ACL_FLOAT, - sizeof(float_t), - dst->ne, - input_fp32_nb, - GGML_MAX_DIMS); - ggml_cann_pool_alloc fp32_allocator2(ctx.pool(), - ggml_nelements(dst)*sizeof(float_t)); - void* input_fp32_buffer2 = fp32_allocator2.get(); - aclTensor* input_fp32_tensor2 = create_acl_tensor(input_fp32_buffer2, - ACL_FLOAT, - sizeof(float_t), - dst->ne, - input_fp32_nb, - GGML_MAX_DIMS); - - ggml_cann_pool_alloc fp32_allocator(ctx.pool(), - ggml_nelements(dst)*sizeof(float_t)); - output_fp32_buffer = fp32_allocator.get(); - aclTensor* output_fp32_tensor = create_acl_tensor(output_fp32_buffer, - ACL_FLOAT, - sizeof(float_t), - dst->ne, - input_fp32_nb, - GGML_MAX_DIMS); - aclnn_noinplcace_mul(ctx, acl_src0, acl_cos_reshape_tensor, - input_fp32_tensor1); - aclnn_noinplcace_mul(ctx, acl_input_roll_mul_scale_tensor, - acl_sin_reshape_tensor, input_fp32_tensor2); - aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2, + ggml_cann_pool_alloc src0_mul_cos_allocator( + ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); + void* src0_mul_cos_buffer = src0_mul_cos_allocator.get(); + aclTensor* src0_mul_cos_tensor = + create_acl_tensor(src0_mul_cos_buffer, ACL_FLOAT, sizeof(float_t), + dst->ne, output_fp32_nb, GGML_MAX_DIMS); + + ggml_cann_pool_alloc input_roll_mul_cos_allocator( + ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); + void* input_roll_mul_cos_buffer = input_roll_mul_cos_allocator.get(); + aclTensor* input_roll_mul_cos_tensor = + create_acl_tensor(input_roll_mul_cos_buffer, ACL_FLOAT, + sizeof(float_t), dst->ne, output_fp32_nb, + GGML_MAX_DIMS); + + ggml_cann_pool_alloc output_fp32_allocator( + ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); + output_fp32_buffer = output_fp32_allocator.get(); + aclTensor* output_fp32_tensor = + create_acl_tensor(output_fp32_buffer, ACL_FLOAT, sizeof(float_t), + dst->ne, output_fp32_nb, GGML_MAX_DIMS); + aclnn_mul(ctx, acl_src0, acl_cos_reshape_tensor, + src0_mul_cos_tensor); + aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, + acl_sin_reshape_tensor, input_roll_mul_cos_tensor); + aclnn_add(ctx, src0_mul_cos_tensor, input_roll_mul_cos_tensor, output_fp32_tensor); aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16); - ACL_CHECK(aclDestroyTensor(input_fp32_tensor1)); - ACL_CHECK(aclDestroyTensor(input_fp32_tensor2)); + ACL_CHECK(aclDestroyTensor(src0_mul_cos_tensor)); + ACL_CHECK(aclDestroyTensor(input_roll_mul_cos_tensor)); ACL_CHECK(aclDestroyTensor(output_fp32_tensor)); } diff --git a/src/llama.cpp b/src/llama.cpp index 96395409aa9a4b..f49a4e186c3dca 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -14821,8 +14821,6 @@ static void 
llama_kv_cache_update_internal(struct llama_context & lctx) { llama_graph_compute(lctx, gf, lctx.cparams.n_threads); need_reserve = true; - - LLAMA_LOG_INFO("\n\n\n\nkv cache updated!!!!!\n\n\n\n"); } {
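Note (illustrative only, not part of the patch): the sketch below is a plain CPU reference, under stated assumptions, for the cos/sin cache that aclnn_cache_init assembles on the NPU for ggml_cann_rope: theta_scale = powf(freq_base, -2.0f / n_dims), theta[p][i] = pos[p] * theta_scale^i, then an interleaved repeat of each value for the non-neox mode and a block repeat ({1, 1, 1, 2}) for the neox mode. It assumes n_dims == ne0, freq_scale == 1, ext_factor == 0, and no freq_factors, matching the asserts above; the function and variable names are hypothetical reading aids, not code from the patch.

#include <cmath>
#include <cstdint>
#include <vector>

// Hypothetical CPU reference for the cache built by aclnn_cache_init.
// pos holds the token positions (src1), ne0 is the rotated head dimension
// (src0->ne[0]); sin_out/cos_out are laid out as [position][ne0].
static void rope_cache_reference(const std::vector<int32_t>& pos, int64_t ne0,
                                 float freq_base, bool is_neox,
                                 std::vector<float>& sin_out,
                                 std::vector<float>& cos_out) {
    const int64_t half = ne0 / 2;
    const float theta_scale = std::pow(freq_base, -2.0f / (float) ne0);
    sin_out.assign(pos.size() * ne0, 0.0f);
    cos_out.assign(pos.size() * ne0, 0.0f);
    for (size_t p = 0; p < pos.size(); p++) {
        for (int64_t i = 0; i < half; i++) {
            // theta[p][i] = pos[p] * theta_scale^i (arange + pow + mul above)
            const float theta =
                (float) pos[p] * std::pow(theta_scale, (float) i);
            // non-neox: repeat_interleave -> [v0,v0,v1,v1,...]
            // neox:     repeat {1,1,1,2}  -> [v0,v1,...,v0,v1,...]
            const int64_t j0 = is_neox ? i : 2 * i;
            const int64_t j1 = is_neox ? i + half : 2 * i + 1;
            sin_out[p * ne0 + j0] = std::sin(theta);
            sin_out[p * ne0 + j1] = std::sin(theta);
            cos_out[p * ne0 + j0] = std::cos(theta);
            cos_out[p * ne0 + j1] = std::cos(theta);
        }
    }
}

A cache computed this way can be compared element-wise against the contents of sin_buffer/cos_buffer after aclnn_cache_init to sanity-check the NPU path under the same assumptions.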