diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index e48372d92ebe23..a98bd49c37e0cf 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -1,7 +1,28 @@ +/* + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + #include "aclnn_ops.h" #include -#include #include #include #include @@ -11,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -28,14 +50,24 @@ #include #include -#include #include +#include + #include "kernels/ascendc_kernels.h" +/** + * @brief Repeats elements of a tensor along each dimension according to the + * specified repeat array. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor to be repeated. + * @param acl_dst The destination tensor after repeating. + * @param repeat_array The array specifying the number of repetitions along each + * dimension. + */ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst, int64_t* repeat_array) { + aclTensor* acl_dst, int64_t* repeat_array) { // repeat tensor along each dim with repeat_array - aclIntArray* repeats = aclCreateIntArray(repeat_array, GGML_MAX_DIMS); uint64_t workspaceSize = 0; @@ -46,15 +78,15 @@ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src, &workspaceSize, &executor)); if (workspaceSize > 0) { - // Memory from allocator will "free" immediately, but this memory - // will be distribute to other pointers, but it won't access before - // this async task end. - // All tasks in same stream will execute in queue. + // Memory from allocator will "free" immediately, and this memory + // will be alloced to other pointers, but it won't access before + // this async task end because all tasks in same stream will execute + // in queue. ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); workspaceAddr = workspace_allocator.get(); } - aclrtStream stream = ctx.stream(); - ACL_CHECK(aclnnRepeat(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK( + aclnnRepeat(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyIntArray(repeats)); } @@ -62,11 +94,6 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src = dst->src[0]; GGML_ASSERT(ggml_can_repeat(src, dst)); - size_t nbytes = ggml_nbytes(dst); - aclrtStream main_stream = ctx.stream(); - // Set dst to a zero tensor. 
- ACL_CHECK(aclrtMemsetAsync(dst->data, nbytes, 0, nbytes, main_stream)); - aclTensor* acl_src = create_acl_tensor(src); aclTensor* acl_dst = create_acl_tensor(dst); @@ -78,10 +105,21 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclDestroyTensor(acl_dst)); } +/** + * @brief Adds two tensors element-wise and stores the result in a destination + * tensor. + * + * This function performs the operation: dst = acl_src0 + alpha * acl_src1 + * where alpha is a scalar value. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src0 The first source tensor. + * @param acl_src1 The second source tensor. + * @param acl_dst The destination tensor where the result will be stored. + */ static void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0, - aclTensor* acl_src1, aclTensor* acl_dst) { + aclTensor* acl_src1, aclTensor* acl_dst) { // add: dst = acl_src0 + alpha*acl_src1 - aclScalar* alpha = nullptr; float alphaValue = 1.0f; alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); @@ -97,8 +135,7 @@ static void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0, workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); - ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, main_stream)); + ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyScalar(alpha)); } @@ -156,22 +193,31 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) { workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); ACL_CHECK( - aclnnLeakyRelu(workspaceAddr, workspaceSize, executor, main_stream)); + aclnnLeakyRelu(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyScalar(acl_negative_slope)); ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_dst)); } -static void aclnn_concat(ggml_backend_cann_context& ctx, aclTensorList* tensorList, - aclTensor* acl_dst, int64_t concat_dim) { +/** + * @brief Concatenates a list of tensors along a specified dimension and stores + * the result in a destination tensor. + * + * @param ctx The context for the CANN backend operations. + * @param tensorList The list of tensors to be concatenated. + * @param acl_dst The destination tensor where the concatenated result will be + * stored. + * @param concat_dim The dimension along which the tensors will be concatenated. + */ +static void aclnn_concat(ggml_backend_cann_context& ctx, + aclTensorList* tensorList, aclTensor* acl_dst, + int64_t concat_dim) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; - // dims in llama.cpp is reversed. ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, concat_dim, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { @@ -179,8 +225,7 @@ static void aclnn_concat(ggml_backend_cann_context& ctx, aclTensorList* tensorLi workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); - ACL_CHECK(aclnnCat(workspaceAddr, workspaceSize, executor, main_stream)); + ACL_CHECK(aclnnCat(workspaceAddr, workspaceSize, executor, ctx.stream())); } void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { @@ -199,10 +244,23 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclDestroyTensor(acl_dst)); } +/** + * @brief Creates a tensor with values starting from `start`, incremented by + * `step`, and ending before `stop`. 
+ * + * This function performs the operation: [start, stop), out(i+1) = out(i) + + * step. + * + * @param ctx The context for the CANN backend operations. + * @param acl_dst The destination tensor where the values will be stored. + * @param start The starting value of the range. + * @param stop The ending value of the range (exclusive). + * @param step The step size between consecutive values. + * @param n_elements The number of elements in the destination tensor. + */ static void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst, - float start, float stop, float step, int64_t n_elements) { - // arange: [start, stop), out(i+1) = out(i) + step. - + float start, float stop, float step, + int64_t n_elements) { int64_t steps = (int64_t)std::ceil((stop - start) / step); GGML_ASSERT(n_elements == steps); @@ -221,8 +279,8 @@ static void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst, workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); - ACL_CHECK(aclnnArange(workspaceAddr, workspaceSize, executor, main_stream)); + ACL_CHECK( + aclnnArange(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyScalar(acl_start)); ACL_CHECK(aclDestroyScalar(acl_end)); @@ -252,8 +310,6 @@ void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - // y = max(min(x, max_value), min_value). - ggml_tensor* src = dst->src[0]; GGML_ASSERT(src->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); @@ -280,8 +336,7 @@ void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) { workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); - ACL_CHECK(aclnnClamp(workspaceAddr, workspaceSize, executor, main_stream)); + ACL_CHECK(aclnnClamp(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyScalar(acl_min)); ACL_CHECK(aclDestroyScalar(acl_max)); @@ -290,8 +345,6 @@ void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - // acl_dst = acl_src * scale. 
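For orientation while reading the hunks above and below: nearly every operator in this file repeats the same two-phase aclnn calling convention, and the recurring refactor is simply dropping the local `aclrtStream`/`main_stream` temporaries in favour of passing `ctx.stream()` directly. The sketch below is illustrative only and is not part of this patch; `launch_muls_example` is a hypothetical name, while `aclnnMulsGetWorkspaceSize`/`aclnnMuls`, `ACL_CHECK` and `ggml_cann_pool_alloc` are the same calls already used in this file.

static void launch_muls_example(ggml_backend_cann_context& ctx,
                                aclTensor* acl_src, aclScalar* acl_scale,
                                aclTensor* acl_dst) {
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;

    // Phase 1: query the scratch size and build the executor.
    ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, acl_scale, acl_dst,
                                        &workspaceSize, &executor));

    // Phase 2: take scratch memory from the per-context pool and launch on
    // the context stream. The allocation may be handed back to the pool as
    // soon as this scope ends; that is safe because tasks on one stream run
    // in submission order, so no later task can reuse the buffer before
    // this kernel has read it.
    if (workspaceSize > 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }
    ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, ctx.stream()));
}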
- ggml_tensor* src = dst->src[0]; // scale factor @@ -313,8 +366,7 @@ void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) { workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); - ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, main_stream)); + ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyScalar(scale)); ACL_CHECK(aclDestroyTensor(acl_src)); @@ -327,7 +379,8 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclTensor* acl_src = create_acl_tensor(src); aclTensor* acl_dst = create_acl_tensor(dst); - ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(int64_t)); + ggml_cann_pool_alloc temp_buffer_allocator( + ctx.pool(), ggml_nelements(dst) * sizeof(int64_t)); void* buffer = temp_buffer_allocator.get(); aclTensor* tmp_tensor = create_acl_tensor(buffer, ACL_INT64, ggml_type_size(dst->type), dst->ne, @@ -345,9 +398,8 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) { workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); ACL_CHECK( - aclnnArgsort(workspaceAddr, workspaceSize, executor, main_stream)); + aclnnArgsort(workspaceAddr, workspaceSize, executor, ctx.stream())); workspaceSize = 0; ACL_CHECK(aclnnCastGetWorkspaceSize(tmp_tensor, type_mapping(dst->type), @@ -357,7 +409,7 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) { workspaceAddr = workspace_allocator.get(); } - ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, main_stream)); + ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(tmp_tensor)); @@ -365,8 +417,6 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - // layer_norm for one layer. 
- ggml_tensor* src = dst->src[0]; aclTensor* acl_src = create_acl_tensor(src); @@ -390,9 +440,8 @@ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { workspaceAddr = workspace_allocator.get(); } - aclrtStream stream = ctx.stream(); - - ACL_CHECK(aclnnLayerNorm(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK( + aclnnLayerNorm(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyIntArray(norm)); ACL_CHECK(aclDestroyTensor(acl_src)); @@ -420,6 +469,7 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { int64_t ne[] = {n_groups, N}; size_t nb[] = {type_size, type_size * n_groups}; size_t n_bytes = N * n_groups; + ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes * 2); void* buffer = temp_buffer_allocator.get(); aclTensor* acl_mean_out = @@ -436,9 +486,8 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { workspaceAddr = workspace_allocator.get(); } - aclrtStream stream = ctx.stream(); - - ACL_CHECK(aclnnGroupNorm(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK( + aclnnGroupNorm(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_dst)); @@ -447,9 +496,6 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - // if inplace: dst = dst + alpha * src1 - // else: dst = src0 + alpha * src1 - ggml_tensor* src0 = dst->src[0]; ggml_tensor* src1 = dst->src[1]; @@ -473,12 +519,10 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclOpExecutor* executor; void* workspaceAddr = nullptr; - aclrtStream stream = ctx.stream(); - if (!inplace) { size_t cpy_size = ggml_nbytes(dst); ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src0->data, cpy_size, - ACL_MEMCPY_DEVICE_TO_DEVICE, stream)); + ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); aclTensor* acl_src0 = create_acl_tensor( src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset); ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst, @@ -487,7 +531,8 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); workspaceAddr = workspace_allocator.get(); } - ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK( + aclnnAdd(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyTensor(acl_src0)); } else { ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src1, alpha, @@ -496,8 +541,8 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); workspaceAddr = workspace_allocator.get(); } - ACL_CHECK( - aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, + ctx.stream())); } ACL_CHECK(aclDestroyTensor(acl_src1)); @@ -505,8 +550,6 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - // reducesum along last dim. 
- ggml_tensor* src = dst->src[0]; aclTensor* acl_src = create_acl_tensor(src); @@ -529,8 +572,8 @@ void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { workspaceAddr = workspace_allocator.get(); } - aclrtStream stream = ctx.stream(); - ACL_CHECK(aclnnReduceSum(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK( + aclnnReduceSum(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_dst)); @@ -539,7 +582,6 @@ void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src = dst->src[0]; - aclTensor* acl_src = create_acl_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW); aclTensor* acl_dst = @@ -552,8 +594,6 @@ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx, aclOpExecutor* executor; void* workspaceAddr = nullptr; - aclrtStream stream = ctx.stream(); - ACL_CHECK(aclnnUpsampleNearest2dGetWorkspaceSize( acl_src, output_size_array, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { @@ -561,14 +601,28 @@ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx, workspaceAddr = workspace_allocator.get(); } - ACL_CHECK( - aclnnUpsampleNearest2d(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclnnUpsampleNearest2d(workspaceAddr, workspaceSize, executor, + ctx.stream())); ACL_CHECK(aclDestroyIntArray(output_size_array)); ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_dst)); } +/** + * @brief Pads a tensor with a specified value along each dimension. + * + * This function performs padding of the source tensor `acl_src` and stores the + * result in the destination tensor `acl_dst`. The padding values for each + * dimension are specified in the `paddings` array. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor to be padded. + * @param acl_dst The destination tensor where the padded result will be stored. + * @param paddings An array specifying the padding values for each dimension. + * The size of the array should be twice the number of dimensions of the tensor. + * @param value The value to be used for padding. The default value is 0.0. + */ static void aclnn_pad(ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst, int64_t* paddings, float value = 0.0f) { @@ -587,9 +641,8 @@ static void aclnn_pad(ggml_backend_cann_context& ctx, aclTensor* acl_src, workspaceAddr = workspace_allocator.get(); } - aclrtStream stream = ctx.stream(); - ACL_CHECK( - aclnnConstantPadNd(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclnnConstantPadNd(workspaceAddr, workspaceSize, executor, + ctx.stream())); ACL_CHECK(aclDestroyIntArray(acl_pad)); ACL_CHECK(aclDestroyScalar(acl_value)); @@ -613,22 +666,18 @@ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclDestroyTensor(acl_src)); } -void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - const int32_t* opts = (const int32_t*)dst->op_params; - enum ggml_op_pool op = static_cast(opts[0]); - switch (op) { - case GGML_OP_POOL_AVG: - ggml_cann_avg_pool2d(ctx, dst); - break; - case GGML_OP_POOL_MAX: - ggml_cann_max_pool2d(ctx, dst); - break; - case GGML_OP_POOL_COUNT: - GGML_ASSERT(false); - break; - } -} - +/** + * @brief Performs 2D average pooling on the input tensor and stores the result + * in the destination tensor. 
+ * + * This function performs average pooling on the source tensor and stores the + * result in the destination tensor. The pooling parameters (kernel size, + * strides, padding) are specified in the `op_params` of the destination tensor. + * + * @param ctx The context for the CANN backend operations. + * @param dst The destination tensor where the result will be stored. The source + * tensor is referenced by `dst->src[0]`. + */ static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src = dst->src[0]; @@ -640,7 +689,6 @@ static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, aclTensor* acl_dst = create_acl_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW); - // params const int32_t* opts = (const int32_t*)dst->op_params; const int k0 = opts[1]; const int k1 = opts[2]; @@ -657,17 +705,15 @@ static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, auto* strides = aclCreateIntArray(stride_dims.data(), 2); auto* paddings_avg = aclCreateIntArray(padding_avg_dims.data(), 2); - bool ceil_mode = false; // + bool ceil_mode = false; bool count_include_pad = true; int64_t divisor_override = 0; int8_t cube_math_type = 0; - // execute op api uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; - aclrtStream stream = ctx.stream(); ACL_CHECK(aclnnAvgPool2dGetWorkspaceSize( acl_src, kernel_size, strides, paddings_avg, ceil_mode, count_include_pad, divisor_override, cube_math_type, acl_dst, @@ -677,9 +723,8 @@ static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); workspaceAddr = workspace_allocator.get(); } - ACL_CHECK(aclnnAvgPool2d(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclnnAvgPool2d(workspaceAddr, workspaceSize, executor, ctx.stream())); - // release ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_dst)); ACL_CHECK(aclDestroyIntArray(kernel_size)); @@ -687,6 +732,18 @@ static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, ACL_CHECK(aclDestroyIntArray(paddings_avg)); } +/** + * @brief Performs 2D max pooling on the input tensor and stores the result in + * the destination tensor. + * + * This function performs max pooling on the source tensor and stores the result + * in the destination tensor. The pooling parameters (kernel size, strides, + * padding) are specified in the `op_params` of the destination tensor. + * + * @param ctx The context for the CANN backend operations. + * @param dst The destination tensor where the result will be stored. The source + * tensor is referenced by `dst->src[0]`. 
+ */ static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src = dst->src[0]; @@ -697,7 +754,7 @@ static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, create_acl_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW); aclTensor* acl_dst = create_acl_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW); - // params + const int32_t* opts = (const int32_t*)dst->op_params; const int k0 = opts[1]; const int k1 = opts[2]; @@ -715,7 +772,8 @@ static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, temp_nb[i] = temp_nb[i - 1] * temp_ne[i - 1]; } - ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]); + ggml_cann_pool_alloc temp_buffer_allocator( + ctx.pool(), ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]); void* buffer = temp_buffer_allocator.get(); aclTensor* tmp_tensor = create_acl_tensor(buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, @@ -743,7 +801,6 @@ static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; - aclrtStream stream = ctx.stream(); ACL_CHECK(aclnnMaxPoolGetWorkspaceSize( tmp_tensor, kernel_size, strides, auto_pads, paddings_max, dilations, @@ -753,9 +810,8 @@ static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, workspaceAddr = workspace_allocator.get(); } - ACL_CHECK(aclnnMaxPool(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclnnMaxPool(workspaceAddr, workspaceSize, executor, ctx.stream())); - // release ACL_CHECK(aclDestroyTensor(acl_src)); ACL_CHECK(aclDestroyTensor(acl_dst)); ACL_CHECK(aclDestroyTensor(tmp_tensor)); @@ -765,7 +821,34 @@ static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, ACL_CHECK(aclDestroyIntArray(dilations)); } -static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTensor* acl_dst) { +void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + const int32_t* opts = (const int32_t*)dst->op_params; + enum ggml_op_pool op = static_cast(opts[0]); + switch (op) { + case GGML_OP_POOL_AVG: + ggml_cann_avg_pool2d(ctx, dst); + break; + case GGML_OP_POOL_MAX: + ggml_cann_max_pool2d(ctx, dst); + break; + case GGML_OP_POOL_COUNT: + GGML_ASSERT(false); + break; + } +} + +/** + * @brief Copies data from the source tensor to the destination tensor. + * + * This function copies data from the source tensor `acl_src` to the destination + * tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor from which data will be copied. + * @param acl_dst The destination tensor where the data will be copied to. + */ +static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -778,11 +861,15 @@ static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src, aclTen workspaceAddr = workspace_allocator.get(); } - aclrtStream stream = ctx.stream(); - ACL_CHECK(aclnnInplaceCopy(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclnnInplaceCopy(workspaceAddr, workspaceSize, executor, + ctx.stream())); } void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + // Support F16/F32/Q8_0 dtype, and Not support situation of src and dst have + // different shape and dst is non-contiguous or src is non-contiguous in + // first dim. 
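The comment above states the contract of ggml_cann_dup, and the hunks that follow hoist the same-shape fast path out of the per-type branches. A condensed view of the resulting dispatch order, as a reading aid only (not part of the patch):

// 1. dst is F16/F32 and src/dst share the same shape
//    -> plain device copy via cann_copy().
// 2. dst is Q8_0 (src F16 or F32)
//    -> AscendC quantize kernels (aclrtlaunch_ascendc_quantize_f16_q8_0 /
//       aclrtlaunch_ascendc_quantize_f32_q8_0).
// 3. dst is F16/F32 and contiguous, src contiguous in its first dim
//    -> row-wise dup kernels (aclrtlaunch_ascendc_dup_by_rows_*).
// 4. Remaining combinations either fall back to cann_copy() when the shapes
//    match (final else branch) or trip GGML_ASSERT(false).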
+ ggml_tensor* src = dst->src[0]; aclTensor* acl_src = create_acl_tensor(src); @@ -793,26 +880,29 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { src->extra = src_extra_allocator.get(); dst->extra = dst_extra_allocator.get(); ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream())); + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, + ctx.stream())); ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream())); + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, + ctx.stream())); + if ((dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32) && + ggml_are_same_shape(src, dst)) { + cann_copy(ctx, acl_src, acl_dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + return; + } // TODO: simplefify - if (src->type==GGML_TYPE_F16) { - if (dst->type==GGML_TYPE_Q8_0) { + if (src->type == GGML_TYPE_F16) { + if (dst->type == GGML_TYPE_Q8_0) { aclrtlaunch_ascendc_quantize_f16_q8_0( 24, ctx.stream(), src->data, dst->data, ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, ((ggml_tensor*)dst->extra)->ne); return; } - if (dst->type==GGML_TYPE_F16) { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } + if (dst->type == GGML_TYPE_F16) { if (ggml_is_contiguous(dst)) { const size_t src_type_size = ggml_type_size(src->type); if (src->nb[0] == src_type_size) { @@ -820,112 +910,84 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { int64_t rows_num = ggml_nrows(src); aclrtlaunch_ascendc_dup_by_rows_fp16( - rows_num, ctx.stream(), - src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, - ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); + rows_num, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, + ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); return; } GGML_ASSERT(false); } GGML_ASSERT(false); } - if (dst->type==GGML_TYPE_F32) { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } + if (dst->type == GGML_TYPE_F32) { if (ggml_is_contiguous(dst)) { const size_t src_type_size = ggml_type_size(src->type); if (src->nb[0] == src_type_size) { // src0 is contigous on first dimension, copy by rows int64_t rows_num = ggml_nrows(src); aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32( - rows_num, ctx.stream(), - src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, - ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); + rows_num, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, + ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); return; } GGML_ASSERT(false); } GGML_ASSERT(false); } - // TODO + // TODO: other dtype. 
GGML_ASSERT(false); - } - else if (src->type==GGML_TYPE_F32) { - //TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size - // && nb0 == type_size) - if (dst->type==GGML_TYPE_Q8_0) { + } else if (src->type == GGML_TYPE_F32) { + if (dst->type == GGML_TYPE_Q8_0) { aclrtlaunch_ascendc_quantize_f32_q8_0( 24, ctx.stream(), src->data, dst->data, ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, ((ggml_tensor*)dst->extra)->ne); return; } - if (dst->type==GGML_TYPE_F32) { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } + if (dst->type == GGML_TYPE_F32) { if (ggml_is_contiguous(dst)) { const size_t src_type_size = ggml_type_size(src->type); if (src->nb[0] == src_type_size) { // src0 is contigous on first dimension, copy by rows int64_t rows_num = ggml_nrows(src); aclrtlaunch_ascendc_dup_by_rows_fp32( - rows_num, ctx.stream(), - src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, - ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); + rows_num, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, + ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); return; } GGML_ASSERT(false); - } - else { - //TODO: dst not contiguous + } else { + // TODO: dst not contiguous GGML_ASSERT(false); } } - if (dst->type==GGML_TYPE_F16) { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } + if (dst->type == GGML_TYPE_F16) { if (ggml_is_contiguous(dst)) { const size_t src_type_size = ggml_type_size(src->type); if (src->nb[0] == src_type_size) { // src0 is contigous on first dimension, copy by rows int64_t rows_num = ggml_nrows(src); aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16( - rows_num, ctx.stream(), - src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, - ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); + rows_num, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, + ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); return; } GGML_ASSERT(false); } } - // TODO GGML_ASSERT(false); - } - else { + } else { if (ggml_are_same_shape(src, dst)) { cann_copy(ctx, acl_src, acl_dst); ACL_CHECK(aclDestroyTensor(acl_src)); @@ -951,9 +1013,25 @@ aclnnStatus aclnnRmsNorm(void* workspace, uint64_t workspaceSize, } #endif -static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer, size_t n_bytes, - int64_t* ne, int64_t dims, aclDataType type, - size_t type_size) { +/** + * @brief Creates an ACL tensor initialized with zeros using a provided buffer. + * + * This function initializes a tensor with zeros using the specified buffer and + * tensor parameters. + * + * @param ctx The context for the CANN backend operations. + * @param buffer The buffer to be used for the tensor data. + * @param n_bytes The size of the buffer in bytes. + * @param ne An array specifying the extents (sizes) of each dimension of the + * tensor. + * @param dims The number of dimensions of the tensor. + * @param type The data type of the tensor. + * @param type_size The size of each element in the tensor data type. + * @return An ACL tensor initialized with zeros. 
+ */ +static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer, + size_t n_bytes, int64_t* ne, int64_t dims, + aclDataType type, size_t type_size) { size_t nb[GGML_MAX_DIMS]; nb[0] = type_size; for (int i = 1; i < dims; i++) { @@ -965,10 +1043,30 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer, size_ return zero; } -static aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, void* buffer, size_t n_bytes, - int64_t* ne, int64_t dims, aclDataType type, - size_t type_size, float value = 1.0f) { - aclTensor* acl_tensor = aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size); +/** + * @brief Creates an ACL tensor initialized with ones using a provided buffer. + * + * This function initializes a tensor with ones using the specified buffer and + * tensor parameters. + * + * @param ctx The context for the CANN backend operations. + * @param buffer The buffer to be used for the tensor data. + * @param n_bytes The size of the buffer in bytes. + * @param ne An array specifying the extents (sizes) of each dimension of the + * tensor. + * @param dims The number of dimensions of the tensor. + * @param type The data type of the tensor. + * @param type_size The size of each element in the tensor data type. + * @param value The value to be used for initializing the tensor (default + * is 1.0). + * @return An ACL tensor initialized with ones. + */ +static aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, void* buffer, + size_t n_bytes, int64_t* ne, int64_t dims, + aclDataType type, size_t type_size, + float value = 1.0f) { + aclTensor* acl_tensor = + aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size); float alpha_host = 1.0f; aclScalar* alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT); aclScalar* other = aclCreateScalar(&value, aclDataType::ACL_FLOAT); @@ -1008,14 +1106,16 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src); ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes); - aclTensor* acl_gamma = aclnn_ones( - ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1, type_mapping(src->type), ggml_element_size(src)); + aclTensor* acl_gamma = + aclnn_ones(ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, + 1, type_mapping(src->type), ggml_element_size(src)); - size_t zero_tensor_n_bytes = src->ne[1] * src->ne[2] * src->ne[3] * ggml_element_size(src); + size_t zero_tensor_n_bytes = + src->ne[1] * src->ne[2] * src->ne[3] * ggml_element_size(src); ggml_cann_pool_alloc zero_tensor_allocator(ctx.pool(), zero_tensor_n_bytes); - aclTensor* acl_rstd = - aclnn_zero(ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes, src->ne, GGML_MAX_DIMS, type_mapping(src->type), - ggml_element_size(src)); + aclTensor* acl_rstd = aclnn_zero( + ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes, src->ne, + GGML_MAX_DIMS, type_mapping(src->type), ggml_element_size(src)); ACL_CHECK(aclnnRmsNormGetWorkspaceSize( acl_src, acl_gamma, eps, acl_dst, acl_rstd, &workspaceSize, &executor)); @@ -1044,12 +1144,13 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, const int n_past = ((int32_t*)dst->op_params)[0]; - size_t one_tensor_n_bytes = src->ne[0] * src->ne[1] * src->ne[2] *src->ne[3] * ggml_element_size(src); + size_t one_tensor_n_bytes = src->ne[0] * src->ne[1] * src->ne[2] * + src->ne[3] * ggml_element_size(src); ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes); - 
aclTensor* mask_tensor = - aclnn_ones(ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, GGML_MAX_DIMS, type_mapping(src->type), - ggml_element_size(src), value); + aclTensor* mask_tensor = aclnn_ones( + ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, + GGML_MAX_DIMS, type_mapping(src->type), ggml_element_size(src), value); uint64_t workspaceSize = 0; aclOpExecutor* executor; @@ -1093,12 +1194,24 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, ACL_CHECK(aclDestroyTensor(acl_dst)); } +/** + * @brief Casts the data type of a source tensor to a destination tensor. + * + * This function casts the data type of the source tensor `acl_src` to the + * specified data type `cast_data_type` and stores the result in the destination + * tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor whose data type will be casted. + * @param acl_dst The destination tensor where the casted result will be stored. + * @param cast_data_type The target data type to which the source tensor will be + * casted. + */ static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst, aclDataType cast_data_type) { + aclTensor* acl_dst, aclDataType cast_data_type) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; - aclrtStream stream = ctx.stream(); ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, cast_data_type, acl_dst, &workspaceSize, &executor)); @@ -1107,11 +1220,26 @@ static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src, workspaceAddr = workspace_allocator.get(); } - ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream())); } +/** + * @brief Permutes the dimensions of a tensor according to a specified order. + * + * This function permutes the dimensions of the source tensor `acl_src` + * according to the order specified in the `new_dim` array and stores the result + * in the destination tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor whose dimensions will be permuted. + * @param acl_dst The destination tensor where the permuted result will be + * stored. + * @param new_dim An array specifying the new order of dimensions for the + * tensor. + * @param dims The number of dimensions in the tensor. + */ static void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst, int64_t* new_dim, uint64_t dims) { + aclTensor* acl_dst, int64_t* new_dim, uint64_t dims) { aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims); uint64_t workspaceSize = 0; @@ -1176,7 +1304,7 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); - // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH] + // im2col: [N, C, H, W] -> [N, IC*KH*KW, OW*OH] aclTensor* acl_src1 = create_acl_tensor(src1); int64_t tmp_im2col_ne[] = {OW * OH, IC * KH * KW, N}; size_t tmp_im2col_nb[GGML_MAX_DIMS - 1]; @@ -1189,7 +1317,8 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { // Calculate im2col. // If dst is f16, tmp_buffer is f32, we need alloc src.typesize * // dst.elemcount. 
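A quick sanity check on the scratch size chosen just below, with hypothetical shapes that are not taken from this patch: the im2col result is always materialised as f32 first, one element per destination element, which is why the allocation is ggml_nelements(dst) * ggml_element_size(src1) rather than ggml_nbytes(dst).

// Assumed example shapes (illustration only):
//   src1 (input, f32):  N = 1, IC = 3, IH = IW = 32
//   kernel:             KH = KW = 3, stride 1, padding 1  ->  OH = OW = 32
// dst then holds N * (OH*OW) * (IC*KH*KW) = 1 * 1024 * 27 = 27648 elements.
// The f32 scratch below therefore needs
//   27648 * sizeof(float) = 110592 bytes,
// even when dst itself is f16 and only occupies 55296 bytes.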
- ggml_cann_pool_alloc im2col_allocator(ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1)); + ggml_cann_pool_alloc im2col_allocator( + ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1)); void* tmp_im2col_buffer = im2col_allocator.get(); aclTensor* tmp_im2col_tensor = create_acl_tensor( tmp_im2col_buffer, type_mapping(src1->type), ggml_type_size(src1->type), @@ -1207,7 +1336,6 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; - aclrtStream stream = ctx.stream(); ACL_CHECK(aclnnIm2colGetWorkspaceSize(acl_src1, kernel_size, dilations, paddings, strides, tmp_im2col_tensor, @@ -1218,7 +1346,7 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { workspaceAddr = workspace_allocator.get(); } - ACL_CHECK(aclnnIm2col(workspaceAddr, workspaceSize, executor, stream)); + ACL_CHECK(aclnnIm2col(workspaceAddr, workspaceSize, executor, ctx.stream())); // Cast if dst is f16. aclTensor* tmp_cast_tensor = nullptr; @@ -1239,7 +1367,7 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { type_mapping(dst->type)); } - // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW] + // Permute: [N, IC*KH*KW, OW*OH] -> [N, OW*OH, IC*KH*KW] int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]}; size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]}; aclTensor* acl_dst = @@ -1263,6 +1391,15 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclDestroyIntArray(strides)); } +/** + * @brief Applies element-wise exponential function to the elements of a tensor. + * + * This function computes the exponential of each element in the source tensor + * `acl_src` and stores the result back into the same tensor. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The tensor on which the exponential function will be applied. + */ static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) { uint64_t workspaceSize = 0; aclOpExecutor* executor; @@ -1279,6 +1416,23 @@ static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) { aclnnInplaceExp(workspaceAddr, workspaceSize, executor, ctx.stream())); } +/** + * @brief Multiplies elements of a tensor by a scalar value, optionally + * in-place. + * + * This function multiplies each element of the source tensor `acl_src` by the + * scalar `scale` and stores the result in the destination tensor `acl_dst`. If + * `inplace` is true, the operation is performed in-place on `acl_src`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor whose elements will be multiplied. + * @param scale The scalar value by which each element of `acl_src` will be + * multiplied. + * @param acl_dst The destination tensor where the result will be stored if + * `inplace` is false. + * @param inplace Flag indicating whether to perform the operation in-place on + * `acl_src`. 
+ */ static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src, float scale, aclTensor* acl_dst, bool inplace) { aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT); @@ -1297,8 +1451,7 @@ static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src, ACL_CHECK(aclnnInplaceMuls(workspaceAddr, workspaceSize, executor, ctx.stream())); - } - else { + } else { ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, acl_scale, acl_dst, &workspaceSize, &executor)); if (workspaceSize > 0) { @@ -1306,16 +1459,26 @@ static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src, workspaceAddr = workspace_allocator.get(); } - ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, - ctx.stream())); + ACL_CHECK( + aclnnMuls(workspaceAddr, workspaceSize, executor, ctx.stream())); } - ACL_CHECK(aclDestroyScalar(acl_scale)); } -static void aclnn_inplace_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_other) { +/** + * @brief Performs an in-place element-wise multiplication of two tensors. + * + * This function performs an element-wise multiplication of the tensors + * `acl_src` and `acl_other` and stores the result in `acl_src`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor where the multiplication result will be + * stored. + * @param acl_other The tensor whose elements will be multiplied with `acl_src`. + */ +static void aclnn_inplace_mul(ggml_backend_cann_context& ctx, + aclTensor* acl_src, aclTensor* acl_other) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -1331,8 +1494,21 @@ static void aclnn_inplace_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src aclnnInplaceMul(workspaceAddr, workspaceSize, executor, ctx.stream())); } -static void aclnn_noinplcace_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_other, aclTensor* acl_dst) { +/** + * @brief Performs element-wise multiplication of two tensors and stores the + * result in a destination tensor. + * + * This function performs element-wise multiplication of the tensors `acl_src` + * and `acl_other` and stores the result in the destination tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The first tensor for element-wise multiplication. + * @param acl_other The second tensor for element-wise multiplication. + * @param acl_dst The destination tensor where the result will be stored. + */ +static void aclnn_mul(ggml_backend_cann_context& ctx, + aclTensor* acl_src, aclTensor* acl_other, + aclTensor* acl_dst) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -1347,8 +1523,18 @@ static void aclnn_noinplcace_mul(ggml_backend_cann_context& ctx, aclTensor* acl_ ACL_CHECK(aclnnMul(workspaceAddr, workspaceSize, executor, ctx.stream())); } +/** + * @brief Applies element-wise cosine function to the elements of a tensor. + * + * This function computes the cosine of each element in the source tensor `acl_src` + * and stores the result in the destination tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor on which the cosine function will be applied. + * @param acl_dst The destination tensor where the cosine results will be stored. 
+ */ static void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst) { + aclTensor* acl_dst) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -1363,8 +1549,18 @@ static void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src, ACL_CHECK(aclnnCos(workspaceAddr, workspaceSize, executor, ctx.stream())); } +/** + * @brief Applies element-wise sine function to the elements of a tensor. + * + * This function computes the sine of each element in the source tensor `acl_src` + * and stores the result in the destination tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor on which the sine function will be applied. + * @param acl_dst The destination tensor where the sine results will be stored. + */ static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst) { + aclTensor* acl_dst) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -1422,7 +1618,7 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1]; } - ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src)); + ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src)); void* tmp_permute_buffer = permute_allocator.get(); aclTensor* tmp_permute_tenosr = create_acl_tensor( tmp_permute_buffer, type_mapping(src->type), ggml_type_size(src->type), @@ -1443,16 +1639,18 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, int mul_nelements = src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3]; - ggml_cann_pool_alloc mul_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type)); + ggml_cann_pool_alloc mul_allocator( + ctx.pool(), mul_nelements * ggml_type_size(src->type)); void* tmp_mul_buffer = mul_allocator.get(); aclTensor* tmp_mul_tensor = create_acl_tensor( tmp_mul_buffer, type_mapping(src->type), ggml_type_size(src->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); - aclnn_noinplcace_mul(ctx, tmp_permute_tenosr, tmp_arange_tensor, + aclnn_mul(ctx, tmp_permute_tenosr, tmp_arange_tensor, tmp_mul_tensor); // cos - ggml_cann_pool_alloc cos_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type)); + ggml_cann_pool_alloc cos_allocator( + ctx.pool(), mul_nelements * ggml_type_size(src->type)); void* tmp_cos_buffer = cos_allocator.get(); aclTensor* tmp_cos_tensor = create_acl_tensor( tmp_cos_buffer, type_mapping(dst->type), ggml_type_size(dst->type), @@ -1461,7 +1659,8 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, aclnn_cos(ctx, tmp_mul_tensor, tmp_cos_tensor); // sin - ggml_cann_pool_alloc sin_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type)); + ggml_cann_pool_alloc sin_allocator( + ctx.pool(), mul_nelements * ggml_type_size(src->type)); void* tmp_sin_buffer = sin_allocator.get(); aclTensor* tmp_sin_tensor = create_acl_tensor( tmp_sin_buffer, type_mapping(dst->type), ggml_type_size(dst->type), @@ -1486,10 +1685,18 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ACL_CHECK(aclDestroyTensor(acl_dst)); } +/** + * @brief Fills a tensor with a scalar value. + * + * This function fills the destination tensor `acl_dst` with the scalar value + * `scalar`. + * + * @param ctx The context for the CANN backend operations. + * @param scalar The scalar value used to fill the tensor. 
+ * @param acl_dst The destination tensor to be filled with the scalar value. + */ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, - aclTensor* acl_dst) { - // fill acl_dst with scalar value. - + aclTensor* acl_dst) { auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT); uint64_t workspaceSize = 0; @@ -1508,10 +1715,20 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, ACL_CHECK(aclDestroyScalar(acl_scalar)); } +/** + * @brief Raises each element of a tensor to the power of the corresponding + * element in another tensor. + * + * This function computes the element-wise power of the destination tensor + * `acl_dst` raised to the power of the exponent tensor `acl_exp`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_dst The destination tensor, which also serves as the base tensor. + * @param acl_exp The exponent tensor, each element of which is used to raise + * the corresponding element in the destination tensor. + */ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx, aclTensor* acl_dst, aclTensor* acl_exp) { - // acl_dst = acl_dst^acl_exp - uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -1527,10 +1744,37 @@ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx, executor, ctx.stream())); } +/** + * @brief Applies the Alibi (Attention with Linear Biases) mechanism to the + * input tensor. + * + * @details This function implements the Alibi mechanism, which introduces + * learnable biases into the attention scores to simulate relative + * position encoding without the need for explicit positional + * embeddings. + * + * @param ctx The backend CANN context for executing operations. + * @param acl_src The source tensor representing the query or key. + * @param acl_position The position tensor containing relative positions. + * @param acl_dst The destination tensor where the result will be stored. + * @param n_head The number of attention heads. + * @param src_ne The dimensions of the source tensor. + * @param src_nb0 The byte size of the first dimension of the source tensor. + * @param max_bias The maximum bias value used in the Alibi mechanism. + * @param dst The destination tensor object for additional metadata. + * + * The function performs the following steps: + * 1. Calculates the logarithm floor of the number of heads to determine the base for bias calculation. + * 2. Initializes arrays with arithmetic sequences and fills them with bias values. + * 3. Computes the bias tensor based on the calculated biases and arithmetic sequences. + * 4. Reshapes the bias tensor to match the dimensions of the input tensors. + * 5. Multiplies the position tensor by the bias tensor. + * 6. Adds the result of the multiplication to the source tensor to produce the final output. 
+ */ static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_position, aclTensor* acl_dst, const int n_head, - int64_t* src_ne, const size_t src_nb0, float max_bias, - ggml_tensor* dst) { + aclTensor* acl_position, aclTensor* acl_dst, + const int n_head, int64_t* src_ne, const size_t src_nb0, + float max_bias, ggml_tensor* dst) { GGML_UNUSED(src_ne[1]); const int64_t ne2_ne3 = src_ne[2] * src_ne[3]; GGML_ASSERT(src_nb0 == sizeof(float)); @@ -1542,8 +1786,9 @@ static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src, float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); // init arange - ggml_cann_pool_alloc arange_allocator(ctx.pool(), ne2_ne3 * ggml_type_size(dst->type)); - void* tmp_arange_buffer = arange_allocator.get(); + ggml_cann_pool_alloc arange_allocator(ctx.pool(), + ne2_ne3 * ggml_type_size(dst->type)); + void* arange_buffer = arange_allocator.get(); // arange1: [1, ..., n_heads_log2_floor+1) float start = 1; @@ -1551,109 +1796,130 @@ static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src, float step = 1; int64_t n_elements_arange = n_heads_log2_floor; - int64_t tmp_arange1_ne[] = {n_heads_log2_floor}; - size_t tmp_arange1_nb[] = {sizeof(dst->type)}; - aclTensor* tmp_arange1_tensor = create_acl_tensor( - tmp_arange_buffer, type_mapping(dst->type), ggml_type_size(dst->type), - tmp_arange1_ne, tmp_arange1_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + int64_t arange_ne[] = {n_heads_log2_floor}; + size_t arange_nb[] = {sizeof(dst->type)}; + aclTensor* arange_tensor = create_acl_tensor( + arange_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + arange_ne, arange_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); - aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange); + aclnn_arange(ctx, arange_tensor, start, stop, step, n_elements_arange); - aclTensor* tmp_arange2_tensor = nullptr; + // if n_heads_log2_floor smaller than ne2_ne3, need arange_tail_tensor + // which is [1, ..., 2 * (k - n_heads_log2_floor) + 1)]. 
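To make the constants above concrete, a worked example with assumed values that are not taken from the patch: n_head = 12, max_bias = 8.0f, and a single sequence so ne2_ne3 == 12.

// n_heads_log2_floor = 1 << floor(log2(12)) = 8
// m0 = 2^(-8.0 / 8)       = 0.5
// m1 = 2^(-(8.0/2) / 8)   = 2^-0.5 ≈ 0.7071
// arange head (this tensor): [1, 2, ..., 8]
// arange tail (below):       [1, 3, 5, 7]        (start 1, stop 9, step 2)
// resulting ALiBi slopes:    m0^1 ... m0^8 for the first 8 heads, then
//                            m1^1, m1^3, m1^5, m1^7 ≈ 0.707, 0.354, 0.177,
//                            0.088 for the remaining 4 heads.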
+ aclTensor* arange_tail_tensor = nullptr; if (n_heads_log2_floor < ne2_ne3) { - // arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1) start = 1; stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1; step = 2; n_elements_arange = ne2_ne3 - n_heads_log2_floor; - int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor}; - size_t tmp_arange2_nb[] = {sizeof(dst->type)}; - - aclTensor* tmp_arange2_tensor = create_acl_tensor( - (char*)tmp_arange_buffer + n_heads_log2_floor * ggml_type_size(dst->type), - type_mapping(dst->type), ggml_type_size(dst->type), tmp_arange2_ne, - tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); - aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step, + int64_t arange_tail_ne[] = {ne2_ne3 - n_heads_log2_floor}; + size_t arange_tail_nb[] = {sizeof(dst->type)}; + + aclTensor* arange_tail_tensor = create_acl_tensor( + (char*)arange_buffer + + n_heads_log2_floor * ggml_type_size(dst->type), + type_mapping(dst->type), ggml_type_size(dst->type), arange_tail_ne, + arange_tail_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + aclnn_arange(ctx, arange_tail_tensor, start, stop, step, n_elements_arange); } - // init mk_base - ggml_cann_pool_alloc mk_base_allocator(ctx.pool(), ne2_ne3 * ggml_type_size(dst->type)); - void* tmp_mk_base_buffer = mk_base_allocator.get(); - int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor}; - size_t tmp_mk_base1_nb[] = {sizeof(dst->type)}; - aclTensor* tmp_mk_base1_tensor = create_acl_tensor( - tmp_mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type), - tmp_mk_base1_ne, tmp_mk_base1_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + // init mk_base, tensor with n_heads_log2_floor length and set value with m0. + ggml_cann_pool_alloc mk_base_allocator(ctx.pool(), + ne2_ne3 * ggml_type_size(dst->type)); + void* mk_base_buffer = mk_base_allocator.get(); + int64_t mk_base_ne[] = {n_heads_log2_floor}; + size_t mk_base_nb[] = {sizeof(dst->type)}; + aclTensor* mk_base_tensor = create_acl_tensor( + mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + mk_base_ne, mk_base_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); - aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor); + aclnn_fill_scalar(ctx, m0, mk_base_tensor); - aclTensor* tmp_mk_base2_tensor = nullptr; + // if n_heads_log2_floor smaller than ne2_ne3, need mk_base_tail which has + // ne2_ne3 - n_heads_log2_floor and set value with m1. 
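With both halves of arange and mk_base filled in, the remainder of the function reduces to three tensor ops; the sketch below is only a summary of the calls used further down, not new code:

// slopes = mk_base ^ arange                       // aclnn_pow_tensor_tensor (in place)
// slopes = reshape(slopes, {1, 1, src_ne[2], src_ne[3]})
// bias   = position * slopes                      // aclnn_mul (broadcast multiply)
// dst    = bias + src                             // aclnn_add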
+ aclTensor* mk_base_tail_tensor = nullptr; if (n_heads_log2_floor < ne2_ne3) { - int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor}; - size_t tmp_mk_base2_nb[] = {sizeof(dst->type)}; - aclTensor* tmp_mk_base2_tensor = create_acl_tensor( - (char*)tmp_mk_base_buffer + n_heads_log2_floor * ggml_type_size(dst->type), - type_mapping(dst->type), ggml_type_size(dst->type), tmp_mk_base2_ne, - tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); - aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor); - } - - // init mk - int64_t tmp_mk_base_ne[] = {ne2_ne3}; - size_t tmp_mk_base_nb[] = {sizeof(dst->type)}; - aclTensor* tmp_mk_base_tensor = create_acl_tensor( - tmp_mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type), - tmp_mk_base_ne, tmp_mk_base_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); - aclTensor* tmp_arange_tensor = create_acl_tensor( - tmp_arange_buffer, type_mapping(dst->type), ggml_type_size(dst->type), - tmp_mk_base_ne, tmp_mk_base_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); - aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor); + int64_t mk_base_tail_ne[] = {ne2_ne3 - n_heads_log2_floor}; + size_t mk_base_tail_nb[] = {sizeof(dst->type)}; + aclTensor* mk_base_tail_tensor = create_acl_tensor( + (char*)mk_base_buffer + + n_heads_log2_floor * ggml_type_size(dst->type), + type_mapping(dst->type), ggml_type_size(dst->type), mk_base_tail_ne, + mk_base_tail_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + aclnn_fill_scalar(ctx, m1, mk_base_tail_tensor); + } + + // init mk, mk_base_with_tail_tensor=mk_base_with_tail_tensor^arange_tensor + int64_t mk_base_with_tail_ne[] = {ne2_ne3}; + size_t mk_base_with_tail_nb[] = {sizeof(dst->type)}; + aclTensor* mk_base_with_tail_tensor = create_acl_tensor( + mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + mk_base_with_tail_ne, mk_base_with_tail_nb, GGML_MAX_DIMS - 3, + ACL_FORMAT_ND); + aclTensor* arange_with_tail_tensor = create_acl_tensor( + arange_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + mk_base_with_tail_ne, mk_base_with_tail_nb, GGML_MAX_DIMS - 3, + ACL_FORMAT_ND); + aclnn_pow_tensor_tensor(ctx, mk_base_with_tail_tensor, + arange_with_tail_tensor); // reshape mk - int64_t tmp_mk_ne[] = {1, 1, src_ne[2], src_ne[3]}; - size_t tmp_mk_nb[GGML_MAX_DIMS]; - tmp_mk_nb[0] = ggml_type_size(dst->type); + int64_t mk_ne[] = {1, 1, src_ne[2], src_ne[3]}; + size_t mk_nb[GGML_MAX_DIMS]; + mk_nb[0] = ggml_type_size(dst->type); for (int i = 1; i < GGML_MAX_DIMS; i++) { - tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1]; + mk_nb[i] = mk_nb[i - 1] * mk_ne[i - 1]; } - aclTensor* tmp_mk_tensor = create_acl_tensor( - tmp_mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type), - tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + aclTensor* mk_tensor = create_acl_tensor( + mk_base_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + mk_ne, mk_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); // acl_position * mk - int64_t tmp_output_ne[] = {src_ne[0], src_ne[1], src_ne[2], src_ne[3]}; - size_t tmp_output_nb[GGML_MAX_DIMS]; - tmp_output_nb[0] = ggml_type_size(dst->type); + int64_t output_ne[] = {src_ne[0], src_ne[1], src_ne[2], src_ne[3]}; + size_t output_nb[GGML_MAX_DIMS]; + output_nb[0] = ggml_type_size(dst->type); for (int i = 1; i < GGML_MAX_DIMS; i++) { - tmp_output_nb[i] = tmp_output_nb[i - 1] * tmp_output_ne[i - 1]; + output_nb[i] = output_nb[i - 1] * output_ne[i - 1]; } ggml_cann_pool_alloc output_allocator(ctx.pool(), ggml_nbytes(dst)); - void* tmp_output_buffer = output_allocator.get(); - aclTensor* 
tmp_output_tensor = create_acl_tensor( - tmp_output_buffer, type_mapping(dst->type), ggml_type_size(dst->type), - tmp_output_ne, tmp_output_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); - aclnn_noinplcace_mul(ctx, acl_position, tmp_mk_tensor, tmp_output_tensor); + void* output_buffer = output_allocator.get(); + aclTensor* output_tensor = create_acl_tensor( + output_buffer, type_mapping(dst->type), ggml_type_size(dst->type), + output_ne, output_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + aclnn_mul(ctx, acl_position, mk_tensor, output_tensor); // add - aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst); - - ACL_CHECK(aclDestroyTensor(tmp_arange1_tensor)); - ACL_CHECK(aclDestroyTensor(tmp_arange2_tensor)); - ACL_CHECK(aclDestroyTensor(tmp_mk_base1_tensor)); - ACL_CHECK(aclDestroyTensor(tmp_mk_base2_tensor)); - ACL_CHECK(aclDestroyTensor(tmp_mk_base_tensor)); - ACL_CHECK(aclDestroyTensor(tmp_arange_tensor)); - ACL_CHECK(aclDestroyTensor(tmp_mk_tensor)); - ACL_CHECK(aclDestroyTensor(tmp_output_tensor)); + aclnn_add(ctx, output_tensor, acl_src, acl_dst); + + ACL_CHECK(aclDestroyTensor(arange_tensor)); + ACL_CHECK(aclDestroyTensor(arange_tail_tensor)); + ACL_CHECK(aclDestroyTensor(mk_base_tensor)); + ACL_CHECK(aclDestroyTensor(mk_base_tail_tensor)); + ACL_CHECK(aclDestroyTensor(mk_base_with_tail_tensor)); + ACL_CHECK(aclDestroyTensor(arange_with_tail_tensor)); + ACL_CHECK(aclDestroyTensor(mk_tensor)); + ACL_CHECK(aclDestroyTensor(output_tensor)); } void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_dup(ctx, dst); } -static void aclnn_inplace_add(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst) { +/** + * @brief Performs element-wise addition of two tensors in place. + * + * This function adds the source tensor `acl_src` to the destination tensor + * `acl_dst` element-wise and stores the result in the destination tensor + * `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor to be added. + * @param acl_dst The destination tensor which will hold the result of the + * addition. + */ +static void aclnn_inplace_add(ggml_backend_cann_context& ctx, + aclTensor* acl_src, aclTensor* acl_dst) { aclScalar* alpha = nullptr; float alphaValue = 1.0f; alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); @@ -1669,16 +1935,28 @@ static void aclnn_inplace_add(ggml_backend_cann_context& ctx, aclTensor* acl_src workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); ACL_CHECK( - aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, main_stream)); + aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyScalar(alpha)); } +/** + * @brief Applies the softmax function to a tensor along a specified dimension. + * + * This function computes the softmax of the source tensor `acl_src` along the + * specified dimension `dim` and stores the result in the destination tensor + * `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor on which the softmax function will be + * applied. + * @param dim The dimension along which the softmax function will be computed. + * @param acl_dst The destination tensor where the softmax results will be + * stored. 
+ */ static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src, int64_t dim, aclTensor* acl_dst) { - uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -1715,11 +1993,8 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_pool_alloc mul_scale_allocator(ctx.pool(), n_bytes); void* input_mul_scale_buffer = mul_scale_allocator.get(); aclTensor* acl_input_mul_scale_tensor = create_acl_tensor( - input_mul_scale_buffer, - ACL_FLOAT, - ggml_type_size(src0->type), - src0->ne, src0->nb, - GGML_MAX_DIMS); + input_mul_scale_buffer, ACL_FLOAT, ggml_type_size(src0->type), src0->ne, + src0->nb, GGML_MAX_DIMS); bool inplace = false; aclnn_muls(ctx, acl_src0, scale, acl_input_mul_scale_tensor, inplace); @@ -1740,18 +2015,14 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } src1_fp32_allocator.alloc(n_bytes); void* src1_fp32_buffer = src1_fp32_allocator.get(); - acl_src1_fp32_tensor = create_acl_tensor(src1_fp32_buffer, - ACL_FLOAT, - sizeof(float), - src1->ne, - src1_fp32_nb, - GGML_MAX_DIMS); + acl_src1_fp32_tensor = + create_acl_tensor(src1_fp32_buffer, ACL_FLOAT, sizeof(float), + src1->ne, src1_fp32_nb, GGML_MAX_DIMS); aclTensor* acl_src1 = create_acl_tensor(src1); aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT); ACL_CHECK(aclDestroyTensor(acl_src1)); - } - else { + } else { acl_src1_fp32_tensor = create_acl_tensor(src1); } @@ -1765,8 +2036,8 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { tmp_mask_nb[i] = tmp_mask_nb[i - 1] * tmp_mask_ne[i - 1]; } tmp_mask_tensor = create_acl_tensor( - src1->data, ACL_FLOAT, sizeof(float), - tmp_mask_ne, tmp_mask_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + src1->data, ACL_FLOAT, sizeof(float), tmp_mask_ne, tmp_mask_nb, + GGML_MAX_DIMS, ACL_FORMAT_ND); } // alibi @@ -1777,41 +2048,34 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_pool_alloc output_allocator(ctx.pool(), n_bytes); void* output_buffer = output_allocator.get(); aclTensor* alibi_output_tensor = create_acl_tensor( - output_buffer, - ACL_FLOAT, - ggml_type_size(dst->type), - dst->ne, dst->nb, - GGML_MAX_DIMS); - if (max_bias <=0.0f) { + output_buffer, ACL_FLOAT, ggml_type_size(dst->type), dst->ne, + dst->nb, GGML_MAX_DIMS); + if (max_bias <= 0.0f) { // slope = 1.0 if (tmp_mask_tensor) { aclnn_add(ctx, tmp_mask_tensor, acl_input_mul_scale_tensor, - alibi_output_tensor); - } - else { + alibi_output_tensor); + } else { aclnn_add(ctx, acl_src1_fp32_tensor, acl_input_mul_scale_tensor, - alibi_output_tensor); + alibi_output_tensor); } - } - else { + } else { // slope != 1.0 if (tmp_mask_tensor) { aclnn_alibi(ctx, acl_input_mul_scale_tensor, tmp_mask_tensor, - alibi_output_tensor, n_head, src0->ne, src_nb0, max_bias, - dst); - } - else { - aclnn_alibi(ctx, acl_input_mul_scale_tensor, acl_src1_fp32_tensor, - alibi_output_tensor, n_head, src0->ne, src_nb0, max_bias, - dst); + alibi_output_tensor, n_head, src0->ne, src_nb0, + max_bias, dst); + } else { + aclnn_alibi(ctx, acl_input_mul_scale_tensor, + acl_src1_fp32_tensor, alibi_output_tensor, n_head, + src0->ne, src_nb0, max_bias, dst); } } // softmax aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst); ACL_CHECK(aclDestroyTensor(alibi_output_tensor)); - } - else { + } else { aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst); } @@ -1834,11 +2098,14 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { src1->extra = src1_extra_allocator.get(); 
dst->extra = dst_extra_allocator.get(); ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream())); + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, + ctx.stream())); ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream())); + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, + ctx.stream())); ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, ctx.stream())); + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, + ctx.stream())); switch (src0->type) { case GGML_TYPE_F32: @@ -1881,37 +2148,58 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } } -static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst, int64_t dim, int64_t repeats, - int64_t output_size) { - // each elem in acl_src will repeat. repeat number is `repeats`, repeats dim - // is `dim`. - +/** + * @brief Repeats elements of a tensor along a specified dimension. + * + * This function repeats each element of the source tensor `acl_src` a specified + * number of times (`repeats`) along the specified dimension `dim` and stores + * the result in the destination tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor whose elements will be repeated. + * @param acl_dst The destination tensor where the repeated elements will be + * stored. + * @param dim The dimension along which the elements will be repeated. + * @param repeats The number of times each element will be repeated. + * @param output_size The size of the output tensor. + */ +static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx, + aclTensor* acl_src, aclTensor* acl_dst, + int64_t dim, int64_t repeats, + int64_t output_size) { uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; - ACL_CHECK(aclnnRepeatInterleaveIntWithDimGetWorkspaceSize(acl_src, repeats, - dim, output_size, - acl_dst, - &workspaceSize, - &executor)); + ACL_CHECK(aclnnRepeatInterleaveIntWithDimGetWorkspaceSize( + acl_src, repeats, dim, output_size, acl_dst, &workspaceSize, + &executor)); if (workspaceSize > 0) { ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); - ACL_CHECK( - aclnnRepeatInterleaveIntWithDim(workspaceAddr, workspaceSize, executor, - main_stream)); - + ACL_CHECK(aclnnRepeatInterleaveIntWithDim(workspaceAddr, workspaceSize, + executor, ctx.stream())); } +/** + * @brief Performs matrix multiplication of two tensors. + * + * This function computes the matrix multiplication of the input tensor + * `acl_input` and the weight tensor `acl_weight`, and stores the result in the + * destination tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_input The input tensor for the matrix multiplication. + * @param acl_weight The weight tensor for the matrix multiplication. + * @param acl_dst The destination tensor where the result of the matrix + * multiplication will be stored. + */ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input, - aclTensor* acl_weight, aclTensor* acl_dst) { - int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is fp32, - // atlas a2 will transpose it to HFLOAT32. 
+ aclTensor* acl_weight, aclTensor* acl_dst) { + int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is + // fp32, atlas a2 will transpose it to HFLOAT32. uint64_t workspaceSize = 0; aclOpExecutor* executor; @@ -1926,28 +2214,42 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input, workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); - ACL_CHECK(aclnnMatmul(workspaceAddr, workspaceSize, executor, - main_stream)); + ACL_CHECK(aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream())); } -static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, ggml_tensor* dst) { +/** + * @brief Performs matrix multiplication with floating-point precision on + * tensors using the CANN backend. + * + * This function performs matrix multiplication of the input tensor and the + * weight tensor, handling broadcasting and transposing as needed, and stores + * the result in the destination tensor `dst`. + * + * @param ctx The context for the CANN backend operations. + * @param dst The destination tensor where the result of the matrix + * multiplication will be stored. + */ +static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, + ggml_tensor* dst) { ggml_tensor* weight = dst->src[0]; // weight - ggml_tensor* input = dst->src[1]; // input + ggml_tensor* input = dst->src[1]; // input - // when weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize will auto broadcast, - // when weight ne2 or ne3 is not 1, weight need repeat. + // when weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize will auto + // broadcast, when weight ne2 or ne3 is not 1, weight need repeat. BCAST_MUL_MAT_SHAPE(input, weight, dst); // transpose weight: [1,2,3,4] -> [1,2,4,3] int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0], - bcast_weight_ne[2], bcast_weight_ne[3], bcast_weight_ne[4], bcast_weight_ne[5]}; + bcast_weight_ne[2], bcast_weight_ne[3], + bcast_weight_ne[4], bcast_weight_ne[5]}; size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0], - bcast_weight_nb[2], bcast_weight_nb[3], bcast_weight_nb[4], bcast_weight_nb[5]}; - + bcast_weight_nb[2], bcast_weight_nb[3], + bcast_weight_nb[4], bcast_weight_nb[5]}; - aclTensor* acl_weight_tensor = create_acl_tensor(weight, transpose_ne, transpose_nb, bcast_dims); - aclTensor* acl_input_tensor = create_acl_tensor(input, BCAST_MUL_MAT_PARAM(input)); + aclTensor* acl_weight_tensor = + create_acl_tensor(weight, transpose_ne, transpose_nb, bcast_dims); + aclTensor* acl_input_tensor = + create_acl_tensor(input, BCAST_MUL_MAT_PARAM(input)); aclTensor* acl_dst = create_acl_tensor(dst, BCAST_MUL_MAT_PARAM(dst)); aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); @@ -1956,7 +2258,21 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, ggml_tensor* ds ACL_CHECK(aclDestroyTensor(acl_dst)); } -static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, ggml_tensor* dst) { +/** + * @brief Performs matrix multiplication with quantized weights and + * floating-point inputs using the CANN backend. + * + * This function performs matrix multiplication of the input tensor `src1` and + * the weight tensor `src0`, handling broadcasting, transposing, and + * quantization as needed, and stores the result in the destination tensor + * `dst`. + * + * @param ctx The context for the CANN backend operations. + * @param dst The destination tensor where the result of the matrix + * multiplication will be stored. 
+ */ +static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, + ggml_tensor* dst) { ggml_tensor* src0 = dst->src[0]; // weight ggml_tensor* src1 = dst->src[1]; // input @@ -1986,7 +2302,8 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, ggml_tensor* if (src1->type != GGML_TYPE_F16) { aclTensor* acl_src1_tensor = create_acl_tensor(src1); - ggml_cann_pool_alloc input_alloctor(ctx.pool(), ggml_nelements(src1) * input_elem_size); + ggml_cann_pool_alloc input_alloctor( + ctx.pool(), ggml_nelements(src1) * input_elem_size); input_buffer = input_alloctor.get(); int64_t* input_cast_ne = src1->ne; @@ -2010,7 +2327,8 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, ggml_tensor* size_t output_elem_size = sizeof(uint16_t); int64_t output_ne[] = {dst->ne[0], dst->ne[1]}; size_t output_nb[] = {output_elem_size, output_elem_size * dst->ne[0]}; - ggml_cann_pool_alloc output_alloctor(ctx.pool(), ggml_nelements(dst) * output_elem_size); + ggml_cann_pool_alloc output_alloctor( + ctx.pool(), ggml_nelements(dst) * output_elem_size); void* output_buffer = output_alloctor.get(); size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1]; @@ -2046,7 +2364,8 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx, ggml_tensor* &workspaceSize, &executor)); if (workspaceSize > 0 && workspaceAddr == nullptr) { - ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), + workspaceSize); workspaceAddr = workspace_allocator.get(); } @@ -2097,9 +2416,24 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } } +/** + * @brief Rolls the elements of a tensor along a specified dimension. + * + * This function rolls the elements of the source tensor `acl_src` by the + * specified shifts `shifts` along the specified dimensions `dims`, and stores + * the result in the destination tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor whose elements will be rolled. + * @param acl_dst The destination tensor where the rolled elements will be + * stored. + * @param shifts An array specifying the number of positions by which elements + * are shifted. + * @param dims An array specifying the dimensions along which elements are + * shifted. + */ static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst, int64_t* shifts, int64_t* dims) { - + aclTensor* acl_dst, int64_t* shifts, int64_t* dims) { aclIntArray* acl_shifts = aclCreateIntArray(shifts, 1); aclIntArray* acl_dims = aclCreateIntArray(dims, 1); @@ -2114,20 +2448,29 @@ static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src, workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); - ACL_CHECK( - aclnnRoll(workspaceAddr, workspaceSize, executor, main_stream)); + ACL_CHECK(aclnnRoll(workspaceAddr, workspaceSize, executor, ctx.stream())); ACL_CHECK(aclDestroyIntArray(acl_shifts)); ACL_CHECK(aclDestroyIntArray(acl_dims)); } +/** + * @brief Fills specified positions of a tensor with a scalar value. + * + * This function fills the positions in the source tensor `acl_src` specified by + * `index` along the dimension `dim` with the scalar value `value`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor where the positions will be filled. + * @param dim The dimension along which the positions are specified. 
+ * @param index An array specifying the positions to be filled. + * @param index_num The number of positions specified in the index array. + * @param value The scalar value used to fill the specified positions. + */ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx, aclTensor* acl_src, int64_t dim, int64_t* index, int64_t index_num, float value) { - // position in the @param.index along @param.dim will be filled with @param.value - aclIntArray* acl_index = aclCreateIntArray(index, index_num); aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT); @@ -2135,46 +2478,61 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx, aclOpExecutor* executor; void* workspaceAddr = nullptr; - ACL_CHECK(aclnnInplaceIndexFillTensorGetWorkspaceSize(acl_src, dim, - acl_index, acl_value, - &workspaceSize, - &executor)); + ACL_CHECK(aclnnInplaceIndexFillTensorGetWorkspaceSize( + acl_src, dim, acl_index, acl_value, &workspaceSize, &executor)); if (workspaceSize > 0) { ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); workspaceAddr = workspace_allocator.get(); } - aclrtStream main_stream = ctx.stream(); - ACL_CHECK( - aclnnInplaceIndexFillTensor(workspaceAddr, workspaceSize, executor, - main_stream)); + ACL_CHECK(aclnnInplaceIndexFillTensor(workspaceAddr, workspaceSize, + executor, ctx.stream())); ACL_CHECK(aclDestroyIntArray(acl_index)); ACL_CHECK(aclDestroyScalar(acl_value)); } +/** + * @brief Initializes the cache for sine and cosine values. + * + * @details This function prepares a cache for sine and cosine values which can + * be reused in subsequent computations, potentially improving + * performance by avoiding redundant calculations. + * + * @param ctx The context for the CANN backend operations. + * @param dst Pointer to the destination tensor where the + * final result will be stored. + * @param acl_cos_repeat_tensor Pointer to the ACL tensor where repeated cosine + * values will be cached. + * @param acl_sin_repeat_tensor Pointer to the ACL tensor where repeated sine + * values will be cached. + * @param theta_scale A scaling factor applied to the theta values + * before calculating sine and cosine. + * @param is_neox Boolean flag indicating whether to use the Neox + * repeat method for caching. 
+ */ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclTensor* acl_cos_repeat_tensor, aclTensor* acl_sin_repeat_tensor, float theta_scale, bool is_neox) { - // int sin/cos cache, cache has different repeat method depond on @param.is_neox + // int sin/cos cache, cache has different repeat method depond on + // @param.is_neox - ggml_tensor* src0 = dst->src[0]; // input - ggml_tensor* src1 = dst->src[1]; // position + ggml_tensor* src0 = dst->src[0]; // input + ggml_tensor* src1 = dst->src[1]; // position // arange, [0,1,...,ne0/2] int64_t arange_length = src0->ne[0] / 2; ggml_cann_pool_alloc arange_allocator(ctx.pool(), - arange_length*sizeof(float_t)); + arange_length * sizeof(float_t)); void* arange_buffer = arange_allocator.get(); int64_t arange_ne[] = {arange_length, 1, 1, 1}; size_t arange_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t), - arange_length*sizeof(float_t)}; + arange_length * sizeof(float_t)}; - aclTensor* acl_arange_tensor = create_acl_tensor(arange_buffer, ACL_FLOAT, - sizeof(float_t), - arange_ne, arange_nb, - GGML_MAX_DIMS); + aclTensor* acl_arange_tensor = + create_acl_tensor(arange_buffer, ACL_FLOAT, sizeof(float_t), arange_ne, + arange_nb, GGML_MAX_DIMS); float start = 0; float step = 1; float stop = src0->ne[0] / 2; @@ -2182,36 +2540,34 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclnn_arange(ctx, acl_arange_tensor, start, stop, step, n_elements); // power - // aclnnPowScalarTensor(): @param self is tensor which should be scalar, so use aclnn_pow_tensor_tensor() until fixed. - // aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT); - // aclnn_power_scalar_tensor(ctx, acl_theta_scale, acl_arange_tensor, acl_power_tensor); + // aclnnPowScalarTensor(): @param self is tensor which should be scalar, so + // use aclnn_pow_tensor_tensor() until fixed. 
aclScalar* acl_theta_scale = + // aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT); + // aclnn_power_scalar_tensor(ctx, acl_theta_scale, acl_arange_tensor, + // acl_power_tensor); ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(), - arange_length*sizeof(float_t)); + arange_length * sizeof(float_t)); void* theta_scale_buffer = theta_scale_allocator.get(); - aclTensor* acl_theta_scale_tensor = aclnn_ones(ctx, theta_scale_buffer, - arange_length*sizeof(float_t), - arange_ne, GGML_MAX_DIMS, - ACL_FLOAT, sizeof(float_t), - theta_scale); + aclTensor* acl_theta_scale_tensor = aclnn_ones( + ctx, theta_scale_buffer, arange_length * sizeof(float_t), arange_ne, + GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), theta_scale); aclnn_pow_tensor_tensor(ctx, acl_theta_scale_tensor, acl_arange_tensor); // position - GGML_ASSERT(src1->type==GGML_TYPE_I32); + GGML_ASSERT(src1->type == GGML_TYPE_I32); int64_t position_length = src1->ne[0]; int64_t position_ne[] = {1, position_length, 1, 1}; size_t position_nb[] = {sizeof(int32_t), sizeof(int32_t), - sizeof(int32_t)*position_length, - sizeof(int32_t)*position_length}; - aclTensor* acl_position_tensor = create_acl_tensor(src1->data, - type_mapping(src1->type), - ggml_type_size(src1->type), - position_ne, position_nb, - GGML_MAX_DIMS); + sizeof(int32_t) * position_length, + sizeof(int32_t) * position_length}; + aclTensor* acl_position_tensor = create_acl_tensor( + src1->data, type_mapping(src1->type), ggml_type_size(src1->type), + position_ne, position_nb, GGML_MAX_DIMS); // power * position int64_t theta_length = arange_length * position_length; ggml_cann_pool_alloc theta_allocator(ctx.pool(), - theta_length*sizeof(float_t)); + theta_length * sizeof(float_t)); void* theta_buffer = theta_allocator.get(); int64_t theta_ne[] = {arange_length, position_length, 1, 1}; size_t theta_nb[GGML_MAX_DIMS]; @@ -2219,10 +2575,10 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, for (int i = 1; i < GGML_MAX_DIMS; i++) { theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1]; } - aclTensor* acl_theta_tensor = create_acl_tensor(theta_buffer, ACL_FLOAT, - sizeof(float_t), theta_ne, - theta_nb, GGML_MAX_DIMS); - aclnn_noinplcace_mul(ctx, acl_position_tensor, acl_theta_scale_tensor, + aclTensor* acl_theta_tensor = + create_acl_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, + theta_nb, GGML_MAX_DIMS); + aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor, acl_theta_tensor); // permute: [0,1,2,3]->[0,2,1,3] @@ -2233,47 +2589,46 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, permute_nb[i] = permute_nb[i - 1] * permute_ne[i - 1]; } ggml_cann_pool_alloc permute_allocator(ctx.pool(), - theta_length*sizeof(float_t)); + theta_length * sizeof(float_t)); void* permute_buffer = permute_allocator.get(); - aclTensor* acl_permute_tensor = create_acl_tensor( - permute_buffer, ACL_FLOAT, sizeof(float_t), - permute_ne, permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + aclTensor* acl_permute_tensor = + create_acl_tensor(permute_buffer, ACL_FLOAT, sizeof(float_t), + permute_ne, permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); int64_t permute_dim[] = {0, 2, 1, 3}; int64_t num_dims = 4; aclnn_permute(ctx, acl_theta_tensor, acl_permute_tensor, permute_dim, num_dims); // sin/cos - ggml_cann_pool_alloc sin_allocator(ctx.pool(), theta_length*sizeof(float_t)); + ggml_cann_pool_alloc sin_allocator(ctx.pool(), + theta_length * sizeof(float_t)); void* sin_buffer = sin_allocator.get(); - aclTensor* acl_sin_tensor = create_acl_tensor( - 
sin_buffer, ACL_FLOAT, sizeof(float_t), - permute_ne, permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + aclTensor* acl_sin_tensor = + create_acl_tensor(sin_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, + permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); aclnn_sin(ctx, acl_permute_tensor, acl_sin_tensor); - ggml_cann_pool_alloc cos_allocator(ctx.pool(), theta_length*sizeof(float_t)); + ggml_cann_pool_alloc cos_allocator(ctx.pool(), + theta_length * sizeof(float_t)); void* cos_buffer = cos_allocator.get(); - aclTensor* acl_cos_tensor = create_acl_tensor( - cos_buffer, ACL_FLOAT, sizeof(float_t), - permute_ne, permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + aclTensor* acl_cos_tensor = + create_acl_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, + permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); aclnn_cos(ctx, acl_permute_tensor, acl_cos_tensor); // repeat if (is_neox) { - int64_t repeatsArray[] = {1,1,1,2}; + int64_t repeatsArray[] = {1, 1, 1, 2}; aclnn_repeat(ctx, acl_sin_tensor, acl_sin_repeat_tensor, repeatsArray); aclnn_repeat(ctx, acl_cos_tensor, acl_cos_repeat_tensor, repeatsArray); - } - else { + } else { int64_t num_repeats = 2; int64_t dim = 3; - int64_t output_size = arange_length*num_repeats; - aclnn_repeat_interleave(ctx, acl_sin_tensor, - acl_sin_repeat_tensor, dim, num_repeats, - output_size); - aclnn_repeat_interleave(ctx, acl_cos_tensor, - acl_cos_repeat_tensor, dim, num_repeats, - output_size); + int64_t output_size = arange_length * num_repeats; + aclnn_repeat_interleave(ctx, acl_sin_tensor, acl_sin_repeat_tensor, dim, + num_repeats, output_size); + aclnn_repeat_interleave(ctx, acl_cos_tensor, acl_cos_repeat_tensor, dim, + num_repeats, output_size); } // release @@ -2289,38 +2644,35 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { // TODO: use ascendc // Only test with LLAMA model. 
- ggml_tensor* src0 = dst->src[0]; // input - ggml_tensor* src2 = dst->src[2]; // freq_factors - - // TODO: with freq_factors - GGML_ASSERT(src2 == NULL); + ggml_tensor* src0 = dst->src[0]; // input + ggml_tensor* src2 = dst->src[2]; // freq_factors // param float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - //const int n_ctx = ((int32_t *) dst->op_params)[3]; - const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; + // const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t*)dst->op_params)[1]; + const int mode = ((int32_t*)dst->op_params)[2]; + // const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_ctx_orig = ((int32_t*)dst->op_params)[4]; GGML_TENSOR_UNARY_OP_LOCALS - memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + memcpy(&freq_base, (int32_t*)dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t*)dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t*)dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t*)dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float)); GGML_ASSERT(n_dims <= ne0); GGML_ASSERT(n_dims % 2 == 0); - // TODO: ext_factor != 0 + // TODO: with freq_factors, ext_factor != 0, freq_scale != 1 + GGML_ASSERT(src2 == NULL); GGML_ASSERT(ext_factor == 0); - // TODO: freq_scale != 1 GGML_ASSERT(freq_scale == 1); - const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f / n_dims); float corr_dims[2]; ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, @@ -2329,10 +2681,10 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { const bool is_neox = mode & 2; // init cos/sin cache - ggml_cann_pool_alloc sin_allocator(ctx.pool(), src0->ne[0] * src0->ne[2] - * sizeof(float_t)); - ggml_cann_pool_alloc cos_allocator(ctx.pool(), src0->ne[0] * src0->ne[2] - * sizeof(float_t)); + ggml_cann_pool_alloc sin_allocator( + ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t)); + ggml_cann_pool_alloc cos_allocator( + ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t)); void* sin_buffer = sin_allocator.get(); void* cos_buffer = cos_allocator.get(); @@ -2342,16 +2694,12 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { for (int i = 1; i < GGML_MAX_DIMS; i++) { sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1]; } - aclTensor* acl_sin_reshape_tensor = create_acl_tensor(sin_buffer, ACL_FLOAT, - sizeof(float_t), - sin_reshape_ne, - sin_reshape_nb, - GGML_MAX_DIMS); - aclTensor* acl_cos_reshape_tensor = create_acl_tensor(cos_buffer, ACL_FLOAT, - sizeof(float_t), - sin_reshape_ne, - sin_reshape_nb, - GGML_MAX_DIMS); + aclTensor* acl_sin_reshape_tensor = + create_acl_tensor(sin_buffer, ACL_FLOAT, sizeof(float_t), + sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); + aclTensor* acl_cos_reshape_tensor = + create_acl_tensor(cos_buffer, 
ACL_FLOAT, sizeof(float_t), + sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor, theta_scale, is_neox); @@ -2360,32 +2708,25 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclTensor* acl_minus_one_tensor; void* minus_one_scale_buffer = nullptr; ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0)); - ggml_cann_pool_alloc minus_one_scale_allocator(ctx.pool(), - sizeof(float_t) * src0->ne[0]); + ggml_cann_pool_alloc minus_one_scale_allocator( + ctx.pool(), sizeof(float_t) * src0->ne[0]); if (!is_neox) { // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...] input_roll_buffer = roll_allocator.get(); - int64_t input_roll_ne[4] = {2, src0->ne[1]*(src0->ne[0]/2), src0->ne[2], - src0->ne[3]}; + int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2), + src0->ne[2], src0->ne[3]}; size_t input_roll_nb[GGML_MAX_DIMS]; input_roll_nb[0] = ggml_type_size(src0->type); for (int i = 1; i < GGML_MAX_DIMS; i++) { input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1]; } - aclTensor* acl_input_roll_tensor = create_acl_tensor( - input_roll_buffer, - type_mapping(src0->type), - ggml_type_size(src0->type), - input_roll_ne, - input_roll_nb, - GGML_MAX_DIMS); + aclTensor* acl_input_roll_tensor = + create_acl_tensor(input_roll_buffer, type_mapping(src0->type), + ggml_type_size(src0->type), input_roll_ne, + input_roll_nb, GGML_MAX_DIMS); aclTensor* acl_input_tensor = create_acl_tensor( - src0->data, - type_mapping(src0->type), - ggml_type_size(src0->type), - input_roll_ne, - input_roll_nb, - GGML_MAX_DIMS); + src0->data, type_mapping(src0->type), ggml_type_size(src0->type), + input_roll_ne, input_roll_nb, GGML_MAX_DIMS); int64_t shifts[] = {1}; int64_t dims[] = {3}; @@ -2402,28 +2743,25 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { for (int i = 1; i < GGML_MAX_DIMS; i++) { minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; } - acl_minus_one_tensor = aclnn_ones(ctx, minus_one_scale_buffer, - sizeof(float_t) * src0->ne[0], - minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, - sizeof(float_t), 1); + acl_minus_one_tensor = aclnn_ones( + ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0], + minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1); int64_t dim = 3; int64_t* index = new int64_t[src0->ne[0]]; - for (int i=0; ine[0]; i++) { - index[i] = i/2*2; + for (int i = 0; i < src0->ne[0]; i++) { + index[i] = i / 2 * 2; } int64_t index_num = src0->ne[0]; float value = -1; - aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim ,index, index_num, value); - } - else { - // roll input: [q0,q1,q2,...] -> [q_half,q_half+1,...,q_end,q0,q1,...q_half-1] + aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index, + index_num, value); + } else { + // roll input: [q0,q1,q2,...] 
-> + // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1] input_roll_buffer = roll_allocator.get(); aclTensor* acl_input_roll_tensor = create_acl_tensor( - input_roll_buffer, - type_mapping(src0->type), - ggml_type_size(src0->type), - src0->ne, src0->nb, - GGML_MAX_DIMS); + input_roll_buffer, type_mapping(src0->type), + ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS); aclTensor* acl_input_tensor = create_acl_tensor(src0); int64_t shifts[] = {src0->ne[0] / 2}; @@ -2442,19 +2780,19 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { for (int i = 1; i < GGML_MAX_DIMS; i++) { minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; } - acl_minus_one_tensor = aclnn_ones(ctx, minus_one_scale_buffer, - sizeof(float_t) * src0->ne[0], - minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, - sizeof(float_t), 1); + acl_minus_one_tensor = aclnn_ones( + ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0], + minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1); // -1 * first half - int64_t first_half_ne[4] = {src0->ne[0]/2, 1, 1, 1}; + int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1}; size_t first_half_nb[GGML_MAX_DIMS]; first_half_nb[0] = sizeof(float_t); for (int i = 1; i < GGML_MAX_DIMS; i++) { first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1]; } - aclTensor* acl_first_half_tensor = create_acl_tensor(minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), - first_half_ne, first_half_nb, GGML_MAX_DIMS); + aclTensor* acl_first_half_tensor = create_acl_tensor( + minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne, + first_half_nb, GGML_MAX_DIMS); bool inplace = true; float scale = -1; aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace); @@ -2462,10 +2800,11 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } // TODO: n_dims < ne0 - GGML_ASSERT(n_dims==src0->ne[0]); + GGML_ASSERT(n_dims == src0->ne[0]); // input * scale - ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(), ggml_nbytes(src0)); + ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(), + ggml_nbytes(src0)); void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get(); size_t input_nb[GGML_MAX_DIMS]; input_nb[0] = ggml_type_size(src0->type); @@ -2473,19 +2812,13 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { input_nb[i] = input_nb[i - 1] * src0->ne[i - 1]; } aclTensor* acl_input_roll_mul_scale_tensor = create_acl_tensor( - input_roll_mul_scale_buffer, - type_mapping(src0->type), - ggml_type_size(src0->type), - src0->ne, input_nb, - GGML_MAX_DIMS); + input_roll_mul_scale_buffer, type_mapping(src0->type), + ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS); aclTensor* acl_input_roll_reshape_tensor = create_acl_tensor( - input_roll_buffer, - type_mapping(src0->type), - ggml_type_size(src0->type), - src0->ne, input_nb, - GGML_MAX_DIMS); + input_roll_buffer, type_mapping(src0->type), ggml_type_size(src0->type), + src0->ne, input_nb, GGML_MAX_DIMS); - aclnn_noinplcace_mul(ctx, acl_input_roll_reshape_tensor, + aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor, acl_input_roll_mul_scale_tensor); // output @@ -2493,56 +2826,49 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { aclTensor* acl_dst = create_acl_tensor(dst); void* output_fp32_buffer; if (src0->type == GGML_TYPE_F32) { + // dst=src0*cos+input_roll_mul_scale*sin. 
aclnn_inplace_mul(ctx, acl_src0, acl_cos_reshape_tensor); aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor); aclnn_add(ctx, acl_src0, acl_input_roll_mul_scale_tensor, acl_dst); - // TODO: ne0 != n_dims in mode2 - } - else if (src0->type == GGML_TYPE_F16) { - size_t input_fp32_nb[GGML_MAX_DIMS]; - input_fp32_nb[0] = sizeof(float_t); + } else if (src0->type == GGML_TYPE_F16) { + // dst=src0_mul_cos + input_roll_mul_cos. + size_t output_fp32_nb[GGML_MAX_DIMS]; + output_fp32_nb[0] = sizeof(float_t); for (int i = 1; i < GGML_MAX_DIMS; i++) { - input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1]; + output_fp32_nb[i] = output_fp32_nb[i - 1] * dst->ne[i - 1]; } - ggml_cann_pool_alloc fp32_allocator1(ctx.pool(), - ggml_nelements(dst)*sizeof(float_t)); - void* input_fp32_buffer1 = fp32_allocator1.get(); - aclTensor* input_fp32_tensor1 = create_acl_tensor(input_fp32_buffer1, - ACL_FLOAT, - sizeof(float_t), - dst->ne, - input_fp32_nb, - GGML_MAX_DIMS); - ggml_cann_pool_alloc fp32_allocator2(ctx.pool(), - ggml_nelements(dst)*sizeof(float_t)); - void* input_fp32_buffer2 = fp32_allocator2.get(); - aclTensor* input_fp32_tensor2 = create_acl_tensor(input_fp32_buffer2, - ACL_FLOAT, - sizeof(float_t), - dst->ne, - input_fp32_nb, - GGML_MAX_DIMS); - - ggml_cann_pool_alloc fp32_allocator(ctx.pool(), - ggml_nelements(dst)*sizeof(float_t)); - output_fp32_buffer = fp32_allocator.get(); - aclTensor* output_fp32_tensor = create_acl_tensor(output_fp32_buffer, - ACL_FLOAT, - sizeof(float_t), - dst->ne, - input_fp32_nb, - GGML_MAX_DIMS); - aclnn_noinplcace_mul(ctx, acl_src0, acl_cos_reshape_tensor, - input_fp32_tensor1); - aclnn_noinplcace_mul(ctx, acl_input_roll_mul_scale_tensor, - acl_sin_reshape_tensor, input_fp32_tensor2); - aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2, + ggml_cann_pool_alloc src0_mul_cos_allocator( + ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); + void* src0_mul_cos_buffer = src0_mul_cos_allocator.get(); + aclTensor* src0_mul_cos_tensor = + create_acl_tensor(src0_mul_cos_buffer, ACL_FLOAT, sizeof(float_t), + dst->ne, output_fp32_nb, GGML_MAX_DIMS); + + ggml_cann_pool_alloc input_roll_mul_cos_allocator( + ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); + void* input_roll_mul_cos_buffer = input_roll_mul_cos_allocator.get(); + aclTensor* input_roll_mul_cos_tensor = + create_acl_tensor(input_roll_mul_cos_buffer, ACL_FLOAT, + sizeof(float_t), dst->ne, output_fp32_nb, + GGML_MAX_DIMS); + + ggml_cann_pool_alloc output_fp32_allocator( + ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); + output_fp32_buffer = output_fp32_allocator.get(); + aclTensor* output_fp32_tensor = + create_acl_tensor(output_fp32_buffer, ACL_FLOAT, sizeof(float_t), + dst->ne, output_fp32_nb, GGML_MAX_DIMS); + aclnn_mul(ctx, acl_src0, acl_cos_reshape_tensor, + src0_mul_cos_tensor); + aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, + acl_sin_reshape_tensor, input_roll_mul_cos_tensor); + aclnn_add(ctx, src0_mul_cos_tensor, input_roll_mul_cos_tensor, output_fp32_tensor); aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16); - ACL_CHECK(aclDestroyTensor(input_fp32_tensor1)); - ACL_CHECK(aclDestroyTensor(input_fp32_tensor2)); + ACL_CHECK(aclDestroyTensor(src0_mul_cos_tensor)); + ACL_CHECK(aclDestroyTensor(input_roll_mul_cos_tensor)); ACL_CHECK(aclDestroyTensor(output_fp32_tensor)); } diff --git a/src/llama.cpp b/src/llama.cpp index 96395409aa9a4b..f49a4e186c3dca 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -14821,8 +14821,6 @@ static void 
llama_kv_cache_update_internal(struct llama_context & lctx) { llama_graph_compute(lctx, gf, lctx.cparams.n_threads); need_reserve = true; - - LLAMA_LOG_INFO("\n\n\n\nkv cache updated!!!!!\n\n\n\n"); } {
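Note (illustrative only, not part of the patch): the sketch below is a plain CPU reference, under stated assumptions, for the cos/sin cache that aclnn_cache_init assembles on the NPU for ggml_cann_rope: theta_scale = powf(freq_base, -2.0f / n_dims), theta[p][i] = pos[p] * theta_scale^i, then an interleaved repeat of each value for the non-neox mode and a block repeat ({1, 1, 1, 2}) for the neox mode. It assumes n_dims == ne0, freq_scale == 1, ext_factor == 0, and no freq_factors, matching the asserts above; the function and variable names are hypothetical reading aids, not code from the patch.

#include <cmath>
#include <cstdint>
#include <vector>

// Hypothetical CPU reference for the cache built by aclnn_cache_init.
// pos holds the token positions (src1), ne0 is the rotated head dimension
// (src0->ne[0]); sin_out/cos_out are laid out as [position][ne0].
static void rope_cache_reference(const std::vector<int32_t>& pos, int64_t ne0,
                                 float freq_base, bool is_neox,
                                 std::vector<float>& sin_out,
                                 std::vector<float>& cos_out) {
    const int64_t half = ne0 / 2;
    const float theta_scale = std::pow(freq_base, -2.0f / (float) ne0);
    sin_out.assign(pos.size() * ne0, 0.0f);
    cos_out.assign(pos.size() * ne0, 0.0f);
    for (size_t p = 0; p < pos.size(); p++) {
        for (int64_t i = 0; i < half; i++) {
            // theta[p][i] = pos[p] * theta_scale^i (arange + pow + mul above)
            const float theta =
                (float) pos[p] * std::pow(theta_scale, (float) i);
            // non-neox: repeat_interleave -> [v0,v0,v1,v1,...]
            // neox:     repeat {1,1,1,2}  -> [v0,v1,...,v0,v1,...]
            const int64_t j0 = is_neox ? i : 2 * i;
            const int64_t j1 = is_neox ? i + half : 2 * i + 1;
            sin_out[p * ne0 + j0] = std::sin(theta);
            sin_out[p * ne0 + j1] = std::sin(theta);
            cos_out[p * ne0 + j0] = std::cos(theta);
            cos_out[p * ne0 + j1] = std::cos(theta);
        }
    }
}

A cache computed this way can be compared element-wise against the contents of sin_buffer/cos_buffer after aclnn_cache_init to sanity-check the NPU path under the same assumptions.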