diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0040f71ad2c93..da68c5e70a8d3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -841,6 +841,20 @@ if (LLAMA_CANN)
         endif()
     endif()
 
+    # * libacl_op_compiler.so
+    if (LLAMA_CANN)
+        set(lib_dir "${CANN_INSTALL_DIR}/lib64")
+        find_library(found_lib_acl_op_compiler NAMES acl_op_compiler PATHS ${lib_dir} NO_DEFAULT_PATH)
+        if (found_lib_acl_op_compiler)
+            set(lib_acl_op_compiler ${found_lib_acl_op_compiler})
+            list(APPEND CANN_LIBRARIES ${lib_acl_op_compiler})
+            message(STATUS "CANN: libacl_op_compiler.so is found at ${lib_dir}")
+        else()
+            set(LLAMA_CANN OFF)
+            message(WARNING "CANN: Missing libacl_op_compiler.so. Turning off LLAMA_CANN")
+        endif()
+    endif()
+
     # Set headers and libs
     if (LLAMA_CANN)
         message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
diff --git a/ggml-cann.cpp b/ggml-cann.cpp
index bc0cb87585ff8..1f68884f22523 100644
--- a/ggml-cann.cpp
+++ b/ggml-cann.cpp
@@ -8,6 +8,7 @@
 #include "ggml-backend-impl.h"
 #include "ggml-cann/aclnn_ops.h"
 #include "ggml-cann/common.h"
+#include "ggml-cann/acl_ops.h"
 
 struct AclLifeCycle {
     AclLifeCycle() { ACL_CHECK(aclInit(nullptr)); }
@@ -346,8 +347,10 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             ggml_cann_repeat(ctx, dst);
             break;
         case GGML_OP_GET_ROWS:
-        case GGML_OP_DUP:
             return false;
+        case GGML_OP_DUP:
+            ggml_cann_cont(ctx, dst);
+            break;
         case GGML_OP_ADD:
             ggml_cann_add(ctx, dst);
             break;
@@ -394,14 +397,19 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             }
             break;
         case GGML_OP_NORM:
+            ggml_cann_norm(ctx, dst);
+            break;
         case GGML_OP_GROUP_NORM:
             return false;
         case GGML_OP_CONCAT:
             ggml_cann_concat(ctx, dst);
             break;
+        // TODO: Format needs to be NC1HWC0.
         case GGML_OP_UPSCALE:
-        case GGML_OP_PAD:
             return false;
+        case GGML_OP_PAD:
+            ggml_cann_pad(ctx, dst);
+            break;
         case GGML_OP_ARANGE:
             ggml_cann_arange(ctx, dst);
             break;
@@ -413,8 +421,10 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
         case GGML_OP_RMS_NORM:
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
-        case GGML_OP_SCALE:
             return false;
+        case GGML_OP_SCALE:
+            ggml_cann_scale(ctx, dst);
+            break;
         case GGML_OP_SQR:
             ggml_cann_sqr(ctx, dst);
             break;
@@ -422,12 +432,16 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             ggml_cann_clamp(ctx, dst);
             break;
         case GGML_OP_CPY:
+            return false;
         case GGML_OP_CONT:
+            ggml_cann_cont(ctx, dst);
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
+            // Do nothing with these ops.
+            break;
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ROPE:
@@ -437,8 +451,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
         case GGML_OP_SUM_ROWS:
             return false;
         case GGML_OP_ARGSORT:
-            // ggml_cann_argsort(ctx, dst);
-            // break;
+            ggml_cann_argsort(ctx, dst);
+            break;
             return false;
         default:
             return false;
@@ -458,7 +472,8 @@ GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) {
 GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) {
     ggml_backend_cann_context* cann_ctx =
         (ggml_backend_cann_context*)backend->context;
-
+    ACL_CHECK(aclrtSynchronizeDevice());
+    ACL_CHECK(aclrtResetDevice(cann_ctx->device));
     delete cann_ctx;
     delete backend;
 }
@@ -591,8 +606,9 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor* node = cgraph->nodes[i];
 
-        if (ggml_is_empty(node) || node->op == GGML_OP_VIEW ||
-            node->op == GGML_OP_NONE) {
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE ||
+            node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW ||
+            node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
             continue;
         }
 
@@ -627,29 +643,31 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
         case GGML_OP_MUL_MAT_ID:
         case GGML_OP_GET_ROWS:
         case GGML_OP_CPY:
-        case GGML_OP_DUP:
             return false;
+        case GGML_OP_DUP:
+            return true;
         case GGML_OP_REPEAT:
         case GGML_OP_CONCAT:
         case GGML_OP_NONE:
-            return true;
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
+            return true;
         case GGML_OP_NORM:
-            return false;
+            return true;
         case GGML_OP_ADD:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
             return true;
         case GGML_OP_RMS_NORM:
-        case GGML_OP_SCALE:
             return false;
+        case GGML_OP_SCALE:
+            return true;
         case GGML_OP_SQR:
         case GGML_OP_CLAMP:
-            return true;
         case GGML_OP_CONT:
+            return true;
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ROPE:
@@ -659,12 +677,13 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
         case GGML_OP_SUM_ROWS:
             return false;
         case GGML_OP_ARGSORT:
-            return false;
+            return true;
         case GGML_OP_ACC:
         case GGML_OP_GROUP_NORM:
         case GGML_OP_UPSCALE:
-        case GGML_OP_PAD:
             return false;
+        case GGML_OP_PAD:
+            return true;
         case GGML_OP_ARANGE:
             return true;
         case GGML_OP_TIMESTEP_EMBEDDING:
diff --git a/ggml-cann/acl_ops.cpp b/ggml-cann/acl_ops.cpp
new file mode 100644
index 0000000000000..8fe4dc6a05d0f
--- /dev/null
+++ b/ggml-cann/acl_ops.cpp
@@ -0,0 +1,132 @@
+#include "acl_ops.h"
+
+OpCaller::OpCaller() { attrs = aclopCreateAttr(); }
+
+OpCaller::~OpCaller() {
+    for (aclTensorDesc* desc : input_descs) {
+        aclDestroyTensorDesc(desc);
+    }
+    for (aclDataBuffer* buffer : input_buffers) {
+        aclDestroyDataBuffer(buffer);
+    }
+    for (aclTensorDesc* desc : output_descs) {
+        aclDestroyTensorDesc(desc);
+    }
+    for (aclDataBuffer* buffer : output_buffers) {
+        aclDestroyDataBuffer(buffer);
+    }
+    // TODO: may free before use.
+    for (void* ptr : ptrs) {
+        aclrtFree(ptr);
+    }
+    aclopDestroyAttr(attrs);
+}
+
+OpCaller& OpCaller::name(std::string _op_name) {
+    op_name = _op_name;
+    return *this;
+}
+
+OpCaller& OpCaller::input_no_contiguous(ggml_tensor* tensor, const char* name) {
+    aclDataType dtype = type_mapping(tensor->type);
+    // TODO
+    int64_t ne[] = {tensor->ne[3], tensor->ne[2], tensor->ne[1], tensor->ne[0]};
+    aclTensorDesc* tensor_desc =
+        aclCreateTensorDesc(dtype, GGML_MAX_DIMS, ne, ACL_FORMAT_ND);
+    aclSetTensorDescName(tensor_desc, name);
+    input_descs.push_back(tensor_desc);
+    aclDataBuffer* data_buffer =
+        aclCreateDataBuffer(tensor->data, ggml_nbytes(tensor));
+    input_buffers.push_back(data_buffer);
+    return *this;
+}
+
+OpCaller& OpCaller::input(ggml_tensor* tensor, const char* name) {
+    GGML_ASSERT(ggml_is_contiguous(tensor));
+    return input_no_contiguous(tensor, name);
+}
+
+OpCaller& OpCaller::output(ggml_tensor* tensor, const char* name) {
+    aclDataType dtype = type_mapping(tensor->type);
+    aclTensorDesc* tensor_desc =
+        aclCreateTensorDesc(dtype, GGML_MAX_DIMS, tensor->ne, ACL_FORMAT_ND);
+    aclSetTensorDescName(tensor_desc, name);
+    output_descs.push_back(tensor_desc);
+    aclDataBuffer* data_buffer =
+        aclCreateDataBuffer(tensor->data, ggml_nbytes(tensor));
+    output_buffers.push_back(data_buffer);
+    return *this;
+}
+
+OpCaller& OpCaller::attr(int64_t value, const char* name) {
+    ACL_CHECK(aclopSetAttrInt(attrs, name, value));
+    return *this;
+}
+
+OpCaller& OpCaller::attr(bool value, const char* name) {
+    ACL_CHECK(aclopSetAttrBool(attrs, name, value));
+    return *this;
+}
+
+OpCaller& OpCaller::attr(float value, const char* name) {
+    ACL_CHECK(aclopSetAttrFloat(attrs, name, value));
+    return *this;
+}
+
+OpCaller& OpCaller::run(aclrtStream stream) {
+    ACL_CHECK(aclSetCompileopt(ACL_OP_JIT_COMPILE, "disable"));
+    ACL_CHECK(aclopCompileAndExecute(
+        op_name.c_str(), input_descs.size(), input_descs.data(),
+        input_buffers.data(), output_buffers.size(), output_descs.data(),
+        output_buffers.data(), attrs, ACL_ENGINE_SYS, ACL_COMPILE_SYS, nullptr,
+        stream));
+    return *this;
+}
+
+void ggml_cann_cont(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+    int64_t src_stride[GGML_MAX_DIMS];
+    int64_t dst_stride[GGML_MAX_DIMS];
+
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        src_stride[i] = src->nb[i] / ggml_type_size(src->type);
+        dst_stride[i] = dst->nb[i] / ggml_type_size(src->type);
+    }
+
+    int64_t storage_offset[] = {0};
+    int64_t storage_offset_dim[] = {1};
+    int64_t size_stride_dim[] = {GGML_MAX_DIMS};
+
+    OpCaller op;
+    op.name("ViewCopy")
+        .input_no_contiguous(dst, "dst")
+        .input(dst->ne, ACL_INT64, 1, size_stride_dim, "dst_size", ctx.stream())
+        .input(dst_stride, ACL_INT64, 1, size_stride_dim, "dst_stride",
+               ctx.stream())
+        .input(storage_offset, ACL_INT64, 1, storage_offset_dim,
+               "dst_storage_offset", ctx.stream())
+        .input_no_contiguous(src, "src")
+        .input(src->ne, ACL_INT64, 1, size_stride_dim, "src_size", ctx.stream())
+        .input(src_stride, ACL_INT64, 1, size_stride_dim, "src_stride",
+               ctx.stream())
+        .input(storage_offset, ACL_INT64, 1, storage_offset_dim,
+               "src_storage_offset", ctx.stream())
+        .output(dst, "dst")
+        .run(ctx.stream());
+    // aclrtSynchronizeStream(ctx.stream());
+}
+
+void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+    int64_t paddings[] = {
+        0, dst->ne[3] - src->ne[3], 0, dst->ne[2] - src->ne[2],
+        0, dst->ne[1] - src->ne[1], 0, dst->ne[0] - src->ne[0]};
+    int64_t dim[] = {GGML_MAX_DIMS, 2};
+    OpCaller op;
+    op.name("Pad")
+        .input(src, "x")
+        .input(paddings, ACL_INT64, 2, dim, "paddings", ctx.stream())
+        .output(dst, "y")
+        .run(ctx.stream());
+    // aclrtSynchronizeStream(ctx.stream());
+}
diff --git a/ggml-cann/acl_ops.h b/ggml-cann/acl_ops.h
new file mode 100644
index 0000000000000..654da7c16568a
--- /dev/null
+++ b/ggml-cann/acl_ops.h
@@ -0,0 +1,79 @@
+#ifndef CANN_ACL_OPS
+#define CANN_ACL_OPS
+
+#include <acl/acl.h>
+#include <acl/acl_op_compiler.h>
+
+#include <string>
+#include <vector>
+
+#include "bcast.h"
+#include "common.h"
+
+struct OpCaller {
+    std::string op_name;
+    std::vector<aclTensorDesc*> input_descs;
+    std::vector<aclDataBuffer*> input_buffers;
+    std::vector<aclTensorDesc*> output_descs;
+    std::vector<aclDataBuffer*> output_buffers;
+    aclopAttr* attrs;
+    std::vector<void*> ptrs;
+
+    OpCaller();
+
+    virtual ~OpCaller();
+
+    OpCaller& name(std::string _op_name);
+
+    OpCaller& input_no_contiguous(ggml_tensor* tensor, const char* name);
+
+    OpCaller& input(ggml_tensor* tensor, const char* name);
+
+    OpCaller& output(ggml_tensor* tensor, const char* name);
+
+    OpCaller& attr(int64_t value, const char* name);
+
+    OpCaller& attr(bool value, const char* name);
+
+    OpCaller& attr(float value, const char* name);
+
+    template <typename T>
+    OpCaller& input(T* values, aclDataType dtype, size_t dims, int64_t* dim,
+                    const char* name, aclrtStream stream = nullptr) {
+        void* device_ptr = nullptr;
+        size_t n_elem = 1;
+        for (size_t i = 0; i < dims; i++) {
+            n_elem *= dim[i];
+        }
+
+        size_t n_bytes = n_elem * sizeof(T);
+        ACL_CHECK(aclrtMalloc(&device_ptr, n_bytes, ACL_MEM_MALLOC_HUGE_FIRST));
+        ptrs.push_back(device_ptr);
+        if (stream == nullptr) {
+            ACL_CHECK(aclrtMemcpy(device_ptr, n_bytes, values, n_bytes,
+                                  ACL_MEMCPY_HOST_TO_DEVICE));
+        } else {
+            ACL_CHECK(aclrtMemcpyAsync(device_ptr, n_bytes, values, n_bytes,
+                                       ACL_MEMCPY_HOST_TO_DEVICE, stream));
+        }
+
+        aclTensorDesc* tensor_desc =
+            aclCreateTensorDesc(dtype, dims, dim, ACL_FORMAT_ND);
+        aclSetTensorDescName(tensor_desc, name);
+        input_descs.push_back(tensor_desc);
+        aclDataBuffer* data_buffer = aclCreateDataBuffer(device_ptr, n_bytes);
+        input_buffers.push_back(data_buffer);
+
+        return *this;
+    }
+
+    OpCaller& run(aclrtStream stream = nullptr);
+};
+
+void ggml_cann_cont(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+void ggml_cann_upscale(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+#endif  // CANN_ACL_OPS
\ No newline at end of file
diff --git a/ggml-cann/aclnn_ops.cpp b/ggml-cann/aclnn_ops.cpp
index d95938eaca5fa..5e7789106a97a 100644
--- a/ggml-cann/aclnn_ops.cpp
+++ b/ggml-cann/aclnn_ops.cpp
@@ -1,5 +1,8 @@
 #include "aclnn_ops.h"
 
+#include
+#include
+
 #include
 #include
 #include
@@ -91,6 +94,8 @@ void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
     ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst,
                                        &workspaceSize, &executor));
+    // TODO: the workspace should be freed after sync. Add alloc memory to
+    // backend_buffer.
     if (workspaceSize > 0) {
         ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
                               ACL_MEM_MALLOC_HUGE_FIRST));
@@ -265,33 +270,175 @@ void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     }
 }
 
-// TODO: acl kernel only support INT64 for out tensors.
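+// GGML_OP_SCALE: multiply src by the scalar stored in op_params, using aclnnMuls.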
+void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+
+    // scale factor
+    float v;
+    memcpy(&v, dst->op_params, sizeof(float));
+
+    aclScalar* scale = aclCreateScalar(&v, aclDataType::ACL_FLOAT);
+    aclTensor* acl_src = create_acl_tensor(src);
+    aclTensor* acl_dst = create_acl_tensor(dst);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, scale, acl_dst, &workspaceSize,
+                                        &executor));
+    if (workspaceSize > 0)
+        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
+                              ACL_MEM_MALLOC_HUGE_FIRST));
+
+    aclrtStream main_stream = ctx.stream();
+    ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, main_stream));
+
+    ACL_CHECK(aclDestroyScalar(scale));
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+
+    if (workspaceSize > 0) {
+        ACL_CHECK(aclrtFree(workspaceAddr));
+    }
+}
+
 void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_tensor* src = dst->src[0];
     enum ggml_sort_order order = (enum ggml_sort_order)dst->op_params[0];
 
     aclTensor* acl_src = create_acl_tensor(src);
     aclTensor* acl_dst = create_acl_tensor(dst);
+    void* buffer = nullptr;
+    ACL_CHECK(aclrtMalloc(
+        &buffer, ggml_nbytes(dst) / ggml_type_size(dst->type) * sizeof(int64_t),
+        ACL_MEM_MALLOC_HUGE_FIRST));
+    aclTensor* tmp_tensor =
+        create_acl_tensor(buffer, ACL_INT64, ggml_type_size(dst->type), dst->ne,
+                          dst->nb, GGML_MAX_DIMS);
 
     uint64_t workspaceSize = 0;
     aclOpExecutor* executor;
     void* workspaceAddr = nullptr;
 
     ACL_CHECK(aclnnArgsortGetWorkspaceSize(
-        acl_src, 0, (order == GGML_SORT_ORDER_DESC ? true : false), acl_dst,
+        acl_src, -1, (order == GGML_SORT_ORDER_DESC ? true : false), tmp_tensor,
         &workspaceSize, &executor));
-    if (workspaceSize > 0)
+    if (workspaceSize > 0) {
         ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
                               ACL_MEM_MALLOC_HUGE_FIRST));
+    }
 
     aclrtStream main_stream = ctx.stream();
     ACL_CHECK(
         aclnnArgsort(workspaceAddr, workspaceSize, executor, main_stream));
 
+    if (workspaceSize > 0) {
+        ACL_CHECK(aclrtFree(workspaceAddr));
+        workspaceSize = 0;
+    }
+
+    // Cast the INT64 indices produced by Argsort to the dst type.
+    ACL_CHECK(aclnnCastGetWorkspaceSize(tmp_tensor, type_mapping(dst->type),
+                                        acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
+                              ACL_MEM_MALLOC_HUGE_FIRST));
+    }
+
+    ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, main_stream));
+
     ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(tmp_tensor));
     ACL_CHECK(aclDestroyTensor(acl_dst));
 
+    // TODO: optimize argsort kernel or free tmp buffers after stream sync.
+    ACL_CHECK(aclrtSynchronizeStream(main_stream));
+    ACL_CHECK(aclrtFree(buffer));
+
     if (workspaceSize > 0) {
         ACL_CHECK(aclrtFree(workspaceAddr));
     }
 }
+
+void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+
+    aclTensor* acl_src = create_acl_tensor(src);
+    aclTensor* acl_dst = create_acl_tensor(dst);
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+    float *weight_host, *bias_host;
+    int64_t channel = dst->ne[2];
+
+    // Use aclnnBatchNorm with all-ones weight and all-zeros bias, so only the
+    // normalization itself is applied.
+    weight_host = new float[channel];
+    bias_host = new float[channel];
+
+    for (int i = 0; i < channel; i++) {
+        weight_host[i] = 1;
+        bias_host[i] = 0;
+    }
+
+    aclrtStream stream = ctx.stream();
+
+    // Input tensors.
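+    // A single device allocation holds weight, bias, mean and invstd back-to-back.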
+    void *buffer, *acl_weight, *acl_bias, *acl_mean, *acl_invstd;
+    ACL_CHECK(aclrtMalloc(&buffer, 4 * channel * sizeof(float),
+                          ACL_MEM_MALLOC_HUGE_FIRST));
+    acl_weight = buffer;
+    acl_bias = (char*)acl_weight + sizeof(float) * channel;
+    acl_mean = (char*)acl_bias + sizeof(float) * channel;
+    acl_invstd = (char*)acl_mean + sizeof(float) * channel;
+
+    // Set input params.
+    ACL_CHECK(aclrtMemcpyAsync(acl_weight, channel * sizeof(float), weight_host,
+                               channel * sizeof(float), ACL_MEMCPY_HOST_TO_DEVICE, stream));
+    ACL_CHECK(aclrtMemcpyAsync(acl_bias, channel * sizeof(float), bias_host,
+                               channel * sizeof(float), ACL_MEMCPY_HOST_TO_DEVICE, stream));
+
+    // Create input tensors.
+    int64_t input_tensor_shape[] = {channel};
+    size_t input_tensor_stride[] = {1};
+    aclTensor* weight =
+        create_acl_tensor(acl_weight, ACL_FLOAT, sizeof(float),
+                          input_tensor_shape, input_tensor_stride, 1);
+    aclTensor* bias =
+        create_acl_tensor(acl_bias, ACL_FLOAT, sizeof(float),
+                          input_tensor_shape, input_tensor_stride, 1);
+    aclTensor* mean =
+        create_acl_tensor(acl_mean, ACL_FLOAT, sizeof(float),
+                          input_tensor_shape, input_tensor_stride, 1);
+    aclTensor* invstd =
+        create_acl_tensor(acl_invstd, ACL_FLOAT, sizeof(float),
+                          input_tensor_shape, input_tensor_stride, 1);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnBatchNormGetWorkspaceSize(
+        acl_src, weight, bias, nullptr, nullptr, false, 0, eps, acl_dst, mean,
+        invstd, &workspaceSize, &executor));
+
+    if (workspaceSize > 0) {
+        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize,
+                              ACL_MEM_MALLOC_HUGE_FIRST));
+    }
+
+    ACL_CHECK(aclnnBatchNorm(workspaceAddr, workspaceSize, executor, stream));
+
+    ACL_CHECK(aclDestroyTensor(weight));
+    ACL_CHECK(aclDestroyTensor(bias));
+    ACL_CHECK(aclDestroyTensor(mean));
+    ACL_CHECK(aclDestroyTensor(invstd));
+
+    // TODO: optimize norm kernel or free tmp buffers after stream sync.
+    ACL_CHECK(aclrtSynchronizeStream(stream));
+    delete[] weight_host;
+    delete[] bias_host;
+    ACL_CHECK(aclrtFree(buffer));
+
+    if (workspaceSize > 0) {
+        ACL_CHECK(aclrtFree(workspaceAddr));
+    }
+}
\ No newline at end of file
diff --git a/ggml-cann/aclnn_ops.h b/ggml-cann/aclnn_ops.h
index 752736bbe7d90..1c963a4a60cd4 100644
--- a/ggml-cann/aclnn_ops.h
+++ b/ggml-cann/aclnn_ops.h
@@ -1,3 +1,6 @@
+#ifndef CANN_ACLNN_OPS
+#define CANN_ACLNN_OPS
+
 #include
 #include
 #include
@@ -30,8 +33,12 @@
 void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
 void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
+void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
 void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
+void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
 template
@@ -148,3 +155,5 @@ void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         ACL_CHECK(aclrtFree(workspaceAddr));
     }
 }
+
+#endif  // CANN_ACLNN_OPS
\ No newline at end of file
diff --git a/ggml-cann/bcast.cpp b/ggml-cann/bcast.cpp
index 0a7d478a98439..33d14bac64d4d 100644
--- a/ggml-cann/bcast.cpp
+++ b/ggml-cann/bcast.cpp
@@ -1,4 +1,6 @@
 #include "bcast.h"
+#include <algorithm>
+#include <cstring>
 
 /**
  * Mapping ggml_tensor type to acl_tensor type.
@@ -29,7 +31,7 @@ aclDataType type_mapping(ggml_type type) {
  * otherwise, use bcast_ne bcast_stride, which means tensor dims should be
  * changed to satisfy the broadcast. @sa: get_bcast_shape.
  */
-aclTensor* create_acl_tensor(const ggml_tensor* tensor, const int64_t* bcast_ne,
+aclTensor* create_acl_tensor(const ggml_tensor* tensor, int64_t* bcast_ne,
                              int64_t* bcast_stride, int64_t bcast_dims) {
     size_t size = ggml_nbytes(tensor);
     void* deviceAddr = nullptr;
@@ -62,6 +64,9 @@ aclTensor* create_acl_tensor(const ggml_tensor* tensor, const int64_t* bcast_ne,
     }
 
     int64_t dims = (bcast_dims == 0 ? GGML_MAX_DIMS : bcast_dims);
+    std::reverse(acl_ne, acl_ne + dims);
+    std::reverse(acl_stride, acl_stride + dims);
+
     aclTensor* acl_tensor =
         aclCreateTensor(acl_ne, dims, type_mapping(tensor->type), acl_stride, 0,
                         aclFormat::ACL_FORMAT_ND, acl_ne, dims, deviceAddr);
@@ -69,6 +74,27 @@ aclTensor* create_acl_tensor(const ggml_tensor* tensor, const int64_t* bcast_ne,
     return acl_tensor;
 }
 
+aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, size_t type_size,
+                             int64_t* ne, size_t* nb, int64_t dims) {
+
+    int64_t tmp_ne[GGML_MAX_DIMS * 2];
+    int64_t tmp_stride[GGML_MAX_DIMS * 2];
+
+    memcpy(tmp_ne, ne, dims * sizeof(int64_t));
+    for (int i = 0; i < dims; i++) {
+        tmp_stride[i] = nb[i] / type_size;
+    }
+
+    std::reverse(tmp_ne, tmp_ne + dims);
+    std::reverse(tmp_stride, tmp_stride + dims);
+
+    aclTensor* acl_tensor =
+        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, 0,
+                        aclFormat::ACL_FORMAT_ND, tmp_ne, dims, data_ptr);
+
+    return acl_tensor;
+}
+
 /**
  * Add extra dims to satisfy acl kernel's broadcast rules (same as numpy).
  * ggml_tensor dimension order is reversed compared to Python.
diff --git a/ggml-cann/bcast.h b/ggml-cann/bcast.h
index 82c6c120761c3..f27b00d3a9b18 100644
--- a/ggml-cann/bcast.h
+++ b/ggml-cann/bcast.h
@@ -1,3 +1,6 @@
+#ifndef CANN_BCAST_H
+#define CANN_BCAST_H
+
 #include
 
 #include "common.h"
@@ -6,10 +9,13 @@
 aclDataType type_mapping(ggml_type type);
 
 aclTensor* create_acl_tensor(const ggml_tensor* tensor,
-                             const int64_t* bcast_ne = nullptr,
+                             int64_t* bcast_ne = nullptr,
                              int64_t* bcast_stride = nullptr,
                              int64_t bcast_dims = 0);
 
+aclTensor* create_acl_tensor(void* data_ptr, aclDataType dtype, size_t type_size,
+                             int64_t* ne, size_t* nb, int64_t dims);
+
 bool need_bcast(const ggml_tensor* t0, const ggml_tensor* t1);
 
 int64_t get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
@@ -27,3 +33,5 @@ int64_t get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
                         bcast_stride_##src0, bcast_stride_##src1);
 
 #define BCAST_PARAM(src) bcast_ne_##src, bcast_stride_##src, bcast_dims
+
+#endif  // CANN_BCAST_H
\ No newline at end of file
diff --git a/ggml-cann/common.h b/ggml-cann/common.h
index 9334890ffca32..a6b1060da58ea 100644
--- a/ggml-cann/common.h
+++ b/ggml-cann/common.h
@@ -1,4 +1,5 @@
-#pragma once
+#ifndef CANN_COMMON_H
+#define CANN_COMMON_H
 
 #include
 
@@ -80,3 +81,5 @@ struct ggml_backend_cann_context {
 
     aclrtStream stream() { return stream(device, 0); }
 };
+
+#endif  // CANN_COMMON_H
\ No newline at end of file