Add aclop
hipudding committed Apr 2, 2024
1 parent 5fec9cb commit c330b78
Showing 9 changed files with 458 additions and 21 deletions.
14 changes: 14 additions & 0 deletions CMakeLists.txt
@@ -841,6 +841,20 @@ if (LLAMA_CANN)
     endif()
 endif()
 
+# * libacl_op_compiler.so
+if (LLAMA_CANN)
+    set(lib_dir "${CANN_INSTALL_DIR}/lib64")
+    find_library(found_lib_acl_op_compiler NAMES acl_op_compiler PATHS ${lib_dir} NO_DEFAULT_PATH)
+    if (found_lib_acl_op_compiler)
+        set(lib_acl_op_compiler ${found_lib_acl_op_compiler})
+        list(APPEND CANN_LIBRARIES ${lib_acl_op_compiler})
+        message(STATUS "CANN: libacl_op_compiler.so is found at ${lib_dir}")
+    else()
+        set(LLAMA_CANN OFF)
+        message(WARNING "CANN: Missing libacl_op_compiler.so. Turning off LLAMA_CANN")
+    endif()
+endif()
+
 # Set headers and libs
 if (LLAMA_CANN)
     message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
49 changes: 34 additions & 15 deletions ggml-cann.cpp
@@ -8,6 +8,7 @@
 #include "ggml-backend-impl.h"
 #include "ggml-cann/aclnn_ops.h"
 #include "ggml-cann/common.h"
+#include "ggml-cann/acl_ops.h"
 
 struct AclLifeCycle {
     AclLifeCycle() { ACL_CHECK(aclInit(nullptr)); }
@@ -346,8 +347,10 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             ggml_cann_repeat(ctx, dst);
             break;
         case GGML_OP_GET_ROWS:
-        case GGML_OP_DUP:
             return false;
+        case GGML_OP_DUP:
+            ggml_cann_cont(ctx, dst);
+            break;
         case GGML_OP_ADD:
             ggml_cann_add(ctx, dst);
             break;
@@ -394,14 +397,19 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             }
             break;
         case GGML_OP_NORM:
+            ggml_cann_norm(ctx, dst);
+            break;
         case GGML_OP_GROUP_NORM:
             return false;
         case GGML_OP_CONCAT:
             ggml_cann_concat(ctx, dst);
             break;
+        // TODO: Format need NC1HWC0.
         case GGML_OP_UPSCALE:
-        case GGML_OP_PAD:
             return false;
+        case GGML_OP_PAD:
+            ggml_cann_pad(ctx, dst);
+            break;
         case GGML_OP_ARANGE:
             ggml_cann_arange(ctx, dst);
             break;
@@ -413,21 +421,27 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
         case GGML_OP_RMS_NORM:
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
-        case GGML_OP_SCALE:
             return false;
+        case GGML_OP_SCALE:
+            ggml_cann_scale(ctx, dst);
+            break;
         case GGML_OP_SQR:
             ggml_cann_sqr(ctx, dst);
             break;
         case GGML_OP_CLAMP:
+            ggml_cann_clamp(ctx, dst);
+            break;
         case GGML_OP_CPY:
             return false;
         case GGML_OP_CONT:
+            ggml_cann_cont(ctx, dst);
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
+            // Do nothing with these ops.
             break;
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ROPE:
@@ -437,8 +451,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
         case GGML_OP_SUM_ROWS:
             return false;
         case GGML_OP_ARGSORT:
-            // ggml_cann_argsort(ctx, dst);
-            // break;
+            ggml_cann_argsort(ctx, dst);
+            break;
             return false;
         default:
             return false;
@@ -458,7 +472,8 @@ GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) {
 GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) {
     ggml_backend_cann_context* cann_ctx =
         (ggml_backend_cann_context*)backend->context;
-
+    ACL_CHECK(aclrtSynchronizeDevice());
+    ACL_CHECK(aclrtResetDevice(cann_ctx->device));
     delete cann_ctx;
     delete backend;
 }
@@ -591,8 +606,9 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor* node = cgraph->nodes[i];
 
-        if (ggml_is_empty(node) || node->op == GGML_OP_VIEW ||
-            node->op == GGML_OP_NONE) {
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE ||
+            node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW ||
+            node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
             continue;
         }
 
@@ -627,29 +643,31 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
         case GGML_OP_MUL_MAT_ID:
         case GGML_OP_GET_ROWS:
         case GGML_OP_CPY:
-        case GGML_OP_DUP:
             return false;
+        case GGML_OP_DUP:
+            return true;
         case GGML_OP_REPEAT:
         case GGML_OP_CONCAT:
         case GGML_OP_NONE:
             return true;
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
             return true;
         case GGML_OP_NORM:
-            return false;
+            return true;
         case GGML_OP_ADD:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
             return true;
         case GGML_OP_RMS_NORM:
-        case GGML_OP_SCALE:
             return false;
+        case GGML_OP_SCALE:
+            return true;
         case GGML_OP_SQR:
         case GGML_OP_CLAMP:
             return true;
         case GGML_OP_CONT:
             return true;
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ROPE:
@@ -659,12 +677,13 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
         case GGML_OP_SUM_ROWS:
             return false;
         case GGML_OP_ARGSORT:
-            return false;
+            return true;
         case GGML_OP_ACC:
         case GGML_OP_GROUP_NORM:
         case GGML_OP_UPSCALE:
-        case GGML_OP_PAD:
             return false;
+        case GGML_OP_PAD:
+            return true;
         case GGML_OP_ARANGE:
             return true;
         case GGML_OP_TIMESTEP_EMBEDDING:
132 changes: 132 additions & 0 deletions ggml-cann/acl_ops.cpp
@@ -0,0 +1,132 @@
#include "acl_ops.h"

OpCaller::OpCaller() { attrs = aclopCreateAttr(); }

OpCaller::~OpCaller() {
for (aclTensorDesc* desc : input_descs) {
aclDestroyTensorDesc(desc);
}
for (aclDataBuffer* buffer : input_buffers) {
aclDestroyDataBuffer(buffer);
}
for (aclTensorDesc* desc : output_descs) {
aclDestroyTensorDesc(desc);
}
for (aclDataBuffer* buffer : output_buffers) {
aclDestroyDataBuffer(buffer);
}
// TODO: may free before use.
for (void* ptr : ptrs) {
aclrtFree(ptr);
}
aclopDestroyAttr(attrs);
}

OpCaller& OpCaller::name(std::string _op_name) {
op_name = _op_name;
return *this;
}

OpCaller& OpCaller::input_no_contiguous(ggml_tensor* tensor, const char* name) {
aclDataType dtype = type_mapping(tensor->type);
// TODO
int64_t ne[] = {tensor->ne[3], tensor->ne[2], tensor->ne[1], tensor->ne[0]};
aclTensorDesc* tensor_desc =
aclCreateTensorDesc(dtype, GGML_MAX_DIMS, ne, ACL_FORMAT_ND);
aclSetTensorDescName(tensor_desc, name);
input_descs.push_back(tensor_desc);
aclDataBuffer* data_buffer =
aclCreateDataBuffer(tensor->data, ggml_nbytes(tensor));
input_buffers.push_back(data_buffer);
return *this;
}

OpCaller& OpCaller::input(ggml_tensor* tensor, const char* name) {
GGML_ASSERT(ggml_is_contiguous(tensor));
return input_no_contiguous(tensor, name);
}

OpCaller& OpCaller::output(ggml_tensor* tensor, const char* name) {
aclDataType dtype = type_mapping(tensor->type);
aclTensorDesc* tensor_desc =
aclCreateTensorDesc(dtype, GGML_MAX_DIMS, tensor->ne, ACL_FORMAT_ND);
aclSetTensorDescName(tensor_desc, name);
output_descs.push_back(tensor_desc);
aclDataBuffer* data_buffer =
aclCreateDataBuffer(tensor->data, ggml_nbytes(tensor));
output_buffers.push_back(data_buffer);
return *this;
}

OpCaller& OpCaller::attr(int64_t value, const char* name) {
ACL_CHECK(aclopSetAttrInt(attrs, name, value));
return *this;
}

OpCaller& OpCaller::attr(bool value, const char* name) {
ACL_CHECK(aclopSetAttrBool(attrs, name, value));
return *this;
}

OpCaller& OpCaller::attr(float value, const char* name) {
ACL_CHECK(aclopSetAttrFloat(attrs, name, value));
return *this;
}

OpCaller& OpCaller::run(aclrtStream stream) {
ACL_CHECK(aclSetCompileopt(ACL_OP_JIT_COMPILE, "disable"));
ACL_CHECK(aclopCompileAndExecute(
op_name.c_str(), input_descs.size(), input_descs.data(),
input_buffers.data(), output_buffers.size(), output_descs.data(),
output_buffers.data(), attrs, ACL_ENGINE_SYS, ACL_COMPILE_SYS, nullptr,
stream));
return *this;
}

void ggml_cann_cont(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src = dst->src[0];
int64_t src_stride[GGML_MAX_DIMS];
int64_t dst_stride[GGML_MAX_DIMS];

for (int i = 0; i < GGML_MAX_DIMS; i++) {
src_stride[i] = src->nb[i] / ggml_type_size(src->type);
dst_stride[i] = dst->nb[i] / ggml_type_size(src->type);
}

int64_t storage_offset[] = {0};
int64_t storage_offset_dim[] = {1};
int64_t size_stride_dim[] = {GGML_MAX_DIMS};

OpCaller op;
op.name("ViewCopy")
.input_no_contiguous(dst, "dst")
.input(dst->ne, ACL_INT64, 1, size_stride_dim, "dst_size", ctx.stream())
.input(dst_stride, ACL_INT64, 1, size_stride_dim, "dst_stride",
ctx.stream())
.input(storage_offset, ACL_INT64, 1, storage_offset_dim,
"dst_storage_offset", ctx.stream())
.input_no_contiguous(src, "src")
.input(src->ne, ACL_INT64, 1, size_stride_dim, "src_size", ctx.stream())
.input(src_stride, ACL_INT64, 1, size_stride_dim, "src_stride",
ctx.stream())
.input(storage_offset, ACL_INT64, 1, storage_offset_dim,
"src_storage_offset", ctx.stream())
.output(dst, "dst")
.run(ctx.stream());
//aclrtSynchronizeStream(ctx.stream());
}

void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src = dst->src[0];
int64_t paddings[] = {
0, dst->ne[3] - src->ne[3], 0, dst->ne[2] - src->ne[2],
0, dst->ne[1] - src->ne[1], 0, dst->ne[0] - src->ne[0]};
int64_t dim[] = {GGML_MAX_DIMS, 2};
OpCaller op;
op.name("Pad")
.input(src, "x")
.input(paddings, ACL_INT64, 2, dim, "paddings", ctx.stream())
.output(dst, "y")
.run(ctx.stream());
//aclrtSynchronizeStream(ctx.stream());
}
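
A note on the stride handling in ggml_cann_cont above: ggml records byte strides in tensor->nb, while the ViewCopy operator consumes element strides, hence the division by ggml_type_size. A standalone sketch of that conversion (the stride values are illustrative example numbers, not taken from this commit):

    // Byte strides (ggml's nb[]) -> element strides (what ViewCopy takes).
    #include <cstdint>
    #include <cstdio>

    int main() {
        const size_t type_size = sizeof(float);        // assume an fp32 tensor
        const size_t nb[4] = {12, 4, 48, 48};          // byte strides of a transposed 3x4 view
        int64_t stride[4];
        for (int i = 0; i < 4; i++) {
            stride[i] = (int64_t)(nb[i] / type_size);  // -> 3, 1, 12, 12
        }
        for (int i = 0; i < 4; i++) {
            printf("stride[%d] = %lld\n", i, (long long)stride[i]);
        }
        return 0;
    }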
79 changes: 79 additions & 0 deletions ggml-cann/acl_ops.h
@@ -0,0 +1,79 @@
#ifndef CANN_ACL_OPS
#define CANN_ACL_OPS

#include <acl/acl_op.h>
#include <acl/acl_op_compiler.h>

#include <string>
#include <vector>

#include "bcast.h"
#include "common.h"

struct OpCaller {
    std::string op_name;
    std::vector<aclTensorDesc*> input_descs;
    std::vector<aclDataBuffer*> input_buffers;
    std::vector<aclTensorDesc*> output_descs;
    std::vector<aclDataBuffer*> output_buffers;
    aclopAttr* attrs;
    std::vector<void*> ptrs;

    OpCaller();

    virtual ~OpCaller();

    OpCaller& name(std::string _op_name);

    OpCaller& input_no_contiguous(ggml_tensor* tensor, const char* name);

    OpCaller& input(ggml_tensor* tensor, const char* name);

    OpCaller& output(ggml_tensor* tensor, const char* name);

    OpCaller& attr(int64_t value, const char* name);

    OpCaller& attr(bool value, const char* name);

    OpCaller& attr(float value, const char* name);

    template <typename T>
    OpCaller& input(T* values, aclDataType dtype, size_t dims, int64_t* dim,
                    const char* name, aclrtStream stream = nullptr) {
        void* device_ptr = nullptr;
        size_t n_elem = 1;
        for (size_t i = 0; i < dims; i++) {
            n_elem *= dim[i];
        }

        size_t n_bytes = n_elem * sizeof(T);
        ACL_CHECK(aclrtMalloc(&device_ptr, n_bytes, ACL_MEM_MALLOC_HUGE_FIRST));
        ptrs.push_back(device_ptr);
        if (stream == nullptr) {
            ACL_CHECK(aclrtMemcpy(device_ptr, n_bytes, values, n_bytes,
                                  ACL_MEMCPY_HOST_TO_DEVICE));
        } else {
            ACL_CHECK(aclrtMemcpyAsync(device_ptr, n_bytes, values, n_bytes,
                                       ACL_MEMCPY_HOST_TO_DEVICE, stream));
        }

        aclTensorDesc* tensor_desc =
            aclCreateTensorDesc(dtype, dims, dim, ACL_FORMAT_ND);
        aclSetTensorDescName(tensor_desc, name);
        input_descs.push_back(tensor_desc);
        aclDataBuffer* data_buffer = aclCreateDataBuffer(device_ptr, n_bytes);
        input_buffers.push_back(data_buffer);

        return *this;
    }

    OpCaller& run(aclrtStream stream = nullptr);
};

void ggml_cann_cont(ggml_backend_cann_context& ctx, ggml_tensor* dst);

void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst);

void ggml_cann_upscale(ggml_backend_cann_context& ctx, ggml_tensor* dst);

#endif  // CANN_ACL_OPS
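
Usage note: OpCaller is a fluent builder, meant to be chained and executed in one expression, exactly as ggml_cann_pad does above. A minimal sketch (src, dst, and stream stand for a valid source tensor, destination tensor, and initialized aclrtStream; this snippet is illustrative and not part of the commit):

    // Pad src into dst with 2 trailing elements on dim 0 (illustrative).
    int64_t paddings[] = {0, 0, 0, 0, 0, 0, 0, 2};  // {before, after} per dim, dim 3 -> dim 0
    int64_t dim[] = {GGML_MAX_DIMS, 2};
    OpCaller op;
    op.name("Pad")                                   // CANN operator type
        .input(src, "x")                             // contiguous device tensor
        .input(paddings, ACL_INT64, 2, dim, "paddings", stream)
        .output(dst, "y")
        .run(stream);                                // compile + launch on stream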