Add aclop
hipudding committed Apr 2, 2024
1 parent 5fec9cb commit c330b78
Showing 9 changed files with 458 additions and 21 deletions.
14 changes: 14 additions & 0 deletions CMakeLists.txt
@@ -841,6 +841,20 @@ if (LLAMA_CANN)
     endif()
 endif()
 
+# * libacl_op_compiler.so
+if (LLAMA_CANN)
+    set(lib_dir "${CANN_INSTALL_DIR}/lib64")
+    find_library(found_lib_acl_op_compiler NAMES acl_op_compiler PATHS ${lib_dir} NO_DEFAULT_PATH)
+    if (found_lib_acl_op_compiler)
+        set(lib_acl_op_compiler ${found_lib_acl_op_compiler})
+        list(APPEND CANN_LIBRARIES ${lib_acl_op_compiler})
+        message(STATUS "CANN: libacl_op_compiler.so is found at ${lib_dir}")
+    else()
+        set(LLAMA_CANN OFF)
+        message(WARNING "CANN: Missing libacl_op_compiler.so. Turning off LLAMA_CANN")
+    endif()
+endif()
+
 # Set headers and libs
 if (LLAMA_CANN)
     message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
49 changes: 34 additions & 15 deletions ggml-cann.cpp
@@ -8,6 +8,7 @@
 #include "ggml-backend-impl.h"
 #include "ggml-cann/aclnn_ops.h"
 #include "ggml-cann/common.h"
+#include "ggml-cann/acl_ops.h"
 
 struct AclLifeCycle {
     AclLifeCycle() { ACL_CHECK(aclInit(nullptr)); }
@@ -346,8 +347,10 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             ggml_cann_repeat(ctx, dst);
             break;
         case GGML_OP_GET_ROWS:
-        case GGML_OP_DUP:
             return false;
+        case GGML_OP_DUP:
+            ggml_cann_cont(ctx, dst);
+            break;
         case GGML_OP_ADD:
             ggml_cann_add(ctx, dst);
             break;
@@ -394,14 +397,19 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             }
             break;
         case GGML_OP_NORM:
+            ggml_cann_norm(ctx, dst);
+            break;
         case GGML_OP_GROUP_NORM:
             return false;
         case GGML_OP_CONCAT:
             ggml_cann_concat(ctx, dst);
             break;
+        // TODO: Format need NC1HWC0.
         case GGML_OP_UPSCALE:
-        case GGML_OP_PAD:
             return false;
+        case GGML_OP_PAD:
+            ggml_cann_pad(ctx, dst);
+            break;
         case GGML_OP_ARANGE:
             ggml_cann_arange(ctx, dst);
             break;
@@ -413,21 +421,27 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
         case GGML_OP_RMS_NORM:
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
-        case GGML_OP_SCALE:
             return false;
+        case GGML_OP_SCALE:
+            ggml_cann_scale(ctx, dst);
+            break;
         case GGML_OP_SQR:
             ggml_cann_sqr(ctx, dst);
             break;
         case GGML_OP_CLAMP:
+            ggml_cann_clamp(ctx, dst);
+            break;
         case GGML_OP_CPY:
             return false;
         case GGML_OP_CONT:
+            ggml_cann_cont(ctx, dst);
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
+            // Do nothing with these ops.
             break;
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ROPE:
@@ -437,8 +451,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
         case GGML_OP_SUM_ROWS:
             return false;
         case GGML_OP_ARGSORT:
-            // ggml_cann_argsort(ctx, dst);
-            // break;
+            ggml_cann_argsort(ctx, dst);
+            break;
             return false;
         default:
             return false;
@@ -458,7 +472,8 @@ GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) {
 GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) {
     ggml_backend_cann_context* cann_ctx =
         (ggml_backend_cann_context*)backend->context;
-
+    ACL_CHECK(aclrtSynchronizeDevice());
+    ACL_CHECK(aclrtResetDevice(cann_ctx->device));
     delete cann_ctx;
     delete backend;
 }
@@ -591,8 +606,9 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor* node = cgraph->nodes[i];
 
-        if (ggml_is_empty(node) || node->op == GGML_OP_VIEW ||
-            node->op == GGML_OP_NONE) {
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE ||
+            node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW ||
+            node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
             continue;
         }
 
@@ -627,29 +643,31 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
         case GGML_OP_MUL_MAT_ID:
         case GGML_OP_GET_ROWS:
         case GGML_OP_CPY:
-        case GGML_OP_DUP:
             return false;
+        case GGML_OP_DUP:
+            return true;
         case GGML_OP_REPEAT:
         case GGML_OP_CONCAT:
         case GGML_OP_NONE:
             return true;
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
             return true;
         case GGML_OP_NORM:
-            return false;
+            return true;
         case GGML_OP_ADD:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
             return true;
         case GGML_OP_RMS_NORM:
-        case GGML_OP_SCALE:
             return false;
+        case GGML_OP_SCALE:
+            return true;
         case GGML_OP_SQR:
         case GGML_OP_CLAMP:
             return true;
         case GGML_OP_CONT:
             return true;
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ROPE:
@@ -659,12 +677,13 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
         case GGML_OP_SUM_ROWS:
             return false;
         case GGML_OP_ARGSORT:
-            return false;
+            return true;
         case GGML_OP_ACC:
         case GGML_OP_GROUP_NORM:
         case GGML_OP_UPSCALE:
-        case GGML_OP_PAD:
             return false;
+        case GGML_OP_PAD:
+            return true;
         case GGML_OP_ARANGE:
             return true;
         case GGML_OP_TIMESTEP_EMBEDDING:
132 changes: 132 additions & 0 deletions ggml-cann/acl_ops.cpp
@@ -0,0 +1,132 @@
#include "acl_ops.h"

OpCaller::OpCaller() { attrs = aclopCreateAttr(); }

OpCaller::~OpCaller() {
for (aclTensorDesc* desc : input_descs) {
aclDestroyTensorDesc(desc);
}
for (aclDataBuffer* buffer : input_buffers) {
aclDestroyDataBuffer(buffer);
}
for (aclTensorDesc* desc : output_descs) {
aclDestroyTensorDesc(desc);
}
for (aclDataBuffer* buffer : output_buffers) {
aclDestroyDataBuffer(buffer);
}
// TODO: may free before use.
for (void* ptr : ptrs) {
aclrtFree(ptr);
}
aclopDestroyAttr(attrs);
}

OpCaller& OpCaller::name(std::string _op_name) {
op_name = _op_name;
return *this;
}

OpCaller& OpCaller::input_no_contiguous(ggml_tensor* tensor, const char* name) {
aclDataType dtype = type_mapping(tensor->type);
// TODO
int64_t ne[] = {tensor->ne[3], tensor->ne[2], tensor->ne[1], tensor->ne[0]};
aclTensorDesc* tensor_desc =
aclCreateTensorDesc(dtype, GGML_MAX_DIMS, ne, ACL_FORMAT_ND);
aclSetTensorDescName(tensor_desc, name);
input_descs.push_back(tensor_desc);
aclDataBuffer* data_buffer =
aclCreateDataBuffer(tensor->data, ggml_nbytes(tensor));
input_buffers.push_back(data_buffer);
return *this;
}

OpCaller& OpCaller::input(ggml_tensor* tensor, const char* name) {
GGML_ASSERT(ggml_is_contiguous(tensor));
return input_no_contiguous(tensor, name);
}

OpCaller& OpCaller::output(ggml_tensor* tensor, const char* name) {
aclDataType dtype = type_mapping(tensor->type);
aclTensorDesc* tensor_desc =
aclCreateTensorDesc(dtype, GGML_MAX_DIMS, tensor->ne, ACL_FORMAT_ND);
aclSetTensorDescName(tensor_desc, name);
output_descs.push_back(tensor_desc);
aclDataBuffer* data_buffer =
aclCreateDataBuffer(tensor->data, ggml_nbytes(tensor));
output_buffers.push_back(data_buffer);
return *this;
}

OpCaller& OpCaller::attr(int64_t value, const char* name) {
ACL_CHECK(aclopSetAttrInt(attrs, name, value));
return *this;
}

OpCaller& OpCaller::attr(bool value, const char* name) {
ACL_CHECK(aclopSetAttrBool(attrs, name, value));
return *this;
}

OpCaller& OpCaller::attr(float value, const char* name) {
ACL_CHECK(aclopSetAttrFloat(attrs, name, value));
return *this;
}

OpCaller& OpCaller::run(aclrtStream stream) {
ACL_CHECK(aclSetCompileopt(ACL_OP_JIT_COMPILE, "disable"));
ACL_CHECK(aclopCompileAndExecute(
op_name.c_str(), input_descs.size(), input_descs.data(),
input_buffers.data(), output_buffers.size(), output_descs.data(),
output_buffers.data(), attrs, ACL_ENGINE_SYS, ACL_COMPILE_SYS, nullptr,
stream));
return *this;
}

void ggml_cann_cont(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src = dst->src[0];
int64_t src_stride[GGML_MAX_DIMS];
int64_t dst_stride[GGML_MAX_DIMS];

for (int i = 0; i < GGML_MAX_DIMS; i++) {
src_stride[i] = src->nb[i] / ggml_type_size(src->type);
dst_stride[i] = dst->nb[i] / ggml_type_size(src->type);
}

int64_t storage_offset[] = {0};
int64_t storage_offset_dim[] = {1};
int64_t size_stride_dim[] = {GGML_MAX_DIMS};

OpCaller op;
op.name("ViewCopy")
.input_no_contiguous(dst, "dst")
.input(dst->ne, ACL_INT64, 1, size_stride_dim, "dst_size", ctx.stream())
.input(dst_stride, ACL_INT64, 1, size_stride_dim, "dst_stride",
ctx.stream())
.input(storage_offset, ACL_INT64, 1, storage_offset_dim,
"dst_storage_offset", ctx.stream())
.input_no_contiguous(src, "src")
.input(src->ne, ACL_INT64, 1, size_stride_dim, "src_size", ctx.stream())
.input(src_stride, ACL_INT64, 1, size_stride_dim, "src_stride",
ctx.stream())
.input(storage_offset, ACL_INT64, 1, storage_offset_dim,
"src_storage_offset", ctx.stream())
.output(dst, "dst")
.run(ctx.stream());
//aclrtSynchronizeStream(ctx.stream());
}

void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src = dst->src[0];
int64_t paddings[] = {
0, dst->ne[3] - src->ne[3], 0, dst->ne[2] - src->ne[2],
0, dst->ne[1] - src->ne[1], 0, dst->ne[0] - src->ne[0]};
int64_t dim[] = {GGML_MAX_DIMS, 2};
OpCaller op;
op.name("Pad")
.input(src, "x")
.input(paddings, ACL_INT64, 2, dim, "paddings", ctx.stream())
.output(dst, "y")
.run(ctx.stream());
//aclrtSynchronizeStream(ctx.stream());
}
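
A note on the stride handling in ggml_cann_cont above: ggml records byte strides in tensor->nb, while the ViewCopy operator consumes element strides, hence the division by ggml_type_size. A standalone sketch of that conversion (the stride values are illustrative example numbers, not taken from this commit):

    // Byte strides (ggml's nb[]) -> element strides (what ViewCopy takes).
    #include <cstdint>
    #include <cstdio>

    int main() {
        const size_t type_size = sizeof(float);        // assume an fp32 tensor
        const size_t nb[4] = {12, 4, 48, 48};          // byte strides of a transposed 3x4 view
        int64_t stride[4];
        for (int i = 0; i < 4; i++) {
            stride[i] = (int64_t)(nb[i] / type_size);  // -> 3, 1, 12, 12
        }
        for (int i = 0; i < 4; i++) {
            printf("stride[%d] = %lld\n", i, (long long)stride[i]);
        }
        return 0;
    }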
79 changes: 79 additions & 0 deletions ggml-cann/acl_ops.h
@@ -0,0 +1,79 @@
#ifndef CANN_ACL_OPS
#define CANN_ACL_OPS

#include <acl/acl_op.h>
#include <acl/acl_op_compiler.h>

#include <string>
#include <vector>

#include "bcast.h"
#include "common.h"

struct OpCaller {
    std::string op_name;
    std::vector<aclTensorDesc*> input_descs;
    std::vector<aclDataBuffer*> input_buffers;
    std::vector<aclTensorDesc*> output_descs;
    std::vector<aclDataBuffer*> output_buffers;
    aclopAttr* attrs;
    std::vector<void*> ptrs;

    OpCaller();

    virtual ~OpCaller();

    OpCaller& name(std::string _op_name);

    OpCaller& input_no_contiguous(ggml_tensor* tensor, const char* name);

    OpCaller& input(ggml_tensor* tensor, const char* name);

    OpCaller& output(ggml_tensor* tensor, const char* name);

    OpCaller& attr(int64_t value, const char* name);

    OpCaller& attr(bool value, const char* name);

    OpCaller& attr(float value, const char* name);

    template <typename T>
    OpCaller& input(T* values, aclDataType dtype, size_t dims, int64_t* dim,
                    const char* name, aclrtStream stream = nullptr) {
        void* device_ptr = nullptr;
        size_t n_elem = 1;
        for (size_t i = 0; i < dims; i++) {
            n_elem *= dim[i];
        }

        size_t n_bytes = n_elem * sizeof(T);
        ACL_CHECK(aclrtMalloc(&device_ptr, n_bytes, ACL_MEM_MALLOC_HUGE_FIRST));
        ptrs.push_back(device_ptr);
        if (stream == nullptr) {
            ACL_CHECK(aclrtMemcpy(device_ptr, n_bytes, values, n_bytes,
                                  ACL_MEMCPY_HOST_TO_DEVICE));
        } else {
            ACL_CHECK(aclrtMemcpyAsync(device_ptr, n_bytes, values, n_bytes,
                                       ACL_MEMCPY_HOST_TO_DEVICE, stream));
        }

        aclTensorDesc* tensor_desc =
            aclCreateTensorDesc(dtype, dims, dim, ACL_FORMAT_ND);
        aclSetTensorDescName(tensor_desc, name);
        input_descs.push_back(tensor_desc);
        aclDataBuffer* data_buffer = aclCreateDataBuffer(device_ptr, n_bytes);
        input_buffers.push_back(data_buffer);

        return *this;
    }

    OpCaller& run(aclrtStream stream = nullptr);
};

void ggml_cann_cont(ggml_backend_cann_context& ctx, ggml_tensor* dst);

void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst);

void ggml_cann_upscale(ggml_backend_cann_context& ctx, ggml_tensor* dst);

#endif  // CANN_ACL_OPS
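
Usage note: OpCaller is a fluent builder, meant to be chained and executed in one expression, exactly as ggml_cann_pad does above. A minimal sketch (src, dst, and stream stand for a valid source tensor, destination tensor, and initialized aclrtStream; this snippet is illustrative and not part of the commit):

    // Pad src into dst with 2 trailing elements on dim 0 (illustrative).
    int64_t paddings[] = {0, 0, 0, 0, 0, 0, 0, 2};  // {before, after} per dim, dim 3 -> dim 0
    int64_t dim[] = {GGML_MAX_DIMS, 2};
    OpCaller op;
    op.name("Pad")                                   // CANN operator type
        .input(src, "x")                             // contiguous device tensor
        .input(paddings, ACL_INT64, 2, dim, "paddings", stream)
        .output(dst, "y")
        .run(stream);                                // compile + launch on stream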