Skip to content

Commit f1bde5d

Browse files
committed
add ascend kernel compile struct
1 parent 3528399 commit f1bde5d

11 files changed

+384
-31
lines changed

CMakeLists.txt

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -822,20 +822,37 @@ if (LLAMA_CANN)
822822
endif()
823823
endif()
824824
825-
# Set headers
826-
set(CANN_INCLUDE_DIRS "${CANN_INSTALL_DIR}/include" "${CANN_INSTALL_DIR}/include/aclnn")
825+
# Set header and libs
826+
if(LLAMA_CANN)
827+
set(CANN_INCLUDE_DIRS
828+
${CANN_INSTALL_DIR}/include
829+
${CANN_INSTALL_DIR}/include/aclnn
830+
${CANN_INSTALL_DIR}/acllib/include
831+
)
832+
833+
# TODO: find libs
834+
link_directories(
835+
${CANN_INSTALL_DIR}/lib64
836+
${CANN_INSTALL_DIR}/aarch64-linux/simulator/Ascend910B1/lib
837+
${CANN_INSTALL_DIR}/../8.0.RC1.alpha003/tools/tikicpulib/lib/Ascend910B1)
827838
828-
# Set libs
829-
if (LLAMA_CANN)
830-
# Build Ascendc kernels.
831839
add_subdirectory(ggml-cann/kernels)
832-
list(APPEND CANN_LIBRARIES ascendcl nnopbase opapi acl_op_compiler ascendc_kernels)
833-
LINK_DIRECTORIES(${LINK_DIRECTORIES} ${CANN_INSTALL_DIR}/lib64)
840+
list(APPEND CANN_LIBRARIES
841+
ascendcl
842+
nnopbase
843+
opapi
844+
acl_op_compiler
845+
cann_kernels
846+
)
834847
835848
set(GGML_HEADERS_CANN ggml-cann.h)
836849
file(GLOB GGML_SOURCES_CUDA "ggml-cann/*.cpp")
837850
list(APPEND GGML_SOURCES_CANN "ggml-cann.cpp")
838-
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${CANN_LIBRARIES})
851+
852+
message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
853+
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
854+
855+
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${CANN_LIBRARIES} )
839856
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS})
840857
add_compile_definitions(GGML_USE_CANN)
841858
endif()

ggml-cann.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -431,7 +431,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
431431
ggml_cann_clamp(ctx, dst);
432432
break;
433433
case GGML_OP_CPY:
434-
return false;
434+
ggml_cann_cpy(ctx, dst);
435+
break;
435436
case GGML_OP_CONT:
436437
ggml_cann_dup(ctx, dst);
437438
break;
@@ -664,8 +665,8 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
664665
case GGML_OP_MUL_MAT_ID:
665666
// embedding
666667
case GGML_OP_GET_ROWS:
667-
case GGML_OP_CPY:
668668
return false;
669+
case GGML_OP_CPY:
669670
case GGML_OP_DUP:
670671
case GGML_OP_REPEAT:
671672
case GGML_OP_CONCAT:
@@ -847,7 +848,7 @@ extern "C" GGML_CALL int ggml_backend_cann_reg_devices();
847848

848849
GGML_CALL int ggml_backend_cann_reg_devices() {
849850
ACL_CHECK(aclInit(nullptr));
850-
uint32_t device_count = ggml_backend_cann_get_device_count();
851+
uint32_t device_count = 1;//= ggml_backend_cann_get_device_count();
851852
// initialization
852853
for (uint32_t i = 0; i < device_count; i++) {
853854
char name[128];

ggml-cann/aclnn_ops.cpp

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -766,12 +766,7 @@ void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
766766
ACL_CHECK(aclDestroyIntArray(dilations));
767767
}
768768

769-
void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
770-
ggml_tensor* src = dst->src[0];
771-
772-
aclTensor* acl_src = create_acl_tensor(src);
773-
aclTensor* acl_dst = create_acl_tensor(dst);
774-
769+
void cann_copy(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclTensor* acl_src, aclTensor* acl_dst) {
775770
uint64_t workspaceSize = 0;
776771
aclOpExecutor* executor;
777772
void* workspaceAddr = nullptr;
@@ -786,6 +781,16 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
786781
aclrtStream stream = ctx.stream();
787782
ACL_CHECK(aclnnInplaceCopy(workspaceAddr, workspaceSize, executor, stream));
788783

784+
}
785+
786+
void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
787+
ggml_tensor* src = dst->src[0];
788+
789+
aclTensor* acl_src = create_acl_tensor(src);
790+
aclTensor* acl_dst = create_acl_tensor(dst);
791+
792+
cann_copy(ctx, dst, acl_src, acl_dst);
793+
789794
ACL_CHECK(aclDestroyTensor(acl_src));
790795
ACL_CHECK(aclDestroyTensor(acl_dst));
791796
}
@@ -1583,4 +1588,23 @@ void ggml_cann_alibi(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
15831588
ACL_CHECK(aclDestroyTensor(tmp_mk_tensor));
15841589
ACL_CHECK(aclDestroyTensor(tmp_arange3_tensor));
15851590
ACL_CHECK(aclDestroyTensor(tmp_output_tensor));
1591+
}
1592+
1593+
void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
1594+
ggml_tensor* src = dst->src[0];
1595+
1596+
aclTensor* acl_src = create_acl_tensor(src);
1597+
aclTensor* acl_dst = create_acl_tensor(dst);
1598+
1599+
if(!ggml_is_quantized(dst->type)) {
1600+
cann_copy(ctx, dst, acl_src, acl_dst);
1601+
} else {
1602+
uint8_t* size = (uint8_t*)ctx.alloc_buffer(dst, sizeof(size_t));
1603+
size_t ne = ggml_nelements(src);
1604+
aclrtMemcpy(size, sizeof(size_t), &ne, sizeof(size_t), ACL_MEMCPY_HOST_TO_DEVICE);
1605+
size_t ne1;
1606+
aclrtMemcpy(&ne1, sizeof(size_t), size, sizeof(size_t), ACL_MEMCPY_DEVICE_TO_HOST);
1607+
1608+
cann_quantize_q4_0(1, nullptr, (uint8_t*)src->data, (uint8_t*)dst->data, size);
1609+
}
15861610
}

ggml-cann/aclnn_ops.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* d
6666

6767
void ggml_cann_alibi(ggml_backend_cann_context& ctx, ggml_tensor* dst);
6868

69+
void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst);
70+
6971
template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
7072
aclTensor*, uint64_t*, aclOpExecutor**),
7173
aclnnStatus execute(void*, uint64_t, aclOpExecutor*, aclrtStream)>

ggml-cann/kernels/CMakeLists.txt

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,46 @@
1-
if(NOT SOC_VERSION)
2-
set(SOC_VERSION "ascend910b3")
1+
if (NOT SOC_TYPE)
2+
set (SOC_TYPE "Ascend910B3")
33
endif()
4-
set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
5-
set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim/cpu")
64

7-
if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
8-
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
9-
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
10-
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
5+
file(GLOB SRC_FILES
6+
dequantize_q4_0.cpp
7+
quantize_q4_0.cpp
8+
)
9+
10+
string(TOLOWER "${CMAKE_BUILD_TYPE}" lowercase_CMAKE_BUILD_TYPE)
11+
if(${lowercase_CMAKE_BUILD_TYPE} STREQUAL "debug")
12+
if (NOT DEFINED ENV{CMAKE_PREFIX_PATH})
13+
set(CMAKE_PREFIX_PATH ${CANN_INSTALL_DIR}/tools/tikicpulib/lib/cmake)
14+
endif()
15+
16+
find_package(tikicpulib REQUIRED)
17+
add_library(cann_kernels ${SRC_FILES} ascendc_kernels.cpp)
18+
target_link_libraries(cann_kernels PRIVATE
19+
ascendcl
20+
tikicpulib::ascend910B1
21+
)
22+
23+
target_compile_features(cann_kernels PRIVATE cxx_std_17)
24+
1125
else()
12-
message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
13-
endif()
26+
string(TOLOWER ${SOC_TYPE} SOC_VERSION)
27+
set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
28+
set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
1429

15-
include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
30+
if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
31+
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
32+
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
33+
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
34+
else()
35+
message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
36+
endif()
37+
include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
1638

17-
ascendc_library(ascendc_kernels STATIC
18-
threshold_opencv_kernel.cpp
19-
)
39+
ascendc_library(ascendc_kernels STATIC
40+
${SRC_FILES}
41+
)
42+
#ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
43+
44+
add_library(cann_kernels STATIC ascendc_kernels.cpp)
45+
target_link_libraries(cann_kernels PUBLIC ascendc_kernels)
46+
endif()

ggml-cann/kernels/ascendc_kernels.cpp

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#include "ascendc_kernels.h"
2+
3+
#ifdef __CCE_KT_TEST__
4+
#include "tikicpulib.h"
5+
#else
6+
#include "aclrtlaunch_ascendc_dequantize_q4_0.h"
7+
#include "aclrtlaunch_ascendc_quantize_q4_0.h"
8+
#endif
9+
10+
11+
#ifdef __CCE_KT_TEST__
12+
#include <acl/acl.h>
13+
14+
uint8_t* to_gm(uint8_t* ptr, size_t size) {
15+
uint8_t* gm = (uint8_t*)AscendC::GmAlloc(size);
16+
aclrtMemcpy(gm, size, ptr, size, ACL_MEMCPY_DEVICE_TO_HOST);
17+
return gm;
18+
}
19+
20+
void free_gm(uint8_t* ptr) {
21+
aclrtFree(ptr);
22+
}
23+
24+
extern "C" __global__ __aicore__ void ascendc_dequantize_q4_0(GM_ADDR x, GM_ADDR y, GM_ADDR size);
25+
extern "C" __global__ __aicore__ void ascendc_quantize_q4_0(GM_ADDR x, GM_ADDR y, GM_ADDR size);
26+
#endif
27+
28+
void cann_dequantize_q4_0(uint32_t block_dim, void* stream, uint8_t* x, uint8_t* y, uint8_t* size) {
29+
#ifdef __CCE_KT_TEST__
30+
uint8_t* size_host = to_gm(size, sizeof(size_t));
31+
uint8_t* x_host = to_gm(x, *((size_t*)size_host));
32+
uint8_t* y_host = to_gm(y, *((size_t*)size_host));
33+
AscendC::SetKernelMode(KernelMode::AIV_MODE);
34+
ICPU_RUN_KF(ascendc_dequantize_q4_0, 1, x_host, y_host, size_host);
35+
free_gm(size_host);
36+
free_gm(x_host);
37+
free_gm(y_host);
38+
#else
39+
aclrtlaunch_ascendc_dequantize_q4_0(block_dim, stream, x, y, size);
40+
#endif
41+
}
42+
43+
void cann_quantize_q4_0(uint32_t block_dim, void* stream, uint8_t* x, uint8_t* y, uint8_t* size) {
44+
#ifdef __CCE_KT_TEST__
45+
uint8_t* size_host = to_gm(size, sizeof(size_t));
46+
uint8_t* x_host = to_gm(x, *((size_t*)size_host));
47+
uint8_t* y_host = to_gm(y, *((size_t*)size_host));
48+
AscendC::SetKernelMode(KernelMode::AIV_MODE);
49+
ICPU_RUN_KF(ascendc_quantize_q4_0, 1, x_host, y_host, size_host);
50+
free_gm(size_host);
51+
free_gm(x_host);
52+
free_gm(y_host);
53+
#else
54+
aclrtlaunch_ascendc_quantize_q4_0(block_dim, stream, x, y, size);
55+
#endif
56+
}

ggml-cann/kernels/ascendc_kernels.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,10 @@
11
#ifndef ASCENDC_KERNELS_H
22
#define ASCENDC_KERNELS_H
33

4+
5+
#include <stdint.h>
6+
7+
void cann_dequantize_q4_0(uint32_t block_dim, void* stream, uint8_t* x, uint8_t* y, uint8_t* size);
8+
void cann_quantize_q4_0(uint32_t block_dim, void* stream, uint8_t* x, uint8_t* y, uint8_t* size);
9+
410
#endif //ASCENDC_KERNELS_H

ggml-cann/kernels/dequantize_q4_0.cpp

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
#include "dequantize_q4_0.h"
2+
3+
using namespace AscendC;
4+
5+
#define BUFFER_NUM 2
6+
7+
__aicore__ inline int32_t align_ceil(int32_t n, int32_t align) { return ((n + align) & ~(align-1)); }
8+
9+
__aicore__ inline int32_t align_floor(int32_t n, int32_t align) { return (n & ~(align-1)); }
10+
11+
12+
#define QK4_0 32
13+
typedef struct {
14+
uint16_t d; // delta
15+
uint8_t qs[QK4_0 / 2]; // nibbles / quants
16+
} block_q4_0;
17+
18+
class KernelDequantizeQ4_0
19+
{
20+
public:
21+
__aicore__ inline KernelDequantizeQ4_0() {}
22+
__aicore__ inline void init(GM_ADDR x, GM_ADDR y, size_t size) {
23+
uint64_t src_block_size =
24+
align_ceil(size / GetBlockNum(), sizeof(block_q4_0));
25+
uint64_t src_offset = GetBlockIdx() * src_block_size;
26+
src_block_size =
27+
(src_offset + src_block_size > (size / 32 * sizeof(block_q4_0)))
28+
? (size / 32 * sizeof(block_q4_0) - src_offset)
29+
: src_block_size;
30+
uint64_t dst_block_size =
31+
align_ceil(size / GetBlockNum(), QK4_0 * sizeof(float));
32+
uint64_t dst_offset = GetBlockIdx() * dst_block_size;
33+
dst_block_size =
34+
(dst_offset + dst_block_size > size * sizeof(float))
35+
? (size * sizeof(float) - dst_offset)
36+
: dst_block_size;
37+
38+
xGM.SetGlobalBuffer((__gm__ int4b_t*)x + src_offset, src_block_size);
39+
yGM.SetGlobalBuffer((__gm__ float*)y + dst_offset, dst_block_size);
40+
41+
pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t));
42+
// Ascendc do not support cast int4b_t -> float, but support int4b_t ->
43+
// half -> float.
44+
pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half));
45+
pipe.InitBuffer(copy_queue, BUFFER_NUM, QK4_0 * sizeof(float));
46+
pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float));
47+
}
48+
49+
__aicore__ inline void copy_in(uint32_t offset) {
50+
LocalTensor<int4b_t> x_local = input_queue.AllocTensor<int4b_t>();
51+
// offset + 2 to skip scale.
52+
DataCopy(x_local, xGM[offset + 2], QK4_0);
53+
input_queue.EnQue(x_local);
54+
}
55+
56+
__aicore__ inline void copy_out(uint32_t offset) {
57+
LocalTensor<float> y_local = output_queue.DeQue<float>();
58+
DataCopy(yGM[offset], y_local, QK4_0);
59+
output_queue.FreeTensor(y_local);
60+
}
61+
62+
__aicore__ inline void calculate(uint32_t offset, uint32_t len) {
63+
copy_in(offset);
64+
65+
LocalTensor<int4b_t> x_local = input_queue.DeQue<int4b_t>();
66+
LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
67+
LocalTensor<float> copy_local = copy_queue.AllocTensor<float>();
68+
LocalTensor<float> y_local = output_queue.AllocTensor<float>();
69+
70+
Cast(x_local, cast_local, RoundMode::CAST_NONE, QK4_0);
71+
Cast(cast_local, copy_local, RoundMode::CAST_NONE, QK4_0);
72+
73+
74+
}
75+
76+
__aicore__ inline void run() {
77+
calculate(0, 10);
78+
}
79+
80+
private:
81+
uint64_t block_size;
82+
uint64_t offset;
83+
84+
TPipe pipe;
85+
GlobalTensor<int4b_t> xGM;
86+
GlobalTensor<float> yGM;
87+
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
88+
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
89+
TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
90+
TQue<QuePosition::VECIN, BUFFER_NUM> copy_queue;
91+
};
92+
93+
extern "C" __global__ __aicore__ void ascendc_dequantize_q4_0(GM_ADDR x, GM_ADDR y, GM_ADDR size)
94+
{
95+
size_t size_ub;
96+
auto size_gm_ptr = (__gm__ uint8_t*)size;
97+
auto size_ub_ptr = (uint8_t*)&size_ub;
98+
99+
for (int32_t i = 0; i < sizeof(size_t) / sizeof(uint8_t);
100+
++i, ++size_gm_ptr, ++size_ub_ptr)
101+
{
102+
*size_ub_ptr = *size_gm_ptr;
103+
}
104+
105+
KernelDequantizeQ4_0 dequantize_q4_0;
106+
dequantize_q4_0.init(x, y, size_ub);
107+
dequantize_q4_0.run();
108+
}

ggml-cann/kernels/dequantize_q4_0.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#ifndef DEQUANTIZE_Q4_0_H
2+
#define DEQUANTIZE_Q4_0_H
3+
4+
#include "ascendc_kernels.h"
5+
#include "kernel_operator.h"
6+
7+
#endif //DEQUANTIZE_Q4_0_H

0 commit comments

Comments
 (0)