From f0e09002c366dd7d63f6794a8377cd27c215e6c0 Mon Sep 17 00:00:00 2001 From: shanshan shen Date: Fri, 22 Nov 2024 09:28:44 +0000 Subject: [PATCH 1/5] improve inferencing performance for ascend npu. Co-authored-by: Frank Mai --- ggml/src/ggml-cann/aclnn_ops.cpp | 344 ++++++++++++++++++++++++++++++- ggml/src/ggml-cann/common.h | 9 +- ggml/src/ggml-cann/ggml-cann.cpp | 132 ++++++------ 3 files changed, 413 insertions(+), 72 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index a4ec8418e2ab3..8b6e2345113b7 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include #include #include @@ -2407,7 +2409,6 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input, aclTensor* acl_weight, aclTensor* acl_dst) { int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is // fp32, atlas a2 will transpose it to HFLOAT32. - uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -2425,6 +2426,80 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input, aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream())); } +/** + * @brief Performs matrix multiplication of two 2D tensors. + * + * This function computes the matrix multiplication of the input tensor + * `acl_input` and the weight tensor `acl_weight`, and stores the result in the + * destination tensor `acl_dst`. + * The operation is defined as: + * \f[ + * \text {acl_dst}=\text {acl_input@acl_weight} + * \f] + * + * @param ctx The context for the CANN backend operations. + * @param acl_input The input tensor for the matrix multiplication. + * @param acl_weight The weight tensor for the matrix multiplication. + * @param acl_dst The destination tensor where the result of the matrix + * multiplication will be stored. + */ +static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_input, + aclTensor* acl_weight, aclTensor* acl_dst) { + int8_t cube_math_type = 2; + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst, + cube_math_type, &workspaceSize, + &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +/** + * @brief Performs matrix multiplication of two 3D tensors. + * + * This function computes the matrix multiplication of the input tensor + * `acl_input` and the weight tensor `acl_weight`, and stores the result in the + * destination tensor `acl_dst`. + * The operation is defined as: + * \f[ + * \text {acl_dst}=\text {acl_input@acl_weight} + * \f] + * + * @param ctx The context for the CANN backend operations. + * @param acl_input The input tensor for the matrix multiplication. + * @param acl_weight The weight tensor for the matrix multiplication. + * @param acl_dst The destination tensor where the result of the matrix + * multiplication will be stored. 
+ */ +static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_input, + aclTensor* acl_weight, aclTensor* acl_dst) { + int8_t cube_math_type = 2; + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst, + cube_math_type, &workspaceSize, + &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + /** * @brief Performs matrix multiplication with floating-point precision on * tensors using the CANN backend. @@ -2466,6 +2541,70 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, ACL_CHECK(aclDestroyTensor(acl_dst)); } +/** + * @brief Performs matrix multiplication with floating-point precision on + * tensors using the CANN backend. + * + * This function performs matrix multiplication of the input tensor and the + * weight tensor, handling broadcasting and transposing as needed, and stores + * the result in the destination tensor `dst`. + * + * @param ctx The context for the CANN backend operations. + * @param dst The destination tensor where the result of the matrix + * multiplication will be stored. + */ +static void ggml_cann_mat_mul_fp2(ggml_backend_cann_context& ctx, + ggml_tensor* dst) { + ggml_tensor* weight = dst->src[0]; // weight + ggml_tensor* input = dst->src[1]; // input + + // when weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize will auto + // broadcast, when weight ne2 or ne3 is not 1, weight need repeat. + BCAST_MUL_MAT_SHAPE(input, weight, dst); + + int64_t n_dims = bcast_dims; + if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) { + if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) { + n_dims = 2; + } else if (bcast_input_ne[2] == 1) { + n_dims = 3; + } + } + + aclTensor* acl_input_tensor = + ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims); + int64_t transpose_ne[] = { + bcast_weight_ne[1], bcast_weight_ne[0], + bcast_weight_ne[2], bcast_weight_ne[3], + bcast_weight_ne[4], bcast_weight_ne[5] + }; + size_t transpose_nb[] = { + bcast_weight_nb[1], bcast_weight_nb[0], + bcast_weight_nb[2], bcast_weight_nb[3], + bcast_weight_nb[4], bcast_weight_nb[5] + }; + aclTensor* acl_weight_tensor = + ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims); + aclTensor* acl_dst = + ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims); + + switch (n_dims) { + case 2: + aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); + break; + case 3: + aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); + break; + default: + aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); + break; + } + + ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + /** * @brief Performs matrix multiplication with quantized weights and * floating-point inputs using the CANN backend. @@ -2618,16 +2757,215 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, ACL_CHECK(aclDestroyTensor(acl_dst_tensor)); } +/** + * @brief Performs matrix multiplication with quantized weights and + * floating-point inputs using the CANN backend. 
+ * + * This function performs matrix multiplication of the input tensor `src1` and + * the weight tensor `src0`, handling broadcasting, transposing, and + * quantization as needed, and stores the result in the destination tensor + * `dst`. + * + * @param ctx The context for the CANN backend operations. + * @param dst The destination tensor where the result of the matrix + * multiplication will be stored. + */ +static void ggml_cann_mul_mat_quant2(ggml_backend_cann_context& ctx, + ggml_tensor* dst, + const enum ggml_type type) { + ggml_tensor* src0 = dst->src[0]; // weight + ggml_tensor* src1 = dst->src[1]; // input + + // The shape of the weight is NCHW. + // Matrix multiplication uses HW dims. + // HC is regarded as batch. + // weight need transpose. + float weight_elem_size; + if (type == GGML_TYPE_Q4_0) { + weight_elem_size = float(sizeof(uint8_t)) / 2; + } else if (type == GGML_TYPE_Q8_0) { + weight_elem_size = float(sizeof(uint8_t)); + } else { + GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT"); + } + float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size}; + size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size; + size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3]; + + // scale stored at the end of weight. + // scale need transpose. + size_t scale_elem_size = sizeof(uint16_t); + size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size}; + size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size; + char* scale_offset = (char*)src0->data + weight_size; + + // input + size_t input_elem_size = sizeof(uint16_t); + int64_t input_ne[] = {src1->ne[0], src1->ne[1]}; + size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size}; + size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size; + ggml_cann_pool_alloc input_alloctor(ctx.pool()); + void* input_buffer = src1->data; + + // case in + if (src1->type != GGML_TYPE_F16) { + aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1); + input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size); + + int64_t* input_cast_ne = src1->ne; + size_t input_cast_nb[GGML_MAX_DIMS]; + input_cast_nb[0] = sizeof(uint16_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1]; + } + + aclTensor* acl_input_tensor = ggml_cann_create_tensor( + input_buffer, + ACL_FLOAT16, + input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS); + aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16); + + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + ACL_CHECK(aclDestroyTensor(acl_src1_tensor)); + } + + // output + size_t output_elem_size = sizeof(uint16_t); + size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size}; + ggml_cann_pool_alloc output_allocator(ctx.pool()); + void* output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size); + size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size; + + // aclnn + int64_t max_elem_size = 65535; + int64_t split_size = (src0->ne[1] / max_elem_size) + 1; + ggml_cann_pool_alloc workspace_allocator(ctx.pool()); + aclOpExecutor* executor = nullptr; + uint64_t workspaceSize = 0; + void* workspaceAddr = nullptr; + for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) { + for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) { + int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]); + int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]); + + int64_t batch1 = (n1 * src1->ne[2]) + c1; + int64_t batch0 = (n0 * src0->ne[2]) + c0; + + aclTensor* 
acl_input_tensor = ggml_cann_create_tensor( + (char*)input_buffer + batch1 * input_stride, + ACL_FLOAT16, + input_elem_size, input_ne, input_nb, 2); + + // first split + int64_t weight_ne_offset = 0; + int64_t weight_ne[2] = {max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0]}; + int64_t scale_ne_offset = 0; + int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0}; + int64_t output_ne_offset = 0; + int64_t output_ne[2] = {weight_ne[0], dst->ne[1]}; + + aclTensor* acl_weight_tensor = ggml_cann_create_tensor( + (char*)src0->data + batch0 * weight_stride, + ggml_cann_type_mapping(type), + weight_elem_size, weight_ne, weight_nb, 2, + ACL_FORMAT_ND, weight_ne_offset); + aclTensor* acl_scale_tensor = ggml_cann_create_tensor( + scale_offset + batch0 * scale_stride, + ACL_FLOAT16, + scale_elem_size, scale_ne, scale_nb, 2, + ACL_FORMAT_ND, scale_ne_offset); + aclTensor* acl_output_tensor = ggml_cann_create_tensor( + (char*)output_buffer + batch1 * output_stride, + ACL_FLOAT16, + output_elem_size, output_ne, output_nb, 2, + ACL_FORMAT_ND, output_ne_offset); + + ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( + acl_input_tensor, acl_weight_tensor, acl_scale_tensor, + nullptr, nullptr, nullptr, nullptr, QK8_0, + acl_output_tensor, &workspaceSize, &executor)); + if (workspaceAddr == nullptr) { + workspaceAddr = workspace_allocator.alloc(workspaceSize); + } + ACL_CHECK(aclnnWeightQuantBatchMatmulV2( + workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); + ACL_CHECK(aclDestroyTensor(acl_scale_tensor)); + ACL_CHECK(aclDestroyTensor(acl_output_tensor)); + + // other splits + for (int64_t split = 1; split < split_size; split++) { + weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1]; + weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] ? 
src0->ne[1] - (max_elem_size * split) : max_elem_size; + scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1]; + scale_ne[0] = weight_ne[0]; + output_ne_offset += output_elem_size * output_ne[0] * output_ne[1]; + output_ne[0] = weight_ne[0]; + + acl_weight_tensor = ggml_cann_create_tensor( + (char*)src0->data + batch0 * weight_stride, + ggml_cann_type_mapping(type), + weight_elem_size, weight_ne, weight_nb, 2, + ACL_FORMAT_ND, weight_ne_offset); + acl_scale_tensor = ggml_cann_create_tensor( + scale_offset + batch0 * scale_stride, + ACL_FLOAT16, + scale_elem_size, scale_ne, scale_nb, 2, + ACL_FORMAT_ND, scale_ne_offset); + acl_output_tensor = ggml_cann_create_tensor( + (char*)output_buffer + batch1 * output_stride, + ACL_FLOAT16, + output_elem_size, output_ne, output_nb, 2, + ACL_FORMAT_ND, output_ne_offset); + + ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( + acl_input_tensor, acl_weight_tensor, acl_scale_tensor, + nullptr, nullptr, nullptr, nullptr, QK8_0, + acl_output_tensor, &workspaceSize, &executor)); + ACL_CHECK(aclnnWeightQuantBatchMatmulV2( + workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); + ACL_CHECK(aclDestroyTensor(acl_scale_tensor)); + ACL_CHECK(aclDestroyTensor(acl_output_tensor)); + } + + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + } + } + + // cast out + if (dst->type != GGML_TYPE_F16) { + int64_t* output_cast_ne = dst->ne; + size_t output_cast_nb[GGML_MAX_DIMS]; + output_cast_nb[0] = sizeof(uint16_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1]; + } + + aclTensor* acl_output_tensor = ggml_cann_create_tensor( + output_buffer, + ACL_FLOAT16, + output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS); + aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst); + aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type)); + + ACL_CHECK(aclDestroyTensor(acl_output_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst_tensor)); + } +} + void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { const enum ggml_type type = dst->src[0]->type; switch (type) { case GGML_TYPE_F32: case GGML_TYPE_F16: - ggml_cann_mat_mul_fp(ctx, dst); + ggml_cann_mat_mul_fp2(ctx, dst); break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: - ggml_cann_mul_mat_quant(ctx, dst, type); + ggml_cann_mul_mat_quant2(ctx, dst, type); break; default: GGML_ABORT("fatal error"); diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index edfa496148ff2..5164cb74ec92e 100644 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -211,17 +211,20 @@ struct ggml_cann_pool_alloc { struct ggml_backend_cann_context { int32_t device; /**< Device ID. */ std::string name; /**< Name of the device. */ + std::string description; /**< Description of the device. */ aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */ - aclrtStream streams[GGML_CANN_MAX_STREAMS] = { - {nullptr}}; /**< Array of streams for the device. */ + aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */ /** * @brief Constructor for initializing the context with a given device. * @param device Device ID. 
*/ explicit ggml_backend_cann_context(int device) - : device(device), name("CANN" + std::to_string(device)) {} + : device(device), name("CANN" + std::to_string(device)) { + ggml_cann_set_device(device); + description = aclrtGetSocName(); + } /** * @brief Destructor for cleaning up resources. diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 776340881434d..a9a1cfb6356c2 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -122,6 +122,10 @@ static ggml_cann_device_info ggml_cann_init() { ACL_CHECK(aclrtMemGetAllocationGranularity( &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED, &info.devices[id].vmm_granularity)); + + size_t free, total; + ggml_backend_cann_get_device_memory(id, &free, &total); + info.devices[id].total_vram = free; } // TODO: add more device info later. @@ -208,6 +212,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool { * @return A pointer to the allocated buffer. */ void* alloc(size_t size, size_t* actual_size) override { + const size_t alignment = 128; + size = GGML_PAD(size, alignment); + if (size == 0) { + size = alignment; + } #ifdef DEBUG_CANN_MALLOC int nnz = 0; size_t max_size = 0; @@ -246,13 +255,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool { return ptr; } void* ptr; - size_t look_ahead_size = (size_t)(1.05 * size); - look_ahead_size = 256 * ((look_ahead_size + 255) / 256); ggml_cann_set_device(device); ACL_CHECK( - aclrtMalloc(&ptr, look_ahead_size, ACL_MEM_MALLOC_HUGE_FIRST)); - *actual_size = look_ahead_size; - pool_size += look_ahead_size; + aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST)); + *actual_size = size; + pool_size += size; #ifdef DEBUG_CANN_MALLOC GGML_LOG_INFO( "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, " @@ -294,9 +301,9 @@ struct ggml_cann_pool_leg : public ggml_cann_pool { */ struct ggml_cann_pool_vmm : public ggml_cann_pool { /** - * @brief The maximum size of the virtual memory pool (32 GB). + * @brief The maximum size of the virtual memory pool. */ - static const size_t CANN_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB + size_t max_size; /** * @brief The device ID associated with this buffer pool. @@ -334,6 +341,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { std::vector map_offsets; /** + * @brief Constructor to initialize the buffer pool with virtual memory for * @brief Constructor to initialize the buffer pool with virtual memory for * a specific device. * @@ -341,7 +349,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { */ explicit ggml_cann_pool_vmm(int device) : device(device), - granularity(ggml_cann_info().devices[device].vmm_granularity) {} + granularity(ggml_cann_info().devices[device].vmm_granularity) { + auto dev = ggml_cann_info().devices[device]; + granularity = dev.vmm_granularity; + max_size = dev.total_vram; + } /** * @brief Destructor to free all buffers in the virtual memory pool. 
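A note on the pool changes in this file: patch 1 makes both pools round every request up to a 128-byte alignment via GGML_PAD (bumping zero-sized requests to one alignment unit), and the VMM pool in the next hunk grows only by whole granularity units. Below is a minimal standalone sketch of that rounding arithmetic, not code from the patch: pad_to stands in for GGML_PAD, and the 2 MiB granularity is a made-up placeholder.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdio>

// Round x up to the next multiple of n; stands in for ggml's GGML_PAD.
static size_t pad_to(size_t x, size_t n) { return (x + n - 1) / n * n; }

int main() {
    const size_t alignment   = 128;      // per-allocation alignment in the patch
    const size_t granularity = 2 << 20;  // hypothetical VMM mapping granularity

    // Per-allocation rule: pad to 128 bytes, and never return a 0-byte block.
    size_t size = pad_to(/*request=*/0, alignment);
    if (size == 0) size = alignment;
    assert(size == alignment);

    // Pool-growth rule: reserve only the shortfall, rounded up to the
    // physical-memory mapping granularity.
    size_t pool_size = 0, pool_used = 0;
    size_t need  = 3 << 20;  // 3 MiB request against an empty pool
    size_t avail = pool_size - pool_used;
    if (need > avail) {
        size_t reserve = pad_to(need - avail, granularity);
        pool_size += reserve;
    }
    std::printf("pool grew to %zu MiB\n", pool_size >> 20);  // 4: two 2 MiB mappings
    return 0;
}
```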
@@ -370,17 +382,19 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { // round up the allocation size to the alignment to ensure that all // allocations are aligned for all data types const size_t alignment = 128; - size = alignment * ((size + alignment - 1) / alignment); + size = GGML_PAD(size, alignment); + if (size == 0) { + size = alignment; + } size_t avail = pool_size - pool_used; if (size > avail) { // round up to the next multiple of the granularity size_t reserve_size = size - avail; - reserve_size = - granularity * ((reserve_size + granularity - 1) / granularity); + reserve_size = GGML_PAD(reserve_size, granularity); - GGML_ASSERT(pool_size + reserve_size <= CANN_POOL_VMM_MAX_SIZE); + GGML_ASSERT(pool_size + reserve_size <= max_size); // allocate more physical memory aclrtPhysicalMemProp prop = {}; @@ -396,7 +410,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { // reserve virtual address space (if not already reserved) if (pool_addr == 0) { ACL_CHECK(aclrtReserveMemAddress( - &pool_addr, CANN_POOL_VMM_MAX_SIZE, 0, NULL, 1)); + &pool_addr, max_size, 0, NULL, 1)); } // map at the end of the pool @@ -409,10 +423,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { // add to the pool pool_size += reserve_size; - // GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB ( - // reserved %llu MB)\n", - // device, (unsigned long long) (pool_size/1024/1024), - // (unsigned long long) (reserve_size/1024/1024)); +#ifdef DEBUG_CANN_MALLOC + GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n", + device, (unsigned long long) (pool_size/1024/1024), + (unsigned long long) (reserve_size/1024/1024)); +#endif } GGML_ASSERT(pool_addr != 0); @@ -457,8 +472,10 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { */ std::unique_ptr ggml_backend_cann_context::new_pool_for_device( int device) { - // return std::unique_ptr(new ggml_cann_pool_leg(device)); - return std::unique_ptr(new ggml_cann_pool_vmm(device)); + if (device == 0) { + return std::unique_ptr(new ggml_cann_pool_vmm(device)); + } + return std::unique_ptr(new ggml_cann_pool_leg(device)); } // cann buffer @@ -470,23 +487,22 @@ std::unique_ptr ggml_backend_cann_context::new_pool_for_device( */ struct ggml_backend_cann_buffer_context { int32_t device; ///< The device ID associated with this buffer context. - void* dev_ptr = - nullptr; ///< Pointer to the device memory allocated for the buffer. + ggml_cann_pool_alloc* alloc; ///< Pointer to the device memory allocated for the buffer. /** * @brief Constructor to initialize the CANN buffer context. * * @param device The device ID associated with this buffer context. - * @param dev_ptr Pointer to the device memory allocated for the buffer. + * @param alloc Pointer to the device memory allocated for the buffer. */ - ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr) + ggml_backend_cann_buffer_context(int32_t device, ggml_cann_pool_alloc* alloc) : device(device), - dev_ptr(dev_ptr) {} + alloc(alloc) {} /** * @brief Destructor to free the device memory allocated for the buffer. 
*/ - ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); } + ~ggml_backend_cann_buffer_context() { delete alloc; } }; /** @@ -532,7 +548,7 @@ static void* ggml_backend_cann_buffer_get_base( ggml_backend_buffer_t buffer) { ggml_backend_cann_buffer_context* ctx = (ggml_backend_cann_buffer_context*)buffer->context; - return ctx->dev_ptr; + return ctx->alloc->get(); } /** @@ -939,7 +955,7 @@ static void ggml_backend_cann_buffer_clear( (ggml_backend_cann_buffer_context*)buffer->context; ggml_cann_set_device(ctx->device); - ACL_CHECK(aclrtMemset(ctx->dev_ptr, buffer->size, value, buffer->size)); + ACL_CHECK(aclrtMemset(ctx->alloc->get(), buffer->size, value, buffer->size)); } /** @@ -1001,25 +1017,13 @@ static const char* ggml_backend_cann_buffer_type_name( static ggml_backend_buffer_t ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_cann_buffer_type_context* buft_ctx = - (ggml_backend_cann_buffer_type_context*)buft->context; - - ggml_cann_set_device(buft_ctx->device); - - size = std::max(size, (size_t)1); + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)buft->device->context; - void* dev_ptr; - aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST); - if (err != ACL_SUCCESS) { - GGML_LOG_ERROR( - "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n", - __func__, size / 1024.0 / 1024.0, buft_ctx->device, - aclGetRecentErrMsg()); - return nullptr; - } + ggml_cann_pool_alloc* alloc = new ggml_cann_pool_alloc(cann_ctx->pool(), size); ggml_backend_cann_buffer_context* ctx = - new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr); + new ggml_backend_cann_buffer_context(cann_ctx->device, alloc); return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface, ctx, size); @@ -1130,10 +1134,10 @@ ggml_backend_cann_buffer_type(int32_t device) { static bool ggml_backend_cann_buffer_type_initialized = false; if (!ggml_backend_cann_buffer_type_initialized) { - for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) { + for (int32_t i = 0; i < ggml_cann_info().device_count; i++) { ggml_backend_cann_buffer_types[i] = { /* .iface = */ ggml_backend_cann_buffer_type_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device), + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i), /* .context = */ new ggml_backend_cann_buffer_type_context{ i, "CANN" + std::to_string(i)}, @@ -1199,10 +1203,15 @@ static void * ggml_cann_host_malloc(size_t size) { return nullptr; } + const size_t alignment = 128; + size = GGML_PAD(size, alignment); + if (size == 0) { + size = alignment; + } + void * hostPtr = nullptr; aclError err = aclrtMallocHost((void **) &hostPtr, size); if (err != ACL_SUCCESS) { - GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0, aclGetRecentErrMsg()); return nullptr; @@ -1863,17 +1872,17 @@ struct ggml_backend_cann_device_context { }; static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) { - ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context; return ctx->name.c_str(); } static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) { - ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context; return 
ctx->description.c_str(); } static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context; ggml_backend_cann_get_device_memory(ctx->device, free, total); } @@ -1900,7 +1909,7 @@ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_back static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) { GGML_UNUSED(params); - ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context; return ggml_backend_cann_init(ctx->device); } @@ -1920,7 +1929,7 @@ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, cons static bool ggml_backend_cann_supports_buft( ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { if (ggml_backend_buft_is_cann(buft)) { - ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_context * dev_ctx = (ggml_backend_cann_context *)dev->context; ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *)buft->context; return buft_ctx->device == dev_ctx->device; @@ -1929,7 +1938,7 @@ static bool ggml_backend_cann_supports_buft( } static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) { - ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_context * ctx = (ggml_backend_cann_context*)dev->context; return ggml_backend_cann_buffer_type(ctx->device); } @@ -1950,7 +1959,7 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type( */ static ggml_backend_event_t ggml_backend_cann_device_event_new( ggml_backend_dev_t dev) { - ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_context * dev_ctx = (ggml_backend_cann_context *)dev->context; ggml_cann_set_device(dev_ctx->device); @@ -2058,11 +2067,7 @@ ggml_backend_reg_t ggml_backend_cann_reg() { ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context; for (int i = 0; i < ggml_cann_info().device_count; i++) { - ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context(); - dev_ctx->description = aclrtGetSocName(); - dev_ctx->device = i; - dev_ctx->name = GGML_CANN_NAME + std::to_string(i); - ggml_cann_set_device(i); + ggml_backend_cann_context* dev_ctx = new ggml_backend_cann_context(i); ggml_backend_dev_t dev = new ggml_backend_device { /* .interface = */ ggml_backend_cann_device_interface, /* .reg = */ ®, @@ -2090,17 +2095,12 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) { return nullptr; } - ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device); - if (ctx == nullptr) { - GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__); - return nullptr; - } - ggml_cann_set_device(ctx->device); + ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device); ggml_backend_t cann_backend = new ggml_backend{/* .guid = */ ggml_backend_cann_guid(), /* .interface = */ ggml_backend_cann_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device), - /* .context = */ ctx}; + /* .device = */ dev, + /* .context = */ dev->context}; return cann_backend; } 
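That closes out patch 1. Its central change in aclnn_ops.cpp is the rank dispatch in ggml_cann_mat_mul_fp2: when the outer broadcast dimensions are all 1, the generic aclnnMatmul is replaced by the cheaper aclnnMm (2-D) or aclnnBatchMatMul (3-D). The sketch below mirrors that selection logic in isolation; select_mat_mul_rank is a hypothetical name, and the ne arrays follow ggml's fastest-dimension-first layout.

```cpp
#include <cstdint>
#include <cstdio>

// Mirrors the n_dims selection in ggml_cann_mat_mul_fp2: collapse the
// broadcast rank when the outer batch dimensions are all 1.
static int64_t select_mat_mul_rank(const int64_t* input_ne,
                                   const int64_t* weight_ne,
                                   int64_t bcast_dims) {
    int64_t n_dims = bcast_dims;
    if (input_ne[3] == 1 && weight_ne[3] == 1) {
        if (input_ne[2] == 1 && weight_ne[2] == 1) {
            n_dims = 2;  // pure GEMM -> aclnn_mat_mul_2d / aclnnMm
        } else if (input_ne[2] == 1) {
            n_dims = 3;  // one batch dimension -> aclnn_mat_mul_3d / aclnnBatchMatMul
        }
    }
    return n_dims;
}

int main() {
    // ne[] = {cols, rows, batch0, batch1}, fastest dimension first.
    const int64_t input_ne[4]  = {4096, 32, 1, 1};
    const int64_t weight_ne[4] = {4096, 4096, 1, 1};
    std::printf("rank = %lld\n",  // prints 2: this shape takes the aclnnMm path
                (long long) select_mat_mul_rank(input_ne, weight_ne, 4));
    return 0;
}
```

Keeping the fallback to the generic kernel means shapes with two non-trivial batch dimensions still work; only the common 2-D and single-batch cases take the specialized path.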
From df68663a6314341a578fbe1a9f6376c6fcc68c38 Mon Sep 17 00:00:00 2001 From: shanshan shen Date: Mon, 25 Nov 2024 08:05:54 +0000 Subject: [PATCH 2/5] some modification after review --- ggml/src/ggml-cann/aclnn_ops.cpp | 197 +------------------------------ ggml/src/ggml-cann/ggml-cann.cpp | 32 +++-- 2 files changed, 22 insertions(+), 207 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 1683da2affc2b..25ffd912670b7 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2531,47 +2531,6 @@ static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_inpu * multiplication will be stored. */ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, - ggml_tensor* dst) { - ggml_tensor* weight = dst->src[0]; // weight - ggml_tensor* input = dst->src[1]; // input - - // when weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize will auto - // broadcast, when weight ne2 or ne3 is not 1, weight need repeat. - BCAST_MUL_MAT_SHAPE(input, weight, dst); - - // transpose weight: [1,2,3,4] -> [1,2,4,3] - int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0], - bcast_weight_ne[2], bcast_weight_ne[3], - bcast_weight_ne[4], bcast_weight_ne[5]}; - size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0], - bcast_weight_nb[2], bcast_weight_nb[3], - bcast_weight_nb[4], bcast_weight_nb[5]}; - - aclTensor* acl_weight_tensor = - ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, bcast_dims); - aclTensor* acl_input_tensor = - ggml_cann_create_tensor(input, BCAST_MUL_MAT_PARAM(input)); - aclTensor* acl_dst = ggml_cann_create_tensor(dst, BCAST_MUL_MAT_PARAM(dst)); - aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); - - ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); - ACL_CHECK(aclDestroyTensor(acl_input_tensor)); - ACL_CHECK(aclDestroyTensor(acl_dst)); -} - -/** - * @brief Performs matrix multiplication with floating-point precision on - * tensors using the CANN backend. - * - * This function performs matrix multiplication of the input tensor and the - * weight tensor, handling broadcasting and transposing as needed, and stores - * the result in the destination tensor `dst`. - * - * @param ctx The context for the CANN backend operations. - * @param dst The destination tensor where the result of the matrix - * multiplication will be stored. - */ -static void ggml_cann_mat_mul_fp2(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* weight = dst->src[0]; // weight ggml_tensor* input = dst->src[1]; // input @@ -2637,158 +2596,6 @@ static void ggml_cann_mat_mul_fp2(ggml_backend_cann_context& ctx, * multiplication will be stored. */ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, - ggml_tensor* dst, - const enum ggml_type type) { - ggml_tensor* src0 = dst->src[0]; // weight - ggml_tensor* src1 = dst->src[1]; // input - - // The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC - // is regarded as batch. weight need transpose. - int64_t weight_ne[] = {src0->ne[1], src0->ne[0]}; - float weight_elem_size; - if (type == GGML_TYPE_Q4_0) { - weight_elem_size = float(sizeof(uint8_t)) / 2; - } - else if (type == GGML_TYPE_Q8_0) { - weight_elem_size = float(sizeof(uint8_t)); - } - else { - GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT"); - } - float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size}; - - // size of one matrix is element_size * height * width. 
- size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1]; - size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3]; - - // scale stored at the end of weight. Also need transpose. - GGML_ASSERT(QK4_0 == QK8_0); - int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0}; - size_t scale_elem_size = sizeof(uint16_t); - size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, - scale_elem_size}; - size_t scale_stride = scale_elem_size * src0->ne[0] * src0->ne[1] / QK8_0; - char* scale_offset = (char*)src0->data + weight_size; - - // input - void* input_buffer; - size_t input_elem_size = sizeof(uint16_t); - int64_t input_ne[] = {src1->ne[0], src1->ne[1]}; - size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]}; - size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1]; - - ggml_cann_pool_alloc input_alloctor(ctx.pool()); - if (src1->type != GGML_TYPE_F16) { - aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1); - input_alloctor.alloc(ggml_nelements(src1) * input_elem_size); - input_buffer = input_alloctor.get(); - - int64_t* input_cast_ne = src1->ne; - size_t input_cast_nb[GGML_MAX_DIMS]; - input_cast_nb[0] = sizeof(uint16_t); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1]; - } - - aclTensor* acl_input_tensor = ggml_cann_create_tensor( - input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne, - input_cast_nb, GGML_MAX_DIMS); - aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16); - ACL_CHECK(aclDestroyTensor(acl_input_tensor)); - ACL_CHECK(aclDestroyTensor(acl_src1_tensor)); - } else { - input_buffer = src1->data; - } - - // output - size_t output_elem_size = sizeof(uint16_t); - int64_t output_ne[] = {dst->ne[0], dst->ne[1]}; - size_t output_nb[] = {output_elem_size, output_elem_size * dst->ne[0]}; - ggml_cann_pool_alloc output_alloctor( - ctx.pool(), ggml_nelements(dst) * output_elem_size); - void* output_buffer = output_alloctor.get(); - size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1]; - - // aclnn - uint64_t workspaceSize = 0; - aclOpExecutor* executor; - void* workspaceAddr = nullptr; - - for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) { - for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) { - int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]); - int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]); - - int64_t batch1 = n1 * src1->ne[2] + c1; - int64_t batch0 = n0 * src0->ne[2] + c0; - - aclTensor* acl_input_tensor = ggml_cann_create_tensor( - (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16, - input_elem_size, input_ne, input_nb, 2); - aclTensor* acl_weight_tensor = ggml_cann_create_tensor( - (char*)src0->data + batch0 * weight_stride, - ggml_cann_type_mapping(type), weight_elem_size, weight_ne, - weight_nb, 2); - aclTensor* acl_scale_tensor = ggml_cann_create_tensor( - scale_offset + batch0 * scale_stride, ACL_FLOAT16, - scale_elem_size, scale_ne, scale_nb, 2); - aclTensor* acl_output_tensor = ggml_cann_create_tensor( - (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16, - output_elem_size, output_ne, output_nb, 2); - - ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( - acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr, - nullptr, nullptr, nullptr, QK8_0, acl_output_tensor, - &workspaceSize, &executor)); - - if (workspaceSize > 0 && workspaceAddr == nullptr) { - ggml_cann_pool_alloc workspace_allocator(ctx.pool(), - workspaceSize); - workspaceAddr = workspace_allocator.get(); - } - - ACL_CHECK(aclnnWeightQuantBatchMatmulV2( - 
workspaceAddr, workspaceSize, executor, ctx.stream())); - - ACL_CHECK(aclDestroyTensor(acl_input_tensor)); - ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); - ACL_CHECK(aclDestroyTensor(acl_scale_tensor)); - ACL_CHECK(aclDestroyTensor(acl_output_tensor)); - } - } - - // cast out - int64_t* output_cast_ne = dst->ne; - size_t output_cast_nb[GGML_MAX_DIMS]; - output_cast_nb[0] = sizeof(uint16_t); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1]; - } - - aclTensor* acl_output_tensor = - ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size, - output_cast_ne, output_cast_nb, GGML_MAX_DIMS); - aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst); - aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ACL_FLOAT); - - ACL_CHECK(aclDestroyTensor(acl_output_tensor)); - ACL_CHECK(aclDestroyTensor(acl_dst_tensor)); -} - -/** - * @brief Performs matrix multiplication with quantized weights and - * floating-point inputs using the CANN backend. - * - * This function performs matrix multiplication of the input tensor `src1` and - * the weight tensor `src0`, handling broadcasting, transposing, and - * quantization as needed, and stores the result in the destination tensor - * `dst`. - * - * @param ctx The context for the CANN backend operations. - * @param dst The destination tensor where the result of the matrix - * multiplication will be stored. - */ -static void ggml_cann_mul_mat_quant2(ggml_backend_cann_context& ctx, ggml_tensor* dst, const enum ggml_type type) { ggml_tensor* src0 = dst->src[0]; // weight @@ -2979,11 +2786,11 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { switch (type) { case GGML_TYPE_F32: case GGML_TYPE_F16: - ggml_cann_mat_mul_fp2(ctx, dst); + ggml_cann_mat_mul_fp(ctx, dst); break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: - ggml_cann_mul_mat_quant2(ctx, dst, type); + ggml_cann_mul_mat_quant(ctx, dst, type); break; default: GGML_ABORT("fatal error"); diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index a9a1cfb6356c2..531e87c7a5d31 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -341,7 +341,6 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { std::vector map_offsets; /** - * @brief Constructor to initialize the buffer pool with virtual memory for * @brief Constructor to initialize the buffer pool with virtual memory for * a specific device. 
* @@ -1872,17 +1871,17 @@ struct ggml_backend_cann_device_context { }; static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) { - ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context; + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; return ctx->name.c_str(); } static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) { - ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context; + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; return ctx->description.c_str(); } static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context; + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; ggml_backend_cann_get_device_memory(ctx->device, free, total); } @@ -1909,7 +1908,7 @@ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_back static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) { GGML_UNUSED(params); - ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context; + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; return ggml_backend_cann_init(ctx->device); } @@ -1929,7 +1928,7 @@ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, cons static bool ggml_backend_cann_supports_buft( ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { if (ggml_backend_buft_is_cann(buft)) { - ggml_backend_cann_context * dev_ctx = (ggml_backend_cann_context *)dev->context; + ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context; ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *)buft->context; return buft_ctx->device == dev_ctx->device; @@ -1938,7 +1937,7 @@ static bool ggml_backend_cann_supports_buft( } static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) { - ggml_backend_cann_context * ctx = (ggml_backend_cann_context*)dev->context; + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; return ggml_backend_cann_buffer_type(ctx->device); } @@ -1959,7 +1958,7 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type( */ static ggml_backend_event_t ggml_backend_cann_device_event_new( ggml_backend_dev_t dev) { - ggml_backend_cann_context * dev_ctx = (ggml_backend_cann_context *)dev->context; + ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context; ggml_cann_set_device(dev_ctx->device); @@ -2067,7 +2066,11 @@ ggml_backend_reg_t ggml_backend_cann_reg() { ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context; for (int i = 0; i < ggml_cann_info().device_count; i++) { - ggml_backend_cann_context* dev_ctx = new ggml_backend_cann_context(i); + ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context(); + dev_ctx->description = aclrtGetSocName(); + dev_ctx->device = i; + dev_ctx->name = GGML_CANN_NAME + std::to_string(i); + ggml_cann_set_device(i); ggml_backend_dev_t dev = new ggml_backend_device { /* .interface = */ ggml_backend_cann_device_interface, /* .reg = */ ®, @@ -2095,12 +2098,17 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) { return nullptr; } - 
ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device); + ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device); + if (ctx == nullptr) { + GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__); + return nullptr; + } + ggml_cann_set_device(ctx->device); ggml_backend_t cann_backend = new ggml_backend{/* .guid = */ ggml_backend_cann_guid(), /* .interface = */ ggml_backend_cann_interface, - /* .device = */ dev, - /* .context = */ dev->context}; + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device), + /* .context = */ ctx}; return cann_backend; } From 1c79893ca211bdc7c086bd6e12db86496f85ed44 Mon Sep 17 00:00:00 2001 From: shanshan shen Date: Tue, 26 Nov 2024 07:09:55 +0000 Subject: [PATCH 3/5] some modifications after review --- ggml/src/ggml-cann/ggml-cann.cpp | 38 +++++++++++++++++++------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 531e87c7a5d31..86d3fb1ff65b1 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -471,10 +471,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { */ std::unique_ptr ggml_backend_cann_context::new_pool_for_device( int device) { - if (device == 0) { - return std::unique_ptr(new ggml_cann_pool_vmm(device)); - } - return std::unique_ptr(new ggml_cann_pool_leg(device)); + return std::unique_ptr(new ggml_cann_pool_vmm(device)); } // cann buffer @@ -486,22 +483,21 @@ std::unique_ptr ggml_backend_cann_context::new_pool_for_device( */ struct ggml_backend_cann_buffer_context { int32_t device; ///< The device ID associated with this buffer context. - ggml_cann_pool_alloc* alloc; ///< Pointer to the device memory allocated for the buffer. + void* dev_ptr = nullptr; /** * @brief Constructor to initialize the CANN buffer context. * * @param device The device ID associated with this buffer context. - * @param alloc Pointer to the device memory allocated for the buffer. */ - ggml_backend_cann_buffer_context(int32_t device, ggml_cann_pool_alloc* alloc) + ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr) : device(device), - alloc(alloc) {} + dev_ptr(dev_ptr) {} /** * @brief Destructor to free the device memory allocated for the buffer. 
*/ - ~ggml_backend_cann_buffer_context() { delete alloc; } + ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr));} }; /** @@ -547,7 +543,7 @@ static void* ggml_backend_cann_buffer_get_base( ggml_backend_buffer_t buffer) { ggml_backend_cann_buffer_context* ctx = (ggml_backend_cann_buffer_context*)buffer->context; - return ctx->alloc->get(); + return ctx->dev_ptr; } /** @@ -954,7 +950,7 @@ static void ggml_backend_cann_buffer_clear( (ggml_backend_cann_buffer_context*)buffer->context; ggml_cann_set_device(ctx->device); - ACL_CHECK(aclrtMemset(ctx->alloc->get(), buffer->size, value, buffer->size)); + ACL_CHECK(aclrtMemset(ctx->dev_ptr, buffer->size, value, buffer->size)); } /** @@ -1016,13 +1012,25 @@ static const char* ggml_backend_cann_buffer_type_name( static ggml_backend_buffer_t ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_cann_context* cann_ctx = - (ggml_backend_cann_context*)buft->device->context; + ggml_backend_cann_buffer_type_context* buft_ctx = + (ggml_backend_cann_buffer_type_context*)buft->context; + + ggml_cann_set_device(buft_ctx->device); - ggml_cann_pool_alloc* alloc = new ggml_cann_pool_alloc(cann_ctx->pool(), size); + size = std::max(size, (size_t)1); + + void* dev_ptr; + aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST); + if (err != ACL_SUCCESS) { + GGML_LOG_ERROR( + "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n", + __func__, size / 1024.0 / 1024.0, buft_ctx->device, + aclGetRecentErrMsg()); + return nullptr; + } ggml_backend_cann_buffer_context* ctx = - new ggml_backend_cann_buffer_context(cann_ctx->device, alloc); + new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr); return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface, ctx, size); From e05a398fb3abb9dd94af4fccdcd1cf6b082829d3 Mon Sep 17 00:00:00 2001 From: shanshan shen Date: Tue, 26 Nov 2024 07:32:39 +0000 Subject: [PATCH 4/5] restore some modifications --- ggml/src/ggml-cann/aclnn_ops.cpp | 13 ++++++------- ggml/src/ggml-cann/ggml-cann.cpp | 8 +++++--- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 25ffd912670b7..881e50ac8170a 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2427,6 +2427,7 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input, aclTensor* acl_weight, aclTensor* acl_dst) { int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is // fp32, atlas a2 will transpose it to HFLOAT32. + uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -2531,7 +2532,7 @@ static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_inpu * multiplication will be stored. */ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, - ggml_tensor* dst) { + ggml_tensor* dst) { ggml_tensor* weight = dst->src[0]; // weight ggml_tensor* input = dst->src[1]; // input @@ -2596,8 +2597,8 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, * multiplication will be stored. 
*/ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, - ggml_tensor* dst, - const enum ggml_type type) { + ggml_tensor* dst, + const enum ggml_type type) { ggml_tensor* src0 = dst->src[0]; // weight ggml_tensor* src1 = dst->src[1]; // input @@ -2617,8 +2618,7 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size; size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3]; - // scale stored at the end of weight. - // scale need transpose. + // scale stored at the end of weight. Also need transpose. size_t scale_elem_size = sizeof(uint16_t); size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size}; size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size; @@ -2677,8 +2677,7 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, int64_t batch0 = (n0 * src0->ne[2]) + c0; aclTensor* acl_input_tensor = ggml_cann_create_tensor( - (char*)input_buffer + batch1 * input_stride, - ACL_FLOAT16, + (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16, input_elem_size, input_ne, input_nb, 2); // first split diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 3bb1666948433..ebccc51dedc3e 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -301,7 +301,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool { */ struct ggml_cann_pool_vmm : public ggml_cann_pool { /** - * @brief The maximum size of the virtual memory pool. + * @brief The maximum size of the virtual memory pool (32 GB). */ size_t max_size; @@ -483,12 +483,14 @@ std::unique_ptr ggml_backend_cann_context::new_pool_for_device( */ struct ggml_backend_cann_buffer_context { int32_t device; ///< The device ID associated with this buffer context. - void* dev_ptr = nullptr; + void* dev_ptr = + nullptr; ///< Pointer to the device memory allocated for the buffer. /** * @brief Constructor to initialize the CANN buffer context. * * @param device The device ID associated with this buffer context. + * @param dev_ptr Pointer to the device memory allocated for the buffer. */ ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr) : device(device), @@ -497,7 +499,7 @@ struct ggml_backend_cann_buffer_context { /** * @brief Destructor to free the device memory allocated for the buffer. */ - ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr));} + ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); } }; /** From 33fd470550b112cabc4c35fe797c437c93f417c0 Mon Sep 17 00:00:00 2001 From: shanshan shen Date: Tue, 26 Nov 2024 08:49:26 +0000 Subject: [PATCH 5/5] restore some modifications --- ggml/src/ggml-cann/aclnn_ops.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 881e50ac8170a..4bfd5b5ddbd75 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2427,7 +2427,6 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input, aclTensor* acl_weight, aclTensor* acl_dst) { int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is // fp32, atlas a2 will transpose it to HFLOAT32. - uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr;
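One detail of the quantized path that survives all five patches is the column splitting in ggml_cann_mul_mat_quant: each aclnnWeightQuantBatchMatmulV2 call is capped at max_elem_size = 65535 elements along the weight's ne[1] (presumably an operator-side size limit), so that dimension is processed in slices and the weight, scale, and output views advance by a matching offset after each slice. A minimal sketch of just that split arithmetic, with the patch's 2-D byte offsets reduced to a 1-D element count for clarity:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t max_elem_size = 65535;  // per-call cap used by the patch
    const int64_t n = 100000;             // hypothetical src0->ne[1]

    int64_t split_size  = n / max_elem_size + 1;       // as computed in the patch
    int64_t ne0         = std::min(n, max_elem_size);  // first slice
    int64_t elem_offset = 0;                           // offset in elements, not bytes

    for (int64_t split = 0; split < split_size; split++) {
        std::printf("split %lld: %lld elements at offset %lld\n",
                    (long long) split, (long long) ne0, (long long) elem_offset);
        // Advance exactly like the patch: the offset grows by the elements
        // just consumed, the next slice takes the remainder or a full cap.
        elem_offset += ne0;
        ne0 = std::min(n - elem_offset, max_elem_size);
    }
    return 0;
}
```

In the patch itself each advance is additionally multiplied by the row length and the element size (weight_elem_size, scale_elem_size, output_elem_size), since the offsets index 2-D views of the batched buffers.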