From f0e09002c366dd7d63f6794a8377cd27c215e6c0 Mon Sep 17 00:00:00 2001 From: shanshan shen Date: Fri, 22 Nov 2024 09:28:44 +0000 Subject: [PATCH 1/5] improve inferencing performance for ascend npu. Co-authored-by: Frank Mai --- ggml/src/ggml-cann/aclnn_ops.cpp | 344 ++++++++++++++++++++++++++++++- ggml/src/ggml-cann/common.h | 9 +- ggml/src/ggml-cann/ggml-cann.cpp | 132 ++++++------ 3 files changed, 413 insertions(+), 72 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index a4ec8418e2ab3..8b6e2345113b7 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include #include #include @@ -2407,7 +2409,6 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input, aclTensor* acl_weight, aclTensor* acl_dst) { int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is // fp32, atlas a2 will transpose it to HFLOAT32. - uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -2425,6 +2426,80 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input, aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream())); } +/** + * @brief Performs matrix multiplication of two 2D tensors. + * + * This function computes the matrix multiplication of the input tensor + * `acl_input` and the weight tensor `acl_weight`, and stores the result in the + * destination tensor `acl_dst`. + * The operation is defined as: + * \f[ + * \text {acl_dst}=\text {acl_input@acl_weight} + * \f] + * + * @param ctx The context for the CANN backend operations. + * @param acl_input The input tensor for the matrix multiplication. + * @param acl_weight The weight tensor for the matrix multiplication. + * @param acl_dst The destination tensor where the result of the matrix + * multiplication will be stored. + */ +static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, aclTensor* acl_input, + aclTensor* acl_weight, aclTensor* acl_dst) { + int8_t cube_math_type = 2; + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst, + cube_math_type, &workspaceSize, + &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +/** + * @brief Performs matrix multiplication of two 3D tensors. + * + * This function computes the matrix multiplication of the input tensor + * `acl_input` and the weight tensor `acl_weight`, and stores the result in the + * destination tensor `acl_dst`. + * The operation is defined as: + * \f[ + * \text {acl_dst}=\text {acl_input@acl_weight} + * \f] + * + * @param ctx The context for the CANN backend operations. + * @param acl_input The input tensor for the matrix multiplication. + * @param acl_weight The weight tensor for the matrix multiplication. + * @param acl_dst The destination tensor where the result of the matrix + * multiplication will be stored. 
+ */ +static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_input, + aclTensor* acl_weight, aclTensor* acl_dst) { + int8_t cube_math_type = 2; + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst, + cube_math_type, &workspaceSize, + &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + /** * @brief Performs matrix multiplication with floating-point precision on * tensors using the CANN backend. @@ -2466,6 +2541,70 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, ACL_CHECK(aclDestroyTensor(acl_dst)); } +/** + * @brief Performs matrix multiplication with floating-point precision on + * tensors using the CANN backend. + * + * This function performs matrix multiplication of the input tensor and the + * weight tensor, handling broadcasting and transposing as needed, and stores + * the result in the destination tensor `dst`. + * + * @param ctx The context for the CANN backend operations. + * @param dst The destination tensor where the result of the matrix + * multiplication will be stored. + */ +static void ggml_cann_mat_mul_fp2(ggml_backend_cann_context& ctx, + ggml_tensor* dst) { + ggml_tensor* weight = dst->src[0]; // weight + ggml_tensor* input = dst->src[1]; // input + + // when weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize will auto + // broadcast, when weight ne2 or ne3 is not 1, weight need repeat. + BCAST_MUL_MAT_SHAPE(input, weight, dst); + + int64_t n_dims = bcast_dims; + if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) { + if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) { + n_dims = 2; + } else if (bcast_input_ne[2] == 1) { + n_dims = 3; + } + } + + aclTensor* acl_input_tensor = + ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims); + int64_t transpose_ne[] = { + bcast_weight_ne[1], bcast_weight_ne[0], + bcast_weight_ne[2], bcast_weight_ne[3], + bcast_weight_ne[4], bcast_weight_ne[5] + }; + size_t transpose_nb[] = { + bcast_weight_nb[1], bcast_weight_nb[0], + bcast_weight_nb[2], bcast_weight_nb[3], + bcast_weight_nb[4], bcast_weight_nb[5] + }; + aclTensor* acl_weight_tensor = + ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims); + aclTensor* acl_dst = + ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims); + + switch (n_dims) { + case 2: + aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); + break; + case 3: + aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); + break; + default: + aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); + break; + } + + ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + /** * @brief Performs matrix multiplication with quantized weights and * floating-point inputs using the CANN backend. @@ -2618,16 +2757,215 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, ACL_CHECK(aclDestroyTensor(acl_dst_tensor)); } +/** + * @brief Performs matrix multiplication with quantized weights and + * floating-point inputs using the CANN backend. 
+ * + * This function performs matrix multiplication of the input tensor `src1` and + * the weight tensor `src0`, handling broadcasting, transposing, and + * quantization as needed, and stores the result in the destination tensor + * `dst`. + * + * @param ctx The context for the CANN backend operations. + * @param dst The destination tensor where the result of the matrix + * multiplication will be stored. + */ +static void ggml_cann_mul_mat_quant2(ggml_backend_cann_context& ctx, + ggml_tensor* dst, + const enum ggml_type type) { + ggml_tensor* src0 = dst->src[0]; // weight + ggml_tensor* src1 = dst->src[1]; // input + + // The shape of the weight is NCHW. + // Matrix multiplication uses HW dims. + // HC is regarded as batch. + // weight need transpose. + float weight_elem_size; + if (type == GGML_TYPE_Q4_0) { + weight_elem_size = float(sizeof(uint8_t)) / 2; + } else if (type == GGML_TYPE_Q8_0) { + weight_elem_size = float(sizeof(uint8_t)); + } else { + GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT"); + } + float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size}; + size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size; + size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3]; + + // scale stored at the end of weight. + // scale need transpose. + size_t scale_elem_size = sizeof(uint16_t); + size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size}; + size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size; + char* scale_offset = (char*)src0->data + weight_size; + + // input + size_t input_elem_size = sizeof(uint16_t); + int64_t input_ne[] = {src1->ne[0], src1->ne[1]}; + size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size}; + size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size; + ggml_cann_pool_alloc input_alloctor(ctx.pool()); + void* input_buffer = src1->data; + + // case in + if (src1->type != GGML_TYPE_F16) { + aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1); + input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size); + + int64_t* input_cast_ne = src1->ne; + size_t input_cast_nb[GGML_MAX_DIMS]; + input_cast_nb[0] = sizeof(uint16_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1]; + } + + aclTensor* acl_input_tensor = ggml_cann_create_tensor( + input_buffer, + ACL_FLOAT16, + input_elem_size, input_cast_ne, input_cast_nb, GGML_MAX_DIMS); + aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16); + + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + ACL_CHECK(aclDestroyTensor(acl_src1_tensor)); + } + + // output + size_t output_elem_size = sizeof(uint16_t); + size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size}; + ggml_cann_pool_alloc output_allocator(ctx.pool()); + void* output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size); + size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size; + + // aclnn + int64_t max_elem_size = 65535; + int64_t split_size = (src0->ne[1] / max_elem_size) + 1; + ggml_cann_pool_alloc workspace_allocator(ctx.pool()); + aclOpExecutor* executor = nullptr; + uint64_t workspaceSize = 0; + void* workspaceAddr = nullptr; + for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) { + for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) { + int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]); + int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]); + + int64_t batch1 = (n1 * src1->ne[2]) + c1; + int64_t batch0 = (n0 * src0->ne[2]) + c0; + + aclTensor* 
acl_input_tensor = ggml_cann_create_tensor( + (char*)input_buffer + batch1 * input_stride, + ACL_FLOAT16, + input_elem_size, input_ne, input_nb, 2); + + // first split + int64_t weight_ne_offset = 0; + int64_t weight_ne[2] = {max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0]}; + int64_t scale_ne_offset = 0; + int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0}; + int64_t output_ne_offset = 0; + int64_t output_ne[2] = {weight_ne[0], dst->ne[1]}; + + aclTensor* acl_weight_tensor = ggml_cann_create_tensor( + (char*)src0->data + batch0 * weight_stride, + ggml_cann_type_mapping(type), + weight_elem_size, weight_ne, weight_nb, 2, + ACL_FORMAT_ND, weight_ne_offset); + aclTensor* acl_scale_tensor = ggml_cann_create_tensor( + scale_offset + batch0 * scale_stride, + ACL_FLOAT16, + scale_elem_size, scale_ne, scale_nb, 2, + ACL_FORMAT_ND, scale_ne_offset); + aclTensor* acl_output_tensor = ggml_cann_create_tensor( + (char*)output_buffer + batch1 * output_stride, + ACL_FLOAT16, + output_elem_size, output_ne, output_nb, 2, + ACL_FORMAT_ND, output_ne_offset); + + ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( + acl_input_tensor, acl_weight_tensor, acl_scale_tensor, + nullptr, nullptr, nullptr, nullptr, QK8_0, + acl_output_tensor, &workspaceSize, &executor)); + if (workspaceAddr == nullptr) { + workspaceAddr = workspace_allocator.alloc(workspaceSize); + } + ACL_CHECK(aclnnWeightQuantBatchMatmulV2( + workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); + ACL_CHECK(aclDestroyTensor(acl_scale_tensor)); + ACL_CHECK(aclDestroyTensor(acl_output_tensor)); + + // other splits + for (int64_t split = 1; split < split_size; split++) { + weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1]; + weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] ? 
src0->ne[1] - (max_elem_size * split) : max_elem_size; + scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1]; + scale_ne[0] = weight_ne[0]; + output_ne_offset += output_elem_size * output_ne[0] * output_ne[1]; + output_ne[0] = weight_ne[0]; + + acl_weight_tensor = ggml_cann_create_tensor( + (char*)src0->data + batch0 * weight_stride, + ggml_cann_type_mapping(type), + weight_elem_size, weight_ne, weight_nb, 2, + ACL_FORMAT_ND, weight_ne_offset); + acl_scale_tensor = ggml_cann_create_tensor( + scale_offset + batch0 * scale_stride, + ACL_FLOAT16, + scale_elem_size, scale_ne, scale_nb, 2, + ACL_FORMAT_ND, scale_ne_offset); + acl_output_tensor = ggml_cann_create_tensor( + (char*)output_buffer + batch1 * output_stride, + ACL_FLOAT16, + output_elem_size, output_ne, output_nb, 2, + ACL_FORMAT_ND, output_ne_offset); + + ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( + acl_input_tensor, acl_weight_tensor, acl_scale_tensor, + nullptr, nullptr, nullptr, nullptr, QK8_0, + acl_output_tensor, &workspaceSize, &executor)); + ACL_CHECK(aclnnWeightQuantBatchMatmulV2( + workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); + ACL_CHECK(aclDestroyTensor(acl_scale_tensor)); + ACL_CHECK(aclDestroyTensor(acl_output_tensor)); + } + + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + } + } + + // cast out + if (dst->type != GGML_TYPE_F16) { + int64_t* output_cast_ne = dst->ne; + size_t output_cast_nb[GGML_MAX_DIMS]; + output_cast_nb[0] = sizeof(uint16_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1]; + } + + aclTensor* acl_output_tensor = ggml_cann_create_tensor( + output_buffer, + ACL_FLOAT16, + output_elem_size, output_cast_ne, output_cast_nb, GGML_MAX_DIMS); + aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst); + aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type)); + + ACL_CHECK(aclDestroyTensor(acl_output_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst_tensor)); + } +} + void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { const enum ggml_type type = dst->src[0]->type; switch (type) { case GGML_TYPE_F32: case GGML_TYPE_F16: - ggml_cann_mat_mul_fp(ctx, dst); + ggml_cann_mat_mul_fp2(ctx, dst); break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: - ggml_cann_mul_mat_quant(ctx, dst, type); + ggml_cann_mul_mat_quant2(ctx, dst, type); break; default: GGML_ABORT("fatal error"); diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index edfa496148ff2..5164cb74ec92e 100644 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -211,17 +211,20 @@ struct ggml_cann_pool_alloc { struct ggml_backend_cann_context { int32_t device; /**< Device ID. */ std::string name; /**< Name of the device. */ + std::string description; /**< Description of the device. */ aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */ - aclrtStream streams[GGML_CANN_MAX_STREAMS] = { - {nullptr}}; /**< Array of streams for the device. */ + aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */ /** * @brief Constructor for initializing the context with a given device. * @param device Device ID. 
*/ explicit ggml_backend_cann_context(int device) - : device(device), name("CANN" + std::to_string(device)) {} + : device(device), name("CANN" + std::to_string(device)) { + ggml_cann_set_device(device); + description = aclrtGetSocName(); + } /** * @brief Destructor for cleaning up resources. diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 776340881434d..a9a1cfb6356c2 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -122,6 +122,10 @@ static ggml_cann_device_info ggml_cann_init() { ACL_CHECK(aclrtMemGetAllocationGranularity( &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED, &info.devices[id].vmm_granularity)); + + size_t free, total; + ggml_backend_cann_get_device_memory(id, &free, &total); + info.devices[id].total_vram = free; } // TODO: add more device info later. @@ -208,6 +212,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool { * @return A pointer to the allocated buffer. */ void* alloc(size_t size, size_t* actual_size) override { + const size_t alignment = 128; + size = GGML_PAD(size, alignment); + if (size == 0) { + size = alignment; + } #ifdef DEBUG_CANN_MALLOC int nnz = 0; size_t max_size = 0; @@ -246,13 +255,11 @@ struct ggml_cann_pool_leg : public ggml_cann_pool { return ptr; } void* ptr; - size_t look_ahead_size = (size_t)(1.05 * size); - look_ahead_size = 256 * ((look_ahead_size + 255) / 256); ggml_cann_set_device(device); ACL_CHECK( - aclrtMalloc(&ptr, look_ahead_size, ACL_MEM_MALLOC_HUGE_FIRST)); - *actual_size = look_ahead_size; - pool_size += look_ahead_size; + aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST)); + *actual_size = size; + pool_size += size; #ifdef DEBUG_CANN_MALLOC GGML_LOG_INFO( "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, " @@ -294,9 +301,9 @@ struct ggml_cann_pool_leg : public ggml_cann_pool { */ struct ggml_cann_pool_vmm : public ggml_cann_pool { /** - * @brief The maximum size of the virtual memory pool (32 GB). + * @brief The maximum size of the virtual memory pool. */ - static const size_t CANN_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB + size_t max_size; /** * @brief The device ID associated with this buffer pool. @@ -334,6 +341,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { std::vector map_offsets; /** + * @brief Constructor to initialize the buffer pool with virtual memory for * @brief Constructor to initialize the buffer pool with virtual memory for * a specific device. * @@ -341,7 +349,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { */ explicit ggml_cann_pool_vmm(int device) : device(device), - granularity(ggml_cann_info().devices[device].vmm_granularity) {} + granularity(ggml_cann_info().devices[device].vmm_granularity) { + auto dev = ggml_cann_info().devices[device]; + granularity = dev.vmm_granularity; + max_size = dev.total_vram; + } /** * @brief Destructor to free all buffers in the virtual memory pool. 
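A note on the pool changes in this file: patch 1 makes both pools round every request up to a 128-byte alignment via GGML_PAD (bumping zero-sized requests to one alignment unit), and the VMM pool in the next hunk grows only by whole granularity units. Below is a minimal standalone sketch of that rounding arithmetic, not code from the patch: pad_to stands in for GGML_PAD, and the 2 MiB granularity is a made-up placeholder.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdio>

// Round x up to the next multiple of n; stands in for ggml's GGML_PAD.
static size_t pad_to(size_t x, size_t n) { return (x + n - 1) / n * n; }

int main() {
    const size_t alignment   = 128;      // per-allocation alignment in the patch
    const size_t granularity = 2 << 20;  // hypothetical VMM mapping granularity

    // Per-allocation rule: pad to 128 bytes, and never return a 0-byte block.
    size_t size = pad_to(/*request=*/0, alignment);
    if (size == 0) size = alignment;
    assert(size == alignment);

    // Pool-growth rule: reserve only the shortfall, rounded up to the
    // physical-memory mapping granularity.
    size_t pool_size = 0, pool_used = 0;
    size_t need  = 3 << 20;  // 3 MiB request against an empty pool
    size_t avail = pool_size - pool_used;
    if (need > avail) {
        size_t reserve = pad_to(need - avail, granularity);
        pool_size += reserve;
    }
    std::printf("pool grew to %zu MiB\n", pool_size >> 20);  // 4: two 2 MiB mappings
    return 0;
}
```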
@@ -370,17 +382,19 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { // round up the allocation size to the alignment to ensure that all // allocations are aligned for all data types const size_t alignment = 128; - size = alignment * ((size + alignment - 1) / alignment); + size = GGML_PAD(size, alignment); + if (size == 0) { + size = alignment; + } size_t avail = pool_size - pool_used; if (size > avail) { // round up to the next multiple of the granularity size_t reserve_size = size - avail; - reserve_size = - granularity * ((reserve_size + granularity - 1) / granularity); + reserve_size = GGML_PAD(reserve_size, granularity); - GGML_ASSERT(pool_size + reserve_size <= CANN_POOL_VMM_MAX_SIZE); + GGML_ASSERT(pool_size + reserve_size <= max_size); // allocate more physical memory aclrtPhysicalMemProp prop = {}; @@ -396,7 +410,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { // reserve virtual address space (if not already reserved) if (pool_addr == 0) { ACL_CHECK(aclrtReserveMemAddress( - &pool_addr, CANN_POOL_VMM_MAX_SIZE, 0, NULL, 1)); + &pool_addr, max_size, 0, NULL, 1)); } // map at the end of the pool @@ -409,10 +423,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { // add to the pool pool_size += reserve_size; - // GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB ( - // reserved %llu MB)\n", - // device, (unsigned long long) (pool_size/1024/1024), - // (unsigned long long) (reserve_size/1024/1024)); +#ifdef DEBUG_CANN_MALLOC + GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n", + device, (unsigned long long) (pool_size/1024/1024), + (unsigned long long) (reserve_size/1024/1024)); +#endif } GGML_ASSERT(pool_addr != 0); @@ -457,8 +472,10 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { */ std::unique_ptr ggml_backend_cann_context::new_pool_for_device( int device) { - // return std::unique_ptr(new ggml_cann_pool_leg(device)); - return std::unique_ptr(new ggml_cann_pool_vmm(device)); + if (device == 0) { + return std::unique_ptr(new ggml_cann_pool_vmm(device)); + } + return std::unique_ptr(new ggml_cann_pool_leg(device)); } // cann buffer @@ -470,23 +487,22 @@ std::unique_ptr ggml_backend_cann_context::new_pool_for_device( */ struct ggml_backend_cann_buffer_context { int32_t device; ///< The device ID associated with this buffer context. - void* dev_ptr = - nullptr; ///< Pointer to the device memory allocated for the buffer. + ggml_cann_pool_alloc* alloc; ///< Pointer to the device memory allocated for the buffer. /** * @brief Constructor to initialize the CANN buffer context. * * @param device The device ID associated with this buffer context. - * @param dev_ptr Pointer to the device memory allocated for the buffer. + * @param alloc Pointer to the device memory allocated for the buffer. */ - ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr) + ggml_backend_cann_buffer_context(int32_t device, ggml_cann_pool_alloc* alloc) : device(device), - dev_ptr(dev_ptr) {} + alloc(alloc) {} /** * @brief Destructor to free the device memory allocated for the buffer. 
*/ - ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); } + ~ggml_backend_cann_buffer_context() { delete alloc; } }; /** @@ -532,7 +548,7 @@ static void* ggml_backend_cann_buffer_get_base( ggml_backend_buffer_t buffer) { ggml_backend_cann_buffer_context* ctx = (ggml_backend_cann_buffer_context*)buffer->context; - return ctx->dev_ptr; + return ctx->alloc->get(); } /** @@ -939,7 +955,7 @@ static void ggml_backend_cann_buffer_clear( (ggml_backend_cann_buffer_context*)buffer->context; ggml_cann_set_device(ctx->device); - ACL_CHECK(aclrtMemset(ctx->dev_ptr, buffer->size, value, buffer->size)); + ACL_CHECK(aclrtMemset(ctx->alloc->get(), buffer->size, value, buffer->size)); } /** @@ -1001,25 +1017,13 @@ static const char* ggml_backend_cann_buffer_type_name( static ggml_backend_buffer_t ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_cann_buffer_type_context* buft_ctx = - (ggml_backend_cann_buffer_type_context*)buft->context; - - ggml_cann_set_device(buft_ctx->device); - - size = std::max(size, (size_t)1); + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)buft->device->context; - void* dev_ptr; - aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST); - if (err != ACL_SUCCESS) { - GGML_LOG_ERROR( - "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n", - __func__, size / 1024.0 / 1024.0, buft_ctx->device, - aclGetRecentErrMsg()); - return nullptr; - } + ggml_cann_pool_alloc* alloc = new ggml_cann_pool_alloc(cann_ctx->pool(), size); ggml_backend_cann_buffer_context* ctx = - new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr); + new ggml_backend_cann_buffer_context(cann_ctx->device, alloc); return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface, ctx, size); @@ -1130,10 +1134,10 @@ ggml_backend_cann_buffer_type(int32_t device) { static bool ggml_backend_cann_buffer_type_initialized = false; if (!ggml_backend_cann_buffer_type_initialized) { - for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) { + for (int32_t i = 0; i < ggml_cann_info().device_count; i++) { ggml_backend_cann_buffer_types[i] = { /* .iface = */ ggml_backend_cann_buffer_type_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device), + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i), /* .context = */ new ggml_backend_cann_buffer_type_context{ i, "CANN" + std::to_string(i)}, @@ -1199,10 +1203,15 @@ static void * ggml_cann_host_malloc(size_t size) { return nullptr; } + const size_t alignment = 128; + size = GGML_PAD(size, alignment); + if (size == 0) { + size = alignment; + } + void * hostPtr = nullptr; aclError err = aclrtMallocHost((void **) &hostPtr, size); if (err != ACL_SUCCESS) { - GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0, aclGetRecentErrMsg()); return nullptr; @@ -1863,17 +1872,17 @@ struct ggml_backend_cann_device_context { }; static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) { - ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context; return ctx->name.c_str(); } static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) { - ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context; return 
ctx->description.c_str(); } static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context; ggml_backend_cann_get_device_memory(ctx->device, free, total); } @@ -1900,7 +1909,7 @@ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_back static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) { GGML_UNUSED(params); - ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context; return ggml_backend_cann_init(ctx->device); } @@ -1920,7 +1929,7 @@ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, cons static bool ggml_backend_cann_supports_buft( ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { if (ggml_backend_buft_is_cann(buft)) { - ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_context * dev_ctx = (ggml_backend_cann_context *)dev->context; ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *)buft->context; return buft_ctx->device == dev_ctx->device; @@ -1929,7 +1938,7 @@ static bool ggml_backend_cann_supports_buft( } static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) { - ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_context * ctx = (ggml_backend_cann_context*)dev->context; return ggml_backend_cann_buffer_type(ctx->device); } @@ -1950,7 +1959,7 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type( */ static ggml_backend_event_t ggml_backend_cann_device_event_new( ggml_backend_dev_t dev) { - ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_context * dev_ctx = (ggml_backend_cann_context *)dev->context; ggml_cann_set_device(dev_ctx->device); @@ -2058,11 +2067,7 @@ ggml_backend_reg_t ggml_backend_cann_reg() { ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context; for (int i = 0; i < ggml_cann_info().device_count; i++) { - ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context(); - dev_ctx->description = aclrtGetSocName(); - dev_ctx->device = i; - dev_ctx->name = GGML_CANN_NAME + std::to_string(i); - ggml_cann_set_device(i); + ggml_backend_cann_context* dev_ctx = new ggml_backend_cann_context(i); ggml_backend_dev_t dev = new ggml_backend_device { /* .interface = */ ggml_backend_cann_device_interface, /* .reg = */ ®, @@ -2090,17 +2095,12 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) { return nullptr; } - ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device); - if (ctx == nullptr) { - GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__); - return nullptr; - } - ggml_cann_set_device(ctx->device); + ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device); ggml_backend_t cann_backend = new ggml_backend{/* .guid = */ ggml_backend_cann_guid(), /* .interface = */ ggml_backend_cann_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device), - /* .context = */ ctx}; + /* .device = */ dev, + /* .context = */ dev->context}; return cann_backend; } 
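That closes out patch 1. Its central change in aclnn_ops.cpp is the rank dispatch in ggml_cann_mat_mul_fp2: when the outer broadcast dimensions are all 1, the generic aclnnMatmul is replaced by the cheaper aclnnMm (2-D) or aclnnBatchMatMul (3-D). The sketch below mirrors that selection logic in isolation; select_mat_mul_rank is a hypothetical name, and the ne arrays follow ggml's fastest-dimension-first layout.

```cpp
#include <cstdint>
#include <cstdio>

// Mirrors the n_dims selection in ggml_cann_mat_mul_fp2: collapse the
// broadcast rank when the outer batch dimensions are all 1.
static int64_t select_mat_mul_rank(const int64_t* input_ne,
                                   const int64_t* weight_ne,
                                   int64_t bcast_dims) {
    int64_t n_dims = bcast_dims;
    if (input_ne[3] == 1 && weight_ne[3] == 1) {
        if (input_ne[2] == 1 && weight_ne[2] == 1) {
            n_dims = 2;  // pure GEMM -> aclnn_mat_mul_2d / aclnnMm
        } else if (input_ne[2] == 1) {
            n_dims = 3;  // one batch dimension -> aclnn_mat_mul_3d / aclnnBatchMatMul
        }
    }
    return n_dims;
}

int main() {
    // ne[] = {cols, rows, batch0, batch1}, fastest dimension first.
    const int64_t input_ne[4]  = {4096, 32, 1, 1};
    const int64_t weight_ne[4] = {4096, 4096, 1, 1};
    std::printf("rank = %lld\n",  // prints 2: this shape takes the aclnnMm path
                (long long) select_mat_mul_rank(input_ne, weight_ne, 4));
    return 0;
}
```

Keeping the fallback to the generic kernel means shapes with two non-trivial batch dimensions still work; only the common 2-D and single-batch cases take the specialized path.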
From df68663a6314341a578fbe1a9f6376c6fcc68c38 Mon Sep 17 00:00:00 2001 From: shanshan shen Date: Mon, 25 Nov 2024 08:05:54 +0000 Subject: [PATCH 2/5] some modification after review --- ggml/src/ggml-cann/aclnn_ops.cpp | 197 +------------------------------ ggml/src/ggml-cann/ggml-cann.cpp | 32 +++-- 2 files changed, 22 insertions(+), 207 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 1683da2affc2b..25ffd912670b7 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2531,47 +2531,6 @@ static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_inpu * multiplication will be stored. */ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, - ggml_tensor* dst) { - ggml_tensor* weight = dst->src[0]; // weight - ggml_tensor* input = dst->src[1]; // input - - // when weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize will auto - // broadcast, when weight ne2 or ne3 is not 1, weight need repeat. - BCAST_MUL_MAT_SHAPE(input, weight, dst); - - // transpose weight: [1,2,3,4] -> [1,2,4,3] - int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0], - bcast_weight_ne[2], bcast_weight_ne[3], - bcast_weight_ne[4], bcast_weight_ne[5]}; - size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0], - bcast_weight_nb[2], bcast_weight_nb[3], - bcast_weight_nb[4], bcast_weight_nb[5]}; - - aclTensor* acl_weight_tensor = - ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, bcast_dims); - aclTensor* acl_input_tensor = - ggml_cann_create_tensor(input, BCAST_MUL_MAT_PARAM(input)); - aclTensor* acl_dst = ggml_cann_create_tensor(dst, BCAST_MUL_MAT_PARAM(dst)); - aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); - - ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); - ACL_CHECK(aclDestroyTensor(acl_input_tensor)); - ACL_CHECK(aclDestroyTensor(acl_dst)); -} - -/** - * @brief Performs matrix multiplication with floating-point precision on - * tensors using the CANN backend. - * - * This function performs matrix multiplication of the input tensor and the - * weight tensor, handling broadcasting and transposing as needed, and stores - * the result in the destination tensor `dst`. - * - * @param ctx The context for the CANN backend operations. - * @param dst The destination tensor where the result of the matrix - * multiplication will be stored. - */ -static void ggml_cann_mat_mul_fp2(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* weight = dst->src[0]; // weight ggml_tensor* input = dst->src[1]; // input @@ -2637,158 +2596,6 @@ static void ggml_cann_mat_mul_fp2(ggml_backend_cann_context& ctx, * multiplication will be stored. */ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, - ggml_tensor* dst, - const enum ggml_type type) { - ggml_tensor* src0 = dst->src[0]; // weight - ggml_tensor* src1 = dst->src[1]; // input - - // The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC - // is regarded as batch. weight need transpose. - int64_t weight_ne[] = {src0->ne[1], src0->ne[0]}; - float weight_elem_size; - if (type == GGML_TYPE_Q4_0) { - weight_elem_size = float(sizeof(uint8_t)) / 2; - } - else if (type == GGML_TYPE_Q8_0) { - weight_elem_size = float(sizeof(uint8_t)); - } - else { - GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT"); - } - float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size}; - - // size of one matrix is element_size * height * width. 
- size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1]; - size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3]; - - // scale stored at the end of weight. Also need transpose. - GGML_ASSERT(QK4_0 == QK8_0); - int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0}; - size_t scale_elem_size = sizeof(uint16_t); - size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, - scale_elem_size}; - size_t scale_stride = scale_elem_size * src0->ne[0] * src0->ne[1] / QK8_0; - char* scale_offset = (char*)src0->data + weight_size; - - // input - void* input_buffer; - size_t input_elem_size = sizeof(uint16_t); - int64_t input_ne[] = {src1->ne[0], src1->ne[1]}; - size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]}; - size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1]; - - ggml_cann_pool_alloc input_alloctor(ctx.pool()); - if (src1->type != GGML_TYPE_F16) { - aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1); - input_alloctor.alloc(ggml_nelements(src1) * input_elem_size); - input_buffer = input_alloctor.get(); - - int64_t* input_cast_ne = src1->ne; - size_t input_cast_nb[GGML_MAX_DIMS]; - input_cast_nb[0] = sizeof(uint16_t); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1]; - } - - aclTensor* acl_input_tensor = ggml_cann_create_tensor( - input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne, - input_cast_nb, GGML_MAX_DIMS); - aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16); - ACL_CHECK(aclDestroyTensor(acl_input_tensor)); - ACL_CHECK(aclDestroyTensor(acl_src1_tensor)); - } else { - input_buffer = src1->data; - } - - // output - size_t output_elem_size = sizeof(uint16_t); - int64_t output_ne[] = {dst->ne[0], dst->ne[1]}; - size_t output_nb[] = {output_elem_size, output_elem_size * dst->ne[0]}; - ggml_cann_pool_alloc output_alloctor( - ctx.pool(), ggml_nelements(dst) * output_elem_size); - void* output_buffer = output_alloctor.get(); - size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1]; - - // aclnn - uint64_t workspaceSize = 0; - aclOpExecutor* executor; - void* workspaceAddr = nullptr; - - for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) { - for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) { - int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]); - int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]); - - int64_t batch1 = n1 * src1->ne[2] + c1; - int64_t batch0 = n0 * src0->ne[2] + c0; - - aclTensor* acl_input_tensor = ggml_cann_create_tensor( - (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16, - input_elem_size, input_ne, input_nb, 2); - aclTensor* acl_weight_tensor = ggml_cann_create_tensor( - (char*)src0->data + batch0 * weight_stride, - ggml_cann_type_mapping(type), weight_elem_size, weight_ne, - weight_nb, 2); - aclTensor* acl_scale_tensor = ggml_cann_create_tensor( - scale_offset + batch0 * scale_stride, ACL_FLOAT16, - scale_elem_size, scale_ne, scale_nb, 2); - aclTensor* acl_output_tensor = ggml_cann_create_tensor( - (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16, - output_elem_size, output_ne, output_nb, 2); - - ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( - acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr, - nullptr, nullptr, nullptr, QK8_0, acl_output_tensor, - &workspaceSize, &executor)); - - if (workspaceSize > 0 && workspaceAddr == nullptr) { - ggml_cann_pool_alloc workspace_allocator(ctx.pool(), - workspaceSize); - workspaceAddr = workspace_allocator.get(); - } - - ACL_CHECK(aclnnWeightQuantBatchMatmulV2( - 
workspaceAddr, workspaceSize, executor, ctx.stream())); - - ACL_CHECK(aclDestroyTensor(acl_input_tensor)); - ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); - ACL_CHECK(aclDestroyTensor(acl_scale_tensor)); - ACL_CHECK(aclDestroyTensor(acl_output_tensor)); - } - } - - // cast out - int64_t* output_cast_ne = dst->ne; - size_t output_cast_nb[GGML_MAX_DIMS]; - output_cast_nb[0] = sizeof(uint16_t); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1]; - } - - aclTensor* acl_output_tensor = - ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size, - output_cast_ne, output_cast_nb, GGML_MAX_DIMS); - aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst); - aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ACL_FLOAT); - - ACL_CHECK(aclDestroyTensor(acl_output_tensor)); - ACL_CHECK(aclDestroyTensor(acl_dst_tensor)); -} - -/** - * @brief Performs matrix multiplication with quantized weights and - * floating-point inputs using the CANN backend. - * - * This function performs matrix multiplication of the input tensor `src1` and - * the weight tensor `src0`, handling broadcasting, transposing, and - * quantization as needed, and stores the result in the destination tensor - * `dst`. - * - * @param ctx The context for the CANN backend operations. - * @param dst The destination tensor where the result of the matrix - * multiplication will be stored. - */ -static void ggml_cann_mul_mat_quant2(ggml_backend_cann_context& ctx, ggml_tensor* dst, const enum ggml_type type) { ggml_tensor* src0 = dst->src[0]; // weight @@ -2979,11 +2786,11 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { switch (type) { case GGML_TYPE_F32: case GGML_TYPE_F16: - ggml_cann_mat_mul_fp2(ctx, dst); + ggml_cann_mat_mul_fp(ctx, dst); break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: - ggml_cann_mul_mat_quant2(ctx, dst, type); + ggml_cann_mul_mat_quant(ctx, dst, type); break; default: GGML_ABORT("fatal error"); diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index a9a1cfb6356c2..531e87c7a5d31 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -341,7 +341,6 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { std::vector map_offsets; /** - * @brief Constructor to initialize the buffer pool with virtual memory for * @brief Constructor to initialize the buffer pool with virtual memory for * a specific device. 
* @@ -1872,17 +1871,17 @@ struct ggml_backend_cann_device_context { }; static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) { - ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context; + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; return ctx->name.c_str(); } static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) { - ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context; + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; return ctx->description.c_str(); } static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context; + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; ggml_backend_cann_get_device_memory(ctx->device, free, total); } @@ -1909,7 +1908,7 @@ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_back static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) { GGML_UNUSED(params); - ggml_backend_cann_context * ctx = (ggml_backend_cann_context *)dev->context; + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; return ggml_backend_cann_init(ctx->device); } @@ -1929,7 +1928,7 @@ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, cons static bool ggml_backend_cann_supports_buft( ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { if (ggml_backend_buft_is_cann(buft)) { - ggml_backend_cann_context * dev_ctx = (ggml_backend_cann_context *)dev->context; + ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context; ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *)buft->context; return buft_ctx->device == dev_ctx->device; @@ -1938,7 +1937,7 @@ static bool ggml_backend_cann_supports_buft( } static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) { - ggml_backend_cann_context * ctx = (ggml_backend_cann_context*)dev->context; + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; return ggml_backend_cann_buffer_type(ctx->device); } @@ -1959,7 +1958,7 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type( */ static ggml_backend_event_t ggml_backend_cann_device_event_new( ggml_backend_dev_t dev) { - ggml_backend_cann_context * dev_ctx = (ggml_backend_cann_context *)dev->context; + ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context; ggml_cann_set_device(dev_ctx->device); @@ -2067,7 +2066,11 @@ ggml_backend_reg_t ggml_backend_cann_reg() { ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context; for (int i = 0; i < ggml_cann_info().device_count; i++) { - ggml_backend_cann_context* dev_ctx = new ggml_backend_cann_context(i); + ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context(); + dev_ctx->description = aclrtGetSocName(); + dev_ctx->device = i; + dev_ctx->name = GGML_CANN_NAME + std::to_string(i); + ggml_cann_set_device(i); ggml_backend_dev_t dev = new ggml_backend_device { /* .interface = */ ggml_backend_cann_device_interface, /* .reg = */ ®, @@ -2095,12 +2098,17 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) { return nullptr; } - 
ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device); + ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device); + if (ctx == nullptr) { + GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__); + return nullptr; + } + ggml_cann_set_device(ctx->device); ggml_backend_t cann_backend = new ggml_backend{/* .guid = */ ggml_backend_cann_guid(), /* .interface = */ ggml_backend_cann_interface, - /* .device = */ dev, - /* .context = */ dev->context}; + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device), + /* .context = */ ctx}; return cann_backend; } From 1c79893ca211bdc7c086bd6e12db86496f85ed44 Mon Sep 17 00:00:00 2001 From: shanshan shen Date: Tue, 26 Nov 2024 07:09:55 +0000 Subject: [PATCH 3/5] some modifications after review --- ggml/src/ggml-cann/ggml-cann.cpp | 38 +++++++++++++++++++------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 531e87c7a5d31..86d3fb1ff65b1 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -471,10 +471,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { */ std::unique_ptr ggml_backend_cann_context::new_pool_for_device( int device) { - if (device == 0) { - return std::unique_ptr(new ggml_cann_pool_vmm(device)); - } - return std::unique_ptr(new ggml_cann_pool_leg(device)); + return std::unique_ptr(new ggml_cann_pool_vmm(device)); } // cann buffer @@ -486,22 +483,21 @@ std::unique_ptr ggml_backend_cann_context::new_pool_for_device( */ struct ggml_backend_cann_buffer_context { int32_t device; ///< The device ID associated with this buffer context. - ggml_cann_pool_alloc* alloc; ///< Pointer to the device memory allocated for the buffer. + void* dev_ptr = nullptr; /** * @brief Constructor to initialize the CANN buffer context. * * @param device The device ID associated with this buffer context. - * @param alloc Pointer to the device memory allocated for the buffer. */ - ggml_backend_cann_buffer_context(int32_t device, ggml_cann_pool_alloc* alloc) + ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr) : device(device), - alloc(alloc) {} + dev_ptr(dev_ptr) {} /** * @brief Destructor to free the device memory allocated for the buffer. 
*/ - ~ggml_backend_cann_buffer_context() { delete alloc; } + ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr));} }; /** @@ -547,7 +543,7 @@ static void* ggml_backend_cann_buffer_get_base( ggml_backend_buffer_t buffer) { ggml_backend_cann_buffer_context* ctx = (ggml_backend_cann_buffer_context*)buffer->context; - return ctx->alloc->get(); + return ctx->dev_ptr; } /** @@ -954,7 +950,7 @@ static void ggml_backend_cann_buffer_clear( (ggml_backend_cann_buffer_context*)buffer->context; ggml_cann_set_device(ctx->device); - ACL_CHECK(aclrtMemset(ctx->alloc->get(), buffer->size, value, buffer->size)); + ACL_CHECK(aclrtMemset(ctx->dev_ptr, buffer->size, value, buffer->size)); } /** @@ -1016,13 +1012,25 @@ static const char* ggml_backend_cann_buffer_type_name( static ggml_backend_buffer_t ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_cann_context* cann_ctx = - (ggml_backend_cann_context*)buft->device->context; + ggml_backend_cann_buffer_type_context* buft_ctx = + (ggml_backend_cann_buffer_type_context*)buft->context; + + ggml_cann_set_device(buft_ctx->device); - ggml_cann_pool_alloc* alloc = new ggml_cann_pool_alloc(cann_ctx->pool(), size); + size = std::max(size, (size_t)1); + + void* dev_ptr; + aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST); + if (err != ACL_SUCCESS) { + GGML_LOG_ERROR( + "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n", + __func__, size / 1024.0 / 1024.0, buft_ctx->device, + aclGetRecentErrMsg()); + return nullptr; + } ggml_backend_cann_buffer_context* ctx = - new ggml_backend_cann_buffer_context(cann_ctx->device, alloc); + new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr); return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface, ctx, size); From e05a398fb3abb9dd94af4fccdcd1cf6b082829d3 Mon Sep 17 00:00:00 2001 From: shanshan shen Date: Tue, 26 Nov 2024 07:32:39 +0000 Subject: [PATCH 4/5] restore some modifications --- ggml/src/ggml-cann/aclnn_ops.cpp | 13 ++++++------- ggml/src/ggml-cann/ggml-cann.cpp | 8 +++++--- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 25ffd912670b7..881e50ac8170a 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2427,6 +2427,7 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input, aclTensor* acl_weight, aclTensor* acl_dst) { int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is // fp32, atlas a2 will transpose it to HFLOAT32. + uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr; @@ -2531,7 +2532,7 @@ static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, aclTensor* acl_inpu * multiplication will be stored. */ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, - ggml_tensor* dst) { + ggml_tensor* dst) { ggml_tensor* weight = dst->src[0]; // weight ggml_tensor* input = dst->src[1]; // input @@ -2596,8 +2597,8 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, * multiplication will be stored. 
*/ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, - ggml_tensor* dst, - const enum ggml_type type) { + ggml_tensor* dst, + const enum ggml_type type) { ggml_tensor* src0 = dst->src[0]; // weight ggml_tensor* src1 = dst->src[1]; // input @@ -2617,8 +2618,7 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size; size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3]; - // scale stored at the end of weight. - // scale need transpose. + // scale stored at the end of weight. Also need transpose. size_t scale_elem_size = sizeof(uint16_t); size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size}; size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size; @@ -2677,8 +2677,7 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, int64_t batch0 = (n0 * src0->ne[2]) + c0; aclTensor* acl_input_tensor = ggml_cann_create_tensor( - (char*)input_buffer + batch1 * input_stride, - ACL_FLOAT16, + (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16, input_elem_size, input_ne, input_nb, 2); // first split diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 3bb1666948433..ebccc51dedc3e 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -301,7 +301,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool { */ struct ggml_cann_pool_vmm : public ggml_cann_pool { /** - * @brief The maximum size of the virtual memory pool. + * @brief The maximum size of the virtual memory pool (32 GB). */ size_t max_size; @@ -483,12 +483,14 @@ std::unique_ptr ggml_backend_cann_context::new_pool_for_device( */ struct ggml_backend_cann_buffer_context { int32_t device; ///< The device ID associated with this buffer context. - void* dev_ptr = nullptr; + void* dev_ptr = + nullptr; ///< Pointer to the device memory allocated for the buffer. /** * @brief Constructor to initialize the CANN buffer context. * * @param device The device ID associated with this buffer context. + * @param dev_ptr Pointer to the device memory allocated for the buffer. */ ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr) : device(device), @@ -497,7 +499,7 @@ struct ggml_backend_cann_buffer_context { /** * @brief Destructor to free the device memory allocated for the buffer. */ - ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr));} + ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); } }; /** From 33fd470550b112cabc4c35fe797c437c93f417c0 Mon Sep 17 00:00:00 2001 From: shanshan shen Date: Tue, 26 Nov 2024 08:49:26 +0000 Subject: [PATCH 5/5] restore some modifications --- ggml/src/ggml-cann/aclnn_ops.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 881e50ac8170a..4bfd5b5ddbd75 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2427,7 +2427,6 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input, aclTensor* acl_weight, aclTensor* acl_dst) { int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is // fp32, atlas a2 will transpose it to HFLOAT32. - uint64_t workspaceSize = 0; aclOpExecutor* executor; void* workspaceAddr = nullptr;
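One detail of the quantized path that survives all five patches is the column splitting in ggml_cann_mul_mat_quant: each aclnnWeightQuantBatchMatmulV2 call is capped at max_elem_size = 65535 elements along the weight's ne[1] (presumably an operator-side size limit), so that dimension is processed in slices and the weight, scale, and output views advance by a matching offset after each slice. A minimal sketch of just that split arithmetic, with the patch's 2-D byte offsets reduced to a 1-D element count for clarity:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t max_elem_size = 65535;  // per-call cap used by the patch
    const int64_t n = 100000;             // hypothetical src0->ne[1]

    int64_t split_size  = n / max_elem_size + 1;       // as computed in the patch
    int64_t ne0         = std::min(n, max_elem_size);  // first slice
    int64_t elem_offset = 0;                           // offset in elements, not bytes

    for (int64_t split = 0; split < split_size; split++) {
        std::printf("split %lld: %lld elements at offset %lld\n",
                    (long long) split, (long long) ne0, (long long) elem_offset);
        // Advance exactly like the patch: the offset grows by the elements
        // just consumed, the next slice takes the remainder or a full cap.
        elem_offset += ne0;
        ne0 = std::min(n - elem_offset, max_elem_size);
    }
    return 0;
}
```

In the patch itself each advance is additionally multiplied by the row length and the element size (weight_elem_size, scale_elem_size, output_elem_size), since the offsets index 2-D views of the batched buffers.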