
Add CUDA option to use the max release threshold for the default memory pool #5429

Draft: wants to merge 1 commit into base: master
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -96,6 +96,7 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING "llama: max. batch size for using peer access")
+option(LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD "llama: use max release threshold for memory pool" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
@@ -349,6 +350,9 @@ if (LLAMA_CUBLAS)
endif()
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
+if (LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD)
+add_compile_definitions(GGML_CUDA_USE_MAX_RELEASE_THRESHOLD)
+endif()

if (LLAMA_STATIC)
if (WIN32)
3 changes: 3 additions & 0 deletions Makefile
@@ -413,6 +413,9 @@ else ifdef LLAMA_CUDA_DMMV_Y
else
MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
endif # LLAMA_CUDA_MMV_Y
+ifdef LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD
+MK_NVCCFLAGS += -DGGML_CUDA_USE_MAX_RELEASE_THRESHOLD
+endif # LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD
ifdef LLAMA_CUDA_F16
MK_NVCCFLAGS += -DGGML_CUDA_F16
endif # LLAMA_CUDA_F16
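With the build wiring above, the option would presumably be enabled at build time with something like make LLAMA_CUBLAS=1 LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD=1, or by passing -DLLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD=ON alongside -DLLAMA_CUBLAS=ON when configuring with CMake (these command lines are illustrative, not taken from the PR).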
22 changes: 19 additions & 3 deletions ggml-cuda.cu
@@ -153,6 +153,11 @@
// max batch size to use MMQ kernels when tensor cores are available
#define MMQ_MAX_BATCH_SIZE 32

+// The release threshold specifies the maximum amount of memory the CUDA memory pool caches.
+// The default value is 0, which means all unused memory is released back to the OS on every synchronization operation.
+// Define this to use the maximum release threshold. Recommended when a single process uses the GPU device.
+// #define GGML_CUDA_USE_MAX_RELEASE_THRESHOLD

#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300

@@ -10628,7 +10633,9 @@ GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer)

GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-CUDA_CHECK(cudaFree(ctx->dev_ptr));
+cudaStream_t main_stream = g_cudaStreams[ctx->device][0];
+CUDA_CHECK(cudaFreeAsync(ctx->dev_ptr, main_stream));
+CUDA_CHECK(cudaStreamSynchronize(main_stream));
delete ctx;
}

@@ -10744,10 +10751,12 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe

size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0

+cudaStream_t main_stream = g_cudaStreams[buft_ctx->device][0];
void * dev_ptr;
-cudaError_t err = cudaMalloc(&dev_ptr, size);
+cudaError_t err = cudaMallocAsync(&dev_ptr, size, main_stream);
@Artefact2 (Collaborator) commented on Feb 9, 2024:

This breaks the build on ROCm. Consider

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 459e4376..9dacf34b 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -73,6 +73,8 @@
 #define cudaMalloc hipMalloc
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
 #endif
+#define cudaMallocAsync hipMallocAsync
+#define cudaFreeAsync hipFreeAsync
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpyAsync hipMemcpyAsync
 #define cudaMemcpyPeerAsync hipMemcpyPeerAsync
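
For reference: hipMallocAsync and hipFreeAsync are HIP's stream-ordered allocation counterparts (available in recent ROCm releases), so aliasing them in the HIPBLAS define block as suggested should be enough to restore the ROCm build.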

+cudaStreamSynchronize(main_stream);
if (err != cudaSuccess) {
-fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
+fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMallocAsync failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
return nullptr;
}

@@ -11415,6 +11424,13 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
/* .context = */ ctx
};

+#if defined(GGML_CUDA_USE_MAX_RELEASE_THRESHOLD)
+uint64_t release_threshold = UINT64_MAX;
+cudaMemPool_t default_mem_pool;
+CUDA_CHECK(cudaDeviceGetDefaultMemPool(&default_mem_pool, device));
+CUDA_CHECK(cudaMemPoolSetAttribute(default_mem_pool, cudaMemPoolAttrReleaseThreshold, &release_threshold));
+#endif

return cuda_backend;
}

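Taken together, the ggml-cuda.cu changes switch buffer allocation to the stream-ordered allocator (cudaMallocAsync / cudaFreeAsync) and, when GGML_CUDA_USE_MAX_RELEASE_THRESHOLD is defined, raise the default memory pool's release threshold to UINT64_MAX so freed blocks stay cached in the pool instead of being returned to the OS at every synchronization. The standalone sketch below illustrates the same technique outside of llama.cpp; it assumes a CUDA 11.2+ toolkit (stream-ordered memory allocator), and the file name, CUDA_CHECK macro, loop count, and allocation size are illustrative, not taken from the PR.

// pool_demo.cu -- minimal sketch of the release-threshold technique (hypothetical file, not part of the PR)
// Build (assumed): nvcc -o pool_demo pool_demo.cu
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                    \
                    cudaGetErrorString(err_), __FILE__, __LINE__);         \
            return 1;                                                      \
        }                                                                  \
    } while (0)

int main() {
    int device = 0;
    CUDA_CHECK(cudaSetDevice(device));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // Mirror of the ggml_backend_cuda_init() hunk: fetch the device's default memory pool
    // and set its release threshold to the maximum, so the pool never trims itself.
    cudaMemPool_t pool;
    CUDA_CHECK(cudaDeviceGetDefaultMemPool(&pool, device));
    uint64_t threshold = UINT64_MAX;
    CUDA_CHECK(cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold));

    // Mirror of the buffer alloc/free hunks: stream-ordered allocation and free.
    // With the max threshold, memory freed by cudaFreeAsync stays cached in the pool,
    // so the next cudaMallocAsync of a similar size can be served without a new driver allocation.
    for (int i = 0; i < 4; ++i) {
        void * ptr = nullptr;
        CUDA_CHECK(cudaMallocAsync(&ptr, 256u << 20, stream)); // 256 MiB
        CUDA_CHECK(cudaFreeAsync(ptr, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));
    }

    // Inspect how much memory the pool is currently holding on to.
    uint64_t reserved = 0;
    CUDA_CHECK(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrReservedMemCurrent, &reserved));
    printf("pool reserved: %.2f MiB\n", reserved / 1024.0 / 1024.0);

    CUDA_CHECK(cudaStreamDestroy(stream));
    return 0;
}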