diff --git a/CMakeLists.txt b/CMakeLists.txt
index a544f2da69d33..049515c83e95d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -96,6 +96,7 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                              "llama: max. batch size for using peer access")
+option(LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD "llama: use max release threshold for memory pool" OFF)
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
@@ -349,6 +350,9 @@ if (LLAMA_CUBLAS)
         endif()
         add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
         add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
+        if (LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD)
+            add_compile_definitions(GGML_CUDA_USE_MAX_RELEASE_THRESHOLD)
+        endif()
 
         if (LLAMA_STATIC)
             if (WIN32)
diff --git a/Makefile b/Makefile
index ba73f063709c7..c7783b9080f50 100644
--- a/Makefile
+++ b/Makefile
@@ -413,6 +413,9 @@ else ifdef LLAMA_CUDA_DMMV_Y
 else
 	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
 endif # LLAMA_CUDA_MMV_Y
+ifdef LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD
+	MK_NVCCFLAGS += -DGGML_CUDA_USE_MAX_RELEASE_THRESHOLD
+endif # LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD
 ifdef LLAMA_CUDA_F16
 	MK_NVCCFLAGS += -DGGML_CUDA_F16
 endif # LLAMA_CUDA_F16
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index db9da24594cb2..8b9be462d9d98 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -153,6 +153,11 @@
 
 // max batch size to use MMQ kernels when tensor cores are available
 #define MMQ_MAX_BATCH_SIZE 32
 
+// The release threshold specifies the maximum amount of memory the CUDA memory pool caches.
+// The default value is 0, meaning all unused memory is released back to the OS on every synchronization operation.
+// Define this to use the maximum release threshold; recommended when a single process uses the GPU device.
+// #define GGML_CUDA_USE_MAX_RELEASE_THRESHOLD
+
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
@@ -10628,7 +10633,9 @@ GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer)
 
 GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-    CUDA_CHECK(cudaFree(ctx->dev_ptr));
+    cudaStream_t main_stream = g_cudaStreams[ctx->device][0];
+    CUDA_CHECK(cudaFreeAsync(ctx->dev_ptr, main_stream));
+    CUDA_CHECK(cudaStreamSynchronize(main_stream));
     delete ctx;
 }
 
@@ -10744,10 +10751,12 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
 
     size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
 
+    cudaStream_t main_stream = g_cudaStreams[buft_ctx->device][0];
     void * dev_ptr;
-    cudaError_t err = cudaMalloc(&dev_ptr, size);
+    cudaError_t err = cudaMallocAsync(&dev_ptr, size, main_stream);
+    cudaStreamSynchronize(main_stream);
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
+        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMallocAsync failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }
 
@@ -11415,6 +11424,13 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
         /* .context   = */ ctx
     };
 
+#if defined(GGML_CUDA_USE_MAX_RELEASE_THRESHOLD)
+    uint64_t release_threshold = UINT64_MAX;
+    cudaMemPool_t default_mem_pool;
+    CUDA_CHECK(cudaDeviceGetDefaultMemPool(&default_mem_pool, device));
+    CUDA_CHECK(cudaMemPoolSetAttribute(default_mem_pool, cudaMemPoolAttrReleaseThreshold, &release_threshold));
+#endif
+
     return cuda_backend;
 }
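
For reference, below is a minimal standalone sketch (not part of the patch) of the stream-ordered allocator behaviour this change builds on: raising cudaMemPoolAttrReleaseThreshold on the device's default memory pool to UINT64_MAX so memory freed with cudaFreeAsync stays cached in the pool across synchronizations instead of being returned to the OS. The CHECK macro, device index, and allocation sizes here are illustrative only and do not appear in the patch.

// mempool_threshold_demo.cu -- build with: nvcc -o mempool_threshold_demo mempool_threshold_demo.cu
#include <cstdio>
#include <cstdint>
#include <cuda_runtime.h>

#define CHECK(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        fprintf(stderr, "%s failed: %s\n", #call, cudaGetErrorString(err_)); \
        return 1; \
    } \
} while (0)

int main() {
    int device = 0;
    CHECK(cudaSetDevice(device));

    // Raise the release threshold of the device's default memory pool to the maximum,
    // so blocks freed with cudaFreeAsync remain cached after a synchronization
    // (with the default threshold of 0 they would be released back to the OS).
    cudaMemPool_t pool;
    CHECK(cudaDeviceGetDefaultMemPool(&pool, device));
    uint64_t threshold = UINT64_MAX;
    CHECK(cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold));

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // Allocate and free from the pool in stream order, then synchronize.
    void * ptr = nullptr;
    CHECK(cudaMallocAsync(&ptr, 64 * 1024 * 1024, stream));
    CHECK(cudaFreeAsync(ptr, stream));
    CHECK(cudaStreamSynchronize(stream));

    // A second allocation of the same size can be served from the cached block.
    CHECK(cudaMallocAsync(&ptr, 64 * 1024 * 1024, stream));
    CHECK(cudaFreeAsync(ptr, stream));
    CHECK(cudaStreamSynchronize(stream));

    // Report how much memory the pool still holds after the synchronization.
    uint64_t reserved = 0;
    CHECK(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrReservedMemCurrent, &reserved));
    printf("pool reserved after sync: %llu bytes\n", (unsigned long long) reserved);

    CHECK(cudaStreamDestroy(stream));
    return 0;
}

With the default threshold of 0 the reserved size reported after the synchronization is typically 0; with the maximum threshold the pool keeps the freed block and reuses it for the second allocation, which is the cost the GGML_CUDA_USE_MAX_RELEASE_THRESHOLD define trades for avoiding repeated OS-level allocations.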