
Add CUDA option to use the max release threshold for the default memory pool #5429

Draft: wants to merge 1 commit into base: master
4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -96,6 +96,7 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING "llama: max. batch size for using peer access")
+option(LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD "llama: use max release threshold for memory pool" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
@@ -349,6 +350,9 @@ if (LLAMA_CUBLAS)
endif()
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
+if (LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD)
+add_compile_definitions(GGML_CUDA_USE_MAX_RELEASE_THRESHOLD)
+endif()

if (LLAMA_STATIC)
if (WIN32)
3 changes: 3 additions & 0 deletions Makefile
@@ -413,6 +413,9 @@ else ifdef LLAMA_CUDA_DMMV_Y
else
MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
endif # LLAMA_CUDA_MMV_Y
+ifdef LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD
+MK_NVCCFLAGS += -DGGML_CUDA_USE_MAX_RELEASE_THRESHOLD
+endif # LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD
ifdef LLAMA_CUDA_F16
MK_NVCCFLAGS += -DGGML_CUDA_F16
endif # LLAMA_CUDA_F16
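With the build wiring above, the option would presumably be enabled at build time with something like make LLAMA_CUBLAS=1 LLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD=1, or by passing -DLLAMA_CUDA_USE_MAX_RELEASE_THRESHOLD=ON alongside -DLLAMA_CUBLAS=ON when configuring with CMake (these command lines are illustrative, not taken from the PR).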
22 changes: 19 additions & 3 deletions ggml-cuda.cu
@@ -153,6 +153,11 @@
// max batch size to use MMQ kernels when tensor cores are available
#define MMQ_MAX_BATCH_SIZE 32

+// The release threshold specifies the maximum amount of memory the CUDA memory pool caches.
+// The default value is 0, which means all unused memory is released back to the OS on every synchronization operation.
+// Define this to use the maximum release threshold. Recommended when a single process uses the GPU device.
+// #define GGML_CUDA_USE_MAX_RELEASE_THRESHOLD

#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300

@@ -10628,7 +10633,9 @@ GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer)

GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
-CUDA_CHECK(cudaFree(ctx->dev_ptr));
+cudaStream_t main_stream = g_cudaStreams[ctx->device][0];
+CUDA_CHECK(cudaFreeAsync(ctx->dev_ptr, main_stream));
+CUDA_CHECK(cudaStreamSynchronize(main_stream));
delete ctx;
}

@@ -10744,10 +10751,12 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe

size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0

+cudaStream_t main_stream = g_cudaStreams[buft_ctx->device][0];
void * dev_ptr;
-cudaError_t err = cudaMalloc(&dev_ptr, size);
+cudaError_t err = cudaMallocAsync(&dev_ptr, size, main_stream);
@Artefact2 (Collaborator) commented on Feb 9, 2024:

This breaks the build on ROCm. Consider

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 459e4376..9dacf34b 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -73,6 +73,8 @@
 #define cudaMalloc hipMalloc
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
 #endif
+#define cudaMallocAsync hipMallocAsync
+#define cudaFreeAsync hipFreeAsync
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpyAsync hipMemcpyAsync
 #define cudaMemcpyPeerAsync hipMemcpyPeerAsync
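
For reference: hipMallocAsync and hipFreeAsync are HIP's stream-ordered allocation counterparts (available in recent ROCm releases), so aliasing them in the HIPBLAS define block as suggested should be enough to restore the ROCm build.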

+cudaStreamSynchronize(main_stream);
if (err != cudaSuccess) {
-fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
+fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMallocAsync failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
return nullptr;
}

@@ -11415,6 +11424,13 @@ GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
/* .context = */ ctx
};

+#if defined(GGML_CUDA_USE_MAX_RELEASE_THRESHOLD)
+uint64_t release_threshold = UINT64_MAX;
+cudaMemPool_t default_mem_pool;
+CUDA_CHECK(cudaDeviceGetDefaultMemPool(&default_mem_pool, device));
+CUDA_CHECK(cudaMemPoolSetAttribute(default_mem_pool, cudaMemPoolAttrReleaseThreshold, &release_threshold));
+#endif

return cuda_backend;
}

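Taken together, the ggml-cuda.cu changes switch buffer allocation to the stream-ordered allocator (cudaMallocAsync / cudaFreeAsync) and, when GGML_CUDA_USE_MAX_RELEASE_THRESHOLD is defined, raise the default memory pool's release threshold to UINT64_MAX so freed blocks stay cached in the pool instead of being returned to the OS at every synchronization. The standalone sketch below illustrates the same technique outside of llama.cpp; it assumes a CUDA 11.2+ toolkit (stream-ordered memory allocator), and the file name, CUDA_CHECK macro, loop count, and allocation size are illustrative, not taken from the PR.

// pool_demo.cu -- minimal sketch of the release-threshold technique (hypothetical file, not part of the PR)
// Build (assumed): nvcc -o pool_demo pool_demo.cu
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                    \
                    cudaGetErrorString(err_), __FILE__, __LINE__);         \
            return 1;                                                      \
        }                                                                  \
    } while (0)

int main() {
    int device = 0;
    CUDA_CHECK(cudaSetDevice(device));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // Mirror of the ggml_backend_cuda_init() hunk: fetch the device's default memory pool
    // and set its release threshold to the maximum, so the pool never trims itself.
    cudaMemPool_t pool;
    CUDA_CHECK(cudaDeviceGetDefaultMemPool(&pool, device));
    uint64_t threshold = UINT64_MAX;
    CUDA_CHECK(cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold));

    // Mirror of the buffer alloc/free hunks: stream-ordered allocation and free.
    // With the max threshold, memory freed by cudaFreeAsync stays cached in the pool,
    // so the next cudaMallocAsync of a similar size can be served without a new driver allocation.
    for (int i = 0; i < 4; ++i) {
        void * ptr = nullptr;
        CUDA_CHECK(cudaMallocAsync(&ptr, 256u << 20, stream)); // 256 MiB
        CUDA_CHECK(cudaFreeAsync(ptr, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));
    }

    // Inspect how much memory the pool is currently holding on to.
    uint64_t reserved = 0;
    CUDA_CHECK(cudaMemPoolGetAttribute(pool, cudaMemPoolAttrReservedMemCurrent, &reserved));
    printf("pool reserved: %.2f MiB\n", reserved / 1024.0 / 1024.0);

    CUDA_CHECK(cudaStreamDestroy(stream));
    return 0;
}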