Skip to content

Commit 0a8cdc3

Browse files
committed
llama : add llama_max_parallel_sequences()
ggml-ci
1 parent 44856a7 commit 0a8cdc3

File tree

5 files changed

+16
-4
lines changed

5 files changed

+16
-4
lines changed

include/llama.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,7 @@ extern "C" {
471471
LLAMA_API int64_t llama_time_us(void);
472472

473473
LLAMA_API size_t llama_max_devices(void);
474+
LLAMA_API size_t llama_max_parallel_sequences(void);
474475

475476
LLAMA_API bool llama_supports_mmap (void);
476477
LLAMA_API bool llama_supports_mlock (void);

src/llama-context.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,12 @@ llama_context::llama_context(
2525

2626
const auto & hparams = model.hparams;
2727

28-
cparams.n_seq_max = std::max(1u, params.n_seq_max);
28+
cparams.n_seq_max = std::max(1u, params.n_seq_max);
29+
if (cparams.n_seq_max > LLAMA_MAX_PARALLEL_SEQUENCES) {
30+
LLAMA_LOG_WARN("%s: n_seq_max (%d) is larger than the maximum supported (%d) - clamping\n", __func__, cparams.n_seq_max, LLAMA_MAX_PARALLEL_SEQUENCES);
31+
cparams.n_seq_max = LLAMA_MAX_PARALLEL_SEQUENCES;
32+
}
33+
2934
cparams.n_threads = params.n_threads;
3035
cparams.n_threads_batch = params.n_threads_batch;
3136
cparams.yarn_ext_factor = params.yarn_ext_factor;

src/llama-impl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
#include <string>
66
#include <vector>
77

8+
#define LLAMA_MAX_PARALLEL_SEQUENCES 64
9+
810
#ifdef __GNUC__
911
# if defined(__MINGW32__) && !defined(__clang__)
1012
# define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))

src/llama-kv-cells.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#pragma once
22

3-
#include "llama.h"
3+
#include "llama-impl.h"
44

55
#include <bitset>
66
#include <cassert>
@@ -119,7 +119,7 @@ class llama_kv_cells_unified {
119119
seq[i].reset(seq_id);
120120

121121
if (seq[i].none()) {
122-
pos[i]= -1;
122+
pos[i] = -1;
123123

124124
used--;
125125

@@ -267,6 +267,6 @@ class llama_kv_cells_unified {
267267
std::vector<llama_pos> shift;
268268

269269
// TODO: assert n_seq_max <= 64
270-
std::vector<std::bitset<64>> seq;
270+
std::vector<std::bitset<LLAMA_MAX_PARALLEL_SEQUENCES>> seq;
271271
};
272272

src/llama.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ size_t llama_max_devices(void) {
3737
return 16;
3838
}
3939

40+
size_t llama_max_parallel_sequences(void) {
41+
return LLAMA_MAX_PARALLEL_SEQUENCES;
42+
}
43+
4044
bool llama_supports_mmap(void) {
4145
return llama_mmap::SUPPORTED;
4246
}

0 commit comments

Comments (0)