2 files changed: +36 −4 lines changed

@@ -677,12 +677,14 @@ extern "C" {
 
     // Returns the smallest position present in the KV cache for the specified sequence
     // This is typically non-zero only for SWA caches
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
     // Return -1 if the sequence is empty
     LLAMA_API llama_pos llama_kv_self_seq_pos_min(
             struct llama_context * ctx,
                    llama_seq_id   seq_id);
 
     // Returns the largest position present in the KV cache for the specified sequence
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
     // Return -1 if the sequence is empty
     LLAMA_API llama_pos llama_kv_self_seq_pos_max(
             struct llama_context * ctx,
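The two added comments pin down a useful invariant: per sequence, the KV cache holds a contiguous range of positions, so the (pos_min, pos_max) pair fully describes what is cached. A minimal caller-side sketch of what that enables (the helper name kv_seq_n_pos is illustrative, not part of the API):

    #include "llama.h"

    // number of cached positions for a sequence; relies on the documented
    // guarantee that every position in [pos_min, pos_max] is present
    static int32_t kv_seq_n_pos(struct llama_context * ctx, llama_seq_id seq_id) {
        const llama_pos p_min = llama_kv_self_seq_pos_min(ctx, seq_id);
        const llama_pos p_max = llama_kv_self_seq_pos_max(ctx, seq_id);

        if (p_min < 0) {
            return 0; // both functions return -1 when the sequence is empty
        }

        // contiguous by the guarantee above, so no need to scan for gaps
        return (int32_t) (p_max - p_min + 1);
    }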
@@ -6,9 +6,10 @@
 #include "llama-model.h"
 #include "llama-kv-cache.h"
 
+#include <cinttypes>
 #include <cstring>
+#include <limits>
 #include <stdexcept>
-#include <cinttypes>
 
 //
 // llama_context
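(Context for the include changes: the new <limits> header backs the std::numeric_limits calls in the decode-failure handling below, and <cinttypes> appears to move up simply so the standard includes stay alphabetized.)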
@@ -951,19 +952,48 @@ int llama_context::decode(llama_batch & inp_batch) {
 
     res->set_inputs(&ubatch);
 
+    int ret = 0;
+
     const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1);
     if (compute_status != GGML_STATUS_SUCCESS) {
         switch (compute_status) {
             case GGML_STATUS_ABORTED:
-                return 2;
+                {
+                    ret = 2;
+                } break;
             case GGML_STATUS_ALLOC_FAILED:
-                return -2;
+                {
+                    ret = -2;
+                } break;
             case GGML_STATUS_FAILED:
             default:
-                return -3;
+                {
+                    ret = -3;
+                }
         }
     }
 
+    if (ret != 0) {
+        // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
+        llama_pos pos_min[LLAMA_MAX_PARALLEL_SEQUENCES] = { std::numeric_limits<llama_pos>::max() };
+
+        for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
+            const auto & seq_id = ubatch.seq_id[i][0];
+
+            pos_min[seq_id] = std::min(pos_min[seq_id], ubatch.pos[i]);
+        }
+
+        for (int s = 0; s < LLAMA_MAX_PARALLEL_SEQUENCES; ++s) {
+            if (pos_min[s] == std::numeric_limits<llama_pos>::max()) {
+                continue;
+            }
+
+            llama_kv_self_seq_rm(this, s, pos_min[s], -1);
+        }
+
+        return ret;
+    }
+
     // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {
     //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
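One C++ subtlety in the rollback block: the initializer { std::numeric_limits<llama_pos>::max() } sets only pos_min[0] to max(); the remaining array elements are value-initialized to 0. As written, a sequence id greater than 0 with no tokens in the failed ubatch starts at 0 rather than max(), fails the continue check, and gets its cache cleared from position 0 by llama_kv_self_seq_rm. If that is unintended, an explicit fill would be needed. A standalone sketch of the initializer semantics (illustrative only, not part of the PR):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <limits>

    typedef int32_t llama_pos; // same typedef as in llama.h

    int main() {
        const int N = 4; // stand-in for LLAMA_MAX_PARALLEL_SEQUENCES

        // aggregate initialization: only a[0] is max(), a[1..N-1] become 0
        llama_pos a[N] = { std::numeric_limits<llama_pos>::max() };
        std::printf("a[0] = %d, a[1] = %d\n", (int) a[0], (int) a[1]);

        // an explicit fill sets every slot to max()
        llama_pos b[N];
        std::fill(b, b + N, std::numeric_limits<llama_pos>::max());
        std::printf("b[0] = %d, b[1] = %d\n", (int) b[0], (int) b[1]);

        return 0;
    }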