
Commit c4fea7f: fix qwen2vl mrope position input

1 parent d18a79e

File tree: 5 files changed, +60 -43 lines

examples/llava/qwen2vl-cli.cpp

Lines changed: 10 additions & 8 deletions

@@ -66,8 +66,17 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct lla
         memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + processed], n_eval * sizeof(llama_pos));
         memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + processed], n_eval * sizeof(llama_pos));

+        // transpose from layout 0123012301230123 to 0000111122223333
+        // TODO @ngxson : this is a low-effort solution, generated with the help of LLM; we should improve this in the future
+        std::vector<llama_pos> batch_mrope_pos_T(n_eval * 4);
+        for (int r = 0; r < 4; r++) {
+            for (int c = 0; c < n_eval; c++) {
+                batch_mrope_pos_T[c*4 + r] = batch_mrope_pos[r*n_eval + c];
+            }
+        }
+
         float * batch_embd = image_embed->embed+i*n_embd;
-        const llama_pos * pos = batch_mrope_pos.data();
+        const llama_pos * pos = batch_mrope_pos_T.data();
         auto batch = llama_batch_ext_ptr::init_from_embd(ctx_llama, batch_embd, n_eval, n_embd, pos, 0);

         if (llama_decode_ext(ctx_llama, batch.get())) {

@@ -90,13 +99,6 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
             n_eval = n_batch;
         }

-        // TODO: add mrope pos ids somewhere else
-        pos.resize(n_eval * 4);
-        std::fill(pos.begin(), pos.end(), 0);
-        for (int j = 0; j < n_eval * 3; j ++) {
-            pos[j] = *st_pos_id + (j % n_eval);
-        }
-
         llama_batch_ext_ptr batch(ctx_llama);
         for (int j = 0; j < n_eval; j++) {
            llama_token token = tokens[i + j];
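For readers following the layout change: the CLI builds batch_mrope_pos dimension-major (one block of n_eval entries per M-RoPE component, as the memcpy calls show), while the batch API now expects the positions token-major, with the four components of each token stored next to each other. Below is a minimal standalone sketch of that rearrangement, using made-up values; it is not part of the commit, but the inner assignment is the same index swap as in the hunk above.

```cpp
#include <cstdio>
#include <vector>

// Illustration only: 3 tokens, 4 M-RoPE components per token.
// dim_major is the layout the CLI builds, token_major is what the batch API expects.
int main() {
    const int n_eval = 3;
    std::vector<int> dim_major = {
        10, 11, 12,   // component 0 of tokens 0..2
        20, 21, 22,   // component 1
        30, 31, 32,   // component 2
        40, 41, 42,   // component 3
    };
    std::vector<int> token_major(n_eval * 4);
    for (int r = 0; r < 4; r++) {           // r = M-RoPE component
        for (int c = 0; c < n_eval; c++) {  // c = token index
            token_major[c*4 + r] = dim_major[r*n_eval + c];
        }
    }
    for (int c = 0; c < n_eval; c++) {
        printf("token %d: %d %d %d %d\n", c,
               token_major[c*4 + 0], token_major[c*4 + 1],
               token_major[c*4 + 2], token_major[c*4 + 3]);
    }
    return 0;
}
```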

include/llama.h

Lines changed: 1 addition & 0 deletions

@@ -928,6 +928,7 @@ extern "C" {
     // Same with llama_batch_init, but initializes the batch with the provided raw embeddings
     // Size of embd should be n_tokens * n_embd
     // Size of pos should be n_tokens * n_pos_per_token
+    // If one token has multiple pos, the pos must follow the order: 000011112222...
     // n_embd is the number of embeddings per token, can be obtained from llama_model_n_embd()
     // The sequence ID will be fixed to seq_id
     // The batch has to be freed with llama_batch_ext_free()
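A hedged usage sketch of the documented ordering: with n_pos_per_token == 4 (the qwen2vl M-RoPE case on this branch), the pos buffer handed to llama_batch_ext_init_from_embd is filled token by token. get_mrope_pos() is a hypothetical helper standing in for whatever produces the per-token components; it is not part of this commit.

```cpp
#include <array>
#include <vector>
#include "llama.h" // llama_batch_ext API as declared on this branch

// Hypothetical helper, not part of the commit: the 4 M-RoPE components of token t.
std::array<llama_pos, 4> get_mrope_pos(int t);

// Sketch, assuming n_pos_per_token == 4: fill pos token-major (000011112222...)
// before passing it to llama_batch_ext_init_from_embd.
llama_batch_ext * make_embd_batch(llama_context * ctx, float * embd, int n_tokens, int n_embd) {
    std::vector<llama_pos> pos(n_tokens * 4);
    for (int t = 0; t < n_tokens; t++) {
        const std::array<llama_pos, 4> p = get_mrope_pos(t);
        for (int d = 0; d < 4; d++) {
            pos[t*4 + d] = p[d]; // the 4 components of token t are adjacent
        }
    }
    return llama_batch_ext_init_from_embd(ctx, embd, n_tokens, n_embd, pos.data(), /*seq_id=*/0);
}
```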

src/llama-batch.cpp

Lines changed: 42 additions & 34 deletions

@@ -276,15 +276,16 @@ void llama_sbatch::from_batch(const llama_batch_ext & batch, size_t n_embd, bool

 llama_batch_allocr::llama_batch_allocr(struct llama_batch & in_batch, llama_pos p0) {
     batch = new llama_batch_ext{
-        /*n_tokens       =*/ in_batch.n_tokens,
-        /*max_tokens     =*/ in_batch.n_tokens,
-        /*is_view        =*/ false,
-        /*tokens         =*/ in_batch.token,
-        /*embd           =*/ in_batch.embd,
-        /*pos            =*/ in_batch.pos,
-        /*n_seq_id       =*/ in_batch.n_seq_id,
-        /*seq_id         =*/ in_batch.seq_id,
-        /*logits         =*/ in_batch.logits,
+        /*n_tokens        =*/ in_batch.n_tokens,
+        /*max_tokens      =*/ in_batch.n_tokens,
+        /*n_pos_per_token =*/ 1,
+        /*is_view         =*/ false,
+        /*tokens          =*/ in_batch.token,
+        /*embd            =*/ in_batch.embd,
+        /*pos             =*/ in_batch.pos,
+        /*n_seq_id        =*/ in_batch.n_seq_id,
+        /*seq_id          =*/ in_batch.seq_id,
+        /*logits          =*/ in_batch.logits,
     };
     GGML_ASSERT(batch->n_tokens > 0);
     if (!in_batch.pos) {

@@ -338,17 +339,18 @@ struct llama_batch llama_batch_get_one(
     };
 }

-static struct llama_batch_ext * llama_batch_ext_init_impl(int32_t n_tokens_alloc, int32_t n_embd, int32_t n_seq_max) {
+static struct llama_batch_ext * llama_batch_ext_init_impl(int32_t n_tokens_alloc, int32_t n_embd, int32_t n_seq_max, int32_t n_pos_per_token) {
     llama_batch_ext * batch = new llama_batch_ext{
-        /*n_tokens       =*/ 0,
-        /*max_tokens     =*/ n_tokens_alloc,
-        /*is_view        =*/ false,
-        /*tokens         =*/ nullptr,
-        /*embd           =*/ nullptr,
-        /*pos            =*/ nullptr,
-        /*n_seq_id       =*/ nullptr,
-        /*seq_id         =*/ nullptr,
-        /*logits         =*/ nullptr,
+        /*n_tokens        =*/ 0,
+        /*max_tokens      =*/ n_tokens_alloc,
+        /*n_pos_per_token =*/ n_pos_per_token,
+        /*is_view         =*/ false,
+        /*tokens          =*/ nullptr,
+        /*embd            =*/ nullptr,
+        /*pos             =*/ nullptr,
+        /*n_seq_id        =*/ nullptr,
+        /*seq_id          =*/ nullptr,
+        /*logits          =*/ nullptr,
     };

     if (n_embd) {

@@ -371,7 +373,8 @@ static struct llama_batch_ext * llama_batch_ext_init_impl(int32_t n_tokens_alloc
 }

 struct llama_batch_ext * llama_batch_ext_init(struct llama_context * ctx) {
-    return llama_batch_ext_init_impl(llama_n_batch(ctx), 0, llama_n_seq_max(ctx));
+    int32_t n_pos_per_token = llama_n_pos_per_token(llama_get_model(ctx));
+    return llama_batch_ext_init_impl(llama_n_batch(ctx), 0, llama_n_seq_max(ctx), n_pos_per_token);
 }

 struct llama_batch_ext * llama_batch_ext_init_from_embd(

@@ -381,10 +384,10 @@ struct llama_batch_ext * llama_batch_ext_init_from_embd(
         size_t n_embd,
         const llama_pos * pos,
         llama_seq_id seq_id) {
-    auto model = llama_get_model(ctx);
-    struct llama_batch_ext * batch = llama_batch_ext_init_impl(n_tokens, n_embd, 1);
+    int32_t n_pos_per_token = llama_n_pos_per_token(llama_get_model(ctx));
+    struct llama_batch_ext * batch = llama_batch_ext_init_impl(n_tokens, n_embd, 1, n_pos_per_token);
     memcpy(batch->embd, embd, n_tokens * n_embd * sizeof(float));
-    memcpy(batch->pos, pos, n_tokens * llama_n_pos_per_token(model) * sizeof(llama_pos));
+    memcpy(batch->pos, pos, n_tokens * n_pos_per_token * sizeof(llama_pos));
     for (size_t i = 0; i < n_tokens; i++) {
         batch->n_seq_id[i] = 1;
         batch->seq_id [i][0] = seq_id;

@@ -411,12 +414,16 @@ int32_t llama_batch_ext_add_text(
     }
     const int32_t output_id = batch->n_tokens;
     batch->token [output_id] = token;
-    batch->pos   [output_id] = pos;
+    batch->n_seq_id[output_id] = n_seq_ids;
+    batch->logits  [output_id] = output;
+    for (int32_t i = 0; i < batch->n_pos_per_token; i++) {
+        // TODO: this is only used by qwen2vl for now, and text tokens only have 3 pos, the last is set to 0; we should improve this code in the future
+        batch->pos[output_id * batch->n_pos_per_token + i] = i < 3 ? pos : 0;
+    }
     batch->n_seq_id[output_id] = n_seq_ids;
     for (size_t j = 0; j < n_seq_ids; j++) {
         batch->seq_id[batch->n_tokens][j] = seq_ids[j];
     }
-    batch->logits [output_id] = output;
     batch->n_tokens++;
     return output_id;
 }

@@ -461,15 +468,16 @@ struct llama_batch_ext * llama_batch_ext_get_view(
         return nullptr; // not yet supported
     }
     llama_batch_ext * batch_view = new llama_batch_ext{
-        /*n_tokens       =*/ n_tokens,
-        /*max_tokens     =*/ n_tokens,
-        /*is_view        =*/ true,
-        /*tokens         =*/ batch->token + offset,
-        /*embd           =*/ nullptr,
-        /*pos            =*/ batch->pos + offset,
-        /*n_seq_id       =*/ batch->n_seq_id + offset,
-        /*seq_id         =*/ batch->seq_id + offset,
-        /*logits         =*/ batch->logits + offset,
+        /*n_tokens        =*/ n_tokens,
+        /*max_tokens      =*/ n_tokens,
+        /*n_pos_per_token =*/ batch->n_pos_per_token,
+        /*is_view         =*/ true,
+        /*tokens          =*/ batch->token + offset,
+        /*embd            =*/ nullptr,
+        /*pos             =*/ batch->pos + offset * batch->n_pos_per_token,
+        /*n_seq_id        =*/ batch->n_seq_id + offset,
+        /*seq_id          =*/ batch->seq_id + offset,
+        /*logits          =*/ batch->logits + offset,
     };
     return batch_view;
 }
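To make the new indexing in llama_batch_ext_add_text concrete, here is an isolated sketch of the same fill pattern: the scalar position of a text token is copied into the first three components of its slot and the fourth component is zeroed, as the TODO comment describes. Buffer size and values are invented for illustration.

```cpp
#include <cstdio>
#include <vector>

// Illustration of the pos fill in llama_batch_ext_add_text for one token.
// Assumes n_pos_per_token == 4; components 0..2 get the scalar position, component 3 gets 0.
int main() {
    const int n_pos_per_token = 4;
    const int output_id = 2;      // the token slot being written
    const int pos = 57;           // scalar position of that token
    std::vector<int> batch_pos(8 * n_pos_per_token, -1); // pretend batch with 8 slots
    for (int i = 0; i < n_pos_per_token; i++) {
        batch_pos[output_id * n_pos_per_token + i] = i < 3 ? pos : 0;
    }
    printf("slot %d: %d %d %d %d\n", output_id,
           batch_pos[output_id*4 + 0], batch_pos[output_id*4 + 1],
           batch_pos[output_id*4 + 2], batch_pos[output_id*4 + 3]);
    return 0;
}
```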

src/llama-batch.h

Lines changed: 2 additions & 1 deletion

@@ -21,11 +21,12 @@
 struct llama_batch_ext {
     int32_t n_tokens;
     int32_t max_tokens;
+    int32_t n_pos_per_token = 1;
     bool is_view;

     llama_token * token;
     float * embd;
-    llama_pos * pos;
+    llama_pos * pos; // if multi pos per token: 000011112222...
     int32_t * n_seq_id;
     llama_seq_id ** seq_id;
     int8_t * logits; // TODO: rename this to "output"
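With the new field, the positions of one token occupy a contiguous run of n_pos_per_token entries, so component d of token t lives at pos[t * n_pos_per_token + d]. A small hedged helper expressing that; the accessor name is invented and is not part of the commit.

```cpp
#include "llama-batch.h" // internal header defining llama_batch_ext (this branch)

// Hypothetical accessor, for illustration only: component d of token t
// in the token-major layout (000011112222...).
static inline llama_pos batch_ext_get_pos(const llama_batch_ext & b, int t, int d) {
    return b.pos[t * b.n_pos_per_token + d];
}
```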

src/llama-model.cpp

Lines changed: 5 additions & 0 deletions

@@ -6075,6 +6075,11 @@ struct llm_build_qwen2vl : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

+        // TODO @ngxson : transpose layout 0000111122223333 to 0123012301230123, we should improve this in the future
+        inp_pos = ggml_reshape_2d(ctx0, inp_pos, n_tokens, n_pos_per_token);
+        inp_pos = ggml_cont(ctx0, ggml_transpose(ctx0, inp_pos));
+        inp_pos = ggml_reshape_1d(ctx0, inp_pos, n_pos_per_token * n_tokens);
+
         auto * inp_attn = build_attn_inp_kv_unified();

         int sections[4];
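Per the comment, this graph-side shuffle is intended as the inverse of the CLI-side one: it takes the token-major buffer coming from the batch (all components of token 0 adjacent, then token 1, and so on) and produces the blocked, component-major layout that the M-RoPE rope path reads, one run of n_tokens entries per component. Below is a plain-C++ sketch of that index mapping, not a line-by-line translation of the ggml calls, with made-up values.

```cpp
#include <cstdio>
#include <vector>

// Equivalent index mapping of the intended reshape + transpose, shown on a plain buffer.
int main() {
    const int n_tokens = 3, n_pos_per_token = 4;
    // token-major input, as stored in the batch: all components of token 0, then token 1, ...
    std::vector<int> token_major = {10,20,30,40,  11,21,31,41,  12,22,32,42};
    std::vector<int> component_major(n_tokens * n_pos_per_token);
    for (int d = 0; d < n_pos_per_token; d++) {
        for (int t = 0; t < n_tokens; t++) {
            component_major[d*n_tokens + t] = token_major[t*n_pos_per_token + d];
        }
    }
    for (int d = 0; d < n_pos_per_token; d++) {
        printf("component %d:", d);
        for (int t = 0; t < n_tokens; t++) printf(" %d", component_major[d*n_tokens + t]);
        printf("\n");
    }
    return 0;
}
```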
