@@ -1646,6 +1646,7 @@ struct llama_cparams {
1646
1646
float defrag_thold;
1647
1647
1648
1648
bool mul_mat_q;
1649
+ bool embeddings;
1649
1650
bool offload_kqv;
1650
1651
bool do_pooling;
1651
1652
@@ -1936,16 +1937,16 @@ struct llama_context {
1936
1937
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
1937
1938
int32_t n_eval = 0; // number of eval calls
1938
1939
1939
- // decode output (2-dimensional array: [n_tokens][n_vocab])
1940
+ // logits output (2-dimensional array: [n_tokens][n_vocab])
1940
1941
std::vector<float> logits;
1941
1942
#ifndef NDEBUG
1942
1943
// guard against access to unset logits
1943
1944
std::vector<bool> logits_valid;
1944
1945
#endif
1945
1946
bool logits_all = false;
1946
1947
1947
- // input embedding (1-dimensional array: [n_embd])
1948
- std::vector<float> embedding;
1948
+ // embeddings output (2-dimensional array: [n_tokens][n_embd])
1949
+ std::vector<float> embeddings;
1949
1950
1950
1951
// memory buffers used to evaluate the model
1951
1952
std::vector<uint8_t> buf_compute_meta;
@@ -5987,9 +5988,10 @@ struct llm_build_context {
5987
5988
5988
5989
// get input vectors with right size
5989
5990
const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
5990
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5991
+
5992
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5991
5993
struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
5992
- struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
5994
+ struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
5993
5995
5994
5996
// construct input embeddings (token, type, position)
5995
5997
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
@@ -7971,17 +7973,17 @@ static int llama_decode_internal(
7971
7973
ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
7972
7974
7973
7975
// the output is always the last tensor in the graph
7974
- struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
7975
- struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
7976
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
7977
+ struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
7976
7978
7977
7979
if (strcmp(res->name, "result_output") == 0) {
7978
7980
// the embeddings could be the second to last tensor, or the third to last tensor
7979
- if (strcmp(embeddings->name, "result_norm") != 0) {
7980
- embeddings = gf->nodes[gf->n_nodes - 3];
7981
- GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
7981
+ if (strcmp(embd->name, "result_norm") != 0) {
7982
+ embd = gf->nodes[gf->n_nodes - 3];
7983
+ GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
7982
7984
}
7983
7985
} else if (strcmp(res->name, "result_embd") == 0) {
7984
- embeddings = res;
7986
+ embd = res;
7985
7987
res = nullptr;
7986
7988
} else {
7987
7989
GGML_ASSERT(false);
@@ -8051,46 +8053,53 @@ static int llama_decode_internal(
8051
8053
logits_out.clear();
8052
8054
#endif
8053
8055
8054
- ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
8055
- GGML_ASSERT(res_backend != nullptr);
8056
+ ggml_backend_t backend_res = ggml_backend_sched_get_node_backend(lctx.sched, res);
8057
+ GGML_ASSERT(backend_res != nullptr);
8058
+
8056
8059
if (batch.logits) {
8057
8060
logits_out.resize(n_vocab * n_tokens);
8058
8061
for (uint32_t i = 0; i < n_tokens; i++) {
8059
8062
if (batch.logits[i] == 0) {
8060
8063
continue;
8061
8064
}
8062
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
8065
+ ggml_backend_tensor_get_async(backend_res, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
8063
8066
#ifndef NDEBUG
8064
8067
logits_valid[i] = true;
8065
8068
#endif
8066
8069
}
8067
8070
} else if (lctx.logits_all) {
8068
8071
logits_out.resize(n_vocab * n_tokens);
8069
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
8072
+ ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
8070
8073
#ifndef NDEBUG
8071
8074
std::fill(logits_valid.begin(), logits_valid.end(), true);
8072
8075
#endif
8073
8076
} else {
8074
8077
logits_out.resize(n_vocab);
8075
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
8078
+ ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
8076
8079
#ifndef NDEBUG
8077
8080
logits_valid[0] = true;
8078
8081
#endif
8079
8082
}
8080
- ggml_backend_synchronize(res_backend);
8083
+ ggml_backend_synchronize(backend_res);
8081
8084
}
8082
8085
8083
8086
// extract embeddings
8084
- if (!lctx.embedding.empty()) {
8085
- auto & embedding_out = lctx.embedding;
8087
+ if (cparams.embeddings && embd) {
8088
+ auto & embeddings_out = lctx.embeddings;
8086
8089
8087
- const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
8088
- const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
8090
+ ggml_backend_t backend_embd = ggml_backend_sched_get_node_backend(lctx.sched, embd);
8091
+ GGML_ASSERT(backend_embd != nullptr);
8089
8092
8090
- embedding_out.resize(embd_size);
8091
- ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
8092
- ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
8093
- ggml_backend_synchronize(embeddings_backend);
8093
+ if (batch.logits) {
8094
+ embeddings_out.resize(n_embd * n_tokens);
8095
+ for (uint32_t i = 0; i < n_tokens; i++) {
8096
+ if (batch.logits[i] == 0) {
8097
+ continue;
8098
+ }
8099
+ ggml_backend_tensor_get_async(backend_embd, embd, embeddings_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
8100
+ }
8101
+ }
8102
+ ggml_backend_synchronize(backend_embd);
8094
8103
}
8095
8104
8096
8105
// measure the performance only for the single-token evals
@@ -11634,7 +11643,7 @@ struct llama_context_params llama_context_default_params() {
11634
11643
/*.type_v =*/ GGML_TYPE_F16,
11635
11644
/*.mul_mat_q =*/ true,
11636
11645
/*.logits_all =*/ false,
11637
- /*.embedding =*/ false,
11646
+ /*.embeddings =*/ false,
11638
11647
/*.offload_kqv =*/ true,
11639
11648
/*.do_pooling =*/ true,
11640
11649
};
@@ -11785,6 +11794,7 @@ struct llama_context * llama_new_context_with_model(
11785
11794
cparams.yarn_beta_slow = params.yarn_beta_slow;
11786
11795
cparams.defrag_thold = params.defrag_thold;
11787
11796
cparams.mul_mat_q = params.mul_mat_q;
11797
+ cparams.embeddings = params.embeddings;
11788
11798
cparams.offload_kqv = params.offload_kqv;
11789
11799
cparams.do_pooling = params.do_pooling;
11790
11800
@@ -11933,8 +11943,8 @@ struct llama_context * llama_new_context_with_model(
11933
11943
// resized during inference, reserve maximum
11934
11944
ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
11935
11945
11936
- if (params.embedding) {
11937
- ctx->embedding.resize(hparams.n_embd);
11946
+ if (params.embeddings) {
11947
+ ctx->embeddings.reserve(hparams.n_embd*cparams.n_batch);
11938
11948
}
11939
11949
11940
11950
// graph inputs
@@ -12369,7 +12379,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
12369
12379
// assume worst case for logits although only currently set ones are serialized
12370
12380
const size_t s_logits = ctx->logits.capacity() * sizeof(float);
12371
12381
const size_t s_embedding_size = sizeof(size_t);
12372
- const size_t s_embedding = ctx->embedding.size() * sizeof(float);
12382
+ const size_t s_embedding = ctx->embeddings.capacity() * sizeof(float);
12373
12383
const size_t s_kv_size = sizeof(size_t);
12374
12384
const size_t s_kv_ntok = sizeof(int);
12375
12385
const size_t s_kv = ctx->kv_self.total_size();
@@ -12470,12 +12480,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12470
12480
12471
12481
// copy embeddings
12472
12482
{
12473
- const size_t embedding_size = ctx->embedding.size();
12483
+ const size_t embeddings_size = ctx->embeddings.size();
12474
12484
12475
- data_ctx->write(&embedding_size, sizeof(embedding_size));
12485
+ data_ctx->write(&embeddings_size, sizeof(embeddings_size));
12476
12486
12477
- if (embedding_size) {
12478
- data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
12487
+ if (embeddings_size) {
12488
+ data_ctx->write(ctx->embeddings.data(), embeddings_size * sizeof(float));
12479
12489
}
12480
12490
}
12481
12491
@@ -12581,15 +12591,17 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12581
12591
12582
12592
// set embeddings
12583
12593
{
12584
- size_t embedding_size;
12594
+ size_t embeddings_size;
12595
+
12596
+ memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
12585
12597
12586
- memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
12598
+ GGML_ASSERT(ctx->embeddings.capacity() == embeddings_size);
12587
12599
12588
- GGML_ASSERT(ctx->embedding.capacity() == embedding_size);
12600
+ if (embeddings_size) {
12601
+ ctx->embeddings.resize(embeddings_size);
12589
12602
12590
- if (embedding_size) {
12591
- memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
12592
- inp += embedding_size * sizeof(float);
12603
+ memcpy(ctx->embeddings.data(), inp, embeddings_size * sizeof(float));
12604
+ inp += embeddings_size * sizeof(float);
12593
12605
}
12594
12606
}
12595
12607
@@ -12829,11 +12841,11 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
12829
12841
}
12830
12842
12831
12843
float * llama_get_embeddings(struct llama_context * ctx) {
12832
- return ctx->embedding.data();
12844
+ return ctx->embeddings.data();
12833
12845
}
12834
12846
12835
12847
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
12836
- return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
12848
+ return ctx->embeddings.data() + i*ctx->model.hparams.n_embd;
12837
12849
}
12838
12850
12839
12851
const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
0 commit comments