Skip to content

Commit 008f3fc

Browse files
committed
llama : fix embeddings
ggml-ci
1 parent 87c91c0 commit 008f3fc

File tree

6 files changed

+119
-61
lines changed

common/common.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1284,7 +1284,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
12841284
cparams.mul_mat_q = params.mul_mat_q;
12851285
cparams.seed = params.seed;
12861286
cparams.logits_all = params.logits_all;
1287-
cparams.embedding = params.embedding;
1287+
cparams.embeddings = params.embedding;
12881288
cparams.rope_scaling_type = params.rope_scaling_type;
12891289
cparams.rope_freq_base = params.rope_freq_base;
12901290
cparams.rope_freq_scale = params.rope_freq_scale;

examples/embedding/embedding.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ static std::vector<std::string> split_lines(const std::string & s) {
1919

2020
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
2121
for (size_t i = 0; i < tokens.size(); i++) {
22-
llama_batch_add(batch, tokens[i], i, { seq_id }, false);
22+
llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
2323
}
2424
}
2525

@@ -45,9 +45,13 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
4545
}
4646

4747
// normalize on copy
48-
for (int k = 0; k < n_seq; k++) {
49-
float * emb = llama_get_embeddings_ith(ctx, k);
50-
float * out = output + k * n_embd;
48+
for (int i = 0; i < batch.n_tokens; i++) {
49+
if (!batch.logits[i]) {
50+
continue;
51+
}
52+
53+
float * emb = llama_get_embeddings_ith(ctx, i);
54+
float * out = output + batch.seq_id[i][0] * n_embd;
5155
normalize(emb, out, n_embd);
5256
}
5357
}
@@ -145,6 +149,7 @@ int main(int argc, char ** argv) {
145149
for (int k = 0; k < n_prompts; k++) {
146150
// clamp to n_batch tokens
147151
auto & inp = inputs[k];
152+
148153
const uint64_t n_toks = inp.size();
149154

150155
// encode if at capacity

examples/server-embd.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import asyncio
2+
import requests
3+
import numpy as np
4+
5+
n = 8
6+
7+
result = []
8+
9+
async def requests_post_async(*args, **kwargs):
10+
return await asyncio.to_thread(requests.post, *args, **kwargs)
11+
12+
async def main():
13+
model_url = "http://127.0.0.1:6900"
14+
responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
15+
url= f"{model_url}/embedding",
16+
json= {"content": "0"*1024}
17+
) for i in range(n)])
18+
19+
for response in responses:
20+
embedding = response.json()["embedding"]
21+
print(embedding[-8:])
22+
result.append(embedding)
23+
24+
asyncio.run(main())
25+
26+
# compute cosine similarity
27+
28+
for i in range(n-1):
29+
for j in range(i+1, n):
30+
embedding1 = np.array(result[i])
31+
embedding2 = np.array(result[j])
32+
similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
33+
print(f"Similarity between {i} and {j}: {similarity:.2f}")
34+

examples/server/server.cpp

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1271,7 +1271,7 @@ struct llama_server_context
12711271
queue_results.send(res);
12721272
}
12731273

1274-
void send_embedding(llama_client_slot &slot)
1274+
void send_embedding(llama_client_slot &slot, const llama_batch & batch)
12751275
{
12761276
task_result res;
12771277
res.id = slot.task_id;
@@ -1280,6 +1280,7 @@ struct llama_server_context
12801280
res.stop = true;
12811281

12821282
const int n_embd = llama_n_embd(model);
1283+
12831284
if (!params.embedding)
12841285
{
12851286
LOG_WARNING("embedding disabled", {
@@ -1292,12 +1293,19 @@ struct llama_server_context
12921293
}
12931294
else
12941295
{
1295-
const float *data = llama_get_embeddings(ctx);
1296-
std::vector<float> embedding(data, data + n_embd);
1297-
res.result_json = json
1298-
{
1299-
{"embedding", embedding },
1300-
};
1296+
for (int i = 0; i < batch.n_tokens; ++i) {
1297+
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
1298+
continue;
1299+
}
1300+
1301+
const float * data = llama_get_embeddings_ith(ctx, i);
1302+
std::vector<float> embedding(data, data + n_embd);
1303+
1304+
res.result_json = json
1305+
{
1306+
{"embedding", embedding },
1307+
};
1308+
}
13011309
}
13021310
queue_results.send(res);
13031311
}
@@ -1891,7 +1899,7 @@ struct llama_server_context
18911899
ga_i += ga_w/ga_n;
18921900
}
18931901
}
1894-
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
1902+
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id }, false);
18951903
slot_npast++;
18961904
}
18971905

@@ -1927,7 +1935,7 @@ struct llama_server_context
19271935

19281936
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
19291937
{
1930-
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
1938+
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
19311939

19321940
for (auto & slot : slots)
19331941
{
@@ -2000,7 +2008,7 @@ struct llama_server_context
20002008
// prompt evaluated for embedding
20012009
if (slot.embedding)
20022010
{
2003-
send_embedding(slot);
2011+
send_embedding(slot, batch_view);
20042012
slot.release();
20052013
slot.i_batch = -1;
20062014
continue;
@@ -2359,7 +2367,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
23592367
break;
23602368
}
23612369
params.n_batch = std::stoi(argv[i]);
2362-
params.n_batch = std::min(512, params.n_batch);
23632370
}
23642371
else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
23652372
{

llama.cpp

Lines changed: 53 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1646,6 +1646,7 @@ struct llama_cparams {
16461646
float defrag_thold;
16471647

16481648
bool mul_mat_q;
1649+
bool embeddings;
16491650
bool offload_kqv;
16501651
bool do_pooling;
16511652

@@ -1936,16 +1937,16 @@ struct llama_context {
19361937
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
19371938
int32_t n_eval = 0; // number of eval calls
19381939

1939-
// decode output (2-dimensional array: [n_tokens][n_vocab])
1940+
// logits output (2-dimensional array: [n_tokens][n_vocab])
19401941
std::vector<float> logits;
19411942
#ifndef NDEBUG
19421943
// guard against access to unset logits
19431944
std::vector<bool> logits_valid;
19441945
#endif
19451946
bool logits_all = false;
19461947

1947-
// input embedding (1-dimensional array: [n_embd])
1948-
std::vector<float> embedding;
1948+
// embeddings output (2-dimensional array: [n_tokens][n_embd])
1949+
std::vector<float> embeddings;
19491950

19501951
// memory buffers used to evaluate the model
19511952
std::vector<uint8_t> buf_compute_meta;
@@ -5987,9 +5988,10 @@ struct llm_build_context {
59875988

59885989
// get input vectors with right size
59895990
const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
5990-
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5991+
5992+
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
59915993
struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
5992-
struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
5994+
struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
59935995

59945996
// construct input embeddings (token, type, position)
59955997
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
@@ -7971,17 +7973,17 @@ static int llama_decode_internal(
79717973
ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
79727974

79737975
// the output is always the last tensor in the graph
7974-
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
7975-
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
7976+
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
7977+
struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
79767978

79777979
if (strcmp(res->name, "result_output") == 0) {
79787980
// the embeddings could be the second to last tensor, or the third to last tensor
7979-
if (strcmp(embeddings->name, "result_norm") != 0) {
7980-
embeddings = gf->nodes[gf->n_nodes - 3];
7981-
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
7981+
if (strcmp(embd->name, "result_norm") != 0) {
7982+
embd = gf->nodes[gf->n_nodes - 3];
7983+
GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
79827984
}
79837985
} else if (strcmp(res->name, "result_embd") == 0) {
7984-
embeddings = res;
7986+
embd = res;
79857987
res = nullptr;
79867988
} else {
79877989
GGML_ASSERT(false);
@@ -8051,46 +8053,53 @@ static int llama_decode_internal(
80518053
logits_out.clear();
80528054
#endif
80538055

8054-
ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
8055-
GGML_ASSERT(res_backend != nullptr);
8056+
ggml_backend_t backend_res = ggml_backend_sched_get_node_backend(lctx.sched, res);
8057+
GGML_ASSERT(backend_res != nullptr);
8058+
80568059
if (batch.logits) {
80578060
logits_out.resize(n_vocab * n_tokens);
80588061
for (uint32_t i = 0; i < n_tokens; i++) {
80598062
if (batch.logits[i] == 0) {
80608063
continue;
80618064
}
8062-
ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
8065+
ggml_backend_tensor_get_async(backend_res, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
80638066
#ifndef NDEBUG
80648067
logits_valid[i] = true;
80658068
#endif
80668069
}
80678070
} else if (lctx.logits_all) {
80688071
logits_out.resize(n_vocab * n_tokens);
8069-
ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
8072+
ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
80708073
#ifndef NDEBUG
80718074
std::fill(logits_valid.begin(), logits_valid.end(), true);
80728075
#endif
80738076
} else {
80748077
logits_out.resize(n_vocab);
8075-
ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
8078+
ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
80768079
#ifndef NDEBUG
80778080
logits_valid[0] = true;
80788081
#endif
80798082
}
8080-
ggml_backend_synchronize(res_backend);
8083+
ggml_backend_synchronize(backend_res);
80818084
}
80828085

80838086
// extract embeddings
8084-
if (!lctx.embedding.empty()) {
8085-
auto & embedding_out = lctx.embedding;
8087+
if (cparams.embeddings && embd) {
8088+
auto & embeddings_out = lctx.embeddings;
80868089

8087-
const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
8088-
const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
8090+
ggml_backend_t backend_embd = ggml_backend_sched_get_node_backend(lctx.sched, embd);
8091+
GGML_ASSERT(backend_embd != nullptr);
80898092

8090-
embedding_out.resize(embd_size);
8091-
ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
8092-
ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
8093-
ggml_backend_synchronize(embeddings_backend);
8093+
if (batch.logits) {
8094+
embeddings_out.resize(n_embd * n_tokens);
8095+
for (uint32_t i = 0; i < n_tokens; i++) {
8096+
if (batch.logits[i] == 0) {
8097+
continue;
8098+
}
8099+
ggml_backend_tensor_get_async(backend_embd, embd, embeddings_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
8100+
}
8101+
}
8102+
ggml_backend_synchronize(backend_embd);
80948103
}
80958104

80968105
// measure the performance only for the single-token evals
@@ -11634,7 +11643,7 @@ struct llama_context_params llama_context_default_params() {
1163411643
/*.type_v =*/ GGML_TYPE_F16,
1163511644
/*.mul_mat_q =*/ true,
1163611645
/*.logits_all =*/ false,
11637-
/*.embedding =*/ false,
11646+
/*.embeddings =*/ false,
1163811647
/*.offload_kqv =*/ true,
1163911648
/*.do_pooling =*/ true,
1164011649
};
@@ -11785,6 +11794,7 @@ struct llama_context * llama_new_context_with_model(
1178511794
cparams.yarn_beta_slow = params.yarn_beta_slow;
1178611795
cparams.defrag_thold = params.defrag_thold;
1178711796
cparams.mul_mat_q = params.mul_mat_q;
11797+
cparams.embeddings = params.embeddings;
1178811798
cparams.offload_kqv = params.offload_kqv;
1178911799
cparams.do_pooling = params.do_pooling;
1179011800

@@ -11933,8 +11943,8 @@ struct llama_context * llama_new_context_with_model(
1193311943
// resized during inference, reserve maximum
1193411944
ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
1193511945

11936-
if (params.embedding) {
11937-
ctx->embedding.resize(hparams.n_embd);
11946+
if (params.embeddings) {
11947+
ctx->embeddings.reserve(hparams.n_embd*cparams.n_batch);
1193811948
}
1193911949

1194011950
// graph inputs
@@ -12369,7 +12379,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
1236912379
// assume worst case for logits although only currently set ones are serialized
1237012380
const size_t s_logits = ctx->logits.capacity() * sizeof(float);
1237112381
const size_t s_embedding_size = sizeof(size_t);
12372-
const size_t s_embedding = ctx->embedding.size() * sizeof(float);
12382+
const size_t s_embedding = ctx->embeddings.capacity() * sizeof(float);
1237312383
const size_t s_kv_size = sizeof(size_t);
1237412384
const size_t s_kv_ntok = sizeof(int);
1237512385
const size_t s_kv = ctx->kv_self.total_size();
@@ -12470,12 +12480,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
1247012480

1247112481
// copy embeddings
1247212482
{
12473-
const size_t embedding_size = ctx->embedding.size();
12483+
const size_t embeddings_size = ctx->embeddings.size();
1247412484

12475-
data_ctx->write(&embedding_size, sizeof(embedding_size));
12485+
data_ctx->write(&embeddings_size, sizeof(embeddings_size));
1247612486

12477-
if (embedding_size) {
12478-
data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
12487+
if (embeddings_size) {
12488+
data_ctx->write(ctx->embeddings.data(), embeddings_size * sizeof(float));
1247912489
}
1248012490
}
1248112491

@@ -12581,15 +12591,17 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
1258112591

1258212592
// set embeddings
1258312593
{
12584-
size_t embedding_size;
12594+
size_t embeddings_size;
12595+
12596+
memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
1258512597

12586-
memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
12598+
GGML_ASSERT(ctx->embeddings.capacity() == embeddings_size);
1258712599

12588-
GGML_ASSERT(ctx->embedding.capacity() == embedding_size);
12600+
if (embeddings_size) {
12601+
ctx->embeddings.resize(embeddings_size);
1258912602

12590-
if (embedding_size) {
12591-
memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
12592-
inp += embedding_size * sizeof(float);
12603+
memcpy(ctx->embeddings.data(), inp, embeddings_size * sizeof(float));
12604+
inp += embeddings_size * sizeof(float);
1259312605
}
1259412606
}
1259512607

@@ -12829,11 +12841,11 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
1282912841
}
1283012842

1283112843
float * llama_get_embeddings(struct llama_context * ctx) {
12832-
return ctx->embedding.data();
12844+
return ctx->embeddings.data();
1283312845
}
1283412846

1283512847
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
12836-
return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
12848+
return ctx->embeddings.data() + i*ctx->model.hparams.n_embd;
1283712849
}
1283812850

1283912851
const char * llama_token_get_text(const struct llama_model * model, llama_token token) {

0 commit comments

Comments
 (0)