@@ -1646,6 +1646,7 @@ struct llama_cparams {
1646
1646
float defrag_thold;
1647
1647
1648
1648
bool mul_mat_q;
1649
+ bool embeddings;
1649
1650
bool offload_kqv;
1650
1651
bool do_pooling;
1651
1652
@@ -1936,16 +1937,16 @@ struct llama_context {
1936
1937
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
1937
1938
int32_t n_eval = 0; // number of eval calls
1938
1939
1939
- // decode output (2-dimensional array: [n_tokens][n_vocab])
1940
+ // logits output (2-dimensional array: [n_tokens][n_vocab])
1940
1941
std::vector<float> logits;
1941
1942
#ifndef NDEBUG
1942
1943
// guard against access to unset logits
1943
1944
std::vector<bool> logits_valid;
1944
1945
#endif
1945
1946
bool logits_all = false;
1946
1947
1947
- // input embedding (1-dimensional array: [n_embd])
1948
- std::vector<float> embedding;
1948
+ // embeddings output (2-dimensional array: [n_tokens][n_embd])
1949
+ std::vector<float> embeddings;
1949
1950
1950
1951
// memory buffers used to evaluate the model
1951
1952
std::vector<uint8_t> buf_compute_meta;
@@ -5987,9 +5988,10 @@ struct llm_build_context {
5987
5988
5988
5989
// get input vectors with right size
5989
5990
const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
5990
- struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5991
+
5992
+ struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
5991
5993
struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
5992
- struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
5994
+ struct ggml_tensor * inp_cls = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
5993
5995
5994
5996
// construct input embeddings (token, type, position)
5995
5997
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
@@ -7971,17 +7973,17 @@ static int llama_decode_internal(
7971
7973
ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
7972
7974
7973
7975
// the output is always the last tensor in the graph
7974
- struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
7975
- struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
7976
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
7977
+ struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
7976
7978
7977
7979
if (strcmp(res->name, "result_output") == 0) {
7978
7980
// the embeddings could be the second to last tensor, or the third to last tensor
7979
- if (strcmp(embeddings->name, "result_norm") != 0) {
7980
- embeddings = gf->nodes[gf->n_nodes - 3];
7981
- GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
7981
+ if (strcmp(embd->name, "result_norm") != 0) {
7982
+ embd = gf->nodes[gf->n_nodes - 3];
7983
+ GGML_ASSERT(strcmp(embd->name, "result_norm") == 0);
7982
7984
}
7983
7985
} else if (strcmp(res->name, "result_embd") == 0) {
7984
- embeddings = res;
7986
+ embd = res;
7985
7987
res = nullptr;
7986
7988
} else {
7987
7989
GGML_ASSERT(false);
@@ -8051,46 +8053,53 @@ static int llama_decode_internal(
8051
8053
logits_out.clear();
8052
8054
#endif
8053
8055
8054
- ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
8055
- GGML_ASSERT(res_backend != nullptr);
8056
+ ggml_backend_t backend_res = ggml_backend_sched_get_node_backend(lctx.sched, res);
8057
+ GGML_ASSERT(backend_res != nullptr);
8058
+
8056
8059
if (batch.logits) {
8057
8060
logits_out.resize(n_vocab * n_tokens);
8058
8061
for (uint32_t i = 0; i < n_tokens; i++) {
8059
8062
if (batch.logits[i] == 0) {
8060
8063
continue;
8061
8064
}
8062
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
8065
+ ggml_backend_tensor_get_async(backend_res, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
8063
8066
#ifndef NDEBUG
8064
8067
logits_valid[i] = true;
8065
8068
#endif
8066
8069
}
8067
8070
} else if (lctx.logits_all) {
8068
8071
logits_out.resize(n_vocab * n_tokens);
8069
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
8072
+ ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
8070
8073
#ifndef NDEBUG
8071
8074
std::fill(logits_valid.begin(), logits_valid.end(), true);
8072
8075
#endif
8073
8076
} else {
8074
8077
logits_out.resize(n_vocab);
8075
- ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
8078
+ ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
8076
8079
#ifndef NDEBUG
8077
8080
logits_valid[0] = true;
8078
8081
#endif
8079
8082
}
8080
- ggml_backend_synchronize(res_backend);
8083
+ ggml_backend_synchronize(backend_res);
8081
8084
}
8082
8085
8083
8086
// extract embeddings
8084
- if (!lctx.embedding.empty()) {
8085
- auto & embedding_out = lctx.embedding;
8087
+ if (cparams.embeddings && embd) {
8088
+ auto & embeddings_out = lctx.embeddings;
8086
8089
8087
- const int64_t embd_pos = res ? n_embd * (n_tokens-1) : 0;
8088
- const int64_t embd_size = res ? n_embd : n_embd * n_tokens;
8090
+ ggml_backend_t backend_embd = ggml_backend_sched_get_node_backend(lctx.sched, embd);
8091
+ GGML_ASSERT(backend_embd != nullptr);
8089
8092
8090
- embedding_out.resize(embd_size);
8091
- ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
8092
- ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), embd_pos*sizeof(float), embd_size*sizeof(float));
8093
- ggml_backend_synchronize(embeddings_backend);
8093
+ if (batch.logits) {
8094
+ embeddings_out.resize(n_embd * n_tokens);
8095
+ for (uint32_t i = 0; i < n_tokens; i++) {
8096
+ if (batch.logits[i] == 0) {
8097
+ continue;
8098
+ }
8099
+ ggml_backend_tensor_get_async(backend_embd, embd, embeddings_out.data() + (n_embd*i), (n_embd*i)*sizeof(float), n_embd*sizeof(float));
8100
+ }
8101
+ }
8102
+ ggml_backend_synchronize(backend_embd);
8094
8103
}
8095
8104
8096
8105
// measure the performance only for the single-token evals
@@ -11634,7 +11643,7 @@ struct llama_context_params llama_context_default_params() {
11634
11643
/*.type_v =*/ GGML_TYPE_F16,
11635
11644
/*.mul_mat_q =*/ true,
11636
11645
/*.logits_all =*/ false,
11637
- /*.embedding =*/ false,
11646
+ /*.embeddings =*/ false,
11638
11647
/*.offload_kqv =*/ true,
11639
11648
/*.do_pooling =*/ true,
11640
11649
};
@@ -11785,6 +11794,7 @@ struct llama_context * llama_new_context_with_model(
11785
11794
cparams.yarn_beta_slow = params.yarn_beta_slow;
11786
11795
cparams.defrag_thold = params.defrag_thold;
11787
11796
cparams.mul_mat_q = params.mul_mat_q;
11797
+ cparams.embeddings = params.embeddings;
11788
11798
cparams.offload_kqv = params.offload_kqv;
11789
11799
cparams.do_pooling = params.do_pooling;
11790
11800
@@ -11933,8 +11943,8 @@ struct llama_context * llama_new_context_with_model(
11933
11943
// resized during inference, reserve maximum
11934
11944
ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
11935
11945
11936
- if (params.embedding) {
11937
- ctx->embedding.resize(hparams.n_embd);
11946
+ if (params.embeddings) {
11947
+ ctx->embeddings.reserve(hparams.n_embd*cparams.n_batch);
11938
11948
}
11939
11949
11940
11950
// graph inputs
@@ -12369,7 +12379,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
12369
12379
// assume worst case for logits although only currently set ones are serialized
12370
12380
const size_t s_logits = ctx->logits.capacity() * sizeof(float);
12371
12381
const size_t s_embedding_size = sizeof(size_t);
12372
- const size_t s_embedding = ctx->embedding.size() * sizeof(float);
12382
+ const size_t s_embedding = ctx->embeddings.capacity() * sizeof(float);
12373
12383
const size_t s_kv_size = sizeof(size_t);
12374
12384
const size_t s_kv_ntok = sizeof(int);
12375
12385
const size_t s_kv = ctx->kv_self.total_size();
@@ -12470,12 +12480,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
12470
12480
12471
12481
// copy embeddings
12472
12482
{
12473
- const size_t embedding_size = ctx->embedding.size();
12483
+ const size_t embeddings_size = ctx->embeddings.size();
12474
12484
12475
- data_ctx->write(&embedding_size, sizeof(embedding_size));
12485
+ data_ctx->write(&embeddings_size, sizeof(embeddings_size));
12476
12486
12477
- if (embedding_size) {
12478
- data_ctx->write(ctx->embedding.data(), embedding_size * sizeof(float));
12487
+ if (embeddings_size) {
12488
+ data_ctx->write(ctx->embeddings.data(), embeddings_size * sizeof(float));
12479
12489
}
12480
12490
}
12481
12491
@@ -12581,15 +12591,17 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
12581
12591
12582
12592
// set embeddings
12583
12593
{
12584
- size_t embedding_size;
12594
+ size_t embeddings_size;
12595
+
12596
+ memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size);
12585
12597
12586
- memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
12598
+ GGML_ASSERT(ctx->embeddings.capacity() == embeddings_size);
12587
12599
12588
- GGML_ASSERT(ctx->embedding.capacity() == embedding_size);
12600
+ if (embeddings_size) {
12601
+ ctx->embeddings.resize(embeddings_size);
12589
12602
12590
- if (embedding_size) {
12591
- memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
12592
- inp += embedding_size * sizeof(float);
12603
+ memcpy(ctx->embeddings.data(), inp, embeddings_size * sizeof(float));
12604
+ inp += embeddings_size * sizeof(float);
12593
12605
}
12594
12606
}
12595
12607
@@ -12829,11 +12841,11 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
12829
12841
}
12830
12842
12831
12843
float * llama_get_embeddings(struct llama_context * ctx) {
12832
- return ctx->embedding.data();
12844
+ return ctx->embeddings.data();
12833
12845
}
12834
12846
12835
12847
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
12836
- return ctx->embedding.data() + i*ctx->model.hparams.n_embd;
12848
+ return ctx->embeddings.data() + i*ctx->model.hparams.n_embd;
12837
12849
}
12838
12850
12839
12851
const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
0 commit comments