@@ -982,7 +982,7 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_
     }
 
     // alloc memory for graph
-    bool ok = ggml_backend_sched_alloc_graph(ctx.sched, gf);
+    bool ok = ggml_backend_sched_alloc_graph(ctx.sched.get(), gf);
     if (!ok) {
         LLAMA_LOG_ERROR("failed to alloc memory for graph\n");
         return -1;
@@ -1064,7 +1064,7 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_
     // compute
     LLAMA_LOG_DEBUG("%s: compute start\n", __func__);
     int64_t t_start = ggml_time_ms();
-    ggml_backend_sched_graph_compute(ctx.sched, gf);
+    ggml_backend_sched_graph_compute(ctx.sched.get(), gf);
 
     // the last node is the embedding tensor
     struct ggml_tensor * output_node = ggml_graph_node(gf, -1);
@@ -1091,6 +1091,92 @@ static int32_t llama_vision_encode_impl(llama_vision_context & ctx, const llama_
 ////////////////////////////////////////////////////////////////////////////////////////
 // public API
 
+struct llama_vision_context_params llama_vision_context_default_params() {
+    return {
+        /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
+    };
+}
+
+struct llama_vision_context * llama_vision_init_from_model(const struct llama_model * model, struct llama_vision_context_params params) {
+    if (!model->has_vision) {
+        return nullptr;
+    }
+
+    llama_vision_context * ctx = new llama_vision_context;
+    ctx->model = &model->vit;
+
+    // TODO: this looks ugly, mostly copied from llama.cpp, refactor it in the future
+
+    // init backends
+    {
+        // add CPU backend
+        ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+        if (ctx->backend_cpu == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
+            llama_vision_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.emplace_back(ctx->backend_cpu);
+
+        // create a list of the set_n_threads functions in the backends
+        for (auto & backend : ctx->backends) {
+            ggml_backend_dev_t dev = ggml_backend_get_device(backend.get());
+            ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
+            if (reg) {
+                auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+                ggml_backend_set_n_threads_fn(backend.get(), params.n_threads);
+            }
+        }
+    }
+
+    // scheduler and compute buffers
+    {
+        // buffer types used for the compute buffer of each backend
+        std::vector<ggml_backend_buffer_type_t> backend_buft;
+        std::vector<ggml_backend_t> backend_ptrs;
+        for (auto & backend : ctx->backends) {
+            auto * buft = ggml_backend_get_default_buffer_type(backend.get());
+            auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+            if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
+                // use the host buffer of the first device CPU for faster transfer of the intermediate state
+                auto * dev = model->devices[0];
+                auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
+                if (host_buft) {
+                    buft = host_buft;
+                }
+            }
+            backend_buft.push_back(buft);
+            backend_ptrs.push_back(backend.get());
+        }
+
+        const size_t max_nodes = model->max_nodes();
+
+        // buffer used to store the computation graph and the tensor meta data
+        ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
+
+        // TODO: support pipeline_parallel
+        const bool pipeline_parallel = false;
+
+        ctx->sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel));
+
+        if (pipeline_parallel) {
+            LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched.get()));
+        }
+    }
+
+    const size_t max_nodes = VISION_GRAPH_MAX_NODE; // TODO: make it dynamic
+    ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
+
+    return ctx;
+}
+
+void llama_vision_free(struct llama_vision_context * ctx) {
+    if (ctx->ctx_ggml) {
+        ggml_free(ctx->ctx_ggml);
+    }
+    delete ctx;
+}
+
 
 struct llama_vision_bitmap * llama_vision_bitmap_init(uint32_t nx, uint32_t ny) {
     llama_vision_bitmap * bmp = new llama_vision_bitmap;
     bmp->nx = nx;
@@ -1105,16 +1191,15 @@ void llama_vision_bitmap_free(llama_vision_bitmap * bmp) {
 }
 
 struct llama_vision_tokens * llama_vision_tokenize(
-        struct llama_context * ctx,
-        llama_vision_bitmap * bmp) {
-    llama_vision_context & vctx = ctx->vctx;
-    switch (vctx.model->hparams.arch) {
+        struct llama_vision_context * ctx,
+        struct llama_vision_bitmap * bmp) {
+    switch (ctx->model->hparams.arch) {
         case LLM_ARCH_VISION_LLAVA:
         case LLM_ARCH_VISION_MOBILEVLM:
         case LLM_ARCH_VISION_IDEFICS3:
-            return new llama_vision_tokens(llama_vision_processor_llava(vctx).tokenize(*bmp));
+            return new llama_vision_tokens(llama_vision_processor_llava(*ctx).tokenize(*bmp));
         case LLM_ARCH_VISION_MINICPMV:
-            return new llama_vision_tokens(llama_vision_processor_llava(vctx).tokenize(*bmp));
+            return new llama_vision_tokens(llama_vision_processor_llava(*ctx).tokenize(*bmp));
         default:
             GGML_ASSERT(false && "unsupported arch");
     }
@@ -1124,19 +1209,18 @@ void llama_vision_tokens_free(llama_vision_tokens * p) {
     delete p;
 }
 
-int32_t llama_vision_encode(struct llama_context * ctx, llama_vision_tokens * p) {
+int32_t llama_vision_encode(struct llama_vision_context * ctx, struct llama_vision_tokens * p) {
     if (p->buf.empty()) {
         LLAMA_LOG_ERROR("%s: nothing to encode\n", __func__);
        return -1;
     }
 
-    llama_vision_context & vctx = ctx->vctx;
-    auto & hparams = vctx.model->hparams;
+    auto & hparams = ctx->model->hparams;
     switch (hparams.mm_patch_merge_type) {
         case MM_PATCH_MERGE_FLAT:
             {
                 // flat / default llava-1.5 type embedding
-                int32_t encoded = llama_vision_encode_impl(vctx, *p);
+                int32_t encoded = llama_vision_encode_impl(*ctx, *p);
                 if (encoded != 0) {
                     LLAMA_LOG_ERROR("Unable to encode image\n");
                     return encoded;
@@ -1154,8 +1238,8 @@ int32_t llama_vision_encode(struct llama_context * ctx, llama_vision_tokens * p)
     return 0;
 }
 
-struct ggml_tensor * llama_vision_get_output_tensor(llama_context * ctx) {
-    return ctx->vctx.output;
+struct ggml_tensor * llama_vision_get_output_tensor(struct llama_vision_context * ctx) {
+    return ctx->output;
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////
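For reference, the new public entry points compose into a small image-encoding pipeline: create a context from a vision-capable model, wrap raw pixels in a bitmap, tokenize it into patches, encode, and read back the embedding tensor. The sketch below is a caller-side illustration only, not part of the change: it assumes the declarations are exported through llama.h, that llama_vision_bitmap exposes its pixel buffer as a `data` member holding packed RGB bytes, and that the llama_model was loaded elsewhere; error handling is trimmed to the essentials.

#include <cstdint>
#include <cstring>

#include "llama.h"

// hypothetical helper showing the intended call sequence of the new vision API
static void encode_image_example(const struct llama_model * model, const uint8_t * rgb, uint32_t nx, uint32_t ny) {
    struct llama_vision_context_params vparams = llama_vision_context_default_params();
    vparams.n_threads = 4;

    struct llama_vision_context * vctx = llama_vision_init_from_model(model, vparams);
    if (vctx == nullptr) {
        return; // model has no vision tower, or backend init failed
    }

    // wrap the raw pixels, preprocess them into patches, then run the encoder graph
    struct llama_vision_bitmap * bmp = llama_vision_bitmap_init(nx, ny);
    std::memcpy(bmp->data, rgb, (size_t) nx * ny * 3); // assumed bitmap layout: packed RGB in `data`

    struct llama_vision_tokens * toks = llama_vision_tokenize(vctx, bmp);
    if (llama_vision_encode(vctx, toks) == 0) {
        // the embedding tensor is held by the vision context, so consume it before freeing
        struct ggml_tensor * embd = llama_vision_get_output_tensor(vctx);
        (void) embd; // hand the embeddings to the language model here
    }

    llama_vision_tokens_free(toks);
    llama_vision_bitmap_free(bmp);
    llama_vision_free(vctx);
}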