@@ -106,10 +106,10 @@ static uint64_t hash_vector_float(const std::vector<float> & vec) {
     return seed;
 }
 
-mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
-                                  const mtmd_input_text & text,
-                                  const std::vector<mtmd_bitmap> & bitmaps) {
-    mtmd_input_chunks * output = new mtmd_input_chunks;
+int32_t mtmd_tokenize(mtmd_context * ctx,
+                      std::vector<mtmd_input_chunk> & output,
+                      const mtmd_input_text & text,
+                      const std::vector<mtmd_bitmap> & bitmaps) {
     auto vocab = llama_model_get_vocab(ctx->text_model);
 
     std::string prompt_modified(text.text);
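
Note: callers now own the chunk storage and receive a status code instead of a heap-allocated list. A minimal caller-side sketch (the surrounding ctx/text/bitmaps setup is assumed; the status values match the return statements later in this diff):

    std::vector<mtmd_input_chunk> chunks;
    int32_t res = mtmd_tokenize(ctx, chunks, text, bitmaps);
    if (res != 0) {
        // 1 = not enough bitmaps for the image markers, 2 = image preprocessing failed
        LOG_ERR("mtmd_tokenize failed with status %d\n", res);
    }
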
@@ -124,8 +124,8 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
     }
 
     std::vector<std::string> parts = string_split_str(text.text, ctx->image_marker);
-    output->clear();
-    output->reserve(parts.size());
+    output.clear();
+    output.reserve(parts.size());
 
     size_t i_img = 0;
 
@@ -141,14 +141,14 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
             std::move(tokens),
             {},
         };
-        output->emplace_back(std::move(chunk));
+        output.emplace_back(std::move(chunk));
 
         if (&parts.back() != &part) {
             // add image token to middle of 2 parts
 
             if (i_img >= bitmaps.size()) {
                 LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
-                return nullptr;
+                return 1;
             }
 
             // shim layer
@@ -163,10 +163,10 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
             bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32);
             if (!ok) {
                 LOG_ERR("Unable to preprocess image\n");
-                return nullptr;
+                return 2;
             }
 
-            mtmd_image_tokens * image_tokens = new mtmd_image_tokens;
+            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
             image_tokens->nx = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
             image_tokens->ny = 1; // TODO
             image_tokens->batch_f32 = std::move(batch_f32);
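
mtmd_image_tokens_ptr wraps the raw pointer in a std::unique_ptr whose deleter calls mtmd_image_tokens_free() (the deleter definition is added at the end of this diff). The header-side declaration presumably looks roughly like this sketch; the real declaration lives in mtmd.h:

    struct mtmd_image_tokens_deleter {
        void operator()(mtmd_image_tokens * val); // defined in mtmd.cpp, see below
    };
    using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
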
@@ -179,14 +179,14 @@ mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
             mtmd_input_chunk chunk{
                 MTMD_INPUT_CHUNK_TYPE_IMAGE,
                 {},
-                image_tokens,
+                std::move(image_tokens),
             };
-            output->emplace_back(std::move(chunk));
+            output.emplace_back(std::move(chunk));
             i_img++;
         }
     }
 
-    return output;
+    return 0;
 }
 
 void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
@@ -195,18 +195,6 @@ void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
     }
 }
 
-void mtmd_input_chunks_free(mtmd_input_chunks * chunks, bool free_images) {
-    if (free_images) {
-        for (auto & chunk : *chunks) {
-            if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE && chunk.tokens_image) {
-                mtmd_image_tokens_free(chunk.tokens_image);
-                chunk.tokens_image = nullptr;
-            }
-        }
-    }
-    delete chunks;
-}
-
 size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens) {
     return image_tokens->n_tokens();
 }
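
With tokens_image held by an mtmd_image_tokens_ptr, this manual free routine becomes redundant: destroying a chunk vector now releases image tokens through the unique_ptr deleter. The ownership chain unwinds like this (an illustrative sketch):

    {
        std::vector<mtmd_input_chunk> chunks;
        mtmd_tokenize(ctx, chunks, text, bitmaps);
    } // ~vector -> ~mtmd_input_chunk -> ~mtmd_image_tokens_ptr
      //   -> mtmd_image_tokens_deleter::operator() -> mtmd_image_tokens_free()
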
@@ -238,9 +226,9 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
     return ctx->image_embd_v.data();
 }
 
-size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks) {
+size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
     size_t n_tokens = 0;
-    for (auto & chunk : *chunks) {
+    for (auto & chunk : chunks) {
         if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
             n_tokens += chunk.tokens_text.size();
         } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
@@ -289,16 +277,16 @@ struct decode_embd_batch {
 
 int32_t mtmd_helper_eval(mtmd_context * ctx,
         llama_context * lctx,
-        mtmd_input_chunks * chunks,
+        mtmd_input_chunks & chunks,
         llama_pos pos0,
         llama_seq_id seq_id,
         int32_t n_batch) {
     int32_t ret;
     llama_pos n_past = pos0;
     llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
 
-    for (auto & chunk : *chunks) {
-        bool is_last = &chunk == &chunks->back();
+    for (auto & chunk : chunks) {
+        bool is_last = &chunk == &chunks.back();
         if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
             // TODO @ngxson : may need to split into smaller batches
             text_batch.n_tokens = chunk.tokens_text.size();
@@ -327,7 +315,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
             if (ctx->print_timings) {
                 LOG_INF("encoding image...\n");
             }
-            ret = mtmd_encode(ctx, chunk.tokens_image);
+            ret = mtmd_encode(ctx, chunk.tokens_image.get());
             if (ret != 0) {
                 LOG_ERR("failed to encode image\n");
                 llama_batch_free(text_batch);
@@ -337,7 +325,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
                 LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
             }
 
-            int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image);
+            int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
             float * embd = mtmd_get_output_embd(ctx);
             decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
             int64_t t1 = ggml_time_ms();
@@ -395,3 +383,7 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
     }
     return false;
 }
+
+void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
+    mtmd_image_tokens_free(val);
+}
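
Taken together, the refactored flow looks like this from the caller's side (a sketch; ctx, lctx, text, and bitmaps are assumed to be set up as before, and the n_batch value is illustrative):

    mtmd_input_chunks chunks;
    if (mtmd_tokenize(ctx, chunks, text, bitmaps) != 0) {
        return 1; // tokenization failed
    }
    LOG_INF("prompt has %zu tokens\n", mtmd_helper_get_n_tokens(chunks)); // takes a reference now
    if (mtmd_helper_eval(ctx, lctx, chunks, 0, 0, 2048) != 0) {
        return 1; // encode/decode failed
    }
    // no mtmd_input_chunks_free() needed: everything is released on scope exit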