From ff4976eed6c355d3f3079f5bdb534ec49f2ab35d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Wed, 4 Sep 2024 21:28:55 +0200
Subject: [PATCH 01/11] Flux: clip_l support

---
 conditioner.hpp | 116 ++++++++++++++++++++++++++++++++++++------------
 1 file changed, 87 insertions(+), 29 deletions(-)

diff --git a/conditioner.hpp b/conditioner.hpp
index ac2ab7ebf..6185b2347 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -1084,41 +1084,76 @@ struct FluxCLIPEmbedder : public Conditioner {
         auto& t5_tokens = token_and_weights[1].first;
         auto& t5_weights = token_and_weights[1].second;
 
-        int64_t t0 = ggml_time_ms();
-        struct ggml_tensor* hidden_states = NULL;       // [N, n_token, 4096]
-        struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, 4096]
-        struct ggml_tensor* pooled = NULL;              // [768,]
+        int64_t t0 = ggml_time_ms();
+        struct ggml_tensor* hidden_states = NULL;          // [N, n_token, 4096]
+        struct ggml_tensor* chunk_hidden_states = NULL;    // [n_token*2, 4096]
+        struct ggml_tensor* chunk_hidden_states_l = NULL;  // [n_token, hidden_size_l]
+        struct ggml_tensor* chunk_hidden_states_t5 = NULL; // [n_token, hidden_size_t5]
+        struct ggml_tensor* pooled = NULL;                 // [768,]
         std::vector<float> hidden_states_vec;
 
-        size_t chunk_len = 256;
-        size_t chunk_count = t5_tokens.size() / chunk_len;
+        size_t chunk_len = 77;
+        size_t chunk_count = clip_l_tokens.size() / chunk_len;
         for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
             // clip_l
-            if (chunk_idx == 0) {
-                size_t chunk_len_l = 77;
-                std::vector<int> chunk_tokens(clip_l_tokens.begin(),
-                                              clip_l_tokens.begin() + chunk_len_l);
-                std::vector<float> chunk_weights(clip_l_weights.begin(),
-                                                 clip_l_weights.begin() + chunk_len_l);
+            {
+                std::vector<int> chunk_tokens(clip_l_tokens.begin() + chunk_idx * chunk_len,
+                                              clip_l_tokens.begin() + (chunk_idx + 1) * chunk_len);
+                std::vector<float> chunk_weights(clip_l_weights.begin() + chunk_idx * chunk_len,
+                                                 clip_l_weights.begin() + (chunk_idx + 1) * chunk_len);
 
                 auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
                 size_t max_token_idx = 0;
 
-                // auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
-                // max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
-                // clip_l->compute(n_threads,
-                //                 input_ids,
-                //                 0,
-                //                 NULL,
-                //                 max_token_idx,
-                //                 true,
-                //                 &pooled,
-                //                 work_ctx);
-
-                // clip_l.transformer.text_model.text_projection no in file, ignore
-                // TODO: use torch.eye(embed_dim) as default clip_l.transformer.text_model.text_projection
-                pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
-                ggml_set_f32(pooled, 0.f);
+                clip_l->compute(n_threads,
+                                input_ids,
+                                0,
+                                NULL,
+                                max_token_idx,
+                                false,
+                                &chunk_hidden_states_l,
+                                work_ctx);
+                {
+                    auto tensor = chunk_hidden_states_l;
+                    float original_mean = ggml_tensor_mean(tensor);
+                    for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
+                        for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+                            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+                                float value = ggml_tensor_get_f32(tensor, i0, i1, i2);
+                                value *= chunk_weights[i1];
+                                ggml_tensor_set_f32(tensor, value, i0, i1, i2);
+                            }
+                        }
+                    }
+                    float new_mean = ggml_tensor_mean(tensor);
+                    ggml_tensor_scale(tensor, (original_mean / new_mean));
+                }
+                if (chunk_idx == 0) {
+                    size_t chunk_len_l = 77;
+                    std::vector<int> chunk_tokens(clip_l_tokens.begin(),
+                                                  clip_l_tokens.begin() + chunk_len_l);
+                    std::vector<float> chunk_weights(clip_l_weights.begin(),
+                                                     clip_l_weights.begin() + chunk_len_l);
+
+                    auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
+                    size_t max_token_idx = 0;
+
+                    // auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
+                    // max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
+                    // clip_l->compute(n_threads,
+                    //                 input_ids,
+                    //                 0,
+                    //                 NULL,
+                    //                 max_token_idx,
+                    //                 true,
+                    //                 &pooled,
+                    //                 work_ctx);
+
+                    // clip_l.transformer.text_model.text_projection no in file, ignore
+                    // TODO: use torch.eye(embed_dim) as default clip_l.transformer.text_model.text_projection
+                    pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
+                    ggml_set_f32(pooled, 0.f);
+                }
             }
 
             // t5
@@ -1132,10 +1167,10 @@ struct FluxCLIPEmbedder : public Conditioner {
 
                 t5->compute(n_threads,
                             input_ids,
-                            &chunk_hidden_states,
+                            &chunk_hidden_states_t5,
                             work_ctx);
                 {
-                    auto tensor = chunk_hidden_states;
+                    auto tensor = chunk_hidden_states_t5;
                     float original_mean = ggml_tensor_mean(tensor);
                     for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
                         for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
@@ -1151,6 +1186,29 @@ struct FluxCLIPEmbedder : public Conditioner {
                 }
             }
 
+
+            // TODO: Maybe there's a better way to do the padding?
+            auto chunk_hidden_states_l_pad = ggml_new_tensor_3d(work_ctx,
+                                                                chunk_hidden_states_l->type,
+                                                                4096,
+                                                                chunk_hidden_states_l->ne[1],
+                                                                chunk_hidden_states_l->ne[2]); // [n_token, 4096]
+
+            for (int i2 = 0; i2 < chunk_hidden_states_l_pad->ne[2]; i2++) {
+                for (int i1 = 0; i1 < chunk_hidden_states_l_pad->ne[1]; i1++) {
+                    for (int i0 = 0; i0 < chunk_hidden_states_l_pad->ne[0]; i0++) {
+                        float value = 0.f;
+                        if (i0 < chunk_hidden_states_l->ne[0]) {
+                            value = ggml_tensor_get_f32(chunk_hidden_states_l, i0, i1, i2);
+                        }
+                        ggml_tensor_set_f32(chunk_hidden_states_l_pad, value, i0, i1, i2);
+                    }
+                }
+            }
+
+            chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states_l, chunk_hidden_states_t5, 1); // [n_token*2, 4096]
+
+
             int64_t t1 = ggml_time_ms();
             LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
             if (force_zero_embeddings) {

From e6314d39a8d157dd35a39be781e3163754bcd155 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Wed, 4 Sep 2024 22:22:06 +0200
Subject: [PATCH 02/11] Fix oopsie

---
 conditioner.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conditioner.hpp b/conditioner.hpp
index 6185b2347..e0bf739ec 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -1206,7 +1206,7 @@ struct FluxCLIPEmbedder : public Conditioner {
                 }
             }
 
-            chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states_l, chunk_hidden_states_t5, 1); // [n_token*2, 4096]
+            chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states_l_pad, chunk_hidden_states_t5, 1); // [n_token*2, 4096]
 
 
             int64_t t1 = ggml_time_ms();
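For readers following the series: patches 01-02 build each chunk's condition by zero-padding the CLIP-L hidden states (hidden_size_l, typically 768) out to T5's 4096-wide hidden size and then concatenating the padded block with the T5 hidden states along the token axis, which is where the [n_token*2, 4096] shape comes from. The sketch below restates that pad-and-concatenate step on plain std::vector<float> buffers instead of ggml tensors; the function and parameter names are illustrative and are not part of the patches.

#include <cstddef>
#include <vector>

// Illustrative only: mimic the pad-and-concat done with ggml tensors in the patch.
// clip_l rows are hidden_size_l floats wide, t5 rows are width_t5 (4096) wide.
// Each clip_l row is zero-padded to width_t5, then the t5 rows are appended,
// giving (n_l + n_t5) rows of width_t5 -- i.e. [n_token*2, 4096] per chunk.
std::vector<float> pad_and_concat(const std::vector<std::vector<float>>& clip_l_rows,
                                  const std::vector<std::vector<float>>& t5_rows,
                                  size_t width_t5 = 4096) {
    std::vector<float> out;
    out.reserve((clip_l_rows.size() + t5_rows.size()) * width_t5);
    for (const auto& row : clip_l_rows) {
        for (size_t i = 0; i < width_t5; i++) {
            out.push_back(i < row.size() ? row[i] : 0.f); // zero-pad past hidden_size_l
        }
    }
    for (const auto& row : t5_rows) {
        for (size_t i = 0; i < width_t5; i++) {
            out.push_back(i < row.size() ? row[i] : 0.f);
        }
    }
    return out;
}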
From 4d7fed17155584cb2ed627bf987a3463a55d9c3a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Thu, 5 Sep 2024 02:08:46 +0200
Subject: [PATCH 03/11] I don't know what I'm doing, but it's working better now

---
 conditioner.hpp | 45 ++++++++++++++++++++++++++++-----------------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/conditioner.hpp b/conditioner.hpp
index e0bf739ec..7515018a0 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -1087,20 +1087,28 @@ struct FluxCLIPEmbedder : public Conditioner {
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states = NULL;          // [N, n_token, 4096]
         struct ggml_tensor* chunk_hidden_states = NULL;    // [n_token*2, 4096]
-        struct ggml_tensor* chunk_hidden_states_l = NULL;  // [n_token, hidden_size_l]
-        struct ggml_tensor* chunk_hidden_states_t5 = NULL; // [n_token, hidden_size_t5]
         struct ggml_tensor* pooled = NULL;                 // [768,]
         std::vector<float> hidden_states_vec;
 
-        size_t chunk_len = 77;
-        size_t chunk_count = clip_l_tokens.size() / chunk_len;
+        size_t chunk_len_l = 77;
+        size_t chunk_count_l = clip_l_tokens.size() / chunk_len_l;
+
+        size_t chunk_len_t5 = 256;
+        size_t chunk_count_t5 = t5_tokens.size() / chunk_len_t5;
+
+        // TODO: I believe chunk_count_l is actually bigger than chunk_count_t5
+        // So this ignores some tokens for clip
+        size_t chunk_count = chunk_count_t5;
+
         for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
+            struct ggml_tensor* chunk_hidden_states_l = NULL;  // [n_token, hidden_size_l]
+            struct ggml_tensor* chunk_hidden_states_t5 = NULL; // [n_token, hidden_size_t5]
             // clip_l
-            {
-                std::vector<int> chunk_tokens(clip_l_tokens.begin() + chunk_idx * chunk_len,
-                                              clip_l_tokens.begin() + (chunk_idx + 1) * chunk_len);
-                std::vector<float> chunk_weights(clip_l_weights.begin() + chunk_idx * chunk_len,
-                                                 clip_l_weights.begin() + (chunk_idx + 1) * chunk_len);
+            if(chunk_idx < chunk_count_l) {
+                std::vector<int> chunk_tokens(clip_l_tokens.begin() + chunk_idx * chunk_len_l,
+                                              clip_l_tokens.begin() + (chunk_idx + 1) * chunk_len_l);
+                std::vector<float> chunk_weights(clip_l_weights.begin() + chunk_idx * chunk_len_l,
+                                                 clip_l_weights.begin() + (chunk_idx + 1) * chunk_len_l);
 
                 auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
                 size_t max_token_idx = 0;
@@ -1129,7 +1137,6 @@ struct FluxCLIPEmbedder : public Conditioner {
                     ggml_tensor_scale(tensor, (original_mean / new_mean));
                 }
                 if (chunk_idx == 0) {
-                    size_t chunk_len_l = 77;
                     std::vector<int> chunk_tokens(clip_l_tokens.begin(),
                                                   clip_l_tokens.begin() + chunk_len_l);
                     std::vector<float> chunk_weights(clip_l_weights.begin(),
@@ -1157,11 +1164,11 @@ struct FluxCLIPEmbedder : public Conditioner {
             }
 
             // t5
-            {
-                std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len,
-                                              t5_tokens.begin() + (chunk_idx + 1) * chunk_len);
-                std::vector<float> chunk_weights(t5_weights.begin() + chunk_idx * chunk_len,
-                                                 t5_weights.begin() + (chunk_idx + 1) * chunk_len);
+            if(chunk_idx < chunk_count_t5) {
+                std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len_t5,
+                                              t5_tokens.begin() + (chunk_idx + 1) * chunk_len_t5);
+                std::vector<float> chunk_weights(t5_weights.begin() + chunk_idx * chunk_len_t5,
+                                                 t5_weights.begin() + (chunk_idx + 1) * chunk_len_t5);
 
                 auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
 
                 t5->compute(n_threads,
@@ -1205,8 +1212,12 @@ struct FluxCLIPEmbedder : public Conditioner {
                     }
                 }
             }
-
-            chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states_l_pad, chunk_hidden_states_t5, 1); // [n_token*2, 4096]
+
+            if(chunk_hidden_states_t5 == NULL){
+                chunk_hidden_states = chunk_hidden_states_l_pad;
+            } else {
+                chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states_l_pad, chunk_hidden_states_t5, 1); // [n_token*2, 4096]
+            }
 
 
             int64_t t1 = ggml_time_ms();

From 6ed560911dcb54a6ff7c9d3c3b268c9d2a93bf67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Thu, 5 Sep 2024 02:15:30 +0200
Subject: [PATCH 04/11] Use all of the clip

---
 conditioner.hpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/conditioner.hpp b/conditioner.hpp
index 7515018a0..e6be90204 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -1096,9 +1096,7 @@ struct FluxCLIPEmbedder : public Conditioner {
         size_t chunk_len_t5 = 256;
         size_t chunk_count_t5 = t5_tokens.size() / chunk_len_t5;
 
-        // TODO: I believe chunk_count_l is actually bigger than chunk_count_t5
-        // So this ignores some tokens for clip
-        size_t chunk_count = chunk_count_t5;
+        size_t chunk_count = std::max(chunk_count_t5, chunk_count_l);
 
         for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
             struct ggml_tensor* chunk_hidden_states_l = NULL;  // [n_token, hidden_size_l]

From 69aad864355bafcf6e1e079507ebbd9541e0cc0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Thu, 5 Sep 2024 02:21:00 +0200
Subject: [PATCH 05/11] Revert "Use all of the clip" (it's breaking things)

This reverts commit de973c1d637ebb44c3b7625daf923ee6b1a2bc66.
---
 conditioner.hpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/conditioner.hpp b/conditioner.hpp
index e6be90204..7515018a0 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -1096,7 +1096,9 @@ struct FluxCLIPEmbedder : public Conditioner {
         size_t chunk_len_t5 = 256;
         size_t chunk_count_t5 = t5_tokens.size() / chunk_len_t5;
 
-        size_t chunk_count = std::max(chunk_count_t5, chunk_count_l);
+        // TODO: I believe chunk_count_l is actually bigger than chunk_count_t5
+        // So this ignores some tokens for clip
+        size_t chunk_count = chunk_count_t5;
 
         for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
             struct ggml_tensor* chunk_hidden_states_l = NULL;  // [n_token, hidden_size_l]
From 7a3a1667cb9fc00ff55a20ee95442fec455643df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Fri, 13 Sep 2024 16:20:42 +0200
Subject: [PATCH 06/11] Clip: Fixed for real this time, i swear

---
 clip.hpp        |   6 ++-
 conditioner.hpp | 127 ++++++++++++------------------------------------
 2 files changed, 36 insertions(+), 97 deletions(-)

diff --git a/clip.hpp b/clip.hpp
index f9ac631a8..073c6f9fb 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -711,7 +711,11 @@ class CLIPTextModel : public GGMLBlock {
         if (return_pooled) {
             auto text_projection = params["text_projection"];
             ggml_tensor* pooled = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx);
-            pooled = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_transpose(ctx, text_projection)), pooled);
+            if(text_projection != NULL){
+                pooled = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_transpose(ctx, text_projection)), pooled);
+            }else{
+                LOG_DEBUG("Missing text_projection matrix, assuming identity...");
+            }
             return pooled;
         }
 
diff --git a/conditioner.hpp b/conditioner.hpp
index 7515018a0..f0ae99c74 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -1073,7 +1073,7 @@ struct FluxCLIPEmbedder : public Conditioner {
         return {{clip_l_tokens, clip_l_weights}, {t5_tokens, t5_weights}};
     }
 
-    SDCondition get_learned_condition_common(ggml_context* work_ctx,
+    SDCondition get_learned_condition_common(ggml_context* work_ctx, 
                                              int n_threads,
                                              std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                                              int clip_skip,
@@ -1084,100 +1084,62 @@ struct FluxCLIPEmbedder : public Conditioner {
         auto& t5_tokens = token_and_weights[1].first;
         auto& t5_weights = token_and_weights[1].second;
 
-        int64_t t0 = ggml_time_ms();
-        struct ggml_tensor* hidden_states = NULL;          // [N, n_token, 4096]
-        struct ggml_tensor* chunk_hidden_states = NULL;    // [n_token*2, 4096]
-        struct ggml_tensor* pooled = NULL;                 // [768,]
+        int64_t t0 = ggml_time_ms();
+        struct ggml_tensor* hidden_states = NULL;       // [N, n_token, 4096]
+        struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, 4096]
+        struct ggml_tensor* pooled = NULL;              // [768,]
         std::vector<float> hidden_states_vec;
 
-        size_t chunk_len_l = 77;
-        size_t chunk_count_l = clip_l_tokens.size() / chunk_len_l;
-
-        size_t chunk_len_t5 = 256;
-        size_t chunk_count_t5 = t5_tokens.size() / chunk_len_t5;
-
-        // TODO: I believe chunk_count_l is actually bigger than chunk_count_t5
-        // So this ignores some tokens for clip
-        size_t chunk_count = chunk_count_t5;
-
+        size_t chunk_len = 256;
+        size_t chunk_count = t5_tokens.size() / chunk_len;
         for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
-            struct ggml_tensor* chunk_hidden_states_l = NULL;  // [n_token, hidden_size_l]
-            struct ggml_tensor* chunk_hidden_states_t5 = NULL; // [n_token, hidden_size_t5]
             // clip_l
-            if(chunk_idx < chunk_count_l) {
-                std::vector<int> chunk_tokens(clip_l_tokens.begin() + chunk_idx * chunk_len_l,
-                                              clip_l_tokens.begin() + (chunk_idx + 1) * chunk_len_l);
-                std::vector<float> chunk_weights(clip_l_weights.begin() + chunk_idx * chunk_len_l,
-                                                 clip_l_weights.begin() + (chunk_idx + 1) * chunk_len_l);
+            if (chunk_idx == 0) {
+                size_t chunk_len_l = 77;
+                std::vector<int> chunk_tokens(clip_l_tokens.begin(),
+                                              clip_l_tokens.begin() + chunk_len_l);
+                std::vector<float> chunk_weights(clip_l_weights.begin(),
+                                                 clip_l_weights.begin() + chunk_len_l);
 
                 auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
                 size_t max_token_idx = 0;
 
+                auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
+                max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
+                LOG_INFO("max_token_idx = %d",max_token_idx);
+
                 clip_l->compute(n_threads,
                                 input_ids,
                                 0,
                                 NULL,
                                 max_token_idx,
-                                false,
-                                &chunk_hidden_states_l,
+                                true,
+                                &pooled,
                                 work_ctx);
-                {
-                    auto tensor = chunk_hidden_states_l;
-                    float original_mean = ggml_tensor_mean(tensor);
-                    for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
-                        for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
-                            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
-                                float value = ggml_tensor_get_f32(tensor, i0, i1, i2);
-                                value *= chunk_weights[i1];
-                                ggml_tensor_set_f32(tensor, value, i0, i1, i2);
-                            }
-                        }
-                    }
-                    float new_mean = ggml_tensor_mean(tensor);
-                    ggml_tensor_scale(tensor, (original_mean / new_mean));
-                }
-                if (chunk_idx == 0) {
-                    std::vector<int> chunk_tokens(clip_l_tokens.begin(),
-                                                  clip_l_tokens.begin() + chunk_len_l);
-                    std::vector<float> chunk_weights(clip_l_weights.begin(),
-                                                     clip_l_weights.begin() + chunk_len_l);
 
-                    auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
-                    size_t max_token_idx = 0;
+                LOG_INFO("pooled->ne = [%d, %d, %d, %d] ",pooled->ne[0], pooled->ne[1], pooled->ne[2], pooled->ne[3]);
 
-                    // auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
-                    // max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
-                    // clip_l->compute(n_threads,
-                    //                 input_ids,
-                    //                 0,
-                    //                 NULL,
-                    //                 max_token_idx,
-                    //                 true,
-                    //                 &pooled,
-                    //                 work_ctx);
-
-                    // clip_l.transformer.text_model.text_projection no in file, ignore
-                    // TODO: use torch.eye(embed_dim) as default clip_l.transformer.text_model.text_projection
-                    pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
-                    ggml_set_f32(pooled, 0.f);
-                }
+                // clip_l.transformer.text_model.text_projection no in file, ignore
+                // TODO: use torch.eye(embed_dim) as default clip_l.transformer.text_model.text_projection
+                // pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
+                // ggml_set_f32(pooled, 0.f);
             }
 
             // t5
-            if(chunk_idx < chunk_count_t5) {
-                std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len_t5,
-                                              t5_tokens.begin() + (chunk_idx + 1) * chunk_len_t5);
-                std::vector<float> chunk_weights(t5_weights.begin() + chunk_idx * chunk_len_t5,
-                                                 t5_weights.begin() + (chunk_idx + 1) * chunk_len_t5);
+            {
+                std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len,
+                                              t5_tokens.begin() + (chunk_idx + 1) * chunk_len);
+                std::vector<float> chunk_weights(t5_weights.begin() + chunk_idx * chunk_len,
+                                                 t5_weights.begin() + (chunk_idx + 1) * chunk_len);
 
                 auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
 
                 t5->compute(n_threads,
                             input_ids,
-                            &chunk_hidden_states_t5,
+                            &chunk_hidden_states,
                             work_ctx);
                 {
-                    auto tensor = chunk_hidden_states_t5;
+                    auto tensor = chunk_hidden_states;
                     float original_mean = ggml_tensor_mean(tensor);
                     for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
                         for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
@@ -1193,33 +1155,6 @@ struct FluxCLIPEmbedder : public Conditioner {
                 }
             }
 
-
-            // TODO: Maybe there's a better way to do the padding?
-            auto chunk_hidden_states_l_pad = ggml_new_tensor_3d(work_ctx,
-                                                                chunk_hidden_states_l->type,
-                                                                4096,
-                                                                chunk_hidden_states_l->ne[1],
-                                                                chunk_hidden_states_l->ne[2]); // [n_token, 4096]
-
-            for (int i2 = 0; i2 < chunk_hidden_states_l_pad->ne[2]; i2++) {
-                for (int i1 = 0; i1 < chunk_hidden_states_l_pad->ne[1]; i1++) {
-                    for (int i0 = 0; i0 < chunk_hidden_states_l_pad->ne[0]; i0++) {
-                        float value = 0.f;
-                        if (i0 < chunk_hidden_states_l->ne[0]) {
-                            value = ggml_tensor_get_f32(chunk_hidden_states_l, i0, i1, i2);
-                        }
-                        ggml_tensor_set_f32(chunk_hidden_states_l_pad, value, i0, i1, i2);
-                    }
-                }
-            }
-
-            if(chunk_hidden_states_t5 == NULL){
-                chunk_hidden_states = chunk_hidden_states_l_pad;
-            } else {
-                chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states_l_pad, chunk_hidden_states_t5, 1); // [n_token*2, 4096]
-            }
-
-
             int64_t t1 = ggml_time_ms();
             LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
             if (force_zero_embeddings) {

From 60f2192e51a38d8b2f3ee14029544a65f5cf73a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Fri, 13 Sep 2024 16:49:52 +0200
Subject: [PATCH 07/11] remove useless logging

---
 conditioner.hpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/conditioner.hpp b/conditioner.hpp
index f0ae99c74..0fb0183f4 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -1106,7 +1106,6 @@ struct FluxCLIPEmbedder : public Conditioner {
 
                 auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
                 max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
-                LOG_INFO("max_token_idx = %d",max_token_idx);
 
                 clip_l->compute(n_threads,
                                 input_ids,
@@ -1117,8 +1116,6 @@ struct FluxCLIPEmbedder : public Conditioner {
                                 &pooled,
                                 work_ctx);
 
-                LOG_INFO("pooled->ne = [%d, %d, %d, %d] ",pooled->ne[0], pooled->ne[1], pooled->ne[2], pooled->ne[3]);
-
                 // clip_l.transformer.text_model.text_projection no in file, ignore
                 // TODO: use torch.eye(embed_dim) as default clip_l.transformer.text_model.text_projection
                 // pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
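The weighting loops that recur in the hunks above implement a mean-preserving prompt weighting: each token's hidden vector is scaled by its weight, then the whole tensor is rescaled so its overall mean matches the unweighted mean. A minimal standalone sketch of that idea on a flat row-major [n_token x hidden_size] buffer; the names are illustrative and this is not code from the patches.

#include <cstddef>
#include <vector>

// Illustrative sketch of the mean-preserving token weighting used above:
// scale token t's hidden vector by weights[t], then rescale everything so the
// overall mean matches the unweighted tensor's mean. Assumes weights.size()
// is at least hidden.size() / hidden_size.
void apply_token_weights(std::vector<float>& hidden,  // n_token * hidden_size floats
                         const std::vector<float>& weights,
                         size_t hidden_size) {
    double sum_before = 0.0;
    for (float v : hidden) sum_before += v;
    size_t n_token = hidden.size() / hidden_size;
    for (size_t t = 0; t < n_token; t++) {
        for (size_t i = 0; i < hidden_size; i++) {
            hidden[t * hidden_size + i] *= weights[t];
        }
    }
    double sum_after = 0.0;
    for (float v : hidden) sum_after += v;
    if (sum_after != 0.0) {
        // same element count, so the ratio of sums equals the ratio of means
        float scale = static_cast<float>(sum_before / sum_after);
        for (float& v : hidden) v *= scale;
    }
}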
From 7bbfb105a027a57098fba8d3d94c44912e4bff33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Fri, 13 Sep 2024 18:06:46 +0200
Subject: [PATCH 08/11] Apply Flux fixes to SD3

---
 conditioner.hpp | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/conditioner.hpp b/conditioner.hpp
index 0fb0183f4..cffc008ba 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -798,21 +798,21 @@ struct SD3CLIPEmbedder : public Conditioner {
             }
 
             if (chunk_idx == 0) {
-                // auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
-                // max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
-                // clip_l->compute(n_threads,
-                //                 input_ids,
-                //                 0,
-                //                 NULL,
-                //                 max_token_idx,
-                //                 true,
-                //                 &pooled_l,
-                //                 work_ctx);
+                auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
+                max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
+                clip_l->compute(n_threads,
+                                input_ids,
+                                0,
+                                NULL,
+                                max_token_idx,
+                                true,
+                                &pooled_l,
+                                work_ctx);
 
                 // clip_l.transformer.text_model.text_projection no in file, ignore
                 // TODO: use torch.eye(embed_dim) as default clip_l.transformer.text_model.text_projection
-                pooled_l = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
-                ggml_set_f32(pooled_l, 0.f);
+                // pooled_l = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
+                // ggml_set_f32(pooled_l, 0.f);
             }
         }
 
@@ -852,21 +852,21 @@ struct SD3CLIPEmbedder : public Conditioner {
             }
 
             if (chunk_idx == 0) {
-                // auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_g_tokenizer.EOS_TOKEN_ID);
-                // max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
-                // clip_g->compute(n_threads,
-                //                 input_ids,
-                //                 0,
-                //                 NULL,
-                //                 max_token_idx,
-                //                 true,
-                //                 &pooled_g,
-                //                 work_ctx);
+                auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_g_tokenizer.EOS_TOKEN_ID);
+                max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
+                clip_g->compute(n_threads,
+                                input_ids,
+                                0,
+                                NULL,
+                                max_token_idx,
+                                true,
+                                &pooled_g,
+                                work_ctx);
 
                 // clip_l.transformer.text_model.text_projection no in file, ignore pooled_g too
                 // TODO: fix pooled_g
-                pooled_g = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1280);
-                ggml_set_f32(pooled_g, 0.f);
+                // pooled_g = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1280);
+                // ggml_set_f32(pooled_g, 0.f);
             }
         }
 

From a96b64d152ac29505774c87fd4ead4004c8fec45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Fri, 13 Sep 2024 18:07:32 +0200
Subject: [PATCH 09/11] Remove TODOs

---
 conditioner.hpp | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/conditioner.hpp b/conditioner.hpp
index cffc008ba..03ddfcf41 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -809,10 +809,6 @@ struct SD3CLIPEmbedder : public Conditioner {
                                 &pooled_l,
                                 work_ctx);
 
-                // clip_l.transformer.text_model.text_projection no in file, ignore
-                // TODO: use torch.eye(embed_dim) as default clip_l.transformer.text_model.text_projection
-                // pooled_l = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
-                // ggml_set_f32(pooled_l, 0.f);
             }
         }
 
@@ -862,11 +858,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                 true,
                                 &pooled_g,
                                 work_ctx);
-                // clip_l.transformer.text_model.text_projection no in file, ignore pooled_g too
-                // TODO: fix pooled_g
-                // pooled_g = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1280);
-                // ggml_set_f32(pooled_g, 0.f);
+
             }
         }
 
@@ -1116,10 +1108,6 @@ struct FluxCLIPEmbedder : public Conditioner {
                                 &pooled,
                                 work_ctx);
 
-                // clip_l.transformer.text_model.text_projection no in file, ignore
-                // TODO: use torch.eye(embed_dim) as default clip_l.transformer.text_model.text_projection
-                // pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
-                // ggml_set_f32(pooled, 0.f);
             }
 
             // t5

From 113829450023ca8f99c26e7a3cdde0ccbea1bd31 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Sat, 19 Oct 2024 17:22:08 +0200
Subject: [PATCH 10/11] Fix formatting

---
 clip.hpp        | 6 +++---
 conditioner.hpp | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/clip.hpp b/clip.hpp
index 073c6f9fb..d7352e3ac 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -711,12 +711,12 @@ class CLIPTextModel : public GGMLBlock {
         if (return_pooled) {
             auto text_projection = params["text_projection"];
             ggml_tensor* pooled = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx);
-            if(text_projection != NULL){
+            if (text_projection != NULL) {
                 pooled = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_transpose(ctx, text_projection)), pooled);
-            }else{
+            } else {
                 LOG_DEBUG("Missing text_projection matrix, assuming identity...");
             }
-            return pooled;
+            return pooled; // [hidden_size, 1, 1]
         }
 
         return x; // [N, n_token, hidden_size]
diff --git a/conditioner.hpp b/conditioner.hpp
index 03ddfcf41..9f9d5ae1f 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -1065,7 +1065,7 @@ struct FluxCLIPEmbedder : public Conditioner {
         return {{clip_l_tokens, clip_l_weights}, {t5_tokens, t5_weights}};
     }
 
-    SDCondition get_learned_condition_common(ggml_context* work_ctx, 
+    SDCondition get_learned_condition_common(ggml_context* work_ctx,
                                              int n_threads,
                                              std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                                              int clip_skip,

From d8c5073567cba8f418503faefe21c68f4688a6b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Thu, 24 Oct 2024 17:45:32 +0200
Subject: [PATCH 11/11] Clip-g: Fix text_projection

---
 clip.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clip.hpp b/clip.hpp
index d7352e3ac..bf2a8c149 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -712,7 +712,7 @@ class CLIPTextModel : public GGMLBlock {
             auto text_projection = params["text_projection"];
             ggml_tensor* pooled = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx);
             if (text_projection != NULL) {
-                pooled = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_transpose(ctx, text_projection)), pooled);
+                pooled = ggml_nn_linear(ctx, pooled, text_projection, NULL);
            } else {
                 LOG_DEBUG("Missing text_projection matrix, assuming identity...");
             }
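Patches 06, 10 and 11 converge on one behaviour for the pooled CLIP embedding: take the hidden state at the EOS token, apply text_projection when the checkpoint provides that weight, and fall back to the raw hidden state (an identity projection) when it does not. A rough sketch of that fallback on flat float buffers, assuming a row-major [out_dim x in_dim] projection matrix; the names here are illustrative only and not part of the patches.

#include <cstddef>
#include <vector>

// Illustrative sketch: project the EOS-token hidden state with text_projection
// when it exists, otherwise pass it through unchanged (identity fallback).
// `proj` is assumed row-major with shape [out_dim][in_dim]; empty means "missing".
std::vector<float> pool_with_projection(const std::vector<float>& eos_hidden,
                                        const std::vector<float>& proj,
                                        size_t out_dim) {
    if (proj.empty()) {
        return eos_hidden;  // no text_projection in the checkpoint: identity
    }
    size_t in_dim = eos_hidden.size();
    std::vector<float> pooled(out_dim, 0.f);
    for (size_t o = 0; o < out_dim; o++) {
        for (size_t i = 0; i < in_dim; i++) {
            pooled[o] += proj[o * in_dim + i] * eos_hidden[i];
        }
    }
    return pooled;
}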