Support denoise masks for InstructPix2Pix (ip2p) models

stduhpf · stduhpf · commit eb4677b913e5 · 2025-05-26T19:36:41.000+02:00
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -815,8 +815,7 @@ class StableDiffusionGGML {
                         const std::vector<float>& sigmas,
                         int start_merge_step,
                         SDCondition id_cond,
-                        ggml_tensor* noise_mask      = nullptr) {
-
+                        ggml_tensor* denoise_mask    = NULL) {
         std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
 
         // TODO (Pix2Pix): separate image guidance params (right now it's reusing distilled guidance)
@@ -1031,10 +1030,10 @@ class StableDiffusionGGML {
                 pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
                 // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
             }
-            if (noise_mask != nullptr) {
+            if (denoise_mask != nullptr) {
                 for (int64_t x = 0; x < denoised->ne[0]; x++) {
                     for (int64_t y = 0; y < denoised->ne[1]; y++) {
-                        float mask = ggml_tensor_get_f32(noise_mask, x, y);
+                        float mask = ggml_tensor_get_f32(denoise_mask, x, y);
                         for (int64_t k = 0; k < denoised->ne[2]; k++) {
                             float init = ggml_tensor_get_f32(init_latent, x, y, k);
                             float den  = ggml_tensor_get_f32(denoised, x, y, k);
@@ -1288,7 +1287,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                            float style_ratio,
                            bool normalize_input,
                            std::string input_id_images_path,
-                           ggml_tensor* concat_latent = NULL) {
+                           ggml_tensor* concat_latent = NULL,
+                           ggml_tensor* denoise_mask  = NULL) {
     if (seed < 0) {
         // Generally, when using the provided command line, the seed is always >0.
         // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1475,7 +1475,6 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
     int W = width / 8;
     int H = height / 8;
     LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
-    ggml_tensor* noise_mask = nullptr;
 
     struct ggml_tensor* control_latent = NULL;
     if(sd_version_is_control(sd_ctx->sd->version) && image_hint != NULL){
@@ -1544,8 +1543,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
             concat_latent = empty_latent;
         }
         cond.c_concat   = concat_latent;
+        cond.c_concat   = concat_latent;
         uncond.c_concat = empty_latent;
-        noise_mask    = NULL;
+        denoise_mask    = NULL;
     } else if (sd_version_is_edit(sd_ctx->sd->version) || sd_version_is_control(sd_ctx->sd->version)) {
         LOG_INFO("HERE");
         auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2], init_latent->ne[3]);
@@ -1561,8 +1561,6 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
         LOG_INFO("HERE");
 
         cond.c_concat     = concat_latent;
-    } else {
-        noise_mask = concat_latent;
     }
 
     for (int b = 0; b < batch_count; b++) {
@@ -1599,7 +1597,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                                                      sigmas,
                                                      start_merge_step,
                                                      id_cond,
-                                                     noise_mask);
+                                                     denoise_mask);
 
         // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
         // print_ggml_tensor(x_0);
@@ -1811,6 +1809,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
     sd_image_to_tensor(init_image.data, init_img);
 
     ggml_tensor* concat_latent;
+    ggml_tensor* denoise_mask = NULL;
 
     ggml_tensor* init_latent  = NULL;
     ggml_tensor* init_moments = NULL;
@@ -1950,7 +1949,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
                                                style_ratio,
                                                normalize_input,
                                                input_id_images_path_c_str,
-                                               concat_latent);
+                                               concat_latent,
+                                               denoise_mask);
 
     size_t t2 = ggml_time_ms();