@@ -1477,10 +1477,10 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
1477
1477
LOG_INFO (" sampling using %s method" , sampling_methods_str[sample_method]);
1478
1478
1479
1479
struct ggml_tensor * control_latent = NULL ;
1480
- if (sd_version_is_control (sd_ctx->sd ->version ) && image_hint != NULL ){
1480
+ if (sd_version_is_control (sd_ctx->sd ->version ) && image_hint != NULL ) {
1481
1481
if (!sd_ctx->sd ->use_tiny_autoencoder ) {
1482
1482
struct ggml_tensor * control_moments = sd_ctx->sd ->encode_first_stage (work_ctx, image_hint);
1483
- control_latent = sd_ctx->sd ->get_first_stage_encoding (work_ctx, control_moments);
1483
+ control_latent = sd_ctx->sd ->get_first_stage_encoding (work_ctx, control_moments);
1484
1484
} else {
1485
1485
control_latent = sd_ctx->sd ->encode_first_stage (work_ctx, image_hint);
1486
1486
}
@@ -1560,7 +1560,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
1560
1560
}
1561
1561
LOG_INFO (" HERE" );
1562
1562
1563
- cond.c_concat = concat_latent;
1563
+ cond.c_concat = concat_latent;
1564
1564
}
1565
1565
1566
1566
for (int b = 0 ; b < batch_count; b++) {
@@ -1827,16 +1827,23 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
1827
1827
} else if (sd_ctx->sd ->version == VERSION_FLEX_2) {
1828
1828
mask_channels = 1 + init_latent->ne [2 ];
1829
1829
}
1830
- ggml_tensor* masked_img = ggml_new_tensor_4d (work_ctx, GGML_TYPE_F32, width, height, 3 , 1 );
1831
- // Restore init_img (encode_first_stage has side effects) TODO: remove the side effects?
1832
- sd_image_to_tensor (init_image.data , init_img);
1833
- sd_apply_mask (init_img, mask_img, masked_img);
1834
1830
ggml_tensor* masked_latent_0 = NULL ;
1835
- if (!sd_ctx->sd ->use_tiny_autoencoder ) {
1836
- ggml_tensor* moments = sd_ctx->sd ->encode_first_stage (work_ctx, masked_img);
1837
- masked_latent_0 = sd_ctx->sd ->get_first_stage_encoding (work_ctx, moments);
1831
+ if (sd_ctx->sd ->version != VERSION_FLEX_2) {
1832
+ // Most inpaint models apply the mask to the image before VAE encoding.
1833
+ ggml_tensor* masked_img = ggml_new_tensor_4d (work_ctx, GGML_TYPE_F32, width, height, 3 , 1 );
1834
+ // Restore init_img (encode_first_stage has side effects) TODO: remove the side effects?
1835
+ sd_image_to_tensor (init_image.data , init_img);
1836
+ sd_apply_mask (init_img, mask_img, masked_img);
1837
+ if (!sd_ctx->sd ->use_tiny_autoencoder ) {
1838
+ ggml_tensor* moments = sd_ctx->sd ->encode_first_stage (work_ctx, masked_img);
1839
+ masked_latent_0 = sd_ctx->sd ->get_first_stage_encoding (work_ctx, moments);
1840
+ } else {
1841
+ masked_latent_0 = sd_ctx->sd ->encode_first_stage (work_ctx, masked_img);
1842
+ }
1838
1843
} else {
1839
- masked_latent_0 = sd_ctx->sd ->encode_first_stage (work_ctx, masked_img);
1844
+ // Flex.2: apply the mask to the already-encoded latent (i.e. after VAE encoding).
1845
+ masked_latent_0 = ggml_new_tensor_4d (work_ctx, GGML_TYPE_F32, init_latent->ne [0 ], init_latent->ne [1 ], init_latent->ne [2 ], 1 );
1846
+ sd_apply_mask (init_latent, mask_img, masked_latent_0, 0 .);
1840
1847
}
1841
1848
concat_latent = ggml_new_tensor_4d (work_ctx, GGML_TYPE_F32, masked_latent_0->ne [0 ], masked_latent_0->ne [1 ], mask_channels + masked_latent_0->ne [2 ], 1 );
1842
1849
for (int ix = 0 ; ix < masked_latent_0->ne [0 ]; ix++) {
0 commit comments