
Commit e904b86

remove old flash attention option and switch vae over to attn_ext
1 parent 408cb05 commit e904b86

4 files changed: +10 -43 lines


CMakeLists.txt

Lines changed: 0 additions & 6 deletions
@@ -29,7 +29,6 @@ option(SD_HIPBLAS "sd: rocm backend" OFF)
 option(SD_METAL "sd: metal backend" OFF)
 option(SD_VULKAN "sd: vulkan backend" OFF)
 option(SD_SYCL "sd: sycl backend" OFF)
-option(SD_FLASH_ATTN "sd: use flash attention for x4 less memory usage" OFF)
 option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
 option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
 #option(SD_BUILD_SERVER "sd: build server example" ON)
@@ -61,11 +60,6 @@ if (SD_HIPBLAS)
     endif()
 endif ()
 
-if(SD_FLASH_ATTN)
-    message("-- Use Flash Attention for memory optimization")
-    add_definitions(-DSD_USE_FLASH_ATTENTION)
-endif()
-
 set(SD_LIB stable-diffusion)
 
 file(GLOB SD_LIB_SOURCES
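
For context, the removed build option would previously have been enabled at configure time in the usual CMake way, for example with cmake -B build -DSD_FLASH_ATTN=ON (an illustrative command, not taken from this repo's docs). After this commit the flag no longer exists; whether the flash-attention kernel is used is instead decided by the runtime eligibility checks inside ggml_nn_attention_ext (see ggml_extend.hpp below).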

ggml_extend.hpp

Lines changed: 3 additions & 26 deletions
@@ -667,32 +667,6 @@ __STATIC_INLINE__ std::vector<struct ggml_tensor*> split_qkv(struct ggml_context
     return {q, k, v};
 }
 
-// q: [N * n_head, n_token, d_head]
-// k: [N * n_head, n_k, d_head]
-// v: [N * n_head, d_head, n_k]
-// return: [N * n_head, n_token, d_head]
-__STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention(struct ggml_context* ctx,
-                                                        struct ggml_tensor* q,
-                                                        struct ggml_tensor* k,
-                                                        struct ggml_tensor* v,
-                                                        bool mask = false) {
-#if defined(SD_USE_FLASH_ATTENTION) && !defined(SD_USE_CUBLAS) && !defined(SD_USE_METAL) && !defined(SD_USE_VULKAN) && !defined(SD_USE_SYCL)
-    struct ggml_tensor* kqv = ggml_flash_attn(ctx, q, k, v, false);  // [N * n_head, n_token, d_head]
-#else
-    float d_head = (float)q->ne[0];
-
-    struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q);  // [N * n_head, n_token, n_k]
-    kq = ggml_scale_inplace(ctx, kq, 1.0f / sqrt(d_head));
-    if (mask) {
-        kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);
-    }
-    kq = ggml_soft_max_inplace(ctx, kq);
-
-    struct ggml_tensor* kqv = ggml_mul_mat(ctx, v, kq);  // [N * n_head, n_token, d_head]
-#endif
-    return kqv;
-}
-
 // q: [N, L_q, C] or [N*n_head, L_q, d_head]
 // k: [N, L_k, C] or [N*n_head, L_k, d_head]
 // v: [N, L_k, C] or [N, L_k, n_head, d_head]
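
For reference, the deleted ggml_nn_attention helper computed plain scaled dot-product attention (the ggml_mul_mat argument order reflects ggml's transposed-operand convention), and per the commit message the same computation is now obtained through ggml_nn_attention_ext instead. In standard notation, the removed fallback path evaluated

    \mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{d_{\mathrm{head}}}}\right) V

with q of shape [N * n_head, n_token, d_head], k of shape [N * n_head, n_k, d_head], and v pre-transposed to [N * n_head, d_head, n_k], matching the shape comments above.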
@@ -747,6 +721,9 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
     can_use_flash_attn = can_use_flash_attn && L_k % 256 == 0;
     can_use_flash_attn = can_use_flash_attn && d_head % 64 == 0;  // double check
 
+    // cuda max d_head seems to be 256, cpu does seem to work with 512
+    can_use_flash_attn = can_use_flash_attn && d_head <= 256;  // double check
+
     if (mask != nullptr) {
         // TODO: figure out if we can bend t5 to work too
         can_use_flash_attn = can_use_flash_attn && mask->ne[2] == 1;
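
Taken together with the surrounding context lines, the conditions gating the flash-attention path can be restated as one small predicate. This is a sketch for illustration only: the helper name and parameter list are hypothetical, and the real code inlines these checks inside ggml_nn_attention_ext.

    #include <cstdint>
    #include "ggml.h"  // for struct ggml_tensor (assumed include)

    // Hypothetical restatement of the checks from the hunk above; not part of the repo.
    static bool flash_attn_eligible(int64_t L_k, int64_t d_head, const struct ggml_tensor* mask, bool requested) {
        bool ok = requested;              // caller/backend opt-in, set earlier in the real function
        ok = ok && L_k % 256 == 0;        // key length must be a multiple of 256
        ok = ok && d_head % 64 == 0;      // head size must be a multiple of 64
        // cuda max d_head seems to be 256, cpu does seem to work with 512
        ok = ok && d_head <= 256;         // upper bound added by this commit
        if (mask != nullptr) {
            // original TODO: figure out if we can bend t5 to work too
            ok = ok && mask->ne[2] == 1;  // the mask must not be batched along ne[2]
        }
        return ok;
    }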

stable-diffusion.cpp

Lines changed: 1 addition & 7 deletions
@@ -181,13 +181,7 @@ class StableDiffusionGGML {
             LOG_DEBUG("Using CPU backend");
             backend = ggml_backend_cpu_init();
         }
-#ifdef SD_USE_FLASH_ATTENTION
-#if defined(SD_USE_CUBLAS) || defined(SD_USE_METAL) || defined (SD_USE_SYCL) || defined(SD_USE_VULKAN)
-        LOG_WARN("Flash Attention not supported with GPU Backend");
-#else
-        LOG_INFO("Flash Attention enabled");
-#endif
-#endif
+
         ModelLoader model_loader;
 
         vae_tiling = vae_tiling_;

vae.hpp

Lines changed: 6 additions & 4 deletions
@@ -99,10 +99,12 @@ class AttnBlock : public UnaryBlock {
         k = ggml_cont(ctx, ggml_permute(ctx, k, 1, 2, 0, 3));  // [N, h, w, in_channels]
         k = ggml_reshape_3d(ctx, k, c, h * w, n);  // [N, h * w, in_channels]
 
-        auto v = v_proj->forward(ctx, h_);  // [N, in_channels, h, w]
-        v = ggml_reshape_3d(ctx, v, h * w, c, n);  // [N, in_channels, h * w]
+        auto v = v_proj->forward(ctx, h_);  // [N, in_channels, h, w]
+        v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));  // [N, h, w, in_channels]
+        v = ggml_reshape_3d(ctx, v, c, h * w, n);  // [N, h * w, in_channels]
 
-        h_ = ggml_nn_attention(ctx, q, k, v, false);  // [N, h * w, in_channels]
+        //h_ = ggml_nn_attention(ctx, q, k, v, false);  // [N, h * w, in_channels]
+        h_ = ggml_nn_attention_ext(ctx, q, k, v, 1, nullptr, false, true, false);
 
         h_ = ggml_cont(ctx, ggml_permute(ctx, h_, 1, 0, 2, 3));  // [N, in_channels, h * w]
         h_ = ggml_reshape_4d(ctx, h_, w, h, c, n);  // [N, in_channels, h, w]
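
Two things change in this block: v now goes through the same permute/reshape as q and k, ending up in the [N, h * w, in_channels] layout rather than the transposed [N, in_channels, h * w] layout the removed ggml_nn_attention expected for its value tensor, and the attention call is routed through ggml_nn_attention_ext. A rough annotation of the new call follows; the argument meanings are inferred from this diff and the declaration comments above, so treat them as assumptions rather than documented behaviour:

    // q, k, v: [N, h * w, in_channels], i.e. a single attention "head" over the h * w spatial tokens
    // 1       -> presumably n_head
    // nullptr -> no attention mask
    // false, true, false -> behaviour flags whose exact meaning is not visible in this diff
    h_ = ggml_nn_attention_ext(ctx, q, k, v, 1, nullptr, false, true, false);  // [N, h * w, in_channels]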
@@ -612,4 +614,4 @@ struct AutoEncoderKL : public GGMLRunner {
 };
 };
 
-#endif
+#endif
