From 0d778f6b216aa41a571a6bca9b875d6c8cecaea9 Mon Sep 17 00:00:00 2001
From: Wentian <94900022+WentianZhang-ML@users.noreply.github.com>
Date: Thu, 11 Apr 2024 00:40:42 +0800
Subject: [PATCH 01/14] Create tgate.md

---
 docs/source/en/optimization/tgate.md | 194 +++++++++++++++++++++++++++
 1 file changed, 194 insertions(+)
 create mode 100644 docs/source/en/optimization/tgate.md

diff --git a/docs/source/en/optimization/tgate.md b/docs/source/en/optimization/tgate.md
new file mode 100644
index 000000000000..52d50e8589d6
--- /dev/null
+++ b/docs/source/en/optimization/tgate.md
@@ -0,0 +1,194 @@
+# TGATE
+
+[TGATE](https://github.com/HaozheLiu-ST/T-GATE/tree/main) accelerates inferences of [`PixArtAlphaPipeline`], [`StableDiffusionPipeline`], and [`StableDiffusionXLPipeline`] by skipping the calculation of cross-attention once it converges. More details can be found at [technical report](https://huggingface.co/papers/2404.02747).
+
+![](https://github.com/HaozheLiu-ST/T-GATE/assets/53887227/bff43e0e-2372-4edc-9ba1-64dcbc649329)
+
+
+
+
+## 🚀 Major Features 
+
+* Training-Free.
+* Easily Integrate into [Diffusers](https://github.com/huggingface/diffusers/tree/main).
+* Only a few lines of code are required.
+* Complementary to [DeepCache](https://github.com/horseee/DeepCache).
+* Friendly support [Stable Diffusion pipelines](https://huggingface.co/stabilityai), [PixArt](https://pixart-alpha.github.io/), and [Latent Consistency Models](https://latent-consistency-models.github.io/).
+* 10%-50% speed up for different models. 
+
+## 📖 Quick Start
+
+### 🛠️ Installation
+
+Start by installing [TGATE](https://github.com/HaozheLiu-ST/T-GATE/tree/release-v.0.1.0):
+
+```
+pip install tgate
+```
+
+#### Requirements
+
+* pytorch>=2.0.0
+* diffusers>=0.27.2
+* transformers==4.37.2
+* DeepCache==0.1.1
+* accelerate
+
+### 🌟 Usage
+
+Accelerate `PixArtAlphaPipeline` with TGATE:
+
+```diff
+import torch
+from diffusers import PixArtAlphaPipeline
+
+pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16)
+
++ from tgate import TgatePixArtLoader
++ gate_step = 8
++ inference_step = 25
++ pipe = TgatePixArtLoader(
++        pipe,
++        gate_step=gate_step,
++        num_inference_steps=inference_step,
++ )
+pipe = pipe.to("cuda")
+
++ image = pipe.tgate(
++         "An alpaca made of colorful building blocks, cyberpunk.",
++         gate_step=gate_step,
++         num_inference_steps=inference_step,
++ ).images[0]
+```
+
+Accelerate `StableDiffusionXLPipeline` with TGATE:
+
+```diff
+import torch
+from diffusers import StableDiffusionXLPipeline
+from diffusers import DPMSolverMultistepScheduler
+
+pipe = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            torch_dtype=torch.float16,
+            variant="fp16",
+            use_safetensors=True,
+)
+
++ from tgate import TgateSDXLLoader
++ gate_step = 10
++ inference_step = 25
++ pipe = TgateSDXLLoader(
++        pipe,
++        gate_step=gate_step,
++        num_inference_steps=inference_step,
++ )
+
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+pipe = pipe.to("cuda")
+
++ image = pipe.tgate(
++         "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
++         gate_step=gate_step,
++         num_inference_steps=inference_step
++ ).images[0]
+```
+
+Accelerate `StableDiffusionXLPipeline` with [DeepCache](https://github.com/horseee/DeepCache) and TGATE:
+
+```diff
+import torch
+from diffusers import StableDiffusionXLPipeline
+from diffusers import DPMSolverMultistepScheduler
+
+pipe = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+            torch_dtype=torch.float16,
+            variant="fp16",
+            use_safetensors=True,
+)
+
++ from tgate import TgateSDXLDeepCacheLoader
++ gate_step = 10
++ inference_step = 25
++ pipe = TgateSDXLDeepCacheLoader(
++        pipe,
++        cache_interval=3,
++        cache_branch_id=0,
++ )
+
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+pipe = pipe.to("cuda")
+
++ image = pipe.tgate(
++         "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
++         gate_step=gate_step,
++         num_inference_steps=inference_step
++ ).images[0]
+```
+
+Accelerate `latent-consistency/lcm-sdxl` with TGATE:
+
+```diff
+import torch
+from diffusers import StableDiffusionXLPipeline
+from diffusers import UNet2DConditionModel, LCMScheduler
+from diffusers import DPMSolverMultistepScheduler
+
+unet = UNet2DConditionModel.from_pretrained(
+    "latent-consistency/lcm-sdxl",
+    torch_dtype=torch.float16,
+    variant="fp16",
+)
+pipe = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    unet=unet,
+    torch_dtype=torch.float16,
+    variant="fp16",
+)
+pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
+
++ from tgate import TgateSDXLLoader
++ gate_step = 1
++ inference_step = 4
++ pipe = TgateSDXLLoader(
++        pipe,
++        gate_step=gate_step,
++        num_inference_steps=inference_step,
++        lcm=True
++ )
+pipe = pipe.to("cuda")
+
++ image = pipe.tgate(
++         "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
++         gate_step=gate_step,
++         num_inference_steps=inference_step
++ ).images[0]
+```
+
+TGATE also supports `StableDiffusionPipeline` and `PixArt-alpha/PixArt-LCM-XL-2-1024-MS`.
+More details can be found at [here](https://github.com/HaozheLiu-ST/T-GATE/tree/release-v.0.1.0/main.py).
+
+## 📄 Results
+| Model                 | MACs     | Param     | Latency | Zero-shot 10K-FID on MS-COCO |
+|-----------------------|----------|-----------|---------|---------------------------|
+| SD-1.5                | 16.938T  | 859.520M  | 7.032s  | 23.927                    |
+| SD-1.5 w/ TGATE       | 9.875T   | 815.557M  | 4.313s  | 20.789                    |
+| SD-2.1                | 38.041T  | 865.785M  | 16.121s | 22.609                    |
+| SD-2.1 w/ TGATE       | 22.208T  | 815.433 M | 9.878s  | 19.940                    |
+| SD-XL                 | 149.438T | 2.570B    | 53.187s | 24.628                    |
+| SD-XL w/ TGATE        | 84.438T  | 2.024B    | 27.932s | 22.738                    |
+| Pixart-Alpha          | 107.031T | 611.350M  | 61.502s | 38.669                    |
+| Pixart-Alpha w/ TGATE | 65.318T  | 462.585M  | 37.867s | 35.825                    |
+| DeepCache (SD-XL)     | 57.888T  | -         | 19.931s | 23.755                    |
+| DeepCache w/ TGATE    | 43.868T  | -         | 14.666s | 23.999                    |
+| LCM (SD-XL)           | 11.955T  | 2.570B    | 3.805s  | 25.044                    |
+| LCM w/ TGATE          | 11.171T  | 2.024B    | 3.533s  | 25.028                    |
+| LCM (Pixart-Alpha)    | 8.563T   | 611.350M  | 4.733s  | 36.086                    |
+| LCM w/ TGATE          | 7.623T   | 462.585M  | 4.543s  | 37.048                    |
+
+The latency is tested on a 1080ti commercial card. 
+
+The MACs and Params are calculated by [calflops](https://github.com/MrYxJ/calculate-flops.pytorch). 
+
+The FID is calculated by [PytorchFID](https://github.com/mseitzer/pytorch-fid).

From bf97d3558382a58df44fb09880ad0d0302e889f1 Mon Sep 17 00:00:00 2001
From: Wentian <94900022+WentianZhang-ML@users.noreply.github.com>
Date: Sat, 13 Apr 2024 19:28:12 +0800
Subject: [PATCH 02/14] Update _toctree.yml

---
 docs/source/en/_toctree.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 83693485d0e2..8265f315bf6c 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -172,6 +172,8 @@
       title: Token merging
     - local: optimization/deepcache
       title: DeepCache
+    - local: optimization/tgate
+      title: TGATE
     title: General optimizations
   - sections:
     - local: using-diffusers/stable_diffusion_jax_how_to

From df5187b306b579812419c7ff3909ec84985661f5 Mon Sep 17 00:00:00 2001
From: Wentian <94900022+WentianZhang-ML@users.noreply.github.com>
Date: Tue, 16 Apr 2024 10:50:36 +0800
Subject: [PATCH 03/14] Update docs/source/en/optimization/tgate.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/en/optimization/tgate.md | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/docs/source/en/optimization/tgate.md b/docs/source/en/optimization/tgate.md
index 52d50e8589d6..7727b677c91b 100644
--- a/docs/source/en/optimization/tgate.md
+++ b/docs/source/en/optimization/tgate.md
@@ -1,20 +1,6 @@
-# TGATE
+# T-GATE
 
-[TGATE](https://github.com/HaozheLiu-ST/T-GATE/tree/main) accelerates inferences of [`PixArtAlphaPipeline`], [`StableDiffusionPipeline`], and [`StableDiffusionXLPipeline`] by skipping the calculation of cross-attention once it converges. More details can be found at [technical report](https://huggingface.co/papers/2404.02747).
-
-![](https://github.com/HaozheLiu-ST/T-GATE/assets/53887227/bff43e0e-2372-4edc-9ba1-64dcbc649329)
-
-
-
-
-## 🚀 Major Features 
-
-* Training-Free.
-* Easily Integrate into [Diffusers](https://github.com/huggingface/diffusers/tree/main).
-* Only a few lines of code are required.
-* Complementary to [DeepCache](https://github.com/horseee/DeepCache).
-* Friendly support [Stable Diffusion pipelines](https://huggingface.co/stabilityai), [PixArt](https://pixart-alpha.github.io/), and [Latent Consistency Models](https://latent-consistency-models.github.io/).
-* 10%-50% speed up for different models. 
+[T-GATE](https://github.com/HaozheLiu-ST/T-GATE/tree/main) accelerates inference for [Stable Diffusion](../api/pipelines/stable_diffusion/overview), [PixArt](../api/pipelines/pixart), and [Latency Consistency Model](../api/pipelines/latent_consistency_models.md) pipelines by skipping the cross-attention calculation once it converges. This method doesn't require any additional training and it can speed up inference from 10-50%. T-GATE is also compatible with other optimization methods like [DeepCache](./deepcache).
 
 ## 📖 Quick Start
 

From 0fda09c5b90686cc3513a5618d7101b949bc3d6f Mon Sep 17 00:00:00 2001
From: Wentian <94900022+WentianZhang-ML@users.noreply.github.com>
Date: Tue, 16 Apr 2024 10:50:46 +0800
Subject: [PATCH 04/14] Update docs/source/en/optimization/tgate.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/en/optimization/tgate.md | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/docs/source/en/optimization/tgate.md b/docs/source/en/optimization/tgate.md
index 7727b677c91b..5c05bb7169fb 100644
--- a/docs/source/en/optimization/tgate.md
+++ b/docs/source/en/optimization/tgate.md
@@ -2,11 +2,7 @@
 
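-[T-GATE](https://github.com/HaozheLiu-ST/T-GATE/tree/main) accelerates inference for [Stable Diffusion](../api/pipelines/stable_diffusion/overview), [PixArt](../api/pipelines/pixart), and [Latency Consistency Model](../api/pipelines/latent_consistency_models.md) pipelines by skipping the cross-attention calculation once it converges. This method doesn't require any additional training and it can speed up inference from 10-50%. T-GATE is also compatible with other optimization methods like [DeepCache](./deepcache).
+[T-GATE](https://github.com/HaozheLiu-ST/T-GATE/tree/main) accelerates inference for [Stable Diffusion](../api/pipelines/stable_diffusion/overview), [PixArt](../api/pipelines/pixart), and [Latent Consistency Model](../api/pipelines/latent_consistency_models.md) pipelines by skipping the cross-attention calculation once it converges. This method doesn't require any additional training and it can speed up inference by 10-50%. T-GATE is also compatible with other optimization methods like [DeepCache](./deepcache).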
 
-## 📖 Quick Start
-
-### 🛠️ Installation
-
-Start by installing [TGATE](https://github.com/HaozheLiu-ST/T-GATE/tree/release-v.0.1.0):
+Before you begin, make sure you install T-GATE.
 
 ```
 pip install tgate

From 777a1ab07049daf389c9a542cf3a7da5ccb21867 Mon Sep 17 00:00:00 2001
From: Wentian <94900022+WentianZhang-ML@users.noreply.github.com>
Date: Tue, 16 Apr 2024 10:51:00 +0800
Subject: [PATCH 05/14] Update docs/source/en/optimization/tgate.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/en/optimization/tgate.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/optimization/tgate.md b/docs/source/en/optimization/tgate.md
index 5c05bb7169fb..3784eeb007f9 100644
--- a/docs/source/en/optimization/tgate.md
+++ b/docs/source/en/optimization/tgate.md
@@ -4,7 +4,7 @@
 
 Before you begin, make sure you install T-GATE.
 
-```
+```bash
 pip install tgate
 ```
 

From 6a57cd3c9ccb08228033b0583daefbbf68eb28b5 Mon Sep 17 00:00:00 2001
From: Wentian <94900022+WentianZhang-ML@users.noreply.github.com>
Date: Tue, 16 Apr 2024 10:51:32 +0800
Subject: [PATCH 06/14] Update docs/source/en/optimization/tgate.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/en/optimization/tgate.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/optimization/tgate.md b/docs/source/en/optimization/tgate.md
index 3784eeb007f9..616c522b8b05 100644
--- a/docs/source/en/optimization/tgate.md
+++ b/docs/source/en/optimization/tgate.md
@@ -6,7 +6,7 @@ Before you begin, make sure you install T-GATE.
 
 ```bash
 pip install tgate
-```
+pip install -U pytorch diffusers transformers accelerate DeepCache
 
 #### Requirements
 

From ef60e1c2dfb4c0bb5058db0c1c48b7ddcae0f4a9 Mon Sep 17 00:00:00 2001
From: Wentian <94900022+WentianZhang-ML@users.noreply.github.com>
Date: Tue, 16 Apr 2024 10:51:39 +0800
Subject: [PATCH 07/14] Update docs/source/en/optimization/tgate.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/en/optimization/tgate.md | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/docs/source/en/optimization/tgate.md b/docs/source/en/optimization/tgate.md
index 616c522b8b05..be27ea326ed2 100644
--- a/docs/source/en/optimization/tgate.md
+++ b/docs/source/en/optimization/tgate.md
@@ -8,13 +8,6 @@ Before you begin, make sure you install T-GATE.
 pip install tgate
 pip install -U pytorch diffusers transformers accelerate DeepCache
 
-#### Requirements
-
-* pytorch>=2.0.0
-* diffusers>=0.27.2
-* transformers==4.37.2
-* DeepCache==0.1.1
-* accelerate
 
 ### 🌟 Usage
 

From c2fffb7174abd51694463fe499cd966ba9c6f5b5 Mon Sep 17 00:00:00 2001
From: Wentian <94900022+WentianZhang-ML@users.noreply.github.com>
Date: Tue, 16 Apr 2024 10:52:07 +0800
Subject: [PATCH 08/14] Update docs/source/en/optimization/tgate.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/en/optimization/tgate.md | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/optimization/tgate.md b/docs/source/en/optimization/tgate.md
index be27ea326ed2..6d9980f16165 100644
--- a/docs/source/en/optimization/tgate.md
+++ b/docs/source/en/optimization/tgate.md
@@ -9,9 +9,17 @@ pip install tgate
 pip install -U pytorch diffusers transformers accelerate DeepCache
 
 
-### 🌟 Usage
+To use T-GATE with a pipeline, you need to use its corresponding loader.
 
-Accelerate `PixArtAlphaPipeline` with TGATE:
+| Pipeline | T-GATE Loader |
+|---|---|
+| PixArt | TgatePixArtLoader |
+| Stable Diffusion XL | TgateSDXLLoader |
+| Stable Diffusion XL + DeepCache | TgateSDXLDeepCacheLoader |
+
+Next, create a `TgateLoader` with a pipeline, the gate step(`add brief description here`), and the number of inference steps. Then call the `tgate` method on the pipeline with a prompt, gate step, and the number of inference steps.
+
+Let's see how to enable this for several different pipelines.
 
 ```diff
 import torch

From 3beda26928a2f33c78a8ef57633025b4ac4954cd Mon Sep 17 00:00:00 2001
From: Wentian <94900022+WentianZhang-ML@users.noreply.github.com>
Date: Tue, 16 Apr 2024 10:54:52 +0800
Subject: [PATCH 09/14] Update docs/source/en/optimization/tgate.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/en/optimization/tgate.md | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/docs/source/en/optimization/tgate.md b/docs/source/en/optimization/tgate.md
index 6d9980f16165..2717349c6836 100644
--- a/docs/source/en/optimization/tgate.md
+++ b/docs/source/en/optimization/tgate.md
@@ -21,28 +21,24 @@ Next, create a `TgateLoader` with a pipeline, the gate step(`add brief descripti
 
 Let's see how to enable this for several different pipelines.
 
-```diff
+```py
 import torch
 from diffusers import PixArtAlphaPipeline
+from tgate import TgatePixArtLoader
 
 pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16)
-
-+ from tgate import TgatePixArtLoader
-+ gate_step = 8
-+ inference_step = 25
-+ pipe = TgatePixArtLoader(
-+        pipe,
-+        gate_step=gate_step,
-+        num_inference_steps=inference_step,
-+ )
+pipe = TgatePixArtLoader(
+       pipe,
+       gate_step=8,
+       num_inference_steps=25,
+)
 pipe = pipe.to("cuda")
 
-+ image = pipe.tgate(
-+         "An alpaca made of colorful building blocks, cyberpunk.",
-+         gate_step=gate_step,
-+         num_inference_steps=inference_step,
-+ ).images[0]
-```
+image = pipe.tgate(
+       "An alpaca made of colorful building blocks, cyberpunk.",
+        gate_step=gate_step,
+       num_inference_steps=inference_step,
+).images[0]
 
 Accelerate `StableDiffusionXLPipeline` with TGATE:
 

From 3cfec749424d82c4eb7e0da05861ff3212e93569 Mon Sep 17 00:00:00 2001
From: Wentian <94900022+WentianZhang-ML@users.noreply.github.com>
Date: Tue, 16 Apr 2024 10:55:11 +0800
Subject: [PATCH 10/14] Update docs/source/en/optimization/tgate.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/en/optimization/tgate.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/source/en/optimization/tgate.md b/docs/source/en/optimization/tgate.md
index 2717349c6836..7f055a3b8d72 100644
--- a/docs/source/en/optimization/tgate.md
+++ b/docs/source/en/optimization/tgate.md
@@ -145,8 +145,7 @@ pipe = pipe.to("cuda")
 + ).images[0]
 ```
 
-TGATE also supports `StableDiffusionPipeline` and `PixArt-alpha/PixArt-LCM-XL-2-1024-MS`.
-More details can be found at [here](https://github.com/HaozheLiu-ST/T-GATE/tree/release-v.0.1.0/main.py).
+T-GATE also supports [`StableDiffusionPipeline`] and [PixArt-alpha/PixArt-LCM-XL-2-1024-MS](https://hf.co/PixArt-alpha/PixArt-LCM-XL-2-1024-MS).
 
 ## 📄 Results
 | Model                 | MACs     | Param     | Latency | Zero-shot 10K-FID on MS-COCO |

From 6e0c212db74f557033de239e21f520f54b54f667 Mon Sep 17 00:00:00 2001
From: Wentian <94900022+WentianZhang-ML@users.noreply.github.com>
Date: Tue, 16 Apr 2024 10:55:24 +0800
Subject: [PATCH 11/14] Update docs/source/en/optimization/tgate.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/en/optimization/tgate.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/optimization/tgate.md b/docs/source/en/optimization/tgate.md
index 7f055a3b8d72..6155a9e565bc 100644
--- a/docs/source/en/optimization/tgate.md
+++ b/docs/source/en/optimization/tgate.md
@@ -147,7 +147,7 @@ pipe = pipe.to("cuda")
 
 T-GATE also supports [`StableDiffusionPipeline`] and [PixArt-alpha/PixArt-LCM-XL-2-1024-MS](https://hf.co/PixArt-alpha/PixArt-LCM-XL-2-1024-MS).
 
-## 📄 Results
+## Benchmarks
 | Model                 | MACs     | Param     | Latency | Zero-shot 10K-FID on MS-COCO |
 |-----------------------|----------|-----------|---------|---------------------------|
 | SD-1.5                | 16.938T  | 859.520M  | 7.032s  | 23.927                    |

From 7af2c44891aefe80e716eefb82ec0ff874780ab3 Mon Sep 17 00:00:00 2001
From: Wentian <94900022+WentianZhang-ML@users.noreply.github.com>
Date: Tue, 16 Apr 2024 10:55:53 +0800
Subject: [PATCH 12/14] Update docs/source/en/optimization/tgate.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/en/optimization/tgate.md | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/docs/source/en/optimization/tgate.md b/docs/source/en/optimization/tgate.md
index 6155a9e565bc..3efaf597a619 100644
--- a/docs/source/en/optimization/tgate.md
+++ b/docs/source/en/optimization/tgate.md
@@ -165,8 +165,4 @@ T-GATE also supports [`StableDiffusionPipeline`] and [PixArt-alpha/PixArt-LCM-XL
 | LCM (Pixart-Alpha)    | 8.563T   | 611.350M  | 4.733s  | 36.086                    |
 | LCM w/ TGATE          | 7.623T   | 462.585M  | 4.543s  | 37.048                    |
 
-The latency is tested on a 1080ti commercial card. 
-
-The MACs and Params are calculated by [calflops](https://github.com/MrYxJ/calculate-flops.pytorch). 
-
-The FID is calculated by [PytorchFID](https://github.com/mseitzer/pytorch-fid).
+The latency is tested on an NVIDIA 1080TI, MACs and Params are calculated with [calflops](https://github.com/MrYxJ/calculate-flops.pytorch), and the FID is calculated with [PytorchFID](https://github.com/mseitzer/pytorch-fid).

From 60a24a2d1f33ef5894a16c7469f05dc7a4b39712 Mon Sep 17 00:00:00 2001
From: Wentian <94900022+WentianZhang-ML@users.noreply.github.com>
Date: Tue, 16 Apr 2024 11:58:28 +0800
Subject: [PATCH 13/14] Update tgate.md

---
 docs/source/en/optimization/tgate.md | 101 +++++++++++++++------------
 1 file changed, 56 insertions(+), 45 deletions(-)

diff --git a/docs/source/en/optimization/tgate.md b/docs/source/en/optimization/tgate.md
index 3efaf597a619..8ef119823ecf 100644
--- a/docs/source/en/optimization/tgate.md
+++ b/docs/source/en/optimization/tgate.md
@@ -7,6 +7,7 @@ Before you begin, make sure you install T-GATE.
 ```bash
 pip install tgate
-pip install -U pytorch diffusers transformers accelerate DeepCache
+pip install -U torch diffusers transformers accelerate DeepCache
+```
 
 
 To use T-GATE with a pipeline, you need to use its corresponding loader.
@@ -16,11 +17,18 @@ To use T-GATE with a pipeline, you need to use its corresponding loader.
 | PixArt | TgatePixArtLoader |
 | Stable Diffusion XL | TgateSDXLLoader |
 | Stable Diffusion XL + DeepCache | TgateSDXLDeepCacheLoader |
+| Stable Diffusion | TgateSDLoader |
+| Stable Diffusion + DeepCache | TgateSDDeepCacheLoader |
 
-Next, create a `TgateLoader` with a pipeline, the gate step(`add brief description here`), and the number of inference steps. Then call the `tgate` method on the pipeline with a prompt, gate step, and the number of inference steps.
+Next, create a `TgateLoader` with a pipeline, the gate step (the time step at which to stop calculating the cross-attention), and the number of inference steps. Then call the `tgate` method on the pipeline with a prompt, gate step, and the number of inference steps.
 
 Let's see how to enable this for several different pipelines.
 
+<hfoptions id="pipelines">
+<hfoption id="PixArt">
+
+Accelerate `PixArtAlphaPipeline` with T-GATE:
+
 ```py
 import torch
 from diffusers import PixArtAlphaPipeline
@@ -31,18 +39,20 @@ pipe = TgatePixArtLoader(
        pipe,
        gate_step=8,
        num_inference_steps=25,
-)
-pipe = pipe.to("cuda")
+).to("cuda")
 
 image = pipe.tgate(
        "An alpaca made of colorful building blocks, cyberpunk.",
-        gate_step=gate_step,
-       num_inference_steps=inference_step,
+       gate_step=8,
+       num_inference_steps=25,
 ).images[0]
+```
+</hfoption>
+<hfoption id="Stable Diffusion XL"> 
 
-Accelerate `StableDiffusionXLPipeline` with TGATE:
+Accelerate `StableDiffusionXLPipeline` with T-GATE:
 
-```diff
+```py
 import torch
 from diffusers import StableDiffusionXLPipeline
 from diffusers import DPMSolverMultistepScheduler
@@ -53,29 +63,29 @@ pipe = StableDiffusionXLPipeline.from_pretrained(
             variant="fp16",
             use_safetensors=True,
 )
-
-+ from tgate import TgateSDXLLoader
-+ gate_step = 10
-+ inference_step = 25
-+ pipe = TgateSDXLLoader(
-+        pipe,
-+        gate_step=gate_step,
-+        num_inference_steps=inference_step,
-+ )
-
 pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
-pipe = pipe.to("cuda")
 
-+ image = pipe.tgate(
-+         "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
-+         gate_step=gate_step,
-+         num_inference_steps=inference_step
-+ ).images[0]
+from tgate import TgateSDXLLoader
+gate_step = 10
+inference_step = 25
+pipe = TgateSDXLLoader(
+       pipe,
+       gate_step=gate_step,
+       num_inference_steps=inference_step,
+).to("cuda")
+
+image = pipe.tgate(
+        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
+        gate_step=gate_step,
+        num_inference_steps=inference_step
+).images[0]
 ```
+</hfoption>
+<hfoption id="StableDiffusionXL with DeepCache">
 
-Accelerate `StableDiffusionXLPipeline` with [DeepCache](https://github.com/horseee/DeepCache) and TGATE:
+Accelerate `StableDiffusionXLPipeline` with [DeepCache](https://github.com/horseee/DeepCache) and T-GATE:
 
-```diff
+```py
 import torch
 from diffusers import StableDiffusionXLPipeline
 from diffusers import DPMSolverMultistepScheduler
@@ -86,18 +96,16 @@ pipe = StableDiffusionXLPipeline.from_pretrained(
             variant="fp16",
             use_safetensors=True,
 )
-
-+ from tgate import TgateSDXLDeepCacheLoader
-+ gate_step = 10
-+ inference_step = 25
-+ pipe = TgateSDXLDeepCacheLoader(
-+        pipe,
-+        cache_interval=3,
-+        cache_branch_id=0,
-+ )
-
 pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
-pipe = pipe.to("cuda")
+
+from tgate import TgateSDXLDeepCacheLoader
+gate_step = 10
+inference_step = 25
+pipe = TgateSDXLDeepCacheLoader(
+       pipe,
+       cache_interval=3,
+       cache_branch_id=0,
+).to("cuda")
 
 + image = pipe.tgate(
 +         "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
@@ -105,10 +113,12 @@ pipe = pipe.to("cuda")
 +         num_inference_steps=inference_step
 + ).images[0]
 ```
+</hfoption>
+<hfoption id="Latent Consistency Model">
 
-Accelerate `latent-consistency/lcm-sdxl` with TGATE:
+Accelerate `latent-consistency/lcm-sdxl` with T-GATE:
 
-```diff
+```py
 import torch
 from diffusers import StableDiffusionXLPipeline
 from diffusers import UNet2DConditionModel, LCMScheduler
@@ -135,8 +145,7 @@ pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
 +        gate_step=gate_step,
 +        num_inference_steps=inference_step,
 +        lcm=True
-+ )
-pipe = pipe.to("cuda")
++ ).to("cuda")
 
 + image = pipe.tgate(
 +         "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
@@ -144,6 +153,8 @@ pipe = pipe.to("cuda")
 +         num_inference_steps=inference_step
 + ).images[0]
 ```
+</hfoption>
+</hfoptions>
 
 T-GATE also supports [`StableDiffusionPipeline`] and [PixArt-alpha/PixArt-LCM-XL-2-1024-MS](https://hf.co/PixArt-alpha/PixArt-LCM-XL-2-1024-MS).
 
@@ -151,18 +162,18 @@ T-GATE also supports [`StableDiffusionPipeline`] and [PixArt-alpha/PixArt-LCM-XL
 | Model                 | MACs     | Param     | Latency | Zero-shot 10K-FID on MS-COCO |
 |-----------------------|----------|-----------|---------|---------------------------|
 | SD-1.5                | 16.938T  | 859.520M  | 7.032s  | 23.927                    |
-| SD-1.5 w/ TGATE       | 9.875T   | 815.557M  | 4.313s  | 20.789                    |
+| SD-1.5 w/ T-GATE       | 9.875T   | 815.557M  | 4.313s  | 20.789                    |
 | SD-2.1                | 38.041T  | 865.785M  | 16.121s | 22.609                    |
-| SD-2.1 w/ TGATE       | 22.208T  | 815.433 M | 9.878s  | 19.940                    |
+| SD-2.1 w/ T-GATE       | 22.208T  | 815.433M  | 9.878s  | 19.940                    |
 | SD-XL                 | 149.438T | 2.570B    | 53.187s | 24.628                    |
-| SD-XL w/ TGATE        | 84.438T  | 2.024B    | 27.932s | 22.738                    |
+| SD-XL w/ T-GATE        | 84.438T  | 2.024B    | 27.932s | 22.738                    |
 | Pixart-Alpha          | 107.031T | 611.350M  | 61.502s | 38.669                    |
-| Pixart-Alpha w/ TGATE | 65.318T  | 462.585M  | 37.867s | 35.825                    |
+| Pixart-Alpha w/ T-GATE | 65.318T  | 462.585M  | 37.867s | 35.825                    |
 | DeepCache (SD-XL)     | 57.888T  | -         | 19.931s | 23.755                    |
-| DeepCache w/ TGATE    | 43.868T  | -         | 14.666s | 23.999                    |
+| DeepCache w/ T-GATE    | 43.868T  | -         | 14.666s | 23.999                    |
 | LCM (SD-XL)           | 11.955T  | 2.570B    | 3.805s  | 25.044                    |
-| LCM w/ TGATE          | 11.171T  | 2.024B    | 3.533s  | 25.028                    |
+| LCM w/ T-GATE          | 11.171T  | 2.024B    | 3.533s  | 25.028                    |
 | LCM (Pixart-Alpha)    | 8.563T   | 611.350M  | 4.733s  | 36.086                    |
-| LCM w/ TGATE          | 7.623T   | 462.585M  | 4.543s  | 37.048                    |
+| LCM w/ T-GATE          | 7.623T   | 462.585M  | 4.543s  | 37.048                    |
 
 The latency is tested on an NVIDIA 1080TI, MACs and Params are calculated with [calflops](https://github.com/MrYxJ/calculate-flops.pytorch), and the FID is calculated with [PytorchFID](https://github.com/mseitzer/pytorch-fid).

From 2eaf934e31c6604760cc937f2380a29fc724402b Mon Sep 17 00:00:00 2001
From: Wentian <94900022+WentianZhang-ML@users.noreply.github.com>
Date: Wed, 17 Apr 2024 02:19:14 +0800
Subject: [PATCH 14/14] Update tgate.md

---
 docs/source/en/optimization/tgate.md | 40 ++++++++++++++--------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/docs/source/en/optimization/tgate.md b/docs/source/en/optimization/tgate.md
index 8ef119823ecf..0b536a215fc0 100644
--- a/docs/source/en/optimization/tgate.md
+++ b/docs/source/en/optimization/tgate.md
@@ -107,11 +107,11 @@ pipe = TgateSDXLDeepCacheLoader(
        cache_branch_id=0,
 ).to("cuda")
 
-+ image = pipe.tgate(
-+         "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
-+         gate_step=gate_step,
-+         num_inference_steps=inference_step
-+ ).images[0]
+image = pipe.tgate(
+        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
+        gate_step=gate_step,
+        num_inference_steps=inference_step
+).images[0]
 ```
 </hfoption>
 <hfoption id="Latent Consistency Model">
@@ -137,21 +137,21 @@ pipe = StableDiffusionXLPipeline.from_pretrained(
 )
 pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
 
-+ from tgate import TgateSDXLLoader
-+ gate_step = 1
-+ inference_step = 4
-+ pipe = TgateSDXLLoader(
-+        pipe,
-+        gate_step=gate_step,
-+        num_inference_steps=inference_step,
-+        lcm=True
-+ ).to("cuda")
-
-+ image = pipe.tgate(
-+         "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
-+         gate_step=gate_step,
-+         num_inference_steps=inference_step
-+ ).images[0]
+from tgate import TgateSDXLLoader
+gate_step = 1
+inference_step = 4
+pipe = TgateSDXLLoader(
+       pipe,
+       gate_step=gate_step,
+       num_inference_steps=inference_step,
+       lcm=True
+).to("cuda")
+
+image = pipe.tgate(
+        "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
+        gate_step=gate_step,
+        num_inference_steps=inference_step
+).images[0]
 ```
 </hfoption>
 </hfoptions>