@@ -14,6 +14,8 @@
 import fire

 import torch
+import torch.nn as nn
+import torch.nn.functional as F
 from float8_experimental.float8_dynamic_linear import Float8DynamicLinear
 from float8_experimental.float8_linear import Float8Linear
 from float8_experimental.float8_linear_utils import (
@@ -38,6 +40,105 @@ def forward(self, x):
         return x


+# copied from https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/norms.py
+class RMSNorm(nn.Module):
+    """
+    Initialize the RMSNorm normalization layer.
+
+    Args:
+        dim (int): The dimension of the input tensor.
+        eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
+
+    Attributes:
+        eps (float): A small value added to the denominator for numerical stability.
+        weight (nn.Parameter): Learnable scaling parameter.
+
+    """
+
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+
+    def _norm(self, x: torch.Tensor):
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+    def forward(self, x: torch.Tensor):
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
+
+    def reset_parameters(self):
+        torch.nn.init.ones_(self.weight)  # type: ignore
+
+
+# copied from https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama/model.py
+class FeedForward(nn.Module):
+    """
+    FeedForward module
+
+    Args:
+        dim (int): Input dimension.
+        hidden_dim (int): Hidden dimension of the feedforward layer.
+        multiple_of (int): Value to ensure hidden dimension is a multiple of this value.
+        ffn_dim_multiplier (Optional[float]): Custom multiplier for hidden dimension. Defaults to None.
+
+    Attributes:
+        w1 (Linear): Linear transformation for the first layer.
+        w2 (Linear): Linear transformation for the second layer.
+        w3 (Linear): Linear transformation for the third layer.
+
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        hidden_dim: int,
+        multiple_of: int,
+        ffn_dim_multiplier: Optional[float],
+    ):
+        super().__init__()
+        hidden_dim = int(2 * hidden_dim / 3)
+        # custom dim factor multiplier
+        if ffn_dim_multiplier is not None:
+            hidden_dim = int(ffn_dim_multiplier * hidden_dim)
+        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+
+        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
+
+    def forward(self, x):
+        return self.w2(F.silu(self.w1(x)) * self.w3(x))
+
+    def init_weights(self, init_std: float):
+        nn.init.trunc_normal_(self.w1.weight, mean=0.0, std=0.02)
+        for linear in (self.w2, self.w3):
+            nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std)
+
+
+class NormFFNResidualNorm(nn.Module):
+    """
+    A fragment representing the end of TransformerBlock n and the start
+    of TransformerBlock n + 1, intended to include the fusions relevant
+    to float8 gemms in the FFN module in forward and backward.
+    """
+
+    def __init__(self, dim, hidden_dim, multiple_of, ffn_dim_multiplier):
+        super().__init__()
+        self.ffn_norm = RMSNorm(dim)
+        self.ffn = FeedForward(dim, hidden_dim, multiple_of, ffn_dim_multiplier)
+        self.attn_norm = RMSNorm(dim)
+
+    def forward(self, h):
+        # end of transformer block n
+        x = self.ffn_norm(h)
+        x = self.ffn(x)
+        x = h + x
+        # start of transformer block n + 1
+        x = self.attn_norm(x)
+        return x
+
+
 @dataclass
 class ProfileConfig:
     file_path: Optional[str] = None
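As a reader's aid (not part of the change itself): the hidden-dimension rounding in `FeedForward.__init__` above, evaluated for the configuration the `norm_ffn_norm` branch of `main()` uses further down (`hidden_dim=16384`, `multiple_of=1024`, `ffn_dim_multiplier=1.3`), works out as follows.

```python
# Worked check of FeedForward.__init__'s hidden-dim rounding for the values
# used by the "norm_ffn_norm" branch of main() in this diff.
hidden_dim, multiple_of, ffn_dim_multiplier = 16384, 1024, 1.3
hidden_dim = int(2 * hidden_dim / 3)                # 10922
hidden_dim = int(ffn_dim_multiplier * hidden_dim)   # 14198
hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
print(hidden_dim)  # 14336: w1/w3 are 4096 -> 14336 gemms, w2 is 14336 -> 4096
```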
@@ -93,40 +194,46 @@ def profile_function(
     return prof


-@dataclass(frozen=True)
-class ModelParams:
-    M: int
-    K: int
-    N: int
-    ref_dtype: torch.dtype
-    layer_norm: bool = True
-
-
 def main(
     profile_path_prefix: Path,
     compile: bool = True,
     linear_type: str = "dynamic",
-    use_layer_norm: bool = False,
+    model_type: str = "linear",
 ):
-    params = ModelParams(
-        M=4 * 4096,
-        K=8192,
-        N=7168,
-        ref_dtype=torch.bfloat16,
-        layer_norm=use_layer_norm,
-    )
+    assert model_type in ("linear", "ln_linear", "norm_ffn_norm"), "unsupported"
+
     print(f"Compile is set to | {compile}")
     print(f"Using Linear type: | {linear_type}")
-    print(f"Use layer norm is set to | {params.layer_norm}")
+    print(f"model_type is set to | {model_type}")

     device = "cuda"
-    if params.layer_norm:
-        m_ref = LNLinear(params.K, params.N)
+    ref_dtype = torch.bfloat16
+    if model_type == "ln_linear":
+        M, K, N = 4 * 4096, 8192, 7168
+        m_ref = LNLinear(K, N)
+        input_tensor = torch.randn(
+            M, K, device=device, dtype=ref_dtype, requires_grad=True
+        )
+    elif model_type == "norm_ffn_norm":
+        m_ref = NormFFNResidualNorm(
+            dim=4096,
+            hidden_dim=16384,
+            multiple_of=1024,
+            ffn_dim_multiplier=1.3,
+        )
+        input_tensor = torch.randn(
+            1, 8192, 4096, device=device, dtype=ref_dtype
+        ).requires_grad_()
     else:
+        M, K, N = 4 * 4096, 8192, 7168
         m_ref = torch.nn.Sequential(
-            torch.nn.Linear(params.K, params.N, bias=False),
+            torch.nn.Linear(K, N, bias=False),
         )
-    m_ref = m_ref.to(device).to(params.ref_dtype)
+        input_tensor = torch.randn(
+            M, K, device=device, dtype=ref_dtype, requires_grad=True
+        )
+
+    m_ref = m_ref.to(device).to(ref_dtype)

     linear_type = LinearType[linear_type.upper()]
     linear_cls = (
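For orientation, here is a minimal standalone sketch (not part of the diff) of what the new `norm_ffn_norm` reference path computes, assuming the `RMSNorm`, `FeedForward`, and `NormFFNResidualNorm` classes above are in scope. The script itself builds this module on CUDA in bfloat16 and then profiles a float8-swapped copy; the sketch stays on CPU in float32 so it can run anywhere (slowly, at these shapes).

```python
import torch

# Mirrors the "norm_ffn_norm" branch of main() above, on CPU for simplicity.
m = NormFFNResidualNorm(dim=4096, hidden_dim=16384, multiple_of=1024, ffn_dim_multiplier=1.3)
x = torch.randn(1, 8192, 4096, requires_grad=True)
out = m(x)
assert out.shape == x.shape  # the fragment preserves the (batch, seq, dim) shape
out.sum().backward()         # the same forward + backward the profiler measures
```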
@@ -136,10 +243,6 @@ def main(
     m_float8 = copy.deepcopy(m_ref)
     swap_linear_with_float8_linear(m_float8, linear_cls)

-    input_tensor = torch.randn(
-        params.M, params.K, device="cuda", dtype=params.ref_dtype, requires_grad=True
-    )
-
     def ref_forw_backward(x):
         out = m_ref(x)
         out.sum().backward()
@@ -173,14 +276,14 @@ def float8_forw_backward_wrapper(x):
     float8_forw_backward_wrapper(input_tensor)

     # Profile Reference Model
-    ref_suffix = f"_ref_compile_{compile}.json"
+    ref_suffix = f"_{model_type}_ref_compile_{compile}.json"
     profile_config = ProfileConfig(
         profile_path_prefix + ref_suffix, ref_suffix, iters=5, warmup_iters=5, sync=True
     )
     profile_function(profile_config, ref_forw_backward, input_tensor)

     # Profile Float8 Model
-    float8_suffix = f"_float8_compile_{compile}_{linear_type}.json"
+    float8_suffix = f"_{model_type}_float8_compile_{compile}_{linear_type}.json"
     profile_config = ProfileConfig(
         profile_path_prefix + float8_suffix,
         float8_suffix,
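With these changes, the profiler can target any of the three model types, and the model type is encoded in the trace file names. Below is a sketch of driving the updated `main()` directly from Python; the script also imports `fire`, so it presumably exposes the same arguments on the command line, but that wiring is not shown in this diff, and the output prefix here is only illustrative.

```python
# Hypothetical invocation of the updated main(); traces are written next to the
# prefix as <prefix>_<model_type>_ref_compile_<compile>.json and
# <prefix>_<model_type>_float8_compile_<compile>_<linear_type>.json.
main(
    profile_path_prefix="./profiles/run1",  # illustrative prefix, not from the diff
    compile=True,
    linear_type="dynamic",
    model_type="norm_ffn_norm",             # new option introduced by this change
)
```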