Commit 33aeafa

Bill Yang authored and facebook-github-bot committed
add activation on last logic (#2924)
Summary:
Pull Request resolved: #2924

# context

Adds an `activation_on_last` flag for parity, making the module more configurable. `activation_on_last` toggles whether the given (or default) activation function is applied to the last layer of the MLP. Typically, all layers of the MLP have the activation function applied; if the flag is set to False, the last layer will not, so users can optionally consume the raw MLP output for their own customized needs.

Reviewed By: TroyGarden

Differential Revision: D73691616

fbshipit-source-id: 87a720e9b7e10f2bbb478b68a562a1cd90a36199
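As a rough usage sketch of the new flag (the `MLP` constructor arguments other than `activation_on_last` follow the existing signature shown in the diff below; the sizes here are illustrative):

import torch
from torchrec.modules.mlp import MLP

# Default behavior: the activation (ReLU here) is applied to every layer,
# including the last one.
mlp = MLP(in_size=16, layer_sizes=[32, 8], activation=torch.relu)

# With activation_on_last=False, the final layer's output is returned raw,
# e.g. to feed a custom head or a loss that expects pre-activation values.
mlp_raw = MLP(in_size=16, layer_sizes=[32, 8], activation_on_last=False)

out = mlp_raw(torch.randn(4, 16))  # shape (4, 8); no activation on the last layer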
1 parent aa82c8e · commit 33aeafa

1 file changed: 11 additions & 2 deletions


torchrec/modules/mlp.py

Lines changed: 11 additions & 2 deletions
@@ -128,6 +128,7 @@ def __init__(
         ] = torch.relu,
         device: Optional[torch.device] = None,
         dtype: torch.dtype = torch.float32,
+        activation_on_last: bool = True,
     ) -> None:
         super().__init__()

@@ -143,7 +144,11 @@ def __init__(
                     layer_sizes[i - 1] if i > 0 else in_size,
                     layer_sizes[i],
                     bias=bias,
-                    activation=extract_module_or_tensor_callable(activation),
+                    activation=(
+                        torch.nn.Identity()
+                        if not activation_on_last and i == len(layer_sizes) - 1
+                        else extract_module_or_tensor_callable(activation)
+                    ),
                     device=device,
                     dtype=dtype,
                 )
@@ -158,7 +163,11 @@ def __init__(
                     layer_sizes[i - 1] if i > 0 else in_size,
                     layer_sizes[i],
                     bias=bias,
-                    activation=SwishLayerNorm(layer_sizes[i], device=device),
+                    activation=(
+                        torch.nn.Identity()
+                        if not activation_on_last and i == len(layer_sizes) - 1
+                        else SwishLayerNorm(layer_sizes[i], device=device)
+                    ),
                     device=device,
                 )
                 for i in range(len(layer_sizes))
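Both hunks apply the same selection pattern. As a standalone sketch of that logic (plain ReLU stands in here for the module's actual activation handling), the conditional picks Identity only for the final layer when the flag is off:

import torch

layer_sizes = [32, 16, 8]
activation_on_last = False

activations = [
    torch.nn.Identity()
    if not activation_on_last and i == len(layer_sizes) - 1
    else torch.nn.ReLU()
    for i in range(len(layer_sizes))
]
# activations == [ReLU(), ReLU(), Identity()]: only the last layer skips ReLU.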
