@@ -47,170 +47,6 @@ def reset_parameters(self):
                m.reset_parameters()


-class TestHPTrainToFP8:
-    def base_test_mlp_transform(self, base_mlp, quantized_mlp, input_tensor):
-        with torch.no_grad():
-            base_output = base_mlp(input_tensor)
-            transformed_output = quantized_mlp(input_tensor)
-
-        # Compute and check SQNR
-        sqnr = compute_error(base_output, transformed_output)
-        assert sqnr.item() > 20, f"SQNR is too low: {sqnr.item()} dB"
-
-    @pytest.mark.parametrize("compile_backend", ["eager", "inductor"])
-    @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32])
-    @unittest.skipIf(
-        not torch.cuda.is_available() or not is_H100,
-        "CUDA not available or on non H100 machine",
-    )
-    def test_dynamic_fp8_mlp(self, compile_backend, dtype):
-        original_mlp = FeedForward().to("cuda", dtype=dtype)
-        original_mlp.reset_parameters()
-
-        dynamic_fp8_mlp = copy.deepcopy(original_mlp)
-        swap_linear_with_float8_linear(
-            dynamic_fp8_mlp,
-            Float8DynamicLinear,
-            from_float_kwargs={"pre_quantize_weight": True},
-        )
-
-        batch_size = 4
-        num_tokens = 1024
-        embedding_dim = 4096
-
-        input_tensor = torch.randn(
-            batch_size, num_tokens, embedding_dim, device="cuda", dtype=dtype
-        )
-
-        # Compile the models
-        compiled_original_mlp = torch.compile(original_mlp, backend=compile_backend)
-        compiled_dynamic_fp8_mlp = torch.compile(
-            dynamic_fp8_mlp, backend=compile_backend
-        )
-
-        self.base_test_mlp_transform(
-            compiled_original_mlp, compiled_dynamic_fp8_mlp, input_tensor
-        )
-
-    @pytest.mark.parametrize("compile_backend", ["eager", "inductor"])
-    @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32])
-    @unittest.skipIf(
-        not torch.cuda.is_available() or not is_H100,
-        "CUDA not available or on non H100 machine",
-    )
-    def test_static_fp8_mlp(self, compile_backend, dtype):
-        original_mlp = FeedForward().to("cuda", dtype=dtype)
-        original_mlp.reset_parameters()
-
-        static_fp8_mlp = copy.deepcopy(original_mlp)
-        swap_linear_with_float8_linear(
-            static_fp8_mlp,
-            Float8DynamicLinear,
-            from_float_kwargs={
-                "pre_quantize_weight": True,
-                "activation_scale": torch.tensor(
-                    [1.0], device="cuda", dtype=torch.float32
-                ),
-            },
-        )
-
-        batch_size = 4
-        num_tokens = 1024
-        embedding_dim = 4096
-
-        input_tensor = torch.randn(
-            batch_size, num_tokens, embedding_dim, device="cuda", dtype=dtype
-        )
-
-        # Compile the models
-        compiled_original_mlp = torch.compile(original_mlp, backend=compile_backend)
-        compiled_static_fp8_mlp = torch.compile(static_fp8_mlp, backend=compile_backend)
-
-        self.base_test_mlp_transform(
-            compiled_original_mlp, compiled_static_fp8_mlp, input_tensor
-        )
-
-
-class TestFP8TrainToFP8:
-    def train(self, model: nn.Module, dtype: torch.dtype):
-        model.train()
-        optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
-        criterion = nn.MSELoss()
-        target_tensor = torch.randn(4, 1024, 4096, device="cuda", dtype=dtype)
-        for _ in range(10):
-            input_tensor = torch.randn(4, 1024, 4096, device="cuda", dtype=dtype)
-            optimizer.zero_grad()
-            output = model(input_tensor)
-            loss = criterion(output, target_tensor)
-            loss.backward()
-            optimizer.step()
-        model.eval()
-        return model
-
-    @pytest.mark.parametrize("compile_backend", ["eager", "inductor"])
-    @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32])
-    @unittest.skipIf(
-        not torch.cuda.is_available() or not is_H100,
-        "CUDA not available or on non H100 machine",
-    )
-    def test_fp8_save_and_load(self, compile_backend: str, dtype: torch.dtype):
-        # Initialize FP8 model
-        fp8_mlp = FeedForward().to("cuda", dtype=torch.float32)
-        fp8_mlp.reset_parameters()
-        swap_linear_with_float8_linear(
-            fp8_mlp,
-            Float8DynamicLinear,
-        )
-
-        # Train the model
-        self.train(fp8_mlp, dtype)
-
-        # Generate input tensor and the original output
-        input_tensor = torch.randn(4, 1024, 4096, device="cuda", dtype=dtype)
-        og_out = fp8_mlp(input_tensor)
-
-        # Save model state dict
-        buffer = io.BytesIO()
-        torch.save(fp8_mlp.state_dict(), buffer)
-
-        # Reset buffer position to the beginning
-        buffer.seek(0)
-
-        # Later on, you load the model; it will have Float8DynamicLinear modules on the meta device
-        with torch.device("meta"):
-            new_fp8_mlp = FeedForward().to(dtype=dtype)
-            swap_linear_with_float8_linear(
-                new_fp8_mlp,
-                Float8DynamicLinear,
-            )
-
-        # Load the actual data
-        new_fp8_mlp.load_state_dict(
-            torch.load(buffer, weights_only=True), strict=True, assign=True
-        )
-
-        # Dynamic Activations + Quantized Weights
-        def quantize_dynamic_linear(x: nn.Module):
-            if isinstance(x, Float8DynamicLinear):
-                x.set_quantization_scales(pre_quantize_weight=True)
-            return x
-
-        new_fp8_mlp.apply(quantize_dynamic_linear)
-
-        for module in new_fp8_mlp.modules():
-            if isinstance(module, Float8DynamicLinear):
-                assert isinstance(module.weight, Float8Tensor)
-                assert module.weight.requires_grad is False
-
-        new_out = new_fp8_mlp(input_tensor)
-
-        # Assert exact equality
-        assert torch.all(og_out == new_out).item()
-
-
-# WE ARE GOING TO KEEP ONE OR THE OTHER; BELOW IS THE SEPARATE MODULE WORKFLOW
-
-
class TestHPTrainToFP8LinearInference:
    def base_test_mlp_transform(self, base_mlp, quantized_mlp, input_tensor):
        with torch.no_grad():