@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.
 import copy
+import io
 import random
 import unittest
 
@@ -14,6 +15,7 @@
 import torch.nn.functional as F
 from float8_experimental.float8_dynamic_linear import Float8DynamicLinear
 from float8_experimental.float8_linear_utils import swap_linear_with_float8_linear
+from float8_experimental.float8_tensor import Float8Tensor
 from float8_experimental.float8_utils import compute_error
 
 
@@ -123,5 +125,80 @@ def test_static_fp8_mlp(self, compile_backend, dtype):
 )
 
 
+class TestFP8TrainToFP8:
+    def train(self, model: nn.Module, dtype: torch.dtype):
+        model.train()
+        optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
+        criterion = nn.MSELoss()
+        target_tensor = torch.randn(4, 1024, 4096, device="cuda", dtype=dtype)
+        for _ in range(10):
+            input_tensor = torch.randn(4, 1024, 4096, device="cuda", dtype=dtype)
+            optimizer.zero_grad()
+            output = model(input_tensor)
+            loss = criterion(output, target_tensor)
+            loss.backward()
+            optimizer.step()
+        model.eval()
+        return model
+
+    @pytest.mark.parametrize("compile_backend", ["eager", "inductor"])
+    @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32])
+    @unittest.skipIf(
+        not torch.cuda.is_available() or not is_H100,
+        "CUDA not available or on non H100 machine",
+    )
+    def test_fp8_save_and_load(self, compile_backend: str, dtype: torch.dtype):
+        # Initialize the FP8 model
+        fp8_mlp = FeedForward().to("cuda", dtype=torch.float32)
+        fp8_mlp.reset_parameters()
+        swap_linear_with_float8_linear(
+            fp8_mlp,
+            Float8DynamicLinear,
+        )
+
+        # Train the model
+        self.train(fp8_mlp, dtype)
+
+        # Generate an input tensor and the original output
+        input_tensor = torch.randn(4, 1024, 4096, device="cuda", dtype=dtype)
+        og_out = fp8_mlp(input_tensor)
+
+        # Save the model state dict
+        buffer = io.BytesIO()
+        torch.save(fp8_mlp.state_dict(), buffer)
+
+        # Reset the buffer position to the beginning
+        buffer.seek(0)
+
+        # Later on, load the model; it is rebuilt with Float8DynamicLinear on the meta device
+        with torch.device("meta"):
+            new_fp8_mlp = FeedForward().to(dtype=dtype)
+            swap_linear_with_float8_linear(
+                new_fp8_mlp,
+                Float8DynamicLinear,
+            )
+
+        # Load the actual data
+        new_fp8_mlp.load_state_dict(torch.load(buffer), strict=True, assign=True)
+
+        # Dynamic activations + quantized weights
+        def quantize_dynamic_linear(x: nn.Module):
+            if isinstance(x, Float8DynamicLinear):
+                x.set_quantization_scales(True)
+            return x
+
+        new_fp8_mlp.apply(quantize_dynamic_linear)
+
+        for module in new_fp8_mlp.modules():
+            if isinstance(module, Float8DynamicLinear):
+                assert isinstance(module.weight, Float8Tensor)
+                assert module.weight.requires_grad is False
+
+        new_out = new_fp8_mlp(input_tensor)
+
+        # Assert exact equality
+        assert torch.all(og_out == new_out).item()
+
+
 if __name__ == "__main__":
     pytest.main([__file__])
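For context outside the test harness, a minimal sketch of the same save/load path an inference caller might use, assuming the APIs exercised above (swap_linear_with_float8_linear, load_state_dict with assign=True, set_quantization_scales); load_fp8_checkpoint, build_model, and checkpoint_path are illustrative names, not part of this PR.

import torch
from float8_experimental.float8_dynamic_linear import Float8DynamicLinear
from float8_experimental.float8_linear_utils import swap_linear_with_float8_linear


def load_fp8_checkpoint(build_model, checkpoint_path):
    # Build the module tree on the meta device so no real weights are allocated,
    # then swap nn.Linear layers for Float8DynamicLinear to match the saved layout.
    with torch.device("meta"):
        model = build_model()
        swap_linear_with_float8_linear(model, Float8DynamicLinear)

    # assign=True attaches the loaded tensors directly in place of the meta parameters.
    model.load_state_dict(torch.load(checkpoint_path), strict=True, assign=True)

    # Keep the weights quantized for inference, as the test does via Module.apply.
    for module in model.modules():
        if isinstance(module, Float8DynamicLinear):
            module.set_quantization_scales(True)

    return model.eval()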