|
40 | 40 | # Check if GPU is available, and if not, use the CPU
|
41 | 41 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
42 | 42 |
|
| 43 | +###################################################################### |
43 | 44 | # Loading CIFAR-10
|
44 | 45 | # ----------------
|
45 |
| - |
46 |
| -###################################################################### |
47 | 46 | # CIFAR-10 is a popular image dataset with ten classes. Our objective is to predict one of the following classes for each input image.
|
48 | 47 | #
|
49 | 48 | # .. figure:: /../_static/img/cifar10.png
|
|
78 | 77 | test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transforms_cifar)
|
79 | 78 |
|
80 | 79 | ########################################################################
|
81 |
| -# .. note:: This section is for CPU users only who are interested in quick results. |
82 |
| -# Use this option only if you're interested in a small scale experiment. |
83 |
| -# Keep in mind the code should run fairly quickly using any GPU. Select only the first ``num_images_to_keep`` images from the train/test dataset |
| 80 | +# .. note:: This section is for CPU users only who are interested in quick results. Use this option only if you're interested in a small scale experiment. Keep in mind the code should run fairly quickly using any GPU. Select only the first ``num_images_to_keep`` images from the train/test dataset |
84 | 81 | #
|
85 | 82 | # .. code-block:: python
|
86 | 83 | #
|
|
93 | 90 | train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)
|
94 | 91 | test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=2)
|
95 | 92 |
|
| 93 | +###################################################################### |
96 | 94 | # Defining model classes and utility functions
|
97 | 95 | # --------------------------------------------
|
98 |
| - |
99 |
| -###################################################################### |
100 | 96 | # Next, we need to define our model classes. Several user-defined parameters need to be set here. We use two different architectures, keeping the number of filters fixed across our experiments to ensure fair comparisons.
|
101 | 97 | # Both architectures are Convolutional Neural Networks (CNNs) with a different number of convolutional layers that serve as feature extractors, followed by a classifier with 10 classes.
|
102 | 98 | # The number of filters and neurons is smaller for the students.
|
@@ -189,7 +185,7 @@ def train(model, train_loader, epochs, learning_rate, device):
|
189 | 185 | optimizer.zero_grad()
|
190 | 186 | outputs = model(inputs)
|
191 | 187 |
|
192 |
| - # outputs: Output of the network for the collection of images. A vector of dimensionality batch_size |
| 188 | + # outputs: Output of the network for the collection of images. A tensor of dimensionality batch_size x num_classes |
193 | 189 | # labels: The actual labels of the images. Vector of dimensionality batch_size
|
194 | 190 | loss = criterion(outputs, labels)
|
195 | 191 | loss.backward()
|
@@ -220,10 +216,9 @@ def test(model, test_loader, device):
|
220 | 216 | print(f"Test Accuracy: {accuracy:.2f}%")
|
221 | 217 | return accuracy
|
222 | 218 |
|
| 219 | +###################################################################### |
223 | 220 | # Cross-entropy runs
|
224 | 221 | # ------------------
|
225 |
| - |
226 |
| -###################################################################### |
227 | 222 | # For reproducibility, we need to set the torch manual seed. We train networks using different methods, so to compare them fairly,
|
228 | 223 | # it makes sense to initialize the networks with the same weights.
|
229 | 224 | # Start by training the teacher network using cross-entropy:
|
@@ -273,10 +268,9 @@ def test(model, test_loader, device):
|
273 | 268 | print(f"Teacher accuracy: {test_accuracy_deep:.2f}%")
|
274 | 269 | print(f"Student accuracy: {test_accuracy_light_ce:.2f}%")
|
275 | 270 |
|
| 271 | +###################################################################### |
276 | 272 | # Knowledge distillation run
|
277 | 273 | # --------------------------
|
278 |
| - |
279 |
| -###################################################################### |
280 | 274 | # Now let's try to improve the test accuracy of the student network by incorporating the teacher.
|
281 | 275 | # Knowledge distillation is a straightforward technique to achieve this,
|
282 | 276 | # based on the fact that both networks output a probability distribution over our classes.
|
@@ -354,10 +348,9 @@ def train_knowledge_distillation(teacher, student, train_loader, epochs, learnin
|
354 | 348 | print(f"Student accuracy without teacher: {test_accuracy_light_ce:.2f}%")
|
355 | 349 | print(f"Student accuracy with CE + KD: {test_accuracy_light_ce_and_kd:.2f}%")
|
356 | 350 |
|
| 351 | +###################################################################### |
357 | 352 | # Cosine loss minimization run
|
358 | 353 | # ----------------------------
|
359 |
| - |
360 |
| -###################################################################### |
361 | 354 | # Feel free to play around with the temperature parameter that controls the softness of the softmax function and the loss coefficients.
|
362 | 355 | # In neural networks, it is easy to include additional loss functions alongside the main objective to achieve goals like better generalization.
|
363 | 356 | # Let's try including an objective for the student, but now let's focus on their hidden states rather than their output layers. In the previous example,
|
@@ -567,10 +560,9 @@ def test_multiple_outputs(model, test_loader, device):
|
567 | 560 | train_cosine_loss(teacher=modified_nn_deep, student=modified_light_nn, train_loader=train_loader, epochs=10, learning_rate=0.001, hidden_rep_loss_weight=0.25, ce_loss_weight=0.75, device=device)
|
568 | 561 | test_accuracy_light_ce_and_cosine_loss = test_multiple_outputs(modified_light_nn, test_loader, device)
|
569 | 562 |
|
| 563 | +###################################################################### |
570 | 564 | # Intermediate regressor run
|
571 | 565 | # --------------------------
|
572 |
| - |
573 |
| -###################################################################### |
574 | 566 | # Our naive minimization does not guarantee better results for several reasons, one being the dimensionality of the vectors.
|
575 | 567 | # Cosine similarity generally works better than Euclidean distance for vectors of higher dimensionality,
|
576 | 568 | # but we were dealing with vectors with 1024 components each, so it is much harder to extract meaningful similarities.
|
@@ -734,10 +726,9 @@ def train_mse_loss(teacher, student, train_loader, epochs, learning_rate, featur
|
734 | 726 | print(f"Student accuracy with CE + CosineLoss: {test_accuracy_light_ce_and_cosine_loss:.2f}%")
|
735 | 727 | print(f"Student accuracy with CE + RegressorMSE: {test_accuracy_light_ce_and_mse_loss:.2f}%")
|
736 | 728 |
|
| 729 | +###################################################################### |
737 | 730 | # Conclusions
|
738 | 731 | # --------------------------------------------
|
739 |
| - |
740 |
| -###################################################################### |
741 | 732 | # None of the methods above increases the number of parameters for the network or inference time,
|
742 | 733 | # so the performance increase comes at the small cost of calculating gradients during training.
|
743 | 734 | # In ML applications, we mostly care about inference time because training happens before the model deployment.
|
|
0 commit comments