Commit bf79878

review fixes
1 parent abcdd60 commit bf79878


beginner_source/knowledge_distillation_tutorial.py

Lines changed: 9 additions & 18 deletions
@@ -40,10 +40,9 @@
 # Check if GPU is available, and if not, use the CPU
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

+######################################################################
 # Loading CIFAR-10
 # ----------------
-
-######################################################################
 # CIFAR-10 is a popular image dataset with ten classes. Our objective is to predict one of the following classes for each input image.
 #
 # .. figure:: /../_static/img/cifar10.png
@@ -78,9 +77,7 @@
 test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transforms_cifar)

 ########################################################################
-# .. note:: This section is for CPU users only who are interested in quick results.
-# Use this option only if you're interested in a small scale experiment.
-# Keep in mind the code should run fairly quickly using any GPU. Select only the first ``num_images_to_keep`` images from the train/test dataset
+# .. note:: This section is for CPU users only who are interested in quick results. Use this option only if you're interested in a small scale experiment. Keep in mind the code should run fairly quickly using any GPU. Select only the first ``num_images_to_keep`` images from the train/test dataset
 #
 # .. code-block:: python
 #
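
The body of the ``.. code-block:: python`` directive falls outside this hunk, so the subsetting code itself is not shown. As a rough illustration only, a subset of this kind could be built with torch.utils.data.Subset; the value of num_images_to_keep below is an assumption, not taken from the tutorial.

# Hypothetical sketch (not part of this diff): keep only the first
# num_images_to_keep images for a quick, CPU-sized experiment.
from torch.utils.data import Subset

num_images_to_keep = 2000  # assumed value
train_dataset = Subset(train_dataset, range(min(num_images_to_keep, len(train_dataset))))
test_dataset = Subset(test_dataset, range(min(num_images_to_keep, len(test_dataset))))
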
@@ -93,10 +90,9 @@
 train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)
 test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=2)

+######################################################################
 # Defining model classes and utility functions
 # --------------------------------------------
-
-######################################################################
 # Next, we need to define our model classes. Several user-defined parameters need to be set here. We use two different architectures, keeping the number of filters fixed across our experiments to ensure fair comparisons.
 # Both architectures are Convolutional Neural Networks (CNNs) with a different number of convolutional layers that serve as feature extractors, followed by a classifier with 10 classes.
 # The number of filters and neurons is smaller for the students.
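
For readers skimming the diff, here is a minimal sketch of the kind of student CNN the comment describes: a small convolutional feature extractor followed by a 10-class classifier. The class name and layer sizes are illustrative assumptions, not the tutorial's actual definitions.

import torch.nn as nn

class TinyStudentCNN(nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()
        # Small convolutional feature extractor (assumed sizes)
        self.features = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(16, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )
        # Fully connected classifier over the 10 CIFAR-10 classes
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(16 * 8 * 8, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        return self.classifier(self.features(x))
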
@@ -189,7 +185,7 @@ def train(model, train_loader, epochs, learning_rate, device):
 optimizer.zero_grad()
 outputs = model(inputs)

-# outputs: Output of the network for the collection of images. A vector of dimensionality batch_size
+# outputs: Output of the network for the collection of images. A tensor of dimensionality batch_size x num_classes
 # labels: The actual labels of the images. Vector of dimensionality batch_size
 loss = criterion(outputs, labels)
 loss.backward()
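
The corrected comment describes the shape convention expected by nn.CrossEntropyLoss. A standalone sketch of that convention, using dummy tensors rather than the tutorial's data:

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
outputs = torch.randn(128, 10)           # batch_size x num_classes: raw logits
labels = torch.randint(0, 10, (128,))    # batch_size: integer class indices
loss = criterion(outputs, labels)        # scalar training loss
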
@@ -220,10 +216,9 @@ def test(model, test_loader, device):
 print(f"Test Accuracy: {accuracy:.2f}%")
 return accuracy

+######################################################################
 # Cross-entropy runs
 # ------------------
-
-######################################################################
 # For reproducibility, we need to set the torch manual seed. We train networks using different methods, so to compare them fairly,
 # it makes sense to initialize the networks with the same weights.
 # Start by training the teacher network using cross-entropy:
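
A sketch of the seeding pattern the comment refers to. The class names DeepNN and LightNN, the num_classes argument, and the seed value are assumptions here, not taken from this diff.

import torch

# Re-seeding before constructing each network makes the initial weights
# reproducible across the different training methods being compared.
torch.manual_seed(42)
nn_deep = DeepNN(num_classes=10).to(device)

torch.manual_seed(42)
nn_light = LightNN(num_classes=10).to(device)
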
@@ -273,10 +268,9 @@ def test(model, test_loader, device):
 print(f"Teacher accuracy: {test_accuracy_deep:.2f}%")
 print(f"Student accuracy: {test_accuracy_light_ce:.2f}%")

+######################################################################
 # Knowledge distillation run
 # --------------------------
-
-######################################################################
 # Now let's try to improve the test accuracy of the student network by incorporating the teacher.
 # Knowledge distillation is a straightforward technique to achieve this,
 # based on the fact that both networks output a probability distribution over our classes.
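
As a generic illustration of the distillation objective described above, here is a hedged sketch of a combined loss; it is not the tutorial's train_knowledge_distillation implementation, and the temperature and weights are assumed values.

import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, labels, T=2.0,
                      soft_target_weight=0.25, ce_weight=0.75):
    # Soften both output distributions with temperature T and penalize their
    # KL divergence; the T**2 factor keeps gradient magnitudes comparable.
    soft_targets = F.softmax(teacher_logits / T, dim=-1)
    soft_prob = F.log_softmax(student_logits / T, dim=-1)
    kd_term = F.kl_div(soft_prob, soft_targets, reduction="batchmean") * (T ** 2)
    # The usual cross-entropy against the true labels is kept alongside it.
    ce_term = F.cross_entropy(student_logits, labels)
    return soft_target_weight * kd_term + ce_weight * ce_term
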
@@ -354,10 +348,9 @@ def train_knowledge_distillation(teacher, student, train_loader, epochs, learnin
 print(f"Student accuracy without teacher: {test_accuracy_light_ce:.2f}%")
 print(f"Student accuracy with CE + KD: {test_accuracy_light_ce_and_kd:.2f}%")

+######################################################################
 # Cosine loss minimization run
 # ----------------------------
-
-######################################################################
 # Feel free to play around with the temperature parameter that controls the softness of the softmax function and the loss coefficients.
 # In neural networks, it is easy to include additional loss functions to the main objectives to achieve goals like better generalization.
 # Let's try including an objective for the student, but now let's focus on their hidden states rather than their output layers. In the previous example,
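
A hedged sketch of a hidden-state objective of the kind described above, using nn.CosineEmbeddingLoss on flattened representations. This is illustrative only; the tutorial's train_cosine_loss differs in its details.

import torch
import torch.nn as nn

cosine_loss = nn.CosineEmbeddingLoss()

def hidden_rep_loss(student_hidden, teacher_hidden):
    # Flatten both hidden representations to (batch_size, features); a target
    # of ones asks the loss to drive their cosine similarity toward 1.
    s = torch.flatten(student_hidden, start_dim=1)
    t = torch.flatten(teacher_hidden, start_dim=1)
    target = torch.ones(s.size(0), device=s.device)
    return cosine_loss(s, t, target)
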
@@ -567,10 +560,9 @@ def test_multiple_outputs(model, test_loader, device):
 train_cosine_loss(teacher=modified_nn_deep, student=modified_light_nn, train_loader=train_loader, epochs=10, learning_rate=0.001, hidden_rep_loss_weight=0.25, ce_loss_weight=0.75, device=device)
 test_accuracy_light_ce_and_cosine_loss = test_multiple_outputs(modified_light_nn, test_loader, device)

+######################################################################
 # Intermediate regressor run
 # --------------------------
-
-######################################################################
 # Our naive minimization does not guarantee better results for several reasons, one being the dimensionality of the vectors.
 # Cosine similarity generally works better than Euclidean distance for vectors of higher dimensionality,
 # but we were dealing with vectors with 1024 components each, so it is much harder to extract meaningful similarities.
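
A rough sketch of the regressor idea introduced here: a small trainable layer maps the student's feature map to the teacher's dimensionality so that an MSE loss can compare them directly. The channel counts and shapes below are assumptions, not the tutorial's values.

import torch
import torch.nn as nn

student_channels, teacher_channels = 16, 32      # assumed channel counts
regressor = nn.Conv2d(student_channels, teacher_channels, kernel_size=3, padding=1)
mse = nn.MSELoss()

student_fmap = torch.randn(128, student_channels, 8, 8)   # dummy feature maps
teacher_fmap = torch.randn(128, teacher_channels, 8, 8)
loss = mse(regressor(student_fmap), teacher_fmap)  # the regressor is trained jointly with the student
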
@@ -734,10 +726,9 @@ def train_mse_loss(teacher, student, train_loader, epochs, learning_rate, featur
 print(f"Student accuracy with CE + CosineLoss: {test_accuracy_light_ce_and_cosine_loss:.2f}%")
 print(f"Student accuracy with CE + RegressorMSE: {test_accuracy_light_ce_and_mse_loss:.2f}%")

+######################################################################
 # Conclusions
 # --------------------------------------------
-
-######################################################################
 # None of the methods above increases the number of parameters for the network or inference time,
 # so the performance increase comes at the little cost of calculating gradients during training.
 # In ML applications, we mostly care about inference time because training happens before the model deployment.
