diff --git a/beginner_source/knowledge_distillation_tutorial.py b/beginner_source/knowledge_distillation_tutorial.py
index 4601352ff03..49ab9a134dc 100644
--- a/beginner_source/knowledge_distillation_tutorial.py
+++ b/beginner_source/knowledge_distillation_tutorial.py
@@ -352,7 +352,7 @@ def train_knowledge_distillation(teacher, student, train_loader, epochs, learnin
 # Cosine loss minimization run
 # ----------------------------
 # Feel free to play around with the temperature parameter that controls the softness of the softmax function and the loss coefficients.
-# In neural networks, it is easy to include to include additional loss functions to the main objectives to achieve goals like better generalization.
+# In neural networks, it is easy to include additional loss functions to the main objectives to achieve goals like better generalization.
 # Let's try including an objective for the student, but now let's focus on their hidden states rather than their output layers.
 # Our goal is to convey information from the teacher's representation to the student by including a naive loss function,
 # whose minimization implies that the flattened vectors that are subsequently passed to the classifiers have become more *similar* as the loss decreases.
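
For context, the hunk's comment describes pushing the teacher's and student's flattened hidden representations toward each other with an additional loss term. Below is a minimal, self-contained sketch of that idea using `nn.CosineEmbeddingLoss`; the tensor names (`teacher_hidden`, `student_hidden`) and shapes are illustrative placeholders, not the tutorial's actual variables.

```python
import torch
import torch.nn as nn

# Cosine loss between the two flattened hidden representations.
cosine_loss = nn.CosineEmbeddingLoss()

batch_size, hidden_dim = 32, 1024
teacher_hidden = torch.randn(batch_size, hidden_dim)  # flattened teacher features (placeholder)
student_hidden = torch.randn(batch_size, hidden_dim)  # flattened student features (placeholder)

# A target of +1 asks the loss to align each pair of vectors, so minimizing it
# makes the student's representation more *similar* to the teacher's.
target = torch.ones(batch_size)
hidden_rep_loss = cosine_loss(student_hidden, teacher_hidden, target)
print(hidden_rep_loss.item())
```

In practice this term would be weighted and added to the student's usual cross-entropy objective, which is what the "loss coefficients" mentioned in the hunk control.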