Commit a0e5e66
Qznan and svekars authored
Fix the Calculation of the KL divergence loss. (#2785)
Fix the calculation of the KL divergence loss. Refer to torch.nn.KLDivLoss in PyTorch.
Co-authored-by: Svetlana Karslioglu <svekars@meta.com>
1 parent 7e83c23 commit a0e5e66

File tree

1 file changed: +1 −1 lines changed


beginner_source/knowledge_distillation_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -324,7 +324,7 @@ def train_knowledge_distillation(teacher, student, train_loader, epochs, learnin
         soft_prob = nn.functional.log_softmax(student_logits / T, dim=-1)

         # Calculate the soft targets loss. Scaled by T**2 as suggested by the authors of the paper "Distilling the knowledge in a neural network"
-        soft_targets_loss = -torch.sum(soft_targets * soft_prob) / soft_prob.size()[0] * (T**2)
+        soft_targets_loss = torch.sum(soft_targets * (soft_targets.log() - soft_prob)) / soft_prob.size()[0] * (T**2)

         # Calculate the true label loss
         label_loss = ce_loss(student_logits, labels)
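
For context, the corrected line computes the forward KL divergence between the softened teacher distribution and the softened student distribution, averaged over the batch and scaled by T**2. Below is a minimal standalone sketch (not part of the commit; the batch size, class count, and temperature are illustrative assumptions) checking that the manual expression matches torch.nn.KLDivLoss with reduction="batchmean":

import torch
import torch.nn as nn

T = 2.0  # distillation temperature (illustrative value)
teacher_logits = torch.randn(8, 10)  # random stand-ins for real logits
student_logits = torch.randn(8, 10)

# Soften the distributions, as in the tutorial.
soft_targets = nn.functional.softmax(teacher_logits / T, dim=-1)
soft_prob = nn.functional.log_softmax(student_logits / T, dim=-1)

# Corrected manual computation from this commit: sum of p * (log p - log q),
# averaged over the batch and scaled by T**2.
manual = torch.sum(soft_targets * (soft_targets.log() - soft_prob)) / soft_prob.size()[0] * (T**2)

# Equivalent library call: KLDivLoss expects log-probabilities as input
# and probabilities as target.
kl_div = nn.KLDivLoss(reduction="batchmean")(soft_prob, soft_targets) * (T**2)

print(torch.allclose(manual, kl_div))  # True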
