
Commit 3d04f5f

Merge branch 'main' into improve-quantization-recipe
2 parents: 592d369 + 630c2e2

File tree

4 files changed: +7 -6 lines changed


advanced_source/super_resolution_with_onnxruntime.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 .. note::
     As of PyTorch 2.1, there are two versions of ONNX Exporter.
 
-    * ``torch.onnx.dynamo_export`is the newest (still in beta) exporter based on the TorchDynamo technology released with PyTorch 2.0.
+    * ``torch.onnx.dynamo_export`` is the newest (still in beta) exporter based on the TorchDynamo technology released with PyTorch 2.0.
     * ``torch.onnx.export`` is based on TorchScript backend and has been available since PyTorch 1.2.0.
 
 In this tutorial, we describe how to convert a model defined
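For context, a minimal sketch (not part of this diff) of how the two exporters named in the note are invoked; the stand-in model, input shape, and file names are illustrative assumptions only.

    import torch

    # Stand-in model and input, purely for illustration
    model = torch.nn.Linear(4, 2).eval()
    dummy_input = torch.randn(1, 4)

    # TorchScript-based exporter, available since PyTorch 1.2.0
    torch.onnx.export(model, dummy_input, "model_torchscript.onnx")

    # TorchDynamo-based exporter (beta as of PyTorch 2.1); returns an object
    # that can then be saved to disk
    onnx_program = torch.onnx.dynamo_export(model, dummy_input)
    onnx_program.save("model_dynamo.onnx")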

advanced_source/usb_semisup_learn.py

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@
 # algorithm on dataset
 #
 # Note that a CUDA-enabled backend is required for training with the ``semilearn`` package.
-# See `Enabling CUDA in Google Colab <https://pytorch.org/tutorials/beginner/colab#using-cuda>`__ for instructions
+# See `Enabling CUDA in Google Colab <https://pytorch.org/tutorials/beginner/colab#enabling-cuda>`__ for instructions
 # on enabling CUDA in Google Colab.
 #
 import semilearn
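Since the note states that a CUDA-enabled backend is required, a small illustrative guard (an assumption, not part of this diff) can fail fast with a clear message when no GPU is visible:

    import torch

    # The semilearn tutorial expects a CUDA backend; stop early with a helpful
    # hint instead of failing later inside training.
    if not torch.cuda.is_available():
        raise RuntimeError(
            "This tutorial requires a CUDA-enabled backend; enable a GPU "
            "runtime (e.g. in Google Colab) before running the training step."
        )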

beginner_source/ddp_series_multigpu.rst

Lines changed: 4 additions & 3 deletions
@@ -78,15 +78,15 @@ Imports
 Constructing the process group
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
+- First, before initializing the group process, call `set_device <https://pytorch.org/docs/stable/generated/torch.cuda.set_device.html?highlight=set_device#torch.cuda.set_device>`__,
+  which sets the default GPU for each process. This is important to prevent hangs or excessive memory utilization on `GPU:0`
 - The process group can be initialized by TCP (default) or from a
   shared file-system. Read more on `process group
   initialization <https://pytorch.org/docs/stable/distributed.html#tcp-initialization>`__
 - `init_process_group <https://pytorch.org/docs/stable/distributed.html?highlight=init_process_group#torch.distributed.init_process_group>`__
   initializes the distributed process group.
 - Read more about `choosing a DDP
   backend <https://pytorch.org/docs/stable/distributed.html#which-backend-to-use>`__
-- `set_device <https://pytorch.org/docs/stable/generated/torch.cuda.set_device.html?highlight=set_device#torch.cuda.set_device>`__
-  sets the default GPU for each process. This is important to prevent hangs or excessive memory utilization on `GPU:0`
 
 .. code-block:: diff
 
@@ -98,8 +98,9 @@ Constructing the process group
     +   """
     +   os.environ["MASTER_ADDR"] = "localhost"
     +   os.environ["MASTER_PORT"] = "12355"
-    +   init_process_group(backend="nccl", rank=rank, world_size=world_size)
     +   torch.cuda.set_device(rank)
+    +   init_process_group(backend="nccl", rank=rank, world_size=world_size)
+
 
 
 Constructing the DDP model
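Put together, the setup function now reads roughly as below (a sketch assembled from the tutorial's snippet; the function signature and docstring are assumed context, not lines changed in this commit). The point of the reordering is that `torch.cuda.set_device(rank)` runs before `init_process_group`, so each process binds to its own GPU instead of defaulting to `GPU:0`:

    import os
    import torch
    from torch.distributed import init_process_group

    def ddp_setup(rank: int, world_size: int):
        """
        Args:
            rank: Unique identifier of each process
            world_size: Total number of processes
        """
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = "12355"
        torch.cuda.set_device(rank)   # bind this process to its GPU first
        init_process_group(backend="nccl", rank=rank, world_size=world_size)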

beginner_source/knowledge_distillation_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -324,7 +324,7 @@ def train_knowledge_distillation(teacher, student, train_loader, epochs, learnin
         soft_prob = nn.functional.log_softmax(student_logits / T, dim=-1)
 
         # Calculate the soft targets loss. Scaled by T**2 as suggested by the authors of the paper "Distilling the knowledge in a neural network"
-        soft_targets_loss = -torch.sum(soft_targets * soft_prob) / soft_prob.size()[0] * (T**2)
+        soft_targets_loss = torch.sum(soft_targets * (soft_targets.log() - soft_prob)) / soft_prob.size()[0] * (T**2)
 
         # Calculate the true label loss
         label_loss = ce_loss(student_logits, labels)
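The new expression computes a true KL divergence between the teacher's soft targets and the student's log-probabilities, rather than only the cross-entropy term of the old line. As a sanity check (an illustrative sketch with made-up logits, not part of this commit), it should match PyTorch's built-in F.kl_div with batch-mean reduction:

    import torch
    import torch.nn.functional as F

    T = 2.0
    teacher_logits = torch.randn(8, 10)   # made-up logits for illustration
    student_logits = torch.randn(8, 10)

    soft_targets = F.softmax(teacher_logits / T, dim=-1)
    soft_prob = F.log_softmax(student_logits / T, dim=-1)

    # Expression used in the tutorial after this change
    manual = torch.sum(soft_targets * (soft_targets.log() - soft_prob)) / soft_prob.size()[0] * (T**2)

    # Equivalent built-in form
    builtin = F.kl_div(soft_prob, soft_targets, reduction="batchmean") * (T**2)

    print(torch.allclose(manual, builtin))  # expected: True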
