amend

Vincent Moens · Vincent Moens · commit 0d6cba7fa374 · 2024-07-29T19:42:03.000-04:00
diff --git a/intermediate_source/pinmem_nonblock.py b/intermediate_source/pinmem_nonblock.py
@@ -338,10 +338,12 @@ def timer(cmd):
 # executing the transfer. This makes sense, since we're actually asking python to execute an operation that CUDA will
 # perform anyway before copying the data from host to device.
 #
-# .. note:: Here too, the observation may vary depending on the available hardware.
-#   The pytorch implementation of
+# .. note:: The pytorch implementation of
 #   `pin_memory <https://github.com/pytorch/pytorch/blob/5298acb5c76855bc5a99ae10016efc86b27949bd/aten/src/ATen/native/Memory.cpp#L58>`_
-#   could be, in rare cases, faster than the corresponding CUDA version.
+#   which relies on creating a brand new storage in pinned memory through `cudaHostAlloc <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gb65da58f444e7230d3322b6126bb4902>`_
+#   could be, in rare cases, faster than transferring data in chunks as ``cudaMemcpy`` does.
+#   Here too, the observation may vary depending on the available hardware, the size of the tensors being sent or
+#   the amount of available RAM.
 #
 # ``non_blocking=True``
 # ~~~~~~~~~~~~~~~~~~~~~
@@ -738,5 +740,6 @@ def pin_copy_to_device_nonblocking(*tensors):
 #
 #  - `CUDA toolkit memory management doc <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html>`_
 #  - `CUDA pin-memory note <https://forums.developer.nvidia.com/t/pinned-memory/268474>`_
+#  - `How to Optimize Data Transfers in CUDA C/C++ <https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/>`_
 #  - tensordict :meth:`~tensordict.TensorDict.to` method;
 #

Original file line number	Diff line number	Diff line change
`@@ -338,10 +338,12 @@ def timer(cmd):`
`338`	`338`	`# executing the transfer. This makes sense, since we're actually asking python to execute an operation that CUDA will`
`339`	`339`	`# perform anyway before copying the data from host to device.`
`340`	`340`	`#`
`341`		`-# .. note:: Here too, the observation may vary depending on the available hardware.`
`342`		`-# The pytorch implementation of`
	`341`	`+# .. note:: The pytorch implementation of`
`343`	`342`	# `pin_memory <https://github.com/pytorch/pytorch/blob/5298acb5c76855bc5a99ae10016efc86b27949bd/aten/src/ATen/native/Memory.cpp#L58>`_
`344`		`-# could be, in rare cases, faster than the corresponding CUDA version.`
	`343`	+# which relies on creating a brand new storage in pinned memory through `cudaHostAlloc <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gb65da58f444e7230d3322b6126bb4902>`_
	`344`	+# could be, in rare cases, faster than transferring data in chunks as ``cudaMemcpy`` does.
	`345`	`+# Here too, the observation may vary depending on the available hardware, the size of the tensors being sent or`
	`346`	`+# the amount of available RAM.`
`345`	`347`	`#`
`346`	`348`	# ``non_blocking=True``
`347`	`349`	`# ~~~~~~~~~~~~~~~~~~~~~`
`@@ -738,5 +740,6 @@ def pin_copy_to_device_nonblocking(*tensors):`
`738`	`740`	`#`
`739`	`741`	# - `CUDA toolkit memory management doc <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html>`_
`740`	`742`	# - `CUDA pin-memory note <https://forums.developer.nvidia.com/t/pinned-memory/268474>`_
	`743`	+# - `How to Optimize Data Transfers in CUDA C/C++ <https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/>`_
`741`	`744`	# - tensordict :meth:`~tensordict.TensorDict.to` method;
`742`	`745`	`#`