
Commit d3befe4

Author: Vincent Moens
Merge remote-tracking branch 'origin/pinmem-nonblock-tuto' into pinmem-nonblock-tuto
2 parents: 8f4d6d7 + 12d1b69

File tree

1 file changed: +12 −12 lines


intermediate_source/pinmem_nonblock.py

Lines changed: 12 additions & 12 deletions
@@ -125,7 +125,7 @@
 # As the following example will show, three requirements must be met to enable this:
 #
 # 1. The device must have at least one free DMA (Direct Memory Access) engine. Modern GPU architectures such as Volterra,
-#    Tesla or H100 devices have more than one DMA engine.
+#    Tesla, or H100 devices have more than one DMA engine.
 #
 # 2. The transfer must be done on a separate, non-default cuda stream. In PyTorch, cuda streams can be handled using
 #    :class:`~torch.cuda.Stream`.
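As a concrete illustration of the requirements listed in this hunk, here is a minimal sketch (not part of the patch) that issues a host-to-device copy on a non-default stream; it assumes a CUDA-capable machine, and the tensor shape is arbitrary:

import torch

if torch.cuda.is_available():
    copy_stream = torch.cuda.Stream()                # separate, non-default stream
    src = torch.randn(1024, 1024, pin_memory=True)   # page-locked host tensor

    with torch.cuda.stream(copy_stream):
        # non_blocking=True lets a free DMA engine run the copy asynchronously
        dst = src.to("cuda", non_blocking=True)

    # Make the default stream wait for the copy before consuming ``dst``.
    torch.cuda.current_stream().wait_stream(copy_stream)
    out = dst * 2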
@@ -250,7 +250,7 @@ def benchmark_with_profiler(
 # New tensors can be directly created in pinned memory with functions like :func:`~torch.zeros`, :func:`~torch.ones` and other
 # constructors.
 #
-# Let us check the speed of pinning memory and sending tensors to cuda:
+# Let us check the speed of pinning memory and sending tensors to CUDA:


 import torch
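For readers skimming the diff, the comparison this part of the tutorial runs looks roughly like the following sketch (assumed shapes and a CUDA device; :class:`torch.utils.benchmark.Timer` is used here for brevity, the tutorial's own benchmark differs):

import torch
from torch.utils.benchmark import Timer

if torch.cuda.is_available():
    pageable = torch.zeros(1024, 1024)                  # ordinary host memory
    pinned = torch.zeros(1024, 1024, pin_memory=True)   # page-locked host memory

    for name, t in [("pageable", pageable), ("pinned", pinned)]:
        m = Timer(
            stmt="t.to('cuda', non_blocking=True); torch.cuda.synchronize()",
            globals={"t": t, "torch": torch},
        ).blocked_autorange()
        print(f"{name}: {m.median * 1e6:.1f} us")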
@@ -318,10 +318,10 @@ def timer(cmd):
 #
 # However, contrary to a somewhat common belief, calling :meth:`~torch.Tensor.pin_memory()` on a pageable tensor before
 # casting it to GPU should not bring any significant speed-up; on the contrary, this call is usually slower than just
-# executing the transfer. This makes sense, since we're actually asking python to execute an operation that CUDA will
+# executing the transfer. This makes sense, since we're actually asking Python to execute an operation that CUDA will
 # perform anyway before copying the data from host to device.
 #
-# .. note:: The pytorch implementation of
+# .. note:: The PyTorch implementation of
 #    `pin_memory <https://github.com/pytorch/pytorch/blob/5298acb5c76855bc5a99ae10016efc86b27949bd/aten/src/ATen/native/Memory.cpp#L58>`_,
 #    which relies on creating a brand new storage in pinned memory through `cudaHostAlloc <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gb65da58f444e7230d3322b6126bb4902>`_,
 #    could be, in rare cases, faster than transitioning data in chunks as ``cudaMemcpy`` does.
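The claim in this hunk can be checked with a quick sketch along these lines (assumed sizes, not the tutorial's benchmark): pinning a pageable tensor yourself simply moves the page-locking work into Python before the same transfer runs.

import torch
from torch.utils.benchmark import Timer

if torch.cuda.is_available():
    t = torch.randn(1024, 1024)  # pageable host tensor

    direct = Timer(
        stmt="t.to('cuda', non_blocking=True); torch.cuda.synchronize()",
        globals={"t": t, "torch": torch},
    ).blocked_autorange()
    pin_first = Timer(
        stmt="t.pin_memory().to('cuda', non_blocking=True); torch.cuda.synchronize()",
        globals={"t": t, "torch": torch},
    ).blocked_autorange()
    print(f"direct: {direct.median * 1e6:.1f} us, pin first: {pin_first.median * 1e6:.1f} us")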
@@ -505,7 +505,7 @@ def pin_copy_to_device_nonblocking(*tensors):


 ######################################################################
-# Other copy directions (GPU -> CPU, CPU -> MPS etc.)
+# Other copy directions (GPU -> CPU, CPU -> MPS)
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #
 # .. _pinned_memory_other_direction:
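For the device-to-host direction named in this heading, a minimal sketch (assuming a CUDA device and an arbitrary tensor size) shows why synchronization matters: a non-blocking GPU -> CPU copy can return before the data has actually landed on the host.

import torch

if torch.cuda.is_available():
    gpu_t = torch.randn(1024, device="cuda")
    cpu_t = gpu_t.to("cpu", non_blocking=True)  # copy may still be in flight here
    torch.cuda.synchronize()                    # wait before reading cpu_t
    print(cpu_t.sum())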
@@ -693,7 +693,7 @@ def pin_copy_to_device_nonblocking(*tensors):
 #
 # - **System Architecture**
 #
-#   How is the system's architecture influencing data transfer speeds (e.g., bus speeds, network latency)?
+#   How is the system's architecture influencing data transfer speeds (for example, bus speeds, network latency)?
 #
 # Additionally, allocating a large number of tensors or sizable tensors in pinned memory can monopolize a substantial
 # portion of RAM.
@@ -718,11 +718,11 @@ def pin_copy_to_device_nonblocking(*tensors):
 #
 # .. _pinned_memory_resources:
 #
-# If you are dealing with issues with memory copies when using CUDA devices or want to learn more about
-# what was discussed in this tutorial, check the following references:
+# If you are dealing with issues with memory copies when using CUDA devices or want to learn more about
+# what was discussed in this tutorial, check the following references:
 #
-# - `CUDA toolkit memory management doc <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html>`_
-# - `CUDA pin-memory note <https://forums.developer.nvidia.com/t/pinned-memory/268474>`_
-# - `How to Optimize Data Transfers in CUDA C/C++ <https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/>`_
-# - tensordict :meth:`~tensordict.TensorDict.to` method;
+# - `CUDA toolkit memory management doc <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html>`_;
+# - `CUDA pin-memory note <https://forums.developer.nvidia.com/t/pinned-memory/268474>`_;
+# - `How to Optimize Data Transfers in CUDA C/C++ <https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/>`_;
+# - tensordict :meth:`~tensordict.TensorDict.to` method.
 #
