@@ -125,7 +125,7 @@
 # As the following example will show, three requirements must be met to enable this:
 #
 # 1. The device must have at least one free DMA (Direct Memory Access) engine. Modern GPU architectures such as Volta,
-# Tesla or H100 devices have more than one DMA engine.
+# Tesla, or H100 devices have more than one DMA engine.
 #
 # 2. The transfer must be done on a separate, non-default CUDA stream. In PyTorch, CUDA streams can be handled using
 # :class:`~torch.cuda.Stream`.
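A minimal sketch of requirement 2: issuing the host-to-device copy on a side stream so the default stream stays free for compute. The stream and tensor names, shapes, and sizes below are illustrative, and a CUDA-capable device is assumed:

```python
import torch

if torch.cuda.is_available():
    # A dedicated, non-default stream for copies (illustrative name).
    copy_stream = torch.cuda.Stream()

    # Pinned source memory is required for the copy to be truly asynchronous.
    x_cpu = torch.randn(1024, 1024, pin_memory=True)

    with torch.cuda.stream(copy_stream):
        # Issued on copy_stream, not the default stream.
        x_gpu = x_cpu.to("cuda", non_blocking=True)

    # Make the default stream wait for the copy before consuming x_gpu.
    torch.cuda.current_stream().wait_stream(copy_stream)
    y = x_gpu.sum()
```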
@@ -250,7 +250,7 @@ def benchmark_with_profiler(
 # New tensors can be directly created in pinned memory with functions like :func:`~torch.zeros`, :func:`~torch.ones` and other
 # constructors.
 #
-# Let us check the speed of pinning memory and sending tensors to cuda:
+# Let us check the speed of pinning memory and sending tensors to CUDA:


 import torch
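A rough sketch of the kind of comparison this part of the tutorial sets up, pageable versus pinned host-to-device copies, using :class:`torch.utils.benchmark.Timer`. Buffer sizes are arbitrary and a CUDA device is assumed:

```python
import torch
from torch.utils.benchmark import Timer

# One pageable and one pinned tensor of the same size.
pageable = torch.zeros(1024, 1024)
pinned = torch.zeros(1024, 1024, pin_memory=True)

for name, t in [("pageable", pageable), ("pinned", pinned)]:
    m = Timer(
        stmt="t.to('cuda', non_blocking=True); torch.cuda.synchronize()",
        globals={"t": t, "torch": torch},
    ).blocked_autorange()
    print(f"{name}: {m.median * 1e6:.1f} us per copy")
```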
@@ -318,10 +318,10 @@ def timer(cmd):
 #
 # However, contrary to a somewhat common belief, calling :meth:`~torch.Tensor.pin_memory()` on a pageable tensor before
 # casting it to GPU should not bring any significant speed-up; on the contrary, this call is usually slower than just
-# executing the transfer. This makes sense, since we're actually asking python to execute an operation that CUDA will
+# executing the transfer. This makes sense, since we're actually asking Python to execute an operation that CUDA will
 # perform anyway before copying the data from host to device.
 #
-# .. note:: The pytorch implementation of
+# .. note:: The PyTorch implementation of
 #    `pin_memory <https://github.com/pytorch/pytorch/blob/5298acb5c76855bc5a99ae10016efc86b27949bd/aten/src/ATen/native/Memory.cpp#L58>`_,
 #    which relies on creating a brand new storage in pinned memory through `cudaHostAlloc <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gb65da58f444e7230d3322b6126bb4902>`_,
 #    could be, in rare cases, faster than transferring data in chunks as ``cudaMemcpy`` does.
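To make the claim above concrete, here is a hedged sketch comparing a direct copy with a copy that pins first. Exact timings depend on the machine, and a CUDA device is assumed:

```python
import torch
from torch.utils.benchmark import Timer

t = torch.zeros(1024, 1024)  # a pageable tensor

# Direct transfer: CUDA stages the data through pinned memory internally.
direct = Timer(
    stmt="t.to('cuda'); torch.cuda.synchronize()",
    globals={"t": t, "torch": torch},
).blocked_autorange()

# Explicitly pinning first allocates a new pinned buffer, copies into it,
# and only then transfers, which is usually more work, not less.
pin_first = Timer(
    stmt="t.pin_memory().to('cuda'); torch.cuda.synchronize()",
    globals={"t": t, "torch": torch},
).blocked_autorange()

print(f"direct: {direct.median * 1e6:.1f} us, pin first: {pin_first.median * 1e6:.1f} us")
```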
@@ -505,7 +505,7 @@ def pin_copy_to_device_nonblocking(*tensors):


 ######################################################################
-# Other copy directions (GPU -> CPU, CPU -> MPS etc.)
+# Other copy directions (GPU -> CPU, CPU -> MPS)
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #
 # .. _pinned_memory_other_direction:
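For the device-to-host direction named in this heading, a short sketch of the pitfall the tutorial section goes on to discuss (a CUDA device is assumed): a non-blocking GPU -> CPU copy can return before the data has landed in host memory, so you must synchronize before reading the result.

```python
import torch

if torch.cuda.is_available():
    x_gpu = torch.randn(1024, 1024, device="cuda")

    # The copy is queued on the current stream and may still be in flight
    # when this call returns.
    x_cpu = x_gpu.to("cpu", non_blocking=True)

    # Without this synchronization, reading x_cpu could observe stale data.
    torch.cuda.synchronize()
    print(x_cpu.sum())
```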