@@ -125,7 +125,7 @@
 # As the following example will show, three requirements must be met to enable this:
 #
 # 1. The device must have at least one free DMA (Direct Memory Access) engine. Modern GPU architectures such as Volta,
-# Tesla or H100 devices have more than one DMA engine.
+# Tesla, or H100 devices have more than one DMA engine.
 #
 # 2. The transfer must be done on a separate, non-default CUDA stream. In PyTorch, CUDA streams can be handled using
 # :class:`~torch.cuda.Stream`.
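A minimal sketch of requirement 2: issuing the host-to-device copy on a side stream so the default stream stays free for compute. The stream and tensor names, shapes, and sizes below are illustrative, and a CUDA-capable device is assumed:

```python
import torch

if torch.cuda.is_available():
    # A dedicated, non-default stream for copies (illustrative name).
    copy_stream = torch.cuda.Stream()

    # Pinned source memory is required for the copy to be truly asynchronous.
    x_cpu = torch.randn(1024, 1024, pin_memory=True)

    with torch.cuda.stream(copy_stream):
        # Issued on copy_stream, not the default stream.
        x_gpu = x_cpu.to("cuda", non_blocking=True)

    # Make the default stream wait for the copy before consuming x_gpu.
    torch.cuda.current_stream().wait_stream(copy_stream)
    y = x_gpu.sum()
```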
@@ -250,7 +250,7 @@ def benchmark_with_profiler(
 # New tensors can be directly created in pinned memory with functions like :func:`~torch.zeros`, :func:`~torch.ones` and other
 # constructors.
 #
-# Let us check the speed of pinning memory and sending tensors to cuda:
+# Let us check the speed of pinning memory and sending tensors to CUDA:


 import torch
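A rough sketch of the kind of comparison this part of the tutorial sets up, pageable versus pinned host-to-device copies, using :class:`torch.utils.benchmark.Timer`. Buffer sizes are arbitrary and a CUDA device is assumed:

```python
import torch
from torch.utils.benchmark import Timer

# One pageable and one pinned tensor of the same size.
pageable = torch.zeros(1024, 1024)
pinned = torch.zeros(1024, 1024, pin_memory=True)

for name, t in [("pageable", pageable), ("pinned", pinned)]:
    m = Timer(
        stmt="t.to('cuda', non_blocking=True); torch.cuda.synchronize()",
        globals={"t": t, "torch": torch},
    ).blocked_autorange()
    print(f"{name}: {m.median * 1e6:.1f} us per copy")
```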
@@ -318,10 +318,10 @@ def timer(cmd):
 #
 # However, contrary to a somewhat common belief, calling :meth:`~torch.Tensor.pin_memory()` on a pageable tensor before
 # casting it to GPU should not bring any significant speed-up; on the contrary, this call is usually slower than just
-# executing the transfer. This makes sense, since we're actually asking python to execute an operation that CUDA will
+# executing the transfer. This makes sense, since we're actually asking Python to execute an operation that CUDA will
 # perform anyway before copying the data from host to device.
 #
-# .. note:: The pytorch implementation of
+# .. note:: The PyTorch implementation of
 #    `pin_memory <https://github.com/pytorch/pytorch/blob/5298acb5c76855bc5a99ae10016efc86b27949bd/aten/src/ATen/native/Memory.cpp#L58>`_,
 #    which relies on creating a brand new storage in pinned memory through `cudaHostAlloc <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gb65da58f444e7230d3322b6126bb4902>`_,
 #    could be, in rare cases, faster than transferring data in chunks as ``cudaMemcpy`` does.
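To make the claim above concrete, here is a hedged sketch comparing a direct copy with a copy that pins first. Exact timings depend on the machine, and a CUDA device is assumed:

```python
import torch
from torch.utils.benchmark import Timer

t = torch.zeros(1024, 1024)  # a pageable tensor

# Direct transfer: CUDA stages the data through pinned memory internally.
direct = Timer(
    stmt="t.to('cuda'); torch.cuda.synchronize()",
    globals={"t": t, "torch": torch},
).blocked_autorange()

# Explicitly pinning first allocates a new pinned buffer, copies into it,
# and only then transfers, which is usually more work, not less.
pin_first = Timer(
    stmt="t.pin_memory().to('cuda'); torch.cuda.synchronize()",
    globals={"t": t, "torch": torch},
).blocked_autorange()

print(f"direct: {direct.median * 1e6:.1f} us, pin first: {pin_first.median * 1e6:.1f} us")
```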
@@ -505,7 +505,7 @@ def pin_copy_to_device_nonblocking(*tensors):


 ######################################################################
-# Other copy directions (GPU -> CPU, CPU -> MPS etc.)
+# Other copy directions (GPU -> CPU, CPU -> MPS)
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #
 # .. _pinned_memory_other_direction:
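For the device-to-host direction named in this heading, a short sketch of the pitfall the tutorial section goes on to discuss (a CUDA device is assumed): a non-blocking GPU -> CPU copy can return before the data has landed in host memory, so you must synchronize before reading the result.

```python
import torch

if torch.cuda.is_available():
    x_gpu = torch.randn(1024, 1024, device="cuda")

    # The copy is queued on the current stream and may still be in flight
    # when this call returns.
    x_cpu = x_gpu.to("cpu", non_blocking=True)

    # Without this synchronization, reading x_cpu could observe stale data.
    torch.cuda.synchronize()
    print(x_cpu.sum())
```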