66
66
#
67
67
# When one creates a CPU tensor in PyTorch, the content of this tensor needs to be placed
68
68
# in memory. The memory we talk about here is a rather complex concept worth looking at carefully.
69
- # We distinguish two types of memory that are handled by the Memory Management Unit: the main memory (for simplicity)
69
+ # We distinguish two types of memory that are handled by the Memory Management Unit: the RAM (for simplicity)
70
70
# and the swap space on disk (which may or may not be the hard drive). Together, the available space in disk and RAM (physical memory)
71
71
# make up the virtual memory, which is an abstraction of the total resources available.
72
72
# In short, the virtual memory makes it so that the available space is larger than what can be found on RAM in isolation
78
78
#
79
79
# Typically, when a program accesses a page that is not in RAM, a "page fault" occurs and the operating system (OS) then brings
80
80
# back this page into RAM ("swap in" or "page in").
81
- # In turn, the OS may have to _swap out_ (or _page out_ ) another page to make room for the new page.
81
+ # In turn, the OS may have to swap out (or "page out") another page to make room for the new page.
82
82
#
83
- # In contrast to pageable memory, a _pinned_ (or _page-locked_ or _non-pageable_ ) memory is a type of memory that cannot
83
+ # In contrast to pageable memory, pinned (or page-locked or non-pageable) memory is a type of memory that cannot
84
84
# be swapped out to disk.
85
85
# It allows for faster and more predictable access times, but has the downside that it is more limited than the
86
86
# pageable memory (aka the main memory).
@@ -158,13 +158,13 @@ def inner(pinned: bool, streamed: bool):
158
158
t1_cuda = t1_cpu_pinned .to (device , non_blocking = True )
159
159
else :
160
160
t2_cuda = t2_cpu_paged .to (device , non_blocking = True )
161
- t2_h2d_event = s .record_event ()
161
+ t_star_cuda_h2d_event = s .record_event ()
162
162
# This operation can be executed during the CPU to GPU copy if and only if the tensor is pinned and the copy is
163
163
# done in the other stream
164
164
t3_cuda_mul = t3_cuda * t3_cuda * t3_cuda
165
- t1_h2d_event = torch .cuda .current_stream ().record_event ()
166
- t1_h2d_event .synchronize ()
167
- t2_h2d_event .synchronize ()
165
+ t3_cuda_h2d_event = torch .cuda .current_stream ().record_event ()
166
+ t_star_cuda_h2d_event .synchronize ()
167
+ t3_cuda_h2d_event .synchronize ()
168
168
169
169
170
170
# Our profiler: profiles the `inner` function and stores the results in a .json file
@@ -206,7 +206,7 @@ def benchmark_with_profiler(
206
206
#
207
207
# Using a pinned tensor doesn't change the trace much; both operations are still executed consecutively:
208
208
209
- benchmark_with_profiler (streamed = True , pinned = False )
209
+ benchmark_with_profiler (streamed = False , pinned = True )
210
210
211
211
######################################################################
212
212
#
@@ -215,7 +215,7 @@ def benchmark_with_profiler(
215
215
#
216
216
# Sending a pageable tensor to GPU on a separate stream is also a blocking operation:
217
217
218
- benchmark_with_profiler (streamed = False , pinned = True )
218
+ benchmark_with_profiler (streamed = True , pinned = False )
219
219
220
220
######################################################################
221
221
#
@@ -323,7 +323,7 @@ def timer(cmd):
323
323
#
324
324
# .. note:: The PyTorch implementation of
325
325
# `pin_memory <https://github.com/pytorch/pytorch/blob/5298acb5c76855bc5a99ae10016efc86b27949bd/aten/src/ATen/native/Memory.cpp#L58>`_
326
- # which relies on creating a brand new storage in pinned memory through `cudaHostAlloc <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gb65da58f444e7230d3322b6126bb4902>`
326
+ # which relies on creating a brand new storage in pinned memory through `cudaHostAlloc <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gb65da58f444e7230d3322b6126bb4902>`_
327
327
# could be, in rare cases, faster than transitioning data in chunks as ``cudaMemcpy`` does.
328
328
# Here too, the observation may vary depending on the available hardware, the size of the tensors being sent or
329
329
# the amount of available RAM.
@@ -724,5 +724,5 @@ def pin_copy_to_device_nonblocking(*tensors):
724
724
# - `CUDA toolkit memory management doc <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html>`_;
725
725
# - `CUDA pin-memory note <https://forums.developer.nvidia.com/t/pinned-memory/268474>`_;
726
726
# - `How to Optimize Data Transfers in CUDA C/C++ <https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/>`_;
727
- # - tensordict :meth:`~ tensordict.TensorDictBase.to` method .
727
+ # - `tensordict doc <https://pytorch.org/tensordict/stable/index.html>`_ and `repo <https://github.com/pytorch/tensordict>`_.
728
728
#
0 commit comments