diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt
index a355966683a..183867b2fcd 100644
--- a/.ci/docker/requirements.txt
+++ b/.ci/docker/requirements.txt
@@ -28,8 +28,9 @@ tensorboard
 jinja2==3.1.3
 pytorch-lightning
 torchx
-torchrl==0.3.0
-tensordict==0.3.0
+# TODO: use stable 0.5 when released
+-e git+https://github.com/pytorch/rl.git#egg=torchrl
+-e git+https://github.com/pytorch/tensordict.git#egg=tensordict
 ax-platform
 nbformat>==5.9.2
 datasets
diff --git a/_static/img/pinmem/pinmem.png b/_static/img/pinmem/pinmem.png
new file mode 100644
index 00000000000..9d84e9d229d
Binary files /dev/null and b/_static/img/pinmem/pinmem.png differ
diff --git a/_static/img/pinmem/trace_streamed0_pinned0.png b/_static/img/pinmem/trace_streamed0_pinned0.png
new file mode 100644
index 00000000000..dedac997b0b
Binary files /dev/null and b/_static/img/pinmem/trace_streamed0_pinned0.png differ
diff --git a/_static/img/pinmem/trace_streamed0_pinned1.png b/_static/img/pinmem/trace_streamed0_pinned1.png
new file mode 100644
index 00000000000..2d5ff462e1a
Binary files /dev/null and b/_static/img/pinmem/trace_streamed0_pinned1.png differ
diff --git a/_static/img/pinmem/trace_streamed1_pinned0.png b/_static/img/pinmem/trace_streamed1_pinned0.png
new file mode 100644
index 00000000000..130182a1978
Binary files /dev/null and b/_static/img/pinmem/trace_streamed1_pinned0.png differ
diff --git a/_static/img/pinmem/trace_streamed1_pinned1.png b/_static/img/pinmem/trace_streamed1_pinned1.png
new file mode 100644
index 00000000000..c596fcdb691
Binary files /dev/null and b/_static/img/pinmem/trace_streamed1_pinned1.png differ
diff --git a/advanced_source/coding_ddpg.py b/advanced_source/coding_ddpg.py
index 7dd3acf238d..c634932971b 100644
--- a/advanced_source/coding_ddpg.py
+++ b/advanced_source/coding_ddpg.py
@@ -182,7 +182,7 @@
 # Later, we will see how the target parameters should be updated in TorchRL.
 #
-from tensordict.nn import TensorDictModule
+from tensordict.nn import TensorDictModule, TensorDictSequential
 
 
 def _init(
@@ -290,12 +290,11 @@ def _loss_actor(
     ) -> torch.Tensor:
         td_copy = tensordict.select(*self.actor_in_keys)
         # Get an action from the actor network: since we made it functional, we need to pass the params
-        td_copy = self.actor_network(td_copy, params=self.actor_network_params)
+        with self.actor_network_params.to_module(self.actor_network):
+            td_copy = self.actor_network(td_copy)
         # get the value associated with that action
-        td_copy = self.value_network(
-            td_copy,
-            params=self.value_network_params.detach(),
-        )
+        with self.value_network_params.detach().to_module(self.value_network):
+            td_copy = self.value_network(td_copy)
         return -td_copy.get("state_action_value")
 
@@ -317,7 +316,8 @@ def _loss_value(
         td_copy = tensordict.clone()
 
         # V(s, a)
-        self.value_network(td_copy, params=self.value_network_params)
+        with self.value_network_params.to_module(self.value_network):
+            self.value_network(td_copy)
 
         pred_val = td_copy.get("state_action_value").squeeze(-1)
 
         # we manually reconstruct the parameters of the actor-critic, where the first
@@ -332,9 +332,8 @@ def _loss_value(
             batch_size=self.target_actor_network_params.batch_size,
             device=self.target_actor_network_params.device,
         )
-        target_value = self.value_estimator.value_estimate(
-            tensordict, target_params=target_params
-        ).squeeze(-1)
+        with target_params.to_module(self.actor_critic):
+            target_value = self.value_estimator.value_estimate(tensordict).squeeze(-1)
 
         # Computes the value loss: L2, L1 or smooth L1 depending on `self.loss_function`
         loss_value = distance_loss(pred_val, target_value, loss_function=self.loss_function)
@@ -717,7 +716,7 @@ def get_env_stats():
     ActorCriticWrapper,
     DdpgMlpActor,
     DdpgMlpQNet,
-    OrnsteinUhlenbeckProcessWrapper,
+    OrnsteinUhlenbeckProcessModule,
     ProbabilisticActor,
     TanhDelta,
     ValueOperator,
@@ -776,15 +775,18 @@ def make_ddpg_actor(
 # Exploration
 # ~~~~~~~~~~~
 #
-# The policy is wrapped in a :class:`~torchrl.modules.OrnsteinUhlenbeckProcessWrapper`
+# The policy is passed into a :class:`~torchrl.modules.OrnsteinUhlenbeckProcessModule`
 # exploration module, as suggested in the original paper.
 # Let's define the number of frames before OU noise reaches its minimum value
 annealing_frames = 1_000_000
 
-actor_model_explore = OrnsteinUhlenbeckProcessWrapper(
+actor_model_explore = TensorDictSequential(
     actor,
-    annealing_num_steps=annealing_frames,
-).to(device)
+    OrnsteinUhlenbeckProcessModule(
+        spec=actor.spec.clone(),
+        annealing_num_steps=annealing_frames,
+    ).to(device),
+)
 if device == torch.device("cpu"):
     actor_model_explore.share_memory()
@@ -1168,7 +1170,7 @@ def ceil_div(x, y):
     )
 
     # update the exploration strategy
-    actor_model_explore.step(current_frames)
+    actor_model_explore[1].step(current_frames)
 
 collector.shutdown()
 del collector
diff --git a/en-wordlist.txt b/en-wordlist.txt
index b52d8374d3e..62762ab69cc 100644
--- a/en-wordlist.txt
+++ b/en-wordlist.txt
@@ -1,3 +1,4 @@
+
 ACL
 ADI
 AOT
@@ -50,6 +51,7 @@ DDP
 DDPG
 DDQN
 DLRM
+DMA
 DNN
 DQN
 DataLoaders
@@ -68,6 +70,8 @@ Ecker
 ExportDB
 FC
 FGSM
+tensordict
+DataLoader's
 FLAVA
 FSDP
 FX
@@ -139,6 +143,7 @@ MKLDNN
 MLP
 MLPs
 MNIST
+MPS
 MUC
 MacBook
 MacOS
@@ -219,6 +224,7 @@ STR
 SVE
 SciPy
 Sequentials
+Sharding
 Sigmoid
 SoTA
 Sohn
@@ -254,6 +260,7 @@ VLDB
 VQA
 VS Code
 ViT
+Volta
 WMT
 WSI
 WSIs
@@ -336,11 +343,11 @@ dataset’s
 deallocation
 decompositions
 decorrelated
-devicemesh
 deserialize
 deserialized
 desynchronization
 deterministically
+devicemesh
 dimensionality
 dir
 discontiguous
@@ -384,6 +391,7 @@ hessian
 hessians
 histoencoder
 histologically
+homonymous
 hotspot
 hvp
 hyperparameter
@@ -459,6 +467,7 @@ optimizer's
 optimizers
 otsu
 overfitting
+pageable
 parallelizable
 parallelization
 parametrization
@@ -522,7 +531,6 @@ runtime
 runtimes
 scalable
 sharded
-Sharding
 softmax
 sparsified
 sparsifier
@@ -609,4 +617,4 @@ warmstarting
 warmup
 webp
 wsi
-wsis
+wsis
\ No newline at end of file
diff --git a/index.rst b/index.rst
index 7781628c851..91517834fd8 100644
--- a/index.rst
+++ b/index.rst
@@ -3,6 +3,7 @@ Welcome to PyTorch Tutorials
 
 **What's new in PyTorch tutorials?**
 
+* `A guide on good usage of non_blocking and pin_memory() in PyTorch `__
 * `Introduction to Distributed Pipeline Parallelism `__
 * `Introduction to Libuv TCPStore Backend `__
 * `Asynchronous Saving with Distributed Checkpoint (DCP) `__
@@ -93,6 +94,13 @@ Welcome to PyTorch Tutorials
    :link: intermediate/tensorboard_tutorial.html
    :tags: Interpretability,Getting-Started,TensorBoard
 
+.. customcarditem::
+   :header: Good usage of `non_blocking` and `pin_memory()` in PyTorch
+   :card_description: A guide on best practices to copy data from CPU to GPU.
+   :image: _static/img/pinmem/pinmem.png
+   :link: intermediate/pinmem_nonblock.html
+   :tags: Getting-Started
+
 .. Image/Video
 
 .. customcarditem::
@@ -969,6 +977,7 @@ Additional Resources
    beginner/pytorch_with_examples
    beginner/nn_tutorial
    intermediate/tensorboard_tutorial
+   intermediate/pinmem_nonblock
 
 .. toctree::
    :maxdepth: 2
diff --git a/intermediate_source/dqn_with_rnn_tutorial.py b/intermediate_source/dqn_with_rnn_tutorial.py
index 991a0ff8bd6..6ea09559392 100644
--- a/intermediate_source/dqn_with_rnn_tutorial.py
+++ b/intermediate_source/dqn_with_rnn_tutorial.py
@@ -298,7 +298,7 @@
 # either by passing a string or an action-spec. This allows us to use
 # Categorical (sometimes called "sparse") encoding or the one-hot version of it.
 #
-qval = QValueModule(action_space=env.action_spec)
+qval = QValueModule(spec=env.action_spec)
 
 ######################################################################
 # .. note::
diff --git a/intermediate_source/pinmem_nonblock.py b/intermediate_source/pinmem_nonblock.py
new file mode 100644
index 00000000000..78e57b6cf1b
--- /dev/null
+++ b/intermediate_source/pinmem_nonblock.py
@@ -0,0 +1,728 @@
+# -*- coding: utf-8 -*-
+"""
+A guide on good usage of ``non_blocking`` and ``pin_memory()`` in PyTorch
+=========================================================================
+
+**Author**: `Vincent Moens `_
+
+Introduction
+------------
+
+Transferring data from the CPU to the GPU is fundamental in many PyTorch applications.
+It's crucial for users to understand the most effective tools and options available for moving data between devices.
+This tutorial examines two key methods for device-to-device data transfer in PyTorch:
+:meth:`~torch.Tensor.pin_memory` and :meth:`~torch.Tensor.to` with the ``non_blocking=True`` option.
+
+What you will learn
+~~~~~~~~~~~~~~~~~~~
+
+Optimizing the transfer of tensors from the CPU to the GPU can be achieved through asynchronous transfers and memory
+pinning. However, there are important considerations:
+
+- Using ``tensor.pin_memory().to(device, non_blocking=True)`` can be up to twice as slow as a straightforward ``tensor.to(device)``.
+- Generally, ``tensor.to(device, non_blocking=True)`` is an effective choice for enhancing transfer speed.
+- While ``cpu_tensor.to("cuda", non_blocking=True).mean()`` executes correctly, attempting
+  ``cuda_tensor.to("cpu", non_blocking=True).mean()`` can result in erroneous outputs.
+
+Preamble
+~~~~~~~~
+
+The performance numbers reported in this tutorial are conditioned on the system used to build it.
+Although the conclusions are applicable across different systems, the specific observations may vary slightly
+depending on the hardware available, especially on older hardware.
+The primary objective of this tutorial is to offer a theoretical framework for understanding CPU to GPU data transfers.
+However, any design decisions should be tailored to individual cases and guided by benchmarked throughput measurements,
+as well as the specific requirements of the task at hand.
+
+"""
+
+import torch
+
+assert torch.cuda.is_available(), "A cuda device is required to run this tutorial"
+
+
+######################################################################
+#
+# This tutorial requires tensordict to be installed. If you don't have tensordict in your environment yet, install it
+# by running the following command in a separate cell:
+#
+# .. code-block:: bash
+#
+#    # Install tensordict with the following command
+#    !pip3 install tensordict
+#
+# We start by outlining the theory surrounding these concepts, and then move to concrete test examples of the features.
+#
+#
+# Background
+# ----------
+#
+# .. _pinned_memory_background:
+#
+# Memory management basics
+# ~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# .. _pinned_memory_memory:
+#
+# When one creates a CPU tensor in PyTorch, the content of this tensor needs to be placed
+# in memory. The memory we talk about here is a rather complex concept worth looking at carefully.
+# We distinguish two types of memory that are handled by the Memory Management Unit: the main memory (RAM, for
+# simplicity) and the swap space on disk (which may or may not be the hard drive). Together, the available space on
+# disk and in RAM (the physical memory) makes up the virtual memory, which is an abstraction of the total resources
+# available.
+# In short, virtual memory makes the total available space larger than what can be found in RAM alone, and creates the
+# illusion that the main memory is larger than it actually is.
+#
+# In normal circumstances, a regular CPU tensor is pageable, which means that it is divided into blocks called pages
+# that can live anywhere in the virtual memory (in RAM or on disk). As mentioned earlier, this has the advantage that
+# the memory seems larger than what the main memory actually is.
+#
+# Typically, when a program accesses a page that is not in RAM, a "page fault" occurs and the operating system (OS)
+# then brings back this page into RAM ("swap in" or "page in").
+# In turn, the OS may have to swap out (or "page out") another page to make room for the new page.
+#
+# In contrast to pageable memory, pinned (or page-locked or non-pageable) memory is a type of memory that cannot
+# be swapped out to disk.
+# It allows for faster and more predictable access times, but has the downside of being scarcer than pageable
+# memory.
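+#
+# To make the distinction concrete, here is a minimal sketch (an illustration, not part of the benchmarks below)
+# that checks whether a CPU tensor lives in pageable or page-locked memory using :meth:`~torch.Tensor.is_pinned`:
+#
+# .. code-block:: python
+#
+#    import torch
+#
+#    pageable = torch.randn(10)                 # a regular, pageable CPU tensor
+#    pinned = torch.randn(10, pin_memory=True)  # a page-locked CPU tensor
+#    assert not pageable.is_pinned()
+#    assert pinned.is_pinned()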
+#
+# .. figure:: /_static/img/pinmem/pinmem.png
+#    :alt:
+#
+# CUDA and (non-)pageable memory
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# .. _pinned_memory_cuda_pageable_memory:
+#
+# To understand how CUDA copies a tensor from CPU to CUDA, let's consider the two types of memory introduced above:
+#
+# - If the memory is page-locked, the device can access the data directly in main memory. The memory addresses are
+#   well defined, and functions that need to read this data can be significantly accelerated.
+# - If the memory is pageable, all the pages will have to be brought to the main memory before being sent to the GPU.
+#   This operation may take time and is less predictable than when executed on page-locked tensors.
+#
+# More precisely, when CUDA sends pageable data from CPU to GPU, it must first create a page-locked copy of that data
+# before making the transfer.
+#
+# Asynchronous vs. Synchronous Operations with ``non_blocking=True`` (CUDA ``cudaMemcpyAsync``)
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# .. _pinned_memory_async_sync:
+#
+# When executing a copy from a host (e.g., CPU) to a device (e.g., GPU), the CUDA toolkit offers ways to run these
+# operations synchronously or asynchronously with respect to the host.
+#
+# In practice, when calling :meth:`~torch.Tensor.to`, PyTorch always makes a call to
+# `cudaMemcpyAsync `_.
+# If ``non_blocking=False`` (the default), a ``cudaStreamSynchronize`` will be called after each and every
+# ``cudaMemcpyAsync``, making the call to :meth:`~torch.Tensor.to` blocking in the main thread.
+# If ``non_blocking=True``, no synchronization is triggered, and the main thread on the host is not blocked.
+# Therefore, from the host perspective, multiple tensors can be sent to the device simultaneously,
+# as the thread does not need to wait for one transfer to be completed to initiate the other.
+#
+# .. note:: In general, the transfer is blocking on the device side (even if it isn't on the host side):
+#    the copy on the device cannot occur while another operation is being executed.
+#    However, in some advanced scenarios, a copy and a kernel execution can be done simultaneously on the GPU side.
+#    As the following example will show, three requirements must be met to enable this:
+#
+#    1. The device must have at least one free DMA (Direct Memory Access) engine.
+#       Modern GPU architectures such as Volta, Tesla, or H100 devices have more than one DMA engine.
+#
+#    2. The transfer must be done on a separate, non-default cuda stream. In PyTorch, cuda streams can be handled
+#       using :class:`~torch.cuda.Stream`.
+#
+#    3. The source data must be in pinned memory.
+#
+# We demonstrate this by running profiles on the following script.
+#
+
+import contextlib
+
+from torch.cuda import Stream
+
+
+s = Stream()
+
+torch.manual_seed(42)
+t1_cpu_pinned = torch.randn(1024**2 * 5, pin_memory=True)
+t2_cpu_paged = torch.randn(1024**2 * 5, pin_memory=False)
+t3_cuda = torch.randn(1024**2 * 5, device="cuda:0")
+
+assert torch.cuda.is_available()
+device = torch.device("cuda", torch.cuda.current_device())
+
+
+# The function we want to profile
+def inner(pinned: bool, streamed: bool):
+    with torch.cuda.stream(s) if streamed else contextlib.nullcontext():
+        if pinned:
+            t1_cuda = t1_cpu_pinned.to(device, non_blocking=True)
+        else:
+            t2_cuda = t2_cpu_paged.to(device, non_blocking=True)
+        t_star_cuda_h2d_event = s.record_event()
+        # This operation can be executed during the CPU to GPU copy if and only if the tensor is pinned and the copy is
+        # done in the other stream
+        t3_cuda_mul = t3_cuda * t3_cuda * t3_cuda
+        t3_cuda_mul_event = torch.cuda.current_stream().record_event()
+        t_star_cuda_h2d_event.synchronize()
+        t3_cuda_mul_event.synchronize()
+
+
+# Our profiler: profiles the `inner` function and stores the results in a .json file
+def benchmark_with_profiler(
+    pinned,
+    streamed,
+) -> None:
+    torch._C._profiler._set_cuda_sync_enabled_val(True)
+    wait, warmup, active = 1, 1, 2
+    num_steps = wait + warmup + active
+    with torch.profiler.profile(
+        activities=[
+            torch.profiler.ProfilerActivity.CPU,
+            torch.profiler.ProfilerActivity.CUDA,
+        ],
+        schedule=torch.profiler.schedule(
+            wait=wait, warmup=warmup, active=active, repeat=1, skip_first=1
+        ),
+    ) as prof:
+        for step_idx in range(1, num_steps + 1):
+            inner(streamed=streamed, pinned=pinned)
+            prof.step()
+    prof.export_chrome_trace(f"trace_streamed{int(streamed)}_pinned{int(pinned)}.json")
+
+
+######################################################################
+# Loading these profile traces in chrome (``chrome://tracing``) shows the following results: first, let's see
+# what happens when the arithmetic operation on ``t3_cuda`` is executed after the pageable tensor is sent to the GPU
+# in the main stream:
+#
+
+benchmark_with_profiler(streamed=False, pinned=False)
+
+######################################################################
+# .. figure:: /_static/img/pinmem/trace_streamed0_pinned0.png
+#    :alt:
+#
+# Using a pinned tensor doesn't change the trace much; both operations are still executed consecutively:
+
+benchmark_with_profiler(streamed=False, pinned=True)
+
+######################################################################
+#
+# .. figure:: /_static/img/pinmem/trace_streamed0_pinned1.png
+#    :alt:
+#
+# Sending a pageable tensor to the GPU on a separate stream is also a blocking operation:
+
+benchmark_with_profiler(streamed=True, pinned=False)
+
+######################################################################
+#
+# .. figure:: /_static/img/pinmem/trace_streamed1_pinned0.png
+#    :alt:
+#
+# Only copies of pinned tensors to the GPU on a separate stream overlap with another cuda kernel executed on
+# the main stream:
+
+benchmark_with_profiler(streamed=True, pinned=True)
+
+######################################################################
+#
+# .. figure:: /_static/img/pinmem/trace_streamed1_pinned1.png
+#    :alt:
+#
+# A PyTorch perspective
+# ---------------------
+#
+# .. _pinned_memory_pt_perspective:
+#
+# ``pin_memory()``
+# ~~~~~~~~~~~~~~~~
+#
+# .. _pinned_memory_pinned:
+#
+# PyTorch offers the possibility to create and send tensors to page-locked memory through the
+# :meth:`~torch.Tensor.pin_memory` method and constructor arguments.
+# CPU tensors on a machine where CUDA is initialized can be cast to pinned memory through the
+# :meth:`~torch.Tensor.pin_memory` method. Importantly, ``pin_memory`` is blocking on the main thread of the host:
+# it will wait for the tensor to be copied to page-locked memory before executing the next operation.
+# New tensors can be directly created in pinned memory with functions like :func:`~torch.zeros`, :func:`~torch.ones`
+# and other constructors.
+#
+# Let us check the speed of pinning memory and sending tensors to CUDA:
+
+
+import torch
+import gc
+from torch.utils.benchmark import Timer
+import matplotlib.pyplot as plt
+
+
+def timer(cmd):
+    median = (
+        Timer(cmd, globals=globals())
+        .adaptive_autorange(min_run_time=1.0, max_run_time=20.0)
+        .median
+        * 1000
+    )
+    print(f"{cmd}: {median: 4.4f} ms")
+    return median
+
+
+# A tensor in pageable memory
+pageable_tensor = torch.randn(1_000_000)
+
+# A tensor in page-locked (pinned) memory
+pinned_tensor = torch.randn(1_000_000, pin_memory=True)
+
+# Runtimes:
+pageable_to_device = timer("pageable_tensor.to('cuda:0')")
+pinned_to_device = timer("pinned_tensor.to('cuda:0')")
+pin_mem = timer("pageable_tensor.pin_memory()")
+pin_mem_to_device = timer("pageable_tensor.pin_memory().to('cuda:0')")
+
+# Ratios:
+r1 = pinned_to_device / pageable_to_device
+r2 = pin_mem_to_device / pageable_to_device
+
+# Create a figure with the results
+fig, ax = plt.subplots()
+
+xlabels = [0, 1, 2]
+bar_labels = [
+    "pageable_tensor.to(device) (1x)",
+    f"pinned_tensor.to(device) ({r1:4.2f}x)",
+    f"pageable_tensor.pin_memory().to(device) ({r2:4.2f}x)"
+    f"\npin_memory()={100*pin_mem/pin_mem_to_device:.2f}% of runtime.",
+]
+values = [pageable_to_device, pinned_to_device, pin_mem_to_device]
+colors = ["tab:blue", "tab:red", "tab:orange"]
+ax.bar(xlabels, values, label=bar_labels, color=colors)
+
+ax.set_ylabel("Runtime (ms)")
+ax.set_title("Device casting runtime (pin-memory)")
+ax.set_xticks([])
+ax.legend()
+
+plt.show()
+
+# Clear tensors
+del pageable_tensor, pinned_tensor
+_ = gc.collect()
+
+######################################################################
+#
+# We can observe that casting a pinned-memory tensor to GPU is indeed much faster than a pageable tensor, because under
+# the hood, a pageable tensor must be copied to pinned memory before being sent to GPU.
+#
+# However, contrary to a somewhat common belief, calling :meth:`~torch.Tensor.pin_memory()` on a pageable tensor before
+# casting it to GPU should not bring any significant speed-up; on the contrary, this call is usually slower than just
+# executing the transfer. This makes sense, since we're actually asking Python to execute an operation that CUDA will
+# perform anyway before copying the data from host to device.
+#
+# .. note:: The PyTorch implementation of
+#    `pin_memory `_,
+#    which relies on creating a brand new storage in pinned memory through `cudaHostAlloc `_,
+#    could be, in rare cases, faster than transferring data in chunks as ``cudaMemcpy`` does.
+#    Here too, the observation may vary depending on the available hardware, the size of the tensors being sent, or
+#    the amount of available RAM.
+#
+# ``non_blocking=True``
+# ~~~~~~~~~~~~~~~~~~~~~
+#
+# .. _pinned_memory_non_blocking:
+#
+# As mentioned earlier, many PyTorch operations have the option of being executed asynchronously with respect to the
+# host through the ``non_blocking`` argument.
+#
+# Here, to account accurately for the benefits of using ``non_blocking``, we will design a slightly more complex
+# experiment, since we want to assess how fast it is to send multiple tensors to GPU with and without calling
+# ``non_blocking``.
+#
+
+
+# A simple loop that copies all tensors to cuda
+def copy_to_device(*tensors):
+    result = []
+    for tensor in tensors:
+        result.append(tensor.to("cuda:0"))
+    return result
+
+
+# A loop that copies all tensors to cuda asynchronously
+def copy_to_device_nonblocking(*tensors):
+    result = []
+    for tensor in tensors:
+        result.append(tensor.to("cuda:0", non_blocking=True))
+    # We need to synchronize
+    torch.cuda.synchronize()
+    return result
+
+
+# Create a list of tensors
+tensors = [torch.randn(1000) for _ in range(1000)]
+to_device = timer("copy_to_device(*tensors)")
+to_device_nonblocking = timer("copy_to_device_nonblocking(*tensors)")
+
+# Ratio
+r1 = to_device_nonblocking / to_device
+
+# Plot the results
+fig, ax = plt.subplots()
+
+xlabels = [0, 1]
+bar_labels = [f"to(device) (1x)", f"to(device, non_blocking=True) ({r1:4.2f}x)"]
+colors = ["tab:blue", "tab:red"]
+values = [to_device, to_device_nonblocking]
+
+ax.bar(xlabels, values, label=bar_labels, color=colors)
+
+ax.set_ylabel("Runtime (ms)")
+ax.set_title("Device casting runtime (non-blocking)")
+ax.set_xticks([])
+ax.legend()
+
+plt.show()
+
+
+######################################################################
+# To get a better sense of what is happening here, let us profile these two functions:
+
+
+from torch.profiler import profile, ProfilerActivity
+
+
+def profile_mem(cmd):
+    with profile(activities=[ProfilerActivity.CPU]) as prof:
+        exec(cmd)
+    print(cmd)
+    print(prof.key_averages().table(row_limit=10))
+
+
+######################################################################
+# Let's see the call stack with a regular ``to(device)`` first:
+#
+
+print("Call to `to(device)`", profile_mem("copy_to_device(*tensors)"))
+
+######################################################################
+# and now the ``non_blocking`` version:
+#
+
+print(
+    "Call to `to(device, non_blocking=True)`",
+    profile_mem("copy_to_device_nonblocking(*tensors)"),
+)
+
+
+######################################################################
+# The results are without any doubt better when using ``non_blocking=True``, as all transfers are initiated
+# simultaneously on the host side and only one synchronization is done.
+#
+# The benefit will vary depending on the number and the size of the tensors as well as depending on the hardware being
+# used.
+#
+# .. note:: Interestingly, the blocking ``to("cuda")`` actually performs the same asynchronous device casting operation
+#    (``cudaMemcpyAsync``) as the one with ``non_blocking=True``, with a synchronization point after each copy.
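+#
+# To make the note above concrete, here is a minimal sketch of that equivalence (an illustration of the semantics,
+# not of the actual implementation, where ``t_cpu`` stands for any CPU tensor):
+#
+# .. code-block:: python
+#
+#    # Both calls issue the copy through an asynchronous cudaMemcpyAsync:
+#    t_gpu = t_cpu.to("cuda:0")  # ...followed by an implicit synchronization
+#
+#    t_gpu = t_cpu.to("cuda:0", non_blocking=True)  # no synchronization here...
+#    torch.cuda.synchronize()  # ...so we synchronize explicitly before using t_gpu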
+# Synergies
+# ~~~~~~~~~
+#
+# .. _pinned_memory_synergies:
+#
+# Now that we have made the point that data transfer of tensors already in pinned memory to GPU is faster than from
+# pageable memory, and that we know that doing these transfers asynchronously is also faster than doing them
+# synchronously, we can benchmark combinations of these approaches. First, let's write a couple of new functions that
+# will call ``pin_memory`` and ``to(device)`` on each tensor:
+#
+
+
+def pin_copy_to_device(*tensors):
+    result = []
+    for tensor in tensors:
+        result.append(tensor.pin_memory().to("cuda:0"))
+    return result
+
+
+def pin_copy_to_device_nonblocking(*tensors):
+    result = []
+    for tensor in tensors:
+        result.append(tensor.pin_memory().to("cuda:0", non_blocking=True))
+    # We need to synchronize
+    torch.cuda.synchronize()
+    return result
+
+
+######################################################################
+# The benefits of using :meth:`~torch.Tensor.pin_memory` are more pronounced for
+# somewhat large batches of large tensors:
+#
+
+tensors = [torch.randn(1_000_000) for _ in range(1000)]
+page_copy = timer("copy_to_device(*tensors)")
+page_copy_nb = timer("copy_to_device_nonblocking(*tensors)")
+
+tensors_pinned = [torch.randn(1_000_000, pin_memory=True) for _ in range(1000)]
+pinned_copy = timer("copy_to_device(*tensors_pinned)")
+pinned_copy_nb = timer("copy_to_device_nonblocking(*tensors_pinned)")
+
+pin_and_copy = timer("pin_copy_to_device(*tensors)")
+pin_and_copy_nb = timer("pin_copy_to_device_nonblocking(*tensors)")
+
+# Plot
+strategies = ("pageable copy", "pinned copy", "pin and copy")
+blocking = {
+    "blocking": [page_copy, pinned_copy, pin_and_copy],
+    "non-blocking": [page_copy_nb, pinned_copy_nb, pin_and_copy_nb],
+}
+
+x = torch.arange(3)
+width = 0.25
+multiplier = 0
+
+
+fig, ax = plt.subplots(layout="constrained")
+
+for attribute, runtimes in blocking.items():
+    offset = width * multiplier
+    rects = ax.bar(x + offset, runtimes, width, label=attribute)
+    ax.bar_label(rects, padding=3, fmt="%.2f")
+    multiplier += 1
+
+# Add some text for labels, title and custom x-axis tick labels, etc.
+ax.set_ylabel("Runtime (ms)")
+ax.set_title("Runtime (pin-mem and non-blocking)")
+ax.set_xticks([0, 1, 2])
+ax.set_xticklabels(strategies)
+plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
+ax.legend(loc="upper left", ncols=3)
+
+plt.show()
+
+del tensors, tensors_pinned
+_ = gc.collect()
+
+
+######################################################################
+# Other copy directions (GPU -> CPU, CPU -> MPS)
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# .. _pinned_memory_other_direction:
+#
+# Until now, we have operated under the assumption that asynchronous copies from the CPU to the GPU are safe.
+# This is generally true because CUDA automatically handles synchronization to ensure that the data being accessed is
+# valid at read time.
+# However, this guarantee does not extend to transfers in the opposite direction, from GPU to CPU.
+# Without explicit synchronization, these transfers offer no assurance that the copy will be complete at the time of
+# data access.
+# Consequently, the data on the host might be incomplete or incorrect, effectively rendering it garbage:
+#
+
+
+tensor = (
+    torch.arange(1, 1_000_000, dtype=torch.double, device="cuda")
+    .expand(100, 999999)
+    .clone()
+)
+torch.testing.assert_close(
+    tensor.mean(), torch.tensor(500_000, dtype=torch.double, device="cuda")
+)
+try:
+    i = -1
+    for i in range(100):
+        cpu_tensor = tensor.to("cpu", non_blocking=True)
+        torch.testing.assert_close(
+            cpu_tensor.mean(), torch.tensor(500_000, dtype=torch.double)
+        )
+    print("No test failed with non_blocking")
+except AssertionError:
+    print(f"{i}th test failed with non_blocking. Skipping remaining tests")
+try:
+    i = -1
+    for i in range(100):
+        cpu_tensor = tensor.to("cpu", non_blocking=True)
+        torch.cuda.synchronize()
+        torch.testing.assert_close(
+            cpu_tensor.mean(), torch.tensor(500_000, dtype=torch.double)
+        )
+    print("No test failed with synchronize")
+except AssertionError:
+    print(f"One test failed with synchronize: {i}th assertion!")
+
+
+######################################################################
+# The same considerations apply to copies from the CPU to non-CUDA devices, such as MPS.
+# Generally, asynchronous copies to a device are safe without explicit synchronization only when the target is a
+# CUDA-enabled device.
+#
+# In summary, copying data from CPU to GPU is safe when using ``non_blocking=True``, but for any other direction,
+# ``non_blocking=True`` can still be used, provided that the user makes sure that a device synchronization is executed
+# before the data is accessed.
+#
+# Practical recommendations
+# -------------------------
+#
+# .. _pinned_memory_recommendations:
+#
+# We can now wrap up some early recommendations based on our observations:
+#
+# In general, ``non_blocking=True`` will provide good throughput, regardless of whether the original tensor is or
+# isn't in pinned memory.
+# If the tensor is already in pinned memory, the transfer can be accelerated, but sending it to
+# pinned memory manually from the Python main thread is a blocking operation on the host, and hence will annihilate
+# much of the benefit of using ``non_blocking=True`` (as CUDA does the ``pin_memory`` transfer anyway).
+#
+# One might now legitimately ask what use there is for the :meth:`~torch.Tensor.pin_memory` method.
+# In the following section, we will explore further how this can be used to accelerate the data transfer even more.
+#
+# Additional considerations
+# -------------------------
+#
+# .. _pinned_memory_considerations:
+#
+# PyTorch famously provides a :class:`~torch.utils.data.DataLoader` class whose constructor accepts a
+# ``pin_memory`` argument.
+# Considering our previous discussion on ``pin_memory``, you might wonder how the ``DataLoader`` manages to
+# accelerate data transfers if memory pinning is inherently blocking.
+#
+# The key lies in the DataLoader's use of a separate thread to handle the transfer of data from pageable to pinned
+# memory, thus preventing any blockage in the main thread.
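+#
+# As a minimal sketch of how this is used in practice (the dataset and sizes here are illustrative placeholders),
+# enabling pinning in a ``DataLoader`` only requires the constructor argument:
+#
+# .. code-block:: python
+#
+#    from torch.utils.data import DataLoader, TensorDataset
+#
+#    dataset = TensorDataset(torch.randn(1000, 128))
+#    # A dedicated thread moves each batch to pinned memory in the background,
+#    # so the main thread can launch non-blocking host-to-device copies.
+#    loader = DataLoader(dataset, batch_size=64, pin_memory=True)
+#    for (batch,) in loader:
+#        batch = batch.to("cuda:0", non_blocking=True)
+#
+# To illustrate this, we will use the TensorDict primitive from the homonymous library.
+# When invoking :meth:`~tensordict.TensorDict.to`, the default behavior is to send tensors to the device
+# asynchronously, followed by a single synchronization call afterwards.
+# Additionally, ``TensorDict.to()`` includes a ``non_blocking_pin`` option which initiates multiple threads to execute
+# ``pin_memory()`` before proceeding with the call to ``to(device)``.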
+# This approach can further accelerate data transfers, as demonstrated in the following example.
+#
+#
+
+from tensordict import TensorDict
+import torch
+from torch.utils.benchmark import Timer
+import matplotlib.pyplot as plt
+
+# Create the dataset
+td = TensorDict({str(i): torch.randn(1_000_000) for i in range(1000)})
+
+# Runtimes
+copy_blocking = timer("td.to('cuda:0', non_blocking=False)")
+copy_non_blocking = timer("td.to('cuda:0')")
+copy_pin_nb = timer("td.to('cuda:0', non_blocking_pin=True, num_threads=0)")
+copy_pin_multithread_nb = timer("td.to('cuda:0', non_blocking_pin=True, num_threads=4)")
+
+# Ratios
+r1 = copy_non_blocking / copy_blocking
+r2 = copy_pin_nb / copy_blocking
+r3 = copy_pin_multithread_nb / copy_blocking
+
+# Figure
+fig, ax = plt.subplots()
+
+xlabels = [0, 1, 2, 3]
+bar_labels = [
+    "Blocking copy (1x)",
+    f"Non-blocking copy ({r1:4.2f}x)",
+    f"Blocking pin, non-blocking copy ({r2:4.2f}x)",
+    f"Non-blocking pin, non-blocking copy ({r3:4.2f}x)",
+]
+values = [copy_blocking, copy_non_blocking, copy_pin_nb, copy_pin_multithread_nb]
+colors = ["tab:blue", "tab:red", "tab:orange", "tab:green"]
+
+ax.bar(xlabels, values, label=bar_labels, color=colors)
+
+ax.set_ylabel("Runtime (ms)")
+ax.set_title("Device casting runtime")
+ax.set_xticks([])
+ax.legend()
+
+plt.show()
+
+######################################################################
+# In this example, we are transferring many large tensors from the CPU to the GPU.
+# This scenario is ideal for utilizing multithreaded ``pin_memory()``, which can significantly enhance performance.
+# However, if the tensors are small, the overhead associated with multithreading may outweigh the benefits.
+# Similarly, if there are only a few tensors, the advantages of pinning tensors on separate threads become limited.
+#
+# As an additional note, while it might seem advantageous to create permanent buffers in pinned memory to shuttle
+# tensors from pageable memory before transferring them to the GPU, this strategy does not necessarily expedite
+# computation. The inherent bottleneck caused by copying data into pinned memory remains a limiting factor.
+#
+# Moreover, transferring data that resides on disk (whether in shared memory or files) to the GPU typically requires an
+# intermediate step of copying the data into pinned memory (located in RAM).
+# Utilizing ``non_blocking`` for large data transfers in this context can significantly increase RAM consumption,
+# potentially leading to adverse effects.
+#
+# In practice, there is no one-size-fits-all solution.
+# The effectiveness of using multithreaded ``pin_memory`` combined with ``non_blocking`` transfers depends on a
+# variety of factors, including the specific system, operating system, hardware, and the nature of the tasks
+# being executed.
+# Here is a list of factors to check when trying to speed up data transfers between CPU and GPU, or when comparing
+# throughputs across scenarios:
+#
+# - **Number of available cores**
+#
+#   How many CPU cores are available? Is the system shared with other users or processes that might compete for
+#   resources?
+#
+# - **Core utilization**
+#
+#   Are the CPU cores heavily utilized by other processes? Does the application perform other CPU-intensive tasks
+#   concurrently with data transfers?
+#
+# - **Memory utilization**
+#
+#   How much pageable and page-locked memory is currently being used? Is there sufficient free memory to allocate
+#   additional pinned memory without affecting system performance?
+#   Remember that nothing comes for free: for instance, ``pin_memory`` will consume RAM and may impact other tasks.
+#
+# - **CUDA Device Capabilities**
+#
+#   Does the GPU support multiple DMA engines for concurrent data transfers? What are the specific capabilities and
+#   limitations of the CUDA device being used?
+#
+# - **Number of tensors to be sent**
+#
+#   How many tensors are transferred in a typical operation?
+#
+# - **Size of the tensors to be sent**
+#
+#   What is the size of the tensors being transferred? A few large tensors and many small tensors may not benefit
+#   from the same transfer strategy.
+#
+# - **System Architecture**
+#
+#   How is the system's architecture influencing data transfer speeds (for example, bus speeds, network latency)?
+#
+# Additionally, allocating a large number of tensors or sizable tensors in pinned memory can monopolize a substantial
+# portion of RAM.
+# This reduces the available memory for other critical operations, such as paging, which can negatively impact the
+# overall performance of an algorithm.
+#
+# Conclusion
+# ----------
+#
+# .. _pinned_memory_conclusion:
+#
+# Throughout this tutorial, we have explored several critical factors that influence transfer speeds and memory
+# management when sending tensors from the host to the device. We've learned that using ``non_blocking=True`` generally
+# accelerates data transfers, and that :meth:`~torch.Tensor.pin_memory` can also enhance performance if implemented
+# correctly. However, these techniques require careful design and calibration to be effective.
+#
+# Remember that profiling your code and keeping an eye on the memory consumption are essential to optimizing resource
+# usage and achieving the best possible performance.
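+#
+# As a parting example, here is a minimal sketch (assuming a CUDA device and a list of CPU tensors) that puts the
+# recommendations above together: issue all host-to-device copies asynchronously, then synchronize once before the
+# results are used:
+#
+# .. code-block:: python
+#
+#    def transfer_all(tensors, device="cuda:0"):
+#        # Launch every copy without blocking the host thread
+#        result = [t.to(device, non_blocking=True) for t in tensors]
+#        # A single synchronization before the data is accessed
+#        torch.cuda.synchronize()
+#        return result
+#
+# Additional resources
+# --------------------
+#
+# .. _pinned_memory_resources:
+#
+# If you are dealing with issues with memory copies when using CUDA devices or want to learn more about
+# what was discussed in this tutorial, check the following references:
+#
+# - `CUDA toolkit memory management doc `_;
+# - `CUDA pin-memory note `_;
+# - `How to Optimize Data Transfers in CUDA C/C++ `_;
+# - `tensordict doc `_ and `repo `_.
+#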