diff --git a/_static/img/hta/comm_across_ranks.png b/_static/img/hta/comm_across_ranks.png new file mode 100644 index 00000000000..2336de3bcbc Binary files /dev/null and b/_static/img/hta/comm_across_ranks.png differ diff --git a/_static/img/hta/counts_diff.png b/_static/img/hta/counts_diff.png new file mode 100644 index 00000000000..34575c145de Binary files /dev/null and b/_static/img/hta/counts_diff.png differ diff --git a/_static/img/hta/cuda_kernel_launch.png b/_static/img/hta/cuda_kernel_launch.png new file mode 100644 index 00000000000..e57c54a2fc5 Binary files /dev/null and b/_static/img/hta/cuda_kernel_launch.png differ diff --git a/_static/img/hta/cuda_kernel_launch_stats.png b/_static/img/hta/cuda_kernel_launch_stats.png new file mode 100644 index 00000000000..33a160fc752 Binary files /dev/null and b/_static/img/hta/cuda_kernel_launch_stats.png differ diff --git a/_static/img/hta/duration_diff.png b/_static/img/hta/duration_diff.png new file mode 100644 index 00000000000..050d491c872 Binary files /dev/null and b/_static/img/hta/duration_diff.png differ diff --git a/_static/img/hta/idle_time.png b/_static/img/hta/idle_time.png new file mode 100644 index 00000000000..782bfe9adb5 Binary files /dev/null and b/_static/img/hta/idle_time.png differ diff --git a/_static/img/hta/idle_time_breakdown_percentage.png b/_static/img/hta/idle_time_breakdown_percentage.png new file mode 100644 index 00000000000..3bab5946eab Binary files /dev/null and b/_static/img/hta/idle_time_breakdown_percentage.png differ diff --git a/_static/img/hta/idle_time_summary.png b/_static/img/hta/idle_time_summary.png new file mode 100644 index 00000000000..101b696b534 Binary files /dev/null and b/_static/img/hta/idle_time_summary.png differ diff --git a/_static/img/hta/kernel_metrics_df.png b/_static/img/hta/kernel_metrics_df.png new file mode 100644 index 00000000000..53eefb58b0c Binary files /dev/null and b/_static/img/hta/kernel_metrics_df.png differ diff --git 
a/_static/img/hta/kernel_type_breakdown.png b/_static/img/hta/kernel_type_breakdown.png new file mode 100644 index 00000000000..29a29cf89b2 Binary files /dev/null and b/_static/img/hta/kernel_type_breakdown.png differ diff --git a/_static/img/hta/launch_delay_outliers.png b/_static/img/hta/launch_delay_outliers.png new file mode 100644 index 00000000000..9bb455adea4 Binary files /dev/null and b/_static/img/hta/launch_delay_outliers.png differ diff --git a/_static/img/hta/mem_bandwidth_queue_length.png b/_static/img/hta/mem_bandwidth_queue_length.png new file mode 100644 index 00000000000..9df5383b5d9 Binary files /dev/null and b/_static/img/hta/mem_bandwidth_queue_length.png differ diff --git a/_static/img/hta/overlap_df.png b/_static/img/hta/overlap_df.png new file mode 100644 index 00000000000..ef164a28a12 Binary files /dev/null and b/_static/img/hta/overlap_df.png differ diff --git a/_static/img/hta/overlap_plot.png b/_static/img/hta/overlap_plot.png new file mode 100644 index 00000000000..acd449bc7ff Binary files /dev/null and b/_static/img/hta/overlap_plot.png differ diff --git a/_static/img/hta/pie_charts.png b/_static/img/hta/pie_charts.png new file mode 100644 index 00000000000..fa9137109a6 Binary files /dev/null and b/_static/img/hta/pie_charts.png differ diff --git a/_static/img/hta/queue_length_summary.png b/_static/img/hta/queue_length_summary.png new file mode 100644 index 00000000000..639a03fb6d1 Binary files /dev/null and b/_static/img/hta/queue_length_summary.png differ diff --git a/_static/img/hta/runtime_outliers.png b/_static/img/hta/runtime_outliers.png new file mode 100644 index 00000000000..1e2dfff9006 Binary files /dev/null and b/_static/img/hta/runtime_outliers.png differ diff --git a/_static/img/hta/short_gpu_kernels.png b/_static/img/hta/short_gpu_kernels.png new file mode 100644 index 00000000000..ff382a3a7f0 Binary files /dev/null and b/_static/img/hta/short_gpu_kernels.png differ diff --git a/_static/img/hta/temporal_breakdown_df.png 
b/_static/img/hta/temporal_breakdown_df.png new file mode 100644 index 00000000000..dce1829d113 Binary files /dev/null and b/_static/img/hta/temporal_breakdown_df.png differ diff --git a/_static/img/hta/temporal_breakdown_plot.png b/_static/img/hta/temporal_breakdown_plot.png new file mode 100644 index 00000000000..9c5f45c1d35 Binary files /dev/null and b/_static/img/hta/temporal_breakdown_plot.png differ diff --git a/beginner_source/hta_intro_tutorial.rst b/beginner_source/hta_intro_tutorial.rst new file mode 100644 index 00000000000..6d9dd9bfbd8 --- /dev/null +++ b/beginner_source/hta_intro_tutorial.rst @@ -0,0 +1,384 @@ +Introduction to Holistic Trace Analysis +======================================= +**Author:** `Anupam Bhatnagar `_ + +Setup +----- + +In this tutorial we demonstrate how to use Holistic Trace Analysis (HTA) to +analyze traces from a distributed training job. To get started follow the steps +below: + +Installing HTA +^^^^^^^^^^^^^^ + +We recommend using a Conda environment to install HTA. To install Anaconda, see +`here `_. + +1) Install HTA using pip + +.. code-block:: python + + pip install HolisticTraceAnalysis + +2) [Optional and recommended] Setup a conda environment + +.. code-block:: python + + # create the environment env_name + conda create -n env_name + + # activate the environment + conda activate env_name + + # deactivate the environment + conda deactivate + +Getting started +^^^^^^^^^^^^^^^ + +Launch a jupyter notebook and set the ``trace_dir`` variable to the location of the traces. + +.. code-block:: python + + from hta.trace_analysis import TraceAnalysis + trace_dir = "/path/to/folder/with/traces" + analyzer = TraceAnalysis(trace_dir=trace_dir) + + +Temporal Breakdown +------------------ + +To best utilize the GPUs it is vital to understand where the GPU is spending +time for a given job. Is the GPU spending time on computation, communication, +memory events, or is it idle? 
The temporal breakdown feature breaks down the +time spent in three categories + +1) Idle time - GPU is idle. +2) Compute time - GPU is being used for matrix multiplications or vector operations. +3) Non-compute time - GPU is being used for communication or memory events. + +To achieve high training efficiency the code should maximize compute time and +minimize idle time and non-compute time. The function below returns +a dataframe containing the temporal breakdown for each rank. + +.. code-block:: python + + analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder") + time_spent_df = analyzer.get_temporal_breakdown() + + +.. image:: ../_static/img/hta/temporal_breakdown_df.png + +When the ``visualize`` argument is set to True in the `get_temporal_breakdown +`_ +function it also generates a bar graph representing the breakdown by rank. + +.. image:: ../_static/img/hta/temporal_breakdown_plot.png + + +Idle Time Breakdown +------------------- +Understanding how much time the GPU is idle and its causes can help direct +optimization strategies. A GPU is considered idle when no kernel is running on +it. We developed an algorithm to categorize the Idle time into 3 categories: + +#. Host wait: is the idle duration on the GPU due to the CPU not enqueuing + kernels fast enough to keep the GPU busy. These kinds of inefficiencies can + be resolved by examining the CPU operators that are contributing to the slow + down, increasing the batch size and applying operator fusion. + +#. Kernel wait: constitutes the short overhead to launch consecutive kernels on + the GPU. The idle time attributed to this category can be minimized by using + CUDA Graph optimizations. + +#. Other wait: Lastly, this category includes idle time we could not currently + attribute due to insufficient information. The likely causes include + synchronization among CUDA streams using CUDA events and delays in launching + kernels. 
+ +The host wait time can be interpreted as the time when the GPU is stalling due +to the CPU. To attribute the idle time as kernel wait we use the following +heuristic: + + | **gap between consecutive kernels < threshold** + +The default threshold value is 30 nanoseconds and can be configured using the +``consecutive_kernel_delay`` argument. By default, the idle time breakdown is +computed for rank 0 only. In order to calculate the breakdown for other ranks, +use the ``ranks`` argument in the `get_idle_time_breakdown +`_ +function. The idle time breakdown can be generated as follows: + +.. code-block:: python + + analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder") + idle_time_df = analyzer.get_idle_time_breakdown() + +.. image:: ../_static/img/hta/idle_time_breakdown_percentage.png + +The function returns a tuple of dataframes. The first dataframe contains the +idle time by category on each stream for each rank. + +.. image:: ../_static/img/hta/idle_time.png + :scale: 100% + :align: center + +The second dataframe is generated when ``show_idle_interval_stats`` is set to +``True``. It contains the summary statistics of the idle time for each stream +on each rank. + +.. image:: ../_static/img/hta/idle_time_summary.png + :scale: 100% + +.. tip:: + By default, the idle time breakdown presents the percentage of each of the + idle time categories. Setting the ``visualize_pctg`` argument to ``False``, + the function renders with absolute time on the y-axis. + + +Kernel Breakdown +---------------- + +The kernel breakdown feature breaks down the time spent for each kernel type +i.e. communication (COMM), computation (COMP), and memory (MEM) across all +ranks and presents the proportion of time spent in each category. The +percentage of time spent in each category is presented as a pie chart. + +.. image:: ../_static/img/hta/kernel_type_breakdown.png + :align: center + +The kernel breakdown can be calculated as follows: + +.. 
code-block:: python + + analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder") + kernel_type_metrics_df, kernel_metrics_df = analyzer.get_gpu_kernel_breakdown() + +The first dataframe returned by the function contains the raw values used to +generate the Pie chart. + +Kernel Duration Distribution +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The second dataframe returned by `get_gpu_kernel_breakdown +`_ +contains duration summary statistics for each kernel. In particular, this +includes the count, min, max, average, standard deviation, sum and kernel type +for each kernel on each rank. + +.. image:: ../_static/img/hta/kernel_metrics_df.png + :align: center + +Using this data HTA creates many visualizations to identify performance +bottlenecks. + +#. Pie charts of the top kernels for each kernel type for each rank. + +#. Bar graphs of the average duration across all ranks for each of the top + kernels and for each kernel type. + +.. image:: ../_static/img/hta/pie_charts.png + +.. tip:: + All images are generated using plotly. Hovering on the graph shows the + mode bar on the top right which allows the user to zoom, pan, select and + download the graph. + +The pie charts above show the top 5 computation, communication and memory +kernels. Similar pie charts are generated for each rank. The pie charts can be +configured to show the top k kernels using the ``num_kernels`` argument passed +to the `get_gpu_kernel_breakdown` function. Additionally, the +``duration_ratio`` argument can be used to tune the percentage of time that +needs to be analyzed. If both ``num_kernels`` and ``duration_ratio`` are +specified, then ``num_kernels`` takes precedence. + +.. image:: ../_static/img/hta/comm_across_ranks.png + +The bar graph above shows the average duration of the NCCL AllReduce kernel +across all the ranks. The black lines indicate the minimum and maximum time +taken on each rank. + +.. 
warning:: + When using jupyter-lab set the "image_renderer" argument value to + "jupyterlab" otherwise the graphs will not render in the notebook. + +For a detailed walkthrough of this feature see the `gpu_kernel_breakdown +notebook +`_ +in the examples folder of the repo. + + +Communication Computation Overlap +--------------------------------- + +In distributed training a significant amount of time is spent in communication +and synchronization events between GPUs. To achieve high GPU efficiency (i.e. +TFLOPS/GPU) it is vital to keep the GPU oversubscribed with computation +kernels. In other words, the GPU should not be blocked due to unresolved data +dependencies. One way to measure the extent to which computation is blocked by +data dependencies is to calculate the communication computation overlap. Higher +GPU efficiency is observed if communication events overlap computation events. +Lack of communication and computation overlap will lead to the GPU being idle, +thus the efficiency would be low. To sum up, higher communication computation +overlap is desirable. To calculate the overlap percentage for each rank we +measure the following ratio: + + | **(time spent in computation while communicating) / (time spent in communication)** + +Communication computation overlap can be calculated as follows: + +.. code-block:: python + + analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder") + overlap_df = analyzer.get_comm_comp_overlap() + +The function returns a dataframe containing the overlap percentage +for each rank. + +.. image:: ../_static/img/hta/overlap_df.png + :align: center + :scale: 50% + +When the ``visualize`` argument is set to True, the `get_comm_comp_overlap +`_ +function also generates a bar graph representing the overlap by rank. + +.. 
image:: ../_static/img/hta/overlap_plot.png + + +Augmented Counters +------------------ + +Memory Bandwidth & Queue Length Counters +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Memory bandwidth counters measure the memory copy bandwidth used while copying +the data from H2D, D2H and D2D by memory copy (memcpy) and memory set (memset) +events. HTA also computes the number of outstanding operations on each CUDA +stream. We refer to this as **queue length**. When the queue length on a stream +is 1024 or larger new events cannot be scheduled on that stream and the CPU +will stall until the events on the GPU stream have processed. + +The `generate_trace_with_counters +`_ +API outputs a new trace file with the memory bandwidth and queue length +counters. The new trace file contains tracks which indicate the memory +bandwidth used by memcpy/memset operations and tracks for the queue length on +each stream. By default, these counters are generated using the rank 0 +trace file and the new file contains the suffix ``_with_counters`` in its name. +Users have the option to generate the counters for multiple ranks by using the +``ranks`` argument in the `generate_trace_with_counters` API. + +.. code-block:: python + + analyzer = TraceAnalysis(trace_dir = "/path/to/trace/folder") + analyzer.generate_trace_with_counters() + +A screenshot of the generated trace file with augmented counters. + +.. image:: ../_static/img/hta/mem_bandwidth_queue_length.png + :scale: 100% + +HTA also provides a summary of the memory copy bandwidth and queue length +counters as well as the time series of the counters for the profiled portion of +the code using the following API: + +#. `get_memory_bw_summary + `_ + +#. `get_queue_length_summary + `_ + +#. `get_memory_bw_time_series + `_ + +#. `get_queue_length_time_series + `_ + +To view the summary and time series use: + +.. 
code-block:: python + + # generate summary + mem_bw_summary = analyzer.get_memory_bw_summary() + queue_len_summary = analyzer.get_queue_length_summary() + + # get time series + mem_bw_series = analyzer.get_memory_bw_time_series() + queue_len_series = analyzer.get_queue_length_time_series() + +The summary contains the count, min, max, mean, standard deviation, 25th, 50th, +and 75th percentile. + +.. image:: ../_static/img/hta/queue_length_summary.png + :scale: 100% + :align: center + +The time series only contains the points when a value changes. Once a value is +observed the time series stays constant until the next update. The memory +bandwidth and queue length time series functions return a dictionary whose key +is the rank and the value is the time series for that rank. By default, the +time series is computed for rank 0 only. + + +CUDA Kernel Launch Statistics +----------------------------- + +.. image:: ../_static/img/hta/cuda_kernel_launch.png + +For each event launched on the GPU there is a corresponding scheduling event on +the CPU e.g. CudaLaunchKernel, CudaMemcpyAsync, CudaMemsetAsync. These events +are linked by a common correlation id in the trace. See figure above. This +feature computes the duration of the CPU runtime event, its corresponding GPU +kernel and the launch delay i.e. the difference between GPU kernel starting and +CPU operator ending. The kernel launch info can be generated as follows: + +.. code-block:: python + + analyzer = TraceAnalysis(trace_dir="/path/to/trace/dir") + kernel_info_df = analyzer.get_cuda_kernel_launch_stats() + +A screenshot of the generated dataframe is given below. + +.. image:: ../_static/img/hta/cuda_kernel_launch_stats.png + :scale: 100% + :align: center + +The duration of the CPU op, GPU kernel and the launch delay allows us to find: + +#. **Short GPU kernels** - GPU kernels with duration less than the + corresponding CPU runtime event. + +#. **Runtime event outliers** - CPU runtime events with excessive duration. 
+ +#. **Launch delay outliers** - GPU kernels which take too long to be scheduled. + +HTA generates distribution plots for each of the aforementioned three categories. + + +**Short GPU kernels** + +Usually, the launch time on the CPU side is between 5-20 microseconds. In some +cases the GPU execution time is lower than the launch time itself. The graph +below allows us to find how frequently such instances appear in the code. + +.. image:: ../_static/img/hta/short_gpu_kernels.png + + +**Runtime event outliers** + +The runtime outliers depend on the cutoff used to classify the outliers, hence +the `get_cuda_kernel_launch_stats +`_ +API provides the ``runtime_cutoff`` argument to configure the value. + +.. image:: ../_static/img/hta/runtime_outliers.png + +**Launch delay outliers** + +The launch delay outliers depend on the cutoff used to classify the outliers, +hence the `get_cuda_kernel_launch_stats` API provides the +``launch_delay_cutoff`` argument to configure the value. + +.. image:: ../_static/img/hta/launch_delay_outliers.png diff --git a/beginner_source/hta_trace_diff_tutorial.rst b/beginner_source/hta_trace_diff_tutorial.rst new file mode 100644 index 00000000000..77ac398d625 --- /dev/null +++ b/beginner_source/hta_trace_diff_tutorial.rst @@ -0,0 +1,67 @@ +Trace Diff using Holistic Trace Analysis +======================================== +**Author:** `Anupam Bhatnagar `_ + + +Occasionally, users need to identify the changes in PyTorch operators and CUDA +kernels resulting from a code change. To support such a requirement, HTA +provides a trace comparison feature. This feature allows the user to input two +sets of trace files where the first can be thought of as the *control group* +and the second as the *test group* as in an A/B test. The ``TraceDiff`` class +provides functions to compare the differences between traces and functionality +to visualize these differences. 
In particular, users can find operators and +kernels which were added and removed from each group along with the frequency +of each operator/kernel and the cumulative time taken by the operator/kernel. +The `TraceDiff `_ class has 4 methods: + +#. `compare_traces + `_ - + Compare the frequency and total duration of CPU operators and GPU kernels from + two sets of traces. + +#. `ops_diff `_ - + Get the operators and kernels which have been: + + #. **added** to the test trace and are absent in the control trace + #. **deleted** from the test trace and are present in the control trace + #. **increased** in frequency in the test trace and exist in the control trace + #. **decreased** in frequency in the test trace and exist in the control trace + #. **unchanged** between the two sets of traces + +#. `visualize_counts_diff + `_ + +#. `visualize_duration_diff + `_ + +The last two methods can be used to visualize various changes in frequency and +duration of CPU operators and GPU kernels, using the output of the +``compare_traces`` method. + +For example, the top 10 operators with increase in frequency can be computed as +follows: + +.. code-block:: python + + df = compare_traces_output.sort_values(by="diff_counts", ascending=False).head(10) + TraceDiff.visualize_counts_diff(df) + +.. image:: ../_static/img/hta/counts_diff.png + +Similarly, the top 10 ops with the largest change in duration can be computed as +follows: + +.. code-block:: python + + df = compare_traces_output.sort_values(by="diff_duration", ascending=False) + # The duration difference can be overshadowed by the "ProfilerStep", + # so we can filter it out to show the trend of other operators. + df = df.loc[~df.index.str.startswith("ProfilerStep")].head(10) + TraceDiff.visualize_duration_diff(df) + +.. image:: ../_static/img/hta/duration_diff.png + +For a detailed example of this feature see the `trace_diff_demo notebook +`_ +in the examples folder of the repository. 
+ diff --git a/en-wordlist.txt b/en-wordlist.txt index 4f9020a86e8..4a6538b04da 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -554,3 +554,15 @@ pre otsu rgb histologically +Conda +CudaLaunchKernel +CudaMemcpyAsync +CudaMemsetAsync +HTA +bw +conda +cuda +enqueuing +jupyter +memcpy +memset diff --git a/index.rst b/index.rst index 842635efbfd..de3aace6284 100644 --- a/index.rst +++ b/index.rst @@ -378,6 +378,23 @@ What's new in PyTorch tutorials? :link: advanced/super_resolution_with_onnxruntime.html :tags: Production,ONNX +.. customcarditem:: + :header: Profiling PyTorch + :card_description: Learn how to profile a PyTorch application + :link: beginner/profiler.html + :tags: Profiling + +.. customcarditem:: + :header: Profiling PyTorch + :card_description: Introduction to Holistic Trace Analysis + :link: beginner/hta_intro_tutorial.html + :tags: Profiling + +.. customcarditem:: + :header: Profiling PyTorch + :card_description: Trace Diff using Holistic Trace Analysis + :link: beginner/hta_trace_diff_tutorial.html + :tags: Profiling .. Code Transformations with FX @@ -993,6 +1010,16 @@ Additional Resources advanced/super_resolution_with_onnxruntime intermediate/realtime_rpi +.. toctree:: + :maxdepth: 2 + :includehidden: + :hidden: + :caption: Profiling PyTorch + + beginner/profiler + beginner/hta_intro_tutorial + beginner/hta_trace_diff_tutorial + .. toctree:: :maxdepth: 2 :includehidden: