HTA tutorial

anupamb · anupamb · commit 60cee891f3c1 · 2023-12-26T19:24:15.000-08:00
diff --git a/beginner_source/hta_intro.py b/beginner_source/hta_intro.py
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+"""
+Introduction to Holistic Trace Analysis
+---------------------------------------
+**Author:** `Anupam Bhatnagar <https://github.com/anupambhatnagar>`_
+
+.. note::
+    Visualizations are disabled (``visualize=False``) to keep the notebook
+    size small. When running the notebook locally, pass ``visualize=True`` to
+    the analysis calls to display the plots.
+
+"""
+
+##############################################################
+# Setup and loading traces
+# ~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# In this demo we analyze the traces from a distributed training job which used 8 GPUs. To run the code on your laptop:
+# 
+# 1) Install Holistic Trace Analysis via pip: ``pip install HolisticTraceAnalysis``
+# 2) [Optional and recommended] Set up a conda environment. See here for details.
+# 3) Edit the ``hta_install_dir`` variable below to point to the folder of your local ``HolisticTraceAnalysis`` installation.
+
+from hta.trace_analysis import TraceAnalysis
+hta_install_dir = "/path/to/HolisticTraceAnalysis"
+trace_dir = hta_install_dir + "/tests/data/vision_transformer/"
+analyzer = TraceAnalysis(trace_dir=trace_dir)
+
+
+##############################################################
+# Temporal Breakdown
+# ~~~~~~~~~~~~~~~~~~
+# 
+# The temporal breakdown feature gives a breakdown of time spent by the GPU as follows:
+# 
+# 1) Idle time - GPU idle
+# 2) Compute time - GPU busy with computation events
+# 3) Non-compute time - GPU busy with communication or memory events
+
+time_spent_df = analyzer.get_temporal_breakdown(visualize=False)
+print(time_spent_df)
+
+
+##############################################################
+# Kernel Breakdown
+# ~~~~~~~~~~~~~~~~
+#
+# This feature computes the following:
+#
+# 1) Breakdown of time spent among kernel types (Computation, Communication, Memory) across all ranks.
+# 2) Kernels taking the most time on each rank by kernel type.
+# 3) Distribution of average time across ranks for the kernels taking the most time.
diff --git a/index.rst b/index.rst
@@ -378,6 +378,17 @@ What's new in PyTorch tutorials?
    :link: advanced/super_resolution_with_onnxruntime.html
    :tags: Production,ONNX
 
+.. customcarditem::
+   :header: Profiling PyTorch
+   :card_description: Learn how to profile a PyTorch application
+   :link: beginner/profiler.html
+   :tags: Profiling
+
+.. customcarditem::
+   :header: Profiling PyTorch
+   :card_description: Introduction to Holistic Trace Analysis
+   :link: beginner/hta_intro.html
+   :tags: Profiling
 
 .. Code Transformations with FX
 
@@ -993,6 +1004,15 @@ Additional Resources
    advanced/super_resolution_with_onnxruntime
    intermediate/realtime_rpi
 
+.. toctree::
+   :maxdepth: 2
+   :includehidden:
+   :hidden:
+   :caption: Profiling PyTorch
+
+   beginner/profiler
+   beginner/hta_intro
+
 .. toctree::
    :maxdepth: 2
    :includehidden:
diff --git a/requirements.txt b/requirements.txt
@@ -20,7 +20,7 @@ bs4
 awscliv2==2.1.1
 flask
 spacy==3.4.1
-ray[tune]==2.4.0
+#ray[tune]==2.4.0
 tensorboard
 jinja2==3.0.3
 pytorch-lightning
@@ -32,9 +32,9 @@ nbformat>=4.2.0
 datasets
 transformers
 torchmultimodal-nightly # needs to be updated to stable as soon as it's avaialable
-onnx
-onnxscript
-onnxruntime
+#onnx
+#onnxscript
+#onnxruntime
 
 importlib-metadata==6.8.0
 
@@ -58,6 +58,6 @@ pyopengl
 gymnasium[mujoco]==0.27.0
 timm
 iopath
-pygame==2.1.2
+#pygame==2.1.2
 pycocotools
 semilearn==0.3.2