From d0806c8762d2f7cbc880fe9d54b269f1281e59ff Mon Sep 17 00:00:00 2001
From: wz337
Date: Thu, 27 Jun 2024 15:30:27 -0700
Subject: [PATCH 1/3] update device mesh to include slicing

---
 recipes_source/distributed_device_mesh.rst | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/recipes_source/distributed_device_mesh.rst b/recipes_source/distributed_device_mesh.rst
index dbc4a810434..5e46c68fc99 100644
--- a/recipes_source/distributed_device_mesh.rst
+++ b/recipes_source/distributed_device_mesh.rst
@@ -148,6 +148,26 @@ Then, run the following `torch elastic/torchrun
Date: Thu, 27 Jun 2024 17:02:37 -0700
Subject: [PATCH 2/3] Update recipes_source/distributed_device_mesh.rst

Co-authored-by: Svetlana Karslioglu
---
 recipes_source/distributed_device_mesh.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes_source/distributed_device_mesh.rst b/recipes_source/distributed_device_mesh.rst
index 5e46c68fc99..c2b2c7a99f8 100644
--- a/recipes_source/distributed_device_mesh.rst
+++ b/recipes_source/distributed_device_mesh.rst
@@ -150,7 +150,7 @@ Then, run the following `torch elastic/torchrun
Date: Tue, 9 Jul 2024 14:29:42 -0700
Subject: [PATCH 3/3] Update distributed_device_mesh.rst

---
 recipes_source/distributed_device_mesh.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes_source/distributed_device_mesh.rst b/recipes_source/distributed_device_mesh.rst
index c2b2c7a99f8..08ee247db0a 100644
--- a/recipes_source/distributed_device_mesh.rst
+++ b/recipes_source/distributed_device_mesh.rst
@@ -163,7 +163,7 @@ DeviceMesh allows users to slice child mesh from the parent mesh and re-use the
     tp_mesh = mesh_3d["tp"]
 
     # Users can access the underlying process group thru `get_group` API.
-    replicate_group = hsdp_mesh["Replicate"].get_group()
+    replicate_group = hsdp_mesh["replicate"].get_group()
     shard_group = hsdp_mesh["Shard"].get_group()
     tp_group = tp_mesh.get_group()
 