
Commit 9b3a56e

rohan-varma authored and pytorchmergebot committed
[Optimizer Overlap] Move hooks to own file (#71601)
Summary:
Pull Request resolved: #71601

Moves current prototype optimizer overlap to its own file for a better namespace. No code changes besides a few comment fixes. Note that this code is still prototype and not expected to be used by an end user.

ghstack-source-id: 147458826

Test Plan: CI

Reviewed By: cbalioglu

Differential Revision: D33662678

fbshipit-source-id: 3cc931323230a4b66c02b9e6f744aaf5c48d4d34
(cherry picked from commit 5070595)
1 parent ba08440 commit 9b3a56e

File tree

4 files changed: +68 -61 lines changed


torch/distributed/algorithms/ddp_comm_hooks/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@
     default_hooks as default,
     powerSGD_hook as powerSGD,
     quantization_hooks as quantization,
+    optimizer_overlap_hooks as optimizer_overlap,
 )
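With this import in place, the prototype helpers are reachable through the package-level alias rather than through default_hooks. A minimal sketch of the resulting namespace, based only on the import added above and the new file introduced below:

from torch.distributed.algorithms.ddp_comm_hooks import optimizer_overlap

# The prototype optimizer-overlap helpers now live under this alias:
#   optimizer_overlap._OptimizerHookState
#   optimizer_overlap._hook_then_optimizer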
torch/distributed/algorithms/ddp_comm_hooks/default_hooks.py

Lines changed: 0 additions & 56 deletions
@@ -105,62 +105,6 @@ def decompress(fut):
         return fut.then(decompress)


-class _OptimizerHookState(object):
-    """
-    Holds state for running optimizer in-line after DDP communication hook.
-    Currently contains only optimizer class which must have a method `step_param`.
-    """
-
-    __slots__ = ["functional_optimizer"]
-
-    def __init__(
-        self, functional_optim_cls, *functional_optim_args, **functional_optim_kwargs
-    ):
-        self.functional_optimizer = functional_optim_cls(
-            [],
-            *functional_optim_args,
-            **functional_optim_kwargs,
-            _allow_empty_param_list=True,
-        )
-        if not hasattr(self.functional_optimizer, "step_param"):
-            raise ValueError(
-                f"Class {functional_optim_cls} must implement method step_param."
-            )
-
-
-# TODO: Add an example to use such a wrapper.
-def _hook_then_optimizer(
-    hook: Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]],
-    optimizer_state: _OptimizerHookState,
-) -> Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]:
-    r"""
-    Runs optimizer in a functional fashion after DDP communication hook.
-
-    .. warning ::
-        This API is experimental adn subject to change.
-    """
-
-
-    def hook_then_optimizer_wrapper(
-        hook_state, bucket: dist.GradBucket
-    ) -> torch.futures.Future[torch.Tensor]:
-        # Run original hook
-        fut = hook(hook_state, bucket)
-
-        def optimizer_step(fut):
-            gradient_tensors = bucket.gradients()
-            model_params = bucket.parameters()
-            for grad_tensor, model_param in zip(gradient_tensors, model_params):
-                optimizer_state.functional_optimizer.step_param(
-                    model_param,
-                    grad_tensor,
-                )
-            return bucket.buffer()
-        return fut.then(optimizer_step)
-
-    return hook_then_optimizer_wrapper
-
-
 def fp16_compress_wrapper(
     hook: Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]
 ) -> Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]:

torch/distributed/algorithms/ddp_comm_hooks/optimizer_overlap_hooks.py

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+from typing import Any, Callable
+
+import torch
+import torch.distributed as dist
+
+
+class _OptimizerHookState(object):
+    """
+    Holds state for running optimizer in-line after DDP communication hook.
+    Currently contains only optimizer class which must have a method `step_param`.
+    """
+
+    __slots__ = ["functional_optimizer"]
+
+    def __init__(
+        self, functional_optim_cls, *functional_optim_args, **functional_optim_kwargs
+    ):
+        self.functional_optimizer = functional_optim_cls(
+            [],
+            *functional_optim_args,
+            **functional_optim_kwargs,
+            _allow_empty_param_list=True,
+        )
+        if not hasattr(self.functional_optimizer, "step_param"):
+            raise ValueError(
+                f"Class {functional_optim_cls} must implement method step_param."
+            )
+
+
+# TODO: Add an example to use such a wrapper.
+def _hook_then_optimizer(
+    hook: Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]],
+    optimizer_state: _OptimizerHookState,
+) -> Callable[[Any, dist.GradBucket], torch.futures.Future[torch.Tensor]]:
+    r"""
+    Runs optimizer in a functional fashion after DDP communication hook.
+
+    .. warning ::
+        This API is experimental and subject to change.
+    """
+
+    def hook_then_optimizer_wrapper(
+        hook_state, bucket: dist.GradBucket
+    ) -> torch.futures.Future[torch.Tensor]:
+        # Run original hook
+        fut = hook(hook_state, bucket)
+
+        def optimizer_step(fut):
+            gradient_tensors = bucket.gradients()
+            model_params = bucket.parameters()
+            for grad_tensor, model_param in zip(gradient_tensors, model_params):
+                optimizer_state.functional_optimizer.step_param(
+                    model_param,
+                    grad_tensor,
+                )
+            return bucket.buffer()
+
+        return fut.then(optimizer_step)
+
+    return hook_then_optimizer_wrapper
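
The TODO above asks for a usage example. The following is a hedged sketch, not part of this commit, of how the two helpers could be wired into DDP. It assumes a process group has already been initialized, uses a plain nn.Linear model for illustration, and picks _FunctionalSGD as one functional optimizer that is expected to expose step_param; the learning rate and layer sizes are arbitrary.

import torch.distributed as dist
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed.algorithms.ddp_comm_hooks import (
    default_hooks as default,
    optimizer_overlap as optimizer_overlap_hooks,
)
from torch.distributed.optim import _FunctionalSGD  # assumed: implements step_param

# Assumes dist.init_process_group(...) has already been called.
model = DDP(nn.Linear(16, 16))

# Wrap the functional optimizer class; _OptimizerHookState constructs it with an
# empty parameter list and requires it to provide step_param(param, grad).
opt_state = optimizer_overlap_hooks._OptimizerHookState(_FunctionalSGD, 0.01)

# Allreduce each bucket first, then step every parameter in that bucket as soon
# as its reduced gradient is available.
model.register_comm_hook(
    None,
    optimizer_overlap_hooks._hook_then_optimizer(default.allreduce_hook, opt_state),
)

This mirrors what the updated parity test below does with allreduce_hook and _OptimizerHookState.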

torch/testing/_internal/distributed/distributed_test.py

Lines changed: 7 additions & 5 deletions
@@ -15,8 +15,6 @@
 import torch
 import torch.cuda
 import torch.distributed as dist
-import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD
-import torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook as powerSGD
 import torch.distributed.algorithms.model_averaging.averagers as averagers
 import torch.distributed.algorithms.model_averaging.utils as model_averaging_utils
 import torch.nn as nn
@@ -25,9 +23,13 @@
 from torch._utils_internal import TEST_MASTER_ADDR as MASTER_ADDR
 from torch._utils_internal import TEST_MASTER_PORT as MASTER_PORT
 from torch.cuda.amp import GradScaler, autocast
-from torch.distributed.algorithms.ddp_comm_hooks import default_hooks as default
+
 from torch.distributed.algorithms.ddp_comm_hooks import (
+    post_localSGD_hook as post_localSGD,
+    powerSGD_hook as powerSGD,
+    default_hooks as default,
     quantization as quantization_hooks,
+    optimizer_overlap as optimizer_overlap_hooks
 )
 from torch.distributed.distributed_c10d import (
     get_world_size,
@@ -3944,14 +3946,14 @@ def _test_ddp_hook_with_optimizer_parity(
         # Register hook that runs allreduce + functional optimizer
         # step.
         allreduce_hook = default.allreduce_hook
-        opt_hook_state = default._OptimizerHookState(
+        opt_hook_state = optimizer_overlap_hooks._OptimizerHookState(
            functional_optim_cls,
            *functional_optim_args,
            **functional_optim_kwargs,
        )
        ddp_model_with_optimizer_hook.register_comm_hook(
            None,
-            default._hook_then_optimizer(allreduce_hook, opt_hook_state),
+            optimizer_overlap_hooks._hook_then_optimizer(allreduce_hook, opt_hook_state),
        )
        # Create DDP model with no hook that does optimizer after
        # backward.
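
For reference, the only contract that _OptimizerHookState enforces on functional_optim_cls is the one the test exercises: the class must accept _allow_empty_param_list=True and expose step_param. A hypothetical minimal stand-in illustrating that contract (not part of this commit; the actual tests use the functional optimizers shipped under torch.distributed.optim):

from typing import List

import torch


class _MinimalFunctionalSGD(object):
    """Hypothetical functional optimizer satisfying the step_param contract."""

    def __init__(
        self,
        params: List[torch.Tensor],
        lr: float = 0.01,
        _allow_empty_param_list: bool = False,
    ):
        if len(params) == 0 and not _allow_empty_param_list:
            raise ValueError("optimizer got an empty parameter list")
        self.lr = lr

    def step_param(self, param: torch.Tensor, grad: torch.Tensor) -> None:
        # Plain SGD update applied in-place to a single parameter.
        with torch.no_grad():
            param.add_(grad, alpha=-self.lr)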
