Commit 241ae0e

Turn on AOTAutogradCache by default on open source

ghstack-source-id: 9a82bec
Pull Request resolved: #141981

1 parent b59ef87 · commit 241ae0e
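
Note (not part of the diff): after this change, open-source builds default the cache flag below to on, while fbcode keeps the old default. A minimal sketch for checking the resulting default, assuming a PyTorch build that contains this commit:

# Minimal sketch (assumes a PyTorch build containing this commit).
# torch._functorch.config.enable_autograd_cache is the flag changed in
# torch/_functorch/config.py below.
import torch._functorch.config as functorch_config

# True on open-source builds unless TORCHINDUCTOR_AUTOGRAD_CACHE=0 was set
# before torch was imported; fbcode builds keep the old default of "0".
print(functorch_config.enable_autograd_cache)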

File tree: 5 files changed (+31, -28 lines)

test/dynamo/test_aot_autograd.py

Lines changed: 2 additions & 1 deletion

@@ -8,6 +8,7 @@
 import torch
 import torch._dynamo
 import torch._dynamo.test_case
+import torch._inductor.test_case
 import torch.fx.traceback as fx_traceback
 import torch.utils._pytree as pytree
 from torch._dynamo.testing import (
@@ -45,7 +46,7 @@ def is_dynamic_shape_test(test_name):
 lib.impl("maybe_dupe_op", maybe_dupe_op, "Meta")


-class AotAutogradFallbackTests(torch._dynamo.test_case.TestCase):
+class AotAutogradFallbackTests(torch._inductor.test_case.TestCase):
     def test_LSTM(self):
         # https://github.com/pytorch/torchdynamo/issues/1147
         class Repro(torch.nn.Module):

torch/_functorch/_aot_autograd/autograd_cache.py

Lines changed: 1 addition & 1 deletion

@@ -96,7 +96,7 @@ def should_use_local_autograd_cache():


 def autograd_cache_enabled():
-    return should_use_local_autograd_cache() or should_use_remote_autograd_cache()
+    return (should_use_local_autograd_cache(), should_use_remote_autograd_cache())


 def check_node_safe(node: Node):

torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py

Lines changed: 7 additions & 10 deletions

@@ -36,7 +36,6 @@
 from .autograd_cache import (
     AOTAutogradCache,
     AOTAutogradCacheEntry,
-    autograd_cache_enabled,
     CompiledBackward,
     CompiledForward,
     should_use_remote_autograd_cache,
@@ -149,14 +148,13 @@ def aot_dispatch_base(
     flat_fn, flat_args, fw_metadata = pre_compile(
         wrappers, flat_fn, flat_args, aot_config, fw_metadata=fw_metadata
     )
-
     fw_module, updated_flat_args, maybe_subclass_meta = aot_dispatch_base_graph(  # type: ignore[misc]
         flat_fn, flat_args, aot_config, fw_metadata=fw_metadata
     )
     # Save the forward_graph_str right after aot_dispatch_base_graph,
     # to save in the cache
     aot_forward_graph_str = None
-    if autograd_cache_enabled():
+    if aot_config.cache_info is not None:
         aot_forward_graph_str = fw_module.print_readable(
             print_output=False, include_stride=True, include_device=True
         )
@@ -218,7 +216,7 @@ def aot_dispatch_base(
         compiled_fw, aot_config, runtime_metadata=fw_metadata
     )
     cache_info = aot_config.cache_info
-    if autograd_cache_enabled() and cache_info:
+    if cache_info is not None:
         if fw_key := getattr(compiled_fw, "_fx_graph_cache_key", None):
             time_taken_ns = time.time_ns() - cache_info.start_time_ns
             entry = AOTAutogradCacheEntry(
@@ -824,13 +822,12 @@ def aot_dispatch_autograd(

     try_save_cache_entry: Optional[Callable] = None

-    if autograd_cache_enabled():
-        cache_info = aot_config.cache_info
-        if cache_info is not None:
-            forward_time_taken_ns = time.time_ns() - cache_info.start_time_ns
-        else:
-            forward_time_taken_ns = None
+    if aot_config.cache_info is not None:
+        forward_time_taken_ns = time.time_ns() - aot_config.cache_info.start_time_ns

+        # NB: aot_config here is technically not needed as an argument: we could just
+        # close over aot_config.cache_info, since aot_config never changes.
+        # But closing over random variables is confusing IMO, so I'm leaving it.
         def try_save_cache_entry(  # noqa: F811
             compiled_bw_func, _fw_metadata, aot_config
         ):
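
Note (not part of the diff): the recurring change in this file is the gate — instead of consulting the global autograd_cache_enabled() flag, the wrappers check whether aot_config.cache_info was populated. A simplified, self-contained sketch of that pattern; CacheInfo and Config here are hypothetical stand-ins, not the real AOTAutograd classes:

# Simplified sketch of the gating pattern above; CacheInfo and Config are
# hypothetical stand-ins for the real AOTAutograd structures.
import time
from dataclasses import dataclass
from typing import Optional


@dataclass
class CacheInfo:
    start_time_ns: int


@dataclass
class Config:
    cache_info: Optional[CacheInfo] = None


def forward_time_taken_ns(config: Config) -> Optional[int]:
    # Mirrors the new check: only record timing when cache info is attached,
    # rather than consulting a global "cache enabled" flag.
    if config.cache_info is not None:
        return time.time_ns() - config.cache_info.start_time_ns
    return None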

torch/_functorch/aot_autograd.py

Lines changed: 15 additions & 13 deletions

@@ -1137,20 +1137,22 @@ def dispatch_and_compile():
         )
         return compiled_fn

-    # Autograd cache stuff
-    remote = should_use_remote_autograd_cache()
-    local = should_use_local_autograd_cache()
     # We only care if the forward will return an OutputCode.
-    if (local or remote) and isinstance(fw_compiler, SerializableAOTDispatchCompiler):
-        compiled_fn = AOTAutogradCache.load(
-            dispatch_and_compile,
-            mod,
-            fake_flat_args,
-            aot_config,
-            cudagraphs,
-            local,
-            remote,
-        )
+    if isinstance(fw_compiler, SerializableAOTDispatchCompiler):
+        local = should_use_local_autograd_cache()
+        remote = should_use_remote_autograd_cache()
+        if local or remote:
+            compiled_fn = AOTAutogradCache.load(
+                dispatch_and_compile,
+                mod,
+                fake_flat_args,
+                aot_config,
+                cudagraphs,
+                local,
+                remote,
+            )
+        else:
+            compiled_fn = dispatch_and_compile()
     else:
         compiled_fn = dispatch_and_compile()


torch/_functorch/config.py

Lines changed: 6 additions & 3 deletions

@@ -32,8 +32,12 @@
 # Applies CSE to the graph before partitioning
 cse = True

+from torch._inductor.config import is_fbcode
+

-enable_autograd_cache = os.environ.get("TORCHINDUCTOR_AUTOGRAD_CACHE", "0") == "1"
+enable_autograd_cache = (
+    os.environ.get("TORCHINDUCTOR_AUTOGRAD_CACHE", "0" if is_fbcode() else "1") == "1"
+)


 def remote_autograd_cache_default() -> Optional[bool]:
@@ -63,13 +67,12 @@ def remote_autograd_cache_default() -> Optional[bool]:
 # eventually: either default this config to false completely
 # once XLA pin update works,
 # or default config to true and fix relevant bugs
-from torch._inductor.config import is_fbcode


 # View replay is currently not compatible with AOTAutogradCache, since
 # FunctionalTensors are not serializable. We'll need to make them
 # serializable before enabling warm cache with this config turned on.
-view_replay_for_aliased_outputs = (not is_fbcode()) and (not enable_autograd_cache)
+view_replay_for_aliased_outputs = not is_fbcode()

 # Restricts the amount of computation AOTAutograd can do.
 # NB: We have essentially disabled this heuristic now. However, this is kept
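
Note (not part of the diff): because the new default is computed from the environment when torch/_functorch/config.py is imported, an open-source user can still opt back out. A sketch, assuming the variable is set before torch is imported:

# Sketch: opting back out of AOTAutogradCache on an open-source build.
# The env var must be set before torch is imported, because the diff above
# reads it at module import time.
import os

os.environ["TORCHINDUCTOR_AUTOGRAD_CACHE"] = "0"

import torch  # torch._functorch.config.enable_autograd_cache is now False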
