From 42f9ce534befc440893645bc49468a6898be25a0 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Wed, 7 May 2025 12:00:07 -0700 Subject: [PATCH 01/10] tc --- .jenkins/validate_tutorials_built.py | 1 - 1 file changed, 1 deletion(-) diff --git a/.jenkins/validate_tutorials_built.py b/.jenkins/validate_tutorials_built.py index f5cd187dbc..5c9e60e90b 100644 --- a/.jenkins/validate_tutorials_built.py +++ b/.jenkins/validate_tutorials_built.py @@ -53,7 +53,6 @@ "intermediate_source/tensorboard_profiler_tutorial", # reenable after 2.0 release. "advanced_source/semi_structured_sparse", # reenable after 3303 is fixed. "intermediate_source/torchrec_intro_tutorial", # reenable after 3302 is fixe - "intermediate_source/memory_format_tutorial", # causes other tutorials like torch_logs fail. "state" issue, reseting dynamo didn't help ] def tutorial_source_dirs() -> List[Path]: From 24b945b6323f529480a20a3b2914ac7d8f400c13 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Wed, 7 May 2025 22:15:15 -0700 Subject: [PATCH 02/10] tc --- conf.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/conf.py b/conf.py index 05cfa11ca1..dd2acd91f1 100644 --- a/conf.py +++ b/conf.py @@ -49,6 +49,36 @@ pio.renderers.default = 'sphinx_gallery' +import sphinx_gallery.gen_rst +import multiprocessing + +# Save the original function +def isolated_call(func, args, kwargs, result_queue): + try: + result = func(*args, **kwargs) + result_queue.put((True, result)) + except Exception as e: + result_queue.put((False, str(e))) + +def make_isolated_version(func): + def wrapper(*args, **kwargs): + result_queue = multiprocessing.Queue() + p = multiprocessing.Process( + target=isolated_call, + args=(func, args, kwargs, result_queue) + ) + p.start() + p.join() + success, result = result_queue.get() + if success: + return result + else: + raise RuntimeError(f"Error in isolated process: {result}") + return wrapper + +# Monkey-patch +sphinx_gallery.gen_rst.generate_file_rst = make_isolated_version(sphinx_gallery.gen_rst.generate_file_rst + try: import torchvision except ImportError: From 0f259b86a68f652259afbb8f7af16442a2ff9b4c Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Wed, 7 May 2025 22:15:24 -0700 Subject: [PATCH 03/10] tc --- conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf.py b/conf.py index dd2acd91f1..11d0cbd62d 100644 --- a/conf.py +++ b/conf.py @@ -77,7 +77,7 @@ def wrapper(*args, **kwargs): return wrapper # Monkey-patch -sphinx_gallery.gen_rst.generate_file_rst = make_isolated_version(sphinx_gallery.gen_rst.generate_file_rst +sphinx_gallery.gen_rst.generate_file_rst = make_isolated_version(sphinx_gallery.gen_rst.generate_file_rst) try: import torchvision From 64db1c461a4c220c96eb9b0f6621cc6700798be5 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Thu, 8 May 2025 09:00:04 -0700 Subject: [PATCH 04/10] tc --- conf.py | 51 +++++++++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/conf.py b/conf.py index 11d0cbd62d..ffb9a2ad5a 100644 --- a/conf.py +++ b/conf.py @@ -52,32 +52,26 @@ import sphinx_gallery.gen_rst import multiprocessing -# Save the original function -def isolated_call(func, args, kwargs, result_queue): +# Monkey patching sphinx gallery to run each example in an isolated process so +# that we don't need to worry about examples changing global state +def call_fn(func, args, kwargs, result_queue): try: result = func(*args, **kwargs) result_queue.put((True, result)) except Exception as e: result_queue.put((False, str(e))) -def make_isolated_version(func): +def call_in_subprocess(func): def wrapper(*args, **kwargs): - result_queue = multiprocessing.Queue() - p = multiprocessing.Process( - target=isolated_call, - args=(func, args, kwargs, result_queue) - ) - p.start() - p.join() - success, result = result_queue.get() - if success: - return result - else: - raise RuntimeError(f"Error in isolated process: {result}") + pool = multiprocessing.Pool(processes=1) + p = pool.apply_async(func, args, kwargs) + pool.close() + pool.join() + return p.get() return wrapper # Monkey-patch -sphinx_gallery.gen_rst.generate_file_rst = make_isolated_version(sphinx_gallery.gen_rst.generate_file_rst) +sphinx_gallery.gen_rst.generate_file_rst = call_in_subprocess(sphinx_gallery.gen_rst.generate_file_rst) try: import torchvision @@ -128,18 +122,19 @@ def wrapper(*args, **kwargs): # -- Sphinx-gallery configuration -------------------------------------------- def reset_seeds(gallery_conf, fname): - torch.cuda.empty_cache() - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False - torch._dynamo.reset() - torch._inductor.config.force_disable_caches = True - torch.manual_seed(42) - torch.set_default_device(None) - random.seed(10) - numpy.random.seed(10) - torch.set_grad_enabled(True) - - gc.collect() + pass + # torch.cuda.empty_cache() + # torch.backends.cudnn.deterministic = True + # torch.backends.cudnn.benchmark = False + # torch._dynamo.reset() + # torch._inductor.config.force_disable_caches = True + # torch.manual_seed(42) + # torch.set_default_device(None) + # random.seed(10) + # numpy.random.seed(10) + # torch.set_grad_enabled(True) + + # gc.collect() sphinx_gallery_conf = { 'examples_dirs': ['beginner_source', 'intermediate_source', From 8a64582f3392386970d6ac59531a2c7a4905543b Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Thu, 8 May 2025 09:40:21 -0700 Subject: [PATCH 05/10] tc --- conf.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/conf.py b/conf.py index ffb9a2ad5a..5a7225745a 100644 --- a/conf.py +++ b/conf.py @@ -63,11 +63,18 @@ def call_fn(func, args, kwargs, result_queue): def call_in_subprocess(func): def wrapper(*args, **kwargs): - pool = multiprocessing.Pool(processes=1) - p = pool.apply_async(func, args, kwargs) - pool.close() - pool.join() - return p.get() + result_queue = multiprocessing.Queue() + p = multiprocessing.Process( + target=call_fn, + args=(func, args, kwargs, result_queue) + ) + p.start() + p.join() + success, result = result_queue.get() + if success: + return result + else: + raise RuntimeError(f"Error in subprocess: {result}") return wrapper # Monkey-patch From e34f72c7119db860f98579bbca9555fd954f1113 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Thu, 8 May 2025 09:40:55 -0700 Subject: [PATCH 06/10] tc --- conf.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/conf.py b/conf.py index 5a7225745a..b62987cced 100644 --- a/conf.py +++ b/conf.py @@ -54,7 +54,8 @@ # Monkey patching sphinx gallery to run each example in an isolated process so # that we don't need to worry about examples changing global state -def call_fn(func, args, kwargs, result_queue): +def call_fn(func, args, kwargs): + return func(*args, **kwargs) try: result = func(*args, **kwargs) result_queue.put((True, result)) @@ -63,6 +64,11 @@ def call_fn(func, args, kwargs, result_queue): def call_in_subprocess(func): def wrapper(*args, **kwargs): + pool = multiprocessing.Pool(processes=1) + p = pool.apply_async(call_fn,(func, args, kwargs)) + pool.close() + pool.join() + return p.get() result_queue = multiprocessing.Queue() p = multiprocessing.Process( target=call_fn, From 2d483fc48e04b8d504a67d8c531bf86816a90db1 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Thu, 8 May 2025 09:52:32 -0700 Subject: [PATCH 07/10] tc --- conf.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/conf.py b/conf.py index b62987cced..5a7225745a 100644 --- a/conf.py +++ b/conf.py @@ -54,8 +54,7 @@ # Monkey patching sphinx gallery to run each example in an isolated process so # that we don't need to worry about examples changing global state -def call_fn(func, args, kwargs): - return func(*args, **kwargs) +def call_fn(func, args, kwargs, result_queue): try: result = func(*args, **kwargs) result_queue.put((True, result)) @@ -64,11 +63,6 @@ def call_fn(func, args, kwargs): def call_in_subprocess(func): def wrapper(*args, **kwargs): - pool = multiprocessing.Pool(processes=1) - p = pool.apply_async(call_fn,(func, args, kwargs)) - pool.close() - pool.join() - return p.get() result_queue = multiprocessing.Queue() p = multiprocessing.Process( target=call_fn, From fdcc8d75ce4a439d1058274aadf7de93683d48a6 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Thu, 15 May 2025 10:42:19 -0700 Subject: [PATCH 08/10] tc --- conf.py | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/conf.py b/conf.py index 5a7225745a..b6ae30c140 100644 --- a/conf.py +++ b/conf.py @@ -33,8 +33,6 @@ sys.path.insert(0, os.path.abspath('./.jenkins')) import pytorch_sphinx_theme import torch -import numpy -import gc import glob import random import shutil @@ -52,8 +50,18 @@ import sphinx_gallery.gen_rst import multiprocessing -# Monkey patching sphinx gallery to run each example in an isolated process so -# that we don't need to worry about examples changing global state +# Monkey patch sphinx gallery to run each example in an isolated process so that +# we don't need to worry about examples changing global state. +# +# Other option 1: Parallelism was added to sphinx gallery (a later version that +# we are not using yet) using joblib, but it seems to result in errors for us, +# and it has no effect if you set parallel = 1 (it will not put each file run +# into its own process and run singly) so you need parallel >= 2, and there may +# be tutorials that cannot be run in parallel. +# +# Other option 2: Run sphinx gallery once per file (similar to how we shard in +# CI but with shard sizes of 1), but running sphinx gallery for each file has a +# ~5min overhead, resulting in the entire suite taking ~2x time def call_fn(func, args, kwargs, result_queue): try: result = func(*args, **kwargs) @@ -77,7 +85,6 @@ def wrapper(*args, **kwargs): raise RuntimeError(f"Error in subprocess: {result}") return wrapper -# Monkey-patch sphinx_gallery.gen_rst.generate_file_rst = call_in_subprocess(sphinx_gallery.gen_rst.generate_file_rst) try: @@ -128,21 +135,6 @@ def wrapper(*args, **kwargs): # -- Sphinx-gallery configuration -------------------------------------------- -def reset_seeds(gallery_conf, fname): - pass - # torch.cuda.empty_cache() - # torch.backends.cudnn.deterministic = True - # torch.backends.cudnn.benchmark = False - # torch._dynamo.reset() - # torch._inductor.config.force_disable_caches = True - # torch.manual_seed(42) - # torch.set_default_device(None) - # random.seed(10) - # numpy.random.seed(10) - # torch.set_grad_enabled(True) - - # gc.collect() - sphinx_gallery_conf = { 'examples_dirs': ['beginner_source', 'intermediate_source', 'advanced_source', 'recipes_source', 'prototype_source'], From a9972bc29f2f8f87f7dc9d0d242b542dd684369e Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Thu, 15 May 2025 10:54:02 -0700 Subject: [PATCH 09/10] tc --- conf.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/conf.py b/conf.py index b6ae30c140..f1c051cd0b 100644 --- a/conf.py +++ b/conf.py @@ -53,14 +53,14 @@ # Monkey patch sphinx gallery to run each example in an isolated process so that # we don't need to worry about examples changing global state. # -# Other option 1: Parallelism was added to sphinx gallery (a later version that -# we are not using yet) using joblib, but it seems to result in errors for us, -# and it has no effect if you set parallel = 1 (it will not put each file run -# into its own process and run singly) so you need parallel >= 2, and there may -# be tutorials that cannot be run in parallel. +# Alt option 1: Parallelism was added to sphinx gallery (a later version that we +# are not using yet) using joblib, but it seems to result in errors for us, and +# it has no effect if you set parallel = 1 (it will not put each file run into +# its own process and run singly) so you need parallel >= 2, and there may be +# tutorials that cannot be run in parallel. # -# Other option 2: Run sphinx gallery once per file (similar to how we shard in -# CI but with shard sizes of 1), but running sphinx gallery for each file has a +# Alt option 2: Run sphinx gallery once per file (similar to how we shard in CI +# but with shard sizes of 1), but running sphinx gallery for each file has a # ~5min overhead, resulting in the entire suite taking ~2x time def call_fn(func, args, kwargs, result_queue): try: From 919d7ed221cd29f00d74898ea8d4d73b007297bc Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Thu, 15 May 2025 11:12:42 -0700 Subject: [PATCH 10/10] tc --- conf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/conf.py b/conf.py index f1c051cd0b..e189f9449f 100644 --- a/conf.py +++ b/conf.py @@ -145,7 +145,6 @@ def wrapper(*args, **kwargs): 'first_notebook_cell': ("# For tips on running notebooks in Google Colab, see\n" "# https://pytorch.org/tutorials/beginner/colab\n" "%matplotlib inline"), - 'reset_modules': (reset_seeds), 'ignore_pattern': r'_torch_export_nightly_tutorial.py', 'pypandoc': {'extra_args': ['--mathjax', '--toc'], 'filters': ['.jenkins/custom_pandoc_filter.py'],