
Commit 9e7d160

Merge branch 'pytorch:main' into main
2 parents aafe428 + 78933b1

5 files changed: +49 -19 lines changed

.jenkins/validate_tutorials_built.py

Lines changed: 0 additions & 1 deletion
@@ -53,7 +53,6 @@
     "intermediate_source/tensorboard_profiler_tutorial", # reenable after 2.0 release.
     "advanced_source/semi_structured_sparse", # reenable after 3303 is fixed.
     "intermediate_source/torchrec_intro_tutorial", # reenable after 3302 is fixed.
-    "intermediate_source/memory_format_tutorial", # causes other tutorials like torch_logs fail. "state" issue, reseting dynamo didn't help
 ]

 def tutorial_source_dirs() -> List[Path]:

conf.py

Lines changed: 40 additions & 17 deletions
@@ -33,8 +33,6 @@
 sys.path.insert(0, os.path.abspath('./.jenkins'))
 import pytorch_sphinx_theme
 import torch
-import numpy
-import gc
 import glob
 import random
 import shutil
@@ -49,6 +47,46 @@
 pio.renderers.default = 'sphinx_gallery'


+import sphinx_gallery.gen_rst
+import multiprocessing
+
+# Monkey patch sphinx gallery to run each example in an isolated process so that
+# we don't need to worry about examples changing global state.
+#
+# Alt option 1: Parallelism was added to sphinx gallery (a later version that we
+# are not using yet) using joblib, but it seems to result in errors for us, and
+# it has no effect if you set parallel = 1 (it will not put each file run into
+# its own process and run singly) so you need parallel >= 2, and there may be
+# tutorials that cannot be run in parallel.
+#
+# Alt option 2: Run sphinx gallery once per file (similar to how we shard in CI
+# but with shard sizes of 1), but running sphinx gallery for each file has a
+# ~5min overhead, resulting in the entire suite taking ~2x time
+def call_fn(func, args, kwargs, result_queue):
+    try:
+        result = func(*args, **kwargs)
+        result_queue.put((True, result))
+    except Exception as e:
+        result_queue.put((False, str(e)))
+
+def call_in_subprocess(func):
+    def wrapper(*args, **kwargs):
+        result_queue = multiprocessing.Queue()
+        p = multiprocessing.Process(
+            target=call_fn,
+            args=(func, args, kwargs, result_queue)
+        )
+        p.start()
+        p.join()
+        success, result = result_queue.get()
+        if success:
+            return result
+        else:
+            raise RuntimeError(f"Error in subprocess: {result}")
+    return wrapper
+
+sphinx_gallery.gen_rst.generate_file_rst = call_in_subprocess(sphinx_gallery.gen_rst.generate_file_rst)
+
 try:
     import torchvision
 except ImportError:
@@ -97,20 +135,6 @@

 # -- Sphinx-gallery configuration --------------------------------------------

-def reset_seeds(gallery_conf, fname):
-    torch.cuda.empty_cache()
-    torch.backends.cudnn.deterministic = True
-    torch.backends.cudnn.benchmark = False
-    torch._dynamo.reset()
-    torch._inductor.config.force_disable_caches = True
-    torch.manual_seed(42)
-    torch.set_default_device(None)
-    random.seed(10)
-    numpy.random.seed(10)
-    torch.set_grad_enabled(True)
-
-    gc.collect()
-
 sphinx_gallery_conf = {
     'examples_dirs': ['beginner_source', 'intermediate_source',
                       'advanced_source', 'recipes_source', 'prototype_source'],
@@ -121,7 +145,6 @@ def reset_seeds(gallery_conf, fname):
     'first_notebook_cell': ("# For tips on running notebooks in Google Colab, see\n"
                             "# https://pytorch.org/tutorials/beginner/colab\n"
                             "%matplotlib inline"),
-    'reset_modules': (reset_seeds),
     'ignore_pattern': r'_torch_export_nightly_tutorial.py',
     'pypandoc': {'extra_args': ['--mathjax', '--toc'],
                  'filters': ['.jenkins/custom_pandoc_filter.py'],
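
Note: the comment block in the conf.py hunk above explains why each gallery example now runs in its own process. As a quick illustration of how that decorator pattern behaves outside Sphinx, here is a minimal standalone sketch (not part of the commit); it mirrors the call_fn/call_in_subprocess pair above, and the toy function set_env_and_compute is hypothetical. It shows that state mutated inside the wrapped call stays in the child process while the return value still reaches the parent.

# Standalone sketch of the isolation pattern used above (not part of the commit).
# The toy function set_env_and_compute is hypothetical; it stands in for a
# tutorial that mutates process-wide state.
import multiprocessing
import os


def run_in_child(func, args, kwargs, result_queue):
    # Executes func in the child process and ships back (success, payload).
    try:
        result_queue.put((True, func(*args, **kwargs)))
    except Exception as e:
        result_queue.put((False, str(e)))


def call_in_subprocess(func):
    def wrapper(*args, **kwargs):
        result_queue = multiprocessing.Queue()
        p = multiprocessing.Process(
            target=run_in_child, args=(func, args, kwargs, result_queue)
        )
        p.start()
        # Read before join: a child blocked on a full pipe could otherwise never exit.
        success, result = result_queue.get()
        p.join()
        if success:
            return result
        raise RuntimeError(f"Error in subprocess: {result}")
    return wrapper


def set_env_and_compute(x):
    # Simulates a tutorial polluting global state (env vars, default device, seeds, ...).
    os.environ["TUTORIAL_STATE"] = "polluted"
    return x * 2


if __name__ == "__main__":
    isolated = call_in_subprocess(set_env_and_compute)
    print(isolated(21))                    # 42, computed in the child process
    print("TUTORIAL_STATE" in os.environ)  # False: the parent's state is untouched

Reading from the queue before joining is only a precaution for large payloads; for small results either order works.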

intermediate_source/transformer_building_blocks.py

Lines changed: 3 additions & 0 deletions
@@ -1,4 +1,7 @@
 """
+.. meta::
+   :description: Learn how to optimize transformer models by replacing nn.Transformer with Nested Tensors and torch.compile() for significant performance gains in PyTorch.
+
 Accelerating PyTorch Transformers by replacing ``nn.Transformer`` with Nested Tensors and ``torch.compile()``
 =============================================================================================================
 **Author:** `Mikayla Gawarecki <https://github.com/mikaylagawarecki>`_

prototype_source/nestedtensor.py

Lines changed: 5 additions & 0 deletions
@@ -369,3 +369,8 @@ def benchmark(func, *args, **kwargs):
 # how implement multi-head attention for transformers in a way that avoids computation on padding.
 # For more information, check out the docs for the
 # `torch.nested <https://pytorch.org/docs/stable/nested.html>`__ namespace.
+#
+# See Also
+# --------
+#
+# * `Accelerating PyTorch Transformers by replacing nn.Transformer with Nested Tensors and torch.compile() <https://docs.pytorch.org/tutorials/intermediate/transformer_building_blocks.html>`__

prototype_source/prototype_index.rst

Lines changed: 1 addition & 1 deletion
@@ -273,7 +273,7 @@ Prototype features are not available as part of binary distributions like PyPI o
    :header: (prototype) Using GPUDirect Storage
    :card_description: Learn how to use GPUDirect Storage in PyTorch.
    :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png
-   :link: ../prototype/gpudirect_storage.html
+   :link: ../prototype/gpu_direct_storage.html
    :tags: GPUDirect-Storage

 .. End of tutorial card section
