Commit 9a1b2f7

Merge branch 'main' into docs/autoload
2 parents 0b52b02 + be7f1b3 commit 9a1b2f7

File tree

2 files changed: +165, -7 lines


.github/workflows/StalePRs.yml

Lines changed: 157 additions & 0 deletions
@@ -0,0 +1,157 @@
+# A workflow copied from the pytorch/pytorch repo's stale-PRs workflow that implements similar logic to actions/stale.
+#
+# Compared to actions/stale, it is implemented to make API requests proportional
+# to the number of stale PRs, not the total number of issues in the repo. This
+# is because PyTorch has a lot of issues/PRs, so actions/stale runs into
+# rate limits way too quickly.
+#
+# The behavior is:
+# - If a PR is not labeled stale, after 60 days of inactivity label the PR as stale and comment about it.
+# - If a PR is labeled stale, after 30 days of inactivity close the PR.
+# - `high priority` and `no-stale` PRs are exempt.
+
+name: Close stale pull requests
+
+on:
+  schedule:
+    # Run at midnight UTC.
+    - cron: '0 0 * * *'
+  workflow_dispatch:
+
+jobs:
+  stale:
+    if: ${{ github.repository == 'pytorch/tutorials' }}
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      pull-requests: write
+
+    steps:
+      - uses: actions/github-script@v6
+        with:
+          script: |
+            // Do some dumb retries on requests.
+            const retries = 7;
+            const baseBackoff = 100;
+            const sleep = timeout => new Promise(resolve => setTimeout(resolve, timeout));
+            github.hook.wrap('request', async (request, options) => {
+              for (let attempt = 1; attempt <= retries; attempt++) {
+                try {
+                  return await request(options);
+                } catch (err) {
+                  if (attempt < retries) {
+                    core.warning(`Request getting retried. Attempt: ${attempt}`);
+                    await sleep(baseBackoff * Math.pow(2, attempt));
+                    continue;
+                  }
+                  throw err;
+                }
+              }
+            });
+
+            const MAX_API_REQUESTS = 100;
+
+            // If a PR is not labeled stale, label it stale after no update for 60 days.
+            const STALE_LABEL_THRESHOLD_MS = 1000 * 60 * 60 * 24 * 60;
+            // For PRs already labeled stale, close after no update for 30 days.
+            const STALE_CLOSE_THRESHOLD_MS = 1000 * 60 * 60 * 24 * 30;
+
+            const STALE_MESSAGE =
+              "Looks like this PR hasn't been updated in a while so we're going to go ahead and mark this as `Stale`. <br>" +
+              "Feel free to remove the `Stale` label if you feel this was a mistake. <br>" +
+              "If you are unable to remove the `Stale` label please contact a maintainer in order to do so. <br>" +
+              "If you want the bot to never mark this PR stale again, add the `no-stale` label.<br>" +
+              "`Stale` pull requests will automatically be closed after 30 days of inactivity.<br>";
+
+            let numAPIRequests = 0;
+            let numProcessed = 0;
+
+            async function processPull(pull) {
+              core.info(`[${pull.number}] URL: ${pull.html_url}`);
+              numProcessed += 1;
+              const labels = pull.labels.map((label) => label.name);
+
+              // Skip if certain labels are present.
+              if (labels.includes("no-stale") || labels.includes("high priority")) {
+                core.info(`[${pull.number}] Skipping because PR has an exempting label.`);
+                return false;
+              }
+
+              // Check if the PR is stale, according to our configured thresholds.
+              let staleThresholdMillis;
+              if (labels.includes("Stale")) {
+                core.info(`[${pull.number}] PR is labeled stale, checking whether we should close it.`);
+                staleThresholdMillis = STALE_CLOSE_THRESHOLD_MS;
+              } else {
+                core.info(`[${pull.number}] Checking whether to label PR as stale.`);
+                staleThresholdMillis = STALE_LABEL_THRESHOLD_MS;
+              }
+
+              const millisSinceLastUpdated =
+                new Date().getTime() - new Date(pull.updated_at).getTime();
+
+              if (millisSinceLastUpdated < staleThresholdMillis) {
+                core.info(`[${pull.number}] Skipping because PR was updated recently`);
+                return false;
+              }
+
+              // At this point, we know we should do something.
+              // For PRs already labeled stale, close them.
+              if (labels.includes("Stale")) {
+                core.info(`[${pull.number}] Closing PR.`);
+                numAPIRequests += 1;
+                await github.rest.issues.update({
+                  owner: "pytorch",
+                  repo: "tutorials",
+                  issue_number: pull.number,
+                  state: "closed",
+                });
+              } else {
+                // For PRs not labeled stale, label them stale.
+                core.info(`[${pull.number}] Labeling PR as stale.`);
+
+                numAPIRequests += 1;
+                await github.rest.issues.createComment({
+                  owner: "pytorch",
+                  repo: "tutorials",
+                  issue_number: pull.number,
+                  body: STALE_MESSAGE,
+                });
+
+                numAPIRequests += 1;
+                await github.rest.issues.addLabels({
+                  owner: "pytorch",
+                  repo: "tutorials",
+                  issue_number: pull.number,
+                  labels: ["Stale"],
+                });
+              }
+            }
+
+            for await (const response of github.paginate.iterator(
+              github.rest.pulls.list,
+              {
+                owner: "pytorch",
+                repo: "tutorials",
+                state: "open",
+                sort: "created",
+                direction: "asc",
+                per_page: 100,
+              }
+            )) {
+              numAPIRequests += 1;
+              const pulls = response.data;
+              // Awaiting in a loop is intentional here. We want to serialize execution so
+              // that log groups are printed correctly.
+              for (const pull of pulls) {
+                if (numAPIRequests > MAX_API_REQUESTS) {
+                  core.warning("Max API requests exceeded, exiting.");
+                  process.exit(0);
+                }
+                await core.group(`Processing PR #${pull.number}`, async () => {
+                  await processPull(pull);
+                });
+              }
+            }
+            core.info(`Processed ${numProcessed} PRs total.`);
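To make the policy in the header comments concrete: a PR with an exempting label is skipped, an unlabeled PR is marked Stale after 60 days without updates, and a Stale-labeled PR is closed after a further 30 days of inactivity. The Python sketch below only illustrates that documented decision rule; the function name, return values, and the example call are invented for this sketch and are not part of the workflow.

from datetime import datetime, timedelta, timezone

# Thresholds and exemptions documented in the workflow above.
STALE_LABEL_THRESHOLD = timedelta(days=60)   # unlabeled PR -> add the Stale label
STALE_CLOSE_THRESHOLD = timedelta(days=30)   # Stale-labeled PR -> close
EXEMPT_LABELS = {"no-stale", "high priority"}

def decide_action(labels, last_updated):
    """Return 'skip', 'label', or 'close' for a PR, mirroring the workflow's checks."""
    if EXEMPT_LABELS & set(labels):
        return "skip"
    threshold = STALE_CLOSE_THRESHOLD if "Stale" in labels else STALE_LABEL_THRESHOLD
    if datetime.now(timezone.utc) - last_updated < threshold:
        return "skip"
    return "close" if "Stale" in labels else "label"

# A PR labeled Stale with no update for 45 days would be closed.
print(decide_action(["Stale"], datetime.now(timezone.utc) - timedelta(days=45)))  # close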

prototype_source/flight_recorder_tutorial.rst

Lines changed: 8 additions & 7 deletions
@@ -46,15 +46,15 @@ Flight Recorder consists of two core parts:
 
 Enabling Flight Recorder
 ------------------------
-There are two required environment variables to get the initial version of Flight Recorder working.
+There are three required environment variables to get the initial version of Flight Recorder working.
 
-- ``TORCH_NCCL_DEBUG_INFO_TEMP_FILE``: Setting the path where the flight recorder will be dumped with file prefix. One file per
-  rank. The default value is ``/tmp/nccl_trace_rank_``.
 - ``TORCH_NCCL_TRACE_BUFFER_SIZE = (0, N)``: Setting ``N`` to a positive number enables collection.
   ``N`` represents the number of entries that will be kept internally in a circular buffer.
-  We recommended to set this value at *2000*.
+  We recommend setting this value to *2000*. The default value is ``2000``.
 - ``TORCH_NCCL_DUMP_ON_TIMEOUT = (true, false)``: Setting this to ``true`` will write out diagnostic files to disk on job timeout.
-  If enabled, there will be one file per rank output in the job's running directory.
+  If enabled, one file per rank will be written to the job's running directory. The default value is ``false``.
+- ``TORCH_NCCL_DEBUG_INFO_TEMP_FILE``: Sets the path (with a file prefix) where the flight recorder data will be dumped. One file per
+  rank. The default value is ``/tmp/nccl_trace_rank_``.
 
 **Optional settings:**
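A usage note on the three required variables above: in a training script they are typically set in the environment before the NCCL process group is created. The sketch below is only an illustration; it assumes a standard ``torch.distributed.init_process_group(backend="nccl")`` setup (for example under torchrun), while the variable names and values come straight from the list above.

import os
import torch.distributed as dist

# Values taken from the list above; set them before the process group is created.
os.environ["TORCH_NCCL_TRACE_BUFFER_SIZE"] = "2000"        # a positive value enables collection (recommended: 2000)
os.environ["TORCH_NCCL_DUMP_ON_TIMEOUT"] = "true"          # dump diagnostic files to disk on job timeout
os.environ["TORCH_NCCL_DEBUG_INFO_TEMP_FILE"] = "/tmp/nccl_trace_rank_"  # one dump file per rank, with this prefix

# Assumes the usual rendezvous variables (MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE)
# are already provided, for example by torchrun.
dist.init_process_group(backend="nccl")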

@@ -74,7 +74,8 @@ Additional Settings
   ``fast`` is a new experimental mode that is shown to be much faster than the traditional ``addr2line``.
   Use this setting in conjunction with ``TORCH_NCCL_TRACE_CPP_STACK`` to collect C++ traces in the Flight Recorder data.
 - If you prefer not to have the flight recorder data dumped to the local disk but rather onto your own storage, you can define your own writer class.
-  This class should inherit from class ``::c10d::DebugInfoWriter`` and then register the new writer using ``::c10d::DebugInfoWriter::registerWriter``
+  This class should inherit from class ``::c10d::DebugInfoWriter`` `(code) <https://github.com/pytorch/pytorch/blob/release/2.5/torch/csrc/distributed/c10d/NCCLUtils.hpp#L237>`__
+  and then register the new writer using ``::c10d::DebugInfoWriter::registerWriter`` `(code) <https://github.com/pytorch/pytorch/blob/release/2.5/torch/csrc/distributed/c10d/NCCLUtils.hpp#L242>`__
   before we initiate PyTorch distributed.
 
 Retrieving Flight Recorder Data via an API
@@ -189,7 +190,7 @@ command directly:
 Currently, we support two modes for the analyzer script. The first mode allows the script to apply some heuristics to the parsed flight
 recorder dumps to generate a report identifying potential culprits for the timeout. The second mode simply outputs the raw dumps.
 By default, the script prints flight recorder dumps for all ranks and all ``ProcessGroups`` (PGs). This can be narrowed down to certain
-ranks and PGs. An example command is:
+ranks and PGs using the *--selected-ranks* argument. An example command is:
 
 Caveat: the ``tabulate`` module is needed, so you might need to pip install it first.
