diff --git a/.circleci/config.yml b/.circleci/config.yml index b7084096c4b..70b2c7fd5b0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -306,6 +306,10 @@ jobs: workflows: build: + when: + and: # All must be true to trigger + - equal: [ branch1, << pipeline.git.branch >> ] + - equal: [ branch2, << pipeline.git.branch >> ] jobs: # Build jobs that only run on PR - pytorch_tutorial_pr_build_worker_0: @@ -314,365 +318,3 @@ workflows: ignore: - master - main - - pytorch_tutorial_pr_build_worker_1: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_2: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_3: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_4: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_5: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_6: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_7: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_8: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_9: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_10: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_11: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_12: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_13: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_14: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_15: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_16: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_17: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_18: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_19: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_manager: - filters: - branches: - ignore: - - master - - main - requires: - - pytorch_tutorial_pr_build_worker_0 - - pytorch_tutorial_pr_build_worker_1 - - pytorch_tutorial_pr_build_worker_2 - - pytorch_tutorial_pr_build_worker_3 - - pytorch_tutorial_pr_build_worker_4 - - pytorch_tutorial_pr_build_worker_5 - - pytorch_tutorial_pr_build_worker_6 - - pytorch_tutorial_pr_build_worker_7 - - pytorch_tutorial_pr_build_worker_8 - - pytorch_tutorial_pr_build_worker_9 - - pytorch_tutorial_pr_build_worker_10 - - pytorch_tutorial_pr_build_worker_11 - - pytorch_tutorial_pr_build_worker_12 - - pytorch_tutorial_pr_build_worker_13 - - pytorch_tutorial_pr_build_worker_14 - - pytorch_tutorial_pr_build_worker_15 - - pytorch_tutorial_pr_build_worker_16 - - pytorch_tutorial_pr_build_worker_17 - - pytorch_tutorial_pr_build_worker_18 - - pytorch_tutorial_pr_build_worker_19 - # Build jobs that only run on trunk - - pytorch_tutorial_trunk_build_worker_0: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_1: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_2: - context: org-member - filters: - branches: - only: - - master - - main - - 
pytorch_tutorial_trunk_build_worker_3: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_4: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_5: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_6: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_7: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_8: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_9: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_10: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_11: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_12: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_13: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_14: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_15: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_16: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_17: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_18: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_19: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_manager: - context: org-member - filters: - branches: - only: - - master - - main - requires: - - pytorch_tutorial_trunk_build_worker_0 - - pytorch_tutorial_trunk_build_worker_1 - - pytorch_tutorial_trunk_build_worker_2 - - pytorch_tutorial_trunk_build_worker_3 - - pytorch_tutorial_trunk_build_worker_4 - - pytorch_tutorial_trunk_build_worker_5 - - pytorch_tutorial_trunk_build_worker_6 - - pytorch_tutorial_trunk_build_worker_7 - - pytorch_tutorial_trunk_build_worker_8 - - pytorch_tutorial_trunk_build_worker_9 - - pytorch_tutorial_trunk_build_worker_10 - - pytorch_tutorial_trunk_build_worker_11 - - pytorch_tutorial_trunk_build_worker_12 - - pytorch_tutorial_trunk_build_worker_13 - - pytorch_tutorial_trunk_build_worker_14 - - pytorch_tutorial_trunk_build_worker_15 - - pytorch_tutorial_trunk_build_worker_16 - - pytorch_tutorial_trunk_build_worker_17 - - pytorch_tutorial_trunk_build_worker_18 - - pytorch_tutorial_trunk_build_worker_19 -# - pytorch_tutorial_windows_pr_build_worker_0: -# filters: -# branches: -# ignore: -# - master -# - main -# - pytorch_tutorial_windows_pr_build_worker_1: -# filters: -# branches: -# ignore: -# - master -# - main -# - pytorch_tutorial_windows_pr_build_worker_2: -# filters: -# branches: -# ignore: -# - master -# - main -# - pytorch_tutorial_windows_pr_build_worker_3: -# filters: -# branches: -# ignore: -# - master -# - main -# - pytorch_tutorial_windows_trunk_build_worker_0: -# context: org-member -# filters: -# branches: -# only: -# - master -# - main -# - 
pytorch_tutorial_windows_trunk_build_worker_1: -# context: org-member -# filters: -# branches: -# only: -# - master -# - main -# - pytorch_tutorial_windows_trunk_build_worker_2: -# context: org-member -# filters: -# branches: -# only: -# - master -# - main -# - pytorch_tutorial_windows_trunk_build_worker_3: -# context: org-member -# filters: -# branches: -# only: -# - master -# - main diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in deleted file mode 100644 index 0694d221aad..00000000000 --- a/.circleci/config.yml.in +++ /dev/null @@ -1,213 +0,0 @@ -# run python regenerate.py to generate config.yml from config.yml.in - -version: 2.1 - -executors: - windows-with-nvidia-gpu: - machine: - resource_class: windows.gpu.nvidia.medium - image: windows-server-2019-nvidia:stable - shell: bash.exe - -install_official_git_client: &install_official_git_client - name: Install Official Git Client - no_output_timeout: "1h" - command: | - set -e - sudo apt-get -qq update - sudo apt-get -qq install openssh-client git - -# This system setup script is meant to run before the CI-related scripts, e.g., -# installing Git client, checking out code, setting up CI env, and -# building/testing. -setup_linux_system_environment: &setup_linux_system_environment - name: Set Up System Environment - no_output_timeout: "1h" - command: | - set -ex - - # Stop background apt updates. Hypothetically, the kill should not - # be necessary, because stop is supposed to send a kill signal to - # the process, but we've added it for good luck. Also - # hypothetically, it's supposed to be unnecessary to wait for - # the process to block. We also have that line for good luck. - # If you like, try deleting them and seeing if it works. - sudo systemctl stop apt-daily.service || true - sudo systemctl kill --kill-who=all apt-daily.service || true - - sudo systemctl stop unattended-upgrades.service || true - sudo systemctl kill --kill-who=all unattended-upgrades.service || true - - # wait until `apt-get update` has been killed - while systemctl is-active --quiet apt-daily.service - do - sleep 1; - done - while systemctl is-active --quiet unattended-upgrades.service - do - sleep 1; - done - - # See if we actually were successful - systemctl list-units --all | cat - - sudo apt-get purge -y unattended-upgrades - - cat /etc/apt/sources.list - - ps auxfww | grep [a]pt - ps auxfww | grep dpkg - -pytorch_tutorial_build_defaults: &pytorch_tutorial_build_defaults - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - steps: - - checkout - - run: - <<: *setup_linux_system_environment - - run: - name: Set Up CI Environment - no_output_timeout: "1h" - command: | - set -e - - sudo apt-get -y update - sudo apt-get -y install expect-dev moreutils - - sudo pip3 -q install awscli==1.16.35 - - if [ -n "${CUDA_VERSION}" ]; then - nvidia-smi - fi - - # This IAM user only allows read-write access to ECR - export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_ONLY} - export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_ONLY} - eval $(aws ecr get-login --region us-east-1 --no-include-email) - - run: - name: Build - no_output_timeout: "20h" - command: | - set -e - - # for some reason, pip installs it in a different place than what is looked at in the py file - sudo pip3 install requests --target=/opt/circleci/.pyenv/versions/3.9.4/lib/python3.9/site-packages - export pyTorchDockerImageTag=$(python3 .jenkins/get_docker_tag.py) - echo "PyTorchDockerImageTag: "${pyTorchDockerImageTag} - - cat 
>/home/circleci/project/ci_build_script.sh \</dev/null - if [ -n "${CUDA_VERSION}" ]; then - export id=$(docker run --gpus all -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) - else - export id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) - fi - - echo "declare -x JOB_BASE_NAME=${CIRCLE_JOB}" > /home/circleci/project/env - echo "declare -x COMMIT_ID=${CIRCLE_SHA1}" >> /home/circleci/project/env - echo "declare -x COMMIT_SOURCE=${CIRCLE_BRANCH}" >> /home/circleci/project/env - # DANGER! DO NOT REMOVE THE `set +x` SETTING HERE! - set +x - if [[ "$CIRCLE_BRANCH" == master || "$CIRCLE_BRANCH" == main ]]; then - if [ -z "${CIRCLECI_AWS_ACCESS_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" ]; then exit 1; fi - if [ -z "${CIRCLECI_AWS_SECRET_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" ]; then exit 1; fi - if [ -z "${GITHUB_PYTORCHBOT_USERNAME}" ]; then exit 1; fi - if [ -z "${GITHUB_PYTORCHBOT_TOKEN}" ]; then exit 1; fi - - echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" >> /home/circleci/project/env - echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" >> /home/circleci/project/env - echo "declare -x GITHUB_PYTORCHBOT_USERNAME=${GITHUB_PYTORCHBOT_USERNAME}" >> /home/circleci/project/env - echo "declare -x GITHUB_PYTORCHBOT_TOKEN=${GITHUB_PYTORCHBOT_TOKEN}" >> /home/circleci/project/env - else - echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PYTORCH_TUTORIAL_BUILD_PR_S3_BUCKET}" >> /home/circleci/project/env - echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PYTORCH_TUTORIAL_BUILD_PR_S3_BUCKET}" >> /home/circleci/project/env - fi - set -x - - echo 'rm /opt/cache/bin/*' | docker exec -u root -i "$id" bash - docker cp /home/circleci/project/. "$id:/var/lib/jenkins/workspace" - - export COMMAND='((echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./ci_build_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1' - echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts - # Copy docs with plot to a docs dir - if docker exec -it "$id" sh -c "test -d ./workspace/docs_with_plot/docs/"; then - mkdir /home/circleci/project/docs - docker cp "$id:/var/lib/jenkins/workspace/docs_with_plot/docs/." /home/circleci/project/docs - echo "Directory copied successfully" - else - echo "No docs_with_plot directory. Skipping..." - fi - - - store_artifacts: - path: ./docs - destination: tutorials - -pytorch_tutorial_build_worker_defaults: &pytorch_tutorial_build_worker_defaults - environment: - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9" - CUDA_VERSION: "9" - resource_class: gpu.nvidia.small - <<: *pytorch_tutorial_build_defaults - -pytorch_tutorial_build_manager_defaults: &pytorch_tutorial_build_manager_defaults - environment: - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9" - resource_class: medium - - - <<: *pytorch_tutorial_build_defaults -{% raw %} -pytorch_windows_build_worker: &pytorch_windows_build_worker - executor: windows-with-nvidia-gpu - steps: - - checkout - - run: - name: Install Cuda - no_output_timeout: 30m - command: | - .circleci/scripts/windows_cuda_install.sh - - run: - name: Generate cache key - # This will refresh cache on Sundays, build should generate new cache. 
- command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - keys: - - data-{{ checksum "Makefile" }}-{{ checksum ".circleci-weekly" }} - - run: - name: test - no_output_timeout: "1h" - command: | - .circleci/scripts/build_for_windows.sh - - save_cache: - key: data-{{ checksum "Makefile" }}-{{ checksum ".circleci-weekly" }} - paths: - - advanced_source/data - - beginner_source/data - - intermediate_source/data - - prototype_source/data -{% endraw %} -jobs: - {{ jobs("pr") }} - - {{ jobs("trunk") }} - - {{ windows_jobs() }} - -workflows: - build: - jobs: - # Build jobs that only run on PR - {{ workflows_jobs("pr") }} - # Build jobs that only run on trunk - {{ workflows_jobs("trunk") }} -# {{ windows_workflows_jobs() }} diff --git a/.circleci/regenerate.py b/.circleci/regenerate.py deleted file mode 100644 index f47ee1dfa6f..00000000000 --- a/.circleci/regenerate.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python3 - -# regenrates config.yml based on config.yml.in - -from copy import deepcopy -import os.path - -import jinja2 -import yaml -from jinja2 import select_autoescape - -WORKFLOWS_JOBS_PR = {"filters": {"branches": {"ignore": ["master", "main"]}}} - -WORKFLOWS_JOBS_TRUNK = { - "context": "org-member", - "filters": {"branches": {"only": ["master", "main"]}}, -} - - -def indent(indentation, data_list): - return ("\n" + " " * indentation).join( - yaml.dump(data_list, default_flow_style=False).splitlines() - ) - - -def jobs(pr_or_trunk, num_workers=20, indentation=2): - jobs = {} - - # all tutorials that need gpu.nvidia.small.multi machines will be routed by - # get_files_to_run.py to 0th worker, similarly for gpu.nvidia.large and the - # 1st worker - needs_gpu_nvidia_small_multi = [0] - needs_gpu_nvidia_large = [1] - jobs[f"pytorch_tutorial_{pr_or_trunk}_build_manager"] = { - "<<": "*pytorch_tutorial_build_manager_defaults" - } - for i in range(num_workers): - job_info = {"<<": "*pytorch_tutorial_build_worker_defaults"} - if i in needs_gpu_nvidia_small_multi: - job_info["resource_class"] = "gpu.nvidia.small.multi" - if i in needs_gpu_nvidia_large: - job_info["resource_class"] = "gpu.nvidia.large" - jobs[f"pytorch_tutorial_{pr_or_trunk}_build_worker_{i}"] = job_info - - return indent(indentation, jobs).replace("'", "") - - -def workflows_jobs(pr_or_trunk, indentation=6, num_workers=20): - jobs = [] - job_info = deepcopy( - WORKFLOWS_JOBS_PR if pr_or_trunk == "pr" else WORKFLOWS_JOBS_TRUNK - ) - - for i in range(num_workers): - jobs.append( - {f"pytorch_tutorial_{pr_or_trunk}_build_worker_{i}": deepcopy(job_info)} - ) - - job_info["requires"] = [ - f"pytorch_tutorial_{pr_or_trunk}_build_worker_{i}" for i in range(num_workers) - ] - jobs.append({f"pytorch_tutorial_{pr_or_trunk}_build_manager": deepcopy(job_info)}) - return indent(indentation, jobs) - - -def windows_jobs(indentation=2, num_workers=4): - jobs = {} - for i in range(num_workers): - jobs[f"pytorch_tutorial_windows_pr_build_worker_{i}"] = { - "<<": "*pytorch_windows_build_worker" - } - jobs[f"pytorch_tutorial_windows_trunk_build_worker_{i}"] = { - "<<": "*pytorch_windows_build_worker" - } - return indent(indentation, jobs).replace("'", "") - - -def windows_workflows_jobs(indentation=6, num_workers=4): - jobs = [] - job_info = WORKFLOWS_JOBS_PR - for i in range(num_workers): - jobs.append( - {f"pytorch_tutorial_windows_pr_build_worker_{i}": deepcopy(job_info)} - ) - - job_info = WORKFLOWS_JOBS_TRUNK - for i in range(num_workers): - jobs.append( - {f"pytorch_tutorial_windows_trunk_build_worker_{i}": 
deepcopy(job_info)} - ) - - return ("\n#").join(indent(indentation, jobs).splitlines()) - - -if __name__ == "__main__": - - directory = os.path.dirname(__file__) - env = jinja2.Environment( - loader=jinja2.FileSystemLoader(directory), - lstrip_blocks=True, - autoescape=select_autoescape(enabled_extensions=("html", "xml")), - keep_trailing_newline=True, - ) - with open(os.path.join(directory, "config.yml"), "w") as f: - f.write( - env.get_template("config.yml.in").render( - jobs=jobs, - workflows_jobs=workflows_jobs, - windows_jobs=windows_jobs, - windows_workflows_jobs=windows_workflows_jobs, - ) - ) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 0392eb3a00d..8c3604b99fb 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -8,4 +8,4 @@ Fixes #ISSUE_NUMBER - [ ] The issue that is being fixed is referred in the description (see above "Fixes #ISSUE_NUMBER") - [ ] Only one issue is addressed in this pull request - [ ] Labels from the issue that this PR is fixing are added to this pull request -- [ ] No unnessessary issues are included into this pull request. +- [ ] No unnecessary issues are included into this pull request. diff --git a/.github/scripts/docathon-label-sync.py b/.github/scripts/docathon-label-sync.py index 597f4b5e034..5da80f24f5b 100644 --- a/.github/scripts/docathon-label-sync.py +++ b/.github/scripts/docathon-label-sync.py @@ -14,6 +14,9 @@ def main(): repo = g.get_repo(f'{repo_owner}/{repo_name}') pull_request = repo.get_pull(pull_request_number) pull_request_body = pull_request.body + # PR without description + if pull_request_body is None: + return # get issue number from the PR body if not re.search(r'#\d{1,5}', pull_request_body): diff --git a/.github/workflows/build-tutorials.yml b/.github/workflows/build-tutorials.yml new file mode 100644 index 00000000000..c242a1897c6 --- /dev/null +++ b/.github/workflows/build-tutorials.yml @@ -0,0 +1,192 @@ +name: Build tutorials + +on: + pull_request: + push: + branches: + - main + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + worker: + name: pytorch_tutorial_build_worker + strategy: + matrix: + include: + - { shard: 1, num_shards: 6, runner: "linux.16xlarge.nvidia.gpu" } + - { shard: 2, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" } + - { shard: 3, num_shards: 6, runner: "linux.4xlarge.nvidia.gpu" } + - { shard: 4, num_shards: 6, runner: "linux.4xlarge.nvidia.gpu" } + - { shard: 5, num_shards: 6, runner: "linux.4xlarge.nvidia.gpu" } + - { shard: 6, num_shards: 6, runner: "linux.4xlarge.nvidia.gpu" } + fail-fast: false + runs-on: ${{ matrix.runner }} + env: + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9" + CUDA_VERSION: "9" + steps: + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@main + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + instructions: | + All testing is done inside the container, to start an interactive session run: + docker exec -it $(docker container ps --format '{{.ID}}') bash + + - name: Checkout Tutorials + uses: actions/checkout@v3 + + - name: Setup Linux + uses: pytorch/pytorch/.github/actions/setup-linux@main + + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + uses: 
pytorch/test-infra/.github/actions/setup-nvidia@main + + - name: Calculate docker image + shell: bash + id: docker-image + run: | + set -ex + + # for some reason, pip installs it in a different place than what is looked at in the py file + pip3 install requests==2.26 + pyTorchDockerImageTag=$(python3 .jenkins/get_docker_tag.py) + + echo "docker-image=${DOCKER_IMAGE}:${pyTorchDockerImageTag}" >> "${GITHUB_OUTPUT}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ steps.docker-image.outputs.docker-image }} + + - name: Build + shell: bash + env: + DOCKER_IMAGE: ${{ steps.docker-image.outputs.docker-image }} + NUM_WORKERS: ${{ matrix.num_shards }} + WORKER_ID: ${{ matrix.shard }} + COMMIT_ID: ${{ github.sha }} + JOB_TYPE: worker + COMMIT_SOURCE: ${{ github.ref }} + run: | + set -ex + + chmod +x ".jenkins/build.sh" + + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e WORKER_ID \ + -e NUM_WORKERS \ + -e COMMIT_ID \ + -e JOB_TYPE \ + -e COMMIT_SOURCE \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --tty \ + --detach \ + --user jenkins \ + --name="${container_name}" \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + + echo "rm /opt/cache/bin/*" | docker exec -u root -i "${container_name}" bash + + docker exec -t "${container_name}" sh -c ".jenkins/build.sh" + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() + + manager: + name: pytorch_tutorial_build_manager + needs: worker + runs-on: [self-hosted, linux.2xlarge] + env: + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9" + CUDA_VERSION: "9" + steps: + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@main + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + instructions: | + All testing is done inside the container, to start an interactive session run: + docker exec -it $(docker container ps --format '{{.ID}}') bash + + - name: Checkout Tutorials + uses: actions/checkout@v3 + + - name: Setup Linux + uses: pytorch/pytorch/.github/actions/setup-linux@main + + - name: Calculate docker image + shell: bash + id: docker-image + run: | + set -ex + + # for some reason, pip installs it in a different place than what is looked at in the py file + pip3 install requests==2.26 + pyTorchDockerImageTag=$(python3 .jenkins/get_docker_tag.py) + + echo "docker-image=${DOCKER_IMAGE}:${pyTorchDockerImageTag}" >> "${GITHUB_OUTPUT}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ steps.docker-image.outputs.docker-image }} + + - name: Build + shell: bash + env: + DOCKER_IMAGE: ${{ steps.docker-image.outputs.docker-image }} + NUM_WORKERS: 6 + WORKER_ID: ${{ matrix.shard }} + COMMIT_ID: ${{ github.sha }} + JOB_TYPE: manager + COMMIT_SOURCE: ${{ github.ref }} + GITHUB_PYTORCHBOT_TOKEN: ${{ secrets.PYTORCHBOT_TOKEN }} + run: | + set -ex + + chmod +x ".jenkins/build.sh" + + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e WORKER_ID \ + -e NUM_WORKERS \ + -e COMMIT_ID \ + -e JOB_TYPE \ + -e COMMIT_SOURCE \ + -e GITHUB_PYTORCHBOT_TOKEN \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --tty \ + --detach \ + --user jenkins \ + --name="${container_name}" \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + + echo "rm 
/opt/cache/bin/*" | docker exec -u root -i "${container_name}" bash + + docker exec -t "${container_name}" sh -c ".jenkins/build.sh" + + - name: Upload docs preview + uses: seemethere/upload-artifact-s3@v5 + if: ${{ github.event_name == 'pull_request' }} + with: + retention-days: 14 + s3-bucket: doc-previews + if-no-files-found: error + path: docs + s3-prefix: pytorch/tutorials/${{ github.event.pull_request.number }} + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() diff --git a/.jenkins/build.sh b/.jenkins/build.sh index d09b0a8782a..f13966ff84b 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -1,10 +1,8 @@ +#!/bin/bash + set -ex -if [[ "$COMMIT_SOURCE" == master || "$COMMIT_SOURCE" == main ]]; then - export BUCKET_NAME=pytorch-tutorial-build-master -else - export BUCKET_NAME=pytorch-tutorial-build-pull-request -fi +export BUCKET_NAME=pytorch-tutorial-build-pull-request # set locale for click dependency in spacy export LC_ALL=C.UTF-8 @@ -25,7 +23,7 @@ pip install -r $DIR/../requirements.txt # Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html # RC Link # pip uninstall -y torch torchvision torchaudio torchtext -# pip install --pre --upgrade -f https://download.pytorch.org/whl/test/cu102/torch_test.html torch torchvision torchaudio torchtext +# pip install --pre --upgrade -f https://download.pytorch.org/whl/test/cu102/torch_test.html torch torchvision torchaudio torchtext # pip uninstall -y torch torchvision torchaudio torchtext # pip install --pre --upgrade -f https://download.pytorch.org/whl/test/cu116/torch_test.html torch torchdata torchvision torchaudio torchtext @@ -37,8 +35,7 @@ awsv2 -i awsv2 configure set default.s3.multipart_threshold 5120MB # Decide whether to parallelize tutorial builds, based on $JOB_BASE_NAME -export NUM_WORKERS=20 -if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then +if [[ "${JOB_TYPE}" == "worker" ]]; then # Step 1: Remove runnable code from tutorials that are not supposed to be run python $DIR/remove_runnable_code.py beginner_source/aws_distributed_training_tutorial.py beginner_source/aws_distributed_training_tutorial.py || true # python $DIR/remove_runnable_code.py advanced_source/ddp_pipeline_tutorial.py advanced_source/ddp_pipeline_tutorial.py || true @@ -47,7 +44,7 @@ if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then # python $DIR/remove_runnable_code.py intermediate_source/spatial_transformer_tutorial.py intermediate_source/spatial_transformer_tutorial.py || true # Temp remove for 1.10 release. # python $DIR/remove_runnable_code.py advanced_source/neural_style_tutorial.py advanced_source/neural_style_tutorial.py || true - + # TODO: Fix bugs in these tutorials to make them runnable again # python $DIR/remove_runnable_code.py beginner_source/audio_classifier_tutorial.py beginner_source/audio_classifier_tutorial.py || true @@ -56,7 +53,6 @@ if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then # Step 2: Keep certain tutorials based on file count, and remove runnable code in all other tutorials # IMPORTANT NOTE: We assume that each tutorial has a UNIQUE filename. 
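The worker jobs pick a disjoint subset of tutorials from the NUM_WORKERS and WORKER_ID variables, which now come from the GitHub Actions matrix instead of being parsed out of JOB_BASE_NAME. A simplified round-robin sketch of that selection follows; the actual calculate_shards used by get_files_to_run.py is not shown in this diff and may balance shards differently:

    import os

    def shard_files(all_files, num_shards, shard_num):
        # shard_num is 1-based, matching the new WORKER_ID convention
        return [f for i, f in enumerate(sorted(all_files))
                if i % num_shards == shard_num - 1]

    num_workers = int(os.environ.get("NUM_WORKERS", "20"))
    worker_id = int(os.environ.get("WORKER_ID", "1"))
    print(shard_files(["a_tutorial.py", "b_tutorial.py", "c_tutorial.py"],
                      num_workers, worker_id))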
- export WORKER_ID=$(echo "${JOB_BASE_NAME}" | tr -dc '0-9') FILES_TO_RUN=$(python .jenkins/get_files_to_run.py) echo "FILES_TO_RUN: " ${FILES_TO_RUN} @@ -116,26 +112,18 @@ if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then # Step 6: Copy generated files to S3, tag with commit ID 7z a worker_${WORKER_ID}.7z docs - awsv2 s3 cp worker_${WORKER_ID}.7z s3://${BUCKET_NAME}/${COMMIT_ID}/worker_${WORKER_ID}.7z --acl public-read -elif [[ "${JOB_BASE_NAME}" == *manager ]]; then + awsv2 s3 cp worker_${WORKER_ID}.7z s3://${BUCKET_NAME}/${COMMIT_ID}/worker_${WORKER_ID}.7z +elif [[ "${JOB_TYPE}" == "manager" ]]; then # Step 1: Generate no-plot HTML pages for all tutorials make html-noplot cp -r _build/html docs # Step 2: Wait for all workers to finish - set +e - for ((worker_id=0;worker_id List[str]: sources = [x.relative_to(REPO_BASE_DIR) for x in REPO_BASE_DIR.glob("*_source/**/*.py") if 'data' not in x.parts] - return [str(x) for x in sources] + return sorted([str(x) for x in sources]) def read_metadata() -> Dict[str, Any]: @@ -87,8 +87,8 @@ def parse_args() -> Any: from argparse import ArgumentParser parser = ArgumentParser("Select files to run") parser.add_argument("--dry-run", action="store_true") - parser.add_argument("--num-shards", type=int, default=int(os.environ.get("NUM_WORKERS", 20))) - parser.add_argument("--shard-num", type=int, default=int(os.environ.get("WORKER_ID", 0))) + parser.add_argument("--num-shards", type=int, default=int(os.environ.get("NUM_WORKERS", "20"))) + parser.add_argument("--shard-num", type=int, default=int(os.environ.get("WORKER_ID", "1"))) return parser.parse_args() @@ -96,7 +96,7 @@ def main() -> None: args = parse_args() all_files = get_all_files() - files_to_run = calculate_shards(all_files, num_shards=args.num_shards)[args.shard_num] + files_to_run = calculate_shards(all_files, num_shards=args.num_shards)[args.shard_num - 1] if not args.dry_run: remove_other_files(all_files, compute_files_to_keep(files_to_run)) stripped_file_names = [Path(x).stem for x in files_to_run] diff --git a/_static/img/seq-seq-images/attention-decoder-network.png b/_static/img/seq-seq-images/attention-decoder-network.png index 243f87c6e97..d31d42a5af1 100755 Binary files a/_static/img/seq-seq-images/attention-decoder-network.png and b/_static/img/seq-seq-images/attention-decoder-network.png differ diff --git a/advanced_source/cpp_frontend.rst b/advanced_source/cpp_frontend.rst index 11033951ece..901658183c7 100644 --- a/advanced_source/cpp_frontend.rst +++ b/advanced_source/cpp_frontend.rst @@ -1216,9 +1216,6 @@ tensors and display them with matplotlib: .. code-block:: python - from __future__ import print_function - from __future__ import unicode_literals - import argparse import matplotlib.pyplot as plt diff --git a/advanced_source/neural_style_tutorial.py b/advanced_source/neural_style_tutorial.py index 3d84fc508bc..ee4dab7e7ec 100644 --- a/advanced_source/neural_style_tutorial.py +++ b/advanced_source/neural_style_tutorial.py @@ -14,7 +14,7 @@ developed by Leon A. Gatys, Alexander S. Ecker and Matthias Bethge. Neural-Style, or Neural-Transfer, allows you to take an image and reproduce it with a new artistic style. The algorithm takes three images, -an input image, a content-image, and a style-image, and changes the input +an input image, a content-image, and a style-image, and changes the input to resemble the content of the content-image and the artistic style of the style-image. 
@@ -47,8 +47,6 @@ # - ``torchvision.models`` (train or load pretrained models) # - ``copy`` (to deep copy the models; system package) -from __future__ import print_function - import torch import torch.nn as nn import torch.nn.functional as F @@ -72,6 +70,7 @@ # method is used to move tensors or modules to a desired device. device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +torch.set_default_device(device) ###################################################################### # Loading the Images @@ -263,7 +262,7 @@ def forward(self, input): # network to evaluation mode using ``.eval()``. # -cnn = models.vgg19(pretrained=True).features.to(device).eval() +cnn = models.vgg19(pretrained=True).features.eval() @@ -273,8 +272,8 @@ def forward(self, input): # We will use them to normalize the image before sending it into the network. # -cnn_normalization_mean = torch.tensor([0.485, 0.456, 0.406]).to(device) -cnn_normalization_std = torch.tensor([0.229, 0.224, 0.225]).to(device) +cnn_normalization_mean = torch.tensor([0.485, 0.456, 0.406]) +cnn_normalization_std = torch.tensor([0.229, 0.224, 0.225]) # create a module to normalize input image so we can easily put it in a # ``nn.Sequential`` @@ -310,7 +309,7 @@ def get_style_model_and_losses(cnn, normalization_mean, normalization_std, content_layers=content_layers_default, style_layers=style_layers_default): # normalization module - normalization = Normalization(normalization_mean, normalization_std).to(device) + normalization = Normalization(normalization_mean, normalization_std) # just in order to have an iterable access to or list of content/style # losses @@ -375,7 +374,7 @@ def get_style_model_and_losses(cnn, normalization_mean, normalization_std, # # :: # -# input_img = torch.randn(content_img.data.size(), device=device) +# input_img = torch.randn(content_img.data.size()) # add the original input image to the figure: plt.figure() @@ -423,6 +422,9 @@ def run_style_transfer(cnn, normalization_mean, normalization_std, # We want to optimize the input and not the model parameters so we # update all the requires_grad fields accordingly input_img.requires_grad_(True) + # We also put the model in evaluation mode, so that specific layers + # such as dropout or batch normalization layers behave correctly. + model.eval() model.requires_grad_(False) optimizer = get_input_optimizer(input_img) diff --git a/advanced_source/rpc_ddp_tutorial.rst b/advanced_source/rpc_ddp_tutorial.rst index 747c12f6d4f..5c7aeffb2f9 100644 --- a/advanced_source/rpc_ddp_tutorial.rst +++ b/advanced_source/rpc_ddp_tutorial.rst @@ -1,6 +1,6 @@ Combining Distributed DataParallel with Distributed RPC Framework ================================================================= -**Authors**: `Pritam Damania `_ and `Yi Wang `_ +**Authors**: `Pritam Damania `_ and `Yi Wang `_ .. note:: |edit| View and edit this tutorial in `github `__. diff --git a/advanced_source/super_resolution_with_onnxruntime.py b/advanced_source/super_resolution_with_onnxruntime.py index eb184e85109..835a79bd3a0 100644 --- a/advanced_source/super_resolution_with_onnxruntime.py +++ b/advanced_source/super_resolution_with_onnxruntime.py @@ -16,10 +16,7 @@ and `ONNX Runtime `__. You can get binary builds of ONNX and ONNX Runtime with ``pip install onnx onnxruntime``. -Note that ONNX Runtime is compatible with Python versions 3.5 to 3.7. 
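The .to(device) calls removed from the style-transfer code above become redundant once torch.set_default_device(device) is in place: factory functions such as torch.tensor and torch.randn then allocate on that device by default. A minimal sketch of the effect:

    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    torch.set_default_device(device)

    # Both tensors land on `device` without an explicit .to(device)
    mean = torch.tensor([0.485, 0.456, 0.406])
    noise = torch.randn(3, 128, 128)
    print(mean.device, noise.device)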
- -``NOTE``: This tutorial needs PyTorch master branch which can be installed by following -the instructions `here `__ +ONNX Runtime recommends using the latest stable runtime for PyTorch. """ diff --git a/beginner_source/Intro_to_TorchScript_tutorial.py b/beginner_source/Intro_to_TorchScript_tutorial.py index d369c4fbf80..21ee32ff384 100644 --- a/beginner_source/Intro_to_TorchScript_tutorial.py +++ b/beginner_source/Intro_to_TorchScript_tutorial.py @@ -33,6 +33,7 @@ import torch # This is all you need to use both PyTorch and TorchScript! print(torch.__version__) +torch.manual_seed(191009) # set the seed for reproducibility ###################################################################### @@ -308,7 +309,7 @@ def forward(self, x, h): # New inputs x, h = torch.rand(3, 4), torch.rand(3, 4) -traced_cell(x, h) +print(scripted_cell(x, h)) ###################################################################### diff --git a/beginner_source/basics/optimization_tutorial.py b/beginner_source/basics/optimization_tutorial.py index 0fb508d1ccc..a1603510b96 100644 --- a/beginner_source/basics/optimization_tutorial.py +++ b/beginner_source/basics/optimization_tutorial.py @@ -149,6 +149,9 @@ def forward(self, x): def train_loop(dataloader, model, loss_fn, optimizer): size = len(dataloader.dataset) + # Set the model to training mode - important for batch normalization and dropout layers + # Unnecessary in this situation but added for best practices + model.train() for batch, (X, y) in enumerate(dataloader): # Compute prediction and loss pred = model(X) @@ -165,10 +168,15 @@ def train_loop(dataloader, model, loss_fn, optimizer): def test_loop(dataloader, model, loss_fn): + # Set the model to evaluation mode - important for batch normalization and dropout layers + # Unnecessary in this situation but added for best practices + model.eval() size = len(dataloader.dataset) num_batches = len(dataloader) test_loss, correct = 0, 0 + # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode + # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True with torch.no_grad(): for X, y in dataloader: pred = model(X) diff --git a/beginner_source/chatbot_tutorial.py b/beginner_source/chatbot_tutorial.py index 02185a6ba3e..44310cc3620 100644 --- a/beginner_source/chatbot_tutorial.py +++ b/beginner_source/chatbot_tutorial.py @@ -92,11 +92,6 @@ # After that, let’s import some necessities. 
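The comments added to optimization_tutorial.py above describe a general pattern: call model.train() before the optimization steps, and pair model.eval() with torch.no_grad() during evaluation so dropout and batch-normalization layers behave correctly and no autograd state is retained. A condensed sketch of the evaluation side, with model, loss_fn and dataloader as placeholders:

    import torch

    def evaluate(dataloader, model, loss_fn):
        model.eval()              # dropout off, batch norm uses running statistics
        total_loss = 0.0
        with torch.no_grad():     # no gradient graph is built, reducing memory use
            for X, y in dataloader:
                total_loss += loss_fn(model(X), y).item()
        return total_loss / len(dataloader)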
# -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - import torch from torch.jit import script, trace import torch.nn as nn diff --git a/beginner_source/data_loading_tutorial.py b/beginner_source/data_loading_tutorial.py index 322d9b3009c..7ec18236b33 100644 --- a/beginner_source/data_loading_tutorial.py +++ b/beginner_source/data_loading_tutorial.py @@ -18,7 +18,6 @@ """ -from __future__ import print_function, division import os import torch import pandas as pd @@ -165,9 +164,7 @@ def __getitem__(self, idx): fig = plt.figure() -for i in range(len(face_dataset)): - sample = face_dataset[i] - +for i, sample in enumerate(face_dataset): print(i, sample['image'].shape, sample['landmarks'].shape) ax = plt.subplot(1, 4, i + 1) @@ -268,8 +265,8 @@ def __call__(self, sample): h, w = image.shape[:2] new_h, new_w = self.output_size - top = np.random.randint(0, h - new_h) - left = np.random.randint(0, w - new_w) + top = np.random.randint(0, h - new_h + 1) + left = np.random.randint(0, w - new_w + 1) image = image[top: top + new_h, left: left + new_w] @@ -294,7 +291,7 @@ def __call__(self, sample): ###################################################################### # .. note:: -# In the example above, `RandomCrop` uses an external library's random number generator +# In the example above, `RandomCrop` uses an external library's random number generator # (in this case, Numpy's `np.random.int`). This can result in unexpected behavior with `DataLoader` # (see `here `_). # In practice, it is safer to stick to PyTorch's random number generator, e.g. by using `torch.randint` instead. @@ -356,9 +353,7 @@ def __call__(self, sample): ToTensor() ])) -for i in range(len(transformed_dataset)): - sample = transformed_dataset[i] - +for i, sample in enumerate(transformed_dataset): print(i, sample['image'].size(), sample['landmarks'].size()) if i == 3: diff --git a/beginner_source/dcgan_faces_tutorial.py b/beginner_source/dcgan_faces_tutorial.py index d98683741e5..1a1f9c38606 100644 --- a/beginner_source/dcgan_faces_tutorial.py +++ b/beginner_source/dcgan_faces_tutorial.py @@ -112,7 +112,6 @@ # will be explained in the coming sections. # -from __future__ import print_function #%matplotlib inline import argparse import os @@ -120,7 +119,6 @@ import torch import torch.nn as nn import torch.nn.parallel -import torch.backends.cudnn as cudnn import torch.optim as optim import torch.utils.data import torchvision.datasets as dset @@ -137,6 +135,7 @@ print("Random Seed: ", manualSeed) random.seed(manualSeed) torch.manual_seed(manualSeed) +torch.use_deterministic_algorithms(True) # Needed for reproducible results ###################################################################### diff --git a/beginner_source/deep_learning_60min_blitz.rst b/beginner_source/deep_learning_60min_blitz.rst index 09ac232cc49..6c96c403455 100644 --- a/beginner_source/deep_learning_60min_blitz.rst +++ b/beginner_source/deep_learning_60min_blitz.rst @@ -20,11 +20,12 @@ Goal of this tutorial: - Understand PyTorch’s Tensor library and neural networks at a high level. - Train a small neural network to classify images -To run the tutorials below, make sure you have the `torch`_ and `torchvision`_ -packages installed. +To run the tutorials below, make sure you have the `torch`_, `torchvision`_, +and `matplotlib`_ packages installed. .. _torch: https://github.com/pytorch/pytorch .. _torchvision: https://github.com/pytorch/vision +.. 
_matplotlib: https://github.com/matplotlib/matplotlib .. toctree:: :hidden: diff --git a/beginner_source/deploy_seq2seq_hybrid_frontend_tutorial.py b/beginner_source/deploy_seq2seq_hybrid_frontend_tutorial.py index 5e985b58598..508fa5a057a 100644 --- a/beginner_source/deploy_seq2seq_hybrid_frontend_tutorial.py +++ b/beginner_source/deploy_seq2seq_hybrid_frontend_tutorial.py @@ -101,11 +101,6 @@ # maximum length output that the model is capable of producing. # -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - import torch import torch.nn as nn import torch.nn.functional as F diff --git a/beginner_source/examples_autograd/polynomial_autograd.py b/beginner_source/examples_autograd/polynomial_autograd.py index 05744ff560c..9c992d2ca4d 100755 --- a/beginner_source/examples_autograd/polynomial_autograd.py +++ b/beginner_source/examples_autograd/polynomial_autograd.py @@ -18,23 +18,23 @@ import math dtype = torch.float -device = torch.device("cpu") -# device = torch.device("cuda:0") # Uncomment this to run on GPU +device = "cuda" if torch.cuda.is_available() else "cpu" +torch.set_default_device(device) # Create Tensors to hold input and outputs. # By default, requires_grad=False, which indicates that we do not need to # compute gradients with respect to these Tensors during the backward pass. -x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype) +x = torch.linspace(-math.pi, math.pi, 2000, dtype=dtype) y = torch.sin(x) # Create random Tensors for weights. For a third order polynomial, we need # 4 weights: y = a + b x + c x^2 + d x^3 # Setting requires_grad=True indicates that we want to compute gradients with # respect to these Tensors during the backward pass. -a = torch.randn((), device=device, dtype=dtype, requires_grad=True) -b = torch.randn((), device=device, dtype=dtype, requires_grad=True) -c = torch.randn((), device=device, dtype=dtype, requires_grad=True) -d = torch.randn((), device=device, dtype=dtype, requires_grad=True) +a = torch.randn((), dtype=dtype, requires_grad=True) +b = torch.randn((), dtype=dtype, requires_grad=True) +c = torch.randn((), dtype=dtype, requires_grad=True) +d = torch.randn((), dtype=dtype, requires_grad=True) learning_rate = 1e-6 for t in range(2000): diff --git a/beginner_source/fgsm_tutorial.py b/beginner_source/fgsm_tutorial.py index fa23680496c..6071cb2fb35 100644 --- a/beginner_source/fgsm_tutorial.py +++ b/beginner_source/fgsm_tutorial.py @@ -90,7 +90,6 @@ # into the implementation. 
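FGSM perturbs the input in the direction of the sign of the loss gradient, scaled by epsilon. A minimal sketch of that core step; the tutorial's own fgsm_attack helper is not part of this diff:

    import torch

    def fgsm_step(image, epsilon, data_grad):
        # Nudge every pixel by epsilon in the direction that increases the loss,
        # then clamp back to the valid [0, 1] image range.
        perturbed = image + epsilon * data_grad.sign()
        return torch.clamp(perturbed, 0, 1)

    example = fgsm_step(torch.rand(1, 1, 28, 28), 0.25, torch.randn(1, 1, 28, 28))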
# -from __future__ import print_function import torch import torch.nn as nn import torch.nn.functional as F @@ -99,13 +98,6 @@ import numpy as np import matplotlib.pyplot as plt -# NOTE: This is a hack to get around "User-agent" limitations when downloading MNIST datasets -# see, https://github.com/pytorch/vision/issues/3497 for more information -from six.moves import urllib -opener = urllib.request.build_opener() -opener.addheaders = [('User-agent', 'Mozilla/5.0')] -urllib.request.install_opener(opener) - ###################################################################### # Implementation @@ -141,6 +133,8 @@ epsilons = [0, .05, .1, .15, .2, .25, .3] pretrained_model = "data/lenet_mnist_model.pth" use_cuda=True +# Set random seed for reproducibility +torch.manual_seed(42) ###################################################################### @@ -179,18 +173,18 @@ def forward(self, x): test_loader = torch.utils.data.DataLoader( datasets.MNIST('../data', train=False, download=True, transform=transforms.Compose([ transforms.ToTensor(), - ])), + ])), batch_size=1, shuffle=True) # Define what device we are using print("CUDA Available: ",torch.cuda.is_available()) -device = torch.device("cuda" if (use_cuda and torch.cuda.is_available()) else "cpu") +device = torch.device("cuda" if use_cuda and torch.cuda.is_available() else "cpu") # Initialize the network model = Net().to(device) # Load the pretrained model -model.load_state_dict(torch.load(pretrained_model, map_location='cpu')) +model.load_state_dict(torch.load(pretrained_model, weights_only=True, map_location='cpu')) # Set the model in evaluation mode. In this case this is for the Dropout layers model.eval() @@ -290,7 +284,7 @@ def test( model, device, test_loader, epsilon ): if final_pred.item() == target.item(): correct += 1 # Special case for saving 0 epsilon examples - if (epsilon == 0) and (len(adv_examples) < 5): + if epsilon == 0 and len(adv_examples) < 5: adv_ex = perturbed_data.squeeze().detach().cpu().numpy() adv_examples.append( (init_pred.item(), final_pred.item(), adv_ex) ) else: @@ -301,7 +295,7 @@ def test( model, device, test_loader, epsilon ): # Calculate final accuracy for this epsilon final_acc = correct/float(len(test_loader)) - print("Epsilon: {}\tTest Accuracy = {} / {} = {}".format(epsilon, correct, len(test_loader), final_acc)) + print(f"Epsilon: {epsilon}\tTest Accuracy = {correct} / {len(test_loader)} = {final_acc}") # Return the accuracy and an adversarial example return final_acc, adv_examples @@ -387,9 +381,9 @@ def test( model, device, test_loader, epsilon ): plt.xticks([], []) plt.yticks([], []) if j == 0: - plt.ylabel("Eps: {}".format(epsilons[i]), fontsize=14) + plt.ylabel(f"Eps: {epsilons[i]}", fontsize=14) orig,adv,ex = examples[i][j] - plt.title("{} -> {}".format(orig, adv)) + plt.title(f"{orig} -> {adv}") plt.imshow(ex, cmap="gray") plt.tight_layout() plt.show() diff --git a/beginner_source/finetuning_torchvision_models_tutorial.rst b/beginner_source/finetuning_torchvision_models_tutorial.rst new file mode 100644 index 00000000000..711f4b0f99b --- /dev/null +++ b/beginner_source/finetuning_torchvision_models_tutorial.rst @@ -0,0 +1,10 @@ +Finetuning Torchvision Models +============================= + +This tutorial has been moved to https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html + +It will redirect in 3 seconds. + +.. 
raw:: html + + diff --git a/beginner_source/former_torchies/parallelism_tutorial.py b/beginner_source/former_torchies/parallelism_tutorial.py index 18c14c43167..a11d844e1bd 100644 --- a/beginner_source/former_torchies/parallelism_tutorial.py +++ b/beginner_source/former_torchies/parallelism_tutorial.py @@ -53,7 +53,10 @@ def forward(self, x): class MyDataParallel(nn.DataParallel): def __getattr__(self, name): - return getattr(self.module, name) + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self.module, name) ######################################################################## # **Primitives on which DataParallel is implemented upon:** diff --git a/beginner_source/introyt/captumyt.py b/beginner_source/introyt/captumyt.py index 2ff8e9e70b1..cf63b6109b6 100644 --- a/beginner_source/introyt/captumyt.py +++ b/beginner_source/introyt/captumyt.py @@ -98,21 +98,24 @@ Before you get started, you need to have a Python environment with: - Python version 3.6 or higher -- For the Captum Insights example, Flask 1.1 or higher +- For the Captum Insights example, Flask 1.1 or higher and Flask-Compress + (the latest version is recommended) - PyTorch version 1.2 or higher (the latest version is recommended) - TorchVision version 0.6 or higher (the latest version is recommended) - Captum (the latest version is recommended) +- Matplotlib version 3.3.4, since Captum currently uses a Matplotlib + function whose arguments have been renamed in later versions To install Captum in an Anaconda or pip virtual environment, use the appropriate command for your environment below: With ``conda``:: - conda install pytorch torchvision captum -c pytorch + conda install pytorch torchvision captum flask-compress matplotlib=3.3.4 -c pytorch With ``pip``:: - pip install torch torchvision captum + pip install torch torchvision captum matplotlib==3.3.4 Flask-Compress Restart this notebook in the environment you set up, and you’re ready to go! @@ -155,7 +158,7 @@ # now. # -model = models.resnet101(weights='IMAGENET1K_V1') +model = models.resnet18(weights='IMAGENET1K_V1') model = model.eval() diff --git a/beginner_source/introyt/introyt1_tutorial.py b/beginner_source/introyt/introyt1_tutorial.py index f52c3902c03..a5d65bcab16 100644 --- a/beginner_source/introyt/introyt1_tutorial.py +++ b/beginner_source/introyt/introyt1_tutorial.py @@ -288,7 +288,7 @@ def num_flat_features(self, x): transform = transforms.Compose( [transforms.ToTensor(), - transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616))]) ########################################################################## @@ -297,9 +297,28 @@ def num_flat_features(self, x): # - ``transforms.ToTensor()`` converts images loaded by Pillow into # PyTorch tensors. # - ``transforms.Normalize()`` adjusts the values of the tensor so -# that their average is zero and their standard deviation is 0.5. Most +# that their average is zero and their standard deviation is 1.0. Most # activation functions have their strongest gradients around x = 0, so # centering our data there can speed learning. +# The values passed to the transform are the means (first tuple) and the +# standard deviations (second tuple) of the rgb values of the images in +# the dataset. 
You can calculate these values yourself by running these +# few lines of code: +# ``` +# from torch.utils.data import ConcatDataset +# transform = transforms.Compose([transforms.ToTensor()]) +# trainset = torchvision.datasets.CIFAR10(root='./data', train=True, +# download=True, transform=transform) +# +# #stack all train images together into a tensor of shape +# #(50000, 3, 32, 32) +# x = torch.stack([sample[0] for sample in ConcatDataset([trainset])]) +# +# #get the mean of each channel +# mean = torch.mean(x, dim=(0,2,3)) #tensor([0.4914, 0.4822, 0.4465]) +# std = torch.std(x, dim=(0,2,3)) #tensor([0.2470, 0.2435, 0.2616]) +# +# ``` # # There are many more transforms available, including cropping, centering, # rotation, and reflection. diff --git a/beginner_source/introyt/tensorboardyt_tutorial.py b/beginner_source/introyt/tensorboardyt_tutorial.py index 4c7c356fd0c..29e83066726 100644 --- a/beginner_source/introyt/tensorboardyt_tutorial.py +++ b/beginner_source/introyt/tensorboardyt_tutorial.py @@ -64,6 +64,13 @@ # PyTorch TensorBoard support from torch.utils.tensorboard import SummaryWriter +# In case you are using an environment that has TensorFlow installed, +# such as Google Colab, uncomment the following code to avoid +# a bug with saving embeddings to your TensorBoard directory + +# import tensorflow as tf +# import tensorboard as tb +# tf.io.gfile = tb.compat.tensorflow_stub.io.gfile ###################################################################### # Showing Images in TensorBoard diff --git a/beginner_source/introyt/trainingyt.py b/beginner_source/introyt/trainingyt.py index 929e06c1b57..d9f585411e8 100644 --- a/beginner_source/introyt/trainingyt.py +++ b/beginner_source/introyt/trainingyt.py @@ -290,15 +290,19 @@ def train_one_epoch(epoch_index, tb_writer): model.train(True) avg_loss = train_one_epoch(epoch_number, writer) - # We don't need gradients on to do reporting - model.train(False) - + running_vloss = 0.0 - for i, vdata in enumerate(validation_loader): - vinputs, vlabels = vdata - voutputs = model(vinputs) - vloss = loss_fn(voutputs, vlabels) - running_vloss += vloss + # Set the model to evaluation mode, disabling dropout and using population + # statistics for batch normalization. + model.eval() + + # Disable gradient computation and reduce memory consumption. 
+ with torch.no_grad(): + for i, vdata in enumerate(validation_loader): + vinputs, vlabels = vdata + voutputs = model(vinputs) + vloss = loss_fn(voutputs, vlabels) + running_vloss += vloss avg_vloss = running_vloss / (i + 1) print('LOSS train {} valid {}'.format(avg_loss, avg_vloss)) diff --git a/beginner_source/nn_tutorial.py b/beginner_source/nn_tutorial.py index bc32131b93a..183aca1748b 100644 --- a/beginner_source/nn_tutorial.py +++ b/beginner_source/nn_tutorial.py @@ -75,6 +75,11 @@ import numpy as np pyplot.imshow(x_train[0].reshape((28, 28)), cmap="gray") +# ``pyplot.show()`` only if not on Colab +try: + import google.colab +except ImportError: + pyplot.show() print(x_train.shape) ############################################################################### @@ -790,8 +795,7 @@ def __len__(self): return len(self.dl) def __iter__(self): - batches = iter(self.dl) - for b in batches: + for b in self.dl: yield (self.func(*b)) train_dl, valid_dl = get_data(train_ds, valid_ds, bs) diff --git a/beginner_source/transfer_learning_tutorial.py b/beginner_source/transfer_learning_tutorial.py index b4460bb4fb2..7a2b053763a 100644 --- a/beginner_source/transfer_learning_tutorial.py +++ b/beginner_source/transfer_learning_tutorial.py @@ -33,8 +33,6 @@ # License: BSD # Author: Sasank Chilamkurthy -from __future__ import print_function, division - import torch import torch.nn as nn import torch.optim as optim @@ -46,7 +44,8 @@ import matplotlib.pyplot as plt import time import os -import copy +from PIL import Image +from tempfile import TemporaryDirectory cudnn.benchmark = True plt.ion() # interactive mode @@ -146,67 +145,71 @@ def imshow(inp, title=None): def train_model(model, criterion, optimizer, scheduler, num_epochs=25): since = time.time() - best_model_wts = copy.deepcopy(model.state_dict()) - best_acc = 0.0 - - for epoch in range(num_epochs): - print(f'Epoch {epoch}/{num_epochs - 1}') - print('-' * 10) - - # Each epoch has a training and validation phase - for phase in ['train', 'val']: - if phase == 'train': - model.train() # Set model to training mode - else: - model.eval() # Set model to evaluate mode - - running_loss = 0.0 - running_corrects = 0 - - # Iterate over data. 
- for inputs, labels in dataloaders[phase]: - inputs = inputs.to(device) - labels = labels.to(device) - - # zero the parameter gradients - optimizer.zero_grad() - - # forward - # track history if only in train - with torch.set_grad_enabled(phase == 'train'): - outputs = model(inputs) - _, preds = torch.max(outputs, 1) - loss = criterion(outputs, labels) - - # backward + optimize only if in training phase - if phase == 'train': - loss.backward() - optimizer.step() - - # statistics - running_loss += loss.item() * inputs.size(0) - running_corrects += torch.sum(preds == labels.data) - if phase == 'train': - scheduler.step() - - epoch_loss = running_loss / dataset_sizes[phase] - epoch_acc = running_corrects.double() / dataset_sizes[phase] - - print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}') - - # deep copy the model - if phase == 'val' and epoch_acc > best_acc: - best_acc = epoch_acc - best_model_wts = copy.deepcopy(model.state_dict()) - - print() - - time_elapsed = time.time() - since - print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s') - print(f'Best val Acc: {best_acc:4f}') - - # load best model weights - model.load_state_dict(best_model_wts) + # Create a temporary directory to save training checkpoints + with TemporaryDirectory() as tempdir: + best_model_params_path = os.path.join(tempdir, 'best_model_params.pt') + + torch.save(model.state_dict(), best_model_params_path) + best_acc = 0.0 + + for epoch in range(num_epochs): + print(f'Epoch {epoch}/{num_epochs - 1}') + print('-' * 10) + + # Each epoch has a training and validation phase + for phase in ['train', 'val']: + if phase == 'train': + model.train() # Set model to training mode + else: + model.eval() # Set model to evaluate mode + + running_loss = 0.0 + running_corrects = 0 + + # Iterate over data. + for inputs, labels in dataloaders[phase]: + inputs = inputs.to(device) + labels = labels.to(device) + + # zero the parameter gradients + optimizer.zero_grad() + + # forward + # track history if only in train + with torch.set_grad_enabled(phase == 'train'): + outputs = model(inputs) + _, preds = torch.max(outputs, 1) + loss = criterion(outputs, labels) + + # backward + optimize only if in training phase + if phase == 'train': + loss.backward() + optimizer.step() + + # statistics + running_loss += loss.item() * inputs.size(0) + running_corrects += torch.sum(preds == labels.data) + if phase == 'train': + scheduler.step() + + epoch_loss = running_loss / dataset_sizes[phase] + epoch_acc = running_corrects.double() / dataset_sizes[phase] + + print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}') + + # deep copy the model + if phase == 'val' and epoch_acc > best_acc: + best_acc = epoch_acc + torch.save(model.state_dict(), best_model_params_path) + + print() + + time_elapsed = time.time() - since + print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s') + print(f'Best val Acc: {best_acc:4f}') + + # load best model weights + model.load_state_dict(torch.load(best_model_params_path)) return model @@ -335,6 +338,47 @@ def visualize_model(model, num_images=6): plt.ioff() plt.show() + +###################################################################### +# Inference on custom images +# -------------------------- +# +# Use the trained model to make predictions on custom images and visualize +# the predicted class labels along with the images. 
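The train_model rewrite above swaps copy.deepcopy of the best weights for a checkpoint file in a TemporaryDirectory, which avoids holding a second full copy of the parameters in memory and is cleaned up automatically. The pattern in isolation, with a stand-in model and accuracy values:

    import os
    import torch
    from tempfile import TemporaryDirectory

    model = torch.nn.Linear(4, 2)              # stand-in for the real model
    epoch_accuracies = [0.71, 0.85, 0.82]      # stand-in validation accuracies

    with TemporaryDirectory() as tempdir:
        best_path = os.path.join(tempdir, "best_model_params.pt")
        torch.save(model.state_dict(), best_path)
        best_acc = 0.0
        for acc in epoch_accuracies:           # real training/validation loop goes here
            if acc > best_acc:
                best_acc = acc
                torch.save(model.state_dict(), best_path)
        model.load_state_dict(torch.load(best_path))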
+# + +def visualize_model_predictions(model,img_path): + was_training = model.training + model.eval() + + img = Image.open(img_path) + img = data_transforms['val'](img) + img = img.unsqueeze(0) + img = img.to(device) + + with torch.no_grad(): + outputs = model(img) + _, preds = torch.max(outputs, 1) + + ax = plt.subplot(2,2,1) + ax.axis('off') + ax.set_title(f'Predicted: {class_names[preds[0]]}') + imshow(img.cpu().data[0]) + + model.train(mode=was_training) + +###################################################################### +# + +visualize_model_predictions( + model_conv, + img_path='data/hymenoptera_data/val/bees/72100438_73de9f17af.jpg' +) + +plt.ioff() +plt.show() + + ###################################################################### # Further Learning # ----------------- diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py index d93b3d55fe7..d7ebee959e5 100644 --- a/beginner_source/transformer_tutorial.py +++ b/beginner_source/transformer_tutorial.py @@ -2,7 +2,7 @@ Language Modeling with ``nn.Transformer`` and torchtext =============================================================== -This is a tutorial on training a sequence-to-sequence model that uses the +This is a tutorial on training a model to predict the next word in a sequence using the `nn.Transformer `__ module. The PyTorch 1.2 release includes a standard transformer module based on the @@ -29,7 +29,9 @@ ###################################################################### # In this tutorial, we train a ``nn.TransformerEncoder`` model on a -# language modeling task. The language modeling task is to assign a +# language modeling task. Please note that this tutorial does not cover +# the training of `nn.TransformerDecoder `__, as depicted in +# the right half of the diagram above. The language modeling task is to assign a # probability for the likelihood of a given word (or a sequence of words) # to follow a sequence of words. A sequence of tokens are passed to the embedding # layer first, followed by a positional encoding layer to account for the order @@ -37,11 +39,14 @@ # ``nn.TransformerEncoder`` consists of multiple layers of # `nn.TransformerEncoderLayer `__. # Along with the input sequence, a square attention mask is required because the -# self-attention layers in ``nn.TransformerEncoder`` are only allowed to attend +# self-attention layers in ``nn.TransformerDecoder`` are only allowed to attend # the earlier positions in the sequence. For the language modeling task, any # tokens on the future positions should be masked. To produce a probability # distribution over output words, the output of the ``nn.TransformerEncoder`` -# model is passed through a linear layer followed by a log-softmax function. +# model is passed through a linear layer to output unnormalized logits. +# The log-softmax function isn't applied here due to the later use of +# `CrossEntropyLoss `__, +# which requires the inputs to be unnormalized logits. 
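As a quick standalone check (not part of the tutorial's diff), ``nn.CrossEntropyLoss`` applied to raw logits is equivalent to ``log_softmax`` followed by ``nn.NLLLoss``, which is why the model above can skip the explicit log-softmax:

.. code:: python

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    logits = torch.randn(3, 5)             # unnormalized model outputs
    targets = torch.tensor([1, 0, 4])

    # CrossEntropyLoss fuses log-softmax and NLLLoss, so it expects raw logits.
    loss_ce = nn.CrossEntropyLoss()(logits, targets)
    loss_nll = nn.NLLLoss()(F.log_softmax(logits, dim=-1), targets)
    assert torch.allclose(loss_ce, loss_nll)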
# import math @@ -51,7 +56,6 @@ import torch from torch import nn, Tensor -import torch.nn.functional as F from torch.nn import TransformerEncoder, TransformerEncoderLayer from torch.utils.data import dataset @@ -64,19 +68,19 @@ def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int, self.pos_encoder = PositionalEncoding(d_model, dropout) encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout) self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) - self.encoder = nn.Embedding(ntoken, d_model) + self.embedding = nn.Embedding(ntoken, d_model) self.d_model = d_model - self.decoder = nn.Linear(d_model, ntoken) + self.linear = nn.Linear(d_model, ntoken) self.init_weights() def init_weights(self) -> None: initrange = 0.1 - self.encoder.weight.data.uniform_(-initrange, initrange) - self.decoder.bias.data.zero_() - self.decoder.weight.data.uniform_(-initrange, initrange) + self.embedding.weight.data.uniform_(-initrange, initrange) + self.linear.bias.data.zero_() + self.linear.weight.data.uniform_(-initrange, initrange) - def forward(self, src: Tensor, src_mask: Tensor) -> Tensor: + def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor: """ Arguments: src: Tensor, shape ``[seq_len, batch_size]`` @@ -85,18 +89,13 @@ def forward(self, src: Tensor, src_mask: Tensor) -> Tensor: Returns: output Tensor of shape ``[seq_len, batch_size, ntoken]`` """ - src = self.encoder(src) * math.sqrt(self.d_model) + src = self.embedding(src) * math.sqrt(self.d_model) src = self.pos_encoder(src) output = self.transformer_encoder(src, src_mask) - output = self.decoder(output) + output = self.linear(output) return output -def generate_square_subsequent_mask(sz: int) -> Tensor: - """Generates an upper-triangular matrix of ``-inf``, with zeros on ``diag``.""" - return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1) - - ###################################################################### # ``PositionalEncoding`` module injects some information about the # relative or absolute position of the tokens in the sequence. The @@ -140,6 +139,7 @@ def forward(self, x: Tensor) -> Tensor: # .. code-block:: bash # # %%bash +# pip install portalocker # pip install torchdata # # The vocab object is built based on the train dataset and is used to numericalize @@ -149,7 +149,7 @@ def forward(self, x: Tensor) -> Tensor: # into ``batch_size`` columns. If the data does not divide evenly into # ``batch_size`` columns, then the data is trimmed to fit. For instance, with # the alphabet as the data (total length of 26) and ``batch_size=4``, we would -# divide the alphabet into 4 sequences of length 6: +# divide the alphabet into sequences of length 6, resulting in 4 of such sequences. # # .. math:: # \begin{bmatrix} @@ -286,7 +286,6 @@ def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]: # to prevent gradients from exploding. # -import copy import time criterion = nn.CrossEntropyLoss() @@ -299,16 +298,13 @@ def train(model: nn.Module) -> None: total_loss = 0. 
log_interval = 200 start_time = time.time() - src_mask = generate_square_subsequent_mask(bptt).to(device) num_batches = len(train_data) // bptt for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)): data, targets = get_batch(train_data, i) - seq_len = data.size(0) - if seq_len != bptt: # only on last batch - src_mask = src_mask[:seq_len, :seq_len] - output = model(data, src_mask) - loss = criterion(output.view(-1, ntokens), targets) + output = model(data) + output_flat = output.view(-1, ntokens) + loss = criterion(output_flat, targets) optimizer.zero_grad() loss.backward() @@ -330,14 +326,11 @@ def train(model: nn.Module) -> None: def evaluate(model: nn.Module, eval_data: Tensor) -> float: model.eval() # turn on evaluation mode total_loss = 0. - src_mask = generate_square_subsequent_mask(bptt).to(device) with torch.no_grad(): for i in range(0, eval_data.size(0) - 1, bptt): data, targets = get_batch(eval_data, i) seq_len = data.size(0) - if seq_len != bptt: - src_mask = src_mask[:seq_len, :seq_len] - output = model(data, src_mask) + output = model(data) output_flat = output.view(-1, ntokens) total_loss += seq_len * criterion(output_flat, targets).item() return total_loss / (len(eval_data) - 1) diff --git a/conf.py b/conf.py index eaa25a956c6..5f88045adb3 100644 --- a/conf.py +++ b/conf.py @@ -34,6 +34,7 @@ import pytorch_sphinx_theme import torch import glob +import random import shutil from custom_directives import IncludeDirective, GalleryItemDirective, CustomGalleryItemDirective, CustomCalloutItemDirective, CustomCardItemDirective import distutils.file_util @@ -85,6 +86,11 @@ # -- Sphinx-gallery configuration -------------------------------------------- +def reset_seeds(gallery_conf, fname): + torch.manual_seed(42) + torch.set_default_device(None) + random.seed(10) + sphinx_gallery_conf = { 'examples_dirs': ['beginner_source', 'intermediate_source', 'advanced_source', 'recipes_source', 'prototype_source'], @@ -94,7 +100,8 @@ 'backreferences_dir': None, 'first_notebook_cell': ("# For tips on running notebooks in Google Colab, see\n" "# https://pytorch.org/tutorials/beginner/colab\n" - "%matplotlib inline") + "%matplotlib inline"), + 'reset_modules': (reset_seeds) } if os.getenv('GALLERY_PATTERN'): diff --git a/en-wordlist.txt b/en-wordlist.txt index 145d270e7b2..70927ee2b5a 100644 --- a/en-wordlist.txt +++ b/en-wordlist.txt @@ -89,6 +89,7 @@ LeNet LeakyReLU LeakyReLUs Lipschitz +logits Lua Luong MLP diff --git a/intermediate_source/FSDP_adavnced_tutorial.rst b/intermediate_source/FSDP_adavnced_tutorial.rst index cce90e8787e..748c8593306 100644 --- a/intermediate_source/FSDP_adavnced_tutorial.rst +++ b/intermediate_source/FSDP_adavnced_tutorial.rst @@ -75,7 +75,7 @@ highlight different available features in FSDP that are helpful for training large scale model above 3B parameters. Also, we cover specific features for Transformer based models. The code for this tutorial is available in `Pytorch Examples -`__. +`__. *Setup* @@ -97,13 +97,13 @@ Please create a `data` folder, download the WikiHow dataset from `wikihowAll.csv `wikihowSep.cs `__, and place them in the `data` folder. We will use the wikihow dataset from `summarization_dataset -`__. +`__. Next, we add the following code snippets to a Python script “T5_training.py”. .. note:: The full source code for this tutorial is available in `PyTorch examples - `__. + `__. 
1.3 Import necessary packages: diff --git a/intermediate_source/char_rnn_classification_tutorial.py b/intermediate_source/char_rnn_classification_tutorial.py index 9b1f255a51b..0957b109b3a 100644 --- a/intermediate_source/char_rnn_classification_tutorial.py +++ b/intermediate_source/char_rnn_classification_tutorial.py @@ -4,11 +4,14 @@ ************************************************************** **Author**: `Sean Robertson `_ -We will be building and training a basic character-level RNN to classify -words. This tutorial, along with the following two, show how to do -preprocess data for NLP modeling "from scratch", in particular not using -many of the convenience functions of `torchtext`, so you can see how -preprocessing for NLP modeling works at a low level. +We will be building and training a basic character-level Recurrent Neural +Network (RNN) to classify words. This tutorial, along with two other +Natural Language Processing (NLP) "from scratch" tutorials +:doc:`/intermediate/char_rnn_generation_tutorial` and +:doc:`/intermediate/seq2seq_translation_tutorial`, show how to +preprocess data to model NLP. In particular these tutorials do not +use many of the convenience functions of `torchtext`, so you can see how +preprocessing to model NLP works at a low level. A character-level RNN reads words as a series of characters - outputting a prediction and "hidden state" at each step, feeding its @@ -32,13 +35,15 @@ (-2.68) Dutch -**Recommended Reading:** +Recommended Preparation +======================= -I assume you have at least installed PyTorch, know Python, and -understand Tensors: +Before starting this tutorial it is recommended that you have installed PyTorch, +and have a basic understanding of Python programming language and Tensors: - https://pytorch.org/ For installation instructions - :doc:`/beginner/deep_learning_60min_blitz` to get started with PyTorch in general + and learn the basics of Tensors - :doc:`/beginner/pytorch_with_examples` for a wide and deep overview - :doc:`/beginner/former_torchies_tutorial` if you are former Lua Torch user @@ -69,7 +74,6 @@ ``{language: [names ...]}``. The generic variables "category" and "line" (for language and name in our case) are used for later extensibility. """ -from __future__ import unicode_literals, print_function, division from io import open import glob import os @@ -181,10 +185,6 @@ def lineToTensor(line): # is just 2 linear layers which operate on an input and hidden state, with # a ``LogSoftmax`` layer after the output. # -# .. figure:: https://i.imgur.com/Z2xbySO.png -# :alt: -# -# import torch.nn as nn @@ -195,13 +195,13 @@ def __init__(self, input_size, hidden_size, output_size): self.hidden_size = hidden_size self.i2h = nn.Linear(input_size + hidden_size, hidden_size) - self.i2o = nn.Linear(input_size + hidden_size, output_size) + self.h2o = nn.Linear(hidden_size, output_size) self.softmax = nn.LogSoftmax(dim=1) def forward(self, input, hidden): combined = torch.cat((input, hidden), 1) hidden = self.i2h(combined) - output = self.i2o(combined) + output = self.h2o(hidden) output = self.softmax(output) return output, hidden diff --git a/intermediate_source/char_rnn_generation_tutorial.py b/intermediate_source/char_rnn_generation_tutorial.py index 6068c84cd0e..5e0f6308c01 100644 --- a/intermediate_source/char_rnn_generation_tutorial.py +++ b/intermediate_source/char_rnn_generation_tutorial.py @@ -75,7 +75,6 @@ and end up with a dictionary ``{language: [names ...]}``. 
""" -from __future__ import unicode_literals, print_function, division from io import open import glob import os @@ -278,7 +277,7 @@ def train(category_tensor, input_line_tensor, target_line_tensor): rnn.zero_grad() - loss = 0 + loss = torch.Tensor([0]) # you can also just simply use ``loss = 0`` for i in range(input_line_tensor.size(0)): output, hidden = rnn(category_tensor, input_line_tensor[i], hidden) diff --git a/intermediate_source/ddp_tutorial.rst b/intermediate_source/ddp_tutorial.rst index 366db8db130..a8955569df5 100644 --- a/intermediate_source/ddp_tutorial.rst +++ b/intermediate_source/ddp_tutorial.rst @@ -269,8 +269,8 @@ either the application or the model ``forward()`` method. setup(rank, world_size) # setup mp_model and devices for this process - dev0 = (rank * 2) % world_size - dev1 = (rank * 2 + 1) % world_size + dev0 = rank * 2 + dev1 = rank * 2 + 1 mp_model = ToyMpModel(dev0, dev1) ddp_mp_model = DDP(mp_model) @@ -293,6 +293,7 @@ either the application or the model ``forward()`` method. world_size = n_gpus run_demo(demo_basic, world_size) run_demo(demo_checkpoint, world_size) + world_size = n_gpus//2 run_demo(demo_model_parallel, world_size) Initialize DDP with torch.distributed.run/torchrun diff --git a/intermediate_source/dynamic_quantization_bert_tutorial.rst b/intermediate_source/dynamic_quantization_bert_tutorial.rst index d618df87d58..dd76d08956f 100644 --- a/intermediate_source/dynamic_quantization_bert_tutorial.rst +++ b/intermediate_source/dynamic_quantization_bert_tutorial.rst @@ -68,7 +68,7 @@ built-in F1 score calculation helper function. .. code:: shell pip install sklearn - pip install transformers + pip install transformers==4.29.2 Because we will be using the beta parts of the PyTorch, it is @@ -92,8 +92,6 @@ In this step we import the necessary Python modules for the tutorial. .. code:: python - from __future__ import absolute_import, division, print_function - import logging import numpy as np import os @@ -256,6 +254,7 @@ model before and after the dynamic quantization. set_seed(42) + 2.2 Load the fine-tuned BERT model ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -525,6 +524,10 @@ We can serialize and save the quantized model for the future use using .. 
code:: python + def ids_tensor(shape, vocab_size): + # Creates a random int32 tensor of the shape within the vocab size + return torch.randint(0, vocab_size, shape=shape, dtype=torch.int, device='cpu') + input_ids = ids_tensor([8, 128], 2) token_type_ids = ids_tensor([8, 128], 2) attention_mask = ids_tensor([8, 128], vocab_size=2) diff --git a/intermediate_source/mario_rl_tutorial.py b/intermediate_source/mario_rl_tutorial.py index ff653d54c11..eb46feb2ad0 100755 --- a/intermediate_source/mario_rl_tutorial.py +++ b/intermediate_source/mario_rl_tutorial.py @@ -53,6 +53,8 @@ # Super Mario environment for OpenAI Gym import gym_super_mario_bros +from tensordict import TensorDict +from torchrl.data import TensorDictReplayBuffer, LazyMemmapStorage ###################################################################### # RL Definitions @@ -348,7 +350,7 @@ def act(self, state): class Mario(Mario): # subclassing for continuity def __init__(self, state_dim, action_dim, save_dir): super().__init__(state_dim, action_dim, save_dir) - self.memory = deque(maxlen=100000) + self.memory = TensorDictReplayBuffer(storage=LazyMemmapStorage(100000)) self.batch_size = 32 def cache(self, state, next_state, action, reward, done): @@ -373,14 +375,15 @@ def first_if_tuple(x): reward = torch.tensor([reward], device=self.device) done = torch.tensor([done], device=self.device) - self.memory.append((state, next_state, action, reward, done,)) + # self.memory.append((state, next_state, action, reward, done,)) + self.memory.add(TensorDict({"state": state, "next_state": next_state, "action": action, "reward": reward, "done": done}, batch_size=[])) def recall(self): """ Retrieve a batch of experiences from memory """ - batch = random.sample(self.memory, self.batch_size) - state, next_state, action, reward, done = map(torch.stack, zip(*batch)) + batch = self.memory.sample(self.batch_size) + state, next_state, action, reward, done = (batch.get(key) for key in ("state", "next_state", "action", "reward", "done")) return state, next_state, action.squeeze(), reward.squeeze(), done.squeeze() @@ -711,17 +714,18 @@ def record(self, episode, epsilon, step): f"{datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S'):>20}\n" ) - for metric in ["ep_rewards", "ep_lengths", "ep_avg_losses", "ep_avg_qs"]: - plt.plot(getattr(self, f"moving_avg_{metric}")) - plt.savefig(getattr(self, f"{metric}_plot")) + for metric in ["ep_lengths", "ep_avg_losses", "ep_avg_qs", "ep_rewards"]: plt.clf() + plt.plot(getattr(self, f"moving_avg_{metric}"), label=f"moving_avg_{metric}") + plt.legend() + plt.savefig(getattr(self, f"{metric}_plot")) ###################################################################### # Let’s play! # """"""""""""""" # -# In this example we run the training loop for 10 episodes, but for Mario to truly learn the ways of +# In this example we run the training loop for 40 episodes, but for Mario to truly learn the ways of # his world, we suggest running the loop for at least 40,000 episodes! 
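Regarding the replay-buffer change above, the Python ``deque`` is replaced by ``torchrl``'s ``TensorDictReplayBuffer`` backed by ``LazyMemmapStorage``. A minimal standalone sketch of the add/sample round trip (hypothetical tensor shapes, assuming ``torchrl`` and ``tensordict`` are installed) could look like:

.. code:: python

    import torch
    from tensordict import TensorDict
    from torchrl.data import TensorDictReplayBuffer, LazyMemmapStorage

    buffer = TensorDictReplayBuffer(storage=LazyMemmapStorage(max_size=1000))

    # Cache a few transitions as TensorDicts, mirroring Mario.cache() above.
    for _ in range(64):
        buffer.add(TensorDict({
            "state": torch.zeros(4, 84, 84),
            "next_state": torch.zeros(4, 84, 84),
            "action": torch.tensor([1]),
            "reward": torch.tensor([0.5]),
            "done": torch.tensor([False]),
        }, batch_size=[]))

    # Sampling returns a TensorDict stacked along a leading batch dimension,
    # mirroring Mario.recall() above.
    batch = buffer.sample(32)
    state, action = (batch.get(key) for key in ("state", "action"))
    print(state.shape, action.shape)  # torch.Size([32, 4, 84, 84]) torch.Size([32, 1])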
# use_cuda = torch.cuda.is_available() @@ -735,7 +739,7 @@ def record(self, episode, epsilon, step): logger = MetricLogger(save_dir) -episodes = 10 +episodes = 40 for e in range(episodes): state = env.reset() diff --git a/intermediate_source/seq2seq_translation_tutorial.py b/intermediate_source/seq2seq_translation_tutorial.py index 7953854e60a..c2b0b722e5b 100644 --- a/intermediate_source/seq2seq_translation_tutorial.py +++ b/intermediate_source/seq2seq_translation_tutorial.py @@ -45,7 +45,7 @@ :alt: To improve upon this model we'll use an `attention -mechanism `__, which lets the decoder +mechanism `__, which lets the decoder learn to focus over a specific range of the input sequence. **Recommended Reading:** @@ -66,8 +66,8 @@ Statistical Machine Translation `__ - `Sequence to Sequence Learning with Neural Networks `__ -- `Neural Machine Translation by Jointly Learning to Align and - Translate `__ +- `Effective Approaches to Attention-based Neural Machine + Translation `__ - `A Neural Conversational Model `__ You will also find the previous tutorials on @@ -78,7 +78,6 @@ **Requirements** """ -from __future__ import unicode_literals, print_function, division from io import open import unicodedata import string @@ -441,25 +440,27 @@ def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGT self.max_length = max_length self.embedding = nn.Embedding(self.output_size, self.hidden_size) - self.attn = nn.Linear(self.hidden_size * 2, self.max_length) - self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size) + self.fc_hidden = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + self.fc_encoder = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + self.alignment_vector = nn.Parameter(torch.Tensor(1, hidden_size)) + torch.nn.init.xavier_uniform_(self.alignment_vector) self.dropout = nn.Dropout(self.dropout_p) - self.gru = nn.GRU(self.hidden_size, self.hidden_size) + self.gru = nn.GRU(self.hidden_size * 2, self.hidden_size) self.out = nn.Linear(self.hidden_size, self.output_size) def forward(self, input, hidden, encoder_outputs): - embedded = self.embedding(input).view(1, 1, -1) + embedded = self.embedding(input).view(1, -1) embedded = self.dropout(embedded) - attn_weights = F.softmax( - self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1) - attn_applied = torch.bmm(attn_weights.unsqueeze(0), - encoder_outputs.unsqueeze(0)) - - output = torch.cat((embedded[0], attn_applied[0]), 1) - output = self.attn_combine(output).unsqueeze(0) + transformed_hidden = self.fc_hidden(hidden[0]) + expanded_hidden_state = transformed_hidden.expand(self.max_length, -1) + alignment_scores = torch.tanh(expanded_hidden_state + + self.fc_encoder(encoder_outputs)) + alignment_scores = self.alignment_vector.mm(alignment_scores.T) + attn_weights = F.softmax(alignment_scores, dim=1) + context_vector = attn_weights.mm(encoder_outputs) - output = F.relu(output) + output = torch.cat((embedded, context_vector), 1).unsqueeze(0) output, hidden = self.gru(output, hidden) output = F.log_softmax(self.out(output[0]), dim=1) @@ -762,15 +763,15 @@ def evaluateRandomly(encoder, decoder, n=10): # hidden_size = 256 -encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device) -attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device) +encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device) +attn_decoder = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device) -trainIters(encoder1, attn_decoder1, 75000, print_every=5000) 
+trainIters(encoder, attn_decoder, 75000, print_every=5000) ###################################################################### # -evaluateRandomly(encoder1, attn_decoder1) +evaluateRandomly(encoder, attn_decoder) ###################################################################### @@ -788,7 +789,7 @@ def evaluateRandomly(encoder, decoder, n=10): # output_words, attentions = evaluate( - encoder1, attn_decoder1, "je suis trop froid .") + encoder, attn_decoder, "je suis trop froid .") plt.matshow(attentions.numpy()) @@ -818,7 +819,7 @@ def showAttention(input_sentence, output_words, attentions): def evaluateAndShowAttention(input_sentence): output_words, attentions = evaluate( - encoder1, attn_decoder1, input_sentence) + encoder, attn_decoder, input_sentence) print('input =', input_sentence) print('output =', ' '.join(output_words)) showAttention(input_sentence, output_words, attentions) diff --git a/intermediate_source/spatial_transformer_tutorial.py b/intermediate_source/spatial_transformer_tutorial.py index b566e7e4e0b..49b6b0f0a2b 100644 --- a/intermediate_source/spatial_transformer_tutorial.py +++ b/intermediate_source/spatial_transformer_tutorial.py @@ -27,7 +27,6 @@ # License: BSD # Author: Ghassen Hamrouni -from __future__ import print_function import torch import torch.nn as nn import torch.nn.functional as F diff --git a/intermediate_source/tensorboard_profiler_tutorial.py b/intermediate_source/tensorboard_profiler_tutorial.py index 440f2257e1a..2b241071b7f 100644 --- a/intermediate_source/tensorboard_profiler_tutorial.py +++ b/intermediate_source/tensorboard_profiler_tutorial.py @@ -18,7 +18,7 @@ ----- To install ``torch`` and ``torchvision`` use the following command: -:: +.. code-block:: pip install torch torchvision @@ -160,7 +160,7 @@ def train(data): # # Install PyTorch Profiler TensorBoard Plugin. # -# :: +# .. code-block:: # # pip install torch_tb_profiler # @@ -168,7 +168,7 @@ def train(data): ###################################################################### # Launch the TensorBoard. # -# :: +# .. code-block:: # # tensorboard --logdir=./log # @@ -176,7 +176,7 @@ def train(data): ###################################################################### # Open the TensorBoard profile URL in Google Chrome browser or Microsoft Edge browser. # -# :: +# .. code-block:: # # http://localhost:6006/#pytorch_profiler # @@ -287,7 +287,7 @@ def train(data): # In this example, we follow the "Performance Recommendation" and set ``num_workers`` as below, # pass a different name such as ``./log/resnet18_4workers`` to ``tensorboard_trace_handler``, and run it again. # -# :: +# .. code-block:: # # train_loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True, num_workers=4) # @@ -316,7 +316,7 @@ def train(data): # # You can try it by using existing example on Azure # -# :: +# .. code-block:: # # pip install azure-storage-blob # tensorboard --logdir=https://torchtbprofiler.blob.core.windows.net/torchtbprofiler/demo/memory_demo_1_10 @@ -366,7 +366,7 @@ def train(data): # # You can try it by using existing example on Azure: # -# :: +# .. 
code-block:: # # pip install azure-storage-blob # tensorboard --logdir=https://torchtbprofiler.blob.core.windows.net/torchtbprofiler/demo/distributed_bert diff --git a/intermediate_source/torch_compile_tutorial.py b/intermediate_source/torch_compile_tutorial.py index d4b8e54b9ed..40a53c263ed 100644 --- a/intermediate_source/torch_compile_tutorial.py +++ b/intermediate_source/torch_compile_tutorial.py @@ -69,7 +69,7 @@ def foo(x, y): a = torch.sin(x) - b = torch.cos(x) + b = torch.cos(y) return a + b opt_foo1 = torch.compile(foo) print(opt_foo1(torch.randn(10, 10), torch.randn(10, 10))) @@ -80,7 +80,7 @@ def foo(x, y): @torch.compile def opt_foo2(x, y): a = torch.sin(x) - b = torch.cos(x) + b = torch.cos(y) return a + b print(opt_foo2(torch.randn(10, 10), torch.randn(10, 10))) @@ -105,7 +105,7 @@ def forward(self, x): # # Let's now demonstrate that using ``torch.compile`` can speed # up real models. We will compare standard eager mode and -# ``torch.compile`` by evaluating and training ResNet-18 on random data. +# ``torch.compile`` by evaluating and training a ``torchvision`` model on random data. # # Before we start, we need to define some utility functions. diff --git a/intermediate_source/torchvision_tutorial.rst b/intermediate_source/torchvision_tutorial.rst index 9e3d1b9655c..21d47e258f7 100644 --- a/intermediate_source/torchvision_tutorial.rst +++ b/intermediate_source/torchvision_tutorial.rst @@ -145,7 +145,7 @@ Let’s write a ``torch.utils.data.Dataset`` class for this dataset. num_objs = len(obj_ids) boxes = [] for i in range(num_objs): - pos = np.where(masks[i]) + pos = np.nonzero(masks[i]) xmin = np.min(pos[1]) xmax = np.max(pos[1]) ymin = np.min(pos[0]) diff --git a/prototype_source/README.txt b/prototype_source/README.txt index 94c182dcca0..4ab9ce8f6a9 100644 --- a/prototype_source/README.txt +++ b/prototype_source/README.txt @@ -1,8 +1,8 @@ Prototype Tutorials ------------------ 1. distributed_rpc_profiling.rst - Profiling PyTorch RPC-Based Workloads - https://github.com/pytorch/tutorials/blob/release/1.6/prototype_source/distributed_rpc_profiling.rst + Profiling PyTorch RPC-Based Workloads + https://github.com/pytorch/tutorials/blob/main/prototype_source/distributed_rpc_profiling.rst 2. graph_mode_static_quantization_tutorial.py Graph Mode Post Training Static Quantization in PyTorch @@ -21,8 +21,8 @@ Prototype Tutorials https://github.com/pytorch/tutorials/blob/main/prototype_source/torchscript_freezing.py 6. vulkan_workflow.rst - Vulkan Backend User Workflow - https://pytorch.org/tutorials/intermediate/vulkan_workflow.html + Vulkan Backend User Workflow + https://pytorch.org/tutorials/intermediate/vulkan_workflow.html 7. 
fx_graph_mode_ptq_static.rst FX Graph Mode Post Training Static Quantization diff --git a/prototype_source/fx_graph_mode_ptq_dynamic.py b/prototype_source/fx_graph_mode_ptq_dynamic.py index eda88ff5c01..98ece5f3d31 100644 --- a/prototype_source/fx_graph_mode_ptq_dynamic.py +++ b/prototype_source/fx_graph_mode_ptq_dynamic.py @@ -239,9 +239,27 @@ def evaluate(model_, data_source): .set_object_type(nn.LSTM, default_dynamic_qconfig) .set_object_type(nn.Linear, default_dynamic_qconfig) ) -# Deepcopying the original model because quantization api changes the model inplace and we want +# Load model to create the original model because quantization api changes the model inplace and we want # to keep the original model for future comparison -model_to_quantize = copy.deepcopy(model) + + +model_to_quantize = LSTMModel( + ntoken = ntokens, + ninp = 512, + nhid = 256, + nlayers = 5, +) + +model_to_quantize.load_state_dict( + torch.load( + model_data_filepath + 'word_language_model_quantize.pth', + map_location=torch.device('cpu') + ) + ) + +model_to_quantize.eval() + + prepared_model = prepare_fx(model_to_quantize, qconfig_mapping, example_inputs) print("prepared model:", prepared_model) quantized_model = convert_fx(prepared_model) @@ -289,4 +307,4 @@ def time_model_evaluation(model, test_data): # 3. Conclusion # ------------- # This tutorial introduces the api for post training dynamic quantization in FX Graph Mode, -# which dynamically quantizes the same modules as Eager Mode Quantization. \ No newline at end of file +# which dynamically quantizes the same modules as Eager Mode Quantization. diff --git a/prototype_source/fx_graph_mode_ptq_static.rst b/prototype_source/fx_graph_mode_ptq_static.rst index f97b1f0a5f2..091673ed2e4 100644 --- a/prototype_source/fx_graph_mode_ptq_static.rst +++ b/prototype_source/fx_graph_mode_ptq_static.rst @@ -214,9 +214,9 @@ Download the `torchvision resnet18 model `_ FX Graph Mode Quantization requires a symbolically traceable model. -We use the FX framework (TODO: link) to convert a symbolically traceable nn.Module instance to IR, +We use the FX framework to convert a symbolically traceable nn.Module instance to IR, and we operate on the IR to execute the quantization passes. Please post your question about symbolically tracing your model in `PyTorch Discussion Forum `_ @@ -22,16 +22,19 @@ You can use any combination of these options: b. Write your own observed and quantized submodule -#################################################################### If the code that is not symbolically traceable does not need to be quantized, we have the following two options to run FX Graph Mode Quantization: -1.a. Symbolically trace only the code that needs to be quantized + + +Symbolically trace only the code that needs to be quantized ----------------------------------------------------------------- When the whole model is not symbolically traceable but the submodule we want to quantize is symbolically traceable, we can run quantization only on that submodule. + before: .. code:: python + class M(nn.Module): def forward(self, x): x = non_traceable_code_1(x) @@ -42,6 +45,7 @@ before: after: .. code:: python + class FP32Traceable(nn.Module): def forward(self, x): x = traceable_code(x) @@ -69,8 +73,7 @@ Note if original model needs to be preserved, you will have to copy it yourself before calling the quantization APIs. -##################################################### -1.b. 
Skip symbolically trace the non-traceable code +Skip symbolically trace the non-traceable code --------------------------------------------------- When we have some non-traceable code in the module, and this part of code doesn’t need to be quantized, we can factor out this part of the code into a submodule and skip symbolically trace that submodule. @@ -134,8 +137,7 @@ quantization code: If the code that is not symbolically traceable needs to be quantized, we have the following two options: -########################################################## -2.a Refactor your code to make it symbolically traceable +Refactor your code to make it symbolically traceable -------------------------------------------------------- If it is easy to refactor the code and make the code symbolically traceable, we can refactor the code and remove the use of non-traceable constructs in python. @@ -167,15 +169,10 @@ after: return x.permute(0, 2, 1, 3) -quantization code: - This can be combined with other approaches and the quantization code depends on the model. - - -####################################################### -2.b. Write your own observed and quantized submodule +Write your own observed and quantized submodule ----------------------------------------------------- If the non-traceable code can’t be refactored to be symbolically traceable, @@ -207,8 +204,8 @@ non-traceable logic, wrapped in a module class FP32NonTraceable: ... - -2. Define observed version of FP32NonTraceable +2. Define observed version of +FP32NonTraceable .. code:: python diff --git a/prototype_source/graph_mode_dynamic_bert_tutorial.rst b/prototype_source/graph_mode_dynamic_bert_tutorial.rst index 2a296ccfa6b..5d76ddef79a 100644 --- a/prototype_source/graph_mode_dynamic_bert_tutorial.rst +++ b/prototype_source/graph_mode_dynamic_bert_tutorial.rst @@ -40,8 +40,6 @@ Once all the necesessary packages are downloaded and installed we setup the code .. 
code:: python - from __future__ import absolute_import, division, print_function - import logging import numpy as np import os @@ -62,22 +60,9 @@ Once all the necesessary packages are downloaded and installed we setup the code from torch.quantization import per_channel_dynamic_qconfig from torch.quantization import quantize_dynamic_jit - global_rng = random.Random() - - def ids_tensor(shape, vocab_size, rng=None, name=None): + def ids_tensor(shape, vocab_size): # Creates a random int32 tensor of the shape within the vocab size - if rng is None: - rng = global_rng - - total_dims = 1 - for dim in shape: - total_dims *= dim - - values = [] - for _ in range(total_dims): - values.append(rng.randint(0, vocab_size - 1)) - - return torch.tensor(data=values, dtype=torch.long, device='cpu').view(shape).contiguous() + return torch.randint(0, vocab_size, shape=shape, dtype=torch.int, device='cpu') # Setup logging logger = logging.getLogger(__name__) diff --git a/prototype_source/numeric_suite_tutorial.py b/prototype_source/numeric_suite_tutorial.py index 35052f4b2f4..ee486d43c3b 100644 --- a/prototype_source/numeric_suite_tutorial.py +++ b/prototype_source/numeric_suite_tutorial.py @@ -24,7 +24,6 @@ ############################################################################## -from __future__ import print_function, division, absolute_import import numpy as np import torch import torch.nn as nn diff --git a/prototype_source/prototype_index.rst b/prototype_source/prototype_index.rst index cfdb2ffcca3..0f190d51190 100644 --- a/prototype_source/prototype_index.rst +++ b/prototype_source/prototype_index.rst @@ -68,6 +68,13 @@ Prototype features are not available as part of binary distributions like PyPI o :link: ../prototype/numeric_suite_tutorial.html :tags: Debugging,Quantization +.. customcarditem:: + :header: Quantization in PyTorch 2.0 Export Tutorial + :card_description: Learn how to use the Quantization in PyTorch 2.0 Export. + :image: ../_static/img/thumbnails/cropped/generic-pytorch-logo.png + :link: ../prototype/quantization_in_pytorch_2_0_export_tutorial.html + :tags: Quantization + .. Mobile .. customcarditem:: @@ -193,6 +200,7 @@ Prototype features are not available as part of binary distributions like PyPI o prototype/fx_graph_mode_ptq_dynamic.html prototype/fx_graph_mode_ptq_static.html prototype/graph_mode_dynamic_bert_tutorial.html + prototype/quantization_in_pytorch_2_0_export_tutorial.html prototype/ios_gpu_workflow.html prototype/nnapi_mobilenetv2.html prototype/tracing_based_selective_build.html diff --git a/prototype_source/quantization_in_pytorch_2_0_export_tutorial.rst b/prototype_source/quantization_in_pytorch_2_0_export_tutorial.rst new file mode 100644 index 00000000000..c1c22d94e04 --- /dev/null +++ b/prototype_source/quantization_in_pytorch_2_0_export_tutorial.rst @@ -0,0 +1,375 @@ +(Work in Progress) Quantization in PyTorch 2.0 Export Tutorial +============================================================== + +**Author**: `Leslie Fang `_, `Weiwen Xia `__, `Jiong Gong `__, `Kimish Patel `__, `Jerry Zhang `__ + +Today we have `FX Graph Mode +Quantization `__ +which uses ``symbolic_trace`` to capture the model into a graph, and then +perform quantization transformations on top of the captured model. In a +similar way, for Quantization 2.0 flow, we will now use the PT2 Export +workflow to capture the model into a graph, and perform quantization +transformations on top of the ATen dialect graph. 
This approach is expected to +have significantly higher model coverage, better programmability, and +a simplified UX. + +Prerequisites: +----------------------- + +- `Understanding of torchdynamo concepts in PyTorch `__ +- `Understanding of the quantization concepts in PyTorch `__ +- `Understanding of FX Graph Mode post training static quantization `__ +- `Understanding of BackendConfig in PyTorch Quantization FX Graph Mode `__ +- `Understanding of QConfig and QConfigMapping in PyTorch Quantization FX Graph Mode `__ + +Previously in ``FX Graph Mode Quantization`` we were using ``QConfigMapping`` for users to specify how the model to be quantized +and ``BackendConfig`` to specify the supported ways of quantization in their backend. +This API covers most use cases relatively well, but the main problem is that this API is not fully extensible +without involvement of the quantization team: + +- This API has limitation around expressing quantization intentions for complicated operator patterns such as in the discussion of + `Issue-96288 `__ to support ``conv add`` fusion. + Supporting ``conv add`` fusion also requires some changes to current already complicated pattern matching code such as in the + `PR-97122 `__. +- This API also has limitation around supporting user's advanced quantization intention to quantize their model. For example, if backend + developer only wants to quantize inputs and outputs when the ``linear`` has a third input, it requires co-work from quantization + team and backend developer. +- This API uses ``QConfigMapping`` and ``BackendConfig`` as separate object. ``QConfigMapping`` describes user's + intention of how they want their model to be quantized. ``BackendConfig`` describes what kind of quantization a backend support. + ``BackendConfig`` is backend specific, but ``QConfigMapping`` is not. And user can provide a ``QConfigMapping`` + that is incompatible with a specific ``BackendConfig``. This is not a great UX. Ideally, we can structure this better + by making both configuration (``QConfigMapping``) and quantization capability (``BackendConfig``) backend + specific. So there will be less confusion about incompatibilities. +- In ``QConfig``, we are exposing observer/fake_quant classes as an object for user to configure quantization. + This increases the things that user needs to care about, e.g. not only the ``dtype`` but also how the + observation should happen. These could potentially be hidden from user to make user interface simpler. + +To address these scalability issues, +`Quantizer `__ +is introduced for quantization in PyTorch 2.0 export. ``Quantizer`` is a class that users can use to +programmatically set the quantization specifications for input and output of each node in the model graph. It adds flexibility +to the quantization API and allows modeling users and backend developers to configure quantization programmatically. +This will allow users to express how they want an operator pattern to be observed in a more explicit +way by annotating the appropriate nodes. A backend specific quantizer inherited from base quantizer, +some methods that need to be implemented: + +- `annotate method `__ + is used to annotate nodes in the graph with + `QuantizationAnnotation `__ + objects to convey the desired way of quantization. + +Imagine a backend developer who wishes to integrate a third-party backend +with PyTorch's quantization 2.0 flow. To accomplish this, they would only need +to define the backend specific quantizer. 
The high level architecture of +quantization 2.0 with quantizer could look like this: + +:: + + float_model(Python) Input + \ / + \ / + —------------------------------------------------------- + | Dynamo Export | + —------------------------------------------------------- + | + FX Graph in ATen QNNPackQuantizer, + | or X86InductorQuantizer, + | or + | / + —-------------------------------------------------------- + | prepare_pt2e_quantizer | + —-------------------------------------------------------- + | + Calibrate/Train + | + —-------------------------------------------------------- + | convert_pt2e | + —-------------------------------------------------------- + | + Reference Quantized Model + | + —-------------------------------------------------------- + | Lowering | + —-------------------------------------------------------- + | + Executorch, or Inductor, or + +Note: ``prepare_pt2e_quantizer`` will be updated to ``prepare_pt2e`` soon. + +An existing quantizer object defined for QNNPack/XNNPack is in +`QNNPackQuantizer `__. +Taking QNNPackQuantizer as an example, the overall Quantization 2.0 flow could be: + +:: + + import torch + import torch._dynamo as torchdynamo + from torch.ao.quantization._quantize_pt2e import convert_pt2e, prepare_pt2e + import torch.ao.quantization._pt2e.quantizer.qnnpack_quantizer as qq + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(5, 10) + + def forward(self, x): + return self.linear(x) + + example_inputs = (torch.randn(1, 5),) + model = M().eval() + + # Step 1: Trace the model into an FX graph of flattened ATen operators + exported_graph_module, guards = torchdynamo.export( + model, + *copy.deepcopy(example_inputs), + aten_graph=True, + ) + + # Step 2: Insert observers or fake quantize modules + quantizer = qq.QNNPackQuantizer() + operator_config = qq.get_symmetric_quantization_config(is_per_channel=True) + quantizer.set_global(operator_config) + prepared_graph_module = prepare_pt2e_quantizer(exported_graph_module, quantizer) + + # Step 3: Quantize the model + convered_graph_module = convert_pt2e(prepared_graph_module) + + # Step 4: Lower Reference Quantized Model into the backend + +``Quantizer`` uses annotation API to convey quantization intent for different operators/patterns. +Annotation API mainly consists of +`QuantizationSpec `__ +and +`QuantizationAnnotation `__. + +``QuantizationSpec`` is used to convey intent of how a tensor will be quantized, +e.g. dtype, bitwidth, min, max values, symmetric vs. asymmetric etc. +Furthermore, ``QuantizationSpec`` also allows quantizer to specify how a +tensor value should be observed, e.g. ``MinMaxObserver``, or ``HistogramObserver`` +, or some customized observer. + +``QuantizationAnnotation`` composed of ``QuantizationSpec`` objects is used to annotate input tensors +and output tensor of a pattern. Annotating input tensors is equivalent of annotating input edges, +while annotating output tensor is equivalent of annotating node. ``QuantizationAnnotation`` is a ``dataclass`` +with several fields: + +- ``input_qspec_map`` field is of class ``Dict`` to map each input tensor (as input edge) to a ``QuantizationSpec``. +- ``output_qspec`` field expresses the ``QuantizationSpec`` used to annotate the output tensor; +- ``_annotated`` field indicates if this node has already been annotated by quantizer. + +To conclude, annotation API requires quantizer to annotate edges (input tensors) or +nodes (output tensor) of the graph. 
Now, we will have a step-by-step tutorial for +how to use the annotation API with different types of ``QuantizationSpec``. + +1. Annotate common operator patterns +-------------------------------------------------------- + +In order to use the quantized pattern/operators, e.g. ``quantized add``, +backend developers will have intent to quantize (as expressed by ``QuantizationSpec``) +inputs, output of the pattern. Following is an example flow (take ``add`` operator as example) +of how this intent is conveyed in the quantization workflow with annotation API. + +- Step 1: Identify the original floating point pattern in the FX graph. There are + several ways to identify this pattern: Quantizer may use a pattern matcher + to match the operator pattern; Quantizer may go through the nodes from start to the end and compare + the node's target type to match the operator pattern. In this example, we can use the + `get_source_partitions `__ + to match this pattern. The original floating point ``add`` pattern only contain a single ``add`` node. + +:: + + add_partitions = get_source_partitions(gm.graph, [operator.add, torch.add]) + add_partitions = list(itertools.chain(*add_partitions.values())) + for add_partition in add_partitions: + add_node = add_partition.output_nodes[0] + +- Step 2: Define the ``QuantizationSpec`` for inputs and output of the pattern. ``QuantizationSpec`` + defines the ``data type``, ``qscheme``, and other quantization parameters about users' intent of + how to observe or fake quantize a tensor. + +:: + + act_quantization_spec = QuantizationSpec( + dtype=torch.int8, + quant_min=-128, + quant_max=127, + qscheme=torch.per_tensor_affine, + is_dynamic=False, + observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12), + ) + + input_act_qspec = act_quantization_spec + output_act_qspec = act_quantization_spec + +- Step 3: Annotate the inputs and output of the pattern with ``QuantizationAnnotation``. + In this example, we will create the ``QuantizationAnnotation`` object with the ``QuantizationSpec`` + created in above step 2 for two inputs and one output of the ``add`` node. + +:: + + input_qspec_map = {} + input_act0 = add_node.args[0] + input_qspec_map[input_act0] = input_act_qspec + + input_act1 = add_node.args[1] + input_qspec_map[input_act1] = input_act_qspec + + add_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=output_act_qspec, + _annotated=True, + ) + +After we annotate the ``add`` node like this, in the following up quantization flow, ``HistogramObserver`` will +be inserted at its two input nodes and one output node in prepare phase. And ``HistogramObserver`` will be substituted with +``quantize`` node and ``dequantize`` node in the convert phase. + +2. Annotate sharing qparams operators +-------------------------------------------------------- + +It is natural that users want to annotate a quantized model where quantization +parameters can be shared among some tensors explicitly. Two typical use cases are: + +- Example 1: One example is for ``add`` where having both inputs sharing quantization + parameters makes operator implementation much easier. Without using of + `SharedQuantizationSpec `__, + we must annotate ``add`` as example in above section 1, in which two inputs of ``add`` + has different quantization parameters. +- Example 2: Another example is that of sharing quantization parameters between inputs and output. 
+ This typically results from operators such as ``maxpool``, ``average_pool``, ``concat`` etc. + +``SharedQuantizationSpec`` is designed for this use case to annotate tensors whose quantization +parameters are shared with other tensors. Input of ``SharedQuantizationSpec`` is an ``EdgeOrNode`` object which +can be an input edge or an output value. + +- Input edge is the connection between input node and the node consuming the input, + so it's a ``Tuple[Node, Node]``. +- Output value is an FX ``Node``. + +Now, if we want to rewrite ``add`` annotation example with ``SharedQuantizationSpec`` to indicate +two input tensors as sharing quantization parameters. We can define its ``QuantizationAnnotation`` +as this: + +- Step 1: Identify the original floating point pattern in the FX graph. We can use the same + methods introduced in ``QuantizationSpec`` example to identify the ``add`` pattern. +- Step 2: Annotate input_act0 of ``add`` with ``QuantizationSpec``. +- Step 3: Create a ``SharedQuantizationSpec`` object with input edge defined as ``(input_act0, add_node)`` which means to + share the observer used for this edge. Then, user can annotate input_act1 with this ``SharedQuantizationSpec`` + object. + +:: + + input_qspec_map = {} + share_qparams_with_input_act0_qspec = SharedQuantizationSpec((input_act0, add_node)) + input_qspec_map = {input_act0: act_quantization_spec, input_act1: share_qparams_with_input_act0_qspec} + + add_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=act_quantization_spec, + _annotated=True, + ) + +3. Annotate fixed qparams operators +-------------------------------------------------------- + +Another typical use case to annotate a quantized model is for tensors whose +quantization parameters are known beforehand. For example, operator like ``sigmoid``, which has +predefined and fixed scale/zero_point at input and output tensors. +`FixedQParamsQuantizationSpec `__ +is designed for this use case. To use ``FixedQParamsQuantizationSpec``, users need to pass in parameters +of ``scale`` and ``zero_point`` explicitly. + +- Step 1: Identify the original floating point pattern in the FX graph. We can use the same + methods introduced in ``QuantizationSpec`` example to identify the ``sigmoid`` pattern. +- Step 2: Create ``FixedQParamsQuantizationSpec`` object with inputs of fixed ``scale``, ``zero_point`` value. + These values will be used to create the ``quantize`` node and ``dequantize`` node in the convert phase. +- Step 3: Annotate inputs and output to use this ``FixedQParamsQuantizationSpec`` object. + +:: + + act_qspec = FixedQParamsQuantizationSpec( + dtype=torch.uint8, + quant_min=0, + quant_max=255, + qscheme=torch.per_tensor_affine, + scale=1.0 / 256.0, + zero_point=0, + ) + sigmoid_node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map={input_act: act_qspec}, + output_qspec=act_qspec, + _annotated=True, + ) + +4. Annotate tensor with derived quantization parameters +--------------------------------------------------------------- + +Another use case is to define the constraint for tensors whose quantization parameters are derived from other tensors. +For example, if we want to annotate a convolution node, and define the ``scale`` of its bias input tensor +as product of the activation tensor's ``scale`` and weight tensor's ``scale``. We can use +`DerivedQuantizationSpec `__ +to annotate this conv node. + +- Step 1: Identify the original floating point pattern in the FX graph. 
We can use the same + methods introduced in ``QuantizationSpec`` example to identify the ``convolution`` pattern. +- Step 2: Define ``derive_qparams_fn`` function, it accepts list of ``ObserverOrFakeQuantize`` ( + `ObserverBase `__ + or `FakeQuantizeBase `__) + as input. From each ``ObserverOrFakeQuantize`` object, user can get the ``scale``, ``zero point`` value. + User can define its heuristic about how to derive new ``scale``, ``zero point`` value based on the + quantization parameters calculated from the observer or fake quant instances. +- Step 3: Define ``DerivedQuantizationSpec`` obejct, it accepts inputs of: list of ``EdgeOrNode`` objects. + The observer corresponding to each ``EdgeOrNode`` object will be passed into the ``derive_qparams_fn`` function; + ``derive_qparams_fn`` function; several other quantization parameters such as ``dtype``, ``qscheme``. +- Step 4: Annotate the inputs and output of this conv node with ``QuantizationAnnotation``. + +:: + + def derive_qparams_fn(obs_or_fqs: List[ObserverOrFakeQuantize]) -> Tuple[Tensor, Tensor]: + assert len(obs_or_fqs) == 2, \ + "Expecting two obs/fqs, one for activation and one for weight, got: {}".format(len(obs_or_fq)) + act_obs_or_fq = obs_or_fqs[0] + weight_obs_or_fq = obs_or_fqs[1] + act_scale, act_zp = act_obs_or_fq.calculate_qparams() + weight_scale, weight_zp = weight_obs_or_fq.calculate_qparams() + return torch.tensor([act_scale * weight_scale]).to(torch.float32), torch.tensor([0]).to(torch.int32) + + bias_qspec = DerivedQuantizationSpec( + derived_from=[(input_act, node), (weight, node)], + derive_qparams_fn=derive_qparams_fn, + dtype=torch.int32, + quant_min=-2**31, + quant_max=2**31 - 1, + qscheme=torch.per_tensor_symmetric, + ) + input_qspec_map = {input_act: act_quantization_spec, weight: weight_quantization_spec, bias: bias_qspec} + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=act_quantization_spec, + _annotated=True, + ) + +5. A Toy Example with Resnet18 +-------------------------------------------------------- + +After above annotation methods defined with ``QuantizationAnnotation API``, we can now put them together to construct a ``BackendQuantizer`` +and run a `toy example `__ +with ``Torchvision Resnet18``. To better understand the final example, here are the classes and utility +functions that are used in the example: + +- `QuantizationConfig `__ + consists of ``QuantizationSpec`` for activation, weight, and bias separately. +- When annotating the model, + `get_input_act_qspec `__, + `get_output_act_qspec `__, + `get_weight_qspec `__, and + `get_bias_qspec `__ + can be used to get the ``QuantizationSpec`` from ``QuantizationConfig`` for a specific pattern. + +6. Conclusion +--------------------- + +With this tutorial, we introduce the new quantization path in PyTorch 2.0. Users can learn about +how to define a ``BackendQuantizer`` with the ``QuantizationAnnotation API`` and integrate it into the quantization 2.0 flow. +Examples of ``QuantizationSpec``, ``SharedQuantizationSpec``, ``FixedQParamsQuantizationSpec``, and ``DerivedQuantizationSpec`` +are given for specific annotation use case. 
diff --git a/recipes_source/recipes/amp_recipe.py b/recipes_source/recipes/amp_recipe.py index 2cdd37c8035..141bc41a034 100644 --- a/recipes_source/recipes/amp_recipe.py +++ b/recipes_source/recipes/amp_recipe.py @@ -76,11 +76,14 @@ def make_model(in_size, out_size, num_layers): num_batches = 50 epochs = 3 +device = 'cuda' if torch.cuda.is_available() else 'cpu' +torch.set_default_device(device) + # Creates data in default precision. # The same data is used for both default and mixed precision trials below. # You don't need to manually change inputs' ``dtype`` when enabling mixed precision. -data = [torch.randn(batch_size, in_size, device="cuda") for _ in range(num_batches)] -targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)] +data = [torch.randn(batch_size, in_size) for _ in range(num_batches)] +targets = [torch.randn(batch_size, out_size) for _ in range(num_batches)] loss_fn = torch.nn.MSELoss().cuda() @@ -116,7 +119,7 @@ def make_model(in_size, out_size, num_layers): for epoch in range(0): # 0 epochs, this section is for illustration only for input, target in zip(data, targets): # Runs the forward pass under ``autocast``. - with torch.autocast(device_type='cuda', dtype=torch.float16): + with torch.autocast(device_type=device, dtype=torch.float16): output = net(input) # output is float16 because linear layers ``autocast`` to float16. assert output.dtype is torch.float16 @@ -151,7 +154,7 @@ def make_model(in_size, out_size, num_layers): for epoch in range(0): # 0 epochs, this section is for illustration only for input, target in zip(data, targets): - with torch.autocast(device_type='cuda', dtype=torch.float16): + with torch.autocast(device_type=device, dtype=torch.float16): output = net(input) loss = loss_fn(output, target) @@ -184,7 +187,7 @@ def make_model(in_size, out_size, num_layers): start_timer() for epoch in range(epochs): for input, target in zip(data, targets): - with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=use_amp): + with torch.autocast(device_type=device, dtype=torch.float16, enabled=use_amp): output = net(input) loss = loss_fn(output, target) scaler.scale(loss).backward() @@ -202,7 +205,7 @@ def make_model(in_size, out_size, num_layers): for epoch in range(0): # 0 epochs, this section is for illustration only for input, target in zip(data, targets): - with torch.autocast(device_type='cuda', dtype=torch.float16): + with torch.autocast(device_type=device, dtype=torch.float16): output = net(input) loss = loss_fn(output, target) scaler.scale(loss).backward() diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py index 7c8aa135b11..0f82fb76d3d 100644 --- a/recipes_source/recipes/tuning_guide.py +++ b/recipes_source/recipes/tuning_guide.py @@ -357,7 +357,7 @@ def fused_gelu(x): # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Instead of calling ``torch.rand(size).cuda()`` to generate a random tensor, # produce the output directly on the target device: -# ``torch.rand(size, device=torch.device('cuda'))``. +# ``torch.rand(size, device='cuda')``. # # This is applicable to all functions which create new tensors and accept # ``device`` argument:
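On the last point about creating tensors directly on the target device, a small sketch (assuming a CUDA device is available) of the difference:

.. code:: python

    import torch

    size = (1024, 1024)

    # Allocates on the host first, then copies the data to the GPU.
    x_slow = torch.rand(size).cuda()

    # Allocates directly on the GPU, with no intermediate CPU tensor or copy.
    x_fast = torch.rand(size, device='cuda')

    # The same applies to other factory functions that accept ``device``.
    zeros = torch.zeros(size, device='cuda')
    ones = torch.ones(size, device='cuda')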