From f460032606c38edd1e3521fd8a225791b69bc9a8 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Fri, 2 Jun 2023 14:28:41 -0700 Subject: [PATCH 01/10] update --- .circleci/config.yml | 678 -------------------------- .circleci/config.yml.in | 213 -------- .circleci/regenerate.py | 112 ----- .github/workflows/build-tutorials.yml | 182 +++++++ .jenkins/build.sh | 34 +- .jenkins/get_files_to_run.py | 8 +- .jenkins/metadata.json | 18 +- 7 files changed, 211 insertions(+), 1034 deletions(-) delete mode 100644 .circleci/config.yml delete mode 100644 .circleci/config.yml.in delete mode 100644 .circleci/regenerate.py create mode 100644 .github/workflows/build-tutorials.yml diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index b7084096c4b..00000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,678 +0,0 @@ -# run python regenerate.py to generate config.yml from config.yml.in - -version: 2.1 - -executors: - windows-with-nvidia-gpu: - machine: - resource_class: windows.gpu.nvidia.medium - image: windows-server-2019-nvidia:stable - shell: bash.exe - -install_official_git_client: &install_official_git_client - name: Install Official Git Client - no_output_timeout: "1h" - command: | - set -e - sudo apt-get -qq update - sudo apt-get -qq install openssh-client git - -# This system setup script is meant to run before the CI-related scripts, e.g., -# installing Git client, checking out code, setting up CI env, and -# building/testing. -setup_linux_system_environment: &setup_linux_system_environment - name: Set Up System Environment - no_output_timeout: "1h" - command: | - set -ex - - # Stop background apt updates. Hypothetically, the kill should not - # be necessary, because stop is supposed to send a kill signal to - # the process, but we've added it for good luck. Also - # hypothetically, it's supposed to be unnecessary to wait for - # the process to block. We also have that line for good luck. - # If you like, try deleting them and seeing if it works. 
- sudo systemctl stop apt-daily.service || true - sudo systemctl kill --kill-who=all apt-daily.service || true - - sudo systemctl stop unattended-upgrades.service || true - sudo systemctl kill --kill-who=all unattended-upgrades.service || true - - # wait until `apt-get update` has been killed - while systemctl is-active --quiet apt-daily.service - do - sleep 1; - done - while systemctl is-active --quiet unattended-upgrades.service - do - sleep 1; - done - - # See if we actually were successful - systemctl list-units --all | cat - - sudo apt-get purge -y unattended-upgrades - - cat /etc/apt/sources.list - - ps auxfww | grep [a]pt - ps auxfww | grep dpkg - -pytorch_tutorial_build_defaults: &pytorch_tutorial_build_defaults - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - steps: - - checkout - - run: - <<: *setup_linux_system_environment - - run: - name: Set Up CI Environment - no_output_timeout: "1h" - command: | - set -e - - sudo apt-get -y update - sudo apt-get -y install expect-dev moreutils - - sudo pip3 -q install awscli==1.16.35 - - if [ -n "${CUDA_VERSION}" ]; then - nvidia-smi - fi - - # This IAM user only allows read-write access to ECR - export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_ONLY} - export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_ONLY} - eval $(aws ecr get-login --region us-east-1 --no-include-email) - - run: - name: Build - no_output_timeout: "20h" - command: | - set -e - - # for some reason, pip installs it in a different place than what is looked at in the py file - sudo pip3 install requests --target=/opt/circleci/.pyenv/versions/3.9.4/lib/python3.9/site-packages - export pyTorchDockerImageTag=$(python3 .jenkins/get_docker_tag.py) - echo "PyTorchDockerImageTag: "${pyTorchDockerImageTag} - - cat >/home/circleci/project/ci_build_script.sh \</dev/null - if [ -n "${CUDA_VERSION}" ]; then - export id=$(docker run --gpus all -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) - else - export id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) - fi - - echo "declare -x JOB_BASE_NAME=${CIRCLE_JOB}" > /home/circleci/project/env - echo "declare -x COMMIT_ID=${CIRCLE_SHA1}" >> /home/circleci/project/env - echo "declare -x COMMIT_SOURCE=${CIRCLE_BRANCH}" >> /home/circleci/project/env - # DANGER! DO NOT REMOVE THE `set +x` SETTING HERE! 
- set +x - if [[ "$CIRCLE_BRANCH" == master || "$CIRCLE_BRANCH" == main ]]; then - if [ -z "${CIRCLECI_AWS_ACCESS_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" ]; then exit 1; fi - if [ -z "${CIRCLECI_AWS_SECRET_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" ]; then exit 1; fi - if [ -z "${GITHUB_PYTORCHBOT_USERNAME}" ]; then exit 1; fi - if [ -z "${GITHUB_PYTORCHBOT_TOKEN}" ]; then exit 1; fi - - echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" >> /home/circleci/project/env - echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" >> /home/circleci/project/env - echo "declare -x GITHUB_PYTORCHBOT_USERNAME=${GITHUB_PYTORCHBOT_USERNAME}" >> /home/circleci/project/env - echo "declare -x GITHUB_PYTORCHBOT_TOKEN=${GITHUB_PYTORCHBOT_TOKEN}" >> /home/circleci/project/env - else - echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PYTORCH_TUTORIAL_BUILD_PR_S3_BUCKET}" >> /home/circleci/project/env - echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PYTORCH_TUTORIAL_BUILD_PR_S3_BUCKET}" >> /home/circleci/project/env - fi - set -x - - echo 'rm /opt/cache/bin/*' | docker exec -u root -i "$id" bash - docker cp /home/circleci/project/. "$id:/var/lib/jenkins/workspace" - - export COMMAND='((echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./ci_build_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1' - echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts - # Copy docs with plot to a docs dir - if docker exec -it "$id" sh -c "test -d ./workspace/docs_with_plot/docs/"; then - mkdir /home/circleci/project/docs - docker cp "$id:/var/lib/jenkins/workspace/docs_with_plot/docs/." /home/circleci/project/docs - echo "Directory copied successfully" - else - echo "No docs_with_plot directory. Skipping..." - fi - - - store_artifacts: - path: ./docs - destination: tutorials - -pytorch_tutorial_build_worker_defaults: &pytorch_tutorial_build_worker_defaults - environment: - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9" - CUDA_VERSION: "9" - resource_class: gpu.nvidia.small - <<: *pytorch_tutorial_build_defaults - -pytorch_tutorial_build_manager_defaults: &pytorch_tutorial_build_manager_defaults - environment: - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9" - resource_class: medium - - - <<: *pytorch_tutorial_build_defaults - -pytorch_windows_build_worker: &pytorch_windows_build_worker - executor: windows-with-nvidia-gpu - steps: - - checkout - - run: - name: Install Cuda - no_output_timeout: 30m - command: | - .circleci/scripts/windows_cuda_install.sh - - run: - name: Generate cache key - # This will refresh cache on Sundays, build should generate new cache. 
- command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - keys: - - data-{{ checksum "Makefile" }}-{{ checksum ".circleci-weekly" }} - - run: - name: test - no_output_timeout: "1h" - command: | - .circleci/scripts/build_for_windows.sh - - save_cache: - key: data-{{ checksum "Makefile" }}-{{ checksum ".circleci-weekly" }} - paths: - - advanced_source/data - - beginner_source/data - - intermediate_source/data - - prototype_source/data - -jobs: - pytorch_tutorial_pr_build_manager: - <<: *pytorch_tutorial_build_manager_defaults - pytorch_tutorial_pr_build_worker_0: - <<: *pytorch_tutorial_build_worker_defaults - resource_class: gpu.nvidia.small.multi - pytorch_tutorial_pr_build_worker_1: - <<: *pytorch_tutorial_build_worker_defaults - resource_class: gpu.nvidia.large - pytorch_tutorial_pr_build_worker_10: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_11: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_12: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_13: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_14: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_15: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_16: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_17: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_18: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_19: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_2: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_3: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_4: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_5: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_6: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_7: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_8: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_pr_build_worker_9: - <<: *pytorch_tutorial_build_worker_defaults - - pytorch_tutorial_trunk_build_manager: - <<: *pytorch_tutorial_build_manager_defaults - pytorch_tutorial_trunk_build_worker_0: - <<: *pytorch_tutorial_build_worker_defaults - resource_class: gpu.nvidia.small.multi - pytorch_tutorial_trunk_build_worker_1: - <<: *pytorch_tutorial_build_worker_defaults - resource_class: gpu.nvidia.large - pytorch_tutorial_trunk_build_worker_10: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_11: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_12: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_13: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_14: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_15: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_16: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_17: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_18: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_19: - <<: *pytorch_tutorial_build_worker_defaults - 
pytorch_tutorial_trunk_build_worker_2: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_3: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_4: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_5: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_6: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_7: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_8: - <<: *pytorch_tutorial_build_worker_defaults - pytorch_tutorial_trunk_build_worker_9: - <<: *pytorch_tutorial_build_worker_defaults - - pytorch_tutorial_windows_pr_build_worker_0: - <<: *pytorch_windows_build_worker - pytorch_tutorial_windows_pr_build_worker_1: - <<: *pytorch_windows_build_worker - pytorch_tutorial_windows_pr_build_worker_2: - <<: *pytorch_windows_build_worker - pytorch_tutorial_windows_pr_build_worker_3: - <<: *pytorch_windows_build_worker - pytorch_tutorial_windows_trunk_build_worker_0: - <<: *pytorch_windows_build_worker - pytorch_tutorial_windows_trunk_build_worker_1: - <<: *pytorch_windows_build_worker - pytorch_tutorial_windows_trunk_build_worker_2: - <<: *pytorch_windows_build_worker - pytorch_tutorial_windows_trunk_build_worker_3: - <<: *pytorch_windows_build_worker - -workflows: - build: - jobs: - # Build jobs that only run on PR - - pytorch_tutorial_pr_build_worker_0: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_1: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_2: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_3: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_4: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_5: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_6: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_7: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_8: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_9: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_10: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_11: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_12: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_13: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_14: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_15: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_16: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_17: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_18: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_19: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_manager: - filters: - branches: - ignore: - - master - - main - requires: - - pytorch_tutorial_pr_build_worker_0 - - pytorch_tutorial_pr_build_worker_1 - - pytorch_tutorial_pr_build_worker_2 - - 
pytorch_tutorial_pr_build_worker_3 - - pytorch_tutorial_pr_build_worker_4 - - pytorch_tutorial_pr_build_worker_5 - - pytorch_tutorial_pr_build_worker_6 - - pytorch_tutorial_pr_build_worker_7 - - pytorch_tutorial_pr_build_worker_8 - - pytorch_tutorial_pr_build_worker_9 - - pytorch_tutorial_pr_build_worker_10 - - pytorch_tutorial_pr_build_worker_11 - - pytorch_tutorial_pr_build_worker_12 - - pytorch_tutorial_pr_build_worker_13 - - pytorch_tutorial_pr_build_worker_14 - - pytorch_tutorial_pr_build_worker_15 - - pytorch_tutorial_pr_build_worker_16 - - pytorch_tutorial_pr_build_worker_17 - - pytorch_tutorial_pr_build_worker_18 - - pytorch_tutorial_pr_build_worker_19 - # Build jobs that only run on trunk - - pytorch_tutorial_trunk_build_worker_0: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_1: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_2: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_3: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_4: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_5: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_6: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_7: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_8: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_9: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_10: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_11: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_12: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_13: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_14: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_15: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_16: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_17: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_18: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_19: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_manager: - context: org-member - filters: - branches: - only: - - master - - main - requires: - - pytorch_tutorial_trunk_build_worker_0 - - pytorch_tutorial_trunk_build_worker_1 - - pytorch_tutorial_trunk_build_worker_2 - - pytorch_tutorial_trunk_build_worker_3 - - pytorch_tutorial_trunk_build_worker_4 - - pytorch_tutorial_trunk_build_worker_5 - - pytorch_tutorial_trunk_build_worker_6 - - pytorch_tutorial_trunk_build_worker_7 - - pytorch_tutorial_trunk_build_worker_8 - - 
pytorch_tutorial_trunk_build_worker_9 - - pytorch_tutorial_trunk_build_worker_10 - - pytorch_tutorial_trunk_build_worker_11 - - pytorch_tutorial_trunk_build_worker_12 - - pytorch_tutorial_trunk_build_worker_13 - - pytorch_tutorial_trunk_build_worker_14 - - pytorch_tutorial_trunk_build_worker_15 - - pytorch_tutorial_trunk_build_worker_16 - - pytorch_tutorial_trunk_build_worker_17 - - pytorch_tutorial_trunk_build_worker_18 - - pytorch_tutorial_trunk_build_worker_19 -# - pytorch_tutorial_windows_pr_build_worker_0: -# filters: -# branches: -# ignore: -# - master -# - main -# - pytorch_tutorial_windows_pr_build_worker_1: -# filters: -# branches: -# ignore: -# - master -# - main -# - pytorch_tutorial_windows_pr_build_worker_2: -# filters: -# branches: -# ignore: -# - master -# - main -# - pytorch_tutorial_windows_pr_build_worker_3: -# filters: -# branches: -# ignore: -# - master -# - main -# - pytorch_tutorial_windows_trunk_build_worker_0: -# context: org-member -# filters: -# branches: -# only: -# - master -# - main -# - pytorch_tutorial_windows_trunk_build_worker_1: -# context: org-member -# filters: -# branches: -# only: -# - master -# - main -# - pytorch_tutorial_windows_trunk_build_worker_2: -# context: org-member -# filters: -# branches: -# only: -# - master -# - main -# - pytorch_tutorial_windows_trunk_build_worker_3: -# context: org-member -# filters: -# branches: -# only: -# - master -# - main diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in deleted file mode 100644 index 0694d221aad..00000000000 --- a/.circleci/config.yml.in +++ /dev/null @@ -1,213 +0,0 @@ -# run python regenerate.py to generate config.yml from config.yml.in - -version: 2.1 - -executors: - windows-with-nvidia-gpu: - machine: - resource_class: windows.gpu.nvidia.medium - image: windows-server-2019-nvidia:stable - shell: bash.exe - -install_official_git_client: &install_official_git_client - name: Install Official Git Client - no_output_timeout: "1h" - command: | - set -e - sudo apt-get -qq update - sudo apt-get -qq install openssh-client git - -# This system setup script is meant to run before the CI-related scripts, e.g., -# installing Git client, checking out code, setting up CI env, and -# building/testing. -setup_linux_system_environment: &setup_linux_system_environment - name: Set Up System Environment - no_output_timeout: "1h" - command: | - set -ex - - # Stop background apt updates. Hypothetically, the kill should not - # be necessary, because stop is supposed to send a kill signal to - # the process, but we've added it for good luck. Also - # hypothetically, it's supposed to be unnecessary to wait for - # the process to block. We also have that line for good luck. - # If you like, try deleting them and seeing if it works. 
- sudo systemctl stop apt-daily.service || true - sudo systemctl kill --kill-who=all apt-daily.service || true - - sudo systemctl stop unattended-upgrades.service || true - sudo systemctl kill --kill-who=all unattended-upgrades.service || true - - # wait until `apt-get update` has been killed - while systemctl is-active --quiet apt-daily.service - do - sleep 1; - done - while systemctl is-active --quiet unattended-upgrades.service - do - sleep 1; - done - - # See if we actually were successful - systemctl list-units --all | cat - - sudo apt-get purge -y unattended-upgrades - - cat /etc/apt/sources.list - - ps auxfww | grep [a]pt - ps auxfww | grep dpkg - -pytorch_tutorial_build_defaults: &pytorch_tutorial_build_defaults - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - steps: - - checkout - - run: - <<: *setup_linux_system_environment - - run: - name: Set Up CI Environment - no_output_timeout: "1h" - command: | - set -e - - sudo apt-get -y update - sudo apt-get -y install expect-dev moreutils - - sudo pip3 -q install awscli==1.16.35 - - if [ -n "${CUDA_VERSION}" ]; then - nvidia-smi - fi - - # This IAM user only allows read-write access to ECR - export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_ONLY} - export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_ONLY} - eval $(aws ecr get-login --region us-east-1 --no-include-email) - - run: - name: Build - no_output_timeout: "20h" - command: | - set -e - - # for some reason, pip installs it in a different place than what is looked at in the py file - sudo pip3 install requests --target=/opt/circleci/.pyenv/versions/3.9.4/lib/python3.9/site-packages - export pyTorchDockerImageTag=$(python3 .jenkins/get_docker_tag.py) - echo "PyTorchDockerImageTag: "${pyTorchDockerImageTag} - - cat >/home/circleci/project/ci_build_script.sh \</dev/null - if [ -n "${CUDA_VERSION}" ]; then - export id=$(docker run --gpus all -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) - else - export id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) - fi - - echo "declare -x JOB_BASE_NAME=${CIRCLE_JOB}" > /home/circleci/project/env - echo "declare -x COMMIT_ID=${CIRCLE_SHA1}" >> /home/circleci/project/env - echo "declare -x COMMIT_SOURCE=${CIRCLE_BRANCH}" >> /home/circleci/project/env - # DANGER! DO NOT REMOVE THE `set +x` SETTING HERE! 
- set +x - if [[ "$CIRCLE_BRANCH" == master || "$CIRCLE_BRANCH" == main ]]; then - if [ -z "${CIRCLECI_AWS_ACCESS_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" ]; then exit 1; fi - if [ -z "${CIRCLECI_AWS_SECRET_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" ]; then exit 1; fi - if [ -z "${GITHUB_PYTORCHBOT_USERNAME}" ]; then exit 1; fi - if [ -z "${GITHUB_PYTORCHBOT_TOKEN}" ]; then exit 1; fi - - echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" >> /home/circleci/project/env - echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" >> /home/circleci/project/env - echo "declare -x GITHUB_PYTORCHBOT_USERNAME=${GITHUB_PYTORCHBOT_USERNAME}" >> /home/circleci/project/env - echo "declare -x GITHUB_PYTORCHBOT_TOKEN=${GITHUB_PYTORCHBOT_TOKEN}" >> /home/circleci/project/env - else - echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PYTORCH_TUTORIAL_BUILD_PR_S3_BUCKET}" >> /home/circleci/project/env - echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PYTORCH_TUTORIAL_BUILD_PR_S3_BUCKET}" >> /home/circleci/project/env - fi - set -x - - echo 'rm /opt/cache/bin/*' | docker exec -u root -i "$id" bash - docker cp /home/circleci/project/. "$id:/var/lib/jenkins/workspace" - - export COMMAND='((echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./ci_build_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1' - echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts - # Copy docs with plot to a docs dir - if docker exec -it "$id" sh -c "test -d ./workspace/docs_with_plot/docs/"; then - mkdir /home/circleci/project/docs - docker cp "$id:/var/lib/jenkins/workspace/docs_with_plot/docs/." /home/circleci/project/docs - echo "Directory copied successfully" - else - echo "No docs_with_plot directory. Skipping..." - fi - - - store_artifacts: - path: ./docs - destination: tutorials - -pytorch_tutorial_build_worker_defaults: &pytorch_tutorial_build_worker_defaults - environment: - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9" - CUDA_VERSION: "9" - resource_class: gpu.nvidia.small - <<: *pytorch_tutorial_build_defaults - -pytorch_tutorial_build_manager_defaults: &pytorch_tutorial_build_manager_defaults - environment: - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9" - resource_class: medium - - - <<: *pytorch_tutorial_build_defaults -{% raw %} -pytorch_windows_build_worker: &pytorch_windows_build_worker - executor: windows-with-nvidia-gpu - steps: - - checkout - - run: - name: Install Cuda - no_output_timeout: 30m - command: | - .circleci/scripts/windows_cuda_install.sh - - run: - name: Generate cache key - # This will refresh cache on Sundays, build should generate new cache. 
- command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - keys: - - data-{{ checksum "Makefile" }}-{{ checksum ".circleci-weekly" }} - - run: - name: test - no_output_timeout: "1h" - command: | - .circleci/scripts/build_for_windows.sh - - save_cache: - key: data-{{ checksum "Makefile" }}-{{ checksum ".circleci-weekly" }} - paths: - - advanced_source/data - - beginner_source/data - - intermediate_source/data - - prototype_source/data -{% endraw %} -jobs: - {{ jobs("pr") }} - - {{ jobs("trunk") }} - - {{ windows_jobs() }} - -workflows: - build: - jobs: - # Build jobs that only run on PR - {{ workflows_jobs("pr") }} - # Build jobs that only run on trunk - {{ workflows_jobs("trunk") }} -# {{ windows_workflows_jobs() }} diff --git a/.circleci/regenerate.py b/.circleci/regenerate.py deleted file mode 100644 index f47ee1dfa6f..00000000000 --- a/.circleci/regenerate.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python3 - -# regenrates config.yml based on config.yml.in - -from copy import deepcopy -import os.path - -import jinja2 -import yaml -from jinja2 import select_autoescape - -WORKFLOWS_JOBS_PR = {"filters": {"branches": {"ignore": ["master", "main"]}}} - -WORKFLOWS_JOBS_TRUNK = { - "context": "org-member", - "filters": {"branches": {"only": ["master", "main"]}}, -} - - -def indent(indentation, data_list): - return ("\n" + " " * indentation).join( - yaml.dump(data_list, default_flow_style=False).splitlines() - ) - - -def jobs(pr_or_trunk, num_workers=20, indentation=2): - jobs = {} - - # all tutorials that need gpu.nvidia.small.multi machines will be routed by - # get_files_to_run.py to 0th worker, similarly for gpu.nvidia.large and the - # 1st worker - needs_gpu_nvidia_small_multi = [0] - needs_gpu_nvidia_large = [1] - jobs[f"pytorch_tutorial_{pr_or_trunk}_build_manager"] = { - "<<": "*pytorch_tutorial_build_manager_defaults" - } - for i in range(num_workers): - job_info = {"<<": "*pytorch_tutorial_build_worker_defaults"} - if i in needs_gpu_nvidia_small_multi: - job_info["resource_class"] = "gpu.nvidia.small.multi" - if i in needs_gpu_nvidia_large: - job_info["resource_class"] = "gpu.nvidia.large" - jobs[f"pytorch_tutorial_{pr_or_trunk}_build_worker_{i}"] = job_info - - return indent(indentation, jobs).replace("'", "") - - -def workflows_jobs(pr_or_trunk, indentation=6, num_workers=20): - jobs = [] - job_info = deepcopy( - WORKFLOWS_JOBS_PR if pr_or_trunk == "pr" else WORKFLOWS_JOBS_TRUNK - ) - - for i in range(num_workers): - jobs.append( - {f"pytorch_tutorial_{pr_or_trunk}_build_worker_{i}": deepcopy(job_info)} - ) - - job_info["requires"] = [ - f"pytorch_tutorial_{pr_or_trunk}_build_worker_{i}" for i in range(num_workers) - ] - jobs.append({f"pytorch_tutorial_{pr_or_trunk}_build_manager": deepcopy(job_info)}) - return indent(indentation, jobs) - - -def windows_jobs(indentation=2, num_workers=4): - jobs = {} - for i in range(num_workers): - jobs[f"pytorch_tutorial_windows_pr_build_worker_{i}"] = { - "<<": "*pytorch_windows_build_worker" - } - jobs[f"pytorch_tutorial_windows_trunk_build_worker_{i}"] = { - "<<": "*pytorch_windows_build_worker" - } - return indent(indentation, jobs).replace("'", "") - - -def windows_workflows_jobs(indentation=6, num_workers=4): - jobs = [] - job_info = WORKFLOWS_JOBS_PR - for i in range(num_workers): - jobs.append( - {f"pytorch_tutorial_windows_pr_build_worker_{i}": deepcopy(job_info)} - ) - - job_info = WORKFLOWS_JOBS_TRUNK - for i in range(num_workers): - jobs.append( - {f"pytorch_tutorial_windows_trunk_build_worker_{i}": 
deepcopy(job_info)} - ) - - return ("\n#").join(indent(indentation, jobs).splitlines()) - - -if __name__ == "__main__": - - directory = os.path.dirname(__file__) - env = jinja2.Environment( - loader=jinja2.FileSystemLoader(directory), - lstrip_blocks=True, - autoescape=select_autoescape(enabled_extensions=("html", "xml")), - keep_trailing_newline=True, - ) - with open(os.path.join(directory, "config.yml"), "w") as f: - f.write( - env.get_template("config.yml.in").render( - jobs=jobs, - workflows_jobs=workflows_jobs, - windows_jobs=windows_jobs, - windows_workflows_jobs=windows_workflows_jobs, - ) - ) diff --git a/.github/workflows/build-tutorials.yml b/.github/workflows/build-tutorials.yml new file mode 100644 index 00000000000..1e2d22682e4 --- /dev/null +++ b/.github/workflows/build-tutorials.yml @@ -0,0 +1,182 @@ +name: Build tutorials + +on: + pull_request: + push: + branches: + - main + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + worker: + name: pytorch_tutorial_build_worker + strategy: + matrix: + include: + - { shard: 1, num_shards: 6, runner: "linux.16xlarge.nvidia.gpu" } + - { shard: 2, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" } + - { shard: 3, num_shards: 6, runner: "linux.4xlarge.nvidia.gpu" } + - { shard: 4, num_shards: 6, runner: "linux.4xlarge.nvidia.gpu" } + - { shard: 5, num_shards: 6, runner: "linux.4xlarge.nvidia.gpu" } + - { shard: 6, num_shards: 6, runner: "linux.4xlarge.nvidia.gpu" } + fail-fast: false + runs-on: ${{ matrix.runner }} + env: + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9" + CUDA_VERSION: "9" + steps: + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@main + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + instructions: | + All testing is done inside the container, to start an interactive session run: + docker exec -it $(docker container ps --format '{{.ID}}') bash + + - name: Checkout Tutorials + uses: actions/checkout@v3 + + - name: Setup Linux + uses: pytorch/pytorch/.github/actions/setup-linux@main + + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + uses: pytorch/test-infra/.github/actions/setup-nvidia@main + + - name: Calculate docker image + shell: bash + id: docker-image + run: | + set -ex + + # for some reason, pip installs it in a different place than what is looked at in the py file + pip3 install requests==2.26 + pyTorchDockerImageTag=$(python3 .jenkins/get_docker_tag.py) + + echo "docker-image=${DOCKER_IMAGE}:${pyTorchDockerImageTag}" >> "${GITHUB_OUTPUT}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ steps.docker-image.outputs.docker-image }} + + - name: Build + shell: bash + env: + DOCKER_IMAGE: ${{ steps.docker-image.outputs.docker-image }} + NUM_WORKERS: ${{ matrix.num_shards }} + WORKER_ID: ${{ matrix.shard }} + COMMIT_ID: ${{ github.sha }} + JOB_TYPE: worker + COMMIT_SOURCE: ${{ github.ref }} + run: | + set -ex + + chmod +x ".jenkins/build.sh" + + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e WORKER_ID \ + -e NUM_WORKERS \ + -e COMMIT_ID \ + -e JOB_TYPE \ + -e COMMIT_SOURCE \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --tty \ + --detach \ + --user jenkins \ + --name="${container_name}" \ + -v 
"${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + + echo "rm /opt/cache/bin/*" | docker exec -u root -i "${container_name}" bash + + docker exec -t "${container_name}" sh -c ".jenkins/build.sh" + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() + + manager: + name: pytorch_tutorial_build_manager + needs: worker + runs-on: [self-hosted, linux.2xlarge] + env: + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9" + CUDA_VERSION: "9" + steps: + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@main + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + instructions: | + All testing is done inside the container, to start an interactive session run: + docker exec -it $(docker container ps --format '{{.ID}}') bash + + - name: Checkout Tutorials + uses: actions/checkout@v3 + + - name: Setup Linux + uses: pytorch/pytorch/.github/actions/setup-linux@main + + - name: Calculate docker image + shell: bash + id: docker-image + run: | + set -ex + + # for some reason, pip installs it in a different place than what is looked at in the py file + pip3 install requests==2.26 + pyTorchDockerImageTag=$(python3 .jenkins/get_docker_tag.py) + + echo "docker-image=${DOCKER_IMAGE}:${pyTorchDockerImageTag}" >> "${GITHUB_OUTPUT}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ steps.docker-image.outputs.docker-image }} + + - name: Build + shell: bash + env: + DOCKER_IMAGE: ${{ steps.docker-image.outputs.docker-image }} + NUM_WORKERS: 5 + WORKER_ID: ${{ matrix.shard }} + COMMIT_ID: ${{ github.sha }} + JOB_TYPE: manager + COMMIT_SOURCE: ${{ github.ref }} + GITHUB_PYTORCHBOT_TOKEN: ${{ secrets.PYTORCHBOT_TOKEN }} + run: | + set -ex + + chmod +x ".jenkins/build.sh" + + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e WORKER_ID \ + -e NUM_WORKERS \ + -e COMMIT_ID \ + -e JOB_TYPE \ + -e COMMIT_SOURCE \ + -e GITHUB_PYTORCHBOT_TOKEN \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --tty \ + --detach \ + --user jenkins \ + --name="${container_name}" \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + + echo "rm /opt/cache/bin/*" | docker exec -u root -i "${container_name}" bash + + docker exec -t "${container_name}" sh -c ".jenkins/build.sh" + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() diff --git a/.jenkins/build.sh b/.jenkins/build.sh index d09b0a8782a..a153638d9ca 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -1,10 +1,8 @@ +#!/bin/bash + set -ex -if [[ "$COMMIT_SOURCE" == master || "$COMMIT_SOURCE" == main ]]; then - export BUCKET_NAME=pytorch-tutorial-build-master -else - export BUCKET_NAME=pytorch-tutorial-build-pull-request -fi +export BUCKET_NAME=pytorch-tutorial-build-pull-request # set locale for click dependency in spacy export LC_ALL=C.UTF-8 @@ -25,7 +23,7 @@ pip install -r $DIR/../requirements.txt # Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html # RC Link # pip uninstall -y torch torchvision torchaudio torchtext -# pip install --pre --upgrade -f https://download.pytorch.org/whl/test/cu102/torch_test.html torch torchvision torchaudio torchtext +# pip install --pre --upgrade -f 
https://download.pytorch.org/whl/test/cu102/torch_test.html torch torchvision torchaudio torchtext # pip uninstall -y torch torchvision torchaudio torchtext # pip install --pre --upgrade -f https://download.pytorch.org/whl/test/cu116/torch_test.html torch torchdata torchvision torchaudio torchtext @@ -37,8 +35,7 @@ awsv2 -i awsv2 configure set default.s3.multipart_threshold 5120MB # Decide whether to parallelize tutorial builds, based on $JOB_BASE_NAME -export NUM_WORKERS=20 -if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then +if [[ "${JOB_TYPE}" == "worker" ]]; then # Step 1: Remove runnable code from tutorials that are not supposed to be run python $DIR/remove_runnable_code.py beginner_source/aws_distributed_training_tutorial.py beginner_source/aws_distributed_training_tutorial.py || true # python $DIR/remove_runnable_code.py advanced_source/ddp_pipeline_tutorial.py advanced_source/ddp_pipeline_tutorial.py || true @@ -47,7 +44,7 @@ if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then # python $DIR/remove_runnable_code.py intermediate_source/spatial_transformer_tutorial.py intermediate_source/spatial_transformer_tutorial.py || true # Temp remove for 1.10 release. # python $DIR/remove_runnable_code.py advanced_source/neural_style_tutorial.py advanced_source/neural_style_tutorial.py || true - + # TODO: Fix bugs in these tutorials to make them runnable again # python $DIR/remove_runnable_code.py beginner_source/audio_classifier_tutorial.py beginner_source/audio_classifier_tutorial.py || true @@ -56,7 +53,6 @@ if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then # Step 2: Keep certain tutorials based on file count, and remove runnable code in all other tutorials # IMPORTANT NOTE: We assume that each tutorial has a UNIQUE filename. - export WORKER_ID=$(echo "${JOB_BASE_NAME}" | tr -dc '0-9') FILES_TO_RUN=$(python .jenkins/get_files_to_run.py) echo "FILES_TO_RUN: " ${FILES_TO_RUN} @@ -117,25 +113,17 @@ if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then # Step 6: Copy generated files to S3, tag with commit ID 7z a worker_${WORKER_ID}.7z docs awsv2 s3 cp worker_${WORKER_ID}.7z s3://${BUCKET_NAME}/${COMMIT_ID}/worker_${WORKER_ID}.7z --acl public-read -elif [[ "${JOB_BASE_NAME}" == *manager ]]; then +elif [[ "${JOB_TYPE}" == "manager" ]]; then # Step 1: Generate no-plot HTML pages for all tutorials make html-noplot cp -r _build/html docs # Step 2: Wait for all workers to finish - set +e - for ((worker_id=0;worker_id List[str]: sources = [x.relative_to(REPO_BASE_DIR) for x in REPO_BASE_DIR.glob("*_source/**/*.py") if 'data' not in x.parts] - return [str(x) for x in sources] + return sorted([str(x) for x in sources]) def read_metadata() -> Dict[str, Any]: @@ -87,8 +87,8 @@ def parse_args() -> Any: from argparse import ArgumentParser parser = ArgumentParser("Select files to run") parser.add_argument("--dry-run", action="store_true") - parser.add_argument("--num-shards", type=int, default=int(os.environ.get("NUM_WORKERS", 20))) - parser.add_argument("--shard-num", type=int, default=int(os.environ.get("WORKER_ID", 0))) + parser.add_argument("--num-shards", type=int, default=int(os.environ.get("NUM_WORKERS", "20"))) + parser.add_argument("--shard-num", type=int, default=int(os.environ.get("WORKER_ID", "1"))) return parser.parse_args() @@ -96,7 +96,7 @@ def main() -> None: args = parse_args() all_files = get_all_files() - files_to_run = calculate_shards(all_files, num_shards=args.num_shards)[args.shard_num] + files_to_run = calculate_shards(all_files, num_shards=args.num_shards)[args.shard_num - 1] if not args.dry_run: 
remove_other_files(all_files, compute_files_to_keep(files_to_run)) stripped_file_names = [Path(x).stem for x in files_to_run] diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index 40c0e13c74e..f3ec3a3986b 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -1,10 +1,10 @@ { "intermediate_source/ax_multiobjective_nas_tutorial.py": { "extra_files": ["intermediate_source/mnist_train_nas.py"], - "duration": 2000 + "duration": 1100 }, "beginner_source/dcgan_faces_tutorial.py": { - "duration": 2000 + "duration": 1500 }, "intermediate_source/seq2seq_translation_tutorial.py": { "duration": 1200 @@ -18,12 +18,22 @@ "beginner_source/chatbot_tutorial.py": { "duration": 330 }, + "intermediate_source/char_rnn_generation_tutorial.py": { + "duration": 700 + }, + "intermediate_source/reinforcement_q_learning.py": { + "duration": 600 + }, "intermediate_source/pipeline_tutorial.py": { - "duration": 320, + "duration": 500, "needs": "gpu.nvidia.small.multi" }, + "intermediate_source/reinforcement_ppo.py": { + "duration": 370 + }, "intermediate_source/model_parallel_tutorial.py": { - "needs": "gpu.nvidia.small.multi" + "needs": "gpu.nvidia.small.multi", + "duration": 370 }, "intermediate_source/torch_compile_tutorial.py": { "needs": "gpu.nvidia.large" From fe90d0b28f4ff84e719eff4b7935f2b740134da7 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Fri, 2 Jun 2023 14:29:38 -0700 Subject: [PATCH 02/10] update --- recipes_source/recipes/changing_default_device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_source/recipes/changing_default_device.py b/recipes_source/recipes/changing_default_device.py index f5e50b3f0be..61f7c4f26b4 100644 --- a/recipes_source/recipes/changing_default_device.py +++ b/recipes_source/recipes/changing_default_device.py @@ -35,7 +35,7 @@ print(mod(torch.randn(128, 20)).device) ######################################### -# You can also set it globally like this: +# You can also set it globally like this: torch.set_default_device('cuda') From 04bd453913a92c6374029605b2b403004121afeb Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Fri, 2 Jun 2023 14:57:58 -0700 Subject: [PATCH 03/10] update --- .jenkins/metadata.json | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json index f3ec3a3986b..40c0e13c74e 100644 --- a/.jenkins/metadata.json +++ b/.jenkins/metadata.json @@ -1,10 +1,10 @@ { "intermediate_source/ax_multiobjective_nas_tutorial.py": { "extra_files": ["intermediate_source/mnist_train_nas.py"], - "duration": 1100 + "duration": 2000 }, "beginner_source/dcgan_faces_tutorial.py": { - "duration": 1500 + "duration": 2000 }, "intermediate_source/seq2seq_translation_tutorial.py": { "duration": 1200 @@ -18,22 +18,12 @@ "beginner_source/chatbot_tutorial.py": { "duration": 330 }, - "intermediate_source/char_rnn_generation_tutorial.py": { - "duration": 700 - }, - "intermediate_source/reinforcement_q_learning.py": { - "duration": 600 - }, "intermediate_source/pipeline_tutorial.py": { - "duration": 500, + "duration": 320, "needs": "gpu.nvidia.small.multi" }, - "intermediate_source/reinforcement_ppo.py": { - "duration": 370 - }, "intermediate_source/model_parallel_tutorial.py": { - "needs": "gpu.nvidia.small.multi", - "duration": 370 + "needs": "gpu.nvidia.small.multi" }, "intermediate_source/torch_compile_tutorial.py": { "needs": "gpu.nvidia.large" From 8fa6add9142e1e409a10ce9fc0ac8a2327caeea2 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Fri, 2 Jun 2023 
14:59:10 -0700 Subject: [PATCH 04/10] update --- recipes_source/recipes/changing_default_device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_source/recipes/changing_default_device.py b/recipes_source/recipes/changing_default_device.py index 61f7c4f26b4..f5e50b3f0be 100644 --- a/recipes_source/recipes/changing_default_device.py +++ b/recipes_source/recipes/changing_default_device.py @@ -35,7 +35,7 @@ print(mod(torch.randn(128, 20)).device) ######################################### -# You can also set it globally like this: +# You can also set it globally like this: torch.set_default_device('cuda') From 5aca75f91faba06365665a02b7e6baac16d3c36a Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Fri, 2 Jun 2023 16:29:51 -0700 Subject: [PATCH 05/10] update --- .jenkins/build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.jenkins/build.sh b/.jenkins/build.sh index a153638d9ca..f13966ff84b 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -112,7 +112,7 @@ if [[ "${JOB_TYPE}" == "worker" ]]; then # Step 6: Copy generated files to S3, tag with commit ID 7z a worker_${WORKER_ID}.7z docs - awsv2 s3 cp worker_${WORKER_ID}.7z s3://${BUCKET_NAME}/${COMMIT_ID}/worker_${WORKER_ID}.7z --acl public-read + awsv2 s3 cp worker_${WORKER_ID}.7z s3://${BUCKET_NAME}/${COMMIT_ID}/worker_${WORKER_ID}.7z elif [[ "${JOB_TYPE}" == "manager" ]]; then # Step 1: Generate no-plot HTML pages for all tutorials make html-noplot @@ -138,7 +138,7 @@ elif [[ "${JOB_TYPE}" == "manager" ]]; then # Step 6: Copy generated HTML files and static files to S3 7z a manager.7z docs - awsv2 s3 cp manager.7z s3://${BUCKET_NAME}/${COMMIT_ID}/manager.7z --acl public-read + awsv2 s3 cp manager.7z s3://${BUCKET_NAME}/${COMMIT_ID}/manager.7z # Step 7: push new HTML files and static files to gh-pages if [[ "$COMMIT_SOURCE" == "refs/heads/master" || "$COMMIT_SOURCE" == "refs/heads/main" ]]; then From c5efbe5bd2f3e62d388bf998460dea5e3c3c2ef1 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Fri, 2 Jun 2023 18:08:02 -0700 Subject: [PATCH 06/10] update --- .github/workflows/build-tutorials.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-tutorials.yml b/.github/workflows/build-tutorials.yml index 1e2d22682e4..222de26b9bb 100644 --- a/.github/workflows/build-tutorials.yml +++ b/.github/workflows/build-tutorials.yml @@ -144,7 +144,7 @@ jobs: shell: bash env: DOCKER_IMAGE: ${{ steps.docker-image.outputs.docker-image }} - NUM_WORKERS: 5 + NUM_WORKERS: 6 WORKER_ID: ${{ matrix.shard }} COMMIT_ID: ${{ github.sha }} JOB_TYPE: manager From a4fb7a77d3abc46e2f53a23e74dbfe9f5b46c4dd Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Mon, 5 Jun 2023 09:55:03 -0700 Subject: [PATCH 07/10] add empty config yml file --- .circleci/config.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 .circleci/config.yml diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 00000000000..e69de29bb2d From 7ee4390783b55f6daf4565413018d9b08ceaa837 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Mon, 5 Jun 2023 10:02:57 -0700 Subject: [PATCH 08/10] what is the bare minimum --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index e69de29bb2d..3879be90472 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -0,0 +1 @@ +version: 2.1 From 3a073787465811aa8ee9ba187e5bab35441826a4 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Mon, 5 Jun 2023 
10:07:13 -0700 Subject: [PATCH 09/10] take 3 --- .circleci/config.yml | 316 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 316 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 3879be90472..87e80e4a87f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1 +1,317 @@ +# run python regenerate.py to generate config.yml from config.yml.in + version: 2.1 + +executors: + windows-with-nvidia-gpu: + machine: + resource_class: windows.gpu.nvidia.medium + image: windows-server-2019-nvidia:stable + shell: bash.exe + +install_official_git_client: &install_official_git_client + name: Install Official Git Client + no_output_timeout: "1h" + command: | + set -e + sudo apt-get -qq update + sudo apt-get -qq install openssh-client git + +# This system setup script is meant to run before the CI-related scripts, e.g., +# installing Git client, checking out code, setting up CI env, and +# building/testing. +setup_linux_system_environment: &setup_linux_system_environment + name: Set Up System Environment + no_output_timeout: "1h" + command: | + set -ex + + # Stop background apt updates. Hypothetically, the kill should not + # be necessary, because stop is supposed to send a kill signal to + # the process, but we've added it for good luck. Also + # hypothetically, it's supposed to be unnecessary to wait for + # the process to block. We also have that line for good luck. + # If you like, try deleting them and seeing if it works. + sudo systemctl stop apt-daily.service || true + sudo systemctl kill --kill-who=all apt-daily.service || true + + sudo systemctl stop unattended-upgrades.service || true + sudo systemctl kill --kill-who=all unattended-upgrades.service || true + + # wait until `apt-get update` has been killed + while systemctl is-active --quiet apt-daily.service + do + sleep 1; + done + while systemctl is-active --quiet unattended-upgrades.service + do + sleep 1; + done + + # See if we actually were successful + systemctl list-units --all | cat + + sudo apt-get purge -y unattended-upgrades + + cat /etc/apt/sources.list + + ps auxfww | grep [a]pt + ps auxfww | grep dpkg + +pytorch_tutorial_build_defaults: &pytorch_tutorial_build_defaults + machine: + image: ubuntu-2004-cuda-11.4:202110-01 + steps: + - checkout + - run: + <<: *setup_linux_system_environment + - run: + name: Set Up CI Environment + no_output_timeout: "1h" + command: | + set -e + + sudo apt-get -y update + sudo apt-get -y install expect-dev moreutils + + sudo pip3 -q install awscli==1.16.35 + + if [ -n "${CUDA_VERSION}" ]; then + nvidia-smi + fi + + # This IAM user only allows read-write access to ECR + export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_ONLY} + export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_ONLY} + eval $(aws ecr get-login --region us-east-1 --no-include-email) + - run: + name: Build + no_output_timeout: "20h" + command: | + set -e + + # for some reason, pip installs it in a different place than what is looked at in the py file + sudo pip3 install requests --target=/opt/circleci/.pyenv/versions/3.9.4/lib/python3.9/site-packages + export pyTorchDockerImageTag=$(python3 .jenkins/get_docker_tag.py) + echo "PyTorchDockerImageTag: "${pyTorchDockerImageTag} + + cat >/home/circleci/project/ci_build_script.sh \</dev/null + if [ -n "${CUDA_VERSION}" ]; then + export id=$(docker run --gpus all -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) + else + export id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) + fi + + echo "declare -x 
JOB_BASE_NAME=${CIRCLE_JOB}" > /home/circleci/project/env + echo "declare -x COMMIT_ID=${CIRCLE_SHA1}" >> /home/circleci/project/env + echo "declare -x COMMIT_SOURCE=${CIRCLE_BRANCH}" >> /home/circleci/project/env + # DANGER! DO NOT REMOVE THE `set +x` SETTING HERE! + set +x + if [[ "$CIRCLE_BRANCH" == master || "$CIRCLE_BRANCH" == main ]]; then + if [ -z "${CIRCLECI_AWS_ACCESS_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" ]; then exit 1; fi + if [ -z "${CIRCLECI_AWS_SECRET_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" ]; then exit 1; fi + if [ -z "${GITHUB_PYTORCHBOT_USERNAME}" ]; then exit 1; fi + if [ -z "${GITHUB_PYTORCHBOT_TOKEN}" ]; then exit 1; fi + + echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" >> /home/circleci/project/env + echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" >> /home/circleci/project/env + echo "declare -x GITHUB_PYTORCHBOT_USERNAME=${GITHUB_PYTORCHBOT_USERNAME}" >> /home/circleci/project/env + echo "declare -x GITHUB_PYTORCHBOT_TOKEN=${GITHUB_PYTORCHBOT_TOKEN}" >> /home/circleci/project/env + else + echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PYTORCH_TUTORIAL_BUILD_PR_S3_BUCKET}" >> /home/circleci/project/env + echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PYTORCH_TUTORIAL_BUILD_PR_S3_BUCKET}" >> /home/circleci/project/env + fi + set -x + + echo 'rm /opt/cache/bin/*' | docker exec -u root -i "$id" bash + docker cp /home/circleci/project/. "$id:/var/lib/jenkins/workspace" + + export COMMAND='((echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./ci_build_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1' + echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts + # Copy docs with plot to a docs dir + if docker exec -it "$id" sh -c "test -d ./workspace/docs_with_plot/docs/"; then + mkdir /home/circleci/project/docs + docker cp "$id:/var/lib/jenkins/workspace/docs_with_plot/docs/." /home/circleci/project/docs + echo "Directory copied successfully" + else + echo "No docs_with_plot directory. Skipping..." + fi + + - store_artifacts: + path: ./docs + destination: tutorials + +pytorch_tutorial_build_worker_defaults: &pytorch_tutorial_build_worker_defaults + environment: + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9" + CUDA_VERSION: "9" + resource_class: gpu.nvidia.small + <<: *pytorch_tutorial_build_defaults + +pytorch_tutorial_build_manager_defaults: &pytorch_tutorial_build_manager_defaults + environment: + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9" + resource_class: medium + + + <<: *pytorch_tutorial_build_defaults + +pytorch_windows_build_worker: &pytorch_windows_build_worker + executor: windows-with-nvidia-gpu + steps: + - checkout + - run: + name: Install Cuda + no_output_timeout: 30m + command: | + .circleci/scripts/windows_cuda_install.sh + - run: + name: Generate cache key + # This will refresh cache on Sundays, build should generate new cache. 
+ command: echo "$(date +"%Y-%U")" > .circleci-weekly + - restore_cache: + keys: + - data-{{ checksum "Makefile" }}-{{ checksum ".circleci-weekly" }} + - run: + name: test + no_output_timeout: "1h" + command: | + .circleci/scripts/build_for_windows.sh + - save_cache: + key: data-{{ checksum "Makefile" }}-{{ checksum ".circleci-weekly" }} + paths: + - advanced_source/data + - beginner_source/data + - intermediate_source/data + - prototype_source/data + +jobs: + pytorch_tutorial_pr_build_manager: + <<: *pytorch_tutorial_build_manager_defaults + pytorch_tutorial_pr_build_worker_0: + <<: *pytorch_tutorial_build_worker_defaults + resource_class: gpu.nvidia.small.multi + pytorch_tutorial_pr_build_worker_1: + <<: *pytorch_tutorial_build_worker_defaults + resource_class: gpu.nvidia.large + pytorch_tutorial_pr_build_worker_10: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_pr_build_worker_11: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_pr_build_worker_12: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_pr_build_worker_13: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_pr_build_worker_14: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_pr_build_worker_15: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_pr_build_worker_16: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_pr_build_worker_17: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_pr_build_worker_18: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_pr_build_worker_19: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_pr_build_worker_2: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_pr_build_worker_3: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_pr_build_worker_4: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_pr_build_worker_5: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_pr_build_worker_6: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_pr_build_worker_7: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_pr_build_worker_8: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_pr_build_worker_9: + <<: *pytorch_tutorial_build_worker_defaults + + pytorch_tutorial_trunk_build_manager: + <<: *pytorch_tutorial_build_manager_defaults + pytorch_tutorial_trunk_build_worker_0: + <<: *pytorch_tutorial_build_worker_defaults + resource_class: gpu.nvidia.small.multi + pytorch_tutorial_trunk_build_worker_1: + <<: *pytorch_tutorial_build_worker_defaults + resource_class: gpu.nvidia.large + pytorch_tutorial_trunk_build_worker_10: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_trunk_build_worker_11: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_trunk_build_worker_12: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_trunk_build_worker_13: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_trunk_build_worker_14: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_trunk_build_worker_15: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_trunk_build_worker_16: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_trunk_build_worker_17: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_trunk_build_worker_18: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_trunk_build_worker_19: + <<: *pytorch_tutorial_build_worker_defaults + 
pytorch_tutorial_trunk_build_worker_2: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_trunk_build_worker_3: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_trunk_build_worker_4: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_trunk_build_worker_5: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_trunk_build_worker_6: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_trunk_build_worker_7: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_trunk_build_worker_8: + <<: *pytorch_tutorial_build_worker_defaults + pytorch_tutorial_trunk_build_worker_9: + <<: *pytorch_tutorial_build_worker_defaults + + pytorch_tutorial_windows_pr_build_worker_0: + <<: *pytorch_windows_build_worker + pytorch_tutorial_windows_pr_build_worker_1: + <<: *pytorch_windows_build_worker + pytorch_tutorial_windows_pr_build_worker_2: + <<: *pytorch_windows_build_worker + pytorch_tutorial_windows_pr_build_worker_3: + <<: *pytorch_windows_build_worker + pytorch_tutorial_windows_trunk_build_worker_0: + <<: *pytorch_windows_build_worker + pytorch_tutorial_windows_trunk_build_worker_1: + <<: *pytorch_windows_build_worker + pytorch_tutorial_windows_trunk_build_worker_2: + <<: *pytorch_windows_build_worker + pytorch_tutorial_windows_trunk_build_worker_3: + <<: *pytorch_windows_build_worker + +workflows: + build: + when: << pipeline.parameters.this_should_never_be_true >> + jobs: + # Build jobs that only run on PR + - pytorch_tutorial_pr_build_worker_0: + filters: + branches: + ignore: + - master + - main From 4db0c0b195e4bba1f015a8357d2ca5b6ba291eb0 Mon Sep 17 00:00:00 2001 From: Catherine Lee Date: Mon, 5 Jun 2023 10:08:38 -0700 Subject: [PATCH 10/10] update --- .circleci/config.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 87e80e4a87f..70b2c7fd5b0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -306,7 +306,10 @@ jobs: workflows: build: - when: << pipeline.parameters.this_should_never_be_true >> + when: + and: # All must be true to trigger + - equal: [ branch1, << pipeline.git.branch >> ] + - equal: [ branch2, << pipeline.git.branch >> ] jobs: # Build jobs that only run on PR - pytorch_tutorial_pr_build_worker_0:
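
Editor's note on the sharding change in .jenkins/get_files_to_run.py earlier in this series: the GitHub Actions matrix passes a 1-indexed shard number as WORKER_ID, which is why the default changes from 0 to "1" and the shard list is indexed with args.shard_num - 1. The sketch below is illustrative only and is not part of the patches; the calculate_shards body here is a hypothetical round-robin stand-in, not the repository's actual implementation.

#!/usr/bin/env python3
# Minimal sketch of 1-indexed shard selection, assuming a round-robin split.
# NOT the real calculate_shards from .jenkins/get_files_to_run.py.
import os
from typing import List


def calculate_shards(all_files: List[str], num_shards: int = 6) -> List[List[str]]:
    # Hypothetical: deal files out round-robin across num_shards buckets.
    shards: List[List[str]] = [[] for _ in range(num_shards)]
    for i, f in enumerate(sorted(all_files)):
        shards[i % num_shards].append(f)
    return shards


if __name__ == "__main__":
    files = [f"beginner_source/tutorial_{i}.py" for i in range(10)]
    num_shards = int(os.environ.get("NUM_WORKERS", "6"))
    shard_num = int(os.environ.get("WORKER_ID", "1"))  # 1-indexed, matching matrix.shard
    files_to_run = calculate_shards(files, num_shards=num_shards)[shard_num - 1]
    print(files_to_run)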