diff --git a/.circleci/config.yml b/.circleci/config.yml index b7084096c4b..70b2c7fd5b0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -306,6 +306,10 @@ jobs: workflows: build: + when: + and: # All must be true to trigger + - equal: [ branch1, << pipeline.git.branch >> ] + - equal: [ branch2, << pipeline.git.branch >> ] jobs: # Build jobs that only run on PR - pytorch_tutorial_pr_build_worker_0: @@ -314,365 +318,3 @@ workflows: ignore: - master - main - - pytorch_tutorial_pr_build_worker_1: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_2: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_3: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_4: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_5: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_6: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_7: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_8: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_9: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_10: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_11: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_12: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_13: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_14: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_15: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_16: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_17: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_18: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_worker_19: - filters: - branches: - ignore: - - master - - main - - pytorch_tutorial_pr_build_manager: - filters: - branches: - ignore: - - master - - main - requires: - - pytorch_tutorial_pr_build_worker_0 - - pytorch_tutorial_pr_build_worker_1 - - pytorch_tutorial_pr_build_worker_2 - - pytorch_tutorial_pr_build_worker_3 - - pytorch_tutorial_pr_build_worker_4 - - pytorch_tutorial_pr_build_worker_5 - - pytorch_tutorial_pr_build_worker_6 - - pytorch_tutorial_pr_build_worker_7 - - pytorch_tutorial_pr_build_worker_8 - - pytorch_tutorial_pr_build_worker_9 - - pytorch_tutorial_pr_build_worker_10 - - pytorch_tutorial_pr_build_worker_11 - - pytorch_tutorial_pr_build_worker_12 - - pytorch_tutorial_pr_build_worker_13 - - pytorch_tutorial_pr_build_worker_14 - - pytorch_tutorial_pr_build_worker_15 - - pytorch_tutorial_pr_build_worker_16 - - pytorch_tutorial_pr_build_worker_17 - - pytorch_tutorial_pr_build_worker_18 - - pytorch_tutorial_pr_build_worker_19 - # Build jobs that only run on trunk - - pytorch_tutorial_trunk_build_worker_0: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_1: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_2: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_3: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_4: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_5: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_6: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_7: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_8: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_9: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_10: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_11: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_12: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_13: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_14: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_15: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_16: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_17: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_18: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_worker_19: - context: org-member - filters: - branches: - only: - - master - - main - - pytorch_tutorial_trunk_build_manager: - context: org-member - filters: - branches: - only: - - master - - main - requires: - - pytorch_tutorial_trunk_build_worker_0 - - pytorch_tutorial_trunk_build_worker_1 - - pytorch_tutorial_trunk_build_worker_2 - - pytorch_tutorial_trunk_build_worker_3 - - pytorch_tutorial_trunk_build_worker_4 - - pytorch_tutorial_trunk_build_worker_5 - - pytorch_tutorial_trunk_build_worker_6 - - pytorch_tutorial_trunk_build_worker_7 - - pytorch_tutorial_trunk_build_worker_8 - - pytorch_tutorial_trunk_build_worker_9 - - pytorch_tutorial_trunk_build_worker_10 - - pytorch_tutorial_trunk_build_worker_11 - - pytorch_tutorial_trunk_build_worker_12 - - pytorch_tutorial_trunk_build_worker_13 - - pytorch_tutorial_trunk_build_worker_14 - - pytorch_tutorial_trunk_build_worker_15 - - pytorch_tutorial_trunk_build_worker_16 - - pytorch_tutorial_trunk_build_worker_17 - - pytorch_tutorial_trunk_build_worker_18 - - pytorch_tutorial_trunk_build_worker_19 -# - pytorch_tutorial_windows_pr_build_worker_0: -# filters: -# branches: -# ignore: -# - master -# - main -# - pytorch_tutorial_windows_pr_build_worker_1: -# filters: -# branches: -# ignore: -# - master -# - main -# - pytorch_tutorial_windows_pr_build_worker_2: -# filters: -# branches: -# ignore: -# - master -# - main -# - pytorch_tutorial_windows_pr_build_worker_3: -# filters: -# branches: -# ignore: -# - master -# - main -# - pytorch_tutorial_windows_trunk_build_worker_0: -# context: org-member -# filters: -# branches: -# only: -# - master -# - main -# - pytorch_tutorial_windows_trunk_build_worker_1: -# context: org-member -# filters: -# branches: -# only: -# - master -# - main -# - pytorch_tutorial_windows_trunk_build_worker_2: -# context: org-member -# filters: -# branches: -# only: -# - master -# - main -# - pytorch_tutorial_windows_trunk_build_worker_3: -# context: org-member -# filters: -# branches: -# only: -# - master -# - main diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in deleted file mode 100644 index 0694d221aad..00000000000 --- a/.circleci/config.yml.in +++ /dev/null @@ -1,213 +0,0 @@ -# run python regenerate.py to generate config.yml from config.yml.in - -version: 2.1 - -executors: - windows-with-nvidia-gpu: - machine: - resource_class: windows.gpu.nvidia.medium - image: windows-server-2019-nvidia:stable - shell: bash.exe - -install_official_git_client: &install_official_git_client - name: Install Official Git Client - no_output_timeout: "1h" - command: | - set -e - sudo apt-get -qq update - sudo apt-get -qq install openssh-client git - -# This system setup script is meant to run before the CI-related scripts, e.g., -# installing Git client, checking out code, setting up CI env, and -# building/testing. -setup_linux_system_environment: &setup_linux_system_environment - name: Set Up System Environment - no_output_timeout: "1h" - command: | - set -ex - - # Stop background apt updates. Hypothetically, the kill should not - # be necessary, because stop is supposed to send a kill signal to - # the process, but we've added it for good luck. Also - # hypothetically, it's supposed to be unnecessary to wait for - # the process to block. We also have that line for good luck. - # If you like, try deleting them and seeing if it works. - sudo systemctl stop apt-daily.service || true - sudo systemctl kill --kill-who=all apt-daily.service || true - - sudo systemctl stop unattended-upgrades.service || true - sudo systemctl kill --kill-who=all unattended-upgrades.service || true - - # wait until `apt-get update` has been killed - while systemctl is-active --quiet apt-daily.service - do - sleep 1; - done - while systemctl is-active --quiet unattended-upgrades.service - do - sleep 1; - done - - # See if we actually were successful - systemctl list-units --all | cat - - sudo apt-get purge -y unattended-upgrades - - cat /etc/apt/sources.list - - ps auxfww | grep [a]pt - ps auxfww | grep dpkg - -pytorch_tutorial_build_defaults: &pytorch_tutorial_build_defaults - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - steps: - - checkout - - run: - <<: *setup_linux_system_environment - - run: - name: Set Up CI Environment - no_output_timeout: "1h" - command: | - set -e - - sudo apt-get -y update - sudo apt-get -y install expect-dev moreutils - - sudo pip3 -q install awscli==1.16.35 - - if [ -n "${CUDA_VERSION}" ]; then - nvidia-smi - fi - - # This IAM user only allows read-write access to ECR - export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_ONLY} - export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_ONLY} - eval $(aws ecr get-login --region us-east-1 --no-include-email) - - run: - name: Build - no_output_timeout: "20h" - command: | - set -e - - # for some reason, pip installs it in a different place than what is looked at in the py file - sudo pip3 install requests --target=/opt/circleci/.pyenv/versions/3.9.4/lib/python3.9/site-packages - export pyTorchDockerImageTag=$(python3 .jenkins/get_docker_tag.py) - echo "PyTorchDockerImageTag: "${pyTorchDockerImageTag} - - cat >/home/circleci/project/ci_build_script.sh \</dev/null - if [ -n "${CUDA_VERSION}" ]; then - export id=$(docker run --gpus all -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) - else - export id=$(docker run -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}) - fi - - echo "declare -x JOB_BASE_NAME=${CIRCLE_JOB}" > /home/circleci/project/env - echo "declare -x COMMIT_ID=${CIRCLE_SHA1}" >> /home/circleci/project/env - echo "declare -x COMMIT_SOURCE=${CIRCLE_BRANCH}" >> /home/circleci/project/env - # DANGER! DO NOT REMOVE THE `set +x` SETTING HERE! - set +x - if [[ "$CIRCLE_BRANCH" == master || "$CIRCLE_BRANCH" == main ]]; then - if [ -z "${CIRCLECI_AWS_ACCESS_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" ]; then exit 1; fi - if [ -z "${CIRCLECI_AWS_SECRET_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" ]; then exit 1; fi - if [ -z "${GITHUB_PYTORCHBOT_USERNAME}" ]; then exit 1; fi - if [ -z "${GITHUB_PYTORCHBOT_TOKEN}" ]; then exit 1; fi - - echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" >> /home/circleci/project/env - echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PYTORCH_TUTORIAL_BUILD_MASTER_S3_BUCKET}" >> /home/circleci/project/env - echo "declare -x GITHUB_PYTORCHBOT_USERNAME=${GITHUB_PYTORCHBOT_USERNAME}" >> /home/circleci/project/env - echo "declare -x GITHUB_PYTORCHBOT_TOKEN=${GITHUB_PYTORCHBOT_TOKEN}" >> /home/circleci/project/env - else - echo "declare -x AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_PYTORCH_TUTORIAL_BUILD_PR_S3_BUCKET}" >> /home/circleci/project/env - echo "declare -x AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_PYTORCH_TUTORIAL_BUILD_PR_S3_BUCKET}" >> /home/circleci/project/env - fi - set -x - - echo 'rm /opt/cache/bin/*' | docker exec -u root -i "$id" bash - docker cp /home/circleci/project/. "$id:/var/lib/jenkins/workspace" - - export COMMAND='((echo "source ./workspace/env" && echo "sudo chown -R jenkins workspace && cd workspace && ./ci_build_script.sh") | docker exec -u jenkins -i "$id" bash) 2>&1' - echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts - # Copy docs with plot to a docs dir - if docker exec -it "$id" sh -c "test -d ./workspace/docs_with_plot/docs/"; then - mkdir /home/circleci/project/docs - docker cp "$id:/var/lib/jenkins/workspace/docs_with_plot/docs/." /home/circleci/project/docs - echo "Directory copied successfully" - else - echo "No docs_with_plot directory. Skipping..." - fi - - - store_artifacts: - path: ./docs - destination: tutorials - -pytorch_tutorial_build_worker_defaults: &pytorch_tutorial_build_worker_defaults - environment: - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9" - CUDA_VERSION: "9" - resource_class: gpu.nvidia.small - <<: *pytorch_tutorial_build_defaults - -pytorch_tutorial_build_manager_defaults: &pytorch_tutorial_build_manager_defaults - environment: - DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9" - resource_class: medium - - - <<: *pytorch_tutorial_build_defaults -{% raw %} -pytorch_windows_build_worker: &pytorch_windows_build_worker - executor: windows-with-nvidia-gpu - steps: - - checkout - - run: - name: Install Cuda - no_output_timeout: 30m - command: | - .circleci/scripts/windows_cuda_install.sh - - run: - name: Generate cache key - # This will refresh cache on Sundays, build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - keys: - - data-{{ checksum "Makefile" }}-{{ checksum ".circleci-weekly" }} - - run: - name: test - no_output_timeout: "1h" - command: | - .circleci/scripts/build_for_windows.sh - - save_cache: - key: data-{{ checksum "Makefile" }}-{{ checksum ".circleci-weekly" }} - paths: - - advanced_source/data - - beginner_source/data - - intermediate_source/data - - prototype_source/data -{% endraw %} -jobs: - {{ jobs("pr") }} - - {{ jobs("trunk") }} - - {{ windows_jobs() }} - -workflows: - build: - jobs: - # Build jobs that only run on PR - {{ workflows_jobs("pr") }} - # Build jobs that only run on trunk - {{ workflows_jobs("trunk") }} -# {{ windows_workflows_jobs() }} diff --git a/.circleci/regenerate.py b/.circleci/regenerate.py deleted file mode 100644 index f47ee1dfa6f..00000000000 --- a/.circleci/regenerate.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python3 - -# regenrates config.yml based on config.yml.in - -from copy import deepcopy -import os.path - -import jinja2 -import yaml -from jinja2 import select_autoescape - -WORKFLOWS_JOBS_PR = {"filters": {"branches": {"ignore": ["master", "main"]}}} - -WORKFLOWS_JOBS_TRUNK = { - "context": "org-member", - "filters": {"branches": {"only": ["master", "main"]}}, -} - - -def indent(indentation, data_list): - return ("\n" + " " * indentation).join( - yaml.dump(data_list, default_flow_style=False).splitlines() - ) - - -def jobs(pr_or_trunk, num_workers=20, indentation=2): - jobs = {} - - # all tutorials that need gpu.nvidia.small.multi machines will be routed by - # get_files_to_run.py to 0th worker, similarly for gpu.nvidia.large and the - # 1st worker - needs_gpu_nvidia_small_multi = [0] - needs_gpu_nvidia_large = [1] - jobs[f"pytorch_tutorial_{pr_or_trunk}_build_manager"] = { - "<<": "*pytorch_tutorial_build_manager_defaults" - } - for i in range(num_workers): - job_info = {"<<": "*pytorch_tutorial_build_worker_defaults"} - if i in needs_gpu_nvidia_small_multi: - job_info["resource_class"] = "gpu.nvidia.small.multi" - if i in needs_gpu_nvidia_large: - job_info["resource_class"] = "gpu.nvidia.large" - jobs[f"pytorch_tutorial_{pr_or_trunk}_build_worker_{i}"] = job_info - - return indent(indentation, jobs).replace("'", "") - - -def workflows_jobs(pr_or_trunk, indentation=6, num_workers=20): - jobs = [] - job_info = deepcopy( - WORKFLOWS_JOBS_PR if pr_or_trunk == "pr" else WORKFLOWS_JOBS_TRUNK - ) - - for i in range(num_workers): - jobs.append( - {f"pytorch_tutorial_{pr_or_trunk}_build_worker_{i}": deepcopy(job_info)} - ) - - job_info["requires"] = [ - f"pytorch_tutorial_{pr_or_trunk}_build_worker_{i}" for i in range(num_workers) - ] - jobs.append({f"pytorch_tutorial_{pr_or_trunk}_build_manager": deepcopy(job_info)}) - return indent(indentation, jobs) - - -def windows_jobs(indentation=2, num_workers=4): - jobs = {} - for i in range(num_workers): - jobs[f"pytorch_tutorial_windows_pr_build_worker_{i}"] = { - "<<": "*pytorch_windows_build_worker" - } - jobs[f"pytorch_tutorial_windows_trunk_build_worker_{i}"] = { - "<<": "*pytorch_windows_build_worker" - } - return indent(indentation, jobs).replace("'", "") - - -def windows_workflows_jobs(indentation=6, num_workers=4): - jobs = [] - job_info = WORKFLOWS_JOBS_PR - for i in range(num_workers): - jobs.append( - {f"pytorch_tutorial_windows_pr_build_worker_{i}": deepcopy(job_info)} - ) - - job_info = WORKFLOWS_JOBS_TRUNK - for i in range(num_workers): - jobs.append( - {f"pytorch_tutorial_windows_trunk_build_worker_{i}": deepcopy(job_info)} - ) - - return ("\n#").join(indent(indentation, jobs).splitlines()) - - -if __name__ == "__main__": - - directory = os.path.dirname(__file__) - env = jinja2.Environment( - loader=jinja2.FileSystemLoader(directory), - lstrip_blocks=True, - autoescape=select_autoescape(enabled_extensions=("html", "xml")), - keep_trailing_newline=True, - ) - with open(os.path.join(directory, "config.yml"), "w") as f: - f.write( - env.get_template("config.yml.in").render( - jobs=jobs, - workflows_jobs=workflows_jobs, - windows_jobs=windows_jobs, - windows_workflows_jobs=windows_workflows_jobs, - ) - ) diff --git a/.github/workflows/build-tutorials.yml b/.github/workflows/build-tutorials.yml new file mode 100644 index 00000000000..222de26b9bb --- /dev/null +++ b/.github/workflows/build-tutorials.yml @@ -0,0 +1,182 @@ +name: Build tutorials + +on: + pull_request: + push: + branches: + - main + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + worker: + name: pytorch_tutorial_build_worker + strategy: + matrix: + include: + - { shard: 1, num_shards: 6, runner: "linux.16xlarge.nvidia.gpu" } + - { shard: 2, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" } + - { shard: 3, num_shards: 6, runner: "linux.4xlarge.nvidia.gpu" } + - { shard: 4, num_shards: 6, runner: "linux.4xlarge.nvidia.gpu" } + - { shard: 5, num_shards: 6, runner: "linux.4xlarge.nvidia.gpu" } + - { shard: 6, num_shards: 6, runner: "linux.4xlarge.nvidia.gpu" } + fail-fast: false + runs-on: ${{ matrix.runner }} + env: + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9" + CUDA_VERSION: "9" + steps: + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@main + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + instructions: | + All testing is done inside the container, to start an interactive session run: + docker exec -it $(docker container ps --format '{{.ID}}') bash + + - name: Checkout Tutorials + uses: actions/checkout@v3 + + - name: Setup Linux + uses: pytorch/pytorch/.github/actions/setup-linux@main + + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + uses: pytorch/test-infra/.github/actions/setup-nvidia@main + + - name: Calculate docker image + shell: bash + id: docker-image + run: | + set -ex + + # for some reason, pip installs it in a different place than what is looked at in the py file + pip3 install requests==2.26 + pyTorchDockerImageTag=$(python3 .jenkins/get_docker_tag.py) + + echo "docker-image=${DOCKER_IMAGE}:${pyTorchDockerImageTag}" >> "${GITHUB_OUTPUT}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ steps.docker-image.outputs.docker-image }} + + - name: Build + shell: bash + env: + DOCKER_IMAGE: ${{ steps.docker-image.outputs.docker-image }} + NUM_WORKERS: ${{ matrix.num_shards }} + WORKER_ID: ${{ matrix.shard }} + COMMIT_ID: ${{ github.sha }} + JOB_TYPE: worker + COMMIT_SOURCE: ${{ github.ref }} + run: | + set -ex + + chmod +x ".jenkins/build.sh" + + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e WORKER_ID \ + -e NUM_WORKERS \ + -e COMMIT_ID \ + -e JOB_TYPE \ + -e COMMIT_SOURCE \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --tty \ + --detach \ + --user jenkins \ + --name="${container_name}" \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + + echo "rm /opt/cache/bin/*" | docker exec -u root -i "${container_name}" bash + + docker exec -t "${container_name}" sh -c ".jenkins/build.sh" + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() + + manager: + name: pytorch_tutorial_build_manager + needs: worker + runs-on: [self-hosted, linux.2xlarge] + env: + DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda12.1-cudnn8-py3-gcc9" + CUDA_VERSION: "9" + steps: + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@main + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + instructions: | + All testing is done inside the container, to start an interactive session run: + docker exec -it $(docker container ps --format '{{.ID}}') bash + + - name: Checkout Tutorials + uses: actions/checkout@v3 + + - name: Setup Linux + uses: pytorch/pytorch/.github/actions/setup-linux@main + + - name: Calculate docker image + shell: bash + id: docker-image + run: | + set -ex + + # for some reason, pip installs it in a different place than what is looked at in the py file + pip3 install requests==2.26 + pyTorchDockerImageTag=$(python3 .jenkins/get_docker_tag.py) + + echo "docker-image=${DOCKER_IMAGE}:${pyTorchDockerImageTag}" >> "${GITHUB_OUTPUT}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ steps.docker-image.outputs.docker-image }} + + - name: Build + shell: bash + env: + DOCKER_IMAGE: ${{ steps.docker-image.outputs.docker-image }} + NUM_WORKERS: 6 + WORKER_ID: ${{ matrix.shard }} + COMMIT_ID: ${{ github.sha }} + JOB_TYPE: manager + COMMIT_SOURCE: ${{ github.ref }} + GITHUB_PYTORCHBOT_TOKEN: ${{ secrets.PYTORCHBOT_TOKEN }} + run: | + set -ex + + chmod +x ".jenkins/build.sh" + + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e WORKER_ID \ + -e NUM_WORKERS \ + -e COMMIT_ID \ + -e JOB_TYPE \ + -e COMMIT_SOURCE \ + -e GITHUB_PYTORCHBOT_TOKEN \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --tty \ + --detach \ + --user jenkins \ + --name="${container_name}" \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + + echo "rm /opt/cache/bin/*" | docker exec -u root -i "${container_name}" bash + + docker exec -t "${container_name}" sh -c ".jenkins/build.sh" + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() diff --git a/.jenkins/build.sh b/.jenkins/build.sh index d09b0a8782a..f13966ff84b 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -1,10 +1,8 @@ +#!/bin/bash + set -ex -if [[ "$COMMIT_SOURCE" == master || "$COMMIT_SOURCE" == main ]]; then - export BUCKET_NAME=pytorch-tutorial-build-master -else - export BUCKET_NAME=pytorch-tutorial-build-pull-request -fi +export BUCKET_NAME=pytorch-tutorial-build-pull-request # set locale for click dependency in spacy export LC_ALL=C.UTF-8 @@ -25,7 +23,7 @@ pip install -r $DIR/../requirements.txt # Nightly - pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html # RC Link # pip uninstall -y torch torchvision torchaudio torchtext -# pip install --pre --upgrade -f https://download.pytorch.org/whl/test/cu102/torch_test.html torch torchvision torchaudio torchtext +# pip install --pre --upgrade -f https://download.pytorch.org/whl/test/cu102/torch_test.html torch torchvision torchaudio torchtext # pip uninstall -y torch torchvision torchaudio torchtext # pip install --pre --upgrade -f https://download.pytorch.org/whl/test/cu116/torch_test.html torch torchdata torchvision torchaudio torchtext @@ -37,8 +35,7 @@ awsv2 -i awsv2 configure set default.s3.multipart_threshold 5120MB # Decide whether to parallelize tutorial builds, based on $JOB_BASE_NAME -export NUM_WORKERS=20 -if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then +if [[ "${JOB_TYPE}" == "worker" ]]; then # Step 1: Remove runnable code from tutorials that are not supposed to be run python $DIR/remove_runnable_code.py beginner_source/aws_distributed_training_tutorial.py beginner_source/aws_distributed_training_tutorial.py || true # python $DIR/remove_runnable_code.py advanced_source/ddp_pipeline_tutorial.py advanced_source/ddp_pipeline_tutorial.py || true @@ -47,7 +44,7 @@ if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then # python $DIR/remove_runnable_code.py intermediate_source/spatial_transformer_tutorial.py intermediate_source/spatial_transformer_tutorial.py || true # Temp remove for 1.10 release. # python $DIR/remove_runnable_code.py advanced_source/neural_style_tutorial.py advanced_source/neural_style_tutorial.py || true - + # TODO: Fix bugs in these tutorials to make them runnable again # python $DIR/remove_runnable_code.py beginner_source/audio_classifier_tutorial.py beginner_source/audio_classifier_tutorial.py || true @@ -56,7 +53,6 @@ if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then # Step 2: Keep certain tutorials based on file count, and remove runnable code in all other tutorials # IMPORTANT NOTE: We assume that each tutorial has a UNIQUE filename. - export WORKER_ID=$(echo "${JOB_BASE_NAME}" | tr -dc '0-9') FILES_TO_RUN=$(python .jenkins/get_files_to_run.py) echo "FILES_TO_RUN: " ${FILES_TO_RUN} @@ -116,26 +112,18 @@ if [[ "${JOB_BASE_NAME}" == *worker_* ]]; then # Step 6: Copy generated files to S3, tag with commit ID 7z a worker_${WORKER_ID}.7z docs - awsv2 s3 cp worker_${WORKER_ID}.7z s3://${BUCKET_NAME}/${COMMIT_ID}/worker_${WORKER_ID}.7z --acl public-read -elif [[ "${JOB_BASE_NAME}" == *manager ]]; then + awsv2 s3 cp worker_${WORKER_ID}.7z s3://${BUCKET_NAME}/${COMMIT_ID}/worker_${WORKER_ID}.7z +elif [[ "${JOB_TYPE}" == "manager" ]]; then # Step 1: Generate no-plot HTML pages for all tutorials make html-noplot cp -r _build/html docs # Step 2: Wait for all workers to finish - set +e - for ((worker_id=0;worker_id List[str]: sources = [x.relative_to(REPO_BASE_DIR) for x in REPO_BASE_DIR.glob("*_source/**/*.py") if 'data' not in x.parts] - return [str(x) for x in sources] + return sorted([str(x) for x in sources]) def read_metadata() -> Dict[str, Any]: @@ -87,8 +87,8 @@ def parse_args() -> Any: from argparse import ArgumentParser parser = ArgumentParser("Select files to run") parser.add_argument("--dry-run", action="store_true") - parser.add_argument("--num-shards", type=int, default=int(os.environ.get("NUM_WORKERS", 20))) - parser.add_argument("--shard-num", type=int, default=int(os.environ.get("WORKER_ID", 0))) + parser.add_argument("--num-shards", type=int, default=int(os.environ.get("NUM_WORKERS", "20"))) + parser.add_argument("--shard-num", type=int, default=int(os.environ.get("WORKER_ID", "1"))) return parser.parse_args() @@ -96,7 +96,7 @@ def main() -> None: args = parse_args() all_files = get_all_files() - files_to_run = calculate_shards(all_files, num_shards=args.num_shards)[args.shard_num] + files_to_run = calculate_shards(all_files, num_shards=args.num_shards)[args.shard_num - 1] if not args.dry_run: remove_other_files(all_files, compute_files_to_keep(files_to_run)) stripped_file_names = [Path(x).stem for x in files_to_run]