Skip to content

Commit f87ef1d

Browse files
XciD and sayakpaul
authored and committed
[CI] Update runner configuration for setup and nightly tests (#9005)
* [CI] Update runner configuration for setup and nightly tests

  Signed-off-by: Adrien <adrien@huggingface.co>

* fix group

  Signed-off-by: Adrien <adrien@huggingface.co>

* update for t4

  Signed-off-by: Adrien <adrien@huggingface.co>

---------

Signed-off-by: Adrien <adrien@huggingface.co>
1 parent edddf3d commit f87ef1d

10 files changed

+58
-35
lines changed

.github/workflows/build_docker_images.yml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ env:
2020

2121
jobs:
2222
test-build-docker-images:
23-
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
23+
runs-on:
24+
group: aws-general-8-plus
2425
if: github.event_name == 'pull_request'
2526
steps:
2627
- name: Set up Docker Buildx
@@ -50,7 +51,8 @@ jobs:
5051
if: steps.file_changes.outputs.all != ''
5152

5253
build-and-push-docker-images:
53-
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
54+
runs-on:
55+
group: aws-general-8-plus
5456
if: github.event_name != 'pull_request'
5557

5658
permissions:
@@ -98,4 +100,4 @@ jobs:
98100
slack_channel: ${{ env.CI_SLACK_CHANNEL }}
99101
title: "🤗 Results of the ${{ matrix.image-name }} Docker Image build"
100102
status: ${{ job.status }}
101-
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
103+
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

.github/workflows/nightly_tests.yml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ env:
1919
jobs:
2020
setup_torch_cuda_pipeline_matrix:
2121
name: Setup Torch Pipelines CUDA Slow Tests Matrix
22-
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
22+
runs-on:
23+
group: aws-general-8-plus
2324
container:
2425
image: diffusers/diffusers-pytorch-cpu
2526
outputs:
@@ -55,7 +56,8 @@ jobs:
5556
max-parallel: 8
5657
matrix:
5758
module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
58-
runs-on: [single-gpu, nvidia-gpu, t4, ci]
59+
runs-on:
60+
group: aws-g4dn-2xlarge
5961
container:
6062
image: diffusers/diffusers-pytorch-cuda
6163
options: --shm-size "16gb" --ipc host --gpus 0
@@ -105,7 +107,8 @@ jobs:
105107
106108
run_nightly_tests_for_other_torch_modules:
107109
name: Nightly Torch CUDA Tests
108-
runs-on: [single-gpu, nvidia-gpu, t4, ci]
110+
runs-on:
111+
group: aws-g4dn-2xlarge
109112
container:
110113
image: diffusers/diffusers-pytorch-cuda
111114
options: --shm-size "16gb" --ipc host --gpus 0
@@ -234,7 +237,8 @@ jobs:
234237
235238
run_nightly_onnx_tests:
236239
name: Nightly ONNXRuntime CUDA tests on Ubuntu
237-
runs-on: [single-gpu, nvidia-gpu, t4, ci]
240+
runs-on:
241+
group: aws-g4dn-2xlarge
238242
container:
239243
image: diffusers/diffusers-onnxruntime-cuda
240244
options: --gpus 0 --shm-size "16gb" --ipc host

.github/workflows/pr_test_fetcher.yml

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ concurrency:
1515
jobs:
1616
setup_pr_tests:
1717
name: Setup PR Tests
18-
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
18+
runs-on:
19+
group: aws-general-8-plus
1920
container:
2021
image: diffusers/diffusers-pytorch-cpu
2122
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -73,7 +74,8 @@ jobs:
7374
max-parallel: 2
7475
matrix:
7576
modules: ${{ fromJson(needs.setup_pr_tests.outputs.matrix) }}
76-
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
77+
runs-on:
78+
group: aws-general-8-plus
7779
container:
7880
image: diffusers/diffusers-pytorch-cpu
7981
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -123,12 +125,13 @@ jobs:
123125
config:
124126
- name: Hub tests for models, schedulers, and pipelines
125127
framework: hub_tests_pytorch
126-
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
128+
runner: aws-general-8-plus
127129
image: diffusers/diffusers-pytorch-cpu
128130
report: torch_hub
129131

130132
name: ${{ matrix.config.name }}
131-
runs-on: ${{ matrix.config.runner }}
133+
runs-on:
134+
group: ${{ matrix.config.runner }}
132135
container:
133136
image: ${{ matrix.config.image }}
134137
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/

.github/workflows/pr_test_peft_backend.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,8 @@ jobs:
7171

7272
name: LoRA - ${{ matrix.lib-versions }}
7373

74-
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
74+
runs-on:
75+
group: aws-general-8-plus
7576

7677
container:
7778
image: diffusers/diffusers-pytorch-cpu
@@ -128,4 +129,4 @@ jobs:
128129
uses: actions/upload-artifact@v2
129130
with:
130131
name: pr_${{ matrix.config.report }}_test_reports
131-
path: reports
132+
path: reports

.github/workflows/pr_tests.yml

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -77,28 +77,29 @@ jobs:
7777
config:
7878
- name: Fast PyTorch Pipeline CPU tests
7979
framework: pytorch_pipelines
80-
runner: [ self-hosted, intel-cpu, 32-cpu, 256-ram, ci ]
80+
runner: aws-highmemory-32-plus
8181
image: diffusers/diffusers-pytorch-cpu
8282
report: torch_cpu_pipelines
8383
- name: Fast PyTorch Models & Schedulers CPU tests
8484
framework: pytorch_models
85-
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
85+
runner: aws-general-8-plus
8686
image: diffusers/diffusers-pytorch-cpu
8787
report: torch_cpu_models_schedulers
8888
- name: Fast Flax CPU tests
8989
framework: flax
90-
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
90+
runner: aws-general-8-plus
9191
image: diffusers/diffusers-flax-cpu
9292
report: flax_cpu
9393
- name: PyTorch Example CPU tests
9494
framework: pytorch_examples
95-
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
95+
runner: aws-general-8-plus
9696
image: diffusers/diffusers-pytorch-cpu
9797
report: torch_example_cpu
9898

9999
name: ${{ matrix.config.name }}
100100

101-
runs-on: ${{ matrix.config.runner }}
101+
runs-on:
102+
group: ${{ matrix.config.runner }}
102103

103104
container:
104105
image: ${{ matrix.config.image }}
@@ -180,7 +181,8 @@ jobs:
180181
config:
181182
- name: Hub tests for models, schedulers, and pipelines
182183
framework: hub_tests_pytorch
183-
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
184+
runner:
185+
group: aws-general-8-plus
184186
image: diffusers/diffusers-pytorch-cpu
185187
report: torch_hub
186188

.github/workflows/push_tests.yml

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ env:
1919
jobs:
2020
setup_torch_cuda_pipeline_matrix:
2121
name: Setup Torch Pipelines CUDA Slow Tests Matrix
22-
runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
22+
runs-on:
23+
group: aws-general-8-plus
2324
container:
2425
image: diffusers/diffusers-pytorch-cpu
2526
outputs:
@@ -57,7 +58,8 @@ jobs:
5758
max-parallel: 8
5859
matrix:
5960
module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
60-
runs-on: [single-gpu, nvidia-gpu, t4, ci]
61+
runs-on:
62+
group: aws-g4dn-2xlarge
6163
container:
6264
image: diffusers/diffusers-pytorch-cuda
6365
options: --shm-size "16gb" --ipc host --gpus 0
@@ -101,7 +103,8 @@ jobs:
101103

102104
torch_cuda_tests:
103105
name: Torch CUDA Tests
104-
runs-on: [single-gpu, nvidia-gpu, t4, ci]
106+
runs-on:
107+
group: aws-g4dn-2xlarge
105108
container:
106109
image: diffusers/diffusers-pytorch-cuda
107110
options: --shm-size "16gb" --ipc host --gpus 0
@@ -201,7 +204,8 @@ jobs:
201204

202205
onnx_cuda_tests:
203206
name: ONNX CUDA Tests
204-
runs-on: [single-gpu, nvidia-gpu, t4, ci]
207+
runs-on:
208+
group: aws-g4dn-2xlarge
205209
container:
206210
image: diffusers/diffusers-onnxruntime-cuda
207211
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0
@@ -249,7 +253,8 @@ jobs:
249253
run_torch_compile_tests:
250254
name: PyTorch Compile CUDA tests
251255

252-
runs-on: [single-gpu, nvidia-gpu, t4, ci]
256+
runs-on:
257+
group: aws-g4dn-2xlarge
253258

254259
container:
255260
image: diffusers/diffusers-pytorch-compile-cuda
@@ -291,7 +296,8 @@ jobs:
291296
run_xformers_tests:
292297
name: PyTorch xformers CUDA tests
293298

294-
runs-on: [single-gpu, nvidia-gpu, t4, ci]
299+
runs-on:
300+
group: aws-g4dn-2xlarge
295301

296302
container:
297303
image: diffusers/diffusers-pytorch-xformers-cuda
@@ -332,7 +338,8 @@ jobs:
332338
run_examples_tests:
333339
name: Examples PyTorch CUDA tests on Ubuntu
334340

335-
runs-on: [single-gpu, nvidia-gpu, t4, ci]
341+
runs-on:
342+
group: aws-g4dn-2xlarge
336343

337344
container:
338345
image: diffusers/diffusers-pytorch-cuda

.github/workflows/push_tests_fast.yml

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,28 +29,29 @@ jobs:
2929
config:
3030
- name: Fast PyTorch CPU tests on Ubuntu
3131
framework: pytorch
32-
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
32+
runner: aws-general-8-plus
3333
image: diffusers/diffusers-pytorch-cpu
3434
report: torch_cpu
3535
- name: Fast Flax CPU tests on Ubuntu
3636
framework: flax
37-
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
37+
runner: aws-general-8-plus
3838
image: diffusers/diffusers-flax-cpu
3939
report: flax_cpu
4040
- name: Fast ONNXRuntime CPU tests on Ubuntu
4141
framework: onnxruntime
42-
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
42+
runner: aws-general-8-plus
4343
image: diffusers/diffusers-onnxruntime-cpu
4444
report: onnx_cpu
4545
- name: PyTorch Example CPU tests on Ubuntu
4646
framework: pytorch_examples
47-
runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
47+
runner: aws-general-8-plus
4848
image: diffusers/diffusers-pytorch-cpu
4949
report: torch_example_cpu
5050

5151
name: ${{ matrix.config.name }}
5252

53-
runs-on: ${{ matrix.config.runner }}
53+
runs-on:
54+
group: ${{ matrix.config.runner }}
5455

5556
container:
5657
image: ${{ matrix.config.image }}

.github/workflows/run_tests_from_a_pr.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ env:
2626
jobs:
2727
run_tests:
2828
name: "Run a test on our runner from a PR"
29-
runs-on: [single-gpu, nvidia-gpu, t4, ci]
29+
runs-on:
30+
group: aws-g4dn-2xlarge
3031
container:
3132
image: ${{ github.event.inputs.docker_image }}
3233
options: --gpus 0 --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -70,4 +71,4 @@ jobs:
7071
env:
7172
PY_TEST: ${{ github.event.inputs.test }}
7273
run: |
73-
pytest "$PY_TEST"
74+
pytest "$PY_TEST"

.github/workflows/ssh-pr-runner.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ env:
1919
jobs:
2020
ssh_runner:
2121
name: "SSH"
22-
runs-on: [self-hosted, intel-cpu, 32-cpu, 256-ram, ci]
22+
runs-on:
23+
group: aws-highmemory-32-plus
2324
container:
2425
image: ${{ github.event.inputs.docker_image }}
2526
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --privileged

.github/workflows/ssh-runner.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ env:
2222
jobs:
2323
ssh_runner:
2424
name: "SSH"
25-
runs-on: [single-gpu, nvidia-gpu, "${{ github.event.inputs.runner_type }}", ci]
25+
runs-on:
26+
group: "${{ github.event.inputs.runner_type }}"
2627
container:
2728
image: ${{ github.event.inputs.docker_image }}
2829
options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 --privileged

0 commit comments

Comments (0)