From 130e00376643e50e308de7cfe72ad273e4024b91 Mon Sep 17 00:00:00 2001 From: Karel Suta Date: Wed, 10 Jul 2024 14:57:11 +0200 Subject: [PATCH 1/2] Use existing CFO image with latest changes instead of building it --- .github/workflows/e2e_tests.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index b83afb4b8..df618a61a 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -24,6 +24,9 @@ concurrency: group: ${{ github.head_ref }}-${{ github.workflow }} cancel-in-progress: true +env: + CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev" + jobs: kubernetes: @@ -92,9 +95,7 @@ jobs: echo Setting up CodeFlare stack make setup-e2e echo Deploying CodeFlare operator - IMG="${REGISTRY_ADDRESS}"/codeflare-operator - make image-push -e IMG="${IMG}" - make deploy -e IMG="${IMG}" -e ENV="e2e" + make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e" kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager cd .. From 265101c63b58182a182aac25f56cf932e563d885 Mon Sep 17 00:00:00 2001 From: Karel Suta Date: Tue, 9 Jul 2024 15:27:43 +0200 Subject: [PATCH 2/2] Add e2e tests using GPU to execute current test scenarios --- .github/workflows/e2e_tests.yaml | 30 +++++-------------- docs/e2e.md | 20 +++++++++++-- pyproject.toml | 3 +- tests/e2e/local_interactive_sdk_kind_test.py | 16 ++++++++-- tests/e2e/mnist.py | 5 +++- .../e2e/mnist_raycluster_sdk_aw_kind_test.py | 29 ++++++++++++------ tests/e2e/mnist_raycluster_sdk_kind_test.py | 22 ++++++++++---- tests/e2e/support.py | 2 +- 8 files changed, 83 insertions(+), 44 deletions(-) diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index df618a61a..d216df9d7 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -30,26 +30,9 @@ env: jobs: kubernetes: - runs-on: ubuntu-20.04 + runs-on: ubuntu-20.04-4core-gpu steps: - - name: Cleanup - run: | - ls -lart - echo "Initial status:" - df -h - echo "Cleaning up resources:" - sudo swapoff -a - sudo rm -f /swapfile - sudo apt clean - sudo rm -rf /usr/share/dotnet - sudo rm -rf /opt/ghc - sudo rm -rf "/usr/local/share/boost" - sudo rm -rf "$AGENT_TOOLSDIRECTORY" - docker rmi $(docker image ls -aq) - echo "Final status:" - df -h - - name: Checkout code uses: actions/checkout@v4 with: @@ -85,9 +68,15 @@ jobs: python-version: '3.9' cache: 'pip' # caching pip dependencies + - name: Setup NVidia GPU environment for KinD + uses: ./common/github-actions/nvidia-gpu-setup + - name: Setup and start KinD cluster uses: ./common/github-actions/kind + - name: Install NVidia GPU operator for KinD + uses: ./common/github-actions/nvidia-gpu-operator + - name: Deploy CodeFlare stack id: deploy run: | @@ -104,9 +93,6 @@ jobs: with: user-name: sdk-user - - name: Add kueue resources - run: kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/v0.6.2/manifests.yaml" - - name: Configure RBAC for sdk user with limited permissions run: | kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses @@ -136,7 +122,7 @@ jobs: pip install poetry poetry install --with test,docs echo "Running e2e tests..." 
- poetry run pytest -v -s ./tests/e2e -m kind > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1 + poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1 env: GRPC_DNS_RESOLVER: "native" diff --git a/docs/e2e.md b/docs/e2e.md index 27324f705..039749d4e 100644 --- a/docs/e2e.md +++ b/docs/e2e.md @@ -5,6 +5,9 @@ ## On KinD clusters Pre-requisite for KinD clusters: please add in your local `/etc/hosts` file `127.0.0.1 kind`. This will map your localhost IP address to the KinD cluster's hostname. This is already performed on [GitHub Actions](https://github.com/project-codeflare/codeflare-common/blob/1edd775e2d4088a5a0bfddafb06ff3a773231c08/github-actions/kind/action.yml#L70-L72) +If the system you run on contains NVidia GPU then you can enable the GPU support in KinD, this will allow you to run also GPU tests. +To enable GPU on KinD follow [these instructions](https://www.substratus.ai/blog/kind-with-gpus). + - Setup Phase: - Pull the [codeflare-operator repo](https://github.com/project-codeflare/codeflare-operator) and run the following make targets: ``` @@ -64,9 +67,13 @@ Pre-requisite for KinD clusters: please add in your local `/etc/hosts` file `127 - Once we have the codeflare-operator, kuberay-operator and kueue running and ready, we can run the e2e test on the codeflare-sdk repository: ``` poetry install --with test,docs - poetry run pytest -v -s ./tests/e2e/mnist_raycluster_sdk_test.py + poetry run pytest -v -s ./tests/e2e/mnist_raycluster_sdk_kind_test.py + ``` + - If the cluster doesn't have NVidia GPU support then we need to disable NVidia GPU tests by providing proper marker: + ``` + poetry install --with test,docs + poetry run pytest -v -s ./tests/e2e/mnist_raycluster_sdk_kind_test.py -m 'kind and not nvidia_gpu' ``` - ## On OpenShift clusters @@ -83,6 +90,10 @@ Pre-requisite for KinD clusters: please add in your local `/etc/hosts` file `127 kubectl apply --server-side -k "github.com/opendatahub-io/kueue/config/rhoai?ref=dev" ``` +If the system you run on contains NVidia GPU then you can enable the GPU support on OpenShift, this will allow you to run also GPU tests. +To enable GPU on OpenShift follow [these instructions](https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/introduction.html). +Currently the SDK doesn't support tolerations, so e2e tests can't be executed on nodes with taint (i.e. GPU taint). 
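+For illustration, a minimal sketch of how the `nvidia_gpu` marker registered in `pyproject.toml` gates a GPU variant of a test (the test names below are only illustrative, not taken from the suite):
+```python
+import pytest
+
+
+@pytest.mark.openshift
+def test_cpu_only():
+    # Selected by: pytest -m 'openshift and not nvidia_gpu'
+    assert True
+
+
+@pytest.mark.openshift
+@pytest.mark.nvidia_gpu
+def test_needs_nvidia_gpu():
+    # Deselected by: pytest -m 'not nvidia_gpu'
+    assert True
+```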
+ - Test Phase: - Once we have the codeflare-operator, kuberay-operator and kueue running and ready, we can run the e2e test on the codeflare-sdk repository: ``` @@ -97,3 +108,8 @@ Pre-requisite for KinD clusters: please add in your local `/etc/hosts` file `127 ``` poetry run pytest -v -s ./tests/e2e -m openshift --timeout=1200 ``` + - If the cluster doesn't have NVidia GPU support or GPU nodes have taint then we need to disable NVidia GPU tests by providing proper marker: + ``` + poetry install --with test,docs + poetry run pytest -v -s ./tests/e2e/mnist_raycluster_sdk_kind_test.py -m 'not nvidia_gpu' + ``` diff --git a/pyproject.toml b/pyproject.toml index 457e6de95..be225e908 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ filterwarnings = [ ] markers = [ "kind", - "openshift" + "openshift", + "nvidia_gpu" ] addopts = "--timeout=900" diff --git a/tests/e2e/local_interactive_sdk_kind_test.py b/tests/e2e/local_interactive_sdk_kind_test.py index 8ca0bdac9..4479b41c0 100644 --- a/tests/e2e/local_interactive_sdk_kind_test.py +++ b/tests/e2e/local_interactive_sdk_kind_test.py @@ -27,7 +27,16 @@ def test_local_interactives(self): create_kueue_resources(self) self.run_local_interactives() - def run_local_interactives(self): + @pytest.mark.nvidia_gpu + def test_local_interactives_nvidia_gpu(self): + self.setup_method() + create_namespace(self) + create_kueue_resources(self) + self.run_local_interactives(number_of_gpus=1) + + def run_local_interactives( + self, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0 + ): ray_image = get_ray_image() cluster_name = "test-ray-cluster-li" @@ -43,6 +52,7 @@ def run_local_interactives(self): worker_cpu_limits=1, worker_memory_requests=1, worker_memory_limits=2, + worker_extended_resource_requests={gpu_resource_name: number_of_gpus}, image=ray_image, write_to_file=True, verify_tls=False, @@ -59,7 +69,7 @@ def run_local_interactives(self): ray.shutdown() ray.init(address=cluster.local_client_url(), logging_level="DEBUG") - @ray.remote + @ray.remote(num_gpus=number_of_gpus / 2) def heavy_calculation_part(num_iterations): result = 0.0 for i in range(num_iterations): @@ -68,7 +78,7 @@ def heavy_calculation_part(num_iterations): result += math.sin(i) * math.cos(j) * math.tan(k) return result - @ray.remote + @ray.remote(num_gpus=number_of_gpus / 2) def heavy_calculation(num_iterations): results = ray.get( [heavy_calculation_part.remote(num_iterations // 30) for _ in range(30)] diff --git a/tests/e2e/mnist.py b/tests/e2e/mnist.py index 2971d9c98..55ed91eaa 100644 --- a/tests/e2e/mnist.py +++ b/tests/e2e/mnist.py @@ -32,6 +32,9 @@ print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR")) print("MASTER_PORT: is ", os.getenv("MASTER_PORT")) +print("ACCELERATOR: is ", os.getenv("ACCELERATOR")) +ACCELERATOR = os.getenv("ACCELERATOR") + class LitMNIST(LightningModule): def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4): @@ -149,7 +152,7 @@ def test_dataloader(self): # Initialize a trainer trainer = Trainer( - accelerator="auto", + accelerator=ACCELERATOR, # devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs max_epochs=3, callbacks=[TQDMProgressBar(refresh_rate=20)], diff --git a/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py b/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py index 2aa5da16d..39bd25fda 100644 --- a/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py +++ b/tests/e2e/mnist_raycluster_sdk_aw_kind_test.py @@ -24,9 +24,18 @@ def test_mnist_ray_cluster_sdk_kind(self): self.setup_method() 
create_namespace(self) create_kueue_resources(self) - self.run_mnist_raycluster_sdk_kind() + self.run_mnist_raycluster_sdk_kind(accelerator="cpu") - def run_mnist_raycluster_sdk_kind(self): + @pytest.mark.nvidia_gpu + def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self): + self.setup_method() + create_namespace(self) + create_kueue_resources(self) + self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1) + + def run_mnist_raycluster_sdk_kind( + self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0 + ): ray_image = get_ray_image() cluster = Cluster( @@ -36,11 +45,11 @@ def run_mnist_raycluster_sdk_kind(self): num_workers=1, head_cpus="500m", head_memory=2, - min_cpus="500m", - max_cpus=1, - min_memory=1, - max_memory=2, - num_gpus=0, + worker_cpu_requests="500m", + worker_cpu_limits=1, + worker_memory_requests=1, + worker_memory_limits=4, + worker_extended_resource_requests={gpu_resource_name: number_of_gpus}, image=ray_image, write_to_file=True, verify_tls=False, @@ -58,11 +67,11 @@ def run_mnist_raycluster_sdk_kind(self): cluster.details() - self.assert_jobsubmit_withoutlogin_kind(cluster) + self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus) # Assertions - def assert_jobsubmit_withoutlogin_kind(self, cluster): + def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus): ray_dashboard = cluster.cluster_dashboard_uri() client = RayJobClient(address=ray_dashboard, verify=False) @@ -71,7 +80,9 @@ def assert_jobsubmit_withoutlogin_kind(self, cluster): runtime_env={ "working_dir": "./tests/e2e/", "pip": "./tests/e2e/mnist_pip_requirements.txt", + "env_vars": {"ACCELERATOR": accelerator}, }, + entrypoint_num_gpus=number_of_gpus, ) print(f"Submitted job with ID: {submission_id}") done = False diff --git a/tests/e2e/mnist_raycluster_sdk_kind_test.py b/tests/e2e/mnist_raycluster_sdk_kind_test.py index af5fcc1f8..356d56f98 100644 --- a/tests/e2e/mnist_raycluster_sdk_kind_test.py +++ b/tests/e2e/mnist_raycluster_sdk_kind_test.py @@ -25,9 +25,18 @@ def test_mnist_ray_cluster_sdk_kind(self): self.setup_method() create_namespace(self) create_kueue_resources(self) - self.run_mnist_raycluster_sdk_kind() + self.run_mnist_raycluster_sdk_kind(accelerator="cpu") - def run_mnist_raycluster_sdk_kind(self): + @pytest.mark.nvidia_gpu + def test_mnist_ray_cluster_sdk_kind_nvidia_gpu(self): + self.setup_method() + create_namespace(self) + create_kueue_resources(self) + self.run_mnist_raycluster_sdk_kind(accelerator="gpu", number_of_gpus=1) + + def run_mnist_raycluster_sdk_kind( + self, accelerator, gpu_resource_name="nvidia.com/gpu", number_of_gpus=0 + ): ray_image = get_ray_image() cluster = Cluster( @@ -40,7 +49,8 @@ def run_mnist_raycluster_sdk_kind(self): worker_cpu_requests="500m", worker_cpu_limits=1, worker_memory_requests=1, - worker_memory_limits=2, + worker_memory_limits=4, + worker_extended_resource_requests={gpu_resource_name: number_of_gpus}, image=ray_image, write_to_file=True, verify_tls=False, @@ -57,11 +67,11 @@ def run_mnist_raycluster_sdk_kind(self): cluster.details() - self.assert_jobsubmit_withoutlogin_kind(cluster) + self.assert_jobsubmit_withoutlogin_kind(cluster, accelerator, number_of_gpus) # Assertions - def assert_jobsubmit_withoutlogin_kind(self, cluster): + def assert_jobsubmit_withoutlogin_kind(self, cluster, accelerator, number_of_gpus): ray_dashboard = cluster.cluster_dashboard_uri() client = RayJobClient(address=ray_dashboard, verify=False) @@ -70,7 +80,9 @@ def 
assert_jobsubmit_withoutlogin_kind(self, cluster): runtime_env={ "working_dir": "./tests/e2e/", "pip": "./tests/e2e/mnist_pip_requirements.txt", + "env_vars": {"ACCELERATOR": accelerator}, }, + entrypoint_num_gpus=number_of_gpus, ) print(f"Submitted job with ID: {submission_id}") done = False diff --git a/tests/e2e/support.py b/tests/e2e/support.py index d8a06bb70..3eb241536 100644 --- a/tests/e2e/support.py +++ b/tests/e2e/support.py @@ -90,7 +90,7 @@ def create_cluster_queue(self, cluster_queue, flavor): "resources": [ {"name": "cpu", "nominalQuota": 9}, {"name": "memory", "nominalQuota": "36Gi"}, - {"name": "nvidia.com/gpu", "nominalQuota": 0}, + {"name": "nvidia.com/gpu", "nominalQuota": 1}, ], } ],
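
Taken together, the test changes above follow a single pattern: request NVIDIA GPUs on the RayCluster workers via extended resources, then pass the GPU count and accelerator type through to the submitted Ray job. Below is a minimal sketch of that pattern, assuming the codeflare-sdk API surface used by these tests; the cluster name, namespace, and the `RayJobClient` import path are assumptions, not values from the patch.

```python
from codeflare_sdk import Cluster, ClusterConfiguration
from codeflare_sdk.job import RayJobClient  # import path assumed; may differ between SDK versions

# Request one NVIDIA GPU per worker as an extended resource; this is the
# request that the Kueue ClusterQueue nominalQuota of 1 must cover.
cluster = Cluster(
    ClusterConfiguration(
        name="gpu-example",        # illustrative name
        namespace="example-ns",    # illustrative namespace
        num_workers=1,
        worker_cpu_requests="500m",
        worker_cpu_limits=1,
        worker_memory_requests=1,
        worker_memory_limits=4,
        worker_extended_resource_requests={"nvidia.com/gpu": 1},
        write_to_file=True,
        verify_tls=False,
    )
)
cluster.up()
cluster.wait_ready()

# Submit the MNIST job against the cluster's dashboard endpoint.
client = RayJobClient(address=cluster.cluster_dashboard_uri(), verify=False)
submission_id = client.submit_job(
    entrypoint="python mnist.py",
    runtime_env={
        "working_dir": "./tests/e2e/",
        "pip": "./tests/e2e/mnist_pip_requirements.txt",
        # mnist.py reads ACCELERATOR to choose the Lightning Trainer accelerator.
        "env_vars": {"ACCELERATOR": "gpu"},
    },
    # Reserve the GPU for the entrypoint so Ray schedules it on the GPU worker.
    entrypoint_num_gpus=1,
)
print(f"Submitted job with ID: {submission_id}")
cluster.down()
```

The design choice the tests rely on is that the accelerator is selected at job-submission time (via the `ACCELERATOR` environment variable and `entrypoint_num_gpus`) rather than baked into `mnist.py`, so the same training script serves both the CPU and the `nvidia_gpu`-marked variants.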