Skip to content

Adding GPUs to Kind cluster #494

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jul 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions hack/run-e2e-kind.sh
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,63 @@ function setup-mcad-env {
do
echo -n "." && sleep 1;
done
}

function extend-resources {
  # Patch nodes to provide GPUs resources without physical GPUs.
  # This is intended to allow testing of GPU specific features such as histograms.
  #
  # Steps: start 'kubectl proxy' in the background, wait until it answers,
  # PATCH the status of every node to add the extended resource, stop the
  # proxy, then run the kuttl suite that confirms the GPUs were added.
  #
  # Globals:   ROOT_DIR (read) - prefix of the kuttl test suite config path.
  # Arguments: none
  # Outputs:   progress messages on stdout.
  # Exits:     terminates the script with status 1 when the proxy never
  #            answers, the nodes cannot be listed, a node patch fails, or the
  #            kuttl suite fails. The background proxy is always stopped first.

  # Variables
  # "~1" is the JSON-Pointer escape for "/" (i.e. the resource nvidia.com/gpu).
  local resource_name="nvidia.com~1gpu"
  local resource_count="8"
  local proxy_url="http://127.0.0.1:8001"
  local max_attempts=30
  local proxy_pid attempt node_name node_names kuttl_test
  local failed=0

  # Start communication with cluster
  kubectl proxy > /dev/null 2>&1 &
  proxy_pid=$!

  echo "Starting background proxy connection (pid=${proxy_pid})..."

  # The proxy needs a moment before it accepts connections, so poll it
  # instead of probing exactly once right after the fork.
  attempt=1
  until curl --silent --output /dev/null "${proxy_url}"; do
    if (( attempt >= max_attempts )); then
      failed=1
      break
    fi
    attempt=$(( attempt + 1 ))
    sleep 1
  done

  if (( failed )); then
    echo "Calling 'kubectl proxy' did not create a successful connection to the kubelet needed to patch the nodes. Exiting."
  else
    echo "Connected to the kubelet for patching the nodes"

    # Patch nodes
    if node_names=$(kubectl get nodes --no-headers -o custom-columns=":metadata.name"); then
      while read -r node_name; do
        [[ -n "${node_name}" ]] || continue
        echo "- Patching node (add): ${node_name}"

        # --fail turns an HTTP error answer into a non-zero exit status, so
        # both transport errors and rejected patches are detected here.
        if ! curl --silent --show-error --fail --output /dev/null \
          --header "Content-Type: application/json-patch+json" \
          --request PATCH \
          --data "[{\"op\": \"add\", \"path\": \"/status/capacity/${resource_name}\", \"value\": \"${resource_count}\"}]" \
          "${proxy_url}/api/v1/nodes/${node_name}/status"; then
          echo "Failed to patch node '${node_name}' with GPU resources"
          failed=1
          break
        fi

        echo
      done <<< "${node_names}"
    else
      echo "Failed to list the cluster nodes"
      failed=1
    fi
  fi

  # Stop communication with cluster (on success and on failure alike, so the
  # proxy is never left running). Plain TERM is enough; reap it afterwards.
  echo "Killing proxy (pid=${proxy_pid})..."
  kill "${proxy_pid}" 2> /dev/null || true
  wait "${proxy_pid}" 2> /dev/null || true

  if (( failed )); then
    exit 1
  fi

  # Run kuttl tests to confirm GPUs were added correctly
  kuttl_test="${ROOT_DIR}/test/kuttl-test-extended-resources.yaml"
  echo "kubectl kuttl test --config ${kuttl_test}"
  if ! kubectl kuttl test --config "${kuttl_test}"; then
    echo "kuttl e2e test '${kuttl_test}' failure, exiting."
    exit 1
  fi
}

function kuttl-tests {
Expand Down Expand Up @@ -402,6 +458,7 @@ trap cleanup EXIT
update_test_host
check-prerequisites
kind-up-cluster
# Patch the nodes with GPU resources and run the extended-resources kuttl
# suite; this happens before setup-mcad-env is called.
extend-resources
setup-mcad-env
# MCAD with quotamanagement options is started by kuttl-tests
kuttl-tests
Expand Down
10 changes: 10 additions & 0 deletions test/e2e-kuttl-extended-resources/steps/00-assert.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
# Verify that GPUs are a resource for the node
# The expected count matches the value ("8") that extend-resources in
# hack/run-e2e-kind.sh patches into the node's status capacity.
# NOTE(review): the patch writes status.capacity while this asserts
# status.allocatable - presumably the kubelet propagates it; confirm.
apiVersion: v1
kind: Node
metadata:
  name: test-worker
status:
  allocatable:
    nvidia.com/gpu: "8"

5 changes: 5 additions & 0 deletions test/e2e-kuttl-extended-resources/steps/01-assert.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Verify that the namespace was created
apiVersion: v1
kind: Namespace
metadata:
  name: extended-resources
4 changes: 4 additions & 0 deletions test/e2e-kuttl-extended-resources/steps/01-install.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Create the namespace used by the extended-resources test job
apiVersion: v1
kind: Namespace
metadata:
  name: extended-resources
8 changes: 8 additions & 0 deletions test/e2e-kuttl-extended-resources/steps/02-assert.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Verify that the GPU job reports a Complete condition
apiVersion: batch/v1
kind: Job
metadata:
  name: gpu-job
  namespace: extended-resources
status:
  conditions:
    - type: Complete
19 changes: 19 additions & 0 deletions test/e2e-kuttl-extended-resources/steps/02-install.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Job that requests 8 nvidia.com/gpu, the same count that the
# 00-assert step expects the node to advertise.
apiVersion: batch/v1
kind: Job
metadata:
  name: gpu-job
  namespace: extended-resources
spec:
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: gpu-job
          image: ubuntu:latest
          # The container only sleeps for 10 seconds and then exits.
          command: [ "/bin/bash", "-c", "--" ]
          args: [ "sleep 10;" ]
          resources:
            requests:
              nvidia.com/gpu: 8
            limits:
              nvidia.com/gpu: 8
7 changes: 7 additions & 0 deletions test/kuttl-test-extended-resources.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
apiVersion: kuttl.dev/v1beta1
kind: TestSuite
testDirs:
- test/e2e-kuttl-extended-resources/
timeout: 60
artifactsDir: _output/logs
commands: