diff --git a/test/perf-test/cleanup-mcad-kwok.sh b/test/perf-test/cleanup-mcad-kwok.sh
new file mode 100755
index 000000000..b07372823
--- /dev/null
+++ b/test/perf-test/cleanup-mcad-kwok.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+for i in $(kubectl get appwrapper -n default | grep fake-defaultaw | awk '{print $1}'); do kubectl delete appwrapper "$i" -n default; done
diff --git a/test/perf-test/kwokmcadperf.sh b/test/perf-test/kwokmcadperf.sh
new file mode 100755
index 000000000..bb0e2ed37
--- /dev/null
+++ b/test/perf-test/kwokmcadperf.sh
@@ -0,0 +1,166 @@
+#!/bin/bash
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)
+
+function help() {
+  echo "usage: kwokmcadperf.sh [-h]"
+  echo
+  echo "Description: Runs Appwrapper performance test script(s) in subdirectories under $SCRIPT_DIR."
+  echo "NOTE: This runs on KWOK Fake nodes only."
+  echo
+  echo "Preconditions: "
+  echo " - The script assumes you've logged into your cluster already. If not, it will tell you to login."
+  echo " - The script checks that you have the mcad-controller installed, otherwise it'll tell you to install it first."
+  echo " - The script checks that you have the kwok-controller installed, otherwise it'll tell you to install it first."
+  echo
+  echo "Options:"
+  echo " -h Print this help message"
+  echo
+}
+
+function check_kubectl_login_status() {
+  set +e
+  kubectl get ns default &> /dev/null
+  res="$?"
+  set -e
+  OCP="$res"
+  if [ "$OCP" -ne 0 ]
+  then
+    echo "You need to login to your Kubernetes Cluster"
+    exit 1
+  else
+    echo
+    echo "Nice, looks like you're logged in"
+  fi
+}
+
+function check_mcad_installed_status() {
+  set +e
+  kubectl get pod -A |grep mcad-controller &> /dev/null
+  res2="$?"
+  kubectl get crd |grep appwrapper &> /dev/null
+  res3="$?"
+  set -e
+  MCAD="$res2"
+  CRD="$res3"
+  if [[ "$MCAD" -ne 0 || "$CRD" -ne 0 ]]
+  then
+    echo "You need Install MCAD Controller first before running this script"
+    exit 1
+  else
+    echo "Nice, MCAD Controller is installed"
+  fi
+}
+
+function check_kwok_installed_status() {
+  set +e
+  kubectl get pod -A |grep kwok-controller &> /dev/null
+  res2="$?"
+  set -e
+  KWOK="$res2"
+  if [[ "$KWOK" -ne 0 ]]
+  then
+    echo "You need Install the KWOK Controller first before running this script"
+    exit 1
+  else
+    echo "Nice, the KWOK Controller is installed"
+  fi
+}
+
+
+while getopts hf: option; do
+  case $option in
+    h)
+      help
+      exit 0
+      ;;
+    *)
+      ;;
+  esac
+done
+shift $((OPTIND-1))
+
+# Abort early unless kubectl can reach the cluster
+echo "Checking whether we have a valid cluster login or not..."
+check_kubectl_login_status
+
+# Abort early unless the MCAD controller pod and the appwrapper CRD are present
+echo "Checking MCAD Controller installation status"
+echo
+check_mcad_installed_status
+
+# Abort early unless the KWOK controller pod is present
+echo "Checking KWOK Controller installation status"
+echo
+check_kwok_installed_status
+
+echo
+read -r -p "How many fake KWOK appwrapper jobs do you want?" 
jobs
+
+# Start the timer now (SECONDS is bash's built-in elapsed-time counter)
+SECONDS=0
+
+echo "jobs number is $jobs"
+export STARTTIME=$(date +"%T")
+echo " "
+echo "Jobs started at: $STARTTIME" |tee "${SCRIPT_DIR}/fake-job-$STARTTIME.log"
+echo " "
+
+# Upper bound of the zero-based loop below. Plain assignment on purpose: (( )) returns 1 when the result is 0, which aborts the script under 'set -e' when 1 job is requested
+realjobs=$((jobs - 1))
+
+for num in $(eval echo "{0.."$realjobs"}")
+do
+  next_num=$((num + 1))
+  echo "Submitting job $next_num"
+# Had to do this OSTYPE because sed acts differently on Linux versus Mac
+  case "$OSTYPE" in
+    linux-gnu*)
+      sed -i "s/fake-defaultaw-schd-spec-with-timeout-$num/fake-defaultaw-schd-spec-with-timeout-$next_num/g" "${SCRIPT_DIR}/preempt-exp-kwok.yaml" ;;
+    darwin*)
+      sed -i '' "s/fake-defaultaw-schd-spec-with-timeout-$num/fake-defaultaw-schd-spec-with-timeout-$next_num/g" "${SCRIPT_DIR}/preempt-exp-kwok.yaml" ;;
+    *)
+      sed -i "s/fake-defaultaw-schd-spec-with-timeout-$num/fake-defaultaw-schd-spec-with-timeout-$next_num/g" "${SCRIPT_DIR}/preempt-exp-kwok.yaml" ;;
+  esac
+  kubectl apply -f "${SCRIPT_DIR}/preempt-exp-kwok.yaml"
+done
+
+  # Let's reset the original preempt-exp-kwok.yaml file back to original value
+  case "$OSTYPE" in
+    linux-gnu*)
+      sed -i "s/fake-defaultaw-schd-spec-with-timeout-$next_num/fake-defaultaw-schd-spec-with-timeout-1/g" "${SCRIPT_DIR}/preempt-exp-kwok.yaml" ;;
+    darwin*)
+      sed -i '' "s/fake-defaultaw-schd-spec-with-timeout-$next_num/fake-defaultaw-schd-spec-with-timeout-1/g" "${SCRIPT_DIR}/preempt-exp-kwok.yaml" ;;
+    *)
+      sed -i "s/fake-defaultaw-schd-spec-with-timeout-$next_num/fake-defaultaw-schd-spec-with-timeout-1/g" "${SCRIPT_DIR}/preempt-exp-kwok.yaml" ;;
+  esac
+
+# Poll every 10 seconds until the number of successful jobs reaches the requested count
+jobstatus=$(kubectl get jobs -n default --no-headers --field-selector status.successful=1 |wc -l)
+
+while [ $jobstatus -lt $jobs ]
+do
+  echo "Number of completed jobs is: " $jobstatus " and the goal is: " $jobs
+  sleep 10
+  jobstatus=$(kubectl get jobs -n default --no-headers --field-selector status.successful=1 |wc -l)
+done
+
+echo " "
+export 
FINISHTIME=$(date +"%T")
+echo "All $jobstatus jobs finished: $FINISHTIME" |tee -a "${SCRIPT_DIR}/fake-job-$STARTTIME.log"
+echo "Total amount of time for $jobs appwrappers is: $SECONDS seconds" |tee -a "${SCRIPT_DIR}/fake-job-$STARTTIME.log"
+echo " "
+echo "Test results are stored in this file: ${SCRIPT_DIR}/fake-job-$next_num-$STARTTIME.log"
+
+# Rename the log to show the number of jobs used
+mv "${SCRIPT_DIR}/fake-job-$STARTTIME.log" "${SCRIPT_DIR}/fake-job-$next_num-$STARTTIME.log"
+
+# Ask whether to auto-cleanup the appwrapper jobs; anything other than Y/y (including an empty answer) means no
+echo "Do you want to cleanup the most recently created appwrappers? [y/N]"
+read -r DELETE
+if [[ $DELETE == "Y" || $DELETE == "y" ]]; then
+  echo "OK, deleting"
+  "${SCRIPT_DIR}/cleanup-mcad-kwok.sh"
+else
+  echo "OK, you'll need to cleanup yourself later using ./cleanup-mcad-kwok.sh"
+fi
diff --git a/test/perf-test/node.yaml b/test/perf-test/node.yaml
new file mode 100644
index 000000000..0745b269c
--- /dev/null
+++ b/test/perf-test/node.yaml
@@ -0,0 +1,42 @@
+apiVersion: v1
+kind: Node
+metadata:
+  annotations:
+    node.alpha.kubernetes.io/ttl: "0"
+    kwok.x-k8s.io/node: fake
+  labels:
+    beta.kubernetes.io/arch: amd64
+    beta.kubernetes.io/os: linux
+    kubernetes.io/arch: amd64
+    kubernetes.io/hostname: kwok-node-0
+    kubernetes.io/os: linux
+    kubernetes.io/role: agent
+    node-role.kubernetes.io/agent: ""
+    type: kwok
+  name: kwok-node-0
+spec:
+  taints: # Avoid scheduling actual running pods to fake Node
+    - effect: NoSchedule
+      key: kwok.x-k8s.io/node
+      value: fake
+status:
+  allocatable:
+    cpu: 32
+    memory: 256Gi
+    pods: 110
+  capacity:
+    cpu: 32
+    memory: 256Gi
+    pods: 110
+  nodeInfo:
+    architecture: amd64
+    bootID: ""
+    containerRuntimeVersion: ""
+    kernelVersion: ""
+    kubeProxyVersion: fake
+    kubeletVersion: fake
+    machineID: ""
+    operatingSystem: linux
+    osImage: ""
+    systemUUID: ""
+  phase: Running
diff --git a/test/perf-test/nodes.sh b/test/perf-test/nodes.sh
new file mode 100755
index 000000000..dc0bcb3ad
--- /dev/null
+++ 
b/test/perf-test/nodes.sh
@@ -0,0 +1,118 @@
+#!/bin/bash
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)
+
+function help() {
+  echo "usage: nodes.sh [-h]"
+  echo
+  echo "Description: Creates fake KWOK nodes for performance testing"
+  echo
+  echo "Preconditions: "
+  echo " - The script assumes you've logged into your cluster already. If not, it will tell you to login."
+  echo " - The script checks that you have the kwok-controller installed, otherwise it'll tell you to install it first."
+  echo
+  echo "Options:"
+  echo " -h Print this help message"
+  echo
+}
+
+function check_kubectl_login_status() {
+  set +e
+  kubectl get ns default &> /dev/null
+  res="$?"
+  set -e
+  OCP="$res"
+  if [ "$OCP" -ne 0 ]
+  then
+    echo "You need to login to your Kubernetes Cluster"
+    exit 1
+  else
+    echo
+    echo "Nice, looks like you're logged in"
+    echo ""
+  fi
+}
+
+function check_kwok_installed_status() {
+  set +e
+  kubectl get pod -A |grep kwok-controller &> /dev/null
+  res2="$?"
+  set -e
+  KWOK="$res2"
+  if [[ "$KWOK" -ne 0 ]]
+  then
+    echo "You need Install the KWOK Controller first before running this script"
+    exit 1
+  else
+    echo "Nice, the KWOK Controller is installed"
+  fi
+}
+
+while getopts hf: option; do
+  case $option in
+    h)
+      help
+      exit 0
+      ;;
+    *)
+      ;;
+  esac
+done
+shift $((OPTIND-1))
+
+# Abort early unless kubectl can reach the cluster
+echo "Checking whether we have a valid cluster login or not..."
+check_kubectl_login_status
+
+# Abort early unless the KWOK controller pod is present
+echo "Checking KWOK Controller installation status"
+echo
+check_kwok_installed_status
+
+echo
+read -r -p "How many simulated KWOK nodes do you want?" 
nodes
+
+echo "Nodes number is $nodes"
+echo " "
+
+# Upper bound of the zero-based loop below. Plain assignment on purpose: (( )) returns 1 when the result is 0, which aborts the script under 'set -e' when 1 node is requested
+realnodes=$((nodes - 1))
+echo "The real number of nodes is $realnodes"
+
+for num in $(eval echo "{0.."$realnodes"}")
+do
+  next_num=$((num + 1))
+  echo "Submitting node $next_num"
+# Had to do this OSTYPE because sed acts differently on Linux versus Mac
+  case "$OSTYPE" in
+    linux-gnu*)
+      sed -i "s/kwok-node-$num/kwok-node-$next_num/g" "${SCRIPT_DIR}/node.yaml" ;;
+    darwin*)
+      sed -i '' "s/kwok-node-$num/kwok-node-$next_num/g" "${SCRIPT_DIR}/node.yaml" ;;
+    *)
+      sed -i "s/kwok-node-$num/kwok-node-$next_num/g" "${SCRIPT_DIR}/node.yaml" ;;
+  esac
+  kubectl apply -f "${SCRIPT_DIR}/node.yaml"
+done
+
+  # Let's reset the original node.yaml file back to original value
+  case "$OSTYPE" in
+    linux-gnu*)
+      sed -i "s/kwok-node-$next_num/kwok-node-0/g" "${SCRIPT_DIR}/node.yaml" ;;
+    darwin*)
+      sed -i '' "s/kwok-node-$next_num/kwok-node-0/g" "${SCRIPT_DIR}/node.yaml" ;;
+    *)
+      sed -i "s/kwok-node-$next_num/kwok-node-0/g" "${SCRIPT_DIR}/node.yaml" ;;
+  esac
+
+# Block (up to 600s) until every node labelled type=kwok reports Ready
+echo "Waiting until all the simulated nodes become ready:"
+kubectl wait --for=condition=Ready nodes --selector type=kwok --timeout=600s
+echo " "
+echo "Total amount of simulated nodes requested is: $nodes"
+echo "Total number of created nodes is: "$(kubectl get nodes --selector type=kwok -o name |wc -l)
+kubectl get nodes --selector type=kwok
+
+echo " "
+echo "FYI, to clean up the kwok nodes, issue this:"
+echo "kubectl get nodes --selector type=kwok -o name | xargs kubectl delete"
diff --git a/test/perf-test/preempt-exp-kwok.yaml b/test/perf-test/preempt-exp-kwok.yaml
new file mode 100644
index 000000000..8f7729fc4
--- /dev/null
+++ b/test/perf-test/preempt-exp-kwok.yaml
@@ -0,0 +1,61 @@
+apiVersion: mcad.ibm.com/v1beta1
+kind: AppWrapper
+metadata:
+  name: fake-defaultaw-schd-spec-with-timeout-1
+  namespace: default
+spec:
+  
schedulingSpec:
+    minAvailable: 1
+    requeuing:
+        timeInSeconds: 120
+        growthType: "exponential"
+  priority: 9
+  resources:
+    Items: []
+    GenericItems:
+    - replicas: 1
+      completionstatus: Complete
+      custompodresources:
+      - replicas: 1
+        requests:
+          cpu: 10m
+          memory: 10M
+          nvidia.com/gpu: 0
+        limits:
+          cpu: 500m
+          memory: 128M
+          nvidia.com/gpu: 0
+      generictemplate:
+        apiVersion: batch/v1
+        kind: Job
+        metadata:
+          namespace: default
+          name: fake-defaultaw-schd-spec-with-timeout-1
+        spec:
+          parallelism: 1
+          completions: 1
+          template:
+            metadata:
+              namespace: default
+              labels:
+                appwrapper.mcad.ibm.com: "fake-defaultaw-schd-spec-with-timeout-1"
+            spec:
+              affinity:
+                nodeAffinity:
+                  requiredDuringSchedulingIgnoredDuringExecution:
+                    nodeSelectorTerms:
+                    - matchExpressions:
+                      - key: type
+                        operator: In
+                        values:
+                        - kwok
+              # A taint was added to the automatically created Node.
+              # You can either remove the Node's taint or add this toleration.
+              tolerations:
+              - key: "kwok.x-k8s.io/node"
+                operator: "Exists"
+                effect: "NoSchedule"
+              containers:
+              - name: fake-defaultaw-schd-spec-with-timeout-1
+                image: fake-image
+              restartPolicy: Never
diff --git a/test/perf-test/simulatingnodesandappwrappers.md b/test/perf-test/simulatingnodesandappwrappers.md
new file mode 100644
index 000000000..9dc314a93
--- /dev/null
+++ b/test/perf-test/simulatingnodesandappwrappers.md
@@ -0,0 +1,199 @@
+# Using Kwok to simulate a large Kubernetes/OpenShift Cluster (with or without GPU)
+
+This uses the open-source KWOK tool from https://kwok.sigs.k8s.io/
+
+The steps below show two ways to simulate a large number of KWOK Kubernetes nodes.
+- The first way is running on your Mac laptop
+- The second way is running KWOK inside an existing Kubernetes cluster
+
+# First Way: Using KWOK to simulate a large number of nodes and MCAD Appwrappers on a Mac laptop
+## Step 0. 
Pre-Reqs
+### 0.1 Make sure you have podman installed on your Mac (I don't have Docker to test with), with a podman machine of at least 4 CPUs and 8GB of memory:
+```
+brew update
+brew upgrade
+brew install podman
+podman machine init --cpus 4 --memory 8196
+podman machine set --rootful
+podman machine start
+podman machine list
+```
+
+### 0.2 Install helm on your laptop, if you don't already have it:
+```
+brew install helm
+```
+
+### 0.3 Create a kind cluster
+```
+brew install kind
+kind --version
+kind create cluster
+kubectl get nodes
+```
+### 0.4 Check that you see your node:
+```
+kubectl get nodes
+
+NAME                 STATUS   ROLES           AGE   VERSION
+kind-control-plane   Ready    control-plane   56m   v1.26.3
+```
+Note: If you need to get back to your kind cluster context at some point later, the command is:
+```
+kubectl cluster-info --context kind-kind
+```
+
+### 0.5 Install OLM on the cluster:
+Note: The latest version changes with time. You can find the latest releases at: https://github.com/operator-framework/operator-lifecycle-manager/releases/
+```
+curl -L https://github.com/operator-framework/operator-lifecycle-manager/releases/download/v0.24.0/install.sh -o install.sh
+chmod +x install.sh
+./install.sh v0.24.0
+```
+### 0.6 Check that your OLM pods start:
+```
+kubectl get pods -A
+```
+## Step 1. Deploy MCAD on your cluster
+### 1.1 Make sure you have room: you'll need at least 2 free CPUs and at least 2GB of free memory
+```
+kubectl describe node |grep cpu
+kubectl describe node |grep mem
+```
+### 1.2 Clone the MCAD repo and change directory to its deployment folder:
+```
+git clone https://github.com/project-codeflare/multi-cluster-app-dispatcher.git
+cd multi-cluster-app-dispatcher/deployment
+```
+### 1.3 Install via helm using the following command - change the image.tag as necessary if you want something specific...
+```
+helm install mcad-controller --namespace kube-system --generate-name --set image.repository=quay.io/project-codeflare/mcad-controller --set image.tag=main-v1.29.58
+```
+### 1.4 Check that MCAD is running:
+```
+kubectl get pods -n kube-system |grep mcad
+```
+
+## Step 2. Creating simulated KWOK node(s)
+### 2.1 cd to where the MCAD performance scripts are located
+```
+cd ../test/perf-test
+```
+
+### 2.2 Run the script ./nodes.sh
+```
+./nodes.sh
+```
+### 2.3 Check that the requested number of nodes started:
+```
+kubectl get nodes --selector type=kwok
+```
+
+## Step 3. Create some AppWrapper jobs which create simulated pods on the simulated KWOK nodes
+### 3.1 Run the script kwokmcadperf.sh
+```
+./kwokmcadperf.sh
+```
+## Step 4. Cleaning up
+### 4.1 Clean up all the simulated AppWrapper jobs with the cleanup-mcad-kwok.sh script:
+```
+./cleanup-mcad-kwok.sh
+```
+### 4.2 Clean up all the simulated nodes with the following command:
+```
+kubectl get nodes --selector type=kwok -o name | xargs kubectl delete
+```
+
+# Second Way: Using KWOK inside an existing Kubernetes cluster
+## Step 0. Pre-Reqs
+### 0.1 Requires a Cluster running Kubernetes v1.10 or higher.
+```
+kubectl version --short=true
+```
+
+### 0.2 Access to the `kube-system` namespace.
+```
+kubectl get pods -n kube-system
+```
+
+### 0.3 Requires that the MCAD controller is already installed
+```
+kubectl get pods -A |grep mcad-controller
+```
+
+### 0.4 Install podman, jq, etc...
+```
+yum install make podman git tree jq go bc -y
+```
+
+### 0.5 Install the latest version of Kustomize
+```
+OS=$(uname)
+curl -s "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" | bash
+mv kustomize /usr/local/bin
+kustomize version
+```
+
+## Step 1. 
Install KWOK in Cluster:
+### 1.1 Variable Prep:
+```
+export KWOK_WORK_DIR=$(mktemp -d)
+export KWOK_REPO=kubernetes-sigs/kwok
+export KWOK_LATEST_RELEASE=$(curl "https://api.github.com/repos/${KWOK_REPO}/releases/latest" | jq -r '.tag_name')
+```
+### 1.2 Render kustomization yaml
+```
+cat <<EOF > "${KWOK_WORK_DIR}/kustomization.yaml"
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+images:
+  - name: registry.k8s.io/kwok/kwok
+    newTag: "${KWOK_LATEST_RELEASE}"
+resources:
+  - "https://github.com/${KWOK_REPO}/kustomize/kwok?ref=${KWOK_LATEST_RELEASE}"
+EOF
+```
+### 1.3 Render it with the prepared variables.
+```
+kubectl kustomize "${KWOK_WORK_DIR}" > "${KWOK_WORK_DIR}/kwok.yaml"
+```
+## Step 2. Install the KWOK Controller in kube-system namespace:
+### 2.1 Apply your rendered yaml file from step 1.3 above:
+```
+kubectl apply -f "${KWOK_WORK_DIR}/kwok.yaml"
+```
+### 2.2 Check to make sure the kwok controller started:
+```
+kubectl get pods -n kube-system |grep kwok-controller
+```
+
+## Step 3. Creating simulated KWOK node(s)
+### 3.1 Clone the MCAD repo and change directory to the test/perf-test folder:
+```
+git clone https://github.com/project-codeflare/multi-cluster-app-dispatcher.git
+cd multi-cluster-app-dispatcher/test/perf-test
+```
+
+### 3.2 Run the script ./nodes.sh
+```
+./nodes.sh
+```
+### 3.3 Check that the requested number of nodes started:
+```
+kubectl get nodes --selector type=kwok
+```
+
+## Step 4. Create some AppWrapper jobs which create simulated pods on the simulated KWOK nodes
+### 4.1 Run the script kwokmcadperf.sh
+```
+./kwokmcadperf.sh
+```
+## Step 5. Cleaning up
+### 5.1 Clean up all the simulated AppWrapper jobs with the cleanup-mcad-kwok.sh script:
+```
+./cleanup-mcad-kwok.sh
+```
+### 5.2 Clean up all the simulated nodes with the following command:
+```
+kubectl get nodes --selector type=kwok -o name | xargs kubectl delete
+```