Skip to content

Commit 5412907

Browse files
Maxusmusti authored and openshift-merge-robot committed
Rework SDK priority implementation + updated testing
1 parent cee3407 commit 5412907

File tree

6 files changed

+257
-43
lines changed

6 files changed

+257
-43
lines changed

src/codeflare_sdk/cluster/cluster.py

Lines changed: 28 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -61,38 +61,25 @@ def __init__(self, config: ClusterConfiguration):
6161
self.app_wrapper_yaml = self.create_app_wrapper()
6262
self.app_wrapper_name = self.app_wrapper_yaml.split(".")[0]
6363

64-
def evaluate_config(self):
65-
if not self.evaluate_dispatch_priority():
66-
return False
67-
else:
68-
return True
69-
7064
def evaluate_dispatch_priority(self):
7165
priority_class = self.config.dispatch_priority
72-
if priority_class is None:
73-
return True
74-
else:
75-
try:
76-
config_check()
77-
api_instance = client.CustomObjectsApi(api_config_handler())
78-
priority_classes = api_instance.list_cluster_custom_object(
79-
group="scheduling.k8s.io",
80-
version="v1",
81-
plural="priorityclasses",
82-
)
83-
available_priority_classes = [
84-
i["metadata"]["name"] for i in priority_classes["items"]
85-
]
86-
except Exception as e: # pragma: no cover
87-
return _kube_api_error_handling(e)
88-
89-
if priority_class in available_priority_classes:
90-
return True
91-
else:
92-
print(
93-
f"Priority class {priority_class} is not available in the cluster"
94-
)
95-
return False
66+
67+
try:
68+
config_check()
69+
api_instance = client.CustomObjectsApi(api_config_handler())
70+
priority_classes = api_instance.list_cluster_custom_object(
71+
group="scheduling.k8s.io",
72+
version="v1",
73+
plural="priorityclasses",
74+
)
75+
except Exception as e: # pragma: no cover
76+
return _kube_api_error_handling(e)
77+
78+
for pc in priority_classes["items"]:
79+
if pc["metadata"]["name"] == priority_class:
80+
return pc["value"]
81+
print(f"Priority class {priority_class} is not available in the cluster")
82+
return None
9683

9784
def create_app_wrapper(self):
9885
"""
@@ -109,6 +96,16 @@ def create_app_wrapper(self):
10996
f"Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication."
11097
)
11198

99+
# Before attempting to create the cluster AW, let's evaluate the ClusterConfig
100+
if self.config.dispatch_priority:
101+
priority_val = self.evaluate_dispatch_priority()
102+
if priority_val == None:
103+
raise ValueError(
104+
"Invalid Cluster Configuration, AppWrapper not generated"
105+
)
106+
else:
107+
priority_val = None
108+
112109
name = self.config.name
113110
namespace = self.config.namespace
114111
min_cpu = self.config.min_cpus
@@ -142,6 +139,7 @@ def create_app_wrapper(self):
142139
local_interactive=local_interactive,
143140
image_pull_secrets=image_pull_secrets,
144141
dispatch_priority=dispatch_priority,
142+
priority_val=priority_val,
145143
)
146144

147145
# creates a new cluster with the provided or default spec
@@ -150,12 +148,6 @@ def up(self):
150148
Applies the AppWrapper yaml, pushing the resource request onto
151149
the MCAD queue.
152150
"""
153-
154-
# Before attempting to bring up the cluster let's evaluate the ClusterConfig
155-
if not self.evaluate_config():
156-
print("Invalid Cluster Configuration")
157-
return False
158-
159151
namespace = self.config.namespace
160152
try:
161153
config_check()

src/codeflare_sdk/utils/generate_yaml.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,12 +89,21 @@ def update_labels(yaml, instascale, instance_types):
8989
metadata.pop("labels")
9090

9191

92-
def update_priority(item, dispatch_priority):
92+
def update_priority(yaml, item, dispatch_priority, priority_val):
93+
spec = yaml.get("spec")
9394
if dispatch_priority is not None:
95+
if priority_val:
96+
spec["priority"] = priority_val
97+
else:
98+
raise ValueError(
99+
"AW generation error: Priority value is None, while dispatch_priority is defined"
100+
)
94101
head = item.get("generictemplate").get("spec").get("headGroupSpec")
95102
worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0]
96103
head["template"]["spec"]["priorityClassName"] = dispatch_priority
97104
worker["template"]["spec"]["priorityClassName"] = dispatch_priority
105+
else:
106+
spec.pop("priority")
98107

99108

100109
def update_custompodresources(
@@ -355,6 +364,7 @@ def generate_appwrapper(
355364
local_interactive: bool,
356365
image_pull_secrets: list,
357366
dispatch_priority: str,
367+
priority_val: int,
358368
):
359369
user_yaml = read_template(template)
360370
appwrapper_name, cluster_name = gen_names(name)
@@ -363,7 +373,7 @@ def generate_appwrapper(
363373
route_item = resources["resources"].get("GenericItems")[1]
364374
update_names(user_yaml, item, appwrapper_name, cluster_name, namespace)
365375
update_labels(user_yaml, instascale, instance_types)
366-
update_priority(item, dispatch_priority)
376+
update_priority(user_yaml, item, dispatch_priority, priority_val)
367377
update_custompodresources(
368378
item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
369379
)

tests/test-case-prio.yaml

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
apiVersion: mcad.ibm.com/v1beta1
2+
kind: AppWrapper
3+
metadata:
4+
labels:
5+
orderedinstance: cpu.small_gpu.large
6+
name: prio-test-cluster
7+
namespace: ns
8+
spec:
9+
priority: 10
10+
resources:
11+
GenericItems:
12+
- custompodresources:
13+
- limits:
14+
cpu: 2
15+
memory: 8G
16+
nvidia.com/gpu: 0
17+
replicas: 1
18+
requests:
19+
cpu: 2
20+
memory: 8G
21+
nvidia.com/gpu: 0
22+
- limits:
23+
cpu: 4
24+
memory: 6G
25+
nvidia.com/gpu: 7
26+
replicas: 2
27+
requests:
28+
cpu: 3
29+
memory: 5G
30+
nvidia.com/gpu: 7
31+
generictemplate:
32+
apiVersion: ray.io/v1alpha1
33+
kind: RayCluster
34+
metadata:
35+
labels:
36+
appwrapper.mcad.ibm.com: prio-test-cluster
37+
controller-tools.k8s.io: '1.0'
38+
name: prio-test-cluster
39+
namespace: ns
40+
spec:
41+
autoscalerOptions:
42+
idleTimeoutSeconds: 60
43+
imagePullPolicy: Always
44+
resources:
45+
limits:
46+
cpu: 500m
47+
memory: 512Mi
48+
requests:
49+
cpu: 500m
50+
memory: 512Mi
51+
upscalingMode: Default
52+
enableInTreeAutoscaling: false
53+
headGroupSpec:
54+
rayStartParams:
55+
block: 'true'
56+
dashboard-host: 0.0.0.0
57+
num-gpus: '0'
58+
serviceType: ClusterIP
59+
template:
60+
spec:
61+
affinity:
62+
nodeAffinity:
63+
requiredDuringSchedulingIgnoredDuringExecution:
64+
nodeSelectorTerms:
65+
- matchExpressions:
66+
- key: prio-test-cluster
67+
operator: In
68+
values:
69+
- prio-test-cluster
70+
containers:
71+
- env:
72+
- name: MY_POD_IP
73+
valueFrom:
74+
fieldRef:
75+
fieldPath: status.podIP
76+
- name: RAY_USE_TLS
77+
value: '0'
78+
- name: RAY_TLS_SERVER_CERT
79+
value: /home/ray/workspace/tls/server.crt
80+
- name: RAY_TLS_SERVER_KEY
81+
value: /home/ray/workspace/tls/server.key
82+
- name: RAY_TLS_CA_CERT
83+
value: /home/ray/workspace/tls/ca.crt
84+
image: quay.io/project-codeflare/ray:2.5.0-py38-cu116
85+
imagePullPolicy: Always
86+
lifecycle:
87+
preStop:
88+
exec:
89+
command:
90+
- /bin/sh
91+
- -c
92+
- ray stop
93+
name: ray-head
94+
ports:
95+
- containerPort: 6379
96+
name: gcs
97+
- containerPort: 8265
98+
name: dashboard
99+
- containerPort: 10001
100+
name: client
101+
resources:
102+
limits:
103+
cpu: 2
104+
memory: 8G
105+
nvidia.com/gpu: 0
106+
requests:
107+
cpu: 2
108+
memory: 8G
109+
nvidia.com/gpu: 0
110+
imagePullSecrets:
111+
- name: unit-test-pull-secret
112+
priorityClassName: default
113+
rayVersion: 2.5.0
114+
workerGroupSpecs:
115+
- groupName: small-group-prio-test-cluster
116+
maxReplicas: 2
117+
minReplicas: 2
118+
rayStartParams:
119+
block: 'true'
120+
num-gpus: '7'
121+
replicas: 2
122+
template:
123+
metadata:
124+
annotations:
125+
key: value
126+
labels:
127+
key: value
128+
spec:
129+
affinity:
130+
nodeAffinity:
131+
requiredDuringSchedulingIgnoredDuringExecution:
132+
nodeSelectorTerms:
133+
- matchExpressions:
134+
- key: prio-test-cluster
135+
operator: In
136+
values:
137+
- prio-test-cluster
138+
containers:
139+
- env:
140+
- name: MY_POD_IP
141+
valueFrom:
142+
fieldRef:
143+
fieldPath: status.podIP
144+
- name: RAY_USE_TLS
145+
value: '0'
146+
- name: RAY_TLS_SERVER_CERT
147+
value: /home/ray/workspace/tls/server.crt
148+
- name: RAY_TLS_SERVER_KEY
149+
value: /home/ray/workspace/tls/server.key
150+
- name: RAY_TLS_CA_CERT
151+
value: /home/ray/workspace/tls/ca.crt
152+
image: quay.io/project-codeflare/ray:2.5.0-py38-cu116
153+
lifecycle:
154+
preStop:
155+
exec:
156+
command:
157+
- /bin/sh
158+
- -c
159+
- ray stop
160+
name: machine-learning
161+
resources:
162+
limits:
163+
cpu: 4
164+
memory: 6G
165+
nvidia.com/gpu: 7
166+
requests:
167+
cpu: 3
168+
memory: 5G
169+
nvidia.com/gpu: 7
170+
imagePullSecrets:
171+
- name: unit-test-pull-secret
172+
initContainers:
173+
- command:
174+
- sh
175+
- -c
176+
- until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local;
177+
do echo waiting for myservice; sleep 2; done
178+
image: busybox:1.28
179+
name: init-myservice
180+
priorityClassName: default
181+
replicas: 1
182+
- generictemplate:
183+
apiVersion: route.openshift.io/v1
184+
kind: Route
185+
metadata:
186+
labels:
187+
odh-ray-cluster-service: prio-test-cluster-head-svc
188+
name: ray-dashboard-prio-test-cluster
189+
namespace: ns
190+
spec:
191+
port:
192+
targetPort: dashboard
193+
to:
194+
kind: Service
195+
name: prio-test-cluster-head-svc
196+
replicas: 1
197+
Items: []

tests/test-case.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ metadata:
66
name: unit-test-cluster
77
namespace: ns
88
spec:
9-
priority: 9
109
resources:
1110
GenericItems:
1211
- custompodresources:
@@ -109,7 +108,6 @@ spec:
109108
nvidia.com/gpu: 0
110109
imagePullSecrets:
111110
- name: unit-test-pull-secret
112-
priorityClassName: default
113111
rayVersion: 2.5.0
114112
workerGroupSpecs:
115113
- groupName: small-group-unit-test-cluster
@@ -177,7 +175,6 @@ spec:
177175
do echo waiting for myservice; sleep 2; done
178176
image: busybox:1.28
179177
name: init-myservice
180-
priorityClassName: default
181178
replicas: 1
182179
- generictemplate:
183180
apiVersion: route.openshift.io/v1

tests/unit_test.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ def test_config_creation():
236236
assert config.instascale
237237
assert config.machine_types == ["cpu.small", "gpu.large"]
238238
assert config.image_pull_secrets == ["unit-test-pull-secret"]
239-
assert config.dispatch_priority == "default"
239+
assert config.dispatch_priority == None
240240

241241

242242
def test_cluster_creation():
@@ -248,6 +248,23 @@ def test_cluster_creation():
248248
)
249249

250250

251+
def test_cluster_creation_priority(mocker):
252+
mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
253+
mocker.patch(
254+
"kubernetes.client.CustomObjectsApi.list_cluster_custom_object",
255+
return_value={"items": [{"metadata": {"name": "default"}, "value": 10}]},
256+
)
257+
config = createClusterConfig()
258+
config.name = "prio-test-cluster"
259+
config.dispatch_priority = "default"
260+
cluster = Cluster(config)
261+
assert cluster.app_wrapper_yaml == "prio-test-cluster.yaml"
262+
assert cluster.app_wrapper_name == "prio-test-cluster"
263+
assert filecmp.cmp(
264+
"prio-test-cluster.yaml", f"{parent}/tests/test-case-prio.yaml", shallow=True
265+
)
266+
267+
251268
def test_default_cluster_creation(mocker):
252269
mocker.patch(
253270
"codeflare_sdk.cluster.cluster.get_current_namespace",
@@ -2251,6 +2268,7 @@ def test_export_env():
22512268
# Make sure to always keep this function last
22522269
def test_cleanup():
22532270
os.remove("unit-test-cluster.yaml")
2271+
os.remove("prio-test-cluster.yaml")
22542272
os.remove("unit-test-default-cluster.yaml")
22552273
os.remove("test.yaml")
22562274
os.remove("raytest2.yaml")

0 commit comments

Comments
 (0)