Skip to content

Commit 5412907

Browse files
Maxusmusti authored and openshift-merge-robot committed
Rework SDK priority implementation + updated testing
1 parent cee3407 commit 5412907

File tree

6 files changed

+257
-43
lines changed

6 files changed

+257
-43
lines changed

src/codeflare_sdk/cluster/cluster.py

Lines changed: 28 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -61,38 +61,25 @@ def __init__(self, config: ClusterConfiguration):
6161
self.app_wrapper_yaml = self.create_app_wrapper()
6262
self.app_wrapper_name = self.app_wrapper_yaml.split(".")[0]
6363

64-
def evaluate_config(self):
65-
if not self.evaluate_dispatch_priority():
66-
return False
67-
else:
68-
return True
69-
7064
def evaluate_dispatch_priority(self):
7165
priority_class = self.config.dispatch_priority
72-
if priority_class is None:
73-
return True
74-
else:
75-
try:
76-
config_check()
77-
api_instance = client.CustomObjectsApi(api_config_handler())
78-
priority_classes = api_instance.list_cluster_custom_object(
79-
group="scheduling.k8s.io",
80-
version="v1",
81-
plural="priorityclasses",
82-
)
83-
available_priority_classes = [
84-
i["metadata"]["name"] for i in priority_classes["items"]
85-
]
86-
except Exception as e: # pragma: no cover
87-
return _kube_api_error_handling(e)
88-
89-
if priority_class in available_priority_classes:
90-
return True
91-
else:
92-
print(
93-
f"Priority class {priority_class} is not available in the cluster"
94-
)
95-
return False
66+
67+
try:
68+
config_check()
69+
api_instance = client.CustomObjectsApi(api_config_handler())
70+
priority_classes = api_instance.list_cluster_custom_object(
71+
group="scheduling.k8s.io",
72+
version="v1",
73+
plural="priorityclasses",
74+
)
75+
except Exception as e: # pragma: no cover
76+
return _kube_api_error_handling(e)
77+
78+
for pc in priority_classes["items"]:
79+
if pc["metadata"]["name"] == priority_class:
80+
return pc["value"]
81+
print(f"Priority class {priority_class} is not available in the cluster")
82+
return None
9683

9784
def create_app_wrapper(self):
9885
"""
@@ -109,6 +96,16 @@ def create_app_wrapper(self):
10996
f"Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication."
11097
)
11198

99+
# Before attempting to create the cluster AW, let's evaluate the ClusterConfig
100+
if self.config.dispatch_priority:
101+
priority_val = self.evaluate_dispatch_priority()
102+
if priority_val == None:
103+
raise ValueError(
104+
"Invalid Cluster Configuration, AppWrapper not generated"
105+
)
106+
else:
107+
priority_val = None
108+
112109
name = self.config.name
113110
namespace = self.config.namespace
114111
min_cpu = self.config.min_cpus
@@ -142,6 +139,7 @@ def create_app_wrapper(self):
142139
local_interactive=local_interactive,
143140
image_pull_secrets=image_pull_secrets,
144141
dispatch_priority=dispatch_priority,
142+
priority_val=priority_val,
145143
)
146144

147145
# creates a new cluster with the provided or default spec
@@ -150,12 +148,6 @@ def up(self):
150148
Applies the AppWrapper yaml, pushing the resource request onto
151149
the MCAD queue.
152150
"""
153-
154-
# Before attempting to bring up the cluster let's evaluate the ClusterConfig
155-
if not self.evaluate_config():
156-
print("Invalid Cluster Configuration")
157-
return False
158-
159151
namespace = self.config.namespace
160152
try:
161153
config_check()

src/codeflare_sdk/utils/generate_yaml.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,12 +89,21 @@ def update_labels(yaml, instascale, instance_types):
8989
metadata.pop("labels")
9090

9191

92-
def update_priority(item, dispatch_priority):
92+
def update_priority(yaml, item, dispatch_priority, priority_val):
93+
spec = yaml.get("spec")
9394
if dispatch_priority is not None:
95+
if priority_val:
96+
spec["priority"] = priority_val
97+
else:
98+
raise ValueError(
99+
"AW generation error: Priority value is None, while dispatch_priority is defined"
100+
)
94101
head = item.get("generictemplate").get("spec").get("headGroupSpec")
95102
worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0]
96103
head["template"]["spec"]["priorityClassName"] = dispatch_priority
97104
worker["template"]["spec"]["priorityClassName"] = dispatch_priority
105+
else:
106+
spec.pop("priority")
98107

99108

100109
def update_custompodresources(
@@ -355,6 +364,7 @@ def generate_appwrapper(
355364
local_interactive: bool,
356365
image_pull_secrets: list,
357366
dispatch_priority: str,
367+
priority_val: int,
358368
):
359369
user_yaml = read_template(template)
360370
appwrapper_name, cluster_name = gen_names(name)
@@ -363,7 +373,7 @@ def generate_appwrapper(
363373
route_item = resources["resources"].get("GenericItems")[1]
364374
update_names(user_yaml, item, appwrapper_name, cluster_name, namespace)
365375
update_labels(user_yaml, instascale, instance_types)
366-
update_priority(item, dispatch_priority)
376+
update_priority(user_yaml, item, dispatch_priority, priority_val)
367377
update_custompodresources(
368378
item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
369379
)

tests/test-case-prio.yaml

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
apiVersion: mcad.ibm.com/v1beta1
2+
kind: AppWrapper
3+
metadata:
4+
labels:
5+
orderedinstance: cpu.small_gpu.large
6+
name: prio-test-cluster
7+
namespace: ns
8+
spec:
9+
priority: 10
10+
resources:
11+
GenericItems:
12+
- custompodresources:
13+
- limits:
14+
cpu: 2
15+
memory: 8G
16+
nvidia.com/gpu: 0
17+
replicas: 1
18+
requests:
19+
cpu: 2
20+
memory: 8G
21+
nvidia.com/gpu: 0
22+
- limits:
23+
cpu: 4
24+
memory: 6G
25+
nvidia.com/gpu: 7
26+
replicas: 2
27+
requests:
28+
cpu: 3
29+
memory: 5G
30+
nvidia.com/gpu: 7
31+
generictemplate:
32+
apiVersion: ray.io/v1alpha1
33+
kind: RayCluster
34+
metadata:
35+
labels:
36+
appwrapper.mcad.ibm.com: prio-test-cluster
37+
controller-tools.k8s.io: '1.0'
38+
name: prio-test-cluster
39+
namespace: ns
40+
spec:
41+
autoscalerOptions:
42+
idleTimeoutSeconds: 60
43+
imagePullPolicy: Always
44+
resources:
45+
limits:
46+
cpu: 500m
47+
memory: 512Mi
48+
requests:
49+
cpu: 500m
50+
memory: 512Mi
51+
upscalingMode: Default
52+
enableInTreeAutoscaling: false
53+
headGroupSpec:
54+
rayStartParams:
55+
block: 'true'
56+
dashboard-host: 0.0.0.0
57+
num-gpus: '0'
58+
serviceType: ClusterIP
59+
template:
60+
spec:
61+
affinity:
62+
nodeAffinity:
63+
requiredDuringSchedulingIgnoredDuringExecution:
64+
nodeSelectorTerms:
65+
- matchExpressions:
66+
- key: prio-test-cluster
67+
operator: In
68+
values:
69+
- prio-test-cluster
70+
containers:
71+
- env:
72+
- name: MY_POD_IP
73+
valueFrom:
74+
fieldRef:
75+
fieldPath: status.podIP
76+
- name: RAY_USE_TLS
77+
value: '0'
78+
- name: RAY_TLS_SERVER_CERT
79+
value: /home/ray/workspace/tls/server.crt
80+
- name: RAY_TLS_SERVER_KEY
81+
value: /home/ray/workspace/tls/server.key
82+
- name: RAY_TLS_CA_CERT
83+
value: /home/ray/workspace/tls/ca.crt
84+
image: quay.io/project-codeflare/ray:2.5.0-py38-cu116
85+
imagePullPolicy: Always
86+
lifecycle:
87+
preStop:
88+
exec:
89+
command:
90+
- /bin/sh
91+
- -c
92+
- ray stop
93+
name: ray-head
94+
ports:
95+
- containerPort: 6379
96+
name: gcs
97+
- containerPort: 8265
98+
name: dashboard
99+
- containerPort: 10001
100+
name: client
101+
resources:
102+
limits:
103+
cpu: 2
104+
memory: 8G
105+
nvidia.com/gpu: 0
106+
requests:
107+
cpu: 2
108+
memory: 8G
109+
nvidia.com/gpu: 0
110+
imagePullSecrets:
111+
- name: unit-test-pull-secret
112+
priorityClassName: default
113+
rayVersion: 2.5.0
114+
workerGroupSpecs:
115+
- groupName: small-group-prio-test-cluster
116+
maxReplicas: 2
117+
minReplicas: 2
118+
rayStartParams:
119+
block: 'true'
120+
num-gpus: '7'
121+
replicas: 2
122+
template:
123+
metadata:
124+
annotations:
125+
key: value
126+
labels:
127+
key: value
128+
spec:
129+
affinity:
130+
nodeAffinity:
131+
requiredDuringSchedulingIgnoredDuringExecution:
132+
nodeSelectorTerms:
133+
- matchExpressions:
134+
- key: prio-test-cluster
135+
operator: In
136+
values:
137+
- prio-test-cluster
138+
containers:
139+
- env:
140+
- name: MY_POD_IP
141+
valueFrom:
142+
fieldRef:
143+
fieldPath: status.podIP
144+
- name: RAY_USE_TLS
145+
value: '0'
146+
- name: RAY_TLS_SERVER_CERT
147+
value: /home/ray/workspace/tls/server.crt
148+
- name: RAY_TLS_SERVER_KEY
149+
value: /home/ray/workspace/tls/server.key
150+
- name: RAY_TLS_CA_CERT
151+
value: /home/ray/workspace/tls/ca.crt
152+
image: quay.io/project-codeflare/ray:2.5.0-py38-cu116
153+
lifecycle:
154+
preStop:
155+
exec:
156+
command:
157+
- /bin/sh
158+
- -c
159+
- ray stop
160+
name: machine-learning
161+
resources:
162+
limits:
163+
cpu: 4
164+
memory: 6G
165+
nvidia.com/gpu: 7
166+
requests:
167+
cpu: 3
168+
memory: 5G
169+
nvidia.com/gpu: 7
170+
imagePullSecrets:
171+
- name: unit-test-pull-secret
172+
initContainers:
173+
- command:
174+
- sh
175+
- -c
176+
- until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local;
177+
do echo waiting for myservice; sleep 2; done
178+
image: busybox:1.28
179+
name: init-myservice
180+
priorityClassName: default
181+
replicas: 1
182+
- generictemplate:
183+
apiVersion: route.openshift.io/v1
184+
kind: Route
185+
metadata:
186+
labels:
187+
odh-ray-cluster-service: prio-test-cluster-head-svc
188+
name: ray-dashboard-prio-test-cluster
189+
namespace: ns
190+
spec:
191+
port:
192+
targetPort: dashboard
193+
to:
194+
kind: Service
195+
name: prio-test-cluster-head-svc
196+
replicas: 1
197+
Items: []

tests/test-case.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ metadata:
66
name: unit-test-cluster
77
namespace: ns
88
spec:
9-
priority: 9
109
resources:
1110
GenericItems:
1211
- custompodresources:
@@ -109,7 +108,6 @@ spec:
109108
nvidia.com/gpu: 0
110109
imagePullSecrets:
111110
- name: unit-test-pull-secret
112-
priorityClassName: default
113111
rayVersion: 2.5.0
114112
workerGroupSpecs:
115113
- groupName: small-group-unit-test-cluster
@@ -177,7 +175,6 @@ spec:
177175
do echo waiting for myservice; sleep 2; done
178176
image: busybox:1.28
179177
name: init-myservice
180-
priorityClassName: default
181178
replicas: 1
182179
- generictemplate:
183180
apiVersion: route.openshift.io/v1

tests/unit_test.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ def test_config_creation():
236236
assert config.instascale
237237
assert config.machine_types == ["cpu.small", "gpu.large"]
238238
assert config.image_pull_secrets == ["unit-test-pull-secret"]
239-
assert config.dispatch_priority == "default"
239+
assert config.dispatch_priority == None
240240

241241

242242
def test_cluster_creation():
@@ -248,6 +248,23 @@ def test_cluster_creation():
248248
)
249249

250250

251+
def test_cluster_creation_priority(mocker):
252+
mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
253+
mocker.patch(
254+
"kubernetes.client.CustomObjectsApi.list_cluster_custom_object",
255+
return_value={"items": [{"metadata": {"name": "default"}, "value": 10}]},
256+
)
257+
config = createClusterConfig()
258+
config.name = "prio-test-cluster"
259+
config.dispatch_priority = "default"
260+
cluster = Cluster(config)
261+
assert cluster.app_wrapper_yaml == "prio-test-cluster.yaml"
262+
assert cluster.app_wrapper_name == "prio-test-cluster"
263+
assert filecmp.cmp(
264+
"prio-test-cluster.yaml", f"{parent}/tests/test-case-prio.yaml", shallow=True
265+
)
266+
267+
251268
def test_default_cluster_creation(mocker):
252269
mocker.patch(
253270
"codeflare_sdk.cluster.cluster.get_current_namespace",
@@ -2251,6 +2268,7 @@ def test_export_env():
22512268
# Make sure to always keep this function last
22522269
def test_cleanup():
22532270
os.remove("unit-test-cluster.yaml")
2271+
os.remove("prio-test-cluster.yaml")
22542272
os.remove("unit-test-default-cluster.yaml")
22552273
os.remove("test.yaml")
22562274
os.remove("raytest2.yaml")

0 commit comments

Comments
 (0)