Skip to content

Commit f74b9df

Browse files
committed
Made Kueue the default queueing strategy
1 parent 1497434 commit f74b9df

File tree

9 files changed

+118
-4
lines changed

9 files changed

+118
-4
lines changed

src/codeflare_sdk/cluster/cluster.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ def create_app_wrapper(self):
191191
ingress_domain = self.config.ingress_domain
192192
ingress_options = self.config.ingress_options
193193
write_to_file = self.config.write_to_file
194+
local_queue = self.config.local_queue
194195
return generate_appwrapper(
195196
name=name,
196197
namespace=namespace,
@@ -217,6 +218,7 @@ def create_app_wrapper(self):
217218
ingress_domain=ingress_domain,
218219
ingress_options=ingress_options,
219220
write_to_file=write_to_file,
221+
local_queue=local_queue,
220222
)
221223

222224
# creates a new cluster with the provided or default spec

src/codeflare_sdk/cluster/config.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ class ClusterConfiguration:
4646
num_gpus: int = 0
4747
template: str = f"{dir}/templates/base-template.yaml"
4848
instascale: bool = False
49-
mcad: bool = True
49+
mcad: bool = False
5050
envs: dict = field(default_factory=dict)
5151
image: str = ""
5252
local_interactive: bool = False
@@ -56,3 +56,4 @@ class ClusterConfiguration:
5656
ingress_options: dict = field(default_factory=dict)
5757
ingress_domain: str = None
5858
write_to_file: bool = False
59+
local_queue: str = None

src/codeflare_sdk/templates/base-template.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ spec:
4444
sdk.codeflare.dev/local_interactive: "False"
4545
labels:
4646
workload.codeflare.dev/appwrapper: "aw-kuberay"
47+
kueue.x-k8s.io/queue-name: local-queue
4748
controller-tools.k8s.io: "1.0"
4849
# A unique identifier for the head node and workers of this cluster.
4950
name: kuberay-cluster

src/codeflare_sdk/utils/generate_yaml.py

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -557,6 +557,9 @@ def write_user_appwrapper(user_yaml, output_file_name):
557557
os.makedirs(directory_path)
558558

559559
with open(output_file_name, "w") as outfile:
560+
del user_yaml["spec"]["resources"]["GenericItems"][0]["generictemplate"][
561+
"metadata"
562+
]["labels"]["kueue.x-k8s.io/queue-name"]
560563
yaml.dump(user_yaml, outfile, default_flow_style=False)
561564

562565
print(f"Written to: {output_file_name}")
@@ -635,7 +638,39 @@ def _create_oauth_sidecar_object(
635638
)
636639

637640

638-
def write_components(user_yaml: dict, output_file_name: str):
641+
def update_local_kueue_label(local_queue: str, namespace: str):
642+
if local_queue is not None:
643+
return local_queue
644+
else:
645+
try:
646+
config_check()
647+
api_instance = client.CustomObjectsApi(api_config_handler())
648+
local_queues = api_instance.list_namespaced_custom_object(
649+
group="kueue.x-k8s.io",
650+
version="v1beta1",
651+
namespace=namespace,
652+
plural="localqueues",
653+
)
654+
except Exception as e: # pragma: no cover
655+
return _kube_api_error_handling(e)
656+
for lq in local_queues["items"]:
657+
if (
658+
"annotations" in lq["metadata"]
659+
and "kueue.x-k8s.io/default-queue" in lq["metadata"]["annotations"]
660+
and lq["metadata"]["annotations"][
661+
"kueue.x-k8s.io/default-queue"
662+
].lower()
663+
== "true"
664+
):
665+
return lq["metadata"]["name"]
666+
raise ValueError(
667+
"Default Local Queue with kueue.x-k8s.io/default-queue: true annotation not found please create a default Local Queue or provide the local_queue name in Cluster Configuration"
668+
)
669+
670+
671+
def write_components(
672+
user_yaml: dict, output_file_name: str, namespace: str, local_queue: str
673+
):
639674
# Create the directory if it doesn't exist
640675
directory_path = os.path.dirname(output_file_name)
641676
if not os.path.exists(directory_path):
@@ -646,6 +681,16 @@ def write_components(user_yaml: dict, output_file_name: str):
646681
with open(output_file_name, "a") as outfile:
647682
for component in components:
648683
if "generictemplate" in component:
684+
if (
685+
"workload.codeflare.dev/appwrapper"
686+
in component["generictemplate"]["metadata"]["labels"]
687+
):
688+
del component["generictemplate"]["metadata"]["labels"][
689+
"workload.codeflare.dev/appwrapper"
690+
]
691+
component["generictemplate"]["metadata"]["labels"][
692+
"kueue.x-k8s.io/queue-name"
693+
] = update_local_kueue_label(local_queue, namespace)
649694
outfile.write("---\n")
650695
yaml.dump(
651696
component["generictemplate"], outfile, default_flow_style=False
@@ -700,6 +745,7 @@ def generate_appwrapper(
700745
ingress_domain: str,
701746
ingress_options: dict,
702747
write_to_file: bool,
748+
local_queue: str,
703749
):
704750
user_yaml = read_template(template)
705751
appwrapper_name, cluster_name = gen_names(name)
@@ -767,7 +813,7 @@ def generate_appwrapper(
767813
if mcad:
768814
write_user_appwrapper(user_yaml, outfile)
769815
else:
770-
write_components(user_yaml, outfile)
816+
write_components(user_yaml, outfile, local_queue)
771817
return outfile
772818
else:
773819
if mcad:

tests/e2e/mnist_raycluster_sdk_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ def run_mnist_raycluster_sdk(self):
7171
image=ray_image,
7272
ingress_options=ingress_options,
7373
write_to_file=True,
74+
mcad=True,
7475
)
7576
)
7677

tests/e2e/start_ray_cluster.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
instascale=False,
3939
image=ray_image,
4040
ingress_options=ingress_options,
41+
mcad=True,
4142
)
4243
)
4344

tests/test-case-no-mcad.yamls

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ metadata:
77
sdk.codeflare.dev/local_interactive: 'False'
88
labels:
99
controller-tools.k8s.io: '1.0'
10-
workload.codeflare.dev/appwrapper: unit-test-cluster-ray
10+
kueue.x-k8s.io/queue-name: local-queue-default
1111
name: unit-test-cluster-ray
1212
namespace: ns
1313
spec:

tests/unit_test.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,8 +306,59 @@ def test_create_app_wrapper_raises_error_with_no_image():
306306
), "Error message did not match expected output."
307307

308308

309+
def get_local_queue(group, version, namespace, plural):
310+
assert group == "kueue.x-k8s.io"
311+
assert version == "v1beta1"
312+
assert namespace == "ns"
313+
assert plural == "localqueues"
314+
local_queues = {
315+
"apiVersion": "kueue.x-k8s.io/v1beta1",
316+
"items": [
317+
{
318+
"apiVersion": "kueue.x-k8s.io/v1beta1",
319+
"kind": "LocalQueue",
320+
"metadata": {
321+
"annotations": {"kueue.x-k8s.io/default-queue": "true"},
322+
"name": "local-queue-default",
323+
"namespace": "ns",
324+
},
325+
"spec": {"clusterQueue": "cluster-queue"},
326+
}
327+
],
328+
"kind": "LocalQueueList",
329+
"metadata": {"continue": "", "resourceVersion": "2266811"},
330+
}
331+
return local_queues
332+
333+
309334
def test_cluster_creation_no_mcad(mocker):
310335
# With written resources
336+
# Create Ray Cluster with no local queue specified
337+
mocker.patch("kubernetes.client.ApisApi.get_api_versions")
338+
mocker.patch(
339+
"kubernetes.client.CustomObjectsApi.get_cluster_custom_object",
340+
return_value={"spec": {"domain": "apps.cluster.awsroute.org"}},
341+
)
342+
mocker.patch(
343+
"kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
344+
return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"),
345+
)
346+
config = createClusterConfig()
347+
config.name = "unit-test-cluster-ray"
348+
config.write_to_file = True
349+
config.mcad = False
350+
cluster = Cluster(config)
351+
assert cluster.app_wrapper_yaml == f"{aw_dir}unit-test-cluster-ray.yaml"
352+
assert cluster.app_wrapper_name == "unit-test-cluster-ray"
353+
assert filecmp.cmp(
354+
f"{aw_dir}unit-test-cluster-ray.yaml",
355+
f"{parent}/tests/test-case-no-mcad.yamls",
356+
shallow=True,
357+
)
358+
359+
360+
def test_cluster_creation_no_mcad_local_queue(mocker):
361+
# Create Ray Cluster with local queue specified
311362
mocker.patch("kubernetes.client.ApisApi.get_api_versions")
312363
mocker.patch(
313364
"kubernetes.client.CustomObjectsApi.get_cluster_custom_object",
@@ -317,6 +368,7 @@ def test_cluster_creation_no_mcad(mocker):
317368
config.name = "unit-test-cluster-ray"
318369
config.mcad = False
319370
config.write_to_file = True
371+
config.local_queue = "local-queue-default"
320372
cluster = Cluster(config)
321373
assert cluster.app_wrapper_yaml == f"{aw_dir}unit-test-cluster-ray.yaml"
322374
assert cluster.app_wrapper_name == "unit-test-cluster-ray"
@@ -392,6 +444,7 @@ def test_default_cluster_creation(mocker):
392444
name="unit-test-default-cluster",
393445
image="quay.io/project-codeflare/ray:latest-py39-cu118",
394446
ingress_domain="apps.cluster.awsroute.org",
447+
mcad=True,
395448
)
396449
cluster = Cluster(default_config)
397450
test_aw = yaml.safe_load(cluster.app_wrapper_yaml)
@@ -503,6 +556,10 @@ def test_cluster_up_down(mocker):
503556

504557

505558
def test_cluster_up_down_no_mcad(mocker):
559+
mocker.patch(
560+
"kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
561+
return_value=get_local_queue("kueue.x-k8s.io", "v1beta1", "ns", "localqueues"),
562+
)
506563
mocker.patch("kubernetes.client.ApisApi.get_api_versions")
507564
mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
508565
mocker.patch(
@@ -860,6 +917,7 @@ def test_ray_details(mocker, capsys):
860917
image="quay.io/project-codeflare/ray:latest-py39-cu118",
861918
ingress_domain="apps.cluster.awsroute.org",
862919
write_to_file=True,
920+
mcad=True,
863921
)
864922
)
865923
captured = capsys.readouterr()
@@ -2391,6 +2449,7 @@ def test_cluster_status(mocker):
23912449
image="quay.io/project-codeflare/ray:latest-py39-cu118",
23922450
ingress_domain="apps.cluster.awsroute.org",
23932451
write_to_file=True,
2452+
mcad=True,
23942453
)
23952454
)
23962455
mocker.patch("codeflare_sdk.cluster.cluster._app_wrapper_status", return_value=None)
@@ -2486,6 +2545,7 @@ def test_wait_ready(mocker, capsys):
24862545
image="quay.io/project-codeflare/ray:latest-py39-cu118",
24872546
ingress_domain="apps.cluster.awsroute.org",
24882547
write_to_file=True,
2548+
mcad=True,
24892549
)
24902550
)
24912551
try:
@@ -3262,6 +3322,7 @@ def test_gen_app_wrapper_with_oauth(mocker: MockerFixture):
32623322
image="quay.io/project-codeflare/ray:latest-py39-cu118",
32633323
ingress_domain="apps.cluster.awsroute.org",
32643324
write_to_file=True,
3325+
mcad=True,
32653326
)
32663327
)
32673328
user_yaml = write_user_appwrapper.call_args.args[0]

tests/unit_test_support.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ def createClusterConfig():
4343
min_memory=5,
4444
max_memory=6,
4545
num_gpus=7,
46+
mcad=True,
4647
instascale=True,
4748
machine_types=["cpu.small", "gpu.large"],
4849
image_pull_secrets=["unit-test-pull-secret"],

0 commit comments

Comments
 (0)