Skip to content

Commit f086f67

Browse files
committed
feat: split head resources for limits and requests
Signed-off-by: Bobbins228 <mcampbel@redhat.com>
1 parent 9047a4c commit f086f67

File tree

5 files changed, with 77 additions (+77) and 35 deletions (-35)

src/codeflare_sdk/cluster/cluster.py

Lines changed: 32 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -448,6 +448,18 @@ def from_k8_cluster_object(
448448
name=rc["metadata"]["name"],
449449
namespace=rc["metadata"]["namespace"],
450450
machine_types=machine_types,
451+
head_cpu_requests=rc["spec"]["headGroupSpec"]["template"]["spec"][
452+
"containers"
453+
][0]["resources"]["requests"]["cpu"],
454+
head_cpu_limits=rc["spec"]["headGroupSpec"]["template"]["spec"][
455+
"containers"
456+
][0]["resources"]["limits"]["cpu"],
457+
head_memory_requests=rc["spec"]["headGroupSpec"]["template"]["spec"][
458+
"containers"
459+
][0]["resources"]["requests"]["memory"],
460+
head_memory_limits=rc["spec"]["headGroupSpec"]["template"]["spec"][
461+
"containers"
462+
][0]["resources"]["limits"]["memory"],
451463
num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"],
452464
worker_cpu_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
453465
"containers"
@@ -853,23 +865,29 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
853865
status=status,
854866
# for now we are not using autoscaling so same replicas is fine
855867
workers=rc["spec"]["workerGroupSpecs"][0]["replicas"],
856-
worker_mem_max=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
868+
worker_mem_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
857869
"containers"
858870
][0]["resources"]["limits"]["memory"],
859-
worker_mem_min=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
871+
worker_mem_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
860872
"containers"
861873
][0]["resources"]["requests"]["memory"],
862874
worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
863875
0
864876
]["resources"]["limits"]["cpu"],
865877
worker_extended_resources=worker_extended_resources,
866878
namespace=rc["metadata"]["namespace"],
867-
head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
868-
"resources"
869-
]["limits"]["cpu"],
870-
head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
871-
"resources"
872-
]["limits"]["memory"],
879+
head_cpu_requests=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
880+
0
881+
]["resources"]["requests"]["cpu"],
882+
head_cpu_limits=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
883+
0
884+
]["resources"]["limits"]["cpu"],
885+
head_mem_requests=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
886+
0
887+
]["resources"]["requests"]["memory"],
888+
head_mem_limits=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
889+
0
890+
]["resources"]["limits"]["memory"],
873891
head_extended_resources=head_extended_resources,
874892
dashboard=dashboard_url,
875893
)
@@ -892,14 +910,16 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster:
892910
name=cluster.config.name,
893911
status=cluster.status(print_to_console=False)[0],
894912
workers=cluster.config.num_workers,
895-
worker_mem_min=cluster.config.worker_memory_requests,
896-
worker_mem_max=cluster.config.worker_memory_limits,
913+
worker_mem_requests=cluster.config.worker_memory_requests,
914+
worker_mem_limits=cluster.config.worker_memory_limits,
897915
worker_cpu=cluster.config.worker_cpu_requests,
898916
worker_extended_resources=cluster.config.worker_extended_resource_requests,
899917
namespace=cluster.config.namespace,
900918
dashboard=cluster.cluster_dashboard_uri(),
901-
head_cpus=cluster.config.head_cpus,
902-
head_mem=cluster.config.head_memory,
919+
head_mem_requests=cluster.config.head_memory_requests,
920+
head_mem_limits=cluster.config.head_memory_limits,
921+
head_cpu_requests=cluster.config.head_cpu_requests,
922+
head_cpu_limits=cluster.config.head_cpu_limits,
903923
head_extended_resources=cluster.config.head_extended_resource_requests,
904924
)
905925
if ray.status == CodeFlareClusterStatus.READY:

src/codeflare_sdk/cluster/config.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,16 @@ class ClusterConfiguration:
7575
name: str
7676
namespace: Optional[str] = None
7777
head_info: List[str] = field(default_factory=list)
78-
head_cpus: Union[int, str] = 2
79-
head_memory: Union[int, str] = 8
78+
head_cpu_requests: Union[int, str] = 2
79+
head_cpu_limits: Union[int, str] = 2
80+
head_cpus: Optional[Union[int, str]] = None # Deprecating
81+
head_memory_requests: Union[int, str] = 8
82+
head_memory_limits: Union[int, str] = 8
83+
head_memory: Optional[Union[int, str]] = None # Deprecating
8084
head_gpus: Optional[int] = None # Deprecating
81-
head_extended_resource_requests: Dict[str, int] = field(default_factory=dict)
85+
head_extended_resource_requests: Dict[str, Union[str, int]] = field(
86+
default_factory=dict
87+
)
8288
machine_types: List[str] = field(
8389
default_factory=list
8490
) # ["m4.xlarge", "g4dn.xlarge"]
@@ -100,7 +106,9 @@ class ClusterConfiguration:
100106
write_to_file: bool = False
101107
verify_tls: bool = True
102108
labels: Dict[str, str] = field(default_factory=dict)
103-
worker_extended_resource_requests: Dict[str, int] = field(default_factory=dict)
109+
worker_extended_resource_requests: Dict[str, Union[str, int]] = field(
110+
default_factory=dict
111+
)
104112
extended_resource_mapping: Dict[str, str] = field(default_factory=dict)
105113
overwrite_default_resource_mapping: bool = False
106114
local_queue: Optional[str] = None
@@ -183,14 +191,21 @@ def _str_mem_no_unit_add_GB(self):
183191
self.worker_memory_limits = f"{self.worker_memory_limits}G"
184192

185193
def _memory_to_string(self):
186-
if isinstance(self.head_memory, int):
187-
self.head_memory = f"{self.head_memory}G"
194+
if isinstance(self.head_memory_requests, int):
195+
self.head_memory_requests = f"{self.head_memory_requests}G"
196+
if isinstance(self.head_memory_limits, int):
197+
self.head_memory_limits = f"{self.head_memory_limits}G"
188198
if isinstance(self.worker_memory_requests, int):
189199
self.worker_memory_requests = f"{self.worker_memory_requests}G"
190200
if isinstance(self.worker_memory_limits, int):
191201
self.worker_memory_limits = f"{self.worker_memory_limits}G"
192202

193203
def _cpu_to_resource(self):
204+
if self.head_cpus:
205+
warnings.warn(
206+
"head_cpus is being deprecated, use head_cpu_requests and head_cpu_limits"
207+
)
208+
self.head_cpu_requests = self.head_cpu_limits = self.head_cpus
194209
if self.min_cpus:
195210
warnings.warn("min_cpus is being deprecated, use worker_cpu_requests")
196211
self.worker_cpu_requests = self.min_cpus
@@ -199,6 +214,11 @@ def _cpu_to_resource(self):
199214
self.worker_cpu_limits = self.max_cpus
200215

201216
def _memory_to_resource(self):
217+
if self.head_memory:
218+
warnings.warn(
219+
"head_memory is being deprecated, use head_memory_requests and head_memory_limits"
220+
)
221+
self.head_memory_requests = self.head_memory_limits = self.head_memory
202222
if self.min_memory:
203223
warnings.warn("min_memory is being deprecated, use worker_memory_requests")
204224
self.worker_memory_requests = f"{self.min_memory}G"

src/codeflare_sdk/cluster/model.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,11 +73,13 @@ class RayCluster:
7373

7474
name: str
7575
status: RayClusterStatus
76-
head_cpus: int
77-
head_mem: str
76+
head_cpu_requests: int
77+
head_cpu_limits: int
78+
head_mem_requests: str
79+
head_mem_limits: str
7880
workers: int
79-
worker_mem_min: str
80-
worker_mem_max: str
81+
worker_mem_requests: str
82+
worker_mem_limits: str
8183
worker_cpu: int
8284
namespace: str
8385
dashboard: str

src/codeflare_sdk/utils/generate_yaml.py

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -115,22 +115,22 @@ def update_env(spec, env):
115115

116116
def update_resources(
117117
spec,
118-
worker_cpu_requests,
119-
worker_cpu_limits,
120-
worker_memory_requests,
121-
worker_memory_limits,
118+
cpu_requests,
119+
cpu_limits,
120+
memory_requests,
121+
memory_limits,
122122
custom_resources,
123123
):
124124
container = spec.get("containers")
125125
for resource in container:
126126
requests = resource.get("resources").get("requests")
127127
if requests is not None:
128-
requests["cpu"] = worker_cpu_requests
129-
requests["memory"] = worker_memory_requests
128+
requests["cpu"] = cpu_requests
129+
requests["memory"] = memory_requests
130130
limits = resource.get("resources").get("limits")
131131
if limits is not None:
132-
limits["cpu"] = worker_cpu_limits
133-
limits["memory"] = worker_memory_limits
132+
limits["cpu"] = cpu_limits
133+
limits["memory"] = memory_limits
134134
for k in custom_resources.keys():
135135
limits[k] = custom_resources[k]
136136
requests[k] = custom_resources[k]
@@ -210,10 +210,10 @@ def update_nodes(
210210
# TODO: Eventually add head node configuration outside of template
211211
update_resources(
212212
spec,
213-
cluster.config.head_cpus,
214-
cluster.config.head_cpus,
215-
cluster.config.head_memory,
216-
cluster.config.head_memory,
213+
cluster.config.head_cpu_requests,
214+
cluster.config.head_cpu_limits,
215+
cluster.config.head_memory_requests,
216+
cluster.config.head_memory_limits,
217217
cluster.config.head_extended_resource_requests,
218218
)
219219
else:

src/codeflare_sdk/utils/pretty_print.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ def print_clusters(clusters: List[RayCluster]):
136136
name = cluster.name
137137
dashboard = cluster.dashboard
138138
workers = str(cluster.workers)
139-
memory = f"{cluster.worker_mem_min}~{cluster.worker_mem_max}"
139+
memory = f"{cluster.worker_mem_requests}~{cluster.worker_mem_limits}"
140140
cpu = str(cluster.worker_cpu)
141141
gpu = str(cluster.worker_extended_resources.get("nvidia.com/gpu", 0))
142142

0 commit comments

Comments
 (0)