Skip to content

Commit 217b094

Browse files
committed
feat: split head resources for limits and requests
Signed-off-by: Bobbins228 <mcampbel@redhat.com>
1 parent e1c1b97 commit 217b094

File tree

5 files changed: +77 additions, -35 deletions

src/codeflare_sdk/cluster/cluster.py

Lines changed: 32 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -448,6 +448,18 @@ def from_k8_cluster_object(
448448
name=rc["metadata"]["name"],
449449
namespace=rc["metadata"]["namespace"],
450450
machine_types=machine_types,
451+
head_cpu_requests=rc["spec"]["headGroupSpec"]["template"]["spec"][
452+
"containers"
453+
][0]["resources"]["requests"]["cpu"],
454+
head_cpu_limits=rc["spec"]["headGroupSpec"]["template"]["spec"][
455+
"containers"
456+
][0]["resources"]["limits"]["cpu"],
457+
head_memory_requests=rc["spec"]["headGroupSpec"]["template"]["spec"][
458+
"containers"
459+
][0]["resources"]["requests"]["memory"],
460+
head_memory_limits=rc["spec"]["headGroupSpec"]["template"]["spec"][
461+
"containers"
462+
][0]["resources"]["limits"]["memory"],
451463
num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"],
452464
worker_cpu_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
453465
"containers"
@@ -837,23 +849,29 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
837849
status=status,
838850
# for now we are not using autoscaling so same replicas is fine
839851
workers=rc["spec"]["workerGroupSpecs"][0]["replicas"],
840-
worker_mem_max=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
852+
worker_mem_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
841853
"containers"
842854
][0]["resources"]["limits"]["memory"],
843-
worker_mem_min=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
855+
worker_mem_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
844856
"containers"
845857
][0]["resources"]["requests"]["memory"],
846858
worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
847859
0
848860
]["resources"]["limits"]["cpu"],
849861
worker_extended_resources=worker_extended_resources,
850862
namespace=rc["metadata"]["namespace"],
851-
head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
852-
"resources"
853-
]["limits"]["cpu"],
854-
head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
855-
"resources"
856-
]["limits"]["memory"],
863+
head_cpu_requests=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
864+
0
865+
]["resources"]["requests"]["cpu"],
866+
head_cpu_limits=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
867+
0
868+
]["resources"]["limits"]["cpu"],
869+
head_mem_requests=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
870+
0
871+
]["resources"]["requests"]["memory"],
872+
head_mem_limits=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][
873+
0
874+
]["resources"]["limits"]["memory"],
857875
head_extended_resources=head_extended_resources,
858876
dashboard=dashboard_url,
859877
)
@@ -876,14 +894,16 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster:
876894
name=cluster.config.name,
877895
status=cluster.status(print_to_console=False)[0],
878896
workers=cluster.config.num_workers,
879-
worker_mem_min=cluster.config.worker_memory_requests,
880-
worker_mem_max=cluster.config.worker_memory_limits,
897+
worker_mem_requests=cluster.config.worker_memory_requests,
898+
worker_mem_limits=cluster.config.worker_memory_limits,
881899
worker_cpu=cluster.config.worker_cpu_requests,
882900
worker_extended_resources=cluster.config.worker_extended_resource_requests,
883901
namespace=cluster.config.namespace,
884902
dashboard=cluster.cluster_dashboard_uri(),
885-
head_cpus=cluster.config.head_cpus,
886-
head_mem=cluster.config.head_memory,
903+
head_mem_requests=cluster.config.head_memory_requests,
904+
head_mem_limits=cluster.config.head_memory_limits,
905+
head_cpu_requests=cluster.config.head_cpu_requests,
906+
head_cpu_limits=cluster.config.head_cpu_limits,
887907
head_extended_resources=cluster.config.head_extended_resource_requests,
888908
)
889909
if ray.status == CodeFlareClusterStatus.READY:

src/codeflare_sdk/cluster/config.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,16 @@ class ClusterConfiguration:
7575
name: str
7676
namespace: Optional[str] = None
7777
head_info: List[str] = field(default_factory=list)
78-
head_cpus: Union[int, str] = 2
79-
head_memory: Union[int, str] = 8
78+
head_cpu_requests: Union[int, str] = 2
79+
head_cpu_limits: Union[int, str] = 2
80+
head_cpus: Optional[Union[int, str]] = None # Deprecating
81+
head_memory_requests: Union[int, str] = 8
82+
head_memory_limits: Union[int, str] = 8
83+
head_memory: Optional[Union[int, str]] = None # Deprecating
8084
head_gpus: Optional[int] = None # Deprecating
81-
head_extended_resource_requests: Dict[str, int] = field(default_factory=dict)
85+
head_extended_resource_requests: Dict[str, Union[str, int]] = field(
86+
default_factory=dict
87+
)
8288
machine_types: List[str] = field(
8389
default_factory=list
8490
) # ["m4.xlarge", "g4dn.xlarge"]
@@ -100,7 +106,9 @@ class ClusterConfiguration:
100106
write_to_file: bool = False
101107
verify_tls: bool = True
102108
labels: Dict[str, str] = field(default_factory=dict)
103-
worker_extended_resource_requests: Dict[str, int] = field(default_factory=dict)
109+
worker_extended_resource_requests: Dict[str, Union[str, int]] = field(
110+
default_factory=dict
111+
)
104112
extended_resource_mapping: Dict[str, str] = field(default_factory=dict)
105113
overwrite_default_resource_mapping: bool = False
106114
local_queue: Optional[str] = None
@@ -183,14 +191,21 @@ def _str_mem_no_unit_add_GB(self):
183191
self.worker_memory_limits = f"{self.worker_memory_limits}G"
184192

185193
def _memory_to_string(self):
186-
if isinstance(self.head_memory, int):
187-
self.head_memory = f"{self.head_memory}G"
194+
if isinstance(self.head_memory_requests, int):
195+
self.head_memory_requests = f"{self.head_memory_requests}G"
196+
if isinstance(self.head_memory_limits, int):
197+
self.head_memory_limits = f"{self.head_memory_limits}G"
188198
if isinstance(self.worker_memory_requests, int):
189199
self.worker_memory_requests = f"{self.worker_memory_requests}G"
190200
if isinstance(self.worker_memory_limits, int):
191201
self.worker_memory_limits = f"{self.worker_memory_limits}G"
192202

193203
def _cpu_to_resource(self):
204+
if self.head_cpus:
205+
warnings.warn(
206+
"head_cpus is being deprecated, use head_cpu_requests and head_cpu_limits"
207+
)
208+
self.head_cpu_requests = self.head_cpu_limits = self.head_cpus
194209
if self.min_cpus:
195210
warnings.warn("min_cpus is being deprecated, use worker_cpu_requests")
196211
self.worker_cpu_requests = self.min_cpus
@@ -199,6 +214,11 @@ def _cpu_to_resource(self):
199214
self.worker_cpu_limits = self.max_cpus
200215

201216
def _memory_to_resource(self):
217+
if self.head_memory:
218+
warnings.warn(
219+
"head_memory is being deprecated, use head_memory_requests and head_memory_limits"
220+
)
221+
self.head_memory_requests = self.head_memory_limits = self.head_memory
202222
if self.min_memory:
203223
warnings.warn("min_memory is being deprecated, use worker_memory_requests")
204224
self.worker_memory_requests = f"{self.min_memory}G"

src/codeflare_sdk/cluster/model.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,11 +73,13 @@ class RayCluster:
7373

7474
name: str
7575
status: RayClusterStatus
76-
head_cpus: int
77-
head_mem: str
76+
head_cpu_requests: int
77+
head_cpu_limits: int
78+
head_mem_requests: str
79+
head_mem_limits: str
7880
workers: int
79-
worker_mem_min: str
80-
worker_mem_max: str
81+
worker_mem_requests: str
82+
worker_mem_limits: str
8183
worker_cpu: int
8284
namespace: str
8385
dashboard: str

src/codeflare_sdk/utils/generate_yaml.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -115,22 +115,22 @@ def update_env(spec, env):
115115

116116
def update_resources(
117117
spec,
118-
worker_cpu_requests,
119-
worker_cpu_limits,
120-
worker_memory_requests,
121-
worker_memory_limits,
118+
cpu_requests,
119+
cpu_limits,
120+
memory_requests,
121+
memory_limits,
122122
custom_resources,
123123
):
124124
container = spec.get("containers")
125125
for resource in container:
126126
requests = resource.get("resources").get("requests")
127127
if requests is not None:
128-
requests["cpu"] = worker_cpu_requests
129-
requests["memory"] = worker_memory_requests
128+
requests["cpu"] = cpu_requests
129+
requests["memory"] = memory_requests
130130
limits = resource.get("resources").get("limits")
131131
if limits is not None:
132-
limits["cpu"] = worker_cpu_limits
133-
limits["memory"] = worker_memory_limits
132+
limits["cpu"] = cpu_limits
133+
limits["memory"] = memory_limits
134134
for k in custom_resources.keys():
135135
limits[k] = custom_resources[k]
136136
requests[k] = custom_resources[k]
@@ -210,10 +210,10 @@ def update_nodes(
210210
# TODO: Eventually add head node configuration outside of template
211211
update_resources(
212212
spec,
213-
cluster.config.head_cpus,
214-
cluster.config.head_cpus,
215-
cluster.config.head_memory,
216-
cluster.config.head_memory,
213+
cluster.config.head_cpu_requests,
214+
cluster.config.head_cpu_limits,
215+
cluster.config.head_memory_requests,
216+
cluster.config.head_memory_limits,
217217
cluster.config.head_extended_resource_requests,
218218
)
219219
else:

src/codeflare_sdk/utils/pretty_print.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ def print_clusters(clusters: List[RayCluster]):
136136
name = cluster.name
137137
dashboard = cluster.dashboard
138138
workers = str(cluster.workers)
139-
memory = f"{cluster.worker_mem_min}~{cluster.worker_mem_max}"
139+
memory = f"{cluster.worker_mem_requests}~{cluster.worker_mem_limits}"
140140
cpu = str(cluster.worker_cpu)
141141
gpu = str(cluster.worker_extended_resources.get("nvidia.com/gpu", 0))
142142

0 commit comments

Comments
 (0)