diff --git a/.gitignore b/.gitignore index 849ddff3b..404fda5ab 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ dist/ +.python-version diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 9464079a9..8ae4c003e 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -43,8 +43,10 @@ def down(self, namespace='default'): oc.invoke("delete", ["AppWrapper", self.app_wrapper_name]) def status(self, print_to_console=True): - cluster = _ray_cluster_status(self.config.name) + cluster = _ray_cluster_status(self.config.name) if cluster: + #overriding the number of gpus with requested + cluster.worker_gpu = self.config.gpu if print_to_console: pretty_print.print_clusters([cluster]) return cluster.status @@ -92,6 +94,8 @@ def is_ready(self, print_to_console=True): status = CodeFlareClusterStatus.FAILED if print_to_console: + #overriding the number of gpus with requested + cluster.worker_gpu = self.config.gpu pretty_print.print_clusters([cluster]) return status, ready @@ -123,11 +127,16 @@ def _app_wrapper_status(name, namespace='default') -> Optional[AppWrapper]: def _ray_cluster_status(name, namespace='default') -> Optional[RayCluster]: # FIXME should we check the appwrapper first - with oc.project(namespace), oc.timeout(10*60): - cluster = oc.selector(f'rayclusters/{name}').object() - - if cluster: - return _map_to_ray_cluster(cluster) + cluster = None + try: + with oc.project(namespace), oc.timeout(10*60): + cluster = oc.selector(f'rayclusters/{name}').object() + + if cluster: + return _map_to_ray_cluster(cluster) + except: + pass + return cluster def _get_ray_clusters(namespace='default') -> List[RayCluster]: @@ -161,6 +170,7 @@ def _map_to_ray_cluster(cluster) -> RayCluster: cluster_model = cluster.model return RayCluster( name=cluster.name(), status=RayClusterStatus(cluster_model.status.state.lower()), + #for now we are not using autoscaling so same replicas is fine 
min_workers=cluster_model.spec.workerGroupSpecs[0].replicas, max_workers=cluster_model.spec.workerGroupSpecs[0].replicas, worker_mem_max=cluster_model.spec.workerGroupSpecs[ @@ -168,7 +178,8 @@ def _map_to_ray_cluster(cluster) -> RayCluster: worker_mem_min=cluster_model.spec.workerGroupSpecs[ 0].template.spec.containers[0].resources.requests.memory, worker_cpu=cluster_model.spec.workerGroupSpecs[0].template.spec.containers[0].resources.limits.cpu, - worker_gpu=0) + worker_gpu=0, #hard to detect currently how many gpus, can override it with what the user asked for + namespace=cluster.namespace()) def _map_to_app_wrapper(cluster) -> AppWrapper: diff --git a/src/codeflare_sdk/cluster/model.py b/src/codeflare_sdk/cluster/model.py index 5ce657a01..0459d841f 100644 --- a/src/codeflare_sdk/cluster/model.py +++ b/src/codeflare_sdk/cluster/model.py @@ -31,6 +31,7 @@ class RayCluster: worker_mem_max: str worker_cpu: int worker_gpu: int + namespace: str @dataclass class AppWrapper: diff --git a/src/codeflare_sdk/utils/pretty_print.py b/src/codeflare_sdk/utils/pretty_print.py index e721294c1..b4bae44b7 100644 --- a/src/codeflare_sdk/utils/pretty_print.py +++ b/src/codeflare_sdk/utils/pretty_print.py @@ -22,10 +22,11 @@ def print_app_wrappers_status(app_wrappers:List[AppWrapper]): name = app_wrapper.name status = app_wrapper.status.value - table = Table(box=None, title="[bold] :rocket: List of CodeFlare clusters in queue:rocket:") + table = Table(box=box.ASCII_DOUBLE_HEAD, title="[bold] :rocket: List of CodeFlare clusters in queue:rocket:") + table.add_row("") #empty row for spacing table.add_column("Name", style="cyan", no_wrap=True) table.add_column("Status", style="magenta") - table.add_row("[bold underline]"+name,status) + table.add_row(name,status) table.add_row("") #empty row for spacing console.print(Panel.fit(table)) @@ -47,7 +48,7 @@ def print_clusters(clusters:List[RayCluster], verbose=True): maxcount = str(cluster.max_workers) memory = 
cluster.worker_mem_min+"~"+cluster.worker_mem_max cpu = str(cluster.worker_cpu) - gpu = str(cluster.worker_mem_max) + gpu = str(cluster.worker_gpu) #owned = bool(cluster["userOwned"]) owned = True @@ -59,7 +60,8 @@ def print_clusters(clusters:List[RayCluster], verbose=True): table0.add_row("") table0.add_row("[bold underline]"+name,status) table0.add_row() - table0.add_row(f"[bold]URI:[/bold] ray://{name}-head-svc:1001") #format that is used to generate the name of the service + #fixme hardcoded to default for now + table0.add_row(f"[bold]URI:[/bold] ray://{cluster.name}-head-svc.{cluster.namespace}.svc:10001") #format that is used to generate the name of the service table0.add_row() table0.add_row(f"[link={dashboard} blue underline]Dashboard:link:[/link]") table0.add_row("") #empty row for spacing