diff --git a/demo-notebooks/batch-job/batch_mnist.ipynb b/demo-notebooks/batch-job/batch_mnist.ipynb index 6512c9be1..0b0c25f61 100644 --- a/demo-notebooks/batch-job/batch_mnist.ipynb +++ b/demo-notebooks/batch-job/batch_mnist.ipynb @@ -24,7 +24,8 @@ " token = \"XXXX\",\n", " server = \"XXXX\",\n", " skip_tls=True\n", - ")" + ")\n", + "auth.login()" ] }, { @@ -43,7 +44,7 @@ "outputs": [], "source": [ "# Create our cluster and submit appwrapper\n", - "cluster = Cluster(ClusterConfiguration(name='mnisttest', min_worker=2, max_worker=2, min_cpus=8, max_cpus=8, min_memory=16, max_memory=16, gpu=4, instascale=True, machine_types=[\"m5.xlarge\", \"p3.8xlarge\"], auth=auth))" + "cluster = Cluster(ClusterConfiguration(name='mnisttest', min_worker=2, max_worker=2, min_cpus=8, max_cpus=8, min_memory=16, max_memory=16, gpu=4, instascale=True, machine_types=[\"m5.xlarge\", \"p3.8xlarge\"]))" ] }, { @@ -66,11 +67,12 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "657ebdfb", "metadata": {}, "source": [ - "Now, we want to check on the status of our resource cluster, until it is finally ready for use." + "Now, we want to check on the status of our resource cluster, and wait until it is finally ready for use." ] }, { @@ -122,7 +124,36 @@ } ], "source": [ - "cluster.is_ready()" + "cluster.status()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a99d5aff", + "metadata": {}, + "outputs": [], + "source": [ + "cluster.wait_ready()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df71c1ed", + "metadata": {}, + "outputs": [], + "source": [ + "cluster.status()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b3a55fe4", + "metadata": {}, + "source": [ + "Let's quickly verify that the specs of the cluster are as expected." 
] }, { @@ -190,7 +221,7 @@ } ], "source": [ - "cluster.status()" + "cluster.details()" ] }, { @@ -1994,6 +2025,16 @@ "source": [ "cluster.down()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d41b90e", + "metadata": {}, + "outputs": [], + "source": [ + "auth.logout()" + ] } ], "metadata": { @@ -2012,7 +2053,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.13" }, "vscode": { "interpreter": { diff --git a/demo-notebooks/interactive/hf_interactive.ipynb b/demo-notebooks/interactive/hf_interactive.ipynb index b107d1ac3..ed7748216 100644 --- a/demo-notebooks/interactive/hf_interactive.ipynb +++ b/demo-notebooks/interactive/hf_interactive.ipynb @@ -53,12 +53,13 @@ "metadata": {}, "outputs": [], "source": [ - "# Create authentication object for oc user permissions\n", + "# Create authentication object for oc user permissions and login\n", "auth = TokenAuthentication(\n", " token = \"XXXX\",\n", " server = \"XXXX\",\n", " skip_tls = True\n", - ")" + ")\n", + "auth.login()" ] }, { @@ -85,7 +86,7 @@ ], "source": [ "# Create our cluster and submit appwrapper\n", - "cluster = Cluster(ClusterConfiguration(name='hfgputest', min_worker=1, max_worker=1, min_cpus=8, max_cpus=8, min_memory=16, max_memory=16, gpu=4, instascale=True, machine_types=[\"m5.xlarge\", \"p3.8xlarge\"], auth=auth))" + "cluster = Cluster(ClusterConfiguration(name='hfgputest', min_worker=1, max_worker=1, min_cpus=8, max_cpus=8, min_memory=16, max_memory=16, gpu=4, instascale=True, machine_types=[\"m5.xlarge\", \"p3.8xlarge\"]))" ] }, { @@ -107,11 +108,12 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "657ebdfb", "metadata": {}, "source": [ - "Now, we want to check on the status of our resource cluster, until it is finally ready for use." + "Now, we want to check on the initial status of our resource cluster, then wait until it is finally ready for use." 
] }, { @@ -163,7 +165,36 @@ } ], "source": [ - "cluster.is_ready()" + "cluster.status()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d26275e", + "metadata": {}, + "outputs": [], + "source": [ + "cluster.wait_ready()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2969a4b", + "metadata": {}, + "outputs": [], + "source": [ + "cluster.status()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "477ac246", + "metadata": {}, + "source": [ + "Let's quickly verify that the specs of the cluster are as expected." ] }, { @@ -231,7 +262,7 @@ } ], "source": [ - "cluster.status()" + "cluster.details()" ] }, { @@ -1379,6 +1410,16 @@ "cluster.down()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "61bf4946", + "metadata": {}, + "outputs": [], + "source": [ + "auth.logout()" + ] + }, { "cell_type": "markdown", "id": "2b7a183b-5e8e-4adb-b9a6-a349e13512a0", @@ -1416,7 +1457,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.13" }, "vscode": { "interpreter": { diff --git a/docs/cluster/auth.html b/docs/cluster/auth.html index 01538a024..bda78622a 100644 --- a/docs/cluster/auth.html +++ b/docs/cluster/auth.html @@ -92,7 +92,7 @@

Module codeflare_sdk.cluster.auth

self.server = server self.skip_tls = skip_tls - def login(self): + def login(self) -> str: """ This function is used to login to an OpenShift cluster using the user's API token and API server address. Depending on the cluster, a user can choose to login in with "--insecure-skip-tls-verify` by setting `skip_tls` @@ -107,15 +107,18 @@

Module codeflare_sdk.cluster.auth

error_msg = osp.result.err() if "The server uses a certificate signed by unknown authority" in error_msg: return "Error: certificate auth failure, please set `skip_tls=True` in TokenAuthentication" + elif "invalid" in error_msg: + raise PermissionError(error_msg) else: return error_msg return response.out() - def logout(self): + def logout(self) -> str: """ This function is used to logout of an OpenShift cluster. """ - response = oc.invoke("logout") + args = [f"--token={self.token}", f"--server={self.server}:6443"] + response = oc.invoke("logout", args) return response.out() @@ -137,14 +140,14 @@

Module codeflare_sdk.cluster.auth

self.username = username self.password = password - def login(self): + def login(self) -> str: """ This function is used to login to an OpenShift cluster using the user's `username` and `password`. """ response = oc.login(self.username, self.password) return response.out() - def logout(self): + def logout(self) -> str: """ This function is used to logout of an OpenShift cluster. """ @@ -261,14 +264,14 @@

Methods

self.username = username self.password = password - def login(self): + def login(self) -> str: """ This function is used to login to an OpenShift cluster using the user's `username` and `password`. """ response = oc.login(self.username, self.password) return response.out() - def logout(self): + def logout(self) -> str: """ This function is used to logout of an OpenShift cluster. """ @@ -282,7 +285,7 @@

Ancestors

Methods

-def login(self) +def login(self) ‑> str

This function is used to login to an OpenShift cluster using the user's username and password.

@@ -290,7 +293,7 @@

Methods

Expand source code -
def login(self):
+
def login(self) -> str:
     """
     This function is used to login to an OpenShift cluster using the user's `username` and `password`.
     """
@@ -299,7 +302,7 @@ 

Methods

-def logout(self) +def logout(self) ‑> str

This function is used to logout of an OpenShift cluster.

@@ -307,7 +310,7 @@

Methods

Expand source code -
def logout(self):
+
def logout(self) -> str:
     """
     This function is used to logout of an OpenShift cluster.
     """
@@ -346,7 +349,7 @@ 

Methods

self.server = server self.skip_tls = skip_tls - def login(self): + def login(self) -> str: """ This function is used to login to an OpenShift cluster using the user's API token and API server address. Depending on the cluster, a user can choose to login in with "--insecure-skip-tls-verify` by setting `skip_tls` @@ -361,15 +364,18 @@

Methods

error_msg = osp.result.err() if "The server uses a certificate signed by unknown authority" in error_msg: return "Error: certificate auth failure, please set `skip_tls=True` in TokenAuthentication" + elif "invalid" in error_msg: + raise PermissionError(error_msg) else: return error_msg return response.out() - def logout(self): + def logout(self) -> str: """ This function is used to logout of an OpenShift cluster. """ - response = oc.invoke("logout") + args = [f"--token={self.token}", f"--server={self.server}:6443"] + response = oc.invoke("logout", args) return response.out()

Ancestors

@@ -379,7 +385,7 @@

Ancestors

Methods

-def login(self) +def login(self) ‑> str

This function is used to login to an OpenShift cluster using the user's API token and API server address. @@ -389,7 +395,7 @@

Methods

Expand source code -
def login(self):
+
def login(self) -> str:
     """
     This function is used to login to an OpenShift cluster using the user's API token and API server address.
     Depending on the cluster, a user can choose to login in with "--insecure-skip-tls-verify` by setting `skip_tls`
@@ -404,13 +410,15 @@ 

Methods

error_msg = osp.result.err() if "The server uses a certificate signed by unknown authority" in error_msg: return "Error: certificate auth failure, please set `skip_tls=True` in TokenAuthentication" + elif "invalid" in error_msg: + raise PermissionError(error_msg) else: return error_msg return response.out()
-def logout(self) +def logout(self) ‑> str

This function is used to logout of an OpenShift cluster.

@@ -418,11 +426,12 @@

Methods

Expand source code -
def logout(self):
+
def logout(self) -> str:
     """
     This function is used to logout of an OpenShift cluster.
     """
-    response = oc.invoke("logout")
+    args = [f"--token={self.token}", f"--server={self.server}:6443"]
+    response = oc.invoke("logout", args)
     return response.out()
diff --git a/docs/cluster/cluster.html b/docs/cluster/cluster.html index 5f3a6aad9..fda90debe 100644 --- a/docs/cluster/cluster.html +++ b/docs/cluster/cluster.html @@ -51,6 +51,7 @@

Module codeflare_sdk.cluster.cluster

""" from os import stat +from time import sleep from typing import List, Optional, Tuple import openshift as oc @@ -127,10 +128,17 @@

Module codeflare_sdk.cluster.cluster

Applies the AppWrapper yaml, pushing the resource request onto the MCAD queue. """ - self.config.auth.login() namespace = self.config.namespace - with oc.project(namespace): - oc.invoke("apply", ["-f", self.app_wrapper_yaml]) + try: + with oc.project(namespace): + oc.invoke("apply", ["-f", self.app_wrapper_yaml]) + except oc.OpenShiftPythonException as osp: # pragma: no cover + error_msg = osp.result.err() + if "Unauthorized" in error_msg: + raise PermissionError( + "Action not permitted, have you put in correct/up-to-date auth credentials?" + ) + raise osp def down(self): """ @@ -138,54 +146,31 @@

Module codeflare_sdk.cluster.cluster

associated with the cluster. """ namespace = self.config.namespace - with oc.project(namespace): - oc.invoke("delete", ["AppWrapper", self.app_wrapper_name]) - self.config.auth.logout() - - def status(self, print_to_console: bool = True): - """ - TO BE UPDATED: Will soon return (and print by default) the cluster's - status, from AppWrapper submission to setup completion. All resource - details will be moved to cluster.details(). - """ - cluster = _ray_cluster_status(self.config.name, self.config.namespace) - if cluster: - # overriding the number of gpus with requested - cluster.worker_gpu = self.config.gpu - if print_to_console: - pretty_print.print_clusters([cluster]) - return cluster.status - else: - if print_to_console: - pretty_print.print_no_resources_found() - return None - - def cluster_uri(self) -> str: - """ - Returns a string containing the cluster's URI. - """ - return f"ray://{self.config.name}-head-svc.{self.config.namespace}.svc:10001" - - def cluster_dashboard_uri(self, namespace: str = "default") -> str: - """ - Returns a string containing the cluster's dashboard URI. - """ try: with oc.project(namespace): - route = oc.invoke( - "get", ["route", "-o", "jsonpath='{$.items[*].spec.host}'"] + oc.invoke("delete", ["AppWrapper", self.app_wrapper_name]) + except oc.OpenShiftPythonException as osp: # pragma: no cover + error_msg = osp.result.err() + if ( + 'the server doesn\'t have a resource type "AppWrapper"' in error_msg + or "forbidden" in error_msg + or "Unauthorized" in error_msg + or "Missing or incomplete configuration" in error_msg + ): + raise PermissionError( + "Action not permitted, have you run cluster.up() yet?" ) - route = route.out().split(" ") - route = [x for x in route if f"ray-dashboard-{self.config.name}" in x] - route = route[0].strip().strip("'") - return f"http://{route}" - except: - return "Dashboard route not available yet. Did you run cluster.up()?" 
+ elif "not found" in error_msg: + print("Cluster not found, have you run cluster.up() yet?") + else: + raise osp - # checks whether the ray cluster is ready - def is_ready(self, print_to_console: bool = True): + def status( + self, print_to_console: bool = True + ) -> Tuple[CodeFlareClusterStatus, bool]: """ - TO BE DEPRECATED: functionality will be added into cluster.status(). + Returns the requested cluster's status, as well as whether or not + it is ready for use. """ ready = False status = CodeFlareClusterStatus.UNKNOWN @@ -198,27 +183,27 @@

Module codeflare_sdk.cluster.cluster

AppWrapperStatus.RUNNING_HOLD_COMPLETION, ]: ready = False - status = CodeFlareClusterStatus.QUEUED + status = CodeFlareClusterStatus.STARTING elif appwrapper.status in [ AppWrapperStatus.FAILED, AppWrapperStatus.DELETED, ]: ready = False status = CodeFlareClusterStatus.FAILED # should deleted be separate - return ready, status # exit early, no need to check ray status + return status, ready # exit early, no need to check ray status elif appwrapper.status in [AppWrapperStatus.PENDING]: ready = False status = CodeFlareClusterStatus.QUEUED if print_to_console: pretty_print.print_app_wrappers_status([appwrapper]) return ( - ready, status, + ready, ) # no need to check the ray status since still in queue # check the ray cluster status cluster = _ray_cluster_status(self.config.name, self.config.namespace) - if cluster: + if cluster and not cluster.status == RayClusterStatus.UNKNOWN: if cluster.status == RayClusterStatus.READY: ready = True status = CodeFlareClusterStatus.READY @@ -232,14 +217,70 @@

Module codeflare_sdk.cluster.cluster

if print_to_console: # overriding the number of gpus with requested cluster.worker_gpu = self.config.gpu - pretty_print.print_clusters([cluster]) + pretty_print.print_cluster_status(cluster) + elif print_to_console: + if status == CodeFlareClusterStatus.UNKNOWN: + pretty_print.print_no_resources_found() + else: + pretty_print.print_app_wrappers_status([appwrapper]) + return status, ready + def wait_ready(self, timeout: Optional[int] = None): + """ + Waits for requested cluster to be ready, up to an optional timeout (s). + Checks every five seconds. + """ + print("Waiting for requested resources to be set up...") + ready = False + status = None + time = 0 + while not ready: + status, ready = self.status(print_to_console=False) + if status == CodeFlareClusterStatus.UNKNOWN: + print( + "WARNING: Current cluster status is unknown, have you run cluster.up yet?" + ) + if not ready: + if timeout and time >= timeout: + raise TimeoutError(f"wait() timed out after waiting {timeout}s") + sleep(5) + time += 5 + print("Requested cluster up and running!") + + def details(self, print_to_console: bool = True) -> RayCluster: + cluster = _copy_to_ray(self) + if print_to_console: + pretty_print.print_clusters([cluster]) + return cluster + + def cluster_uri(self) -> str: + """ + Returns a string containing the cluster's URI. + """ + return f"ray://{self.config.name}-head-svc.{self.config.namespace}.svc:10001" + + def cluster_dashboard_uri(self) -> str: + """ + Returns a string containing the cluster's dashboard URI. + """ + try: + with oc.project(self.config.namespace): + route = oc.invoke( + "get", ["route", "-o", "jsonpath='{$.items[*].spec.host}'"] + ) + route = route.out().split(" ") + route = [x for x in route if f"ray-dashboard-{self.config.name}" in x] + route = route[0].strip().strip("'") + return f"http://{route}" + except: + return "Dashboard route not available yet, have you run cluster.up()?" 
+ def list_jobs(self) -> List: """ This method accesses the head ray node in your cluster and lists the running jobs. """ - dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace) + dashboard_route = self.cluster_dashboard_uri() client = JobSubmissionClient(dashboard_route) return client.list_jobs() @@ -247,7 +288,7 @@

Module codeflare_sdk.cluster.cluster

""" This method accesses the head ray node in your cluster and returns the job status for the provided job id. """ - dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace) + dashboard_route = self.cluster_dashboard_uri() client = JobSubmissionClient(dashboard_route) return client.get_job_status(job_id) @@ -255,7 +296,7 @@

Module codeflare_sdk.cluster.cluster

""" This method accesses the head ray node in your cluster and returns the logs for the provided job id. """ - dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace) + dashboard_route = self.cluster_dashboard_uri() client = JobSubmissionClient(dashboard_route) return client.get_job_logs(job_id) @@ -264,7 +305,19 @@

Module codeflare_sdk.cluster.cluster

""" Returns the user's current working namespace. """ - namespace = oc.invoke("project", ["-q"]).actions()[0].out.strip() + try: + namespace = oc.invoke("project", ["-q"]).actions()[0].out.strip() + except oc.OpenShiftPythonException as osp: # pragma: no cover + error_msg = osp.result.err() + if ( + "do not have rights" in error_msg + or "Missing or incomplete configuration" in error_msg + ): + raise PermissionError( + "Action not permitted, have you run auth.login() or cluster.up()?" + ) + else: + raise osp return namespace @@ -295,31 +348,71 @@

Module codeflare_sdk.cluster.cluster

def _app_wrapper_status(name, namespace="default") -> Optional[AppWrapper]: - with oc.project(namespace), oc.timeout(10 * 60): - cluster = oc.selector(f"appwrapper/{name}").object() + cluster = None + try: + with oc.project(namespace), oc.timeout(10 * 60): + cluster = oc.selector(f"appwrapper/{name}").object() + except oc.OpenShiftPythonException as osp: # pragma: no cover + msg = osp.msg + if "Expected a single object, but selected 0" in msg: + return cluster + error_msg = osp.result.err() + if not ( + 'the server doesn\'t have a resource type "appwrapper"' in error_msg + or "forbidden" in error_msg + or "Unauthorized" in error_msg + or "Missing or incomplete configuration" in error_msg + ): + raise osp + if cluster: return _map_to_app_wrapper(cluster) + return cluster + def _ray_cluster_status(name, namespace="default") -> Optional[RayCluster]: - # FIXME should we check the appwrapper first cluster = None try: with oc.project(namespace), oc.timeout(10 * 60): cluster = oc.selector(f"rayclusters/{name}").object() + except oc.OpenShiftPythonException as osp: # pragma: no cover + msg = osp.msg + if "Expected a single object, but selected 0" in msg: + return cluster + error_msg = osp.result.err() + if not ( + 'the server doesn\'t have a resource type "rayclusters"' in error_msg + or "forbidden" in error_msg + or "Unauthorized" in error_msg + or "Missing or incomplete configuration" in error_msg + ): + raise osp + + if cluster: + return _map_to_ray_cluster(cluster) - if cluster: - return _map_to_ray_cluster(cluster) - except: - pass return cluster def _get_ray_clusters(namespace="default") -> List[RayCluster]: list_of_clusters = [] - - with oc.project(namespace), oc.timeout(10 * 60): - ray_clusters = oc.selector("rayclusters").objects() + try: + with oc.project(namespace), oc.timeout(10 * 60): + ray_clusters = oc.selector("rayclusters").objects() + except oc.OpenShiftPythonException as osp: # pragma: no cover + error_msg = osp.result.err() + if ( + 'the server doesn\'t 
have a resource type "rayclusters"' in error_msg + or "forbidden" in error_msg + or "Unauthorized" in error_msg + or "Missing or incomplete configuration" in error_msg + ): + raise PermissionError( + "Action not permitted, have you put in correct/up-to-date auth credentials?" + ) + else: + raise osp for cluster in ray_clusters: list_of_clusters.append(_map_to_ray_cluster(cluster)) @@ -331,20 +424,39 @@

Module codeflare_sdk.cluster.cluster

) -> List[AppWrapper]: list_of_app_wrappers = [] - with oc.project(namespace), oc.timeout(10 * 60): - app_wrappers = oc.selector("appwrappers").objects() + try: + with oc.project(namespace), oc.timeout(10 * 60): + app_wrappers = oc.selector("appwrappers").objects() + except oc.OpenShiftPythonException as osp: # pragma: no cover + error_msg = osp.result.err() + if ( + 'the server doesn\'t have a resource type "appwrappers"' in error_msg + or "forbidden" in error_msg + or "Unauthorized" in error_msg + or "Missing or incomplete configuration" in error_msg + ): + raise PermissionError( + "Action not permitted, have you put in correct/up-to-date auth credentials?" + ) + else: + raise osp for item in app_wrappers: app_wrapper = _map_to_app_wrapper(item) if filter and app_wrapper.status in filter: list_of_app_wrappers.append(app_wrapper) else: + # Unsure what the purpose of the filter is list_of_app_wrappers.append(app_wrapper) return list_of_app_wrappers -def _map_to_ray_cluster(cluster) -> RayCluster: +def _map_to_ray_cluster(cluster) -> Optional[RayCluster]: cluster_model = cluster.model + if type(cluster_model.status.state) == oc.model.MissingModel: + status = RayClusterStatus.UNKNOWN + else: + status = RayClusterStatus(cluster_model.status.state.lower()) with oc.project(cluster.namespace()), oc.timeout(10 * 60): route = ( @@ -355,7 +467,7 @@

Module codeflare_sdk.cluster.cluster

return RayCluster( name=cluster.name(), - status=RayClusterStatus(cluster_model.status.state.lower()), + status=status, # for now we are not using autoscaling so same replicas is fine min_workers=cluster_model.spec.workerGroupSpecs[0].replicas, max_workers=cluster_model.spec.workerGroupSpecs[0].replicas, @@ -381,7 +493,25 @@

Module codeflare_sdk.cluster.cluster

status=AppWrapperStatus(cluster_model.status.state.lower()), can_run=cluster_model.status.canrun, job_state=cluster_model.status.queuejobstate, - )
+ ) + + +def _copy_to_ray(cluster: Cluster) -> RayCluster: + ray = RayCluster( + name=cluster.config.name, + status=cluster.status(print_to_console=False)[0], + min_workers=cluster.config.min_worker, + max_workers=cluster.config.max_worker, + worker_mem_min=cluster.config.min_memory, + worker_mem_max=cluster.config.max_memory, + worker_cpu=cluster.config.min_cpus, + worker_gpu=cluster.config.gpu, + namespace=cluster.config.namespace, + dashboard=cluster.cluster_dashboard_uri(), + ) + if ray.status == CodeFlareClusterStatus.READY: + ray.status = RayClusterStatus.READY + return ray
@@ -404,7 +534,19 @@

Functions

""" Returns the user's current working namespace. """ - namespace = oc.invoke("project", ["-q"]).actions()[0].out.strip() + try: + namespace = oc.invoke("project", ["-q"]).actions()[0].out.strip() + except oc.OpenShiftPythonException as osp: # pragma: no cover + error_msg = osp.result.err() + if ( + "do not have rights" in error_msg + or "Missing or incomplete configuration" in error_msg + ): + raise PermissionError( + "Action not permitted, have you run auth.login() or cluster.up()?" + ) + else: + raise osp return namespace
@@ -530,10 +672,17 @@

Classes

Applies the AppWrapper yaml, pushing the resource request onto the MCAD queue. """ - self.config.auth.login() namespace = self.config.namespace - with oc.project(namespace): - oc.invoke("apply", ["-f", self.app_wrapper_yaml]) + try: + with oc.project(namespace): + oc.invoke("apply", ["-f", self.app_wrapper_yaml]) + except oc.OpenShiftPythonException as osp: # pragma: no cover + error_msg = osp.result.err() + if "Unauthorized" in error_msg: + raise PermissionError( + "Action not permitted, have you put in correct/up-to-date auth credentials?" + ) + raise osp def down(self): """ @@ -541,54 +690,31 @@

Classes

associated with the cluster. """ namespace = self.config.namespace - with oc.project(namespace): - oc.invoke("delete", ["AppWrapper", self.app_wrapper_name]) - self.config.auth.logout() - - def status(self, print_to_console: bool = True): - """ - TO BE UPDATED: Will soon return (and print by default) the cluster's - status, from AppWrapper submission to setup completion. All resource - details will be moved to cluster.details(). - """ - cluster = _ray_cluster_status(self.config.name, self.config.namespace) - if cluster: - # overriding the number of gpus with requested - cluster.worker_gpu = self.config.gpu - if print_to_console: - pretty_print.print_clusters([cluster]) - return cluster.status - else: - if print_to_console: - pretty_print.print_no_resources_found() - return None - - def cluster_uri(self) -> str: - """ - Returns a string containing the cluster's URI. - """ - return f"ray://{self.config.name}-head-svc.{self.config.namespace}.svc:10001" - - def cluster_dashboard_uri(self, namespace: str = "default") -> str: - """ - Returns a string containing the cluster's dashboard URI. - """ try: with oc.project(namespace): - route = oc.invoke( - "get", ["route", "-o", "jsonpath='{$.items[*].spec.host}'"] + oc.invoke("delete", ["AppWrapper", self.app_wrapper_name]) + except oc.OpenShiftPythonException as osp: # pragma: no cover + error_msg = osp.result.err() + if ( + 'the server doesn\'t have a resource type "AppWrapper"' in error_msg + or "forbidden" in error_msg + or "Unauthorized" in error_msg + or "Missing or incomplete configuration" in error_msg + ): + raise PermissionError( + "Action not permitted, have you run cluster.up() yet?" ) - route = route.out().split(" ") - route = [x for x in route if f"ray-dashboard-{self.config.name}" in x] - route = route[0].strip().strip("'") - return f"http://{route}" - except: - return "Dashboard route not available yet. Did you run cluster.up()?" 
+ elif "not found" in error_msg: + print("Cluster not found, have you run cluster.up() yet?") + else: + raise osp - # checks whether the ray cluster is ready - def is_ready(self, print_to_console: bool = True): + def status( + self, print_to_console: bool = True + ) -> Tuple[CodeFlareClusterStatus, bool]: """ - TO BE DEPRECATED: functionality will be added into cluster.status(). + Returns the requested cluster's status, as well as whether or not + it is ready for use. """ ready = False status = CodeFlareClusterStatus.UNKNOWN @@ -601,27 +727,27 @@

Classes

AppWrapperStatus.RUNNING_HOLD_COMPLETION, ]: ready = False - status = CodeFlareClusterStatus.QUEUED + status = CodeFlareClusterStatus.STARTING elif appwrapper.status in [ AppWrapperStatus.FAILED, AppWrapperStatus.DELETED, ]: ready = False status = CodeFlareClusterStatus.FAILED # should deleted be separate - return ready, status # exit early, no need to check ray status + return status, ready # exit early, no need to check ray status elif appwrapper.status in [AppWrapperStatus.PENDING]: ready = False status = CodeFlareClusterStatus.QUEUED if print_to_console: pretty_print.print_app_wrappers_status([appwrapper]) return ( - ready, status, + ready, ) # no need to check the ray status since still in queue # check the ray cluster status cluster = _ray_cluster_status(self.config.name, self.config.namespace) - if cluster: + if cluster and not cluster.status == RayClusterStatus.UNKNOWN: if cluster.status == RayClusterStatus.READY: ready = True status = CodeFlareClusterStatus.READY @@ -635,14 +761,70 @@

Classes

if print_to_console: # overriding the number of gpus with requested cluster.worker_gpu = self.config.gpu - pretty_print.print_clusters([cluster]) + pretty_print.print_cluster_status(cluster) + elif print_to_console: + if status == CodeFlareClusterStatus.UNKNOWN: + pretty_print.print_no_resources_found() + else: + pretty_print.print_app_wrappers_status([appwrapper]) + return status, ready + def wait_ready(self, timeout: Optional[int] = None): + """ + Waits for requested cluster to be ready, up to an optional timeout (s). + Checks every five seconds. + """ + print("Waiting for requested resources to be set up...") + ready = False + status = None + time = 0 + while not ready: + status, ready = self.status(print_to_console=False) + if status == CodeFlareClusterStatus.UNKNOWN: + print( + "WARNING: Current cluster status is unknown, have you run cluster.up yet?" + ) + if not ready: + if timeout and time >= timeout: + raise TimeoutError(f"wait() timed out after waiting {timeout}s") + sleep(5) + time += 5 + print("Requested cluster up and running!") + + def details(self, print_to_console: bool = True) -> RayCluster: + cluster = _copy_to_ray(self) + if print_to_console: + pretty_print.print_clusters([cluster]) + return cluster + + def cluster_uri(self) -> str: + """ + Returns a string containing the cluster's URI. + """ + return f"ray://{self.config.name}-head-svc.{self.config.namespace}.svc:10001" + + def cluster_dashboard_uri(self) -> str: + """ + Returns a string containing the cluster's dashboard URI. + """ + try: + with oc.project(self.config.namespace): + route = oc.invoke( + "get", ["route", "-o", "jsonpath='{$.items[*].spec.host}'"] + ) + route = route.out().split(" ") + route = [x for x in route if f"ray-dashboard-{self.config.name}" in x] + route = route[0].strip().strip("'") + return f"http://{route}" + except: + return "Dashboard route not available yet, have you run cluster.up()?" 
+ def list_jobs(self) -> List: """ This method accesses the head ray node in your cluster and lists the running jobs. """ - dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace) + dashboard_route = self.cluster_dashboard_uri() client = JobSubmissionClient(dashboard_route) return client.list_jobs() @@ -650,7 +832,7 @@

Classes

""" This method accesses the head ray node in your cluster and returns the job status for the provided job id. """ - dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace) + dashboard_route = self.cluster_dashboard_uri() client = JobSubmissionClient(dashboard_route) return client.get_job_status(job_id) @@ -658,14 +840,14 @@

Classes

""" This method accesses the head ray node in your cluster and returns the logs for the provided job id. """ - dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace) + dashboard_route = self.cluster_dashboard_uri() client = JobSubmissionClient(dashboard_route) return client.get_job_logs(job_id)

Methods

-def cluster_dashboard_uri(self, namespace: str = 'default') ‑> str +def cluster_dashboard_uri(self) ‑> str

Returns a string containing the cluster's dashboard URI.

@@ -673,12 +855,12 @@

Methods

Expand source code -
def cluster_dashboard_uri(self, namespace: str = "default") -> str:
+
def cluster_dashboard_uri(self) -> str:
     """
     Returns a string containing the cluster's dashboard URI.
     """
     try:
-        with oc.project(namespace):
+        with oc.project(self.config.namespace):
             route = oc.invoke(
                 "get", ["route", "-o", "jsonpath='{$.items[*].spec.host}'"]
             )
@@ -687,7 +869,7 @@ 

Methods

route = route[0].strip().strip("'") return f"http://{route}" except: - return "Dashboard route not available yet. Did you run cluster.up()?"
+ return "Dashboard route not available yet, have you run cluster.up()?"
@@ -751,87 +933,56 @@

Methods

)
-
-def down(self) +
+def details(self, print_to_console: bool = True) ‑> RayCluster
-

Deletes the AppWrapper yaml, scaling-down and deleting all resources -associated with the cluster.

+
Expand source code -
def down(self):
-    """
-    Deletes the AppWrapper yaml, scaling-down and deleting all resources
-    associated with the cluster.
-    """
-    namespace = self.config.namespace
-    with oc.project(namespace):
-        oc.invoke("delete", ["AppWrapper", self.app_wrapper_name])
-    self.config.auth.logout()
+
def details(self, print_to_console: bool = True) -> RayCluster:
    """
    Returns a RayCluster object holding this cluster's requested resource
    details, pretty-printing them to the console unless disabled.
    """
    ray_cluster = _copy_to_ray(self)
    if print_to_console:
        pretty_print.print_clusters([ray_cluster])
    return ray_cluster
-
-def is_ready(self, print_to_console: bool = True) +
+def down(self)
-

TO BE DEPRECATED: functionality will be added into cluster.status().

+

Deletes the AppWrapper yaml, scaling-down and deleting all resources +associated with the cluster.

Expand source code -
def is_ready(self, print_to_console: bool = True):
+
def down(self):
    """
    Deletes the AppWrapper yaml, scaling-down and deleting all resources
    associated with the cluster.

    Raises:
        PermissionError: if the server rejects the delete for auth/config
            reasons (Unauthorized, forbidden, missing configuration, or the
            AppWrapper CRD is not installed).
        oc.OpenShiftPythonException: for any other delete failure.
    """
    namespace = self.config.namespace
    try:
        with oc.project(namespace):
            oc.invoke("delete", ["AppWrapper", self.app_wrapper_name])
    except oc.OpenShiftPythonException as osp:  # pragma: no cover
        error_msg = osp.result.err()
        if (
            'the server doesn\'t have a resource type "AppWrapper"' in error_msg
            or "forbidden" in error_msg
            or "Unauthorized" in error_msg
            or "Missing or incomplete configuration" in error_msg
        ):
            # Chain explicitly so the original oc failure stays visible.
            raise PermissionError(
                "Action not permitted, have you run cluster.up() yet?"
            ) from osp
        elif "not found" in error_msg:
            # Benign: nothing to tear down (already deleted or never created).
            print("Cluster not found, have you run cluster.up() yet?")
        else:
            raise  # unrecognized failure -- re-raise with original traceback
@@ -847,7 +998,7 @@

Methods

""" This method accesses the head ray node in your cluster and returns the logs for the provided job id. """ - dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace) + dashboard_route = self.cluster_dashboard_uri() client = JobSubmissionClient(dashboard_route) return client.get_job_logs(job_id)
@@ -865,7 +1016,7 @@

Methods

""" This method accesses the head ray node in your cluster and returns the job status for the provided job id. """ - dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace) + dashboard_route = self.cluster_dashboard_uri() client = JobSubmissionClient(dashboard_route) return client.get_job_status(job_id)
@@ -883,39 +1034,81 @@

Methods

""" This method accesses the head ray node in your cluster and lists the running jobs. """ - dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace) + dashboard_route = self.cluster_dashboard_uri() client = JobSubmissionClient(dashboard_route) return client.list_jobs()
-def status(self, print_to_console: bool = True) +def status(self, print_to_console: bool = True) ‑> Tuple[CodeFlareClusterStatus, bool]
-

TO BE UPDATED: Will soon return (and print by default) the cluster's -status, from AppWrapper submission to setup completion. All resource -details will be moved to cluster.details().

+

Returns the requested cluster's status, as well as whether or not +it is ready for use.

Expand source code -
def status(
    self, print_to_console: bool = True
) -> Tuple[CodeFlareClusterStatus, bool]:
    """
    Returns the requested cluster's status, as well as whether or not
    it is ready for use.

    Args:
        print_to_console: when True, pretty-print the status to stdout.

    Returns:
        A ``(status, ready)`` tuple; ``ready`` is True only when the Ray
        cluster has reached the READY state.
    """
    ready = False
    status = CodeFlareClusterStatus.UNKNOWN

    # Check the AppWrapper first: it tracks the request while it is still
    # queued/starting, before any Ray cluster resources report status.
    appwrapper = _app_wrapper_status(self.config.name, self.config.namespace)
    if appwrapper:
        if appwrapper.status in [
            AppWrapperStatus.RUNNING,
            AppWrapperStatus.COMPLETED,
            AppWrapperStatus.RUNNING_HOLD_COMPLETION,
        ]:
            ready = False
            status = CodeFlareClusterStatus.STARTING
        elif appwrapper.status in [
            AppWrapperStatus.FAILED,
            AppWrapperStatus.DELETED,
        ]:
            ready = False
            status = CodeFlareClusterStatus.FAILED  # should deleted be separate
            return status, ready  # exit early, no need to check ray status
        elif appwrapper.status in [AppWrapperStatus.PENDING]:
            ready = False
            status = CodeFlareClusterStatus.QUEUED
            if print_to_console:
                pretty_print.print_app_wrappers_status([appwrapper])
            return (
                status,
                ready,
            )  # no need to check the ray status since still in queue

    # check the ray cluster status
    cluster = _ray_cluster_status(self.config.name, self.config.namespace)
    if cluster and cluster.status != RayClusterStatus.UNKNOWN:
        if cluster.status == RayClusterStatus.READY:
            ready = True
            status = CodeFlareClusterStatus.READY
        elif cluster.status in [
            RayClusterStatus.UNHEALTHY,
            RayClusterStatus.FAILED,
        ]:
            ready = False
            status = CodeFlareClusterStatus.FAILED

        if print_to_console:
            # overriding the number of gpus with requested
            cluster.worker_gpu = self.config.gpu
            pretty_print.print_cluster_status(cluster)
    elif print_to_console:
        if status == CodeFlareClusterStatus.UNKNOWN:
            pretty_print.print_no_resources_found()
        else:
            # Still starting: no Ray status yet, show the AppWrapper instead.
            pretty_print.print_app_wrappers_status([appwrapper])

    return status, ready
@@ -933,10 +1126,50 @@

Methods

Applies the AppWrapper yaml, pushing the resource request onto the MCAD queue. """ - self.config.auth.login() namespace = self.config.namespace - with oc.project(namespace): - oc.invoke("apply", ["-f", self.app_wrapper_yaml])
+ try: + with oc.project(namespace): + oc.invoke("apply", ["-f", self.app_wrapper_yaml]) + except oc.OpenShiftPythonException as osp: # pragma: no cover + error_msg = osp.result.err() + if "Unauthorized" in error_msg: + raise PermissionError( + "Action not permitted, have you put in correct/up-to-date auth credentials?" + ) + raise osp + + +
+def wait_ready(self, timeout: Optional[int] = None) +
+
+

Waits for requested cluster to be ready, up to an optional timeout (s). +Checks every five seconds.

+
+ +Expand source code + +
def wait_ready(self, timeout: Optional[int] = None):
    """
    Waits for requested cluster to be ready, up to an optional timeout (s).
    Checks every five seconds.

    Args:
        timeout: maximum number of seconds to wait; None waits indefinitely.

    Raises:
        TimeoutError: if the cluster is not ready within ``timeout`` seconds.
    """
    print("Waiting for requested resources to be set up...")
    ready = False
    status = None
    # 'elapsed' avoids shadowing the stdlib 'time' module name
    elapsed = 0
    while not ready:
        status, ready = self.status(print_to_console=False)
        if status == CodeFlareClusterStatus.UNKNOWN:
            print(
                "WARNING: Current cluster status is unknown, have you run cluster.up() yet?"
            )
        if not ready:
            if timeout and elapsed >= timeout:
                raise TimeoutError(
                    f"wait_ready() timed out after waiting {timeout}s"
                )
            sleep(5)
            elapsed += 5
    print("Requested cluster up and running!")
@@ -970,13 +1203,14 @@

cluster_dashboard_uri
  • cluster_uri
  • create_app_wrapper
  • +
  • details
  • down
  • -
  • is_ready
  • job_logs
  • job_status
  • list_jobs
  • status
  • up
  • +
  • wait_ready
  • diff --git a/docs/cluster/config.html b/docs/cluster/config.html index 476ab6fce..e5c95c220 100644 --- a/docs/cluster/config.html +++ b/docs/cluster/config.html @@ -78,8 +78,7 @@

    Module codeflare_sdk.cluster.config

    template: str = f"{dir}/templates/new-template.yaml" instascale: bool = False envs: dict = field(default_factory=dict) - image: str = "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" - auth: Authentication = Authentication()
    + image: str = "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
    @@ -93,7 +92,7 @@

    Classes

    class ClusterConfiguration -(name: str, namespace: str = 'default', head_info: list = <factory>, machine_types: list = <factory>, min_cpus: int = 1, max_cpus: int = 1, min_worker: int = 1, max_worker: int = 1, min_memory: int = 2, max_memory: int = 2, gpu: int = 0, template: str = '/home/meyceoz/Documents/codeflare-sdk/src/codeflare_sdk/templates/new-template.yaml', instascale: bool = False, envs: dict = <factory>, image: str = 'ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103', auth: Authentication = <codeflare_sdk.cluster.auth.Authentication object>) +(name: str, namespace: str = 'default', head_info: list = <factory>, machine_types: list = <factory>, min_cpus: int = 1, max_cpus: int = 1, min_worker: int = 1, max_worker: int = 1, min_memory: int = 2, max_memory: int = 2, gpu: int = 0, template: str = '/home/meyceoz/Documents/codeflare-sdk/src/codeflare_sdk/templates/new-template.yaml', instascale: bool = False, envs: dict = <factory>, image: str = 'ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103')

    This dataclass is used to specify resource requirements and other details, and @@ -122,15 +121,10 @@

    Classes

    template: str = f"{dir}/templates/new-template.yaml" instascale: bool = False envs: dict = field(default_factory=dict) - image: str = "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" - auth: Authentication = Authentication() + image: str = "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"

    Class variables

    -
    var authAuthentication
    -
    -
    -
    var envs : dict
    @@ -212,7 +206,6 @@

    Index

  • ClusterConfiguration

      -
    • auth
    • envs
    • gpu
    • head_info
    • diff --git a/docs/cluster/model.html b/docs/cluster/model.html index 42cefda09..2a4615559 100644 --- a/docs/cluster/model.html +++ b/docs/cluster/model.html @@ -86,9 +86,10 @@

      Module codeflare_sdk.cluster.model

      """ READY = 1 - QUEUED = 2 - FAILED = 3 - UNKNOWN = 4 + STARTING = 2 + QUEUED = 3 + FAILED = 4 + UNKNOWN = 5 @dataclass @@ -240,9 +241,10 @@

      Class variables

      """ READY = 1 - QUEUED = 2 - FAILED = 3 - UNKNOWN = 4 + STARTING = 2 + QUEUED = 3 + FAILED = 4 + UNKNOWN = 5

      Ancestors

      diff --git a/docs/utils/generate_yaml.html b/docs/utils/generate_yaml.html index b2189e519..aacd0804a 100644 --- a/docs/utils/generate_yaml.html +++ b/docs/utils/generate_yaml.html @@ -271,7 +271,7 @@

      Module codeflare_sdk.utils.generate_yaml

      return outfile -def main(): +def main(): # pragma: no cover parser = argparse.ArgumentParser(description="Generate user AppWrapper") parser.add_argument( "--name", @@ -379,7 +379,7 @@

      Module codeflare_sdk.utils.generate_yaml

      return outfile -if __name__ == "__main__": +if __name__ == "__main__": # pragma: no cover main()
  • @@ -471,7 +471,7 @@

    Functions

    Expand source code -
    def main():
    +
    def main():  # pragma: no cover
         parser = argparse.ArgumentParser(description="Generate user AppWrapper")
         parser.add_argument(
             "--name",
    diff --git a/docs/utils/pretty_print.html b/docs/utils/pretty_print.html
    index 013d6e248..5da3d5b36 100644
    --- a/docs/utils/pretty_print.html
    +++ b/docs/utils/pretty_print.html
    @@ -60,7 +60,7 @@ 

    Module codeflare_sdk.utils.pretty_print

    def print_no_resources_found(): console = Console() - console.print(Panel("[red]No resources found")) + console.print(Panel("[red]No resources found, have you run cluster.up() yet?")) def print_app_wrappers_status(app_wrappers: List[AppWrapper]): @@ -69,22 +69,66 @@

    Module codeflare_sdk.utils.pretty_print

    return # shortcircuit console = Console() + table = Table( + box=box.ASCII_DOUBLE_HEAD, + title="[bold] :rocket: Cluster Queue Status :rocket:", + ) + table.add_column("Name", style="cyan", no_wrap=True) + table.add_column("Status", style="magenta") + for app_wrapper in app_wrappers: name = app_wrapper.name status = app_wrapper.status.value - - table = Table( - box=box.ASCII_DOUBLE_HEAD, - title="[bold] :rocket: List of CodeFlare clusters in queue:rocket:", - ) - table.add_column("Name", style="cyan", no_wrap=True) - table.add_column("Status", style="magenta") table.add_row(name, status) table.add_row("") # empty row for spacing - console.print(Panel.fit(table)) + console.print(Panel.fit(table)) + + +def print_cluster_status(cluster: RayCluster): + "Pretty prints the status of a passed-in cluster" + if not cluster: + print_no_resources_found + return -def print_clusters(clusters: List[RayCluster], verbose=True): + console = Console() + status = ( + "Active :white_heavy_check_mark:" + if cluster.status == RayClusterStatus.READY + else "Inactive :x:" + ) + name = cluster.name + dashboard = cluster.dashboard + # owned = bool(cluster["userOwned"]) + owned = True + + #'table0' to display the cluster name, status, url, and dashboard link + table0 = Table(box=None, show_header=False) + if owned: + table0.add_row("[white on green][bold]Name") + else: + table0.add_row("") + table0.add_row("[bold underline]" + name, status) + table0.add_row() + # fixme harcded to default for now + table0.add_row( + f"[bold]URI:[/bold] ray://{cluster.name}-head-svc.{cluster.namespace}.svc:10001" + ) # format that is used to generate the name of the service + table0.add_row() + table0.add_row(f"[link={dashboard} blue underline]Dashboard:link:[/link]") + table0.add_row("") # empty row for spacing + + # table4 to display table0 and table3, one below the other + table4 = Table(box=None, show_header=False) + table4.add_row(table0) + + # Encompass all details of the cluster in a single panel + 
table5 = Table(box=None, title="[bold] :rocket: CodeFlare Cluster Status :rocket:") + table5.add_row(Panel.fit(table4)) + console.print(table5) + + +def print_clusters(clusters: List[RayCluster]): if not clusters: print_no_resources_found() return # shortcircuit @@ -96,13 +140,13 @@

    Module codeflare_sdk.utils.pretty_print

    status = ( "Active :white_heavy_check_mark:" if cluster.status == RayClusterStatus.READY - else "InActive :x:" + else "Inactive :x:" ) name = cluster.name dashboard = cluster.dashboard mincount = str(cluster.min_workers) maxcount = str(cluster.max_workers) - memory = cluster.worker_mem_min + "~" + cluster.worker_mem_max + memory = str(cluster.worker_mem_min) + "~" + str(cluster.worker_mem_max) cpu = str(cluster.worker_cpu) gpu = str(cluster.worker_gpu) # owned = bool(cluster["userOwned"]) @@ -111,7 +155,7 @@

    Module codeflare_sdk.utils.pretty_print

    #'table0' to display the cluster name, status, url, and dashboard link table0 = Table(box=None, show_header=False) if owned: - table0.add_row("[white on green][bold]Owner") + table0.add_row("[white on green][bold]Name") else: table0.add_row("") table0.add_row("[bold underline]" + name, status) @@ -162,7 +206,7 @@

    Module codeflare_sdk.utils.pretty_print

    # than being center aligned on the console/terminal if we simply use console.print(title) table5 = Table( - box=None, title="[bold] :rocket: List of CodeFlare clusters :rocket:" + box=None, title="[bold] :rocket: CodeFlare Cluster Details :rocket:" ) table5.add_row(Panel.fit(table4)) console.print(table5) @@ -193,23 +237,76 @@

    Functions

    return # shortcircuit console = Console() + table = Table( + box=box.ASCII_DOUBLE_HEAD, + title="[bold] :rocket: Cluster Queue Status :rocket:", + ) + table.add_column("Name", style="cyan", no_wrap=True) + table.add_column("Status", style="magenta") + for app_wrapper in app_wrappers: name = app_wrapper.name status = app_wrapper.status.value - - table = Table( - box=box.ASCII_DOUBLE_HEAD, - title="[bold] :rocket: List of CodeFlare clusters in queue:rocket:", - ) - table.add_column("Name", style="cyan", no_wrap=True) - table.add_column("Status", style="magenta") table.add_row(name, status) table.add_row("") # empty row for spacing - console.print(Panel.fit(table))
    + + console.print(Panel.fit(table))
    + + +
    +def print_cluster_status(cluster: RayCluster) +
    +
    +

    Pretty prints the status of a passed-in cluster

    +
    + +Expand source code + +
    def print_cluster_status(cluster: RayCluster):
    +    "Pretty prints the status of a passed-in cluster"
    +    if not cluster:
    +        print_no_resources_found
    +        return
    +
    +    console = Console()
    +    status = (
    +        "Active :white_heavy_check_mark:"
    +        if cluster.status == RayClusterStatus.READY
    +        else "Inactive :x:"
    +    )
    +    name = cluster.name
    +    dashboard = cluster.dashboard
    +    # owned = bool(cluster["userOwned"])
    +    owned = True
    +
    +    #'table0' to display the cluster name, status, url, and dashboard link
    +    table0 = Table(box=None, show_header=False)
    +    if owned:
    +        table0.add_row("[white on green][bold]Name")
    +    else:
    +        table0.add_row("")
    +    table0.add_row("[bold underline]" + name, status)
    +    table0.add_row()
    +    # fixme harcded to default for now
    +    table0.add_row(
    +        f"[bold]URI:[/bold] ray://{cluster.name}-head-svc.{cluster.namespace}.svc:10001"
    +    )  # format that is used to generate the name of the service
    +    table0.add_row()
    +    table0.add_row(f"[link={dashboard} blue underline]Dashboard:link:[/link]")
    +    table0.add_row("")  # empty row for spacing
    +
    +    # table4 to display table0 and table3, one below the other
    +    table4 = Table(box=None, show_header=False)
    +    table4.add_row(table0)
    +
    +    # Encompass all details of the cluster in a single panel
    +    table5 = Table(box=None, title="[bold] :rocket: CodeFlare Cluster Status :rocket:")
    +    table5.add_row(Panel.fit(table4))
    +    console.print(table5)
    -def print_clusters(clusters: List[RayCluster], verbose=True) +def print_clusters(clusters: List[RayCluster])
    @@ -217,7 +314,7 @@

    Functions

    Expand source code -
    def print_clusters(clusters: List[RayCluster], verbose=True):
    +
    def print_clusters(clusters: List[RayCluster]):
         if not clusters:
             print_no_resources_found()
             return  # shortcircuit
    @@ -229,13 +326,13 @@ 

    Functions

    status = ( "Active :white_heavy_check_mark:" if cluster.status == RayClusterStatus.READY - else "InActive :x:" + else "Inactive :x:" ) name = cluster.name dashboard = cluster.dashboard mincount = str(cluster.min_workers) maxcount = str(cluster.max_workers) - memory = cluster.worker_mem_min + "~" + cluster.worker_mem_max + memory = str(cluster.worker_mem_min) + "~" + str(cluster.worker_mem_max) cpu = str(cluster.worker_cpu) gpu = str(cluster.worker_gpu) # owned = bool(cluster["userOwned"]) @@ -244,7 +341,7 @@

    Functions

    #'table0' to display the cluster name, status, url, and dashboard link table0 = Table(box=None, show_header=False) if owned: - table0.add_row("[white on green][bold]Owner") + table0.add_row("[white on green][bold]Name") else: table0.add_row("") table0.add_row("[bold underline]" + name, status) @@ -295,7 +392,7 @@

    Functions

    # than being center aligned on the console/terminal if we simply use console.print(title) table5 = Table( - box=None, title="[bold] :rocket: List of CodeFlare clusters :rocket:" + box=None, title="[bold] :rocket: CodeFlare Cluster Details :rocket:" ) table5.add_row(Panel.fit(table4)) console.print(table5) @@ -315,7 +412,7 @@

    Functions

    def print_no_resources_found():
         console = Console()
    -    console.print(Panel("[red]No resources found"))
    + console.print(Panel("[red]No resources found, have you run cluster.up() yet?"))
    @@ -337,6 +434,7 @@

    Index

  • Functions

    diff --git a/pyproject.toml b/pyproject.toml index 408a49d9d..9312611ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "codeflare-sdk" -version = "0.2.2" +version = "0.3.0" description = "Python SDK for codeflare client" license = "Apache-2.0"