diff --git a/demo-notebooks/batch-job/batch_mnist.ipynb b/demo-notebooks/batch-job/batch_mnist.ipynb
index 6512c9be1..0b0c25f61 100644
--- a/demo-notebooks/batch-job/batch_mnist.ipynb
+++ b/demo-notebooks/batch-job/batch_mnist.ipynb
@@ -24,7 +24,8 @@
     "    token = \"XXXX\",\n",
     "    server = \"XXXX\",\n",
     "    skip_tls=True\n",
-    ")"
+    ")\n",
+    "auth.login()"
    ]
   },
   {
@@ -43,7 +44,7 @@
    "outputs": [],
    "source": [
     "# Create our cluster and submit appwrapper\n",
-    "cluster = Cluster(ClusterConfiguration(name='mnisttest', min_worker=2, max_worker=2, min_cpus=8, max_cpus=8, min_memory=16, max_memory=16, gpu=4, instascale=True, machine_types=[\"m5.xlarge\", \"p3.8xlarge\"], auth=auth))"
+    "cluster = Cluster(ClusterConfiguration(name='mnisttest', min_worker=2, max_worker=2, min_cpus=8, max_cpus=8, min_memory=16, max_memory=16, gpu=4, instascale=True, machine_types=[\"m5.xlarge\", \"p3.8xlarge\"]))"
    ]
   },
   {
@@ -66,11 +67,12 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "657ebdfb",
    "metadata": {},
    "source": [
-    "Now, we want to check on the status of our resource cluster, until it is finally ready for use."
+    "Now, we want to check on the status of our resource cluster, and wait until it is finally ready for use."
    ]
   },
   {
@@ -122,7 +124,36 @@
     }
    ],
    "source": [
-    "cluster.is_ready()"
+    "cluster.status()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a99d5aff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cluster.wait_ready()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "df71c1ed",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cluster.status()"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "b3a55fe4",
+   "metadata": {},
+   "source": [
+    "Let's quickly verify that the specs of the cluster are as expected."
+   ]
+  },
   {
@@ -190,7 +221,7 @@
     }
    ],
    "source": [
-    "cluster.status()"
+    "cluster.details()"
    ]
   },
   {
@@ -1994,6 +2025,16 @@
   "source": [
    "cluster.down()"
   ]
-  }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0d41b90e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "auth.logout()"
+   ]
+  }
  ],
  "metadata": {
@@ -2012,7 +2053,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.9.7"
+  "version": "3.9.13"
  },
  "vscode": {
   "interpreter": {
diff --git a/demo-notebooks/interactive/hf_interactive.ipynb b/demo-notebooks/interactive/hf_interactive.ipynb
index b107d1ac3..ed7748216 100644
--- a/demo-notebooks/interactive/hf_interactive.ipynb
+++ b/demo-notebooks/interactive/hf_interactive.ipynb
@@ -53,12 +53,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Create authentication object for oc user permissions\n",
+    "# Create authentication object for oc user permissions and login\n",
     "auth = TokenAuthentication(\n",
     "    token = \"XXXX\",\n",
     "    server = \"XXXX\",\n",
     "    skip_tls = True\n",
-    ")"
+    ")\n",
+    "auth.login()"
    ]
   },
   {
@@ -85,7 +86,7 @@
    ],
    "source": [
     "# Create our cluster and submit appwrapper\n",
-    "cluster = Cluster(ClusterConfiguration(name='hfgputest', min_worker=1, max_worker=1, min_cpus=8, max_cpus=8, min_memory=16, max_memory=16, gpu=4, instascale=True, machine_types=[\"m5.xlarge\", \"p3.8xlarge\"], auth=auth))"
+    "cluster = Cluster(ClusterConfiguration(name='hfgputest', min_worker=1, max_worker=1, min_cpus=8, max_cpus=8, min_memory=16, max_memory=16, gpu=4, instascale=True, machine_types=[\"m5.xlarge\", \"p3.8xlarge\"]))"
    ]
   },
   {
@@ -107,11 +108,12 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "657ebdfb",
    "metadata": {},
    "source": [
-    "Now, we want to check on the status of our resource cluster, until it is finally ready for use."
+    "Now, we want to check on the initial status of our resource cluster, then wait until it is finally ready for use."
    ]
   },
   {
@@ -163,7 +165,36 @@
     }
    ],
    "source": [
-    "cluster.is_ready()"
+    "cluster.status()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1d26275e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cluster.wait_ready()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d2969a4b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cluster.status()"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "477ac246",
+   "metadata": {},
+   "source": [
+    "Let's quickly verify that the specs of the cluster are as expected."
+   ]
+  },
   {
@@ -231,7 +262,7 @@
     }
    ],
    "source": [
-    "cluster.status()"
+    "cluster.details()"
    ]
   },
   {
@@ -1379,6 +1410,16 @@
    "cluster.down()"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "61bf4946",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "auth.logout()"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "2b7a183b-5e8e-4adb-b9a6-a349e13512a0",
@@ -1416,7 +1457,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.9.7"
+  "version": "3.9.13"
  },
  "vscode": {
   "interpreter": {
diff --git a/docs/cluster/auth.html b/docs/cluster/auth.html
index 01538a024..bda78622a 100644
--- a/docs/cluster/auth.html
+++ b/docs/cluster/auth.html
@@ -92,7 +92,7 @@
Module codeflare_sdk.cluster.auth
-def login(self)
+def login(self) ‑> str
This function is used to log in to an OpenShift cluster using the user's username and password.
-def login(self):
+def login(self) -> str:
"""
This function is used to log in to an OpenShift cluster using the user's `username` and `password`.
"""
@@ -299,7 +302,7 @@ Methods
-def logout(self)
+def logout(self) ‑> str
This function is used to log out of an OpenShift cluster.
-def logout(self):
+def logout(self) -> str:
"""
This function is used to log out of an OpenShift cluster.
"""
@@ -346,7 +349,7 @@ Methods
self.server = server
self.skip_tls = skip_tls
- def login(self):
+ def login(self) -> str:
"""
This function is used to log in to an OpenShift cluster using the user's API token and API server address.
Depending on the cluster, a user can choose to log in with `--insecure-skip-tls-verify` by setting `skip_tls`
@@ -361,15 +364,18 @@ Methods
error_msg = osp.result.err()
if "The server uses a certificate signed by unknown authority" in error_msg:
return "Error: certificate auth failure, please set `skip_tls=True` in TokenAuthentication"
+ elif "invalid" in error_msg:
+ raise PermissionError(error_msg)
else:
return error_msg
return response.out()
- def logout(self):
+ def logout(self) -> str:
"""
This function is used to log out of an OpenShift cluster.
"""
- response = oc.invoke("logout")
+ args = [f"--token={self.token}", f"--server={self.server}:6443"]
+ response = oc.invoke("logout", args)
return response.out()
Ancestors
@@ -379,7 +385,7 @@ Ancestors
Methods
-def login(self)
+def login(self) ‑> str
-
This function is used to log in to an OpenShift cluster using the user's API token and API server address.
@@ -389,7 +395,7 @@
Methods
Expand source code
-def login(self):
+def login(self) -> str:
"""
This function is used to log in to an OpenShift cluster using the user's API token and API server address.
Depending on the cluster, a user can choose to log in with `--insecure-skip-tls-verify` by setting `skip_tls`
@@ -404,13 +410,15 @@ Methods
error_msg = osp.result.err()
if "The server uses a certificate signed by unknown authority" in error_msg:
return "Error: certificate auth failure, please set `skip_tls=True` in TokenAuthentication"
+ elif "invalid" in error_msg:
+ raise PermissionError(error_msg)
else:
return error_msg
return response.out()
-def logout(self)
+def logout(self) ‑> str
-
This function is used to log out of an OpenShift cluster.
@@ -418,11 +426,12 @@ Methods
Expand source code
-def logout(self):
+def logout(self) -> str:
"""
This function is used to log out of an OpenShift cluster.
"""
- response = oc.invoke("logout")
+ args = [f"--token={self.token}", f"--server={self.server}:6443"]
+ response = oc.invoke("logout", args)
return response.out()
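
Editor's note: for orientation, a minimal sketch of the revised token-auth flow, with placeholder credentials, assuming the codeflare_sdk package is importable:

from codeflare_sdk.cluster.auth import TokenAuthentication

# Placeholder credentials -- substitute a real API token and server address.
auth = TokenAuthentication(
    token="XXXX",
    server="XXXX",
    skip_tls=True,  # avoids the "certificate signed by unknown authority" return path
)

try:
    print(auth.login())  # returns the `oc login` output as a string
except PermissionError as err:
    print(err)  # raised when the oc error message contains "invalid"

# ... interact with the cluster ...

print(auth.logout())  # logout now passes --token/--server to `oc logout` explicitly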
diff --git a/docs/cluster/cluster.html b/docs/cluster/cluster.html
index 5f3a6aad9..fda90debe 100644
--- a/docs/cluster/cluster.html
+++ b/docs/cluster/cluster.html
@@ -51,6 +51,7 @@ Module codeflare_sdk.cluster.cluster
"""
from os import stat
+from time import sleep
from typing import List, Optional, Tuple
import openshift as oc
@@ -127,10 +128,17 @@ Module codeflare_sdk.cluster.cluster
Applies the AppWrapper yaml, pushing the resource request onto
the MCAD queue.
"""
- self.config.auth.login()
namespace = self.config.namespace
- with oc.project(namespace):
- oc.invoke("apply", ["-f", self.app_wrapper_yaml])
+ try:
+ with oc.project(namespace):
+ oc.invoke("apply", ["-f", self.app_wrapper_yaml])
+ except oc.OpenShiftPythonException as osp: # pragma: no cover
+ error_msg = osp.result.err()
+ if "Unauthorized" in error_msg:
+ raise PermissionError(
+ "Action not permitted, have you put in correct/up-to-date auth credentials?"
+ )
+ raise osp
def down(self):
"""
@@ -138,54 +146,31 @@ Module codeflare_sdk.cluster.cluster
associated with the cluster.
"""
namespace = self.config.namespace
- with oc.project(namespace):
- oc.invoke("delete", ["AppWrapper", self.app_wrapper_name])
- self.config.auth.logout()
-
- def status(self, print_to_console: bool = True):
- """
- TO BE UPDATED: Will soon return (and print by default) the cluster's
- status, from AppWrapper submission to setup completion. All resource
- details will be moved to cluster.details().
- """
- cluster = _ray_cluster_status(self.config.name, self.config.namespace)
- if cluster:
- # overriding the number of gpus with requested
- cluster.worker_gpu = self.config.gpu
- if print_to_console:
- pretty_print.print_clusters([cluster])
- return cluster.status
- else:
- if print_to_console:
- pretty_print.print_no_resources_found()
- return None
-
- def cluster_uri(self) -> str:
- """
- Returns a string containing the cluster's URI.
- """
- return f"ray://{self.config.name}-head-svc.{self.config.namespace}.svc:10001"
-
- def cluster_dashboard_uri(self, namespace: str = "default") -> str:
- """
- Returns a string containing the cluster's dashboard URI.
- """
try:
with oc.project(namespace):
- route = oc.invoke(
- "get", ["route", "-o", "jsonpath='{$.items[*].spec.host}'"]
+ oc.invoke("delete", ["AppWrapper", self.app_wrapper_name])
+ except oc.OpenShiftPythonException as osp: # pragma: no cover
+ error_msg = osp.result.err()
+ if (
+ 'the server doesn\'t have a resource type "AppWrapper"' in error_msg
+ or "forbidden" in error_msg
+ or "Unauthorized" in error_msg
+ or "Missing or incomplete configuration" in error_msg
+ ):
+ raise PermissionError(
+ "Action not permitted, have you run cluster.up() yet?"
)
- route = route.out().split(" ")
- route = [x for x in route if f"ray-dashboard-{self.config.name}" in x]
- route = route[0].strip().strip("'")
- return f"http://{route}"
- except:
- return "Dashboard route not available yet. Did you run cluster.up()?"
+ elif "not found" in error_msg:
+ print("Cluster not found, have you run cluster.up() yet?")
+ else:
+ raise osp
- # checks whether the ray cluster is ready
- def is_ready(self, print_to_console: bool = True):
+ def status(
+ self, print_to_console: bool = True
+ ) -> Tuple[CodeFlareClusterStatus, bool]:
"""
- TO BE DEPRECATED: functionality will be added into cluster.status().
+ Returns the requested cluster's status, as well as whether or not
+ it is ready for use.
"""
ready = False
status = CodeFlareClusterStatus.UNKNOWN
@@ -198,27 +183,27 @@ Module codeflare_sdk.cluster.cluster
AppWrapperStatus.RUNNING_HOLD_COMPLETION,
]:
ready = False
- status = CodeFlareClusterStatus.QUEUED
+ status = CodeFlareClusterStatus.STARTING
elif appwrapper.status in [
AppWrapperStatus.FAILED,
AppWrapperStatus.DELETED,
]:
ready = False
status = CodeFlareClusterStatus.FAILED # should deleted be separate
- return ready, status # exit early, no need to check ray status
+ return status, ready # exit early, no need to check ray status
elif appwrapper.status in [AppWrapperStatus.PENDING]:
ready = False
status = CodeFlareClusterStatus.QUEUED
if print_to_console:
pretty_print.print_app_wrappers_status([appwrapper])
return (
- ready,
status,
+ ready,
) # no need to check the ray status since still in queue
# check the ray cluster status
cluster = _ray_cluster_status(self.config.name, self.config.namespace)
- if cluster:
+ if cluster and not cluster.status == RayClusterStatus.UNKNOWN:
if cluster.status == RayClusterStatus.READY:
ready = True
status = CodeFlareClusterStatus.READY
@@ -232,14 +217,70 @@ Module codeflare_sdk.cluster.cluster
if print_to_console:
# overriding the number of gpus with requested
cluster.worker_gpu = self.config.gpu
- pretty_print.print_clusters([cluster])
+ pretty_print.print_cluster_status(cluster)
+ elif print_to_console:
+ if status == CodeFlareClusterStatus.UNKNOWN:
+ pretty_print.print_no_resources_found()
+ else:
+ pretty_print.print_app_wrappers_status([appwrapper])
+
return status, ready
+ def wait_ready(self, timeout: Optional[int] = None):
+ """
+ Waits for requested cluster to be ready, up to an optional timeout (s).
+ Checks every five seconds.
+ """
+ print("Waiting for requested resources to be set up...")
+ ready = False
+ status = None
+ time = 0
+ while not ready:
+ status, ready = self.status(print_to_console=False)
+ if status == CodeFlareClusterStatus.UNKNOWN:
+ print(
+ "WARNING: Current cluster status is unknown, have you run cluster.up yet?"
+ )
+ if not ready:
+ if timeout and time >= timeout:
+ raise TimeoutError(f"wait_ready() timed out after waiting {timeout}s")
+ sleep(5)
+ time += 5
+ print("Requested cluster up and running!")
+
+ def details(self, print_to_console: bool = True) -> RayCluster:
+ cluster = _copy_to_ray(self)
+ if print_to_console:
+ pretty_print.print_clusters([cluster])
+ return cluster
+
+ def cluster_uri(self) -> str:
+ """
+ Returns a string containing the cluster's URI.
+ """
+ return f"ray://{self.config.name}-head-svc.{self.config.namespace}.svc:10001"
+
+ def cluster_dashboard_uri(self) -> str:
+ """
+ Returns a string containing the cluster's dashboard URI.
+ """
+ try:
+ with oc.project(self.config.namespace):
+ route = oc.invoke(
+ "get", ["route", "-o", "jsonpath='{$.items[*].spec.host}'"]
+ )
+ route = route.out().split(" ")
+ route = [x for x in route if f"ray-dashboard-{self.config.name}" in x]
+ route = route[0].strip().strip("'")
+ return f"http://{route}"
+ except:
+ return "Dashboard route not available yet, have you run cluster.up()?"
+
def list_jobs(self) -> List:
"""
This method accesses the head ray node in your cluster and lists the running jobs.
"""
- dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace)
+ dashboard_route = self.cluster_dashboard_uri()
client = JobSubmissionClient(dashboard_route)
return client.list_jobs()
@@ -247,7 +288,7 @@ Module codeflare_sdk.cluster.cluster
"""
This method accesses the head ray node in your cluster and returns the job status for the provided job id.
"""
- dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace)
+ dashboard_route = self.cluster_dashboard_uri()
client = JobSubmissionClient(dashboard_route)
return client.get_job_status(job_id)
@@ -255,7 +296,7 @@ Module codeflare_sdk.cluster.cluster
"""
This method accesses the head ray node in your cluster and returns the logs for the provided job id.
"""
- dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace)
+ dashboard_route = self.cluster_dashboard_uri()
client = JobSubmissionClient(dashboard_route)
return client.get_job_logs(job_id)
@@ -264,7 +305,19 @@ Module codeflare_sdk.cluster.cluster
"""
Returns the user's current working namespace.
"""
- namespace = oc.invoke("project", ["-q"]).actions()[0].out.strip()
+ try:
+ namespace = oc.invoke("project", ["-q"]).actions()[0].out.strip()
+ except oc.OpenShiftPythonException as osp: # pragma: no cover
+ error_msg = osp.result.err()
+ if (
+ "do not have rights" in error_msg
+ or "Missing or incomplete configuration" in error_msg
+ ):
+ raise PermissionError(
+ "Action not permitted, have you run auth.login() or cluster.up()?"
+ )
+ else:
+ raise osp
return namespace
@@ -295,31 +348,71 @@ Module codeflare_sdk.cluster.cluster
def _app_wrapper_status(name, namespace="default") -> Optional[AppWrapper]:
- with oc.project(namespace), oc.timeout(10 * 60):
- cluster = oc.selector(f"appwrapper/{name}").object()
+ cluster = None
+ try:
+ with oc.project(namespace), oc.timeout(10 * 60):
+ cluster = oc.selector(f"appwrapper/{name}").object()
+ except oc.OpenShiftPythonException as osp: # pragma: no cover
+ msg = osp.msg
+ if "Expected a single object, but selected 0" in msg:
+ return cluster
+ error_msg = osp.result.err()
+ if not (
+ 'the server doesn\'t have a resource type "appwrapper"' in error_msg
+ or "forbidden" in error_msg
+ or "Unauthorized" in error_msg
+ or "Missing or incomplete configuration" in error_msg
+ ):
+ raise osp
+
if cluster:
return _map_to_app_wrapper(cluster)
+ return cluster
+
def _ray_cluster_status(name, namespace="default") -> Optional[RayCluster]:
- # FIXME should we check the appwrapper first
cluster = None
try:
with oc.project(namespace), oc.timeout(10 * 60):
cluster = oc.selector(f"rayclusters/{name}").object()
+ except oc.OpenShiftPythonException as osp: # pragma: no cover
+ msg = osp.msg
+ if "Expected a single object, but selected 0" in msg:
+ return cluster
+ error_msg = osp.result.err()
+ if not (
+ 'the server doesn\'t have a resource type "rayclusters"' in error_msg
+ or "forbidden" in error_msg
+ or "Unauthorized" in error_msg
+ or "Missing or incomplete configuration" in error_msg
+ ):
+ raise osp
+
+ if cluster:
+ return _map_to_ray_cluster(cluster)
- if cluster:
- return _map_to_ray_cluster(cluster)
- except:
- pass
return cluster
def _get_ray_clusters(namespace="default") -> List[RayCluster]:
list_of_clusters = []
-
- with oc.project(namespace), oc.timeout(10 * 60):
- ray_clusters = oc.selector("rayclusters").objects()
+ try:
+ with oc.project(namespace), oc.timeout(10 * 60):
+ ray_clusters = oc.selector("rayclusters").objects()
+ except oc.OpenShiftPythonException as osp: # pragma: no cover
+ error_msg = osp.result.err()
+ if (
+ 'the server doesn\'t have a resource type "rayclusters"' in error_msg
+ or "forbidden" in error_msg
+ or "Unauthorized" in error_msg
+ or "Missing or incomplete configuration" in error_msg
+ ):
+ raise PermissionError(
+ "Action not permitted, have you put in correct/up-to-date auth credentials?"
+ )
+ else:
+ raise osp
for cluster in ray_clusters:
list_of_clusters.append(_map_to_ray_cluster(cluster))
@@ -331,20 +424,39 @@ Module codeflare_sdk.cluster.cluster
) -> List[AppWrapper]:
list_of_app_wrappers = []
- with oc.project(namespace), oc.timeout(10 * 60):
- app_wrappers = oc.selector("appwrappers").objects()
+ try:
+ with oc.project(namespace), oc.timeout(10 * 60):
+ app_wrappers = oc.selector("appwrappers").objects()
+ except oc.OpenShiftPythonException as osp: # pragma: no cover
+ error_msg = osp.result.err()
+ if (
+ 'the server doesn\'t have a resource type "appwrappers"' in error_msg
+ or "forbidden" in error_msg
+ or "Unauthorized" in error_msg
+ or "Missing or incomplete configuration" in error_msg
+ ):
+ raise PermissionError(
+ "Action not permitted, have you put in correct/up-to-date auth credentials?"
+ )
+ else:
+ raise osp
for item in app_wrappers:
app_wrapper = _map_to_app_wrapper(item)
if filter and app_wrapper.status in filter:
list_of_app_wrappers.append(app_wrapper)
else:
+ # Unsure what the purpose of the filter is
list_of_app_wrappers.append(app_wrapper)
return list_of_app_wrappers
-def _map_to_ray_cluster(cluster) -> RayCluster:
+def _map_to_ray_cluster(cluster) -> Optional[RayCluster]:
cluster_model = cluster.model
+ if type(cluster_model.status.state) == oc.model.MissingModel:
+ status = RayClusterStatus.UNKNOWN
+ else:
+ status = RayClusterStatus(cluster_model.status.state.lower())
with oc.project(cluster.namespace()), oc.timeout(10 * 60):
route = (
@@ -355,7 +467,7 @@ Module codeflare_sdk.cluster.cluster
return RayCluster(
name=cluster.name(),
- status=RayClusterStatus(cluster_model.status.state.lower()),
+ status=status,
# for now we are not using autoscaling so same replicas is fine
min_workers=cluster_model.spec.workerGroupSpecs[0].replicas,
max_workers=cluster_model.spec.workerGroupSpecs[0].replicas,
@@ -381,7 +493,25 @@ Module codeflare_sdk.cluster.cluster
status=AppWrapperStatus(cluster_model.status.state.lower()),
can_run=cluster_model.status.canrun,
job_state=cluster_model.status.queuejobstate,
- )
+ )
+
+
+def _copy_to_ray(cluster: Cluster) -> RayCluster:
+ ray = RayCluster(
+ name=cluster.config.name,
+ status=cluster.status(print_to_console=False)[0],
+ min_workers=cluster.config.min_worker,
+ max_workers=cluster.config.max_worker,
+ worker_mem_min=cluster.config.min_memory,
+ worker_mem_max=cluster.config.max_memory,
+ worker_cpu=cluster.config.min_cpus,
+ worker_gpu=cluster.config.gpu,
+ namespace=cluster.config.namespace,
+ dashboard=cluster.cluster_dashboard_uri(),
+ )
+ if ray.status == CodeFlareClusterStatus.READY:
+ ray.status = RayClusterStatus.READY
+ return ray
@@ -404,7 +534,19 @@ Functions
"""
Returns the user's current working namespace.
"""
- namespace = oc.invoke("project", ["-q"]).actions()[0].out.strip()
+ try:
+ namespace = oc.invoke("project", ["-q"]).actions()[0].out.strip()
+ except oc.OpenShiftPythonException as osp: # pragma: no cover
+ error_msg = osp.result.err()
+ if (
+ "do not have rights" in error_msg
+ or "Missing or incomplete configuration" in error_msg
+ ):
+ raise PermissionError(
+ "Action not permitted, have you run auth.login() or cluster.up()?"
+ )
+ else:
+ raise osp
return namespace
@@ -530,10 +672,17 @@ Classes
Applies the AppWrapper yaml, pushing the resource request onto
the MCAD queue.
"""
- self.config.auth.login()
namespace = self.config.namespace
- with oc.project(namespace):
- oc.invoke("apply", ["-f", self.app_wrapper_yaml])
+ try:
+ with oc.project(namespace):
+ oc.invoke("apply", ["-f", self.app_wrapper_yaml])
+ except oc.OpenShiftPythonException as osp: # pragma: no cover
+ error_msg = osp.result.err()
+ if "Unauthorized" in error_msg:
+ raise PermissionError(
+ "Action not permitted, have you put in correct/up-to-date auth credentials?"
+ )
+ raise osp
def down(self):
"""
@@ -541,54 +690,31 @@ Classes
associated with the cluster.
"""
namespace = self.config.namespace
- with oc.project(namespace):
- oc.invoke("delete", ["AppWrapper", self.app_wrapper_name])
- self.config.auth.logout()
-
- def status(self, print_to_console: bool = True):
- """
- TO BE UPDATED: Will soon return (and print by default) the cluster's
- status, from AppWrapper submission to setup completion. All resource
- details will be moved to cluster.details().
- """
- cluster = _ray_cluster_status(self.config.name, self.config.namespace)
- if cluster:
- # overriding the number of gpus with requested
- cluster.worker_gpu = self.config.gpu
- if print_to_console:
- pretty_print.print_clusters([cluster])
- return cluster.status
- else:
- if print_to_console:
- pretty_print.print_no_resources_found()
- return None
-
- def cluster_uri(self) -> str:
- """
- Returns a string containing the cluster's URI.
- """
- return f"ray://{self.config.name}-head-svc.{self.config.namespace}.svc:10001"
-
- def cluster_dashboard_uri(self, namespace: str = "default") -> str:
- """
- Returns a string containing the cluster's dashboard URI.
- """
try:
with oc.project(namespace):
- route = oc.invoke(
- "get", ["route", "-o", "jsonpath='{$.items[*].spec.host}'"]
+ oc.invoke("delete", ["AppWrapper", self.app_wrapper_name])
+ except oc.OpenShiftPythonException as osp: # pragma: no cover
+ error_msg = osp.result.err()
+ if (
+ 'the server doesn\'t have a resource type "AppWrapper"' in error_msg
+ or "forbidden" in error_msg
+ or "Unauthorized" in error_msg
+ or "Missing or incomplete configuration" in error_msg
+ ):
+ raise PermissionError(
+ "Action not permitted, have you run cluster.up() yet?"
)
- route = route.out().split(" ")
- route = [x for x in route if f"ray-dashboard-{self.config.name}" in x]
- route = route[0].strip().strip("'")
- return f"http://{route}"
- except:
- return "Dashboard route not available yet. Did you run cluster.up()?"
+ elif "not found" in error_msg:
+ print("Cluster not found, have you run cluster.up() yet?")
+ else:
+ raise osp
- # checks whether the ray cluster is ready
- def is_ready(self, print_to_console: bool = True):
+ def status(
+ self, print_to_console: bool = True
+ ) -> Tuple[CodeFlareClusterStatus, bool]:
"""
- TO BE DEPRECATED: functionality will be added into cluster.status().
+ Returns the requested cluster's status, as well as whether or not
+ it is ready for use.
"""
ready = False
status = CodeFlareClusterStatus.UNKNOWN
@@ -601,27 +727,27 @@ Classes
AppWrapperStatus.RUNNING_HOLD_COMPLETION,
]:
ready = False
- status = CodeFlareClusterStatus.QUEUED
+ status = CodeFlareClusterStatus.STARTING
elif appwrapper.status in [
AppWrapperStatus.FAILED,
AppWrapperStatus.DELETED,
]:
ready = False
status = CodeFlareClusterStatus.FAILED # should deleted be separate
- return ready, status # exit early, no need to check ray status
+ return status, ready # exit early, no need to check ray status
elif appwrapper.status in [AppWrapperStatus.PENDING]:
ready = False
status = CodeFlareClusterStatus.QUEUED
if print_to_console:
pretty_print.print_app_wrappers_status([appwrapper])
return (
- ready,
status,
+ ready,
) # no need to check the ray status since still in queue
# check the ray cluster status
cluster = _ray_cluster_status(self.config.name, self.config.namespace)
- if cluster:
+ if cluster and not cluster.status == RayClusterStatus.UNKNOWN:
if cluster.status == RayClusterStatus.READY:
ready = True
status = CodeFlareClusterStatus.READY
@@ -635,14 +761,70 @@ Classes
if print_to_console:
# overriding the number of gpus with requested
cluster.worker_gpu = self.config.gpu
- pretty_print.print_clusters([cluster])
+ pretty_print.print_cluster_status(cluster)
+ elif print_to_console:
+ if status == CodeFlareClusterStatus.UNKNOWN:
+ pretty_print.print_no_resources_found()
+ else:
+ pretty_print.print_app_wrappers_status([appwrapper])
+
return status, ready
+ def wait_ready(self, timeout: Optional[int] = None):
+ """
+ Waits for requested cluster to be ready, up to an optional timeout (s).
+ Checks every five seconds.
+ """
+ print("Waiting for requested resources to be set up...")
+ ready = False
+ status = None
+ time = 0
+ while not ready:
+ status, ready = self.status(print_to_console=False)
+ if status == CodeFlareClusterStatus.UNKNOWN:
+ print(
+ "WARNING: Current cluster status is unknown, have you run cluster.up yet?"
+ )
+ if not ready:
+ if timeout and time >= timeout:
+ raise TimeoutError(f"wait_ready() timed out after waiting {timeout}s")
+ sleep(5)
+ time += 5
+ print("Requested cluster up and running!")
+
+ def details(self, print_to_console: bool = True) -> RayCluster:
+ cluster = _copy_to_ray(self)
+ if print_to_console:
+ pretty_print.print_clusters([cluster])
+ return cluster
+
+ def cluster_uri(self) -> str:
+ """
+ Returns a string containing the cluster's URI.
+ """
+ return f"ray://{self.config.name}-head-svc.{self.config.namespace}.svc:10001"
+
+ def cluster_dashboard_uri(self) -> str:
+ """
+ Returns a string containing the cluster's dashboard URI.
+ """
+ try:
+ with oc.project(self.config.namespace):
+ route = oc.invoke(
+ "get", ["route", "-o", "jsonpath='{$.items[*].spec.host}'"]
+ )
+ route = route.out().split(" ")
+ route = [x for x in route if f"ray-dashboard-{self.config.name}" in x]
+ route = route[0].strip().strip("'")
+ return f"http://{route}"
+ except:
+ return "Dashboard route not available yet, have you run cluster.up()?"
+
def list_jobs(self) -> List:
"""
This method accesses the head ray node in your cluster and lists the running jobs.
"""
- dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace)
+ dashboard_route = self.cluster_dashboard_uri()
client = JobSubmissionClient(dashboard_route)
return client.list_jobs()
@@ -650,7 +832,7 @@ Classes
"""
This method accesses the head ray node in your cluster and returns the job status for the provided job id.
"""
- dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace)
+ dashboard_route = self.cluster_dashboard_uri()
client = JobSubmissionClient(dashboard_route)
return client.get_job_status(job_id)
@@ -658,14 +840,14 @@ Classes
"""
This method accesses the head ray node in your cluster and returns the logs for the provided job id.
"""
- dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace)
+ dashboard_route = self.cluster_dashboard_uri()
client = JobSubmissionClient(dashboard_route)
return client.get_job_logs(job_id)
Methods
-def cluster_dashboard_uri(self, namespace: str = 'default') ‑> str
+def cluster_dashboard_uri(self) ‑> str
-
Returns a string containing the cluster's dashboard URI.
@@ -673,12 +855,12 @@ Methods
Expand source code
-def cluster_dashboard_uri(self, namespace: str = "default") -> str:
+def cluster_dashboard_uri(self) -> str:
"""
Returns a string containing the cluster's dashboard URI.
"""
try:
- with oc.project(namespace):
+ with oc.project(self.config.namespace):
route = oc.invoke(
"get", ["route", "-o", "jsonpath='{$.items[*].spec.host}'"]
)
@@ -687,7 +869,7 @@ Methods
route = route[0].strip().strip("'")
return f"http://{route}"
except:
- return "Dashboard route not available yet. Did you run cluster.up()?"
+ return "Dashboard route not available yet, have you run cluster.up()?"
@@ -751,87 +933,56 @@ Methods
)
-
-def down(self)
+
+def details(self, print_to_console: bool = True) ‑> RayCluster
-
-
Deletes the AppWrapper yaml, scaling-down and deleting all resources
-associated with the cluster.
+
Expand source code
-def down(self):
- """
- Deletes the AppWrapper yaml, scaling-down and deleting all resources
- associated with the cluster.
- """
- namespace = self.config.namespace
- with oc.project(namespace):
- oc.invoke("delete", ["AppWrapper", self.app_wrapper_name])
- self.config.auth.logout()
+def details(self, print_to_console: bool = True) -> RayCluster:
+ cluster = _copy_to_ray(self)
+ if print_to_console:
+ pretty_print.print_clusters([cluster])
+ return cluster
-
-def is_ready(self, print_to_console: bool = True)
+
+def down(self)
-
-
TO BE DEPRECATED: functionality will be added into cluster.status().
+Deletes the AppWrapper yaml, scaling-down and deleting all resources
+associated with the cluster.
Expand source code
-def is_ready(self, print_to_console: bool = True):
+def down(self):
"""
- TO BE DEPRECATED: functionality will be added into cluster.status().
+ Deletes the AppWrapper yaml, scaling-down and deleting all resources
+ associated with the cluster.
"""
- ready = False
- status = CodeFlareClusterStatus.UNKNOWN
- # check the app wrapper status
- appwrapper = _app_wrapper_status(self.config.name, self.config.namespace)
- if appwrapper:
- if appwrapper.status in [
- AppWrapperStatus.RUNNING,
- AppWrapperStatus.COMPLETED,
- AppWrapperStatus.RUNNING_HOLD_COMPLETION,
- ]:
- ready = False
- status = CodeFlareClusterStatus.QUEUED
- elif appwrapper.status in [
- AppWrapperStatus.FAILED,
- AppWrapperStatus.DELETED,
- ]:
- ready = False
- status = CodeFlareClusterStatus.FAILED # should deleted be separate
- return ready, status # exit early, no need to check ray status
- elif appwrapper.status in [AppWrapperStatus.PENDING]:
- ready = False
- status = CodeFlareClusterStatus.QUEUED
- if print_to_console:
- pretty_print.print_app_wrappers_status([appwrapper])
- return (
- ready,
- status,
- ) # no need to check the ray status since still in queue
-
- # check the ray cluster status
- cluster = _ray_cluster_status(self.config.name, self.config.namespace)
- if cluster:
- if cluster.status == RayClusterStatus.READY:
- ready = True
- status = CodeFlareClusterStatus.READY
- elif cluster.status in [
- RayClusterStatus.UNHEALTHY,
- RayClusterStatus.FAILED,
- ]:
- ready = False
- status = CodeFlareClusterStatus.FAILED
-
- if print_to_console:
- # overriding the number of gpus with requested
- cluster.worker_gpu = self.config.gpu
- pretty_print.print_clusters([cluster])
- return status, ready
+ namespace = self.config.namespace
+ try:
+ with oc.project(namespace):
+ oc.invoke("delete", ["AppWrapper", self.app_wrapper_name])
+ except oc.OpenShiftPythonException as osp: # pragma: no cover
+ error_msg = osp.result.err()
+ if (
+ 'the server doesn\'t have a resource type "AppWrapper"' in error_msg
+ or "forbidden" in error_msg
+ or "Unauthorized" in error_msg
+ or "Missing or incomplete configuration" in error_msg
+ ):
+ raise PermissionError(
+ "Action not permitted, have you run cluster.up() yet?"
+ )
+ elif "not found" in error_msg:
+ print("Cluster not found, have you run cluster.up() yet?")
+ else:
+ raise osp
@@ -847,7 +998,7 @@ Methods
"""
This method accesses the head ray node in your cluster and returns the logs for the provided job id.
"""
- dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace)
+ dashboard_route = self.cluster_dashboard_uri()
client = JobSubmissionClient(dashboard_route)
return client.get_job_logs(job_id)
@@ -865,7 +1016,7 @@ Methods
"""
This method accesses the head ray node in your cluster and returns the job status for the provided job id.
"""
- dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace)
+ dashboard_route = self.cluster_dashboard_uri()
client = JobSubmissionClient(dashboard_route)
return client.get_job_status(job_id)
@@ -883,39 +1034,81 @@ Methods
"""
This method accesses the head ray node in your cluster and lists the running jobs.
"""
- dashboard_route = self.cluster_dashboard_uri(namespace=self.config.namespace)
+ dashboard_route = self.cluster_dashboard_uri()
client = JobSubmissionClient(dashboard_route)
return client.list_jobs()
-def status(self, print_to_console: bool = True)
+def status(self, print_to_console: bool = True) ‑> Tuple[CodeFlareClusterStatus, bool]
-
-
TO BE UPDATED: Will soon return (and print by default) the cluster's
-status, from AppWrapper submission to setup completion. All resource
-details will be moved to cluster.details().
+Returns the requested cluster's status, as well as whether or not
+it is ready for use.
Expand source code
-def status(self, print_to_console: bool = True):
+def status(
+ self, print_to_console: bool = True
+) -> Tuple[CodeFlareClusterStatus, bool]:
"""
- TO BE UPDATED: Will soon return (and print by default) the cluster's
- status, from AppWrapper submission to setup completion. All resource
- details will be moved to cluster.details().
+ Returns the requested cluster's status, as well as whether or not
+ it is ready for use.
"""
+ ready = False
+ status = CodeFlareClusterStatus.UNKNOWN
+ # check the app wrapper status
+ appwrapper = _app_wrapper_status(self.config.name, self.config.namespace)
+ if appwrapper:
+ if appwrapper.status in [
+ AppWrapperStatus.RUNNING,
+ AppWrapperStatus.COMPLETED,
+ AppWrapperStatus.RUNNING_HOLD_COMPLETION,
+ ]:
+ ready = False
+ status = CodeFlareClusterStatus.STARTING
+ elif appwrapper.status in [
+ AppWrapperStatus.FAILED,
+ AppWrapperStatus.DELETED,
+ ]:
+ ready = False
+ status = CodeFlareClusterStatus.FAILED # should deleted be separate
+ return status, ready # exit early, no need to check ray status
+ elif appwrapper.status in [AppWrapperStatus.PENDING]:
+ ready = False
+ status = CodeFlareClusterStatus.QUEUED
+ if print_to_console:
+ pretty_print.print_app_wrappers_status([appwrapper])
+ return (
+ status,
+ ready,
+ ) # no need to check the ray status since still in queue
+
+ # check the ray cluster status
cluster = _ray_cluster_status(self.config.name, self.config.namespace)
- if cluster:
- # overriding the number of gpus with requested
- cluster.worker_gpu = self.config.gpu
- if print_to_console:
- pretty_print.print_clusters([cluster])
- return cluster.status
- else:
+ if cluster and not cluster.status == RayClusterStatus.UNKNOWN:
+ if cluster.status == RayClusterStatus.READY:
+ ready = True
+ status = CodeFlareClusterStatus.READY
+ elif cluster.status in [
+ RayClusterStatus.UNHEALTHY,
+ RayClusterStatus.FAILED,
+ ]:
+ ready = False
+ status = CodeFlareClusterStatus.FAILED
+
if print_to_console:
+ # overriding the number of gpus with requested
+ cluster.worker_gpu = self.config.gpu
+ pretty_print.print_cluster_status(cluster)
+ elif print_to_console:
+ if status == CodeFlareClusterStatus.UNKNOWN:
pretty_print.print_no_resources_found()
- return None
+ else:
+ pretty_print.print_app_wrappers_status([appwrapper])
+
+ return status, ready
@@ -933,10 +1126,50 @@ Methods
Applies the AppWrapper yaml, pushing the resource request onto
the MCAD queue.
"""
- self.config.auth.login()
namespace = self.config.namespace
- with oc.project(namespace):
- oc.invoke("apply", ["-f", self.app_wrapper_yaml])
+ try:
+ with oc.project(namespace):
+ oc.invoke("apply", ["-f", self.app_wrapper_yaml])
+ except oc.OpenShiftPythonException as osp: # pragma: no cover
+ error_msg = osp.result.err()
+ if "Unauthorized" in error_msg:
+ raise PermissionError(
+ "Action not permitted, have you put in correct/up-to-date auth credentials?"
+ )
+ raise osp
+
+
+
+def wait_ready(self, timeout: Optional[int] = None)
+
+-
+
Waits for requested cluster to be ready, up to an optional timeout (s).
+Checks every five seconds.
+
+
+Expand source code
+
+def wait_ready(self, timeout: Optional[int] = None):
+ """
+ Waits for requested cluster to be ready, up to an optional timeout (s).
+ Checks every five seconds.
+ """
+ print("Waiting for requested resources to be set up...")
+ ready = False
+ status = None
+ time = 0
+ while not ready:
+ status, ready = self.status(print_to_console=False)
+ if status == CodeFlareClusterStatus.UNKNOWN:
+ print(
+ "WARNING: Current cluster status is unknown, have you run cluster.up yet?"
+ )
+ if not ready:
+ if timeout and time >= timeout:
+ raise TimeoutError(f"wait_ready() timed out after waiting {timeout}s")
+ sleep(5)
+ time += 5
+ print("Requested cluster up and running!")
@@ -970,13 +1203,14 @@ cluster_dashboard_uri
cluster_uri
create_app_wrapper
+details
down
-is_ready
job_logs
job_status
list_jobs
status
up
+wait_ready
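
Editor's note: a sketch of how the reworked methods compose, assuming `cluster` was created as in the notebooks above and `cluster.up()` has already been called:

from codeflare_sdk.cluster.model import CodeFlareClusterStatus

status, ready = cluster.status(print_to_console=False)  # note the new (status, ready) order
if status == CodeFlareClusterStatus.UNKNOWN:
    print("No resources found, have you run cluster.up() yet?")
elif not ready:
    cluster.wait_ready(timeout=300)  # polls every 5 s; raises TimeoutError after ~300 s

ray_cluster = cluster.details(print_to_console=False)  # requested specs as a RayCluster
print(ray_cluster.dashboard)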
diff --git a/docs/cluster/config.html b/docs/cluster/config.html
index 476ab6fce..e5c95c220 100644
--- a/docs/cluster/config.html
+++ b/docs/cluster/config.html
@@ -78,8 +78,7 @@ Module codeflare_sdk.cluster.config
template: str = f"{dir}/templates/new-template.yaml"
instascale: bool = False
envs: dict = field(default_factory=dict)
- image: str = "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
- auth: Authentication = Authentication()
+ image: str = "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
@@ -93,7 +92,7 @@ Classes
class ClusterConfiguration
-(name: str, namespace: str = 'default', head_info: list = <factory>, machine_types: list = <factory>, min_cpus: int = 1, max_cpus: int = 1, min_worker: int = 1, max_worker: int = 1, min_memory: int = 2, max_memory: int = 2, gpu: int = 0, template: str = '/home/meyceoz/Documents/codeflare-sdk/src/codeflare_sdk/templates/new-template.yaml', instascale: bool = False, envs: dict = <factory>, image: str = 'ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103', auth: Authentication = <codeflare_sdk.cluster.auth.Authentication object>)
+(name: str, namespace: str = 'default', head_info: list = <factory>, machine_types: list = <factory>, min_cpus: int = 1, max_cpus: int = 1, min_worker: int = 1, max_worker: int = 1, min_memory: int = 2, max_memory: int = 2, gpu: int = 0, template: str = '/home/meyceoz/Documents/codeflare-sdk/src/codeflare_sdk/templates/new-template.yaml', instascale: bool = False, envs: dict = <factory>, image: str = 'ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103')
-
This dataclass is used to specify resource requirements and other details, and
@@ -122,15 +121,10 @@
Classes
template: str = f"{dir}/templates/new-template.yaml"
instascale: bool = False
envs: dict = field(default_factory=dict)
- image: str = "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
- auth: Authentication = Authentication()
+ image: str = "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
Class variables
-var auth : Authentication
--
-
-
var envs : dict
-
@@ -212,7 +206,6 @@
Index
-
ClusterConfiguration
-auth
envs
gpu
head_info
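
Editor's note: since `auth` is no longer a `ClusterConfiguration` field, authentication becomes a separate, explicit step. A sketch mirroring the updated notebooks (token, server, and sizing values are illustrative):

from codeflare_sdk.cluster.auth import TokenAuthentication
from codeflare_sdk.cluster.cluster import Cluster
from codeflare_sdk.cluster.config import ClusterConfiguration

auth = TokenAuthentication(token="XXXX", server="XXXX", skip_tls=True)
auth.login()  # must be called explicitly now; cluster.up() no longer logs in for you

cluster = Cluster(ClusterConfiguration(
    name="hfgputest",
    min_worker=1, max_worker=1,
    min_cpus=8, max_cpus=8,
    min_memory=16, max_memory=16,
    gpu=4,
    instascale=True,
    machine_types=["m5.xlarge", "p3.8xlarge"],
))  # note: no auth= keyword any more

cluster.up()
# ... run workloads ...
cluster.down()
auth.logout()  # likewise an explicit step; cluster.down() no longer logs out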
diff --git a/docs/cluster/model.html b/docs/cluster/model.html
index 42cefda09..2a4615559 100644
--- a/docs/cluster/model.html
+++ b/docs/cluster/model.html
@@ -86,9 +86,10 @@ Module codeflare_sdk.cluster.model
"""
READY = 1
- QUEUED = 2
- FAILED = 3
- UNKNOWN = 4
+ STARTING = 2
+ QUEUED = 3
+ FAILED = 4
+ UNKNOWN = 5
@dataclass
@@ -240,9 +241,10 @@ Class variables
"""
READY = 1
- QUEUED = 2
- FAILED = 3
- UNKNOWN = 4
+ STARTING = 2
+ QUEUED = 3
+ FAILED = 4
+ UNKNOWN = 5
Ancestors
@@ -262,6 +264,10 @@ Class variables
-
+var STARTING
+-
+
+
var UNKNOWN
-
@@ -425,6 +431,7 @@
FAILED
QUEUED
READY
+STARTING
UNKNOWN
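
Editor's note: the renumbering means callers should compare against enum members, never raw integers. A quick illustration:

from codeflare_sdk.cluster.model import CodeFlareClusterStatus

# STARTING slots in at 2, shifting QUEUED/FAILED/UNKNOWN up by one --
# code comparing against the old integer values would silently break.
assert CodeFlareClusterStatus.READY.value == 1
assert CodeFlareClusterStatus.STARTING.value == 2
assert CodeFlareClusterStatus.QUEUED.value == 3
assert CodeFlareClusterStatus.UNKNOWN.value == 5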
diff --git a/docs/utils/generate_yaml.html b/docs/utils/generate_yaml.html
index b2189e519..aacd0804a 100644
--- a/docs/utils/generate_yaml.html
+++ b/docs/utils/generate_yaml.html
@@ -271,7 +271,7 @@ Module codeflare_sdk.utils.generate_yaml
return outfile
-def main():
+def main(): # pragma: no cover
parser = argparse.ArgumentParser(description="Generate user AppWrapper")
parser.add_argument(
"--name",
@@ -379,7 +379,7 @@ Module codeflare_sdk.utils.generate_yaml
return outfile
-if __name__ == "__main__":
+if __name__ == "__main__": # pragma: no cover
main()
@@ -471,7 +471,7 @@ Functions
Expand source code
-def main():
+def main(): # pragma: no cover
parser = argparse.ArgumentParser(description="Generate user AppWrapper")
parser.add_argument(
"--name",
diff --git a/docs/utils/pretty_print.html b/docs/utils/pretty_print.html
index 013d6e248..5da3d5b36 100644
--- a/docs/utils/pretty_print.html
+++ b/docs/utils/pretty_print.html
@@ -60,7 +60,7 @@ Module codeflare_sdk.utils.pretty_print
def print_no_resources_found():
console = Console()
- console.print(Panel("[red]No resources found"))
+ console.print(Panel("[red]No resources found, have you run cluster.up() yet?"))
def print_app_wrappers_status(app_wrappers: List[AppWrapper]):
@@ -69,22 +69,66 @@ Module codeflare_sdk.utils.pretty_print
return # shortcircuit
console = Console()
+ table = Table(
+ box=box.ASCII_DOUBLE_HEAD,
+ title="[bold] :rocket: Cluster Queue Status :rocket:",
+ )
+ table.add_column("Name", style="cyan", no_wrap=True)
+ table.add_column("Status", style="magenta")
+
for app_wrapper in app_wrappers:
name = app_wrapper.name
status = app_wrapper.status.value
-
- table = Table(
- box=box.ASCII_DOUBLE_HEAD,
- title="[bold] :rocket: List of CodeFlare clusters in queue:rocket:",
- )
- table.add_column("Name", style="cyan", no_wrap=True)
- table.add_column("Status", style="magenta")
table.add_row(name, status)
table.add_row("") # empty row for spacing
-        console.print(Panel.fit(table))
+    console.print(Panel.fit(table))
+
+
+def print_cluster_status(cluster: RayCluster):
+ "Pretty prints the status of a passed-in cluster"
+ if not cluster:
+ print_no_resources_found()
+ return
-def print_clusters(clusters: List[RayCluster], verbose=True):
+ console = Console()
+ status = (
+ "Active :white_heavy_check_mark:"
+ if cluster.status == RayClusterStatus.READY
+ else "Inactive :x:"
+ )
+ name = cluster.name
+ dashboard = cluster.dashboard
+ # owned = bool(cluster["userOwned"])
+ owned = True
+
+ #'table0' to display the cluster name, status, url, and dashboard link
+ table0 = Table(box=None, show_header=False)
+ if owned:
+ table0.add_row("[white on green][bold]Name")
+ else:
+ table0.add_row("")
+ table0.add_row("[bold underline]" + name, status)
+ table0.add_row()
+ # fixme: hardcoded to default for now
+ table0.add_row(
+ f"[bold]URI:[/bold] ray://{cluster.name}-head-svc.{cluster.namespace}.svc:10001"
+ ) # format that is used to generate the name of the service
+ table0.add_row()
+ table0.add_row(f"[link={dashboard} blue underline]Dashboard:link:[/link]")
+ table0.add_row("") # empty row for spacing
+
+ # table4 wraps table0 (layout kept consistent with print_clusters)
+ table4 = Table(box=None, show_header=False)
+ table4.add_row(table0)
+
+ # Encompass all details of the cluster in a single panel
+ table5 = Table(box=None, title="[bold] :rocket: CodeFlare Cluster Status :rocket:")
+ table5.add_row(Panel.fit(table4))
+ console.print(table5)
+
+
+def print_clusters(clusters: List[RayCluster]):
if not clusters:
print_no_resources_found()
return # shortcircuit
@@ -96,13 +140,13 @@ Module codeflare_sdk.utils.pretty_print
status = (
"Active :white_heavy_check_mark:"
if cluster.status == RayClusterStatus.READY
- else "InActive :x:"
+ else "Inactive :x:"
)
name = cluster.name
dashboard = cluster.dashboard
mincount = str(cluster.min_workers)
maxcount = str(cluster.max_workers)
- memory = cluster.worker_mem_min + "~" + cluster.worker_mem_max
+ memory = str(cluster.worker_mem_min) + "~" + str(cluster.worker_mem_max)
cpu = str(cluster.worker_cpu)
gpu = str(cluster.worker_gpu)
# owned = bool(cluster["userOwned"])
@@ -111,7 +155,7 @@ Module codeflare_sdk.utils.pretty_print
#'table0' to display the cluster name, status, url, and dashboard link
table0 = Table(box=None, show_header=False)
if owned:
- table0.add_row("[white on green][bold]Owner")
+ table0.add_row("[white on green][bold]Name")
else:
table0.add_row("")
table0.add_row("[bold underline]" + name, status)
@@ -162,7 +206,7 @@ Module codeflare_sdk.utils.pretty_print
# than being center aligned on the console/terminal if we simply use console.print(title)
table5 = Table(
- box=None, title="[bold] :rocket: List of CodeFlare clusters :rocket:"
+ box=None, title="[bold] :rocket: CodeFlare Cluster Details :rocket:"
)
table5.add_row(Panel.fit(table4))
console.print(table5)
@@ -193,23 +237,76 @@ Functions
return # shortcircuit
console = Console()
+ table = Table(
+ box=box.ASCII_DOUBLE_HEAD,
+ title="[bold] :rocket: Cluster Queue Status :rocket:",
+ )
+ table.add_column("Name", style="cyan", no_wrap=True)
+ table.add_column("Status", style="magenta")
+
for app_wrapper in app_wrappers:
name = app_wrapper.name
status = app_wrapper.status.value
-
- table = Table(
- box=box.ASCII_DOUBLE_HEAD,
- title="[bold] :rocket: List of CodeFlare clusters in queue:rocket:",
- )
- table.add_column("Name", style="cyan", no_wrap=True)
- table.add_column("Status", style="magenta")
table.add_row(name, status)
table.add_row("") # empty row for spacing
-        console.print(Panel.fit(table))
+
+    console.print(Panel.fit(table))
+
+
+
+def print_cluster_status(cluster: RayCluster)
+
+-
+
Pretty prints the status of a passed-in cluster
+
+
+Expand source code
+
+def print_cluster_status(cluster: RayCluster):
+ "Pretty prints the status of a passed-in cluster"
+ if not cluster:
+ print_no_resources_found()
+ return
+
+ console = Console()
+ status = (
+ "Active :white_heavy_check_mark:"
+ if cluster.status == RayClusterStatus.READY
+ else "Inactive :x:"
+ )
+ name = cluster.name
+ dashboard = cluster.dashboard
+ # owned = bool(cluster["userOwned"])
+ owned = True
+
+ #'table0' to display the cluster name, status, url, and dashboard link
+ table0 = Table(box=None, show_header=False)
+ if owned:
+ table0.add_row("[white on green][bold]Name")
+ else:
+ table0.add_row("")
+ table0.add_row("[bold underline]" + name, status)
+ table0.add_row()
+ # fixme: hardcoded to default for now
+ table0.add_row(
+ f"[bold]URI:[/bold] ray://{cluster.name}-head-svc.{cluster.namespace}.svc:10001"
+ ) # format that is used to generate the name of the service
+ table0.add_row()
+ table0.add_row(f"[link={dashboard} blue underline]Dashboard:link:[/link]")
+ table0.add_row("") # empty row for spacing
+
+ # table4 wraps table0 (layout kept consistent with print_clusters)
+ table4 = Table(box=None, show_header=False)
+ table4.add_row(table0)
+
+ # Encompass all details of the cluster in a single panel
+ table5 = Table(box=None, title="[bold] :rocket: CodeFlare Cluster Status :rocket:")
+ table5.add_row(Panel.fit(table4))
+ console.print(table5)
-def print_clusters(clusters: List[RayCluster], verbose=True)
+def print_clusters(clusters: List[RayCluster])
-
@@ -217,7 +314,7 @@
Functions
Expand source code
-def print_clusters(clusters: List[RayCluster], verbose=True):
+def print_clusters(clusters: List[RayCluster]):
if not clusters:
print_no_resources_found()
return # shortcircuit
@@ -229,13 +326,13 @@ Functions
status = (
"Active :white_heavy_check_mark:"
if cluster.status == RayClusterStatus.READY
- else "InActive :x:"
+ else "Inactive :x:"
)
name = cluster.name
dashboard = cluster.dashboard
mincount = str(cluster.min_workers)
maxcount = str(cluster.max_workers)
- memory = cluster.worker_mem_min + "~" + cluster.worker_mem_max
+ memory = str(cluster.worker_mem_min) + "~" + str(cluster.worker_mem_max)
cpu = str(cluster.worker_cpu)
gpu = str(cluster.worker_gpu)
# owned = bool(cluster["userOwned"])
@@ -244,7 +341,7 @@ Functions
#'table0' to display the cluster name, status, url, and dashboard link
table0 = Table(box=None, show_header=False)
if owned:
- table0.add_row("[white on green][bold]Owner")
+ table0.add_row("[white on green][bold]Name")
else:
table0.add_row("")
table0.add_row("[bold underline]" + name, status)
@@ -295,7 +392,7 @@ Functions
# than being center aligned on the console/terminal if we simply use console.print(title)
table5 = Table(
- box=None, title="[bold] :rocket: List of CodeFlare clusters :rocket:"
+ box=None, title="[bold] :rocket: CodeFlare Cluster Details :rocket:"
)
table5.add_row(Panel.fit(table4))
console.print(table5)
@@ -315,7 +412,7 @@ Functions
def print_no_resources_found():
console = Console()
- console.print(Panel("[red]No resources found"))
+ console.print(Panel("[red]No resources found, have you run cluster.up() yet?"))
@@ -337,6 +434,7 @@ Index
Functions
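
Editor's note: a sketch of driving the new single-cluster printer directly; the field values below are hypothetical, matching the RayCluster fields used in `_copy_to_ray`:

from codeflare_sdk.cluster.model import RayCluster, RayClusterStatus
from codeflare_sdk.utils.pretty_print import print_cluster_status

demo = RayCluster(  # hypothetical values, for illustration only
    name="mnisttest",
    status=RayClusterStatus.READY,
    min_workers=2,
    max_workers=2,
    worker_mem_min=16,
    worker_mem_max=16,
    worker_cpu=8,
    worker_gpu=4,
    namespace="default",
    dashboard="http://ray-dashboard-mnisttest.apps.example.com",
)
print_cluster_status(demo)  # renders the ":rocket: CodeFlare Cluster Status :rocket:" panel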
diff --git a/pyproject.toml b/pyproject.toml
index 408a49d9d..9312611ce 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "codeflare-sdk"
-version = "0.2.2"
+version = "0.3.0"
description = "Python SDK for codeflare client"
license = "Apache-2.0"