From e086e3f423b2cd74d646d2c745be5acfb788f094 Mon Sep 17 00:00:00 2001
From: codeflare-machine-account
Date: Mon, 23 Oct 2023 20:08:24 +0000
Subject: [PATCH] Changes in docs for release: v0.10.1

---
 docs/cluster/cluster.html        | 569 +++++++++++++++++++++----------
 docs/cluster/config.html         |   9 +-
 docs/job/jobs.html               |  11 +-
 docs/utils/generate_yaml.html    |  50 ++-
 docs/utils/kube_api_helpers.html |  10 +-
 5 files changed, 450 insertions(+), 199 deletions(-)

diff --git a/docs/cluster/cluster.html b/docs/cluster/cluster.html
index 7b391787d..e33b9cc2f 100644
--- a/docs/cluster/cluster.html
+++ b/docs/cluster/cluster.html
@@ -102,7 +102,7 @@

Module codeflare_sdk.cluster.cluster

 self.config = config
 self.app_wrapper_yaml = self.create_app_wrapper()
 self.app_wrapper_name = self.app_wrapper_yaml.split(".")[0]
-self._client = None
+self._job_submission_client = None

 @property
 def _client_headers(self):
@@ -118,23 +118,25 @@

Module codeflare_sdk.cluster.cluster

 return not self.config.openshift_oauth

 @property
-def client(self):
-    if self._client:
-        return self._client
+def job_client(self):
+    if self._job_submission_client:
+        return self._job_submission_client
     if self.config.openshift_oauth:
         print(
             api_config_handler().configuration.get_api_key_with_prefix(
                 "authorization"
             )
         )
-        self._client = JobSubmissionClient(
+        self._job_submission_client = JobSubmissionClient(
             self.cluster_dashboard_uri(),
             headers=self._client_headers,
             verify=self._client_verify_tls,
         )
     else:
-        self._client = JobSubmissionClient(self.cluster_dashboard_uri())
-    return self._client
+        self._job_submission_client = JobSubmissionClient(
+            self.cluster_dashboard_uri()
+        )
+    return self._job_submission_client

 def evaluate_dispatch_priority(self):
     priority_class = self.config.dispatch_priority
@@ -173,6 +175,10 @@
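For orientation, the renamed property above lazily builds Ray's JobSubmissionClient against the cluster dashboard and caches it. A minimal usage sketch, assuming an already-running cluster and a reachable dashboard (the cluster name and entrypoint are illustrative):

    from codeflare_sdk.cluster.cluster import get_cluster

    cluster = get_cluster("demo", "default")  # hypothetical existing cluster
    # first access constructs the JobSubmissionClient; later accesses reuse
    # the cached _job_submission_client
    job_id = cluster.job_client.submit_job(entrypoint="python script.py")
    print(job_id)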

Module codeflare_sdk.cluster.cluster

 # Before attempting to create the cluster AW, let's evaluate the ClusterConfig
 if self.config.dispatch_priority:
+    if not self.config.mcad:
+        raise ValueError(
+            "Invalid Cluster Configuration, cannot have dispatch priority without MCAD"
+        )
     priority_val = self.evaluate_dispatch_priority()
     if priority_val == None:
         raise ValueError(
@@ -195,6 +201,7 @@
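This guard encodes that dispatch priority is an MCAD scheduling concept, so combining `dispatch_priority` with `mcad=False` now fails fast at configuration time. A sketch of the failure path (names are illustrative):

    from codeflare_sdk.cluster.cluster import Cluster
    from codeflare_sdk.cluster.config import ClusterConfiguration

    try:
        # create_app_wrapper() runs in Cluster.__init__, so the ValueError
        # surfaces before anything touches the Kubernetes API
        Cluster(ClusterConfiguration(
            name="demo",
            namespace="default",
            dispatch_priority="high",  # assumed priority class name
            mcad=False,
        ))
    except ValueError as err:
        print(err)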

Module codeflare_sdk.cluster.cluster

 template = self.config.template
 image = self.config.image
 instascale = self.config.instascale
+mcad = self.config.mcad
 instance_types = self.config.machine_types
 env = self.config.envs
 local_interactive = self.config.local_interactive
@@ -215,6 +222,7 @@

Module codeflare_sdk.cluster.cluster

 template=template,
 image=image,
 instascale=instascale,
+mcad=mcad,
 instance_types=instance_types,
 env=env,
 local_interactive=local_interactive,
@@ -239,15 +247,18 @@

Module codeflare_sdk.cluster.cluster

 try:
     config_check()
     api_instance = client.CustomObjectsApi(api_config_handler())
-    with open(self.app_wrapper_yaml) as f:
-        aw = yaml.load(f, Loader=yaml.FullLoader)
-    api_instance.create_namespaced_custom_object(
-        group="workload.codeflare.dev",
-        version="v1beta1",
-        namespace=namespace,
-        plural="appwrappers",
-        body=aw,
-    )
+    if self.config.mcad:
+        with open(self.app_wrapper_yaml) as f:
+            aw = yaml.load(f, Loader=yaml.FullLoader)
+        api_instance.create_namespaced_custom_object(
+            group="workload.codeflare.dev",
+            version="v1beta1",
+            namespace=namespace,
+            plural="appwrappers",
+            body=aw,
+        )
+    else:
+        self._component_resources_up(namespace, api_instance)
 except Exception as e:  # pragma: no cover
     return _kube_api_error_handling(e)
@@ -260,13 +271,16 @@

Module codeflare_sdk.cluster.cluster

 try:
     config_check()
     api_instance = client.CustomObjectsApi(api_config_handler())
-    api_instance.delete_namespaced_custom_object(
-        group="workload.codeflare.dev",
-        version="v1beta1",
-        namespace=namespace,
-        plural="appwrappers",
-        name=self.app_wrapper_name,
-    )
+    if self.config.mcad:
+        api_instance.delete_namespaced_custom_object(
+            group="workload.codeflare.dev",
+            version="v1beta1",
+            namespace=namespace,
+            plural="appwrappers",
+            name=self.app_wrapper_name,
+        )
+    else:
+        self._component_resources_down(namespace, api_instance)
 except Exception as e:  # pragma: no cover
     return _kube_api_error_handling(e)
@@ -284,42 +298,46 @@
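With both `up()` and `down()` now branching on `config.mcad`, the lifecycle calls stay the same in either mode; only the objects they manage differ (one AppWrapper versus the individual RayCluster/Route/Secret components). A sketch:

    from codeflare_sdk.cluster.cluster import Cluster
    from codeflare_sdk.cluster.config import ClusterConfiguration

    cluster = Cluster(ClusterConfiguration(name="demo", namespace="default", mcad=False))
    cluster.up()    # applies component resources via _component_resources_up
    # ... run workloads ...
    cluster.down()  # removes the same components via _component_resources_down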

Module codeflare_sdk.cluster.cluster

""" ready = False status = CodeFlareClusterStatus.UNKNOWN - # check the app wrapper status - appwrapper = _app_wrapper_status(self.config.name, self.config.namespace) - if appwrapper: - if appwrapper.status in [ - AppWrapperStatus.RUNNING, - AppWrapperStatus.COMPLETED, - AppWrapperStatus.RUNNING_HOLD_COMPLETION, - ]: - ready = False - status = CodeFlareClusterStatus.STARTING - elif appwrapper.status in [ - AppWrapperStatus.FAILED, - AppWrapperStatus.DELETED, - ]: - ready = False - status = CodeFlareClusterStatus.FAILED # should deleted be separate - return status, ready # exit early, no need to check ray status - elif appwrapper.status in [ - AppWrapperStatus.PENDING, - AppWrapperStatus.QUEUEING, - ]: - ready = False - if appwrapper.status == AppWrapperStatus.PENDING: - status = CodeFlareClusterStatus.QUEUED - else: - status = CodeFlareClusterStatus.QUEUEING - if print_to_console: - pretty_print.print_app_wrappers_status([appwrapper]) - return ( - status, - ready, - ) # no need to check the ray status since still in queue + if self.config.mcad: + # check the app wrapper status + appwrapper = _app_wrapper_status(self.config.name, self.config.namespace) + if appwrapper: + if appwrapper.status in [ + AppWrapperStatus.RUNNING, + AppWrapperStatus.COMPLETED, + AppWrapperStatus.RUNNING_HOLD_COMPLETION, + ]: + ready = False + status = CodeFlareClusterStatus.STARTING + elif appwrapper.status in [ + AppWrapperStatus.FAILED, + AppWrapperStatus.DELETED, + ]: + ready = False + status = CodeFlareClusterStatus.FAILED # should deleted be separate + return status, ready # exit early, no need to check ray status + elif appwrapper.status in [ + AppWrapperStatus.PENDING, + AppWrapperStatus.QUEUEING, + ]: + ready = False + if appwrapper.status == AppWrapperStatus.PENDING: + status = CodeFlareClusterStatus.QUEUED + else: + status = CodeFlareClusterStatus.QUEUEING + if print_to_console: + pretty_print.print_app_wrappers_status([appwrapper]) + return ( + status, + ready, + ) # no need to check the ray status since still in queue # check the ray cluster status cluster = _ray_cluster_status(self.config.name, self.config.namespace) - if cluster and not cluster.status == RayClusterStatus.UNKNOWN: + if cluster: + if cluster.status == RayClusterStatus.UNKNOWN: + ready = False + status = CodeFlareClusterStatus.STARTING if cluster.status == RayClusterStatus.READY: ready = True status = CodeFlareClusterStatus.READY @@ -439,19 +457,19 @@

Module codeflare_sdk.cluster.cluster

""" This method accesses the head ray node in your cluster and lists the running jobs. """ - return self.client.list_jobs() + return self.job_client.list_jobs() def job_status(self, job_id: str) -> str: """ This method accesses the head ray node in your cluster and returns the job status for the provided job id. """ - return self.client.get_job_status(job_id) + return self.job_client.get_job_status(job_id) def job_logs(self, job_id: str) -> str: """ This method accesses the head ray node in your cluster and returns the logs for the provided job id. """ - return self.client.get_job_logs(job_id) + return self.job_client.get_job_logs(job_id) def torchx_config( self, working_dir: str = None, requirements: str = None @@ -467,7 +485,7 @@

Module codeflare_sdk.cluster.cluster

to_return["requirements"] = requirements return to_return - def from_k8_cluster_object(rc): + def from_k8_cluster_object(rc, mcad=True): machine_types = ( rc["metadata"]["labels"]["orderedinstance"].split("_") if "orderedinstance" in rc["metadata"]["labels"] @@ -506,6 +524,7 @@

Module codeflare_sdk.cluster.cluster

0 ]["image"], local_interactive=local_interactive, + mcad=mcad, ) return Cluster(cluster_config) @@ -516,6 +535,66 @@

Module codeflare_sdk.cluster.cluster

else: return "None" + def _component_resources_up( + self, namespace: str, api_instance: client.CustomObjectsApi + ): + with open(self.app_wrapper_yaml) as f: + yamls = yaml.load_all(f, Loader=yaml.FullLoader) + for resource in yamls: + if resource["kind"] == "RayCluster": + api_instance.create_namespaced_custom_object( + group="ray.io", + version="v1alpha1", + namespace=namespace, + plural="rayclusters", + body=resource, + ) + elif resource["kind"] == "Route": + api_instance.create_namespaced_custom_object( + group="route.openshift.io", + version="v1", + namespace=namespace, + plural="routes", + body=resource, + ) + elif resource["kind"] == "Secret": + secret_instance = client.CoreV1Api(api_config_handler()) + secret_instance.create_namespaced_secret( + namespace=namespace, + body=resource, + ) + + def _component_resources_down( + self, namespace: str, api_instance: client.CustomObjectsApi + ): + with open(self.app_wrapper_yaml) as f: + yamls = yaml.load_all(f, Loader=yaml.FullLoader) + for resource in yamls: + if resource["kind"] == "RayCluster": + api_instance.delete_namespaced_custom_object( + group="ray.io", + version="v1alpha1", + namespace=namespace, + plural="rayclusters", + name=self.app_wrapper_name, + ) + elif resource["kind"] == "Route": + name = resource["metadata"]["name"] + api_instance.delete_namespaced_custom_object( + group="route.openshift.io", + version="v1", + namespace=namespace, + plural="routes", + name=name, + ) + elif resource["kind"] == "Secret": + name = resource["metadata"]["name"] + secret_instance = client.CoreV1Api(api_config_handler()) + secret_instance.delete_namespaced_secret( + namespace=namespace, + name=name, + ) + def list_all_clusters(namespace: str, print_to_console: bool = True): """ @@ -581,13 +660,33 @@

Module codeflare_sdk.cluster.cluster

for rc in rcs["items"]: if rc["metadata"]["name"] == cluster_name: - return Cluster.from_k8_cluster_object(rc) + mcad = _check_aw_exists(cluster_name, namespace) + return Cluster.from_k8_cluster_object(rc, mcad=mcad) raise FileNotFoundError( f"Cluster {cluster_name} is not found in {namespace} namespace" ) # private methods +def _check_aw_exists(name: str, namespace: str) -> bool: + try: + config_check() + api_instance = client.CustomObjectsApi(api_config_handler()) + aws = api_instance.list_namespaced_custom_object( + group="workload.codeflare.dev", + version="v1beta1", + namespace=namespace, + plural="appwrappers", + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e, print_error=False) + + for aw in aws["items"]: + if aw["metadata"]["name"] == name: + return True + return False + + def _get_ingress_domain(): try: config_check() @@ -692,6 +791,7 @@

Module codeflare_sdk.cluster.cluster

 config_check()
 api_instance = client.CustomObjectsApi(api_config_handler())
+# UPDATE THIS
 routes = api_instance.list_namespaced_custom_object(
     group="route.openshift.io",
     version="v1",
@@ -806,7 +906,8 @@

Functions

for rc in rcs["items"]: if rc["metadata"]["name"] == cluster_name: - return Cluster.from_k8_cluster_object(rc) + mcad = _check_aw_exists(cluster_name, namespace) + return Cluster.from_k8_cluster_object(rc, mcad=mcad) raise FileNotFoundError( f"Cluster {cluster_name} is not found in {namespace} namespace" ) @@ -930,7 +1031,7 @@

Classes

 self.config = config
 self.app_wrapper_yaml = self.create_app_wrapper()
 self.app_wrapper_name = self.app_wrapper_yaml.split(".")[0]
-self._client = None
+self._job_submission_client = None

 @property
 def _client_headers(self):
@@ -946,23 +1047,25 @@

Classes

 return not self.config.openshift_oauth

 @property
-def client(self):
-    if self._client:
-        return self._client
+def job_client(self):
+    if self._job_submission_client:
+        return self._job_submission_client
     if self.config.openshift_oauth:
         print(
             api_config_handler().configuration.get_api_key_with_prefix(
                 "authorization"
             )
         )
-        self._client = JobSubmissionClient(
+        self._job_submission_client = JobSubmissionClient(
             self.cluster_dashboard_uri(),
             headers=self._client_headers,
             verify=self._client_verify_tls,
         )
     else:
-        self._client = JobSubmissionClient(self.cluster_dashboard_uri())
-    return self._client
+        self._job_submission_client = JobSubmissionClient(
+            self.cluster_dashboard_uri()
+        )
+    return self._job_submission_client

 def evaluate_dispatch_priority(self):
     priority_class = self.config.dispatch_priority
@@ -1001,6 +1104,10 @@

Classes

 # Before attempting to create the cluster AW, let's evaluate the ClusterConfig
 if self.config.dispatch_priority:
+    if not self.config.mcad:
+        raise ValueError(
+            "Invalid Cluster Configuration, cannot have dispatch priority without MCAD"
+        )
     priority_val = self.evaluate_dispatch_priority()
     if priority_val == None:
         raise ValueError(
@@ -1023,6 +1130,7 @@

Classes

 template = self.config.template
 image = self.config.image
 instascale = self.config.instascale
+mcad = self.config.mcad
 instance_types = self.config.machine_types
 env = self.config.envs
 local_interactive = self.config.local_interactive
@@ -1043,6 +1151,7 @@

Classes

 template=template,
 image=image,
 instascale=instascale,
+mcad=mcad,
 instance_types=instance_types,
 env=env,
 local_interactive=local_interactive,
@@ -1067,15 +1176,18 @@

Classes

 try:
     config_check()
     api_instance = client.CustomObjectsApi(api_config_handler())
-    with open(self.app_wrapper_yaml) as f:
-        aw = yaml.load(f, Loader=yaml.FullLoader)
-    api_instance.create_namespaced_custom_object(
-        group="workload.codeflare.dev",
-        version="v1beta1",
-        namespace=namespace,
-        plural="appwrappers",
-        body=aw,
-    )
+    if self.config.mcad:
+        with open(self.app_wrapper_yaml) as f:
+            aw = yaml.load(f, Loader=yaml.FullLoader)
+        api_instance.create_namespaced_custom_object(
+            group="workload.codeflare.dev",
+            version="v1beta1",
+            namespace=namespace,
+            plural="appwrappers",
+            body=aw,
+        )
+    else:
+        self._component_resources_up(namespace, api_instance)
 except Exception as e:  # pragma: no cover
     return _kube_api_error_handling(e)
@@ -1088,13 +1200,16 @@

Classes

 try:
     config_check()
     api_instance = client.CustomObjectsApi(api_config_handler())
-    api_instance.delete_namespaced_custom_object(
-        group="workload.codeflare.dev",
-        version="v1beta1",
-        namespace=namespace,
-        plural="appwrappers",
-        name=self.app_wrapper_name,
-    )
+    if self.config.mcad:
+        api_instance.delete_namespaced_custom_object(
+            group="workload.codeflare.dev",
+            version="v1beta1",
+            namespace=namespace,
+            plural="appwrappers",
+            name=self.app_wrapper_name,
+        )
+    else:
+        self._component_resources_down(namespace, api_instance)
 except Exception as e:  # pragma: no cover
     return _kube_api_error_handling(e)
@@ -1112,42 +1227,46 @@

Classes

""" ready = False status = CodeFlareClusterStatus.UNKNOWN - # check the app wrapper status - appwrapper = _app_wrapper_status(self.config.name, self.config.namespace) - if appwrapper: - if appwrapper.status in [ - AppWrapperStatus.RUNNING, - AppWrapperStatus.COMPLETED, - AppWrapperStatus.RUNNING_HOLD_COMPLETION, - ]: - ready = False - status = CodeFlareClusterStatus.STARTING - elif appwrapper.status in [ - AppWrapperStatus.FAILED, - AppWrapperStatus.DELETED, - ]: - ready = False - status = CodeFlareClusterStatus.FAILED # should deleted be separate - return status, ready # exit early, no need to check ray status - elif appwrapper.status in [ - AppWrapperStatus.PENDING, - AppWrapperStatus.QUEUEING, - ]: - ready = False - if appwrapper.status == AppWrapperStatus.PENDING: - status = CodeFlareClusterStatus.QUEUED - else: - status = CodeFlareClusterStatus.QUEUEING - if print_to_console: - pretty_print.print_app_wrappers_status([appwrapper]) - return ( - status, - ready, - ) # no need to check the ray status since still in queue + if self.config.mcad: + # check the app wrapper status + appwrapper = _app_wrapper_status(self.config.name, self.config.namespace) + if appwrapper: + if appwrapper.status in [ + AppWrapperStatus.RUNNING, + AppWrapperStatus.COMPLETED, + AppWrapperStatus.RUNNING_HOLD_COMPLETION, + ]: + ready = False + status = CodeFlareClusterStatus.STARTING + elif appwrapper.status in [ + AppWrapperStatus.FAILED, + AppWrapperStatus.DELETED, + ]: + ready = False + status = CodeFlareClusterStatus.FAILED # should deleted be separate + return status, ready # exit early, no need to check ray status + elif appwrapper.status in [ + AppWrapperStatus.PENDING, + AppWrapperStatus.QUEUEING, + ]: + ready = False + if appwrapper.status == AppWrapperStatus.PENDING: + status = CodeFlareClusterStatus.QUEUED + else: + status = CodeFlareClusterStatus.QUEUEING + if print_to_console: + pretty_print.print_app_wrappers_status([appwrapper]) + return ( + status, + ready, + ) # no need to check the ray status since still in queue # check the ray cluster status cluster = _ray_cluster_status(self.config.name, self.config.namespace) - if cluster and not cluster.status == RayClusterStatus.UNKNOWN: + if cluster: + if cluster.status == RayClusterStatus.UNKNOWN: + ready = False + status = CodeFlareClusterStatus.STARTING if cluster.status == RayClusterStatus.READY: ready = True status = CodeFlareClusterStatus.READY @@ -1267,19 +1386,19 @@

Classes

""" This method accesses the head ray node in your cluster and lists the running jobs. """ - return self.client.list_jobs() + return self.job_client.list_jobs() def job_status(self, job_id: str) -> str: """ This method accesses the head ray node in your cluster and returns the job status for the provided job id. """ - return self.client.get_job_status(job_id) + return self.job_client.get_job_status(job_id) def job_logs(self, job_id: str) -> str: """ This method accesses the head ray node in your cluster and returns the logs for the provided job id. """ - return self.client.get_job_logs(job_id) + return self.job_client.get_job_logs(job_id) def torchx_config( self, working_dir: str = None, requirements: str = None @@ -1295,7 +1414,7 @@

Classes

to_return["requirements"] = requirements return to_return - def from_k8_cluster_object(rc): + def from_k8_cluster_object(rc, mcad=True): machine_types = ( rc["metadata"]["labels"]["orderedinstance"].split("_") if "orderedinstance" in rc["metadata"]["labels"] @@ -1334,6 +1453,7 @@

Classes

0 ]["image"], local_interactive=local_interactive, + mcad=mcad, ) return Cluster(cluster_config) @@ -1342,7 +1462,67 @@

Classes

     ingress_domain = _get_ingress_domain()
     return f"ray://rayclient-{self.config.name}-{self.config.namespace}.{ingress_domain}"
 else:
-    return "None"
+    return "None"
+
+def _component_resources_up(
+    self, namespace: str, api_instance: client.CustomObjectsApi
+):
+    with open(self.app_wrapper_yaml) as f:
+        yamls = yaml.load_all(f, Loader=yaml.FullLoader)
+        for resource in yamls:
+            if resource["kind"] == "RayCluster":
+                api_instance.create_namespaced_custom_object(
+                    group="ray.io",
+                    version="v1alpha1",
+                    namespace=namespace,
+                    plural="rayclusters",
+                    body=resource,
+                )
+            elif resource["kind"] == "Route":
+                api_instance.create_namespaced_custom_object(
+                    group="route.openshift.io",
+                    version="v1",
+                    namespace=namespace,
+                    plural="routes",
+                    body=resource,
+                )
+            elif resource["kind"] == "Secret":
+                secret_instance = client.CoreV1Api(api_config_handler())
+                secret_instance.create_namespaced_secret(
+                    namespace=namespace,
+                    body=resource,
+                )
+
+def _component_resources_down(
+    self, namespace: str, api_instance: client.CustomObjectsApi
+):
+    with open(self.app_wrapper_yaml) as f:
+        yamls = yaml.load_all(f, Loader=yaml.FullLoader)
+        for resource in yamls:
+            if resource["kind"] == "RayCluster":
+                api_instance.delete_namespaced_custom_object(
+                    group="ray.io",
+                    version="v1alpha1",
+                    namespace=namespace,
+                    plural="rayclusters",
+                    name=self.app_wrapper_name,
+                )
+            elif resource["kind"] == "Route":
+                name = resource["metadata"]["name"]
+                api_instance.delete_namespaced_custom_object(
+                    group="route.openshift.io",
+                    version="v1",
+                    namespace=namespace,
+                    plural="routes",
+                    name=name,
+                )
+            elif resource["kind"] == "Secret":
+                name = resource["metadata"]["name"]
+                secret_instance = client.CoreV1Api(api_config_handler())
+                secret_instance.delete_namespaced_secret(
+                    namespace=namespace,
+                    name=name,
+                )

Class variables

@@ -1353,7 +1533,7 @@

Class variables

Instance variables

-var client
+var job_client
@@ -1361,23 +1541,25 @@

Instance variables

Expand source code
@property
-def client(self):
-    if self._client:
-        return self._client
+def job_client(self):
+    if self._job_submission_client:
+        return self._job_submission_client
     if self.config.openshift_oauth:
         print(
             api_config_handler().configuration.get_api_key_with_prefix(
                 "authorization"
             )
         )
-        self._client = JobSubmissionClient(
+        self._job_submission_client = JobSubmissionClient(
             self.cluster_dashboard_uri(),
             headers=self._client_headers,
             verify=self._client_verify_tls,
         )
     else:
-        self._client = JobSubmissionClient(self.cluster_dashboard_uri())
-    return self._client
+        self._job_submission_client = JobSubmissionClient(
+            self.cluster_dashboard_uri()
+        )
+    return self._job_submission_client
@@ -1464,6 +1646,10 @@

Methods

 # Before attempting to create the cluster AW, let's evaluate the ClusterConfig
 if self.config.dispatch_priority:
+    if not self.config.mcad:
+        raise ValueError(
+            "Invalid Cluster Configuration, cannot have dispatch priority without MCAD"
+        )
     priority_val = self.evaluate_dispatch_priority()
     if priority_val == None:
         raise ValueError(
@@ -1486,6 +1672,7 @@

Methods

 template = self.config.template
 image = self.config.image
 instascale = self.config.instascale
+mcad = self.config.mcad
 instance_types = self.config.machine_types
 env = self.config.envs
 local_interactive = self.config.local_interactive
@@ -1506,6 +1693,7 @@

Methods

 template=template,
 image=image,
 instascale=instascale,
+mcad=mcad,
 instance_types=instance_types,
 env=env,
 local_interactive=local_interactive,
@@ -1551,13 +1739,16 @@

Methods

 try:
     config_check()
     api_instance = client.CustomObjectsApi(api_config_handler())
-    api_instance.delete_namespaced_custom_object(
-        group="workload.codeflare.dev",
-        version="v1beta1",
-        namespace=namespace,
-        plural="appwrappers",
-        name=self.app_wrapper_name,
-    )
+    if self.config.mcad:
+        api_instance.delete_namespaced_custom_object(
+            group="workload.codeflare.dev",
+            version="v1beta1",
+            namespace=namespace,
+            plural="appwrappers",
+            name=self.app_wrapper_name,
+        )
+    else:
+        self._component_resources_down(namespace, api_instance)
 except Exception as e:  # pragma: no cover
     return _kube_api_error_handling(e)
@@ -1598,7 +1789,7 @@

Methods

-def from_k8_cluster_object(rc)
+def from_k8_cluster_object(rc, mcad=True)
@@ -1606,7 +1797,7 @@

Methods

Expand source code

-def from_k8_cluster_object(rc):
+def from_k8_cluster_object(rc, mcad=True):
     machine_types = (
         rc["metadata"]["labels"]["orderedinstance"].split("_")
         if "orderedinstance" in rc["metadata"]["labels"]
@@ -1645,6 +1836,7 @@ 

Methods

0 ]["image"], local_interactive=local_interactive, + mcad=mcad, ) return Cluster(cluster_config)
@@ -1688,7 +1880,7 @@

Methods

""" This method accesses the head ray node in your cluster and returns the logs for the provided job id. """ - return self.client.get_job_logs(job_id)
+ return self.job_client.get_job_logs(job_id)
@@ -1704,7 +1896,7 @@

Methods

""" This method accesses the head ray node in your cluster and returns the job status for the provided job id. """ - return self.client.get_job_status(job_id)
+ return self.job_client.get_job_status(job_id)
@@ -1720,7 +1912,7 @@

Methods

""" This method accesses the head ray node in your cluster and lists the running jobs. """ - return self.client.list_jobs()
+ return self.job_client.list_jobs()
@@ -1759,42 +1951,46 @@

Methods

""" ready = False status = CodeFlareClusterStatus.UNKNOWN - # check the app wrapper status - appwrapper = _app_wrapper_status(self.config.name, self.config.namespace) - if appwrapper: - if appwrapper.status in [ - AppWrapperStatus.RUNNING, - AppWrapperStatus.COMPLETED, - AppWrapperStatus.RUNNING_HOLD_COMPLETION, - ]: - ready = False - status = CodeFlareClusterStatus.STARTING - elif appwrapper.status in [ - AppWrapperStatus.FAILED, - AppWrapperStatus.DELETED, - ]: - ready = False - status = CodeFlareClusterStatus.FAILED # should deleted be separate - return status, ready # exit early, no need to check ray status - elif appwrapper.status in [ - AppWrapperStatus.PENDING, - AppWrapperStatus.QUEUEING, - ]: - ready = False - if appwrapper.status == AppWrapperStatus.PENDING: - status = CodeFlareClusterStatus.QUEUED - else: - status = CodeFlareClusterStatus.QUEUEING - if print_to_console: - pretty_print.print_app_wrappers_status([appwrapper]) - return ( - status, - ready, - ) # no need to check the ray status since still in queue + if self.config.mcad: + # check the app wrapper status + appwrapper = _app_wrapper_status(self.config.name, self.config.namespace) + if appwrapper: + if appwrapper.status in [ + AppWrapperStatus.RUNNING, + AppWrapperStatus.COMPLETED, + AppWrapperStatus.RUNNING_HOLD_COMPLETION, + ]: + ready = False + status = CodeFlareClusterStatus.STARTING + elif appwrapper.status in [ + AppWrapperStatus.FAILED, + AppWrapperStatus.DELETED, + ]: + ready = False + status = CodeFlareClusterStatus.FAILED # should deleted be separate + return status, ready # exit early, no need to check ray status + elif appwrapper.status in [ + AppWrapperStatus.PENDING, + AppWrapperStatus.QUEUEING, + ]: + ready = False + if appwrapper.status == AppWrapperStatus.PENDING: + status = CodeFlareClusterStatus.QUEUED + else: + status = CodeFlareClusterStatus.QUEUEING + if print_to_console: + pretty_print.print_app_wrappers_status([appwrapper]) + return ( + status, + ready, + ) # no need to check the ray status since still in queue # check the ray cluster status cluster = _ray_cluster_status(self.config.name, self.config.namespace) - if cluster and not cluster.status == RayClusterStatus.UNKNOWN: + if cluster: + if cluster.status == RayClusterStatus.UNKNOWN: + ready = False + status = CodeFlareClusterStatus.STARTING if cluster.status == RayClusterStatus.READY: ready = True status = CodeFlareClusterStatus.READY @@ -1866,15 +2062,18 @@

Methods

 try:
     config_check()
     api_instance = client.CustomObjectsApi(api_config_handler())
-    with open(self.app_wrapper_yaml) as f:
-        aw = yaml.load(f, Loader=yaml.FullLoader)
-    api_instance.create_namespaced_custom_object(
-        group="workload.codeflare.dev",
-        version="v1beta1",
-        namespace=namespace,
-        plural="appwrappers",
-        body=aw,
-    )
+    if self.config.mcad:
+        with open(self.app_wrapper_yaml) as f:
+            aw = yaml.load(f, Loader=yaml.FullLoader)
+        api_instance.create_namespaced_custom_object(
+            group="workload.codeflare.dev",
+            version="v1beta1",
+            namespace=namespace,
+            plural="appwrappers",
+            body=aw,
+        )
+    else:
+        self._component_resources_up(namespace, api_instance)
 except Exception as e:  # pragma: no cover
     return _kube_api_error_handling(e)
@@ -1956,7 +2155,6 @@

Index

  • Cluster

-    • client
    • cluster_dashboard_uri
    • cluster_uri
    • create_app_wrapper
@@ -1965,6 +2163,7 @@

    • evaluate_dispatch_priority
    • from_k8_cluster_object
    • is_dashboard_ready
+    • job_client
    • job_logs
    • job_status
    • list_jobs
diff --git a/docs/cluster/config.html b/docs/cluster/config.html
index 28830390a..37242b177 100644
--- a/docs/cluster/config.html
+++ b/docs/cluster/config.html
@@ -78,6 +78,7 @@

      Module codeflare_sdk.cluster.config

 num_gpus: int = 0
 template: str = f"{dir}/templates/base-template.yaml"
 instascale: bool = False
+mcad: bool = True
 envs: dict = field(default_factory=dict)
 image: str = "quay.io/project-codeflare/ray:latest-py39-cu118"
 local_interactive: bool = False
@@ -97,7 +98,7 @@

      Classes

 class ClusterConfiguration
-(name: str, namespace: str = None, head_info: list = <factory>, head_cpus: int = 2, head_memory: int = 8, head_gpus: int = 0, machine_types: list = <factory>, min_cpus: int = 1, max_cpus: int = 1, num_workers: int = 1, min_memory: int = 2, max_memory: int = 2, num_gpus: int = 0, template: str = '/home/runner/work/codeflare-sdk/codeflare-sdk/src/codeflare_sdk/templates/base-template.yaml', instascale: bool = False, envs: dict = <factory>, image: str = 'quay.io/project-codeflare/ray:latest-py39-cu118', local_interactive: bool = False, image_pull_secrets: list = <factory>, dispatch_priority: str = None, openshift_oauth: bool = False)
+(name: str, namespace: str = None, head_info: list = <factory>, head_cpus: int = 2, head_memory: int = 8, head_gpus: int = 0, machine_types: list = <factory>, min_cpus: int = 1, max_cpus: int = 1, num_workers: int = 1, min_memory: int = 2, max_memory: int = 2, num_gpus: int = 0, template: str = '/home/runner/work/codeflare-sdk/codeflare-sdk/src/codeflare_sdk/templates/base-template.yaml', instascale: bool = False, mcad: bool = True, envs: dict = <factory>, image: str = 'quay.io/project-codeflare/ray:latest-py39-cu118', local_interactive: bool = False, image_pull_secrets: list = <factory>, dispatch_priority: str = None, openshift_oauth: bool = False)
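Because `mcad` defaults to `True`, existing configurations keep their behavior; opting out of MCAD is a single extra argument. A sketch with illustrative sizing:

    from codeflare_sdk.cluster.config import ClusterConfiguration

    config = ClusterConfiguration(
        name="demo",
        namespace="default",
        num_workers=2,
        min_cpus=1,
        max_cpus=1,
        min_memory=4,
        max_memory=4,
        num_gpus=0,
        mcad=False,  # emit plain component resources instead of an AppWrapper
    )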

This dataclass is used to specify resource requirements and other details, and
@@ -127,6 +128,7 @@

      Classes

 num_gpus: int = 0
 template: str = f"{dir}/templates/base-template.yaml"
 instascale: bool = False
+mcad: bool = True
 envs: dict = field(default_factory=dict)
 image: str = "quay.io/project-codeflare/ray:latest-py39-cu118"
 local_interactive: bool = False
@@ -188,6 +190,10 @@

      Class variables

+var mcad : bool
+
+
+
 var min_cpus : int
      @@ -254,6 +260,7 @@

    • machine_types
    • max_cpus
    • max_memory
    • +
    • mcad
    • min_cpus
    • min_memory
    • name
diff --git a/docs/job/jobs.html b/docs/job/jobs.html
index 266da13ec..ea587df8d 100644
--- a/docs/job/jobs.html
+++ b/docs/job/jobs.html
@@ -50,9 +50,6 @@

      Module codeflare_sdk.job.jobs

 from torchx.schedulers.ray_scheduler import RayScheduler
 from torchx.specs import AppHandle, parse_app_handle, AppDryRunInfo
-from ray.job_submission import JobSubmissionClient
-
-import openshift as oc

 if TYPE_CHECKING:
     from ..cluster.cluster import Cluster
@@ -124,9 +121,9 @@

      Module codeflare_sdk.job.jobs

 def _dry_run(self, cluster: "Cluster"):
     j = f"{cluster.config.num_workers}x{max(cluster.config.num_gpus, 1)}"  # # of proc. = # of gpus
-    runner = get_runner(ray_client=cluster.client)
+    runner = get_runner(ray_client=cluster.job_client)
     runner._scheduler_instances["ray"] = RayScheduler(
-        session_name=runner._name, ray_client=cluster.client
+        session_name=runner._name, ray_client=cluster.job_client
     )
     return (
         runner.dryrun(
@@ -381,9 +378,9 @@
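Only the attribute name changes for the TorchX path; the runner is now wired to `cluster.job_client`. A submission sketch, assuming the module's DDPJobDefinition API (the job name and script are illustrative):

    from codeflare_sdk.cluster.cluster import get_cluster
    from codeflare_sdk.job.jobs import DDPJobDefinition

    cluster = get_cluster("demo", "default")  # hypothetical running cluster
    job_def = DDPJobDefinition(name="mnist", script="mnist.py")
    job = job_def.submit(cluster)  # _dry_run builds the runner with cluster.job_client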

      Methods

 def _dry_run(self, cluster: "Cluster"):
     j = f"{cluster.config.num_workers}x{max(cluster.config.num_gpus, 1)}"  # # of proc. = # of gpus
-    runner = get_runner(ray_client=cluster.client)
+    runner = get_runner(ray_client=cluster.job_client)
     runner._scheduler_instances["ray"] = RayScheduler(
-        session_name=runner._name, ray_client=cluster.client
+        session_name=runner._name, ray_client=cluster.job_client
     )
     return (
         runner.dryrun(

diff --git a/docs/utils/generate_yaml.html b/docs/utils/generate_yaml.html
index 2091f8b33..4c0ecd3fd 100644
--- a/docs/utils/generate_yaml.html
+++ b/docs/utils/generate_yaml.html
@@ -488,6 +488,19 @@

      Module codeflare_sdk.utils.generate_yaml

 )

+def write_components(user_yaml: dict, output_file_name: str):
+    components = user_yaml.get("spec", "resources")["resources"].get("GenericItems")
+    open(output_file_name, "w").close()
+    with open(output_file_name, "a") as outfile:
+        for component in components:
+            if "generictemplate" in component:
+                outfile.write("---\n")
+                yaml.dump(
+                    component["generictemplate"], outfile, default_flow_style=False
+                )
+    print(f"Written to: {output_file_name}")
+
+
 def generate_appwrapper(
     name: str,
     namespace: str,
@@ -503,6 +516,7 @@

      Module codeflare_sdk.utils.generate_yaml

 template: str,
 image: str,
 instascale: bool,
+mcad: bool,
 instance_types: list,
 env,
 local_interactive: bool,
@@ -558,7 +572,10 @@

      Module codeflare_sdk.utils.generate_yaml

     enable_openshift_oauth(user_yaml, cluster_name, namespace)

 outfile = appwrapper_name + ".yaml"
-write_user_appwrapper(user_yaml, outfile)
+if not mcad:
+    write_components(user_yaml, outfile)
+else:
+    write_user_appwrapper(user_yaml, outfile)
 return outfile
@@ -775,7 +792,7 @@
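The outfile is therefore either one AppWrapper document (`mcad=True`) or a `---`-separated stream of the unwrapped generictemplate resources (`mcad=False`). A sketch of `write_components` on a minimal AppWrapper-shaped dict (only the keys the function reads):

    from codeflare_sdk.utils.generate_yaml import write_components

    user_yaml = {
        "spec": {
            "resources": {
                "GenericItems": [
                    {"generictemplate": {"kind": "RayCluster",
                                         "metadata": {"name": "demo"}}},
                    {"replicas": 1},  # items without a generictemplate are skipped
                ]
            }
        }
    }
    write_components(user_yaml, "demo.yaml")  # one YAML document per template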

      Functions

-def generate_appwrapper(name: str, namespace: str, head_cpus: int, head_memory: int, head_gpus: int, min_cpu: int, max_cpu: int, min_memory: int, max_memory: int, gpu: int, workers: int, template: str, image: str, instascale: bool, instance_types: list, env, local_interactive: bool, image_pull_secrets: list, dispatch_priority: str, priority_val: int, openshift_oauth: bool)
+def generate_appwrapper(name: str, namespace: str, head_cpus: int, head_memory: int, head_gpus: int, min_cpu: int, max_cpu: int, min_memory: int, max_memory: int, gpu: int, workers: int, template: str, image: str, instascale: bool, mcad: bool, instance_types: list, env, local_interactive: bool, image_pull_secrets: list, dispatch_priority: str, priority_val: int, openshift_oauth: bool)
      @@ -798,6 +815,7 @@

      Functions

 template: str,
 image: str,
 instascale: bool,
+mcad: bool,
 instance_types: list,
 env,
 local_interactive: bool,
@@ -853,7 +871,10 @@

      Functions

     enable_openshift_oauth(user_yaml, cluster_name, namespace)

 outfile = appwrapper_name + ".yaml"
-write_user_appwrapper(user_yaml, outfile)
+if not mcad:
+    write_components(user_yaml, outfile)
+else:
+    write_user_appwrapper(user_yaml, outfile)
 return outfile
@@ -1209,6 +1230,28 @@
      @@ -1209,6 +1230,28 @@

      Functions

      limits["nvidia.com/gpu"] = gpu +
      +def write_components(user_yaml: dict, output_file_name: str) +
      +
      +
      +
      + +Expand source code + +
      def write_components(user_yaml: dict, output_file_name: str):
      +    components = user_yaml.get("spec", "resources")["resources"].get("GenericItems")
      +    open(output_file_name, "w").close()
      +    with open(output_file_name, "a") as outfile:
      +        for component in components:
      +            if "generictemplate" in component:
      +                outfile.write("---\n")
      +                yaml.dump(
      +                    component["generictemplate"], outfile, default_flow_style=False
      +                )
      +    print(f"Written to: {output_file_name}")
+
+
+
      def write_user_appwrapper(user_yaml, output_file_name)
      @@ -1262,6 +1305,7 @@

      Index

    • update_priority
    • update_rayclient_route
    • update_resources
    • +
    • write_components
    • write_user_appwrapper
diff --git a/docs/utils/kube_api_helpers.html b/docs/utils/kube_api_helpers.html
index 37034ab78..4105a4c5a 100644
--- a/docs/utils/kube_api_helpers.html
+++ b/docs/utils/kube_api_helpers.html
@@ -54,7 +54,9 @@

    Module codeflare_sdk.utils.kube_api_helpers

 # private methods
-def _kube_api_error_handling(e: Exception):  # pragma: no cover
+def _kube_api_error_handling(
+    e: Exception, print_error: bool = True
+):  # pragma: no cover
     perm_msg = (
         "Action not permitted, have you put in correct/up-to-date auth credentials?"
     )
@@ -63,11 +65,13 @@

    Module codeflare_sdk.utils.kube_api_helpers

 if type(e) == config.ConfigException:
     raise PermissionError(perm_msg)
 if type(e) == executing.executing.NotOneValueFound:
-    print(nf_msg)
+    if print_error:
+        print(nf_msg)
     return
 if type(e) == client.ApiException:
     if e.reason == "Not Found":
-        print(nf_msg)
+        if print_error:
+            print(nf_msg)
         return
     elif e.reason == "Unauthorized" or e.reason == "Forbidden":
         raise PermissionError(perm_msg)
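The `print_error` flag exists for callers like `_check_aw_exists`, where "not found" is an expected answer rather than an error worth printing. A sketch of that calling pattern (import paths assumed from the SDK's auth and helper modules):

    from kubernetes import client
    from codeflare_sdk.cluster.auth import config_check, api_config_handler
    from codeflare_sdk.utils.kube_api_helpers import _kube_api_error_handling

    def appwrapper_names(namespace: str = "default"):
        try:
            config_check()
            api_instance = client.CustomObjectsApi(api_config_handler())
            aws = api_instance.list_namespaced_custom_object(
                group="workload.codeflare.dev",
                version="v1beta1",
                namespace=namespace,
                plural="appwrappers",
            )
        except Exception as e:
            # absence here is informative, not an error: stay quiet
            return _kube_api_error_handling(e, print_error=False)
        return [aw["metadata"]["name"] for aw in aws["items"]]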