diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index fc2c20dac..73f9e38ef 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -18,6 +18,7 @@ def __init__(self, config: ClusterConfiguration): def create_app_wrapper(self): name=self.config.name + namespace=self.config.namespace min_cpu=self.config.min_cpus max_cpu=self.config.max_cpus min_memory=self.config.min_memory @@ -29,21 +30,23 @@ def create_app_wrapper(self): instascale=self.config.instascale instance_types=self.config.machine_types env=self.config.envs - return generate_appwrapper(name=name, min_cpu=min_cpu, max_cpu=max_cpu, min_memory=min_memory, + return generate_appwrapper(name=name, namespace=namespace, min_cpu=min_cpu, max_cpu=max_cpu, min_memory=min_memory, max_memory=max_memory, gpu=gpu, workers=workers, template=template, image=image, instascale=instascale, instance_types=instance_types, env=env) - # creates a new cluster with the provided or default spec - def up(self, namespace='default'): + # creates a new cluster with the provided or default spec + def up(self): + namespace = self.config.namespace with oc.project(namespace): oc.invoke("apply", ["-f", self.app_wrapper_yaml]) - def down(self, namespace='default'): + def down(self): + namespace = self.config.namespace with oc.project(namespace): oc.invoke("delete", ["AppWrapper", self.app_wrapper_name]) def status(self, print_to_console=True): - cluster = _ray_cluster_status(self.config.name) + cluster = _ray_cluster_status(self.config.name, self.config.namespace) if cluster: #overriding the number of gpus with requested cluster.worker_gpu = self.config.gpu @@ -55,8 +58,8 @@ def status(self, print_to_console=True): pretty_print.print_no_resources_found() return None - def cluster_uri(self, namespace='default'): - return f'ray://{self.config.name}-head-svc.{namespace}.svc:10001' + def cluster_uri(self): + return f'ray://{self.config.name}-head-svc.{self.config.namespace}.svc:10001' def cluster_dashboard_uri(self, namespace='default'): try: @@ -68,13 +71,12 @@ def cluster_dashboard_uri(self, namespace='default'): return "Dashboard route not available yet. Did you run cluster.up()?" - # checks whether the ray cluster is ready def is_ready(self, print_to_console=True): ready = False status = CodeFlareClusterStatus.UNKNOWN # check the app wrapper status - appwrapper = _app_wrapper_status(self.config.name) + appwrapper = _app_wrapper_status(self.config.name, self.config.namespace) if appwrapper: if appwrapper.status in [AppWrapperStatus.RUNNING, AppWrapperStatus.COMPLETED, AppWrapperStatus.RUNNING_HOLD_COMPLETION]: ready = False @@ -91,7 +93,7 @@ def is_ready(self, print_to_console=True): return ready, status# no need to check the ray status since still in queue # check the ray cluster status - cluster = _ray_cluster_status(self.config.name) + cluster = _ray_cluster_status(self.config.name, self.config.namespace) if cluster: if cluster.status == RayClusterStatus.READY: ready = True @@ -106,16 +108,19 @@ def is_ready(self, print_to_console=True): pretty_print.print_clusters([cluster]) return status, ready +def get_current_namespace(): + namespace = oc.invoke("project",["-q"]).actions()[0].out.strip() + return namespace -def list_all_clusters(print_to_console=True): - clusters = _get_ray_clusters() +def list_all_clusters(namespace, print_to_console=True): + clusters = _get_ray_clusters(namespace) if print_to_console: pretty_print.print_clusters(clusters) return clusters -def list_all_queued(print_to_console=True): - app_wrappers = _get_app_wrappers(filter=[AppWrapperStatus.RUNNING, AppWrapperStatus.PENDING]) +def list_all_queued(namespace, print_to_console=True): + app_wrappers = _get_app_wrappers( namespace, filter=[AppWrapperStatus.RUNNING, AppWrapperStatus.PENDING]) if print_to_console: pretty_print.print_app_wrappers_status(app_wrappers) return app_wrappers @@ -158,7 +163,7 @@ def _get_ray_clusters(namespace='default') -> List[RayCluster]: -def _get_app_wrappers(filter:List[AppWrapperStatus], namespace='default') -> List[AppWrapper]: +def _get_app_wrappers(namespace='default', filter=List[AppWrapperStatus]) -> List[AppWrapper]: list_of_app_wrappers = [] with oc.project(namespace), oc.timeout(10*60): diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py index bbd2a90fb..b62419c4f 100644 --- a/src/codeflare_sdk/cluster/config.py +++ b/src/codeflare_sdk/cluster/config.py @@ -6,6 +6,7 @@ @dataclass class ClusterConfiguration: name: str + namespace: str = "default" head_info: list = field(default_factory=list) machine_types: list = field(default_factory=list) #["m4.xlarge", "g4dn.xlarge"] min_cpus: int = 1 diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index e6a80f0b5..c9d2fe109 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -19,19 +19,22 @@ def gen_names(name): else: return name, name -def update_dashboard_route(route_item, cluster_name): +def update_dashboard_route(route_item, cluster_name, namespace): metadata = route_item.get("generictemplate", {}).get("metadata") metadata["name"] = f'ray-dashboard-{cluster_name}' + metadata["namespace"] = namespace metadata["labels"]["odh-ray-cluster-service"] = f'{cluster_name}-head-svc' spec = route_item.get("generictemplate", {}).get("spec") spec["to"]["name"] = f'{cluster_name}-head-svc' -def update_names(yaml, item, appwrapper_name, cluster_name): +def update_names(yaml, item, appwrapper_name, cluster_name, namespace): metadata = yaml.get("metadata") metadata["name"] = appwrapper_name + metadata["namespace"] = namespace lower_meta = item.get("generictemplate", {}).get("metadata") lower_meta["labels"]["appwrapper.mcad.ibm.com"] = appwrapper_name lower_meta["name"] = cluster_name + lower_meta["namespace"] = namespace def update_labels(yaml, instascale, instance_types): metadata = yaml.get("metadata") @@ -140,17 +143,17 @@ def write_user_appwrapper(user_yaml, output_file_name): yaml.dump(user_yaml, outfile, default_flow_style=False) print(f"Written to: {output_file_name}") -def generate_appwrapper(name, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, template, image, instascale, instance_types, env): +def generate_appwrapper(name, namespace, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, template, image, instascale, instance_types, env): user_yaml = read_template(template) appwrapper_name, cluster_name = gen_names(name) resources = user_yaml.get("spec","resources") item = resources["resources"].get("GenericItems")[0] route_item = resources["resources"].get("GenericItems")[1] - update_names(user_yaml, item, appwrapper_name, cluster_name) + update_names(user_yaml, item, appwrapper_name, cluster_name, namespace) update_labels(user_yaml, instascale, instance_types) update_custompodresources(item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers) update_nodes(item, appwrapper_name, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, image, instascale, env) - update_dashboard_route(route_item, cluster_name) + update_dashboard_route(route_item, cluster_name, namespace) outfile = appwrapper_name + ".yaml" write_user_appwrapper(user_yaml, outfile) return outfile @@ -169,6 +172,7 @@ def main(): parser.add_argument("--image", required=False, default="rayproject/ray:latest", help="Ray image to be used (defaults to rayproject/ray:latest)") parser.add_argument("--instascale", default=False, required=False, action='store_true', help="Indicates that instascale is installed on the cluster") parser.add_argument("--instance-types", type=str, nargs='+', default=[], required=False, help="Head,worker instance types (space separated)") + parser.add_argument("--namespace", required=False, default="default", help="Set the kubernetes namespace you want to deploy your cluster to. Default. If left blank, uses the 'default' namespace") args = parser.parse_args() name = args.name @@ -182,9 +186,10 @@ def main(): image = args.image instascale = args.instascale instance_types = args.instance_types + namespace = args.namespace env = {} - outfile = generate_appwrapper(name, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, template, image, instascale, instance_types, env) + outfile = generate_appwrapper(name,namespace, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, template, image, instascale, instance_types, env) return outfile if __name__=="__main__":