From fb1126562df9a430ec7c081f8ff69832281e50e7 Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Sat, 3 Dec 2022 17:48:49 -0500 Subject: [PATCH 1/2] Adding GH Actions Formatting CI Check and GH Pages for Docs --- .github/workflows/python-app.yml | 33 ++ README.md | 9 +- docs/cluster/cluster.html | 873 ++++++++++++++++++++++++++++++ docs/cluster/config.html | 234 ++++++++ docs/cluster/index.html | 79 +++ docs/cluster/model.html | 464 ++++++++++++++++ docs/index.html | 65 +++ docs/utils/generate_yaml.html | 885 +++++++++++++++++++++++++++++++ docs/utils/index.html | 72 +++ docs/utils/pretty_print.html | 351 ++++++++++++ 10 files changed, 3064 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/python-app.yml create mode 100644 docs/cluster/cluster.html create mode 100644 docs/cluster/config.html create mode 100644 docs/cluster/index.html create mode 100644 docs/cluster/model.html create mode 100644 docs/index.html create mode 100644 docs/utils/generate_yaml.html create mode 100644 docs/utils/index.html create mode 100644 docs/utils/pretty_print.html diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml new file mode 100644 index 000000000..0236f4a17 --- /dev/null +++ b/.github/workflows/python-app.yml @@ -0,0 +1,33 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Python application + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.9 + uses: actions/setup-python@v2 + with: + python-version: 3.9 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest + pip install pytest-dependency + pip install pytest-mock + pip install black==22.3.0 + if [ -f requirements.txt ]; then 
pip install -r requirements.txt; fi + - name: Check formatting with black + run: | + black --check . diff --git a/README.md b/README.md index e024d4e37..608307b77 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,13 @@ For testing, make sure to have installed: NOTE: Self-contained unit/functional tests coming soon, will live in `tests` folder +For formatting: + - Currently using black v22.3.0 for format checking + - To install, run `pip install black==22.3.0` + - To check file formatting, in top-level dir run `black --check .` + - To auto-reformat all files, remove the `--check` flag + - To reformat an individual file, run `black <filename>` + To build the python package: - - If poetry is not installed: `pip3 install poetry` + - If poetry is not installed: `pip install poetry` - `poetry build` diff --git a/docs/cluster/cluster.html b/docs/cluster/cluster.html new file mode 100644 index 000000000..5e1fe6687 --- /dev/null +++ b/docs/cluster/cluster.html @@ -0,0 +1,873 @@ + + + + + + +codeflare_sdk.cluster.cluster API documentation + + + + + + + + + + + +
+
+
+

Module codeflare_sdk.cluster.cluster

+
+
+

The cluster sub-module contains the definition of the Cluster object, which represents +the resources requested by the user. It also contains functions for checking the +cluster setup queue, a list of all existing clusters, and the user's working namespace.

+
+ +Expand source code + +
# Copyright 2022 IBM, Red Hat
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+The cluster sub-module contains the definition of the Cluster object, which represents
+the resources requested by the user. It also contains functions for checking the
+cluster setup queue, a list of all existing clusters, and the user's working namespace.
+"""
+
+from os import stat
+from typing import List, Optional, Tuple
+
+import openshift as oc
+
+from ..utils import pretty_print
+from ..utils.generate_yaml import generate_appwrapper
+from .config import ClusterConfiguration
+from .model import (
+    AppWrapper,
+    AppWrapperStatus,
+    CodeFlareClusterStatus,
+    RayCluster,
+    RayClusterStatus,
+)
+
+
+class Cluster:
+    """
+    An object for requesting, bringing up, and taking down resources.
+    Can also be used for seeing the resource cluster status and details.
+
+    Note that currently, the underlying implementation is a Ray cluster.
+    """
+
+    def __init__(self, config: ClusterConfiguration):
+        """
+        Create the resource cluster object by passing in a ClusterConfiguration
+        (defined in the config sub-module). An AppWrapper will then be generated
+        based off of the configured resources to represent the desired cluster
+        request.
+        """
+        self.config = config
+        self.app_wrapper_yaml = self.create_app_wrapper()
+        self.app_wrapper_name = self.app_wrapper_yaml.split(".")[0]
+
+    def create_app_wrapper(self):
+        """
+        Called upon cluster object creation, creates an AppWrapper yaml based on
+        the specifications of the ClusterConfiguration.
+        """
+        name = self.config.name
+        namespace = self.config.namespace
+        min_cpu = self.config.min_cpus
+        max_cpu = self.config.max_cpus
+        min_memory = self.config.min_memory
+        max_memory = self.config.max_memory
+        gpu = self.config.gpu
+        workers = self.config.max_worker
+        template = self.config.template
+        image = self.config.image
+        instascale = self.config.instascale
+        instance_types = self.config.machine_types
+        env = self.config.envs
+        return generate_appwrapper(
+            name=name,
+            namespace=namespace,
+            min_cpu=min_cpu,
+            max_cpu=max_cpu,
+            min_memory=min_memory,
+            max_memory=max_memory,
+            gpu=gpu,
+            workers=workers,
+            template=template,
+            image=image,
+            instascale=instascale,
+            instance_types=instance_types,
+            env=env,
+        )
+
+    # creates a new cluster with the provided or default spec
+    def up(self):
+        """
+        Applies the AppWrapper yaml, pushing the resource request onto
+        the MCAD queue.
+        """
+        namespace = self.config.namespace
+        with oc.project(namespace):
+            oc.invoke("apply", ["-f", self.app_wrapper_yaml])
+
+    def down(self):
+        """
+        Deletes the AppWrapper yaml, scaling-down and deleting all resources
+        associated with the cluster.
+        """
+        namespace = self.config.namespace
+        with oc.project(namespace):
+            oc.invoke("delete", ["AppWrapper", self.app_wrapper_name])
+
+    def status(self, print_to_console: bool = True):
+        """
+        TO BE UPDATED: Will soon return (and print by default) the cluster's
+        status, from AppWrapper submission to setup completion. All resource
+        details will be moved to cluster.details().
+        """
+        cluster = _ray_cluster_status(self.config.name, self.config.namespace)
+        if cluster:
+            # overriding the number of gpus with requested
+            cluster.worker_gpu = self.config.gpu
+            if print_to_console:
+                pretty_print.print_clusters([cluster])
+            return cluster.status
+        else:
+            if print_to_console:
+                pretty_print.print_no_resources_found()
+            return None
+
+    def cluster_uri(self) -> str:
+        """
+        Returns a string containing the cluster's URI.
+        """
+        return f"ray://{self.config.name}-head-svc.{self.config.namespace}.svc:10001"
+
+    def cluster_dashboard_uri(self, namespace: str = "default") -> str:
+        """
+        Returns a string containing the cluster's dashboard URI.
+        """
+        try:
+            with oc.project(namespace):
+                route = oc.invoke(
+                    "get", ["route", "-o", "jsonpath='{$.items[0].spec.host}'"]
+                )
+                route = route.out().strip().strip("'")
+            return f"http://{route}"
+        except:
+            return "Dashboard route not available yet. Did you run cluster.up()?"
+
+    # checks whether the ray cluster is ready
+    def is_ready(self, print_to_console: bool = True):
+        """
+        TO BE DEPRECATED: functionality will be added into cluster.status().
+        """
+        ready = False
+        status = CodeFlareClusterStatus.UNKNOWN
+        # check the app wrapper status
+        appwrapper = _app_wrapper_status(self.config.name, self.config.namespace)
+        if appwrapper:
+            if appwrapper.status in [
+                AppWrapperStatus.RUNNING,
+                AppWrapperStatus.COMPLETED,
+                AppWrapperStatus.RUNNING_HOLD_COMPLETION,
+            ]:
+                ready = False
+                status = CodeFlareClusterStatus.QUEUED
+            elif appwrapper.status in [
+                AppWrapperStatus.FAILED,
+                AppWrapperStatus.DELETED,
+            ]:
+                ready = False
+                status = CodeFlareClusterStatus.FAILED  # should deleted be separate
+                return ready, status  # exit early, no need to check ray status
+            elif appwrapper.status in [AppWrapperStatus.PENDING]:
+                ready = False
+                status = CodeFlareClusterStatus.QUEUED
+                if print_to_console:
+                    pretty_print.print_app_wrappers_status([appwrapper])
+                return (
+                    ready,
+                    status,
+                )  # no need to check the ray status since still in queue
+
+        # check the ray cluster status
+        cluster = _ray_cluster_status(self.config.name, self.config.namespace)
+        if cluster:
+            if cluster.status == RayClusterStatus.READY:
+                ready = True
+                status = CodeFlareClusterStatus.READY
+            elif cluster.status in [
+                RayClusterStatus.UNHEALTHY,
+                RayClusterStatus.FAILED,
+            ]:
+                ready = False
+                status = CodeFlareClusterStatus.FAILED
+
+            if print_to_console:
+                # overriding the number of gpus with requested
+                cluster.worker_gpu = self.config.gpu
+                pretty_print.print_clusters([cluster])
+        return status, ready
+
+
+def get_current_namespace() -> str:
+    """
+    Returns the user's current working namespace.
+    """
+    namespace = oc.invoke("project", ["-q"]).actions()[0].out.strip()
+    return namespace
+
+
+def list_all_clusters(namespace: str, print_to_console: bool = True):
+    """
+    Returns (and prints by default) a list of all clusters in a given namespace.
+    """
+    clusters = _get_ray_clusters(namespace)
+    if print_to_console:
+        pretty_print.print_clusters(clusters)
+    return clusters
+
+
+def list_all_queued(namespace: str, print_to_console: bool = True):
+    """
+    Returns (and prints by default) a list of all currently queued-up AppWrappers
+    in a given namespace.
+    """
+    app_wrappers = _get_app_wrappers(
+        namespace, filter=[AppWrapperStatus.RUNNING, AppWrapperStatus.PENDING]
+    )
+    if print_to_console:
+        pretty_print.print_app_wrappers_status(app_wrappers)
+    return app_wrappers
+
+
+# private methods
+
+
+def _app_wrapper_status(name, namespace="default") -> Optional[AppWrapper]:
+    with oc.project(namespace), oc.timeout(10 * 60):
+        cluster = oc.selector(f"appwrapper/{name}").object()
+    if cluster:
+        return _map_to_app_wrapper(cluster)
+
+
+def _ray_cluster_status(name, namespace="default") -> Optional[RayCluster]:
+    # FIXME should we check the appwrapper first
+    cluster = None
+    try:
+        with oc.project(namespace), oc.timeout(10 * 60):
+            cluster = oc.selector(f"rayclusters/{name}").object()
+
+        if cluster:
+            return _map_to_ray_cluster(cluster)
+    except:
+        pass
+    return cluster
+
+
+def _get_ray_clusters(namespace="default") -> List[RayCluster]:
+    list_of_clusters = []
+
+    with oc.project(namespace), oc.timeout(10 * 60):
+        ray_clusters = oc.selector("rayclusters").objects()
+
+    for cluster in ray_clusters:
+        list_of_clusters.append(_map_to_ray_cluster(cluster))
+    return list_of_clusters
+
+
+def _get_app_wrappers(
+    namespace="default", filter=List[AppWrapperStatus]
+) -> List[AppWrapper]:
+    list_of_app_wrappers = []
+
+    with oc.project(namespace), oc.timeout(10 * 60):
+        app_wrappers = oc.selector("appwrappers").objects()
+
+    for item in app_wrappers:
+        app_wrapper = _map_to_app_wrapper(item)
+        if filter and app_wrapper.status in filter:
+            list_of_app_wrappers.append(app_wrapper)
+        else:
+            list_of_app_wrappers.append(app_wrapper)
+    return list_of_app_wrappers
+
+
+def _map_to_ray_cluster(cluster) -> RayCluster:
+    cluster_model = cluster.model
+
+    with oc.project(cluster.namespace()), oc.timeout(10 * 60):
+        route = (
+            oc.selector(f"route/ray-dashboard-{cluster.name()}")
+            .object()
+            .model.spec.host
+        )
+
+    return RayCluster(
+        name=cluster.name(),
+        status=RayClusterStatus(cluster_model.status.state.lower()),
+        # for now we are not using autoscaling so same replicas is fine
+        min_workers=cluster_model.spec.workerGroupSpecs[0].replicas,
+        max_workers=cluster_model.spec.workerGroupSpecs[0].replicas,
+        worker_mem_max=cluster_model.spec.workerGroupSpecs[0]
+        .template.spec.containers[0]
+        .resources.limits.memory,
+        worker_mem_min=cluster_model.spec.workerGroupSpecs[0]
+        .template.spec.containers[0]
+        .resources.requests.memory,
+        worker_cpu=cluster_model.spec.workerGroupSpecs[0]
+        .template.spec.containers[0]
+        .resources.limits.cpu,
+        worker_gpu=0,  # hard to detect currently how many gpus, can override it with what the user asked for
+        namespace=cluster.namespace(),
+        dashboard=route,
+    )
+
+
+def _map_to_app_wrapper(cluster) -> AppWrapper:
+    cluster_model = cluster.model
+    return AppWrapper(
+        name=cluster.name(),
+        status=AppWrapperStatus(cluster_model.status.state.lower()),
+        can_run=cluster_model.status.canrun,
+        job_state=cluster_model.status.queuejobstate,
+    )
+
+
+
+
+
+
+
+

Functions

+
+
+def get_current_namespace() ‑> str +
+
+

Returns the user's current working namespace.

+
+ +Expand source code + +
def get_current_namespace() -> str:
+    """
+    Returns the user's current working namespace.
+    """
+    namespace = oc.invoke("project", ["-q"]).actions()[0].out.strip()
+    return namespace
+
+
+
+def list_all_clusters(namespace: str, print_to_console: bool = True) +
+
+

Returns (and prints by default) a list of all clusters in a given namespace.

+
+ +Expand source code + +
def list_all_clusters(namespace: str, print_to_console: bool = True):
+    """
+    Returns (and prints by default) a list of all clusters in a given namespace.
+    """
+    clusters = _get_ray_clusters(namespace)
+    if print_to_console:
+        pretty_print.print_clusters(clusters)
+    return clusters
+
+
+
+def list_all_queued(namespace: str, print_to_console: bool = True) +
+
+

Returns (and prints by default) a list of all currently queued-up AppWrappers +in a given namespace.

+
+ +Expand source code + +
def list_all_queued(namespace: str, print_to_console: bool = True):
+    """
+    Returns (and prints by default) a list of all currently queued-up AppWrappers
+    in a given namespace.
+    """
+    app_wrappers = _get_app_wrappers(
+        namespace, filter=[AppWrapperStatus.RUNNING, AppWrapperStatus.PENDING]
+    )
+    if print_to_console:
+        pretty_print.print_app_wrappers_status(app_wrappers)
+    return app_wrappers
+
+
+
+
+
+

Classes

+
+
+class Cluster +(config: ClusterConfiguration) +
+
+

An object for requesting, bringing up, and taking down resources. +Can also be used for seeing the resource cluster status and details.

+

Note that currently, the underlying implementation is a Ray cluster.

+

Create the resource cluster object by passing in a ClusterConfiguration +(defined in the config sub-module). An AppWrapper will then be generated +based off of the configured resources to represent the desired cluster +request.

+
+ +Expand source code + +
class Cluster:
+    """
+    An object for requesting, bringing up, and taking down resources.
+    Can also be used for seeing the resource cluster status and details.
+
+    Note that currently, the underlying implementation is a Ray cluster.
+    """
+
+    def __init__(self, config: ClusterConfiguration):
+        """
+        Create the resource cluster object by passing in a ClusterConfiguration
+        (defined in the config sub-module). An AppWrapper will then be generated
+        based off of the configured resources to represent the desired cluster
+        request.
+        """
+        self.config = config
+        self.app_wrapper_yaml = self.create_app_wrapper()
+        self.app_wrapper_name = self.app_wrapper_yaml.split(".")[0]
+
+    def create_app_wrapper(self):
+        """
+        Called upon cluster object creation, creates an AppWrapper yaml based on
+        the specifications of the ClusterConfiguration.
+        """
+        name = self.config.name
+        namespace = self.config.namespace
+        min_cpu = self.config.min_cpus
+        max_cpu = self.config.max_cpus
+        min_memory = self.config.min_memory
+        max_memory = self.config.max_memory
+        gpu = self.config.gpu
+        workers = self.config.max_worker
+        template = self.config.template
+        image = self.config.image
+        instascale = self.config.instascale
+        instance_types = self.config.machine_types
+        env = self.config.envs
+        return generate_appwrapper(
+            name=name,
+            namespace=namespace,
+            min_cpu=min_cpu,
+            max_cpu=max_cpu,
+            min_memory=min_memory,
+            max_memory=max_memory,
+            gpu=gpu,
+            workers=workers,
+            template=template,
+            image=image,
+            instascale=instascale,
+            instance_types=instance_types,
+            env=env,
+        )
+
+    # creates a new cluster with the provided or default spec
+    def up(self):
+        """
+        Applies the AppWrapper yaml, pushing the resource request onto
+        the MCAD queue.
+        """
+        namespace = self.config.namespace
+        with oc.project(namespace):
+            oc.invoke("apply", ["-f", self.app_wrapper_yaml])
+
+    def down(self):
+        """
+        Deletes the AppWrapper yaml, scaling-down and deleting all resources
+        associated with the cluster.
+        """
+        namespace = self.config.namespace
+        with oc.project(namespace):
+            oc.invoke("delete", ["AppWrapper", self.app_wrapper_name])
+
+    def status(self, print_to_console: bool = True):
+        """
+        TO BE UPDATED: Will soon return (and print by default) the cluster's
+        status, from AppWrapper submission to setup completion. All resource
+        details will be moved to cluster.details().
+        """
+        cluster = _ray_cluster_status(self.config.name, self.config.namespace)
+        if cluster:
+            # overriding the number of gpus with requested
+            cluster.worker_gpu = self.config.gpu
+            if print_to_console:
+                pretty_print.print_clusters([cluster])
+            return cluster.status
+        else:
+            if print_to_console:
+                pretty_print.print_no_resources_found()
+            return None
+
+    def cluster_uri(self) -> str:
+        """
+        Returns a string containing the cluster's URI.
+        """
+        return f"ray://{self.config.name}-head-svc.{self.config.namespace}.svc:10001"
+
+    def cluster_dashboard_uri(self, namespace: str = "default") -> str:
+        """
+        Returns a string containing the cluster's dashboard URI.
+        """
+        try:
+            with oc.project(namespace):
+                route = oc.invoke(
+                    "get", ["route", "-o", "jsonpath='{$.items[0].spec.host}'"]
+                )
+                route = route.out().strip().strip("'")
+            return f"http://{route}"
+        except:
+            return "Dashboard route not available yet. Did you run cluster.up()?"
+
+    # checks whether the ray cluster is ready
+    def is_ready(self, print_to_console: bool = True):
+        """
+        TO BE DEPRECATED: functionality will be added into cluster.status().
+        """
+        ready = False
+        status = CodeFlareClusterStatus.UNKNOWN
+        # check the app wrapper status
+        appwrapper = _app_wrapper_status(self.config.name, self.config.namespace)
+        if appwrapper:
+            if appwrapper.status in [
+                AppWrapperStatus.RUNNING,
+                AppWrapperStatus.COMPLETED,
+                AppWrapperStatus.RUNNING_HOLD_COMPLETION,
+            ]:
+                ready = False
+                status = CodeFlareClusterStatus.QUEUED
+            elif appwrapper.status in [
+                AppWrapperStatus.FAILED,
+                AppWrapperStatus.DELETED,
+            ]:
+                ready = False
+                status = CodeFlareClusterStatus.FAILED  # should deleted be separate
+                return ready, status  # exit early, no need to check ray status
+            elif appwrapper.status in [AppWrapperStatus.PENDING]:
+                ready = False
+                status = CodeFlareClusterStatus.QUEUED
+                if print_to_console:
+                    pretty_print.print_app_wrappers_status([appwrapper])
+                return (
+                    ready,
+                    status,
+                )  # no need to check the ray status since still in queue
+
+        # check the ray cluster status
+        cluster = _ray_cluster_status(self.config.name, self.config.namespace)
+        if cluster:
+            if cluster.status == RayClusterStatus.READY:
+                ready = True
+                status = CodeFlareClusterStatus.READY
+            elif cluster.status in [
+                RayClusterStatus.UNHEALTHY,
+                RayClusterStatus.FAILED,
+            ]:
+                ready = False
+                status = CodeFlareClusterStatus.FAILED
+
+            if print_to_console:
+                # overriding the number of gpus with requested
+                cluster.worker_gpu = self.config.gpu
+                pretty_print.print_clusters([cluster])
+        return status, ready
+
+

Methods

+
+
+def cluster_dashboard_uri(self, namespace: str = 'default') ‑> str +
+
+

Returns a string containing the cluster's dashboard URI.

+
+ +Expand source code + +
def cluster_dashboard_uri(self, namespace: str = "default") -> str:
+    """
+    Returns a string containing the cluster's dashboard URI.
+    """
+    try:
+        with oc.project(namespace):
+            route = oc.invoke(
+                "get", ["route", "-o", "jsonpath='{$.items[0].spec.host}'"]
+            )
+            route = route.out().strip().strip("'")
+        return f"http://{route}"
+    except:
+        return "Dashboard route not available yet. Did you run cluster.up()?"
+
+
+
+def cluster_uri(self) ‑> str +
+
+

Returns a string containing the cluster's URI.

+
+ +Expand source code + +
def cluster_uri(self) -> str:
+    """
+    Returns a string containing the cluster's URI.
+    """
+    return f"ray://{self.config.name}-head-svc.{self.config.namespace}.svc:10001"
+
+
+
+def create_app_wrapper(self) +
+
+

Called upon cluster object creation, creates an AppWrapper yaml based on +the specifications of the ClusterConfiguration.

+
+ +Expand source code + +
def create_app_wrapper(self):
+    """
+    Called upon cluster object creation, creates an AppWrapper yaml based on
+    the specifications of the ClusterConfiguration.
+    """
+    name = self.config.name
+    namespace = self.config.namespace
+    min_cpu = self.config.min_cpus
+    max_cpu = self.config.max_cpus
+    min_memory = self.config.min_memory
+    max_memory = self.config.max_memory
+    gpu = self.config.gpu
+    workers = self.config.max_worker
+    template = self.config.template
+    image = self.config.image
+    instascale = self.config.instascale
+    instance_types = self.config.machine_types
+    env = self.config.envs
+    return generate_appwrapper(
+        name=name,
+        namespace=namespace,
+        min_cpu=min_cpu,
+        max_cpu=max_cpu,
+        min_memory=min_memory,
+        max_memory=max_memory,
+        gpu=gpu,
+        workers=workers,
+        template=template,
+        image=image,
+        instascale=instascale,
+        instance_types=instance_types,
+        env=env,
+    )
+
+
+
+def down(self) +
+
+

Deletes the AppWrapper yaml, scaling-down and deleting all resources +associated with the cluster.

+
+ +Expand source code + +
def down(self):
+    """
+    Deletes the AppWrapper yaml, scaling-down and deleting all resources
+    associated with the cluster.
+    """
+    namespace = self.config.namespace
+    with oc.project(namespace):
+        oc.invoke("delete", ["AppWrapper", self.app_wrapper_name])
+
+
+
+def is_ready(self, print_to_console: bool = True) +
+
+

TO BE DEPRECATED: functionality will be added into cluster.status().

+
+ +Expand source code + +
def is_ready(self, print_to_console: bool = True):
+    """
+    TO BE DEPRECATED: functionality will be added into cluster.status().
+    """
+    ready = False
+    status = CodeFlareClusterStatus.UNKNOWN
+    # check the app wrapper status
+    appwrapper = _app_wrapper_status(self.config.name, self.config.namespace)
+    if appwrapper:
+        if appwrapper.status in [
+            AppWrapperStatus.RUNNING,
+            AppWrapperStatus.COMPLETED,
+            AppWrapperStatus.RUNNING_HOLD_COMPLETION,
+        ]:
+            ready = False
+            status = CodeFlareClusterStatus.QUEUED
+        elif appwrapper.status in [
+            AppWrapperStatus.FAILED,
+            AppWrapperStatus.DELETED,
+        ]:
+            ready = False
+            status = CodeFlareClusterStatus.FAILED  # should deleted be separate
+            return ready, status  # exit early, no need to check ray status
+        elif appwrapper.status in [AppWrapperStatus.PENDING]:
+            ready = False
+            status = CodeFlareClusterStatus.QUEUED
+            if print_to_console:
+                pretty_print.print_app_wrappers_status([appwrapper])
+            return (
+                ready,
+                status,
+            )  # no need to check the ray status since still in queue
+
+    # check the ray cluster status
+    cluster = _ray_cluster_status(self.config.name, self.config.namespace)
+    if cluster:
+        if cluster.status == RayClusterStatus.READY:
+            ready = True
+            status = CodeFlareClusterStatus.READY
+        elif cluster.status in [
+            RayClusterStatus.UNHEALTHY,
+            RayClusterStatus.FAILED,
+        ]:
+            ready = False
+            status = CodeFlareClusterStatus.FAILED
+
+        if print_to_console:
+            # overriding the number of gpus with requested
+            cluster.worker_gpu = self.config.gpu
+            pretty_print.print_clusters([cluster])
+    return status, ready
+
+
+
+def status(self, print_to_console: bool = True) +
+
+

TO BE UPDATED: Will soon return (and print by default) the cluster's +status, from AppWrapper submission to setup completion. All resource +details will be moved to cluster.details().

+
+ +Expand source code + +
def status(self, print_to_console: bool = True):
+    """
+    TO BE UPDATED: Will soon return (and print by default) the cluster's
+    status, from AppWrapper submission to setup completion. All resource
+    details will be moved to cluster.details().
+    """
+    cluster = _ray_cluster_status(self.config.name, self.config.namespace)
+    if cluster:
+        # overriding the number of gpus with requested
+        cluster.worker_gpu = self.config.gpu
+        if print_to_console:
+            pretty_print.print_clusters([cluster])
+        return cluster.status
+    else:
+        if print_to_console:
+            pretty_print.print_no_resources_found()
+        return None
+
+
+
+def up(self) +
+
+

Applies the AppWrapper yaml, pushing the resource request onto +the MCAD queue.

+
+ +Expand source code + +
def up(self):
+    """
+    Applies the AppWrapper yaml, pushing the resource request onto
+    the MCAD queue.
+    """
+    namespace = self.config.namespace
+    with oc.project(namespace):
+        oc.invoke("apply", ["-f", self.app_wrapper_yaml])
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/cluster/config.html b/docs/cluster/config.html new file mode 100644 index 000000000..4ab10299d --- /dev/null +++ b/docs/cluster/config.html @@ -0,0 +1,234 @@ + + + + + + +codeflare_sdk.cluster.config API documentation + + + + + + + + + + + +
+
+
+

Module codeflare_sdk.cluster.config

+
+
+

The config sub-module contains the definition of the ClusterConfiguration dataclass, +which is used to specify resource requirements and other details when creating a +Cluster object.

+
+ +Expand source code + +
# Copyright 2022 IBM, Red Hat
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+The config sub-module contains the definition of the ClusterConfiguration dataclass,
+which is used to specify resource requirements and other details when creating a
+Cluster object.
+"""
+
+from dataclasses import dataclass, field
+import pathlib
+
+dir = pathlib.Path(__file__).parent.parent.resolve()
+
+
+@dataclass
+class ClusterConfiguration:
+    """
+    This dataclass is used to specify resource requirements and other details, and
+    is passed in as an argument when creating a Cluster object.
+    """
+
+    name: str
+    namespace: str = "default"
+    head_info: list = field(default_factory=list)
+    machine_types: list = field(default_factory=list)  # ["m4.xlarge", "g4dn.xlarge"]
+    min_cpus: int = 1
+    max_cpus: int = 1
+    min_worker: int = 1
+    max_worker: int = 1
+    min_memory: int = 2
+    max_memory: int = 2
+    gpu: int = 0
+    template: str = f"{dir}/templates/new-template.yaml"
+    instascale: bool = False
+    envs: dict = field(default_factory=dict)
+    image: str = "ghcr.io/ibm-ai-foundation/base:ray1.13.0-py38-gpu-pytorch1.12.0cu116-20220826-202124"
+
+
+
+
+
+
+
+
+
+

Classes

+
+
+class ClusterConfiguration +(name: str, namespace: str = 'default', head_info: list = <factory>, machine_types: list = <factory>, min_cpus: int = 1, max_cpus: int = 1, min_worker: int = 1, max_worker: int = 1, min_memory: int = 2, max_memory: int = 2, gpu: int = 0, template: str = '<codeflare_sdk package dir>/templates/new-template.yaml', instascale: bool = False, envs: dict = <factory>, image: str = 'ghcr.io/ibm-ai-foundation/base:ray1.13.0-py38-gpu-pytorch1.12.0cu116-20220826-202124') +
+
+

This dataclass is used to specify resource requirements and other details, and +is passed in as an argument when creating a Cluster object.

+
+ +Expand source code + +
class ClusterConfiguration:
+    """
+    This dataclass is used to specify resource requirements and other details, and
+    is passed in as an argument when creating a Cluster object.
+    """
+
+    name: str
+    namespace: str = "default"
+    head_info: list = field(default_factory=list)
+    machine_types: list = field(default_factory=list)  # ["m4.xlarge", "g4dn.xlarge"]
+    min_cpus: int = 1
+    max_cpus: int = 1
+    min_worker: int = 1
+    max_worker: int = 1
+    min_memory: int = 2
+    max_memory: int = 2
+    gpu: int = 0
+    template: str = f"{dir}/templates/new-template.yaml"
+    instascale: bool = False
+    envs: dict = field(default_factory=dict)
+    image: str = "ghcr.io/ibm-ai-foundation/base:ray1.13.0-py38-gpu-pytorch1.12.0cu116-20220826-202124"
+
+

Class variables

+
+
var envs : dict
+
+
+
+
var gpu : int
+
+
+
+
var head_info : list
+
+
+
+
var image : str
+
+
+
+
var instascale : bool
+
+
+
+
var machine_types : list
+
+
+
+
var max_cpus : int
+
+
+
+
var max_memory : int
+
+
+
+
var max_worker : int
+
+
+
+
var min_cpus : int
+
+
+
+
var min_memory : int
+
+
+
+
var min_worker : int
+
+
+
+
var name : str
+
+
+
+
var namespace : str
+
+
+
+
var template : str
+
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/cluster/index.html b/docs/cluster/index.html new file mode 100644 index 000000000..1684c93d8 --- /dev/null +++ b/docs/cluster/index.html @@ -0,0 +1,79 @@ + + + + + + +codeflare_sdk.cluster API documentation + + + + + + + + + + + +
+
+
+

Module codeflare_sdk.cluster

+
+
+
+
+

Sub-modules

+
+
codeflare_sdk.cluster.cluster
+
+

The cluster sub-module contains the definition of the Cluster object, which represents +the resources requested by the user. It also contains functions …

+
+
codeflare_sdk.cluster.config
+
+

The config sub-module contains the definition of the ClusterConfiguration dataclass, +which is used to specify resource requirements and other details …

+
+
codeflare_sdk.cluster.model
+
+

The model sub-module defines Enums containing information for Ray cluster +states and AppWrapper states, and CodeFlare cluster states, as well as +…

+
+
+
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/cluster/model.html b/docs/cluster/model.html new file mode 100644 index 000000000..42cefda09 --- /dev/null +++ b/docs/cluster/model.html @@ -0,0 +1,464 @@ + + + + + + +codeflare_sdk.cluster.model API documentation + + + + + + + + + + + +
+
+
+

Module codeflare_sdk.cluster.model

+
+
+

The model sub-module defines Enums containing information for Ray cluster +states and AppWrapper states, and CodeFlare cluster states, as well as +dataclasses to store information for Ray clusters and AppWrappers.

+
+ +Expand source code + +
# Copyright 2022 IBM, Red Hat
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+The model sub-module defines Enums containing information for Ray cluster
+states and AppWrapper states, and CodeFlare cluster states, as well as
+dataclasses to store information for Ray clusters and AppWrappers.
+"""
+
+from dataclasses import dataclass
+from enum import Enum
+
+
+class RayClusterStatus(Enum):
+    """
+    Defines the possible reportable states of a Ray cluster.
+    """
+
+    # https://github.com/ray-project/kuberay/blob/master/ray-operator/apis/ray/v1alpha1/raycluster_types.go#L95
+    READY = "ready"
+    UNHEALTHY = "unhealthy"
+    FAILED = "failed"
+    UNKNOWN = "unknown"
+
+
+class AppWrapperStatus(Enum):
+    """
+    Defines the possible reportable states of an AppWrapper.
+    """
+
+    PENDING = "pending"
+    RUNNING = "running"
+    FAILED = "failed"
+    DELETED = "deleted"
+    COMPLETED = "completed"
+    RUNNING_HOLD_COMPLETION = "runningholdcompletion"
+
+
+class CodeFlareClusterStatus(Enum):
+    """
+    Defines the possible reportable states of a Codeflare cluster.
+    """
+
+    READY = 1
+    QUEUED = 2
+    FAILED = 3
+    UNKNOWN = 4
+
+
+@dataclass
+class RayCluster:
+    """
+    For storing information about a Ray cluster.
+    """
+
+    name: str
+    status: RayClusterStatus
+    min_workers: int
+    max_workers: int
+    worker_mem_min: str
+    worker_mem_max: str
+    worker_cpu: int
+    worker_gpu: int
+    namespace: str
+    dashboard: str
+
+
+@dataclass
+class AppWrapper:
+    """
+    For storing information about an AppWrapper.
+    """
+
+    name: str
+    status: AppWrapperStatus
+    can_run: bool
+    job_state: str
+
+
+
+
+
+
+
+
+
+

Classes

+
+
+class AppWrapper +(name: str, status: AppWrapperStatus, can_run: bool, job_state: str) +
+
+

For storing information about an AppWrapper.

+
+ +Expand source code + +
class AppWrapper:
+    """
+    For storing information about an AppWrapper.
+    """
+
+    name: str
+    status: AppWrapperStatus
+    can_run: bool
+    job_state: str
+
+

Class variables

+
+
var can_run : bool
+
+
+
+
var job_state : str
+
+
+
+
var name : str
+
+
+
+
var statusAppWrapperStatus
+
+
+
+
+
+
+class AppWrapperStatus +(value, names=None, *, module=None, qualname=None, type=None, start=1) +
+
+

Defines the possible reportable states of an AppWrapper.

+
+ +Expand source code + +
class AppWrapperStatus(Enum):
+    """
+    Defines the possible reportable states of an AppWrapper.
+    """
+
+    PENDING = "pending"
+    RUNNING = "running"
+    FAILED = "failed"
+    DELETED = "deleted"
+    COMPLETED = "completed"
+    RUNNING_HOLD_COMPLETION = "runningholdcompletion"
+
+

Ancestors

+
    +
  • enum.Enum
  • +
+

Class variables

+
+
var COMPLETED
+
+
+
+
var DELETED
+
+
+
+
var FAILED
+
+
+
+
var PENDING
+
+
+
+
var RUNNING
+
+
+
+
var RUNNING_HOLD_COMPLETION
+
+
+
+
+
+
+class CodeFlareClusterStatus +(value, names=None, *, module=None, qualname=None, type=None, start=1) +
+
+

Defines the possible reportable states of a Codeflare cluster.

+
+ +Expand source code + +
class CodeFlareClusterStatus(Enum):
+    """
+    Defines the possible reportable states of a Codeflare cluster.
+    """
+
+    READY = 1
+    QUEUED = 2
+    FAILED = 3
+    UNKNOWN = 4
+
+

Ancestors

+
    +
  • enum.Enum
  • +
+

Class variables

+
+
var FAILED
+
+
+
+
var QUEUED
+
+
+
+
var READY
+
+
+
+
var UNKNOWN
+
+
+
+
+
+
+class RayCluster +(name: str, status: RayClusterStatus, min_workers: int, max_workers: int, worker_mem_min: str, worker_mem_max: str, worker_cpu: int, worker_gpu: int, namespace: str, dashboard: str) +
+
+

For storing information about a Ray cluster.

+
+ +Expand source code + +
class RayCluster:
+    """
+    For storing information about a Ray cluster.
+    """
+
+    name: str
+    status: RayClusterStatus
+    min_workers: int
+    max_workers: int
+    worker_mem_min: str
+    worker_mem_max: str
+    worker_cpu: int
+    worker_gpu: int
+    namespace: str
+    dashboard: str
+
+

Class variables

+
+
var dashboard : str
+
+
+
+
var max_workers : int
+
+
+
+
var min_workers : int
+
+
+
+
var name : str
+
+
+
+
var namespace : str
+
+
+
+
var statusRayClusterStatus
+
+
+
+
var worker_cpu : int
+
+
+
+
var worker_gpu : int
+
+
+
+
var worker_mem_max : str
+
+
+
+
var worker_mem_min : str
+
+
+
+
+
+
+class RayClusterStatus +(value, names=None, *, module=None, qualname=None, type=None, start=1) +
+
+

Defines the possible reportable states of a Ray cluster.

+
+ +Expand source code + +
class RayClusterStatus(Enum):
+    """
+    Defines the possible reportable states of a Ray cluster.
+    """
+
+    # https://github.com/ray-project/kuberay/blob/master/ray-operator/apis/ray/v1alpha1/raycluster_types.go#L95
+    READY = "ready"
+    UNHEALTHY = "unhealthy"
+    FAILED = "failed"
+    UNKNOWN = "unknown"
+
+

Ancestors

+
    +
  • enum.Enum
  • +
+

Class variables

+
+
var FAILED
+
+
+
+
var READY
+
+
+
+
var UNHEALTHY
+
+
+
+
var UNKNOWN
+
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/index.html b/docs/index.html new file mode 100644 index 000000000..2b2aa84fb --- /dev/null +++ b/docs/index.html @@ -0,0 +1,65 @@ + + + + + + +codeflare_sdk API documentation + + + + + + + + + + + +
+ + +
+ + + \ No newline at end of file diff --git a/docs/utils/generate_yaml.html b/docs/utils/generate_yaml.html new file mode 100644 index 000000000..b2189e519 --- /dev/null +++ b/docs/utils/generate_yaml.html @@ -0,0 +1,885 @@ + + + + + + +codeflare_sdk.utils.generate_yaml API documentation + + + + + + + + + + + +
+
+
+

Module codeflare_sdk.utils.generate_yaml

+
+
+

This sub-module exists primarily to be used internally by the Cluster object +(in the cluster sub-module) for AppWrapper generation.

+
+ +Expand source code + +
# Copyright 2022 IBM, Red Hat
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This sub-module exists primarily to be used internally by the Cluster object
+(in the cluster sub-module) for AppWrapper generation.
+"""
+
+import yaml
+import sys
+import argparse
+import uuid
+
+
+def read_template(template):
+    with open(template, "r") as stream:
+        try:
+            return yaml.safe_load(stream)
+        except yaml.YAMLError as exc:
+            print(exc)
+
+
+def gen_names(name):
+    if not name:
+        gen_id = str(uuid.uuid4())
+        appwrapper_name = "appwrapper-" + gen_id
+        cluster_name = "cluster-" + gen_id
+        return appwrapper_name, cluster_name
+    else:
+        return name, name
+
+
+def update_dashboard_route(route_item, cluster_name, namespace):
+    metadata = route_item.get("generictemplate", {}).get("metadata")
+    metadata["name"] = f"ray-dashboard-{cluster_name}"
+    metadata["namespace"] = namespace
+    metadata["labels"]["odh-ray-cluster-service"] = f"{cluster_name}-head-svc"
+    spec = route_item.get("generictemplate", {}).get("spec")
+    spec["to"]["name"] = f"{cluster_name}-head-svc"
+
+
+def update_names(yaml, item, appwrapper_name, cluster_name, namespace):
+    metadata = yaml.get("metadata")
+    metadata["name"] = appwrapper_name
+    metadata["namespace"] = namespace
+    lower_meta = item.get("generictemplate", {}).get("metadata")
+    lower_meta["labels"]["appwrapper.mcad.ibm.com"] = appwrapper_name
+    lower_meta["name"] = cluster_name
+    lower_meta["namespace"] = namespace
+
+
+def update_labels(yaml, instascale, instance_types):
+    metadata = yaml.get("metadata")
+    if instascale:
+        if not len(instance_types) > 0:
+            sys.exit(
+                "If instascale is set to true, must provide at least one instance type"
+            )
+        type_str = ""
+        for type in instance_types:
+            type_str += type + "_"
+        type_str = type_str[:-1]
+        metadata["labels"]["orderedinstance"] = type_str
+    else:
+        metadata.pop("labels")
+
+
+def update_custompodresources(
+    item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
+):
+    if "custompodresources" in item.keys():
+        custompodresources = item.get("custompodresources")
+        for i in range(len(custompodresources)):
+            if i == 0:
+                # Leave head node resources as template default
+                continue
+            resource = custompodresources[i]
+            for k, v in resource.items():
+                if k == "replicas" and i == 1:
+                    resource[k] = workers
+                if k == "requests" or k == "limits":
+                    for spec, _ in v.items():
+                        if spec == "cpu":
+                            if k == "limits":
+                                resource[k][spec] = max_cpu
+                            else:
+                                resource[k][spec] = min_cpu
+                        if spec == "memory":
+                            if k == "limits":
+                                resource[k][spec] = str(max_memory) + "G"
+                            else:
+                                resource[k][spec] = str(min_memory) + "G"
+                        if spec == "nvidia.com/gpu":
+                            if i == 0:
+                                resource[k][spec] = 0
+                            else:
+                                resource[k][spec] = gpu
+    else:
+        sys.exit("Error: malformed template")
+
+
+def update_affinity(spec, appwrapper_name, instascale):
+    if instascale:
+        node_selector_terms = (
+            spec.get("affinity")
+            .get("nodeAffinity")
+            .get("requiredDuringSchedulingIgnoredDuringExecution")
+            .get("nodeSelectorTerms")
+        )
+        node_selector_terms[0]["matchExpressions"][0]["values"][0] = appwrapper_name
+        node_selector_terms[0]["matchExpressions"][0]["key"] = appwrapper_name
+    else:
+        spec.pop("affinity")
+
+
+def update_image(spec, image):
+    containers = spec.get("containers")
+    for container in containers:
+        container["image"] = image
+
+
+def update_env(spec, env):
+    containers = spec.get("containers")
+    for container in containers:
+        if env:
+            if "env" in container:
+                container["env"].extend(env)
+            else:
+                container["env"] = env
+
+
+def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu):
+    container = spec.get("containers")
+    for resource in container:
+        requests = resource.get("resources").get("requests")
+        if requests is not None:
+            requests["cpu"] = min_cpu
+            requests["memory"] = str(min_memory) + "G"
+            requests["nvidia.com/gpu"] = gpu
+        limits = resource.get("resources").get("limits")
+        if limits is not None:
+            limits["cpu"] = max_cpu
+            limits["memory"] = str(max_memory) + "G"
+            limits["nvidia.com/gpu"] = gpu
+
+
+def update_nodes(
+    item,
+    appwrapper_name,
+    min_cpu,
+    max_cpu,
+    min_memory,
+    max_memory,
+    gpu,
+    workers,
+    image,
+    instascale,
+    env,
+):
+    if "generictemplate" in item.keys():
+        head = item.get("generictemplate").get("spec").get("headGroupSpec")
+        worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0]
+
+        # Head counts as first worker
+        worker["replicas"] = workers
+        worker["minReplicas"] = workers
+        worker["maxReplicas"] = workers
+        worker["groupName"] = "small-group-" + appwrapper_name
+        worker["rayStartParams"]["num-gpus"] = str(int(gpu))
+
+        for comp in [head, worker]:
+            spec = comp.get("template").get("spec")
+            update_affinity(spec, appwrapper_name, instascale)
+            update_image(spec, image)
+            update_env(spec, env)
+            if comp == head:
+                update_resources(spec, 2, 2, 8, 8, 0)
+            else:
+                update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu)
+
+
+def write_user_appwrapper(user_yaml, output_file_name):
+    with open(output_file_name, "w") as outfile:
+        yaml.dump(user_yaml, outfile, default_flow_style=False)
+    print(f"Written to: {output_file_name}")
+
+
+def generate_appwrapper(
+    name: str,
+    namespace: str,
+    min_cpu: int,
+    max_cpu: int,
+    min_memory: int,
+    max_memory: int,
+    gpu: int,
+    workers: int,
+    template: str,
+    image: str,
+    instascale: bool,
+    instance_types: list,
+    env,
+):
+    user_yaml = read_template(template)
+    appwrapper_name, cluster_name = gen_names(name)
+    resources = user_yaml.get("spec", "resources")
+    item = resources["resources"].get("GenericItems")[0]
+    route_item = resources["resources"].get("GenericItems")[1]
+    update_names(user_yaml, item, appwrapper_name, cluster_name, namespace)
+    update_labels(user_yaml, instascale, instance_types)
+    update_custompodresources(
+        item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
+    )
+    update_nodes(
+        item,
+        appwrapper_name,
+        min_cpu,
+        max_cpu,
+        min_memory,
+        max_memory,
+        gpu,
+        workers,
+        image,
+        instascale,
+        env,
+    )
+    update_dashboard_route(route_item, cluster_name, namespace)
+    outfile = appwrapper_name + ".yaml"
+    write_user_appwrapper(user_yaml, outfile)
+    return outfile
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate user AppWrapper")
+    parser.add_argument(
+        "--name",
+        required=False,
+        default="",
+        help="User selected name for AppWrapper and Ray Cluster (auto-generated if not provided)",
+    )
+    parser.add_argument(
+        "--min-cpu",
+        type=int,
+        required=True,
+        help="min number of CPU(s) in a worker required for running job",
+    )
+    parser.add_argument(
+        "--max-cpu",
+        type=int,
+        required=True,
+        help="max number of CPU(s) in a worker required for running job",
+    )
+    parser.add_argument(
+        "--min-memory",
+        type=int,
+        required=True,
+        help="min RAM required in a worker for running job, in GB",
+    )
+    parser.add_argument(
+        "--max-memory",
+        type=int,
+        required=True,
+        help="max RAM required in a worker for running job, in GB",
+    )
+    parser.add_argument(
+        "--gpu",
+        type=int,
+        required=True,
+        help="GPU(s) required in a worker for running job",
+    )
+    parser.add_argument(
+        "--workers",
+        type=int,
+        required=True,
+        help="How many workers are required in the cluster",
+    )
+    parser.add_argument(
+        "--template", required=True, help="Template AppWrapper yaml file"
+    )
+    parser.add_argument(
+        "--image",
+        required=False,
+        default="rayproject/ray:latest",
+        help="Ray image to be used (defaults to rayproject/ray:latest)",
+    )
+    parser.add_argument(
+        "--instascale",
+        default=False,
+        required=False,
+        action="store_true",
+        help="Indicates that instascale is installed on the cluster",
+    )
+    parser.add_argument(
+        "--instance-types",
+        type=str,
+        nargs="+",
+        default=[],
+        required=False,
+        help="Head,worker instance types (space separated)",
+    )
+    parser.add_argument(
+        "--namespace",
+        required=False,
+        default="default",
+        help="Set the kubernetes namespace you want to deploy your cluster to. Default. If left blank, uses the 'default' namespace",
+    )
+
+    args = parser.parse_args()
+    name = args.name
+    min_cpu = args.min_cpu
+    max_cpu = args.max_cpu
+    min_memory = args.min_memory
+    max_memory = args.max_memory
+    gpu = args.gpu
+    workers = args.workers
+    template = args.template
+    image = args.image
+    instascale = args.instascale
+    instance_types = args.instance_types
+    namespace = args.namespace
+    env = {}
+
+    outfile = generate_appwrapper(
+        name,
+        namespace,
+        min_cpu,
+        max_cpu,
+        min_memory,
+        max_memory,
+        gpu,
+        workers,
+        template,
+        image,
+        instascale,
+        instance_types,
+        env,
+    )
+    return outfile
+
+
+if __name__ == "__main__":
+    main()
+
+
+
+
+
+
+
+

Functions

+
+
+def gen_names(name) +
+
+
+
+ +Expand source code + +
def gen_names(name):
+    if not name:
+        gen_id = str(uuid.uuid4())
+        appwrapper_name = "appwrapper-" + gen_id
+        cluster_name = "cluster-" + gen_id
+        return appwrapper_name, cluster_name
+    else:
+        return name, name
+
+
+
+def generate_appwrapper(name: str, namespace: str, min_cpu: int, max_cpu: int, min_memory: int, max_memory: int, gpu: int, workers: int, template: str, image: str, instascale: bool, instance_types: list, env) +
+
+
+
+ +Expand source code + +
def generate_appwrapper(
+    name: str,
+    namespace: str,
+    min_cpu: int,
+    max_cpu: int,
+    min_memory: int,
+    max_memory: int,
+    gpu: int,
+    workers: int,
+    template: str,
+    image: str,
+    instascale: bool,
+    instance_types: list,
+    env,
+):
+    user_yaml = read_template(template)
+    appwrapper_name, cluster_name = gen_names(name)
+    resources = user_yaml.get("spec", "resources")
+    item = resources["resources"].get("GenericItems")[0]
+    route_item = resources["resources"].get("GenericItems")[1]
+    update_names(user_yaml, item, appwrapper_name, cluster_name, namespace)
+    update_labels(user_yaml, instascale, instance_types)
+    update_custompodresources(
+        item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
+    )
+    update_nodes(
+        item,
+        appwrapper_name,
+        min_cpu,
+        max_cpu,
+        min_memory,
+        max_memory,
+        gpu,
+        workers,
+        image,
+        instascale,
+        env,
+    )
+    update_dashboard_route(route_item, cluster_name, namespace)
+    outfile = appwrapper_name + ".yaml"
+    write_user_appwrapper(user_yaml, outfile)
+    return outfile
+
+
+
+def main() +
+
+
+
+ +Expand source code + +
def main():
+    parser = argparse.ArgumentParser(description="Generate user AppWrapper")
+    parser.add_argument(
+        "--name",
+        required=False,
+        default="",
+        help="User selected name for AppWrapper and Ray Cluster (auto-generated if not provided)",
+    )
+    parser.add_argument(
+        "--min-cpu",
+        type=int,
+        required=True,
+        help="min number of CPU(s) in a worker required for running job",
+    )
+    parser.add_argument(
+        "--max-cpu",
+        type=int,
+        required=True,
+        help="max number of CPU(s) in a worker required for running job",
+    )
+    parser.add_argument(
+        "--min-memory",
+        type=int,
+        required=True,
+        help="min RAM required in a worker for running job, in GB",
+    )
+    parser.add_argument(
+        "--max-memory",
+        type=int,
+        required=True,
+        help="max RAM required in a worker for running job, in GB",
+    )
+    parser.add_argument(
+        "--gpu",
+        type=int,
+        required=True,
+        help="GPU(s) required in a worker for running job",
+    )
+    parser.add_argument(
+        "--workers",
+        type=int,
+        required=True,
+        help="How many workers are required in the cluster",
+    )
+    parser.add_argument(
+        "--template", required=True, help="Template AppWrapper yaml file"
+    )
+    parser.add_argument(
+        "--image",
+        required=False,
+        default="rayproject/ray:latest",
+        help="Ray image to be used (defaults to rayproject/ray:latest)",
+    )
+    parser.add_argument(
+        "--instascale",
+        default=False,
+        required=False,
+        action="store_true",
+        help="Indicates that instascale is installed on the cluster",
+    )
+    parser.add_argument(
+        "--instance-types",
+        type=str,
+        nargs="+",
+        default=[],
+        required=False,
+        help="Head,worker instance types (space separated)",
+    )
+    parser.add_argument(
+        "--namespace",
+        required=False,
+        default="default",
+        help="Set the kubernetes namespace you want to deploy your cluster to. Default. If left blank, uses the 'default' namespace",
+    )
+
+    args = parser.parse_args()
+    name = args.name
+    min_cpu = args.min_cpu
+    max_cpu = args.max_cpu
+    min_memory = args.min_memory
+    max_memory = args.max_memory
+    gpu = args.gpu
+    workers = args.workers
+    template = args.template
+    image = args.image
+    instascale = args.instascale
+    instance_types = args.instance_types
+    namespace = args.namespace
+    env = {}
+
+    outfile = generate_appwrapper(
+        name,
+        namespace,
+        min_cpu,
+        max_cpu,
+        min_memory,
+        max_memory,
+        gpu,
+        workers,
+        template,
+        image,
+        instascale,
+        instance_types,
+        env,
+    )
+    return outfile
+
+
+
+def read_template(template) +
+
+
+
+ +Expand source code + +
def read_template(template):
+    with open(template, "r") as stream:
+        try:
+            return yaml.safe_load(stream)
+        except yaml.YAMLError as exc:
+            print(exc)
+
+
+
+def update_affinity(spec, appwrapper_name, instascale) +
+
+
+
+ +Expand source code + +
def update_affinity(spec, appwrapper_name, instascale):
+    if instascale:
+        node_selector_terms = (
+            spec.get("affinity")
+            .get("nodeAffinity")
+            .get("requiredDuringSchedulingIgnoredDuringExecution")
+            .get("nodeSelectorTerms")
+        )
+        node_selector_terms[0]["matchExpressions"][0]["values"][0] = appwrapper_name
+        node_selector_terms[0]["matchExpressions"][0]["key"] = appwrapper_name
+    else:
+        spec.pop("affinity")
+
+
+
+def update_custompodresources(item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers) +
+
+
+
+ +Expand source code + +
def update_custompodresources(
+    item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
+):
+    if "custompodresources" in item.keys():
+        custompodresources = item.get("custompodresources")
+        for i in range(len(custompodresources)):
+            if i == 0:
+                # Leave head node resources as template default
+                continue
+            resource = custompodresources[i]
+            for k, v in resource.items():
+                if k == "replicas" and i == 1:
+                    resource[k] = workers
+                if k == "requests" or k == "limits":
+                    for spec, _ in v.items():
+                        if spec == "cpu":
+                            if k == "limits":
+                                resource[k][spec] = max_cpu
+                            else:
+                                resource[k][spec] = min_cpu
+                        if spec == "memory":
+                            if k == "limits":
+                                resource[k][spec] = str(max_memory) + "G"
+                            else:
+                                resource[k][spec] = str(min_memory) + "G"
+                        if spec == "nvidia.com/gpu":
+                            if i == 0:
+                                resource[k][spec] = 0
+                            else:
+                                resource[k][spec] = gpu
+    else:
+        sys.exit("Error: malformed template")
+
+
+
+def update_dashboard_route(route_item, cluster_name, namespace) +
+
+
+
+ +Expand source code + +
def update_dashboard_route(route_item, cluster_name, namespace):
+    metadata = route_item.get("generictemplate", {}).get("metadata")
+    metadata["name"] = f"ray-dashboard-{cluster_name}"
+    metadata["namespace"] = namespace
+    metadata["labels"]["odh-ray-cluster-service"] = f"{cluster_name}-head-svc"
+    spec = route_item.get("generictemplate", {}).get("spec")
+    spec["to"]["name"] = f"{cluster_name}-head-svc"
+
+
+
+def update_env(spec, env) +
+
+
+
+ +Expand source code + +
def update_env(spec, env):
+    containers = spec.get("containers")
+    for container in containers:
+        if env:
+            if "env" in container:
+                container["env"].extend(env)
+            else:
+                container["env"] = env
+
+
+
+def update_image(spec, image) +
+
+
+
+ +Expand source code + +
def update_image(spec, image):
+    containers = spec.get("containers")
+    for container in containers:
+        container["image"] = image
+
+
+
+def update_labels(yaml, instascale, instance_types) +
+
+
+
+ +Expand source code + +
def update_labels(yaml, instascale, instance_types):
+    metadata = yaml.get("metadata")
+    if instascale:
+        if not len(instance_types) > 0:
+            sys.exit(
+                "If instascale is set to true, must provide at least one instance type"
+            )
+        type_str = ""
+        for type in instance_types:
+            type_str += type + "_"
+        type_str = type_str[:-1]
+        metadata["labels"]["orderedinstance"] = type_str
+    else:
+        metadata.pop("labels")
+
+
+
+def update_names(yaml, item, appwrapper_name, cluster_name, namespace) +
+
+
+
+ +Expand source code + +
def update_names(yaml, item, appwrapper_name, cluster_name, namespace):
+    metadata = yaml.get("metadata")
+    metadata["name"] = appwrapper_name
+    metadata["namespace"] = namespace
+    lower_meta = item.get("generictemplate", {}).get("metadata")
+    lower_meta["labels"]["appwrapper.mcad.ibm.com"] = appwrapper_name
+    lower_meta["name"] = cluster_name
+    lower_meta["namespace"] = namespace
+
+
+
+def update_nodes(item, appwrapper_name, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, image, instascale, env) +
+
+
+
+ +Expand source code + +
def update_nodes(
+    item,
+    appwrapper_name,
+    min_cpu,
+    max_cpu,
+    min_memory,
+    max_memory,
+    gpu,
+    workers,
+    image,
+    instascale,
+    env,
+):
+    if "generictemplate" in item.keys():
+        head = item.get("generictemplate").get("spec").get("headGroupSpec")
+        worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0]
+
+        # Head counts as first worker
+        worker["replicas"] = workers
+        worker["minReplicas"] = workers
+        worker["maxReplicas"] = workers
+        worker["groupName"] = "small-group-" + appwrapper_name
+        worker["rayStartParams"]["num-gpus"] = str(int(gpu))
+
+        for comp in [head, worker]:
+            spec = comp.get("template").get("spec")
+            update_affinity(spec, appwrapper_name, instascale)
+            update_image(spec, image)
+            update_env(spec, env)
+            if comp == head:
+                update_resources(spec, 2, 2, 8, 8, 0)
+            else:
+                update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu)
+
+
+
+def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu) +
+
+
+
+ +Expand source code + +
def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu):
+    container = spec.get("containers")
+    for resource in container:
+        requests = resource.get("resources").get("requests")
+        if requests is not None:
+            requests["cpu"] = min_cpu
+            requests["memory"] = str(min_memory) + "G"
+            requests["nvidia.com/gpu"] = gpu
+        limits = resource.get("resources").get("limits")
+        if limits is not None:
+            limits["cpu"] = max_cpu
+            limits["memory"] = str(max_memory) + "G"
+            limits["nvidia.com/gpu"] = gpu
+
+
+
+def write_user_appwrapper(user_yaml, output_file_name) +
+
+
+
+ +Expand source code + +
def write_user_appwrapper(user_yaml, output_file_name):
+    with open(output_file_name, "w") as outfile:
+        yaml.dump(user_yaml, outfile, default_flow_style=False)
+    print(f"Written to: {output_file_name}")
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/utils/index.html b/docs/utils/index.html new file mode 100644 index 000000000..c764c8b16 --- /dev/null +++ b/docs/utils/index.html @@ -0,0 +1,72 @@ + + + + + + +codeflare_sdk.utils API documentation + + + + + + + + + + + +
+
+
+

Module codeflare_sdk.utils

+
+
+
+
+

Sub-modules

+
+
codeflare_sdk.utils.generate_yaml
+
+

This sub-module exists primarily to be used internally by the Cluster object +(in the cluster sub-module) for AppWrapper generation.

+
+
codeflare_sdk.utils.pretty_print
+
+

This sub-module exists primarily to be used internally by the Cluster object +(in the cluster sub-module) for pretty-printing cluster status and details.

+
+
+
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file diff --git a/docs/utils/pretty_print.html b/docs/utils/pretty_print.html new file mode 100644 index 000000000..013d6e248 --- /dev/null +++ b/docs/utils/pretty_print.html @@ -0,0 +1,351 @@ + + + + + + +codeflare_sdk.utils.pretty_print API documentation + + + + + + + + + + + +
+
+
+

Module codeflare_sdk.utils.pretty_print

+
+
+

This sub-module exists primarily to be used internally by the Cluster object +(in the cluster sub-module) for pretty-printing cluster status and details.

+
+ +Expand source code + +
# Copyright 2022 IBM, Red Hat
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This sub-module exists primarily to be used internally by the Cluster object
+(in the cluster sub-module) for pretty-printing cluster status and details.
+"""
+
+from rich import print
+from rich.table import Table
+from rich.console import Console
+from rich.layout import Layout
+from rich.panel import Panel
+from rich import box
+from typing import List
+from ..cluster.model import RayCluster, AppWrapper, RayClusterStatus
+
+
+def print_no_resources_found():
+    console = Console()
+    console.print(Panel("[red]No resources found"))
+
+
+def print_app_wrappers_status(app_wrappers: List[AppWrapper]):
+    if not app_wrappers:
+        print_no_resources_found()
+        return  # shortcircuit
+
+    console = Console()
+    for app_wrapper in app_wrappers:
+        name = app_wrapper.name
+        status = app_wrapper.status.value
+
+        table = Table(
+            box=box.ASCII_DOUBLE_HEAD,
+            title="[bold] :rocket: List of CodeFlare clusters in queue:rocket:",
+        )
+        table.add_column("Name", style="cyan", no_wrap=True)
+        table.add_column("Status", style="magenta")
+        table.add_row(name, status)
+        table.add_row("")  # empty row for spacing
+        console.print(Panel.fit(table))
+
+
+def print_clusters(clusters: List[RayCluster], verbose=True):
+    if not clusters:
+        print_no_resources_found()
+        return  # shortcircuit
+
+    console = Console()
+    title_printed = False
+
+    for cluster in clusters:
+        status = (
+            "Active :white_heavy_check_mark:"
+            if cluster.status == RayClusterStatus.READY
+            else "InActive :x:"
+        )
+        name = cluster.name
+        dashboard = cluster.dashboard
+        mincount = str(cluster.min_workers)
+        maxcount = str(cluster.max_workers)
+        memory = cluster.worker_mem_min + "~" + cluster.worker_mem_max
+        cpu = str(cluster.worker_cpu)
+        gpu = str(cluster.worker_gpu)
+        # owned = bool(cluster["userOwned"])
+        owned = True
+
+        #'table0' to display the cluster name, status, url, and dashboard link
+        table0 = Table(box=None, show_header=False)
+        if owned:
+            table0.add_row("[white on green][bold]Owner")
+        else:
+            table0.add_row("")
+        table0.add_row("[bold underline]" + name, status)
+        table0.add_row()
+        # fixme harcded to default for now
+        table0.add_row(
+            f"[bold]URI:[/bold] ray://{cluster.name}-head-svc.{cluster.namespace}.svc:10001"
+        )  # format that is used to generate the name of the service
+        table0.add_row()
+        table0.add_row(f"[link={dashboard} blue underline]Dashboard:link:[/link]")
+        table0.add_row("")  # empty row for spacing
+
+        #'table1' to display the worker counts
+        table1 = Table(box=None)
+        table1.add_row()
+        table1.add_column("Min", style="cyan", no_wrap=True)
+        table1.add_column("Max", style="magenta")
+        table1.add_row()
+        table1.add_row(mincount, maxcount)
+        table1.add_row()
+
+        #'table2' to display the worker resources
+        table2 = Table(box=None)
+        table2.add_column("Memory", style="cyan", no_wrap=True, min_width=10)
+        table2.add_column("CPU", style="magenta", min_width=10)
+        table2.add_column("GPU", style="magenta", min_width=10)
+        table2.add_row()
+        table2.add_row(memory, cpu, gpu)
+        table2.add_row()
+
+        # panels to encompass table1 and table2 into separate cards
+        panel_1 = Panel.fit(table1, title="Workers")
+        panel_2 = Panel.fit(table2, title="Worker specs(each)")
+
+        # table3 to display panel_1 and panel_2 side-by-side in a single row
+        table3 = Table(box=None, show_header=False, title="Cluster Resources")
+        table3.add_row(panel_1, panel_2)
+
+        # table4 to display table0 and table3, one below the other
+        table4 = Table(box=None, show_header=False)
+        table4.add_row(table0)
+        table4.add_row(table3)
+
+        # Encompass all details of the cluster in a single panel
+        if not title_printed:
+            # If first cluster in the list, then create a table with title "Codeflare clusters".
+            # This is done to ensure the title is center aligned on the cluster display tables, rather
+            # than being center aligned on the console/terminal if we simply use console.print(title)
+
+            table5 = Table(
+                box=None, title="[bold] :rocket: List of CodeFlare clusters :rocket:"
+            )
+            table5.add_row(Panel.fit(table4))
+            console.print(table5)
+            title_printed = True
+        else:
+            console.print(Panel.fit(table4))
+
+
+
+
+
+
+
+

Functions

+
+
+def print_app_wrappers_status(app_wrappers: List[AppWrapper]) +
+
+
+
+ +Expand source code + +
def print_app_wrappers_status(app_wrappers: List[AppWrapper]):
+    if not app_wrappers:
+        print_no_resources_found()
+        return  # shortcircuit
+
+    console = Console()
+    for app_wrapper in app_wrappers:
+        name = app_wrapper.name
+        status = app_wrapper.status.value
+
+        table = Table(
+            box=box.ASCII_DOUBLE_HEAD,
+            title="[bold] :rocket: List of CodeFlare clusters in queue:rocket:",
+        )
+        table.add_column("Name", style="cyan", no_wrap=True)
+        table.add_column("Status", style="magenta")
+        table.add_row(name, status)
+        table.add_row("")  # empty row for spacing
+        console.print(Panel.fit(table))
+
+
+
+def print_clusters(clusters: List[RayCluster], verbose=True) +
+
+
+
+ +Expand source code + +
def print_clusters(clusters: List[RayCluster], verbose=True):
+    if not clusters:
+        print_no_resources_found()
+        return  # shortcircuit
+
+    console = Console()
+    title_printed = False
+
+    for cluster in clusters:
+        status = (
+            "Active :white_heavy_check_mark:"
+            if cluster.status == RayClusterStatus.READY
+            else "InActive :x:"
+        )
+        name = cluster.name
+        dashboard = cluster.dashboard
+        mincount = str(cluster.min_workers)
+        maxcount = str(cluster.max_workers)
+        memory = cluster.worker_mem_min + "~" + cluster.worker_mem_max
+        cpu = str(cluster.worker_cpu)
+        gpu = str(cluster.worker_gpu)
+        # owned = bool(cluster["userOwned"])
+        owned = True
+
+        #'table0' to display the cluster name, status, url, and dashboard link
+        table0 = Table(box=None, show_header=False)
+        if owned:
+            table0.add_row("[white on green][bold]Owner")
+        else:
+            table0.add_row("")
+        table0.add_row("[bold underline]" + name, status)
+        table0.add_row()
+        # fixme harcded to default for now
+        table0.add_row(
+            f"[bold]URI:[/bold] ray://{cluster.name}-head-svc.{cluster.namespace}.svc:10001"
+        )  # format that is used to generate the name of the service
+        table0.add_row()
+        table0.add_row(f"[link={dashboard} blue underline]Dashboard:link:[/link]")
+        table0.add_row("")  # empty row for spacing
+
+        #'table1' to display the worker counts
+        table1 = Table(box=None)
+        table1.add_row()
+        table1.add_column("Min", style="cyan", no_wrap=True)
+        table1.add_column("Max", style="magenta")
+        table1.add_row()
+        table1.add_row(mincount, maxcount)
+        table1.add_row()
+
+        #'table2' to display the worker resources
+        table2 = Table(box=None)
+        table2.add_column("Memory", style="cyan", no_wrap=True, min_width=10)
+        table2.add_column("CPU", style="magenta", min_width=10)
+        table2.add_column("GPU", style="magenta", min_width=10)
+        table2.add_row()
+        table2.add_row(memory, cpu, gpu)
+        table2.add_row()
+
+        # panels to encompass table1 and table2 into separate cards
+        panel_1 = Panel.fit(table1, title="Workers")
+        panel_2 = Panel.fit(table2, title="Worker specs(each)")
+
+        # table3 to display panel_1 and panel_2 side-by-side in a single row
+        table3 = Table(box=None, show_header=False, title="Cluster Resources")
+        table3.add_row(panel_1, panel_2)
+
+        # table4 to display table0 and table3, one below the other
+        table4 = Table(box=None, show_header=False)
+        table4.add_row(table0)
+        table4.add_row(table3)
+
+        # Encompass all details of the cluster in a single panel
+        if not title_printed:
+            # If first cluster in the list, then create a table with title "Codeflare clusters".
+            # This is done to ensure the title is center aligned on the cluster display tables, rather
+            # than being center aligned on the console/terminal if we simply use console.print(title)
+
+            table5 = Table(
+                box=None, title="[bold] :rocket: List of CodeFlare clusters :rocket:"
+            )
+            table5.add_row(Panel.fit(table4))
+            console.print(table5)
+            title_printed = True
+        else:
+            console.print(Panel.fit(table4))
+
+
+
+def print_no_resources_found() +
+
+
+
+ +Expand source code + +
def print_no_resources_found():
+    console = Console()
+    console.print(Panel("[red]No resources found"))
+
+
+
+
+
+
+
+ +
+ + + \ No newline at end of file From 6ffa289befaaa7529249bf30da93f7084b4144a2 Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Sat, 3 Dec 2022 17:49:54 -0500 Subject: [PATCH 2/2] Remove dups --- docs/codeflare_sdk/cluster/cluster.html | 873 ------------------- docs/codeflare_sdk/cluster/config.html | 234 ------ docs/codeflare_sdk/cluster/index.html | 79 -- docs/codeflare_sdk/cluster/model.html | 464 ---------- docs/codeflare_sdk/index.html | 65 -- docs/codeflare_sdk/utils/generate_yaml.html | 885 -------------------- docs/codeflare_sdk/utils/index.html | 72 -- docs/codeflare_sdk/utils/pretty_print.html | 351 -------- 8 files changed, 3023 deletions(-) delete mode 100644 docs/codeflare_sdk/cluster/cluster.html delete mode 100644 docs/codeflare_sdk/cluster/config.html delete mode 100644 docs/codeflare_sdk/cluster/index.html delete mode 100644 docs/codeflare_sdk/cluster/model.html delete mode 100644 docs/codeflare_sdk/index.html delete mode 100644 docs/codeflare_sdk/utils/generate_yaml.html delete mode 100644 docs/codeflare_sdk/utils/index.html delete mode 100644 docs/codeflare_sdk/utils/pretty_print.html diff --git a/docs/codeflare_sdk/cluster/cluster.html b/docs/codeflare_sdk/cluster/cluster.html deleted file mode 100644 index 5e1fe6687..000000000 --- a/docs/codeflare_sdk/cluster/cluster.html +++ /dev/null @@ -1,873 +0,0 @@ - - - - - - -codeflare_sdk.cluster.cluster API documentation - - - - - - - - - - - -
-
-
-

Module codeflare_sdk.cluster.cluster

-
-
-

The cluster sub-module contains the definition of the Cluster object, which represents -the resources requested by the user. It also contains functions for checking the -cluster setup queue, a list of all existing clusters, and the user's working namespace.

-
- -Expand source code - -
# Copyright 2022 IBM, Red Hat
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-The cluster sub-module contains the definition of the Cluster object, which represents
-the resources requested by the user. It also contains functions for checking the
-cluster setup queue, a list of all existing clusters, and the user's working namespace.
-"""
-
-from os import stat
-from typing import List, Optional, Tuple
-
-import openshift as oc
-
-from ..utils import pretty_print
-from ..utils.generate_yaml import generate_appwrapper
-from .config import ClusterConfiguration
-from .model import (
-    AppWrapper,
-    AppWrapperStatus,
-    CodeFlareClusterStatus,
-    RayCluster,
-    RayClusterStatus,
-)
-
-
-class Cluster:
-    """
-    An object for requesting, bringing up, and taking down resources.
-    Can also be used for seeing the resource cluster status and details.
-
-    Note that currently, the underlying implementation is a Ray cluster.
-    """
-
-    def __init__(self, config: ClusterConfiguration):
-        """
-        Create the resource cluster object by passing in a ClusterConfiguration
-        (defined in the config sub-module). An AppWrapper will then be generated
-        based off of the configured resources to represent the desired cluster
-        request.
-        """
-        self.config = config
-        self.app_wrapper_yaml = self.create_app_wrapper()
-        self.app_wrapper_name = self.app_wrapper_yaml.split(".")[0]
-
-    def create_app_wrapper(self):
-        """
-        Called upon cluster object creation, creates an AppWrapper yaml based on
-        the specifications of the ClusterConfiguration.
-        """
-        name = self.config.name
-        namespace = self.config.namespace
-        min_cpu = self.config.min_cpus
-        max_cpu = self.config.max_cpus
-        min_memory = self.config.min_memory
-        max_memory = self.config.max_memory
-        gpu = self.config.gpu
-        workers = self.config.max_worker
-        template = self.config.template
-        image = self.config.image
-        instascale = self.config.instascale
-        instance_types = self.config.machine_types
-        env = self.config.envs
-        return generate_appwrapper(
-            name=name,
-            namespace=namespace,
-            min_cpu=min_cpu,
-            max_cpu=max_cpu,
-            min_memory=min_memory,
-            max_memory=max_memory,
-            gpu=gpu,
-            workers=workers,
-            template=template,
-            image=image,
-            instascale=instascale,
-            instance_types=instance_types,
-            env=env,
-        )
-
-    # creates a new cluster with the provided or default spec
-    def up(self):
-        """
-        Applies the AppWrapper yaml, pushing the resource request onto
-        the MCAD queue.
-        """
-        namespace = self.config.namespace
-        with oc.project(namespace):
-            oc.invoke("apply", ["-f", self.app_wrapper_yaml])
-
-    def down(self):
-        """
-        Deletes the AppWrapper yaml, scaling-down and deleting all resources
-        associated with the cluster.
-        """
-        namespace = self.config.namespace
-        with oc.project(namespace):
-            oc.invoke("delete", ["AppWrapper", self.app_wrapper_name])
-
-    def status(self, print_to_console: bool = True):
-        """
-        TO BE UPDATED: Will soon return (and print by default) the cluster's
-        status, from AppWrapper submission to setup completion. All resource
-        details will be moved to cluster.details().
-        """
-        cluster = _ray_cluster_status(self.config.name, self.config.namespace)
-        if cluster:
-            # overriding the number of gpus with requested
-            cluster.worker_gpu = self.config.gpu
-            if print_to_console:
-                pretty_print.print_clusters([cluster])
-            return cluster.status
-        else:
-            if print_to_console:
-                pretty_print.print_no_resources_found()
-            return None
-
-    def cluster_uri(self) -> str:
-        """
-        Returns a string containing the cluster's URI.
-        """
-        return f"ray://{self.config.name}-head-svc.{self.config.namespace}.svc:10001"
-
-    def cluster_dashboard_uri(self, namespace: str = "default") -> str:
-        """
-        Returns a string containing the cluster's dashboard URI.
-        """
-        try:
-            with oc.project(namespace):
-                route = oc.invoke(
-                    "get", ["route", "-o", "jsonpath='{$.items[0].spec.host}'"]
-                )
-                route = route.out().strip().strip("'")
-            return f"http://{route}"
-        except:
-            return "Dashboard route not available yet. Did you run cluster.up()?"
-
-    # checks whether the ray cluster is ready
-    def is_ready(self, print_to_console: bool = True):
-        """
-        TO BE DEPRECATED: functionality will be added into cluster.status().
-        """
-        ready = False
-        status = CodeFlareClusterStatus.UNKNOWN
-        # check the app wrapper status
-        appwrapper = _app_wrapper_status(self.config.name, self.config.namespace)
-        if appwrapper:
-            if appwrapper.status in [
-                AppWrapperStatus.RUNNING,
-                AppWrapperStatus.COMPLETED,
-                AppWrapperStatus.RUNNING_HOLD_COMPLETION,
-            ]:
-                ready = False
-                status = CodeFlareClusterStatus.QUEUED
-            elif appwrapper.status in [
-                AppWrapperStatus.FAILED,
-                AppWrapperStatus.DELETED,
-            ]:
-                ready = False
-                status = CodeFlareClusterStatus.FAILED  # should deleted be separate
-                return ready, status  # exit early, no need to check ray status
-            elif appwrapper.status in [AppWrapperStatus.PENDING]:
-                ready = False
-                status = CodeFlareClusterStatus.QUEUED
-                if print_to_console:
-                    pretty_print.print_app_wrappers_status([appwrapper])
-                return (
-                    ready,
-                    status,
-                )  # no need to check the ray status since still in queue
-
-        # check the ray cluster status
-        cluster = _ray_cluster_status(self.config.name, self.config.namespace)
-        if cluster:
-            if cluster.status == RayClusterStatus.READY:
-                ready = True
-                status = CodeFlareClusterStatus.READY
-            elif cluster.status in [
-                RayClusterStatus.UNHEALTHY,
-                RayClusterStatus.FAILED,
-            ]:
-                ready = False
-                status = CodeFlareClusterStatus.FAILED
-
-            if print_to_console:
-                # overriding the number of gpus with requested
-                cluster.worker_gpu = self.config.gpu
-                pretty_print.print_clusters([cluster])
-        return status, ready
-
-
-def get_current_namespace() -> str:
-    """
-    Returns the user's current working namespace.
-    """
-    namespace = oc.invoke("project", ["-q"]).actions()[0].out.strip()
-    return namespace
-
-
-def list_all_clusters(namespace: str, print_to_console: bool = True):
-    """
-    Returns (and prints by default) a list of all clusters in a given namespace.
-    """
-    clusters = _get_ray_clusters(namespace)
-    if print_to_console:
-        pretty_print.print_clusters(clusters)
-    return clusters
-
-
-def list_all_queued(namespace: str, print_to_console: bool = True):
-    """
-    Returns (and prints by default) a list of all currently queued-up AppWrappers
-    in a given namespace.
-    """
-    app_wrappers = _get_app_wrappers(
-        namespace, filter=[AppWrapperStatus.RUNNING, AppWrapperStatus.PENDING]
-    )
-    if print_to_console:
-        pretty_print.print_app_wrappers_status(app_wrappers)
-    return app_wrappers
-
-
-# private methods
-
-
-def _app_wrapper_status(name, namespace="default") -> Optional[AppWrapper]:
-    with oc.project(namespace), oc.timeout(10 * 60):
-        cluster = oc.selector(f"appwrapper/{name}").object()
-    if cluster:
-        return _map_to_app_wrapper(cluster)
-
-
-def _ray_cluster_status(name, namespace="default") -> Optional[RayCluster]:
-    # FIXME should we check the appwrapper first
-    cluster = None
-    try:
-        with oc.project(namespace), oc.timeout(10 * 60):
-            cluster = oc.selector(f"rayclusters/{name}").object()
-
-        if cluster:
-            return _map_to_ray_cluster(cluster)
-    except:
-        pass
-    return cluster
-
-
-def _get_ray_clusters(namespace="default") -> List[RayCluster]:
-    list_of_clusters = []
-
-    with oc.project(namespace), oc.timeout(10 * 60):
-        ray_clusters = oc.selector("rayclusters").objects()
-
-    for cluster in ray_clusters:
-        list_of_clusters.append(_map_to_ray_cluster(cluster))
-    return list_of_clusters
-
-
-def _get_app_wrappers(
-    namespace="default", filter=List[AppWrapperStatus]
-) -> List[AppWrapper]:
-    list_of_app_wrappers = []
-
-    with oc.project(namespace), oc.timeout(10 * 60):
-        app_wrappers = oc.selector("appwrappers").objects()
-
-    for item in app_wrappers:
-        app_wrapper = _map_to_app_wrapper(item)
-        if filter and app_wrapper.status in filter:
-            list_of_app_wrappers.append(app_wrapper)
-        else:
-            list_of_app_wrappers.append(app_wrapper)
-    return list_of_app_wrappers
-
-
-def _map_to_ray_cluster(cluster) -> RayCluster:
-    cluster_model = cluster.model
-
-    with oc.project(cluster.namespace()), oc.timeout(10 * 60):
-        route = (
-            oc.selector(f"route/ray-dashboard-{cluster.name()}")
-            .object()
-            .model.spec.host
-        )
-
-    return RayCluster(
-        name=cluster.name(),
-        status=RayClusterStatus(cluster_model.status.state.lower()),
-        # for now we are not using autoscaling so same replicas is fine
-        min_workers=cluster_model.spec.workerGroupSpecs[0].replicas,
-        max_workers=cluster_model.spec.workerGroupSpecs[0].replicas,
-        worker_mem_max=cluster_model.spec.workerGroupSpecs[0]
-        .template.spec.containers[0]
-        .resources.limits.memory,
-        worker_mem_min=cluster_model.spec.workerGroupSpecs[0]
-        .template.spec.containers[0]
-        .resources.requests.memory,
-        worker_cpu=cluster_model.spec.workerGroupSpecs[0]
-        .template.spec.containers[0]
-        .resources.limits.cpu,
-        worker_gpu=0,  # hard to detect currently how many gpus, can override it with what the user asked for
-        namespace=cluster.namespace(),
-        dashboard=route,
-    )
-
-
-def _map_to_app_wrapper(cluster) -> AppWrapper:
-    cluster_model = cluster.model
-    return AppWrapper(
-        name=cluster.name(),
-        status=AppWrapperStatus(cluster_model.status.state.lower()),
-        can_run=cluster_model.status.canrun,
-        job_state=cluster_model.status.queuejobstate,
-    )
-
-
-
-
-
-
-
-

Functions

-
-
-def get_current_namespace() ‑> str -
-
-

Returns the user's current working namespace.

-
- -Expand source code - -
def get_current_namespace() -> str:
-    """
-    Returns the user's current working namespace.
-    """
-    namespace = oc.invoke("project", ["-q"]).actions()[0].out.strip()
-    return namespace
-
-
-
-def list_all_clusters(namespace: str, print_to_console: bool = True) -
-
-

Returns (and prints by default) a list of all clusters in a given namespace.

-
- -Expand source code - -
def list_all_clusters(namespace: str, print_to_console: bool = True):
-    """
-    Returns (and prints by default) a list of all clusters in a given namespace.
-    """
-    clusters = _get_ray_clusters(namespace)
-    if print_to_console:
-        pretty_print.print_clusters(clusters)
-    return clusters
-
-
-
-def list_all_queued(namespace: str, print_to_console: bool = True) -
-
-

Returns (and prints by default) a list of all currently queued-up AppWrappers -in a given namespace.

-
- -Expand source code - -
def list_all_queued(namespace: str, print_to_console: bool = True):
-    """
-    Returns (and prints by default) a list of all currently queued-up AppWrappers
-    in a given namespace.
-    """
-    app_wrappers = _get_app_wrappers(
-        namespace, filter=[AppWrapperStatus.RUNNING, AppWrapperStatus.PENDING]
-    )
-    if print_to_console:
-        pretty_print.print_app_wrappers_status(app_wrappers)
-    return app_wrappers
-
-
-
-
-
-

Classes

-
-
-class Cluster -(config: ClusterConfiguration) -
-
-

An object for requesting, bringing up, and taking down resources. -Can also be used for seeing the resource cluster status and details.

-

Note that currently, the underlying implementation is a Ray cluster.

-

Create the resource cluster object by passing in a ClusterConfiguration -(defined in the config sub-module). An AppWrapper will then be generated -based off of the configured resources to represent the desired cluster -request.

-
- -Expand source code - -
class Cluster:
-    """
-    An object for requesting, bringing up, and taking down resources.
-    Can also be used for seeing the resource cluster status and details.
-
-    Note that currently, the underlying implementation is a Ray cluster.
-    """
-
-    def __init__(self, config: ClusterConfiguration):
-        """
-        Create the resource cluster object by passing in a ClusterConfiguration
-        (defined in the config sub-module). An AppWrapper will then be generated
-        based off of the configured resources to represent the desired cluster
-        request.
-        """
-        self.config = config
-        self.app_wrapper_yaml = self.create_app_wrapper()
-        self.app_wrapper_name = self.app_wrapper_yaml.split(".")[0]
-
-    def create_app_wrapper(self):
-        """
-        Called upon cluster object creation, creates an AppWrapper yaml based on
-        the specifications of the ClusterConfiguration.
-        """
-        name = self.config.name
-        namespace = self.config.namespace
-        min_cpu = self.config.min_cpus
-        max_cpu = self.config.max_cpus
-        min_memory = self.config.min_memory
-        max_memory = self.config.max_memory
-        gpu = self.config.gpu
-        workers = self.config.max_worker
-        template = self.config.template
-        image = self.config.image
-        instascale = self.config.instascale
-        instance_types = self.config.machine_types
-        env = self.config.envs
-        return generate_appwrapper(
-            name=name,
-            namespace=namespace,
-            min_cpu=min_cpu,
-            max_cpu=max_cpu,
-            min_memory=min_memory,
-            max_memory=max_memory,
-            gpu=gpu,
-            workers=workers,
-            template=template,
-            image=image,
-            instascale=instascale,
-            instance_types=instance_types,
-            env=env,
-        )
-
-    # creates a new cluster with the provided or default spec
-    def up(self):
-        """
-        Applies the AppWrapper yaml, pushing the resource request onto
-        the MCAD queue.
-        """
-        namespace = self.config.namespace
-        with oc.project(namespace):
-            oc.invoke("apply", ["-f", self.app_wrapper_yaml])
-
-    def down(self):
-        """
-        Deletes the AppWrapper yaml, scaling-down and deleting all resources
-        associated with the cluster.
-        """
-        namespace = self.config.namespace
-        with oc.project(namespace):
-            oc.invoke("delete", ["AppWrapper", self.app_wrapper_name])
-
-    def status(self, print_to_console: bool = True):
-        """
-        TO BE UPDATED: Will soon return (and print by default) the cluster's
-        status, from AppWrapper submission to setup completion. All resource
-        details will be moved to cluster.details().
-        """
-        cluster = _ray_cluster_status(self.config.name, self.config.namespace)
-        if cluster:
-            # overriding the number of gpus with requested
-            cluster.worker_gpu = self.config.gpu
-            if print_to_console:
-                pretty_print.print_clusters([cluster])
-            return cluster.status
-        else:
-            if print_to_console:
-                pretty_print.print_no_resources_found()
-            return None
-
-    def cluster_uri(self) -> str:
-        """
-        Returns a string containing the cluster's URI.
-        """
-        return f"ray://{self.config.name}-head-svc.{self.config.namespace}.svc:10001"
-
-    def cluster_dashboard_uri(self, namespace: str = "default") -> str:
-        """
-        Returns a string containing the cluster's dashboard URI.
-        """
-        try:
-            with oc.project(namespace):
-                route = oc.invoke(
-                    "get", ["route", "-o", "jsonpath='{$.items[0].spec.host}'"]
-                )
-                route = route.out().strip().strip("'")
-            return f"http://{route}"
-        except:
-            return "Dashboard route not available yet. Did you run cluster.up()?"
-
-    # checks whether the ray cluster is ready
-    def is_ready(self, print_to_console: bool = True):
-        """
-        TO BE DEPRECATED: functionality will be added into cluster.status().
-        """
-        ready = False
-        status = CodeFlareClusterStatus.UNKNOWN
-        # check the app wrapper status
-        appwrapper = _app_wrapper_status(self.config.name, self.config.namespace)
-        if appwrapper:
-            if appwrapper.status in [
-                AppWrapperStatus.RUNNING,
-                AppWrapperStatus.COMPLETED,
-                AppWrapperStatus.RUNNING_HOLD_COMPLETION,
-            ]:
-                ready = False
-                status = CodeFlareClusterStatus.QUEUED
-            elif appwrapper.status in [
-                AppWrapperStatus.FAILED,
-                AppWrapperStatus.DELETED,
-            ]:
-                ready = False
-                status = CodeFlareClusterStatus.FAILED  # should deleted be separate
-                return ready, status  # exit early, no need to check ray status
-            elif appwrapper.status in [AppWrapperStatus.PENDING]:
-                ready = False
-                status = CodeFlareClusterStatus.QUEUED
-                if print_to_console:
-                    pretty_print.print_app_wrappers_status([appwrapper])
-                return (
-                    ready,
-                    status,
-                )  # no need to check the ray status since still in queue
-
-        # check the ray cluster status
-        cluster = _ray_cluster_status(self.config.name, self.config.namespace)
-        if cluster:
-            if cluster.status == RayClusterStatus.READY:
-                ready = True
-                status = CodeFlareClusterStatus.READY
-            elif cluster.status in [
-                RayClusterStatus.UNHEALTHY,
-                RayClusterStatus.FAILED,
-            ]:
-                ready = False
-                status = CodeFlareClusterStatus.FAILED
-
-            if print_to_console:
-                # overriding the number of gpus with requested
-                cluster.worker_gpu = self.config.gpu
-                pretty_print.print_clusters([cluster])
-        return status, ready
-
-

Methods

-
-
-def cluster_dashboard_uri(self, namespace: str = 'default') ‑> str -
-
-

Returns a string containing the cluster's dashboard URI.

-
- -Expand source code - -
def cluster_dashboard_uri(self, namespace: str = "default") -> str:
-    """
-    Returns a string containing the cluster's dashboard URI.
-    """
-    try:
-        with oc.project(namespace):
-            route = oc.invoke(
-                "get", ["route", "-o", "jsonpath='{$.items[0].spec.host}'"]
-            )
-            route = route.out().strip().strip("'")
-        return f"http://{route}"
-    except:
-        return "Dashboard route not available yet. Did you run cluster.up()?"
-
-
-
-def cluster_uri(self) ‑> str -
-
-

Returns a string containing the cluster's URI.

-
- -Expand source code - -
def cluster_uri(self) -> str:
-    """
-    Returns a string containing the cluster's URI.
-    """
-    return f"ray://{self.config.name}-head-svc.{self.config.namespace}.svc:10001"
-
-
-
-def create_app_wrapper(self) -
-
-

Called upon cluster object creation, creates an AppWrapper yaml based on -the specifications of the ClusterConfiguration.

-
- -Expand source code - -
def create_app_wrapper(self):
-    """
-    Called upon cluster object creation, creates an AppWrapper yaml based on
-    the specifications of the ClusterConfiguration.
-    """
-    name = self.config.name
-    namespace = self.config.namespace
-    min_cpu = self.config.min_cpus
-    max_cpu = self.config.max_cpus
-    min_memory = self.config.min_memory
-    max_memory = self.config.max_memory
-    gpu = self.config.gpu
-    workers = self.config.max_worker
-    template = self.config.template
-    image = self.config.image
-    instascale = self.config.instascale
-    instance_types = self.config.machine_types
-    env = self.config.envs
-    return generate_appwrapper(
-        name=name,
-        namespace=namespace,
-        min_cpu=min_cpu,
-        max_cpu=max_cpu,
-        min_memory=min_memory,
-        max_memory=max_memory,
-        gpu=gpu,
-        workers=workers,
-        template=template,
-        image=image,
-        instascale=instascale,
-        instance_types=instance_types,
-        env=env,
-    )
-
-
-
-def down(self) -
-
-

Deletes the AppWrapper yaml, scaling-down and deleting all resources -associated with the cluster.

-
- -Expand source code - -
def down(self):
-    """
-    Deletes the AppWrapper yaml, scaling-down and deleting all resources
-    associated with the cluster.
-    """
-    namespace = self.config.namespace
-    with oc.project(namespace):
-        oc.invoke("delete", ["AppWrapper", self.app_wrapper_name])
-
-
-
-def is_ready(self, print_to_console: bool = True) -
-
-

TO BE DEPRECATED: functionality will be added into cluster.status().

-
- -Expand source code - -
def is_ready(self, print_to_console: bool = True):
-    """
-    TO BE DEPRECATED: functionality will be added into cluster.status().
-    """
-    ready = False
-    status = CodeFlareClusterStatus.UNKNOWN
-    # check the app wrapper status
-    appwrapper = _app_wrapper_status(self.config.name, self.config.namespace)
-    if appwrapper:
-        if appwrapper.status in [
-            AppWrapperStatus.RUNNING,
-            AppWrapperStatus.COMPLETED,
-            AppWrapperStatus.RUNNING_HOLD_COMPLETION,
-        ]:
-            ready = False
-            status = CodeFlareClusterStatus.QUEUED
-        elif appwrapper.status in [
-            AppWrapperStatus.FAILED,
-            AppWrapperStatus.DELETED,
-        ]:
-            ready = False
-            status = CodeFlareClusterStatus.FAILED  # should deleted be separate
-            return ready, status  # exit early, no need to check ray status
-        elif appwrapper.status in [AppWrapperStatus.PENDING]:
-            ready = False
-            status = CodeFlareClusterStatus.QUEUED
-            if print_to_console:
-                pretty_print.print_app_wrappers_status([appwrapper])
-            return (
-                ready,
-                status,
-            )  # no need to check the ray status since still in queue
-
-    # check the ray cluster status
-    cluster = _ray_cluster_status(self.config.name, self.config.namespace)
-    if cluster:
-        if cluster.status == RayClusterStatus.READY:
-            ready = True
-            status = CodeFlareClusterStatus.READY
-        elif cluster.status in [
-            RayClusterStatus.UNHEALTHY,
-            RayClusterStatus.FAILED,
-        ]:
-            ready = False
-            status = CodeFlareClusterStatus.FAILED
-
-        if print_to_console:
-            # overriding the number of gpus with requested
-            cluster.worker_gpu = self.config.gpu
-            pretty_print.print_clusters([cluster])
-    return status, ready
-
-
-
-def status(self, print_to_console: bool = True) -
-
-

TO BE UPDATED: Will soon return (and print by default) the cluster's -status, from AppWrapper submission to setup completion. All resource -details will be moved to cluster.details().

-
- -Expand source code - -
def status(self, print_to_console: bool = True):
-    """
-    TO BE UPDATED: Will soon return (and print by default) the cluster's
-    status, from AppWrapper submission to setup completion. All resource
-    details will be moved to cluster.details().
-    """
-    cluster = _ray_cluster_status(self.config.name, self.config.namespace)
-    if cluster:
-        # overriding the number of gpus with requested
-        cluster.worker_gpu = self.config.gpu
-        if print_to_console:
-            pretty_print.print_clusters([cluster])
-        return cluster.status
-    else:
-        if print_to_console:
-            pretty_print.print_no_resources_found()
-        return None
-
-
-
-def up(self) -
-
-

Applies the AppWrapper yaml, pushing the resource request onto -the MCAD queue.

-
- -Expand source code - -
def up(self):
-    """
-    Applies the AppWrapper yaml, pushing the resource request onto
-    the MCAD queue.
-    """
-    namespace = self.config.namespace
-    with oc.project(namespace):
-        oc.invoke("apply", ["-f", self.app_wrapper_yaml])
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/docs/codeflare_sdk/cluster/config.html b/docs/codeflare_sdk/cluster/config.html deleted file mode 100644 index 4ab10299d..000000000 --- a/docs/codeflare_sdk/cluster/config.html +++ /dev/null @@ -1,234 +0,0 @@ - - - - - - -codeflare_sdk.cluster.config API documentation - - - - - - - - - - - -
-
-
-

Module codeflare_sdk.cluster.config

-
-
-

The config sub-module contains the definition of the ClusterConfiguration dataclass, -which is used to specify resource requirements and other details when creating a -Cluster object.

-
- -Expand source code - -
# Copyright 2022 IBM, Red Hat
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-The config sub-module contains the definition of the ClusterConfiguration dataclass,
-which is used to specify resource requirements and other details when creating a
-Cluster object.
-"""
-
-from dataclasses import dataclass, field
-import pathlib
-
-dir = pathlib.Path(__file__).parent.parent.resolve()
-
-
-@dataclass
-class ClusterConfiguration:
-    """
-    This dataclass is used to specify resource requirements and other details, and
-    is passed in as an argument when creating a Cluster object.
-    """
-
-    name: str
-    namespace: str = "default"
-    head_info: list = field(default_factory=list)
-    machine_types: list = field(default_factory=list)  # ["m4.xlarge", "g4dn.xlarge"]
-    min_cpus: int = 1
-    max_cpus: int = 1
-    min_worker: int = 1
-    max_worker: int = 1
-    min_memory: int = 2
-    max_memory: int = 2
-    gpu: int = 0
-    template: str = f"{dir}/templates/new-template.yaml"
-    instascale: bool = False
-    envs: dict = field(default_factory=dict)
-    image: str = "ghcr.io/ibm-ai-foundation/base:ray1.13.0-py38-gpu-pytorch1.12.0cu116-20220826-202124"
-
-
-
-
-
-
-
-
-
-

Classes

-
-
-class ClusterConfiguration -(name: str, namespace: str = 'default', head_info: list = <factory>, machine_types: list = <factory>, min_cpus: int = 1, max_cpus: int = 1, min_worker: int = 1, max_worker: int = 1, min_memory: int = 2, max_memory: int = 2, gpu: int = 0, template: str = '/home/meyceoz/Documents/codeflare-sdk/src/codeflare_sdk/templates/new-template.yaml', instascale: bool = False, envs: dict = <factory>, image: str = 'ghcr.io/ibm-ai-foundation/base:ray1.13.0-py38-gpu-pytorch1.12.0cu116-20220826-202124') -
-
-

This dataclass is used to specify resource requirements and other details, and -is passed in as an argument when creating a Cluster object.

-
- -Expand source code - -
class ClusterConfiguration:
-    """
-    This dataclass is used to specify resource requirements and other details, and
-    is passed in as an argument when creating a Cluster object.
-    """
-
-    name: str
-    namespace: str = "default"
-    head_info: list = field(default_factory=list)
-    machine_types: list = field(default_factory=list)  # ["m4.xlarge", "g4dn.xlarge"]
-    min_cpus: int = 1
-    max_cpus: int = 1
-    min_worker: int = 1
-    max_worker: int = 1
-    min_memory: int = 2
-    max_memory: int = 2
-    gpu: int = 0
-    template: str = f"{dir}/templates/new-template.yaml"
-    instascale: bool = False
-    envs: dict = field(default_factory=dict)
-    image: str = "ghcr.io/ibm-ai-foundation/base:ray1.13.0-py38-gpu-pytorch1.12.0cu116-20220826-202124"
-
-

Class variables

-
-
var envs : dict
-
-
-
-
var gpu : int
-
-
-
-
var head_info : list
-
-
-
-
var image : str
-
-
-
-
var instascale : bool
-
-
-
-
var machine_types : list
-
-
-
-
var max_cpus : int
-
-
-
-
var max_memory : int
-
-
-
-
var max_worker : int
-
-
-
-
var min_cpus : int
-
-
-
-
var min_memory : int
-
-
-
-
var min_worker : int
-
-
-
-
var name : str
-
-
-
-
var namespace : str
-
-
-
-
var template : str
-
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/docs/codeflare_sdk/cluster/index.html b/docs/codeflare_sdk/cluster/index.html deleted file mode 100644 index 1684c93d8..000000000 --- a/docs/codeflare_sdk/cluster/index.html +++ /dev/null @@ -1,79 +0,0 @@ - - - - - - -codeflare_sdk.cluster API documentation - - - - - - - - - - - -
-
-
-

Module codeflare_sdk.cluster

-
-
-
-
-

Sub-modules

-
-
codeflare_sdk.cluster.cluster
-
-

The cluster sub-module contains the definition of the Cluster object, which represents -the resources requested by the user. It also contains functions …

-
-
codeflare_sdk.cluster.config
-
-

The config sub-module contains the definition of the ClusterConfiguration dataclass, -which is used to specify resource requirements and other details …

-
-
codeflare_sdk.cluster.model
-
-

The model sub-module defines Enums containing information for Ray cluster -states and AppWrapper states, and CodeFlare cluster states, as well as -…

-
-
-
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/docs/codeflare_sdk/cluster/model.html b/docs/codeflare_sdk/cluster/model.html deleted file mode 100644 index 42cefda09..000000000 --- a/docs/codeflare_sdk/cluster/model.html +++ /dev/null @@ -1,464 +0,0 @@ - - - - - - -codeflare_sdk.cluster.model API documentation - - - - - - - - - - - -
-
-
-

Module codeflare_sdk.cluster.model

-
-
-

The model sub-module defines Enums containing information for Ray cluster -states and AppWrapper states, and CodeFlare cluster states, as well as -dataclasses to store information for Ray clusters and AppWrappers.

-
- -Expand source code - -
# Copyright 2022 IBM, Red Hat
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-The model sub-module defines Enums containing information for Ray cluster
-states and AppWrapper states, and CodeFlare cluster states, as well as
-dataclasses to store information for Ray clusters and AppWrappers.
-"""
-
-from dataclasses import dataclass
-from enum import Enum
-
-
-class RayClusterStatus(Enum):
-    """
-    Defines the possible reportable states of a Ray cluster.
-    """
-
-    # https://github.com/ray-project/kuberay/blob/master/ray-operator/apis/ray/v1alpha1/raycluster_types.go#L95
-    READY = "ready"
-    UNHEALTHY = "unhealthy"
-    FAILED = "failed"
-    UNKNOWN = "unknown"
-
-
-class AppWrapperStatus(Enum):
-    """
-    Defines the possible reportable states of an AppWrapper.
-    """
-
-    PENDING = "pending"
-    RUNNING = "running"
-    FAILED = "failed"
-    DELETED = "deleted"
-    COMPLETED = "completed"
-    RUNNING_HOLD_COMPLETION = "runningholdcompletion"
-
-
-class CodeFlareClusterStatus(Enum):
-    """
-    Defines the possible reportable states of a Codeflare cluster.
-    """
-
-    READY = 1
-    QUEUED = 2
-    FAILED = 3
-    UNKNOWN = 4
-
-
-@dataclass
-class RayCluster:
-    """
-    For storing information about a Ray cluster.
-    """
-
-    name: str
-    status: RayClusterStatus
-    min_workers: int
-    max_workers: int
-    worker_mem_min: str
-    worker_mem_max: str
-    worker_cpu: int
-    worker_gpu: int
-    namespace: str
-    dashboard: str
-
-
-@dataclass
-class AppWrapper:
-    """
-    For storing information about an AppWrapper.
-    """
-
-    name: str
-    status: AppWrapperStatus
-    can_run: bool
-    job_state: str
-
-
-
-
-
-
-
-
-
-

Classes

-
-
-class AppWrapper -(name: str, status: AppWrapperStatus, can_run: bool, job_state: str) -
-
-

For storing information about an AppWrapper.

-
- -Expand source code - -
class AppWrapper:
-    """
-    For storing information about an AppWrapper.
-    """
-
-    name: str
-    status: AppWrapperStatus
-    can_run: bool
-    job_state: str
-
-

Class variables

-
-
var can_run : bool
-
-
-
-
var job_state : str
-
-
-
-
var name : str
-
-
-
-
var statusAppWrapperStatus
-
-
-
-
-
-
-class AppWrapperStatus -(value, names=None, *, module=None, qualname=None, type=None, start=1) -
-
-

Defines the possible reportable states of an AppWrapper.

-
- -Expand source code - -
class AppWrapperStatus(Enum):
-    """
-    Defines the possible reportable states of an AppWrapper.
-    """
-
-    PENDING = "pending"
-    RUNNING = "running"
-    FAILED = "failed"
-    DELETED = "deleted"
-    COMPLETED = "completed"
-    RUNNING_HOLD_COMPLETION = "runningholdcompletion"
-
-

Ancestors

-
    -
  • enum.Enum
  • -
-

Class variables

-
-
var COMPLETED
-
-
-
-
var DELETED
-
-
-
-
var FAILED
-
-
-
-
var PENDING
-
-
-
-
var RUNNING
-
-
-
-
var RUNNING_HOLD_COMPLETION
-
-
-
-
-
-
-class CodeFlareClusterStatus -(value, names=None, *, module=None, qualname=None, type=None, start=1) -
-
-

Defines the possible reportable states of a Codeflare cluster.

-
- -Expand source code - -
class CodeFlareClusterStatus(Enum):
-    """
-    Defines the possible reportable states of a Codeflare cluster.
-    """
-
-    READY = 1
-    QUEUED = 2
-    FAILED = 3
-    UNKNOWN = 4
-
-

Ancestors

-
    -
  • enum.Enum
  • -
-

Class variables

-
-
var FAILED
-
-
-
-
var QUEUED
-
-
-
-
var READY
-
-
-
-
var UNKNOWN
-
-
-
-
-
-
-class RayCluster -(name: str, status: RayClusterStatus, min_workers: int, max_workers: int, worker_mem_min: str, worker_mem_max: str, worker_cpu: int, worker_gpu: int, namespace: str, dashboard: str) -
-
-

For storing information about a Ray cluster.

-
- -Expand source code - -
class RayCluster:
-    """
-    For storing information about a Ray cluster.
-    """
-
-    name: str
-    status: RayClusterStatus
-    min_workers: int
-    max_workers: int
-    worker_mem_min: str
-    worker_mem_max: str
-    worker_cpu: int
-    worker_gpu: int
-    namespace: str
-    dashboard: str
-
-

Class variables

-
-
var dashboard : str
-
-
-
-
var max_workers : int
-
-
-
-
var min_workers : int
-
-
-
-
var name : str
-
-
-
-
var namespace : str
-
-
-
-
var statusRayClusterStatus
-
-
-
-
var worker_cpu : int
-
-
-
-
var worker_gpu : int
-
-
-
-
var worker_mem_max : str
-
-
-
-
var worker_mem_min : str
-
-
-
-
-
-
-class RayClusterStatus -(value, names=None, *, module=None, qualname=None, type=None, start=1) -
-
-

Defines the possible reportable states of a Ray cluster.

-
- -Expand source code - -
class RayClusterStatus(Enum):
-    """
-    Defines the possible reportable states of a Ray cluster.
-    """
-
-    # https://github.com/ray-project/kuberay/blob/master/ray-operator/apis/ray/v1alpha1/raycluster_types.go#L95
-    READY = "ready"
-    UNHEALTHY = "unhealthy"
-    FAILED = "failed"
-    UNKNOWN = "unknown"
-
-

Ancestors

-
    -
  • enum.Enum
  • -
-

Class variables

-
-
var FAILED
-
-
-
-
var READY
-
-
-
-
var UNHEALTHY
-
-
-
-
var UNKNOWN
-
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/docs/codeflare_sdk/index.html b/docs/codeflare_sdk/index.html deleted file mode 100644 index 2b2aa84fb..000000000 --- a/docs/codeflare_sdk/index.html +++ /dev/null @@ -1,65 +0,0 @@ - - - - - - -codeflare_sdk API documentation - - - - - - - - - - - -
- - -
- - - \ No newline at end of file diff --git a/docs/codeflare_sdk/utils/generate_yaml.html b/docs/codeflare_sdk/utils/generate_yaml.html deleted file mode 100644 index b2189e519..000000000 --- a/docs/codeflare_sdk/utils/generate_yaml.html +++ /dev/null @@ -1,885 +0,0 @@ - - - - - - -codeflare_sdk.utils.generate_yaml API documentation - - - - - - - - - - - -
-
-
-

Module codeflare_sdk.utils.generate_yaml

-
-
-

This sub-module exists primarily to be used internally by the Cluster object -(in the cluster sub-module) for AppWrapper generation.

-
- -Expand source code - -
# Copyright 2022 IBM, Red Hat
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-This sub-module exists primarily to be used internally by the Cluster object
-(in the cluster sub-module) for AppWrapper generation.
-"""
-
-import yaml
-import sys
-import argparse
-import uuid
-
-
-def read_template(template):
-    with open(template, "r") as stream:
-        try:
-            return yaml.safe_load(stream)
-        except yaml.YAMLError as exc:
-            print(exc)
-
-
-def gen_names(name):
-    if not name:
-        gen_id = str(uuid.uuid4())
-        appwrapper_name = "appwrapper-" + gen_id
-        cluster_name = "cluster-" + gen_id
-        return appwrapper_name, cluster_name
-    else:
-        return name, name
-
-
-def update_dashboard_route(route_item, cluster_name, namespace):
-    metadata = route_item.get("generictemplate", {}).get("metadata")
-    metadata["name"] = f"ray-dashboard-{cluster_name}"
-    metadata["namespace"] = namespace
-    metadata["labels"]["odh-ray-cluster-service"] = f"{cluster_name}-head-svc"
-    spec = route_item.get("generictemplate", {}).get("spec")
-    spec["to"]["name"] = f"{cluster_name}-head-svc"
-
-
-def update_names(yaml, item, appwrapper_name, cluster_name, namespace):
-    metadata = yaml.get("metadata")
-    metadata["name"] = appwrapper_name
-    metadata["namespace"] = namespace
-    lower_meta = item.get("generictemplate", {}).get("metadata")
-    lower_meta["labels"]["appwrapper.mcad.ibm.com"] = appwrapper_name
-    lower_meta["name"] = cluster_name
-    lower_meta["namespace"] = namespace
-
-
-def update_labels(yaml, instascale, instance_types):
-    metadata = yaml.get("metadata")
-    if instascale:
-        if not len(instance_types) > 0:
-            sys.exit(
-                "If instascale is set to true, must provide at least one instance type"
-            )
-        type_str = ""
-        for type in instance_types:
-            type_str += type + "_"
-        type_str = type_str[:-1]
-        metadata["labels"]["orderedinstance"] = type_str
-    else:
-        metadata.pop("labels")
-
-
-def update_custompodresources(
-    item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
-):
-    if "custompodresources" in item.keys():
-        custompodresources = item.get("custompodresources")
-        for i in range(len(custompodresources)):
-            if i == 0:
-                # Leave head node resources as template default
-                continue
-            resource = custompodresources[i]
-            for k, v in resource.items():
-                if k == "replicas" and i == 1:
-                    resource[k] = workers
-                if k == "requests" or k == "limits":
-                    for spec, _ in v.items():
-                        if spec == "cpu":
-                            if k == "limits":
-                                resource[k][spec] = max_cpu
-                            else:
-                                resource[k][spec] = min_cpu
-                        if spec == "memory":
-                            if k == "limits":
-                                resource[k][spec] = str(max_memory) + "G"
-                            else:
-                                resource[k][spec] = str(min_memory) + "G"
-                        if spec == "nvidia.com/gpu":
-                            if i == 0:
-                                resource[k][spec] = 0
-                            else:
-                                resource[k][spec] = gpu
-    else:
-        sys.exit("Error: malformed template")
-
-
-def update_affinity(spec, appwrapper_name, instascale):
-    if instascale:
-        node_selector_terms = (
-            spec.get("affinity")
-            .get("nodeAffinity")
-            .get("requiredDuringSchedulingIgnoredDuringExecution")
-            .get("nodeSelectorTerms")
-        )
-        node_selector_terms[0]["matchExpressions"][0]["values"][0] = appwrapper_name
-        node_selector_terms[0]["matchExpressions"][0]["key"] = appwrapper_name
-    else:
-        spec.pop("affinity")
-
-
-def update_image(spec, image):
-    containers = spec.get("containers")
-    for container in containers:
-        container["image"] = image
-
-
-def update_env(spec, env):
-    containers = spec.get("containers")
-    for container in containers:
-        if env:
-            if "env" in container:
-                container["env"].extend(env)
-            else:
-                container["env"] = env
-
-
-def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu):
-    container = spec.get("containers")
-    for resource in container:
-        requests = resource.get("resources").get("requests")
-        if requests is not None:
-            requests["cpu"] = min_cpu
-            requests["memory"] = str(min_memory) + "G"
-            requests["nvidia.com/gpu"] = gpu
-        limits = resource.get("resources").get("limits")
-        if limits is not None:
-            limits["cpu"] = max_cpu
-            limits["memory"] = str(max_memory) + "G"
-            limits["nvidia.com/gpu"] = gpu
-
-
-def update_nodes(
-    item,
-    appwrapper_name,
-    min_cpu,
-    max_cpu,
-    min_memory,
-    max_memory,
-    gpu,
-    workers,
-    image,
-    instascale,
-    env,
-):
-    if "generictemplate" in item.keys():
-        head = item.get("generictemplate").get("spec").get("headGroupSpec")
-        worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0]
-
-        # Head counts as first worker
-        worker["replicas"] = workers
-        worker["minReplicas"] = workers
-        worker["maxReplicas"] = workers
-        worker["groupName"] = "small-group-" + appwrapper_name
-        worker["rayStartParams"]["num-gpus"] = str(int(gpu))
-
-        for comp in [head, worker]:
-            spec = comp.get("template").get("spec")
-            update_affinity(spec, appwrapper_name, instascale)
-            update_image(spec, image)
-            update_env(spec, env)
-            if comp == head:
-                update_resources(spec, 2, 2, 8, 8, 0)
-            else:
-                update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu)
-
-
-def write_user_appwrapper(user_yaml, output_file_name):
-    with open(output_file_name, "w") as outfile:
-        yaml.dump(user_yaml, outfile, default_flow_style=False)
-    print(f"Written to: {output_file_name}")
-
-
-def generate_appwrapper(
-    name: str,
-    namespace: str,
-    min_cpu: int,
-    max_cpu: int,
-    min_memory: int,
-    max_memory: int,
-    gpu: int,
-    workers: int,
-    template: str,
-    image: str,
-    instascale: bool,
-    instance_types: list,
-    env,
-):
-    user_yaml = read_template(template)
-    appwrapper_name, cluster_name = gen_names(name)
-    resources = user_yaml.get("spec", "resources")
-    item = resources["resources"].get("GenericItems")[0]
-    route_item = resources["resources"].get("GenericItems")[1]
-    update_names(user_yaml, item, appwrapper_name, cluster_name, namespace)
-    update_labels(user_yaml, instascale, instance_types)
-    update_custompodresources(
-        item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
-    )
-    update_nodes(
-        item,
-        appwrapper_name,
-        min_cpu,
-        max_cpu,
-        min_memory,
-        max_memory,
-        gpu,
-        workers,
-        image,
-        instascale,
-        env,
-    )
-    update_dashboard_route(route_item, cluster_name, namespace)
-    outfile = appwrapper_name + ".yaml"
-    write_user_appwrapper(user_yaml, outfile)
-    return outfile
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Generate user AppWrapper")
-    parser.add_argument(
-        "--name",
-        required=False,
-        default="",
-        help="User selected name for AppWrapper and Ray Cluster (auto-generated if not provided)",
-    )
-    parser.add_argument(
-        "--min-cpu",
-        type=int,
-        required=True,
-        help="min number of CPU(s) in a worker required for running job",
-    )
-    parser.add_argument(
-        "--max-cpu",
-        type=int,
-        required=True,
-        help="max number of CPU(s) in a worker required for running job",
-    )
-    parser.add_argument(
-        "--min-memory",
-        type=int,
-        required=True,
-        help="min RAM required in a worker for running job, in GB",
-    )
-    parser.add_argument(
-        "--max-memory",
-        type=int,
-        required=True,
-        help="max RAM required in a worker for running job, in GB",
-    )
-    parser.add_argument(
-        "--gpu",
-        type=int,
-        required=True,
-        help="GPU(s) required in a worker for running job",
-    )
-    parser.add_argument(
-        "--workers",
-        type=int,
-        required=True,
-        help="How many workers are required in the cluster",
-    )
-    parser.add_argument(
-        "--template", required=True, help="Template AppWrapper yaml file"
-    )
-    parser.add_argument(
-        "--image",
-        required=False,
-        default="rayproject/ray:latest",
-        help="Ray image to be used (defaults to rayproject/ray:latest)",
-    )
-    parser.add_argument(
-        "--instascale",
-        default=False,
-        required=False,
-        action="store_true",
-        help="Indicates that instascale is installed on the cluster",
-    )
-    parser.add_argument(
-        "--instance-types",
-        type=str,
-        nargs="+",
-        default=[],
-        required=False,
-        help="Head,worker instance types (space separated)",
-    )
-    parser.add_argument(
-        "--namespace",
-        required=False,
-        default="default",
-        help="Set the kubernetes namespace you want to deploy your cluster to. Default. If left blank, uses the 'default' namespace",
-    )
-
-    args = parser.parse_args()
-    name = args.name
-    min_cpu = args.min_cpu
-    max_cpu = args.max_cpu
-    min_memory = args.min_memory
-    max_memory = args.max_memory
-    gpu = args.gpu
-    workers = args.workers
-    template = args.template
-    image = args.image
-    instascale = args.instascale
-    instance_types = args.instance_types
-    namespace = args.namespace
-    env = {}
-
-    outfile = generate_appwrapper(
-        name,
-        namespace,
-        min_cpu,
-        max_cpu,
-        min_memory,
-        max_memory,
-        gpu,
-        workers,
-        template,
-        image,
-        instascale,
-        instance_types,
-        env,
-    )
-    return outfile
-
-
-if __name__ == "__main__":
-    main()
-
-
-
-
-
-
-
-

Functions

-
-
-def gen_names(name) -
-
-
-
- -Expand source code - -
def gen_names(name):
-    if not name:
-        gen_id = str(uuid.uuid4())
-        appwrapper_name = "appwrapper-" + gen_id
-        cluster_name = "cluster-" + gen_id
-        return appwrapper_name, cluster_name
-    else:
-        return name, name
-
-
-
-def generate_appwrapper(name: str, namespace: str, min_cpu: int, max_cpu: int, min_memory: int, max_memory: int, gpu: int, workers: int, template: str, image: str, instascale: bool, instance_types: list, env) -
-
-
-
- -Expand source code - -
def generate_appwrapper(
-    name: str,
-    namespace: str,
-    min_cpu: int,
-    max_cpu: int,
-    min_memory: int,
-    max_memory: int,
-    gpu: int,
-    workers: int,
-    template: str,
-    image: str,
-    instascale: bool,
-    instance_types: list,
-    env,
-):
-    user_yaml = read_template(template)
-    appwrapper_name, cluster_name = gen_names(name)
-    resources = user_yaml.get("spec", "resources")
-    item = resources["resources"].get("GenericItems")[0]
-    route_item = resources["resources"].get("GenericItems")[1]
-    update_names(user_yaml, item, appwrapper_name, cluster_name, namespace)
-    update_labels(user_yaml, instascale, instance_types)
-    update_custompodresources(
-        item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
-    )
-    update_nodes(
-        item,
-        appwrapper_name,
-        min_cpu,
-        max_cpu,
-        min_memory,
-        max_memory,
-        gpu,
-        workers,
-        image,
-        instascale,
-        env,
-    )
-    update_dashboard_route(route_item, cluster_name, namespace)
-    outfile = appwrapper_name + ".yaml"
-    write_user_appwrapper(user_yaml, outfile)
-    return outfile
-
-
-
-def main() -
-
-
-
- -Expand source code - -
def main():
-    parser = argparse.ArgumentParser(description="Generate user AppWrapper")
-    parser.add_argument(
-        "--name",
-        required=False,
-        default="",
-        help="User selected name for AppWrapper and Ray Cluster (auto-generated if not provided)",
-    )
-    parser.add_argument(
-        "--min-cpu",
-        type=int,
-        required=True,
-        help="min number of CPU(s) in a worker required for running job",
-    )
-    parser.add_argument(
-        "--max-cpu",
-        type=int,
-        required=True,
-        help="max number of CPU(s) in a worker required for running job",
-    )
-    parser.add_argument(
-        "--min-memory",
-        type=int,
-        required=True,
-        help="min RAM required in a worker for running job, in GB",
-    )
-    parser.add_argument(
-        "--max-memory",
-        type=int,
-        required=True,
-        help="max RAM required in a worker for running job, in GB",
-    )
-    parser.add_argument(
-        "--gpu",
-        type=int,
-        required=True,
-        help="GPU(s) required in a worker for running job",
-    )
-    parser.add_argument(
-        "--workers",
-        type=int,
-        required=True,
-        help="How many workers are required in the cluster",
-    )
-    parser.add_argument(
-        "--template", required=True, help="Template AppWrapper yaml file"
-    )
-    parser.add_argument(
-        "--image",
-        required=False,
-        default="rayproject/ray:latest",
-        help="Ray image to be used (defaults to rayproject/ray:latest)",
-    )
-    parser.add_argument(
-        "--instascale",
-        default=False,
-        required=False,
-        action="store_true",
-        help="Indicates that instascale is installed on the cluster",
-    )
-    parser.add_argument(
-        "--instance-types",
-        type=str,
-        nargs="+",
-        default=[],
-        required=False,
-        help="Head,worker instance types (space separated)",
-    )
-    parser.add_argument(
-        "--namespace",
-        required=False,
-        default="default",
-        help="Set the kubernetes namespace you want to deploy your cluster to. Default. If left blank, uses the 'default' namespace",
-    )
-
-    args = parser.parse_args()
-    name = args.name
-    min_cpu = args.min_cpu
-    max_cpu = args.max_cpu
-    min_memory = args.min_memory
-    max_memory = args.max_memory
-    gpu = args.gpu
-    workers = args.workers
-    template = args.template
-    image = args.image
-    instascale = args.instascale
-    instance_types = args.instance_types
-    namespace = args.namespace
-    env = {}
-
-    outfile = generate_appwrapper(
-        name,
-        namespace,
-        min_cpu,
-        max_cpu,
-        min_memory,
-        max_memory,
-        gpu,
-        workers,
-        template,
-        image,
-        instascale,
-        instance_types,
-        env,
-    )
-    return outfile
-
-
-
-def read_template(template) -
-
-
-
- -Expand source code - -
def read_template(template):
-    with open(template, "r") as stream:
-        try:
-            return yaml.safe_load(stream)
-        except yaml.YAMLError as exc:
-            print(exc)
-
-
-
-def update_affinity(spec, appwrapper_name, instascale) -
-
-
-
- -Expand source code - -
def update_affinity(spec, appwrapper_name, instascale):
-    if instascale:
-        node_selector_terms = (
-            spec.get("affinity")
-            .get("nodeAffinity")
-            .get("requiredDuringSchedulingIgnoredDuringExecution")
-            .get("nodeSelectorTerms")
-        )
-        node_selector_terms[0]["matchExpressions"][0]["values"][0] = appwrapper_name
-        node_selector_terms[0]["matchExpressions"][0]["key"] = appwrapper_name
-    else:
-        spec.pop("affinity")
-
-
-
-def update_custompodresources(item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers) -
-
-
-
- -Expand source code - -
def update_custompodresources(
-    item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
-):
-    if "custompodresources" in item.keys():
-        custompodresources = item.get("custompodresources")
-        for i in range(len(custompodresources)):
-            if i == 0:
-                # Leave head node resources as template default
-                continue
-            resource = custompodresources[i]
-            for k, v in resource.items():
-                if k == "replicas" and i == 1:
-                    resource[k] = workers
-                if k == "requests" or k == "limits":
-                    for spec, _ in v.items():
-                        if spec == "cpu":
-                            if k == "limits":
-                                resource[k][spec] = max_cpu
-                            else:
-                                resource[k][spec] = min_cpu
-                        if spec == "memory":
-                            if k == "limits":
-                                resource[k][spec] = str(max_memory) + "G"
-                            else:
-                                resource[k][spec] = str(min_memory) + "G"
-                        if spec == "nvidia.com/gpu":
-                            if i == 0:
-                                resource[k][spec] = 0
-                            else:
-                                resource[k][spec] = gpu
-    else:
-        sys.exit("Error: malformed template")
-
-
-
-def update_dashboard_route(route_item, cluster_name, namespace) -
-
-
-
- -Expand source code - -
def update_dashboard_route(route_item, cluster_name, namespace):
-    metadata = route_item.get("generictemplate", {}).get("metadata")
-    metadata["name"] = f"ray-dashboard-{cluster_name}"
-    metadata["namespace"] = namespace
-    metadata["labels"]["odh-ray-cluster-service"] = f"{cluster_name}-head-svc"
-    spec = route_item.get("generictemplate", {}).get("spec")
-    spec["to"]["name"] = f"{cluster_name}-head-svc"
-
-
-
-def update_env(spec, env) -
-
-
-
- -Expand source code - -
def update_env(spec, env):
-    containers = spec.get("containers")
-    for container in containers:
-        if env:
-            if "env" in container:
-                container["env"].extend(env)
-            else:
-                container["env"] = env
-
-
-
-def update_image(spec, image) -
-
-
-
- -Expand source code - -
def update_image(spec, image):
-    containers = spec.get("containers")
-    for container in containers:
-        container["image"] = image
-
-
-
-def update_labels(yaml, instascale, instance_types) -
-
-
-
- -Expand source code - -
def update_labels(yaml, instascale, instance_types):
-    metadata = yaml.get("metadata")
-    if instascale:
-        if not len(instance_types) > 0:
-            sys.exit(
-                "If instascale is set to true, must provide at least one instance type"
-            )
-        type_str = ""
-        for type in instance_types:
-            type_str += type + "_"
-        type_str = type_str[:-1]
-        metadata["labels"]["orderedinstance"] = type_str
-    else:
-        metadata.pop("labels")
-
-
-
-def update_names(yaml, item, appwrapper_name, cluster_name, namespace) -
-
-
-
- -Expand source code - -
def update_names(yaml, item, appwrapper_name, cluster_name, namespace):
-    metadata = yaml.get("metadata")
-    metadata["name"] = appwrapper_name
-    metadata["namespace"] = namespace
-    lower_meta = item.get("generictemplate", {}).get("metadata")
-    lower_meta["labels"]["appwrapper.mcad.ibm.com"] = appwrapper_name
-    lower_meta["name"] = cluster_name
-    lower_meta["namespace"] = namespace
-
-
-
-def update_nodes(item, appwrapper_name, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, image, instascale, env) -
-
-
-
- -Expand source code - -
def update_nodes(
-    item,
-    appwrapper_name,
-    min_cpu,
-    max_cpu,
-    min_memory,
-    max_memory,
-    gpu,
-    workers,
-    image,
-    instascale,
-    env,
-):
-    if "generictemplate" in item.keys():
-        head = item.get("generictemplate").get("spec").get("headGroupSpec")
-        worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0]
-
-        # Head counts as first worker
-        worker["replicas"] = workers
-        worker["minReplicas"] = workers
-        worker["maxReplicas"] = workers
-        worker["groupName"] = "small-group-" + appwrapper_name
-        worker["rayStartParams"]["num-gpus"] = str(int(gpu))
-
-        for comp in [head, worker]:
-            spec = comp.get("template").get("spec")
-            update_affinity(spec, appwrapper_name, instascale)
-            update_image(spec, image)
-            update_env(spec, env)
-            if comp == head:
-                update_resources(spec, 2, 2, 8, 8, 0)
-            else:
-                update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu)
-
-
-
-def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu) -
-
-
-
- -Expand source code - -
def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu):
-    container = spec.get("containers")
-    for resource in container:
-        requests = resource.get("resources").get("requests")
-        if requests is not None:
-            requests["cpu"] = min_cpu
-            requests["memory"] = str(min_memory) + "G"
-            requests["nvidia.com/gpu"] = gpu
-        limits = resource.get("resources").get("limits")
-        if limits is not None:
-            limits["cpu"] = max_cpu
-            limits["memory"] = str(max_memory) + "G"
-            limits["nvidia.com/gpu"] = gpu
-
-
-
-def write_user_appwrapper(user_yaml, output_file_name) -
-
-
-
- -Expand source code - -
def write_user_appwrapper(user_yaml, output_file_name):
-    with open(output_file_name, "w") as outfile:
-        yaml.dump(user_yaml, outfile, default_flow_style=False)
-    print(f"Written to: {output_file_name}")
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/docs/codeflare_sdk/utils/index.html b/docs/codeflare_sdk/utils/index.html deleted file mode 100644 index c764c8b16..000000000 --- a/docs/codeflare_sdk/utils/index.html +++ /dev/null @@ -1,72 +0,0 @@ - - - - - - -codeflare_sdk.utils API documentation - - - - - - - - - - - -
-
-
-

Module codeflare_sdk.utils

-
-
-
-
-

Sub-modules

-
-
codeflare_sdk.utils.generate_yaml
-
-

This sub-module exists primarily to be used internally by the Cluster object -(in the cluster sub-module) for AppWrapper generation.

-
-
codeflare_sdk.utils.pretty_print
-
-

This sub-module exists primarily to be used internally by the Cluster object -(in the cluster sub-module) for pretty-printing cluster status and details.

-
-
-
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/docs/codeflare_sdk/utils/pretty_print.html b/docs/codeflare_sdk/utils/pretty_print.html deleted file mode 100644 index 013d6e248..000000000 --- a/docs/codeflare_sdk/utils/pretty_print.html +++ /dev/null @@ -1,351 +0,0 @@ - - - - - - -codeflare_sdk.utils.pretty_print API documentation - - - - - - - - - - - -
-
-
-

Module codeflare_sdk.utils.pretty_print

-
-
-

This sub-module exists primarily to be used internally by the Cluster object -(in the cluster sub-module) for pretty-printing cluster status and details.

-
- -Expand source code - -
# Copyright 2022 IBM, Red Hat
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-This sub-module exists primarily to be used internally by the Cluster object
-(in the cluster sub-module) for pretty-printing cluster status and details.
-"""
-
-from rich import print
-from rich.table import Table
-from rich.console import Console
-from rich.layout import Layout
-from rich.panel import Panel
-from rich import box
-from typing import List
-from ..cluster.model import RayCluster, AppWrapper, RayClusterStatus
-
-
-def print_no_resources_found():
-    console = Console()
-    console.print(Panel("[red]No resources found"))
-
-
-def print_app_wrappers_status(app_wrappers: List[AppWrapper]):
-    if not app_wrappers:
-        print_no_resources_found()
-        return  # shortcircuit
-
-    console = Console()
-    for app_wrapper in app_wrappers:
-        name = app_wrapper.name
-        status = app_wrapper.status.value
-
-        table = Table(
-            box=box.ASCII_DOUBLE_HEAD,
-            title="[bold] :rocket: List of CodeFlare clusters in queue:rocket:",
-        )
-        table.add_column("Name", style="cyan", no_wrap=True)
-        table.add_column("Status", style="magenta")
-        table.add_row(name, status)
-        table.add_row("")  # empty row for spacing
-        console.print(Panel.fit(table))
-
-
-def print_clusters(clusters: List[RayCluster], verbose=True):
-    if not clusters:
-        print_no_resources_found()
-        return  # shortcircuit
-
-    console = Console()
-    title_printed = False
-
-    for cluster in clusters:
-        status = (
-            "Active :white_heavy_check_mark:"
-            if cluster.status == RayClusterStatus.READY
-            else "InActive :x:"
-        )
-        name = cluster.name
-        dashboard = cluster.dashboard
-        mincount = str(cluster.min_workers)
-        maxcount = str(cluster.max_workers)
-        memory = cluster.worker_mem_min + "~" + cluster.worker_mem_max
-        cpu = str(cluster.worker_cpu)
-        gpu = str(cluster.worker_gpu)
-        # owned = bool(cluster["userOwned"])
-        owned = True
-
-        #'table0' to display the cluster name, status, url, and dashboard link
-        table0 = Table(box=None, show_header=False)
-        if owned:
-            table0.add_row("[white on green][bold]Owner")
-        else:
-            table0.add_row("")
-        table0.add_row("[bold underline]" + name, status)
-        table0.add_row()
-        # fixme harcded to default for now
-        table0.add_row(
-            f"[bold]URI:[/bold] ray://{cluster.name}-head-svc.{cluster.namespace}.svc:10001"
-        )  # format that is used to generate the name of the service
-        table0.add_row()
-        table0.add_row(f"[link={dashboard} blue underline]Dashboard:link:[/link]")
-        table0.add_row("")  # empty row for spacing
-
-        #'table1' to display the worker counts
-        table1 = Table(box=None)
-        table1.add_row()
-        table1.add_column("Min", style="cyan", no_wrap=True)
-        table1.add_column("Max", style="magenta")
-        table1.add_row()
-        table1.add_row(mincount, maxcount)
-        table1.add_row()
-
-        #'table2' to display the worker resources
-        table2 = Table(box=None)
-        table2.add_column("Memory", style="cyan", no_wrap=True, min_width=10)
-        table2.add_column("CPU", style="magenta", min_width=10)
-        table2.add_column("GPU", style="magenta", min_width=10)
-        table2.add_row()
-        table2.add_row(memory, cpu, gpu)
-        table2.add_row()
-
-        # panels to encompass table1 and table2 into separate cards
-        panel_1 = Panel.fit(table1, title="Workers")
-        panel_2 = Panel.fit(table2, title="Worker specs(each)")
-
-        # table3 to display panel_1 and panel_2 side-by-side in a single row
-        table3 = Table(box=None, show_header=False, title="Cluster Resources")
-        table3.add_row(panel_1, panel_2)
-
-        # table4 to display table0 and table3, one below the other
-        table4 = Table(box=None, show_header=False)
-        table4.add_row(table0)
-        table4.add_row(table3)
-
-        # Encompass all details of the cluster in a single panel
-        if not title_printed:
-            # If first cluster in the list, then create a table with title "Codeflare clusters".
-            # This is done to ensure the title is center aligned on the cluster display tables, rather
-            # than being center aligned on the console/terminal if we simply use console.print(title)
-
-            table5 = Table(
-                box=None, title="[bold] :rocket: List of CodeFlare clusters :rocket:"
-            )
-            table5.add_row(Panel.fit(table4))
-            console.print(table5)
-            title_printed = True
-        else:
-            console.print(Panel.fit(table4))
-
-
-
-
-
-
-
-

Functions

-
-
-def print_app_wrappers_status(app_wrappers: List[AppWrapper]) -
-
-
-
- -Expand source code - -
def print_app_wrappers_status(app_wrappers: List[AppWrapper]):
-    if not app_wrappers:
-        print_no_resources_found()
-        return  # shortcircuit
-
-    console = Console()
-    for app_wrapper in app_wrappers:
-        name = app_wrapper.name
-        status = app_wrapper.status.value
-
-        table = Table(
-            box=box.ASCII_DOUBLE_HEAD,
-            title="[bold] :rocket: List of CodeFlare clusters in queue:rocket:",
-        )
-        table.add_column("Name", style="cyan", no_wrap=True)
-        table.add_column("Status", style="magenta")
-        table.add_row(name, status)
-        table.add_row("")  # empty row for spacing
-        console.print(Panel.fit(table))
-
-
-
-def print_clusters(clusters: List[RayCluster], verbose=True) -
-
-
-
- -Expand source code - -
def print_clusters(clusters: List[RayCluster], verbose=True):
-    if not clusters:
-        print_no_resources_found()
-        return  # shortcircuit
-
-    console = Console()
-    title_printed = False
-
-    for cluster in clusters:
-        status = (
-            "Active :white_heavy_check_mark:"
-            if cluster.status == RayClusterStatus.READY
-            else "InActive :x:"
-        )
-        name = cluster.name
-        dashboard = cluster.dashboard
-        mincount = str(cluster.min_workers)
-        maxcount = str(cluster.max_workers)
-        memory = cluster.worker_mem_min + "~" + cluster.worker_mem_max
-        cpu = str(cluster.worker_cpu)
-        gpu = str(cluster.worker_gpu)
-        # owned = bool(cluster["userOwned"])
-        owned = True
-
-        #'table0' to display the cluster name, status, url, and dashboard link
-        table0 = Table(box=None, show_header=False)
-        if owned:
-            table0.add_row("[white on green][bold]Owner")
-        else:
-            table0.add_row("")
-        table0.add_row("[bold underline]" + name, status)
-        table0.add_row()
-        # fixme harcded to default for now
-        table0.add_row(
-            f"[bold]URI:[/bold] ray://{cluster.name}-head-svc.{cluster.namespace}.svc:10001"
-        )  # format that is used to generate the name of the service
-        table0.add_row()
-        table0.add_row(f"[link={dashboard} blue underline]Dashboard:link:[/link]")
-        table0.add_row("")  # empty row for spacing
-
-        #'table1' to display the worker counts
-        table1 = Table(box=None)
-        table1.add_row()
-        table1.add_column("Min", style="cyan", no_wrap=True)
-        table1.add_column("Max", style="magenta")
-        table1.add_row()
-        table1.add_row(mincount, maxcount)
-        table1.add_row()
-
-        #'table2' to display the worker resources
-        table2 = Table(box=None)
-        table2.add_column("Memory", style="cyan", no_wrap=True, min_width=10)
-        table2.add_column("CPU", style="magenta", min_width=10)
-        table2.add_column("GPU", style="magenta", min_width=10)
-        table2.add_row()
-        table2.add_row(memory, cpu, gpu)
-        table2.add_row()
-
-        # panels to encompass table1 and table2 into separate cards
-        panel_1 = Panel.fit(table1, title="Workers")
-        panel_2 = Panel.fit(table2, title="Worker specs(each)")
-
-        # table3 to display panel_1 and panel_2 side-by-side in a single row
-        table3 = Table(box=None, show_header=False, title="Cluster Resources")
-        table3.add_row(panel_1, panel_2)
-
-        # table4 to display table0 and table3, one below the other
-        table4 = Table(box=None, show_header=False)
-        table4.add_row(table0)
-        table4.add_row(table3)
-
-        # Encompass all details of the cluster in a single panel
-        if not title_printed:
-            # If first cluster in the list, then create a table with title "Codeflare clusters".
-            # This is done to ensure the title is center aligned on the cluster display tables, rather
-            # than being center aligned on the console/terminal if we simply use console.print(title)
-
-            table5 = Table(
-                box=None, title="[bold] :rocket: List of CodeFlare clusters :rocket:"
-            )
-            table5.add_row(Panel.fit(table4))
-            console.print(table5)
-            title_printed = True
-        else:
-            console.print(Panel.fit(table4))
-
-
-
-def print_no_resources_found() -
-
-
-
- -Expand source code - -
def print_no_resources_found():
-    console = Console()
-    console.print(Panel("[red]No resources found"))
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file