From 0585f1b1c6a2ffbb0105560f883c092204d02372 Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Tue, 20 Jun 2023 15:31:32 -0400 Subject: [PATCH 1/2] Fixed head node configuration to match template --- src/codeflare_sdk/cluster/config.py | 2 +- src/codeflare_sdk/templates/aw-kuberay.yaml | 220 ----------------- .../templates/base-template.yaml | 118 +++++++-- src/codeflare_sdk/templates/new-template.yaml | 228 ------------------ src/codeflare_sdk/utils/generate_yaml.py | 3 +- 5 files changed, 98 insertions(+), 473 deletions(-) delete mode 100644 src/codeflare_sdk/templates/aw-kuberay.yaml delete mode 100644 src/codeflare_sdk/templates/new-template.yaml diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py index 5d74639d9..25f129256 100644 --- a/src/codeflare_sdk/cluster/config.py +++ b/src/codeflare_sdk/cluster/config.py @@ -44,7 +44,7 @@ class ClusterConfiguration: min_memory: int = 2 max_memory: int = 2 gpu: int = 0 - template: str = f"{dir}/templates/new-template.yaml" + template: str = f"{dir}/templates/base-template.yaml" instascale: bool = False envs: dict = field(default_factory=dict) image: str = "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" diff --git a/src/codeflare_sdk/templates/aw-kuberay.yaml b/src/codeflare_sdk/templates/aw-kuberay.yaml deleted file mode 100644 index e94236495..000000000 --- a/src/codeflare_sdk/templates/aw-kuberay.yaml +++ /dev/null @@ -1,220 +0,0 @@ -apiVersion: mcad.ibm.com/v1beta1 -kind: AppWrapper -metadata: - name: aw-kuberay-glue - namespace: default -spec: - priority: 9 - resources: - Items: [] - GenericItems: - - replicas: 1 - custompodresources: - - replicas: 4 - requests: - cpu: 2 - memory: 12G - nvidia.com/gpu: 1 - limits: - cpu: 2 - memory: 12G - nvidia.com/gpu: 1 - generictemplate: - # This config demonstrates KubeRay's Ray autoscaler integration. - # The resource requests and limits in this config are too small for production! - # For an example with more realistic resource configuration, see - # ray-cluster.autoscaler.large.yaml. - apiVersion: ray.io/v1alpha1 - kind: RayCluster - metadata: - labels: - appwrapper.mcad.ibm.com: "aw-kuberay-glue" - controller-tools.k8s.io: "1.0" - # A unique identifier for the head node and workers of this cluster. - name: glue-cluster - # finalizers: - # - kubernetes - spec: - # The version of Ray you are using. Make sure all Ray containers are running this version of Ray. - rayVersion: '1.12.0' - # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod. - # Ray autoscaler integration is supported only for Ray versions >= 1.11.0 - # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0. - enableInTreeAutoscaling: false - # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler. - # The example configuration shown below below represents the DEFAULT values. - # (You may delete autoscalerOptions if the defaults are suitable.) - autoscalerOptions: - # upscalingMode is "Default" or "Aggressive." - # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster. - # Default: Upscaling is not rate-limited. - # Aggressive: An alias for Default; upscaling is not rate-limited. - upscalingMode: Default - # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources. 
- idleTimeoutSeconds: 60 - # image optionally overrides the autoscaler's container image. - # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as - # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image. - ## image: "my-repo/my-custom-autoscaler-image:tag" - # imagePullPolicy optionally overrides the autoscaler container's image pull policy. - imagePullPolicy: Always - # resources specifies optional resource request and limit overrides for the autoscaler container. - # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required. - resources: - limits: - cpu: "500m" - memory: "512Mi" - requests: - cpu: "500m" - memory: "512Mi" - ######################headGroupSpec################################# - # head group template and specs, (perhaps 'group' is not needed in the name) - headGroupSpec: - # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer' - serviceType: ClusterIP - # logical group name, for this called head-group, also can be functional - # pod type head or worker - # rayNodeType: head # Not needed since it is under the headgroup - # the following params are used to complete the ray start: ray start --head --block ... - rayStartParams: - # Flag "no-monitor" will be automatically set when autoscaling is enabled. - dashboard-host: '0.0.0.0' - block: 'true' - # num-cpus: '1' # can be auto-completed from the limits - # Use `resources` to optionally specify custom resource annotations for the Ray node. - # The value of `resources` is a string-integer mapping. - # Currently, `resources` must be provided in the specific format demonstrated below: - # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"' - num-gpus: '0' - #pod template - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: aw-kuberay-glue #--> key changed to AW name - operator: In - values: - - "aw-kuberay-glue" - containers: - # The Ray head pod - - name: ray-head - image: asm582/codeflare-tl-aws:latest - env: - - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - name: glue-s3-creds - key: AWS_ACCESS_KEY_ID - - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - name: glue-s3-creds - key: AWS_SECRET_ACCESS_KEY - - name: ENDPOINT_URL - valueFrom: - secretKeyRef: - name: glue-s3-creds - key: ENDPOINT_URL - imagePullPolicy: Always - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - lifecycle: - preStop: - exec: - command: ["/bin/sh","-c","ray stop"] - resources: - limits: - cpu: "2" - memory: "12G" - nvidia.com/gpu: "0" - requests: - cpu: "2" - memory: "12G" - nvidia.com/gpu: "0" - workerGroupSpecs: - # the pod replicas in this group typed worker - - replicas: 3 - minReplicas: 3 - maxReplicas: 3 - # logical group name, for this called small-group, also can be functional - groupName: small-group - # if worker pods need to be added, we can simply increment the replicas - # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list - # the operator will remove pods from the list until the number of replicas is satisfied - # when a pod is confirmed to be deleted, its name will be removed from the list below - #scaleStrategy: - # workersToDelete: - # - raycluster-complete-worker-small-group-bdtwh - # - 
raycluster-complete-worker-small-group-hv457 - # - raycluster-complete-worker-small-group-k8tj7 - # the following params are used to complete the ray start: ray start --block ... - rayStartParams: - block: 'true' - num-gpus: '1' - #pod template - template: - metadata: - labels: - key: value - # annotations for pod - annotations: - key: value - # finalizers: - # - kubernetes - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: aw-kuberay-glue #--> key changed to AW name - operator: In - values: - - "aw-kuberay-glue" - initContainers: - # the env var $RAY_IP is set by the operator if missing, with the value of the head service name - - name: init-myservice - image: busybox:1.28 - command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"] - containers: - - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc' - image: asm582/codeflare-tl-aws:latest - env: - - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - name: glue-s3-creds - key: AWS_ACCESS_KEY_ID - - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - name: glue-s3-creds - key: AWS_SECRET_ACCESS_KEY - - name: ENDPOINT_URL - valueFrom: - secretKeyRef: - name: glue-s3-creds - key: ENDPOINT_URL - # environment variables to set in the container.Optional. - # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/ - lifecycle: - preStop: - exec: - command: ["/bin/sh","-c","ray stop"] - resources: - limits: - cpu: "2" - memory: "12G" - nvidia.com/gpu: "1" - requests: - cpu: "2" - memory: "12G" - nvidia.com/gpu: "1" diff --git a/src/codeflare_sdk/templates/base-template.yaml b/src/codeflare_sdk/templates/base-template.yaml index 100e610ac..5a80a96e3 100644 --- a/src/codeflare_sdk/templates/base-template.yaml +++ b/src/codeflare_sdk/templates/base-template.yaml @@ -1,21 +1,37 @@ apiVersion: mcad.ibm.com/v1beta1 kind: AppWrapper metadata: - name: raycluster-autoscaler + name: aw-kuberay namespace: default + #new addition + labels: + orderedinstance: "m4.xlarge_g4dn.xlarge" spec: + priority: 9 resources: Items: [] GenericItems: - replicas: 1 + #new addition custompodresources: - - replicas: 2 + - replicas: 1 requests: - cpu: 10 - memory: 512Mi + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 limits: - cpu: 10 - memory: 1G + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + - replicas: 3 + requests: + cpu: 2 + memory: 12G + nvidia.com/gpu: 1 + limits: + cpu: 2 + memory: 12G + nvidia.com/gpu: 1 generictemplate: # This config demonstrates KubeRay's Ray autoscaler integration. # The resource requests and limits in this config are too small for production! @@ -25,16 +41,19 @@ spec: kind: RayCluster metadata: labels: + appwrapper.mcad.ibm.com: "aw-kuberay" controller-tools.k8s.io: "1.0" # A unique identifier for the head node and workers of this cluster. - name: raycluster-autoscaler + name: kuberay-cluster + # finalizers: + # - kubernetes spec: # The version of Ray you are using. Make sure all Ray containers are running this version of Ray. - rayVersion: '2.0.0' + rayVersion: '2.1.0' # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod. 
      # Ray autoscaler integration is supported only for Ray versions >= 1.11.0
      # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
-     enableInTreeAutoscaling: true
+     enableInTreeAutoscaling: false
      # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler.
      # The example configuration shown below represents the DEFAULT values.
      # (You may delete autoscalerOptions if the defaults are suitable.)
@@ -79,13 +98,29 @@ spec:
          # The value of `resources` is a string-integer mapping.
          # Currently, `resources` must be provided in the specific format demonstrated below:
          # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'
+         num-gpus: '0'
        #pod template
        template:
          spec:
+           #new addition
+           affinity:
+             nodeAffinity:
+               requiredDuringSchedulingIgnoredDuringExecution:
+                 nodeSelectorTerms:
+                 - matchExpressions:
+                   - key: aw-kuberay
+                     operator: In
+                     values:
+                     - "aw-kuberay"
            containers:
            # The Ray head pod
-           - name: ray-head
-             image: rayproject/ray:2.0.0
+           - env:
+             - name: MY_POD_IP
+               valueFrom:
+                 fieldRef:
+                   fieldPath: status.podIP
+             name: ray-head
+             image: rayproject/ray:latest
              imagePullPolicy: Always
              ports:
              - containerPort: 6379
@@ -100,16 +135,18 @@ spec:
                    command: ["/bin/sh","-c","ray stop"]
              resources:
                limits:
-                 cpu: "1"
-                 memory: "1G"
+                 cpu: "2"
+                 memory: "8G"
+                 nvidia.com/gpu: "0"
                requests:
-                 cpu: "500m"
-                 memory: "512Mi"
+                 cpu: "2"
+                 memory: "8G"
+                 nvidia.com/gpu: "0"
        workerGroupSpecs:
        # the pod replicas in this group typed worker
-       - replicas: 1
-         minReplicas: 1
-         maxReplicas: 300
+       - replicas: 3
+         minReplicas: 3
+         maxReplicas: 3
          # logical group name, for this called small-group, also can be functional
          groupName: small-group
          # if worker pods need to be added, we can simply increment the replicas
@@ -124,6 +161,7 @@ spec:
          # the following params are used to complete the ray start: ray start --block ...
          rayStartParams:
            block: 'true'
+           num-gpus: 1
          #pod template
          template:
            metadata:
@@ -132,7 +170,18 @@ spec:
              # annotations for pod
              annotations:
                key: value
+             # finalizers:
+             #   - kubernetes
            spec:
+             affinity:
+               nodeAffinity:
+                 requiredDuringSchedulingIgnoredDuringExecution:
+                   nodeSelectorTerms:
+                   - matchExpressions:
+                     - key: aw-kuberay
+                       operator: In
+                       values:
+                       - "aw-kuberay"
              initContainers:
              # the env var $RAY_IP is set by the operator if missing, with the value of the head service name
              - name: init-myservice
@@ -140,7 +189,12 @@ spec:
                command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
              containers:
              - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
-               image: rayproject/ray:2.0.0
+               image: rayproject/ray:latest
+               env:
+               - name: MY_POD_IP
+                 valueFrom:
+                   fieldRef:
+                     fieldPath: status.podIP
                # environment variables to set in the container. Optional.
# Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/ lifecycle: @@ -149,8 +203,26 @@ spec: command: ["/bin/sh","-c","ray stop"] resources: limits: - cpu: "1" - memory: "512Mi" + cpu: "2" + memory: "12G" + nvidia.com/gpu: "1" requests: - cpu: "500m" - memory: "256Mi" + cpu: "2" + memory: "12G" + nvidia.com/gpu: "1" + - replica: 1 + generictemplate: + kind: Route + apiVersion: route.openshift.io/v1 + metadata: + name: ray-dashboard-deployment-name + namespace: default + labels: + # allows me to return name of service that Ray operator creates + odh-ray-cluster-service: deployment-name-head-svc + spec: + to: + kind: Service + name: deployment-name-head-svc + port: + targetPort: dashboard diff --git a/src/codeflare_sdk/templates/new-template.yaml b/src/codeflare_sdk/templates/new-template.yaml deleted file mode 100644 index 585f05baa..000000000 --- a/src/codeflare_sdk/templates/new-template.yaml +++ /dev/null @@ -1,228 +0,0 @@ -apiVersion: mcad.ibm.com/v1beta1 -kind: AppWrapper -metadata: - name: aw-kuberay - namespace: default - #new addition - labels: - orderedinstance: "m4.xlarge_g4dn.xlarge" -spec: - priority: 9 - resources: - Items: [] - GenericItems: - - replicas: 1 - #new addition - custompodresources: - - replicas: 1 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - - replicas: 3 - requests: - cpu: 2 - memory: 12G - nvidia.com/gpu: 1 - limits: - cpu: 2 - memory: 12G - nvidia.com/gpu: 1 - generictemplate: - # This config demonstrates KubeRay's Ray autoscaler integration. - # The resource requests and limits in this config are too small for production! - # For an example with more realistic resource configuration, see - # ray-cluster.autoscaler.large.yaml. - apiVersion: ray.io/v1alpha1 - kind: RayCluster - metadata: - labels: - appwrapper.mcad.ibm.com: "aw-kuberay" - controller-tools.k8s.io: "1.0" - # A unique identifier for the head node and workers of this cluster. - name: kuberay-cluster - # finalizers: - # - kubernetes - spec: - # The version of Ray you are using. Make sure all Ray containers are running this version of Ray. - rayVersion: '1.12.0' - # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod. - # Ray autoscaler integration is supported only for Ray versions >= 1.11.0 - # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0. - enableInTreeAutoscaling: false - # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler. - # The example configuration shown below below represents the DEFAULT values. - # (You may delete autoscalerOptions if the defaults are suitable.) - autoscalerOptions: - # upscalingMode is "Default" or "Aggressive." - # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster. - # Default: Upscaling is not rate-limited. - # Aggressive: An alias for Default; upscaling is not rate-limited. - upscalingMode: Default - # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources. - idleTimeoutSeconds: 60 - # image optionally overrides the autoscaler's container image. - # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as - # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image. 
- ## image: "my-repo/my-custom-autoscaler-image:tag" - # imagePullPolicy optionally overrides the autoscaler container's image pull policy. - imagePullPolicy: Always - # resources specifies optional resource request and limit overrides for the autoscaler container. - # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required. - resources: - limits: - cpu: "500m" - memory: "512Mi" - requests: - cpu: "500m" - memory: "512Mi" - ######################headGroupSpec################################# - # head group template and specs, (perhaps 'group' is not needed in the name) - headGroupSpec: - # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer' - serviceType: ClusterIP - # logical group name, for this called head-group, also can be functional - # pod type head or worker - # rayNodeType: head # Not needed since it is under the headgroup - # the following params are used to complete the ray start: ray start --head --block ... - rayStartParams: - # Flag "no-monitor" will be automatically set when autoscaling is enabled. - dashboard-host: '0.0.0.0' - block: 'true' - # num-cpus: '1' # can be auto-completed from the limits - # Use `resources` to optionally specify custom resource annotations for the Ray node. - # The value of `resources` is a string-integer mapping. - # Currently, `resources` must be provided in the specific format demonstrated below: - # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"' - num-gpus: '0' - #pod template - template: - spec: - #new addition - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: aw-kuberay - operator: In - values: - - "aw-kuberay" - containers: - # The Ray head pod - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - name: ray-head - image: rayproject/ray:latest - imagePullPolicy: Always - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - lifecycle: - preStop: - exec: - command: ["/bin/sh","-c","ray stop"] - resources: - limits: - cpu: "2" - memory: "12G" - nvidia.com/gpu: "0" - requests: - cpu: "2" - memory: "12G" - nvidia.com/gpu: "0" - workerGroupSpecs: - # the pod replicas in this group typed worker - - replicas: 3 - minReplicas: 3 - maxReplicas: 3 - # logical group name, for this called small-group, also can be functional - groupName: small-group - # if worker pods need to be added, we can simply increment the replicas - # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list - # the operator will remove pods from the list until the number of replicas is satisfied - # when a pod is confirmed to be deleted, its name will be removed from the list below - #scaleStrategy: - # workersToDelete: - # - raycluster-complete-worker-small-group-bdtwh - # - raycluster-complete-worker-small-group-hv457 - # - raycluster-complete-worker-small-group-k8tj7 - # the following params are used to complete the ray start: ray start --block ... 
- rayStartParams: - block: 'true' - num-gpus: 1 - #pod template - template: - metadata: - labels: - key: value - # annotations for pod - annotations: - key: value - # finalizers: - # - kubernetes - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: aw-kuberay - operator: In - values: - - "aw-kuberay" - initContainers: - # the env var $RAY_IP is set by the operator if missing, with the value of the head service name - - name: init-myservice - image: busybox:1.28 - command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"] - containers: - - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc' - image: rayproject/ray:latest - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - # environment variables to set in the container.Optional. - # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/ - lifecycle: - preStop: - exec: - command: ["/bin/sh","-c","ray stop"] - resources: - limits: - cpu: "2" - memory: "12G" - nvidia.com/gpu: "1" - requests: - cpu: "2" - memory: "12G" - nvidia.com/gpu: "1" - - replica: 1 - generictemplate: - kind: Route - apiVersion: route.openshift.io/v1 - metadata: - name: ray-dashboard-deployment-name - namespace: default - labels: - # allows me to return name of service that Ray operator creates - odh-ray-cluster-service: deployment-name-head-svc - spec: - to: - kind: Service - name: deployment-name-head-svc - port: - targetPort: dashboard diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index b80e83e51..36757a2d6 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -185,7 +185,8 @@ def update_nodes( update_image(spec, image) update_env(spec, env) if comp == head: - update_resources(spec, 2, 2, 8, 8, 0) + # TODO: Eventually add head node configuration outside of template + continue else: update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu) From de41cf940903d446c282ee294babd5c4236a0149 Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Tue, 20 Jun 2023 16:02:27 -0400 Subject: [PATCH 2/2] Updated unit tests --- src/codeflare_sdk/templates/base-template.yaml | 8 ++++---- tests/test-case-cmd.yaml | 2 +- tests/test-case.yaml | 2 +- tests/unit_test.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/codeflare_sdk/templates/base-template.yaml b/src/codeflare_sdk/templates/base-template.yaml index 5a80a96e3..c99fd105d 100644 --- a/src/codeflare_sdk/templates/base-template.yaml +++ b/src/codeflare_sdk/templates/base-template.yaml @@ -135,13 +135,13 @@ spec: command: ["/bin/sh","-c","ray stop"] resources: limits: - cpu: "2" + cpu: 2 memory: "8G" - nvidia.com/gpu: "0" + nvidia.com/gpu: 0 requests: - cpu: "2" + cpu: 2 memory: "8G" - nvidia.com/gpu: "0" + nvidia.com/gpu: 0 workerGroupSpecs: # the pod replicas in this group typed worker - replicas: 3 diff --git a/tests/test-case-cmd.yaml b/tests/test-case-cmd.yaml index 4e5d72534..450ec9668 100644 --- a/tests/test-case-cmd.yaml +++ b/tests/test-case-cmd.yaml @@ -88,7 +88,7 @@ spec: cpu: 2 memory: 8G nvidia.com/gpu: 0 - rayVersion: 1.12.0 + rayVersion: 2.1.0 workerGroupSpecs: - groupName: 
small-group-unit-cmd-cluster maxReplicas: 2 diff --git a/tests/test-case.yaml b/tests/test-case.yaml index 2aebb0007..133a22229 100644 --- a/tests/test-case.yaml +++ b/tests/test-case.yaml @@ -99,7 +99,7 @@ spec: cpu: 2 memory: 8G nvidia.com/gpu: 0 - rayVersion: 1.12.0 + rayVersion: 2.1.0 workerGroupSpecs: - groupName: small-group-unit-test-cluster maxReplicas: 2 diff --git a/tests/unit_test.py b/tests/unit_test.py index f1255dc45..47c0d43a8 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -224,7 +224,7 @@ def test_config_creation(): config.image == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" ) - assert config.template == f"{parent}/src/codeflare_sdk/templates/new-template.yaml" + assert config.template == f"{parent}/src/codeflare_sdk/templates/base-template.yaml" assert config.instascale assert config.machine_types == ["cpu.small", "gpu.large"] return config @@ -1983,7 +1983,7 @@ def test_AWManager_submit_remove(mocker, capsys): # Make sure to keep this function and the following function at the end of the file def test_cmd_line_generation(): os.system( - f"python3 {parent}/src/codeflare_sdk/utils/generate_yaml.py --name=unit-cmd-cluster --min-cpu=1 --max-cpu=1 --min-memory=2 --max-memory=2 --gpu=1 --workers=2 --template=src/codeflare_sdk/templates/new-template.yaml" + f"python3 {parent}/src/codeflare_sdk/utils/generate_yaml.py --name=unit-cmd-cluster --min-cpu=1 --max-cpu=1 --min-memory=2 --max-memory=2 --gpu=1 --workers=2 --template=src/codeflare_sdk/templates/base-template.yaml" ) assert filecmp.cmp( "unit-cmd-cluster.yaml", f"{parent}/tests/test-case-cmd.yaml", shallow=True
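
Note on the generate_yaml.py hunk in PATCH 1/2: the change makes update_nodes() skip the head pod entirely, so the head keeps the resources defined in base-template.yaml (2 CPU / 8G / 0 GPU) instead of being overwritten with hardcoded values. Below is a minimal, runnable sketch of that control flow under stated assumptions: the update_resources signature matches the call visible in the hunk, while head, node_specs, and the dict-based pod specs are simplified stand-ins for the SDK's real YAML plumbing, not the actual implementation.

    head = "head"

    def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu):
        # Stand-in for the SDK helper: overwrite a pod spec's requests/limits.
        spec["resources"] = {
            "requests": {"cpu": min_cpu, "memory": f"{min_memory}G", "nvidia.com/gpu": gpu},
            "limits": {"cpu": max_cpu, "memory": f"{max_memory}G", "nvidia.com/gpu": gpu},
        }

    def update_nodes(node_specs, min_cpu, max_cpu, min_memory, max_memory, gpu):
        for comp, spec in node_specs:
            if comp == head:
                # After this patch: leave the head pod exactly as the template
                # defines it, rather than forcing update_resources(spec, 2, 2, 8, 8, 0)
                # as the old code did.
                continue
            update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu)

    # Only the worker spec is rewritten; the head spec is left untouched.
    specs = [("head", {"resources": "as-defined-in-base-template"}), ("worker", {})]
    update_nodes(specs, min_cpu=2, max_cpu=2, min_memory=12, max_memory=12, gpu=1)
    print(specs)

This is also why the unit-test fixtures in PATCH 2/2 keep the head node at 2 CPU / 8G / 0 GPU regardless of the --min-cpu/--max-cpu/--gpu arguments passed on the command line: those flags now shape only the worker group.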