From 0585f1b1c6a2ffbb0105560f883c092204d02372 Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Tue, 20 Jun 2023 15:31:32 -0400 Subject: [PATCH 1/2] Fixed head node configuration to match template --- src/codeflare_sdk/cluster/config.py | 2 +- src/codeflare_sdk/templates/aw-kuberay.yaml | 220 ----------------- .../templates/base-template.yaml | 118 +++++++-- src/codeflare_sdk/templates/new-template.yaml | 228 ------------------ src/codeflare_sdk/utils/generate_yaml.py | 3 +- 5 files changed, 98 insertions(+), 473 deletions(-) delete mode 100644 src/codeflare_sdk/templates/aw-kuberay.yaml delete mode 100644 src/codeflare_sdk/templates/new-template.yaml diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py index 5d74639d9..25f129256 100644 --- a/src/codeflare_sdk/cluster/config.py +++ b/src/codeflare_sdk/cluster/config.py @@ -44,7 +44,7 @@ class ClusterConfiguration: min_memory: int = 2 max_memory: int = 2 gpu: int = 0 - template: str = f"{dir}/templates/new-template.yaml" + template: str = f"{dir}/templates/base-template.yaml" instascale: bool = False envs: dict = field(default_factory=dict) image: str = "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" diff --git a/src/codeflare_sdk/templates/aw-kuberay.yaml b/src/codeflare_sdk/templates/aw-kuberay.yaml deleted file mode 100644 index e94236495..000000000 --- a/src/codeflare_sdk/templates/aw-kuberay.yaml +++ /dev/null @@ -1,220 +0,0 @@ -apiVersion: mcad.ibm.com/v1beta1 -kind: AppWrapper -metadata: - name: aw-kuberay-glue - namespace: default -spec: - priority: 9 - resources: - Items: [] - GenericItems: - - replicas: 1 - custompodresources: - - replicas: 4 - requests: - cpu: 2 - memory: 12G - nvidia.com/gpu: 1 - limits: - cpu: 2 - memory: 12G - nvidia.com/gpu: 1 - generictemplate: - # This config demonstrates KubeRay's Ray autoscaler integration. - # The resource requests and limits in this config are too small for production! - # For an example with more realistic resource configuration, see - # ray-cluster.autoscaler.large.yaml. - apiVersion: ray.io/v1alpha1 - kind: RayCluster - metadata: - labels: - appwrapper.mcad.ibm.com: "aw-kuberay-glue" - controller-tools.k8s.io: "1.0" - # A unique identifier for the head node and workers of this cluster. - name: glue-cluster - # finalizers: - # - kubernetes - spec: - # The version of Ray you are using. Make sure all Ray containers are running this version of Ray. - rayVersion: '1.12.0' - # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod. - # Ray autoscaler integration is supported only for Ray versions >= 1.11.0 - # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0. - enableInTreeAutoscaling: false - # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler. - # The example configuration shown below below represents the DEFAULT values. - # (You may delete autoscalerOptions if the defaults are suitable.) - autoscalerOptions: - # upscalingMode is "Default" or "Aggressive." - # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster. - # Default: Upscaling is not rate-limited. - # Aggressive: An alias for Default; upscaling is not rate-limited. - upscalingMode: Default - # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources. 
- idleTimeoutSeconds: 60 - # image optionally overrides the autoscaler's container image. - # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as - # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image. - ## image: "my-repo/my-custom-autoscaler-image:tag" - # imagePullPolicy optionally overrides the autoscaler container's image pull policy. - imagePullPolicy: Always - # resources specifies optional resource request and limit overrides for the autoscaler container. - # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required. - resources: - limits: - cpu: "500m" - memory: "512Mi" - requests: - cpu: "500m" - memory: "512Mi" - ######################headGroupSpec################################# - # head group template and specs, (perhaps 'group' is not needed in the name) - headGroupSpec: - # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer' - serviceType: ClusterIP - # logical group name, for this called head-group, also can be functional - # pod type head or worker - # rayNodeType: head # Not needed since it is under the headgroup - # the following params are used to complete the ray start: ray start --head --block ... - rayStartParams: - # Flag "no-monitor" will be automatically set when autoscaling is enabled. - dashboard-host: '0.0.0.0' - block: 'true' - # num-cpus: '1' # can be auto-completed from the limits - # Use `resources` to optionally specify custom resource annotations for the Ray node. - # The value of `resources` is a string-integer mapping. - # Currently, `resources` must be provided in the specific format demonstrated below: - # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"' - num-gpus: '0' - #pod template - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: aw-kuberay-glue #--> key changed to AW name - operator: In - values: - - "aw-kuberay-glue" - containers: - # The Ray head pod - - name: ray-head - image: asm582/codeflare-tl-aws:latest - env: - - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - name: glue-s3-creds - key: AWS_ACCESS_KEY_ID - - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - name: glue-s3-creds - key: AWS_SECRET_ACCESS_KEY - - name: ENDPOINT_URL - valueFrom: - secretKeyRef: - name: glue-s3-creds - key: ENDPOINT_URL - imagePullPolicy: Always - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - lifecycle: - preStop: - exec: - command: ["/bin/sh","-c","ray stop"] - resources: - limits: - cpu: "2" - memory: "12G" - nvidia.com/gpu: "0" - requests: - cpu: "2" - memory: "12G" - nvidia.com/gpu: "0" - workerGroupSpecs: - # the pod replicas in this group typed worker - - replicas: 3 - minReplicas: 3 - maxReplicas: 3 - # logical group name, for this called small-group, also can be functional - groupName: small-group - # if worker pods need to be added, we can simply increment the replicas - # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list - # the operator will remove pods from the list until the number of replicas is satisfied - # when a pod is confirmed to be deleted, its name will be removed from the list below - #scaleStrategy: - # workersToDelete: - # - raycluster-complete-worker-small-group-bdtwh - # - 
raycluster-complete-worker-small-group-hv457 - # - raycluster-complete-worker-small-group-k8tj7 - # the following params are used to complete the ray start: ray start --block ... - rayStartParams: - block: 'true' - num-gpus: '1' - #pod template - template: - metadata: - labels: - key: value - # annotations for pod - annotations: - key: value - # finalizers: - # - kubernetes - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: aw-kuberay-glue #--> key changed to AW name - operator: In - values: - - "aw-kuberay-glue" - initContainers: - # the env var $RAY_IP is set by the operator if missing, with the value of the head service name - - name: init-myservice - image: busybox:1.28 - command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"] - containers: - - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc' - image: asm582/codeflare-tl-aws:latest - env: - - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - name: glue-s3-creds - key: AWS_ACCESS_KEY_ID - - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - name: glue-s3-creds - key: AWS_SECRET_ACCESS_KEY - - name: ENDPOINT_URL - valueFrom: - secretKeyRef: - name: glue-s3-creds - key: ENDPOINT_URL - # environment variables to set in the container.Optional. - # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/ - lifecycle: - preStop: - exec: - command: ["/bin/sh","-c","ray stop"] - resources: - limits: - cpu: "2" - memory: "12G" - nvidia.com/gpu: "1" - requests: - cpu: "2" - memory: "12G" - nvidia.com/gpu: "1" diff --git a/src/codeflare_sdk/templates/base-template.yaml b/src/codeflare_sdk/templates/base-template.yaml index 100e610ac..5a80a96e3 100644 --- a/src/codeflare_sdk/templates/base-template.yaml +++ b/src/codeflare_sdk/templates/base-template.yaml @@ -1,21 +1,37 @@ apiVersion: mcad.ibm.com/v1beta1 kind: AppWrapper metadata: - name: raycluster-autoscaler + name: aw-kuberay namespace: default + #new addition + labels: + orderedinstance: "m4.xlarge_g4dn.xlarge" spec: + priority: 9 resources: Items: [] GenericItems: - replicas: 1 + #new addition custompodresources: - - replicas: 2 + - replicas: 1 requests: - cpu: 10 - memory: 512Mi + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 limits: - cpu: 10 - memory: 1G + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + - replicas: 3 + requests: + cpu: 2 + memory: 12G + nvidia.com/gpu: 1 + limits: + cpu: 2 + memory: 12G + nvidia.com/gpu: 1 generictemplate: # This config demonstrates KubeRay's Ray autoscaler integration. # The resource requests and limits in this config are too small for production! @@ -25,16 +41,19 @@ spec: kind: RayCluster metadata: labels: + appwrapper.mcad.ibm.com: "aw-kuberay" controller-tools.k8s.io: "1.0" # A unique identifier for the head node and workers of this cluster. - name: raycluster-autoscaler + name: kuberay-cluster + # finalizers: + # - kubernetes spec: # The version of Ray you are using. Make sure all Ray containers are running this version of Ray. - rayVersion: '2.0.0' + rayVersion: '2.1.0' # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod. 
      # Ray autoscaler integration is supported only for Ray versions >= 1.11.0
      # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0.
-     enableInTreeAutoscaling: true
+     enableInTreeAutoscaling: false
      # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler.
      # The example configuration shown below represents the DEFAULT values.
      # (You may delete autoscalerOptions if the defaults are suitable.)
@@ -79,13 +98,29 @@ spec:
          # The value of `resources` is a string-integer mapping.
          # Currently, `resources` must be provided in the specific format demonstrated below:
          # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"'
+         num-gpus: '0'
        #pod template
        template:
          spec:
+           #new addition
+           affinity:
+             nodeAffinity:
+               requiredDuringSchedulingIgnoredDuringExecution:
+                 nodeSelectorTerms:
+                 - matchExpressions:
+                   - key: aw-kuberay
+                     operator: In
+                     values:
+                     - "aw-kuberay"
            containers:
            # The Ray head pod
-           - name: ray-head
-             image: rayproject/ray:2.0.0
+           - env:
+             - name: MY_POD_IP
+               valueFrom:
+                 fieldRef:
+                   fieldPath: status.podIP
+             name: ray-head
+             image: rayproject/ray:latest
              imagePullPolicy: Always
              ports:
              - containerPort: 6379
@@ -100,16 +135,18 @@ spec:
                    command: ["/bin/sh","-c","ray stop"]
              resources:
                limits:
-                 cpu: "1"
-                 memory: "1G"
+                 cpu: "2"
+                 memory: "8G"
+                 nvidia.com/gpu: "0"
                requests:
-                 cpu: "500m"
-                 memory: "512Mi"
+                 cpu: "2"
+                 memory: "8G"
+                 nvidia.com/gpu: "0"
        workerGroupSpecs:
        # the pod replicas in this group typed worker
-       - replicas: 1
-         minReplicas: 1
-         maxReplicas: 300
+       - replicas: 3
+         minReplicas: 3
+         maxReplicas: 3
          # logical group name, for this called small-group, also can be functional
          groupName: small-group
          # if worker pods need to be added, we can simply increment the replicas
@@ -124,6 +161,7 @@ spec:
          # the following params are used to complete the ray start: ray start --block ...
          rayStartParams:
            block: 'true'
+           num-gpus: 1
          #pod template
          template:
            metadata:
@@ -132,7 +170,18 @@ spec:
              # annotations for pod
              annotations:
                key: value
+             # finalizers:
+             #   - kubernetes
            spec:
+             affinity:
+               nodeAffinity:
+                 requiredDuringSchedulingIgnoredDuringExecution:
+                   nodeSelectorTerms:
+                   - matchExpressions:
+                     - key: aw-kuberay
+                       operator: In
+                       values:
+                       - "aw-kuberay"
              initContainers:
              # the env var $RAY_IP is set by the operator if missing, with the value of the head service name
              - name: init-myservice
@@ -140,7 +189,12 @@ spec:
                command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
              containers:
              - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
-               image: rayproject/ray:2.0.0
+               image: rayproject/ray:latest
+               env:
+               - name: MY_POD_IP
+                 valueFrom:
+                   fieldRef:
+                     fieldPath: status.podIP
                # environment variables to set in the container. Optional.
# Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/ lifecycle: @@ -149,8 +203,26 @@ spec: command: ["/bin/sh","-c","ray stop"] resources: limits: - cpu: "1" - memory: "512Mi" + cpu: "2" + memory: "12G" + nvidia.com/gpu: "1" requests: - cpu: "500m" - memory: "256Mi" + cpu: "2" + memory: "12G" + nvidia.com/gpu: "1" + - replica: 1 + generictemplate: + kind: Route + apiVersion: route.openshift.io/v1 + metadata: + name: ray-dashboard-deployment-name + namespace: default + labels: + # allows me to return name of service that Ray operator creates + odh-ray-cluster-service: deployment-name-head-svc + spec: + to: + kind: Service + name: deployment-name-head-svc + port: + targetPort: dashboard diff --git a/src/codeflare_sdk/templates/new-template.yaml b/src/codeflare_sdk/templates/new-template.yaml deleted file mode 100644 index 585f05baa..000000000 --- a/src/codeflare_sdk/templates/new-template.yaml +++ /dev/null @@ -1,228 +0,0 @@ -apiVersion: mcad.ibm.com/v1beta1 -kind: AppWrapper -metadata: - name: aw-kuberay - namespace: default - #new addition - labels: - orderedinstance: "m4.xlarge_g4dn.xlarge" -spec: - priority: 9 - resources: - Items: [] - GenericItems: - - replicas: 1 - #new addition - custompodresources: - - replicas: 1 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - - replicas: 3 - requests: - cpu: 2 - memory: 12G - nvidia.com/gpu: 1 - limits: - cpu: 2 - memory: 12G - nvidia.com/gpu: 1 - generictemplate: - # This config demonstrates KubeRay's Ray autoscaler integration. - # The resource requests and limits in this config are too small for production! - # For an example with more realistic resource configuration, see - # ray-cluster.autoscaler.large.yaml. - apiVersion: ray.io/v1alpha1 - kind: RayCluster - metadata: - labels: - appwrapper.mcad.ibm.com: "aw-kuberay" - controller-tools.k8s.io: "1.0" - # A unique identifier for the head node and workers of this cluster. - name: kuberay-cluster - # finalizers: - # - kubernetes - spec: - # The version of Ray you are using. Make sure all Ray containers are running this version of Ray. - rayVersion: '1.12.0' - # If enableInTreeAutoscaling is true, the autoscaler sidecar will be added to the Ray head pod. - # Ray autoscaler integration is supported only for Ray versions >= 1.11.0 - # Ray autoscaler integration is Beta with KubeRay >= 0.3.0 and Ray >= 2.0.0. - enableInTreeAutoscaling: false - # autoscalerOptions is an OPTIONAL field specifying configuration overrides for the Ray autoscaler. - # The example configuration shown below below represents the DEFAULT values. - # (You may delete autoscalerOptions if the defaults are suitable.) - autoscalerOptions: - # upscalingMode is "Default" or "Aggressive." - # Conservative: Upscaling is rate-limited; the number of pending worker pods is at most the size of the Ray cluster. - # Default: Upscaling is not rate-limited. - # Aggressive: An alias for Default; upscaling is not rate-limited. - upscalingMode: Default - # idleTimeoutSeconds is the number of seconds to wait before scaling down a worker pod which is not using Ray resources. - idleTimeoutSeconds: 60 - # image optionally overrides the autoscaler's container image. - # If instance.spec.rayVersion is at least "2.0.0", the autoscaler will default to the same image as - # the ray container. For older Ray versions, the autoscaler will default to using the Ray 2.0.0 image. 
- ## image: "my-repo/my-custom-autoscaler-image:tag" - # imagePullPolicy optionally overrides the autoscaler container's image pull policy. - imagePullPolicy: Always - # resources specifies optional resource request and limit overrides for the autoscaler container. - # For large Ray clusters, we recommend monitoring container resource usage to determine if overriding the defaults is required. - resources: - limits: - cpu: "500m" - memory: "512Mi" - requests: - cpu: "500m" - memory: "512Mi" - ######################headGroupSpec################################# - # head group template and specs, (perhaps 'group' is not needed in the name) - headGroupSpec: - # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer' - serviceType: ClusterIP - # logical group name, for this called head-group, also can be functional - # pod type head or worker - # rayNodeType: head # Not needed since it is under the headgroup - # the following params are used to complete the ray start: ray start --head --block ... - rayStartParams: - # Flag "no-monitor" will be automatically set when autoscaling is enabled. - dashboard-host: '0.0.0.0' - block: 'true' - # num-cpus: '1' # can be auto-completed from the limits - # Use `resources` to optionally specify custom resource annotations for the Ray node. - # The value of `resources` is a string-integer mapping. - # Currently, `resources` must be provided in the specific format demonstrated below: - # resources: '"{\"Custom1\": 1, \"Custom2\": 5}"' - num-gpus: '0' - #pod template - template: - spec: - #new addition - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: aw-kuberay - operator: In - values: - - "aw-kuberay" - containers: - # The Ray head pod - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - name: ray-head - image: rayproject/ray:latest - imagePullPolicy: Always - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - lifecycle: - preStop: - exec: - command: ["/bin/sh","-c","ray stop"] - resources: - limits: - cpu: "2" - memory: "12G" - nvidia.com/gpu: "0" - requests: - cpu: "2" - memory: "12G" - nvidia.com/gpu: "0" - workerGroupSpecs: - # the pod replicas in this group typed worker - - replicas: 3 - minReplicas: 3 - maxReplicas: 3 - # logical group name, for this called small-group, also can be functional - groupName: small-group - # if worker pods need to be added, we can simply increment the replicas - # if worker pods need to be removed, we decrement the replicas, and populate the podsToDelete list - # the operator will remove pods from the list until the number of replicas is satisfied - # when a pod is confirmed to be deleted, its name will be removed from the list below - #scaleStrategy: - # workersToDelete: - # - raycluster-complete-worker-small-group-bdtwh - # - raycluster-complete-worker-small-group-hv457 - # - raycluster-complete-worker-small-group-k8tj7 - # the following params are used to complete the ray start: ray start --block ... 
- rayStartParams: - block: 'true' - num-gpus: 1 - #pod template - template: - metadata: - labels: - key: value - # annotations for pod - annotations: - key: value - # finalizers: - # - kubernetes - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: aw-kuberay - operator: In - values: - - "aw-kuberay" - initContainers: - # the env var $RAY_IP is set by the operator if missing, with the value of the head service name - - name: init-myservice - image: busybox:1.28 - command: ['sh', '-c', "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"] - containers: - - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc' - image: rayproject/ray:latest - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - # environment variables to set in the container.Optional. - # Refer to https://kubernetes.io/docs/tasks/inject-data-application/define-environment-variable-container/ - lifecycle: - preStop: - exec: - command: ["/bin/sh","-c","ray stop"] - resources: - limits: - cpu: "2" - memory: "12G" - nvidia.com/gpu: "1" - requests: - cpu: "2" - memory: "12G" - nvidia.com/gpu: "1" - - replica: 1 - generictemplate: - kind: Route - apiVersion: route.openshift.io/v1 - metadata: - name: ray-dashboard-deployment-name - namespace: default - labels: - # allows me to return name of service that Ray operator creates - odh-ray-cluster-service: deployment-name-head-svc - spec: - to: - kind: Service - name: deployment-name-head-svc - port: - targetPort: dashboard diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index b80e83e51..36757a2d6 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -185,7 +185,8 @@ def update_nodes( update_image(spec, image) update_env(spec, env) if comp == head: - update_resources(spec, 2, 2, 8, 8, 0) + # TODO: Eventually add head node configuration outside of template + continue else: update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu) From de41cf940903d446c282ee294babd5c4236a0149 Mon Sep 17 00:00:00 2001 From: maxusmusti Date: Tue, 20 Jun 2023 16:02:27 -0400 Subject: [PATCH 2/2] Updated unit tests --- src/codeflare_sdk/templates/base-template.yaml | 8 ++++---- tests/test-case-cmd.yaml | 2 +- tests/test-case.yaml | 2 +- tests/unit_test.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/codeflare_sdk/templates/base-template.yaml b/src/codeflare_sdk/templates/base-template.yaml index 5a80a96e3..c99fd105d 100644 --- a/src/codeflare_sdk/templates/base-template.yaml +++ b/src/codeflare_sdk/templates/base-template.yaml @@ -135,13 +135,13 @@ spec: command: ["/bin/sh","-c","ray stop"] resources: limits: - cpu: "2" + cpu: 2 memory: "8G" - nvidia.com/gpu: "0" + nvidia.com/gpu: 0 requests: - cpu: "2" + cpu: 2 memory: "8G" - nvidia.com/gpu: "0" + nvidia.com/gpu: 0 workerGroupSpecs: # the pod replicas in this group typed worker - replicas: 3 diff --git a/tests/test-case-cmd.yaml b/tests/test-case-cmd.yaml index 4e5d72534..450ec9668 100644 --- a/tests/test-case-cmd.yaml +++ b/tests/test-case-cmd.yaml @@ -88,7 +88,7 @@ spec: cpu: 2 memory: 8G nvidia.com/gpu: 0 - rayVersion: 1.12.0 + rayVersion: 2.1.0 workerGroupSpecs: - groupName: 
small-group-unit-cmd-cluster maxReplicas: 2 diff --git a/tests/test-case.yaml b/tests/test-case.yaml index 2aebb0007..133a22229 100644 --- a/tests/test-case.yaml +++ b/tests/test-case.yaml @@ -99,7 +99,7 @@ spec: cpu: 2 memory: 8G nvidia.com/gpu: 0 - rayVersion: 1.12.0 + rayVersion: 2.1.0 workerGroupSpecs: - groupName: small-group-unit-test-cluster maxReplicas: 2 diff --git a/tests/unit_test.py b/tests/unit_test.py index f1255dc45..47c0d43a8 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -224,7 +224,7 @@ def test_config_creation(): config.image == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" ) - assert config.template == f"{parent}/src/codeflare_sdk/templates/new-template.yaml" + assert config.template == f"{parent}/src/codeflare_sdk/templates/base-template.yaml" assert config.instascale assert config.machine_types == ["cpu.small", "gpu.large"] return config @@ -1983,7 +1983,7 @@ def test_AWManager_submit_remove(mocker, capsys): # Make sure to keep this function and the following function at the end of the file def test_cmd_line_generation(): os.system( - f"python3 {parent}/src/codeflare_sdk/utils/generate_yaml.py --name=unit-cmd-cluster --min-cpu=1 --max-cpu=1 --min-memory=2 --max-memory=2 --gpu=1 --workers=2 --template=src/codeflare_sdk/templates/new-template.yaml" + f"python3 {parent}/src/codeflare_sdk/utils/generate_yaml.py --name=unit-cmd-cluster --min-cpu=1 --max-cpu=1 --min-memory=2 --max-memory=2 --gpu=1 --workers=2 --template=src/codeflare_sdk/templates/base-template.yaml" ) assert filecmp.cmp( "unit-cmd-cluster.yaml", f"{parent}/tests/test-case-cmd.yaml", shallow=True
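
Note on the generate_yaml.py hunk in PATCH 1/2: the change makes update_nodes() skip the head pod entirely, so the head keeps the resources defined in base-template.yaml (2 CPU / 8G / 0 GPU) instead of being overwritten with hardcoded values. Below is a minimal, runnable sketch of that control flow under stated assumptions: the update_resources signature matches the call visible in the hunk, while head, node_specs, and the dict-based pod specs are simplified stand-ins for the SDK's real YAML plumbing, not the actual implementation.

    head = "head"

    def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu):
        # Stand-in for the SDK helper: overwrite a pod spec's requests/limits.
        spec["resources"] = {
            "requests": {"cpu": min_cpu, "memory": f"{min_memory}G", "nvidia.com/gpu": gpu},
            "limits": {"cpu": max_cpu, "memory": f"{max_memory}G", "nvidia.com/gpu": gpu},
        }

    def update_nodes(node_specs, min_cpu, max_cpu, min_memory, max_memory, gpu):
        for comp, spec in node_specs:
            if comp == head:
                # After this patch: leave the head pod exactly as the template
                # defines it, rather than forcing update_resources(spec, 2, 2, 8, 8, 0)
                # as the old code did.
                continue
            update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu)

    # Only the worker spec is rewritten; the head spec is left untouched.
    specs = [("head", {"resources": "as-defined-in-base-template"}), ("worker", {})]
    update_nodes(specs, min_cpu=2, max_cpu=2, min_memory=12, max_memory=12, gpu=1)
    print(specs)

This is also why the unit-test fixtures in PATCH 2/2 keep the head node at 2 CPU / 8G / 0 GPU regardless of the --min-cpu/--max-cpu/--gpu arguments passed on the command line: those flags now shape only the worker group.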