Skip to content

Commit bed345d

Browse files
committed
update to test
1 parent 0af1f73 commit bed345d

File tree

3 files changed

+267
-152
lines changed

3 files changed

+267
-152
lines changed

test/e2e/instascale/instascale_test.go

Lines changed: 0 additions & 133 deletions
This file was deleted.

test/e2e/instascale_test.go

Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
package e2e
2+
3+
import (
4+
"sync"
5+
"testing"
6+
"time"
7+
8+
. "github.com/onsi/gomega"
9+
. "github.com/project-codeflare/codeflare-operator/test/support"
10+
mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1"
11+
batchv1 "k8s.io/api/batch/v1"
12+
corev1 "k8s.io/api/core/v1"
13+
"k8s.io/apimachinery/pkg/api/resource"
14+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
15+
)
16+
17+
// Package-level state written by TestInstascale while it inspects the cluster.
//
// NOTE(review): TestInstascale calls t.Parallel(), so any other test in this
// package that touches these variables would race with it — confirm there is
// none, or move the state into the test function.
var (
	// machinePoolsExist is set to true by TestInstascale when the initial
	// machine pool count is non-zero, and to false otherwise.
	machinePoolsExist bool
	// numInitialNodePools is the node pool count recorded before the
	// AppWrapper is submitted; it is only populated when no machine pools exist.
	numInitialNodePools int
	// numInitialMachineSets is the machine set count recorded before the
	// AppWrapper is submitted; it is only populated when neither machine pools
	// nor node pools exist.
	numInitialMachineSets int
	// wg is not referenced anywhere in the visible part of this file.
	// NOTE(review): presumably shared with other tests in the package — verify
	// before removing.
	wg = &sync.WaitGroup{}
)
23+
24+
func TestInstascale(t *testing.T) {
25+
26+
test := With(t)
27+
test.T().Parallel()
28+
29+
namespace := test.NewTestNamespace()
30+
31+
// Test configuration
32+
config := &corev1.ConfigMap{
33+
TypeMeta: metav1.TypeMeta{
34+
APIVersion: corev1.SchemeGroupVersion.String(),
35+
Kind: "ConfigMap",
36+
},
37+
ObjectMeta: metav1.ObjectMeta{
38+
Name: "mnist-mcad",
39+
Namespace: namespace.Name,
40+
},
41+
BinaryData: map[string][]byte{
42+
// pip requirements
43+
"requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"),
44+
// MNIST training script
45+
"mnist.py": ReadFile(test, "mnist.py"),
46+
},
47+
Immutable: Ptr(true),
48+
}
49+
config, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Create(test.Ctx(), config, metav1.CreateOptions{})
50+
test.Expect(err).NotTo(HaveOccurred())
51+
test.T().Logf("Created ConfigMap %s/%s successfully", config.Namespace, config.Name)
52+
53+
// create OCM connection
54+
instascaleOCMSecret, err := test.Client().Core().CoreV1().Secrets("default").Get(test.Ctx(), "instascale-ocm-secret", metav1.GetOptions{})
55+
if err != nil {
56+
test.T().Errorf("unable to retrieve instascale-ocm-secret - Error : %v", err)
57+
}
58+
test.Expect(err).NotTo(HaveOccurred())
59+
ocmToken := string(instascaleOCMSecret.Data["token"])
60+
test.T().Logf("Retrieved Secret %s successfully", instascaleOCMSecret.Name)
61+
62+
connection, err := CreateOCMConnection(ocmToken)
63+
if err != nil {
64+
test.T().Errorf("Unable to create ocm connection - Error : %v", err)
65+
}
66+
defer connection.Close()
67+
68+
machinePoolsExist = true
69+
// check existing cluster resources
70+
numInitialMachinePools, err := MachinePoolsCount(connection)
71+
if err != nil {
72+
test.T().Errorf("Unable to count machine pools - Error : %v", err)
73+
}
74+
75+
if numInitialMachinePools == 0 {
76+
machinePoolsExist = false
77+
numInitialNodePools, err = NodePoolsCount(connection)
78+
if err != nil {
79+
test.T().Errorf("Unable to count node pools - Error : %v", err)
80+
}
81+
if numInitialNodePools == 0 {
82+
numInitialMachineSets, err = MachineSetsCount()
83+
if err != nil {
84+
test.T().Errorf("Unable to count machine sets - Error : %v", err)
85+
}
86+
}
87+
}
88+
89+
// Batch Job
90+
job := &batchv1.Job{
91+
TypeMeta: metav1.TypeMeta{
92+
APIVersion: batchv1.SchemeGroupVersion.String(),
93+
Kind: "Job",
94+
},
95+
ObjectMeta: metav1.ObjectMeta{
96+
Name: "mnist",
97+
Namespace: namespace.Name,
98+
},
99+
Spec: batchv1.JobSpec{
100+
Completions: Ptr(int32(1)),
101+
Parallelism: Ptr(int32(1)),
102+
Template: corev1.PodTemplateSpec{
103+
Spec: corev1.PodSpec{
104+
Containers: []corev1.Container{
105+
{
106+
Name: "job",
107+
Image: GetPyTorchImage(),
108+
Env: []corev1.EnvVar{
109+
corev1.EnvVar{Name: "PYTHONUSERBASE", Value: "/test2"},
110+
},
111+
Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"},
112+
Args: []string{"$PYTHONUSERBASE"},
113+
VolumeMounts: []corev1.VolumeMount{
114+
{
115+
Name: "test",
116+
MountPath: "/test",
117+
},
118+
{
119+
Name: "test2",
120+
MountPath: "/test2",
121+
},
122+
},
123+
WorkingDir: "/test2",
124+
},
125+
},
126+
Volumes: []corev1.Volume{
127+
{
128+
Name: "test",
129+
VolumeSource: corev1.VolumeSource{
130+
ConfigMap: &corev1.ConfigMapVolumeSource{
131+
LocalObjectReference: corev1.LocalObjectReference{
132+
Name: config.Name,
133+
},
134+
},
135+
},
136+
},
137+
{
138+
Name: "test2",
139+
VolumeSource: corev1.VolumeSource{
140+
EmptyDir: &corev1.EmptyDirVolumeSource{},
141+
},
142+
},
143+
},
144+
RestartPolicy: corev1.RestartPolicyNever,
145+
},
146+
},
147+
},
148+
}
149+
150+
// create an appwrapper
151+
aw := &mcadv1beta1.AppWrapper{
152+
ObjectMeta: metav1.ObjectMeta{
153+
Name: "test-instascale",
154+
Namespace: namespace.Name,
155+
Labels: map[string]string{
156+
"orderedinstance": "m5.xlarge_g4dn.xlarge",
157+
},
158+
},
159+
Spec: mcadv1beta1.AppWrapperSpec{
160+
AggrResources: mcadv1beta1.AppWrapperResourceList{
161+
GenericItems: []mcadv1beta1.AppWrapperGenericResource{
162+
{
163+
CustomPodResources: []mcadv1beta1.CustomPodResourceTemplate{
164+
{
165+
Replicas: 1,
166+
Requests: corev1.ResourceList{
167+
corev1.ResourceCPU: resource.MustParse("250m"),
168+
corev1.ResourceMemory: resource.MustParse("512Mi"),
169+
},
170+
Limits: corev1.ResourceList{
171+
corev1.ResourceCPU: resource.MustParse("500m"),
172+
corev1.ResourceMemory: resource.MustParse("1G"),
173+
},
174+
},
175+
{
176+
Replicas: 1,
177+
Requests: corev1.ResourceList{
178+
corev1.ResourceCPU: resource.MustParse("250m"),
179+
corev1.ResourceMemory: resource.MustParse("512Mi"),
180+
},
181+
Limits: corev1.ResourceList{
182+
corev1.ResourceCPU: resource.MustParse("500m"),
183+
corev1.ResourceMemory: resource.MustParse("1G"),
184+
},
185+
},
186+
},
187+
GenericTemplate: Raw(test, job),
188+
},
189+
},
190+
},
191+
},
192+
}
193+
194+
_, err = test.Client().MCAD().WorkloadV1beta1().AppWrappers(namespace.Name).Create(test.Ctx(), aw, metav1.CreateOptions{})
195+
test.Expect(err).NotTo(HaveOccurred())
196+
test.T().Logf("AppWrapper created successfully %s/%s", aw.Namespace, aw.Name)
197+
198+
test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutShort).
199+
Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive)))
200+
201+
// wait for required resources to be created before checking them again
202+
time.Sleep(TestTimeoutShort)
203+
if !machinePoolsExist {
204+
numNodePools, err := NodePoolsCount(connection)
205+
if err != nil {
206+
test.T().Errorf("Unable to count node pools - Error : %v", err)
207+
}
208+
test.Expect(numNodePools).To(BeNumerically(">", numInitialNodePools))
209+
test.T().Logf("number of node pools increased from %d to %d", numInitialNodePools, numNodePools)
210+
211+
} else if machinePoolsExist {
212+
numMachinePools, err := MachinePoolsCount(connection)
213+
if err != nil {
214+
test.T().Errorf("Unable to count machine pools - Error : %v", err)
215+
}
216+
test.Expect(numMachinePools).To(BeNumerically(">", numInitialMachinePools))
217+
test.T().Logf("number of machine pools increased from %d to %d", numInitialMachinePools, numMachinePools)
218+
} else {
219+
numMachineSets, err := MachineSetsCount()
220+
if err != nil {
221+
test.T().Errorf("Unable to count machine sets - Error : %v", err)
222+
}
223+
test.Expect(numMachineSets).To(BeNumerically(">", numInitialMachineSets))
224+
test.T().Logf("number of machine sets increased from %d to %d", numInitialMachineSets, numMachineSets)
225+
}
226+
227+
test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name)
228+
test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should(
229+
Or(
230+
WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)),
231+
WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)),
232+
))
233+
234+
// Assert the job has completed successfully
235+
test.Expect(GetJob(test, job.Namespace, job.Name)).
236+
To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)))
237+
238+
// AppWrapper not being updated to complete once job is finished
239+
240+
time.Sleep(TestTimeoutMedium)
241+
if !machinePoolsExist {
242+
numNodePoolsFinal, err := NodePoolsCount(connection)
243+
if err != nil {
244+
test.T().Errorf("Unable to count node pools - Error : %v", err)
245+
}
246+
test.Expect(numNodePoolsFinal).To(BeNumerically("==", numInitialNodePools))
247+
test.T().Logf("number of machine pools decreased")
248+
249+
} else if machinePoolsExist {
250+
numMachinePoolsFinal, err := MachinePoolsCount(connection)
251+
if err != nil {
252+
test.T().Errorf("Unable to count machine pools - Error : %v", err)
253+
}
254+
test.Expect(numMachinePoolsFinal).To(BeNumerically("==", numInitialMachinePools))
255+
test.T().Logf("number of machine pools decreased")
256+
} else {
257+
numMachineSetsFinal, err := MachineSetsCount()
258+
if err != nil {
259+
test.T().Errorf("Unable to count machine sets - Error : %v", err)
260+
}
261+
test.Expect(numMachineSetsFinal).To(BeNumerically("==", numInitialMachineSets))
262+
test.T().Logf("number of machine sets decreased")
263+
}
264+
}

0 commit comments

Comments
 (0)