Skip to content

Adding instascale e2e test #271

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ go 1.19

require (
github.com/onsi/gomega v1.27.10
github.com/openshift-online/ocm-sdk-go v0.1.368
github.com/openshift/api v0.0.0-20230213134911-7ba313770556
github.com/openshift/client-go v0.0.0-20221019143426-16aed247da5c
github.com/project-codeflare/instascale v0.0.9
Expand Down Expand Up @@ -69,7 +70,6 @@ require (
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/openshift-online/ocm-sdk-go v0.1.327 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_golang v1.14.0 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -377,8 +377,8 @@ github.com/onsi/gomega v1.17.0/go.mod h1:HnhC7FXeEQY45zxNK3PPoIUhzk/80Xly9PcubAl
github.com/onsi/gomega v1.19.0/go.mod h1:LY+I3pBVzYsTBU1AnDwOSxaYi9WoWiqgwooUqq9yPro=
github.com/onsi/gomega v1.27.10 h1:naR28SdDFlqrG6kScpT8VWpu1xWY5nJRCF3XaYyBjhI=
github.com/onsi/gomega v1.27.10/go.mod h1:RsS8tutOdbdgzbPtzzATp12yT7kM5I5aElG3evPbQ0M=
github.com/openshift-online/ocm-sdk-go v0.1.327 h1:WR822bGdQoMuZ2+dFdhZz3fpD2NlJhGr+F3FJPXvqFU=
github.com/openshift-online/ocm-sdk-go v0.1.327/go.mod h1:KYOw8kAKAHyPrJcQoVR82CneQ4ofC02Na4cXXaTq4Nw=
github.com/openshift-online/ocm-sdk-go v0.1.368 h1:qP+gkChV8WDwwpkUw1xUyjTXKdvrwyd70Gff2GMUSeU=
github.com/openshift-online/ocm-sdk-go v0.1.368/go.mod h1:KYOw8kAKAHyPrJcQoVR82CneQ4ofC02Na4cXXaTq4Nw=
github.com/openshift/api v0.0.0-20230213134911-7ba313770556 h1:7W2fOhJicyEff24VaF7ASNzPtYvr+iSCVft4SIBAzaE=
github.com/openshift/api v0.0.0-20230213134911-7ba313770556/go.mod h1:aQ6LDasvHMvHZXqLHnX2GRmnfTWCF/iIwz8EMTTIE9A=
github.com/openshift/client-go v0.0.0-20221019143426-16aed247da5c h1:CV76yFOTXmq9VciBR3Bve5ZWzSxdft7gaMVB3kS0rwg=
Expand Down
132 changes: 132 additions & 0 deletions test/e2e/instascale_app_wrapper.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
package e2e

import (
. "github.com/onsi/gomega"
mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1"

batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

. "github.com/project-codeflare/codeflare-operator/test/support"
)

// createInstaScaleJobAppWrapper assembles an MNIST training batch Job, embeds
// it as the generic template of an AppWrapper carrying the "orderedinstance"
// label, creates that AppWrapper in the given namespace and then waits for it
// to reach the Active state.
//
// Only the AppWrapper is submitted to the cluster by this function; the Job is
// returned so callers can refer to its name and namespace. The scripts run by
// the Job are mounted under /test from the supplied ConfigMap.
func createInstaScaleJobAppWrapper(test Test, namespace *corev1.Namespace, config *corev1.ConfigMap) (*batchv1.Job, *mcadv1beta1.AppWrapper, error) {
	// Container that installs the pip requirements and launches the training script.
	trainer := corev1.Container{
		Name:  "job",
		Image: GetPyTorchImage(),
		Env: []corev1.EnvVar{
			{Name: "PYTHONUSERBASE", Value: "/workdir"},
		},
		Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"},
		Args:    []string{"$PYTHONUSERBASE"},
		VolumeMounts: []corev1.VolumeMount{
			{Name: "test", MountPath: "/test"},
			{Name: "workdir", MountPath: "/workdir"},
		},
		WorkingDir: "/workdir",
	}

	// Test scripts come from the ConfigMap; the working directory is scratch space.
	scriptsVolume := corev1.Volume{
		Name: "test",
		VolumeSource: corev1.VolumeSource{
			ConfigMap: &corev1.ConfigMapVolumeSource{
				LocalObjectReference: corev1.LocalObjectReference{
					Name: config.Name,
				},
			},
		},
	}
	workdirVolume := corev1.Volume{
		Name: "workdir",
		VolumeSource: corev1.VolumeSource{
			EmptyDir: &corev1.EmptyDirVolumeSource{},
		},
	}

	// Batch Job
	job := &batchv1.Job{
		TypeMeta: metav1.TypeMeta{
			APIVersion: batchv1.SchemeGroupVersion.String(),
			Kind:       "Job",
		},
		ObjectMeta: metav1.ObjectMeta{
			Name:      "mnist",
			Namespace: namespace.Name,
		},
		Spec: batchv1.JobSpec{
			Completions: Ptr(int32(1)),
			Parallelism: Ptr(int32(1)),
			Template: corev1.PodTemplateSpec{
				Spec: corev1.PodSpec{
					Containers:    []corev1.Container{trainer},
					Volumes:       []corev1.Volume{scriptsVolume, workdirVolume},
					RestartPolicy: corev1.RestartPolicyNever,
				},
			},
		},
	}

	// Resources declared to the dispatcher: one entry requesting a GPU and one
	// CPU/memory-only entry.
	podResources := []mcadv1beta1.CustomPodResourceTemplate{
		{
			Replicas: 1,
			Requests: corev1.ResourceList{
				corev1.ResourceCPU:    resource.MustParse("250m"),
				corev1.ResourceMemory: resource.MustParse("512Mi"),
				"nvidia.com/gpu":      resource.MustParse("1"),
			},
			Limits: corev1.ResourceList{
				corev1.ResourceCPU:    resource.MustParse("500m"),
				corev1.ResourceMemory: resource.MustParse("1G"),
				"nvidia.com/gpu":      resource.MustParse("1"),
			},
		},
		{
			Replicas: 1,
			Requests: corev1.ResourceList{
				corev1.ResourceCPU:    resource.MustParse("250m"),
				corev1.ResourceMemory: resource.MustParse("512Mi"),
			},
			Limits: corev1.ResourceList{
				corev1.ResourceCPU:    resource.MustParse("500m"),
				corev1.ResourceMemory: resource.MustParse("1G"),
			},
		},
	}

	// AppWrapper wrapping the Job as its single generic item.
	aw := &mcadv1beta1.AppWrapper{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "test-instascale",
			Namespace: namespace.Name,
			Labels: map[string]string{
				"orderedinstance": "g4dn.xlarge",
			},
		},
		Spec: mcadv1beta1.AppWrapperSpec{
			AggrResources: mcadv1beta1.AppWrapperResourceList{
				GenericItems: []mcadv1beta1.AppWrapperGenericResource{
					{
						CustomPodResources: podResources,
						GenericTemplate:    Raw(test, job),
						CompletionStatus:   "Complete",
					},
				},
			},
		},
	}

	appWrappers := test.Client().MCAD().WorkloadV1beta1().AppWrappers(namespace.Name)
	_, err := appWrappers.Create(test.Ctx(), aw, metav1.CreateOptions{})
	test.Expect(err).NotTo(HaveOccurred())
	test.T().Logf("AppWrapper created successfully %s/%s", aw.Namespace, aw.Name)

	// Block until the AppWrapper has been dispatched.
	test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutGpuProvisioning).
		Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive)))

	return job, aw, err
}
68 changes: 68 additions & 0 deletions test/e2e/instascale_machinepool_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package e2e

import (
"testing"

. "github.com/onsi/gomega"
mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1"

batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"

. "github.com/project-codeflare/codeflare-operator/test/support"
)

// TestInstascaleMachinePool checks the machine pool lifecycle around an
// InstaScale AppWrapper: the pool must be absent before the AppWrapper is
// created, appear once it is dispatched, and disappear again after the wrapped
// Job and the AppWrapper have completed.
func TestInstascaleMachinePool(t *testing.T) {
	// Identifier of the machine pool this test expects to come and go.
	const expectedPoolID = "test-instascale-g4dn-xlarge"

	test := With(t)
	test.T().Parallel()

	namespace := test.NewTestNamespace()

	// Files mounted into the training job: pip requirements and the MNIST script.
	scripts := map[string][]byte{
		"requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"),
		"mnist.py":         ReadFile(test, "mnist.py"),
	}
	config := CreateConfigMap(test, namespace.Name, scripts)

	// Open the OCM connection used to inspect the cluster's machine pools.
	ocm := CreateOCMConnection(test)
	defer ocm.Close()

	// Before the AppWrapper exists, the machine pool must not be present.
	test.Expect(GetMachinePools(test, ocm)).
		ShouldNot(ContainElement(WithTransform(MachinePoolId, Equal(expectedPoolID))))

	// Set up the batch job and its AppWrapper.
	job, aw, err := createInstaScaleJobAppWrapper(test, namespace, config)
	test.Expect(err).NotTo(HaveOccurred())

	// The machine pool is expected to show up.
	test.Eventually(MachinePools(test, ocm), TestTimeoutLong).
		Should(ContainElement(WithTransform(MachinePoolId, Equal(expectedPoolID))))

	// Wait until the job reaches a terminal condition, successful or not.
	test.T().Logf("Waiting for Job %s/%s to complete", job.Namespace, job.Name)
	test.Eventually(Job(test, job.Namespace, job.Name), TestTimeoutLong).Should(
		Or(
			WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)),
			WithTransform(ConditionStatus(batchv1.JobFailed), Equal(corev1.ConditionTrue)),
		))

	// The terminal condition must be the successful one.
	test.Expect(GetJob(test, job.Namespace, job.Name)).
		To(WithTransform(ConditionStatus(batchv1.JobComplete), Equal(corev1.ConditionTrue)))

	test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutShort).
		Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateCompleted)))

	// Once everything has completed, the machine pool is expected to go away.
	test.Eventually(MachinePools(test, ocm), TestTimeoutLong).
		ShouldNot(ContainElement(WithTransform(MachinePoolId, Equal(expectedPoolID))))
}
28 changes: 28 additions & 0 deletions test/support/clusterpools.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package support

import (
"github.com/onsi/gomega"

ocmsdk "github.com/openshift-online/ocm-sdk-go"
cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1"
)

// MachinePools returns a function, usable with gomega's Eventually, that lists
// the machine pools of the OSD cluster under test through the given OCM
// connection.
//
// The cluster id is resolved once, via GetOsdClusterId, when MachinePools is
// called; the test fails if it is not configured. Each invocation of the
// returned function sends a new list request bound to the test context and
// asserts on g that the request succeeded.
func MachinePools(t Test, connection *ocmsdk.Connection) func(g gomega.Gomega) []*cmv1.MachinePool {
	t.T().Helper()

	osdClusterId, found := GetOsdClusterId()
	t.Expect(found).To(gomega.BeTrue(), "OSD cluster id not found, please configure environment properly")

	return func(g gomega.Gomega) []*cmv1.MachinePool {
		// Use the test context rather than a background one so the request
		// follows the test's lifetime, like the other helpers using t.Ctx().
		machinePoolsListResponse, err := connection.ClustersMgmt().V1().Clusters().Cluster(osdClusterId).MachinePools().List().SendContext(t.Ctx())
		g.Expect(err).NotTo(gomega.HaveOccurred())
		return machinePoolsListResponse.Items().Slice()
	}
}

// GetMachinePools lists the machine pools once, immediately, by invoking the
// function returned by MachinePools with the test itself as the gomega
// instance.
func GetMachinePools(t Test, connection *ocmsdk.Connection) []*cmv1.MachinePool {
	t.T().Helper()
	list := MachinePools(t, connection)
	return list(t)
}

// MachinePoolId returns the ID of the given machine pool. Its signature makes
// it usable as a transform function, e.g. with gomega's WithTransform.
func MachinePoolId(machinePool *cmv1.MachinePool) string {
	id := machinePool.ID()
	return id
}
20 changes: 19 additions & 1 deletion test/support/codeflare.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,14 @@ const (
CodeFlareTestPyTorchImage = "CODEFLARE_TEST_PYTORCH_IMAGE"

// The testing output directory, to write output files into.

CodeFlareTestOutputDir = "CODEFLARE_TEST_OUTPUT_DIR"

// The name of a secret containing InstaScale OCM token.
InstaScaleOcmSecretName = "INSTASCALE_OCM_SECRET_NAME"
// The namespace where a secret containing InstaScale OCM token is stored.
InstaScaleOcmSecretNamespace = "INSTASCALE_OCM_SECRET_NAMESPACE"
// Cluster ID for OSD cluster used in tests, used for testing InstaScale
OsdClusterID = "CLUSTERID"
)

func GetCodeFlareSDKVersion() string {
Expand All @@ -50,6 +56,18 @@ func GetPyTorchImage() string {
return lookupEnvOrDefault(CodeFlareTestPyTorchImage, "pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime")
}

// GetInstaScaleOcmSecretName returns the name of the secret holding the
// InstaScale OCM token, looked up under the InstaScaleOcmSecretName
// environment key with "instascale-ocm-secret" as the fallback.
func GetInstaScaleOcmSecretName() string {
	const fallback = "instascale-ocm-secret"
	return lookupEnvOrDefault(InstaScaleOcmSecretName, fallback)
}

// GetInstaScaleOcmSecretNamespace returns the namespace of the secret holding
// the InstaScale OCM token, looked up under the InstaScaleOcmSecretNamespace
// environment key with "default" as the fallback.
func GetInstaScaleOcmSecretNamespace() string {
	const fallback = "default"
	return lookupEnvOrDefault(InstaScaleOcmSecretNamespace, fallback)
}

// GetOsdClusterId reads the OSD cluster id from the environment variable named
// by OsdClusterID. The boolean reports whether the variable is set at all; a
// variable set to the empty string still counts as set.
func GetOsdClusterId() (string, bool) {
	clusterID, isSet := os.LookupEnv(OsdClusterID)
	return clusterID, isSet
}

func lookupEnvOrDefault(key, value string) string {
if v, ok := os.LookupEnv(key); ok {
return v
Expand Down
45 changes: 45 additions & 0 deletions test/support/config_map.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
Copyright 2023.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package support

import (
"github.com/onsi/gomega"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// CreateConfigMap creates an immutable ConfigMap in the given namespace with
// content stored as binary data, and returns the object as returned by the API
// server. The name is generated by the server from the "config-" prefix. A
// creation error is reported through t.Expect.
func CreateConfigMap(t Test, namespace string, content map[string][]byte) *corev1.ConfigMap {
	desired := &corev1.ConfigMap{
		TypeMeta: metav1.TypeMeta{
			APIVersion: corev1.SchemeGroupVersion.String(),
			Kind:       "ConfigMap",
		},
		ObjectMeta: metav1.ObjectMeta{
			GenerateName: "config-",
			Namespace:    namespace,
		},
		BinaryData: content,
		Immutable:  Ptr(true),
	}

	configMaps := t.Client().Core().CoreV1().ConfigMaps(namespace)
	created, err := configMaps.Create(t.Ctx(), desired, metav1.CreateOptions{})
	t.Expect(err).NotTo(gomega.HaveOccurred())
	t.T().Logf("Created ConfigMap %s/%s successfully", created.Namespace, created.Name)

	return created
}
52 changes: 52 additions & 0 deletions test/support/ocm.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
Copyright 2023.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package support

import (
"fmt"
"os"

"github.com/onsi/gomega"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

ocmsdk "github.com/openshift-online/ocm-sdk-go"
)

// CreateOCMConnection reads the InstaScale OCM secret (located through
// GetInstaScaleOcmSecretNamespace and GetInstaScaleOcmSecretName), takes its
// "token" entry and builds an OCM connection from it.
//
// Failures to read the secret, a missing or empty token, and failures to build
// the connection are all reported through test.Expect. The token value itself
// is never logged.
func CreateOCMConnection(test Test) *ocmsdk.Connection {
	test.T().Helper()

	instascaleOCMSecret, err := test.Client().Core().CoreV1().Secrets(GetInstaScaleOcmSecretNamespace()).Get(test.Ctx(), GetInstaScaleOcmSecretName(), metav1.GetOptions{})
	test.Expect(err).NotTo(gomega.HaveOccurred())
	test.T().Logf("Retrieved Secret %s/%s successfully", instascaleOCMSecret.Namespace, instascaleOCMSecret.Name)

	// Fail here, naming the secret, rather than later with an unrelated
	// connection error when the secret has no usable "token" entry.
	ocmToken := string(instascaleOCMSecret.Data["token"])
	test.Expect(ocmToken).NotTo(gomega.BeEmpty(),
		"Secret %s/%s has no \"token\" entry or it is empty", instascaleOCMSecret.Namespace, instascaleOCMSecret.Name)

	connection, err := buildOCMConnection(ocmToken)
	test.Expect(err).NotTo(gomega.HaveOccurred())
	return connection
}

// buildOCMConnection builds an OCM connection authenticated with the given
// token.
//
// It returns a non-nil error whenever no connection could be produced,
// including the case where the builder reports no error but hands back a nil
// connection, so callers can rely on the error alone. The failure is also
// written to standard error.
func buildOCMConnection(secret string) (*ocmsdk.Connection, error) {
	connection, err := ocmsdk.NewConnectionBuilder().
		Tokens(secret).
		Build()
	if err == nil && connection == nil {
		// Never return (nil, nil): the caller only checks the error and would
		// otherwise go on to use a nil connection.
		err = fmt.Errorf("builder returned a nil OCM connection")
	}
	if err != nil {
		fmt.Fprintf(os.Stderr, "Can't build connection: %v\n", err)
		return nil, err
	}

	return connection, nil
}
Loading