diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index 42ee0b736..759932e95 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -84,6 +84,7 @@ jobs: export CODEFLARE_TEST_TIMEOUT_SHORT=1m export CODEFLARE_TEST_TIMEOUT_MEDIUM=5m export CODEFLARE_TEST_TIMEOUT_LONG=10m + export CODEFLARE_TEST_TIMEOUT_GPU_PROVISIONING=30m export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }} echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV diff --git a/go.mod b/go.mod index c1caf36cd..f8994c995 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.19 require ( github.com/onsi/gomega v1.27.10 + github.com/openshift-online/ocm-sdk-go v0.1.368 github.com/openshift/api v0.0.0-20230213134911-7ba313770556 github.com/openshift/client-go v0.0.0-20221019143426-16aed247da5c github.com/project-codeflare/instascale v0.0.9 @@ -69,7 +70,6 @@ require ( github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/openshift-online/ocm-sdk-go v0.1.327 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/prometheus/client_golang v1.14.0 // indirect github.com/prometheus/client_model v0.3.0 // indirect diff --git a/go.sum b/go.sum index 2c06b07d6..e6b42178c 100644 --- a/go.sum +++ b/go.sum @@ -377,8 +377,8 @@ github.com/onsi/gomega v1.17.0/go.mod h1:HnhC7FXeEQY45zxNK3PPoIUhzk/80Xly9PcubAl github.com/onsi/gomega v1.19.0/go.mod h1:LY+I3pBVzYsTBU1AnDwOSxaYi9WoWiqgwooUqq9yPro= github.com/onsi/gomega v1.27.10 h1:naR28SdDFlqrG6kScpT8VWpu1xWY5nJRCF3XaYyBjhI= github.com/onsi/gomega v1.27.10/go.mod h1:RsS8tutOdbdgzbPtzzATp12yT7kM5I5aElG3evPbQ0M= -github.com/openshift-online/ocm-sdk-go v0.1.327 h1:WR822bGdQoMuZ2+dFdhZz3fpD2NlJhGr+F3FJPXvqFU= -github.com/openshift-online/ocm-sdk-go v0.1.327/go.mod h1:KYOw8kAKAHyPrJcQoVR82CneQ4ofC02Na4cXXaTq4Nw= +github.com/openshift-online/ocm-sdk-go v0.1.368 h1:qP+gkChV8WDwwpkUw1xUyjTXKdvrwyd70Gff2GMUSeU= +github.com/openshift-online/ocm-sdk-go v0.1.368/go.mod h1:KYOw8kAKAHyPrJcQoVR82CneQ4ofC02Na4cXXaTq4Nw= github.com/openshift/api v0.0.0-20230213134911-7ba313770556 h1:7W2fOhJicyEff24VaF7ASNzPtYvr+iSCVft4SIBAzaE= github.com/openshift/api v0.0.0-20230213134911-7ba313770556/go.mod h1:aQ6LDasvHMvHZXqLHnX2GRmnfTWCF/iIwz8EMTTIE9A= github.com/openshift/client-go v0.0.0-20221019143426-16aed247da5c h1:CV76yFOTXmq9VciBR3Bve5ZWzSxdft7gaMVB3kS0rwg= diff --git a/test/e2e/instascale_app_wrapper.go b/test/e2e/instascale_app_wrapper.go new file mode 100644 index 000000000..f9ec3bd0c --- /dev/null +++ b/test/e2e/instascale_app_wrapper.go @@ -0,0 +1,142 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + . "github.com/project-codeflare/codeflare-operator/test/support" +) + +func createInstaScaleJobAppWrapper(test Test, namespace *corev1.Namespace, config *corev1.ConfigMap) (*batchv1.Job, *mcadv1beta1.AppWrapper, error) { + // Batch Job + job := &batchv1.Job{ + TypeMeta: metav1.TypeMeta{ + APIVersion: batchv1.SchemeGroupVersion.String(), + Kind: "Job", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "mnist", + Namespace: namespace.Name, + }, + Spec: batchv1.JobSpec{ + Completions: Ptr(int32(1)), + Parallelism: Ptr(int32(1)), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "job", + Image: GetPyTorchImage(), + Env: []corev1.EnvVar{ + {Name: "PYTHONUSERBASE", Value: "/workdir"}, + }, + Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"}, + Args: []string{"$PYTHONUSERBASE"}, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "test", + MountPath: "/test", + }, + { + Name: "workdir", + MountPath: "/workdir", + }, + }, + WorkingDir: "/workdir", + }, + }, + Volumes: []corev1.Volume{ + { + Name: "test", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: config.Name, + }, + }, + }, + }, + { + Name: "workdir", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + }, + RestartPolicy: corev1.RestartPolicyNever, + }, + }, + }, + } + + // create an appwrapper + aw := &mcadv1beta1.AppWrapper{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-instascale", + Namespace: namespace.Name, + Labels: map[string]string{ + "orderedinstance": "g4dn.xlarge", + }, + }, + Spec: mcadv1beta1.AppWrapperSpec{ + AggrResources: mcadv1beta1.AppWrapperResourceList{ + GenericItems: []mcadv1beta1.AppWrapperGenericResource{ + { + CustomPodResources: []mcadv1beta1.CustomPodResourceTemplate{ + { + Replicas: 1, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("250m"), + corev1.ResourceMemory: resource.MustParse("512Mi"), + "nvidia.com/gpu": resource.MustParse("1"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("500m"), + corev1.ResourceMemory: resource.MustParse("1G"), + "nvidia.com/gpu": resource.MustParse("1"), + }, + }, + { + Replicas: 1, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("250m"), + corev1.ResourceMemory: resource.MustParse("512Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("500m"), + corev1.ResourceMemory: resource.MustParse("1G"), + }, + }, + }, + GenericTemplate: Raw(test, job), + CompletionStatus: "Complete", + }, + }, + }, + }, + } + + _, err := test.Client().MCAD().WorkloadV1beta1().AppWrappers(namespace.Name).Create(test.Ctx(), aw, metav1.CreateOptions{}) + + return job, aw, err +} diff --git a/test/e2e/instascale_machinepool_test.go b/test/e2e/instascale_machinepool_test.go new file mode 100644 index 000000000..b6ad0782c --- /dev/null +++ b/test/e2e/instascale_machinepool_test.go @@ -0,0 +1,75 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "testing" + + . "github.com/onsi/gomega" + mcadv1beta1 "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/apis/controller/v1beta1" + + . "github.com/project-codeflare/codeflare-operator/test/support" +) + +func TestInstascaleMachinePool(t *testing.T) { + test := With(t) + test.T().Parallel() + + if !IsOsd() { + test.T().Skip("Skipping test as not running on an OSD cluster") + } + + namespace := test.NewTestNamespace() + + // Test configuration + cm := CreateConfigMap(test, namespace.Name, map[string][]byte{ + // pip requirements + "requirements.txt": ReadFile(test, "mnist_pip_requirements.txt"), + // MNIST training script + "mnist.py": ReadFile(test, "mnist.py"), + }) + + //create OCM connection + connection := CreateOCMConnection(test) + defer connection.Close() + + // check existing cluster machine pool resources + // look for machine pool with aw name - expect not to find it + test.Expect(GetMachinePools(test, connection)). + ShouldNot(ContainElement(WithTransform(MachinePoolId, Equal("test-instascale-g4dn-xlarge")))) + + // Setup batch job and AppWrapper + _, aw, err := createInstaScaleJobAppWrapper(test, namespace, cm) + test.Expect(err).NotTo(HaveOccurred()) + test.T().Logf("AppWrapper created successfully %s/%s", aw.Namespace, aw.Name) + + // assert that AppWrapper goes to "Running" state + test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutGpuProvisioning). + Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateActive))) + + // look for machine pool with aw name - expect to find it + test.Eventually(MachinePools(test, connection), TestTimeoutLong). + Should(ContainElement(WithTransform(MachinePoolId, Equal("test-instascale-g4dn-xlarge")))) + + test.Eventually(AppWrapper(test, namespace, aw.Name), TestTimeoutShort). + Should(WithTransform(AppWrapperState, Equal(mcadv1beta1.AppWrapperStateCompleted))) + + // look for machine pool with aw name - expect not to find it + test.Eventually(MachinePools(test, connection), TestTimeoutLong). + ShouldNot(ContainElement(WithTransform(MachinePoolId, Equal("test-instascale-g4dn-xlarge")))) + +} diff --git a/test/support/core.go b/test/support/core.go index ee012c82d..70c48c20e 100644 --- a/test/support/core.go +++ b/test/support/core.go @@ -27,6 +27,27 @@ import ( "k8s.io/apimachinery/pkg/runtime" ) +func CreateConfigMap(t Test, namespace string, content map[string][]byte) *corev1.ConfigMap { + configMap := &corev1.ConfigMap{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "ConfigMap", + }, + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "config-", + Namespace: namespace, + }, + BinaryData: content, + Immutable: Ptr(true), + } + + configMap, err := t.Client().Core().CoreV1().ConfigMaps(namespace).Create(t.Ctx(), configMap, metav1.CreateOptions{}) + t.Expect(err).NotTo(gomega.HaveOccurred()) + t.T().Logf("Created ConfigMap %s/%s successfully", configMap.Namespace, configMap.Name) + + return configMap +} + func Raw(t Test, obj runtime.Object) runtime.RawExtension { t.T().Helper() data, err := json.Marshal(obj) diff --git a/test/support/codeflare.go b/test/support/environment.go similarity index 71% rename from test/support/codeflare.go rename to test/support/environment.go index 04b1f3e96..bf3b2af71 100644 --- a/test/support/codeflare.go +++ b/test/support/environment.go @@ -18,6 +18,7 @@ package support import ( "os" + "strings" ) const ( @@ -30,8 +31,13 @@ const ( CodeFlareTestPyTorchImage = "CODEFLARE_TEST_PYTORCH_IMAGE" // The testing output directory, to write output files into. - CodeFlareTestOutputDir = "CODEFLARE_TEST_OUTPUT_DIR" + + // The namespace where a secret containing InstaScale OCM token is stored and the secret name. + InstaScaleOcmSecret = "INSTASCALE_OCM_SECRET" + + // Cluster ID for OSD cluster used in tests, used for testing InstaScale + OsdClusterID = "CLUSTERID" ) func GetCodeFlareSDKVersion() string { @@ -50,6 +56,23 @@ func GetPyTorchImage() string { return lookupEnvOrDefault(CodeFlareTestPyTorchImage, "pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime") } +func GetInstascaleOcmSecret() (string, string) { + res := strings.SplitN(lookupEnvOrDefault(InstaScaleOcmSecret, "default/instascale-com-secret"), "/", 2) + return res[0], res[1] +} + +func GetOsdClusterId() (string, bool) { + return os.LookupEnv(OsdClusterID) +} + +func IsOsd() bool { + osdClusterId, found := GetOsdClusterId() + if found && osdClusterId != "" { + return true + } + return false +} + func lookupEnvOrDefault(key, value string) string { if v, ok := os.LookupEnv(key); ok { return v diff --git a/test/support/ocm.go b/test/support/ocm.go new file mode 100644 index 000000000..3d6bd7f19 --- /dev/null +++ b/test/support/ocm.go @@ -0,0 +1,74 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package support + +import ( + "fmt" + "os" + + "github.com/onsi/gomega" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + ocmsdk "github.com/openshift-online/ocm-sdk-go" + cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" +) + +func CreateOCMConnection(test Test) *ocmsdk.Connection { + secretNamespace, secretName := GetInstascaleOcmSecret() + instascaleOCMSecret, err := test.Client().Core().CoreV1().Secrets(secretNamespace).Get(test.Ctx(), secretName, metav1.GetOptions{}) + test.Expect(err).NotTo(gomega.HaveOccurred()) + + ocmToken := string(instascaleOCMSecret.Data["token"]) + test.T().Logf("Retrieved Secret %s/%s successfully", instascaleOCMSecret.Namespace, instascaleOCMSecret.Name) + + connection, err := buildOCMConnection(ocmToken) + test.Expect(err).NotTo(gomega.HaveOccurred()) + return connection +} + +func buildOCMConnection(secret string) (*ocmsdk.Connection, error) { + connection, err := ocmsdk.NewConnectionBuilder(). + Tokens(secret). + Build() + if err != nil { + fmt.Fprintf(os.Stderr, "Can't build connection: %v\n", err) + return nil, err + } + + return connection, nil +} + +func MachinePools(t Test, connection *ocmsdk.Connection) func(g gomega.Gomega) []*cmv1.MachinePool { + osdClusterId, found := GetOsdClusterId() + t.Expect(found).To(gomega.BeTrue(), "OSD cluster id not found, please configure environment properly") + + return func(g gomega.Gomega) []*cmv1.MachinePool { + machinePoolsListResponse, err := connection.ClustersMgmt().V1().Clusters().Cluster(osdClusterId).MachinePools().List().Send() + g.Expect(err).NotTo(gomega.HaveOccurred()) + return machinePoolsListResponse.Items().Slice() + } +} + +func GetMachinePools(t Test, connection *ocmsdk.Connection) []*cmv1.MachinePool { + t.T().Helper() + return MachinePools(t, connection)(t) +} + +func MachinePoolId(machinePool *cmv1.MachinePool) string { + return machinePool.ID() +} diff --git a/test/support/support.go b/test/support/support.go index 1255baa8c..36c8c9beb 100644 --- a/test/support/support.go +++ b/test/support/support.go @@ -30,9 +30,10 @@ import ( var ( ApplyOptions = metav1.ApplyOptions{FieldManager: "codeflare-test", Force: true} - TestTimeoutShort = 1 * time.Minute - TestTimeoutMedium = 2 * time.Minute - TestTimeoutLong = 5 * time.Minute + TestTimeoutShort = 1 * time.Minute + TestTimeoutMedium = 2 * time.Minute + TestTimeoutLong = 5 * time.Minute + TestTimeoutGpuProvisioning = 30 * time.Minute ) func init() { @@ -57,6 +58,13 @@ func init() { fmt.Printf("Error parsing CODEFLARE_TEST_TIMEOUT_LONG. Using default value: %s", TestTimeoutLong) } } + if value, ok := os.LookupEnv("CODEFLARE_TEST_TIMEOUT_GPU_PROVISIONING"); ok { + if duration, err := time.ParseDuration(value); err == nil { + TestTimeoutGpuProvisioning = duration + } else { + fmt.Printf("Error parsing CODEFLARE_TEST_TIMEOUT_GPU_PROVISIONING. Using default value: %s", TestTimeoutGpuProvisioning) + } + } // Gomega settings gomega.SetDefaultEventuallyTimeout(TestTimeoutShort)