diff --git a/go.mod b/go.mod index 149ba7720..fbac9a7ff 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.20 require ( github.com/onsi/gomega v1.27.10 github.com/openshift/api v0.0.0-20230213134911-7ba313770556 - github.com/project-codeflare/codeflare-common v0.0.0-20231129165224-988ba1da9069 + github.com/project-codeflare/codeflare-common v0.0.0-20240111082724-8f0684651717 github.com/project-codeflare/instascale v0.4.0 github.com/project-codeflare/multi-cluster-app-dispatcher v1.39.0 github.com/ray-project/kuberay/ray-operator v1.0.0 diff --git a/go.sum b/go.sum index 797c519bd..e501c2351 100644 --- a/go.sum +++ b/go.sum @@ -386,8 +386,8 @@ github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/project-codeflare/codeflare-common v0.0.0-20231129165224-988ba1da9069 h1:81+ma1mchF/LtAGsf+poAt50kJ/fLYjoTAcZOxci1Yc= -github.com/project-codeflare/codeflare-common v0.0.0-20231129165224-988ba1da9069/go.mod h1:zdi2GCYJX+QyxFWyCLMoTme3NMz/aucWDJWMqKfigxk= +github.com/project-codeflare/codeflare-common v0.0.0-20240111082724-8f0684651717 h1:knUKEKvfEzVuSwQ4NAe2+I/Oxo4WztU5rYR8d/F66Lw= +github.com/project-codeflare/codeflare-common v0.0.0-20240111082724-8f0684651717/go.mod h1:2Ck9LC+6Xi4jTDSlCJoP00tCzSrxek0roLsjvUgL2gY= github.com/project-codeflare/instascale v0.4.0 h1:l/cb+x4FrJ2bN9wXjv1mCngy77tVw0CLMiqJovTAflo= github.com/project-codeflare/instascale v0.4.0/go.mod h1:CpduFXKeuqYW4Ph1CPOJV6dpAdpebOxhbU4CmccZWSo= github.com/project-codeflare/multi-cluster-app-dispatcher v1.39.0 h1:zoS7pEAWK6eGELPCIIHB3W8Zb/a27Rf55ChYso7EV3o= diff --git a/test/e2e/mnist.py b/test/e2e/mnist.py index 244c84d29..4cfe0b43b 100644 --- a/test/e2e/mnist.py +++ b/test/e2e/mnist.py @@ -15,6 +15,7 @@ import os import torch +import requests from pytorch_lightning import LightningModule, Trainer from pytorch_lightning.callbacks.progress import TQDMProgressBar from torch import nn @@ -32,6 +33,8 @@ print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR")) print("MASTER_PORT: is ", os.getenv("MASTER_PORT")) +print("MNIST_DATASET_URL: is ", os.getenv("MNIST_DATASET_URL")) +MNIST_DATASET_URL = os.getenv("MNIST_DATASET_URL") class LitMNIST(LightningModule): def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4): @@ -110,8 +113,34 @@ def configure_optimizers(self): #################### def prepare_data(self): - # download - print("Downloading MNIST dataset...") + datasetFiles = [ + "t10k-images-idx3-ubyte", + "t10k-labels-idx1-ubyte", + "train-images-idx3-ubyte", + "train-labels-idx1-ubyte" + ] + + # Create required folder structure + downloadLocation = os.path.join(PATH_DATASETS, "MNIST", "raw") + os.makedirs(downloadLocation, exist_ok=True) + print(f"{downloadLocation} folder_path created!") + + for file in datasetFiles: + print(f"Downloading MNIST dataset {file}... to path : {downloadLocation}") + response = requests.get(f"{MNIST_DATASET_URL}{file}", stream=True) + filePath = os.path.join(downloadLocation, file) + + #to download dataset file + try: + if response.status_code == 200: + open(filePath, 'wb').write(response.content) + print(f"{file}: Downloaded and saved zipped file to path - {filePath}") + else: + print(f"Failed to download file {file}") + except Exception as e: + print(e) + print(f"Downloaded MNIST dataset to... {downloadLocation}") + MNIST(self.data_dir, train=True, download=True) MNIST(self.data_dir, train=False, download=True) diff --git a/test/e2e/mnist_pytorch_mcad_job_test.go b/test/e2e/mnist_pytorch_mcad_job_test.go index cbe55fed2..883457e95 100644 --- a/test/e2e/mnist_pytorch_mcad_job_test.go +++ b/test/e2e/mnist_pytorch_mcad_job_test.go @@ -79,7 +79,8 @@ func TestMNISTPyTorchMCAD(t *testing.T) { Name: "job", Image: GetPyTorchImage(), Env: []corev1.EnvVar{ - corev1.EnvVar{Name: "PYTHONUSERBASE", Value: "/workdir"}, + {Name: "PYTHONUSERBASE", Value: "/workdir"}, + {Name: "MNIST_DATASET_URL", Value: GetMnistDatasetURL()}, }, Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"}, VolumeMounts: []corev1.VolumeMount{ diff --git a/test/e2e/mnist_rayjob_mcad_raycluster_test.go b/test/e2e/mnist_rayjob_mcad_raycluster_test.go index b8d3f4d06..21bd98ad8 100644 --- a/test/e2e/mnist_rayjob_mcad_raycluster_test.go +++ b/test/e2e/mnist_rayjob_mcad_raycluster_test.go @@ -17,7 +17,6 @@ limitations under the License. package e2e import ( - "encoding/base64" "testing" . "github.com/onsi/gomega" @@ -143,13 +142,6 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { RayStartParams: map[string]string{}, Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ - InitContainers: []corev1.Container{ - { - Name: "init-myservice", - Image: "busybox:1.28", - Command: []string{"sh", "-c", "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"}, - }, - }, Containers: []corev1.Container{ { Name: "ray-worker", @@ -230,21 +222,29 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { }, Spec: rayv1.RayJobSpec{ Entrypoint: "python /home/ray/jobs/mnist.py", - RuntimeEnv: base64.StdEncoding.EncodeToString([]byte(` -{ - "pip": [ - "pytorch_lightning==1.5.10", - "torchmetrics==0.9.1", - "torchvision==0.12.0" - ], - "env_vars": { - } -} -`)), + RuntimeEnvYAML: ` + pip: + - pytorch_lightning==1.5.10 + - torchmetrics==0.9.1 + - torchvision==0.12.0 + env_vars: + MNIST_DATASET_URL: "` + GetMnistDatasetURL() + `" +`, ClusterSelector: map[string]string{ RayJobDefaultClusterSelectorKey: rayCluster.Name, }, ShutdownAfterJobFinishes: false, + SubmitterPodTemplate: &corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + RestartPolicy: corev1.RestartPolicyNever, + Containers: []corev1.Container{ + { + Image: GetRayImage(), + Name: "rayjob-submitter-pod", + }, + }, + }, + }, }, } rayJob, err = test.Client().Ray().RayV1().RayJobs(namespace.Name).Create(test.Ctx(), rayJob, metav1.CreateOptions{}) @@ -256,6 +256,10 @@ func TestMNISTRayJobMCADRayCluster(t *testing.T) { test.T().Logf("Connecting to Ray cluster at: %s", rayDashboardURL.String()) rayClient := NewRayClusterClient(rayDashboardURL) + // Wait for Ray job id to be available, this value is needed for writing logs in defer + test.Eventually(RayJob(test, rayJob.Namespace, rayJob.Name), TestTimeoutShort). + Should(WithTransform(RayJobId, Not(BeEmpty()))) + // Retrieving the job logs once it has completed or timed out defer WriteRayJobAPILogs(test, rayClient, GetRayJobId(test, rayJob.Namespace, rayJob.Name)) diff --git a/test/upgrade/olm_upgrade_test.go b/test/upgrade/olm_upgrade_test.go index ac0783e00..8fb4e65fb 100644 --- a/test/upgrade/olm_upgrade_test.go +++ b/test/upgrade/olm_upgrade_test.go @@ -96,7 +96,8 @@ func TestMNISTCreateAppWrapper(t *testing.T) { Name: "job", Image: GetPyTorchImage(), Env: []corev1.EnvVar{ - corev1.EnvVar{Name: "PYTHONUSERBASE", Value: "/workdir"}, + {Name: "PYTHONUSERBASE", Value: "/workdir"}, + {Name: "MNIST_DATASET_URL", Value: GetMnistDatasetURL()}, }, Command: []string{"/bin/sh", "-c", "pip install -r /test/requirements.txt && torchrun /test/mnist.py"}, VolumeMounts: []corev1.VolumeMount{