diff --git a/.github/workflows/mnist-job-test-image.yml b/.github/workflows/mnist-job-test-image.yml
new file mode 100644
index 000000000..768104424
--- /dev/null
+++ b/.github/workflows/mnist-job-test-image.yml
@@ -0,0 +1,33 @@
+# This workflow will build the MNIST job test image and push it to the project-codeflare image registry
+
+name: MNIST Job Test Image
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+    paths:
+      - 'test/pytorch_mnist_image/**'
+
+jobs:
+  push:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set Go
+        uses: actions/setup-go@v3
+        with:
+          go-version: v1.20
+
+      - name: Login to Quay.io
+        uses: redhat-actions/podman-login@v1
+        with:
+          username: ${{ secrets.QUAY_ID }}
+          password: ${{ secrets.QUAY_TOKEN }}
+          registry: quay.io
+
+      - name: Image Build and Push
+        run: |
+          make image-mnist-job-test-push
diff --git a/Makefile b/Makefile
index dccaea286..3b70333b1 100644
--- a/Makefile
+++ b/Makefile
@@ -85,6 +85,10 @@ ENVTEST_K8S_VERSION = 1.24.2
 # used to build the manifests.
 ENV ?= default
 
+# Image URL to build MNIST job test image
+MNIST_JOB_TEST_VERSION ?= v0.0.2
+MNIST_JOB_TEST_IMG ?= $(IMAGE_ORG_BASE)/mnist-job-test:${MNIST_JOB_TEST_VERSION}
+
 # Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set)
 ifeq (,$(shell go env GOBIN))
 GOBIN=$(shell go env GOPATH)/bin
@@ -383,3 +387,11 @@ imports: openshift-goimports ## Organize imports in go files using openshift-goi
 .PHONY: verify-imports
 verify-imports: openshift-goimports ## Run import verifications.
 	./hack/verify-imports.sh $(OPENSHIFT-GOIMPORTS)
+
+.PHONY: image-mnist-job-test-build
+image-mnist-job-test-build: ## Build container image with the MNIST job.
+	podman build -t ${MNIST_JOB_TEST_IMG} ./test/pytorch_mnist_image
+
+.PHONY: image-mnist-job-test-push
+image-mnist-job-test-push: image-mnist-job-test-build ## Push container image with the MNIST job.
+	podman push ${MNIST_JOB_TEST_IMG}
diff --git a/test/pytorch_mnist_image/Dockerfile b/test/pytorch_mnist_image/Dockerfile
new file mode 100644
index 000000000..adbbd7cbc
--- /dev/null
+++ b/test/pytorch_mnist_image/Dockerfile
@@ -0,0 +1,18 @@
+# Test image that runs the PyTorch Lightning MNIST training job
+FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime
+
+WORKDIR /test
+COPY entrypoint.sh entrypoint.sh
+
+# Install MNIST requirements
+COPY mnist_pip_requirements.txt requirements.txt
+RUN pip install --requirement requirements.txt
+
+# Prepare MNIST script and pre-download the dataset into /test at build time
+COPY mnist.py mnist.py
+COPY download_dataset.py download_dataset.py
+RUN torchrun download_dataset.py
+
+USER 65532:65532
+WORKDIR /workdir
+ENTRYPOINT ["/test/entrypoint.sh"]
diff --git a/test/pytorch_mnist_image/download_dataset.py b/test/pytorch_mnist_image/download_dataset.py
new file mode 100644
index 000000000..698f68f67
--- /dev/null
+++ b/test/pytorch_mnist_image/download_dataset.py
@@ -0,0 +1,21 @@
+# Copyright 2022 IBM, Red Hat
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from torchvision.datasets import MNIST
+
+PATH_DATASETS = os.environ.get("PATH_DATASETS", ".")  # download target; defaults to the current directory
+MNIST(PATH_DATASETS, train=True, download=True)
+MNIST(PATH_DATASETS, train=False, download=True)
diff --git a/test/pytorch_mnist_image/entrypoint.sh b/test/pytorch_mnist_image/entrypoint.sh
new file mode 100755
index 000000000..7487b377a
--- /dev/null
+++ b/test/pytorch_mnist_image/entrypoint.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+# exec replaces this shell with torchrun, so torchrun is the container's main process and receives signals (e.g. SIGTERM) directly
+exec torchrun /test/mnist.py
diff --git a/test/pytorch_mnist_image/mnist.py b/test/pytorch_mnist_image/mnist.py
new file mode 100644
index 000000000..134d56188
--- /dev/null
+++ b/test/pytorch_mnist_image/mnist.py
@@ -0,0 +1,159 @@
+# Copyright 2022 IBM, Red Hat
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import torch
+import requests
+from pytorch_lightning import LightningModule, Trainer
+from pytorch_lightning.callbacks.progress import TQDMProgressBar
+from torch import nn
+from torch.nn import functional as F
+from torch.utils.data import DataLoader, random_split
+from torchmetrics import Accuracy
+from torchvision import transforms
+from torchvision.datasets import MNIST
+
+PATH_WORKDIR = os.environ.get("PATH_WORKDIR", ".")
+PATH_DATASETS = os.environ.get("PATH_DATASETS", "/test")
+BATCH_SIZE = 256 if torch.cuda.is_available() else 64
+# %%
+
+print("prior to running the trainer")
+print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR"))
+print("MASTER_PORT: is ", os.getenv("MASTER_PORT"))
+
+class LitMNIST(LightningModule):
+    """MNIST digit classifier: a small MLP with two hidden layers and dropout that outputs log-probabilities."""
+    def __init__(self, data_dir=PATH_WORKDIR, hidden_size=64, learning_rate=2e-4):
+        super().__init__()
+
+        # Set our init args as class attributes
+        self.data_dir = data_dir
+        self.hidden_size = hidden_size
+        self.learning_rate = learning_rate
+
+        # Hardcode some dataset specific attributes
+        self.num_classes = 10
+        self.dims = (1, 28, 28)
+        channels, width, height = self.dims
+        self.transform = transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transforms.Normalize((0.1307,), (0.3081,)),
+            ]
+        )
+
+        # Define PyTorch model
+        self.model = nn.Sequential(
+            nn.Flatten(),
+            nn.Linear(channels * width * height, hidden_size),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Linear(hidden_size, hidden_size),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Linear(hidden_size, self.num_classes),
+        )
+
+        self.val_accuracy = Accuracy()
+        self.test_accuracy = Accuracy()
+
+    def forward(self, x):
+        x = self.model(x)
+        return F.log_softmax(x, dim=1)
+
+    def training_step(self, batch, batch_idx):
+        x, y = batch
+        logits = self(x)
+        loss = F.nll_loss(logits, y)
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        x, y = batch
+        logits = self(x)
+        loss = F.nll_loss(logits, y)
+        preds = torch.argmax(logits, dim=1)
+        self.val_accuracy.update(preds, y)
+
+        # Calling self.log will surface up scalars for you in TensorBoard
+        self.log("val_loss", loss, prog_bar=True)
+        self.log("val_acc", self.val_accuracy, prog_bar=True)
+
+    def test_step(self, batch, batch_idx):
+        x, y = batch
+        logits = self(x)
+        loss = F.nll_loss(logits, y)
+        preds = torch.argmax(logits, dim=1)
+        self.test_accuracy.update(preds, y)
+
+        # Calling self.log will surface up scalars for you in TensorBoard
+        self.log("test_loss", loss, prog_bar=True)
+        self.log("test_acc", self.test_accuracy, prog_bar=True)
+
+    def configure_optimizers(self):
+        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)
+
+    ####################
+    # DATA RELATED HOOKS
+    ####################
+
+    def prepare_data(self):
+        MNIST(PATH_DATASETS, train=True, download=True)
+        MNIST(PATH_DATASETS, train=False, download=True)
+
+    def setup(self, stage=None):
+        # Assign train/val datasets for use in dataloaders
+        if stage == "fit" or stage is None:
+            mnist_full = MNIST(PATH_DATASETS, train=True, transform=self.transform)
+            # Fixed seed: setup() runs in every DDP process, so an unseeded split would differ per rank
+            self.mnist_train, self.mnist_val = random_split(
+                mnist_full, [55000, 5000], generator=torch.Generator().manual_seed(42)
+            )
+
+        # Assign test dataset for use in dataloader(s)
+        if stage == "test" or stage is None:
+            self.mnist_test = MNIST(
+                PATH_DATASETS, train=False, transform=self.transform
+            )
+
+    def train_dataloader(self):
+        return DataLoader(self.mnist_train, batch_size=BATCH_SIZE)
+
+    def val_dataloader(self):
+        return DataLoader(self.mnist_val, batch_size=BATCH_SIZE)
+
+    def test_dataloader(self):
+        return DataLoader(self.mnist_test, batch_size=BATCH_SIZE)
+
+
+# Instantiate the model; it loads the MNIST data itself through its data hooks
+model = LitMNIST()
+
+print("GROUP: ", int(os.environ.get("GROUP_WORLD_SIZE", 1)))
+print("LOCAL: ", int(os.environ.get("LOCAL_WORLD_SIZE", 1)))
+
+# Initialize a trainer
+trainer = Trainer(
+    accelerator="auto",
+    # devices=1 if torch.cuda.is_available() else None,  # limiting got iPython runs
+    max_epochs=5,
+    callbacks=[TQDMProgressBar(refresh_rate=20)],
+    num_nodes=int(os.environ.get("GROUP_WORLD_SIZE", 1)),
+    devices=int(os.environ.get("LOCAL_WORLD_SIZE", 1)),
+    strategy="ddp",
+)
+
+# Train the model ⚡
+trainer.fit(model)
diff --git a/test/pytorch_mnist_image/mnist_pip_requirements.txt b/test/pytorch_mnist_image/mnist_pip_requirements.txt
new file mode 100644
index 000000000..87edeef27
--- /dev/null
+++ b/test/pytorch_mnist_image/mnist_pip_requirements.txt
@@ -0,0 +1,3 @@
+pytorch_lightning==1.5.10
+torchmetrics==0.9.1
+torchvision==0.12.0