diff --git a/test/pytorch_mnist_image/Dockerfile b/test/pytorch_mnist_image/Dockerfile deleted file mode 100644 index adbbd7cbc..000000000 --- a/test/pytorch_mnist_image/Dockerfile +++ /dev/null @@ -1,18 +0,0 @@ -# Build the manager binary -FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime - -WORKDIR /test -COPY entrypoint.sh entrypoint.sh - -# Install MNIST requirements -COPY mnist_pip_requirements.txt requirements.txt -RUN pip install --requirement requirements.txt - -# Prepare MNIST script -COPY mnist.py mnist.py -COPY download_dataset.py download_dataset.py -RUN torchrun download_dataset.py - -USER 65532:65532 -WORKDIR /workdir -ENTRYPOINT ["/test/entrypoint.sh"] diff --git a/test/pytorch_mnist_image/download_dataset.py b/test/pytorch_mnist_image/download_dataset.py deleted file mode 100644 index 698f68f67..000000000 --- a/test/pytorch_mnist_image/download_dataset.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright 2022 IBM, Red Hat -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -from torchvision.datasets import MNIST - -PATH_DATASETS = os.environ.get("PATH_DATASETS", ".") -MNIST(PATH_DATASETS, train=True, download=True) -MNIST(PATH_DATASETS, train=False, download=True) diff --git a/test/pytorch_mnist_image/entrypoint.sh b/test/pytorch_mnist_image/entrypoint.sh deleted file mode 100755 index 7487b377a..000000000 --- a/test/pytorch_mnist_image/entrypoint.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh - -torchrun /test/mnist.py diff --git a/test/pytorch_mnist_image/mnist.py b/test/pytorch_mnist_image/mnist.py deleted file mode 100644 index 134d56188..000000000 --- a/test/pytorch_mnist_image/mnist.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright 2022 IBM, Red Hat -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import torch -import requests -from pytorch_lightning import LightningModule, Trainer -from pytorch_lightning.callbacks.progress import TQDMProgressBar -from torch import nn -from torch.nn import functional as F -from torch.utils.data import DataLoader, random_split -from torchmetrics import Accuracy -from torchvision import transforms -from torchvision.datasets import MNIST - -PATH_WORKDIR = os.environ.get("PATH_WORKDIR", ".") -PATH_DATASETS = os.environ.get("PATH_DATASETS", "/test") -BATCH_SIZE = 256 if torch.cuda.is_available() else 64 -# %% - -print("prior to running the trainer") -print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR")) -print("MASTER_PORT: is ", os.getenv("MASTER_PORT")) - -class LitMNIST(LightningModule): - def __init__(self, data_dir=PATH_WORKDIR, hidden_size=64, learning_rate=2e-4): - - super().__init__() - - # Set our init args as class attributes - self.data_dir = data_dir - self.hidden_size = hidden_size - self.learning_rate = learning_rate - - # Hardcode some dataset specific attributes - self.num_classes = 10 - self.dims = (1, 28, 28) - channels, width, height = self.dims - self.transform = transforms.Compose( - [ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)), - ] - ) - - # Define PyTorch model - self.model = nn.Sequential( - nn.Flatten(), - nn.Linear(channels * width * height, hidden_size), - nn.ReLU(), - nn.Dropout(0.1), - nn.Linear(hidden_size, hidden_size), - nn.ReLU(), - nn.Dropout(0.1), - nn.Linear(hidden_size, self.num_classes), - ) - - self.val_accuracy = Accuracy() - self.test_accuracy = Accuracy() - - def forward(self, x): - x = self.model(x) - return F.log_softmax(x, dim=1) - - def training_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.nll_loss(logits, y) - return loss - - def validation_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.nll_loss(logits, y) - preds = torch.argmax(logits, dim=1) - self.val_accuracy.update(preds, y) - - # Calling self.log will surface up scalars for you in TensorBoard - self.log("val_loss", loss, prog_bar=True) - self.log("val_acc", self.val_accuracy, prog_bar=True) - - def test_step(self, batch, batch_idx): - x, y = batch - logits = self(x) - loss = F.nll_loss(logits, y) - preds = torch.argmax(logits, dim=1) - self.test_accuracy.update(preds, y) - - # Calling self.log will surface up scalars for you in TensorBoard - self.log("test_loss", loss, prog_bar=True) - self.log("test_acc", self.test_accuracy, prog_bar=True) - - def configure_optimizers(self): - optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) - return optimizer - - #################### - # DATA RELATED HOOKS - #################### - - def prepare_data(self): - MNIST(PATH_DATASETS, train=True, download=True) - MNIST(PATH_DATASETS, train=False, download=True) - - def setup(self, stage=None): - - # Assign train/val datasets for use in dataloaders - if stage == "fit" or stage is None: - mnist_full = MNIST(PATH_DATASETS, train=True, transform=self.transform) - self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) - - # Assign test dataset for use in dataloader(s) - if stage == "test" or stage is None: - self.mnist_test = MNIST( - PATH_DATASETS, train=False, transform=self.transform - ) - - def train_dataloader(self): - return DataLoader(self.mnist_train, batch_size=BATCH_SIZE) - - def val_dataloader(self): - return DataLoader(self.mnist_val, batch_size=BATCH_SIZE) - - def test_dataloader(self): - return DataLoader(self.mnist_test, batch_size=BATCH_SIZE) - - -# Init DataLoader from MNIST Dataset - -model = LitMNIST() - -print("GROUP: ", int(os.environ.get("GROUP_WORLD_SIZE", 1))) -print("LOCAL: ", int(os.environ.get("LOCAL_WORLD_SIZE", 1))) - -# Initialize a trainer -trainer = Trainer( - accelerator="auto", - # devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs - max_epochs=5, - callbacks=[TQDMProgressBar(refresh_rate=20)], - num_nodes=int(os.environ.get("GROUP_WORLD_SIZE", 1)), - devices=int(os.environ.get("LOCAL_WORLD_SIZE", 1)), - strategy="ddp", -) - -# Train the model ⚡ -trainer.fit(model) diff --git a/test/pytorch_mnist_image/mnist_pip_requirements.txt b/test/pytorch_mnist_image/mnist_pip_requirements.txt deleted file mode 100644 index 87edeef27..000000000 --- a/test/pytorch_mnist_image/mnist_pip_requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -pytorch_lightning==1.5.10 -torchmetrics==0.9.1 -torchvision==0.12.0