
tweak running examples without cuda #794

Merged 2 commits on Jul 1, 2020
6 changes: 6 additions & 0 deletions dcgan/main.py
@@ -26,6 +26,7 @@
parser.add_argument('--lr', type=float, default=0.0002, help='learning rate, default=0.0002')
parser.add_argument('--beta1', type=float, default=0.5, help='beta1 for adam. default=0.5')
parser.add_argument('--cuda', action='store_true', help='enables cuda')
parser.add_argument('--dry-run', action='store_true', help='check a single training cycle works')
parser.add_argument('--ngpu', type=int, default=1, help='number of GPUs to use')
parser.add_argument('--netG', default='', help="path to netG (to continue training)")
parser.add_argument('--netD', default='', help="path to netD (to continue training)")
@@ -211,6 +212,9 @@ def forward(self, input):
optimizerD = optim.Adam(netD.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))

if opt.dry_run:
opt.niter = 1

for epoch in range(opt.niter):
for i, data in enumerate(dataloader, 0):
############################
@@ -261,6 +265,8 @@ def forward(self, input):
'%s/fake_samples_epoch_%03d.png' % (opt.outf, epoch),
normalize=True)

if opt.dry_run:
break
# do checkpointing
torch.save(netG.state_dict(), '%s/netG_epoch_%d.pth' % (opt.outf, epoch))
torch.save(netD.state_dict(), '%s/netD_epoch_%d.pth' % (opt.outf, epoch))
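
The dcgan change above is a simple pattern: --dry-run caps niter at one epoch and breaks out of the batch loop after the first iteration, so the example can be smoke-tested without a GPU. A minimal sketch of the same pattern outside the example (the inner range(100) stands in for a real DataLoader and no actual training step runs):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--niter', type=int, default=25, help='number of epochs to train for')
parser.add_argument('--dry-run', action='store_true', help='check a single training cycle works')
opt = parser.parse_args()

if opt.dry_run:
    opt.niter = 1          # a single epoch is enough for a smoke test

for epoch in range(opt.niter):
    for i, batch in enumerate(range(100)):  # stand-in for a real DataLoader
        # a real train step (forward/backward/optimizer.step()) would run here
        if opt.dry_run:
            break           # stop after the first batch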
10 changes: 7 additions & 3 deletions imagenet/main.py
@@ -136,7 +136,9 @@ def main_worker(gpu, ngpus_per_node, args):
print("=> creating model '{}'".format(args.arch))
model = models.__dict__[args.arch]()

if args.distributed:
if not torch.cuda.is_available():
print('using CPU, this will be slow')
elif args.distributed:
# For multiprocessing distributed, DistributedDataParallel constructor
# should always set the single device scope, otherwise,
# DistributedDataParallel will use all available devices.
@@ -281,7 +283,8 @@ def train(train_loader, model, criterion, optimizer, epoch, args):

if args.gpu is not None:
images = images.cuda(args.gpu, non_blocking=True)
target = target.cuda(args.gpu, non_blocking=True)
if torch.cuda.is_available():
target = target.cuda(args.gpu, non_blocking=True)

# compute output
output = model(images)
@@ -324,7 +327,8 @@ def validate(val_loader, model, criterion, args):
for i, (images, target) in enumerate(val_loader):
if args.gpu is not None:
images = images.cuda(args.gpu, non_blocking=True)
target = target.cuda(args.gpu, non_blocking=True)
if torch.cuda.is_available():
target = target.cuda(args.gpu, non_blocking=True)

# compute output
output = model(images)
35 changes: 21 additions & 14 deletions mnist/main.py
@@ -47,6 +47,8 @@ def train(args, model, device, train_loader, optimizer, epoch):
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
if args.dry_run:
break


def test(model, device, test_loader):
@@ -83,6 +85,8 @@ def main():
help='Learning rate step gamma (default: 0.7)')
parser.add_argument('--no-cuda', action='store_true', default=False,
help='disables CUDA training')
parser.add_argument('--dry-run', action='store_true', default=False,
help='quickly check a single pass')
parser.add_argument('--seed', type=int, default=1, metavar='S',
help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
@@ -96,20 +100,23 @@

device = torch.device("cuda" if use_cuda else "cpu")

kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
train_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.test_batch_size, shuffle=True, **kwargs)
kwargs = {'batch_size': args.batch_size}
if use_cuda:
kwargs.update({'num_workers': 1,
'pin_memory': True,
'shuffle': True},
)

transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])
dataset1 = datasets.MNIST('../data', train=True, download=True,
transform=transform)
dataset2 = datasets.MNIST('../data', train=False,
transform=transform)
train_loader = torch.utils.data.DataLoader(dataset1,**kwargs)
test_loader = torch.utils.data.DataLoader(dataset2, **kwargs)

model = Net().to(device)
optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
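
The mnist/main.py rewrite above builds both datasets once with a shared transform and routes all DataLoader options through a single kwargs dict, so num_workers, pin_memory and (in this version) shuffle are only added when CUDA is in use. A self-contained sketch of that setup; the batch size of 64 and the '../data' root are illustrative:

import torch
from torchvision import datasets, transforms

use_cuda = torch.cuda.is_available()

kwargs = {'batch_size': 64}
if use_cuda:
    # these options only help when batches are copied to a GPU
    kwargs.update({'num_workers': 1, 'pin_memory': True, 'shuffle': True})

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])
train_set = datasets.MNIST('../data', train=True, download=True, transform=transform)
test_set = datasets.MNIST('../data', train=False, transform=transform)

train_loader = torch.utils.data.DataLoader(train_set, **kwargs)
test_loader = torch.utils.data.DataLoader(test_set, **kwargs)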
25 changes: 22 additions & 3 deletions mnist_hogwild/main.py
@@ -4,6 +4,8 @@
import torch.nn as nn
import torch.nn.functional as F
import torch.multiprocessing as mp
from torch.utils.data.sampler import Sampler
from torchvision import datasets, transforms

from train import train, test

@@ -27,6 +29,8 @@
help='how many training processes to use (default: 2)')
parser.add_argument('--cuda', action='store_true', default=False,
help='enables CUDA training')
parser.add_argument('--dry-run', action='store_true', default=False,
help='quickly check a single pass')

class Net(nn.Module):
def __init__(self):
@@ -46,12 +50,26 @@ def forward(self, x):
x = self.fc2(x)
return F.log_softmax(x, dim=1)


if __name__ == '__main__':
args = parser.parse_args()

use_cuda = args.cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
dataloader_kwargs = {'pin_memory': True} if use_cuda else {}
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])
dataset1 = datasets.MNIST('../data', train=True, download=True,
transform=transform)
dataset2 = datasets.MNIST('../data', train=False,
transform=transform)
kwargs = {'batch_size': args.batch_size,
'shuffle': True}
if use_cuda:
kwargs.update({'num_workers': 1,
'pin_memory': True,
})

torch.manual_seed(args.seed)
mp.set_start_method('spawn')
@@ -61,12 +79,13 @@ def forward(self, x):

processes = []
for rank in range(args.num_processes):
p = mp.Process(target=train, args=(rank, args, model, device, dataloader_kwargs))
p = mp.Process(target=train, args=(rank, args, model, device,
dataset1, kwargs))
# We first train the model across `num_processes` processes
p.start()
processes.append(p)
for p in processes:
p.join()

# Once training is complete, we can test the model
test(args, model, device, dataloader_kwargs)
test(args, model, device, dataset2, kwargs)
24 changes: 6 additions & 18 deletions mnist_hogwild/train.py
@@ -2,36 +2,22 @@
import torch
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms


def train(rank, args, model, device, dataloader_kwargs):
def train(rank, args, model, device, dataset, dataloader_kwargs):
torch.manual_seed(args.seed + rank)

train_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.batch_size, shuffle=True, num_workers=1,
**dataloader_kwargs)
train_loader = torch.utils.data.DataLoader(dataset, **dataloader_kwargs)

optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
for epoch in range(1, args.epochs + 1):
train_epoch(epoch, args, model, device, train_loader, optimizer)


def test(args, model, device, dataloader_kwargs):
def test(args, model, device, dataset, dataloader_kwargs):
torch.manual_seed(args.seed)

test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.batch_size, shuffle=True, num_workers=1,
**dataloader_kwargs)
test_loader = torch.utils.data.DataLoader(dataset, **dataloader_kwargs)

test_epoch(model, device, test_loader)

@@ -49,6 +35,8 @@ def train_epoch(epoch, args, model, device, data_loader, optimizer):
print('{}\tTrain Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
pid, epoch, batch_idx * len(data), len(data_loader.dataset),
100. * batch_idx / len(data_loader), loss.item()))
if args.dry_run:
break


def test_epoch(model, device, data_loader):
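
Taken together, the mnist_hogwild changes move dataset construction into the parent process and hand the dataset plus loader kwargs to each worker, which then builds its own DataLoader over it. A rough sketch of that wiring, with a toy linear model and a random TensorDataset standing in for the real MNIST model and data (the worker body, sizes, and process count are placeholders):

import torch
import torch.multiprocessing as mp
import torch.nn as nn

def worker(rank, model, dataset, loader_kwargs):
    torch.manual_seed(1 + rank)
    # each process builds its own loader over the shared dataset
    loader = torch.utils.data.DataLoader(dataset, **loader_kwargs)
    for data, target in loader:
        pass  # one optimizer step per batch would go here

if __name__ == '__main__':
    mp.set_start_method('spawn')
    model = nn.Linear(784, 10)
    model.share_memory()           # workers update the same weight tensors
    dataset = torch.utils.data.TensorDataset(
        torch.randn(256, 784), torch.randint(0, 10, (256,)))
    kwargs = {'batch_size': 32, 'shuffle': True}

    processes = []
    for rank in range(2):
        p = mp.Process(target=worker, args=(rank, model, dataset, kwargs))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()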
63 changes: 45 additions & 18 deletions run_python_examples.sh
@@ -3,27 +3,34 @@
# This script runs through the code in each of the python examples.
# The purpose is just as an integration test, not to actually train
# models in any meaningful way. For that reason, most of these set
# epochs = 1.
# epochs = 1 and --dry-run.
#
# Optionally specify a comma separated list of examples to run.
# can be run as:
# ./run_python_examples.sh "install_deps,run_all,clean"
# to pip install dependencies (other than pytorch), run all examples,
# and remove temporary/changed data files.
# Expects pytorch to be installed.
# Expects pytorch, torchvision to be installed.

BASE_DIR=`pwd`"/"`dirname $0`
EXAMPLES=`echo $1 | sed -e 's/ //g'`

if which nvcc ; then
echo "using cuda"
CUDA=1
CUDA_FLAG="--cuda"
else
echo "not using cuda"
CUDA=0
CUDA_FLAG=""
fi
USE_CUDA=$(python -c "import torchvision, torch; print(torch.cuda.is_available())")
case $USE_CUDA in
"True")
echo "using cuda"
CUDA=1
CUDA_FLAG="--cuda"
;;
"False")
echo "not using cuda"
CUDA=0
CUDA_FLAG=""
;;
"")
exit 1;
;;
esac

ERRORS=""

@@ -63,7 +70,7 @@ function dcgan() {
unzip ${DATACLASS}_train_lmdb.zip || { error "couldn't unzip $DATACLASS"; return; }
popd
fi
python main.py --dataset lsun --dataroot lsun --classes $DATACLASS --niter 1 $CUDA_FLAG || error "dcgan failed"
python main.py --dataset lsun --dataroot lsun --classes $DATACLASS --niter 1 $CUDA_FLAG --dry-run || error "dcgan failed"
}

function fast_neural_style() {
@@ -92,12 +99,12 @@ function imagenet() {

function mnist() {
start
python main.py --epochs 1 || error "mnist example failed"
python main.py --epochs 1 --dry-run || error "mnist example failed"
}

function mnist_hogwild() {
start
python main.py --epochs 1 $CUDA_FLAG || error "mnist hogwild failed"
python main.py --epochs 1 --dry-run $CUDA_FLAG || error "mnist hogwild failed"
}

function regression() {
@@ -115,7 +122,7 @@ function snli() {
echo "installing 'en' model if not installed"
python -m spacy download en || { error "couldn't download 'en' model needed for snli"; return; }
echo "training..."
python train.py --epochs 1 --no-bidirectional || error "couldn't train snli"
python train.py --epochs 1 --dev_every 1 --no-bidirectional --dry-run || error "couldn't train snli"
}

function super_resolution() {
@@ -126,7 +133,7 @@ function time_sequence_prediction() {
function time_sequence_prediction() {
start
python generate_sine_wave.py || { error "generate sine wave failed"; return; }
python train.py || error "time sequence prediction training failed"
python train.py --steps 2 || error "time sequence prediction training failed"
}

function vae() {
Expand All @@ -136,18 +143,38 @@ function vae() {

function word_language_model() {
start
python main.py --epochs 1 $CUDA_FLAG || error "word_language_model failed"
python main.py --epochs 1 --dry-run $CUDA_FLAG || error "word_language_model failed"
}

function clean() {
cd $BASE_DIR
rm -rf dcgan/_cache_lsun_classroom_train_lmdb dcgan/fake_samples_epoch_000.png dcgan/lsun/ dcgan/netD_epoch_0.pth dcgan/netG_epoch_0.pth dcgan/real_samples.png fast_neural_style/saved_models.zip fast_neural_style/saved_models/ imagenet/checkpoint.pth.tar imagenet/lsun/ imagenet/model_best.pth.tar imagenet/sample/ snli/.data/ snli/.vector_cache/ snli/results/ super_resolution/dataset/ super_resolution/model_epoch_1.pth word_language_model/model.pt || error "couldn't clean up some files"
rm -rf dcgan/_cache_lsun_classroom_train_lmdb \
dcgan/fake_samples_epoch_000.png dcgan/lsun/ \
dcgan/_cache_lsunclassroomtrainlmdb \
dcgan/netD_epoch_0.pth dcgan/netG_epoch_0.pth \
dcgan/real_samples.png \
fast_neural_style/saved_models.zip \
fast_neural_style/saved_models/ \
imagenet/checkpoint.pth.tar \
imagenet/lsun/ \
imagenet/model_best.pth.tar \
imagenet/sample/ \
snli/.data/ \
snli/.vector_cache/ \
snli/results/ \
super_resolution/dataset/ \
super_resolution/model_epoch_1.pth \
time_sequence_prediction/predict*.pdf \
time_sequence_prediction/traindata.pt \
word_language_model/model.pt || error "couldn't clean up some files"

git checkout fast_neural_style/images/output-images/amber-candy.jpg || error "couldn't clean up fast neural style image"
}

function run_all() {
# cpp
dcgan
# distributed
fast_neural_style
imagenet
mnist
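
The run script now decides whether to pass --cuda by asking PyTorch itself rather than looking for nvcc, which also covers the case where a CUDA toolkit is present but the installed torch build is CPU-only. This is the essence of the check it shells out to (the real one also imports torchvision so the script fails fast when torchvision is missing):

import torch

# False both when no GPU is visible and when torch was installed without
# CUDA support, which is exactly the distinction nvcc alone cannot make.
print(torch.cuda.is_available())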
2 changes: 2 additions & 0 deletions snli/train.py
@@ -140,3 +140,5 @@
print(log_template.format(time.time()-start,
epoch, iterations, 1+batch_idx, len(train_iter),
100. * (1+batch_idx) / len(train_iter), loss.item(), ' '*8, n_correct/n_total*100, ' '*12))
if args.dry_run:
break
2 changes: 2 additions & 0 deletions snli/util.py
@@ -65,5 +65,7 @@ def get_args():
'glove.6B.50d glove.6B.100d glove.6B.200d glove.6B.300d')
parser.add_argument('--resume_snapshot', type=str, default='',
help='model snapshot to resume.')
parser.add_argument('--dry-run', action='store_true',
help='run only a few iterations')
args = parser.parse_args()
return args
6 changes: 5 additions & 1 deletion time_sequence_prediction/train.py
@@ -1,4 +1,5 @@
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.optim as optim
@@ -36,6 +37,9 @@ def forward(self, input, future = 0):


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--steps', type=int, default=15, help='steps to run')
opt = parser.parse_args()
# set random seed to 0
np.random.seed(0)
torch.manual_seed(0)
@@ -52,7 +56,7 @@ def forward(self, input, future = 0):
# use LBFGS as optimizer since we can load the whole data to train
optimizer = optim.LBFGS(seq.parameters(), lr=0.8)
#begin to train
for i in range(15):
for i in range(opt.steps):
print('STEP: ', i)
def closure():
optimizer.zero_grad()