From 9d0e6e7ca0fc04d8b9416921c87b6f18addd6b93 Mon Sep 17 00:00:00 2001 From: mattip Date: Wed, 24 Jun 2020 16:34:58 +0300 Subject: [PATCH 1/2] tweak running examples without cuda --- dcgan/main.py | 6 +++ imagenet/main.py | 10 +++-- mnist/main.py | 53 +++++++++++++++++++------- mnist_hogwild/main.py | 45 ++++++++++++++++++++-- mnist_hogwild/train.py | 22 ++--------- run_python_examples.sh | 63 ++++++++++++++++++++++--------- snli/train.py | 2 + snli/util.py | 2 + time_sequence_prediction/train.py | 6 ++- word_language_model/main.py | 4 ++ 10 files changed, 156 insertions(+), 57 deletions(-) diff --git a/dcgan/main.py b/dcgan/main.py index 674ba620b8..4467e32903 100644 --- a/dcgan/main.py +++ b/dcgan/main.py @@ -26,6 +26,7 @@ parser.add_argument('--lr', type=float, default=0.0002, help='learning rate, default=0.0002') parser.add_argument('--beta1', type=float, default=0.5, help='beta1 for adam. default=0.5') parser.add_argument('--cuda', action='store_true', help='enables cuda') +parser.add_argument('--dry-run', action='store_true', help='check a single training cycle works') parser.add_argument('--ngpu', type=int, default=1, help='number of GPUs to use') parser.add_argument('--netG', default='', help="path to netG (to continue training)") parser.add_argument('--netD', default='', help="path to netD (to continue training)") @@ -211,6 +212,9 @@ def forward(self, input): optimizerD = optim.Adam(netD.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999)) optimizerG = optim.Adam(netG.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999)) +if opt.dry_run: + opt.niter = 1 + for epoch in range(opt.niter): for i, data in enumerate(dataloader, 0): ############################ @@ -261,6 +265,8 @@ def forward(self, input): '%s/fake_samples_epoch_%03d.png' % (opt.outf, epoch), normalize=True) + if opt.dry_run: + break # do checkpointing torch.save(netG.state_dict(), '%s/netG_epoch_%d.pth' % (opt.outf, epoch)) torch.save(netD.state_dict(), '%s/netD_epoch_%d.pth' % (opt.outf, epoch)) diff --git a/imagenet/main.py b/imagenet/main.py index 2a7fc13f6c..f51b618476 100644 --- a/imagenet/main.py +++ b/imagenet/main.py @@ -136,7 +136,9 @@ def main_worker(gpu, ngpus_per_node, args): print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch]() - if args.distributed: + if not torch.cuda.is_available(): + print('using CPU, this will be slow') + elif args.distributed: # For multiprocessing distributed, DistributedDataParallel constructor # should always set the single device scope, otherwise, # DistributedDataParallel will use all available devices. 
@@ -281,7 +283,8 @@ def train(train_loader, model, criterion, optimizer, epoch, args):
 
         if args.gpu is not None:
             images = images.cuda(args.gpu, non_blocking=True)
-        target = target.cuda(args.gpu, non_blocking=True)
+        if torch.cuda.is_available():
+            target = target.cuda(args.gpu, non_blocking=True)
 
         # compute output
         output = model(images)
@@ -324,7 +327,8 @@ def validate(val_loader, model, criterion, args):
         for i, (images, target) in enumerate(val_loader):
             if args.gpu is not None:
                 images = images.cuda(args.gpu, non_blocking=True)
-            target = target.cuda(args.gpu, non_blocking=True)
+            if torch.cuda.is_available():
+                target = target.cuda(args.gpu, non_blocking=True)
 
             # compute output
             output = model(images)
diff --git a/mnist/main.py b/mnist/main.py
index 54be6812c1..38bf7d2cf6 100644
--- a/mnist/main.py
+++ b/mnist/main.py
@@ -83,6 +83,8 @@ def main():
                         help='Learning rate step gamma (default: 0.7)')
     parser.add_argument('--no-cuda', action='store_true', default=False,
                         help='disables CUDA training')
+    parser.add_argument('--dry-run', action='store_true', default=False,
+                        help='quickly check a single pass')
     parser.add_argument('--seed', type=int, default=1, metavar='S',
                         help='random seed (default: 1)')
     parser.add_argument('--log-interval', type=int, default=10, metavar='N',
@@ -96,20 +98,43 @@ def main():
 
     device = torch.device("cuda" if use_cuda else "cpu")
 
-    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
-    train_loader = torch.utils.data.DataLoader(
-        datasets.MNIST('../data', train=True, download=True,
-                       transform=transforms.Compose([
-                           transforms.ToTensor(),
-                           transforms.Normalize((0.1307,), (0.3081,))
-                       ])),
-        batch_size=args.batch_size, shuffle=True, **kwargs)
-    test_loader = torch.utils.data.DataLoader(
-        datasets.MNIST('../data', train=False, transform=transforms.Compose([
-                           transforms.ToTensor(),
-                           transforms.Normalize((0.1307,), (0.3081,))
-                       ])),
-        batch_size=args.test_batch_size, shuffle=True, **kwargs)
+    kwargs = {'batch_size': args.batch_size}
+    if use_cuda:
+        kwargs.update({'num_workers': 1,
+                       'pin_memory': True,
+                       'shuffle': True},
+                     )
+
+    transform=transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize((0.1307,), (0.3081,))
+        ])
+    dataset1 = datasets.MNIST('../data', train=True, download=True,
+                       transform=transform)
+    dataset2 = datasets.MNIST('../data', train=False,
+                       transform=transform)
+    if args.dry_run:
+        from torch.utils.data.sampler import Sampler
+
+        class DryRunSampler(Sampler):
+            r"""Return only the first two samples from the dataset
+            """
+
+            def __init__(self, data_source):
+                self.data_source = data_source
+
+            def __iter__(self):
+                return iter(range(2))
+
+            def __len__(self):
+                return 2
+
+
+        kwargs['sampler'] = DryRunSampler(dataset1)
+        kwargs['shuffle'] = False
+
+    train_loader = torch.utils.data.DataLoader(dataset1, **kwargs)
+    test_loader = torch.utils.data.DataLoader(dataset2, **kwargs)
 
     model = Net().to(device)
     optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
diff --git a/mnist_hogwild/main.py b/mnist_hogwild/main.py
index 0252719c71..c3b989cce1 100644
--- a/mnist_hogwild/main.py
+++ b/mnist_hogwild/main.py
@@ -4,6 +4,8 @@
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.multiprocessing as mp
+from torch.utils.data.sampler import Sampler
+from torchvision import datasets, transforms
 
 from train import train, test
 
@@ -27,6 +29,8 @@
                     help='how many training processes to use (default: 2)')
 parser.add_argument('--cuda', action='store_true', default=False,
                     help='enables CUDA training')
+parser.add_argument('--dry-run', action='store_true', default=False,
+                    help='quickly check a single pass')
 
 class Net(nn.Module):
     def __init__(self):
@@ -46,12 +50,46 @@ def forward(self, x):
         x = self.fc2(x)
         return F.log_softmax(x, dim=1)
 
+
+class DryRunSampler(Sampler):
+    r"""Return only the first two samples from the dataset
+    """
+
+    def __init__(self, data_source):
+        self.data_source = data_source
+
+    def __iter__(self):
+        return iter(range(2))
+
+    def __len__(self):
+        return 2
+
+
 if __name__ == '__main__':
     args = parser.parse_args()
 
     use_cuda = args.cuda and torch.cuda.is_available()
     device = torch.device("cuda" if use_cuda else "cpu")
-    dataloader_kwargs = {'pin_memory': True} if use_cuda else {}
+    transform=transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize((0.1307,), (0.3081,))
+    ])
+    dataset1 = datasets.MNIST('../data', train=True, download=True,
+                   transform=transform)
+    dataset2 = datasets.MNIST('../data', train=False,
+                   transform=transform)
+    kwargs = {'batch_size': args.batch_size,
+              'shuffle': True}
+    if use_cuda:
+        kwargs.update({'num_workers': 1,
+                       'pin_memory': True,
+                      })
+
+    if args.dry_run:
+
+        kwargs['sampler'] = DryRunSampler(dataset1)
+        kwargs['shuffle'] = False
+
 
     torch.manual_seed(args.seed)
     mp.set_start_method('spawn')
@@ -61,7 +99,8 @@ def forward(self, x):
 
     processes = []
     for rank in range(args.num_processes):
-        p = mp.Process(target=train, args=(rank, args, model, device, dataloader_kwargs))
+        p = mp.Process(target=train, args=(rank, args, model, device,
+                                           dataset1, kwargs))
         # We first train the model across `num_processes` processes
         p.start()
         processes.append(p)
@@ -69,4 +108,4 @@ def forward(self, x):
         p.join()
 
     # Once training is complete, we can test the model
-    test(args, model, device, dataloader_kwargs)
+    test(args, model, device, dataset2, kwargs)
diff --git a/mnist_hogwild/train.py b/mnist_hogwild/train.py
index 94dc1a37af..1515b1ddf8 100644
--- a/mnist_hogwild/train.py
+++ b/mnist_hogwild/train.py
@@ -2,36 +2,22 @@
 import torch
 import torch.optim as optim
 import torch.nn.functional as F
-from torchvision import datasets, transforms
 
-def train(rank, args, model, device, dataloader_kwargs):
+def train(rank, args, model, device, dataset, dataloader_kwargs):
     torch.manual_seed(args.seed + rank)
 
-    train_loader = torch.utils.data.DataLoader(
-        datasets.MNIST('../data', train=True, download=True,
-                       transform=transforms.Compose([
-                           transforms.ToTensor(),
-                           transforms.Normalize((0.1307,), (0.3081,))
-                       ])),
-        batch_size=args.batch_size, shuffle=True, num_workers=1,
-        **dataloader_kwargs)
+    train_loader = torch.utils.data.DataLoader(dataset, **dataloader_kwargs)
 
     optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
     for epoch in range(1, args.epochs + 1):
         train_epoch(epoch, args, model, device, train_loader, optimizer)
 
 
-def test(args, model, device, dataloader_kwargs):
+def test(args, model, device, dataset, dataloader_kwargs):
     torch.manual_seed(args.seed)
 
-    test_loader = torch.utils.data.DataLoader(
-        datasets.MNIST('../data', train=False, transform=transforms.Compose([
-            transforms.ToTensor(),
-            transforms.Normalize((0.1307,), (0.3081,))
-        ])),
-        batch_size=args.batch_size, shuffle=True, num_workers=1,
-        **dataloader_kwargs)
+    test_loader = torch.utils.data.DataLoader(dataset, **dataloader_kwargs)
 
     test_epoch(model, device, test_loader)
diff --git a/run_python_examples.sh b/run_python_examples.sh
index 64380ce0ed..c6da7cd3fe 100755
--- a/run_python_examples.sh
+++ b/run_python_examples.sh
@@ -3,27 +3,34 @@
 # This script runs through the code in each of the python examples.
 # The purpose is just as an integration test, not to actually train
 # models in any meaningful way. For that reason, most of these set
-# epochs = 1.
+# epochs = 1 and --dry-run.
 #
 # Optionally specify a comma separated list of examples to run.
 # can be run as:
 # ./run_python_examples.sh "install_deps,run_all,clean"
 # to pip install dependencies (other than pytorch), run all examples,
 # and remove temporary/changed data files.
-# Expects pytorch to be installed.
+# Expects pytorch and torchvision to be installed.
 
 BASE_DIR=`pwd`"/"`dirname $0`
 EXAMPLES=`echo $1 | sed -e 's/ //g'`
 
-if which nvcc ; then
-  echo "using cuda"
-  CUDA=1
-  CUDA_FLAG="--cuda"
-else
-  echo "not using cuda"
-  CUDA=0
-  CUDA_FLAG=""
-fi
+USE_CUDA=$(python -c "import torchvision, torch; print(torch.cuda.is_available())")
+case $USE_CUDA in
+  "True")
+    echo "using cuda"
+    CUDA=1
+    CUDA_FLAG="--cuda"
+    ;;
+  "False")
+    echo "not using cuda"
+    CUDA=0
+    CUDA_FLAG=""
+    ;;
+  "")
+    exit 1;
+    ;;
+esac
 
 ERRORS=""
 
@@ -63,7 +70,7 @@ function dcgan() {
       unzip ${DATACLASS}_train_lmdb.zip || { error "couldn't unzip $DATACLASS"; return; }
       popd
   fi
-  python main.py --dataset lsun --dataroot lsun --classes $DATACLASS --niter 1 $CUDA_FLAG || error "dcgan failed"
+  python main.py --dataset lsun --dataroot lsun --classes $DATACLASS --niter 1 $CUDA_FLAG --dry-run || error "dcgan failed"
 }
 
 function fast_neural_style() {
@@ -92,12 +99,12 @@ function imagenet() {
 
 function mnist() {
   start
-  python main.py --epochs 1 || error "mnist example failed"
+  python main.py --epochs 1 --dry-run || error "mnist example failed"
 }
 
 function mnist_hogwild() {
   start
-  python main.py --epochs 1 $CUDA_FLAG || error "mnist hogwild failed"
+  python main.py --epochs 1 --dry-run $CUDA_FLAG || error "mnist hogwild failed"
 }
 
 function regression() {
@@ -115,7 +122,7 @@ function snli() {
     echo "installing 'en' model if not installed"
     python -m spacy download en || { error "couldn't download 'en' model needed for snli"; return; }
    echo "training..."
- python train.py --epochs 1 --no-bidirectional || error "couldn't train snli" + python train.py --epochs 1 --dev_every 1 --no-bidirectional --dry-run || error "couldn't train snli" } function super_resolution() { @@ -126,7 +133,7 @@ function super_resolution() { function time_sequence_prediction() { start python generate_sine_wave.py || { error "generate sine wave failed"; return; } - python train.py || error "time sequence prediction training failed" + python train.py --steps 2 || error "time sequence prediction training failed" } function vae() { @@ -136,18 +143,38 @@ function vae() { function word_language_model() { start - python main.py --epochs 1 $CUDA_FLAG || error "word_language_model failed" + python main.py --epochs 1 --dry-run $CUDA_FLAG || error "word_language_model failed" } function clean() { cd $BASE_DIR - rm -rf dcgan/_cache_lsun_classroom_train_lmdb dcgan/fake_samples_epoch_000.png dcgan/lsun/ dcgan/netD_epoch_0.pth dcgan/netG_epoch_0.pth dcgan/real_samples.png fast_neural_style/saved_models.zip fast_neural_style/saved_models/ imagenet/checkpoint.pth.tar imagenet/lsun/ imagenet/model_best.pth.tar imagenet/sample/ snli/.data/ snli/.vector_cache/ snli/results/ super_resolution/dataset/ super_resolution/model_epoch_1.pth word_language_model/model.pt || error "couldn't clean up some files" + rm -rf dcgan/_cache_lsun_classroom_train_lmdb \ + dcgan/fake_samples_epoch_000.png dcgan/lsun/ \ + dcgan/_cache_lsunclassroomtrainlmdb \ + dcgan/netD_epoch_0.pth dcgan/netG_epoch_0.pth \ + dcgan/real_samples.png \ + fast_neural_style/saved_models.zip \ + fast_neural_style/saved_models/ \ + imagenet/checkpoint.pth.tar \ + imagenet/lsun/ \ + imagenet/model_best.pth.tar \ + imagenet/sample/ \ + snli/.data/ \ + snli/.vector_cache/ \ + snli/results/ \ + super_resolution/dataset/ \ + super_resolution/model_epoch_1.pth \ + time_sequence_prediction/predict*.pdf \ + time_sequence_prediction/traindata.pt \ + word_language_model/model.pt || error "couldn't clean up some files" git checkout fast_neural_style/images/output-images/amber-candy.jpg || error "couldn't clean up fast neural style image" } function run_all() { + #cpp dcgan + # distributed fast_neural_style imagenet mnist diff --git a/snli/train.py b/snli/train.py index 49e4310cba..7251b635bd 100644 --- a/snli/train.py +++ b/snli/train.py @@ -140,3 +140,5 @@ print(log_template.format(time.time()-start, epoch, iterations, 1+batch_idx, len(train_iter), 100. 
* (1+batch_idx) / len(train_iter), loss.item(), ' '*8, n_correct/n_total*100, ' '*12)) + if args.dry_run: + break diff --git a/snli/util.py b/snli/util.py index a42bc159a5..1bc8e0b2cc 100644 --- a/snli/util.py +++ b/snli/util.py @@ -65,5 +65,7 @@ def get_args(): 'glove.6B.50d glove.6B.100d glove.6B.200d glove.6B.300d') parser.add_argument('--resume_snapshot', type=str, default='', help='model snapshot to resume.') + parser.add_argument('--dry-run', action='store_true', + help='run only a few iterations') args = parser.parse_args() return args diff --git a/time_sequence_prediction/train.py b/time_sequence_prediction/train.py index ae76d9bb1d..a7fbbd630c 100644 --- a/time_sequence_prediction/train.py +++ b/time_sequence_prediction/train.py @@ -1,4 +1,5 @@ from __future__ import print_function +import argparse import torch import torch.nn as nn import torch.optim as optim @@ -36,6 +37,9 @@ def forward(self, input, future = 0): if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--steps', type=int, default=15, help='steps to run') + opt = parser.parse_args() # set random seed to 0 np.random.seed(0) torch.manual_seed(0) @@ -52,7 +56,7 @@ def forward(self, input, future = 0): # use LBFGS as optimizer since we can load the whole data to train optimizer = optim.LBFGS(seq.parameters(), lr=0.8) #begin to train - for i in range(15): + for i in range(opt.steps): print('STEP: ', i) def closure(): optimizer.zero_grad() diff --git a/word_language_model/main.py b/word_language_model/main.py index 86ad820001..6166036618 100644 --- a/word_language_model/main.py +++ b/word_language_model/main.py @@ -48,6 +48,8 @@ parser.add_argument('--nhead', type=int, default=2, help='the number of heads in the encoder/decoder of the transformer model') +parser.add_argument('--dry-run', action='store_true', + help='verify the code and the model') args = parser.parse_args() @@ -191,6 +193,8 @@ def train(): elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss))) total_loss = 0 start_time = time.time() + if args.dry_run: + break def export_onnx(path, batch_size, seq_len): From d0447a19eb3ed3987b47d9e186e67cb43a31fbd1 Mon Sep 17 00:00:00 2001 From: mattip Date: Wed, 1 Jul 2020 13:27:33 +0300 Subject: [PATCH 2/2] rework dry_run handling in mnist, mnist_hogwild --- mnist/main.py | 22 ++-------------------- mnist_hogwild/main.py | 20 -------------------- mnist_hogwild/train.py | 2 ++ run_python_examples.sh | 2 +- 4 files changed, 5 insertions(+), 41 deletions(-) diff --git a/mnist/main.py b/mnist/main.py index 38bf7d2cf6..0a733cd48b 100644 --- a/mnist/main.py +++ b/mnist/main.py @@ -47,6 +47,8 @@ def train(args, model, device, train_loader, optimizer, epoch): print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( epoch, batch_idx * len(data), len(train_loader.dataset), 100. 
* batch_idx / len(train_loader), loss.item()))
+            if args.dry_run:
+                break
 
 
 def test(model, device, test_loader):
@@ -113,26 +115,6 @@ def main():
                        transform=transform)
     dataset2 = datasets.MNIST('../data', train=False,
                        transform=transform)
-    if args.dry_run:
-        from torch.utils.data.sampler import Sampler
-
-        class DryRunSampler(Sampler):
-            r"""Return only the first two samples from the dataset
-            """
-
-            def __init__(self, data_source):
-                self.data_source = data_source
-
-            def __iter__(self):
-                return iter(range(2))
-
-            def __len__(self):
-                return 2
-
-
-        kwargs['sampler'] = DryRunSampler(dataset1)
-        kwargs['shuffle'] = False
-
     train_loader = torch.utils.data.DataLoader(dataset1, **kwargs)
     test_loader = torch.utils.data.DataLoader(dataset2, **kwargs)
 
diff --git a/mnist_hogwild/main.py b/mnist_hogwild/main.py
index c3b989cce1..a5d9c30f71 100644
--- a/mnist_hogwild/main.py
+++ b/mnist_hogwild/main.py
@@ -51,20 +51,6 @@ def forward(self, x):
         return F.log_softmax(x, dim=1)
 
 
-class DryRunSampler(Sampler):
-    r"""Return only the first two samples from the dataset
-    """
-
-    def __init__(self, data_source):
-        self.data_source = data_source
-
-    def __iter__(self):
-        return iter(range(2))
-
-    def __len__(self):
-        return 2
-
-
 if __name__ == '__main__':
     args = parser.parse_args()
 
@@ -85,12 +71,6 @@ def __len__(self):
                        'pin_memory': True,
                        })
 
-    if args.dry_run:
-
-        kwargs['sampler'] = DryRunSampler(dataset1)
-        kwargs['shuffle'] = False
-
-
     torch.manual_seed(args.seed)
     mp.set_start_method('spawn')
 
diff --git a/mnist_hogwild/train.py b/mnist_hogwild/train.py
index 1515b1ddf8..49d171c451 100644
--- a/mnist_hogwild/train.py
+++ b/mnist_hogwild/train.py
@@ -35,6 +35,8 @@ def train_epoch(epoch, args, model, device, data_loader, optimizer):
             print('{}\tTrain Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                 pid, epoch, batch_idx * len(data), len(data_loader.dataset),
                 100. * batch_idx / len(data_loader), loss.item()))
+            if args.dry_run:
+                break
 
 
 def test_epoch(model, device, data_loader):
diff --git a/run_python_examples.sh b/run_python_examples.sh
index c6da7cd3fe..e4de2fad6d 100755
--- a/run_python_examples.sh
+++ b/run_python_examples.sh
@@ -172,7 +172,7 @@ function clean() {
 }
 
 function run_all() {
-  #cpp
+  # cpp
   dcgan
   # distributed
   fast_neural_style
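
Both patches converge on the same pattern: each example gains a --dry-run flag, and the
training loop breaks out after the first logged batch so the CI script can exercise the
whole code path without really training. Below is a minimal standalone sketch of that
pattern; the linear model and synthetic data are placeholders for illustration, not code
from the examples.

    import argparse

    import torch
    import torch.nn.functional as F
    from torch import nn, optim

    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    args = parser.parse_args()

    # Placeholder model and data; each real example builds its own.
    model = nn.Linear(10, 2)
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    loader = [(torch.randn(4, 10), torch.randint(0, 2, (4,))) for _ in range(100)]

    for epoch in range(1, 2):
        for batch_idx, (data, target) in enumerate(loader):
            optimizer.zero_grad()
            loss = F.cross_entropy(model(data), target)
            loss.backward()
            optimizer.step()
            if args.dry_run:
                break  # one optimizer step is enough to verify the code runs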