
Commit 83c527f

address comments
1 parent da23386 commit 83c527f

File tree

1 file changed (+35, -31 lines)


intermediate_source/model_parallel_tutorial.py

Lines changed: 35 additions & 31 deletions
@@ -15,11 +15,12 @@
 some insights on how to speed up model parallel training.
 
 The high-level idea of model parallel is to place different sub-networks of a
-model onto different devices. All input data will run through all devices, but
-each device only operates on a part of the entire model. In this post, we will
-not try to construct huge models and squeeze them into a limited number of GPUs.
-Instead, this post focuses on showing the idea of model parallel. It is up to
-the readers to apply the ideas to real-world applications.
+model onto different devices, and implement the ``forward`` method accordingly
+to move intermediate outputs across devices. As only part of a model operates on
+any individual device, a set of devices can collectively serve a larger model.
+In this post, we will not try to construct huge models and squeeze them into a
+limited number of GPUs. Instead, this post focuses on showing the idea of model
+parallel. It is up to the readers to apply the ideas to real-world applications.
 
 Let us start with a toy model that contains two linear layers. To run this
 model on two GPUs, simply put each linear layer on a different GPU, and move
@@ -34,20 +35,21 @@
 class ToyModel(nn.Module):
     def __init__(self):
         super(ToyModel, self).__init__()
-        self.net1 = torch.nn.Linear(10, 10).cuda(0)
-        self.net2 = torch.nn.Linear(10, 5).cuda(1)
+        self.net1 = torch.nn.Linear(10, 10).to('cuda:0')
+        self.relu = torch.nn.ReLU().to('cuda:0')
+        self.net2 = torch.nn.Linear(10, 5).to('cuda:1')
 
     def forward(self, x):
-        return self.net2(self.net1(x.cuda(0)).cuda(1))
+        return self.net2(self.net1(x.to('cuda:0')).to('cuda:1'))
 
 ######################################################################
 # Note that, the above ``ToyModel`` looks very similar to how one would
-# implement it on a single GPU, except the four ``cuda(device)`` calls which
-# place linear layers and tensors to on proper devices. That is the only
-# place in the model that requires changes. The ``backward()`` and
-# ``torch.optim`` will automatically take care of gradients as if the
-# model is on one GPU. You only need to make sure that the labels are on the
-# same device as the outputs when calling the loss function.
+# implement it on a single GPU, except the five ``to(device)`` calls which
+# place linear layers and tensors on proper devices. That is the only place in
+# the model that requires changes. The ``backward()`` and ``torch.optim`` will
+# automatically take care of gradients as if the model is on one GPU. You only
+# need to make sure that the labels are on the same device as the outputs when
+# calling the loss function.
 
 
 model = ToyModel()
@@ -56,7 +58,7 @@ def forward(self, x):
 
 optimizer.zero_grad()
 outputs = model(torch.randn(20, 10))
-labels = torch.randn(20, 5).cuda(1)
+labels = torch.randn(20, 5).to('cuda:1')
 loss_fn(outputs, labels).backward()
 optimizer.step()
 
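Taken together with the hunks above, the toy example fits in a dozen lines; the sketch below assembles them for reference. It assumes two visible GPUs, and the ``nn.MSELoss``/``optim.SGD`` choices stand in for the loss and optimizer definitions that sit outside the hunks shown in this diff.

import torch
import torch.nn as nn
import torch.optim as optim


class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.net1 = torch.nn.Linear(10, 10).to('cuda:0')
        self.relu = torch.nn.ReLU().to('cuda:0')
        self.net2 = torch.nn.Linear(10, 5).to('cuda:1')

    def forward(self, x):
        # copy the intermediate output from cuda:0 to cuda:1 by hand
        return self.net2(self.net1(x.to('cuda:0')).to('cuda:1'))


model = ToyModel()
loss_fn = nn.MSELoss()                               # assumption: not in the hunks shown
optimizer = optim.SGD(model.parameters(), lr=0.001)  # assumption: not in the hunks shown

optimizer.zero_grad()
outputs = model(torch.randn(20, 10))
labels = torch.randn(20, 5).to('cuda:1')  # labels must live on the same device as outputs
loss_fn(outputs, labels).backward()
optimizer.step()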

@@ -90,18 +92,18 @@ def __init__(self, *args, **kwargs):
 
             self.layer1,
             self.layer2
-        ).cuda(0)
+        ).to('cuda:0')
 
         self.seq2 = nn.Sequential(
             self.layer3,
             self.layer4,
             self.avgpool,
-        ).cuda(1)
+        ).to('cuda:1')
 
-        self.fc.cuda(1)
+        self.fc.to('cuda:1')
 
     def forward(self, x):
-        x = self.seq2(self.seq1(x).cuda(1))
+        x = self.seq2(self.seq1(x).to('cuda:1'))
         return self.fc(x.view(x.size(0), -1))
 
 
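The hunk above only touches the device-placement lines; for orientation, the whole class presumably looks roughly like the sketch below. The ``ResNet``/``Bottleneck`` base, the layers feeding ``seq1``, and ``num_classes`` come from parts of the file this diff does not show, so treat them as assumptions.

import torch.nn as nn
from torchvision.models.resnet import ResNet, Bottleneck

num_classes = 1000  # assumption: defined elsewhere in the tutorial file


class ModelParallelResNet50(ResNet):
    def __init__(self, *args, **kwargs):
        super(ModelParallelResNet50, self).__init__(
            Bottleneck, [3, 4, 6, 3], num_classes=num_classes, *args, **kwargs)

        # first half of the network lives on the first GPU
        self.seq1 = nn.Sequential(
            self.conv1,
            self.bn1,
            self.relu,
            self.maxpool,
            self.layer1,
            self.layer2
        ).to('cuda:0')

        # second half of the network lives on the second GPU
        self.seq2 = nn.Sequential(
            self.layer3,
            self.layer4,
            self.avgpool,
        ).to('cuda:1')

        self.fc.to('cuda:1')

    def forward(self, x):
        # move the activation produced on cuda:0 over to cuda:1
        x = self.seq2(self.seq1(x).to('cuda:1'))
        return self.fc(x.view(x.size(0), -1))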

@@ -122,7 +124,6 @@ def forward(self, x):
 
 
 import torchvision.models as models
-import timeit
 
 num_batches = 3
 batch_size = 120
@@ -147,7 +148,7 @@ def train(model):
 
         # run forward pass
         optimizer.zero_grad()
-        outputs = model(inputs.cuda())
+        outputs = model(inputs.to('cuda:0'))
 
         # run backward pass
         labels = labels.to(outputs.device)
@@ -163,8 +164,10 @@ def train(model):
 # and plot the execution times with standard deviations.
 
 
-import numpy as np
 import matplotlib.pyplot as plt
+plt.switch_backend('agg')
+import numpy as np
+import timeit
 
 num_repeat = 10
 
@@ -177,7 +180,7 @@ def train(model):
 
 setup = "from __main__ import train, num_classes;" + \
         "import torchvision.models as models;" + \
-        "model = models.resnet50(num_classes=num_classes).cuda()"
+        "model = models.resnet50(num_classes=num_classes).to('cuda:0')"
 rn_run_times = timeit.repeat(stmt, setup, number=1, repeat=num_repeat)
 rn_mean, rn_std = np.mean(rn_run_times), np.std(rn_run_times)
 
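Only the single-GPU ``resnet50`` baseline appears in this hunk; the matching measurement for the model parallel variant is not part of the diff, but it presumably mirrors the same ``timeit.repeat`` pattern, along the lines of the sketch below (the exact ``setup`` string is an assumption).

# Hypothetical counterpart for ModelParallelResNet50, reusing stmt = "train(model)"
# and num_repeat from the surrounding file; the setup string is an assumption.
setup = "from __main__ import train, ModelParallelResNet50;" + \
        "model = ModelParallelResNet50()"
mp_run_times = timeit.repeat(stmt, setup, number=1, repeat=num_repeat)
mp_mean, mp_std = np.mean(mp_run_times), np.std(mp_run_times)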

@@ -206,12 +209,13 @@ def plot(means, stds, labels, fig_name):
 # :alt:
 #
 # The result shows that the execution time of model parallel implementation is
-# ``4.02/3.75-1=7%`` longer than the existing single-GPU implementation. There
-# is room for improvements, as we know one of the two GPUs is sitting idle
-# throughout the execution. One option is to further divide each batch into
-# a pipeline of splits, such that when one split reaches the second
-# sub-network, the following split can be fed into the first sub-network. In
-# this way, two consecutive splits can run concurrently on two GPUs.
+# ``4.02/3.75-1=7%`` longer than the existing single-GPU implementation. So we
+# can conclude there is roughly 7% overhead in copying tensors back and forth
+# across the GPUs. There is room for improvement, as we know one of the two
+# GPUs is sitting idle throughout the execution. One option is to further divide
+# each batch into a pipeline of splits, such that when one split reaches the
+# second sub-network, the following split can be fed into the first sub-network.
+# In this way, two consecutive splits can run concurrently on two GPUs.
 
 ######################################################################
 # Speed Up by Pipelining Inputs
@@ -230,7 +234,7 @@ def __init__(self, split_size=20, *args, **kwargs):
     def forward(self, x):
         splits = iter(x.split(self.split_size, dim=0))
         s_next = next(splits)
-        s_prev = self.seq1(s_next).cuda(1)
+        s_prev = self.seq1(s_next).to('cuda:1')
         ret = []
 
         for s_next in splits:
@@ -239,7 +243,7 @@ def forward(self, x):
             ret.append(self.fc(s_prev.view(s_prev.size(0), -1)))
 
             # B. s_next runs on cuda:0, which can run concurrently with A
-            s_prev = self.seq1(s_next).cuda(1)
+            s_prev = self.seq1(s_next).to('cuda:1')
 
         s_prev = self.seq2(s_prev)
         ret.append(self.fc(s_prev.view(s_prev.size(0), -1)))
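The two hunks above slice through the middle of the pipelined ``forward``. Read together, the surrounding class presumably looks roughly like the sketch below; the ``PipelineParallelResNet50`` name, the ``ModelParallelResNet50`` base, and the closing ``torch.cat(ret)`` are not shown in this diff and are assumptions about the full method.

import torch


class PipelineParallelResNet50(ModelParallelResNet50):  # assumed base class
    def __init__(self, split_size=20, *args, **kwargs):
        super(PipelineParallelResNet50, self).__init__(*args, **kwargs)
        self.split_size = split_size

    def forward(self, x):
        splits = iter(x.split(self.split_size, dim=0))
        s_next = next(splits)
        # prime the pipeline: push the first split through seq1 on cuda:0
        s_prev = self.seq1(s_next).to('cuda:1')
        ret = []

        for s_next in splits:
            # A. s_prev runs through seq2 on cuda:1
            s_prev = self.seq2(s_prev)
            ret.append(self.fc(s_prev.view(s_prev.size(0), -1)))

            # B. s_next runs on cuda:0, which can run concurrently with A
            s_prev = self.seq1(s_next).to('cuda:1')

        # drain the pipeline: the last split still has to finish on cuda:1
        s_prev = self.seq2(s_prev)
        ret.append(self.fc(s_prev.view(s_prev.size(0), -1)))

        # assumption: per-split outputs are concatenated back into one batch
        return torch.cat(ret)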
