some insights on how to speed up model parallel training.

The high-level idea of model parallel is to place different sub-networks of a
- model onto different devices. All input data will run through all devices, but
- each device only operates on a part of the entire model. In this post, we will
- not try to construct huge models and squeeze them into a limited number of GPUs.
- Instead, this post focuses on showing the idea of model parallel. It is up to
- the readers to apply the ideas to real-world applications.
+ model onto different devices, and implement the ``forward`` method accordingly
+ to move intermediate outputs across devices. As only part of a model operates on
+ any individual device, a set of devices can collectively serve a larger model.
+ In this post, we will not try to construct huge models and squeeze them into a
+ limited number of GPUs. Instead, this post focuses on showing the idea of model
+ parallel. It is up to the readers to apply the ideas to real-world applications.

Let us start with a toy model that contains two linear layers. To run this
model on two GPUs, simply put each linear layer on a different GPU, and move
class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
-         self.net1 = torch.nn.Linear(10, 10).cuda(0)
-         self.net2 = torch.nn.Linear(10, 5).cuda(1)
+         self.net1 = torch.nn.Linear(10, 10).to('cuda:0')
+         self.relu = torch.nn.ReLU().to('cuda:0')
+         self.net2 = torch.nn.Linear(10, 5).to('cuda:1')

    def forward(self, x):
-         return self.net2(self.net1(x.cuda(0)).cuda(1))
+         return self.net2(self.net1(x.to('cuda:0')).to('cuda:1'))

######################################################################
# Note that the above ``ToyModel`` looks very similar to how one would
- # implement it on a single GPU, except the four ``cuda(device)`` calls which
- # place linear layers and tensors to on proper devices. That is the only
- # place in the model that requires changes. The ``backward()`` and
- # ``torch.optim`` will automatically take care of gradients as if the
- # model is on one GPU. You only need to make sure that the labels are on the
- # same device as the outputs when calling the loss function.
+ # implement it on a single GPU, except the five ``to(device)`` calls which
+ # place linear layers and tensors on proper devices. That is the only place in
+ # the model that requires changes. The ``backward()`` and ``torch.optim`` will
+ # automatically take care of gradients as if the model is on one GPU. You only
+ # need to make sure that the labels are on the same device as the outputs when
+ # calling the loss function.


model = ToyModel()
@@ -56,7 +58,7 @@ def forward(self, x):

optimizer.zero_grad()
outputs = model(torch.randn(20, 10))
- labels = torch.randn(20, 5).cuda(1)
+ labels = torch.randn(20, 5).to('cuda:1')
loss_fn(outputs, labels).backward()
optimizer.step()

@@ -90,18 +92,18 @@ def __init__(self, *args, **kwargs):

            self.layer1,
            self.layer2
-         ).cuda(0)
+         ).to('cuda:0')

        self.seq2 = nn.Sequential(
            self.layer3,
            self.layer4,
            self.avgpool,
-         ).cuda(1)
+         ).to('cuda:1')

-         self.fc.cuda(1)
+         self.fc.to('cuda:1')

    def forward(self, x):
-         x = self.seq2(self.seq1(x).cuda(1))
+         x = self.seq2(self.seq1(x).to('cuda:1'))
        return self.fc(x.view(x.size(0), -1))

@@ -122,7 +124,6 @@ def forward(self, x):

import torchvision.models as models
- import timeit

num_batches = 3
batch_size = 120
@@ -147,7 +148,7 @@ def train(model):

        # run forward pass
        optimizer.zero_grad()
-         outputs = model(inputs.cuda())
+         outputs = model(inputs.to('cuda:0'))

        # run backward pass
        labels = labels.to(outputs.device)
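
The hunk above only shows the changed lines inside ``train(model)``. For context, here is a minimal sketch of a complete training routine consistent with those fragments; ``num_classes``, ``image_w``, ``image_h``, the random one-hot labels, and the SGD settings are assumptions rather than something this commit shows.

# Sketch only: reconstructs train(model) from the fragments in this diff.
# num_classes, image_w, image_h, and the SGD hyper-parameters are assumed.
import torch
import torch.nn as nn
import torch.optim as optim

num_batches = 3      # as set earlier in the file
batch_size = 120     # as set earlier in the file
num_classes = 1000   # assumed; only the name appears in this diff
image_w = 128        # assumed input width
image_h = 128        # assumed input height


def train(model):
    model.train(True)
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001)

    # indices used to build random one-hot label vectors
    one_hot_indices = torch.LongTensor(batch_size) \
                           .random_(0, num_classes) \
                           .view(batch_size, 1)

    for _ in range(num_batches):
        # generate random inputs and labels
        inputs = torch.randn(batch_size, 3, image_w, image_h)
        labels = torch.zeros(batch_size, num_classes) \
                      .scatter_(1, one_hot_indices, 1)

        # run forward pass
        optimizer.zero_grad()
        outputs = model(inputs.to('cuda:0'))

        # run backward pass; labels move to the outputs' device first
        labels = labels.to(outputs.device)
        loss_fn(outputs, labels).backward()
        optimizer.step()

This is the routine that the ``timeit`` setup strings further below import from ``__main__``.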
@@ -163,8 +164,10 @@ def train(model):
# and plot the execution times with standard deviations.

- import numpy as np
import matplotlib.pyplot as plt
+ plt.switch_backend('agg')
+ import numpy as np
+ import timeit

num_repeat = 10
@@ -177,7 +180,7 @@ def train(model):

setup = "from __main__ import train, num_classes;" + \
        "import torchvision.models as models;" + \
-         "model = models.resnet50(num_classes=num_classes).cuda()"
+         "model = models.resnet50(num_classes=num_classes).to('cuda:0')"
rn_run_times = timeit.repeat(stmt, setup, number=1, repeat=num_repeat)
rn_mean, rn_std = np.mean(rn_run_times), np.std(rn_run_times)
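
The header of the next hunk references a ``plot(means, stds, labels, fig_name)`` helper whose body never appears in this diff. As a rough sketch only, assuming a simple matplotlib bar chart with error bars (the styling is an assumption, not taken from this commit), such a helper could look like:

# Sketch of the plot helper named in the hunk header below; styling is assumed.
# Relies on the matplotlib/numpy imports added earlier in the file.
def plot(means, stds, labels, fig_name):
    fig, ax = plt.subplots()
    ax.bar(np.arange(len(means)), means, yerr=stds,
           align='center', alpha=0.5, ecolor='red', capsize=10, width=0.6)
    ax.set_ylabel('ResNet50 Execution Time (Second)')
    ax.set_xticks(np.arange(len(means)))
    ax.set_xticklabels(labels)
    ax.yaxis.grid(True)
    plt.tight_layout()
    plt.savefig(fig_name)
    plt.close(fig)

It would then be called with the measured means and standard deviations, such as ``rn_mean`` and ``rn_std`` above together with their model parallel counterparts, plus a file name for the saved figure.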
@@ -206,12 +209,13 @@ def plot(means, stds, labels, fig_name):
# :alt:
#
# The result shows that the execution time of the model parallel implementation is
- # ``4.02/3.75-1=7%`` longer than the existing single-GPU implementation. There
- # is room for improvements, as we know one of the two GPUs is sitting idle
- # throughout the execution. One option is to further divide each batch into
- # a pipeline of splits, such that when one split reaches the second
- # sub-network, the following split can be fed into the first sub-network. In
- # this way, two consecutive splits can run concurrently on two GPUs.
+ # ``4.02/3.75-1=7%`` longer than the existing single-GPU implementation. So we
+ # can conclude there is roughly 7% overhead in copying tensors back and forth
+ # across the GPUs. There is room for improvement, as we know one of the two
+ # GPUs is sitting idle throughout the execution. One option is to further divide
+ # each batch into a pipeline of splits, such that when one split reaches the
+ # second sub-network, the following split can be fed into the first sub-network.
+ # In this way, two consecutive splits can run concurrently on two GPUs.

######################################################################
# Speed Up by Pipelining Inputs
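
The two hunks below only touch the lines that changed inside the pipelined model. To make the technique concrete, here is a sketch of the whole pipelined subclass pieced together from those fragments; the class names, the constructor body, and the final ``torch.cat`` are assumptions where this diff does not show them.

# Sketch of a pipelined model parallel ResNet; the class names and the parts
# not shown in this diff (constructor body, final torch.cat) are assumptions.
class PipelineParallelResNet50(ModelParallelResNet50):
    def __init__(self, split_size=20, *args, **kwargs):
        super(PipelineParallelResNet50, self).__init__(*args, **kwargs)
        self.split_size = split_size

    def forward(self, x):
        # split the batch along dim 0 and feed the splits through the pipeline
        splits = iter(x.split(self.split_size, dim=0))
        s_next = next(splits)
        s_prev = self.seq1(s_next).to('cuda:1')
        ret = []

        for s_next in splits:
            # A. s_prev runs on cuda:1
            s_prev = self.seq2(s_prev)
            ret.append(self.fc(s_prev.view(s_prev.size(0), -1)))

            # B. s_next runs on cuda:0, which can run concurrently with A
            s_prev = self.seq1(s_next).to('cuda:1')

        # drain the last split through seq2 and fc
        s_prev = self.seq2(s_prev)
        ret.append(self.fc(s_prev.view(s_prev.size(0), -1)))

        return torch.cat(ret)

Because CUDA operations are asynchronous with respect to the host, step B can be launched on ``cuda:0`` while step A is still running on ``cuda:1``, which is how the pipeline keeps both GPUs busy.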
@@ -230,7 +234,7 @@ def __init__(self, split_size=20, *args, **kwargs):
    def forward(self, x):
        splits = iter(x.split(self.split_size, dim=0))
        s_next = next(splits)
-         s_prev = self.seq1(s_next).cuda(1)
+         s_prev = self.seq1(s_next).to('cuda:1')
        ret = []

        for s_next in splits:
@@ -239,7 +243,7 @@ def forward(self, x):
            ret.append(self.fc(s_prev.view(s_prev.size(0), -1)))

            # B. s_next runs on cuda:0, which can run concurrently with A
-             s_prev = self.seq1(s_next).cuda(1)
+             s_prev = self.seq1(s_next).to('cuda:1')

        s_prev = self.seq2(s_prev)
        ret.append(self.fc(s_prev.view(s_prev.size(0), -1)))