 # Note the occurrence of ``aten::convolution`` twice with different input shapes.
 
 ######################################################################
-# Profiler can also be used to analyze performance of models executed on GPUs:
+# Profiler can also be used to analyze the performance of models executed on GPUs and XPUs:
+# Users can switch between cpu, cuda, and xpu
+device = 'cuda'
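+# A minimal sketch (not part of the original change): fall back to CPU when the
+# requested accelerator is unavailable; this assumes a PyTorch build recent
+# enough to expose ``torch.xpu``.
+if device == 'cuda' and not torch.cuda.is_available():
+    device = 'cpu'
+elif device == 'xpu' and not torch.xpu.is_available():
+    device = 'cpu'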
 
-model = models.resnet18().cuda()
-inputs = torch.randn(5, 3, 224, 224).cuda()
+activities = [ProfilerActivity.CPU]
+sort_by_keyword = device + "_time_total"
 
-with profile(activities=[
-        ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
+if device == 'cuda':
+    activities.append(ProfilerActivity.CUDA)
+elif device == 'xpu':
+    activities.append(ProfilerActivity.XPU)
+
+model = models.resnet18().to(device)
+inputs = torch.randn(5, 3, 224, 224).to(device)
+
+with profile(activities=activities, record_shapes=True) as prof:
     with record_function("model_inference"):
         model(inputs)
 
-print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
+print(prof.key_averages().table(sort_by=sort_by_keyword, row_limit=10))
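+# A short sketch (not in the original diff): because ``record_shapes=True`` is
+# set above, the same sort keyword can also be combined with shape grouping,
+# mirroring the earlier CPU-only example in this recipe.
+print(prof.key_averages(group_by_input_shape=True).table(sort_by=sort_by_keyword, row_limit=10))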
 
 ######################################################################
 # (Note: the first use of CUDA profiling may incur extra overhead.)

 # Self CUDA time total: 11.666ms
 #
 ######################################################################
-# Profiler can also be used to analyze performance of models executed on XPUs:
-
-model = models.resnet18().xpu()
-inputs = torch.randn(5, 3, 224, 224).xpu()
-
-with profile(activities=[
-        ProfilerActivity.CPU, ProfilerActivity.XPU], record_shapes=True) as prof:
-    with record_function("model_inference"):
-        model(inputs)
 
-print(prof.key_averages().table(sort_by="xpu_time_total", row_limit=10))
 
 ######################################################################
 # (Note: the first use of XPU profiling may incur extra overhead.)

 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #
 # Profiling results can be exported as a ``.json`` trace file:
-# Tracing CUDA kernels
+# Tracing CUDA or XPU kernels
+# Users can switch between cpu, cuda, and xpu
+device = 'cuda'
 
-model = models.resnet18().cuda()
-inputs = torch.randn(5, 3, 224, 224).cuda()
+activities = [ProfilerActivity.CPU]
+if device == 'cuda':
+    activities.append(ProfilerActivity.CUDA)
+elif device == 'xpu':
+    activities.append(ProfilerActivity.XPU)
 
-with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+model = models.resnet18().to(device)
+inputs = torch.randn(5, 3, 224, 224).to(device)
+
+with profile(activities=activities) as prof:
     model(inputs)
 
 prof.export_chrome_trace("trace.json")
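+# A small sketch (not part of the original change): a device-tagged filename,
+# e.g. ``trace_cuda.json`` or ``trace_xpu.json``, keeps traces apart when the
+# recipe is run on more than one backend.
+prof.export_chrome_trace(f"trace_{device}.json")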

 # .. image:: ../../_static/img/trace_img.png
 #    :scale: 25 %
 
-# Tracing XPU kernels
-
-model = models.resnet18().xpu()
-inputs = torch.randn(5, 3, 224, 224).xpu()
-
-with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.XPU]) as prof:
-    model(inputs)
-
-prof.export_chrome_trace("trace.json")
-
 ######################################################################
-# You can examine the sequence of profiled operators and CUDA kernels
+# You can examine the sequence of profiled operators and XPU kernels
 # in Chrome trace viewer (``chrome://tracing``):
 #
 # .. image:: ../../_static/img/trace_xpu_img.png
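+# (Recent Chrome builds have folded ``chrome://tracing`` into Perfetto; the
+# same ``trace.json`` also loads at https://ui.perfetto.dev.)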

 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #
 # Profiler can be used to analyze Python and TorchScript stack traces:
+sort_by_keyword = "self_" + device + "_time_total"
 
 with profile(
-    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+    activities=activities,
     with_stack=True,
 ) as prof:
     model(inputs)
 
 # Print aggregated stats
-print(prof.key_averages(group_by_stack_n=5).table(sort_by="self_cuda_time_total", row_limit=2))
+print(prof.key_averages(group_by_stack_n=5).table(sort_by=sort_by_keyword, row_limit=2))
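+# A brief sketch (not part of this change): with ``with_stack=True``, stacks can
+# also be dumped for flame graph tools via ``export_stacks``; whether the XPU
+# metric string is accepted here is an assumption on newer builds.
+prof.export_stacks("/tmp/profiler_stacks.txt", sort_by_keyword)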
 
 #################################################################################
 # The output might look like this (omitting some columns):