Skip to content

Commit aaed759

Browse files
renderrr
1 parent bb3106a commit aaed759

File tree

1 file changed

+18
-14
lines changed

1 file changed

+18
-14
lines changed

intermediate_source/transformer_building_blocks.py

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ def forward(self,
260260

261261
###############################################################################
262262
# Utilities
263-
# =========
263+
# ~~~~~~~~~
264264
# In this section, we include a utility to generate semi-realistic data using
265265
# a Zipf distribution for sentence lengths. This is used to generate the nested
266266
# query, key and value tensors. We also include a benchmark utility.
@@ -400,11 +400,13 @@ def benchmark(func, *args, **kwargs):
400400
######################################################################################
401401
# For reference some sample outputs on A100:
402402
#
403-
# padded_time=0.03454, padded_peak_memory=4.14 GB
404-
# nested_time=0.00612, nested_peak_memory=0.76 GB
405-
# Difference between vanilla and nested result 0.0
406-
# Nested speedup: 5.65
407-
# Nested peak memory reduction 3.39 GB
403+
# .. code::
404+
#
405+
# padded_time=0.03454, padded_peak_memory=4.14 GB
406+
# nested_time=0.00612, nested_peak_memory=0.76 GB
407+
# Difference between vanilla and nested result 0.0
408+
# Nested speedup: 5.65
409+
# Nested peak memory reduction 3.39 GB
408410
#
409411
# We can also see the same for the backward pass
410412

@@ -428,14 +430,16 @@ def benchmark(func, *args, **kwargs):
428430
##################################################################################
429431
# Sample outputs on A100:
430432
#
431-
# ``padded_bw_time``=2.09337, ``padded_bw_peak_mem``=5.10 GB
432-
# ``nested_bw_time``=0.01452, ``nested_bw_peak_mem``=3.24 GB
433-
# Nested backward speedup: 144.13
434-
# Nested backward peak memory reduction 1.86 GB
435-
# Difference in ``out_proj.weight.grad`` 0.000244140625
436-
# Difference in ``packed_proj.weight.grad`` 0.001556396484375
437-
# Difference in ``out_proj.bias.grad`` 0.0
438-
# Difference in ``packed_proj.bias.grad`` 0.001953125
433+
# .. code::
434+
#
435+
# ``padded_bw_time``=2.09337, ``padded_bw_peak_mem``=5.10 GB
436+
# ``nested_bw_time``=0.01452, ``nested_bw_peak_mem``=3.24 GB
437+
# Nested backward speedup: 144.13
438+
# Nested backward peak memory reduction 1.86 GB
439+
# Difference in ``out_proj.weight.grad`` 0.000244140625
440+
# Difference in ``packed_proj.weight.grad`` 0.001556396484375
441+
# Difference in ``out_proj.bias.grad`` 0.0
442+
# Difference in ``packed_proj.bias.grad`` 0.001953125
439443
#
440444

441445
##################################################################################

0 commit comments

Comments
 (0)