@@ -84,8 +84,7 @@ def get_event(self):
84
84
85
85
class SyclTimer :
86
86
"""
87
- Context to measure device time and host wall-time of execution
88
- of commands submitted to :class:`dpctl.SyclQueue`.
87
+ Context to time execution of tasks submitted to :class:`dpctl.SyclQueue`.
89
88
90
89
:Example:
91
90
.. code-block:: python
@@ -99,13 +98,18 @@ class SyclTimer:
99
98
milliseconds_sc = 1e3
100
99
timer = dpctl.SyclTimer(time_scale = milliseconds_sc)
101
100
101
+ untimed_code_block_1
102
102
# use the timer
103
103
with timer(queue=q):
104
- code_block1
104
+ timed_code_block1
105
+
106
+ untimed_code_block_2
105
107
106
108
# use the timer
107
109
with timer(queue=q):
108
- code_block2
110
+ timed_code_block2
111
+
112
+ untimed_code_block_3
109
113
110
114
# retrieve elapsed times in milliseconds
111
115
wall_dt, device_dt = timer.dt
@@ -116,16 +120,41 @@ class SyclTimer:
116
120
associated with these submissions to perform the timing. Thus
117
121
:class:`dpctl.SyclTimer` requires the queue with ``"enable_profiling"``
118
122
property. In order to be able to collect the profiling information,
119
- the ``dt`` property ensures that both submitted barriers complete their
120
- execution and thus effectively synchronizes the queue.
121
-
122
- `device_timer` keyword argument controls the type of tasks submitted.
123
- With `device_timer="queue_barrier"`, queue barrier tasks are used. With
124
- `device_timer="order_manager"`, a single empty body task is inserted
125
- instead relying on order manager (used by `dpctl.tensor` operations) to
123
+ the ``dt`` property ensures that both tasks submitted by the timer
124
+ complete their execution and thus effectively synchronizes the queue.
125
+
126
+ Execution of the above example results in the following task graph,
127
+ where each group of tasks is ordered after the one preceding it,
128
+ ``[tasks_of_untimed_block1]``, ``[timer_fence_start_task]``,
129
+ ``[tasks_of_timed_block1]``, ``[timer_fence_finish_task]``,
130
+ ``[tasks_of_untimed_block2]``, ``[timer_fence_start_task]``,
131
+ ``[tasks_of_timed_block2]``, ``[timer_fence_finish_task]``,
132
+ ``[tasks_of_untimed_block3]``.
133
+
134
+ The ``device_timer`` keyword argument controls the type of tasks submitted.
135
+ With ``"queue_barrier"`` value, queue barrier tasks are used. With
136
+ ``"order_manager"`` value, a single empty body task is inserted
137
+ and the order manager (used by all ``dpctl.tensor`` operations) is used to
126
138
order these tasks so that they fence operations performed within
127
139
timer's context.
128
140
141
+ Timing offloading operations that do not use the order manager with
142
+ the timer that uses ``"order_manager"`` as ``device_timer`` value
143
+ will be misleading because the tasks submitted by the timer will not
144
+ be ordered with respect to tasks we intend to time.
145
+
146
+ Note that the host timer effectively measures the time of task
147
+ submissions. To measure host timer wall-time that includes execution
148
+ of submitted tasks, make sure to include a synchronization point in
149
+ the timed block.
150
+
151
+ :Example:
152
+ .. code-block:: python
153
+
154
+ with timer(q):
155
+ timed_block
156
+ q.wait()
157
+
129
158
Args:
130
159
host_timer (callable, optional):
131
160
A callable such that host_timer() returns current
@@ -134,7 +163,7 @@ class SyclTimer:
134
163
device_timer (Literal["queue_barrier", "order_manager"], optional):
135
164
Device timing method. Default: "queue_barrier".
136
165
time_scale (Union[int, float], optional):
137
- Ratio of the unit of time of interest and one second .
166
+ Ratio of one second to the unit of time of interest.
138
167
Default: ``1``.
139
168
"""
140
169
0 commit comments