Skip to content

Commit fdc684b

Browse files
adierking authored and compnerd committed
Optimize Windows contention performance
The dispatch_cascade test is very slow on Windows right now (often taking 4-5 minutes to execute on my machine and sometimes locking it up entirely) because contention is not being handled correctly.

`_dispatch_contention_usleep()` is expected to put the thread to sleep, but on Windows it spins on `QueryPerformanceCounter()`. This is causing a huge amount of starvation in the dispatch_cascade test. Implement it using `Sleep()`, and accordingly adjust `DISPATCH_CONTENTION_USLEEP_START` to be 1ms on Windows.

Additionally, `_dispatch_contention_spins()` is currently implemented using the `rand_s()` function. This is slow (and experimentally seems to be slower than not randomizing the spin count at all!) because `rand_s()` guarantees cryptographic security, which is unnecessary for dispatch's use case. Replace it with a basic linear congruential generator (from K&R) since there isn't any other `rand_r()` equivalent. Based on the average wall clock times reported by bsdtestharness, this is around 35% faster on my PC (i7-8700K).

These changes bring the runtime of the dispatch_cascade test down to around 1-2s at most for me. (It's even faster than this if stdout isn't a console window because that slows down the histogram display.)
1 parent c3b9b8e commit fdc684b

File tree

2 files changed

+14
-21
lines changed

2 files changed

+14
-21
lines changed

src/internal.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -306,9 +306,6 @@ upcast(dispatch_object_t dou)
306306
#include <stdbool.h>
307307
#include <stdint.h>
308308
#include <stdio.h>
309-
#if defined(_WIN32)
310-
#define _CRT_RAND_S
311-
#endif
312309
#include <stdlib.h>
313310
#include <string.h>
314311
#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__))

src/shims/yield.h

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,15 @@ void *_dispatch_wait_for_enqueuer(void **ptr);
9999
((DISPATCH_CONTENTION_SPINS_MIN) + ((DISPATCH_CONTENTION_SPINS_MAX) - \
100100
(DISPATCH_CONTENTION_SPINS_MIN)) / 2)
101101
#elif defined(_WIN32)
102-
#define _dispatch_contention_spins() ({ \
103-
unsigned int _value; \
104-
rand_s(&_value); \
105-
(_value & DISPATCH_CONTENTION_SPINS_MAX) | DISPATCH_CONTENTION_SPINS_MIN; })
102+
// Use randomness to prevent threads from resonating at the same frequency and
103+
// permanently contending. Windows doesn't provide rand_r(), so use a simple
104+
// LCG. (msvcrt has rand_s(), but its security guarantees aren't optimal here.)
105+
#define _dispatch_contention_spins() ({ \
106+
static os_atomic(unsigned int) _seed = 1; \
107+
unsigned int _next = os_atomic_load(&_seed, relaxed); \
108+
os_atomic_store(&_seed, _next * 1103515245 + 12345, relaxed); \
109+
((_next >> 24) & (DISPATCH_CONTENTION_SPINS_MAX)) | \
110+
(DISPATCH_CONTENTION_SPINS_MIN); })
106111
#else
107112
// Use randomness to prevent threads from resonating at the same
108113
// frequency and permanently contending.
@@ -160,8 +165,12 @@ void *_dispatch_wait_for_enqueuer(void **ptr);
160165
#pragma mark _dispatch_contention_usleep
161166

162167
#ifndef DISPATCH_CONTENTION_USLEEP_START
168+
#if defined(_WIN32)
169+
#define DISPATCH_CONTENTION_USLEEP_START 1000 // Must be >= 1ms for Sleep()
170+
#else
163171
#define DISPATCH_CONTENTION_USLEEP_START 500
164172
#endif
173+
#endif
165174
#ifndef DISPATCH_CONTENTION_USLEEP_MAX
166175
#define DISPATCH_CONTENTION_USLEEP_MAX 100000
167176
#endif
@@ -176,20 +185,7 @@ void *_dispatch_wait_for_enqueuer(void **ptr);
176185
#endif
177186
#else
178187
#if defined(_WIN32)
179-
DISPATCH_INLINE void
180-
_dispatch_contention_usleep(uint64_t useconds) {
181-
static BOOL bQPFExecuted = FALSE;
182-
static LARGE_INTEGER liFreq;
183-
LARGE_INTEGER liStart, liNow;
184-
185-
if (!bQPFExecuted)
186-
bQPFExecuted = QueryPerformanceFrequency(&liFreq);
187-
188-
QueryPerformanceCounter(&liStart);
189-
do {
190-
QueryPerformanceCounter(&liNow);
191-
} while ((liNow.QuadPart - liStart.QuadPart) / (float)liFreq.QuadPart * 1000 * 1000 < useconds);
192-
}
188+
#define _dispatch_contention_usleep(u) Sleep((u) / 1000)
193189
#else
194190
#define _dispatch_contention_usleep(u) usleep((u))
195191
#endif

0 commit comments

Comments
 (0)