Skip to content

Commit c4fe503

Browse files
committed
runtime: reduce thrashing of gs between ps
One important use case is a pipeline computation that pass values from one Goroutine to the next and then exits or is placed in a wait state. If GOMAXPROCS > 1 a Goroutine running on P1 will enable another Goroutine and then immediately make P1 available to execute it. We need to prevent other Ps from stealing the G that P1 is about to execute. Otherwise the Gs can thrash between Ps causing unneeded synchronization and slowing down throughput. Fix this by changing the stealing logic so that when a P attempts to steal the only G on some other P's run queue, it will pause momentarily to allow the victim P to schedule the G. As part of optimizing stealing we also use a per P victim queue move stolen gs. This eliminates the zeroing of a stack local victim queue which turned out to be expensive. This CL is a necessary but not sufficient prerequisite to changing the default value of GOMAXPROCS to something > 1 which is another CL/discussion. For highly serialized programs, such as GoroutineRing below this can make a large difference. For larger and more parallel programs such as the x/benchmarks there is no noticeable detriment. ~/work/code/src/rsc.io/benchstat/benchstat old.txt new.txt name old mean new mean delta GoroutineRing 30.2µs × (0.98,1.01) 30.1µs × (0.97,1.04) ~ (p=0.941) GoroutineRing-2 113µs × (0.91,1.07) 30µs × (0.98,1.03) -73.17% (p=0.004) GoroutineRing-4 144µs × (0.98,1.02) 32µs × (0.98,1.01) -77.69% (p=0.000) GoroutineRingBuf 32.7µs × (0.97,1.03) 32.5µs × (0.97,1.02) ~ (p=0.795) GoroutineRingBuf-2 120µs × (0.92,1.08) 33µs × (1.00,1.00) -72.48% (p=0.004) GoroutineRingBuf-4 138µs × (0.92,1.06) 33µs × (1.00,1.00) -76.21% (p=0.003) The bench benchmarks show little impact. old new garbage 7032879 7011696 httpold 25509 25301 splayold 1022073 1019499 jsonold 28230624 28081433 Change-Id: I228c48fed8d85c9bbef16a7edc53ab7898506f50 Reviewed-on: https://go-review.googlesource.com/9872 Reviewed-by: Austin Clements <austin@google.com>
1 parent 3b38626 commit c4fe503

File tree

2 files changed

+31
-20
lines changed

2 files changed

+31
-20
lines changed

src/runtime/proc1.go

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1420,7 +1420,7 @@ top:
14201420
xadd(&sched.nmspinning, 1)
14211421
}
14221422
// random steal from other P's
1423-
for i := 0; i < int(2*gomaxprocs); i++ {
1423+
for i := 0; i < int(4*gomaxprocs); i++ {
14241424
if sched.gcwaiting != 0 {
14251425
goto top
14261426
}
@@ -1429,15 +1429,17 @@ top:
14291429
if _p_ == _g_.m.p.ptr() {
14301430
gp, _ = runqget(_p_)
14311431
} else {
1432-
gp = runqsteal(_g_.m.p.ptr(), _p_)
1432+
stealRunNextG := i > 2*int(gomaxprocs) // first look for ready queues with more than 1 g
1433+
gp = runqsteal(_g_.m.p.ptr(), _p_, stealRunNextG)
14331434
}
14341435
if gp != nil {
14351436
return gp, false
14361437
}
14371438
}
1439+
14381440
stop:
14391441

1440-
// We have nothing to do. If we're in the GC mark phaseand can
1442+
// We have nothing to do. If we're in the GC mark phase and can
14411443
// safely scan and blacken objects, run idle-time marking
14421444
// rather than give up the P.
14431445
if _p_ := _g_.m.p.ptr(); gcBlackenEnabled != 0 && _p_.gcBgMarkWorker != nil {
@@ -3461,20 +3463,30 @@ func runqget(_p_ *p) (gp *g, inheritTime bool) {
34613463
// Grabs a batch of goroutines from local runnable queue.
34623464
// batch array must be of size len(p->runq)/2. Returns number of grabbed goroutines.
34633465
// Can be executed by any P.
3464-
func runqgrab(_p_ *p, batch []*g) uint32 {
3466+
func runqgrab(_p_ *p, batch []*g, stealRunNextG bool) uint32 {
34653467
for {
34663468
h := atomicload(&_p_.runqhead) // load-acquire, synchronize with other consumers
34673469
t := atomicload(&_p_.runqtail) // load-acquire, synchronize with the producer
34683470
n := t - h
34693471
n = n - n/2
34703472
if n == 0 {
3471-
// Try to steal from _p_.runnext.
3472-
if next := _p_.runnext; next != 0 {
3473-
if !_p_.runnext.cas(next, 0) {
3474-
continue
3473+
if stealRunNextG {
3474+
// Try to steal from _p_.runnext.
3475+
if next := _p_.runnext; next != 0 {
3476+
// Sleep to ensure that _p_ isn't about to run the g we
3477+
// are about to steal.
3478+
// The important use case here is when the g running on _p_
3479+
// ready()s another g and then almost immediately blocks.
3480+
// Instead of stealing runnext in this window, back off
3481+
// to give _p_ a chance to schedule runnext. This will avoid
3482+
// thrashing gs between different Ps.
3483+
usleep(100)
3484+
if !_p_.runnext.cas(next, 0) {
3485+
continue
3486+
}
3487+
batch[0] = next.ptr()
3488+
return 1
34753489
}
3476-
batch[0] = next.ptr()
3477-
return 1
34783490
}
34793491
return 0
34803492
}
@@ -3493,15 +3505,13 @@ func runqgrab(_p_ *p, batch []*g) uint32 {
34933505
// Steal half of elements from local runnable queue of p2
34943506
// and put onto local runnable queue of p.
34953507
// Returns one of the stolen elements (or nil if failed).
3496-
func runqsteal(_p_, p2 *p) *g {
3497-
var batch [len(_p_.runq) / 2]*g
3498-
3499-
n := runqgrab(p2, batch[:])
3508+
func runqsteal(_p_, p2 *p, stealRunNextG bool) *g {
3509+
n := runqgrab(p2, _p_.runqvictims[:], stealRunNextG)
35003510
if n == 0 {
35013511
return nil
35023512
}
35033513
n--
3504-
gp := batch[n]
3514+
gp := _p_.runqvictims[n]
35053515
if n == 0 {
35063516
return gp
35073517
}
@@ -3511,7 +3521,7 @@ func runqsteal(_p_, p2 *p) *g {
35113521
throw("runqsteal: runq overflow")
35123522
}
35133523
for i := uint32(0); i < n; i++ {
3514-
_p_.runq[(t+i)%uint32(len(_p_.runq))] = batch[i]
3524+
_p_.runq[(t+i)%uint32(len(_p_.runq))] = _p_.runqvictims[i]
35153525
}
35163526
atomicstore(&_p_.runqtail, t+n) // store-release, makes the item available for consumption
35173527
return gp
@@ -3548,7 +3558,7 @@ func testSchedLocalQueueSteal() {
35483558
gs[j].sig = 0
35493559
runqput(p1, &gs[j], false)
35503560
}
3551-
gp := runqsteal(p2, p1)
3561+
gp := runqsteal(p2, p1, true)
35523562
s := 0
35533563
if gp != nil {
35543564
s++

src/runtime/runtime2.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -353,9 +353,10 @@ type p struct {
353353
goidcacheend uint64
354354

355355
// Queue of runnable goroutines. Accessed without lock.
356-
runqhead uint32
357-
runqtail uint32
358-
runq [256]*g
356+
runqhead uint32
357+
runqtail uint32
358+
runq [256]*g
359+
runqvictims [128]*g // Used to stage victims from another p's runq
359360
// runnext, if non-nil, is a runnable G that was ready'd by
360361
// the current G and should be run next instead of what's in
361362
// runq if there's time remaining in the running G's time

0 commit comments

Comments
 (0)