diff --git a/src/internal/goexperiment/exp_goleakfindergc_off.go b/src/internal/goexperiment/exp_goleakfindergc_off.go new file mode 100644 index 00000000000000..1a141fd5b7cfc7 --- /dev/null +++ b/src/internal/goexperiment/exp_goleakfindergc_off.go @@ -0,0 +1,8 @@ +// Code generated by mkconsts.go. DO NOT EDIT. + +//go:build !goexperiment.goroutineleakfindergc + +package goexperiment + +const GoroutineLeakFinderGC = false +const GoroutineLeakFinderGCInt = 0 diff --git a/src/internal/goexperiment/exp_goleakfindergc_on.go b/src/internal/goexperiment/exp_goleakfindergc_on.go new file mode 100644 index 00000000000000..8c816645927656 --- /dev/null +++ b/src/internal/goexperiment/exp_goleakfindergc_on.go @@ -0,0 +1,8 @@ +// Code generated by mkconsts.go. DO NOT EDIT. + +//go:build goexperiment.goroutineleakfindergc + +package goexperiment + +const GoroutineLeakFinderGC = true +const GoroutineLeakFinderGCInt = 1 diff --git a/src/internal/goexperiment/flags.go b/src/internal/goexperiment/flags.go index 63a338883991e0..53d6e92d37e7da 100644 --- a/src/internal/goexperiment/flags.go +++ b/src/internal/goexperiment/flags.go @@ -129,4 +129,7 @@ type Flags struct { // GreenTeaGC enables the Green Tea GC implementation. GreenTeaGC bool + + // GoroutineLeakFinderGC enables the Deadlock GC implementation. + GoroutineLeakFinderGC bool } diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index 9a4611e26e52a2..14028db30efeac 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -1208,7 +1208,7 @@ func (t *SemTable) Enqueue(addr *uint32) { s.releasetime = 0 s.acquiretime = 0 s.ticket = 0 - t.semTable.rootFor(addr).queue(addr, s, false) + t.semTable.rootFor(addr).queue(addr, s, false, false) } // Dequeue simulates dequeuing a waiter for a semaphore (or lock) at addr. diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index 7331886af27632..2661f878ecc3c6 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go @@ -1247,6 +1247,28 @@ func markBitsForSpan(base uintptr) (mbits markBits) { return mbits } +// isMarkedOrNotInHeap returns true if a pointer is in the heap and marked, +// or if the pointer is not in the heap. Used by goroutine leak detection +// to determine if concurrency resources are reachable in memory. +func isMarkedOrNotInHeap(p unsafe.Pointer) bool { + obj, span, objIndex := findObject(uintptr(p), 0, 0) + if obj != 0 { + mbits := span.markBitsForIndex(objIndex) + return mbits.isMarked() + } + + // If we fall through to get here, the object is not in the heap. + // In this case, it is either a pointer to a stack object or a global resource. + // Treat it as reachable in memory by default, to be safe. + // + // (vsaioc) TODO: we could possibly be more precise by only checking against the stacks + // of runnable goroutines. I don't think this is necessary, based on what we've seen, but + // let's keep the option open in case the runtime evolves. + // This will (naively) lead to quadratic blow-up for goroutine leak detection, + // but if it is only run on demand, maybe the extra cost is not a show-stopper. + return true +} + // advance advances the markBits to the next object in the span. func (m *markBits) advance() { if m.mask == 1<<7 { diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index f2df1a00e0c683..0f09ca067bcad3 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -373,17 +373,37 @@ type workType struct { // Number of roots of various root types. Set by gcPrepareMarkRoots. 
 	//
-	// nStackRoots == len(stackRoots), but we have nStackRoots for
-	// consistency.
-	nDataRoots, nBSSRoots, nSpanRoots, nStackRoots int
+	// During a normal GC cycle, nStackRoots == nLiveStackRoots == len(stackRoots);
+	// during goroutine leak detection, nLiveStackRoots is the number of stackRoots
+	// to examine, and nStackRoots == len(stackRoots), which also includes goroutines
+	// that are unmarked / not runnable.
+	nDataRoots, nBSSRoots, nSpanRoots, nStackRoots, nLiveStackRoots int
+
+	// The following fields monitor the GC phase of the current cycle during
+	// goroutine leak detection.
+	goroutineLeakFinder struct {
+		// The GC has been instructed to perform goroutine leak detection during the next GC cycle;
+		// it is set by FindGoLeaks() and unset during gcStart().
+		pending atomic.Bool
+		// The GC is running in goroutine leak detection mode; it is set during gcStart()
+		// and unset during gcMarkTermination(). Protected by STW.
+		enabled bool
+		// The GC has performed goroutine leak detection during the current GC cycle; it is set
+		// during gcMarkDone(), right after goroutine leak detection has concluded, and unset during
+		// gcStart(). Protected by STW.
+		done bool
+	}

 	// Base indexes of each root type. Set by gcPrepareMarkRoots.
 	baseData, baseBSS, baseSpans, baseStacks, baseEnd uint32

-	// stackRoots is a snapshot of all of the Gs that existed
-	// before the beginning of concurrent marking. The backing
-	// store of this must not be modified because it might be
-	// shared with allgs.
+	// stackRoots is a snapshot of all of the Gs that existed before the
+	// beginning of concurrent marking. During goroutine leak detection, stackRoots
+	// is partitioned into two sets: to the left of nLiveStackRoots are the stackRoots
+	// of running / runnable goroutines, and to the right of nLiveStackRoots are the
+	// stackRoots of unmarked / not runnable goroutines.
+	// gcDiscoverMoreStackRoots modifies the stackRoots array to redo the partition
+	// after each marking phase iteration.
 	stackRoots []*g

 	// Each type of GC state transition is protected by a lock.
@@ -550,6 +570,24 @@ func GC() {
 	releasem(mp)
 }

+// FindGoLeaks instructs the Go garbage collector to attempt
+// goroutine leak detection during the next GC cycle.
+//
+// It only operates if goroutineleakfindergc is enabled in GOEXPERIMENT.
+// Otherwise, it just runs runtime.GC().
+func FindGoLeaks() {
+	if !goexperiment.GoroutineLeakFinderGC {
+		GC()
+		return
+	}
+
+	work.goroutineLeakFinder.pending.Store(true)
+
+	for work.goroutineLeakFinder.pending.Load() {
+		GC()
+	}
+}
+
 // gcWaitOnMark blocks until GC finishes the Nth mark phase. If GC has
 // already completed this mark phase, it returns immediately.
 func gcWaitOnMark(n uint32) {
@@ -695,6 +733,11 @@ func gcStart(trigger gcTrigger) {
 		mode = gcForceMode
 	} else if debug.gcstoptheworld == 2 {
 		mode = gcForceBlockMode
+	} else if goexperiment.GoroutineLeakFinderGC {
+		if work.goroutineLeakFinder.pending.Load() {
+			// Fully stop the world when running goroutine leak detection.
+			mode = gcForceBlockMode
+		}
 	}

 	// Ok, we're doing it! Stop everybody else
@@ -757,6 +800,7 @@ func gcStart(trigger gcTrigger) {
 	clearpools()
 	work.cycles.Add(1)
+	work.goroutineLeakFinder.done = false

 	// Assists and workers can start the moment we start
 	// the world.
@@ -788,6 +832,11 @@ func gcStart(trigger gcTrigger) {
 	// possible.
setGCPhase(_GCmark) + if work.goroutineLeakFinder.pending.Load() { + work.goroutineLeakFinder.enabled = true + work.goroutineLeakFinder.pending.Store(false) + } + gcBgMarkPrepare() // Must happen before assists are enabled. gcPrepareMarkRoots() @@ -888,6 +937,9 @@ func gcMarkDone() { // Ensure only one thread is running the ragged barrier at a // time. semacquire(&work.markDoneSema) + if work.goroutineLeakFinder.enabled { + gcDiscoverMoreStackRoots() + } top: // Re-check transition condition under transition lock. @@ -994,6 +1046,25 @@ top: }) semrelease(&worldsema) goto top + } else if goexperiment.GoroutineLeakFinderGC { + // If we are detecting goroutine leaks, do so now. + if work.goroutineLeakFinder.enabled && !work.goroutineLeakFinder.done { + // Detect goroutine leaks. If the returned value is true, then + // detection was performed during this cycle. Otherwise, more mark work is needed, + // or live goroutines were found. + work.goroutineLeakFinder.done = findGoleaks() + + getg().m.preemptoff = "" + systemstack(func() { + // Accumulate the time we were stopped before we had to start again. + work.cpuStats.accumulateGCPauseTime(nanotime()-stw.finishedStopping, work.maxprocs) + + now := startTheWorldWithSema(0, stw) + work.pauseNS += now - stw.startedStopping + }) + semrelease(&worldsema) + goto top + } } gcComputeStartingStackSize() @@ -1032,6 +1103,158 @@ top: gcMarkTermination(stw) } +// checkIfMaybeRunnable checks whether a goroutine may still be semantically runnable. +// For goroutines which are semantically runnable, this will eventually return true +// as the GC marking phase progresses. It returns false for leaked goroutines, or for +// goroutines which are not yet computed as possibly runnable by the GC. +func (gp *g) checkIfMaybeRunnable() bool { + // Unmask the goroutine address to ensure we are not + // dereferencing a masked address. + gp = gp.unmask() + + switch gp.waitreason { + case waitReasonSelectNoCases, + waitReasonChanSendNilChan, + waitReasonChanReceiveNilChan: + // Select with no cases or communicating on nil channels + // make goroutines unrunnable by definition. + return false + case waitReasonChanReceive, + waitReasonSelect, + waitReasonChanSend: + // Cycle all through all *sudog to check whether + // the goroutine is waiting on a marked channel. + for sg := gp.waiting; sg != nil; sg = sg.waitlink { + if isMarkedOrNotInHeap(unsafe.Pointer(sg.c)) { + return true + } + } + return false + case waitReasonSyncCondWait, + waitReasonSyncWaitGroupWait, + waitReasonSyncMutexLock, + waitReasonSyncRWMutexLock, + waitReasonSyncRWMutexRLock: + // If waiting on mutexes, wait groups, or condition variables, + // check if the synchronization primitive attached to the sudog is marked. + if gp.waiting != nil { + // Unmask the sema address and check if it's marked. + return isMarkedOrNotInHeap(gcUnmask(gp.waiting.elem)) + } + } + return true +} + +// unmask returns a *g object with an unmasked address. +// +//go:nosplit +func (gp *g) unmask() *g { + return (*g)(gcUnmask(unsafe.Pointer(gp))) +} + +// mask returns a *g object with a masked address. 
+// +//go:nosplit +func (gp *g) mask() *g { + return (*g)(gcMask(unsafe.Pointer(gp))) +} + +// Check to see if more blocked but marked goroutines exist; +// if so add them into root set and increment work.markrootJobs accordingly +// return true if we need to run another phase of markroots; return false otherwise +func gcDiscoverMoreStackRoots() { + // to begin with we have a set of unchecked stackRoots between + // vIndex and ivIndex. During the loop, anything < vIndex should be + // valid stackRoots and anything >= ivIndex should be invalid stackRoots + // and the loop terminates when the two indices meet + var vIndex, ivIndex int = work.nLiveStackRoots, work.nStackRoots + + // Reorder goroutine list + for vIndex < ivIndex { + gp := work.stackRoots[vIndex] + if gp.checkIfMaybeRunnable() { + work.stackRoots[vIndex] = gp + vIndex = vIndex + 1 + continue + } + for ivIndex = ivIndex - 1; ivIndex != vIndex; ivIndex = ivIndex - 1 { + if swapGp := work.stackRoots[ivIndex]; swapGp.checkIfMaybeRunnable() { + work.stackRoots[ivIndex] = gp + work.stackRoots[vIndex] = swapGp.unmask() + vIndex = vIndex + 1 + break + } + } + } + + var oldRootJobs int32 = int32(atomic.Load(&work.markrootJobs)) + var newRootJobs int32 = int32(work.baseStacks) + int32(vIndex) + + if newRootJobs > oldRootJobs { + // reset markrootNext as it could have been incremented past markrootJobs + work.nLiveStackRoots = vIndex + atomic.Store(&work.markrootJobs, uint32(newRootJobs)) + } +} + +// findGoleaks scans the remaining stackRoots and marks any which are +// blocked over exclusively unreachable concurrency primitives as leaked (deadlocked). +// Returns true if the goroutine leak check was performed (or unnecessary). +// Returns false if the GC cycle has not yet computed all (maybe-)live goroutines. +func findGoleaks() bool { + // Report goroutine leaks and mark them unreachable, and resume marking + // we still need to mark these unreachable *g structs as they + // get reused, but their stack won't get scanned + if work.nLiveStackRoots == work.nStackRoots { + // nStackRoots == nLiveStackRoots means that all goroutines are marked. + return true + } + + // Try to reach another fix point here. Keep scouting for runnable goroutines until + // none are left. + // Valid goroutines may be found after all GC work is drained. + // Make sure these are pushed to the runnable set and ready to be marked. + var foundMoreWork bool + for i := work.nLiveStackRoots; i < work.nStackRoots; i++ { + gp := work.stackRoots[i].unmask() + if readgstatus(gp) == _Gwaiting && !gp.checkIfMaybeRunnable() { + // Blocking unrunnable goroutines will be skipped. + continue + } + work.stackRoots[i] = work.stackRoots[work.nLiveStackRoots] + work.stackRoots[work.nLiveStackRoots] = gp + work.nLiveStackRoots += 1 + // We now have one more markroot job. + work.markrootJobs += 1 + // We might still have some work to do. + // Make sure in the next iteration we will check re-check for new runnable goroutines. + foundMoreWork = true + } + if foundMoreWork { + // We found more work, so we need to resume the marking phase. + return false + } + + // For the remaining goroutines, mark them as unreachable and leaked. + for i := work.nLiveStackRoots; i < work.nStackRoots; i++ { + gp := work.stackRoots[i].unmask() + casgstatus(gp, _Gwaiting, _Gleaked) + fn := findfunc(gp.startpc) + if fn.valid() { + print("goroutine leak! goroutine ", gp.goid, ": ", funcname(fn), " Stack size: ", gp.stack.hi-gp.stack.lo, " bytes\n") + } else { + print("goroutine leak! 
goroutine ", gp.goid, ": !unnamed goroutine!", " Stack size: ", gp.stack.hi-gp.stack.lo, " bytes\n") + } + traceback(gp.sched.pc, gp.sched.sp, gp.sched.lr, gp) + println() + work.stackRoots[i] = gp + } + // Put the remaining roots as ready for marking and drain them. + work.markrootJobs += uint32(work.nStackRoots - work.nLiveStackRoots) + work.nLiveStackRoots = work.nStackRoots + return true +} + // World must be stopped and mark assists and background workers must be // disabled. func gcMarkTermination(stw worldStop) { @@ -1185,6 +1408,9 @@ func gcMarkTermination(stw worldStop) { } systemstack(func() { + // Pull the GC out of goroutine leak detection mode. + work.goroutineLeakFinder.enabled = false + // The memstats updated above must be updated with the world // stopped to ensure consistency of some values, such as // sched.idleTime and sched.totaltime. memstats also include @@ -1612,7 +1838,9 @@ func gcMarkWorkAvailable(p *p) bool { if !work.full.empty() || !work.spanq.empty() { return true // global work available } - if work.markrootNext < work.markrootJobs { + rootNext := atomic.Load(&work.markrootNext) + rootJobs := atomic.Load(&work.markrootJobs) + if rootNext < rootJobs { return true // root scan work available } return false @@ -1628,8 +1856,10 @@ func gcMark(startTime int64) { work.tstart = startTime // Check that there's no marking work remaining. - if work.full != 0 || work.markrootNext < work.markrootJobs || !work.spanq.empty() { - print("runtime: full=", hex(work.full), " next=", work.markrootNext, " jobs=", work.markrootJobs, " nDataRoots=", work.nDataRoots, " nBSSRoots=", work.nBSSRoots, " nSpanRoots=", work.nSpanRoots, " nStackRoots=", work.nStackRoots, " spanq.n=", work.spanq.size(), "\n") + rootNext := atomic.Load(&work.markrootNext) + rootJobs := atomic.Load(&work.markrootJobs) + if work.full != 0 || rootNext < rootJobs { + print("runtime: full=", hex(work.full), " next=", rootNext, " jobs=", rootJobs, " nDataRoots=", work.nDataRoots, " nBSSRoots=", work.nBSSRoots, " nSpanRoots=", work.nSpanRoots, " nStackRoots=", work.nStackRoots, "\n") panic("non-empty mark queue after concurrent mark") } diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index a136c7aeaceda2..557b20ee918cc0 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -51,8 +51,76 @@ const ( // Must be a multiple of the pageInUse bitmap element size and // must also evenly divide pagesPerArena. pagesPerSpanRoot = 512 + + gcUndoBitMask = uintptr(uintptrMask >> 2) // This constant reserves some bits of the address space for the GC to use in order to mask addresses. + gcBitMask = ^gcUndoBitMask // This flips every bit in gcUndoBitMask of uinptr width ) +// gcMask masks addresses that should not be automatically marked during the GC. +// +//go:nosplit +func gcMask(p unsafe.Pointer) unsafe.Pointer { + if goexperiment.GoroutineLeakFinderGC { + return unsafe.Pointer(uintptr(p) | gcBitMask) + } + return p +} + +// gcUnmask undoes the bit-mask applied to a pointer. +// +//go:nosplit +func gcUnmask(p unsafe.Pointer) unsafe.Pointer { + if goexperiment.GoroutineLeakFinderGC { + return unsafe.Pointer(uintptr(p) & gcUndoBitMask) + } + return p +} + +// internalBlocked returns true if the goroutine is blocked due to an +// internal (non-leaking) waitReason, e.g. waiting for the netpoller or garbage collector. +// Such goroutines are never leak detection candidates according to the GC. 
+//
+//go:nosplit
+func (gp *g) internalBlocked() bool {
+	reason := gp.waitreason
+	return reason < waitReasonChanReceiveNilChan || waitReasonSyncWaitGroupWait < reason
+}
+
+// allGsSnapshotSortedForGC goes through the snapshot of allgs, putting the Gs
+// into an array partitioned by index, where [0:blockedIndex] contains only
+// running Gs and [blockedIndex:] contains only blocked Gs.
+// To keep the GC from marking and scanning the blocked Gs by scanning
+// the returned array (which is heap allocated), we mask the high bits
+// of the pointers to those Gs with gcBitMask.
+//
+// The world must be stopped or allglock must be held.
+func allGsSnapshotSortedForGC() ([]*g, int) {
+	assertWorldStoppedOrLockHeld(&allglock)
+
+	allgsSorted := make([]*g, len(allgs))
+
+	// Indices cutting off runnable and blocked Gs.
+	var currIndex, blockedIndex = 0, len(allgsSorted) - 1
+	for _, gp := range allgs {
+		gp = gp.unmask()
+		// Not sure if we need an atomic load because we are stopping the world,
+		// but do it just to be safe for now.
+		if status := readgstatus(gp); status != _Gwaiting || gp.internalBlocked() {
+			allgsSorted[currIndex] = gp
+			currIndex++
+		} else {
+			allgsSorted[blockedIndex] = gp.mask()
+			blockedIndex--
+		}
+	}
+
+	// Because the world is stopped or allglock is held, allgadd
+	// cannot happen concurrently with this. allgs grows
+	// monotonically and existing entries never change, so we can
+	// simply return a copy of the slice header. For added safety,
+	// we trim everything past len because that can still change.
+	return allgsSorted, blockedIndex + 1
+}
+
 // gcPrepareMarkRoots queues root scanning jobs (stacks, globals, and
 // some miscellany) and initializes scanning-related state.
 //
@@ -102,11 +170,18 @@ func gcPrepareMarkRoots() {
 	// ignore them because they begin life without any roots, so
 	// there's nothing to scan, and any roots they create during
 	// the concurrent phase will be caught by the write barrier.
-	work.stackRoots = allGsSnapshot()
+	if work.goroutineLeakFinder.enabled {
+		work.stackRoots, work.nLiveStackRoots = allGsSnapshotSortedForGC()
+	} else {
+		// Regular GC: scan every goroutine.
+		work.stackRoots = allGsSnapshot()
+		work.nLiveStackRoots = len(work.stackRoots)
+	}
+
 	work.nStackRoots = len(work.stackRoots)

 	work.markrootNext = 0
-	work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots)
+	work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nLiveStackRoots)

 	// Calculate base indexes of each root type
 	work.baseData = uint32(fixedRootCount)
@@ -119,8 +194,10 @@
 // gcMarkRootCheck checks that all roots have been scanned. It is
 // purely for debugging.
func gcMarkRootCheck() { - if work.markrootNext < work.markrootJobs { - print(work.markrootNext, " of ", work.markrootJobs, " markroot jobs done\n") + rootNext := atomic.Load(&work.markrootNext) + rootJobs := atomic.Load(&work.markrootJobs) + if rootNext < rootJobs { + print(rootNext, " of ", rootJobs, " markroot jobs done\n") throw("left over markroot jobs") } @@ -868,7 +945,7 @@ func scanstack(gp *g, gcw *gcWork) int64 { case _Grunning: print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n") throw("scanstack: goroutine not stopped") - case _Grunnable, _Gsyscall, _Gwaiting: + case _Grunnable, _Gsyscall, _Gwaiting, _Gleaked: // ok } @@ -1136,6 +1213,32 @@ func gcDrainMarkWorkerFractional(gcw *gcWork) { gcDrain(gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit) } +func gcUpdateMarkrootNext() (uint32, bool) { + var success bool + var next uint32 = atomic.Load(&work.markrootNext) + var jobs uint32 = atomic.Load(&work.markrootJobs) + + if next < jobs { + // still work available at the moment + for !success { + success = atomic.Cas(&work.markrootNext, next, next+1) + // We manage to snatch a root job. Return the root index. + if success { + return next, true + } + + // Get the latest value of markrootNext. + next = atomic.Load(&work.markrootNext) + jobs := atomic.Load(&work.markrootJobs) + // We are out of markroot jobs. + if next >= jobs { + break + } + } + } + return 0, false +} + // gcDrain scans roots and objects in work buffers, blackening grey // objects until it is unable to get more work. It may return before // GC is done; it's the caller's responsibility to balance work from @@ -1194,13 +1297,14 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) { } } - // Drain root marking jobs. - if work.markrootNext < work.markrootJobs { + rootNext := atomic.Load(&work.markrootNext) + rootJobs := atomic.Load(&work.markrootJobs) + if rootNext < rootJobs { // Stop if we're preemptible, if someone wants to STW, or if // someone is calling forEachP. for !(gp.preempt && (preemptible || sched.gcwaiting.Load() || pp.runSafePointFn != 0)) { - job := atomic.Xadd(&work.markrootNext, +1) - 1 - if job >= work.markrootJobs { + job, success := gcUpdateMarkrootNext() + if !success { break } markroot(gcw, job, flushBgCredit) @@ -1346,9 +1450,9 @@ func gcDrainN(gcw *gcWork, scanWork int64) int64 { wbBufFlush() if b = gcw.tryGetObj(); b == 0 { // Try to do a root job. - if work.markrootNext < work.markrootJobs { - job := atomic.Xadd(&work.markrootNext, +1) - 1 - if job < work.markrootJobs { + if atomic.Load(&work.markrootNext) < atomic.Load(&work.markrootJobs) { + job, success := gcUpdateMarkrootNext() + if success { workFlushed += markroot(gcw, job, false) continue } @@ -1512,7 +1616,9 @@ func scanobject(b uintptr, gcw *gcWork) { // At this point we have extracted the next potential pointer. // Quickly filter out nil and pointers back to the current object. - if obj != 0 && obj-b >= n { + // The GC will skip masked addresses if GoroutineLeakFinderGC is enabled. + if obj != 0 && obj-b >= n && + (!goexperiment.GoroutineLeakFinderGC || obj <= gcUndoBitMask) { // Test if obj points into the Go heap and, if so, // mark the object. 
// diff --git a/src/runtime/preempt.go b/src/runtime/preempt.go index c41c3558359c0c..586abc433ffc0b 100644 --- a/src/runtime/preempt.go +++ b/src/runtime/preempt.go @@ -160,7 +160,7 @@ func suspendG(gp *g) suspendGState { s = _Gwaiting fallthrough - case _Grunnable, _Gsyscall, _Gwaiting: + case _Grunnable, _Gsyscall, _Gwaiting, _Gleaked: // Claim goroutine by setting scan bit. // This may race with execution or readying of gp. // The scan bit keeps it from transition state. @@ -269,6 +269,7 @@ func resumeG(state suspendGState) { case _Grunnable | _Gscan, _Gwaiting | _Gscan, + _Gleaked | _Gscan, _Gsyscall | _Gscan: casfrom_Gscanstatus(gp, s, s&^_Gscan) } diff --git a/src/runtime/proc.go b/src/runtime/proc.go index b41bbe93cf57c7..3c0a5144d2cc86 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -8,6 +8,7 @@ import ( "internal/abi" "internal/cpu" "internal/goarch" + "internal/goexperiment" "internal/goos" "internal/runtime/atomic" "internal/runtime/exithook" @@ -689,7 +690,7 @@ func allgadd(gp *g) { } lock(&allglock) - allgs = append(allgs, gp) + allgs = append(allgs, gp.mask()) if &allgs[0] != allgptr { atomicstorep(unsafe.Pointer(&allgptr), unsafe.Pointer(&allgs[0])) } @@ -708,6 +709,11 @@ func allGsSnapshot() []*g { // monotonically and existing entries never change, so we can // simply return a copy of the slice header. For added safety, // we trim everything past len because that can still change. + if goexperiment.GoroutineLeakFinderGC { + for i, gp := range allgs { + allgs[i] = gp.unmask() + } + } return allgs[:len(allgs):len(allgs)] } @@ -729,7 +735,7 @@ func atomicAllGIndex(ptr **g, i uintptr) *g { func forEachG(fn func(gp *g)) { lock(&allglock) for _, gp := range allgs { - fn(gp) + fn(gp.unmask()) } unlock(&allglock) } @@ -742,7 +748,7 @@ func forEachGRace(fn func(gp *g)) { ptr, length := atomicAllG() for i := uintptr(0); i < length; i++ { gp := atomicAllGIndex(ptr, i) - fn(gp) + fn(gp.unmask()) } return } @@ -1208,6 +1214,7 @@ func casfrom_Gscanstatus(gp *g, oldval, newval uint32) { _Gscanwaiting, _Gscanrunning, _Gscansyscall, + _Gscanleaked, _Gscanpreempted: if newval == oldval&^_Gscan { success = gp.atomicstatus.CompareAndSwap(oldval, newval) @@ -1228,6 +1235,7 @@ func castogscanstatus(gp *g, oldval, newval uint32) bool { case _Grunnable, _Grunning, _Gwaiting, + _Gleaked, _Gsyscall: if newval == oldval|_Gscan { r := gp.atomicstatus.CompareAndSwap(oldval, newval) diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 527611f96a29d9..fe0aad02a01818 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -87,6 +87,9 @@ const ( // ready()ing this G. _Gpreempted // 9 + // _Gleaked represents a leaked goroutine caught by the GC. + _Gleaked // 10 + // _Gscan combined with one of the above states other than // _Grunning indicates that GC is scanning the stack. 
The // goroutine is not executing user code and the stack is owned @@ -104,6 +107,7 @@ const ( _Gscansyscall = _Gscan + _Gsyscall // 0x1003 _Gscanwaiting = _Gscan + _Gwaiting // 0x1004 _Gscanpreempted = _Gscan + _Gpreempted // 0x1009 + _Gscanleaked = _Gscan + _Gleaked // 0x100a ) const ( @@ -1056,24 +1060,24 @@ const ( waitReasonZero waitReason = iota // "" waitReasonGCAssistMarking // "GC assist marking" waitReasonIOWait // "IO wait" - waitReasonChanReceiveNilChan // "chan receive (nil chan)" - waitReasonChanSendNilChan // "chan send (nil chan)" waitReasonDumpingHeap // "dumping heap" waitReasonGarbageCollection // "garbage collection" waitReasonGarbageCollectionScan // "garbage collection scan" waitReasonPanicWait // "panicwait" - waitReasonSelect // "select" - waitReasonSelectNoCases // "select (no cases)" waitReasonGCAssistWait // "GC assist wait" waitReasonGCSweepWait // "GC sweep wait" waitReasonGCScavengeWait // "GC scavenge wait" - waitReasonChanReceive // "chan receive" - waitReasonChanSend // "chan send" waitReasonFinalizerWait // "finalizer wait" waitReasonForceGCIdle // "force gc (idle)" waitReasonUpdateGOMAXPROCSIdle // "GOMAXPROCS updater (idle)" waitReasonSemacquire // "semacquire" waitReasonSleep // "sleep" + waitReasonChanReceiveNilChan // "chan receive (nil chan)" + waitReasonChanSendNilChan // "chan send (nil chan)" + waitReasonSelect // "select" + waitReasonSelectNoCases // "select (no cases)" + waitReasonChanReceive // "chan receive" + waitReasonChanSend // "chan send" waitReasonSyncCondWait // "sync.Cond.Wait" waitReasonSyncMutexLock // "sync.Mutex.Lock" waitReasonSyncRWMutexRLock // "sync.RWMutex.RLock" @@ -1159,12 +1163,24 @@ func (w waitReason) String() string { return waitReasonStrings[w] } +// isMutexWait returns true if the goroutine is blocked because of +// sync.Mutex.Lock or sync.RWMutex.[R]Lock. +// +//go:nosplit func (w waitReason) isMutexWait() bool { return w == waitReasonSyncMutexLock || w == waitReasonSyncRWMutexRLock || w == waitReasonSyncRWMutexLock } +// isSyncWait returns true if the goroutine is blocked because of +// sync library primitive operations. +// +//go:nosplit +func (w waitReason) isSyncWait() bool { + return waitReasonSyncCondWait <= w && w <= waitReasonSyncWaitGroupWait +} + func (w waitReason) isWaitingForSuspendG() bool { return isWaitingForSuspendG[w] } diff --git a/src/runtime/sema.go b/src/runtime/sema.go index 6af49b1b0c42d9..08167ff217a6a6 100644 --- a/src/runtime/sema.go +++ b/src/runtime/sema.go @@ -21,6 +21,7 @@ package runtime import ( "internal/cpu" + "internal/goexperiment" "internal/runtime/atomic" "unsafe" ) @@ -188,7 +189,7 @@ func semacquire1(addr *uint32, lifo bool, profile semaProfileFlags, skipframes i } // Any semrelease after the cansemacquire knows we're waiting // (we set nwait above), so go to sleep. - root.queue(addr, s, lifo) + root.queue(addr, s, lifo, reason.isSyncWait()) goparkunlock(&root.lock, reason, traceBlockSync, 4+skipframes) if s.ticket != 0 || cansemacquire(addr) { break @@ -301,9 +302,18 @@ func cansemacquire(addr *uint32) bool { } // queue adds s to the blocked goroutines in semaRoot. 
-func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool) { +func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool, syncSema bool) { s.g = getg() - s.elem = unsafe.Pointer(addr) + pAddr := unsafe.Pointer(addr) + if goexperiment.GoroutineLeakFinderGC { + if syncSema { + // Mask the addr so it doesn't get marked during GC + // through marking of the treap or marking of the blocked goroutine + pAddr = gcMask(unsafe.Pointer(addr)) + s.g.waiting = s + } + } + s.elem = pAddr s.next = nil s.prev = nil s.waiters = 0 @@ -311,7 +321,13 @@ func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool) { var last *sudog pt := &root.treap for t := *pt; t != nil; t = *pt { - if t.elem == unsafe.Pointer(addr) { + var cmp bool + if goexperiment.GoroutineLeakFinderGC { + cmp = uintptr(gcUnmask(pAddr)) == uintptr(gcUnmask(t.elem)) + } else { + cmp = uintptr(pAddr) == uintptr(t.elem) + } + if cmp { // Already have addr in list. if lifo { // Substitute s in t's place in treap. @@ -357,7 +373,12 @@ func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool) { return } last = t - if uintptr(unsafe.Pointer(addr)) < uintptr(t.elem) { + if goexperiment.GoroutineLeakFinderGC { + cmp = uintptr(gcUnmask(pAddr)) < uintptr(gcUnmask(t.elem)) + } else { + cmp = uintptr(pAddr) < uintptr(t.elem) + } + if cmp { pt = &t.prev } else { pt = &t.next @@ -402,11 +423,24 @@ func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool) { func (root *semaRoot) dequeue(addr *uint32) (found *sudog, now, tailtime int64) { ps := &root.treap s := *ps + for ; s != nil; s = *ps { - if s.elem == unsafe.Pointer(addr) { + var cmp bool + if goexperiment.GoroutineLeakFinderGC { + cmp = gcUnmask(unsafe.Pointer(addr)) == gcUnmask(s.elem) + } else { + cmp = unsafe.Pointer(addr) == s.elem + } + if cmp { goto Found } - if uintptr(unsafe.Pointer(addr)) < uintptr(s.elem) { + + if goexperiment.GoroutineLeakFinderGC { + cmp = uintptr(gcUnmask(unsafe.Pointer(addr))) < uintptr(gcUnmask(s.elem)) + } else { + cmp = uintptr(unsafe.Pointer(addr)) < uintptr(s.elem) + } + if cmp { ps = &s.prev } else { ps = &s.next @@ -470,6 +504,10 @@ Found: } tailtime = s.acquiretime } + if goexperiment.GoroutineLeakFinderGC { + // Goroutine is no longer blocked. Clear the waiting pointer. + s.g.waiting = nil + } s.parent = nil s.elem = nil s.next = nil @@ -590,6 +628,13 @@ func notifyListWait(l *notifyList, t uint32) { // Enqueue itself. s := acquireSudog() s.g = getg() + if goexperiment.GoroutineLeakFinderGC { + // Storing this pointer (masked) so that we can trace + // the condvar address from the blocked goroutine when + // checking for goroutine leaks. + s.elem = gcMask(unsafe.Pointer(l)) + s.g.waiting = s + } s.ticket = t s.releasetime = 0 t0 := int64(0) @@ -607,6 +652,12 @@ func notifyListWait(l *notifyList, t uint32) { if t0 != 0 { blockevent(s.releasetime-t0, 2) } + if goexperiment.GoroutineLeakFinderGC { + // Goroutine is no longer blocked. Clear up its waiting pointer, + // and clean up the sudog before releasing it. 
+ s.g.waiting = nil + s.elem = nil + } releaseSudog(s) } diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go index 00c0f08e5593c8..e8fef35da7d104 100644 --- a/src/runtime/traceback.go +++ b/src/runtime/traceback.go @@ -1206,6 +1206,7 @@ var gStatusStrings = [...]string{ _Gwaiting: "waiting", _Gdead: "dead", _Gcopystack: "copystack", + _Gleaked: "leaked", _Gpreempted: "preempted", } diff --git a/src/runtime/tracestatus.go b/src/runtime/tracestatus.go index 03ec81fc0262a1..8b5eafd170f488 100644 --- a/src/runtime/tracestatus.go +++ b/src/runtime/tracestatus.go @@ -122,7 +122,7 @@ func goStatusToTraceGoStatus(status uint32, wr waitReason) tracev2.GoStatus { tgs = tracev2.GoRunning case _Gsyscall: tgs = tracev2.GoSyscall - case _Gwaiting, _Gpreempted: + case _Gwaiting, _Gpreempted, _Gleaked: // There are a number of cases where a G might end up in // _Gwaiting but it's actually running in a non-preemptive // state but needs to present itself as preempted to the
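A few illustrative sketches to accompany the patch. None of the code below is part of the diff; it is ordinary user-level Go written against the behavior documented above, with any names not taken from the patch labeled as assumptions.

The new `runtime.FindGoLeaks` entry point added in mgc.go can be exercised end to end with a small program. This sketch assumes a toolchain built with this patch and `GOEXPERIMENT=goroutineleakfindergc`; without the experiment, `FindGoLeaks` simply falls back to `runtime.GC()`, as its doc comment states.

```go
package main

import (
	"runtime"
	"time"
)

// leak starts a goroutine that blocks forever on a channel receive.
// Once leak returns, nothing else references ch, so the goroutine can
// never be resumed: a classic goroutine leak.
func leak() {
	ch := make(chan int)
	go func() {
		<-ch
	}()
}

func main() {
	leak()
	time.Sleep(100 * time.Millisecond) // give the goroutine time to park

	// With GOEXPERIMENT=goroutineleakfindergc, this runs a GC cycle in
	// goroutine leak detection mode: the runtime transitions the blocked
	// goroutine to _Gleaked and prints a "goroutine leak!" report with a
	// traceback. Otherwise this behaves like runtime.GC().
	runtime.FindGoLeaks()
}
```

Run with, e.g., `GOEXPERIMENT=goroutineleakfindergc go run main.go` on a toolchain that includes this patch.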
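`gcMask` and `gcUnmask` in mgcmark.go hide blocked `*g` and sync-primitive pointers from the scanner by setting the top bits that `gcUndoBitMask` leaves clear; `scanobject` then skips any candidate pointer above `gcUndoBitMask`. Below is a user-space sketch of the same bit arithmetic, assuming a 64-bit address space; the names mirror the patch, but the values are plain `uint64`s, not real pointers.

```go
package main

import "fmt"

const (
	// In the runtime, gcUndoBitMask is uintptrMask >> 2; here we model
	// uintptr as uint64 (assumption: 64-bit platform).
	gcUndoBitMask = ^uint64(0) >> 2 // low 62 bits: the real address
	gcBitMask     = ^gcUndoBitMask  // top 2 bits: the "hidden from the GC" tag
)

// mask tags an address so a scanner that filters on gcUndoBitMask skips it.
func mask(p uint64) uint64 { return p | gcBitMask }

// unmask recovers the original address from a tagged one.
func unmask(p uint64) uint64 { return p & gcUndoBitMask }

func main() {
	addr := uint64(0x0000_00c0_0012_3456) // a plausible heap-address shape
	m := mask(addr)
	fmt.Printf("masked:   %#x\n", m)         // top bits set; a scanner would skip it
	fmt.Printf("unmasked: %#x\n", unmask(m)) // round-trips to the original address
	fmt.Println("round trip ok:", unmask(m) == addr)
}
```

The round trip is lossless as long as real addresses never use the top bits, which is the same assumption the patch's `obj <= gcUndoBitMask` filter in `scanobject` relies on.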
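`allGsSnapshotSortedForGC` and `gcDiscoverMoreStackRoots` both maintain the same invariant over `work.stackRoots`: a prefix of goroutines the GC should scan and a suffix of (masked) blocked ones, with `nLiveStackRoots` as the boundary. The in-place two-index sweep they use is shown below on a plain `int` slice, with an arbitrary predicate standing in for `checkIfMaybeRunnable`; this is a simplified illustration, not runtime code.

```go
package main

import "fmt"

// partitionLive mirrors the two-index sweep in gcDiscoverMoreStackRoots:
// everything left of the returned index satisfies maybeRunnable, everything
// to the right does not. The slice is reordered in place.
func partitionLive(gs []int, maybeRunnable func(int) bool) int {
	v, iv := 0, len(gs) // v: first unchecked; iv: one past the last unchecked
	for v < iv {
		if maybeRunnable(gs[v]) {
			v++
			continue
		}
		// gs[v] looks blocked: search from the right for a runnable
		// element to swap into the live prefix.
		for iv--; iv != v; iv-- {
			if maybeRunnable(gs[iv]) {
				gs[v], gs[iv] = gs[iv], gs[v]
				v++
				break
			}
		}
	}
	return v
}

func main() {
	gs := []int{1, -2, 3, -4, -5, 6}
	n := partitionLive(gs, func(x int) bool { return x > 0 })
	fmt.Println(gs[:n], gs[n:]) // "runnable" positives first, "blocked" negatives after
}
```

After each sweep, only the prefix needs to be handed to `markroot`, which is why `gcDiscoverMoreStackRoots` advances `work.markrootJobs` to exactly the new boundary.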
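The patch replaces the unconditional `atomic.Xadd(&work.markrootNext, +1)` with `gcUpdateMarkrootNext` because `markrootJobs` can now grow mid-cycle, and `markrootNext` must never be pushed past it. The claim loop is the familiar load/compare-and-swap pattern; here is a standalone sketch using `sync/atomic` (the names are illustrative, not the runtime's).

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// claimNext mirrors the gcUpdateMarkrootNext pattern from the patch:
// claim job indices with CAS so "next" never overshoots "jobs", which
// matters once jobs can grow while workers are still draining.
func claimNext(next, jobs *atomic.Uint32) (uint32, bool) {
	for {
		n := next.Load()
		if n >= jobs.Load() {
			return 0, false // no work right now
		}
		if next.CompareAndSwap(n, n+1) {
			return n, true // we own index n
		}
		// Lost the race; reload and retry.
	}
}

func main() {
	var next, jobs, done atomic.Uint32
	jobs.Store(10)

	var wg sync.WaitGroup
	for w := 0; w < 4; w++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for {
				if _, ok := claimNext(&next, &jobs); !ok {
					return
				}
				done.Add(1)
			}
		}()
	}
	wg.Wait()
	fmt.Println("jobs executed:", done.Load(), "next:", next.Load()) // 10, 10: never overshoots
}
```

Unlike `Xadd`, a failed claim leaves the counter untouched, so a worker that loses the race simply retries or bails out when `next` catches up with `jobs`.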
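The `waitReason` constants in runtime2.go are reordered so that every reason the detector cares about (nil-channel operations, channel send/receive, select, and the sync primitives) forms one contiguous block; `internalBlocked` and `isSyncWait` then reduce to two comparisons each. A toy version of the same trick, with hypothetical reason names of my own:

```go
package main

import "fmt"

type waitReason int

const (
	reasonIOWait   waitReason = iota // internal: never a leak candidate
	reasonGCAssist                   // internal
	// --- leak-candidate block starts here ---
	reasonChanReceive
	reasonChanSend
	reasonSelect
	reasonSyncMutexLock
	reasonSyncWaitGroupWait
	// --- leak-candidate block ends here ---
	reasonSleep // internal again
)

// internalBlocked mirrors the range comparison in the patch's mgcmark.go.
func (r waitReason) internalBlocked() bool {
	return r < reasonChanReceive || reasonSyncWaitGroupWait < r
}

func main() {
	fmt.Println(reasonIOWait.internalBlocked())   // true
	fmt.Println(reasonChanSend.internalBlocked()) // false: leak candidate
	fmt.Println(reasonSleep.internalBlocked())    // true
}
```

The cost of this encoding is that inserting a new `waitReason` in the wrong place silently changes the classification, so the boundaries of the block are worth a comment (or a test) next to the constant list.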