diff --git a/src/internal/goexperiment/exp_goleakfindergc_off.go b/src/internal/goexperiment/exp_goleakfindergc_off.go new file mode 100644 index 00000000000000..1a141fd5b7cfc7 --- /dev/null +++ b/src/internal/goexperiment/exp_goleakfindergc_off.go @@ -0,0 +1,8 @@ +// Code generated by mkconsts.go. DO NOT EDIT. + +//go:build !goexperiment.goroutineleakfindergc + +package goexperiment + +const GoroutineLeakFinderGC = false +const GoroutineLeakFinderGCInt = 0 diff --git a/src/internal/goexperiment/exp_goleakfindergc_on.go b/src/internal/goexperiment/exp_goleakfindergc_on.go new file mode 100644 index 00000000000000..8c816645927656 --- /dev/null +++ b/src/internal/goexperiment/exp_goleakfindergc_on.go @@ -0,0 +1,8 @@ +// Code generated by mkconsts.go. DO NOT EDIT. + +//go:build goexperiment.goroutineleakfindergc + +package goexperiment + +const GoroutineLeakFinderGC = true +const GoroutineLeakFinderGCInt = 1 diff --git a/src/internal/goexperiment/flags.go b/src/internal/goexperiment/flags.go index 63a338883991e0..53d6e92d37e7da 100644 --- a/src/internal/goexperiment/flags.go +++ b/src/internal/goexperiment/flags.go @@ -129,4 +129,7 @@ type Flags struct { // GreenTeaGC enables the Green Tea GC implementation. GreenTeaGC bool + + // GoroutineLeakFinderGC enables the Deadlock GC implementation. + GoroutineLeakFinderGC bool } diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index 9a4611e26e52a2..14028db30efeac 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -1208,7 +1208,7 @@ func (t *SemTable) Enqueue(addr *uint32) { s.releasetime = 0 s.acquiretime = 0 s.ticket = 0 - t.semTable.rootFor(addr).queue(addr, s, false) + t.semTable.rootFor(addr).queue(addr, s, false, false) } // Dequeue simulates dequeuing a waiter for a semaphore (or lock) at addr. diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index 7331886af27632..2661f878ecc3c6 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go @@ -1247,6 +1247,28 @@ func markBitsForSpan(base uintptr) (mbits markBits) { return mbits } +// isMarkedOrNotInHeap returns true if a pointer is in the heap and marked, +// or if the pointer is not in the heap. Used by goroutine leak detection +// to determine if concurrency resources are reachable in memory. +func isMarkedOrNotInHeap(p unsafe.Pointer) bool { + obj, span, objIndex := findObject(uintptr(p), 0, 0) + if obj != 0 { + mbits := span.markBitsForIndex(objIndex) + return mbits.isMarked() + } + + // If we fall through to get here, the object is not in the heap. + // In this case, it is either a pointer to a stack object or a global resource. + // Treat it as reachable in memory by default, to be safe. + // + // (vsaioc) TODO: we could possibly be more precise by only checking against the stacks + // of runnable goroutines. I don't think this is necessary, based on what we've seen, but + // let's keep the option open in case the runtime evolves. + // This will (naively) lead to quadratic blow-up for goroutine leak detection, + // but if it is only run on demand, maybe the extra cost is not a show-stopper. + return true +} + // advance advances the markBits to the next object in the span. func (m *markBits) advance() { if m.mask == 1<<7 { diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index f2df1a00e0c683..0f09ca067bcad3 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -373,17 +373,37 @@ type workType struct { // Number of roots of various root types. Set by gcPrepareMarkRoots. 
 	//
-	// nStackRoots == len(stackRoots), but we have nStackRoots for
-	// consistency.
-	nDataRoots, nBSSRoots, nSpanRoots, nStackRoots int
+	// During a normal GC cycle, nStackRoots == nLiveStackRoots == len(stackRoots);
+	// during goroutine leak detection, nLiveStackRoots is the number of stackRoots
+	// to examine, and nStackRoots == len(stackRoots), which also includes goroutines
+	// that are unmarked / not runnable.
+	nDataRoots, nBSSRoots, nSpanRoots, nStackRoots, nLiveStackRoots int
+
+	// The following fields monitor the GC phase of the current cycle during
+	// goroutine leak detection.
+	goroutineLeakFinder struct {
+		// The GC has been instructed to perform goroutine leak detection during the next GC cycle;
+		// it is set by FindGoLeaks() and unset during gcStart().
+		pending atomic.Bool
+		// The GC is running in goroutine leak detection mode; it is set during gcStart()
+		// and unset during gcMarkTermination(). Protected by STW.
+		enabled bool
+		// The GC has performed goroutine leak detection during the current GC cycle; it is set
+		// during gcMarkDone(), right after goroutine leak detection has concluded, and unset during
+		// gcStart(). Protected by STW.
+		done bool
+	}

 	// Base indexes of each root type. Set by gcPrepareMarkRoots.
 	baseData, baseBSS, baseSpans, baseStacks, baseEnd uint32

-	// stackRoots is a snapshot of all of the Gs that existed
-	// before the beginning of concurrent marking. The backing
-	// store of this must not be modified because it might be
-	// shared with allgs.
+	// stackRoots is a snapshot of all of the Gs that existed before the
+	// beginning of concurrent marking. During goroutine leak detection, stackRoots
+	// is partitioned into two sets: to the left of nLiveStackRoots are the stackRoots
+	// of running / runnable goroutines, and to the right of nLiveStackRoots are the
+	// stackRoots of unmarked / not runnable goroutines.
+	// gcDiscoverMoreStackRoots modifies the stackRoots array to redo the partition
+	// after each marking phase iteration.
 	stackRoots []*g

 	// Each type of GC state transition is protected by a lock.
@@ -550,6 +570,24 @@ func GC() {
 	releasem(mp)
 }

+// FindGoLeaks instructs the Go garbage collector to attempt
+// goroutine leak detection during the next GC cycle.
+//
+// It only operates if goroutineleakfindergc is enabled in GOEXPERIMENT.
+// Otherwise, it just runs runtime.GC().
+func FindGoLeaks() {
+	if !goexperiment.GoroutineLeakFinderGC {
+		GC()
+		return
+	}
+
+	work.goroutineLeakFinder.pending.Store(true)
+
+	for work.goroutineLeakFinder.pending.Load() {
+		GC()
+	}
+}
+
 // gcWaitOnMark blocks until GC finishes the Nth mark phase. If GC has
 // already completed this mark phase, it returns immediately.
 func gcWaitOnMark(n uint32) {
@@ -695,6 +733,11 @@ func gcStart(trigger gcTrigger) {
 		mode = gcForceMode
 	} else if debug.gcstoptheworld == 2 {
 		mode = gcForceBlockMode
+	} else if goexperiment.GoroutineLeakFinderGC {
+		if work.goroutineLeakFinder.pending.Load() {
+			// Fully stop the world when running goroutine leak detection.
+			mode = gcForceBlockMode
+		}
 	}

 	// Ok, we're doing it! Stop everybody else
@@ -757,6 +800,7 @@ func gcStart(trigger gcTrigger) {
 	clearpools()
 	work.cycles.Add(1)
+	work.goroutineLeakFinder.done = false

 	// Assists and workers can start the moment we start
 	// the world.
@@ -788,6 +832,11 @@ func gcStart(trigger gcTrigger) {
 	// possible.
setGCPhase(_GCmark) + if work.goroutineLeakFinder.pending.Load() { + work.goroutineLeakFinder.enabled = true + work.goroutineLeakFinder.pending.Store(false) + } + gcBgMarkPrepare() // Must happen before assists are enabled. gcPrepareMarkRoots() @@ -888,6 +937,9 @@ func gcMarkDone() { // Ensure only one thread is running the ragged barrier at a // time. semacquire(&work.markDoneSema) + if work.goroutineLeakFinder.enabled { + gcDiscoverMoreStackRoots() + } top: // Re-check transition condition under transition lock. @@ -994,6 +1046,25 @@ top: }) semrelease(&worldsema) goto top + } else if goexperiment.GoroutineLeakFinderGC { + // If we are detecting goroutine leaks, do so now. + if work.goroutineLeakFinder.enabled && !work.goroutineLeakFinder.done { + // Detect goroutine leaks. If the returned value is true, then + // detection was performed during this cycle. Otherwise, more mark work is needed, + // or live goroutines were found. + work.goroutineLeakFinder.done = findGoleaks() + + getg().m.preemptoff = "" + systemstack(func() { + // Accumulate the time we were stopped before we had to start again. + work.cpuStats.accumulateGCPauseTime(nanotime()-stw.finishedStopping, work.maxprocs) + + now := startTheWorldWithSema(0, stw) + work.pauseNS += now - stw.startedStopping + }) + semrelease(&worldsema) + goto top + } } gcComputeStartingStackSize() @@ -1032,6 +1103,158 @@ top: gcMarkTermination(stw) } +// checkIfMaybeRunnable checks whether a goroutine may still be semantically runnable. +// For goroutines which are semantically runnable, this will eventually return true +// as the GC marking phase progresses. It returns false for leaked goroutines, or for +// goroutines which are not yet computed as possibly runnable by the GC. +func (gp *g) checkIfMaybeRunnable() bool { + // Unmask the goroutine address to ensure we are not + // dereferencing a masked address. + gp = gp.unmask() + + switch gp.waitreason { + case waitReasonSelectNoCases, + waitReasonChanSendNilChan, + waitReasonChanReceiveNilChan: + // Select with no cases or communicating on nil channels + // make goroutines unrunnable by definition. + return false + case waitReasonChanReceive, + waitReasonSelect, + waitReasonChanSend: + // Cycle all through all *sudog to check whether + // the goroutine is waiting on a marked channel. + for sg := gp.waiting; sg != nil; sg = sg.waitlink { + if isMarkedOrNotInHeap(unsafe.Pointer(sg.c)) { + return true + } + } + return false + case waitReasonSyncCondWait, + waitReasonSyncWaitGroupWait, + waitReasonSyncMutexLock, + waitReasonSyncRWMutexLock, + waitReasonSyncRWMutexRLock: + // If waiting on mutexes, wait groups, or condition variables, + // check if the synchronization primitive attached to the sudog is marked. + if gp.waiting != nil { + // Unmask the sema address and check if it's marked. + return isMarkedOrNotInHeap(gcUnmask(gp.waiting.elem)) + } + } + return true +} + +// unmask returns a *g object with an unmasked address. +// +//go:nosplit +func (gp *g) unmask() *g { + return (*g)(gcUnmask(unsafe.Pointer(gp))) +} + +// mask returns a *g object with a masked address. 
+// +//go:nosplit +func (gp *g) mask() *g { + return (*g)(gcMask(unsafe.Pointer(gp))) +} + +// Check to see if more blocked but marked goroutines exist; +// if so add them into root set and increment work.markrootJobs accordingly +// return true if we need to run another phase of markroots; return false otherwise +func gcDiscoverMoreStackRoots() { + // to begin with we have a set of unchecked stackRoots between + // vIndex and ivIndex. During the loop, anything < vIndex should be + // valid stackRoots and anything >= ivIndex should be invalid stackRoots + // and the loop terminates when the two indices meet + var vIndex, ivIndex int = work.nLiveStackRoots, work.nStackRoots + + // Reorder goroutine list + for vIndex < ivIndex { + gp := work.stackRoots[vIndex] + if gp.checkIfMaybeRunnable() { + work.stackRoots[vIndex] = gp + vIndex = vIndex + 1 + continue + } + for ivIndex = ivIndex - 1; ivIndex != vIndex; ivIndex = ivIndex - 1 { + if swapGp := work.stackRoots[ivIndex]; swapGp.checkIfMaybeRunnable() { + work.stackRoots[ivIndex] = gp + work.stackRoots[vIndex] = swapGp.unmask() + vIndex = vIndex + 1 + break + } + } + } + + var oldRootJobs int32 = int32(atomic.Load(&work.markrootJobs)) + var newRootJobs int32 = int32(work.baseStacks) + int32(vIndex) + + if newRootJobs > oldRootJobs { + // reset markrootNext as it could have been incremented past markrootJobs + work.nLiveStackRoots = vIndex + atomic.Store(&work.markrootJobs, uint32(newRootJobs)) + } +} + +// findGoleaks scans the remaining stackRoots and marks any which are +// blocked over exclusively unreachable concurrency primitives as leaked (deadlocked). +// Returns true if the goroutine leak check was performed (or unnecessary). +// Returns false if the GC cycle has not yet computed all (maybe-)live goroutines. +func findGoleaks() bool { + // Report goroutine leaks and mark them unreachable, and resume marking + // we still need to mark these unreachable *g structs as they + // get reused, but their stack won't get scanned + if work.nLiveStackRoots == work.nStackRoots { + // nStackRoots == nLiveStackRoots means that all goroutines are marked. + return true + } + + // Try to reach another fix point here. Keep scouting for runnable goroutines until + // none are left. + // Valid goroutines may be found after all GC work is drained. + // Make sure these are pushed to the runnable set and ready to be marked. + var foundMoreWork bool + for i := work.nLiveStackRoots; i < work.nStackRoots; i++ { + gp := work.stackRoots[i].unmask() + if readgstatus(gp) == _Gwaiting && !gp.checkIfMaybeRunnable() { + // Blocking unrunnable goroutines will be skipped. + continue + } + work.stackRoots[i] = work.stackRoots[work.nLiveStackRoots] + work.stackRoots[work.nLiveStackRoots] = gp + work.nLiveStackRoots += 1 + // We now have one more markroot job. + work.markrootJobs += 1 + // We might still have some work to do. + // Make sure in the next iteration we will check re-check for new runnable goroutines. + foundMoreWork = true + } + if foundMoreWork { + // We found more work, so we need to resume the marking phase. + return false + } + + // For the remaining goroutines, mark them as unreachable and leaked. + for i := work.nLiveStackRoots; i < work.nStackRoots; i++ { + gp := work.stackRoots[i].unmask() + casgstatus(gp, _Gwaiting, _Gleaked) + fn := findfunc(gp.startpc) + if fn.valid() { + print("goroutine leak! goroutine ", gp.goid, ": ", funcname(fn), " Stack size: ", gp.stack.hi-gp.stack.lo, " bytes\n") + } else { + print("goroutine leak! 
goroutine ", gp.goid, ": !unnamed goroutine!", " Stack size: ", gp.stack.hi-gp.stack.lo, " bytes\n") + } + traceback(gp.sched.pc, gp.sched.sp, gp.sched.lr, gp) + println() + work.stackRoots[i] = gp + } + // Put the remaining roots as ready for marking and drain them. + work.markrootJobs += uint32(work.nStackRoots - work.nLiveStackRoots) + work.nLiveStackRoots = work.nStackRoots + return true +} + // World must be stopped and mark assists and background workers must be // disabled. func gcMarkTermination(stw worldStop) { @@ -1185,6 +1408,9 @@ func gcMarkTermination(stw worldStop) { } systemstack(func() { + // Pull the GC out of goroutine leak detection mode. + work.goroutineLeakFinder.enabled = false + // The memstats updated above must be updated with the world // stopped to ensure consistency of some values, such as // sched.idleTime and sched.totaltime. memstats also include @@ -1612,7 +1838,9 @@ func gcMarkWorkAvailable(p *p) bool { if !work.full.empty() || !work.spanq.empty() { return true // global work available } - if work.markrootNext < work.markrootJobs { + rootNext := atomic.Load(&work.markrootNext) + rootJobs := atomic.Load(&work.markrootJobs) + if rootNext < rootJobs { return true // root scan work available } return false @@ -1628,8 +1856,10 @@ func gcMark(startTime int64) { work.tstart = startTime // Check that there's no marking work remaining. - if work.full != 0 || work.markrootNext < work.markrootJobs || !work.spanq.empty() { - print("runtime: full=", hex(work.full), " next=", work.markrootNext, " jobs=", work.markrootJobs, " nDataRoots=", work.nDataRoots, " nBSSRoots=", work.nBSSRoots, " nSpanRoots=", work.nSpanRoots, " nStackRoots=", work.nStackRoots, " spanq.n=", work.spanq.size(), "\n") + rootNext := atomic.Load(&work.markrootNext) + rootJobs := atomic.Load(&work.markrootJobs) + if work.full != 0 || rootNext < rootJobs { + print("runtime: full=", hex(work.full), " next=", rootNext, " jobs=", rootJobs, " nDataRoots=", work.nDataRoots, " nBSSRoots=", work.nBSSRoots, " nSpanRoots=", work.nSpanRoots, " nStackRoots=", work.nStackRoots, "\n") panic("non-empty mark queue after concurrent mark") } diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index a136c7aeaceda2..557b20ee918cc0 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -51,8 +51,76 @@ const ( // Must be a multiple of the pageInUse bitmap element size and // must also evenly divide pagesPerArena. pagesPerSpanRoot = 512 + + gcUndoBitMask = uintptr(uintptrMask >> 2) // This constant reserves some bits of the address space for the GC to use in order to mask addresses. + gcBitMask = ^gcUndoBitMask // This flips every bit in gcUndoBitMask of uinptr width ) +// gcMask masks addresses that should not be automatically marked during the GC. +// +//go:nosplit +func gcMask(p unsafe.Pointer) unsafe.Pointer { + if goexperiment.GoroutineLeakFinderGC { + return unsafe.Pointer(uintptr(p) | gcBitMask) + } + return p +} + +// gcUnmask undoes the bit-mask applied to a pointer. +// +//go:nosplit +func gcUnmask(p unsafe.Pointer) unsafe.Pointer { + if goexperiment.GoroutineLeakFinderGC { + return unsafe.Pointer(uintptr(p) & gcUndoBitMask) + } + return p +} + +// internalBlocked returns true if the goroutine is blocked due to an +// internal (non-leaking) waitReason, e.g. waiting for the netpoller or garbage collector. +// Such goroutines are never leak detection candidates according to the GC. 
+//
+//go:nosplit
+func (gp *g) internalBlocked() bool {
+	reason := gp.waitreason
+	return reason < waitReasonChanReceiveNilChan || waitReasonSyncWaitGroupWait < reason
+}
+
+// allGsSnapshotSortedForGC goes through the snapshot of allgs, putting the Gs
+// into an array partitioned by index, where [0:blockedIndex] contains only
+// running Gs and [blockedIndex:] contains only blocked Gs.
+// To keep the GC from marking and scanning the blocked Gs by scanning
+// the returned array (which is heap allocated), we mask the high bits
+// of the pointers to those Gs with gcBitMask.
+//
+// The world must be stopped or allglock must be held.
+func allGsSnapshotSortedForGC() ([]*g, int) {
+	assertWorldStoppedOrLockHeld(&allglock)
+
+	allgsSorted := make([]*g, len(allgs))
+
+	// Indices cutting off runnable and blocked Gs.
+	var currIndex, blockedIndex = 0, len(allgsSorted) - 1
+	for _, gp := range allgs {
+		gp = gp.unmask()
+		// Not sure if we need an atomic load because we are stopping the world,
+		// but do it just to be safe for now.
+		if status := readgstatus(gp); status != _Gwaiting || gp.internalBlocked() {
+			allgsSorted[currIndex] = gp
+			currIndex++
+		} else {
+			allgsSorted[blockedIndex] = gp.mask()
+			blockedIndex--
+		}
+	}
+
+	// Because the world is stopped or allglock is held, allgadd
+	// cannot happen concurrently with this. allgs grows
+	// monotonically and existing entries never change, so we can
+	// simply return a copy of the slice header. For added safety,
+	// we trim everything past len because that can still change.
+	return allgsSorted, blockedIndex + 1
+}
+
 // gcPrepareMarkRoots queues root scanning jobs (stacks, globals, and
 // some miscellany) and initializes scanning-related state.
 //
@@ -102,11 +170,18 @@ func gcPrepareMarkRoots() {
 	// ignore them because they begin life without any roots, so
 	// there's nothing to scan, and any roots they create during
 	// the concurrent phase will be caught by the write barrier.
-	work.stackRoots = allGsSnapshot()
+	if work.goroutineLeakFinder.enabled {
+		work.stackRoots, work.nLiveStackRoots = allGsSnapshotSortedForGC()
+	} else {
+		// Regular GC: scan every goroutine.
+		work.stackRoots = allGsSnapshot()
+		work.nLiveStackRoots = len(work.stackRoots)
+	}
+
 	work.nStackRoots = len(work.stackRoots)

 	work.markrootNext = 0
-	work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots)
+	work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nLiveStackRoots)

 	// Calculate base indexes of each root type
 	work.baseData = uint32(fixedRootCount)
@@ -119,8 +194,10 @@
 // gcMarkRootCheck checks that all roots have been scanned. It is
 // purely for debugging.
func gcMarkRootCheck() { - if work.markrootNext < work.markrootJobs { - print(work.markrootNext, " of ", work.markrootJobs, " markroot jobs done\n") + rootNext := atomic.Load(&work.markrootNext) + rootJobs := atomic.Load(&work.markrootJobs) + if rootNext < rootJobs { + print(rootNext, " of ", rootJobs, " markroot jobs done\n") throw("left over markroot jobs") } @@ -868,7 +945,7 @@ func scanstack(gp *g, gcw *gcWork) int64 { case _Grunning: print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n") throw("scanstack: goroutine not stopped") - case _Grunnable, _Gsyscall, _Gwaiting: + case _Grunnable, _Gsyscall, _Gwaiting, _Gleaked: // ok } @@ -1136,6 +1213,32 @@ func gcDrainMarkWorkerFractional(gcw *gcWork) { gcDrain(gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit) } +func gcUpdateMarkrootNext() (uint32, bool) { + var success bool + var next uint32 = atomic.Load(&work.markrootNext) + var jobs uint32 = atomic.Load(&work.markrootJobs) + + if next < jobs { + // still work available at the moment + for !success { + success = atomic.Cas(&work.markrootNext, next, next+1) + // We manage to snatch a root job. Return the root index. + if success { + return next, true + } + + // Get the latest value of markrootNext. + next = atomic.Load(&work.markrootNext) + jobs := atomic.Load(&work.markrootJobs) + // We are out of markroot jobs. + if next >= jobs { + break + } + } + } + return 0, false +} + // gcDrain scans roots and objects in work buffers, blackening grey // objects until it is unable to get more work. It may return before // GC is done; it's the caller's responsibility to balance work from @@ -1194,13 +1297,14 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) { } } - // Drain root marking jobs. - if work.markrootNext < work.markrootJobs { + rootNext := atomic.Load(&work.markrootNext) + rootJobs := atomic.Load(&work.markrootJobs) + if rootNext < rootJobs { // Stop if we're preemptible, if someone wants to STW, or if // someone is calling forEachP. for !(gp.preempt && (preemptible || sched.gcwaiting.Load() || pp.runSafePointFn != 0)) { - job := atomic.Xadd(&work.markrootNext, +1) - 1 - if job >= work.markrootJobs { + job, success := gcUpdateMarkrootNext() + if !success { break } markroot(gcw, job, flushBgCredit) @@ -1346,9 +1450,9 @@ func gcDrainN(gcw *gcWork, scanWork int64) int64 { wbBufFlush() if b = gcw.tryGetObj(); b == 0 { // Try to do a root job. - if work.markrootNext < work.markrootJobs { - job := atomic.Xadd(&work.markrootNext, +1) - 1 - if job < work.markrootJobs { + if atomic.Load(&work.markrootNext) < atomic.Load(&work.markrootJobs) { + job, success := gcUpdateMarkrootNext() + if success { workFlushed += markroot(gcw, job, false) continue } @@ -1512,7 +1616,9 @@ func scanobject(b uintptr, gcw *gcWork) { // At this point we have extracted the next potential pointer. // Quickly filter out nil and pointers back to the current object. - if obj != 0 && obj-b >= n { + // The GC will skip masked addresses if GoroutineLeakFinderGC is enabled. + if obj != 0 && obj-b >= n && + (!goexperiment.GoroutineLeakFinderGC || obj <= gcUndoBitMask) { // Test if obj points into the Go heap and, if so, // mark the object. 
// diff --git a/src/runtime/preempt.go b/src/runtime/preempt.go index c41c3558359c0c..586abc433ffc0b 100644 --- a/src/runtime/preempt.go +++ b/src/runtime/preempt.go @@ -160,7 +160,7 @@ func suspendG(gp *g) suspendGState { s = _Gwaiting fallthrough - case _Grunnable, _Gsyscall, _Gwaiting: + case _Grunnable, _Gsyscall, _Gwaiting, _Gleaked: // Claim goroutine by setting scan bit. // This may race with execution or readying of gp. // The scan bit keeps it from transition state. @@ -269,6 +269,7 @@ func resumeG(state suspendGState) { case _Grunnable | _Gscan, _Gwaiting | _Gscan, + _Gleaked | _Gscan, _Gsyscall | _Gscan: casfrom_Gscanstatus(gp, s, s&^_Gscan) } diff --git a/src/runtime/proc.go b/src/runtime/proc.go index b41bbe93cf57c7..3c0a5144d2cc86 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -8,6 +8,7 @@ import ( "internal/abi" "internal/cpu" "internal/goarch" + "internal/goexperiment" "internal/goos" "internal/runtime/atomic" "internal/runtime/exithook" @@ -689,7 +690,7 @@ func allgadd(gp *g) { } lock(&allglock) - allgs = append(allgs, gp) + allgs = append(allgs, gp.mask()) if &allgs[0] != allgptr { atomicstorep(unsafe.Pointer(&allgptr), unsafe.Pointer(&allgs[0])) } @@ -708,6 +709,11 @@ func allGsSnapshot() []*g { // monotonically and existing entries never change, so we can // simply return a copy of the slice header. For added safety, // we trim everything past len because that can still change. + if goexperiment.GoroutineLeakFinderGC { + for i, gp := range allgs { + allgs[i] = gp.unmask() + } + } return allgs[:len(allgs):len(allgs)] } @@ -729,7 +735,7 @@ func atomicAllGIndex(ptr **g, i uintptr) *g { func forEachG(fn func(gp *g)) { lock(&allglock) for _, gp := range allgs { - fn(gp) + fn(gp.unmask()) } unlock(&allglock) } @@ -742,7 +748,7 @@ func forEachGRace(fn func(gp *g)) { ptr, length := atomicAllG() for i := uintptr(0); i < length; i++ { gp := atomicAllGIndex(ptr, i) - fn(gp) + fn(gp.unmask()) } return } @@ -1208,6 +1214,7 @@ func casfrom_Gscanstatus(gp *g, oldval, newval uint32) { _Gscanwaiting, _Gscanrunning, _Gscansyscall, + _Gscanleaked, _Gscanpreempted: if newval == oldval&^_Gscan { success = gp.atomicstatus.CompareAndSwap(oldval, newval) @@ -1228,6 +1235,7 @@ func castogscanstatus(gp *g, oldval, newval uint32) bool { case _Grunnable, _Grunning, _Gwaiting, + _Gleaked, _Gsyscall: if newval == oldval|_Gscan { r := gp.atomicstatus.CompareAndSwap(oldval, newval) diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 527611f96a29d9..fe0aad02a01818 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -87,6 +87,9 @@ const ( // ready()ing this G. _Gpreempted // 9 + // _Gleaked represents a leaked goroutine caught by the GC. + _Gleaked // 10 + // _Gscan combined with one of the above states other than // _Grunning indicates that GC is scanning the stack. 
The // goroutine is not executing user code and the stack is owned @@ -104,6 +107,7 @@ const ( _Gscansyscall = _Gscan + _Gsyscall // 0x1003 _Gscanwaiting = _Gscan + _Gwaiting // 0x1004 _Gscanpreempted = _Gscan + _Gpreempted // 0x1009 + _Gscanleaked = _Gscan + _Gleaked // 0x100a ) const ( @@ -1056,24 +1060,24 @@ const ( waitReasonZero waitReason = iota // "" waitReasonGCAssistMarking // "GC assist marking" waitReasonIOWait // "IO wait" - waitReasonChanReceiveNilChan // "chan receive (nil chan)" - waitReasonChanSendNilChan // "chan send (nil chan)" waitReasonDumpingHeap // "dumping heap" waitReasonGarbageCollection // "garbage collection" waitReasonGarbageCollectionScan // "garbage collection scan" waitReasonPanicWait // "panicwait" - waitReasonSelect // "select" - waitReasonSelectNoCases // "select (no cases)" waitReasonGCAssistWait // "GC assist wait" waitReasonGCSweepWait // "GC sweep wait" waitReasonGCScavengeWait // "GC scavenge wait" - waitReasonChanReceive // "chan receive" - waitReasonChanSend // "chan send" waitReasonFinalizerWait // "finalizer wait" waitReasonForceGCIdle // "force gc (idle)" waitReasonUpdateGOMAXPROCSIdle // "GOMAXPROCS updater (idle)" waitReasonSemacquire // "semacquire" waitReasonSleep // "sleep" + waitReasonChanReceiveNilChan // "chan receive (nil chan)" + waitReasonChanSendNilChan // "chan send (nil chan)" + waitReasonSelect // "select" + waitReasonSelectNoCases // "select (no cases)" + waitReasonChanReceive // "chan receive" + waitReasonChanSend // "chan send" waitReasonSyncCondWait // "sync.Cond.Wait" waitReasonSyncMutexLock // "sync.Mutex.Lock" waitReasonSyncRWMutexRLock // "sync.RWMutex.RLock" @@ -1159,12 +1163,24 @@ func (w waitReason) String() string { return waitReasonStrings[w] } +// isMutexWait returns true if the goroutine is blocked because of +// sync.Mutex.Lock or sync.RWMutex.[R]Lock. +// +//go:nosplit func (w waitReason) isMutexWait() bool { return w == waitReasonSyncMutexLock || w == waitReasonSyncRWMutexRLock || w == waitReasonSyncRWMutexLock } +// isSyncWait returns true if the goroutine is blocked because of +// sync library primitive operations. +// +//go:nosplit +func (w waitReason) isSyncWait() bool { + return waitReasonSyncCondWait <= w && w <= waitReasonSyncWaitGroupWait +} + func (w waitReason) isWaitingForSuspendG() bool { return isWaitingForSuspendG[w] } diff --git a/src/runtime/sema.go b/src/runtime/sema.go index 6af49b1b0c42d9..08167ff217a6a6 100644 --- a/src/runtime/sema.go +++ b/src/runtime/sema.go @@ -21,6 +21,7 @@ package runtime import ( "internal/cpu" + "internal/goexperiment" "internal/runtime/atomic" "unsafe" ) @@ -188,7 +189,7 @@ func semacquire1(addr *uint32, lifo bool, profile semaProfileFlags, skipframes i } // Any semrelease after the cansemacquire knows we're waiting // (we set nwait above), so go to sleep. - root.queue(addr, s, lifo) + root.queue(addr, s, lifo, reason.isSyncWait()) goparkunlock(&root.lock, reason, traceBlockSync, 4+skipframes) if s.ticket != 0 || cansemacquire(addr) { break @@ -301,9 +302,18 @@ func cansemacquire(addr *uint32) bool { } // queue adds s to the blocked goroutines in semaRoot. 
-func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool) { +func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool, syncSema bool) { s.g = getg() - s.elem = unsafe.Pointer(addr) + pAddr := unsafe.Pointer(addr) + if goexperiment.GoroutineLeakFinderGC { + if syncSema { + // Mask the addr so it doesn't get marked during GC + // through marking of the treap or marking of the blocked goroutine + pAddr = gcMask(unsafe.Pointer(addr)) + s.g.waiting = s + } + } + s.elem = pAddr s.next = nil s.prev = nil s.waiters = 0 @@ -311,7 +321,13 @@ func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool) { var last *sudog pt := &root.treap for t := *pt; t != nil; t = *pt { - if t.elem == unsafe.Pointer(addr) { + var cmp bool + if goexperiment.GoroutineLeakFinderGC { + cmp = uintptr(gcUnmask(pAddr)) == uintptr(gcUnmask(t.elem)) + } else { + cmp = uintptr(pAddr) == uintptr(t.elem) + } + if cmp { // Already have addr in list. if lifo { // Substitute s in t's place in treap. @@ -357,7 +373,12 @@ func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool) { return } last = t - if uintptr(unsafe.Pointer(addr)) < uintptr(t.elem) { + if goexperiment.GoroutineLeakFinderGC { + cmp = uintptr(gcUnmask(pAddr)) < uintptr(gcUnmask(t.elem)) + } else { + cmp = uintptr(pAddr) < uintptr(t.elem) + } + if cmp { pt = &t.prev } else { pt = &t.next @@ -402,11 +423,24 @@ func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool) { func (root *semaRoot) dequeue(addr *uint32) (found *sudog, now, tailtime int64) { ps := &root.treap s := *ps + for ; s != nil; s = *ps { - if s.elem == unsafe.Pointer(addr) { + var cmp bool + if goexperiment.GoroutineLeakFinderGC { + cmp = gcUnmask(unsafe.Pointer(addr)) == gcUnmask(s.elem) + } else { + cmp = unsafe.Pointer(addr) == s.elem + } + if cmp { goto Found } - if uintptr(unsafe.Pointer(addr)) < uintptr(s.elem) { + + if goexperiment.GoroutineLeakFinderGC { + cmp = uintptr(gcUnmask(unsafe.Pointer(addr))) < uintptr(gcUnmask(s.elem)) + } else { + cmp = uintptr(unsafe.Pointer(addr)) < uintptr(s.elem) + } + if cmp { ps = &s.prev } else { ps = &s.next @@ -470,6 +504,10 @@ Found: } tailtime = s.acquiretime } + if goexperiment.GoroutineLeakFinderGC { + // Goroutine is no longer blocked. Clear the waiting pointer. + s.g.waiting = nil + } s.parent = nil s.elem = nil s.next = nil @@ -590,6 +628,13 @@ func notifyListWait(l *notifyList, t uint32) { // Enqueue itself. s := acquireSudog() s.g = getg() + if goexperiment.GoroutineLeakFinderGC { + // Storing this pointer (masked) so that we can trace + // the condvar address from the blocked goroutine when + // checking for goroutine leaks. + s.elem = gcMask(unsafe.Pointer(l)) + s.g.waiting = s + } s.ticket = t s.releasetime = 0 t0 := int64(0) @@ -607,6 +652,12 @@ func notifyListWait(l *notifyList, t uint32) { if t0 != 0 { blockevent(s.releasetime-t0, 2) } + if goexperiment.GoroutineLeakFinderGC { + // Goroutine is no longer blocked. Clear up its waiting pointer, + // and clean up the sudog before releasing it. 
+ s.g.waiting = nil + s.elem = nil + } releaseSudog(s) } diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go index 00c0f08e5593c8..e8fef35da7d104 100644 --- a/src/runtime/traceback.go +++ b/src/runtime/traceback.go @@ -1206,6 +1206,7 @@ var gStatusStrings = [...]string{ _Gwaiting: "waiting", _Gdead: "dead", _Gcopystack: "copystack", + _Gleaked: "leaked", _Gpreempted: "preempted", } diff --git a/src/runtime/tracestatus.go b/src/runtime/tracestatus.go index 03ec81fc0262a1..8b5eafd170f488 100644 --- a/src/runtime/tracestatus.go +++ b/src/runtime/tracestatus.go @@ -122,7 +122,7 @@ func goStatusToTraceGoStatus(status uint32, wr waitReason) tracev2.GoStatus { tgs = tracev2.GoRunning case _Gsyscall: tgs = tracev2.GoSyscall - case _Gwaiting, _Gpreempted: + case _Gwaiting, _Gpreempted, _Gleaked: // There are a number of cases where a G might end up in // _Gwaiting but it's actually running in a non-preemptive // state but needs to present itself as preempted to the
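A few illustrative sketches to accompany the patch. None of the code below is part of the diff; it is ordinary user-level Go written against the behavior documented above, with any names not taken from the patch labeled as assumptions.

The new `runtime.FindGoLeaks` entry point added in mgc.go can be exercised end to end with a small program. This sketch assumes a toolchain built with this patch and `GOEXPERIMENT=goroutineleakfindergc`; without the experiment, `FindGoLeaks` simply falls back to `runtime.GC()`, as its doc comment states.

```go
package main

import (
	"runtime"
	"time"
)

// leak starts a goroutine that blocks forever on a channel receive.
// Once leak returns, nothing else references ch, so the goroutine can
// never be resumed: a classic goroutine leak.
func leak() {
	ch := make(chan int)
	go func() {
		<-ch
	}()
}

func main() {
	leak()
	time.Sleep(100 * time.Millisecond) // give the goroutine time to park

	// With GOEXPERIMENT=goroutineleakfindergc, this runs a GC cycle in
	// goroutine leak detection mode: the runtime transitions the blocked
	// goroutine to _Gleaked and prints a "goroutine leak!" report with a
	// traceback. Otherwise this behaves like runtime.GC().
	runtime.FindGoLeaks()
}
```

Run with, e.g., `GOEXPERIMENT=goroutineleakfindergc go run main.go` on a toolchain that includes this patch.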
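`gcMask` and `gcUnmask` in mgcmark.go hide blocked `*g` and sync-primitive pointers from the scanner by setting the top bits that `gcUndoBitMask` leaves clear; `scanobject` then skips any candidate pointer above `gcUndoBitMask`. Below is a user-space sketch of the same bit arithmetic, assuming a 64-bit address space; the names mirror the patch, but the values are plain `uint64`s, not real pointers.

```go
package main

import "fmt"

const (
	// In the runtime, gcUndoBitMask is uintptrMask >> 2; here we model
	// uintptr as uint64 (assumption: 64-bit platform).
	gcUndoBitMask = ^uint64(0) >> 2 // low 62 bits: the real address
	gcBitMask     = ^gcUndoBitMask  // top 2 bits: the "hidden from the GC" tag
)

// mask tags an address so a scanner that filters on gcUndoBitMask skips it.
func mask(p uint64) uint64 { return p | gcBitMask }

// unmask recovers the original address from a tagged one.
func unmask(p uint64) uint64 { return p & gcUndoBitMask }

func main() {
	addr := uint64(0x0000_00c0_0012_3456) // a plausible heap-address shape
	m := mask(addr)
	fmt.Printf("masked:   %#x\n", m)         // top bits set; a scanner would skip it
	fmt.Printf("unmasked: %#x\n", unmask(m)) // round-trips to the original address
	fmt.Println("round trip ok:", unmask(m) == addr)
}
```

The round trip is lossless as long as real addresses never use the top bits, which is the same assumption the patch's `obj <= gcUndoBitMask` filter in `scanobject` relies on.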
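`allGsSnapshotSortedForGC` and `gcDiscoverMoreStackRoots` both maintain the same invariant over `work.stackRoots`: a prefix of goroutines the GC should scan and a suffix of (masked) blocked ones, with `nLiveStackRoots` as the boundary. The in-place two-index sweep they use is shown below on a plain `int` slice, with an arbitrary predicate standing in for `checkIfMaybeRunnable`; this is a simplified illustration, not runtime code.

```go
package main

import "fmt"

// partitionLive mirrors the two-index sweep in gcDiscoverMoreStackRoots:
// everything left of the returned index satisfies maybeRunnable, everything
// to the right does not. The slice is reordered in place.
func partitionLive(gs []int, maybeRunnable func(int) bool) int {
	v, iv := 0, len(gs) // v: first unchecked; iv: one past the last unchecked
	for v < iv {
		if maybeRunnable(gs[v]) {
			v++
			continue
		}
		// gs[v] looks blocked: search from the right for a runnable
		// element to swap into the live prefix.
		for iv--; iv != v; iv-- {
			if maybeRunnable(gs[iv]) {
				gs[v], gs[iv] = gs[iv], gs[v]
				v++
				break
			}
		}
	}
	return v
}

func main() {
	gs := []int{1, -2, 3, -4, -5, 6}
	n := partitionLive(gs, func(x int) bool { return x > 0 })
	fmt.Println(gs[:n], gs[n:]) // "runnable" positives first, "blocked" negatives after
}
```

After each sweep, only the prefix needs to be handed to `markroot`, which is why `gcDiscoverMoreStackRoots` advances `work.markrootJobs` to exactly the new boundary.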
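The patch replaces the unconditional `atomic.Xadd(&work.markrootNext, +1)` with `gcUpdateMarkrootNext` because `markrootJobs` can now grow mid-cycle, and `markrootNext` must never be pushed past it. The claim loop is the familiar load/compare-and-swap pattern; here is a standalone sketch using `sync/atomic` (the names are illustrative, not the runtime's).

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// claimNext mirrors the gcUpdateMarkrootNext pattern from the patch:
// claim job indices with CAS so "next" never overshoots "jobs", which
// matters once jobs can grow while workers are still draining.
func claimNext(next, jobs *atomic.Uint32) (uint32, bool) {
	for {
		n := next.Load()
		if n >= jobs.Load() {
			return 0, false // no work right now
		}
		if next.CompareAndSwap(n, n+1) {
			return n, true // we own index n
		}
		// Lost the race; reload and retry.
	}
}

func main() {
	var next, jobs, done atomic.Uint32
	jobs.Store(10)

	var wg sync.WaitGroup
	for w := 0; w < 4; w++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for {
				if _, ok := claimNext(&next, &jobs); !ok {
					return
				}
				done.Add(1)
			}
		}()
	}
	wg.Wait()
	fmt.Println("jobs executed:", done.Load(), "next:", next.Load()) // 10, 10: never overshoots
}
```

Unlike `Xadd`, a failed claim leaves the counter untouched, so a worker that loses the race simply retries or bails out when `next` catches up with `jobs`.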
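The `waitReason` constants in runtime2.go are reordered so that every reason the detector cares about (nil-channel operations, channel send/receive, select, and the sync primitives) forms one contiguous block; `internalBlocked` and `isSyncWait` then reduce to two comparisons each. A toy version of the same trick, with hypothetical reason names of my own:

```go
package main

import "fmt"

type waitReason int

const (
	reasonIOWait   waitReason = iota // internal: never a leak candidate
	reasonGCAssist                   // internal
	// --- leak-candidate block starts here ---
	reasonChanReceive
	reasonChanSend
	reasonSelect
	reasonSyncMutexLock
	reasonSyncWaitGroupWait
	// --- leak-candidate block ends here ---
	reasonSleep // internal again
)

// internalBlocked mirrors the range comparison in the patch's mgcmark.go.
func (r waitReason) internalBlocked() bool {
	return r < reasonChanReceive || reasonSyncWaitGroupWait < r
}

func main() {
	fmt.Println(reasonIOWait.internalBlocked())   // true
	fmt.Println(reasonChanSend.internalBlocked()) // false: leak candidate
	fmt.Println(reasonSleep.internalBlocked())    // true
}
```

The cost of this encoding is that inserting a new `waitReason` in the wrong place silently changes the classification, so the boundaries of the block are worth a comment (or a test) next to the constant list.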