Diffstat (limited to 'libgo/go/runtime/mgcscavenge.go')
-rw-r--r--  libgo/go/runtime/mgcscavenge.go  395
1 file changed, 205 insertions(+), 190 deletions(-)
diff --git a/libgo/go/runtime/mgcscavenge.go b/libgo/go/runtime/mgcscavenge.go
index 3fa1c4604d7..adf2b058af4 100644
--- a/libgo/go/runtime/mgcscavenge.go
+++ b/libgo/go/runtime/mgcscavenge.go
@@ -56,6 +56,7 @@
package runtime
import (
+ "internal/goos"
"runtime/internal/atomic"
"runtime/internal/sys"
"unsafe"
@@ -90,7 +91,7 @@ const (
//
// This ratio is used as part of a multiplicative factor to help the scavenger account
// for the additional costs of using scavenged memory in its pacing.
- scavengeCostRatio = 0.7 * (sys.GoosDarwin + sys.GoosIos)
+ scavengeCostRatio = 0.7 * (goos.IsDarwin + goos.IsIos)
// scavengeReservationShards determines the amount of memory the scavenger
// should reserve for scavenging at a time. Specifically, the amount of
@@ -104,7 +105,8 @@ func heapRetained() uint64 {
}
// gcPaceScavenger updates the scavenger's pacing, particularly
-// its rate and RSS goal.
+// its rate and RSS goal. For this, it requires the current heapGoal,
+// and the heapGoal for the previous GC cycle.
//
// The RSS goal is based on the current heap goal with a small overhead
// to accommodate non-determinism in the allocator.
@@ -112,18 +114,22 @@ func heapRetained() uint64 {
// The pacing is based on scavengePageRate, which applies to both regular and
// huge pages. See that constant for more information.
//
+// Must be called whenever GC pacing is updated.
+//
// mheap_.lock must be held or the world must be stopped.
-func gcPaceScavenger() {
+func gcPaceScavenger(heapGoal, lastHeapGoal uint64) {
+ assertWorldStoppedOrLockHeld(&mheap_.lock)
+
// If we're called before the first GC completed, disable scavenging.
// We never scavenge before the 2nd GC cycle anyway (we don't have enough
// information about the heap yet) so this is fine, and avoids a fault
// or garbage data later.
- if gcController.lastHeapGoal == 0 {
- mheap_.scavengeGoal = ^uint64(0)
+ if lastHeapGoal == 0 {
+ atomic.Store64(&mheap_.scavengeGoal, ^uint64(0))
return
}
// Compute our scavenging goal.
- goalRatio := float64(atomic.Load64(&gcController.heapGoal)) / float64(gcController.lastHeapGoal)
+ goalRatio := float64(heapGoal) / float64(lastHeapGoal)
retainedGoal := uint64(float64(memstats.last_heap_inuse) * goalRatio)
// Add retainExtraPercent overhead to retainedGoal. This calculation
// looks strange but the purpose is to arrive at an integer division
@@ -151,10 +157,10 @@ func gcPaceScavenger() {
// the background scavenger. We disable the background scavenger if there's
// less than one physical page of work to do because it's not worth it.
if retainedNow <= retainedGoal || retainedNow-retainedGoal < uint64(physPageSize) {
- mheap_.scavengeGoal = ^uint64(0)
+ atomic.Store64(&mheap_.scavengeGoal, ^uint64(0))
return
}
- mheap_.scavengeGoal = retainedGoal
+ atomic.Store64(&mheap_.scavengeGoal, retainedGoal)
}
// Sleep/wait state of the background scavenger.
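For reference, the pacing computation in the gcPaceScavenger hunks above reduces to: scale the previous cycle's heap-in-use by heapGoal/lastHeapGoal, pad the result by a small overhead, round it up to a physical page, and disable the scavenger if retained memory is already within one page of that target. A minimal standalone sketch of that arithmetic follows; the 10% value for retainExtraPercent and the helper name are assumptions for illustration, not quoted from this patch.

// scavengeGoalSketch mirrors the pacing logic of gcPaceScavenger in isolation.
// All arguments are byte counts; pageSize stands in for physPageSize.
func scavengeGoalSketch(heapGoal, lastHeapGoal, heapInUse, retained, pageSize uint64) (goal uint64, enabled bool) {
	const retainExtraPercent = 10 // assumed overhead percentage
	if lastHeapGoal == 0 {
		return ^uint64(0), false // before the first GC: scavenging disabled
	}
	goalRatio := float64(heapGoal) / float64(lastHeapGoal)
	goal = uint64(float64(heapInUse) * goalRatio)
	goal += goal / (100 / retainExtraPercent)      // add ~10% overhead
	goal = (goal + pageSize - 1) &^ (pageSize - 1) // round up to a physical page
	if retained <= goal || retained-goal < pageSize {
		return ^uint64(0), false // less than a page of work: not worth running
	}
	return goal, true
}

For example, with heapGoal = 64 MiB, lastHeapGoal = 60 MiB, heapInUse = 58 MiB and 70 MiB currently retained, this yields a goal of roughly 68 MiB and leaves the scavenger enabled.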
@@ -249,7 +255,7 @@ func scavengeSleep(ns int64) int64 {
// The background scavenger maintains the RSS of the application below
// the line described by the proportional scavenging statistics in
// the mheap struct.
-func bgscavenge() {
+func bgscavenge(c chan int) {
setSystemGoroutine()
scavenge.g = getg()
@@ -259,56 +265,93 @@ func bgscavenge() {
scavenge.parked = true
scavenge.timer = new(timer)
- scavenge.timer.f = func(_ interface{}, _ uintptr) {
+ scavenge.timer.f = func(_ any, _ uintptr) {
wakeScavenger()
}
- gcenable_setup <- 1
+ c <- 1
goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)
- // Exponentially-weighted moving average of the fraction of time this
- // goroutine spends scavenging (that is, percent of a single CPU).
- // It represents a measure of scheduling overheads which might extend
- // the sleep or the critical time beyond what's expected. Assume no
- // overhead to begin with.
- //
- // TODO(mknyszek): Consider making this based on total CPU time of the
- // application (i.e. scavengePercent * GOMAXPROCS). This isn't really
- // feasible now because the scavenger acquires the heap lock over the
- // scavenging operation, which means scavenging effectively blocks
- // allocators and isn't scalable. However, given a scalable allocator,
- // it makes sense to also make the scavenger scale with it; if you're
- // allocating more frequently, then presumably you're also generating
- // more work for the scavenger.
- const idealFraction = scavengePercent / 100.0
- scavengeEWMA := float64(idealFraction)
+ // idealFraction is the ideal % of overall application CPU time that we
+ // spend scavenging.
+ idealFraction := float64(scavengePercent) / 100.0
+ // Input: fraction of CPU time used.
+ // Setpoint: idealFraction.
+ // Output: ratio of critical time to sleep time (determines sleep time).
+ //
+ // The output of this controller is somewhat indirect to what we actually
+ // want to achieve: how much time to sleep for. The reason for this definition
+ // is to ensure that the controller's outputs have a direct relationship with
+ // its inputs (as opposed to an inverse relationship), making it somewhat
+ // easier to reason about for tuning purposes.
+ critSleepController := piController{
+ // Tuned loosely via Ziegler-Nichols process.
+ kp: 0.3375,
+ ti: 3.2e6,
+ tt: 1e9, // 1 second reset time.
+
+ // These ranges seem wide, but we want to give the controller plenty of
+ // room to hunt for the optimal value.
+ min: 0.001, // 1:1000
+ max: 1000.0, // 1000:1
+ }
+ // It doesn't really matter what value we start at, but it can't be zero, because
+ // that'll cause divide-by-zero issues.
+ critSleepRatio := 0.001
for {
released := uintptr(0)
-
- // Time in scavenging critical section.
crit := float64(0)
- // Run on the system stack since we grab the heap lock,
- // and a stack growth with the heap lock means a deadlock.
- systemstack(func() {
- lock(&mheap_.lock)
-
+ // Spend at least 1 ms scavenging, otherwise the corresponding
+ // sleep time to maintain our desired utilization is too low to
+ // be reliable.
+ const minCritTime = 1e6
+ for crit < minCritTime {
// If background scavenging is disabled or if there's no work to do just park.
- retained, goal := heapRetained(), mheap_.scavengeGoal
+ retained, goal := heapRetained(), atomic.Load64(&mheap_.scavengeGoal)
if retained <= goal {
- unlock(&mheap_.lock)
- return
+ break
}
- // Scavenge one page, and measure the amount of time spent scavenging.
+ // scavengeQuantum is the amount of memory we try to scavenge
+ // in one go. A smaller value means the scavenger is more responsive
+ // to the scheduler in case of e.g. preemption. A larger value means
+ // that the overheads of scavenging are better amortized, so better
+ // scavenging throughput.
+ //
+ // The current value is chosen assuming a cost of ~10µs/physical page
+ // (this is somewhat pessimistic), which implies a worst-case latency of
+ // about 160µs for 4 KiB physical pages. The current value is biased
+ // toward latency over throughput.
+ const scavengeQuantum = 64 << 10
+
+ // Accumulate the amount of time spent scavenging.
start := nanotime()
- released = mheap_.pages.scavenge(physPageSize, true)
- mheap_.pages.scav.released += released
- crit = float64(nanotime() - start)
+ r := mheap_.pages.scavenge(scavengeQuantum)
+ atomic.Xadduintptr(&mheap_.pages.scav.released, r)
+ end := nanotime()
- unlock(&mheap_.lock)
- })
+ // On some platforms we may see end <= start if the time it takes to scavenge
+ // memory is less than the minimum granularity of its clock (e.g. Windows) or
+ // due to clock bugs.
+ //
+ // In this case, just assume scavenging takes 10 µs per regular physical page
+ // (determined empirically), and conservatively ignore the impact of huge pages
+ // on timing.
+ const approxCritNSPerPhysicalPage = 10e3
+ if end <= start {
+ crit += approxCritNSPerPhysicalPage * float64(r/physPageSize)
+ } else {
+ crit += float64(end - start)
+ }
+ released += r
+
+ // When using fake time just do one loop.
+ if faketime != 0 {
+ break
+ }
+ }
if released == 0 {
lock(&scavenge.lock)
@@ -325,18 +368,13 @@ func bgscavenge() {
throw("released less than one physical page of memory")
}
- // On some platforms we may see crit as zero if the time it takes to scavenge
- // memory is less than the minimum granularity of its clock (e.g. Windows).
- // In this case, just assume scavenging takes 10 µs per regular physical page
- // (determined empirically), and conservatively ignore the impact of huge pages
- // on timing.
- //
- // We shouldn't ever see a crit value less than zero unless there's a bug of
- // some kind, either on our side or in the platform we're running on, but be
- // defensive in that case as well.
- const approxCritNSPerPhysicalPage = 10e3
- if crit <= 0 {
- crit = approxCritNSPerPhysicalPage * float64(released/physPageSize)
+ if crit < minCritTime {
+ // This means there wasn't enough work to actually fill up minCritTime.
+ // That's fine; we shouldn't try to do anything with this information
+ // because it's going to result in a short enough sleep request that things
+ // will get messy. Just assume we did at least this much work.
+ // All this means is that we'll sleep longer than we otherwise would have.
+ crit = minCritTime
}
// Multiply the critical time by 1 + the ratio of the costs of using
@@ -347,41 +385,19 @@ func bgscavenge() {
// because of the additional overheads of using scavenged memory.
crit *= 1 + scavengeCostRatio
- // If we spent more than 10 ms (for example, if the OS scheduled us away, or someone
- // put their machine to sleep) in the critical section, bound the time we use to
- // calculate at 10 ms to avoid letting the sleep time get arbitrarily high.
- const maxCrit = 10e6
- if crit > maxCrit {
- crit = maxCrit
- }
+ // Go to sleep for our current sleep target of crit/critSleepRatio nanoseconds.
+ slept := scavengeSleep(int64(crit / critSleepRatio))
- // Compute the amount of time to sleep, assuming we want to use at most
- // scavengePercent of CPU time. Take into account scheduling overheads
- // that may extend the length of our sleep by multiplying by how far
- // off we are from the ideal ratio. For example, if we're sleeping too
- // much, then scavengeEMWA < idealFraction, so we'll adjust the sleep time
- // down.
- adjust := scavengeEWMA / idealFraction
- sleepTime := int64(adjust * crit / (scavengePercent / 100.0))
-
- // Go to sleep.
- slept := scavengeSleep(sleepTime)
-
- // Compute the new ratio.
- fraction := crit / (crit + float64(slept))
-
- // Set a lower bound on the fraction.
- // Due to OS-related anomalies we may "sleep" for an inordinate amount
- // of time. Let's avoid letting the ratio get out of hand by bounding
- // the sleep time we use in our EWMA.
- const minFraction = 1.0 / 1000.0
- if fraction < minFraction {
- fraction = minFraction
- }
+ // Calculate the CPU time spent.
+ //
+ // This may be slightly inaccurate with respect to GOMAXPROCS, but we're
+ // recomputing this often enough relative to GOMAXPROCS changes in general
+ // (it only changes when the world is stopped, and not during a GC) that
+ // that small inaccuracy is in the noise.
+ cpuFraction := float64(crit) / ((float64(slept) + crit) * float64(gomaxprocs))
- // Update scavengeEWMA by merging in the new crit/slept ratio.
- const alpha = 0.5
- scavengeEWMA = alpha*fraction + (1-alpha)*scavengeEWMA
+ // Update the critSleepRatio, adjusting until we reach our ideal fraction.
+ critSleepRatio = critSleepController.next(cpuFraction, idealFraction, float64(slept)+crit)
}
}
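The critSleepController configured above is a piController, which is defined elsewhere in the runtime and not shown in this diff. As a rough illustration of what such a proportional-integral controller with output clamping and anti-windup looks like, here is a sketch; the type name, errIntegral field, and the exact integral bookkeeping are assumptions, while kp, ti, tt, min, max and the next(input, setpoint, period) shape come from the hunks above.

// piSketch is an illustrative PI controller with output clamping and
// back-calculation anti-windup, shaped like the critSleepController above.
type piSketch struct {
	kp, ti, tt  float64 // proportional gain, integral time, anti-windup reset time
	min, max    float64 // bounds on the output
	errIntegral float64 // accumulated integral term
}

// next takes the measured input, the desired setpoint, and the length of the
// interval being controlled, and returns the new output.
func (c *piSketch) next(input, setpoint, period float64) float64 {
	prop := c.kp * (setpoint - input)
	raw := prop + c.errIntegral

	// Clamp the output into [min, max].
	out := raw
	if out < c.min {
		out = c.min
	} else if out > c.max {
		out = c.max
	}

	// Integrate the error; the (out - raw) term bleeds off windup
	// whenever the output saturates against its bounds.
	if c.ti != 0 && c.tt != 0 {
		c.errIntegral += (c.kp*period/c.ti)*(setpoint-input) + (period/c.tt)*(out-raw)
	}
	return out
}

In the loop above, input is the measured cpuFraction, setpoint is idealFraction, and period is slept + crit; the returned value becomes the new critSleepRatio, so the next sleep lasts crit / critSleepRatio nanoseconds.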
@@ -391,16 +407,7 @@ func bgscavenge() {
// back to the top of the heap.
//
// Returns the amount of memory scavenged in bytes.
-//
-// p.mheapLock must be held, but may be temporarily released if
-// mayUnlock == true.
-//
-// Must run on the system stack because p.mheapLock must be held.
-//
-//go:systemstack
-func (p *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr {
- assertLockHeld(p.mheapLock)
-
+func (p *pageAlloc) scavenge(nbytes uintptr) uintptr {
var (
addrs addrRange
gen uint32
@@ -412,9 +419,11 @@ func (p *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr {
break
}
}
- r, a := p.scavengeOne(addrs, nbytes-released, mayUnlock)
- released += r
- addrs = a
+ systemstack(func() {
+ r, a := p.scavengeOne(addrs, nbytes-released)
+ released += r
+ addrs = a
+ })
}
// Only unreserve the space which hasn't been scavenged or searched
// to ensure we always make progress.
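Because the hunk above only shows fragments of the reworked pageAlloc.scavenge, it may help to see the new control flow in one piece. The following is a reconstruction from the visible lines; the reservation check inside the loop is filled in as an assumption rather than quoted from the unshown context.

// scavengeSketch outlines pageAlloc.scavenge after this patch: reserve a shard
// of the address space, repeatedly call scavengeOne (which now takes the heap
// lock itself) on the system stack, and finally give back whatever part of the
// reservation was neither scavenged nor searched.
func (p *pageAlloc) scavengeSketch(nbytes uintptr) uintptr {
	var (
		addrs addrRange
		gen   uint32
	)
	released := uintptr(0)
	for released < nbytes {
		if addrs.size() == 0 {
			// Reserve a fresh shard to work on; stop if nothing is
			// left in this scavenge generation.
			if addrs, gen = p.scavengeReserve(); addrs.size() == 0 {
				break
			}
		}
		systemstack(func() {
			r, a := p.scavengeOne(addrs, nbytes-released)
			released += r
			addrs = a
		})
	}
	// Only unreserve the space which hasn't been scavenged or searched
	// to ensure we always make progress.
	p.scavengeUnreserve(addrs, gen)
	return released
}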
@@ -452,8 +461,9 @@ func printScavTrace(gen uint32, released uintptr, forced bool) {
func (p *pageAlloc) scavengeStartGen() {
assertLockHeld(p.mheapLock)
+ lock(&p.scav.lock)
if debug.scavtrace > 0 {
- printScavTrace(p.scav.gen, p.scav.released, false)
+ printScavTrace(p.scav.gen, atomic.Loaduintptr(&p.scav.released), false)
}
p.inUse.cloneInto(&p.scav.inUse)
@@ -483,9 +493,10 @@ func (p *pageAlloc) scavengeStartGen() {
// arena in size, so virtually every heap has the scavenger on.
p.scav.reservationBytes = alignUp(p.inUse.totalBytes, pallocChunkBytes) / scavengeReservationShards
p.scav.gen++
- p.scav.released = 0
+ atomic.Storeuintptr(&p.scav.released, 0)
p.scav.freeHWM = minOffAddr
p.scav.scavLWM = maxOffAddr
+ unlock(&p.scav.lock)
}
// scavengeReserve reserves a contiguous range of the address space
@@ -494,14 +505,9 @@ func (p *pageAlloc) scavengeStartGen() {
// first.
//
// Returns the reserved range and the scavenge generation number for it.
-//
-// p.mheapLock must be held.
-//
-// Must run on the system stack because p.mheapLock must be held.
-//
-//go:systemstack
func (p *pageAlloc) scavengeReserve() (addrRange, uint32) {
- assertLockHeld(p.mheapLock)
+ lock(&p.scav.lock)
+ gen := p.scav.gen
// Start by reserving the minimum.
r := p.scav.inUse.removeLast(p.scav.reservationBytes)
@@ -509,7 +515,8 @@ func (p *pageAlloc) scavengeReserve() (addrRange, uint32) {
// Return early if the size is zero; we don't want to use
// the bogus address below.
if r.size() == 0 {
- return r, p.scav.gen
+ unlock(&p.scav.lock)
+ return r, gen
}
// The scavenger requires that base be aligned to a
@@ -520,28 +527,26 @@ func (p *pageAlloc) scavengeReserve() (addrRange, uint32) {
// Remove from inUse however much extra we just pulled out.
p.scav.inUse.removeGreaterEqual(newBase)
+ unlock(&p.scav.lock)
+
r.base = offAddr{newBase}
- return r, p.scav.gen
+ return r, gen
}
// scavengeUnreserve returns an unscavenged portion of a range that was
// previously reserved with scavengeReserve.
-//
-// p.mheapLock must be held.
-//
-// Must run on the system stack because p.mheapLock must be held.
-//
-//go:systemstack
func (p *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) {
- assertLockHeld(p.mheapLock)
-
- if r.size() == 0 || gen != p.scav.gen {
+ if r.size() == 0 {
return
}
if r.base.addr()%pallocChunkBytes != 0 {
throw("unreserving unaligned region")
}
- p.scav.inUse.add(r)
+ lock(&p.scav.lock)
+ if gen == p.scav.gen {
+ p.scav.inUse.add(r)
+ }
+ unlock(&p.scav.lock)
}
// scavengeOne walks over address range work until it finds
@@ -555,15 +560,10 @@ func (p *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) {
//
// work's base address must be aligned to pallocChunkBytes.
//
-// p.mheapLock must be held, but may be temporarily released if
-// mayUnlock == true.
-//
-// Must run on the system stack because p.mheapLock must be held.
+// Must run on the systemstack because it acquires p.mheapLock.
//
//go:systemstack
-func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (uintptr, addrRange) {
- assertLockHeld(p.mheapLock)
-
+func (p *pageAlloc) scavengeOne(work addrRange, max uintptr) (uintptr, addrRange) {
// Defensively check if we've received an empty address range.
// If so, just return.
if work.size() == 0 {
@@ -595,40 +595,12 @@ func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
minPages = 1
}
- // Helpers for locking and unlocking only if mayUnlock == true.
- lockHeap := func() {
- if mayUnlock {
- lock(p.mheapLock)
- }
- }
- unlockHeap := func() {
- if mayUnlock {
- unlock(p.mheapLock)
- }
- }
-
- // Fast path: check the chunk containing the top-most address in work,
- // starting at that address's page index in the chunk.
- //
- // Note that work.end() is exclusive, so get the chunk we care about
- // by subtracting 1.
- maxAddr := work.limit.addr() - 1
- maxChunk := chunkIndex(maxAddr)
- if p.summary[len(p.summary)-1][maxChunk].max() >= uint(minPages) {
- // We only bother looking for a candidate if there at least
- // minPages free pages at all.
- base, npages := p.chunkOf(maxChunk).findScavengeCandidate(chunkPageIndex(maxAddr), minPages, maxPages)
-
- // If we found something, scavenge it and return!
- if npages != 0 {
- work.limit = offAddr{p.scavengeRangeLocked(maxChunk, base, npages)}
-
- assertLockHeld(p.mheapLock) // Must be locked on return.
- return uintptr(npages) * pageSize, work
- }
+ // Fast path: check the chunk containing the top-most address in work.
+ if r, w := p.scavengeOneFast(work, minPages, maxPages); r != 0 {
+ return r, w
+ } else {
+ work = w
}
- // Update the limit to reflect the fact that we checked maxChunk already.
- work.limit = offAddr{chunkBase(maxChunk)}
// findCandidate finds the next scavenge candidate in work optimistically.
//
@@ -667,37 +639,61 @@ func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
// looking for any free and unscavenged page. If we think we see something,
// lock and verify it!
for work.size() != 0 {
- unlockHeap()
// Search for the candidate.
candidateChunkIdx, ok := findCandidate(work)
-
- // Lock the heap. We need to do this now if we found a candidate or not.
- // If we did, we'll verify it. If not, we need to lock before returning
- // anyway.
- lockHeap()
-
if !ok {
// We didn't find a candidate, so we're done.
work.limit = work.base
break
}
+ // Lock, so we can verify what we found.
+ lock(p.mheapLock)
+
// Find, verify, and scavenge if we can.
chunk := p.chunkOf(candidateChunkIdx)
base, npages := chunk.findScavengeCandidate(pallocChunkPages-1, minPages, maxPages)
if npages > 0 {
work.limit = offAddr{p.scavengeRangeLocked(candidateChunkIdx, base, npages)}
-
- assertLockHeld(p.mheapLock) // Must be locked on return.
+ unlock(p.mheapLock)
return uintptr(npages) * pageSize, work
}
+ unlock(p.mheapLock)
// We were fooled, so let's continue from where we left off.
work.limit = offAddr{chunkBase(candidateChunkIdx)}
}
+ return 0, work
+}
- assertLockHeld(p.mheapLock) // Must be locked on return.
+// scavengeOneFast is the fast path for scavengeOne, which just checks the top
+// chunk of work for some pages to scavenge.
+//
+// Must run on the system stack because it acquires the heap lock.
+//
+//go:systemstack
+func (p *pageAlloc) scavengeOneFast(work addrRange, minPages, maxPages uintptr) (uintptr, addrRange) {
+ maxAddr := work.limit.addr() - 1
+ maxChunk := chunkIndex(maxAddr)
+
+ lock(p.mheapLock)
+ if p.summary[len(p.summary)-1][maxChunk].max() >= uint(minPages) {
+ // We only bother looking for a candidate if there are at least
+ // minPages free pages at all.
+ base, npages := p.chunkOf(maxChunk).findScavengeCandidate(chunkPageIndex(maxAddr), minPages, maxPages)
+
+ // If we found something, scavenge it and return!
+ if npages != 0 {
+ work.limit = offAddr{p.scavengeRangeLocked(maxChunk, base, npages)}
+ unlock(p.mheapLock)
+ return uintptr(npages) * pageSize, work
+ }
+ }
+ unlock(p.mheapLock)
+
+ // Update the limit to reflect the fact that we checked maxChunk already.
+ work.limit = offAddr{chunkBase(maxChunk)}
return 0, work
}
@@ -708,38 +704,57 @@ func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
//
// Returns the base address of the scavenged region.
//
-// p.mheapLock must be held.
+// p.mheapLock must be held. Unlocks p.mheapLock but reacquires
+// it before returning. Must be run on the systemstack as a result.
+//
+//go:systemstack
func (p *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr {
assertLockHeld(p.mheapLock)
- p.chunkOf(ci).scavenged.setRange(base, npages)
-
// Compute the full address for the start of the range.
addr := chunkBase(ci) + uintptr(base)*pageSize
+ // Mark the range we're about to scavenge as allocated, because
+ // we don't want any allocating goroutines to grab it while
+ // the scavenging is in progress.
+ if scav := p.allocRange(addr, uintptr(npages)); scav != 0 {
+ throw("double scavenge")
+ }
+
+ // With that done, it's safe to unlock.
+ unlock(p.mheapLock)
+
// Update the scavenge low watermark.
+ lock(&p.scav.lock)
if oAddr := (offAddr{addr}); oAddr.lessThan(p.scav.scavLWM) {
p.scav.scavLWM = oAddr
}
+ unlock(&p.scav.lock)
- // Only perform the actual scavenging if we're not in a test.
- // It's dangerous to do so otherwise.
- if p.test {
- return addr
- }
- sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize)
+ if !p.test {
+ // Only perform the actual scavenging if we're not in a test.
+ // It's dangerous to do so otherwise.
+ sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize)
- // Update global accounting only when not in test, otherwise
- // the runtime's accounting will be wrong.
- nbytes := int64(npages) * pageSize
- atomic.Xadd64(&memstats.heap_released, nbytes)
+ // Update global accounting only when not in test, otherwise
+ // the runtime's accounting will be wrong.
+ nbytes := int64(npages) * pageSize
+ atomic.Xadd64(&memstats.heap_released, nbytes)
- // Update consistent accounting too.
- stats := memstats.heapStats.acquire()
- atomic.Xaddint64(&stats.committed, -nbytes)
- atomic.Xaddint64(&stats.released, nbytes)
- memstats.heapStats.release()
+ // Update consistent accounting too.
+ stats := memstats.heapStats.acquire()
+ atomic.Xaddint64(&stats.committed, -nbytes)
+ atomic.Xaddint64(&stats.released, nbytes)
+ memstats.heapStats.release()
+ }
+
+ // Relock the heap, because now we need to make these pages
+ // available for allocation. Free them back to the page allocator.
+ lock(p.mheapLock)
+ p.free(addr, uintptr(npages), true)
+ // Mark the range as scavenged.
+ p.chunkOf(ci).scavenged.setRange(base, npages)
return addr
}
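To summarize the locking protocol that the final hunk spreads across interleaved additions and removals: scavengeRangeLocked now enters with the heap lock held, takes the pages out of circulation, drops every lock before the expensive sysUnused call, and only then reacquires the heap lock to hand the pages back, returning with p.mheapLock held again as its callers expect. A condensed sketch of that sequence, with the stats updates and the test-only path omitted:

// scavengeRangeSketch condenses the new scavengeRangeLocked flow.
// Entered and exited with p.mheapLock held.
func (p *pageAlloc) scavengeRangeSketch(ci chunkIdx, base, npages uint) uintptr {
	addr := chunkBase(ci) + uintptr(base)*pageSize

	// 1. Under p.mheapLock: mark the range allocated so no allocating
	//    goroutine can grab it mid-scavenge, then drop the heap lock.
	if scav := p.allocRange(addr, uintptr(npages)); scav != 0 {
		throw("double scavenge")
	}
	unlock(p.mheapLock)

	// 2. Under the scavenger's own lock: update the scavenge low watermark.
	lock(&p.scav.lock)
	if oAddr := (offAddr{addr}); oAddr.lessThan(p.scav.scavLWM) {
		p.scav.scavLWM = oAddr
	}
	unlock(&p.scav.lock)

	// 3. With no locks held: return the memory to the OS. This is the slow
	//    part, and it no longer blocks allocators on p.mheapLock.
	sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize)

	// 4. Reacquire the heap lock, free the pages back to the page
	//    allocator, and mark them scavenged.
	lock(p.mheapLock)
	p.free(addr, uintptr(npages), true)
	p.chunkOf(ci).scavenged.setRange(base, npages)
	return addr
}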