Diffstat (limited to 'libgo/go/runtime')
110 files changed, 4302 insertions, 1979 deletions
diff --git a/libgo/go/runtime/atomic_pointer.go b/libgo/go/runtime/atomic_pointer.go index 03d8d6a48d5..49b0f2b84fd 100644 --- a/libgo/go/runtime/atomic_pointer.go +++ b/libgo/go/runtime/atomic_pointer.go @@ -13,8 +13,6 @@ import ( // because while ptr does not escape, new does. // If new is marked as not escaping, the compiler will make incorrect // escape analysis decisions about the pointer value being stored. -// Instead, these are wrappers around the actual atomics (casp1 and so on) -// that use noescape to convey which arguments do not escape. // atomicwb performs a write barrier before an atomic pointer write. // The caller should guard the call with "if writeBarrier.enabled". @@ -37,17 +35,6 @@ func atomicstorep(ptr unsafe.Pointer, new unsafe.Pointer) { atomic.StorepNoWB(noescape(ptr), new) } -//go:nosplit -func casp(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool { - // The write barrier is only necessary if the CAS succeeds, - // but since it needs to happen before the write becomes - // public, we have to do it conservatively all the time. - if writeBarrier.enabled { - atomicwb(ptr, new) - } - return atomic.Casp1((*unsafe.Pointer)(noescape(unsafe.Pointer(ptr))), noescape(old), new) -} - // Like above, but implement in terms of sync/atomic's uintptr operations. // We cannot just call the runtime routines, because the race detector expects // to be able to intercept the sync/atomic forms but not the runtime forms. diff --git a/libgo/go/runtime/cgo_gccgo.go b/libgo/go/runtime/cgo_gccgo.go index e689b0e2616..aff8130a27c 100644 --- a/libgo/go/runtime/cgo_gccgo.go +++ b/libgo/go/runtime/cgo_gccgo.go @@ -84,8 +84,8 @@ func CgocallBack() { lockOSThread() - exitsyscall() gp.m.incgo = false + exitsyscall() if gp.m.ncgo == 0 { // The C call to Go came from a thread created by C. @@ -108,37 +108,41 @@ func CgocallBack() { func CgocallBackDone() { unlockOSThread() - // If we are the top level Go function called from C/C++, then - // we need to release the m. But don't release it if we are - // panicing; since this is the top level, we are going to - // crash the program, and we need the g and m to print the - // panic values. - // - // Dropping the m is going to clear g. This function is being - // called as a deferred function, so we will return to - // deferreturn which will want to clear the _defer field. - // As soon as we call dropm another thread may call needm and - // start using g, so we must not tamper with the _defer field - // after dropm. So clear _defer now. + // We are going to stop running in Go mode and return to C mode. + // We were almost certainly called by defer; if so, clean up + // the defer struct now, before we leave Go mode. But don't + // leave Go mode if we are panicing or called from Goexit, + // since in those cases we will continue executing deferred functions. gp := getg() mp := gp.m drop := false - if mp.dropextram && mp.ncgo == 0 && gp._panic == nil { + if gp.deferring && gp._panic == nil && !gp.goexiting { d := gp._defer - if d == nil || d.link != nil { - throw("unexpected g._defer in CgocallBackDone") + if d == nil { + throw("no defer struct when deferring") } - gp._defer = nil + gp._defer = d.link freedefer(d) - drop = true + + // If we are the top level Go function called from C, + // then we need to release the m. + if mp.dropextram && mp.ncgo == 0 { + drop = true + } } - gp.m.incgo = true - entersyscall() + // Don't go back to C mode if we are panicing. Just let the + // panic walk up through the Go stack. 
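The atomic_pointer.go hunk above keeps atomicstorep's long-standing discipline: when the write barrier is enabled, run it before the pointer store, because the value becomes visible to other goroutines the instant the atomic store lands (the deleted casp comment makes the same point for the CAS case). Below is a minimal standalone sketch of that ordering; writeBarrierEnabled and writeBarrier are stand-in names for the runtime's writeBarrier.enabled flag and atomicwb helper, and sync/atomic.StorePointer stands in for atomic.StorepNoWB.

package main

import (
	"sync/atomic"
	"unsafe"
)

// Stand-in for the runtime's writeBarrier.enabled flag; illustrative only.
var writeBarrierEnabled bool

// writeBarrier plays the role of the runtime's atomicwb: it would record the
// old and new pointer values for the garbage collector.
func writeBarrier(slot *unsafe.Pointer, newVal unsafe.Pointer) {}

// storePointer publishes p into *slot. The barrier runs first, and
// unconditionally on this path, because a reader may observe the new value
// as soon as the atomic store completes.
func storePointer(slot *unsafe.Pointer, p unsafe.Pointer) {
	if writeBarrierEnabled {
		writeBarrier(slot, p)
	}
	atomic.StorePointer(slot, p)
}

func main() {
	var slot unsafe.Pointer
	x := 42
	storePointer(&slot, unsafe.Pointer(&x))
	_ = atomic.LoadPointer(&slot)
}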
+ if gp._panic == nil && !gp.goexiting { + gp.m.incgo = true + entersyscall() + } if drop { mp.dropextram = false dropm() + } else if gp.deferring && gp._panic == nil && !gp.goexiting { + gp.ranCgocallBackDone = true } } diff --git a/libgo/go/runtime/cgocall.go b/libgo/go/runtime/cgocall.go index 24bf749e5a7..57b42ff2eae 100644 --- a/libgo/go/runtime/cgocall.go +++ b/libgo/go/runtime/cgocall.go @@ -254,7 +254,7 @@ func cgoCheckUnknownPointer(p unsafe.Pointer, msg string) (base, i uintptr) { return } -// cgoIsGoPointer returns whether the pointer is a Go pointer--a +// cgoIsGoPointer reports whether the pointer is a Go pointer--a // pointer to Go memory. We only care about Go memory that might // contain pointers. //go:nosplit @@ -283,7 +283,7 @@ func cgoIsGoPointer(p unsafe.Pointer) bool { return false } -// cgoInRange returns whether p is between start and end. +// cgoInRange reports whether p is between start and end. //go:nosplit //go:nowritebarrierrec func cgoInRange(p unsafe.Pointer, start, end uintptr) bool { diff --git a/libgo/go/runtime/cgocheck.go b/libgo/go/runtime/cgocheck.go index d896fb7d79d..c9e40473771 100644 --- a/libgo/go/runtime/cgocheck.go +++ b/libgo/go/runtime/cgocheck.go @@ -43,6 +43,13 @@ func cgoCheckWriteBarrier(dst *uintptr, src uintptr) { return } + // It's OK if writing to memory allocated by persistentalloc. + // Do this check last because it is more expensive and rarely true. + // If it is false the expense doesn't matter since we are crashing. + if inPersistentAlloc(uintptr(unsafe.Pointer(dst))) { + return + } + systemstack(func() { println("write of Go pointer", hex(src), "to non-Go memory", hex(uintptr(unsafe.Pointer(dst)))) throw(cgoWriteBarrierFail) @@ -127,7 +134,7 @@ func cgoCheckTypedBlock(typ *_type, src unsafe.Pointer, off, size uintptr) { } s := spanOfUnchecked(uintptr(src)) - if s.state == _MSpanManual { + if s.state == mSpanManual { // There are no heap bits for value stored on the stack. // For a channel receive src might be on the stack of some // other goroutine, so we can't unwind the stack even if diff --git a/libgo/go/runtime/chan.go b/libgo/go/runtime/chan.go index cb369effb8c..6dfe2f3fc3e 100644 --- a/libgo/go/runtime/chan.go +++ b/libgo/go/runtime/chan.go @@ -19,6 +19,7 @@ package runtime import ( "runtime/internal/atomic" + "runtime/internal/math" "unsafe" ) @@ -88,7 +89,8 @@ func makechan(t *chantype, size int) *hchan { throw("makechan: bad alignment") } - if size < 0 || uintptr(size) > maxSliceCap(elem.size) || uintptr(size)*elem.size > maxAlloc-hchanSize { + mem, overflow := math.MulUintptr(elem.size, uintptr(size)) + if overflow || mem > maxAlloc-hchanSize || size < 0 { panic(plainError("makechan: size out of range")) } @@ -98,7 +100,7 @@ func makechan(t *chantype, size int) *hchan { // TODO(dvyukov,rlh): Rethink when collector can move allocated objects. var c *hchan switch { - case size == 0 || elem.size == 0: + case mem == 0: // Queue or element size is zero. c = (*hchan)(mallocgc(hchanSize, nil, true)) // Race detector uses this location for synchronization. @@ -106,12 +108,12 @@ func makechan(t *chantype, size int) *hchan { case elem.kind&kindNoPointers != 0: // Elements do not contain pointers. // Allocate hchan and buf in one call. - c = (*hchan)(mallocgc(hchanSize+uintptr(size)*elem.size, nil, true)) + c = (*hchan)(mallocgc(hchanSize+mem, nil, true)) c.buf = add(unsafe.Pointer(c), hchanSize) default: // Elements contain pointers. 
c = new(hchan) - c.buf = mallocgc(uintptr(size)*elem.size, elem, true) + c.buf = mallocgc(mem, elem, true) } c.elemsize = uint16(elem.size) @@ -247,6 +249,11 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool { gp.param = nil c.sendq.enqueue(mysg) goparkunlock(&c.lock, waitReasonChanSend, traceEvGoBlockSend, 3) + // Ensure the value being sent is kept alive until the + // receiver copies it out. The sudog has a pointer to the + // stack object, but sudogs aren't considered as roots of the + // stack tracer. + KeepAlive(ep) // someone woke us up. if mysg != gp.waiting { @@ -358,7 +365,7 @@ func closechan(c *hchan) { c.closed = 1 - var glist *g + var glist gList // release all readers for { @@ -378,8 +385,7 @@ func closechan(c *hchan) { if raceenabled { raceacquireg(gp, c.raceaddr()) } - gp.schedlink.set(glist) - glist = gp + glist.push(gp) } // release all writers (they will panic) @@ -397,15 +403,13 @@ func closechan(c *hchan) { if raceenabled { raceacquireg(gp, c.raceaddr()) } - gp.schedlink.set(glist) - glist = gp + glist.push(gp) } unlock(&c.lock) // Ready all Gs now that we've dropped the channel lock. - for glist != nil { - gp := glist - glist = glist.schedlink.ptr() + for !glist.empty() { + gp := glist.pop() gp.schedlink = 0 goready(gp, 3) } diff --git a/libgo/go/runtime/crash_cgo_test.go b/libgo/go/runtime/crash_cgo_test.go index 770f85e96d4..f437b9a7e0f 100644 --- a/libgo/go/runtime/crash_cgo_test.go +++ b/libgo/go/runtime/crash_cgo_test.go @@ -269,7 +269,7 @@ func TestCgoTracebackContext(t *testing.T) { } } -func testCgoPprof(t *testing.T, buildArg, runArg string) { +func testCgoPprof(t *testing.T, buildArg, runArg, top, bottom string) { t.Parallel() if runtime.GOOS != "linux" || (runtime.GOARCH != "amd64" && runtime.GOARCH != "ppc64le") { t.Skipf("not yet supported on %s/%s", runtime.GOOS, runtime.GOARCH) @@ -296,7 +296,7 @@ func testCgoPprof(t *testing.T, buildArg, runArg string) { defer os.Remove(fn) for try := 0; try < 2; try++ { - cmd := testenv.CleanCmdEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-top", "-nodecount=1")) + cmd := testenv.CleanCmdEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-traces")) // Check that pprof works both with and without explicit executable on command line. if try == 0 { cmd.Args = append(cmd.Args, exe, fn) @@ -316,30 +316,38 @@ func testCgoPprof(t *testing.T, buildArg, runArg string) { cmd.Env = append(cmd.Env, "PPROF_TMPDIR="+os.TempDir()) } - top, err := cmd.CombinedOutput() - t.Logf("%s:\n%s", cmd.Args, top) + out, err := cmd.CombinedOutput() + t.Logf("%s:\n%s", cmd.Args, out) if err != nil { t.Error(err) - } else if !bytes.Contains(top, []byte("cpuHog")) { - t.Error("missing cpuHog in pprof output") + continue + } + + trace := findTrace(string(out), top) + if len(trace) == 0 { + t.Errorf("%s traceback missing.", top) + continue + } + if trace[len(trace)-1] != bottom { + t.Errorf("invalid traceback origin: got=%v; want=[%s ... 
%s]", trace, top, bottom) } } } func TestCgoPprof(t *testing.T) { - testCgoPprof(t, "", "CgoPprof") + testCgoPprof(t, "", "CgoPprof", "cpuHog", "runtime.main") } func TestCgoPprofPIE(t *testing.T) { - testCgoPprof(t, "-buildmode=pie", "CgoPprof") + testCgoPprof(t, "-buildmode=pie", "CgoPprof", "cpuHog", "runtime.main") } func TestCgoPprofThread(t *testing.T) { - testCgoPprof(t, "", "CgoPprofThread") + testCgoPprof(t, "", "CgoPprofThread", "cpuHogThread", "cpuHogThread2") } func TestCgoPprofThreadNoTraceback(t *testing.T) { - testCgoPprof(t, "", "CgoPprofThreadNoTraceback") + testCgoPprof(t, "", "CgoPprofThreadNoTraceback", "cpuHogThread", "runtime._ExternalCode") } func TestRaceProf(t *testing.T) { @@ -521,3 +529,35 @@ func TestBigStackCallbackCgo(t *testing.T) { t.Errorf("expected %q got %v", want, got) } } + +func nextTrace(lines []string) ([]string, []string) { + var trace []string + for n, line := range lines { + if strings.HasPrefix(line, "---") { + return trace, lines[n+1:] + } + fields := strings.Fields(strings.TrimSpace(line)) + if len(fields) == 0 { + continue + } + // Last field contains the function name. + trace = append(trace, fields[len(fields)-1]) + } + return nil, nil +} + +func findTrace(text, top string) []string { + lines := strings.Split(text, "\n") + _, lines = nextTrace(lines) // Skip the header. + for len(lines) > 0 { + var t []string + t, lines = nextTrace(lines) + if len(t) == 0 { + continue + } + if t[0] == top { + return t + } + } + return nil +} diff --git a/libgo/go/runtime/crash_test.go b/libgo/go/runtime/crash_test.go index 91a5c161097..6627bdc6726 100644 --- a/libgo/go/runtime/crash_test.go +++ b/libgo/go/runtime/crash_test.go @@ -657,6 +657,9 @@ func TestTimePprof(t *testing.T) { if runtime.Compiler == "gccgo" { t.Skip("gccgo may not have the pprof tool") } + if runtime.GOOS == "aix" { + t.Skip("pprof not yet available on AIX (see golang.org/issue/28555)") + } fn := runTestProg(t, "testprog", "TimeProf") fn = strings.TrimSpace(fn) defer os.Remove(fn) diff --git a/libgo/go/runtime/debug/mod.go b/libgo/go/runtime/debug/mod.go new file mode 100644 index 00000000000..f2948c65cbd --- /dev/null +++ b/libgo/go/runtime/debug/mod.go @@ -0,0 +1,112 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package debug + +import ( + "strings" + _ "unsafe" // for go:linkname +) + +// set using cmd/go/internal/modload.ModInfoProg +var modinfo string + +// setmodinfo is visible to code generated by cmd/go/internal/modload.ModInfoProg. +//go:linkname setmodinfo runtime..z2fdebug.setmodinfo +func setmodinfo(s string) { + modinfo = s +} + +// ReadBuildInfo returns the build information embedded +// in the running binary. The information is available only +// in binaries built with module support. +func ReadBuildInfo() (info *BuildInfo, ok bool) { + return readBuildInfo(modinfo) +} + +// BuildInfo represents the build information read from +// the running binary. +type BuildInfo struct { + Path string // The main package path + Main Module // The main module information + Deps []*Module // Module dependencies +} + +// Module represents a module. 
+type Module struct { + Path string // module path + Version string // module version + Sum string // checksum + Replace *Module // replaced by this module +} + +func readBuildInfo(data string) (*BuildInfo, bool) { + if len(data) < 32 { + return nil, false + } + data = data[16 : len(data)-16] + + const ( + pathLine = "path\t" + modLine = "mod\t" + depLine = "dep\t" + repLine = "=>\t" + ) + + info := &BuildInfo{} + + var line string + // Reverse of cmd/go/internal/modload.PackageBuildInfo + for len(data) > 0 { + i := strings.IndexByte(data, '\n') + if i < 0 { + break + } + line, data = data[:i], data[i+1:] + switch { + case strings.HasPrefix(line, pathLine): + elem := line[len(pathLine):] + info.Path = elem + case strings.HasPrefix(line, modLine): + elem := strings.Split(line[len(modLine):], "\t") + if len(elem) != 3 { + return nil, false + } + info.Main = Module{ + Path: elem[0], + Version: elem[1], + Sum: elem[2], + } + case strings.HasPrefix(line, depLine): + elem := strings.Split(line[len(depLine):], "\t") + if len(elem) != 2 && len(elem) != 3 { + return nil, false + } + sum := "" + if len(elem) == 3 { + sum = elem[2] + } + info.Deps = append(info.Deps, &Module{ + Path: elem[0], + Version: elem[1], + Sum: sum, + }) + case strings.HasPrefix(line, repLine): + elem := strings.Split(line[len(repLine):], "\t") + if len(elem) != 3 { + return nil, false + } + last := len(info.Deps) - 1 + if last < 0 { + return nil, false + } + info.Deps[last].Replace = &Module{ + Path: elem[0], + Version: elem[1], + Sum: elem[2], + } + } + } + return info, true +} diff --git a/libgo/go/runtime/debug_test.go b/libgo/go/runtime/debug_test.go index 38c764fadb3..12d93de3047 100644 --- a/libgo/go/runtime/debug_test.go +++ b/libgo/go/runtime/debug_test.go @@ -18,6 +18,8 @@ package runtime_test import ( "fmt" + "io/ioutil" + "regexp" "runtime" "runtime/debug" "sync/atomic" @@ -26,12 +28,23 @@ import ( ) func startDebugCallWorker(t *testing.T) (g *runtime.G, after func()) { + // This can deadlock if run under a debugger because it + // depends on catching SIGTRAP, which is usually swallowed by + // a debugger. + skipUnderDebugger(t) + // This can deadlock if there aren't enough threads or if a GC - // tries to interrupt an atomic loop (see issue #10958). - ogomaxprocs := runtime.GOMAXPROCS(2) + // tries to interrupt an atomic loop (see issue #10958). We + // use 8 Ps so there's room for the debug call worker, + // something that's trying to preempt the call worker, and the + // goroutine that's trying to stop the call worker. + ogomaxprocs := runtime.GOMAXPROCS(8) ogcpercent := debug.SetGCPercent(-1) - ready := make(chan *runtime.G) + // ready is a buffered channel so debugCallWorker won't block + // on sending to it. This makes it less likely we'll catch + // debugCallWorker while it's in the runtime. + ready := make(chan *runtime.G, 1) var stop uint32 done := make(chan error) go debugCallWorker(ready, &stop, done) @@ -61,6 +74,10 @@ func debugCallWorker(ready chan<- *runtime.G, stop *uint32, done chan<- error) { close(done) } +// Don't inline this function, since we want to test adjusting +// pointers in the arguments. +// +//go:noinline func debugCallWorker2(stop *uint32, x *int) { for atomic.LoadUint32(stop) == 0 { // Strongly encourage x to live in a register so we @@ -74,6 +91,28 @@ func debugCallTKill(tid int) error { return syscall.Tgkill(syscall.Getpid(), tid, syscall.SIGTRAP) } +// skipUnderDebugger skips the current test when running under a +// debugger (specifically if this process has a tracer). 
This is +// Linux-specific. +func skipUnderDebugger(t *testing.T) { + pid := syscall.Getpid() + status, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/status", pid)) + if err != nil { + t.Logf("couldn't get proc tracer: %s", err) + return + } + re := regexp.MustCompile(`TracerPid:\s+([0-9]+)`) + sub := re.FindSubmatch(status) + if sub == nil { + t.Logf("couldn't find proc tracer PID") + return + } + if string(sub[1]) == "0" { + return + } + t.Skip("test will deadlock under a debugger") +} + func TestDebugCall(t *testing.T) { g, after := startDebugCallWorker(t) defer after() @@ -161,9 +200,11 @@ func debugCallUnsafePointWorker(gpp **runtime.G, ready, stop *uint32) { } func TestDebugCallUnsafePoint(t *testing.T) { + skipUnderDebugger(t) + // This can deadlock if there aren't enough threads or if a GC // tries to interrupt an atomic loop (see issue #10958). - defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2)) + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(8)) defer debug.SetGCPercent(debug.SetGCPercent(-1)) // Test that the runtime refuses call injection at unsafe points. @@ -182,8 +223,10 @@ func TestDebugCallUnsafePoint(t *testing.T) { } func TestDebugCallPanic(t *testing.T) { + skipUnderDebugger(t) + // This can deadlock if there aren't enough threads. - defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2)) + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(8)) ready := make(chan *runtime.G) var stop uint32 diff --git a/libgo/go/runtime/env_posix.go b/libgo/go/runtime/env_posix.go index 399e88f420d..7e44f14edfc 100644 --- a/libgo/go/runtime/env_posix.go +++ b/libgo/go/runtime/env_posix.go @@ -12,9 +12,32 @@ func gogetenv(key string) string { throw("getenv before env init") } for _, s := range env { - if len(s) > len(key) && s[len(key)] == '=' && s[:len(key)] == key { + if len(s) > len(key) && s[len(key)] == '=' && envKeyEqual(s[:len(key)], key) { return s[len(key)+1:] } } return "" } + +// envKeyEqual reports whether a == b, with ASCII-only case insensitivity +// on Windows. The two strings must have the same length. +func envKeyEqual(a, b string) bool { + if GOOS == "windows" { // case insensitive + for i := 0; i < len(a); i++ { + ca, cb := a[i], b[i] + if ca == cb || lowerASCII(ca) == lowerASCII(cb) { + continue + } + return false + } + return true + } + return a == b +} + +func lowerASCII(c byte) byte { + if 'A' <= c && c <= 'Z' { + return c + ('a' - 'A') + } + return c +} diff --git a/libgo/go/runtime/export_debug_test.go b/libgo/go/runtime/export_debug_test.go index 2d2d5356edc..608d7567808 100644 --- a/libgo/go/runtime/export_debug_test.go +++ b/libgo/go/runtime/export_debug_test.go @@ -51,19 +51,31 @@ func InjectDebugCall(gp *g, fn, args interface{}, tkill func(tid int) error) (in h.gp = gp h.fv, h.argp, h.argSize = fv, argp, argSize h.handleF = h.handle // Avoid allocating closure during signal - noteclear(&h.done) defer func() { testSigtrap = nil }() - testSigtrap = h.inject - if err := tkill(tid); err != nil { - return nil, err - } - // Wait for completion. - notetsleepg(&h.done, -1) - if len(h.err) != 0 { - return nil, h.err + for i := 0; ; i++ { + testSigtrap = h.inject + noteclear(&h.done) + h.err = "" + + if err := tkill(tid); err != nil { + return nil, err + } + // Wait for completion. + notetsleepg(&h.done, -1) + if h.err != "" { + switch h.err { + case "retry _Grunnable", "executing on Go runtime stack": + // These are transient states. Try to get out of them. 
+ if i < 100 { + Gosched() + continue + } + } + return nil, h.err + } + return h.panic, nil } - return h.panic, nil } type debugCallHandler struct { @@ -100,12 +112,18 @@ func (h *debugCallHandler) inject(info *siginfo, ctxt *sigctxt, gp2 *g) bool { h.savedRegs.fpstate = nil // Set PC to debugCallV1. ctxt.set_rip(uint64(funcPC(debugCallV1))) + // Call injected. Switch to the debugCall protocol. + testSigtrap = h.handleF + case _Grunnable: + // Ask InjectDebugCall to pause for a bit and then try + // again to interrupt this goroutine. + h.err = plainError("retry _Grunnable") + notewakeup(&h.done) default: h.err = plainError("goroutine in unexpected state at call inject") - return true + notewakeup(&h.done) } - // Switch to the debugCall protocol and resume execution. - testSigtrap = h.handleF + // Resume execution. return true } @@ -116,7 +134,7 @@ func (h *debugCallHandler) handle(info *siginfo, ctxt *sigctxt, gp2 *g) bool { return false } f := findfunc(uintptr(ctxt.rip())) - if !(hasprefix(funcname(f), "runtime.debugCall") || hasprefix(funcname(f), "debugCall")) { + if !(hasPrefix(funcname(f), "runtime.debugCall") || hasPrefix(funcname(f), "debugCall")) { println("trap in unknown function", funcname(f)) return false } @@ -150,6 +168,7 @@ func (h *debugCallHandler) handle(info *siginfo, ctxt *sigctxt, gp2 *g) bool { sp := ctxt.rsp() reason := *(*string)(unsafe.Pointer(uintptr(sp))) h.err = plainError(reason) + // Don't wake h.done. We need to transition to status 16 first. case 16: // Restore all registers except RIP and RSP. rip, rsp := ctxt.rip(), ctxt.rsp() @@ -163,6 +182,7 @@ func (h *debugCallHandler) handle(info *siginfo, ctxt *sigctxt, gp2 *g) bool { notewakeup(&h.done) default: h.err = plainError("unexpected debugCallV1 status") + notewakeup(&h.done) } // Resume execution. return true diff --git a/libgo/go/runtime/export_test.go b/libgo/go/runtime/export_test.go index 7f4811c5a0c..d919e0486b2 100644 --- a/libgo/go/runtime/export_test.go +++ b/libgo/go/runtime/export_test.go @@ -292,6 +292,7 @@ func ReadMemStatsSlow() (base, slow MemStats) { slow.TotalAlloc = 0 slow.Mallocs = 0 slow.Frees = 0 + slow.HeapReleased = 0 var bySize [_NumSizeClasses]struct { Mallocs, Frees uint64 } @@ -331,6 +332,10 @@ func ReadMemStatsSlow() (base, slow MemStats) { slow.BySize[i].Frees = bySize[i].Frees } + for i := mheap_.scav.start(); i.valid(); i = i.next() { + slow.HeapReleased += uint64(i.span().released()) + } + getg().m.mallocing-- }) @@ -454,3 +459,39 @@ func stackOverflow(x *byte) { var buf [256]byte stackOverflow(&buf[0]) } + +func MapTombstoneCheck(m map[int]int) { + // Make sure emptyOne and emptyRest are distributed correctly. + // We should have a series of filled and emptyOne cells, followed by + // a series of emptyRest cells. 
+ h := *(**hmap)(unsafe.Pointer(&m)) + i := interface{}(m) + t := *(**maptype)(unsafe.Pointer(&i)) + + for x := 0; x < 1<<h.B; x++ { + b0 := (*bmap)(add(h.buckets, uintptr(x)*uintptr(t.bucketsize))) + n := 0 + for b := b0; b != nil; b = b.overflow(t) { + for i := 0; i < bucketCnt; i++ { + if b.tophash[i] != emptyRest { + n++ + } + } + } + k := 0 + for b := b0; b != nil; b = b.overflow(t) { + for i := 0; i < bucketCnt; i++ { + if k < n && b.tophash[i] == emptyRest { + panic("early emptyRest") + } + if k >= n && b.tophash[i] != emptyRest { + panic("late non-emptyRest") + } + if k == n-1 && b.tophash[i] == emptyOne { + panic("last non-emptyRest entry is emptyOne") + } + k++ + } + } + } +} diff --git a/libgo/go/runtime/extern.go b/libgo/go/runtime/extern.go index c9d10f16801..d07a5ed024f 100644 --- a/libgo/go/runtime/extern.go +++ b/libgo/go/runtime/extern.go @@ -50,19 +50,13 @@ It is a comma-separated list of name=val pairs setting these named variables: gcshrinkstackoff: setting gcshrinkstackoff=1 disables moving goroutines onto smaller stacks. In this mode, a goroutine's stack can only grow. - gcrescanstacks: setting gcrescanstacks=1 enables stack - re-scanning during the STW mark termination phase. This is - helpful for debugging if objects are being prematurely - garbage collected. - gcstoptheworld: setting gcstoptheworld=1 disables concurrent garbage collection, making every garbage collection a stop-the-world event. Setting gcstoptheworld=2 also disables concurrent sweeping after the garbage collection finishes. gctrace: setting gctrace=1 causes the garbage collector to emit a single line to standard error at each collection, summarizing the amount of memory collected and the - length of the pause. Setting gctrace=2 emits the same summary but also - repeats each collection. The format of this line is subject to change. + length of the pause. The format of this line is subject to change. Currently, it is: gc # @#s #%: #+#+# ms clock, #+#/#/#+# ms cpu, #->#-># MB, # MB goal, # P where the fields are as follows: @@ -95,6 +89,11 @@ It is a comma-separated list of name=val pairs setting these named variables: released: # MB released to the system consumed: # MB allocated from the system + madvdontneed: setting madvdontneed=1 will use MADV_DONTNEED + instead of MADV_FREE on Linux when returning memory to the + kernel. This is less efficient, but causes RSS numbers to drop + more quickly. + memprofilerate: setting memprofilerate=X will update the value of runtime.MemProfileRate. When set to 0 memory profiling is disabled. Refer to the description of MemProfileRate for the default value. @@ -206,6 +205,7 @@ func Version() string { // GOOS is the running program's operating system target: // one of darwin, freebsd, linux, and so on. +// To view possible combinations of GOOS and GOARCH, run "go tool dist list". const GOOS string = sys.GOOS // GOARCH is the running program's architecture target: diff --git a/libgo/go/runtime/fastlog2table.go b/libgo/go/runtime/fastlog2table.go index c36d5835f64..6ba4a7d3f24 100644 --- a/libgo/go/runtime/fastlog2table.go +++ b/libgo/go/runtime/fastlog2table.go @@ -1,4 +1,4 @@ -// AUTO-GENERATED by mkfastlog2table.go +// Code generated by mkfastlog2table.go; DO NOT EDIT. // Run go generate from src/runtime to update. // See mkfastlog2table.go for comments. 
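MapTombstoneCheck in the export_test.go hunk above asserts the tombstone layout the new map code maintains: within a bucket chain, filled and emptyOne cells come first and emptyRest cells form an unbroken tail, so lookups can stop at the first emptyRest. A simplified, self-contained sketch of that ordering check over a flat tophash slice (emptyRest and emptyOne match the constants in the map.go hunk later in this diff; checkTombstones itself is illustrative, not a runtime function):

package main

const (
	emptyRest = 0 // this cell is empty and no later cell in the chain is in use
	emptyOne  = 1 // this cell is empty, but later cells may still be in use
)

// checkTombstones reports whether the emptyRest cells form an unbroken tail,
// mirroring the main ordering check in runtime.MapTombstoneCheck.
func checkTombstones(tophash []uint8) bool {
	sawRest := false
	for _, t := range tophash {
		if t == emptyRest {
			sawRest = true
			continue
		}
		if sawRest {
			return false // a filled or emptyOne cell after an emptyRest cell
		}
	}
	return true
}

func main() {
	// 7 and 9 stand in for tophash values of filled cells (>= minTopHash).
	println(checkTombstones([]uint8{7, emptyOne, 9, emptyRest, emptyRest})) // true
	println(checkTombstones([]uint8{7, emptyRest, emptyOne}))               // false
}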
diff --git a/libgo/go/runtime/gc_test.go b/libgo/go/runtime/gc_test.go index 180919bf3c4..384b75f2de3 100644 --- a/libgo/go/runtime/gc_test.go +++ b/libgo/go/runtime/gc_test.go @@ -22,6 +22,12 @@ func TestGcSys(t *testing.T) { if os.Getenv("GOGC") == "off" { t.Skip("skipping test; GOGC=off in environment") } + if runtime.GOOS == "windows" { + t.Skip("skipping test; GOOS=windows http://golang.org/issue/27156") + } + if runtime.GOOS == "linux" && runtime.GOARCH == "arm64" { + t.Skip("skipping test; GOOS=linux GOARCH=arm64 https://github.com/golang/go/issues/27636") + } got := runTestProg(t, "testprog", "GCSys") want := "OK\n" if got != want { @@ -571,8 +577,8 @@ func BenchmarkWriteBarrier(b *testing.B) { n := &node{mkTree(level - 1), mkTree(level - 1)} if level == 10 { // Seed GC with enough early pointers so it - // doesn't accidentally switch to mark 2 when - // it only has the top of the tree. + // doesn't start termination barriers when it + // only has the top of the tree. wbRoots = append(wbRoots, n) } return n diff --git a/libgo/go/runtime/gcinfo_test.go b/libgo/go/runtime/gcinfo_test.go index ca012bbcd45..89144d520e3 100644 --- a/libgo/go/runtime/gcinfo_test.go +++ b/libgo/go/runtime/gcinfo_test.go @@ -21,14 +21,46 @@ const ( func TestGCInfo(t *testing.T) { t.Skip("skipping on gccgo for now") - verifyGCInfo(t, "stack Ptr", new(Ptr), infoPtr) - verifyGCInfo(t, "stack ScalarPtr", new(ScalarPtr), infoScalarPtr) - verifyGCInfo(t, "stack PtrScalar", new(PtrScalar), infoPtrScalar) - verifyGCInfo(t, "stack BigStruct", new(BigStruct), infoBigStruct()) - verifyGCInfo(t, "stack string", new(string), infoString) - verifyGCInfo(t, "stack slice", new([]string), infoSlice) - verifyGCInfo(t, "stack eface", new(interface{}), infoEface) - verifyGCInfo(t, "stack iface", new(Iface), infoIface) + { + var x Ptr + verifyGCInfo(t, "stack Ptr", &x, infoPtr) + runtime.KeepAlive(x) + } + { + var x ScalarPtr + verifyGCInfo(t, "stack ScalarPtr", &x, infoScalarPtr) + runtime.KeepAlive(x) + } + { + var x PtrScalar + verifyGCInfo(t, "stack PtrScalar", &x, infoPtrScalar) + runtime.KeepAlive(x) + } + { + var x BigStruct + verifyGCInfo(t, "stack BigStruct", &x, infoBigStruct()) + runtime.KeepAlive(x) + } + { + var x string + verifyGCInfo(t, "stack string", &x, infoString) + runtime.KeepAlive(x) + } + { + var x []string + verifyGCInfo(t, "stack slice", &x, infoSlice) + runtime.KeepAlive(x) + } + { + var x interface{} + verifyGCInfo(t, "stack eface", &x, infoEface) + runtime.KeepAlive(x) + } + { + var x Iface + verifyGCInfo(t, "stack iface", &x, infoIface) + runtime.KeepAlive(x) + } for i := 0; i < 10; i++ { verifyGCInfo(t, "heap Ptr", escape(new(Ptr)), trimDead(padDead(infoPtr))) diff --git a/libgo/go/runtime/hash_test.go b/libgo/go/runtime/hash_test.go index 070edb6e9b6..d57be4c8f73 100644 --- a/libgo/go/runtime/hash_test.go +++ b/libgo/go/runtime/hash_test.go @@ -177,13 +177,13 @@ func twoNonZero(h *HashSet, n int) { b := make([]byte, n) // all zero - h.addB(b[:]) + h.addB(b) // one non-zero byte for i := 0; i < n; i++ { for x := 1; x < 256; x++ { b[i] = byte(x) - h.addB(b[:]) + h.addB(b) b[i] = 0 } } @@ -195,7 +195,7 @@ func twoNonZero(h *HashSet, n int) { for j := i + 1; j < n; j++ { for y := 1; y < 256; y++ { b[j] = byte(y) - h.addB(b[:]) + h.addB(b) b[j] = 0 } } diff --git a/libgo/go/runtime/heapdump.go b/libgo/go/runtime/heapdump.go index 5ebebf69474..3aa9e8a23f2 100644 --- a/libgo/go/runtime/heapdump.go +++ b/libgo/go/runtime/heapdump.go @@ -250,7 +250,7 @@ func dumpgoroutine(gp *g) { dumpint(uint64(gp.goid)) 
dumpint(uint64(gp.gopc)) dumpint(uint64(readgstatus(gp))) - dumpbool(isSystemGoroutine(gp)) + dumpbool(isSystemGoroutine(gp, false)) dumpbool(false) // isbackground dumpint(uint64(gp.waitsince)) dumpstr(gp.waitreason.String()) @@ -313,7 +313,7 @@ func finq_callback(fn *funcval, obj unsafe.Pointer, ft *functype, ot *ptrtype) { func dumproots() { // MSpan.types for _, s := range mheap_.allspans { - if s.state == _MSpanInUse { + if s.state == mSpanInUse { // Finalizers for sp := s.specials; sp != nil; sp = sp.next { if sp.kind != _KindSpecialFinalizer { @@ -336,7 +336,7 @@ var freemark [_PageSize / 8]bool func dumpobjs() { for _, s := range mheap_.allspans { - if s.state != _MSpanInUse { + if s.state != mSpanInUse { continue } p := s.base() @@ -485,7 +485,7 @@ func dumpmemprof_callback(b *bucket, nstk uintptr, pstk *location, size, allocs, func dumpmemprof() { iterate_memprof(dumpmemprof_callback) for _, s := range mheap_.allspans { - if s.state != _MSpanInUse { + if s.state != mSpanInUse { continue } for sp := s.specials; sp != nil; sp = sp.next { @@ -506,7 +506,7 @@ var dumphdr = []byte("go1.7 heap dump\n") func mdump() { // make sure we're done sweeping for _, s := range mheap_.allspans { - if s.state == _MSpanInUse { + if s.state == mSpanInUse { s.ensureSwept() } } @@ -529,7 +529,7 @@ func writeheapdump_m(fd uintptr) { _g_.waitreason = waitReasonDumpingHeap // Update stats so we can dump them. - // As a side effect, flushes all the MCaches so the MSpan.freelist + // As a side effect, flushes all the mcaches so the mspan.freelist // lists contain all the free objects. updatememstats() diff --git a/libgo/go/runtime/internal/atomic/atomic.c b/libgo/go/runtime/internal/atomic/atomic.c index b87fae96bd3..17c83a28c1c 100644 --- a/libgo/go/runtime/internal/atomic/atomic.c +++ b/libgo/go/runtime/internal/atomic/atomic.c @@ -13,7 +13,7 @@ uint32_t Load (uint32_t *ptr) uint32_t Load (uint32_t *ptr) { - return __atomic_load_n (ptr, __ATOMIC_ACQUIRE); + return __atomic_load_n (ptr, __ATOMIC_SEQ_CST); } void *Loadp (void *ptr) @@ -23,7 +23,7 @@ void *Loadp (void *ptr) void * Loadp (void *ptr) { - return __atomic_load_n ((void **) ptr, __ATOMIC_ACQUIRE); + return __atomic_load_n ((void **) ptr, __ATOMIC_SEQ_CST); } uint64_t Load64 (uint64_t *ptr) @@ -35,6 +35,16 @@ Load64 (uint64_t *ptr) { if (((uintptr_t) ptr & 7) != 0) panicmem (); + return __atomic_load_n (ptr, __ATOMIC_SEQ_CST); +} + +uint32_t LoadAcq (uint32_t *ptr) + __asm__ (GOSYM_PREFIX "runtime..z2finternal..z2fatomic.LoadAcq") + __attribute__ ((no_split_stack)); + +uint32_t +LoadAcq (uint32_t *ptr) +{ return __atomic_load_n (ptr, __ATOMIC_ACQUIRE); } @@ -45,7 +55,7 @@ uintptr_t Loaduintptr (uintptr_t *ptr) uintptr_t Loaduintptr (uintptr_t *ptr) { - return __atomic_load_n (ptr, __ATOMIC_ACQUIRE); + return __atomic_load_n (ptr, __ATOMIC_SEQ_CST); } uintgo Loaduint (uintgo *ptr) @@ -55,7 +65,7 @@ uintgo Loaduint (uintgo *ptr) uintgo Loaduint (uintgo *ptr) { - return __atomic_load_n (ptr, __ATOMIC_ACQUIRE); + return __atomic_load_n (ptr, __ATOMIC_SEQ_CST); } int64_t Loadint64 (int64_t *ptr) @@ -67,7 +77,7 @@ Loadint64 (int64_t *ptr) { if (((uintptr_t) ptr & 7) != 0) panicmem (); - return __atomic_load_n (ptr, __ATOMIC_ACQUIRE); + return __atomic_load_n (ptr, __ATOMIC_SEQ_CST); } uint32_t Xadd (uint32_t *ptr, int32_t delta) @@ -188,6 +198,16 @@ Cas64 (uint64_t *ptr, uint64_t old, uint64_t new) return __atomic_compare_exchange_n (ptr, &old, new, false, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); } +_Bool CasRel (uint32_t *ptr, uint32_t old, uint32_t new) 
+ __asm__ (GOSYM_PREFIX "runtime..z2finternal..z2fatomic.CasRel") + __attribute__ ((no_split_stack)); + +_Bool +CasRel (uint32_t *ptr, uint32_t old, uint32_t new) +{ + return __atomic_compare_exchange_n (ptr, &old, new, false, __ATOMIC_RELEASE, __ATOMIC_RELAXED); +} + _Bool Casp1 (void **ptr, void *old, void *new) __asm__ (GOSYM_PREFIX "runtime..z2finternal..z2fatomic.Casp1") __attribute__ ((no_split_stack)); @@ -230,6 +250,16 @@ Store64 (uint64_t *ptr, uint64_t val) __atomic_store_n (ptr, val, __ATOMIC_SEQ_CST); } +void StoreRel (uint32_t *ptr, uint32_t val) + __asm__ (GOSYM_PREFIX "runtime..z2finternal..z2fatomic.StoreRel") + __attribute__ ((no_split_stack)); + +void +StoreRel (uint32_t *ptr, uint32_t val) +{ + __atomic_store_n (ptr, val, __ATOMIC_RELEASE); +} + void Storeuintptr (uintptr_t *ptr, uintptr_t val) __asm__ (GOSYM_PREFIX "runtime..z2finternal..z2fatomic.Storeuintptr") __attribute__ ((no_split_stack)); diff --git a/libgo/go/runtime/internal/atomic/gccgo.go b/libgo/go/runtime/internal/atomic/gccgo.go index 696736465fd..e5edbfb17f1 100644 --- a/libgo/go/runtime/internal/atomic/gccgo.go +++ b/libgo/go/runtime/internal/atomic/gccgo.go @@ -18,6 +18,9 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer func Load64(ptr *uint64) uint64 //go:noescape +func LoadAcq(ptr *uint32) uint32 + +//go:noescape func Xadd(ptr *uint32, delta int32) uint32 //go:noescape @@ -47,11 +50,17 @@ func Or8(ptr *uint8, val uint8) func Cas64(ptr *uint64, old, new uint64) bool //go:noescape +func CasRel(ptr *uint32, old, new uint32) bool + +//go:noescape func Store(ptr *uint32, val uint32) //go:noescape func Store64(ptr *uint64, val uint64) +//go:noescape +func StoreRel(ptr *uint32, val uint32) + // StorepNoWB performs *ptr = val atomically and without a write // barrier. // diff --git a/libgo/go/runtime/internal/math/math.go b/libgo/go/runtime/internal/math/math.go new file mode 100644 index 00000000000..5385f5dd868 --- /dev/null +++ b/libgo/go/runtime/internal/math/math.go @@ -0,0 +1,19 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package math + +import "runtime/internal/sys" + +const MaxUintptr = ^uintptr(0) + +// MulUintptr returns a * b and whether the multiplication overflowed. +// On supported platforms this is an intrinsic lowered by the compiler. +func MulUintptr(a, b uintptr) (uintptr, bool) { + if a|b < 1<<(4*sys.PtrSize) || a == 0 { + return a * b, false + } + overflow := b > MaxUintptr/a + return a * b, overflow +} diff --git a/libgo/go/runtime/internal/math/math_test.go b/libgo/go/runtime/internal/math/math_test.go new file mode 100644 index 00000000000..303eb63405a --- /dev/null +++ b/libgo/go/runtime/internal/math/math_test.go @@ -0,0 +1,79 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package math_test + +import ( + . 
"runtime/internal/math" + "testing" +) + +const ( + UintptrSize = 32 << (^uintptr(0) >> 63) +) + +type mulUintptrTest struct { + a uintptr + b uintptr + overflow bool +} + +var mulUintptrTests = []mulUintptrTest{ + {0, 0, false}, + {1000, 1000, false}, + {MaxUintptr, 0, false}, + {MaxUintptr, 1, false}, + {MaxUintptr / 2, 2, false}, + {MaxUintptr / 2, 3, true}, + {MaxUintptr, 10, true}, + {MaxUintptr, 100, true}, + {MaxUintptr / 100, 100, false}, + {MaxUintptr / 1000, 1001, true}, + {1<<(UintptrSize/2) - 1, 1<<(UintptrSize/2) - 1, false}, + {1 << (UintptrSize / 2), 1 << (UintptrSize / 2), true}, + {MaxUintptr >> 32, MaxUintptr >> 32, false}, + {MaxUintptr, MaxUintptr, true}, +} + +func TestMulUintptr(t *testing.T) { + for _, test := range mulUintptrTests { + a, b := test.a, test.b + for i := 0; i < 2; i++ { + mul, overflow := MulUintptr(a, b) + if mul != a*b || overflow != test.overflow { + t.Errorf("MulUintptr(%v, %v) = %v, %v want %v, %v", + a, b, mul, overflow, a*b, test.overflow) + } + a, b = b, a + } + } +} + +var SinkUintptr uintptr +var SinkBool bool + +var x, y uintptr + +func BenchmarkMulUintptr(b *testing.B) { + x, y = 1, 2 + b.Run("small", func(b *testing.B) { + for i := 0; i < b.N; i++ { + var overflow bool + SinkUintptr, overflow = MulUintptr(x, y) + if overflow { + SinkUintptr = 0 + } + } + }) + x, y = MaxUintptr, MaxUintptr-1 + b.Run("large", func(b *testing.B) { + for i := 0; i < b.N; i++ { + var overflow bool + SinkUintptr, overflow = MulUintptr(x, y) + if overflow { + SinkUintptr = 0 + } + } + }) +} diff --git a/libgo/go/runtime/lfstack_64bit.go b/libgo/go/runtime/lfstack_64bit.go index 401f83d9d5b..de40a00324a 100644 --- a/libgo/go/runtime/lfstack_64bit.go +++ b/libgo/go/runtime/lfstack_64bit.go @@ -41,8 +41,8 @@ const ( ia64CntBits = 64 - ia64AddrBits + 3 // On AIX, 64-bit addresses are split into 36-bit segment number and 28-bit - // offset in segment. Segment numbers in the range 0x070000000-0x07FFFFFFF - // and 0x0A0000000-0x0AFFFFFFF(LSA) are available for mmap. + // offset in segment. Segment numbers in the range 0x0A0000000-0x0AFFFFFFF(LSA) + // are available for mmap. // We assume all lfnode addresses are from memory allocated with mmap. // We use one bit to distinguish between the two ranges. 
aixAddrBits = 57 @@ -77,11 +77,7 @@ func lfstackUnpack(val uint64) *lfnode { return (*lfnode)(unsafe.Pointer(uintptr(((val & (1<<(64-3) - 1)) >> ia64CntBits << 3) | val&^(1<<(64-3)-1)))) } if GOARCH == "ppc64" && GOOS == "aix" { - if val&(1<<63) != 0 { - return (*lfnode)(unsafe.Pointer(uintptr((val >> aixCntBits << 3) | 0x7<<56))) - } else { - return (*lfnode)(unsafe.Pointer(uintptr((val >> aixCntBits << 3) | 0xa<<56))) - } + return (*lfnode)(unsafe.Pointer(uintptr((val >> aixCntBits << 3) | 0xa<<56))) } return (*lfnode)(unsafe.Pointer(uintptr(val >> cntBits << 3))) } diff --git a/libgo/go/runtime/lock_futex.go b/libgo/go/runtime/lock_futex.go index f7ca1f036e5..9cede2d41c3 100644 --- a/libgo/go/runtime/lock_futex.go +++ b/libgo/go/runtime/lock_futex.go @@ -242,7 +242,7 @@ func notetsleepg(n *note, ns int64) bool { return ok } -func pauseSchedulerUntilCallback() bool { +func beforeIdle() bool { return false } diff --git a/libgo/go/runtime/lock_js.go b/libgo/go/runtime/lock_js.go index df321e51963..f58c915b630 100644 --- a/libgo/go/runtime/lock_js.go +++ b/libgo/go/runtime/lock_js.go @@ -92,7 +92,7 @@ func notetsleepg(n *note, ns int64) bool { delay = 1<<31 - 1 // cap to max int32 } - id := scheduleCallback(delay) + id := scheduleTimeoutEvent(delay) mp := acquirem() notes[n] = gp notesWithTimeout[n] = noteWithTimeout{gp: gp, deadline: deadline} @@ -100,7 +100,7 @@ func notetsleepg(n *note, ns int64) bool { gopark(nil, nil, waitReasonSleep, traceEvNone, 1) - clearScheduledCallback(id) // note might have woken early, clear timeout + clearTimeoutEvent(id) // note might have woken early, clear timeout mp = acquirem() delete(notes, n) delete(notesWithTimeout, n) @@ -127,46 +127,68 @@ func notetsleepg(n *note, ns int64) bool { func checkTimeouts() { now := nanotime() for n, nt := range notesWithTimeout { - if n.key == note_cleared && now > nt.deadline { + if n.key == note_cleared && now >= nt.deadline { n.key = note_timeout goready(nt.gp, 1) } } } -var waitingForCallback *g +var returnedEventHandler *g -// sleepUntilCallback puts the current goroutine to sleep until a callback is triggered. -// It is currently only used by the callback routine of the syscall/js package. -//go:linkname sleepUntilCallback syscall/js.sleepUntilCallback -func sleepUntilCallback() { - waitingForCallback = getg() +func init() { + // At the toplevel we need an extra goroutine that handles asynchronous events. + initg := getg() + go func() { + returnedEventHandler = getg() + goready(initg, 1) + + gopark(nil, nil, waitReasonZero, traceEvNone, 1) + returnedEventHandler = nil + + pause(getcallersp() - 16) + }() gopark(nil, nil, waitReasonZero, traceEvNone, 1) - waitingForCallback = nil } -// pauseSchedulerUntilCallback gets called from the scheduler and pauses the execution -// of Go's WebAssembly code until a callback is triggered. Then it checks for note timeouts -// and resumes goroutines that are waiting for a callback. -func pauseSchedulerUntilCallback() bool { - if waitingForCallback == nil && len(notesWithTimeout) == 0 { - return false +// beforeIdle gets called by the scheduler if no goroutine is awake. +// We resume the event handler (if available) which will pause the execution. +func beforeIdle() bool { + if returnedEventHandler != nil { + goready(returnedEventHandler, 1) + return true } + return false +} + +// pause sets SP to newsp and pauses the execution of Go's WebAssembly code until an event is triggered. 
+func pause(newsp uintptr) + +// scheduleTimeoutEvent tells the WebAssembly environment to trigger an event after ms milliseconds. +// It returns a timer id that can be used with clearTimeoutEvent. +func scheduleTimeoutEvent(ms int64) int32 + +// clearTimeoutEvent clears a timeout event scheduled by scheduleTimeoutEvent. +func clearTimeoutEvent(id int32) + +func handleEvent() { + prevReturnedEventHandler := returnedEventHandler + returnedEventHandler = nil - pause() checkTimeouts() - if waitingForCallback != nil { - goready(waitingForCallback, 1) - } - return true -} + eventHandler() + + returnedEventHandler = getg() + gopark(nil, nil, waitReasonZero, traceEvNone, 1) + + returnedEventHandler = prevReturnedEventHandler -// pause pauses the execution of Go's WebAssembly code until a callback is triggered. -func pause() + pause(getcallersp() - 16) +} -// scheduleCallback tells the WebAssembly environment to trigger a callback after ms milliseconds. -// It returns a timer id that can be used with clearScheduledCallback. -func scheduleCallback(ms int64) int32 +var eventHandler func() -// clearScheduledCallback clears a callback scheduled by scheduleCallback. -func clearScheduledCallback(id int32) +//go:linkname setEventHandler syscall/js.setEventHandler +func setEventHandler(fn func()) { + eventHandler = fn +} diff --git a/libgo/go/runtime/lock_sema.go b/libgo/go/runtime/lock_sema.go index 237513ca0cb..c748ca70ec3 100644 --- a/libgo/go/runtime/lock_sema.go +++ b/libgo/go/runtime/lock_sema.go @@ -295,7 +295,7 @@ func notetsleepg(n *note, ns int64) bool { return ok } -func pauseSchedulerUntilCallback() bool { +func beforeIdle() bool { return false } diff --git a/libgo/go/runtime/malloc.go b/libgo/go/runtime/malloc.go index ac4759ffbf1..36417fb54f4 100644 --- a/libgo/go/runtime/malloc.go +++ b/libgo/go/runtime/malloc.go @@ -106,6 +106,7 @@ package runtime import ( "runtime/internal/atomic" + "runtime/internal/math" "runtime/internal/sys" "unsafe" ) @@ -135,8 +136,6 @@ const ( // have the most objects per span. maxObjsPerSpan = pageSize / 8 - mSpanInUse = _MSpanInUse - concurrentSweep = _ConcurrentSweep _PageSize = 1 << _PageShift @@ -149,8 +148,7 @@ const ( _TinySize = 16 _TinySizeClass = int8(2) - _FixAllocChunk = 16 << 10 // Chunk size for FixAlloc - _MaxMHeapList = 1 << (20 - _PageShift) // Maximum page length for fixed-size list in MHeap. + _FixAllocChunk = 16 << 10 // Chunk size for FixAlloc // Per-P, per order stack segment cache size. _StackCacheSize = 32 * 1024 @@ -173,7 +171,7 @@ const ( // amd64, addresses are sign-extended beyond heapAddrBits. On // other arches, they are zero-extended. // - // On 64-bit platforms, we limit this to 48 bits based on a + // On most 64-bit platforms, we limit this to 48 bits based on a // combination of hardware and OS limitations. // // amd64 hardware limits addresses to 48 bits, sign-extended @@ -191,10 +189,9 @@ const ( // bits, in the range [0, 1<<48). // // ppc64, mips64, and s390x support arbitrary 64 bit addresses - // in hardware. However, since Go only supports Linux on - // these, we lean on OS limits. Based on Linux's processor.h, - // the user address space is limited as follows on 64-bit - // architectures: + // in hardware. On Linux, Go leans on stricter OS limits. 
Based + // on Linux's processor.h, the user address space is limited as + // follows on 64-bit architectures: // // Architecture Name Maximum Value (exclusive) // --------------------------------------------------------------------- @@ -211,13 +208,17 @@ const ( // exceed Go's 48 bit limit, it's extremely unlikely in // practice. // + // On aix/ppc64, the limits is increased to 1<<60 to accept addresses + // returned by mmap syscall. These are in range: + // 0x0a00000000000000 - 0x0afffffffffffff + // // On 32-bit platforms, we accept the full 32-bit address // space because doing so is cheap. // mips32 only has access to the low 2GB of virtual memory, so // we further limit it to 31 bits. // // WebAssembly currently has a limit of 4GB linear memory. - heapAddrBits = (_64bit*(1-sys.GoarchWasm))*48 + (1-_64bit+sys.GoarchWasm)*(32-(sys.GoarchMips+sys.GoarchMipsle)) + heapAddrBits = (_64bit*(1-sys.GoarchWasm)*(1-sys.GoosAix))*48 + (1-_64bit+sys.GoarchWasm)*(32-(sys.GoarchMips+sys.GoarchMipsle)) + 60*sys.GoosAix // maxAlloc is the maximum size of an allocation. On 64-bit, // it's theoretically possible to allocate 1<<heapAddrBits bytes. On @@ -229,16 +230,17 @@ const ( // The number of bits in a heap address, the size of heap // arenas, and the L1 and L2 arena map sizes are related by // - // (1 << addrBits) = arenaBytes * L1entries * L2entries + // (1 << addr bits) = arena size * L1 entries * L2 entries // // Currently, we balance these as follows: // - // Platform Addr bits Arena size L1 entries L2 size - // -------------- --------- ---------- ---------- ------- - // */64-bit 48 64MB 1 32MB - // windows/64-bit 48 4MB 64 8MB - // */32-bit 32 4MB 1 4KB - // */mips(le) 31 4MB 1 2KB + // Platform Addr bits Arena size L1 entries L2 entries + // -------------- --------- ---------- ---------- ----------- + // */64-bit 48 64MB 1 4M (32MB) + // aix/64-bit 60 256MB 4096 4M (32MB) + // windows/64-bit 48 4MB 64 1M (8MB) + // */32-bit 32 4MB 1 1024 (4KB) + // */mips(le) 31 4MB 1 512 (2KB) // heapArenaBytes is the size of a heap arena. The heap // consists of mappings of size heapArenaBytes, aligned to @@ -257,7 +259,7 @@ const ( // logHeapArenaBytes is log_2 of heapArenaBytes. For clarity, // prefer using heapArenaBytes where possible (we need the // constant to compute some other constants). - logHeapArenaBytes = (6+20)*(_64bit*(1-sys.GoosWindows)) + (2+20)*(_64bit*sys.GoosWindows) + (2+20)*(1-_64bit) + logHeapArenaBytes = (6+20)*(_64bit*(1-sys.GoosWindows)*(1-sys.GoosAix)) + (2+20)*(_64bit*sys.GoosWindows) + (2+20)*(1-_64bit) + (8+20)*sys.GoosAix // heapArenaBitmapBytes is the size of each heap arena's bitmap. heapArenaBitmapBytes = heapArenaBytes / (sys.PtrSize * 8 / 2) @@ -277,7 +279,10 @@ const ( // We use the L1 map on 64-bit Windows because the arena size // is small, but the address space is still 48 bits, and // there's a high cost to having a large L2. - arenaL1Bits = 6 * (_64bit * sys.GoosWindows) + // + // We use the L1 map on aix/ppc64 to keep the same L2 value + // as on Linux. + arenaL1Bits = 6*(_64bit*sys.GoosWindows) + 12*sys.GoosAix // arenaL2Bits is the number of bits of the arena number // covered by the second level arena index. @@ -339,27 +344,27 @@ var physPageSize uintptr // may use larger alignment, so the caller must be careful to realign the // memory obtained by sysAlloc. 
// -// SysUnused notifies the operating system that the contents +// sysUnused notifies the operating system that the contents // of the memory region are no longer needed and can be reused // for other purposes. -// SysUsed notifies the operating system that the contents +// sysUsed notifies the operating system that the contents // of the memory region are needed again. // -// SysFree returns it unconditionally; this is only used if +// sysFree returns it unconditionally; this is only used if // an out-of-memory error has been detected midway through -// an allocation. It is okay if SysFree is a no-op. +// an allocation. It is okay if sysFree is a no-op. // -// SysReserve reserves address space without allocating memory. +// sysReserve reserves address space without allocating memory. // If the pointer passed to it is non-nil, the caller wants the -// reservation there, but SysReserve can still choose another +// reservation there, but sysReserve can still choose another // location if that one is unavailable. -// NOTE: SysReserve returns OS-aligned memory, but the heap allocator +// NOTE: sysReserve returns OS-aligned memory, but the heap allocator // may use larger alignment, so the caller must be careful to realign the // memory obtained by sysAlloc. // -// SysMap maps previously reserved address space for use. +// sysMap maps previously reserved address space for use. // -// SysFault marks a (already sysAlloc'd) region to fault +// sysFault marks a (already sysAlloc'd) region to fault // if accessed. Used only for debugging the runtime. func mallocinit() { @@ -432,8 +437,8 @@ func mallocinit() { // allocation at 0x40 << 32 because when using 4k pages with 3-level // translation buffers, the user address space is limited to 39 bits // On darwin/arm64, the address space is even smaller. - // On AIX, mmap adresses range starts at 0x0700000000000000 for 64-bit - // processes. The new address space allocator starts at 0x0A00000000000000. + // On AIX, mmaps starts at 0x0A00000000000000 for 64-bit. + // processes. for i := 0x7f; i >= 0; i-- { var p uintptr switch { @@ -443,10 +448,11 @@ func mallocinit() { p = uintptr(i)<<40 | uintptrMask&(0x0040<<32) case GOOS == "aix": if i == 0 { - p = uintptrMask&(1<<42) | uintptrMask&(0xa0<<52) - } else { - p = uintptr(i)<<42 | uintptrMask&(0x70<<52) + // We don't use addresses directly after 0x0A00000000000000 + // to avoid collisions with others mmaps done by non-go programs. + continue } + p = uintptr(i)<<40 | uintptrMask&(0xa0<<52) case raceenabled: // The TSAN runtime requires the heap // to be in the range [0x00c000000000, @@ -480,7 +486,7 @@ func mallocinit() { // 3. We try to stake out a reasonably large initial // heap reservation. - const arenaMetaSize = unsafe.Sizeof([1 << arenaBits]heapArena{}) + const arenaMetaSize = (1 << arenaBits) * unsafe.Sizeof(heapArena{}) meta := uintptr(sysReserve(nil, arenaMetaSize)) if meta != 0 { mheap_.heapArenaAlloc.init(meta, arenaMetaSize) @@ -663,6 +669,27 @@ mapped: } } + // Add the arena to the arenas list. 
+ if len(h.allArenas) == cap(h.allArenas) { + size := 2 * uintptr(cap(h.allArenas)) * sys.PtrSize + if size == 0 { + size = physPageSize + } + newArray := (*notInHeap)(persistentalloc(size, sys.PtrSize, &memstats.gc_sys)) + if newArray == nil { + throw("out of memory allocating allArenas") + } + oldSlice := h.allArenas + *(*notInHeapSlice)(unsafe.Pointer(&h.allArenas)) = notInHeapSlice{newArray, len(h.allArenas), int(size / sys.PtrSize)} + copy(h.allArenas, oldSlice) + // Do not free the old backing array because + // there may be concurrent readers. Since we + // double the array each time, this can lead + // to at most 2x waste. + } + h.allArenas = h.allArenas[:len(h.allArenas)+1] + h.allArenas[len(h.allArenas)-1] = ri + // Store atomically just in case an object from the // new heap arena becomes visible before the heap lock // is released (which shouldn't happen, but there's @@ -755,6 +782,9 @@ func nextFreeFast(s *mspan) gclinkptr { // weight allocation. If it is a heavy weight allocation the caller must // determine whether a new GC cycle needs to be started or if the GC is active // whether this goroutine needs to assist the GC. +// +// Must run in a non-preemptible context since otherwise the owner of +// c could change. func (c *mcache) nextFree(spc spanClass) (v gclinkptr, s *mspan, shouldhelpgc bool) { s = c.alloc[spc] shouldhelpgc = false @@ -765,9 +795,7 @@ func (c *mcache) nextFree(spc spanClass) (v gclinkptr, s *mspan, shouldhelpgc bo println("runtime: s.allocCount=", s.allocCount, "s.nelems=", s.nelems) throw("s.allocCount != s.nelems && freeIndex == s.nelems") } - systemstack(func() { - c.refill(spc) - }) + c.refill(spc) shouldhelpgc = true s = c.alloc[spc] @@ -1018,7 +1046,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { if shouldhelpgc { if t := (gcTrigger{kind: gcTriggerHeap}); t.test() { - gcStart(gcBackgroundMode, t) + gcStart(t) } } @@ -1076,10 +1104,11 @@ func newarray(typ *_type, n int) unsafe.Pointer { if n == 1 { return mallocgc(typ.size, typ, true) } - if n < 0 || uintptr(n) > maxSliceCap(typ.size) { + mem, overflow := math.MulUintptr(typ.size, uintptr(n)) + if overflow || mem > maxAlloc || n < 0 { panic(plainError("runtime: allocation size out of range")) } - return mallocgc(typ.size*uintptr(n), typ, true) + return mallocgc(mem, typ, true) } //go:linkname reflect_unsafe_NewArray reflect.unsafe_NewArray @@ -1164,6 +1193,15 @@ var globalAlloc struct { persistentAlloc } +// persistentChunkSize is the number of bytes we allocate when we grow +// a persistentAlloc. +const persistentChunkSize = 256 << 10 + +// persistentChunks is a list of all the persistent chunks we have +// allocated. The list is maintained through the first word in the +// persistent chunk. This is updated atomically. +var persistentChunks *notInHeap + // Wrapper around sysAlloc that can allocate small chunks. // There is no associated free operation. // Intended for things like function/type/debug-related persistent data. 
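The persistentalloc changes above thread every chunk onto a single list by writing the previous list head into the first word of each newly allocated chunk, which is what lets inPersistentAlloc (used by the cgocheck.go hunk near the top of this diff) decide whether a pointer came from persistentalloc. A simplified sketch of the same idea, using ordinary heap allocation and an explicit prev field instead of the block's raw first word; the runtime also publishes the new head with an atomic compare-and-swap, which is only noted in a comment here:

package main

import "unsafe"

const chunkSize = 256 << 10 // mirrors persistentChunkSize

// chunk stands in for one persistentalloc block. The runtime has no struct:
// it reuses the first word of the raw sysAlloc'd block as the prev link.
type chunk struct {
	prev *chunk
	buf  [chunkSize]byte
}

var chunks *chunk // head of the list of every chunk ever allocated

func newChunk() *chunk {
	c := &chunk{prev: chunks}
	chunks = c // the runtime publishes the new head with an atomic CAS
	return c
}

// inChunks reports whether p points into any chunk, like inPersistentAlloc.
// Here the chunks are ordinary heap objects, so the addresses are only
// meaningful while they stay live; the runtime's chunks are off-heap.
func inChunks(p uintptr) bool {
	for c := chunks; c != nil; c = c.prev {
		base := uintptr(unsafe.Pointer(&c.buf[0]))
		if p >= base && p < base+chunkSize {
			return true
		}
	}
	return false
}

func main() {
	c := newChunk()
	p := uintptr(unsafe.Pointer(&c.buf[100]))
	println(inChunks(p), inChunks(0)) // true false
}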
@@ -1184,7 +1222,6 @@ func persistentalloc(size, align uintptr, sysStat *uint64) unsafe.Pointer { //go:systemstack func persistentalloc1(size, align uintptr, sysStat *uint64) *notInHeap { const ( - chunk = 256 << 10 maxBlock = 64 << 10 // VM reservation granularity is 64K on windows ) @@ -1215,15 +1252,24 @@ func persistentalloc1(size, align uintptr, sysStat *uint64) *notInHeap { persistent = &globalAlloc.persistentAlloc } persistent.off = round(persistent.off, align) - if persistent.off+size > chunk || persistent.base == nil { - persistent.base = (*notInHeap)(sysAlloc(chunk, &memstats.other_sys)) + if persistent.off+size > persistentChunkSize || persistent.base == nil { + persistent.base = (*notInHeap)(sysAlloc(persistentChunkSize, &memstats.other_sys)) if persistent.base == nil { if persistent == &globalAlloc.persistentAlloc { unlock(&globalAlloc.mutex) } throw("runtime: cannot allocate memory") } - persistent.off = 0 + + // Add the new chunk to the persistentChunks list. + for { + chunks := uintptr(unsafe.Pointer(persistentChunks)) + *(*uintptr)(unsafe.Pointer(persistent.base)) = chunks + if atomic.Casuintptr((*uintptr)(unsafe.Pointer(&persistentChunks)), chunks, uintptr(unsafe.Pointer(persistent.base))) { + break + } + } + persistent.off = sys.PtrSize } p := persistent.base.add(persistent.off) persistent.off += size @@ -1239,6 +1285,21 @@ func persistentalloc1(size, align uintptr, sysStat *uint64) *notInHeap { return p } +// inPersistentAlloc reports whether p points to memory allocated by +// persistentalloc. This must be nosplit because it is called by the +// cgo checker code, which is called by the write barrier code. +//go:nosplit +func inPersistentAlloc(p uintptr) bool { + chunk := atomic.Loaduintptr((*uintptr)(unsafe.Pointer(&persistentChunks))) + for chunk != 0 { + if p >= chunk && p < chunk+persistentChunkSize { + return true + } + chunk = *(*uintptr)(unsafe.Pointer(chunk)) + } + return false +} + // linearAlloc is a simple linear allocator that pre-reserves a region // of memory and then maps that region as needed. The caller is // responsible for locking. diff --git a/libgo/go/runtime/malloc_test.go b/libgo/go/runtime/malloc_test.go index 30a7d844c63..c9282bae55e 100644 --- a/libgo/go/runtime/malloc_test.go +++ b/libgo/go/runtime/malloc_test.go @@ -170,6 +170,14 @@ func TestTinyAlloc(t *testing.T) { } } +func TestPhysicalMemoryUtilization(t *testing.T) { + got := runTestProg(t, "testprog", "GCPhys") + want := "OK\n" + if got != want { + t.Fatalf("expected %q, but got %q", want, got) + } +} + type acLink struct { x [1 << 20]byte } @@ -177,6 +185,14 @@ type acLink struct { var arenaCollisionSink []*acLink func TestArenaCollision(t *testing.T) { + if GOOS == "darwin" && race.Enabled { + // Skip this test on Darwin in race mode because Darwin 10.10 has + // issues following arena hints and runs out of them in race mode, so + // MAP_FIXED is used to ensure we keep the heap in the memory region the + // race detector expects. + // TODO(mknyszek): Delete this when Darwin 10.10 is no longer supported. 
+ t.Skip("disabled on Darwin with race mode since MAP_FIXED is used") + } testenv.MustHaveExec(t) // Test that mheap.sysAlloc handles collisions with other diff --git a/libgo/go/runtime/map.go b/libgo/go/runtime/map.go index 52462c7e117..5dd5283e1ec 100644 --- a/libgo/go/runtime/map.go +++ b/libgo/go/runtime/map.go @@ -55,6 +55,7 @@ package runtime import ( "runtime/internal/atomic" + "runtime/internal/math" "runtime/internal/sys" "unsafe" ) @@ -103,11 +104,12 @@ const ( // Each bucket (including its overflow buckets, if any) will have either all or none of its // entries in the evacuated* states (except during the evacuate() method, which only happens // during map writes and thus no one else can observe the map during that time). - empty = 0 // cell is empty - evacuatedEmpty = 1 // cell is empty, bucket is evacuated. + emptyRest = 0 // this cell is empty, and there are no more non-empty cells at higher indexes or overflows. + emptyOne = 1 // this cell is empty evacuatedX = 2 // key/value is valid. Entry has been evacuated to first half of larger table. evacuatedY = 3 // same as above, but evacuated to second half of larger table. - minTopHash = 4 // minimum tophash for a normal filled cell. + evacuatedEmpty = 4 // cell is empty, bucket is evacuated. + minTopHash = 5 // minimum tophash for a normal filled cell. // flags iterator = 1 // there may be an iterator using buckets @@ -119,6 +121,11 @@ const ( noCheck = 1<<(8*sys.PtrSize) - 1 ) +// isEmpty reports whether the given tophash array entry represents an empty bucket entry. +func isEmpty(x uint8) bool { + return x <= emptyOne +} + // A header for a Go map. type hmap struct { // Note: the format of the hmap is also encoded in cmd/compile/internal/gc/reflect.go. @@ -211,7 +218,7 @@ func tophash(hash uintptr) uint8 { func evacuated(b *bmap) bool { h := b.tophash[0] - return h > empty && h < minTopHash + return h > emptyOne && h < minTopHash } func (b *bmap) overflow(t *maptype) *bmap { @@ -311,7 +318,8 @@ func makemap_small() *hmap { // If h != nil, the map can be created directly in h. // If h.buckets != nil, bucket pointed to can be used as the first bucket. func makemap(t *maptype, hint int, h *hmap) *hmap { - if hint < 0 || hint > int(maxSliceCap(t.bucket.size)) { + mem, overflow := math.MulUintptr(uintptr(hint), t.bucket.size) + if overflow || mem > maxAlloc { hint = 0 } @@ -321,7 +329,8 @@ func makemap(t *maptype, hint int, h *hmap) *hmap { } h.hash0 = fastrand() - // find size parameter which will hold the requested # of elements + // Find the size parameter B which will hold the requested # of elements. + // For hint < 0 overLoadFactor returns false since hint < bucketCnt. 
B := uint8(0) for overLoadFactor(hint, B) { B++ @@ -439,9 +448,13 @@ func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { } } top := tophash(hash) +bucketloop: for ; b != nil; b = b.overflow(t) { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { + if b.tophash[i] == emptyRest { + break bucketloop + } continue } k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) @@ -500,9 +513,13 @@ func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool) } } top := tophash(hash) +bucketloop: for ; b != nil; b = b.overflow(t) { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { + if b.tophash[i] == emptyRest { + break bucketloop + } continue } k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) @@ -547,9 +564,13 @@ func mapaccessK(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, unsafe } } top := tophash(hash) +bucketloop: for ; b != nil; b = b.overflow(t) { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { + if b.tophash[i] == emptyRest { + break bucketloop + } continue } k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) @@ -612,7 +633,7 @@ func mapassign(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer { // Set hashWriting after calling alg.hash, since alg.hash may panic, // in which case we have not actually done a write. - h.flags |= hashWriting + h.flags ^= hashWriting if h.buckets == nil { h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) @@ -629,14 +650,18 @@ again: var inserti *uint8 var insertk unsafe.Pointer var val unsafe.Pointer +bucketloop: for { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { - if b.tophash[i] == empty && inserti == nil { + if isEmpty(b.tophash[i]) && inserti == nil { inserti = &b.tophash[i] insertk = add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) val = add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.valuesize)) } + if b.tophash[i] == emptyRest { + break bucketloop + } continue } k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) @@ -728,18 +753,22 @@ func mapdelete(t *maptype, h *hmap, key unsafe.Pointer) { // Set hashWriting after calling alg.hash, since alg.hash may panic, // in which case we have not actually done a write (delete). - h.flags |= hashWriting + h.flags ^= hashWriting bucket := hash & bucketMask(h.B) if h.growing() { growWork(t, h, bucket) } b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) + bOrig := b top := tophash(hash) search: for ; b != nil; b = b.overflow(t) { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { + if b.tophash[i] == emptyRest { + break search + } continue } k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize)) @@ -764,7 +793,39 @@ search: } else { memclrNoHeapPointers(v, t.elem.size) } - b.tophash[i] = empty + b.tophash[i] = emptyOne + // If the bucket now ends in a bunch of emptyOne states, + // change those to emptyRest states. + // It would be nice to make this a separate function, but + // for loops are not currently inlineable. + if i == bucketCnt-1 { + if b.overflow(t) != nil && b.overflow(t).tophash[0] != emptyRest { + goto notLast + } + } else { + if b.tophash[i+1] != emptyRest { + goto notLast + } + } + for { + b.tophash[i] = emptyRest + if i == 0 { + if b == bOrig { + break // beginning of initial bucket, we're done. + } + // Find previous bucket, continue at its last entry. 
+ c := b + for b = bOrig; b.overflow(t) != c; b = b.overflow(t) { + } + i = bucketCnt - 1 + } else { + i-- + } + if b.tophash[i] != emptyOne { + break + } + } + notLast: h.count-- break search } @@ -899,7 +960,9 @@ next: } for ; i < bucketCnt; i++ { offi := (i + it.offset) & (bucketCnt - 1) - if b.tophash[offi] == empty || b.tophash[offi] == evacuatedEmpty { + if isEmpty(b.tophash[offi]) || b.tophash[offi] == evacuatedEmpty { + // TODO: emptyRest is hard to use here, as we start iterating + // in the middle of a bucket. It's feasible, just tricky. continue } k := add(unsafe.Pointer(b), dataOffset+uintptr(offi)*uintptr(t.keysize)) @@ -990,7 +1053,7 @@ func mapclear(t *maptype, h *hmap) { throw("concurrent map writes") } - h.flags |= hashWriting + h.flags ^= hashWriting h.flags &^= sameSizeGrow h.oldbuckets = nil @@ -1158,7 +1221,7 @@ func evacuate(t *maptype, h *hmap, oldbucket uintptr) { v := add(k, bucketCnt*uintptr(t.keysize)) for i := 0; i < bucketCnt; i, k, v = i+1, add(k, uintptr(t.keysize)), add(v, uintptr(t.valuesize)) { top := b.tophash[i] - if top == empty { + if isEmpty(top) { b.tophash[i] = evacuatedEmpty continue } @@ -1195,7 +1258,7 @@ func evacuate(t *maptype, h *hmap, oldbucket uintptr) { } } - if evacuatedX+1 != evacuatedY { + if evacuatedX+1 != evacuatedY || evacuatedX^1 != evacuatedY { throw("bad evacuatedN") } @@ -1351,6 +1414,11 @@ func reflect_mapiterkey(it *hiter) unsafe.Pointer { return it.key } +//go:linkname reflect_mapitervalue reflect.mapitervalue +func reflect_mapitervalue(it *hiter) unsafe.Pointer { + return it.value +} + //go:linkname reflect_maplen reflect.maplen func reflect_maplen(h *hmap) int { if h == nil { diff --git a/libgo/go/runtime/map_benchmark_test.go b/libgo/go/runtime/map_benchmark_test.go index 025c0398d3b..d37dadcb569 100644 --- a/libgo/go/runtime/map_benchmark_test.go +++ b/libgo/go/runtime/map_benchmark_test.go @@ -5,6 +5,7 @@ package runtime_test import ( "fmt" + "math/rand" "strconv" "strings" "testing" @@ -206,6 +207,67 @@ func BenchmarkIntMap(b *testing.B) { } } +func BenchmarkMapFirst(b *testing.B) { + for n := 1; n <= 16; n++ { + b.Run(fmt.Sprintf("%d", n), func(b *testing.B) { + m := make(map[int]bool) + for i := 0; i < n; i++ { + m[i] = true + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = m[0] + } + }) + } +} +func BenchmarkMapMid(b *testing.B) { + for n := 1; n <= 16; n++ { + b.Run(fmt.Sprintf("%d", n), func(b *testing.B) { + m := make(map[int]bool) + for i := 0; i < n; i++ { + m[i] = true + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = m[n>>1] + } + }) + } +} +func BenchmarkMapLast(b *testing.B) { + for n := 1; n <= 16; n++ { + b.Run(fmt.Sprintf("%d", n), func(b *testing.B) { + m := make(map[int]bool) + for i := 0; i < n; i++ { + m[i] = true + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = m[n-1] + } + }) + } +} + +func BenchmarkMapCycle(b *testing.B) { + // Arrange map entries to be a permuation, so that + // we hit all entries, and one lookup is data dependent + // on the previous lookup. + const N = 3127 + p := rand.New(rand.NewSource(1)).Perm(N) + m := map[int]int{} + for i := 0; i < N; i++ { + m[i] = p[i] + } + b.ResetTimer() + j := 0 + for i := 0; i < b.N; i++ { + j = m[j] + } + sink = uint64(j) +} + // Accessing the same keys in a row. 
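The mapdelete hunk above, and its fast-path twins later in the patch, mark the deleted slot emptyOne and then, when nothing follows it, walk backwards converting the trailing run of emptyOne slots into emptyRest so later lookups can terminate early. The sketch below shows that back-walk for a single bucket only; the real code additionally re-chases the overflow chain from the front whenever the walk crosses a bucket boundary, and deleteAt is an invented name.

package main

import "fmt"

const (
	emptyRest = 0
	emptyOne  = 1
)

// deleteAt marks slot i deleted and coalesces a trailing run of empty
// slots into emptyRest, mirroring the single-bucket case of the patch.
func deleteAt(tophash []uint8, i int) {
	tophash[i] = emptyOne
	// Only coalesce if everything after i is already emptyRest
	// (or i is the last slot of the bucket).
	if i != len(tophash)-1 && tophash[i+1] != emptyRest {
		return
	}
	for i >= 0 && tophash[i] == emptyOne {
		tophash[i] = emptyRest
		i--
	}
}

func main() {
	b := []uint8{7, 9, emptyOne, 8, emptyRest, emptyRest, emptyRest, emptyRest}
	deleteAt(b, 3) // removing the last live entry lets slots 2..7 become emptyRest
	fmt.Println(b) // [7 9 0 0 0 0 0 0]
}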
func benchmarkRepeatedLookup(b *testing.B, lookupKeySize int) { m := make(map[string]bool) @@ -228,6 +290,23 @@ func benchmarkRepeatedLookup(b *testing.B, lookupKeySize int) { func BenchmarkRepeatedLookupStrMapKey32(b *testing.B) { benchmarkRepeatedLookup(b, 32) } func BenchmarkRepeatedLookupStrMapKey1M(b *testing.B) { benchmarkRepeatedLookup(b, 1<<20) } +func BenchmarkMakeMap(b *testing.B) { + b.Run("[Byte]Byte", func(b *testing.B) { + var m map[byte]byte + for i := 0; i < b.N; i++ { + m = make(map[byte]byte, 10) + } + hugeSink = m + }) + b.Run("[Int]Int", func(b *testing.B) { + var m map[int]int + for i := 0; i < b.N; i++ { + m = make(map[int]int, 10) + } + hugeSink = m + }) +} + func BenchmarkNewEmptyMap(b *testing.B) { b.ReportAllocs() for i := 0; i < b.N; i++ { @@ -370,3 +449,37 @@ func BenchmarkGoMapClear(b *testing.B) { } }) } + +func BenchmarkMapStringConversion(b *testing.B) { + for _, length := range []int{32, 64} { + b.Run(strconv.Itoa(length), func(b *testing.B) { + bytes := make([]byte, length) + b.Run("simple", func(b *testing.B) { + b.ReportAllocs() + m := make(map[string]int) + m[string(bytes)] = 0 + for i := 0; i < b.N; i++ { + _ = m[string(bytes)] + } + }) + b.Run("struct", func(b *testing.B) { + b.ReportAllocs() + type stringstruct struct{ s string } + m := make(map[stringstruct]int) + m[stringstruct{string(bytes)}] = 0 + for i := 0; i < b.N; i++ { + _ = m[stringstruct{string(bytes)}] + } + }) + b.Run("array", func(b *testing.B) { + b.ReportAllocs() + type stringarray [1]string + m := make(map[stringarray]int) + m[stringarray{string(bytes)}] = 0 + for i := 0; i < b.N; i++ { + _ = m[stringarray{string(bytes)}] + } + }) + }) + } +} diff --git a/libgo/go/runtime/map_fast32.go b/libgo/go/runtime/map_fast32.go index a9a06a85c26..1fa5cd968e4 100644 --- a/libgo/go/runtime/map_fast32.go +++ b/libgo/go/runtime/map_fast32.go @@ -41,7 +41,7 @@ func mapaccess1_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { } for ; b != nil; b = b.overflow(t) { for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 4) { - if *(*uint32)(k) == key && b.tophash[i] != empty { + if *(*uint32)(k) == key && !isEmpty(b.tophash[i]) { return add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)) } } @@ -81,7 +81,7 @@ func mapaccess2_fast32(t *maptype, h *hmap, key uint32) (unsafe.Pointer, bool) { } for ; b != nil; b = b.overflow(t) { for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 4) { - if *(*uint32)(k) == key && b.tophash[i] != empty { + if *(*uint32)(k) == key && !isEmpty(b.tophash[i]) { return add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize)), true } } @@ -103,7 +103,7 @@ func mapassign_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer { hash := t.key.hashfn(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) // Set hashWriting after calling alg.hash for consistency with mapassign. 
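BenchmarkMapStringConversion above tracks whether a map lookup keyed by string(bytes), either directly or wrapped in a one-field struct or one-element array, can avoid materializing a temporary string. Here is a short usage example of the patterns being benchmarked; whether the allocation is actually elided depends on the compiler, so read the comments as intent rather than guarantees.

package main

import "fmt"

func main() {
	counts := map[string]int{"hello": 3}
	key := []byte("hello")

	// The conversion inside the index expression can be optimized so no
	// temporary string needs to be heap-allocated for the lookup.
	if n, ok := counts[string(key)]; ok {
		fmt.Println("count:", n)
	}

	// The same pattern with the converted string as the only field of a
	// struct key, which is what the "struct" and "array" sub-benchmarks
	// exercise.
	type wrapped struct{ s string }
	m2 := map[wrapped]int{{"hello"}: 1}
	fmt.Println(m2[wrapped{string(key)}])
}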
- h.flags |= hashWriting + h.flags ^= hashWriting if h.buckets == nil { h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) @@ -120,13 +120,17 @@ again: var inserti uintptr var insertk unsafe.Pointer +bucketloop: for { for i := uintptr(0); i < bucketCnt; i++ { - if b.tophash[i] == empty { + if isEmpty(b.tophash[i]) { if insertb == nil { inserti = i insertb = b } + if b.tophash[i] == emptyRest { + break bucketloop + } continue } k := *((*uint32)(add(unsafe.Pointer(b), dataOffset+i*4))) @@ -189,7 +193,7 @@ func mapassign_fast32ptr(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer hash := t.key.hashfn(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) // Set hashWriting after calling alg.hash for consistency with mapassign. - h.flags |= hashWriting + h.flags ^= hashWriting if h.buckets == nil { h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) @@ -206,13 +210,17 @@ again: var inserti uintptr var insertk unsafe.Pointer +bucketloop: for { for i := uintptr(0); i < bucketCnt; i++ { - if b.tophash[i] == empty { + if isEmpty(b.tophash[i]) { if insertb == nil { inserti = i insertb = b } + if b.tophash[i] == emptyRest { + break bucketloop + } continue } k := *((*unsafe.Pointer)(add(unsafe.Pointer(b), dataOffset+i*4))) @@ -276,17 +284,18 @@ func mapdelete_fast32(t *maptype, h *hmap, key uint32) { hash := t.key.hashfn(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) // Set hashWriting after calling alg.hash for consistency with mapdelete - h.flags |= hashWriting + h.flags ^= hashWriting bucket := hash & bucketMask(h.B) if h.growing() { growWork_fast32(t, h, bucket) } b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) + bOrig := b search: for ; b != nil; b = b.overflow(t) { for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 4) { - if key != *(*uint32)(k) || b.tophash[i] == empty { + if key != *(*uint32)(k) || isEmpty(b.tophash[i]) { continue } // Only clear key if there are pointers in it. @@ -299,7 +308,37 @@ search: } else { memclrNoHeapPointers(v, t.elem.size) } - b.tophash[i] = empty + b.tophash[i] = emptyOne + // If the bucket now ends in a bunch of emptyOne states, + // change those to emptyRest states. + if i == bucketCnt-1 { + if b.overflow(t) != nil && b.overflow(t).tophash[0] != emptyRest { + goto notLast + } + } else { + if b.tophash[i+1] != emptyRest { + goto notLast + } + } + for { + b.tophash[i] = emptyRest + if i == 0 { + if b == bOrig { + break // beginning of initial bucket, we're done. + } + // Find previous bucket, continue at its last entry. 
+ c := b + for b = bOrig; b.overflow(t) != c; b = b.overflow(t) { + } + i = bucketCnt - 1 + } else { + i-- + } + if b.tophash[i] != emptyOne { + break + } + } + notLast: h.count-- break search } @@ -350,7 +389,7 @@ func evacuate_fast32(t *maptype, h *hmap, oldbucket uintptr) { v := add(k, bucketCnt*4) for i := 0; i < bucketCnt; i, k, v = i+1, add(k, 4), add(v, uintptr(t.valuesize)) { top := b.tophash[i] - if top == empty { + if isEmpty(top) { b.tophash[i] = evacuatedEmpty continue } diff --git a/libgo/go/runtime/map_fast64.go b/libgo/go/runtime/map_fast64.go index a2a51fcac6a..d23ac23eb14 100644 --- a/libgo/go/runtime/map_fast64.go +++ b/libgo/go/runtime/map_fast64.go @@ -41,7 +41,7 @@ func mapaccess1_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer { } for ; b != nil; b = b.overflow(t) { for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 8) { - if *(*uint64)(k) == key && b.tophash[i] != empty { + if *(*uint64)(k) == key && !isEmpty(b.tophash[i]) { return add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize)) } } @@ -81,7 +81,7 @@ func mapaccess2_fast64(t *maptype, h *hmap, key uint64) (unsafe.Pointer, bool) { } for ; b != nil; b = b.overflow(t) { for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 8) { - if *(*uint64)(k) == key && b.tophash[i] != empty { + if *(*uint64)(k) == key && !isEmpty(b.tophash[i]) { return add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize)), true } } @@ -103,7 +103,7 @@ func mapassign_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer { hash := t.key.hashfn(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) // Set hashWriting after calling alg.hash for consistency with mapassign. - h.flags |= hashWriting + h.flags ^= hashWriting if h.buckets == nil { h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) @@ -120,13 +120,17 @@ again: var inserti uintptr var insertk unsafe.Pointer +bucketloop: for { for i := uintptr(0); i < bucketCnt; i++ { - if b.tophash[i] == empty { + if isEmpty(b.tophash[i]) { if insertb == nil { insertb = b inserti = i } + if b.tophash[i] == emptyRest { + break bucketloop + } continue } k := *((*uint64)(add(unsafe.Pointer(b), dataOffset+i*8))) @@ -189,7 +193,7 @@ func mapassign_fast64ptr(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer hash := t.key.hashfn(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) // Set hashWriting after calling alg.hash for consistency with mapassign. 
- h.flags |= hashWriting + h.flags ^= hashWriting if h.buckets == nil { h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) @@ -206,13 +210,17 @@ again: var inserti uintptr var insertk unsafe.Pointer +bucketloop: for { for i := uintptr(0); i < bucketCnt; i++ { - if b.tophash[i] == empty { + if isEmpty(b.tophash[i]) { if insertb == nil { insertb = b inserti = i } + if b.tophash[i] == emptyRest { + break bucketloop + } continue } k := *((*unsafe.Pointer)(add(unsafe.Pointer(b), dataOffset+i*8))) @@ -276,17 +284,18 @@ func mapdelete_fast64(t *maptype, h *hmap, key uint64) { hash := t.key.hashfn(noescape(unsafe.Pointer(&key)), uintptr(h.hash0)) // Set hashWriting after calling alg.hash for consistency with mapdelete - h.flags |= hashWriting + h.flags ^= hashWriting bucket := hash & bucketMask(h.B) if h.growing() { growWork_fast64(t, h, bucket) } b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) + bOrig := b search: for ; b != nil; b = b.overflow(t) { for i, k := uintptr(0), b.keys(); i < bucketCnt; i, k = i+1, add(k, 8) { - if key != *(*uint64)(k) || b.tophash[i] == empty { + if key != *(*uint64)(k) || isEmpty(b.tophash[i]) { continue } // Only clear key if there are pointers in it. @@ -299,7 +308,37 @@ search: } else { memclrNoHeapPointers(v, t.elem.size) } - b.tophash[i] = empty + b.tophash[i] = emptyOne + // If the bucket now ends in a bunch of emptyOne states, + // change those to emptyRest states. + if i == bucketCnt-1 { + if b.overflow(t) != nil && b.overflow(t).tophash[0] != emptyRest { + goto notLast + } + } else { + if b.tophash[i+1] != emptyRest { + goto notLast + } + } + for { + b.tophash[i] = emptyRest + if i == 0 { + if b == bOrig { + break // beginning of initial bucket, we're done. + } + // Find previous bucket, continue at its last entry. 
+ c := b + for b = bOrig; b.overflow(t) != c; b = b.overflow(t) { + } + i = bucketCnt - 1 + } else { + i-- + } + if b.tophash[i] != emptyOne { + break + } + } + notLast: h.count-- break search } @@ -350,7 +389,7 @@ func evacuate_fast64(t *maptype, h *hmap, oldbucket uintptr) { v := add(k, bucketCnt*8) for i := 0; i < bucketCnt; i, k, v = i+1, add(k, 8), add(v, uintptr(t.valuesize)) { top := b.tophash[i] - if top == empty { + if isEmpty(top) { b.tophash[i] = evacuatedEmpty continue } diff --git a/libgo/go/runtime/map_faststr.go b/libgo/go/runtime/map_faststr.go index 5812b3f1049..eced15a6d5d 100644 --- a/libgo/go/runtime/map_faststr.go +++ b/libgo/go/runtime/map_faststr.go @@ -28,7 +28,10 @@ func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer { // short key, doing lots of comparisons is ok for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { k := (*stringStruct)(kptr) - if k.len != key.len || b.tophash[i] == empty { + if k.len != key.len || isEmpty(b.tophash[i]) { + if b.tophash[i] == emptyRest { + break + } continue } if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) { @@ -41,7 +44,10 @@ func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer { keymaybe := uintptr(bucketCnt) for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { k := (*stringStruct)(kptr) - if k.len != key.len || b.tophash[i] == empty { + if k.len != key.len || isEmpty(b.tophash[i]) { + if b.tophash[i] == emptyRest { + break + } continue } if k.str == key.str { @@ -117,7 +123,10 @@ func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) { // short key, doing lots of comparisons is ok for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { k := (*stringStruct)(kptr) - if k.len != key.len || b.tophash[i] == empty { + if k.len != key.len || isEmpty(b.tophash[i]) { + if b.tophash[i] == emptyRest { + break + } continue } if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) { @@ -130,7 +139,10 @@ func mapaccess2_faststr(t *maptype, h *hmap, ky string) (unsafe.Pointer, bool) { keymaybe := uintptr(bucketCnt) for i, kptr := uintptr(0), b.keys(); i < bucketCnt; i, kptr = i+1, add(kptr, 2*sys.PtrSize) { k := (*stringStruct)(kptr) - if k.len != key.len || b.tophash[i] == empty { + if k.len != key.len || isEmpty(b.tophash[i]) { + if b.tophash[i] == emptyRest { + break + } continue } if k.str == key.str { @@ -202,7 +214,7 @@ func mapassign_faststr(t *maptype, h *hmap, s string) unsafe.Pointer { hash := t.key.hashfn(noescape(unsafe.Pointer(&s)), uintptr(h.hash0)) // Set hashWriting after calling alg.hash for consistency with mapassign. 
- h.flags |= hashWriting + h.flags ^= hashWriting if h.buckets == nil { h.buckets = newobject(t.bucket) // newarray(t.bucket, 1) @@ -220,13 +232,17 @@ again: var inserti uintptr var insertk unsafe.Pointer +bucketloop: for { for i := uintptr(0); i < bucketCnt; i++ { if b.tophash[i] != top { - if b.tophash[i] == empty && insertb == nil { + if isEmpty(b.tophash[i]) && insertb == nil { insertb = b inserti = i } + if b.tophash[i] == emptyRest { + break bucketloop + } continue } k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize)) @@ -294,13 +310,14 @@ func mapdelete_faststr(t *maptype, h *hmap, ky string) { hash := t.key.hashfn(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0)) // Set hashWriting after calling alg.hash for consistency with mapdelete - h.flags |= hashWriting + h.flags ^= hashWriting bucket := hash & bucketMask(h.B) if h.growing() { growWork_faststr(t, h, bucket) } b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize))) + bOrig := b top := tophash(hash) search: for ; b != nil; b = b.overflow(t) { @@ -320,7 +337,37 @@ search: } else { memclrNoHeapPointers(v, t.elem.size) } - b.tophash[i] = empty + b.tophash[i] = emptyOne + // If the bucket now ends in a bunch of emptyOne states, + // change those to emptyRest states. + if i == bucketCnt-1 { + if b.overflow(t) != nil && b.overflow(t).tophash[0] != emptyRest { + goto notLast + } + } else { + if b.tophash[i+1] != emptyRest { + goto notLast + } + } + for { + b.tophash[i] = emptyRest + if i == 0 { + if b == bOrig { + break // beginning of initial bucket, we're done. + } + // Find previous bucket, continue at its last entry. + c := b + for b = bOrig; b.overflow(t) != c; b = b.overflow(t) { + } + i = bucketCnt - 1 + } else { + i-- + } + if b.tophash[i] != emptyOne { + break + } + } + notLast: h.count-- break search } @@ -371,7 +418,7 @@ func evacuate_faststr(t *maptype, h *hmap, oldbucket uintptr) { v := add(k, bucketCnt*2*sys.PtrSize) for i := 0; i < bucketCnt; i, k, v = i+1, add(k, 2*sys.PtrSize), add(v, uintptr(t.valuesize)) { top := b.tophash[i] - if top == empty { + if isEmpty(top) { b.tophash[i] = evacuatedEmpty continue } diff --git a/libgo/go/runtime/map_test.go b/libgo/go/runtime/map_test.go index 13f1d2ea62a..bc5f738c4fa 100644 --- a/libgo/go/runtime/map_test.go +++ b/libgo/go/runtime/map_test.go @@ -1147,3 +1147,28 @@ func TestIncrementAfterBulkClearKeyStringValueInt(t *testing.T) { t.Errorf("incremented 0 to %d", n2) } } + +func TestMapTombstones(t *testing.T) { + m := map[int]int{} + const N = 10000 + // Fill a map. + for i := 0; i < N; i++ { + m[i] = i + } + runtime.MapTombstoneCheck(m) + // Delete half of the entries. + for i := 0; i < N; i += 2 { + delete(m, i) + } + runtime.MapTombstoneCheck(m) + // Add new entries to fill in holes. + for i := N; i < 3*N/2; i++ { + m[i] = i + } + runtime.MapTombstoneCheck(m) + // Delete everything. + for i := 0; i < 3*N/2; i++ { + delete(m, i) + } + runtime.MapTombstoneCheck(m) +} diff --git a/libgo/go/runtime/mbarrier.go b/libgo/go/runtime/mbarrier.go index 48713157601..d3ffd3c518d 100644 --- a/libgo/go/runtime/mbarrier.go +++ b/libgo/go/runtime/mbarrier.go @@ -215,8 +215,6 @@ func reflect_typedmemmovepartial(typ *_type, dst, src unsafe.Pointer, off, size //go:nosplit func typedslicecopy(typ *_type, dst, src slice) int { - // TODO(rsc): If typedslicecopy becomes faster than calling - // typedmemmove repeatedly, consider using during func growslice. 
n := dst.len if n > src.len { n = src.len diff --git a/libgo/go/runtime/mbitmap.go b/libgo/go/runtime/mbitmap.go index 42c2015ee44..9c25a215f35 100644 --- a/libgo/go/runtime/mbitmap.go +++ b/libgo/go/runtime/mbitmap.go @@ -242,7 +242,7 @@ func (s *mspan) nextFreeIndex() uintptr { return result } -// isFree returns whether the index'th object in s is unallocated. +// isFree reports whether the index'th object in s is unallocated. func (s *mspan) isFree(index uintptr) bool { if index < s.freeindex { return false @@ -283,9 +283,7 @@ func (m markBits) isMarked() bool { return *m.bytep&m.mask != 0 } -// setMarked sets the marked bit in the markbits, atomically. Some compilers -// are not able to inline atomic.Or8 function so if it appears as a hot spot consider -// inlining it manually. +// setMarked sets the marked bit in the markbits, atomically. func (m markBits) setMarked() { // Might be racing with other updates, so use atomic update always. // We used to be clever here and use a non-atomic update in certain @@ -368,7 +366,7 @@ func findObject(p, refBase, refOff uintptr, forStack bool) (base uintptr, s *msp s = spanOf(p) // If p is a bad pointer, it may not be in s's bounds. if s == nil || p < s.base() || p >= s.limit || s.state != mSpanInUse { - if s == nil || s.state == _MSpanManual || forStack { + if s == nil || s.state == mSpanManual || forStack { // If s is nil, the virtual address has never been part of the heap. // This pointer may be to some mmap'd region, so we allow it. // Pointers into stacks are also ok, the runtime manages these explicitly. @@ -533,7 +531,7 @@ func (h heapBits) bits() uint32 { return uint32(*h.bitp) >> (h.shift & 31) } -// morePointers returns true if this word and all remaining words in this object +// morePointers reports whether this word and all remaining words in this object // are scalars. // h must not describe the second word of the object. func (h heapBits) morePointers() bool { @@ -631,7 +629,7 @@ func bulkBarrierPreWrite(dst, src, size uintptr) { } } return - } else if s.state != _MSpanInUse || dst < s.base() || s.limit <= dst { + } else if s.state != mSpanInUse || dst < s.base() || s.limit <= dst { // dst was heap memory at some point, but isn't now. // It can't be a global. It must be either our stack, // or in the case of direct channel sends, it could be @@ -667,6 +665,35 @@ func bulkBarrierPreWrite(dst, src, size uintptr) { } } +// bulkBarrierPreWriteSrcOnly is like bulkBarrierPreWrite but +// does not execute write barriers for [dst, dst+size). +// +// In addition to the requirements of bulkBarrierPreWrite +// callers need to ensure [dst, dst+size) is zeroed. +// +// This is used for special cases where e.g. dst was just +// created and zeroed with malloc. +//go:nosplit +func bulkBarrierPreWriteSrcOnly(dst, src, size uintptr) { + if (dst|src|size)&(sys.PtrSize-1) != 0 { + throw("bulkBarrierPreWrite: unaligned arguments") + } + if !writeBarrier.needed { + return + } + buf := &getg().m.p.ptr().wbBuf + h := heapBitsForAddr(dst) + for i := uintptr(0); i < size; i += sys.PtrSize { + if h.isPointer() { + srcx := (*uintptr)(unsafe.Pointer(src + i)) + if !buf.putFast(0, *srcx) { + wbBufFlush(nil, 0) + } + } + h = h.next() + } +} + // bulkBarrierBitmap executes write barriers for copying from [src, // src+size) to [dst, dst+size) using a 1-bit pointer bitmap. 
src is // assumed to start maskOffset bytes into the data covered by the @@ -1902,6 +1929,20 @@ Run: return totalBits } +// materializeGCProg allocates space for the (1-bit) pointer bitmask +// for an object of size ptrdata. Then it fills that space with the +// pointer bitmask specified by the program prog. +// The bitmask starts at s.startAddr. +// The result must be deallocated with dematerializeGCProg. +func materializeGCProg(ptrdata uintptr, prog *byte) *mspan { + s := mheap_.allocManual((ptrdata/(8*sys.PtrSize)+pageSize-1)/pageSize, &memstats.gc_sys) + runGCProg(addb(prog, 4), nil, (*byte)(unsafe.Pointer(s.startAddr)), 1) + return s +} +func dematerializeGCProg(s *mspan) { + mheap_.freeManual(s, &memstats.gc_sys) +} + func dumpGCProg(p *byte) { nptr := 0 for { @@ -1962,7 +2003,9 @@ func reflect_gcbits(x interface{}) []byte { return ret } -// Returns GC type info for object p for testing. +// Returns GC type info for the pointer stored in ep for testing. +// If ep points to the stack, only static live information will be returned +// (i.e. not for objects which are only dynamically live stack objects). func getgcmask(ep interface{}) (mask []byte) { e := *efaceOf(&ep) p := e.data diff --git a/libgo/go/runtime/mcache.go b/libgo/go/runtime/mcache.go index 3dacf9692ec..2045158636a 100644 --- a/libgo/go/runtime/mcache.go +++ b/libgo/go/runtime/mcache.go @@ -4,7 +4,10 @@ package runtime -import "unsafe" +import ( + "runtime/internal/atomic" + "unsafe" +) // Per-thread (in Go, per-P) cache for small objects. // No locking needed because it is per-thread (per-P). @@ -40,6 +43,12 @@ type mcache struct { local_largefree uintptr // bytes freed for large objects (>maxsmallsize) local_nlargefree uintptr // number of frees for large objects (>maxsmallsize) local_nsmallfree [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize) + + // flushGen indicates the sweepgen during which this mcache + // was last flushed. If flushGen != mheap_.sweepgen, the spans + // in this mcache are stale and need to the flushed so they + // can be swept. This is done in acquirep. + flushGen uint32 } // A gclink is a node in a linked list of blocks, like mlink, @@ -63,12 +72,13 @@ func (p gclinkptr) ptr() *gclink { return (*gclink)(unsafe.Pointer(p)) } -// dummy MSpan that contains no free objects. +// dummy mspan that contains no free objects. var emptymspan mspan func allocmcache() *mcache { lock(&mheap_.lock) c := (*mcache)(mheap_.cachealloc.alloc()) + c.flushGen = mheap_.sweepgen unlock(&mheap_.lock) for i := range c.alloc { c.alloc[i] = &emptymspan @@ -93,21 +103,24 @@ func freemcache(c *mcache) { }) } -// Gets a span that has a free object in it and assigns it -// to be the cached span for the given sizeclass. Returns this span. +// refill acquires a new span of span class spc for c. This span will +// have at least one free object. The current span in c must be full. +// +// Must run in a non-preemptible context since otherwise the owner of +// c could change. func (c *mcache) refill(spc spanClass) { - _g_ := getg() - - _g_.m.locks++ // Return the current cached span to the central lists. s := c.alloc[spc] if uintptr(s.allocCount) != s.nelems { throw("refill of span with free space remaining") } - if s != &emptymspan { - s.incache = false + // Mark this span as no longer cached. + if s.sweepgen != mheap_.sweepgen+3 { + throw("bad sweepgen in refill") + } + atomic.Store(&s.sweepgen, mheap_.sweepgen) } // Get a new cached span from the central lists. 
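materializeGCProg above sizes its scratch span as (ptrdata/(8*sys.PtrSize)+pageSize-1)/pageSize pages: one mask bit per pointer-sized word of data, rounded up to whole runtime pages. A tiny arithmetic check of that sizing, assuming a 64-bit word and the runtime's 8 KiB page size; bitmapPages is an illustrative name.

package main

import "fmt"

const (
	ptrSize  = 8       // bytes per pointer-sized word (64-bit assumed)
	pageSize = 8 << 10 // the runtime's 8 KiB page size
)

// bitmapPages returns how many whole pages are needed to hold a 1-bit
// mask covering ptrdata bytes of pointer data.
func bitmapPages(ptrdata uintptr) uintptr {
	bytes := ptrdata / (8 * ptrSize)         // one bit per word gives bytes of mask
	return (bytes + pageSize - 1) / pageSize // round up to whole pages
}

func main() {
	fmt.Println(bitmapPages(1 << 20)) // a 1 MiB object needs 16 KiB of mask, i.e. 2 pages
	fmt.Println(bitmapPages(4096))    // small objects still round up to 1 page
}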
@@ -120,8 +133,11 @@ func (c *mcache) refill(spc spanClass) { throw("span has no free space") } + // Indicate that this span is cached and prevent asynchronous + // sweeping in the next sweep phase. + s.sweepgen = mheap_.sweepgen + 3 + c.alloc[spc] = s - _g_.m.locks-- } func (c *mcache) releaseAll() { @@ -136,3 +152,25 @@ func (c *mcache) releaseAll() { c.tiny = 0 c.tinyoffset = 0 } + +// prepareForSweep flushes c if the system has entered a new sweep phase +// since c was populated. This must happen between the sweep phase +// starting and the first allocation from c. +func (c *mcache) prepareForSweep() { + // Alternatively, instead of making sure we do this on every P + // between starting the world and allocating on that P, we + // could leave allocate-black on, allow allocation to continue + // as usual, use a ragged barrier at the beginning of sweep to + // ensure all cached spans are swept, and then disable + // allocate-black. However, with this approach it's difficult + // to avoid spilling mark bits into the *next* GC cycle. + sg := mheap_.sweepgen + if c.flushGen == sg { + return + } else if c.flushGen != sg-2 { + println("bad flushGen", c.flushGen, "in prepareForSweep; sweepgen", sg) + throw("bad flushGen") + } + c.releaseAll() + atomic.Store(&c.flushGen, mheap_.sweepgen) // Synchronizes with gcStart +} diff --git a/libgo/go/runtime/mcentral.go b/libgo/go/runtime/mcentral.go index 50a4791e8e6..0196ba44c5d 100644 --- a/libgo/go/runtime/mcentral.go +++ b/libgo/go/runtime/mcentral.go @@ -6,8 +6,8 @@ // // See malloc.go for an overview. // -// The MCentral doesn't actually contain the list of free objects; the MSpan does. -// Each MCentral is two lists of MSpans: those with free objects (c->nonempty) +// The mcentral doesn't actually contain the list of free objects; the mspan does. +// Each mcentral is two lists of mspans: those with free objects (c->nonempty) // and those that are completely allocated (c->empty). package runtime @@ -36,7 +36,7 @@ func (c *mcentral) init(spc spanClass) { c.empty.init() } -// Allocate a span to use in an MCache. +// Allocate a span to use in an mcache. func (c *mcentral) cacheSpan() *mspan { // Deduct credit for this span allocation and sweep if necessary. spanBytes := uintptr(class_to_allocnpages[c.spanclass.sizeclass()]) * _PageSize @@ -126,8 +126,7 @@ havespan: if trace.enabled && !traceDone { traceGCSweepDone() } - cap := int32((s.npages << _PageShift) / s.elemsize) - n := cap - int32(s.allocCount) + n := int(s.nelems) - int(s.allocCount) if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems { throw("span has no free objects") } @@ -144,7 +143,6 @@ havespan: // heap_live changed. gcController.revise() } - s.incache = true freeByteBase := s.freeindex &^ (64 - 1) whichByte := freeByteBase / 8 // Init alloc bits cache. @@ -157,30 +155,56 @@ havespan: return s } -// Return span from an MCache. +// Return span from an mcache. func (c *mcentral) uncacheSpan(s *mspan) { - lock(&c.lock) - - s.incache = false - if s.allocCount == 0 { throw("uncaching span but s.allocCount == 0") } - cap := int32((s.npages << _PageShift) / s.elemsize) - n := cap - int32(s.allocCount) + sg := mheap_.sweepgen + stale := s.sweepgen == sg+1 + if stale { + // Span was cached before sweep began. It's our + // responsibility to sweep it. + // + // Set sweepgen to indicate it's not cached but needs + // sweeping and can't be allocated from. sweep will + // set s.sweepgen to indicate s is swept. 
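refill and prepareForSweep above hang cache validity off a generation number: the mcache records the sweepgen at which it was last flushed, and the first allocation after a new sweep phase flushes it lazily. Below is a minimal non-runtime sketch of that generation-keyed lazy flush; cache, prepare, and globalGen are illustrative names.

package main

import (
	"fmt"
	"sync/atomic"
)

// globalGen is bumped by 2 each time a new "sweep phase" starts,
// echoing how mheap_.sweepgen advances.
var globalGen uint32 = 2

type cache struct {
	items    []int
	flushGen uint32 // generation at which this cache was last flushed
}

// prepare flushes c if a new phase has begun since the last flush.
// Callers invoke it before the first use of c in each phase.
func (c *cache) prepare() {
	g := atomic.LoadUint32(&globalGen)
	switch fg := atomic.LoadUint32(&c.flushGen); {
	case fg == g:
		return // already flushed this phase
	case fg == g-2:
		c.items = c.items[:0] // stale: drop the cached state
		atomic.StoreUint32(&c.flushGen, g)
	default:
		panic(fmt.Sprintf("bad flushGen %d with global %d", fg, g))
	}
}

func main() {
	c := &cache{flushGen: globalGen}
	c.items = append(c.items, 1, 2, 3)

	atomic.AddUint32(&globalGen, 2) // a new phase begins
	c.prepare()                     // first use in the new phase flushes
	fmt.Println(len(c.items))       // 0
}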
+ atomic.Store(&s.sweepgen, sg-1) + } else { + // Indicate that s is no longer cached. + atomic.Store(&s.sweepgen, sg) + } + + n := int(s.nelems) - int(s.allocCount) if n > 0 { - c.empty.remove(s) - c.nonempty.insert(s) - // mCentral_CacheSpan conservatively counted - // unallocated slots in heap_live. Undo this. - atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize)) // cacheSpan updated alloc assuming all objects on s // were going to be allocated. Adjust for any that - // weren't. + // weren't. We must do this before potentially + // sweeping the span. atomic.Xadd64(&c.nmalloc, -int64(n)) + + lock(&c.lock) + c.empty.remove(s) + c.nonempty.insert(s) + if !stale { + // mCentral_CacheSpan conservatively counted + // unallocated slots in heap_live. Undo this. + // + // If this span was cached before sweep, then + // heap_live was totally recomputed since + // caching this span, so we don't do this for + // stale spans. + atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize)) + } + unlock(&c.lock) + } + + if stale { + // Now that s is in the right mcentral list, we can + // sweep it. + s.sweep(false) } - unlock(&c.lock) } // freeSpan updates c and s after sweeping s. @@ -188,17 +212,17 @@ func (c *mcentral) uncacheSpan(s *mspan) { // and, based on the number of free objects in s, // moves s to the appropriate list of c or returns it // to the heap. -// freeSpan returns true if s was returned to the heap. +// freeSpan reports whether s was returned to the heap. // If preserve=true, it does not move s (the caller // must take care of it). func (c *mcentral) freeSpan(s *mspan, preserve bool, wasempty bool) bool { - if s.incache { + if sg := mheap_.sweepgen; s.sweepgen == sg+1 || s.sweepgen == sg+3 { throw("freeSpan given cached span") } s.needzero = 1 if preserve { - // preserve is set only when called from MCentral_CacheSpan above, + // preserve is set only when called from (un)cacheSpan above, // the span must be in the empty list. if !s.inList() { throw("can't preserve unlinked span") @@ -216,7 +240,7 @@ func (c *mcentral) freeSpan(s *mspan, preserve bool, wasempty bool) bool { } // delay updating sweepgen until here. This is the signal that - // the span may be used in an MCache, so it must come after the + // the span may be used in an mcache, so it must come after the // linked list operations above (actually, just after the // lock of c above.) atomic.Store(&s.sweepgen, mheap_.sweepgen) @@ -228,7 +252,7 @@ func (c *mcentral) freeSpan(s *mspan, preserve bool, wasempty bool) bool { c.nonempty.remove(s) unlock(&c.lock) - mheap_.freeSpan(s, 0) + mheap_.freeSpan(s, false) return true } diff --git a/libgo/go/runtime/mfinal.go b/libgo/go/runtime/mfinal.go index 1a7792c2205..caf2e7e1cce 100644 --- a/libgo/go/runtime/mfinal.go +++ b/libgo/go/runtime/mfinal.go @@ -135,6 +135,7 @@ func runfinq() { ) gp := getg() + gp.isFinalizerGoroutine = true for { lock(&finlock) fb := finq diff --git a/libgo/go/runtime/mfixalloc.go b/libgo/go/runtime/mfixalloc.go index 1febe782bb6..f9dd6ca474d 100644 --- a/libgo/go/runtime/mfixalloc.go +++ b/libgo/go/runtime/mfixalloc.go @@ -12,7 +12,7 @@ import "unsafe" // FixAlloc is a simple free-list allocator for fixed size objects. // Malloc uses a FixAlloc wrapped around sysAlloc to manage its -// MCache and MSpan objects. +// mcache and mspan objects. 
// // Memory returned by fixalloc.alloc is zeroed by default, but the // caller may take responsibility for zeroing allocations by setting diff --git a/libgo/go/runtime/mgc.go b/libgo/go/runtime/mgc.go index de84084b391..0973f1de6f7 100644 --- a/libgo/go/runtime/mgc.go +++ b/libgo/go/runtime/mgc.go @@ -28,8 +28,7 @@ // b. Sweep any unswept spans. There will only be unswept spans if // this GC cycle was forced before the expected time. // -// 2. GC performs the "mark 1" sub-phase. In this sub-phase, Ps are -// allowed to locally cache parts of the work queue. +// 2. GC performs the mark phase. // // a. Prepare for the mark phase by setting gcphase to _GCmark // (from _GCoff), enabling the write barrier, enabling mutator @@ -54,28 +53,21 @@ // object to black and shading all pointers found in the object // (which in turn may add those pointers to the work queue). // -// 3. Once the global work queue is empty (but local work queue caches -// may still contain work), GC performs the "mark 2" sub-phase. +// e. Because GC work is spread across local caches, GC uses a +// distributed termination algorithm to detect when there are no +// more root marking jobs or grey objects (see gcMarkDone). At this +// point, GC transitions to mark termination. // -// a. GC stops all workers, disables local work queue caches, -// flushes each P's local work queue cache to the global work queue -// cache, and reenables workers. -// -// b. GC again drains the work queue, as in 2d above. -// -// 4. Once the work queue is empty, GC performs mark termination. +// 3. GC performs mark termination. // // a. Stop the world. // // b. Set gcphase to _GCmarktermination, and disable workers and // assists. // -// c. Drain any remaining work from the work queue (typically there -// will be none). -// -// d. Perform other housekeeping like flushing mcaches. +// c. Perform housekeeping like flushing mcaches. // -// 5. GC performs the sweep phase. +// 4. GC performs the sweep phase. // // a. Prepare for the sweep phase by setting gcphase to _GCoff, // setting up sweep state and disabling the write barrier. @@ -86,7 +78,7 @@ // c. GC does concurrent sweeping in the background and in response // to allocation. See description below. // -// 6. When sufficient allocation has taken place, replay the sequence +// 5. When sufficient allocation has taken place, replay the sequence // starting with 1 above. See discussion of GC rate below. // Concurrent sweep. @@ -137,8 +129,8 @@ package runtime import ( + "internal/cpu" "runtime/internal/atomic" - "runtime/internal/sys" "unsafe" ) @@ -262,21 +254,6 @@ var writeBarrier struct { // gcphase == _GCmark. var gcBlackenEnabled uint32 -// gcBlackenPromptly indicates that optimizations that may -// hide work from the global work queue should be disabled. -// -// If gcBlackenPromptly is true, per-P gcWork caches should -// be flushed immediately and new objects should be allocated black. -// -// There is a tension between allocating objects white and -// allocating them black. If white and the objects die before being -// marked they can be collected during this GC cycle. On the other -// hand allocating them black will reduce _GCmarktermination latency -// since more work is done in the mark phase. This tension is resolved -// by allocating white until the mark phase is approaching its end and -// then allocating black for the remainder of the mark phase. 
-var gcBlackenPromptly bool - const ( _GCoff = iota // GC not running; sweeping in background, write barrier disabled _GCmark // GC marking roots and workbufs: allocate black, write barrier ENABLED @@ -408,14 +385,14 @@ type gcControllerState struct { // each P that isn't running a dedicated worker. // // For example, if the utilization goal is 25% and there are - // no dedicated workers, this will be 0.25. If there goal is + // no dedicated workers, this will be 0.25. If the goal is // 25%, there is one dedicated worker, and GOMAXPROCS is 5, // this will be 0.05 to make up the missing 5%. // // If this is zero, no fractional workers are needed. fractionalUtilizationGoal float64 - _ [sys.CacheLineSize]byte + _ cpu.CacheLinePad } // startCycle resets the GC controller's state and computes estimates @@ -479,6 +456,12 @@ func (c *gcControllerState) startCycle() { c.fractionalUtilizationGoal = 0 } + // In STW mode, we just want dedicated workers. + if debug.gcstoptheworld > 0 { + c.dedicatedMarkWorkersNeeded = int64(gomaxprocs) + c.fractionalUtilizationGoal = 0 + } + // Clear per-P state for _, p := range allp { p.gcAssistTime = 0 @@ -753,7 +736,7 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g { return gp } -// pollFractionalWorkerExit returns true if a fractional mark worker +// pollFractionalWorkerExit reports whether a fractional mark worker // should self-preempt. It assumes it is called from the fractional // worker. func pollFractionalWorkerExit() bool { @@ -807,7 +790,7 @@ func gcSetTriggerRatio(triggerRatio float64) { trigger = uint64(float64(memstats.heap_marked) * (1 + triggerRatio)) // Don't trigger below the minimum heap size. minTrigger := heapminimum - if !gosweepdone() { + if !isSweepDone() { // Concurrent sweep happens in the heap growth // from heap_live to gc_trigger, so ensure // that concurrent sweep has some heap growth @@ -852,7 +835,7 @@ func gcSetTriggerRatio(triggerRatio float64) { } // Update sweep pacing. - if gosweepdone() { + if isSweepDone() { mheap_.sweepPagesPerByte = 0 } else { // Concurrent sweep needs to sweep all of the in-use @@ -902,7 +885,7 @@ const gcGoalUtilization = 0.30 // mutator latency. const gcBackgroundUtilization = 0.25 -// gcCreditSlack is the amount of scan work credit that can can +// gcCreditSlack is the amount of scan work credit that can // accumulate locally before updating gcController.scanWork and, // optionally, gcController.bgScanCredit. Lower values give a more // accurate assist ratio and make it more likely that assists will @@ -920,9 +903,9 @@ const gcAssistTimeSlack = 5000 const gcOverAssistWork = 64 << 10 var work struct { - full lfstack // lock-free list of full blocks workbuf - empty lfstack // lock-free list of empty blocks workbuf - pad0 [sys.CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait + full lfstack // lock-free list of full blocks workbuf + empty lfstack // lock-free list of empty blocks workbuf + pad0 cpu.CacheLinePad // prevents false-sharing between full/empty and nproc/nwait wbufSpans struct { lock mutex @@ -956,32 +939,15 @@ var work struct { markrootNext uint32 // next markroot job markrootJobs uint32 // number of markroot jobs - nproc uint32 - tstart int64 - nwait uint32 - ndone uint32 - alldone note - - // helperDrainBlock indicates that GC mark termination helpers - // should pass gcDrainBlock to gcDrain to block in the - // getfull() barrier. Otherwise, they should pass gcDrainNoBlock. 
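The controller comment above describes fractionalUtilizationGoal as the slice of each P's time needed to top up whatever the dedicated workers cannot cover of the 25% background goal (gcBackgroundUtilization, later in this file). The sketch below shows one way such a split can be computed; the 30% rounding tolerance is an assumption about the controller's behavior rather than something visible in this hunk, and splitWorkers is an invented name.

package main

import "fmt"

// splitWorkers divides a background utilization goal across procs into
// whole dedicated workers plus a per-P fractional goal. Rounding to the
// nearest whole worker is accepted unless it misses the goal by more
// than 30%; otherwise the shortfall becomes fractional work.
func splitWorkers(procs int, goal float64) (dedicated int, fractional float64) {
	total := float64(procs) * goal
	dedicated = int(total + 0.5) // round to the nearest whole worker
	const maxErr = 0.3
	if err := float64(dedicated)/total - 1; err < -maxErr || err > maxErr {
		// Too far off: never overshoot with dedicated workers, and make
		// up the remainder with fractional time on every P.
		if float64(dedicated) > total {
			dedicated--
		}
		fractional = (total - float64(dedicated)) / float64(procs)
	}
	return
}

func main() {
	fmt.Println(splitWorkers(1, 0.25)) // 0 0.25: no dedicated workers, 25% fractional
	fmt.Println(splitWorkers(5, 0.25)) // 1 0: one dedicated worker is close enough
	fmt.Println(splitWorkers(6, 0.25)) // 1 ~0.083: one worker plus about 8.3% on each P
}

Note that with GOMAXPROCS=5 the rounding error of running exactly one dedicated worker is small enough that no fractional work is scheduled at all.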
- // - // TODO: This is a temporary fallback to work around races - // that cause early mark termination. - helperDrainBlock bool + nproc uint32 + tstart int64 + nwait uint32 + ndone uint32 // Number of roots of various root types. Set by gcMarkRootPrepare. nFlushCacheRoots int nDataRoots, nSpanRoots, nStackRoots int - // markrootDone indicates that roots have been marked at least - // once during the current GC cycle. This is checked by root - // marking operations that have to happen only during the - // first root marking pass, whether that's during the - // concurrent mark phase in current GC or mark termination in - // STW GC. - markrootDone bool - // Each type of GC state transition is protected by a lock. // Since multiple threads can simultaneously detect the state // transition condition, any thread that detects a transition @@ -997,8 +963,7 @@ var work struct { // startSema protects the transition from "off" to mark or // mark termination. startSema uint32 - // markDoneSema protects transitions from mark 1 to mark 2 and - // from mark 2 to mark termination. + // markDoneSema protects transitions from mark to mark termination. markDoneSema uint32 bgMarkReady note // signal background mark worker has started @@ -1024,15 +989,15 @@ var work struct { // there was neither enough credit to steal or enough work to // do. assistQueue struct { - lock mutex - head, tail guintptr + lock mutex + q gQueue } // sweepWaiters is a list of blocked goroutines to wake when // we transition from mark termination to sweep. sweepWaiters struct { lock mutex - head guintptr + list gList } // cycles is the number of completed GC cycles, where a GC @@ -1088,7 +1053,7 @@ func GC() { // We're now in sweep N or later. Trigger GC cycle N+1, which // will first finish sweep N if necessary and then enter sweep // termination N+1. - gcStart(gcBackgroundMode, gcTrigger{kind: gcTriggerCycle, n: n + 1}) + gcStart(gcTrigger{kind: gcTriggerCycle, n: n + 1}) // Wait for mark termination N+1 to complete. gcWaitOnMark(n + 1) @@ -1097,7 +1062,7 @@ func GC() { // complete the cycle and because runtime.GC() is often used // as part of tests and benchmarks to get the system into a // relatively stable and isolated state. - for atomic.Load(&work.cycles) == n+1 && gosweepone() != ^uintptr(0) { + for atomic.Load(&work.cycles) == n+1 && sweepone() != ^uintptr(0) { sweep.nbgsweep++ Gosched() } @@ -1147,9 +1112,7 @@ func gcWaitOnMark(n uint32) { // Wait until sweep termination, mark, and mark // termination of cycle N complete. - gp := getg() - gp.schedlink = work.sweepWaiters.head - work.sweepWaiters.head.set(gp) + work.sweepWaiters.list.push(getg()) goparkunlock(&work.sweepWaiters.lock, waitReasonWaitForGCCycle, traceEvGoBlock, 1) } } @@ -1195,7 +1158,7 @@ const ( gcTriggerCycle ) -// test returns true if the trigger condition is satisfied, meaning +// test reports whether the trigger condition is satisfied, meaning // that the exit condition for the _GCoff phase has been met. The exit // condition should be tested when allocating. func (t gcTrigger) test() bool { @@ -1228,13 +1191,13 @@ func (t gcTrigger) test() bool { return true } -// gcStart transitions the GC from _GCoff to _GCmark (if -// !mode.stwMark) or _GCmarktermination (if mode.stwMark) by -// performing sweep termination and GC initialization. +// gcStart starts the GC. It transitions from _GCoff to _GCmark (if +// debug.gcstoptheworld == 0) or performs all of GC (if +// debug.gcstoptheworld != 0). 
// // This may return without performing this transition in some cases, // such as when called on a system stack or with locks held. -func gcStart(mode gcMode, trigger gcTrigger) { +func gcStart(trigger gcTrigger) { // Since this is called from malloc and malloc is called in // the guts of a number of libraries that might be holding // locks, don't attempt to start GC in non-preemptible or @@ -1257,7 +1220,7 @@ func gcStart(mode gcMode, trigger gcTrigger) { // // We check the transition condition continuously here in case // this G gets delayed in to the next GC cycle. - for trigger.test() && gosweepone() != ^uintptr(0) { + for trigger.test() && sweepone() != ^uintptr(0) { sweep.nbgsweep++ } @@ -1277,12 +1240,11 @@ func gcStart(mode gcMode, trigger gcTrigger) { // We do this after re-checking the transition condition so // that multiple goroutines that detect the heap trigger don't // start multiple STW GCs. - if mode == gcBackgroundMode { - if debug.gcstoptheworld == 1 { - mode = gcForceMode - } else if debug.gcstoptheworld == 2 { - mode = gcForceBlockMode - } + mode := gcBackgroundMode + if debug.gcstoptheworld == 1 { + mode = gcForceMode + } else if debug.gcstoptheworld == 2 { + mode = gcForceBlockMode } // Ok, we're doing it! Stop everybody else @@ -1292,10 +1254,16 @@ func gcStart(mode gcMode, trigger gcTrigger) { traceGCStart() } - if mode == gcBackgroundMode { - gcBgMarkStartWorkers() + // Check that all Ps have finished deferred mcache flushes. + for _, p := range allp { + if fg := atomic.Load(&p.mcache.flushGen); fg != mheap_.sweepgen { + println("runtime: p", p.id, "flushGen", fg, "!= sweepgen", mheap_.sweepgen) + throw("p mcache not flushed") + } } + gcBgMarkStartWorkers() + gcResetMarkState() work.stwprocs, work.maxprocs = gomaxprocs, gomaxprocs @@ -1324,199 +1292,299 @@ func gcStart(mode gcMode, trigger gcTrigger) { clearpools() work.cycles++ - if mode == gcBackgroundMode { // Do as much work concurrently as possible - gcController.startCycle() - work.heapGoal = memstats.next_gc - // Enter concurrent mark phase and enable - // write barriers. - // - // Because the world is stopped, all Ps will - // observe that write barriers are enabled by - // the time we start the world and begin - // scanning. - // - // Write barriers must be enabled before assists are - // enabled because they must be enabled before - // any non-leaf heap objects are marked. Since - // allocations are blocked until assists can - // happen, we want enable assists as early as - // possible. - setGCPhase(_GCmark) - - gcBgMarkPrepare() // Must happen before assist enable. - gcMarkRootPrepare() - - // Mark all active tinyalloc blocks. Since we're - // allocating from these, they need to be black like - // other allocations. The alternative is to blacken - // the tiny block on every allocation from it, which - // would slow down the tiny allocator. - gcMarkTinyAllocs() - - // At this point all Ps have enabled the write - // barrier, thus maintaining the no white to - // black invariant. Enable mutator assists to - // put back-pressure on fast allocating - // mutators. - atomic.Store(&gcBlackenEnabled, 1) - - // Assists and workers can start the moment we start - // the world. - gcController.markStartTime = now - - // Concurrent mark. - systemstack(func() { - now = startTheWorldWithSema(trace.enabled) - }) + gcController.startCycle() + work.heapGoal = memstats.next_gc + + // In STW mode, disable scheduling of user Gs. 
This may also + // disable scheduling of this goroutine, so it may block as + // soon as we start the world again. + if mode != gcBackgroundMode { + schedEnableUser(false) + } + + // Enter concurrent mark phase and enable + // write barriers. + // + // Because the world is stopped, all Ps will + // observe that write barriers are enabled by + // the time we start the world and begin + // scanning. + // + // Write barriers must be enabled before assists are + // enabled because they must be enabled before + // any non-leaf heap objects are marked. Since + // allocations are blocked until assists can + // happen, we want enable assists as early as + // possible. + setGCPhase(_GCmark) + + gcBgMarkPrepare() // Must happen before assist enable. + gcMarkRootPrepare() + + // Mark all active tinyalloc blocks. Since we're + // allocating from these, they need to be black like + // other allocations. The alternative is to blacken + // the tiny block on every allocation from it, which + // would slow down the tiny allocator. + gcMarkTinyAllocs() + + // At this point all Ps have enabled the write + // barrier, thus maintaining the no white to + // black invariant. Enable mutator assists to + // put back-pressure on fast allocating + // mutators. + atomic.Store(&gcBlackenEnabled, 1) + + // Assists and workers can start the moment we start + // the world. + gcController.markStartTime = now + + // Concurrent mark. + systemstack(func() { + now = startTheWorldWithSema(trace.enabled) work.pauseNS += now - work.pauseStart work.tMark = now - } else { - if trace.enabled { - // Switch to mark termination STW. - traceGCSTWDone() - traceGCSTWStart(0) - } - t := nanotime() - work.tMark, work.tMarkTerm = t, t - work.heapGoal = work.heap0 - - // Perform mark termination. This will restart the world. - gcMarkTermination(memstats.triggerRatio) + }) + // In STW mode, we could block the instant systemstack + // returns, so don't do anything important here. Make sure we + // block rather than returning to user code. + if mode != gcBackgroundMode { + Gosched() } semrelease(&work.startSema) } -// gcMarkDone transitions the GC from mark 1 to mark 2 and from mark 2 -// to mark termination. +// gcMarkDoneFlushed counts the number of P's with flushed work. // -// This should be called when all mark work has been drained. In mark -// 1, this includes all root marking jobs, global work buffers, and -// active work buffers in assists and background workers; however, -// work may still be cached in per-P work buffers. In mark 2, per-P -// caches are disabled. +// Ideally this would be a captured local in gcMarkDone, but forEachP +// escapes its callback closure, so it can't capture anything. +// +// This is protected by markDoneSema. +var gcMarkDoneFlushed uint32 + +// debugCachedWork enables extra checks for debugging premature mark +// termination. +// +// For debugging issue #27993. +const debugCachedWork = false + +// gcWorkPauseGen is for debugging the mark completion algorithm. +// gcWork put operations spin while gcWork.pauseGen == gcWorkPauseGen. +// Only used if debugCachedWork is true. +// +// For debugging issue #27993. +var gcWorkPauseGen uint32 = 1 + +// gcMarkDone transitions the GC from mark to mark termination if all +// reachable objects have been marked (that is, there are no grey +// objects and can be no more in the future). Otherwise, it flushes +// all local work to the global queues where it can be discovered by +// other workers. 
+// +// This should be called when all local mark work has been drained and +// there are no remaining workers. Specifically, when +// +// work.nwait == work.nproc && !gcMarkWorkAvailable(p) // // The calling context must be preemptible. // -// Note that it is explicitly okay to have write barriers in this -// function because completion of concurrent mark is best-effort -// anyway. Any work created by write barriers here will be cleaned up -// by mark termination. +// Flushing local work is important because idle Ps may have local +// work queued. This is the only way to make that work visible and +// drive GC to completion. +// +// It is explicitly okay to have write barriers in this function. If +// it does transition to mark termination, then all reachable objects +// have been marked, so the write barrier cannot shade any more +// objects. func gcMarkDone() { -top: + // Ensure only one thread is running the ragged barrier at a + // time. semacquire(&work.markDoneSema) +top: // Re-check transition condition under transition lock. + // + // It's critical that this checks the global work queues are + // empty before performing the ragged barrier. Otherwise, + // there could be global work that a P could take after the P + // has passed the ragged barrier. if !(gcphase == _GCmark && work.nwait == work.nproc && !gcMarkWorkAvailable(nil)) { semrelease(&work.markDoneSema) return } - // Disallow starting new workers so that any remaining workers - // in the current mark phase will drain out. - // - // TODO(austin): Should dedicated workers keep an eye on this - // and exit gcDrain promptly? - atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, -0xffffffff) - prevFractionalGoal := gcController.fractionalUtilizationGoal - gcController.fractionalUtilizationGoal = 0 - - if !gcBlackenPromptly { - // Transition from mark 1 to mark 2. - // - // The global work list is empty, but there can still be work - // sitting in the per-P work caches. - // Flush and disable work caches. - - // Disallow caching workbufs and indicate that we're in mark 2. - gcBlackenPromptly = true - - // Prevent completion of mark 2 until we've flushed - // cached workbufs. - atomic.Xadd(&work.nwait, -1) - - // GC is set up for mark 2. Let Gs blocked on the - // transition lock go while we flush caches. - semrelease(&work.markDoneSema) - - systemstack(func() { - // Flush all currently cached workbufs and - // ensure all Ps see gcBlackenPromptly. This - // also blocks until any remaining mark 1 - // workers have exited their loop so we can - // start new mark 2 workers. - forEachP(func(_p_ *p) { - wbBufFlush1(_p_) - _p_.gcw.dispose() - }) + // Flush all local buffers and collect flushedWork flags. + gcMarkDoneFlushed = 0 + systemstack(func() { + gp := getg().m.curg + // Mark the user stack as preemptible so that it may be scanned. + // Otherwise, our attempt to force all P's to a safepoint could + // result in a deadlock as we attempt to preempt a worker that's + // trying to preempt us (e.g. for a stack scan). + casgstatus(gp, _Grunning, _Gwaiting) + forEachP(func(_p_ *p) { + // Flush the write barrier buffer, since this may add + // work to the gcWork. + wbBufFlush1(_p_) + // For debugging, shrink the write barrier + // buffer so it flushes immediately. + // wbBuf.reset will keep it at this size as + // long as throwOnGCWork is set. 
+ if debugCachedWork { + b := &_p_.wbBuf + b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers])) + b.debugGen = gcWorkPauseGen + } + // Flush the gcWork, since this may create global work + // and set the flushedWork flag. + // + // TODO(austin): Break up these workbufs to + // better distribute work. + _p_.gcw.dispose() + // Collect the flushedWork flag. + if _p_.gcw.flushedWork { + atomic.Xadd(&gcMarkDoneFlushed, 1) + _p_.gcw.flushedWork = false + } else if debugCachedWork { + // For debugging, freeze the gcWork + // until we know whether we've reached + // completion or not. If we think + // we've reached completion, but + // there's a paused gcWork, then + // that's a bug. + _p_.gcw.pauseGen = gcWorkPauseGen + // Capture the G's stack. + for i := range _p_.gcw.pauseStack { + _p_.gcw.pauseStack[i].pc = 0 + } + callers(1, _p_.gcw.pauseStack[:]) + } }) + casgstatus(gp, _Gwaiting, _Grunning) + }) - // Check that roots are marked. We should be able to - // do this before the forEachP, but based on issue - // #16083 there may be a (harmless) race where we can - // enter mark 2 while some workers are still scanning - // stacks. The forEachP ensures these scans are done. - // - // TODO(austin): Figure out the race and fix this - // properly. - gcMarkRootCheck() + if gcMarkDoneFlushed != 0 { + if debugCachedWork { + // Release paused gcWorks. + atomic.Xadd(&gcWorkPauseGen, 1) + } + // More grey objects were discovered since the + // previous termination check, so there may be more + // work to do. Keep going. It's possible the + // transition condition became true again during the + // ragged barrier, so re-check it. + goto top + } - // Now we can start up mark 2 workers. - atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 0xffffffff) - gcController.fractionalUtilizationGoal = prevFractionalGoal + if debugCachedWork { + throwOnGCWork = true + // Release paused gcWorks. If there are any, they + // should now observe throwOnGCWork and panic. + atomic.Xadd(&gcWorkPauseGen, 1) + } - incnwait := atomic.Xadd(&work.nwait, +1) - if incnwait == work.nproc && !gcMarkWorkAvailable(nil) { - // This loop will make progress because - // gcBlackenPromptly is now true, so it won't - // take this same "if" branch. - goto top + // There was no global work, no local work, and no Ps + // communicated work since we took markDoneSema. Therefore + // there are no grey objects and no more objects can be + // shaded. Transition to mark termination. + now := nanotime() + work.tMarkTerm = now + work.pauseStart = now + getg().m.preemptoff = "gcing" + if trace.enabled { + traceGCSTWStart(0) + } + systemstack(stopTheWorldWithSema) + // The gcphase is _GCmark, it will transition to _GCmarktermination + // below. The important thing is that the wb remains active until + // all marking is complete. This includes writes made by the GC. + + if debugCachedWork { + // For debugging, double check that no work was added after we + // went around above and disable write barrier buffering. + for _, p := range allp { + gcw := &p.gcw + if !gcw.empty() { + printlock() + print("runtime: P ", p.id, " flushedWork ", gcw.flushedWork) + if gcw.wbuf1 == nil { + print(" wbuf1=<nil>") + } else { + print(" wbuf1.n=", gcw.wbuf1.nobj) + } + if gcw.wbuf2 == nil { + print(" wbuf2=<nil>") + } else { + print(" wbuf2.n=", gcw.wbuf2.nobj) + } + print("\n") + if gcw.pauseGen == gcw.putGen { + println("runtime: checkPut already failed at this generation") + } + throw("throwOnGCWork") + } } } else { - // Transition to mark termination. 
- now := nanotime() - work.tMarkTerm = now - work.pauseStart = now - getg().m.preemptoff = "gcing" - if trace.enabled { - traceGCSTWStart(0) + // For unknown reasons (see issue #27993), there is + // sometimes work left over when we enter mark + // termination. Detect this and resume concurrent + // mark. This is obviously unfortunate. + // + // Switch to the system stack to call wbBufFlush1, + // though in this case it doesn't matter because we're + // non-preemptible anyway. + restart := false + systemstack(func() { + for _, p := range allp { + wbBufFlush1(p) + if !p.gcw.empty() { + restart = true + break + } + } + }) + if restart { + getg().m.preemptoff = "" + systemstack(func() { + now := startTheWorldWithSema(true) + work.pauseNS += now - work.pauseStart + }) + goto top } - systemstack(stopTheWorldWithSema) - // The gcphase is _GCmark, it will transition to _GCmarktermination - // below. The important thing is that the wb remains active until - // all marking is complete. This includes writes made by the GC. + } - // Record that one root marking pass has completed. - work.markrootDone = true + // Disable assists and background workers. We must do + // this before waking blocked assists. + atomic.Store(&gcBlackenEnabled, 0) - // Disable assists and background workers. We must do - // this before waking blocked assists. - atomic.Store(&gcBlackenEnabled, 0) + // Wake all blocked assists. These will run when we + // start the world again. + gcWakeAllAssists() - // Wake all blocked assists. These will run when we - // start the world again. - gcWakeAllAssists() + // Likewise, release the transition lock. Blocked + // workers and assists will run when we start the + // world again. + semrelease(&work.markDoneSema) - // Likewise, release the transition lock. Blocked - // workers and assists will run when we start the - // world again. - semrelease(&work.markDoneSema) + // In STW mode, re-enable user goroutines. These will be + // queued to run after we start the world. + schedEnableUser(true) - // endCycle depends on all gcWork cache stats being - // flushed. This is ensured by mark 2. - nextTriggerRatio := gcController.endCycle() + // endCycle depends on all gcWork cache stats being flushed. + // The termination algorithm above ensured that up to + // allocations since the ragged barrier. + nextTriggerRatio := gcController.endCycle() - // Perform mark termination. This will restart the world. - gcMarkTermination(nextTriggerRatio) - } + // Perform mark termination. This will restart the world. + gcMarkTermination(nextTriggerRatio) } func gcMarkTermination(nextTriggerRatio float64) { // World is stopped. // Start marktermination which includes enabling the write barrier. atomic.Store(&gcBlackenEnabled, 0) - gcBlackenPromptly = false setGCPhase(_GCmarktermination) work.heap1 = memstats.heap_live @@ -1549,35 +1617,22 @@ func gcMarkTermination(nextTriggerRatio float64) { systemstack(func() { work.heap2 = work.bytesMarked if debug.gccheckmark > 0 { - // Run a full stop-the-world mark using checkmark bits, - // to check that we didn't forget to mark anything during - // the concurrent mark process. + // Run a full non-parallel, stop-the-world + // mark using checkmark bits, to check that we + // didn't forget to mark anything during the + // concurrent mark process. 
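The checkmark pass re-walks the entire object graph and verifies that concurrent mark did not miss anything. A toy sketch of that verification idea follows; it models mark bits as a plain bool and the graph as explicit pointers, so the types and messages are invented, not the runtime's.

// Toy sketch of checkmark-style verification (illustrative only).
package main

import "fmt"

type obj struct {
	marked bool // set by the (simulated) concurrent mark
	refs   []*obj
}

// checkmark re-walks the graph from the roots and panics if it finds a
// reachable object the first pass failed to mark, mirroring what the
// gccheckmark debug mode catches.
func checkmark(roots []*obj) {
	seen := map[*obj]bool{}
	var visit func(o *obj)
	visit = func(o *obj) {
		if o == nil || seen[o] {
			return
		}
		seen[o] = true
		if !o.marked {
			panic("checkmark found unmarked reachable object")
		}
		for _, r := range o.refs {
			visit(r)
		}
	}
	for _, r := range roots {
		visit(r)
	}
}

func main() {
	a := &obj{marked: true}
	b := &obj{marked: true, refs: []*obj{a}}
	checkmark([]*obj{b})
	fmt.Println("checkmark passed")
}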
gcResetMarkState() initCheckmarks() - gcMark(startTime) + gcw := &getg().m.p.ptr().gcw + gcDrain(gcw, 0) + wbBufFlush1(getg().m.p.ptr()) + gcw.dispose() clearCheckmarks() } // marking is complete so we can turn the write barrier off setGCPhase(_GCoff) gcSweep(work.mode) - - if debug.gctrace > 1 { - startTime = nanotime() - // The g stacks have been scanned so - // they have gcscanvalid==true and gcworkdone==true. - // Reset these so that all stacks will be rescanned. - gcResetMarkState() - finishsweep_m() - - // Still in STW but gcphase is _GCoff, reset to _GCmarktermination - // At this point all objects will be found during the gcMark which - // does a complete STW mark and object scan. - setGCPhase(_GCmarktermination) - gcMark(startTime) - setGCPhase(_GCoff) // marking is done, turn off wb. - gcSweep(work.mode) - } }) _g_.m.traceback = 0 @@ -1633,8 +1688,7 @@ func gcMarkTermination(nextTriggerRatio float64) { // Bump GC cycle count and wake goroutines waiting on sweep. lock(&work.sweepWaiters.lock) memstats.numgc++ - injectglist(work.sweepWaiters.head.ptr()) - work.sweepWaiters.head = 0 + injectglist(&work.sweepWaiters.list) unlock(&work.sweepWaiters.lock) // Finish the current heap profiling cycle and start a new @@ -1653,6 +1707,16 @@ func gcMarkTermination(nextTriggerRatio float64) { // asynchronously because it can take non-trivial time. prepareFreeWorkbufs() + // Ensure all mcaches are flushed. Each P will flush its own + // mcache before allocating, but idle Ps may not. Since this + // is necessary to sweep all spans, we need to ensure all + // mcaches are flushed before we start the next GC cycle. + systemstack(func() { + forEachP(func(_p_ *p) { + _p_.mcache.prepareForSweep() + }) + }) + // Print gctrace before dropping worldsema. As soon as we drop // worldsema another cycle could start and smash the stats // we're trying to print. @@ -1856,7 +1920,7 @@ func gcBgMarkWorker(_p_ *p) { } // Go back to draining, this time // without preemption. - gcDrain(&_p_.gcw, gcDrainNoBlock|gcDrainFlushBgCredit) + gcDrain(&_p_.gcw, gcDrainFlushBgCredit) case gcMarkWorkerFractionalMode: gcDrain(&_p_.gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit) case gcMarkWorkerIdleMode: @@ -1865,16 +1929,6 @@ func gcBgMarkWorker(_p_ *p) { casgstatus(gp, _Gwaiting, _Grunning) }) - // If we are nearing the end of mark, dispose - // of the cache promptly. We must do this - // before signaling that we're no longer - // working so that other workers can't observe - // no workers and no work while we have this - // cached, and before we compute done. - if gcBlackenPromptly { - _p_.gcw.dispose() - } - // Account for time. duration := nanotime() - startTime switch _p_.gcMarkWorkerMode { @@ -1921,7 +1975,7 @@ func gcBgMarkWorker(_p_ *p) { } } -// gcMarkWorkAvailable returns true if executing a mark worker +// gcMarkWorkAvailable reports whether executing a mark worker // on p is potentially useful. p may be nil, in which case it only // checks the global sources of work. func gcMarkWorkAvailable(p *p) bool { @@ -1951,50 +2005,11 @@ func gcMark(start_time int64) { } work.tstart = start_time - // Queue root marking jobs. - gcMarkRootPrepare() - - work.nwait = 0 - work.ndone = 0 - work.nproc = uint32(gcprocs()) - - if work.full == 0 && work.nDataRoots+work.nSpanRoots+work.nStackRoots == 0 { - // There's no work on the work queue and no root jobs - // that can produce work, so don't bother entering the - // getfull() barrier. 
- // - // This will be the situation the vast majority of the - // time after concurrent mark. However, we still need - // a fallback for STW GC and because there are some - // known races that occasionally leave work around for - // mark termination. - // - // We're still hedging our bets here: if we do - // accidentally produce some work, we'll still process - // it, just not necessarily in parallel. - // - // TODO(austin): Fix the races and and remove - // work draining from mark termination so we don't - // need the fallback path. - work.helperDrainBlock = false - } else { - work.helperDrainBlock = true - } - - if work.nproc > 1 { - noteclear(&work.alldone) - helpgc(int32(work.nproc)) - } - - gchelperstart() - - gcw := &getg().m.p.ptr().gcw - if work.helperDrainBlock { - gcDrain(gcw, gcDrainBlock) - } else { - gcDrain(gcw, gcDrainNoBlock) + // Check that there's no marking work remaining. + if work.full != 0 || work.markrootNext < work.markrootJobs { + print("runtime: full=", hex(work.full), " next=", work.markrootNext, " jobs=", work.markrootJobs, " nDataRoots=", work.nDataRoots, " nSpanRoots=", work.nSpanRoots, " nStackRoots=", work.nStackRoots, "\n") + panic("non-empty mark queue after concurrent mark") } - gcw.dispose() if debug.gccheckmark > 0 { // This is expensive when there's a large number of @@ -2005,25 +2020,52 @@ func gcMark(start_time int64) { throw("work.full != 0") } - if work.nproc > 1 { - notesleep(&work.alldone) - } - - // Record that at least one root marking pass has completed. - work.markrootDone = true - - // Double-check that all gcWork caches are empty. This should - // be ensured by mark 2 before we enter mark termination. + // Clear out buffers and double-check that all gcWork caches + // are empty. This should be ensured by gcMarkDone before we + // enter mark termination. + // + // TODO: We could clear out buffers just before mark if this + // has a non-negligible impact on STW time. for _, p := range allp { + // The write barrier may have buffered pointers since + // the gcMarkDone barrier. However, since the barrier + // ensured all reachable objects were marked, all of + // these must be pointers to black objects. Hence we + // can just discard the write barrier buffer. + if debug.gccheckmark > 0 || throwOnGCWork { + // For debugging, flush the buffer and make + // sure it really was all marked. + wbBufFlush1(p) + } else { + p.wbBuf.reset() + } + gcw := &p.gcw if !gcw.empty() { + printlock() + print("runtime: P ", p.id, " flushedWork ", gcw.flushedWork) + if gcw.wbuf1 == nil { + print(" wbuf1=<nil>") + } else { + print(" wbuf1.n=", gcw.wbuf1.nobj) + } + if gcw.wbuf2 == nil { + print(" wbuf2=<nil>") + } else { + print(" wbuf2.n=", gcw.wbuf2.nobj) + } + print("\n") throw("P has cached GC work at end of mark termination") } - if gcw.scanWork != 0 || gcw.bytesMarked != 0 { - throw("P has unflushed stats at end of mark termination") - } + // There may still be cached empty buffers, which we + // need to flush since we're going to free them. Also, + // there may be non-zero stats because we allocated + // black after the gcMarkDone barrier. + gcw.dispose() } + throwOnGCWork = false + cachestats() // Update the marked heap stat. 
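The next hunk seeds mheap_.reclaimIndex and mheap_.reclaimCredit, the shared cursor and credit pool used by the new page reclaimer. A minimal sketch of that claim-and-credit pattern, with invented names and a fake page count, is below: callers draw on leftover credit first, then claim chunks of the index atomically and bank any overshoot for later callers.

// Minimal sketch of claiming reclaim work from a shared index with a credit pool (illustrative only).
package main

import (
	"fmt"
	"sync/atomic"
)

const chunk = 8 // pages claimed per step (the runtime uses much larger chunks)

var (
	reclaimIndex  uint64        // next page index to scan
	reclaimCredit uint64        // spare pages reclaimed beyond what callers asked for
	totalPages    uint64 = 64   // pretend heap size in pages
)

// reclaim frees at least npages, drawing on shared credit first and
// banking any overshoot for the next caller.
func reclaim(npages uint64) {
	// Draw down credit left behind by previous callers.
	for npages > 0 {
		c := atomic.LoadUint64(&reclaimCredit)
		if c == 0 {
			break
		}
		take := c
		if take > npages {
			take = npages
		}
		if atomic.CompareAndSwapUint64(&reclaimCredit, c, c-take) {
			npages -= take
		}
	}
	// Claim chunks of the shared index until enough pages are reclaimed.
	for npages > 0 {
		idx := atomic.AddUint64(&reclaimIndex, chunk) - chunk
		if idx >= totalPages {
			break // nothing left to scan this cycle
		}
		got := uint64(chunk) // pretend every page in the chunk was reclaimable
		if got > npages {
			atomic.AddUint64(&reclaimCredit, got-npages)
			npages = 0
		} else {
			npages -= got
		}
	}
}

func main() {
	reclaim(10)
	fmt.Println("index:", reclaimIndex, "credit:", reclaimCredit)
}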
@@ -2055,6 +2097,9 @@ func gcSweep(mode gcMode) { throw("non-empty swept list") } mheap_.pagesSwept = 0 + mheap_.sweepArenas = mheap_.allArenas + mheap_.reclaimIndex = 0 + mheap_.reclaimCredit = 0 unlock(&mheap_.lock) if !_ConcurrentSweep || mode == gcForceBlockMode { @@ -2104,9 +2149,20 @@ func gcResetMarkState() { } unlock(&allglock) + // Clear page marks. This is just 1MB per 64GB of heap, so the + // time here is pretty trivial. + lock(&mheap_.lock) + arenas := mheap_.allArenas + unlock(&mheap_.lock) + for _, ai := range arenas { + ha := mheap_.arenas[ai.l1()][ai.l2()] + for i := range ha.pageMarks { + ha.pageMarks[i] = 0 + } + } + work.bytesMarked = 0 work.initialHeapLive = atomic.Load64(&memstats.heap_live) - work.markrootDone = false } // Hooks for other packages @@ -2151,48 +2207,6 @@ func clearpools() { unlock(&sched.deferlock) } -// gchelper runs mark termination tasks on Ps other than the P -// coordinating mark termination. -// -// The caller is responsible for ensuring that this has a P to run on, -// even though it's running during STW. Because of this, it's allowed -// to have write barriers. -// -//go:yeswritebarrierrec -func gchelper() { - _g_ := getg() - _g_.m.traceback = 2 - gchelperstart() - - // Parallel mark over GC roots and heap - if gcphase == _GCmarktermination { - gcw := &_g_.m.p.ptr().gcw - if work.helperDrainBlock { - gcDrain(gcw, gcDrainBlock) // blocks in getfull - } else { - gcDrain(gcw, gcDrainNoBlock) - } - gcw.dispose() - } - - nproc := atomic.Load(&work.nproc) // work.nproc can change right after we increment work.ndone - if atomic.Xadd(&work.ndone, +1) == nproc-1 { - notewakeup(&work.alldone) - } - _g_.m.traceback = 0 -} - -func gchelperstart() { - _g_ := getg() - - if _g_.m.helpgc < 0 || _g_.m.helpgc >= _MaxGcproc { - throw("gchelperstart: bad m->helpgc") - } - if _g_ != _g_.m.g0 { - throw("gchelper not running on g0 stack") - } -} - // Timing // itoaDiv formats val/(10**dec) into buf. diff --git a/libgo/go/runtime/mgc_gccgo.go b/libgo/go/runtime/mgc_gccgo.go index b396d35146c..85045170b3c 100644 --- a/libgo/go/runtime/mgc_gccgo.go +++ b/libgo/go/runtime/mgc_gccgo.go @@ -146,7 +146,7 @@ func registerGCRoots(r *gcRootList) { // and carries on. func checkPreempt() { gp := getg() - if !gp.preempt || gp != gp.m.curg || gp.m.locks != 0 || gp.m.mallocing != 0 || gp.m.preemptoff != "" { + if !gp.preempt || gp != gp.m.curg || gp.m.locks != 0 || gp.m.mallocing != 0 || gp.m.preemptoff != "" || gp.m.incgo { return } @@ -165,9 +165,6 @@ func checkPreempt() { mp := acquirem() gcw := &gp.m.p.ptr().gcw scanstack(gp, gcw) - if gcBlackenPromptly { - gcw.dispose() - } releasem(mp) gp.gcscandone = true } diff --git a/libgo/go/runtime/mgclarge.go b/libgo/go/runtime/mgclarge.go index e7fa831937a..7b01a117808 100644 --- a/libgo/go/runtime/mgclarge.go +++ b/libgo/go/runtime/mgclarge.go @@ -46,13 +46,57 @@ type treapNode struct { priority uint32 // random number used by treap algorithm to keep tree probabilistically balanced } -func (t *treapNode) init() { - t.right = nil - t.left = nil - t.parent = nil - t.spanKey = nil - t.npagesKey = 0 - t.priority = 0 +func (t *treapNode) pred() *treapNode { + if t.left != nil { + // If it has a left child, its predecessor will be + // its right most left (grand)child. + t = t.left + for t.right != nil { + t = t.right + } + return t + } + // If it has no left child, its predecessor will be + // the first grandparent who's right child is its + // ancestor. 
+ // + // We compute this by walking up the treap until the + // current node's parent is its parent's right child. + // + // If we find at any point walking up the treap + // that the current node doesn't have a parent, + // we've hit the root. This means that t is already + // the left-most node in the treap and therefore + // has no predecessor. + for t.parent != nil && t.parent.right != t { + if t.parent.left != t { + println("runtime: predecessor t=", t, "t.spanKey=", t.spanKey) + throw("node is not its parent's child") + } + t = t.parent + } + return t.parent +} + +func (t *treapNode) succ() *treapNode { + if t.right != nil { + // If it has a right child, its successor will be + // its left-most right (grand)child. + t = t.right + for t.left != nil { + t = t.left + } + return t + } + // See pred. + for t.parent != nil && t.parent.left != t { + if t.parent.right != t { + println("runtime: predecessor t=", t, "t.spanKey=", t.spanKey) + throw("node is not its parent's child") + } + t = t.parent + } + return t.parent } // isSpanInTreap is handy for debugging. One should hold the heap lock, usually @@ -109,6 +153,68 @@ func checkTreapNode(t *treapNode) { } } +// treapIter is a bidirectional iterator type which may be used to iterate over a +// an mTreap in-order forwards (increasing order) or backwards (decreasing order). +// Its purpose is to hide details about the treap from users when trying to iterate +// over it. +// +// To create iterators over the treap, call start or end on an mTreap. +type treapIter struct { + t *treapNode +} + +// span returns the span at the current position in the treap. +// If the treap is not valid, span will panic. +func (i *treapIter) span() *mspan { + return i.t.spanKey +} + +// valid returns whether the iterator represents a valid position +// in the mTreap. +func (i *treapIter) valid() bool { + return i.t != nil +} + +// next moves the iterator forward by one. Once the iterator +// ceases to be valid, calling next will panic. +func (i treapIter) next() treapIter { + i.t = i.t.succ() + return i +} + +// prev moves the iterator backwards by one. Once the iterator +// ceases to be valid, calling prev will panic. +func (i treapIter) prev() treapIter { + i.t = i.t.pred() + return i +} + +// start returns an iterator which points to the start of the treap (the +// left-most node in the treap). +func (root *mTreap) start() treapIter { + t := root.treap + if t == nil { + return treapIter{} + } + for t.left != nil { + t = t.left + } + return treapIter{t: t} +} + +// end returns an iterator which points to the end of the treap (the +// right-most node in the treap). +func (root *mTreap) end() treapIter { + t := root.treap + if t == nil { + return treapIter{} + } + for t.right != nil { + t = t.right + } + return treapIter{t: t} +} + // insert adds span to the large span treap. func (root *mTreap) insert(span *mspan) { npages := span.npages @@ -120,10 +226,10 @@ func (root *mTreap) insert(span *mspan) { pt = &t.right } else if t.npagesKey > npages { pt = &t.left - } else if uintptr(unsafe.Pointer(t.spanKey)) < uintptr(unsafe.Pointer(span)) { + } else if t.spanKey.base() < span.base() { // t.npagesKey == npages, so sort on span addresses. 
pt = &t.right - } else if uintptr(unsafe.Pointer(t.spanKey)) > uintptr(unsafe.Pointer(span)) { + } else if t.spanKey.base() > span.base() { pt = &t.left } else { throw("inserting span already in treap") @@ -140,7 +246,6 @@ func (root *mTreap) insert(span *mspan) { // https://faculty.washington.edu/aragon/pubs/rst89.pdf t := (*treapNode)(mheap_.treapalloc.alloc()) - t.init() t.npagesKey = span.npages t.priority = fastrand() t.spanKey = span @@ -168,7 +273,6 @@ func (root *mTreap) removeNode(t *treapNode) { if t.spanKey.npages != t.npagesKey { throw("span and treap node npages do not match") } - // Rotate t down to be leaf of tree for removal, respecting priorities. for t.right != nil || t.left != nil { if t.right == nil || t.left != nil && t.left.priority < t.right.priority { @@ -188,19 +292,16 @@ func (root *mTreap) removeNode(t *treapNode) { root.treap = nil } // Return the found treapNode's span after freeing the treapNode. - t.spanKey = nil - t.npagesKey = 0 mheap_.treapalloc.free(unsafe.Pointer(t)) } -// remove searches for, finds, removes from the treap, and returns the smallest -// span that can hold npages. If no span has at least npages return nil. +// find searches for, finds, and returns the treap node containing the +// smallest span that can hold npages. If no span has at least npages +// it returns nil. // This is slightly more complicated than a simple binary tree search // since if an exact match is not found the next larger node is // returned. -// If the last node inspected > npagesKey not holding -// a left node (a smaller npages) is the "best fit" node. -func (root *mTreap) remove(npages uintptr) *mspan { +func (root *mTreap) find(npages uintptr) *treapNode { t := root.treap for t != nil { if t.spanKey == nil { @@ -211,9 +312,7 @@ func (root *mTreap) remove(npages uintptr) *mspan { } else if t.left != nil && t.left.npagesKey >= npages { t = t.left } else { - result := t.spanKey - root.removeNode(t) - return result + return t } } return nil @@ -231,24 +330,21 @@ func (root *mTreap) removeSpan(span *mspan) { t = t.right } else if t.npagesKey > npages { t = t.left - } else if uintptr(unsafe.Pointer(t.spanKey)) < uintptr(unsafe.Pointer(span)) { + } else if t.spanKey.base() < span.base() { t = t.right - } else if uintptr(unsafe.Pointer(t.spanKey)) > uintptr(unsafe.Pointer(span)) { + } else if t.spanKey.base() > span.base() { t = t.left } } root.removeNode(t) } -// scavengetreap visits each node in the treap and scavenges the -// treapNode's span. -func scavengetreap(treap *treapNode, now, limit uint64) uintptr { - if treap == nil { - return 0 - } - return scavengeTreapNode(treap, now, limit) + - scavengetreap(treap.left, now, limit) + - scavengetreap(treap.right, now, limit) +// erase removes the element referred to by the current position of the +// iterator. This operation consumes the given iterator, so it should no +// longer be used. It is up to the caller to get the next or previous +// iterator before calling erase, if need be. +func (root *mTreap) erase(i treapIter) { + root.removeNode(i.t) } // rotateLeft rotates the tree rooted at node x. 
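For reference, here is a small, self-contained sketch of how the new iterator API above is intended to be used: start() positions at the left-most node and succ() steps in order, which is exactly the shape of treapIter.next. The node and tree types are simplified stand-ins, without priorities or rotations.

// Sketch of in-order iteration over a parent-pointer tree, mirroring treapIter (illustrative only).
package main

import "fmt"

type node struct {
	key                 int
	left, right, parent *node
}

type tree struct{ root *node }

func (t *tree) insert(k int) {
	n := &node{key: k}
	if t.root == nil {
		t.root = n
		return
	}
	cur := t.root
	for {
		if k < cur.key {
			if cur.left == nil {
				cur.left, n.parent = n, cur
				return
			}
			cur = cur.left
		} else {
			if cur.right == nil {
				cur.right, n.parent = n, cur
				return
			}
			cur = cur.right
		}
	}
}

// succ mirrors treapNode.succ: the left-most node of the right subtree, or
// the first ancestor of which the node lies in the left subtree.
func succ(n *node) *node {
	if n.right != nil {
		n = n.right
		for n.left != nil {
			n = n.left
		}
		return n
	}
	for n.parent != nil && n.parent.left != n {
		n = n.parent
	}
	return n.parent
}

// start mirrors mTreap.start: the left-most (smallest) node.
func (t *tree) start() *node {
	n := t.root
	if n == nil {
		return nil
	}
	for n.left != nil {
		n = n.left
	}
	return n
}

func main() {
	t := &tree{}
	for _, k := range []int{5, 2, 8, 1, 3} {
		t.insert(k)
	}
	for it := t.start(); it != nil; it = succ(it) {
		fmt.Print(it.key, " ") // 1 2 3 5 8
	}
	fmt.Println()
}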
diff --git a/libgo/go/runtime/mgcmark.go b/libgo/go/runtime/mgcmark.go index 631c4d7133b..9da881ed2ab 100644 --- a/libgo/go/runtime/mgcmark.go +++ b/libgo/go/runtime/mgcmark.go @@ -52,62 +52,37 @@ const ( // //go:nowritebarrier func gcMarkRootPrepare() { - if gcphase == _GCmarktermination { - work.nFlushCacheRoots = int(gomaxprocs) - } else { - work.nFlushCacheRoots = 0 - } + work.nFlushCacheRoots = 0 work.nDataRoots = 0 // Only scan globals once per cycle; preferably concurrently. - if !work.markrootDone { - roots := gcRoots - for roots != nil { - work.nDataRoots++ - roots = roots.next - } + roots := gcRoots + for roots != nil { + work.nDataRoots++ + roots = roots.next } - if !work.markrootDone { - // On the first markroot, we need to scan span roots. - // In concurrent GC, this happens during concurrent - // mark and we depend on addfinalizer to ensure the - // above invariants for objects that get finalizers - // after concurrent mark. In STW GC, this will happen - // during mark termination. - // - // We're only interested in scanning the in-use spans, - // which will all be swept at this point. More spans - // may be added to this list during concurrent GC, but - // we only care about spans that were allocated before - // this mark phase. - work.nSpanRoots = mheap_.sweepSpans[mheap_.sweepgen/2%2].numBlocks() - - // On the first markroot, we need to scan all Gs. Gs - // may be created after this point, but it's okay that - // we ignore them because they begin life without any - // roots, so there's nothing to scan, and any roots - // they create during the concurrent phase will be - // scanned during mark termination. During mark - // termination, allglen isn't changing, so we'll scan - // all Gs. - work.nStackRoots = int(atomic.Loaduintptr(&allglen)) - } else { - // We've already scanned span roots and kept the scan - // up-to-date during concurrent mark. - work.nSpanRoots = 0 - - // The hybrid barrier ensures that stacks can't - // contain pointers to unmarked objects, so on the - // second markroot, there's no need to scan stacks. - work.nStackRoots = 0 - - if debug.gcrescanstacks > 0 { - // Scan stacks anyway for debugging. - work.nStackRoots = int(atomic.Loaduintptr(&allglen)) - } - } + // Scan span roots for finalizer specials. + // + // We depend on addfinalizer to mark objects that get + // finalizers after root marking. + // + // We're only interested in scanning the in-use spans, + // which will all be swept at this point. More spans + // may be added to this list during concurrent GC, but + // we only care about spans that were allocated before + // this mark phase. + work.nSpanRoots = mheap_.sweepSpans[mheap_.sweepgen/2%2].numBlocks() + + // Scan stacks. + // + // Gs may be created after this point, but it's okay that we + // ignore them because they begin life without any roots, so + // there's nothing to scan, and any roots they create during + // the concurrent phase will be scanned during mark + // termination. + work.nStackRoots = int(atomic.Loaduintptr(&allglen)) work.markrootNext = 0 work.markrootJobs = uint32(fixedRootCount + work.nFlushCacheRoots + work.nDataRoots + work.nSpanRoots + work.nStackRoots) @@ -124,19 +99,10 @@ func gcMarkRootCheck() { lock(&allglock) // Check that stacks have been scanned. 
var gp *g - if gcphase == _GCmarktermination && debug.gcrescanstacks > 0 { - for i := 0; i < len(allgs); i++ { - gp = allgs[i] - if !(gp.gcscandone && gp.gcscanvalid) && readgstatus(gp) != _Gdead { - goto fail - } - } - } else { - for i := 0; i < work.nStackRoots; i++ { - gp = allgs[i] - if !gp.gcscandone { - goto fail - } + for i := 0; i < work.nStackRoots; i++ { + gp = allgs[i] + if !gp.gcscandone { + goto fail } } unlock(&allglock) @@ -188,11 +154,6 @@ func markroot(gcw *gcWork, i uint32) { } case i == fixedRootFinalizers: - // Only do this once per GC cycle since we don't call - // queuefinalizer during marking. - if work.markrootDone { - break - } for fb := allfin; fb != nil; fb = fb.alllink { cnt := uintptr(atomic.Load(&fb.cnt)) scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), cnt*unsafe.Sizeof(fb.fin[0]), &finptrmask[0], gcw) @@ -202,7 +163,7 @@ func markroot(gcw *gcWork, i uint32) { // FIXME: We don't do this for gccgo. case baseSpans <= i && i < baseStacks: - // mark MSpan.specials + // mark mspan.specials markrootSpans(gcw, int(i-baseSpans)) default: @@ -278,10 +239,6 @@ func markrootSpans(gcw *gcWork, shard int) { // TODO(austin): There are several ideas for making this more // efficient in issue #11485. - if work.markrootDone { - throw("markrootSpans during second markroot") - } - sg := mheap_.sweepgen spans := mheap_.sweepSpans[mheap_.sweepgen/2%2].block(shard) // Note that work.spans may not include spans that were @@ -294,7 +251,8 @@ func markrootSpans(gcw *gcWork, shard int) { if s.state != mSpanInUse { continue } - if !useCheckmark && s.sweepgen != sg { + // Check that this span was swept (it may be cached or uncached). + if !useCheckmark && !(s.sweepgen == sg || s.sweepgen == sg+3) { // sweepgen was updated (+2) during non-checkmark GC pass print("sweep ", s.sweepgen, " ", sg, "\n") throw("gc: unswept span") @@ -492,11 +450,6 @@ func gcAssistAlloc1(gp *g, scanWork int64) { // will be more cache friendly. gcw := &getg().m.p.ptr().gcw workDone := gcDrainN(gcw, scanWork) - // If we are near the end of the mark phase - // dispose of the gcw. - if gcBlackenPromptly { - gcw.dispose() - } casgstatus(gp, _Gwaiting, _Grunning) @@ -513,8 +466,7 @@ func gcAssistAlloc1(gp *g, scanWork int64) { incnwait := atomic.Xadd(&work.nwait, +1) if incnwait > work.nproc { println("runtime: work.nwait=", incnwait, - "work.nproc=", work.nproc, - "gcBlackenPromptly=", gcBlackenPromptly) + "work.nproc=", work.nproc) throw("work.nwait > work.nproc") } @@ -539,15 +491,14 @@ func gcAssistAlloc1(gp *g, scanWork int64) { // new assists from going to sleep after this point. func gcWakeAllAssists() { lock(&work.assistQueue.lock) - injectglist(work.assistQueue.head.ptr()) - work.assistQueue.head.set(nil) - work.assistQueue.tail.set(nil) + list := work.assistQueue.q.popList() + injectglist(&list) unlock(&work.assistQueue.lock) } // gcParkAssist puts the current goroutine on the assist queue and parks. // -// gcParkAssist returns whether the assist is now satisfied. If it +// gcParkAssist reports whether the assist is now satisfied. If it // returns false, the caller must retry the assist. 
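The assist queue is switched from hand-maintained head/tail fields to a small FIFO type with empty, pushBack, pop and popList. A standalone sketch of such an intrusive queue is below; the g type and field names are simplified stand-ins for the runtime's g and its queue type.

// Sketch of an intrusive FIFO queue of goroutine-like nodes (illustrative only).
package main

import "fmt"

type g struct {
	id        int
	schedlink *g // intrusive link, analogous to g.schedlink
}

type gQueue struct {
	head, tail *g
}

func (q *gQueue) empty() bool { return q.head == nil }

// pushBack adds gp to the tail of the queue.
func (q *gQueue) pushBack(gp *g) {
	gp.schedlink = nil
	if q.tail != nil {
		q.tail.schedlink = gp
	} else {
		q.head = gp
	}
	q.tail = gp
}

// pop removes and returns the head of the queue, or nil if it is empty.
func (q *gQueue) pop() *g {
	gp := q.head
	if gp != nil {
		q.head = gp.schedlink
		if q.head == nil {
			q.tail = nil
		}
		gp.schedlink = nil
	}
	return gp
}

// popList takes the whole queue at once, leaving it empty (the shape
// gcWakeAllAssists wants before handing everything to injectglist).
func (q *gQueue) popList() gQueue {
	all := *q
	*q = gQueue{}
	return all
}

func main() {
	var q gQueue
	for i := 1; i <= 3; i++ {
		q.pushBack(&g{id: i})
	}
	for gp := q.pop(); gp != nil; gp = q.pop() {
		fmt.Print(gp.id, " ") // 1 2 3
	}
	fmt.Println()
}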
// //go:nowritebarrier @@ -562,24 +513,17 @@ func gcParkAssist() bool { } gp := getg() - oldHead, oldTail := work.assistQueue.head, work.assistQueue.tail - if oldHead == 0 { - work.assistQueue.head.set(gp) - } else { - oldTail.ptr().schedlink.set(gp) - } - work.assistQueue.tail.set(gp) - gp.schedlink.set(nil) + oldList := work.assistQueue.q + work.assistQueue.q.pushBack(gp) // Recheck for background credit now that this G is in // the queue, but can still back out. This avoids a // race in case background marking has flushed more // credit since we checked above. if atomic.Loadint64(&gcController.bgScanCredit) > 0 { - work.assistQueue.head = oldHead - work.assistQueue.tail = oldTail - if oldTail != 0 { - oldTail.ptr().schedlink.set(nil) + work.assistQueue.q = oldList + if oldList.tail != 0 { + oldList.tail.ptr().schedlink.set(nil) } unlock(&work.assistQueue.lock) return false @@ -600,7 +544,7 @@ func gcParkAssist() bool { // //go:nowritebarrierrec func gcFlushBgCredit(scanWork int64) { - if work.assistQueue.head == 0 { + if work.assistQueue.q.empty() { // Fast path; there are no blocked assists. There's a // small window here where an assist may add itself to // the blocked queue and park. If that happens, we'll @@ -612,23 +556,21 @@ func gcFlushBgCredit(scanWork int64) { scanBytes := int64(float64(scanWork) * gcController.assistBytesPerWork) lock(&work.assistQueue.lock) - gp := work.assistQueue.head.ptr() - for gp != nil && scanBytes > 0 { + for !work.assistQueue.q.empty() && scanBytes > 0 { + gp := work.assistQueue.q.pop() // Note that gp.gcAssistBytes is negative because gp // is in debt. Think carefully about the signs below. if scanBytes+gp.gcAssistBytes >= 0 { // Satisfy this entire assist debt. scanBytes += gp.gcAssistBytes gp.gcAssistBytes = 0 - xgp := gp - gp = gp.schedlink.ptr() - // It's important that we *not* put xgp in + // It's important that we *not* put gp in // runnext. Otherwise, it's possible for user // code to exploit the GC worker's high // scheduler priority to get itself always run // before other goroutines and always in the // fresh quantum started by GC. - ready(xgp, 0, false) + ready(gp, 0, false) } else { // Partially satisfy this assist. gp.gcAssistBytes += scanBytes @@ -637,23 +579,10 @@ func gcFlushBgCredit(scanWork int64) { // back of the queue so that large assists // can't clog up the assist queue and // substantially delay small assists. - xgp := gp - gp = gp.schedlink.ptr() - if gp == nil { - // gp is the only assist in the queue. - gp = xgp - } else { - xgp.schedlink = 0 - work.assistQueue.tail.ptr().schedlink.set(xgp) - work.assistQueue.tail.set(xgp) - } + work.assistQueue.q.pushBack(gp) break } } - work.assistQueue.head.set(gp) - if gp == nil { - work.assistQueue.tail.set(nil) - } if scanBytes > 0 { // Convert from scan bytes back to work. @@ -707,11 +636,6 @@ func scanstack(gp *g, gcw *gcWork) { // ok } - mp := gp.m - if mp != nil && mp.helpgc != 0 { - throw("can't scan gchelper stack") - } - // Scan the stack. if usestackmaps { g := getg() @@ -775,34 +699,26 @@ type gcDrainFlags int const ( gcDrainUntilPreempt gcDrainFlags = 1 << iota - gcDrainNoBlock gcDrainFlushBgCredit gcDrainIdle gcDrainFractional - - // gcDrainBlock means neither gcDrainUntilPreempt or - // gcDrainNoBlock. It is the default, but callers should use - // the constant for documentation purposes. - gcDrainBlock gcDrainFlags = 0 ) // gcDrain scans roots and objects in work buffers, blackening grey -// objects until all roots and work buffers have been drained. 
+// objects until it is unable to get more work. It may return before +// GC is done; it's the caller's responsibility to balance work from +// other Ps. // // If flags&gcDrainUntilPreempt != 0, gcDrain returns when g.preempt -// is set. This implies gcDrainNoBlock. +// is set. // // If flags&gcDrainIdle != 0, gcDrain returns when there is other work -// to do. This implies gcDrainNoBlock. +// to do. // // If flags&gcDrainFractional != 0, gcDrain self-preempts when // pollFractionalWorkerExit() returns true. This implies // gcDrainNoBlock. // -// If flags&gcDrainNoBlock != 0, gcDrain returns as soon as it is -// unable to get more work. Otherwise, it will block until all -// blocking calls are blocked in gcDrain. -// // If flags&gcDrainFlushBgCredit != 0, gcDrain flushes scan work // credit to gcController.bgScanCredit every gcCreditSlack units of // scan work. @@ -815,7 +731,6 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) { gp := getg().m.curg preemptible := flags&gcDrainUntilPreempt != 0 - blocking := flags&(gcDrainUntilPreempt|gcDrainIdle|gcDrainFractional|gcDrainNoBlock) == 0 flushBgCredit := flags&gcDrainFlushBgCredit != 0 idle := flags&gcDrainIdle != 0 @@ -859,17 +774,19 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) { gcw.balance() } - var b uintptr - if blocking { - b = gcw.get() - } else { - b = gcw.tryGetFast() + b := gcw.tryGetFast() + if b == 0 { + b = gcw.tryGet() if b == 0 { + // Flush the write barrier + // buffer; this may create + // more work. + wbBufFlush(nil, 0) b = gcw.tryGet() } } if b == 0 { - // work barrier reached or tryGet failed. + // Unable to get work. break } scanobject(b, gcw) @@ -895,10 +812,6 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) { } } - // In blocking mode, write barriers are not allowed after this - // point because we must preserve the condition that the work - // buffers are empty. - done: // Flush remaining scan work credit. if gcw.scanWork > 0 { @@ -947,6 +860,12 @@ func gcDrainN(gcw *gcWork, scanWork int64) int64 { b := gcw.tryGetFast() if b == 0 { b = gcw.tryGet() + if b == 0 { + // Flush the write barrier buffer; + // this may create more work. + wbBufFlush(nil, 0) + b = gcw.tryGet() + } } if b == 0 { @@ -1005,9 +924,9 @@ func scanblock(b0, n0 uintptr, ptrmask *uint8, gcw *gcWork) { for j := 0; j < 8 && i < n; j++ { if bits&1 != 0 { // Same work as in scanobject; see comments there. - obj := *(*uintptr)(unsafe.Pointer(b + i)) - if obj != 0 { - if obj, span, objIndex := findObject(obj, b, i, false); obj != 0 { + p := *(*uintptr)(unsafe.Pointer(b + i)) + if p != 0 { + if obj, span, objIndex := findObject(p, b, i, false); obj != 0 { greyobject(obj, b, i, span, gcw, objIndex, false) } } @@ -1163,7 +1082,7 @@ func scanstackblockwithmap(pc, b0, n0 uintptr, ptrmask *uint8, gcw *gcWork) { if obj != 0 { o, span, objIndex := findObject(obj, b, i, false) if obj < minPhysPageSize || - span != nil && span.state != _MSpanManual && + span != nil && span.state != mSpanManual && (obj < span.base() || obj >= span.limit || span.state != mSpanInUse) { print("runtime: found in object at *(", hex(b), "+", hex(i), ") = ", hex(obj), ", pc=", hex(pc), "\n") name, file, line := funcfileline(pc, -1) @@ -1190,11 +1109,6 @@ func shade(b uintptr) { if obj, span, objIndex := findObject(b, 0, 0, true); obj != 0 { gcw := &getg().m.p.ptr().gcw greyobject(obj, 0, 0, span, gcw, objIndex, true) - if gcphase == _GCmarktermination || gcBlackenPromptly { - // Ps aren't allowed to cache work during mark - // termination. 
- gcw.dispose() - } } } @@ -1262,8 +1176,14 @@ func greyobject(obj, base, off uintptr, span *mspan, gcw *gcWork, objIndex uintp if mbits.isMarked() { return } - // mbits.setMarked() // Avoid extra call overhead with manual inlining. - atomic.Or8(mbits.bytep, mbits.mask) + mbits.setMarked() + + // Mark span. + arena, pageIdx, pageMask := pageIndexOf(span.base()) + if arena.pageMarks[pageIdx]&pageMask == 0 { + atomic.Or8(&arena.pageMarks[pageIdx], pageMask) + } + // If this is a noscan object, fast-track it to black // instead of greying it. if span.spanclass.noscan() { @@ -1301,7 +1221,7 @@ func gcDumpObject(label string, obj, off uintptr) { skipped := false size := s.elemsize - if s.state == _MSpanManual && size == 0 { + if s.state == mSpanManual && size == 0 { // We're printing something from a stack frame. We // don't know how big it is, so just show up to an // including off. @@ -1338,18 +1258,13 @@ func gcDumpObject(label string, obj, off uintptr) { //go:nowritebarrier //go:nosplit func gcmarknewobject(obj, size, scanSize uintptr) { - if useCheckmark && !gcBlackenPromptly { // The world should be stopped so this should not happen. + if useCheckmark { // The world should be stopped so this should not happen. throw("gcmarknewobject called while doing checkmark") } markBitsForAddr(obj).setMarked() gcw := &getg().m.p.ptr().gcw gcw.bytesMarked += uint64(size) gcw.scanWork += int64(scanSize) - if gcBlackenPromptly { - // There shouldn't be anything in the work queue, but - // we still need to flush stats. - gcw.dispose() - } } // gcMarkTinyAllocs greys all active tiny alloc blocks. @@ -1364,9 +1279,6 @@ func gcMarkTinyAllocs() { _, span, objIndex := findObject(c.tiny, 0, 0, false) gcw := &p.gcw greyobject(c.tiny, 0, 0, span, gcw, objIndex, false) - if gcBlackenPromptly { - gcw.dispose() - } } } @@ -1397,7 +1309,7 @@ var useCheckmark = false func initCheckmarks() { useCheckmark = true for _, s := range mheap_.allspans { - if s.state == _MSpanInUse { + if s.state == mSpanInUse { heapBitsForAddr(s.base()).initCheckmarkSpan(s.layout()) } } @@ -1406,7 +1318,7 @@ func initCheckmarks() { func clearCheckmarks() { useCheckmark = false for _, s := range mheap_.allspans { - if s.state == _MSpanInUse { + if s.state == mSpanInUse { heapBitsForAddr(s.base()).clearCheckmarkSpan(s.layout()) } } diff --git a/libgo/go/runtime/mgcsweep.go b/libgo/go/runtime/mgcsweep.go index 39dd54ea2eb..dcaeb106dc4 100644 --- a/libgo/go/runtime/mgcsweep.go +++ b/libgo/go/runtime/mgcsweep.go @@ -4,6 +4,24 @@ // Garbage collector: sweeping +// The sweeper consists of two different algorithms: +// +// * The object reclaimer finds and frees unmarked slots in spans. It +// can free a whole span if none of the objects are marked, but that +// isn't its goal. This can be driven either synchronously by +// mcentral.cacheSpan for mcentral spans, or asynchronously by +// sweepone from the list of all in-use spans in mheap_.sweepSpans. +// +// * The span reclaimer looks for spans that contain no marked objects +// and frees whole spans. This is a separate algorithm because +// freeing whole spans is the hardest task for the object reclaimer, +// but is critical when allocating new spans. The entry point for +// this is mheap_.reclaim and it's driven by a sequential scan of +// the page marks bitmap in the heap arenas. +// +// Both algorithms ultimately call mspan.sweep, which sweeps a single +// heap span. 
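To make the span reclaimer concrete, here is a tiny sketch of scanning per-arena bitmaps for in-use spans that contain no marked objects, loosely modeled on the heapArena.pageInUse and pageMarks bitmaps added later in this diff. The constants and layout are illustrative only.

// Sketch of finding whole spans to free by scanning page bitmaps (illustrative only).
package main

import "fmt"

const pagesPerArena = 16 // tiny for the example; the runtime uses far more

// reclaimableSpans returns the page indexes of spans that are in use
// (bit set in inUse) but contain no marked objects (bit clear in marks).
// Only the bit for a span's first page is meaningful, as in the runtime.
func reclaimableSpans(inUse, marks [pagesPerArena / 8]uint8) []int {
	var free []int
	for i := 0; i < pagesPerArena; i++ {
		byteIdx, mask := i/8, uint8(1)<<(uint(i)%8)
		if inUse[byteIdx]&mask != 0 && marks[byteIdx]&mask == 0 {
			free = append(free, i)
		}
	}
	return free
}

func main() {
	var inUse, marks [pagesPerArena / 8]uint8
	inUse[0] = 0x0b // spans start at pages 0, 1, 3
	marks[0] = 0x01 // only the span at page 0 has marked objects
	fmt.Println(reclaimableSpans(inUse, marks)) // [1 3]
}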
+ package runtime import ( @@ -54,7 +72,7 @@ func bgsweep(c chan int) { goparkunlock(&sweep.lock, waitReasonGCSweepWait, traceEvGoBlock, 1) for { - for gosweepone() != ^uintptr(0) { + for sweepone() != ^uintptr(0) { sweep.nbgsweep++ Gosched() } @@ -62,7 +80,7 @@ func bgsweep(c chan int) { Gosched() } lock(&sweep.lock) - if !gosweepdone() { + if !isSweepDone() { // This can happen if a GC runs between // gosweepone returning ^0 above // and the lock being acquired. @@ -74,9 +92,8 @@ func bgsweep(c chan int) { } } -// sweeps one span -// returns number of pages returned to heap, or ^uintptr(0) if there is nothing to sweep -//go:nowritebarrier +// sweepone sweeps some unswept heap span and returns the number of pages returned +// to the heap, or ^uintptr(0) if there was nothing to sweep. func sweepone() uintptr { _g_ := getg() sweepRatio := mheap_.sweepPagesPerByte // For debugging @@ -90,10 +107,11 @@ func sweepone() uintptr { } atomic.Xadd(&mheap_.sweepers, +1) - npages := ^uintptr(0) + // Find a span to sweep. + var s *mspan sg := mheap_.sweepgen for { - s := mheap_.sweepSpans[1-sg/2%2].pop() + s = mheap_.sweepSpans[1-sg/2%2].pop() if s == nil { atomic.Store(&mheap_.sweepdone, 1) break @@ -102,23 +120,32 @@ func sweepone() uintptr { // This can happen if direct sweeping already // swept this span, but in that case the sweep // generation should always be up-to-date. - if s.sweepgen != sg { + if !(s.sweepgen == sg || s.sweepgen == sg+3) { print("runtime: bad span s.state=", s.state, " s.sweepgen=", s.sweepgen, " sweepgen=", sg, "\n") throw("non in-use span in unswept list") } continue } - if s.sweepgen != sg-2 || !atomic.Cas(&s.sweepgen, sg-2, sg-1) { - continue + if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) { + break } + } + + // Sweep the span we found. + npages := ^uintptr(0) + if s != nil { npages = s.npages - if !s.sweep(false) { + if s.sweep(false) { + // Whole span was freed. Count it toward the + // page reclaimer credit since these pages can + // now be used for span allocation. + atomic.Xadduintptr(&mheap_.reclaimCredit, npages) + } else { // Span is still in-use, so this returned no // pages to the heap and the span needs to // move to the swept in-use list. npages = 0 } - break } // Decrement the number of active sweepers and if this is the @@ -132,17 +159,13 @@ func sweepone() uintptr { return npages } -//go:nowritebarrier -func gosweepone() uintptr { - var ret uintptr - systemstack(func() { - ret = sweepone() - }) - return ret -} - -//go:nowritebarrier -func gosweepdone() bool { +// isSweepDone reports whether all spans are swept or currently being swept. +// +// Note that this condition may transition from false to true at any +// time as the sweeper runs. It may transition from true to false if a +// GC runs; to prevent that the caller must be non-preemptible or must +// somehow block GC progress. +func isSweepDone() bool { return mheap_.sweepdone != 0 } @@ -154,20 +177,25 @@ func (s *mspan) ensureSwept() { // (if GC is triggered on another goroutine). _g_ := getg() if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 { - throw("MSpan_EnsureSwept: m is not locked") + throw("mspan.ensureSwept: m is not locked") } sg := mheap_.sweepgen - if atomic.Load(&s.sweepgen) == sg { + spangen := atomic.Load(&s.sweepgen) + if spangen == sg || spangen == sg+3 { return } - // The caller must be sure that the span is a MSpanInUse span. + // The caller must be sure that the span is a mSpanInUse span. 
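Both sweepone and ensureSwept claim a span by CAS-ing its sweepgen from sweepgen-2 (needs sweeping) to sweepgen-1 (being swept) and then publishing sweepgen once done. A minimal sketch of that claim protocol follows; the cached-span states (sg+1, sg+3) seen above are left out and the span type is simplified.

// Sketch of claiming a span to sweep via a sweepgen CAS (illustrative only).
package main

import (
	"fmt"
	"sync/atomic"
)

type span struct {
	sweepgen uint32
}

// trySweep attempts to claim and sweep s for the cycle identified by sg.
// It reports whether this caller did the sweeping. Losers either see the
// span already claimed by another sweeper or already swept.
func trySweep(s *span, sg uint32) bool {
	if !atomic.CompareAndSwapUint32(&s.sweepgen, sg-2, sg-1) {
		return false // someone else is sweeping it, or it's already swept
	}
	// ... free unmarked objects in s here ...
	atomic.StoreUint32(&s.sweepgen, sg) // publish: swept for this cycle
	return true
}

func main() {
	const sg = 4 // current heap sweep generation
	s := &span{sweepgen: sg - 2}
	fmt.Println(trySweep(s, sg)) // true: we claimed and swept it
	fmt.Println(trySweep(s, sg)) // false: already swept this cycle
}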
if atomic.Cas(&s.sweepgen, sg-2, sg-1) { s.sweep(false) return } // unfortunate condition, and we don't have efficient means to wait - for atomic.Load(&s.sweepgen) != sg { + for { + spangen := atomic.Load(&s.sweepgen) + if spangen == sg || spangen == sg+3 { + break + } osyield() } } @@ -175,7 +203,7 @@ func (s *mspan) ensureSwept() { // Sweep frees or collects finalizers for blocks not marked in the mark phase. // It clears the mark bits in preparation for the next GC round. // Returns true if the span was returned to heap. -// If preserve=true, don't return it to heap nor relink in MCentral lists; +// If preserve=true, don't return it to heap nor relink in mcentral lists; // caller takes care of it. //TODO go:nowritebarrier func (s *mspan) sweep(preserve bool) bool { @@ -183,12 +211,12 @@ func (s *mspan) sweep(preserve bool) bool { // GC must not start while we are in the middle of this function. _g_ := getg() if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 { - throw("MSpan_Sweep: m is not locked") + throw("mspan.sweep: m is not locked") } sweepgen := mheap_.sweepgen if s.state != mSpanInUse || s.sweepgen != sweepgen-1 { - print("MSpan_Sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n") - throw("MSpan_Sweep: bad span state") + print("mspan.sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n") + throw("mspan.sweep: bad span state") } if trace.enabled { @@ -345,8 +373,8 @@ func (s *mspan) sweep(preserve bool) bool { // The span must be in our exclusive ownership until we update sweepgen, // check for potential races. if s.state != mSpanInUse || s.sweepgen != sweepgen-1 { - print("MSpan_Sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n") - throw("MSpan_Sweep: bad span state after sweep") + print("mspan.sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n") + throw("mspan.sweep: bad span state after sweep") } // Serialization point. // At this point the mark bits are cleared and allocation ready @@ -360,29 +388,29 @@ func (s *mspan) sweep(preserve bool) bool { if nfreedSigned > 0 && spc.sizeclass() != 0 { res = mheap_.central[spc].mcentral.freeSpan(s, preserve, wasempty) - // MCentral_FreeSpan updates sweepgen + // mcentral.freeSpan updates sweepgen } else if freeToHeap { // Free large span to heap // NOTE(rsc,dvyukov): The original implementation of efence - // in CL 22060046 used SysFree instead of SysFault, so that + // in CL 22060046 used sysFree instead of sysFault, so that // the operating system would eventually give the memory // back to us again, so that an efence program could run // longer without running out of memory. Unfortunately, - // calling SysFree here without any kind of adjustment of the + // calling sysFree here without any kind of adjustment of the // heap data structures means that when the memory does // come back to us, we have the wrong metadata for it, either in - // the MSpan structures or in the garbage collection bitmap. - // Using SysFault here means that the program will run out of + // the mspan structures or in the garbage collection bitmap. + // Using sysFault here means that the program will run out of // memory fairly quickly in efence mode, but at least it won't // have mysterious crashes due to confused memory reuse. - // It should be possible to switch back to SysFree if we also - // implement and then call some kind of MHeap_DeleteSpan. 
+ // It should be possible to switch back to sysFree if we also + // implement and then call some kind of mheap.deleteSpan. if debug.efence > 0 { s.limit = 0 // prevent mlookup from finding this span sysFault(unsafe.Pointer(s.base()), size) } else { - mheap_.freeSpan(s, 1) + mheap_.freeSpan(s, true) } c.local_nlargefree++ c.local_largefree += size @@ -430,7 +458,7 @@ retry: newHeapLive := uintptr(atomic.Load64(&memstats.heap_live)-mheap_.sweepHeapLiveBasis) + spanBytes pagesTarget := int64(mheap_.sweepPagesPerByte*float64(newHeapLive)) - int64(callerSweepPages) for pagesTarget > int64(atomic.Load64(&mheap_.pagesSwept)-sweptBasis) { - if gosweepone() == ^uintptr(0) { + if sweepone() == ^uintptr(0) { mheap_.sweepPagesPerByte = 0 break } diff --git a/libgo/go/runtime/mgcsweepbuf.go b/libgo/go/runtime/mgcsweepbuf.go index 6c1118e3857..0491f7ccf6c 100644 --- a/libgo/go/runtime/mgcsweepbuf.go +++ b/libgo/go/runtime/mgcsweepbuf.go @@ -5,6 +5,7 @@ package runtime import ( + "internal/cpu" "runtime/internal/atomic" "runtime/internal/sys" "unsafe" @@ -83,7 +84,7 @@ retry: if newCap == 0 { newCap = gcSweepBufInitSpineCap } - newSpine := persistentalloc(newCap*sys.PtrSize, sys.CacheLineSize, &memstats.gc_sys) + newSpine := persistentalloc(newCap*sys.PtrSize, cpu.CacheLineSize, &memstats.gc_sys) if b.spineCap != 0 { // Blocks are allocated off-heap, so // no write barriers. @@ -102,7 +103,7 @@ retry: } // Allocate a new block and add it to the spine. - block = (*gcSweepBlock)(persistentalloc(unsafe.Sizeof(gcSweepBlock{}), sys.CacheLineSize, &memstats.gc_sys)) + block = (*gcSweepBlock)(persistentalloc(unsafe.Sizeof(gcSweepBlock{}), cpu.CacheLineSize, &memstats.gc_sys)) blockp := add(b.spine, sys.PtrSize*top) // Blocks are allocated off-heap, so no write barrier. atomic.StorepNoWB(blockp, unsafe.Pointer(block)) diff --git a/libgo/go/runtime/mgcwork.go b/libgo/go/runtime/mgcwork.go index 99771e2e57f..89d1e0e38e9 100644 --- a/libgo/go/runtime/mgcwork.go +++ b/libgo/go/runtime/mgcwork.go @@ -22,6 +22,13 @@ const ( workbufAlloc = 32 << 10 ) +// throwOnGCWork causes any operations that add pointers to a gcWork +// buffer to throw. +// +// TODO(austin): This is a temporary debugging measure for issue +// #27993. To be removed before release. +var throwOnGCWork bool + func init() { if workbufAlloc%pageSize != 0 || workbufAlloc%_WorkbufSize != 0 { throw("bad workbufAlloc") @@ -46,10 +53,7 @@ func init() { // // (preemption must be disabled) // gcw := &getg().m.p.ptr().gcw -// .. call gcw.put() to produce and gcw.get() to consume .. -// if gcBlackenPromptly { -// gcw.dispose() -// } +// .. call gcw.put() to produce and gcw.tryGet() to consume .. // // It's important that any use of gcWork during the mark phase prevent // the garbage collector from transitioning to mark termination since @@ -83,6 +87,23 @@ type gcWork struct { // Scan work performed on this gcWork. This is aggregated into // gcController by dispose and may also be flushed by callers. scanWork int64 + + // flushedWork indicates that a non-empty work buffer was + // flushed to the global work list since the last gcMarkDone + // termination check. Specifically, this indicates that this + // gcWork may have communicated work to another gcWork. + flushedWork bool + + // pauseGen causes put operations to spin while pauseGen == + // gcWorkPauseGen if debugCachedWork is true. + pauseGen uint32 + + // putGen is the pauseGen of the last putGen. + putGen uint32 + + // pauseStack is the stack at which this P was paused if + // debugCachedWork is true. 
+ pauseStack [16]location } // Most of the methods of gcWork are go:nowritebarrierrec because the @@ -101,10 +122,59 @@ func (w *gcWork) init() { w.wbuf2 = wbuf2 } +func (w *gcWork) checkPut(ptr uintptr, ptrs []uintptr) { + if debugCachedWork { + alreadyFailed := w.putGen == w.pauseGen + w.putGen = w.pauseGen + if m := getg().m; m.locks > 0 || m.mallocing != 0 || m.preemptoff != "" || m.p.ptr().status != _Prunning { + // If we were to spin, the runtime may + // deadlock: the condition above prevents + // preemption (see newstack), which could + // prevent gcMarkDone from finishing the + // ragged barrier and releasing the spin. + return + } + for atomic.Load(&gcWorkPauseGen) == w.pauseGen { + } + if throwOnGCWork { + printlock() + if alreadyFailed { + println("runtime: checkPut already failed at this generation") + } + println("runtime: late gcWork put") + if ptr != 0 { + gcDumpObject("ptr", ptr, ^uintptr(0)) + } + for _, ptr := range ptrs { + gcDumpObject("ptrs", ptr, ^uintptr(0)) + } + println("runtime: paused at") + for _, loc := range w.pauseStack { + if loc.pc == 0 { + break + } + if loc.function != "" { + // Obviously this doesn't + // relate to ancestor + // tracebacks, but this + // function prints what we + // want. + printAncestorTracebackFuncInfo(loc.function, loc.filename, loc.lineno, loc.pc) + } else { + println("\tunknown PC ", hex(loc.pc), "\n") + } + } + throw("throwOnGCWork") + } + } +} + // put enqueues a pointer for the garbage collector to trace. // obj must point to the beginning of a heap object or an oblet. //go:nowritebarrierrec func (w *gcWork) put(obj uintptr) { + w.checkPut(obj, nil) + flushed := false wbuf := w.wbuf1 if wbuf == nil { @@ -116,6 +186,7 @@ func (w *gcWork) put(obj uintptr) { wbuf = w.wbuf1 if wbuf.nobj == len(wbuf.obj) { putfull(wbuf) + w.flushedWork = true wbuf = getempty() w.wbuf1 = wbuf flushed = true @@ -134,10 +205,12 @@ func (w *gcWork) put(obj uintptr) { } } -// putFast does a put and returns true if it can be done quickly +// putFast does a put and reports whether it can be done quickly // otherwise it returns false and the caller needs to call put. //go:nowritebarrierrec func (w *gcWork) putFast(obj uintptr) bool { + w.checkPut(obj, nil) + wbuf := w.wbuf1 if wbuf == nil { return false @@ -159,6 +232,8 @@ func (w *gcWork) putBatch(obj []uintptr) { return } + w.checkPut(0, obj) + flushed := false wbuf := w.wbuf1 if wbuf == nil { @@ -169,6 +244,7 @@ func (w *gcWork) putBatch(obj []uintptr) { for len(obj) > 0 { for wbuf.nobj == len(wbuf.obj) { putfull(wbuf) + w.flushedWork = true w.wbuf1, w.wbuf2 = w.wbuf2, getempty() wbuf = w.wbuf1 flushed = true @@ -231,37 +307,6 @@ func (w *gcWork) tryGetFast() uintptr { return wbuf.obj[wbuf.nobj] } -// get dequeues a pointer for the garbage collector to trace, blocking -// if necessary to ensure all pointers from all queues and caches have -// been retrieved. get returns 0 if there are no pointers remaining. -//go:nowritebarrierrec -func (w *gcWork) get() uintptr { - wbuf := w.wbuf1 - if wbuf == nil { - w.init() - wbuf = w.wbuf1 - // wbuf is empty at this point. - } - if wbuf.nobj == 0 { - w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1 - wbuf = w.wbuf1 - if wbuf.nobj == 0 { - owbuf := wbuf - wbuf = getfull() - if wbuf == nil { - return 0 - } - putempty(owbuf) - w.wbuf1 = wbuf - } - } - - // TODO: This might be a good place to add prefetch code - - wbuf.nobj-- - return wbuf.obj[wbuf.nobj] -} - // dispose returns any cached pointers to the global queue. 
// The buffers are being put on the full queue so that the // write barriers will not simply reacquire them before the @@ -275,6 +320,7 @@ func (w *gcWork) dispose() { putempty(wbuf) } else { putfull(wbuf) + w.flushedWork = true } w.wbuf1 = nil @@ -283,6 +329,7 @@ func (w *gcWork) dispose() { putempty(wbuf) } else { putfull(wbuf) + w.flushedWork = true } w.wbuf2 = nil } @@ -308,10 +355,14 @@ func (w *gcWork) balance() { return } if wbuf := w.wbuf2; wbuf.nobj != 0 { + w.checkPut(0, wbuf.obj[:wbuf.nobj]) putfull(wbuf) + w.flushedWork = true w.wbuf2 = getempty() } else if wbuf := w.wbuf1; wbuf.nobj > 4 { + w.checkPut(0, wbuf.obj[:wbuf.nobj]) w.wbuf1 = handoff(wbuf) + w.flushedWork = true // handoff did putfull } else { return } @@ -321,7 +372,7 @@ func (w *gcWork) balance() { } } -// empty returns true if w has no mark work available. +// empty reports whether w has no mark work available. //go:nowritebarrierrec func (w *gcWork) empty() bool { return w.wbuf1 == nil || (w.wbuf1.nobj == 0 && w.wbuf2.nobj == 0) @@ -440,61 +491,6 @@ func trygetfull() *workbuf { return b } -// Get a full work buffer off the work.full list. -// If nothing is available wait until all the other gc helpers have -// finished and then return nil. -// getfull acts as a barrier for work.nproc helpers. As long as one -// gchelper is actively marking objects it -// may create a workbuffer that the other helpers can work on. -// The for loop either exits when a work buffer is found -// or when _all_ of the work.nproc GC helpers are in the loop -// looking for work and thus not capable of creating new work. -// This is in fact the termination condition for the STW mark -// phase. -//go:nowritebarrier -func getfull() *workbuf { - b := (*workbuf)(work.full.pop()) - if b != nil { - b.checknonempty() - return b - } - - incnwait := atomic.Xadd(&work.nwait, +1) - if incnwait > work.nproc { - println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc) - throw("work.nwait > work.nproc") - } - for i := 0; ; i++ { - if work.full != 0 { - decnwait := atomic.Xadd(&work.nwait, -1) - if decnwait == work.nproc { - println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc) - throw("work.nwait > work.nproc") - } - b = (*workbuf)(work.full.pop()) - if b != nil { - b.checknonempty() - return b - } - incnwait := atomic.Xadd(&work.nwait, +1) - if incnwait > work.nproc { - println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc) - throw("work.nwait > work.nproc") - } - } - if work.nwait == work.nproc && work.markrootNext >= work.markrootJobs { - return nil - } - if i < 10 { - procyield(20) - } else if i < 20 { - osyield() - } else { - usleep(100) - } - } -} - //go:nowritebarrier func handoff(b *workbuf) *workbuf { // Make new buffer with half of b's pointers. diff --git a/libgo/go/runtime/mheap.go b/libgo/go/runtime/mheap.go index eb98083fecc..7139b0e94e7 100644 --- a/libgo/go/runtime/mheap.go +++ b/libgo/go/runtime/mheap.go @@ -9,6 +9,7 @@ package runtime import ( + "internal/cpu" "runtime/internal/atomic" "runtime/internal/sys" "unsafe" @@ -20,7 +21,7 @@ import ( const minPhysPageSize = 4096 // Main malloc heap. -// The heap itself is the "free[]" and "large" arrays, +// The heap itself is the "free" and "scav" treaps, // but all the other global data is here too. 
// // mheap must not be heap-allocated because it contains mSpanLists, @@ -29,13 +30,11 @@ const minPhysPageSize = 4096 //go:notinheap type mheap struct { lock mutex - free [_MaxMHeapList]mSpanList // free lists of given length up to _MaxMHeapList - freelarge mTreap // free treap of length >= _MaxMHeapList - busy [_MaxMHeapList]mSpanList // busy lists of large spans of given length - busylarge mSpanList // busy lists of large spans length >= _MaxMHeapList - sweepgen uint32 // sweep generation, see comment in mspan - sweepdone uint32 // all spans are swept - sweepers uint32 // number of active sweepone calls + free mTreap // free and non-scavenged spans + scav mTreap // free and scavenged spans + sweepgen uint32 // sweep generation, see comment in mspan + sweepdone uint32 // all spans are swept + sweepers uint32 // number of active sweepone calls // allspans is a slice of all mspans ever created. Each mspan // appears exactly once. @@ -61,7 +60,7 @@ type mheap struct { // on the swept stack. sweepSpans [2]gcSweepBuf - //_ uint32 // align uint64 fields on 32-bit for atomics + _ uint32 // align uint64 fields on 32-bit for atomics // Proportional sweep // @@ -81,7 +80,7 @@ type mheap struct { // accounting for current progress. If we could only adjust // the slope, it would create a discontinuity in debt if any // progress has already been made. - pagesInUse uint64 // pages of spans in stats _MSpanInUse; R/W with mheap.lock + pagesInUse uint64 // pages of spans in stats mSpanInUse; R/W with mheap.lock pagesSwept uint64 // pages swept this cycle; updated atomically pagesSweptBasis uint64 // pagesSwept to use as the origin of the sweep ratio; updated atomically sweepHeapLiveBasis uint64 // value of heap_live to use as the origin of sweep ratio; written with lock, read without @@ -89,6 +88,25 @@ type mheap struct { // TODO(austin): pagesInUse should be a uintptr, but the 386 // compiler can't 8-byte align fields. + // Page reclaimer state + + // reclaimIndex is the page index in allArenas of next page to + // reclaim. Specifically, it refers to page (i % + // pagesPerArena) of arena allArenas[i / pagesPerArena]. + // + // If this is >= 1<<63, the page reclaimer is done scanning + // the page marks. + // + // This is accessed atomically. + reclaimIndex uint64 + // reclaimCredit is spare credit for extra pages swept. Since + // the page reclaimer works in large chunks, it may reclaim + // more than requested. Any spare pages released go to this + // credit pool. + // + // This is accessed atomically. + reclaimCredit uintptr + // Malloc stats. largealloc uint64 // bytes allocated for large objects nlargealloc uint64 // number of large object allocations @@ -133,21 +151,35 @@ type mheap struct { // (the actual arenas). This is only used on 32-bit. arena linearAlloc - //_ uint32 // ensure 64-bit alignment of central + // allArenas is the arenaIndex of every mapped arena. This can + // be used to iterate through the address space. + // + // Access is protected by mheap_.lock. However, since this is + // append-only and old backing arrays are never freed, it is + // safe to acquire mheap_.lock, copy the slice header, and + // then release mheap_.lock. + allArenas []arenaIdx + + // sweepArenas is a snapshot of allArenas taken at the + // beginning of the sweep cycle. This can be read safely by + // simply blocking GC (by disabling preemption). + sweepArenas []arenaIdx + + _ uint32 // ensure 64-bit alignment of central // central free lists for small size classes. 
- // the padding makes sure that the MCentrals are - // spaced CacheLineSize bytes apart, so that each MCentral.lock + // the padding makes sure that the mcentrals are + // spaced CacheLinePadSize bytes apart, so that each mcentral.lock // gets its own cache line. // central is indexed by spanClass. central [numSpanClasses]struct { mcentral mcentral - pad [sys.CacheLineSize - unsafe.Sizeof(mcentral{})%sys.CacheLineSize]byte + pad [cpu.CacheLinePadSize - unsafe.Sizeof(mcentral{})%cpu.CacheLinePadSize]byte } spanalloc fixalloc // allocator for span* cachealloc fixalloc // allocator for mcache* - treapalloc fixalloc // allocator for treapNodes* used by large objects + treapalloc fixalloc // allocator for treapNodes* specialfinalizeralloc fixalloc // allocator for specialfinalizer* specialprofilealloc fixalloc // allocator for specialprofile* speciallock mutex // lock for special record allocators. @@ -184,6 +216,29 @@ type heapArena struct { // must not be a safe-point between establishing that an // address is live and looking it up in the spans array. spans [pagesPerArena]*mspan + + // pageInUse is a bitmap that indicates which spans are in + // state mSpanInUse. This bitmap is indexed by page number, + // but only the bit corresponding to the first page in each + // span is used. + // + // Writes are protected by mheap_.lock. + pageInUse [pagesPerArena / 8]uint8 + + // pageMarks is a bitmap that indicates which spans have any + // marked objects on them. Like pageInUse, only the bit + // corresponding to the first page in each span is used. + // + // Writes are done atomically during marking. Reads are + // non-atomic and lock-free since they only occur during + // sweeping (and hence never race with writes). + // + // This is used to quickly find whole spans that can be freed. + // + // TODO(austin): It would be nice if this was uint64 for + // faster scanning, but we don't have 64-bit atomic bit + // operations. + pageMarks [pagesPerArena / 8]uint8 } // arenaHint is a hint for where to grow the heap arenas. See @@ -196,20 +251,21 @@ type arenaHint struct { next *arenaHint } -// An MSpan is a run of pages. +// An mspan is a run of pages. // -// When a MSpan is in the heap free list, state == MSpanFree +// When a mspan is in the heap free treap, state == mSpanFree // and heapmap(s->start) == span, heapmap(s->start+s->npages-1) == span. +// If the mspan is in the heap scav treap, then in addition to the +// above scavenged == true. scavenged == false in all other cases. // -// When a MSpan is allocated, state == MSpanInUse or MSpanManual +// When a mspan is allocated, state == mSpanInUse or mSpanManual // and heapmap(i) == span for all s->start <= i < s->start+s->npages. -// Every MSpan is in one doubly-linked list, -// either one of the MHeap's free lists or one of the -// MCentral's span lists. +// Every mspan is in one doubly-linked list, either in the mheap's +// busy list or one of the mcentral's span lists. -// An MSpan representing actual memory has state _MSpanInUse, -// _MSpanManual, or _MSpanFree. Transitions between these states are +// An mspan representing actual memory has state mSpanInUse, +// mSpanManual, or mSpanFree. 
Transitions between these states are // constrained as follows: // // * A span may transition from free to in-use or manual during any GC @@ -225,19 +281,19 @@ type arenaHint struct { type mSpanState uint8 const ( - _MSpanDead mSpanState = iota - _MSpanInUse // allocated for garbage collected heap - _MSpanManual // allocated for manual management (e.g., stack allocator) - _MSpanFree + mSpanDead mSpanState = iota + mSpanInUse // allocated for garbage collected heap + mSpanManual // allocated for manual management (e.g., stack allocator) + mSpanFree ) // mSpanStateNames are the names of the span states, indexed by // mSpanState. var mSpanStateNames = []string{ - "_MSpanDead", - "_MSpanInUse", - "_MSpanManual", - "_MSpanFree", + "mSpanDead", + "mSpanInUse", + "mSpanManual", + "mSpanFree", } // mSpanList heads a linked list of spans. @@ -257,7 +313,7 @@ type mspan struct { startAddr uintptr // address of first byte of span aka s.base() npages uintptr // number of pages in span - manualFreeList gclinkptr // list of free objects in _MSpanManual spans + manualFreeList gclinkptr // list of free objects in mSpanManual spans // freeindex is the slot index between 0 and nelems at which to begin scanning // for the next free object in this span. @@ -316,6 +372,8 @@ type mspan struct { // if sweepgen == h->sweepgen - 2, the span needs sweeping // if sweepgen == h->sweepgen - 1, the span is currently being swept // if sweepgen == h->sweepgen, the span is swept and ready to use + // if sweepgen == h->sweepgen + 1, the span was cached before sweep began and is still cached, and needs sweeping + // if sweepgen == h->sweepgen + 3, the span was swept and then cached and is still cached // h->sweepgen is incremented by 2 after every GC sweepgen uint32 @@ -323,14 +381,13 @@ type mspan struct { baseMask uint16 // if non-0, elemsize is a power of 2, & this will get object allocation base allocCount uint16 // number of allocated objects spanclass spanClass // size class and noscan (uint8) - incache bool // being used by an mcache state mSpanState // mspaninuse etc needzero uint8 // needs to be zeroed before allocation divShift uint8 // for divide by elemsize - divMagic.shift divShift2 uint8 // for divide by elemsize - divMagic.shift2 + scavenged bool // whether this span has had its pages released to the OS elemsize uintptr // computed from sizeclass or from npages unusedsince int64 // first time spotted by gc in mspanfree state - npreleased uintptr // number of pages released to the os limit uintptr // end of data in span speciallock mutex // guards specials list specials *special // linked list of special records sorted by offset. @@ -349,6 +406,45 @@ func (s *mspan) layout() (size, n, total uintptr) { return } +// physPageBounds returns the start and end of the span +// rounded in to the physical page size. +func (s *mspan) physPageBounds() (uintptr, uintptr) { + start := s.base() + end := start + s.npages<<_PageShift + if physPageSize > _PageSize { + // Round start and end in. + start = (start + physPageSize - 1) &^ (physPageSize - 1) + end &^= physPageSize - 1 + } + return start, end +} + +func (s *mspan) scavenge() uintptr { + // start and end must be rounded in, otherwise madvise + // will round them *out* and release more memory + // than we want. + start, end := s.physPageBounds() + if end <= start { + // start and end don't span a whole physical page. 
+ return 0 + } + released := end - start + memstats.heap_released += uint64(released) + s.scavenged = true + sysUnused(unsafe.Pointer(start), released) + return released +} + +// released returns the number of bytes in this span +// which were returned back to the OS. +func (s *mspan) released() uintptr { + if !s.scavenged { + return 0 + } + start, end := s.physPageBounds() + return end - start +} + // recordspan adds a newly allocated span to h.allspans. // // This only happens the first time a span is allocated from @@ -457,7 +553,7 @@ func (i arenaIdx) l2() uint { } // inheap reports whether b is a pointer into a (potentially dead) heap object. -// It returns false for pointers into _MSpanManual spans. +// It returns false for pointers into mSpanManual spans. // Non-preemptible because it is used by write barriers. //go:nowritebarrier //go:nosplit @@ -476,7 +572,7 @@ func inHeapOrStack(b uintptr) bool { return false } switch s.state { - case mSpanInUse, _MSpanManual: + case mSpanInUse, mSpanManual: return b < s.limit default: return false @@ -550,6 +646,16 @@ func spanOfHeap(p uintptr) *mspan { return s } +// pageIndexOf returns the arena, page index, and page mask for pointer p. +// The caller must ensure p is in the heap. +func pageIndexOf(p uintptr) (arena *heapArena, pageIdx uintptr, pageMask uint8) { + ai := arenaIndex(p) + arena = mheap_.arenas[ai.l1()][ai.l2()] + pageIdx = ((p / pageSize) / 8) % uintptr(len(arena.pageInUse)) + pageMask = byte(1 << ((p / pageSize) % 8)) + return +} + // Initialize the heap. func (h *mheap) init() { h.treapalloc.init(unsafe.Sizeof(treapNode{}), nil, nil, &memstats.other_sys) @@ -569,117 +675,182 @@ func (h *mheap) init() { h.spanalloc.zero = false // h->mapcache needs no init - for i := range h.free { - h.free[i].init() - h.busy[i].init() - } - h.busylarge.init() for i := range h.central { h.central[i].mcentral.init(spanClass(i)) } } -// Sweeps spans in list until reclaims at least npages into heap. -// Returns the actual number of pages reclaimed. -func (h *mheap) reclaimList(list *mSpanList, npages uintptr) uintptr { - n := uintptr(0) - sg := mheap_.sweepgen -retry: - for s := list.first; s != nil; s = s.next { - if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) { - list.remove(s) - // swept spans are at the end of the list - list.insertBack(s) // Puts it back on a busy list. s is not in the treap at this point. - unlock(&h.lock) - snpages := s.npages - if s.sweep(false) { - n += snpages +// reclaim sweeps and reclaims at least npage pages into the heap. +// It is called before allocating npage pages to keep growth in check. +// +// reclaim implements the page-reclaimer half of the sweeper. +// +// h must NOT be locked. +func (h *mheap) reclaim(npage uintptr) { + // This scans pagesPerChunk at a time. Higher values reduce + // contention on h.reclaimPos, but increase the minimum + // latency of performing a reclaim. + // + // Must be a multiple of the pageInUse bitmap element size. + // + // The time required by this can vary a lot depending on how + // many spans are actually freed. Experimentally, it can scan + // for pages at ~300 GB/ms on a 2.6GHz Core i7, but can only + // free spans at ~32 MB/ms. Using 512 pages bounds this at + // roughly 100µs. + // + // TODO(austin): Half of the time spent freeing spans is in + // locking/unlocking the heap (even with low contention). We + // could make the slow path here several times faster by + // batching heap frees. 
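The reclaim loop that follows claims fixed-size chunks of the page bitmaps with an atomic add on reclaimIndex and banks any pages freed beyond the request in reclaimCredit for later callers. A standalone sketch of that claiming protocol; the names (chunkPages, totalPages, scanChunk) and the fixed per-chunk yield are invented for illustration:

package main

import (
	"fmt"
	"sync/atomic"
)

const (
	chunkPages = 512  // pages claimed per step, like pagesPerChunk
	totalPages = 4096 // pretend address space, like len(arenas)*pagesPerArena
)

var (
	reclaimIndex  uint64  // next page index to claim
	reclaimCredit uintptr // spare pages freed beyond what a caller asked for
)

// scanChunk stands in for reclaimChunk; pretend every chunk frees 100 pages.
func scanChunk(idx uintptr) uintptr { return 100 }

// reclaim frees at least npage pages, drawing on banked credit first.
func reclaim(npage uintptr) {
	for npage > 0 {
		// Spend credit before doing new work.
		if credit := atomic.LoadUintptr(&reclaimCredit); credit > 0 {
			take := credit
			if take > npage {
				take = npage
			}
			if atomic.CompareAndSwapUintptr(&reclaimCredit, credit, credit-take) {
				npage -= take
			}
			continue
		}
		// Claim the next chunk of pages to scan.
		idx := uintptr(atomic.AddUint64(&reclaimIndex, chunkPages) - chunkPages)
		if idx >= totalPages {
			return // nothing left to scan
		}
		freed := scanChunk(idx)
		if freed <= npage {
			npage -= freed
		} else {
			// Bank the surplus for other callers.
			atomic.AddUintptr(&reclaimCredit, freed-npage)
			npage = 0
		}
	}
}

func main() {
	reclaim(250)
	fmt.Println("index:", atomic.LoadUint64(&reclaimIndex), "credit:", atomic.LoadUintptr(&reclaimCredit))
}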
+ const pagesPerChunk = 512 + + // Bail early if there's no more reclaim work. + if atomic.Load64(&h.reclaimIndex) >= 1<<63 { + return + } + + // Disable preemption so the GC can't start while we're + // sweeping, so we can read h.sweepArenas, and so + // traceGCSweepStart/Done pair on the P. + mp := acquirem() + + if trace.enabled { + traceGCSweepStart() + } + + arenas := h.sweepArenas + locked := false + for npage > 0 { + // Pull from accumulated credit first. + if credit := atomic.Loaduintptr(&h.reclaimCredit); credit > 0 { + take := credit + if take > npage { + // Take only what we need. + take = npage } - lock(&h.lock) - if n >= npages { - return n + if atomic.Casuintptr(&h.reclaimCredit, credit, credit-take) { + npage -= take } - // the span could have been moved elsewhere - goto retry - } - if s.sweepgen == sg-1 { - // the span is being swept by background sweeper, skip continue } - // already swept empty span, - // all subsequent ones must also be either swept or in process of sweeping - break - } - return n -} -// Sweeps and reclaims at least npage pages into heap. -// Called before allocating npage pages. -func (h *mheap) reclaim(npage uintptr) { - // First try to sweep busy spans with large objects of size >= npage, - // this has good chances of reclaiming the necessary space. - for i := int(npage); i < len(h.busy); i++ { - if h.reclaimList(&h.busy[i], npage) != 0 { - return // Bingo! + // Claim a chunk of work. + idx := uintptr(atomic.Xadd64(&h.reclaimIndex, pagesPerChunk) - pagesPerChunk) + if idx/pagesPerArena >= uintptr(len(arenas)) { + // Page reclaiming is done. + atomic.Store64(&h.reclaimIndex, 1<<63) + break } - } - // Then -- even larger objects. - if h.reclaimList(&h.busylarge, npage) != 0 { - return // Bingo! - } + if !locked { + // Lock the heap for reclaimChunk. + lock(&h.lock) + locked = true + } - // Now try smaller objects. - // One such object is not enough, so we need to reclaim several of them. - reclaimed := uintptr(0) - for i := 0; i < int(npage) && i < len(h.busy); i++ { - reclaimed += h.reclaimList(&h.busy[i], npage-reclaimed) - if reclaimed >= npage { - return + // Scan this chunk. + nfound := h.reclaimChunk(arenas, idx, pagesPerChunk) + if nfound <= npage { + npage -= nfound + } else { + // Put spare pages toward global credit. + atomic.Xadduintptr(&h.reclaimCredit, nfound-npage) + npage = 0 } } + if locked { + unlock(&h.lock) + } - // Now sweep everything that is not yet swept. - unlock(&h.lock) - for { - n := sweepone() - if n == ^uintptr(0) { // all spans are swept - break + if trace.enabled { + traceGCSweepDone() + } + releasem(mp) +} + +// reclaimChunk sweeps unmarked spans that start at page indexes [pageIdx, pageIdx+n). +// It returns the number of pages returned to the heap. +// +// h.lock must be held and the caller must be non-preemptible. +func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr { + // The heap lock must be held because this accesses the + // heapArena.spans arrays using potentially non-live pointers. + // In particular, if a span were freed and merged concurrently + // with this probing heapArena.spans, it would be possible to + // observe arbitrary, stale span pointers. + n0 := n + var nFreed uintptr + sg := h.sweepgen + for n > 0 { + ai := arenas[pageIdx/pagesPerArena] + ha := h.arenas[ai.l1()][ai.l2()] + + // Get a chunk of the bitmap to work on. 
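The chunk scan below works byte by byte over two parallel bitmaps: a span is a reclaim candidate when its first-page bit is set in pageInUse but clear in pageMarks. A minimal sketch of that bit arithmetic, with made-up bitmap contents:

package main

import "fmt"

func main() {
	// One byte covers 8 pages; a set bit means a span starts at that page.
	inUse := []uint8{0xb5, 0x0f}  // bits: 1011_0101, 0000_1111
	marked := []uint8{0x25, 0x0a} // bits: 0010_0101, 0000_1010

	for i := range inUse {
		// In use but with no marked objects: candidates for freeing.
		inUseUnmarked := inUse[i] &^ marked[i]
		for j := uint(0); j < 8; j++ {
			if inUseUnmarked&(1<<j) != 0 {
				fmt.Println("reclaimable span starting at page", uint(i)*8+j)
			}
		}
	}
}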
+ arenaPage := uint(pageIdx % pagesPerArena) + inUse := ha.pageInUse[arenaPage/8:] + marked := ha.pageMarks[arenaPage/8:] + if uintptr(len(inUse)) > n/8 { + inUse = inUse[:n/8] + marked = marked[:n/8] } - reclaimed += n - if reclaimed >= npage { - break + + // Scan this bitmap chunk for spans that are in-use + // but have no marked objects on them. + for i := range inUse { + inUseUnmarked := inUse[i] &^ marked[i] + if inUseUnmarked == 0 { + continue + } + + for j := uint(0); j < 8; j++ { + if inUseUnmarked&(1<<j) != 0 { + s := ha.spans[arenaPage+uint(i)*8+j] + if atomic.Load(&s.sweepgen) == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) { + npages := s.npages + unlock(&h.lock) + if s.sweep(false) { + nFreed += npages + } + lock(&h.lock) + // Reload inUse. It's possible nearby + // spans were freed when we dropped the + // lock and we don't want to get stale + // pointers from the spans array. + inUseUnmarked = inUse[i] &^ marked[i] + } + } + } } + + // Advance. + pageIdx += uintptr(len(inUse) * 8) + n -= uintptr(len(inUse) * 8) } - lock(&h.lock) + if trace.enabled { + // Account for pages scanned but not reclaimed. + traceGCSweepSpan((n0 - nFreed) * pageSize) + } + return nFreed } -// Allocate a new span of npage pages from the heap for GC'd memory -// and record its size class in the HeapMap and HeapMapCache. +// alloc_m is the internal implementation of mheap.alloc. +// +// alloc_m must run on the system stack because it locks the heap, so +// any stack growth during alloc_m would self-deadlock. +// +//go:systemstack func (h *mheap) alloc_m(npage uintptr, spanclass spanClass, large bool) *mspan { _g_ := getg() - lock(&h.lock) // To prevent excessive heap growth, before allocating n pages // we need to sweep and reclaim at least n pages. if h.sweepdone == 0 { - // TODO(austin): This tends to sweep a large number of - // spans in order to find a few completely free spans - // (for example, in the garbage benchmark, this sweeps - // ~30x the number of pages its trying to allocate). - // If GC kept a bit for whether there were any marks - // in a span, we could release these free spans - // at the end of GC and eliminate this entirely. - if trace.enabled { - traceGCSweepStart() - } h.reclaim(npage) - if trace.enabled { - traceGCSweepDone() - } } + lock(&h.lock) // transfer stats from cache to global memstats.heap_scan += uint64(_g_.m.mcache.local_scan) _g_.m.mcache.local_scan = 0 @@ -692,7 +863,7 @@ func (h *mheap) alloc_m(npage uintptr, spanclass spanClass, large bool) *mspan { // able to map interior pointer to containing span. atomic.Store(&s.sweepgen, h.sweepgen) h.sweepSpans[h.sweepgen/2%2].push(s) // Add to swept in-use list. - s.state = _MSpanInUse + s.state = mSpanInUse s.allocCount = 0 s.spanclass = spanclass if sizeclass := spanclass.sizeclass(); sizeclass == 0 { @@ -710,6 +881,10 @@ func (h *mheap) alloc_m(npage uintptr, spanclass spanClass, large bool) *mspan { s.baseMask = m.baseMask } + // Mark in-use span in arena page bitmap. + arena, pageIdx, pageMask := pageIndexOf(s.base()) + arena.pageInUse[pageIdx] |= pageMask + // update stats, sweep lists h.pagesInUse += uint64(npage) if large { @@ -717,12 +892,6 @@ func (h *mheap) alloc_m(npage uintptr, spanclass spanClass, large bool) *mspan { mheap_.largealloc += uint64(s.elemsize) mheap_.nlargealloc++ atomic.Xadd64(&memstats.heap_live, int64(npage<<_PageShift)) - // Swept spans are at the end of lists. 
- if s.npages < uintptr(len(h.busy)) { - h.busy[s.npages].insertBack(s) - } else { - h.busylarge.insertBack(s) - } } } // heap_scan and heap_live were updated. @@ -747,6 +916,12 @@ func (h *mheap) alloc_m(npage uintptr, spanclass spanClass, large bool) *mspan { return s } +// alloc allocates a new span of npage pages from the GC'd heap. +// +// Either large must be true or spanclass must indicates the span's +// size class and scannability. +// +// If needzero is true, the memory for the returned span will be zeroed. func (h *mheap) alloc(npage uintptr, spanclass spanClass, large bool, needzero bool) *mspan { // Don't do any operations that lock the heap on the G stack. // It might trigger stack growth, and the stack growth code needs @@ -784,7 +959,7 @@ func (h *mheap) allocManual(npage uintptr, stat *uint64) *mspan { lock(&h.lock) s := h.allocSpanLocked(npage, stat) if s != nil { - s.state = _MSpanManual + s.state = mSpanManual s.manualFreeList = 0 s.allocCount = 0 s.spanclass = 0 @@ -823,48 +998,61 @@ func (h *mheap) setSpans(base, npage uintptr, s *mspan) { } } +// pickFreeSpan acquires a free span from internal free list +// structures if one is available. Otherwise returns nil. +// h must be locked. +func (h *mheap) pickFreeSpan(npage uintptr) *mspan { + tf := h.free.find(npage) + ts := h.scav.find(npage) + + // Check for whichever treap gave us the smaller, non-nil result. + // Note that we want the _smaller_ free span, i.e. the free span + // closer in size to the amount we requested (npage). + var s *mspan + if tf != nil && (ts == nil || tf.spanKey.npages <= ts.spanKey.npages) { + s = tf.spanKey + h.free.removeNode(tf) + } else if ts != nil && (tf == nil || tf.spanKey.npages > ts.spanKey.npages) { + s = ts.spanKey + h.scav.removeNode(ts) + } + return s +} + // Allocates a span of the given size. h must be locked. // The returned span has been removed from the -// free list, but its state is still MSpanFree. +// free structures, but its state is still mSpanFree. func (h *mheap) allocSpanLocked(npage uintptr, stat *uint64) *mspan { - var list *mSpanList var s *mspan - // Try in fixed-size lists up to max. - for i := int(npage); i < len(h.free); i++ { - list = &h.free[i] - if !list.isEmpty() { - s = list.first - list.remove(s) - goto HaveSpan - } + s = h.pickFreeSpan(npage) + if s != nil { + goto HaveSpan } - // Best fit in list of large spans. - s = h.allocLarge(npage) // allocLarge removed s from h.freelarge for us - if s == nil { - if !h.grow(npage) { - return nil - } - s = h.allocLarge(npage) - if s == nil { - return nil - } + // On failure, grow the heap and try again. + if !h.grow(npage) { + return nil + } + s = h.pickFreeSpan(npage) + if s != nil { + goto HaveSpan } + throw("grew heap, but no adequate free span found") HaveSpan: // Mark span in use. - if s.state != _MSpanFree { - throw("MHeap_AllocLocked - MSpan not free") + if s.state != mSpanFree { + throw("candidate mspan for allocation is not free") } if s.npages < npage { - throw("MHeap_AllocLocked - bad npages") - } - if s.npreleased > 0 { - sysUsed(unsafe.Pointer(s.base()), s.npages<<_PageShift) - memstats.heap_released -= uint64(s.npreleased << _PageShift) - s.npreleased = 0 + throw("candidate mspan for allocation is too small") } + // First, subtract any memory that was released back to + // the OS from s. We will re-scavenge the trimmed section + // if necessary. + memstats.heap_released -= uint64(s.released()) + if s.npages > npage { // Trim extra and put it back in the heap. 
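pickFreeSpan above asks both the unscavenged (free) and scavenged (scav) treaps for a best-fit span and keeps whichever answer is the tighter fit. A rough sketch of that selection rule, using sorted slices in place of the treaps; all names and sizes are invented:

package main

import (
	"fmt"
	"sort"
)

// find returns the smallest size >= npages, or 0 if none (stand-in for treap.find).
func find(sizes []uintptr, npages uintptr) uintptr {
	i := sort.Search(len(sizes), func(i int) bool { return sizes[i] >= npages })
	if i == len(sizes) {
		return 0
	}
	return sizes[i]
}

func main() {
	free := []uintptr{1, 3, 8}  // unscavenged span sizes, sorted
	scav := []uintptr{2, 4, 16} // scavenged span sizes, sorted

	npages := uintptr(3)
	tf, ts := find(free, npages), find(scav, npages)

	// Prefer whichever candidate is the tighter fit.
	switch {
	case tf != 0 && (ts == 0 || tf <= ts):
		fmt.Println("use unscavenged span of", tf, "pages")
	case ts != 0:
		fmt.Println("use scavenged span of", ts, "pages")
	default:
		fmt.Println("no fit; grow the heap and retry")
	}
}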
t := (*mspan)(h.spanalloc.alloc()) @@ -874,10 +1062,25 @@ HaveSpan: h.setSpan(t.base(), t) h.setSpan(t.base()+t.npages*pageSize-1, t) t.needzero = s.needzero - s.state = _MSpanManual // prevent coalescing with s - t.state = _MSpanManual + // If s was scavenged, then t may be scavenged. + start, end := t.physPageBounds() + if s.scavenged && start < end { + memstats.heap_released += uint64(end - start) + t.scavenged = true + } + s.state = mSpanManual // prevent coalescing with s + t.state = mSpanManual h.freeSpanLocked(t, false, false, s.unusedsince) - s.state = _MSpanFree + s.state = mSpanFree + } + // "Unscavenge" s only AFTER splitting so that + // we only sysUsed whatever we actually need. + if s.scavenged { + // sysUsed all the pages that are actually available + // in the span. Note that we don't need to decrement + // heap_released since we already did so earlier. + sysUsed(unsafe.Pointer(s.base()), s.npages<<_PageShift) + s.scavenged = false } s.unusedsince = 0 @@ -893,21 +1096,6 @@ HaveSpan: return s } -// Large spans have a minimum size of 1MByte. The maximum number of large spans to support -// 1TBytes is 1 million, experimentation using random sizes indicates that the depth of -// the tree is less that 2x that of a perfectly balanced tree. For 1TByte can be referenced -// by a perfectly balanced tree with a depth of 20. Twice that is an acceptable 40. -func (h *mheap) isLargeSpan(npages uintptr) bool { - return npages >= uintptr(len(h.free)) -} - -// allocLarge allocates a span of at least npage pages from the treap of large spans. -// Returns nil if no such span currently exists. -func (h *mheap) allocLarge(npage uintptr) *mspan { - // Search treap for smallest span with >= npage pages. - return h.freelarge.remove(npage) -} - // Try to add at least npage pages of memory to the heap, // returning whether it worked. // @@ -920,20 +1108,31 @@ func (h *mheap) grow(npage uintptr) bool { return false } + // Scavenge some pages out of the free treap to make up for + // the virtual memory space we just allocated. We prefer to + // scavenge the largest spans first since the cost of scavenging + // is proportional to the number of sysUnused() calls rather than + // the number of pages released, so we make fewer of those calls + // with larger spans. + h.scavengeLargest(size) + // Create a fake "in use" span and free it, so that the // right coalescing happens. s := (*mspan)(h.spanalloc.alloc()) s.init(uintptr(v), size/pageSize) h.setSpans(s.base(), s.npages, s) atomic.Store(&s.sweepgen, h.sweepgen) - s.state = _MSpanInUse + s.state = mSpanInUse h.pagesInUse += uint64(s.npages) h.freeSpanLocked(s, false, true, 0) return true } // Free the span back into the heap. -func (h *mheap) freeSpan(s *mspan, acct int32) { +// +// large must match the value of large passed to mheap.alloc. This is +// used for accounting. +func (h *mheap) freeSpan(s *mspan, large bool) { systemstack(func() { mp := getg().m lock(&h.lock) @@ -947,7 +1146,8 @@ func (h *mheap) freeSpan(s *mspan, acct int32) { bytes := s.npages << _PageShift msanfree(base, bytes) } - if acct != 0 { + if large { + // Match accounting done in mheap.alloc. memstats.heap_objects-- } if gcBlackenEnabled != 0 { @@ -979,21 +1179,25 @@ func (h *mheap) freeManual(s *mspan, stat *uint64) { unlock(&h.lock) } -// s must be on a busy list (h.busy or h.busylarge) or unlinked. +// s must be on the busy list or unlinked. 
func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool, unusedsince int64) { switch s.state { - case _MSpanManual: + case mSpanManual: if s.allocCount != 0 { - throw("MHeap_FreeSpanLocked - invalid stack free") + throw("mheap.freeSpanLocked - invalid stack free") } - case _MSpanInUse: + case mSpanInUse: if s.allocCount != 0 || s.sweepgen != h.sweepgen { - print("MHeap_FreeSpanLocked - span ", s, " ptr ", hex(s.base()), " allocCount ", s.allocCount, " sweepgen ", s.sweepgen, "/", h.sweepgen, "\n") - throw("MHeap_FreeSpanLocked - invalid free") + print("mheap.freeSpanLocked - span ", s, " ptr ", hex(s.base()), " allocCount ", s.allocCount, " sweepgen ", s.sweepgen, "/", h.sweepgen, "\n") + throw("mheap.freeSpanLocked - invalid free") } h.pagesInUse -= uint64(s.npages) + + // Clear in-use bit in arena page bitmap. + arena, pageIdx, pageMask := pageIndexOf(s.base()) + arena.pageInUse[pageIdx] &^= pageMask default: - throw("MHeap_FreeSpanLocked - invalid span state") + throw("mheap.freeSpanLocked - invalid span state") } if acctinuse { @@ -1002,10 +1206,7 @@ func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool, unusedsince i if acctidle { memstats.heap_idle += uint64(s.npages << _PageShift) } - s.state = _MSpanFree - if s.inList() { - h.busyList(s.npages).remove(s) - } + s.state = mSpanFree // Stamp newly unused spans. The scavenger will use that // info to potentially give back some pages to the OS. @@ -1013,133 +1214,122 @@ func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool, unusedsince i if unusedsince == 0 { s.unusedsince = nanotime() } - s.npreleased = 0 + + // We scavenge s at the end after coalescing if s or anything + // it merged with is marked scavenged. + needsScavenge := false + prescavenged := s.released() // number of bytes already scavenged. // Coalesce with earlier, later spans. - if before := spanOf(s.base() - 1); before != nil && before.state == _MSpanFree { + if before := spanOf(s.base() - 1); before != nil && before.state == mSpanFree { // Now adjust s. s.startAddr = before.startAddr s.npages += before.npages - s.npreleased = before.npreleased // absorb released pages s.needzero |= before.needzero h.setSpan(before.base(), s) + // If before or s are scavenged, then we need to scavenge the final coalesced span. + needsScavenge = needsScavenge || before.scavenged || s.scavenged + prescavenged += before.released() // The size is potentially changing so the treap needs to delete adjacent nodes and // insert back as a combined node. - if h.isLargeSpan(before.npages) { - // We have a t, it is large so it has to be in the treap so we can remove it. - h.freelarge.removeSpan(before) + if before.scavenged { + h.scav.removeSpan(before) } else { - h.freeList(before.npages).remove(before) + h.free.removeSpan(before) } - before.state = _MSpanDead + before.state = mSpanDead h.spanalloc.free(unsafe.Pointer(before)) } // Now check to see if next (greater addresses) span is free and can be coalesced. 
- if after := spanOf(s.base() + s.npages*pageSize); after != nil && after.state == _MSpanFree { + if after := spanOf(s.base() + s.npages*pageSize); after != nil && after.state == mSpanFree { s.npages += after.npages - s.npreleased += after.npreleased s.needzero |= after.needzero h.setSpan(s.base()+s.npages*pageSize-1, s) - if h.isLargeSpan(after.npages) { - h.freelarge.removeSpan(after) + needsScavenge = needsScavenge || after.scavenged || s.scavenged + prescavenged += after.released() + if after.scavenged { + h.scav.removeSpan(after) } else { - h.freeList(after.npages).remove(after) + h.free.removeSpan(after) } - after.state = _MSpanDead + after.state = mSpanDead h.spanalloc.free(unsafe.Pointer(after)) } - // Insert s into appropriate list or treap. - if h.isLargeSpan(s.npages) { - h.freelarge.insert(s) + if needsScavenge { + // When coalescing spans, some physical pages which + // were not returned to the OS previously because + // they were only partially covered by the span suddenly + // become available for scavenging. We want to make sure + // those holes are filled in, and the span is properly + // scavenged. Rather than trying to detect those holes + // directly, we collect how many bytes were already + // scavenged above and subtract that from heap_released + // before re-scavenging the entire newly-coalesced span, + // which will implicitly bump up heap_released. + memstats.heap_released -= uint64(prescavenged) + s.scavenge() + } + + // Insert s into the appropriate treap. + if s.scavenged { + h.scav.insert(s) } else { - h.freeList(s.npages).insert(s) - } -} - -func (h *mheap) freeList(npages uintptr) *mSpanList { - return &h.free[npages] -} - -func (h *mheap) busyList(npages uintptr) *mSpanList { - if npages < uintptr(len(h.busy)) { - return &h.busy[npages] - } - return &h.busylarge -} - -func scavengeTreapNode(t *treapNode, now, limit uint64) uintptr { - s := t.spanKey - var sumreleased uintptr - if (now-uint64(s.unusedsince)) > limit && s.npreleased != s.npages { - start := s.base() - end := start + s.npages<<_PageShift - if physPageSize > _PageSize { - // We can only release pages in - // physPageSize blocks, so round start - // and end in. (Otherwise, madvise - // will round them *out* and release - // more memory than we want.) - start = (start + physPageSize - 1) &^ (physPageSize - 1) - end &^= physPageSize - 1 - if end <= start { - // start and end don't span a - // whole physical page. - return sumreleased - } - } - len := end - start - released := len - (s.npreleased << _PageShift) - if physPageSize > _PageSize && released == 0 { - return sumreleased - } - memstats.heap_released += uint64(released) - sumreleased += released - s.npreleased = len >> _PageShift - sysUnused(unsafe.Pointer(start), len) - } - return sumreleased -} - -func scavengelist(list *mSpanList, now, limit uint64) uintptr { - if list.isEmpty() { - return 0 - } - - var sumreleased uintptr - for s := list.first; s != nil; s = s.next { - if (now-uint64(s.unusedsince)) <= limit || s.npreleased == s.npages { - continue + h.free.insert(s) + } +} + +// scavengeLargest scavenges nbytes worth of spans in unscav +// starting from the largest span and working down. It then takes those spans +// and places them in scav. h must be locked. +func (h *mheap) scavengeLargest(nbytes uintptr) { + // Iterate over the treap backwards (from largest to smallest) scavenging spans + // until we've reached our quota of nbytes. 
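scavengeLargest walks the free treap from its largest span downward and releases pages until the byte budget is met; going largest-first keeps the number of sysUnused calls small. A simplified sketch of that walk, with a plain slice standing in for the treap and example sizes and a 4 KiB page size chosen only for illustration:

package main

import "fmt"

func main() {
	const pageSize = 4096
	spanPages := []uintptr{1, 2, 8, 32, 128} // free span sizes in pages, ascending
	nbytes := uintptr(300 * 1024)            // byte budget to release
	released := uintptr(0)

	// Walk from the largest span down, like iterating the treap backwards.
	for i := len(spanPages) - 1; i >= 0 && released < nbytes; i-- {
		r := spanPages[i] * pageSize // pretend the whole span is releasable
		released += r
		fmt.Println("scavenged span of", spanPages[i], "pages")
	}
	fmt.Println("released", released, "of", nbytes, "bytes requested")
}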
+ released := uintptr(0) + for t := h.free.end(); released < nbytes && t.valid(); { + s := t.span() + r := s.scavenge() + if r == 0 { + // Since we're going in order of largest-to-smallest span, this + // means all other spans are no bigger than s. There's a high + // chance that the other spans don't even cover a full page, + // (though they could) but iterating further just for a handful + // of pages probably isn't worth it, so just stop here. + // + // This check also preserves the invariant that spans that have + // `scavenged` set are only ever in the `scav` treap, and + // those which have it unset are only in the `free` treap. + return } - start := s.base() - end := start + s.npages<<_PageShift - if physPageSize > _PageSize { - // We can only release pages in - // physPageSize blocks, so round start - // and end in. (Otherwise, madvise - // will round them *out* and release - // more memory than we want.) - start = (start + physPageSize - 1) &^ (physPageSize - 1) - end &^= physPageSize - 1 - if end <= start { - // start and end don't span a - // whole physical page. - continue + n := t.prev() + h.free.erase(t) + t = n + h.scav.insert(s) + released += r + } +} + +// scavengeAll visits each node in the unscav treap and scavenges the +// treapNode's span. It then removes the scavenged span from +// unscav and adds it into scav before continuing. h must be locked. +func (h *mheap) scavengeAll(now, limit uint64) uintptr { + // Iterate over the treap scavenging spans if unused for at least limit time. + released := uintptr(0) + for t := h.free.start(); t.valid(); { + s := t.span() + n := t.next() + if (now - uint64(s.unusedsince)) > limit { + r := s.scavenge() + if r != 0 { + h.free.erase(t) + h.scav.insert(s) + released += r } } - len := end - start - - released := len - (s.npreleased << _PageShift) - if physPageSize > _PageSize && released == 0 { - continue - } - memstats.heap_released += uint64(released) - sumreleased += released - s.npreleased = len >> _PageShift - sysUnused(unsafe.Pointer(start), len) + t = n } - return sumreleased + return released } func (h *mheap) scavenge(k int32, now, limit uint64) { @@ -1149,17 +1339,13 @@ func (h *mheap) scavenge(k int32, now, limit uint64) { gp := getg() gp.m.mallocing++ lock(&h.lock) - var sumreleased uintptr - for i := 0; i < len(h.free); i++ { - sumreleased += scavengelist(&h.free[i], now, limit) - } - sumreleased += scavengetreap(h.freelarge.treap, now, limit) + released := h.scavengeAll(now, limit) unlock(&h.lock) gp.m.mallocing-- if debug.gctrace > 0 { - if sumreleased > 0 { - print("scvg", k, ": ", sumreleased>>20, " MB released\n") + if released > 0 { + print("scvg", k, ": ", released>>20, " MB released\n") } print("scvg", k, ": inuse: ", memstats.heap_inuse>>20, ", idle: ", memstats.heap_idle>>20, ", sys: ", memstats.heap_sys>>20, ", released: ", memstats.heap_released>>20, ", consumed: ", (memstats.heap_sys-memstats.heap_released)>>20, " (MB)\n") } @@ -1181,11 +1367,10 @@ func (span *mspan) init(base uintptr, npages uintptr) { span.npages = npages span.allocCount = 0 span.spanclass = 0 - span.incache = false span.elemsize = 0 - span.state = _MSpanDead + span.state = mSpanDead span.unusedsince = 0 - span.npreleased = 0 + span.scavenged = false span.speciallock.key = 0 span.specials = nil span.needzero = 0 @@ -1206,9 +1391,9 @@ func (list *mSpanList) init() { func (list *mSpanList) remove(span *mspan) { if span.list != list { - print("runtime: failed MSpanList_Remove span.npages=", span.npages, + print("runtime: failed 
mSpanList.remove span.npages=", span.npages, " span=", span, " prev=", span.prev, " span.list=", span.list, " list=", list, "\n") - throw("MSpanList_Remove") + throw("mSpanList.remove") } if list.first == span { list.first = span.next @@ -1231,8 +1416,8 @@ func (list *mSpanList) isEmpty() bool { func (list *mSpanList) insert(span *mspan) { if span.next != nil || span.prev != nil || span.list != nil { - println("runtime: failed MSpanList_Insert", span, span.next, span.prev, span.list) - throw("MSpanList_Insert") + println("runtime: failed mSpanList.insert", span, span.next, span.prev, span.list) + throw("mSpanList.insert") } span.next = list.first if list.first != nil { @@ -1249,8 +1434,8 @@ func (list *mSpanList) insert(span *mspan) { func (list *mSpanList) insertBack(span *mspan) { if span.next != nil || span.prev != nil || span.list != nil { - println("runtime: failed MSpanList_InsertBack", span, span.next, span.prev, span.list) - throw("MSpanList_InsertBack") + println("runtime: failed mSpanList.insertBack", span, span.next, span.prev, span.list) + throw("mSpanList.insertBack") } span.prev = list.last if list.last != nil { @@ -1432,9 +1617,6 @@ func addfinalizer(p unsafe.Pointer, f *funcval, ft *functype, ot *ptrtype) bool // Mark the finalizer itself, since the // special isn't part of the GC'd heap. scanblock(uintptr(unsafe.Pointer(&s.fn)), sys.PtrSize, &oneptrmask[0], gcw) - if gcBlackenPromptly { - gcw.dispose() - } releasem(mp) } return true @@ -1479,7 +1661,7 @@ func setprofilebucket(p unsafe.Pointer, b *bucket) { } // Do whatever cleanup needs to be done to deallocate s. It has -// already been unlinked from the MSpan specials list. +// already been unlinked from the mspan specials list. func freespecial(s *special, p unsafe.Pointer, size uintptr) { switch s.kind { case _KindSpecialFinalizer: diff --git a/libgo/go/runtime/mkfastlog2table.go b/libgo/go/runtime/mkfastlog2table.go index 587ebf476d3..305c84a7c11 100644 --- a/libgo/go/runtime/mkfastlog2table.go +++ b/libgo/go/runtime/mkfastlog2table.go @@ -20,7 +20,7 @@ import ( func main() { var buf bytes.Buffer - fmt.Fprintln(&buf, "// AUTO-GENERATED by mkfastlog2table.go") + fmt.Fprintln(&buf, "// Code generated by mkfastlog2table.go; DO NOT EDIT.") fmt.Fprintln(&buf, "// Run go generate from src/runtime to update.") fmt.Fprintln(&buf, "// See mkfastlog2table.go for comments.") fmt.Fprintln(&buf) diff --git a/libgo/go/runtime/mprof.go b/libgo/go/runtime/mprof.go index 2bbf37a63e4..ab975696fe6 100644 --- a/libgo/go/runtime/mprof.go +++ b/libgo/go/runtime/mprof.go @@ -734,7 +734,7 @@ func GoroutineProfile(p []StackRecord) (n int, ok bool) { isOK := func(gp1 *g) bool { // Checking isSystemGoroutine here makes GoroutineProfile // consistent with both NumGoroutine and Stack. 
- return gp1 != gp && readgstatus(gp1) != _Gdead && !isSystemGoroutine(gp1) + return gp1 != gp && readgstatus(gp1) != _Gdead && !isSystemGoroutine(gp1, false) } stopTheWorld("profile") diff --git a/libgo/go/runtime/mstats.go b/libgo/go/runtime/mstats.go index 8aac8502e0b..cd9da02afda 100644 --- a/libgo/go/runtime/mstats.go +++ b/libgo/go/runtime/mstats.go @@ -38,24 +38,10 @@ type mstats struct { heap_alloc uint64 // bytes allocated and not yet freed (same as alloc above) heap_sys uint64 // virtual address space obtained from system for GC'd heap heap_idle uint64 // bytes in idle spans - heap_inuse uint64 // bytes in _MSpanInUse spans + heap_inuse uint64 // bytes in mSpanInUse spans heap_released uint64 // bytes released to the os heap_objects uint64 // total number of allocated objects - // TODO(austin): heap_released is both useless and inaccurate - // in its current form. It's useless because, from the user's - // and OS's perspectives, there's no difference between a page - // that has not yet been faulted in and a page that has been - // released back to the OS. We could fix this by considering - // newly mapped spans to be "released". It's inaccurate - // because when we split a large span for allocation, we - // "unrelease" all pages in the large span and not just the - // ones we split off for use. This is trickier to fix because - // we currently don't know which pages of a span we've - // released. We could fix it by separating "free" and - // "released" spans, but then we have to allocate from runs of - // free and released spans. - // Statistics about allocation of low-level fixed-size structures. // Protected by FixAlloc locks. stacks_inuse uint64 // bytes in manually-managed stack spans @@ -543,7 +529,7 @@ func updatememstats() { memstats.by_size[i].nfree = 0 } - // Flush MCache's to MCentral. + // Flush mcache's to mcentral. systemstack(flushallmcaches) // Aggregate local stats. diff --git a/libgo/go/runtime/mwbbuf.go b/libgo/go/runtime/mwbbuf.go index 39d13709069..4c875ff2d15 100644 --- a/libgo/go/runtime/mwbbuf.go +++ b/libgo/go/runtime/mwbbuf.go @@ -23,6 +23,7 @@ package runtime import ( + "runtime/internal/atomic" "runtime/internal/sys" "unsafe" ) @@ -56,6 +57,12 @@ type wbBuf struct { // on. This must be a multiple of wbBufEntryPointers because // the write barrier only checks for overflow once per entry. buf [wbBufEntryPointers * wbBufEntries]uintptr + + // debugGen causes the write barrier buffer to flush after + // every write barrier if equal to gcWorkPauseGen. This is for + // debugging #27993. This is only set if debugCachedWork is + // set. + debugGen uint32 } const ( @@ -79,7 +86,7 @@ const ( func (b *wbBuf) reset() { start := uintptr(unsafe.Pointer(&b.buf[0])) b.next = start - if gcBlackenPromptly || writeBarrier.cgo { + if writeBarrier.cgo || (debugCachedWork && (throwOnGCWork || b.debugGen == atomic.Load(&gcWorkPauseGen))) { // Effectively disable the buffer by forcing a flush // on every barrier. b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers])) @@ -107,6 +114,11 @@ func (b *wbBuf) discard() { b.next = uintptr(unsafe.Pointer(&b.buf[0])) } +// empty reports whether b contains no pointers. +func (b *wbBuf) empty() bool { + return b.next == uintptr(unsafe.Pointer(&b.buf[0])) +} + // putFast adds old and new to the write barrier buffer and returns // false if a flush is necessary. 
Callers should use this as: // @@ -192,10 +204,32 @@ func wbBufFlush(dst *uintptr, src uintptr) { // Switch to the system stack so we don't have to worry about // the untyped stack slots or safe points. systemstack(func() { - wbBufFlush1(getg().m.p.ptr()) + if debugCachedWork { + // For debugging, include the old value of the + // slot and some other data in the traceback. + wbBuf := &getg().m.p.ptr().wbBuf + var old uintptr + if dst != nil { + // dst may be nil in direct calls to wbBufFlush. + old = *dst + } + wbBufFlush1Debug(old, wbBuf.buf[0], wbBuf.buf[1], &wbBuf.buf[0], wbBuf.next) + } else { + wbBufFlush1(getg().m.p.ptr()) + } }) } +// wbBufFlush1Debug is a temporary function for debugging issue +// #27993. It exists solely to add some context to the traceback. +// +//go:nowritebarrierrec +//go:systemstack +//go:noinline +func wbBufFlush1Debug(old, buf1, buf2 uintptr, start *uintptr, next uintptr) { + wbBufFlush1(getg().m.p.ptr()) +} + // wbBufFlush1 flushes p's write barrier buffer to the GC work queue. // // This must not have write barriers because it is part of the write @@ -212,14 +246,16 @@ func wbBufFlush1(_p_ *p) { n := (_p_.wbBuf.next - start) / unsafe.Sizeof(_p_.wbBuf.buf[0]) ptrs := _p_.wbBuf.buf[:n] - // Reset the buffer. - _p_.wbBuf.reset() + // Poison the buffer to make extra sure nothing is enqueued + // while we're processing the buffer. + _p_.wbBuf.next = 0 if useCheckmark { // Slow path for checkmark mode. for _, ptr := range ptrs { shade(ptr) } + _p_.wbBuf.reset() return } @@ -270,9 +306,6 @@ func wbBufFlush1(_p_ *p) { // Enqueue the greyed objects. gcw.putBatch(ptrs[:pos]) - if gcphase == _GCmarktermination || gcBlackenPromptly { - // Ps aren't allowed to cache work during mark - // termination. - gcw.dispose() - } + + _p_.wbBuf.reset() } diff --git a/libgo/go/runtime/netpoll.go b/libgo/go/runtime/netpoll.go index 6016b7d64f5..fa7f1fad71c 100644 --- a/libgo/go/runtime/netpoll.go +++ b/libgo/go/runtime/netpoll.go @@ -59,14 +59,15 @@ type pollDesc struct { lock mutex // protects the following fields fd uintptr closing bool - seq uintptr // protects from stale timers and ready notifications + user uint32 // user settable cookie + rseq uintptr // protects from stale read timers rg uintptr // pdReady, pdWait, G waiting for read or nil rt timer // read deadline timer (set if rt.f != nil) rd int64 // read deadline + wseq uintptr // protects from stale write timers wg uintptr // pdReady, pdWait, G waiting for write or nil wt timer // write deadline timer wd int64 // write deadline - user uint32 // user settable cookie } type pollCache struct { @@ -95,12 +96,19 @@ func netpollinited() bool { return atomic.Load(&netpollInited) != 0 } -//go:linkname poll_runtime_pollServerDescriptor internal..z2fpoll.runtime_pollServerDescriptor +//go:linkname poll_runtime_isPollServerDescriptor internal..z2fpoll.runtime_isPollServerDescriptor -// poll_runtime_pollServerDescriptor returns the descriptor being used, -// or ^uintptr(0) if the system does not use a poll descriptor. -func poll_runtime_pollServerDescriptor() uintptr { - return netpolldescriptor() +// poll_runtime_isPollServerDescriptor reports whether fd is a +// descriptor being used by netpoll. +func poll_runtime_isPollServerDescriptor(fd uintptr) bool { + fds := netpolldescriptor() + if GOOS != "aix" { + return fd == fds + } else { + // AIX have a pipe in its netpoll implementation. + // Therefore, two fd are returned by netpolldescriptor using a mask. 
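On AIX, netpolldescriptor packs both pipe descriptors into one uintptr (read end in the high 16 bits, write end in the low 16 bits), and poll_runtime_isPollServerDescriptor tests a descriptor against both halves with the same mask. A tiny sketch of that encoding, with hypothetical fd values:

package main

import "fmt"

// pack mirrors what netpolldescriptor does on AIX: read end of the
// wakeup pipe in the high 16 bits, write end in the low 16 bits.
func pack(rdwake, wrwake int32) uintptr {
	return uintptr(rdwake<<16 | wrwake)
}

// isPollFD mirrors the unpacking check: fd matches either half.
func isPollFD(fd, fds uintptr) bool {
	return fd == fds&0xFFFF || fd == (fds>>16)&0xFFFF
}

func main() {
	fds := pack(5, 6) // hypothetical pipe fds
	fmt.Println(isPollFD(5, fds), isPollFD(6, fds), isPollFD(7, fds))
}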
+ return fd == fds&0xFFFF || fd == (fds>>16)&0xFFFF + } } //go:linkname poll_runtime_pollOpen internal..z2fpoll.runtime_pollOpen @@ -115,9 +123,10 @@ func poll_runtime_pollOpen(fd uintptr) (*pollDesc, int) { } pd.fd = fd pd.closing = false - pd.seq++ + pd.rseq++ pd.rg = 0 pd.rd = 0 + pd.wseq++ pd.wg = 0 pd.wd = 0 unlock(&pd.lock) @@ -200,19 +209,15 @@ func poll_runtime_pollSetDeadline(pd *pollDesc, d int64, mode int) { unlock(&pd.lock) return } - pd.seq++ // invalidate current timers - // Reset current timers. - if pd.rt.f != nil { - deltimer(&pd.rt) - pd.rt.f = nil - } - if pd.wt.f != nil { - deltimer(&pd.wt) - pd.wt.f = nil - } - // Setup new timers. - if d != 0 && d <= nanotime() { - d = -1 + rd0, wd0 := pd.rd, pd.wd + combo0 := rd0 > 0 && rd0 == wd0 + if d > 0 { + d += nanotime() + if d <= 0 { + // If the user has a deadline in the future, but the delay calculation + // overflows, then set the deadline to the maximum possible value. + d = 1<<63 - 1 + } } if mode == 'r' || mode == 'r'+'w' { pd.rd = d @@ -220,39 +225,58 @@ func poll_runtime_pollSetDeadline(pd *pollDesc, d int64, mode int) { if mode == 'w' || mode == 'r'+'w' { pd.wd = d } - if pd.rd > 0 && pd.rd == pd.wd { - pd.rt.f = netpollDeadline - pd.rt.when = pd.rd - // Copy current seq into the timer arg. - // Timer func will check the seq against current descriptor seq, - // if they differ the descriptor was reused or timers were reset. - pd.rt.arg = pd - pd.rt.seq = pd.seq - addtimer(&pd.rt) - } else { + combo := pd.rd > 0 && pd.rd == pd.wd + rtf := netpollReadDeadline + if combo { + rtf = netpollDeadline + } + if pd.rt.f == nil { if pd.rd > 0 { - pd.rt.f = netpollReadDeadline + pd.rt.f = rtf pd.rt.when = pd.rd + // Copy current seq into the timer arg. + // Timer func will check the seq against current descriptor seq, + // if they differ the descriptor was reused or timers were reset. pd.rt.arg = pd - pd.rt.seq = pd.seq + pd.rt.seq = pd.rseq addtimer(&pd.rt) } - if pd.wd > 0 { + } else if pd.rd != rd0 || combo != combo0 { + pd.rseq++ // invalidate current timers + if pd.rd > 0 { + modtimer(&pd.rt, pd.rd, 0, rtf, pd, pd.rseq) + } else { + deltimer(&pd.rt) + pd.rt.f = nil + } + } + if pd.wt.f == nil { + if pd.wd > 0 && !combo { pd.wt.f = netpollWriteDeadline pd.wt.when = pd.wd pd.wt.arg = pd - pd.wt.seq = pd.seq + pd.wt.seq = pd.wseq addtimer(&pd.wt) } + } else if pd.wd != wd0 || combo != combo0 { + pd.wseq++ // invalidate current timers + if pd.wd > 0 && !combo { + modtimer(&pd.wt, pd.wd, 0, netpollWriteDeadline, pd, pd.wseq) + } else { + deltimer(&pd.wt) + pd.wt.f = nil + } } // If we set the new deadline in the past, unblock currently pending IO if any. 
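Earlier in this hunk the relative deadline is converted to an absolute time, and an addition that overflows int64 is clamped to the maximum future time rather than accidentally landing in the past. A sketch of just that clamp; nanotime is stubbed out with time.Now for the example:

package main

import (
	"fmt"
	"math"
	"time"
)

// absDeadline converts a relative deadline d (in ns) to an absolute one,
// clamping on overflow so a huge user deadline never looks "in the past".
func absDeadline(d int64) int64 {
	if d > 0 {
		d += time.Now().UnixNano() // stand-in for runtime nanotime()
		if d <= 0 {
			d = math.MaxInt64 // 1<<63 - 1
		}
	}
	return d
}

func main() {
	fmt.Println(absDeadline(5e9) > 0)           // ordinary 5s deadline
	fmt.Println(absDeadline(math.MaxInt64) > 0) // overflowing deadline is clamped
}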
var rg, wg *g - atomicstorep(unsafe.Pointer(&wg), nil) // full memory barrier between stores to rd/wd and load of rg/wg in netpollunblock - if pd.rd < 0 { - rg = netpollunblock(pd, 'r', false) - } - if pd.wd < 0 { - wg = netpollunblock(pd, 'w', false) + if pd.rd < 0 || pd.wd < 0 { + atomic.StorepNoWB(noescape(unsafe.Pointer(&wg)), nil) // full memory barrier between stores to rd/wd and load of rg/wg in netpollunblock + if pd.rd < 0 { + rg = netpollunblock(pd, 'r', false) + } + if pd.wd < 0 { + wg = netpollunblock(pd, 'w', false) + } } unlock(&pd.lock) if rg != nil { @@ -270,9 +294,10 @@ func poll_runtime_pollUnblock(pd *pollDesc) { throw("runtime: unblock on closing polldesc") } pd.closing = true - pd.seq++ + pd.rseq++ + pd.wseq++ var rg, wg *g - atomicstorep(unsafe.Pointer(&rg), nil) // full memory barrier between store to closing and read of rg/wg in netpollunblock + atomic.StorepNoWB(noescape(unsafe.Pointer(&rg)), nil) // full memory barrier between store to closing and read of rg/wg in netpollunblock rg = netpollunblock(pd, 'r', false) wg = netpollunblock(pd, 'w', false) if pd.rt.f != nil { @@ -292,24 +317,22 @@ func poll_runtime_pollUnblock(pd *pollDesc) { } } -// make pd ready, newly runnable goroutines (if any) are returned in rg/wg +// make pd ready, newly runnable goroutines (if any) are added to toRun. // May run during STW, so write barriers are not allowed. //go:nowritebarrier -func netpollready(gpp *guintptr, pd *pollDesc, mode int32) { - var rg, wg guintptr +func netpollready(toRun *gList, pd *pollDesc, mode int32) { + var rg, wg *g if mode == 'r' || mode == 'r'+'w' { - rg.set(netpollunblock(pd, 'r', true)) + rg = netpollunblock(pd, 'r', true) } if mode == 'w' || mode == 'r'+'w' { - wg.set(netpollunblock(pd, 'w', true)) + wg = netpollunblock(pd, 'w', true) } - if rg != 0 { - rg.ptr().schedlink = *gpp - *gpp = rg + if rg != nil { + toRun.push(rg) } - if wg != 0 { - wg.ptr().schedlink = *gpp - *gpp = wg + if wg != nil { + toRun.push(wg) } } @@ -409,7 +432,11 @@ func netpolldeadlineimpl(pd *pollDesc, seq uintptr, read, write bool) { lock(&pd.lock) // Seq arg is seq when the timer was set. // If it's stale, ignore the timer event. - if seq != pd.seq { + currentSeq := pd.rseq + if !read { + currentSeq = pd.wseq + } + if seq != currentSeq { // The descriptor was reused or timers were reset. unlock(&pd.lock) return @@ -420,7 +447,7 @@ func netpolldeadlineimpl(pd *pollDesc, seq uintptr, read, write bool) { throw("runtime: inconsistent read deadline") } pd.rd = -1 - atomicstorep(unsafe.Pointer(&pd.rt.f), nil) // full memory barrier between store to rd and load of rg in netpollunblock + atomic.StorepNoWB(unsafe.Pointer(&pd.rt.f), nil) // full memory barrier between store to rd and load of rg in netpollunblock rg = netpollunblock(pd, 'r', false) } var wg *g @@ -429,7 +456,7 @@ func netpolldeadlineimpl(pd *pollDesc, seq uintptr, read, write bool) { throw("runtime: inconsistent write deadline") } pd.wd = -1 - atomicstorep(unsafe.Pointer(&pd.wt.f), nil) // full memory barrier between store to wd and load of wg in netpollunblock + atomic.StorepNoWB(unsafe.Pointer(&pd.wt.f), nil) // full memory barrier between store to wd and load of wg in netpollunblock wg = netpollunblock(pd, 'w', false) } unlock(&pd.lock) diff --git a/libgo/go/runtime/netpoll_aix.go b/libgo/go/runtime/netpoll_aix.go index cbeb8c9ed18..86c3e960f42 100644 --- a/libgo/go/runtime/netpoll_aix.go +++ b/libgo/go/runtime/netpoll_aix.go @@ -1,4 +1,4 @@ -// Copyright 2017 The Go Authors. All rights reserved. 
+// Copyright 2018 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. @@ -8,17 +8,7 @@ import "unsafe" // This is based on the former libgo/runtime/netpoll_select.c implementation // except that it uses poll instead of select and is written in Go. - -type pollfd struct { - fd int32 - events int16 - revents int16 -} - -const _POLLIN = 0x0001 -const _POLLOUT = 0x0002 -const _POLLHUP = 0x2000 -const _POLLERR = 0x4000 +// It's also based on Solaris implementation for the arming mechanisms //go:noescape //extern poll @@ -31,62 +21,95 @@ func libc_pipe(fd *int32) int32 //extern __go_fcntl_uintptr func fcntlUintptr(fd, cmd, arg uintptr) (uintptr, uintptr) -func fcntl(fd, cmd int32, arg uintptr) uintptr { +func fcntl(fd, cmd int32, arg uintptr) int32 { r, _ := fcntlUintptr(uintptr(fd), uintptr(cmd), arg) - return r + return int32(r) } +// pollfd represents the poll structure for AIX operating system. +type pollfd struct { + fd int32 + events int16 + revents int16 +} + +const _POLLIN = 0x0001 +const _POLLOUT = 0x0002 +const _POLLHUP = 0x2000 +const _POLLERR = 0x4000 +const _O_NONBLOCK = 0x4 + var ( - pfds []pollfd - pds []*pollDesc - mtxpoll mutex - mtxset mutex - rdwake int32 - wrwake int32 - needsUpdate bool + pfds []pollfd + pds []*pollDesc + mtxpoll mutex + mtxset mutex + rdwake int32 + wrwake int32 + pendingUpdates int32 ) +const pollVerbose = false + func netpollinit() { var p [2]int32 // Create the pipe we use to wakeup poll. if err := libc_pipe(&p[0]); err < 0 { - throw("runtime: netpollinit failed to create pipe") + throw("netpollinit: failed to create pipe") } rdwake = p[0] wrwake = p[1] - fl := fcntl(rdwake, _F_GETFL, 0) + fl := uintptr(fcntl(rdwake, _F_GETFL, 0)) fcntl(rdwake, _F_SETFL, fl|_O_NONBLOCK) fcntl(rdwake, _F_SETFD, _FD_CLOEXEC) - fl = fcntl(wrwake, _F_GETFL, 0) + fl = uintptr(fcntl(wrwake, _F_GETFL, 0)) + fcntl(wrwake, _F_SETFL, fl|_O_NONBLOCK) fcntl(wrwake, _F_SETFD, _FD_CLOEXEC) // Pre-allocate array of pollfd structures for poll. + if pollVerbose { + println("*** allocating") + } pfds = make([]pollfd, 1, 128) + if pollVerbose { + println("*** allocating done", &pfds[0]) + } + // Poll the read side of the pipe. pfds[0].fd = rdwake pfds[0].events = _POLLIN - // Allocate index to pd array pds = make([]*pollDesc, 1, 128) pds[0] = nil } func netpolldescriptor() uintptr { - return ^uintptr(0) + // Both fd must be returned + if rdwake > 0xFFFF || wrwake > 0xFFFF { + throw("netpolldescriptor: invalid fd number") + } + return uintptr(rdwake<<16 | wrwake) } +// netpollwakeup writes on wrwake to wakeup poll before any changes. 
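netpollwakeup below is the classic self-pipe trick: the poller always watches the read end of a private pipe, so writing a single byte to the write end forces a blocked poll call to return and pick up pending changes. A user-space sketch of the same idea, using os.Pipe and a goroutine rather than the runtime's raw file descriptors:

package main

import (
	"fmt"
	"os"
)

func main() {
	r, w, err := os.Pipe()
	if err != nil {
		panic(err)
	}

	done := make(chan struct{})
	go func() {
		// Stand-in for the poller blocked in poll() on the pipe's read end.
		buf := make([]byte, 1)
		r.Read(buf)
		fmt.Println("poller woke up")
		close(done)
	}()

	// Stand-in for netpollwakeup: one byte is enough to wake the poller.
	w.Write([]byte{0})
	<-done
}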
func netpollwakeup() { - if !needsUpdate { - needsUpdate = true + if pendingUpdates == 0 { + pendingUpdates = 1 + if pollVerbose { + println("*** writing 1 byte") + } b := [1]byte{0} write(uintptr(wrwake), unsafe.Pointer(&b[0]), 1) } } func netpollopen(fd uintptr, pd *pollDesc) int32 { + if pollVerbose { + println("*** netpollopen", fd) + } lock(&mtxpoll) netpollwakeup() @@ -94,15 +117,16 @@ func netpollopen(fd uintptr, pd *pollDesc) int32 { unlock(&mtxpoll) pd.user = uint32(len(pfds)) - var pfd pollfd - pfd.fd = int32(fd) - pfds = append(pfds, pfd) + pfds = append(pfds, pollfd{fd: int32(fd)}) pds = append(pds, pd) unlock(&mtxset) return 0 } func netpollclose(fd uintptr) int32 { + if pollVerbose { + println("*** netpollclose", fd) + } lock(&mtxpoll) netpollwakeup() @@ -125,6 +149,9 @@ func netpollclose(fd uintptr) int32 { } func netpollarm(pd *pollDesc, mode int) { + if pollVerbose { + println("*** netpollarm", pd.fd, mode) + } lock(&mtxpoll) netpollwakeup() @@ -141,23 +168,36 @@ func netpollarm(pd *pollDesc, mode int) { } //go:nowritebarrierrec -func netpoll(block bool) *g { +func netpoll(block bool) gList { timeout := ^uintptr(0) if !block { timeout = 0 - return nil + return gList{} + } + if pollVerbose { + println("*** netpoll", block) } retry: lock(&mtxpoll) lock(&mtxset) - needsUpdate = false + pendingUpdates = 0 unlock(&mtxpoll) + if pollVerbose { + println("*** netpoll before poll") + } n := libc_poll(&pfds[0], uintptr(len(pfds)), timeout) + if pollVerbose { + println("*** netpoll after poll", n) + } if n < 0 { e := errno() if e != _EINTR { - throw("runtime: poll failed") + println("errno=", e, " len(pfds)=", len(pfds)) + throw("poll failed") + } + if pollVerbose { + println("*** poll failed") } unlock(&mtxset) goto retry @@ -166,13 +206,16 @@ retry: if n != 0 && pfds[0].revents&(_POLLIN|_POLLHUP|_POLLERR) != 0 { var b [1]byte for read(rdwake, unsafe.Pointer(&b[0]), 1) == 1 { + if pollVerbose { + println("*** read 1 byte from pipe") + } } // Do not look at the other fds in this case as the mode may have changed // XXX only additions of flags are made, so maybe it is ok unlock(&mtxset) goto retry } - var gp guintptr + var toRun gList for i := 0; i < len(pfds) && n > 0; i++ { pfd := &pfds[i] @@ -186,13 +229,19 @@ retry: pfd.events &= ^_POLLOUT } if mode != 0 { - netpollready(&gp, pds[i], mode) + if pollVerbose { + println("*** netpollready i=", i, "revents=", pfd.revents, "events=", pfd.events, "pd=", pds[i]) + } + netpollready(&toRun, pds[i], mode) n-- } } unlock(&mtxset) - if block && gp == 0 { + if block && toRun.empty() { goto retry } - return gp.ptr() + if pollVerbose { + println("*** netpoll returning end") + } + return toRun } diff --git a/libgo/go/runtime/netpoll_epoll.go b/libgo/go/runtime/netpoll_epoll.go index ced399d781e..2004fbc4da8 100644 --- a/libgo/go/runtime/netpoll_epoll.go +++ b/libgo/go/runtime/netpoll_epoll.go @@ -75,9 +75,9 @@ func netpollarm(pd *pollDesc, mode int) { // polls for ready network connections // returns list of goroutines that become runnable -func netpoll(block bool) *g { +func netpoll(block bool) gList { if epfd == -1 { - return nil + return gList{} } waitms := int32(-1) if !block { @@ -94,7 +94,7 @@ retry: } goto retry } - var gp guintptr + var toRun gList for i := int32(0); i < n; i++ { ev := &events[i] if ev.events == 0 { @@ -110,11 +110,11 @@ retry: if mode != 0 { pd := *(**pollDesc)(unsafe.Pointer(&ev.data)) - netpollready(&gp, pd, mode) + netpollready(&toRun, pd, mode) } } - if block && gp == 0 { + if block && toRun.empty() { goto retry } - 
return gp.ptr() + return toRun } diff --git a/libgo/go/runtime/netpoll_fake.go b/libgo/go/runtime/netpoll_fake.go index aab18dc8468..5b1a63a8787 100644 --- a/libgo/go/runtime/netpoll_fake.go +++ b/libgo/go/runtime/netpoll_fake.go @@ -27,6 +27,6 @@ func netpollclose(fd uintptr) int32 { func netpollarm(pd *pollDesc, mode int) { } -func netpoll(block bool) *g { - return nil +func netpoll(block bool) gList { + return gList{} } diff --git a/libgo/go/runtime/netpoll_kqueue.go b/libgo/go/runtime/netpoll_kqueue.go index 1f68effbf9d..3d6265092e6 100644 --- a/libgo/go/runtime/netpoll_kqueue.go +++ b/libgo/go/runtime/netpoll_kqueue.go @@ -73,9 +73,9 @@ func netpollarm(pd *pollDesc, mode int) { // Polls for ready network connections. // Returns list of goroutines that become runnable. -func netpoll(block bool) *g { +func netpoll(block bool) gList { if kq == -1 { - return nil + return gList{} } var tp *timespec var ts timespec @@ -93,7 +93,7 @@ retry: } goto retry } - var gp guintptr + var toRun gList for i := 0; i < int(n); i++ { ev := &events[i] var mode int32 @@ -117,11 +117,11 @@ retry: mode += 'w' } if mode != 0 { - netpollready(&gp, (*pollDesc)(unsafe.Pointer(ev.udata)), mode) + netpollready(&toRun, (*pollDesc)(unsafe.Pointer(ev.udata)), mode) } } - if block && gp == 0 { + if block && toRun.empty() { goto retry } - return gp.ptr() + return toRun } diff --git a/libgo/go/runtime/netpoll_solaris.go b/libgo/go/runtime/netpoll_solaris.go index a960e93b8bf..b8baffdfc12 100644 --- a/libgo/go/runtime/netpoll_solaris.go +++ b/libgo/go/runtime/netpoll_solaris.go @@ -166,9 +166,9 @@ func netpollarm(pd *pollDesc, mode int) { // polls for ready network connections // returns list of goroutines that become runnable -func netpoll(block bool) *g { +func netpoll(block bool) gList { if portfd == -1 { - return nil + return gList{} } var wait *timespec @@ -188,7 +188,7 @@ retry: goto retry } - var gp guintptr + var toRun gList for i := 0; i < int(n); i++ { ev := &events[i] @@ -219,12 +219,12 @@ retry: } if mode != 0 { - netpollready(&gp, pd, mode) + netpollready(&toRun, pd, mode) } } - if block && gp == 0 { + if block && toRun.empty() { goto retry } - return gp.ptr() + return toRun } diff --git a/libgo/go/runtime/netpoll_stub.go b/libgo/go/runtime/netpoll_stub.go index a4d6b4608ac..f585333579d 100644 --- a/libgo/go/runtime/netpoll_stub.go +++ b/libgo/go/runtime/netpoll_stub.go @@ -10,10 +10,10 @@ var netpollWaiters uint32 // Polls for ready network connections. // Returns list of goroutines that become runnable. -func netpoll(block bool) (gp *g) { +func netpoll(block bool) gList { // Implementation for platforms that do not support // integrated network poller. - return + return gList{} } func netpollinited() bool { diff --git a/libgo/go/runtime/netpoll_windows.go b/libgo/go/runtime/netpoll_windows.go index 134071f5e3c..07ef15ce2f3 100644 --- a/libgo/go/runtime/netpoll_windows.go +++ b/libgo/go/runtime/netpoll_windows.go @@ -63,17 +63,17 @@ func netpollarm(pd *pollDesc, mode int) { // Polls for completed network IO. // Returns list of goroutines that become runnable. 
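
Note: the hunks above and below convert every netpoll backend (aix, epoll, fake, kqueue, solaris, stub, windows) from returning a single *g chain to returning a gList value, so callers test empty() instead of comparing against nil. A minimal standalone sketch of that contract, using a hypothetical runnableList type as a stand-in for the runtime's gList (names here are illustrative, not the runtime's API):

package main

import "fmt"

// runnableList stands in for gList: a value type whose zero value
// means "nothing became runnable", so poll never needs to return nil.
type runnableList struct {
	items []int // placeholder for the *g entries
}

func (l runnableList) empty() bool { return len(l.items) == 0 }

// poll mimics the new netpoll signature: always a list, never nil.
func poll(block bool) runnableList {
	if !block {
		return runnableList{} // non-blocking and nothing is ready
	}
	return runnableList{items: []int{1, 2}}
}

func main() {
	for {
		list := poll(true)
		if list.empty() {
			continue // blocking poll found no work: retry, like the goto retry above
		}
		fmt.Println("runnable:", list.items)
		break
	}
}
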
-func netpoll(block bool) *g { +func netpoll(block bool) gList { var entries [64]overlappedEntry var wait, qty, key, flags, n, i uint32 var errno int32 var op *net_op - var gp guintptr + var toRun gList mp := getg().m if iocphandle == _INVALID_HANDLE_VALUE { - return nil + return gList{} } wait = 0 if block { @@ -92,7 +92,7 @@ retry: mp.blocked = false errno = int32(getlasterror()) if !block && errno == _WAIT_TIMEOUT { - return nil + return gList{} } println("runtime: GetQueuedCompletionStatusEx failed (errno=", errno, ")") throw("runtime: netpoll failed") @@ -105,7 +105,7 @@ retry: if stdcall5(_WSAGetOverlappedResult, op.pd.fd, uintptr(unsafe.Pointer(op)), uintptr(unsafe.Pointer(&qty)), 0, uintptr(unsafe.Pointer(&flags))) == 0 { errno = int32(getlasterror()) } - handlecompletion(&gp, op, errno, qty) + handlecompletion(&toRun, op, errno, qty) } } else { op = nil @@ -118,7 +118,7 @@ retry: mp.blocked = false errno = int32(getlasterror()) if !block && errno == _WAIT_TIMEOUT { - return nil + return gList{} } if op == nil { println("runtime: GetQueuedCompletionStatus failed (errno=", errno, ")") @@ -127,15 +127,15 @@ retry: // dequeued failed IO packet, so report that } mp.blocked = false - handlecompletion(&gp, op, errno, qty) + handlecompletion(&toRun, op, errno, qty) } - if block && gp == 0 { + if block && toRun.empty() { goto retry } - return gp.ptr() + return toRun } -func handlecompletion(gpp *guintptr, op *net_op, errno int32, qty uint32) { +func handlecompletion(toRun *gList, op *net_op, errno int32, qty uint32) { if op == nil { println("runtime: GetQueuedCompletionStatus returned op == nil") throw("runtime: netpoll failed") @@ -147,5 +147,5 @@ func handlecompletion(gpp *guintptr, op *net_op, errno int32, qty uint32) { } op.errno = errno op.qty = qty - netpollready(gpp, op.pd, mode) + netpollready(toRun, op.pd, mode) } diff --git a/libgo/go/runtime/os_aix.go b/libgo/go/runtime/os_aix.go index 246b9c3c944..10036162c76 100644 --- a/libgo/go/runtime/os_aix.go +++ b/libgo/go/runtime/os_aix.go @@ -1,7 +1,9 @@ -// Copyright 2017 The Go Authors. All rights reserved. +// Copyright 2018 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
+// +build aix + package runtime import "unsafe" @@ -55,17 +57,16 @@ func semacreate(mp *m) { func semasleep(ns int64) int32 { _m_ := getg().m if ns >= 0 { - const CLOCK_REALTIME int64 = 9 var ts timespec - if clock_gettime(CLOCK_REALTIME, &ts) != 0 { + if clock_gettime(_CLOCK_REALTIME, &ts) != 0 { throw("clock_gettime") } - ts.tv_sec += timespec_sec_t(ns / 1000000000) - ts.tv_nsec += timespec_nsec_t(ns % 1000000000) - if ts.tv_nsec >= 1000000000 { - ts.tv_sec += timespec_sec_t(1) - ts.tv_nsec -= timespec_nsec_t(1000000000) + ts.tv_sec += ns / 1e9 + ts.tv_nsec += ns % 1e9 + if ts.tv_nsec >= 1e9 { + ts.tv_sec++ + ts.tv_nsec -= 1e9 } if sem_timedwait((*semt)(unsafe.Pointer(_m_.mos.waitsema)), &ts) != 0 { @@ -73,6 +74,7 @@ func semasleep(ns int64) int32 { if err == _ETIMEDOUT || err == _EAGAIN || err == _EINTR { return -1 } + println("sem_timedwait err ", err, " ts.tv_sec ", ts.tv_sec, " ts.tv_nsec ", ts.tv_nsec, " ns ", ns, " id ", _m_.id) throw("sem_timedwait") } return 0 @@ -96,3 +98,8 @@ func semawakeup(mp *m) { throw("sem_post") } } + +const ( + _CLOCK_REALTIME = 9 + _CLOCK_MONOTONIC = 10 +) diff --git a/libgo/go/runtime/os_freebsd.go b/libgo/go/runtime/os_freebsd.go index 34939c56f03..4cce6fdb7f4 100644 --- a/libgo/go/runtime/os_freebsd.go +++ b/libgo/go/runtime/os_freebsd.go @@ -63,3 +63,42 @@ func futexwakeup(addr *uint32, cnt uint32) { print("umtx_wake_addr=", addr, " ret=", ret, "\n") }) } + +func sysargs(argc int32, argv **byte) { + n := argc + 1 + + // skip over argv, envp to get to auxv + for argv_index(argv, n) != nil { + n++ + } + + // skip NULL separator + n++ + + // now argv+n is auxv + auxv := (*[1 << 28]uintptr)(add(unsafe.Pointer(argv), uintptr(n)*sys.PtrSize)) + sysauxv(auxv[:]) +} + +const ( + _AT_NULL = 0 // Terminates the vector + _AT_PAGESZ = 6 // Page size in bytes + _AT_TIMEKEEP = 22 // Pointer to timehands. + _AT_HWCAP = 25 // CPU feature flags + _AT_HWCAP2 = 26 // CPU feature flags 2 +) + +func sysauxv(auxv []uintptr) { + for i := 0; auxv[i] != _AT_NULL; i += 2 { + tag, val := auxv[i], auxv[i+1] + switch tag { + // _AT_NCPUS from auxv shouldn't be used due to golang.org/issue/15206 + case _AT_PAGESZ: + physPageSize = val + case _AT_TIMEKEEP: + timekeepSharedPage = (*vdsoTimekeep)(unsafe.Pointer(val)) + } + + archauxv(tag, val) + } +} diff --git a/libgo/go/runtime/os_linux_arm64.go b/libgo/go/runtime/os_linux_arm64.go index 013e7ae17ac..30d63bfbdbc 100644 --- a/libgo/go/runtime/os_linux_arm64.go +++ b/libgo/go/runtime/os_linux_arm64.go @@ -22,7 +22,15 @@ func archauxv(tag, val uintptr) { case _AT_HWCAP: // arm64 doesn't have a 'cpuid' instruction equivalent and relies on // HWCAP/HWCAP2 bits for hardware capabilities. - cpu.HWCap = uint(val) + hwcap := uint(val) + if GOOS == "android" { + // The Samsung S9+ kernel reports support for atomics, but not all cores + // actually support them, resulting in SIGILL. See issue #28431. + // TODO(elias.naur): Only disable the optimization on bad chipsets. + const hwcap_ATOMICS = 1 << 8 + hwcap &= ^uint(hwcap_ATOMICS) + } + cpu.HWCap = hwcap case _AT_HWCAP2: cpu.HWCap2 = uint(val) } diff --git a/libgo/go/runtime/os_windows_arm.go b/libgo/go/runtime/os_windows_arm.go new file mode 100644 index 00000000000..10aff75e311 --- /dev/null +++ b/libgo/go/runtime/os_windows_arm.go @@ -0,0 +1,22 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
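
Note: the os_aix.go hunk above turns a relative timeout in nanoseconds into an absolute timespec deadline for sem_timedwait, carrying the nanosecond overflow into the seconds field. A small sketch of the same arithmetic, with the field types simplified to int64 for illustration:

package main

import "fmt"

type timespec struct {
	sec  int64
	nsec int64
}

// addNanos adds a relative timeout ns to an absolute timespec,
// normalizing so that nsec stays in [0, 1e9).
func addNanos(ts timespec, ns int64) timespec {
	ts.sec += ns / 1e9
	ts.nsec += ns % 1e9
	if ts.nsec >= 1e9 {
		ts.sec++
		ts.nsec -= 1e9
	}
	return ts
}

func main() {
	now := timespec{sec: 100, nsec: 900000000}
	fmt.Println(addNanos(now, 250000000)) // {101 150000000}
}
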
+ +package runtime + +import "unsafe" + +//go:nosplit +func cputicks() int64 { + var counter int64 + stdcall1(_QueryPerformanceCounter, uintptr(unsafe.Pointer(&counter))) + return counter +} + +func checkgoarm() { + if goarm < 7 { + print("Need atomic synchronization instructions, coprocessor ", + "access instructions. Recompile using GOARM=7.\n") + exit(1) + } +} diff --git a/libgo/go/runtime/panic.go b/libgo/go/runtime/panic.go index 752bf7185f7..bfd2f2d542b 100644 --- a/libgo/go/runtime/panic.go +++ b/libgo/go/runtime/panic.go @@ -54,7 +54,7 @@ var indexError = error(errorString("index out of range")) func panicindex() { name, _, _ := funcfileline(getcallerpc(), -1) - if hasprefix(name, "runtime.") { + if hasPrefix(name, "runtime.") { throw(string(indexError.(errorString))) } panicCheckMalloc(indexError) @@ -65,7 +65,7 @@ var sliceError = error(errorString("slice bounds out of range")) func panicslice() { name, _, _ := funcfileline(getcallerpc(), -1) - if hasprefix(name, "runtime.") { + if hasPrefix(name, "runtime.") { throw(string(sliceError.(errorString))) } panicCheckMalloc(sliceError) @@ -151,6 +151,14 @@ func newdefer() *_defer { systemstack(func() { d = new(_defer) }) + if debugCachedWork { + // Duplicate the tail below so if there's a + // crash in checkPut we can tell if d was just + // allocated or came from the pool. + d.link = gp._defer + gp._defer = d + return d + } } d.link = gp._defer gp._defer = d @@ -242,17 +250,22 @@ func deferreturn(frame *bool) { // code in jmpdefer. var fn func(unsafe.Pointer) *(*uintptr)(unsafe.Pointer(&fn)) = uintptr(noescape(unsafe.Pointer(&pfn))) + gp.deferring = true fn(d.arg) + gp.deferring = false } - // If we are returning from a Go function called by a - // C function running in a C thread, g may now be nil, - // in which case CgocallBackDone will have cleared _defer. - // In that case some other goroutine may already be using gp. + // If that was CgocallBackDone, it will have freed the + // defer for us, since we are no longer running as Go code. if getg() == nil { *frame = true return } + if gp.ranCgocallBackDone { + gp.ranCgocallBackDone = false + *frame = true + return + } gp._defer = d.link @@ -316,7 +329,9 @@ func checkdefer(frame *bool) { var fn func(unsafe.Pointer) *(*uintptr)(unsafe.Pointer(&fn)) = uintptr(noescape(unsafe.Pointer(&pfn))) + gp.deferring = true fn(d.arg) + gp.deferring = false freedefer(d) @@ -389,6 +404,7 @@ func Goexit() { // This code is similar to gopanic, see that implementation // for detailed comments. gp := getg() + gp.goexiting = true for { d := gp._defer if d == nil { @@ -409,7 +425,9 @@ func Goexit() { var fn func(unsafe.Pointer) *(*uintptr)(unsafe.Pointer(&fn)) = uintptr(noescape(unsafe.Pointer(&pfn))) + gp.deferring = true fn(d.arg) + gp.deferring = false if gp._defer != d { throw("bad defer entry in Goexit") @@ -419,6 +437,7 @@ func Goexit() { freedefer(d) // Note: we ignore recovers here because Goexit isn't a panic } + gp.goexiting = false goexit1() } @@ -532,7 +551,9 @@ func gopanic(e interface{}) { var fn func(unsafe.Pointer) *(*uintptr)(unsafe.Pointer(&fn)) = uintptr(noescape(unsafe.Pointer(&pfn))) + gp.deferring = true fn(d.arg) + gp.deferring = false if gp._defer != d { throw("bad defer entry in panic") @@ -649,7 +670,7 @@ func canrecover(retaddr uintptr) bool { } name := locs[1].function - if hasprefix(name, "runtime.") { + if hasPrefix(name, "runtime.") { return true } @@ -676,7 +697,7 @@ func canrecover(retaddr uintptr) bool { } // Ignore function in libffi. 
- if hasprefix(name, "ffi_") { + if hasPrefix(name, "ffi_") { continue } @@ -690,7 +711,7 @@ func canrecover(retaddr uintptr) bool { } // Ignore other functions in the reflect package. - if hasprefix(name, "reflect.") || hasprefix(name, ".1reflect.") { + if hasPrefix(name, "reflect.") || hasPrefix(name, ".1reflect.") { continue } @@ -700,7 +721,7 @@ func canrecover(retaddr uintptr) bool { if i < n { name = locs[i].function - if hasprefix(name, "runtime.") { + if hasPrefix(name, "runtime.") { return true } } @@ -734,7 +755,7 @@ func makefuncfficanrecover(loc []location) { } name := loc[1].function - if hasprefix(name, "runtime.") { + if hasPrefix(name, "runtime.") { d.makefunccanrecover = true } } @@ -935,10 +956,13 @@ func fatalpanic(msgs *_panic) { // It returns true if panic messages should be printed, or false if // the runtime is in bad shape and should just print stacks. // -// It can have write barriers because the write barrier explicitly -// ignores writes once dying > 0. +// It must not have write barriers even though the write barrier +// explicitly ignores writes once dying > 0. Write barriers still +// assume that g.m.p != nil, and this function may not have P +// in some contexts (e.g. a panic in a signal handler for a signal +// sent to an M with no P). // -//go:yeswritebarrierrec +//go:nowritebarrierrec func startpanic_m() bool { _g_ := getg() if mheap_.cachealloc.size == 0 { // very early @@ -958,8 +982,8 @@ func startpanic_m() bool { switch _g_.m.dying { case 0: + // Setting dying >0 has the side-effect of disabling this G's writebuf. _g_.m.dying = 1 - _g_.writebuf = nil atomic.Xadd(&panicking, 1) lock(&paniclk) if debug.schedtrace > 0 || debug.scheddetail > 0 { @@ -1061,7 +1085,7 @@ func canpanic(gp *g) bool { return true } -// isAbortPC returns true if pc is the program counter at which +// isAbortPC reports whether pc is the program counter at which // runtime.abort raises a signal. // // It is nosplit because it's part of the isgoexception diff --git a/libgo/go/runtime/pprof/internal/profile/filter.go b/libgo/go/runtime/pprof/internal/profile/filter.go index 1baa096a49c..9cad866df8c 100644 --- a/libgo/go/runtime/pprof/internal/profile/filter.go +++ b/libgo/go/runtime/pprof/internal/profile/filter.go @@ -55,7 +55,7 @@ func (p *Profile) FilterSamplesByName(focus, ignore, hide *regexp.Regexp) (fm, i return } -// matchesName returns whether the function name or file in the +// matchesName reports whether the function name or file in the // location matches the regular expression. func (loc *Location) matchesName(re *regexp.Regexp) bool { for _, ln := range loc.Line { diff --git a/libgo/go/runtime/pprof/internal/profile/profile.go b/libgo/go/runtime/pprof/internal/profile/profile.go index 64c3e3f054d..a6f8354b1e8 100644 --- a/libgo/go/runtime/pprof/internal/profile/profile.go +++ b/libgo/go/runtime/pprof/internal/profile/profile.go @@ -200,7 +200,7 @@ var libRx = regexp.MustCompile(`([.]so$|[.]so[._][0-9]+)`) // first. 
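
Note: several hunks below switch strings.Replace(s, old, new, -1) to strings.ReplaceAll(s, old, new). The two calls are equivalent; ReplaceAll is the Go 1.12 convenience wrapper for the unlimited-count form. For example:

package main

import (
	"fmt"
	"strings"
)

func main() {
	s := "/bin/cat (deleted) (deleted)"
	// Replace with n = -1 and ReplaceAll produce the same result.
	a := strings.Replace(s, " (deleted)", "", -1)
	b := strings.ReplaceAll(s, " (deleted)", "")
	fmt.Println(a == b, a) // true /bin/cat
}
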
func (p *Profile) setMain() { for i := 0; i < len(p.Mapping); i++ { - file := strings.TrimSpace(strings.Replace(p.Mapping[i].File, "(deleted)", "", -1)) + file := strings.TrimSpace(strings.ReplaceAll(p.Mapping[i].File, "(deleted)", "")) if len(file) == 0 { continue } @@ -415,16 +415,16 @@ func (p *Profile) String() string { for _, m := range p.Mapping { bits := "" if m.HasFunctions { - bits = bits + "[FN]" + bits += "[FN]" } if m.HasFilenames { - bits = bits + "[FL]" + bits += "[FL]" } if m.HasLineNumbers { - bits = bits + "[LN]" + bits += "[LN]" } if m.HasInlineFrames { - bits = bits + "[IN]" + bits += "[IN]" } ss = append(ss, fmt.Sprintf("%d: %#x/%#x/%#x %s %s %s", m.ID, @@ -573,7 +573,7 @@ func (p *Profile) Demangle(d Demangler) error { return nil } -// Empty returns true if the profile contains no samples. +// Empty reports whether the profile contains no samples. func (p *Profile) Empty() bool { return len(p.Sample) == 0 } diff --git a/libgo/go/runtime/pprof/pprof.go b/libgo/go/runtime/pprof/pprof.go index 5128c22b338..996b3cbf468 100644 --- a/libgo/go/runtime/pprof/pprof.go +++ b/libgo/go/runtime/pprof/pprof.go @@ -28,6 +28,7 @@ // if err != nil { // log.Fatal("could not create CPU profile: ", err) // } +// defer f.Close() // if err := pprof.StartCPUProfile(f); err != nil { // log.Fatal("could not start CPU profile: ", err) // } @@ -41,11 +42,11 @@ // if err != nil { // log.Fatal("could not create memory profile: ", err) // } +// defer f.Close() // runtime.GC() // get up-to-date statistics // if err := pprof.WriteHeapProfile(f); err != nil { // log.Fatal("could not write memory profile: ", err) // } -// f.Close() // } // } // diff --git a/libgo/go/runtime/pprof/pprof_test.go b/libgo/go/runtime/pprof/pprof_test.go index fd05a049b5a..bf9f5266e29 100644 --- a/libgo/go/runtime/pprof/pprof_test.go +++ b/libgo/go/runtime/pprof/pprof_test.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !nacl,!js +// +build !aix,!nacl,!js package pprof @@ -82,14 +82,14 @@ func avoidFunctions() []string { } func TestCPUProfile(t *testing.T) { - testCPUProfile(t, []string{"pprof.cpuHog1"}, avoidFunctions(), func(dur time.Duration) { + testCPUProfile(t, stackContains, []string{"pprof.cpuHog1"}, avoidFunctions(), func(dur time.Duration) { cpuHogger(cpuHog1, &salt1, dur) }) } func TestCPUProfileMultithreaded(t *testing.T) { defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2)) - testCPUProfile(t, []string{"pprof.cpuHog1", "pprof.cpuHog2"}, avoidFunctions(), func(dur time.Duration) { + testCPUProfile(t, stackContains, []string{"pprof.cpuHog1", "pprof.cpuHog2"}, avoidFunctions(), func(dur time.Duration) { c := make(chan int) go func() { cpuHogger(cpuHog1, &salt1, dur) @@ -101,7 +101,7 @@ func TestCPUProfileMultithreaded(t *testing.T) { } func TestCPUProfileInlining(t *testing.T) { - testCPUProfile(t, []string{"pprof.inlinedCallee", "pprof.inlinedCaller"}, avoidFunctions(), func(dur time.Duration) { + testCPUProfile(t, stackContains, []string{"pprof.inlinedCallee", "pprof.inlinedCaller"}, avoidFunctions(), func(dur time.Duration) { cpuHogger(inlinedCaller, &salt1, dur) }) } @@ -139,7 +139,9 @@ func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Loca } } -func testCPUProfile(t *testing.T, need []string, avoid []string, f func(dur time.Duration)) { +// testCPUProfile runs f under the CPU profiler, checking for some conditions specified by need, +// as interpreted by matches. 
+func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []string, f func(dur time.Duration)) { switch runtime.GOOS { case "darwin": switch runtime.GOARCH { @@ -178,7 +180,7 @@ func testCPUProfile(t *testing.T, need []string, avoid []string, f func(dur time f(duration) StopCPUProfile() - if profileOk(t, need, avoid, prof, duration) { + if profileOk(t, matches, need, avoid, prof, duration) { return } @@ -191,6 +193,10 @@ func testCPUProfile(t *testing.T, need []string, avoid []string, f func(dur time switch runtime.GOOS { case "darwin", "dragonfly", "netbsd", "solaris": t.Skipf("ignoring failure on %s; see golang.org/issue/13841", runtime.GOOS) + case "openbsd": + if runtime.GOARCH == "arm" { + t.Skipf("ignoring failure on %s/%s; see golang.org/issue/13841", runtime.GOOS, runtime.GOARCH) + } } // Ignore the failure if the tests are running in a QEMU-based emulator, // QEMU is not perfect at emulating everything. @@ -211,7 +217,21 @@ func contains(slice []string, s string) bool { return false } -func profileOk(t *testing.T, need []string, avoid []string, prof bytes.Buffer, duration time.Duration) (ok bool) { +// stackContains matches if a function named spec appears anywhere in the stack trace. +func stackContains(spec string, count uintptr, stk []*profile.Location, labels map[string][]string) bool { + for _, loc := range stk { + for _, line := range loc.Line { + if strings.Contains(line.Function.Name, spec) { + return true + } + } + } + return false +} + +type matchFunc func(spec string, count uintptr, stk []*profile.Location, labels map[string][]string) bool + +func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, prof bytes.Buffer, duration time.Duration) (ok bool) { ok = true // Check that profile is well formed, contains 'need', and does not contain @@ -224,18 +244,16 @@ func profileOk(t *testing.T, need []string, avoid []string, prof bytes.Buffer, d fmt.Fprintf(&buf, "%d:", count) fprintStack(&buf, stk) samples += count - for i, name := range need { - if semi := strings.Index(name, ";"); semi > -1 { - kv := strings.SplitN(name[semi+1:], "=", 2) - if len(kv) != 2 || !contains(labels[kv[0]], kv[1]) { - continue - } - name = name[:semi] + for i, spec := range need { + if matches(spec, count, stk, labels) { + have[i] += count } + } + for i, name := range avoid { for _, loc := range stk { for _, line := range loc.Line { if strings.Contains(line.Function.Name, name) { - have[i] += count + avoidSamples[i] += count } } } @@ -316,6 +334,10 @@ func TestCPUProfileWithFork(t *testing.T) { // Use smaller size for Android to avoid crash. heap = 100 << 20 } + if runtime.GOOS == "windows" && runtime.GOARCH == "arm" { + // Use smaller heap for Windows/ARM to avoid crash. + heap = 100 << 20 + } if testing.Short() { heap = 100 << 20 } @@ -408,7 +430,7 @@ func fprintStack(w io.Writer, stk []*profile.Location) { // Test that profiling of division operations is okay, especially on ARM. See issue 6681. func TestMathBigDivide(t *testing.T) { - testCPUProfile(t, nil, nil, func(duration time.Duration) { + testCPUProfile(t, nil, nil, nil, func(duration time.Duration) { t := time.After(duration) pi := new(big.Int) for { @@ -426,6 +448,51 @@ func TestMathBigDivide(t *testing.T) { }) } +// stackContainsAll matches if all functions in spec (comma-separated) appear somewhere in the stack trace. 
+func stackContainsAll(spec string, count uintptr, stk []*profile.Location, labels map[string][]string) bool { + for _, f := range strings.Split(spec, ",") { + if !stackContains(f, count, stk, labels) { + return false + } + } + return true +} + +func TestMorestack(t *testing.T) { + if runtime.Compiler == "gccgo" { + t.Skip("no runtime.newstack in gccgo") + } + testCPUProfile(t, stackContainsAll, []string{"runtime.newstack,runtime/pprof.growstack"}, avoidFunctions(), func(duration time.Duration) { + t := time.After(duration) + c := make(chan bool) + for { + go func() { + growstack1() + c <- true + }() + select { + case <-t: + return + case <-c: + } + } + }) +} + +//go:noinline +func growstack1() { + growstack() +} + +//go:noinline +func growstack() { + var buf [8 << 10]byte + use(buf) +} + +//go:noinline +func use(x [8 << 10]byte) {} + func TestBlockProfile(t *testing.T) { t.Skip("lots of details are different for gccgo; FIXME") type TestCase struct { @@ -556,7 +623,7 @@ func TestBlockProfile(t *testing.T) { } for _, test := range tests { - if !regexp.MustCompile(strings.Replace(test.re, "\t", "\t+", -1)).MatchString(prof) { + if !regexp.MustCompile(strings.ReplaceAll(test.re, "\t", "\t+")).MatchString(prof) { t.Errorf("Bad %v entry, expect:\n%v\ngot:\n%v", test.name, test.re, prof) } } @@ -887,8 +954,25 @@ func TestEmptyCallStack(t *testing.T) { } } +// stackContainsLabeled takes a spec like funcname;key=value and matches if the stack has that key +// and value and has funcname somewhere in the stack. +func stackContainsLabeled(spec string, count uintptr, stk []*profile.Location, labels map[string][]string) bool { + semi := strings.Index(spec, ";") + if semi == -1 { + panic("no semicolon in key/value spec") + } + kv := strings.SplitN(spec[semi+1:], "=", 2) + if len(kv) != 2 { + panic("missing = in key/value spec") + } + if !contains(labels[kv[0]], kv[1]) { + return false + } + return stackContains(spec[:semi], count, stk, labels) +} + func TestCPUProfileLabel(t *testing.T) { - testCPUProfile(t, []string{"pprof.cpuHogger;key=value"}, avoidFunctions(), func(dur time.Duration) { + testCPUProfile(t, stackContainsLabeled, []string{"pprof.cpuHogger;key=value"}, avoidFunctions(), func(dur time.Duration) { Do(context.Background(), Labels("key", "value"), func(context.Context) { cpuHogger(cpuHog1, &salt1, dur) }) @@ -899,7 +983,7 @@ func TestLabelRace(t *testing.T) { // Test the race detector annotations for synchronization // between settings labels and consuming them from the // profile. - testCPUProfile(t, []string{"pprof.cpuHogger;key=value"}, avoidFunctions(), func(dur time.Duration) { + testCPUProfile(t, stackContainsLabeled, []string{"pprof.cpuHogger;key=value"}, nil, func(dur time.Duration) { start := time.Now() var wg sync.WaitGroup for time.Since(start) < dur { diff --git a/libgo/go/runtime/pprof/proto.go b/libgo/go/runtime/pprof/proto.go index d8456befd3d..b82e738f941 100644 --- a/libgo/go/runtime/pprof/proto.go +++ b/libgo/go/runtime/pprof/proto.go @@ -213,7 +213,7 @@ func (b *profileBuilder) pbMapping(tag int, id, base, limit, offset uint64, file } // locForPC returns the location ID for addr. -// addr must be a return PC. This returns the location of the call. +// addr must a return PC or 1 + the PC of an inline marker. This returns the location of the corresponding call. // It may emit to b.pb, so there must be no message encoding in progress. 
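
Note: the parseProcSelfMaps hunk below strips a trailing " (deleted)" marker from mapping paths with an explicit length check; outside that code, strings.TrimSuffix expresses the same operation directly. A tiny equivalent:

package main

import (
	"fmt"
	"strings"
)

func main() {
	file := "/bin/cat (deleted)"
	// Drop the marker if present, leave the path alone otherwise,
	// matching the manual slice in the hunk below.
	file = strings.TrimSuffix(file, " (deleted)")
	fmt.Println(file) // /bin/cat
}
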
func (b *profileBuilder) locForPC(addr uintptr) uint64 { id := uint64(b.locs[addr]) @@ -529,6 +529,14 @@ func parseProcSelfMaps(data []byte, addMapping func(lo, hi, offset uint64, file, continue } file := string(line) + + // Trim deleted file marker. + deletedStr := " (deleted)" + deletedLen := len(deletedStr) + if len(file) >= deletedLen && file[len(file)-deletedLen:] == deletedStr { + file = file[:len(file)-deletedLen] + } + if len(inode) == 1 && inode[0] == '0' && file == "" { // Huge-page text mappings list the initial fragment of // mapped but unpopulated memory as being inode 0. diff --git a/libgo/go/runtime/pprof/proto_test.go b/libgo/go/runtime/pprof/proto_test.go index 604628ce457..5a915fb4c3a 100644 --- a/libgo/go/runtime/pprof/proto_test.go +++ b/libgo/go/runtime/pprof/proto_test.go @@ -218,24 +218,89 @@ c000000000-c000036000 rw-p 00000000 00:00 0 07000000 07093000 06c00000 /path/to/gobench_server_main ` +var profSelfMapsTestsWithDeleted = ` +00400000-0040b000 r-xp 00000000 fc:01 787766 /bin/cat (deleted) +0060a000-0060b000 r--p 0000a000 fc:01 787766 /bin/cat (deleted) +0060b000-0060c000 rw-p 0000b000 fc:01 787766 /bin/cat (deleted) +014ab000-014cc000 rw-p 00000000 00:00 0 [heap] +7f7d76af8000-7f7d7797c000 r--p 00000000 fc:01 1318064 /usr/lib/locale/locale-archive +7f7d7797c000-7f7d77b36000 r-xp 00000000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so +7f7d77b36000-7f7d77d36000 ---p 001ba000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so +7f7d77d36000-7f7d77d3a000 r--p 001ba000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so +7f7d77d3a000-7f7d77d3c000 rw-p 001be000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so +7f7d77d3c000-7f7d77d41000 rw-p 00000000 00:00 0 +7f7d77d41000-7f7d77d64000 r-xp 00000000 fc:01 1180217 /lib/x86_64-linux-gnu/ld-2.19.so +7f7d77f3f000-7f7d77f42000 rw-p 00000000 00:00 0 +7f7d77f61000-7f7d77f63000 rw-p 00000000 00:00 0 +7f7d77f63000-7f7d77f64000 r--p 00022000 fc:01 1180217 /lib/x86_64-linux-gnu/ld-2.19.so +7f7d77f64000-7f7d77f65000 rw-p 00023000 fc:01 1180217 /lib/x86_64-linux-gnu/ld-2.19.so +7f7d77f65000-7f7d77f66000 rw-p 00000000 00:00 0 +7ffc342a2000-7ffc342c3000 rw-p 00000000 00:00 0 [stack] +7ffc34343000-7ffc34345000 r-xp 00000000 00:00 0 [vdso] +ffffffffff600000-ffffffffff601000 r-xp 00000090 00:00 0 [vsyscall] +-> +00400000 0040b000 00000000 /bin/cat +7f7d7797c000 7f7d77b36000 00000000 /lib/x86_64-linux-gnu/libc-2.19.so +7f7d77d41000 7f7d77d64000 00000000 /lib/x86_64-linux-gnu/ld-2.19.so +7ffc34343000 7ffc34345000 00000000 [vdso] +ffffffffff600000 ffffffffff601000 00000090 [vsyscall] + +00400000-0040b000 r-xp 00000000 fc:01 787766 /bin/cat with space +0060a000-0060b000 r--p 0000a000 fc:01 787766 /bin/cat with space +0060b000-0060c000 rw-p 0000b000 fc:01 787766 /bin/cat with space +014ab000-014cc000 rw-p 00000000 00:00 0 [heap] +7f7d76af8000-7f7d7797c000 r--p 00000000 fc:01 1318064 /usr/lib/locale/locale-archive +7f7d7797c000-7f7d77b36000 r-xp 00000000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so +7f7d77b36000-7f7d77d36000 ---p 001ba000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so +7f7d77d36000-7f7d77d3a000 r--p 001ba000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so +7f7d77d3a000-7f7d77d3c000 rw-p 001be000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so +7f7d77d3c000-7f7d77d41000 rw-p 00000000 00:00 0 +7f7d77d41000-7f7d77d64000 r-xp 00000000 fc:01 1180217 /lib/x86_64-linux-gnu/ld-2.19.so +7f7d77f3f000-7f7d77f42000 rw-p 00000000 00:00 0 +7f7d77f61000-7f7d77f63000 rw-p 00000000 00:00 0 +7f7d77f63000-7f7d77f64000 r--p 00022000 fc:01 
1180217 /lib/x86_64-linux-gnu/ld-2.19.so +7f7d77f64000-7f7d77f65000 rw-p 00023000 fc:01 1180217 /lib/x86_64-linux-gnu/ld-2.19.so +7f7d77f65000-7f7d77f66000 rw-p 00000000 00:00 0 +7ffc342a2000-7ffc342c3000 rw-p 00000000 00:00 0 [stack] +7ffc34343000-7ffc34345000 r-xp 00000000 00:00 0 [vdso] +ffffffffff600000-ffffffffff601000 r-xp 00000090 00:00 0 [vsyscall] +-> +00400000 0040b000 00000000 /bin/cat with space +7f7d7797c000 7f7d77b36000 00000000 /lib/x86_64-linux-gnu/libc-2.19.so +7f7d77d41000 7f7d77d64000 00000000 /lib/x86_64-linux-gnu/ld-2.19.so +7ffc34343000 7ffc34345000 00000000 [vdso] +ffffffffff600000 ffffffffff601000 00000090 [vsyscall] +` + func TestProcSelfMaps(t *testing.T) { - for tx, tt := range strings.Split(profSelfMapsTests, "\n\n") { - i := strings.Index(tt, "->\n") - if i < 0 { - t.Fatal("malformed test case") - } - in, out := tt[:i], tt[i+len("->\n"):] - if len(out) > 0 && out[len(out)-1] != '\n' { - out += "\n" - } - var buf bytes.Buffer - parseProcSelfMaps([]byte(in), func(lo, hi, offset uint64, file, buildID string) { - fmt.Fprintf(&buf, "%08x %08x %08x %s\n", lo, hi, offset, file) - }) - if buf.String() != out { - t.Errorf("#%d: have:\n%s\nwant:\n%s\n%q\n%q", tx, buf.String(), out, buf.String(), out) + + f := func(t *testing.T, input string) { + for tx, tt := range strings.Split(input, "\n\n") { + i := strings.Index(tt, "->\n") + if i < 0 { + t.Fatal("malformed test case") + } + in, out := tt[:i], tt[i+len("->\n"):] + if len(out) > 0 && out[len(out)-1] != '\n' { + out += "\n" + } + var buf bytes.Buffer + parseProcSelfMaps([]byte(in), func(lo, hi, offset uint64, file, buildID string) { + fmt.Fprintf(&buf, "%08x %08x %08x %s\n", lo, hi, offset, file) + }) + if buf.String() != out { + t.Errorf("#%d: have:\n%s\nwant:\n%s\n%q\n%q", tx, buf.String(), out, buf.String(), out) + } } } + + t.Run("Normal", func(t *testing.T) { + f(t, profSelfMapsTests) + }) + + t.Run("WithDeletedFile", func(t *testing.T) { + f(t, profSelfMapsTestsWithDeleted) + }) } // TestMapping checkes the mapping section of CPU profiles diff --git a/libgo/go/runtime/pprof/protomem.go b/libgo/go/runtime/pprof/protomem.go index 82565d5245b..1c88aae43a0 100644 --- a/libgo/go/runtime/pprof/protomem.go +++ b/libgo/go/runtime/pprof/protomem.go @@ -56,8 +56,8 @@ func writeHeapProto(w io.Writer, p []runtime.MemProfileRecord, rate int64, defau values[0], values[1] = scaleHeapSample(r.AllocObjects, r.AllocBytes, rate) values[2], values[3] = scaleHeapSample(r.InUseObjects(), r.InUseBytes(), rate) var blockSize int64 - if values[0] > 0 { - blockSize = values[1] / values[0] + if r.AllocObjects > 0 { + blockSize = r.AllocBytes / r.AllocObjects } b.pbSample(values, locs, func() { if blockSize != 0 { diff --git a/libgo/go/runtime/pprof/protomem_test.go b/libgo/go/runtime/pprof/protomem_test.go index 315d5f0b4d8..471b1ae9c32 100644 --- a/libgo/go/runtime/pprof/protomem_test.go +++ b/libgo/go/runtime/pprof/protomem_test.go @@ -48,7 +48,7 @@ func TestConvertMemProfile(t *testing.T) { {ID: 3, Mapping: map2, Address: addr2 + 1}, {ID: 4, Mapping: map2, Address: addr2 + 2}, }, - NumLabel: map[string][]int64{"bytes": {829411}}, + NumLabel: map[string][]int64{"bytes": {512 * 1024}}, }, { Value: []int64{1, 829411, 0, 0}, @@ -57,7 +57,7 @@ func TestConvertMemProfile(t *testing.T) { {ID: 6, Mapping: map1, Address: addr1 + 2}, {ID: 7, Mapping: map2, Address: addr2 + 3}, }, - NumLabel: map[string][]int64{"bytes": {829411}}, + NumLabel: map[string][]int64{"bytes": {512 * 1024}}, }, } for _, tc := range []struct { diff --git 
a/libgo/go/runtime/print.go b/libgo/go/runtime/print.go index 3da05ad5f9e..8c396442862 100644 --- a/libgo/go/runtime/print.go +++ b/libgo/go/runtime/print.go @@ -110,7 +110,12 @@ func gwrite(b []byte) { } recordForPanic(b) gp := getg() - if gp == nil || gp.writebuf == nil { + // Don't use the writebuf if gp.m is dying. We want anything + // written through gwrite to appear in the terminal rather + // than be written to in some buffer, if we're in a panicking state. + // Note that we can't just clear writebuf in the gp.m.dying case + // because a panic isn't allowed to have any write barriers. + if gp == nil || gp.writebuf == nil || gp.m.dying > 0 { writeErr(b) return } diff --git a/libgo/go/runtime/proc.go b/libgo/go/runtime/proc.go index 80b04ab4a5d..05dd53d886f 100644 --- a/libgo/go/runtime/proc.go +++ b/libgo/go/runtime/proc.go @@ -20,7 +20,6 @@ import ( //go:linkname incidlelocked runtime.incidlelocked //go:linkname schedinit runtime.schedinit //go:linkname ready runtime.ready -//go:linkname gcprocs runtime.gcprocs //go:linkname stopm runtime.stopm //go:linkname handoffp runtime.handoffp //go:linkname wakep runtime.wakep @@ -32,7 +31,6 @@ import ( //go:linkname reentersyscallblock runtime.reentersyscallblock //go:linkname exitsyscall runtime.exitsyscall //go:linkname gfget runtime.gfget -//go:linkname helpgc runtime.helpgc //go:linkname kickoff runtime.kickoff //go:linkname mstart1 runtime.mstart1 //go:linkname mexit runtime.mexit @@ -196,8 +194,7 @@ func main() { } }() - // Record when the world started. Must be after runtime_init - // because nanotime on some platforms depends on startNano. + // Record when the world started. runtimeInitTime = nanotime() main_init_done = make(chan bool) @@ -285,7 +282,7 @@ func forcegchelper() { println("GC forced") } // Time-triggered, fully concurrent. - gcStart(gcBackgroundMode, gcTrigger{kind: gcTriggerTime, now: nanotime()}) + gcStart(gcTrigger{kind: gcTriggerTime, now: nanotime()}) } } @@ -479,17 +476,18 @@ const ( _GoidCacheBatch = 16 ) -// cpuinit extracts the environment variable GODEBUGCPU from the environment on -// Linux and Darwin if the GOEXPERIMENT debugcpu was set and calls internal/cpu.Initialize. +// cpuinit extracts the environment variable GODEBUG from the environment on +// Unix-like operating systems and calls internal/cpu.Initialize. func cpuinit() { - const prefix = "GODEBUGCPU=" + const prefix = "GODEBUG=" var env string - if haveexperiment("debugcpu") && (GOOS == "linux" || GOOS == "darwin") { + switch GOOS { + case "aix", "darwin", "dragonfly", "freebsd", "netbsd", "openbsd", "solaris", "linux": cpu.DebugOptions = true // Similar to goenv_unix but extracts the environment value for - // GODEBUGCPU directly. + // GODEBUG directly. // TODO(moehrmann): remove when general goenvs() can be called before cpuinit() n := int32(0) for argv_index(argv, argc+1+n) != nil { @@ -500,7 +498,7 @@ func cpuinit() { p := argv_index(argv, argc+1+i) s := *(*string)(unsafe.Pointer(&stringStruct{unsafe.Pointer(p), findnull(p)})) - if hasprefix(s, prefix) { + if hasPrefix(s, prefix) { env = gostring(p)[len(prefix):] break } @@ -643,59 +641,6 @@ func ready(gp *g, traceskip int, next bool) { _g_.m.locks-- } -func gcprocs() int32 { - // Figure out how many CPUs to use during GC. - // Limited by gomaxprocs, number of actual CPUs, and MaxGcproc. 
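
Note: the cpuinit hunk above scans the raw environment for a "GODEBUG=" prefix before the normal goenvs() machinery is available. Outside the runtime the same scan can be written against a plain []string environment; a sketch with a hypothetical lookupPrefix helper (not a runtime function):

package main

import (
	"fmt"
	"strings"
)

// lookupPrefix returns the value of the first environment entry that
// starts with prefix (e.g. "GODEBUG="), and whether one was found.
func lookupPrefix(env []string, prefix string) (string, bool) {
	for _, kv := range env {
		if strings.HasPrefix(kv, prefix) {
			return kv[len(prefix):], true
		}
	}
	return "", false
}

func main() {
	env := []string{"HOME=/root", "GODEBUG=madvdontneed=1,gctrace=1"}
	if v, ok := lookupPrefix(env, "GODEBUG="); ok {
		fmt.Println(v) // madvdontneed=1,gctrace=1
	}
}
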
- lock(&sched.lock) - n := gomaxprocs - if n > ncpu { - n = ncpu - } - if n > _MaxGcproc { - n = _MaxGcproc - } - if n > sched.nmidle+1 { // one M is currently running - n = sched.nmidle + 1 - } - unlock(&sched.lock) - return n -} - -func needaddgcproc() bool { - lock(&sched.lock) - n := gomaxprocs - if n > ncpu { - n = ncpu - } - if n > _MaxGcproc { - n = _MaxGcproc - } - n -= sched.nmidle + 1 // one M is currently running - unlock(&sched.lock) - return n > 0 -} - -func helpgc(nproc int32) { - _g_ := getg() - lock(&sched.lock) - pos := 0 - for n := int32(1); n < nproc; n++ { // one M is currently running - if allp[pos].mcache == _g_.m.mcache { - pos++ - } - mp := mget() - if mp == nil { - throw("gcprocs inconsistency") - } - mp.helpgc = n - mp.p.set(allp[pos]) - mp.mcache = allp[pos].mcache - pos++ - notewakeup(&mp.park) - } - unlock(&sched.lock) -} - // freezeStopWait is a large value that freezetheworld sets // sched.stopwait to in order to request that all Gs permanently stop. const freezeStopWait = 0x7fffffff @@ -1154,20 +1099,14 @@ func stopTheWorldWithSema() { } } -func mhelpgc() { - _g_ := getg() - _g_.m.helpgc = -1 -} - func startTheWorldWithSema(emitTraceEvent bool) int64 { _g_ := getg() _g_.m.locks++ // disable preemption because it can be holding p in a local var if netpollinited() { - gp := netpoll(false) // non-blocking - injectglist(gp) + list := netpoll(false) // non-blocking + injectglist(&list) } - add := needaddgcproc() lock(&sched.lock) procs := gomaxprocs @@ -1197,7 +1136,6 @@ func startTheWorldWithSema(emitTraceEvent bool) int64 { } else { // Start M to run P. Do not start another M below. newm(nil, p) - add = false } } @@ -1214,16 +1152,6 @@ func startTheWorldWithSema(emitTraceEvent bool) int64 { wakep() } - if add { - // If GC could have used another helper proc, start one now, - // in the hope that it will be available next time. - // It would have been even better to start it before the collection, - // but doing so requires allocating memory, so it's tricky to - // coordinate. This lazy approach works out in practice: - // we don't mind if the first couple gc rounds don't have quite - // the maximum number of procs. - newm(mhelpgc, nil) - } _g_.m.locks-- return startTime @@ -1288,10 +1216,7 @@ func mstart1() { fn() } - if _g_.m.helpgc != 0 { - _g_.m.helpgc = 0 - stopm() - } else if _g_.m != &m0 { + if _g_.m != &m0 { acquirep(_g_.m.nextp.ptr()) _g_.m.nextp = 0 } @@ -1591,7 +1516,7 @@ func allocm(_p_ *p, fn func(), allocatestack bool) (mp *m, g0Stack unsafe.Pointe // the following strategy: there is a stack of available m's // that can be stolen. Using compare-and-swap // to pop from the stack has ABA races, so we simulate -// a lock by doing an exchange (via casp) to steal the stack +// a lock by doing an exchange (via Casuintptr) to steal the stack // head and replace the top pointer with MLOCKED (1). // This serves as a simple spin lock that we can use even // without an m. The thread that locks the stack in this way @@ -1925,7 +1850,7 @@ func startTemplateThread() { // templateThread is a thread in a known-good state that exists solely // to start new threads in known-good states when the calling thread -// may not be a a good state. +// may not be in a good state. 
// // Many programs never need this, so templateThread is started lazily // when we first enter a state that might lead to running on a thread @@ -1977,21 +1902,11 @@ func stopm() { throw("stopm spinning") } -retry: lock(&sched.lock) mput(_g_.m) unlock(&sched.lock) notesleep(&_g_.m.park) noteclear(&_g_.m.park) - if _g_.m.helpgc != 0 { - // helpgc() set _g_.m.p and _g_.m.mcache, so we have a P. - gchelper() - // Undo the effects of helpgc(). - _g_.m.helpgc = 0 - _g_.m.mcache = nil - _g_.m.p = 0 - goto retry - } acquirep(_g_.m.nextp.ptr()) _g_.m.nextp = 0 } @@ -2279,9 +2194,9 @@ top: // not set lastpoll yet), this thread will do blocking netpoll below // anyway. if netpollinited() && atomic.Load(&netpollWaiters) > 0 && atomic.Load64(&sched.lastpoll) != 0 { - if gp := netpoll(false); gp != nil { // non-blocking - // netpoll returns list of goroutines linked by schedlink. - injectglist(gp.schedlink.ptr()) + if list := netpoll(false); !list.empty() { // non-blocking + gp := list.pop() + injectglist(&list) casgstatus(gp, _Gwaiting, _Grunnable) if trace.enabled { traceGoUnpark(gp, 0) @@ -2336,10 +2251,10 @@ stop: } // wasm only: - // Check if a goroutine is waiting for a callback from the WebAssembly host. - // If yes, pause the execution until a callback was triggered. - if pauseSchedulerUntilCallback() { - // A callback was triggered and caused at least one goroutine to wake up. + // If a callback returned and no other goroutine is awake, + // then pause execution until a callback was triggered. + if beforeIdle() { + // At least one goroutine got woken. goto top } @@ -2433,29 +2348,30 @@ stop: if _g_.m.spinning { throw("findrunnable: netpoll with spinning") } - gp := netpoll(true) // block until new work is available + list := netpoll(true) // block until new work is available atomic.Store64(&sched.lastpoll, uint64(nanotime())) - if gp != nil { + if !list.empty() { lock(&sched.lock) _p_ = pidleget() unlock(&sched.lock) if _p_ != nil { acquirep(_p_) - injectglist(gp.schedlink.ptr()) + gp := list.pop() + injectglist(&list) casgstatus(gp, _Gwaiting, _Grunnable) if trace.enabled { traceGoUnpark(gp, 0) } return gp, false } - injectglist(gp) + injectglist(&list) } } stopm() goto top } -// pollWork returns true if there is non-background work this P could +// pollWork reports whether there is non-background work this P could // be doing. This is a fairly lightweight check to be used for // background work loops, like idle GC. It checks a subset of the // conditions checked by the actual scheduler. @@ -2468,8 +2384,8 @@ func pollWork() bool { return true } if netpollinited() && atomic.Load(&netpollWaiters) > 0 && sched.lastpoll != 0 { - if gp := netpoll(false); gp != nil { - injectglist(gp) + if list := netpoll(false); !list.empty() { + injectglist(&list) return true } } @@ -2494,22 +2410,21 @@ func resetspinning() { } } -// Injects the list of runnable G's into the scheduler. +// Injects the list of runnable G's into the scheduler and clears glist. // Can run concurrently with GC. 
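
Note: in the findrunnable hunks above, a non-empty list from netpoll is consumed by popping one goroutine to run immediately and handing the remainder to injectglist. A minimal sketch of that split, using a plain slice in place of gList:

package main

import "fmt"

// takeOne pops the first ready item to run now and returns the rest,
// mirroring "gp := list.pop(); injectglist(&list)" in the hunks above.
func takeOne(ready []string) (run string, rest []string) {
	if len(ready) == 0 {
		return "", nil
	}
	return ready[0], ready[1:]
}

func main() {
	run, rest := takeOne([]string{"g1", "g2", "g3"})
	fmt.Println("run now:", run)          // run now: g1
	fmt.Println("queue for later:", rest) // queue for later: [g2 g3]
}
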
-func injectglist(glist *g) { - if glist == nil { +func injectglist(glist *gList) { + if glist.empty() { return } if trace.enabled { - for gp := glist; gp != nil; gp = gp.schedlink.ptr() { + for gp := glist.head.ptr(); gp != nil; gp = gp.schedlink.ptr() { traceGoUnpark(gp, 0) } } lock(&sched.lock) var n int - for n = 0; glist != nil; n++ { - gp := glist - glist = gp.schedlink.ptr() + for n = 0; !glist.empty(); n++ { + gp := glist.pop() casgstatus(gp, _Gwaiting, _Grunnable) globrunqput(gp) } @@ -2517,6 +2432,7 @@ func injectglist(glist *g) { for ; n != 0 && sched.npidle != 0; n-- { startm(nil, false) } + *glist = gList{} } // One round of scheduler: find a runnable goroutine and execute it. @@ -2602,6 +2518,23 @@ top: resetspinning() } + if sched.disable.user && !schedEnabled(gp) { + // Scheduling of this goroutine is disabled. Put it on + // the list of pending runnable goroutines for when we + // re-enable user scheduling and look again. + lock(&sched.lock) + if schedEnabled(gp) { + // Something re-enabled scheduling while we + // were acquiring the lock. + unlock(&sched.lock) + } else { + sched.disable.runnable.pushBack(gp) + sched.disable.n++ + unlock(&sched.lock) + goto top + } + } + if gp.lockedm != 0 { // Hands off own p to the locked m, // then blocks waiting for a new p. @@ -2714,7 +2647,7 @@ func goexit0(gp *g) { _g_ := getg() casgstatus(gp, _Grunning, _Gdead) - if isSystemGoroutine(gp) { + if isSystemGoroutine(gp, false) { atomic.Xadd(&sched.ngsys, -1) gp.isSystemGoroutine = false } @@ -2755,7 +2688,6 @@ func goexit0(gp *g) { print("invalid m->lockedInt = ", _g_.m.lockedInt, "\n") throw("internal lockOSThread error") } - _g_.m.lockedExt = 0 gfput(_g_.m.p.ptr(), gp) if locked { // The goroutine may have locked this thread because @@ -2767,6 +2699,10 @@ func goexit0(gp *g) { if GOOS != "plan9" { // See golang.org/issue/22227. _g_.m.exiting = true gogo(_g_.m.g0) + } else { + // Clear lockedExt on plan9 since we may end up re-using + // this thread. + _g_.m.lockedExt = 0 } } schedule() @@ -2823,8 +2759,11 @@ func reentersyscall(pc, sp uintptr) { _g_.m.syscalltick = _g_.m.p.ptr().syscalltick _g_.sysblocktraced = true _g_.m.mcache = nil - _g_.m.p.ptr().m = 0 - atomic.Store(&_g_.m.p.ptr().status, _Psyscall) + pp := _g_.m.p.ptr() + pp.m = 0 + _g_.m.oldp.set(pp) + _g_.m.p = 0 + atomic.Store(&pp.status, _Psyscall) if sched.gcwaiting != 0 { systemstack(entersyscall_gcwait) } @@ -2843,7 +2782,7 @@ func entersyscall_sysmon() { func entersyscall_gcwait() { _g_ := getg() - _p_ := _g_.m.p.ptr() + _p_ := _g_.m.oldp.ptr() lock(&sched.lock) if sched.stopwait > 0 && atomic.Cas(&_p_.status, _Psyscall, _Pgcstop) { @@ -2900,8 +2839,9 @@ func exitsyscall() { _g_.m.locks++ // see comment in entersyscall _g_.waitsince = 0 - oldp := _g_.m.p.ptr() - if exitsyscallfast() { + oldp := _g_.m.oldp.ptr() + _g_.m.oldp = 0 + if exitsyscallfast(oldp) { if _g_.m.mcache == nil { throw("lost mcache") } @@ -2924,6 +2864,12 @@ func exitsyscall() { if getg().preempt { checkPreempt() } + _g_.throwsplit = false + + if sched.disable.user && !schedEnabled(_g_) { + // Scheduling of this goroutine is disabled. + Gosched() + } return } @@ -2964,27 +2910,23 @@ func exitsyscall() { } //go:nosplit -func exitsyscallfast() bool { +func exitsyscallfast(oldp *p) bool { _g_ := getg() // Freezetheworld sets stopwait but does not retake P's. if sched.stopwait == freezeStopWait { - _g_.m.mcache = nil - _g_.m.p = 0 return false } // Try to re-acquire the last P. 
- if _g_.m.p != 0 && _g_.m.p.ptr().status == _Psyscall && atomic.Cas(&_g_.m.p.ptr().status, _Psyscall, _Prunning) { + if oldp != nil && oldp.status == _Psyscall && atomic.Cas(&oldp.status, _Psyscall, _Pidle) { // There's a cpu for us, so we can run. + wirep(oldp) exitsyscallfast_reacquired() return true } // Try to get any other idle P. - oldp := _g_.m.p.ptr() - _g_.m.mcache = nil - _g_.m.p = 0 if sched.pidle != 0 { var ok bool systemstack(func() { @@ -3011,15 +2953,9 @@ func exitsyscallfast() bool { // has successfully reacquired the P it was running on before the // syscall. // -// This function is allowed to have write barriers because exitsyscall -// has acquired a P at this point. -// -//go:yeswritebarrierrec //go:nosplit func exitsyscallfast_reacquired() { _g_ := getg() - _g_.m.mcache = _g_.m.p.ptr().mcache - _g_.m.p.ptr().m.set(_g_.m) if _g_.m.syscalltick != _g_.m.p.ptr().syscalltick { if trace.enabled { // The p was retaken and then enter into syscall again (since _g_.m.syscalltick has changed). @@ -3062,7 +2998,10 @@ func exitsyscall0(gp *g) { dropg() casgstatus(gp, _Gexitingsyscall, _Grunnable) lock(&sched.lock) - _p_ := pidleget() + var _p_ *p + if schedEnabled(_g_) { + _p_ = pidleget() + } if _p_ == nil { globrunqput(gp) } else if atomic.Load(&sched.sysmonwait) != 0 { @@ -3229,11 +3168,12 @@ func newproc(fn uintptr, arg unsafe.Pointer) *g { newg.param = arg newg.gopc = getcallerpc() + newg.ancestors = saveAncestors(_g_) newg.startpc = fn if _g_.m.curg != nil { newg.labels = _g_.m.curg.labels } - if isSystemGoroutine(newg) { + if isSystemGoroutine(newg, false) { atomic.Xadd(&sched.ngsys, +1) } newg.gcscanvalid = false @@ -3342,20 +3282,17 @@ func gfput(_p_ *p, gp *g) { throw("gfput: bad status (not Gdead)") } - gp.schedlink.set(_p_.gfree) - _p_.gfree = gp - _p_.gfreecnt++ - if _p_.gfreecnt >= 64 { - lock(&sched.gflock) - for _p_.gfreecnt >= 32 { - _p_.gfreecnt-- - gp = _p_.gfree - _p_.gfree = gp.schedlink.ptr() - gp.schedlink.set(sched.gfree) - sched.gfree = gp - sched.ngfree++ + _p_.gFree.push(gp) + _p_.gFree.n++ + if _p_.gFree.n >= 64 { + lock(&sched.gFree.lock) + for _p_.gFree.n >= 32 { + _p_.gFree.n-- + gp = _p_.gFree.pop() + sched.gFree.list.push(gp) + sched.gFree.n++ } - unlock(&sched.gflock) + unlock(&sched.gFree.lock) } } @@ -3363,43 +3300,39 @@ func gfput(_p_ *p, gp *g) { // If local list is empty, grab a batch from global list. func gfget(_p_ *p) *g { retry: - gp := _p_.gfree - if gp == nil && sched.gfree != nil { - lock(&sched.gflock) - for _p_.gfreecnt < 32 { - if sched.gfree != nil { - gp = sched.gfree - sched.gfree = gp.schedlink.ptr() - } else { + if _p_.gFree.empty() && !sched.gFree.list.empty() { + lock(&sched.gFree.lock) + // Move a batch of free Gs to the P. + for _p_.gFree.n < 32 { + gp := sched.gFree.list.pop() + if gp == nil { break } - _p_.gfreecnt++ - sched.ngfree-- - gp.schedlink.set(_p_.gfree) - _p_.gfree = gp + sched.gFree.n-- + _p_.gFree.push(gp) + _p_.gFree.n++ } - unlock(&sched.gflock) + unlock(&sched.gFree.lock) goto retry } - if gp != nil { - _p_.gfree = gp.schedlink.ptr() - _p_.gfreecnt-- + gp := _p_.gFree.pop() + if gp == nil { + return nil } + _p_.gFree.n-- return gp } // Purge all cached G's from gfree list to the global list. 
func gfpurge(_p_ *p) { - lock(&sched.gflock) - for _p_.gfreecnt != 0 { - _p_.gfreecnt-- - gp := _p_.gfree - _p_.gfree = gp.schedlink.ptr() - gp.schedlink.set(sched.gfree) - sched.gfree = gp - sched.ngfree++ + lock(&sched.gFree.lock) + for !_p_.gFree.empty() { + gp := _p_.gFree.pop() + _p_.gFree.n-- + sched.gFree.list.push(gp) + sched.gFree.n++ } - unlock(&sched.gflock) + unlock(&sched.gFree.lock) } // Breakpoint executes a breakpoint trap. @@ -3512,9 +3445,9 @@ func badunlockosthread() { } func gcount() int32 { - n := int32(allglen) - sched.ngfree - int32(atomic.Load(&sched.ngsys)) + n := int32(allglen) - sched.gFree.n - int32(atomic.Load(&sched.ngsys)) for _, _p_ := range allp { - n -= _p_.gfreecnt + n -= _p_.gFree.n } // All these variables can be changed concurrently, so the result can be inconsistent. @@ -3627,7 +3560,7 @@ func sigprof(pc uintptr, gp *g, mp *m) { // Account it against abstract "System" or "GC". n = 2 stk[0] = pc - if mp.preemptoff != "" || mp.helpgc != 0 { + if mp.preemptoff != "" { stk[1] = _GCPC + sys.PCQuantum } else { stk[1] = _SystemPC + sys.PCQuantum @@ -3854,6 +3787,7 @@ func procresize(nprocs int32) *p { if _g_.m.p != 0 && _g_.m.p.ptr().id < nprocs { // continue to use the current P _g_.m.p.ptr().status = _Prunning + _g_.m.p.ptr().mcache.prepareForSweep() } else { // release the current P and acquire allp[0] if _g_.m.p != 0 { @@ -3898,36 +3832,40 @@ func procresize(nprocs int32) *p { //go:yeswritebarrierrec func acquirep(_p_ *p) { // Do the part that isn't allowed to have write barriers. - acquirep1(_p_) + wirep(_p_) - // have p; write barriers now allowed - _g_ := getg() - _g_.m.mcache = _p_.mcache + // Have p; write barriers now allowed. + + // Perform deferred mcache flush before this P can allocate + // from a potentially stale mcache. + _p_.mcache.prepareForSweep() if trace.enabled { traceProcStart() } } -// acquirep1 is the first step of acquirep, which actually acquires -// _p_. This is broken out so we can disallow write barriers for this -// part, since we don't yet have a P. +// wirep is the first step of acquirep, which actually associates the +// current M to _p_. This is broken out so we can disallow write +// barriers for this part, since we don't yet have a P. // //go:nowritebarrierrec -func acquirep1(_p_ *p) { +//go:nosplit +func wirep(_p_ *p) { _g_ := getg() if _g_.m.p != 0 || _g_.m.mcache != nil { - throw("acquirep: already in go") + throw("wirep: already in go") } if _p_.m != 0 || _p_.status != _Pidle { id := int64(0) if _p_.m != 0 { id = _p_.m.ptr().id } - print("acquirep: p->m=", _p_.m, "(", id, ") p->status=", _p_.status, "\n") - throw("acquirep: invalid p state") + print("wirep: p->m=", _p_.m, "(", id, ") p->status=", _p_.status, "\n") + throw("wirep: invalid p state") } + _g_.m.mcache = _p_.mcache _g_.m.p.set(_p_) _p_.m.set(_g_.m) _p_.status = _Prunning @@ -4005,7 +3943,7 @@ func checkdead() { lock(&allglock) for i := 0; i < len(allgs); i++ { gp := allgs[i] - if isSystemGoroutine(gp) { + if isSystemGoroutine(gp, false) { continue } s := readgstatus(gp) @@ -4134,8 +4072,8 @@ func sysmon() { now := nanotime() if netpollinited() && lastpoll != 0 && lastpoll+10*1000*1000 < now { atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now)) - gp := netpoll(false) // non-blocking - returns list of goroutines - if gp != nil { + list := netpoll(false) // non-blocking - returns list of goroutines + if !list.empty() { // Need to decrement number of idle locked M's // (pretending that one more is running) before injectglist. 
// Otherwise it can lead to the following situation: @@ -4144,7 +4082,7 @@ func sysmon() { // observes that there is no work to do and no other running M's // and reports deadlock. incidlelocked(-1) - injectglist(gp) + injectglist(&list) incidlelocked(1) } } @@ -4159,8 +4097,9 @@ func sysmon() { if t := (gcTrigger{kind: gcTriggerTime, now: now}); t.test() && atomic.Load(&forcegc.idle) != 0 { lock(&forcegc.lock) forcegc.idle = 0 - forcegc.g.schedlink = 0 - injectglist(forcegc.g) + var list gList + list.push(forcegc.g) + injectglist(&list) unlock(&forcegc.lock) } // scavenge heap once in a while @@ -4337,7 +4276,7 @@ func schedtrace(detailed bool) { if mp != nil { id = mp.id } - print(" P", i, ": status=", _p_.status, " schedtick=", _p_.schedtick, " syscalltick=", _p_.syscalltick, " m=", id, " runqsize=", t-h, " gfreecnt=", _p_.gfreecnt, "\n") + print(" P", i, ": status=", _p_.status, " schedtick=", _p_.schedtick, " syscalltick=", _p_.syscalltick, " m=", id, " runqsize=", t-h, " gfreecnt=", _p_.gFree.n, "\n") } else { // In non-detailed mode format lengths of per-P run queues as: // [len1 len2 len3 len4] @@ -4373,7 +4312,7 @@ func schedtrace(detailed bool) { if lockedg != nil { id3 = lockedg.goid } - print(" M", mp.id, ": p=", id1, " curg=", id2, " mallocing=", mp.mallocing, " throwing=", mp.throwing, " preemptoff=", mp.preemptoff, ""+" locks=", mp.locks, " dying=", mp.dying, " helpgc=", mp.helpgc, " spinning=", mp.spinning, " blocked=", mp.blocked, " lockedg=", id3, "\n") + print(" M", mp.id, ": p=", id1, " curg=", id2, " mallocing=", mp.mallocing, " throwing=", mp.throwing, " preemptoff=", mp.preemptoff, ""+" locks=", mp.locks, " dying=", mp.dying, " spinning=", mp.spinning, " blocked=", mp.blocked, " lockedg=", id3, "\n") } lock(&allglock) @@ -4395,6 +4334,40 @@ func schedtrace(detailed bool) { unlock(&sched.lock) } +// schedEnableUser enables or disables the scheduling of user +// goroutines. +// +// This does not stop already running user goroutines, so the caller +// should first stop the world when disabling user goroutines. +func schedEnableUser(enable bool) { + lock(&sched.lock) + if sched.disable.user == !enable { + unlock(&sched.lock) + return + } + sched.disable.user = !enable + if enable { + n := sched.disable.n + sched.disable.n = 0 + globrunqputbatch(&sched.disable.runnable, n) + unlock(&sched.lock) + for ; n != 0 && sched.npidle != 0; n-- { + startm(nil, false) + } + } else { + unlock(&sched.lock) + } +} + +// schedEnabled reports whether gp should be scheduled. It returns +// false is scheduling of gp is disabled. +func schedEnabled(gp *g) bool { + if sched.disable.user { + return isSystemGoroutine(gp, true) + } + return true +} + // Put mp on midle list. // Sched must be locked. // May run during STW, so write barriers are not allowed. @@ -4424,13 +4397,7 @@ func mget() *m { // May run during STW, so write barriers are not allowed. //go:nowritebarrierrec func globrunqput(gp *g) { - gp.schedlink = 0 - if sched.runqtail != 0 { - sched.runqtail.ptr().schedlink.set(gp) - } else { - sched.runqhead.set(gp) - } - sched.runqtail.set(gp) + sched.runq.pushBack(gp) sched.runqsize++ } @@ -4439,25 +4406,17 @@ func globrunqput(gp *g) { // May run during STW, so write barriers are not allowed. //go:nowritebarrierrec func globrunqputhead(gp *g) { - gp.schedlink = sched.runqhead - sched.runqhead.set(gp) - if sched.runqtail == 0 { - sched.runqtail.set(gp) - } + sched.runq.push(gp) sched.runqsize++ } // Put a batch of runnable goroutines on the global runnable queue. +// This clears *batch. 
// Sched must be locked. -func globrunqputbatch(ghead *g, gtail *g, n int32) { - gtail.schedlink = 0 - if sched.runqtail != 0 { - sched.runqtail.ptr().schedlink.set(ghead) - } else { - sched.runqhead.set(ghead) - } - sched.runqtail.set(gtail) +func globrunqputbatch(batch *gQueue, n int32) { + sched.runq.pushBackAll(*batch) sched.runqsize += n + *batch = gQueue{} } // Try get a batch of G's from the global runnable queue. @@ -4479,16 +4438,11 @@ func globrunqget(_p_ *p, max int32) *g { } sched.runqsize -= n - if sched.runqsize == 0 { - sched.runqtail = 0 - } - gp := sched.runqhead.ptr() - sched.runqhead = gp.schedlink + gp := sched.runq.pop() n-- for ; n > 0; n-- { - gp1 := sched.runqhead.ptr() - sched.runqhead = gp1.schedlink + gp1 := sched.runq.pop() runqput(_p_, gp1, false) } return gp @@ -4520,7 +4474,7 @@ func pidleget() *p { return _p_ } -// runqempty returns true if _p_ has no Gs on its local run queue. +// runqempty reports whether _p_ has no Gs on its local run queue. // It never returns true spuriously. func runqempty(_p_ *p) bool { // Defend against a race where 1) _p_ has G1 in runqnext but runqhead == runqtail, @@ -4572,11 +4526,11 @@ func runqput(_p_ *p, gp *g, next bool) { } retry: - h := atomic.Load(&_p_.runqhead) // load-acquire, synchronize with consumers + h := atomic.LoadAcq(&_p_.runqhead) // load-acquire, synchronize with consumers t := _p_.runqtail if t-h < uint32(len(_p_.runq)) { _p_.runq[t%uint32(len(_p_.runq))].set(gp) - atomic.Store(&_p_.runqtail, t+1) // store-release, makes the item available for consumption + atomic.StoreRel(&_p_.runqtail, t+1) // store-release, makes the item available for consumption return } if runqputslow(_p_, gp, h, t) { @@ -4600,7 +4554,7 @@ func runqputslow(_p_ *p, gp *g, h, t uint32) bool { for i := uint32(0); i < n; i++ { batch[i] = _p_.runq[(h+i)%uint32(len(_p_.runq))].ptr() } - if !atomic.Cas(&_p_.runqhead, h, h+n) { // cas-release, commits consume + if !atomic.CasRel(&_p_.runqhead, h, h+n) { // cas-release, commits consume return false } batch[n] = gp @@ -4616,10 +4570,13 @@ func runqputslow(_p_ *p, gp *g, h, t uint32) bool { for i := uint32(0); i < n; i++ { batch[i].schedlink.set(batch[i+1]) } + var q gQueue + q.head.set(batch[0]) + q.tail.set(batch[n]) // Now put the batch on global queue. lock(&sched.lock) - globrunqputbatch(batch[0], batch[n], int32(n+1)) + globrunqputbatch(&q, int32(n+1)) unlock(&sched.lock) return true } @@ -4641,13 +4598,13 @@ func runqget(_p_ *p) (gp *g, inheritTime bool) { } for { - h := atomic.Load(&_p_.runqhead) // load-acquire, synchronize with other consumers + h := atomic.LoadAcq(&_p_.runqhead) // load-acquire, synchronize with other consumers t := _p_.runqtail if t == h { return nil, false } gp := _p_.runq[h%uint32(len(_p_.runq))].ptr() - if atomic.Cas(&_p_.runqhead, h, h+1) { // cas-release, commits consume + if atomic.CasRel(&_p_.runqhead, h, h+1) { // cas-release, commits consume return gp, false } } @@ -4659,8 +4616,8 @@ func runqget(_p_ *p) (gp *g, inheritTime bool) { // Can be executed by any P. 
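
Note: the runqput/runqget hunks above move from plain atomic.Load/Store/Cas to the runtime-internal LoadAcq/StoreRel/CasRel, making the acquire/release intent of the per-P run-queue ring explicit. Outside the runtime, sync/atomic's operations are at least that strong, so the same publish/consume pattern can be sketched as below, assuming a single producer per ring just as runqput assumes one owning P:

package main

import (
	"fmt"
	"sync/atomic"
)

const ringSize = 8

type ring struct {
	head uint32 // advanced by consumers (atomic)
	tail uint32 // advanced only by the owning producer (atomic store)
	buf  [ringSize]int
}

// push publishes v if there is room: write the slot first, then make it
// visible with an atomic store of tail (the "store-release" in runqput).
func (r *ring) push(v int) bool {
	h := atomic.LoadUint32(&r.head) // synchronize with consumers
	t := r.tail                     // only the producer writes tail
	if t-h >= ringSize {
		return false // full
	}
	r.buf[t%ringSize] = v
	atomic.StoreUint32(&r.tail, t+1)
	return true
}

// pop takes one item; a CAS on head commits the consume, as in runqget.
func (r *ring) pop() (int, bool) {
	for {
		h := atomic.LoadUint32(&r.head)
		t := atomic.LoadUint32(&r.tail)
		if t == h {
			return 0, false
		}
		v := r.buf[h%ringSize]
		if atomic.CompareAndSwapUint32(&r.head, h, h+1) {
			return v, true
		}
	}
}

func main() {
	var r ring
	r.push(1)
	r.push(2)
	fmt.Println(r.pop()) // 1 true
	fmt.Println(r.pop()) // 2 true
	fmt.Println(r.pop()) // 0 false
}
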
func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool) uint32 { for { - h := atomic.Load(&_p_.runqhead) // load-acquire, synchronize with other consumers - t := atomic.Load(&_p_.runqtail) // load-acquire, synchronize with the producer + h := atomic.LoadAcq(&_p_.runqhead) // load-acquire, synchronize with other consumers + t := atomic.LoadAcq(&_p_.runqtail) // load-acquire, synchronize with the producer n := t - h n = n - n/2 if n == 0 { @@ -4703,7 +4660,7 @@ func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool g := _p_.runq[(h+i)%uint32(len(_p_.runq))] batch[(batchHead+i)%uint32(len(batch))] = g } - if atomic.Cas(&_p_.runqhead, h, h+n) { // cas-release, commits consume + if atomic.CasRel(&_p_.runqhead, h, h+n) { // cas-release, commits consume return n } } @@ -4723,11 +4680,112 @@ func runqsteal(_p_, p2 *p, stealRunNextG bool) *g { if n == 0 { return gp } - h := atomic.Load(&_p_.runqhead) // load-acquire, synchronize with consumers + h := atomic.LoadAcq(&_p_.runqhead) // load-acquire, synchronize with consumers if t-h+n >= uint32(len(_p_.runq)) { throw("runqsteal: runq overflow") } - atomic.Store(&_p_.runqtail, t+n) // store-release, makes the item available for consumption + atomic.StoreRel(&_p_.runqtail, t+n) // store-release, makes the item available for consumption + return gp +} + +// A gQueue is a dequeue of Gs linked through g.schedlink. A G can only +// be on one gQueue or gList at a time. +type gQueue struct { + head guintptr + tail guintptr +} + +// empty reports whether q is empty. +func (q *gQueue) empty() bool { + return q.head == 0 +} + +// push adds gp to the head of q. +func (q *gQueue) push(gp *g) { + gp.schedlink = q.head + q.head.set(gp) + if q.tail == 0 { + q.tail.set(gp) + } +} + +// pushBack adds gp to the tail of q. +func (q *gQueue) pushBack(gp *g) { + gp.schedlink = 0 + if q.tail != 0 { + q.tail.ptr().schedlink.set(gp) + } else { + q.head.set(gp) + } + q.tail.set(gp) +} + +// pushBackAll adds all Gs in l2 to the tail of q. After this q2 must +// not be used. +func (q *gQueue) pushBackAll(q2 gQueue) { + if q2.tail == 0 { + return + } + q2.tail.ptr().schedlink = 0 + if q.tail != 0 { + q.tail.ptr().schedlink = q2.head + } else { + q.head = q2.head + } + q.tail = q2.tail +} + +// pop removes and returns the head of queue q. It returns nil if +// q is empty. +func (q *gQueue) pop() *g { + gp := q.head.ptr() + if gp != nil { + q.head = gp.schedlink + if q.head == 0 { + q.tail = 0 + } + } + return gp +} + +// popList takes all Gs in q and returns them as a gList. +func (q *gQueue) popList() gList { + stack := gList{q.head} + *q = gQueue{} + return stack +} + +// A gList is a list of Gs linked through g.schedlink. A G can only be +// on one gQueue or gList at a time. +type gList struct { + head guintptr +} + +// empty reports whether l is empty. +func (l *gList) empty() bool { + return l.head == 0 +} + +// push adds gp to the head of l. +func (l *gList) push(gp *g) { + gp.schedlink = l.head + l.head.set(gp) +} + +// pushAll prepends all Gs in q to l. +func (l *gList) pushAll(q gQueue) { + if !q.empty() { + q.tail.ptr().schedlink = l.head + l.head = q.head + } +} + +// pop removes and returns the head of l. If l is empty, it returns nil. 
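The gQueue and gList types introduced above are intrusive lists: the link lives in the G itself (g.schedlink), so moving a G between queues allocates nothing and splicing one whole queue onto another is O(1), which is exactly what globrunqputbatch now exploits. Here is a small sketch of the same idea with an ordinary struct; node and queue are hypothetical names, and the real types carry extra invariants (a G may sit on only one gQueue or gList at a time).

package main

import "fmt"

// node plays the role of a g: the link is embedded in the element itself,
// like g.schedlink, so no per-element allocation is needed to queue it.
type node struct {
	id   int
	next *node
}

// queue mirrors gQueue: head and tail pointers over the intrusive link.
type queue struct {
	head, tail *node
}

func (q *queue) pushBack(n *node) {
	n.next = nil
	if q.tail != nil {
		q.tail.next = n
	} else {
		q.head = n
	}
	q.tail = n
}

// pushBackAll splices q2 onto q in O(1), the way globrunqputbatch now hands
// a whole batch to sched.runq; q2 must not be used afterwards.
func (q *queue) pushBackAll(q2 queue) {
	if q2.tail == nil {
		return
	}
	if q.tail != nil {
		q.tail.next = q2.head
	} else {
		q.head = q2.head
	}
	q.tail = q2.tail
}

func (q *queue) pop() *node {
	n := q.head
	if n != nil {
		q.head = n.next
		if q.head == nil {
			q.tail = nil
		}
	}
	return n
}

func main() {
	var a, b queue
	a.pushBack(&node{id: 1})
	b.pushBack(&node{id: 2})
	b.pushBack(&node{id: 3})
	a.pushBackAll(b)
	for n := a.pop(); n != nil; n = a.pop() {
		fmt.Println(n.id) // 1, 2, 3
	}
}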
+func (l *gList) pop() *g { + gp := l.head.ptr() + if gp != nil { + l.head = gp.schedlink + } return gp } diff --git a/libgo/go/runtime/proc_test.go b/libgo/go/runtime/proc_test.go index 82a2fe486a9..da5ffbbdee2 100644 --- a/libgo/go/runtime/proc_test.go +++ b/libgo/go/runtime/proc_test.go @@ -898,11 +898,22 @@ func testLockOSThreadExit(t *testing.T, prog string) { output := runTestProg(t, prog, "LockOSThreadMain", "GOMAXPROCS=1") want := "OK\n" if output != want { - t.Errorf("want %s, got %s\n", want, output) + t.Errorf("want %q, got %q", want, output) } output = runTestProg(t, prog, "LockOSThreadAlt") if output != want { - t.Errorf("want %s, got %s\n", want, output) + t.Errorf("want %q, got %q", want, output) + } +} + +func TestLockOSThreadAvoidsStatePropagation(t *testing.T) { + want := "OK\n" + skip := "unshare not permitted\n" + output := runTestProg(t, "testprog", "LockOSThreadAvoidsStatePropagation", "GOMAXPROCS=1") + if output == skip { + t.Skip("unshare syscall not permitted on this system") + } else if output != want { + t.Errorf("want %q, got %q", want, output) } } diff --git a/libgo/go/runtime/runtime1.go b/libgo/go/runtime/runtime1.go index 050f180c407..4e771629b0e 100644 --- a/libgo/go/runtime/runtime1.go +++ b/libgo/go/runtime/runtime1.go @@ -157,7 +157,7 @@ func check() { h uint64 i, i1 float32 j, j1 float64 - k, k1 unsafe.Pointer + k unsafe.Pointer l *uint16 m [4]byte ) @@ -246,21 +246,6 @@ func check() { throw("cas6") } - k = unsafe.Pointer(uintptr(0xfedcb123)) - if sys.PtrSize == 8 { - k = unsafe.Pointer(uintptr(k) << 10) - } - if casp(&k, nil, nil) { - throw("casp1") - } - k1 = add(k, 1) - if !casp(&k, k, k1) { - throw("casp2") - } - if k != k1 { - throw("casp3") - } - m = [4]byte{1, 1, 1, 1} atomic.Or8(&m[1], 0xf0) if m[0] != 1 || m[1] != 0xf1 || m[2] != 1 || m[3] != 1 { @@ -332,10 +317,10 @@ var debug struct { gccheckmark int32 gcpacertrace int32 gcshrinkstackoff int32 - gcrescanstacks int32 gcstoptheworld int32 gctrace int32 invalidptr int32 + madvdontneed int32 // for Linux; issue 28466 sbrk int32 scavenge int32 scheddetail int32 @@ -350,10 +335,10 @@ var dbgvars = []dbgVar{ {"gccheckmark", &debug.gccheckmark}, {"gcpacertrace", &debug.gcpacertrace}, {"gcshrinkstackoff", &debug.gcshrinkstackoff}, - {"gcrescanstacks", &debug.gcrescanstacks}, {"gcstoptheworld", &debug.gcstoptheworld}, {"gctrace", &debug.gctrace}, {"invalidptr", &debug.invalidptr}, + {"madvdontneed", &debug.madvdontneed}, {"sbrk", &debug.sbrk}, {"scavenge", &debug.scavenge}, {"scheddetail", &debug.scheddetail}, @@ -454,7 +439,9 @@ func timediv(v int64, div int32, rem *int32) int32 { for bit := 30; bit >= 0; bit-- { if v >= int64(div)<<uint(bit) { v = v - (int64(div) << uint(bit)) - res += 1 << uint(bit) + // Before this for loop, res was 0, thus all these + // power of 2 increments are now just bitsets. 
+ res |= 1 << uint(bit) } } if v >= int64(div) { diff --git a/libgo/go/runtime/runtime2.go b/libgo/go/runtime/runtime2.go index 0e9cf63c10d..4cd68da2ea2 100644 --- a/libgo/go/runtime/runtime2.go +++ b/libgo/go/runtime/runtime2.go @@ -5,6 +5,7 @@ package runtime import ( + "internal/cpu" "runtime/internal/atomic" "runtime/internal/sys" "unsafe" @@ -439,7 +440,12 @@ type g struct { scang uintptr // the g that wants to scan this g's stack (uintptr to avoid write barrier) scangcw uintptr // gc worker for scanning stack (uintptr to avoid write barrier) - isSystemGoroutine bool // whether goroutine is a "system" goroutine + isSystemGoroutine bool // whether goroutine is a "system" goroutine + isFinalizerGoroutine bool // whether goroutine is the finalizer goroutine + + deferring bool // whether we are running a deferred function + goexiting bool // whether we are running Goexit + ranCgocallBackDone bool // whether we deferred CgocallBackDone traceback uintptr // stack traceback buffer @@ -463,6 +469,7 @@ type m struct { caughtsig guintptr // goroutine running during fatal signal p puintptr // attached p for executing go code (nil if not executing go code) nextp puintptr + oldp puintptr // the p that was attached before executing a syscall id int64 mallocing int32 throwing int32 @@ -471,7 +478,6 @@ type m struct { softfloat int32 dying int32 profilehz int32 - helpgc int32 spinning bool // m is out of work and is actively looking for work blocked bool // m is blocked on a note inwb bool // m is executing a write barrier @@ -564,8 +570,10 @@ type p struct { runnext guintptr // Available G's (status == Gdead) - gfree *g - gfreecnt int32 + gFree struct { + gList + n int32 + } sudogcache []*sudog sudogbuf [128]*sudog @@ -604,7 +612,7 @@ type p struct { runSafePointFn uint32 // if 1, run sched.safePointFn at next safe point - pad [sys.CacheLineSize]byte + pad cpu.CacheLinePad } type schedt struct { @@ -632,14 +640,27 @@ type schedt struct { nmspinning uint32 // See "Worker thread parking/unparking" comment in proc.go. // Global runnable queue. - runqhead guintptr - runqtail guintptr + runq gQueue runqsize int32 + // disable controls selective disabling of the scheduler. + // + // Use schedEnableUser to control this. + // + // disable is protected by sched.lock. + disable struct { + // user disables scheduling of user goroutines. + user bool + runnable gQueue // pending runnable Gs + n int32 // length of runnable + } + // Global cache of dead G's. - gflock mutex - gfree *g - ngfree int32 + gFree struct { + lock mutex + list gList // Gs + n int32 + } // Central cache of sudog structs. sudoglock mutex diff --git a/libgo/go/runtime/select.go b/libgo/go/runtime/select.go index fb8373f53b5..d658a349ed2 100644 --- a/libgo/go/runtime/select.go +++ b/libgo/go/runtime/select.go @@ -105,7 +105,7 @@ func block() { // // selectgo returns the index of the chosen scase, which matches the // ordinal position of its respective select{recv,send,default} call. -// Also, if the chosen scase was a receive operation, it returns whether +// Also, if the chosen scase was a receive operation, it reports whether // a value was received. 
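A note on the padding change above: replacing p's pad [sys.CacheLineSize]byte with cpu.CacheLinePad (and, in sema.go just below, sizing the semtable pad from cpu.CacheLinePadSize) keeps the same purpose, which is to stop two Ps or two semaRoot buckets from sharing a cache line and invalidating each other's hot fields on every write (false sharing). internal/cpu is not importable outside the standard library, so the sketch below assumes a 64-byte line; counter is a hypothetical type and the program only illustrates the layout, it does not measure the effect.

package main

import (
	"fmt"
	"sync"
	"unsafe"
)

// counter holds one hot, independently updated field per worker. The pad
// pushes each element onto its own cache line so concurrent increments do
// not keep stealing the same line from each other. 64 bytes is an assumed
// common line size; the runtime gets the real value via internal/cpu.
type counter struct {
	n   uint64
	pad [64 - unsafe.Sizeof(uint64(0))]byte
}

func main() {
	counters := make([]counter, 4)
	var wg sync.WaitGroup
	for i := range counters {
		wg.Add(1)
		go func(c *counter) {
			defer wg.Done()
			for j := 0; j < 1000000; j++ {
				c.n++ // each goroutine owns its own counter, so no atomics needed
			}
		}(&counters[i])
	}
	wg.Wait()
	var total uint64
	for i := range counters {
		total += counters[i].n
	}
	fmt.Println("total:", total, "sizeof(counter):", unsafe.Sizeof(counter{}))
}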
func selectgo(cas0 *scase, order0 *uint16, ncases int) (int, bool) { if debugSelect { diff --git a/libgo/go/runtime/sema.go b/libgo/go/runtime/sema.go index 273e8aa54d9..2c7ad329662 100644 --- a/libgo/go/runtime/sema.go +++ b/libgo/go/runtime/sema.go @@ -20,8 +20,8 @@ package runtime import ( + "internal/cpu" "runtime/internal/atomic" - "runtime/internal/sys" "unsafe" ) @@ -48,7 +48,7 @@ const semTabSize = 251 var semtable [semTabSize]struct { root semaRoot - pad [sys.CacheLineSize - unsafe.Sizeof(semaRoot{})]byte + pad [cpu.CacheLinePadSize - unsafe.Sizeof(semaRoot{})]byte } //go:linkname sync_runtime_Semacquire sync.runtime_Semacquire diff --git a/libgo/go/runtime/semasleep_test.go b/libgo/go/runtime/semasleep_test.go new file mode 100644 index 00000000000..5b2cc64483f --- /dev/null +++ b/libgo/go/runtime/semasleep_test.go @@ -0,0 +1,88 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//+build !nacl,!plan9,!windows,!js + +package runtime_test + +import ( + "internal/testenv" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "syscall" + "testing" + "time" +) + +// Issue #27250. Spurious wakeups to pthread_cond_timedwait_relative_np +// shouldn't cause semasleep to retry with the same timeout which would +// cause indefinite spinning. +func TestSpuriousWakeupsNeverHangSemasleep(t *testing.T) { + testenv.MustHaveGoBuild(t) + tempDir, err := ioutil.TempDir("", "issue-27250") + if err != nil { + t.Fatalf("Failed to create the temp directory: %v", err) + } + defer os.RemoveAll(tempDir) + + repro := ` + package main + + import "time" + + func main() { + <-time.After(1 * time.Second) + } + ` + mainPath := filepath.Join(tempDir, "main.go") + if err := ioutil.WriteFile(mainPath, []byte(repro), 0644); err != nil { + t.Fatalf("Failed to create temp file for repro.go: %v", err) + } + binaryPath := filepath.Join(tempDir, "binary") + + // Build the binary so that we can send the signal to its PID. + out, err := exec.Command(testenv.GoToolPath(t), "build", "-o", binaryPath, mainPath).CombinedOutput() + if err != nil { + t.Fatalf("Failed to compile the binary: err: %v\nOutput: %s\n", err, out) + } + if err := os.Chmod(binaryPath, 0755); err != nil { + t.Fatalf("Failed to chmod binary: %v", err) + } + + // Now run the binary. + cmd := exec.Command(binaryPath) + if err := cmd.Start(); err != nil { + t.Fatalf("Failed to start command: %v", err) + } + doneCh := make(chan error, 1) + go func() { + doneCh <- cmd.Wait() + }() + + // With the repro running, we can continuously send to it + // a non-terminal signal such as SIGIO, to spuriously + // wakeup pthread_cond_timedwait_relative_np. + unfixedTimer := time.NewTimer(2 * time.Second) + for { + select { + case <-time.After(200 * time.Millisecond): + // Send the pesky signal that toggles spinning + // indefinitely if #27520 is not fixed. 
+ cmd.Process.Signal(syscall.SIGIO) + + case <-unfixedTimer.C: + t.Error("Program failed to return on time and has to be killed, issue #27520 still exists") + cmd.Process.Signal(syscall.SIGKILL) + return + + case err := <-doneCh: + if err != nil { + t.Fatalf("The program returned but unfortunately with an error: %v", err) + } + return + } + } +} diff --git a/libgo/go/runtime/signal_unix.go b/libgo/go/runtime/signal_unix.go index 84623d33c66..0a2cf724c0d 100644 --- a/libgo/go/runtime/signal_unix.go +++ b/libgo/go/runtime/signal_unix.go @@ -725,7 +725,7 @@ func unminitSignals() { } } -// blockableSig returns whether sig may be blocked by the signal mask. +// blockableSig reports whether sig may be blocked by the signal mask. // We never want to block the signals marked _SigUnblock; // these are the synchronous signals that turn into a Go panic. // In a Go program--not a c-archive/c-shared--we never want to block diff --git a/libgo/go/runtime/slice.go b/libgo/go/runtime/slice.go index 7f9db4efa9e..335532d126b 100644 --- a/libgo/go/runtime/slice.go +++ b/libgo/go/runtime/slice.go @@ -5,6 +5,7 @@ package runtime import ( + "runtime/internal/math" "runtime/internal/sys" "unsafe" ) @@ -31,28 +32,6 @@ type notInHeapSlice struct { cap int } -// maxElems is a lookup table containing the maximum capacity for a slice. -// The index is the size of the slice element. -var maxElems = [...]uintptr{ - ^uintptr(0), - maxAlloc / 1, maxAlloc / 2, maxAlloc / 3, maxAlloc / 4, - maxAlloc / 5, maxAlloc / 6, maxAlloc / 7, maxAlloc / 8, - maxAlloc / 9, maxAlloc / 10, maxAlloc / 11, maxAlloc / 12, - maxAlloc / 13, maxAlloc / 14, maxAlloc / 15, maxAlloc / 16, - maxAlloc / 17, maxAlloc / 18, maxAlloc / 19, maxAlloc / 20, - maxAlloc / 21, maxAlloc / 22, maxAlloc / 23, maxAlloc / 24, - maxAlloc / 25, maxAlloc / 26, maxAlloc / 27, maxAlloc / 28, - maxAlloc / 29, maxAlloc / 30, maxAlloc / 31, maxAlloc / 32, -} - -// maxSliceCap returns the maximum capacity for a slice. -func maxSliceCap(elemsize uintptr) uintptr { - if elemsize < uintptr(len(maxElems)) { - return maxElems[elemsize] - } - return maxAlloc / elemsize -} - func panicmakeslicelen() { panic(errorString("makeslice: len out of range")) } @@ -62,21 +41,21 @@ func panicmakeslicecap() { } func makeslice(et *_type, len, cap int) unsafe.Pointer { - // NOTE: The len > maxElements check here is not strictly necessary, - // but it produces a 'len out of range' error instead of a 'cap out of range' error - // when someone does make([]T, bignumber). 'cap out of range' is true too, - // but since the cap is only being supplied implicitly, saying len is clearer. - // See issue 4085. - maxElements := maxSliceCap(et.size) - if len < 0 || uintptr(len) > maxElements { - panicmakeslicelen() - } - - if cap < len || uintptr(cap) > maxElements { + mem, overflow := math.MulUintptr(et.size, uintptr(cap)) + if overflow || mem > maxAlloc || len < 0 || len > cap { + // NOTE: Produce a 'len out of range' error instead of a + // 'cap out of range' error when someone does make([]T, bignumber). + // 'cap out of range' is true too, but since the cap is only being + // supplied implicitly, saying len is clearer. + // See golang.org/issue/4085. 
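The makeslice rewrite above replaces the maxElems lookup table with a single overflow-aware multiply, mem, overflow := math.MulUintptr(et.size, uintptr(cap)), rejecting the allocation if the product overflows or exceeds maxAlloc. runtime/internal/math is runtime-only, but math/bits exposes the same kind of full-width multiply to user code. A hedged sketch follows; checkedSize, errTooBig and the maxAlloc value are made up for illustration.

package main

import (
	"errors"
	"fmt"
	"math/bits"
)

// maxAlloc here is only an illustrative cap, not the runtime's constant.
const maxAlloc = 1 << 30

var errTooBig = errors.New("allocation size out of range")

// checkedSize mirrors the shape of the new makeslice check: one full-width
// multiply detects overflow directly, replacing the old per-element-size
// lookup table (maxElems).
func checkedSize(elemSize, n uint) (uint, error) {
	hi, lo := bits.Mul(elemSize, n)
	if hi != 0 || lo > maxAlloc {
		return 0, errTooBig
	}
	return lo, nil
}

func main() {
	if sz, err := checkedSize(8, 1<<20); err == nil {
		fmt.Println("ok, bytes needed:", sz)
	}
	if _, err := checkedSize(1<<16, 1<<16); err != nil {
		fmt.Println("rejected:", err) // overflows on 32-bit, exceeds the cap on 64-bit
	}
}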
+ mem, overflow := math.MulUintptr(et.size, uintptr(len)) + if overflow || mem > maxAlloc || len < 0 { + panicmakeslicelen() + } panicmakeslicecap() } - return mallocgc(et.size*uintptr(cap), et, true) + return mallocgc(mem, et, true) } func makeslice64(et *_type, len64, cap64 int64) unsafe.Pointer { @@ -107,10 +86,11 @@ func growslice(et *_type, old slice, cap int) slice { msanread(old.array, uintptr(old.len*int(et.size))) } + if cap < old.cap { + panic(errorString("growslice: cap out of range")) + } + if et.size == 0 { - if cap < old.cap { - panic(errorString("growslice: cap out of range")) - } // append should not create a slice with nil pointer but non-zero len. // We assume that append doesn't need to preserve old.array in this case. return slice{unsafe.Pointer(&zerobase), cap, cap} @@ -172,15 +152,14 @@ func growslice(et *_type, old slice, cap int) slice { default: lenmem = uintptr(old.len) * et.size newlenmem = uintptr(cap) * et.size - capmem = roundupsize(uintptr(newcap) * et.size) - overflow = uintptr(newcap) > maxSliceCap(et.size) + capmem, overflow = math.MulUintptr(et.size, uintptr(newcap)) + capmem = roundupsize(capmem) newcap = int(capmem / et.size) } - // The check of overflow (uintptr(newcap) > maxSliceCap(et.size)) - // in addition to capmem > _MaxMem is needed to prevent an overflow - // which can be used to trigger a segfault on 32bit architectures - // with this example program: + // The check of overflow in addition to capmem > maxAlloc is needed + // to prevent an overflow which can be used to trigger a segfault + // on 32bit architectures with this example program: // // type T [1<<27 + 1]int64 // @@ -191,28 +170,26 @@ func growslice(et *_type, old slice, cap int) slice { // s = append(s, d, d, d, d) // print(len(s), "\n") // } - if cap < old.cap || overflow || capmem > maxAlloc { + if overflow || capmem > maxAlloc { panic(errorString("growslice: cap out of range")) } var p unsafe.Pointer if et.kind&kindNoPointers != 0 { p = mallocgc(capmem, nil, false) - memmove(p, old.array, lenmem) // The append() that calls growslice is going to overwrite from old.len to cap (which will be the new length). // Only clear the part that will not be overwritten. memclrNoHeapPointers(add(p, newlenmem), capmem-newlenmem) } else { // Note: can't use rawmem (which avoids zeroing of memory), because then GC can scan uninitialized memory. p = mallocgc(capmem, et, true) - if !writeBarrier.enabled { - memmove(p, old.array, lenmem) - } else { - for i := uintptr(0); i < lenmem; i += et.size { - typedmemmove(et, add(p, i), add(old.array, i)) - } + if writeBarrier.enabled { + // Only shade the pointers in old.array since we know the destination slice p + // only contains nil pointers because it has been cleared during alloc. 
+ bulkBarrierPreWriteSrcOnly(uintptr(p), uintptr(old.array), lenmem) } } + memmove(p, old.array, lenmem) return slice{p, cap, newcap} } diff --git a/libgo/go/runtime/slice_test.go b/libgo/go/runtime/slice_test.go index c2dfb7afd19..0463fc70a76 100644 --- a/libgo/go/runtime/slice_test.go +++ b/libgo/go/runtime/slice_test.go @@ -10,20 +10,68 @@ import ( const N = 20 -func BenchmarkMakeSlice(b *testing.B) { - var x []byte - for i := 0; i < b.N; i++ { - x = make([]byte, 32) - _ = x - } -} - type ( struct24 struct{ a, b, c int64 } struct32 struct{ a, b, c, d int64 } struct40 struct{ a, b, c, d, e int64 } ) +func BenchmarkMakeSlice(b *testing.B) { + const length = 2 + b.Run("Byte", func(b *testing.B) { + var x []byte + for i := 0; i < b.N; i++ { + x = make([]byte, length, 2*length) + _ = x + } + }) + b.Run("Int16", func(b *testing.B) { + var x []int16 + for i := 0; i < b.N; i++ { + x = make([]int16, length, 2*length) + _ = x + } + }) + b.Run("Int", func(b *testing.B) { + var x []int + for i := 0; i < b.N; i++ { + x = make([]int, length, 2*length) + _ = x + } + }) + b.Run("Ptr", func(b *testing.B) { + var x []*byte + for i := 0; i < b.N; i++ { + x = make([]*byte, length, 2*length) + _ = x + } + }) + b.Run("Struct", func(b *testing.B) { + b.Run("24", func(b *testing.B) { + var x []struct24 + for i := 0; i < b.N; i++ { + x = make([]struct24, length, 2*length) + _ = x + } + }) + b.Run("32", func(b *testing.B) { + var x []struct32 + for i := 0; i < b.N; i++ { + x = make([]struct32, length, 2*length) + _ = x + } + }) + b.Run("40", func(b *testing.B) { + var x []struct40 + for i := 0; i < b.N; i++ { + x = make([]struct40, length, 2*length) + _ = x + } + }) + + }) +} + func BenchmarkGrowSlice(b *testing.B) { b.Run("Byte", func(b *testing.B) { x := make([]byte, 9) diff --git a/libgo/go/runtime/string.go b/libgo/go/runtime/string.go index 5296ebddf39..025ea7a3bd7 100644 --- a/libgo/go/runtime/string.go +++ b/libgo/go/runtime/string.go @@ -146,7 +146,8 @@ func rawstringtmp(buf *tmpBuf, l int) (s string, b []byte) { // and otherwise intrinsified by the compiler. // // Some internal compiler optimizations use this function. -// - Used for m[string(k)] lookup where m is a string-keyed map and k is a []byte. +// - Used for m[T1{... Tn{..., string(k), ...} ...}] and m[string(k)] +// where k is []byte, T1 to Tn is a nesting of struct and array literals. // - Used for "<"+string(b)+">" concatenation where b is []byte. // - Used for string(b)=="foo" comparison where b is []byte. func slicebytetostringtmp(b []byte) string { @@ -344,7 +345,7 @@ func index(s, t string) int { return 0 } for i := 0; i < len(s); i++ { - if s[i] == t[0] && hasprefix(s[i:], t) { + if s[i] == t[0] && hasPrefix(s[i:], t) { return i } } @@ -355,8 +356,8 @@ func contains(s, t string) bool { return index(s, t) >= 0 } -func hasprefix(s, t string) bool { - return len(s) >= len(t) && s[:len(t)] == t +func hasPrefix(s, prefix string) bool { + return len(s) >= len(prefix) && s[:len(prefix)] == prefix } const ( diff --git a/libgo/go/runtime/string_test.go b/libgo/go/runtime/string_test.go index 03327bbb1e2..ec83bb4d458 100644 --- a/libgo/go/runtime/string_test.go +++ b/libgo/go/runtime/string_test.go @@ -243,6 +243,36 @@ func TestCompareTempString(t *testing.T) { } } +func TestStringIndexHaystack(t *testing.T) { + // See issue 25864. + haystack := []byte("hello") + needle := "ll" + n := testing.AllocsPerRun(1000, func() { + if strings.Index(string(haystack), needle) != 2 { + t.Fatalf("needle not found") + } + }) + // was n != 0, changed for gccgo. 
+ if n > 1 { + t.Fatalf("want 0 allocs, got %v", n) + } +} + +func TestStringIndexNeedle(t *testing.T) { + // See issue 25864. + haystack := "hello" + needle := []byte("ll") + n := testing.AllocsPerRun(1000, func() { + if strings.Index(haystack, string(needle)) != 2 { + t.Fatalf("needle not found") + } + }) + // was n != 0, changed for gccgo + if n > 1 { + t.Fatalf("want 0 allocs, got %v", n) + } +} + func TestStringOnStack(t *testing.T) { s := "" for i := 0; i < 3; i++ { diff --git a/libgo/go/runtime/stubs.go b/libgo/go/runtime/stubs.go index 4caa39d5e94..f2b0ee8abaa 100644 --- a/libgo/go/runtime/stubs.go +++ b/libgo/go/runtime/stubs.go @@ -84,12 +84,12 @@ func badsystemstack() { // used only when the caller knows that *ptr contains no heap pointers // because either: // -// 1. *ptr is initialized memory and its type is pointer-free. +// *ptr is initialized memory and its type is pointer-free, or // -// 2. *ptr is uninitialized memory (e.g., memory that's being reused -// for a new allocation) and hence contains only "junk". +// *ptr is uninitialized memory (e.g., memory that's being reused +// for a new allocation) and hence contains only "junk". // -// in memclr_*.s +// The (CPU-specific) implementations of this function are in memclr_*.s. //go:noescape func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr) @@ -164,7 +164,7 @@ func breakpoint() func asminit() {} -//go:linkname reflectcall reflect.call +//go:linkname reflectcall runtime.reflectcall //go:noescape func reflectcall(fntype *functype, fn *funcval, isInterface, isMethod bool, params, results *unsafe.Pointer) @@ -454,3 +454,15 @@ var usestackmaps bool // probestackmaps detects whether there are stack maps. //go:linkname probestackmaps runtime.probestackmaps func probestackmaps() bool + +// For the math/bits packages for gccgo. +//go:linkname getDivideError runtime.getDivideError +func getDivideError() error { + return divideError +} + +// For the math/bits packages for gccgo. +//go:linkname getOverflowError runtime.getOverflowError +func getOverflowError() error { + return overflowError +} diff --git a/libgo/go/runtime/stubs2.go b/libgo/go/runtime/stubs2.go index 1cb910cd111..304c8e4da37 100644 --- a/libgo/go/runtime/stubs2.go +++ b/libgo/go/runtime/stubs2.go @@ -7,6 +7,7 @@ // +build !nacl // +build !js // +build !darwin +// +build !aix package runtime diff --git a/libgo/go/runtime/stubs3.go b/libgo/go/runtime/stubs3.go index 1af693be2e6..d3e331ab9da 100644 --- a/libgo/go/runtime/stubs3.go +++ b/libgo/go/runtime/stubs3.go @@ -7,6 +7,7 @@ // +build !nacl // +build !freebsd // +build !darwin +// +build !aix package runtime diff --git a/libgo/go/runtime/symtab.go b/libgo/go/runtime/symtab.go index d137122137c..d7e8c18b517 100644 --- a/libgo/go/runtime/symtab.go +++ b/libgo/go/runtime/symtab.go @@ -7,7 +7,7 @@ package runtime // Frames may be used to get function/file/line information for a // slice of PC values returned by Callers. type Frames struct { - // callers is a slice of PCs that have not yet been expanded. + // callers is a slice of PCs that have not yet been expanded to frames. callers []uintptr // The last PC we saw. @@ -125,7 +125,7 @@ type Func struct { // Note that in some situations involving plugins, there may be multiple // copies of a particular special runtime function. // Note: this list must match the list in cmd/internal/objabi/funcid.go. 
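The Frames comment tweak above is cosmetic, but it is worth recalling the intended use of that API: capture raw PCs with runtime.Callers, then expand them with runtime.CallersFrames, which (unlike per-PC FuncForPC lookups) yields one frame per inlined call. A short, self-contained usage sketch; printCallers and middle are just example names.

package main

import (
	"fmt"
	"runtime"
)

// printCallers captures raw PCs with runtime.Callers and expands them with
// CallersFrames; the iterator splits inlined calls into separate frames,
// which looking up each PC with FuncForPC would not.
func printCallers() {
	pcs := make([]uintptr, 16)
	n := runtime.Callers(2, pcs) // skip runtime.Callers and printCallers itself
	frames := runtime.CallersFrames(pcs[:n])
	for {
		frame, more := frames.Next()
		fmt.Printf("%s\n\t%s:%d\n", frame.Function, frame.File, frame.Line)
		if !more {
			break
		}
	}
}

func middle() { printCallers() }

func main() { middle() }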
-type funcID uint32 +type funcID uint8 const ( funcID_normal funcID = iota // not a special function @@ -146,13 +146,17 @@ const ( funcID_gogo funcID_externalthreadhandler funcID_debugCallV1 + funcID_gopanic + funcID_panicwrap + funcID_wrapper // any autogenerated code (hash/eq algorithms, method wrappers, etc.) ) // FuncForPC returns a *Func describing the function that contains the // given program counter address, or else nil. // // If pc represents multiple functions because of inlining, it returns -// the *Func describing the outermost function. +// the a *Func describing the innermost function, but with an entry +// of the outermost function. func FuncForPC(pc uintptr) *Func { name, _, _ := funcfileline(pc, -1) if name == "" { diff --git a/libgo/go/runtime/sys_darwin.go b/libgo/go/runtime/sys_darwin.go index 7efbef746cd..f34ac883524 100644 --- a/libgo/go/runtime/sys_darwin.go +++ b/libgo/go/runtime/sys_darwin.go @@ -50,6 +50,61 @@ func libcCall(fn, arg unsafe.Pointer) int32 { return res } +// The X versions of syscall expect the libc call to return a 64-bit result. +// Otherwise (the non-X version) expects a 32-bit result. +// This distinction is required because an error is indicated by returning -1, +// and we need to know whether to check 32 or 64 bits of the result. +// (Some libc functions that return 32 bits put junk in the upper 32 bits of AX.) + +//go:linkname syscall_syscall syscall.syscall +//go:nosplit +//go:cgo_unsafe_args +func syscall_syscall(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) { + entersyscallblock() + libcCall(unsafe.Pointer(funcPC(syscall)), unsafe.Pointer(&fn)) + exitsyscall() + return +} +func syscall() + +//go:linkname syscall_syscall6 syscall.syscall6 +//go:nosplit +//go:cgo_unsafe_args +func syscall_syscall6(fn, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr) { + entersyscallblock() + libcCall(unsafe.Pointer(funcPC(syscall6)), unsafe.Pointer(&fn)) + exitsyscall() + return +} +func syscall6() + +//go:linkname syscall_syscall6X syscall.syscall6X +//go:nosplit +//go:cgo_unsafe_args +func syscall_syscall6X(fn, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr) { + entersyscallblock() + libcCall(unsafe.Pointer(funcPC(syscall6X)), unsafe.Pointer(&fn)) + exitsyscall() + return +} +func syscall6X() + +//go:linkname syscall_rawSyscall syscall.rawSyscall +//go:nosplit +//go:cgo_unsafe_args +func syscall_rawSyscall(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) { + libcCall(unsafe.Pointer(funcPC(syscall)), unsafe.Pointer(&fn)) + return +} + +//go:linkname syscall_rawSyscall6 syscall.rawSyscall6 +//go:nosplit +//go:cgo_unsafe_args +func syscall_rawSyscall6(fn, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr) { + libcCall(unsafe.Pointer(funcPC(syscall6)), unsafe.Pointer(&fn)) + return +} + // The *_trampoline functions convert from the Go calling convention to the C calling convention // and then call the underlying libc function. They are defined in sys_darwin_$ARCH.s. @@ -370,5 +425,5 @@ func closeonexec(fd int32) { //go:cgo_import_dynamic libc_pthread_cond_signal pthread_cond_signal "/usr/lib/libSystem.B.dylib" // Magic incantation to get libSystem actually dynamically linked. -// TODO: Why does the code require this? See cmd/compile/internal/ld/go.go:210 +// TODO: Why does the code require this? 
See cmd/link/internal/ld/go.go //go:cgo_import_dynamic _ _ "/usr/lib/libSystem.B.dylib" diff --git a/libgo/go/runtime/sys_darwin_32.go b/libgo/go/runtime/sys_darwin_32.go new file mode 100644 index 00000000000..2f17091327d --- /dev/null +++ b/libgo/go/runtime/sys_darwin_32.go @@ -0,0 +1,32 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build darwin +// +build 386 arm + +package runtime + +import "unsafe" + +//go:linkname syscall_syscall9 syscall.syscall9 +//go:nosplit +//go:cgo_unsafe_args +func syscall_syscall9(fn, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2, err uintptr) { + entersyscallblock() + libcCall(unsafe.Pointer(funcPC(syscall9)), unsafe.Pointer(&fn)) + exitsyscall() + return +} +func syscall9() + +//go:linkname syscall_syscallPtr syscall.syscallPtr +//go:nosplit +//go:cgo_unsafe_args +func syscall_syscallPtr(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) { + entersyscallblock() + libcCall(unsafe.Pointer(funcPC(syscallPtr)), unsafe.Pointer(&fn)) + exitsyscall() + return +} +func syscallPtr() diff --git a/libgo/go/runtime/sys_darwin_64.go b/libgo/go/runtime/sys_darwin_64.go new file mode 100644 index 00000000000..8c128811b9f --- /dev/null +++ b/libgo/go/runtime/sys_darwin_64.go @@ -0,0 +1,32 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build darwin +// +build amd64 arm64 + +package runtime + +import "unsafe" + +//go:linkname syscall_syscallX syscall.syscallX +//go:nosplit +//go:cgo_unsafe_args +func syscall_syscallX(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) { + entersyscallblock() + libcCall(unsafe.Pointer(funcPC(syscallX)), unsafe.Pointer(&fn)) + exitsyscall() + return +} +func syscallX() + +//go:linkname syscall_syscallXPtr syscall.syscallXPtr +//go:nosplit +//go:cgo_unsafe_args +func syscall_syscallXPtr(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) { + entersyscallblock() + libcCall(unsafe.Pointer(funcPC(syscallXPtr)), unsafe.Pointer(&fn)) + exitsyscall() + return +} +func syscallXPtr() diff --git a/libgo/go/runtime/testdata/testprog/gc.go b/libgo/go/runtime/testdata/testprog/gc.go index 542451753b7..6b308e073b9 100644 --- a/libgo/go/runtime/testdata/testprog/gc.go +++ b/libgo/go/runtime/testdata/testprog/gc.go @@ -17,6 +17,7 @@ func init() { register("GCFairness", GCFairness) register("GCFairness2", GCFairness2) register("GCSys", GCSys) + register("GCPhys", GCPhys) } func GCSys() { @@ -51,8 +52,11 @@ func GCSys() { fmt.Printf("OK\n") } +var sink []byte + func workthegc() []byte { - return make([]byte, 1029) + sink = make([]byte, 1029) + return sink } func GCFairness() { @@ -124,3 +128,85 @@ func GCFairness2() { } fmt.Println("OK") } + +var maybeSaved []byte + +func GCPhys() { + // In this test, we construct a very specific scenario. We first + // allocate N objects and drop half of their pointers on the floor, + // effectively creating N/2 'holes' in our allocated arenas. We then + // try to allocate objects twice as big. At the end, we measure the + // physical memory overhead of large objects. + // + // The purpose of this test is to ensure that the GC scavenges free + // spans eagerly to ensure high physical memory utilization even + // during fragmentation. 
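The measurement the GCPhys test builds up to further down is HeapSys minus HeapReleased, used as a rough proxy for how much heap address space is still backed by physical pages, compared against HeapAlloc to estimate fragmentation overhead. Here is a tiny standalone sketch of reading those fields; the 64 MiB figure is arbitrary and nothing below reproduces the test's thresholds.

package main

import (
	"fmt"
	"runtime"
)

func main() {
	// Hold ~64 MiB live; the size is arbitrary, just enough to make the
	// numbers below non-trivial.
	big := make([]byte, 64<<20)
	runtime.GC()

	var s runtime.MemStats
	runtime.ReadMemStats(&s)
	// HeapSys is heap address space obtained from the OS; HeapReleased is
	// the part of it handed back. Their difference is the rough
	// "physically backed" figure the GCPhys test compares against HeapAlloc.
	backed := s.HeapSys - s.HeapReleased
	fmt.Printf("HeapAlloc=%d MiB, HeapSys-HeapReleased=%d MiB\n",
		s.HeapAlloc>>20, backed>>20)
	runtime.KeepAlive(big)
}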
+ const ( + // Unfortunately, measuring actual used physical pages is + // difficult because HeapReleased doesn't include the parts + // of an arena that haven't yet been touched. So, we just + // make objects and size sufficiently large such that even + // 64 MB overhead is relatively small in the final + // calculation. + // + // Currently, we target 480MiB worth of memory for our test, + // computed as size * objects + (size*2) * (objects/2) + // = 2 * size * objects + // + // Size must be also large enough to be considered a large + // object (not in any size-segregated span). + size = 1 << 20 + objects = 240 + ) + // Save objects which we want to survive, and condemn objects which we don't. + // Note that we condemn objects in this way and release them all at once in + // order to avoid having the GC start freeing up these objects while the loop + // is still running and filling in the holes we intend to make. + saved := make([][]byte, 0, objects) + condemned := make([][]byte, 0, objects/2+1) + for i := 0; i < objects; i++ { + // Write into a global, to prevent this from being optimized away by + // the compiler in the future. + maybeSaved = make([]byte, size) + if i%2 == 0 { + saved = append(saved, maybeSaved) + } else { + condemned = append(condemned, maybeSaved) + } + } + condemned = nil + // Clean up the heap. This will free up every other object created above + // (i.e. everything in condemned) creating holes in the heap. + runtime.GC() + // Allocate many new objects of 2x size. + for i := 0; i < objects/2; i++ { + saved = append(saved, make([]byte, size*2)) + } + // Clean up the heap again just to put it in a known state. + runtime.GC() + // heapBacked is an estimate of the amount of physical memory used by + // this test. HeapSys is an estimate of the size of the mapped virtual + // address space (which may or may not be backed by physical pages) + // whereas HeapReleased is an estimate of the amount of bytes returned + // to the OS. Their difference then roughly corresponds to the amount + // of virtual address space that is backed by physical pages. + var stats runtime.MemStats + runtime.ReadMemStats(&stats) + heapBacked := stats.HeapSys - stats.HeapReleased + // If heapBacked exceeds the amount of memory actually used for heap + // allocated objects by 10% (post-GC HeapAlloc should be quite close to + // the size of the working set), then fail. + // + // In the context of this test, that indicates a large amount of + // fragmentation with physical pages that are otherwise unused but not + // returned to the OS. + overuse := (float64(heapBacked) - float64(stats.HeapAlloc)) / float64(stats.HeapAlloc) + if overuse > 0.1 { + fmt.Printf("exceeded physical memory overuse threshold of 10%%: %3.2f%%\n"+ + "(alloc: %d, sys: %d, rel: %d, objs: %d)\n", overuse*100, stats.HeapAlloc, + stats.HeapSys, stats.HeapReleased, len(saved)) + return + } + fmt.Println("OK") + runtime.KeepAlive(saved) +} diff --git a/libgo/go/runtime/testdata/testprog/gettid.go b/libgo/go/runtime/testdata/testprog/gettid.go deleted file mode 100644 index 1b3e29ab08e..00000000000 --- a/libgo/go/runtime/testdata/testprog/gettid.go +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -// +build linux - -package main - -import ( - "bytes" - "fmt" - "io/ioutil" - "os" - "syscall" -) - -func gettid() int { - return syscall.Gettid() -} - -func tidExists(tid int) (exists, supported bool) { - stat, err := ioutil.ReadFile(fmt.Sprintf("/proc/self/task/%d/stat", tid)) - if os.IsNotExist(err) { - return false, true - } - // Check if it's a zombie thread. - state := bytes.Fields(stat)[2] - return !(len(state) == 1 && state[0] == 'Z'), true -} diff --git a/libgo/go/runtime/testdata/testprog/lockosthread.go b/libgo/go/runtime/testdata/testprog/lockosthread.go index 88c0d12e4c1..fd3123e6474 100644 --- a/libgo/go/runtime/testdata/testprog/lockosthread.go +++ b/libgo/go/runtime/testdata/testprog/lockosthread.go @@ -24,6 +24,12 @@ func init() { runtime.LockOSThread() }) register("LockOSThreadAlt", LockOSThreadAlt) + + registerInit("LockOSThreadAvoidsStatePropagation", func() { + // Lock the OS thread now so main runs on the main thread. + runtime.LockOSThread() + }) + register("LockOSThreadAvoidsStatePropagation", LockOSThreadAvoidsStatePropagation) } func LockOSThreadMain() { @@ -92,3 +98,100 @@ func LockOSThreadAlt() { ok: println("OK") } + +func LockOSThreadAvoidsStatePropagation() { + // This test is similar to LockOSThreadAlt in that it will detect if a thread + // which should have died is still running. However, rather than do this with + // thread IDs, it does this by unsharing state on that thread. This way, it + // also detects whether new threads were cloned from the dead thread, and not + // from a clean thread. Cloning from a locked thread is undesirable since + // cloned threads will inherit potentially unwanted OS state. + // + // unshareFs, getcwd, and chdir("/tmp") are only guaranteed to work on + // Linux, so on other platforms this just checks that the runtime doesn't + // do anything terrible. + // + // This is running locked to the main OS thread. + + // GOMAXPROCS=1 makes this fail much more reliably if a tainted thread is + // cloned from. + if runtime.GOMAXPROCS(-1) != 1 { + println("requires GOMAXPROCS=1") + os.Exit(1) + } + + if err := chdir("/"); err != nil { + println("failed to chdir:", err.Error()) + os.Exit(1) + } + // On systems other than Linux, cwd == "". + cwd, err := getcwd() + if err != nil { + println("failed to get cwd:", err.Error()) + os.Exit(1) + } + if cwd != "" && cwd != "/" { + println("unexpected cwd", cwd, " wanted /") + os.Exit(1) + } + + ready := make(chan bool, 1) + go func() { + // This goroutine must be running on a new thread. + runtime.LockOSThread() + + // Unshare details about the FS, like the CWD, with + // the rest of the process on this thread. + // On systems other than Linux, this is a no-op. + if err := unshareFs(); err != nil { + if err == errNotPermitted { + println("unshare not permitted") + os.Exit(0) + } + println("failed to unshare fs:", err.Error()) + os.Exit(1) + } + // Chdir to somewhere else on this thread. + // On systems other than Linux, this is a no-op. + if err := chdir("/tmp"); err != nil { + println("failed to chdir:", err.Error()) + os.Exit(1) + } + + // The state on this thread is now considered "tainted", but it + // should no longer be observable in any other context. + + ready <- true + // Exit with the thread locked. + }() + <-ready + + // Spawn yet another goroutine and lock it. 
Since GOMAXPROCS=1, if + // for some reason state from the (hopefully dead) locked thread above + // propagated into a newly created thread (via clone), or that thread + // is actually being re-used, then we should get scheduled on such a + // thread with high likelihood. + done := make(chan bool) + go func() { + runtime.LockOSThread() + + // Get the CWD and check if this is the same as the main thread's + // CWD. Every thread should share the same CWD. + // On systems other than Linux, wd == "". + wd, err := getcwd() + if err != nil { + println("failed to get cwd:", err.Error()) + os.Exit(1) + } + if wd != cwd { + println("bad state from old thread propagated after it should have died") + os.Exit(1) + } + <-done + + runtime.UnlockOSThread() + }() + done <- true + runtime.UnlockOSThread() + println("OK") +} diff --git a/libgo/go/runtime/testdata/testprog/syscalls.go b/libgo/go/runtime/testdata/testprog/syscalls.go new file mode 100644 index 00000000000..098d5cadf8a --- /dev/null +++ b/libgo/go/runtime/testdata/testprog/syscalls.go @@ -0,0 +1,11 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "errors" +) + +var errNotPermitted = errors.New("operation not permitted") diff --git a/libgo/go/runtime/testdata/testprog/syscalls_linux.go b/libgo/go/runtime/testdata/testprog/syscalls_linux.go new file mode 100644 index 00000000000..b8ac0876269 --- /dev/null +++ b/libgo/go/runtime/testdata/testprog/syscalls_linux.go @@ -0,0 +1,59 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "bytes" + "fmt" + "io/ioutil" + "os" + "syscall" +) + +func gettid() int { + return syscall.Gettid() +} + +func tidExists(tid int) (exists, supported bool) { + stat, err := ioutil.ReadFile(fmt.Sprintf("/proc/self/task/%d/stat", tid)) + if os.IsNotExist(err) { + return false, true + } + // Check if it's a zombie thread. + state := bytes.Fields(stat)[2] + return !(len(state) == 1 && state[0] == 'Z'), true +} + +func getcwd() (string, error) { + if !syscall.ImplementsGetwd { + return "", nil + } + // Use the syscall to get the current working directory. + // This is imperative for checking for OS thread state + // after an unshare since os.Getwd might just check the + // environment, or use some other mechanism. + var buf [4096]byte + n, err := syscall.Getcwd(buf[:]) + if err != nil { + return "", err + } + // Subtract one for null terminator. 
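The LockOSThreadAvoidsStatePropagation test above leans on a documented LockOSThread property: if a goroutine exits while still locked to its thread, the runtime terminates that thread instead of returning it to the pool, so thread-level OS state (such as the unshared working directory) cannot leak into later goroutines. A minimal sketch of relying on that guarantee; dirtyThreadWork is a hypothetical name and the actual thread-tainting work is elided.

package main

import (
	"fmt"
	"runtime"
)

// dirtyThreadWork locks itself to its OS thread, would taint per-thread OS
// state (elided), and then returns without calling UnlockOSThread. Because
// the goroutine exits while still locked, the runtime terminates the thread
// rather than reusing it, so the tainted state cannot be inherited elsewhere.
func dirtyThreadWork(done chan<- struct{}) {
	runtime.LockOSThread()
	// ... change per-thread OS state here, e.g. unshare(CLONE_FS) + chdir ...
	done <- struct{}{}
	// Deliberately no UnlockOSThread.
}

func main() {
	done := make(chan struct{})
	go dirtyThreadWork(done)
	<-done
	fmt.Println("the locked thread is discarded when its goroutine exits")
}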
+ return string(buf[:n-1]), nil +} + +func unshareFs() error { + err := syscall.Unshare(syscall.CLONE_FS) + if err != nil { + errno, ok := err.(syscall.Errno) + if ok && errno == syscall.EPERM { + return errNotPermitted + } + } + return err +} + +func chdir(path string) error { + return syscall.Chdir(path) +} diff --git a/libgo/go/runtime/testdata/testprog/gettid_none.go b/libgo/go/runtime/testdata/testprog/syscalls_none.go index 036db87e10e..7f8ded3994f 100644 --- a/libgo/go/runtime/testdata/testprog/gettid_none.go +++ b/libgo/go/runtime/testdata/testprog/syscalls_none.go @@ -13,3 +13,15 @@ func gettid() int { func tidExists(tid int) (exists, supported bool) { return false, false } + +func getcwd() (string, error) { + return "", nil +} + +func unshareFs() error { + return nil +} + +func chdir(path string) error { + return nil +} diff --git a/libgo/go/runtime/testdata/testprog/traceback_ancestors.go b/libgo/go/runtime/testdata/testprog/traceback_ancestors.go index fe57c1c157e..0ee402c4bdc 100644 --- a/libgo/go/runtime/testdata/testprog/traceback_ancestors.go +++ b/libgo/go/runtime/testdata/testprog/traceback_ancestors.go @@ -5,8 +5,10 @@ package main import ( + "bytes" "fmt" "runtime" + "strings" ) func init() { @@ -18,25 +20,50 @@ const numFrames = 2 func TracebackAncestors() { w := make(chan struct{}) - recurseThenCallGo(w, numGoroutines, numFrames) + recurseThenCallGo(w, numGoroutines, numFrames, true) <-w printStack() close(w) } +var ignoreGoroutines = make(map[string]bool) + func printStack() { buf := make([]byte, 1024) for { n := runtime.Stack(buf, true) if n < len(buf) { - fmt.Print(string(buf[:n])) + tb := string(buf[:n]) + + // Delete any ignored goroutines, if present. + pos := 0 + for pos < len(tb) { + next := pos + strings.Index(tb[pos:], "\n\n") + if next < pos { + next = len(tb) + } else { + next += len("\n\n") + } + + if strings.HasPrefix(tb[pos:], "goroutine ") { + id := tb[pos+len("goroutine "):] + id = id[:strings.IndexByte(id, ' ')] + if ignoreGoroutines[id] { + tb = tb[:pos] + tb[next:] + next = pos + } + } + pos = next + } + + fmt.Print(tb) return } buf = make([]byte, 2*len(buf)) } } -func recurseThenCallGo(w chan struct{}, frames int, goroutines int) { +func recurseThenCallGo(w chan struct{}, frames int, goroutines int, main bool) { if frames == 0 { // Signal to TracebackAncestors that we are done recursing and starting goroutines. w <- struct{}{} @@ -44,10 +71,29 @@ func recurseThenCallGo(w chan struct{}, frames int, goroutines int) { return } if goroutines == 0 { + // Record which goroutine this is so we can ignore it + // in the traceback if it hasn't finished exiting by + // the time we printStack. + if !main { + ignoreGoroutines[goroutineID()] = true + } + // Start the next goroutine now that there are no more recursions left // for this current goroutine. 
- go recurseThenCallGo(w, frames-1, numFrames) + go recurseThenCallGo(w, frames-1, numFrames, false) return } - recurseThenCallGo(w, frames, goroutines-1) + recurseThenCallGo(w, frames, goroutines-1, main) +} + +func goroutineID() string { + buf := make([]byte, 128) + runtime.Stack(buf, false) + const prefix = "goroutine " + if !bytes.HasPrefix(buf, []byte(prefix)) { + panic(fmt.Sprintf("expected %q at beginning of traceback:\n%s", prefix, buf)) + } + buf = buf[len(prefix):] + n := bytes.IndexByte(buf, ' ') + return string(buf[:n]) } diff --git a/libgo/go/runtime/testdata/testprogcgo/exec.go b/libgo/go/runtime/testdata/testprogcgo/exec.go index 2e948401c87..94da5dc526b 100644 --- a/libgo/go/runtime/testdata/testprogcgo/exec.go +++ b/libgo/go/runtime/testdata/testprogcgo/exec.go @@ -75,6 +75,14 @@ func CgoExecSignalMask() { cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr if err := cmd.Run(); err != nil { + // An overloaded system + // may fail with EAGAIN. + // This doesn't tell us + // anything useful; ignore it. + // Issue #27731. + if isEAGAIN(err) { + return + } fmt.Printf("iteration %d: %v\n", j, err) os.Exit(1) } @@ -87,3 +95,11 @@ func CgoExecSignalMask() { fmt.Println("OK") } + +// isEAGAIN reports whether err is an EAGAIN error from a process execution. +func isEAGAIN(err error) bool { + if p, ok := err.(*os.PathError); ok { + err = p.Err + } + return err == syscall.EAGAIN +} diff --git a/libgo/go/runtime/testdata/testprogcgo/pprof.go b/libgo/go/runtime/testdata/testprogcgo/pprof.go index c4fde0251ae..9fa5c1b518a 100644 --- a/libgo/go/runtime/testdata/testprogcgo/pprof.go +++ b/libgo/go/runtime/testdata/testprogcgo/pprof.go @@ -28,6 +28,9 @@ void cpuHog() { salt2 = foo; } +void cpuHog2() { +} + static int cpuHogCount; struct cgoTracebackArg { @@ -39,10 +42,13 @@ struct cgoTracebackArg { // pprofCgoTraceback is passed to runtime.SetCgoTraceback. // For testing purposes it pretends that all CPU hits in C code are in cpuHog. +// Issue #29034: At least 2 frames are required to verify all frames are captured +// since runtime/pprof ignores the runtime.goexit base frame if it exists. void pprofCgoTraceback(void* parg) { struct cgoTracebackArg* arg = (struct cgoTracebackArg*)(parg); arg->buf[0] = (uintptr_t)(cpuHog) + 0x10; - arg->buf[1] = 0; + arg->buf[1] = (uintptr_t)(cpuHog2) + 0x4; + arg->buf[2] = 0; ++cpuHogCount; } diff --git a/libgo/go/runtime/testdata/testprogcgo/threadpprof.go b/libgo/go/runtime/testdata/testprogcgo/threadpprof.go index bec4ef50d76..f803511c526 100644 --- a/libgo/go/runtime/testdata/testprogcgo/threadpprof.go +++ b/libgo/go/runtime/testdata/testprogcgo/threadpprof.go @@ -31,6 +31,9 @@ void cpuHogThread() { threadSalt2 = foo; } +void cpuHogThread2() { +} + static int cpuHogThreadCount; struct cgoTracebackArg { @@ -45,7 +48,8 @@ struct cgoTracebackArg { void pprofCgoThreadTraceback(void* parg) { struct cgoTracebackArg* arg = (struct cgoTracebackArg*)(parg); arg->buf[0] = (uintptr_t)(cpuHogThread) + 0x10; - arg->buf[1] = 0; + arg->buf[1] = (uintptr_t)(cpuHogThread2) + 0x4; + arg->buf[2] = 0; __sync_add_and_fetch(&cpuHogThreadCount, 1); } diff --git a/libgo/go/runtime/time.go b/libgo/go/runtime/time.go index ea61baa9195..71e7556776e 100644 --- a/libgo/go/runtime/time.go +++ b/libgo/go/runtime/time.go @@ -7,7 +7,7 @@ package runtime import ( - "runtime/internal/sys" + "internal/cpu" "unsafe" ) @@ -50,7 +50,7 @@ var timers [timersLen]struct { // The padding should eliminate false sharing // between timersBucket values. 
- pad [sys.CacheLineSize - unsafe.Sizeof(timersBucket{})%sys.CacheLineSize]byte + pad [cpu.CacheLinePadSize - unsafe.Sizeof(timersBucket{})%cpu.CacheLinePadSize]byte } func (t *timer) assignBucket() *timersBucket { @@ -156,7 +156,7 @@ func (tb *timersBucket) addtimerLocked(t *timer) bool { } if t.i == 0 { // siftup moved to top: new earliest deadline. - if tb.sleeping { + if tb.sleeping && tb.sleepUntil > t.when { tb.sleeping = false notewakeup(&tb.waitnote) } @@ -164,11 +164,11 @@ func (tb *timersBucket) addtimerLocked(t *timer) bool { tb.rescheduling = false goready(tb.gp, 0) } - } - if !tb.created { - tb.created = true - expectSystemGoroutine() - go timerproc(tb) + if !tb.created { + tb.created = true + expectSystemGoroutine() + go timerproc(tb) + } } return true } @@ -188,14 +188,22 @@ func deltimer(t *timer) bool { tb := t.tb lock(&tb.lock) + removed, ok := tb.deltimerLocked(t) + unlock(&tb.lock) + if !ok { + badTimer() + } + return removed +} + +func (tb *timersBucket) deltimerLocked(t *timer) (removed, ok bool) { // t may not be registered anymore and may have // a bogus i (typically 0, if generated by Go). // Verify it before proceeding. i := t.i last := len(tb.t) - 1 if i < 0 || i > last || tb.t[i] != t { - unlock(&tb.lock) - return false + return false, true } if i != last { tb.t[i] = tb.t[last] @@ -203,7 +211,7 @@ func deltimer(t *timer) bool { } tb.t[last] = nil tb.t = tb.t[:last] - ok := true + ok = true if i != last { if !siftupTimer(tb.t, i) { ok = false @@ -212,11 +220,26 @@ func deltimer(t *timer) bool { ok = false } } + return true, ok +} + +func modtimer(t *timer, when, period int64, f func(interface{}, uintptr), arg interface{}, seq uintptr) { + tb := t.tb + + lock(&tb.lock) + _, ok := tb.deltimerLocked(t) + if ok { + t.when = when + t.period = period + t.f = f + t.arg = arg + t.seq = seq + ok = tb.addtimerLocked(t) + } unlock(&tb.lock) if !ok { badTimer() } - return true } // Timerproc runs the time-driven events. @@ -438,23 +461,3 @@ func siftdownTimer(t []*timer, i int) bool { func badTimer() { panic(errorString("racy use of timers")) } - -// Entry points for net, time to call nanotime. - -//go:linkname poll_runtimeNano internal..z2fpoll.runtimeNano -func poll_runtimeNano() int64 { - return nanotime() -} - -//go:linkname time_runtimeNano time.runtimeNano -func time_runtimeNano() int64 { - return nanotime() -} - -// Monotonic times are reported as offsets from startNano. -// We initialize startNano to nanotime() - 1 so that on systems where -// monotonic time resolution is fairly low (e.g. Windows 2008 -// which appears to have a default resolution of 15ms), -// we avoid ever reporting a nanotime of 0. -// (Callers may want to use 0 as "time not set".) -var startNano int64 = nanotime() - 1 diff --git a/libgo/go/runtime/timeasm.go b/libgo/go/runtime/timeasm.go index 55b0d0729fb..6bfaa8a8f0e 100644 --- a/libgo/go/runtime/timeasm.go +++ b/libgo/go/runtime/timeasm.go @@ -3,8 +3,6 @@ // license that can be found in the LICENSE file. // Declarations for operating systems implementing time.now directly in assembly. -// Those systems are also expected to have nanotime subtract startNano, -// so that time.now and nanotime return the same monotonic clock readings. 
// +build ignore // +build windows diff --git a/libgo/go/runtime/timestub.go b/libgo/go/runtime/timestub.go index 9f1d111f4fd..14165cee974 100644 --- a/libgo/go/runtime/timestub.go +++ b/libgo/go/runtime/timestub.go @@ -14,5 +14,5 @@ import _ "unsafe" // for go:linkname //go:linkname time_now time.now func time_now() (sec int64, nsec int32, mono int64) { sec, nsec = walltime() - return sec, nsec, nanotime() - startNano + return sec, nsec, nanotime() } diff --git a/libgo/go/runtime/timestub2.go b/libgo/go/runtime/timestub2.go index 9ddc6fed918..00c2c55f461 100644 --- a/libgo/go/runtime/timestub2.go +++ b/libgo/go/runtime/timestub2.go @@ -5,6 +5,7 @@ // +build !darwin // +build !windows // +build !freebsd +// +build !aix package runtime diff --git a/libgo/go/runtime/trace.go b/libgo/go/runtime/trace.go index 530d5e4c86e..6db5b62558e 100644 --- a/libgo/go/runtime/trace.go +++ b/libgo/go/runtime/trace.go @@ -533,12 +533,12 @@ func traceEvent(ev byte, skip int, args ...uint64) { } func traceEventLocked(extraBytes int, mp *m, pid int32, bufp *traceBufPtr, ev byte, skip int, args ...uint64) { - buf := (*bufp).ptr() + buf := bufp.ptr() // TODO: test on non-zero extraBytes param. maxSize := 2 + 5*traceBytesPerNumber + extraBytes // event type, length, sequence, timestamp, stack id and two add params if buf == nil || len(buf.arr)-buf.pos < maxSize { buf = traceFlush(traceBufPtrOf(buf), pid).ptr() - (*bufp).set(buf) + bufp.set(buf) } ticks := uint64(cputicks()) / traceTickDiv @@ -585,7 +585,7 @@ func traceStackID(mp *m, buf []location, skip int) uint64 { gp := mp.curg var nstk int if gp == _g_ { - nstk = callers(skip+1, buf[:]) + nstk = callers(skip+1, buf) } else if gp != nil { // FIXME: get stack trace of different goroutine. } @@ -689,11 +689,11 @@ func traceString(bufp *traceBufPtr, pid int32, s string) (uint64, *traceBufPtr) // so there must be no memory allocation or any activities // that causes tracing after this point. - buf := (*bufp).ptr() + buf := bufp.ptr() size := 1 + 2*traceBytesPerNumber + len(s) if buf == nil || len(buf.arr)-buf.pos < size { buf = traceFlush(traceBufPtrOf(buf), pid).ptr() - (*bufp).set(buf) + bufp.set(buf) } buf.byte(traceEvString) buf.varint(id) @@ -708,7 +708,7 @@ func traceString(bufp *traceBufPtr, pid int32, s string) (uint64, *traceBufPtr) buf.varint(uint64(slen)) buf.pos += copy(buf.arr[buf.pos:], s[:slen]) - (*bufp).set(buf) + bufp.set(buf) return id, bufp } @@ -1201,7 +1201,7 @@ func trace_userLog(id uint64, category, message string) { traceEventLocked(extraSpace, mp, pid, bufp, traceEvUserLog, 3, id, categoryID) // traceEventLocked reserved extra space for val and len(val) // in buf, so buf now has room for the following. - buf := (*bufp).ptr() + buf := bufp.ptr() // double-check the message and its length can fit. // Otherwise, truncate the message. diff --git a/libgo/go/runtime/trace/annotation.go b/libgo/go/runtime/trace/annotation.go index d5a7d003fe2..82cb232dba9 100644 --- a/libgo/go/runtime/trace/annotation.go +++ b/libgo/go/runtime/trace/annotation.go @@ -171,7 +171,7 @@ func (r *Region) End() { userRegion(r.id, regionEndCode, r.regionType) } -// IsEnabled returns whether tracing is enabled. +// IsEnabled reports whether tracing is enabled. // The information is advisory only. The tracing status // may have changed by the time this function returns. 
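For context on the annotation.go comment fix above: trace.IsEnabled is the advisory guard user code can use to skip building expensive annotations when no trace is being collected, alongside the task/region/log APIs in runtime/trace. A small usage sketch; the output file name and the task and region names are arbitrary.

package main

import (
	"context"
	"fmt"
	"os"
	"runtime/trace"
)

func main() {
	f, err := os.Create("trace.out")
	if err != nil {
		panic(err)
	}
	defer f.Close()
	if err := trace.Start(f); err != nil {
		panic(err)
	}
	defer trace.Stop()

	ctx, task := trace.NewTask(context.Background(), "work")
	defer task.End()

	region := trace.StartRegion(ctx, "step")
	// IsEnabled is advisory: it lets callers skip building expensive
	// annotations, but tracing may start or stop between the check and the use.
	if trace.IsEnabled() {
		trace.Log(ctx, "detail", "only computed while tracing")
	}
	fmt.Println("doing traced work")
	region.End()
}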
func IsEnabled() bool { diff --git a/libgo/go/runtime/traceback_gccgo.go b/libgo/go/runtime/traceback_gccgo.go index 72a83a5ab2d..7581798e852 100644 --- a/libgo/go/runtime/traceback_gccgo.go +++ b/libgo/go/runtime/traceback_gccgo.go @@ -21,16 +21,20 @@ func printcreatedby(gp *g) { tracepc -= sys.PCQuantum } function, file, line := funcfileline(tracepc, -1) - if function != "" && showframe(function, gp) && gp.goid != 1 { - print("created by ", function, "\n") - print("\t", file, ":", line) - if entry != 0 && pc > entry { - print(" +", hex(pc-entry)) - } - print("\n") + if function != "" && showframe(function, gp, false) && gp.goid != 1 { + printcreatedby1(function, file, line, entry, pc) } } +func printcreatedby1(function, file string, line int, entry, pc uintptr) { + print("created by ", function, "\n") + print("\t", file, ":", line) + if entry != 0 && pc > entry { + print(" +", hex(pc-entry)) + } + print("\n") +} + // tracebackg is used to collect stack traces from other goroutines. type tracebackg struct { gp *g @@ -63,50 +67,108 @@ func callers(skip int, locbuf []location) int { func traceback(skip int32) { var locbuf [100]location c := c_callers(skip+1, &locbuf[0], int32(len(locbuf)), false) - printtrace(locbuf[:c], getg()) - printcreatedby(getg()) + gp := getg() + printtrace(locbuf[:c], gp) + printcreatedby(gp) + + if gp.ancestors == nil { + return + } + for _, ancestor := range *gp.ancestors { + printAncestorTraceback(ancestor) + } +} + +// printAncestorTraceback prints the traceback of the given ancestor. +func printAncestorTraceback(ancestor ancestorInfo) { + print("[originating from goroutine ", ancestor.goid, "]:\n") + for fidx, pc := range ancestor.pcs { + function, file, line := funcfileline(pc, -1) + if showfuncinfo(function, fidx == 0) { + printAncestorTracebackFuncInfo(function, file, line, pc) + } + } + if len(ancestor.pcs) == _TracebackMaxFrames { + print("...additional frames elided...\n") + } + // Show what created goroutine, except main goroutine (goid 1). + function, file, line := funcfileline(ancestor.gopc, -1) + if function != "" && showfuncinfo(function, false) && ancestor.goid != 1 { + printcreatedby1(function, file, line, funcentry(ancestor.gopc), ancestor.gopc) + } +} + +// printAncestorTraceback prints the given function info at a given pc +// within an ancestor traceback. The precision of this info is reduced +// due to only have access to the pcs at the time of the caller +// goroutine being created. +func printAncestorTracebackFuncInfo(name, file string, line int, pc uintptr) { + if name == "runtime.gopanic" { + name = "panic" + } + print(name, "(...)\n") + print("\t", file, ":", line) + entry := funcentry(pc) + if pc > entry { + print(" +", hex(pc-entry)) + } + print("\n") } // printtrace prints a traceback from locbuf. func printtrace(locbuf []location, gp *g) { + nprint := 0 for i := range locbuf { - if showframe(locbuf[i].function, gp) { + if showframe(locbuf[i].function, gp, nprint == 0) { name := locbuf[i].function if name == "runtime.gopanic" { name = "panic" } print(name, "\n\t", locbuf[i].filename, ":", locbuf[i].lineno, "\n") + nprint++ } } } // showframe returns whether to print a frame in a traceback. // name is the function name. 
-func showframe(name string, gp *g) bool { +func showframe(name string, gp *g, firstFrame bool) bool { g := getg() if g.m.throwing > 0 && gp != nil && (gp == g.m.curg || gp == g.m.caughtsig.ptr()) { return true } + return showfuncinfo(name, firstFrame) +} +func showfuncinfo(name string, firstFrame bool) bool { // Gccgo can trace back through C functions called via cgo. // We want to print those in the traceback. // But unless GOTRACEBACK > 1 (checked below), still skip // internal C functions and cgo-generated functions. - if name != "" && !contains(name, ".") && !hasprefix(name, "__go_") && !hasprefix(name, "_cgo_") { + if name != "" && !contains(name, ".") && !hasPrefix(name, "__go_") && !hasPrefix(name, "_cgo_") { return true } level, _, _ := gotraceback() + if level > 1 { + // Show all frames. + return true + } + + if name == "" { + return false + } - // Special case: always show runtime.gopanic frame, so that we can - // see where a panic started in the middle of a stack trace. + // Special case: always show runtime.gopanic frame + // in the middle of a stack trace, so that we can + // see the boundary between ordinary code and + // panic-induced deferred code. // See golang.org/issue/5832. - // __go_panic is the current gccgo name. - if name == "runtime.gopanic" || name == "__go_panic" { + if name == "runtime.gopanic" && !firstFrame { return true } - return level > 1 || contains(name, ".") && (!hasprefix(name, "runtime.") || isExportedRuntime(name)) + return contains(name, ".") && (!hasPrefix(name, "runtime.") || isExportedRuntime(name)) } // isExportedRuntime reports whether name is an exported runtime function. @@ -115,7 +177,7 @@ func showframe(name string, gp *g) bool { // "runtime..z2f". func isExportedRuntime(name string) bool { const n = len("runtime.") - if hasprefix(name, "runtime..z2f") { + if hasPrefix(name, "runtime..z2f") { return true } return len(name) > n && name[:n] == "runtime." && 'A' <= name[n] && name[n] <= 'Z' @@ -169,10 +231,24 @@ func goroutineheader(gp *g) { print("]:\n") } -// isSystemGoroutine reports whether the goroutine g must be omitted in -// stack dumps and deadlock detector. -func isSystemGoroutine(gp *g) bool { - return gp.isSystemGoroutine +// isSystemGoroutine reports whether the goroutine g must be omitted +// in stack dumps and deadlock detector. This is any goroutine that +// starts at a runtime.* entry point, except for runtime.main and +// sometimes runtime.runfinq. +// +// If fixed is true, any goroutine that can vary between user and +// system (that is, the finalizer goroutine) is considered a user +// goroutine. +func isSystemGoroutine(gp *g, fixed bool) bool { + if !gp.isSystemGoroutine { + return false + } + if fixed && gp.isFinalizerGoroutine { + // This goroutine can vary. In fixed mode, + // always consider it a user goroutine. + return false + } + return true } func tracebackothers(me *g) { @@ -200,7 +276,7 @@ func tracebackothers(me *g) { lock(&allglock) for _, gp := range allgs { - if gp == me || gp == g.m.curg || readgstatus(gp) == _Gdead || isSystemGoroutine(gp) && level < 2 { + if gp == me || gp == g.m.curg || readgstatus(gp) == _Gdead || isSystemGoroutine(gp, false) && level < 2 { continue } print("\n") |
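The ancestor-printing code added above only fires when goroutine creation stacks are being recorded, which is controlled by the tracebackancestors GODEBUG setting; a crashing program run with it set then gets "[originating from goroutine N]" sections like the ones printAncestorTraceback emits. A minimal way to try it, assuming GODEBUG=tracebackancestors=5 is set in the environment:

package main

import "time"

func main() {
	// With GODEBUG=tracebackancestors=5 set, the runtime records the stack at
	// each go statement, and the panic traceback below gains an
	// "[originating from goroutine N]" section pointing back at this one.
	go func() {
		panic("boom")
	}()
	time.Sleep(time.Second) // give the child goroutine time to crash the program
}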