Diffstat (limited to 'libgo/go/runtime')
113 files changed, 5011 insertions, 4014 deletions
diff --git a/libgo/go/runtime/callers_test.go b/libgo/go/runtime/callers_test.go index 1fc7f861894..1c0e44185cd 100644 --- a/libgo/go/runtime/callers_test.go +++ b/libgo/go/runtime/callers_test.go @@ -68,6 +68,8 @@ func testCallers(t *testing.T, pcs []uintptr, pan bool) { } func testCallersEqual(t *testing.T, pcs []uintptr, want []string, ignore map[string]struct{}) { + t.Helper() + got := make([]string, 0, len(want)) frames := runtime.CallersFrames(pcs) diff --git a/libgo/go/runtime/chan.go b/libgo/go/runtime/chan.go index ec8252bb32a..de1d80acd30 100644 --- a/libgo/go/runtime/chan.go +++ b/libgo/go/runtime/chan.go @@ -121,6 +121,7 @@ func makechan(t *chantype, size int) *hchan { c.elemsize = uint16(elem.size) c.elemtype = elem c.dataqsiz = uint(size) + lockInit(&c.lock, lockRankHchan) if debugChan { print("makechan: chan=", c, "; elemsize=", elem.size, "; dataqsiz=", size, "\n") @@ -133,6 +134,21 @@ func chanbuf(c *hchan, i uint) unsafe.Pointer { return add(c.buf, uintptr(i)*uintptr(c.elemsize)) } +// full reports whether a send on c would block (that is, the channel is full). +// It uses a single word-sized read of mutable state, so although +// the answer is instantaneously true, the correct answer may have changed +// by the time the calling function receives the return value. +func full(c *hchan) bool { + // c.dataqsiz is immutable (never written after the channel is created) + // so it is safe to read at any time during channel operation. + if c.dataqsiz == 0 { + // Assumes that a pointer read is relaxed-atomic. + return c.recvq.first == nil + } + // Assumes that a uint read is relaxed-atomic. + return c.qcount == c.dataqsiz +} + // entry point for c <- x from compiled code //go:nosplit func chansend1(c *hchan, elem unsafe.Pointer) { @@ -177,7 +193,7 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool { // // After observing that the channel is not closed, we observe that the channel is // not ready for sending. Each of these observations is a single word-sized read - // (first c.closed and second c.recvq.first or c.qcount depending on kind of channel). + // (first c.closed and second full()). // Because a closed channel cannot transition from 'ready for sending' to // 'not ready for sending', even if the channel is closed between the two observations, // they imply a moment between the two when the channel was both not yet closed @@ -186,9 +202,10 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool { // // It is okay if the reads are reordered here: if we observe that the channel is not // ready for sending and then observe that it is not closed, that implies that the - // channel wasn't closed during the first observation. - if !block && c.closed == 0 && ((c.dataqsiz == 0 && c.recvq.first == nil) || - (c.dataqsiz > 0 && c.qcount == c.dataqsiz)) { + // channel wasn't closed during the first observation. However, nothing here + // guarantees forward progress. We rely on the side effects of lock release in + // chanrecv() and closechan() to update this thread's view of c.closed and full(). + if !block && c.closed == 0 && full(c) { return false } @@ -418,6 +435,16 @@ func closechan(c *hchan) { } } +// empty reports whether a read from c would block (that is, the channel is +// empty). It uses a single atomic read of mutable state. +func empty(c *hchan) bool { + // c.dataqsiz is immutable. 
+ if c.dataqsiz == 0 { + return atomic.Loadp(unsafe.Pointer(&c.sendq.first)) == nil + } + return atomic.Loaduint(&c.qcount) == 0 +} + // entry points for <- c from compiled code //go:nosplit func chanrecv1(c *hchan, elem unsafe.Pointer) { @@ -458,21 +485,36 @@ func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool) } // Fast path: check for failed non-blocking operation without acquiring the lock. - // - // After observing that the channel is not ready for receiving, we observe that the - // channel is not closed. Each of these observations is a single word-sized read - // (first c.sendq.first or c.qcount, and second c.closed). - // Because a channel cannot be reopened, the later observation of the channel - // being not closed implies that it was also not closed at the moment of the - // first observation. We behave as if we observed the channel at that moment - // and report that the receive cannot proceed. - // - // The order of operations is important here: reversing the operations can lead to - // incorrect behavior when racing with a close. - if !block && (c.dataqsiz == 0 && c.sendq.first == nil || - c.dataqsiz > 0 && atomic.Loaduint(&c.qcount) == 0) && - atomic.Load(&c.closed) == 0 { - return + if !block && empty(c) { + // After observing that the channel is not ready for receiving, we observe whether the + // channel is closed. + // + // Reordering of these checks could lead to incorrect behavior when racing with a close. + // For example, if the channel was open and not empty, was closed, and then drained, + // reordered reads could incorrectly indicate "open and empty". To prevent reordering, + // we use atomic loads for both checks, and rely on emptying and closing to happen in + // separate critical sections under the same lock. This assumption fails when closing + // an unbuffered channel with a blocked send, but that is an error condition anyway. + if atomic.Load(&c.closed) == 0 { + // Because a channel cannot be reopened, the later observation of the channel + // being not closed implies that it was also not closed at the moment of the + // first observation. We behave as if we observed the channel at that moment + // and report that the receive cannot proceed. + return + } + // The channel is irreversibly closed. Re-check whether the channel has any pending data + // to receive, which could have arrived between the empty and closed checks above. + // Sequential consistency is also required here, when racing with such a send. + if empty(c) { + // The channel is irreversibly closed and empty. + if raceenabled { + raceacquire(c.raceaddr()) + } + if ep != nil { + typedmemclr(c.elemtype, ep) + } + return true, false + } } var t0 int64 diff --git a/libgo/go/runtime/chan_test.go b/libgo/go/runtime/chan_test.go index c194781ede9..ac81d409bda 100644 --- a/libgo/go/runtime/chan_test.go +++ b/libgo/go/runtime/chan_test.go @@ -1132,6 +1132,20 @@ func BenchmarkChanPopular(b *testing.B) { wg.Wait() } +func BenchmarkChanClosed(b *testing.B) { + c := make(chan struct{}) + close(c) + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + select { + case <-c: + default: + b.Error("Unreachable") + } + } + }) +} + var ( alwaysFalse = false workSink = 0 diff --git a/libgo/go/runtime/checkptr.go b/libgo/go/runtime/checkptr.go index d5f116c3927..e52f7df5b2b 100644 --- a/libgo/go/runtime/checkptr.go +++ b/libgo/go/runtime/checkptr.go @@ -14,18 +14,18 @@ func checkptrAlignment(p unsafe.Pointer, elem *_type, n uintptr) { // no pointers themselves. See issue 37298. 
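The closed-channel fast path above must never report "open and empty" for a channel that has been closed and then drained. A minimal user-level sketch of the behavior being preserved (ordinary Go, not runtime internals; the channel and values are illustrative):

package main

import "fmt"

func main() {
	c := make(chan int, 1)
	c <- 1
	close(c)
	fmt.Println(<-c) // 1: drains the buffer; c is now closed and empty

	// A non-blocking receive on a closed, empty channel must still be "ready":
	// the comma-ok form yields the zero value and ok == false, and the default
	// case must not be taken.
	select {
	case v, ok := <-c:
		fmt.Println(v, ok) // 0 false
	default:
		fmt.Println("would block") // never happens once c is closed
	}
}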
// TODO(mdempsky): What about fieldAlign? if elem.ptrdata != 0 && uintptr(p)&(uintptr(elem.align)-1) != 0 { - throw("checkptr: unsafe pointer conversion") + throw("checkptr: misaligned pointer conversion") } // Check that (*[n]elem)(p) doesn't straddle multiple heap objects. if size := n * elem.size; size > 1 && checkptrBase(p) != checkptrBase(add(p, size-1)) { - throw("checkptr: unsafe pointer conversion") + throw("checkptr: converted pointer straddles multiple allocations") } } func checkptrArithmetic(p unsafe.Pointer, originals []unsafe.Pointer) { if 0 < uintptr(p) && uintptr(p) < minLegalPointer { - throw("checkptr: unsafe pointer arithmetic") + throw("checkptr: pointer arithmetic computed bad pointer value") } // Check that if the computed pointer p points into a heap @@ -42,7 +42,7 @@ func checkptrArithmetic(p unsafe.Pointer, originals []unsafe.Pointer) { } } - throw("checkptr: unsafe pointer arithmetic") + throw("checkptr: pointer arithmetic result points to invalid allocation") } // checkptrBase returns the base address for the allocation containing diff --git a/libgo/go/runtime/checkptr_test.go b/libgo/go/runtime/checkptr_test.go index 0ca7b20cfd4..76aa2cdb2ba 100644 --- a/libgo/go/runtime/checkptr_test.go +++ b/libgo/go/runtime/checkptr_test.go @@ -28,11 +28,11 @@ func TestCheckPtr(t *testing.T) { cmd string want string }{ - {"CheckPtrAlignmentPtr", "fatal error: checkptr: unsafe pointer conversion\n"}, + {"CheckPtrAlignmentPtr", "fatal error: checkptr: misaligned pointer conversion\n"}, {"CheckPtrAlignmentNoPtr", ""}, - {"CheckPtrArithmetic", "fatal error: checkptr: unsafe pointer arithmetic\n"}, - {"CheckPtrSize", "fatal error: checkptr: unsafe pointer conversion\n"}, - {"CheckPtrSmall", "fatal error: checkptr: unsafe pointer arithmetic\n"}, + {"CheckPtrArithmetic", "fatal error: checkptr: pointer arithmetic result points to invalid allocation\n"}, + {"CheckPtrSize", "fatal error: checkptr: converted pointer straddles multiple allocations\n"}, + {"CheckPtrSmall", "fatal error: checkptr: pointer arithmetic computed bad pointer value\n"}, } for _, tc := range testCases { diff --git a/libgo/go/runtime/conv_wasm_test.go b/libgo/go/runtime/conv_wasm_test.go new file mode 100644 index 00000000000..5054fca04dc --- /dev/null +++ b/libgo/go/runtime/conv_wasm_test.go @@ -0,0 +1,128 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "testing" +) + +var res int64 +var ures uint64 + +func TestFloatTruncation(t *testing.T) { + testdata := []struct { + input float64 + convInt64 int64 + convUInt64 uint64 + overflow bool + }{ + // max +- 1 + { + input: 0x7fffffffffffffff, + convInt64: -0x8000000000000000, + convUInt64: 0x8000000000000000, + }, + // For out-of-bounds conversion, the result is implementation-dependent. + // This test verifies the implementation of wasm architecture. 
+ { + input: 0x8000000000000000, + convInt64: -0x8000000000000000, + convUInt64: 0x8000000000000000, + }, + { + input: 0x7ffffffffffffffe, + convInt64: -0x8000000000000000, + convUInt64: 0x8000000000000000, + }, + // neg max +- 1 + { + input: -0x8000000000000000, + convInt64: -0x8000000000000000, + convUInt64: 0x8000000000000000, + }, + { + input: -0x8000000000000001, + convInt64: -0x8000000000000000, + convUInt64: 0x8000000000000000, + }, + { + input: -0x7fffffffffffffff, + convInt64: -0x8000000000000000, + convUInt64: 0x8000000000000000, + }, + // trunc point +- 1 + { + input: 0x7ffffffffffffdff, + convInt64: 0x7ffffffffffffc00, + convUInt64: 0x7ffffffffffffc00, + }, + { + input: 0x7ffffffffffffe00, + convInt64: -0x8000000000000000, + convUInt64: 0x8000000000000000, + }, + { + input: 0x7ffffffffffffdfe, + convInt64: 0x7ffffffffffffc00, + convUInt64: 0x7ffffffffffffc00, + }, + // neg trunc point +- 1 + { + input: -0x7ffffffffffffdff, + convInt64: -0x7ffffffffffffc00, + convUInt64: 0x8000000000000000, + }, + { + input: -0x7ffffffffffffe00, + convInt64: -0x8000000000000000, + convUInt64: 0x8000000000000000, + }, + { + input: -0x7ffffffffffffdfe, + convInt64: -0x7ffffffffffffc00, + convUInt64: 0x8000000000000000, + }, + // umax +- 1 + { + input: 0xffffffffffffffff, + convInt64: -0x8000000000000000, + convUInt64: 0x8000000000000000, + }, + { + input: 0x10000000000000000, + convInt64: -0x8000000000000000, + convUInt64: 0x8000000000000000, + }, + { + input: 0xfffffffffffffffe, + convInt64: -0x8000000000000000, + convUInt64: 0x8000000000000000, + }, + // umax trunc +- 1 + { + input: 0xfffffffffffffbff, + convInt64: -0x8000000000000000, + convUInt64: 0xfffffffffffff800, + }, + { + input: 0xfffffffffffffc00, + convInt64: -0x8000000000000000, + convUInt64: 0x8000000000000000, + }, + { + input: 0xfffffffffffffbfe, + convInt64: -0x8000000000000000, + convUInt64: 0xfffffffffffff800, + }, + } + for _, item := range testdata { + if got, want := int64(item.input), item.convInt64; got != want { + t.Errorf("int64(%f): got %x, want %x", item.input, got, want) + } + if got, want := uint64(item.input), item.convUInt64; got != want { + t.Errorf("uint64(%f): got %x, want %x", item.input, got, want) + } + } +} diff --git a/libgo/go/runtime/crash_cgo_test.go b/libgo/go/runtime/crash_cgo_test.go index 2b7d274f953..7c10213b868 100644 --- a/libgo/go/runtime/crash_cgo_test.go +++ b/libgo/go/runtime/crash_cgo_test.go @@ -284,7 +284,13 @@ func testCgoPprof(t *testing.T, buildArg, runArg, top, bottom string) { t.Fatal(err) } - got, err := testenv.CleanCmdEnv(exec.Command(exe, runArg)).CombinedOutput() + // pprofCgoTraceback is called whenever CGO code is executing and a signal + // is received. Disable signal preemption to increase the likelihood at + // least one SIGPROF signal fired to capture a sample. See issue #37201. + cmd := testenv.CleanCmdEnv(exec.Command(exe, runArg)) + cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1") + + got, err := cmd.CombinedOutput() if err != nil { if testenv.Builder() == "linux-amd64-alpine" { // See Issue 18243 and Issue 19938. 
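The conv_wasm_test.go table above pins down wasm's behavior for out-of-range float-to-integer conversions, which the language otherwise leaves implementation-dependent. A small host-side probe of a few of the same conversions (a sketch only; the printed values are expected to differ between architectures):

package main

import "fmt"

func main() {
	// Out-of-range float64 -> integer conversions do not panic; the result is
	// implementation-dependent, which is exactly what the wasm table documents.
	inputs := []float64{
		0x7fffffffffffffff,  // rounds to 2^63, which overflows int64
		-0x8000000000000001, // rounds to -2^63; negative, so it overflows uint64
		0xffffffffffffffff,  // rounds to 2^64, which overflows both
	}
	for _, f := range inputs {
		fmt.Printf("%g -> int64 %#x, uint64 %#x\n", f, int64(f), uint64(f))
	}
}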
@@ -561,3 +567,48 @@ func findTrace(text, top string) []string { } return nil } + +func TestSegv(t *testing.T) { + switch runtime.GOOS { + case "plan9", "windows": + t.Skipf("no signals on %s", runtime.GOOS) + } + + for _, test := range []string{"Segv", "SegvInCgo"} { + t.Run(test, func(t *testing.T) { + t.Parallel() + got := runTestProg(t, "testprogcgo", test) + t.Log(got) + if !strings.Contains(got, "SIGSEGV") { + t.Errorf("expected crash from signal") + } + }) + } +} + +// TestEINTR tests that we handle EINTR correctly. +// See issue #20400 and friends. +func TestEINTR(t *testing.T) { + switch runtime.GOOS { + case "plan9", "windows": + t.Skipf("no EINTR on %s", runtime.GOOS) + case "linux": + if runtime.GOARCH == "386" { + // On linux-386 the Go signal handler sets + // a restorer function that is not preserved + // by the C sigaction call in the test, + // causing the signal handler to crash when + // returning the normal code. The test is not + // architecture-specific, so just skip on 386 + // rather than doing a complicated workaround. + t.Skip("skipping on linux-386; C sigaction does not preserve Go restorer") + } + } + + t.Parallel() + output := runTestProg(t, "testprogcgo", "EINTR") + want := "OK\n" + if output != want { + t.Fatalf("want %s, got %s\n", want, output) + } +} diff --git a/libgo/go/runtime/crash_unix_test.go b/libgo/go/runtime/crash_unix_test.go index 7ce5bb207d1..80184d94ad5 100644 --- a/libgo/go/runtime/crash_unix_test.go +++ b/libgo/go/runtime/crash_unix_test.go @@ -15,6 +15,7 @@ import ( "os/exec" "path/filepath" "runtime" + "strings" "sync" "syscall" "testing" @@ -291,6 +292,12 @@ func TestSignalExitStatus(t *testing.T) { } func TestSignalIgnoreSIGTRAP(t *testing.T) { + if runtime.GOOS == "openbsd" { + if bn := testenv.Builder(); strings.HasSuffix(bn, "-62") || strings.HasSuffix(bn, "-64") { + testenv.SkipFlaky(t, 17496) + } + } + output := runTestProg(t, "testprognet", "SignalIgnoreSIGTRAP") want := "OK\n" if output != want { diff --git a/libgo/go/runtime/debug.go b/libgo/go/runtime/debug.go index e480466b4d5..1202e362a5a 100644 --- a/libgo/go/runtime/debug.go +++ b/libgo/go/runtime/debug.go @@ -26,12 +26,12 @@ func GOMAXPROCS(n int) int { return ret } - stopTheWorld("GOMAXPROCS") + stopTheWorldGC("GOMAXPROCS") // newprocs will be processed by startTheWorld newprocs = int32(n) - startTheWorld() + startTheWorldGC() return ret } diff --git a/libgo/go/runtime/debug/mod.go b/libgo/go/runtime/debug/mod.go index c283928ff99..feac16894fb 100644 --- a/libgo/go/runtime/debug/mod.go +++ b/libgo/go/runtime/debug/mod.go @@ -48,9 +48,27 @@ func readBuildInfo(data string) (*BuildInfo, bool) { repLine = "=>\t" ) - info := &BuildInfo{} + readEntryFirstLine := func(elem []string) (Module, bool) { + if len(elem) != 2 && len(elem) != 3 { + return Module{}, false + } + sum := "" + if len(elem) == 3 { + sum = elem[2] + } + return Module{ + Path: elem[0], + Version: elem[1], + Sum: sum, + }, true + } - var line string + var ( + info = &BuildInfo{} + last *Module + line string + ok bool + ) // Reverse of cmd/go/internal/modload.PackageBuildInfo for len(data) > 0 { i := strings.IndexByte(data, '\n') @@ -64,42 +82,33 @@ func readBuildInfo(data string) (*BuildInfo, bool) { info.Path = elem case strings.HasPrefix(line, modLine): elem := strings.Split(line[len(modLine):], "\t") - if len(elem) != 3 { + last = &info.Main + *last, ok = readEntryFirstLine(elem) + if !ok { return nil, false } - info.Main = Module{ - Path: elem[0], - Version: elem[1], - Sum: elem[2], - } case 
strings.HasPrefix(line, depLine): elem := strings.Split(line[len(depLine):], "\t") - if len(elem) != 2 && len(elem) != 3 { + last = new(Module) + info.Deps = append(info.Deps, last) + *last, ok = readEntryFirstLine(elem) + if !ok { return nil, false } - sum := "" - if len(elem) == 3 { - sum = elem[2] - } - info.Deps = append(info.Deps, &Module{ - Path: elem[0], - Version: elem[1], - Sum: sum, - }) case strings.HasPrefix(line, repLine): elem := strings.Split(line[len(repLine):], "\t") if len(elem) != 3 { return nil, false } - last := len(info.Deps) - 1 - if last < 0 { + if last == nil { return nil, false } - info.Deps[last].Replace = &Module{ + last.Replace = &Module{ Path: elem[0], Version: elem[1], Sum: elem[2], } + last = nil } } return info, true diff --git a/libgo/go/runtime/debuglog.go b/libgo/go/runtime/debuglog.go index 404d0570a0b..d8c87c76462 100644 --- a/libgo/go/runtime/debuglog.go +++ b/libgo/go/runtime/debuglog.go @@ -672,13 +672,17 @@ func (r *debugLogReader) printVal() bool { print("..(", r.uvarint(), " more bytes)..") case debugLogPC: - printDebugLogPC(uintptr(r.uvarint())) + printDebugLogPC(uintptr(r.uvarint()), false) case debugLogTraceback: n := int(r.uvarint()) for i := 0; i < n; i++ { print("\n\t") - printDebugLogPC(uintptr(r.uvarint())) + // gentraceback PCs are always return PCs. + // Convert them to call PCs. + // + // TODO(austin): Expand inlined frames. + printDebugLogPC(uintptr(r.uvarint()), true) } } @@ -801,9 +805,18 @@ func printDebugLog() { printunlock() } -func printDebugLogPC(pc uintptr) { - print(hex(pc)) +// printDebugLogPC prints a single symbolized PC. If returnPC is true, +// pc is a return PC that must first be converted to a call PC. +func printDebugLogPC(pc uintptr, returnPC bool) { name, file, line, _ := funcfileline(pc, -1, false) + entry := funcentry(pc) + if returnPC && (name == "" || (entry != 0 && pc > funcentry(pc))) { + // TODO(austin): Don't back up if the previous frame + // was a sigpanic. + pc-- + } + + print(hex(pc)) if name == "" { print(" [unknown PC]") } else { diff --git a/libgo/go/runtime/defer_test.go b/libgo/go/runtime/defer_test.go index 11436a1f08b..5ac08145646 100644 --- a/libgo/go/runtime/defer_test.go +++ b/libgo/go/runtime/defer_test.go @@ -6,7 +6,6 @@ package runtime_test import ( "fmt" - "os" "reflect" "runtime" "testing" @@ -325,11 +324,13 @@ func recurseFnPanicRec(level int, maxlevel int) { recurseFn(level, maxlevel) } +var saveInt uint32 + func recurseFn(level int, maxlevel int) { a := [40]uint32{0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff} if level+1 < maxlevel { - // Need this print statement to keep a around. '_ = a[4]' doesn't do it. 
- fmt.Fprintln(os.Stderr, "recurseFn", level, a[4]) + // Make sure a array is referenced, so it is not optimized away + saveInt = a[4] recurseFn(level+1, maxlevel) } else { panic("recurseFn panic") @@ -350,12 +351,12 @@ func TestIssue37688(t *testing.T) { type foo struct { } +//go:noinline func (f *foo) method1() { - fmt.Fprintln(os.Stderr, "method1") } +//go:noinline func (f *foo) method2() { - fmt.Fprintln(os.Stderr, "method2") } func g2() { @@ -379,6 +380,10 @@ func g3() { g2() } +var globstruct struct { + a, b, c, d, e, f, g, h, i int +} + func ff1(ap *foo, a, b, c, d, e, f, g, h, i int) { defer ap.method1() @@ -387,9 +392,15 @@ func ff1(ap *foo, a, b, c, d, e, f, g, h, i int) { // defer pool) defer func(ap *foo, a, b, c, d, e, f, g, h, i int) { if v := recover(); v != nil { - fmt.Fprintln(os.Stderr, "did recover") } - fmt.Fprintln(os.Stderr, "debug", ap, a, b, c, d, e, f, g, h) + globstruct.a = a + globstruct.b = b + globstruct.c = c + globstruct.d = d + globstruct.e = e + globstruct.f = f + globstruct.g = g + globstruct.h = h }(ap, a, b, c, d, e, f, g, h, i) panic("ff1 panic") } @@ -397,7 +408,5 @@ func ff1(ap *foo, a, b, c, d, e, f, g, h, i int) { func rec1(max int) { if max > 0 { rec1(max - 1) - } else { - fmt.Fprintln(os.Stderr, "finished recursion", max) } } diff --git a/libgo/go/runtime/env_posix.go b/libgo/go/runtime/env_posix.go index bf8996cd50d..48b9c4a32e1 100644 --- a/libgo/go/runtime/env_posix.go +++ b/libgo/go/runtime/env_posix.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build aix darwin dragonfly freebsd hurd js,wasm linux netbsd openbsd solaris windows +// +build aix darwin dragonfly freebsd hurd js,wasm linux netbsd openbsd solaris windows plan9 package runtime diff --git a/libgo/go/runtime/env_test.go b/libgo/go/runtime/env_test.go index 2399e46faa9..c009d0f31e1 100644 --- a/libgo/go/runtime/env_test.go +++ b/libgo/go/runtime/env_test.go @@ -11,10 +11,6 @@ import ( ) func TestFixedGOROOT(t *testing.T) { - if runtime.GOOS == "plan9" { - t.Skipf("skipping plan9, it is inconsistent by allowing GOROOT to be updated by Setenv") - } - // Restore both the real GOROOT environment variable, and runtime's copies: if orig, ok := syscall.Getenv("GOROOT"); ok { defer syscall.Setenv("GOROOT", orig) diff --git a/libgo/go/runtime/error.go b/libgo/go/runtime/error.go index 6cc46bfc3ee..d91e2dbf507 100644 --- a/libgo/go/runtime/error.go +++ b/libgo/go/runtime/error.go @@ -226,11 +226,6 @@ type stringer interface { String() string } -func typestring(x interface{}) string { - e := efaceOf(&x) - return e._type.string() -} - // printany prints an argument passed to panic. // If panic is called with a value that has a String or Error method, // it has already been converted into a string by preprintpanics. 
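With the printanycustomtype path added in the next hunk, a panic value of a user-defined scalar type is printed with its type name and underlying value instead of a bare type-and-pointer pair. A hedged illustration (user code; the panic line in the comment assumes the new formatting):

package main

type exitCode int // no Error or String method, so printany formats it directly

func main() {
	// printanycustomtype prints the dynamic type followed by the value in
	// parentheses, so this is expected to report:
	//   panic: main.exitCode(3)
	panic(exitCode(3))
}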
@@ -273,6 +268,50 @@ func printany(i interface{}) { case string: print(v) default: - print("(", typestring(i), ") ", i) + printanycustomtype(i) + } +} + +func printanycustomtype(i interface{}) { + eface := efaceOf(&i) + typestring := eface._type.string() + + switch eface._type.kind & ((1 << 5) - 1) { + case kindString: + print(typestring, `("`, *(*string)(eface.data), `")`) + case kindBool: + print(typestring, "(", *(*bool)(eface.data), ")") + case kindInt: + print(typestring, "(", *(*int)(eface.data), ")") + case kindInt8: + print(typestring, "(", *(*int8)(eface.data), ")") + case kindInt16: + print(typestring, "(", *(*int16)(eface.data), ")") + case kindInt32: + print(typestring, "(", *(*int32)(eface.data), ")") + case kindInt64: + print(typestring, "(", *(*int64)(eface.data), ")") + case kindUint: + print(typestring, "(", *(*uint)(eface.data), ")") + case kindUint8: + print(typestring, "(", *(*uint8)(eface.data), ")") + case kindUint16: + print(typestring, "(", *(*uint16)(eface.data), ")") + case kindUint32: + print(typestring, "(", *(*uint32)(eface.data), ")") + case kindUint64: + print(typestring, "(", *(*uint64)(eface.data), ")") + case kindUintptr: + print(typestring, "(", *(*uintptr)(eface.data), ")") + case kindFloat32: + print(typestring, "(", *(*float32)(eface.data), ")") + case kindFloat64: + print(typestring, "(", *(*float64)(eface.data), ")") + case kindComplex64: + print(typestring, *(*complex64)(eface.data)) + case kindComplex128: + print(typestring, *(*complex128)(eface.data)) + default: + print("(", typestring, ") ", eface.data) } } diff --git a/libgo/go/runtime/export_debug_test.go b/libgo/go/runtime/export_debug_test.go index 769ad55c588..c1cc44f23d0 100644 --- a/libgo/go/runtime/export_debug_test.go +++ b/libgo/go/runtime/export_debug_test.go @@ -49,6 +49,9 @@ func InjectDebugCall(gp *g, fn, args interface{}, tkill func(tid int) error, ret h := new(debugCallHandler) h.gp = gp + // gp may not be running right now, but we can still get the M + // it will run on since it's locked. + h.mp = gp.lockedm.ptr() h.fv, h.argp, h.argSize = fv, argp, argSize h.handleF = h.handle // Avoid allocating closure during signal @@ -87,6 +90,7 @@ func InjectDebugCall(gp *g, fn, args interface{}, tkill func(tid int) error, ret type debugCallHandler struct { gp *g + mp *m fv *funcval argp unsafe.Pointer argSize uintptr @@ -103,8 +107,8 @@ type debugCallHandler struct { func (h *debugCallHandler) inject(info *siginfo, ctxt *sigctxt, gp2 *g) bool { switch h.gp.atomicstatus { case _Grunning: - if getg().m != h.gp.m { - println("trap on wrong M", getg().m, h.gp.m) + if getg().m != h.mp { + println("trap on wrong M", getg().m, h.mp) return false } // Push current PC on the stack. @@ -136,8 +140,8 @@ func (h *debugCallHandler) inject(info *siginfo, ctxt *sigctxt, gp2 *g) bool { func (h *debugCallHandler) handle(info *siginfo, ctxt *sigctxt, gp2 *g) bool { // Sanity check. 
- if getg().m != h.gp.m { - println("trap on wrong M", getg().m, h.gp.m) + if getg().m != h.mp { + println("trap on wrong M", getg().m, h.mp) return false } f := findfunc(uintptr(ctxt.rip())) diff --git a/libgo/go/runtime/export_test.go b/libgo/go/runtime/export_test.go index ab74e34d611..482d014ad58 100644 --- a/libgo/go/runtime/export_test.go +++ b/libgo/go/runtime/export_test.go @@ -477,6 +477,8 @@ func GetNextArenaHint() uintptr { type G = g +type Sudog = sudog + func Getg() *G { return getg() } @@ -727,9 +729,12 @@ func (p *PageAlloc) Free(base, npages uintptr) { func (p *PageAlloc) Bounds() (ChunkIdx, ChunkIdx) { return ChunkIdx((*pageAlloc)(p).start), ChunkIdx((*pageAlloc)(p).end) } -func (p *PageAlloc) Scavenge(nbytes uintptr, locked bool) (r uintptr) { +func (p *PageAlloc) Scavenge(nbytes uintptr, mayUnlock bool) (r uintptr) { + pp := (*pageAlloc)(p) systemstack(func() { - r = (*pageAlloc)(p).scavenge(nbytes, locked) + lock(pp.mheapLock) + r = pp.scavenge(nbytes, mayUnlock) + unlock(pp.mheapLock) }) return } @@ -737,8 +742,8 @@ func (p *PageAlloc) InUse() []AddrRange { ranges := make([]AddrRange, 0, len(p.inUse.ranges)) for _, r := range p.inUse.ranges { ranges = append(ranges, AddrRange{ - Base: r.base, - Limit: r.limit, + Base: r.base.addr(), + Limit: r.limit.addr(), }) } return ranges @@ -784,6 +789,7 @@ func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc { // We've got an entry, so initialize the pageAlloc. p.init(new(mutex), nil) + lockInit(p.mheapLock, lockRankMheap) p.test = true for i, init := range chunks { @@ -810,7 +816,6 @@ func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc { } } } - p.resetScavengeAddr() // Apply alloc state. for _, s := range init { @@ -824,6 +829,11 @@ func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc { // Update heap metadata for the allocRange calls above. p.update(addr, pallocChunkPages, false, false) } + systemstack(func() { + lock(p.mheapLock) + p.scavengeStartGen() + unlock(p.mheapLock) + }) return (*PageAlloc)(p) } @@ -860,13 +870,9 @@ func FreePageAlloc(pp *PageAlloc) { // 64 bit and 32 bit platforms, allowing the tests to share code // between the two. // -// On AIX, the arenaBaseOffset is 0x0a00000000000000. However, this -// constant can't be used here because it is negative and will cause -// a constant overflow. -// // This should not be higher than 0x100*pallocChunkBytes to support // mips and mipsle, which only have 31-bit address spaces. -var BaseChunkIdx = ChunkIdx(chunkIndex(((0xc000*pageAlloc64Bit + 0x100*pageAlloc32Bit) * pallocChunkBytes) + 0x0a00000000000000*sys.GoosAix*sys.GoarchPpc64)) +var BaseChunkIdx = ChunkIdx(chunkIndex(((0xc000*pageAlloc64Bit + 0x100*pageAlloc32Bit) * pallocChunkBytes) + arenaBaseOffset*sys.GoosAix*sys.GoarchPpc64)) // PageBase returns an address given a chunk index and a page index // relative to that chunk. @@ -970,4 +976,12 @@ func MapHashCheck(m interface{}, k interface{}) (uintptr, uintptr) { return x, y } +func MSpanCountAlloc(bits []byte) int { + s := mspan{ + nelems: uintptr(len(bits) * 8), + gcmarkBits: (*gcBits)(unsafe.Pointer(&bits[0])), + } + return s.countAlloc() +} + var Pusestackmaps = &usestackmaps diff --git a/libgo/go/runtime/extern.go b/libgo/go/runtime/extern.go index 96af6063f82..a85fbb07c00 100644 --- a/libgo/go/runtime/extern.go +++ b/libgo/go/runtime/extern.go @@ -87,9 +87,11 @@ It is a comma-separated list of name=val pairs setting these named variables: When set to 0 memory profiling is disabled. 
Refer to the description of MemProfileRate for the default value. - memprofilerate: setting memprofilerate=X changes the setting for - runtime.MemProfileRate. Refer to the description of this variable for how - it is used and its default value. + invalidptr: invalidptr=1 (the default) causes the garbage collector and stack + copier to crash the program if an invalid pointer value (for example, 1) + is found in a pointer-typed location. Setting invalidptr=0 disables this check. + This should only be used as a temporary workaround to diagnose buggy code. + The real fix is to not store integers in pointer-typed locations. sbrk: setting sbrk=1 replaces the memory allocator and garbage collector with a trivial allocator that obtains memory from the operating system and @@ -102,10 +104,11 @@ It is a comma-separated list of name=val pairs setting these named variables: scavenger as well as the total amount of memory returned to the operating system and an estimate of physical memory utilization. The format of this line is subject to change, but currently it is: - scav # KiB work, # KiB total, #% util + scav # # KiB work, # KiB total, #% util where the fields are as follows: - # KiB work the amount of memory returned to the OS since the last scav line - # KiB total how much of the heap at this point in time has been released to the OS + scav # the scavenge cycle number + # KiB work the amount of memory returned to the OS since the last line + # KiB total the total amount of memory returned to the OS #% util the fraction of all unscavenged memory which is in-use If the line ends with "(forced)", then scavenging was forced by a debug.FreeOSMemory() call. diff --git a/libgo/go/runtime/gc_test.go b/libgo/go/runtime/gc_test.go index 8f14bf9e51f..f545e4bd3b6 100644 --- a/libgo/go/runtime/gc_test.go +++ b/libgo/go/runtime/gc_test.go @@ -6,10 +6,13 @@ package runtime_test import ( "fmt" + "math/rand" "os" "reflect" "runtime" "runtime/debug" + "sort" + "strings" "sync" "sync/atomic" "testing" @@ -193,6 +196,18 @@ func TestPeriodicGC(t *testing.T) { } */ +func TestGcZombieReporting(t *testing.T) { + if runtime.Compiler == "gccgo" { + t.Skip("gccgo uses partially conservative GC") + } + // This test is somewhat sensitive to how the allocator works. + got := runTestProg(t, "testprog", "GCZombie") + want := "found pointer to free object" + if !strings.Contains(got, want) { + t.Fatalf("expected %q in output, but got %q", want, got) + } +} + func BenchmarkSetTypePtr(b *testing.B) { benchSetType(b, new(*byte)) } @@ -509,6 +524,90 @@ func BenchmarkReadMemStats(b *testing.B) { hugeSink = nil } +func BenchmarkReadMemStatsLatency(b *testing.B) { + // We’ll apply load to the runtime with maxProcs-1 goroutines + // and use one more to actually benchmark. It doesn't make sense + // to try to run this test with only 1 P (that's what + // BenchmarkReadMemStats is for). + maxProcs := runtime.GOMAXPROCS(-1) + if maxProcs == 1 { + b.Skip("This benchmark can only be run with GOMAXPROCS > 1") + } + + // Code to build a big tree with lots of pointers. + type node struct { + children [16]*node + } + var buildTree func(depth int) *node + buildTree = func(depth int) *node { + tree := new(node) + if depth != 0 { + for i := range tree.children { + tree.children[i] = buildTree(depth - 1) + } + } + return tree + } + + // Keep the GC busy by continuously generating large trees. 
+ done := make(chan struct{}) + var wg sync.WaitGroup + for i := 0; i < maxProcs-1; i++ { + wg.Add(1) + go func() { + defer wg.Done() + var hold *node + loop: + for { + hold = buildTree(5) + select { + case <-done: + break loop + default: + } + } + runtime.KeepAlive(hold) + }() + } + + // Spend this much time measuring latencies. + latencies := make([]time.Duration, 0, 1024) + + // Run for timeToBench hitting ReadMemStats continuously + // and measuring the latency. + b.ResetTimer() + var ms runtime.MemStats + for i := 0; i < b.N; i++ { + // Sleep for a bit, otherwise we're just going to keep + // stopping the world and no one will get to do anything. + time.Sleep(100 * time.Millisecond) + start := time.Now() + runtime.ReadMemStats(&ms) + latencies = append(latencies, time.Now().Sub(start)) + } + close(done) + // Make sure to stop the timer before we wait! The goroutines above + // are very heavy-weight and not easy to stop, so we could end up + // confusing the benchmarking framework for small b.N. + b.StopTimer() + wg.Wait() + + // Disable the default */op metrics. + // ns/op doesn't mean anything because it's an average, but we + // have a sleep in our b.N loop above which skews this significantly. + b.ReportMetric(0, "ns/op") + b.ReportMetric(0, "B/op") + b.ReportMetric(0, "allocs/op") + + // Sort latencies then report percentiles. + sort.Slice(latencies, func(i, j int) bool { + return latencies[i] < latencies[j] + }) + b.ReportMetric(float64(latencies[len(latencies)*50/100]), "p50-ns") + b.ReportMetric(float64(latencies[len(latencies)*90/100]), "p90-ns") + b.ReportMetric(float64(latencies[len(latencies)*99/100]), "p99-ns") +} + func TestUserForcedGC(t *testing.T) { // Test that runtime.GC() triggers a GC even if GOGC=off. defer debug.SetGCPercent(debug.SetGCPercent(-1)) @@ -669,6 +768,24 @@ func BenchmarkScanStackNoLocals(b *testing.B) { close(teardown) } +func BenchmarkMSpanCountAlloc(b *testing.B) { + // n is the number of bytes to benchmark against. + // n must always be a multiple of 8, since gcBits is + // always rounded up 8 bytes. + for _, n := range []int{8, 16, 32, 64, 128} { + b.Run(fmt.Sprintf("bits=%d", n*8), func(b *testing.B) { + // Initialize a new byte slice with pseduo-random data. 
+ bits := make([]byte, n) + rand.Read(bits) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + runtime.MSpanCountAlloc(bits) + } + }) + } +} + func countpwg(n *int, ready *sync.WaitGroup, teardown chan bool) { if *n == 0 { ready.Done() diff --git a/libgo/go/runtime/hash_test.go b/libgo/go/runtime/hash_test.go index 522b7febf9b..60a86015f64 100644 --- a/libgo/go/runtime/hash_test.go +++ b/libgo/go/runtime/hash_test.go @@ -152,14 +152,13 @@ func (s *HashSet) addS_seed(x string, seed uintptr) { s.add(StringHash(x, seed)) } func (s *HashSet) check(t *testing.T) { - const SLOP = 10.0 + const SLOP = 50.0 collisions := s.n - len(s.m) - //fmt.Printf("%d/%d\n", len(s.m), s.n) pairs := int64(s.n) * int64(s.n-1) / 2 expected := float64(pairs) / math.Pow(2.0, float64(hashSize)) stddev := math.Sqrt(expected) if float64(collisions) > expected+SLOP*(3*stddev+1) { - t.Errorf("unexpected number of collisions: got=%d mean=%f stddev=%f", collisions, expected, stddev) + t.Errorf("unexpected number of collisions: got=%d mean=%f stddev=%f threshold=%f", collisions, expected, stddev, expected+SLOP*(3*stddev+1)) } } @@ -564,8 +563,11 @@ func avalancheTest1(t *testing.T, k Key) { // All bit rotations of a set of distinct keys func TestSmhasherWindowed(t *testing.T) { + t.Logf("32 bit keys") windowed(t, &Int32Key{}) + t.Logf("64 bit keys") windowed(t, &Int64Key{}) + t.Logf("string keys") windowed(t, &BytesKey{make([]byte, 128)}) } func windowed(t *testing.T, k Key) { diff --git a/libgo/go/runtime/iface.go b/libgo/go/runtime/iface.go index 74b54f5209c..877e191a855 100644 --- a/libgo/go/runtime/iface.go +++ b/libgo/go/runtime/iface.go @@ -296,6 +296,7 @@ func getitab(lhs, rhs *_type, canfail bool) unsafe.Pointer { } // Not found. Grab the lock and try again. + lockInit(&itabLock, lockRankItab) lock(&itabLock) if m = itabTable.find(lhsi, rhs); m != nil { unlock(&itabLock) @@ -514,8 +515,8 @@ func reflectlite_ifaceE2I(inter *interfacetype, e eface, dst *iface) { dst.data = e.data } -// staticbytes is used to avoid convT2E for byte-sized values. -var staticbytes = [...]byte{ +// staticuint64s is used to avoid allocating in convTx for small integer values. 
+var staticuint64s = [...]uint64{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, diff --git a/libgo/go/runtime/iface_test.go b/libgo/go/runtime/iface_test.go index d63ea796138..43d3698b1d3 100644 --- a/libgo/go/runtime/iface_test.go +++ b/libgo/go/runtime/iface_test.go @@ -95,6 +95,19 @@ func BenchmarkNeIfaceConcrete(b *testing.B) { } } +func BenchmarkConvT2EByteSized(b *testing.B) { + b.Run("bool", func(b *testing.B) { + for i := 0; i < b.N; i++ { + e = yes + } + }) + b.Run("uint8", func(b *testing.B) { + for i := 0; i < b.N; i++ { + e = eight8 + } + }) +} + func BenchmarkConvT2ESmall(b *testing.B) { for i := 0; i < b.N; i++ { e = ts @@ -322,18 +335,22 @@ func TestZeroConvT2x(t *testing.T) { var ( eight8 uint8 = 8 eight8I T8 = 8 + yes bool = true - zero16 uint16 = 0 - zero16I T16 = 0 - one16 uint16 = 1 + zero16 uint16 = 0 + zero16I T16 = 0 + one16 uint16 = 1 + thousand16 uint16 = 1000 - zero32 uint32 = 0 - zero32I T32 = 0 - one32 uint32 = 1 + zero32 uint32 = 0 + zero32I T32 = 0 + one32 uint32 = 1 + thousand32 uint32 = 1000 - zero64 uint64 = 0 - zero64I T64 = 0 - one64 uint64 = 1 + zero64 uint64 = 0 + zero64I T64 = 0 + one64 uint64 = 1 + thousand64 uint64 = 1000 zerostr string = "" zerostrI Tstr = "" @@ -381,6 +398,23 @@ func BenchmarkConvT2Ezero(b *testing.B) { }) }) b.Run("nonzero", func(b *testing.B) { + b.Run("str", func(b *testing.B) { + for i := 0; i < b.N; i++ { + e = nzstr + } + }) + b.Run("slice", func(b *testing.B) { + for i := 0; i < b.N; i++ { + e = nzslice + } + }) + b.Run("big", func(b *testing.B) { + for i := 0; i < b.N; i++ { + e = nzbig + } + }) + }) + b.Run("smallint", func(b *testing.B) { b.Run("16", func(b *testing.B) { for i := 0; i < b.N; i++ { e = one16 @@ -396,19 +430,21 @@ func BenchmarkConvT2Ezero(b *testing.B) { e = one64 } }) - b.Run("str", func(b *testing.B) { + }) + b.Run("largeint", func(b *testing.B) { + b.Run("16", func(b *testing.B) { for i := 0; i < b.N; i++ { - e = nzstr + e = thousand16 } }) - b.Run("slice", func(b *testing.B) { + b.Run("32", func(b *testing.B) { for i := 0; i < b.N; i++ { - e = nzslice + e = thousand32 } }) - b.Run("big", func(b *testing.B) { + b.Run("64", func(b *testing.B) { for i := 0; i < b.N; i++ { - e = nzbig + e = thousand64 } }) }) diff --git a/libgo/go/runtime/lock_futex.go b/libgo/go/runtime/lock_futex.go index f672efdc74c..21d7e0dd943 100644 --- a/libgo/go/runtime/lock_futex.go +++ b/libgo/go/runtime/lock_futex.go @@ -55,6 +55,10 @@ func key32(p *uintptr) *uint32 { } func lock(l *mutex) { + lockWithRank(l, getLockRank(l)) +} + +func lock2(l *mutex) { gp := getg() if gp.m.locks < 0 { @@ -115,6 +119,10 @@ func lock(l *mutex) { } func unlock(l *mutex) { + unlockWithRank(l) +} + +func unlock2(l *mutex) { v := atomic.Xchg(key32(&l.key), mutex_unlocked) if v == mutex_unlocked { throw("unlock of unlocked lock") @@ -241,8 +249,8 @@ func notetsleepg(n *note, ns int64) bool { return ok } -func beforeIdle(int64) bool { - return false +func beforeIdle(int64) (*g, bool) { + return nil, false } func checkTimeouts() {} diff --git a/libgo/go/runtime/lock_js.go b/libgo/go/runtime/lock_js.go index 3168c86d8a0..14bdc76842c 100644 --- a/libgo/go/runtime/lock_js.go +++ b/libgo/go/runtime/lock_js.go @@ -26,6 +26,10 @@ const ( ) func lock(l *mutex) { + lockWithRank(l, getLockRank(l)) +} + +func lock2(l *mutex) { if l.key == mutex_locked { // js/wasm is single-threaded so we should never // observe this. 
@@ -40,6 +44,10 @@ func lock(l *mutex) { } func unlock(l *mutex) { + unlockWithRank(l) +} + +func unlock2(l *mutex) { if l.key == mutex_unlocked { throw("unlock of unlocked lock") } @@ -165,7 +173,9 @@ var idleID int32 // beforeIdle gets called by the scheduler if no goroutine is awake. // If we are not already handling an event, then we pause for an async event. // If an event handler returned, we resume it and it will pause the execution. -func beforeIdle(delay int64) bool { +// beforeIdle either returns the specific goroutine to schedule next or +// indicates with otherReady that some goroutine became ready. +func beforeIdle(delay int64) (gp *g, otherReady bool) { if delay > 0 { clearIdleID() if delay < 1e6 { @@ -182,15 +192,14 @@ func beforeIdle(delay int64) bool { if len(events) == 0 { go handleAsyncEvent() - return true + return nil, true } e := events[len(events)-1] if e.returned { - goready(e.gp, 1) - return true + return e.gp, false } - return false + return nil, false } func handleAsyncEvent() { diff --git a/libgo/go/runtime/lock_sema.go b/libgo/go/runtime/lock_sema.go index 63a60149db3..fe173843797 100644 --- a/libgo/go/runtime/lock_sema.go +++ b/libgo/go/runtime/lock_sema.go @@ -44,6 +44,10 @@ const ( ) func lock(l *mutex) { + lockWithRank(l, getLockRank(l)) +} + +func lock2(l *mutex) { gp := getg() if gp.m.locks < 0 { throw("runtime·lock: lock count") @@ -100,9 +104,13 @@ Loop: } } +func unlock(l *mutex) { + unlockWithRank(l) +} + //go:nowritebarrier // We might not be holding a p in this code. -func unlock(l *mutex) { +func unlock2(l *mutex) { gp := getg() var mp *m for { @@ -300,8 +308,8 @@ func notetsleepg(n *note, ns int64) bool { return ok } -func beforeIdle(int64) bool { - return false +func beforeIdle(int64) (*g, bool) { + return nil, false } func checkTimeouts() {} diff --git a/libgo/go/runtime/lockrank.go b/libgo/go/runtime/lockrank.go new file mode 100644 index 00000000000..000193585df --- /dev/null +++ b/libgo/go/runtime/lockrank.go @@ -0,0 +1,254 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This file records the static ranks of the locks in the runtime. If a lock +// is not given a rank, then it is assumed to be a leaf lock, which means no other +// lock can be acquired while it is held. Therefore, leaf locks do not need to be +// given an explicit rank. We list all of the architecture-independent leaf locks +// for documentation purposes, but don't list any of the architecture-dependent +// locks (which are all leaf locks). debugLock is ignored for ranking, since it is used +// when printing out lock ranking errors. +// +// lockInit(l *mutex, rank int) is used to set the rank of lock before it is used. +// If there is no clear place to initialize a lock, then the rank of a lock can be +// specified during the lock call itself via lockWithrank(l *mutex, rank int). +// +// Besides the static lock ranking (which is a total ordering of the locks), we +// also represent and enforce the actual partial order among the locks in the +// arcs[] array below. That is, if it is possible that lock B can be acquired when +// lock A is the previous acquired lock that is still held, then there should be +// an entry for A in arcs[B][]. We will currently fail not only if the total order +// (the lock ranking) is violated, but also if there is a missing entry in the +// partial order. 
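The rule described in this header comment is enforced by checkRanks in lockrank_on.go further down. As a simplified, self-contained model of that rule (not the runtime's code; the rank names and table here are a much-reduced illustration), acquiring a lock is legal only if its rank does not decrease and the previously held rank appears in the new rank's partial-order list:

package main

import "fmt"

// A toy subset of the runtime's lock ranks, in the same relative order.
type rank int

const (
	rankTimers rank = iota + 1
	rankItab
	rankHchan
	rankFin
)

// partialOrder[r] lists the ranks that may legitimately be held immediately
// before acquiring r (a stand-in for lockPartialOrder).
var partialOrder = map[rank][]rank{
	rankTimers: {},
	rankItab:   {},
	rankHchan:  {},
	rankFin:    {rankTimers, rankHchan},
}

// okToAcquire applies the same rule as checkRanks: the total order must not
// decrease, and the previously held rank must appear in the new rank's list.
func okToAcquire(prev, next rank) bool {
	if next < prev {
		return false // total-order (rank) violation
	}
	for _, r := range partialOrder[next] {
		if r == prev {
			return true
		}
	}
	return false // missing arc in the partial order
}

func main() {
	fmt.Println(okToAcquire(rankHchan, rankFin)) // true: hchan may be held before fin
	fmt.Println(okToAcquire(rankItab, rankFin))  // false: no itab -> fin edge recorded
}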
+ +package runtime + +type lockRank int + +// Constants representing the lock rank of the architecture-independent locks in +// the runtime. Locks with lower rank must be taken before locks with higher +// rank. +const ( + lockRankDummy lockRank = iota + + // Locks held above sched + lockRankSysmon + lockRankScavenge + lockRankForcegc + lockRankSweepWaiters + lockRankAssistQueue + lockRankCpuprof + lockRankSweep + + lockRankSched + lockRankDeadlock + lockRankPanic + lockRankAllg + lockRankAllp + lockRankPollDesc + + lockRankTimers // Multiple timers locked simultaneously in destroy() + lockRankItab + lockRankReflectOffs + lockRankHchan // Multiple hchans acquired in lock order in syncadjustsudogs() + lockRankFin + lockRankNotifyList + lockRankTraceBuf + lockRankTraceStrings + lockRankMspanSpecial + lockRankProf + lockRankGcBitsArenas + lockRankRoot + lockRankTrace + lockRankTraceStackTab + lockRankNetpollInit + + lockRankRwmutexW + lockRankRwmutexR + + lockRankMcentral // For !go115NewMCentralImpl + lockRankSpine // For !go115NewMCentralImpl + lockRankSpanSetSpine + lockRankGscan + lockRankStackpool + lockRankStackLarge + lockRankDefer + lockRankSudog + + // Memory-related non-leaf locks + lockRankWbufSpans + lockRankMheap + lockRankMheapSpecial + + // Memory-related leaf locks + lockRankGlobalAlloc + + // Other leaf locks + lockRankGFree + // Generally, hchan must be acquired before gscan. But in one specific + // case (in syncadjustsudogs from markroot after the g has been suspended + // by suspendG), we allow gscan to be acquired, and then an hchan lock. To + // allow this case, we get this lockRankHchanLeaf rank in + // syncadjustsudogs(), rather than lockRankHchan. By using this special + // rank, we don't allow any further locks to be acquired other than more + // hchan locks. + lockRankHchanLeaf + + // Leaf locks with no dependencies, so these constants are not actually used anywhere. + // There are other architecture-dependent leaf locks as well. + lockRankNewmHandoff + lockRankDebugPtrmask + lockRankFaketimeState + lockRankTicks + lockRankRaceFini + lockRankPollCache + lockRankDebug +) + +// lockRankLeafRank is the rank of lock that does not have a declared rank, and hence is +// a leaf lock. 
+const lockRankLeafRank lockRank = 1000 + +// lockNames gives the names associated with each of the above ranks +var lockNames = []string{ + lockRankDummy: "", + + lockRankSysmon: "sysmon", + lockRankScavenge: "scavenge", + lockRankForcegc: "forcegc", + lockRankSweepWaiters: "sweepWaiters", + lockRankAssistQueue: "assistQueue", + lockRankCpuprof: "cpuprof", + lockRankSweep: "sweep", + + lockRankSched: "sched", + lockRankDeadlock: "deadlock", + lockRankPanic: "panic", + lockRankAllg: "allg", + lockRankAllp: "allp", + lockRankPollDesc: "pollDesc", + + lockRankTimers: "timers", + lockRankItab: "itab", + lockRankReflectOffs: "reflectOffs", + + lockRankHchan: "hchan", + lockRankFin: "fin", + lockRankNotifyList: "notifyList", + lockRankTraceBuf: "traceBuf", + lockRankTraceStrings: "traceStrings", + lockRankMspanSpecial: "mspanSpecial", + lockRankProf: "prof", + lockRankGcBitsArenas: "gcBitsArenas", + lockRankRoot: "root", + lockRankTrace: "trace", + lockRankTraceStackTab: "traceStackTab", + lockRankNetpollInit: "netpollInit", + + lockRankRwmutexW: "rwmutexW", + lockRankRwmutexR: "rwmutexR", + + lockRankMcentral: "mcentral", + lockRankSpine: "spine", + lockRankSpanSetSpine: "spanSetSpine", + lockRankGscan: "gscan", + lockRankStackpool: "stackpool", + lockRankStackLarge: "stackLarge", + lockRankDefer: "defer", + lockRankSudog: "sudog", + + lockRankWbufSpans: "wbufSpans", + lockRankMheap: "mheap", + lockRankMheapSpecial: "mheapSpecial", + + lockRankGlobalAlloc: "globalAlloc.mutex", + + lockRankGFree: "gFree", + lockRankHchanLeaf: "hchanLeaf", + + lockRankNewmHandoff: "newmHandoff.lock", + lockRankDebugPtrmask: "debugPtrmask.lock", + lockRankFaketimeState: "faketimeState.lock", + lockRankTicks: "ticks.lock", + lockRankRaceFini: "raceFiniLock", + lockRankPollCache: "pollCache.lock", + lockRankDebug: "debugLock", +} + +func (rank lockRank) String() string { + if rank == 0 { + return "UNKNOWN" + } + if rank == lockRankLeafRank { + return "LEAF" + } + return lockNames[rank] +} + +// lockPartialOrder is a partial order among the various lock types, listing the immediate +// ordering that has actually been observed in the runtime. Each entry (which +// corresponds to a particular lock rank) specifies the list of locks that can be +// already be held immediately "above" it. +// +// So, for example, the lockRankSched entry shows that all the locks preceding it in +// rank can actually be held. The fin lock shows that only the sched, timers, or +// hchan lock can be held immediately above it when it is acquired. 
+var lockPartialOrder [][]lockRank = [][]lockRank{ + lockRankDummy: {}, + lockRankSysmon: {}, + lockRankScavenge: {lockRankSysmon}, + lockRankForcegc: {lockRankSysmon}, + lockRankSweepWaiters: {}, + lockRankAssistQueue: {}, + lockRankCpuprof: {}, + lockRankSweep: {}, + lockRankSched: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep}, + lockRankDeadlock: {lockRankDeadlock}, + lockRankPanic: {lockRankDeadlock}, + lockRankAllg: {lockRankSysmon, lockRankSched, lockRankPanic}, + lockRankAllp: {lockRankSysmon, lockRankSched}, + lockRankPollDesc: {}, + lockRankTimers: {lockRankSysmon, lockRankScavenge, lockRankSched, lockRankAllp, lockRankPollDesc, lockRankTimers}, + lockRankItab: {}, + lockRankReflectOffs: {lockRankItab}, + lockRankHchan: {lockRankScavenge, lockRankSweep, lockRankHchan}, + lockRankFin: {lockRankSysmon, lockRankScavenge, lockRankSched, lockRankAllg, lockRankTimers, lockRankHchan}, + lockRankNotifyList: {}, + lockRankTraceBuf: {lockRankSysmon, lockRankScavenge}, + lockRankTraceStrings: {lockRankTraceBuf}, + lockRankMspanSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings}, + lockRankProf: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan}, + lockRankGcBitsArenas: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan}, + lockRankRoot: {}, + lockRankTrace: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankSched, lockRankHchan, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankSweep}, + lockRankTraceStackTab: {lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankTimers, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankTrace}, + lockRankNetpollInit: {lockRankTimers}, + + lockRankRwmutexW: {}, + lockRankRwmutexR: {lockRankRwmutexW}, + + lockRankMcentral: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan}, + lockRankSpine: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan}, + lockRankSpanSetSpine: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan}, + lockRankGscan: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, 
lockRankFin, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankNotifyList, lockRankProf, lockRankGcBitsArenas, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankMcentral, lockRankSpine, lockRankSpanSetSpine}, + lockRankStackpool: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankRwmutexR, lockRankMcentral, lockRankSpine, lockRankSpanSetSpine, lockRankGscan}, + lockRankStackLarge: {lockRankSysmon, lockRankAssistQueue, lockRankSched, lockRankItab, lockRankHchan, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankMcentral, lockRankSpanSetSpine, lockRankGscan}, + lockRankDefer: {}, + lockRankSudog: {lockRankNotifyList, lockRankHchan}, + lockRankWbufSpans: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankRoot, lockRankGscan, lockRankDefer, lockRankSudog}, + lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankMcentral, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans, lockRankSpanSetSpine}, + lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan}, + lockRankGlobalAlloc: {lockRankProf, lockRankSpine, lockRankSpanSetSpine, lockRankMheap, lockRankMheapSpecial}, + + lockRankGFree: {lockRankSched}, + lockRankHchanLeaf: {lockRankGscan, lockRankHchanLeaf}, + + lockRankNewmHandoff: {}, + lockRankDebugPtrmask: {}, + lockRankFaketimeState: {}, + lockRankTicks: {}, + lockRankRaceFini: {}, + lockRankPollCache: {}, + lockRankDebug: {}, +} diff --git a/libgo/go/runtime/lockrank_off.go b/libgo/go/runtime/lockrank_off.go new file mode 100644 index 00000000000..891589c0f27 --- /dev/null +++ b/libgo/go/runtime/lockrank_off.go @@ -0,0 +1,32 @@ +// +build !goexperiment.staticlockranking + +package runtime + +// // lockRankStruct is embedded in mutex, but is empty when staticklockranking is +// disabled (the default) +type lockRankStruct struct { +} + +func lockInit(l *mutex, rank lockRank) { +} + +func getLockRank(l *mutex) lockRank { + return 0 +} + +func lockWithRank(l *mutex, rank lockRank) { + lock2(l) +} + +func acquireLockRank(rank lockRank) { +} + +func unlockWithRank(l *mutex) { + unlock2(l) +} + +func releaseLockRank(rank lockRank) { +} + +func lockWithRankMayAcquire(l *mutex, rank lockRank) { +} diff --git a/libgo/go/runtime/lockrank_on.go b/libgo/go/runtime/lockrank_on.go new file mode 100644 index 00000000000..cf4151ff462 --- /dev/null +++ b/libgo/go/runtime/lockrank_on.go @@ -0,0 
+1,210 @@ +// +build goexperiment.staticlockranking + +package runtime + +import ( + "unsafe" +) + +// lockRankStruct is embedded in mutex +type lockRankStruct struct { + // static lock ranking of the lock + rank lockRank + // pad field to make sure lockRankStruct is a multiple of 8 bytes, even on + // 32-bit systems. + pad int +} + +// init checks that the partial order in lockPartialOrder fits within the total +// order determined by the order of the lockRank constants. +func init() { + for rank, list := range lockPartialOrder { + for _, entry := range list { + if entry > lockRank(rank) { + println("lockPartial order row", lockRank(rank).String(), "entry", entry.String()) + throw("lockPartialOrder table is inconsistent with total lock ranking order") + } + } + } +} + +func lockInit(l *mutex, rank lockRank) { + l.rank = rank +} + +func getLockRank(l *mutex) lockRank { + return l.rank +} + +// The following functions are the entry-points to record lock +// operations. +// All of these are nosplit and switch to the system stack immediately +// to avoid stack growths. Since a stack growth could itself have lock +// operations, this prevents re-entrant calls. + +// lockWithRank is like lock(l), but allows the caller to specify a lock rank +// when acquiring a non-static lock. +//go:nosplit +func lockWithRank(l *mutex, rank lockRank) { + if l == &debuglock || l == &paniclk { + // debuglock is only used for println/printlock(). Don't do lock + // rank recording for it, since print/println are used when + // printing out a lock ordering problem below. + // + // paniclk has an ordering problem, since it can be acquired + // during a panic with any other locks held (especially if the + // panic is because of a directed segv), and yet also allg is + // acquired after paniclk in tracebackothers()). This is a genuine + // problem, so for now we don't do lock rank recording for paniclk + // either. + lock2(l) + return + } + if rank == 0 { + rank = lockRankLeafRank + } + gp := getg() + // Log the new class. + systemstack(func() { + i := gp.m.locksHeldLen + if i >= len(gp.m.locksHeld) { + throw("too many locks held concurrently for rank checking") + } + gp.m.locksHeld[i].rank = rank + gp.m.locksHeld[i].lockAddr = uintptr(unsafe.Pointer(l)) + gp.m.locksHeldLen++ + + // i is the index of the lock being acquired + if i > 0 { + checkRanks(gp, gp.m.locksHeld[i-1].rank, rank) + } + lock2(l) + }) +} + +// acquireLockRank acquires a rank which is not associated with a mutex lock +//go:nosplit +func acquireLockRank(rank lockRank) { + gp := getg() + // Log the new class. + systemstack(func() { + i := gp.m.locksHeldLen + if i >= len(gp.m.locksHeld) { + throw("too many locks held concurrently for rank checking") + } + gp.m.locksHeld[i].rank = rank + gp.m.locksHeld[i].lockAddr = 0 + gp.m.locksHeldLen++ + + // i is the index of the lock being acquired + if i > 0 { + checkRanks(gp, gp.m.locksHeld[i-1].rank, rank) + } + }) +} + +// checkRanks checks if goroutine g, which has mostly recently acquired a lock +// with rank 'prevRank', can now acquire a lock with rank 'rank'. +func checkRanks(gp *g, prevRank, rank lockRank) { + rankOK := false + if rank < prevRank { + // If rank < prevRank, then we definitely have a rank error + rankOK = false + } else if rank == lockRankLeafRank { + // If new lock is a leaf lock, then the preceding lock can + // be anything except another leaf lock. 
+ rankOK = prevRank < lockRankLeafRank + } else { + // We've now verified the total lock ranking, but we + // also enforce the partial ordering specified by + // lockPartialOrder as well. Two locks with the same rank + // can only be acquired at the same time if explicitly + // listed in the lockPartialOrder table. + list := lockPartialOrder[rank] + for _, entry := range list { + if entry == prevRank { + rankOK = true + break + } + } + } + if !rankOK { + printlock() + println(gp.m.procid, " ======") + for j, held := range gp.m.locksHeld[:gp.m.locksHeldLen] { + println(j, ":", held.rank.String(), held.rank, unsafe.Pointer(gp.m.locksHeld[j].lockAddr)) + } + throw("lock ordering problem") + } +} + +//go:nosplit +func unlockWithRank(l *mutex) { + if l == &debuglock || l == &paniclk { + // See comment at beginning of lockWithRank. + unlock2(l) + return + } + gp := getg() + systemstack(func() { + found := false + for i := gp.m.locksHeldLen - 1; i >= 0; i-- { + if gp.m.locksHeld[i].lockAddr == uintptr(unsafe.Pointer(l)) { + found = true + copy(gp.m.locksHeld[i:gp.m.locksHeldLen-1], gp.m.locksHeld[i+1:gp.m.locksHeldLen]) + gp.m.locksHeldLen-- + break + } + } + if !found { + println(gp.m.procid, ":", l.rank.String(), l.rank, l) + throw("unlock without matching lock acquire") + } + unlock2(l) + }) +} + +// releaseLockRank releases a rank which is not associated with a mutex lock +//go:nosplit +func releaseLockRank(rank lockRank) { + gp := getg() + systemstack(func() { + found := false + for i := gp.m.locksHeldLen - 1; i >= 0; i-- { + if gp.m.locksHeld[i].rank == rank && gp.m.locksHeld[i].lockAddr == 0 { + found = true + copy(gp.m.locksHeld[i:gp.m.locksHeldLen-1], gp.m.locksHeld[i+1:gp.m.locksHeldLen]) + gp.m.locksHeldLen-- + break + } + } + if !found { + println(gp.m.procid, ":", rank.String(), rank) + throw("lockRank release without matching lockRank acquire") + } + }) +} + +//go:nosplit +func lockWithRankMayAcquire(l *mutex, rank lockRank) { + gp := getg() + if gp.m.locksHeldLen == 0 { + // No possibilty of lock ordering problem if no other locks held + return + } + + systemstack(func() { + i := gp.m.locksHeldLen + if i >= len(gp.m.locksHeld) { + throw("too many locks held concurrently for rank checking") + } + // Temporarily add this lock to the locksHeld list, so + // checkRanks() will print out list, including this lock, if there + // is a lock ordering problem. + gp.m.locksHeld[i].rank = rank + gp.m.locksHeld[i].lockAddr = uintptr(unsafe.Pointer(l)) + gp.m.locksHeldLen++ + checkRanks(gp, gp.m.locksHeld[i-1].rank, rank) + gp.m.locksHeldLen-- + }) +} diff --git a/libgo/go/runtime/malloc.go b/libgo/go/runtime/malloc.go index 6df7eaa53bb..81351eefbb7 100644 --- a/libgo/go/runtime/malloc.go +++ b/libgo/go/runtime/malloc.go @@ -312,7 +312,9 @@ const ( // // On other platforms, the user address space is contiguous // and starts at 0, so no offset is necessary. - arenaBaseOffset = sys.GoarchAmd64*(1<<47) + (^0x0a00000000000000+1)&uintptrMask*sys.GoosAix*sys.GoarchPpc64 + arenaBaseOffset = 0xffff800000000000*sys.GoarchAmd64 + 0x0a00000000000000*sys.GoosAix*sys.GoarchPpc64 + // A typed version of this constant that will make it into DWARF (for viewcore). + arenaBaseOffsetUintptr = uintptr(arenaBaseOffset) // Max number of threads to run garbage collection. 
// 2, 3, and 4 are all plausible maximums depending @@ -476,11 +478,21 @@ func mallocinit() { physHugePageShift++ } } + if pagesPerArena%pagesPerSpanRoot != 0 { + print("pagesPerArena (", pagesPerArena, ") is not divisible by pagesPerSpanRoot (", pagesPerSpanRoot, ")\n") + throw("bad pagesPerSpanRoot") + } + if pagesPerArena%pagesPerReclaimerChunk != 0 { + print("pagesPerArena (", pagesPerArena, ") is not divisible by pagesPerReclaimerChunk (", pagesPerReclaimerChunk, ")\n") + throw("bad pagesPerReclaimerChunk") + } // Initialize the heap. mheap_.init() - _g_ := getg() - _g_.m.mcache = allocmcache() + mcache0 = allocmcache() + lockInit(&gcBitsArenas.lock, lockRankGcBitsArenas) + lockInit(&proflock, lockRankProf) + lockInit(&globalAlloc.mutex, lockRankGlobalAlloc) // Create initial arena growth hints. if sys.PtrSize == 8 { @@ -605,7 +617,7 @@ func mallocinit() { a, size := sysReserveAligned(unsafe.Pointer(p), arenaSize, heapArenaBytes) if a != nil { mheap_.arena.init(uintptr(a), size) - p = uintptr(a) + size // For hint below + p = mheap_.arena.end // For hint below break } } @@ -937,7 +949,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { // but before syscall.CgocallDone. Treat this allocation as a // callback. incallback := false - if gomcache() == nil && getg().m.ncgo > 0 { + if gp := getg(); gp.m.p == 0 && gp.m.ncgo > 0 { exitsyscall() incallback = true } @@ -975,7 +987,20 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { shouldhelpgc := false dataSize := size - c := gomcache() + var c *mcache + if mp.p != 0 { + c = mp.p.ptr().mcache + } else { + // We will be called without a P while bootstrapping, + // in which case we use mcache0, which is set in mallocinit. + // mcache0 is cleared when bootstrapping is complete, + // by procresize. + c = mcache0 + if c == nil { + throw("malloc called with no P") + } + } + var span *mspan var x unsafe.Pointer noscan := typ == nil || typ.ptrdata == 0 if size <= maxSmallSize { @@ -1031,10 +1056,10 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { return x } // Allocate a new maxTinySize block. 
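With the mcache moving off the M, mallocgc now reaches it through the current P and falls back to mcache0 only before the first P exists. A condensed model of that selection (the helper name mcacheFor and the types here are illustrative only, not runtime identifiers):

package main

import "fmt"

type mcache struct{ id int }
type p struct{ cache *mcache }
type m struct{ p *p }

var bootstrapCache = &mcache{id: 0} // plays the role of mcache0, set during init

// mcacheFor prefers the current P's cache; with no P we must be
// bootstrapping, so use the bootstrap cache or fail loudly.
func mcacheFor(mp *m) *mcache {
	if mp.p != nil {
		return mp.p.cache
	}
	if bootstrapCache == nil {
		panic("malloc called with no P")
	}
	return bootstrapCache
}

func main() {
	early := &m{}                              // bootstrapping: no P yet
	later := &m{p: &p{cache: &mcache{id: 1}}}  // normal operation
	fmt.Println(mcacheFor(early).id, mcacheFor(later).id) // 0 1
}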
- span := c.alloc[tinySpanClass] + span = c.alloc[tinySpanClass] v := nextFreeFast(span) if v == 0 { - v, _, shouldhelpgc = c.nextFree(tinySpanClass) + v, span, shouldhelpgc = c.nextFree(tinySpanClass) } x = unsafe.Pointer(v) (*[2]uint64)(x)[0] = 0 @@ -1049,13 +1074,13 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { } else { var sizeclass uint8 if size <= smallSizeMax-8 { - sizeclass = size_to_class8[(size+smallSizeDiv-1)/smallSizeDiv] + sizeclass = size_to_class8[divRoundUp(size, smallSizeDiv)] } else { - sizeclass = size_to_class128[(size-smallSizeMax+largeSizeDiv-1)/largeSizeDiv] + sizeclass = size_to_class128[divRoundUp(size-smallSizeMax, largeSizeDiv)] } size = uintptr(class_to_size[sizeclass]) spc := makeSpanClass(sizeclass, noscan) - span := c.alloc[spc] + span = c.alloc[spc] v := nextFreeFast(span) if v == 0 { v, span, shouldhelpgc = c.nextFree(spc) @@ -1066,15 +1091,14 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { } } } else { - var s *mspan shouldhelpgc = true systemstack(func() { - s = largeAlloc(size, needzero, noscan) + span = largeAlloc(size, needzero, noscan) }) - s.freeindex = 1 - s.allocCount = 1 - x = unsafe.Pointer(s.base()) - size = s.elemsize + span.freeindex = 1 + span.allocCount = 1 + x = unsafe.Pointer(span.base()) + size = span.elemsize } var scanSize uintptr @@ -1106,7 +1130,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { // This may be racing with GC so do it atomically if there can be // a race marking the bit. if gcphase != _GCoff { - gcmarknewobject(uintptr(x), size, scanSize) + gcmarknewobject(span, uintptr(x), size, scanSize) } if raceenabled { @@ -1174,10 +1198,16 @@ func largeAlloc(size uintptr, needzero bool, noscan bool) *mspan { // pays the debt down to npage pages. deductSweepCredit(npages*_PageSize, npages) - s := mheap_.alloc(npages, makeSpanClass(0, noscan), needzero) + spc := makeSpanClass(0, noscan) + s := mheap_.alloc(npages, spc, needzero) if s == nil { throw("out of memory") } + if go115NewMCentralImpl { + // Put the large span in the mcentral swept list so that it's + // visible to the background sweeper. + mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s) + } s.limit = s.base() + size heapBitsForAddr(s.base()).initSpan(s) return s @@ -1218,7 +1248,16 @@ func reflect_unsafe_NewArray(typ *_type, n int) unsafe.Pointer { } func profilealloc(mp *m, x unsafe.Pointer, size uintptr) { - mp.mcache.next_sample = nextSample() + var c *mcache + if mp.p != 0 { + c = mp.p.ptr().mcache + } else { + c = mcache0 + if c == nil { + throw("profilealloc called with no P") + } + } + c.next_sample = nextSample() mProf_Malloc(x, size) } @@ -1411,6 +1450,13 @@ type linearAlloc struct { } func (l *linearAlloc) init(base, size uintptr) { + if base+size < base { + // Chop off the last byte. The runtime isn't prepared + // to deal with situations where the bounds could overflow. + // Leave that memory reserved, though, so we don't map it + // later. + size -= 1 + } l.next, l.mapped = base, base l.end = base + size } diff --git a/libgo/go/runtime/map.go b/libgo/go/runtime/map.go index 6667fe7cfe5..b82977103c0 100644 --- a/libgo/go/runtime/map.go +++ b/libgo/go/runtime/map.go @@ -81,7 +81,7 @@ const ( bucketCnt = 1 << bucketCntBits // Maximum average load of a bucket that triggers growth is 6.5. - // Represent as loadFactorNum/loadFactDen, to allow integer math. + // Represent as loadFactorNum/loadFactorDen, to allow integer math. 
loadFactorNum = 13 loadFactorDen = 2 diff --git a/libgo/go/runtime/map_benchmark_test.go b/libgo/go/runtime/map_benchmark_test.go index bae1aa0dbd6..893cb6c5b6e 100644 --- a/libgo/go/runtime/map_benchmark_test.go +++ b/libgo/go/runtime/map_benchmark_test.go @@ -513,3 +513,22 @@ func BenchmarkMapInterfacePtr(b *testing.B) { BoolSink = m[key] } } + +var ( + hintLessThan8 = 7 + hintGreaterThan8 = 32 +) + +func BenchmarkNewEmptyMapHintLessThan8(b *testing.B) { + b.ReportAllocs() + for i := 0; i < b.N; i++ { + _ = make(map[int]int, hintLessThan8) + } +} + +func BenchmarkNewEmptyMapHintGreaterThan8(b *testing.B) { + b.ReportAllocs() + for i := 0; i < b.N; i++ { + _ = make(map[int]int, hintGreaterThan8) + } +} diff --git a/libgo/go/runtime/mbarrier.go b/libgo/go/runtime/mbarrier.go index a4f9b3c0350..836f85a0330 100644 --- a/libgo/go/runtime/mbarrier.go +++ b/libgo/go/runtime/mbarrier.go @@ -163,8 +163,8 @@ func typedmemmove(typ *_type, dst, src unsafe.Pointer) { if dst == src { return } - if typ.ptrdata != 0 { - bulkBarrierPreWrite(uintptr(dst), uintptr(src), typ.size) + if writeBarrier.needed && typ.ptrdata != 0 { + bulkBarrierPreWrite(uintptr(dst), uintptr(src), typ.ptrdata) } // There's a race here: if some other goroutine can write to // src, it may change some pointer in src after we've @@ -199,17 +199,18 @@ func reflectlite_typedmemmove(typ *_type, dst, src unsafe.Pointer) { // typedmemmovepartial is like typedmemmove but assumes that // dst and src point off bytes into the value and only copies size bytes. +// off must be a multiple of sys.PtrSize. //go:linkname reflect_typedmemmovepartial reflect.typedmemmovepartial func reflect_typedmemmovepartial(typ *_type, dst, src unsafe.Pointer, off, size uintptr) { - if writeBarrier.needed && typ.ptrdata != 0 && size >= sys.PtrSize { - // Pointer-align start address for bulk barrier. - adst, asrc, asize := dst, src, size - if frag := -off & (sys.PtrSize - 1); frag != 0 { - adst = add(dst, frag) - asrc = add(src, frag) - asize -= frag + if writeBarrier.needed && typ.ptrdata > off && size >= sys.PtrSize { + if off&(sys.PtrSize-1) != 0 { + panic("reflect: internal error: misaligned offset") } - bulkBarrierPreWrite(uintptr(adst), uintptr(asrc), asize&^(sys.PtrSize-1)) + pwsize := alignDown(size, sys.PtrSize) + if poff := typ.ptrdata - off; pwsize > poff { + pwsize = poff + } + bulkBarrierPreWrite(uintptr(dst), uintptr(src), pwsize) } memmove(dst, src, size) @@ -257,7 +258,8 @@ func typedslicecopy(typ *_type, dstPtr unsafe.Pointer, dstLen int, srcPtr unsafe // before calling typedslicecopy. size := uintptr(n) * typ.size if writeBarrier.needed { - bulkBarrierPreWrite(uintptr(dstPtr), uintptr(srcPtr), size) + pwsize := size - typ.size + typ.ptrdata + bulkBarrierPreWrite(uintptr(dstPtr), uintptr(srcPtr), pwsize) } // See typedmemmove for a discussion of the race between the // barrier and memmove. 
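The shortened barrier range in typedslicecopy relies on only the first typ.ptrdata bytes of each element being able to hold pointers, so the pointer-free tail of the final element can be skipped. A small worked example of the arithmetic (the element layout below is made up for illustration):

package main

import "fmt"

func main() {
	// Hypothetical element: 32 bytes total, pointers only in the first 8 bytes.
	const (
		elemSize    = 32 // typ.size
		elemPtrdata = 8  // typ.ptrdata
		n           = 4  // elements copied
	)
	size := uintptr(n) * elemSize
	// The barrier must cover every byte that may hold a pointer:
	// all of the first n-1 elements plus the pointer-bearing prefix of the last.
	pwsize := size - elemSize + elemPtrdata
	fmt.Println(size, pwsize) // 128 104
}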
@@ -304,8 +306,8 @@ func reflect_typedslicecopy(elemType *_type, dst, src slice) int { // //go:nosplit func typedmemclr(typ *_type, ptr unsafe.Pointer) { - if typ.ptrdata != 0 { - bulkBarrierPreWrite(uintptr(ptr), 0, typ.size) + if writeBarrier.needed && typ.ptrdata != 0 { + bulkBarrierPreWrite(uintptr(ptr), 0, typ.ptrdata) } memclrNoHeapPointers(ptr, typ.size) } @@ -317,7 +319,7 @@ func reflect_typedmemclr(typ *_type, ptr unsafe.Pointer) { //go:linkname reflect_typedmemclrpartial reflect.typedmemclrpartial func reflect_typedmemclrpartial(typ *_type, ptr unsafe.Pointer, off, size uintptr) { - if typ.ptrdata != 0 { + if writeBarrier.needed && typ.ptrdata != 0 { bulkBarrierPreWrite(uintptr(ptr), 0, size) } memclrNoHeapPointers(ptr, size) diff --git a/libgo/go/runtime/mbitmap.go b/libgo/go/runtime/mbitmap.go index be8e00c2025..7acd5d1e07a 100644 --- a/libgo/go/runtime/mbitmap.go +++ b/libgo/go/runtime/mbitmap.go @@ -885,58 +885,22 @@ func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) { } } -// oneBitCount is indexed by byte and produces the -// number of 1 bits in that byte. For example 128 has 1 bit set -// and oneBitCount[128] will holds 1. -var oneBitCount = [256]uint8{ - 0, 1, 1, 2, 1, 2, 2, 3, - 1, 2, 2, 3, 2, 3, 3, 4, - 1, 2, 2, 3, 2, 3, 3, 4, - 2, 3, 3, 4, 3, 4, 4, 5, - 1, 2, 2, 3, 2, 3, 3, 4, - 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, - 3, 4, 4, 5, 4, 5, 5, 6, - 1, 2, 2, 3, 2, 3, 3, 4, - 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, - 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, - 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, - 4, 5, 5, 6, 5, 6, 6, 7, - 1, 2, 2, 3, 2, 3, 3, 4, - 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, - 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, - 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, - 4, 5, 5, 6, 5, 6, 6, 7, - 2, 3, 3, 4, 3, 4, 4, 5, - 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, - 4, 5, 5, 6, 5, 6, 6, 7, - 3, 4, 4, 5, 4, 5, 5, 6, - 4, 5, 5, 6, 5, 6, 6, 7, - 4, 5, 5, 6, 5, 6, 6, 7, - 5, 6, 6, 7, 6, 7, 7, 8} - // countAlloc returns the number of objects allocated in span s by // scanning the allocation bitmap. -// TODO:(rlh) Use popcount intrinsic. func (s *mspan) countAlloc() int { count := 0 - maxIndex := s.nelems / 8 - for i := uintptr(0); i < maxIndex; i++ { - mrkBits := *s.gcmarkBits.bytep(i) - count += int(oneBitCount[mrkBits]) - } - if bitsInLastByte := s.nelems % 8; bitsInLastByte != 0 { - mrkBits := *s.gcmarkBits.bytep(maxIndex) - mask := uint8((1 << bitsInLastByte) - 1) - bits := mrkBits & mask - count += int(oneBitCount[bits]) + bytes := divRoundUp(s.nelems, 8) + // Iterate over each 8-byte chunk and count allocations + // with an intrinsic. Note that newMarkBits guarantees that + // gcmarkBits will be 8-byte aligned, so we don't have to + // worry about edge cases, irrelevant bits will simply be zero. + for i := uintptr(0); i < bytes; i += 8 { + // Extract 64 bits from the byte pointer and get a OnesCount. + // Note that the unsafe cast here doesn't preserve endianness, + // but that's OK. We only care about how many bits are 1, not + // about the order we discover them in. 
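The new countAlloc depends on the mark bitmap's backing storage being 8-byte aligned and zero padded, so it can read whole 64-bit words and use a popcount intrinsic instead of the lookup table. The same counting idea, written stand-alone with math/bits over a plain byte slice rather than the runtime's bitmap type:

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// countSet counts 1 bits in a mark bitmap whose storage is padded to a
// multiple of 8 bytes with zeroes, so reading full words is safe and the
// padding contributes nothing to the count. Byte order is irrelevant here
// because only the number of set bits matters.
func countSet(markBits []byte) int {
	count := 0
	for i := 0; i < len(markBits); i += 8 {
		w := binary.LittleEndian.Uint64(markBits[i : i+8])
		count += bits.OnesCount64(w)
	}
	return count
}

func main() {
	// 70 objects' worth of bits, padded to 16 bytes; bits 0, 3, and 65 set.
	b := make([]byte, 16)
	b[0] = 0b0000_1001
	b[8] = 0b0000_0010
	fmt.Println(countSet(b)) // 3
}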
+ mrkBits := *(*uint64)(unsafe.Pointer(s.gcmarkBits.bytep(i))) + count += sys.OnesCount64(mrkBits) } return count } diff --git a/libgo/go/runtime/mcache.go b/libgo/go/runtime/mcache.go index 27328e1e31e..ba526247217 100644 --- a/libgo/go/runtime/mcache.go +++ b/libgo/go/runtime/mcache.go @@ -123,7 +123,11 @@ func (c *mcache) refill(spc spanClass) { if s.sweepgen != mheap_.sweepgen+3 { throw("bad sweepgen in refill") } - atomic.Store(&s.sweepgen, mheap_.sweepgen) + if go115NewMCentralImpl { + mheap_.central[spc].mcentral.uncacheSpan(s) + } else { + atomic.Store(&s.sweepgen, mheap_.sweepgen) + } } // Get a new cached span from the central lists. diff --git a/libgo/go/runtime/mcentral.go b/libgo/go/runtime/mcentral.go index 78a3ae6ac19..ed49d86d0c6 100644 --- a/libgo/go/runtime/mcentral.go +++ b/libgo/go/runtime/mcentral.go @@ -20,8 +20,31 @@ import "runtime/internal/atomic" type mcentral struct { lock mutex spanclass spanClass - nonempty mSpanList // list of spans with a free object, ie a nonempty free list - empty mSpanList // list of spans with no free objects (or cached in an mcache) + + // For !go115NewMCentralImpl. + nonempty mSpanList // list of spans with a free object, ie a nonempty free list + empty mSpanList // list of spans with no free objects (or cached in an mcache) + + // partial and full contain two mspan sets: one of swept in-use + // spans, and one of unswept in-use spans. These two trade + // roles on each GC cycle. The unswept set is drained either by + // allocation or by the background sweeper in every GC cycle, + // so only two roles are necessary. + // + // sweepgen is increased by 2 on each GC cycle, so the swept + // spans are in partial[sweepgen/2%2] and the unswept spans are in + // partial[1-sweepgen/2%2]. Sweeping pops spans from the + // unswept set and pushes spans that are still in-use on the + // swept set. Likewise, allocating an in-use span pushes it + // on the swept set. + // + // Some parts of the sweeper can sweep arbitrary spans, and hence + // can't remove them from the unswept set, but will add the span + // to the appropriate swept list. As a result, the parts of the + // sweeper and mcentral that do consume from the unswept list may + // encounter swept spans, and these should be ignored. + partial [2]spanSet // list of spans with a free object + full [2]spanSet // list of spans with no free objects // nmalloc is the cumulative count of objects allocated from // this mcentral, assuming all spans in mcaches are @@ -32,12 +55,168 @@ type mcentral struct { // Initialize a single central free list. func (c *mcentral) init(spc spanClass) { c.spanclass = spc - c.nonempty.init() - c.empty.init() + if go115NewMCentralImpl { + lockInit(&c.partial[0].spineLock, lockRankSpanSetSpine) + lockInit(&c.partial[1].spineLock, lockRankSpanSetSpine) + lockInit(&c.full[0].spineLock, lockRankSpanSetSpine) + lockInit(&c.full[1].spineLock, lockRankSpanSetSpine) + } else { + c.nonempty.init() + c.empty.init() + lockInit(&c.lock, lockRankMcentral) + } +} + +// partialUnswept returns the spanSet which holds partially-filled +// unswept spans for this sweepgen. +func (c *mcentral) partialUnswept(sweepgen uint32) *spanSet { + return &c.partial[1-sweepgen/2%2] +} + +// partialSwept returns the spanSet which holds partially-filled +// swept spans for this sweepgen. +func (c *mcentral) partialSwept(sweepgen uint32) *spanSet { + return &c.partial[sweepgen/2%2] +} + +// fullUnswept returns the spanSet which holds unswept spans without any +// free slots for this sweepgen. 
+func (c *mcentral) fullUnswept(sweepgen uint32) *spanSet { + return &c.full[1-sweepgen/2%2] +} + +// fullSwept returns the spanSet which holds swept spans without any +// free slots for this sweepgen. +func (c *mcentral) fullSwept(sweepgen uint32) *spanSet { + return &c.full[sweepgen/2%2] } // Allocate a span to use in an mcache. func (c *mcentral) cacheSpan() *mspan { + if !go115NewMCentralImpl { + return c.oldCacheSpan() + } + // Deduct credit for this span allocation and sweep if necessary. + spanBytes := uintptr(class_to_allocnpages[c.spanclass.sizeclass()]) * _PageSize + deductSweepCredit(spanBytes, 0) + + sg := mheap_.sweepgen + + traceDone := false + if trace.enabled { + traceGCSweepStart() + } + + // If we sweep spanBudget spans without finding any free + // space, just allocate a fresh span. This limits the amount + // of time we can spend trying to find free space and + // amortizes the cost of small object sweeping over the + // benefit of having a full free span to allocate from. By + // setting this to 100, we limit the space overhead to 1%. + // + // TODO(austin,mknyszek): This still has bad worst-case + // throughput. For example, this could find just one free slot + // on the 100th swept span. That limits allocation latency, but + // still has very poor throughput. We could instead keep a + // running free-to-used budget and switch to fresh span + // allocation if the budget runs low. + spanBudget := 100 + + var s *mspan + + // Try partial swept spans first. + if s = c.partialSwept(sg).pop(); s != nil { + goto havespan + } + + // Now try partial unswept spans. + for ; spanBudget >= 0; spanBudget-- { + s = c.partialUnswept(sg).pop() + if s == nil { + break + } + if atomic.Load(&s.sweepgen) == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) { + // We got ownership of the span, so let's sweep it and use it. + s.sweep(true) + goto havespan + } + // We failed to get ownership of the span, which means it's being or + // has been swept by an asynchronous sweeper that just couldn't remove it + // from the unswept list. That sweeper took ownership of the span and + // responsibility for either freeing it to the heap or putting it on the + // right swept list. Either way, we should just ignore it (and it's unsafe + // for us to do anything else). + } + // Now try full unswept spans, sweeping them and putting them into the + // right list if we fail to get a span. + for ; spanBudget >= 0; spanBudget-- { + s = c.fullUnswept(sg).pop() + if s == nil { + break + } + if atomic.Load(&s.sweepgen) == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) { + // We got ownership of the span, so let's sweep it. + s.sweep(true) + // Check if there's any free space. + freeIndex := s.nextFreeIndex() + if freeIndex != s.nelems { + s.freeindex = freeIndex + goto havespan + } + // Add it to the swept list, because sweeping didn't give us any free space. + c.fullSwept(sg).push(s) + } + // See comment for partial unswept spans. + } + if trace.enabled { + traceGCSweepDone() + traceDone = true + } + + // We failed to get a span from the mcentral so get one from mheap. + s = c.grow() + if s == nil { + return nil + } + + // At this point s is a span that should have free slots. +havespan: + if trace.enabled && !traceDone { + traceGCSweepDone() + } + n := int(s.nelems) - int(s.allocCount) + if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems { + throw("span has no free objects") + } + // Assume all objects from this span will be allocated in the + // mcache. If it gets uncached, we'll adjust this. 
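The partialSwept/partialUnswept and fullSwept/fullUnswept accessors above swap roles each cycle because sweepgen advances by 2 per GC, and cacheSpan's search order (partial swept, then partial unswept, then full unswept) relies on that. The index arithmetic in isolation:

package main

import "fmt"

func main() {
	// sweepgen increases by 2 each GC cycle, so sweepgen/2%2 alternates
	// between 0 and 1: the set that held swept spans last cycle becomes
	// this cycle's unswept set, and vice versa.
	for sweepgen := uint32(2); sweepgen <= 8; sweepgen += 2 {
		swept := sweepgen / 2 % 2
		unswept := 1 - swept
		fmt.Printf("sweepgen=%d swept=partial[%d] unswept=partial[%d]\n",
			sweepgen, swept, unswept)
	}
}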
+ atomic.Xadd64(&c.nmalloc, int64(n)) + usedBytes := uintptr(s.allocCount) * s.elemsize + atomic.Xadd64(&memstats.heap_live, int64(spanBytes)-int64(usedBytes)) + if trace.enabled { + // heap_live changed. + traceHeapAlloc() + } + if gcBlackenEnabled != 0 { + // heap_live changed. + gcController.revise() + } + freeByteBase := s.freeindex &^ (64 - 1) + whichByte := freeByteBase / 8 + // Init alloc bits cache. + s.refillAllocCache(whichByte) + + // Adjust the allocCache so that s.freeindex corresponds to the low bit in + // s.allocCache. + s.allocCache >>= s.freeindex % 64 + + return s +} + +// Allocate a span to use in an mcache. +// +// For !go115NewMCentralImpl. +func (c *mcentral) oldCacheSpan() *mspan { // Deduct credit for this span allocation and sweep if necessary. spanBytes := uintptr(class_to_allocnpages[c.spanclass.sizeclass()]) * _PageSize deductSweepCredit(spanBytes, 0) @@ -147,7 +326,77 @@ havespan: } // Return span from an mcache. +// +// s must have a span class corresponding to this +// mcentral and it must not be empty. func (c *mcentral) uncacheSpan(s *mspan) { + if !go115NewMCentralImpl { + c.oldUncacheSpan(s) + return + } + if s.allocCount == 0 { + throw("uncaching span but s.allocCount == 0") + } + + sg := mheap_.sweepgen + stale := s.sweepgen == sg+1 + + // Fix up sweepgen. + if stale { + // Span was cached before sweep began. It's our + // responsibility to sweep it. + // + // Set sweepgen to indicate it's not cached but needs + // sweeping and can't be allocated from. sweep will + // set s.sweepgen to indicate s is swept. + atomic.Store(&s.sweepgen, sg-1) + } else { + // Indicate that s is no longer cached. + atomic.Store(&s.sweepgen, sg) + } + n := int(s.nelems) - int(s.allocCount) + + // Fix up statistics. + if n > 0 { + // cacheSpan updated alloc assuming all objects on s + // were going to be allocated. Adjust for any that + // weren't. We must do this before potentially + // sweeping the span. + atomic.Xadd64(&c.nmalloc, -int64(n)) + + if !stale { + // (*mcentral).cacheSpan conservatively counted + // unallocated slots in heap_live. Undo this. + // + // If this span was cached before sweep, then + // heap_live was totally recomputed since + // caching this span, so we don't do this for + // stale spans. + atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize)) + } + } + + // Put the span in the appropriate place. + if stale { + // It's stale, so just sweep it. Sweeping will put it on + // the right list. + s.sweep(false) + } else { + if n > 0 { + // Put it back on the partial swept list. + c.partialSwept(sg).push(s) + } else { + // There's no free space and it's not stale, so put it on the + // full swept list. + c.fullSwept(sg).push(s) + } + } +} + +// Return span from an mcache. +// +// For !go115NewMCentralImpl. +func (c *mcentral) oldUncacheSpan(s *mspan) { if s.allocCount == 0 { throw("uncaching span but s.allocCount == 0") } @@ -206,6 +455,8 @@ func (c *mcentral) uncacheSpan(s *mspan) { // freeSpan reports whether s was returned to the heap. // If preserve=true, it does not move s (the caller // must take care of it). +// +// For !go115NewMCentralImpl. 
func (c *mcentral) freeSpan(s *mspan, preserve bool, wasempty bool) bool { if sg := mheap_.sweepgen; s.sweepgen == sg+1 || s.sweepgen == sg+3 { throw("freeSpan given cached span") diff --git a/libgo/go/runtime/mgc.go b/libgo/go/runtime/mgc.go index 24043cfb2e2..9dd7bff8a80 100644 --- a/libgo/go/runtime/mgc.go +++ b/libgo/go/runtime/mgc.go @@ -191,6 +191,9 @@ func gcinit() { work.startSema = 1 work.markDoneSema = 1 + lockInit(&work.sweepWaiters.lock, lockRankSweepWaiters) + lockInit(&work.assistQueue.lock, lockRankAssistQueue) + lockInit(&work.wbufSpans.lock, lockRankWbufSpans) } func readgogc() int32 { @@ -235,8 +238,6 @@ func setGCPercent(in int32) (out int32) { gcSetTriggerRatio(memstats.triggerRatio) unlock(&mheap_.lock) }) - // Pacing changed, so the scavenger should be awoken. - wakeScavenger() // If we just disabled GC, wait for any concurrent GC mark to // finish so we always return with no GC running. @@ -1279,6 +1280,7 @@ func gcStart(trigger gcTrigger) { } // Ok, we're doing it! Stop everybody else + semacquire(&gcsema) semacquire(&worldsema) if trace.enabled { @@ -1318,6 +1320,7 @@ func gcStart(trigger gcTrigger) { systemstack(func() { finishsweep_m() }) + // clearpools before we start the GC. If we wait they memory will not be // reclaimed until the next GC cycle. clearpools() @@ -1371,15 +1374,26 @@ func gcStart(trigger gcTrigger) { // the world. gcController.markStartTime = now + // In STW mode, we could block the instant systemstack + // returns, so make sure we're not preemptible. + mp = acquirem() + // Concurrent mark. systemstack(func() { now = startTheWorldWithSema(trace.enabled) work.pauseNS += now - work.pauseStart work.tMark = now }) - // In STW mode, we could block the instant systemstack - // returns, so don't do anything important here. Make sure we - // block rather than returning to user code. + + // Release the world sema before Gosched() in STW mode + // because we will need to reacquire it later but before + // this goroutine becomes runnable again, and we could + // self-deadlock otherwise. + semrelease(&worldsema) + releasem(mp) + + // Make sure we block instead of returning to user code + // in STW mode. if mode != gcBackgroundMode { Gosched() } @@ -1446,6 +1460,10 @@ top: return } + // forEachP needs worldsema to execute, and we'll need it to + // stop the world later, so acquire worldsema now. + semacquire(&worldsema) + // Flush all local buffers and collect flushedWork flags. gcMarkDoneFlushed = 0 systemstack(func() { @@ -1506,6 +1524,7 @@ top: // work to do. Keep going. It's possible the // transition condition became true again during the // ragged barrier, so re-check it. + semrelease(&worldsema) goto top } @@ -1582,6 +1601,7 @@ top: now := startTheWorldWithSema(true) work.pauseNS += now - work.pauseStart }) + semrelease(&worldsema) goto top } } @@ -1687,9 +1707,6 @@ func gcMarkTermination(nextTriggerRatio float64) { // Update GC trigger and pacing for the next cycle. gcSetTriggerRatio(nextTriggerRatio) - // Pacing changed, so the scavenger should be awoken. - wakeScavenger() - // Update timing memstats now := nanotime() sec, nsec, _ := time_now() @@ -1796,6 +1813,7 @@ func gcMarkTermination(nextTriggerRatio float64) { } semrelease(&worldsema) + semrelease(&gcsema) // Careful: another GC cycle may start now. releasem(mp) @@ -2121,6 +2139,9 @@ func gcMark(start_time int64) { // gcSweep must be called on the system stack because it acquires the heap // lock. See mheap for details. +// +// The world must be stopped. 
+// //go:systemstack func gcSweep(mode gcMode) { if gcphase != _GCoff { @@ -2130,7 +2151,7 @@ func gcSweep(mode gcMode) { lock(&mheap_.lock) mheap_.sweepgen += 2 mheap_.sweepdone = 0 - if mheap_.sweepSpans[mheap_.sweepgen/2%2].index != 0 { + if !go115NewMCentralImpl && mheap_.sweepSpans[mheap_.sweepgen/2%2].index != 0 { // We should have drained this list during the last // sweep phase. We certainly need to start this phase // with an empty swept list. @@ -2142,6 +2163,10 @@ func gcSweep(mode gcMode) { mheap_.reclaimCredit = 0 unlock(&mheap_.lock) + if go115NewMCentralImpl { + sweep.centralIndex.clear() + } + if !_ConcurrentSweep || mode == gcForceBlockMode { // Special case synchronous sweep. // Record that no proportional sweeping has to happen. diff --git a/libgo/go/runtime/mgcmark.go b/libgo/go/runtime/mgcmark.go index a5af5d2446c..4e4d1314060 100644 --- a/libgo/go/runtime/mgcmark.go +++ b/libgo/go/runtime/mgcmark.go @@ -21,10 +21,6 @@ const ( // BSS root. rootBlockBytes = 256 << 10 - // rootBlockSpans is the number of spans to scan per span - // root. - rootBlockSpans = 8 * 1024 // 64MB worth of spans - // maxObletBytes is the maximum bytes of an object to scan at // once. Larger objects will be split up into "oblets" of at // most this size. Since we can scan 1–2 MB/ms, 128 KB bounds @@ -41,14 +37,26 @@ const ( // a syscall, so its overhead is nontrivial). Higher values // make the system less responsive to incoming work. drainCheckThreshold = 100000 + + // pagesPerSpanRoot indicates how many pages to scan from a span root + // at a time. Used by special root marking. + // + // Higher values improve throughput by increasing locality, but + // increase the minimum latency of a marking operation. + // + // Must be a multiple of the pageInUse bitmap element size and + // must also evenly divide pagesPerArena. + pagesPerSpanRoot = 512 + + // go115NewMarkrootSpans is a feature flag that indicates whether + // to use the new bitmap-based markrootSpans implementation. + go115NewMarkrootSpans = true ) // gcMarkRootPrepare queues root scanning jobs (stacks, globals, and // some miscellany) and initializes scanning-related state. // // The world must be stopped. -// -//go:nowritebarrier func gcMarkRootPrepare() { work.nFlushCacheRoots = 0 @@ -65,13 +73,24 @@ func gcMarkRootPrepare() { // // We depend on addfinalizer to mark objects that get // finalizers after root marking. - // - // We're only interested in scanning the in-use spans, - // which will all be swept at this point. More spans - // may be added to this list during concurrent GC, but - // we only care about spans that were allocated before - // this mark phase. - work.nSpanRoots = mheap_.sweepSpans[mheap_.sweepgen/2%2].numBlocks() + if go115NewMarkrootSpans { + // We're going to scan the whole heap (that was available at the time the + // mark phase started, i.e. markArenas) for in-use spans which have specials. + // + // Break up the work into arenas, and further into chunks. + // + // Snapshot allArenas as markArenas. This snapshot is safe because allArenas + // is append-only. + mheap_.markArenas = mheap_.allArenas[:len(mheap_.allArenas):len(mheap_.allArenas)] + work.nSpanRoots = len(mheap_.markArenas) * (pagesPerArena / pagesPerSpanRoot) + } else { + // We're only interested in scanning the in-use spans, + // which will all be swept at this point. More spans + // may be added to this list during concurrent GC, but + // we only care about spans that were allocated before + // this mark phase. 
+ work.nSpanRoots = mheap_.sweepSpans[mheap_.sweepgen/2%2].numBlocks() + } // Scan stacks. // @@ -229,10 +248,96 @@ func markrootBlock(roots *gcRootList, gcw *gcWork) { } } -// markrootSpans marks roots for one shard of work.spans. +// markrootSpans marks roots for one shard of markArenas. // //go:nowritebarrier func markrootSpans(gcw *gcWork, shard int) { + if !go115NewMarkrootSpans { + oldMarkrootSpans(gcw, shard) + return + } + // Objects with finalizers have two GC-related invariants: + // + // 1) Everything reachable from the object must be marked. + // This ensures that when we pass the object to its finalizer, + // everything the finalizer can reach will be retained. + // + // 2) Finalizer specials (which are not in the garbage + // collected heap) are roots. In practice, this means the fn + // field must be scanned. + sg := mheap_.sweepgen + + // Find the arena and page index into that arena for this shard. + ai := mheap_.markArenas[shard/(pagesPerArena/pagesPerSpanRoot)] + ha := mheap_.arenas[ai.l1()][ai.l2()] + arenaPage := uint(uintptr(shard) * pagesPerSpanRoot % pagesPerArena) + + // Construct slice of bitmap which we'll iterate over. + specialsbits := ha.pageSpecials[arenaPage/8:] + specialsbits = specialsbits[:pagesPerSpanRoot/8] + for i := range specialsbits { + // Find set bits, which correspond to spans with specials. + specials := atomic.Load8(&specialsbits[i]) + if specials == 0 { + continue + } + for j := uint(0); j < 8; j++ { + if specials&(1<<j) == 0 { + continue + } + // Find the span for this bit. + // + // This value is guaranteed to be non-nil because having + // specials implies that the span is in-use, and since we're + // currently marking we can be sure that we don't have to worry + // about the span being freed and re-used. + s := ha.spans[arenaPage+uint(i)*8+j] + + // The state must be mSpanInUse if the specials bit is set, so + // sanity check that. + if state := s.state.get(); state != mSpanInUse { + print("s.state = ", state, "\n") + throw("non in-use span found with specials bit set") + } + // Check that this span was swept (it may be cached or uncached). + if !useCheckmark && !(s.sweepgen == sg || s.sweepgen == sg+3) { + // sweepgen was updated (+2) during non-checkmark GC pass + print("sweep ", s.sweepgen, " ", sg, "\n") + throw("gc: unswept span") + } + + // Lock the specials to prevent a special from being + // removed from the list while we're traversing it. + lock(&s.speciallock) + for sp := s.specials; sp != nil; sp = sp.next { + if sp.kind != _KindSpecialFinalizer { + continue + } + // don't mark finalized object, but scan it so we + // retain everything it points to. + spf := (*specialfinalizer)(unsafe.Pointer(sp)) + // A finalizer can be set for an inner byte of an object, find object beginning. + p := s.base() + uintptr(spf.special.offset)/s.elemsize*s.elemsize + + // Mark everything that can be reached from + // the object (but *not* the object itself or + // we'll never collect it). + scanobject(p, gcw) + + // The special itself is a root. + scanblock(uintptr(unsafe.Pointer(&spf.fn)), sys.PtrSize, &oneptrmask[0], gcw) + } + unlock(&s.speciallock) + } + } +} + +// oldMarkrootSpans marks roots for one shard of work.spans. +// +// For go115NewMarkrootSpans = false. +// +//go:nowritebarrier +func oldMarkrootSpans(gcw *gcWork, shard int) { // Objects with finalizers have two GC-related invariants: // // 1) Everything reachable from the object must be marked. 
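Each span-root shard in the new markrootSpans covers pagesPerSpanRoot pages of one arena, and the shard index is decomposed into an arena index plus a page offset within that arena. A worked example of the decomposition, assuming the 64 MiB arena / 8 KiB page layout common on 64-bit Linux (other platforms use different arena sizes):

package main

import "fmt"

func main() {
	const (
		pagesPerArena    = 8192 // 64 MiB arena / 8 KiB pages (platform dependent)
		pagesPerSpanRoot = 512  // pages scanned per span-root shard, from the patch
	)
	shardsPerArena := pagesPerArena / pagesPerSpanRoot // 16 shards per arena

	// Decompose a shard number the same way markrootSpans does.
	for _, shard := range []int{0, 1, 16, 37} {
		arena := shard / shardsPerArena
		arenaPage := shard * pagesPerSpanRoot % pagesPerArena
		fmt.Printf("shard %2d -> markArenas[%d], pages [%d, %d)\n",
			shard, arena, arenaPage, arenaPage+pagesPerSpanRoot)
	}
}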
@@ -767,6 +872,8 @@ const ( // credit to gcController.bgScanCredit every gcCreditSlack units of // scan work. // +// gcDrain will always return if there is a pending STW. +// //go:nowritebarrier func gcDrain(gcw *gcWork, flags gcDrainFlags) { if !writeBarrier.needed { @@ -795,7 +902,8 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) { // Drain root marking jobs. if work.markrootNext < work.markrootJobs { - for !(preemptible && gp.preempt) { + // Stop if we're preemptible or if someone wants to STW. + for !(gp.preempt && (preemptible || atomic.Load(&sched.gcwaiting) != 0)) { job := atomic.Xadd(&work.markrootNext, +1) - 1 if job >= work.markrootJobs { break @@ -808,7 +916,8 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) { } // Drain heap marking jobs. - for !(preemptible && gp.preempt) { + // Stop if we're preemptible or if someone wants to STW. + for !(gp.preempt && (preemptible || atomic.Load(&sched.gcwaiting) != 0)) { // Try to keep work available on the global queue. We used to // check if there were waiting workers, but it's better to // just keep work available than to make workers wait. In the @@ -1301,11 +1410,21 @@ func gcDumpObject(label string, obj, off uintptr) { // //go:nowritebarrier //go:nosplit -func gcmarknewobject(obj, size, scanSize uintptr) { +func gcmarknewobject(span *mspan, obj, size, scanSize uintptr) { if useCheckmark { // The world should be stopped so this should not happen. throw("gcmarknewobject called while doing checkmark") } - markBitsForAddr(obj).setMarked() + + // Mark object. + objIndex := span.objIndex(obj) + span.markBitsForIndex(objIndex).setMarked() + + // Mark span. + arena, pageIdx, pageMask := pageIndexOf(span.base()) + if arena.pageMarks[pageIdx]&pageMask == 0 { + atomic.Or8(&arena.pageMarks[pageIdx], pageMask) + } + gcw := &getg().m.p.ptr().gcw gcw.bytesMarked += uint64(size) gcw.scanWork += int64(scanSize) diff --git a/libgo/go/runtime/mgcscavenge.go b/libgo/go/runtime/mgcscavenge.go index d4b527c0713..326e6ee079e 100644 --- a/libgo/go/runtime/mgcscavenge.go +++ b/libgo/go/runtime/mgcscavenge.go @@ -91,6 +91,11 @@ const ( // This ratio is used as part of multiplicative factor to help the scavenger account // for the additional costs of using scavenged memory in its pacing. scavengeCostRatio = 0.7 * sys.GoosDarwin + + // scavengeReservationShards determines the amount of memory the scavenger + // should reserve for scavenging at a time. Specifically, the amount of + // memory reserved is (heap size in bytes) / scavengeReservationShards. + scavengeReservationShards = 64 ) // heapRetained returns an estimate of the current heap RSS. @@ -150,24 +155,39 @@ func gcPaceScavenger() { return } mheap_.scavengeGoal = retainedGoal - mheap_.pages.resetScavengeAddr() } // Sleep/wait state of the background scavenger. var scavenge struct { - lock mutex - g *g - parked bool - timer *timer + lock mutex + g *g + parked bool + timer *timer + sysmonWake uint32 // Set atomically. } -// wakeScavenger unparks the scavenger if necessary. It must be called -// after any pacing update. +// readyForScavenger signals sysmon to wake the scavenger because +// there may be new work to do. +// +// There may be a significant delay between when this function runs +// and when the scavenger is kicked awake, but it may be safely invoked +// in contexts where wakeScavenger is unsafe to call directly. +func readyForScavenger() { + atomic.Store(&scavenge.sysmonWake, 1) +} + +// wakeScavenger immediately unparks the scavenger if necessary. 
+// +// May run without a P, but it may allocate, so it must not be called +// on any allocation path. // -// mheap_.lock and scavenge.lock must not be held. +// mheap_.lock, scavenge.lock, and sched.lock must not be held. func wakeScavenger() { lock(&scavenge.lock) if scavenge.parked { + // Notify sysmon that it shouldn't bother waking up the scavenger. + atomic.Store(&scavenge.sysmonWake, 0) + // Try to stop the timer but we don't really care if we succeed. // It's possible that either a timer was never started, or that // we're racing with it. @@ -183,9 +203,16 @@ func wakeScavenger() { // scavenger at a "lower priority" but that's OK because it'll // catch up on the work it missed when it does get scheduled. scavenge.parked = false - systemstack(func() { - ready(scavenge.g, 0, false) - }) + + // Ready the goroutine by injecting it. We use injectglist instead + // of ready or goready in order to allow us to run this function + // without a P. injectglist also avoids placing the goroutine in + // the current P's runnext slot, which is desireable to prevent + // the scavenger from interfering with user goroutine scheduling + // too much. + var list gList + list.push(scavenge.g) + injectglist(&list) } unlock(&scavenge.lock) } @@ -227,6 +254,7 @@ func bgscavenge(c chan int) { scavenge.g = getg() + lockInit(&scavenge.lock, lockRankScavenge) lock(&scavenge.lock) scavenge.parked = true @@ -272,13 +300,14 @@ func bgscavenge(c chan int) { unlock(&mheap_.lock) return } - unlock(&mheap_.lock) // Scavenge one page, and measure the amount of time spent scavenging. start := nanotime() - released = mheap_.pages.scavengeOne(physPageSize, false) - atomic.Xadduintptr(&mheap_.pages.scavReleased, released) + released = mheap_.pages.scavenge(physPageSize, true) + mheap_.pages.scav.released += released crit = float64(nanotime() - start) + + unlock(&mheap_.lock) }) if released == 0 { @@ -358,28 +387,36 @@ func bgscavenge(c chan int) { // scavenge scavenges nbytes worth of free pages, starting with the // highest address first. Successive calls continue from where it left -// off until the heap is exhausted. Call resetScavengeAddr to bring it +// off until the heap is exhausted. Call scavengeStartGen to bring it // back to the top of the heap. // // Returns the amount of memory scavenged in bytes. // -// If locked == false, s.mheapLock must not be locked. If locked == true, -// s.mheapLock must be locked. +// s.mheapLock must be held, but may be temporarily released if +// mayUnlock == true. // -// Must run on the system stack because scavengeOne must run on the -// system stack. +// Must run on the system stack because s.mheapLock must be held. // //go:systemstack -func (s *pageAlloc) scavenge(nbytes uintptr, locked bool) uintptr { +func (s *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr { + var ( + addrs addrRange + gen uint32 + ) released := uintptr(0) for released < nbytes { - r := s.scavengeOne(nbytes-released, locked) - if r == 0 { - // Nothing left to scavenge! Give up. - break + if addrs.size() == 0 { + if addrs, gen = s.scavengeReserve(); addrs.size() == 0 { + break + } } + r, a := s.scavengeOne(addrs, nbytes-released, mayUnlock) released += r + addrs = a } + // Only unreserve the space which hasn't been scavenged or searched + // to ensure we always make progress. 
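The reserve/scavenge/unreserve loop above works through address ranges handed out by scavengeReserve; each reservation is sized at the start of a generation to roughly the in-use heap divided by scavengeReservationShards, rounded up to whole palloc chunks. For a feel of the numbers, assuming 4 MiB palloc chunks (512 pages of 8 KiB) and the shard count of 64 from this patch:

package main

import "fmt"

func alignUp(n, a uintptr) uintptr { return (n + a - 1) &^ (a - 1) }

func main() {
	const (
		pallocChunkBytes          = uintptr(512 * 8 << 10) // 4 MiB, assumed layout
		scavengeReservationShards = 64
	)
	for _, inUse := range []uintptr{1 << 20, 256 << 20, 1 << 30} {
		r := alignUp(inUse, pallocChunkBytes) / scavengeReservationShards
		fmt.Printf("in-use %7d KiB -> reservation %6d KiB per shard\n", inUse>>10, r>>10)
	}
}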
+ s.scavengeUnreserve(addrs, gen) return released } @@ -388,9 +425,9 @@ func (s *pageAlloc) scavenge(nbytes uintptr, locked bool) uintptr { // released should be the amount of memory released since the last time this // was called, and forced indicates whether the scavenge was forced by the // application. -func printScavTrace(released uintptr, forced bool) { +func printScavTrace(gen uint32, released uintptr, forced bool) { printlock() - print("scav ", + print("scav ", gen, " ", released>>10, " KiB work, ", atomic.Load64(&memstats.heap_released)>>10, " KiB total, ", (atomic.Load64(&memstats.heap_inuse)*100)/heapRetained(), "% util", @@ -402,39 +439,131 @@ func printScavTrace(released uintptr, forced bool) { printunlock() } -// resetScavengeAddr sets the scavenge start address to the top of the heap's -// address space. This should be called each time the scavenger's pacing -// changes. +// scavengeStartGen starts a new scavenge generation, resetting +// the scavenger's search space to the full in-use address space. // // s.mheapLock must be held. -func (s *pageAlloc) resetScavengeAddr() { - released := atomic.Loaduintptr(&s.scavReleased) +// +// Must run on the system stack because s.mheapLock must be held. +// +//go:systemstack +func (s *pageAlloc) scavengeStartGen() { if debug.scavtrace > 0 { - printScavTrace(released, false) + printScavTrace(s.scav.gen, s.scav.released, false) + } + s.inUse.cloneInto(&s.scav.inUse) + + // Pick the new starting address for the scavenger cycle. + var startAddr offAddr + if s.scav.scavLWM.lessThan(s.scav.freeHWM) { + // The "free" high watermark exceeds the "scavenged" low watermark, + // so there are free scavengable pages in parts of the address space + // that the scavenger already searched, the high watermark being the + // highest one. Pick that as our new starting point to ensure we + // see those pages. + startAddr = s.scav.freeHWM + } else { + // The "free" high watermark does not exceed the "scavenged" low + // watermark. This means the allocator didn't free any memory in + // the range we scavenged last cycle, so we might as well continue + // scavenging from where we were. + startAddr = s.scav.scavLWM } - // Subtract from scavReleased instead of just setting it to zero because - // the scavenger could have increased scavReleased concurrently with the - // load above, and we may miss an update by just blindly zeroing the field. - atomic.Xadduintptr(&s.scavReleased, -released) - s.scavAddr = chunkBase(s.end) - 1 + s.scav.inUse.removeGreaterEqual(startAddr.addr()) + + // reservationBytes may be zero if s.inUse.totalBytes is small, or if + // scavengeReservationShards is large. This case is fine as the scavenger + // will simply be turned off, but it does mean that scavengeReservationShards, + // in concert with pallocChunkBytes, dictates the minimum heap size at which + // the scavenger triggers. In practice this minimum is generally less than an + // arena in size, so virtually every heap has the scavenger on. + s.scav.reservationBytes = alignUp(s.inUse.totalBytes, pallocChunkBytes) / scavengeReservationShards + s.scav.gen++ + s.scav.released = 0 + s.scav.freeHWM = minOffAddr + s.scav.scavLWM = maxOffAddr } -// scavengeOne starts from s.scavAddr and walks down the heap until it finds -// a contiguous run of pages to scavenge. It will try to scavenge at most -// max bytes at once, but may scavenge more to avoid breaking huge pages. Once -// it scavenges some memory it returns how much it scavenged and updates s.scavAddr -// appropriately. 
s.scavAddr must be reset manually and externally. +// scavengeReserve reserves a contiguous range of the address space +// for scavenging. The maximum amount of space it reserves is proportional +// to the size of the heap. The ranges are reserved from the high addresses +// first. +// +// Returns the reserved range and the scavenge generation number for it. +// +// s.mheapLock must be held. +// +// Must run on the system stack because s.mheapLock must be held. // -// Should it exhaust the heap, it will return 0 and set s.scavAddr to minScavAddr. +//go:systemstack +func (s *pageAlloc) scavengeReserve() (addrRange, uint32) { + // Start by reserving the minimum. + r := s.scav.inUse.removeLast(s.scav.reservationBytes) + + // Return early if the size is zero; we don't want to use + // the bogus address below. + if r.size() == 0 { + return r, s.scav.gen + } + + // The scavenger requires that base be aligned to a + // palloc chunk because that's the unit of operation for + // the scavenger, so align down, potentially extending + // the range. + newBase := alignDown(r.base.addr(), pallocChunkBytes) + + // Remove from inUse however much extra we just pulled out. + s.scav.inUse.removeGreaterEqual(newBase) + r.base = offAddr{newBase} + return r, s.scav.gen +} + +// scavengeUnreserve returns an unscavenged portion of a range that was +// previously reserved with scavengeReserve. // -// If locked == false, s.mheapLock must not be locked. -// If locked == true, s.mheapLock must be locked. +// s.mheapLock must be held. // -// Must be run on the system stack because it either acquires the heap lock -// or executes with the heap lock acquired. +// Must run on the system stack because s.mheapLock must be held. // //go:systemstack -func (s *pageAlloc) scavengeOne(max uintptr, locked bool) uintptr { +func (s *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) { + if r.size() == 0 || gen != s.scav.gen { + return + } + if r.base.addr()%pallocChunkBytes != 0 { + throw("unreserving unaligned region") + } + s.scav.inUse.add(r) +} + +// scavengeOne walks over address range work until it finds +// a contiguous run of pages to scavenge. It will try to scavenge +// at most max bytes at once, but may scavenge more to avoid +// breaking huge pages. Once it scavenges some memory it returns +// how much it scavenged in bytes. +// +// Returns the number of bytes scavenged and the part of work +// which was not yet searched. +// +// work's base address must be aligned to pallocChunkBytes. +// +// s.mheapLock must be held, but may be temporarily released if +// mayUnlock == true. +// +// Must run on the system stack because s.mheapLock must be held. +// +//go:systemstack +func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (uintptr, addrRange) { + // Defensively check if we've recieved an empty address range. + // If so, just return. + if work.size() == 0 { + // Nothing to do. + return 0, work + } + // Check the prerequisites of work. + if work.base.addr()%pallocChunkBytes != 0 { + throw("scavengeOne called with unaligned work region") + } // Calculate the maximum number of pages to scavenge. // // This should be alignUp(max, pageSize) / pageSize but max can and will @@ -456,84 +585,49 @@ func (s *pageAlloc) scavengeOne(max uintptr, locked bool) uintptr { minPages = 1 } - // Helpers for locking and unlocking only if locked == false. + // Helpers for locking and unlocking only if mayUnlock == true. 
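The mayUnlock helpers defined just below let the same function body run whether or not it is allowed to drop the heap lock around its optimistic, lock-free search. The pattern in isolation, with sync.Mutex standing in for the runtime's mutex:

package main

import (
	"fmt"
	"sync"
)

// searchThenCommit runs with mu already held by the caller. If mayUnlock is
// true it may temporarily release mu around the slow search, matching the
// lockHeap/unlockHeap closure pattern used by scavengeOne.
func searchThenCommit(mu *sync.Mutex, mayUnlock bool) {
	unlock := func() {
		if mayUnlock {
			mu.Unlock()
		}
	}
	lock := func() {
		if mayUnlock {
			mu.Lock()
		}
	}

	unlock()
	fmt.Println("searching optimistically, without the lock if allowed")
	lock()
	fmt.Println("verifying and updating state under the lock")
}

func main() {
	var mu sync.Mutex
	mu.Lock()
	searchThenCommit(&mu, true)
	mu.Unlock()
}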
lockHeap := func() { - if !locked { + if mayUnlock { lock(s.mheapLock) } } unlockHeap := func() { - if !locked { + if mayUnlock { unlock(s.mheapLock) } } - lockHeap() - ci := chunkIndex(s.scavAddr) - if ci < s.start { - unlockHeap() - return 0 - } - - // Check the chunk containing the scav addr, starting at the addr - // and see if there are any free and unscavenged pages. + // Fast path: check the chunk containing the top-most address in work, + // starting at that address's page index in the chunk. // - // Only check this if s.scavAddr is covered by any address range - // in s.inUse, so that we know our check of the summary is safe. - if s.inUse.contains(s.scavAddr) && s.summary[len(s.summary)-1][ci].max() >= uint(minPages) { + // Note that work.end() is exclusive, so get the chunk we care about + // by subtracting 1. + maxAddr := work.limit.addr() - 1 + maxChunk := chunkIndex(maxAddr) + if s.summary[len(s.summary)-1][maxChunk].max() >= uint(minPages) { // We only bother looking for a candidate if there at least - // minPages free pages at all. It's important that we only - // continue if the summary says we can because that's how - // we can tell if parts of the address space are unused. - // See the comment on s.chunks in mpagealloc.go. - base, npages := s.chunkOf(ci).findScavengeCandidate(chunkPageIndex(s.scavAddr), minPages, maxPages) + // minPages free pages at all. + base, npages := s.chunkOf(maxChunk).findScavengeCandidate(chunkPageIndex(maxAddr), minPages, maxPages) // If we found something, scavenge it and return! if npages != 0 { - s.scavengeRangeLocked(ci, base, npages) - unlockHeap() - return uintptr(npages) * pageSize + work.limit = offAddr{s.scavengeRangeLocked(maxChunk, base, npages)} + return uintptr(npages) * pageSize, work } } + // Update the limit to reflect the fact that we checked maxChunk already. + work.limit = offAddr{chunkBase(maxChunk)} - // getInUseRange returns the highest range in the - // intersection of [0, addr] and s.inUse. + // findCandidate finds the next scavenge candidate in work optimistically. // - // s.mheapLock must be held. - getInUseRange := func(addr uintptr) addrRange { - top := s.inUse.findSucc(addr) - if top == 0 { - return addrRange{} - } - r := s.inUse.ranges[top-1] - // addr is inclusive, so treat it as such when - // updating the limit, which is exclusive. - if r.limit > addr+1 { - r.limit = addr + 1 - } - return r - } - - // Slow path: iterate optimistically over the in-use address space - // looking for any free and unscavenged page. If we think we see something, - // lock and verify it! + // Returns the candidate chunk index and true on success, and false on failure. // - // We iterate over the address space by taking ranges from inUse. -newRange: - for { - r := getInUseRange(s.scavAddr) - if r.size() == 0 { - break - } - unlockHeap() - - // Iterate over all of the chunks described by r. - // Note that r.limit is the exclusive upper bound, but what - // we want is the top chunk instead, inclusive, so subtract 1. - bot, top := chunkIndex(r.base), chunkIndex(r.limit-1) - for i := top; i >= bot; i-- { + // The heap need not be locked. + findCandidate := func(work addrRange) (chunkIdx, bool) { + // Iterate over this work's chunks. + for i := chunkIndex(work.limit.addr() - 1); i >= chunkIndex(work.base.addr()); i-- { // If this chunk is totally in-use or has no unscavenged pages, don't bother - // doing a more sophisticated check. + // doing a more sophisticated check. 
// // Note we're accessing the summary and the chunks without a lock, but // that's fine. We're being optimistic anyway. @@ -550,70 +644,77 @@ newRange: // see a nil pointer in this case if we do race with heap growth, but // just defensively ignore the nils. This operation is optimistic anyway. l2 := (*[1 << pallocChunksL2Bits]pallocData)(atomic.Loadp(unsafe.Pointer(&s.chunks[i.l1()]))) - if l2 == nil || !l2[i.l2()].hasScavengeCandidate(minPages) { - continue + if l2 != nil && l2[i.l2()].hasScavengeCandidate(minPages) { + return i, true } + } + return 0, false + } - // We found a candidate, so let's lock and verify it. - lockHeap() - - // Find, verify, and scavenge if we can. - chunk := s.chunkOf(i) - base, npages := chunk.findScavengeCandidate(pallocChunkPages-1, minPages, maxPages) - if npages > 0 { - // We found memory to scavenge! Mark the bits and report that up. - // scavengeRangeLocked will update scavAddr for us, also. - s.scavengeRangeLocked(i, base, npages) - unlockHeap() - return uintptr(npages) * pageSize - } + // Slow path: iterate optimistically over the in-use address space + // looking for any free and unscavenged page. If we think we see something, + // lock and verify it! + for work.size() != 0 { + unlockHeap() - // We were fooled, let's take this opportunity to move the scavAddr - // all the way down to where we searched as scavenged for future calls - // and keep iterating. Then, go get a new range. - s.scavAddr = chunkBase(i-1) + pallocChunkPages*pageSize - 1 - continue newRange - } + // Search for the candidate. + candidateChunkIdx, ok := findCandidate(work) + + // Lock the heap. We need to do this now if we found a candidate or not. + // If we did, we'll verify it. If not, we need to lock before returning + // anyway. lockHeap() - // Move the scavenger down the heap, past everything we just searched. - // Since we don't check if scavAddr moved while twe let go of the heap lock, - // it's possible that it moved down and we're moving it up here. This - // raciness could result in us searching parts of the heap unnecessarily. - // TODO(mknyszek): Remove this racy behavior through explicit address - // space reservations, which are difficult to do with just scavAddr. - s.scavAddr = r.base - 1 - } - // We reached the end of the in-use address space and couldn't find anything, - // so signal that there's nothing left to scavenge. - s.scavAddr = minScavAddr - unlockHeap() + if !ok { + // We didn't find a candidate, so we're done. + work.limit = work.base + break + } + + // Find, verify, and scavenge if we can. + chunk := s.chunkOf(candidateChunkIdx) + base, npages := chunk.findScavengeCandidate(pallocChunkPages-1, minPages, maxPages) + if npages > 0 { + work.limit = offAddr{s.scavengeRangeLocked(candidateChunkIdx, base, npages)} + return uintptr(npages) * pageSize, work + } - return 0 + // We were fooled, so let's continue from where we left off. + work.limit = offAddr{chunkBase(candidateChunkIdx)} + } + return 0, work } // scavengeRangeLocked scavenges the given region of memory. +// The region of memory is described by its chunk index (ci), +// the starting page index of the region relative to that +// chunk (base), and the length of the region in pages (npages). +// +// Returns the base address of the scavenged region. // // s.mheapLock must be held. 
-func (s *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) { +func (s *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr { s.chunkOf(ci).scavenged.setRange(base, npages) // Compute the full address for the start of the range. addr := chunkBase(ci) + uintptr(base)*pageSize - // Update the scav pointer. - s.scavAddr = addr - 1 + // Update the scavenge low watermark. + if oAddr := (offAddr{addr}); oAddr.lessThan(s.scav.scavLWM) { + s.scav.scavLWM = oAddr + } // Only perform the actual scavenging if we're not in a test. // It's dangerous to do so otherwise. if s.test { - return + return addr } sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize) // Update global accounting only when not in test, otherwise // the runtime's accounting will be wrong. mSysStatInc(&memstats.heap_released, uintptr(npages)*pageSize) + return addr } // fillAligned returns x but with all zeroes in m-aligned diff --git a/libgo/go/runtime/mgcscavenge_test.go b/libgo/go/runtime/mgcscavenge_test.go index 58f9e3a80d3..7f619b1e7db 100644 --- a/libgo/go/runtime/mgcscavenge_test.go +++ b/libgo/go/runtime/mgcscavenge_test.go @@ -419,12 +419,12 @@ func TestPageAllocScavenge(t *testing.T) { } for name, v := range tests { v := v - runTest := func(t *testing.T, locked bool) { + runTest := func(t *testing.T, mayUnlock bool) { b := NewPageAlloc(v.beforeAlloc, v.beforeScav) defer FreePageAlloc(b) for iter, h := range v.expect { - if got := b.Scavenge(h.request, locked); got != h.expect { + if got := b.Scavenge(h.request, mayUnlock); got != h.expect { t.Fatalf("bad scavenge #%d: want %d, got %d", iter+1, h.expect, got) } } @@ -436,7 +436,7 @@ func TestPageAllocScavenge(t *testing.T) { t.Run(name, func(t *testing.T) { runTest(t, false) }) - t.Run(name+"Locked", func(t *testing.T) { + t.Run(name+"MayUnlock", func(t *testing.T) { runTest(t, true) }) } diff --git a/libgo/go/runtime/mgcsweep.go b/libgo/go/runtime/mgcsweep.go index 1e959a4ad2d..6877649edbb 100644 --- a/libgo/go/runtime/mgcsweep.go +++ b/libgo/go/runtime/mgcsweep.go @@ -10,7 +10,7 @@ // can free a whole span if none of the objects are marked, but that // isn't its goal. This can be driven either synchronously by // mcentral.cacheSpan for mcentral spans, or asynchronously by -// sweepone from the list of all in-use spans in mheap_.sweepSpans. +// sweepone, which looks at all the mcentral lists. // // * The span reclaimer looks for spans that contain no marked objects // and frees whole spans. This is a separate algorithm because @@ -40,6 +40,80 @@ type sweepdata struct { nbgsweep uint32 npausesweep uint32 + + // centralIndex is the current unswept span class. + // It represents an index into the mcentral span + // sets. Accessed and updated via its load and + // update methods. Not protected by a lock. + // + // Reset at mark termination. + // Used by mheap.nextSpanForSweep. + centralIndex sweepClass +} + +// sweepClass is a spanClass and one bit to represent whether we're currently +// sweeping partial or full spans. +type sweepClass uint32 + +const ( + numSweepClasses = numSpanClasses * 2 + sweepClassDone sweepClass = sweepClass(^uint32(0)) +) + +func (s *sweepClass) load() sweepClass { + return sweepClass(atomic.Load((*uint32)(s))) +} + +func (s *sweepClass) update(sNew sweepClass) { + // Only update *s if its current value is less than sNew, + // since *s increases monotonically. 
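sweepClass.update, whose body continues below, is effectively an atomic "monotonic max": it only ever raises the stored value and retries its compare-and-swap when it loses a race. The same idea written stand-alone against sync/atomic:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// storeMax raises *addr to v if v is larger, never lowering it, using a
// CAS loop analogous to sweepClass.update.
func storeMax(addr *uint32, v uint32) {
	for {
		old := atomic.LoadUint32(addr)
		if old >= v || atomic.CompareAndSwapUint32(addr, old, v) {
			return
		}
	}
}

func main() {
	var hi uint32
	var wg sync.WaitGroup
	for i := uint32(1); i <= 100; i++ {
		wg.Add(1)
		go func(v uint32) {
			defer wg.Done()
			storeMax(&hi, v)
		}(i)
	}
	wg.Wait()
	fmt.Println(atomic.LoadUint32(&hi)) // always 100
}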
+ sOld := s.load() + for sOld < sNew && !atomic.Cas((*uint32)(s), uint32(sOld), uint32(sNew)) { + sOld = s.load() + } + // TODO(mknyszek): This isn't the only place we have + // an atomic monotonically increasing counter. It would + // be nice to have an "atomic max" which is just implemented + // as the above on most architectures. Some architectures + // like RISC-V however have native support for an atomic max. +} + +func (s *sweepClass) clear() { + atomic.Store((*uint32)(s), 0) +} + +// split returns the underlying span class as well as +// whether we're interested in the full or partial +// unswept lists for that class, indicated as a boolean +// (true means "full"). +func (s sweepClass) split() (spc spanClass, full bool) { + return spanClass(s >> 1), s&1 == 0 +} + +// nextSpanForSweep finds and pops the next span for sweeping from the +// central sweep buffers. It returns ownership of the span to the caller. +// Returns nil if no such span exists. +func (h *mheap) nextSpanForSweep() *mspan { + sg := h.sweepgen + for sc := sweep.centralIndex.load(); sc < numSweepClasses; sc++ { + spc, full := sc.split() + c := &h.central[spc].mcentral + var s *mspan + if full { + s = c.fullUnswept(sg).pop() + } else { + s = c.partialUnswept(sg).pop() + } + if s != nil { + // Write down that we found something so future sweepers + // can start from here. + sweep.centralIndex.update(sc) + return s + } + } + // Write down that we found nothing. + sweep.centralIndex.update(sweepClassDone) + return nil } // finishsweep_m ensures that all spans are swept. @@ -58,6 +132,24 @@ func finishsweep_m() { sweep.npausesweep++ } + if go115NewMCentralImpl { + // Reset all the unswept buffers, which should be empty. + // Do this in sweep termination as opposed to mark termination + // so that we can catch unswept spans and reclaim blocks as + // soon as possible. + sg := mheap_.sweepgen + for i := range mheap_.central { + c := &mheap_.central[i].mcentral + c.partialUnswept(sg).reset() + c.fullUnswept(sg).reset() + } + } + + // Sweeping is done, so if the scavenger isn't already awake, + // wake it up. There's definitely work for it to do at this + // point. + wakeScavenger() + nextMarkBitArenaEpoch() } @@ -66,6 +158,7 @@ func bgsweep(c chan int) { sweep.g = getg() + lockInit(&sweep.lock, lockRankSweep) lock(&sweep.lock) sweep.parked = true c <- 1 @@ -111,7 +204,11 @@ func sweepone() uintptr { var s *mspan sg := mheap_.sweepgen for { - s = mheap_.sweepSpans[1-sg/2%2].pop() + if go115NewMCentralImpl { + s = mheap_.nextSpanForSweep() + } else { + s = mheap_.sweepSpans[1-sg/2%2].pop() + } if s == nil { atomic.Store(&mheap_.sweepdone, 1) break @@ -151,6 +248,27 @@ func sweepone() uintptr { // Decrement the number of active sweepers and if this is the // last one print trace information. if atomic.Xadd(&mheap_.sweepers, -1) == 0 && atomic.Load(&mheap_.sweepdone) != 0 { + // Since the sweeper is done, move the scavenge gen forward (signalling + // that there's new work to do) and wake the scavenger. + // + // The scavenger is signaled by the last sweeper because once + // sweeping is done, we will definitely have useful work for + // the scavenger to do, since the scavenger only runs over the + // heap once per GC cyle. This update is not done during sweep + // termination because in some cases there may be a long delay + // between sweep done and sweep termination (e.g. not enough + // allocations to trigger a GC) which would be nice to fill in + // with scavenging work. 
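// Editor's note (illustrative sketch, not part of this patch): the guard
// above, atomic.Xadd(&mheap_.sweepers, -1) == 0, is the "last worker out"
// pattern: every sweeper decrements a shared counter and the one that brings
// it to zero performs the one-time completion work. In user-level form with
// the standard sync/atomic package (hypothetical names, assumes
// import "sync/atomic"):
func leaveWork(active *int32, onLast func()) {
	// Only the goroutine that drops the counter to zero runs onLast.
	if atomic.AddInt32(active, -1) == 0 {
		onLast()
	}
}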
+ systemstack(func() { + lock(&mheap_.lock) + mheap_.pages.scavengeStartGen() + unlock(&mheap_.lock) + }) + // Since we might sweep in an allocation path, it's not possible + // for us to wake the scavenger directly via wakeScavenger, since + // it could allocate. Ask sysmon to do it for us instead. + readyForScavenger() + if debug.gcpacertrace > 0 { print("pacer: sweep done at heap size ", memstats.heap_live>>20, "MB; allocated ", (memstats.heap_live-mheap_.sweepHeapLiveBasis)>>20, "MB during sweep; swept ", mheap_.pagesSwept, " pages at ", sweepRatio, " pages/byte\n") } @@ -206,6 +324,260 @@ func (s *mspan) ensureSwept() { // If preserve=true, don't return it to heap nor relink in mcentral lists; // caller takes care of it. func (s *mspan) sweep(preserve bool) bool { + if !go115NewMCentralImpl { + return s.oldSweep(preserve) + } + // It's critical that we enter this function with preemption disabled, + // GC must not start while we are in the middle of this function. + _g_ := getg() + if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 { + throw("mspan.sweep: m is not locked") + } + sweepgen := mheap_.sweepgen + if state := s.state.get(); state != mSpanInUse || s.sweepgen != sweepgen-1 { + print("mspan.sweep: state=", state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n") + throw("mspan.sweep: bad span state") + } + + if trace.enabled { + traceGCSweepSpan(s.npages * _PageSize) + } + + atomic.Xadd64(&mheap_.pagesSwept, int64(s.npages)) + + spc := s.spanclass + size := s.elemsize + + c := _g_.m.p.ptr().mcache + + // The allocBits indicate which unmarked objects don't need to be + // processed since they were free at the end of the last GC cycle + // and were not allocated since then. + // If the allocBits index is >= s.freeindex and the bit + // is not marked then the object remains unallocated + // since the last GC. + // This situation is analogous to being on a freelist. + + // Unlink & free special records for any objects we're about to free. + // Two complications here: + // 1. An object can have both finalizer and profile special records. + // In such case we need to queue finalizer for execution, + // mark the object as live and preserve the profile special. + // 2. A tiny object can have several finalizers setup for different offsets. + // If such object is not marked, we need to queue all finalizers at once. + // Both 1 and 2 are possible at the same time. + hadSpecials := s.specials != nil + specialp := &s.specials + special := *specialp + for special != nil { + // A finalizer can be set for an inner byte of an object, find object beginning. + objIndex := uintptr(special.offset) / size + p := s.base() + objIndex*size + mbits := s.markBitsForIndex(objIndex) + if !mbits.isMarked() { + // This object is not marked and has at least one special record. + // Pass 1: see if it has at least one finalizer. + hasFin := false + endOffset := p - s.base() + size + for tmp := special; tmp != nil && uintptr(tmp.offset) < endOffset; tmp = tmp.next { + if tmp.kind == _KindSpecialFinalizer { + // Stop freeing of object if it has a finalizer. + mbits.setMarkedNonAtomic() + hasFin = true + break + } + } + // Pass 2: queue all finalizers _or_ handle profile record. + for special != nil && uintptr(special.offset) < endOffset { + // Find the exact byte for which the special was setup + // (as opposed to object beginning). + p := s.base() + uintptr(special.offset) + if special.kind == _KindSpecialFinalizer || !hasFin { + // Splice out special record. 
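// Editor's note (illustrative sketch, not part of this patch): the splice
// below removes a node from a singly linked list through a pointer-to-pointer
// (specialp), so the list head needs no special casing. The same idiom on a
// hypothetical node type:
type listNode struct{ next *listNode }

func unlink(headp **listNode, target *listNode) {
	// p always points at the pointer that references the current node,
	// so overwriting *p unlinks the node whether it is the head or not.
	for p := headp; *p != nil; p = &(*p).next {
		if *p == target {
			*p = target.next
			return
		}
	}
}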
+ y := special + special = special.next + *specialp = special + freespecial(y, unsafe.Pointer(p), size) + } else { + // This is profile record, but the object has finalizers (so kept alive). + // Keep special record. + specialp = &special.next + special = *specialp + } + } + } else { + // object is still live: keep special record + specialp = &special.next + special = *specialp + } + } + if hadSpecials && s.specials == nil { + spanHasNoSpecials(s) + } + + if debug.allocfreetrace != 0 || debug.clobberfree != 0 || raceenabled || msanenabled { + // Find all newly freed objects. This doesn't have to + // efficient; allocfreetrace has massive overhead. + mbits := s.markBitsForBase() + abits := s.allocBitsForIndex(0) + for i := uintptr(0); i < s.nelems; i++ { + if !mbits.isMarked() && (abits.index < s.freeindex || abits.isMarked()) { + x := s.base() + i*s.elemsize + if debug.allocfreetrace != 0 { + tracefree(unsafe.Pointer(x), size) + } + if debug.clobberfree != 0 { + clobberfree(unsafe.Pointer(x), size) + } + if raceenabled { + racefree(unsafe.Pointer(x), size) + } + if msanenabled { + msanfree(unsafe.Pointer(x), size) + } + } + mbits.advance() + abits.advance() + } + } + + // Check for zombie objects. + if s.freeindex < s.nelems { + // Everything < freeindex is allocated and hence + // cannot be zombies. + // + // Check the first bitmap byte, where we have to be + // careful with freeindex. + obj := s.freeindex + if (*s.gcmarkBits.bytep(obj / 8)&^*s.allocBits.bytep(obj / 8))>>(obj%8) != 0 { + s.reportZombies() + } + // Check remaining bytes. + for i := obj/8 + 1; i < divRoundUp(s.nelems, 8); i++ { + if *s.gcmarkBits.bytep(i)&^*s.allocBits.bytep(i) != 0 { + s.reportZombies() + } + } + } + + // Count the number of free objects in this span. + nalloc := uint16(s.countAlloc()) + nfreed := s.allocCount - nalloc + if nalloc > s.allocCount { + // The zombie check above should have caught this in + // more detail. + print("runtime: nelems=", s.nelems, " nalloc=", nalloc, " previous allocCount=", s.allocCount, " nfreed=", nfreed, "\n") + throw("sweep increased allocation count") + } + + s.allocCount = nalloc + s.freeindex = 0 // reset allocation index to start of span. + if trace.enabled { + getg().m.p.ptr().traceReclaimed += uintptr(nfreed) * s.elemsize + } + + // gcmarkBits becomes the allocBits. + // get a fresh cleared gcmarkBits in preparation for next GC + s.allocBits = s.gcmarkBits + s.gcmarkBits = newMarkBits(s.nelems) + + // Initialize alloc bits cache. + s.refillAllocCache(0) + + // The span must be in our exclusive ownership until we update sweepgen, + // check for potential races. + if state := s.state.get(); state != mSpanInUse || s.sweepgen != sweepgen-1 { + print("mspan.sweep: state=", state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n") + throw("mspan.sweep: bad span state after sweep") + } + if s.sweepgen == sweepgen+1 || s.sweepgen == sweepgen+3 { + throw("swept cached span") + } + + // We need to set s.sweepgen = h.sweepgen only when all blocks are swept, + // because of the potential for a concurrent free/SetFinalizer. + // + // But we need to set it before we make the span available for allocation + // (return it to heap or mcentral), because allocation code assumes that a + // span is already swept if available for allocation. + // + // Serialization point. + // At this point the mark bits are cleared and allocation ready + // to go so release the span. + atomic.Store(&s.sweepgen, sweepgen) + + if spc.sizeclass() != 0 { + // Handle spans for small objects. 
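// Editor's note (illustrative sketch, not part of this patch): the zombie
// check earlier in this function looks for objects that are marked in
// gcmarkBits but clear in allocBits, i.e. bytes where mark &^ alloc != 0.
// A standalone version of that bit test over plain byte slices (the helper
// name firstZombie is hypothetical):
func firstZombie(mark, alloc []byte, nobj int) int {
	for i := 0; i < nobj; i++ {
		// Bit i is a zombie if it is set in mark but not in alloc.
		if (mark[i/8]&^alloc[i/8])>>(uint(i)%8)&1 != 0 {
			return i
		}
	}
	return -1
}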
+ if nfreed > 0 { + // Only mark the span as needing zeroing if we've freed any + // objects, because a fresh span that had been allocated into, + // wasn't totally filled, but then swept, still has all of its + // free slots zeroed. + s.needzero = 1 + c.local_nsmallfree[spc.sizeclass()] += uintptr(nfreed) + } + if !preserve { + // The caller may not have removed this span from whatever + // unswept set its on but taken ownership of the span for + // sweeping by updating sweepgen. If this span still is in + // an unswept set, then the mcentral will pop it off the + // set, check its sweepgen, and ignore it. + if nalloc == 0 { + // Free totally free span directly back to the heap. + mheap_.freeSpan(s) + return true + } + // Return span back to the right mcentral list. + if uintptr(nalloc) == s.nelems { + mheap_.central[spc].mcentral.fullSwept(sweepgen).push(s) + } else { + mheap_.central[spc].mcentral.partialSwept(sweepgen).push(s) + } + } + } else if !preserve { + // Handle spans for large objects. + if nfreed != 0 { + // Free large object span to heap. + + // NOTE(rsc,dvyukov): The original implementation of efence + // in CL 22060046 used sysFree instead of sysFault, so that + // the operating system would eventually give the memory + // back to us again, so that an efence program could run + // longer without running out of memory. Unfortunately, + // calling sysFree here without any kind of adjustment of the + // heap data structures means that when the memory does + // come back to us, we have the wrong metadata for it, either in + // the mspan structures or in the garbage collection bitmap. + // Using sysFault here means that the program will run out of + // memory fairly quickly in efence mode, but at least it won't + // have mysterious crashes due to confused memory reuse. + // It should be possible to switch back to sysFree if we also + // implement and then call some kind of mheap.deleteSpan. + if debug.efence > 0 { + s.limit = 0 // prevent mlookup from finding this span + sysFault(unsafe.Pointer(s.base()), size) + } else { + mheap_.freeSpan(s) + } + c.local_nlargefree++ + c.local_largefree += size + return true + } + + // Add a large span directly onto the full+swept list. + mheap_.central[spc].mcentral.fullSwept(sweepgen).push(s) + } + return false +} + +// Sweep frees or collects finalizers for blocks not marked in the mark phase. +// It clears the mark bits in preparation for the next GC round. +// Returns true if the span was returned to heap. +// If preserve=true, don't return it to heap nor relink in mcentral lists; +// caller takes care of it. +// +// For !go115NewMCentralImpl. +func (s *mspan) oldSweep(preserve bool) bool { // It's critical that we enter this function with preemption disabled, // GC must not start while we are in the middle of this function. _g_ := getg() @@ -228,7 +600,7 @@ func (s *mspan) sweep(preserve bool) bool { size := s.elemsize res := false - c := _g_.m.mcache + c := _g_.m.p.ptr().mcache freeToHeap := false // The allocBits indicate which unmarked objects don't need to be @@ -247,6 +619,7 @@ func (s *mspan) sweep(preserve bool) bool { // 2. A tiny object can have several finalizers setup for different offsets. // If such object is not marked, we need to queue all finalizers at once. // Both 1 and 2 are possible at the same time. 
+ hadSpecials := s.specials != nil specialp := &s.specials special := *specialp for special != nil { @@ -291,6 +664,9 @@ func (s *mspan) sweep(preserve bool) bool { special = *specialp } } + if go115NewMarkrootSpans && hadSpecials && s.specials == nil { + spanHasNoSpecials(s) + } if debug.allocfreetrace != 0 || debug.clobberfree != 0 || raceenabled || msanenabled { // Find all newly freed objects. This doesn't have to @@ -402,6 +778,57 @@ func (s *mspan) sweep(preserve bool) bool { return res } +// reportZombies reports any marked but free objects in s and throws. +// +// This generally means one of the following: +// +// 1. User code converted a pointer to a uintptr and then back +// unsafely, and a GC ran while the uintptr was the only reference to +// an object. +// +// 2. User code (or a compiler bug) constructed a bad pointer that +// points to a free slot, often a past-the-end pointer. +// +// 3. The GC two cycles ago missed a pointer and freed a live object, +// but it was still live in the last cycle, so this GC cycle found a +// pointer to that object and marked it. +func (s *mspan) reportZombies() { + printlock() + print("runtime: marked free object in span ", s, ", elemsize=", s.elemsize, " freeindex=", s.freeindex, " (bad use of unsafe.Pointer? try -d=checkptr)\n") + mbits := s.markBitsForBase() + abits := s.allocBitsForIndex(0) + for i := uintptr(0); i < s.nelems; i++ { + addr := s.base() + i*s.elemsize + print(hex(addr)) + alloc := i < s.freeindex || abits.isMarked() + if alloc { + print(" alloc") + } else { + print(" free ") + } + if mbits.isMarked() { + print(" marked ") + } else { + print(" unmarked") + } + zombie := mbits.isMarked() && !alloc + if zombie { + print(" zombie") + } + print("\n") + if zombie { + length := s.elemsize + if length > 1024 { + length = 1024 + } + hexdumpWords(addr, addr+length, nil) + } + mbits.advance() + abits.advance() + } + throw("found pointer to free object") +} + // deductSweepCredit deducts sweep credit for allocating a span of // size spanBytes. This must be performed *before* the span is // allocated to ensure the system has enough credit. If necessary, it diff --git a/libgo/go/runtime/mgcsweepbuf.go b/libgo/go/runtime/mgcsweepbuf.go index 78288229c83..1f722c3d585 100644 --- a/libgo/go/runtime/mgcsweepbuf.go +++ b/libgo/go/runtime/mgcsweepbuf.go @@ -144,7 +144,7 @@ func (b *gcSweepBuf) pop() *mspan { // intervening pops. Spans that are pushed after the call may also // appear in these blocks. func (b *gcSweepBuf) numBlocks() int { - return int((atomic.Load(&b.index) + gcSweepBlockEntries - 1) / gcSweepBlockEntries) + return int(divRoundUp(uintptr(atomic.Load(&b.index)), gcSweepBlockEntries)) } // block returns the spans in the i'th block of buffer b. block is diff --git a/libgo/go/runtime/mgcwork.go b/libgo/go/runtime/mgcwork.go index a1b61add9b5..d23d64060ec 100644 --- a/libgo/go/runtime/mgcwork.go +++ b/libgo/go/runtime/mgcwork.go @@ -177,6 +177,10 @@ func (w *gcWork) put(obj uintptr) { flushed := false wbuf := w.wbuf1 + // Record that this may acquire the wbufSpans or heap lock to + // allocate a workbuf. + lockWithRankMayAcquire(&work.wbufSpans.lock, lockRankWbufSpans) + lockWithRankMayAcquire(&mheap_.lock, lockRankMheap) if wbuf == nil { w.init() wbuf = w.wbuf1 @@ -422,6 +426,10 @@ func getempty() *workbuf { b.checkempty() } } + // Record that this may acquire the wbufSpans or heap lock to + // allocate a workbuf. 
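// Editor's note (illustrative sketch, not part of this patch): the
// mgcsweepbuf.go numBlocks change above replaces open-coded round-up
// division with a divRoundUp helper. Rounding-up integer division is
// conventionally written as (n + a - 1) / a; a plausible shape of such a
// helper (the name divRoundUpSketch is hypothetical):
func divRoundUpSketch(n, a uintptr) uintptr {
	// Valid for a > 0 and n + a - 1 not overflowing.
	return (n + a - 1) / a
}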
+ lockWithRankMayAcquire(&work.wbufSpans.lock, lockRankWbufSpans) + lockWithRankMayAcquire(&mheap_.lock, lockRankMheap) if b == nil { // Allocate more workbufs. var s *mspan diff --git a/libgo/go/runtime/mheap.go b/libgo/go/runtime/mheap.go index c40c9e26628..e73ee32efd4 100644 --- a/libgo/go/runtime/mheap.go +++ b/libgo/go/runtime/mheap.go @@ -27,6 +27,32 @@ const ( // maxPhysHugePageSize sets an upper-bound on the maximum huge page size // that the runtime supports. maxPhysHugePageSize = pallocChunkBytes + + // pagesPerReclaimerChunk indicates how many pages to scan from the + // pageInUse bitmap at a time. Used by the page reclaimer. + // + // Higher values reduce contention on scanning indexes (such as + // h.reclaimIndex), but increase the minimum latency of the + // operation. + // + // The time required to scan this many pages can vary a lot depending + // on how many spans are actually freed. Experimentally, it can + // scan for pages at ~300 GB/ms on a 2.6GHz Core i7, but can only + // free spans at ~32 MB/ms. Using 512 pages bounds this at + // roughly 100µs. + // + // Must be a multiple of the pageInUse bitmap element size and + // must also evenly divid pagesPerArena. + pagesPerReclaimerChunk = 512 + + // go115NewMCentralImpl is a feature flag for the new mcentral implementation. + // + // This flag depends on go115NewMarkrootSpans because the new mcentral + // implementation requires that markroot spans no longer rely on mgcsweepbufs. + // The definition of this flag helps ensure that if there's a problem with + // the new markroot spans implementation and it gets turned off, that the new + // mcentral implementation also gets turned off so the runtime isn't broken. + go115NewMCentralImpl = true && go115NewMarkrootSpans ) // Main malloc heap. @@ -68,9 +94,11 @@ type mheap struct { // unswept stack and pushes spans that are still in-use on the // swept stack. Likewise, allocating an in-use span pushes it // on the swept stack. + // + // For !go115NewMCentralImpl. sweepSpans [2]gcSweepBuf - // _ uint32 // align uint64 fields on 32-bit for atomics + _ uint32 // align uint64 fields on 32-bit for atomics // Proportional sweep // @@ -180,13 +208,19 @@ type mheap struct { // simply blocking GC (by disabling preemption). sweepArenas []arenaIdx + // markArenas is a snapshot of allArenas taken at the beginning + // of the mark cycle. Because allArenas is append-only, neither + // this slice nor its contents will change during the mark, so + // it can be read safely. + markArenas []arenaIdx + // curArena is the arena that the heap is currently growing // into. This should always be physPageSize-aligned. curArena struct { base, end uintptr } - _ uint32 // ensure 64-bit alignment of central + // _ uint32 // ensure 64-bit alignment of central // central free lists for small size classes. // the padding makes sure that the mcentrals are @@ -256,6 +290,16 @@ type heapArena struct { // operations. pageMarks [pagesPerArena / 8]uint8 + // pageSpecials is a bitmap that indicates which spans have + // specials (finalizers or other). Like pageInUse, only the bit + // corresponding to the first page in each span is used. + // + // Writes are done atomically whenever a special is added to + // a span and whenever the last special is removed from a span. + // Reads are done atomically to find spans containing specials + // during marking. 
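// Editor's note (illustrative sketch, not part of this patch): the
// pageSpecials bitmap declared below is kept up to date with atomic Or8/And8
// operations (see spanHasSpecials and spanHasNoSpecials later in this patch).
// The standard sync/atomic package has no 8-bit operations, so a portable
// user-level version of "atomically set a bit" uses a CAS loop over uint32
// words (hypothetical helper, assumes import "sync/atomic"):
func setBitAtomic(words []uint32, i uint) {
	p := &words[i/32]
	mask := uint32(1) << (i % 32)
	for {
		old := atomic.LoadUint32(p)
		// Done if the bit is already set or our CAS lands; otherwise retry.
		if old&mask != 0 || atomic.CompareAndSwapUint32(p, old, old|mask) {
			return
		}
	}
}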
+ pageSpecials [pagesPerArena / 8]uint8 + // zeroedBase marks the first byte of the first page in this // arena which hasn't been used yet and is therefore already // zero. zeroedBase is relative to the arena base. @@ -532,13 +576,13 @@ func (sc spanClass) noscan() bool { // //go:nosplit func arenaIndex(p uintptr) arenaIdx { - return arenaIdx((p + arenaBaseOffset) / heapArenaBytes) + return arenaIdx((p - arenaBaseOffset) / heapArenaBytes) } // arenaBase returns the low address of the region covered by heap // arena i. func arenaBase(i arenaIdx) uintptr { - return uintptr(i)*heapArenaBytes - arenaBaseOffset + return uintptr(i)*heapArenaBytes + arenaBaseOffset } type arenaIdx uint @@ -670,6 +714,11 @@ func pageIndexOf(p uintptr) (arena *heapArena, pageIdx uintptr, pageMask uint8) // Initialize the heap. func (h *mheap) init() { + lockInit(&h.lock, lockRankMheap) + lockInit(&h.sweepSpans[0].spineLock, lockRankSpine) + lockInit(&h.sweepSpans[1].spineLock, lockRankSpine) + lockInit(&h.speciallock, lockRankMheapSpecial) + h.spanalloc.init(unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys) h.cachealloc.init(unsafe.Sizeof(mcache{}), nil, nil, &memstats.mcache_sys) h.specialfinalizeralloc.init(unsafe.Sizeof(specialfinalizer{}), nil, nil, &memstats.other_sys) @@ -701,23 +750,10 @@ func (h *mheap) init() { // // h must NOT be locked. func (h *mheap) reclaim(npage uintptr) { - // This scans pagesPerChunk at a time. Higher values reduce - // contention on h.reclaimPos, but increase the minimum - // latency of performing a reclaim. - // - // Must be a multiple of the pageInUse bitmap element size. - // - // The time required by this can vary a lot depending on how - // many spans are actually freed. Experimentally, it can scan - // for pages at ~300 GB/ms on a 2.6GHz Core i7, but can only - // free spans at ~32 MB/ms. Using 512 pages bounds this at - // roughly 100µs. - // // TODO(austin): Half of the time spent freeing spans is in // locking/unlocking the heap (even with low contention). We // could make the slow path here several times faster by // batching heap frees. - const pagesPerChunk = 512 // Bail early if there's no more reclaim work. if atomic.Load64(&h.reclaimIndex) >= 1<<63 { @@ -750,7 +786,7 @@ func (h *mheap) reclaim(npage uintptr) { } // Claim a chunk of work. - idx := uintptr(atomic.Xadd64(&h.reclaimIndex, pagesPerChunk) - pagesPerChunk) + idx := uintptr(atomic.Xadd64(&h.reclaimIndex, pagesPerReclaimerChunk) - pagesPerReclaimerChunk) if idx/pagesPerArena >= uintptr(len(arenas)) { // Page reclaiming is done. atomic.Store64(&h.reclaimIndex, 1<<63) @@ -764,7 +800,7 @@ func (h *mheap) reclaim(npage uintptr) { } // Scan this chunk. - nfound := h.reclaimChunk(arenas, idx, pagesPerChunk) + nfound := h.reclaimChunk(arenas, idx, pagesPerReclaimerChunk) if nfound <= npage { npage -= nfound } else { @@ -1141,10 +1177,21 @@ func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysS // which may only be done with the heap locked. // Transfer stats from mcache to global. - memstats.heap_scan += uint64(gp.m.mcache.local_scan) - gp.m.mcache.local_scan = 0 - memstats.tinyallocs += uint64(gp.m.mcache.local_tinyallocs) - gp.m.mcache.local_tinyallocs = 0 + var c *mcache + if gp.m.p != 0 { + c = gp.m.p.ptr().mcache + } else { + // This case occurs while bootstrapping. + // See the similar code in mallocgc. 
+ c = mcache0 + if c == nil { + throw("mheap.allocSpan called with no P") + } + } + memstats.heap_scan += uint64(c.local_scan) + c.local_scan = 0 + memstats.tinyallocs += uint64(c.local_tinyallocs) + c.local_tinyallocs = 0 // Do some additional accounting if it's a large allocation. if spanclass.sizeclass() == 0 { @@ -1236,19 +1283,22 @@ HaveSpan: // Publish the span in various locations. // This is safe to call without the lock held because the slots - // related to this span will only every be read or modified by - // this thread until pointers into the span are published or - // pageInUse is updated. + // related to this span will only ever be read or modified by + // this thread until pointers into the span are published (and + // we execute a publication barrier at the end of this function + // before that happens) or pageInUse is updated. h.setSpans(s.base(), npages, s) if !manual { - // Add to swept in-use list. - // - // This publishes the span to root marking. - // - // h.sweepgen is guaranteed to only change during STW, - // and preemption is disabled in the page allocator. - h.sweepSpans[h.sweepgen/2%2].push(s) + if !go115NewMCentralImpl { + // Add to swept in-use list. + // + // This publishes the span to root marking. + // + // h.sweepgen is guaranteed to only change during STW, + // and preemption is disabled in the page allocator. + h.sweepSpans[h.sweepgen/2%2].push(s) + } // Mark in-use span in arena page bitmap. // @@ -1266,6 +1316,11 @@ HaveSpan: traceHeapAlloc() } } + + // Make sure the newly allocated span will be observed + // by the GC before pointers into the span are published. + publicationBarrier() + return s } @@ -1278,8 +1333,11 @@ func (h *mheap) grow(npage uintptr) bool { ask := alignUp(npage, pallocChunkPages) * pageSize totalGrowth := uintptr(0) - nBase := alignUp(h.curArena.base+ask, physPageSize) - if nBase > h.curArena.end { + // This may overflow because ask could be very large + // and is otherwise unrelated to h.curArena.base. + end := h.curArena.base + ask + nBase := alignUp(end, physPageSize) + if nBase > h.curArena.end || /* overflow */ end < h.curArena.base { // Not enough room in the current arena. Allocate more // arena space. This may not be contiguous with the // current arena, so we have to request the full ask. @@ -1315,7 +1373,10 @@ func (h *mheap) grow(npage uintptr) bool { mSysStatInc(&memstats.heap_released, asize) mSysStatInc(&memstats.heap_idle, asize) - // Recalculate nBase + // Recalculate nBase. + // We know this won't overflow, because sysAlloc returned + // a valid region starting at h.curArena.base which is at + // least ask bytes in size. nBase = alignUp(h.curArena.base+ask, physPageSize) } @@ -1334,7 +1395,7 @@ func (h *mheap) grow(npage uintptr) bool { if overage := uintptr(retained + uint64(totalGrowth) - h.scavengeGoal); todo > overage { todo = overage } - h.pages.scavenge(todo, true) + h.pages.scavenge(todo, false) } return true } @@ -1342,12 +1403,12 @@ func (h *mheap) grow(npage uintptr) bool { // Free the span back into the heap. 
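// Editor's note (illustrative sketch, not part of this patch): the grow
// change above guards the base+ask computation by checking for unsigned
// wraparound (end < h.curArena.base). The same check in isolation, as a
// hypothetical helper:
func addOverflows(base, size uintptr) bool {
	// For unsigned arithmetic, base+size wraps around exactly when the
	// sum is smaller than one of the operands.
	return base+size < base
}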
func (h *mheap) freeSpan(s *mspan) { systemstack(func() { - mp := getg().m + c := getg().m.p.ptr().mcache lock(&h.lock) - memstats.heap_scan += uint64(mp.mcache.local_scan) - mp.mcache.local_scan = 0 - memstats.tinyallocs += uint64(mp.mcache.local_tinyallocs) - mp.mcache.local_tinyallocs = 0 + memstats.heap_scan += uint64(c.local_scan) + c.local_scan = 0 + memstats.tinyallocs += uint64(c.local_tinyallocs) + c.local_tinyallocs = 0 if msanenabled { // Tell msan that this entire span is no longer in use. base := unsafe.Pointer(s.base()) @@ -1418,9 +1479,9 @@ func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool) { h.freeMSpanLocked(s) } -// scavengeAll visits each node in the free treap and scavenges the -// treapNode's span. It then removes the scavenged span from -// unscav and adds it into scav before continuing. +// scavengeAll acquires the heap lock (blocking any additional +// manipulation of the page allocator) and iterates over the whole +// heap, scavenging every free page available. func (h *mheap) scavengeAll() { // Disallow malloc or panic while holding the heap lock. We do // this here because this is a non-mallocgc entry-point to @@ -1428,14 +1489,16 @@ func (h *mheap) scavengeAll() { gp := getg() gp.m.mallocing++ lock(&h.lock) - // Reset the scavenger address so we have access to the whole heap. - h.pages.resetScavengeAddr() - released := h.pages.scavenge(^uintptr(0), true) + // Start a new scavenge generation so we have a chance to walk + // over the whole heap. + h.pages.scavengeStartGen() + released := h.pages.scavenge(^uintptr(0), false) + gen := h.pages.scav.gen unlock(&h.lock) gp.m.mallocing-- if debug.scavtrace > 0 { - printScavTrace(released, true) + printScavTrace(gen, released, true) } } @@ -1463,6 +1526,7 @@ func (span *mspan) init(base uintptr, npages uintptr) { span.allocBits = nil span.gcmarkBits = nil span.state.set(mSpanDead) + lockInit(&span.speciallock, lockRankMspanSpecial) } func (span *mspan) inList() bool { @@ -1576,6 +1640,22 @@ type special struct { kind byte // kind of special } +// spanHasSpecials marks a span as having specials in the arena bitmap. +func spanHasSpecials(s *mspan) { + arenaPage := (s.base() / pageSize) % pagesPerArena + ai := arenaIndex(s.base()) + ha := mheap_.arenas[ai.l1()][ai.l2()] + atomic.Or8(&ha.pageSpecials[arenaPage/8], uint8(1)<<(arenaPage%8)) +} + +// spanHasNoSpecials marks a span as having no specials in the arena bitmap. +func spanHasNoSpecials(s *mspan) { + arenaPage := (s.base() / pageSize) % pagesPerArena + ai := arenaIndex(s.base()) + ha := mheap_.arenas[ai.l1()][ai.l2()] + atomic.And8(&ha.pageSpecials[arenaPage/8], ^(uint8(1) << (arenaPage % 8))) +} + // Adds the special record s to the list of special records for // the object p. All fields of s should be filled in except for // offset & next, which this routine will fill in. @@ -1621,6 +1701,9 @@ func addspecial(p unsafe.Pointer, s *special) bool { s.offset = uint16(offset) s.next = *t *t = s + if go115NewMarkrootSpans { + spanHasSpecials(span) + } unlock(&span.speciallock) releasem(mp) @@ -1644,6 +1727,7 @@ func removespecial(p unsafe.Pointer, kind uint8) *special { offset := uintptr(p) - span.base() + var result *special lock(&span.speciallock) t := &span.specials for { @@ -1655,15 +1739,17 @@ func removespecial(p unsafe.Pointer, kind uint8) *special { // "interior" specials (p must be exactly equal to s->offset). 
if offset == uintptr(s.offset) && kind == s.kind { *t = s.next - unlock(&span.speciallock) - releasem(mp) - return s + result = s + break } t = &s.next } + if go115NewMarkrootSpans && span.specials == nil { + spanHasNoSpecials(span) + } unlock(&span.speciallock) releasem(mp) - return nil + return result } // The described object has a finalizer set for it. diff --git a/libgo/go/runtime/mkpreempt.go b/libgo/go/runtime/mkpreempt.go index 35ed42871f1..1fe77663b9c 100644 --- a/libgo/go/runtime/mkpreempt.go +++ b/libgo/go/runtime/mkpreempt.go @@ -502,8 +502,33 @@ func genPPC64() { } func genRISCV64() { - p("// No async preemption on riscv64 - see issue 36711") - p("UNDEF") + // X0 (zero), X1 (LR), X2 (SP), X4 (g), X31 (TMP) are special. + var l = layout{sp: "X2", stack: 8} + + // Add integer registers (X3, X5-X30). + for i := 3; i < 31; i++ { + if i == 4 { + continue + } + reg := fmt.Sprintf("X%d", i) + l.add("MOV", reg, 8) + } + + // Add floating point registers (F0-F31). + for i := 0; i <= 31; i++ { + reg := fmt.Sprintf("F%d", i) + l.add("MOVD", reg, 8) + } + + p("MOV X1, -%d(X2)", l.stack) + p("ADD $-%d, X2", l.stack) + l.save() + p("CALL ·asyncPreempt2(SB)") + l.restore() + p("MOV %d(X2), X1", l.stack) + p("MOV (X2), X31") + p("ADD $%d, X2", l.stack+8) + p("JMP (X31)") } func genS390X() { diff --git a/libgo/go/runtime/mpagealloc.go b/libgo/go/runtime/mpagealloc.go index bb751f1f8ed..60f7f9ff58e 100644 --- a/libgo/go/runtime/mpagealloc.go +++ b/libgo/go/runtime/mpagealloc.go @@ -81,20 +81,14 @@ const ( // there should this change. pallocChunksL2Bits = heapAddrBits - logPallocChunkBytes - pallocChunksL1Bits pallocChunksL1Shift = pallocChunksL2Bits - - // Maximum searchAddr value, which indicates that the heap has no free space. - // - // We subtract arenaBaseOffset because we want this to represent the maximum - // value in the shifted address space, but searchAddr is stored as a regular - // memory address. See arenaBaseOffset for details. - maxSearchAddr = ^uintptr(0) - arenaBaseOffset - - // Minimum scavAddr value, which indicates that the scavenger is done. - // - // minScavAddr + arenaBaseOffset == 0 - minScavAddr = (^arenaBaseOffset + 1) & uintptrMask ) +// Maximum searchAddr value, which indicates that the heap has no free space. +// +// We alias maxOffAddr just to make it clear that this is the maximum address +// for the page allocator's search space. See maxOffAddr for details. +var maxSearchAddr = maxOffAddr + // Global chunk index. // // Represents an index into the leaf level of the radix tree. @@ -105,12 +99,12 @@ type chunkIdx uint // chunkIndex returns the global index of the palloc chunk containing the // pointer p. func chunkIndex(p uintptr) chunkIdx { - return chunkIdx((p + arenaBaseOffset) / pallocChunkBytes) + return chunkIdx((p - arenaBaseOffset) / pallocChunkBytes) } // chunkIndex returns the base address of the palloc chunk at index ci. func chunkBase(ci chunkIdx) uintptr { - return uintptr(ci)*pallocChunkBytes - arenaBaseOffset + return uintptr(ci)*pallocChunkBytes + arenaBaseOffset } // chunkPageIndex computes the index of the page that contains p, @@ -139,6 +133,18 @@ func (i chunkIdx) l2() uint { } } +// offAddrToLevelIndex converts an address in the offset address space +// to the index into summary[level] containing addr. 
+func offAddrToLevelIndex(level int, addr offAddr) int { + return int((addr.a - arenaBaseOffset) >> levelShift[level]) +} + +// levelIndexToOffAddr converts an index into summary[level] into +// the corresponding address in the offset address space. +func levelIndexToOffAddr(level, idx int) offAddr { + return offAddr{(uintptr(idx) << levelShift[level]) + arenaBaseOffset} +} + // addrsToSummaryRange converts base and limit pointers into a range // of entries for the given summary level. // @@ -153,8 +159,8 @@ func addrsToSummaryRange(level int, base, limit uintptr) (lo int, hi int) { // of a summary's max page count boundary for this level // (1 << levelLogPages[level]). So, make limit an inclusive upper bound // then shift, then add 1, so we get an exclusive upper bound at the end. - lo = int((base + arenaBaseOffset) >> levelShift[level]) - hi = int(((limit-1)+arenaBaseOffset)>>levelShift[level]) + 1 + lo = int((base - arenaBaseOffset) >> levelShift[level]) + hi = int(((limit-1)-arenaBaseOffset)>>levelShift[level]) + 1 return } @@ -237,16 +243,7 @@ type pageAlloc struct { // Note that adding in arenaBaseOffset transforms addresses // to a new address space with a linear view of the full address // space on architectures with segmented address spaces. - searchAddr uintptr - - // The address to start a scavenge candidate search with. It - // need not point to memory contained in inUse. - scavAddr uintptr - - // The amount of memory scavenged since the last scavtrace print. - // - // Read and updated atomically. - scavReleased uintptr + searchAddr offAddr // start and end represent the chunk indices // which pageAlloc knows about. It assumes @@ -267,6 +264,33 @@ type pageAlloc struct { // All access is protected by the mheapLock. inUse addrRanges + // scav stores the scavenger state. + // + // All fields are protected by mheapLock. + scav struct { + // inUse is a slice of ranges of address space which have not + // yet been looked at by the scavenger. + inUse addrRanges + + // gen is the scavenge generation number. + gen uint32 + + // reservationBytes is how large of a reservation should be made + // in bytes of address space for each scavenge iteration. + reservationBytes uintptr + + // released is the amount of memory released this generation. + released uintptr + + // scavLWM is the lowest (offset) address that the scavenger reached this + // scavenge generation. + scavLWM offAddr + + // freeHWM is the highest (offset) address of a page that was freed to + // the page allocator this scavenge generation. + freeHWM offAddr + } + // mheap_.lock. This level of indirection makes it possible // to test pageAlloc indepedently of the runtime allocator. mheapLock *mutex @@ -299,34 +323,11 @@ func (s *pageAlloc) init(mheapLock *mutex, sysStat *uint64) { // Start with the searchAddr in a state indicating there's no free memory. s.searchAddr = maxSearchAddr - // Start with the scavAddr in a state indicating there's nothing more to do. - s.scavAddr = minScavAddr - // Set the mheapLock. s.mheapLock = mheapLock -} -// compareSearchAddrTo compares an address against s.searchAddr in a linearized -// view of the address space on systems with discontinuous process address spaces. -// This linearized view is the same one generated by chunkIndex and arenaIndex, -// done by adding arenaBaseOffset. -// -// On systems without a discontinuous address space, it's just a normal comparison. -// -// Returns < 0 if addr is less than s.searchAddr in the linearized address space. 
-// Returns > 0 if addr is greater than s.searchAddr in the linearized address space. -// Returns 0 if addr and s.searchAddr are equal. -func (s *pageAlloc) compareSearchAddrTo(addr uintptr) int { - // Compare with arenaBaseOffset added because it gives us a linear, contiguous view - // of the heap on architectures with signed address spaces. - lAddr := addr + arenaBaseOffset - lSearchAddr := s.searchAddr + arenaBaseOffset - if lAddr < lSearchAddr { - return -1 - } else if lAddr > lSearchAddr { - return 1 - } - return 0 + // Initialize scavenge tracking state. + s.scav.scavLWM = maxSearchAddr } // chunkOf returns the chunk at the given chunk index. @@ -362,13 +363,13 @@ func (s *pageAlloc) grow(base, size uintptr) { // Note that [base, limit) will never overlap with any existing // range inUse because grow only ever adds never-used memory // regions to the page allocator. - s.inUse.add(addrRange{base, limit}) + s.inUse.add(makeAddrRange(base, limit)) // A grow operation is a lot like a free operation, so if our - // chunk ends up below the (linearized) s.searchAddr, update - // s.searchAddr to the new address, just like in free. - if s.compareSearchAddrTo(base) < 0 { - s.searchAddr = base + // chunk ends up below s.searchAddr, update s.searchAddr to the + // new address, just like in free. + if b := (offAddr{base}); b.lessThan(s.searchAddr) { + s.searchAddr = b } // Add entries into chunks, which is sparse, if needed. Then, @@ -532,7 +533,7 @@ func (s *pageAlloc) allocRange(base, npages uintptr) uintptr { // searchAddr returned is invalid and must be ignored. // // s.mheapLock must be held. -func (s *pageAlloc) find(npages uintptr) (uintptr, uintptr) { +func (s *pageAlloc) find(npages uintptr) (uintptr, offAddr) { // Search algorithm. // // This algorithm walks each level l of the radix tree from the root level @@ -572,13 +573,13 @@ func (s *pageAlloc) find(npages uintptr) (uintptr, uintptr) { // firstFree is updated by calling foundFree each time free space in the // heap is discovered. // - // At the end of the search, base-arenaBaseOffset is the best new + // At the end of the search, base.addr() is the best new // searchAddr we could deduce in this search. firstFree := struct { - base, bound uintptr + base, bound offAddr }{ - base: 0, - bound: (1<<heapAddrBits - 1), + base: minOffAddr, + bound: maxOffAddr, } // foundFree takes the given address range [addr, addr+size) and // updates firstFree if it is a narrower range. The input range must @@ -589,17 +590,17 @@ func (s *pageAlloc) find(npages uintptr) (uintptr, uintptr) { // pages on the root level and narrow that down if we descend into // that summary. But as soon as we need to iterate beyond that summary // in a level to find a large enough range, we'll stop narrowing. - foundFree := func(addr, size uintptr) { - if firstFree.base <= addr && addr+size-1 <= firstFree.bound { + foundFree := func(addr offAddr, size uintptr) { + if firstFree.base.lessEqual(addr) && addr.add(size-1).lessEqual(firstFree.bound) { // This range fits within the current firstFree window, so narrow // down the firstFree window to the base and bound of this range. firstFree.base = addr - firstFree.bound = addr + size - 1 - } else if !(addr+size-1 < firstFree.base || addr > firstFree.bound) { + firstFree.bound = addr.add(size - 1) + } else if !(addr.add(size-1).lessThan(firstFree.base) || firstFree.bound.lessThan(addr)) { // This range only partially overlaps with the firstFree range, // so throw. 
- print("runtime: addr = ", hex(addr), ", size = ", size, "\n") - print("runtime: base = ", hex(firstFree.base), ", bound = ", hex(firstFree.bound), "\n") + print("runtime: addr = ", hex(addr.addr()), ", size = ", size, "\n") + print("runtime: base = ", hex(firstFree.base.addr()), ", bound = ", hex(firstFree.bound.addr()), "\n") throw("range partially overlaps") } } @@ -629,7 +630,7 @@ nextLevel: // searchAddr on the previous level or we're on the root leve, in which // case the searchAddr should be the same as i after levelShift. j0 := 0 - if searchIdx := int((s.searchAddr + arenaBaseOffset) >> levelShift[l]); searchIdx&^(entriesPerBlock-1) == i { + if searchIdx := offAddrToLevelIndex(l, s.searchAddr); searchIdx&^(entriesPerBlock-1) == i { j0 = searchIdx & (entriesPerBlock - 1) } @@ -655,7 +656,7 @@ nextLevel: // We've encountered a non-zero summary which means // free memory, so update firstFree. - foundFree(uintptr((i+j)<<levelShift[l]), (uintptr(1)<<logMaxPages)*pageSize) + foundFree(levelIndexToOffAddr(l, i+j), (uintptr(1)<<logMaxPages)*pageSize) s := sum.start() if size+s >= uint(npages) { @@ -693,8 +694,8 @@ nextLevel: if size >= uint(npages) { // We found a sufficiently large run of free pages straddling // some boundary, so compute the address and return it. - addr := uintptr(i<<levelShift[l]) - arenaBaseOffset + uintptr(base)*pageSize - return addr, firstFree.base - arenaBaseOffset + addr := levelIndexToOffAddr(l, i).add(uintptr(base) * pageSize).addr() + return addr, firstFree.base } if l == 0 { // We're at level zero, so that means we've exhausted our search. @@ -706,7 +707,7 @@ nextLevel: // lied to us. In either case, dump some useful state and throw. print("runtime: summary[", l-1, "][", lastSumIdx, "] = ", lastSum.start(), ", ", lastSum.max(), ", ", lastSum.end(), "\n") print("runtime: level = ", l, ", npages = ", npages, ", j0 = ", j0, "\n") - print("runtime: s.searchAddr = ", hex(s.searchAddr), ", i = ", i, "\n") + print("runtime: s.searchAddr = ", hex(s.searchAddr.addr()), ", i = ", i, "\n") print("runtime: levelShift[level] = ", levelShift[l], ", levelBits[level] = ", levelBits[l], "\n") for j := 0; j < len(entries); j++ { sum := entries[j] @@ -724,7 +725,7 @@ nextLevel: // is what the final level represents. ci := chunkIdx(i) j, searchIdx := s.chunkOf(ci).find(npages, 0) - if j < 0 { + if j == ^uint(0) { // We couldn't find any space in this chunk despite the summaries telling // us it should be there. There's likely a bug, so dump some state and throw. sum := s.summary[len(s.summary)-1][i] @@ -739,8 +740,8 @@ nextLevel: // Since we actually searched the chunk, we may have // found an even narrower free window. searchAddr := chunkBase(ci) + uintptr(searchIdx)*pageSize - foundFree(searchAddr+arenaBaseOffset, chunkBase(ci+1)-searchAddr) - return addr, firstFree.base - arenaBaseOffset + foundFree(offAddr{searchAddr}, chunkBase(ci+1)-searchAddr) + return addr, firstFree.base } // alloc allocates npages worth of memory from the page heap, returning the base @@ -754,25 +755,25 @@ nextLevel: func (s *pageAlloc) alloc(npages uintptr) (addr uintptr, scav uintptr) { // If the searchAddr refers to a region which has a higher address than // any known chunk, then we know we're out of memory. - if chunkIndex(s.searchAddr) >= s.end { + if chunkIndex(s.searchAddr.addr()) >= s.end { return 0, 0 } // If npages has a chance of fitting in the chunk where the searchAddr is, // search it directly. 
- searchAddr := uintptr(0) - if pallocChunkPages-chunkPageIndex(s.searchAddr) >= uint(npages) { + searchAddr := minOffAddr + if pallocChunkPages-chunkPageIndex(s.searchAddr.addr()) >= uint(npages) { // npages is guaranteed to be no greater than pallocChunkPages here. - i := chunkIndex(s.searchAddr) + i := chunkIndex(s.searchAddr.addr()) if max := s.summary[len(s.summary)-1][i].max(); max >= uint(npages) { - j, searchIdx := s.chunkOf(i).find(npages, chunkPageIndex(s.searchAddr)) - if j < 0 { + j, searchIdx := s.chunkOf(i).find(npages, chunkPageIndex(s.searchAddr.addr())) + if j == ^uint(0) { print("runtime: max = ", max, ", npages = ", npages, "\n") - print("runtime: searchIdx = ", chunkPageIndex(s.searchAddr), ", s.searchAddr = ", hex(s.searchAddr), "\n") + print("runtime: searchIdx = ", chunkPageIndex(s.searchAddr.addr()), ", s.searchAddr = ", hex(s.searchAddr.addr()), "\n") throw("bad summary data") } addr = chunkBase(i) + uintptr(j)*pageSize - searchAddr = chunkBase(i) + uintptr(searchIdx)*pageSize + searchAddr = offAddr{chunkBase(i) + uintptr(searchIdx)*pageSize} goto Found } } @@ -794,10 +795,10 @@ Found: // Go ahead and actually mark the bits now that we have an address. scav = s.allocRange(addr, npages) - // If we found a higher (linearized) searchAddr, we know that all the - // heap memory before that searchAddr in a linear address space is + // If we found a higher searchAddr, we know that all the + // heap memory before that searchAddr in an offset address space is // allocated, so bump s.searchAddr up to the new one. - if s.compareSearchAddrTo(searchAddr) > 0 { + if s.searchAddr.lessThan(searchAddr) { s.searchAddr = searchAddr } return addr, scav @@ -807,9 +808,14 @@ Found: // // s.mheapLock must be held. func (s *pageAlloc) free(base, npages uintptr) { - // If we're freeing pages below the (linearized) s.searchAddr, update searchAddr. - if s.compareSearchAddrTo(base) < 0 { - s.searchAddr = base + // If we're freeing pages below the s.searchAddr, update searchAddr. + if b := (offAddr{base}); b.lessThan(s.searchAddr) { + s.searchAddr = b + } + // Update the free high watermark for the scavenger. + limit := base + npages*pageSize - 1 + if offLimit := (offAddr{limit}); s.scav.freeHWM.lessThan(offLimit) { + s.scav.freeHWM = offLimit } if npages == 1 { // Fast path: we're clearing a single bit, and we know exactly @@ -818,7 +824,6 @@ func (s *pageAlloc) free(base, npages uintptr) { s.chunkOf(i).free1(chunkPageIndex(base)) } else { // Slow path: we're clearing more bits so we may need to iterate. - limit := base + npages*pageSize - 1 sc, ec := chunkIndex(base), chunkIndex(limit) si, ei := chunkPageIndex(base), chunkPageIndex(limit) diff --git a/libgo/go/runtime/mpagealloc_64bit.go b/libgo/go/runtime/mpagealloc_64bit.go index 385b7b3e7a7..ac599463ce7 100644 --- a/libgo/go/runtime/mpagealloc_64bit.go +++ b/libgo/go/runtime/mpagealloc_64bit.go @@ -106,7 +106,7 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) { // of summary indices which must be mapped to support those addresses // in the summary range. 
addrRangeToSummaryRange := func(level int, r addrRange) (int, int) { - sumIdxBase, sumIdxLimit := addrsToSummaryRange(level, r.base, r.limit) + sumIdxBase, sumIdxLimit := addrsToSummaryRange(level, r.base.addr(), r.limit.addr()) return blockAlignSummaryRange(level, sumIdxBase, sumIdxLimit) } @@ -118,8 +118,8 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) { limitOffset := alignUp(uintptr(sumIdxLimit)*pallocSumBytes, physPageSize) base := unsafe.Pointer(&s.summary[level][0]) return addrRange{ - uintptr(add(base, baseOffset)), - uintptr(add(base, limitOffset)), + offAddr{uintptr(add(base, baseOffset))}, + offAddr{uintptr(add(base, limitOffset))}, } } @@ -145,7 +145,7 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) { // Walk up the radix tree and map summaries in as needed. for l := range s.summary { // Figure out what part of the summary array this new address space needs. - needIdxBase, needIdxLimit := addrRangeToSummaryRange(l, addrRange{base, limit}) + needIdxBase, needIdxLimit := addrRangeToSummaryRange(l, makeAddrRange(base, limit)) // Update the summary slices with a new upper-bound. This ensures // we get tight bounds checks on at least the top bound. @@ -174,7 +174,7 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) { } // Map and commit need. - sysMap(unsafe.Pointer(need.base), need.size(), s.sysStat) - sysUsed(unsafe.Pointer(need.base), need.size()) + sysMap(unsafe.Pointer(need.base.addr()), need.size(), s.sysStat) + sysUsed(unsafe.Pointer(need.base.addr()), need.size()) } } diff --git a/libgo/go/runtime/mpagecache.go b/libgo/go/runtime/mpagecache.go index a0749618405..683a9971368 100644 --- a/libgo/go/runtime/mpagecache.go +++ b/libgo/go/runtime/mpagecache.go @@ -91,8 +91,8 @@ func (c *pageCache) flush(s *pageAlloc) { } // Since this is a lot like a free, we need to make sure // we update the searchAddr just like free does. - if s.compareSearchAddrTo(c.base) < 0 { - s.searchAddr = c.base + if b := (offAddr{c.base}); b.lessThan(s.searchAddr) { + s.searchAddr = b } s.update(c.base, pageCachePages, false, false) *c = pageCache{} @@ -106,16 +106,16 @@ func (c *pageCache) flush(s *pageAlloc) { func (s *pageAlloc) allocToCache() pageCache { // If the searchAddr refers to a region which has a higher address than // any known chunk, then we know we're out of memory. - if chunkIndex(s.searchAddr) >= s.end { + if chunkIndex(s.searchAddr.addr()) >= s.end { return pageCache{} } c := pageCache{} - ci := chunkIndex(s.searchAddr) // chunk index + ci := chunkIndex(s.searchAddr.addr()) // chunk index if s.summary[len(s.summary)-1][ci] != 0 { // Fast path: there's free pages at or near the searchAddr address. chunk := s.chunkOf(ci) - j, _ := chunk.find(1, chunkPageIndex(s.searchAddr)) - if j < 0 { + j, _ := chunk.find(1, chunkPageIndex(s.searchAddr.addr())) + if j == ^uint(0) { throw("bad summary data") } c = pageCache{ @@ -156,6 +156,6 @@ func (s *pageAlloc) allocToCache() pageCache { // However, s.searchAddr is not allowed to point into unmapped heap memory // unless it is maxSearchAddr, so make it the last page as opposed to // the page after. 
- s.searchAddr = c.base + pageSize*(pageCachePages-1) + s.searchAddr = offAddr{c.base + pageSize*(pageCachePages-1)} return c } diff --git a/libgo/go/runtime/mprof.go b/libgo/go/runtime/mprof.go index dd257d1b06f..a4b135d90d0 100644 --- a/libgo/go/runtime/mprof.go +++ b/libgo/go/runtime/mprof.go @@ -942,13 +942,16 @@ func ThreadCreateProfile(p []StackRecord) (n int, ok bool) { return } -// GoroutineProfile returns n, the number of records in the active goroutine stack profile. -// If len(p) >= n, GoroutineProfile copies the profile into p and returns n, true. -// If len(p) < n, GoroutineProfile does not change p and returns n, false. -// -// Most clients should use the runtime/pprof package instead -// of calling GoroutineProfile directly. -func GoroutineProfile(p []StackRecord) (n int, ok bool) { +//go:linkname runtime_goroutineProfileWithLabels runtime..z2fpprof.runtime_goroutineProfileWithLabels +func runtime_goroutineProfileWithLabels(p []StackRecord, labels []unsafe.Pointer) (n int, ok bool) { + return goroutineProfileWithLabels(p, labels) +} + +// labels may be nil. If labels is non-nil, it must have the same length as p. +func goroutineProfileWithLabels(p []StackRecord, labels []unsafe.Pointer) (n int, ok bool) { + if labels != nil && len(labels) != len(p) { + labels = nil + } gp := getg() isOK := func(gp1 *g) bool { @@ -968,12 +971,18 @@ func GoroutineProfile(p []StackRecord) (n int, ok bool) { if n <= len(p) { ok = true - r := p + r, lbl := p, labels // Save current goroutine. saveg(gp, &r[0]) r = r[1:] + // If we have a place to put our goroutine labelmap, insert it there. + if labels != nil { + lbl[0] = gp.labels + lbl = lbl[1:] + } + // Save other goroutines. for _, gp1 := range allgs { if isOK(gp1) { @@ -983,16 +992,30 @@ func GoroutineProfile(p []StackRecord) (n int, ok bool) { break } saveg(gp1, &r[0]) + if labels != nil { + lbl[0] = gp1.labels + lbl = lbl[1:] + } r = r[1:] } } } startTheWorld() - return n, ok } +// GoroutineProfile returns n, the number of records in the active goroutine stack profile. +// If len(p) >= n, GoroutineProfile copies the profile into p and returns n, true. +// If len(p) < n, GoroutineProfile does not change p and returns n, false. +// +// Most clients should use the runtime/pprof package instead +// of calling GoroutineProfile directly. +func GoroutineProfile(p []StackRecord) (n int, ok bool) { + + return goroutineProfileWithLabels(p, nil) +} + func saveg(gp *g, r *StackRecord) { if gp == getg() { var locbuf [32]location diff --git a/libgo/go/runtime/mranges.go b/libgo/go/runtime/mranges.go index b13385165b3..e23d0778eb9 100644 --- a/libgo/go/runtime/mranges.go +++ b/libgo/go/runtime/mranges.go @@ -15,23 +15,41 @@ import ( ) // addrRange represents a region of address space. +// +// An addrRange must never span a gap in the address space. type addrRange struct { // base and limit together represent the region of address space // [base, limit). That is, base is inclusive, limit is exclusive. - base, limit uintptr + // These are address over an offset view of the address space on + // platforms with a segmented address space, that is, on platforms + // where arenaBaseOffset != 0. + base, limit offAddr +} + +// makeAddrRange creates a new address range from two virtual addresses. +// +// Throws if the base and limit are not in the same memory segment. 
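// Editor's note (illustrative sketch, not part of this patch): the check in
// makeAddrRange below relies on unsigned wraparound. For a nonzero offset,
// addr-arenaBaseOffset wraps (and so compares >= addr) exactly when addr is
// below the offset; two addresses are in the same segment exactly when both
// subtractions behave the same way. The predicate in isolation, with the
// offset passed in as a hypothetical parameter:
func sameSegment(base, limit, offset uintptr) bool {
	return (base-offset >= base) == (limit-offset >= limit)
}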
+func makeAddrRange(base, limit uintptr) addrRange { + r := addrRange{offAddr{base}, offAddr{limit}} + if (base-arenaBaseOffset >= base) != (limit-arenaBaseOffset >= limit) { + throw("addr range base and limit are not in the same memory segment") + } + return r } // size returns the size of the range represented in bytes. func (a addrRange) size() uintptr { - if a.limit <= a.base { + if !a.base.lessThan(a.limit) { return 0 } - return a.limit - a.base + // Subtraction is safe because limit and base must be in the same + // segment of the address space. + return a.limit.diff(a.base) } // contains returns whether or not the range contains a given address. func (a addrRange) contains(addr uintptr) bool { - return addr >= a.base && addr < a.limit + return a.base.lessEqual(offAddr{addr}) && (offAddr{addr}).lessThan(a.limit) } // subtract takes the addrRange toPrune and cuts out any overlap with @@ -39,18 +57,90 @@ func (a addrRange) contains(addr uintptr) bool { // either don't overlap at all, only overlap on one side, or are equal. // If b is strictly contained in a, thus forcing a split, it will throw. func (a addrRange) subtract(b addrRange) addrRange { - if a.base >= b.base && a.limit <= b.limit { + if b.base.lessEqual(a.base) && a.limit.lessEqual(b.limit) { return addrRange{} - } else if a.base < b.base && a.limit > b.limit { + } else if a.base.lessThan(b.base) && b.limit.lessThan(a.limit) { throw("bad prune") - } else if a.limit > b.limit && a.base < b.limit { + } else if b.limit.lessThan(a.limit) && a.base.lessThan(b.limit) { a.base = b.limit - } else if a.base < b.base && a.limit > b.base { + } else if a.base.lessThan(b.base) && b.base.lessThan(a.limit) { a.limit = b.base } return a } +// removeGreaterEqual removes all addresses in a greater than or equal +// to addr and returns the new range. +func (a addrRange) removeGreaterEqual(addr uintptr) addrRange { + if (offAddr{addr}).lessEqual(a.base) { + return addrRange{} + } + if a.limit.lessEqual(offAddr{addr}) { + return a + } + return makeAddrRange(a.base.addr(), addr) +} + +var ( + // minOffAddr is the minimum address in the offset space, and + // it corresponds to the virtual address arenaBaseOffset. + minOffAddr = offAddr{arenaBaseOffset} + + // maxOffAddr is the maximum address in the offset address + // space. It corresponds to the highest virtual address representable + // by the page alloc chunk and heap arena maps. + maxOffAddr = offAddr{(((1 << heapAddrBits) - 1) + arenaBaseOffset) & uintptrMask} +) + +// offAddr represents an address in a contiguous view +// of the address space on systems where the address space is +// segmented. On other systems, it's just a normal address. +type offAddr struct { + // a is just the virtual address, but should never be used + // directly. Call addr() to get this value instead. + a uintptr +} + +// add adds a uintptr offset to the offAddr. +func (l offAddr) add(bytes uintptr) offAddr { + return offAddr{a: l.a + bytes} +} + +// sub subtracts a uintptr offset from the offAddr. +func (l offAddr) sub(bytes uintptr) offAddr { + return offAddr{a: l.a - bytes} +} + +// diff returns the amount of bytes in between the +// two offAddrs. +func (l1 offAddr) diff(l2 offAddr) uintptr { + return l1.a - l2.a +} + +// lessThan returns true if l1 is less than l2 in the offset +// address space. +func (l1 offAddr) lessThan(l2 offAddr) bool { + return (l1.a - arenaBaseOffset) < (l2.a - arenaBaseOffset) +} + +// lessEqual returns true if l1 is less than or equal to l2 in +// the offset address space. 
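// Editor's note (illustrative sketch, not part of this patch): lessThan
// above, and lessEqual just below, compare addresses only after subtracting
// arenaBaseOffset, which makes the two halves of a segmented address space
// sort contiguously. The shape of the comparison with a hypothetical offset
// value:
const sketchOffset uintptr = 1 << 47

func offsetLess(a, b uintptr) bool {
	// Addresses below sketchOffset wrap on subtraction and therefore
	// order after addresses at or above it.
	return a-sketchOffset < b-sketchOffset
}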
+func (l1 offAddr) lessEqual(l2 offAddr) bool { + return (l1.a - arenaBaseOffset) <= (l2.a - arenaBaseOffset) +} + +// equal returns true if the two offAddr values are equal. +func (l1 offAddr) equal(l2 offAddr) bool { + // No need to compare in the offset space, it + // means the same thing. + return l1 == l2 +} + +// addr returns the virtual address for this offset address. +func (l offAddr) addr() uintptr { + return l.a +} + // addrRanges is a data structure holding a collection of ranges of // address space. // @@ -65,6 +155,10 @@ type addrRanges struct { // ranges is a slice of ranges sorted by base. ranges []addrRange + // totalBytes is the total amount of address space in bytes counted by + // this addrRanges. + totalBytes uintptr + // sysStat is the stat to track allocations by this type sysStat *uint64 } @@ -75,17 +169,19 @@ func (a *addrRanges) init(sysStat *uint64) { ranges.cap = 16 ranges.array = (*notInHeap)(persistentalloc(unsafe.Sizeof(addrRange{})*uintptr(ranges.cap), sys.PtrSize, sysStat)) a.sysStat = sysStat + a.totalBytes = 0 } // findSucc returns the first index in a such that base is // less than the base of the addrRange at that index. -func (a *addrRanges) findSucc(base uintptr) int { +func (a *addrRanges) findSucc(addr uintptr) int { // TODO(mknyszek): Consider a binary search for large arrays. // While iterating over these ranges is potentially expensive, // the expected number of ranges is small, ideally just 1, // since Go heaps are usually mostly contiguous. + base := offAddr{addr} for i := range a.ranges { - if base < a.ranges[i].base { + if base.lessThan(a.ranges[i].base) { return i } } @@ -116,9 +212,9 @@ func (a *addrRanges) add(r addrRange) { // Because we assume r is not currently represented in a, // findSucc gives us our insertion index. - i := a.findSucc(r.base) - coalescesDown := i > 0 && a.ranges[i-1].limit == r.base - coalescesUp := i < len(a.ranges) && r.limit == a.ranges[i].base + i := a.findSucc(r.base.addr()) + coalescesDown := i > 0 && a.ranges[i-1].limit.equal(r.base) + coalescesUp := i < len(a.ranges) && r.limit.equal(a.ranges[i].base) if coalescesUp && coalescesDown { // We have neighbors and they both border us. // Merge a.ranges[i-1], r, and a.ranges[i] together into a.ranges[i-1]. @@ -158,4 +254,68 @@ func (a *addrRanges) add(r addrRange) { } a.ranges[i] = r } + a.totalBytes += r.size() +} + +// removeLast removes and returns the highest-addressed contiguous range +// of a, or the last nBytes of that range, whichever is smaller. If a is +// empty, it returns an empty range. +func (a *addrRanges) removeLast(nBytes uintptr) addrRange { + if len(a.ranges) == 0 { + return addrRange{} + } + r := a.ranges[len(a.ranges)-1] + size := r.size() + if size > nBytes { + newEnd := r.limit.sub(nBytes) + a.ranges[len(a.ranges)-1].limit = newEnd + a.totalBytes -= nBytes + return addrRange{newEnd, r.limit} + } + a.ranges = a.ranges[:len(a.ranges)-1] + a.totalBytes -= size + return r +} + +// removeGreaterEqual removes the ranges of a which are above addr, and additionally +// splits any range containing addr. +func (a *addrRanges) removeGreaterEqual(addr uintptr) { + pivot := a.findSucc(addr) + if pivot == 0 { + // addr is before all ranges in a. 
+ a.totalBytes = 0 + a.ranges = a.ranges[:0] + return + } + removed := uintptr(0) + for _, r := range a.ranges[pivot:] { + removed += r.size() + } + if r := a.ranges[pivot-1]; r.contains(addr) { + removed += r.size() + r = r.removeGreaterEqual(addr) + if r.size() == 0 { + pivot-- + } else { + removed -= r.size() + a.ranges[pivot-1] = r + } + } + a.ranges = a.ranges[:pivot] + a.totalBytes -= removed +} + +// cloneInto makes a deep clone of a's state into b, re-using +// b's ranges if able. +func (a *addrRanges) cloneInto(b *addrRanges) { + if len(a.ranges) > cap(b.ranges) { + // Grow the array. + ranges := (*notInHeapSlice)(unsafe.Pointer(&b.ranges)) + ranges.len = 0 + ranges.cap = cap(a.ranges) + ranges.array = (*notInHeap)(persistentalloc(unsafe.Sizeof(addrRange{})*uintptr(ranges.cap), sys.PtrSize, b.sysStat)) + } + b.ranges = b.ranges[:len(a.ranges)] + b.totalBytes = a.totalBytes + copy(b.ranges, a.ranges) } diff --git a/libgo/go/runtime/msize.go b/libgo/go/runtime/msize.go index 11d06ce0251..c56aa5a7b2c 100644 --- a/libgo/go/runtime/msize.go +++ b/libgo/go/runtime/msize.go @@ -13,9 +13,9 @@ package runtime func roundupsize(size uintptr) uintptr { if size < _MaxSmallSize { if size <= smallSizeMax-8 { - return uintptr(class_to_size[size_to_class8[(size+smallSizeDiv-1)/smallSizeDiv]]) + return uintptr(class_to_size[size_to_class8[divRoundUp(size, smallSizeDiv)]]) } else { - return uintptr(class_to_size[size_to_class128[(size-smallSizeMax+largeSizeDiv-1)/largeSizeDiv]]) + return uintptr(class_to_size[size_to_class128[divRoundUp(size-smallSizeMax, largeSizeDiv)]]) } } if size+_PageSize < size { diff --git a/libgo/go/runtime/mspanset.go b/libgo/go/runtime/mspanset.go new file mode 100644 index 00000000000..c872c11d1fb --- /dev/null +++ b/libgo/go/runtime/mspanset.go @@ -0,0 +1,358 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime + +import ( + "internal/cpu" + "runtime/internal/atomic" + "runtime/internal/sys" + "unsafe" +) + +// A spanSet is a set of *mspans. +// +// spanSet is safe for concurrent push and pop operations. +type spanSet struct { + // A spanSet is a two-level data structure consisting of a + // growable spine that points to fixed-sized blocks. The spine + // can be accessed without locks, but adding a block or + // growing it requires taking the spine lock. + // + // Because each mspan covers at least 8K of heap and takes at + // most 8 bytes in the spanSet, the growth of the spine is + // quite limited. + // + // The spine and all blocks are allocated off-heap, which + // allows this to be used in the memory manager and avoids the + // need for write barriers on all of these. spanSetBlocks are + // managed in a pool, though never freed back to the operating + // system. We never release spine memory because there could be + // concurrent lock-free access and we're likely to reuse it + // anyway. (In principle, we could do this during STW.) + + spineLock mutex + spine unsafe.Pointer // *[N]*spanSetBlock, accessed atomically + spineLen uintptr // Spine array length, accessed atomically + spineCap uintptr // Spine array cap, accessed under lock + + // index is the head and tail of the spanSet in a single field. + // The head and the tail both represent an index into the logical + // concatenation of all blocks, with the head always behind or + // equal to the tail (indicating an empty set). This field is + // always accessed atomically. 
+ // + // The head and the tail are only 32 bits wide, which means we + // can only support up to 2^32 pushes before a reset. If every + // span in the heap were stored in this set, and each span were + // the minimum size (1 runtime page, 8 KiB), then roughly the + // smallest heap which would be unrepresentable is 32 TiB in size. + index headTailIndex +} + +const ( + spanSetBlockEntries = 512 // 4KB on 64-bit + spanSetInitSpineCap = 256 // Enough for 1GB heap on 64-bit +) + +type spanSetBlock struct { + // Free spanSetBlocks are managed via a lock-free stack. + lfnode + + // popped is the number of pop operations that have occurred on + // this block. This number is used to help determine when a block + // may be safely recycled. + popped uint32 + + // spans is the set of spans in this block. + spans [spanSetBlockEntries]*mspan +} + +// push adds span s to buffer b. push is safe to call concurrently +// with other push and pop operations. +func (b *spanSet) push(s *mspan) { + // Obtain our slot. + cursor := uintptr(b.index.incTail().tail() - 1) + top, bottom := cursor/spanSetBlockEntries, cursor%spanSetBlockEntries + + // Do we need to add a block? + spineLen := atomic.Loaduintptr(&b.spineLen) + var block *spanSetBlock +retry: + if top < spineLen { + spine := atomic.Loadp(unsafe.Pointer(&b.spine)) + blockp := add(spine, sys.PtrSize*top) + block = (*spanSetBlock)(atomic.Loadp(blockp)) + } else { + // Add a new block to the spine, potentially growing + // the spine. + lock(&b.spineLock) + // spineLen cannot change until we release the lock, + // but may have changed while we were waiting. + spineLen = atomic.Loaduintptr(&b.spineLen) + if top < spineLen { + unlock(&b.spineLock) + goto retry + } + + if spineLen == b.spineCap { + // Grow the spine. + newCap := b.spineCap * 2 + if newCap == 0 { + newCap = spanSetInitSpineCap + } + newSpine := persistentalloc(newCap*sys.PtrSize, cpu.CacheLineSize, &memstats.gc_sys) + if b.spineCap != 0 { + // Blocks are allocated off-heap, so + // no write barriers. + memmove(newSpine, b.spine, b.spineCap*sys.PtrSize) + } + // Spine is allocated off-heap, so no write barrier. + atomic.StorepNoWB(unsafe.Pointer(&b.spine), newSpine) + b.spineCap = newCap + // We can't immediately free the old spine + // since a concurrent push with a lower index + // could still be reading from it. We let it + // leak because even a 1TB heap would waste + // less than 2MB of memory on old spines. If + // this is a problem, we could free old spines + // during STW. + } + + // Allocate a new block from the pool. + block = spanSetBlockPool.alloc() + + // Add it to the spine. + blockp := add(b.spine, sys.PtrSize*top) + // Blocks are allocated off-heap, so no write barrier. + atomic.StorepNoWB(blockp, unsafe.Pointer(block)) + atomic.Storeuintptr(&b.spineLen, spineLen+1) + unlock(&b.spineLock) + } + + // We have a block. Insert the span atomically, since there may be + // concurrent readers via the block API. + atomic.StorepNoWB(unsafe.Pointer(&block.spans[bottom]), unsafe.Pointer(s)) +} + +// pop removes and returns a span from buffer b, or nil if b is empty. +// pop is safe to call concurrently with other pop and push operations. +func (b *spanSet) pop() *mspan { + var head, tail uint32 +claimLoop: + for { + headtail := b.index.load() + head, tail = headtail.split() + if head >= tail { + // The buf is empty, as far as we can tell. + return nil + } + // Check if the head position we want to claim is actually + // backed by a block. 
+ spineLen := atomic.Loaduintptr(&b.spineLen) + if spineLen <= uintptr(head)/spanSetBlockEntries { + // We're racing with a spine growth and the allocation of + // a new block (and maybe a new spine!), and trying to grab + // the span at the index which is currently being pushed. + // Instead of spinning, let's just notify the caller that + // there's nothing currently here. Spinning on this is + // almost definitely not worth it. + return nil + } + // Try to claim the current head by CASing in an updated head. + // This may fail transiently due to a push which modifies the + // tail, so keep trying while the head isn't changing. + want := head + for want == head { + if b.index.cas(headtail, makeHeadTailIndex(want+1, tail)) { + break claimLoop + } + headtail = b.index.load() + head, tail = headtail.split() + } + // We failed to claim the spot we were after and the head changed, + // meaning a popper got ahead of us. Try again from the top because + // the buf may not be empty. + } + top, bottom := head/spanSetBlockEntries, head%spanSetBlockEntries + + // We may be reading a stale spine pointer, but because the length + // grows monotonically and we've already verified it, we'll definitely + // be reading from a valid block. + spine := atomic.Loadp(unsafe.Pointer(&b.spine)) + blockp := add(spine, sys.PtrSize*uintptr(top)) + + // Given that the spine length is correct, we know we will never + // see a nil block here, since the length is always updated after + // the block is set. + block := (*spanSetBlock)(atomic.Loadp(blockp)) + s := (*mspan)(atomic.Loadp(unsafe.Pointer(&block.spans[bottom]))) + for s == nil { + // We raced with the span actually being set, but given that we + // know a block for this span exists, the race window here is + // extremely small. Try again. + s = (*mspan)(atomic.Loadp(unsafe.Pointer(&block.spans[bottom]))) + } + // Clear the pointer. This isn't strictly necessary, but defensively + // avoids accidentally re-using blocks which could lead to memory + // corruption. This way, we'll get a nil pointer access instead. + atomic.StorepNoWB(unsafe.Pointer(&block.spans[bottom]), nil) + + // Increase the popped count. If we are the last possible popper + // in the block (note that bottom need not equal spanSetBlockEntries-1 + // due to races) then it's our resposibility to free the block. + // + // If we increment popped to spanSetBlockEntries, we can be sure that + // we're the last popper for this block, and it's thus safe to free it. + // Every other popper must have crossed this barrier (and thus finished + // popping its corresponding mspan) by the time we get here. Because + // we're the last popper, we also don't have to worry about concurrent + // pushers (there can't be any). Note that we may not be the popper + // which claimed the last slot in the block, we're just the last one + // to finish popping. + if atomic.Xadd(&block.popped, 1) == spanSetBlockEntries { + // Clear the block's pointer. + atomic.StorepNoWB(blockp, nil) + + // Return the block to the block pool. + spanSetBlockPool.free(block) + } + return s +} + +// reset resets a spanSet which is empty. It will also clean up +// any left over blocks. +// +// Throws if the buf is not empty. +// +// reset may not be called concurrently with any other operations +// on the span set. 
+func (b *spanSet) reset() { + head, tail := b.index.load().split() + if head < tail { + print("head = ", head, ", tail = ", tail, "\n") + throw("attempt to clear non-empty span set") + } + top := head / spanSetBlockEntries + if uintptr(top) < b.spineLen { + // If the head catches up to the tail and the set is empty, + // we may not clean up the block containing the head and tail + // since it may be pushed into again. In order to avoid leaking + // memory since we're going to reset the head and tail, clean + // up such a block now, if it exists. + blockp := (**spanSetBlock)(add(b.spine, sys.PtrSize*uintptr(top))) + block := *blockp + if block != nil { + // Sanity check the popped value. + if block.popped == 0 { + // popped should never be zero because that means we have + // pushed at least one value but not yet popped if this + // block pointer is not nil. + throw("span set block with unpopped elements found in reset") + } + if block.popped == spanSetBlockEntries { + // popped should also never be equal to spanSetBlockEntries + // because the last popper should have made the block pointer + // in this slot nil. + throw("fully empty unfreed span set block found in reset") + } + + // Clear the pointer to the block. + atomic.StorepNoWB(unsafe.Pointer(blockp), nil) + + // Return the block to the block pool. + spanSetBlockPool.free(block) + } + } + b.index.reset() + atomic.Storeuintptr(&b.spineLen, 0) +} + +// gccgoAlignment is used to get spanSetBlockPool aligned on a 64-bit +// boundary on 32-bit x86. +var gccgoAlignment uint64 + +// spanSetBlockPool is a global pool of spanSetBlocks. +var spanSetBlockPool = (*spanSetBlockAlloc)(unsafe.Pointer(&gccgoAlignment)) + +// spanSetBlockAlloc represents a concurrent pool of spanSetBlocks. +type spanSetBlockAlloc struct { + stack lfstack +} + +// alloc tries to grab a spanSetBlock out of the pool, and if it fails +// persistentallocs a new one and returns it. +func (p *spanSetBlockAlloc) alloc() *spanSetBlock { + if s := (*spanSetBlock)(p.stack.pop()); s != nil { + return s + } + return (*spanSetBlock)(persistentalloc(unsafe.Sizeof(spanSetBlock{}), cpu.CacheLineSize, &memstats.gc_sys)) +} + +// free returns a spanSetBlock back to the pool. +func (p *spanSetBlockAlloc) free(block *spanSetBlock) { + atomic.Store(&block.popped, 0) + p.stack.push(&block.lfnode) +} + +// haidTailIndex represents a combined 32-bit head and 32-bit tail +// of a queue into a single 64-bit value. +type headTailIndex uint64 + +// makeHeadTailIndex creates a headTailIndex value from a separate +// head and tail. +func makeHeadTailIndex(head, tail uint32) headTailIndex { + return headTailIndex(uint64(head)<<32 | uint64(tail)) +} + +// head returns the head of a headTailIndex value. +func (h headTailIndex) head() uint32 { + return uint32(h >> 32) +} + +// tail returns the tail of a headTailIndex value. +func (h headTailIndex) tail() uint32 { + return uint32(h) +} + +// split splits the headTailIndex value into its parts. +func (h headTailIndex) split() (head uint32, tail uint32) { + return h.head(), h.tail() +} + +// load atomically reads a headTailIndex value. +func (h *headTailIndex) load() headTailIndex { + return headTailIndex(atomic.Load64((*uint64)(h))) +} + +// cas atomically compares-and-swaps a headTailIndex value. +func (h *headTailIndex) cas(old, new headTailIndex) bool { + return atomic.Cas64((*uint64)(h), uint64(old), uint64(new)) +} + +// incHead atomically increments the head of a headTailIndex. 
+func (h *headTailIndex) incHead() headTailIndex { + return headTailIndex(atomic.Xadd64((*uint64)(h), (1 << 32))) +} + +// decHead atomically decrements the head of a headTailIndex. +func (h *headTailIndex) decHead() headTailIndex { + return headTailIndex(atomic.Xadd64((*uint64)(h), -(1 << 32))) +} + +// incTail atomically increments the tail of a headTailIndex. +func (h *headTailIndex) incTail() headTailIndex { + ht := headTailIndex(atomic.Xadd64((*uint64)(h), +1)) + // Check for overflow. + if ht.tail() == 0 { + print("runtime: head = ", ht.head(), ", tail = ", ht.tail(), "\n") + throw("headTailIndex overflow") + } + return ht +} + +// reset clears the headTailIndex to (0, 0). +func (h *headTailIndex) reset() { + atomic.Store64((*uint64)(h), 0) +} diff --git a/libgo/go/runtime/mstats.go b/libgo/go/runtime/mstats.go index 2d4cdbe042c..4e2c66cea1f 100644 --- a/libgo/go/runtime/mstats.go +++ b/libgo/go/runtime/mstats.go @@ -513,6 +513,12 @@ func readGCStats_m(pauses *[]uint64) { //go:nowritebarrier func updatememstats() { + // Flush mcaches to mcentral before doing anything else. + // + // Flushing to the mcentral may in general cause stats to + // change as mcentral data structures are manipulated. + systemstack(flushallmcaches) + memstats.mcache_inuse = uint64(mheap_.cachealloc.inuse) memstats.mspan_inuse = uint64(mheap_.spanalloc.inuse) memstats.sys = memstats.heap_sys + memstats.stacks_sys + memstats.mspan_sys + @@ -523,7 +529,7 @@ func updatememstats() { // Calculate memory allocator stats. // During program execution we only count number of frees and amount of freed memory. - // Current number of alive object in the heap and amount of alive heap memory + // Current number of alive objects in the heap and amount of alive heap memory // are calculated by scanning all spans. // Total number of mallocs is calculated as number of frees plus number of alive objects. // Similarly, total amount of allocated memory is calculated as amount of freed memory @@ -537,9 +543,6 @@ func updatememstats() { memstats.by_size[i].nfree = 0 } - // Flush mcache's to mcentral. - systemstack(flushallmcaches) - // Aggregate local stats. cachestats() diff --git a/libgo/go/runtime/mwbbuf.go b/libgo/go/runtime/mwbbuf.go index a27406e2987..548d4c51074 100644 --- a/libgo/go/runtime/mwbbuf.go +++ b/libgo/go/runtime/mwbbuf.go @@ -303,6 +303,13 @@ func wbBufFlush1(_p_ *p) { continue } mbits.setMarked() + + // Mark span. + arena, pageIdx, pageMask := pageIndexOf(span.base()) + if arena.pageMarks[pageIdx]&pageMask == 0 { + atomic.Or8(&arena.pageMarks[pageIdx], pageMask) + } + if span.spanclass.noscan() { gcw.bytesMarked += uint64(span.elemsize) continue diff --git a/libgo/go/runtime/netpoll.go b/libgo/go/runtime/netpoll.go index d2fb77569d6..5157e4dd9e2 100644 --- a/libgo/go/runtime/netpoll.go +++ b/libgo/go/runtime/netpoll.go @@ -36,18 +36,27 @@ import ( // func netpollIsPollDescriptor(fd uintptr) bool // Reports whether fd is a file descriptor used by the poller. +// Error codes returned by runtime_pollReset and runtime_pollWait. +// These must match the values in internal/poll/fd_poll_runtime.go. +const ( + pollNoError = 0 // no error + pollErrClosing = 1 // descriptor is closed + pollErrTimeout = 2 // I/O timeout + pollErrNotPollable = 3 // general error polling descriptor +) + // pollDesc contains 2 binary semaphores, rg and wg, to park reader and writer // goroutines respectively. 
The semaphore can be in the following states: // pdReady - io readiness notification is pending; // a goroutine consumes the notification by changing the state to nil. // pdWait - a goroutine prepares to park on the semaphore, but not yet parked; // the goroutine commits to park by changing the state to G pointer, -// or, alternatively, concurrent io notification changes the state to READY, +// or, alternatively, concurrent io notification changes the state to pdReady, // or, alternatively, concurrent timeout/close changes the state to nil. // G pointer - the goroutine is blocked on the semaphore; -// io notification or timeout/close changes the state to READY or nil respectively +// io notification or timeout/close changes the state to pdReady or nil respectively // and unparks the goroutine. -// nil - nothing of the above. +// nil - none of the above. const ( pdReady uintptr = 1 pdWait uintptr = 2 @@ -110,6 +119,7 @@ func poll_runtime_pollServerInit() { func netpollGenericInit() { if atomic.Load(&netpollInited) == 0 { + lockInit(&netpollInitLock, lockRankNetpollInit) lock(&netpollInitLock) if netpollInited == 0 { netpollinit() @@ -180,42 +190,49 @@ func (c *pollCache) free(pd *pollDesc) { unlock(&c.lock) } +// poll_runtime_pollReset, which is internal/poll.runtime_pollReset, +// prepares a descriptor for polling in mode, which is 'r' or 'w'. +// This returns an error code; the codes are defined above. //go:linkname poll_runtime_pollReset internal..z2fpoll.runtime_pollReset func poll_runtime_pollReset(ctx uintptr, mode int) int { pd := (*pollDesc)(unsafe.Pointer(ctx)) - err := netpollcheckerr(pd, int32(mode)) - if err != 0 { - return err + errcode := netpollcheckerr(pd, int32(mode)) + if errcode != pollNoError { + return errcode } if mode == 'r' { pd.rg = 0 } else if mode == 'w' { pd.wg = 0 } - return 0 + return pollNoError } +// poll_runtime_pollWait, which is internal/poll.runtime_pollWait, +// waits for a descriptor to be ready for reading or writing, +// according to mode, which is 'r' or 'w'. +// This returns an error code; the codes are defined above. //go:linkname poll_runtime_pollWait internal..z2fpoll.runtime_pollWait func poll_runtime_pollWait(ctx uintptr, mode int) int { pd := (*pollDesc)(unsafe.Pointer(ctx)) - err := netpollcheckerr(pd, int32(mode)) - if err != 0 { - return err + errcode := netpollcheckerr(pd, int32(mode)) + if errcode != pollNoError { + return errcode } // As for now only Solaris, illumos, and AIX use level-triggered IO. if GOOS == "solaris" || GOOS == "illumos" || GOOS == "aix" || GOOS == "hurd" { netpollarm(pd, mode) } for !netpollblock(pd, int32(mode), false) { - err = netpollcheckerr(pd, int32(mode)) - if err != 0 { - return err + errcode = netpollcheckerr(pd, int32(mode)) + if errcode != pollNoError { + return errcode } // Can happen if timeout has fired and unblocked us, // but before we had a chance to run, timeout has been reset. // Pretend it has not happened and retry. } - return 0 + return pollNoError } //go:linkname poll_runtime_pollWaitCanceled internal..z2fpoll.runtime_pollWaitCanceled @@ -368,18 +385,18 @@ func netpollready(toRun *gList, pd *pollDesc, mode int32) { func netpollcheckerr(pd *pollDesc, mode int32) int { if pd.closing { - return 1 // ErrFileClosing or ErrNetClosing + return pollErrClosing } if (mode == 'r' && pd.rd < 0) || (mode == 'w' && pd.wd < 0) { - return 2 // ErrTimeout + return pollErrTimeout } // Report an event scanning error only on a read event. 
// An error on a write event will be captured in a subsequent // write call that is able to report a more specific error. if mode == 'r' && pd.everr { - return 3 // ErrNotPollable + return pollErrNotPollable } - return 0 + return pollNoError } func netpollblockcommit(gp *g, gpp unsafe.Pointer) bool { @@ -406,7 +423,7 @@ func netpollblock(pd *pollDesc, mode int32, waitio bool) bool { gpp = &pd.wg } - // set the gpp semaphore to WAIT + // set the gpp semaphore to pdWait for { old := *gpp if old == pdReady { @@ -421,13 +438,13 @@ func netpollblock(pd *pollDesc, mode int32, waitio bool) bool { } } - // need to recheck error states after setting gpp to WAIT + // need to recheck error states after setting gpp to pdWait // this is necessary because runtime_pollUnblock/runtime_pollSetDeadline/deadlineimpl // do the opposite: store to closing/rd/wd, membarrier, load of rg/wg if waitio || netpollcheckerr(pd, mode) == 0 { gopark(netpollblockcommit, unsafe.Pointer(gpp), waitReasonIOWait, traceEvGoBlockNet, 5) } - // be careful to not lose concurrent READY notification + // be careful to not lose concurrent pdReady notification old := atomic.Xchguintptr(gpp, 0) if old > pdWait { throw("runtime: corrupted polldesc") @@ -447,7 +464,7 @@ func netpollunblock(pd *pollDesc, mode int32, ioready bool) *g { return nil } if old == 0 && !ioready { - // Only set READY for ioready. runtime_pollWait + // Only set pdReady for ioready. runtime_pollWait // will check for timeout/cancel before waiting. return nil } @@ -456,7 +473,7 @@ func netpollunblock(pd *pollDesc, mode int32, ioready bool) *g { new = pdReady } if atomic.Casuintptr(gpp, old, new) { - if old == pdReady || old == pdWait { + if old == pdWait { old = 0 } return (*g)(unsafe.Pointer(old)) @@ -535,6 +552,7 @@ func (c *pollCache) alloc() *pollDesc { } pd := c.first c.first = pd.link + lockInit(&pd.lock, lockRankPollDesc) unlock(&c.lock) return pd } diff --git a/libgo/go/runtime/netpoll_aix.go b/libgo/go/runtime/netpoll_aix.go index a00742e39ae..bb6ce375475 100644 --- a/libgo/go/runtime/netpoll_aix.go +++ b/libgo/go/runtime/netpoll_aix.go @@ -4,7 +4,10 @@ package runtime -import "unsafe" +import ( + "runtime/internal/atomic" + "unsafe" +) // This is based on the former libgo/runtime/netpoll_select.c implementation // except that it uses poll instead of select and is written in Go. @@ -34,6 +37,8 @@ var ( rdwake int32 wrwake int32 pendingUpdates int32 + + netpollWakeSig uint32 // used to avoid duplicate calls of netpollBreak ) func netpollinit() { @@ -121,9 +126,12 @@ func netpollarm(pd *pollDesc, mode int) { unlock(&mtxset) } -// netpollBreak interrupts an epollwait. +// netpollBreak interrupts a poll. func netpollBreak() { - netpollwakeup() + if atomic.Cas(&netpollWakeSig, 0, 1) { + b := [1]byte{0} + write(uintptr(wrwake), unsafe.Pointer(&b[0]), 1) + } } // netpoll checks for ready network connections. @@ -178,6 +186,7 @@ retry: var b [1]byte for read(rdwake, unsafe.Pointer(&b[0]), 1) == 1 { } + atomic.Store(&netpollWakeSig, 0) } // Still look at the other fds even if the mode may have // changed, as netpollBreak might have been called. 
diff --git a/libgo/go/runtime/netpoll_epoll.go b/libgo/go/runtime/netpoll_epoll.go index 7b215f31871..9c5d33851cb 100644 --- a/libgo/go/runtime/netpoll_epoll.go +++ b/libgo/go/runtime/netpoll_epoll.go @@ -6,7 +6,10 @@ package runtime -import "unsafe" +import ( + "runtime/internal/atomic" + "unsafe" +) //extern epoll_create func epollcreate(size int32) int32 @@ -26,6 +29,8 @@ var ( epfd int32 = -1 // epoll descriptor netpollBreakRd, netpollBreakWr uintptr // for netpollBreak + + netpollWakeSig uint32 // used to avoid duplicate calls of netpollBreak ) func netpollinit() { @@ -86,20 +91,22 @@ func netpollarm(pd *pollDesc, mode int) { // netpollBreak interrupts an epollwait. func netpollBreak() { - for { - var b byte - n := write(netpollBreakWr, unsafe.Pointer(&b), 1) - if n == 1 { - break - } - if n == -_EINTR { - continue - } - if n == -_EAGAIN { - return + if atomic.Cas(&netpollWakeSig, 0, 1) { + for { + var b byte + n := write(netpollBreakWr, unsafe.Pointer(&b), 1) + if n == 1 { + break + } + if n == -_EINTR { + continue + } + if n == -_EAGAIN { + return + } + println("runtime: netpollBreak write failed with", -n) + throw("runtime: netpollBreak write failed") } - println("runtime: netpollBreak write failed with", -n) - throw("runtime: netpollBreak write failed") } } @@ -160,6 +167,7 @@ retry: // if blocking. var tmp [16]byte read(int32(netpollBreakRd), noescape(unsafe.Pointer(&tmp[0])), int32(len(tmp))) + atomic.Store(&netpollWakeSig, 0) } continue } diff --git a/libgo/go/runtime/netpoll_kqueue.go b/libgo/go/runtime/netpoll_kqueue.go index 94504613787..c41a7d03097 100644 --- a/libgo/go/runtime/netpoll_kqueue.go +++ b/libgo/go/runtime/netpoll_kqueue.go @@ -8,7 +8,10 @@ package runtime // Integrated network poller (kqueue-based implementation). -import "unsafe" +import ( + "runtime/internal/atomic" + "unsafe" +) //extern kqueue func kqueue() int32 @@ -21,6 +24,8 @@ var ( kq int32 = -1 netpollBreakRd, netpollBreakWr uintptr // for netpollBreak + + netpollWakeSig uint32 // used to avoid duplicate calls of netpollBreak ) func netpollinit() { @@ -83,19 +88,21 @@ func netpollarm(pd *pollDesc, mode int) { throw("runtime: unused") } -// netpollBreak interrupts an epollwait. +// netpollBreak interrupts a kevent. func netpollBreak() { - for { - var b byte - n := write(netpollBreakWr, unsafe.Pointer(&b), 1) - if n == 1 || n == -_EAGAIN { - break - } - if n == -_EINTR { - continue + if atomic.Cas(&netpollWakeSig, 0, 1) { + for { + var b byte + n := write(netpollBreakWr, unsafe.Pointer(&b), 1) + if n == 1 || n == -_EAGAIN { + break + } + if n == -_EINTR { + continue + } + println("runtime: netpollBreak write failed with", -n) + throw("runtime: netpollBreak write failed") } - println("runtime: netpollBreak write failed with", -n) - throw("runtime: netpollBreak write failed") } } @@ -153,6 +160,7 @@ retry: // if blocking. 
var tmp [16]byte read(int32(netpollBreakRd), noescape(unsafe.Pointer(&tmp[0])), int32(len(tmp))) + atomic.Store(&netpollWakeSig, 0) } continue } diff --git a/libgo/go/runtime/netpoll_os_test.go b/libgo/go/runtime/netpoll_os_test.go new file mode 100644 index 00000000000..b96b9f3ee38 --- /dev/null +++ b/libgo/go/runtime/netpoll_os_test.go @@ -0,0 +1,28 @@ +package runtime_test + +import ( + "runtime" + "sync" + "testing" +) + +var wg sync.WaitGroup + +func init() { + runtime.NetpollGenericInit() +} + +func BenchmarkNetpollBreak(b *testing.B) { + b.StartTimer() + for i := 0; i < b.N; i++ { + for j := 0; j < 10; j++ { + wg.Add(1) + go func() { + runtime.NetpollBreak() + wg.Done() + }() + } + } + wg.Wait() + b.StopTimer() +} diff --git a/libgo/go/runtime/netpoll_solaris.go b/libgo/go/runtime/netpoll_solaris.go index acb8bab77b6..d5302b59365 100644 --- a/libgo/go/runtime/netpoll_solaris.go +++ b/libgo/go/runtime/netpoll_solaris.go @@ -4,7 +4,10 @@ package runtime -import "unsafe" +import ( + "runtime/internal/atomic" + "unsafe" +) // Solaris runtime-integrated network poller. // @@ -83,6 +86,10 @@ func port_getn(port int32, evs *portevent, max uint32, nget *uint32, timeout *ti //extern port_alert func port_alert(port int32, flags, events uint32, user uintptr) int32 +var ( + netpollWakeSig uint32 // used to avoid duplicate calls of netpollBreak +) + var portfd int32 = -1 func netpollinit() { @@ -161,15 +168,17 @@ func netpollarm(pd *pollDesc, mode int) { // netpollBreak interrupts a port_getn wait. func netpollBreak() { - // Use port_alert to put portfd into alert mode. - // This will wake up all threads sleeping in port_getn on portfd, - // and cause their calls to port_getn to return immediately. - // Further, until portfd is taken out of alert mode, - // all calls to port_getn will return immediately. - if port_alert(portfd, _PORT_ALERT_UPDATE, _POLLHUP, uintptr(unsafe.Pointer(&portfd))) < 0 { - if e := errno(); e != _EBUSY { - println("runtime: port_alert failed with", e) - throw("runtime: netpoll: port_alert failed") + if atomic.Cas(&netpollWakeSig, 0, 1) { + // Use port_alert to put portfd into alert mode. + // This will wake up all threads sleeping in port_getn on portfd, + // and cause their calls to port_getn to return immediately. + // Further, until portfd is taken out of alert mode, + // all calls to port_getn will return immediately. + if port_alert(portfd, _PORT_ALERT_UPDATE, _POLLHUP, uintptr(unsafe.Pointer(&portfd))) < 0 { + if e := errno(); e != _EBUSY { + println("runtime: port_alert failed with", e) + throw("runtime: netpoll: port_alert failed") + } } } } @@ -242,6 +251,7 @@ retry: println("runtime: port_alert failed with", e) throw("runtime: netpoll: port_alert failed") } + atomic.Store(&netpollWakeSig, 0) } continue } diff --git a/libgo/go/runtime/netpoll_stub.go b/libgo/go/runtime/netpoll_stub.go index f86f2f61748..3599f2d01b7 100644 --- a/libgo/go/runtime/netpoll_stub.go +++ b/libgo/go/runtime/netpoll_stub.go @@ -49,6 +49,9 @@ func netpoll(delay int64) gList { notetsleep(&netpollNote, delay) unlock(&netpollStubLock) + // Guard against starvation in case the lock is contended + // (eg when running TestNetpollBreak). 
+ osyield() } return gList{} } diff --git a/libgo/go/runtime/netpoll_windows.go b/libgo/go/runtime/netpoll_windows.go index ced52cbd3a3..4c1cd2633a8 100644 --- a/libgo/go/runtime/netpoll_windows.go +++ b/libgo/go/runtime/netpoll_windows.go @@ -5,6 +5,7 @@ package runtime import ( + "runtime/internal/atomic" "unsafe" ) @@ -31,7 +32,11 @@ type overlappedEntry struct { qty uint32 } -var iocphandle uintptr = _INVALID_HANDLE_VALUE // completion port io handle +var ( + iocphandle uintptr = _INVALID_HANDLE_VALUE // completion port io handle + + netpollWakeSig uint32 // used to avoid duplicate calls of netpollBreak +) func netpollinit() { iocphandle = stdcall4(_CreateIoCompletionPort, _INVALID_HANDLE_VALUE, 0, 0, _DWORD_MAX) @@ -62,9 +67,11 @@ func netpollarm(pd *pollDesc, mode int) { } func netpollBreak() { - if stdcall4(_PostQueuedCompletionStatus, iocphandle, 0, 0, 0) == 0 { - println("runtime: netpoll: PostQueuedCompletionStatus failed (errno=", getlasterror(), ")") - throw("runtime: netpoll: PostQueuedCompletionStatus failed") + if atomic.Cas(&netpollWakeSig, 0, 1) { + if stdcall4(_PostQueuedCompletionStatus, iocphandle, 0, 0, 0) == 0 { + println("runtime: netpoll: PostQueuedCompletionStatus failed (errno=", getlasterror(), ")") + throw("runtime: netpoll: PostQueuedCompletionStatus failed") + } } } @@ -75,7 +82,7 @@ func netpollBreak() { // delay > 0: block for up to that many nanoseconds func netpoll(delay int64) gList { var entries [64]overlappedEntry - var wait, qty, key, flags, n, i uint32 + var wait, qty, flags, n, i uint32 var errno int32 var op *net_op var toRun gList @@ -99,82 +106,48 @@ func netpoll(delay int64) gList { wait = 1e9 } - if _GetQueuedCompletionStatusEx != nil { - n = uint32(len(entries) / int(gomaxprocs)) - if n < 8 { - n = 8 - } - if delay != 0 { - mp.blocked = true - } - if stdcall6(_GetQueuedCompletionStatusEx, iocphandle, uintptr(unsafe.Pointer(&entries[0])), uintptr(n), uintptr(unsafe.Pointer(&n)), uintptr(wait), 0) == 0 { - mp.blocked = false - errno = int32(getlasterror()) - if errno == _WAIT_TIMEOUT { - return gList{} - } - println("runtime: GetQueuedCompletionStatusEx failed (errno=", errno, ")") - throw("runtime: netpoll failed") - } + n = uint32(len(entries) / int(gomaxprocs)) + if n < 8 { + n = 8 + } + if delay != 0 { + mp.blocked = true + } + if stdcall6(_GetQueuedCompletionStatusEx, iocphandle, uintptr(unsafe.Pointer(&entries[0])), uintptr(n), uintptr(unsafe.Pointer(&n)), uintptr(wait), 0) == 0 { mp.blocked = false - for i = 0; i < n; i++ { - op = entries[i].op - if op != nil { - errno = 0 - qty = 0 - if stdcall5(_WSAGetOverlappedResult, op.pd.fd, uintptr(unsafe.Pointer(op)), uintptr(unsafe.Pointer(&qty)), 0, uintptr(unsafe.Pointer(&flags))) == 0 { - errno = int32(getlasterror()) - } - handlecompletion(&toRun, op, errno, qty) - } else { - if delay == 0 { - // Forward the notification to the - // blocked poller. 
- netpollBreak() - } - } - } - } else { - op = nil - errno = 0 - qty = 0 - if delay != 0 { - mp.blocked = true + errno = int32(getlasterror()) + if errno == _WAIT_TIMEOUT { + return gList{} } - if stdcall5(_GetQueuedCompletionStatus, iocphandle, uintptr(unsafe.Pointer(&qty)), uintptr(unsafe.Pointer(&key)), uintptr(unsafe.Pointer(&op)), uintptr(wait)) == 0 { - mp.blocked = false - errno = int32(getlasterror()) - if errno == _WAIT_TIMEOUT { - return gList{} - } - if op == nil { - println("runtime: GetQueuedCompletionStatus failed (errno=", errno, ")") - throw("runtime: netpoll failed") + println("runtime: GetQueuedCompletionStatusEx failed (errno=", errno, ")") + throw("runtime: netpoll failed") + } + mp.blocked = false + for i = 0; i < n; i++ { + op = entries[i].op + if op != nil { + errno = 0 + qty = 0 + if stdcall5(_WSAGetOverlappedResult, op.pd.fd, uintptr(unsafe.Pointer(op)), uintptr(unsafe.Pointer(&qty)), 0, uintptr(unsafe.Pointer(&flags))) == 0 { + errno = int32(getlasterror()) } - // dequeued failed IO packet, so report that - } - mp.blocked = false - if op == nil { + handlecompletion(&toRun, op, errno, qty) + } else { + atomic.Store(&netpollWakeSig, 0) if delay == 0 { // Forward the notification to the // blocked poller. netpollBreak() } - return gList{} } - handlecompletion(&toRun, op, errno, qty) } return toRun } func handlecompletion(toRun *gList, op *net_op, errno int32, qty uint32) { - if op == nil { - println("runtime: GetQueuedCompletionStatus returned op == nil") - throw("runtime: netpoll failed") - } mode := op.mode if mode != 'r' && mode != 'w' { - println("runtime: GetQueuedCompletionStatus returned invalid mode=", mode) + println("runtime: GetQueuedCompletionStatusEx returned invalid mode=", mode) throw("runtime: netpoll failed") } op.errno = errno diff --git a/libgo/go/runtime/os_linux_mips64x.go b/libgo/go/runtime/os_linux_mips64x.go index 2b59dcbf17c..f9059382757 100644 --- a/libgo/go/runtime/os_linux_mips64x.go +++ b/libgo/go/runtime/os_linux_mips64x.go @@ -7,5 +7,11 @@ package runtime +import "internal/cpu" + func archauxv(tag, val uintptr) { + switch tag { + case _AT_HWCAP: + cpu.HWCap = uint(val) + } } diff --git a/libgo/go/runtime/panic_test.go b/libgo/go/runtime/panic_test.go new file mode 100644 index 00000000000..b8a300f6b10 --- /dev/null +++ b/libgo/go/runtime/panic_test.go @@ -0,0 +1,48 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runtime_test + +import ( + "strings" + "testing" +) + +// Test that panics print out the underlying value +// when the underlying kind is directly printable. 
+// Issue: https://golang.org/issues/37531 +func TestPanicWithDirectlyPrintableCustomTypes(t *testing.T) { + tests := []struct { + name string + wantPanicPrefix string + }{ + {"panicCustomBool", `panic: main.MyBool(true)`}, + {"panicCustomComplex128", `panic: main.MyComplex128(+3.210000e+001+1.000000e+001i)`}, + {"panicCustomComplex64", `panic: main.MyComplex64(+1.100000e-001+3.000000e+000i)`}, + {"panicCustomFloat32", `panic: main.MyFloat32(-9.370000e+001)`}, + {"panicCustomFloat64", `panic: main.MyFloat64(-9.370000e+001)`}, + {"panicCustomInt", `panic: main.MyInt(93)`}, + {"panicCustomInt8", `panic: main.MyInt8(93)`}, + {"panicCustomInt16", `panic: main.MyInt16(93)`}, + {"panicCustomInt32", `panic: main.MyInt32(93)`}, + {"panicCustomInt64", `panic: main.MyInt64(93)`}, + {"panicCustomString", `panic: main.MyString("Panic")`}, + {"panicCustomUint", `panic: main.MyUint(93)`}, + {"panicCustomUint8", `panic: main.MyUint8(93)`}, + {"panicCustomUint16", `panic: main.MyUint16(93)`}, + {"panicCustomUint32", `panic: main.MyUint32(93)`}, + {"panicCustomUint64", `panic: main.MyUint64(93)`}, + {"panicCustomUintptr", `panic: main.MyUintptr(93)`}, + } + + for _, tt := range tests { + t := t + t.Run(tt.name, func(t *testing.T) { + output := runTestProg(t, "testprog", tt.name) + if !strings.HasPrefix(output, tt.wantPanicPrefix) { + t.Fatalf("%q\nis not present in\n%s", tt.wantPanicPrefix, output) + } + }) + } +} diff --git a/libgo/go/runtime/pprof/internal/profile/encode.go b/libgo/go/runtime/pprof/internal/profile/encode.go deleted file mode 100644 index af319330d9a..00000000000 --- a/libgo/go/runtime/pprof/internal/profile/encode.go +++ /dev/null @@ -1,482 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package profile - -import ( - "errors" - "fmt" - "sort" -) - -func (p *Profile) decoder() []decoder { - return profileDecoder -} - -// preEncode populates the unexported fields to be used by encode -// (with suffix X) from the corresponding exported fields. The -// exported fields are cleared up to facilitate testing. 
-func (p *Profile) preEncode() { - strings := make(map[string]int) - addString(strings, "") - - for _, st := range p.SampleType { - st.typeX = addString(strings, st.Type) - st.unitX = addString(strings, st.Unit) - } - - for _, s := range p.Sample { - s.labelX = nil - var keys []string - for k := range s.Label { - keys = append(keys, k) - } - sort.Strings(keys) - for _, k := range keys { - vs := s.Label[k] - for _, v := range vs { - s.labelX = append(s.labelX, - Label{ - keyX: addString(strings, k), - strX: addString(strings, v), - }, - ) - } - } - var numKeys []string - for k := range s.NumLabel { - numKeys = append(numKeys, k) - } - sort.Strings(numKeys) - for _, k := range numKeys { - vs := s.NumLabel[k] - for _, v := range vs { - s.labelX = append(s.labelX, - Label{ - keyX: addString(strings, k), - numX: v, - }, - ) - } - } - s.locationIDX = nil - for _, l := range s.Location { - s.locationIDX = append(s.locationIDX, l.ID) - } - } - - for _, m := range p.Mapping { - m.fileX = addString(strings, m.File) - m.buildIDX = addString(strings, m.BuildID) - } - - for _, l := range p.Location { - for i, ln := range l.Line { - if ln.Function != nil { - l.Line[i].functionIDX = ln.Function.ID - } else { - l.Line[i].functionIDX = 0 - } - } - if l.Mapping != nil { - l.mappingIDX = l.Mapping.ID - } else { - l.mappingIDX = 0 - } - } - for _, f := range p.Function { - f.nameX = addString(strings, f.Name) - f.systemNameX = addString(strings, f.SystemName) - f.filenameX = addString(strings, f.Filename) - } - - p.dropFramesX = addString(strings, p.DropFrames) - p.keepFramesX = addString(strings, p.KeepFrames) - - if pt := p.PeriodType; pt != nil { - pt.typeX = addString(strings, pt.Type) - pt.unitX = addString(strings, pt.Unit) - } - - p.stringTable = make([]string, len(strings)) - for s, i := range strings { - p.stringTable[i] = s - } -} - -func (p *Profile) encode(b *buffer) { - for _, x := range p.SampleType { - encodeMessage(b, 1, x) - } - for _, x := range p.Sample { - encodeMessage(b, 2, x) - } - for _, x := range p.Mapping { - encodeMessage(b, 3, x) - } - for _, x := range p.Location { - encodeMessage(b, 4, x) - } - for _, x := range p.Function { - encodeMessage(b, 5, x) - } - encodeStrings(b, 6, p.stringTable) - encodeInt64Opt(b, 7, p.dropFramesX) - encodeInt64Opt(b, 8, p.keepFramesX) - encodeInt64Opt(b, 9, p.TimeNanos) - encodeInt64Opt(b, 10, p.DurationNanos) - if pt := p.PeriodType; pt != nil && (pt.typeX != 0 || pt.unitX != 0) { - encodeMessage(b, 11, p.PeriodType) - } - encodeInt64Opt(b, 12, p.Period) -} - -var profileDecoder = []decoder{ - nil, // 0 - // repeated ValueType sample_type = 1 - func(b *buffer, m message) error { - x := new(ValueType) - pp := m.(*Profile) - pp.SampleType = append(pp.SampleType, x) - return decodeMessage(b, x) - }, - // repeated Sample sample = 2 - func(b *buffer, m message) error { - x := new(Sample) - pp := m.(*Profile) - pp.Sample = append(pp.Sample, x) - return decodeMessage(b, x) - }, - // repeated Mapping mapping = 3 - func(b *buffer, m message) error { - x := new(Mapping) - pp := m.(*Profile) - pp.Mapping = append(pp.Mapping, x) - return decodeMessage(b, x) - }, - // repeated Location location = 4 - func(b *buffer, m message) error { - x := new(Location) - pp := m.(*Profile) - pp.Location = append(pp.Location, x) - return decodeMessage(b, x) - }, - // repeated Function function = 5 - func(b *buffer, m message) error { - x := new(Function) - pp := m.(*Profile) - pp.Function = append(pp.Function, x) - return decodeMessage(b, x) - }, - // repeated string 
string_table = 6 - func(b *buffer, m message) error { - err := decodeStrings(b, &m.(*Profile).stringTable) - if err != nil { - return err - } - if *&m.(*Profile).stringTable[0] != "" { - return errors.New("string_table[0] must be ''") - } - return nil - }, - // repeated int64 drop_frames = 7 - func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).dropFramesX) }, - // repeated int64 keep_frames = 8 - func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).keepFramesX) }, - // repeated int64 time_nanos = 9 - func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).TimeNanos) }, - // repeated int64 duration_nanos = 10 - func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).DurationNanos) }, - // optional string period_type = 11 - func(b *buffer, m message) error { - x := new(ValueType) - pp := m.(*Profile) - pp.PeriodType = x - return decodeMessage(b, x) - }, - // repeated int64 period = 12 - func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).Period) }, - // repeated int64 comment = 13 - func(b *buffer, m message) error { return decodeInt64s(b, &m.(*Profile).commentX) }, - // int64 defaultSampleType = 14 - func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).defaultSampleTypeX) }, -} - -// postDecode takes the unexported fields populated by decode (with -// suffix X) and populates the corresponding exported fields. -// The unexported fields are cleared up to facilitate testing. -func (p *Profile) postDecode() error { - var err error - - mappings := make(map[uint64]*Mapping) - for _, m := range p.Mapping { - m.File, err = getString(p.stringTable, &m.fileX, err) - m.BuildID, err = getString(p.stringTable, &m.buildIDX, err) - mappings[m.ID] = m - } - - functions := make(map[uint64]*Function) - for _, f := range p.Function { - f.Name, err = getString(p.stringTable, &f.nameX, err) - f.SystemName, err = getString(p.stringTable, &f.systemNameX, err) - f.Filename, err = getString(p.stringTable, &f.filenameX, err) - functions[f.ID] = f - } - - locations := make(map[uint64]*Location) - for _, l := range p.Location { - l.Mapping = mappings[l.mappingIDX] - l.mappingIDX = 0 - for i, ln := range l.Line { - if id := ln.functionIDX; id != 0 { - l.Line[i].Function = functions[id] - if l.Line[i].Function == nil { - return fmt.Errorf("Function ID %d not found", id) - } - l.Line[i].functionIDX = 0 - } - } - locations[l.ID] = l - } - - for _, st := range p.SampleType { - st.Type, err = getString(p.stringTable, &st.typeX, err) - st.Unit, err = getString(p.stringTable, &st.unitX, err) - } - - for _, s := range p.Sample { - labels := make(map[string][]string) - numLabels := make(map[string][]int64) - for _, l := range s.labelX { - var key, value string - key, err = getString(p.stringTable, &l.keyX, err) - if l.strX != 0 { - value, err = getString(p.stringTable, &l.strX, err) - labels[key] = append(labels[key], value) - } else { - numLabels[key] = append(numLabels[key], l.numX) - } - } - if len(labels) > 0 { - s.Label = labels - } - if len(numLabels) > 0 { - s.NumLabel = numLabels - } - s.Location = nil - for _, lid := range s.locationIDX { - s.Location = append(s.Location, locations[lid]) - } - s.locationIDX = nil - } - - p.DropFrames, err = getString(p.stringTable, &p.dropFramesX, err) - p.KeepFrames, err = getString(p.stringTable, &p.keepFramesX, err) - - if pt := p.PeriodType; pt == nil { - p.PeriodType = &ValueType{} - } - - if pt := p.PeriodType; pt != nil { - pt.Type, err = getString(p.stringTable, &pt.typeX, 
err) - pt.Unit, err = getString(p.stringTable, &pt.unitX, err) - } - for _, i := range p.commentX { - var c string - c, err = getString(p.stringTable, &i, err) - p.Comments = append(p.Comments, c) - } - - p.commentX = nil - p.DefaultSampleType, err = getString(p.stringTable, &p.defaultSampleTypeX, err) - p.stringTable = nil - return nil -} - -func (p *ValueType) decoder() []decoder { - return valueTypeDecoder -} - -func (p *ValueType) encode(b *buffer) { - encodeInt64Opt(b, 1, p.typeX) - encodeInt64Opt(b, 2, p.unitX) -} - -var valueTypeDecoder = []decoder{ - nil, // 0 - // optional int64 type = 1 - func(b *buffer, m message) error { return decodeInt64(b, &m.(*ValueType).typeX) }, - // optional int64 unit = 2 - func(b *buffer, m message) error { return decodeInt64(b, &m.(*ValueType).unitX) }, -} - -func (p *Sample) decoder() []decoder { - return sampleDecoder -} - -func (p *Sample) encode(b *buffer) { - encodeUint64s(b, 1, p.locationIDX) - for _, x := range p.Value { - encodeInt64(b, 2, x) - } - for _, x := range p.labelX { - encodeMessage(b, 3, x) - } -} - -var sampleDecoder = []decoder{ - nil, // 0 - // repeated uint64 location = 1 - func(b *buffer, m message) error { return decodeUint64s(b, &m.(*Sample).locationIDX) }, - // repeated int64 value = 2 - func(b *buffer, m message) error { return decodeInt64s(b, &m.(*Sample).Value) }, - // repeated Label label = 3 - func(b *buffer, m message) error { - s := m.(*Sample) - n := len(s.labelX) - s.labelX = append(s.labelX, Label{}) - return decodeMessage(b, &s.labelX[n]) - }, -} - -func (p Label) decoder() []decoder { - return labelDecoder -} - -func (p Label) encode(b *buffer) { - encodeInt64Opt(b, 1, p.keyX) - encodeInt64Opt(b, 2, p.strX) - encodeInt64Opt(b, 3, p.numX) -} - -var labelDecoder = []decoder{ - nil, // 0 - // optional int64 key = 1 - func(b *buffer, m message) error { return decodeInt64(b, &m.(*Label).keyX) }, - // optional int64 str = 2 - func(b *buffer, m message) error { return decodeInt64(b, &m.(*Label).strX) }, - // optional int64 num = 3 - func(b *buffer, m message) error { return decodeInt64(b, &m.(*Label).numX) }, -} - -func (p *Mapping) decoder() []decoder { - return mappingDecoder -} - -func (p *Mapping) encode(b *buffer) { - encodeUint64Opt(b, 1, p.ID) - encodeUint64Opt(b, 2, p.Start) - encodeUint64Opt(b, 3, p.Limit) - encodeUint64Opt(b, 4, p.Offset) - encodeInt64Opt(b, 5, p.fileX) - encodeInt64Opt(b, 6, p.buildIDX) - encodeBoolOpt(b, 7, p.HasFunctions) - encodeBoolOpt(b, 8, p.HasFilenames) - encodeBoolOpt(b, 9, p.HasLineNumbers) - encodeBoolOpt(b, 10, p.HasInlineFrames) -} - -var mappingDecoder = []decoder{ - nil, // 0 - func(b *buffer, m message) error { return decodeUint64(b, &m.(*Mapping).ID) }, // optional uint64 id = 1 - func(b *buffer, m message) error { return decodeUint64(b, &m.(*Mapping).Start) }, // optional uint64 memory_offset = 2 - func(b *buffer, m message) error { return decodeUint64(b, &m.(*Mapping).Limit) }, // optional uint64 memory_limit = 3 - func(b *buffer, m message) error { return decodeUint64(b, &m.(*Mapping).Offset) }, // optional uint64 file_offset = 4 - func(b *buffer, m message) error { return decodeInt64(b, &m.(*Mapping).fileX) }, // optional int64 filename = 5 - func(b *buffer, m message) error { return decodeInt64(b, &m.(*Mapping).buildIDX) }, // optional int64 build_id = 6 - func(b *buffer, m message) error { return decodeBool(b, &m.(*Mapping).HasFunctions) }, // optional bool has_functions = 7 - func(b *buffer, m message) error { return decodeBool(b, &m.(*Mapping).HasFilenames) }, // 
optional bool has_filenames = 8 - func(b *buffer, m message) error { return decodeBool(b, &m.(*Mapping).HasLineNumbers) }, // optional bool has_line_numbers = 9 - func(b *buffer, m message) error { return decodeBool(b, &m.(*Mapping).HasInlineFrames) }, // optional bool has_inline_frames = 10 -} - -func (p *Location) decoder() []decoder { - return locationDecoder -} - -func (p *Location) encode(b *buffer) { - encodeUint64Opt(b, 1, p.ID) - encodeUint64Opt(b, 2, p.mappingIDX) - encodeUint64Opt(b, 3, p.Address) - for i := range p.Line { - encodeMessage(b, 4, &p.Line[i]) - } -} - -var locationDecoder = []decoder{ - nil, // 0 - func(b *buffer, m message) error { return decodeUint64(b, &m.(*Location).ID) }, // optional uint64 id = 1; - func(b *buffer, m message) error { return decodeUint64(b, &m.(*Location).mappingIDX) }, // optional uint64 mapping_id = 2; - func(b *buffer, m message) error { return decodeUint64(b, &m.(*Location).Address) }, // optional uint64 address = 3; - func(b *buffer, m message) error { // repeated Line line = 4 - pp := m.(*Location) - n := len(pp.Line) - pp.Line = append(pp.Line, Line{}) - return decodeMessage(b, &pp.Line[n]) - }, -} - -func (p *Line) decoder() []decoder { - return lineDecoder -} - -func (p *Line) encode(b *buffer) { - encodeUint64Opt(b, 1, p.functionIDX) - encodeInt64Opt(b, 2, p.Line) -} - -var lineDecoder = []decoder{ - nil, // 0 - // optional uint64 function_id = 1 - func(b *buffer, m message) error { return decodeUint64(b, &m.(*Line).functionIDX) }, - // optional int64 line = 2 - func(b *buffer, m message) error { return decodeInt64(b, &m.(*Line).Line) }, -} - -func (p *Function) decoder() []decoder { - return functionDecoder -} - -func (p *Function) encode(b *buffer) { - encodeUint64Opt(b, 1, p.ID) - encodeInt64Opt(b, 2, p.nameX) - encodeInt64Opt(b, 3, p.systemNameX) - encodeInt64Opt(b, 4, p.filenameX) - encodeInt64Opt(b, 5, p.StartLine) -} - -var functionDecoder = []decoder{ - nil, // 0 - // optional uint64 id = 1 - func(b *buffer, m message) error { return decodeUint64(b, &m.(*Function).ID) }, - // optional int64 function_name = 2 - func(b *buffer, m message) error { return decodeInt64(b, &m.(*Function).nameX) }, - // optional int64 function_system_name = 3 - func(b *buffer, m message) error { return decodeInt64(b, &m.(*Function).systemNameX) }, - // repeated int64 filename = 4 - func(b *buffer, m message) error { return decodeInt64(b, &m.(*Function).filenameX) }, - // optional int64 start_line = 5 - func(b *buffer, m message) error { return decodeInt64(b, &m.(*Function).StartLine) }, -} - -func addString(strings map[string]int, s string) int64 { - i, ok := strings[s] - if !ok { - i = len(strings) - strings[s] = i - } - return int64(i) -} - -func getString(strings []string, strng *int64, err error) (string, error) { - if err != nil { - return "", err - } - s := int(*strng) - if s < 0 || s >= len(strings) { - return "", errMalformed - } - *strng = 0 - return strings[s], nil -} diff --git a/libgo/go/runtime/pprof/internal/profile/filter.go b/libgo/go/runtime/pprof/internal/profile/filter.go deleted file mode 100644 index 9cad866df8c..00000000000 --- a/libgo/go/runtime/pprof/internal/profile/filter.go +++ /dev/null @@ -1,158 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Implements methods to filter samples from profiles. 
- -package profile - -import "regexp" - -// FilterSamplesByName filters the samples in a profile and only keeps -// samples where at least one frame matches focus but none match ignore. -// Returns true is the corresponding regexp matched at least one sample. -func (p *Profile) FilterSamplesByName(focus, ignore, hide *regexp.Regexp) (fm, im, hm bool) { - focusOrIgnore := make(map[uint64]bool) - hidden := make(map[uint64]bool) - for _, l := range p.Location { - if ignore != nil && l.matchesName(ignore) { - im = true - focusOrIgnore[l.ID] = false - } else if focus == nil || l.matchesName(focus) { - fm = true - focusOrIgnore[l.ID] = true - } - if hide != nil && l.matchesName(hide) { - hm = true - l.Line = l.unmatchedLines(hide) - if len(l.Line) == 0 { - hidden[l.ID] = true - } - } - } - - s := make([]*Sample, 0, len(p.Sample)) - for _, sample := range p.Sample { - if focusedAndNotIgnored(sample.Location, focusOrIgnore) { - if len(hidden) > 0 { - var locs []*Location - for _, loc := range sample.Location { - if !hidden[loc.ID] { - locs = append(locs, loc) - } - } - if len(locs) == 0 { - // Remove sample with no locations (by not adding it to s). - continue - } - sample.Location = locs - } - s = append(s, sample) - } - } - p.Sample = s - - return -} - -// matchesName reports whether the function name or file in the -// location matches the regular expression. -func (loc *Location) matchesName(re *regexp.Regexp) bool { - for _, ln := range loc.Line { - if fn := ln.Function; fn != nil { - if re.MatchString(fn.Name) { - return true - } - if re.MatchString(fn.Filename) { - return true - } - } - } - return false -} - -// unmatchedLines returns the lines in the location that do not match -// the regular expression. -func (loc *Location) unmatchedLines(re *regexp.Regexp) []Line { - var lines []Line - for _, ln := range loc.Line { - if fn := ln.Function; fn != nil { - if re.MatchString(fn.Name) { - continue - } - if re.MatchString(fn.Filename) { - continue - } - } - lines = append(lines, ln) - } - return lines -} - -// focusedAndNotIgnored looks up a slice of ids against a map of -// focused/ignored locations. The map only contains locations that are -// explicitly focused or ignored. Returns whether there is at least -// one focused location but no ignored locations. -func focusedAndNotIgnored(locs []*Location, m map[uint64]bool) bool { - var f bool - for _, loc := range locs { - if focus, focusOrIgnore := m[loc.ID]; focusOrIgnore { - if focus { - // Found focused location. Must keep searching in case there - // is an ignored one as well. - f = true - } else { - // Found ignored location. Can return false right away. - return false - } - } - } - return f -} - -// TagMatch selects tags for filtering -type TagMatch func(key, val string, nval int64) bool - -// FilterSamplesByTag removes all samples from the profile, except -// those that match focus and do not match the ignore regular -// expression. -func (p *Profile) FilterSamplesByTag(focus, ignore TagMatch) (fm, im bool) { - samples := make([]*Sample, 0, len(p.Sample)) - for _, s := range p.Sample { - focused, ignored := focusedSample(s, focus, ignore) - fm = fm || focused - im = im || ignored - if focused && !ignored { - samples = append(samples, s) - } - } - p.Sample = samples - return -} - -// focusedTag checks a sample against focus and ignore regexps. 
-// Returns whether the focus/ignore regexps match any tags -func focusedSample(s *Sample, focus, ignore TagMatch) (fm, im bool) { - fm = focus == nil - for key, vals := range s.Label { - for _, val := range vals { - if ignore != nil && ignore(key, val, 0) { - im = true - } - if !fm && focus(key, val, 0) { - fm = true - } - } - } - for key, vals := range s.NumLabel { - for _, val := range vals { - if ignore != nil && ignore(key, "", val) { - im = true - } - if !fm && focus(key, "", val) { - fm = true - } - } - } - return fm, im -} diff --git a/libgo/go/runtime/pprof/internal/profile/legacy_profile.go b/libgo/go/runtime/pprof/internal/profile/legacy_profile.go deleted file mode 100644 index d69f8deee7c..00000000000 --- a/libgo/go/runtime/pprof/internal/profile/legacy_profile.go +++ /dev/null @@ -1,1266 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// This file implements parsers to convert legacy profiles into the -// profile.proto format. - -package profile - -import ( - "bufio" - "bytes" - "fmt" - "io" - "math" - "regexp" - "strconv" - "strings" -) - -var ( - countStartRE = regexp.MustCompile(`\A(\w+) profile: total \d+\n\z`) - countRE = regexp.MustCompile(`\A(\d+) @(( 0x[0-9a-f]+)+)\n\z`) - - heapHeaderRE = regexp.MustCompile(`heap profile: *(\d+): *(\d+) *\[ *(\d+): *(\d+) *\] *@ *(heap[_a-z0-9]*)/?(\d*)`) - heapSampleRE = regexp.MustCompile(`(-?\d+): *(-?\d+) *\[ *(\d+): *(\d+) *] @([ x0-9a-f]*)`) - - contentionSampleRE = regexp.MustCompile(`(\d+) *(\d+) @([ x0-9a-f]*)`) - - hexNumberRE = regexp.MustCompile(`0x[0-9a-f]+`) - - growthHeaderRE = regexp.MustCompile(`heap profile: *(\d+): *(\d+) *\[ *(\d+): *(\d+) *\] @ growthz`) - - fragmentationHeaderRE = regexp.MustCompile(`heap profile: *(\d+): *(\d+) *\[ *(\d+): *(\d+) *\] @ fragmentationz`) - - threadzStartRE = regexp.MustCompile(`--- threadz \d+ ---`) - threadStartRE = regexp.MustCompile(`--- Thread ([[:xdigit:]]+) \(name: (.*)/(\d+)\) stack: ---`) - - procMapsRE = regexp.MustCompile(`([[:xdigit:]]+)-([[:xdigit:]]+)\s+([-rwxp]+)\s+([[:xdigit:]]+)\s+([[:xdigit:]]+):([[:xdigit:]]+)\s+([[:digit:]]+)\s*(\S+)?`) - - briefMapsRE = regexp.MustCompile(`\s*([[:xdigit:]]+)-([[:xdigit:]]+):\s*(\S+)(\s.*@)?([[:xdigit:]]+)?`) - - // LegacyHeapAllocated instructs the heapz parsers to use the - // allocated memory stats instead of the default in-use memory. Note - // that tcmalloc doesn't provide all allocated memory, only in-use - // stats. - LegacyHeapAllocated bool -) - -func isSpaceOrComment(line string) bool { - trimmed := strings.TrimSpace(line) - return len(trimmed) == 0 || trimmed[0] == '#' -} - -// parseGoCount parses a Go count profile (e.g., threadcreate or -// goroutine) and returns a new Profile. -func parseGoCount(b []byte) (*Profile, error) { - r := bytes.NewBuffer(b) - - var line string - var err error - for { - // Skip past comments and empty lines seeking a real header. 
- line, err = r.ReadString('\n') - if err != nil { - return nil, err - } - if !isSpaceOrComment(line) { - break - } - } - - m := countStartRE.FindStringSubmatch(line) - if m == nil { - return nil, errUnrecognized - } - profileType := m[1] - p := &Profile{ - PeriodType: &ValueType{Type: profileType, Unit: "count"}, - Period: 1, - SampleType: []*ValueType{{Type: profileType, Unit: "count"}}, - } - locations := make(map[uint64]*Location) - for { - line, err = r.ReadString('\n') - if err != nil { - if err == io.EOF { - break - } - return nil, err - } - if isSpaceOrComment(line) { - continue - } - if strings.HasPrefix(line, "---") { - break - } - m := countRE.FindStringSubmatch(line) - if m == nil { - return nil, errMalformed - } - n, err := strconv.ParseInt(m[1], 0, 64) - if err != nil { - return nil, errMalformed - } - fields := strings.Fields(m[2]) - locs := make([]*Location, 0, len(fields)) - for _, stk := range fields { - addr, err := strconv.ParseUint(stk, 0, 64) - if err != nil { - return nil, errMalformed - } - // Adjust all frames by -1 to land on the call instruction. - addr-- - loc := locations[addr] - if loc == nil { - loc = &Location{ - Address: addr, - } - locations[addr] = loc - p.Location = append(p.Location, loc) - } - locs = append(locs, loc) - } - p.Sample = append(p.Sample, &Sample{ - Location: locs, - Value: []int64{n}, - }) - } - - if err = parseAdditionalSections(strings.TrimSpace(line), r, p); err != nil { - return nil, err - } - return p, nil -} - -// remapLocationIDs ensures there is a location for each address -// referenced by a sample, and remaps the samples to point to the new -// location ids. -func (p *Profile) remapLocationIDs() { - seen := make(map[*Location]bool, len(p.Location)) - var locs []*Location - - for _, s := range p.Sample { - for _, l := range s.Location { - if seen[l] { - continue - } - l.ID = uint64(len(locs) + 1) - locs = append(locs, l) - seen[l] = true - } - } - p.Location = locs -} - -func (p *Profile) remapFunctionIDs() { - seen := make(map[*Function]bool, len(p.Function)) - var fns []*Function - - for _, l := range p.Location { - for _, ln := range l.Line { - fn := ln.Function - if fn == nil || seen[fn] { - continue - } - fn.ID = uint64(len(fns) + 1) - fns = append(fns, fn) - seen[fn] = true - } - } - p.Function = fns -} - -// remapMappingIDs matches location addresses with existing mappings -// and updates them appropriately. This is O(N*M), if this ever shows -// up as a bottleneck, evaluate sorting the mappings and doing a -// binary search, which would make it O(N*log(M)). -func (p *Profile) remapMappingIDs() { - if len(p.Mapping) == 0 { - return - } - - // Some profile handlers will incorrectly set regions for the main - // executable if its section is remapped. Fix them through heuristics. - - // Remove the initial mapping if named '/anon_hugepage' and has a - // consecutive adjacent mapping. - if m := p.Mapping[0]; strings.HasPrefix(m.File, "/anon_hugepage") { - if len(p.Mapping) > 1 && m.Limit == p.Mapping[1].Start { - p.Mapping = p.Mapping[1:] - } - } - - // Subtract the offset from the start of the main mapping if it - // ends up at a recognizable start address. - const expectedStart = 0x400000 - if m := p.Mapping[0]; m.Start-m.Offset == expectedStart { - m.Start = expectedStart - m.Offset = 0 - } - - for _, l := range p.Location { - if a := l.Address; a != 0 { - for _, m := range p.Mapping { - if m.Start <= a && a < m.Limit { - l.Mapping = m - break - } - } - } - } - - // Reset all mapping IDs. 
- for i, m := range p.Mapping { - m.ID = uint64(i + 1) - } -} - -var cpuInts = []func([]byte) (uint64, []byte){ - get32l, - get32b, - get64l, - get64b, -} - -func get32l(b []byte) (uint64, []byte) { - if len(b) < 4 { - return 0, nil - } - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24, b[4:] -} - -func get32b(b []byte) (uint64, []byte) { - if len(b) < 4 { - return 0, nil - } - return uint64(b[3]) | uint64(b[2])<<8 | uint64(b[1])<<16 | uint64(b[0])<<24, b[4:] -} - -func get64l(b []byte) (uint64, []byte) { - if len(b) < 8 { - return 0, nil - } - return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56, b[8:] -} - -func get64b(b []byte) (uint64, []byte) { - if len(b) < 8 { - return 0, nil - } - return uint64(b[7]) | uint64(b[6])<<8 | uint64(b[5])<<16 | uint64(b[4])<<24 | uint64(b[3])<<32 | uint64(b[2])<<40 | uint64(b[1])<<48 | uint64(b[0])<<56, b[8:] -} - -// ParseTracebacks parses a set of tracebacks and returns a newly -// populated profile. It will accept any text file and generate a -// Profile out of it with any hex addresses it can identify, including -// a process map if it can recognize one. Each sample will include a -// tag "source" with the addresses recognized in string format. -func ParseTracebacks(b []byte) (*Profile, error) { - r := bytes.NewBuffer(b) - - p := &Profile{ - PeriodType: &ValueType{Type: "trace", Unit: "count"}, - Period: 1, - SampleType: []*ValueType{ - {Type: "trace", Unit: "count"}, - }, - } - - var sources []string - var sloc []*Location - - locs := make(map[uint64]*Location) - for { - l, err := r.ReadString('\n') - if err != nil { - if err != io.EOF { - return nil, err - } - if l == "" { - break - } - } - if sectionTrigger(l) == memoryMapSection { - break - } - if s, addrs := extractHexAddresses(l); len(s) > 0 { - for _, addr := range addrs { - // Addresses from stack traces point to the next instruction after - // each call. Adjust by -1 to land somewhere on the actual call. - addr-- - loc := locs[addr] - if locs[addr] == nil { - loc = &Location{ - Address: addr, - } - p.Location = append(p.Location, loc) - locs[addr] = loc - } - sloc = append(sloc, loc) - } - - sources = append(sources, s...) - } else { - if len(sources) > 0 || len(sloc) > 0 { - addTracebackSample(sloc, sources, p) - sloc, sources = nil, nil - } - } - } - - // Add final sample to save any leftover data. - if len(sources) > 0 || len(sloc) > 0 { - addTracebackSample(sloc, sources, p) - } - - if err := p.ParseMemoryMap(r); err != nil { - return nil, err - } - return p, nil -} - -func addTracebackSample(l []*Location, s []string, p *Profile) { - p.Sample = append(p.Sample, - &Sample{ - Value: []int64{1}, - Location: l, - Label: map[string][]string{"source": s}, - }) -} - -// parseCPU parses a profilez legacy profile and returns a newly -// populated Profile. -// -// The general format for profilez samples is a sequence of words in -// binary format. The first words are a header with the following data: -// 1st word -- 0 -// 2nd word -- 3 -// 3rd word -- 0 if a c++ application, 1 if a java application. -// 4th word -- Sampling period (in microseconds). -// 5th word -- Padding. 
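[Editor's sketch, not part of the patch: the five-word header described above maps directly onto the check parseCPU performs after decoding the words with one of the cpuInts readers below; isProfilezHeader is a hypothetical name.]

func isProfilezHeader(n1, n2, n3, n4, n5 uint64) bool {
	// The comment above also allows 1 (Java) for the third word, but
	// parseCPU below only accepts the C++ value 0.
	return n1 == 0 && n2 == 3 && n3 == 0 && n4 > 0 && n5 == 0
}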
-func parseCPU(b []byte) (*Profile, error) { - var parse func([]byte) (uint64, []byte) - var n1, n2, n3, n4, n5 uint64 - for _, parse = range cpuInts { - var tmp []byte - n1, tmp = parse(b) - n2, tmp = parse(tmp) - n3, tmp = parse(tmp) - n4, tmp = parse(tmp) - n5, tmp = parse(tmp) - - if tmp != nil && n1 == 0 && n2 == 3 && n3 == 0 && n4 > 0 && n5 == 0 { - b = tmp - return cpuProfile(b, int64(n4), parse) - } - } - return nil, errUnrecognized -} - -// cpuProfile returns a new Profile from C++ profilez data. -// b is the profile bytes after the header, period is the profiling -// period, and parse is a function to parse 8-byte chunks from the -// profile in its native endianness. -func cpuProfile(b []byte, period int64, parse func(b []byte) (uint64, []byte)) (*Profile, error) { - p := &Profile{ - Period: period * 1000, - PeriodType: &ValueType{Type: "cpu", Unit: "nanoseconds"}, - SampleType: []*ValueType{ - {Type: "samples", Unit: "count"}, - {Type: "cpu", Unit: "nanoseconds"}, - }, - } - var err error - if b, _, err = parseCPUSamples(b, parse, true, p); err != nil { - return nil, err - } - - // If all samples have the same second-to-the-bottom frame, it - // strongly suggests that it is an uninteresting artifact of - // measurement -- a stack frame pushed by the signal handler. The - // bottom frame is always correct as it is picked up from the signal - // structure, not the stack. Check if this is the case and if so, - // remove. - if len(p.Sample) > 1 && len(p.Sample[0].Location) > 1 { - allSame := true - id1 := p.Sample[0].Location[1].Address - for _, s := range p.Sample { - if len(s.Location) < 2 || id1 != s.Location[1].Address { - allSame = false - break - } - } - if allSame { - for _, s := range p.Sample { - s.Location = append(s.Location[:1], s.Location[2:]...) - } - } - } - - if err := p.ParseMemoryMap(bytes.NewBuffer(b)); err != nil { - return nil, err - } - return p, nil -} - -// parseCPUSamples parses a collection of profilez samples from a -// profile. -// -// profilez samples are a repeated sequence of stack frames of the -// form: -// 1st word -- The number of times this stack was encountered. -// 2nd word -- The size of the stack (StackSize). -// 3rd word -- The first address on the stack. -// ... -// StackSize + 2 -- The last address on the stack -// The last stack trace is of the form: -// 1st word -- 0 -// 2nd word -- 1 -// 3rd word -- 0 -// -// Addresses from stack traces may point to the next instruction after -// each call. Optionally adjust by -1 to land somewhere on the actual -// call (except for the leaf, which is not a call). 
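[Editor's sketch, not part of the patch: a hypothetical word stream in the record layout described above — one stack of two program counters observed five times, followed by the end-of-data marker that parseCPUSamples looks for.]

words := []uint64{
	5, 2, 0x401000, 0x402000, // count=5, StackSize=2, then the two addresses
	0, 1, 0, // end-of-data marker: count=0, StackSize=1, single address 0
}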
-func parseCPUSamples(b []byte, parse func(b []byte) (uint64, []byte), adjust bool, p *Profile) ([]byte, map[uint64]*Location, error) { - locs := make(map[uint64]*Location) - for len(b) > 0 { - var count, nstk uint64 - count, b = parse(b) - nstk, b = parse(b) - if b == nil || nstk > uint64(len(b)/4) { - return nil, nil, errUnrecognized - } - var sloc []*Location - addrs := make([]uint64, nstk) - for i := 0; i < int(nstk); i++ { - addrs[i], b = parse(b) - } - - if count == 0 && nstk == 1 && addrs[0] == 0 { - // End of data marker - break - } - for i, addr := range addrs { - if adjust && i > 0 { - addr-- - } - loc := locs[addr] - if loc == nil { - loc = &Location{ - Address: addr, - } - locs[addr] = loc - p.Location = append(p.Location, loc) - } - sloc = append(sloc, loc) - } - p.Sample = append(p.Sample, - &Sample{ - Value: []int64{int64(count), int64(count) * p.Period}, - Location: sloc, - }) - } - // Reached the end without finding the EOD marker. - return b, locs, nil -} - -// parseHeap parses a heapz legacy or a growthz profile and -// returns a newly populated Profile. -func parseHeap(b []byte) (p *Profile, err error) { - r := bytes.NewBuffer(b) - l, err := r.ReadString('\n') - if err != nil { - return nil, errUnrecognized - } - - sampling := "" - - if header := heapHeaderRE.FindStringSubmatch(l); header != nil { - p = &Profile{ - SampleType: []*ValueType{ - {Type: "objects", Unit: "count"}, - {Type: "space", Unit: "bytes"}, - }, - PeriodType: &ValueType{Type: "objects", Unit: "bytes"}, - } - - var period int64 - if len(header[6]) > 0 { - if period, err = strconv.ParseInt(header[6], 10, 64); err != nil { - return nil, errUnrecognized - } - } - - switch header[5] { - case "heapz_v2", "heap_v2": - sampling, p.Period = "v2", period - case "heapprofile": - sampling, p.Period = "", 1 - case "heap": - sampling, p.Period = "v2", period/2 - default: - return nil, errUnrecognized - } - } else if header = growthHeaderRE.FindStringSubmatch(l); header != nil { - p = &Profile{ - SampleType: []*ValueType{ - {Type: "objects", Unit: "count"}, - {Type: "space", Unit: "bytes"}, - }, - PeriodType: &ValueType{Type: "heapgrowth", Unit: "count"}, - Period: 1, - } - } else if header = fragmentationHeaderRE.FindStringSubmatch(l); header != nil { - p = &Profile{ - SampleType: []*ValueType{ - {Type: "objects", Unit: "count"}, - {Type: "space", Unit: "bytes"}, - }, - PeriodType: &ValueType{Type: "allocations", Unit: "count"}, - Period: 1, - } - } else { - return nil, errUnrecognized - } - - if LegacyHeapAllocated { - for _, st := range p.SampleType { - st.Type = "alloc_" + st.Type - } - } else { - for _, st := range p.SampleType { - st.Type = "inuse_" + st.Type - } - } - - locs := make(map[uint64]*Location) - for { - l, err = r.ReadString('\n') - if err != nil { - if err != io.EOF { - return nil, err - } - - if l == "" { - break - } - } - - if isSpaceOrComment(l) { - continue - } - l = strings.TrimSpace(l) - - if sectionTrigger(l) != unrecognizedSection { - break - } - - value, blocksize, addrs, err := parseHeapSample(l, p.Period, sampling) - if err != nil { - return nil, err - } - var sloc []*Location - for _, addr := range addrs { - // Addresses from stack traces point to the next instruction after - // each call. Adjust by -1 to land somewhere on the actual call. 
- addr-- - loc := locs[addr] - if locs[addr] == nil { - loc = &Location{ - Address: addr, - } - p.Location = append(p.Location, loc) - locs[addr] = loc - } - sloc = append(sloc, loc) - } - - p.Sample = append(p.Sample, &Sample{ - Value: value, - Location: sloc, - NumLabel: map[string][]int64{"bytes": {blocksize}}, - }) - } - - if err = parseAdditionalSections(l, r, p); err != nil { - return nil, err - } - return p, nil -} - -// parseHeapSample parses a single row from a heap profile into a new Sample. -func parseHeapSample(line string, rate int64, sampling string) (value []int64, blocksize int64, addrs []uint64, err error) { - sampleData := heapSampleRE.FindStringSubmatch(line) - if len(sampleData) != 6 { - return value, blocksize, addrs, fmt.Errorf("unexpected number of sample values: got %d, want 6", len(sampleData)) - } - - // Use first two values by default; tcmalloc sampling generates the - // same value for both, only the older heap-profile collect separate - // stats for in-use and allocated objects. - valueIndex := 1 - if LegacyHeapAllocated { - valueIndex = 3 - } - - var v1, v2 int64 - if v1, err = strconv.ParseInt(sampleData[valueIndex], 10, 64); err != nil { - return value, blocksize, addrs, fmt.Errorf("malformed sample: %s: %v", line, err) - } - if v2, err = strconv.ParseInt(sampleData[valueIndex+1], 10, 64); err != nil { - return value, blocksize, addrs, fmt.Errorf("malformed sample: %s: %v", line, err) - } - - if v1 == 0 { - if v2 != 0 { - return value, blocksize, addrs, fmt.Errorf("allocation count was 0 but allocation bytes was %d", v2) - } - } else { - blocksize = v2 / v1 - if sampling == "v2" { - v1, v2 = scaleHeapSample(v1, v2, rate) - } - } - - value = []int64{v1, v2} - addrs = parseHexAddresses(sampleData[5]) - - return value, blocksize, addrs, nil -} - -// extractHexAddresses extracts hex numbers from a string and returns -// them, together with their numeric value, in a slice. -func extractHexAddresses(s string) ([]string, []uint64) { - hexStrings := hexNumberRE.FindAllString(s, -1) - var ids []uint64 - for _, s := range hexStrings { - if id, err := strconv.ParseUint(s, 0, 64); err == nil { - ids = append(ids, id) - } else { - // Do not expect any parsing failures due to the regexp matching. - panic("failed to parse hex value:" + s) - } - } - return hexStrings, ids -} - -// parseHexAddresses parses hex numbers from a string and returns them -// in a slice. -func parseHexAddresses(s string) []uint64 { - _, ids := extractHexAddresses(s) - return ids -} - -// scaleHeapSample adjusts the data from a heapz Sample to -// account for its probability of appearing in the collected -// data. heapz profiles are a sampling of the memory allocations -// requests in a program. We estimate the unsampled value by dividing -// each collected sample by its probability of appearing in the -// profile. heapz v2 profiles rely on a poisson process to determine -// which samples to collect, based on the desired average collection -// rate R. The probability of a sample of size S to appear in that -// profile is 1-exp(-S/R). -func scaleHeapSample(count, size, rate int64) (int64, int64) { - if count == 0 || size == 0 { - return 0, 0 - } - - if rate <= 1 { - // if rate==1 all samples were collected so no adjustment is needed. - // if rate<1 treat as unknown and skip scaling. 
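	// Editorial worked example, not part of the original file: with the
	// default Go sampling rate R = 524288 bytes and an average sample size
	// S = 65536 bytes, 1-exp(-S/R) = 1-exp(-0.125) ≈ 0.1175, so the scaling
	// below multiplies both count and size by roughly 1/0.1175 ≈ 8.5.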
- return count, size - } - - avgSize := float64(size) / float64(count) - scale := 1 / (1 - math.Exp(-avgSize/float64(rate))) - - return int64(float64(count) * scale), int64(float64(size) * scale) -} - -// parseContention parses a mutex or contention profile. There are 2 cases: -// "--- contentionz " for legacy C++ profiles (and backwards compatibility) -// "--- mutex:" or "--- contention:" for profiles generated by the Go runtime. -// This code converts the text output from runtime into a *Profile. (In the future -// the runtime might write a serialized Profile directly making this unnecessary.) -func parseContention(b []byte) (*Profile, error) { - r := bytes.NewBuffer(b) - var l string - var err error - for { - // Skip past comments and empty lines seeking a real header. - l, err = r.ReadString('\n') - if err != nil { - return nil, err - } - if !isSpaceOrComment(l) { - break - } - } - - if strings.HasPrefix(l, "--- contentionz ") { - return parseCppContention(r) - } else if strings.HasPrefix(l, "--- mutex:") { - return parseCppContention(r) - } else if strings.HasPrefix(l, "--- contention:") { - return parseCppContention(r) - } - return nil, errUnrecognized -} - -// parseCppContention parses the output from synchronization_profiling.cc -// for backward compatibility, and the compatible (non-debug) block profile -// output from the Go runtime. -func parseCppContention(r *bytes.Buffer) (*Profile, error) { - p := &Profile{ - PeriodType: &ValueType{Type: "contentions", Unit: "count"}, - Period: 1, - SampleType: []*ValueType{ - {Type: "contentions", Unit: "count"}, - {Type: "delay", Unit: "nanoseconds"}, - }, - } - - var cpuHz int64 - var l string - var err error - // Parse text of the form "attribute = value" before the samples. - const delimiter = "=" - for { - l, err = r.ReadString('\n') - if err != nil { - if err != io.EOF { - return nil, err - } - - if l == "" { - break - } - } - if isSpaceOrComment(l) { - continue - } - - if l = strings.TrimSpace(l); l == "" { - continue - } - - if strings.HasPrefix(l, "---") { - break - } - - attr := strings.SplitN(l, delimiter, 2) - if len(attr) != 2 { - break - } - key, val := strings.TrimSpace(attr[0]), strings.TrimSpace(attr[1]) - var err error - switch key { - case "cycles/second": - if cpuHz, err = strconv.ParseInt(val, 0, 64); err != nil { - return nil, errUnrecognized - } - case "sampling period": - if p.Period, err = strconv.ParseInt(val, 0, 64); err != nil { - return nil, errUnrecognized - } - case "ms since reset": - ms, err := strconv.ParseInt(val, 0, 64) - if err != nil { - return nil, errUnrecognized - } - p.DurationNanos = ms * 1000 * 1000 - case "format": - // CPP contentionz profiles don't have format. - return nil, errUnrecognized - case "resolution": - // CPP contentionz profiles don't have resolution. - return nil, errUnrecognized - case "discarded samples": - default: - return nil, errUnrecognized - } - } - - locs := make(map[uint64]*Location) - for { - if !isSpaceOrComment(l) { - if l = strings.TrimSpace(l); strings.HasPrefix(l, "---") { - break - } - value, addrs, err := parseContentionSample(l, p.Period, cpuHz) - if err != nil { - return nil, err - } - var sloc []*Location - for _, addr := range addrs { - // Addresses from stack traces point to the next instruction after - // each call. Adjust by -1 to land somewhere on the actual call. 
- addr-- - loc := locs[addr] - if locs[addr] == nil { - loc = &Location{ - Address: addr, - } - p.Location = append(p.Location, loc) - locs[addr] = loc - } - sloc = append(sloc, loc) - } - p.Sample = append(p.Sample, &Sample{ - Value: value, - Location: sloc, - }) - } - - if l, err = r.ReadString('\n'); err != nil { - if err != io.EOF { - return nil, err - } - if l == "" { - break - } - } - } - - if err = parseAdditionalSections(l, r, p); err != nil { - return nil, err - } - - return p, nil -} - -// parseContentionSample parses a single row from a contention profile -// into a new Sample. -func parseContentionSample(line string, period, cpuHz int64) (value []int64, addrs []uint64, err error) { - sampleData := contentionSampleRE.FindStringSubmatch(line) - if sampleData == nil { - return value, addrs, errUnrecognized - } - - v1, err := strconv.ParseInt(sampleData[1], 10, 64) - if err != nil { - return value, addrs, fmt.Errorf("malformed sample: %s: %v", line, err) - } - v2, err := strconv.ParseInt(sampleData[2], 10, 64) - if err != nil { - return value, addrs, fmt.Errorf("malformed sample: %s: %v", line, err) - } - - // Unsample values if period and cpuHz are available. - // - Delays are scaled to cycles and then to nanoseconds. - // - Contentions are scaled to cycles. - if period > 0 { - if cpuHz > 0 { - cpuGHz := float64(cpuHz) / 1e9 - v1 = int64(float64(v1) * float64(period) / cpuGHz) - } - v2 = v2 * period - } - - value = []int64{v2, v1} - addrs = parseHexAddresses(sampleData[3]) - - return value, addrs, nil -} - -// parseThread parses a Threadz profile and returns a new Profile. -func parseThread(b []byte) (*Profile, error) { - r := bytes.NewBuffer(b) - - var line string - var err error - for { - // Skip past comments and empty lines seeking a real header. - line, err = r.ReadString('\n') - if err != nil { - return nil, err - } - if !isSpaceOrComment(line) { - break - } - } - - if m := threadzStartRE.FindStringSubmatch(line); m != nil { - // Advance over initial comments until first stack trace. - for { - line, err = r.ReadString('\n') - if err != nil { - if err != io.EOF { - return nil, err - } - - if line == "" { - break - } - } - if sectionTrigger(line) != unrecognizedSection || line[0] == '-' { - break - } - } - } else if t := threadStartRE.FindStringSubmatch(line); len(t) != 4 { - return nil, errUnrecognized - } - - p := &Profile{ - SampleType: []*ValueType{{Type: "thread", Unit: "count"}}, - PeriodType: &ValueType{Type: "thread", Unit: "count"}, - Period: 1, - } - - locs := make(map[uint64]*Location) - // Recognize each thread and populate profile samples. - for sectionTrigger(line) == unrecognizedSection { - if strings.HasPrefix(line, "---- no stack trace for") { - line = "" - break - } - if t := threadStartRE.FindStringSubmatch(line); len(t) != 4 { - return nil, errUnrecognized - } - - var addrs []uint64 - line, addrs, err = parseThreadSample(r) - if err != nil { - return nil, errUnrecognized - } - if len(addrs) == 0 { - // We got a --same as previous threads--. Bump counters. - if len(p.Sample) > 0 { - s := p.Sample[len(p.Sample)-1] - s.Value[0]++ - } - continue - } - - var sloc []*Location - for _, addr := range addrs { - // Addresses from stack traces point to the next instruction after - // each call. Adjust by -1 to land somewhere on the actual call. 
- addr-- - loc := locs[addr] - if locs[addr] == nil { - loc = &Location{ - Address: addr, - } - p.Location = append(p.Location, loc) - locs[addr] = loc - } - sloc = append(sloc, loc) - } - - p.Sample = append(p.Sample, &Sample{ - Value: []int64{1}, - Location: sloc, - }) - } - - if err = parseAdditionalSections(line, r, p); err != nil { - return nil, err - } - - return p, nil -} - -// parseThreadSample parses a symbolized or unsymbolized stack trace. -// Returns the first line after the traceback, the sample (or nil if -// it hits a 'same-as-previous' marker) and an error. -func parseThreadSample(b *bytes.Buffer) (nextl string, addrs []uint64, err error) { - var l string - sameAsPrevious := false - for { - if l, err = b.ReadString('\n'); err != nil { - if err != io.EOF { - return "", nil, err - } - if l == "" { - break - } - } - if l = strings.TrimSpace(l); l == "" { - continue - } - - if strings.HasPrefix(l, "---") { - break - } - if strings.Contains(l, "same as previous thread") { - sameAsPrevious = true - continue - } - - addrs = append(addrs, parseHexAddresses(l)...) - } - - if sameAsPrevious { - return l, nil, nil - } - return l, addrs, nil -} - -// parseAdditionalSections parses any additional sections in the -// profile, ignoring any unrecognized sections. -func parseAdditionalSections(l string, b *bytes.Buffer, p *Profile) (err error) { - for { - if sectionTrigger(l) == memoryMapSection { - break - } - // Ignore any unrecognized sections. - if l, err := b.ReadString('\n'); err != nil { - if err != io.EOF { - return err - } - if l == "" { - break - } - } - } - return p.ParseMemoryMap(b) -} - -// ParseMemoryMap parses a memory map in the format of -// /proc/self/maps, and overrides the mappings in the current profile. -// It renumbers the samples and locations in the profile correspondingly. -func (p *Profile) ParseMemoryMap(rd io.Reader) error { - b := bufio.NewReader(rd) - - var attrs []string - var r *strings.Replacer - const delimiter = "=" - for { - l, err := b.ReadString('\n') - if err != nil { - if err != io.EOF { - return err - } - if l == "" { - break - } - } - if l = strings.TrimSpace(l); l == "" { - continue - } - - if r != nil { - l = r.Replace(l) - } - m, err := parseMappingEntry(l) - if err != nil { - if err == errUnrecognized { - // Recognize assignments of the form: attr=value, and replace - // $attr with value on subsequent mappings. - if attr := strings.SplitN(l, delimiter, 2); len(attr) == 2 { - attrs = append(attrs, "$"+strings.TrimSpace(attr[0]), strings.TrimSpace(attr[1])) - r = strings.NewReplacer(attrs...) - } - // Ignore any unrecognized entries - continue - } - return err - } - if m == nil || (m.File == "" && len(p.Mapping) != 0) { - // In some cases the first entry may include the address range - // but not the name of the file. It should be followed by - // another entry with the name. - continue - } - if len(p.Mapping) == 1 && p.Mapping[0].File == "" { - // Update the name if this is the entry following that empty one. - p.Mapping[0].File = m.File - continue - } - p.Mapping = append(p.Mapping, m) - } - p.remapLocationIDs() - p.remapFunctionIDs() - p.remapMappingIDs() - return nil -} - -func parseMappingEntry(l string) (*Mapping, error) { - mapping := &Mapping{} - var err error - if me := procMapsRE.FindStringSubmatch(l); len(me) == 9 { - if !strings.Contains(me[3], "x") { - // Skip non-executable entries. 
- return nil, nil - } - if mapping.Start, err = strconv.ParseUint(me[1], 16, 64); err != nil { - return nil, errUnrecognized - } - if mapping.Limit, err = strconv.ParseUint(me[2], 16, 64); err != nil { - return nil, errUnrecognized - } - if me[4] != "" { - if mapping.Offset, err = strconv.ParseUint(me[4], 16, 64); err != nil { - return nil, errUnrecognized - } - } - mapping.File = me[8] - return mapping, nil - } - - if me := briefMapsRE.FindStringSubmatch(l); len(me) == 6 { - if mapping.Start, err = strconv.ParseUint(me[1], 16, 64); err != nil { - return nil, errUnrecognized - } - if mapping.Limit, err = strconv.ParseUint(me[2], 16, 64); err != nil { - return nil, errUnrecognized - } - mapping.File = me[3] - if me[5] != "" { - if mapping.Offset, err = strconv.ParseUint(me[5], 16, 64); err != nil { - return nil, errUnrecognized - } - } - return mapping, nil - } - - return nil, errUnrecognized -} - -type sectionType int - -const ( - unrecognizedSection sectionType = iota - memoryMapSection -) - -var memoryMapTriggers = []string{ - "--- Memory map: ---", - "MAPPED_LIBRARIES:", -} - -func sectionTrigger(line string) sectionType { - for _, trigger := range memoryMapTriggers { - if strings.Contains(line, trigger) { - return memoryMapSection - } - } - return unrecognizedSection -} - -func (p *Profile) addLegacyFrameInfo() { - switch { - case isProfileType(p, heapzSampleTypes) || - isProfileType(p, heapzInUseSampleTypes) || - isProfileType(p, heapzAllocSampleTypes): - p.DropFrames, p.KeepFrames = allocRxStr, allocSkipRxStr - case isProfileType(p, contentionzSampleTypes): - p.DropFrames, p.KeepFrames = lockRxStr, "" - default: - p.DropFrames, p.KeepFrames = cpuProfilerRxStr, "" - } -} - -var heapzSampleTypes = []string{"allocations", "size"} // early Go pprof profiles -var heapzInUseSampleTypes = []string{"inuse_objects", "inuse_space"} -var heapzAllocSampleTypes = []string{"alloc_objects", "alloc_space"} -var contentionzSampleTypes = []string{"contentions", "delay"} - -func isProfileType(p *Profile, t []string) bool { - st := p.SampleType - if len(st) != len(t) { - return false - } - - for i := range st { - if st[i].Type != t[i] { - return false - } - } - return true -} - -var allocRxStr = strings.Join([]string{ - // POSIX entry points. - `calloc`, - `cfree`, - `malloc`, - `free`, - `memalign`, - `do_memalign`, - `(__)?posix_memalign`, - `pvalloc`, - `valloc`, - `realloc`, - - // TC malloc. - `tcmalloc::.*`, - `tc_calloc`, - `tc_cfree`, - `tc_malloc`, - `tc_free`, - `tc_memalign`, - `tc_posix_memalign`, - `tc_pvalloc`, - `tc_valloc`, - `tc_realloc`, - `tc_new`, - `tc_delete`, - `tc_newarray`, - `tc_deletearray`, - `tc_new_nothrow`, - `tc_newarray_nothrow`, - - // Memory-allocation routines on OS X. - `malloc_zone_malloc`, - `malloc_zone_calloc`, - `malloc_zone_valloc`, - `malloc_zone_realloc`, - `malloc_zone_memalign`, - `malloc_zone_free`, - - // Go runtime - `runtime\..*`, - - // Other misc. memory allocation routines - `BaseArena::.*`, - `(::)?do_malloc_no_errno`, - `(::)?do_malloc_pages`, - `(::)?do_malloc`, - `DoSampledAllocation`, - `MallocedMemBlock::MallocedMemBlock`, - `_M_allocate`, - `__builtin_(vec_)?delete`, - `__builtin_(vec_)?new`, - `__gnu_cxx::new_allocator::allocate`, - `__libc_malloc`, - `__malloc_alloc_template::allocate`, - `allocate`, - `cpp_alloc`, - `operator new(\[\])?`, - `simple_alloc::allocate`, -}, `|`) - -var allocSkipRxStr = strings.Join([]string{ - // Preserve Go runtime frames that appear in the middle/bottom of - // the stack. 
- `runtime\.panic`, - `runtime\.reflectcall`, - `runtime\.call[0-9]*`, -}, `|`) - -var cpuProfilerRxStr = strings.Join([]string{ - `ProfileData::Add`, - `ProfileData::prof_handler`, - `CpuProfiler::prof_handler`, - `__pthread_sighandler`, - `__restore`, -}, `|`) - -var lockRxStr = strings.Join([]string{ - `RecordLockProfileData`, - `(base::)?RecordLockProfileData.*`, - `(base::)?SubmitMutexProfileData.*`, - `(base::)?SubmitSpinLockProfileData.*`, - `(Mutex::)?AwaitCommon.*`, - `(Mutex::)?Unlock.*`, - `(Mutex::)?UnlockSlow.*`, - `(Mutex::)?ReaderUnlock.*`, - `(MutexLock::)?~MutexLock.*`, - `(SpinLock::)?Unlock.*`, - `(SpinLock::)?SlowUnlock.*`, - `(SpinLockHolder::)?~SpinLockHolder.*`, -}, `|`) diff --git a/libgo/go/runtime/pprof/internal/profile/profile.go b/libgo/go/runtime/pprof/internal/profile/profile.go deleted file mode 100644 index 443accdd6d3..00000000000 --- a/libgo/go/runtime/pprof/internal/profile/profile.go +++ /dev/null @@ -1,577 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package profile provides a representation of profile.proto and -// methods to encode/decode profiles in this format. -// -// This package is only for testing runtime/pprof. -// It is not used by production Go programs. -package profile - -import ( - "bytes" - "compress/gzip" - "fmt" - "io" - "io/ioutil" - "regexp" - "strings" - "time" -) - -// Profile is an in-memory representation of profile.proto. -type Profile struct { - SampleType []*ValueType - DefaultSampleType string - Sample []*Sample - Mapping []*Mapping - Location []*Location - Function []*Function - Comments []string - - DropFrames string - KeepFrames string - - TimeNanos int64 - DurationNanos int64 - PeriodType *ValueType - Period int64 - - commentX []int64 - dropFramesX int64 - keepFramesX int64 - stringTable []string - defaultSampleTypeX int64 -} - -// ValueType corresponds to Profile.ValueType -type ValueType struct { - Type string // cpu, wall, inuse_space, etc - Unit string // seconds, nanoseconds, bytes, etc - - typeX int64 - unitX int64 -} - -// Sample corresponds to Profile.Sample -type Sample struct { - Location []*Location - Value []int64 - Label map[string][]string - NumLabel map[string][]int64 - - locationIDX []uint64 - labelX []Label -} - -// Label corresponds to Profile.Label -type Label struct { - keyX int64 - // Exactly one of the two following values must be set - strX int64 - numX int64 // Integer value for this label -} - -// Mapping corresponds to Profile.Mapping -type Mapping struct { - ID uint64 - Start uint64 - Limit uint64 - Offset uint64 - File string - BuildID string - HasFunctions bool - HasFilenames bool - HasLineNumbers bool - HasInlineFrames bool - - fileX int64 - buildIDX int64 -} - -// Location corresponds to Profile.Location -type Location struct { - ID uint64 - Mapping *Mapping - Address uint64 - Line []Line - - mappingIDX uint64 -} - -// Line corresponds to Profile.Line -type Line struct { - Function *Function - Line int64 - - functionIDX uint64 -} - -// Function corresponds to Profile.Function -type Function struct { - ID uint64 - Name string - SystemName string - Filename string - StartLine int64 - - nameX int64 - systemNameX int64 - filenameX int64 -} - -// Parse parses a profile and checks for its validity. The input -// may be a gzip-compressed encoded protobuf or one of many legacy -// profile formats which may be unsupported in the future. 
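For reference, the format detection described above reduces to a two-byte gzip magic check before falling back to the legacy parsers; a minimal sketch (isGzipped is a hypothetical name, the real check is inlined in Parse below):

func isGzipped(data []byte) bool {
	// Parse treats input beginning with the gzip magic bytes (0x1f, 0x8b)
	// as a compressed protobuf and decompresses it before decoding.
	return len(data) >= 2 && data[0] == 0x1f && data[1] == 0x8b
}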
-func Parse(r io.Reader) (*Profile, error) { - orig, err := ioutil.ReadAll(r) - if err != nil { - return nil, err - } - - var p *Profile - if len(orig) >= 2 && orig[0] == 0x1f && orig[1] == 0x8b { - gz, err := gzip.NewReader(bytes.NewBuffer(orig)) - if err != nil { - return nil, fmt.Errorf("decompressing profile: %v", err) - } - data, err := ioutil.ReadAll(gz) - if err != nil { - return nil, fmt.Errorf("decompressing profile: %v", err) - } - orig = data - } - if p, err = parseUncompressed(orig); err != nil { - if p, err = parseLegacy(orig); err != nil { - return nil, fmt.Errorf("parsing profile: %v", err) - } - } - - if err := p.CheckValid(); err != nil { - return nil, fmt.Errorf("malformed profile: %v", err) - } - return p, nil -} - -var errUnrecognized = fmt.Errorf("unrecognized profile format") -var errMalformed = fmt.Errorf("malformed profile format") - -func parseLegacy(data []byte) (*Profile, error) { - parsers := []func([]byte) (*Profile, error){ - parseCPU, - parseHeap, - parseGoCount, // goroutine, threadcreate - parseThread, - parseContention, - } - - for _, parser := range parsers { - p, err := parser(data) - if err == nil { - p.setMain() - p.addLegacyFrameInfo() - return p, nil - } - if err != errUnrecognized { - return nil, err - } - } - return nil, errUnrecognized -} - -func parseUncompressed(data []byte) (*Profile, error) { - p := &Profile{} - if err := unmarshal(data, p); err != nil { - return nil, err - } - - if err := p.postDecode(); err != nil { - return nil, err - } - - return p, nil -} - -var libRx = regexp.MustCompile(`([.]so$|[.]so[._][0-9]+)`) - -// setMain scans Mapping entries and guesses which entry is main -// because legacy profiles don't obey the convention of putting main -// first. -func (p *Profile) setMain() { - for i := 0; i < len(p.Mapping); i++ { - file := strings.TrimSpace(strings.ReplaceAll(p.Mapping[i].File, "(deleted)", "")) - if len(file) == 0 { - continue - } - if len(libRx.FindStringSubmatch(file)) > 0 { - continue - } - if strings.HasPrefix(file, "[") { - continue - } - // Swap what we guess is main to position 0. - p.Mapping[i], p.Mapping[0] = p.Mapping[0], p.Mapping[i] - break - } -} - -// Write writes the profile as a gzip-compressed marshaled protobuf. -func (p *Profile) Write(w io.Writer) error { - p.preEncode() - b := marshal(p) - zw := gzip.NewWriter(w) - defer zw.Close() - _, err := zw.Write(b) - return err -} - -// CheckValid tests whether the profile is valid. Checks include, but are -// not limited to: -// - len(Profile.Sample[n].value) == len(Profile.value_unit) -// - Sample.id has a corresponding Profile.Location -func (p *Profile) CheckValid() error { - // Check that sample values are consistent - sampleLen := len(p.SampleType) - if sampleLen == 0 && len(p.Sample) != 0 { - return fmt.Errorf("missing sample type information") - } - for _, s := range p.Sample { - if len(s.Value) != sampleLen { - return fmt.Errorf("mismatch: sample has: %d values vs. 
%d types", len(s.Value), len(p.SampleType)) - } - } - - // Check that all mappings/locations/functions are in the tables - // Check that there are no duplicate ids - mappings := make(map[uint64]*Mapping, len(p.Mapping)) - for _, m := range p.Mapping { - if m.ID == 0 { - return fmt.Errorf("found mapping with reserved ID=0") - } - if mappings[m.ID] != nil { - return fmt.Errorf("multiple mappings with same id: %d", m.ID) - } - mappings[m.ID] = m - } - functions := make(map[uint64]*Function, len(p.Function)) - for _, f := range p.Function { - if f.ID == 0 { - return fmt.Errorf("found function with reserved ID=0") - } - if functions[f.ID] != nil { - return fmt.Errorf("multiple functions with same id: %d", f.ID) - } - functions[f.ID] = f - } - locations := make(map[uint64]*Location, len(p.Location)) - for _, l := range p.Location { - if l.ID == 0 { - return fmt.Errorf("found location with reserved id=0") - } - if locations[l.ID] != nil { - return fmt.Errorf("multiple locations with same id: %d", l.ID) - } - locations[l.ID] = l - if m := l.Mapping; m != nil { - if m.ID == 0 || mappings[m.ID] != m { - return fmt.Errorf("inconsistent mapping %p: %d", m, m.ID) - } - } - for _, ln := range l.Line { - if f := ln.Function; f != nil { - if f.ID == 0 || functions[f.ID] != f { - return fmt.Errorf("inconsistent function %p: %d", f, f.ID) - } - } - } - } - return nil -} - -// Aggregate merges the locations in the profile into equivalence -// classes preserving the request attributes. It also updates the -// samples to point to the merged locations. -func (p *Profile) Aggregate(inlineFrame, function, filename, linenumber, address bool) error { - for _, m := range p.Mapping { - m.HasInlineFrames = m.HasInlineFrames && inlineFrame - m.HasFunctions = m.HasFunctions && function - m.HasFilenames = m.HasFilenames && filename - m.HasLineNumbers = m.HasLineNumbers && linenumber - } - - // Aggregate functions - if !function || !filename { - for _, f := range p.Function { - if !function { - f.Name = "" - f.SystemName = "" - } - if !filename { - f.Filename = "" - } - } - } - - // Aggregate locations - if !inlineFrame || !address || !linenumber { - for _, l := range p.Location { - if !inlineFrame && len(l.Line) > 1 { - l.Line = l.Line[len(l.Line)-1:] - } - if !linenumber { - for i := range l.Line { - l.Line[i].Line = 0 - } - } - if !address { - l.Address = 0 - } - } - } - - return p.CheckValid() -} - -// Print dumps a text representation of a profile. Intended mainly -// for debugging purposes. 
-func (p *Profile) String() string { - - ss := make([]string, 0, len(p.Sample)+len(p.Mapping)+len(p.Location)) - if pt := p.PeriodType; pt != nil { - ss = append(ss, fmt.Sprintf("PeriodType: %s %s", pt.Type, pt.Unit)) - } - ss = append(ss, fmt.Sprintf("Period: %d", p.Period)) - if p.TimeNanos != 0 { - ss = append(ss, fmt.Sprintf("Time: %v", time.Unix(0, p.TimeNanos))) - } - if p.DurationNanos != 0 { - ss = append(ss, fmt.Sprintf("Duration: %v", time.Duration(p.DurationNanos))) - } - - ss = append(ss, "Samples:") - var sh1 string - for _, s := range p.SampleType { - sh1 = sh1 + fmt.Sprintf("%s/%s ", s.Type, s.Unit) - } - ss = append(ss, strings.TrimSpace(sh1)) - for _, s := range p.Sample { - var sv string - for _, v := range s.Value { - sv = fmt.Sprintf("%s %10d", sv, v) - } - sv = sv + ": " - for _, l := range s.Location { - sv = sv + fmt.Sprintf("%d ", l.ID) - } - ss = append(ss, sv) - const labelHeader = " " - if len(s.Label) > 0 { - ls := labelHeader - for k, v := range s.Label { - ls = ls + fmt.Sprintf("%s:%v ", k, v) - } - ss = append(ss, ls) - } - if len(s.NumLabel) > 0 { - ls := labelHeader - for k, v := range s.NumLabel { - ls = ls + fmt.Sprintf("%s:%v ", k, v) - } - ss = append(ss, ls) - } - } - - ss = append(ss, "Locations") - for _, l := range p.Location { - locStr := fmt.Sprintf("%6d: %#x ", l.ID, l.Address) - if m := l.Mapping; m != nil { - locStr = locStr + fmt.Sprintf("M=%d ", m.ID) - } - if len(l.Line) == 0 { - ss = append(ss, locStr) - } - for li := range l.Line { - lnStr := "??" - if fn := l.Line[li].Function; fn != nil { - lnStr = fmt.Sprintf("%s %s:%d s=%d", - fn.Name, - fn.Filename, - l.Line[li].Line, - fn.StartLine) - if fn.Name != fn.SystemName { - lnStr = lnStr + "(" + fn.SystemName + ")" - } - } - ss = append(ss, locStr+lnStr) - // Do not print location details past the first line - locStr = " " - } - } - - ss = append(ss, "Mappings") - for _, m := range p.Mapping { - bits := "" - if m.HasFunctions { - bits += "[FN]" - } - if m.HasFilenames { - bits += "[FL]" - } - if m.HasLineNumbers { - bits += "[LN]" - } - if m.HasInlineFrames { - bits += "[IN]" - } - ss = append(ss, fmt.Sprintf("%d: %#x/%#x/%#x %s %s %s", - m.ID, - m.Start, m.Limit, m.Offset, - m.File, - m.BuildID, - bits)) - } - - return strings.Join(ss, "\n") + "\n" -} - -// Merge adds profile p adjusted by ratio r into profile p. Profiles -// must be compatible (same Type and SampleType). -// TODO(rsilvera): consider normalizing the profiles based on the -// total samples collected. -func (p *Profile) Merge(pb *Profile, r float64) error { - if err := p.Compatible(pb); err != nil { - return err - } - - pb = pb.Copy() - - // Keep the largest of the two periods. - if pb.Period > p.Period { - p.Period = pb.Period - } - - p.DurationNanos += pb.DurationNanos - - p.Mapping = append(p.Mapping, pb.Mapping...) - for i, m := range p.Mapping { - m.ID = uint64(i + 1) - } - p.Location = append(p.Location, pb.Location...) - for i, l := range p.Location { - l.ID = uint64(i + 1) - } - p.Function = append(p.Function, pb.Function...) - for i, f := range p.Function { - f.ID = uint64(i + 1) - } - - if r != 1.0 { - for _, s := range pb.Sample { - for i, v := range s.Value { - s.Value[i] = int64((float64(v) * r)) - } - } - } - p.Sample = append(p.Sample, pb.Sample...) - return p.CheckValid() -} - -// Compatible determines if two profiles can be compared/merged. -// returns nil if the profiles are compatible; otherwise an error with -// details on the incompatibility. 
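A short usage sketch (base and other are hypothetical *Profile values; Merge, defined just above, runs the same check internally):

if err := base.Compatible(other); err != nil {
	return fmt.Errorf("profiles cannot be merged: %v", err)
}
// Fold other's samples into base at half weight.
if err := base.Merge(other, 0.5); err != nil {
	return err
}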
-func (p *Profile) Compatible(pb *Profile) error { - if !compatibleValueTypes(p.PeriodType, pb.PeriodType) { - return fmt.Errorf("incompatible period types %v and %v", p.PeriodType, pb.PeriodType) - } - - if len(p.SampleType) != len(pb.SampleType) { - return fmt.Errorf("incompatible sample types %v and %v", p.SampleType, pb.SampleType) - } - - for i := range p.SampleType { - if !compatibleValueTypes(p.SampleType[i], pb.SampleType[i]) { - return fmt.Errorf("incompatible sample types %v and %v", p.SampleType, pb.SampleType) - } - } - - return nil -} - -// HasFunctions determines if all locations in this profile have -// symbolized function information. -func (p *Profile) HasFunctions() bool { - for _, l := range p.Location { - if l.Mapping == nil || !l.Mapping.HasFunctions { - return false - } - } - return true -} - -// HasFileLines determines if all locations in this profile have -// symbolized file and line number information. -func (p *Profile) HasFileLines() bool { - for _, l := range p.Location { - if l.Mapping == nil || (!l.Mapping.HasFilenames || !l.Mapping.HasLineNumbers) { - return false - } - } - return true -} - -func compatibleValueTypes(v1, v2 *ValueType) bool { - if v1 == nil || v2 == nil { - return true // No grounds to disqualify. - } - return v1.Type == v2.Type && v1.Unit == v2.Unit -} - -// Copy makes a fully independent copy of a profile. -func (p *Profile) Copy() *Profile { - p.preEncode() - b := marshal(p) - - pp := &Profile{} - if err := unmarshal(b, pp); err != nil { - panic(err) - } - if err := pp.postDecode(); err != nil { - panic(err) - } - - return pp -} - -// Demangler maps symbol names to a human-readable form. This may -// include C++ demangling and additional simplification. Names that -// are not demangled may be missing from the resulting map. -type Demangler func(name []string) (map[string]string, error) - -// Demangle attempts to demangle and optionally simplify any function -// names referenced in the profile. It works on a best-effort basis: -// it will silently preserve the original names in case of any errors. -func (p *Profile) Demangle(d Demangler) error { - // Collect names to demangle. - var names []string - for _, fn := range p.Function { - names = append(names, fn.SystemName) - } - - // Update profile with demangled names. - demangled, err := d(names) - if err != nil { - return err - } - for _, fn := range p.Function { - if dd, ok := demangled[fn.SystemName]; ok { - fn.Name = dd - } - } - return nil -} - -// Empty reports whether the profile contains no samples. -func (p *Profile) Empty() bool { - return len(p.Sample) == 0 -} diff --git a/libgo/go/runtime/pprof/internal/profile/profile_test.go b/libgo/go/runtime/pprof/internal/profile/profile_test.go deleted file mode 100644 index e1963f33515..00000000000 --- a/libgo/go/runtime/pprof/internal/profile/profile_test.go +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -package profile - -import ( - "bytes" - "testing" -) - -func TestEmptyProfile(t *testing.T) { - var buf bytes.Buffer - p, err := Parse(&buf) - if err != nil { - t.Error("Want no error, got", err) - } - if p == nil { - t.Fatal("Want a valid profile, got <nil>") - } - if !p.Empty() { - t.Errorf("Profile should be empty, got %#v", p) - } -} - -func TestParseContention(t *testing.T) { - tests := []struct { - name string - in string - wantErr bool - }{ - { - name: "valid", - in: `--- mutex: -cycles/second=3491920901 -sampling period=1 -43227965305 1659640 @ 0x45e851 0x45f764 0x4a2be1 0x44ea31 -34035731690 15760 @ 0x45e851 0x45f764 0x4a2b17 0x44ea31 -`, - }, - { - name: "valid with comment", - in: `--- mutex: -cycles/second=3491920901 -sampling period=1 -43227965305 1659640 @ 0x45e851 0x45f764 0x4a2be1 0x44ea31 -# 0x45e850 sync.(*Mutex).Unlock+0x80 /go/src/sync/mutex.go:126 -# 0x45f763 sync.(*RWMutex).Unlock+0x83 /go/src/sync/rwmutex.go:125 -# 0x4a2be0 main.main.func3+0x70 /go/src/internal/pprof/profile/a_binary.go:58 - -34035731690 15760 @ 0x45e851 0x45f764 0x4a2b17 0x44ea31 -# 0x45e850 sync.(*Mutex).Unlock+0x80 /go/src/sync/mutex.go:126 -# 0x45f763 sync.(*RWMutex).Unlock+0x83 /go/src/sync/rwmutex.go:125 -# 0x4a2b16 main.main.func2+0xd6 /go/src/internal/pprof/profile/a_binary.go:48 -`, - }, - { - name: "empty", - in: `--- mutex:`, - wantErr: true, - }, - { - name: "invalid header", - in: `--- channel: -43227965305 1659640 @ 0x45e851 0x45f764 0x4a2be1 0x44ea31`, - wantErr: true, - }, - } - for _, tc := range tests { - _, err := parseContention([]byte(tc.in)) - if tc.wantErr && err == nil { - t.Errorf("parseContention(%q) succeeded unexpectedly", tc.name) - } - if !tc.wantErr && err != nil { - t.Errorf("parseContention(%q) failed unexpectedly: %v", tc.name, err) - } - } - -} diff --git a/libgo/go/runtime/pprof/internal/profile/proto.go b/libgo/go/runtime/pprof/internal/profile/proto.go deleted file mode 100644 index 11d7f9ff9b3..00000000000 --- a/libgo/go/runtime/pprof/internal/profile/proto.go +++ /dev/null @@ -1,360 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// This file is a simple protocol buffer encoder and decoder. -// -// A protocol message must implement the message interface: -// decoder() []decoder -// encode(*buffer) -// -// The decode method returns a slice indexed by field number that gives the -// function to decode that field. -// The encode method encodes its receiver into the given buffer. -// -// The two methods are simple enough to be implemented by hand rather than -// by using a protocol compiler. -// -// See profile.go for examples of messages implementing this interface. -// -// There is no support for groups, message sets, or "has" bits. 
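For illustration, a minimal message in the style this comment describes (the packedInts helper in proto_test.go, removed further below, is the real in-tree example); the kv type here is hypothetical:

type kv struct {
	key uint64
	val int64
}

func (m *kv) decoder() []decoder {
	return []decoder{
		nil, // field numbers start at 1
		func(b *buffer, msg message) error { return decodeUint64(b, &msg.(*kv).key) }, // optional uint64 key = 1
		func(b *buffer, msg message) error { return decodeInt64(b, &msg.(*kv).val) }, // optional int64 val = 2
	}
}

func (m *kv) encode(b *buffer) {
	encodeUint64Opt(b, 1, m.key)
	encodeInt64Opt(b, 2, m.val)
}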
- -package profile - -import "errors" - -type buffer struct { - field int - typ int - u64 uint64 - data []byte - tmp [16]byte -} - -type decoder func(*buffer, message) error - -type message interface { - decoder() []decoder - encode(*buffer) -} - -func marshal(m message) []byte { - var b buffer - m.encode(&b) - return b.data -} - -func encodeVarint(b *buffer, x uint64) { - for x >= 128 { - b.data = append(b.data, byte(x)|0x80) - x >>= 7 - } - b.data = append(b.data, byte(x)) -} - -func encodeLength(b *buffer, tag int, len int) { - encodeVarint(b, uint64(tag)<<3|2) - encodeVarint(b, uint64(len)) -} - -func encodeUint64(b *buffer, tag int, x uint64) { - // append varint to b.data - encodeVarint(b, uint64(tag)<<3|0) - encodeVarint(b, x) -} - -func encodeUint64s(b *buffer, tag int, x []uint64) { - if len(x) > 2 { - // Use packed encoding - n1 := len(b.data) - for _, u := range x { - encodeVarint(b, u) - } - n2 := len(b.data) - encodeLength(b, tag, n2-n1) - n3 := len(b.data) - copy(b.tmp[:], b.data[n2:n3]) - copy(b.data[n1+(n3-n2):], b.data[n1:n2]) - copy(b.data[n1:], b.tmp[:n3-n2]) - return - } - for _, u := range x { - encodeUint64(b, tag, u) - } -} - -func encodeUint64Opt(b *buffer, tag int, x uint64) { - if x == 0 { - return - } - encodeUint64(b, tag, x) -} - -func encodeInt64(b *buffer, tag int, x int64) { - u := uint64(x) - encodeUint64(b, tag, u) -} - -func encodeInt64Opt(b *buffer, tag int, x int64) { - if x == 0 { - return - } - encodeInt64(b, tag, x) -} - -func encodeInt64s(b *buffer, tag int, x []int64) { - if len(x) > 2 { - // Use packed encoding - n1 := len(b.data) - for _, u := range x { - encodeVarint(b, uint64(u)) - } - n2 := len(b.data) - encodeLength(b, tag, n2-n1) - n3 := len(b.data) - copy(b.tmp[:], b.data[n2:n3]) - copy(b.data[n1+(n3-n2):], b.data[n1:n2]) - copy(b.data[n1:], b.tmp[:n3-n2]) - return - } - for _, u := range x { - encodeInt64(b, tag, u) - } -} - -func encodeString(b *buffer, tag int, x string) { - encodeLength(b, tag, len(x)) - b.data = append(b.data, x...) 
-} - -func encodeStrings(b *buffer, tag int, x []string) { - for _, s := range x { - encodeString(b, tag, s) - } -} - -func encodeStringOpt(b *buffer, tag int, x string) { - if x == "" { - return - } - encodeString(b, tag, x) -} - -func encodeBool(b *buffer, tag int, x bool) { - if x { - encodeUint64(b, tag, 1) - } else { - encodeUint64(b, tag, 0) - } -} - -func encodeBoolOpt(b *buffer, tag int, x bool) { - if x == false { - return - } - encodeBool(b, tag, x) -} - -func encodeMessage(b *buffer, tag int, m message) { - n1 := len(b.data) - m.encode(b) - n2 := len(b.data) - encodeLength(b, tag, n2-n1) - n3 := len(b.data) - copy(b.tmp[:], b.data[n2:n3]) - copy(b.data[n1+(n3-n2):], b.data[n1:n2]) - copy(b.data[n1:], b.tmp[:n3-n2]) -} - -func unmarshal(data []byte, m message) (err error) { - b := buffer{data: data, typ: 2} - return decodeMessage(&b, m) -} - -func le64(p []byte) uint64 { - return uint64(p[0]) | uint64(p[1])<<8 | uint64(p[2])<<16 | uint64(p[3])<<24 | uint64(p[4])<<32 | uint64(p[5])<<40 | uint64(p[6])<<48 | uint64(p[7])<<56 -} - -func le32(p []byte) uint32 { - return uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24 -} - -func decodeVarint(data []byte) (uint64, []byte, error) { - var i int - var u uint64 - for i = 0; ; i++ { - if i >= 10 || i >= len(data) { - return 0, nil, errors.New("bad varint") - } - u |= uint64(data[i]&0x7F) << uint(7*i) - if data[i]&0x80 == 0 { - return u, data[i+1:], nil - } - } -} - -func decodeField(b *buffer, data []byte) ([]byte, error) { - x, data, err := decodeVarint(data) - if err != nil { - return nil, err - } - b.field = int(x >> 3) - b.typ = int(x & 7) - b.data = nil - b.u64 = 0 - switch b.typ { - case 0: - b.u64, data, err = decodeVarint(data) - if err != nil { - return nil, err - } - case 1: - if len(data) < 8 { - return nil, errors.New("not enough data") - } - b.u64 = le64(data[:8]) - data = data[8:] - case 2: - var n uint64 - n, data, err = decodeVarint(data) - if err != nil { - return nil, err - } - if n > uint64(len(data)) { - return nil, errors.New("too much data") - } - b.data = data[:n] - data = data[n:] - case 5: - if len(data) < 4 { - return nil, errors.New("not enough data") - } - b.u64 = uint64(le32(data[:4])) - data = data[4:] - default: - return nil, errors.New("unknown type: " + string(b.typ)) - } - - return data, nil -} - -func checkType(b *buffer, typ int) error { - if b.typ != typ { - return errors.New("type mismatch") - } - return nil -} - -func decodeMessage(b *buffer, m message) error { - if err := checkType(b, 2); err != nil { - return err - } - dec := m.decoder() - data := b.data - for len(data) > 0 { - // pull varint field# + type - var err error - data, err = decodeField(b, data) - if err != nil { - return err - } - if b.field >= len(dec) || dec[b.field] == nil { - continue - } - if err := dec[b.field](b, m); err != nil { - return err - } - } - return nil -} - -func decodeInt64(b *buffer, x *int64) error { - if err := checkType(b, 0); err != nil { - return err - } - *x = int64(b.u64) - return nil -} - -func decodeInt64s(b *buffer, x *[]int64) error { - if b.typ == 2 { - // Packed encoding - data := b.data - for len(data) > 0 { - var u uint64 - var err error - - if u, data, err = decodeVarint(data); err != nil { - return err - } - *x = append(*x, int64(u)) - } - return nil - } - var i int64 - if err := decodeInt64(b, &i); err != nil { - return err - } - *x = append(*x, i) - return nil -} - -func decodeUint64(b *buffer, x *uint64) error { - if err := checkType(b, 0); err != nil { - return err - } - *x = 
b.u64 - return nil -} - -func decodeUint64s(b *buffer, x *[]uint64) error { - if b.typ == 2 { - data := b.data - // Packed encoding - for len(data) > 0 { - var u uint64 - var err error - - if u, data, err = decodeVarint(data); err != nil { - return err - } - *x = append(*x, u) - } - return nil - } - var u uint64 - if err := decodeUint64(b, &u); err != nil { - return err - } - *x = append(*x, u) - return nil -} - -func decodeString(b *buffer, x *string) error { - if err := checkType(b, 2); err != nil { - return err - } - *x = string(b.data) - return nil -} - -func decodeStrings(b *buffer, x *[]string) error { - var s string - if err := decodeString(b, &s); err != nil { - return err - } - *x = append(*x, s) - return nil -} - -func decodeBool(b *buffer, x *bool) error { - if err := checkType(b, 0); err != nil { - return err - } - if int64(b.u64) == 0 { - *x = false - } else { - *x = true - } - return nil -} diff --git a/libgo/go/runtime/pprof/internal/profile/proto_test.go b/libgo/go/runtime/pprof/internal/profile/proto_test.go deleted file mode 100644 index c2613fc375a..00000000000 --- a/libgo/go/runtime/pprof/internal/profile/proto_test.go +++ /dev/null @@ -1,67 +0,0 @@ -package profile - -import ( - "reflect" - "testing" -) - -func TestPackedEncoding(t *testing.T) { - - type testcase struct { - uint64s []uint64 - int64s []int64 - encoded []byte - } - for i, tc := range []testcase{ - { - []uint64{0, 1, 10, 100, 1000, 10000}, - []int64{1000, 0, 1000}, - []byte{10, 8, 0, 1, 10, 100, 232, 7, 144, 78, 18, 5, 232, 7, 0, 232, 7}, - }, - { - []uint64{10000}, - nil, - []byte{8, 144, 78}, - }, - { - nil, - []int64{-10000}, - []byte{16, 240, 177, 255, 255, 255, 255, 255, 255, 255, 1}, - }, - } { - source := &packedInts{tc.uint64s, tc.int64s} - if got, want := marshal(source), tc.encoded; !reflect.DeepEqual(got, want) { - t.Errorf("failed encode %d, got %v, want %v", i, got, want) - } - - dest := new(packedInts) - if err := unmarshal(tc.encoded, dest); err != nil { - t.Errorf("failed decode %d: %v", i, err) - continue - } - if got, want := dest.uint64s, tc.uint64s; !reflect.DeepEqual(got, want) { - t.Errorf("failed decode uint64s %d, got %v, want %v", i, got, want) - } - if got, want := dest.int64s, tc.int64s; !reflect.DeepEqual(got, want) { - t.Errorf("failed decode int64s %d, got %v, want %v", i, got, want) - } - } -} - -type packedInts struct { - uint64s []uint64 - int64s []int64 -} - -func (u *packedInts) decoder() []decoder { - return []decoder{ - nil, - func(b *buffer, m message) error { return decodeUint64s(b, &m.(*packedInts).uint64s) }, - func(b *buffer, m message) error { return decodeInt64s(b, &m.(*packedInts).int64s) }, - } -} - -func (u *packedInts) encode(b *buffer) { - encodeUint64s(b, 1, u.uint64s) - encodeInt64s(b, 2, u.int64s) -} diff --git a/libgo/go/runtime/pprof/internal/profile/prune.go b/libgo/go/runtime/pprof/internal/profile/prune.go deleted file mode 100644 index 1924fada7a5..00000000000 --- a/libgo/go/runtime/pprof/internal/profile/prune.go +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Implements methods to remove frames from profiles. - -package profile - -import ( - "fmt" - "regexp" -) - -// Prune removes all nodes beneath a node matching dropRx, and not -// matching keepRx. If the root node of a Sample matches, the sample -// will have an empty stack. 
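A sketch of direct use (p is a hypothetical *Profile; RemoveUninteresting below builds its regexps the same way from DropFrames and KeepFrames):

// Frames matching runtime\..* become prune points unless they also
// match runtime\.panic.*, in which case pruning does not happen there.
drop := regexp.MustCompile(`^(runtime\..*)$`)
keep := regexp.MustCompile(`^(runtime\.panic.*)$`)
p.Prune(drop, keep)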
-func (p *Profile) Prune(dropRx, keepRx *regexp.Regexp) { - prune := make(map[uint64]bool) - pruneBeneath := make(map[uint64]bool) - - for _, loc := range p.Location { - var i int - for i = len(loc.Line) - 1; i >= 0; i-- { - if fn := loc.Line[i].Function; fn != nil && fn.Name != "" { - funcName := fn.Name - // Account for leading '.' on the PPC ELF v1 ABI. - if funcName[0] == '.' { - funcName = funcName[1:] - } - if dropRx.MatchString(funcName) { - if keepRx == nil || !keepRx.MatchString(funcName) { - break - } - } - } - } - - if i >= 0 { - // Found matching entry to prune. - pruneBeneath[loc.ID] = true - - // Remove the matching location. - if i == len(loc.Line)-1 { - // Matched the top entry: prune the whole location. - prune[loc.ID] = true - } else { - loc.Line = loc.Line[i+1:] - } - } - } - - // Prune locs from each Sample - for _, sample := range p.Sample { - // Scan from the root to the leaves to find the prune location. - // Do not prune frames before the first user frame, to avoid - // pruning everything. - foundUser := false - for i := len(sample.Location) - 1; i >= 0; i-- { - id := sample.Location[i].ID - if !prune[id] && !pruneBeneath[id] { - foundUser = true - continue - } - if !foundUser { - continue - } - if prune[id] { - sample.Location = sample.Location[i+1:] - break - } - if pruneBeneath[id] { - sample.Location = sample.Location[i:] - break - } - } - } -} - -// RemoveUninteresting prunes and elides profiles using built-in -// tables of uninteresting function names. -func (p *Profile) RemoveUninteresting() error { - var keep, drop *regexp.Regexp - var err error - - if p.DropFrames != "" { - if drop, err = regexp.Compile("^(" + p.DropFrames + ")$"); err != nil { - return fmt.Errorf("failed to compile regexp %s: %v", p.DropFrames, err) - } - if p.KeepFrames != "" { - if keep, err = regexp.Compile("^(" + p.KeepFrames + ")$"); err != nil { - return fmt.Errorf("failed to compile regexp %s: %v", p.KeepFrames, err) - } - } - p.Prune(drop, keep) - } - return nil -} diff --git a/libgo/go/runtime/pprof/label.go b/libgo/go/runtime/pprof/label.go index 2d92ef7e8a2..b614f125449 100644 --- a/libgo/go/runtime/pprof/label.go +++ b/libgo/go/runtime/pprof/label.go @@ -6,6 +6,9 @@ package pprof import ( "context" + "fmt" + "sort" + "strings" ) type label struct { @@ -34,6 +37,23 @@ func labelValue(ctx context.Context) labelMap { // that admits incremental immutable modification more efficiently. type labelMap map[string]string +// String statisfies Stringer and returns key, value pairs in a consistent +// order. +func (l *labelMap) String() string { + if l == nil { + return "" + } + keyVals := make([]string, 0, len(*l)) + + for k, v := range *l { + keyVals = append(keyVals, fmt.Sprintf("%q:%q", k, v)) + } + + sort.Strings(keyVals) + + return "{" + strings.Join(keyVals, ", ") + "}" +} + // WithLabels returns a new context.Context with the given labels added. // A label overwrites a prior label with the same key. func WithLabels(ctx context.Context, labels LabelSet) context.Context { @@ -54,7 +74,8 @@ func WithLabels(ctx context.Context, labels LabelSet) context.Context { // Labels takes an even number of strings representing key-value pairs // and makes a LabelSet containing them. // A label overwrites a prior label with the same key. -// Currently only CPU profile utilizes labels information. +// Currently only the CPU and goroutine profiles utilize any labels +// information. // See https://golang.org/issue/23458 for details. 
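The String method added to labelMap above gets deterministic output by quoting each pair and sorting the quoted strings, which effectively sorts by key. A minimal sketch of the same idea over a plain map (names are illustrative, not part of this change):

package main

import (
	"fmt"
	"sort"
	"strings"
)

// formatLabels renders a label map as {"k":"v", ...} in a stable order so
// the result can be compared in tests and embedded in profile output.
func formatLabels(m map[string]string) string {
	keyVals := make([]string, 0, len(m))
	for k, v := range m {
		keyVals = append(keyVals, fmt.Sprintf("%q:%q", k, v))
	}
	sort.Strings(keyVals)
	return "{" + strings.Join(keyVals, ", ") + "}"
}

func main() {
	fmt.Println(formatLabels(map[string]string{"key1": "value1", "foo": "bar"}))
	// {"foo":"bar", "key1":"value1"}
}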
func Labels(args ...string) LabelSet { if len(args)%2 != 0 { diff --git a/libgo/go/runtime/pprof/label_test.go b/libgo/go/runtime/pprof/label_test.go index de39d85d3af..fcb00bde506 100644 --- a/libgo/go/runtime/pprof/label_test.go +++ b/libgo/go/runtime/pprof/label_test.go @@ -80,3 +80,35 @@ func TestContextLabels(t *testing.T) { t.Errorf("(sorted) labels on context: got %v, want %v", gotLabels, wantLabels) } } + +func TestLabelMapStringer(t *testing.T) { + for _, tbl := range []struct { + m labelMap + expected string + }{ + { + m: labelMap{ + // empty map + }, + expected: "{}", + }, { + m: labelMap{ + "foo": "bar", + }, + expected: `{"foo":"bar"}`, + }, { + m: labelMap{ + "foo": "bar", + "key1": "value1", + "key2": "value2", + "key3": "value3", + "key4WithNewline": "\nvalue4", + }, + expected: `{"foo":"bar", "key1":"value1", "key2":"value2", "key3":"value3", "key4WithNewline":"\nvalue4"}`, + }, + } { + if got := tbl.m.String(); tbl.expected != got { + t.Errorf("%#v.String() = %q; want %q", tbl.m, got, tbl.expected) + } + } +} diff --git a/libgo/go/runtime/pprof/mprof_test.go b/libgo/go/runtime/pprof/mprof_test.go index c352dea83ae..625ab7de8c8 100644 --- a/libgo/go/runtime/pprof/mprof_test.go +++ b/libgo/go/runtime/pprof/mprof_test.go @@ -9,10 +9,10 @@ package pprof import ( "bytes" "fmt" + "internal/profile" "reflect" "regexp" "runtime" - "runtime/pprof/internal/profile" "testing" "unsafe" ) diff --git a/libgo/go/runtime/pprof/pprof.go b/libgo/go/runtime/pprof/pprof.go index 183881cfd3e..52df44a5a10 100644 --- a/libgo/go/runtime/pprof/pprof.go +++ b/libgo/go/runtime/pprof/pprof.go @@ -313,9 +313,11 @@ func (p *Profile) Remove(value interface{}) { // Otherwise, WriteTo returns nil. // // The debug parameter enables additional output. -// Passing debug=0 prints only the hexadecimal addresses that pprof needs. -// Passing debug=1 adds comments translating addresses to function names -// and line numbers, so that a programmer can read the profile without tools. +// Passing debug=0 writes the gzip-compressed protocol buffer described +// in https://github.com/google/pprof/tree/master/proto#overview. +// Passing debug=1 writes the legacy text format with comments +// translating addresses to function names and line numbers, so that a +// programmer can read the profile without tools. // // The predefined profiles may assign meaning to other debug values; // for example, when printing the "goroutine" profile, debug=2 means to @@ -355,6 +357,7 @@ type stackProfile [][]uintptr func (x stackProfile) Len() int { return len(x) } func (x stackProfile) Stack(i int) []uintptr { return x[i] } +func (x stackProfile) Label(i int) *labelMap { return nil } // A countProfile is a set of stack traces to be printed as counts // grouped by stack trace. There are multiple implementations: @@ -363,6 +366,7 @@ func (x stackProfile) Stack(i int) []uintptr { return x[i] } type countProfile interface { Len() int Stack(i int) []uintptr + Label(i int) *labelMap } // printCountCycleProfile outputs block profile records (for block or mutex profiles) @@ -400,12 +404,16 @@ func printCountCycleProfile(w io.Writer, countName, cycleName string, scaler fun func printCountProfile(w io.Writer, debug int, name string, p countProfile) error { // Build count of each stack. 
var buf bytes.Buffer - key := func(stk []uintptr) string { + key := func(stk []uintptr, lbls *labelMap) string { buf.Reset() fmt.Fprintf(&buf, "@") for _, pc := range stk { fmt.Fprintf(&buf, " %#x", pc) } + if lbls != nil { + buf.WriteString("\n# labels: ") + buf.WriteString(lbls.String()) + } return buf.String() } count := map[string]int{} @@ -413,7 +421,7 @@ func printCountProfile(w io.Writer, debug int, name string, p countProfile) erro var keys []string n := p.Len() for i := 0; i < n; i++ { - k := key(p.Stack(i)) + k := key(p.Stack(i), p.Label(i)) if count[k] == 0 { index[k] = i keys = append(keys, k) @@ -447,7 +455,16 @@ func printCountProfile(w io.Writer, debug int, name string, p countProfile) erro // For count profiles, all stack addresses are // return PCs, which is what appendLocsForStack expects. locs = b.appendLocsForStack(locs[:0], p.Stack(index[k])) - b.pbSample(values, locs, nil) + idx := index[k] + var labels func() + if p.Label(idx) != nil { + labels = func() { + for k, v := range *p.Label(idx) { + b.pbLabel(tagSample_Label, k, v, 0) + } + } + } + b.pbSample(values, locs, labels) } b.build() return nil @@ -642,6 +659,9 @@ func writeHeapInternal(w io.Writer, debug int, defaultSampleType string) error { fmt.Fprintf(w, "# GCCPUFraction = %v\n", s.GCCPUFraction) fmt.Fprintf(w, "# DebugGC = %v\n", s.DebugGC) + // Also flush out MaxRSS on supported platforms. + addMaxRSS(w) + tw.Flush() return b.Flush() } @@ -654,7 +674,12 @@ func countThreadCreate() int { // writeThreadCreate writes the current runtime ThreadCreateProfile to w. func writeThreadCreate(w io.Writer, debug int) error { - return writeRuntimeProfile(w, debug, "threadcreate", runtime.ThreadCreateProfile) + // Until https://golang.org/issues/6104 is addressed, wrap + // ThreadCreateProfile because there's no point in tracking labels when we + // don't get any stack-traces. + return writeRuntimeProfile(w, debug, "threadcreate", func(p []runtime.StackRecord, _ []unsafe.Pointer) (n int, ok bool) { + return runtime.ThreadCreateProfile(p) + }) } // countGoroutine returns the number of goroutines. @@ -662,12 +687,15 @@ func countGoroutine() int { return runtime.NumGoroutine() } +// runtime_goroutineProfileWithLabels is defined in runtime/mprof.go +func runtime_goroutineProfileWithLabels(p []runtime.StackRecord, labels []unsafe.Pointer) (n int, ok bool) + // writeGoroutine writes the current runtime GoroutineProfile to w. func writeGoroutine(w io.Writer, debug int) error { if debug >= 2 { return writeGoroutineStacks(w) } - return writeRuntimeProfile(w, debug, "goroutine", runtime.GoroutineProfile) + return writeRuntimeProfile(w, debug, "goroutine", runtime_goroutineProfileWithLabels) } func writeGoroutineStacks(w io.Writer) error { @@ -691,7 +719,7 @@ func writeGoroutineStacks(w io.Writer) error { return err } -func writeRuntimeProfile(w io.Writer, debug int, name string, fetch func([]runtime.StackRecord) (int, bool)) error { +func writeRuntimeProfile(w io.Writer, debug int, name string, fetch func([]runtime.StackRecord, []unsafe.Pointer) (int, bool)) error { // Find out how many records there are (fetch(nil)), // allocate that many records, and get the data. // There's a race—more records might be added between @@ -699,13 +727,15 @@ func writeRuntimeProfile(w io.Writer, debug int, name string, fetch func([]runti // and also try again if we're very unlucky. // The loop should only execute one iteration in the common case. 
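The comment above spells out the race in the two-call profile API: the count returned by fetch(nil) can be stale by the time the records are copied, so the writer over-allocates and retries. The same pattern works against the public runtime.GoroutineProfile; a hedged sketch:

package main

import (
	"fmt"
	"runtime"
)

// snapshotGoroutines sizes the profile, allocates a little headroom, and
// retries if more goroutines appeared in between. In the common case the
// loop runs exactly once.
func snapshotGoroutines() []runtime.StackRecord {
	n, _ := runtime.GoroutineProfile(nil)
	for {
		p := make([]runtime.StackRecord, n+10)
		n2, ok := runtime.GoroutineProfile(p)
		if ok {
			return p[:n2]
		}
		n = n2 // profile grew; try again with the larger count
	}
}

func main() {
	recs := snapshotGoroutines()
	fmt.Println("captured", len(recs), "goroutine stack records")
}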
var p []runtime.StackRecord - n, ok := fetch(nil) + var labels []unsafe.Pointer + n, ok := fetch(nil, nil) for { // Allocate room for a slightly bigger profile, // in case a few more entries have been added // since the call to ThreadProfile. p = make([]runtime.StackRecord, n+10) - n, ok = fetch(p) + labels = make([]unsafe.Pointer, n+10) + n, ok = fetch(p, labels) if ok { p = p[0:n] break @@ -713,13 +743,17 @@ func writeRuntimeProfile(w io.Writer, debug int, name string, fetch func([]runti // Profile grew; try again. } - return printCountProfile(w, debug, name, runtimeProfile(p)) + return printCountProfile(w, debug, name, &runtimeProfile{p, labels}) } -type runtimeProfile []runtime.StackRecord +type runtimeProfile struct { + stk []runtime.StackRecord + labels []unsafe.Pointer +} -func (p runtimeProfile) Len() int { return len(p) } -func (p runtimeProfile) Stack(i int) []uintptr { return p[i].Stack() } +func (p *runtimeProfile) Len() int { return len(p.stk) } +func (p *runtimeProfile) Stack(i int) []uintptr { return p.stk[i].Stack() } +func (p *runtimeProfile) Label(i int) *labelMap { return (*labelMap)(p.labels[i]) } var cpu struct { sync.Mutex diff --git a/libgo/go/runtime/pprof/pprof_norusage.go b/libgo/go/runtime/pprof/pprof_norusage.go new file mode 100644 index 00000000000..6fdcc6cc38d --- /dev/null +++ b/libgo/go/runtime/pprof/pprof_norusage.go @@ -0,0 +1,15 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !darwin,!linux + +package pprof + +import ( + "io" +) + +// Stub call for platforms that don't support rusage. +func addMaxRSS(w io.Writer) { +} diff --git a/libgo/go/runtime/pprof/pprof_rusage.go b/libgo/go/runtime/pprof/pprof_rusage.go new file mode 100644 index 00000000000..d42e6ed4737 --- /dev/null +++ b/libgo/go/runtime/pprof/pprof_rusage.go @@ -0,0 +1,31 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build darwin linux + +package pprof + +import ( + "fmt" + "io" + "runtime" + "syscall" +) + +// Adds MaxRSS to platforms that are supported. 
+func addMaxRSS(w io.Writer) { + var rssToBytes uintptr + switch runtime.GOOS { + case "linux", "android": + rssToBytes = 1024 + case "darwin": + rssToBytes = 1 + default: + panic("unsupported OS") + } + + var rusage syscall.Rusage + syscall.Getrusage(0, &rusage) + fmt.Fprintf(w, "# MaxRSS = %d\n", uintptr(rusage.Maxrss)*rssToBytes) +} diff --git a/libgo/go/runtime/pprof/pprof_test.go b/libgo/go/runtime/pprof/pprof_test.go index 239466fecc6..ff86bce1211 100644 --- a/libgo/go/runtime/pprof/pprof_test.go +++ b/libgo/go/runtime/pprof/pprof_test.go @@ -10,6 +10,7 @@ import ( "bytes" "context" "fmt" + "internal/profile" "internal/testenv" "io" "io/ioutil" @@ -18,7 +19,6 @@ import ( "os/exec" "regexp" "runtime" - "runtime/pprof/internal/profile" "strings" "sync" "sync/atomic" @@ -176,6 +176,25 @@ func inlinedCallee(x, n int) int { return cpuHog0(x, n) } +//go:noinline +func dumpCallers(pcs []uintptr) { + if pcs == nil { + return + } + + skip := 2 // Callers and dumpCallers + runtime.Callers(skip, pcs) +} + +//go:noinline +func inlinedCallerDump(pcs []uintptr) { + inlinedCalleeDump(pcs) +} + +func inlinedCalleeDump(pcs []uintptr) { + dumpCallers(pcs) +} + func TestCPUProfileRecursion(t *testing.T) { p := testCPUProfile(t, stackContains, []string{"runtime/pprof.inlinedCallee", "runtime/pprof.recursionCallee", "runtime/pprof.recursionCaller"}, avoidFunctions(), func(dur time.Duration) { cpuHogger(recursionCaller, &salt1, dur) @@ -208,6 +227,25 @@ func recursionCallee(n, x int) int { return y * recursionCallee(n-1, x) } +func recursionChainTop(x int, pcs []uintptr) { + if x < 0 { + return + } + recursionChainMiddle(x, pcs) +} + +func recursionChainMiddle(x int, pcs []uintptr) { + recursionChainBottom(x, pcs) +} + +func recursionChainBottom(x int, pcs []uintptr) { + // This will be called each time, we only care about the last. We + // can't make this conditional or this function won't be inlined. + dumpCallers(pcs) + + recursionChainTop(x-1, pcs) +} + func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Location, map[string][]string)) *profile.Profile { p, err := profile.Parse(bytes.NewReader(valBytes)) if err != nil { @@ -226,7 +264,7 @@ func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []stri switch runtime.GOOS { case "darwin": switch runtime.GOARCH { - case "arm", "arm64": + case "arm64": // nothing default: out, err := exec.Command("uname", "-a").CombinedOutput() @@ -959,6 +997,26 @@ func TestGoroutineCounts(t *testing.T) { runtime.Gosched() } } + ctx := context.Background() + + // ... and again, with labels this time (just with fewer iterations to keep + // sorting deterministic). 
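The test below drives the new goroutine-label plumbing through the public API: goroutines started inside pprof.Do inherit the label set, and with this change those labels appear in the goroutine profile. A minimal usage sketch, independent of the test:

package main

import (
	"context"
	"os"
	"runtime/pprof"
)

func main() {
	done := make(chan struct{})
	// Goroutines created inside Do carry the labels, so they show up under
	// a "# labels: ..." line in the debug=1 goroutine profile.
	pprof.Do(context.Background(), pprof.Labels("label", "value"), func(ctx context.Context) {
		go func() { <-done }()
	})

	pprof.Lookup("goroutine").WriteTo(os.Stdout, 1)
	close(done)
}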
+ Do(ctx, Labels("label", "value"), func(context.Context) { + for i := 0; i < 89; i++ { + switch { + case i%10 == 0: + go func1(c) + case i%2 == 0: + go func2(c) + default: + go func3(c) + } + // Let goroutines block on channel + for j := 0; j < 5; j++ { + runtime.Gosched() + } + } + }) var w bytes.Buffer goroutineProf := Lookup("goroutine") @@ -967,8 +1025,11 @@ func TestGoroutineCounts(t *testing.T) { goroutineProf.WriteTo(&w, 1) prof := w.String() - if !containsInOrder(prof, "\n50 @ ", "\n40 @", "\n10 @", "\n1 @") { - t.Errorf("expected sorted goroutine counts:\n%s", prof) + labels := labelMap{"label": "value"} + labelStr := "\n# labels: " + labels.String() + if !containsInOrder(prof, "\n50 @ ", "\n44 @", labelStr, + "\n40 @", "\n36 @", labelStr, "\n10 @", "\n9 @", labelStr, "\n1 @") { + t.Errorf("expected sorted goroutine counts with Labels:\n%s", prof) } // Check proto profile @@ -981,9 +1042,18 @@ func TestGoroutineCounts(t *testing.T) { if err := p.CheckValid(); err != nil { t.Errorf("protobuf profile is invalid: %v", err) } - if !containsCounts(p, []int64{50, 40, 10, 1}) { - t.Errorf("expected count profile to contain goroutines with counts %v, got %v", - []int64{50, 40, 10, 1}, p) + expectedLabels := map[int64]map[string]string{ + 50: map[string]string{}, + 44: map[string]string{"label": "value"}, + 40: map[string]string{}, + 36: map[string]string{"label": "value"}, + 10: map[string]string{}, + 9: map[string]string{"label": "value"}, + 1: map[string]string{}, + } + if !containsCountsLabels(p, expectedLabels) { + t.Errorf("expected count profile to contain goroutines with counts and labels %v, got %v", + expectedLabels, p) } close(c) @@ -1002,10 +1072,23 @@ func containsInOrder(s string, all ...string) bool { return true } -func containsCounts(prof *profile.Profile, counts []int64) bool { +func containsCountsLabels(prof *profile.Profile, countLabels map[int64]map[string]string) bool { m := make(map[int64]int) - for _, c := range counts { + type nkey struct { + count int64 + key, val string + } + n := make(map[nkey]int) + for c, kv := range countLabels { m[c]++ + for k, v := range kv { + n[nkey{ + count: c, + key: k, + val: v, + }]++ + + } } for _, s := range prof.Sample { // The count is the single value in the sample @@ -1013,12 +1096,26 @@ func containsCounts(prof *profile.Profile, counts []int64) bool { return false } m[s.Value[0]]-- + for k, vs := range s.Label { + for _, v := range vs { + n[nkey{ + count: s.Value[0], + key: k, + val: v, + }]-- + } + } } for _, n := range m { if n > 0 { return false } } + for _, ncnt := range n { + if ncnt != 0 { + return false + } + } return true } @@ -1159,18 +1256,42 @@ func TestTracebackAll(t *testing.T) { } } -// TestTryAdd tests the cases that's hard to test with real program execution. -// For example, the current go compilers may not inline functions involved in recursion -// but that may not be true in the future compilers. This tests such cases by -// using fake call sequences and forcing the profile build utilizing -// translateCPUProfile defined in proto_test.go +// TestTryAdd tests the cases that are hard to test with real program execution. +// +// For example, the current go compilers may not always inline functions +// involved in recursion but that may not be true in the future compilers. 
This +// tests such cases by using fake call sequences and forcing the profile build +// utilizing translateCPUProfile defined in proto_test.go func TestTryAdd(t *testing.T) { - inlinedCallerPtr := uint64(funcPC(inlinedCaller)) + 1 - inlinedCalleePtr, found := findInlinedCall(inlinedCaller, 4<<10) - if !found { - t.Skip("Can't determine whether inlinedCallee was inlined into inlinedCaller.") + if _, found := findInlinedCall(inlinedCallerDump, 4<<10); !found { + t.Skip("Can't determine whether anything was inlined into inlinedCallerDump.") + } + + // inlinedCallerDump + // inlinedCalleeDump + pcs := make([]uintptr, 2) + inlinedCallerDump(pcs) + inlinedCallerStack := make([]uint64, 2) + for i := range pcs { + inlinedCallerStack[i] = uint64(pcs[i]) + } + + if _, found := findInlinedCall(recursionChainBottom, 4<<10); !found { + t.Skip("Can't determine whether anything was inlined into recursionChainBottom.") + } + + // recursionChainTop + // recursionChainMiddle + // recursionChainBottom + // recursionChainTop + // recursionChainMiddle + // recursionChainBottom + pcs = make([]uintptr, 6) + recursionChainTop(1, pcs) + recursionStack := make([]uint64, len(pcs)) + for i := range pcs { + recursionStack[i] = uint64(pcs[i]) } - inlinedCalleePtr += 1 // +1 to be safely inside of the function body. period := int64(2000 * 1000) // 1/500*1e9 nanosec. @@ -1180,13 +1301,29 @@ func TestTryAdd(t *testing.T) { wantLocs [][]string // ordered location entries with function names. wantSamples []*profile.Sample // ordered samples, we care only about Value and the profile location IDs. }{{ + // Sanity test for a normal, complete stack trace. + name: "full_stack_trace", + input: []uint64{ + 3, 0, 500, // hz = 500. Must match the period. + 5, 0, 50, inlinedCallerStack[0], inlinedCallerStack[1], + }, + wantLocs: [][]string{ + {"runtime/pprof.inlinedCalleeDump", "runtime/pprof.inlinedCallerDump"}, + }, + wantSamples: []*profile.Sample{ + {Value: []int64{50, 50 * period}, Location: []*profile.Location{{ID: 1}}}, + }, + }, { name: "bug35538", input: []uint64{ 3, 0, 500, // hz = 500. Must match the period. - 7, 0, 10, inlinedCalleePtr, inlinedCallerPtr, inlinedCalleePtr, inlinedCallerPtr, - 5, 0, 20, inlinedCalleePtr, inlinedCallerPtr, + // Fake frame: tryAdd will have inlinedCallerDump + // (stack[1]) on the deck when it encounters the next + // inline function. It should accept this. + 7, 0, 10, inlinedCallerStack[0], inlinedCallerStack[1], inlinedCallerStack[0], inlinedCallerStack[1], + 5, 0, 20, inlinedCallerStack[0], inlinedCallerStack[1], }, - wantLocs: [][]string{{"runtime/pprof.inlinedCallee", "runtime/pprof.inlinedCaller"}}, + wantLocs: [][]string{{"runtime/pprof.inlinedCalleeDump", "runtime/pprof.inlinedCallerDump"}}, wantSamples: []*profile.Sample{ {Value: []int64{10, 10 * period}, Location: []*profile.Location{{ID: 1}, {ID: 1}}}, {Value: []int64{20, 20 * period}, Location: []*profile.Location{{ID: 1}}}, @@ -1204,34 +1341,56 @@ func TestTryAdd(t *testing.T) { {Value: []int64{4242, 4242 * period}, Location: []*profile.Location{{ID: 1}}}, }, }, { - // If a function is called recursively then it must not be - // inlined in the caller. + // If a function is directly called recursively then it must + // not be inlined in the caller. // // N.B. We're generating an impossible profile here, with a - // recursive inlineCallee call. This is simulating a non-Go + // recursive inlineCalleeDump call. This is simulating a non-Go // function that looks like an inlined Go function other than // its recursive property. 
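The helpers above capture real PCs from an inlined call chain so the test can replay them. Outside the runtime, the same capture-and-expand round trip is available through runtime.Callers and runtime.CallersFrames; a sketch (whether leaf is actually inlined depends on the compiler):

package main

import (
	"fmt"
	"runtime"
)

//go:noinline
func capture(pcs []uintptr) int {
	// Skip runtime.Callers and capture itself, like dumpCallers does.
	return runtime.Callers(2, pcs)
}

func leaf(pcs []uintptr) int { return capture(pcs) } // small enough to be inlined into caller

func caller(pcs []uintptr) int { return leaf(pcs) }

func main() {
	pcs := make([]uintptr, 16)
	n := caller(pcs)

	// CallersFrames expands inlined frames, so leaf and caller can both
	// appear even if they share a single PC.
	frames := runtime.CallersFrames(pcs[:n])
	for {
		f, more := frames.Next()
		fmt.Println(f.Function)
		if !more {
			break
		}
	}
}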
See pcDeck.tryAdd. - name: "recursive_func_is_not_inlined", + name: "directly_recursive_func_is_not_inlined", input: []uint64{ 3, 0, 500, // hz = 500. Must match the period. - 5, 0, 30, inlinedCalleePtr, inlinedCalleePtr, - 4, 0, 40, inlinedCalleePtr, + 5, 0, 30, inlinedCallerStack[0], inlinedCallerStack[0], + 4, 0, 40, inlinedCallerStack[0], }, - // inlinedCaller shows up here because + // inlinedCallerDump shows up here because // runtime_expandFinalInlineFrame adds it to the stack frame. - wantLocs: [][]string{{"runtime/pprof.inlinedCallee"}, {"runtime/pprof.inlinedCaller"}}, + wantLocs: [][]string{{"runtime/pprof.inlinedCalleeDump"}, {"runtime/pprof.inlinedCallerDump"}}, wantSamples: []*profile.Sample{ {Value: []int64{30, 30 * period}, Location: []*profile.Location{{ID: 1}, {ID: 1}, {ID: 2}}}, {Value: []int64{40, 40 * period}, Location: []*profile.Location{{ID: 1}, {ID: 2}}}, }, }, { + name: "recursion_chain_inline", + input: []uint64{ + 3, 0, 500, // hz = 500. Must match the period. + 9, 0, 10, recursionStack[0], recursionStack[1], recursionStack[2], recursionStack[3], recursionStack[4], recursionStack[5], + }, + wantLocs: [][]string{ + {"runtime/pprof.recursionChainBottom"}, + { + "runtime/pprof.recursionChainMiddle", + "runtime/pprof.recursionChainTop", + "runtime/pprof.recursionChainBottom", + }, + { + "runtime/pprof.recursionChainMiddle", + "runtime/pprof.recursionChainTop", + "runtime/pprof.TestTryAdd", // inlined into the test. + }, + }, + wantSamples: []*profile.Sample{ + {Value: []int64{10, 10 * period}, Location: []*profile.Location{{ID: 1}, {ID: 2}, {ID: 3}}}, + }, + }, { name: "truncated_stack_trace_later", input: []uint64{ 3, 0, 500, // hz = 500. Must match the period. - 5, 0, 50, inlinedCalleePtr, inlinedCallerPtr, - 4, 0, 60, inlinedCalleePtr, + 5, 0, 50, inlinedCallerStack[0], inlinedCallerStack[1], + 4, 0, 60, inlinedCallerStack[0], }, - wantLocs: [][]string{{"runtime/pprof.inlinedCallee", "runtime/pprof.inlinedCaller"}}, + wantLocs: [][]string{{"runtime/pprof.inlinedCalleeDump", "runtime/pprof.inlinedCallerDump"}}, wantSamples: []*profile.Sample{ {Value: []int64{50, 50 * period}, Location: []*profile.Location{{ID: 1}}}, {Value: []int64{60, 60 * period}, Location: []*profile.Location{{ID: 1}}}, @@ -1240,10 +1399,10 @@ func TestTryAdd(t *testing.T) { name: "truncated_stack_trace_first", input: []uint64{ 3, 0, 500, // hz = 500. Must match the period. - 4, 0, 70, inlinedCalleePtr, - 5, 0, 80, inlinedCalleePtr, inlinedCallerPtr, + 4, 0, 70, inlinedCallerStack[0], + 5, 0, 80, inlinedCallerStack[0], inlinedCallerStack[1], }, - wantLocs: [][]string{{"runtime/pprof.inlinedCallee", "runtime/pprof.inlinedCaller"}}, + wantLocs: [][]string{{"runtime/pprof.inlinedCalleeDump", "runtime/pprof.inlinedCallerDump"}}, wantSamples: []*profile.Sample{ {Value: []int64{70, 70 * period}, Location: []*profile.Location{{ID: 1}}}, {Value: []int64{80, 80 * period}, Location: []*profile.Location{{ID: 1}}}, @@ -1253,9 +1412,9 @@ func TestTryAdd(t *testing.T) { name: "truncated_stack_trace_only", input: []uint64{ 3, 0, 500, // hz = 500. Must match the period. 
- 4, 0, 70, inlinedCalleePtr, + 4, 0, 70, inlinedCallerStack[0], }, - wantLocs: [][]string{{"runtime/pprof.inlinedCallee", "runtime/pprof.inlinedCaller"}}, + wantLocs: [][]string{{"runtime/pprof.inlinedCalleeDump", "runtime/pprof.inlinedCallerDump"}}, wantSamples: []*profile.Sample{ {Value: []int64{70, 70 * period}, Location: []*profile.Location{{ID: 1}}}, }, @@ -1264,12 +1423,16 @@ func TestTryAdd(t *testing.T) { name: "truncated_stack_trace_twice", input: []uint64{ 3, 0, 500, // hz = 500. Must match the period. - 4, 0, 70, inlinedCalleePtr, - 5, 0, 80, inlinedCallerPtr, inlinedCalleePtr, + 4, 0, 70, inlinedCallerStack[0], + // Fake frame: add a fake call to + // inlinedCallerDump to prevent this sample + // from getting merged into above. + 5, 0, 80, inlinedCallerStack[1], inlinedCallerStack[0], }, wantLocs: [][]string{ - {"runtime/pprof.inlinedCallee", "runtime/pprof.inlinedCaller"}, - {"runtime/pprof.inlinedCaller"}}, + {"runtime/pprof.inlinedCalleeDump", "runtime/pprof.inlinedCallerDump"}, + {"runtime/pprof.inlinedCallerDump"}, + }, wantSamples: []*profile.Sample{ {Value: []int64{70, 70 * period}, Location: []*profile.Location{{ID: 1}}}, {Value: []int64{80, 80 * period}, Location: []*profile.Location{{ID: 2}, {ID: 1}}}, diff --git a/libgo/go/runtime/pprof/proto.go b/libgo/go/runtime/pprof/proto.go index 15fa44b991e..bd269e7a4a7 100644 --- a/libgo/go/runtime/pprof/proto.go +++ b/libgo/go/runtime/pprof/proto.go @@ -474,7 +474,7 @@ func (b *profileBuilder) appendLocsForStack(locs []uint64, stk []uintptr) (newLo // have the following properties: // Frame's Func is nil (note: also true for non-Go functions), and // Frame's Entry matches its entry function frame's Entry (note: could also be true for recursive calls and non-Go functions), and -// Frame's Name does not match its entry function frame's name (note: inlined functions cannot be recursive). +// Frame's Name does not match its entry function frame's name (note: inlined functions cannot be directly recursive). // // As reading and processing the pcs in a stack trace one by one (from leaf to the root), // we use pcDeck to temporarily hold the observed pcs and their expanded frames diff --git a/libgo/go/runtime/pprof/proto_test.go b/libgo/go/runtime/pprof/proto_test.go index 81cd5591d1d..e8efd4aa1ba 100644 --- a/libgo/go/runtime/pprof/proto_test.go +++ b/libgo/go/runtime/pprof/proto_test.go @@ -8,13 +8,13 @@ import ( "bytes" "encoding/json" "fmt" + "internal/profile" "internal/testenv" "io/ioutil" "os" "os/exec" "reflect" "runtime" - "runtime/pprof/internal/profile" "strings" "testing" ) diff --git a/libgo/go/runtime/pprof/protomem_test.go b/libgo/go/runtime/pprof/protomem_test.go index 471b1ae9c32..156f6286a92 100644 --- a/libgo/go/runtime/pprof/protomem_test.go +++ b/libgo/go/runtime/pprof/protomem_test.go @@ -6,8 +6,8 @@ package pprof import ( "bytes" + "internal/profile" "runtime" - "runtime/pprof/internal/profile" "testing" ) diff --git a/libgo/go/runtime/preempt.go b/libgo/go/runtime/preempt.go index 1a8f9ac4a3d..9a78bcf51b8 100644 --- a/libgo/go/runtime/preempt.go +++ b/libgo/go/runtime/preempt.go @@ -56,6 +56,11 @@ import ( "runtime/internal/atomic" ) +// Keep in sync with cmd/compile/internal/gc/plive.go:go115ReduceLiveness. +const go115ReduceLiveness = true + +const go115RestartSeq = go115ReduceLiveness && true // enable restartable sequences + type suspendGState struct { g *g @@ -328,26 +333,30 @@ func wantAsyncPreempt(gp *g) bool { // 3. 
It's generally safe to interact with the runtime, even if we're // in a signal handler stopped here. For example, there are no runtime // locks held, so acquiring a runtime lock won't self-deadlock. -func isAsyncSafePoint(gp *g, pc uintptr) bool { +// +// In some cases the PC is safe for asynchronous preemption but it +// also needs to adjust the resumption PC. The new PC is returned in +// the second result. +func isAsyncSafePoint(gp *g, pc uintptr) (bool, uintptr) { mp := gp.m // Only user Gs can have safe-points. We check this first // because it's extremely common that we'll catch mp in the // scheduler processing this G preemption. if mp.curg != gp { - return false + return false, 0 } // Check M state. if mp.p == 0 || !canPreemptM(mp) { - return false + return false, 0 } // Check if PC is an unsafe-point. f := FuncForPC(pc) if f == nil { // Not Go code. - return false + return false, 0 } name := f.Name() if hasPrefix(name, "runtime.") || @@ -363,8 +372,7 @@ func isAsyncSafePoint(gp *g, pc uintptr) bool { // // TODO(austin): We should improve this, or opt things // in incrementally. - return false + return false, 0 } - - return true + return true, pc } diff --git a/libgo/go/runtime/print.go b/libgo/go/runtime/print.go index 7729ddc0a74..e42023f889c 100644 --- a/libgo/go/runtime/print.go +++ b/libgo/go/runtime/print.go @@ -6,6 +6,7 @@ package runtime import ( "runtime/internal/atomic" + "runtime/internal/sys" "unsafe" ) @@ -276,3 +277,56 @@ func printeface(e eface) { func printiface(i iface) { print("(", i.tab, ",", i.data, ")") } + +// hexdumpWords prints a word-oriented hex dump of [p, end). +// +// If mark != nil, it will be called with each printed word's address +// and should return a character mark to appear just before that +// word's value. It can return 0 to indicate no mark. +func hexdumpWords(p, end uintptr, mark func(uintptr) byte) { + p1 := func(x uintptr) { + var buf [2 * sys.PtrSize]byte + for i := len(buf) - 1; i >= 0; i-- { + if x&0xF < 10 { + buf[i] = byte(x&0xF) + '0' + } else { + buf[i] = byte(x&0xF) - 10 + 'a' + } + x >>= 4 + } + gwrite(buf[:]) + } + + printlock() + var markbuf [1]byte + markbuf[0] = ' ' + for i := uintptr(0); p+i < end; i += sys.PtrSize { + if i%16 == 0 { + if i != 0 { + println() + } + p1(p + i) + print(": ") + } + + if mark != nil { + markbuf[0] = mark(p + i) + if markbuf[0] == 0 { + markbuf[0] = ' ' + } + } + gwrite(markbuf[:]) + val := *(*uintptr)(unsafe.Pointer(p + i)) + p1(val) + print(" ") + + // Can we symbolize val? + name, _, _, _ := funcfileline(val, -1, false) + if name != "" { + entry := funcentry(val) + print("<", name, "+", val-entry, "> ") + } + } + println() + printunlock() +} diff --git a/libgo/go/runtime/proc.go b/libgo/go/runtime/proc.go index e0981377512..8f6eb6c6122 100644 --- a/libgo/go/runtime/proc.go +++ b/libgo/go/runtime/proc.go @@ -139,6 +139,7 @@ var modinfo string var ( m0 m g0 g + mcache0 *mcache raceprocctx0 uintptr ) @@ -279,13 +280,14 @@ func forcegchelper() { setSystemGoroutine() forcegc.g = getg() + lockInit(&forcegc.lock, lockRankForcegc) for { lock(&forcegc.lock) if forcegc.idle != 0 { throw("forcegc: phase error") } atomic.Store(&forcegc.idle, 1) - goparkunlock(&forcegc.lock, waitReasonForceGGIdle, traceEvGoBlock, 1) + goparkunlock(&forcegc.lock, waitReasonForceGCIdle, traceEvGoBlock, 1) // this goroutine is explicitly resumed by sysmon if debug.gctrace > 0 { println("GC forced") @@ -542,6 +544,22 @@ func ginit() { // // The new G calls runtime·main. 
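hexdumpWords above formats machine words as fixed-width hex one nibble at a time, since fmt is off limits inside the runtime. A user-level sketch of the same formatting (the helper name is illustrative):

package main

import (
	"fmt"
	"unsafe"
)

// hexWord renders x as a zero-padded hexadecimal word, two characters per
// byte, mirroring the p1 helper inside hexdumpWords.
func hexWord(x uintptr) string {
	var buf [2 * unsafe.Sizeof(uintptr(0))]byte
	for i := len(buf) - 1; i >= 0; i-- {
		d := byte(x & 0xF)
		if d < 10 {
			buf[i] = d + '0'
		} else {
			buf[i] = d - 10 + 'a'
		}
		x >>= 4
	}
	return string(buf[:])
}

func main() {
	fmt.Println(hexWord(0xdeadbeef)) // 00000000deadbeef on a 64-bit platform
}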
func schedinit() { + lockInit(&sched.lock, lockRankSched) + lockInit(&sched.sysmonlock, lockRankSysmon) + lockInit(&sched.deferlock, lockRankDefer) + lockInit(&sched.sudoglock, lockRankSudog) + lockInit(&deadlock, lockRankDeadlock) + lockInit(&paniclk, lockRankPanic) + lockInit(&allglock, lockRankAllg) + lockInit(&allpLock, lockRankAllp) + // lockInit(&reflectOffs.lock, lockRankReflectOffs) + lockInit(&finlock, lockRankFin) + lockInit(&trace.bufLock, lockRankTraceBuf) + lockInit(&trace.stringsLock, lockRankTraceStrings) + lockInit(&trace.lock, lockRankTrace) + lockInit(&cpuprof.lock, lockRankCpuprof) + lockInit(&trace.stackTab.lock, lockRankTraceStackTab) + _g_ := getg() sched.maxmcount = 10000 @@ -675,9 +693,7 @@ func ready(gp *g, traceskip int, next bool) { // status is Gwaiting or Gscanwaiting, make Grunnable and put on runq casgstatus(gp, _Gwaiting, _Grunnable) runqput(_g_.m.p.ptr(), gp, next) - if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 { - wakep() - } + wakep() releasem(mp) } @@ -747,6 +763,7 @@ func casfrom_Gscanstatus(gp *g, oldval, newval uint32) { dumpgstatus(gp) throw("casfrom_Gscanstatus: gp->status is not in scan state") } + releaseLockRank(lockRankGscan) } // This will return false if the gp is not in the expected status and the cas fails. @@ -758,7 +775,12 @@ func castogscanstatus(gp *g, oldval, newval uint32) bool { _Gwaiting, _Gsyscall: if newval == oldval|_Gscan { - return atomic.Cas(&gp.atomicstatus, oldval, newval) + r := atomic.Cas(&gp.atomicstatus, oldval, newval) + if r { + acquireLockRank(lockRankGscan) + } + return r + } } print("runtime: castogscanstatus oldval=", hex(oldval), " newval=", hex(newval), "\n") @@ -779,6 +801,9 @@ func casgstatus(gp *g, oldval, newval uint32) { }) } + acquireLockRank(lockRankGscan) + releaseLockRank(lockRankGscan) + // See https://golang.org/cl/21503 for justification of the yield delay. const yieldDelay = 5 * 1000 var nextYield int64 @@ -811,6 +836,7 @@ func casGToPreemptScan(gp *g, old, new uint32) { if old != _Grunning || new != _Gscan|_Gpreempted { throw("bad g transition") } + acquireLockRank(lockRankGscan) for !atomic.Cas(&gp.atomicstatus, _Grunning, _Gscan|_Gpreempted) { } } @@ -841,8 +867,23 @@ func casGFromPreempted(gp *g, old, new uint32) bool { // goroutines. func stopTheWorld(reason string) { semacquire(&worldsema) - getg().m.preemptoff = reason - systemstack(stopTheWorldWithSema) + gp := getg() + gp.m.preemptoff = reason + systemstack(func() { + // Mark the goroutine which called stopTheWorld preemptible so its + // stack may be scanned. + // This lets a mark worker scan us while we try to stop the world + // since otherwise we could get in a mutual preemption deadlock. + // We must not modify anything on the G stack because a stack shrink + // may occur. A stack shrink is otherwise OK though because in order + // to return from this function (and to leave the system stack) we + // must have preempted all goroutines, including any attempting + // to scan our stack, in which case, any stack shrinking will + // have already completed by the time we exit. + casgstatus(gp, _Grunning, _Gwaiting) + stopTheWorldWithSema() + casgstatus(gp, _Gwaiting, _Grunning) + }) } // startTheWorld undoes the effects of stopTheWorld. @@ -854,10 +895,31 @@ func startTheWorld() { getg().m.preemptoff = "" } -// Holding worldsema grants an M the right to try to stop the world -// and prevents gomaxprocs from changing concurrently. 
+// stopTheWorldGC has the same effect as stopTheWorld, but blocks +// until the GC is not running. It also blocks a GC from starting +// until startTheWorldGC is called. +func stopTheWorldGC(reason string) { + semacquire(&gcsema) + stopTheWorld(reason) +} + +// startTheWorldGC undoes the effects of stopTheWorldGC. +func startTheWorldGC() { + startTheWorld() + semrelease(&gcsema) +} + +// Holding worldsema grants an M the right to try to stop the world. var worldsema uint32 = 1 +// Holding gcsema grants the M the right to block a GC, and blocks +// until the current GC is done. In particular, it prevents gomaxprocs +// from changing concurrently. +// +// TODO(mknyszek): Once gomaxprocs and the execution tracer can handle +// being changed/enabled during a GC, remove this. +var gcsema uint32 = 1 + // stopTheWorldWithSema is the core implementation of stopTheWorld. // The caller is responsible for acquiring worldsema and disabling // preemption first and then should stopTheWorldWithSema on the system @@ -1003,9 +1065,7 @@ func startTheWorldWithSema(emitTraceEvent bool) int64 { // Wakeup an additional proc in case we have excessive runnable goroutines // in local queues or in the global queue. If we don't, the proc will park itself. // If we have lots of excessive work, resetspinning will unpark additional procs as necessary. - if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 { - wakep() - } + wakep() releasem(mp) @@ -1599,8 +1659,7 @@ func lockextra(nilokay bool) *m { for { old := atomic.Loaduintptr(&extram) if old == locked { - yield := osyield - yield() + osyield() continue } if old == 0 && !nilokay { @@ -1617,8 +1676,7 @@ func lockextra(nilokay bool) *m { if atomic.Casuintptr(&extram, old, locked) { return (*m)(unsafe.Pointer(old)) } - yield := osyield - yield() + osyield() continue } } @@ -1894,8 +1952,11 @@ func handoffp(_p_ *p) { // Tries to add one more P to execute G's. // Called when a G is made runnable (newproc, ready). func wakep() { + if atomic.Load(&sched.npidle) == 0 { + return + } // be conservative about spinning threads - if !atomic.Cas(&sched.nmspinning, 0, 1) { + if atomic.Load(&sched.nmspinning) != 0 || !atomic.Cas(&sched.nmspinning, 0, 1) { return } startm(nil, true) @@ -2111,11 +2172,14 @@ top: // Consider stealing timers from p2. // This call to checkTimers is the only place where // we hold a lock on a different P's timers. - // Lock contention can be a problem here, so avoid - // grabbing the lock if p2 is running and not marked - // for preemption. If p2 is running and not being - // preempted we assume it will handle its own timers. - if i > 2 && shouldStealTimers(p2) { + // Lock contention can be a problem here, so + // initially avoid grabbing the lock if p2 is running + // and is not marked for preemption. If p2 is running + // and not being preempted we assume it will handle its + // own timers. + // If we're still looking for work after checking all + // the P's, then go ahead and steal from an active P. + if i > 2 || (i > 1 && shouldStealTimers(p2)) { tnow, w, ran := checkTimers(p2, now) now = tnow if w != 0 && (pollUntil == 0 || w < pollUntil) { @@ -2166,9 +2230,17 @@ stop: // wasm only: // If a callback returned and no other goroutine is awake, - // then pause execution until a callback was triggered. - if beforeIdle(delta) { - // At least one goroutine got woken. + // then wake event handler goroutine which pauses execution + // until a callback was triggered. 
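The reworked wakep above now returns immediately when no P is idle and still claims the single spinning slot with a compare-and-swap, so at most one extra M is started per wakeup. A generic sketch of that shape using plain atomics (counter names are illustrative, not the runtime's):

package main

import (
	"fmt"
	"sync/atomic"
)

var (
	nidle     int32 // idle workers available to run work
	nspinning int32 // 0 or 1: is a worker already searching for work?
)

// maybeWake mirrors the shape of wakep: do nothing if no one is idle, and
// be conservative about starting a second searcher.
func maybeWake(start func()) {
	if atomic.LoadInt32(&nidle) == 0 {
		return
	}
	if atomic.LoadInt32(&nspinning) != 0 || !atomic.CompareAndSwapInt32(&nspinning, 0, 1) {
		return
	}
	start()
}

func main() {
	atomic.StoreInt32(&nidle, 1)
	maybeWake(func() { fmt.Println("woke one worker") })
	maybeWake(func() { fmt.Println("not printed: the spinning slot is already claimed") })
}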
+ gp, otherReady := beforeIdle(delta) + if gp != nil { + casgstatus(gp, _Gwaiting, _Grunnable) + if trace.enabled { + traceGoUnpark(gp, 0) + } + return gp, false + } + if otherReady { goto top } @@ -2358,12 +2430,16 @@ func resetspinning() { // M wakeup policy is deliberately somewhat conservative, so check if we // need to wakeup another P here. See "Worker thread parking/unparking" // comment at the top of the file for details. - if nmspinning == 0 && atomic.Load(&sched.npidle) > 0 { - wakep() - } + wakep() } -// Injects the list of runnable G's into the scheduler and clears glist. +// injectglist adds each runnable G on the list to some run queue, +// and clears glist. If there is no current P, they are added to the +// global queue, and up to npidle M's are started to run them. +// Otherwise, for each idle P, this adds a G to the global queue +// and starts an M. Any remaining G's are added to the current P's +// local run queue. +// This may temporarily acquire the scheduler lock. // Can run concurrently with GC. func injectglist(glist *gList) { if glist.empty() { @@ -2374,18 +2450,52 @@ func injectglist(glist *gList) { traceGoUnpark(gp, 0) } } + + // Mark all the goroutines as runnable before we put them + // on the run queues. + head := glist.head.ptr() + var tail *g + qsize := 0 + for gp := head; gp != nil; gp = gp.schedlink.ptr() { + tail = gp + qsize++ + casgstatus(gp, _Gwaiting, _Grunnable) + } + + // Turn the gList into a gQueue. + var q gQueue + q.head.set(head) + q.tail.set(tail) + *glist = gList{} + + startIdle := func(n int) { + for ; n != 0 && sched.npidle != 0; n-- { + startm(nil, false) + } + } + + pp := getg().m.p.ptr() + if pp == nil { + lock(&sched.lock) + globrunqputbatch(&q, int32(qsize)) + unlock(&sched.lock) + startIdle(qsize) + return + } + lock(&sched.lock) + npidle := int(sched.npidle) var n int - for n = 0; !glist.empty(); n++ { - gp := glist.pop() - casgstatus(gp, _Gwaiting, _Grunnable) - globrunqput(gp) + for n = 0; n < npidle && !q.empty(); n++ { + globrunqput(q.pop()) } unlock(&sched.lock) - for ; n != 0 && sched.npidle != 0; n-- { - startm(nil, false) + startIdle(n) + qsize -= n + + if !q.empty() { + runqputbatch(pp, &q, qsize) } - *glist = gList{} } // One round of scheduler: find a runnable goroutine and execute it. @@ -2509,9 +2619,7 @@ top: // If about to schedule a not-normal goroutine (a GCworker or tracereader), // wake a P if there is one. if tryWakeP { - if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 { - wakep() - } + wakep() } if gp.lockedm != 0 { // Hands off own p to the locked m, @@ -2861,7 +2969,6 @@ func reentersyscall(pc, sp uintptr) { _g_.m.syscalltick = _g_.m.p.ptr().syscalltick _g_.sysblocktraced = true - _g_.m.mcache = nil pp := _g_.m.p.ptr() pp.m = 0 _g_.m.oldp.set(pp) @@ -2945,9 +3052,6 @@ func exitsyscall() { oldp := _g_.m.oldp.ptr() _g_.m.oldp = 0 if exitsyscallfast(oldp) { - if _g_.m.mcache == nil { - throw("lost mcache") - } if trace.enabled { if oldp != _g_.m.p.ptr() || _g_.m.syscalltick != _g_.m.p.ptr().syscalltick { systemstack(traceGoStart) @@ -2996,10 +3100,6 @@ func exitsyscall() { // Call the scheduler. mcall(exitsyscall0) - if _g_.m.mcache == nil { - throw("lost mcache") - } - // Scheduler returned, so we're allowed to run now. // Delete the syscallsp information that we left for // the garbage collector during the system call. 
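The rewritten injectglist above readies the whole batch first, hands one goroutine per idle P to the global queue (starting that many Ms), and keeps the remainder on the current P's local run queue. A much-simplified, single-threaded sketch of that split, with slices standing in for the runtime's queues (all names are illustrative):

package main

import (
	"fmt"
	"sync"
)

type scheduler struct {
	mu      sync.Mutex
	global  []string // work visible to all workers
	idle    int      // number of idle workers
	local   []string // current worker's private run queue
	started int      // workers started for this injection
}

// inject distributes a batch: one item per idle worker goes to the global
// queue, so a freshly started worker finds something to do; the rest stays
// on the local queue of the caller.
func (s *scheduler) inject(batch []string) {
	s.mu.Lock()
	n := len(batch)
	if n > s.idle {
		n = s.idle
	}
	s.global = append(s.global, batch[:n]...)
	s.mu.Unlock()

	s.started += n // stand-in for calling startm once per moved item
	s.local = append(s.local, batch[n:]...)
}

func main() {
	s := &scheduler{idle: 2}
	s.inject([]string{"g1", "g2", "g3", "g4", "g5"})
	fmt.Println("global:", s.global, "local:", s.local, "workers started:", s.started)
	// global: [g1 g2] local: [g3 g4 g5] workers started: 2
}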
@@ -3305,12 +3405,14 @@ func newproc(fn uintptr, arg unsafe.Pointer) *g { makeGContext(newg, sp, spsize) + releasem(_g_.m) + runqput(_p_, newg, true) - if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 && mainStarted { + if mainStarted { wakep() } - releasem(_g_.m) + return newg } @@ -3772,10 +3874,12 @@ func (pp *p) init(id int32) { pp.wbBuf.reset() if pp.mcache == nil { if id == 0 { - if getg().m.mcache == nil { + if mcache0 == nil { throw("missing mcache?") } - pp.mcache = getg().m.mcache // bootstrap + // Use the bootstrap mcache0. Only one P will get + // mcache0: the one with ID 0. + pp.mcache = mcache0 } else { pp.mcache = allocmcache() } @@ -3788,6 +3892,7 @@ func (pp *p) init(id int32) { pp.raceprocctx = raceproccreate() } } + lockInit(&pp.timersLock, lockRankTimers) } // destroy releases all of the resources associated with pp and @@ -3934,7 +4039,6 @@ func procresize(nprocs int32) *p { _g_.m.p.ptr().m = 0 } _g_.m.p = 0 - _g_.m.mcache = nil p := allp[0] p.m = 0 p.status = _Pidle @@ -3944,6 +4048,9 @@ func procresize(nprocs int32) *p { } } + // g.m.p is now set, so we no longer need mcache0 for bootstrapping. + mcache0 = nil + // release resources from unused P's for i := nprocs; i < old; i++ { p := allp[i] @@ -4009,7 +4116,7 @@ func acquirep(_p_ *p) { func wirep(_p_ *p) { _g_ := getg() - if _g_.m.p != 0 || _g_.m.mcache != nil { + if _g_.m.p != 0 { throw("wirep: already in go") } if _p_.m != 0 || _p_.status != _Pidle { @@ -4020,7 +4127,6 @@ func wirep(_p_ *p) { print("wirep: p->m=", _p_.m, "(", id, ") p->status=", _p_.status, "\n") throw("wirep: invalid p state") } - _g_.m.mcache = _p_.mcache _g_.m.p.set(_p_) _p_.m.set(_g_.m) _p_.status = _Prunning @@ -4030,19 +4136,18 @@ func wirep(_p_ *p) { func releasep() *p { _g_ := getg() - if _g_.m.p == 0 || _g_.m.mcache == nil { + if _g_.m.p == 0 { throw("releasep: invalid arg") } _p_ := _g_.m.p.ptr() - if _p_.m.ptr() != _g_.m || _p_.mcache != _g_.m.mcache || _p_.status != _Prunning { - print("releasep: m=", _g_.m, " m->p=", _g_.m.p.ptr(), " p->m=", hex(_p_.m), " m->mcache=", _g_.m.mcache, " p->mcache=", _p_.mcache, " p->status=", _p_.status, "\n") + if _p_.m.ptr() != _g_.m || _p_.status != _Prunning { + print("releasep: m=", _g_.m, " m->p=", _g_.m.p.ptr(), " p->m=", hex(_p_.m), " p->status=", _p_.status, "\n") throw("releasep: invalid p state") } if trace.enabled { traceProcStop(_g_.m.p.ptr()) } _g_.m.p = 0 - _g_.m.mcache = nil _p_.m = 0 _p_.status = _Pidle return _p_ @@ -4222,6 +4327,18 @@ func sysmon() { } unlock(&sched.lock) } + lock(&sched.sysmonlock) + { + // If we spent a long time blocked on sysmonlock + // then we want to update now and next since it's + // likely stale. + now1 := nanotime() + if now1-now > 50*1000 /* 50µs */ { + next, _ = timeSleepUntil() + } + now = now1 + } + // trigger libc interceptors if needed if *cgo_yield != nil { asmcgocall(*cgo_yield, nil) @@ -4250,6 +4367,10 @@ func sysmon() { // Try to start an M to run them. startm(nil, false) } + if atomic.Load(&scavenge.sysmonWake) != 0 { + // Kick the scavenger awake if someone requested it. + wakeScavenger() + } // retake P's blocked in syscalls // and preempt long running G's if retake(now) != 0 { @@ -4270,6 +4391,7 @@ func sysmon() { lasttrace = now schedtrace(debug.scheddetail > 0) } + unlock(&sched.sysmonlock) } } @@ -4747,6 +4869,38 @@ func runqputslow(_p_ *p, gp *g, h, t uint32) bool { return true } +// runqputbatch tries to put all the G's on q on the local runnable queue. 
+// If the queue is full, they are put on the global queue; in that case +// this will temporarily acquire the scheduler lock. +// Executed only by the owner P. +func runqputbatch(pp *p, q *gQueue, qsize int) { + h := atomic.LoadAcq(&pp.runqhead) + t := pp.runqtail + n := uint32(0) + for !q.empty() && t-h < uint32(len(pp.runq)) { + gp := q.pop() + pp.runq[t%uint32(len(pp.runq))].set(gp) + t++ + n++ + } + qsize -= int(n) + + if randomizeScheduler { + off := func(o uint32) uint32 { + return (pp.runqtail + o) % uint32(len(pp.runq)) + } + for i := uint32(1); i < n; i++ { + j := fastrandn(i + 1) + pp.runq[off(i)], pp.runq[off(j)] = pp.runq[off(j)], pp.runq[off(i)] + } + } + + atomic.StoreRel(&pp.runqtail, t) + if !q.empty() { + globrunqputbatch(q, int32(qsize)) + } +} + // Get g from local runnable queue. // If inheritTime is true, gp should inherit the remaining time in the // current time slice. Otherwise, it should start a new time slice. diff --git a/libgo/go/runtime/proc_test.go b/libgo/go/runtime/proc_test.go index 5f96d648d98..b9828d98183 100644 --- a/libgo/go/runtime/proc_test.go +++ b/libgo/go/runtime/proc_test.go @@ -6,6 +6,7 @@ package runtime_test import ( "fmt" + "internal/race" "internal/testenv" "math" "net" @@ -429,6 +430,11 @@ func TestPingPongHog(t *testing.T) { if testing.Short() { t.Skip("skipping in -short mode") } + if race.Enabled { + // The race detector randomizes the scheduler, + // which causes this test to fail (#38266). + t.Skip("skipping in -race mode") + } defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(1)) done := make(chan bool) @@ -1062,3 +1068,22 @@ loop: t.Errorf("netpollBreak did not interrupt netpoll: slept for: %v", dur) } } + +// TestBigGOMAXPROCS tests that setting GOMAXPROCS to a large value +// doesn't cause a crash at startup. See issue 38474. +func TestBigGOMAXPROCS(t *testing.T) { + t.Parallel() + output := runTestProg(t, "testprog", "NonexistentTest", "GOMAXPROCS=1024") + // Ignore error conditions on small machines. + for _, errstr := range []string{ + "failed to create new OS thread", + "cannot allocate memory", + } { + if strings.Contains(output, errstr) { + t.Skipf("failed to create 1024 threads") + } + } + if !strings.Contains(output, "unknown function: NonexistentTest") { + t.Errorf("output:\n%s\nwanted:\nunknown function: NonexistentTest", output) + } +} diff --git a/libgo/go/runtime/runtime1.go b/libgo/go/runtime/runtime1.go index 6edf7a5acec..a8a53d3052c 100644 --- a/libgo/go/runtime/runtime1.go +++ b/libgo/go/runtime/runtime1.go @@ -475,8 +475,3 @@ func releasem(mp *m) { // _g_.stackguard0 = stackPreempt // } } - -//go:nosplit -func gomcache() *mcache { - return getg().m.mcache -} diff --git a/libgo/go/runtime/runtime2.go b/libgo/go/runtime/runtime2.go index 75b42f71309..5029dbad027 100644 --- a/libgo/go/runtime/runtime2.go +++ b/libgo/go/runtime/runtime2.go @@ -164,7 +164,10 @@ const ( // as fast as spin locks (just a few user-level instructions), // but on the contention path they sleep in the kernel. // A zeroed Mutex is unlocked (no need to initialize each lock). +// Initialization is helpful for static lock ranking, but not required. type mutex struct { + // Empty struct if lock ranking is disabled, otherwise includes the lock rank + lockRankStruct // Futex-based impl treats it as uint32 key, // while sema-based impl as M* waitm. // Used to be a union, but unions break precise GC. 
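runqputbatch above fills the fixed-size local ring until it is full, shuffles only the entries it just appended when randomizeScheduler is on, and spills the rest to the global queue. A simplified, single-threaded sketch with a plain array ring (no atomics; names are illustrative):

package main

import (
	"fmt"
	"math/rand"
)

const ringSize = 4

type runq struct {
	ring   [ringSize]int
	head   uint32
	tail   uint32
	global []int
}

// putBatch adds items to the ring until it is full, shuffles the entries
// added in this call (like randomizeScheduler), and overflows the rest.
func (q *runq) putBatch(batch []int) {
	t := q.tail
	var n uint32
	for len(batch) > 0 && t-q.head < ringSize {
		q.ring[t%ringSize] = batch[0]
		batch = batch[1:]
		t++
		n++
	}
	// Shuffle only the freshly appended tail entries.
	off := func(o uint32) uint32 { return (q.tail + o) % ringSize }
	for i := uint32(1); i < n; i++ {
		j := uint32(rand.Intn(int(i + 1)))
		q.ring[off(i)], q.ring[off(j)] = q.ring[off(j)], q.ring[off(i)]
	}
	q.tail = t
	q.global = append(q.global, batch...) // whatever did not fit
}

func main() {
	var q runq
	q.putBatch([]int{1, 2, 3, 4, 5, 6})
	fmt.Println("ring:", q.ring, "global:", q.global)
	// ring holds the first four items (possibly reordered), global holds [5 6]
}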
@@ -334,12 +337,9 @@ type sudog struct { g *g - // isSelect indicates g is participating in a select, so - // g.selectDone must be CAS'd to win the wake-up race. - isSelect bool - next *sudog - prev *sudog - elem unsafe.Pointer // data element (may point to stack) + next *sudog + prev *sudog + elem unsafe.Pointer // data element (may point to stack) // The following fields are never accessed concurrently. // For channels, waitlink is only accessed by g. @@ -349,10 +349,15 @@ type sudog struct { acquiretime int64 releasetime int64 ticket uint32 - parent *sudog // semaRoot binary tree - waitlink *sudog // g.waiting list or semaRoot - waittail *sudog // semaRoot - c *hchan // channel + + // isSelect indicates g is participating in a select, so + // g.selectDone must be CAS'd to win the wake-up race. + isSelect bool + + parent *sudog // semaRoot binary tree + waitlink *sudog // g.waiting list or semaRoot + waittail *sudog // semaRoot + c *hchan // channel } /* @@ -393,6 +398,12 @@ type stack struct { } */ +// heldLockInfo gives info on a held lock and the rank of that lock +type heldLockInfo struct { + lockAddr uintptr + rank lockRank +} + type g struct { // Stack parameters. // stack describes the actual stack memory: [stack.lo, stack.hi). @@ -566,7 +577,6 @@ type m struct { park note alllink *m // on allm schedlink muintptr - mcache *mcache lockedg guintptr createstack [32]location // stack that created this thread. lockedExt uint32 // tracking for external LockOSThread @@ -601,6 +611,10 @@ type m struct { mOS + // Up to 10 locks held by this m, maintained by the lock ranking code. + locksHeldLen int + locksHeld [10]heldLockInfo + // Remaining fields are specific to gccgo. gsignalstack unsafe.Pointer // stack for gsignal @@ -817,6 +831,12 @@ type schedt struct { procresizetime int64 // nanotime() of last change to gomaxprocs totaltime int64 // ∫gomaxprocs dt up to procresizetime + + // sysmonlock protects sysmon's actions on the runtime. + // + // Acquire and hold this mutex to block sysmon from interacting + // with the rest of the runtime. + sysmonlock mutex } // Values for the flags field of a sigTabT. 
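The new locksHeld array on m above gives the lock-ranking checker a per-thread record of up to 10 held locks and their ranks. The checker itself lives elsewhere in the runtime; as a loose sketch of the idea only (not the runtime's actual rules), one can keep held ranks per worker and require each new acquisition to have a rank no lower than the last:

package main

import "fmt"

type lockRank int

const (
	rankSched lockRank = iota + 1
	rankAllg
	rankHchan
)

// heldLocks mimics m.locksHeld: a small fixed array plus a length.
type heldLocks struct {
	n     int
	ranks [10]lockRank
}

// acquire enforces a simplified rule: ranks must be taken in non-decreasing
// order. A fixed global order is what rules out deadlock cycles.
func (h *heldLocks) acquire(r lockRank) {
	if h.n > 0 && r < h.ranks[h.n-1] {
		panic(fmt.Sprintf("lock ordering violation: rank %d acquired while holding rank %d", r, h.ranks[h.n-1]))
	}
	h.ranks[h.n] = r
	h.n++
}

func (h *heldLocks) release() { h.n-- }

func main() {
	var h heldLocks
	h.acquire(rankSched)
	h.acquire(rankAllg) // ok: higher rank
	h.release()
	h.release()
	h.acquire(rankHchan)
	h.acquire(rankSched) // panics: lower rank taken while a higher one is held
}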
@@ -985,7 +1005,7 @@ const ( waitReasonChanReceive // "chan receive" waitReasonChanSend // "chan send" waitReasonFinalizerWait // "finalizer wait" - waitReasonForceGGIdle // "force gc (idle)" + waitReasonForceGCIdle // "force gc (idle)" waitReasonSemacquire // "semacquire" waitReasonSleep // "sleep" waitReasonSyncCondWait // "sync.Cond.Wait" @@ -994,6 +1014,7 @@ const ( waitReasonWaitForGCCycle // "wait for GC cycle" waitReasonGCWorkerIdle // "GC worker (idle)" waitReasonPreempted // "preempted" + waitReasonDebugCall // "debug call" ) var waitReasonStrings = [...]string{ @@ -1014,7 +1035,7 @@ var waitReasonStrings = [...]string{ waitReasonChanReceive: "chan receive", waitReasonChanSend: "chan send", waitReasonFinalizerWait: "finalizer wait", - waitReasonForceGGIdle: "force gc (idle)", + waitReasonForceGCIdle: "force gc (idle)", waitReasonSemacquire: "semacquire", waitReasonSleep: "sleep", waitReasonSyncCondWait: "sync.Cond.Wait", @@ -1023,6 +1044,7 @@ var waitReasonStrings = [...]string{ waitReasonWaitForGCCycle: "wait for GC cycle", waitReasonGCWorkerIdle: "GC worker (idle)", waitReasonPreempted: "preempted", + waitReasonDebugCall: "debug call", } func (w waitReason) String() string { diff --git a/libgo/go/runtime/rwmutex.go b/libgo/go/runtime/rwmutex.go index a6da4c979ba..7713c3f1ccf 100644 --- a/libgo/go/runtime/rwmutex.go +++ b/libgo/go/runtime/rwmutex.go @@ -39,7 +39,7 @@ func (rw *rwmutex) rlock() { if int32(atomic.Xadd(&rw.readerCount, 1)) < 0 { // A writer is pending. Park on the reader queue. systemstack(func() { - lock(&rw.rLock) + lockWithRank(&rw.rLock, lockRankRwmutexR) if rw.readerPass > 0 { // Writer finished. rw.readerPass -= 1 @@ -67,7 +67,7 @@ func (rw *rwmutex) runlock() { // A writer is pending. if atomic.Xadd(&rw.readerWait, -1) == 0 { // The last reader unblocks the writer. - lock(&rw.rLock) + lockWithRank(&rw.rLock, lockRankRwmutexR) w := rw.writer.ptr() if w != nil { notewakeup(&w.park) @@ -81,12 +81,12 @@ func (rw *rwmutex) runlock() { // lock locks rw for writing. func (rw *rwmutex) lock() { // Resolve competition with other writers and stick to our P. - lock(&rw.wLock) + lockWithRank(&rw.wLock, lockRankRwmutexW) m := getg().m // Announce that there is a pending writer. r := int32(atomic.Xadd(&rw.readerCount, -rwmutexMaxReaders)) + rwmutexMaxReaders // Wait for any active readers to complete. - lock(&rw.rLock) + lockWithRank(&rw.rLock, lockRankRwmutexR) if r != 0 && atomic.Xadd(&rw.readerWait, r) != 0 { // Wait for reader to wake us up. systemstack(func() { @@ -108,7 +108,7 @@ func (rw *rwmutex) unlock() { throw("unlock of unlocked rwmutex") } // Unblock blocked readers. - lock(&rw.rLock) + lockWithRank(&rw.rLock, lockRankRwmutexR) for rw.readers.ptr() != nil { reader := rw.readers.ptr() rw.readers = reader.schedlink diff --git a/libgo/go/runtime/select.go b/libgo/go/runtime/select.go index c9e3dd77d70..cf5d0c75a49 100644 --- a/libgo/go/runtime/select.go +++ b/libgo/go/runtime/select.go @@ -103,8 +103,9 @@ func block() { // selectgo implements the select statement. // // cas0 points to an array of type [ncases]scase, and order0 points to -// an array of type [2*ncases]uint16. Both reside on the goroutine's -// stack (regardless of any escaping in selectgo). +// an array of type [2*ncases]uint16 where ncases must be <= 65536. +// Both reside on the goroutine's stack (regardless of any escaping in +// selectgo). // // selectgo returns the index of the chosen scase, which matches the // ordinal position of its respective select{recv,send,default} call. 
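The updated selectgo comment above documents why ncases is capped at 65536: cas0 and order0 are reinterpreted through fixed maximum-size array types and then sliced down to the real length. A minimal illustration of that trick on ordinary data (since Go 1.17, unsafe.Slice expresses the same conversion more directly):

package main

import (
	"fmt"
	"unsafe"
)

// view reinterprets a pointer to the first of n ints as a slice, the way
// selectgo turns cas0/order0 into cas1/order1: convert to a pointer to a
// maximum-size array, then slice it to the real length.
func view(p *int, n int) []int {
	const max = 1 << 16
	if n > max {
		panic("too many elements")
	}
	return (*[max]int)(unsafe.Pointer(p))[:n:n]
}

func main() {
	// The backing array here is full-size; selectgo's arrays are really only
	// ncases long, which is acceptable runtime-internally but would be
	// flagged by -d=checkptr in ordinary user code.
	var backing [1 << 16]int
	for i := 0; i < 4; i++ {
		backing[i] = (i + 1) * 10
	}
	fmt.Println(view(&backing[0], 4)) // [10 20 30 40]
}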
@@ -115,6 +116,8 @@ func selectgo(cas0 *scase, order0 *uint16, ncases int) (int, bool) { print("select: cas0=", cas0, "\n") } + // NOTE: In order to maintain a lean stack size, the number of scases + // is capped at 65536. cas1 := (*[1 << 16]scase)(unsafe.Pointer(cas0)) order1 := (*[1 << 17]uint16)(unsafe.Pointer(order0)) diff --git a/libgo/go/runtime/sema.go b/libgo/go/runtime/sema.go index b6fab6daca7..c1418b3da43 100644 --- a/libgo/go/runtime/sema.go +++ b/libgo/go/runtime/sema.go @@ -129,7 +129,7 @@ func semacquire1(addr *uint32, lifo bool, profile semaProfileFlags, skipframes i s.acquiretime = t0 } for { - lock(&root.lock) + lockWithRank(&root.lock, lockRankRoot) // Add ourselves to nwait to disable "easy case" in semrelease. atomic.Xadd(&root.nwait, 1) // Check cansemacquire to avoid missed wakeup. @@ -168,7 +168,7 @@ func semrelease1(addr *uint32, handoff bool, skipframes int) { } // Harder case: search for a waiter and wake it. - lock(&root.lock) + lockWithRank(&root.lock, lockRankRoot) if atomic.Load(&root.nwait) == 0 { // The count is already consumed by another goroutine, // so no need to wake up another goroutine. @@ -486,7 +486,7 @@ func notifyListAdd(l *notifyList) uint32 { // notifyListAdd was called, it returns immediately. Otherwise, it blocks. //go:linkname notifyListWait sync.runtime_notifyListWait func notifyListWait(l *notifyList, t uint32) { - lock(&l.lock) + lockWithRank(&l.lock, lockRankNotifyList) // Return right away if this ticket has already been notified. if less(t, l.notify) { @@ -528,7 +528,7 @@ func notifyListNotifyAll(l *notifyList) { // Pull the list out into a local variable, waiters will be readied // outside the lock. - lock(&l.lock) + lockWithRank(&l.lock, lockRankNotifyList) s := l.head l.head = nil l.tail = nil @@ -558,7 +558,7 @@ func notifyListNotifyOne(l *notifyList) { return } - lock(&l.lock) + lockWithRank(&l.lock, lockRankNotifyList) // Re-check under the lock if we need to do anything. t := l.notify diff --git a/libgo/go/runtime/sema_test.go b/libgo/go/runtime/sema_test.go index 8bd5d4ce57c..cf3de0a1909 100644 --- a/libgo/go/runtime/sema_test.go +++ b/libgo/go/runtime/sema_test.go @@ -6,6 +6,7 @@ package runtime_test import ( . "runtime" + "sync" "sync/atomic" "testing" ) @@ -61,8 +62,11 @@ func testSemaHandoff() bool { // to another goroutine. Stop the current goroutine from migrating to // another CPU where it can win the race (and appear to have not yielded) by // keeping the CPUs slightly busy. + var wg sync.WaitGroup for i := 0; i < GOMAXPROCS(-1); i++ { + wg.Add(1) go func() { + defer wg.Done() for { select { case <-done: @@ -74,7 +78,9 @@ func testSemaHandoff() bool { }() } + wg.Add(1) go func() { + defer wg.Done() Semacquire(&sema) atomic.CompareAndSwapUint32(&res, 0, 1) @@ -91,7 +97,7 @@ func testSemaHandoff() bool { Semrelease1(&sema, true, 0) atomic.CompareAndSwapUint32(&res, 0, 2) - <-done // wait for goroutines to finish to avoid data races + wg.Wait() // wait for goroutines to finish to avoid data races return res == 1 // did the waiter run first? } diff --git a/libgo/go/runtime/signal_unix.go b/libgo/go/runtime/signal_unix.go index 1e057f6bf24..17c15c5a205 100644 --- a/libgo/go/runtime/signal_unix.go +++ b/libgo/go/runtime/signal_unix.go @@ -335,10 +335,13 @@ func sigpipe() { func doSigPreempt(gp *g, ctxt *sigctxt, sigpc uintptr) { // Check if this G wants to be preempted and is safe to // preempt. - if wantAsyncPreempt(gp) && isAsyncSafePoint(gp, sigpc) { - // Inject a call to asyncPreempt. 
- // ctxt.pushCall(funcPC(asyncPreempt)) - throw("pushCall not implemented") + if wantAsyncPreempt(gp) { + if ok, newpc := isAsyncSafePoint(gp, sigpc); ok { + // Adjust the PC and inject a call to asyncPreempt. + // ctxt.pushCall(funcPC(asyncPreempt), newpc) + throw("pushCall not implemented") + _ = newpc + } } // Acknowledge the preemption. @@ -346,10 +349,8 @@ func doSigPreempt(gp *g, ctxt *sigctxt, sigpc uintptr) { atomic.Store(&gp.m.signalPending, 0) } -// gccgo-specific definition. -const pushCallSupported = false - -const preemptMSupported = pushCallSupported +// This is false for gccgo. +const preemptMSupported = false // preemptM sends a preemption request to mp. This request may be // handled asynchronously and may be coalesced with other requests to @@ -358,13 +359,8 @@ const preemptMSupported = pushCallSupported // safe-point, it will preempt the goroutine. It always atomically // increments mp.preemptGen after handling a preemption request. func preemptM(mp *m) { - if !pushCallSupported { - // This architecture doesn't support ctxt.pushCall - // yet, so doSigPreempt won't work. - return - } - if GOOS == "darwin" && (GOARCH == "arm" || GOARCH == "arm64") && !iscgo { - // On darwin, we use libc calls, and cgo is required on ARM and ARM64 + if GOOS == "darwin" && GOARCH == "arm64" && !iscgo { + // On darwin, we use libc calls, and cgo is required on ARM64 // so we have TLS set up to save/restore G during C calls. If cgo is // absent, we cannot save/restore G in TLS, and if a signal is // received during C execution we cannot get the G. Therefore don't @@ -480,7 +476,7 @@ func sighandler(sig uint32, info *_siginfo_t, ctxt unsafe.Pointer, gp *g) { return } - if sig == sigPreempt { + if sig == sigPreempt && debug.asyncpreemptoff == 0 { // Might be a preemption signal. doSigPreempt(gp, c, sigpc) // Even if this was definitely a preemption signal, it @@ -492,10 +488,10 @@ func sighandler(sig uint32, info *_siginfo_t, ctxt unsafe.Pointer, gp *g) { if sig < uint32(len(sigtable)) { flags = sigtable[sig].flags } - if flags&_SigPanic != 0 && gp.throwsplit { + if c.sigcode() != _SI_USER && flags&_SigPanic != 0 && gp.throwsplit { // We can't safely sigpanic because it may grow the // stack. Abort in the signal handler instead. - flags = (flags &^ _SigPanic) | _SigThrow + flags = _SigThrow } if isAbortPC(sigpc) { // On many architectures, the abort function just @@ -536,7 +532,11 @@ func sighandler(sig uint32, info *_siginfo_t, ctxt unsafe.Pointer, gp *g) { dieFromSignal(sig) } - if flags&_SigThrow == 0 { + // _SigThrow means that we should exit now. + // If we get here with _SigPanic, it means that the signal + // was sent to us by a program (c.sigcode() == _SI_USER); + // in that case, if we didn't handle it in sigsend, we exit now. + if flags&(_SigThrow|_SigPanic) == 0 { return } @@ -558,6 +558,30 @@ func sighandler(sig uint32, info *_siginfo_t, ctxt unsafe.Pointer, gp *g) { print("signal arrived during cgo execution\n") gp = _g_.m.lockedg.ptr() } + if sig == _SIGILL { + // It would be nice to know how long the instruction is. + // Unfortunately, that's complicated to do in general (mostly for x86 + // and s930x, but other archs have non-standard instruction lengths also). + // Opt to print 16 bytes, which covers most instructions. + const maxN = 16 + n := uintptr(maxN) + // We have to be careful, though. If we're near the end of + // a page and the following page isn't mapped, we could + // segfault. 
So make sure we don't straddle a page (even though + // that could lead to printing an incomplete instruction). + // We're assuming here we can read at least the page containing the PC. + // I suppose it is possible that the page is mapped executable but not readable? + pc := sigpc + if n > physPageSize-pc%physPageSize { + n = physPageSize - pc%physPageSize + } + print("instruction bytes:") + b := (*[maxN]byte)(unsafe.Pointer(pc)) + for i := uintptr(0); i < n; i++ { + print(" ", hex(b[i])) + } + println() + } print("\n") level, _, docrash := gotraceback() diff --git a/libgo/go/runtime/signal_windows_test.go b/libgo/go/runtime/signal_windows_test.go index 97484034126..f99857193c1 100644 --- a/libgo/go/runtime/signal_windows_test.go +++ b/libgo/go/runtime/signal_windows_test.go @@ -3,6 +3,9 @@ package runtime_test import ( + "bufio" + "bytes" + "fmt" "internal/testenv" "io/ioutil" "os" @@ -10,6 +13,7 @@ import ( "path/filepath" "runtime" "strings" + "syscall" "testing" ) @@ -59,3 +63,90 @@ func TestVectoredHandlerDontCrashOnLibrary(t *testing.T) { t.Errorf("expected output %q, got %q", expectedOutput, cleanedOut) } } + +func sendCtrlBreak(pid int) error { + kernel32, err := syscall.LoadDLL("kernel32.dll") + if err != nil { + return fmt.Errorf("LoadDLL: %v\n", err) + } + generateEvent, err := kernel32.FindProc("GenerateConsoleCtrlEvent") + if err != nil { + return fmt.Errorf("FindProc: %v\n", err) + } + result, _, err := generateEvent.Call(syscall.CTRL_BREAK_EVENT, uintptr(pid)) + if result == 0 { + return fmt.Errorf("GenerateConsoleCtrlEvent: %v\n", err) + } + return nil +} + +// TestLibraryCtrlHandler tests that Go DLL allows calling program to handle console control events. +// See https://golang.org/issues/35965. +func TestLibraryCtrlHandler(t *testing.T) { + if *flagQuick { + t.Skip("-quick") + } + if runtime.GOARCH != "amd64" { + t.Skip("this test can only run on windows/amd64") + } + testenv.MustHaveGoBuild(t) + testenv.MustHaveExecPath(t, "gcc") + testprog.Lock() + defer testprog.Unlock() + dir, err := ioutil.TempDir("", "go-build") + if err != nil { + t.Fatalf("failed to create temp directory: %v", err) + } + defer os.RemoveAll(dir) + + // build go dll + dll := filepath.Join(dir, "dummy.dll") + cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", dll, "--buildmode", "c-shared", "testdata/testwinlibsignal/dummy.go") + out, err := testenv.CleanCmdEnv(cmd).CombinedOutput() + if err != nil { + t.Fatalf("failed to build go library: %s\n%s", err, out) + } + + // build c program + exe := filepath.Join(dir, "test.exe") + cmd = exec.Command("gcc", "-o", exe, "testdata/testwinlibsignal/main.c") + out, err = testenv.CleanCmdEnv(cmd).CombinedOutput() + if err != nil { + t.Fatalf("failed to build c exe: %s\n%s", err, out) + } + + // run test program + cmd = exec.Command(exe) + var stderr bytes.Buffer + cmd.Stderr = &stderr + outPipe, err := cmd.StdoutPipe() + if err != nil { + t.Fatalf("Failed to create stdout pipe: %v", err) + } + outReader := bufio.NewReader(outPipe) + + cmd.SysProcAttr = &syscall.SysProcAttr{ + CreationFlags: syscall.CREATE_NEW_PROCESS_GROUP, + } + if err := cmd.Start(); err != nil { + t.Fatalf("Start failed: %v", err) + } + + errCh := make(chan error, 1) + go func() { + if line, err := outReader.ReadString('\n'); err != nil { + errCh <- fmt.Errorf("could not read stdout: %v", err) + } else if strings.TrimSpace(line) != "ready" { + errCh <- fmt.Errorf("unexpected message: %v", line) + } else { + errCh <- sendCtrlBreak(cmd.Process.Pid) + } + }() + + if err := <-errCh; err 
!= nil { + t.Fatal(err) + } + if err := cmd.Wait(); err != nil { + t.Fatalf("Program exited with error: %v\n%s", err, &stderr) + } +} diff --git a/libgo/go/runtime/sigqueue.go b/libgo/go/runtime/sigqueue.go index 20704642f8d..7d1028eac29 100644 --- a/libgo/go/runtime/sigqueue.go +++ b/libgo/go/runtime/sigqueue.go @@ -192,16 +192,13 @@ func signalWaitUntilIdle() { //go:linkname signal_enable os..z2fsignal.signal_enable func signal_enable(s uint32) { if !sig.inuse { - // The first call to signal_enable is for us - // to use for initialization. It does not pass - // signal information in m. + // This is the first call to signal_enable. Initialize. sig.inuse = true // enable reception of signals; cannot disable if GOOS == "darwin" { sigNoteSetup(&sig.note) - return + } else { + noteclear(&sig.note) } - noteclear(&sig.note) - return } if s >= uint32(len(sig.wanted)*32) { diff --git a/libgo/go/runtime/sizeof_test.go b/libgo/go/runtime/sizeof_test.go index d829c5865db..0afb05e19d2 100644 --- a/libgo/go/runtime/sizeof_test.go +++ b/libgo/go/runtime/sizeof_test.go @@ -25,7 +25,8 @@ func TestSizeof(t *testing.T) { _32bit uintptr // size on 32bit platforms _64bit uintptr // size on 64bit platforms }{ - {runtime.G{}, 216, 376}, // g, but exported for testing + {runtime.G{}, 216, 376}, // g, but exported for testing + {runtime.Sudog{}, 56, 88}, // sudog, but exported for testing } for _, tt := range tests { diff --git a/libgo/go/runtime/slice.go b/libgo/go/runtime/slice.go index 519735392a9..97b26594dad 100644 --- a/libgo/go/runtime/slice.go +++ b/libgo/go/runtime/slice.go @@ -41,6 +41,55 @@ func panicmakeslicecap() { panic(errorString("makeslice: cap out of range")) } +// makeslicecopy allocates a slice of "tolen" elements of type "et", +// then copies "fromlen" elements of type "et" into that new allocation from "from". +func makeslicecopy(et *_type, tolen int, fromlen int, from unsafe.Pointer) unsafe.Pointer { + var tomem, copymem uintptr + if uintptr(tolen) > uintptr(fromlen) { + var overflow bool + tomem, overflow = math.MulUintptr(et.size, uintptr(tolen)) + if overflow || tomem > maxAlloc || tolen < 0 { + panicmakeslicelen() + } + copymem = et.size * uintptr(fromlen) + } else { + // fromlen is a known good length providing and equal or greater than tolen, + // thereby making tolen a good slice length too as from and to slices have the + // same element width. + tomem = et.size * uintptr(tolen) + copymem = tomem + } + + var to unsafe.Pointer + if et.ptrdata == 0 { + to = mallocgc(tomem, nil, false) + if copymem < tomem { + memclrNoHeapPointers(add(to, copymem), tomem-copymem) + } + } else { + // Note: can't use rawmem (which avoids zeroing of memory), because then GC can scan uninitialized memory. + to = mallocgc(tomem, et, true) + if copymem > 0 && writeBarrier.enabled { + // Only shade the pointers in old.array since we know the destination slice to + // only contains nil pointers because it has been cleared during alloc. 
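makeslicecopy above gives the runtime a combined allocate-and-copy entry point for the common make-then-copy idiom (BenchmarkMakeSliceCopy later in this diff measures exactly these shapes). A user-level sketch of the pattern it is meant to speed up, not code from the patch:

// clone allocates a slice and immediately fills it from src. With a combined
// allocate-and-copy path and a pointer-free element type, the runtime can
// skip zeroing the freshly allocated memory that copy is about to overwrite,
// clearing only any tail beyond copymem (the et.ptrdata == 0 branch above).
func clone(src []byte) []byte {
	dst := make([]byte, len(src))
	copy(dst, src)
	return dst
}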
+ bulkBarrierPreWriteSrcOnly(uintptr(to), uintptr(from), copymem) + } + } + + if raceenabled { + callerpc := getcallerpc() + pc := funcPC(makeslicecopy) + racereadrangepc(from, copymem, callerpc, pc) + } + if msanenabled { + msanread(from, copymem) + } + + memmove(to, from, copymem) + + return to +} + func makeslice(et *_type, len, cap int) unsafe.Pointer { mem, overflow := math.MulUintptr(et.size, uintptr(cap)) if overflow || mem > maxAlloc || len < 0 || len > cap { @@ -187,7 +236,7 @@ func growslice(et *_type, oldarray unsafe.Pointer, oldlen, oldcap, cap int) slic if lenmem > 0 && writeBarrier.enabled { // Only shade the pointers in old.array since we know the destination slice p // only contains nil pointers because it has been cleared during alloc. - bulkBarrierPreWriteSrcOnly(uintptr(p), uintptr(oldarray), lenmem) + bulkBarrierPreWriteSrcOnly(uintptr(p), uintptr(oldarray), lenmem-et.size+et.ptrdata) } } memmove(p, oldarray, lenmem) @@ -216,12 +265,12 @@ func slicecopy(toPtr unsafe.Pointer, toLen int, fmPtr unsafe.Pointer, fmLen int, if raceenabled { callerpc := getcallerpc() pc := funcPC(slicecopy) - racewriterangepc(toPtr, uintptr(n*int(width)), callerpc, pc) racereadrangepc(fmPtr, uintptr(n*int(width)), callerpc, pc) + racewriterangepc(toPtr, uintptr(n*int(width)), callerpc, pc) } if msanenabled { - msanwrite(toPtr, uintptr(n*int(width))) msanread(fmPtr, uintptr(n*int(width))) + msanwrite(toPtr, uintptr(n*int(width))) } size := uintptr(n) * width diff --git a/libgo/go/runtime/slice_test.go b/libgo/go/runtime/slice_test.go index 0463fc70a76..e963a43dd31 100644 --- a/libgo/go/runtime/slice_test.go +++ b/libgo/go/runtime/slice_test.go @@ -10,6 +10,84 @@ import ( const N = 20 +func BenchmarkMakeSliceCopy(b *testing.B) { + const length = 32 + var bytes = make([]byte, 8*length) + var ints = make([]int, length) + var ptrs = make([]*byte, length) + b.Run("mallocmove", func(b *testing.B) { + b.Run("Byte", func(b *testing.B) { + var x []byte + for i := 0; i < b.N; i++ { + x = make([]byte, len(bytes)) + copy(x, bytes) + } + }) + b.Run("Int", func(b *testing.B) { + var x []int + for i := 0; i < b.N; i++ { + x = make([]int, len(ints)) + copy(x, ints) + } + }) + b.Run("Ptr", func(b *testing.B) { + var x []*byte + for i := 0; i < b.N; i++ { + x = make([]*byte, len(ptrs)) + copy(x, ptrs) + } + + }) + }) + b.Run("makecopy", func(b *testing.B) { + b.Run("Byte", func(b *testing.B) { + var x []byte + for i := 0; i < b.N; i++ { + x = make([]byte, 8*length) + copy(x, bytes) + } + }) + b.Run("Int", func(b *testing.B) { + var x []int + for i := 0; i < b.N; i++ { + x = make([]int, length) + copy(x, ints) + } + }) + b.Run("Ptr", func(b *testing.B) { + var x []*byte + for i := 0; i < b.N; i++ { + x = make([]*byte, length) + copy(x, ptrs) + } + + }) + }) + b.Run("nilappend", func(b *testing.B) { + b.Run("Byte", func(b *testing.B) { + var x []byte + for i := 0; i < b.N; i++ { + x = append([]byte(nil), bytes...) + _ = x + } + }) + b.Run("Int", func(b *testing.B) { + var x []int + for i := 0; i < b.N; i++ { + x = append([]int(nil), ints...) + _ = x + } + }) + b.Run("Ptr", func(b *testing.B) { + var x []*byte + for i := 0; i < b.N; i++ { + x = append([]*byte(nil), ptrs...) 
+ _ = x + } + }) + }) +} + type ( struct24 struct{ a, b, c int64 } struct32 struct{ a, b, c, d int64 } diff --git a/libgo/go/runtime/string.go b/libgo/go/runtime/string.go index 7b66a1b447d..c0058beee62 100644 --- a/libgo/go/runtime/string.go +++ b/libgo/go/runtime/string.go @@ -6,6 +6,7 @@ package runtime import ( "internal/bytealg" + "runtime/internal/sys" "unsafe" ) @@ -93,7 +94,11 @@ func slicebytetostring(buf *tmpBuf, ptr *byte, n int) (str string) { msanread(unsafe.Pointer(ptr), uintptr(n)) } if n == 1 { - stringStructOf(&str).str = unsafe.Pointer(&staticbytes[*ptr]) + p := unsafe.Pointer(&staticuint64s[*ptr]) + if sys.BigEndian { + p = add(p, 7) + } + stringStructOf(&str).str = p stringStructOf(&str).len = 1 return } @@ -228,12 +233,6 @@ func stringStructOf(sp *string) *stringStruct { } func intstring(buf *[4]byte, v int64) (s string) { - if v >= 0 && v < runeSelf { - stringStructOf(&s).str = unsafe.Pointer(&staticbytes[v]) - stringStructOf(&s).len = 1 - return - } - var b []byte if buf != nil { b = buf[:] diff --git a/libgo/go/runtime/string_test.go b/libgo/go/runtime/string_test.go index e388f706b5f..b9799739741 100644 --- a/libgo/go/runtime/string_test.go +++ b/libgo/go/runtime/string_test.go @@ -287,7 +287,7 @@ func TestStringOnStack(t *testing.T) { func TestIntString(t *testing.T) { // Non-escaping result of intstring. s := "" - for i := 0; i < 4; i++ { + for i := rune(0); i < 4; i++ { s += string(i+'0') + string(i+'0'+1) } if want := "01122334"; s != want { @@ -296,7 +296,7 @@ func TestIntString(t *testing.T) { // Escaping result of intstring. var a [4]string - for i := 0; i < 4; i++ { + for i := rune(0); i < 4; i++ { a[i] = string(i + '0') } s = a[0] + a[1] + a[2] + a[3] diff --git a/libgo/go/runtime/stubs.go b/libgo/go/runtime/stubs.go index 25b1836daf0..d0fe551eaf8 100644 --- a/libgo/go/runtime/stubs.go +++ b/libgo/go/runtime/stubs.go @@ -140,7 +140,7 @@ func fastrandn(n uint32) uint32 { //go:linkname sync_fastrand sync.fastrand func sync_fastrand() uint32 { return fastrand() } -// in asm_*.s +// in internal/bytealg/equal_*.s //go:noescape func memequal(a, b unsafe.Pointer, size uintptr) bool diff --git a/libgo/go/runtime/symtab.go b/libgo/go/runtime/symtab.go index 86734574070..bb0b61d6131 100644 --- a/libgo/go/runtime/symtab.go +++ b/libgo/go/runtime/symtab.go @@ -155,8 +155,8 @@ type Func struct { // given program counter address, or else nil. // // If pc represents multiple functions because of inlining, it returns -// the a *Func describing the innermost function, but with an entry -// of the outermost function. +// the *Func describing the innermost function, but with an entry of +// the outermost function. func FuncForPC(pc uintptr) *Func { name, _, _, _ := funcfileline(pc, -1, false) if name == "" { diff --git a/libgo/go/runtime/testdata/testprog/gc.go b/libgo/go/runtime/testdata/testprog/gc.go index cc16413ef5c..74732cd9f4b 100644 --- a/libgo/go/runtime/testdata/testprog/gc.go +++ b/libgo/go/runtime/testdata/testprog/gc.go @@ -11,6 +11,7 @@ import ( "runtime/debug" "sync/atomic" "time" + "unsafe" ) func init() { @@ -19,6 +20,7 @@ func init() { register("GCSys", GCSys) register("GCPhys", GCPhys) register("DeferLiveness", DeferLiveness) + register("GCZombie", GCZombie) } func GCSys() { @@ -150,16 +152,20 @@ func GCPhys() { // The page cache could hide 64 8-KiB pages from the scavenger today. maxPageCache = (8 << 10) * 64 + + // Reduce GOMAXPROCS down to 4 if it's greater. 
We need to bound the amount + // of memory held in the page cache because the scavenger can't reach it. + // The page cache will hold at most maxPageCache of memory per-P, so this + // bounds the amount of memory hidden from the scavenger to 4*maxPageCache + // at most. + maxProcs = 4 ) // Set GOGC so that this test operates under consistent assumptions. debug.SetGCPercent(100) - // Reduce GOMAXPROCS down to 4 if it's greater. We need to bound the amount - // of memory held in the page cache because the scavenger can't reach it. - // The page cache will hold at most maxPageCache of memory per-P, so this - // bounds the amount of memory hidden from the scavenger to 4*maxPageCache. procs := runtime.GOMAXPROCS(-1) - if procs > 4 { - defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(4)) + if procs > maxProcs { + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(maxProcs)) + procs = runtime.GOMAXPROCS(-1) } // Save objects which we want to survive, and condemn objects which we don't. // Note that we condemn objects in this way and release them all at once in @@ -260,3 +266,37 @@ func DeferLiveness() { func escape(x interface{}) { sink2 = x; sink2 = nil } var sink2 interface{} + +// Test zombie object detection and reporting. +func GCZombie() { + // Allocate several objects of unusual size (so free slots are + // unlikely to all be re-allocated by the runtime). + const size = 190 + const count = 8192 / size + keep := make([]*byte, 0, (count+1)/2) + free := make([]uintptr, 0, (count+1)/2) + zombies := make([]*byte, 0, len(free)) + for i := 0; i < count; i++ { + obj := make([]byte, size) + p := &obj[0] + if i%2 == 0 { + keep = append(keep, p) + } else { + free = append(free, uintptr(unsafe.Pointer(p))) + } + } + + // Free the unreferenced objects. + runtime.GC() + + // Bring the free objects back to life. + for _, p := range free { + zombies = append(zombies, (*byte)(unsafe.Pointer(p))) + } + + // GC should detect the zombie objects. + runtime.GC() + println("failed") + runtime.KeepAlive(keep) + runtime.KeepAlive(zombies) +} diff --git a/libgo/go/runtime/testdata/testprog/lockosthread.go b/libgo/go/runtime/testdata/testprog/lockosthread.go index 098cc4dd722..e9d7fdbc44a 100644 --- a/libgo/go/runtime/testdata/testprog/lockosthread.go +++ b/libgo/go/runtime/testdata/testprog/lockosthread.go @@ -220,7 +220,7 @@ func LockOSThreadTemplateThreadRace() { }() // Try to synchronize both LockOSThreads. - start := time.Now().Add(10*time.Millisecond) + start := time.Now().Add(10 * time.Millisecond) var wg sync.WaitGroup wg.Add(2) @@ -232,10 +232,10 @@ func LockOSThreadTemplateThreadRace() { // Add work to the local runq to trigger early startm // in handoffp. - go func(){}() + go func() {}() runtime.LockOSThread() - runtime.Gosched() // add a preemption point. + runtime.Gosched() // add a preemption point. 
wg.Done() }() } diff --git a/libgo/go/runtime/testdata/testprog/numcpu_freebsd.go b/libgo/go/runtime/testdata/testprog/numcpu_freebsd.go index 42ee1548830..aff36ec702b 100644 --- a/libgo/go/runtime/testdata/testprog/numcpu_freebsd.go +++ b/libgo/go/runtime/testdata/testprog/numcpu_freebsd.go @@ -85,7 +85,13 @@ func getList() ([]string, error) { if err != nil { return nil, fmt.Errorf("fail to execute '%s': %s", cmdline, err) } - pos := bytes.IndexRune(output, ':') + pos := bytes.IndexRune(output, '\n') + if pos == -1 { + return nil, fmt.Errorf("invalid output from '%s', '\\n' not found: %s", cmdline, output) + } + output = output[0:pos] + + pos = bytes.IndexRune(output, ':') if pos == -1 { return nil, fmt.Errorf("invalid output from '%s', ':' not found: %s", cmdline, output) } diff --git a/libgo/go/runtime/testdata/testprog/panicprint.go b/libgo/go/runtime/testdata/testprog/panicprint.go new file mode 100644 index 00000000000..c8deabe2ab1 --- /dev/null +++ b/libgo/go/runtime/testdata/testprog/panicprint.go @@ -0,0 +1,111 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +type MyBool bool +type MyComplex128 complex128 +type MyComplex64 complex64 +type MyFloat32 float32 +type MyFloat64 float64 +type MyInt int +type MyInt8 int8 +type MyInt16 int16 +type MyInt32 int32 +type MyInt64 int64 +type MyString string +type MyUint uint +type MyUint8 uint8 +type MyUint16 uint16 +type MyUint32 uint32 +type MyUint64 uint64 +type MyUintptr uintptr + +func panicCustomComplex64() { + panic(MyComplex64(0.11 + 3i)) +} + +func panicCustomComplex128() { + panic(MyComplex128(32.1 + 10i)) +} + +func panicCustomString() { + panic(MyString("Panic")) +} + +func panicCustomBool() { + panic(MyBool(true)) +} + +func panicCustomInt() { + panic(MyInt(93)) +} + +func panicCustomInt8() { + panic(MyInt8(93)) +} + +func panicCustomInt16() { + panic(MyInt16(93)) +} + +func panicCustomInt32() { + panic(MyInt32(93)) +} + +func panicCustomInt64() { + panic(MyInt64(93)) +} + +func panicCustomUint() { + panic(MyUint(93)) +} + +func panicCustomUint8() { + panic(MyUint8(93)) +} + +func panicCustomUint16() { + panic(MyUint16(93)) +} + +func panicCustomUint32() { + panic(MyUint32(93)) +} + +func panicCustomUint64() { + panic(MyUint64(93)) +} + +func panicCustomUintptr() { + panic(MyUintptr(93)) +} + +func panicCustomFloat64() { + panic(MyFloat64(-93.70)) +} + +func panicCustomFloat32() { + panic(MyFloat32(-93.70)) +} + +func init() { + register("panicCustomComplex64", panicCustomComplex64) + register("panicCustomComplex128", panicCustomComplex128) + register("panicCustomBool", panicCustomBool) + register("panicCustomFloat32", panicCustomFloat32) + register("panicCustomFloat64", panicCustomFloat64) + register("panicCustomInt", panicCustomInt) + register("panicCustomInt8", panicCustomInt8) + register("panicCustomInt16", panicCustomInt16) + register("panicCustomInt32", panicCustomInt32) + register("panicCustomInt64", panicCustomInt64) + register("panicCustomString", panicCustomString) + register("panicCustomUint", panicCustomUint) + register("panicCustomUint8", panicCustomUint8) + register("panicCustomUint16", panicCustomUint16) + register("panicCustomUint32", panicCustomUint32) + register("panicCustomUint64", panicCustomUint64) + register("panicCustomUintptr", panicCustomUintptr) +} diff --git a/libgo/go/runtime/testdata/testprogcgo/eintr.go b/libgo/go/runtime/testdata/testprogcgo/eintr.go new file mode 
100644 index 00000000000..791ff1bedc0 --- /dev/null +++ b/libgo/go/runtime/testdata/testprogcgo/eintr.go @@ -0,0 +1,246 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !plan9,!windows + +package main + +/* +#include <errno.h> +#include <signal.h> +#include <string.h> + +static int clearRestart(int sig) { + struct sigaction sa; + + memset(&sa, 0, sizeof sa); + if (sigaction(sig, NULL, &sa) < 0) { + return errno; + } + sa.sa_flags &=~ SA_RESTART; + if (sigaction(sig, &sa, NULL) < 0) { + return errno; + } + return 0; +} +*/ +import "C" + +import ( + "bytes" + "errors" + "fmt" + "io" + "io/ioutil" + "log" + "net" + "os" + "os/exec" + "sync" + "syscall" + "time" +) + +func init() { + register("EINTR", EINTR) + register("Block", Block) +} + +// Test various operations when a signal handler is installed without +// the SA_RESTART flag. This tests that the os and net APIs handle EINTR. +func EINTR() { + if errno := C.clearRestart(C.int(syscall.SIGURG)); errno != 0 { + log.Fatal(syscall.Errno(errno)) + } + if errno := C.clearRestart(C.int(syscall.SIGWINCH)); errno != 0 { + log.Fatal(syscall.Errno(errno)) + } + if errno := C.clearRestart(C.int(syscall.SIGCHLD)); errno != 0 { + log.Fatal(syscall.Errno(errno)) + } + + var wg sync.WaitGroup + testPipe(&wg) + testNet(&wg) + testExec(&wg) + wg.Wait() + fmt.Println("OK") +} + +// spin does CPU bound spinning and allocating for a millisecond, +// to get a SIGURG. +//go:noinline +func spin() (float64, []byte) { + stop := time.Now().Add(time.Millisecond) + r1 := 0.0 + r2 := make([]byte, 200) + for time.Now().Before(stop) { + for i := 1; i < 1e6; i++ { + r1 += r1 / float64(i) + r2 = append(r2, bytes.Repeat([]byte{byte(i)}, 100)...) + r2 = r2[100:] + } + } + return r1, r2 +} + +// winch sends a few SIGWINCH signals to the process. +func winch() { + ticker := time.NewTicker(100 * time.Microsecond) + defer ticker.Stop() + pid := syscall.Getpid() + for n := 10; n > 0; n-- { + syscall.Kill(pid, syscall.SIGWINCH) + <-ticker.C + } +} + +// sendSomeSignals triggers a few SIGURG and SIGWINCH signals. +func sendSomeSignals() { + done := make(chan struct{}) + go func() { + spin() + close(done) + }() + winch() + <-done +} + +// testPipe tests pipe operations. +func testPipe(wg *sync.WaitGroup) { + r, w, err := os.Pipe() + if err != nil { + log.Fatal(err) + } + if err := syscall.SetNonblock(int(r.Fd()), false); err != nil { + log.Fatal(err) + } + if err := syscall.SetNonblock(int(w.Fd()), false); err != nil { + log.Fatal(err) + } + wg.Add(2) + go func() { + defer wg.Done() + defer w.Close() + // Spin before calling Write so that the first ReadFull + // in the other goroutine will likely be interrupted + // by a signal. + sendSomeSignals() + // This Write will likely be interrupted by a signal + // as the other goroutine spins in the middle of reading. + // We write enough data that we should always fill the + // pipe buffer and need multiple write system calls. + if _, err := w.Write(bytes.Repeat([]byte{0}, 2<<20)); err != nil { + log.Fatal(err) + } + }() + go func() { + defer wg.Done() + defer r.Close() + b := make([]byte, 1<<20) + // This ReadFull will likely be interrupted by a signal, + // as the other goroutine spins before writing anything. 
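The EINTR test above depends on the os and net packages retrying system calls that are interrupted by a handler installed without SA_RESTART. internal/poll does that with a small retry helper (its frame also appears in the trace_stack_test change near the end of this diff); roughly, as a sketch of the idea rather than the exact library code:

// Retry fn for as long as it fails with EINTR, so that a signal arriving in
// the middle of a read or write restarts the call instead of surfacing as an
// error to the caller. Assumes: import "syscall".
func ignoringEINTR(fn func() error) error {
	for {
		err := fn()
		if err != syscall.EINTR {
			return err
		}
	}
}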
+ if _, err := io.ReadFull(r, b); err != nil { + log.Fatal(err) + } + // Spin after reading half the data so that the Write + // in the other goroutine will likely be interrupted + // before it completes. + sendSomeSignals() + if _, err := io.ReadFull(r, b); err != nil { + log.Fatal(err) + } + }() +} + +// testNet tests network operations. +func testNet(wg *sync.WaitGroup) { + ln, err := net.Listen("tcp4", "127.0.0.1:0") + if err != nil { + if errors.Is(err, syscall.EAFNOSUPPORT) || errors.Is(err, syscall.EPROTONOSUPPORT) { + return + } + log.Fatal(err) + } + wg.Add(2) + go func() { + defer wg.Done() + defer ln.Close() + c, err := ln.Accept() + if err != nil { + log.Fatal(err) + } + defer c.Close() + cf, err := c.(*net.TCPConn).File() + if err != nil { + log.Fatal(err) + } + defer cf.Close() + if err := syscall.SetNonblock(int(cf.Fd()), false); err != nil { + log.Fatal(err) + } + // See comments in testPipe. + sendSomeSignals() + if _, err := cf.Write(bytes.Repeat([]byte{0}, 2<<20)); err != nil { + log.Fatal(err) + } + }() + go func() { + defer wg.Done() + sendSomeSignals() + c, err := net.Dial("tcp", ln.Addr().String()) + if err != nil { + log.Fatal(err) + } + defer c.Close() + cf, err := c.(*net.TCPConn).File() + if err != nil { + log.Fatal(err) + } + defer cf.Close() + if err := syscall.SetNonblock(int(cf.Fd()), false); err != nil { + log.Fatal(err) + } + // See comments in testPipe. + b := make([]byte, 1<<20) + if _, err := io.ReadFull(cf, b); err != nil { + log.Fatal(err) + } + sendSomeSignals() + if _, err := io.ReadFull(cf, b); err != nil { + log.Fatal(err) + } + }() +} + +func testExec(wg *sync.WaitGroup) { + wg.Add(1) + go func() { + defer wg.Done() + cmd := exec.Command(os.Args[0], "Block") + stdin, err := cmd.StdinPipe() + if err != nil { + log.Fatal(err) + } + cmd.Stderr = new(bytes.Buffer) + cmd.Stdout = cmd.Stderr + if err := cmd.Start(); err != nil { + log.Fatal(err) + } + + go func() { + sendSomeSignals() + stdin.Close() + }() + + if err := cmd.Wait(); err != nil { + log.Fatalf("%v:\n%s", err, cmd.Stdout) + } + }() +} + +// Block blocks until stdin is closed. +func Block() { + io.Copy(ioutil.Discard, os.Stdin) +} diff --git a/libgo/go/runtime/testdata/testprogcgo/segv.go b/libgo/go/runtime/testdata/testprogcgo/segv.go new file mode 100644 index 00000000000..3237a8c69c6 --- /dev/null +++ b/libgo/go/runtime/testdata/testprogcgo/segv.go @@ -0,0 +1,56 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !plan9,!windows + +package main + +// static void nop() {} +import "C" + +import ( + "syscall" + "time" +) + +func init() { + register("Segv", Segv) + register("SegvInCgo", SegvInCgo) +} + +var Sum int + +func Segv() { + c := make(chan bool) + go func() { + close(c) + for i := 0; ; i++ { + Sum += i + } + }() + + <-c + + syscall.Kill(syscall.Getpid(), syscall.SIGSEGV) + + // Give the OS time to deliver the signal. + time.Sleep(time.Second) +} + +func SegvInCgo() { + c := make(chan bool) + go func() { + close(c) + for { + C.nop() + } + }() + + <-c + + syscall.Kill(syscall.Getpid(), syscall.SIGSEGV) + + // Give the OS time to deliver the signal. 
+ time.Sleep(time.Second) +} diff --git a/libgo/go/runtime/testdata/testwinlibsignal/dummy.go b/libgo/go/runtime/testdata/testwinlibsignal/dummy.go new file mode 100644 index 00000000000..82dfd91c93a --- /dev/null +++ b/libgo/go/runtime/testdata/testwinlibsignal/dummy.go @@ -0,0 +1,10 @@ +// +build windows + +package main + +//export Dummy +func Dummy() int { + return 42 +} + +func main() {} diff --git a/libgo/go/runtime/time.go b/libgo/go/runtime/time.go index 27d88d43105..3cf13f14f2f 100644 --- a/libgo/go/runtime/time.go +++ b/libgo/go/runtime/time.go @@ -215,11 +215,18 @@ func stopTimer(t *timer) bool { // resetTimer resets an inactive timer, adding it to the heap. //go:linkname resetTimer time.resetTimer -func resetTimer(t *timer, when int64) { +// Reports whether the timer was modified before it was run. +func resetTimer(t *timer, when int64) bool { if raceenabled { racerelease(unsafe.Pointer(t)) } - resettimer(t, when) + return resettimer(t, when) +} + +// modTimer modifies an existing timer. +//go:linkname modTimer time.modTimer +func modTimer(t *timer, when, period int64, f func(interface{}, uintptr), arg interface{}, seq uintptr) { + modtimer(t, when, period, f, arg, seq) } // Go runtime. @@ -395,14 +402,16 @@ func dodeltimer0(pp *p) { } // modtimer modifies an existing timer. -// This is called by the netpoll code. -func modtimer(t *timer, when, period int64, f func(interface{}, uintptr), arg interface{}, seq uintptr) { +// This is called by the netpoll code or time.Ticker.Reset. +// Reports whether the timer was modified before it was run. +func modtimer(t *timer, when, period int64, f func(interface{}, uintptr), arg interface{}, seq uintptr) bool { if when < 0 { when = maxWhen } status := uint32(timerNoStatus) wasRemoved := false + var pending bool var mp *m loop: for { @@ -412,6 +421,7 @@ loop: // This could lead to a self-deadlock. See #38070. mp = acquirem() if atomic.Cas(&t.status, status, timerModifying) { + pending = true // timer not yet run break loop } releasem(mp) @@ -424,6 +434,7 @@ loop: // Act like addtimer. if atomic.Cas(&t.status, status, timerModifying) { wasRemoved = true + pending = false // timer already run or stopped break loop } releasem(mp) @@ -433,6 +444,7 @@ loop: mp = acquirem() if atomic.Cas(&t.status, status, timerModifying) { atomic.Xadd(&t.pp.ptr().deletedTimers, -1) + pending = false // timer already stopped break loop } releasem(mp) @@ -503,14 +515,17 @@ loop: wakeNetPoller(when) } } + + return pending } // resettimer resets the time when a timer should fire. // If used for an inactive timer, the timer will become active. // This should be called instead of addtimer if the timer value has been, // or may have been, used previously. -func resettimer(t *timer, when int64) { - modtimer(t, when, t.period, t.f, t.arg, t.seq) +// Reports whether the timer was modified before it was run. +func resettimer(t *timer, when int64) bool { + return modtimer(t, when, t.period, t.f, t.arg, t.seq) } // cleantimers cleans up the head of the timer queue. This speeds up @@ -518,10 +533,20 @@ func resettimer(t *timer, when int64) { // slows down addtimer. Reports whether no timer problems were found. // The caller must have locked the timers for pp. func cleantimers(pp *p) { + gp := getg() for { if len(pp.timers) == 0 { return } + + // This loop can theoretically run for a while, and because + // it is holding timersLock it cannot be preempted. + // If someone is trying to preempt us, just return. + // We can clean the timers later. 
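The time.modTimer hook added above exists so that time.Ticker.Reset can modify an existing runtime timer in place. Seen from user code (an ordinary example, not part of the patch):

package main

import (
	"fmt"
	"time"
)

func main() {
	t := time.NewTicker(200 * time.Millisecond)
	defer t.Stop()

	<-t.C                          // first tick at the original period
	t.Reset(50 * time.Millisecond) // goes through time.modTimer into runtime modtimer
	fmt.Println(<-t.C)             // later ticks use the new period
}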
+ if gp.preemptStop { + return + } + t := pp.timers[0] if t.pp.ptr() != pp { throw("cleantimers: bad p") diff --git a/libgo/go/runtime/trace.go b/libgo/go/runtime/trace.go index 358674b5ae8..ce185fc37ce 100644 --- a/libgo/go/runtime/trace.go +++ b/libgo/go/runtime/trace.go @@ -181,9 +181,15 @@ func traceBufPtrOf(b *traceBuf) traceBufPtr { // Most clients should use the runtime/trace package or the testing package's // -test.trace flag instead of calling StartTrace directly. func StartTrace() error { - // Stop the world, so that we can take a consistent snapshot + // Stop the world so that we can take a consistent snapshot // of all goroutines at the beginning of the trace. - stopTheWorld("start tracing") + // Do not stop the world during GC so we ensure we always see + // a consistent view of GC-related events (e.g. a start is always + // paired with an end). + stopTheWorldGC("start tracing") + + // Prevent sysmon from running any code that could generate events. + lock(&sched.sysmonlock) // We are in stop-the-world, but syscalls can finish and write to trace concurrently. // Exitsyscall could check trace.enabled long before and then suddenly wake up @@ -194,7 +200,8 @@ func StartTrace() error { if trace.enabled || trace.shutdown { unlock(&trace.bufLock) - startTheWorld() + unlock(&sched.sysmonlock) + startTheWorldGC() return errorString("tracing is already enabled") } @@ -265,7 +272,9 @@ func StartTrace() error { unlock(&trace.bufLock) - startTheWorld() + unlock(&sched.sysmonlock) + + startTheWorldGC() return nil } @@ -274,14 +283,18 @@ func StartTrace() error { func StopTrace() { // Stop the world so that we can collect the trace buffers from all p's below, // and also to avoid races with traceEvent. - stopTheWorld("stop tracing") + stopTheWorldGC("stop tracing") + + // See the comment in StartTrace. + lock(&sched.sysmonlock) // See the comment in StartTrace. lock(&trace.bufLock) if !trace.enabled { unlock(&trace.bufLock) - startTheWorld() + unlock(&sched.sysmonlock) + startTheWorldGC() return } @@ -318,7 +331,9 @@ func StopTrace() { trace.shutdown = true unlock(&trace.bufLock) - startTheWorld() + unlock(&sched.sysmonlock) + + startTheWorldGC() // The world is started but we've set trace.shutdown, so new tracing can't start. // Wait for the trace reader to flush pending buffers and stop. @@ -859,6 +874,7 @@ func (tab *traceStackTable) dump() { tab.mem.drop() *tab = traceStackTable{} + lockInit(&((*tab).lock), lockRankTraceStackTab) } type traceFrame struct { diff --git a/libgo/go/runtime/trace/trace.go b/libgo/go/runtime/trace/trace.go index 7f9d72a846e..b34aef03c51 100644 --- a/libgo/go/runtime/trace/trace.go +++ b/libgo/go/runtime/trace/trace.go @@ -19,7 +19,7 @@ // command runs the test in the current directory and writes the trace // file (trace.out). // -// go test -trace=test.out +// go test -trace=trace.out // // This runtime/trace package provides APIs to add equivalent tracing // support to a standalone program. See the Example that demonstrates diff --git a/libgo/go/runtime/trace/trace_stack_test.go b/libgo/go/runtime/trace/trace_stack_test.go index 62c06e67d9d..cfc0419b72c 100644 --- a/libgo/go/runtime/trace/trace_stack_test.go +++ b/libgo/go/runtime/trace/trace_stack_test.go @@ -233,6 +233,7 @@ func TestTraceSymbolize(t *testing.T) { }}, {trace.EvGomaxprocs, []frame{ {"runtime.startTheWorld", 0}, // this is when the current gomaxprocs is logged. 
+ {"runtime.startTheWorldGC", 0}, {"runtime.GOMAXPROCS", 0}, {"runtime/trace_test.TestTraceSymbolize", 0}, {"testing.tRunner", 0}, @@ -251,6 +252,7 @@ func TestTraceSymbolize(t *testing.T) { {trace.EvGoSysCall, []frame{ {"syscall.read", 0}, {"syscall.Read", 0}, + {"internal/poll.ignoringEINTR", 0}, {"internal/poll.(*FD).Read", 0}, {"os.(*File).read", 0}, {"os.(*File).Read", 0}, diff --git a/libgo/go/runtime/type.go b/libgo/go/runtime/type.go index 94abbb8e9f8..56b4fe6a111 100644 --- a/libgo/go/runtime/type.go +++ b/libgo/go/runtime/type.go @@ -45,7 +45,24 @@ type _type struct { } func (t *_type) string() string { - return *t._string + // For gccgo, try to strip out quoted strings. + s := *t._string + q := false + started := false + var start int + var end int + for i := 0; i < len(s); i++ { + if s[i] == '\t' { + q = !q + } else if !q { + if !started { + start = i + started = true + } + end = i + } + } + return s[start : end+1] } // pkgpath returns the path of the package where t was defined, if |