Diffstat (limited to 'libgo/go/runtime')
-rw-r--r--  libgo/go/runtime/abi_test.go | 112
-rw-r--r--  libgo/go/runtime/alg.go | 1
-rw-r--r--  libgo/go/runtime/auxv_none.go | 8
-rw-r--r--  libgo/go/runtime/cgo/handle.go | 109
-rw-r--r--  libgo/go/runtime/cgo/handle_test.go | 103
-rw-r--r--  libgo/go/runtime/cgocall.go | 2
-rw-r--r--  libgo/go/runtime/chan_test.go | 22
-rw-r--r--  libgo/go/runtime/checkptr.go | 29
-rw-r--r--  libgo/go/runtime/checkptr_test.go | 3
-rw-r--r--  libgo/go/runtime/cpuprof.go | 11
-rw-r--r--  libgo/go/runtime/cputicks.go | 9
-rw-r--r--  libgo/go/runtime/crash_cgo_test.go | 13
-rw-r--r--  libgo/go/runtime/crash_nonunix_test.go | 1
-rw-r--r--  libgo/go/runtime/crash_test.go | 8
-rw-r--r--  libgo/go/runtime/crash_unix_test.go | 80
-rw-r--r--  libgo/go/runtime/debug.go | 2
-rw-r--r--  libgo/go/runtime/debug/panic_test.go | 4
-rw-r--r--  libgo/go/runtime/debug_test.go | 250
-rw-r--r--  libgo/go/runtime/debuglog_off.go | 1
-rw-r--r--  libgo/go/runtime/debuglog_on.go | 1
-rw-r--r--  libgo/go/runtime/defer_test.go | 2
-rw-r--r--  libgo/go/runtime/defs_windows_arm64.go | 83
-rw-r--r--  libgo/go/runtime/env_posix.go | 1
-rw-r--r--  libgo/go/runtime/example_test.go | 16
-rw-r--r--  libgo/go/runtime/export_debug_test.go | 200
-rw-r--r--  libgo/go/runtime/export_mmap_test.go | 1
-rw-r--r--  libgo/go/runtime/export_pipe2_test.go | 3
-rw-r--r--  libgo/go/runtime/export_pipe_test.go | 3
-rw-r--r--  libgo/go/runtime/export_test.go | 99
-rw-r--r--  libgo/go/runtime/export_unix_test.go | 1
-rw-r--r--  libgo/go/runtime/extern.go | 18
-rw-r--r--  libgo/go/runtime/gc_test.go | 120
-rw-r--r--  libgo/go/runtime/hash32.go | 140
-rw-r--r--  libgo/go/runtime/hash64.go | 137
-rw-r--r--  libgo/go/runtime/hash_test.go | 49
-rw-r--r--  libgo/go/runtime/heapdump.go | 9
-rw-r--r--  libgo/go/runtime/histogram.go | 4
-rw-r--r--  libgo/go/runtime/internal/math/math.go | 21
-rw-r--r--  libgo/go/runtime/internal/sys/arch.go | 41
-rw-r--r--  libgo/go/runtime/internal/sys/stubs.go | 13
-rw-r--r--  libgo/go/runtime/lfstack_32bit.go | 3
-rw-r--r--  libgo/go/runtime/lfstack_64bit.go | 1
-rw-r--r--  libgo/go/runtime/libfuzzer.go | 1
-rw-r--r--  libgo/go/runtime/lock_futex.go | 3
-rw-r--r--  libgo/go/runtime/lock_js.go | 8
-rw-r--r--  libgo/go/runtime/lock_sema.go | 3
-rw-r--r--  libgo/go/runtime/lockrank.go | 32
-rw-r--r--  libgo/go/runtime/lockrank_off.go | 1
-rw-r--r--  libgo/go/runtime/lockrank_on.go | 27
-rw-r--r--  libgo/go/runtime/lockrank_test.go | 41
-rw-r--r--  libgo/go/runtime/malloc.go | 116
-rw-r--r--  libgo/go/runtime/malloc_test.go | 6
-rw-r--r--  libgo/go/runtime/map.go | 18
-rw-r--r--  libgo/go/runtime/map_faststr.go | 3
-rw-r--r--  libgo/go/runtime/mbitmap.go | 72
-rw-r--r--  libgo/go/runtime/mcache.go | 49
-rw-r--r--  libgo/go/runtime/mcentral.go | 25
-rw-r--r--  libgo/go/runtime/metrics.go | 81
-rw-r--r--  libgo/go/runtime/metrics/description.go | 60
-rw-r--r--  libgo/go/runtime/metrics/doc.go | 35
-rw-r--r--  libgo/go/runtime/metrics_test.go | 75
-rw-r--r--  libgo/go/runtime/mgc.go | 935
-rw-r--r--  libgo/go/runtime/mgcmark.go | 74
-rw-r--r--  libgo/go/runtime/mgcpacer.go | 848
-rw-r--r--  libgo/go/runtime/mgcscavenge.go | 16
-rw-r--r--  libgo/go/runtime/mgcscavenge_test.go | 6
-rw-r--r--  libgo/go/runtime/mgcsweep.go | 211
-rw-r--r--  libgo/go/runtime/mgcwork.go | 2
-rw-r--r--  libgo/go/runtime/mheap.go | 155
-rw-r--r--  libgo/go/runtime/mkfastlog2table.go | 1
-rw-r--r--  libgo/go/runtime/mkpreempt.go | 21
-rw-r--r--  libgo/go/runtime/mksizeclasses.go | 149
-rw-r--r--  libgo/go/runtime/mpagealloc.go | 3
-rw-r--r--  libgo/go/runtime/mpagealloc_32bit.go | 1
-rw-r--r--  libgo/go/runtime/mpagealloc_64bit.go | 1
-rw-r--r--  libgo/go/runtime/mprof.go | 94
-rw-r--r--  libgo/go/runtime/msan0.go | 1
-rw-r--r--  libgo/go/runtime/mstats.go | 82
-rw-r--r--  libgo/go/runtime/nbpipe_pipe.go | 3
-rw-r--r--  libgo/go/runtime/nbpipe_pipe2.go | 1
-rw-r--r--  libgo/go/runtime/nbpipe_test.go | 1
-rw-r--r--  libgo/go/runtime/netpoll.go | 12
-rw-r--r--  libgo/go/runtime/netpoll_epoll.go | 1
-rw-r--r--  libgo/go/runtime/netpoll_fake.go | 1
-rw-r--r--  libgo/go/runtime/netpoll_kqueue.go | 1
-rw-r--r--  libgo/go/runtime/netpoll_stub.go | 1
-rw-r--r--  libgo/go/runtime/norace_test.go | 1
-rw-r--r--  libgo/go/runtime/os_aix.go | 1
-rw-r--r--  libgo/go/runtime/os_darwin.go | 17
-rw-r--r--  libgo/go/runtime/os_gccgo.go | 8
-rw-r--r--  libgo/go/runtime/os_js.go | 11
-rw-r--r--  libgo/go/runtime/os_linux_arm64.go | 1
-rw-r--r--  libgo/go/runtime/os_linux_mips64x.go | 1
-rw-r--r--  libgo/go/runtime/os_linux_mipsx.go | 1
-rw-r--r--  libgo/go/runtime/os_linux_noauxv.go | 4
-rw-r--r--  libgo/go/runtime/os_linux_ppc64x.go | 1
-rw-r--r--  libgo/go/runtime/os_only_solaris.go | 1
-rw-r--r--  libgo/go/runtime/os_windows_arm64.go | 14
-rw-r--r--  libgo/go/runtime/panic32.go | 3
-rw-r--r--  libgo/go/runtime/pprof/mprof_test.go | 30
-rw-r--r--  libgo/go/runtime/pprof/pprof.go | 58
-rw-r--r--  libgo/go/runtime/pprof/pprof_norusage.go | 1
-rw-r--r--  libgo/go/runtime/pprof/pprof_rusage.go | 1
-rw-r--r--  libgo/go/runtime/pprof/pprof_test.go | 153
-rw-r--r--  libgo/go/runtime/preempt_nonwindows.go | 1
-rw-r--r--  libgo/go/runtime/print.go | 26
-rw-r--r--  libgo/go/runtime/proc.go | 794
-rw-r--r--  libgo/go/runtime/proc_test.go | 49
-rw-r--r--  libgo/go/runtime/race0.go | 1
-rw-r--r--  libgo/go/runtime/relax_stub.go | 1
-rw-r--r--  libgo/go/runtime/runtime-lldb_test.go | 8
-rw-r--r--  libgo/go/runtime/runtime1.go | 2
-rw-r--r--  libgo/go/runtime/runtime2.go | 52
-rw-r--r--  libgo/go/runtime/runtime_mmap_test.go | 1
-rw-r--r--  libgo/go/runtime/runtime_test.go | 4
-rw-r--r--  libgo/go/runtime/runtime_unix_test.go | 1
-rw-r--r--  libgo/go/runtime/select.go | 2
-rw-r--r--  libgo/go/runtime/semasleep_test.go | 1
-rw-r--r--  libgo/go/runtime/signal_unix.go | 5
-rw-r--r--  libgo/go/runtime/signal_windows_test.go | 18
-rw-r--r--  libgo/go/runtime/sigqueue.go | 3
-rw-r--r--  libgo/go/runtime/sigqueue_note.go | 4
-rw-r--r--  libgo/go/runtime/sizeclasses.go | 155
-rw-r--r--  libgo/go/runtime/sizeof_test.go | 2
-rw-r--r--  libgo/go/runtime/slice.go | 12
-rw-r--r--  libgo/go/runtime/stubs.go | 3
-rw-r--r--  libgo/go/runtime/stubs2.go | 10
-rw-r--r--  libgo/go/runtime/stubs3.go | 3
-rw-r--r--  libgo/go/runtime/stubs_linux.go | 1
-rw-r--r--  libgo/go/runtime/stubs_nonlinux.go | 1
-rw-r--r--  libgo/go/runtime/symtab.go | 11
-rw-r--r--  libgo/go/runtime/sys_wasm.go | 6
-rw-r--r--  libgo/go/runtime/testdata/testprog/checkptr.go | 49
-rw-r--r--  libgo/go/runtime/testdata/testprog/crashdump.go | 47
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/aprof.go | 4
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/aprof_c.c | 9
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/bigstack1_windows.c | 12
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/bigstack_windows.go | 4
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/lockosthread.c | 2
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/pprof.go | 1
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/raceprof.go | 1
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/threadpprof.go | 8
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/traceback.go | 1
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/traceback_c.c | 68
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/traceback_gccgo.go | 1
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/tracebackctxt.go | 34
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/tracebackctxt_c.c | 23
-rw-r--r--  libgo/go/runtime/testdata/testprogcgo/windows/win.go | 2
-rw-r--r--  libgo/go/runtime/testdata/testwinlibsignal/dummy.go | 3
-rw-r--r--  libgo/go/runtime/testdata/testwinlibsignal/main.c | 57
-rw-r--r--  libgo/go/runtime/time.go | 58
-rw-r--r--  libgo/go/runtime/time_fake.go | 26
-rw-r--r--  libgo/go/runtime/time_nofake.go | 5
-rw-r--r--  libgo/go/runtime/timeasm.go | 4
-rw-r--r--  libgo/go/runtime/timestub.go | 3
-rw-r--r--  libgo/go/runtime/timestub2.go | 11
-rw-r--r--  libgo/go/runtime/tls_stub.go | 11
-rw-r--r--  libgo/go/runtime/tls_windows_amd64.go | 10
-rw-r--r--  libgo/go/runtime/trace.go | 19
-rw-r--r--  libgo/go/runtime/traceback_test.go | 297
-rw-r--r--  libgo/go/runtime/type.go | 6
-rw-r--r--  libgo/go/runtime/write_err.go | 1
-rw-r--r--  libgo/go/runtime/write_err_android.go | 2
163 files changed, 4480 insertions, 2794 deletions
diff --git a/libgo/go/runtime/abi_test.go b/libgo/go/runtime/abi_test.go
new file mode 100644
index 00000000000..b27b01244f2
--- /dev/null
+++ b/libgo/go/runtime/abi_test.go
@@ -0,0 +1,112 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.regabireflect
+// +build goexperiment.regabireflect
+
+// This file contains tests specific to making sure the register ABI
+// works in a bunch of contexts in the runtime.
+
+package runtime_test
+
+import (
+ "internal/testenv"
+ "os"
+ "os/exec"
+ "runtime"
+ "strings"
+ "testing"
+ "time"
+)
+
+var regConfirmRun chan int
+
+//go:registerparams
+func regFinalizerPointer(v *Tint) (int, float32, [10]byte) {
+ regConfirmRun <- *(*int)(v)
+ return 5151, 4.0, [10]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+}
+
+//go:registerparams
+func regFinalizerIface(v Tinter) (int, float32, [10]byte) {
+ regConfirmRun <- *(*int)(v.(*Tint))
+ return 5151, 4.0, [10]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+}
+
+func TestFinalizerRegisterABI(t *testing.T) {
+ testenv.MustHaveExec(t)
+
+ // Actually run the test in a subprocess because we don't want
+ // finalizers from other tests interfering.
+ if os.Getenv("TEST_FINALIZER_REGABI") != "1" {
+ cmd := testenv.CleanCmdEnv(exec.Command(os.Args[0], "-test.run=TestFinalizerRegisterABI", "-test.v"))
+ cmd.Env = append(cmd.Env, "TEST_FINALIZER_REGABI=1")
+ out, err := cmd.CombinedOutput()
+ if !strings.Contains(string(out), "PASS\n") || err != nil {
+ t.Fatalf("%s\n(exit status %v)", string(out), err)
+ }
+ return
+ }
+
+ // Optimistically clear any latent finalizers from e.g. the testing
+ // package before continuing.
+ //
+ // It's possible that a finalizer only becomes available to run
+ // after this point, which would interfere with the test and could
+ // cause a crash, but because we're running in a separate process
+ // it's extremely unlikely.
+ runtime.GC()
+ runtime.GC()
+
+ // fing will only pick the new IntRegArgs up if it's currently
+ // sleeping and wakes up, so wait for it to go to sleep.
+ success := false
+ for i := 0; i < 100; i++ {
+ if runtime.FinalizerGAsleep() {
+ success = true
+ break
+ }
+ time.Sleep(20 * time.Millisecond)
+ }
+ if !success {
+ t.Fatal("finalizer not asleep?")
+ }
+
+ // argRegsBefore := runtime.SetIntArgRegs(abi.IntArgRegs)
+ // defer runtime.SetIntArgRegs(argRegsBefore)
+
+ tests := []struct {
+ name string
+ fin interface{}
+ confirmValue int
+ }{
+ {"Pointer", regFinalizerPointer, -1},
+ {"Interface", regFinalizerIface, -2},
+ }
+ for i := range tests {
+ test := &tests[i]
+ t.Run(test.name, func(t *testing.T) {
+ regConfirmRun = make(chan int)
+
+ x := new(Tint)
+ *x = (Tint)(test.confirmValue)
+ runtime.SetFinalizer(x, test.fin)
+
+ runtime.KeepAlive(x)
+
+ // Queue the finalizer.
+ runtime.GC()
+ runtime.GC()
+
+ select {
+ case <-time.After(time.Second):
+ t.Fatal("finalizer failed to execute")
+ case gotVal := <-regConfirmRun:
+ if gotVal != test.confirmValue {
+ t.Fatalf("wrong finalizer executed? got %d, want %d", gotVal, test.confirmValue)
+ }
+ }
+ })
+ }
+}
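
The new abi_test.go above relies on a common pattern: the test re-executes its own binary as a subprocess (keyed off an environment variable) so that finalizers left over from sibling tests cannot interfere with the one under test. A minimal, self-contained sketch of that pattern follows; the test name, package name, and environment variable are illustrative, not taken from the patch.

    package mypkg_test

    import (
        "os"
        "os/exec"
        "strings"
        "testing"
    )

    func TestInSubprocess(t *testing.T) {
        if os.Getenv("TEST_IN_SUBPROCESS") != "1" {
            // Parent: re-run only this test in a child process and check its output.
            cmd := exec.Command(os.Args[0], "-test.run=TestInSubprocess", "-test.v")
            cmd.Env = append(os.Environ(), "TEST_IN_SUBPROCESS=1")
            out, err := cmd.CombinedOutput()
            if err != nil || !strings.Contains(string(out), "PASS") {
                t.Fatalf("subprocess failed: %v\n%s", err, out)
            }
            return
        }
        // Child: the real test body runs here, isolated from finalizers
        // and other state left behind by sibling tests.
    }
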
diff --git a/libgo/go/runtime/alg.go b/libgo/go/runtime/alg.go
index b5b22cfd0f8..4408eab37d6 100644
--- a/libgo/go/runtime/alg.go
+++ b/libgo/go/runtime/alg.go
@@ -216,7 +216,6 @@ func typehash(t *_type, p unsafe.Pointer, h uintptr) uintptr {
case kindStruct:
s := (*structtype)(unsafe.Pointer(t))
for _, f := range s.fields {
- // TODO: maybe we could hash several contiguous fields all at once.
if f.name != nil && *f.name == "_" {
continue
}
diff --git a/libgo/go/runtime/auxv_none.go b/libgo/go/runtime/auxv_none.go
index 3ca617b21eb..3178f1a154c 100644
--- a/libgo/go/runtime/auxv_none.go
+++ b/libgo/go/runtime/auxv_none.go
@@ -2,12 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !linux
-// +build !darwin
-// +build !dragonfly
-// +build !freebsd
-// +build !netbsd
-// +build !solaris
+//go:build !linux && !darwin && !dragonfly && !freebsd && !netbsd && !solaris
+// +build !linux,!darwin,!dragonfly,!freebsd,!netbsd,!solaris
package runtime
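
The auxv_none.go hunk above is one instance of a conversion applied throughout this patch: the pre-Go-1.17 "// +build" lines (which AND together, with spaces meaning OR and commas meaning AND within a line) are collapsed into a single "//go:build" expression with explicit operators, and one legacy line is kept so older toolchains still honor the constraint. A small illustrative fragment, not taken from any file in the patch:

    // Old style: the separate lines AND together; the space on the first line means OR.
    // +build linux darwin
    // +build amd64

    // New style (Go 1.17): the same constraint written as one boolean expression.
    //go:build (linux || darwin) && amd64
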
diff --git a/libgo/go/runtime/cgo/handle.go b/libgo/go/runtime/cgo/handle.go
new file mode 100644
index 00000000000..720acca802c
--- /dev/null
+++ b/libgo/go/runtime/cgo/handle.go
@@ -0,0 +1,109 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cgo
+
+import (
+ "sync"
+ "sync/atomic"
+)
+
+// Handle provides a way to pass values that contain Go pointers
+// (pointers to memory allocated by Go) between Go and C without
+// breaking the cgo pointer passing rules. A Handle is an integer
+// value that can represent any Go value. A Handle can be passed
+// through C and back to Go, and Go code can use the Handle to
+// retrieve the original Go value.
+//
+// The underlying type of Handle is guaranteed to fit in an integer type
+// that is large enough to hold the bit pattern of any pointer. The zero
+// value of a Handle is not valid, and thus is safe to use as a sentinel
+// in C APIs.
+//
+// For instance, on the Go side:
+//
+// package main
+//
+// /*
+// #include <stdint.h> // for uintptr_t
+//
+// extern void MyGoPrint(uintptr_t handle);
+// void myprint(uintptr_t handle);
+// */
+// import "C"
+// import "runtime/cgo"
+//
+// //export MyGoPrint
+// func MyGoPrint(handle C.uintptr_t) {
+// h := cgo.Handle(handle)
+// val := h.Value().(string)
+// println(val)
+// h.Delete()
+// }
+//
+// func main() {
+// val := "hello Go"
+// C.myprint(C.uintptr_t(cgo.NewHandle(val)))
+// // Output: hello Go
+// }
+//
+// and on the C side:
+//
+// #include <stdint.h> // for uintptr_t
+//
+// // A Go function
+// extern void MyGoPrint(uintptr_t handle);
+//
+// // A C function
+// void myprint(uintptr_t handle) {
+// MyGoPrint(handle);
+// }
+type Handle uintptr
+
+// NewHandle returns a handle for a given value.
+//
+// The handle is valid until the program calls Delete on it. The handle
+// uses resources, and this package assumes that C code may hold on to
+// the handle, so a program must explicitly call Delete when the handle
+// is no longer needed.
+//
+// The intended use is to pass the returned handle to C code, which
+// passes it back to Go, which calls Value.
+func NewHandle(v interface{}) Handle {
+ h := atomic.AddUintptr(&handleIdx, 1)
+ if h == 0 {
+ panic("runtime/cgo: ran out of handle space")
+ }
+
+ handles.Store(h, v)
+ return Handle(h)
+}
+
+// Value returns the associated Go value for a valid handle.
+//
+// The method panics if the handle is invalid.
+func (h Handle) Value() interface{} {
+ v, ok := handles.Load(uintptr(h))
+ if !ok {
+ panic("runtime/cgo: misuse of an invalid Handle")
+ }
+ return v
+}
+
+// Delete invalidates a handle. This method should only be called once
+// the program no longer needs to pass the handle to C and the C code
+// no longer has a copy of the handle value.
+//
+// The method panics if the handle is invalid.
+func (h Handle) Delete() {
+ _, ok := handles.LoadAndDelete(uintptr(h))
+ if !ok {
+ panic("runtime/cgo: misuse of an invalid Handle")
+ }
+}
+
+var (
+ handles = sync.Map{} // map[Handle]interface{}
+ handleIdx uintptr // atomic
+)
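
The new runtime/cgo.Handle API above stores values in a package-level sync.Map keyed by an atomically incremented uintptr, so an integer rather than a Go pointer is what crosses the cgo boundary. Below is a pure-Go sketch of the lifecycle with no C involved; in real use the uintptr would round-trip through a C API as shown in the doc comment above.

    package main

    import "runtime/cgo"

    func main() {
        // Allocate a handle; the resulting integer is what would cross into C.
        h := cgo.NewHandle(map[string]int{"answer": 42})
        raw := uintptr(h) // may be passed through C as a uintptr_t; never dereferenced there

        // On the way back from C, rebuild the Handle and recover the value.
        v := cgo.Handle(raw).Value().(map[string]int)
        println(v["answer"])

        // Release the table entry exactly once; a second Delete would panic.
        cgo.Handle(raw).Delete()
    }
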
diff --git a/libgo/go/runtime/cgo/handle_test.go b/libgo/go/runtime/cgo/handle_test.go
new file mode 100644
index 00000000000..738051a0ea1
--- /dev/null
+++ b/libgo/go/runtime/cgo/handle_test.go
@@ -0,0 +1,103 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cgo
+
+import (
+ "reflect"
+ "testing"
+)
+
+func TestHandle(t *testing.T) {
+ v := 42
+
+ tests := []struct {
+ v1 interface{}
+ v2 interface{}
+ }{
+ {v1: v, v2: v},
+ {v1: &v, v2: &v},
+ {v1: nil, v2: nil},
+ }
+
+ for _, tt := range tests {
+ h1 := NewHandle(tt.v1)
+ h2 := NewHandle(tt.v2)
+
+ if uintptr(h1) == 0 || uintptr(h2) == 0 {
+ t.Fatalf("NewHandle returns zero")
+ }
+
+ if uintptr(h1) == uintptr(h2) {
+ t.Fatalf("Duplicated Go values should have different handles, but got equal")
+ }
+
+ h1v := h1.Value()
+ h2v := h2.Value()
+ if !reflect.DeepEqual(h1v, h2v) || !reflect.DeepEqual(h1v, tt.v1) {
+ t.Fatalf("Value of a Handle got wrong, got %+v %+v, want %+v", h1v, h2v, tt.v1)
+ }
+
+ h1.Delete()
+ h2.Delete()
+ }
+
+ siz := 0
+ handles.Range(func(k, v interface{}) bool {
+ siz++
+ return true
+ })
+ if siz != 0 {
+ t.Fatalf("handles are not cleared, got %d, want %d", siz, 0)
+ }
+}
+
+func TestInvalidHandle(t *testing.T) {
+ t.Run("zero", func(t *testing.T) {
+ h := Handle(0)
+
+ defer func() {
+ if r := recover(); r != nil {
+ return
+ }
+ t.Fatalf("Delete of zero handle did not trigger a panic")
+ }()
+
+ h.Delete()
+ })
+
+ t.Run("invalid", func(t *testing.T) {
+ h := NewHandle(42)
+
+ defer func() {
+ if r := recover(); r != nil {
+ h.Delete()
+ return
+ }
+ t.Fatalf("Invalid handle did not trigger a panic")
+ }()
+
+ Handle(h + 1).Delete()
+ })
+}
+
+func BenchmarkHandle(b *testing.B) {
+ b.Run("non-concurrent", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ h := NewHandle(i)
+ _ = h.Value()
+ h.Delete()
+ }
+ })
+ b.Run("concurrent", func(b *testing.B) {
+ b.RunParallel(func(pb *testing.PB) {
+ var v int
+ for pb.Next() {
+ h := NewHandle(v)
+ _ = h.Value()
+ h.Delete()
+ }
+ })
+ })
+}
diff --git a/libgo/go/runtime/cgocall.go b/libgo/go/runtime/cgocall.go
index 79fe65cb234..0495d67f867 100644
--- a/libgo/go/runtime/cgocall.go
+++ b/libgo/go/runtime/cgocall.go
@@ -15,6 +15,8 @@ import (
//go:linkname cgoCheckPointer
//go:linkname cgoCheckResult
+var ncgocall uint64 // number of cgo calls in total for dead m
+
// Pointer checking for cgo code.
// We want to detect all cases where a program that does not use
diff --git a/libgo/go/runtime/chan_test.go b/libgo/go/runtime/chan_test.go
index 85d3e04ffff..4aae89fd499 100644
--- a/libgo/go/runtime/chan_test.go
+++ b/libgo/go/runtime/chan_test.go
@@ -636,7 +636,7 @@ func TestNoShrinkStackWhileParking(t *testing.T) {
// channel. See issue 40641 for more details on the problem.
//
// The way we try to induce this failure is to set up two
- // goroutines: a sender and a reciever that communicate across
+ // goroutines: a sender and a receiver that communicate across
// a channel. We try to set up a situation where the sender
// grows its stack temporarily then *fully* blocks on a channel
// often. Meanwhile a GC is triggered so that we try to get a
@@ -676,7 +676,7 @@ func TestNoShrinkStackWhileParking(t *testing.T) {
go send(c, done)
// Wait a little bit before triggering
// the GC to make sure the sender and
- // reciever have gotten into their groove.
+ // receiver have gotten into their groove.
time.Sleep(50 * time.Microsecond)
runtime.GC()
<-done
@@ -713,8 +713,6 @@ func TestSelectDuplicateChannel(t *testing.T) {
c <- 8 // wake up B. This operation used to fail because c.recvq was corrupted (it tries to wake up an already running G instead of B)
}
-var selectSink interface{}
-
func TestSelectStackAdjust(t *testing.T) {
// Test that channel receive slots that contain local stack
// pointers are adjusted correctly by stack shrinking.
@@ -771,20 +769,8 @@ func TestSelectStackAdjust(t *testing.T) {
<-ready2
time.Sleep(10 * time.Millisecond)
- // Force concurrent GC a few times.
- var before, after runtime.MemStats
- runtime.ReadMemStats(&before)
- for i := 0; i < 100; i++ {
- selectSink = new([1 << 20]byte)
- runtime.ReadMemStats(&after)
- if after.NumGC-before.NumGC >= 2 {
- goto done
- }
- runtime.Gosched()
- }
- t.Fatal("failed to trigger concurrent GC")
-done:
- selectSink = nil
+ // Force concurrent GC to shrink the stacks.
+ runtime.GC()
// Wake selects.
close(d)
diff --git a/libgo/go/runtime/checkptr.go b/libgo/go/runtime/checkptr.go
index e52f7df5b2b..ad3ad29cb2f 100644
--- a/libgo/go/runtime/checkptr.go
+++ b/libgo/go/runtime/checkptr.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
package runtime
@@ -9,6 +10,11 @@ package runtime
import "unsafe"
func checkptrAlignment(p unsafe.Pointer, elem *_type, n uintptr) {
+ // nil pointer is always suitably aligned (#47430).
+ if p == nil {
+ return
+ }
+
// Check that (*[n]elem)(p) is appropriately aligned.
// Note that we allow unaligned pointers if the types they point to contain
// no pointers themselves. See issue 37298.
@@ -18,11 +24,32 @@ func checkptrAlignment(p unsafe.Pointer, elem *_type, n uintptr) {
}
// Check that (*[n]elem)(p) doesn't straddle multiple heap objects.
- if size := n * elem.size; size > 1 && checkptrBase(p) != checkptrBase(add(p, size-1)) {
+ // TODO(mdempsky): Fix #46938 so we don't need to worry about overflow here.
+ if checkptrStraddles(p, n*elem.size) {
throw("checkptr: converted pointer straddles multiple allocations")
}
}
+// checkptrStraddles reports whether the first size-bytes of memory
+// addressed by ptr is known to straddle more than one Go allocation.
+func checkptrStraddles(ptr unsafe.Pointer, size uintptr) bool {
+ if size <= 1 {
+ return false
+ }
+
+ // Check that add(ptr, size-1) won't overflow. This avoids the risk
+ // of producing an illegal pointer value (assuming ptr is legal).
+ if uintptr(ptr) >= -(size - 1) {
+ return true
+ }
+ end := add(ptr, size-1)
+
+ // TODO(mdempsky): Detect when [ptr, end] contains Go allocations,
+ // but neither ptr nor end point into one themselves.
+
+ return checkptrBase(ptr) != checkptrBase(end)
+}
+
func checkptrArithmetic(p unsafe.Pointer, originals []unsafe.Pointer) {
if 0 < uintptr(p) && uintptr(p) < minLegalPointer {
throw("checkptr: pointer arithmetic computed bad pointer value")
diff --git a/libgo/go/runtime/checkptr_test.go b/libgo/go/runtime/checkptr_test.go
index bd2d00f117a..3e4246970f9 100644
--- a/libgo/go/runtime/checkptr_test.go
+++ b/libgo/go/runtime/checkptr_test.go
@@ -30,10 +30,13 @@ func TestCheckPtr(t *testing.T) {
}{
{"CheckPtrAlignmentPtr", "fatal error: checkptr: misaligned pointer conversion\n"},
{"CheckPtrAlignmentNoPtr", ""},
+ {"CheckPtrAlignmentNilPtr", ""},
{"CheckPtrArithmetic", "fatal error: checkptr: pointer arithmetic result points to invalid allocation\n"},
{"CheckPtrArithmetic2", "fatal error: checkptr: pointer arithmetic result points to invalid allocation\n"},
{"CheckPtrSize", "fatal error: checkptr: converted pointer straddles multiple allocations\n"},
{"CheckPtrSmall", "fatal error: checkptr: pointer arithmetic computed bad pointer value\n"},
+ {"CheckPtrSliceOK", ""},
+ {"CheckPtrSliceFail", "fatal error: checkptr: unsafe.Slice result straddles multiple allocations\n"},
}
for _, tc := range testCases {
diff --git a/libgo/go/runtime/cpuprof.go b/libgo/go/runtime/cpuprof.go
index 43f0a6705f5..55ad0d112e7 100644
--- a/libgo/go/runtime/cpuprof.go
+++ b/libgo/go/runtime/cpuprof.go
@@ -103,7 +103,16 @@ func (p *cpuProfile) add(gp *g, stk []uintptr) {
// because otherwise its write barrier behavior may not
// be correct. See the long comment there before
// changing the argument here.
- cpuprof.log.write(&gp.labels, nanotime(), hdr[:], stk)
+ //
+ // Note: it can happen on Windows, where we are calling
+ // p.add with a gp that is not the current g, that gp is nil,
+ // meaning we interrupted a system thread with no g.
+ // Avoid faulting in that case.
+ var tagPtr *unsafe.Pointer
+ if gp != nil {
+ tagPtr = &gp.labels
+ }
+ cpuprof.log.write(tagPtr, nanotime(), hdr[:], stk)
}
atomic.Store(&prof.signalLock, 0)
diff --git a/libgo/go/runtime/cputicks.go b/libgo/go/runtime/cputicks.go
index c41a58bafd0..84e3ce3675a 100644
--- a/libgo/go/runtime/cputicks.go
+++ b/libgo/go/runtime/cputicks.go
@@ -2,13 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// // +build !arm
-// // +build !arm64
-// // +build !mips64
-// // +build !mips64le
-// // +build !mips
-// // +build !mipsle
-// // +build !wasm
+//-go:build !arm && !arm64 && !mips64 && !mips64le && !mips && !mipsle && !wasm
+// -build !arm,!arm64,!mips64,!mips64le,!mips,!mipsle,!wasm
package runtime
diff --git a/libgo/go/runtime/crash_cgo_test.go b/libgo/go/runtime/crash_cgo_test.go
index b0198ffa2b6..6ad42b22a12 100644
--- a/libgo/go/runtime/crash_cgo_test.go
+++ b/libgo/go/runtime/crash_cgo_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build cgo
// +build cgo
package runtime_test
@@ -290,6 +291,18 @@ func TestCgoTracebackContext(t *testing.T) {
}
}
+func TestCgoTracebackContextPreemption(t *testing.T) {
+ t.Parallel()
+ if runtime.Compiler == "gccgo" {
+ t.Skip("gccgo does not have SetCgoTraceback")
+ }
+ got := runTestProg(t, "testprogcgo", "TracebackContextPreemption")
+ want := "OK\n"
+ if got != want {
+ t.Errorf("expected %q got %v", want, got)
+ }
+}
+
func testCgoPprof(t *testing.T, buildArg, runArg, top, bottom string) {
t.Parallel()
if runtime.GOOS != "linux" || (runtime.GOARCH != "amd64" && runtime.GOARCH != "ppc64le") {
diff --git a/libgo/go/runtime/crash_nonunix_test.go b/libgo/go/runtime/crash_nonunix_test.go
index 06c197ec2b2..5f61476f217 100644
--- a/libgo/go/runtime/crash_nonunix_test.go
+++ b/libgo/go/runtime/crash_nonunix_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build windows || plan9 || (js && wasm)
// +build windows plan9 js,wasm
package runtime_test
diff --git a/libgo/go/runtime/crash_test.go b/libgo/go/runtime/crash_test.go
index 102b1a5f476..15c60008dbf 100644
--- a/libgo/go/runtime/crash_test.go
+++ b/libgo/go/runtime/crash_test.go
@@ -474,14 +474,6 @@ func TestRecoverBeforePanicAfterGoexit2(t *testing.T) {
}
func TestNetpollDeadlock(t *testing.T) {
- if os.Getenv("GO_BUILDER_NAME") == "darwin-amd64-10_12" {
- // A suspected kernel bug in macOS 10.12 occasionally results in
- // an apparent deadlock when dialing localhost. The errors have not
- // been observed on newer versions of the OS, so we don't plan to work
- // around them. See https://golang.org/issue/22019.
- testenv.SkipFlaky(t, 22019)
- }
-
t.Parallel()
output := runTestProg(t, "testprognet", "NetpollDeadlock")
want := "done\n"
diff --git a/libgo/go/runtime/crash_unix_test.go b/libgo/go/runtime/crash_unix_test.go
index 69aecd576ba..9dac167077a 100644
--- a/libgo/go/runtime/crash_unix_test.go
+++ b/libgo/go/runtime/crash_unix_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || dragonfly || freebsd || hurd || linux || netbsd || openbsd || solaris
// +build aix darwin dragonfly freebsd hurd linux netbsd openbsd solaris
package runtime_test
@@ -12,7 +13,6 @@ import (
"io"
"os"
"os/exec"
- "path/filepath"
"runtime"
"strings"
"sync"
@@ -69,7 +69,8 @@ func TestCrashDumpsAllThreads(t *testing.T) {
t.Skipf("skipping; not supported on %v", runtime.GOOS)
}
- if runtime.GOOS == "openbsd" && runtime.GOARCH == "mips64" {
+ if runtime.GOOS == "openbsd" && (runtime.GOARCH == "arm" || runtime.GOARCH == "mips64") {
+ // This may be ncpu < 2 related...
t.Skipf("skipping; test fails on %s/%s - see issue #42464", runtime.GOOS, runtime.GOARCH)
}
@@ -77,31 +78,14 @@ func TestCrashDumpsAllThreads(t *testing.T) {
t.Skip("skipping; SIGQUIT is blocked, see golang.org/issue/19196")
}
- // We don't use executeTest because we need to kill the
- // program while it is running.
-
testenv.MustHaveGoBuild(t)
- t.Parallel()
-
- dir, err := os.MkdirTemp("", "go-build")
- if err != nil {
- t.Fatalf("failed to create temp directory: %v", err)
- }
- defer os.RemoveAll(dir)
-
- if err := os.WriteFile(filepath.Join(dir, "main.go"), []byte(crashDumpsAllThreadsSource), 0666); err != nil {
- t.Fatalf("failed to create Go file: %v", err)
- }
-
- cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", "a.exe", "main.go")
- cmd.Dir = dir
- out, err := testenv.CleanCmdEnv(cmd).CombinedOutput()
+ exe, err := buildTestProg(t, "testprog")
if err != nil {
- t.Fatalf("building source: %v\n%s", err, out)
+ t.Fatal(err)
}
- cmd = exec.Command(filepath.Join(dir, "a.exe"))
+ cmd := exec.Command(exe, "CrashDumpsAllThreads")
cmd = testenv.CleanCmdEnv(cmd)
cmd.Env = append(cmd.Env,
"GOTRACEBACK=crash",
@@ -123,9 +107,12 @@ func TestCrashDumpsAllThreads(t *testing.T) {
if err != nil {
t.Fatal(err)
}
+ defer rp.Close()
+
cmd.ExtraFiles = []*os.File{wp}
if err := cmd.Start(); err != nil {
+ wp.Close()
t.Fatalf("starting program: %v", err)
}
@@ -147,59 +134,14 @@ func TestCrashDumpsAllThreads(t *testing.T) {
// We want to see a stack trace for each thread.
// Before https://golang.org/cl/2811 running threads would say
// "goroutine running on other thread; stack unavailable".
- out = outbuf.Bytes()
- n := bytes.Count(out, []byte("main.loop"))
+ out := outbuf.Bytes()
+ n := bytes.Count(out, []byte("main.crashDumpsAllThreadsLoop"))
if n != 4 {
t.Errorf("found %d instances of main.loop; expected 4", n)
t.Logf("%s", out)
}
}
-const crashDumpsAllThreadsSource = `
-package main
-
-import (
- "fmt"
- "os"
- "runtime"
- "time"
-)
-
-func main() {
- const count = 4
- runtime.GOMAXPROCS(count + 1)
-
- chans := make([]chan bool, count)
- for i := range chans {
- chans[i] = make(chan bool)
- go loop(i, chans[i])
- }
-
- // Wait for all the goroutines to start executing.
- for _, c := range chans {
- <-c
- }
-
- time.Sleep(time.Millisecond)
-
- // Tell our parent that all the goroutines are executing.
- if _, err := os.NewFile(3, "pipe").WriteString("x"); err != nil {
- fmt.Fprintf(os.Stderr, "write to pipe failed: %v\n", err)
- os.Exit(2)
- }
-
- select {}
-}
-
-func loop(i int, c chan bool) {
- close(c)
- for {
- for j := 0; j < 0x7fffffff; j++ {
- }
- }
-}
-`
-
func TestPanicSystemstack(t *testing.T) {
// Test that GOTRACEBACK=crash prints both the system and user
// stack of other threads.
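
TestCrashDumpsAllThreads, rewritten above to use the prebuilt testprog binary, keeps its original synchronization scheme: the parent passes the write end of a pipe to the child as file descriptor 3 via cmd.ExtraFiles, and the child writes one byte once all its goroutines are running. A hedged, self-contained sketch of that handshake; the names and structure are illustrative rather than copied from the test.

    package main

    import (
        "os"
        "os/exec"
    )

    func parent() error {
        rp, wp, err := os.Pipe()
        if err != nil {
            return err
        }
        defer rp.Close()

        cmd := exec.Command(os.Args[0], "child")
        cmd.ExtraFiles = []*os.File{wp} // becomes fd 3 in the child
        if err := cmd.Start(); err != nil {
            wp.Close()
            return err
        }
        wp.Close() // keep only the child's copy of the write end open

        var buf [1]byte
        if _, err := rp.Read(buf[:]); err != nil { // blocks until the child signals readiness
            return err
        }
        // ... the child is now ready; send it a signal, collect its output, etc. ...
        return cmd.Wait()
    }

    func child() {
        // fd 3 is the inherited write end of the pipe; one byte means "ready".
        if _, err := os.NewFile(3, "pipe").WriteString("x"); err != nil {
            os.Exit(2)
        }
        select {} // keep running until the parent stops us
    }

    func main() {
        if len(os.Args) > 1 && os.Args[1] == "child" {
            child()
            return
        }
        _ = parent()
    }
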
diff --git a/libgo/go/runtime/debug.go b/libgo/go/runtime/debug.go
index 5fb02cb4668..0cd613a3183 100644
--- a/libgo/go/runtime/debug.go
+++ b/libgo/go/runtime/debug.go
@@ -45,7 +45,7 @@ func NumCPU() int {
// NumCgoCall returns the number of cgo calls made by the current process.
func NumCgoCall() int64 {
- var n int64
+ var n = int64(atomic.Load64(&ncgocall))
for mp := (*m)(atomic.Loadp(unsafe.Pointer(&allm))); mp != nil; mp = mp.alllink {
n += int64(mp.ncgocall)
}
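
NumCgoCall now seeds its total from the package-level ncgocall counter introduced in cgocall.go earlier in this patch, which (per its comment) presumably accumulates the counts of Ms that have already exited, and then adds the per-M counts of the Ms still on allm. A minimal sketch of that aggregation pattern, with illustrative names and a plain linked list standing in for allm:

    package main

    import "sync/atomic"

    type worker struct {
        calls uint64
        next  *worker
    }

    var (
        retiredCalls uint64  // accumulated from workers that have already exited
        live         *worker // linked list of workers still running
    )

    // retire folds a dying worker's count into the global total before it is unlinked.
    func retire(w *worker) {
        atomic.AddUint64(&retiredCalls, atomic.LoadUint64(&w.calls))
        // ... unlink w from the live list ...
    }

    // totalCalls sums the retired total with the counts of the live workers.
    func totalCalls() uint64 {
        n := atomic.LoadUint64(&retiredCalls)
        for w := live; w != nil; w = w.next {
            n += atomic.LoadUint64(&w.calls)
        }
        return n
    }

    func main() { _ = totalCalls() }
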
diff --git a/libgo/go/runtime/debug/panic_test.go b/libgo/go/runtime/debug/panic_test.go
index b67a3de4f98..65f9555f376 100644
--- a/libgo/go/runtime/debug/panic_test.go
+++ b/libgo/go/runtime/debug/panic_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd
// +build aix darwin dragonfly freebsd linux netbsd openbsd
// TODO: test on Windows?
@@ -23,6 +24,9 @@ func TestPanicOnFault(t *testing.T) {
if runtime.GOOS == "ios" {
t.Skip("iOS doesn't provide fault addresses")
}
+ if runtime.GOOS == "netbsd" && runtime.GOARCH == "arm" {
+ t.Skip("netbsd-arm doesn't provide fault address (golang.org/issue/45026)")
+ }
m, err := syscall.Mmap(-1, 0, 0x1000, syscall.PROT_READ /* Note: no PROT_WRITE */, syscall.MAP_SHARED|syscall.MAP_ANON)
if err != nil {
t.Fatalf("can't map anonymous memory: %s", err)
diff --git a/libgo/go/runtime/debug_test.go b/libgo/go/runtime/debug_test.go
deleted file mode 100644
index 46261bc517b..00000000000
--- a/libgo/go/runtime/debug_test.go
+++ /dev/null
@@ -1,250 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// TODO: This test could be implemented on all (most?) UNIXes if we
-// added syscall.Tgkill more widely.
-
-// We skip all of these tests under race mode because our test thread
-// spends all of its time in the race runtime, which isn't a safe
-// point.
-
-// +build ignore_for_gccgo
-// +build amd64
-// +build linux
-// +build !race
-
-package runtime_test
-
-import (
- "fmt"
- "os"
- "regexp"
- "runtime"
- "runtime/debug"
- "sync/atomic"
- "syscall"
- "testing"
-)
-
-func startDebugCallWorker(t *testing.T) (g *runtime.G, after func()) {
- // This can deadlock if run under a debugger because it
- // depends on catching SIGTRAP, which is usually swallowed by
- // a debugger.
- skipUnderDebugger(t)
-
- // This can deadlock if there aren't enough threads or if a GC
- // tries to interrupt an atomic loop (see issue #10958). We
- // use 8 Ps so there's room for the debug call worker,
- // something that's trying to preempt the call worker, and the
- // goroutine that's trying to stop the call worker.
- ogomaxprocs := runtime.GOMAXPROCS(8)
- ogcpercent := debug.SetGCPercent(-1)
-
- // ready is a buffered channel so debugCallWorker won't block
- // on sending to it. This makes it less likely we'll catch
- // debugCallWorker while it's in the runtime.
- ready := make(chan *runtime.G, 1)
- var stop uint32
- done := make(chan error)
- go debugCallWorker(ready, &stop, done)
- g = <-ready
- return g, func() {
- atomic.StoreUint32(&stop, 1)
- err := <-done
- if err != nil {
- t.Fatal(err)
- }
- runtime.GOMAXPROCS(ogomaxprocs)
- debug.SetGCPercent(ogcpercent)
- }
-}
-
-func debugCallWorker(ready chan<- *runtime.G, stop *uint32, done chan<- error) {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
- ready <- runtime.Getg()
-
- x := 2
- debugCallWorker2(stop, &x)
- if x != 1 {
- done <- fmt.Errorf("want x = 2, got %d; register pointer not adjusted?", x)
- }
- close(done)
-}
-
-// Don't inline this function, since we want to test adjusting
-// pointers in the arguments.
-//
-//go:noinline
-func debugCallWorker2(stop *uint32, x *int) {
- for atomic.LoadUint32(stop) == 0 {
- // Strongly encourage x to live in a register so we
- // can test pointer register adjustment.
- *x++
- }
- *x = 1
-}
-
-func debugCallTKill(tid int) error {
- return syscall.Tgkill(syscall.Getpid(), tid, syscall.SIGTRAP)
-}
-
-// skipUnderDebugger skips the current test when running under a
-// debugger (specifically if this process has a tracer). This is
-// Linux-specific.
-func skipUnderDebugger(t *testing.T) {
- pid := syscall.Getpid()
- status, err := os.ReadFile(fmt.Sprintf("/proc/%d/status", pid))
- if err != nil {
- t.Logf("couldn't get proc tracer: %s", err)
- return
- }
- re := regexp.MustCompile(`TracerPid:\s+([0-9]+)`)
- sub := re.FindSubmatch(status)
- if sub == nil {
- t.Logf("couldn't find proc tracer PID")
- return
- }
- if string(sub[1]) == "0" {
- return
- }
- t.Skip("test will deadlock under a debugger")
-}
-
-func TestDebugCall(t *testing.T) {
- g, after := startDebugCallWorker(t)
- defer after()
-
- // Inject a call into the debugCallWorker goroutine and test
- // basic argument and result passing.
- var args struct {
- x int
- yRet int
- }
- fn := func(x int) (yRet int) {
- return x + 1
- }
- args.x = 42
- if _, err := runtime.InjectDebugCall(g, fn, &args, debugCallTKill, false); err != nil {
- t.Fatal(err)
- }
- if args.yRet != 43 {
- t.Fatalf("want 43, got %d", args.yRet)
- }
-}
-
-func TestDebugCallLarge(t *testing.T) {
- g, after := startDebugCallWorker(t)
- defer after()
-
- // Inject a call with a large call frame.
- const N = 128
- var args struct {
- in [N]int
- out [N]int
- }
- fn := func(in [N]int) (out [N]int) {
- for i := range in {
- out[i] = in[i] + 1
- }
- return
- }
- var want [N]int
- for i := range args.in {
- args.in[i] = i
- want[i] = i + 1
- }
- if _, err := runtime.InjectDebugCall(g, fn, &args, debugCallTKill, false); err != nil {
- t.Fatal(err)
- }
- if want != args.out {
- t.Fatalf("want %v, got %v", want, args.out)
- }
-}
-
-func TestDebugCallGC(t *testing.T) {
- g, after := startDebugCallWorker(t)
- defer after()
-
- // Inject a call that performs a GC.
- if _, err := runtime.InjectDebugCall(g, runtime.GC, nil, debugCallTKill, false); err != nil {
- t.Fatal(err)
- }
-}
-
-func TestDebugCallGrowStack(t *testing.T) {
- g, after := startDebugCallWorker(t)
- defer after()
-
- // Inject a call that grows the stack. debugCallWorker checks
- // for stack pointer breakage.
- if _, err := runtime.InjectDebugCall(g, func() { growStack(nil) }, nil, debugCallTKill, false); err != nil {
- t.Fatal(err)
- }
-}
-
-//go:nosplit
-func debugCallUnsafePointWorker(gpp **runtime.G, ready, stop *uint32) {
- // The nosplit causes this function to not contain safe-points
- // except at calls.
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
-
- *gpp = runtime.Getg()
-
- for atomic.LoadUint32(stop) == 0 {
- atomic.StoreUint32(ready, 1)
- }
-}
-
-func TestDebugCallUnsafePoint(t *testing.T) {
- skipUnderDebugger(t)
-
- // This can deadlock if there aren't enough threads or if a GC
- // tries to interrupt an atomic loop (see issue #10958).
- defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(8))
- defer debug.SetGCPercent(debug.SetGCPercent(-1))
-
- // Test that the runtime refuses call injection at unsafe points.
- var g *runtime.G
- var ready, stop uint32
- defer atomic.StoreUint32(&stop, 1)
- go debugCallUnsafePointWorker(&g, &ready, &stop)
- for atomic.LoadUint32(&ready) == 0 {
- runtime.Gosched()
- }
-
- _, err := runtime.InjectDebugCall(g, func() {}, nil, debugCallTKill, true)
- if msg := "call not at safe point"; err == nil || err.Error() != msg {
- t.Fatalf("want %q, got %s", msg, err)
- }
-}
-
-func TestDebugCallPanic(t *testing.T) {
- skipUnderDebugger(t)
-
- // This can deadlock if there aren't enough threads.
- defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(8))
-
- ready := make(chan *runtime.G)
- var stop uint32
- defer atomic.StoreUint32(&stop, 1)
- go func() {
- runtime.LockOSThread()
- defer runtime.UnlockOSThread()
- ready <- runtime.Getg()
- for atomic.LoadUint32(&stop) == 0 {
- }
- }()
- g := <-ready
-
- p, err := runtime.InjectDebugCall(g, func() { panic("test") }, nil, debugCallTKill, false)
- if err != nil {
- t.Fatal(err)
- }
- if ps, ok := p.(string); !ok || ps != "test" {
- t.Fatalf("wanted panic %v, got %v", "test", p)
- }
-}
diff --git a/libgo/go/runtime/debuglog_off.go b/libgo/go/runtime/debuglog_off.go
index bb3e172498e..dd381569992 100644
--- a/libgo/go/runtime/debuglog_off.go
+++ b/libgo/go/runtime/debuglog_off.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !debuglog
// +build !debuglog
package runtime
diff --git a/libgo/go/runtime/debuglog_on.go b/libgo/go/runtime/debuglog_on.go
index 3d477e8ef5f..2fcdbe70d14 100644
--- a/libgo/go/runtime/debuglog_on.go
+++ b/libgo/go/runtime/debuglog_on.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build debuglog
// +build debuglog
package runtime
diff --git a/libgo/go/runtime/defer_test.go b/libgo/go/runtime/defer_test.go
index 9a40ea19842..fc961445975 100644
--- a/libgo/go/runtime/defer_test.go
+++ b/libgo/go/runtime/defer_test.go
@@ -370,7 +370,7 @@ func g2() {
defer ap.method2()
defer ap.method1()
ff1(ap, 1, 2, 3, 4, 5, 6, 7, 8, 9)
- // Try to get the stack to be be moved by growing it too large, so
+ // Try to get the stack to be moved by growing it too large, so
// existing stack-allocated defer becomes invalid.
rec1(2000)
}
diff --git a/libgo/go/runtime/defs_windows_arm64.go b/libgo/go/runtime/defs_windows_arm64.go
new file mode 100644
index 00000000000..9ccce46f096
--- /dev/null
+++ b/libgo/go/runtime/defs_windows_arm64.go
@@ -0,0 +1,83 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+// NOTE(rsc): _CONTEXT_CONTROL is actually 0x400001 and should include PC, SP, and LR.
+// However, empirically, LR doesn't come along on Windows 10
+// unless you also set _CONTEXT_INTEGER (0x400002).
+// Without LR, we skip over the next-to-bottom function in profiles
+// when the bottom function is frameless.
+// So we set both here, to make a working _CONTEXT_CONTROL.
+const _CONTEXT_CONTROL = 0x400003
+
+type neon128 struct {
+ low uint64
+ high int64
+}
+
+// See https://docs.microsoft.com/en-us/windows/win32/api/winnt/ns-winnt-arm64_nt_context
+type context struct {
+ contextflags uint32
+ cpsr uint32
+ x [31]uint64 // fp is x[29], lr is x[30]
+ xsp uint64
+ pc uint64
+ v [32]neon128
+ fpcr uint32
+ fpsr uint32
+ bcr [8]uint32
+ bvr [8]uint64
+ wcr [2]uint32
+ wvr [2]uint64
+}
+
+func (c *context) ip() uintptr { return uintptr(c.pc) }
+func (c *context) sp() uintptr { return uintptr(c.xsp) }
+func (c *context) lr() uintptr { return uintptr(c.x[30]) }
+
+func (c *context) set_ip(x uintptr) { c.pc = uint64(x) }
+func (c *context) set_sp(x uintptr) { c.xsp = uint64(x) }
+func (c *context) set_lr(x uintptr) { c.x[30] = uint64(x) }
+
+func dumpregs(r *context) {
+ print("r0 ", hex(r.x[0]), "\n")
+ print("r1 ", hex(r.x[1]), "\n")
+ print("r2 ", hex(r.x[2]), "\n")
+ print("r3 ", hex(r.x[3]), "\n")
+ print("r4 ", hex(r.x[4]), "\n")
+ print("r5 ", hex(r.x[5]), "\n")
+ print("r6 ", hex(r.x[6]), "\n")
+ print("r7 ", hex(r.x[7]), "\n")
+ print("r8 ", hex(r.x[8]), "\n")
+ print("r9 ", hex(r.x[9]), "\n")
+ print("r10 ", hex(r.x[10]), "\n")
+ print("r11 ", hex(r.x[11]), "\n")
+ print("r12 ", hex(r.x[12]), "\n")
+ print("r13 ", hex(r.x[13]), "\n")
+ print("r14 ", hex(r.x[14]), "\n")
+ print("r15 ", hex(r.x[15]), "\n")
+ print("r16 ", hex(r.x[16]), "\n")
+ print("r17 ", hex(r.x[17]), "\n")
+ print("r18 ", hex(r.x[18]), "\n")
+ print("r19 ", hex(r.x[19]), "\n")
+ print("r20 ", hex(r.x[20]), "\n")
+ print("r21 ", hex(r.x[21]), "\n")
+ print("r22 ", hex(r.x[22]), "\n")
+ print("r23 ", hex(r.x[23]), "\n")
+ print("r24 ", hex(r.x[24]), "\n")
+ print("r25 ", hex(r.x[25]), "\n")
+ print("r26 ", hex(r.x[26]), "\n")
+ print("r27 ", hex(r.x[27]), "\n")
+ print("r28 ", hex(r.x[28]), "\n")
+ print("r29 ", hex(r.x[29]), "\n")
+ print("lr ", hex(r.x[30]), "\n")
+ print("sp ", hex(r.xsp), "\n")
+ print("pc ", hex(r.pc), "\n")
+ print("cpsr ", hex(r.cpsr), "\n")
+}
+
+func stackcheck() {
+ // TODO: not implemented on ARM
+}
diff --git a/libgo/go/runtime/env_posix.go b/libgo/go/runtime/env_posix.go
index 48b9c4a32e1..8893c4a6195 100644
--- a/libgo/go/runtime/env_posix.go
+++ b/libgo/go/runtime/env_posix.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || dragonfly || freebsd || hurd || (js && wasm) || linux || netbsd || openbsd || solaris || windows || plan9
// +build aix darwin dragonfly freebsd hurd js,wasm linux netbsd openbsd solaris windows plan9
package runtime
diff --git a/libgo/go/runtime/example_test.go b/libgo/go/runtime/example_test.go
index 8dac9297509..e7f8dbc1306 100644
--- a/libgo/go/runtime/example_test.go
+++ b/libgo/go/runtime/example_test.go
@@ -12,12 +12,15 @@ import (
func ExampleFrames() {
c := func() {
- // Ask runtime.Callers for up to 10 pcs, including runtime.Callers itself.
+ // Ask runtime.Callers for up to 10 PCs, including runtime.Callers itself.
pc := make([]uintptr, 10)
n := runtime.Callers(0, pc)
if n == 0 {
- // No pcs available. Stop now.
- // This can happen if the first argument to runtime.Callers is large.
+ // No PCs available. This can happen if the first argument to
+ // runtime.Callers is large.
+ //
+ // Return now to avoid processing the zero Frame that would
+ // otherwise be returned by frames.Next below.
return
}
@@ -25,9 +28,12 @@ func ExampleFrames() {
frames := runtime.CallersFrames(pc)
// Loop to get frames.
- // A fixed number of pcs can expand to an indefinite number of Frames.
+ // A fixed number of PCs can expand to an indefinite number of Frames.
for {
frame, more := frames.Next()
+
+ // Process this frame.
+ //
// To keep this example's output stable
// even if there are changes in the testing package,
// stop unwinding when we leave package runtime.
@@ -35,6 +41,8 @@ func ExampleFrames() {
break
}
fmt.Printf("- more:%v | %s\n", more, frame.Function)
+
+ // Check whether there are more frames to process after this one.
if !more {
break
}
diff --git a/libgo/go/runtime/export_debug_test.go b/libgo/go/runtime/export_debug_test.go
deleted file mode 100644
index c1cc44f23d0..00000000000
--- a/libgo/go/runtime/export_debug_test.go
+++ /dev/null
@@ -1,200 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build ignore_for_gccgo
-// +build amd64
-// +build linux
-
-package runtime
-
-import (
- "runtime/internal/sys"
- "unsafe"
-)
-
-// InjectDebugCall injects a debugger call to fn into g. args must be
-// a pointer to a valid call frame (including arguments and return
-// space) for fn, or nil. tkill must be a function that will send
-// SIGTRAP to thread ID tid. gp must be locked to its OS thread and
-// running.
-//
-// On success, InjectDebugCall returns the panic value of fn or nil.
-// If fn did not panic, its results will be available in args.
-func InjectDebugCall(gp *g, fn, args interface{}, tkill func(tid int) error, returnOnUnsafePoint bool) (interface{}, error) {
- if gp.lockedm == 0 {
- return nil, plainError("goroutine not locked to thread")
- }
-
- tid := int(gp.lockedm.ptr().procid)
- if tid == 0 {
- return nil, plainError("missing tid")
- }
-
- f := efaceOf(&fn)
- if f._type == nil || f._type.kind&kindMask != kindFunc {
- return nil, plainError("fn must be a function")
- }
- fv := (*funcval)(f.data)
-
- a := efaceOf(&args)
- if a._type != nil && a._type.kind&kindMask != kindPtr {
- return nil, plainError("args must be a pointer or nil")
- }
- argp := a.data
- var argSize uintptr
- if argp != nil {
- argSize = (*ptrtype)(unsafe.Pointer(a._type)).elem.size
- }
-
- h := new(debugCallHandler)
- h.gp = gp
- // gp may not be running right now, but we can still get the M
- // it will run on since it's locked.
- h.mp = gp.lockedm.ptr()
- h.fv, h.argp, h.argSize = fv, argp, argSize
- h.handleF = h.handle // Avoid allocating closure during signal
-
- defer func() { testSigtrap = nil }()
- for i := 0; ; i++ {
- testSigtrap = h.inject
- noteclear(&h.done)
- h.err = ""
-
- if err := tkill(tid); err != nil {
- return nil, err
- }
- // Wait for completion.
- notetsleepg(&h.done, -1)
- if h.err != "" {
- switch h.err {
- case "call not at safe point":
- if returnOnUnsafePoint {
- // This is for TestDebugCallUnsafePoint.
- return nil, h.err
- }
- fallthrough
- case "retry _Grunnable", "executing on Go runtime stack", "call from within the Go runtime":
- // These are transient states. Try to get out of them.
- if i < 100 {
- usleep(100)
- Gosched()
- continue
- }
- }
- return nil, h.err
- }
- return h.panic, nil
- }
-}
-
-type debugCallHandler struct {
- gp *g
- mp *m
- fv *funcval
- argp unsafe.Pointer
- argSize uintptr
- panic interface{}
-
- handleF func(info *siginfo, ctxt *sigctxt, gp2 *g) bool
-
- err plainError
- done note
- savedRegs sigcontext
- savedFP fpstate1
-}
-
-func (h *debugCallHandler) inject(info *siginfo, ctxt *sigctxt, gp2 *g) bool {
- switch h.gp.atomicstatus {
- case _Grunning:
- if getg().m != h.mp {
- println("trap on wrong M", getg().m, h.mp)
- return false
- }
- // Push current PC on the stack.
- rsp := ctxt.rsp() - sys.PtrSize
- *(*uint64)(unsafe.Pointer(uintptr(rsp))) = ctxt.rip()
- ctxt.set_rsp(rsp)
- // Write the argument frame size.
- *(*uintptr)(unsafe.Pointer(uintptr(rsp - 16))) = h.argSize
- // Save current registers.
- h.savedRegs = *ctxt.regs()
- h.savedFP = *h.savedRegs.fpstate
- h.savedRegs.fpstate = nil
- // Set PC to debugCallV1.
- ctxt.set_rip(uint64(funcPC(debugCallV1)))
- // Call injected. Switch to the debugCall protocol.
- testSigtrap = h.handleF
- case _Grunnable:
- // Ask InjectDebugCall to pause for a bit and then try
- // again to interrupt this goroutine.
- h.err = plainError("retry _Grunnable")
- notewakeup(&h.done)
- default:
- h.err = plainError("goroutine in unexpected state at call inject")
- notewakeup(&h.done)
- }
- // Resume execution.
- return true
-}
-
-func (h *debugCallHandler) handle(info *siginfo, ctxt *sigctxt, gp2 *g) bool {
- // Sanity check.
- if getg().m != h.mp {
- println("trap on wrong M", getg().m, h.mp)
- return false
- }
- f := findfunc(uintptr(ctxt.rip()))
- if !(hasPrefix(funcname(f), "runtime.debugCall") || hasPrefix(funcname(f), "debugCall")) {
- println("trap in unknown function", funcname(f))
- return false
- }
- if *(*byte)(unsafe.Pointer(uintptr(ctxt.rip() - 1))) != 0xcc {
- println("trap at non-INT3 instruction pc =", hex(ctxt.rip()))
- return false
- }
-
- switch status := ctxt.rax(); status {
- case 0:
- // Frame is ready. Copy the arguments to the frame.
- sp := ctxt.rsp()
- memmove(unsafe.Pointer(uintptr(sp)), h.argp, h.argSize)
- // Push return PC.
- sp -= sys.PtrSize
- ctxt.set_rsp(sp)
- *(*uint64)(unsafe.Pointer(uintptr(sp))) = ctxt.rip()
- // Set PC to call and context register.
- ctxt.set_rip(uint64(h.fv.fn))
- ctxt.regs().rcx = uint64(uintptr(unsafe.Pointer(h.fv)))
- case 1:
- // Function returned. Copy frame back out.
- sp := ctxt.rsp()
- memmove(h.argp, unsafe.Pointer(uintptr(sp)), h.argSize)
- case 2:
- // Function panicked. Copy panic out.
- sp := ctxt.rsp()
- memmove(unsafe.Pointer(&h.panic), unsafe.Pointer(uintptr(sp)), 2*sys.PtrSize)
- case 8:
- // Call isn't safe. Get the reason.
- sp := ctxt.rsp()
- reason := *(*string)(unsafe.Pointer(uintptr(sp)))
- h.err = plainError(reason)
- // Don't wake h.done. We need to transition to status 16 first.
- case 16:
- // Restore all registers except RIP and RSP.
- rip, rsp := ctxt.rip(), ctxt.rsp()
- fp := ctxt.regs().fpstate
- *ctxt.regs() = h.savedRegs
- ctxt.regs().fpstate = fp
- *fp = h.savedFP
- ctxt.set_rip(rip)
- ctxt.set_rsp(rsp)
- // Done
- notewakeup(&h.done)
- default:
- h.err = plainError("unexpected debugCallV1 status")
- notewakeup(&h.done)
- }
- // Resume execution.
- return true
-}
diff --git a/libgo/go/runtime/export_mmap_test.go b/libgo/go/runtime/export_mmap_test.go
index 000948b78c3..aa498bb932e 100644
--- a/libgo/go/runtime/export_mmap_test.go
+++ b/libgo/go/runtime/export_mmap_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || dragonfly || freebsd || hurd || linux || netbsd || openbsd || solaris
// +build aix darwin dragonfly freebsd hurd linux netbsd openbsd solaris
// Export guts for testing.
diff --git a/libgo/go/runtime/export_pipe2_test.go b/libgo/go/runtime/export_pipe2_test.go
index 209c6b14a11..22ac4b7a05d 100644
--- a/libgo/go/runtime/export_pipe2_test.go
+++ b/libgo/go/runtime/export_pipe2_test.go
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build freebsd hurd linux netbsd openbsd solaris
+//go:build dragonfly || freebsd || hurd || linux || netbsd || openbsd || solaris
+// +build dragonfly freebsd hurd linux netbsd openbsd solaris
package runtime
diff --git a/libgo/go/runtime/export_pipe_test.go b/libgo/go/runtime/export_pipe_test.go
index 8f66770fb98..a0c6c0440d8 100644
--- a/libgo/go/runtime/export_pipe_test.go
+++ b/libgo/go/runtime/export_pipe_test.go
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build aix darwin dragonfly
+//go:build aix || darwin
+// +build aix darwin
package runtime
diff --git a/libgo/go/runtime/export_test.go b/libgo/go/runtime/export_test.go
index 1455c2247d8..5255a708e22 100644
--- a/libgo/go/runtime/export_test.go
+++ b/libgo/go/runtime/export_test.go
@@ -46,6 +46,14 @@ var NetpollGenericInit = netpollGenericInit
var Memmove = memmove
var MemclrNoHeapPointers = memclrNoHeapPointers
+var LockPartialOrder = lockPartialOrder
+
+type LockRank lockRank
+
+func (l LockRank) String() string {
+ return lockRank(l).String()
+}
+
const PreemptMSupported = preemptMSupported
type LFNode struct {
@@ -136,35 +144,47 @@ func RunSchedLocalQueueStealTest() {
}
}
+// Temporary to enable register ABI bringup.
+// TODO(register args): convert back to local variables in RunSchedLocalQueueEmptyTest that
+// get passed to the "go" stmts there.
+var RunSchedLocalQueueEmptyState struct {
+ done chan bool
+ ready *uint32
+ p *p
+}
+
func RunSchedLocalQueueEmptyTest(iters int) {
// Test that runq is not spuriously reported as empty.
// Runq emptiness affects scheduling decisions and spurious emptiness
// can lead to underutilization (both runnable Gs and idle Ps coexist
// for arbitrary long time).
done := make(chan bool, 1)
- _p_ := new(p)
+ RunSchedLocalQueueEmptyState.done = done
+ p := new(p)
+ RunSchedLocalQueueEmptyState.p = p
gs := make([]g, 2)
ready := new(uint32)
+ RunSchedLocalQueueEmptyState.ready = ready
for i := 0; i < iters; i++ {
*ready = 0
next0 := (i & 1) == 0
next1 := (i & 2) == 0
- runqput(_p_, &gs[0], next0)
- go func(done chan bool, p *p, ready *uint32, next0, next1 bool) {
- for atomic.Xadd(ready, 1); atomic.Load(ready) != 2; {
+ runqput(p, &gs[0], next0)
+ go func() {
+ for atomic.Xadd(RunSchedLocalQueueEmptyState.ready, 1); atomic.Load(RunSchedLocalQueueEmptyState.ready) != 2; {
}
- if runqempty(p) {
- println("next:", next0, next1)
+ if runqempty(RunSchedLocalQueueEmptyState.p) {
+ //println("next:", next0, next1)
throw("queue is empty")
}
- done <- true
- }(done, _p_, ready, next0, next1)
+ RunSchedLocalQueueEmptyState.done <- true
+ }()
for atomic.Xadd(ready, 1); atomic.Load(ready) != 2; {
}
- runqput(_p_, &gs[1], next1)
- runqget(_p_)
+ runqput(p, &gs[1], next1)
+ runqget(p)
<-done
- runqget(_p_)
+ runqget(p)
}
}
@@ -195,8 +215,6 @@ var HashLoad = &hashLoad
// return
//}
-type Uintreg sys.Uintreg
-
var Open = open
var Close = closefd
var Read = read
@@ -370,7 +388,7 @@ func ReadMemStatsSlow() (base, slow MemStats) {
bySize[i].Mallocs += uint64(m.smallFreeCount[i])
smallFree += uint64(m.smallFreeCount[i]) * uint64(class_to_size[i])
}
- slow.Frees += memstats.tinyallocs + uint64(m.largeFreeCount)
+ slow.Frees += uint64(m.tinyAllocCount) + uint64(m.largeFreeCount)
slow.Mallocs += slow.Frees
slow.TotalAlloc = slow.Alloc + uint64(m.largeFree) + smallFree
@@ -393,9 +411,6 @@ func ReadMemStatsSlow() (base, slow MemStats) {
slow.HeapReleased += uint64(pg) * pageSize
}
- // Unused space in the current arena also counts as released space.
- slow.HeapReleased += uint64(mheap_.curArena.end - mheap_.curArena.base)
-
getg().m.mallocing--
})
@@ -1127,31 +1142,6 @@ func SemNwait(addr *uint32) uint32 {
return atomic.Load(&root.nwait)
}
-// MapHashCheck computes the hash of the key k for the map m, twice.
-// Method 1 uses the built-in hasher for the map.
-// Method 2 uses the typehash function (the one used by reflect).
-// Returns the two hash values, which should always be equal.
-func MapHashCheck(m interface{}, k interface{}) (uintptr, uintptr) {
- // Unpack m.
- mt := (*maptype)(unsafe.Pointer(efaceOf(&m)._type))
- mh := (*hmap)(efaceOf(&m).data)
-
- // Unpack k.
- kt := efaceOf(&k)._type
- var p unsafe.Pointer
- if isDirectIface(kt) {
- q := efaceOf(&k).data
- p = unsafe.Pointer(&q)
- } else {
- p = efaceOf(&k).data
- }
-
- // Compute the hash functions.
- x := mt.hasher(noescape(p), uintptr(mh.hash0))
- y := typehash(kt, noescape(p), uintptr(mh.hash0))
- return x, y
-}
-
// mspan wrapper for testing.
//go:notinheap
type MSpan mspan
@@ -1210,3 +1200,28 @@ func (th *TimeHistogram) Record(duration int64) {
}
var Pusestackmaps = &usestackmaps
+
+func FinalizerGAsleep() bool {
+ lock(&finlock)
+ result := fingwait
+ unlock(&finlock)
+ return result
+}
+
+// For GCTestIsReachable, it's important that we do this as a call so
+// escape analysis can see through it.
+func GCTestIsReachable(ptrs ...unsafe.Pointer) (mask uint64) {
+ return gcTestIsReachable(ptrs...)
+}
+
+// For GCTestPointerClass, it's important that we do this as a call so
+// escape analysis can see through it.
+//
+// This is nosplit because gcTestPointerClass is.
+//
+//go:nosplit
+func GCTestPointerClass(p unsafe.Pointer) string {
+ return gcTestPointerClass(p)
+}
+
+const Raceenabled = raceenabled
diff --git a/libgo/go/runtime/export_unix_test.go b/libgo/go/runtime/export_unix_test.go
index 801786d60e6..180af01047a 100644
--- a/libgo/go/runtime/export_unix_test.go
+++ b/libgo/go/runtime/export_unix_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || dragonfly || freebsd || hurd || linux || netbsd || openbsd || solaris
// +build aix darwin dragonfly freebsd hurd linux netbsd openbsd solaris
package runtime
diff --git a/libgo/go/runtime/extern.go b/libgo/go/runtime/extern.go
index 8d10c8572db..0d7f3577913 100644
--- a/libgo/go/runtime/extern.go
+++ b/libgo/go/runtime/extern.go
@@ -110,8 +110,6 @@ It is a comma-separated list of name=val pairs setting these named variables:
with a trivial allocator that obtains memory from the operating system and
never reclaims any memory.
- scavenge: scavenge=1 enables debugging mode of heap scavenger.
-
scavtrace: setting scavtrace=1 causes the runtime to emit a single line to standard
error, roughly once per GC cycle, summarizing the amount of work done by the
scavenger as well as the total amount of memory returned to the operating system
@@ -213,6 +211,8 @@ func Caller(skip int) (pc uintptr, file string, line int, ok bool)
// program counter adjustment.
func Callers(skip int, pc []uintptr) int
+var defaultGOROOT string // set by cmd/link
+
// GOROOT returns the root of the Go tree. It uses the
// GOROOT environment variable, if set at process start,
// or else the root used during the Go build.
@@ -221,14 +221,24 @@ func GOROOT() string {
if s != "" {
return s
}
- return sys.DefaultGoroot
+ return defaultGOROOT
}
+// buildVersion is the Go tree's version string at build time.
+//
+// If any GOEXPERIMENTs are set to non-default values, it will include
+// "X:<GOEXPERIMENT>".
+//
+// This is set by the linker.
+//
+// This is accessed by "go version <binary>".
+var buildVersion string
+
// Version returns the Go tree's version string.
// It is either the commit hash and date at the time of the build or,
// when possible, a release tag like "go1.3".
func Version() string {
- return sys.TheVersion
+ return buildVersion
}
// GOOS is the running program's operating system target:
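The hunk above replaces the sys.DefaultGoroot and sys.TheVersion constants with the linker-set variables defaultGOROOT and buildVersion. A minimal caller-side sketch (standalone, not part of this patch) of how the exported accessors behave:

package main

import (
    "fmt"
    "runtime"
)

func main() {
    // Version reports the linker-stamped build version, e.g. "go1.17",
    // or a commit hash and date for development builds.
    fmt.Println("version:", runtime.Version())

    // GOROOT prefers the GOROOT environment variable captured at process
    // start and falls back to the linker-set default root.
    fmt.Println("goroot:", runtime.GOROOT())
}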
diff --git a/libgo/go/runtime/gc_test.go b/libgo/go/runtime/gc_test.go
index cde2e14395e..bb5e1ae6ebf 100644
--- a/libgo/go/runtime/gc_test.go
+++ b/libgo/go/runtime/gc_test.go
@@ -208,6 +208,126 @@ func TestGcZombieReporting(t *testing.T) {
}
}
+/*
+
+func TestGCTestMoveStackOnNextCall(t *testing.T) {
+ t.Parallel()
+ var onStack int
+ // GCTestMoveStackOnNextCall can fail in rare cases if there's
+ // a preemption. This won't happen many times in quick
+ // succession, so just retry a few times.
+ for retry := 0; retry < 5; retry++ {
+ runtime.GCTestMoveStackOnNextCall()
+ if moveStackCheck(t, &onStack, uintptr(unsafe.Pointer(&onStack))) {
+ // Passed.
+ return
+ }
+ }
+ t.Fatal("stack did not move")
+}
+
+// This must not be inlined because the point is to force a stack
+// growth check and move the stack.
+//
+//go:noinline
+func moveStackCheck(t *testing.T, new *int, old uintptr) bool {
+ // new should have been updated by the stack move;
+ // old should not have.
+
+ // Capture new's value before doing anything that could
+ // further move the stack.
+ new2 := uintptr(unsafe.Pointer(new))
+
+ t.Logf("old stack pointer %x, new stack pointer %x", old, new2)
+ if new2 == old {
+ // Check that we didn't screw up the test's escape analysis.
+ if cls := runtime.GCTestPointerClass(unsafe.Pointer(new)); cls != "stack" {
+ t.Fatalf("test bug: new (%#x) should be a stack pointer, not %s", new2, cls)
+ }
+ // This was a real failure.
+ return false
+ }
+ return true
+}
+
+func TestGCTestMoveStackRepeatedly(t *testing.T) {
+ // Move the stack repeatedly to make sure we're not doubling
+ // it each time.
+ for i := 0; i < 100; i++ {
+ runtime.GCTestMoveStackOnNextCall()
+ moveStack1(false)
+ }
+}
+
+//go:noinline
+func moveStack1(x bool) {
+ // Make sure this function doesn't get auto-nosplit.
+ if x {
+ println("x")
+ }
+}
+
+*/
+
+func TestGCTestIsReachable(t *testing.T) {
+ var all, half []unsafe.Pointer
+ var want uint64
+ for i := 0; i < 16; i++ {
+ // The tiny allocator muddies things, so we use a
+ // scannable type.
+ p := unsafe.Pointer(new(*int))
+ all = append(all, p)
+ if i%2 == 0 {
+ half = append(half, p)
+ want |= 1 << i
+ }
+ }
+
+ got := runtime.GCTestIsReachable(all...)
+ if want != got {
+ // gccgo's conservative GC means that we sometimes
+ // keep data we shouldn't.
+ if runtime.Compiler == "gccgo" {
+ if ((got ^ want) & want) != 0 {
+ t.Fatalf("some expected bits not set: want %b, got %b", want, got)
+ }
+ } else {
+ t.Fatalf("did not get expected reachable set; want %b, got %b", want, got)
+ }
+ }
+ runtime.KeepAlive(half)
+}
+
+var pointerClassSink *int
+var pointerClassData = 42
+
+func TestGCTestPointerClass(t *testing.T) {
+ if runtime.Compiler == "gccgo" {
+ // gofrontend escape analysis doesn't handle passing
+ // &onStack through a closure.
+ t.Skip("skipping for gofrontend")
+ }
+
+ t.Parallel()
+ check := func(p unsafe.Pointer, want string) {
+ t.Helper()
+ got := runtime.GCTestPointerClass(p)
+ if got != want {
+ // Convert the pointer to a uintptr to avoid
+ // escaping it.
+ t.Errorf("for %#x, want class %s, got %s", uintptr(p), want, got)
+ }
+ }
+ var onStack int
+ var notOnStack int
+ pointerClassSink = &notOnStack
+ check(unsafe.Pointer(&onStack), "stack")
+ check(unsafe.Pointer(&notOnStack), "heap")
+ check(unsafe.Pointer(&pointerClassSink), "bss")
+ check(unsafe.Pointer(&pointerClassData), "data")
+ check(nil, "other")
+}
+
func BenchmarkSetTypePtr(b *testing.B) {
benchSetType(b, new(*byte))
}
diff --git a/libgo/go/runtime/hash32.go b/libgo/go/runtime/hash32.go
index 89efc1cde5c..58ae38b200d 100644
--- a/libgo/go/runtime/hash32.go
+++ b/libgo/go/runtime/hash32.go
@@ -3,10 +3,10 @@
// license that can be found in the LICENSE file.
// Hashing algorithm inspired by
-// xxhash: https://code.google.com/p/xxhash/
-// cityhash: https://code.google.com/p/cityhash/
+// wyhash: https://github.com/wangyi-fudan/wyhash/blob/ceb019b530e2c1c14d70b79bfa2bc49de7d95bc1/Modern%20Non-Cryptographic%20Hash%20Function%20and%20Pseudorandom%20Number%20Generator.pdf
-// +build 386 arm armbe m68k mips mipsle nios2 ppc riscv s390 sh shbe sparc
+//go:build 386 || arm || mips || mipsle || armbe || m68k || nios2 || ppc || riscv || s390 || sh || shbe || sparc
+// +build 386 arm mips mipsle armbe m68k nios2 ppc riscv s390 sh shbe sparc
package runtime
@@ -16,104 +16,54 @@ import "unsafe"
//
//go:linkname memhash
-const (
- // Constants for multiplication: four random odd 32-bit numbers.
- m1 = 3168982561
- m2 = 3339683297
- m3 = 832293441
- m4 = 2336365089
-)
+func memhash32(p unsafe.Pointer, seed uintptr) uintptr {
+ a, b := mix32(uint32(seed), uint32(4^hashkey[0]))
+ t := readUnaligned32(p)
+ a ^= t
+ b ^= t
+ a, b = mix32(a, b)
+ a, b = mix32(a, b)
+ return uintptr(a ^ b)
+}
+
+func memhash64(p unsafe.Pointer, seed uintptr) uintptr {
+ a, b := mix32(uint32(seed), uint32(8^hashkey[0]))
+ a ^= readUnaligned32(p)
+ b ^= readUnaligned32(add(p, 4))
+ a, b = mix32(a, b)
+ a, b = mix32(a, b)
+ return uintptr(a ^ b)
+}
func memhash(p unsafe.Pointer, seed, s uintptr) uintptr {
if GOARCH == "386" && GOOS != "nacl" && useAeshash {
return aeshash(p, seed, s)
}
- h := uint32(seed + s*hashkey[0])
-tail:
- switch {
- case s == 0:
- case s < 4:
- h ^= uint32(*(*byte)(p))
- h ^= uint32(*(*byte)(add(p, s>>1))) << 8
- h ^= uint32(*(*byte)(add(p, s-1))) << 16
- h = rotl_15(h*m1) * m2
- case s == 4:
- h ^= readUnaligned32(p)
- h = rotl_15(h*m1) * m2
- case s <= 8:
- h ^= readUnaligned32(p)
- h = rotl_15(h*m1) * m2
- h ^= readUnaligned32(add(p, s-4))
- h = rotl_15(h*m1) * m2
- case s <= 16:
- h ^= readUnaligned32(p)
- h = rotl_15(h*m1) * m2
- h ^= readUnaligned32(add(p, 4))
- h = rotl_15(h*m1) * m2
- h ^= readUnaligned32(add(p, s-8))
- h = rotl_15(h*m1) * m2
- h ^= readUnaligned32(add(p, s-4))
- h = rotl_15(h*m1) * m2
- default:
- v1 := h
- v2 := uint32(seed * hashkey[1])
- v3 := uint32(seed * hashkey[2])
- v4 := uint32(seed * hashkey[3])
- for s >= 16 {
- v1 ^= readUnaligned32(p)
- v1 = rotl_15(v1*m1) * m2
- p = add(p, 4)
- v2 ^= readUnaligned32(p)
- v2 = rotl_15(v2*m2) * m3
- p = add(p, 4)
- v3 ^= readUnaligned32(p)
- v3 = rotl_15(v3*m3) * m4
- p = add(p, 4)
- v4 ^= readUnaligned32(p)
- v4 = rotl_15(v4*m4) * m1
- p = add(p, 4)
- s -= 16
- }
- h = v1 ^ v2 ^ v3 ^ v4
- goto tail
+ a, b := mix32(uint32(seed), uint32(s^hashkey[0]))
+ if s == 0 {
+ return uintptr(a ^ b)
}
- h ^= h >> 17
- h *= m3
- h ^= h >> 13
- h *= m4
- h ^= h >> 16
- return uintptr(h)
-}
-
-func memhash32(p unsafe.Pointer, seed uintptr) uintptr {
- h := uint32(seed + 4*hashkey[0])
- h ^= readUnaligned32(p)
- h = rotl_15(h*m1) * m2
- h ^= h >> 17
- h *= m3
- h ^= h >> 13
- h *= m4
- h ^= h >> 16
- return uintptr(h)
-}
-
-func memhash64(p unsafe.Pointer, seed uintptr) uintptr {
- h := uint32(seed + 8*hashkey[0])
- h ^= readUnaligned32(p)
- h = rotl_15(h*m1) * m2
- h ^= readUnaligned32(add(p, 4))
- h = rotl_15(h*m1) * m2
- h ^= h >> 17
- h *= m3
- h ^= h >> 13
- h *= m4
- h ^= h >> 16
- return uintptr(h)
+ for ; s > 8; s -= 8 {
+ a ^= readUnaligned32(p)
+ b ^= readUnaligned32(add(p, 4))
+ a, b = mix32(a, b)
+ p = add(p, 8)
+ }
+ if s >= 4 {
+ a ^= readUnaligned32(p)
+ b ^= readUnaligned32(add(p, s-4))
+ } else {
+ t := uint32(*(*byte)(p))
+ t |= uint32(*(*byte)(add(p, s>>1))) << 8
+ t |= uint32(*(*byte)(add(p, s-1))) << 16
+ b ^= t
+ }
+ a, b = mix32(a, b)
+ a, b = mix32(a, b)
+ return uintptr(a ^ b)
}
-// Note: in order to get the compiler to issue rotl instructions, we
-// need to constant fold the shift amount by hand.
-// TODO: convince the compiler to issue rotl instructions after inlining.
-func rotl_15(x uint32) uint32 {
- return (x << 15) | (x >> (32 - 15))
+func mix32(a, b uint32) (uint32, uint32) {
+ c := uint64(a^uint32(hashkey[1])) * uint64(b^uint32(hashkey[2]))
+ return uint32(c), uint32(c >> 32)
}
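The rewritten 32-bit memhash folds all state through mix32: two keyed 32-bit lanes are multiplied into a 64-bit product whose halves become the next lanes. A standalone sketch of that folding step, with hypothetical keys standing in for hashkey[1] and hashkey[2]:

package main

import "fmt"

// mix32 multiplies two keyed 32-bit lanes into a 64-bit product and
// returns its low and high halves, mirroring the helper above.
func mix32(a, b, k1, k2 uint32) (uint32, uint32) {
    c := uint64(a^k1) * uint64(b^k2)
    return uint32(c), uint32(c >> 32)
}

func main() {
    // Hypothetical keys; the runtime uses its per-process hashkey array.
    const k1, k2 = 0x9e3779b9, 0x85ebca6b
    a, b := mix32(0x12345678, 4, k1, k2) // seed lane and length lane
    a, b = mix32(a, b, k1, k2)           // extra round for diffusion
    fmt.Printf("%#x\n", a^b)
}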
diff --git a/libgo/go/runtime/hash64.go b/libgo/go/runtime/hash64.go
index 704bbe6f62b..4b32d515c4b 100644
--- a/libgo/go/runtime/hash64.go
+++ b/libgo/go/runtime/hash64.go
@@ -3,113 +3,98 @@
// license that can be found in the LICENSE file.
// Hashing algorithm inspired by
-// xxhash: https://code.google.com/p/xxhash/
-// cityhash: https://code.google.com/p/cityhash/
+// wyhash: https://github.com/wangyi-fudan/wyhash
+//go:build amd64 || arm64 || mips64 || mips64le || ppc64 || ppc64le || riscv64 || s390x || wasm || alpha || amd64p32 || arm64be || ia64 || mips64p32 || mips64p32le || sparc64
// +build amd64 arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x wasm alpha amd64p32 arm64be ia64 mips64p32 mips64p32le sparc64
package runtime
-import "unsafe"
+import (
+ "runtime/internal/math"
+ "unsafe"
+)
// For gccgo, use go:linkname to export compiler-called functions.
//
//go:linkname memhash
const (
- // Constants for multiplication: four random odd 64-bit numbers.
- m1 = 16877499708836156737
- m2 = 2820277070424839065
- m3 = 9497967016996688599
- m4 = 15839092249703872147
+ m1 = 0xa0761d6478bd642f
+ m2 = 0xe7037ed1a0b428db
+ m3 = 0x8ebc6af09c88c6e3
+ m4 = 0x589965cc75374cc3
+ m5 = 0x1d8e4e27c47d124f
)
func memhash(p unsafe.Pointer, seed, s uintptr) uintptr {
if (GOARCH == "amd64" || GOARCH == "arm64") && useAeshash {
return aeshash(p, seed, s)
}
- h := uint64(seed + s*hashkey[0])
-tail:
+ var a, b uintptr
+ seed ^= hashkey[0] ^ m1
switch {
case s == 0:
+ return seed
case s < 4:
- h ^= uint64(*(*byte)(p))
- h ^= uint64(*(*byte)(add(p, s>>1))) << 8
- h ^= uint64(*(*byte)(add(p, s-1))) << 16
- h = rotl_31(h*m1) * m2
- case s <= 8:
- h ^= uint64(readUnaligned32(p))
- h ^= uint64(readUnaligned32(add(p, s-4))) << 32
- h = rotl_31(h*m1) * m2
+ a = uintptr(*(*byte)(p))
+ a |= uintptr(*(*byte)(add(p, s>>1))) << 8
+ a |= uintptr(*(*byte)(add(p, s-1))) << 16
+ case s == 4:
+ a = r4(p)
+ b = a
+ case s < 8:
+ a = r4(p)
+ b = r4(add(p, s-4))
+ case s == 8:
+ a = r8(p)
+ b = a
case s <= 16:
- h ^= readUnaligned64(p)
- h = rotl_31(h*m1) * m2
- h ^= readUnaligned64(add(p, s-8))
- h = rotl_31(h*m1) * m2
- case s <= 32:
- h ^= readUnaligned64(p)
- h = rotl_31(h*m1) * m2
- h ^= readUnaligned64(add(p, 8))
- h = rotl_31(h*m1) * m2
- h ^= readUnaligned64(add(p, s-16))
- h = rotl_31(h*m1) * m2
- h ^= readUnaligned64(add(p, s-8))
- h = rotl_31(h*m1) * m2
+ a = r8(p)
+ b = r8(add(p, s-8))
default:
- v1 := h
- v2 := uint64(seed * hashkey[1])
- v3 := uint64(seed * hashkey[2])
- v4 := uint64(seed * hashkey[3])
- for s >= 32 {
- v1 ^= readUnaligned64(p)
- v1 = rotl_31(v1*m1) * m2
- p = add(p, 8)
- v2 ^= readUnaligned64(p)
- v2 = rotl_31(v2*m2) * m3
- p = add(p, 8)
- v3 ^= readUnaligned64(p)
- v3 = rotl_31(v3*m3) * m4
- p = add(p, 8)
- v4 ^= readUnaligned64(p)
- v4 = rotl_31(v4*m4) * m1
- p = add(p, 8)
- s -= 32
+ l := s
+ if l > 48 {
+ seed1 := seed
+ seed2 := seed
+ for ; l > 48; l -= 48 {
+ seed = mix(r8(p)^m2, r8(add(p, 8))^seed)
+ seed1 = mix(r8(add(p, 16))^m3, r8(add(p, 24))^seed1)
+ seed2 = mix(r8(add(p, 32))^m4, r8(add(p, 40))^seed2)
+ p = add(p, 48)
+ }
+ seed ^= seed1 ^ seed2
+ }
+ for ; l > 16; l -= 16 {
+ seed = mix(r8(p)^m2, r8(add(p, 8))^seed)
+ p = add(p, 16)
}
- h = v1 ^ v2 ^ v3 ^ v4
- goto tail
+ a = r8(add(p, l-16))
+ b = r8(add(p, l-8))
}
- h ^= h >> 29
- h *= m3
- h ^= h >> 32
- return uintptr(h)
+ return mix(m5^s, mix(a^m2, b^seed))
}
func memhash32(p unsafe.Pointer, seed uintptr) uintptr {
- h := uint64(seed + 4*hashkey[0])
- v := uint64(readUnaligned32(p))
- h ^= v
- h ^= v << 32
- h = rotl_31(h*m1) * m2
- h ^= h >> 29
- h *= m3
- h ^= h >> 32
- return uintptr(h)
+ a := r4(p)
+ return mix(m5^4, mix(a^m2, a^seed^hashkey[0]^m1))
}
func memhash64(p unsafe.Pointer, seed uintptr) uintptr {
- h := uint64(seed + 8*hashkey[0])
- h ^= uint64(readUnaligned32(p)) | uint64(readUnaligned32(add(p, 4)))<<32
- h = rotl_31(h*m1) * m2
- h ^= h >> 29
- h *= m3
- h ^= h >> 32
- return uintptr(h)
+ a := r8(p)
+ return mix(m5^8, mix(a^m2, a^seed^hashkey[0]^m1))
+}
+
+func mix(a, b uintptr) uintptr {
+ hi, lo := math.Mul64(uint64(a), uint64(b))
+ return uintptr(hi ^ lo)
+}
+
+func r4(p unsafe.Pointer) uintptr {
+ return uintptr(readUnaligned32(p))
}
-// Note: in order to get the compiler to issue rotl instructions, we
-// need to constant fold the shift amount by hand.
-// TODO: convince the compiler to issue rotl instructions after inlining.
-func rotl_31(x uint64) uint64 {
- return (x << 31) | (x >> (64 - 31))
+func r8(p unsafe.Pointer) uintptr {
+ return uintptr(readUnaligned64(p))
}
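The 64-bit path drives everything through mix, a full 64x64->128-bit multiply whose halves are XORed together; the runtime keeps its own Mul64 copy in runtime/internal/math rather than importing math/bits. A standalone sketch of the same folding step using the standard library, with a made-up input word:

package main

import (
    "fmt"
    "math/bits"
)

// mix mirrors the helper above: multiply the two words and fold the
// 128-bit product down to 64 bits by XORing its halves.
func mix(a, b uint64) uint64 {
    hi, lo := bits.Mul64(a, b)
    return hi ^ lo
}

func main() {
    const m2 = 0xe7037ed1a0b428db // multiplication constant from the hunk above
    const m5 = 0x1d8e4e27c47d124f
    seed := uint64(0x1234)
    a := uint64(0xdeadbeef) // stands in for an 8-byte read of the input
    fmt.Printf("%#x\n", mix(m5^8, mix(a^m2, a^seed)))
}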
diff --git a/libgo/go/runtime/hash_test.go b/libgo/go/runtime/hash_test.go
index 60a86015f64..753b129d82c 100644
--- a/libgo/go/runtime/hash_test.go
+++ b/libgo/go/runtime/hash_test.go
@@ -8,7 +8,6 @@ import (
"fmt"
"math"
"math/rand"
- "reflect"
. "runtime"
"strings"
"testing"
@@ -49,54 +48,6 @@ func TestMemHash64Equality(t *testing.T) {
}
}
-func TestCompilerVsRuntimeHash(t *testing.T) {
- // Test to make sure the compiler's hash function and the runtime's hash function agree.
- // See issue 37716.
- for _, m := range []interface{}{
- map[bool]int{},
- map[int8]int{},
- map[uint8]int{},
- map[int16]int{},
- map[uint16]int{},
- map[int32]int{},
- map[uint32]int{},
- map[int64]int{},
- map[uint64]int{},
- map[int]int{},
- map[uint]int{},
- map[uintptr]int{},
- map[*byte]int{},
- map[chan int]int{},
- map[unsafe.Pointer]int{},
- map[float32]int{},
- map[float64]int{},
- map[complex64]int{},
- map[complex128]int{},
- map[string]int{},
- //map[interface{}]int{},
- //map[interface{F()}]int{},
- map[[8]uint64]int{},
- map[[8]string]int{},
- map[struct{ a, b, c, d int32 }]int{}, // Note: tests AMEM128
- map[struct{ a, b, _, d int32 }]int{},
- map[struct {
- a, b int32
- c float32
- d, e [8]byte
- }]int{},
- map[struct {
- a int16
- b int64
- }]int{},
- } {
- k := reflect.New(reflect.TypeOf(m).Key()).Elem().Interface() // the zero key
- x, y := MapHashCheck(m, k)
- if x != y {
- t.Errorf("hashes did not match (%x vs %x) for map %T", x, y, m)
- }
- }
-}
-
// Smhasher is a torture test for hash functions.
// https://code.google.com/p/smhasher/
// This code is a port of some of the Smhasher tests to Go.
diff --git a/libgo/go/runtime/heapdump.go b/libgo/go/runtime/heapdump.go
index f3f7382b242..171acb8697c 100644
--- a/libgo/go/runtime/heapdump.go
+++ b/libgo/go/runtime/heapdump.go
@@ -294,9 +294,10 @@ func dumpgoroutine(gp *g) {
}
func dumpgs() {
+ assertWorldStopped()
+
// goroutines & stacks
- for i := 0; uintptr(i) < allglen; i++ {
- gp := allgs[i]
+ forEachG(func(gp *g) {
status := readgstatus(gp) // The world is stopped so gp will not be in a scan state.
switch status {
default:
@@ -309,7 +310,7 @@ func dumpgs() {
_Gwaiting:
dumpgoroutine(gp)
}
- }
+ })
}
func finq_callback(fn *funcval, obj unsafe.Pointer, ft *functype, ot *ptrtype) {
@@ -409,7 +410,7 @@ func dumpparams() {
dumpint(uint64(arenaStart))
dumpint(uint64(arenaEnd))
dumpstr(sys.GOARCH)
- dumpstr(sys.Goexperiment)
+ dumpstr(buildVersion)
dumpint(uint64(ncpu))
}
diff --git a/libgo/go/runtime/histogram.go b/libgo/go/runtime/histogram.go
index da4910d341c..0cccbcca162 100644
--- a/libgo/go/runtime/histogram.go
+++ b/libgo/go/runtime/histogram.go
@@ -81,6 +81,10 @@ type timeHistogram struct {
}
// record adds the given duration to the distribution.
+//
+// Disallow preemptions and stack growths because this function
+// may run in sensitive locations.
+//go:nosplit
func (h *timeHistogram) record(duration int64) {
if duration < 0 {
atomic.Xadd64(&h.underflow, 1)
diff --git a/libgo/go/runtime/internal/math/math.go b/libgo/go/runtime/internal/math/math.go
index 5385f5dd868..b6bd12d3e8b 100644
--- a/libgo/go/runtime/internal/math/math.go
+++ b/libgo/go/runtime/internal/math/math.go
@@ -17,3 +17,24 @@ func MulUintptr(a, b uintptr) (uintptr, bool) {
overflow := b > MaxUintptr/a
return a * b, overflow
}
+
+// Mul64 returns the 128-bit product of x and y: (hi, lo) = x * y
+// with the product bits' upper half returned in hi and the lower
+// half returned in lo.
+// This is a copy of math/bits.Mul64.
+// On supported platforms this is an intrinsic lowered by the compiler.
+func Mul64(x, y uint64) (hi, lo uint64) {
+ const mask32 = 1<<32 - 1
+ x0 := x & mask32
+ x1 := x >> 32
+ y0 := y & mask32
+ y1 := y >> 32
+ w0 := x0 * y0
+ t := x1*y0 + w0>>32
+ w1 := t & mask32
+ w2 := t >> 32
+ w1 += x0 * y1
+ hi = x1*y1 + w2 + w1>>32
+ lo = x * y
+ return
+}
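Mul64 above is the schoolbook 64x64->128 multiply: split each operand into 32-bit halves, form the partial products, and propagate the carries. A quick standalone check of that decomposition against math/bits.Mul64:

package main

import (
    "fmt"
    "math/bits"
)

// mul64 repeats the split-and-carry scheme from the hunk above:
// x = x1<<32 + x0, y = y1<<32 + y0, then combine the partial products.
func mul64(x, y uint64) (hi, lo uint64) {
    const mask32 = 1<<32 - 1
    x0, x1 := x&mask32, x>>32
    y0, y1 := y&mask32, y>>32
    w0 := x0 * y0
    t := x1*y0 + w0>>32
    w1, w2 := t&mask32, t>>32
    w1 += x0 * y1
    hi = x1*y1 + w2 + w1>>32
    lo = x * y
    return
}

func main() {
    x, y := uint64(0xdeadbeefcafebabe), uint64(0x0123456789abcdef)
    hi, lo := mul64(x, y)
    bhi, blo := bits.Mul64(x, y)
    fmt.Println(hi == bhi && lo == blo) // true
}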
diff --git a/libgo/go/runtime/internal/sys/arch.go b/libgo/go/runtime/internal/sys/arch.go
new file mode 100644
index 00000000000..7e09c10ca73
--- /dev/null
+++ b/libgo/go/runtime/internal/sys/arch.go
@@ -0,0 +1,41 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sys
+
+type ArchFamilyType int
+
+// PtrSize is the size of a pointer in bytes - unsafe.Sizeof(uintptr(0)) but as an ideal constant.
+// It is also the size of the machine's native word size (that is, 4 on 32-bit systems, 8 on 64-bit).
+const PtrSize = 4 << (^uintptr(0) >> 63)
+
+// AIX requires a larger stack for syscalls.
+const StackGuardMultiplier = StackGuardMultiplierDefault*(1-GoosAix) + 2*GoosAix
+
+// ArchFamily is the architecture family (AMD64, ARM, ...)
+const ArchFamily ArchFamilyType = _ArchFamily
+
+// BigEndian reports whether the architecture is big-endian.
+const BigEndian = _BigEndian
+
+// DefaultPhysPageSize is the default physical page size.
+const DefaultPhysPageSize = _DefaultPhysPageSize
+
+// PCQuantum is the minimal unit for a program counter (1 on x86, 4 on most other systems).
+// The various PC tables record PC deltas pre-divided by PCQuantum.
+const PCQuantum = _PCQuantum
+
+// Int64Align is the required alignment for a 64-bit integer (4 on 32-bit systems, 8 on 64-bit).
+const Int64Align = _Int64Align
+
+// MinFrameSize is the size of the system-reserved words at the bottom
+// of a frame (just above the architectural stack pointer).
+// It is zero on x86 and PtrSize on most non-x86 (LR-based) systems.
+// On PowerPC it is larger, to cover three more reserved words:
+// the compiler word, the link editor word, and the TOC save word.
+const MinFrameSize = _MinFrameSize
+
+// StackAlign is the required alignment of the SP register.
+// The stack must be at least word aligned, but some architectures require more.
+const StackAlign = _StackAlign
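The PtrSize expression relies on ^uintptr(0) being all ones: shifting it right by 63 leaves 1 on 64-bit targets and 0 on 32-bit ones, so the constant evaluates to 8 or 4 respectively. A standalone illustration:

package main

import (
    "fmt"
    "unsafe"
)

// ptrSize mirrors the constant above: 4<<1 == 8 on 64-bit targets,
// 4<<0 == 4 on 32-bit targets.
const ptrSize = 4 << (^uintptr(0) >> 63)

func main() {
    fmt.Println(ptrSize == int(unsafe.Sizeof(uintptr(0)))) // true either way
}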
diff --git a/libgo/go/runtime/internal/sys/stubs.go b/libgo/go/runtime/internal/sys/stubs.go
deleted file mode 100644
index 53280232682..00000000000
--- a/libgo/go/runtime/internal/sys/stubs.go
+++ /dev/null
@@ -1,13 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package sys
-
-// Declarations for runtime services implemented in C or assembly.
-
-const PtrSize = 4 << (^uintptr(0) >> 63) // unsafe.Sizeof(uintptr(0)) but an ideal const
-const RegSize = 4 << (^Uintreg(0) >> 63) // unsafe.Sizeof(uintreg(0)) but an ideal const
-const SpAlign = 1*(1-GoarchArm64) + 16*GoarchArm64 // SP alignment: 1 normally, 16 for ARM64
-
-var DefaultGoroot string // set at link time
diff --git a/libgo/go/runtime/lfstack_32bit.go b/libgo/go/runtime/lfstack_32bit.go
index b3194dc7668..c0eb665dd20 100644
--- a/libgo/go/runtime/lfstack_32bit.go
+++ b/libgo/go/runtime/lfstack_32bit.go
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build 386 amd64p32 arm armbe m68k mips mipsle mips64p32 mips64p32le nios2 ppc riscv s390 sh shbe sparc
+//go:build 386 || arm || mips || mipsle || amd64p32 || armbe || m68k || mips64p32 || mips64p32le || nios2 || ppc || riscv || s390 || sh || shbe || sparc
+// +build 386 arm mips mipsle amd64p32 armbe m68k mips64p32 mips64p32le nios2 ppc riscv s390 sh shbe sparc
package runtime
diff --git a/libgo/go/runtime/lfstack_64bit.go b/libgo/go/runtime/lfstack_64bit.go
index af9e7d164b9..44e6812096d 100644
--- a/libgo/go/runtime/lfstack_64bit.go
+++ b/libgo/go/runtime/lfstack_64bit.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build amd64 || arm64 || mips64 || mips64le || ppc64 || ppc64le || riscv64 || s390x || wasm || arm64be || alpha || sparc64 || ia64
// +build amd64 arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x wasm arm64be alpha sparc64 ia64
package runtime
diff --git a/libgo/go/runtime/libfuzzer.go b/libgo/go/runtime/libfuzzer.go
index 0161955f09f..578bce04149 100644
--- a/libgo/go/runtime/libfuzzer.go
+++ b/libgo/go/runtime/libfuzzer.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build libfuzzer
// +build libfuzzer
package runtime
diff --git a/libgo/go/runtime/lock_futex.go b/libgo/go/runtime/lock_futex.go
index 21d7e0dd943..74f6428e169 100644
--- a/libgo/go/runtime/lock_futex.go
+++ b/libgo/go/runtime/lock_futex.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build dragonfly || freebsd || linux
// +build dragonfly freebsd linux
package runtime
@@ -249,7 +250,7 @@ func notetsleepg(n *note, ns int64) bool {
return ok
}
-func beforeIdle(int64) (*g, bool) {
+func beforeIdle(int64, int64) (*g, bool) {
return nil, false
}
diff --git a/libgo/go/runtime/lock_js.go b/libgo/go/runtime/lock_js.go
index 14bdc76842c..0ca3512baf3 100644
--- a/libgo/go/runtime/lock_js.go
+++ b/libgo/go/runtime/lock_js.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build js && wasm
// +build js,wasm
package runtime
@@ -175,7 +176,12 @@ var idleID int32
// If an event handler returned, we resume it and it will pause the execution.
// beforeIdle either returns the specific goroutine to schedule next or
// indicates with otherReady that some goroutine became ready.
-func beforeIdle(delay int64) (gp *g, otherReady bool) {
+func beforeIdle(now, pollUntil int64) (gp *g, otherReady bool) {
+ delay := int64(-1)
+ if pollUntil != 0 {
+ delay = pollUntil - now
+ }
+
if delay > 0 {
clearIdleID()
if delay < 1e6 {
diff --git a/libgo/go/runtime/lock_sema.go b/libgo/go/runtime/lock_sema.go
index fe173843797..bcd8a86a3bf 100644
--- a/libgo/go/runtime/lock_sema.go
+++ b/libgo/go/runtime/lock_sema.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || hurd || netbsd || openbsd || plan9 || solaris || windows
// +build aix darwin hurd netbsd openbsd plan9 solaris windows
package runtime
@@ -308,7 +309,7 @@ func notetsleepg(n *note, ns int64) bool {
return ok
}
-func beforeIdle(int64) (*g, bool) {
+func beforeIdle(int64, int64) (*g, bool) {
return nil, false
}
diff --git a/libgo/go/runtime/lockrank.go b/libgo/go/runtime/lockrank.go
index b3c01ba1045..dde9f7c21a7 100644
--- a/libgo/go/runtime/lockrank.go
+++ b/libgo/go/runtime/lockrank.go
@@ -44,7 +44,6 @@ const (
lockRankPollDesc
lockRankSched
lockRankDeadlock
- lockRankPanic
lockRankAllg
lockRankAllp
@@ -92,6 +91,7 @@ const (
// rank, we don't allow any further locks to be acquired other than more
// hchan locks.
lockRankHchanLeaf
+ lockRankPanic
// Leaf locks with no dependencies, so these constants are not actually used anywhere.
// There are other architecture-dependent leaf locks as well.
@@ -123,7 +123,6 @@ var lockNames = []string{
lockRankPollDesc: "pollDesc",
lockRankSched: "sched",
lockRankDeadlock: "deadlock",
- lockRankPanic: "panic",
lockRankAllg: "allg",
lockRankAllp: "allp",
@@ -162,6 +161,7 @@ var lockNames = []string{
lockRankGFree: "gFree",
lockRankHchanLeaf: "hchanLeaf",
+ lockRankPanic: "panic",
lockRankNewmHandoff: "newmHandoff.lock",
lockRankDebugPtrmask: "debugPtrmask.lock",
@@ -202,10 +202,9 @@ var lockPartialOrder [][]lockRank = [][]lockRank{
lockRankPollDesc: {},
lockRankSched: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc},
lockRankDeadlock: {lockRankDeadlock},
- lockRankPanic: {lockRankDeadlock},
- lockRankAllg: {lockRankSysmon, lockRankSched, lockRankPanic},
+ lockRankAllg: {lockRankSysmon, lockRankSched},
lockRankAllp: {lockRankSysmon, lockRankSched},
- lockRankTimers: {lockRankSysmon, lockRankScavenge, lockRankSched, lockRankAllp, lockRankPollDesc, lockRankTimers},
+ lockRankTimers: {lockRankSysmon, lockRankScavenge, lockRankPollDesc, lockRankSched, lockRankAllp, lockRankTimers},
lockRankItab: {},
lockRankReflectOffs: {lockRankItab},
lockRankHchan: {lockRankScavenge, lockRankSweep, lockRankHchan},
@@ -214,29 +213,30 @@ var lockPartialOrder [][]lockRank = [][]lockRank{
lockRankTraceBuf: {lockRankSysmon, lockRankScavenge},
lockRankTraceStrings: {lockRankTraceBuf},
lockRankMspanSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings},
- lockRankProf: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
- lockRankGcBitsArenas: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
+ lockRankProf: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings},
+ lockRankGcBitsArenas: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings},
lockRankRoot: {},
- lockRankTrace: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankSched, lockRankHchan, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankSweep},
- lockRankTraceStackTab: {lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankTimers, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankTrace},
+ lockRankTrace: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankHchan, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot},
+ lockRankTraceStackTab: {lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankTimers, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankTrace},
lockRankNetpollInit: {lockRankTimers},
lockRankRwmutexW: {},
lockRankRwmutexR: {lockRankSysmon, lockRankRwmutexW},
- lockRankSpanSetSpine: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
- lockRankGscan: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankNotifyList, lockRankProf, lockRankGcBitsArenas, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankSpanSetSpine},
- lockRankStackpool: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankRwmutexR, lockRankSpanSetSpine, lockRankGscan},
+ lockRankSpanSetSpine: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings},
+ lockRankGscan: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankSpanSetSpine},
+ lockRankStackpool: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankRwmutexR, lockRankSpanSetSpine, lockRankGscan},
lockRankStackLarge: {lockRankSysmon, lockRankAssistQueue, lockRankSched, lockRankItab, lockRankHchan, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan},
lockRankDefer: {},
- lockRankSudog: {lockRankNotifyList, lockRankHchan},
- lockRankWbufSpans: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankRoot, lockRankGscan, lockRankDefer, lockRankSudog},
- lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankFin, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans, lockRankSpanSetSpine},
- lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
+ lockRankSudog: {lockRankHchan, lockRankNotifyList},
+ lockRankWbufSpans: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankRoot, lockRankGscan, lockRankDefer, lockRankSudog},
+ lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans},
+ lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings},
lockRankGlobalAlloc: {lockRankProf, lockRankSpanSetSpine, lockRankMheap, lockRankMheapSpecial},
lockRankGFree: {lockRankSched},
lockRankHchanLeaf: {lockRankGscan, lockRankHchanLeaf},
+ lockRankPanic: {lockRankDeadlock}, // plus any other lock held on throw.
lockRankNewmHandoff: {},
lockRankDebugPtrmask: {},
diff --git a/libgo/go/runtime/lockrank_off.go b/libgo/go/runtime/lockrank_off.go
index 7dcd8f5fe96..f3d2c009145 100644
--- a/libgo/go/runtime/lockrank_off.go
+++ b/libgo/go/runtime/lockrank_off.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !goexperiment.staticlockranking
// +build !goexperiment.staticlockranking
package runtime
diff --git a/libgo/go/runtime/lockrank_on.go b/libgo/go/runtime/lockrank_on.go
index 88ac95a0047..fc8d2dc8d11 100644
--- a/libgo/go/runtime/lockrank_on.go
+++ b/libgo/go/runtime/lockrank_on.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build goexperiment.staticlockranking
// +build goexperiment.staticlockranking
package runtime
@@ -24,19 +25,6 @@ type lockRankStruct struct {
pad int
}
-// init checks that the partial order in lockPartialOrder fits within the total
-// order determined by the order of the lockRank constants.
-func init() {
- for rank, list := range lockPartialOrder {
- for _, entry := range list {
- if entry > lockRank(rank) {
- println("lockPartial order row", lockRank(rank).String(), "entry", entry.String())
- throw("lockPartialOrder table is inconsistent with total lock ranking order")
- }
- }
- }
-}
-
func lockInit(l *mutex, rank lockRank) {
l.rank = rank
}
@@ -64,12 +52,11 @@ func lockWithRank(l *mutex, rank lockRank) {
// rank recording for it, since print/println are used when
// printing out a lock ordering problem below.
//
- // paniclk has an ordering problem, since it can be acquired
- // during a panic with any other locks held (especially if the
- // panic is because of a directed segv), and yet also allg is
- // acquired after paniclk in tracebackothers()). This is a genuine
- // problem, so for now we don't do lock rank recording for paniclk
- // either.
+ // paniclk is only used for fatal throw/panic. Don't do lock
+ // ranking recording for it, since we throw after reporting a
+ // lock ordering problem. Additionally, paniclk may be taken
+ // after effectively any lock (anywhere we might panic), which
+ // the partial order doesn't cover.
lock2(l)
return
}
@@ -219,7 +206,7 @@ func releaseLockRank(rank lockRank) {
func lockWithRankMayAcquire(l *mutex, rank lockRank) {
gp := getg()
if gp.m.locksHeldLen == 0 {
- // No possibilty of lock ordering problem if no other locks held
+ // No possibility of lock ordering problem if no other locks held
return
}
diff --git a/libgo/go/runtime/lockrank_test.go b/libgo/go/runtime/lockrank_test.go
new file mode 100644
index 00000000000..4b2fc0eaeea
--- /dev/null
+++ b/libgo/go/runtime/lockrank_test.go
@@ -0,0 +1,41 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+ . "runtime"
+ "testing"
+)
+
+// Check that the partial order in lockPartialOrder fits within the total order
+// determined by the order of the lockRank constants.
+func TestLockRankPartialOrder(t *testing.T) {
+ for r, list := range LockPartialOrder {
+ rank := LockRank(r)
+ for _, e := range list {
+ entry := LockRank(e)
+ if entry > rank {
+ t.Errorf("lockPartialOrder row %v entry %v is inconsistent with total lock ranking order", rank, entry)
+ }
+ }
+ }
+}
+
+// Verify that partial order lists are kept sorted. This is a purely cosmetic
+// check to make manual reviews simpler. It does not affect correctness, unlike
+// the above test.
+func TestLockRankPartialOrderSortedEntries(t *testing.T) {
+ for r, list := range LockPartialOrder {
+ rank := LockRank(r)
+ var prev LockRank
+ for _, e := range list {
+ entry := LockRank(e)
+ if entry <= prev {
+ t.Errorf("Partial order for rank %v out of order: %v <= %v in %v", rank, entry, prev, list)
+ }
+ prev = entry
+ }
+ }
+}
diff --git a/libgo/go/runtime/malloc.go b/libgo/go/runtime/malloc.go
index d3d88a243f4..7005e53a2f3 100644
--- a/libgo/go/runtime/malloc.go
+++ b/libgo/go/runtime/malloc.go
@@ -237,6 +237,7 @@ const (
// -------------- --------- ---------- ---------- -----------
// */64-bit 48 64MB 1 4M (32MB)
// windows/64-bit 48 4MB 64 1M (8MB)
+ // ios/arm64 33 4MB 1 2048 (8KB)
// */32-bit 32 4MB 1 1024 (4KB)
// */mips(le) 31 4MB 1 512 (2KB)
@@ -257,7 +258,7 @@ const (
// logHeapArenaBytes is log_2 of heapArenaBytes. For clarity,
// prefer using heapArenaBytes where possible (we need the
// constant to compute some other constants).
- logHeapArenaBytes = (6+20)*(_64bit*(1-sys.GoosWindows)*(1-sys.GoarchWasm)) + (2+20)*(_64bit*sys.GoosWindows) + (2+20)*(1-_64bit) + (2+20)*sys.GoarchWasm
+ logHeapArenaBytes = (6+20)*(_64bit*(1-sys.GoosWindows)*(1-sys.GoarchWasm)*(1-sys.GoosIos*sys.GoarchArm64)) + (2+20)*(_64bit*sys.GoosWindows) + (2+20)*(1-_64bit) + (2+20)*sys.GoarchWasm + (2+20)*sys.GoosIos*sys.GoarchArm64
// heapArenaBitmapBytes is the size of each heap arena's bitmap.
heapArenaBitmapBytes = heapArenaBytes / (sys.PtrSize * 8 / 2)
@@ -305,7 +306,7 @@ const (
// high addresses if viewed as unsigned).
//
// On aix/ppc64, this offset allows to keep the heapAddrBits to
- // 48. Otherwize, it would be 60 in order to handle mmap addresses
+ // 48. Otherwise, it would be 60 in order to handle mmap addresses
// (in range 0x0a00000000000000 - 0x0afffffffffffff). But in this
// case, the memory reserved in (s *pageAlloc).init for chunks
// is causing important slowdowns.
@@ -579,7 +580,7 @@ func mallocinit() {
const arenaMetaSize = (1 << arenaBits) * unsafe.Sizeof(heapArena{})
meta := uintptr(sysReserve(nil, arenaMetaSize))
if meta != 0 {
- mheap_.heapArenaAlloc.init(meta, arenaMetaSize)
+ mheap_.heapArenaAlloc.init(meta, arenaMetaSize, true)
}
// We want to start the arena low, but if we're linked
@@ -616,7 +617,7 @@ func mallocinit() {
for _, arenaSize := range &arenaSizes {
a, size := sysReserveAligned(unsafe.Pointer(p), arenaSize, heapArenaBytes)
if a != nil {
- mheap_.arena.init(uintptr(a), size)
+ mheap_.arena.init(uintptr(a), size, false)
p = mheap_.arena.end // For hint below
break
}
@@ -633,8 +634,8 @@ func mallocinit() {
// heapArenaBytes. sysAlloc returns nil on failure.
// There is no corresponding free function.
//
-// sysAlloc returns a memory region in the Prepared state. This region must
-// be transitioned to Ready before use.
+// sysAlloc returns a memory region in the Reserved state. This region must
+// be transitioned to Prepared and then Ready before use.
//
// h must be locked.
func (h *mheap) sysAlloc(n uintptr) (v unsafe.Pointer, size uintptr) {
@@ -736,9 +737,6 @@ func (h *mheap) sysAlloc(n uintptr) (v unsafe.Pointer, size uintptr) {
throw("misrounded allocation in sysAlloc")
}
- // Transition from Reserved to Prepared.
- sysMap(v, size, &memstats.heap_sys)
-
mapped:
// Create arena metadata.
for ri := arenaIndex(uintptr(v)); ri <= arenaIndex(uintptr(v)+size-1); ri++ {
@@ -947,7 +945,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
}
if inittrace.active && inittrace.id == getg().goid {
- // Init functions are executed sequentially in a single Go routine.
+ // Init functions are executed sequentially in a single goroutine.
inittrace.allocs += 1
}
}
@@ -1003,6 +1001,9 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
var span *mspan
var x unsafe.Pointer
noscan := typ == nil || typ.ptrdata == 0
+ // In some cases block zeroing can profitably (for latency reduction purposes)
+ // be delayed till preemption is possible; isZeroed tracks that state.
+ isZeroed := true
if size <= maxSmallSize {
if noscan && size < maxTinySize {
// Tiny allocator.
@@ -1074,7 +1075,8 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
(*[2]uint64)(x)[1] = 0
// See if we need to replace the existing tiny block with the new one
// based on amount of remaining free space.
- if size < c.tinyoffset || c.tiny == 0 {
+ if !raceenabled && (size < c.tinyoffset || c.tiny == 0) {
+ // Note: disabled when race detector is on, see comment near end of this function.
c.tiny = uintptr(x)
c.tinyoffset = size
}
@@ -1100,7 +1102,9 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
}
} else {
shouldhelpgc = true
- span = c.allocLarge(size, needzero, noscan)
+ // For large allocations, keep track of zeroed state so that
+ // bulk zeroing can happen later in a preemptible context.
+ span, isZeroed = c.allocLarge(size, needzero && !noscan, noscan)
span.freeindex = 1
span.allocCount = 1
x = unsafe.Pointer(span.base())
@@ -1147,30 +1151,34 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
msanmalloc(x, size)
}
+ if rate := MemProfileRate; rate > 0 {
+ // Note cache c only valid while m acquired; see #47302
+ if rate != 1 && size < c.nextSample {
+ c.nextSample -= size
+ } else {
+ profilealloc(mp, x, size)
+ }
+ }
mp.mallocing = 0
releasem(mp)
+ // Pointerfree data can be zeroed late in a context where preemption can occur.
+ // x will keep the memory alive.
+ if !isZeroed && needzero {
+ memclrNoHeapPointersChunked(size, x) // This is a possible preemption point: see #47302
+ }
+
if debug.malloc {
if debug.allocfreetrace != 0 {
tracealloc(x, size, typ)
}
if inittrace.active && inittrace.id == getg().goid {
- // Init functions are executed sequentially in a single Go routine.
+ // Init functions are executed sequentially in a single goroutine.
inittrace.bytes += uint64(size)
}
}
- if rate := MemProfileRate; rate > 0 {
- if rate != 1 && size < c.nextSample {
- c.nextSample -= size
- } else {
- mp := acquirem()
- profilealloc(mp, x, size)
- releasem(mp)
- }
- }
-
if assistG != nil {
// Account for internal fragmentation in the assist
// debt now that we know it.
@@ -1183,6 +1191,22 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
}
}
+ if raceenabled && noscan && dataSize < maxTinySize {
+ // Pad tinysize allocations so they are aligned with the end
+ // of the tinyalloc region. This ensures that any arithmetic
+ // that goes off the top end of the object will be detectable
+ // by checkptr (issue 38872).
+ // Note that we disable tinyalloc when raceenabled for this to work.
+ // TODO: This padding is only performed when the race detector
+ // is enabled. It would be nice to enable it if any package
+ // was compiled with checkptr, but there's no easy way to
+ // detect that (especially at compile time).
+ // TODO: enable this padding for all allocations, not just
+ // tinyalloc ones. It's tricky because of pointer maps.
+ // Maybe just all noscan objects?
+ x = add(x, size-dataSize)
+ }
+
// Check preemption, since unlike gc we don't check on every call.
if getg().preempt {
checkPreempt()
@@ -1195,6 +1219,33 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
return x
}
+// memclrNoHeapPointersChunked repeatedly calls memclrNoHeapPointers
+// on chunks of the buffer to be zeroed, with opportunities for preemption
+// along the way. memclrNoHeapPointers contains no safepoints and also
+// cannot be preemptively scheduled, so this provides a still-efficient
+// block clear that can also be preempted on a reasonable granularity.
+//
+// Use this with care; if the data being cleared is tagged to contain
+// pointers, this allows the GC to run before it is all cleared.
+func memclrNoHeapPointersChunked(size uintptr, x unsafe.Pointer) {
+ v := uintptr(x)
+ // got this from benchmarking. 128k is too small, 512k is too large.
+ const chunkBytes = 256 * 1024
+ vsize := v + size
+ for voff := v; voff < vsize; voff = voff + chunkBytes {
+ if getg().preempt {
+ // may hold locks, e.g., profiling
+ goschedguarded()
+ }
+ // clear min(remaining, chunkBytes) bytes
+ n := vsize - voff
+ if n > chunkBytes {
+ n = chunkBytes
+ }
+ memclrNoHeapPointers(unsafe.Pointer(voff), n)
+ }
+}
+
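The same chunk-and-yield pattern can be sketched in ordinary Go: clear a fixed number of bytes, give the scheduler a chance to run, repeat. This is only an illustration; user code has no preemption flag to poll, so runtime.Gosched stands in for goschedguarded here.

package main

import "runtime"

// clearChunked zeroes buf a fixed-size chunk at a time, yielding between
// chunks so one long clear does not monopolize the thread, mirroring the
// chunking loop above.
func clearChunked(buf []byte) {
    const chunkBytes = 256 * 1024
    for off := 0; off < len(buf); off += chunkBytes {
        end := off + chunkBytes
        if end > len(buf) {
            end = len(buf)
        }
        for i := off; i < end; i++ {
            buf[i] = 0
        }
        runtime.Gosched() // cooperative yield between chunks
    }
}

func main() {
    clearChunked(make([]byte, 1<<20))
}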
// implementation of new builtin
// compiler (both frontend and SSA backend) knows the signature
// of this function
@@ -1425,15 +1476,19 @@ func inPersistentAlloc(p uintptr) bool {
}
// linearAlloc is a simple linear allocator that pre-reserves a region
-// of memory and then maps that region into the Ready state as needed. The
-// caller is responsible for locking.
+// of memory and then optionally maps that region into the Ready state
+// as needed.
+//
+// The caller is responsible for locking.
type linearAlloc struct {
next uintptr // next free byte
mapped uintptr // one byte past end of mapped space
end uintptr // end of reserved space
+
+ mapMemory bool // transition memory from Reserved to Ready if true
}
-func (l *linearAlloc) init(base, size uintptr) {
+func (l *linearAlloc) init(base, size uintptr, mapMemory bool) {
if base+size < base {
// Chop off the last byte. The runtime isn't prepared
// to deal with situations where the bounds could overflow.
@@ -1443,6 +1498,7 @@ func (l *linearAlloc) init(base, size uintptr) {
}
l.next, l.mapped = base, base
l.end = base + size
+ l.mapMemory = mapMemory
}
func (l *linearAlloc) alloc(size, align uintptr, sysStat *sysMemStat) unsafe.Pointer {
@@ -1452,9 +1508,11 @@ func (l *linearAlloc) alloc(size, align uintptr, sysStat *sysMemStat) unsafe.Poi
}
l.next = p + size
if pEnd := alignUp(l.next-1, physPageSize); pEnd > l.mapped {
- // Transition from Reserved to Prepared to Ready.
- sysMap(unsafe.Pointer(l.mapped), pEnd-l.mapped, sysStat)
- sysUsed(unsafe.Pointer(l.mapped), pEnd-l.mapped)
+ if l.mapMemory {
+ // Transition from Reserved to Prepared to Ready.
+ sysMap(unsafe.Pointer(l.mapped), pEnd-l.mapped, sysStat)
+ sysUsed(unsafe.Pointer(l.mapped), pEnd-l.mapped)
+ }
l.mapped = pEnd
}
return unsafe.Pointer(p)
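linearAlloc itself is just a bump allocator over a pre-reserved address range, with the new mapMemory flag deciding whether pages are also transitioned to Ready. A user-space sketch of the bump-pointer bookkeeping (over a byte slice, with no sysMap/sysUsed step):

package main

import "fmt"

// bumpAlloc hands out aligned regions from a fixed buffer, mirroring
// linearAlloc's next/end bookkeeping without the OS mapping.
type bumpAlloc struct {
    buf  []byte
    next int
}

// alloc rounds next up to align (a power of two) and claims size bytes,
// or returns nil when the reserved space is exhausted.
func (a *bumpAlloc) alloc(size, align int) []byte {
    p := (a.next + align - 1) &^ (align - 1)
    if p+size > len(a.buf) {
        return nil
    }
    a.next = p + size
    return a.buf[p : p+size]
}

func main() {
    a := &bumpAlloc{buf: make([]byte, 1024)}
    fmt.Println(len(a.alloc(100, 8)), len(a.alloc(200, 16)), a.next)
}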
diff --git a/libgo/go/runtime/malloc_test.go b/libgo/go/runtime/malloc_test.go
index 24b8740d05d..7e83255ae45 100644
--- a/libgo/go/runtime/malloc_test.go
+++ b/libgo/go/runtime/malloc_test.go
@@ -156,6 +156,9 @@ func TestStringConcatenationAllocs(t *testing.T) {
}
func TestTinyAlloc(t *testing.T) {
+ if runtime.Raceenabled {
+ t.Skip("tinyalloc suppressed when running in race mode")
+ }
const N = 16
var v [N]unsafe.Pointer
for i := range v {
@@ -184,6 +187,9 @@ type obj12 struct {
}
func TestTinyAllocIssue37262(t *testing.T) {
+ if runtime.Raceenabled {
+ t.Skip("tinyalloc suppressed when running in race mode")
+ }
// Try to cause an alignment access fault
// by atomically accessing the first 64-bit
// value of a tiny-allocated object.
diff --git a/libgo/go/runtime/map.go b/libgo/go/runtime/map.go
index 5b9d7102f43..c96d2c71626 100644
--- a/libgo/go/runtime/map.go
+++ b/libgo/go/runtime/map.go
@@ -128,7 +128,7 @@ func isEmpty(x uint8) bool {
// A header for a Go map.
type hmap struct {
- // Note: the format of the hmap is also encoded in cmd/compile/internal/gc/reflect.go.
+ // Note: the format of the hmap is also encoded in cmd/compile/internal/reflectdata/reflect.go.
// Make sure this stays in sync with the compiler's definition.
count int // # live cells == size of map. Must be first (used by len() builtin)
flags uint8
@@ -174,11 +174,11 @@ type bmap struct {
}
// A hash iteration structure.
-// If you modify hiter, also change cmd/compile/internal/gc/reflect.go to indicate
+// If you modify hiter, also change cmd/compile/internal/reflectdata/reflect.go to indicate
// the layout of this structure.
type hiter struct {
- key unsafe.Pointer // Must be in first position. Write nil to indicate iteration end (see cmd/compile/internal/gc/range.go).
- elem unsafe.Pointer // Must be in second position (see cmd/compile/internal/gc/range.go).
+ key unsafe.Pointer // Must be in first position. Write nil to indicate iteration end (see cmd/compile/internal/walk/range.go).
+ elem unsafe.Pointer // Must be in second position (see cmd/compile/internal/walk/range.go).
t *maptype
h *hmap
buckets unsafe.Pointer // bucket ptr at hash_iter initialization time
@@ -495,13 +495,13 @@ func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool)
}
hash := t.hasher(key, uintptr(h.hash0))
m := bucketMask(h.B)
- b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + (hash&m)*uintptr(t.bucketsize)))
+ b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize)))
if c := h.oldbuckets; c != nil {
if !h.sameSizeGrow() {
// There used to be half as many buckets; mask down one more power of two.
m >>= 1
}
- oldb := (*bmap)(unsafe.Pointer(uintptr(c) + (hash&m)*uintptr(t.bucketsize)))
+ oldb := (*bmap)(add(c, (hash&m)*uintptr(t.bucketsize)))
if !evacuated(oldb) {
b = oldb
}
@@ -544,13 +544,13 @@ func mapaccessK(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, unsafe
}
hash := t.hasher(key, uintptr(h.hash0))
m := bucketMask(h.B)
- b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + (hash&m)*uintptr(t.bucketsize)))
+ b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize)))
if c := h.oldbuckets; c != nil {
if !h.sameSizeGrow() {
// There used to be half as many buckets; mask down one more power of two.
m >>= 1
}
- oldb := (*bmap)(unsafe.Pointer(uintptr(c) + (hash&m)*uintptr(t.bucketsize)))
+ oldb := (*bmap)(add(c, (hash&m)*uintptr(t.bucketsize)))
if !evacuated(oldb) {
b = oldb
}
@@ -863,7 +863,7 @@ func mapiterinit(t *maptype, h *hmap, it *hiter) {
}
if unsafe.Sizeof(hiter{})/sys.PtrSize != 12 {
- throw("hash_iter size incorrect") // see cmd/compile/internal/gc/reflect.go
+ throw("hash_iter size incorrect") // see cmd/compile/internal/reflectdata/reflect.go
}
it.t = t
it.h = h
diff --git a/libgo/go/runtime/map_faststr.go b/libgo/go/runtime/map_faststr.go
index 22059476da5..647978b68cf 100644
--- a/libgo/go/runtime/map_faststr.go
+++ b/libgo/go/runtime/map_faststr.go
@@ -262,6 +262,9 @@ bucketloop:
// already have a mapping for key. Update it.
inserti = i
insertb = b
+ // Overwrite existing key, so it can be garbage collected.
+ // The size is already guaranteed to be set correctly.
+ k.str = key.str
goto done
}
ovf := b.overflow(t)
diff --git a/libgo/go/runtime/mbitmap.go b/libgo/go/runtime/mbitmap.go
index 0eb19d6bc38..72cd9f35a99 100644
--- a/libgo/go/runtime/mbitmap.go
+++ b/libgo/go/runtime/mbitmap.go
@@ -226,16 +226,25 @@ func (s *mspan) isFree(index uintptr) bool {
return *bytep&mask == 0
}
-func (s *mspan) objIndex(p uintptr) uintptr {
- byteOffset := p - s.base()
- if byteOffset == 0 {
- return 0
- }
- if s.baseMask != 0 {
- // s.baseMask is non-0, elemsize is a power of two, so shift by s.divShift
- return byteOffset >> s.divShift
+// divideByElemSize returns n/s.elemsize.
+// n must be within [0, s.npages*_PageSize),
+// or may be exactly s.npages*_PageSize
+// if s.elemsize is from sizeclasses.go.
+func (s *mspan) divideByElemSize(n uintptr) uintptr {
+ const doubleCheck = false
+
+ // See explanation in mksizeclasses.go's computeDivMagic.
+ q := uintptr((uint64(n) * uint64(s.divMul)) >> 32)
+
+ if doubleCheck && q != n/s.elemsize {
+ println(n, "/", s.elemsize, "should be", n/s.elemsize, "but got", q)
+ throw("bad magic division")
}
- return uintptr(((uint64(byteOffset) >> s.divShift) * uint64(s.divMul)) >> s.divShift2)
+ return q
+}
+
+func (s *mspan) objIndex(p uintptr) uintptr {
+ return s.divideByElemSize(p - s.base())
}
func markBitsForAddr(p uintptr) markBits {
@@ -324,6 +333,10 @@ func heapBitsForAddr(addr uintptr) (h heapBits) {
return
}
+// clobberdeadPtr is a special value that is used by the compiler to
+// clobber dead stack slots, when -clobberdead flag is set.
+const clobberdeadPtr = uintptr(0xdeaddead | 0xdeaddead<<((^uintptr(0)>>63)*32))
+
// badPointer throws bad pointer in heap panic.
func badPointer(s *mspan, p, refBase, refOff uintptr) {
// Typically this indicates an incorrect use
@@ -336,13 +349,16 @@ func badPointer(s *mspan, p, refBase, refOff uintptr) {
// in allocated spans.
printlock()
print("runtime: pointer ", hex(p))
- state := s.state.get()
- if state != mSpanInUse {
- print(" to unallocated span")
- } else {
- print(" to unused region of span")
+ if s != nil {
+ state := s.state.get()
+ if state != mSpanInUse {
+ print(" to unallocated span")
+ } else {
+ print(" to unused region of span")
+ }
+ print(" span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", state)
}
- print(" span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", state, "\n")
+ print("\n")
if refBase != 0 {
print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n")
gcDumpObject("object", refBase, refOff)
@@ -373,6 +389,12 @@ func findObject(p, refBase, refOff uintptr, forStack bool) (base uintptr, s *msp
// If s is nil, the virtual address has never been part of the heap.
// This pointer may be to some mmap'd region, so we allow it.
if s == nil {
+ if GOARCH == "amd64" && p == clobberdeadPtr && debug.invalidptr != 0 {
+ // Crash if clobberdeadPtr is seen. Only on AMD64 for now, as
+ // it is the only platform where compiler's clobberdead mode is
+ // implemented. On AMD64 clobberdeadPtr cannot be a valid address.
+ badPointer(s, p, refBase, refOff)
+ }
return
}
// If p is a bad pointer, it may not be in s's bounds.
@@ -402,24 +424,8 @@ func findObject(p, refBase, refOff uintptr, forStack bool) (base uintptr, s *msp
}
}
- // If this span holds object of a power of 2 size, just mask off the bits to
- // the interior of the object. Otherwise use the size to get the base.
- if s.baseMask != 0 {
- // optimize for power of 2 sized objects.
- base = s.base()
- base = base + (p-base)&uintptr(s.baseMask)
- objIndex = (base - s.base()) >> s.divShift
- // base = p & s.baseMask is faster for small spans,
- // but doesn't work for large spans.
- // Overall, it's faster to use the more general computation above.
- } else {
- base = s.base()
- if p-base >= s.elemsize {
- // n := (p - base) / s.elemsize, using division by multiplication
- objIndex = uintptr(p-base) >> s.divShift * uintptr(s.divMul) >> s.divShift2
- base += objIndex * s.elemsize
- }
- }
+ objIndex = s.objIndex(p)
+ base = s.base() + objIndex*s.elemsize
return
}
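divideByElemSize above replaces a hardware divide with a multiply by a precomputed 32-bit reciprocal, q = (n * divMul) >> 32, and findObject then recovers the object base as s.base() + q*elemsize. A standalone illustration of the reciprocal trick for one hypothetical element size (the multiplier is derived here, not taken from the runtime's size-class tables):

package main

import "fmt"

func main() {
    const elemSize = 48 // hypothetical size class
    // ceil(2^32 / elemSize); exact for the small offset range a span covers.
    divMul := uint64(^uint32(0))/elemSize + 1

    for _, n := range []uint64{0, 47, 48, 96, 1 << 20} {
        q := (n * divMul) >> 32 // offset / elemSize without a divide
        fmt.Println(n, q, q == n/elemSize)
    }
}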
diff --git a/libgo/go/runtime/mcache.go b/libgo/go/runtime/mcache.go
index 32622e6d4cb..531f3be3720 100644
--- a/libgo/go/runtime/mcache.go
+++ b/libgo/go/runtime/mcache.go
@@ -168,29 +168,29 @@ func (c *mcache) refill(spc spanClass) {
// mcache. If it gets uncached, we'll adjust this.
stats := memstats.heapStats.acquire()
atomic.Xadduintptr(&stats.smallAllocCount[spc.sizeclass()], uintptr(s.nelems)-uintptr(s.allocCount))
- memstats.heapStats.release()
-
- // Update heap_live with the same assumption.
- usedBytes := uintptr(s.allocCount) * s.elemsize
- atomic.Xadd64(&memstats.heap_live, int64(s.npages*pageSize)-int64(usedBytes))
// Flush tinyAllocs.
if spc == tinySpanClass {
- atomic.Xadd64(&memstats.tinyallocs, int64(c.tinyAllocs))
+ atomic.Xadduintptr(&stats.tinyAllocCount, c.tinyAllocs)
c.tinyAllocs = 0
}
+ memstats.heapStats.release()
+
+ // Update gcController.heapLive with the same assumption.
+ usedBytes := uintptr(s.allocCount) * s.elemsize
+ atomic.Xadd64(&gcController.heapLive, int64(s.npages*pageSize)-int64(usedBytes))
// While we're here, flush scanAlloc, since we have to call
// revise anyway.
- atomic.Xadd64(&memstats.heap_scan, int64(c.scanAlloc))
+ atomic.Xadd64(&gcController.heapScan, int64(c.scanAlloc))
c.scanAlloc = 0
if trace.enabled {
- // heap_live changed.
+ // gcController.heapLive changed.
traceHeapAlloc()
}
if gcBlackenEnabled != 0 {
- // heap_live and heap_scan changed.
+ // gcController.heapLive and heapScan changed.
gcController.revise()
}
@@ -198,7 +198,10 @@ func (c *mcache) refill(spc spanClass) {
}
// allocLarge allocates a span for a large object.
-func (c *mcache) allocLarge(size uintptr, needzero bool, noscan bool) *mspan {
+// The boolean result indicates whether the span is known-zeroed.
+// If it did not need to be zeroed, it may not have been zeroed;
+// but if it came directly from the OS, it is already zeroed.
+func (c *mcache) allocLarge(size uintptr, needzero bool, noscan bool) (*mspan, bool) {
if size+_PageSize < size {
throw("out of memory")
}
@@ -213,7 +216,7 @@ func (c *mcache) allocLarge(size uintptr, needzero bool, noscan bool) *mspan {
deductSweepCredit(npages*_PageSize, npages)
spc := makeSpanClass(0, noscan)
- s := mheap_.alloc(npages, spc, needzero)
+ s, isZeroed := mheap_.alloc(npages, spc, needzero)
if s == nil {
throw("out of memory")
}
@@ -222,10 +225,10 @@ func (c *mcache) allocLarge(size uintptr, needzero bool, noscan bool) *mspan {
atomic.Xadduintptr(&stats.largeAllocCount, 1)
memstats.heapStats.release()
- // Update heap_live and revise pacing if needed.
- atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize))
+ // Update gcController.heapLive and revise pacing if needed.
+ atomic.Xadd64(&gcController.heapLive, int64(npages*pageSize))
if trace.enabled {
- // Trace that a heap alloc occurred because heap_live changed.
+ // Trace that a heap alloc occurred because gcController.heapLive changed.
traceHeapAlloc()
}
if gcBlackenEnabled != 0 {
@@ -237,12 +240,12 @@ func (c *mcache) allocLarge(size uintptr, needzero bool, noscan bool) *mspan {
mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s)
s.limit = s.base() + size
heapBitsForAddr(s.base()).initSpan(s)
- return s
+ return s, isZeroed
}
func (c *mcache) releaseAll() {
// Take this opportunity to flush scanAlloc.
- atomic.Xadd64(&memstats.heap_scan, int64(c.scanAlloc))
+ atomic.Xadd64(&gcController.heapScan, int64(c.scanAlloc))
c.scanAlloc = 0
sg := mheap_.sweepgen
@@ -255,14 +258,14 @@ func (c *mcache) releaseAll() {
atomic.Xadduintptr(&stats.smallAllocCount[spanClass(i).sizeclass()], -n)
memstats.heapStats.release()
if s.sweepgen != sg+1 {
- // refill conservatively counted unallocated slots in heap_live.
+ // refill conservatively counted unallocated slots in gcController.heapLive.
// Undo this.
//
// If this span was cached before sweep, then
- // heap_live was totally recomputed since
+ // gcController.heapLive was totally recomputed since
// caching this span, so we don't do this for
// stale spans.
- atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize))
+ atomic.Xadd64(&gcController.heapLive, -int64(n)*int64(s.elemsize))
}
// Release the span to the mcentral.
mheap_.central[i].mcentral.uncacheSpan(s)
@@ -272,10 +275,14 @@ func (c *mcache) releaseAll() {
// Clear tinyalloc pool.
c.tiny = 0
c.tinyoffset = 0
- atomic.Xadd64(&memstats.tinyallocs, int64(c.tinyAllocs))
+
+ // Flush tinyAllocs.
+ stats := memstats.heapStats.acquire()
+ atomic.Xadduintptr(&stats.tinyAllocCount, c.tinyAllocs)
c.tinyAllocs = 0
+ memstats.heapStats.release()
- // Updated heap_scan and possible heap_live.
+	// Updated heapScan and possibly gcController.heapLive.
if gcBlackenEnabled != 0 {
gcController.revise()
}
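allocLarge's new second result moves the zeroing decision to the caller: memory needs clearing only when zeroing was requested and the span is not known-zeroed (that is, it was recycled rather than handed straight from the OS). A toy, self-contained model of that contract; alloc and its recycled parameter are invented for illustration, and the real consumer is mallocgc:

package main

import "fmt"

// alloc models an allocator that hands back either fresh memory (already
// zero) or recycled memory (contents unspecified). The bool mirrors
// allocLarge's new result: whether the block is known-zeroed.
func alloc(n int, recycled []byte) (buf []byte, isZeroed bool) {
	if len(recycled) >= n {
		return recycled[:n], false // reused memory: may contain old data
	}
	return make([]byte, n), true // fresh memory: guaranteed zero
}

func main() {
	old := []byte{1, 2, 3, 4}
	needzero := true

	buf, isZeroed := alloc(4, old)
	if needzero && !isZeroed {
		// Pay for clearing only when the caller asked for zeroed memory
		// and the allocator could not guarantee it.
		for i := range buf {
			buf[i] = 0
		}
	}
	fmt.Println(buf, isZeroed)
}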
diff --git a/libgo/go/runtime/mcentral.go b/libgo/go/runtime/mcentral.go
index cd20dec5394..6013c94c698 100644
--- a/libgo/go/runtime/mcentral.go
+++ b/libgo/go/runtime/mcentral.go
@@ -81,8 +81,6 @@ func (c *mcentral) cacheSpan() *mspan {
spanBytes := uintptr(class_to_allocnpages[c.spanclass.sizeclass()]) * _PageSize
deductSweepCredit(spanBytes, 0)
- sg := mheap_.sweepgen
-
traceDone := false
if trace.enabled {
traceGCSweepStart()
@@ -104,6 +102,8 @@ func (c *mcentral) cacheSpan() *mspan {
spanBudget := 100
var s *mspan
+ sl := newSweepLocker()
+ sg := sl.sweepGen
// Try partial swept spans first.
if s = c.partialSwept(sg).pop(); s != nil {
@@ -116,9 +116,10 @@ func (c *mcentral) cacheSpan() *mspan {
if s == nil {
break
}
- if atomic.Load(&s.sweepgen) == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+ if s, ok := sl.tryAcquire(s); ok {
// We got ownership of the span, so let's sweep it and use it.
s.sweep(true)
+ sl.dispose()
goto havespan
}
// We failed to get ownership of the span, which means it's being or
@@ -135,20 +136,22 @@ func (c *mcentral) cacheSpan() *mspan {
if s == nil {
break
}
- if atomic.Load(&s.sweepgen) == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+ if s, ok := sl.tryAcquire(s); ok {
// We got ownership of the span, so let's sweep it.
s.sweep(true)
// Check if there's any free space.
freeIndex := s.nextFreeIndex()
if freeIndex != s.nelems {
s.freeindex = freeIndex
+ sl.dispose()
goto havespan
}
// Add it to the swept list, because sweeping didn't give us any free space.
- c.fullSwept(sg).push(s)
+ c.fullSwept(sg).push(s.mspan)
}
// See comment for partial unswept spans.
}
+ sl.dispose()
if trace.enabled {
traceGCSweepDone()
traceDone = true
@@ -211,7 +214,13 @@ func (c *mcentral) uncacheSpan(s *mspan) {
if stale {
// It's stale, so just sweep it. Sweeping will put it on
// the right list.
- s.sweep(false)
+ //
+ // We don't use a sweepLocker here. Stale cached spans
+ // aren't in the global sweep lists, so mark termination
+ // itself holds up sweep completion until all mcaches
+ // have been swept.
+ ss := sweepLocked{s}
+ ss.sweep(false)
} else {
if int(s.nelems)-int(s.allocCount) > 0 {
// Put it back on the partial swept list.
@@ -229,14 +238,14 @@ func (c *mcentral) grow() *mspan {
npages := uintptr(class_to_allocnpages[c.spanclass.sizeclass()])
size := uintptr(class_to_size[c.spanclass.sizeclass()])
- s := mheap_.alloc(npages, c.spanclass, true)
+ s, _ := mheap_.alloc(npages, c.spanclass, true)
if s == nil {
return nil
}
// Use division by multiplication and shifts to quickly compute:
// n := (npages << _PageShift) / size
- n := (npages << _PageShift) >> s.divShift * uintptr(s.divMul) >> s.divShift2
+ n := s.divideByElemSize(npages << _PageShift)
s.limit = s.base() + size*n
heapBitsForAddr(s.base()).initSpan(s)
return s
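cacheSpan now funnels span ownership through a sweepLocker instead of open-coding the sweepgen check and CAS. A compact sketch of just that acquisition step, using simplified stand-in types (span, sweepLocker, sweepLocked); the real locker also registers itself as an active sweeper so that dispose can unblock sweep completion, which this sketch omits:

package main

import (
	"fmt"
	"sync/atomic"
)

// span models only the sweepgen field of an mspan. Relative to the heap's
// sweepgen sg: sg-2 means "needs sweeping", sg-1 means "being swept", and
// sg means "swept and ready", following the runtime's convention.
type span struct{ sweepgen uint32 }

// sweepLocked is a token type: holding one means the goroutine won the race
// and may sweep the span, like the runtime's sweepLocked wrapper.
type sweepLocked struct{ *span }

type sweepLocker struct{ sweepGen uint32 }

// tryAcquire performs the load+CAS that the old cacheSpan code inlined.
func (l sweepLocker) tryAcquire(s *span) (sweepLocked, bool) {
	if atomic.LoadUint32(&s.sweepgen) != l.sweepGen-2 {
		return sweepLocked{}, false
	}
	if !atomic.CompareAndSwapUint32(&s.sweepgen, l.sweepGen-2, l.sweepGen-1) {
		return sweepLocked{}, false
	}
	return sweepLocked{s}, true
}

func main() {
	const sg = 10
	s := &span{sweepgen: sg - 2}
	l := sweepLocker{sweepGen: sg}
	if ls, ok := l.tryAcquire(s); ok {
		// Sweep here, then publish the span as swept.
		atomic.StoreUint32(&ls.sweepgen, sg)
	}
	fmt.Println("sweepgen:", s.sweepgen)
}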
diff --git a/libgo/go/runtime/metrics.go b/libgo/go/runtime/metrics.go
index e1f1db2dba3..45a68e47215 100644
--- a/libgo/go/runtime/metrics.go
+++ b/libgo/go/runtime/metrics.go
@@ -98,6 +98,20 @@ func initMetrics() {
}
},
},
+ "/gc/heap/allocs:bytes": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.heapStats.totalAllocated
+ },
+ },
+ "/gc/heap/allocs:objects": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.heapStats.totalAllocs
+ },
+ },
"/gc/heap/frees-by-size:bytes": {
deps: makeStatDepSet(heapStatsDep),
compute: func(in *statAggregate, out *metricValue) {
@@ -110,6 +124,20 @@ func initMetrics() {
}
},
},
+ "/gc/heap/frees:bytes": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.heapStats.totalFreed
+ },
+ },
+ "/gc/heap/frees:objects": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.heapStats.totalFrees
+ },
+ },
"/gc/heap/goal:bytes": {
deps: makeStatDepSet(sysStatsDep),
compute: func(in *statAggregate, out *metricValue) {
@@ -124,6 +152,13 @@ func initMetrics() {
out.scalar = in.heapStats.numObjects
},
},
+ "/gc/heap/tiny/allocs:objects": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = uint64(in.heapStats.tinyAllocCount)
+ },
+ },
"/gc/pauses:seconds": {
compute: func(_ *statAggregate, out *metricValue) {
hist := out.float64HistOrInit(timeHistBuckets)
@@ -245,6 +280,15 @@ func initMetrics() {
out.scalar = uint64(gcount())
},
},
+ "/sched/latencies:seconds": {
+ compute: func(_ *statAggregate, out *metricValue) {
+ hist := out.float64HistOrInit(timeHistBuckets)
+ hist.counts[0] = atomic.Load64(&sched.timeToRun.underflow)
+ for i := range sched.timeToRun.counts {
+ hist.counts[i+1] = atomic.Load64(&sched.timeToRun.counts[i])
+ }
+ },
+ },
}
metricsInit = true
}
@@ -321,6 +365,22 @@ type heapStatsAggregate struct {
// numObjects is the number of live objects in the heap.
numObjects uint64
+
+ // totalAllocated is the total bytes of heap objects allocated
+ // over the lifetime of the program.
+ totalAllocated uint64
+
+ // totalFreed is the total bytes of heap objects freed
+ // over the lifetime of the program.
+ totalFreed uint64
+
+ // totalAllocs is the number of heap objects allocated over
+ // the lifetime of the program.
+ totalAllocs uint64
+
+ // totalFrees is the number of heap objects freed over
+ // the lifetime of the program.
+ totalFrees uint64
}
// compute populates the heapStatsAggregate with values from the runtime.
@@ -328,13 +388,20 @@ func (a *heapStatsAggregate) compute() {
memstats.heapStats.read(&a.heapStatsDelta)
// Calculate derived stats.
- a.inObjects = uint64(a.largeAlloc - a.largeFree)
- a.numObjects = uint64(a.largeAllocCount - a.largeFreeCount)
+ a.totalAllocs = uint64(a.largeAllocCount)
+ a.totalFrees = uint64(a.largeFreeCount)
+ a.totalAllocated = uint64(a.largeAlloc)
+ a.totalFreed = uint64(a.largeFree)
for i := range a.smallAllocCount {
- n := uint64(a.smallAllocCount[i] - a.smallFreeCount[i])
- a.inObjects += n * uint64(class_to_size[i])
- a.numObjects += n
+ na := uint64(a.smallAllocCount[i])
+ nf := uint64(a.smallFreeCount[i])
+ a.totalAllocs += na
+ a.totalFrees += nf
+ a.totalAllocated += na * uint64(class_to_size[i])
+ a.totalFreed += nf * uint64(class_to_size[i])
}
+ a.inObjects = a.totalAllocated - a.totalFreed
+ a.numObjects = a.totalAllocs - a.totalFrees
}
// sysStatsAggregate represents system memory stats obtained
@@ -364,7 +431,7 @@ func (a *sysStatsAggregate) compute() {
a.buckHashSys = memstats.buckhash_sys.load()
a.gcMiscSys = memstats.gcMiscSys.load()
a.otherSys = memstats.other_sys.load()
- a.heapGoal = atomic.Load64(&memstats.next_gc)
+ a.heapGoal = atomic.Load64(&gcController.heapGoal)
a.gcCyclesDone = uint64(memstats.numgc)
a.gcCyclesForced = uint64(memstats.numforcedgc)
@@ -481,7 +548,7 @@ func readMetrics(samplesp unsafe.Pointer, len int, cap int) {
// Acquire the metricsSema but with handoff. This operation
// is expensive enough that queueing up goroutines and handing
- // off between them will be noticably better-behaved.
+ // off between them will be noticeably better-behaved.
semacquire1(&metricsSema, true, 0, 0)
// Ensure the map is initialized.
diff --git a/libgo/go/runtime/metrics/description.go b/libgo/go/runtime/metrics/description.go
index 11751561043..c147cada894 100644
--- a/libgo/go/runtime/metrics/description.go
+++ b/libgo/go/runtime/metrics/description.go
@@ -70,18 +70,51 @@ var allDesc = []Description{
Cumulative: true,
},
{
- Name: "/gc/heap/allocs-by-size:bytes",
- Description: "Distribution of all objects allocated by approximate size.",
- Kind: KindFloat64Histogram,
+ Name: "/gc/heap/allocs-by-size:bytes",
+ Description: "Distribution of heap allocations by approximate size. " +
+ "Note that this does not include tiny objects as defined by " +
+ "/gc/heap/tiny/allocs:objects, only tiny blocks.",
+ Kind: KindFloat64Histogram,
+ Cumulative: true,
+ },
+ {
+ Name: "/gc/heap/allocs:bytes",
+ Description: "Cumulative sum of memory allocated to the heap by the application.",
+ Kind: KindUint64,
Cumulative: true,
},
{
- Name: "/gc/heap/frees-by-size:bytes",
- Description: "Distribution of all objects freed by approximate size.",
- Kind: KindFloat64Histogram,
+ Name: "/gc/heap/allocs:objects",
+ Description: "Cumulative count of heap allocations triggered by the application. " +
+ "Note that this does not include tiny objects as defined by " +
+ "/gc/heap/tiny/allocs:objects, only tiny blocks.",
+ Kind: KindUint64,
+ Cumulative: true,
+ },
+ {
+ Name: "/gc/heap/frees-by-size:bytes",
+ Description: "Distribution of freed heap allocations by approximate size. " +
+ "Note that this does not include tiny objects as defined by " +
+ "/gc/heap/tiny/allocs:objects, only tiny blocks.",
+ Kind: KindFloat64Histogram,
+ Cumulative: true,
+ },
+ {
+ Name: "/gc/heap/frees:bytes",
+ Description: "Cumulative sum of heap memory freed by the garbage collector.",
+ Kind: KindUint64,
Cumulative: true,
},
{
+ Name: "/gc/heap/frees:objects",
+ Description: "Cumulative count of heap allocations whose storage was freed " +
+ "by the garbage collector. " +
+ "Note that this does not include tiny objects as defined by " +
+ "/gc/heap/tiny/allocs:objects, only tiny blocks.",
+ Kind: KindUint64,
+ Cumulative: true,
+ },
+ {
Name: "/gc/heap/goal:bytes",
Description: "Heap size target for the end of the GC cycle.",
Kind: KindUint64,
@@ -92,6 +125,16 @@ var allDesc = []Description{
Kind: KindUint64,
},
{
+ Name: "/gc/heap/tiny/allocs:objects",
+ Description: "Count of small allocations that are packed together into blocks. " +
+ "These allocations are counted separately from other allocations " +
+ "because each individual allocation is not tracked by the runtime, " +
+ "only their block. Each block is already accounted for in " +
+ "allocs-by-size and frees-by-size.",
+ Kind: KindUint64,
+ Cumulative: true,
+ },
+ {
Name: "/gc/pauses:seconds",
Description: "Distribution individual GC-related stop-the-world pause latencies.",
Kind: KindFloat64Histogram,
@@ -176,6 +219,11 @@ var allDesc = []Description{
Description: "Count of live goroutines.",
Kind: KindUint64,
},
+ {
+ Name: "/sched/latencies:seconds",
+ Description: "Distribution of the time goroutines have spent in the scheduler in a runnable state before actually running.",
+ Kind: KindFloat64Histogram,
+ },
}
// All returns a slice of containing metric descriptions for all supported metrics.
diff --git a/libgo/go/runtime/metrics/doc.go b/libgo/go/runtime/metrics/doc.go
index 7f790afc12e..91ef03072de 100644
--- a/libgo/go/runtime/metrics/doc.go
+++ b/libgo/go/runtime/metrics/doc.go
@@ -61,10 +61,30 @@ Below is the full list of supported metrics, ordered lexicographically.
Count of all completed GC cycles.
/gc/heap/allocs-by-size:bytes
- Distribution of all objects allocated by approximate size.
+ Distribution of heap allocations by approximate size.
+ Note that this does not include tiny objects as defined by /gc/heap/tiny/allocs:objects,
+ only tiny blocks.
+
+ /gc/heap/allocs:bytes
+ Cumulative sum of memory allocated to the heap by the application.
+
+ /gc/heap/allocs:objects
+ Cumulative count of heap allocations triggered by the application.
+ Note that this does not include tiny objects as defined by /gc/heap/tiny/allocs:objects,
+ only tiny blocks.
/gc/heap/frees-by-size:bytes
- Distribution of all objects freed by approximate size.
+ Distribution of freed heap allocations by approximate size.
+ Note that this does not include tiny objects as defined by /gc/heap/tiny/allocs:objects,
+ only tiny blocks.
+
+ /gc/heap/frees:bytes
+ Cumulative sum of heap memory freed by the garbage collector.
+
+ /gc/heap/frees:objects
+ Cumulative count of heap allocations whose storage was freed by the garbage collector.
+ Note that this does not include tiny objects as defined by /gc/heap/tiny/allocs:objects,
+ only tiny blocks.
/gc/heap/goal:bytes
Heap size target for the end of the GC cycle.
@@ -72,6 +92,13 @@ Below is the full list of supported metrics, ordered lexicographically.
/gc/heap/objects:objects
Number of objects, live or unswept, occupying heap memory.
+ /gc/heap/tiny/allocs:objects
+ Count of small allocations that are packed together into blocks.
+ These allocations are counted separately from other allocations
+ because each individual allocation is not tracked by the runtime,
+ only their block. Each block is already accounted for in
+ allocs-by-size and frees-by-size.
+
/gc/pauses:seconds
Distribution individual GC-related stop-the-world pause latencies.
@@ -139,5 +166,9 @@ Below is the full list of supported metrics, ordered lexicographically.
/sched/goroutines:goroutines
Count of live goroutines.
+
+ /sched/latencies:seconds
+ Distribution of the time goroutines have spent in the scheduler
+ in a runnable state before actually running.
*/
package metrics
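The names documented above are consumed through the public runtime/metrics API. A small reader for the newly added metrics might look like the following; the output formatting is illustrative only:

package main

import (
	"fmt"
	"runtime/metrics"
)

func main() {
	samples := []metrics.Sample{
		{Name: "/gc/heap/allocs:bytes"},
		{Name: "/gc/heap/frees:bytes"},
		{Name: "/gc/heap/tiny/allocs:objects"},
		{Name: "/sched/latencies:seconds"},
	}
	metrics.Read(samples)

	for _, s := range samples {
		switch s.Value.Kind() {
		case metrics.KindUint64:
			fmt.Printf("%s = %d\n", s.Name, s.Value.Uint64())
		case metrics.KindFloat64Histogram:
			h := s.Value.Float64Histogram()
			// Buckets has one more element than Counts: Counts[i] covers
			// the interval [Buckets[i], Buckets[i+1]).
			fmt.Printf("%s: %d buckets\n", s.Name, len(h.Counts))
		case metrics.KindBad:
			fmt.Printf("%s: not supported by this runtime\n", s.Name)
		}
	}
}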
diff --git a/libgo/go/runtime/metrics_test.go b/libgo/go/runtime/metrics_test.go
index 8a3cf019bdb..5d32ef469ca 100644
--- a/libgo/go/runtime/metrics_test.go
+++ b/libgo/go/runtime/metrics_test.go
@@ -40,6 +40,9 @@ func TestReadMetrics(t *testing.T) {
}
// Check to make sure the values we read line up with other values we read.
+ var allocsBySize *metrics.Float64Histogram
+ var tinyAllocs uint64
+ var mallocs, frees uint64
for i := range samples {
switch name := samples[i].Name; name {
case "/memory/classes/heap/free:bytes":
@@ -84,6 +87,9 @@ func TestReadMetrics(t *testing.T) {
t.Errorf("histogram counts do not much BySize for class %d: got %d, want %d", i, c, m)
}
}
+ allocsBySize = hist
+ case "/gc/heap/allocs:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.TotalAlloc)
case "/gc/heap/frees-by-size:bytes":
hist := samples[i].Value.Float64Histogram()
// Skip size class 0 in BySize, because it's always empty and not represented
@@ -95,9 +101,29 @@ func TestReadMetrics(t *testing.T) {
continue
}
if c, f := hist.Counts[i], sc.Frees; c != f {
- t.Errorf("histogram counts do not much BySize for class %d: got %d, want %d", i, c, f)
+ t.Errorf("histogram counts do not match BySize for class %d: got %d, want %d", i, c, f)
}
}
+ case "/gc/heap/frees:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.TotalAlloc-mstats.HeapAlloc)
+ case "/gc/heap/tiny/allocs:objects":
+ // Currently, MemStats adds tiny alloc count to both Mallocs AND Frees.
+ // The reason for this is because MemStats couldn't be extended at the time
+ // but there was a desire to have Mallocs at least be a little more representative,
+ // while having Mallocs - Frees still represent a live object count.
+ // Unfortunately, MemStats doesn't actually export a large allocation count,
+ // so it's impossible to pull this number out directly.
+ //
+ // Check tiny allocation count outside of this loop, by using the allocs-by-size
+ // histogram in order to figure out how many large objects there are.
+ tinyAllocs = samples[i].Value.Uint64()
+ // Because the next two metrics tests are checking against Mallocs and Frees,
+ // we can't check them directly for the same reason: we need to account for tiny
+ // allocations included in Mallocs and Frees.
+ case "/gc/heap/allocs:objects":
+ mallocs = samples[i].Value.Uint64()
+ case "/gc/heap/frees:objects":
+ frees = samples[i].Value.Uint64()
case "/gc/heap/objects:objects":
checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapObjects)
case "/gc/heap/goal:bytes":
@@ -110,6 +136,17 @@ func TestReadMetrics(t *testing.T) {
checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumGC))
}
}
+
+ // Check tinyAllocs.
+ nonTinyAllocs := uint64(0)
+ for _, c := range allocsBySize.Counts {
+ nonTinyAllocs += c
+ }
+ checkUint64(t, "/gc/heap/tiny/allocs:objects", tinyAllocs, mstats.Mallocs-nonTinyAllocs)
+
+ // Check allocation and free counts.
+ checkUint64(t, "/gc/heap/allocs:objects", mallocs, mstats.Mallocs-tinyAllocs)
+ checkUint64(t, "/gc/heap/frees:objects", frees, mstats.Frees-tinyAllocs)
}
func TestReadMetricsConsistency(t *testing.T) {
@@ -132,8 +169,10 @@ func TestReadMetricsConsistency(t *testing.T) {
got, want uint64
}
var objects struct {
- alloc, free *metrics.Float64Histogram
- total uint64
+ alloc, free *metrics.Float64Histogram
+ allocs, frees uint64
+ allocdBytes, freedBytes uint64
+ total, totalBytes uint64
}
var gc struct {
numGC uint64
@@ -159,10 +198,20 @@ func TestReadMetricsConsistency(t *testing.T) {
switch samples[i].Name {
case "/memory/classes/total:bytes":
totalVirtual.got = samples[i].Value.Uint64()
+ case "/memory/classes/heap/objects:bytes":
+ objects.totalBytes = samples[i].Value.Uint64()
case "/gc/heap/objects:objects":
objects.total = samples[i].Value.Uint64()
+ case "/gc/heap/allocs:bytes":
+ objects.allocdBytes = samples[i].Value.Uint64()
+ case "/gc/heap/allocs:objects":
+ objects.allocs = samples[i].Value.Uint64()
case "/gc/heap/allocs-by-size:bytes":
objects.alloc = samples[i].Value.Float64Histogram()
+ case "/gc/heap/frees:bytes":
+ objects.freedBytes = samples[i].Value.Uint64()
+ case "/gc/heap/frees:objects":
+ objects.frees = samples[i].Value.Uint64()
case "/gc/heap/frees-by-size:bytes":
objects.free = samples[i].Value.Float64Histogram()
case "/gc/cycles:gc-cycles":
@@ -182,6 +231,12 @@ func TestReadMetricsConsistency(t *testing.T) {
if totalVirtual.got != totalVirtual.want {
t.Errorf(`"/memory/classes/total:bytes" does not match sum of /memory/classes/**: got %d, want %d`, totalVirtual.got, totalVirtual.want)
}
+ if got, want := objects.allocs-objects.frees, objects.total; got != want {
+ t.Errorf("mismatch between object alloc/free tallies and total: got %d, want %d", got, want)
+ }
+ if got, want := objects.allocdBytes-objects.freedBytes, objects.totalBytes; got != want {
+	t.Errorf("mismatch between object alloc/free byte tallies and total bytes: got %d, want %d", got, want)
+ }
if b, c := len(objects.alloc.Buckets), len(objects.alloc.Counts); b != c+1 {
t.Errorf("allocs-by-size has wrong bucket or counts length: %d buckets, %d counts", b, c)
}
@@ -201,17 +256,25 @@ func TestReadMetricsConsistency(t *testing.T) {
}
}
if !t.Failed() {
- got, want := uint64(0), objects.total
+ var gotAlloc, gotFree uint64
+ want := objects.total
for i := range objects.alloc.Counts {
if objects.alloc.Counts[i] < objects.free.Counts[i] {
t.Errorf("found more allocs than frees in object dist bucket %d", i)
continue
}
- got += objects.alloc.Counts[i] - objects.free.Counts[i]
+ gotAlloc += objects.alloc.Counts[i]
+ gotFree += objects.free.Counts[i]
}
- if got != want {
+ if got := gotAlloc - gotFree; got != want {
t.Errorf("object distribution counts don't match count of live objects: got %d, want %d", got, want)
}
+ if gotAlloc != objects.allocs {
+ t.Errorf("object distribution counts don't match total allocs: got %d, want %d", gotAlloc, objects.allocs)
+ }
+ if gotFree != objects.frees {
+			t.Errorf("object distribution counts don't match total frees: got %d, want %d", gotFree, objects.frees)
+ }
}
}
// The current GC has at least 2 pauses per GC.
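The invariant this test enforces is visible from user code as well: within a single metrics.Read call the heap statistics come from one snapshot, so cumulative allocs minus cumulative frees should equal the live object count. An illustrative reader (not part of the test):

package main

import (
	"fmt"
	"runtime/metrics"
)

func main() {
	samples := []metrics.Sample{
		{Name: "/gc/heap/allocs:objects"},
		{Name: "/gc/heap/frees:objects"},
		{Name: "/gc/heap/objects:objects"},
	}
	metrics.Read(samples)

	allocs := samples[0].Value.Uint64()
	frees := samples[1].Value.Uint64()
	live := samples[2].Value.Uint64()
	fmt.Printf("allocs=%d frees=%d live=%d (allocs-frees=%d)\n",
		allocs, frees, live, allocs-frees)
}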
diff --git a/libgo/go/runtime/mgc.go b/libgo/go/runtime/mgc.go
index a9f2c1abc34..efb80120edf 100644
--- a/libgo/go/runtime/mgc.go
+++ b/libgo/go/runtime/mgc.go
@@ -113,8 +113,8 @@
// Next GC is after we've allocated an extra amount of memory proportional to
// the amount already in use. The proportion is controlled by GOGC environment variable
// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
-// (this mark is tracked in next_gc variable). This keeps the GC cost in linear
-// proportion to the allocation cost. Adjusting GOGC just changes the linear constant
+// (this mark is tracked in gcController.heapGoal variable). This keeps the GC cost in
+// linear proportion to the allocation cost. Adjusting GOGC just changes the linear constant
// (and also the amount of extra memory used).
// Oblets
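A tiny sketch of the rule this paragraph describes, matching the goal computation in the gcSetTriggerRatio code removed further down; heapGoal is an invented helper, and the real pacer additionally applies a minimum heap size and trigger bounds:

package main

import "fmt"

// heapGoal grows the live (marked) heap by GOGC percent; GOGC<0 means off.
func heapGoal(heapMarked uint64, gcPercent int) uint64 {
	if gcPercent < 0 {
		return ^uint64(0) // effectively no goal
	}
	return heapMarked + heapMarked*uint64(gcPercent)/100
}

func main() {
	fmt.Println(heapGoal(4<<20, 100)) // 4 MiB live, GOGC=100: 8 MiB goal
	fmt.Println(heapGoal(4<<20, 50))  // GOGC=50: 6 MiB goal
	fmt.Println(heapGoal(4<<20, -1))  // GOGC=off: no goal
}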
@@ -149,45 +149,16 @@ const (
sweepMinHeapDistance = 1024 * 1024
)
-// heapminimum is the minimum heap size at which to trigger GC.
-// For small heaps, this overrides the usual GOGC*live set rule.
-//
-// When there is a very small live set but a lot of allocation, simply
-// collecting when the heap reaches GOGC*live results in many GC
-// cycles and high total per-GC overhead. This minimum amortizes this
-// per-GC overhead while keeping the heap reasonably small.
-//
-// During initialization this is set to 4MB*GOGC/100. In the case of
-// GOGC==0, this will set heapminimum to 0, resulting in constant
-// collection even when the heap size is small, which is useful for
-// debugging.
-var heapminimum uint64 = defaultHeapMinimum
-
-// defaultHeapMinimum is the value of heapminimum for GOGC==100.
-const defaultHeapMinimum = 4 << 20
-
-// Initialized from $GOGC. GOGC=off means no GC.
-var gcpercent int32
-
func gcinit() {
if unsafe.Sizeof(workbuf{}) != _WorkbufSize {
throw("size of Workbuf is suboptimal")
}
-
// No sweep on the first cycle.
- mheap_.sweepdone = 1
+ mheap_.sweepDrained = 1
- // Set a reasonable initial GC trigger.
- memstats.triggerRatio = 7 / 8.0
-
- // Fake a heap_marked value so it looks like a trigger at
- // heapminimum is the appropriate growth from heap_marked.
- // This will go into computing the initial GC goal.
- memstats.heap_marked = uint64(float64(heapminimum) / (1 + memstats.triggerRatio))
-
- // Set gcpercent from the environment. This will also compute
- // and set the GC trigger and goal.
- _ = setGCPercent(readgogc())
+ // Initialize GC pacer state.
+ // Use the environment variable GOGC for the initial gcPercent value.
+ gcController.init(readGOGC())
work.startSema = 1
work.markDoneSema = 1
@@ -196,16 +167,9 @@ func gcinit() {
lockInit(&work.wbufSpans.lock, lockRankWbufSpans)
}
-func readgogc() int32 {
- p := gogetenv("GOGC")
- if p == "off" {
- return -1
- }
- if n, ok := atoi32(p); ok {
- return n
- }
- return 100
-}
+// Temporary in order to enable register ABI work.
+// TODO(register args): convert back to local chan in gcenable, passed to "go" stmts.
+var gcenable_setup chan int
// gcenable is called after the bulk of the runtime initialization,
// just before we're about to start letting user code run.
@@ -213,41 +177,17 @@ func readgogc() int32 {
// scavenger goroutine, and enables GC.
func gcenable() {
// Kick off sweeping and scavenging.
- c := make(chan int, 2)
+ gcenable_setup = make(chan int, 2)
expectSystemGoroutine()
- go bgsweep(c)
+ go bgsweep()
expectSystemGoroutine()
- go bgscavenge(c)
- <-c
- <-c
+ go bgscavenge()
+ <-gcenable_setup
+ <-gcenable_setup
+ gcenable_setup = nil
memstats.enablegc = true // now that runtime is initialized, GC is okay
}
-//go:linkname setGCPercent runtime_1debug.setGCPercent
-func setGCPercent(in int32) (out int32) {
- // Run on the system stack since we grab the heap lock.
- systemstack(func() {
- lock(&mheap_.lock)
- out = gcpercent
- if in < 0 {
- in = -1
- }
- gcpercent = in
- heapminimum = defaultHeapMinimum * uint64(gcpercent) / 100
- // Update pacing in response to gcpercent change.
- gcSetTriggerRatio(memstats.triggerRatio)
- unlock(&mheap_.lock)
- })
-
- // If we just disabled GC, wait for any concurrent GC mark to
- // finish so we always return with no GC running.
- if in < 0 {
- gcWaitOnMark(atomic.Load(&work.cycles))
- }
-
- return out
-}
-
// Garbage collector phase.
// Indicates to write barrier and synchronization task to perform.
var gcphase uint32
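With setGCPercent removed from mgc.go and the pacer seeded from readGOGC at startup, the user-facing knobs stay the same: the GOGC environment variable, or runtime/debug.SetGCPercent at run time. For example:

package main

import (
	"fmt"
	"runtime/debug"
)

func main() {
	// SetGCPercent returns the previous setting.
	old := debug.SetGCPercent(50) // collect more eagerly than the default 100
	fmt.Println("previous GOGC:", old)

	debug.SetGCPercent(-1)  // same effect as GOGC=off: disable collection
	debug.SetGCPercent(old) // restore
}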
@@ -304,9 +244,11 @@ const (
// gcMarkWorkerFractionalMode indicates that a P is currently
// running the "fractional" mark worker. The fractional worker
// is necessary when GOMAXPROCS*gcBackgroundUtilization is not
- // an integer. The fractional worker should run until it is
- // preempted and will be scheduled to pick up the fractional
- // part of GOMAXPROCS*gcBackgroundUtilization.
+ // an integer and using only dedicated workers would result in
+ // utilization too far from the target of gcBackgroundUtilization.
+ // The fractional worker should run until it is preempted and
+ // will be scheduled to pick up the fractional part of
+ // GOMAXPROCS*gcBackgroundUtilization.
gcMarkWorkerFractionalMode
// gcMarkWorkerIdleMode indicates that a P is running the mark
@@ -325,474 +267,6 @@ var gcMarkWorkerModeStrings = [...]string{
"GC (idle)",
}
-// gcController implements the GC pacing controller that determines
-// when to trigger concurrent garbage collection and how much marking
-// work to do in mutator assists and background marking.
-//
-// It uses a feedback control algorithm to adjust the memstats.gc_trigger
-// trigger based on the heap growth and GC CPU utilization each cycle.
-// This algorithm optimizes for heap growth to match GOGC and for CPU
-// utilization between assist and background marking to be 25% of
-// GOMAXPROCS. The high-level design of this algorithm is documented
-// at https://golang.org/s/go15gcpacing.
-//
-// All fields of gcController are used only during a single mark
-// cycle.
-var gcController gcControllerState
-
-type gcControllerState struct {
- // scanWork is the total scan work performed this cycle. This
- // is updated atomically during the cycle. Updates occur in
- // bounded batches, since it is both written and read
- // throughout the cycle. At the end of the cycle, this is how
- // much of the retained heap is scannable.
- //
- // Currently this is the bytes of heap scanned. For most uses,
- // this is an opaque unit of work, but for estimation the
- // definition is important.
- scanWork int64
-
- // bgScanCredit is the scan work credit accumulated by the
- // concurrent background scan. This credit is accumulated by
- // the background scan and stolen by mutator assists. This is
- // updated atomically. Updates occur in bounded batches, since
- // it is both written and read throughout the cycle.
- bgScanCredit int64
-
- // assistTime is the nanoseconds spent in mutator assists
- // during this cycle. This is updated atomically. Updates
- // occur in bounded batches, since it is both written and read
- // throughout the cycle.
- assistTime int64
-
- // dedicatedMarkTime is the nanoseconds spent in dedicated
- // mark workers during this cycle. This is updated atomically
- // at the end of the concurrent mark phase.
- dedicatedMarkTime int64
-
- // fractionalMarkTime is the nanoseconds spent in the
- // fractional mark worker during this cycle. This is updated
- // atomically throughout the cycle and will be up-to-date if
- // the fractional mark worker is not currently running.
- fractionalMarkTime int64
-
- // idleMarkTime is the nanoseconds spent in idle marking
- // during this cycle. This is updated atomically throughout
- // the cycle.
- idleMarkTime int64
-
- // markStartTime is the absolute start time in nanoseconds
- // that assists and background mark workers started.
- markStartTime int64
-
- // dedicatedMarkWorkersNeeded is the number of dedicated mark
- // workers that need to be started. This is computed at the
- // beginning of each cycle and decremented atomically as
- // dedicated mark workers get started.
- dedicatedMarkWorkersNeeded int64
-
- // assistWorkPerByte is the ratio of scan work to allocated
- // bytes that should be performed by mutator assists. This is
- // computed at the beginning of each cycle and updated every
- // time heap_scan is updated.
- //
- // Stored as a uint64, but it's actually a float64. Use
- // float64frombits to get the value.
- //
- // Read and written atomically.
- assistWorkPerByte uint64
-
- // assistBytesPerWork is 1/assistWorkPerByte.
- //
- // Stored as a uint64, but it's actually a float64. Use
- // float64frombits to get the value.
- //
- // Read and written atomically.
- //
- // Note that because this is read and written independently
- // from assistWorkPerByte users may notice a skew between
- // the two values, and such a state should be safe.
- assistBytesPerWork uint64
-
- // fractionalUtilizationGoal is the fraction of wall clock
- // time that should be spent in the fractional mark worker on
- // each P that isn't running a dedicated worker.
- //
- // For example, if the utilization goal is 25% and there are
- // no dedicated workers, this will be 0.25. If the goal is
- // 25%, there is one dedicated worker, and GOMAXPROCS is 5,
- // this will be 0.05 to make up the missing 5%.
- //
- // If this is zero, no fractional workers are needed.
- fractionalUtilizationGoal float64
-
- _ cpu.CacheLinePad
-}
-
-// startCycle resets the GC controller's state and computes estimates
-// for a new GC cycle. The caller must hold worldsema and the world
-// must be stopped.
-func (c *gcControllerState) startCycle() {
- c.scanWork = 0
- c.bgScanCredit = 0
- c.assistTime = 0
- c.dedicatedMarkTime = 0
- c.fractionalMarkTime = 0
- c.idleMarkTime = 0
-
- // Ensure that the heap goal is at least a little larger than
- // the current live heap size. This may not be the case if GC
- // start is delayed or if the allocation that pushed heap_live
- // over gc_trigger is large or if the trigger is really close to
- // GOGC. Assist is proportional to this distance, so enforce a
- // minimum distance, even if it means going over the GOGC goal
- // by a tiny bit.
- if memstats.next_gc < memstats.heap_live+1024*1024 {
- memstats.next_gc = memstats.heap_live + 1024*1024
- }
-
- // Compute the background mark utilization goal. In general,
- // this may not come out exactly. We round the number of
- // dedicated workers so that the utilization is closest to
- // 25%. For small GOMAXPROCS, this would introduce too much
- // error, so we add fractional workers in that case.
- totalUtilizationGoal := float64(gomaxprocs) * gcBackgroundUtilization
- c.dedicatedMarkWorkersNeeded = int64(totalUtilizationGoal + 0.5)
- utilError := float64(c.dedicatedMarkWorkersNeeded)/totalUtilizationGoal - 1
- const maxUtilError = 0.3
- if utilError < -maxUtilError || utilError > maxUtilError {
- // Rounding put us more than 30% off our goal. With
- // gcBackgroundUtilization of 25%, this happens for
- // GOMAXPROCS<=3 or GOMAXPROCS=6. Enable fractional
- // workers to compensate.
- if float64(c.dedicatedMarkWorkersNeeded) > totalUtilizationGoal {
- // Too many dedicated workers.
- c.dedicatedMarkWorkersNeeded--
- }
- c.fractionalUtilizationGoal = (totalUtilizationGoal - float64(c.dedicatedMarkWorkersNeeded)) / float64(gomaxprocs)
- } else {
- c.fractionalUtilizationGoal = 0
- }
-
- // In STW mode, we just want dedicated workers.
- if debug.gcstoptheworld > 0 {
- c.dedicatedMarkWorkersNeeded = int64(gomaxprocs)
- c.fractionalUtilizationGoal = 0
- }
-
- // Clear per-P state
- for _, p := range allp {
- p.gcAssistTime = 0
- p.gcFractionalMarkTime = 0
- }
-
- // Compute initial values for controls that are updated
- // throughout the cycle.
- c.revise()
-
- if debug.gcpacertrace > 0 {
- assistRatio := float64frombits(atomic.Load64(&c.assistWorkPerByte))
- print("pacer: assist ratio=", assistRatio,
- " (scan ", memstats.heap_scan>>20, " MB in ",
- work.initialHeapLive>>20, "->",
- memstats.next_gc>>20, " MB)",
- " workers=", c.dedicatedMarkWorkersNeeded,
- "+", c.fractionalUtilizationGoal, "\n")
- }
-}
-
-// revise updates the assist ratio during the GC cycle to account for
-// improved estimates. This should be called whenever memstats.heap_scan,
-// memstats.heap_live, or memstats.next_gc is updated. It is safe to
-// call concurrently, but it may race with other calls to revise.
-//
-// The result of this race is that the two assist ratio values may not line
-// up or may be stale. In practice this is OK because the assist ratio
-// moves slowly throughout a GC cycle, and the assist ratio is a best-effort
-// heuristic anyway. Furthermore, no part of the heuristic depends on
-// the two assist ratio values being exact reciprocals of one another, since
-// the two values are used to convert values from different sources.
-//
-// The worst case result of this raciness is that we may miss a larger shift
-// in the ratio (say, if we decide to pace more aggressively against the
-// hard heap goal) but even this "hard goal" is best-effort (see #40460).
-// The dedicated GC should ensure we don't exceed the hard goal by too much
-// in the rare case we do exceed it.
-//
-// It should only be called when gcBlackenEnabled != 0 (because this
-// is when assists are enabled and the necessary statistics are
-// available).
-func (c *gcControllerState) revise() {
- gcpercent := gcpercent
- if gcpercent < 0 {
- // If GC is disabled but we're running a forced GC,
- // act like GOGC is huge for the below calculations.
- gcpercent = 100000
- }
- live := atomic.Load64(&memstats.heap_live)
- scan := atomic.Load64(&memstats.heap_scan)
- work := atomic.Loadint64(&c.scanWork)
-
- // Assume we're under the soft goal. Pace GC to complete at
- // next_gc assuming the heap is in steady-state.
- heapGoal := int64(atomic.Load64(&memstats.next_gc))
-
- // Compute the expected scan work remaining.
- //
- // This is estimated based on the expected
- // steady-state scannable heap. For example, with
- // GOGC=100, only half of the scannable heap is
- // expected to be live, so that's what we target.
- //
- // (This is a float calculation to avoid overflowing on
- // 100*heap_scan.)
- scanWorkExpected := int64(float64(scan) * 100 / float64(100+gcpercent))
-
- if int64(live) > heapGoal || work > scanWorkExpected {
- // We're past the soft goal, or we've already done more scan
- // work than we expected. Pace GC so that in the worst case it
- // will complete by the hard goal.
- const maxOvershoot = 1.1
- heapGoal = int64(float64(heapGoal) * maxOvershoot)
-
- // Compute the upper bound on the scan work remaining.
- scanWorkExpected = int64(scan)
- }
-
- // Compute the remaining scan work estimate.
- //
- // Note that we currently count allocations during GC as both
- // scannable heap (heap_scan) and scan work completed
- // (scanWork), so allocation will change this difference
- // slowly in the soft regime and not at all in the hard
- // regime.
- scanWorkRemaining := scanWorkExpected - work
- if scanWorkRemaining < 1000 {
- // We set a somewhat arbitrary lower bound on
- // remaining scan work since if we aim a little high,
- // we can miss by a little.
- //
- // We *do* need to enforce that this is at least 1,
- // since marking is racy and double-scanning objects
- // may legitimately make the remaining scan work
- // negative, even in the hard goal regime.
- scanWorkRemaining = 1000
- }
-
- // Compute the heap distance remaining.
- heapRemaining := heapGoal - int64(live)
- if heapRemaining <= 0 {
- // This shouldn't happen, but if it does, avoid
- // dividing by zero or setting the assist negative.
- heapRemaining = 1
- }
-
- // Compute the mutator assist ratio so by the time the mutator
- // allocates the remaining heap bytes up to next_gc, it will
- // have done (or stolen) the remaining amount of scan work.
- // Note that the assist ratio values are updated atomically
- // but not together. This means there may be some degree of
- // skew between the two values. This is generally OK as the
- // values shift relatively slowly over the course of a GC
- // cycle.
- assistWorkPerByte := float64(scanWorkRemaining) / float64(heapRemaining)
- assistBytesPerWork := float64(heapRemaining) / float64(scanWorkRemaining)
- atomic.Store64(&c.assistWorkPerByte, float64bits(assistWorkPerByte))
- atomic.Store64(&c.assistBytesPerWork, float64bits(assistBytesPerWork))
-}
-
-// endCycle computes the trigger ratio for the next cycle.
-func (c *gcControllerState) endCycle() float64 {
- if work.userForced {
- // Forced GC means this cycle didn't start at the
- // trigger, so where it finished isn't good
- // information about how to adjust the trigger.
- // Just leave it where it is.
- return memstats.triggerRatio
- }
-
- // Proportional response gain for the trigger controller. Must
- // be in [0, 1]. Lower values smooth out transient effects but
- // take longer to respond to phase changes. Higher values
- // react to phase changes quickly, but are more affected by
- // transient changes. Values near 1 may be unstable.
- const triggerGain = 0.5
-
- // Compute next cycle trigger ratio. First, this computes the
- // "error" for this cycle; that is, how far off the trigger
- // was from what it should have been, accounting for both heap
- // growth and GC CPU utilization. We compute the actual heap
- // growth during this cycle and scale that by how far off from
- // the goal CPU utilization we were (to estimate the heap
- // growth if we had the desired CPU utilization). The
- // difference between this estimate and the GOGC-based goal
- // heap growth is the error.
- goalGrowthRatio := gcEffectiveGrowthRatio()
- actualGrowthRatio := float64(memstats.heap_live)/float64(memstats.heap_marked) - 1
- assistDuration := nanotime() - c.markStartTime
-
- // Assume background mark hit its utilization goal.
- utilization := gcBackgroundUtilization
- // Add assist utilization; avoid divide by zero.
- if assistDuration > 0 {
- utilization += float64(c.assistTime) / float64(assistDuration*int64(gomaxprocs))
- }
-
- triggerError := goalGrowthRatio - memstats.triggerRatio - utilization/gcGoalUtilization*(actualGrowthRatio-memstats.triggerRatio)
-
- // Finally, we adjust the trigger for next time by this error,
- // damped by the proportional gain.
- triggerRatio := memstats.triggerRatio + triggerGain*triggerError
-
- if debug.gcpacertrace > 0 {
- // Print controller state in terms of the design
- // document.
- H_m_prev := memstats.heap_marked
- h_t := memstats.triggerRatio
- H_T := memstats.gc_trigger
- h_a := actualGrowthRatio
- H_a := memstats.heap_live
- h_g := goalGrowthRatio
- H_g := int64(float64(H_m_prev) * (1 + h_g))
- u_a := utilization
- u_g := gcGoalUtilization
- W_a := c.scanWork
- print("pacer: H_m_prev=", H_m_prev,
- " h_t=", h_t, " H_T=", H_T,
- " h_a=", h_a, " H_a=", H_a,
- " h_g=", h_g, " H_g=", H_g,
- " u_a=", u_a, " u_g=", u_g,
- " W_a=", W_a,
- " goalΔ=", goalGrowthRatio-h_t,
- " actualΔ=", h_a-h_t,
- " u_a/u_g=", u_a/u_g,
- "\n")
- }
-
- return triggerRatio
-}
-
-// enlistWorker encourages another dedicated mark worker to start on
-// another P if there are spare worker slots. It is used by putfull
-// when more work is made available.
-//
-//go:nowritebarrier
-func (c *gcControllerState) enlistWorker() {
- // If there are idle Ps, wake one so it will run an idle worker.
- // NOTE: This is suspected of causing deadlocks. See golang.org/issue/19112.
- //
- // if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 {
- // wakep()
- // return
- // }
-
- // There are no idle Ps. If we need more dedicated workers,
- // try to preempt a running P so it will switch to a worker.
- if c.dedicatedMarkWorkersNeeded <= 0 {
- return
- }
- // Pick a random other P to preempt.
- if gomaxprocs <= 1 {
- return
- }
- gp := getg()
- if gp == nil || gp.m == nil || gp.m.p == 0 {
- return
- }
- myID := gp.m.p.ptr().id
- for tries := 0; tries < 5; tries++ {
- id := int32(fastrandn(uint32(gomaxprocs - 1)))
- if id >= myID {
- id++
- }
- p := allp[id]
- if p.status != _Prunning {
- continue
- }
- if preemptone(p) {
- return
- }
- }
-}
-
-// findRunnableGCWorker returns a background mark worker for _p_ if it
-// should be run. This must only be called when gcBlackenEnabled != 0.
-func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
- if gcBlackenEnabled == 0 {
- throw("gcControllerState.findRunnable: blackening not enabled")
- }
-
- if !gcMarkWorkAvailable(_p_) {
- // No work to be done right now. This can happen at
- // the end of the mark phase when there are still
- // assists tapering off. Don't bother running a worker
- // now because it'll just return immediately.
- return nil
- }
-
- // Grab a worker before we commit to running below.
- node := (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop())
- if node == nil {
- // There is at least one worker per P, so normally there are
- // enough workers to run on all Ps, if necessary. However, once
- // a worker enters gcMarkDone it may park without rejoining the
- // pool, thus freeing a P with no corresponding worker.
- // gcMarkDone never depends on another worker doing work, so it
- // is safe to simply do nothing here.
- //
- // If gcMarkDone bails out without completing the mark phase,
- // it will always do so with queued global work. Thus, that P
- // will be immediately eligible to re-run the worker G it was
- // just using, ensuring work can complete.
- return nil
- }
-
- decIfPositive := func(ptr *int64) bool {
- for {
- v := atomic.Loadint64(ptr)
- if v <= 0 {
- return false
- }
-
- // TODO: having atomic.Casint64 would be more pleasant.
- if atomic.Cas64((*uint64)(unsafe.Pointer(ptr)), uint64(v), uint64(v-1)) {
- return true
- }
- }
- }
-
- if decIfPositive(&c.dedicatedMarkWorkersNeeded) {
- // This P is now dedicated to marking until the end of
- // the concurrent mark phase.
- _p_.gcMarkWorkerMode = gcMarkWorkerDedicatedMode
- } else if c.fractionalUtilizationGoal == 0 {
- // No need for fractional workers.
- gcBgMarkWorkerPool.push(&node.node)
- return nil
- } else {
- // Is this P behind on the fractional utilization
- // goal?
- //
- // This should be kept in sync with pollFractionalWorkerExit.
- delta := nanotime() - gcController.markStartTime
- if delta > 0 && float64(_p_.gcFractionalMarkTime)/float64(delta) > c.fractionalUtilizationGoal {
- // Nope. No need to run a fractional worker.
- gcBgMarkWorkerPool.push(&node.node)
- return nil
- }
- // Run a fractional worker.
- _p_.gcMarkWorkerMode = gcMarkWorkerFractionalMode
- }
-
- // Run the background mark worker.
- gp := node.gp.ptr()
- casgstatus(gp, _Gwaiting, _Grunnable)
- if trace.enabled {
- traceGoUnpark(gp, 0)
- }
- return gp
-}
-
// pollFractionalWorkerExit reports whether a fractional mark worker
// should self-preempt. It assumes it is called from the fractional
// worker.
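For reference, the assist pacing that the removed revise method implemented reduces to a ratio of remaining scan work to remaining heap growth. A condensed, self-contained sketch of that arithmetic; assistRatio is an invented name, and the real code also publishes the reciprocal and stores both values atomically:

package main

import "fmt"

// assistRatio returns how many bytes of scan work a mutator must perform
// per byte it allocates, given the live heap, scannable heap, scan work
// already done, the heap goal, and GOGC.
func assistRatio(live, scan, done, goal, gcPercent int64) float64 {
	if gcPercent < 0 {
		gcPercent = 100000 // forced GC with GOGC=off: act as if GOGC is huge
	}
	// Steady-state estimate: only 100/(100+GOGC) of the scannable heap is
	// expected to be live at the goal.
	expected := int64(float64(scan) * 100 / float64(100+gcPercent))
	if live > goal || done > expected {
		// Past the soft goal, or ahead of the estimate: pace against a hard
		// goal 10% above it and assume the whole scannable heap remains.
		goal = int64(float64(goal) * 1.1)
		expected = scan
	}
	remainingWork := expected - done
	if remainingWork < 1000 {
		remainingWork = 1000 // keep the ratio positive despite racy updates
	}
	remainingHeap := goal - live
	if remainingHeap <= 0 {
		remainingHeap = 1 // avoid division by zero near the goal
	}
	return float64(remainingWork) / float64(remainingHeap)
}

func main() {
	// 6 MiB live of an 8 MiB goal, 5 MiB scannable, 1 MiB scanned, GOGC=100.
	fmt.Printf("%.3f scan bytes per allocated byte\n",
		assistRatio(6<<20, 5<<20, 1<<20, 8<<20, 100))
}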
@@ -811,203 +285,6 @@ func pollFractionalWorkerExit() bool {
return float64(selfTime)/float64(delta) > 1.2*gcController.fractionalUtilizationGoal
}
-// gcSetTriggerRatio sets the trigger ratio and updates everything
-// derived from it: the absolute trigger, the heap goal, mark pacing,
-// and sweep pacing.
-//
-// This can be called any time. If GC is the in the middle of a
-// concurrent phase, it will adjust the pacing of that phase.
-//
-// This depends on gcpercent, memstats.heap_marked, and
-// memstats.heap_live. These must be up to date.
-//
-// mheap_.lock must be held or the world must be stopped.
-func gcSetTriggerRatio(triggerRatio float64) {
- assertWorldStoppedOrLockHeld(&mheap_.lock)
-
- // Compute the next GC goal, which is when the allocated heap
- // has grown by GOGC/100 over the heap marked by the last
- // cycle.
- goal := ^uint64(0)
- if gcpercent >= 0 {
- goal = memstats.heap_marked + memstats.heap_marked*uint64(gcpercent)/100
- }
-
- // Set the trigger ratio, capped to reasonable bounds.
- if gcpercent >= 0 {
- scalingFactor := float64(gcpercent) / 100
- // Ensure there's always a little margin so that the
- // mutator assist ratio isn't infinity.
- maxTriggerRatio := 0.95 * scalingFactor
- if triggerRatio > maxTriggerRatio {
- triggerRatio = maxTriggerRatio
- }
-
- // If we let triggerRatio go too low, then if the application
- // is allocating very rapidly we might end up in a situation
- // where we're allocating black during a nearly always-on GC.
- // The result of this is a growing heap and ultimately an
- // increase in RSS. By capping us at a point >0, we're essentially
- // saying that we're OK using more CPU during the GC to prevent
- // this growth in RSS.
- //
- // The current constant was chosen empirically: given a sufficiently
- // fast/scalable allocator with 48 Ps that could drive the trigger ratio
- // to <0.05, this constant causes applications to retain the same peak
- // RSS compared to not having this allocator.
- minTriggerRatio := 0.6 * scalingFactor
- if triggerRatio < minTriggerRatio {
- triggerRatio = minTriggerRatio
- }
- } else if triggerRatio < 0 {
- // gcpercent < 0, so just make sure we're not getting a negative
- // triggerRatio. This case isn't expected to happen in practice,
- // and doesn't really matter because if gcpercent < 0 then we won't
- // ever consume triggerRatio further on in this function, but let's
- // just be defensive here; the triggerRatio being negative is almost
- // certainly undesirable.
- triggerRatio = 0
- }
- memstats.triggerRatio = triggerRatio
-
- // Compute the absolute GC trigger from the trigger ratio.
- //
- // We trigger the next GC cycle when the allocated heap has
- // grown by the trigger ratio over the marked heap size.
- trigger := ^uint64(0)
- if gcpercent >= 0 {
- trigger = uint64(float64(memstats.heap_marked) * (1 + triggerRatio))
- // Don't trigger below the minimum heap size.
- minTrigger := heapminimum
- if !isSweepDone() {
- // Concurrent sweep happens in the heap growth
- // from heap_live to gc_trigger, so ensure
- // that concurrent sweep has some heap growth
- // in which to perform sweeping before we
- // start the next GC cycle.
- sweepMin := atomic.Load64(&memstats.heap_live) + sweepMinHeapDistance
- if sweepMin > minTrigger {
- minTrigger = sweepMin
- }
- }
- if trigger < minTrigger {
- trigger = minTrigger
- }
- if int64(trigger) < 0 {
- print("runtime: next_gc=", memstats.next_gc, " heap_marked=", memstats.heap_marked, " heap_live=", memstats.heap_live, " initialHeapLive=", work.initialHeapLive, "triggerRatio=", triggerRatio, " minTrigger=", minTrigger, "\n")
- throw("gc_trigger underflow")
- }
- if trigger > goal {
- // The trigger ratio is always less than GOGC/100, but
- // other bounds on the trigger may have raised it.
- // Push up the goal, too.
- goal = trigger
- }
- }
-
- // Commit to the trigger and goal.
- memstats.gc_trigger = trigger
- atomic.Store64(&memstats.next_gc, goal)
- if trace.enabled {
- traceNextGC()
- }
-
- // Update mark pacing.
- if gcphase != _GCoff {
- gcController.revise()
- }
-
- // Update sweep pacing.
- if isSweepDone() {
- mheap_.sweepPagesPerByte = 0
- } else {
- // Concurrent sweep needs to sweep all of the in-use
- // pages by the time the allocated heap reaches the GC
- // trigger. Compute the ratio of in-use pages to sweep
- // per byte allocated, accounting for the fact that
- // some might already be swept.
- heapLiveBasis := atomic.Load64(&memstats.heap_live)
- heapDistance := int64(trigger) - int64(heapLiveBasis)
- // Add a little margin so rounding errors and
- // concurrent sweep are less likely to leave pages
- // unswept when GC starts.
- heapDistance -= 1024 * 1024
- if heapDistance < _PageSize {
- // Avoid setting the sweep ratio extremely high
- heapDistance = _PageSize
- }
- pagesSwept := atomic.Load64(&mheap_.pagesSwept)
- pagesInUse := atomic.Load64(&mheap_.pagesInUse)
- sweepDistancePages := int64(pagesInUse) - int64(pagesSwept)
- if sweepDistancePages <= 0 {
- mheap_.sweepPagesPerByte = 0
- } else {
- mheap_.sweepPagesPerByte = float64(sweepDistancePages) / float64(heapDistance)
- mheap_.sweepHeapLiveBasis = heapLiveBasis
- // Write pagesSweptBasis last, since this
- // signals concurrent sweeps to recompute
- // their debt.
- atomic.Store64(&mheap_.pagesSweptBasis, pagesSwept)
- }
- }
-
- gcPaceScavenger()
-}
-
-// gcEffectiveGrowthRatio returns the current effective heap growth
-// ratio (GOGC/100) based on heap_marked from the previous GC and
-// next_gc for the current GC.
-//
-// This may differ from gcpercent/100 because of various upper and
-// lower bounds on gcpercent. For example, if the heap is smaller than
-// heapminimum, this can be higher than gcpercent/100.
-//
-// mheap_.lock must be held or the world must be stopped.
-func gcEffectiveGrowthRatio() float64 {
- assertWorldStoppedOrLockHeld(&mheap_.lock)
-
- egogc := float64(atomic.Load64(&memstats.next_gc)-memstats.heap_marked) / float64(memstats.heap_marked)
- if egogc < 0 {
- // Shouldn't happen, but just in case.
- egogc = 0
- }
- return egogc
-}
-
-// gcGoalUtilization is the goal CPU utilization for
-// marking as a fraction of GOMAXPROCS.
-const gcGoalUtilization = 0.30
-
-// gcBackgroundUtilization is the fixed CPU utilization for background
-// marking. It must be <= gcGoalUtilization. The difference between
-// gcGoalUtilization and gcBackgroundUtilization will be made up by
-// mark assists. The scheduler will aim to use within 50% of this
-// goal.
-//
-// Setting this to < gcGoalUtilization avoids saturating the trigger
-// feedback controller when there are no assists, which allows it to
-// better control CPU and heap growth. However, the larger the gap,
-// the more mutator assists are expected to happen, which impact
-// mutator latency.
-const gcBackgroundUtilization = 0.25
-
-// gcCreditSlack is the amount of scan work credit that can
-// accumulate locally before updating gcController.scanWork and,
-// optionally, gcController.bgScanCredit. Lower values give a more
-// accurate assist ratio and make it more likely that assists will
-// successfully steal background credit. Higher values reduce memory
-// contention.
-const gcCreditSlack = 2000
-
-// gcAssistTimeSlack is the nanoseconds of mutator assist time that
-// can accumulate on a P before updating gcController.assistTime.
-const gcAssistTimeSlack = 5000
-
-// gcOverAssistWork determines how many extra units of scan work a GC
-// assist does when an assist happens. This amortizes the cost of an
-// assist by pre-paying for this many bytes of future allocations.
-const gcOverAssistWork = 64 << 10
-
var work struct {
full lfstack // lock-free list of full blocks workbuf
empty lfstack // lock-free list of empty blocks workbuf
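Similarly, the sweep pacing that gcSetTriggerRatio used to set amounts to spreading the remaining in-use pages over the heap growth left before the next trigger. A condensed sketch; sweepPagesPerByte is an invented name, and the runtime additionally resets pagesSweptBasis so concurrent sweepers recompute their debt:

package main

import "fmt"

// sweepPagesPerByte spreads the unswept in-use pages over the allocation
// headroom before the GC trigger, minus a 1 MiB safety margin.
func sweepPagesPerByte(trigger, heapLive, pagesInUse, pagesSwept uint64) float64 {
	const pageSize = 8192
	heapDistance := int64(trigger) - int64(heapLive) - 1<<20
	if heapDistance < pageSize {
		heapDistance = pageSize // avoid an extreme sweep ratio
	}
	sweepDistancePages := int64(pagesInUse) - int64(pagesSwept)
	if sweepDistancePages <= 0 {
		return 0 // nothing left to sweep before the trigger
	}
	return float64(sweepDistancePages) / float64(heapDistance)
}

func main() {
	// 7 MiB trigger, 5 MiB live heap, 1024 in-use pages of which 200 are swept.
	fmt.Printf("%.6f pages to sweep per byte allocated\n",
		sweepPagesPerByte(7<<20, 5<<20, 1024, 200))
}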
@@ -1050,9 +327,11 @@ var work struct {
nwait uint32
// Number of roots of various root types. Set by gcMarkRootPrepare.
- nFlushCacheRoots int
nDataRoots, nSpanRoots, nStackRoots int
+ // Base indexes of each root type. Set by gcMarkRootPrepare.
+ baseData, baseSpans, baseStacks, baseEnd uint32
+
// Each type of GC state transition is protected by a lock.
// Since multiple threads can simultaneously detect the state
// transition condition, any thread that detects a transition
@@ -1086,7 +365,7 @@ var work struct {
// program started if debug.gctrace > 0.
totaltime int64
- // initialHeapLive is the value of memstats.heap_live at the
+ // initialHeapLive is the value of gcController.heapLive at the
// beginning of this GC cycle.
initialHeapLive uint64
@@ -1183,7 +462,7 @@ func GC() {
// First, wait for sweeping to finish. (We know there are no
// more spans on the sweep queue, but we may be concurrently
// sweeping spans, so we have to wait.)
- for atomic.Load(&work.cycles) == n+1 && atomic.Load(&mheap_.sweepers) != 0 {
+ for atomic.Load(&work.cycles) == n+1 && !isSweepDone() {
Gosched()
}
@@ -1267,13 +546,13 @@ func (t gcTrigger) test() bool {
}
switch t.kind {
case gcTriggerHeap:
- // Non-atomic access to heap_live for performance. If
+ // Non-atomic access to gcController.heapLive for performance. If
// we are going to trigger on this, this thread just
- // atomically wrote heap_live anyway and we'll see our
+ // atomically wrote gcController.heapLive anyway and we'll see our
// own write.
- return memstats.heap_live >= memstats.gc_trigger
+ return gcController.heapLive >= gcController.trigger
case gcTriggerTime:
- if gcpercent < 0 {
+ if gcController.gcPercent < 0 {
return false
}
lastgc := int64(atomic.Load64(&memstats.last_gc_nanotime))
@@ -1367,7 +646,7 @@ func gcStart(trigger gcTrigger) {
// so it can't be more than ncpu, even if GOMAXPROCS is.
work.stwprocs = ncpu
}
- work.heap0 = atomic.Load64(&memstats.heap_live)
+ work.heap0 = atomic.Load64(&gcController.heapLive)
work.pauseNS = 0
work.mode = mode
@@ -1390,7 +669,7 @@ func gcStart(trigger gcTrigger) {
work.cycles++
gcController.startCycle()
- work.heapGoal = memstats.next_gc
+ work.heapGoal = gcController.heapGoal
// In STW mode, disable scheduling of user Gs. This may also
// disable scheduling of this goroutine, so it may block as
@@ -1619,7 +898,7 @@ top:
// endCycle depends on all gcWork cache stats being flushed.
// The termination algorithm above ensured that up to
// allocations since the ragged barrier.
- nextTriggerRatio := gcController.endCycle()
+ nextTriggerRatio := gcController.endCycle(work.userForced)
// Perform mark termination. This will restart the world.
gcMarkTermination(nextTriggerRatio)
@@ -1631,7 +910,7 @@ func gcMarkTermination(nextTriggerRatio float64) {
// Start marktermination (write barrier remains enabled for now).
setGCPhase(_GCmarktermination)
- work.heap1 = memstats.heap_live
+ work.heap1 = gcController.heapLive
startTime := nanotime()
mp := acquirem()
@@ -1693,12 +972,12 @@ func gcMarkTermination(nextTriggerRatio float64) {
throw("gc done but gcphase != _GCoff")
}
- // Record next_gc and heap_inuse for scavenger.
- memstats.last_next_gc = memstats.next_gc
+ // Record heapGoal and heap_inuse for scavenger.
+ gcController.lastHeapGoal = gcController.heapGoal
memstats.last_heap_inuse = memstats.heap_inuse
// Update GC trigger and pacing for the next cycle.
- gcSetTriggerRatio(nextTriggerRatio)
+ gcController.commit(nextTriggerRatio)
// Update timing memstats
now := nanotime()
@@ -1745,6 +1024,13 @@ func gcMarkTermination(nextTriggerRatio float64) {
// so events don't leak into the wrong cycle.
mProf_NextCycle()
+ // There may be stale spans in mcaches that need to be swept.
+ // Those aren't tracked in any sweep lists, so we need to
+ // count them against sweep completion until we ensure all
+ // those spans have been forced out.
+ sl := newSweepLocker()
+ sl.blockCompletion()
+
systemstack(func() { startTheWorldWithSema(true) })
// Flush the heap profile so we can start a new cycle next GC.
@@ -1765,6 +1051,9 @@ func gcMarkTermination(nextTriggerRatio float64) {
_p_.mcache.prepareForSweep()
})
})
+ // Now that we've swept stale spans in mcaches, they don't
+ // count against unswept spans.
+ sl.dispose()
// Print gctrace before dropping worldsema. As soon as we drop
// worldsema another cycle could start and smash the stats
@@ -1986,15 +1275,11 @@ func gcBgMarkWorker() {
// everything out of the run
// queue so it can run
// somewhere else.
- lock(&sched.lock)
- for {
- gp, _ := runqget(pp)
- if gp == nil {
- break
- }
- globrunqput(gp)
+ if drainQ, n := runqdrain(pp); n > 0 {
+ lock(&sched.lock)
+ globrunqputbatch(&drainQ, int32(n))
+ unlock(&sched.lock)
}
- unlock(&sched.lock)
}
// Go back to draining, this time
// without preemption.
@@ -2068,7 +1353,7 @@ func gcMarkWorkAvailable(p *p) bool {
// gcMark runs the mark (or, for concurrent GC, mark termination)
// All gcWork caches must be empty.
// STW is in effect at this point.
-func gcMark(start_time int64) {
+func gcMark(startTime int64) {
if debug.allocfreetrace > 0 {
tracegc()
}
@@ -2076,7 +1361,7 @@ func gcMark(start_time int64) {
if gcphase != _GCmarktermination {
throw("in gcMark expecting to see gcphase as _GCmarktermination")
}
- work.tstart = start_time
+ work.tstart = startTime
// Check that there's no marking work remaining.
if work.full != 0 || work.markrootNext < work.markrootJobs {
@@ -2138,25 +1423,25 @@ func gcMark(start_time int64) {
}
// Update the marked heap stat.
- memstats.heap_marked = work.bytesMarked
+ gcController.heapMarked = work.bytesMarked
// Flush scanAlloc from each mcache since we're about to modify
- // heap_scan directly. If we were to flush this later, then scanAlloc
+ // heapScan directly. If we were to flush this later, then scanAlloc
// might have incorrect information.
for _, p := range allp {
c := p.mcache
if c == nil {
continue
}
- memstats.heap_scan += uint64(c.scanAlloc)
+ gcController.heapScan += uint64(c.scanAlloc)
c.scanAlloc = 0
}
// Update other GC heap size stats. This must happen after
// cachestats (which flushes local statistics to these) and
- // flushallmcaches (which modifies heap_live).
- memstats.heap_live = work.bytesMarked
- memstats.heap_scan = uint64(gcController.scanWork)
+ // flushallmcaches (which modifies gcController.heapLive).
+ gcController.heapLive = work.bytesMarked
+ gcController.heapScan = uint64(gcController.scanWork)
if trace.enabled {
traceHeapAlloc()
@@ -2178,7 +1463,7 @@ func gcSweep(mode gcMode) {
lock(&mheap_.lock)
mheap_.sweepgen += 2
- mheap_.sweepdone = 0
+ mheap_.sweepDrained = 0
mheap_.pagesSwept = 0
mheap_.sweepArenas = mheap_.allArenas
mheap_.reclaimIndex = 0
@@ -2229,14 +1514,12 @@ func gcSweep(mode gcMode) {
//
//go:systemstack
func gcResetMarkState() {
- // This may be called during a concurrent phase, so make sure
+ // This may be called during a concurrent phase, so lock to make sure
// allgs doesn't change.
- lock(&allglock)
- for _, gp := range allgs {
+ forEachG(func(gp *g) {
gp.gcscandone = false // set to true in gcphasework
gp.gcAssistBytes = 0
- }
- unlock(&allglock)
+ })
// Clear page marks. This is just 1MB per 64GB of heap, so the
// time here is pretty trivial.
@@ -2251,7 +1534,7 @@ func gcResetMarkState() {
}
work.bytesMarked = 0
- work.initialHeapLive = atomic.Load64(&memstats.heap_live)
+ work.initialHeapLive = atomic.Load64(&gcController.heapLive)
}
// Hooks for other packages
@@ -2334,3 +1617,95 @@ func fmtNSAsMS(buf []byte, ns uint64) []byte {
}
return itoaDiv(buf, x, dec)
}
+
+// Helpers for testing GC.
+
+// gcTestIsReachable performs a GC and returns a bit set where bit i
+// is set if ptrs[i] is reachable.
+func gcTestIsReachable(ptrs ...unsafe.Pointer) (mask uint64) {
+ // This takes the pointers as unsafe.Pointers in order to keep
+ // them live long enough for us to attach specials. After
+ // that, we drop our references to them.
+
+ if len(ptrs) > 64 {
+ panic("too many pointers for uint64 mask")
+ }
+
+ // Block GC while we attach specials and drop our references
+ // to ptrs. Otherwise, if a GC is in progress, it could mark
+ // them reachable via this function before we have a chance to
+ // drop them.
+ semacquire(&gcsema)
+
+ // Create reachability specials for ptrs.
+ specials := make([]*specialReachable, len(ptrs))
+ for i, p := range ptrs {
+ lock(&mheap_.speciallock)
+ s := (*specialReachable)(mheap_.specialReachableAlloc.alloc())
+ unlock(&mheap_.speciallock)
+ s.special.kind = _KindSpecialReachable
+ if !addspecial(p, &s.special) {
+ throw("already have a reachable special (duplicate pointer?)")
+ }
+ specials[i] = s
+ // Make sure we don't retain ptrs.
+ ptrs[i] = nil
+ }
+
+ semrelease(&gcsema)
+
+ // Force a full GC and sweep.
+ GC()
+
+ // Process specials.
+ for i, s := range specials {
+ if !s.done {
+ printlock()
+ println("runtime: object", i, "was not swept")
+ throw("IsReachable failed")
+ }
+ if s.reachable {
+ mask |= 1 << i
+ }
+ lock(&mheap_.speciallock)
+ mheap_.specialReachableAlloc.free(unsafe.Pointer(s))
+ unlock(&mheap_.speciallock)
+ }
+
+ return mask
+}
+
+// onCurrentStack reports whether the argument is on the current stack.
+// It is implemented in C.
+func onCurrentStack(uintptr) bool
+
+// getBSS returns the start of the BSS section.
+// It is implemented in C.
+func getBSS() uintptr
+
+// gcTestPointerClass returns the category of what p points to, one of:
+// "heap", "stack", "data", "bss", "other". This is useful for checking
+// that a test is doing what it's intended to do.
+//
+// This is nosplit simply to avoid extra pointer shuffling that may
+// complicate a test.
+//
+//go:nosplit
+func gcTestPointerClass(p unsafe.Pointer) string {
+ p2 := uintptr(noescape(p))
+ if onCurrentStack(p2) {
+ return "stack"
+ }
+ if base, _, _ := findObject(p2, 0, 0, false); base != 0 {
+ return "heap"
+ }
+ bss := getBSS()
+ if p2 >= getText() && p2 < bss {
+ return "data"
+ }
+ if p2 >= bss && p2 < getEnd() {
+ return "bss"
+ }
+ KeepAlive(p)
+ return "other"
+}
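
The reachability helper above encodes its answer as a bit mask: bit i of the result is set when ptrs[i] turned out to be reachable after the forced GC. A standalone sketch of just that mask encoding follows; the isReachable stand-in is invented for illustration and does not touch the runtime's special records or GC machinery.

package main

import "fmt"

// isReachable is a toy stand-in for gcTestIsReachable: bit i of the
// result is set when the i-th argument is considered "reachable"
// (here, simply non-nil). Only the mask encoding is illustrated.
func isReachable(ptrs ...*int) (mask uint64) {
	for i, p := range ptrs {
		if p != nil {
			mask |= 1 << i
		}
	}
	return mask
}

func main() {
	a := new(int)
	fmt.Printf("%02b\n", isReachable(a, nil)) // prints 01: bit 0 set, bit 1 clear
}
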
diff --git a/libgo/go/runtime/mgcmark.go b/libgo/go/runtime/mgcmark.go
index e5581255bb9..f6e1a14b5e9 100644
--- a/libgo/go/runtime/mgcmark.go
+++ b/libgo/go/runtime/mgcmark.go
@@ -56,8 +56,6 @@ const (
func gcMarkRootPrepare() {
assertWorldStopped()
- work.nFlushCacheRoots = 0
-
work.nDataRoots = 0
// Only scan globals once per cycle; preferably concurrently.
@@ -91,7 +89,13 @@ func gcMarkRootPrepare() {
work.nStackRoots = int(atomic.Loaduintptr(&allglen))
work.markrootNext = 0
- work.markrootJobs = uint32(fixedRootCount + work.nFlushCacheRoots + work.nDataRoots + work.nSpanRoots + work.nStackRoots)
+ work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nSpanRoots + work.nStackRoots)
+
+ // Calculate base indexes of each root type
+ work.baseData = uint32(fixedRootCount)
+ work.baseSpans = work.baseData + uint32(work.nDataRoots)
+ work.baseStacks = work.baseSpans + uint32(work.nSpanRoots)
+ work.baseEnd = work.baseStacks + uint32(work.nStackRoots)
}
// gcMarkRootCheck checks that all roots have been scanned. It is
@@ -102,23 +106,26 @@ func gcMarkRootCheck() {
throw("left over markroot jobs")
}
- lock(&allglock)
// Check that stacks have been scanned.
- var gp *g
- for i := 0; i < work.nStackRoots; i++ {
- gp = allgs[i]
+ //
+ // We only check the first nStackRoots Gs that we should have scanned.
+ // Since we don't care about newer Gs (see comment in
+ // gcMarkRootPrepare), no locking is required.
+ i := 0
+ forEachGRace(func(gp *g) {
+ if i >= work.nStackRoots {
+ return
+ }
+
if !gp.gcscandone {
- goto fail
+ println("gp", gp, "goid", gp.goid,
+ "status", readgstatus(gp),
+ "gcscandone", gp.gcscandone)
+ throw("scan missed a g")
}
- }
- unlock(&allglock)
- return
-
-fail:
- println("gp", gp, "goid", gp.goid,
- "status", readgstatus(gp),
- "gcscandone", gp.gcscandone)
- throw("scan missed a g")
+
+ i++
+ })
}
// ptrmask for an allocation containing a single pointer.
@@ -132,22 +139,11 @@ var oneptrmask = [...]uint8{1}
//
//go:nowritebarrier
func markroot(gcw *gcWork, i uint32) {
- // TODO(austin): This is a bit ridiculous. Compute and store
- // the bases in gcMarkRootPrepare instead of the counts.
- baseFlushCache := uint32(fixedRootCount)
- baseData := baseFlushCache + uint32(work.nFlushCacheRoots)
- baseSpans := baseData + uint32(work.nDataRoots)
- baseStacks := baseSpans + uint32(work.nSpanRoots)
- end := baseStacks + uint32(work.nStackRoots)
-
// Note: if you add a case here, please also update heapdump.go:dumproots.
switch {
- case baseFlushCache <= i && i < baseData:
- flushmcache(int(i - baseFlushCache))
-
- case baseData <= i && i < baseSpans:
+ case work.baseData <= i && i < work.baseSpans:
roots := gcRoots
- c := baseData
+ c := work.baseData
for roots != nil {
if i == c {
markrootBlock(roots, gcw)
@@ -166,15 +162,18 @@ func markroot(gcw *gcWork, i uint32) {
case i == fixedRootFreeGStacks:
// FIXME: We don't do this for gccgo.
- case baseSpans <= i && i < baseStacks:
+ case work.baseSpans <= i && i < work.baseStacks:
// mark mspan.specials
- markrootSpans(gcw, int(i-baseSpans))
+ markrootSpans(gcw, int(i-work.baseSpans))
default:
// the rest is scanning goroutine stacks
var gp *g
- if baseStacks <= i && i < end {
- gp = allgs[i-baseStacks]
+ if work.baseStacks <= i && i < work.baseEnd {
+ // N.B. Atomic read of allglen in gcMarkRootPrepare
+ // acts as a barrier to ensure that allgs must be large
+ // enough to contain all relevant Gs.
+ gp = allgs[i-work.baseStacks]
} else {
throw("markroot: bad index")
}
@@ -1051,12 +1050,7 @@ func scanobject(b uintptr, gcw *gcWork) {
}
var i uintptr
- for i = 0; i < n; i += sys.PtrSize {
- // Find bits for this word.
- if i != 0 {
- // Avoid needless hbits.next() on last iteration.
- hbits = hbits.next()
- }
+ for i = 0; i < n; i, hbits = i+sys.PtrSize, hbits.next() {
// Load bits once. See CL 22712 and issue 16973 for discussion.
bits := hbits.bits()
if bits&bitScan == 0 {
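
gcMarkRootPrepare now records the base index of each root class once, so markroot can classify a flat job index with plain comparisons instead of recomputing the bases on every call. A self-contained sketch of that partitioning is below; the counts, struct, and function names are invented for illustration.

package main

import "fmt"

// bases mirrors work.baseData/baseSpans/baseStacks/baseEnd: each root
// class owns a contiguous range of job indexes.
type bases struct {
	data, spans, stacks, end uint32
}

func prepare(fixedRoots, nData, nSpans, nStacks int) bases {
	var b bases
	b.data = uint32(fixedRoots)
	b.spans = b.data + uint32(nData)
	b.stacks = b.spans + uint32(nSpans)
	b.end = b.stacks + uint32(nStacks)
	return b
}

// classify plays the role of markroot's switch on the job index.
func (b bases) classify(i uint32) string {
	switch {
	case i < b.data:
		return "fixed root"
	case i < b.spans:
		return "data/BSS block"
	case i < b.stacks:
		return "span specials"
	case i < b.end:
		return "goroutine stack"
	}
	return "bad index"
}

func main() {
	b := prepare(2, 3, 1, 4) // 2 fixed roots, 3 data blocks, 1 span shard, 4 Gs
	for i := uint32(0); i < b.end; i++ {
		fmt.Println(i, "->", b.classify(i))
	}
}
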
diff --git a/libgo/go/runtime/mgcpacer.go b/libgo/go/runtime/mgcpacer.go
new file mode 100644
index 00000000000..8a669204dd0
--- /dev/null
+++ b/libgo/go/runtime/mgcpacer.go
@@ -0,0 +1,848 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+ "internal/cpu"
+ "runtime/internal/atomic"
+ "unsafe"
+)
+
+const (
+ // gcGoalUtilization is the goal CPU utilization for
+ // marking as a fraction of GOMAXPROCS.
+ gcGoalUtilization = 0.30
+
+ // gcBackgroundUtilization is the fixed CPU utilization for background
+ // marking. It must be <= gcGoalUtilization. The difference between
+ // gcGoalUtilization and gcBackgroundUtilization will be made up by
+ // mark assists. The scheduler will aim to use within 50% of this
+ // goal.
+ //
+ // Setting this to < gcGoalUtilization avoids saturating the trigger
+ // feedback controller when there are no assists, which allows it to
+ // better control CPU and heap growth. However, the larger the gap,
+ // the more mutator assists are expected to happen, which impact
+ // mutator latency.
+ gcBackgroundUtilization = 0.25
+
+ // gcCreditSlack is the amount of scan work credit that can
+ // accumulate locally before updating gcController.scanWork and,
+ // optionally, gcController.bgScanCredit. Lower values give a more
+ // accurate assist ratio and make it more likely that assists will
+ // successfully steal background credit. Higher values reduce memory
+ // contention.
+ gcCreditSlack = 2000
+
+ // gcAssistTimeSlack is the nanoseconds of mutator assist time that
+ // can accumulate on a P before updating gcController.assistTime.
+ gcAssistTimeSlack = 5000
+
+ // gcOverAssistWork determines how many extra units of scan work a GC
+ // assist does when an assist happens. This amortizes the cost of an
+ // assist by pre-paying for this many bytes of future allocations.
+ gcOverAssistWork = 64 << 10
+
+ // defaultHeapMinimum is the value of heapMinimum for GOGC==100.
+ defaultHeapMinimum = 4 << 20
+)
+
+func init() {
+ if offset := unsafe.Offsetof(gcController.heapLive); offset%8 != 0 {
+ println(offset)
+ throw("gcController.heapLive not aligned to 8 bytes")
+ }
+}
+
+// gcController implements the GC pacing controller that determines
+// when to trigger concurrent garbage collection and how much marking
+// work to do in mutator assists and background marking.
+//
+// It uses a feedback control algorithm to adjust the gcController.trigger
+// trigger based on the heap growth and GC CPU utilization each cycle.
+// This algorithm optimizes for heap growth to match GOGC and for CPU
+// utilization between assist and background marking to be 25% of
+// GOMAXPROCS. The high-level design of this algorithm is documented
+// at https://golang.org/s/go15gcpacing.
+//
+// All fields of gcController are used only during a single mark
+// cycle.
+var gcController gcControllerState
+
+type gcControllerState struct {
+ // Initialized from $GOGC. GOGC=off means no GC.
+ gcPercent int32
+
+ _ uint32 // padding so following 64-bit values are 8-byte aligned
+
+ // heapMinimum is the minimum heap size at which to trigger GC.
+ // For small heaps, this overrides the usual GOGC*live set rule.
+ //
+ // When there is a very small live set but a lot of allocation, simply
+ // collecting when the heap reaches GOGC*live results in many GC
+ // cycles and high total per-GC overhead. This minimum amortizes this
+ // per-GC overhead while keeping the heap reasonably small.
+ //
+ // During initialization this is set to 4MB*GOGC/100. In the case of
+ // GOGC==0, this will set heapMinimum to 0, resulting in constant
+ // collection even when the heap size is small, which is useful for
+ // debugging.
+ heapMinimum uint64
+
+ // triggerRatio is the heap growth ratio that triggers marking.
+ //
+ // E.g., if this is 0.6, then GC should start when the live
+ // heap has reached 1.6 times the heap size marked by the
+ // previous cycle. This should be ≤ GOGC/100 so the trigger
+ // heap size is less than the goal heap size. This is set
+ // during mark termination for the next cycle's trigger.
+ //
+ // Protected by mheap_.lock or a STW.
+ triggerRatio float64
+
+ // trigger is the heap size that triggers marking.
+ //
+ // When heapLive ≥ trigger, the mark phase will start.
+ // This is also the heap size by which proportional sweeping
+ // must be complete.
+ //
+ // This is computed from triggerRatio during mark termination
+ // for the next cycle's trigger.
+ //
+ // Protected by mheap_.lock or a STW.
+ trigger uint64
+
+ // heapGoal is the goal heapLive for when next GC ends.
+ // Set to ^uint64(0) if disabled.
+ //
+ // Read and written atomically, unless the world is stopped.
+ heapGoal uint64
+
+ // lastHeapGoal is the value of heapGoal for the previous GC.
+ // Note that this is distinct from the last value heapGoal had,
+ // because it could change if e.g. gcPercent changes.
+ //
+ // Read and written with the world stopped or with mheap_.lock held.
+ lastHeapGoal uint64
+
+ // heapLive is the number of bytes considered live by the GC.
+ // That is: retained by the most recent GC plus allocated
+ // since then. heapLive ≤ memstats.heapAlloc, since heapAlloc includes
+ // unmarked objects that have not yet been swept (and hence goes up as we
+ // allocate and down as we sweep) while heapLive excludes these
+ // objects (and hence only goes up between GCs).
+ //
+ // This is updated atomically without locking. To reduce
+ // contention, this is updated only when obtaining a span from
+ // an mcentral and at this point it counts all of the
+ // unallocated slots in that span (which will be allocated
+ // before that mcache obtains another span from that
+ // mcentral). Hence, it slightly overestimates the "true" live
+ // heap size. It's better to overestimate than to
+ // underestimate because 1) this triggers the GC earlier than
+ // necessary rather than potentially too late and 2) this
+ // leads to a conservative GC rate rather than a GC rate that
+ // is potentially too low.
+ //
+ // Reads should likewise be atomic (or during STW).
+ //
+ // Whenever this is updated, call traceHeapAlloc() and
+ // this gcControllerState's revise() method.
+ heapLive uint64
+
+ // heapScan is the number of bytes of "scannable" heap. This
+ // is the live heap (as counted by heapLive), but omitting
+ // no-scan objects and no-scan tails of objects.
+ //
+ // Whenever this is updated, call this gcControllerState's
+ // revise() method.
+ //
+ // Read and written atomically or with the world stopped.
+ heapScan uint64
+
+ // heapMarked is the number of bytes marked by the previous
+ // GC. After mark termination, heapLive == heapMarked, but
+ // unlike heapLive, heapMarked does not change until the
+ // next mark termination.
+ heapMarked uint64
+
+ // scanWork is the total scan work performed this cycle. This
+ // is updated atomically during the cycle. Updates occur in
+ // bounded batches, since it is both written and read
+ // throughout the cycle. At the end of the cycle, this is how
+ // much of the retained heap is scannable.
+ //
+ // Currently this is the bytes of heap scanned. For most uses,
+ // this is an opaque unit of work, but for estimation the
+ // definition is important.
+ scanWork int64
+
+ // bgScanCredit is the scan work credit accumulated by the
+ // concurrent background scan. This credit is accumulated by
+ // the background scan and stolen by mutator assists. This is
+ // updated atomically. Updates occur in bounded batches, since
+ // it is both written and read throughout the cycle.
+ bgScanCredit int64
+
+ // assistTime is the nanoseconds spent in mutator assists
+ // during this cycle. This is updated atomically. Updates
+ // occur in bounded batches, since it is both written and read
+ // throughout the cycle.
+ assistTime int64
+
+ // dedicatedMarkTime is the nanoseconds spent in dedicated
+ // mark workers during this cycle. This is updated atomically
+ // at the end of the concurrent mark phase.
+ dedicatedMarkTime int64
+
+ // fractionalMarkTime is the nanoseconds spent in the
+ // fractional mark worker during this cycle. This is updated
+ // atomically throughout the cycle and will be up-to-date if
+ // the fractional mark worker is not currently running.
+ fractionalMarkTime int64
+
+ // idleMarkTime is the nanoseconds spent in idle marking
+ // during this cycle. This is updated atomically throughout
+ // the cycle.
+ idleMarkTime int64
+
+ // markStartTime is the absolute start time in nanoseconds
+ // that assists and background mark workers started.
+ markStartTime int64
+
+ // dedicatedMarkWorkersNeeded is the number of dedicated mark
+ // workers that need to be started. This is computed at the
+ // beginning of each cycle and decremented atomically as
+ // dedicated mark workers get started.
+ dedicatedMarkWorkersNeeded int64
+
+ // assistWorkPerByte is the ratio of scan work to allocated
+ // bytes that should be performed by mutator assists. This is
+ // computed at the beginning of each cycle and updated every
+ // time heapScan is updated.
+ //
+ // Stored as a uint64, but it's actually a float64. Use
+ // float64frombits to get the value.
+ //
+ // Read and written atomically.
+ assistWorkPerByte uint64
+
+ // assistBytesPerWork is 1/assistWorkPerByte.
+ //
+ // Stored as a uint64, but it's actually a float64. Use
+ // float64frombits to get the value.
+ //
+ // Read and written atomically.
+ //
+ // Note that because this is read and written independently
+ // from assistWorkPerByte users may notice a skew between
+ // the two values, and such a state should be safe.
+ assistBytesPerWork uint64
+
+ // fractionalUtilizationGoal is the fraction of wall clock
+ // time that should be spent in the fractional mark worker on
+ // each P that isn't running a dedicated worker.
+ //
+ // For example, if the utilization goal is 25% and there are
+ // no dedicated workers, this will be 0.25. If the goal is
+ // 25%, there is one dedicated worker, and GOMAXPROCS is 5,
+ // this will be 0.05 to make up the missing 5%.
+ //
+ // If this is zero, no fractional workers are needed.
+ fractionalUtilizationGoal float64
+
+ _ cpu.CacheLinePad
+}
+
+func (c *gcControllerState) init(gcPercent int32) {
+ c.heapMinimum = defaultHeapMinimum
+
+ // Set a reasonable initial GC trigger.
+ c.triggerRatio = 7 / 8.0
+
+ // Fake a heapMarked value so it looks like a trigger at
+ // heapMinimum is the appropriate growth from heapMarked.
+ // This will go into computing the initial GC goal.
+ c.heapMarked = uint64(float64(c.heapMinimum) / (1 + c.triggerRatio))
+
+ // This will also compute and set the GC trigger and goal.
+ c.setGCPercent(gcPercent)
+}
+
+// startCycle resets the GC controller's state and computes estimates
+// for a new GC cycle. The caller must hold worldsema and the world
+// must be stopped.
+func (c *gcControllerState) startCycle() {
+ c.scanWork = 0
+ c.bgScanCredit = 0
+ c.assistTime = 0
+ c.dedicatedMarkTime = 0
+ c.fractionalMarkTime = 0
+ c.idleMarkTime = 0
+
+ // Ensure that the heap goal is at least a little larger than
+ // the current live heap size. This may not be the case if GC
+ // start is delayed or if the allocation that pushed gcController.heapLive
+ // over trigger is large or if the trigger is really close to
+ // GOGC. Assist is proportional to this distance, so enforce a
+ // minimum distance, even if it means going over the GOGC goal
+ // by a tiny bit.
+ if c.heapGoal < c.heapLive+1024*1024 {
+ c.heapGoal = c.heapLive + 1024*1024
+ }
+
+ // Compute the background mark utilization goal. In general,
+ // this may not come out exactly. We round the number of
+ // dedicated workers so that the utilization is closest to
+ // 25%. For small GOMAXPROCS, this would introduce too much
+ // error, so we add fractional workers in that case.
+ totalUtilizationGoal := float64(gomaxprocs) * gcBackgroundUtilization
+ c.dedicatedMarkWorkersNeeded = int64(totalUtilizationGoal + 0.5)
+ utilError := float64(c.dedicatedMarkWorkersNeeded)/totalUtilizationGoal - 1
+ const maxUtilError = 0.3
+ if utilError < -maxUtilError || utilError > maxUtilError {
+ // Rounding put us more than 30% off our goal. With
+ // gcBackgroundUtilization of 25%, this happens for
+ // GOMAXPROCS<=3 or GOMAXPROCS=6. Enable fractional
+ // workers to compensate.
+ if float64(c.dedicatedMarkWorkersNeeded) > totalUtilizationGoal {
+ // Too many dedicated workers.
+ c.dedicatedMarkWorkersNeeded--
+ }
+ c.fractionalUtilizationGoal = (totalUtilizationGoal - float64(c.dedicatedMarkWorkersNeeded)) / float64(gomaxprocs)
+ } else {
+ c.fractionalUtilizationGoal = 0
+ }
+
+ // In STW mode, we just want dedicated workers.
+ if debug.gcstoptheworld > 0 {
+ c.dedicatedMarkWorkersNeeded = int64(gomaxprocs)
+ c.fractionalUtilizationGoal = 0
+ }
+
+ // Clear per-P state
+ for _, p := range allp {
+ p.gcAssistTime = 0
+ p.gcFractionalMarkTime = 0
+ }
+
+ // Compute initial values for controls that are updated
+ // throughout the cycle.
+ c.revise()
+
+ if debug.gcpacertrace > 0 {
+ assistRatio := float64frombits(atomic.Load64(&c.assistWorkPerByte))
+ print("pacer: assist ratio=", assistRatio,
+ " (scan ", gcController.heapScan>>20, " MB in ",
+ work.initialHeapLive>>20, "->",
+ c.heapGoal>>20, " MB)",
+ " workers=", c.dedicatedMarkWorkersNeeded,
+ "+", c.fractionalUtilizationGoal, "\n")
+ }
+}
+
+// revise updates the assist ratio during the GC cycle to account for
+// improved estimates. This should be called whenever gcController.heapScan,
+// gcController.heapLive, or gcController.heapGoal is updated. It is safe to
+// call concurrently, but it may race with other calls to revise.
+//
+// The result of this race is that the two assist ratio values may not line
+// up or may be stale. In practice this is OK because the assist ratio
+// moves slowly throughout a GC cycle, and the assist ratio is a best-effort
+// heuristic anyway. Furthermore, no part of the heuristic depends on
+// the two assist ratio values being exact reciprocals of one another, since
+// the two values are used to convert values from different sources.
+//
+// The worst case result of this raciness is that we may miss a larger shift
+// in the ratio (say, if we decide to pace more aggressively against the
+// hard heap goal) but even this "hard goal" is best-effort (see #40460).
+// The dedicated GC should ensure we don't exceed the hard goal by too much
+// in the rare case we do exceed it.
+//
+// It should only be called when gcBlackenEnabled != 0 (because this
+// is when assists are enabled and the necessary statistics are
+// available).
+func (c *gcControllerState) revise() {
+ gcPercent := c.gcPercent
+ if gcPercent < 0 {
+ // If GC is disabled but we're running a forced GC,
+ // act like GOGC is huge for the below calculations.
+ gcPercent = 100000
+ }
+ live := atomic.Load64(&c.heapLive)
+ scan := atomic.Load64(&c.heapScan)
+ work := atomic.Loadint64(&c.scanWork)
+
+ // Assume we're under the soft goal. Pace GC to complete at
+ // heapGoal assuming the heap is in steady-state.
+ heapGoal := int64(atomic.Load64(&c.heapGoal))
+
+ // Compute the expected scan work remaining.
+ //
+ // This is estimated based on the expected
+ // steady-state scannable heap. For example, with
+ // GOGC=100, only half of the scannable heap is
+ // expected to be live, so that's what we target.
+ //
+ // (This is a float calculation to avoid overflowing on
+ // 100*heapScan.)
+ scanWorkExpected := int64(float64(scan) * 100 / float64(100+gcPercent))
+
+ if int64(live) > heapGoal || work > scanWorkExpected {
+ // We're past the soft goal, or we've already done more scan
+ // work than we expected. Pace GC so that in the worst case it
+ // will complete by the hard goal.
+ const maxOvershoot = 1.1
+ heapGoal = int64(float64(heapGoal) * maxOvershoot)
+
+ // Compute the upper bound on the scan work remaining.
+ scanWorkExpected = int64(scan)
+ }
+
+ // Compute the remaining scan work estimate.
+ //
+ // Note that we currently count allocations during GC as both
+ // scannable heap (heapScan) and scan work completed
+ // (scanWork), so allocation will change this difference
+ // slowly in the soft regime and not at all in the hard
+ // regime.
+ scanWorkRemaining := scanWorkExpected - work
+ if scanWorkRemaining < 1000 {
+ // We set a somewhat arbitrary lower bound on
+ // remaining scan work since if we aim a little high,
+ // we can miss by a little.
+ //
+ // We *do* need to enforce that this is at least 1,
+ // since marking is racy and double-scanning objects
+ // may legitimately make the remaining scan work
+ // negative, even in the hard goal regime.
+ scanWorkRemaining = 1000
+ }
+
+ // Compute the heap distance remaining.
+ heapRemaining := heapGoal - int64(live)
+ if heapRemaining <= 0 {
+ // This shouldn't happen, but if it does, avoid
+ // dividing by zero or setting the assist negative.
+ heapRemaining = 1
+ }
+
+ // Compute the mutator assist ratio so by the time the mutator
+ // allocates the remaining heap bytes up to heapGoal, it will
+ // have done (or stolen) the remaining amount of scan work.
+ // Note that the assist ratio values are updated atomically
+ // but not together. This means there may be some degree of
+ // skew between the two values. This is generally OK as the
+ // values shift relatively slowly over the course of a GC
+ // cycle.
+ assistWorkPerByte := float64(scanWorkRemaining) / float64(heapRemaining)
+ assistBytesPerWork := float64(heapRemaining) / float64(scanWorkRemaining)
+ atomic.Store64(&c.assistWorkPerByte, float64bits(assistWorkPerByte))
+ atomic.Store64(&c.assistBytesPerWork, float64bits(assistBytesPerWork))
+}
+
+// endCycle computes the trigger ratio for the next cycle.
+// userForced indicates whether the current GC cycle was forced
+// by the application.
+func (c *gcControllerState) endCycle(userForced bool) float64 {
+ if userForced {
+ // Forced GC means this cycle didn't start at the
+ // trigger, so where it finished isn't good
+ // information about how to adjust the trigger.
+ // Just leave it where it is.
+ return c.triggerRatio
+ }
+
+ // Proportional response gain for the trigger controller. Must
+ // be in [0, 1]. Lower values smooth out transient effects but
+ // take longer to respond to phase changes. Higher values
+ // react to phase changes quickly, but are more affected by
+ // transient changes. Values near 1 may be unstable.
+ const triggerGain = 0.5
+
+ // Compute next cycle trigger ratio. First, this computes the
+ // "error" for this cycle; that is, how far off the trigger
+ // was from what it should have been, accounting for both heap
+ // growth and GC CPU utilization. We compute the actual heap
+ // growth during this cycle and scale that by how far off from
+ // the goal CPU utilization we were (to estimate the heap
+ // growth if we had the desired CPU utilization). The
+ // difference between this estimate and the GOGC-based goal
+ // heap growth is the error.
+ goalGrowthRatio := c.effectiveGrowthRatio()
+ actualGrowthRatio := float64(c.heapLive)/float64(c.heapMarked) - 1
+ assistDuration := nanotime() - c.markStartTime
+
+ // Assume background mark hit its utilization goal.
+ utilization := gcBackgroundUtilization
+ // Add assist utilization; avoid divide by zero.
+ if assistDuration > 0 {
+ utilization += float64(c.assistTime) / float64(assistDuration*int64(gomaxprocs))
+ }
+
+ triggerError := goalGrowthRatio - c.triggerRatio - utilization/gcGoalUtilization*(actualGrowthRatio-c.triggerRatio)
+
+ // Finally, we adjust the trigger for next time by this error,
+ // damped by the proportional gain.
+ triggerRatio := c.triggerRatio + triggerGain*triggerError
+
+ if debug.gcpacertrace > 0 {
+ // Print controller state in terms of the design
+ // document.
+ H_m_prev := c.heapMarked
+ h_t := c.triggerRatio
+ H_T := c.trigger
+ h_a := actualGrowthRatio
+ H_a := c.heapLive
+ h_g := goalGrowthRatio
+ H_g := int64(float64(H_m_prev) * (1 + h_g))
+ u_a := utilization
+ u_g := gcGoalUtilization
+ W_a := c.scanWork
+ print("pacer: H_m_prev=", H_m_prev,
+ " h_t=", h_t, " H_T=", H_T,
+ " h_a=", h_a, " H_a=", H_a,
+ " h_g=", h_g, " H_g=", H_g,
+ " u_a=", u_a, " u_g=", u_g,
+ " W_a=", W_a,
+ " goalΔ=", goalGrowthRatio-h_t,
+ " actualΔ=", h_a-h_t,
+ " u_a/u_g=", u_a/u_g,
+ "\n")
+ }
+
+ return triggerRatio
+}
+
+// enlistWorker encourages another dedicated mark worker to start on
+// another P if there are spare worker slots. It is used by putfull
+// when more work is made available.
+//
+//go:nowritebarrier
+func (c *gcControllerState) enlistWorker() {
+ // If there are idle Ps, wake one so it will run an idle worker.
+ // NOTE: This is suspected of causing deadlocks. See golang.org/issue/19112.
+ //
+ // if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 {
+ // wakep()
+ // return
+ // }
+
+ // There are no idle Ps. If we need more dedicated workers,
+ // try to preempt a running P so it will switch to a worker.
+ if c.dedicatedMarkWorkersNeeded <= 0 {
+ return
+ }
+ // Pick a random other P to preempt.
+ if gomaxprocs <= 1 {
+ return
+ }
+ gp := getg()
+ if gp == nil || gp.m == nil || gp.m.p == 0 {
+ return
+ }
+ myID := gp.m.p.ptr().id
+ for tries := 0; tries < 5; tries++ {
+ id := int32(fastrandn(uint32(gomaxprocs - 1)))
+ if id >= myID {
+ id++
+ }
+ p := allp[id]
+ if p.status != _Prunning {
+ continue
+ }
+ if preemptone(p) {
+ return
+ }
+ }
+}
+
+// findRunnableGCWorker returns a background mark worker for _p_ if it
+// should be run. This must only be called when gcBlackenEnabled != 0.
+func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
+ if gcBlackenEnabled == 0 {
+ throw("gcControllerState.findRunnable: blackening not enabled")
+ }
+
+ if !gcMarkWorkAvailable(_p_) {
+ // No work to be done right now. This can happen at
+ // the end of the mark phase when there are still
+ // assists tapering off. Don't bother running a worker
+ // now because it'll just return immediately.
+ return nil
+ }
+
+ // Grab a worker before we commit to running below.
+ node := (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop())
+ if node == nil {
+ // There is at least one worker per P, so normally there are
+ // enough workers to run on all Ps, if necessary. However, once
+ // a worker enters gcMarkDone it may park without rejoining the
+ // pool, thus freeing a P with no corresponding worker.
+ // gcMarkDone never depends on another worker doing work, so it
+ // is safe to simply do nothing here.
+ //
+ // If gcMarkDone bails out without completing the mark phase,
+ // it will always do so with queued global work. Thus, that P
+ // will be immediately eligible to re-run the worker G it was
+ // just using, ensuring work can complete.
+ return nil
+ }
+
+ decIfPositive := func(ptr *int64) bool {
+ for {
+ v := atomic.Loadint64(ptr)
+ if v <= 0 {
+ return false
+ }
+
+ if atomic.Casint64(ptr, v, v-1) {
+ return true
+ }
+ }
+ }
+
+ if decIfPositive(&c.dedicatedMarkWorkersNeeded) {
+ // This P is now dedicated to marking until the end of
+ // the concurrent mark phase.
+ _p_.gcMarkWorkerMode = gcMarkWorkerDedicatedMode
+ } else if c.fractionalUtilizationGoal == 0 {
+ // No need for fractional workers.
+ gcBgMarkWorkerPool.push(&node.node)
+ return nil
+ } else {
+ // Is this P behind on the fractional utilization
+ // goal?
+ //
+ // This should be kept in sync with pollFractionalWorkerExit.
+ delta := nanotime() - c.markStartTime
+ if delta > 0 && float64(_p_.gcFractionalMarkTime)/float64(delta) > c.fractionalUtilizationGoal {
+ // Nope. No need to run a fractional worker.
+ gcBgMarkWorkerPool.push(&node.node)
+ return nil
+ }
+ // Run a fractional worker.
+ _p_.gcMarkWorkerMode = gcMarkWorkerFractionalMode
+ }
+
+ // Run the background mark worker.
+ gp := node.gp.ptr()
+ casgstatus(gp, _Gwaiting, _Grunnable)
+ if trace.enabled {
+ traceGoUnpark(gp, 0)
+ }
+ return gp
+}
+
+// commit sets the trigger ratio and updates everything
+// derived from it: the absolute trigger, the heap goal, mark pacing,
+// and sweep pacing.
+//
+// This can be called any time. If GC is in the middle of a
+// concurrent phase, it will adjust the pacing of that phase.
+//
+// This depends on gcPercent, gcController.heapMarked, and
+// gcController.heapLive. These must be up to date.
+//
+// mheap_.lock must be held or the world must be stopped.
+func (c *gcControllerState) commit(triggerRatio float64) {
+ assertWorldStoppedOrLockHeld(&mheap_.lock)
+
+ // Compute the next GC goal, which is when the allocated heap
+ // has grown by GOGC/100 over the heap marked by the last
+ // cycle.
+ goal := ^uint64(0)
+ if c.gcPercent >= 0 {
+ goal = c.heapMarked + c.heapMarked*uint64(c.gcPercent)/100
+ }
+
+ // Set the trigger ratio, capped to reasonable bounds.
+ if c.gcPercent >= 0 {
+ scalingFactor := float64(c.gcPercent) / 100
+ // Ensure there's always a little margin so that the
+ // mutator assist ratio isn't infinity.
+ maxTriggerRatio := 0.95 * scalingFactor
+ if triggerRatio > maxTriggerRatio {
+ triggerRatio = maxTriggerRatio
+ }
+
+ // If we let triggerRatio go too low, then if the application
+ // is allocating very rapidly we might end up in a situation
+ // where we're allocating black during a nearly always-on GC.
+ // The result of this is a growing heap and ultimately an
+ // increase in RSS. By capping us at a point >0, we're essentially
+ // saying that we're OK using more CPU during the GC to prevent
+ // this growth in RSS.
+ //
+ // The current constant was chosen empirically: given a sufficiently
+ // fast/scalable allocator with 48 Ps that could drive the trigger ratio
+ // to <0.05, this constant causes applications to retain the same peak
+ // RSS compared to not having this allocator.
+ minTriggerRatio := 0.6 * scalingFactor
+ if triggerRatio < minTriggerRatio {
+ triggerRatio = minTriggerRatio
+ }
+ } else if triggerRatio < 0 {
+ // gcPercent < 0, so just make sure we're not getting a negative
+ // triggerRatio. This case isn't expected to happen in practice,
+ // and doesn't really matter because if gcPercent < 0 then we won't
+ // ever consume triggerRatio further on in this function, but let's
+ // just be defensive here; the triggerRatio being negative is almost
+ // certainly undesirable.
+ triggerRatio = 0
+ }
+ c.triggerRatio = triggerRatio
+
+ // Compute the absolute GC trigger from the trigger ratio.
+ //
+ // We trigger the next GC cycle when the allocated heap has
+ // grown by the trigger ratio over the marked heap size.
+ trigger := ^uint64(0)
+ if c.gcPercent >= 0 {
+ trigger = uint64(float64(c.heapMarked) * (1 + triggerRatio))
+ // Don't trigger below the minimum heap size.
+ minTrigger := c.heapMinimum
+ if !isSweepDone() {
+ // Concurrent sweep happens in the heap growth
+ // from gcController.heapLive to trigger, so ensure
+ // that concurrent sweep has some heap growth
+ // in which to perform sweeping before we
+ // start the next GC cycle.
+ sweepMin := atomic.Load64(&c.heapLive) + sweepMinHeapDistance
+ if sweepMin > minTrigger {
+ minTrigger = sweepMin
+ }
+ }
+ if trigger < minTrigger {
+ trigger = minTrigger
+ }
+ if int64(trigger) < 0 {
+ print("runtime: heapGoal=", c.heapGoal, " heapMarked=", c.heapMarked, " gcController.heapLive=", c.heapLive, " initialHeapLive=", work.initialHeapLive, "triggerRatio=", triggerRatio, " minTrigger=", minTrigger, "\n")
+ throw("trigger underflow")
+ }
+ if trigger > goal {
+ // The trigger ratio is always less than GOGC/100, but
+ // other bounds on the trigger may have raised it.
+ // Push up the goal, too.
+ goal = trigger
+ }
+ }
+
+ // Commit to the trigger and goal.
+ c.trigger = trigger
+ atomic.Store64(&c.heapGoal, goal)
+ if trace.enabled {
+ traceHeapGoal()
+ }
+
+ // Update mark pacing.
+ if gcphase != _GCoff {
+ c.revise()
+ }
+
+ // Update sweep pacing.
+ if isSweepDone() {
+ mheap_.sweepPagesPerByte = 0
+ } else {
+ // Concurrent sweep needs to sweep all of the in-use
+ // pages by the time the allocated heap reaches the GC
+ // trigger. Compute the ratio of in-use pages to sweep
+ // per byte allocated, accounting for the fact that
+ // some might already be swept.
+ heapLiveBasis := atomic.Load64(&c.heapLive)
+ heapDistance := int64(trigger) - int64(heapLiveBasis)
+ // Add a little margin so rounding errors and
+ // concurrent sweep are less likely to leave pages
+ // unswept when GC starts.
+ heapDistance -= 1024 * 1024
+ if heapDistance < _PageSize {
+ // Avoid setting the sweep ratio extremely high
+ heapDistance = _PageSize
+ }
+ pagesSwept := atomic.Load64(&mheap_.pagesSwept)
+ pagesInUse := atomic.Load64(&mheap_.pagesInUse)
+ sweepDistancePages := int64(pagesInUse) - int64(pagesSwept)
+ if sweepDistancePages <= 0 {
+ mheap_.sweepPagesPerByte = 0
+ } else {
+ mheap_.sweepPagesPerByte = float64(sweepDistancePages) / float64(heapDistance)
+ mheap_.sweepHeapLiveBasis = heapLiveBasis
+ // Write pagesSweptBasis last, since this
+ // signals concurrent sweeps to recompute
+ // their debt.
+ atomic.Store64(&mheap_.pagesSweptBasis, pagesSwept)
+ }
+ }
+
+ gcPaceScavenger()
+}
+
+// effectiveGrowthRatio returns the current effective heap growth
+// ratio (GOGC/100) based on heapMarked from the previous GC and
+// heapGoal for the current GC.
+//
+// This may differ from gcPercent/100 because of various upper and
+// lower bounds on gcPercent. For example, if the heap is smaller than
+// heapMinimum, this can be higher than gcPercent/100.
+//
+// mheap_.lock must be held or the world must be stopped.
+func (c *gcControllerState) effectiveGrowthRatio() float64 {
+ assertWorldStoppedOrLockHeld(&mheap_.lock)
+
+ egogc := float64(atomic.Load64(&c.heapGoal)-c.heapMarked) / float64(c.heapMarked)
+ if egogc < 0 {
+ // Shouldn't happen, but just in case.
+ egogc = 0
+ }
+ return egogc
+}
+
+// setGCPercent updates gcPercent and all related pacer state.
+// Returns the old value of gcPercent.
+//
+// The world must be stopped, or mheap_.lock must be held.
+func (c *gcControllerState) setGCPercent(in int32) int32 {
+ assertWorldStoppedOrLockHeld(&mheap_.lock)
+
+ out := c.gcPercent
+ if in < 0 {
+ in = -1
+ }
+ c.gcPercent = in
+ c.heapMinimum = defaultHeapMinimum * uint64(c.gcPercent) / 100
+ // Update pacing in response to gcPercent change.
+ c.commit(c.triggerRatio)
+
+ return out
+}
+
+//go:linkname setGCPercent runtime_1debug.setGCPercent
+func setGCPercent(in int32) (out int32) {
+ // Run on the system stack since we grab the heap lock.
+ systemstack(func() {
+ lock(&mheap_.lock)
+ out = gcController.setGCPercent(in)
+ unlock(&mheap_.lock)
+ })
+
+ // If we just disabled GC, wait for any concurrent GC mark to
+ // finish so we always return with no GC running.
+ if in < 0 {
+ gcWaitOnMark(atomic.Load(&work.cycles))
+ }
+
+ return out
+}
+
+func readGOGC() int32 {
+ p := gogetenv("GOGC")
+ if p == "off" {
+ return -1
+ }
+ if n, ok := atoi32(p); ok {
+ return n
+ }
+ return 100
+}
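
To make the relationships in commit and revise concrete, here is a toy rerun of the same arithmetic with invented inputs: the goal grows GOGC% over heapMarked, the trigger is heapMarked*(1+triggerRatio), and the assist ratio is the remaining scan work divided by the remaining heap headroom. This is only a worked example under assumed numbers, not the runtime code itself.

package main

import "fmt"

func main() {
	const (
		gcPercent    = 100      // GOGC
		heapMarked   = 64 << 20 // bytes retained by the previous GC (invented)
		triggerRatio = 0.7      // value the feedback controller settled on (invented)
	)

	// commit: absolute goal and trigger for the next cycle.
	goal := uint64(heapMarked + heapMarked*gcPercent/100)
	trigger := uint64(float64(heapMarked) * (1 + triggerRatio))

	// revise, mid-cycle: how much scan work per allocated byte assists
	// must perform to finish by the goal.
	heapLive := uint64(112 << 20)   // current live+allocated heap (invented)
	heapScan := uint64(48 << 20)    // scannable portion of heapLive (invented)
	scanWorkDone := int64(10 << 20) // scan work already performed (invented)
	scanWorkExpected := int64(float64(heapScan) * 100 / float64(100+gcPercent))
	heapRemaining := int64(goal) - int64(heapLive)
	assistWorkPerByte := float64(scanWorkExpected-scanWorkDone) / float64(heapRemaining)

	fmt.Printf("goal=%dMB trigger=%dMB assistWorkPerByte=%.3f\n",
		goal>>20, trigger>>20, assistWorkPerByte)
	// goal=128MB trigger=108MB assistWorkPerByte=0.875
}
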
diff --git a/libgo/go/runtime/mgcscavenge.go b/libgo/go/runtime/mgcscavenge.go
index da5be70344a..3fa1c4604d7 100644
--- a/libgo/go/runtime/mgcscavenge.go
+++ b/libgo/go/runtime/mgcscavenge.go
@@ -18,12 +18,12 @@
// application down to a goal.
//
// That goal is defined as:
-// (retainExtraPercent+100) / 100 * (next_gc / last_next_gc) * last_heap_inuse
+// (retainExtraPercent+100) / 100 * (heapGoal / lastHeapGoal) * last_heap_inuse
//
// Essentially, we wish to have the application's RSS track the heap goal, but
// the heap goal is defined in terms of bytes of objects, rather than pages like
// RSS. As a result, we need to take into account for fragmentation internal to
-// spans. next_gc / last_next_gc defines the ratio between the current heap goal
+// spans. heapGoal / lastHeapGoal defines the ratio between the current heap goal
// and the last heap goal, which tells us by how much the heap is growing and
// shrinking. We estimate what the heap will grow to in terms of pages by taking
// this ratio and multiplying it by heap_inuse at the end of the last GC, which
@@ -118,12 +118,12 @@ func gcPaceScavenger() {
// We never scavenge before the 2nd GC cycle anyway (we don't have enough
// information about the heap yet) so this is fine, and avoids a fault
// or garbage data later.
- if memstats.last_next_gc == 0 {
+ if gcController.lastHeapGoal == 0 {
mheap_.scavengeGoal = ^uint64(0)
return
}
// Compute our scavenging goal.
- goalRatio := float64(atomic.Load64(&memstats.next_gc)) / float64(memstats.last_next_gc)
+ goalRatio := float64(atomic.Load64(&gcController.heapGoal)) / float64(gcController.lastHeapGoal)
retainedGoal := uint64(float64(memstats.last_heap_inuse) * goalRatio)
// Add retainExtraPercent overhead to retainedGoal. This calculation
// looks strange but the purpose is to arrive at an integer division
@@ -207,7 +207,7 @@ func wakeScavenger() {
// Ready the goroutine by injecting it. We use injectglist instead
// of ready or goready in order to allow us to run this function
// without a P. injectglist also avoids placing the goroutine in
- // the current P's runnext slot, which is desireable to prevent
+ // the current P's runnext slot, which is desirable to prevent
// the scavenger from interfering with user goroutine scheduling
// too much.
var list gList
@@ -249,7 +249,7 @@ func scavengeSleep(ns int64) int64 {
// The background scavenger maintains the RSS of the application below
// the line described by the proportional scavenging statistics in
// the mheap struct.
-func bgscavenge(c chan int) {
+func bgscavenge() {
setSystemGoroutine()
scavenge.g = getg()
@@ -263,7 +263,7 @@ func bgscavenge(c chan int) {
wakeScavenger()
}
- c <- 1
+ gcenable_setup <- 1
goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)
// Exponentially-weighted moving average of the fraction of time this
@@ -374,7 +374,7 @@ func bgscavenge(c chan int) {
// Due to OS-related anomalies we may "sleep" for an inordinate amount
// of time. Let's avoid letting the ratio get out of hand by bounding
// the sleep time we use in our EWMA.
- const minFraction = 1 / 1000
+ const minFraction = 1.0 / 1000.0
if fraction < minFraction {
fraction = minFraction
}
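
The scavenge goal described in the comment above is just three factors multiplied together. A back-of-the-envelope sketch with invented numbers follows; the retainExtraPercent value used here is an assumption, so check the source for the actual constant.

package main

import "fmt"

func main() {
	const retainExtraPercent = 10 // assumed slack percentage
	var (
		heapGoal      = uint64(200 << 20) // gcController.heapGoal (invented)
		lastHeapGoal  = uint64(160 << 20) // gcController.lastHeapGoal (invented)
		lastHeapInuse = uint64(150 << 20) // memstats.last_heap_inuse (invented)
	)
	// retainedGoal = (retainExtraPercent+100)/100 * (heapGoal/lastHeapGoal) * lastHeapInuse
	goalRatio := float64(heapGoal) / float64(lastHeapGoal)
	retainedGoal := uint64(float64(lastHeapInuse) * goalRatio)
	retainedGoal += retainedGoal / 100 * retainExtraPercent
	fmt.Println("scavenger keeps RSS near", retainedGoal>>20, "MB")
}
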
diff --git a/libgo/go/runtime/mgcscavenge_test.go b/libgo/go/runtime/mgcscavenge_test.go
index 250343077ff..3b12a2e1e63 100644
--- a/libgo/go/runtime/mgcscavenge_test.go
+++ b/libgo/go/runtime/mgcscavenge_test.go
@@ -152,12 +152,6 @@ func TestPallocDataFindScavengeCandidate(t *testing.T) {
max: PallocChunkPages,
want: BitRange{0, uint(m)},
}
- tests["StartFree"+suffix] = test{
- alloc: []BitRange{{uint(m), PallocChunkPages - uint(m)}},
- min: m,
- max: PallocChunkPages,
- want: BitRange{0, uint(m)},
- }
tests["EndFree"+suffix] = test{
alloc: []BitRange{{0, PallocChunkPages - uint(m)}},
min: m,
diff --git a/libgo/go/runtime/mgcsweep.go b/libgo/go/runtime/mgcsweep.go
index b2a5a8670b8..746e7f0a9f7 100644
--- a/libgo/go/runtime/mgcsweep.go
+++ b/libgo/go/runtime/mgcsweep.go
@@ -153,7 +153,7 @@ func finishsweep_m() {
nextMarkBitArenaEpoch()
}
-func bgsweep(c chan int) {
+func bgsweep() {
setSystemGoroutine()
sweep.g = getg()
@@ -161,7 +161,7 @@ func bgsweep(c chan int) {
lockInit(&sweep.lock, lockRankSweep)
lock(&sweep.lock)
sweep.parked = true
- c <- 1
+ gcenable_setup <- 1
goparkunlock(&sweep.lock, waitReasonGCSweepWait, traceEvGoBlock, 1)
for {
@@ -178,6 +178,10 @@ func bgsweep(c chan int) {
// gosweepone returning ^0 above
// and the lock being acquired.
unlock(&sweep.lock)
+
+ // We need a preemption point for gofrontend.
+ Gosched()
+
continue
}
sweep.parked = true
@@ -185,66 +189,133 @@ func bgsweep(c chan int) {
}
}
+// sweepLocker acquires sweep ownership of spans and blocks sweep
+// completion.
+type sweepLocker struct {
+ // sweepGen is the sweep generation of the heap.
+ sweepGen uint32
+ // blocking indicates that this tracker is blocking sweep
+ // completion, usually as a result of acquiring sweep
+ // ownership of at least one span.
+ blocking bool
+}
+
+// sweepLocked represents sweep ownership of a span.
+type sweepLocked struct {
+ *mspan
+}
+
+func newSweepLocker() sweepLocker {
+ return sweepLocker{
+ sweepGen: mheap_.sweepgen,
+ }
+}
+
+// tryAcquire attempts to acquire sweep ownership of span s. If it
+// successfully acquires ownership, it blocks sweep completion.
+func (l *sweepLocker) tryAcquire(s *mspan) (sweepLocked, bool) {
+ // Check before attempting to CAS.
+ if atomic.Load(&s.sweepgen) != l.sweepGen-2 {
+ return sweepLocked{}, false
+ }
+ // Add ourselves to sweepers before potentially taking
+ // ownership.
+ l.blockCompletion()
+ // Attempt to acquire sweep ownership of s.
+ if !atomic.Cas(&s.sweepgen, l.sweepGen-2, l.sweepGen-1) {
+ return sweepLocked{}, false
+ }
+ return sweepLocked{s}, true
+}
+
+// blockCompletion blocks sweep completion without acquiring any
+// specific spans.
+func (l *sweepLocker) blockCompletion() {
+ if !l.blocking {
+ atomic.Xadd(&mheap_.sweepers, +1)
+ l.blocking = true
+ }
+}
+
+func (l *sweepLocker) dispose() {
+ if !l.blocking {
+ return
+ }
+ // Decrement the number of active sweepers and if this is the
+ // last one, mark sweep as complete.
+ l.blocking = false
+ if atomic.Xadd(&mheap_.sweepers, -1) == 0 && atomic.Load(&mheap_.sweepDrained) != 0 {
+ l.sweepIsDone()
+ }
+}
+
+func (l *sweepLocker) sweepIsDone() {
+ if debug.gcpacertrace > 0 {
+ print("pacer: sweep done at heap size ", gcController.heapLive>>20, "MB; allocated ", (gcController.heapLive-mheap_.sweepHeapLiveBasis)>>20, "MB during sweep; swept ", mheap_.pagesSwept, " pages at ", mheap_.sweepPagesPerByte, " pages/byte\n")
+ }
+}
+
// sweepone sweeps some unswept heap span and returns the number of pages returned
// to the heap, or ^uintptr(0) if there was nothing to sweep.
func sweepone() uintptr {
_g_ := getg()
- sweepRatio := mheap_.sweepPagesPerByte // For debugging
// increment locks to ensure that the goroutine is not preempted
// in the middle of sweep thus leaving the span in an inconsistent state for next GC
_g_.m.locks++
- if atomic.Load(&mheap_.sweepdone) != 0 {
+ if atomic.Load(&mheap_.sweepDrained) != 0 {
_g_.m.locks--
return ^uintptr(0)
}
- atomic.Xadd(&mheap_.sweepers, +1)
+ // TODO(austin): sweepone is almost always called in a loop;
+ // lift the sweepLocker into its callers.
+ sl := newSweepLocker()
// Find a span to sweep.
- var s *mspan
- sg := mheap_.sweepgen
+ npages := ^uintptr(0)
+ var noMoreWork bool
for {
- s = mheap_.nextSpanForSweep()
+ s := mheap_.nextSpanForSweep()
if s == nil {
- atomic.Store(&mheap_.sweepdone, 1)
+ noMoreWork = atomic.Cas(&mheap_.sweepDrained, 0, 1)
break
}
if state := s.state.get(); state != mSpanInUse {
// This can happen if direct sweeping already
// swept this span, but in that case the sweep
// generation should always be up-to-date.
- if !(s.sweepgen == sg || s.sweepgen == sg+3) {
- print("runtime: bad span s.state=", state, " s.sweepgen=", s.sweepgen, " sweepgen=", sg, "\n")
+ if !(s.sweepgen == sl.sweepGen || s.sweepgen == sl.sweepGen+3) {
+ print("runtime: bad span s.state=", state, " s.sweepgen=", s.sweepgen, " sweepgen=", sl.sweepGen, "\n")
throw("non in-use span in unswept list")
}
continue
}
- if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+ if s, ok := sl.tryAcquire(s); ok {
+ // Sweep the span we found.
+ npages = s.npages
+ if s.sweep(false) {
+ // Whole span was freed. Count it toward the
+ // page reclaimer credit since these pages can
+ // now be used for span allocation.
+ atomic.Xadduintptr(&mheap_.reclaimCredit, npages)
+ } else {
+ // Span is still in-use, so this returned no
+ // pages to the heap and the span needs to
+ // move to the swept in-use list.
+ npages = 0
+ }
break
}
}
- // Sweep the span we found.
- npages := ^uintptr(0)
- if s != nil {
- npages = s.npages
- if s.sweep(false) {
- // Whole span was freed. Count it toward the
- // page reclaimer credit since these pages can
- // now be used for span allocation.
- atomic.Xadduintptr(&mheap_.reclaimCredit, npages)
- } else {
- // Span is still in-use, so this returned no
- // pages to the heap and the span needs to
- // move to the swept in-use list.
- npages = 0
- }
- }
+ sl.dispose()
- // Decrement the number of active sweepers and if this is the
- // last one print trace information.
- if atomic.Xadd(&mheap_.sweepers, -1) == 0 && atomic.Load(&mheap_.sweepdone) != 0 {
- // Since the sweeper is done, move the scavenge gen forward (signalling
+ if noMoreWork {
+ // The sweep list is empty. There may still be
+ // concurrent sweeps running, but we're at least very
+ // close to done sweeping.
+
+ // Move the scavenge gen forward (signalling
// that there's new work to do) and wake the scavenger.
//
// The scavenger is signaled by the last sweeper because once
@@ -264,23 +335,23 @@ func sweepone() uintptr {
// for us to wake the scavenger directly via wakeScavenger, since
// it could allocate. Ask sysmon to do it for us instead.
readyForScavenger()
-
- if debug.gcpacertrace > 0 {
- print("pacer: sweep done at heap size ", memstats.heap_live>>20, "MB; allocated ", (memstats.heap_live-mheap_.sweepHeapLiveBasis)>>20, "MB during sweep; swept ", mheap_.pagesSwept, " pages at ", sweepRatio, " pages/byte\n")
- }
}
+
_g_.m.locks--
return npages
}
-// isSweepDone reports whether all spans are swept or currently being swept.
+// isSweepDone reports whether all spans are swept.
//
// Note that this condition may transition from false to true at any
// time as the sweeper runs. It may transition from true to false if a
// GC runs; to prevent that the caller must be non-preemptible or must
// somehow block GC progress.
func isSweepDone() bool {
- return mheap_.sweepdone != 0
+ // Check that all spans have at least begun sweeping and there
+ // are no active sweepers. If both are true, then all spans
+ // have finished sweeping.
+ return atomic.Load(&mheap_.sweepDrained) != 0 && atomic.Load(&mheap_.sweepers) == 0
}
// Returns only when span s has been swept.
@@ -294,20 +365,19 @@ func (s *mspan) ensureSwept() {
throw("mspan.ensureSwept: m is not locked")
}
- sg := mheap_.sweepgen
- spangen := atomic.Load(&s.sweepgen)
- if spangen == sg || spangen == sg+3 {
- return
- }
+ sl := newSweepLocker()
// The caller must be sure that the span is a mSpanInUse span.
- if atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+ if s, ok := sl.tryAcquire(s); ok {
s.sweep(false)
+ sl.dispose()
return
}
+ sl.dispose()
+
// unfortunate condition, and we don't have efficient means to wait
for {
spangen := atomic.Load(&s.sweepgen)
- if spangen == sg || spangen == sg+3 {
+ if spangen == sl.sweepGen || spangen == sl.sweepGen+3 {
break
}
osyield()
@@ -319,13 +389,21 @@ func (s *mspan) ensureSwept() {
// Returns true if the span was returned to heap.
// If preserve=true, don't return it to heap nor relink in mcentral lists;
// caller takes care of it.
-func (s *mspan) sweep(preserve bool) bool {
+func (sl *sweepLocked) sweep(preserve bool) bool {
// It's critical that we enter this function with preemption disabled,
// GC must not start while we are in the middle of this function.
_g_ := getg()
if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 {
throw("mspan.sweep: m is not locked")
}
+
+ s := sl.mspan
+ if !preserve {
+ // We'll release ownership of this span. Nil it out to
+ // prevent the caller from accidentally using it.
+ sl.mspan = nil
+ }
+
sweepgen := mheap_.sweepgen
if state := s.state.get(); state != mSpanInUse || s.sweepgen != sweepgen-1 {
print("mspan.sweep: state=", state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
@@ -358,11 +436,10 @@ func (s *mspan) sweep(preserve bool) bool {
// If such object is not marked, we need to queue all finalizers at once.
// Both 1 and 2 are possible at the same time.
hadSpecials := s.specials != nil
- specialp := &s.specials
- special := *specialp
- for special != nil {
+ siter := newSpecialsIter(s)
+ for siter.valid() {
// A finalizer can be set for an inner byte of an object, find object beginning.
- objIndex := uintptr(special.offset) / size
+ objIndex := uintptr(siter.s.offset) / size
p := s.base() + objIndex*size
mbits := s.markBitsForIndex(objIndex)
if !mbits.isMarked() {
@@ -370,7 +447,7 @@ func (s *mspan) sweep(preserve bool) bool {
// Pass 1: see if it has at least one finalizer.
hasFin := false
endOffset := p - s.base() + size
- for tmp := special; tmp != nil && uintptr(tmp.offset) < endOffset; tmp = tmp.next {
+ for tmp := siter.s; tmp != nil && uintptr(tmp.offset) < endOffset; tmp = tmp.next {
if tmp.kind == _KindSpecialFinalizer {
// Stop freeing of object if it has a finalizer.
mbits.setMarkedNonAtomic()
@@ -379,27 +456,31 @@ func (s *mspan) sweep(preserve bool) bool {
}
}
// Pass 2: queue all finalizers _or_ handle profile record.
- for special != nil && uintptr(special.offset) < endOffset {
+ for siter.valid() && uintptr(siter.s.offset) < endOffset {
// Find the exact byte for which the special was setup
// (as opposed to object beginning).
+ special := siter.s
p := s.base() + uintptr(special.offset)
if special.kind == _KindSpecialFinalizer || !hasFin {
- // Splice out special record.
- y := special
- special = special.next
- *specialp = special
- freespecial(y, unsafe.Pointer(p), size)
+ siter.unlinkAndNext()
+ freeSpecial(special, unsafe.Pointer(p), size)
} else {
- // This is profile record, but the object has finalizers (so kept alive).
- // Keep special record.
- specialp = &special.next
- special = *specialp
+ // The object has finalizers, so we're keeping it alive.
+ // All other specials only apply when an object is freed,
+ // so just keep the special record.
+ siter.next()
}
}
} else {
- // object is still live: keep special record
- specialp = &special.next
- special = *specialp
+ // object is still live
+ if siter.s.kind == _KindSpecialReachable {
+ special := siter.unlinkAndNext()
+ (*specialReachable)(unsafe.Pointer(special)).reachable = true
+ freeSpecial(special, unsafe.Pointer(p), size)
+ } else {
+ // keep special record
+ siter.next()
+ }
}
}
if hadSpecials && s.specials == nil {
@@ -647,7 +728,7 @@ retry:
sweptBasis := atomic.Load64(&mheap_.pagesSweptBasis)
// Fix debt if necessary.
- newHeapLive := uintptr(atomic.Load64(&memstats.heap_live)-mheap_.sweepHeapLiveBasis) + spanBytes
+ newHeapLive := uintptr(atomic.Load64(&gcController.heapLive)-mheap_.sweepHeapLiveBasis) + spanBytes
pagesTarget := int64(mheap_.sweepPagesPerByte*float64(newHeapLive)) - int64(callerSweepPages)
for pagesTarget > int64(atomic.Load64(&mheap_.pagesSwept)-sweptBasis) {
if sweepone() == ^uintptr(0) {
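
The sweepLocker protocol above boils down to: register as an active sweeper before claiming a span, and the last sweeper to retire after the unswept queue has been drained is the one that observes completion. A stripped-down, standalone analogue of that counting follows; the type, field, and method names are invented.

package main

import (
	"fmt"
	"sync/atomic"
)

// sweepCounter is a toy version of mheap_.sweepers/sweepDrained:
// sweepers counts callers currently blocking completion, and drained is
// set once the unswept queue is empty.
type sweepCounter struct {
	sweepers int32
	drained  int32
}

func (c *sweepCounter) blockCompletion() { atomic.AddInt32(&c.sweepers, 1) }

// dispose reports whether this caller was the last active sweeper after
// the queue was drained, i.e. whether sweeping is now truly done.
func (c *sweepCounter) dispose() bool {
	return atomic.AddInt32(&c.sweepers, -1) == 0 && atomic.LoadInt32(&c.drained) != 0
}

func main() {
	var c sweepCounter
	c.blockCompletion()              // acquired ownership of a span
	atomic.StoreInt32(&c.drained, 1) // some other sweeper emptied the queue
	fmt.Println("sweep done:", c.dispose())
}
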
diff --git a/libgo/go/runtime/mgcwork.go b/libgo/go/runtime/mgcwork.go
index b3a068661eb..667c7afa971 100644
--- a/libgo/go/runtime/mgcwork.go
+++ b/libgo/go/runtime/mgcwork.go
@@ -398,7 +398,7 @@ func getempty() *workbuf {
}
// putempty puts a workbuf onto the work.empty list.
-// Upon entry this go routine owns b. The lfstack.push relinquishes ownership.
+// Upon entry this goroutine owns b. The lfstack.push relinquishes ownership.
//go:nowritebarrier
func putempty(b *workbuf) {
b.checkempty()
diff --git a/libgo/go/runtime/mheap.go b/libgo/go/runtime/mheap.go
index 77f39870c27..8e346c8ead7 100644
--- a/libgo/go/runtime/mheap.go
+++ b/libgo/go/runtime/mheap.go
@@ -62,11 +62,12 @@ const (
type mheap struct {
// lock must only be acquired on the system stack, otherwise a g
// could self-deadlock if its stack grows with the lock held.
- lock mutex
- pages pageAlloc // page allocation data structure
- sweepgen uint32 // sweep generation, see comment in mspan; written during STW
- sweepdone uint32 // all spans are swept
- sweepers uint32 // number of active sweepone calls
+ lock mutex
+ pages pageAlloc // page allocation data structure
+
+ sweepgen uint32 // sweep generation, see comment in mspan; written during STW
+ sweepDrained uint32 // all spans are swept or are being swept
+ sweepers uint32 // number of active sweepone calls
// allspans is a slice of all mspans ever created. Each mspan
// appears exactly once.
@@ -85,14 +86,14 @@ type mheap struct {
// Proportional sweep
//
- // These parameters represent a linear function from heap_live
+ // These parameters represent a linear function from gcController.heapLive
// to page sweep count. The proportional sweep system works to
// stay in the black by keeping the current page sweep count
- // above this line at the current heap_live.
+ // above this line at the current gcController.heapLive.
//
// The line has slope sweepPagesPerByte and passes through a
// basis point at (sweepHeapLiveBasis, pagesSweptBasis). At
- // any given time, the system is at (memstats.heap_live,
+ // any given time, the system is at (gcController.heapLive,
// pagesSwept) in this space.
//
// It's important that the line pass through a point we
@@ -104,7 +105,7 @@ type mheap struct {
pagesInUse uint64 // pages of spans in stats mSpanInUse; updated atomically
pagesSwept uint64 // pages swept this cycle; updated atomically
pagesSweptBasis uint64 // pagesSwept to use as the origin of the sweep ratio; updated atomically
- sweepHeapLiveBasis uint64 // value of heap_live to use as the origin of sweep ratio; written with lock, read without
+ sweepHeapLiveBasis uint64 // value of gcController.heapLive to use as the origin of sweep ratio; written with lock, read without
sweepPagesPerByte float64 // proportional sweep ratio; written with lock, read without
// TODO(austin): pagesInUse should be a uintptr, but the 386
// compiler can't 8-byte align fields.
@@ -212,6 +213,7 @@ type mheap struct {
cachealloc fixalloc // allocator for mcache*
specialfinalizeralloc fixalloc // allocator for specialfinalizer*
specialprofilealloc fixalloc // allocator for specialprofile*
+ specialReachableAlloc fixalloc // allocator for specialReachable
speciallock mutex // lock for special record allocators.
arenaHintAlloc fixalloc // allocator for arenaHints
@@ -451,14 +453,11 @@ type mspan struct {
// h->sweepgen is incremented by 2 after every GC
sweepgen uint32
- divMul uint16 // for divide by elemsize - divMagic.mul
- baseMask uint16 // if non-0, elemsize is a power of 2, & this will get object allocation base
+ divMul uint32 // for divide by elemsize
allocCount uint16 // number of allocated objects
spanclass spanClass // size class and noscan (uint8)
state mSpanStateBox // mSpanInUse etc; accessed atomically (get/set methods)
needzero uint8 // needs to be zeroed before allocation
- divShift uint8 // for divide by elemsize - divMagic.shift
- divShift2 uint8 // for divide by elemsize - divMagic.shift2
elemsize uintptr // computed from sizeclass or from npages
limit uintptr // end of data in span
speciallock mutex // guards specials list
@@ -706,6 +705,7 @@ func (h *mheap) init() {
h.cachealloc.init(unsafe.Sizeof(mcache{}), nil, nil, &memstats.mcache_sys)
h.specialfinalizeralloc.init(unsafe.Sizeof(specialfinalizer{}), nil, nil, &memstats.other_sys)
h.specialprofilealloc.init(unsafe.Sizeof(specialprofile{}), nil, nil, &memstats.other_sys)
+ h.specialReachableAlloc.init(unsafe.Sizeof(specialReachable{}), nil, nil, &memstats.other_sys)
h.arenaHintAlloc.init(unsafe.Sizeof(arenaHint{}), nil, nil, &memstats.other_sys)
// Don't zero mspan allocations. Background sweeping can
@@ -818,7 +818,7 @@ func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr {
n0 := n
var nFreed uintptr
- sg := h.sweepgen
+ sl := newSweepLocker()
for n > 0 {
ai := arenas[pageIdx/pagesPerArena]
ha := h.arenas[ai.l1()][ai.l2()]
@@ -843,7 +843,7 @@ func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr {
for j := uint(0); j < 8; j++ {
if inUseUnmarked&(1<<j) != 0 {
s := ha.spans[arenaPage+uint(i)*8+j]
- if atomic.Load(&s.sweepgen) == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+ if s, ok := sl.tryAcquire(s); ok {
npages := s.npages
unlock(&h.lock)
if s.sweep(false) {
@@ -864,6 +864,7 @@ func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr {
pageIdx += uintptr(len(inUse) * 8)
n -= uintptr(len(inUse) * 8)
}
+ sl.dispose()
if trace.enabled {
unlock(&h.lock)
// Account for pages scanned but not reclaimed.
@@ -896,7 +897,9 @@ func (s spanAllocType) manual() bool {
// spanclass indicates the span's size class and scannability.
//
// If needzero is true, the memory for the returned span will be zeroed.
-func (h *mheap) alloc(npages uintptr, spanclass spanClass, needzero bool) *mspan {
+// The boolean returned indicates whether the returned span contains zeroes,
+// either because this was requested, or because it was already zeroed.
+func (h *mheap) alloc(npages uintptr, spanclass spanClass, needzero bool) (*mspan, bool) {
// Don't do any operations that lock the heap on the G stack.
// It might trigger stack growth, and the stack growth code needs
// to be able to allocate heap.
@@ -904,19 +907,22 @@ func (h *mheap) alloc(npages uintptr, spanclass spanClass, needzero bool) *mspan
systemstack(func() {
// To prevent excessive heap growth, before allocating n pages
// we need to sweep and reclaim at least n pages.
- if h.sweepdone == 0 {
+ if !isSweepDone() {
h.reclaim(npages)
}
s = h.allocSpan(npages, spanAllocHeap, spanclass)
})
- if s != nil {
- if needzero && s.needzero != 0 {
- memclrNoHeapPointers(unsafe.Pointer(s.base()), s.npages<<_PageShift)
- }
- s.needzero = 0
+ if s == nil {
+ return nil, false
}
- return s
+ isZeroed := s.needzero == 0
+ if needzero && !isZeroed {
+ memclrNoHeapPointers(unsafe.Pointer(s.base()), s.npages<<_PageShift)
+ isZeroed = true
+ }
+ s.needzero = 0
+ return s, isZeroed
}
// allocManual allocates a manually-managed span of npage pages.
@@ -1224,20 +1230,11 @@ HaveSpan:
if sizeclass := spanclass.sizeclass(); sizeclass == 0 {
s.elemsize = nbytes
s.nelems = 1
-
- s.divShift = 0
s.divMul = 0
- s.divShift2 = 0
- s.baseMask = 0
} else {
s.elemsize = uintptr(class_to_size[sizeclass])
s.nelems = nbytes / s.elemsize
-
- m := &class_to_divmagic[sizeclass]
- s.divShift = m.shift
- s.divMul = m.mul
- s.divShift2 = m.shift2
- s.baseMask = m.baseMask
+ s.divMul = class_to_divmagic[sizeclass]
}
// Initialize mark and allocation structures.
@@ -1332,6 +1329,10 @@ func (h *mheap) grow(npage uintptr) bool {
assertLockHeld(&h.lock)
// We must grow the heap in whole palloc chunks.
+ // We call sysMap below, but because we round up to
+ // pallocChunkPages, which is on the order of MiB
+ // (generally >= the huge page size), we won't be
+ // calling it too often.
ask := alignUp(npage, pallocChunkPages) * pageSize
totalGrowth := uintptr(0)
@@ -1358,6 +1359,17 @@ func (h *mheap) grow(npage uintptr) bool {
// remains of the current space and switch to
// the new space. This should be rare.
if size := h.curArena.end - h.curArena.base; size != 0 {
+ // Transition this space from Reserved to Prepared and mark it
+ // as released since we'll be able to start using it after updating
+ // the page allocator and releasing the lock at any time.
+ sysMap(unsafe.Pointer(h.curArena.base), size, &memstats.heap_sys)
+ // Update stats.
+ atomic.Xadd64(&memstats.heap_released, int64(size))
+ stats := memstats.heapStats.acquire()
+ atomic.Xaddint64(&stats.released, int64(size))
+ memstats.heapStats.release()
+ // Update the page allocator's structures to make this
+ // space ready for allocation.
h.pages.grow(h.curArena.base, size)
totalGrowth += size
}
@@ -1366,17 +1378,6 @@ func (h *mheap) grow(npage uintptr) bool {
h.curArena.end = uintptr(av) + asize
}
- // The memory just allocated counts as both released
- // and idle, even though it's not yet backed by spans.
- //
- // The allocation is always aligned to the heap arena
- // size which is always > physPageSize, so its safe to
- // just add directly to heap_released.
- atomic.Xadd64(&memstats.heap_released, int64(asize))
- stats := memstats.heapStats.acquire()
- atomic.Xaddint64(&stats.released, int64(asize))
- memstats.heapStats.release()
-
// Recalculate nBase.
// We know this won't overflow, because sysAlloc returned
// a valid region starting at h.curArena.base which is at
@@ -1387,6 +1388,23 @@ func (h *mheap) grow(npage uintptr) bool {
// Grow into the current arena.
v := h.curArena.base
h.curArena.base = nBase
+
+ // Transition the space we're going to use from Reserved to Prepared.
+ sysMap(unsafe.Pointer(v), nBase-v, &memstats.heap_sys)
+
+ // The memory just allocated counts as both released
+ // and idle, even though it's not yet backed by spans.
+ //
+ // The allocation is always aligned to the heap arena
+ // size which is always > physPageSize, so its safe to
+ // just add directly to heap_released.
+ atomic.Xadd64(&memstats.heap_released, int64(nBase-v))
+ stats := memstats.heapStats.acquire()
+ atomic.Xaddint64(&stats.released, int64(nBase-v))
+ memstats.heapStats.release()
+
+ // Update the page allocator's structures to make this
+ // space ready for allocation.
h.pages.grow(v, nBase-v)
totalGrowth += nBase - v
@@ -1640,6 +1658,9 @@ func (list *mSpanList) takeAll(other *mSpanList) {
const (
_KindSpecialFinalizer = 1
_KindSpecialProfile = 2
+ // _KindSpecialReachable is a special used for tracking
+ // reachability during testing.
+ _KindSpecialReachable = 3
// Note: The finalizer special must be first because if we're freeing
// an object, a finalizer special will cause the freeing operation
// to abort, and we want to keep the other special records around
@@ -1843,9 +1864,45 @@ func setprofilebucket(p unsafe.Pointer, b *bucket) {
}
}
-// Do whatever cleanup needs to be done to deallocate s. It has
-// already been unlinked from the mspan specials list.
-func freespecial(s *special, p unsafe.Pointer, size uintptr) {
+// specialReachable tracks whether an object is reachable on the next
+// GC cycle. This is used by testing.
+type specialReachable struct {
+ special special
+ done bool
+ reachable bool
+}
+
+// specialsIter helps iterate over specials lists.
+type specialsIter struct {
+ pprev **special
+ s *special
+}
+
+func newSpecialsIter(span *mspan) specialsIter {
+ return specialsIter{&span.specials, span.specials}
+}
+
+func (i *specialsIter) valid() bool {
+ return i.s != nil
+}
+
+func (i *specialsIter) next() {
+ i.pprev = &i.s.next
+ i.s = *i.pprev
+}
+
+// unlinkAndNext removes the current special from the list and moves
+// the iterator to the next special. It returns the unlinked special.
+func (i *specialsIter) unlinkAndNext() *special {
+ cur := i.s
+ i.s = cur.next
+ *i.pprev = i.s
+ return cur
+}
+
+// freeSpecial performs any cleanup on special s and deallocates it.
+// s must already be unlinked from the specials list.
+func freeSpecial(s *special, p unsafe.Pointer, size uintptr) {
switch s.kind {
case _KindSpecialFinalizer:
sf := (*specialfinalizer)(unsafe.Pointer(s))
@@ -1859,6 +1916,10 @@ func freespecial(s *special, p unsafe.Pointer, size uintptr) {
lock(&mheap_.speciallock)
mheap_.specialprofilealloc.free(unsafe.Pointer(sp))
unlock(&mheap_.speciallock)
+ case _KindSpecialReachable:
+ sp := (*specialReachable)(unsafe.Pointer(s))
+ sp.done = true
+ // The creator frees these.
default:
throw("bad special kind")
panic("not reached")
diff --git a/libgo/go/runtime/mkfastlog2table.go b/libgo/go/runtime/mkfastlog2table.go
index d6502923948..8d78a3923a0 100644
--- a/libgo/go/runtime/mkfastlog2table.go
+++ b/libgo/go/runtime/mkfastlog2table.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
// fastlog2Table contains log2 approximations for 5 binary digits.
diff --git a/libgo/go/runtime/mkpreempt.go b/libgo/go/runtime/mkpreempt.go
index 8644973a0cc..f82df934422 100644
--- a/libgo/go/runtime/mkpreempt.go
+++ b/libgo/go/runtime/mkpreempt.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
// mkpreempt generates the asyncPreempt functions for each
@@ -123,6 +124,7 @@ func header(arch string) {
fmt.Fprintf(out, "// Code generated by mkpreempt.go; DO NOT EDIT.\n\n")
if beLe[arch] {
base := arch[:len(arch)-1]
+ fmt.Fprintf(out, "//go:build %s || %sle\n", base, base)
fmt.Fprintf(out, "// +build %s %sle\n\n", base, base)
}
fmt.Fprintf(out, "#include \"go_asm.h\"\n")
@@ -230,12 +232,16 @@ func genAMD64() {
if reg == "SP" || reg == "BP" {
continue
}
- if strings.HasPrefix(reg, "X") {
- l.add("MOVUPS", reg, 16)
- } else {
+ if !strings.HasPrefix(reg, "X") {
l.add("MOVQ", reg, 8)
}
}
+ lSSE := layout{stack: l.stack, sp: "SP"}
+ for _, reg := range regNamesAMD64 {
+ if strings.HasPrefix(reg, "X") {
+ lSSE.add("MOVUPS", reg, 16)
+ }
+ }
// TODO: MXCSR register?
@@ -244,10 +250,12 @@ func genAMD64() {
p("// Save flags before clobbering them")
p("PUSHFQ")
p("// obj doesn't understand ADD/SUB on SP, but does understand ADJSP")
- p("ADJSP $%d", l.stack)
+ p("ADJSP $%d", lSSE.stack)
p("// But vet doesn't know ADJSP, so suppress vet stack checking")
p("NOP SP")
+ l.save()
+
// Apparently, the signal handling code path in darwin kernel leaves
// the upper bits of Y registers in a dirty state, which causes
// many SSE operations (128-bit and narrower) become much slower.
@@ -259,10 +267,11 @@ func genAMD64() {
p("VZEROUPPER")
p("#endif")
- l.save()
+ lSSE.save()
p("CALL ·asyncPreempt2(SB)")
+ lSSE.restore()
l.restore()
- p("ADJSP $%d", -l.stack)
+ p("ADJSP $%d", -lSSE.stack)
p("POPFQ")
p("POPQ BP")
p("RET")
diff --git a/libgo/go/runtime/mksizeclasses.go b/libgo/go/runtime/mksizeclasses.go
index b92d1fed5f8..b1b10e9e023 100644
--- a/libgo/go/runtime/mksizeclasses.go
+++ b/libgo/go/runtime/mksizeclasses.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ignore
// +build ignore
// Generate tables for small malloc size classes.
@@ -36,6 +37,8 @@ import (
"go/format"
"io"
"log"
+ "math"
+ "math/bits"
"os"
)
@@ -86,11 +89,6 @@ const (
type class struct {
size int // max size
npages int // number of pages
-
- mul int
- shift uint
- shift2 uint
- mask int
}
func powerOfTwo(x int) bool {
@@ -167,9 +165,9 @@ func makeClasses() []class {
return classes
}
-// computeDivMagic computes some magic constants to implement
-// the division required to compute object number from span offset.
-// n / c.size is implemented as n >> c.shift * c.mul >> c.shift2
+// computeDivMagic checks that the division required to compute object
+// index from span offset can be computed using 32-bit multiplication.
+// n / c.size is implemented as (n * (^uint32(0)/uint32(c.size) + 1)) >> 32
// for all 0 <= n <= c.npages * pageSize
func computeDivMagic(c *class) {
// divisor
@@ -181,68 +179,67 @@ func computeDivMagic(c *class) {
// maximum input value for which the formula needs to work.
max := c.npages * pageSize
+ // As reported in [1], if n and d are unsigned N-bit integers, we
+ // can compute n / d as ⌊n * c / 2^F⌋, where c is ⌈2^F / d⌉ and F is
+ // computed with:
+ //
+ // Algorithm 2: Algorithm to select the number of fractional bits
+ // and the scaled approximate reciprocal in the case of unsigned
+ // integers.
+ //
+ // if d is a power of two then
+ // Let F ← log₂(d) and c = 1.
+ // else
+ // Let F ← N + L where L is the smallest integer
+ // such that d ≤ (2^(N+L) mod d) + 2^L.
+ // end if
+ //
+ // [1] "Faster Remainder by Direct Computation: Applications to
+ // Compilers and Software Libraries" Daniel Lemire, Owen Kaser,
+ // Nathan Kurz arXiv:1902.01961
+ //
+ // To minimize the risk of introducing errors, we implement the
+ // algorithm exactly as stated, rather than trying to adapt it to
+ // fit typical Go idioms.
+ N := bits.Len(uint(max))
+ var F int
if powerOfTwo(d) {
- // If the size is a power of two, heapBitsForObject can divide even faster by masking.
- // Compute this mask.
- if max >= 1<<16 {
- panic("max too big for power of two size")
+ F = int(math.Log2(float64(d)))
+ if d != 1<<F {
+ panic("imprecise log2")
}
- c.mask = 1<<16 - d
- }
-
- // Compute pre-shift by factoring power of 2 out of d.
- for d%2 == 0 {
- c.shift++
- d >>= 1
- max >>= 1
- }
-
- // Find the smallest k that works.
- // A small k allows us to fit the math required into 32 bits
- // so we can use 32-bit multiplies and shifts on 32-bit platforms.
-nextk:
- for k := uint(0); ; k++ {
- mul := (int(1)<<k + d - 1) / d // ⌈2^k / d⌉
-
- // Test to see if mul works.
- for n := 0; n <= max; n++ {
- if n*mul>>k != n/d {
- continue nextk
+ } else {
+ for L := 0; ; L++ {
+ if d <= ((1<<(N+L))%d)+(1<<L) {
+ F = N + L
+ break
}
}
- if mul >= 1<<16 {
- panic("mul too big")
- }
- if uint64(mul)*uint64(max) >= 1<<32 {
- panic("mul*max too big")
- }
- c.mul = mul
- c.shift2 = k
- break
}
- // double-check.
+ // Also, as noted in the paper, F is the smallest number of fractional
+ // bits required. We use 32 bits, because it works for all size
+ // classes and is fast on all CPU architectures that we support.
+ if F > 32 {
+ fmt.Printf("d=%d max=%d N=%d F=%d\n", c.size, max, N, F)
+ panic("size class requires more than 32 bits of precision")
+ }
+
+ // Brute force double-check with the exact computation that will be
+ // done by the runtime.
+ m := ^uint32(0)/uint32(c.size) + 1
for n := 0; n <= max; n++ {
- if n*c.mul>>c.shift2 != n/d {
- fmt.Printf("d=%d max=%d mul=%d shift2=%d n=%d\n", d, max, c.mul, c.shift2, n)
- panic("bad multiply magic")
- }
- // Also check the exact computations that will be done by the runtime,
- // for both 32 and 64 bit operations.
- if uint32(n)*uint32(c.mul)>>uint8(c.shift2) != uint32(n/d) {
- fmt.Printf("d=%d max=%d mul=%d shift2=%d n=%d\n", d, max, c.mul, c.shift2, n)
+ if uint32((uint64(n)*uint64(m))>>32) != uint32(n/c.size) {
+ fmt.Printf("d=%d max=%d m=%d n=%d\n", d, max, m, n)
panic("bad 32-bit multiply magic")
}
- if uint64(n)*uint64(c.mul)>>uint8(c.shift2) != uint64(n/d) {
- fmt.Printf("d=%d max=%d mul=%d shift2=%d n=%d\n", d, max, c.mul, c.shift2, n)
- panic("bad 64-bit multiply magic")
- }
}
}
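
// The new divMul replaces the old shift/mul/shift2/baseMask magic with a
// single 32-bit reciprocal from the Lemire et al. paper cited above: for a
// size class of s bytes, objIndex(off) = uint32((uint64(off)*uint64(m))>>32)
// with m = ^uint32(0)/s + 1. A hypothetical, self-contained check of that
// identity for one assumed size class (64-byte objects in an 8 KiB span):

package main

import "fmt"

func main() {
	const size = 64           // bytes per object (assumed size class)
	const spanBytes = 8 << 10 // one page of objects
	m := ^uint32(0)/size + 1  // what class_to_divmagic now stores

	for off := uint64(0); off <= spanBytes; off++ {
		got := uint32((off * uint64(m)) >> 32)
		want := uint32(off / size)
		if got != want {
			fmt.Println("mismatch at offset", off, got, want)
			return
		}
	}
	fmt.Println("divMul =", m, "reproduces off/size for every span offset")
}
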
func printComment(w io.Writer, classes []class) {
- fmt.Fprintf(w, "// %-5s %-9s %-10s %-7s %-10s %-9s\n", "class", "bytes/obj", "bytes/span", "objects", "tail waste", "max waste")
+ fmt.Fprintf(w, "// %-5s %-9s %-10s %-7s %-10s %-9s %-9s\n", "class", "bytes/obj", "bytes/span", "objects", "tail waste", "max waste", "min align")
prevSize := 0
+ var minAligns [pageShift + 1]int
for i, c := range classes {
if i == 0 {
continue
@@ -251,8 +248,32 @@ func printComment(w io.Writer, classes []class) {
objects := spanSize / c.size
tailWaste := spanSize - c.size*(spanSize/c.size)
maxWaste := float64((c.size-prevSize-1)*objects+tailWaste) / float64(spanSize)
+ alignBits := bits.TrailingZeros(uint(c.size))
+ if alignBits > pageShift {
+ // object alignment is capped at page alignment
+ alignBits = pageShift
+ }
+ for i := range minAligns {
+ if i > alignBits {
+ minAligns[i] = 0
+ } else if minAligns[i] == 0 {
+ minAligns[i] = c.size
+ }
+ }
prevSize = c.size
- fmt.Fprintf(w, "// %5d %9d %10d %7d %10d %8.2f%%\n", i, c.size, spanSize, objects, tailWaste, 100*maxWaste)
+ fmt.Fprintf(w, "// %5d %9d %10d %7d %10d %8.2f%% %9d\n", i, c.size, spanSize, objects, tailWaste, 100*maxWaste, 1<<alignBits)
+ }
+ fmt.Fprintf(w, "\n")
+
+ fmt.Fprintf(w, "// %-9s %-4s %-12s\n", "alignment", "bits", "min obj size")
+ for bits, size := range minAligns {
+ if size == 0 {
+ break
+ }
+ if bits+1 < len(minAligns) && size == minAligns[bits+1] {
+ continue
+ }
+ fmt.Fprintf(w, "// %9d %4d %12d\n", 1<<bits, bits, size)
}
fmt.Fprintf(w, "\n")
}
@@ -279,15 +300,13 @@ func printClasses(w io.Writer, classes []class) {
}
fmt.Fprintln(w, "}")
- fmt.Fprintln(w, "type divMagic struct {")
- fmt.Fprintln(w, " shift uint8")
- fmt.Fprintln(w, " shift2 uint8")
- fmt.Fprintln(w, " mul uint16")
- fmt.Fprintln(w, " baseMask uint16")
- fmt.Fprintln(w, "}")
- fmt.Fprint(w, "var class_to_divmagic = [_NumSizeClasses]divMagic {")
+ fmt.Fprint(w, "var class_to_divmagic = [_NumSizeClasses]uint32 {")
for _, c := range classes {
- fmt.Fprintf(w, "{%d,%d,%d,%d},", c.shift, c.shift2, c.mul, c.mask)
+ if c.size == 0 {
+ fmt.Fprintf(w, "0,")
+ continue
+ }
+ fmt.Fprintf(w, "^uint32(0)/%d+1,", c.size)
}
fmt.Fprintln(w, "}")
diff --git a/libgo/go/runtime/mpagealloc.go b/libgo/go/runtime/mpagealloc.go
index dac1f399690..071f1fc2749 100644
--- a/libgo/go/runtime/mpagealloc.go
+++ b/libgo/go/runtime/mpagealloc.go
@@ -395,6 +395,9 @@ func (p *pageAlloc) grow(base, size uintptr) {
// Store it atomically to avoid races with readers which
// don't acquire the heap lock.
r := sysAlloc(unsafe.Sizeof(*p.chunks[0]), p.sysStat)
+ if r == nil {
+ throw("pageAlloc: out of memory")
+ }
atomic.StorepNoWB(unsafe.Pointer(&p.chunks[c.l1()]), r)
}
p.chunkOf(c).scavenged.setRange(0, pallocChunkPages)
diff --git a/libgo/go/runtime/mpagealloc_32bit.go b/libgo/go/runtime/mpagealloc_32bit.go
index e721c3848ea..07415461701 100644
--- a/libgo/go/runtime/mpagealloc_32bit.go
+++ b/libgo/go/runtime/mpagealloc_32bit.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build 386 || arm || mips || mipsle || wasm || (ios && arm64) || amd64p32 || armbe || m68k || mips64p32 || mips64p32le || nios2 || ppc || riscv || s390 || sh || shbe || sparc
// +build 386 arm mips mipsle wasm ios,arm64 amd64p32 armbe m68k mips64p32 mips64p32le nios2 ppc riscv s390 sh shbe sparc
// wasm is treated as a 32-bit architecture for the purposes of the page
diff --git a/libgo/go/runtime/mpagealloc_64bit.go b/libgo/go/runtime/mpagealloc_64bit.go
index 4d4257a8c0a..729a4eb9b79 100644
--- a/libgo/go/runtime/mpagealloc_64bit.go
+++ b/libgo/go/runtime/mpagealloc_64bit.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build amd64 || (!ios && arm64) || mips64 || mips64le || ppc64 || ppc64le || riscv64 || s390x || arm64be || alpha || sparc64 || ia64
// +build amd64 !ios,arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x arm64be alpha sparc64 ia64
// See mpagealloc_32bit.go for why ios/arm64 is excluded here.
diff --git a/libgo/go/runtime/mprof.go b/libgo/go/runtime/mprof.go
index afacf8fa70f..9b115973ee3 100644
--- a/libgo/go/runtime/mprof.go
+++ b/libgo/go/runtime/mprof.go
@@ -12,6 +12,10 @@ import (
"unsafe"
)
+// For gofrontend, use go:linkname for blockevent so that
+// runtime/pprof/pprof_test can call it.
+//go:linkname blockevent
+
// NOTE(rsc): Everything here could use cas if contention became an issue.
var proflock mutex
@@ -138,7 +142,7 @@ func (a *memRecordCycle) add(b *memRecordCycle) {
// A blockRecord is the bucket data for a bucket of type blockProfile,
// which is used in blocking and mutex profiles.
type blockRecord struct {
- count int64
+ count float64
cycles int64
}
@@ -431,20 +435,23 @@ func blockevent(cycles int64, skip int) {
if cycles <= 0 {
cycles = 1
}
- if blocksampled(cycles) {
- saveblockevent(cycles, skip+1, blockProfile)
+
+ rate := int64(atomic.Load64(&blockprofilerate))
+ if blocksampled(cycles, rate) {
+ saveblockevent(cycles, rate, skip+1, blockProfile)
}
}
-func blocksampled(cycles int64) bool {
- rate := int64(atomic.Load64(&blockprofilerate))
+// blocksampled returns true for all events where cycles >= rate. Shorter
+// events have a cycles/rate random chance of returning true.
+func blocksampled(cycles, rate int64) bool {
if rate <= 0 || (rate > cycles && int64(fastrand())%rate > cycles) {
return false
}
return true
}
-func saveblockevent(cycles int64, skip int, which bucketType) {
+func saveblockevent(cycles, rate int64, skip int, which bucketType) {
gp := getg()
var nstk int
var stk [maxStack]uintptr
@@ -457,8 +464,15 @@ func saveblockevent(cycles int64, skip int, which bucketType) {
}
lock(&proflock)
b := stkbucket(which, 0, skip, stk[:nstk], true)
- b.bp().count++
- b.bp().cycles += cycles
+
+ if which == blockProfile && cycles < rate {
+ // Remove sampling bias, see discussion on http://golang.org/cl/299991.
+ b.bp().count += float64(rate) / float64(cycles)
+ b.bp().cycles += rate
+ } else {
+ b.bp().count++
+ b.bp().cycles += cycles
+ }
unlock(&proflock)
}
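
// The bias fix above samples a short event (cycles < rate) with probability
// roughly cycles/rate and then weights the record by rate/cycles, so the
// expected contribution of every event is one count and `cycles` cycles
// regardless of duration. A hypothetical simulation of that estimator (not
// runtime code; fastrand is replaced by math/rand):

package main

import (
	"fmt"
	"math/rand"
)

func main() {
	const rate = 1000     // block profile rate, in cycles
	const events = 100000 // true number of blocking events
	const cycles = 100    // each simulated event blocks for 100 cycles

	var count float64 // estimated event count (now a float64 in blockRecord)
	var total int64   // estimated cycles
	for i := 0; i < events; i++ {
		if cycles >= rate || rand.Int63n(rate) < cycles {
			// Sampled: weight the record the way saveblockevent now does.
			count += float64(rate) / float64(cycles)
			total += rate
		}
	}
	// Both estimates land near the true values (100000 events, 10000000
	// cycles) even though only about 10% of the events were sampled.
	fmt.Printf("estimated events: %.0f, estimated cycles: %d\n", count, total)
}
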
@@ -489,7 +503,7 @@ func mutexevent(cycles int64, skip int) {
// TODO(pjw): measure impact of always calling fastrand vs using something
// like malloc.go:nextSample()
if rate > 0 && int64(fastrand())%rate == 0 {
- saveblockevent(cycles, skip+1, mutexProfile)
+ saveblockevent(cycles, rate, skip+1, mutexProfile)
}
}
@@ -525,7 +539,22 @@ func (r *StackRecord) Stack() []uintptr {
// memory profiling rate should do so just once, as early as
// possible in the execution of the program (for example,
// at the beginning of main).
-var MemProfileRate int = 512 * 1024
+var MemProfileRate int = defaultMemProfileRate(512 * 1024)
+
+// defaultMemProfileRate returns 0 if disableMemoryProfiling is set.
+// It exists primarily for the godoc rendering of MemProfileRate
+// above.
+func defaultMemProfileRate(v int) int {
+ if disableMemoryProfiling {
+ return 0
+ }
+ return v
+}
+
+// disableMemoryProfiling is set by the linker if runtime.MemProfile
+// is not used and the link type guarantees nobody else could use it
+// elsewhere.
+var disableMemoryProfiling bool
// A MemProfileRecord describes the live objects allocated
// by a particular call sequence (stack trace).
@@ -722,9 +751,8 @@ func fixupBucket(b *bucket) {
rawrecord := b.mp()
cb.mp().active.add(&rawrecord.active)
case blockProfile, mutexProfile:
- bpcount := b.bp().count
- cb.bp().count += bpcount
- cb.bp().cycles += bpcount
+ cb.bp().count += b.bp().count
+ cb.bp().cycles += b.bp().cycles
}
}
@@ -866,7 +894,12 @@ func harvestBlockMutexProfile(buckets *bucket, p []BlockProfileRecord) (n int, o
for b := sbuckets; b != nil; b = b.allnext {
bp := b.bp()
r := &p[0]
- r.Count = bp.count
+ r.Count = int64(bp.count)
+ // Prevent callers from having to worry about division by zero errors.
+ // See discussion on http://golang.org/cl/299991.
+ if r.Count == 0 {
+ r.Count = 1
+ }
r.Cycles = bp.cycles
i := 0
var pc uintptr
@@ -962,12 +995,13 @@ func goroutineProfileWithLabels(p []StackRecord, labels []unsafe.Pointer) (n int
stopTheWorld("profile")
+ // World is stopped, no locking required.
n = 1
- for _, gp1 := range allgs {
+ forEachGRace(func(gp1 *g) {
if isOK(gp1) {
n++
}
- }
+ })
if n <= len(p) {
ok = true
@@ -984,21 +1018,23 @@ func goroutineProfileWithLabels(p []StackRecord, labels []unsafe.Pointer) (n int
}
// Save other goroutines.
- for _, gp1 := range allgs {
+ forEachGRace(func(gp1 *g) {
if isOK(gp1) {
- if len(r) == 0 {
- // Should be impossible, but better to return a
- // truncated profile than to crash the entire process.
- break
- }
- saveg(gp1, &r[0])
- if labels != nil {
- lbl[0] = gp1.labels
- lbl = lbl[1:]
- }
- r = r[1:]
+ return
}
- }
+
+ if len(r) == 0 {
+ // Should be impossible, but better to return a
+ // truncated profile than to crash the entire process.
+ return
+ }
+ saveg(gp1, &r[0])
+ if labels != nil {
+ lbl[0] = gp1.labels
+ lbl = lbl[1:]
+ }
+ r = r[1:]
+ })
}
startTheWorld()
diff --git a/libgo/go/runtime/msan0.go b/libgo/go/runtime/msan0.go
index 374d13f30b5..b1096a6750e 100644
--- a/libgo/go/runtime/msan0.go
+++ b/libgo/go/runtime/msan0.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !msan
// +build !msan
// Dummy MSan support API, used when not built with -msan.
diff --git a/libgo/go/runtime/mstats.go b/libgo/go/runtime/mstats.go
index 488e5d1eba1..535b78a1cf6 100644
--- a/libgo/go/runtime/mstats.go
+++ b/libgo/go/runtime/mstats.go
@@ -8,6 +8,7 @@ package runtime
import (
"runtime/internal/atomic"
+ "runtime/internal/sys"
"unsafe"
)
@@ -62,12 +63,6 @@ type mstats struct {
// Statistics about the garbage collector.
- // next_gc is the goal heap_live for when next GC ends.
- // Set to ^uint64(0) if disabled.
- //
- // Read and written atomically, unless the world is stopped.
- next_gc uint64
-
// Protected by mheap or stopping the world during GC.
last_gc_unix uint64 // last gc (in unix time)
pause_total_ns uint64
@@ -92,69 +87,8 @@ type mstats struct {
_ [1 - _NumSizeClasses%2]uint32
last_gc_nanotime uint64 // last gc (monotonic time)
- tinyallocs uint64 // number of tiny allocations that didn't cause actual allocation; not exported to go directly
- last_next_gc uint64 // next_gc for the previous GC
last_heap_inuse uint64 // heap_inuse at mark termination of the previous GC
- // triggerRatio is the heap growth ratio that triggers marking.
- //
- // E.g., if this is 0.6, then GC should start when the live
- // heap has reached 1.6 times the heap size marked by the
- // previous cycle. This should be ≤ GOGC/100 so the trigger
- // heap size is less than the goal heap size. This is set
- // during mark termination for the next cycle's trigger.
- triggerRatio float64
-
- // gc_trigger is the heap size that triggers marking.
- //
- // When heap_live ≥ gc_trigger, the mark phase will start.
- // This is also the heap size by which proportional sweeping
- // must be complete.
- //
- // This is computed from triggerRatio during mark termination
- // for the next cycle's trigger.
- gc_trigger uint64
-
- // heap_live is the number of bytes considered live by the GC.
- // That is: retained by the most recent GC plus allocated
- // since then. heap_live <= alloc, since alloc includes unmarked
- // objects that have not yet been swept (and hence goes up as we
- // allocate and down as we sweep) while heap_live excludes these
- // objects (and hence only goes up between GCs).
- //
- // This is updated atomically without locking. To reduce
- // contention, this is updated only when obtaining a span from
- // an mcentral and at this point it counts all of the
- // unallocated slots in that span (which will be allocated
- // before that mcache obtains another span from that
- // mcentral). Hence, it slightly overestimates the "true" live
- // heap size. It's better to overestimate than to
- // underestimate because 1) this triggers the GC earlier than
- // necessary rather than potentially too late and 2) this
- // leads to a conservative GC rate rather than a GC rate that
- // is potentially too low.
- //
- // Reads should likewise be atomic (or during STW).
- //
- // Whenever this is updated, call traceHeapAlloc() and
- // gcController.revise().
- heap_live uint64
-
- // heap_scan is the number of bytes of "scannable" heap. This
- // is the live heap (as counted by heap_live), but omitting
- // no-scan objects and no-scan tails of objects.
- //
- // Whenever this is updated, call gcController.revise().
- //
- // Read and written atomically or with the world stopped.
- heap_scan uint64
-
- // heap_marked is the number of bytes marked by the previous
- // GC. After mark termination, heap_live == heap_marked, but
- // unlike heap_live, heap_marked does not change until the
- // next mark termination.
- heap_marked uint64
-
// heapStats is a set of statistics
heapStats consistentHeapStats
@@ -443,10 +377,6 @@ type MemStats struct {
}
func init() {
- if offset := unsafe.Offsetof(memstats.heap_live); offset%8 != 0 {
- println(offset)
- throw("memstats.heap_live not aligned to 8 bytes")
- }
if offset := unsafe.Offsetof(memstats.heapStats); offset%8 != 0 {
println(offset)
throw("memstats.heapStats not aligned to 8 bytes")
@@ -523,7 +453,7 @@ func readmemstats_m(stats *MemStats) {
// at a more granular level in the runtime.
stats.GCSys = memstats.gcMiscSys.load() + memstats.gcWorkBufInUse + memstats.gcProgPtrScalarBitsInUse
stats.OtherSys = memstats.other_sys.load()
- stats.NextGC = memstats.next_gc
+ stats.NextGC = gcController.heapGoal
stats.LastGC = memstats.last_gc_unix
stats.PauseTotalNs = memstats.pause_total_ns
stats.PauseNs = memstats.pause_ns
@@ -656,8 +586,8 @@ func updatememstats() {
}
// Account for tiny allocations.
- memstats.nfree += memstats.tinyallocs
- memstats.nmalloc += memstats.tinyallocs
+ memstats.nfree += uint64(consStats.tinyAllocCount)
+ memstats.nmalloc += uint64(consStats.tinyAllocCount)
// Calculate derived stats.
memstats.total_alloc = totalAlloc
@@ -772,6 +702,7 @@ type heapStatsDelta struct {
inPtrScalarBits int64 // byte delta of memory reserved for unrolled GC prog bits
// Allocator stats.
+ tinyAllocCount uintptr // number of tiny allocations
largeAlloc uintptr // bytes allocated for large objects
largeAllocCount uintptr // number of large object allocations
smallAllocCount [_NumSizeClasses]uintptr // number of allocs for small objects
@@ -781,7 +712,7 @@ type heapStatsDelta struct {
// Add a uint32 to ensure this struct is a multiple of 8 bytes in size.
// Only necessary on 32-bit platforms.
- // _ [(sys.PtrSize / 4) % 2]uint32
+ _ [(sys.PtrSize / 4) % 2]uint32
}
// merge adds in the deltas from b into a.
@@ -793,6 +724,7 @@ func (a *heapStatsDelta) merge(b *heapStatsDelta) {
a.inWorkBufs += b.inWorkBufs
a.inPtrScalarBits += b.inPtrScalarBits
+ a.tinyAllocCount += b.tinyAllocCount
a.largeAlloc += b.largeAlloc
a.largeAllocCount += b.largeAllocCount
for i := range b.smallAllocCount {
diff --git a/libgo/go/runtime/nbpipe_pipe.go b/libgo/go/runtime/nbpipe_pipe.go
index 822b2944dbf..b17257e9ec3 100644
--- a/libgo/go/runtime/nbpipe_pipe.go
+++ b/libgo/go/runtime/nbpipe_pipe.go
@@ -2,7 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build aix darwin dragonfly
+//go:build aix || darwin
+// +build aix darwin
package runtime
diff --git a/libgo/go/runtime/nbpipe_pipe2.go b/libgo/go/runtime/nbpipe_pipe2.go
index 76fb40c460a..34ead010a30 100644
--- a/libgo/go/runtime/nbpipe_pipe2.go
+++ b/libgo/go/runtime/nbpipe_pipe2.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build freebsd || hurd || linux || netbsd || openbsd || solaris
// +build freebsd hurd linux netbsd openbsd solaris
package runtime
diff --git a/libgo/go/runtime/nbpipe_test.go b/libgo/go/runtime/nbpipe_test.go
index d7c5d45c854..22d41cd1caa 100644
--- a/libgo/go/runtime/nbpipe_test.go
+++ b/libgo/go/runtime/nbpipe_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || dragonfly || freebsd || hurd || linux || netbsd || openbsd || solaris
// +build aix darwin dragonfly freebsd hurd linux netbsd openbsd solaris
package runtime_test
diff --git a/libgo/go/runtime/netpoll.go b/libgo/go/runtime/netpoll.go
index 879112a728d..82380f6a85b 100644
--- a/libgo/go/runtime/netpoll.go
+++ b/libgo/go/runtime/netpoll.go
@@ -26,6 +26,9 @@ import (
// Arm edge-triggered notifications for fd. The pd argument is to pass
// back to netpollready when fd is ready. Return an errno value.
//
+// func netpollclose(fd uintptr) int32
+// Disable notifications for fd. Return an errno value.
+//
// func netpoll(delta int64) gList
// Poll the network. If delta < 0, block indefinitely. If delta == 0,
// poll without blocking. If delta > 0, block for up to delta nanoseconds.
@@ -165,9 +168,12 @@ func poll_runtime_pollOpen(fd uintptr) (uintptr, int) {
pd.self = pd
unlock(&pd.lock)
- var errno int32
- errno = netpollopen(fd, pd)
- return uintptr(unsafe.Pointer(pd)), int(errno)
+ errno := netpollopen(fd, pd)
+ if errno != 0 {
+ pollcache.free(pd)
+ return 0, int(errno)
+ }
+ return uintptr(unsafe.Pointer(pd)), 0
}
//go:linkname poll_runtime_pollClose internal_1poll.runtime__pollClose
diff --git a/libgo/go/runtime/netpoll_epoll.go b/libgo/go/runtime/netpoll_epoll.go
index 9c5d33851cb..60a1dc00d3a 100644
--- a/libgo/go/runtime/netpoll_epoll.go
+++ b/libgo/go/runtime/netpoll_epoll.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux
// +build linux
package runtime
diff --git a/libgo/go/runtime/netpoll_fake.go b/libgo/go/runtime/netpoll_fake.go
index b2af3b89b2c..8366f289144 100644
--- a/libgo/go/runtime/netpoll_fake.go
+++ b/libgo/go/runtime/netpoll_fake.go
@@ -5,6 +5,7 @@
// Fake network poller for wasm/js.
// Should never be used, because wasm/js network connections do not honor "SetNonblock".
+//go:build js && wasm
// +build js,wasm
package runtime
diff --git a/libgo/go/runtime/netpoll_kqueue.go b/libgo/go/runtime/netpoll_kqueue.go
index 1724f37e732..1e1b110a74c 100644
--- a/libgo/go/runtime/netpoll_kqueue.go
+++ b/libgo/go/runtime/netpoll_kqueue.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build darwin || dragonfly || freebsd || netbsd || openbsd
// +build darwin dragonfly freebsd netbsd openbsd
package runtime
diff --git a/libgo/go/runtime/netpoll_stub.go b/libgo/go/runtime/netpoll_stub.go
index 3599f2d01b7..33ab8eba58c 100644
--- a/libgo/go/runtime/netpoll_stub.go
+++ b/libgo/go/runtime/netpoll_stub.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build plan9
// +build plan9
package runtime
diff --git a/libgo/go/runtime/norace_test.go b/libgo/go/runtime/norace_test.go
index e90128bb6d4..9ad5dde3829 100644
--- a/libgo/go/runtime/norace_test.go
+++ b/libgo/go/runtime/norace_test.go
@@ -3,6 +3,7 @@
// license that can be found in the LICENSE file.
// The file contains tests that cannot run under race detector for some reason.
+//go:build !race
// +build !race
package runtime_test
diff --git a/libgo/go/runtime/os_aix.go b/libgo/go/runtime/os_aix.go
index d902ae04dc9..390fefaf7d3 100644
--- a/libgo/go/runtime/os_aix.go
+++ b/libgo/go/runtime/os_aix.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix
// +build aix
package runtime
diff --git a/libgo/go/runtime/os_darwin.go b/libgo/go/runtime/os_darwin.go
index 58e0412600c..0ae716c42d7 100644
--- a/libgo/go/runtime/os_darwin.go
+++ b/libgo/go/runtime/os_darwin.go
@@ -4,7 +4,9 @@
package runtime
-import "unsafe"
+import (
+ "unsafe"
+)
type mOS struct {
initialized bool
@@ -115,10 +117,15 @@ func sigNoteWakeup(*note) {
// sigNoteSleep waits for a note created by sigNoteSetup to be woken.
func sigNoteSleep(*note) {
- entersyscallblock()
- var b byte
- read(sigNoteRead, unsafe.Pointer(&b), 1)
- exitsyscall()
+ for {
+ var b byte
+ entersyscallblock()
+ n := read(sigNoteRead, unsafe.Pointer(&b), 1)
+ exitsyscall()
+ if n != -_EINTR {
+ return
+ }
+ }
}
// BSD interface for threading.
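
// The new sigNoteSleep loop retries the read whenever it is interrupted by
// a signal (n == -_EINTR) instead of returning with nothing consumed. A
// hypothetical userspace sketch of the same retry shape for Unix-like
// systems, using the syscall package rather than the runtime's raw read:

package main

import (
	"fmt"
	"os"
	"syscall"
)

// readRetryEINTR keeps reissuing the read until it completes with anything
// other than EINTR, mirroring the loop added above.
func readRetryEINTR(fd int, buf []byte) (int, error) {
	for {
		n, err := syscall.Read(fd, buf)
		if err == syscall.EINTR {
			continue
		}
		return n, err
	}
}

func main() {
	r, w, _ := os.Pipe()
	w.Write([]byte{1})
	var b [1]byte
	n, err := readRetryEINTR(int(r.Fd()), b[:])
	fmt.Println(n, err) // 1 <nil>
}
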
diff --git a/libgo/go/runtime/os_gccgo.go b/libgo/go/runtime/os_gccgo.go
index 65d7aef165a..da12c35d26c 100644
--- a/libgo/go/runtime/os_gccgo.go
+++ b/libgo/go/runtime/os_gccgo.go
@@ -97,3 +97,11 @@ func setNonblock(fd int32) {
fcntlUintptr(uintptr(fd), _F_SETFL, flags|_O_NONBLOCK)
}
}
+
+// For gccgo this is in the C code.
+func osyield()
+
+//go:nosplit
+func osyield_no_g() {
+ osyield()
+}
diff --git a/libgo/go/runtime/os_js.go b/libgo/go/runtime/os_js.go
index 24261e88a20..52b64e76027 100644
--- a/libgo/go/runtime/os_js.go
+++ b/libgo/go/runtime/os_js.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build js && wasm
// +build js,wasm
package runtime
@@ -30,12 +31,22 @@ func wasmWrite(fd uintptr, p unsafe.Pointer, n int32)
func usleep(usec uint32)
+//go:nosplit
+func usleep_no_g(usec uint32) {
+ usleep(usec)
+}
+
func exitThread(wait *uint32)
type mOS struct{}
func osyield()
+//go:nosplit
+func osyield_no_g() {
+ osyield()
+}
+
const _SIGSEGV = 0xb
func sigpanic() {
diff --git a/libgo/go/runtime/os_linux_arm64.go b/libgo/go/runtime/os_linux_arm64.go
index 3844620b203..2b51a2a5176 100644
--- a/libgo/go/runtime/os_linux_arm64.go
+++ b/libgo/go/runtime/os_linux_arm64.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build arm64
// +build arm64
package runtime
diff --git a/libgo/go/runtime/os_linux_mips64x.go b/libgo/go/runtime/os_linux_mips64x.go
index f9059382757..bc85ab3b8f7 100644
--- a/libgo/go/runtime/os_linux_mips64x.go
+++ b/libgo/go/runtime/os_linux_mips64x.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux && (mips64 || mips64le)
// +build linux
// +build mips64 mips64le
diff --git a/libgo/go/runtime/os_linux_mipsx.go b/libgo/go/runtime/os_linux_mipsx.go
index 2bfd6f40cc9..be31f191f1f 100644
--- a/libgo/go/runtime/os_linux_mipsx.go
+++ b/libgo/go/runtime/os_linux_mipsx.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux && (mips || mipsle)
// +build linux
// +build mips mipsle
diff --git a/libgo/go/runtime/os_linux_noauxv.go b/libgo/go/runtime/os_linux_noauxv.go
index 895b4cd5f44..59b5aacaebc 100644
--- a/libgo/go/runtime/os_linux_noauxv.go
+++ b/libgo/go/runtime/os_linux_noauxv.go
@@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build linux
-// +build !arm,!arm64,!mips,!mipsle,!mips64,!mips64le,!s390x,!ppc64,!ppc64le
+//go:build linux && !arm && !arm64 && !mips && !mipsle && !mips64 && !mips64le && !s390x && !ppc64 && !ppc64le
+// +build linux,!arm,!arm64,!mips,!mipsle,!mips64,!mips64le,!s390x,!ppc64,!ppc64le
package runtime
diff --git a/libgo/go/runtime/os_linux_ppc64x.go b/libgo/go/runtime/os_linux_ppc64x.go
index cc79cc4a66c..43a75c71004 100644
--- a/libgo/go/runtime/os_linux_ppc64x.go
+++ b/libgo/go/runtime/os_linux_ppc64x.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux && (ppc64 || ppc64le)
// +build linux
// +build ppc64 ppc64le
diff --git a/libgo/go/runtime/os_only_solaris.go b/libgo/go/runtime/os_only_solaris.go
index e2f5409354d..3829683c807 100644
--- a/libgo/go/runtime/os_only_solaris.go
+++ b/libgo/go/runtime/os_only_solaris.go
@@ -4,6 +4,7 @@
// Solaris code that doesn't also apply to illumos.
+//go:build !illumos
// +build !illumos
package runtime
diff --git a/libgo/go/runtime/os_windows_arm64.go b/libgo/go/runtime/os_windows_arm64.go
new file mode 100644
index 00000000000..7e413445ba4
--- /dev/null
+++ b/libgo/go/runtime/os_windows_arm64.go
@@ -0,0 +1,14 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "unsafe"
+
+//go:nosplit
+func cputicks() int64 {
+ var counter int64
+ stdcall1(_QueryPerformanceCounter, uintptr(unsafe.Pointer(&counter)))
+ return counter
+}
diff --git a/libgo/go/runtime/panic32.go b/libgo/go/runtime/panic32.go
index fa314afc20a..a2bf7e8fa2a 100644
--- a/libgo/go/runtime/panic32.go
+++ b/libgo/go/runtime/panic32.go
@@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build 386 || arm || mips || mipsle || armbe || m68k || nios2 || ppc || riscv || s390 || sh || shbe || sparc
+// +build 386 arm mips mipsle armbe m68k nios2 ppc riscv s390 sh shbe sparc
+
package runtime
import _ "unsafe" // for go:linkname
diff --git a/libgo/go/runtime/pprof/mprof_test.go b/libgo/go/runtime/pprof/mprof_test.go
index 9551ea726f2..6a448a75614 100644
--- a/libgo/go/runtime/pprof/mprof_test.go
+++ b/libgo/go/runtime/pprof/mprof_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !js
// +build !js
package pprof
@@ -85,6 +86,17 @@ func TestMemoryProfiler(t *testing.T) {
runtime.GC() // materialize stats
+ // TODO(mknyszek): Fix #45315 and remove this extra call.
+ //
+ // Unfortunately, it's possible for the sweep termination condition
+ // to flap, so with just one runtime.GC call, a freed object could be
+ // missed, leading this test to fail. A second call reduces the chance
+ // of this happening to zero, because sweeping actually has to finish
+ // to move on to the next GC, during which nothing will happen.
+ //
+ // See #46500 for more details.
+ runtime.GC()
+
memoryProfilerRun++
tests := []struct {
@@ -93,33 +105,33 @@ func TestMemoryProfiler(t *testing.T) {
}{{
stk: []string{"runtime/pprof.allocatePersistent1K", "runtime/pprof.TestMemoryProfiler"},
legacy: fmt.Sprintf(`%v: %v \[%v: %v\] @ 0x[0-9,a-f x]+
-# 0x[0-9,a-f]+ runtime/pprof\.allocatePersistent1K\+0x[0-9,a-f]+ .*/mprof_test\.go:47
-# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test\.go:82
+# 0x[0-9,a-f]+ runtime/pprof\.allocatePersistent1K\+0x[0-9,a-f]+ .*/mprof_test\.go:48
+# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test\.go:83
`, 32*memoryProfilerRun, 1024*memoryProfilerRun, 32*memoryProfilerRun, 1024*memoryProfilerRun),
}, {
stk: []string{"runtime/pprof.allocateTransient1M", "runtime/pprof.TestMemoryProfiler"},
legacy: fmt.Sprintf(`(0|%v): (0|%v) \[%v: %v\] @ 0x[0-9,a-f x]+
-# 0x[0-9,a-f]+ runtime/pprof\.allocateTransient1M\+0x[0-9,a-f]+ .*/mprof_test.go:24
-# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test.go:79
+# 0x[0-9,a-f]+ runtime/pprof\.allocateTransient1M\+0x[0-9,a-f]+ .*/mprof_test.go:25
+# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test.go:80
`, (1<<10)*memoryProfilerRun, (1<<20)*memoryProfilerRun, (1<<10)*memoryProfilerRun, (1<<20)*memoryProfilerRun),
}, {
stk: []string{"runtime/pprof.allocateTransient2M", "runtime/pprof.TestMemoryProfiler"},
// This should start with "0: 0" but gccgo's imprecise
// GC means that sometimes the value is not collected.
legacy: fmt.Sprintf(`(0|%v): (0|%v) \[%v: %v\] @ 0x[0-9,a-f x]+
-# 0x[0-9,a-f]+ runtime/pprof\.allocateTransient2M\+0x[0-9,a-f]+ .*/mprof_test.go:30
-# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test.go:80
+# 0x[0-9,a-f]+ runtime/pprof\.allocateTransient2M\+0x[0-9,a-f]+ .*/mprof_test.go:31
+# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test.go:81
`, memoryProfilerRun, (2<<20)*memoryProfilerRun, memoryProfilerRun, (2<<20)*memoryProfilerRun),
}, {
stk: []string{"runtime/pprof.allocateTransient2MInline", "runtime/pprof.TestMemoryProfiler"},
legacy: fmt.Sprintf(`(0|%v): (0|%v) \[%v: %v\] @ 0x[0-9,a-f x]+
-# 0x[0-9,a-f]+ runtime/pprof\.allocateTransient2MInline\+0x[0-9,a-f]+ .*/mprof_test.go:34
-# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test.go:81
+# 0x[0-9,a-f]+ runtime/pprof\.allocateTransient2MInline\+0x[0-9,a-f]+ .*/mprof_test.go:35
+# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/mprof_test.go:82
`, memoryProfilerRun, (4<<20)*memoryProfilerRun, memoryProfilerRun, (4<<20)*memoryProfilerRun),
}, {
stk: []string{"runtime/pprof.allocateReflectTransient"},
legacy: fmt.Sprintf(`(0|%v): (0|%v) \[%v: %v\] @( 0x[0-9,a-f]+)+
-# 0x[0-9,a-f]+ runtime/pprof\.allocateReflectTransient\+0x[0-9,a-f]+ .*/mprof_test.go:55
+# 0x[0-9,a-f]+ runtime/pprof\.allocateReflectTransient\+0x[0-9,a-f]+ .*/mprof_test.go:56
`, memoryProfilerRun, (3<<20)*memoryProfilerRun, memoryProfilerRun, (3<<20)*memoryProfilerRun),
}}
diff --git a/libgo/go/runtime/pprof/pprof.go b/libgo/go/runtime/pprof/pprof.go
index 52df44a5a10..54838fcc50a 100644
--- a/libgo/go/runtime/pprof/pprof.go
+++ b/libgo/go/runtime/pprof/pprof.go
@@ -857,45 +857,7 @@ func countMutex() int {
// writeBlock writes the current blocking profile to w.
func writeBlock(w io.Writer, debug int) error {
- var p []runtime.BlockProfileRecord
- n, ok := runtime.BlockProfile(nil)
- for {
- p = make([]runtime.BlockProfileRecord, n+50)
- n, ok = runtime.BlockProfile(p)
- if ok {
- p = p[:n]
- break
- }
- }
-
- sort.Slice(p, func(i, j int) bool { return p[i].Cycles > p[j].Cycles })
-
- if debug <= 0 {
- return printCountCycleProfile(w, "contentions", "delay", scaleBlockProfile, p)
- }
-
- b := bufio.NewWriter(w)
- tw := tabwriter.NewWriter(w, 1, 8, 1, '\t', 0)
- w = tw
-
- fmt.Fprintf(w, "--- contention:\n")
- fmt.Fprintf(w, "cycles/second=%v\n", runtime_cyclesPerSecond())
- for i := range p {
- r := &p[i]
- fmt.Fprintf(w, "%v %v @", r.Cycles, r.Count)
- for _, pc := range r.Stack() {
- fmt.Fprintf(w, " %#x", pc)
- }
- fmt.Fprint(w, "\n")
- if debug > 0 {
- printStackRecord(w, r.Stack(), true)
- }
- }
-
- if tw != nil {
- tw.Flush()
- }
- return b.Flush()
+ return writeProfileInternal(w, debug, "contention", runtime.BlockProfile, scaleBlockProfile)
}
func scaleBlockProfile(cnt int64, ns float64) (int64, float64) {
@@ -908,12 +870,16 @@ func scaleBlockProfile(cnt int64, ns float64) (int64, float64) {
// writeMutex writes the current mutex profile to w.
func writeMutex(w io.Writer, debug int) error {
- // TODO(pjw): too much common code with writeBlock. FIX!
+ return writeProfileInternal(w, debug, "mutex", runtime.MutexProfile, scaleMutexProfile)
+}
+
+// writeProfileInternal writes the current blocking or mutex profile depending on the passed parameters
+func writeProfileInternal(w io.Writer, debug int, name string, runtimeProfile func([]runtime.BlockProfileRecord) (int, bool), scaleProfile func(int64, float64) (int64, float64)) error {
var p []runtime.BlockProfileRecord
- n, ok := runtime.MutexProfile(nil)
+ n, ok := runtimeProfile(nil)
for {
p = make([]runtime.BlockProfileRecord, n+50)
- n, ok = runtime.MutexProfile(p)
+ n, ok = runtimeProfile(p)
if ok {
p = p[:n]
break
@@ -923,16 +889,18 @@ func writeMutex(w io.Writer, debug int) error {
sort.Slice(p, func(i, j int) bool { return p[i].Cycles > p[j].Cycles })
if debug <= 0 {
- return printCountCycleProfile(w, "contentions", "delay", scaleMutexProfile, p)
+ return printCountCycleProfile(w, "contentions", "delay", scaleProfile, p)
}
b := bufio.NewWriter(w)
tw := tabwriter.NewWriter(w, 1, 8, 1, '\t', 0)
w = tw
- fmt.Fprintf(w, "--- mutex:\n")
+ fmt.Fprintf(w, "--- %v:\n", name)
fmt.Fprintf(w, "cycles/second=%v\n", runtime_cyclesPerSecond())
- fmt.Fprintf(w, "sampling period=%d\n", runtime.SetMutexProfileFraction(-1))
+ if name == "mutex" {
+ fmt.Fprintf(w, "sampling period=%d\n", runtime.SetMutexProfileFraction(-1))
+ }
for i := range p {
r := &p[i]
fmt.Fprintf(w, "%v %v @", r.Cycles, r.Count)
diff --git a/libgo/go/runtime/pprof/pprof_norusage.go b/libgo/go/runtime/pprof/pprof_norusage.go
index 6fdcc6cc38d..e175dd380cf 100644
--- a/libgo/go/runtime/pprof/pprof_norusage.go
+++ b/libgo/go/runtime/pprof/pprof_norusage.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !darwin && !linux
// +build !darwin,!linux
package pprof
diff --git a/libgo/go/runtime/pprof/pprof_rusage.go b/libgo/go/runtime/pprof/pprof_rusage.go
index 79546738119..269f21bc2f9 100644
--- a/libgo/go/runtime/pprof/pprof_rusage.go
+++ b/libgo/go/runtime/pprof/pprof_rusage.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build darwin || linux
// +build darwin linux
package pprof
diff --git a/libgo/go/runtime/pprof/pprof_test.go b/libgo/go/runtime/pprof/pprof_test.go
index 73d7aaaf9f8..ab96b0ced0b 100644
--- a/libgo/go/runtime/pprof/pprof_test.go
+++ b/libgo/go/runtime/pprof/pprof_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !js
// +build !js
package pprof
@@ -13,6 +14,7 @@ import (
"internal/profile"
"internal/testenv"
"io"
+ "math"
"math/big"
"os"
"os/exec"
@@ -23,6 +25,7 @@ import (
"sync/atomic"
"testing"
"time"
+ _ "unsafe"
)
func cpuHogger(f func(x int) int, y *int, dur time.Duration) {
@@ -257,40 +260,43 @@ func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Loca
return p
}
+func cpuProfilingBroken() bool {
+ switch runtime.GOOS {
+ case "plan9":
+ // Profiling unimplemented.
+ return true
+ case "aix":
+ // See https://golang.org/issue/45170.
+ return true
+ case "ios", "dragonfly", "netbsd", "illumos", "solaris":
+ // See https://golang.org/issue/13841.
+ return true
+ case "openbsd":
+ if runtime.GOARCH == "arm" || runtime.GOARCH == "arm64" {
+ // See https://golang.org/issue/13841.
+ return true
+ }
+ }
+
+ return false
+}
+
// testCPUProfile runs f under the CPU profiler, checking for some conditions specified by need,
// as interpreted by matches, and returns the parsed profile.
func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []string, f func(dur time.Duration)) *profile.Profile {
switch runtime.GOOS {
- case "darwin", "ios":
- switch runtime.GOARCH {
- case "arm64":
- // nothing
- default:
- out, err := exec.Command("uname", "-a").CombinedOutput()
- if err != nil {
- t.Fatal(err)
- }
- vers := string(out)
- t.Logf("uname -a: %v", vers)
+ case "darwin":
+ out, err := exec.Command("uname", "-a").CombinedOutput()
+ if err != nil {
+ t.Fatal(err)
}
+ vers := string(out)
+ t.Logf("uname -a: %v", vers)
case "plan9":
t.Skip("skipping on plan9")
}
- broken := false
- switch runtime.GOOS {
- // See https://golang.org/issue/45170 for AIX.
- case "darwin", "ios", "dragonfly", "netbsd", "illumos", "solaris", "aix":
- broken = true
- case "openbsd":
- if runtime.GOARCH == "arm" || runtime.GOARCH == "arm64" {
- broken = true
- }
- case "windows":
- if runtime.GOARCH == "arm" {
- broken = true // See https://golang.org/issues/42862
- }
- }
+ broken := cpuProfilingBroken()
maxDuration := 5 * time.Second
if testing.Short() && broken {
@@ -524,8 +530,10 @@ func TestGoroutineSwitch(t *testing.T) {
}
StopCPUProfile()
- // Read profile to look for entries for runtime.gogo with an attempt at a traceback.
- // The special entry
+ // Read profile to look for entries for gogo with an attempt at a traceback.
+ // "runtime.gogo" is OK, because that's the part of the context switch
+ // before the actual switch begins. But we should not see "gogo",
+ // aka "gogo<>(SB)", which does the actual switch and is marked SPWRITE.
parseProfile(t, prof.Bytes(), func(count uintptr, stk []*profile.Location, _ map[string][]string) {
// An entry with two frames with 'System' in its top frame
// exists to record a PC without a traceback. Those are okay.
@@ -536,13 +544,19 @@ func TestGoroutineSwitch(t *testing.T) {
}
}
- // Otherwise, should not see runtime.gogo.
+ // An entry with just one frame is OK too:
+ // it knew to stop at gogo.
+ if len(stk) == 1 {
+ return
+ }
+
+ // Otherwise, should not see gogo.
// The place we'd see it would be the inner most frame.
name := stk[0].Line[0].Function.Name
- if name == "runtime.gogo" {
+ if name == "gogo" {
var buf bytes.Buffer
fprintStack(&buf, stk)
- t.Fatalf("found profile entry for runtime.gogo:\n%s", buf.String())
+ t.Fatalf("found profile entry for gogo:\n%s", buf.String())
}
})
}
@@ -616,17 +630,20 @@ func TestMorestack(t *testing.T) {
//go:noinline
func growstack1() {
- growstack()
+ growstack(10)
}
//go:noinline
-func growstack() {
- var buf [8 << 10]byte
+func growstack(n int) {
+ var buf [8 << 18]byte
use(buf)
+ if n > 0 {
+ growstack(n - 1)
+ }
}
//go:noinline
-func use(x [8 << 10]byte) {}
+func use(x [8 << 18]byte) {}
func TestBlockProfile(t *testing.T) {
t.Skip("lots of details are different for gccgo; FIXME")
@@ -905,6 +922,74 @@ func blockCond() {
mu.Unlock()
}
+// See http://golang.org/cl/299991.
+func TestBlockProfileBias(t *testing.T) {
+ rate := int(1000) // arbitrary value
+ runtime.SetBlockProfileRate(rate)
+ defer runtime.SetBlockProfileRate(0)
+
+ // simulate blocking events
+ blockFrequentShort(rate)
+ blockInfrequentLong(rate)
+
+ var w bytes.Buffer
+ Lookup("block").WriteTo(&w, 0)
+ p, err := profile.Parse(&w)
+ if err != nil {
+ t.Fatalf("failed to parse profile: %v", err)
+ }
+ t.Logf("parsed proto: %s", p)
+
+ il := float64(-1) // blockInfrequentLong duration
+ fs := float64(-1) // blockFrequentShort duration
+ for _, s := range p.Sample {
+ for _, l := range s.Location {
+ for _, line := range l.Line {
+ if len(s.Value) < 2 {
+ t.Fatal("block profile has less than 2 sample types")
+ }
+
+ if line.Function.Name == "runtime/pprof.blockInfrequentLong" {
+ il = float64(s.Value[1])
+ } else if line.Function.Name == "runtime/pprof.blockFrequentShort" {
+ fs = float64(s.Value[1])
+ }
+ }
+ }
+ }
+ if il == -1 || fs == -1 {
+ t.Fatal("block profile is missing expected functions")
+ }
+
+ // stddev of bias from 100 runs on local machine multiplied by 10x
+ const threshold = 0.2
+ if bias := (il - fs) / il; math.Abs(bias) > threshold {
+ t.Fatalf("bias: abs(%f) > %f", bias, threshold)
+ } else {
+ t.Logf("bias: abs(%f) < %f", bias, threshold)
+ }
+}
+
+// blockFrequentShort produces 100000 block events with an average duration of
+// rate / 10.
+func blockFrequentShort(rate int) {
+ for i := 0; i < 100000; i++ {
+ blockevent(int64(rate/10), 1)
+ }
+}
+
+// blockInfrequentLong produces 10000 block events with an average duration of
+// rate.
+func blockInfrequentLong(rate int) {
+ for i := 0; i < 10000; i++ {
+ blockevent(int64(rate), 1)
+ }
+}
+
+// Used by TestBlockProfileBias.
+//go:linkname blockevent runtime.blockevent
+func blockevent(cycles int64, skip int)
+
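
Both helpers record the same cumulative delay (100000 events of rate/10 versus 10000 events of rate, i.e. 10000*rate cycles each), which is why the test expects the bias between the two samples to stay near zero. They drive the profiler through the runtime-internal blockevent hook; in ordinary code the same "block" profile is produced through the public API, roughly as in this sketch (the mutex contention here is only an illustration):

package main

import (
	"os"
	"runtime"
	"runtime/pprof"
	"sync"
	"time"
)

func main() {
	// Sample every blocking event; larger rates sample proportionally fewer.
	runtime.SetBlockProfileRate(1)
	defer runtime.SetBlockProfileRate(0)

	var mu sync.Mutex
	mu.Lock()
	go func() {
		time.Sleep(10 * time.Millisecond)
		mu.Unlock()
	}()
	mu.Lock() // blocks for ~10ms; recorded in the "block" profile
	mu.Unlock()

	f, err := os.Create("block.pprof")
	if err != nil {
		panic(err)
	}
	defer f.Close()
	if err := pprof.Lookup("block").WriteTo(f, 0); err != nil {
		panic(err)
	}
}
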
func TestMutexProfile(t *testing.T) {
// Generate mutex profile
diff --git a/libgo/go/runtime/preempt_nonwindows.go b/libgo/go/runtime/preempt_nonwindows.go
index 3066a1521eb..365e86a611a 100644
--- a/libgo/go/runtime/preempt_nonwindows.go
+++ b/libgo/go/runtime/preempt_nonwindows.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !windows
// +build !windows
package runtime
diff --git a/libgo/go/runtime/print.go b/libgo/go/runtime/print.go
index be51ac0c0c2..d141061c158 100644
--- a/libgo/go/runtime/print.go
+++ b/libgo/go/runtime/print.go
@@ -238,13 +238,15 @@ func printint(v int64) {
printuint(uint64(v))
}
+var minhexdigits = 0 // protected by printlock
+
func printhex(v uint64) {
const dig = "0123456789abcdef"
var buf [100]byte
i := len(buf)
for i--; i > 0; i-- {
buf[i] = dig[v%16]
- if v < 16 {
+ if v < 16 && len(buf)-i >= minhexdigits {
break
}
v /= 16
@@ -287,29 +289,16 @@ func printiface(i iface) {
// and should return a character mark to appear just before that
// word's value. It can return 0 to indicate no mark.
func hexdumpWords(p, end uintptr, mark func(uintptr) byte) {
- p1 := func(x uintptr) {
- var buf [2 * sys.PtrSize]byte
- for i := len(buf) - 1; i >= 0; i-- {
- if x&0xF < 10 {
- buf[i] = byte(x&0xF) + '0'
- } else {
- buf[i] = byte(x&0xF) - 10 + 'a'
- }
- x >>= 4
- }
- gwrite(buf[:])
- }
-
printlock()
var markbuf [1]byte
markbuf[0] = ' '
+ minhexdigits = int(unsafe.Sizeof(uintptr(0)) * 2)
for i := uintptr(0); p+i < end; i += sys.PtrSize {
if i%16 == 0 {
if i != 0 {
println()
}
- p1(p + i)
- print(": ")
+ print(hex(p+i), ": ")
}
if mark != nil {
@@ -320,16 +309,17 @@ func hexdumpWords(p, end uintptr, mark func(uintptr) byte) {
}
gwrite(markbuf[:])
val := *(*uintptr)(unsafe.Pointer(p + i))
- p1(val)
+ print(hex(val))
print(" ")
// Can we symbolize val?
name, _, _, _ := funcfileline(val, -1, false)
if name != "" {
entry := funcentry(val)
- print("<", name, "+", val-entry, "> ")
+ print("<", name, "+", hex(val-entry), "> ")
}
}
+ minhexdigits = 0
println()
printunlock()
}
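
The rewrite above drops the hand-rolled p1 helper: hexdumpWords now prints through hex() while minhexdigits temporarily forces every value out to the full pointer width. The runtime cannot use fmt, but a user-level analogue of that fixed-width formatting is a one-liner; the values here are purely illustrative.

package main

import (
	"fmt"
	"unsafe"
)

func main() {
	v := uintptr(0x1f)
	// Two hex digits per byte of a pointer: 16 digits on 64-bit targets.
	width := 2 * int(unsafe.Sizeof(uintptr(0)))
	fmt.Printf("%0*x\n", width, v) // prints 000000000000001f on 64-bit
}
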
diff --git a/libgo/go/runtime/proc.go b/libgo/go/runtime/proc.go
index 037e7798a38..05342905ff1 100644
--- a/libgo/go/runtime/proc.go
+++ b/libgo/go/runtime/proc.go
@@ -68,8 +68,6 @@ func main_init()
//extern main.main
func main_main()
-var buildVersion = sys.TheVersion
-
// set using cmd/go/internal/modload.ModInfoProg
var modinfo string
@@ -108,33 +106,64 @@ var modinfo string
// any work to do.
//
// The current approach:
-// We unpark an additional thread when we ready a goroutine if (1) there is an
-// idle P and there are no "spinning" worker threads. A worker thread is considered
-// spinning if it is out of local work and did not find work in global run queue/
-// netpoller; the spinning state is denoted in m.spinning and in sched.nmspinning.
-// Threads unparked this way are also considered spinning; we don't do goroutine
-// handoff so such threads are out of work initially. Spinning threads do some
-// spinning looking for work in per-P run queues before parking. If a spinning
+//
+// This approach applies to three primary sources of potential work: readying a
+// goroutine, new/modified-earlier timers, and idle-priority GC. See below for
+// additional details.
+//
+// We unpark an additional thread when we submit work if (this is wakep()):
+// 1. There is an idle P, and
+// 2. There are no "spinning" worker threads.
+//
+// A worker thread is considered spinning if it is out of local work and did
+// not find work in the global run queue or netpoller; the spinning state is
+// denoted in m.spinning and in sched.nmspinning. Threads unparked this way are
+// also considered spinning; we don't do goroutine handoff so such threads are
+// out of work initially. Spinning threads spin on looking for work in per-P
+// run queues and timer heaps or from the GC before parking. If a spinning
// thread finds work it takes itself out of the spinning state and proceeds to
-// execution. If it does not find work it takes itself out of the spinning state
-// and then parks.
-// If there is at least one spinning thread (sched.nmspinning>1), we don't unpark
-// new threads when readying goroutines. To compensate for that, if the last spinning
-// thread finds work and stops spinning, it must unpark a new spinning thread.
-// This approach smooths out unjustified spikes of thread unparking,
-// but at the same time guarantees eventual maximal CPU parallelism utilization.
+// execution. If it does not find work it takes itself out of the spinning
+// state and then parks.
+//
+// If there is at least one spinning thread (sched.nmspinning>1), we don't
+// unpark new threads when submitting work. To compensate for that, if the last
+// spinning thread finds work and stops spinning, it must unpark a new spinning
+// thread. This approach smooths out unjustified spikes of thread unparking,
+// but at the same time guarantees eventual maximal CPU parallelism
+// utilization.
+//
+// The main implementation complication is that we need to be very careful
+// during spinning->non-spinning thread transition. This transition can race
+// with submission of new work, and either one part or another needs to unpark
+// another worker thread. If they both fail to do that, we can end up with
+// semi-persistent CPU underutilization.
//
-// The main implementation complication is that we need to be very careful during
-// spinning->non-spinning thread transition. This transition can race with submission
-// of a new goroutine, and either one part or another needs to unpark another worker
-// thread. If they both fail to do that, we can end up with semi-persistent CPU
-// underutilization. The general pattern for goroutine readying is: submit a goroutine
-// to local work queue, #StoreLoad-style memory barrier, check sched.nmspinning.
-// The general pattern for spinning->non-spinning transition is: decrement nmspinning,
-// #StoreLoad-style memory barrier, check all per-P work queues for new work.
-// Note that all this complexity does not apply to global run queue as we are not
-// sloppy about thread unparking when submitting to global queue. Also see comments
-// for nmspinning manipulation.
+// The general pattern for submission is:
+// 1. Submit work to the local run queue, timer heap, or GC state.
+// 2. #StoreLoad-style memory barrier.
+// 3. Check sched.nmspinning.
+//
+// The general pattern for spinning->non-spinning transition is:
+// 1. Decrement nmspinning.
+// 2. #StoreLoad-style memory barrier.
+// 3. Check all per-P work queues and GC for new work.
+//
+// Note that all this complexity does not apply to global run queue as we are
+// not sloppy about thread unparking when submitting to global queue. Also see
+// comments for nmspinning manipulation.
+//
+// How these different sources of work behave varies, though it doesn't affect
+// the synchronization approach:
+// * Ready goroutine: this is an obvious source of work; the goroutine is
+// immediately ready and must run on some thread eventually.
+// * New/modified-earlier timer: The current timer implementation (see time.go)
+// uses netpoll in a thread with no work available to wait for the soonest
+// timer. If there is no thread waiting, we want a new spinning thread to go
+// wait.
+// * Idle-priority GC: The GC wakes a stopped idle thread to contribute to
+// background GC work (note: currently disabled per golang.org/issue/19112).
+// Also see golang.org/issue/44313, as this should be extended to all GC
+// workers.
var (
m0 m
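
The rewritten comment above pins down the two orderings that prevent a lost wakeup: a submitter must publish its work, issue a #StoreLoad-style barrier, then check sched.nmspinning; a spinning thread must decrement nmspinning, barrier, then recheck every work source. The sketch below mimics that shape with a toy work pool, using a mutex plus sync/atomic in place of the runtime's explicit barrier. None of the identifiers come from the patch, and this is an illustration of the ordering only, not the scheduler's actual code.

package main

import (
	"sync"
	"sync/atomic"
)

type pool struct {
	mu       sync.Mutex
	cond     *sync.Cond
	queue    []func()
	spinning int32 // toy analogue of sched.nmspinning
}

// submit: (1) publish the work, (2) barrier, (3) check for spinners, waking a
// parked worker only when there are none (the wakep() step).
func (p *pool) submit(fn func()) {
	p.mu.Lock()
	p.queue = append(p.queue, fn)
	p.mu.Unlock()
	// The unlock above plus this atomic load stand in for the runtime's
	// #StoreLoad-style barrier between "publish work" and "check spinners".
	if atomic.LoadInt32(&p.spinning) == 0 {
		p.cond.Signal()
	}
}

// spinDown: (1) drop out of spinning, (2) barrier, (3) recheck for work.
// It reports whether the worker may park; if work raced in, the worker must
// restore its spinning state and retry, as findrunnable does before goto top.
func (p *pool) spinDown() (mayPark bool) {
	atomic.AddInt32(&p.spinning, -1)
	p.mu.Lock()
	empty := len(p.queue) == 0
	p.mu.Unlock()
	if !empty {
		atomic.AddInt32(&p.spinning, 1)
		return false
	}
	return true
}

func main() {
	p := &pool{spinning: 1} // pretend one worker is already spinning
	p.cond = sync.NewCond(&p.mu)
	p.submit(func() {})
	_ = p.spinDown()
}
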
@@ -514,8 +543,8 @@ var (
allglock mutex
allgs []*g
- // allglen and allgptr are atomic variables that contain len(allg) and
- // &allg[0] respectively. Proper ordering depends on totally-ordered
+ // allglen and allgptr are atomic variables that contain len(allgs) and
+ // &allgs[0] respectively. Proper ordering depends on totally-ordered
// loads and stores. Writes are protected by allglock.
//
// allgptr is updated before allglen. Readers should read allglen
@@ -556,6 +585,30 @@ func atomicAllGIndex(ptr **g, i uintptr) *g {
return *(**g)(add(unsafe.Pointer(ptr), i*sys.PtrSize))
}
+// forEachG calls fn on every G from allgs.
+//
+// forEachG takes a lock to exclude concurrent addition of new Gs.
+func forEachG(fn func(gp *g)) {
+ lock(&allglock)
+ for _, gp := range allgs {
+ fn(gp)
+ }
+ unlock(&allglock)
+}
+
+// forEachGRace calls fn on every G from allgs.
+//
+// forEachGRace avoids locking, but does not exclude addition of new Gs during
+// execution, which may be missed.
+func forEachGRace(fn func(gp *g)) {
+ ptr, length := atomicAllG()
+ for i := uintptr(0); i < length; i++ {
+ gp := atomicAllGIndex(ptr, i)
+ fn(gp)
+ }
+ return
+}
+
const (
// Number of goroutine ids to grab from sched.goidgen to local per-P cache at once.
// 16 seems to provide enough amortization, but other than that it's mostly arbitrary number.
@@ -649,6 +702,11 @@ func schedinit() {
sigsave(&_g_.m.sigmask)
initSigmask = _g_.m.sigmask
+ if offset := unsafe.Offsetof(sched.timeToRun); offset%8 != 0 {
+ println(offset)
+ throw("sched.timeToRun not aligned to 8 bytes")
+ }
+
goargs()
goenvs()
parsedebugvars()
@@ -925,6 +983,37 @@ func casgstatus(gp *g, oldval, newval uint32) {
nextYield = nanotime() + yieldDelay/2
}
}
+
+ // Handle tracking for scheduling latencies.
+ if oldval == _Grunning {
+ // Track every 8th time a goroutine transitions out of running.
+ if gp.trackingSeq%gTrackingPeriod == 0 {
+ gp.tracking = true
+ }
+ gp.trackingSeq++
+ }
+ if gp.tracking {
+ now := nanotime()
+ if oldval == _Grunnable {
+ // We transitioned out of runnable, so measure how much
+ // time we spent in this state and add it to
+ // runnableTime.
+ gp.runnableTime += now - gp.runnableStamp
+ gp.runnableStamp = 0
+ }
+ if newval == _Grunnable {
+ // We just transitioned into runnable, so record what
+ // time that happened.
+ gp.runnableStamp = now
+ } else if newval == _Grunning {
+ // We're transitioning into running, so turn off
+ // tracking and record how much time we spent in
+ // runnable.
+ gp.tracking = false
+ sched.timeToRun.record(gp.runnableTime)
+ gp.runnableTime = 0
+ }
+ }
}
// casGToPreemptScan transitions gp from _Grunning to _Gscan|_Gpreempted.
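
The branch added above is what feeds sched.timeToRun (declared later in this patch, in runtime2.go): every gTrackingPeriod-th goroutine has its time in _Grunnable accumulated and recorded once it reaches _Grunning. In upstream Go 1.17 that histogram is surfaced through runtime/metrics as "/sched/latencies:seconds"; assuming the same export here, user code can read it roughly like this:

package main

import (
	"fmt"
	"runtime/metrics"
)

func main() {
	const name = "/sched/latencies:seconds" // assumed metric name (upstream Go 1.17)
	samples := []metrics.Sample{{Name: name}}
	metrics.Read(samples)

	if samples[0].Value.Kind() != metrics.KindFloat64Histogram {
		fmt.Println("scheduling latency metric not supported by this runtime")
		return
	}
	h := samples[0].Value.Float64Histogram()
	// h.Buckets holds len(h.Counts)+1 boundaries; print non-empty buckets.
	for i, c := range h.Counts {
		if c != 0 {
			fmt.Printf("[%g, %g) s: %d\n", h.Buckets[i], h.Buckets[i+1], c)
		}
	}
}
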
@@ -1235,6 +1324,9 @@ func kickoff() {
goexit1()
}
+// The go:noinline is to guarantee the getcallerpc/getcallersp below are safe,
+// so that we can set up g0.sched to return to the call of mstart1 above.
+//go:noinline
func mstart1() {
_g_ := getg()
@@ -1370,6 +1462,8 @@ found:
}
unlock(&sched.lock)
+ atomic.Xadd64(&ncgocall, int64(m.ncgocall))
+
// Release the P.
handoffp(releasep())
// After this point we must not have write barriers.
@@ -1661,6 +1755,10 @@ func needm() {
// Store the original signal mask for use by minit.
mp.sigmask = sigmask
+ // Install TLS on some platforms (previously setg
+ // would do this if necessary).
+ osSetupTLS(mp)
+
// Install g (= m->curg).
setg(mp.curg)
@@ -1822,7 +1920,7 @@ func lockextra(nilokay bool) *m {
for {
old := atomic.Loaduintptr(&extram)
if old == locked {
- osyield()
+ osyield_no_g()
continue
}
if old == 0 && !nilokay {
@@ -1833,13 +1931,13 @@ func lockextra(nilokay bool) *m {
atomic.Xadd(&extraMWaiters, 1)
incr = true
}
- usleep(1)
+ usleep_no_g(1)
continue
}
if atomic.Casuintptr(&extram, old, locked) {
return (*m)(unsafe.Pointer(old))
}
- osyield()
+ osyield_no_g()
continue
}
}
@@ -2417,85 +2515,40 @@ top:
}
}
- // Steal work from other P's.
+ // Spinning Ms: steal work from other Ps.
+ //
+ // Limit the number of spinning Ms to half the number of busy Ps.
+ // This is necessary to prevent excessive CPU consumption when
+ // GOMAXPROCS>>1 but the program parallelism is low.
procs := uint32(gomaxprocs)
- ranTimer := false
- // If number of spinning M's >= number of busy P's, block.
- // This is necessary to prevent excessive CPU consumption
- // when GOMAXPROCS>>1 but the program parallelism is low.
- if !_g_.m.spinning && 2*atomic.Load(&sched.nmspinning) >= procs-atomic.Load(&sched.npidle) {
- goto stop
- }
- if !_g_.m.spinning {
- _g_.m.spinning = true
- atomic.Xadd(&sched.nmspinning, 1)
- }
- const stealTries = 4
- for i := 0; i < stealTries; i++ {
- stealTimersOrRunNextG := i == stealTries-1
-
- for enum := stealOrder.start(fastrand()); !enum.done(); enum.next() {
- if sched.gcwaiting != 0 {
- goto top
- }
- p2 := allp[enum.position()]
- if _p_ == p2 {
- continue
- }
-
- // Steal timers from p2. This call to checkTimers is the only place
- // where we might hold a lock on a different P's timers. We do this
- // once on the last pass before checking runnext because stealing
- // from the other P's runnext should be the last resort, so if there
- // are timers to steal do that first.
- //
- // We only check timers on one of the stealing iterations because
- // the time stored in now doesn't change in this loop and checking
- // the timers for each P more than once with the same value of now
- // is probably a waste of time.
- //
- // timerpMask tells us whether the P may have timers at all. If it
- // can't, no need to check at all.
- if stealTimersOrRunNextG && timerpMask.read(enum.position()) {
- tnow, w, ran := checkTimers(p2, now)
- now = tnow
- if w != 0 && (pollUntil == 0 || w < pollUntil) {
- pollUntil = w
- }
- if ran {
- // Running the timers may have
- // made an arbitrary number of G's
- // ready and added them to this P's
- // local run queue. That invalidates
- // the assumption of runqsteal
- // that is always has room to add
- // stolen G's. So check now if there
- // is a local G to run.
- if gp, inheritTime := runqget(_p_); gp != nil {
- return gp, inheritTime
- }
- ranTimer = true
- }
- }
+ if _g_.m.spinning || 2*atomic.Load(&sched.nmspinning) < procs-atomic.Load(&sched.npidle) {
+ if !_g_.m.spinning {
+ _g_.m.spinning = true
+ atomic.Xadd(&sched.nmspinning, 1)
+ }
- // Don't bother to attempt to steal if p2 is idle.
- if !idlepMask.read(enum.position()) {
- if gp := runqsteal(_p_, p2, stealTimersOrRunNextG); gp != nil {
- return gp, false
- }
- }
+ gp, inheritTime, tnow, w, newWork := stealWork(now)
+ now = tnow
+ if gp != nil {
+ // Successfully stole.
+ return gp, inheritTime
+ }
+ if newWork {
+ // There may be new timer or GC work; restart to
+ // discover.
+ goto top
+ }
+ if w != 0 && (pollUntil == 0 || w < pollUntil) {
+ // Earlier timer to wait for.
+ pollUntil = w
}
- }
- if ranTimer {
- // Running a timer may have made some goroutine ready.
- goto top
}
-stop:
-
- // We have nothing to do. If we're in the GC mark phase, can
- // safely scan and blacken objects, and have work to do, run
- // idle-time marking rather than give up the P.
+ // We have nothing to do.
+ //
+ // If we're in the GC mark phase, can safely scan and blacken objects,
+ // and have work to do, run idle-time marking rather than give up the
+ // P.
if gcBlackenEnabled != 0 && gcMarkWorkAvailable(_p_) {
node := (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop())
if node != nil {
@@ -2509,17 +2562,11 @@ stop:
}
}
- delta := int64(-1)
- if pollUntil != 0 {
- // checkTimers ensures that polluntil > now.
- delta = pollUntil - now
- }
-
// wasm only:
// If a callback returned and no other goroutine is awake,
// then wake event handler goroutine which pauses execution
// until a callback was triggered.
- gp, otherReady := beforeIdle(delta)
+ gp, otherReady := beforeIdle(now, pollUntil)
if gp != nil {
casgstatus(gp, _Gwaiting, _Grunnable)
if trace.enabled {
@@ -2558,18 +2605,25 @@ stop:
pidleput(_p_)
unlock(&sched.lock)
- // Delicate dance: thread transitions from spinning to non-spinning state,
- // potentially concurrently with submission of new goroutines. We must
- // drop nmspinning first and then check all per-P queues again (with
- // #StoreLoad memory barrier in between). If we do it the other way around,
- // another thread can submit a goroutine after we've checked all run queues
- // but before we drop nmspinning; as a result nobody will unpark a thread
- // to run the goroutine.
+ // Delicate dance: thread transitions from spinning to non-spinning
+ // state, potentially concurrently with submission of new work. We must
+ // drop nmspinning first and then check all sources again (with
+ // #StoreLoad memory barrier in between). If we do it the other way
+ // around, another thread can submit work after we've checked all
+ // sources but before we drop nmspinning; as a result nobody will
+ // unpark a thread to run the work.
+ //
+ // This applies to the following sources of work:
+ //
+ // * Goroutines added to a per-P run queue.
+ // * New/modified-earlier timers on a per-P timer heap.
+ // * Idle-priority GC work (barring golang.org/issue/19112).
+ //
// If we discover new work below, we need to restore m.spinning as a signal
// for resetspinning to unpark a new worker thread (because there can be more
// than one starving goroutine). However, if after discovering new work
- // we also observe no idle Ps, it is OK to just park the current thread:
- // the system is fully loaded so no spinning threads are required.
+ // we also observe no idle Ps it is OK to skip unparking a new worker
+ // thread: the system is fully loaded so no spinning threads are required.
// Also see "Worker thread parking/unparking" comment at the top of the file.
wasSpinning := _g_.m.spinning
if _g_.m.spinning {
@@ -2577,97 +2631,48 @@ stop:
if int32(atomic.Xadd(&sched.nmspinning, -1)) < 0 {
throw("findrunnable: negative nmspinning")
}
- }
- // check all runqueues once again
- for id, _p_ := range allpSnapshot {
- if !idlepMaskSnapshot.read(uint32(id)) && !runqempty(_p_) {
- lock(&sched.lock)
- _p_ = pidleget()
- unlock(&sched.lock)
- if _p_ != nil {
- acquirep(_p_)
- if wasSpinning {
- _g_.m.spinning = true
- atomic.Xadd(&sched.nmspinning, 1)
- }
- goto top
- }
- break
- }
- }
-
- // Similar to above, check for timer creation or expiry concurrently with
- // transitioning from spinning to non-spinning. Note that we cannot use
- // checkTimers here because it calls adjusttimers which may need to allocate
- // memory, and that isn't allowed when we don't have an active P.
- for id, _p_ := range allpSnapshot {
- if timerpMaskSnapshot.read(uint32(id)) {
- w := nobarrierWakeTime(_p_)
- if w != 0 && (pollUntil == 0 || w < pollUntil) {
- pollUntil = w
- }
- }
- }
- if pollUntil != 0 {
- if now == 0 {
- now = nanotime()
- }
- delta = pollUntil - now
- if delta < 0 {
- delta = 0
- }
- }
+		// Note that for correctness, only the last M transitioning from
+ // spinning to non-spinning must perform these rechecks to
+ // ensure no missed work. We are performing it on every M that
+ // transitions as a conservative change to monitor effects on
+ // latency. See golang.org/issue/43997.
- // Check for idle-priority GC work again.
- //
- // N.B. Since we have no P, gcBlackenEnabled may change at any time; we
- // must check again after acquiring a P.
- if atomic.Load(&gcBlackenEnabled) != 0 && gcMarkWorkAvailable(nil) {
- // Work is available; we can start an idle GC worker only if
- // there is an available P and available worker G.
- //
- // We can attempt to acquire these in either order. Workers are
- // almost always available (see comment in findRunnableGCWorker
- // for the one case there may be none). Since we're slightly
- // less likely to find a P, check for that first.
- lock(&sched.lock)
- var node *gcBgMarkWorkerNode
- _p_ = pidleget()
+ // Check all runqueues once again.
+ _p_ = checkRunqsNoP(allpSnapshot, idlepMaskSnapshot)
if _p_ != nil {
- // Now that we own a P, gcBlackenEnabled can't change
- // (as it requires STW).
- if gcBlackenEnabled != 0 {
- node = (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop())
- if node == nil {
- pidleput(_p_)
- _p_ = nil
- }
- } else {
- pidleput(_p_)
- _p_ = nil
- }
+ acquirep(_p_)
+ _g_.m.spinning = true
+ atomic.Xadd(&sched.nmspinning, 1)
+ goto top
}
- unlock(&sched.lock)
+
+ // Check for idle-priority GC work again.
+ _p_, gp = checkIdleGCNoP()
if _p_ != nil {
acquirep(_p_)
- if wasSpinning {
- _g_.m.spinning = true
- atomic.Xadd(&sched.nmspinning, 1)
- }
+ _g_.m.spinning = true
+ atomic.Xadd(&sched.nmspinning, 1)
// Run the idle worker.
_p_.gcMarkWorkerMode = gcMarkWorkerIdleMode
- gp := node.gp.ptr()
casgstatus(gp, _Gwaiting, _Grunnable)
if trace.enabled {
traceGoUnpark(gp, 0)
}
return gp, false
}
+
+ // Finally, check for timer creation or expiry concurrently with
+ // transitioning from spinning to non-spinning.
+ //
+ // Note that we cannot use checkTimers here because it calls
+ // adjusttimers which may need to allocate memory, and that isn't
+ // allowed when we don't have an active P.
+ pollUntil = checkTimersNoP(allpSnapshot, timerpMaskSnapshot, pollUntil)
}
- // poll network
+ // Poll network until next timer.
if netpollinited() && (atomic.Load(&netpollWaiters) > 0 || pollUntil != 0) && atomic.Xchg64(&sched.lastpoll, 0) != 0 {
atomic.Store64(&sched.pollUntil, uint64(pollUntil))
if _g_.m.p != 0 {
@@ -2676,11 +2681,21 @@ stop:
if _g_.m.spinning {
throw("findrunnable: netpoll with spinning")
}
+ delay := int64(-1)
+ if pollUntil != 0 {
+ if now == 0 {
+ now = nanotime()
+ }
+ delay = pollUntil - now
+ if delay < 0 {
+ delay = 0
+ }
+ }
if faketime != 0 {
// When using fake time, just poll.
- delta = 0
+ delay = 0
}
- list := netpoll(delta) // block until new work is available
+ list := netpoll(delay) // block until new work is available
atomic.Store64(&sched.pollUntil, 0)
atomic.Store64(&sched.lastpoll, uint64(nanotime()))
if faketime != 0 && list.empty() {
@@ -2742,6 +2757,178 @@ func pollWork() bool {
return false
}
+// stealWork attempts to steal a runnable goroutine or timer from any P.
+//
+// If newWork is true, new work may have been readied.
+//
+// If now is not 0 it is the current time. stealWork returns the passed time or
+// the current time if now was passed as 0.
+func stealWork(now int64) (gp *g, inheritTime bool, rnow, pollUntil int64, newWork bool) {
+ pp := getg().m.p.ptr()
+
+ ranTimer := false
+
+ const stealTries = 4
+ for i := 0; i < stealTries; i++ {
+ stealTimersOrRunNextG := i == stealTries-1
+
+ for enum := stealOrder.start(fastrand()); !enum.done(); enum.next() {
+ if sched.gcwaiting != 0 {
+ // GC work may be available.
+ return nil, false, now, pollUntil, true
+ }
+ p2 := allp[enum.position()]
+ if pp == p2 {
+ continue
+ }
+
+ // Steal timers from p2. This call to checkTimers is the only place
+ // where we might hold a lock on a different P's timers. We do this
+ // once on the last pass before checking runnext because stealing
+ // from the other P's runnext should be the last resort, so if there
+ // are timers to steal do that first.
+ //
+ // We only check timers on one of the stealing iterations because
+ // the time stored in now doesn't change in this loop and checking
+ // the timers for each P more than once with the same value of now
+ // is probably a waste of time.
+ //
+ // timerpMask tells us whether the P may have timers at all. If it
+ // can't, no need to check at all.
+ if stealTimersOrRunNextG && timerpMask.read(enum.position()) {
+ tnow, w, ran := checkTimers(p2, now)
+ now = tnow
+ if w != 0 && (pollUntil == 0 || w < pollUntil) {
+ pollUntil = w
+ }
+ if ran {
+ // Running the timers may have
+ // made an arbitrary number of G's
+ // ready and added them to this P's
+ // local run queue. That invalidates
+ // the assumption of runqsteal
+ // that it always has room to add
+ // stolen G's. So check now if there
+ // is a local G to run.
+ if gp, inheritTime := runqget(pp); gp != nil {
+ return gp, inheritTime, now, pollUntil, ranTimer
+ }
+ ranTimer = true
+ }
+ }
+
+ // Don't bother to attempt to steal if p2 is idle.
+ if !idlepMask.read(enum.position()) {
+ if gp := runqsteal(pp, p2, stealTimersOrRunNextG); gp != nil {
+ return gp, false, now, pollUntil, ranTimer
+ }
+ }
+ }
+ }
+
+ // No goroutines found to steal. Regardless, running a timer may have
+ // made some goroutine ready that we missed. Indicate the next timer to
+ // wait for.
+ return nil, false, now, pollUntil, ranTimer
+}
+
+// Check all Ps for a runnable G to steal.
+//
+// On entry we have no P. If a G is available to steal and a P is available,
+// the P is returned; the caller should acquire it and attempt to steal the
+// work.
+func checkRunqsNoP(allpSnapshot []*p, idlepMaskSnapshot pMask) *p {
+ for id, p2 := range allpSnapshot {
+ if !idlepMaskSnapshot.read(uint32(id)) && !runqempty(p2) {
+ lock(&sched.lock)
+ pp := pidleget()
+ unlock(&sched.lock)
+ if pp != nil {
+ return pp
+ }
+
+ // Can't get a P, don't bother checking remaining Ps.
+ break
+ }
+ }
+
+ return nil
+}
+
+// Check all Ps for a timer expiring sooner than pollUntil.
+//
+// Returns updated pollUntil value.
+func checkTimersNoP(allpSnapshot []*p, timerpMaskSnapshot pMask, pollUntil int64) int64 {
+ for id, p2 := range allpSnapshot {
+ if timerpMaskSnapshot.read(uint32(id)) {
+ w := nobarrierWakeTime(p2)
+ if w != 0 && (pollUntil == 0 || w < pollUntil) {
+ pollUntil = w
+ }
+ }
+ }
+
+ return pollUntil
+}
+
+// Check for idle-priority GC, without a P on entry.
+//
+// If some GC work, a P, and a worker G are all available, the P and G will be
+// returned. The returned P has not been wired yet.
+func checkIdleGCNoP() (*p, *g) {
+ // N.B. Since we have no P, gcBlackenEnabled may change at any time; we
+ // must check again after acquiring a P.
+ if atomic.Load(&gcBlackenEnabled) == 0 {
+ return nil, nil
+ }
+ if !gcMarkWorkAvailable(nil) {
+ return nil, nil
+ }
+
+ // Work is available; we can start an idle GC worker only if there is
+ // an available P and available worker G.
+ //
+ // We can attempt to acquire these in either order, though both have
+ // synchronization concerns (see below). Workers are almost always
+ // available (see comment in findRunnableGCWorker for the one case
+ // there may be none). Since we're slightly less likely to find a P,
+ // check for that first.
+ //
+ // Synchronization: note that we must hold sched.lock until we are
+ // committed to keeping it. Otherwise we cannot put the unnecessary P
+ // back in sched.pidle without performing the full set of idle
+ // transition checks.
+ //
+ // If we were to check gcBgMarkWorkerPool first, we must somehow handle
+ // the assumption in gcControllerState.findRunnableGCWorker that an
+ // empty gcBgMarkWorkerPool is only possible if gcMarkDone is running.
+ lock(&sched.lock)
+ pp := pidleget()
+ if pp == nil {
+ unlock(&sched.lock)
+ return nil, nil
+ }
+
+ // Now that we own a P, gcBlackenEnabled can't change (as it requires
+ // STW).
+ if gcBlackenEnabled == 0 {
+ pidleput(pp)
+ unlock(&sched.lock)
+ return nil, nil
+ }
+
+ node := (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop())
+ if node == nil {
+ pidleput(pp)
+ unlock(&sched.lock)
+ return nil, nil
+ }
+
+ unlock(&sched.lock)
+
+ return pp, node.gp.ptr()
+}
+
// wakeNetPoller wakes up the thread sleeping in the network poller if it isn't
// going to wake up before the when argument; or it wakes an idle P to service
// timers and the network poller if there isn't one already.
@@ -2908,7 +3095,9 @@ top:
}
if gp == nil && gcBlackenEnabled != 0 {
gp = gcController.findRunnableGCWorker(_g_.m.p.ptr())
- tryWakeP = tryWakeP || gp != nil
+ if gp != nil {
+ tryWakeP = true
+ }
}
if gp == nil {
// Check the global runnable queue once in a while to ensure fairness.
@@ -2999,7 +3188,7 @@ func dropg() {
// checkTimers runs any timers for the P that are ready.
// If now is not 0 it is the current time.
-// It returns the current time or 0 if it is not known,
+// It returns the passed time or the current time if now was passed as 0.
// and the time when the next timer should run or 0 if there is no next timer,
// and reports whether it ran any timers.
// If the time when the next timer should run is not 0,
@@ -3149,6 +3338,7 @@ func preemptPark(gp *g) {
throw("bad g status")
}
gp.waitreason = waitReasonPreempted
+
// Transition from _Grunning to _Gscan|_Gpreempted. We can't
// be in _Grunning when we dropg because then we'd be running
// without an M, but the moment we're in _Gpreempted,
@@ -3527,20 +3717,28 @@ func exitsyscallfast_pidle() bool {
// exitsyscall slow path on g0.
// Failed to acquire P, enqueue gp as runnable.
//
+// Called via mcall, so gp is the calling g from this M.
+//
//go:nowritebarrierrec
func exitsyscall0(gp *g) {
- _g_ := getg()
-
casgstatus(gp, _Gsyscall, _Gexitingsyscall)
dropg()
casgstatus(gp, _Gexitingsyscall, _Grunnable)
lock(&sched.lock)
var _p_ *p
- if schedEnabled(_g_) {
+ if schedEnabled(gp) {
_p_ = pidleget()
}
+ var locked bool
if _p_ == nil {
globrunqput(gp)
+
+ // Below, we stoplockedm if gp is locked. globrunqput releases
+ // ownership of gp, so we must check if gp is locked prior to
+ // committing the release by unlocking sched.lock, otherwise we
+ // could race with another M transitioning gp from unlocked to
+ // locked.
+ locked = gp.lockedm != 0
} else if atomic.Load(&sched.sysmonwait) != 0 {
atomic.Store(&sched.sysmonwait, 0)
notewakeup(&sched.sysmonnote)
@@ -3550,8 +3748,11 @@ func exitsyscall0(gp *g) {
acquirep(_p_)
execute(gp, false) // Never returns.
}
- if _g_.m.lockedg != 0 {
+ if locked {
// Wait until another thread schedules gp and so m again.
+ //
+ // N.B. lockedm must be this M, as this g was running on this M
+ // before entersyscall.
stoplockedm()
execute(gp, false) // Never returns.
}
@@ -3586,7 +3787,10 @@ func syscall_exitsyscall() {
exitsyscall()
}
-func beforefork() {
+// Called from syscall package before fork.
+//go:linkname syscall_runtime_BeforeFork syscall.runtime__BeforeFork
+//go:nosplit
+func syscall_runtime_BeforeFork() {
gp := getg().m.curg
// Block signals during a fork, so that the child does not run
@@ -3597,14 +3801,10 @@ func beforefork() {
sigblock(false)
}
-// Called from syscall package before fork.
-//go:linkname syscall_runtime_BeforeFork syscall.runtime__BeforeFork
+// Called from syscall package after fork in parent.
+//go:linkname syscall_runtime_AfterFork syscall.runtime__AfterFork
//go:nosplit
-func syscall_runtime_BeforeFork() {
- systemstack(beforefork)
-}
-
-func afterfork() {
+func syscall_runtime_AfterFork() {
gp := getg().m.curg
msigrestore(gp.m.sigmask)
@@ -3612,13 +3812,6 @@ func afterfork() {
gp.m.locks--
}
-// Called from syscall package after fork in parent.
-//go:linkname syscall_runtime_AfterFork syscall.runtime__AfterFork
-//go:nosplit
-func syscall_runtime_AfterFork() {
- systemstack(afterfork)
-}
-
// inForkedChild is true while manipulating signals in the child process.
// This is used to avoid calling libc functions in case we are using vfork.
var inForkedChild bool
@@ -3734,6 +3927,11 @@ func newproc(fn uintptr, arg unsafe.Pointer) *g {
if isSystemGoroutine(newg, false) {
atomic.Xadd(&sched.ngsys, +1)
}
+ // Track initial transition?
+ newg.trackingSeq = uint8(fastrand())
+ if newg.trackingSeq%gTrackingPeriod == 0 {
+ newg.tracking = true
+ }
casgstatus(newg, _Gdead, _Grunnable)
if _p_.goidcache == _p_.goidcacheend {
@@ -3844,13 +4042,19 @@ func gfput(_p_ *p, gp *g) {
_p_.gFree.push(gp)
_p_.gFree.n++
if _p_.gFree.n >= 64 {
- lock(&sched.gFree.lock)
+ var (
+ inc int32
+ noStackQ gQueue
+ )
for _p_.gFree.n >= 32 {
- _p_.gFree.n--
gp = _p_.gFree.pop()
- sched.gFree.list.push(gp)
- sched.gFree.n++
+ _p_.gFree.n--
+ noStackQ.push(gp)
+ inc++
}
+ lock(&sched.gFree.lock)
+ sched.gFree.list.pushAll(noStackQ)
+ sched.gFree.n += inc
unlock(&sched.gFree.lock)
}
}
@@ -3884,13 +4088,19 @@ retry:
// Purge all cached G's from gfree list to the global list.
func gfpurge(_p_ *p) {
- lock(&sched.gFree.lock)
+ var (
+ inc int32
+ noStackQ gQueue
+ )
for !_p_.gFree.empty() {
gp := _p_.gFree.pop()
_p_.gFree.n--
- sched.gFree.list.push(gp)
- sched.gFree.n++
+ noStackQ.push(gp)
+ inc++
}
+ lock(&sched.gFree.lock)
+ sched.gFree.list.pushAll(noStackQ)
+ sched.gFree.n += inc
unlock(&sched.gFree.lock)
}
@@ -4061,71 +4271,6 @@ func sigprof(pc uintptr, gp *g, mp *m) {
// See golang.org/issue/17165.
getg().m.mallocing++
- // Define that a "user g" is a user-created goroutine, and a "system g"
- // is one that is m->g0 or m->gsignal.
- //
- // We might be interrupted for profiling halfway through a
- // goroutine switch. The switch involves updating three (or four) values:
- // g, PC, SP, and (on arm) LR. The PC must be the last to be updated,
- // because once it gets updated the new g is running.
- //
- // When switching from a user g to a system g, LR is not considered live,
- // so the update only affects g, SP, and PC. Since PC must be last, there
- // the possible partial transitions in ordinary execution are (1) g alone is updated,
- // (2) both g and SP are updated, and (3) SP alone is updated.
- // If SP or g alone is updated, we can detect the partial transition by checking
- // whether the SP is within g's stack bounds. (We could also require that SP
- // be changed only after g, but the stack bounds check is needed by other
- // cases, so there is no need to impose an additional requirement.)
- //
- // There is one exceptional transition to a system g, not in ordinary execution.
- // When a signal arrives, the operating system starts the signal handler running
- // with an updated PC and SP. The g is updated last, at the beginning of the
- // handler. There are two reasons this is okay. First, until g is updated the
- // g and SP do not match, so the stack bounds check detects the partial transition.
- // Second, signal handlers currently run with signals disabled, so a profiling
- // signal cannot arrive during the handler.
- //
- // When switching from a system g to a user g, there are three possibilities.
- //
- // First, it may be that the g switch has no PC update, because the SP
- // either corresponds to a user g throughout (as in asmcgocall)
- // or because it has been arranged to look like a user g frame
- // (as in cgocallback). In this case, since the entire
- // transition is a g+SP update, a partial transition updating just one of
- // those will be detected by the stack bounds check.
- //
- // Second, when returning from a signal handler, the PC and SP updates
- // are performed by the operating system in an atomic update, so the g
- // update must be done before them. The stack bounds check detects
- // the partial transition here, and (again) signal handlers run with signals
- // disabled, so a profiling signal cannot arrive then anyway.
- //
- // Third, the common case: it may be that the switch updates g, SP, and PC
- // separately. If the PC is within any of the functions that does this,
- // we don't ask for a traceback. C.F. the function setsSP for more about this.
- //
- // There is another apparently viable approach, recorded here in case
- // the "PC within setsSP function" check turns out not to be usable.
- // It would be possible to delay the update of either g or SP until immediately
- // before the PC update instruction. Then, because of the stack bounds check,
- // the only problematic interrupt point is just before that PC update instruction,
- // and the sigprof handler can detect that instruction and simulate stepping past
- // it in order to reach a consistent state. On ARM, the update of g must be made
- // in two places (in R10 and also in a TLS slot), so the delayed update would
- // need to be the SP update. The sigprof handler must read the instruction at
- // the current PC and if it was the known instruction (for example, JMP BX or
- // MOV R2, PC), use that other register in place of the PC value.
- // The biggest drawback to this solution is that it requires that we can tell
- // whether it's safe to read from the memory pointed at by PC.
- // In a correct program, we can test PC == nil and otherwise read,
- // but if a profiling signal happens at the instant that a program executes
- // a bad jump (before the program manages to handle the resulting fault)
- // the profiling handler could fault trying to read nonexistent memory.
- //
- // To recap, there are no constraints on the assembly being used for the
- // transition. We simply require that g and SP match and that the PC is not
- // in gogo.
traceback := true
// If SIGPROF arrived while already fetching runtime callers
@@ -4351,7 +4496,6 @@ func (pp *p) destroy() {
moveTimers(plocal, pp.timers)
pp.timers = nil
pp.numTimers = 0
- pp.adjustTimers = 0
pp.deletedTimers = 0
atomic.Store64(&pp.timer0When, 0)
unlock(&pp.timersLock)
@@ -4650,11 +4794,9 @@ func checkdead() {
}
grunning := 0
- lock(&allglock)
- for i := 0; i < len(allgs); i++ {
- gp := allgs[i]
+ forEachG(func(gp *g) {
if isSystemGoroutine(gp, false) {
- continue
+ return
}
s := readgstatus(gp)
switch s &^ _Gscan {
@@ -4667,8 +4809,7 @@ func checkdead() {
print("runtime: checkdead: find g ", gp.goid, " in status ", s, "\n")
throw("checkdead: runnable g")
}
- }
- unlock(&allglock)
+ })
if grunning == 0 { // possible if main goroutine calls runtime·Goexit()
unlock(&sched.lock) // unlock so that GODEBUG=scheddetail=1 doesn't hang
throw("no goroutines (main called runtime.Goexit) - deadlock!")
@@ -4972,7 +5113,7 @@ func preemptall() bool {
// Tell the goroutine running on processor P to stop.
// This function is purely best-effort. It can incorrectly fail to inform the
-// goroutine. It can send inform the wrong goroutine. Even if it informs the
+// goroutine. It can inform the wrong goroutine. Even if it informs the
// correct goroutine, that goroutine might ignore the request if it is
// simultaneously executing newstack.
// No lock needs to be held.
@@ -5080,9 +5221,7 @@ func schedtrace(detailed bool) {
print(" M", mp.id, ": p=", id1, " curg=", id2, " mallocing=", mp.mallocing, " throwing=", mp.throwing, " preemptoff=", mp.preemptoff, ""+" locks=", mp.locks, " dying=", mp.dying, " spinning=", mp.spinning, " blocked=", mp.blocked, " lockedg=", id3, "\n")
}
- lock(&allglock)
- for gi := 0; gi < len(allgs); gi++ {
- gp := allgs[gi]
+ forEachG(func(gp *g) {
mp := gp.m
lockedm := gp.lockedm.ptr()
id1 := int64(-1)
@@ -5094,8 +5233,7 @@ func schedtrace(detailed bool) {
id2 = lockedm.id
}
print(" G", gp.goid, ": status=", readgstatus(gp), "(", gp.waitreason.String(), ") m=", id1, " lockedm=", id2, "\n")
- }
- unlock(&allglock)
+ })
unlock(&sched.lock)
}
@@ -5190,6 +5328,8 @@ func globrunqputhead(gp *g) {
// Put a batch of runnable goroutines on the global runnable queue.
// This clears *batch.
// sched.lock must be held.
+// May run during STW, so write barriers are not allowed.
+//go:nowritebarrierrec
func globrunqputbatch(batch *gQueue, n int32) {
assertLockHeld(&sched.lock)
@@ -5505,6 +5645,45 @@ func runqget(_p_ *p) (gp *g, inheritTime bool) {
}
}
+// runqdrain drains the local runnable queue of _p_ and returns all goroutines in it.
+// Executed only by the owner P.
+func runqdrain(_p_ *p) (drainQ gQueue, n uint32) {
+ oldNext := _p_.runnext
+ if oldNext != 0 && _p_.runnext.cas(oldNext, 0) {
+ drainQ.pushBack(oldNext.ptr())
+ n++
+ }
+
+retry:
+ h := atomic.LoadAcq(&_p_.runqhead) // load-acquire, synchronize with other consumers
+ t := _p_.runqtail
+ qn := t - h
+ if qn == 0 {
+ return
+ }
+ if qn > uint32(len(_p_.runq)) { // read inconsistent h and t
+ goto retry
+ }
+
+ if !atomic.CasRel(&_p_.runqhead, h, h+qn) { // cas-release, commits consume
+ goto retry
+ }
+
+	// We advance the head pointer before reading the G's (rather than the
+	// other way around) so that we don't corrupt the G's statuses while
+	// runqdrain() and runqsteal() run in parallel: once the head has been
+	// advanced, this P has full ownership of the drained G's and can safely
+	// update gp.schedlink, and other P's can no longer see those G's in the
+	// local run queue and steal them.
+ // See https://groups.google.com/g/golang-dev/c/0pTKxEKhHSc/m/6Q85QjdVBQAJ for more details.
+ for i := uint32(0); i < qn; i++ {
+ gp := _p_.runq[(h+i)%uint32(len(_p_.runq))].ptr()
+ drainQ.pushBack(gp)
+ n++
+ }
+ return
+}
+
// Grabs a batch of goroutines from _p_'s runnable queue into batch.
// Batch is a ring buffer starting at batchHead.
// Returns number of grabbed goroutines.
@@ -5698,11 +5877,6 @@ func setMaxThreads(in int) (out int) {
return
}
-func haveexperiment(name string) bool {
- // The gofrontend does not support experiments.
- return false
-}
-
//go:nosplit
func procPin() int {
_g_ := getg()
@@ -5828,7 +6002,7 @@ var inittrace tracestat
type tracestat struct {
active bool // init tracing activation status
- id int64 // init go routine id
+ id int64 // init goroutine id
allocs uint64 // heap allocations
bytes uint64 // heap allocated bytes
}
diff --git a/libgo/go/runtime/proc_test.go b/libgo/go/runtime/proc_test.go
index a8f0dc39c0c..d7549a8874f 100644
--- a/libgo/go/runtime/proc_test.go
+++ b/libgo/go/runtime/proc_test.go
@@ -699,6 +699,55 @@ func BenchmarkCreateGoroutinesCapture(b *testing.B) {
}
}
+// warmupScheduler ensures the scheduler has at least targetThreadCount threads
+// in its thread pool.
+func warmupScheduler(targetThreadCount int) {
+ var wg sync.WaitGroup
+ var count int32
+ for i := 0; i < targetThreadCount; i++ {
+ wg.Add(1)
+ go func() {
+ atomic.AddInt32(&count, 1)
+ for atomic.LoadInt32(&count) < int32(targetThreadCount) {
+ // spin until all threads started
+ }
+
+ // spin a bit more to ensure they are all running on separate CPUs.
+ doWork(time.Millisecond)
+ wg.Done()
+ }()
+ }
+ wg.Wait()
+}
+
+func doWork(dur time.Duration) {
+ start := time.Now()
+ for time.Since(start) < dur {
+ }
+}
+
+// BenchmarkCreateGoroutinesSingle creates many goroutines, all from a single
+// producer (the main benchmark goroutine).
+//
+// Compared to BenchmarkCreateGoroutines, this causes different behavior in the
+// scheduler because Ms are much more likely to need to steal work from the
+// main P rather than having work in the local run queue.
+func BenchmarkCreateGoroutinesSingle(b *testing.B) {
+ // Since we are interested in stealing behavior, warm the scheduler to
+ // get all the Ps running first.
+ warmupScheduler(runtime.GOMAXPROCS(0))
+ b.ResetTimer()
+
+ var wg sync.WaitGroup
+ wg.Add(b.N)
+ for i := 0; i < b.N; i++ {
+ go func() {
+ wg.Done()
+ }()
+ }
+ wg.Wait()
+}
+
func BenchmarkClosureCall(b *testing.B) {
sum := 0
off1 := 1
diff --git a/libgo/go/runtime/race0.go b/libgo/go/runtime/race0.go
index 180f707b1a7..0e431b8103e 100644
--- a/libgo/go/runtime/race0.go
+++ b/libgo/go/runtime/race0.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !race
// +build !race
// Dummy race detection API, used when not built with -race.
diff --git a/libgo/go/runtime/relax_stub.go b/libgo/go/runtime/relax_stub.go
index 81ed1291b8b..5b92879c20b 100644
--- a/libgo/go/runtime/relax_stub.go
+++ b/libgo/go/runtime/relax_stub.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !windows
// +build !windows
package runtime
diff --git a/libgo/go/runtime/runtime-lldb_test.go b/libgo/go/runtime/runtime-lldb_test.go
index c923b872aa1..19a6cc6f8d3 100644
--- a/libgo/go/runtime/runtime-lldb_test.go
+++ b/libgo/go/runtime/runtime-lldb_test.go
@@ -142,14 +142,10 @@ func TestLldbPython(t *testing.T) {
checkLldbPython(t)
- dir, err := os.MkdirTemp("", "go-build")
- if err != nil {
- t.Fatalf("failed to create temp directory: %v", err)
- }
- defer os.RemoveAll(dir)
+ dir := t.TempDir()
src := filepath.Join(dir, "main.go")
- err = os.WriteFile(src, []byte(lldbHelloSource), 0644)
+ err := os.WriteFile(src, []byte(lldbHelloSource), 0644)
if err != nil {
t.Fatalf("failed to create src file: %v", err)
}
diff --git a/libgo/go/runtime/runtime1.go b/libgo/go/runtime/runtime1.go
index 143d19eb134..68e7b9ea6fa 100644
--- a/libgo/go/runtime/runtime1.go
+++ b/libgo/go/runtime/runtime1.go
@@ -321,7 +321,6 @@ var debug struct {
gctrace int32
invalidptr int32
madvdontneed int32 // for Linux; issue 28466
- scavenge int32
scavtrace int32
scheddetail int32
schedtrace int32
@@ -350,7 +349,6 @@ var dbgvars = []dbgVar{
{"invalidptr", &debug.invalidptr},
{"madvdontneed", &debug.madvdontneed},
{"sbrk", &debug.sbrk},
- {"scavenge", &debug.scavenge},
{"scavtrace", &debug.scavtrace},
{"scheddetail", &debug.scheddetail},
{"schedtrace", &debug.schedtrace},
diff --git a/libgo/go/runtime/runtime2.go b/libgo/go/runtime/runtime2.go
index 1879b829be7..81a40d5633e 100644
--- a/libgo/go/runtime/runtime2.go
+++ b/libgo/go/runtime/runtime2.go
@@ -5,7 +5,6 @@
package runtime
import (
- "internal/cpu"
"runtime/internal/atomic"
"runtime/internal/sys"
"unsafe"
@@ -412,7 +411,18 @@ type g struct {
syscallsp uintptr // if status==Gsyscall, syscallsp = sched.sp to use during gc
syscallpc uintptr // if status==Gsyscall, syscallpc = sched.pc to use during gc
// Not for gccgo: stktopsp uintptr // expected sp at top of stack, to check in traceback
- param unsafe.Pointer // passed parameter on wakeup
+ // param is a generic pointer parameter field used to pass
+ // values in particular contexts where other storage for the
+ // parameter would be difficult to find. It is currently used
+ // in three ways:
+ // 1. When a channel operation wakes up a blocked goroutine, it sets param to
+ // point to the sudog of the completed blocking operation.
+ // 2. By gcAssistAlloc1 to signal back to its caller that the goroutine completed
+ // the GC cycle. It is unsafe to do so in any other way, because the goroutine's
+ // stack may have moved in the meantime.
+ // 3. By debugCallWrap to pass parameters to a new goroutine because allocating a
+ // closure in the runtime is forbidden.
+ param unsafe.Pointer
atomicstatus uint32
// Not for gccgo: stackLock uint32 // sigprof/scang lock; TODO: fold in to atomicstatus
goid int64
@@ -446,6 +456,10 @@ type g struct {
raceignore int8 // ignore race detection events
sysblocktraced bool // StartTrace has emitted EvGoInSyscall about this goroutine
+ tracking bool // whether we're tracking this G for sched latency statistics
+ trackingSeq uint8 // used to decide whether to track this G
+ runnableStamp int64 // timestamp of when the G last became runnable, only used when tracking
+ runnableTime int64 // the amount of time spent runnable, cleared when running, only used when tracking
sysexitticks int64 // cputicks when syscall has returned (for tracing)
traceseq uint64 // trace event sequencer
tracelastp puintptr // last P emitted an event for this goroutine
@@ -529,6 +543,17 @@ type g struct {
stackcontext [10]uintptr // split-stack context
}
+// gTrackingPeriod is the number of transitions out of _Grunning between
+// latency tracking runs.
+const gTrackingPeriod = 8
+
+const (
+ // tlsSlots is the number of pointer-sized slots reserved for TLS on some platforms,
+ // like Windows.
+ tlsSlots = 6
+ tlsSize = tlsSlots * sys.PtrSize
+)
+
type m struct {
g0 *g // goroutine with scheduling stack
// Not for gccgo: morebuf gobuf // gobuf arg to morestack
@@ -539,7 +564,7 @@ type m struct {
gsignal *g // signal-handling g
// Not for gccgo: goSigStack gsignalStack // Go-allocated signal handling stack
sigmask sigset // storage for saved signal mask
- // Not for gccgo: tls [6]uintptr // thread-local storage (for x86 extern register)
+ // Not for gccgo: tls [tlsSlots]uintptr // thread-local storage (for x86 extern register)
mstartfn func()
curg *g // current running goroutine
caughtsig guintptr // goroutine running during fatal signal
@@ -663,6 +688,9 @@ type p struct {
// unit and eliminates the (potentially large) scheduling
// latency that otherwise arises from adding the ready'd
// goroutines to the end of the run queue.
+ //
+ // Note that while other P's may atomically CAS this to zero,
+ // only the owner P can CAS it to a valid G.
runnext guintptr
// Available G's (status == Gdead)
@@ -708,7 +736,7 @@ type p struct {
// timerModifiedEarlier status. Because the timer may have been
// modified again, there need not be any timer with this value.
// This is updated using atomic functions.
- // This is 0 if the value is unknown.
+ // This is 0 if there are no timerModifiedEarlier timers.
timerModifiedEarliest uint64
// Per-P GC state
@@ -754,12 +782,6 @@ type p struct {
// Modified using atomic instructions.
numTimers uint32
- // Number of timerModifiedEarlier timers on P's heap.
- // This should only be modified while holding timersLock,
- // or while the timer status is in a transient state
- // such as timerModifying.
- adjustTimers uint32
-
// Number of timerDeleted timers in P's heap.
// Modified using atomic instructions.
deletedTimers uint32
@@ -771,7 +793,8 @@ type p struct {
// scheduler ASAP (regardless of what G is running on it).
preempt bool
- pad cpu.CacheLinePad
+	// Padding is no longer needed: false sharing is not a concern because p is
+	// large enough that its size class is an integer multiple of the cache line
+	// size (for any of our architectures).
}
type schedt struct {
@@ -860,6 +883,13 @@ type schedt struct {
// Acquire and hold this mutex to block sysmon from interacting
// with the rest of the runtime.
sysmonlock mutex
+
+ // timeToRun is a distribution of scheduling latencies, defined
+ // as the sum of time a G spends in the _Grunnable state before
+ // it transitions to _Grunning.
+ //
+ // timeToRun is protected by sched.lock.
+ timeToRun timeHistogram
}
// Values for the flags field of a sigTabT.
diff --git a/libgo/go/runtime/runtime_mmap_test.go b/libgo/go/runtime/runtime_mmap_test.go
index 21918c50bb4..8f72daa5c65 100644
--- a/libgo/go/runtime/runtime_mmap_test.go
+++ b/libgo/go/runtime/runtime_mmap_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || dragonfly || freebsd || hurd || linux || netbsd || openbsd || solaris
// +build aix darwin dragonfly freebsd hurd linux netbsd openbsd solaris
package runtime_test
diff --git a/libgo/go/runtime/runtime_test.go b/libgo/go/runtime/runtime_test.go
index ca352055755..bda09cfd05e 100644
--- a/libgo/go/runtime/runtime_test.go
+++ b/libgo/go/runtime/runtime_test.go
@@ -270,8 +270,8 @@ func TestTrailingZero(t *testing.T) {
n int64
z struct{}
}
- if unsafe.Sizeof(T2{}) != 8+unsafe.Sizeof(Uintreg(0)) {
- t.Errorf("sizeof(%#v)==%d, want %d", T2{}, unsafe.Sizeof(T2{}), 8+unsafe.Sizeof(Uintreg(0)))
+ if unsafe.Sizeof(T2{}) != 8+unsafe.Sizeof(uintptr(0)) {
+ t.Errorf("sizeof(%#v)==%d, want %d", T2{}, unsafe.Sizeof(T2{}), 8+unsafe.Sizeof(uintptr(0)))
}
type T3 struct {
n byte
diff --git a/libgo/go/runtime/runtime_unix_test.go b/libgo/go/runtime/runtime_unix_test.go
index cdad0e2b943..54e14c9a1ee 100644
--- a/libgo/go/runtime/runtime_unix_test.go
+++ b/libgo/go/runtime/runtime_unix_test.go
@@ -6,6 +6,7 @@
// We need a fast system call to provoke the race,
// and Close(-1) is nearly universally fast.
+//go:build aix || darwin || dragonfly || freebsd || hurd || linux || netbsd || openbsd || plan9
// +build aix darwin dragonfly freebsd hurd linux netbsd openbsd plan9
package runtime_test
diff --git a/libgo/go/runtime/select.go b/libgo/go/runtime/select.go
index 31272b03560..d84af07d798 100644
--- a/libgo/go/runtime/select.go
+++ b/libgo/go/runtime/select.go
@@ -20,7 +20,7 @@ const debugSelect = false
// Select case descriptor.
// Known to compiler.
-// Changes here must also be made in src/cmd/internal/gc/select.go's scasetype.
+// Changes here must also be made in src/cmd/compile/internal/walk/select.go's scasetype.
type scase struct {
c *hchan // chan
elem unsafe.Pointer // data element
diff --git a/libgo/go/runtime/semasleep_test.go b/libgo/go/runtime/semasleep_test.go
index 9b371b07324..905e932b7d8 100644
--- a/libgo/go/runtime/semasleep_test.go
+++ b/libgo/go/runtime/semasleep_test.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !plan9 && !windows && !js
// +build !plan9,!windows,!js
package runtime_test
diff --git a/libgo/go/runtime/signal_unix.go b/libgo/go/runtime/signal_unix.go
index e85136af7d1..a291d2b6bd4 100644
--- a/libgo/go/runtime/signal_unix.go
+++ b/libgo/go/runtime/signal_unix.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build aix || darwin || dragonfly || freebsd || hurd || linux || netbsd || openbsd || solaris
// +build aix darwin dragonfly freebsd hurd linux netbsd openbsd solaris
package runtime
@@ -288,6 +289,8 @@ func setProcessCPUProfiler(hz int32) {
it.it_value = it.it_interval
setitimer(_ITIMER_PROF, &it, nil)
} else {
+ setitimer(_ITIMER_PROF, &_itimerval{}, nil)
+
// If the Go signal handler should be disabled by default,
// switch back to the signal handler that was installed
// when we enabled profiling. We don't try to handle the case
@@ -311,8 +314,6 @@ func setProcessCPUProfiler(hz int32) {
setsig(_SIGPROF, h)
}
}
-
- setitimer(_ITIMER_PROF, &_itimerval{}, nil)
}
}
diff --git a/libgo/go/runtime/signal_windows_test.go b/libgo/go/runtime/signal_windows_test.go
index 33a9b92ee73..1b7cb9d4c45 100644
--- a/libgo/go/runtime/signal_windows_test.go
+++ b/libgo/go/runtime/signal_windows_test.go
@@ -1,3 +1,4 @@
+//go:build windows
// +build windows
package runtime_test
@@ -7,7 +8,6 @@ import (
"bytes"
"fmt"
"internal/testenv"
- "os"
"os/exec"
"path/filepath"
"runtime"
@@ -28,15 +28,11 @@ func TestVectoredHandlerDontCrashOnLibrary(t *testing.T) {
testenv.MustHaveExecPath(t, "gcc")
testprog.Lock()
defer testprog.Unlock()
- dir, err := os.MkdirTemp("", "go-build")
- if err != nil {
- t.Fatalf("failed to create temp directory: %v", err)
- }
- defer os.RemoveAll(dir)
+ dir := t.TempDir()
// build go dll
dll := filepath.Join(dir, "testwinlib.dll")
- cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", dll, "--buildmode", "c-shared", "testdata/testwinlib/main.go")
+ cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", dll, "-buildmode", "c-shared", "testdata/testwinlib/main.go")
out, err := testenv.CleanCmdEnv(cmd).CombinedOutput()
if err != nil {
t.Fatalf("failed to build go library: %s\n%s", err, out)
@@ -156,15 +152,11 @@ func TestLibraryCtrlHandler(t *testing.T) {
testenv.MustHaveExecPath(t, "gcc")
testprog.Lock()
defer testprog.Unlock()
- dir, err := os.MkdirTemp("", "go-build")
- if err != nil {
- t.Fatalf("failed to create temp directory: %v", err)
- }
- defer os.RemoveAll(dir)
+ dir := t.TempDir()
// build go dll
dll := filepath.Join(dir, "dummy.dll")
- cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", dll, "--buildmode", "c-shared", "testdata/testwinlibsignal/dummy.go")
+ cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", dll, "-buildmode", "c-shared", "testdata/testwinlibsignal/dummy.go")
out, err := testenv.CleanCmdEnv(cmd).CombinedOutput()
if err != nil {
t.Fatalf("failed to build go library: %s\n%s", err, out)
diff --git a/libgo/go/runtime/sigqueue.go b/libgo/go/runtime/sigqueue.go
index 7ca755740d2..d5539f1cd82 100644
--- a/libgo/go/runtime/sigqueue.go
+++ b/libgo/go/runtime/sigqueue.go
@@ -28,6 +28,7 @@
// unnecessary rechecks of sig.mask, but it cannot lead to missed signals
// nor deadlocks.
+//go:build !plan9
// +build !plan9
package runtime
@@ -71,7 +72,7 @@ const (
// It runs from the signal handler, so it's limited in what it can do.
func sigsend(s uint32) bool {
bit := uint32(1) << uint(s&31)
- if !sig.inuse || s >= uint32(32*len(sig.wanted)) {
+ if s >= uint32(32*len(sig.wanted)) {
return false
}
diff --git a/libgo/go/runtime/sigqueue_note.go b/libgo/go/runtime/sigqueue_note.go
index 16aeeb2ef0b..e23446bea47 100644
--- a/libgo/go/runtime/sigqueue_note.go
+++ b/libgo/go/runtime/sigqueue_note.go
@@ -7,8 +7,8 @@
// signal_recv thread. This file holds the non-Darwin implementations of
// those functions. These functions will never be called.
-// +build !darwin
-// +build !plan9
+//go:build !darwin && !plan9
+// +build !darwin,!plan9
package runtime
diff --git a/libgo/go/runtime/sizeclasses.go b/libgo/go/runtime/sizeclasses.go
index c5521ce1bd2..067871eaf34 100644
--- a/libgo/go/runtime/sizeclasses.go
+++ b/libgo/go/runtime/sizeclasses.go
@@ -3,74 +3,83 @@
package runtime
-// class bytes/obj bytes/span objects tail waste max waste
-// 1 8 8192 1024 0 87.50%
-// 2 16 8192 512 0 43.75%
-// 3 24 8192 341 8 29.24%
-// 4 32 8192 256 0 21.88%
-// 5 48 8192 170 32 31.52%
-// 6 64 8192 128 0 23.44%
-// 7 80 8192 102 32 19.07%
-// 8 96 8192 85 32 15.95%
-// 9 112 8192 73 16 13.56%
-// 10 128 8192 64 0 11.72%
-// 11 144 8192 56 128 11.82%
-// 12 160 8192 51 32 9.73%
-// 13 176 8192 46 96 9.59%
-// 14 192 8192 42 128 9.25%
-// 15 208 8192 39 80 8.12%
-// 16 224 8192 36 128 8.15%
-// 17 240 8192 34 32 6.62%
-// 18 256 8192 32 0 5.86%
-// 19 288 8192 28 128 12.16%
-// 20 320 8192 25 192 11.80%
-// 21 352 8192 23 96 9.88%
-// 22 384 8192 21 128 9.51%
-// 23 416 8192 19 288 10.71%
-// 24 448 8192 18 128 8.37%
-// 25 480 8192 17 32 6.82%
-// 26 512 8192 16 0 6.05%
-// 27 576 8192 14 128 12.33%
-// 28 640 8192 12 512 15.48%
-// 29 704 8192 11 448 13.93%
-// 30 768 8192 10 512 13.94%
-// 31 896 8192 9 128 15.52%
-// 32 1024 8192 8 0 12.40%
-// 33 1152 8192 7 128 12.41%
-// 34 1280 8192 6 512 15.55%
-// 35 1408 16384 11 896 14.00%
-// 36 1536 8192 5 512 14.00%
-// 37 1792 16384 9 256 15.57%
-// 38 2048 8192 4 0 12.45%
-// 39 2304 16384 7 256 12.46%
-// 40 2688 8192 3 128 15.59%
-// 41 3072 24576 8 0 12.47%
-// 42 3200 16384 5 384 6.22%
-// 43 3456 24576 7 384 8.83%
-// 44 4096 8192 2 0 15.60%
-// 45 4864 24576 5 256 16.65%
-// 46 5376 16384 3 256 10.92%
-// 47 6144 24576 4 0 12.48%
-// 48 6528 32768 5 128 6.23%
-// 49 6784 40960 6 256 4.36%
-// 50 6912 49152 7 768 3.37%
-// 51 8192 8192 1 0 15.61%
-// 52 9472 57344 6 512 14.28%
-// 53 9728 49152 5 512 3.64%
-// 54 10240 40960 4 0 4.99%
-// 55 10880 32768 3 128 6.24%
-// 56 12288 24576 2 0 11.45%
-// 57 13568 40960 3 256 9.99%
-// 58 14336 57344 4 0 5.35%
-// 59 16384 16384 1 0 12.49%
-// 60 18432 73728 4 0 11.11%
-// 61 19072 57344 3 128 3.57%
-// 62 20480 40960 2 0 6.87%
-// 63 21760 65536 3 256 6.25%
-// 64 24576 24576 1 0 11.45%
-// 65 27264 81920 3 128 10.00%
-// 66 28672 57344 2 0 4.91%
-// 67 32768 32768 1 0 12.50%
+// class bytes/obj bytes/span objects tail waste max waste min align
+// 1 8 8192 1024 0 87.50% 8
+// 2 16 8192 512 0 43.75% 16
+// 3 24 8192 341 8 29.24% 8
+// 4 32 8192 256 0 21.88% 32
+// 5 48 8192 170 32 31.52% 16
+// 6 64 8192 128 0 23.44% 64
+// 7 80 8192 102 32 19.07% 16
+// 8 96 8192 85 32 15.95% 32
+// 9 112 8192 73 16 13.56% 16
+// 10 128 8192 64 0 11.72% 128
+// 11 144 8192 56 128 11.82% 16
+// 12 160 8192 51 32 9.73% 32
+// 13 176 8192 46 96 9.59% 16
+// 14 192 8192 42 128 9.25% 64
+// 15 208 8192 39 80 8.12% 16
+// 16 224 8192 36 128 8.15% 32
+// 17 240 8192 34 32 6.62% 16
+// 18 256 8192 32 0 5.86% 256
+// 19 288 8192 28 128 12.16% 32
+// 20 320 8192 25 192 11.80% 64
+// 21 352 8192 23 96 9.88% 32
+// 22 384 8192 21 128 9.51% 128
+// 23 416 8192 19 288 10.71% 32
+// 24 448 8192 18 128 8.37% 64
+// 25 480 8192 17 32 6.82% 32
+// 26 512 8192 16 0 6.05% 512
+// 27 576 8192 14 128 12.33% 64
+// 28 640 8192 12 512 15.48% 128
+// 29 704 8192 11 448 13.93% 64
+// 30 768 8192 10 512 13.94% 256
+// 31 896 8192 9 128 15.52% 128
+// 32 1024 8192 8 0 12.40% 1024
+// 33 1152 8192 7 128 12.41% 128
+// 34 1280 8192 6 512 15.55% 256
+// 35 1408 16384 11 896 14.00% 128
+// 36 1536 8192 5 512 14.00% 512
+// 37 1792 16384 9 256 15.57% 256
+// 38 2048 8192 4 0 12.45% 2048
+// 39 2304 16384 7 256 12.46% 256
+// 40 2688 8192 3 128 15.59% 128
+// 41 3072 24576 8 0 12.47% 1024
+// 42 3200 16384 5 384 6.22% 128
+// 43 3456 24576 7 384 8.83% 128
+// 44 4096 8192 2 0 15.60% 4096
+// 45 4864 24576 5 256 16.65% 256
+// 46 5376 16384 3 256 10.92% 256
+// 47 6144 24576 4 0 12.48% 2048
+// 48 6528 32768 5 128 6.23% 128
+// 49 6784 40960 6 256 4.36% 128
+// 50 6912 49152 7 768 3.37% 256
+// 51 8192 8192 1 0 15.61% 8192
+// 52 9472 57344 6 512 14.28% 256
+// 53 9728 49152 5 512 3.64% 512
+// 54 10240 40960 4 0 4.99% 2048
+// 55 10880 32768 3 128 6.24% 128
+// 56 12288 24576 2 0 11.45% 4096
+// 57 13568 40960 3 256 9.99% 256
+// 58 14336 57344 4 0 5.35% 2048
+// 59 16384 16384 1 0 12.49% 8192
+// 60 18432 73728 4 0 11.11% 2048
+// 61 19072 57344 3 128 3.57% 128
+// 62 20480 40960 2 0 6.87% 4096
+// 63 21760 65536 3 256 6.25% 256
+// 64 24576 24576 1 0 11.45% 8192
+// 65 27264 81920 3 128 10.00% 128
+// 66 28672 57344 2 0 4.91% 4096
+// 67 32768 32768 1 0 12.50% 8192
+
+// alignment bits min obj size
+// 8 3 8
+// 16 4 32
+// 32 5 256
+// 64 6 512
+// 128 7 768
+// 4096 12 28672
+// 8192 13 32768
const (
_MaxSmallSize = 32768
@@ -83,14 +92,6 @@ const (
var class_to_size = [_NumSizeClasses]uint16{0, 8, 16, 24, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 288, 320, 352, 384, 416, 448, 480, 512, 576, 640, 704, 768, 896, 1024, 1152, 1280, 1408, 1536, 1792, 2048, 2304, 2688, 3072, 3200, 3456, 4096, 4864, 5376, 6144, 6528, 6784, 6912, 8192, 9472, 9728, 10240, 10880, 12288, 13568, 14336, 16384, 18432, 19072, 20480, 21760, 24576, 27264, 28672, 32768}
var class_to_allocnpages = [_NumSizeClasses]uint8{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 3, 2, 3, 1, 3, 2, 3, 4, 5, 6, 1, 7, 6, 5, 4, 3, 5, 7, 2, 9, 7, 5, 8, 3, 10, 7, 4}
-
-type divMagic struct {
- shift uint8
- shift2 uint8
- mul uint16
- baseMask uint16
-}
-
-var class_to_divmagic = [_NumSizeClasses]divMagic{{0, 0, 0, 0}, {3, 0, 1, 65528}, {4, 0, 1, 65520}, {3, 11, 683, 0}, {5, 0, 1, 65504}, {4, 11, 683, 0}, {6, 0, 1, 65472}, {4, 10, 205, 0}, {5, 9, 171, 0}, {4, 11, 293, 0}, {7, 0, 1, 65408}, {4, 13, 911, 0}, {5, 10, 205, 0}, {4, 12, 373, 0}, {6, 9, 171, 0}, {4, 13, 631, 0}, {5, 11, 293, 0}, {4, 13, 547, 0}, {8, 0, 1, 65280}, {5, 9, 57, 0}, {6, 9, 103, 0}, {5, 12, 373, 0}, {7, 7, 43, 0}, {5, 10, 79, 0}, {6, 10, 147, 0}, {5, 11, 137, 0}, {9, 0, 1, 65024}, {6, 9, 57, 0}, {7, 9, 103, 0}, {6, 11, 187, 0}, {8, 7, 43, 0}, {7, 8, 37, 0}, {10, 0, 1, 64512}, {7, 9, 57, 0}, {8, 6, 13, 0}, {7, 11, 187, 0}, {9, 5, 11, 0}, {8, 8, 37, 0}, {11, 0, 1, 63488}, {8, 9, 57, 0}, {7, 10, 49, 0}, {10, 5, 11, 0}, {7, 10, 41, 0}, {7, 9, 19, 0}, {12, 0, 1, 61440}, {8, 9, 27, 0}, {8, 10, 49, 0}, {11, 5, 11, 0}, {7, 13, 161, 0}, {7, 13, 155, 0}, {8, 9, 19, 0}, {13, 0, 1, 57344}, {8, 12, 111, 0}, {9, 9, 27, 0}, {11, 6, 13, 0}, {7, 14, 193, 0}, {12, 3, 3, 0}, {8, 13, 155, 0}, {11, 8, 37, 0}, {14, 0, 1, 49152}, {11, 8, 29, 0}, {7, 13, 55, 0}, {12, 5, 7, 0}, {8, 14, 193, 0}, {13, 3, 3, 0}, {7, 14, 77, 0}, {12, 7, 19, 0}, {15, 0, 1, 32768}}
+var class_to_divmagic = [_NumSizeClasses]uint32{0, ^uint32(0)/8 + 1, ^uint32(0)/16 + 1, ^uint32(0)/24 + 1, ^uint32(0)/32 + 1, ^uint32(0)/48 + 1, ^uint32(0)/64 + 1, ^uint32(0)/80 + 1, ^uint32(0)/96 + 1, ^uint32(0)/112 + 1, ^uint32(0)/128 + 1, ^uint32(0)/144 + 1, ^uint32(0)/160 + 1, ^uint32(0)/176 + 1, ^uint32(0)/192 + 1, ^uint32(0)/208 + 1, ^uint32(0)/224 + 1, ^uint32(0)/240 + 1, ^uint32(0)/256 + 1, ^uint32(0)/288 + 1, ^uint32(0)/320 + 1, ^uint32(0)/352 + 1, ^uint32(0)/384 + 1, ^uint32(0)/416 + 1, ^uint32(0)/448 + 1, ^uint32(0)/480 + 1, ^uint32(0)/512 + 1, ^uint32(0)/576 + 1, ^uint32(0)/640 + 1, ^uint32(0)/704 + 1, ^uint32(0)/768 + 1, ^uint32(0)/896 + 1, ^uint32(0)/1024 + 1, ^uint32(0)/1152 + 1, ^uint32(0)/1280 + 1, ^uint32(0)/1408 + 1, ^uint32(0)/1536 + 1, ^uint32(0)/1792 + 1, ^uint32(0)/2048 + 1, ^uint32(0)/2304 + 1, ^uint32(0)/2688 + 1, ^uint32(0)/3072 + 1, ^uint32(0)/3200 + 1, ^uint32(0)/3456 + 1, ^uint32(0)/4096 + 1, ^uint32(0)/4864 + 1, ^uint32(0)/5376 + 1, ^uint32(0)/6144 + 1, ^uint32(0)/6528 + 1, ^uint32(0)/6784 + 1, ^uint32(0)/6912 + 1, ^uint32(0)/8192 + 1, ^uint32(0)/9472 + 1, ^uint32(0)/9728 + 1, ^uint32(0)/10240 + 1, ^uint32(0)/10880 + 1, ^uint32(0)/12288 + 1, ^uint32(0)/13568 + 1, ^uint32(0)/14336 + 1, ^uint32(0)/16384 + 1, ^uint32(0)/18432 + 1, ^uint32(0)/19072 + 1, ^uint32(0)/20480 + 1, ^uint32(0)/21760 + 1, ^uint32(0)/24576 + 1, ^uint32(0)/27264 + 1, ^uint32(0)/28672 + 1, ^uint32(0)/32768 + 1}
var size_to_class8 = [smallSizeMax/smallSizeDiv + 1]uint8{0, 1, 2, 3, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32}
var size_to_class128 = [(_MaxSmallSize-smallSizeMax)/largeSizeDiv + 1]uint8{32, 33, 34, 35, 36, 37, 37, 38, 38, 39, 39, 40, 40, 40, 41, 41, 41, 42, 43, 43, 44, 44, 44, 44, 44, 45, 45, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 54, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 58, 58, 58, 58, 58, 58, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 61, 61, 61, 61, 61, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 67}
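
The regenerated table gains a "min align" column, and class_to_divmagic collapses from a four-field divMagic struct to one 32-bit reciprocal per class: m = ^uint32(0)/size + 1, which is ceil(2^32/size), so an offset n within a span divides by the object size with a single multiply and shift, (n*m)>>32. A small self-contained check of that identity over the range the allocator uses:

    package main

    import "fmt"

    func main() {
        // A few object sizes sampled from class_to_size above.
        sizes := []uint32{8, 16, 24, 48, 208, 1408, 3456, 27264, 32768}
        const maxSmallSize = 32768

        for _, d := range sizes {
            m := ^uint32(0)/d + 1 // same formula as the new class_to_divmagic entries
            for n := uint32(0); n <= maxSmallSize; n++ {
                if uint32((uint64(n)*uint64(m))>>32) != n/d {
                    fmt.Printf("mismatch at n=%d, size=%d\n", n, d)
                    return
                }
            }
        }
        fmt.Println("(n*m)>>32 == n/size for every offset up to _MaxSmallSize")
    }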
diff --git a/libgo/go/runtime/sizeof_test.go b/libgo/go/runtime/sizeof_test.go
index 0afb05e19d2..f510eeb2f67 100644
--- a/libgo/go/runtime/sizeof_test.go
+++ b/libgo/go/runtime/sizeof_test.go
@@ -25,7 +25,7 @@ func TestSizeof(t *testing.T) {
_32bit uintptr // size on 32bit platforms
_64bit uintptr // size on 64bit platforms
}{
- {runtime.G{}, 216, 376}, // g, but exported for testing
+ {runtime.G{}, 236, 392}, // g, but exported for testing
{runtime.Sudog{}, 56, 88}, // sudog, but exported for testing
}
diff --git a/libgo/go/runtime/slice.go b/libgo/go/runtime/slice.go
index d4c0e9028b7..8654abab18d 100644
--- a/libgo/go/runtime/slice.go
+++ b/libgo/go/runtime/slice.go
@@ -152,6 +152,18 @@ func unsafeslice64(et *_type, ptr unsafe.Pointer, len64 int64) {
unsafeslice(et, ptr, len)
}
+func unsafeslicecheckptr(et *_type, ptr unsafe.Pointer, len64 int64) {
+ unsafeslice64(et, ptr, len64)
+
+ /* Commented out for gofrontend.
+ // Check that underlying array doesn't straddle multiple heap objects.
+ // unsafeslice64 has already checked for overflow.
+ if checkptrStraddles(ptr, uintptr(len64)*et.size) {
+ throw("checkptr: unsafe.Slice result straddles multiple allocations")
+ }
+ */
+}
+
func panicunsafeslicelen() {
panic(errorString("unsafe.Slice: len out of range"))
}
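
unsafeslicecheckptr is the checkptr-instrumented entry point for unsafe.Slice; for gofrontend the straddling check is stubbed out, leaving only the forwarding call. At the language level, a use that stays within a single allocation (the case the CheckPtrSliceOK test program added below exercises) looks like:

    package main

    import (
        "fmt"
        "unsafe"
    )

    func main() {
        buf := new([4]int64)

        // unsafe.Slice(&buf[1], 3) yields a []int64 of length and capacity 3
        // backed by buf[1:4]; it never leaves the buf allocation, so the
        // straddling check (where enabled) is satisfied.
        s := unsafe.Slice(&buf[1], 3)
        s[0], s[1], s[2] = 10, 20, 30

        fmt.Println(buf) // &[0 10 20 30]
    }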
diff --git a/libgo/go/runtime/stubs.go b/libgo/go/runtime/stubs.go
index dea7234b3ad..e24313c411b 100644
--- a/libgo/go/runtime/stubs.go
+++ b/libgo/go/runtime/stubs.go
@@ -274,9 +274,6 @@ func checkASM() bool {
return true
}
-// For gccgo this is in the C code.
-func osyield()
-
//extern __go_syscall6
func syscall(trap uintptr, a1, a2, a3, a4, a5, a6 uintptr) uintptr
diff --git a/libgo/go/runtime/stubs2.go b/libgo/go/runtime/stubs2.go
index 93ff5661880..c9e5f3d0c4a 100644
--- a/libgo/go/runtime/stubs2.go
+++ b/libgo/go/runtime/stubs2.go
@@ -2,9 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !js
-// +build !plan9
-// +build !windows
+//go:build !js && !plan9 && !windows
+// +build !js,!plan9,!windows
package runtime
@@ -21,6 +20,11 @@ func closefd(fd int32) int32
func exit(code int32)
func usleep(usec uint32)
+//go:nosplit
+func usleep_no_g(usec uint32) {
+ usleep(usec)
+}
+
// write calls the write system call.
// It returns a non-negative number of bytes written or a negative errno value.
//go:noescape
diff --git a/libgo/go/runtime/stubs3.go b/libgo/go/runtime/stubs3.go
index 35feb100c85..cb900b23aae 100644
--- a/libgo/go/runtime/stubs3.go
+++ b/libgo/go/runtime/stubs3.go
@@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//-go:build !aix && !darwin && !freebsd && !openbsd && !plan9 && !solaris
+// -build !aix,!darwin,!freebsd,!openbsd,!plan9,!solaris
+
package runtime
func nanotime1() int64
diff --git a/libgo/go/runtime/stubs_linux.go b/libgo/go/runtime/stubs_linux.go
index 3c733e37d9a..b8c957991fd 100644
--- a/libgo/go/runtime/stubs_linux.go
+++ b/libgo/go/runtime/stubs_linux.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build linux
// +build linux
package runtime
diff --git a/libgo/go/runtime/stubs_nonlinux.go b/libgo/go/runtime/stubs_nonlinux.go
index 325d35d537d..4f081d5b185 100644
--- a/libgo/go/runtime/stubs_nonlinux.go
+++ b/libgo/go/runtime/stubs_nonlinux.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !linux
// +build !linux
package runtime
diff --git a/libgo/go/runtime/symtab.go b/libgo/go/runtime/symtab.go
index 22a2b13977d..89a27c43fb0 100644
--- a/libgo/go/runtime/symtab.go
+++ b/libgo/go/runtime/symtab.go
@@ -62,8 +62,15 @@ func CallersFrames(callers []uintptr) *Frames {
return &Frames{callers: callers}
}
-// Next returns frame information for the next caller.
-// If more is false, there are no more callers (the Frame value is valid).
+// Next returns a Frame representing the next call frame in the slice
+// of PC values. If it has already returned all call frames, Next
+// returns a zero Frame.
+//
+// The more result indicates whether the next call to Next will return
+// a valid Frame. It does not necessarily indicate whether this call
+// returned one.
+//
+// See the Frames example for idiomatic usage.
func (ci *Frames) Next() (frame Frame, more bool) {
if len(ci.callers) == 0 {
return Frame{}, false
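
The rewritten doc comment refers readers to the Frames example; the idiomatic loop it has in mind consumes the returned Frame first and checks more afterwards, roughly:

    package main

    import (
        "fmt"
        "runtime"
    )

    func printStack() {
        pc := make([]uintptr, 16)
        n := runtime.Callers(2, pc) // skip runtime.Callers and printStack itself
        frames := runtime.CallersFrames(pc[:n])
        for {
            frame, more := frames.Next()
            fmt.Printf("%s\n\t%s:%d\n", frame.Function, frame.File, frame.Line)
            if !more {
                break
            }
        }
    }

    func main() {
        printStack()
    }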
diff --git a/libgo/go/runtime/sys_wasm.go b/libgo/go/runtime/sys_wasm.go
index 9bf710ba0e7..057ed4ccd90 100644
--- a/libgo/go/runtime/sys_wasm.go
+++ b/libgo/go/runtime/sys_wasm.go
@@ -27,13 +27,9 @@ func wasmTruncU()
func wasmExit(code int32)
// adjust Gobuf as if it executed a call to fn with context ctxt
-// and then did an immediate gosave.
+// and then stopped before the first instruction in fn.
func gostartcall(buf *gobuf, fn, ctxt unsafe.Pointer) {
sp := buf.sp
- if sys.RegSize > sys.PtrSize {
- sp -= sys.PtrSize
- *(*uintptr)(unsafe.Pointer(sp)) = 0
- }
sp -= sys.PtrSize
*(*uintptr)(unsafe.Pointer(sp)) = buf.pc
buf.sp = sp
diff --git a/libgo/go/runtime/testdata/testprog/checkptr.go b/libgo/go/runtime/testdata/testprog/checkptr.go
index e0a2794f4c1..9c5561396e5 100644
--- a/libgo/go/runtime/testdata/testprog/checkptr.go
+++ b/libgo/go/runtime/testdata/testprog/checkptr.go
@@ -4,15 +4,22 @@
package main
-import "unsafe"
+import (
+ "runtime"
+ "time"
+ "unsafe"
+)
func init() {
register("CheckPtrAlignmentNoPtr", CheckPtrAlignmentNoPtr)
register("CheckPtrAlignmentPtr", CheckPtrAlignmentPtr)
+ register("CheckPtrAlignmentNilPtr", CheckPtrAlignmentNilPtr)
register("CheckPtrArithmetic", CheckPtrArithmetic)
register("CheckPtrArithmetic2", CheckPtrArithmetic2)
register("CheckPtrSize", CheckPtrSize)
register("CheckPtrSmall", CheckPtrSmall)
+ register("CheckPtrSliceOK", CheckPtrSliceOK)
+ register("CheckPtrSliceFail", CheckPtrSliceFail)
}
func CheckPtrAlignmentNoPtr() {
@@ -27,6 +34,35 @@ func CheckPtrAlignmentPtr() {
sink2 = (**int64)(unsafe.Pointer(uintptr(p) + 1))
}
+// CheckPtrAlignmentNilPtr tests that checkptrAlignment doesn't crash
+// on nil pointers (#47430).
+func CheckPtrAlignmentNilPtr() {
+ var do func(int)
+ do = func(n int) {
+ // Inflate the stack so runtime.shrinkstack gets called during GC
+ if n > 0 {
+ do(n - 1)
+ }
+
+ var p unsafe.Pointer
+ _ = (*int)(p)
+ }
+
+ go func() {
+ for {
+ runtime.GC()
+ }
+ }()
+
+ go func() {
+ for i := 0; ; i++ {
+ do(i % 1024)
+ }
+ }()
+
+ time.Sleep(time.Second)
+}
+
func CheckPtrArithmetic() {
var x int
i := uintptr(unsafe.Pointer(&x))
@@ -49,3 +85,14 @@ func CheckPtrSize() {
func CheckPtrSmall() {
sink2 = unsafe.Pointer(uintptr(1))
}
+
+func CheckPtrSliceOK() {
+ p := new([4]int64)
+ sink2 = unsafe.Slice(&p[1], 3)
+}
+
+func CheckPtrSliceFail() {
+ p := new(int64)
+ sink2 = p
+ sink2 = unsafe.Slice(p, 100)
+}
diff --git a/libgo/go/runtime/testdata/testprog/crashdump.go b/libgo/go/runtime/testdata/testprog/crashdump.go
new file mode 100644
index 00000000000..bced397b8a3
--- /dev/null
+++ b/libgo/go/runtime/testdata/testprog/crashdump.go
@@ -0,0 +1,47 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "fmt"
+ "os"
+ "runtime"
+)
+
+func init() {
+ register("CrashDumpsAllThreads", CrashDumpsAllThreads)
+}
+
+func CrashDumpsAllThreads() {
+ const count = 4
+ runtime.GOMAXPROCS(count + 1)
+
+ chans := make([]chan bool, count)
+ for i := range chans {
+ chans[i] = make(chan bool)
+ go crashDumpsAllThreadsLoop(i, chans[i])
+ }
+
+ // Wait for all the goroutines to start executing.
+ for _, c := range chans {
+ <-c
+ }
+
+ // Tell our parent that all the goroutines are executing.
+ if _, err := os.NewFile(3, "pipe").WriteString("x"); err != nil {
+ fmt.Fprintf(os.Stderr, "write to pipe failed: %v\n", err)
+ os.Exit(2)
+ }
+
+ select {}
+}
+
+func crashDumpsAllThreadsLoop(i int, c chan bool) {
+ close(c)
+ for {
+ for j := 0; j < 0x7fffffff; j++ {
+ }
+ }
+}
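
CrashDumpsAllThreads tells its parent it is ready by writing one byte to file descriptor 3, which the test harness hands to the child via exec.Cmd.ExtraFiles (on Unix, entry i of ExtraFiles becomes fd 3+i in the child). A sketch of the parent side of that handshake, with a shell one-liner standing in for the test program:

    package main

    import (
        "fmt"
        "os"
        "os/exec"
    )

    func main() {
        r, w, err := os.Pipe()
        if err != nil {
            panic(err)
        }

        // The child sees the write end of the pipe as fd 3.
        cmd := exec.Command("sh", "-c", "printf x >&3")
        cmd.ExtraFiles = []*os.File{w}
        if err := cmd.Start(); err != nil {
            panic(err)
        }
        w.Close() // drop our copy so only the child holds the write end

        buf := make([]byte, 1)
        if _, err := r.Read(buf); err != nil {
            panic(err)
        }
        fmt.Println("child signalled readiness:", string(buf))
        cmd.Wait()
    }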
diff --git a/libgo/go/runtime/testdata/testprogcgo/aprof.go b/libgo/go/runtime/testdata/testprogcgo/aprof.go
index aabca9e1ebd..44a15b08650 100644
--- a/libgo/go/runtime/testdata/testprogcgo/aprof.go
+++ b/libgo/go/runtime/testdata/testprogcgo/aprof.go
@@ -10,7 +10,7 @@ package main
// The test fails when the function is the first C function.
// The exported functions are the first C functions, so we use that.
-// extern void GoNop();
+// extern void CallGoNop();
import "C"
import (
@@ -38,7 +38,7 @@ func CgoCCodeSIGPROF() {
break
}
}
- C.GoNop()
+ C.CallGoNop()
}
c <- true
}()
diff --git a/libgo/go/runtime/testdata/testprogcgo/aprof_c.c b/libgo/go/runtime/testdata/testprogcgo/aprof_c.c
new file mode 100644
index 00000000000..d588e13045f
--- /dev/null
+++ b/libgo/go/runtime/testdata/testprogcgo/aprof_c.c
@@ -0,0 +1,9 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "_cgo_export.h"
+
+void CallGoNop() {
+ GoNop();
+}
diff --git a/libgo/go/runtime/testdata/testprogcgo/bigstack1_windows.c b/libgo/go/runtime/testdata/testprogcgo/bigstack1_windows.c
new file mode 100644
index 00000000000..551fb683093
--- /dev/null
+++ b/libgo/go/runtime/testdata/testprogcgo/bigstack1_windows.c
@@ -0,0 +1,12 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This is not in bigstack_windows.c because it needs to be part of
+// testprogcgo but is not part of the DLL built from bigstack_windows.c.
+
+#include "_cgo_export.h"
+
+void CallGoBigStack1(char* p) {
+ goBigStack1(p);
+}
diff --git a/libgo/go/runtime/testdata/testprogcgo/bigstack_windows.go b/libgo/go/runtime/testdata/testprogcgo/bigstack_windows.go
index f58fcf993f0..135b5fcfe04 100644
--- a/libgo/go/runtime/testdata/testprogcgo/bigstack_windows.go
+++ b/libgo/go/runtime/testdata/testprogcgo/bigstack_windows.go
@@ -6,7 +6,7 @@ package main
/*
typedef void callback(char*);
-extern void goBigStack1(char*);
+extern void CallGoBigStack1(char*);
extern void bigStack(callback*);
*/
import "C"
@@ -18,7 +18,7 @@ func init() {
func BigStack() {
// Create a large thread stack and call back into Go to test
// if Go correctly determines the stack bounds.
- C.bigStack((*C.callback)(C.goBigStack1))
+ C.bigStack((*C.callback)(C.CallGoBigStack1))
}
//export goBigStack1
diff --git a/libgo/go/runtime/testdata/testprogcgo/lockosthread.c b/libgo/go/runtime/testdata/testprogcgo/lockosthread.c
index 9a8c057761e..b10cc4f3b92 100644
--- a/libgo/go/runtime/testdata/testprogcgo/lockosthread.c
+++ b/libgo/go/runtime/testdata/testprogcgo/lockosthread.c
@@ -9,5 +9,5 @@
uint32_t threadExited;
void setExited(void *x) {
- __atomic_add_fetch(&threadExited, 1, __ATOMIC_SEQ_CST);
+ __sync_fetch_and_add(&threadExited, 1);
}
diff --git a/libgo/go/runtime/testdata/testprogcgo/pprof.go b/libgo/go/runtime/testdata/testprogcgo/pprof.go
index 3a3176d2201..668d4e0c7be 100644
--- a/libgo/go/runtime/testdata/testprogcgo/pprof.go
+++ b/libgo/go/runtime/testdata/testprogcgo/pprof.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !gccgo
// +build !gccgo
package main
diff --git a/libgo/go/runtime/testdata/testprogcgo/raceprof.go b/libgo/go/runtime/testdata/testprogcgo/raceprof.go
index 0750ec168c5..c17bb3956e0 100644
--- a/libgo/go/runtime/testdata/testprogcgo/raceprof.go
+++ b/libgo/go/runtime/testdata/testprogcgo/raceprof.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build ((linux && amd64) || (freebsd && amd64)) && !gccgo
// +build linux,amd64 freebsd,amd64
// +build !gccgo
diff --git a/libgo/go/runtime/testdata/testprogcgo/threadpprof.go b/libgo/go/runtime/testdata/testprogcgo/threadpprof.go
index 3b36c3096ea..ed3faed404c 100644
--- a/libgo/go/runtime/testdata/testprogcgo/threadpprof.go
+++ b/libgo/go/runtime/testdata/testprogcgo/threadpprof.go
@@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !plan9,!windows
-// +build !gccgo
+//go:build !plan9 && !windows && !gccgo
+// +build !plan9,!windows,!gccgo
package main
@@ -50,13 +50,13 @@ void pprofCgoThreadTraceback(void* parg) {
arg->buf[0] = (uintptr_t)(cpuHogThread) + 0x10;
arg->buf[1] = (uintptr_t)(cpuHogThread2) + 0x4;
arg->buf[2] = 0;
- __atomic_add_fetch(&cpuHogThreadCount, 1, __ATOMIC_SEQ_CST);
+ __sync_add_and_fetch(&cpuHogThreadCount, 1);
}
// getCPUHogThreadCount fetches the number of times we've seen cpuHogThread
// in the traceback.
int getCPUHogThreadCount() {
- return __atomic_load(&cpuHogThreadCount, __ATOMIC_SEQ_CST);
+ return __sync_add_and_fetch(&cpuHogThreadCount, 0);
}
static void* cpuHogDriver(void* arg __attribute__ ((unused))) {
diff --git a/libgo/go/runtime/testdata/testprogcgo/traceback.go b/libgo/go/runtime/testdata/testprogcgo/traceback.go
index 46cf644ec98..e04a6358770 100644
--- a/libgo/go/runtime/testdata/testprogcgo/traceback.go
+++ b/libgo/go/runtime/testdata/testprogcgo/traceback.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !gccgo
// +build !gccgo
package main
diff --git a/libgo/go/runtime/testdata/testprogcgo/traceback_c.c b/libgo/go/runtime/testdata/testprogcgo/traceback_c.c
new file mode 100644
index 00000000000..4de749c0143
--- /dev/null
+++ b/libgo/go/runtime/testdata/testprogcgo/traceback_c.c
@@ -0,0 +1,68 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !gccgo
+// +build !gccgo
+
+// The C definitions for traceback.go. That file uses //export so
+// it can't put function definitions in the "C" import comment.
+
+#include <stdint.h>
+
+char *p;
+
+int crashInGo;
+extern void h1(void);
+
+int tracebackF3(void) {
+ if (crashInGo)
+ h1();
+ else
+ *p = 0;
+ return 0;
+}
+
+int tracebackF2(void) {
+ return tracebackF3();
+}
+
+int tracebackF1(void) {
+ return tracebackF2();
+}
+
+struct cgoTracebackArg {
+ uintptr_t context;
+ uintptr_t sigContext;
+ uintptr_t* buf;
+ uintptr_t max;
+};
+
+struct cgoSymbolizerArg {
+ uintptr_t pc;
+ const char* file;
+ uintptr_t lineno;
+ const char* func;
+ uintptr_t entry;
+ uintptr_t more;
+ uintptr_t data;
+};
+
+void cgoTraceback(void* parg) {
+ struct cgoTracebackArg* arg = (struct cgoTracebackArg*)(parg);
+ arg->buf[0] = 1;
+ arg->buf[1] = 2;
+ arg->buf[2] = 3;
+ arg->buf[3] = 0;
+}
+
+void cgoSymbolizer(void* parg) {
+ struct cgoSymbolizerArg* arg = (struct cgoSymbolizerArg*)(parg);
+ if (arg->pc != arg->data + 1) {
+ arg->file = "unexpected data";
+ } else {
+ arg->file = "cgo symbolizer";
+ }
+ arg->lineno = arg->data + 1;
+ arg->data++;
+}
diff --git a/libgo/go/runtime/testdata/testprogcgo/traceback_gccgo.go b/libgo/go/runtime/testdata/testprogcgo/traceback_gccgo.go
index 83357fdd5d9..2d07c2db8a4 100644
--- a/libgo/go/runtime/testdata/testprogcgo/traceback_gccgo.go
+++ b/libgo/go/runtime/testdata/testprogcgo/traceback_gccgo.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build gccgo
// +build gccgo
package main
diff --git a/libgo/go/runtime/testdata/testprogcgo/tracebackctxt.go b/libgo/go/runtime/testdata/testprogcgo/tracebackctxt.go
index 3940a77d4f9..69078b7bc70 100644
--- a/libgo/go/runtime/testdata/testprogcgo/tracebackctxt.go
+++ b/libgo/go/runtime/testdata/testprogcgo/tracebackctxt.go
@@ -2,8 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// The __attribute__((weak)) used below doesn't seem to work on Windows.
-
+//go:build !gccgo
// +build !gccgo
package main
@@ -16,20 +15,24 @@ package main
extern void C1(void);
extern void C2(void);
extern void tcContext(void*);
+extern void tcContextSimple(void*);
extern void tcTraceback(void*);
extern void tcSymbolizer(void*);
extern int getContextCount(void);
+extern void TracebackContextPreemptionCallGo(int);
*/
import "C"
import (
"fmt"
"runtime"
+ "sync"
"unsafe"
)
func init() {
register("TracebackContext", TracebackContext)
+ register("TracebackContextPreemption", TracebackContextPreemption)
}
var tracebackOK bool
@@ -107,3 +110,30 @@ wantLoop:
tracebackOK = false
}
}
+
+// Issue 47441.
+func TracebackContextPreemption() {
+ runtime.SetCgoTraceback(0, unsafe.Pointer(C.tcTraceback), unsafe.Pointer(C.tcContextSimple), unsafe.Pointer(C.tcSymbolizer))
+
+ const funcs = 10
+ const calls = 1e5
+ var wg sync.WaitGroup
+ for i := 0; i < funcs; i++ {
+ wg.Add(1)
+ go func(i int) {
+ defer wg.Done()
+ for j := 0; j < calls; j++ {
+ C.TracebackContextPreemptionCallGo(C.int(i*calls + j))
+ }
+ }(i)
+ }
+ wg.Wait()
+
+ fmt.Println("OK")
+}
+
+//export TracebackContextPreemptionGoFunction
+func TracebackContextPreemptionGoFunction(i C.int) {
+ // Do some busy work.
+ fmt.Sprintf("%d\n", i)
+}
diff --git a/libgo/go/runtime/testdata/testprogcgo/tracebackctxt_c.c b/libgo/go/runtime/testdata/testprogcgo/tracebackctxt_c.c
index ff6895e637c..ff8430948a1 100644
--- a/libgo/go/runtime/testdata/testprogcgo/tracebackctxt_c.c
+++ b/libgo/go/runtime/testdata/testprogcgo/tracebackctxt_c.c
@@ -5,6 +5,7 @@
// The C definitions for tracebackctxt.go. That file uses //export so
// it can't put function definitions in the "C" import comment.
+//go:build !gccgo
// +build !gccgo
#include <stdlib.h>
@@ -13,6 +14,7 @@
// Functions exported from Go.
extern void G1(void);
extern void G2(void);
+extern void TracebackContextPreemptionGoFunction(int);
void C1() {
G1();
@@ -49,25 +51,32 @@ struct cgoSymbolizerArg {
static int contextCount;
int getContextCount() {
- return __atomic_load_n(&contextCount, __ATOMIC_SEQ_CST);
+ return __sync_add_and_fetch(&contextCount, 0);
}
void tcContext(void* parg) {
struct cgoContextArg* arg = (struct cgoContextArg*)(parg);
if (arg->context == 0) {
- arg->context = __atomic_add_fetch(&contextCount, 1, __ATOMIC_SEQ_CST);
+ arg->context = __sync_add_and_fetch(&contextCount, 1);
} else {
- if (arg->context != __atomic_load_n(&contextCount, __ATOMIC_SEQ_CST))
+ if (arg->context != __sync_add_and_fetch(&contextCount, 0)) {
abort();
}
- __atomic_sub_fetch(&contextCount, 1, __ATOMIC_SEQ_CST);
+ __sync_sub_and_fetch(&contextCount, 1);
+ }
+}
+
+void tcContextSimple(void* parg) {
+ struct cgoContextArg* arg = (struct cgoContextArg*)(parg);
+ if (arg->context == 0) {
+ arg->context = 1;
}
}
void tcTraceback(void* parg) {
int base, i;
struct cgoTracebackArg* arg = (struct cgoTracebackArg*)(parg);
- if (arg->context == 0) {
+ if (arg->context == 0 && arg->sigContext == 0) {
// This shouldn't happen in this program.
abort();
}
@@ -91,3 +100,7 @@ void tcSymbolizer(void *parg) {
arg->func = "cFunction";
arg->lineno = arg->pc + (arg->more << 16);
}
+
+void TracebackContextPreemptionCallGo(int i) {
+ TracebackContextPreemptionGoFunction(i);
+}
diff --git a/libgo/go/runtime/testdata/testprogcgo/windows/win.go b/libgo/go/runtime/testdata/testprogcgo/windows/win.go
index f2eabb95488..12488aa6588 100644
--- a/libgo/go/runtime/testdata/testprogcgo/windows/win.go
+++ b/libgo/go/runtime/testdata/testprogcgo/windows/win.go
@@ -1,7 +1,7 @@
package windows
/*
-#cgo CFLAGS: -mnop-fun-dllimport
+#cgo amd64 386 CFLAGS: -mnop-fun-dllimport
#include <windows.h>
diff --git a/libgo/go/runtime/testdata/testwinlibsignal/dummy.go b/libgo/go/runtime/testdata/testwinlibsignal/dummy.go
index 82dfd91c93a..e610f15d06c 100644
--- a/libgo/go/runtime/testdata/testwinlibsignal/dummy.go
+++ b/libgo/go/runtime/testdata/testwinlibsignal/dummy.go
@@ -1,7 +1,10 @@
+//go:build windows
// +build windows
package main
+import "C"
+
//export Dummy
func Dummy() int {
return 42
diff --git a/libgo/go/runtime/testdata/testwinlibsignal/main.c b/libgo/go/runtime/testdata/testwinlibsignal/main.c
new file mode 100644
index 00000000000..37f24823e66
--- /dev/null
+++ b/libgo/go/runtime/testdata/testwinlibsignal/main.c
@@ -0,0 +1,57 @@
+#include <windows.h>
+#include <stdio.h>
+
+HANDLE waitForCtrlBreakEvent;
+
+BOOL WINAPI CtrlHandler(DWORD fdwCtrlType)
+{
+ switch (fdwCtrlType)
+ {
+ case CTRL_BREAK_EVENT:
+ SetEvent(waitForCtrlBreakEvent);
+ return TRUE;
+ default:
+ return FALSE;
+ }
+}
+
+int main(void)
+{
+ waitForCtrlBreakEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
+ if (!waitForCtrlBreakEvent) {
+ fprintf(stderr, "ERROR: Could not create event\n");
+ return 1;
+ }
+
+ if (!SetConsoleCtrlHandler(CtrlHandler, TRUE))
+ {
+ fprintf(stderr, "ERROR: Could not set control handler\n");
+ return 1;
+ }
+
+ // The library must be loaded after the SetConsoleCtrlHandler call
+ // so that the library handler registers after the main program.
+ // This way the library handler gets called first.
+ HMODULE dummyDll = LoadLibrary("dummy.dll");
+ if (!dummyDll) {
+ fprintf(stderr, "ERROR: Could not load dummy.dll\n");
+ return 1;
+ }
+
+ // Call the Dummy function so that Go initialization completes, since
+ // all cgo entry points call out to _cgo_wait_runtime_init_done.
+ if (((int(*)(void))GetProcAddress(dummyDll, "Dummy"))() != 42) {
+ fprintf(stderr, "ERROR: Dummy function did not return 42\n");
+ return 1;
+ }
+
+ printf("ready\n");
+ fflush(stdout);
+
+ if (WaitForSingleObject(waitForCtrlBreakEvent, 5000) != WAIT_OBJECT_0) {
+ fprintf(stderr, "FAILURE: No signal received\n");
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/libgo/go/runtime/time.go b/libgo/go/runtime/time.go
index 327726c09db..7cc9156d19c 100644
--- a/libgo/go/runtime/time.go
+++ b/libgo/go/runtime/time.go
@@ -332,7 +332,6 @@ func deltimer(t *timer) bool {
// Must fetch t.pp before setting status
// to timerDeleted.
tpp := t.pp.ptr()
- atomic.Xadd(&tpp.adjustTimers, -1)
if !atomic.Cas(&t.status, timerModifying, timerDeleted) {
badTimer()
}
@@ -509,20 +508,9 @@ loop:
tpp := t.pp.ptr()
- // Update the adjustTimers field. Subtract one if we
- // are removing a timerModifiedEarlier, add one if we
- // are adding a timerModifiedEarlier.
- adjust := int32(0)
- if status == timerModifiedEarlier {
- adjust--
- }
if newStatus == timerModifiedEarlier {
- adjust++
updateTimerModifiedEarliest(tpp, when)
}
- if adjust != 0 {
- atomic.Xadd(&tpp.adjustTimers, adjust)
- }
// Set the new status of the timer.
if !atomic.Cas(&t.status, timerModifying, newStatus) {
@@ -590,9 +578,6 @@ func cleantimers(pp *p) {
// Move t to the right position.
dodeltimer0(pp)
doaddtimer(pp, t)
- if s == timerModifiedEarlier {
- atomic.Xadd(&pp.adjustTimers, -1)
- }
if !atomic.Cas(&t.status, timerMoving, timerWaiting) {
badTimer()
}
@@ -663,37 +648,23 @@ func moveTimers(pp *p, timers []*timer) {
// it also moves timers that have been modified to run later,
// and removes deleted timers. The caller must have locked the timers for pp.
func adjusttimers(pp *p, now int64) {
- if atomic.Load(&pp.adjustTimers) == 0 {
- if verifyTimers {
- verifyTimerHeap(pp)
- }
- // There are no timers to adjust, so it is safe to clear
- // timerModifiedEarliest. Do so in case it is stale.
- // Everything will work if we don't do this,
- // but clearing here may save future calls to adjusttimers.
- atomic.Store64(&pp.timerModifiedEarliest, 0)
- return
- }
-
// If we haven't yet reached the time of the first timerModifiedEarlier
// timer, don't do anything. This speeds up programs that adjust
// a lot of timers back and forth if the timers rarely expire.
// We'll postpone looking through all the adjusted timers until
// one would actually expire.
- if first := atomic.Load64(&pp.timerModifiedEarliest); first != 0 {
- if int64(first) > now {
- if verifyTimers {
- verifyTimerHeap(pp)
- }
- return
+ first := atomic.Load64(&pp.timerModifiedEarliest)
+ if first == 0 || int64(first) > now {
+ if verifyTimers {
+ verifyTimerHeap(pp)
}
-
- // We are going to clear all timerModifiedEarlier timers.
- atomic.Store64(&pp.timerModifiedEarliest, 0)
+ return
}
+ // We are going to clear all timerModifiedEarlier timers.
+ atomic.Store64(&pp.timerModifiedEarliest, 0)
+
var moved []*timer
-loop:
for i := 0; i < len(pp.timers); i++ {
t := pp.timers[i]
if t.pp.ptr() != pp {
@@ -720,11 +691,6 @@ loop:
// loop to skip some other timer.
dodeltimer(pp, i)
moved = append(moved, t)
- if s == timerModifiedEarlier {
- if n := atomic.Xadd(&pp.adjustTimers, -1); int32(n) <= 0 {
- break loop
- }
- }
// Look at this heap position again.
i--
}
@@ -823,9 +789,6 @@ func runtimer(pp *p, now int64) int64 {
t.when = t.nextwhen
dodeltimer0(pp)
doaddtimer(pp, t)
- if s == timerModifiedEarlier {
- atomic.Xadd(&pp.adjustTimers, -1)
- }
if !atomic.Cas(&t.status, timerMoving, timerWaiting) {
badTimer()
}
@@ -898,7 +861,6 @@ func clearDeletedTimers(pp *p) {
atomic.Store64(&pp.timerModifiedEarliest, 0)
cdel := int32(0)
- cearlier := int32(0)
to := 0
changedHeap := false
timers := pp.timers
@@ -923,9 +885,6 @@ nextTimer:
if !atomic.Cas(&t.status, timerMoving, timerWaiting) {
badTimer()
}
- if s == timerModifiedEarlier {
- cearlier++
- }
continue nextTimer
}
case timerDeleted:
@@ -962,7 +921,6 @@ nextTimer:
atomic.Xadd(&pp.deletedTimers, -cdel)
atomic.Xadd(&pp.numTimers, -cdel)
- atomic.Xadd(&pp.adjustTimers, -cearlier)
timers = timers[:to]
pp.timers = timers
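
With the per-P adjustTimers counter gone, adjusttimers (and the other hunks above) key entirely off the timerModifiedEarliest word: zero means there are no timerModifiedEarlier timers, any other value is the earliest such when. A stand-alone sketch of that gate, using sync/atomic in place of runtime/internal/atomic:

    package main

    import (
        "fmt"
        "sync/atomic"
    )

    func main() {
        var timerModifiedEarliest uint64 // 0 means "no timerModifiedEarlier timers"
        now := int64(1_000)

        // A timer is modified to fire earlier, at t=2000.
        atomic.StoreUint64(&timerModifiedEarliest, 2_000)

        first := atomic.LoadUint64(&timerModifiedEarliest)
        if first == 0 || int64(first) > now {
            fmt.Println("not due yet: skip the adjustment pass")
            return
        }
        // Otherwise clear the word and walk the timer heap, as adjusttimers now does.
        atomic.StoreUint64(&timerModifiedEarliest, 0)
        fmt.Println("run the adjustment pass")
    }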
diff --git a/libgo/go/runtime/time_fake.go b/libgo/go/runtime/time_fake.go
index c64d2994a90..c790faba3d6 100644
--- a/libgo/go/runtime/time_fake.go
+++ b/libgo/go/runtime/time_fake.go
@@ -2,20 +2,13 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build faketime
-// +build !windows
-
-// Faketime isn't currently supported on Windows. This would require:
-//
-// 1. Shadowing time_now, which is implemented in assembly on Windows.
-// Since that's exported directly to the time package from runtime
-// assembly, this would involve moving it from sys_windows_*.s into
-// its own assembly files build-tagged with !faketime and using the
-// implementation of time_now from timestub.go in faketime mode.
-//
-// 2. Modifying syscall.Write to call syscall.faketimeWrite,
-// translating the Stdout and Stderr handles into FDs 1 and 2.
-// (See CL 192739 PS 3.)
+//go:build faketime && !windows
+// +build faketime,!windows
+
+// Faketime isn't currently supported on Windows. This would require
+// modifying syscall.Write to call syscall.faketimeWrite,
+// translating the Stdout and Stderr handles into FDs 1 and 2.
+// (See CL 192739 PS 3.)
package runtime
@@ -44,8 +37,9 @@ func nanotime() int64 {
return faketime
}
-func walltime() (sec int64, nsec int32) {
- return faketime / 1000000000, int32(faketime % 1000000000)
+//go:linkname time_now time.now
+func time_now() (sec int64, nsec int32, mono int64) {
+ return faketime / 1e9, int32(faketime % 1e9), faketime
}
func write(fd uintptr, p unsafe.Pointer, n int32) int32 {
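
Under faketime the runtime now provides time.now directly (note the go:linkname) instead of a walltime shim, returning wall seconds, nanoseconds, and a monotonic reading all derived from the one fake clock value. The decomposition on its own, with an illustrative reading:

    package main

    import "fmt"

    func main() {
        faketime := int64(1_257_894_000_000_000_000) // illustrative fake clock value, in nanoseconds

        sec := faketime / 1e9
        nsec := int32(faketime % 1e9)
        mono := faketime // the fake clock doubles as the monotonic reading

        fmt.Println(sec, nsec, mono)
    }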
diff --git a/libgo/go/runtime/time_nofake.go b/libgo/go/runtime/time_nofake.go
index 1912a94e873..5a4ceaf43d2 100644
--- a/libgo/go/runtime/time_nofake.go
+++ b/libgo/go/runtime/time_nofake.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !faketime
// +build !faketime
package runtime
@@ -19,10 +20,6 @@ func nanotime() int64 {
return nanotime1()
}
-func walltime() (sec int64, nsec int32) {
- return walltime1()
-}
-
// write must be nosplit on Windows (see write1)
//
//go:nosplit
diff --git a/libgo/go/runtime/timeasm.go b/libgo/go/runtime/timeasm.go
index 6bfaa8a8f0e..2c06dcb940e 100644
--- a/libgo/go/runtime/timeasm.go
+++ b/libgo/go/runtime/timeasm.go
@@ -4,8 +4,10 @@
// Declarations for operating systems implementing time.now directly in assembly.
+//go:build ignore && !faketime && (windows || (linux && amd64))
// +build ignore
-// +build windows
+// +build !faketime
+// +build windows linux,amd64
package runtime
diff --git a/libgo/go/runtime/timestub.go b/libgo/go/runtime/timestub.go
index 14165cee974..76de76f2548 100644
--- a/libgo/go/runtime/timestub.go
+++ b/libgo/go/runtime/timestub.go
@@ -5,7 +5,10 @@
// Declarations for operating systems implementing time.now
// indirectly, in terms of walltime and nanotime assembly.
+//-go:build !faketime && !windows && !(linux && amd64)
+// -build !faketime
// -build !windows
+// -build !linux !amd64
package runtime
diff --git a/libgo/go/runtime/timestub2.go b/libgo/go/runtime/timestub2.go
index f691388c729..d9b4ee0c0ac 100644
--- a/libgo/go/runtime/timestub2.go
+++ b/libgo/go/runtime/timestub2.go
@@ -2,6 +2,15 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//-go:build !aix && !darwin && !freebsd && !openbsd && !solaris && !windows && !(linux && amd64)
+// -build !aix
+// -build !darwin
+// -build !freebsd
+// -build !openbsd
+// -build !solaris
+// -build !windows
+// -build !linux !amd64
+
package runtime
-func walltime1() (sec int64, nsec int32)
+func walltime() (sec int64, nsec int32)
diff --git a/libgo/go/runtime/tls_stub.go b/libgo/go/runtime/tls_stub.go
new file mode 100644
index 00000000000..95dafd007c1
--- /dev/null
+++ b/libgo/go/runtime/tls_stub.go
@@ -0,0 +1,11 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (windows && !amd64) || !windows
+// +build windows,!amd64 !windows
+
+package runtime
+
+//go:nosplit
+func osSetupTLS(mp *m) {}
diff --git a/libgo/go/runtime/tls_windows_amd64.go b/libgo/go/runtime/tls_windows_amd64.go
new file mode 100644
index 00000000000..cacaa844966
--- /dev/null
+++ b/libgo/go/runtime/tls_windows_amd64.go
@@ -0,0 +1,10 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+// osSetupTLS is called by needm to set up TLS for non-Go threads.
+//
+// Defined in assembly.
+func osSetupTLS(mp *m)
diff --git a/libgo/go/runtime/trace.go b/libgo/go/runtime/trace.go
index f13c81a3799..a7c36bac878 100644
--- a/libgo/go/runtime/trace.go
+++ b/libgo/go/runtime/trace.go
@@ -53,8 +53,8 @@ const (
traceEvGoSysBlock = 30 // syscall blocks [timestamp]
traceEvGoWaiting = 31 // denotes that goroutine is blocked when tracing starts [timestamp, goroutine id]
traceEvGoInSyscall = 32 // denotes that goroutine is in syscall when tracing starts [timestamp, goroutine id]
- traceEvHeapAlloc = 33 // memstats.heap_live change [timestamp, heap_alloc]
- traceEvNextGC = 34 // memstats.next_gc change [timestamp, next_gc]
+ traceEvHeapAlloc = 33 // gcController.heapLive change [timestamp, heap_alloc]
+ traceEvHeapGoal = 34 // gcController.heapGoal (formerly next_gc) change [timestamp, heap goal in bytes]
traceEvTimerGoroutine = 35 // not currently used; previously denoted timer goroutine [timer goroutine id]
traceEvFutileWakeup = 36 // denotes that the previous wakeup of this goroutine was futile [timestamp]
traceEvString = 37 // string dictionary entry [ID, length, string]
@@ -222,7 +222,8 @@ func StartTrace() error {
stackID := traceStackID(mp, stkBuf, 2)
releasem(mp)
- for _, gp := range allgs {
+ // World is stopped, no need to lock.
+ forEachGRace(func(gp *g) {
status := readgstatus(gp)
if status != _Gdead {
gp.traceseq = 0
@@ -242,7 +243,7 @@ func StartTrace() error {
} else {
gp.sysblocktraced = false
}
- }
+ })
traceProcStart()
traceGoStart()
// Note: ticksStart needs to be set after we emit traceEvGoInSyscall events.
@@ -1138,15 +1139,15 @@ func traceGoSysBlock(pp *p) {
}
func traceHeapAlloc() {
- traceEvent(traceEvHeapAlloc, -1, memstats.heap_live)
+ traceEvent(traceEvHeapAlloc, -1, gcController.heapLive)
}
-func traceNextGC() {
- if nextGC := atomic.Load64(&memstats.next_gc); nextGC == ^uint64(0) {
+func traceHeapGoal() {
+ if heapGoal := atomic.Load64(&gcController.heapGoal); heapGoal == ^uint64(0) {
// Heap-based triggering is disabled.
- traceEvent(traceEvNextGC, -1, 0)
+ traceEvent(traceEvHeapGoal, -1, 0)
} else {
- traceEvent(traceEvNextGC, -1, nextGC)
+ traceEvent(traceEvHeapGoal, -1, heapGoal)
}
}
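
Besides renaming traceEvNextGC to traceEvHeapGoal to match the pacer's gcController fields, StartTrace now walks goroutines through forEachGRace, a callback-style iterator, instead of ranging over allgs directly (safe here because the world is stopped). The shape of that refactor in miniature, with illustrative types:

    package main

    import "fmt"

    type goroutine struct{ id int }

    // forEachG plays the role of forEachGRace: it hides how the list of
    // goroutines is obtained and simply invokes fn on each one.
    func forEachG(gs []*goroutine, fn func(*goroutine)) {
        for _, gp := range gs {
            fn(gp)
        }
    }

    func main() {
        gs := []*goroutine{{id: 1}, {id: 2}, {id: 3}}

        // Before: for _, gp := range allgs { ... }
        // After:  forEachGRace(func(gp *g) { ... })
        forEachG(gs, func(gp *goroutine) {
            fmt.Println("emit trace state for goroutine", gp.id)
        })
    }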
diff --git a/libgo/go/runtime/traceback_test.go b/libgo/go/runtime/traceback_test.go
new file mode 100644
index 00000000000..152db6c5c02
--- /dev/null
+++ b/libgo/go/runtime/traceback_test.go
@@ -0,0 +1,297 @@
+// Copyright 2021 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+ "bytes"
+ "runtime"
+ "strings"
+ "testing"
+)
+
+var testTracebackArgsBuf [1000]byte
+
+func TestTracebackArgs(t *testing.T) {
+ tests := []struct {
+ fn func() int
+ expect string
+ }{
+ // simple ints
+ {
+ func() int { return testTracebackArgs1(1, 2, 3, 4, 5) },
+ "testTracebackArgs1(0x1, 0x2, 0x3, 0x4, 0x5)",
+ },
+ // some aggregates
+ {
+ func() int {
+ return testTracebackArgs2(false, struct {
+ a, b, c int
+ x [2]int
+ }{1, 2, 3, [2]int{4, 5}}, [0]int{}, [3]byte{6, 7, 8})
+ },
+ "testTracebackArgs2(0x0, {0x1, 0x2, 0x3, {0x4, 0x5}}, {}, {0x6, 0x7, 0x8})",
+ },
+ {
+ func() int { return testTracebackArgs3([3]byte{1, 2, 3}, 4, 5, 6, [3]byte{7, 8, 9}) },
+ "testTracebackArgs3({0x1, 0x2, 0x3}, 0x4, 0x5, 0x6, {0x7, 0x8, 0x9})",
+ },
+ // too deeply nested type
+ {
+ func() int { return testTracebackArgs4(false, [1][1][1][1][1][1][1][1][1][1]int{}) },
+ "testTracebackArgs4(0x0, {{{{{...}}}}})",
+ },
+ // a lot of zero-sized type
+ {
+ func() int {
+ z := [0]int{}
+ return testTracebackArgs5(false, struct {
+ x int
+ y [0]int
+ z [2][0]int
+ }{1, z, [2][0]int{}}, z, z, z, z, z, z, z, z, z, z, z, z)
+ },
+ "testTracebackArgs5(0x0, {0x1, {}, {{}, {}}}, {}, {}, {}, {}, {}, ...)",
+ },
+
+ // edge cases for ...
+ // no ... for 10 args
+ {
+ func() int { return testTracebackArgs6a(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) },
+ "testTracebackArgs6a(0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa)",
+ },
+ // has ... for 11 args
+ {
+ func() int { return testTracebackArgs6b(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) },
+ "testTracebackArgs6b(0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, ...)",
+ },
+ // no ... for aggregates with 10 words
+ {
+ func() int { return testTracebackArgs7a([10]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}) },
+ "testTracebackArgs7a({0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa})",
+ },
+ // has ... for aggregates with 11 words
+ {
+ func() int { return testTracebackArgs7b([11]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}) },
+ "testTracebackArgs7b({0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, ...})",
+ },
+ // no ... for aggregates, but with more args
+ {
+ func() int { return testTracebackArgs7c([10]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 11) },
+ "testTracebackArgs7c({0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa}, ...)",
+ },
+ // has ... for aggregates and also for more args
+ {
+ func() int { return testTracebackArgs7d([11]int{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, 12) },
+ "testTracebackArgs7d({0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, ...}, ...)",
+ },
+ // nested aggregates, no ...
+ {
+ func() int { return testTracebackArgs8a(testArgsType8a{1, 2, 3, 4, 5, 6, 7, 8, [2]int{9, 10}}) },
+ "testTracebackArgs8a({0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, {0x9, 0xa}})",
+ },
+ // nested aggregates, ... in inner but not outer
+ {
+ func() int { return testTracebackArgs8b(testArgsType8b{1, 2, 3, 4, 5, 6, 7, 8, [3]int{9, 10, 11}}) },
+ "testTracebackArgs8b({0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, {0x9, 0xa, ...}})",
+ },
+ // nested aggregates, ... in outer but not inner
+ {
+ func() int { return testTracebackArgs8c(testArgsType8c{1, 2, 3, 4, 5, 6, 7, 8, [2]int{9, 10}, 11}) },
+ "testTracebackArgs8c({0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, {0x9, 0xa}, ...})",
+ },
+ // nested aggregates, ... in both inner and outer
+ {
+ func() int { return testTracebackArgs8d(testArgsType8d{1, 2, 3, 4, 5, 6, 7, 8, [3]int{9, 10, 11}, 12}) },
+ "testTracebackArgs8d({0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, {0x9, 0xa, ...}, ...})",
+ },
+ }
+ for _, test := range tests {
+ n := test.fn()
+ got := testTracebackArgsBuf[:n]
+ expect := test.expect
+ if runtime.Compiler == "gccgo" {
+ expect = expect[:strings.Index(expect, "(")]
+ }
+ if !bytes.Contains(got, []byte(expect)) {
+ t.Errorf("traceback does not contain expected string: want %q, got\n%s", expect, got)
+ }
+ }
+}
+
+//go:noinline
+func testTracebackArgs1(a, b, c, d, e int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a < 0 {
+ // use in-reg args to keep them alive
+ return a + b + c + d + e
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs2(a bool, b struct {
+ a, b, c int
+ x [2]int
+}, _ [0]int, d [3]byte) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a {
+ // use in-reg args to keep them alive
+ return b.a + b.b + b.c + b.x[0] + b.x[1] + int(d[0]) + int(d[1]) + int(d[2])
+ }
+ return n
+
+}
+
+//go:noinline
+//go:registerparams
+func testTracebackArgs3(x [3]byte, a, b, c int, y [3]byte) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a < 0 {
+ // use in-reg args to keep them alive
+ return int(x[0]) + int(x[1]) + int(x[2]) + a + b + c + int(y[0]) + int(y[1]) + int(y[2])
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs4(a bool, x [1][1][1][1][1][1][1][1][1][1]int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a {
+ panic(x) // use args to keep them alive
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs5(a bool, x struct {
+ x int
+ y [0]int
+ z [2][0]int
+}, _, _, _, _, _, _, _, _, _, _, _, _ [0]int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a {
+ panic(x) // use args to keep them alive
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs6a(a, b, c, d, e, f, g, h, i, j int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a < 0 {
+ // use in-reg args to keep them alive
+ return a + b + c + d + e + f + g + h + i + j
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs6b(a, b, c, d, e, f, g, h, i, j, k int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a < 0 {
+ // use in-reg args to keep them alive
+ return a + b + c + d + e + f + g + h + i + j + k
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs7a(a [10]int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a[0] < 0 {
+ // use in-reg args to keep them alive
+ return a[1] + a[2] + a[3] + a[4] + a[5] + a[6] + a[7] + a[8] + a[9]
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs7b(a [11]int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a[0] < 0 {
+ // use in-reg args to keep them alive
+ return a[1] + a[2] + a[3] + a[4] + a[5] + a[6] + a[7] + a[8] + a[9] + a[10]
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs7c(a [10]int, b int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a[0] < 0 {
+ // use in-reg args to keep them alive
+ return a[1] + a[2] + a[3] + a[4] + a[5] + a[6] + a[7] + a[8] + a[9] + b
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs7d(a [11]int, b int) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a[0] < 0 {
+ // use in-reg args to keep them alive
+ return a[1] + a[2] + a[3] + a[4] + a[5] + a[6] + a[7] + a[8] + a[9] + a[10] + b
+ }
+ return n
+}
+
+type testArgsType8a struct {
+ a, b, c, d, e, f, g, h int
+ i [2]int
+}
+type testArgsType8b struct {
+ a, b, c, d, e, f, g, h int
+ i [3]int
+}
+type testArgsType8c struct {
+ a, b, c, d, e, f, g, h int
+ i [2]int
+ j int
+}
+type testArgsType8d struct {
+ a, b, c, d, e, f, g, h int
+ i [3]int
+ j int
+}
+
+//go:noinline
+func testTracebackArgs8a(a testArgsType8a) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a.a < 0 {
+ // use in-reg args to keep them alive
+ return a.b + a.c + a.d + a.e + a.f + a.g + a.h + a.i[0] + a.i[1]
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs8b(a testArgsType8b) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a.a < 0 {
+ // use in-reg args to keep them alive
+ return a.b + a.c + a.d + a.e + a.f + a.g + a.h + a.i[0] + a.i[1] + a.i[2]
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs8c(a testArgsType8c) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a.a < 0 {
+ // use in-reg args to keep them alive
+ return a.b + a.c + a.d + a.e + a.f + a.g + a.h + a.i[0] + a.i[1] + a.j
+ }
+ return n
+}
+
+//go:noinline
+func testTracebackArgs8d(a testArgsType8d) int {
+ n := runtime.Stack(testTracebackArgsBuf[:], false)
+ if a.a < 0 {
+ // use in-reg args to keep them alive
+ return a.b + a.c + a.d + a.e + a.f + a.g + a.h + a.i[0] + a.i[1] + a.i[2] + a.j
+ }
+ return n
+}
diff --git a/libgo/go/runtime/type.go b/libgo/go/runtime/type.go
index 56b4fe6a111..c788342aee3 100644
--- a/libgo/go/runtime/type.go
+++ b/libgo/go/runtime/type.go
@@ -24,6 +24,10 @@ const (
tflagRegularMemory tflag = 1 << 3 // equal and hash can treat values of this type as a single region of t.size bytes
)
+// Needs to be in sync with
+// go/types.cc
+// ../reflect/type.go:/^type.rtype.
+// ../internal/reflectlite/type.go:/^type.rtype.
type _type struct {
size uintptr
ptrdata uintptr
@@ -118,7 +122,7 @@ type maptype struct {
}
// Note: flag values must match those used in the TMAP case
-// in ../cmd/compile/internal/gc/reflect.go:dtypesym.
+// in ../cmd/compile/internal/reflectdata/reflect.go:writeType.
func (mt *maptype) indirectkey() bool { // store ptr to key instead of key itself
return mt.flags&1 != 0
}
diff --git a/libgo/go/runtime/write_err.go b/libgo/go/runtime/write_err.go
index 6b1467b1c48..a4656fd7281 100644
--- a/libgo/go/runtime/write_err.go
+++ b/libgo/go/runtime/write_err.go
@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+//go:build !android
// +build !android
package runtime
diff --git a/libgo/go/runtime/write_err_android.go b/libgo/go/runtime/write_err_android.go
index 2419fc8663e..a876900c954 100644
--- a/libgo/go/runtime/write_err_android.go
+++ b/libgo/go/runtime/write_err_android.go
@@ -144,7 +144,7 @@ func writeLogdHeader() int {
// hdr[3:7] sec unsigned uint32, little endian.
// hdr[7:11] nsec unsigned uint32, little endian.
hdr[0] = 0 // LOG_ID_MAIN
- sec, nsec := walltime()
+ sec, nsec, _ := time_now()
packUint32(hdr[3:7], uint32(sec))
packUint32(hdr[7:11], uint32(nsec))
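
writeLogdHeader now takes the wall clock from time_now and, as the surrounding comments describe, packs the seconds and nanoseconds little-endian at offsets 3 and 7 of the logd header. The same packing expressed with encoding/binary:

    package main

    import (
        "encoding/binary"
        "fmt"
        "time"
    )

    func main() {
        var hdr [11]byte
        hdr[0] = 0 // LOG_ID_MAIN

        now := time.Now()
        sec, nsec := now.Unix(), int32(now.Nanosecond())

        binary.LittleEndian.PutUint32(hdr[3:7], uint32(sec))
        binary.LittleEndian.PutUint32(hdr[7:11], uint32(nsec))

        fmt.Printf("header bytes: % x\n", hdr[:])
    }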