| author | Russ Cox <rsc@golang.org> | 2014-12-05 11:40:41 -0500 |
| --- | --- | --- |
| committer | Russ Cox <rsc@golang.org> | 2014-12-05 11:40:41 -0500 |
| commit | 60e02bbbea5d3bf4a6b06551f09c984285f8139c (patch) | |
| tree | 69e27f09db888f62c2aa9db29ab0c7233c1b286b | |
| parent | 6d3ba1914e289ed223f7bb69f34604c0e2ae5384 (diff) | |
| parent | e5fc9ffb729e31c4eb0a6518e819e9fc70f14818 (diff) | |
| download | go-60e02bbbea5d3bf4a6b06551f09c984285f8139c.tar.gz | |
[dev.garbage] all: merge dev.cc (81884b89bd88) into dev.garbage
TBR=rlh
CC=golang-codereviews
https://codereview.appspot.com/181100044
43 files changed, 1874 insertions, 552 deletions
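The runtime half of this merge rewrites gogc (src/runtime/malloc.go, below) as a phased sequence of stop/start-the-world steps around a concurrent scan and a concurrent mark. The sketch below is orientation only: the ordering of steps is taken from the malloc.go hunk, while systemstack and the gc* helpers are reduced to prints purely for illustration.

```go
package main

import "fmt"

// systemstack and the step helpers are stand-ins; only the ordering of the
// calls in gogcSketch is taken from the gogc hunk in this commit.
func systemstack(f func()) { f() }

func step(name string) func() { return func() { fmt.Println(name) } }

var (
	stoptheworld      = step("stop the world")
	starttheworld     = step("start the world")
	finishsweep_m     = step("finish the previous sweep")
	gcscan_m          = step("scan stacks and globals concurrently")
	gcinstallmarkwb_m = step("install the mark write barrier")
	gcmark_m          = step("mark the heap concurrently")
	gcinstalloffwb_m  = step("turn the write barrier back off")
)

func gogcSketch() {
	systemstack(stoptheworld)
	systemstack(finishsweep_m) // finish sweep before starting a concurrent scan
	systemstack(starttheworld)
	systemstack(gcscan_m)
	systemstack(stoptheworld)
	systemstack(gcinstallmarkwb_m)
	systemstack(starttheworld)
	systemstack(gcmark_m)
	systemstack(stoptheworld)
	systemstack(gcinstalloffwb_m)
	// Mark termination (gc_m) and the optional checkmark pass (gccheckmark_m)
	// then run with the world still stopped.
}

func main() { gogcSketch() }
```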
diff --git a/lib/codereview/codereview.py b/lib/codereview/codereview.py index 9181f1df3..0c9b27a31 100644 --- a/lib/codereview/codereview.py +++ b/lib/codereview/codereview.py @@ -2024,13 +2024,13 @@ def submit(ui, repo, *pats, **opts): # push to remote; if it fails for any reason, roll back try: new_heads = len(hg_heads(ui, repo).split()) - if old_heads != new_heads and not (old_heads == 0 and new_heads == 1): + if cl.desc.find("create new branch") < 0 and old_heads != new_heads and not (old_heads == 0 and new_heads == 1): # Created new head, so we weren't up to date. need_sync() # Push changes to remote. If it works, we're committed. If not, roll back. try: - if hg_push(ui, repo): + if hg_push(ui, repo, new_branch=cl.desc.find("create new branch")>=0): raise hg_util.Abort("push error") except hg_error.Abort, e: if e.message.find("push creates new heads") >= 0: diff --git a/src/cmd/api/goapi.go b/src/cmd/api/goapi.go index 5a8c87603..e49ba33bb 100644 --- a/src/cmd/api/goapi.go +++ b/src/cmd/api/goapi.go @@ -405,6 +405,7 @@ func (w *Walker) parseFile(dir, file string) (*ast.File, error) { " note struct{};" + " p struct{};" + " parfor struct{};" + + " slice struct{};" + " slicetype struct{};" + " stkframe struct{};" + " sudog struct{};" + diff --git a/src/cmd/gc/builtin.c b/src/cmd/gc/builtin.c index fbca4ee5f..aeeadedca 100644 --- a/src/cmd/gc/builtin.c +++ b/src/cmd/gc/builtin.c @@ -24,6 +24,8 @@ char *runtimeimport = "func @\"\".printslice (? any)\n" "func @\"\".printnl ()\n" "func @\"\".printsp ()\n" + "func @\"\".printlock ()\n" + "func @\"\".printunlock ()\n" "func @\"\".concatstring2 (? string, ? string) (? string)\n" "func @\"\".concatstring3 (? string, ? string, ? string) (? string)\n" "func @\"\".concatstring4 (? string, ? string, ? string, ? string) (? 
string)\n" @@ -86,10 +88,33 @@ char *runtimeimport = "func @\"\".writebarrierstring (@\"\".dst·1 *any, @\"\".src·2 any)\n" "func @\"\".writebarrierslice (@\"\".dst·1 *any, @\"\".src·2 any)\n" "func @\"\".writebarrieriface (@\"\".dst·1 *any, @\"\".src·2 any)\n" - "func @\"\".writebarrierfat2 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" - "func @\"\".writebarrierfat3 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" - "func @\"\".writebarrierfat4 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat01 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat10 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat11 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat001 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat010 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat011 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat100 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat101 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat110 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat111 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat0001 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat0010 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat0011 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat0100 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat0101 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat0110 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat0111 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat1000 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat1001 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat1010 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat1011 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat1100 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat1101 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat1110 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" + "func @\"\".writebarrierfat1111 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n" "func @\"\".writebarrierfat (@\"\".typ·1 *byte, @\"\".dst·2 *any, @\"\".src·3 *any)\n" + "func @\"\".writebarriercopy (@\"\".typ·2 *byte, @\"\".dst·3 any, @\"\".src·4 any) (? int)\n" "func @\"\".selectnbsend (@\"\".chanType·2 *byte, @\"\".hchan·3 chan<- any, @\"\".elem·4 *any) (? bool)\n" "func @\"\".selectnbrecv (@\"\".chanType·2 *byte, @\"\".elem·3 *any, @\"\".hchan·4 <-chan any) (? bool)\n" "func @\"\".selectnbrecv2 (@\"\".chanType·2 *byte, @\"\".elem·3 *any, @\"\".received·4 *bool, @\"\".hchan·5 <-chan any) (? 
bool)\n" diff --git a/src/cmd/gc/go.h b/src/cmd/gc/go.h index 6e326961d..5236305f8 100644 --- a/src/cmd/gc/go.h +++ b/src/cmd/gc/go.h @@ -1473,6 +1473,7 @@ void walk(Node *fn); void walkexpr(Node **np, NodeList **init); void walkexprlist(NodeList *l, NodeList **init); void walkexprlistsafe(NodeList *l, NodeList **init); +void walkexprlistcheap(NodeList *l, NodeList **init); void walkstmt(Node **np); void walkstmtlist(NodeList *l); Node* conv(Node*, Type*); diff --git a/src/cmd/gc/popt.c b/src/cmd/gc/popt.c index 993bb2482..6e6db88ef 100644 --- a/src/cmd/gc/popt.c +++ b/src/cmd/gc/popt.c @@ -847,6 +847,10 @@ nilopt(Prog *firstp) Graph *g; int ncheck, nkill; + // TODO(minux): nilopt on power64 throw away seemly random segment of code. + if(thechar == '9') + return; + g = flowstart(firstp, sizeof(NilFlow)); if(g == nil) return; diff --git a/src/cmd/gc/runtime.go b/src/cmd/gc/runtime.go index 0fb15c265..c6007714c 100644 --- a/src/cmd/gc/runtime.go +++ b/src/cmd/gc/runtime.go @@ -36,6 +36,8 @@ func printeface(any) func printslice(any) func printnl() func printsp() +func printlock() +func printunlock() func concatstring2(string, string) string func concatstring3(string, string, string) string @@ -115,10 +117,35 @@ func writebarrieriface(dst *any, src any) // The unused *byte argument makes sure that src is 2-pointer-aligned, // which is the maximum alignment on NaCl amd64p32 // (and possibly on 32-bit systems if we start 64-bit aligning uint64s). -func writebarrierfat2(dst *any, _ *byte, src any) -func writebarrierfat3(dst *any, _ *byte, src any) -func writebarrierfat4(dst *any, _ *byte, src any) +// The bitmap in the name tells which words being copied are pointers. +func writebarrierfat01(dst *any, _ *byte, src any) +func writebarrierfat10(dst *any, _ *byte, src any) +func writebarrierfat11(dst *any, _ *byte, src any) +func writebarrierfat001(dst *any, _ *byte, src any) +func writebarrierfat010(dst *any, _ *byte, src any) +func writebarrierfat011(dst *any, _ *byte, src any) +func writebarrierfat100(dst *any, _ *byte, src any) +func writebarrierfat101(dst *any, _ *byte, src any) +func writebarrierfat110(dst *any, _ *byte, src any) +func writebarrierfat111(dst *any, _ *byte, src any) +func writebarrierfat0001(dst *any, _ *byte, src any) +func writebarrierfat0010(dst *any, _ *byte, src any) +func writebarrierfat0011(dst *any, _ *byte, src any) +func writebarrierfat0100(dst *any, _ *byte, src any) +func writebarrierfat0101(dst *any, _ *byte, src any) +func writebarrierfat0110(dst *any, _ *byte, src any) +func writebarrierfat0111(dst *any, _ *byte, src any) +func writebarrierfat1000(dst *any, _ *byte, src any) +func writebarrierfat1001(dst *any, _ *byte, src any) +func writebarrierfat1010(dst *any, _ *byte, src any) +func writebarrierfat1011(dst *any, _ *byte, src any) +func writebarrierfat1100(dst *any, _ *byte, src any) +func writebarrierfat1101(dst *any, _ *byte, src any) +func writebarrierfat1110(dst *any, _ *byte, src any) +func writebarrierfat1111(dst *any, _ *byte, src any) + func writebarrierfat(typ *byte, dst *any, src *any) +func writebarriercopy(typ *byte, dst any, src any) int func selectnbsend(chanType *byte, hchan chan<- any, elem *any) bool func selectnbrecv(chanType *byte, elem *any, hchan <-chan any) bool diff --git a/src/cmd/gc/typecheck.c b/src/cmd/gc/typecheck.c index 714c66268..f05d8022d 100644 --- a/src/cmd/gc/typecheck.c +++ b/src/cmd/gc/typecheck.c @@ -2891,7 +2891,8 @@ typecheckas(Node *n) case OSLICE3: case OSLICESTR: // For x = x[0:y], x can be updated in place, without 
touching pointer. - if(samesafeexpr(n->left, n->right->left) && (n->right->right->left == N || iszero(n->right->right->left))) + // TODO(rsc): Reenable once it is actually updated in place without touching the pointer. + if(0 && samesafeexpr(n->left, n->right->left) && (n->right->right->left == N || iszero(n->right->right->left))) n->right->reslice = 1; break; @@ -2899,7 +2900,8 @@ typecheckas(Node *n) // For x = append(x, ...), x can be updated in place when there is capacity, // without touching the pointer; otherwise the emitted code to growslice // can take care of updating the pointer, and only in that case. - if(n->right->list != nil && samesafeexpr(n->left, n->right->list->n)) + // TODO(rsc): Reenable once the emitted code does update the pointer. + if(0 && n->right->list != nil && samesafeexpr(n->left, n->right->list->n)) n->right->reslice = 1; break; } diff --git a/src/cmd/gc/walk.c b/src/cmd/gc/walk.c index 77f9c80f9..061089349 100644 --- a/src/cmd/gc/walk.c +++ b/src/cmd/gc/walk.c @@ -6,6 +6,7 @@ #include <libc.h> #include "go.h" #include "../ld/textflag.h" +#include "../../runtime/mgc0.h" static Node* walkprint(Node*, NodeList**); static Node* writebarrierfn(char*, Type*, Type*); @@ -363,6 +364,15 @@ walkexprlistsafe(NodeList *l, NodeList **init) } void +walkexprlistcheap(NodeList *l, NodeList **init) +{ + for(; l; l=l->next) { + l->n = cheapexpr(l->n, init); + walkexpr(&l->n, init); + } +} + +void walkexpr(Node **np, NodeList **init) { Node *r, *l, *var, *a; @@ -1771,6 +1781,11 @@ walkprint(Node *nn, NodeList **init) calls = nil; notfirst = 0; + // Hoist all the argument evaluation up before the lock. + walkexprlistcheap(all, init); + + calls = list(calls, mkcall("printlock", T, init)); + for(l=all; l; l=l->next) { if(notfirst) { calls = list(calls, mkcall("printsp", T, init)); @@ -1851,6 +1866,9 @@ walkprint(Node *nn, NodeList **init) if(op == OPRINTN) calls = list(calls, mkcall("printnl", T, nil)); + + calls = list(calls, mkcall("printunlock", T, init)); + typechecklist(calls, Etop); walkexprlist(calls, init); @@ -1987,6 +2005,9 @@ applywritebarrier(Node *n, NodeList **init) { Node *l, *r; Type *t; + vlong x; + static Bvec *bv; + char name[32]; if(n->left && n->right && needwritebarrier(n->left, n->right)) { t = n->left->type; @@ -2004,14 +2025,35 @@ applywritebarrier(Node *n, NodeList **init) } else if(isinter(t)) { n = mkcall1(writebarrierfn("writebarrieriface", t, n->right->type), T, init, l, n->right); - } else if(t->width == 2*widthptr) { - n = mkcall1(writebarrierfn("writebarrierfat2", t, n->right->type), T, init, - l, nodnil(), n->right); - } else if(t->width == 3*widthptr) { - n = mkcall1(writebarrierfn("writebarrierfat3", t, n->right->type), T, init, - l, nodnil(), n->right); - } else if(t->width == 4*widthptr) { - n = mkcall1(writebarrierfn("writebarrierfat4", t, n->right->type), T, init, + } else if(t->width <= 4*widthptr) { + x = 0; + if(bv == nil) + bv = bvalloc(BitsPerPointer*4); + bvresetall(bv); + twobitwalktype1(t, &x, bv); + // The bvgets are looking for BitsPointer in successive slots. 
+ enum { + PtrBit = 1, + }; + if(BitsPointer != (1<<PtrBit)) + fatal("wrong PtrBit"); + switch(t->width/widthptr) { + default: + fatal("found writebarrierfat for %d-byte object of type %T", (int)t->width, t); + case 2: + snprint(name, sizeof name, "writebarrierfat%d%d", + bvget(bv, PtrBit), bvget(bv, BitsPerPointer+PtrBit)); + break; + case 3: + snprint(name, sizeof name, "writebarrierfat%d%d%d", + bvget(bv, PtrBit), bvget(bv, BitsPerPointer+PtrBit), bvget(bv, 2*BitsPerPointer+PtrBit)); + break; + case 4: + snprint(name, sizeof name, "writebarrierfat%d%d%d%d", + bvget(bv, PtrBit), bvget(bv, BitsPerPointer+PtrBit), bvget(bv, 2*BitsPerPointer+PtrBit), bvget(bv, 3*BitsPerPointer+PtrBit)); + break; + } + n = mkcall1(writebarrierfn(name, t, n->right->type), T, init, l, nodnil(), n->right); } else { r = n->right; @@ -2873,6 +2915,11 @@ copyany(Node *n, NodeList **init, int runtimecall) { Node *nl, *nr, *nfrm, *nto, *nif, *nlen, *nwid, *fn; NodeList *l; + + if(haspointers(n->left->type->type)) { + fn = writebarrierfn("writebarriercopy", n->left->type, n->right->type); + return mkcall1(fn, n->type, init, typename(n->left->type->type), n->left, n->right); + } if(runtimecall) { if(n->right->type->etype == TSTRING) diff --git a/src/run.bash b/src/run.bash index 6b9ecc33c..b8ce417a0 100755 --- a/src/run.bash +++ b/src/run.bash @@ -162,7 +162,8 @@ esac # Race detector only supported on Linux, FreeBSD and OS X, # and only on amd64, and only when cgo is enabled. # Delayed until here so we know whether to try external linking. -case "$GOHOSTOS-$GOOS-$GOARCH-$CGO_ENABLED" in +# DISABLED until we get garbage collection working. +case "$GOHOSTOS-$GOOS-$GOARCH-$CGO_ENABLED-XXX-DISABLED" in linux-linux-amd64-1 | freebsd-freebsd-amd64-1 | darwin-darwin-amd64-1) echo echo '# Testing race detector.' 
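The builtin.c, runtime.go, and walk.c hunks above replace the size-based helpers writebarrierfat{2,3,4} with variants whose name suffix is a per-word pointer bitmap (writebarrierfat101 copies a three-word value whose first and third words are pointers), selected in applywritebarrier from the type's pointer map. Below is a minimal sketch of that name selection using a plain []bool mask; it is illustrative only, not the compiler's code, which reads the bits out of a Bvec built by twobitwalktype1.

```go
package main

import "fmt"

// writebarrierFatName mimics how applywritebarrier (src/cmd/gc/walk.c above)
// picks a runtime helper for copying a 2- to 4-word value: one digit per word,
// 1 if that word holds a pointer, 0 if it is a scalar.
func writebarrierFatName(ptrmask []bool) (string, error) {
	if len(ptrmask) < 2 || len(ptrmask) > 4 {
		return "", fmt.Errorf("no writebarrierfat helper for %d-word objects", len(ptrmask))
	}
	name := "writebarrierfat"
	for _, isPtr := range ptrmask {
		if isPtr {
			name += "1"
		} else {
			name += "0"
		}
	}
	return name, nil
}

func main() {
	// A three-word value whose first and third words are pointers,
	// e.g. a string header followed by an unsafe.Pointer.
	name, err := writebarrierFatName([]bool{true, false, true})
	fmt.Println(name, err) // writebarrierfat101 <nil>
}
```

All-zero masks never arise here, since applywritebarrier only runs when needwritebarrier sees a type containing pointers, which is presumably why the runtime list above has no all-zeros variants.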
diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s index a02bb5556..7cc64a3a4 100644 --- a/src/runtime/asm_386.s +++ b/src/runtime/asm_386.s @@ -2285,3 +2285,23 @@ TEXT runtime·getg(SB),NOSPLIT,$0-4 MOVL AX, ret+0(FP) RET +TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4 + MOVL addr+0(FP), AX + PREFETCHT0 (AX) + RET + +TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4 + MOVL addr+0(FP), AX + PREFETCHT1 (AX) + RET + + +TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4 + MOVL addr+0(FP), AX + PREFETCHT2 (AX) + RET + +TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4 + MOVL addr+0(FP), AX + PREFETCHNTA (AX) + RET diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index 6e3f5ff6c..14be2fe92 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -2228,3 +2228,23 @@ TEXT runtime·getg(SB),NOSPLIT,$0-8 MOVQ g(CX), AX MOVQ AX, ret+0(FP) RET + +TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), AX + PREFETCHT0 (AX) + RET + +TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), AX + PREFETCHT1 (AX) + RET + +TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), AX + PREFETCHT2 (AX) + RET + +TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8 + MOVQ addr+0(FP), AX + PREFETCHNTA (AX) + RET diff --git a/src/runtime/asm_amd64p32.s b/src/runtime/asm_amd64p32.s index b8370efd3..c87d848fe 100644 --- a/src/runtime/asm_amd64p32.s +++ b/src/runtime/asm_amd64p32.s @@ -1079,3 +1079,24 @@ TEXT runtime·getg(SB),NOSPLIT,$0-4 MOVL g(CX), AX MOVL AX, ret+0(FP) RET + +TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4 + MOVL addr+0(FP), AX + PREFETCHT0 (AX) + RET + +TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4 + MOVL addr+0(FP), AX + PREFETCHT1 (AX) + RET + + +TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4 + MOVL addr+0(FP), AX + PREFETCHT2 (AX) + RET + +TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4 + MOVL addr+0(FP), AX + PREFETCHNTA (AX) + RET diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s index 583c7ba50..c6c98b443 100644 --- a/src/runtime/asm_arm.s +++ b/src/runtime/asm_arm.s @@ -1320,3 +1320,15 @@ TEXT runtime·goexit(SB),NOSPLIT,$-4-0 TEXT runtime·getg(SB),NOSPLIT,$-4-4 MOVW g, ret+0(FP) RET + +TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4 + RET + +TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4 + RET + +TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4 + RET + +TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4 + RET diff --git a/src/runtime/asm_power64x.s b/src/runtime/asm_power64x.s index 3f2ab6d0e..548c88e47 100644 --- a/src/runtime/asm_power64x.s +++ b/src/runtime/asm_power64x.s @@ -977,3 +977,15 @@ TEXT runtime·goexit(SB),NOSPLIT,$-8-0 TEXT runtime·getg(SB),NOSPLIT,$-8-8 MOVD g, ret+0(FP) RETURN + +TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8 + RETURN + +TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8 + RETURN + +TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8 + RETURN + +TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8 + RETURN diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index e871236e4..5ed255026 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -26,7 +26,7 @@ var Exitsyscall = exitsyscall var LockedOSThread = lockedOSThread type LFNode struct { - Next *LFNode + Next uint64 Pushcnt uintptr } diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go index 0c1a60c8b..e1693d40f 100644 --- a/src/runtime/heapdump.go +++ b/src/runtime/heapdump.go @@ -464,8 +464,8 @@ func dumpobjs() { if n > uintptr(len(freemark)) { gothrow("freemark array doesn't have enough entries") } - for l := s.freelist; l != nil; l = l.next { - freemark[(uintptr(unsafe.Pointer(l))-p)/size] = true + for l := s.freelist; l.ptr() != nil; l = 
l.ptr().next { + freemark[(uintptr(l)-p)/size] = true } for j := uintptr(0); j < n; j, p = j+1, p+size { if freemark[j] { diff --git a/src/runtime/lfstack.go b/src/runtime/lfstack.go index 8a36a67b3..fd3325972 100644 --- a/src/runtime/lfstack.go +++ b/src/runtime/lfstack.go @@ -18,7 +18,7 @@ func lfstackpush(head *uint64, node *lfnode) { } for { old := atomicload64(head) - node.next, _ = lfstackUnpack(old) + node.next = old if cas64(head, old, new) { break } @@ -32,12 +32,8 @@ func lfstackpop(head *uint64) unsafe.Pointer { return nil } node, _ := lfstackUnpack(old) - node2 := (*lfnode)(atomicloadp(unsafe.Pointer(&node.next))) - new := uint64(0) - if node2 != nil { - new = lfstackPack(node2, node2.pushcnt) - } - if cas64(head, old, new) { + next := atomicload64(&node.next) + if cas64(head, old, next) { return unsafe.Pointer(node) } } diff --git a/src/runtime/lfstack_test.go b/src/runtime/lfstack_test.go index e51877704..68f221d6e 100644 --- a/src/runtime/lfstack_test.go +++ b/src/runtime/lfstack_test.go @@ -121,7 +121,7 @@ func TestLFStackStress(t *testing.T) { } cnt++ sum2 += node.data - node.Next = nil + node.Next = 0 } } if cnt != K { diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index d73d1ba6a..e9fec7bb1 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -140,14 +140,14 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { // Allocate a new maxTinySize block. s = c.alloc[tinySizeClass] v := s.freelist - if v == nil { + if v.ptr() == nil { systemstack(func() { mCache_Refill(c, tinySizeClass) }) s = c.alloc[tinySizeClass] v = s.freelist } - s.freelist = v.next + s.freelist = v.ptr().next s.ref++ //TODO: prefetch v.next x = unsafe.Pointer(v) @@ -170,19 +170,19 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { size = uintptr(class_to_size[sizeclass]) s = c.alloc[sizeclass] v := s.freelist - if v == nil { + if v.ptr() == nil { systemstack(func() { mCache_Refill(c, int32(sizeclass)) }) s = c.alloc[sizeclass] v = s.freelist } - s.freelist = v.next + s.freelist = v.ptr().next s.ref++ //TODO: prefetch x = unsafe.Pointer(v) if flags&flagNoZero == 0 { - v.next = nil + v.ptr().next = 0 if size > 2*ptrSize && ((*[2]uintptr)(x))[1] != 0 { memclr(unsafe.Pointer(v), size) } @@ -241,6 +241,8 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { masksize = masksize * pointersPerByte / 8 // 4 bits per word masksize++ // unroll flag in the beginning if masksize > maxGCMask && typ.gc[1] != 0 { + // write barriers have not been updated to deal with this case yet. + gothrow("maxGCMask too small for now") // If the mask is too large, unroll the program directly // into the GC bitmap. It's 7 times slower than copying // from the pre-unrolled mask, but saves 1/16 of type size @@ -295,6 +297,17 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { } } marked: + + // GCmarkterminate allocates black + // All slots hold nil so no scanning is needed. + // This may be racing with GC so do it atomically if there can be + // a race marking the bit. 
+ if gcphase == _GCmarktermination { + systemstack(func() { + gcmarknewobject_m(uintptr(x)) + }) + } + if raceenabled { racemalloc(x, size) } @@ -328,13 +341,43 @@ marked: } } - if memstats.heap_alloc >= memstats.next_gc { + if memstats.heap_alloc >= memstats.next_gc/2 { gogc(0) } return x } +func loadPtrMask(typ *_type) []uint8 { + var ptrmask *uint8 + nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize + if typ.kind&kindGCProg != 0 { + masksize := nptr + if masksize%2 != 0 { + masksize *= 2 // repeated + } + masksize = masksize * pointersPerByte / 8 // 4 bits per word + masksize++ // unroll flag in the beginning + if masksize > maxGCMask && typ.gc[1] != 0 { + // write barriers have not been updated to deal with this case yet. + gothrow("maxGCMask too small for now") + } + ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0]))) + // Check whether the program is already unrolled + // by checking if the unroll flag byte is set + maskword := uintptr(atomicloadp(unsafe.Pointer(ptrmask))) + if *(*uint8)(unsafe.Pointer(&maskword)) == 0 { + systemstack(func() { + unrollgcprog_m(typ) + }) + } + ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte + } else { + ptrmask = (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask + } + return (*[1 << 30]byte)(unsafe.Pointer(ptrmask))[:(nptr+1)/2] +} + // implementation of new builtin func newobject(typ *_type) unsafe.Pointer { flags := uint32(0) @@ -429,7 +472,21 @@ func gogc(force int32) { mp = acquirem() mp.gcing = 1 releasem(mp) + systemstack(stoptheworld) + systemstack(finishsweep_m) // finish sweep before we start concurrent scan. + if true { // To turn on concurrent scan and mark set to true... + systemstack(starttheworld) + // Do a concurrent heap scan before we stop the world. + systemstack(gcscan_m) + systemstack(stoptheworld) + systemstack(gcinstallmarkwb_m) + systemstack(starttheworld) + systemstack(gcmark_m) + systemstack(stoptheworld) + systemstack(gcinstalloffwb_m) + } + if mp != acquirem() { gothrow("gogc: rescheduled") } @@ -445,17 +502,21 @@ func gogc(force int32) { if debug.gctrace > 1 { n = 2 } + eagersweep := force >= 2 for i := 0; i < n; i++ { if i > 0 { startTime = nanotime() } // switch to g0, call gc, then switch back - eagersweep := force >= 2 systemstack(func() { gc_m(startTime, eagersweep) }) } + systemstack(func() { + gccheckmark_m(startTime, eagersweep) + }) + // all done mp.gcing = 0 semrelease(&worldsema) @@ -470,6 +531,14 @@ func gogc(force int32) { } } +func GCcheckmarkenable() { + systemstack(gccheckmarkenable_m) +} + +func GCcheckmarkdisable() { + systemstack(gccheckmarkdisable_m) +} + // GC runs a garbage collection. func GC() { gogc(2) diff --git a/src/runtime/malloc2.go b/src/runtime/malloc2.go index c175c2aec..a9d40de30 100644 --- a/src/runtime/malloc2.go +++ b/src/runtime/malloc2.go @@ -139,10 +139,35 @@ const ( ) // A generic linked list of blocks. (Typically the block is bigger than sizeof(MLink).) +// Since assignments to mlink.next will result in a write barrier being preformed +// this can not be used by some of the internal GC structures. For example when +// the sweeper is placing an unmarked object on the free list it does not want the +// write barrier to be called since that could result in the object being reachable. type mlink struct { next *mlink } +// A gclink is a node in a linked list of blocks, like mlink, +// but it is opaque to the garbage collector. 
+// The GC does not trace the pointers during collection, +// and the compiler does not emit write barriers for assignments +// of gclinkptr values. Code should store references to gclinks +// as gclinkptr, not as *gclink. +type gclink struct { + next gclinkptr +} + +// A gclinkptr is a pointer to a gclink, but it is opaque +// to the garbage collector. +type gclinkptr uintptr + +// ptr returns the *gclink form of p. +// The result should be used for accessing fields, not stored +// in other data structures. +func (p gclinkptr) ptr() *gclink { + return (*gclink)(unsafe.Pointer(p)) +} + // sysAlloc obtains a large chunk of zeroed memory from the // operating system, typically on the order of a hundred kilobytes // or a megabyte. @@ -275,8 +300,8 @@ type mcachelist struct { } type stackfreelist struct { - list *mlink // linked list of free stacks - size uintptr // total size of stacks in list + list gclinkptr // linked list of free stacks + size uintptr // total size of stacks in list } // Per-thread (in Go, per-P) cache for small objects. @@ -299,8 +324,6 @@ type mcache struct { sudogcache *sudog - gcworkbuf unsafe.Pointer - // Local allocator stats, flushed during GC. local_nlookup uintptr // number of pointer lookups local_largefree uintptr // bytes freed for large objects (>maxsmallsize) @@ -348,11 +371,11 @@ const ( ) type mspan struct { - next *mspan // in a span linked list - prev *mspan // in a span linked list - start pageID // starting page number - npages uintptr // number of pages in span - freelist *mlink // list of free objects + next *mspan // in a span linked list + prev *mspan // in a span linked list + start pageID // starting page number + npages uintptr // number of pages in span + freelist gclinkptr // list of free objects // sweep generation: // if sweepgen == h->sweepgen - 2, the span needs sweeping // if sweepgen == h->sweepgen - 1, the span is currently being swept diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index d3afef6be..f8389c5cb 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -38,7 +38,12 @@ func freemcache(c *mcache) { systemstack(func() { mCache_ReleaseAll(c) stackcache_clear(c) - gcworkbuffree(c.gcworkbuf) + + // NOTE(rsc,rlh): If gcworkbuffree comes back, we need to coordinate + // with the stealing of gcworkbufs during garbage collection to avoid + // a race where the workbuf is double-freed. + // gcworkbuffree(c.gcworkbuf) + lock(&mheap_.lock) purgecachedstats(c) fixAlloc_Free(&mheap_.cachealloc, unsafe.Pointer(c)) @@ -54,7 +59,7 @@ func mCache_Refill(c *mcache, sizeclass int32) *mspan { _g_.m.locks++ // Return the current cached span to the central lists. 
s := c.alloc[sizeclass] - if s.freelist != nil { + if s.freelist.ptr() != nil { gothrow("refill on a nonempty span") } if s != &emptymspan { @@ -66,7 +71,7 @@ func mCache_Refill(c *mcache, sizeclass int32) *mspan { if s == nil { gothrow("out of memory") } - if s.freelist == nil { + if s.freelist.ptr() == nil { println(s.ref, (s.npages<<_PageShift)/s.elemsize) gothrow("empty span") } diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go index 0d172a08b..ae5c6f1d5 100644 --- a/src/runtime/mcentral.go +++ b/src/runtime/mcentral.go @@ -55,7 +55,7 @@ retry: mSpanList_InsertBack(&c.empty, s) unlock(&c.lock) mSpan_Sweep(s, true) - if s.freelist != nil { + if s.freelist.ptr() != nil { goto havespan } lock(&c.lock) @@ -90,7 +90,7 @@ havespan: if n == 0 { gothrow("empty span") } - if s.freelist == nil { + if s.freelist.ptr() == nil { gothrow("freelist empty") } s.incache = true @@ -122,14 +122,14 @@ func mCentral_UncacheSpan(c *mcentral, s *mspan) { // the latest generation. // If preserve=true, don't return the span to heap nor relink in MCentral lists; // caller takes care of it. -func mCentral_FreeSpan(c *mcentral, s *mspan, n int32, start *mlink, end *mlink, preserve bool) bool { +func mCentral_FreeSpan(c *mcentral, s *mspan, n int32, start gclinkptr, end gclinkptr, preserve bool) bool { if s.incache { gothrow("freespan into cached span") } // Add the objects back to s's free list. - wasempty := s.freelist == nil - end.next = s.freelist + wasempty := s.freelist.ptr() == nil + end.ptr().next = s.freelist s.freelist = start s.ref -= uint16(n) @@ -165,7 +165,7 @@ func mCentral_FreeSpan(c *mcentral, s *mspan, n int32, start *mlink, end *mlink, // s is completely freed, return it to the heap. mSpanList_Remove(s) s.needzero = 1 - s.freelist = nil + s.freelist = 0 unlock(&c.lock) unmarkspan(uintptr(s.start)<<_PageShift, s.npages<<_PageShift) mHeap_Free(&mheap_, s, 0) @@ -183,17 +183,21 @@ func mCentral_Grow(c *mcentral) *mspan { return nil } - // Carve span into sequence of blocks. - tailp := &s.freelist p := uintptr(s.start << _PageShift) s.limit = p + size*n - for i := uintptr(0); i < n; i++ { - v := (*mlink)(unsafe.Pointer(p)) - *tailp = v - tailp = &v.next + head := gclinkptr(p) + tail := gclinkptr(p) + // i==0 iteration already done + for i := uintptr(1); i < n; i++ { p += size + tail.ptr().next = gclinkptr(p) + tail = gclinkptr(p) } - *tailp = nil + if s.freelist.ptr() != nil { + gothrow("freelist not empty") + } + tail.ptr().next = 0 + s.freelist = head markspan(unsafe.Pointer(uintptr(s.start)<<_PageShift), size, n, size*n < s.npages<<_PageShift) return s } diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go index f44d7ddbc..a13de0488 100644 --- a/src/runtime/mgc.go +++ b/src/runtime/mgc.go @@ -7,22 +7,72 @@ // Garbage collector (GC). // -// GC is: -// - mark&sweep -// - mostly precise (with the exception of some C-allocated objects, assembly frames/arguments, etc) -// - parallel (up to MaxGcproc threads) -// - partially concurrent (mark is stop-the-world, while sweep is concurrent) -// - non-moving/non-compacting -// - full (non-partial) +// The GC runs concurrently with mutator threads, is type accurate (aka precise), allows multiple GC +// thread to run in parallel. It is a concurrent mark and sweep that uses a write barrier. It is +// non-generational and non-compacting. Allocation is done using size segregated per P allocation +// areas to minimize fragmentation while eliminating locks in the common case. // -// GC rate. 
-// Next GC is after we've allocated an extra amount of memory proportional to -// the amount already in use. The proportion is controlled by GOGC environment variable -// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M -// (this mark is tracked in next_gc variable). This keeps the GC cost in linear -// proportion to the allocation cost. Adjusting GOGC just changes the linear constant -// (and also the amount of extra memory used). +// The algorithm decomposes into several steps. +// This is a high level description of the algorithm being used. For an overview of GC a good +// place to start is Richard Jones' gchandbook.org. +// +// The algorithm's intellectual heritage includes Dijkstra's on-the-fly algorithm, see +// Edsger W. Dijkstra, Leslie Lamport, A. J. Martin, C. S. Scholten, and E. F. M. Steffens. 1978. +// On-the-fly garbage collection: an exercise in cooperation. Commun. ACM 21, 11 (November 1978), 966-975. +// For journal quality proofs that these steps are complete, correct, and terminate see +// Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world. +// Concurrency and Computation: Practice and Experience 15(3-5), 2003. // +// 0. Set phase = GCscan from GCoff. +// 1. Wait for all P's to acknowledge phase change. +// At this point all goroutines have passed through a GC safepoint and +// know we are in the GCscan phase. +// 2. GC scans all goroutine stacks, mark and enqueues all encountered pointers +// (marking avoids most duplicate enqueuing but races may produce duplication which is benign). +// Preempted goroutines are scanned before P schedules next goroutine. +// 3. Set phase = GCmark. +// 4. Wait for all P's to acknowledge phase change. +// 5. Now write barrier marks and enqueues black, grey, or white to white pointers. +// Malloc still allocates white (non-marked) objects. +// 6. Meanwhile GC transitively walks the heap marking reachable objects. +// 7. When GC finishes marking heap, it preempts P's one-by-one and +// retakes partial wbufs (filled by write barrier or during a stack scan of the goroutine +// currently scheduled on the P). +// 8. Once the GC has exhausted all available marking work it sets phase = marktermination. +// 9. Wait for all P's to acknowledge phase change. +// 10. Malloc now allocates black objects, so number of unmarked reachable objects +// monotonically decreases. +// 11. GC preempts P's one-by-one taking partial wbufs and marks all unmarked yet reachable objects. +// 12. When GC completes a full cycle over P's and discovers no new grey +// objects, (which means all reachable objects are marked) set phase = GCsweep. +// 13. Wait for all P's to acknowledge phase change. +// 14. Now malloc allocates white (but sweeps spans before use). +// Write barrier becomes nop. +// 15. GC does background sweeping, see description below. +// 16. When sweeping is complete set phase to GCoff. +// 17. When sufficient allocation has taken place replay the sequence starting at 0 above, +// see discussion of GC rate below. + +// Changing phases. +// Phases are changed by setting the gcphase to the next phase and possibly calling ackgcphase. +// All phase action must be benign in the presence of a change. +// Starting with GCoff +// GCoff to GCscan +// GSscan scans stacks and globals greying them and never marks an object black. +// Once all the P's are aware of the new phase they will scan gs on preemption. 
+// This means that the scanning of preempted gs can't start until all the Ps +// have acknowledged. +// GCscan to GCmark +// GCMark turns on the write barrier which also only greys objects. No scanning +// of objects (making them black) can happen until all the Ps have acknowledged +// the phase change. +// GCmark to GCmarktermination +// The only change here is that we start allocating black so the Ps must acknowledge +// the change before we begin the termination algorithm +// GCmarktermination to GSsweep +// Object currently on the freelist must be marked black for this to work. +// Are things on the free lists black or white? How does the sweep phase work? + // Concurrent sweep. // The sweep phase proceeds concurrently with normal program execution. // The heap is swept span-by-span both lazily (when a goroutine needs another span) @@ -53,6 +103,14 @@ // The finalizer goroutine is kicked off only when all spans are swept. // When the next GC starts, it sweeps all not-yet-swept spans (if any). +// GC rate. +// Next GC is after we've allocated an extra amount of memory proportional to +// the amount already in use. The proportion is controlled by GOGC environment variable +// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M +// (this mark is tracked in next_gc variable). This keeps the GC cost in linear +// proportion to the allocation cost. Adjusting GOGC just changes the linear constant +// (and also the amount of extra memory used). + package runtime import "unsafe" @@ -75,7 +133,7 @@ const ( // ptrmask for an allocation containing a single pointer. var oneptr = [...]uint8{bitsPointer} -// Initialized from $GOGC. GOGC=off means no gc. +// Initialized from $GOGC. GOGC=off means no GC. var gcpercent int32 // Holding worldsema grants an M the right to try to stop the world. @@ -93,6 +151,17 @@ var gcpercent int32 // var worldsema uint32 = 1 +// It is a bug if bits does not have bitBoundary set but +// there are still some cases where this happens related +// to stack spans. +type markbits struct { + bitp *byte // pointer to the byte holding xbits + shift uintptr // bits xbits needs to be shifted to get bits + xbits byte // byte holding all the bits from *bitp + bits byte // mark and boundary bits relevant to corresponding slot. + tbits byte // pointer||scalar bits relevant to corresponding slot. +} + type workbuf struct { node lfnode // must be first nobj uintptr @@ -121,6 +190,7 @@ var nbadblock int32 type workdata struct { full uint64 // lock-free list of full blocks empty uint64 // lock-free list of empty blocks + partial uint64 // lock-free list of partially filled blocks pad0 [_CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait nproc uint32 tstart int64 @@ -143,293 +213,405 @@ func have_cgo_allocate() bool { return &weak_cgo_allocate != nil } -// scanblock scans a block of n bytes starting at pointer b for references -// to other objects, scanning any it finds recursively until there are no -// unscanned objects left. Instead of using an explicit recursion, it keeps -// a work list in the Workbuf* structures and loops in the main function -// body. Keeping an explicit work list is easier on the stack allocator and -// more efficient. -func scanblock(b, n uintptr, ptrmask *uint8) { - // Cache memory arena parameters in local vars. 
- arena_start := mheap_.arena_start - arena_used := mheap_.arena_used - - wbuf := getempty(nil) - nobj := wbuf.nobj - wp := &wbuf.obj[nobj] - keepworking := b == 0 +// To help debug the concurrent GC we remark with the world +// stopped ensuring that any object encountered has their normal +// mark bit set. To do this we use an orthogonal bit +// pattern to indicate the object is marked. The following pattern +// uses the upper two bits in the object's bounday nibble. +// 01: scalar not marked +// 10: pointer not marked +// 11: pointer marked +// 00: scalar marked +// Xoring with 01 will flip the pattern from marked to unmarked and vica versa. +// The higher bit is 1 for pointers and 0 for scalars, whether the object +// is marked or not. +// The first nibble no longer holds the bitsDead pattern indicating that the +// there are no more pointers in the object. This information is held +// in the second nibble. + +// When marking an object if the bool checkmark is true one uses the above +// encoding, otherwise one uses the bitMarked bit in the lower two bits +// of the nibble. +var ( + checkmark = false + gccheckmarkenable = true +) - var ptrbitp unsafe.Pointer +// Is address b in the known heap. If it doesn't have a valid gcmap +// returns false. For example pointers into stacks will return false. +func inheap(b uintptr) bool { + if b == 0 || b < mheap_.arena_start || b >= mheap_.arena_used { + return false + } + // Not a beginning of a block, consult span table to find the block beginning. + k := b >> _PageShift + x := k + x -= mheap_.arena_start >> _PageShift + s := h_spans[x] + if s == nil || pageID(k) < s.start || b >= s.limit || s.state != mSpanInUse { + return false + } + return true +} - // ptrmask can have 2 possible values: - // 1. nil - obtain pointer mask from GC bitmap. - // 2. pointer to a compact mask (for stacks and data). - goto_scanobj := b != 0 +// Given an address in the heap return the relevant byte from the gcmap. This routine +// can be used on addresses to the start of an object or to the interior of the an object. +func slottombits(obj uintptr, mbits *markbits) { + off := (obj&^(ptrSize-1) - mheap_.arena_start) / ptrSize + mbits.bitp = (*byte)(unsafe.Pointer(mheap_.arena_start - off/wordsPerBitmapByte - 1)) + mbits.shift = off % wordsPerBitmapByte * gcBits + mbits.xbits = *mbits.bitp + mbits.bits = (mbits.xbits >> mbits.shift) & bitMask + mbits.tbits = ((mbits.xbits >> mbits.shift) & bitPtrMask) >> 2 +} +// b is a pointer into the heap. +// Find the start of the object refered to by b. +// Set mbits to the associated bits from the bit map. +// If b is not a valid heap object return nil and +// undefined values in mbits. +func objectstart(b uintptr, mbits *markbits) uintptr { + obj := b &^ (ptrSize - 1) for { - if goto_scanobj { - goto_scanobj = false - } else { - if nobj == 0 { - // Out of work in workbuf. - if !keepworking { - putempty(wbuf) - return - } + slottombits(obj, mbits) + if mbits.bits&bitBoundary == bitBoundary { + break + } - // Refill workbuf from global queue. - wbuf = getfull(wbuf) - if wbuf == nil { - return - } - nobj = wbuf.nobj - if nobj < uintptr(len(wbuf.obj)) { - wp = &wbuf.obj[nobj] - } else { - wp = nil - } + // Not a beginning of a block, consult span table to find the block beginning. + k := b >> _PageShift + x := k + x -= mheap_.arena_start >> _PageShift + s := h_spans[x] + if s == nil || pageID(k) < s.start || b >= s.limit || s.state != mSpanInUse { + if s != nil && s.state == _MSpanStack { + return 0 // This is legit. 
} - // If another proc wants a pointer, give it some. - if work.nwait > 0 && nobj > 4 && work.full == 0 { - wbuf.nobj = nobj - wbuf = handoff(wbuf) - nobj = wbuf.nobj - if nobj < uintptr(len(wbuf.obj)) { - wp = &wbuf.obj[nobj] + // The following ensures that we are rigorous about what data + // structures hold valid pointers + if false { + // Still happens sometimes. We don't know why. + printlock() + print("runtime:objectstart Span weird: obj=", hex(obj), " k=", hex(k)) + if s == nil { + print(" s=nil\n") } else { - wp = nil + print(" s.start=", hex(s.start<<_PageShift), " s.limit=", hex(s.limit), " s.state=", s.state, "\n") } + printunlock() + gothrow("objectstart: bad pointer in unexpected span") } - - nobj-- - wp = &wbuf.obj[nobj] - b = *wp - n = arena_used - uintptr(b) - ptrmask = nil // use GC bitmap for pointer info + return 0 } - if _DebugGCPtrs { - print("scanblock ", b, " +", hex(n), " ", ptrmask, "\n") + p := uintptr(s.start) << _PageShift + if s.sizeclass != 0 { + size := s.elemsize + idx := (obj - p) / size + p = p + idx*size } - - // Find bits of the beginning of the object. - if ptrmask == nil { - off := (uintptr(b) - arena_start) / ptrSize - ptrbitp = unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1) + if p == obj { + print("runtime: failed to find block beginning for ", hex(p), " s=", hex(s.start*_PageSize), " s.limit=", hex(s.limit), "\n") + gothrow("failed to find block beginning") } + obj = p + } - var i uintptr - for i = 0; i < n; i += ptrSize { - // Find bits for this word. - var bits uintptr - if ptrmask == nil { - // Check if we have reached end of span. - if (uintptr(b)+i)%_PageSize == 0 && - h_spans[(uintptr(b)-arena_start)>>_PageShift] != h_spans[(uintptr(b)+i-arena_start)>>_PageShift] { - break - } + // if size(obj.firstfield) < PtrSize, the &obj.secondfield could map to the boundary bit + // Clear any low bits to get to the start of the object. + // greyobject depends on this. + return obj +} - // Consult GC bitmap. - bits = uintptr(*(*byte)(ptrbitp)) +// Slow for now as we serialize this, since this is on a debug path +// speed is not critical at this point. +var andlock mutex - if wordsPerBitmapByte != 2 { - gothrow("alg doesn't work for wordsPerBitmapByte != 2") - } - j := (uintptr(b) + i) / ptrSize & 1 - ptrbitp = add(ptrbitp, -j) - bits >>= gcBits * j +func atomicand8(src *byte, val byte) { + lock(&andlock) + *src &= val + unlock(&andlock) +} - if bits&bitBoundary != 0 && i != 0 { - break // reached beginning of the next object - } - bits = (bits >> 2) & bitsMask - if bits == bitsDead { - break // reached no-scan part of the object - } - } else { - // dense mask (stack or data) - bits = (uintptr(*(*byte)(add(unsafe.Pointer(ptrmask), (i/ptrSize)/4))) >> (((i / ptrSize) % 4) * bitsPerPointer)) & bitsMask - } +// Mark using the checkmark scheme. 
+func docheckmark(mbits *markbits) { + // xor 01 moves 01(scalar unmarked) to 00(scalar marked) + // and 10(pointer unmarked) to 11(pointer marked) + if mbits.tbits == _BitsScalar { + atomicand8(mbits.bitp, ^byte(_BitsCheckMarkXor<<mbits.shift<<2)) + } else if mbits.tbits == _BitsPointer { + atomicor8(mbits.bitp, byte(_BitsCheckMarkXor<<mbits.shift<<2)) + } - if bits <= _BitsScalar { // BitsScalar || BitsDead - continue - } + // reload bits for ischeckmarked + mbits.xbits = *mbits.bitp + mbits.bits = (mbits.xbits >> mbits.shift) & bitMask + mbits.tbits = ((mbits.xbits >> mbits.shift) & bitPtrMask) >> 2 +} - if bits != _BitsPointer { - gothrow("unexpected garbage collection bits") - } +// In the default scheme does mbits refer to a marked object. +func ismarked(mbits *markbits) bool { + if mbits.bits&bitBoundary != bitBoundary { + gothrow("ismarked: bits should have boundary bit set") + } + return mbits.bits&bitMarked == bitMarked +} - obj := *(*uintptr)(unsafe.Pointer(b + i)) - obj0 := obj +// In the checkmark scheme does mbits refer to a marked object. +func ischeckmarked(mbits *markbits) bool { + if mbits.bits&bitBoundary != bitBoundary { + gothrow("ischeckmarked: bits should have boundary bit set") + } + return mbits.tbits == _BitsScalarMarked || mbits.tbits == _BitsPointerMarked +} - markobj: - var s *mspan - var off, bitp, shift, xbits uintptr +// When in GCmarkterminate phase we allocate black. +func gcmarknewobject_m(obj uintptr) { + if gcphase != _GCmarktermination { + gothrow("marking new object while not in mark termination phase") + } + if checkmark { // The world should be stopped so this should not happen. + gothrow("gcmarknewobject called while doing checkmark") + } - // At this point we have extracted the next potential pointer. - // Check if it points into heap. - if obj == 0 { - continue - } - if obj < arena_start || arena_used <= obj { - if uintptr(obj) < _PhysPageSize && invalidptr != 0 { - s = nil - goto badobj - } - continue - } + var mbits markbits + slottombits(obj, &mbits) + if mbits.bits&bitMarked != 0 { + return + } - // Mark the object. - obj &^= ptrSize - 1 - off = (obj - arena_start) / ptrSize - bitp = arena_start - off/wordsPerBitmapByte - 1 - shift = (off % wordsPerBitmapByte) * gcBits - xbits = uintptr(*(*byte)(unsafe.Pointer(bitp))) - bits = (xbits >> shift) & bitMask - if (bits & bitBoundary) == 0 { - // Not a beginning of a block, consult span table to find the block beginning. - k := pageID(obj >> _PageShift) - x := k - x -= pageID(arena_start >> _PageShift) - s = h_spans[x] - if s == nil || k < s.start || s.limit <= obj || s.state != mSpanInUse { - // Stack pointers lie within the arena bounds but are not part of the GC heap. - // Ignore them. - if s != nil && s.state == _MSpanStack { - continue - } - goto badobj - } - p := uintptr(s.start) << _PageShift - if s.sizeclass != 0 { - size := s.elemsize - idx := (obj - p) / size - p = p + idx*size - } - if p == obj { - print("runtime: failed to find block beginning for ", hex(p), " s=", hex(s.start*_PageSize), " s.limit=", hex(s.limit), "\n") - gothrow("failed to find block beginning") + // Each byte of GC bitmap holds info for two words. + // If the current object is larger than two words, or if the object is one word + // but the object it shares the byte with is already marked, + // then all the possible concurrent updates are trying to set the same bit, + // so we can use a non-atomic update. 
+ if mbits.xbits&(bitMask|(bitMask<<gcBits)) != bitBoundary|bitBoundary<<gcBits || work.nproc == 1 { + *mbits.bitp = mbits.xbits | bitMarked<<mbits.shift + } else { + atomicor8(mbits.bitp, bitMarked<<mbits.shift) + } +} + +// obj is the start of an object with mark mbits. +// If it isn't already marked, mark it and enqueue into workbuf. +// Return possibly new workbuf to use. +func greyobject(obj uintptr, mbits *markbits, wbuf *workbuf) *workbuf { + // obj should be start of allocation, and so must be at least pointer-aligned. + if obj&(ptrSize-1) != 0 { + gothrow("greyobject: obj not pointer-aligned") + } + + if checkmark { + if !ismarked(mbits) { + print("runtime:greyobject: checkmarks finds unexpected unmarked object obj=", hex(obj), ", mbits->bits=", hex(mbits.bits), " *mbits->bitp=", hex(*mbits.bitp), "\n") + + k := obj >> _PageShift + x := k + x -= mheap_.arena_start >> _PageShift + s := h_spans[x] + printlock() + print("runtime:greyobject Span: obj=", hex(obj), " k=", hex(k)) + if s == nil { + print(" s=nil\n") + } else { + print(" s.start=", hex(s.start*_PageSize), " s.limit=", hex(s.limit), " s.sizeclass=", s.sizeclass, " s.elemsize=", s.elemsize, "\n") + // NOTE(rsc): This code is using s.sizeclass as an approximation of the + // number of pointer-sized words in an object. Perhaps not what was intended. + for i := 0; i < int(s.sizeclass); i++ { + print(" *(obj+", i*ptrSize, ") = ", hex(*(*uintptr)(unsafe.Pointer(obj + uintptr(i)*ptrSize))), "\n") } - obj = p - goto markobj } + gothrow("checkmark found unmarked object") + } + if ischeckmarked(mbits) { + return wbuf + } + docheckmark(mbits) + if !ischeckmarked(mbits) { + print("mbits xbits=", hex(mbits.xbits), " bits=", hex(mbits.bits), " tbits=", hex(mbits.tbits), " shift=", mbits.shift, "\n") + gothrow("docheckmark and ischeckmarked disagree") + } + } else { + // If marked we have nothing to do. + if mbits.bits&bitMarked != 0 { + return wbuf + } - if _DebugGCPtrs { - print("scan *", hex(b+i), " = ", hex(obj0), " => base ", hex(obj), "\n") - } + // Each byte of GC bitmap holds info for two words. + // If the current object is larger than two words, or if the object is one word + // but the object it shares the byte with is already marked, + // then all the possible concurrent updates are trying to set the same bit, + // so we can use a non-atomic update. + if mbits.xbits&(bitMask|bitMask<<gcBits) != bitBoundary|bitBoundary<<gcBits || work.nproc == 1 { + *mbits.bitp = mbits.xbits | bitMarked<<mbits.shift + } else { + atomicor8(mbits.bitp, bitMarked<<mbits.shift) + } + } - if nbadblock > 0 && obj == badblock[nbadblock-1] { - // Running garbage collection again because - // we want to find the path from a root to a bad pointer. - // Found possible next step; extend or finish path. - for j := int32(0); j < nbadblock; j++ { - if badblock[j] == b { - goto AlreadyBad - } - } - print("runtime: found *(", hex(b), "+", hex(i), ") = ", hex(obj0), "+", hex(obj-obj0), "\n") - if ptrmask != nil { - gothrow("bad pointer") - } - if nbadblock >= int32(len(badblock)) { - gothrow("badblock trace too long") - } - badblock[nbadblock] = uintptr(b) - nbadblock++ - AlreadyBad: + if !checkmark && (mbits.xbits>>(mbits.shift+2))&_BitsMask == _BitsDead { + return wbuf // noscan object + } + + // Queue the obj for scanning. The PREFETCH(obj) logic has been removed but + // seems like a nice optimization that can be added back in. + // There needs to be time between the PREFETCH and the use. 
+ // Previously we put the obj in an 8 element buffer that is drained at a rate + // to give the PREFETCH time to do its work. + // Use of PREFETCHNTA might be more appropriate than PREFETCH + + // If workbuf is full, obtain an empty one. + if wbuf.nobj >= uintptr(len(wbuf.obj)) { + wbuf = getempty(wbuf) + } + + wbuf.obj[wbuf.nobj] = obj + wbuf.nobj++ + return wbuf +} + +// Scan the object b of size n, adding pointers to wbuf. +// Return possibly new wbuf to use. +// If ptrmask != nil, it specifies where pointers are in b. +// If ptrmask == nil, the GC bitmap should be consulted. +// In this case, n may be an overestimate of the size; the GC bitmap +// must also be used to make sure the scan stops at the end of b. +func scanobject(b, n uintptr, ptrmask *uint8, wbuf *workbuf) *workbuf { + arena_start := mheap_.arena_start + arena_used := mheap_.arena_used + + // Find bits of the beginning of the object. + var ptrbitp unsafe.Pointer + var mbits markbits + if ptrmask == nil { + b = objectstart(b, &mbits) + if b == 0 { + return wbuf + } + ptrbitp = unsafe.Pointer(mbits.bitp) + } + for i := uintptr(0); i < n; i += ptrSize { + // Find bits for this word. + var bits uintptr + if ptrmask != nil { + // dense mask (stack or data) + bits = (uintptr(*(*byte)(add(unsafe.Pointer(ptrmask), (i/ptrSize)/4))) >> (((i / ptrSize) % 4) * bitsPerPointer)) & bitsMask + } else { + // Check if we have reached end of span. + // n is an overestimate of the size of the object. + if (b+i)%_PageSize == 0 && h_spans[(b-arena_start)>>_PageShift] != h_spans[(b+i-arena_start)>>_PageShift] { + break } - // Now we have bits, bitp, and shift correct for - // obj pointing at the base of the object. - // Only care about not marked objects. - if bits&bitMarked != 0 { - continue + // Consult GC bitmap. + bits = uintptr(*(*byte)(ptrbitp)) + if wordsPerBitmapByte != 2 { + gothrow("alg doesn't work for wordsPerBitmapByte != 2") } + j := (uintptr(b) + i) / ptrSize & 1 // j indicates upper nibble or lower nibble + bits >>= gcBits * j + if i == 0 { + bits &^= bitBoundary + } + ptrbitp = add(ptrbitp, -j) - // If obj size is greater than 8, then each byte of GC bitmap - // contains info for at most one object. In such case we use - // non-atomic byte store to mark the object. This can lead - // to double enqueue of the object for scanning, but scanning - // is an idempotent operation, so it is OK. This cannot lead - // to bitmap corruption because the single marked bit is the - // only thing that can change in the byte. - // For 8-byte objects we use non-atomic store, if the other - // quadruple is already marked. Otherwise we resort to CAS - // loop for marking. - if xbits&(bitMask|bitMask<<gcBits) != bitBoundary|bitBoundary<<gcBits || work.nproc == 1 { - *(*byte)(unsafe.Pointer(bitp)) = uint8(xbits | bitMarked<<shift) - } else { - atomicor8((*byte)(unsafe.Pointer(bitp)), bitMarked<<shift) + if bits&bitBoundary != 0 && i != 0 { + break // reached beginning of the next object } + bits = (bits & bitPtrMask) >> 2 // bits refer to the type bits. - if (xbits>>(shift+2))&bitsMask == bitsDead { - continue // noscan object + if i != 0 && bits == bitsDead { // BitsDead in first nibble not valid during checkmark + break // reached no-scan part of the object } + } - // Queue the obj for scanning. - // TODO: PREFETCH here. + if bits <= _BitsScalar { // _BitsScalar, _BitsDead, _BitsScalarMarked + continue + } - // If workbuf is full, obtain an empty one. 
- if nobj >= uintptr(len(wbuf.obj)) { - wbuf.nobj = nobj - wbuf = getempty(wbuf) - nobj = wbuf.nobj - wp = &wbuf.obj[nobj] - } - *wp = obj - nobj++ - if nobj < uintptr(len(wbuf.obj)) { - wp = &wbuf.obj[nobj] - } else { - wp = nil - } + if bits&_BitsPointer != _BitsPointer { + print("gc checkmark=", checkmark, " b=", hex(b), " ptrmask=", ptrmask, " mbits.bitp=", mbits.bitp, " mbits.xbits=", hex(mbits.xbits), " bits=", hex(bits), "\n") + gothrow("unexpected garbage collection bits") + } + + obj := *(*uintptr)(unsafe.Pointer(b + i)) + + // At this point we have extracted the next potential pointer. + // Check if it points into heap. + if obj == 0 || obj < arena_start || obj >= arena_used { + continue + } + + // Mark the object. return some important bits. + // We we combine the following two rotines we don't have to pass mbits or obj around. + var mbits markbits + obj = objectstart(obj, &mbits) + if obj == 0 { continue + } + wbuf = greyobject(obj, &mbits, wbuf) + } + return wbuf +} - badobj: - // If cgo_allocate is linked into the binary, it can allocate - // memory as []unsafe.Pointer that may not contain actual - // pointers and must be scanned conservatively. - // In this case alone, allow the bad pointer. - if have_cgo_allocate() && ptrmask == nil { - continue +// scanblock starts by scanning b as scanobject would. +// If the gcphase is GCscan, that's all scanblock does. +// Otherwise it traverses some fraction of the pointers it found in b, recursively. +// As a special case, scanblock(nil, 0, nil) means to scan previously queued work, +// stopping only when no work is left in the system. +func scanblock(b, n uintptr, ptrmask *uint8) { + wbuf := getpartialorempty() + if b != 0 { + wbuf = scanobject(b, n, ptrmask, wbuf) + if gcphase == _GCscan { + if inheap(b) && ptrmask == nil { + // b is in heap, we are in GCscan so there should be a ptrmask. + gothrow("scanblock: In GCscan phase and inheap is true.") } + // GCscan only goes one level deep since mark wb not turned on. + putpartial(wbuf) + return + } + } + if gcphase == _GCscan { + gothrow("scanblock: In GCscan phase but no b passed in.") + } - // Anything else indicates a bug somewhere. - // If we're in the middle of chasing down a different bad pointer, - // don't confuse the trace by printing about this one. - if nbadblock > 0 { - continue - } + keepworking := b == 0 - print("runtime: garbage collector found invalid heap pointer *(", hex(b), "+", hex(i), ")=", hex(obj)) - if s == nil { - print(" s=nil\n") - } else { - print(" span=", uintptr(s.start)<<_PageShift, "-", s.limit, "-", (uintptr(s.start)+s.npages)<<_PageShift, " state=", s.state, "\n") + // ptrmask can have 2 possible values: + // 1. nil - obtain pointer mask from GC bitmap. + // 2. pointer to a compact mask (for stacks and data). + for { + if wbuf.nobj == 0 { + if !keepworking { + putempty(wbuf) + return } - if ptrmask != nil { - gothrow("invalid heap pointer") + // Refill workbuf from global queue. + wbuf = getfull(wbuf) + if wbuf == nil { // nil means out of work barrier reached + return } - // Add to badblock list, which will cause the garbage collection - // to keep repeating until it has traced the chain of pointers - // leading to obj all the way back to a root. 
- if nbadblock == 0 { - badblock[nbadblock] = uintptr(b) - nbadblock++ + + if wbuf.nobj <= 0 { + gothrow("runtime:scanblock getfull returns empty buffer") } } - if _DebugGCPtrs { - print("end scanblock ", hex(b), " +", hex(n), " ", ptrmask, "\n") - } - if _DebugGC > 0 && ptrmask == nil { - // For heap objects ensure that we did not overscan. - var p, n uintptr - if mlookup(b, &p, &n, nil) == 0 || b != p || i > n { - print("runtime: scanned (", hex(b), "+", hex(i), "), heap object (", hex(p), "+", hex(n), ")\n") - gothrow("scanblock: scanned invalid object") - } + + // If another proc wants a pointer, give it some. + if work.nwait > 0 && wbuf.nobj > 4 && work.full == 0 { + wbuf = handoff(wbuf) } + + // This might be a good place to add prefetch code... + // if(wbuf->nobj > 4) { + // PREFETCH(wbuf->obj[wbuf->nobj - 3]; + // } + wbuf.nobj-- + b = wbuf.obj[wbuf.nobj] + wbuf = scanobject(b, mheap_.arena_used-b, nil, wbuf) } } @@ -455,7 +637,8 @@ func markroot(desc *parfor, i uint32) { if s.state != mSpanInUse { continue } - if s.sweepgen != sg { + if !checkmark && s.sweepgen != sg { + // sweepgen was updated (+2) during non-checkmark GC pass print("sweep ", s.sweepgen, " ", sg, "\n") gothrow("gc: unswept span") } @@ -468,13 +651,17 @@ func markroot(desc *parfor, i uint32) { spf := (*specialfinalizer)(unsafe.Pointer(sp)) // A finalizer can be set for an inner byte of an object, find object beginning. p := uintptr(s.start<<_PageShift) + uintptr(spf.special.offset)/s.elemsize*s.elemsize - scanblock(p, s.elemsize, nil) + if gcphase != _GCscan { + scanblock(p, s.elemsize, nil) // scanned during mark phase + } scanblock(uintptr(unsafe.Pointer(&spf.fn)), ptrSize, &oneptr[0]) } } case _RootFlushCaches: - flushallmcaches() + if gcphase != _GCscan { // Do not flush mcaches during GCscan phase. + flushallmcaches() + } default: // the rest is scanning goroutine stacks @@ -482,21 +669,44 @@ func markroot(desc *parfor, i uint32) { gothrow("markroot: bad index") } gp := allgs[i-_RootCount] + // remember when we've first observed the G blocked // needed only to output in traceback - status := readgstatus(gp) + status := readgstatus(gp) // We are not in a scan state if (status == _Gwaiting || status == _Gsyscall) && gp.waitsince == 0 { gp.waitsince = work.tstart } - // Shrink a stack if not much of it is being used. - shrinkstack(gp) + + // Shrink a stack if not much of it is being used but not in the scan phase. + if gcphase != _GCscan { // Do not shrink during GCscan phase. + shrinkstack(gp) + } if readgstatus(gp) == _Gdead { gp.gcworkdone = true } else { gp.gcworkdone = false } restart := stopg(gp) - scanstack(gp) + + // goroutine will scan its own stack when it stops running. + // Wait until it has. + for readgstatus(gp) == _Grunning && !gp.gcworkdone { + } + + // scanstack(gp) is done as part of gcphasework + // But to make sure we finished we need to make sure that + // the stack traps have all responded so drop into + // this while loop until they respond. + for !gp.gcworkdone { + status = readgstatus(gp) + if status == _Gdead { + gp.gcworkdone = true // scan is a noop + break + } + if status == _Gwaiting || status == _Grunnable { + restart = stopg(gp) + } + } if restart { restartg(gp) } @@ -506,48 +716,83 @@ func markroot(desc *parfor, i uint32) { // Get an empty work buffer off the work.empty list, // allocating new buffers as needed. 
func getempty(b *workbuf) *workbuf { - _g_ := getg() if b != nil { - lfstackpush(&work.full, &b.node) + putfull(b) + b = nil } - b = nil - c := _g_.m.mcache - if c.gcworkbuf != nil { - b = (*workbuf)(c.gcworkbuf) - c.gcworkbuf = nil - } - if b == nil { + if work.empty != 0 { b = (*workbuf)(lfstackpop(&work.empty)) } + if b != nil && b.nobj != 0 { + _g_ := getg() + print("m", _g_.m.id, ": getempty: popped b=", b, " with non-zero b.nobj=", b.nobj, "\n") + gothrow("getempty: workbuffer not empty, b->nobj not 0") + } if b == nil { b = (*workbuf)(persistentalloc(unsafe.Sizeof(*b), _CacheLineSize, &memstats.gc_sys)) + b.nobj = 0 } - b.nobj = 0 return b } func putempty(b *workbuf) { - _g_ := getg() - c := _g_.m.mcache - if c.gcworkbuf == nil { - c.gcworkbuf = (unsafe.Pointer)(b) - return + if b.nobj != 0 { + gothrow("putempty: b->nobj not 0") } lfstackpush(&work.empty, &b.node) } -func gcworkbuffree(b unsafe.Pointer) { - if b != nil { - putempty((*workbuf)(b)) +func putfull(b *workbuf) { + if b.nobj <= 0 { + gothrow("putfull: b->nobj <= 0") + } + lfstackpush(&work.full, &b.node) +} + +// Get an partially empty work buffer +// if none are available get an empty one. +func getpartialorempty() *workbuf { + b := (*workbuf)(lfstackpop(&work.partial)) + if b == nil { + b = getempty(nil) + } + return b +} + +func putpartial(b *workbuf) { + if b.nobj == 0 { + lfstackpush(&work.empty, &b.node) + } else if b.nobj < uintptr(len(b.obj)) { + lfstackpush(&work.partial, &b.node) + } else if b.nobj == uintptr(len(b.obj)) { + lfstackpush(&work.full, &b.node) + } else { + print("b=", b, " b.nobj=", b.nobj, " len(b.obj)=", len(b.obj), "\n") + gothrow("putpartial: bad Workbuf b.nobj") } } -// Get a full work buffer off the work.full list, or return nil. +// Get a full work buffer off the work.full or a partially +// filled one off the work.partial list. If nothing is available +// wait until all the other gc helpers have finished and then +// return nil. +// getfull acts as a barrier for work.nproc helpers. As long as one +// gchelper is actively marking objects it +// may create a workbuffer that the other helpers can work on. +// The for loop either exits when a work buffer is found +// or when _all_ of the work.nproc GC helpers are in the loop +// looking for work and thus not capable of creating new work. +// This is in fact the termination condition for the STW mark +// phase. func getfull(b *workbuf) *workbuf { if b != nil { - lfstackpush(&work.empty, &b.node) + putempty(b) } + b = (*workbuf)(lfstackpop(&work.full)) + if b == nil { + b = (*workbuf)(lfstackpop(&work.partial)) + } if b != nil || work.nproc == 1 { return b } @@ -557,6 +802,9 @@ func getfull(b *workbuf) *workbuf { if work.full != 0 { xadd(&work.nwait, -1) b = (*workbuf)(lfstackpop(&work.full)) + if b == nil { + b = (*workbuf)(lfstackpop(&work.partial)) + } if b != nil { return b } @@ -675,14 +923,11 @@ func scanframe(frame *stkframe, unused unsafe.Pointer) bool { } func scanstack(gp *g) { - // TODO(rsc): Due to a precedence error, this was never checked in the original C version. - // If you enable the check, the gothrow happens. 
- /* - if readgstatus(gp)&_Gscan == 0 { - print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n") - gothrow("mark - bad status") - } - */ + + if readgstatus(gp)&_Gscan == 0 { + print("runtime:scanstack: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", hex(readgstatus(gp)), "\n") + gothrow("scanstack - bad status") + } switch readgstatus(gp) &^ _Gscan { default: @@ -692,7 +937,7 @@ func scanstack(gp *g) { return case _Grunning: print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n") - gothrow("mark - world not stopped") + gothrow("scanstack: goroutine not stopped") case _Grunnable, _Gsyscall, _Gwaiting: // ok } @@ -709,18 +954,114 @@ func scanstack(gp *g) { tracebackdefers(gp, scanframe, nil) } -// The gp has been moved to a gc safepoint. If there is gcphase specific -// work it is done here. +// If the slot is grey or black return true, if white return false. +// If the slot is not in the known heap and thus does not have a valid GC bitmap then +// it is considered grey. Globals and stacks can hold such slots. +// The slot is grey if its mark bit is set and it is enqueued to be scanned. +// The slot is black if it has already been scanned. +// It is white if it has a valid mark bit and the bit is not set. +func shaded(slot uintptr) bool { + if !inheap(slot) { // non-heap slots considered grey + return true + } + + var mbits markbits + valid := objectstart(slot, &mbits) + if valid == 0 { + return true + } + + if checkmark { + return ischeckmarked(&mbits) + } + + return mbits.bits&bitMarked != 0 +} + +// Shade the object if it isn't already. +// The object is not nil and known to be in the heap. +func shade(b uintptr) { + if !inheap(b) { + gothrow("shade: passed an address not in the heap") + } + + wbuf := getpartialorempty() + // Mark the object, return some important bits. + // If we combine the following two rotines we don't have to pass mbits or obj around. + var mbits markbits + obj := objectstart(b, &mbits) + if obj != 0 { + wbuf = greyobject(obj, &mbits, wbuf) // augments the wbuf + } + putpartial(wbuf) +} + +// This is the Dijkstra barrier coarsened to always shade the ptr (dst) object. +// The original Dijkstra barrier only shaded ptrs being placed in black slots. +// +// Shade indicates that it has seen a white pointer by adding the referent +// to wbuf as well as marking it. +// +// slot is the destination (dst) in go code +// ptr is the value that goes into the slot (src) in the go code +// +// Dijkstra pointed out that maintaining the no black to white +// pointers means that white to white pointers not need +// to be noted by the write barrier. Furthermore if either +// white object dies before it is reached by the +// GC then the object can be collected during this GC cycle +// instead of waiting for the next cycle. Unfortunately the cost of +// ensure that the object holding the slot doesn't concurrently +// change to black without the mutator noticing seems prohibitive. +// +// Consider the following example where the mutator writes into +// a slot and then loads the slot's mark bit while the GC thread +// writes to the slot's mark bit and then as part of scanning reads +// the slot. +// +// Initially both [slot] and [slotmark] are 0 (nil) +// Mutator thread GC thread +// st [slot], ptr st [slotmark], 1 +// +// ld r1, [slotmark] ld r2, [slot] +// +// This is a classic example of independent reads of independent writes, +// aka IRIW. 
The question is if r1==r2==0 is allowed and for most HW the +// answer is yes without inserting a memory barriers between the st and the ld. +// These barriers are expensive so we have decided that we will +// always grey the ptr object regardless of the slot's color. +func gcmarkwb_m(slot *uintptr, ptr uintptr) { + switch gcphase { + default: + gothrow("gcphasework in bad gcphase") + + case _GCoff, _GCquiesce, _GCstw, _GCsweep, _GCscan: + // ok + + case _GCmark, _GCmarktermination: + if ptr != 0 && inheap(ptr) { + shade(ptr) + } + } +} + +// The gp has been moved to a GC safepoint. GC phase specific +// work is done here. func gcphasework(gp *g) { switch gcphase { default: gothrow("gcphasework in bad gcphase") case _GCoff, _GCquiesce, _GCstw, _GCsweep: - // No work for now. + // No work. + case _GCscan: + // scan the stack, mark the objects, put pointers in work buffers + // hanging off the P where this is being run. + scanstack(gp) case _GCmark: - // Disabled until concurrent GC is implemented - // but indicate the scan has been done. - // scanstack(gp); + // No work. + case _GCmarktermination: + scanstack(gp) + // All available mark work will be emptied before returning. } gp.gcworkdone = true } @@ -797,6 +1138,7 @@ func iterate_finq(callback func(*funcval, unsafe.Pointer, uintptr, *_type, *ptrt } } +// Returns only when span s has been swept. func mSpan_EnsureSwept(s *mspan) { // Caller must disable preemption. // Otherwise when this function returns the span can become unswept again @@ -810,6 +1152,7 @@ func mSpan_EnsureSwept(s *mspan) { if atomicload(&s.sweepgen) == sg { return } + // The caller must be sure that the span is a MSpanInUse span. if cas(&s.sweepgen, sg-2, sg-1) { mSpan_Sweep(s, false) return @@ -826,6 +1169,10 @@ func mSpan_EnsureSwept(s *mspan) { // If preserve=true, don't return it to heap nor relink in MCentral lists; // caller takes care of it. func mSpan_Sweep(s *mspan, preserve bool) bool { + if checkmark { + gothrow("MSpan_Sweep: checkmark only runs in STW and after the sweep") + } + // It's critical that we enter this function with preemption disabled, // GC must not start while we are in the middle of this function. _g_ := getg() @@ -851,13 +1198,14 @@ func mSpan_Sweep(s *mspan, preserve bool) bool { } res := false nfree := 0 - var head mlink - end := &head + + var head, end gclinkptr + c := _g_.m.mcache sweepgenset := false // Mark any free objects in this span so we don't collect them. 
- for link := s.freelist; link != nil; link = link.next { + for link := s.freelist; link.ptr() != nil; link = link.ptr().next { off := (uintptr(unsafe.Pointer(link)) - arena_start) / ptrSize bitp := arena_start - off/wordsPerBitmapByte - 1 shift := (off % wordsPerBitmapByte) * gcBits @@ -978,8 +1326,13 @@ func mSpan_Sweep(s *mspan, preserve bool) bool { } else if size > ptrSize { *(*uintptr)(unsafe.Pointer(p + ptrSize)) = 0 } - end.next = (*mlink)(unsafe.Pointer(p)) - end = end.next + if head.ptr() == nil { + head = gclinkptr(p) + } else { + end.ptr().next = gclinkptr(p) + } + end = gclinkptr(p) + end.ptr().next = gclinkptr(0x0bade5) nfree++ } } @@ -1002,7 +1355,7 @@ func mSpan_Sweep(s *mspan, preserve bool) bool { c.local_nsmallfree[cl] += uintptr(nfree) c.local_cachealloc -= intptr(uintptr(nfree) * size) xadd64(&memstats.next_gc, -int64(nfree)*int64(size)*int64(gcpercent+100)/100) - res = mCentral_FreeSpan(&mheap_.central[cl].mcentral, s, int32(nfree), head.next, end, preserve) + res = mCentral_FreeSpan(&mheap_.central[cl].mcentral, s, int32(nfree), head, end, preserve) // MCentral_FreeSpan updates sweepgen } return res @@ -1073,11 +1426,11 @@ func gchelper() { _g_.m.traceback = 2 gchelperstart() - // parallel mark for over gc roots + // parallel mark for over GC roots parfordo(work.markfor) - - // help other threads scan secondary blocks - scanblock(0, 0, nil) + if gcphase != _GCscan { + scanblock(0, 0, nil) // blocks in getfull + } nproc := work.nproc // work.nproc can change right after we increment work.ndone if xadd(&work.ndone, +1) == nproc-1 { @@ -1204,6 +1557,7 @@ func gcinit() { gcbssmask = unrollglobgcprog((*byte)(unsafe.Pointer(&gcbss)), uintptr(unsafe.Pointer(&ebss))-uintptr(unsafe.Pointer(&bss))) } +// Called from malloc.go using onM, stopping and starting the world handled in caller. func gc_m(start_time int64, eagersweep bool) { _g_ := getg() gp := _g_.m.curg @@ -1211,18 +1565,266 @@ func gc_m(start_time int64, eagersweep bool) { gp.waitreason = "garbage collection" gc(start_time, eagersweep) + casgstatus(gp, _Gwaiting, _Grunning) +} + +// Similar to clearcheckmarkbits but works on a single span. +// It preforms two tasks. +// 1. When used before the checkmark phase it converts BitsDead (00) to bitsScalar (01) +// for nibbles with the BoundaryBit set. +// 2. When used after the checkmark phase it converts BitsPointerMark (11) to BitsPointer 10 and +// BitsScalarMark (00) to BitsScalar (01), thus clearing the checkmark mark encoding. +// For the second case it is possible to restore the BitsDead pattern but since +// clearmark is a debug tool performance has a lower priority than simplicity. +// The span is MSpanInUse and the world is stopped. +func clearcheckmarkbitsspan(s *mspan) { + if s.state != _MSpanInUse { + print("runtime:clearcheckmarkbitsspan: state=", s.state, "\n") + gothrow("clearcheckmarkbitsspan: bad span state") + } - if nbadblock > 0 { - // Work out path from root to bad block. - for { - gc(start_time, eagersweep) - if nbadblock >= int32(len(badblock)) { - gothrow("cannot find path to bad pointer") + arena_start := mheap_.arena_start + cl := s.sizeclass + size := s.elemsize + var n int32 + if cl == 0 { + n = 1 + } else { + // Chunk full of small blocks + npages := class_to_allocnpages[cl] + n = npages << _PageShift / int32(size) + } + + // MSpan_Sweep has similar code but instead of overloading and + // complicating that routine we do a simpler walk here. + // Sweep through n objects of given size starting at p. 
+ // This thread owns the span now, so it can manipulate + // the block bitmap without atomic operations. + p := uintptr(s.start) << _PageShift + + // Find bits for the beginning of the span. + off := (p - arena_start) / ptrSize + bitp := (*byte)(unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1)) + step := size / (ptrSize * wordsPerBitmapByte) + + // The type bit values are: + // 00 - BitsDead, for us BitsScalarMarked + // 01 - BitsScalar + // 10 - BitsPointer + // 11 - unused, for us BitsPointerMarked + // + // When called to prepare for the checkmark phase (checkmark==1), + // we change BitsDead to BitsScalar, so that there are no BitsScalarMarked + // type bits anywhere. + // + // The checkmark phase marks by changing BitsScalar to BitsScalarMarked + // and BitsPointer to BitsPointerMarked. + // + // When called to clean up after the checkmark phase (checkmark==0), + // we unmark by changing BitsScalarMarked back to BitsScalar and + // BitsPointerMarked back to BitsPointer. + // + // There are two problems with the scheme as just described. + // First, the setup rewrites BitsDead to BitsScalar, but the type bits + // following a BitsDead are uninitialized and must not be used. + // Second, objects that are free are expected to have their type + // bits zeroed (BitsDead), so in the cleanup we need to restore + // any BitsDeads that were there originally. + // + // In a one-word object (8-byte allocation on 64-bit system), + // there is no difference between BitsScalar and BitsDead, because + // neither is a pointer and there are no more words in the object, + // so using BitsScalar during the checkmark is safe and mapping + // both back to BitsDead during cleanup is also safe. + // + // In a larger object, we need to be more careful. During setup, + // if the type of the first word is BitsDead, we change it to BitsScalar + // (as we must) but also initialize the type of the second + // word to BitsDead, so that a scan during the checkmark phase + // will still stop before seeing the uninitialized type bits in the + // rest of the object. The sequence 'BitsScalar BitsDead' never + // happens in real type bitmaps - BitsDead is always as early + // as possible, so immediately after the last BitsPointer. + // During cleanup, if we see a BitsScalar, we can check to see if it + // is followed by BitsDead. If so, it was originally BitsDead and + // we can change it back. + + if step == 0 { + // updating top and bottom nibbles, all boundaries + for i := int32(0); i < n/2; i, bitp = i+1, addb(bitp, uintptrMask&-1) { + if *bitp&bitBoundary == 0 { + gothrow("missing bitBoundary") + } + b := (*bitp & bitPtrMask) >> 2 + if !checkmark && (b == _BitsScalar || b == _BitsScalarMarked) { + *bitp &^= 0x0c // convert to _BitsDead + } else if b == _BitsScalarMarked || b == _BitsPointerMarked { + *bitp &^= _BitsCheckMarkXor << 2 + } + + if (*bitp>>gcBits)&bitBoundary == 0 { + gothrow("missing bitBoundary") + } + b = ((*bitp >> gcBits) & bitPtrMask) >> 2 + if !checkmark && (b == _BitsScalar || b == _BitsScalarMarked) { + *bitp &^= 0xc0 // convert to _BitsDead + } else if b == _BitsScalarMarked || b == _BitsPointerMarked { + *bitp &^= _BitsCheckMarkXor << (2 + gcBits) + } + } + } else { + // updating bottom nibble for first word of each object + for i := int32(0); i < n; i, bitp = i+1, addb(bitp, -step) { + if *bitp&bitBoundary == 0 { + gothrow("missing bitBoundary") + } + b := (*bitp & bitPtrMask) >> 2 + + if checkmark && b == _BitsDead { + // move BitsDead into second word. 
+ // set bits to BitsScalar in preparation for checkmark phase. + *bitp &^= 0xc0 + *bitp |= _BitsScalar << 2 + } else if !checkmark && (b == _BitsScalar || b == _BitsScalarMarked) && *bitp&0xc0 == 0 { + // Cleaning up after checkmark phase. + // First word is scalar or dead (we forgot) + // and second word is dead. + // First word might as well be dead too. + *bitp &^= 0x0c + } else if b == _BitsScalarMarked || b == _BitsPointerMarked { + *bitp ^= _BitsCheckMarkXor << 2 } } } +} - casgstatus(gp, _Gwaiting, _Grunning) +// clearcheckmarkbits preforms two tasks. +// 1. When used before the checkmark phase it converts BitsDead (00) to bitsScalar (01) +// for nibbles with the BoundaryBit set. +// 2. When used after the checkmark phase it converts BitsPointerMark (11) to BitsPointer 10 and +// BitsScalarMark (00) to BitsScalar (01), thus clearing the checkmark mark encoding. +// This is a bit expensive but preserves the BitsDead encoding during the normal marking. +// BitsDead remains valid for every nibble except the ones with BitsBoundary set. +func clearcheckmarkbits() { + for _, s := range work.spans { + if s.state == _MSpanInUse { + clearcheckmarkbitsspan(s) + } + } +} + +// Called from malloc.go using onM. +// The world is stopped. Rerun the scan and mark phases +// using the bitMarkedCheck bit instead of the +// bitMarked bit. If the marking encounters an +// bitMarked bit that is not set then we throw. +func gccheckmark_m(startTime int64, eagersweep bool) { + if !gccheckmarkenable { + return + } + + if checkmark { + gothrow("gccheckmark_m, entered with checkmark already true") + } + + checkmark = true + clearcheckmarkbits() // Converts BitsDead to BitsScalar. + gc_m(startTime, eagersweep) // turns off checkmark + // Work done, fixed up the GC bitmap to remove the checkmark bits. + clearcheckmarkbits() +} + +func gccheckmarkenable_m() { + gccheckmarkenable = true +} + +func gccheckmarkdisable_m() { + gccheckmarkenable = false +} + +func finishsweep_m() { + // The world is stopped so we should be able to complete the sweeps + // quickly. + for sweepone() != ^uintptr(0) { + sweep.npausesweep++ + } + + // There may be some other spans being swept concurrently that + // we need to wait for. If finishsweep_m is done with the world stopped + // this code is not required. + sg := mheap_.sweepgen + for _, s := range work.spans { + if s.sweepgen != sg && s.state == _MSpanInUse { + mSpan_EnsureSwept(s) + } + } +} + +// Scan all of the stacks, greying (or graying if in America) the referents +// but not blackening them since the mark write barrier isn't installed. +func gcscan_m() { + _g_ := getg() + + // Grab the g that called us and potentially allow rescheduling. + // This allows it to be scanned like other goroutines. + mastergp := _g_.m.curg + casgstatus(mastergp, _Grunning, _Gwaiting) + mastergp.waitreason = "garbage collection scan" + + // Span sweeping has been done by finishsweep_m. + // Long term we will want to make this goroutine runnable + // by placing it onto a scanenqueue state and then calling + // runtime·restartg(mastergp) to make it Grunnable. + // At the bottom we will want to return this p back to the scheduler. + oldphase := gcphase + + // Prepare flag indicating that the scan has not been completed. + lock(&allglock) + local_allglen := allglen + for i := uintptr(0); i < local_allglen; i++ { + gp := allgs[i] + gp.gcworkdone = false // set to true in gcphasework + } + unlock(&allglock) + + work.nwait = 0 + work.ndone = 0 + work.nproc = 1 // For now do not do this in parallel. 
+ gcphase = _GCscan + // ackgcphase is not needed since we are not scanning running goroutines. + parforsetup(work.markfor, work.nproc, uint32(_RootCount+local_allglen), nil, false, markroot) + parfordo(work.markfor) + + lock(&allglock) + // Check that gc work is done. + for i := uintptr(0); i < local_allglen; i++ { + gp := allgs[i] + if !gp.gcworkdone { + gothrow("scan missed a g") + } + } + unlock(&allglock) + + gcphase = oldphase + casgstatus(mastergp, _Gwaiting, _Grunning) + // Let the g that called us continue to run. +} + +// Mark all objects that are known about. +func gcmark_m() { + scanblock(0, 0, nil) +} + +// For now this must be bracketed with a stoptheworld and a starttheworld to ensure +// all go routines see the new barrier. +func gcinstallmarkwb_m() { + gcphase = _GCmark +} + +// For now this must be bracketed with a stoptheworld and a starttheworld to ensure +// all go routines see the new barrier. +func gcinstalloffwb_m() { + gcphase = _GCoff } func gc(start_time int64, eagersweep bool) { @@ -1244,9 +1846,8 @@ func gc(start_time int64, eagersweep bool) { t1 = nanotime() } - // Sweep what is not sweeped by bgsweep. - for sweepone() != ^uintptr(0) { - sweep.npausesweep++ + if !checkmark { + finishsweep_m() // skip during checkmark debug phase. } // Cache runtime.mheap_.allspans in work.spans to avoid conflicts with @@ -1266,10 +1867,19 @@ func gc(start_time int64, eagersweep bool) { mheap_.gcspans = mheap_.allspans work.spans = h_allspans unlock(&mheap_.lock) + oldphase := gcphase work.nwait = 0 work.ndone = 0 work.nproc = uint32(gcprocs()) + gcphase = _GCmarktermination + + // World is stopped so allglen will not change. + for i := uintptr(0); i < allglen; i++ { + gp := allgs[i] + gp.gcworkdone = false // set to true in gcphasework + } + parforsetup(work.markfor, work.nproc, uint32(_RootCount+allglen), nil, false, markroot) if work.nproc > 1 { noteclear(&work.alldone) @@ -1285,6 +1895,14 @@ func gc(start_time int64, eagersweep bool) { parfordo(work.markfor) scanblock(0, 0, nil) + if work.full != 0 { + gothrow("work.full != 0") + } + if work.partial != 0 { + gothrow("work.partial != 0") + } + + gcphase = oldphase var t3 int64 if debug.gctrace > 0 { t3 = nanotime() @@ -1349,6 +1967,15 @@ func gc(start_time int64, eagersweep bool) { sysFree(unsafe.Pointer(&work.spans[0]), uintptr(len(work.spans))*unsafe.Sizeof(work.spans[0]), &memstats.other_sys) } + if gccheckmarkenable { + if !checkmark { + // first half of two-pass; don't set up sweep + unlock(&mheap_.lock) + return + } + checkmark = false // done checking marks + } + // Cache the current array for sweeping. mheap_.gcspans = mheap_.allspans mheap_.sweepgen += 2 diff --git a/src/runtime/mgc0.go b/src/runtime/mgc0.go index 38406f33a..7797894fc 100644 --- a/src/runtime/mgc0.go +++ b/src/runtime/mgc0.go @@ -97,54 +97,135 @@ func bgsweep() { } } +const ( + _PoisonGC = 0xf969696969696969 & (1<<(8*ptrSize) - 1) + _PoisonStack = 0x6868686868686868 & (1<<(8*ptrSize) - 1) +) + // NOTE: Really dst *unsafe.Pointer, src unsafe.Pointer, // but if we do that, Go inserts a write barrier on *dst = src. //go:nosplit func writebarrierptr(dst *uintptr, src uintptr) { *dst = src + writebarrierptr_nostore(dst, src) +} + +// Like writebarrierptr, but the store has already been applied. +// Do not reapply. 
+//go:nosplit +func writebarrierptr_nostore(dst *uintptr, src uintptr) { + if getg() == nil { // very low-level startup + return + } + + if src != 0 && (src < _PageSize || src == _PoisonGC || src == _PoisonStack) { + systemstack(func() { gothrow("bad pointer in write barrier") }) + } + + mp := acquirem() + if mp.inwb || mp.dying > 0 { + releasem(mp) + return + } + mp.inwb = true + systemstack(func() { + gcmarkwb_m(dst, src) + }) + mp.inwb = false + releasem(mp) } //go:nosplit func writebarrierstring(dst *[2]uintptr, src [2]uintptr) { - dst[0] = src[0] + writebarrierptr(&dst[0], src[0]) dst[1] = src[1] } //go:nosplit func writebarrierslice(dst *[3]uintptr, src [3]uintptr) { - dst[0] = src[0] + writebarrierptr(&dst[0], src[0]) dst[1] = src[1] dst[2] = src[2] } //go:nosplit func writebarrieriface(dst *[2]uintptr, src [2]uintptr) { - dst[0] = src[0] - dst[1] = src[1] -} - -//go:nosplit -func writebarrierfat2(dst *[2]uintptr, _ *byte, src [2]uintptr) { - dst[0] = src[0] - dst[1] = src[1] + writebarrierptr(&dst[0], src[0]) + writebarrierptr(&dst[1], src[1]) } -//go:nosplit -func writebarrierfat3(dst *[3]uintptr, _ *byte, src [3]uintptr) { - dst[0] = src[0] - dst[1] = src[1] - dst[2] = src[2] -} +//go:generate go run wbfat_gen.go -- wbfat.go +// +// The above line generates multiword write barriers for +// all the combinations of ptr+scalar up to four words. +// The implementations are written to wbfat.go. //go:nosplit -func writebarrierfat4(dst *[4]uintptr, _ *byte, src [4]uintptr) { - dst[0] = src[0] - dst[1] = src[1] - dst[2] = src[2] - dst[3] = src[3] +func writebarrierfat(typ *_type, dst, src unsafe.Pointer) { + mask := loadPtrMask(typ) + nptr := typ.size / ptrSize + for i := uintptr(0); i < nptr; i += 2 { + bits := mask[i/2] + if (bits>>2)&_BitsMask == _BitsPointer { + writebarrierptr((*uintptr)(dst), *(*uintptr)(src)) + } else { + *(*uintptr)(dst) = *(*uintptr)(src) + } + dst = add(dst, ptrSize) + src = add(src, ptrSize) + if i+1 == nptr { + break + } + bits >>= 4 + if (bits>>2)&_BitsMask == _BitsPointer { + writebarrierptr((*uintptr)(dst), *(*uintptr)(src)) + } else { + *(*uintptr)(dst) = *(*uintptr)(src) + } + dst = add(dst, ptrSize) + src = add(src, ptrSize) + } } //go:nosplit -func writebarrierfat(typ *_type, dst, src unsafe.Pointer) { - memmove(dst, src, typ.size) +func writebarriercopy(typ *_type, dst, src slice) int { + n := dst.len + if n > src.len { + n = src.len + } + if n == 0 { + return 0 + } + dstp := unsafe.Pointer(dst.array) + srcp := unsafe.Pointer(src.array) + + if uintptr(srcp) < uintptr(dstp) && uintptr(srcp)+uintptr(n)*typ.size > uintptr(dstp) { + // Overlap with src before dst. + // Copy backward, being careful not to move dstp/srcp + // out of the array they point into. + dstp = add(dstp, uintptr(n-1)*typ.size) + srcp = add(srcp, uintptr(n-1)*typ.size) + i := uint(0) + for { + writebarrierfat(typ, dstp, srcp) + if i++; i >= n { + break + } + dstp = add(dstp, -typ.size) + srcp = add(srcp, -typ.size) + } + } else { + // Copy forward, being careful not to move dstp/srcp + // out of the array they point into. 
+ i := uint(0) + for { + writebarrierfat(typ, dstp, srcp) + if i++; i >= n { + break + } + dstp = add(dstp, typ.size) + srcp = add(srcp, typ.size) + } + } + return int(n) } diff --git a/src/runtime/mgc0.h b/src/runtime/mgc0.h index 62726b4f0..dd0c46024 100644 --- a/src/runtime/mgc0.h +++ b/src/runtime/mgc0.h @@ -12,9 +12,11 @@ enum { BitsPointer = 2, BitsMask = 3, PointersPerByte = 8/BitsPerPointer, - MaxGCMask = 64, insData = 1, insArray, insArrayEnd, insEnd, + + // 64 bytes cover objects of size 1024/512 on 64/32 bits, respectively. + MaxGCMask = 65536, // TODO(rsc): change back to 64 }; diff --git a/src/runtime/mgc1.go b/src/runtime/mgc1.go index d1aab4554..04a5207e5 100644 --- a/src/runtime/mgc1.go +++ b/src/runtime/mgc1.go @@ -50,12 +50,15 @@ const ( // If you change these, also change scanblock. // scanblock does "if(bits == BitsScalar || bits == BitsDead)" as "if(bits <= BitsScalar)". - _BitsDead = 0 - _BitsScalar = 1 - _BitsPointer = 2 + _BitsDead = 0 + _BitsScalar = 1 // 01 + _BitsPointer = 2 // 10 + _BitsCheckMarkXor = 1 // 10 + _BitsScalarMarked = _BitsScalar ^ _BitsCheckMarkXor // 00 + _BitsPointerMarked = _BitsPointer ^ _BitsCheckMarkXor // 11 // 64 bytes cover objects of size 1024/512 on 64/32 bits, respectively. - _MaxGCMask = 64 + _MaxGCMask = 65536 // TODO(rsc): change back to 64 ) // Bits in per-word bitmap. diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index fedcd69c5..30205d68d 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -196,7 +196,7 @@ func mHeap_Alloc_m(h *mheap, npage uintptr, sizeclass int32, large bool) *mspan // able to map interior pointer to containing span. atomicstore(&s.sweepgen, h.sweepgen) s.state = _MSpanInUse - s.freelist = nil + s.freelist = 0 s.ref = 0 s.sizeclass = uint8(sizeclass) if sizeclass == 0 { @@ -248,7 +248,7 @@ func mHeap_AllocStack(h *mheap, npage uintptr) *mspan { s := mHeap_AllocSpanLocked(h, npage) if s != nil { s.state = _MSpanStack - s.freelist = nil + s.freelist = 0 s.ref = 0 memstats.stacks_inuse += uint64(s.npages << _PageShift) } @@ -571,7 +571,7 @@ func mSpan_Init(span *mspan, start pageID, npages uintptr) { span.prev = nil span.start = start span.npages = npages - span.freelist = nil + span.freelist = 0 span.ref = 0 span.sizeclass = 0 span.incache = false diff --git a/src/runtime/os_linux_386.go b/src/runtime/os_linux_386.go index c4f95804a..adcd5a1c4 100644 --- a/src/runtime/os_linux_386.go +++ b/src/runtime/os_linux_386.go @@ -14,8 +14,7 @@ const ( var _vdso uint32 -//go:nosplit -func linux_setup_vdso(argc int32, argv **byte) { +func sysargs(argc int32, argv **byte) { // skip over argv, envv to get to auxv n := argc + 1 for argv_index(argv, n) != nil { diff --git a/src/runtime/print1.go b/src/runtime/print1.go index 8f8268873..3d812bd04 100644 --- a/src/runtime/print1.go +++ b/src/runtime/print1.go @@ -41,7 +41,31 @@ func snprintf(dst *byte, n int32, s *byte) { gp.writebuf = nil } -//var debuglock mutex +var debuglock mutex + +// The compiler emits calls to printlock and printunlock around +// the multiple calls that implement a single Go print or println +// statement. Some of the print helpers (printsp, for example) +// call print recursively. There is also the problem of a crash +// happening during the print routines and needing to acquire +// the print lock to print information about the crash. +// For both these reasons, let a thread acquire the printlock 'recursively'. 
+ +func printlock() { + mp := getg().m + mp.printlock++ + if mp.printlock == 1 { + lock(&debuglock) + } +} + +func printunlock() { + mp := getg().m + mp.printlock-- + if mp.printlock == 0 { + unlock(&debuglock) + } +} // write to goroutine-local buffer if diverting output, // or else standard error. @@ -80,7 +104,7 @@ func printnl() { // Very simple printf. Only for debugging prints. // Do not add to this without checking with Rob. func vprintf(str string, arg unsafe.Pointer) { - //lock(&debuglock); + printlock() s := bytes(str) start := 0 @@ -160,7 +184,7 @@ func vprintf(str string, arg unsafe.Pointer) { gwrite(s[start:i]) } - //unlock(&debuglock); + printunlock() } func printpc(p unsafe.Pointer) { diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 295190cb4..64f6a3520 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -181,6 +181,9 @@ func acquireSudog() *sudog { // which keeps the garbage collector from being invoked. mp := acquirem() p := new(sudog) + if p.elem != nil { + gothrow("acquireSudog: found p.elem != nil after new") + } releasem(mp) return p } diff --git a/src/runtime/proc1.go b/src/runtime/proc1.go index aeded0e77..5a898ff41 100644 --- a/src/runtime/proc1.go +++ b/src/runtime/proc1.go @@ -316,6 +316,10 @@ func casfrom_Gscanstatus(gp *g, oldval, newval uint32) { // Check that transition is valid. switch oldval { + default: + print("runtime: casfrom_Gscanstatus bad oldval gp=", gp, ", oldval=", hex(oldval), ", newval=", hex(newval), "\n") + dumpgstatus(gp) + gothrow("casfrom_Gscanstatus:top gp->status is not in scan state") case _Gscanrunnable, _Gscanwaiting, _Gscanrunning, @@ -377,12 +381,12 @@ func casgstatus(gp *g, oldval, newval uint32) { }) } // Help GC if needed. - if gp.preemptscan && !gp.gcworkdone && (oldval == _Grunning || oldval == _Gsyscall) { - gp.preemptscan = false - systemstack(func() { - gcphasework(gp) - }) - } + // if gp.preemptscan && !gp.gcworkdone && (oldval == _Grunning || oldval == _Gsyscall) { + // gp.preemptscan = false + // systemstack(func() { + // gcphasework(gp) + // }) + // } } } @@ -512,9 +516,10 @@ func stopscanstart(gp *g) { // Runs on g0 and does the actual work after putting the g back on the run queue. func mquiesce(gpmaster *g) { - activeglen := len(allgs) // enqueue the calling goroutine. restartg(gpmaster) + + activeglen := len(allgs) for i := 0; i < activeglen; i++ { gp := allgs[i] if readgstatus(gp) == _Gdead { @@ -1579,7 +1584,8 @@ func save(pc, sp uintptr) { _g_.sched.lr = 0 _g_.sched.ret = 0 _g_.sched.ctxt = nil - _g_.sched.g = _g_ + // write as uintptr to avoid write barrier, which will smash _g_.sched. + *(*uintptr)(unsafe.Pointer(&_g_.sched.g)) = uintptr(unsafe.Pointer(_g_)) } // The goroutine g is about to enter a system call. 
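The save hunk above stores _g_.sched.g through a *uintptr cast so the compiler emits no pointer write barrier for that field; as the in-diff comment says, the barrier call would smash _g_.sched while it is being filled in. Below is a minimal sketch of that cast pattern only, using simplified stand-in types (g, gobuf) and a hypothetical saveSketch helper rather than the runtime's declarations.

// Storing a pointer-typed field through a *uintptr cast is a plain
// integer store, so the compiler inserts no write-barrier call for it.
// The types here are stand-ins, not the runtime's definitions.
package main

import "unsafe"

type g struct{ id int }

type gobuf struct {
	sp, pc uintptr
	g      *g // ordinary stores to this field would get a write barrier
}

func saveSketch(buf *gobuf, gp *g) {
	// With barriers compiled in, "buf.g = gp" would go through
	// writebarrierptr(&buf.g, ...); the cast below sidesteps that.
	*(*uintptr)(unsafe.Pointer(&buf.g)) = uintptr(unsafe.Pointer(gp))
}

func main() {
	gp := &g{id: 1}
	var buf gobuf
	saveSketch(&buf, gp)
	_ = buf
}

The same concern shows up later in the stack1.go changes, where newstack re-runs the barrier by hand (writebarrierptr_nostore) for the sched.ctxt value that morestack wrote without one.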
@@ -1625,7 +1631,10 @@ func reentersyscall(pc, sp uintptr) { _g_.syscallpc = pc casgstatus(_g_, _Grunning, _Gsyscall) if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp { - systemstack(entersyscall_bad) + systemstack(func() { + print("entersyscall inconsistent ", hex(_g_.syscallsp), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n") + gothrow("entersyscall") + }) } if atomicload(&sched.sysmonwait) != 0 { // TODO: fast atomic @@ -1654,13 +1663,6 @@ func entersyscall(dummy int32) { reentersyscall(getcallerpc(unsafe.Pointer(&dummy)), getcallersp(unsafe.Pointer(&dummy))) } -func entersyscall_bad() { - var gp *g - gp = getg().m.curg - print("entersyscall inconsistent ", hex(gp.syscallsp), " [", hex(gp.stack.lo), ",", hex(gp.stack.hi), "]\n") - gothrow("entersyscall") -} - func entersyscall_sysmon() { lock(&sched.lock) if atomicload(&sched.sysmonwait) != 0 { @@ -1692,12 +1694,26 @@ func entersyscallblock(dummy int32) { _g_.stackguard0 = stackPreempt // see comment in entersyscall // Leave SP around for GC and traceback. - save(getcallerpc(unsafe.Pointer(&dummy)), getcallersp(unsafe.Pointer(&dummy))) + pc := getcallerpc(unsafe.Pointer(&dummy)) + sp := getcallersp(unsafe.Pointer(&dummy)) + save(pc, sp) _g_.syscallsp = _g_.sched.sp _g_.syscallpc = _g_.sched.pc + if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp { + sp1 := sp + sp2 := _g_.sched.sp + sp3 := _g_.syscallsp + systemstack(func() { + print("entersyscallblock inconsistent ", hex(sp1), " ", hex(sp2), " ", hex(sp3), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n") + gothrow("entersyscallblock") + }) + } casgstatus(_g_, _Grunning, _Gsyscall) if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp { - systemstack(entersyscall_bad) + systemstack(func() { + print("entersyscallblock inconsistent ", hex(sp), " ", hex(_g_.sched.sp), " ", hex(_g_.syscallsp), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n") + gothrow("entersyscallblock") + }) } systemstack(entersyscallblock_handoff) @@ -1776,6 +1792,7 @@ func exitsyscallfast() bool { // Freezetheworld sets stopwait but does not retake P's. if sched.stopwait != 0 { + _g_.m.mcache = nil _g_.m.p = nil return false } @@ -1789,6 +1806,7 @@ func exitsyscallfast() bool { } // Try to get any other idle P. + _g_.m.mcache = nil _g_.m.p = nil if sched.pidle != nil { var ok bool @@ -2363,6 +2381,8 @@ func setcpuprofilerate_m(hz int32) { } // Change number of processors. The world is stopped, sched is locked. +// gcworkbufs are not being modified by either the GC or +// the write barrier code. 
func procresize(new int32) { old := gomaxprocs if old < 0 || old > _MaxGomaxprocs || new <= 0 || new > _MaxGomaxprocs { diff --git a/src/runtime/rt0_linux_386.s b/src/runtime/rt0_linux_386.s index 352e594d5..47fd908e7 100644 --- a/src/runtime/rt0_linux_386.s +++ b/src/runtime/rt0_linux_386.s @@ -9,7 +9,6 @@ TEXT _rt0_386_linux(SB),NOSPLIT,$8 LEAL 12(SP), BX MOVL AX, 0(SP) MOVL BX, 4(SP) - CALL runtime·linux_setup_vdso(SB) CALL main(SB) INT $3 diff --git a/src/runtime/runtime1.go b/src/runtime/runtime1.go index 15dea01a3..9e19b68be 100644 --- a/src/runtime/runtime1.go +++ b/src/runtime/runtime1.go @@ -97,7 +97,10 @@ func testAtomic64() { z64 = 42 x64 = 0 - // TODO: PREFETCH((unsafe.Pointer)(&z64)) + prefetcht0(uintptr(unsafe.Pointer(&z64))) + prefetcht1(uintptr(unsafe.Pointer(&z64))) + prefetcht2(uintptr(unsafe.Pointer(&z64))) + prefetchnta(uintptr(unsafe.Pointer(&z64))) if cas64(&z64, x64, 1) { gothrow("cas64 failed") } diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 7987a7373..d18178d09 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -227,6 +227,8 @@ type m struct { helpgc int32 spinning bool // m is out of work and is actively looking for work blocked bool // m is blocked on a note + inwb bool // m is executing a write barrier + printlock int8 fastrand uint32 ncgocall uint64 // number of cgo calls in total ncgo int32 // number of cgo calls currently in progress @@ -402,8 +404,9 @@ type itab struct { } // Lock-free stack node. +// // Also known to export_test.go. type lfnode struct { - next *lfnode + next uint64 pushcnt uintptr } @@ -448,11 +451,13 @@ type debugvars struct { // Indicates to write barrier and sychronization task to preform. const ( - _GCoff = iota // stop and start nop - _GCquiesce // stop and start nop - _GCstw // stop the ps nop - _GCmark // scan the stacks and start no white to black - _GCsweep // stop and start nop + _GCoff = iota // GC not running, write barrier disabled + _GCquiesce // unused state + _GCstw // unused state + _GCscan // GC collecting roots into workbufs, write barrier disabled + _GCmark // GC marking from workbufs, write barrier ENABLED + _GCmarktermination // GC mark termination: allocate black, P's help GC, write barrier ENABLED + _GCsweep // GC mark completed; sweeping in background, write barrier disabled ) type forcegcstate struct { diff --git a/src/runtime/select.go b/src/runtime/select.go index 6717b9395..5e5047bc1 100644 --- a/src/runtime/select.go +++ b/src/runtime/select.go @@ -377,12 +377,7 @@ loop: // iterating through the linked list they are in reverse order. cas = nil sglist = gp.waiting - // Clear all selectdone and elem before unlinking from gp.waiting. - // They must be cleared before being put back into the sudog cache. - // Clear before unlinking, because if a stack copy happens after the unlink, - // they will not be updated, they will be left pointing to the old stack, - // which creates dangling pointers, which may be detected by the - // garbage collector. + // Clear all elem before unlinking from gp.waiting. for sg1 := gp.waiting; sg1 != nil; sg1 = sg1.waitlink { sg1.selectdone = nil sg1.elem = nil diff --git a/src/runtime/stack1.go b/src/runtime/stack1.go index 1fd61ce1a..28000864d 100644 --- a/src/runtime/stack1.go +++ b/src/runtime/stack1.go @@ -58,7 +58,7 @@ func stackinit() { // Allocates a stack from the free pool. Must be called with // stackpoolmu held. 
-func stackpoolalloc(order uint8) *mlink { +func stackpoolalloc(order uint8) gclinkptr { list := &stackpool[order] s := list.next if s == list { @@ -70,23 +70,23 @@ func stackpoolalloc(order uint8) *mlink { if s.ref != 0 { gothrow("bad ref") } - if s.freelist != nil { + if s.freelist.ptr() != nil { gothrow("bad freelist") } for i := uintptr(0); i < _StackCacheSize; i += _FixedStack << order { - x := (*mlink)(unsafe.Pointer(uintptr(s.start)<<_PageShift + i)) - x.next = s.freelist + x := gclinkptr(uintptr(s.start)<<_PageShift + i) + x.ptr().next = s.freelist s.freelist = x } mSpanList_Insert(list, s) } x := s.freelist - if x == nil { + if x.ptr() == nil { gothrow("span has no free stacks") } - s.freelist = x.next + s.freelist = x.ptr().next s.ref++ - if s.freelist == nil { + if s.freelist.ptr() == nil { // all stacks in s are allocated. mSpanList_Remove(s) } @@ -94,22 +94,22 @@ func stackpoolalloc(order uint8) *mlink { } // Adds stack x to the free pool. Must be called with stackpoolmu held. -func stackpoolfree(x *mlink, order uint8) { +func stackpoolfree(x gclinkptr, order uint8) { s := mHeap_Lookup(&mheap_, (unsafe.Pointer)(x)) if s.state != _MSpanStack { gothrow("freeing stack not in a stack span") } - if s.freelist == nil { + if s.freelist.ptr() == nil { // s will now have a free stack mSpanList_Insert(&stackpool[order], s) } - x.next = s.freelist + x.ptr().next = s.freelist s.freelist = x s.ref-- if s.ref == 0 { // span is completely free - return to heap mSpanList_Remove(s) - s.freelist = nil + s.freelist = 0 mHeap_FreeStack(&mheap_, s) } } @@ -123,12 +123,12 @@ func stackcacherefill(c *mcache, order uint8) { // Grab some stacks from the global cache. // Grab half of the allowed capacity (to prevent thrashing). - var list *mlink + var list gclinkptr var size uintptr lock(&stackpoolmu) for size < _StackCacheSize/2 { x := stackpoolalloc(order) - x.next = list + x.ptr().next = list list = x size += _FixedStack << order } @@ -145,7 +145,7 @@ func stackcacherelease(c *mcache, order uint8) { size := c.stackcache[order].size lock(&stackpoolmu) for size > _StackCacheSize/2 { - y := x.next + y := x.ptr().next stackpoolfree(x, order) x = y size -= _FixedStack << order @@ -162,12 +162,12 @@ func stackcache_clear(c *mcache) { lock(&stackpoolmu) for order := uint8(0); order < _NumStackOrders; order++ { x := c.stackcache[order].list - for x != nil { - y := x.next + for x.ptr() != nil { + y := x.ptr().next stackpoolfree(x, order) x = y } - c.stackcache[order].list = nil + c.stackcache[order].list = 0 c.stackcache[order].size = 0 } unlock(&stackpoolmu) @@ -207,7 +207,7 @@ func stackalloc(n uint32) stack { order++ n2 >>= 1 } - var x *mlink + var x gclinkptr c := thisg.m.mcache if c == nil || thisg.m.gcing != 0 || thisg.m.helpgc != 0 { // c == nil can happen in the guts of exitsyscall or @@ -219,11 +219,11 @@ func stackalloc(n uint32) stack { unlock(&stackpoolmu) } else { x = c.stackcache[order].list - if x == nil { + if x.ptr() == nil { stackcacherefill(c, order) x = c.stackcache[order].list } - c.stackcache[order].list = x.next + c.stackcache[order].list = x.ptr().next c.stackcache[order].size -= uintptr(n) } v = (unsafe.Pointer)(x) @@ -270,7 +270,7 @@ func stackfree(stk stack) { order++ n2 >>= 1 } - x := (*mlink)(v) + x := gclinkptr(v) c := gp.m.mcache if c == nil || gp.m.gcing != 0 || gp.m.helpgc != 0 { lock(&stackpoolmu) @@ -280,7 +280,7 @@ func stackfree(stk stack) { if c.stackcache[order].size >= _StackCacheSize { stackcacherelease(c, order) } - x.next = c.stackcache[order].list + x.ptr().next = 
c.stackcache[order].list c.stackcache[order].list = x c.stackcache[order].size += n } @@ -526,6 +526,7 @@ func fillstack(stk stack, b byte) { } // Copies gp's stack to a new stack of a different size. +// Caller must have changed gp status to Gcopystack. func copystack(gp *g, newsize uintptr) { if gp.syscallsp != 0 { gothrow("stack growth not allowed in system call") @@ -563,15 +564,11 @@ func copystack(gp *g, newsize uintptr) { } memmove(unsafe.Pointer(new.hi-used), unsafe.Pointer(old.hi-used), used) - oldstatus := casgcopystack(gp) // cas from Gwaiting or Grunnable to Gcopystack, return old status - // Swap out old stack for new one gp.stack = new gp.stackguard0 = new.lo + _StackGuard // NOTE: might clobber a preempt request gp.sched.sp = new.hi - used - casgstatus(gp, _Gcopystack, oldstatus) // oldstatus is Gwaiting or Grunnable - // free old stack if stackPoisonCopy != 0 { fillstack(old, 0xfc) @@ -669,6 +666,14 @@ func newstack() { gothrow("runtime: split stack overflow") } + if gp.sched.ctxt != nil { + // morestack wrote sched.ctxt on its way in here, + // without a write barrier. Run the write barrier now. + // It is not possible to be preempted between then + // and now, so it's okay. + writebarrierptr_nostore((*uintptr)(unsafe.Pointer(&gp.sched.ctxt)), uintptr(gp.sched.ctxt)) + } + if gp.stackguard0 == stackPreempt { if gp == thisg.m.g0 { gothrow("runtime: preempt g0") @@ -677,7 +682,12 @@ func newstack() { gothrow("runtime: g is running but p is not") } if gp.preemptscan { + for !castogscanstatus(gp, _Gwaiting, _Gscanwaiting) { + // Likely to be racing with the GC as it sees a _Gwaiting and does the stack scan. + // If so this stack will be scanned twice which does not change correctness. + } gcphasework(gp) + casfrom_Gscanstatus(gp, _Gscanwaiting, _Gwaiting) casgstatus(gp, _Gwaiting, _Grunning) gp.stackguard0 = gp.stack.lo + _StackGuard gp.preempt = false @@ -708,13 +718,15 @@ func newstack() { gothrow("stack overflow") } - // Note that the concurrent GC might be scanning the stack as we try to replace it. - // copystack takes care of the appropriate coordination with the stack scanner. + casgstatus(gp, _Gwaiting, _Gcopystack) + + // The concurrent GC will not scan the stack while we are doing the copy since + // the gp is in a Gcopystack status. copystack(gp, uintptr(newsize)) if stackDebug >= 1 { print("stack grow done\n") } - casgstatus(gp, _Gwaiting, _Grunning) + casgstatus(gp, _Gcopystack, _Grunning) gogo(&gp.sched) } @@ -767,17 +779,17 @@ func shrinkstack(gp *g) { if gp.syscallsp != 0 { return } - - /* TODO - if goos_windows && gp.m != nil && gp.m.libcallsp != 0 { + if goos_windows != 0 && gp.m != nil && gp.m.libcallsp != 0 { return } - */ if stackDebug > 0 { print("shrinking stack ", oldsize, "->", newsize, "\n") } + + oldstatus := casgcopystack(gp) copystack(gp, newsize) + casgstatus(gp, _Gcopystack, oldstatus) } // Do any delayed stack freeing that was queued up during GC. 
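The stack1.go hunks above convert the stack free lists from *mlink pointers to gclinkptr, an integer-valued link with a ptr() accessor, so that pushing and popping free blocks uses ordinary uintptr stores rather than pointer writes the new barrier would have to process. A self-contained sketch of that linked-list pattern, with simplified declarations rather than the runtime's own, follows.

// gclinkptr stores a link as a uintptr: assignments to next are integer
// stores (no write barrier), and ptr() recovers a node pointer for walking.
// These declarations are simplified stand-ins for the runtime's types.
package main

import (
	"fmt"
	"unsafe"
)

type gclinkptr uintptr

type gclink struct {
	next gclinkptr
}

// ptr converts the integer link back into a usable node pointer.
func (p gclinkptr) ptr() *gclink {
	return (*gclink)(unsafe.Pointer(p))
}

func main() {
	// Build a two-element list; a zero gclinkptr terminates it.
	a, b := new(gclink), new(gclink)
	head := gclinkptr(uintptr(unsafe.Pointer(a)))
	head.ptr().next = gclinkptr(uintptr(unsafe.Pointer(b))) // integer store

	// Walk it the same way the stackpool and sweep loops above do.
	for x := head; x.ptr() != nil; x = x.ptr().next {
		fmt.Printf("node at %#x\n", uintptr(x))
	}
}

The x.ptr().next stores in stackpoolalloc, stackcacherefill, and mSpan_Sweep above all follow this shape.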
diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go index aa7577cf9..4408e22bf 100644 --- a/src/runtime/stubs.go +++ b/src/runtime/stubs.go @@ -231,3 +231,8 @@ func call536870912(fn, arg unsafe.Pointer, n, retoffset uint32) func call1073741824(fn, arg unsafe.Pointer, n, retoffset uint32) func systemstack_switch() + +func prefetcht0(addr uintptr) +func prefetcht1(addr uintptr) +func prefetcht2(addr uintptr) +func prefetchnta(addr uintptr) diff --git a/src/runtime/vdso_none.go b/src/runtime/vdso_none.go index ac6f8cb18..6f83ecc89 100644 --- a/src/runtime/vdso_none.go +++ b/src/runtime/vdso_none.go @@ -3,6 +3,7 @@ // license that can be found in the LICENSE file. // +build !linux !amd64 +// +build !linux !386 package runtime diff --git a/src/runtime/wbfat.go b/src/runtime/wbfat.go new file mode 100644 index 000000000..75c58b26b --- /dev/null +++ b/src/runtime/wbfat.go @@ -0,0 +1,190 @@ +// generated by wbfat_gen.go; use go generate + +package runtime + +//go:nosplit +func writebarrierfat01(dst *[2]uintptr, _ *byte, src [2]uintptr) { + dst[0] = src[0] + writebarrierptr(&dst[1], src[1]) +} + +//go:nosplit +func writebarrierfat10(dst *[2]uintptr, _ *byte, src [2]uintptr) { + writebarrierptr(&dst[0], src[0]) + dst[1] = src[1] +} + +//go:nosplit +func writebarrierfat11(dst *[2]uintptr, _ *byte, src [2]uintptr) { + writebarrierptr(&dst[0], src[0]) + writebarrierptr(&dst[1], src[1]) +} + +//go:nosplit +func writebarrierfat001(dst *[3]uintptr, _ *byte, src [3]uintptr) { + dst[0] = src[0] + dst[1] = src[1] + writebarrierptr(&dst[2], src[2]) +} + +//go:nosplit +func writebarrierfat010(dst *[3]uintptr, _ *byte, src [3]uintptr) { + dst[0] = src[0] + writebarrierptr(&dst[1], src[1]) + dst[2] = src[2] +} + +//go:nosplit +func writebarrierfat011(dst *[3]uintptr, _ *byte, src [3]uintptr) { + dst[0] = src[0] + writebarrierptr(&dst[1], src[1]) + writebarrierptr(&dst[2], src[2]) +} + +//go:nosplit +func writebarrierfat100(dst *[3]uintptr, _ *byte, src [3]uintptr) { + writebarrierptr(&dst[0], src[0]) + dst[1] = src[1] + dst[2] = src[2] +} + +//go:nosplit +func writebarrierfat101(dst *[3]uintptr, _ *byte, src [3]uintptr) { + writebarrierptr(&dst[0], src[0]) + dst[1] = src[1] + writebarrierptr(&dst[2], src[2]) +} + +//go:nosplit +func writebarrierfat110(dst *[3]uintptr, _ *byte, src [3]uintptr) { + writebarrierptr(&dst[0], src[0]) + writebarrierptr(&dst[1], src[1]) + dst[2] = src[2] +} + +//go:nosplit +func writebarrierfat111(dst *[3]uintptr, _ *byte, src [3]uintptr) { + writebarrierptr(&dst[0], src[0]) + writebarrierptr(&dst[1], src[1]) + writebarrierptr(&dst[2], src[2]) +} + +//go:nosplit +func writebarrierfat0001(dst *[4]uintptr, _ *byte, src [4]uintptr) { + dst[0] = src[0] + dst[1] = src[1] + dst[2] = src[2] + writebarrierptr(&dst[3], src[3]) +} + +//go:nosplit +func writebarrierfat0010(dst *[4]uintptr, _ *byte, src [4]uintptr) { + dst[0] = src[0] + dst[1] = src[1] + writebarrierptr(&dst[2], src[2]) + dst[3] = src[3] +} + +//go:nosplit +func writebarrierfat0011(dst *[4]uintptr, _ *byte, src [4]uintptr) { + dst[0] = src[0] + dst[1] = src[1] + writebarrierptr(&dst[2], src[2]) + writebarrierptr(&dst[3], src[3]) +} + +//go:nosplit +func writebarrierfat0100(dst *[4]uintptr, _ *byte, src [4]uintptr) { + dst[0] = src[0] + writebarrierptr(&dst[1], src[1]) + dst[2] = src[2] + dst[3] = src[3] +} + +//go:nosplit +func writebarrierfat0101(dst *[4]uintptr, _ *byte, src [4]uintptr) { + dst[0] = src[0] + writebarrierptr(&dst[1], src[1]) + dst[2] = src[2] + writebarrierptr(&dst[3], src[3]) +} + +//go:nosplit +func 
writebarrierfat0110(dst *[4]uintptr, _ *byte, src [4]uintptr) { + dst[0] = src[0] + writebarrierptr(&dst[1], src[1]) + writebarrierptr(&dst[2], src[2]) + dst[3] = src[3] +} + +//go:nosplit +func writebarrierfat0111(dst *[4]uintptr, _ *byte, src [4]uintptr) { + dst[0] = src[0] + writebarrierptr(&dst[1], src[1]) + writebarrierptr(&dst[2], src[2]) + writebarrierptr(&dst[3], src[3]) +} + +//go:nosplit +func writebarrierfat1000(dst *[4]uintptr, _ *byte, src [4]uintptr) { + writebarrierptr(&dst[0], src[0]) + dst[1] = src[1] + dst[2] = src[2] + dst[3] = src[3] +} + +//go:nosplit +func writebarrierfat1001(dst *[4]uintptr, _ *byte, src [4]uintptr) { + writebarrierptr(&dst[0], src[0]) + dst[1] = src[1] + dst[2] = src[2] + writebarrierptr(&dst[3], src[3]) +} + +//go:nosplit +func writebarrierfat1010(dst *[4]uintptr, _ *byte, src [4]uintptr) { + writebarrierptr(&dst[0], src[0]) + dst[1] = src[1] + writebarrierptr(&dst[2], src[2]) + dst[3] = src[3] +} + +//go:nosplit +func writebarrierfat1011(dst *[4]uintptr, _ *byte, src [4]uintptr) { + writebarrierptr(&dst[0], src[0]) + dst[1] = src[1] + writebarrierptr(&dst[2], src[2]) + writebarrierptr(&dst[3], src[3]) +} + +//go:nosplit +func writebarrierfat1100(dst *[4]uintptr, _ *byte, src [4]uintptr) { + writebarrierptr(&dst[0], src[0]) + writebarrierptr(&dst[1], src[1]) + dst[2] = src[2] + dst[3] = src[3] +} + +//go:nosplit +func writebarrierfat1101(dst *[4]uintptr, _ *byte, src [4]uintptr) { + writebarrierptr(&dst[0], src[0]) + writebarrierptr(&dst[1], src[1]) + dst[2] = src[2] + writebarrierptr(&dst[3], src[3]) +} + +//go:nosplit +func writebarrierfat1110(dst *[4]uintptr, _ *byte, src [4]uintptr) { + writebarrierptr(&dst[0], src[0]) + writebarrierptr(&dst[1], src[1]) + writebarrierptr(&dst[2], src[2]) + dst[3] = src[3] +} + +//go:nosplit +func writebarrierfat1111(dst *[4]uintptr, _ *byte, src [4]uintptr) { + writebarrierptr(&dst[0], src[0]) + writebarrierptr(&dst[1], src[1]) + writebarrierptr(&dst[2], src[2]) + writebarrierptr(&dst[3], src[3]) +} diff --git a/src/runtime/wbfat_gen.go b/src/runtime/wbfat_gen.go new file mode 100644 index 000000000..78d5b6271 --- /dev/null +++ b/src/runtime/wbfat_gen.go @@ -0,0 +1,41 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build ignore + +package main + +import ( + "flag" + "fmt" + "log" + "os" +) + +func main() { + flag.Parse() + if flag.NArg() > 0 { + f, err := os.Create(flag.Arg(0)) + if err != nil { + log.Fatal(err) + } + os.Stdout = f + } + fmt.Printf("// generated by wbfat_gen.go; use go generate\n\n") + fmt.Printf("package runtime\n") + for i := uint(2); i <= 4; i++ { + for j := 1; j < 1<<i; j++ { + fmt.Printf("\n//go:nosplit\n") + fmt.Printf("func writebarrierfat%0*b(dst *[%d]uintptr, _ *byte, src [%d]uintptr) {\n", int(i), j, i, i) + for k := uint(0); k < i; k++ { + if j&(1<<(i-1-k)) != 0 { + fmt.Printf("\twritebarrierptr(&dst[%d], src[%d])\n", k, k) + } else { + fmt.Printf("\tdst[%d] = src[%d]\n", k, k) + } + } + fmt.Printf("}\n") + } + } +} diff --git a/src/sync/atomic/atomic_test.go b/src/sync/atomic/atomic_test.go index 9f13af48b..ec573aa8c 100644 --- a/src/sync/atomic/atomic_test.go +++ b/src/sync/atomic/atomic_test.go @@ -164,7 +164,7 @@ func TestSwapPointer(t *testing.T) { x.before = magicptr x.after = magicptr var j uintptr - for delta := uintptr(1); delta+delta > delta; delta += delta { + for delta := uintptr(1 << 16); delta+delta > delta; delta += delta { k := SwapPointer(&x.i, unsafe.Pointer(delta)) if uintptr(x.i) != delta || uintptr(k) != j { t.Fatalf("delta=%d i=%d j=%d k=%d", delta, x.i, j, k) @@ -456,7 +456,7 @@ func TestCompareAndSwapPointer(t *testing.T) { magicptr := uintptr(m) x.before = magicptr x.after = magicptr - for val := uintptr(1); val+val > val; val += val { + for val := uintptr(1 << 16); val+val > val; val += val { x.i = unsafe.Pointer(val) if !CompareAndSwapPointer(&x.i, unsafe.Pointer(val), unsafe.Pointer(val+1)) { t.Fatalf("should have swapped %#x %#x", val, val+1) @@ -595,7 +595,7 @@ func TestLoadPointer(t *testing.T) { magicptr := uintptr(m) x.before = magicptr x.after = magicptr - for delta := uintptr(1); delta+delta > delta; delta += delta { + for delta := uintptr(1 << 16); delta+delta > delta; delta += delta { k := LoadPointer(&x.i) if k != x.i { t.Fatalf("delta=%d i=%d k=%d", delta, x.i, k) @@ -731,7 +731,7 @@ func TestStorePointer(t *testing.T) { x.before = magicptr x.after = magicptr v := unsafe.Pointer(uintptr(0)) - for delta := uintptr(1); delta+delta > delta; delta += delta { + for delta := uintptr(1 << 16); delta+delta > delta; delta += delta { StorePointer(&x.i, unsafe.Pointer(v)) if x.i != v { t.Fatalf("delta=%d i=%d v=%d", delta, x.i, v) diff --git a/test/live.go b/test/live.go index f69d0a4c1..62c6a0b0e 100644 --- a/test/live.go +++ b/test/live.go @@ -9,20 +9,39 @@ package main +func printnl() + +//go:noescape +func printpointer(**int) + +//go:noescape +func printintpointer(*int) + +//go:noescape +func printstringpointer(*string) + +//go:noescape +func printstring(string) + +//go:noescape +func printbytepointer(*byte) + +func printint(int) + func f1() { var x *int - print(&x) // ERROR "live at call to printpointer: x$" - print(&x) // ERROR "live at call to printpointer: x$" + printpointer(&x) // ERROR "live at call to printpointer: x$" + printpointer(&x) // ERROR "live at call to printpointer: x$" } func f2(b bool) { if b { - print(0) // nothing live here + printint(0) // nothing live here return } var x *int - print(&x) // ERROR "live at call to printpointer: x$" - print(&x) // ERROR "live at call to printpointer: x$" + printpointer(&x) // ERROR "live at call to printpointer: x$" + printpointer(&x) // ERROR "live at call to printpointer: x$" } func f3(b bool) { @@ -30,22 +49,22 @@ func f3(b bool) { // live throughout the function, to avoid 
being poisoned // in GODEBUG=gcdead=1 mode. - print(0) // ERROR "live at call to printint: x y$" + printint(0) // ERROR "live at call to printint: x y$" if b == false { - print(0) // ERROR "live at call to printint: x y$" + printint(0) // ERROR "live at call to printint: x y$" return } if b { var x *int - print(&x) // ERROR "live at call to printpointer: x y$" - print(&x) // ERROR "live at call to printpointer: x y$" + printpointer(&x) // ERROR "live at call to printpointer: x y$" + printpointer(&x) // ERROR "live at call to printpointer: x y$" } else { var y *int - print(&y) // ERROR "live at call to printpointer: x y$" - print(&y) // ERROR "live at call to printpointer: x y$" + printpointer(&y) // ERROR "live at call to printpointer: x y$" + printpointer(&y) // ERROR "live at call to printpointer: x y$" } - print(0) // ERROR "live at call to printint: x y$" "x \(type \*int\) is ambiguously live" "y \(type \*int\) is ambiguously live" + printint(0) // ERROR "live at call to printint: x y$" "x \(type \*int\) is ambiguously live" "y \(type \*int\) is ambiguously live" } // The old algorithm treated x as live on all code that @@ -56,20 +75,20 @@ func f3(b bool) { func f4(b1, b2 bool) { // x not live here if b2 { - print(0) // x not live here + printint(0) // x not live here return } var z **int x := new(int) *x = 42 z = &x - print(**z) // ERROR "live at call to printint: x z$" + printint(**z) // ERROR "live at call to printint: x z$" if b2 { - print(1) // ERROR "live at call to printint: x$" + printint(1) // ERROR "live at call to printint: x$" return } for { - print(**z) // ERROR "live at call to printint: x z$" + printint(**z) // ERROR "live at call to printint: x z$" } } @@ -84,7 +103,7 @@ func f5(b1 bool) { *y = 54 z = &y } - print(**z) // ERROR "live at call to printint: x y$" "x \(type \*int\) is ambiguously live" "y \(type \*int\) is ambiguously live" + printint(**z) // ERROR "live at call to printint: x y$" "x \(type \*int\) is ambiguously live" "y \(type \*int\) is ambiguously live" } // confusion about the _ result used to cause spurious "live at entry to f6: _". @@ -155,8 +174,8 @@ func f11b() *int { // At this point p is dead: the code here cannot // get to the bottom of the function. // This used to have a spurious "live at call to printint: p". - print(1) // nothing live here! - select { // ERROR "live at call to newselect: autotmp" "live at call to selectgo: autotmp" + printint(1) // nothing live here! + select { // ERROR "live at call to newselect: autotmp" "live at call to selectgo: autotmp" case <-c: // ERROR "live at call to selectrecv: autotmp" return nil case <-c: // ERROR "live at call to selectrecv: autotmp" @@ -172,8 +191,8 @@ func f11c() *int { if b { // Unlike previous, the cases in this select fall through, // so we can get to the println, so p is not dead. 
- print(1) // ERROR "live at call to printint: p" - select { // ERROR "live at call to newselect: autotmp.* p" "live at call to selectgo: autotmp.* p" + printint(1) // ERROR "live at call to printint: p" + select { // ERROR "live at call to newselect: autotmp.* p" "live at call to selectgo: autotmp.* p" case <-c: // ERROR "live at call to selectrecv: autotmp.* p" case <-c: // ERROR "live at call to selectrecv: autotmp.* p" } @@ -209,7 +228,7 @@ func h13(string, string) string func f14() { x := g14() - print(&x) // ERROR "live at call to printpointer: x" + printstringpointer(&x) // ERROR "live at call to printstringpointer: x" } func g14() string @@ -217,8 +236,8 @@ func g14() string func f15() { var x string _ = &x - x = g15() // ERROR "live at call to g15: x" - print(x) // ERROR "live at call to printstring: x" + x = g15() // ERROR "live at call to g15: x" + printstring(x) // ERROR "live at call to printstring: x" } func g15() string @@ -282,7 +301,7 @@ func f18() { } z = m2[g18()] // ERROR "live at call to mapaccess1: autotmp_[0-9]+$" z = m2[g18()] // ERROR "live at call to mapaccess1: autotmp_[0-9]+$" - print(z) + printbytepointer(z) } var ch chan *byte @@ -296,7 +315,7 @@ func f19() { } z = <-ch // ERROR "live at call to chanrecv1: autotmp_[0-9]+$" z = <-ch // ERROR "live at call to chanrecv1: autotmp_[0-9]+$" - print(z) + printbytepointer(z) } func f20() { @@ -316,7 +335,7 @@ func f21() { } z = m2[[2]string{"x", "y"}] // ERROR "live at call to mapaccess1: autotmp_[0-9]+$" z = m2[[2]string{"x", "y"}] // ERROR "live at call to mapaccess1: autotmp_[0-9]+$" - print(z) + printbytepointer(z) } func f23() { @@ -328,7 +347,8 @@ func f23() { } z, ok = m2[[2]string{"x", "y"}] // ERROR "live at call to mapaccess2: autotmp_[0-9]+$" z, ok = m2[[2]string{"x", "y"}] // ERROR "live at call to mapaccess2: autotmp_[0-9]+$" - print(z, ok) + printbytepointer(z) + print(ok) } func f24() { @@ -350,8 +370,8 @@ func f25(b bool) { } var x string _ = &x - x = g15() // ERROR "live at call to g15: x" - print(x) // ERROR "live at call to printstring: x" + x = g15() // ERROR "live at call to g15: x" + printstring(x) // ERROR "live at call to printstring: x" } // ERROR "live at call to deferreturn: x" func g25() @@ -366,7 +386,7 @@ func f26(b bool) { } print26((*int)(nil), (*int)(nil), (*int)(nil)) // ERROR "live at call to print26: autotmp_[0-9]+$" print26((*int)(nil), (*int)(nil), (*int)(nil)) // ERROR "live at call to print26: autotmp_[0-9]+$" - println() + printnl() } //go:noescape @@ -381,7 +401,7 @@ func f27(b bool) { } call27(func() { x++ }) // ERROR "live at call to call27: autotmp_[0-9]+$" call27(func() { x++ }) // ERROR "live at call to call27: autotmp_[0-9]+$" - println() + printnl() } // but defer does escape to later execution in the function @@ -392,7 +412,7 @@ func f27defer(b bool) { defer call27(func() { x++ }) // ERROR "live at call to deferproc: autotmp_[0-9]+$" "live at call to deferreturn: autotmp_[0-9]+$" } defer call27(func() { x++ }) // ERROR "live at call to deferproc: autotmp_[0-9]+ autotmp_[0-9]+$" "live at call to deferreturn: autotmp_[0-9]+ autotmp_[0-9]+$" "ambiguously live" - println() // ERROR "live at call to printnl: autotmp_[0-9]+ autotmp_[0-9]+$" + printnl() // ERROR "live at call to printnl: autotmp_[0-9]+ autotmp_[0-9]+$" } // ERROR "live at call to deferreturn: autotmp_[0-9]+ autotmp_[0-9]+$" // and newproc (go) escapes to the heap @@ -403,7 +423,7 @@ func f27go(b bool) { go call27(func() { x++ }) // ERROR "live at call to newobject: &x" "live at call to newproc: &x$" } go 
call27(func() { x++ }) // ERROR "live at call to newobject: &x" - println() + printnl() } //go:noescape @@ -415,10 +435,10 @@ var s1, s2, s3, s4, s5, s6, s7, s8, s9, s10 string func f28(b bool) { if b { - print(s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10) // ERROR "live at call to concatstrings: autotmp_[0-9]+$" "live at call to printstring: autotmp_[0-9]+$" + printstring(s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10) // ERROR "live at call to concatstrings: autotmp_[0-9]+$" "live at call to printstring: autotmp_[0-9]+$" } - print(s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10) // ERROR "live at call to concatstrings: autotmp_[0-9]+$" "live at call to printstring: autotmp_[0-9]+$" - print(s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10) // ERROR "live at call to concatstrings: autotmp_[0-9]+$" "live at call to printstring: autotmp_[0-9]+$" + printstring(s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10) // ERROR "live at call to concatstrings: autotmp_[0-9]+$" "live at call to printstring: autotmp_[0-9]+$" + printstring(s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10) // ERROR "live at call to concatstrings: autotmp_[0-9]+$" "live at call to printstring: autotmp_[0-9]+$" } // map iterator should die on end of range loop @@ -426,14 +446,14 @@ func f28(b bool) { func f29(b bool) { if b { for k := range m { // ERROR "live at call to mapiterinit: autotmp_[0-9]+$" "live at call to mapiternext: autotmp_[0-9]+$" - print(k) // ERROR "live at call to printstring: autotmp_[0-9]+$" + printstring(k) // ERROR "live at call to printstring: autotmp_[0-9]+$" } } for k := range m { // ERROR "live at call to mapiterinit: autotmp_[0-9]+$" "live at call to mapiternext: autotmp_[0-9]+$" - print(k) // ERROR "live at call to printstring: autotmp_[0-9]+$" + printstring(k) // ERROR "live at call to printstring: autotmp_[0-9]+$" } for k := range m { // ERROR "live at call to mapiterinit: autotmp_[0-9]+$" "live at call to mapiternext: autotmp_[0-9]+$" - print(k) // ERROR "live at call to printstring: autotmp_[0-9]+$" + printstring(k) // ERROR "live at call to printstring: autotmp_[0-9]+$" } } @@ -446,14 +466,14 @@ func f30(b bool) { // the copy of ptrarr and the internal iterator pointer. 
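(f30's body continues just below.) A brief aside on why ranging over an array of pointers creates liveness work at all: a range over an array operates on a hidden copy of it, and that copy is exactly "the copy of ptrarr" the comment above refers to. A standalone sketch of that behavior, outside the errorcheck harness:

    package main

    import "fmt"

    var ptrarr [10]*int

    func main() {
        x := 1
        ptrarr[0] = &x
        // range evaluates a hidden copy of the array, so clearing the
        // original inside the loop does not change what p sees; that
        // copy is the kind of temporary the liveness test tracks.
        for i, p := range ptrarr {
            if i == 0 {
                ptrarr[0] = nil
                fmt.Println(p != nil) // prints true: p came from the copy
            }
        }
    }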
if b { for _, p := range ptrarr { - print(p) // ERROR "live at call to printpointer: autotmp_[0-9]+ autotmp_[0-9]+$" + printintpointer(p) // ERROR "live at call to printintpointer: autotmp_[0-9]+ autotmp_[0-9]+$" } } for _, p := range ptrarr { - print(p) // ERROR "live at call to printpointer: autotmp_[0-9]+ autotmp_[0-9]+$" + printintpointer(p) // ERROR "live at call to printintpointer: autotmp_[0-9]+ autotmp_[0-9]+$" } for _, p := range ptrarr { - print(p) // ERROR "live at call to printpointer: autotmp_[0-9]+ autotmp_[0-9]+$" + printintpointer(p) // ERROR "live at call to printintpointer: autotmp_[0-9]+ autotmp_[0-9]+$" } } @@ -503,44 +523,44 @@ var m33 map[interface{}]int func f33() { if m33[nil] == 0 { // ERROR "live at call to mapaccess1: autotmp_[0-9]+$" - println() + printnl() return } else { - println() + printnl() } - println() + printnl() } func f34() { if m33[nil] == 0 { // ERROR "live at call to mapaccess1: autotmp_[0-9]+$" - println() + printnl() return } - println() + printnl() } func f35() { if m33[nil] == 0 && m33[nil] == 0 { // ERROR "live at call to mapaccess1: autotmp_[0-9]+$" - println() + printnl() return } - println() + printnl() } func f36() { if m33[nil] == 0 || m33[nil] == 0 { // ERROR "live at call to mapaccess1: autotmp_[0-9]+$" - println() + printnl() return } - println() + printnl() } func f37() { if (m33[nil] == 0 || m33[nil] == 0) && m33[nil] == 0 { // ERROR "live at call to mapaccess1: autotmp_[0-9]+$" - println() + printnl() return } - println() + printnl() } // select temps should disappear in the case bodies @@ -558,44 +578,44 @@ func f38(b bool) { if b { select { // ERROR "live at call" case <-fc38(): // ERROR "live at call" - println() + printnl() case fc38() <- *fi38(1): // ERROR "live at call" - println() + printnl() case *fi38(2) = <-fc38(): // ERROR "live at call" - println() + printnl() case *fi38(3), *fb38() = <-fc38(): // ERROR "live at call" - println() + printnl() } - println() + printnl() } - println() + printnl() } // issue 8097: mishandling of x = x during return. func f39() (x []int) { x = []int{1} - println() // ERROR "live at call to printnl: x" + printnl() // ERROR "live at call to printnl: x" return x } func f39a() (x []int) { x = []int{1} - println() // ERROR "live at call to printnl: x" + printnl() // ERROR "live at call to printnl: x" return } func f39b() (x [10]*int) { x = [10]*int{} x[0] = new(int) // ERROR "live at call to newobject: x" - println() // ERROR "live at call to printnl: x" + printnl() // ERROR "live at call to printnl: x" return x } func f39c() (x [10]*int) { x = [10]*int{} x[0] = new(int) // ERROR "live at call to newobject: x" - println() // ERROR "live at call to printnl: x" + printnl() // ERROR "live at call to printnl: x" return } @@ -615,13 +635,13 @@ func newT40() *T40 { func bad40() { t := newT40() _ = t - println() + printnl() } func good40() { ret := T40{} ret.m = make(map[int]int) // ERROR "live at call to makemap: ret" t := &ret - println() // ERROR "live at call to printnl: ret" + printnl() // ERROR "live at call to printnl: ret" _ = t } diff --git a/test/live2.go b/test/live2.go index ef6ad994c..1bd0af2cc 100644 --- a/test/live2.go +++ b/test/live2.go @@ -12,6 +12,8 @@ package main // issue 8142: lost 'addrtaken' bit on inlined variables. // no inlining in this test, so just checking that non-inlined works. 
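live2.go follows suit just below, declaring its own printnl stub in place of the built-in println, presumably for the same reason as the live.go changes above: each checked call site in the ERROR text then names one ordinary function. For readers who want to poke at the shape outside the errorcheck harness, here is a self-contained version of the bad40/good40 pair from the hunks above; the print calls are dropped so the sketch builds without the bodyless stubs:

    package main

    type T40 struct {
        m map[int]int
    }

    func newT40() *T40 {
        ret := T40{}
        ret.m = make(map[int]int)
        return &ret
    }

    func bad40() {
        // The struct is built inside the constructor and is reached
        // only through the returned pointer here.
        t := newT40()
        _ = t
    }

    func good40() {
        // Here the struct is a local whose address is taken, so the
        // test expects "ret" to be reported live at the makemap call
        // and at the later print call.
        ret := T40{}
        ret.m = make(map[int]int)
        t := &ret
        _ = t
    }

    func main() {
        bad40()
        good40()
    }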
+func printnl() + type T40 struct { m map[int]int } @@ -24,7 +26,7 @@ func newT40() *T40 { func bad40() { t := newT40() // ERROR "live at call to makemap: ret" - println() // ERROR "live at call to printnl: ret" + printnl() // ERROR "live at call to printnl: ret" _ = t } @@ -32,6 +34,6 @@ func good40() { ret := T40{} ret.m = make(map[int]int) // ERROR "live at call to makemap: ret" t := &ret - println() // ERROR "live at call to printnl: ret" + printnl() // ERROR "live at call to printnl: ret" _ = t }
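One last aside, on the sync/atomic test hunks earlier in this commit: TestSwapPointer and friends now start delta at 1<<16 instead of 1, presumably to keep small integer values out of pointer-typed words under the stricter collector; the loop's termination logic is untouched, since it simply doubles delta until the addition overflows. A small standalone illustration (the 47/15 counts are for 64- and 32-bit uintptr respectively):

    package main

    import "fmt"

    func main() {
        // Same loop shape as the atomic pointer tests: double delta
        // until delta+delta would overflow uintptr. Starting at 1<<16
        // only skips the smallest values; the exit condition is the same.
        n := 0
        for delta := uintptr(1 << 16); delta+delta > delta; delta += delta {
            n++
        }
        fmt.Println("doublings before overflow:", n) // 47 on 64-bit, 15 on 32-bit
    }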