author     Austin Clements <austin@google.com>  2014-10-22 13:25:37 -0400
committer  Austin Clements <austin@google.com>  2014-10-22 13:25:37 -0400
commit     800a826ae560191d4dbb6c529eea15cf4349a06f (patch)
tree       c4206bb02365c3b9c66e2946031b86f0ee527faf /src/sync
parent     2c586384223e980fd3303625ea6b02ed5d9fb9c0 (diff)
parent     1678ee65674b332e900a703de296eb66fbadcf45 (diff)
build: merge the great pkg/ rename into dev.power64
This also removes pkg/runtime/traceback_lr.c, which was ported to Go
in an earlier commit and then moved to runtime/traceback.go.

Reviewer: rsc@golang.org

rsc: LGTM
Diffstat (limited to 'src/sync')
-rw-r--r--  src/sync/atomic/64bit_arm.go  46
-rw-r--r--  src/sync/atomic/asm_386.s  214
-rw-r--r--  src/sync/atomic/asm_amd64.s  146
-rw-r--r--  src/sync/atomic/asm_amd64p32.s  159
-rw-r--r--  src/sync/atomic/asm_arm.s  197
-rw-r--r--  src/sync/atomic/asm_freebsd_arm.s  109
-rw-r--r--  src/sync/atomic/asm_linux_arm.s  216
-rw-r--r--  src/sync/atomic/asm_nacl_arm.s  109
-rw-r--r--  src/sync/atomic/asm_netbsd_arm.s  109
-rw-r--r--  src/sync/atomic/asm_power64x.s  184
-rw-r--r--  src/sync/atomic/atomic_linux_arm_test.go  14
-rw-r--r--  src/sync/atomic/atomic_test.go  1509
-rw-r--r--  src/sync/atomic/doc.go  149
-rw-r--r--  src/sync/atomic/export_linux_arm_test.go  9
-rw-r--r--  src/sync/atomic/race.s  8
-rw-r--r--  src/sync/cond.go  118
-rw-r--r--  src/sync/cond_test.go  255
-rw-r--r--  src/sync/example_test.go  59
-rw-r--r--  src/sync/export_test.go  9
-rw-r--r--  src/sync/mutex.go  109
-rw-r--r--  src/sync/mutex_test.go  136
-rw-r--r--  src/sync/once.go  43
-rw-r--r--  src/sync/once_test.go  70
-rw-r--r--  src/sync/pool.go  223
-rw-r--r--  src/sync/pool_test.go  151
-rw-r--r--  src/sync/race.go  42
-rw-r--r--  src/sync/race0.go  34
-rw-r--r--  src/sync/runtime.go  40
-rw-r--r--  src/sync/runtime_sema_test.go  72
-rw-r--r--  src/sync/rwmutex.go  136
-rw-r--r--  src/sync/rwmutex_test.go  254
-rw-r--r--  src/sync/waitgroup.go  137
-rw-r--r--  src/sync/waitgroup_test.go  148
33 files changed, 5214 insertions, 0 deletions
diff --git a/src/sync/atomic/64bit_arm.go b/src/sync/atomic/64bit_arm.go
new file mode 100644
index 000000000..c08f214c7
--- /dev/null
+++ b/src/sync/atomic/64bit_arm.go
@@ -0,0 +1,46 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomic
+
+func loadUint64(addr *uint64) (val uint64) {
+	for {
+		val = *addr
+		if CompareAndSwapUint64(addr, val, val) {
+			break
+		}
+	}
+	return
+}
+
+func storeUint64(addr *uint64, val uint64) {
+	for {
+		old := *addr
+		if CompareAndSwapUint64(addr, old, val) {
+			break
+		}
+	}
+	return
+}
+
+func addUint64(val *uint64, delta uint64) (new uint64) {
+	for {
+		old := *val
+		new = old + delta
+		if CompareAndSwapUint64(val, old, new) {
+			break
+		}
+	}
+	return
+}
+
+func swapUint64(addr *uint64, new uint64) (old uint64) {
+	for {
+		old = *addr
+		if CompareAndSwapUint64(addr, old, new) {
+			break
+		}
+	}
+	return
+}
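
The file above builds the 64-bit Load, Store, Add, and Swap operations for ARM out of nothing but CompareAndSwapUint64 retry loops. The same retry pattern is available to ordinary callers of sync/atomic; a minimal sketch, using a hypothetical storeMax helper that is not part of this commit:

package main

import (
	"fmt"
	"sync/atomic"
)

// storeMax raises *addr to v if v is larger, retrying on contention,
// the same CAS-loop shape as the helpers in 64bit_arm.go.
func storeMax(addr *uint64, v uint64) {
	for {
		old := atomic.LoadUint64(addr)
		if old >= v || atomic.CompareAndSwapUint64(addr, old, v) {
			return
		}
	}
}

func main() {
	var max uint64
	storeMax(&max, 7)
	storeMax(&max, 3)
	fmt.Println(max) // prints 7
}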
diff --git a/src/sync/atomic/asm_386.s b/src/sync/atomic/asm_386.s
new file mode 100644
index 000000000..740dfe76b
--- /dev/null
+++ b/src/sync/atomic/asm_386.s
@@ -0,0 +1,214 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !race
+
+#include "textflag.h"
+
+TEXT ·SwapInt32(SB),NOSPLIT,$0-12
+ JMP ·SwapUint32(SB)
+
+TEXT ·SwapUint32(SB),NOSPLIT,$0-12
+ MOVL addr+0(FP), BP
+ MOVL new+4(FP), AX
+ XCHGL AX, 0(BP)
+ MOVL AX, old+8(FP)
+ RET
+
+TEXT ·SwapInt64(SB),NOSPLIT,$0-20
+ JMP ·SwapUint64(SB)
+
+TEXT ·SwapUint64(SB),NOSPLIT,$0-20
+ // no XCHGQ so use CMPXCHG8B loop
+ MOVL addr+0(FP), BP
+ TESTL $7, BP
+ JZ 2(PC)
+ MOVL 0, AX // crash with nil ptr deref
+ // CX:BX = new
+ MOVL new_lo+4(FP), BX
+ MOVL new_hi+8(FP), CX
+ // DX:AX = *addr
+ MOVL 0(BP), AX
+ MOVL 4(BP), DX
+swaploop:
+ // if *addr == DX:AX
+ // *addr = CX:BX
+ // else
+ // DX:AX = *addr
+ // all in one instruction
+ LOCK
+ CMPXCHG8B 0(BP)
+ JNZ swaploop
+
+ // success
+ // return DX:AX
+ MOVL AX, old_lo+12(FP)
+ MOVL DX, old_hi+16(FP)
+ RET
+
+TEXT ·SwapUintptr(SB),NOSPLIT,$0-12
+ JMP ·SwapUint32(SB)
+
+TEXT ·SwapPointer(SB),NOSPLIT,$0-12
+ JMP ·SwapUint32(SB)
+
+TEXT ·CompareAndSwapInt32(SB),NOSPLIT,$0-13
+ JMP ·CompareAndSwapUint32(SB)
+
+TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0-13
+ MOVL addr+0(FP), BP
+ MOVL old+4(FP), AX
+ MOVL new+8(FP), CX
+ // CMPXCHGL was introduced on the 486.
+ LOCK
+ CMPXCHGL CX, 0(BP)
+ SETEQ swapped+12(FP)
+ RET
+
+TEXT ·CompareAndSwapUintptr(SB),NOSPLIT,$0-13
+ JMP ·CompareAndSwapUint32(SB)
+
+TEXT ·CompareAndSwapPointer(SB),NOSPLIT,$0-13
+ JMP ·CompareAndSwapUint32(SB)
+
+TEXT ·CompareAndSwapInt64(SB),NOSPLIT,$0-21
+ JMP ·CompareAndSwapUint64(SB)
+
+TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$0-21
+ MOVL addr+0(FP), BP
+ TESTL $7, BP
+ JZ 2(PC)
+ MOVL 0, AX // crash with nil ptr deref
+ MOVL old_lo+4(FP), AX
+ MOVL old_hi+8(FP), DX
+ MOVL new_lo+12(FP), BX
+ MOVL new_hi+16(FP), CX
+ // CMPXCHG8B was introduced on the Pentium.
+ LOCK
+ CMPXCHG8B 0(BP)
+ SETEQ swapped+20(FP)
+ RET
+
+TEXT ·AddInt32(SB),NOSPLIT,$0-12
+ JMP ·AddUint32(SB)
+
+TEXT ·AddUint32(SB),NOSPLIT,$0-12
+ MOVL addr+0(FP), BP
+ MOVL delta+4(FP), AX
+ MOVL AX, CX
+ // XADD was introduced on the 486.
+ LOCK
+ XADDL AX, 0(BP)
+ ADDL AX, CX
+ MOVL CX, new+8(FP)
+ RET
+
+TEXT ·AddUintptr(SB),NOSPLIT,$0-12
+ JMP ·AddUint32(SB)
+
+TEXT ·AddInt64(SB),NOSPLIT,$0-20
+ JMP ·AddUint64(SB)
+
+TEXT ·AddUint64(SB),NOSPLIT,$0-20
+ // no XADDQ so use CMPXCHG8B loop
+ MOVL addr+0(FP), BP
+ TESTL $7, BP
+ JZ 2(PC)
+ MOVL 0, AX // crash with nil ptr deref
+ // DI:SI = delta
+ MOVL delta_lo+4(FP), SI
+ MOVL delta_hi+8(FP), DI
+ // DX:AX = *addr
+ MOVL 0(BP), AX
+ MOVL 4(BP), DX
+addloop:
+ // CX:BX = DX:AX (*addr) + DI:SI (delta)
+ MOVL AX, BX
+ MOVL DX, CX
+ ADDL SI, BX
+ ADCL DI, CX
+
+ // if *addr == DX:AX {
+ // *addr = CX:BX
+ // } else {
+ // DX:AX = *addr
+ // }
+ // all in one instruction
+ LOCK
+ CMPXCHG8B 0(BP)
+
+ JNZ addloop
+
+ // success
+ // return CX:BX
+ MOVL BX, new_lo+12(FP)
+ MOVL CX, new_hi+16(FP)
+ RET
+
+TEXT ·LoadInt32(SB),NOSPLIT,$0-8
+ JMP ·LoadUint32(SB)
+
+TEXT ·LoadUint32(SB),NOSPLIT,$0-8
+ MOVL addr+0(FP), AX
+ MOVL 0(AX), AX
+ MOVL AX, val+4(FP)
+ RET
+
+TEXT ·LoadInt64(SB),NOSPLIT,$0-12
+ JMP ·LoadUint64(SB)
+
+TEXT ·LoadUint64(SB),NOSPLIT,$0-12
+ MOVL addr+0(FP), AX
+ TESTL $7, AX
+ JZ 2(PC)
+ MOVL 0, AX // crash with nil ptr deref
+ // MOVQ and EMMS were introduced on the Pentium MMX.
+ // MOVQ (%EAX), %MM0
+ BYTE $0x0f; BYTE $0x6f; BYTE $0x00
+ // MOVQ %MM0, 0x8(%ESP)
+ BYTE $0x0f; BYTE $0x7f; BYTE $0x44; BYTE $0x24; BYTE $0x08
+ EMMS
+ RET
+
+TEXT ·LoadUintptr(SB),NOSPLIT,$0-8
+ JMP ·LoadUint32(SB)
+
+TEXT ·LoadPointer(SB),NOSPLIT,$0-8
+ JMP ·LoadUint32(SB)
+
+TEXT ·StoreInt32(SB),NOSPLIT,$0-8
+ JMP ·StoreUint32(SB)
+
+TEXT ·StoreUint32(SB),NOSPLIT,$0-8
+ MOVL addr+0(FP), BP
+ MOVL val+4(FP), AX
+ XCHGL AX, 0(BP)
+ RET
+
+TEXT ·StoreInt64(SB),NOSPLIT,$0-12
+ JMP ·StoreUint64(SB)
+
+TEXT ·StoreUint64(SB),NOSPLIT,$0-12
+ MOVL addr+0(FP), AX
+ TESTL $7, AX
+ JZ 2(PC)
+ MOVL 0, AX // crash with nil ptr deref
+ // MOVQ and EMMS were introduced on the Pentium MMX.
+ // MOVQ 0x8(%ESP), %MM0
+ BYTE $0x0f; BYTE $0x6f; BYTE $0x44; BYTE $0x24; BYTE $0x08
+ // MOVQ %MM0, (%EAX)
+ BYTE $0x0f; BYTE $0x7f; BYTE $0x00
+ EMMS
+ // This is essentially a no-op, but it provides required memory fencing.
+ // It can be replaced with MFENCE, but MFENCE was introduced only on the Pentium4 (SSE2).
+ XORL AX, AX
+ LOCK
+ XADDL AX, (SP)
+ RET
+
+TEXT ·StoreUintptr(SB),NOSPLIT,$0-8
+ JMP ·StoreUint32(SB)
+
+TEXT ·StorePointer(SB),NOSPLIT,$0-8
+ JMP ·StoreUint32(SB)
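
The TESTL $7 checks above make every 64-bit operation fault on an address that is not 8-byte aligned. The package's doc.go (also in this diffstat) therefore tells callers on 32-bit platforms to arrange 64-bit alignment themselves, typically by making the 64-bit word the first field of an allocated struct; an illustrative sketch:

package main

import "sync/atomic"

// counters keeps its uint64 first so it is 64-bit aligned even on
// 32-bit platforms; otherwise the TESTL $7 check in asm_386.s would
// crash the program with a nil-pointer fault.
type counters struct {
	hits  uint64 // first word of the allocated struct: 64-bit aligned
	flags uint32
}

func main() {
	c := new(counters) // allocated struct: its first word is 64-bit aligned
	atomic.AddUint64(&c.hits, 1)
	_ = atomic.LoadUint64(&c.hits)
}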
diff --git a/src/sync/atomic/asm_amd64.s b/src/sync/atomic/asm_amd64.s
new file mode 100644
index 000000000..6e53ebedd
--- /dev/null
+++ b/src/sync/atomic/asm_amd64.s
@@ -0,0 +1,146 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !race
+
+#include "textflag.h"
+
+TEXT ·SwapInt32(SB),NOSPLIT,$0-20
+ JMP ·SwapUint32(SB)
+
+TEXT ·SwapUint32(SB),NOSPLIT,$0-20
+ MOVQ addr+0(FP), BP
+ MOVL new+8(FP), AX
+ XCHGL AX, 0(BP)
+ MOVL AX, old+16(FP)
+ RET
+
+TEXT ·SwapInt64(SB),NOSPLIT,$0-24
+ JMP ·SwapUint64(SB)
+
+TEXT ·SwapUint64(SB),NOSPLIT,$0-24
+ MOVQ addr+0(FP), BP
+ MOVQ new+8(FP), AX
+ XCHGQ AX, 0(BP)
+ MOVQ AX, old+16(FP)
+ RET
+
+TEXT ·SwapUintptr(SB),NOSPLIT,$0-24
+ JMP ·SwapUint64(SB)
+
+TEXT ·SwapPointer(SB),NOSPLIT,$0-24
+ JMP ·SwapUint64(SB)
+
+TEXT ·CompareAndSwapInt32(SB),NOSPLIT,$0-17
+ JMP ·CompareAndSwapUint32(SB)
+
+TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0-17
+ MOVQ addr+0(FP), BP
+ MOVL old+8(FP), AX
+ MOVL new+12(FP), CX
+ LOCK
+ CMPXCHGL CX, 0(BP)
+ SETEQ swapped+16(FP)
+ RET
+
+TEXT ·CompareAndSwapUintptr(SB),NOSPLIT,$0-25
+ JMP ·CompareAndSwapUint64(SB)
+
+TEXT ·CompareAndSwapPointer(SB),NOSPLIT,$0-25
+ JMP ·CompareAndSwapUint64(SB)
+
+TEXT ·CompareAndSwapInt64(SB),NOSPLIT,$0-25
+ JMP ·CompareAndSwapUint64(SB)
+
+TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$0-25
+ MOVQ addr+0(FP), BP
+ MOVQ old+8(FP), AX
+ MOVQ new+16(FP), CX
+ LOCK
+ CMPXCHGQ CX, 0(BP)
+ SETEQ swapped+24(FP)
+ RET
+
+TEXT ·AddInt32(SB),NOSPLIT,$0-20
+ JMP ·AddUint32(SB)
+
+TEXT ·AddUint32(SB),NOSPLIT,$0-20
+ MOVQ addr+0(FP), BP
+ MOVL delta+8(FP), AX
+ MOVL AX, CX
+ LOCK
+ XADDL AX, 0(BP)
+ ADDL AX, CX
+ MOVL CX, new+16(FP)
+ RET
+
+TEXT ·AddUintptr(SB),NOSPLIT,$0-24
+ JMP ·AddUint64(SB)
+
+TEXT ·AddInt64(SB),NOSPLIT,$0-24
+ JMP ·AddUint64(SB)
+
+TEXT ·AddUint64(SB),NOSPLIT,$0-24
+ MOVQ addr+0(FP), BP
+ MOVQ delta+8(FP), AX
+ MOVQ AX, CX
+ LOCK
+ XADDQ AX, 0(BP)
+ ADDQ AX, CX
+ MOVQ CX, new+16(FP)
+ RET
+
+TEXT ·LoadInt32(SB),NOSPLIT,$0-12
+ JMP ·LoadUint32(SB)
+
+TEXT ·LoadUint32(SB),NOSPLIT,$0-12
+ MOVQ addr+0(FP), AX
+ MOVL 0(AX), AX
+ MOVL AX, val+8(FP)
+ RET
+
+TEXT ·LoadInt64(SB),NOSPLIT,$0-16
+ JMP ·LoadUint64(SB)
+
+TEXT ·LoadUint64(SB),NOSPLIT,$0-16
+ MOVQ addr+0(FP), AX
+ MOVQ 0(AX), AX
+ MOVQ AX, val+8(FP)
+ RET
+
+TEXT ·LoadUintptr(SB),NOSPLIT,$0-16
+ JMP ·LoadPointer(SB)
+
+TEXT ·LoadPointer(SB),NOSPLIT,$0-16
+ MOVQ addr+0(FP), AX
+ MOVQ 0(AX), AX
+ MOVQ AX, val+8(FP)
+ RET
+
+TEXT ·StoreInt32(SB),NOSPLIT,$0-12
+ JMP ·StoreUint32(SB)
+
+TEXT ·StoreUint32(SB),NOSPLIT,$0-12
+ MOVQ addr+0(FP), BP
+ MOVL val+8(FP), AX
+ XCHGL AX, 0(BP)
+ RET
+
+TEXT ·StoreInt64(SB),NOSPLIT,$0-16
+ JMP ·StoreUint64(SB)
+
+TEXT ·StoreUint64(SB),NOSPLIT,$0-16
+ MOVQ addr+0(FP), BP
+ MOVQ val+8(FP), AX
+ XCHGQ AX, 0(BP)
+ RET
+
+TEXT ·StoreUintptr(SB),NOSPLIT,$0-16
+ JMP ·StorePointer(SB)
+
+TEXT ·StorePointer(SB),NOSPLIT,$0-16
+ MOVQ addr+0(FP), BP
+ MOVQ val+8(FP), AX
+ XCHGQ AX, 0(BP)
+ RET
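
On amd64 no retry loop is needed: AddUint32 and AddUint64 are a single LOCK XADD followed by an add that reconstructs the post-increment value, and that value is what the function returns. A typical use, with a hypothetical newID counter:

package main

import (
	"fmt"
	"sync/atomic"
)

var nextID uint64

// newID relies on AddUint64 returning the value after the addition,
// matching the XADDQ/ADDQ sequence in AddUint64 above.
func newID() uint64 {
	return atomic.AddUint64(&nextID, 1)
}

func main() {
	fmt.Println(newID(), newID(), newID()) // 1 2 3
}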
diff --git a/src/sync/atomic/asm_amd64p32.s b/src/sync/atomic/asm_amd64p32.s
new file mode 100644
index 000000000..d77cc2c08
--- /dev/null
+++ b/src/sync/atomic/asm_amd64p32.s
@@ -0,0 +1,159 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT ·SwapInt32(SB),NOSPLIT,$0-12
+ JMP ·SwapUint32(SB)
+
+TEXT ·SwapUint32(SB),NOSPLIT,$0-12
+ MOVL addr+0(FP), BX
+ MOVL new+4(FP), AX
+ XCHGL AX, 0(BX)
+ MOVL AX, old+8(FP)
+ RET
+
+TEXT ·SwapInt64(SB),NOSPLIT,$0-24
+ JMP ·SwapUint64(SB)
+
+TEXT ·SwapUint64(SB),NOSPLIT,$0-24
+ MOVL addr+0(FP), BX
+ TESTL $7, BX
+ JZ 2(PC)
+ MOVL 0, BX // crash with nil ptr deref
+ MOVQ new+8(FP), AX
+ XCHGQ AX, 0(BX)
+ MOVQ AX, old+16(FP)
+ RET
+
+TEXT ·SwapUintptr(SB),NOSPLIT,$0-12
+ JMP ·SwapUint32(SB)
+
+TEXT ·SwapPointer(SB),NOSPLIT,$0-12
+ JMP ·SwapUint32(SB)
+
+TEXT ·CompareAndSwapInt32(SB),NOSPLIT,$0-17
+ JMP ·CompareAndSwapUint32(SB)
+
+TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0-17
+ MOVL addr+0(FP), BX
+ MOVL old+4(FP), AX
+ MOVL new+8(FP), CX
+ LOCK
+ CMPXCHGL CX, 0(BX)
+ SETEQ swapped+16(FP)
+ RET
+
+TEXT ·CompareAndSwapUintptr(SB),NOSPLIT,$0-17
+ JMP ·CompareAndSwapUint32(SB)
+
+TEXT ·CompareAndSwapPointer(SB),NOSPLIT,$0-17
+ JMP ·CompareAndSwapUint32(SB)
+
+TEXT ·CompareAndSwapInt64(SB),NOSPLIT,$0-25
+ JMP ·CompareAndSwapUint64(SB)
+
+TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$0-25
+ MOVL addr+0(FP), BX
+ TESTL $7, BX
+ JZ 2(PC)
+ MOVL 0, BX // crash with nil ptr deref
+ MOVQ old+8(FP), AX
+ MOVQ new+16(FP), CX
+ LOCK
+ CMPXCHGQ CX, 0(BX)
+ SETEQ swapped+24(FP)
+ RET
+
+TEXT ·AddInt32(SB),NOSPLIT,$0-12
+ JMP ·AddUint32(SB)
+
+TEXT ·AddUint32(SB),NOSPLIT,$0-12
+ MOVL addr+0(FP), BX
+ MOVL delta+4(FP), AX
+ MOVL AX, CX
+ LOCK
+ XADDL AX, 0(BX)
+ ADDL AX, CX
+ MOVL CX, new+8(FP)
+ RET
+
+TEXT ·AddUintptr(SB),NOSPLIT,$0-12
+ JMP ·AddUint32(SB)
+
+TEXT ·AddInt64(SB),NOSPLIT,$0-24
+ JMP ·AddUint64(SB)
+
+TEXT ·AddUint64(SB),NOSPLIT,$0-24
+ MOVL addr+0(FP), BX
+ TESTL $7, BX
+ JZ 2(PC)
+ MOVL 0, BX // crash with nil ptr deref
+ MOVQ delta+8(FP), AX
+ MOVQ AX, CX
+ LOCK
+ XADDQ AX, 0(BX)
+ ADDQ AX, CX
+ MOVQ CX, new+16(FP)
+ RET
+
+TEXT ·LoadInt32(SB),NOSPLIT,$0-12
+ JMP ·LoadUint32(SB)
+
+TEXT ·LoadUint32(SB),NOSPLIT,$0-12
+ MOVL addr+0(FP), AX
+ MOVL 0(AX), AX
+ MOVL AX, val+8(FP)
+ RET
+
+TEXT ·LoadInt64(SB),NOSPLIT,$0-16
+ JMP ·LoadUint64(SB)
+
+TEXT ·LoadUint64(SB),NOSPLIT,$0-16
+ MOVL addr+0(FP), AX
+ TESTL $7, AX
+ JZ 2(PC)
+ MOVL 0, AX // crash with nil ptr deref
+ MOVQ 0(AX), AX
+ MOVQ AX, val+8(FP)
+ RET
+
+TEXT ·LoadUintptr(SB),NOSPLIT,$0-12
+ JMP ·LoadPointer(SB)
+
+TEXT ·LoadPointer(SB),NOSPLIT,$0-12
+ MOVL addr+0(FP), AX
+ MOVL 0(AX), AX
+ MOVL AX, val+8(FP)
+ RET
+
+TEXT ·StoreInt32(SB),NOSPLIT,$0-8
+ JMP ·StoreUint32(SB)
+
+TEXT ·StoreUint32(SB),NOSPLIT,$0-8
+ MOVL addr+0(FP), BX
+ MOVL val+4(FP), AX
+ XCHGL AX, 0(BX)
+ RET
+
+TEXT ·StoreInt64(SB),NOSPLIT,$0-16
+ JMP ·StoreUint64(SB)
+
+TEXT ·StoreUint64(SB),NOSPLIT,$0-16
+ MOVL addr+0(FP), BX
+ TESTL $7, BX
+ JZ 2(PC)
+ MOVL 0, BX // crash with nil ptr deref
+ MOVQ val+8(FP), AX
+ XCHGQ AX, 0(BX)
+ RET
+
+TEXT ·StoreUintptr(SB),NOSPLIT,$0-8
+ JMP ·StorePointer(SB)
+
+TEXT ·StorePointer(SB),NOSPLIT,$0-8
+ MOVL addr+0(FP), BX
+ MOVL val+4(FP), AX
+ XCHGL AX, 0(BX)
+ RET
diff --git a/src/sync/atomic/asm_arm.s b/src/sync/atomic/asm_arm.s
new file mode 100644
index 000000000..47639a80e
--- /dev/null
+++ b/src/sync/atomic/asm_arm.s
@@ -0,0 +1,197 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !race
+
+#include "textflag.h"
+
+// ARM atomic operations, for use by asm_$(GOOS)_arm.s.
+
+TEXT ·armCompareAndSwapUint32(SB),NOSPLIT,$0-13
+ MOVW addr+0(FP), R1
+ MOVW old+4(FP), R2
+ MOVW new+8(FP), R3
+casloop:
+ // LDREX and STREX were introduced in ARMv6.
+ LDREX (R1), R0
+ CMP R0, R2
+ BNE casfail
+ STREX R3, (R1), R0
+ CMP $0, R0
+ BNE casloop
+ MOVW $1, R0
+ MOVBU R0, ret+12(FP)
+ RET
+casfail:
+ MOVW $0, R0
+ MOVBU R0, ret+12(FP)
+ RET
+
+TEXT ·armCompareAndSwapUint64(SB),NOSPLIT,$0-21
+ BL fastCheck64<>(SB)
+ MOVW addr+0(FP), R1
+ // make unaligned atomic access panic
+ AND.S $7, R1, R2
+ BEQ 2(PC)
+ MOVW R2, (R2)
+ MOVW oldlo+4(FP), R2
+ MOVW oldhi+8(FP), R3
+ MOVW newlo+12(FP), R4
+ MOVW newhi+16(FP), R5
+cas64loop:
+ // LDREXD and STREXD were introduced in ARMv6k.
+ LDREXD (R1), R6 // loads R6 and R7
+ CMP R2, R6
+ BNE cas64fail
+ CMP R3, R7
+ BNE cas64fail
+ STREXD R4, (R1), R0 // stores R4 and R5
+ CMP $0, R0
+ BNE cas64loop
+ MOVW $1, R0
+ MOVBU R0, ret+20(FP)
+ RET
+cas64fail:
+ MOVW $0, R0
+ MOVBU R0, ret+20(FP)
+ RET
+
+TEXT ·armAddUint32(SB),NOSPLIT,$0-12
+ MOVW addr+0(FP), R1
+ MOVW delta+4(FP), R2
+addloop:
+ // LDREX and STREX were introduced in ARMv6.
+ LDREX (R1), R3
+ ADD R2, R3
+ STREX R3, (R1), R0
+ CMP $0, R0
+ BNE addloop
+ MOVW R3, ret+8(FP)
+ RET
+
+TEXT ·armAddUint64(SB),NOSPLIT,$0-20
+ BL fastCheck64<>(SB)
+ MOVW addr+0(FP), R1
+ // make unaligned atomic access panic
+ AND.S $7, R1, R2
+ BEQ 2(PC)
+ MOVW R2, (R2)
+ MOVW deltalo+4(FP), R2
+ MOVW deltahi+8(FP), R3
+add64loop:
+ // LDREXD and STREXD were introduced in ARMv6k.
+ LDREXD (R1), R4 // loads R4 and R5
+ ADD.S R2, R4
+ ADC R3, R5
+ STREXD R4, (R1), R0 // stores R4 and R5
+ CMP $0, R0
+ BNE add64loop
+ MOVW R4, retlo+12(FP)
+ MOVW R5, rethi+16(FP)
+ RET
+
+TEXT ·armSwapUint32(SB),NOSPLIT,$0-12
+ MOVW addr+0(FP), R1
+ MOVW new+4(FP), R2
+swaploop:
+ // LDREX and STREX were introduced in ARMv6.
+ LDREX (R1), R3
+ STREX R2, (R1), R0
+ CMP $0, R0
+ BNE swaploop
+ MOVW R3, old+8(FP)
+ RET
+
+TEXT ·armSwapUint64(SB),NOSPLIT,$0-20
+ BL fastCheck64<>(SB)
+ MOVW addr+0(FP), R1
+ // make unaligned atomic access panic
+ AND.S $7, R1, R2
+ BEQ 2(PC)
+ MOVW R2, (R2)
+ MOVW newlo+4(FP), R2
+ MOVW newhi+8(FP), R3
+swap64loop:
+ // LDREXD and STREXD were introduced in ARMv6k.
+ LDREXD (R1), R4 // loads R4 and R5
+ STREXD R2, (R1), R0 // stores R2 and R3
+ CMP $0, R0
+ BNE swap64loop
+ MOVW R4, oldlo+12(FP)
+ MOVW R5, oldhi+16(FP)
+ RET
+
+TEXT ·armLoadUint64(SB),NOSPLIT,$0-12
+ BL fastCheck64<>(SB)
+ MOVW addr+0(FP), R1
+ // make unaligned atomic access panic
+ AND.S $7, R1, R2
+ BEQ 2(PC)
+ MOVW R2, (R2)
+load64loop:
+ LDREXD (R1), R2 // loads R2 and R3
+ STREXD R2, (R1), R0 // stores R2 and R3
+ CMP $0, R0
+ BNE load64loop
+ MOVW R2, vallo+4(FP)
+ MOVW R3, valhi+8(FP)
+ RET
+
+TEXT ·armStoreUint64(SB),NOSPLIT,$0-12
+ BL fastCheck64<>(SB)
+ MOVW addr+0(FP), R1
+ // make unaligned atomic access panic
+ AND.S $7, R1, R2
+ BEQ 2(PC)
+ MOVW R2, (R2)
+ MOVW vallo+4(FP), R2
+ MOVW valhi+8(FP), R3
+store64loop:
+ LDREXD (R1), R4 // loads R4 and R5
+ STREXD R2, (R1), R0 // stores R2 and R3
+ CMP $0, R0
+ BNE store64loop
+ RET
+
+// Check for broken 64-bit LDREXD as found in QEMU.
+// LDREXD followed by immediate STREXD should succeed.
+// If it fails, try a few times just to be sure (maybe our thread got
+// rescheduled between the two instructions) and then panic.
+// A bug in some copies of QEMU makes STREXD never succeed,
+// which will make uses of the 64-bit atomic operations loop forever.
+// If things are working, set okLDREXD to avoid future checks.
+// https://bugs.launchpad.net/qemu/+bug/670883.
+TEXT check64<>(SB),NOSPLIT,$16-0
+ MOVW $10, R1
+ // 8-aligned stack address scratch space.
+ MOVW $8(R13), R5
+ AND $~7, R5
+loop:
+ LDREXD (R5), R2
+ STREXD R2, (R5), R0
+ CMP $0, R0
+ BEQ ok
+ SUB $1, R1
+ CMP $0, R1
+ BNE loop
+ // Must be buggy QEMU.
+ BL ·panic64(SB)
+ok:
+ RET
+
+// Fast, cached version of check. No frame, just MOVW CMP RET after first time.
+TEXT fastCheck64<>(SB),NOSPLIT,$-4
+ MOVW ok64<>(SB), R0
+ CMP $0, R0 // have we been here before?
+ RET.NE
+ B slowCheck64<>(SB)
+
+TEXT slowCheck64<>(SB),NOSPLIT,$0-0
+ BL check64<>(SB)
+ // Still here, must be okay.
+ MOVW $1, R0
+ MOVW R0, ok64<>(SB)
+ RET
+
+GLOBL ok64<>(SB), $4
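
check64, fastCheck64, and slowCheck64 above implement a probe-once-and-cache test for the QEMU STREXD bug. In ordinary Go the same shape is usually written with sync.Once; a rough sketch with a placeholder probe, not code from this commit:

package main

import (
	"fmt"
	"sync"
)

var (
	checkOnce sync.Once
	ok64      bool
)

// probe64 stands in for check64<>: try LDREXD/STREXD a few times and
// report whether the store-exclusive ever succeeded.
func probe64() bool { return true }

// have64 caches the probe result, much as fastCheck64 caches in ok64<>.
func have64() bool {
	checkOnce.Do(func() { ok64 = probe64() })
	return ok64
}

func main() {
	fmt.Println(have64())
}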
diff --git a/src/sync/atomic/asm_freebsd_arm.s b/src/sync/atomic/asm_freebsd_arm.s
new file mode 100644
index 000000000..06b975e89
--- /dev/null
+++ b/src/sync/atomic/asm_freebsd_arm.s
@@ -0,0 +1,109 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// FreeBSD/ARM atomic operations.
+// TODO(minux): this only supports ARMv6K or higher.
+
+TEXT ·CompareAndSwapInt32(SB),NOSPLIT,$0
+ B ·CompareAndSwapUint32(SB)
+
+TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0
+ B ·armCompareAndSwapUint32(SB)
+
+TEXT ·CompareAndSwapUintptr(SB),NOSPLIT,$0
+ B ·CompareAndSwapUint32(SB)
+
+TEXT ·CompareAndSwapPointer(SB),NOSPLIT,$0
+ B ·CompareAndSwapUint32(SB)
+
+TEXT ·AddInt32(SB),NOSPLIT,$0
+ B ·AddUint32(SB)
+
+TEXT ·AddUint32(SB),NOSPLIT,$0
+ B ·armAddUint32(SB)
+
+TEXT ·AddUintptr(SB),NOSPLIT,$0
+ B ·AddUint32(SB)
+
+TEXT ·SwapInt32(SB),NOSPLIT,$0
+ B ·SwapUint32(SB)
+
+TEXT ·SwapUint32(SB),NOSPLIT,$0
+ B ·armSwapUint32(SB)
+
+TEXT ·SwapUintptr(SB),NOSPLIT,$0
+ B ·SwapUint32(SB)
+
+TEXT ·SwapPointer(SB),NOSPLIT,$0
+ B ·SwapUint32(SB)
+
+TEXT ·CompareAndSwapInt64(SB),NOSPLIT,$0
+ B ·CompareAndSwapUint64(SB)
+
+TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$-4
+ B ·armCompareAndSwapUint64(SB)
+
+TEXT ·AddInt64(SB),NOSPLIT,$0
+ B ·addUint64(SB)
+
+TEXT ·AddUint64(SB),NOSPLIT,$0
+ B ·addUint64(SB)
+
+TEXT ·SwapInt64(SB),NOSPLIT,$0
+ B ·swapUint64(SB)
+
+TEXT ·SwapUint64(SB),NOSPLIT,$0
+ B ·swapUint64(SB)
+
+TEXT ·LoadInt32(SB),NOSPLIT,$0
+ B ·LoadUint32(SB)
+
+TEXT ·LoadUint32(SB),NOSPLIT,$0-8
+ MOVW addr+0(FP), R1
+load32loop:
+ LDREX (R1), R2 // loads R2
+ STREX R2, (R1), R0 // stores R2
+ CMP $0, R0
+ BNE load32loop
+ MOVW R2, val+4(FP)
+ RET
+
+TEXT ·LoadInt64(SB),NOSPLIT,$0
+ B ·loadUint64(SB)
+
+TEXT ·LoadUint64(SB),NOSPLIT,$0
+ B ·loadUint64(SB)
+
+TEXT ·LoadUintptr(SB),NOSPLIT,$0
+ B ·LoadUint32(SB)
+
+TEXT ·LoadPointer(SB),NOSPLIT,$0
+ B ·LoadUint32(SB)
+
+TEXT ·StoreInt32(SB),NOSPLIT,$0
+ B ·StoreUint32(SB)
+
+TEXT ·StoreUint32(SB),NOSPLIT,$0-8
+ MOVW addr+0(FP), R1
+ MOVW val+4(FP), R2
+storeloop:
+ LDREX (R1), R4 // loads R4
+ STREX R2, (R1), R0 // stores R2
+ CMP $0, R0
+ BNE storeloop
+ RET
+
+TEXT ·StoreInt64(SB),NOSPLIT,$0
+ B ·storeUint64(SB)
+
+TEXT ·StoreUint64(SB),NOSPLIT,$0
+ B ·storeUint64(SB)
+
+TEXT ·StoreUintptr(SB),NOSPLIT,$0
+ B ·StoreUint32(SB)
+
+TEXT ·StorePointer(SB),NOSPLIT,$0
+ B ·StoreUint32(SB)
diff --git a/src/sync/atomic/asm_linux_arm.s b/src/sync/atomic/asm_linux_arm.s
new file mode 100644
index 000000000..63f1f9e38
--- /dev/null
+++ b/src/sync/atomic/asm_linux_arm.s
@@ -0,0 +1,216 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !race
+
+#include "textflag.h"
+
+// Linux/ARM atomic operations.
+
+// Because there is so much variation in ARM devices,
+// the Linux kernel provides an appropriate compare-and-swap
+// implementation at address 0xffff0fc0. Caller sets:
+// R0 = old value
+// R1 = new value
+// R2 = addr
+// LR = return address
+// The function returns with CS true if the swap happened.
+// http://lxr.linux.no/linux+v2.6.37.2/arch/arm/kernel/entry-armv.S#L850
+// On older kernels (before 2.6.24) the function can incorrectly
+// report a conflict, so we have to double-check the compare ourselves
+// and retry if necessary.
+//
+// http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=b49c0f24cf6744a3f4fd09289fe7cade349dead5
+//
+TEXT cas<>(SB),NOSPLIT,$0
+ MOVW $0xffff0fc0, PC
+
+TEXT ·CompareAndSwapInt32(SB),NOSPLIT,$0
+ B ·CompareAndSwapUint32(SB)
+
+// Implement using kernel cas for portability.
+TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0-13
+ MOVW addr+0(FP), R2
+ // trigger potential paging fault here,
+ // because we don't know how to traceback through __kuser_cmpxchg
+ MOVW (R2), R0
+ MOVW old+4(FP), R0
+casagain:
+ MOVW new+8(FP), R1
+ BL cas<>(SB)
+ BCC cascheck
+ MOVW $1, R0
+casret:
+ MOVB R0, swapped+12(FP)
+ RET
+cascheck:
+ // Kernel lies; double-check.
+ MOVW addr+0(FP), R2
+ MOVW old+4(FP), R0
+ MOVW 0(R2), R3
+ CMP R0, R3
+ BEQ casagain
+ MOVW $0, R0
+ B casret
+
+TEXT ·CompareAndSwapUintptr(SB),NOSPLIT,$0
+ B ·CompareAndSwapUint32(SB)
+
+TEXT ·CompareAndSwapPointer(SB),NOSPLIT,$0
+ B ·CompareAndSwapUint32(SB)
+
+TEXT ·AddInt32(SB),NOSPLIT,$0
+ B ·AddUint32(SB)
+
+// Implement using kernel cas for portability.
+TEXT ·AddUint32(SB),NOSPLIT,$0-12
+ MOVW addr+0(FP), R2
+ MOVW delta+4(FP), R4
+addloop1:
+ MOVW 0(R2), R0
+ MOVW R0, R1
+ ADD R4, R1
+ BL cas<>(SB)
+ BCC addloop1
+ MOVW R1, new+8(FP)
+ RET
+
+TEXT ·AddUintptr(SB),NOSPLIT,$0
+ B ·AddUint32(SB)
+
+TEXT ·SwapInt32(SB),NOSPLIT,$0
+ B ·SwapUint32(SB)
+
+// Implement using kernel cas for portability.
+TEXT ·SwapUint32(SB),NOSPLIT,$0-12
+ MOVW addr+0(FP), R2
+ MOVW new+4(FP), R1
+swaploop1:
+ MOVW 0(R2), R0
+ MOVW R0, R4 // cas smashes R0
+ BL cas<>(SB)
+ BCC swaploop1
+ MOVW R4, old+8(FP)
+ RET
+
+TEXT ·SwapUintptr(SB),NOSPLIT,$0
+ B ·SwapUint32(SB)
+
+TEXT ·SwapPointer(SB),NOSPLIT,$0
+ B ·SwapUint32(SB)
+
+TEXT cas64<>(SB),NOSPLIT,$0
+ MOVW $0xffff0f60, PC // __kuser_cmpxchg64: Linux-3.1 and above
+
+TEXT kernelCAS64<>(SB),NOSPLIT,$0-21
+ // int (*__kuser_cmpxchg64_t)(const int64_t *oldval, const int64_t *newval, volatile int64_t *ptr);
+ MOVW addr+0(FP), R2 // ptr
+ // trigger potential paging fault here,
+ // because we don't know how to traceback through __kuser_cmpxchg64
+ MOVW (R2), R0
+ // make unaligned atomic access panic
+ AND.S $7, R2, R1
+ BEQ 2(PC)
+ MOVW R1, (R1)
+ MOVW $4(FP), R0 // oldval
+ MOVW $12(FP), R1 // newval
+ BL cas64<>(SB)
+ MOVW.CS $1, R0 // C is set if the kernel has changed *ptr
+ MOVW.CC $0, R0
+ MOVW R0, 20(FP)
+ RET
+
+TEXT ·generalCAS64(SB),NOSPLIT,$0-21
+ B runtime·cas64(SB)
+
+GLOBL armCAS64(SB), $4
+
+TEXT setupAndCallCAS64<>(SB),NOSPLIT,$-4-21
+ MOVW $0xffff0ffc, R0 // __kuser_helper_version
+ MOVW (R0), R0
+ // __kuser_cmpxchg64 only present if helper version >= 5
+ CMP $5, R0
+ MOVW.CS $kernelCAS64<>(SB), R1
+ MOVW.CS R1, armCAS64(SB)
+ MOVW.CS R1, PC
+ MOVB runtime·armArch(SB), R0
+ // LDREXD, STREXD only present on ARMv6K or higher
+ CMP $6, R0 // TODO(minux): how to differentiate ARMv6 with ARMv6K?
+ MOVW.CS $·armCompareAndSwapUint64(SB), R1
+ MOVW.CS R1, armCAS64(SB)
+ MOVW.CS R1, PC
+ // we are out of luck, can only use runtime's emulated 64-bit cas
+ MOVW $·generalCAS64(SB), R1
+ MOVW R1, armCAS64(SB)
+ MOVW R1, PC
+
+TEXT ·CompareAndSwapInt64(SB),NOSPLIT,$0
+ B ·CompareAndSwapUint64(SB)
+
+TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$-4-21
+ MOVW armCAS64(SB), R0
+ CMP $0, R0
+ MOVW.NE R0, PC
+ B setupAndCallCAS64<>(SB)
+
+TEXT ·AddInt64(SB),NOSPLIT,$0
+ B ·addUint64(SB)
+
+TEXT ·AddUint64(SB),NOSPLIT,$0
+ B ·addUint64(SB)
+
+TEXT ·SwapInt64(SB),NOSPLIT,$0
+ B ·swapUint64(SB)
+
+TEXT ·SwapUint64(SB),NOSPLIT,$0
+ B ·swapUint64(SB)
+
+TEXT ·LoadInt32(SB),NOSPLIT,$0
+ B ·LoadUint32(SB)
+
+TEXT ·LoadUint32(SB),NOSPLIT,$0-8
+ MOVW addr+0(FP), R2
+loadloop1:
+ MOVW 0(R2), R0
+ MOVW R0, R1
+ BL cas<>(SB)
+ BCC loadloop1
+ MOVW R1, val+4(FP)
+ RET
+
+TEXT ·LoadInt64(SB),NOSPLIT,$0
+ B ·loadUint64(SB)
+
+TEXT ·LoadUint64(SB),NOSPLIT,$0
+ B ·loadUint64(SB)
+
+TEXT ·LoadUintptr(SB),NOSPLIT,$0
+ B ·LoadUint32(SB)
+
+TEXT ·LoadPointer(SB),NOSPLIT,$0
+ B ·LoadUint32(SB)
+
+TEXT ·StoreInt32(SB),NOSPLIT,$0
+ B ·StoreUint32(SB)
+
+TEXT ·StoreUint32(SB),NOSPLIT,$0-8
+ MOVW addr+0(FP), R2
+ MOVW val+4(FP), R1
+storeloop1:
+ MOVW 0(R2), R0
+ BL cas<>(SB)
+ BCC storeloop1
+ RET
+
+TEXT ·StoreInt64(SB),NOSPLIT,$0
+ B ·storeUint64(SB)
+
+TEXT ·StoreUint64(SB),NOSPLIT,$0
+ B ·storeUint64(SB)
+
+TEXT ·StoreUintptr(SB),NOSPLIT,$0
+ B ·StoreUint32(SB)
+
+TEXT ·StorePointer(SB),NOSPLIT,$0
+ B ·StoreUint32(SB)
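
setupAndCallCAS64 above selects one of three CompareAndSwapUint64 implementations at run time, the kernel's __kuser_cmpxchg64 helper, native LDREXD/STREXD, or the runtime's emulated cas64, and caches the choice in armCAS64. The dispatch logic, expressed as plain Go with illustrative names:

package main

import "fmt"

type cas64 func(addr *uint64, old, new uint64) bool

// Placeholders standing in for kernelCAS64, armCompareAndSwapUint64,
// and generalCAS64; the real bodies are the assembly above.
func kernelCAS64(*uint64, uint64, uint64) bool  { return false }
func nativeCAS64(*uint64, uint64, uint64) bool  { return false }
func generalCAS64(*uint64, uint64, uint64) bool { return false }

// pickCAS64 mirrors setupAndCallCAS64: prefer the kernel helper when
// __kuser_helper_version >= 5, then LDREXD/STREXD when the CPU reports
// ARMv6 or newer, and fall back to the emulated version otherwise.
func pickCAS64(helperVersion, armArch int) cas64 {
	switch {
	case helperVersion >= 5:
		return kernelCAS64
	case armArch >= 6:
		return nativeCAS64
	default:
		return generalCAS64
	}
}

func main() {
	armCAS64 := pickCAS64(5, 7)
	fmt.Printf("%T\n", armCAS64)
}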
diff --git a/src/sync/atomic/asm_nacl_arm.s b/src/sync/atomic/asm_nacl_arm.s
new file mode 100644
index 000000000..76f623336
--- /dev/null
+++ b/src/sync/atomic/asm_nacl_arm.s
@@ -0,0 +1,109 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// NaCl/ARM atomic operations.
+// NaCl/ARM explicitly targets ARMv7A.
+
+TEXT ·CompareAndSwapInt32(SB),NOSPLIT,$0
+ B ·CompareAndSwapUint32(SB)
+
+TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0
+ B ·armCompareAndSwapUint32(SB)
+
+TEXT ·CompareAndSwapUintptr(SB),NOSPLIT,$0
+ B ·CompareAndSwapUint32(SB)
+
+TEXT ·CompareAndSwapPointer(SB),NOSPLIT,$0
+ B ·CompareAndSwapUint32(SB)
+
+TEXT ·AddInt32(SB),NOSPLIT,$0
+ B ·AddUint32(SB)
+
+TEXT ·AddUint32(SB),NOSPLIT,$0
+ B ·armAddUint32(SB)
+
+TEXT ·AddUintptr(SB),NOSPLIT,$0
+ B ·AddUint32(SB)
+
+TEXT ·SwapInt32(SB),NOSPLIT,$0
+ B ·SwapUint32(SB)
+
+TEXT ·SwapUint32(SB),NOSPLIT,$0
+ B ·armSwapUint32(SB)
+
+TEXT ·SwapUintptr(SB),NOSPLIT,$0
+ B ·SwapUint32(SB)
+
+TEXT ·SwapPointer(SB),NOSPLIT,$0
+ B ·SwapUint32(SB)
+
+TEXT ·CompareAndSwapInt64(SB),NOSPLIT,$0
+ B ·CompareAndSwapUint64(SB)
+
+TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$-4
+ B ·armCompareAndSwapUint64(SB)
+
+TEXT ·AddInt64(SB),NOSPLIT,$0
+ B ·addUint64(SB)
+
+TEXT ·AddUint64(SB),NOSPLIT,$0
+ B ·addUint64(SB)
+
+TEXT ·SwapInt64(SB),NOSPLIT,$0
+ B ·swapUint64(SB)
+
+TEXT ·SwapUint64(SB),NOSPLIT,$0
+ B ·swapUint64(SB)
+
+TEXT ·LoadInt32(SB),NOSPLIT,$0
+ B ·LoadUint32(SB)
+
+TEXT ·LoadUint32(SB),NOSPLIT,$0-8
+ MOVW addr+0(FP), R1
+load32loop:
+ LDREX (R1), R2 // loads R2
+ STREX R2, (R1), R0 // stores R2
+ CMP $0, R0
+ BNE load32loop
+ MOVW R2, val+4(FP)
+ RET
+
+TEXT ·LoadInt64(SB),NOSPLIT,$0
+ B ·loadUint64(SB)
+
+TEXT ·LoadUint64(SB),NOSPLIT,$0
+ B ·loadUint64(SB)
+
+TEXT ·LoadUintptr(SB),NOSPLIT,$0
+ B ·LoadUint32(SB)
+
+TEXT ·LoadPointer(SB),NOSPLIT,$0
+ B ·LoadUint32(SB)
+
+TEXT ·StoreInt32(SB),NOSPLIT,$0
+ B ·StoreUint32(SB)
+
+TEXT ·StoreUint32(SB),NOSPLIT,$0-8
+ MOVW addr+0(FP), R1
+ MOVW val+4(FP), R2
+storeloop:
+ LDREX (R1), R4 // loads R4
+ STREX R2, (R1), R0 // stores R2
+ CMP $0, R0
+ BNE storeloop
+ RET
+
+TEXT ·StoreInt64(SB),NOSPLIT,$0
+ B ·storeUint64(SB)
+
+TEXT ·StoreUint64(SB),NOSPLIT,$0
+ B ·storeUint64(SB)
+
+TEXT ·StoreUintptr(SB),NOSPLIT,$0
+ B ·StoreUint32(SB)
+
+TEXT ·StorePointer(SB),NOSPLIT,$0
+ B ·StoreUint32(SB)
diff --git a/src/sync/atomic/asm_netbsd_arm.s b/src/sync/atomic/asm_netbsd_arm.s
new file mode 100644
index 000000000..dbe80898f
--- /dev/null
+++ b/src/sync/atomic/asm_netbsd_arm.s
@@ -0,0 +1,109 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// NetBSD/ARM atomic operations.
+// TODO(minux): this only supports ARMv6K or higher.
+
+TEXT ·CompareAndSwapInt32(SB),NOSPLIT,$0
+ B ·CompareAndSwapUint32(SB)
+
+TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0
+ B ·armCompareAndSwapUint32(SB)
+
+TEXT ·CompareAndSwapUintptr(SB),NOSPLIT,$0
+ B ·CompareAndSwapUint32(SB)
+
+TEXT ·CompareAndSwapPointer(SB),NOSPLIT,$0
+ B ·CompareAndSwapUint32(SB)
+
+TEXT ·AddInt32(SB),NOSPLIT,$0
+ B ·AddUint32(SB)
+
+TEXT ·AddUint32(SB),NOSPLIT,$0
+ B ·armAddUint32(SB)
+
+TEXT ·AddUintptr(SB),NOSPLIT,$0
+ B ·AddUint32(SB)
+
+TEXT ·SwapInt32(SB),NOSPLIT,$0
+ B ·SwapUint32(SB)
+
+TEXT ·SwapUint32(SB),NOSPLIT,$0
+ B ·armSwapUint32(SB)
+
+TEXT ·SwapUintptr(SB),NOSPLIT,$0
+ B ·SwapUint32(SB)
+
+TEXT ·SwapPointer(SB),NOSPLIT,$0
+ B ·SwapUint32(SB)
+
+TEXT ·CompareAndSwapInt64(SB),NOSPLIT,$0
+ B ·CompareAndSwapUint64(SB)
+
+TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$-4
+ B ·armCompareAndSwapUint64(SB)
+
+TEXT ·AddInt64(SB),NOSPLIT,$0
+ B ·addUint64(SB)
+
+TEXT ·AddUint64(SB),NOSPLIT,$0
+ B ·addUint64(SB)
+
+TEXT ·SwapInt64(SB),NOSPLIT,$0
+ B ·swapUint64(SB)
+
+TEXT ·SwapUint64(SB),NOSPLIT,$0
+ B ·swapUint64(SB)
+
+TEXT ·LoadInt32(SB),NOSPLIT,$0
+ B ·LoadUint32(SB)
+
+TEXT ·LoadUint32(SB),NOSPLIT,$0-8
+ MOVW addr+0(FP), R1
+load32loop:
+ LDREX (R1), R2 // loads R2
+ STREX R2, (R1), R0 // stores R2
+ CMP $0, R0
+ BNE load32loop
+ MOVW R2, val+4(FP)
+ RET
+
+TEXT ·LoadInt64(SB),NOSPLIT,$0
+ B ·loadUint64(SB)
+
+TEXT ·LoadUint64(SB),NOSPLIT,$0
+ B ·loadUint64(SB)
+
+TEXT ·LoadUintptr(SB),NOSPLIT,$0
+ B ·LoadUint32(SB)
+
+TEXT ·LoadPointer(SB),NOSPLIT,$0
+ B ·LoadUint32(SB)
+
+TEXT ·StoreInt32(SB),NOSPLIT,$0
+ B ·StoreUint32(SB)
+
+TEXT ·StoreUint32(SB),NOSPLIT,$0-8
+ MOVW addr+0(FP), R1
+ MOVW val+4(FP), R2
+storeloop:
+ LDREX (R1), R4 // loads R4
+ STREX R2, (R1), R0 // stores R2
+ CMP $0, R0
+ BNE storeloop
+ RET
+
+TEXT ·StoreInt64(SB),NOSPLIT,$0
+ B ·storeUint64(SB)
+
+TEXT ·StoreUint64(SB),NOSPLIT,$0
+ B ·storeUint64(SB)
+
+TEXT ·StoreUintptr(SB),NOSPLIT,$0
+ B ·StoreUint32(SB)
+
+TEXT ·StorePointer(SB),NOSPLIT,$0
+ B ·StoreUint32(SB)
diff --git a/src/sync/atomic/asm_power64x.s b/src/sync/atomic/asm_power64x.s
new file mode 100644
index 000000000..bdb1dcff4
--- /dev/null
+++ b/src/sync/atomic/asm_power64x.s
@@ -0,0 +1,184 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build power64 power64le
+
+#include "../../../cmd/ld/textflag.h"
+
+TEXT ·SwapInt32(SB),NOSPLIT,$0-20
+ BR ·SwapUint32(SB)
+
+TEXT ·SwapUint32(SB),NOSPLIT,$0-20
+ MOVD addr+0(FP), R3
+ MOVW new+8(FP), R4
+ SYNC
+ LWAR (R3), R5
+ STWCCC R4, (R3)
+ BNE -3(PC)
+ SYNC
+ ISYNC
+ MOVW R5, old+16(FP)
+ RETURN
+
+TEXT ·SwapInt64(SB),NOSPLIT,$0-24
+ BR ·SwapUint64(SB)
+
+TEXT ·SwapUint64(SB),NOSPLIT,$0-24
+ MOVD addr+0(FP), R3
+ MOVD new+8(FP), R4
+ SYNC
+ LDAR (R3), R5
+ STDCCC R4, (R3)
+ BNE -3(PC)
+ SYNC
+ ISYNC
+ MOVD R5, old+16(FP)
+ RETURN
+
+TEXT ·SwapUintptr(SB),NOSPLIT,$0-24
+ BR ·SwapUint64(SB)
+
+TEXT ·SwapPointer(SB),NOSPLIT,$0-24
+ BR ·SwapUint64(SB)
+
+TEXT ·CompareAndSwapInt32(SB),NOSPLIT,$0-17
+ BR ·CompareAndSwapUint32(SB)
+
+TEXT ·CompareAndSwapUint32(SB),NOSPLIT,$0-17
+ MOVD addr+0(FP), R3
+ MOVW old+8(FP), R4
+ MOVW new+12(FP), R5
+ SYNC
+ LWAR (R3), R6
+ CMPW R6, R4
+ BNE 8(PC)
+ STWCCC R5, (R3)
+ BNE -5(PC)
+ SYNC
+ ISYNC
+ MOVD $1, R3
+ MOVB R3, swapped+16(FP)
+ RETURN
+ MOVB R0, swapped+16(FP)
+ RETURN
+
+TEXT ·CompareAndSwapUintptr(SB),NOSPLIT,$0-25
+ BR ·CompareAndSwapUint64(SB)
+
+TEXT ·CompareAndSwapPointer(SB),NOSPLIT,$0-25
+ BR ·CompareAndSwapUint64(SB)
+
+TEXT ·CompareAndSwapInt64(SB),NOSPLIT,$0-25
+ BR ·CompareAndSwapUint64(SB)
+
+TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$0-25
+ MOVD addr+0(FP), R3
+ MOVD old+8(FP), R4
+ MOVD new+16(FP), R5
+ SYNC
+ LDAR (R3), R6
+ CMP R6, R4
+ BNE 8(PC)
+ STDCCC R5, (R3)
+ BNE -5(PC)
+ SYNC
+ ISYNC
+ MOVD $1, R3
+ MOVB R3, swapped+24(FP)
+ RETURN
+ MOVB R0, swapped+24(FP)
+ RETURN
+
+TEXT ·AddInt32(SB),NOSPLIT,$0-20
+ BR ·AddUint32(SB)
+
+TEXT ·AddUint32(SB),NOSPLIT,$0-20
+ MOVD addr+0(FP), R3
+ MOVW delta+8(FP), R4
+ SYNC
+ LWAR (R3), R5
+ ADD R4, R5
+ STWCCC R5, (R3)
+ BNE -4(PC)
+ SYNC
+ ISYNC
+ MOVW R5, ret+16(FP)
+ RETURN
+
+TEXT ·AddUintptr(SB),NOSPLIT,$0-24
+ BR ·AddUint64(SB)
+
+TEXT ·AddInt64(SB),NOSPLIT,$0-24
+ BR ·AddUint64(SB)
+
+TEXT ·AddUint64(SB),NOSPLIT,$0-24
+ MOVD addr+0(FP), R3
+ MOVD delta+8(FP), R4
+ SYNC
+ LDAR (R3), R5
+ ADD R4, R5
+ STDCCC R5, (R3)
+ BNE -4(PC)
+ SYNC
+ ISYNC
+ MOVD R5, ret+16(FP)
+ RETURN
+
+TEXT ·LoadInt32(SB),NOSPLIT,$0-12
+ BR ·LoadUint32(SB)
+
+TEXT ·LoadUint32(SB),NOSPLIT,$0-12
+ MOVD addr+0(FP), R3
+ SYNC
+ MOVW 0(R3), R3
+ CMPW R3, R3, CR7
+ BC 4, 30, 1(PC) // bne- cr7,0x4
+ ISYNC
+ MOVW R3, val+8(FP)
+ RETURN
+
+TEXT ·LoadInt64(SB),NOSPLIT,$0-16
+ BR ·LoadUint64(SB)
+
+TEXT ·LoadUint64(SB),NOSPLIT,$0-16
+ MOVD addr+0(FP), R3
+ SYNC
+ MOVD 0(R3), R3
+ CMP R3, R3, CR7
+ BC 4, 30, 1(PC) // bne- cr7,0x4
+ ISYNC
+ MOVD R3, val+8(FP)
+ RETURN
+
+TEXT ·LoadUintptr(SB),NOSPLIT,$0-16
+ BR ·LoadPointer(SB)
+
+TEXT ·LoadPointer(SB),NOSPLIT,$0-16
+ BR ·LoadUint64(SB)
+
+TEXT ·StoreInt32(SB),NOSPLIT,$0-12
+ BR ·StoreUint32(SB)
+
+TEXT ·StoreUint32(SB),NOSPLIT,$0-12
+ MOVD addr+0(FP), R3
+ MOVW val+8(FP), R4
+ SYNC
+ MOVW R4, 0(R3)
+ RETURN
+
+TEXT ·StoreInt64(SB),NOSPLIT,$0-16
+ BR ·StoreUint64(SB)
+
+TEXT ·StoreUint64(SB),NOSPLIT,$0-16
+ MOVD addr+0(FP), R3
+ MOVD val+8(FP), R4
+ SYNC
+ MOVD R4, 0(R3)
+ RETURN
+
+TEXT ·StoreUintptr(SB),NOSPLIT,$0-16
+ BR ·StorePointer(SB)
+
+TEXT ·StorePointer(SB),NOSPLIT,$0-16
+ BR ·StoreUint64(SB)
diff --git a/src/sync/atomic/atomic_linux_arm_test.go b/src/sync/atomic/atomic_linux_arm_test.go
new file mode 100644
index 000000000..b6965b99b
--- /dev/null
+++ b/src/sync/atomic/atomic_linux_arm_test.go
@@ -0,0 +1,14 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomic_test
+
+import (
+ . "sync/atomic"
+ "testing"
+)
+
+func TestGeneralCAS64(t *testing.T) {
+ testCompareAndSwapUint64(t, GeneralCAS64)
+}
diff --git a/src/sync/atomic/atomic_test.go b/src/sync/atomic/atomic_test.go
new file mode 100644
index 000000000..9f13af48b
--- /dev/null
+++ b/src/sync/atomic/atomic_test.go
@@ -0,0 +1,1509 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomic_test
+
+import (
+ "fmt"
+ "runtime"
+ "strings"
+ . "sync/atomic"
+ "testing"
+ "unsafe"
+)
+
+// Tests of correct behavior, without contention.
+// (Does the function work as advertised?)
+//
+// Test that the Add functions add correctly.
+// Test that the CompareAndSwap functions actually
+// do the comparison and the swap correctly.
+//
+// The loop over power-of-two values is meant to
+// ensure that the operations apply to the full word size.
+// The struct fields x.before and x.after check that the
+// operations do not extend past the full word size.
+
+const (
+ magic32 = 0xdedbeef
+ magic64 = 0xdeddeadbeefbeef
+)
+
+// Do the 64-bit functions panic? If so, don't bother testing.
+var test64err = func() (err interface{}) {
+ defer func() {
+ err = recover()
+ }()
+ var x int64
+ AddInt64(&x, 1)
+ return nil
+}()
+
+func TestSwapInt32(t *testing.T) {
+ var x struct {
+ before int32
+ i int32
+ after int32
+ }
+ x.before = magic32
+ x.after = magic32
+ var j int32
+ for delta := int32(1); delta+delta > delta; delta += delta {
+ k := SwapInt32(&x.i, delta)
+ if x.i != delta || k != j {
+ t.Fatalf("delta=%d i=%d j=%d k=%d", delta, x.i, j, k)
+ }
+ j = delta
+ }
+ if x.before != magic32 || x.after != magic32 {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, magic32, magic32)
+ }
+}
+
+func TestSwapUint32(t *testing.T) {
+ var x struct {
+ before uint32
+ i uint32
+ after uint32
+ }
+ x.before = magic32
+ x.after = magic32
+ var j uint32
+ for delta := uint32(1); delta+delta > delta; delta += delta {
+ k := SwapUint32(&x.i, delta)
+ if x.i != delta || k != j {
+ t.Fatalf("delta=%d i=%d j=%d k=%d", delta, x.i, j, k)
+ }
+ j = delta
+ }
+ if x.before != magic32 || x.after != magic32 {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, magic32, magic32)
+ }
+}
+
+func TestSwapInt64(t *testing.T) {
+ if test64err != nil {
+ t.Skipf("Skipping 64-bit tests: %v", test64err)
+ }
+ var x struct {
+ before int64
+ i int64
+ after int64
+ }
+ x.before = magic64
+ x.after = magic64
+ var j int64
+ for delta := int64(1); delta+delta > delta; delta += delta {
+ k := SwapInt64(&x.i, delta)
+ if x.i != delta || k != j {
+ t.Fatalf("delta=%d i=%d j=%d k=%d", delta, x.i, j, k)
+ }
+ j = delta
+ }
+ if x.before != magic64 || x.after != magic64 {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, uint64(magic64), uint64(magic64))
+ }
+}
+
+func TestSwapUint64(t *testing.T) {
+ if test64err != nil {
+ t.Skipf("Skipping 64-bit tests: %v", test64err)
+ }
+ var x struct {
+ before uint64
+ i uint64
+ after uint64
+ }
+ x.before = magic64
+ x.after = magic64
+ var j uint64
+ for delta := uint64(1); delta+delta > delta; delta += delta {
+ k := SwapUint64(&x.i, delta)
+ if x.i != delta || k != j {
+ t.Fatalf("delta=%d i=%d j=%d k=%d", delta, x.i, j, k)
+ }
+ j = delta
+ }
+ if x.before != magic64 || x.after != magic64 {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, uint64(magic64), uint64(magic64))
+ }
+}
+
+func TestSwapUintptr(t *testing.T) {
+ var x struct {
+ before uintptr
+ i uintptr
+ after uintptr
+ }
+ var m uint64 = magic64
+ magicptr := uintptr(m)
+ x.before = magicptr
+ x.after = magicptr
+ var j uintptr
+ for delta := uintptr(1); delta+delta > delta; delta += delta {
+ k := SwapUintptr(&x.i, delta)
+ if x.i != delta || k != j {
+ t.Fatalf("delta=%d i=%d j=%d k=%d", delta, x.i, j, k)
+ }
+ j = delta
+ }
+ if x.before != magicptr || x.after != magicptr {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, magicptr, magicptr)
+ }
+}
+
+func TestSwapPointer(t *testing.T) {
+ var x struct {
+ before uintptr
+ i unsafe.Pointer
+ after uintptr
+ }
+ var m uint64 = magic64
+ magicptr := uintptr(m)
+ x.before = magicptr
+ x.after = magicptr
+ var j uintptr
+ for delta := uintptr(1); delta+delta > delta; delta += delta {
+ k := SwapPointer(&x.i, unsafe.Pointer(delta))
+ if uintptr(x.i) != delta || uintptr(k) != j {
+ t.Fatalf("delta=%d i=%d j=%d k=%d", delta, x.i, j, k)
+ }
+ j = delta
+ }
+ if x.before != magicptr || x.after != magicptr {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, magicptr, magicptr)
+ }
+}
+
+func TestAddInt32(t *testing.T) {
+ var x struct {
+ before int32
+ i int32
+ after int32
+ }
+ x.before = magic32
+ x.after = magic32
+ var j int32
+ for delta := int32(1); delta+delta > delta; delta += delta {
+ k := AddInt32(&x.i, delta)
+ j += delta
+ if x.i != j || k != j {
+ t.Fatalf("delta=%d i=%d j=%d k=%d", delta, x.i, j, k)
+ }
+ }
+ if x.before != magic32 || x.after != magic32 {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, magic32, magic32)
+ }
+}
+
+func TestAddUint32(t *testing.T) {
+ var x struct {
+ before uint32
+ i uint32
+ after uint32
+ }
+ x.before = magic32
+ x.after = magic32
+ var j uint32
+ for delta := uint32(1); delta+delta > delta; delta += delta {
+ k := AddUint32(&x.i, delta)
+ j += delta
+ if x.i != j || k != j {
+ t.Fatalf("delta=%d i=%d j=%d k=%d", delta, x.i, j, k)
+ }
+ }
+ if x.before != magic32 || x.after != magic32 {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, magic32, magic32)
+ }
+}
+
+func TestAddInt64(t *testing.T) {
+ if test64err != nil {
+ t.Skipf("Skipping 64-bit tests: %v", test64err)
+ }
+ var x struct {
+ before int64
+ i int64
+ after int64
+ }
+ x.before = magic64
+ x.after = magic64
+ var j int64
+ for delta := int64(1); delta+delta > delta; delta += delta {
+ k := AddInt64(&x.i, delta)
+ j += delta
+ if x.i != j || k != j {
+ t.Fatalf("delta=%d i=%d j=%d k=%d", delta, x.i, j, k)
+ }
+ }
+ if x.before != magic64 || x.after != magic64 {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, int64(magic64), int64(magic64))
+ }
+}
+
+func TestAddUint64(t *testing.T) {
+ if test64err != nil {
+ t.Skipf("Skipping 64-bit tests: %v", test64err)
+ }
+ var x struct {
+ before uint64
+ i uint64
+ after uint64
+ }
+ x.before = magic64
+ x.after = magic64
+ var j uint64
+ for delta := uint64(1); delta+delta > delta; delta += delta {
+ k := AddUint64(&x.i, delta)
+ j += delta
+ if x.i != j || k != j {
+ t.Fatalf("delta=%d i=%d j=%d k=%d", delta, x.i, j, k)
+ }
+ }
+ if x.before != magic64 || x.after != magic64 {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, uint64(magic64), uint64(magic64))
+ }
+}
+
+func TestAddUintptr(t *testing.T) {
+ var x struct {
+ before uintptr
+ i uintptr
+ after uintptr
+ }
+ var m uint64 = magic64
+ magicptr := uintptr(m)
+ x.before = magicptr
+ x.after = magicptr
+ var j uintptr
+ for delta := uintptr(1); delta+delta > delta; delta += delta {
+ k := AddUintptr(&x.i, delta)
+ j += delta
+ if x.i != j || k != j {
+ t.Fatalf("delta=%d i=%d j=%d k=%d", delta, x.i, j, k)
+ }
+ }
+ if x.before != magicptr || x.after != magicptr {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, magicptr, magicptr)
+ }
+}
+
+func TestCompareAndSwapInt32(t *testing.T) {
+ var x struct {
+ before int32
+ i int32
+ after int32
+ }
+ x.before = magic32
+ x.after = magic32
+ for val := int32(1); val+val > val; val += val {
+ x.i = val
+ if !CompareAndSwapInt32(&x.i, val, val+1) {
+ t.Fatalf("should have swapped %#x %#x", val, val+1)
+ }
+ if x.i != val+1 {
+ t.Fatalf("wrong x.i after swap: x.i=%#x val+1=%#x", x.i, val+1)
+ }
+ x.i = val + 1
+ if CompareAndSwapInt32(&x.i, val, val+2) {
+ t.Fatalf("should not have swapped %#x %#x", val, val+2)
+ }
+ if x.i != val+1 {
+ t.Fatalf("wrong x.i after swap: x.i=%#x val+1=%#x", x.i, val+1)
+ }
+ }
+ if x.before != magic32 || x.after != magic32 {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, magic32, magic32)
+ }
+}
+
+func TestCompareAndSwapUint32(t *testing.T) {
+ var x struct {
+ before uint32
+ i uint32
+ after uint32
+ }
+ x.before = magic32
+ x.after = magic32
+ for val := uint32(1); val+val > val; val += val {
+ x.i = val
+ if !CompareAndSwapUint32(&x.i, val, val+1) {
+ t.Fatalf("should have swapped %#x %#x", val, val+1)
+ }
+ if x.i != val+1 {
+ t.Fatalf("wrong x.i after swap: x.i=%#x val+1=%#x", x.i, val+1)
+ }
+ x.i = val + 1
+ if CompareAndSwapUint32(&x.i, val, val+2) {
+ t.Fatalf("should not have swapped %#x %#x", val, val+2)
+ }
+ if x.i != val+1 {
+ t.Fatalf("wrong x.i after swap: x.i=%#x val+1=%#x", x.i, val+1)
+ }
+ }
+ if x.before != magic32 || x.after != magic32 {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, magic32, magic32)
+ }
+}
+
+func TestCompareAndSwapInt64(t *testing.T) {
+ if test64err != nil {
+ t.Skipf("Skipping 64-bit tests: %v", test64err)
+ }
+ var x struct {
+ before int64
+ i int64
+ after int64
+ }
+ x.before = magic64
+ x.after = magic64
+ for val := int64(1); val+val > val; val += val {
+ x.i = val
+ if !CompareAndSwapInt64(&x.i, val, val+1) {
+ t.Fatalf("should have swapped %#x %#x", val, val+1)
+ }
+ if x.i != val+1 {
+ t.Fatalf("wrong x.i after swap: x.i=%#x val+1=%#x", x.i, val+1)
+ }
+ x.i = val + 1
+ if CompareAndSwapInt64(&x.i, val, val+2) {
+ t.Fatalf("should not have swapped %#x %#x", val, val+2)
+ }
+ if x.i != val+1 {
+ t.Fatalf("wrong x.i after swap: x.i=%#x val+1=%#x", x.i, val+1)
+ }
+ }
+ if x.before != magic64 || x.after != magic64 {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, uint64(magic64), uint64(magic64))
+ }
+}
+
+func testCompareAndSwapUint64(t *testing.T, cas func(*uint64, uint64, uint64) bool) {
+ if test64err != nil {
+ t.Skipf("Skipping 64-bit tests: %v", test64err)
+ }
+ var x struct {
+ before uint64
+ i uint64
+ after uint64
+ }
+ x.before = magic64
+ x.after = magic64
+ for val := uint64(1); val+val > val; val += val {
+ x.i = val
+ if !cas(&x.i, val, val+1) {
+ t.Fatalf("should have swapped %#x %#x", val, val+1)
+ }
+ if x.i != val+1 {
+ t.Fatalf("wrong x.i after swap: x.i=%#x val+1=%#x", x.i, val+1)
+ }
+ x.i = val + 1
+ if cas(&x.i, val, val+2) {
+ t.Fatalf("should not have swapped %#x %#x", val, val+2)
+ }
+ if x.i != val+1 {
+ t.Fatalf("wrong x.i after swap: x.i=%#x val+1=%#x", x.i, val+1)
+ }
+ }
+ if x.before != magic64 || x.after != magic64 {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, uint64(magic64), uint64(magic64))
+ }
+}
+
+func TestCompareAndSwapUint64(t *testing.T) {
+ testCompareAndSwapUint64(t, CompareAndSwapUint64)
+}
+
+func TestCompareAndSwapUintptr(t *testing.T) {
+ var x struct {
+ before uintptr
+ i uintptr
+ after uintptr
+ }
+ var m uint64 = magic64
+ magicptr := uintptr(m)
+ x.before = magicptr
+ x.after = magicptr
+ for val := uintptr(1); val+val > val; val += val {
+ x.i = val
+ if !CompareAndSwapUintptr(&x.i, val, val+1) {
+ t.Fatalf("should have swapped %#x %#x", val, val+1)
+ }
+ if x.i != val+1 {
+ t.Fatalf("wrong x.i after swap: x.i=%#x val+1=%#x", x.i, val+1)
+ }
+ x.i = val + 1
+ if CompareAndSwapUintptr(&x.i, val, val+2) {
+ t.Fatalf("should not have swapped %#x %#x", val, val+2)
+ }
+ if x.i != val+1 {
+ t.Fatalf("wrong x.i after swap: x.i=%#x val+1=%#x", x.i, val+1)
+ }
+ }
+ if x.before != magicptr || x.after != magicptr {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, magicptr, magicptr)
+ }
+}
+
+func TestCompareAndSwapPointer(t *testing.T) {
+ var x struct {
+ before uintptr
+ i unsafe.Pointer
+ after uintptr
+ }
+ var m uint64 = magic64
+ magicptr := uintptr(m)
+ x.before = magicptr
+ x.after = magicptr
+ for val := uintptr(1); val+val > val; val += val {
+ x.i = unsafe.Pointer(val)
+ if !CompareAndSwapPointer(&x.i, unsafe.Pointer(val), unsafe.Pointer(val+1)) {
+ t.Fatalf("should have swapped %#x %#x", val, val+1)
+ }
+ if x.i != unsafe.Pointer(val+1) {
+ t.Fatalf("wrong x.i after swap: x.i=%#x val+1=%#x", x.i, val+1)
+ }
+ x.i = unsafe.Pointer(val + 1)
+ if CompareAndSwapPointer(&x.i, unsafe.Pointer(val), unsafe.Pointer(val+2)) {
+ t.Fatalf("should not have swapped %#x %#x", val, val+2)
+ }
+ if x.i != unsafe.Pointer(val+1) {
+ t.Fatalf("wrong x.i after swap: x.i=%#x val+1=%#x", x.i, val+1)
+ }
+ }
+ if x.before != magicptr || x.after != magicptr {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, magicptr, magicptr)
+ }
+}
+
+func TestLoadInt32(t *testing.T) {
+ var x struct {
+ before int32
+ i int32
+ after int32
+ }
+ x.before = magic32
+ x.after = magic32
+ for delta := int32(1); delta+delta > delta; delta += delta {
+ k := LoadInt32(&x.i)
+ if k != x.i {
+ t.Fatalf("delta=%d i=%d k=%d", delta, x.i, k)
+ }
+ x.i += delta
+ }
+ if x.before != magic32 || x.after != magic32 {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, magic32, magic32)
+ }
+}
+
+func TestLoadUint32(t *testing.T) {
+ var x struct {
+ before uint32
+ i uint32
+ after uint32
+ }
+ x.before = magic32
+ x.after = magic32
+ for delta := uint32(1); delta+delta > delta; delta += delta {
+ k := LoadUint32(&x.i)
+ if k != x.i {
+ t.Fatalf("delta=%d i=%d k=%d", delta, x.i, k)
+ }
+ x.i += delta
+ }
+ if x.before != magic32 || x.after != magic32 {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, magic32, magic32)
+ }
+}
+
+func TestLoadInt64(t *testing.T) {
+ if test64err != nil {
+ t.Skipf("Skipping 64-bit tests: %v", test64err)
+ }
+ var x struct {
+ before int64
+ i int64
+ after int64
+ }
+ x.before = magic64
+ x.after = magic64
+ for delta := int64(1); delta+delta > delta; delta += delta {
+ k := LoadInt64(&x.i)
+ if k != x.i {
+ t.Fatalf("delta=%d i=%d k=%d", delta, x.i, k)
+ }
+ x.i += delta
+ }
+ if x.before != magic64 || x.after != magic64 {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, uint64(magic64), uint64(magic64))
+ }
+}
+
+func TestLoadUint64(t *testing.T) {
+ if test64err != nil {
+ t.Skipf("Skipping 64-bit tests: %v", test64err)
+ }
+ var x struct {
+ before uint64
+ i uint64
+ after uint64
+ }
+ x.before = magic64
+ x.after = magic64
+ for delta := uint64(1); delta+delta > delta; delta += delta {
+ k := LoadUint64(&x.i)
+ if k != x.i {
+ t.Fatalf("delta=%d i=%d k=%d", delta, x.i, k)
+ }
+ x.i += delta
+ }
+ if x.before != magic64 || x.after != magic64 {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, uint64(magic64), uint64(magic64))
+ }
+}
+
+func TestLoadUintptr(t *testing.T) {
+ var x struct {
+ before uintptr
+ i uintptr
+ after uintptr
+ }
+ var m uint64 = magic64
+ magicptr := uintptr(m)
+ x.before = magicptr
+ x.after = magicptr
+ for delta := uintptr(1); delta+delta > delta; delta += delta {
+ k := LoadUintptr(&x.i)
+ if k != x.i {
+ t.Fatalf("delta=%d i=%d k=%d", delta, x.i, k)
+ }
+ x.i += delta
+ }
+ if x.before != magicptr || x.after != magicptr {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, magicptr, magicptr)
+ }
+}
+
+func TestLoadPointer(t *testing.T) {
+ var x struct {
+ before uintptr
+ i unsafe.Pointer
+ after uintptr
+ }
+ var m uint64 = magic64
+ magicptr := uintptr(m)
+ x.before = magicptr
+ x.after = magicptr
+ for delta := uintptr(1); delta+delta > delta; delta += delta {
+ k := LoadPointer(&x.i)
+ if k != x.i {
+ t.Fatalf("delta=%d i=%d k=%d", delta, x.i, k)
+ }
+ x.i = unsafe.Pointer(uintptr(x.i) + delta)
+ }
+ if x.before != magicptr || x.after != magicptr {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, magicptr, magicptr)
+ }
+}
+
+func TestStoreInt32(t *testing.T) {
+ var x struct {
+ before int32
+ i int32
+ after int32
+ }
+ x.before = magic32
+ x.after = magic32
+ v := int32(0)
+ for delta := int32(1); delta+delta > delta; delta += delta {
+ StoreInt32(&x.i, v)
+ if x.i != v {
+ t.Fatalf("delta=%d i=%d v=%d", delta, x.i, v)
+ }
+ v += delta
+ }
+ if x.before != magic32 || x.after != magic32 {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, magic32, magic32)
+ }
+}
+
+func TestStoreUint32(t *testing.T) {
+ var x struct {
+ before uint32
+ i uint32
+ after uint32
+ }
+ x.before = magic32
+ x.after = magic32
+ v := uint32(0)
+ for delta := uint32(1); delta+delta > delta; delta += delta {
+ StoreUint32(&x.i, v)
+ if x.i != v {
+ t.Fatalf("delta=%d i=%d v=%d", delta, x.i, v)
+ }
+ v += delta
+ }
+ if x.before != magic32 || x.after != magic32 {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, magic32, magic32)
+ }
+}
+
+func TestStoreInt64(t *testing.T) {
+ if test64err != nil {
+ t.Skipf("Skipping 64-bit tests: %v", test64err)
+ }
+ var x struct {
+ before int64
+ i int64
+ after int64
+ }
+ x.before = magic64
+ x.after = magic64
+ v := int64(0)
+ for delta := int64(1); delta+delta > delta; delta += delta {
+ StoreInt64(&x.i, v)
+ if x.i != v {
+ t.Fatalf("delta=%d i=%d v=%d", delta, x.i, v)
+ }
+ v += delta
+ }
+ if x.before != magic64 || x.after != magic64 {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, uint64(magic64), uint64(magic64))
+ }
+}
+
+func TestStoreUint64(t *testing.T) {
+ if test64err != nil {
+ t.Skipf("Skipping 64-bit tests: %v", test64err)
+ }
+ var x struct {
+ before uint64
+ i uint64
+ after uint64
+ }
+ x.before = magic64
+ x.after = magic64
+ v := uint64(0)
+ for delta := uint64(1); delta+delta > delta; delta += delta {
+ StoreUint64(&x.i, v)
+ if x.i != v {
+ t.Fatalf("delta=%d i=%d v=%d", delta, x.i, v)
+ }
+ v += delta
+ }
+ if x.before != magic64 || x.after != magic64 {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, uint64(magic64), uint64(magic64))
+ }
+}
+
+func TestStoreUintptr(t *testing.T) {
+ var x struct {
+ before uintptr
+ i uintptr
+ after uintptr
+ }
+ var m uint64 = magic64
+ magicptr := uintptr(m)
+ x.before = magicptr
+ x.after = magicptr
+ v := uintptr(0)
+ for delta := uintptr(1); delta+delta > delta; delta += delta {
+ StoreUintptr(&x.i, v)
+ if x.i != v {
+ t.Fatalf("delta=%d i=%d v=%d", delta, x.i, v)
+ }
+ v += delta
+ }
+ if x.before != magicptr || x.after != magicptr {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, magicptr, magicptr)
+ }
+}
+
+func TestStorePointer(t *testing.T) {
+ var x struct {
+ before uintptr
+ i unsafe.Pointer
+ after uintptr
+ }
+ var m uint64 = magic64
+ magicptr := uintptr(m)
+ x.before = magicptr
+ x.after = magicptr
+ v := unsafe.Pointer(uintptr(0))
+ for delta := uintptr(1); delta+delta > delta; delta += delta {
+ StorePointer(&x.i, unsafe.Pointer(v))
+ if x.i != v {
+ t.Fatalf("delta=%d i=%d v=%d", delta, x.i, v)
+ }
+ v = unsafe.Pointer(uintptr(v) + delta)
+ }
+ if x.before != magicptr || x.after != magicptr {
+ t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, magicptr, magicptr)
+ }
+}
+
+// Tests of correct behavior, with contention.
+// (Is the function atomic?)
+//
+// For each function, we write a "hammer" function that repeatedly
+// uses the atomic operation to add 1 to a value. After running
+// multiple hammers in parallel, check that we end with the correct
+// total.
+// Swap can't add 1, so it uses a different scheme.
+// The functions repeatedly generate a pseudo-random number such that
+// low bits are equal to high bits, swap, check that the old value
+// has low and high bits equal.
+
+var hammer32 = map[string]func(*uint32, int){
+ "SwapInt32": hammerSwapInt32,
+ "SwapUint32": hammerSwapUint32,
+ "SwapUintptr": hammerSwapUintptr32,
+ "SwapPointer": hammerSwapPointer32,
+ "AddInt32": hammerAddInt32,
+ "AddUint32": hammerAddUint32,
+ "AddUintptr": hammerAddUintptr32,
+ "CompareAndSwapInt32": hammerCompareAndSwapInt32,
+ "CompareAndSwapUint32": hammerCompareAndSwapUint32,
+ "CompareAndSwapUintptr": hammerCompareAndSwapUintptr32,
+ "CompareAndSwapPointer": hammerCompareAndSwapPointer32,
+}
+
+func init() {
+ var v uint64 = 1 << 50
+ if uintptr(v) != 0 {
+ // 64-bit system; clear uintptr tests
+ delete(hammer32, "SwapUintptr")
+ delete(hammer32, "SwapPointer")
+ delete(hammer32, "AddUintptr")
+ delete(hammer32, "CompareAndSwapUintptr")
+ delete(hammer32, "CompareAndSwapPointer")
+ }
+}
+
+func hammerSwapInt32(uaddr *uint32, count int) {
+ addr := (*int32)(unsafe.Pointer(uaddr))
+ seed := int(uintptr(unsafe.Pointer(&count)))
+ for i := 0; i < count; i++ {
+ new := uint32(seed+i)<<16 | uint32(seed+i)<<16>>16
+ old := uint32(SwapInt32(addr, int32(new)))
+ if old>>16 != old<<16>>16 {
+ panic(fmt.Sprintf("SwapInt32 is not atomic: %v", old))
+ }
+ }
+}
+
+func hammerSwapUint32(addr *uint32, count int) {
+ seed := int(uintptr(unsafe.Pointer(&count)))
+ for i := 0; i < count; i++ {
+ new := uint32(seed+i)<<16 | uint32(seed+i)<<16>>16
+ old := SwapUint32(addr, new)
+ if old>>16 != old<<16>>16 {
+ panic(fmt.Sprintf("SwapUint32 is not atomic: %v", old))
+ }
+ }
+}
+
+func hammerSwapUintptr32(uaddr *uint32, count int) {
+ // only safe when uintptr is 32-bit.
+ // not called on 64-bit systems.
+ addr := (*uintptr)(unsafe.Pointer(uaddr))
+ seed := int(uintptr(unsafe.Pointer(&count)))
+ for i := 0; i < count; i++ {
+ new := uintptr(seed+i)<<16 | uintptr(seed+i)<<16>>16
+ old := SwapUintptr(addr, new)
+ if old>>16 != old<<16>>16 {
+ panic(fmt.Sprintf("SwapUintptr is not atomic: %#08x", old))
+ }
+ }
+}
+
+func hammerSwapPointer32(uaddr *uint32, count int) {
+ // only safe when uintptr is 32-bit.
+ // not called on 64-bit systems.
+ addr := (*unsafe.Pointer)(unsafe.Pointer(uaddr))
+ seed := int(uintptr(unsafe.Pointer(&count)))
+ for i := 0; i < count; i++ {
+ new := uintptr(seed+i)<<16 | uintptr(seed+i)<<16>>16
+ old := uintptr(SwapPointer(addr, unsafe.Pointer(new)))
+ if old>>16 != old<<16>>16 {
+ panic(fmt.Sprintf("SwapPointer is not atomic: %#08x", old))
+ }
+ }
+}
+
+func hammerAddInt32(uaddr *uint32, count int) {
+ addr := (*int32)(unsafe.Pointer(uaddr))
+ for i := 0; i < count; i++ {
+ AddInt32(addr, 1)
+ }
+}
+
+func hammerAddUint32(addr *uint32, count int) {
+ for i := 0; i < count; i++ {
+ AddUint32(addr, 1)
+ }
+}
+
+func hammerAddUintptr32(uaddr *uint32, count int) {
+ // only safe when uintptr is 32-bit.
+ // not called on 64-bit systems.
+ addr := (*uintptr)(unsafe.Pointer(uaddr))
+ for i := 0; i < count; i++ {
+ AddUintptr(addr, 1)
+ }
+}
+
+func hammerCompareAndSwapInt32(uaddr *uint32, count int) {
+ addr := (*int32)(unsafe.Pointer(uaddr))
+ for i := 0; i < count; i++ {
+ for {
+ v := LoadInt32(addr)
+ if CompareAndSwapInt32(addr, v, v+1) {
+ break
+ }
+ }
+ }
+}
+
+func hammerCompareAndSwapUint32(addr *uint32, count int) {
+ for i := 0; i < count; i++ {
+ for {
+ v := LoadUint32(addr)
+ if CompareAndSwapUint32(addr, v, v+1) {
+ break
+ }
+ }
+ }
+}
+
+func hammerCompareAndSwapUintptr32(uaddr *uint32, count int) {
+ // only safe when uintptr is 32-bit.
+ // not called on 64-bit systems.
+ addr := (*uintptr)(unsafe.Pointer(uaddr))
+ for i := 0; i < count; i++ {
+ for {
+ v := LoadUintptr(addr)
+ if CompareAndSwapUintptr(addr, v, v+1) {
+ break
+ }
+ }
+ }
+}
+
+func hammerCompareAndSwapPointer32(uaddr *uint32, count int) {
+ // only safe when uintptr is 32-bit.
+ // not called on 64-bit systems.
+ addr := (*unsafe.Pointer)(unsafe.Pointer(uaddr))
+ for i := 0; i < count; i++ {
+ for {
+ v := LoadPointer(addr)
+ if CompareAndSwapPointer(addr, v, unsafe.Pointer(uintptr(v)+1)) {
+ break
+ }
+ }
+ }
+}
+
+func TestHammer32(t *testing.T) {
+ const p = 4
+ n := 100000
+ if testing.Short() {
+ n = 1000
+ }
+ defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(p))
+
+ for name, testf := range hammer32 {
+ c := make(chan int)
+ var val uint32
+ for i := 0; i < p; i++ {
+ go func() {
+ defer func() {
+ if err := recover(); err != nil {
+ t.Error(err.(string))
+ }
+ c <- 1
+ }()
+ testf(&val, n)
+ }()
+ }
+ for i := 0; i < p; i++ {
+ <-c
+ }
+ if !strings.HasPrefix(name, "Swap") && val != uint32(n)*p {
+ t.Fatalf("%s: val=%d want %d", name, val, n*p)
+ }
+ }
+}
+
+var hammer64 = map[string]func(*uint64, int){
+ "SwapInt64": hammerSwapInt64,
+ "SwapUint64": hammerSwapUint64,
+ "SwapUintptr": hammerSwapUintptr64,
+ "SwapPointer": hammerSwapPointer64,
+ "AddInt64": hammerAddInt64,
+ "AddUint64": hammerAddUint64,
+ "AddUintptr": hammerAddUintptr64,
+ "CompareAndSwapInt64": hammerCompareAndSwapInt64,
+ "CompareAndSwapUint64": hammerCompareAndSwapUint64,
+ "CompareAndSwapUintptr": hammerCompareAndSwapUintptr64,
+ "CompareAndSwapPointer": hammerCompareAndSwapPointer64,
+}
+
+func init() {
+ var v uint64 = 1 << 50
+ if uintptr(v) == 0 {
+ // 32-bit system; clear uintptr tests
+ delete(hammer64, "SwapUintptr")
+ delete(hammer64, "SwapPointer")
+ delete(hammer64, "AddUintptr")
+ delete(hammer64, "CompareAndSwapUintptr")
+ delete(hammer64, "CompareAndSwapPointer")
+ }
+}
+
+func hammerSwapInt64(uaddr *uint64, count int) {
+ addr := (*int64)(unsafe.Pointer(uaddr))
+ seed := int(uintptr(unsafe.Pointer(&count)))
+ for i := 0; i < count; i++ {
+ new := uint64(seed+i)<<32 | uint64(seed+i)<<32>>32
+ old := uint64(SwapInt64(addr, int64(new)))
+ if old>>32 != old<<32>>32 {
+ panic(fmt.Sprintf("SwapInt64 is not atomic: %v", old))
+ }
+ }
+}
+
+func hammerSwapUint64(addr *uint64, count int) {
+ seed := int(uintptr(unsafe.Pointer(&count)))
+ for i := 0; i < count; i++ {
+ new := uint64(seed+i)<<32 | uint64(seed+i)<<32>>32
+ old := SwapUint64(addr, new)
+ if old>>32 != old<<32>>32 {
+ panic(fmt.Sprintf("SwapUint64 is not atomic: %v", old))
+ }
+ }
+}
+
+func hammerSwapUintptr64(uaddr *uint64, count int) {
+ // only safe when uintptr is 64-bit.
+ // not called on 32-bit systems.
+ addr := (*uintptr)(unsafe.Pointer(uaddr))
+ seed := int(uintptr(unsafe.Pointer(&count)))
+ for i := 0; i < count; i++ {
+ new := uintptr(seed+i)<<32 | uintptr(seed+i)<<32>>32
+ old := SwapUintptr(addr, new)
+ if old>>32 != old<<32>>32 {
+ panic(fmt.Sprintf("SwapUintptr is not atomic: %v", old))
+ }
+ }
+}
+
+func hammerSwapPointer64(uaddr *uint64, count int) {
+ // only safe when uintptr is 64-bit.
+ // not called on 32-bit systems.
+ addr := (*unsafe.Pointer)(unsafe.Pointer(uaddr))
+ seed := int(uintptr(unsafe.Pointer(&count)))
+ for i := 0; i < count; i++ {
+ new := uintptr(seed+i)<<32 | uintptr(seed+i)<<32>>32
+ old := uintptr(SwapPointer(addr, unsafe.Pointer(new)))
+ if old>>32 != old<<32>>32 {
+ panic(fmt.Sprintf("SwapPointer is not atomic: %v", old))
+ }
+ }
+}
+
+func hammerAddInt64(uaddr *uint64, count int) {
+ addr := (*int64)(unsafe.Pointer(uaddr))
+ for i := 0; i < count; i++ {
+ AddInt64(addr, 1)
+ }
+}
+
+func hammerAddUint64(addr *uint64, count int) {
+ for i := 0; i < count; i++ {
+ AddUint64(addr, 1)
+ }
+}
+
+func hammerAddUintptr64(uaddr *uint64, count int) {
+ // only safe when uintptr is 64-bit.
+ // not called on 32-bit systems.
+ addr := (*uintptr)(unsafe.Pointer(uaddr))
+ for i := 0; i < count; i++ {
+ AddUintptr(addr, 1)
+ }
+}
+
+func hammerCompareAndSwapInt64(uaddr *uint64, count int) {
+ addr := (*int64)(unsafe.Pointer(uaddr))
+ for i := 0; i < count; i++ {
+ for {
+ v := LoadInt64(addr)
+ if CompareAndSwapInt64(addr, v, v+1) {
+ break
+ }
+ }
+ }
+}
+
+func hammerCompareAndSwapUint64(addr *uint64, count int) {
+ for i := 0; i < count; i++ {
+ for {
+ v := LoadUint64(addr)
+ if CompareAndSwapUint64(addr, v, v+1) {
+ break
+ }
+ }
+ }
+}
+
+func hammerCompareAndSwapUintptr64(uaddr *uint64, count int) {
+ // only safe when uintptr is 64-bit.
+ // not called on 32-bit systems.
+ addr := (*uintptr)(unsafe.Pointer(uaddr))
+ for i := 0; i < count; i++ {
+ for {
+ v := LoadUintptr(addr)
+ if CompareAndSwapUintptr(addr, v, v+1) {
+ break
+ }
+ }
+ }
+}
+
+func hammerCompareAndSwapPointer64(uaddr *uint64, count int) {
+ // only safe when uintptr is 64-bit.
+ // not called on 32-bit systems.
+ addr := (*unsafe.Pointer)(unsafe.Pointer(uaddr))
+ for i := 0; i < count; i++ {
+ for {
+ v := LoadPointer(addr)
+ if CompareAndSwapPointer(addr, v, unsafe.Pointer(uintptr(v)+1)) {
+ break
+ }
+ }
+ }
+}
+
+func TestHammer64(t *testing.T) {
+ if test64err != nil {
+ t.Skipf("Skipping 64-bit tests: %v", test64err)
+ }
+ const p = 4
+ n := 100000
+ if testing.Short() {
+ n = 1000
+ }
+ defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(p))
+
+ for name, testf := range hammer64 {
+ c := make(chan int)
+ var val uint64
+ for i := 0; i < p; i++ {
+ go func() {
+ defer func() {
+ if err := recover(); err != nil {
+ t.Error(err.(string))
+ }
+ c <- 1
+ }()
+ testf(&val, n)
+ }()
+ }
+ for i := 0; i < p; i++ {
+ <-c
+ }
+ if !strings.HasPrefix(name, "Swap") && val != uint64(n)*p {
+ t.Fatalf("%s: val=%d want %d", name, val, n*p)
+ }
+ }
+}
+
+func hammerStoreLoadInt32(t *testing.T, paddr unsafe.Pointer) {
+ addr := (*int32)(paddr)
+ v := LoadInt32(addr)
+ vlo := v & ((1 << 16) - 1)
+ vhi := v >> 16
+ if vlo != vhi {
+ t.Fatalf("Int32: %#x != %#x", vlo, vhi)
+ }
+ new := v + 1 + 1<<16
+ if vlo == 1e4 {
+ new = 0
+ }
+ StoreInt32(addr, new)
+}
+
+func hammerStoreLoadUint32(t *testing.T, paddr unsafe.Pointer) {
+ addr := (*uint32)(paddr)
+ v := LoadUint32(addr)
+ vlo := v & ((1 << 16) - 1)
+ vhi := v >> 16
+ if vlo != vhi {
+ t.Fatalf("Uint32: %#x != %#x", vlo, vhi)
+ }
+ new := v + 1 + 1<<16
+ if vlo == 1e4 {
+ new = 0
+ }
+ StoreUint32(addr, new)
+}
+
+func hammerStoreLoadInt64(t *testing.T, paddr unsafe.Pointer) {
+ addr := (*int64)(paddr)
+ v := LoadInt64(addr)
+ vlo := v & ((1 << 32) - 1)
+ vhi := v >> 32
+ if vlo != vhi {
+ t.Fatalf("Int64: %#x != %#x", vlo, vhi)
+ }
+ new := v + 1 + 1<<32
+ StoreInt64(addr, new)
+}
+
+func hammerStoreLoadUint64(t *testing.T, paddr unsafe.Pointer) {
+ addr := (*uint64)(paddr)
+ v := LoadUint64(addr)
+ vlo := v & ((1 << 32) - 1)
+ vhi := v >> 32
+ if vlo != vhi {
+ t.Fatalf("Uint64: %#x != %#x", vlo, vhi)
+ }
+ new := v + 1 + 1<<32
+ StoreUint64(addr, new)
+}
+
+func hammerStoreLoadUintptr(t *testing.T, paddr unsafe.Pointer) {
+ addr := (*uintptr)(paddr)
+ var test64 uint64 = 1 << 50
+ arch32 := uintptr(test64) == 0
+ v := LoadUintptr(addr)
+ new := v
+ if arch32 {
+ vlo := v & ((1 << 16) - 1)
+ vhi := v >> 16
+ if vlo != vhi {
+ t.Fatalf("Uintptr: %#x != %#x", vlo, vhi)
+ }
+ new = v + 1 + 1<<16
+ if vlo == 1e4 {
+ new = 0
+ }
+ } else {
+ vlo := v & ((1 << 32) - 1)
+ vhi := v >> 32
+ if vlo != vhi {
+ t.Fatalf("Uintptr: %#x != %#x", vlo, vhi)
+ }
+ inc := uint64(1 + 1<<32)
+ new = v + uintptr(inc)
+ }
+ StoreUintptr(addr, new)
+}
+
+func hammerStoreLoadPointer(t *testing.T, paddr unsafe.Pointer) {
+ addr := (*unsafe.Pointer)(paddr)
+ var test64 uint64 = 1 << 50
+ arch32 := uintptr(test64) == 0
+ v := uintptr(LoadPointer(addr))
+ new := v
+ if arch32 {
+ vlo := v & ((1 << 16) - 1)
+ vhi := v >> 16
+ if vlo != vhi {
+ t.Fatalf("Pointer: %#x != %#x", vlo, vhi)
+ }
+ new = v + 1 + 1<<16
+ if vlo == 1e4 {
+ new = 0
+ }
+ } else {
+ vlo := v & ((1 << 32) - 1)
+ vhi := v >> 32
+ if vlo != vhi {
+ t.Fatalf("Pointer: %#x != %#x", vlo, vhi)
+ }
+ inc := uint64(1 + 1<<32)
+ new = v + uintptr(inc)
+ }
+ StorePointer(addr, unsafe.Pointer(new))
+}
+
+func TestHammerStoreLoad(t *testing.T) {
+ var tests []func(*testing.T, unsafe.Pointer)
+ tests = append(tests, hammerStoreLoadInt32, hammerStoreLoadUint32,
+ hammerStoreLoadUintptr, hammerStoreLoadPointer)
+ if test64err == nil {
+ tests = append(tests, hammerStoreLoadInt64, hammerStoreLoadUint64)
+ }
+ n := int(1e6)
+ if testing.Short() {
+ n = int(1e4)
+ }
+ const procs = 8
+ defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(procs))
+ for _, tt := range tests {
+ c := make(chan int)
+ var val uint64
+ for p := 0; p < procs; p++ {
+ go func() {
+ for i := 0; i < n; i++ {
+ tt(t, unsafe.Pointer(&val))
+ }
+ c <- 1
+ }()
+ }
+ for p := 0; p < procs; p++ {
+ <-c
+ }
+ }
+}
+
+func TestStoreLoadSeqCst32(t *testing.T) {
+ if runtime.NumCPU() == 1 {
+ t.Skipf("Skipping test on %v processor machine", runtime.NumCPU())
+ }
+ defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(4))
+ N := int32(1e3)
+ if testing.Short() {
+ N = int32(1e2)
+ }
+ c := make(chan bool, 2)
+ X := [2]int32{}
+ ack := [2][3]int32{{-1, -1, -1}, {-1, -1, -1}}
+ for p := 0; p < 2; p++ {
+ go func(me int) {
+ he := 1 - me
+ for i := int32(1); i < N; i++ {
+ StoreInt32(&X[me], i)
+ my := LoadInt32(&X[he])
+ StoreInt32(&ack[me][i%3], my)
+ for w := 1; LoadInt32(&ack[he][i%3]) == -1; w++ {
+ if w%1000 == 0 {
+ runtime.Gosched()
+ }
+ }
+ his := LoadInt32(&ack[he][i%3])
+ if (my != i && my != i-1) || (his != i && his != i-1) {
+ t.Fatalf("invalid values: %d/%d (%d)", my, his, i)
+ }
+ if my != i && his != i {
+ t.Fatalf("store/load are not sequentially consistent: %d/%d (%d)", my, his, i)
+ }
+ StoreInt32(&ack[me][(i-1)%3], -1)
+ }
+ c <- true
+ }(p)
+ }
+ <-c
+ <-c
+}
+
+func TestStoreLoadSeqCst64(t *testing.T) {
+ if runtime.NumCPU() == 1 {
+ t.Skipf("Skipping test on %v processor machine", runtime.NumCPU())
+ }
+ if test64err != nil {
+ t.Skipf("Skipping 64-bit tests: %v", test64err)
+ }
+ defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(4))
+ N := int64(1e3)
+ if testing.Short() {
+ N = int64(1e2)
+ }
+ c := make(chan bool, 2)
+ X := [2]int64{}
+ ack := [2][3]int64{{-1, -1, -1}, {-1, -1, -1}}
+ for p := 0; p < 2; p++ {
+ go func(me int) {
+ he := 1 - me
+ for i := int64(1); i < N; i++ {
+ StoreInt64(&X[me], i)
+ my := LoadInt64(&X[he])
+ StoreInt64(&ack[me][i%3], my)
+ for w := 1; LoadInt64(&ack[he][i%3]) == -1; w++ {
+ if w%1000 == 0 {
+ runtime.Gosched()
+ }
+ }
+ his := LoadInt64(&ack[he][i%3])
+ if (my != i && my != i-1) || (his != i && his != i-1) {
+ t.Fatalf("invalid values: %d/%d (%d)", my, his, i)
+ }
+ if my != i && his != i {
+ t.Fatalf("store/load are not sequentially consistent: %d/%d (%d)", my, his, i)
+ }
+ StoreInt64(&ack[me][(i-1)%3], -1)
+ }
+ c <- true
+ }(p)
+ }
+ <-c
+ <-c
+}
+
+func TestStoreLoadRelAcq32(t *testing.T) {
+ if runtime.NumCPU() == 1 {
+ t.Skipf("Skipping test on %v processor machine", runtime.NumCPU())
+ }
+ defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(4))
+ N := int32(1e3)
+ if testing.Short() {
+ N = int32(1e2)
+ }
+ c := make(chan bool, 2)
+ type Data struct {
+ signal int32
+ pad1 [128]int8
+ data1 int32
+ pad2 [128]int8
+ data2 float32
+ }
+ var X Data
+ for p := int32(0); p < 2; p++ {
+ go func(p int32) {
+ for i := int32(1); i < N; i++ {
+ if (i+p)%2 == 0 {
+ X.data1 = i
+ X.data2 = float32(i)
+ StoreInt32(&X.signal, i)
+ } else {
+ for w := 1; LoadInt32(&X.signal) != i; w++ {
+ if w%1000 == 0 {
+ runtime.Gosched()
+ }
+ }
+ d1 := X.data1
+ d2 := X.data2
+ if d1 != i || d2 != float32(i) {
+ t.Fatalf("incorrect data: %d/%g (%d)", d1, d2, i)
+ }
+ }
+ }
+ c <- true
+ }(p)
+ }
+ <-c
+ <-c
+}
+
+func TestStoreLoadRelAcq64(t *testing.T) {
+ if runtime.NumCPU() == 1 {
+ t.Skipf("Skipping test on %v processor machine", runtime.NumCPU())
+ }
+ if test64err != nil {
+ t.Skipf("Skipping 64-bit tests: %v", test64err)
+ }
+ defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(4))
+ N := int64(1e3)
+ if testing.Short() {
+ N = int64(1e2)
+ }
+ c := make(chan bool, 2)
+ type Data struct {
+ signal int64
+ pad1 [128]int8
+ data1 int64
+ pad2 [128]int8
+ data2 float64
+ }
+ var X Data
+ for p := int64(0); p < 2; p++ {
+ go func(p int64) {
+ for i := int64(1); i < N; i++ {
+ if (i+p)%2 == 0 {
+ X.data1 = i
+ X.data2 = float64(i)
+ StoreInt64(&X.signal, i)
+ } else {
+ for w := 1; LoadInt64(&X.signal) != i; w++ {
+ if w%1000 == 0 {
+ runtime.Gosched()
+ }
+ }
+ d1 := X.data1
+ d2 := X.data2
+ if d1 != i || d2 != float64(i) {
+ t.Fatalf("incorrect data: %d/%g (%d)", d1, d2, i)
+ }
+ }
+ }
+ c <- true
+ }(p)
+ }
+ <-c
+ <-c
+}
+
+func shouldPanic(t *testing.T, name string, f func()) {
+ defer func() {
+ if recover() == nil {
+ t.Errorf("%s did not panic", name)
+ }
+ }()
+ f()
+}
+
+func TestUnaligned64(t *testing.T) {
+ // Unaligned 64-bit atomics on 32-bit systems are
+ // a continual source of pain. Test that on 32-bit systems they crash
+ // instead of failing silently.
+ if unsafe.Sizeof(int(0)) != 4 {
+ t.Skip("test only runs on 32-bit systems")
+ }
+
+ x := make([]uint32, 4)
+ p := (*uint64)(unsafe.Pointer(&x[1])) // misaligned
+
+ shouldPanic(t, "LoadUint64", func() { LoadUint64(p) })
+ shouldPanic(t, "StoreUint64", func() { StoreUint64(p, 1) })
+ shouldPanic(t, "CompareAndSwapUint64", func() { CompareAndSwapUint64(p, 1, 2) })
+ shouldPanic(t, "AddUint64", func() { AddUint64(p, 3) })
+}
+
+func TestNilDeref(t *testing.T) {
+ if p := runtime.GOOS + "/" + runtime.GOARCH; p == "freebsd/arm" || p == "netbsd/arm" {
+ t.Skipf("issue 7338: skipping test on %q", p)
+ }
+ funcs := [...]func(){
+ func() { CompareAndSwapInt32(nil, 0, 0) },
+ func() { CompareAndSwapInt64(nil, 0, 0) },
+ func() { CompareAndSwapUint32(nil, 0, 0) },
+ func() { CompareAndSwapUint64(nil, 0, 0) },
+ func() { CompareAndSwapUintptr(nil, 0, 0) },
+ func() { CompareAndSwapPointer(nil, nil, nil) },
+ func() { SwapInt32(nil, 0) },
+ func() { SwapUint32(nil, 0) },
+ func() { SwapInt64(nil, 0) },
+ func() { SwapUint64(nil, 0) },
+ func() { SwapUintptr(nil, 0) },
+ func() { SwapPointer(nil, nil) },
+ func() { AddInt32(nil, 0) },
+ func() { AddUint32(nil, 0) },
+ func() { AddInt64(nil, 0) },
+ func() { AddUint64(nil, 0) },
+ func() { AddUintptr(nil, 0) },
+ func() { LoadInt32(nil) },
+ func() { LoadInt64(nil) },
+ func() { LoadUint32(nil) },
+ func() { LoadUint64(nil) },
+ func() { LoadUintptr(nil) },
+ func() { LoadPointer(nil) },
+ func() { StoreInt32(nil, 0) },
+ func() { StoreInt64(nil, 0) },
+ func() { StoreUint32(nil, 0) },
+ func() { StoreUint64(nil, 0) },
+ func() { StoreUintptr(nil, 0) },
+ func() { StorePointer(nil, nil) },
+ }
+ for _, f := range funcs {
+ func() {
+ defer func() {
+ runtime.GC()
+ recover()
+ }()
+ f()
+ }()
+ }
+}
diff --git a/src/sync/atomic/doc.go b/src/sync/atomic/doc.go
new file mode 100644
index 000000000..10fb8c917
--- /dev/null
+++ b/src/sync/atomic/doc.go
@@ -0,0 +1,149 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package atomic provides low-level atomic memory primitives
+// useful for implementing synchronization algorithms.
+//
+// These functions require great care to be used correctly.
+// Except for special, low-level applications, synchronization is better
+// done with channels or the facilities of the sync package.
+// Share memory by communicating;
+// don't communicate by sharing memory.
+//
+// The swap operation, implemented by the SwapT functions, is the atomic
+// equivalent of:
+//
+// old = *addr
+// *addr = new
+// return old
+//
+// The compare-and-swap operation, implemented by the CompareAndSwapT
+// functions, is the atomic equivalent of:
+//
+// if *addr == old {
+// *addr = new
+// return true
+// }
+// return false
+//
+// The add operation, implemented by the AddT functions, is the atomic
+// equivalent of:
+//
+// *addr += delta
+// return *addr
+//
+// The load and store operations, implemented by the LoadT and StoreT
+// functions, are the atomic equivalents of "return *addr" and
+// "*addr = val".
+//
+package atomic
+
+import (
+ "unsafe"
+)
+
+// BUG(rsc): On x86-32, the 64-bit functions use instructions unavailable before the Pentium MMX.
+//
+// On non-Linux ARM, the 64-bit functions use instructions unavailable before the ARMv6k core.
+//
+// On both ARM and x86-32, it is the caller's responsibility to arrange for 64-bit
+// alignment of 64-bit words accessed atomically. The first word in a global
+// variable or in an allocated struct or slice can be relied upon to be
+// 64-bit aligned.
+
+// SwapInt32 atomically stores new into *addr and returns the previous *addr value.
+func SwapInt32(addr *int32, new int32) (old int32)
+
+// SwapInt64 atomically stores new into *addr and returns the previous *addr value.
+func SwapInt64(addr *int64, new int64) (old int64)
+
+// SwapUint32 atomically stores new into *addr and returns the previous *addr value.
+func SwapUint32(addr *uint32, new uint32) (old uint32)
+
+// SwapUint64 atomically stores new into *addr and returns the previous *addr value.
+func SwapUint64(addr *uint64, new uint64) (old uint64)
+
+// SwapUintptr atomically stores new into *addr and returns the previous *addr value.
+func SwapUintptr(addr *uintptr, new uintptr) (old uintptr)
+
+// SwapPointer atomically stores new into *addr and returns the previous *addr value.
+func SwapPointer(addr *unsafe.Pointer, new unsafe.Pointer) (old unsafe.Pointer)
+
+// CompareAndSwapInt32 executes the compare-and-swap operation for an int32 value.
+func CompareAndSwapInt32(addr *int32, old, new int32) (swapped bool)
+
+// CompareAndSwapInt64 executes the compare-and-swap operation for an int64 value.
+func CompareAndSwapInt64(addr *int64, old, new int64) (swapped bool)
+
+// CompareAndSwapUint32 executes the compare-and-swap operation for a uint32 value.
+func CompareAndSwapUint32(addr *uint32, old, new uint32) (swapped bool)
+
+// CompareAndSwapUint64 executes the compare-and-swap operation for a uint64 value.
+func CompareAndSwapUint64(addr *uint64, old, new uint64) (swapped bool)
+
+// CompareAndSwapUintptr executes the compare-and-swap operation for a uintptr value.
+func CompareAndSwapUintptr(addr *uintptr, old, new uintptr) (swapped bool)
+
+// CompareAndSwapPointer executes the compare-and-swap operation for an unsafe.Pointer value.
+func CompareAndSwapPointer(addr *unsafe.Pointer, old, new unsafe.Pointer) (swapped bool)
+
+// AddInt32 atomically adds delta to *addr and returns the new value.
+func AddInt32(addr *int32, delta int32) (new int32)
+
+// AddUint32 atomically adds delta to *addr and returns the new value.
+// To subtract a signed positive constant value c from x, do AddUint32(&x, ^uint32(c-1)).
+// In particular, to decrement x, do AddUint32(&x, ^uint32(0)).
+func AddUint32(addr *uint32, delta uint32) (new uint32)
+
+// AddInt64 atomically adds delta to *addr and returns the new value.
+func AddInt64(addr *int64, delta int64) (new int64)
+
+// AddUint64 atomically adds delta to *addr and returns the new value.
+// To subtract a signed positive constant value c from x, do AddUint64(&x, ^uint64(c-1)).
+// In particular, to decrement x, do AddUint64(&x, ^uint64(0)).
+func AddUint64(addr *uint64, delta uint64) (new uint64)
+
+// AddUintptr atomically adds delta to *addr and returns the new value.
+func AddUintptr(addr *uintptr, delta uintptr) (new uintptr)
+
+// LoadInt32 atomically loads *addr.
+func LoadInt32(addr *int32) (val int32)
+
+// LoadInt64 atomically loads *addr.
+func LoadInt64(addr *int64) (val int64)
+
+// LoadUint32 atomically loads *addr.
+func LoadUint32(addr *uint32) (val uint32)
+
+// LoadUint64 atomically loads *addr.
+func LoadUint64(addr *uint64) (val uint64)
+
+// LoadUintptr atomically loads *addr.
+func LoadUintptr(addr *uintptr) (val uintptr)
+
+// LoadPointer atomically loads *addr.
+func LoadPointer(addr *unsafe.Pointer) (val unsafe.Pointer)
+
+// StoreInt32 atomically stores val into *addr.
+func StoreInt32(addr *int32, val int32)
+
+// StoreInt64 atomically stores val into *addr.
+func StoreInt64(addr *int64, val int64)
+
+// StoreUint32 atomically stores val into *addr.
+func StoreUint32(addr *uint32, val uint32)
+
+// StoreUint64 atomically stores val into *addr.
+func StoreUint64(addr *uint64, val uint64)
+
+// StoreUintptr atomically stores val into *addr.
+func StoreUintptr(addr *uintptr, val uintptr)
+
+// StorePointer atomically stores val into *addr.
+func StorePointer(addr *unsafe.Pointer, val unsafe.Pointer)
+
+// Helper for ARM. The linker will discard it on other systems.
+func panic64() {
+ panic("sync/atomic: broken 64-bit atomic operations (buggy QEMU)")
+}
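
The package documentation and the alignment note above boil down to a few recurring patterns. Below is a minimal, illustrative sketch, not part of this change (the counters type and the update helper are invented for the example): an Add-based counter, a CompareAndSwap retry loop, and a struct whose 64-bit field is placed first so it stays 64-bit aligned on 32-bit platforms.

	package main

	import (
		"fmt"
		"sync"
		"sync/atomic"
	)

	// counters keeps its 64-bit field first so it is 64-bit aligned even on
	// 32-bit platforms, per the alignment note in the package documentation.
	type counters struct {
		hits  uint64 // accessed atomically; must stay the first field
		label string
	}

	func main() {
		var c counters
		var wg sync.WaitGroup
		for i := 0; i < 8; i++ {
			wg.Add(1)
			go func() {
				defer wg.Done()
				for j := 0; j < 1000; j++ {
					atomic.AddUint64(&c.hits, 1)
				}
			}()
		}
		wg.Wait()
		fmt.Println(atomic.LoadUint64(&c.hits)) // 8000

		// CompareAndSwap retry loop: record a new maximum atomically.
		var max int64
		update := func(v int64) {
			for {
				old := atomic.LoadInt64(&max)
				if v <= old || atomic.CompareAndSwapInt64(&max, old, v) {
					return
				}
			}
		}
		update(42)
		fmt.Println(atomic.LoadInt64(&max)) // 42
	}
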
diff --git a/src/sync/atomic/export_linux_arm_test.go b/src/sync/atomic/export_linux_arm_test.go
new file mode 100644
index 000000000..5cd43353e
--- /dev/null
+++ b/src/sync/atomic/export_linux_arm_test.go
@@ -0,0 +1,9 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomic
+
+func generalCAS64(addr *uint64, old uint64, new uint64) bool
+
+var GeneralCAS64 = generalCAS64
diff --git a/src/sync/atomic/race.s b/src/sync/atomic/race.s
new file mode 100644
index 000000000..bdce7668b
--- /dev/null
+++ b/src/sync/atomic/race.s
@@ -0,0 +1,8 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build race
+
+// This file is here only to allow external functions.
+// The operations are implemented in src/runtime/race_amd64.s
diff --git a/src/sync/cond.go b/src/sync/cond.go
new file mode 100644
index 000000000..9e6bc170f
--- /dev/null
+++ b/src/sync/cond.go
@@ -0,0 +1,118 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sync
+
+import (
+ "sync/atomic"
+ "unsafe"
+)
+
+// Cond implements a condition variable, a rendezvous point
+// for goroutines waiting for or announcing the occurrence
+// of an event.
+//
+// Each Cond has an associated Locker L (often a *Mutex or *RWMutex),
+// which must be held when changing the condition and
+// when calling the Wait method.
+//
+// A Cond can be created as part of other structures.
+// A Cond must not be copied after first use.
+type Cond struct {
+ // L is held while observing or changing the condition
+ L Locker
+
+ sema syncSema
+ waiters uint32 // number of waiters
+ checker copyChecker
+}
+
+// NewCond returns a new Cond with Locker l.
+func NewCond(l Locker) *Cond {
+ return &Cond{L: l}
+}
+
+// Wait atomically unlocks c.L and suspends execution
+// of the calling goroutine. After later resuming execution,
+// Wait locks c.L before returning. Unlike in other systems,
+// Wait cannot return unless awoken by Broadcast or Signal.
+//
+// Because c.L is not locked when Wait first resumes, the caller
+// typically cannot assume that the condition is true when
+// Wait returns. Instead, the caller should Wait in a loop:
+//
+// c.L.Lock()
+// for !condition() {
+// c.Wait()
+// }
+// ... make use of condition ...
+// c.L.Unlock()
+//
+func (c *Cond) Wait() {
+ c.checker.check()
+ if raceenabled {
+ raceDisable()
+ }
+ atomic.AddUint32(&c.waiters, 1)
+ if raceenabled {
+ raceEnable()
+ }
+ c.L.Unlock()
+ runtime_Syncsemacquire(&c.sema)
+ c.L.Lock()
+}
+
+// Signal wakes one goroutine waiting on c, if there is any.
+//
+// It is allowed but not required for the caller to hold c.L
+// during the call.
+func (c *Cond) Signal() {
+ c.signalImpl(false)
+}
+
+// Broadcast wakes all goroutines waiting on c.
+//
+// It is allowed but not required for the caller to hold c.L
+// during the call.
+func (c *Cond) Broadcast() {
+ c.signalImpl(true)
+}
+
+func (c *Cond) signalImpl(all bool) {
+ c.checker.check()
+ if raceenabled {
+ raceDisable()
+ }
+ for {
+ old := atomic.LoadUint32(&c.waiters)
+ if old == 0 {
+ if raceenabled {
+ raceEnable()
+ }
+ return
+ }
+ new := old - 1
+ if all {
+ new = 0
+ }
+ if atomic.CompareAndSwapUint32(&c.waiters, old, new) {
+ if raceenabled {
+ raceEnable()
+ }
+ runtime_Syncsemrelease(&c.sema, old-new)
+ return
+ }
+ }
+}
+
+// copyChecker holds a back pointer to itself to detect object copying.
+type copyChecker uintptr
+
+func (c *copyChecker) check() {
+ if uintptr(*c) != uintptr(unsafe.Pointer(c)) &&
+ !atomic.CompareAndSwapUintptr((*uintptr)(c), 0, uintptr(unsafe.Pointer(c))) &&
+ uintptr(*c) != uintptr(unsafe.Pointer(c)) {
+ panic("sync.Cond is copied")
+ }
+}
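
The Wait-in-a-loop pattern from the Cond documentation, sketched as a small stand-alone program (the integer queue is only an illustrative payload): one goroutine waits until the queue is non-empty while another fills it and signals.

	package main

	import (
		"fmt"
		"sync"
	)

	func main() {
		var mu sync.Mutex
		cond := sync.NewCond(&mu)
		var queue []int

		go func() {
			mu.Lock()
			queue = append(queue, 1, 2, 3)
			mu.Unlock()
			cond.Signal() // wake one waiter; Broadcast would wake all
		}()

		mu.Lock()
		for len(queue) == 0 { // re-check the condition after every wakeup
			cond.Wait() // atomically unlocks mu, blocks, and relocks before returning
		}
		fmt.Println(queue) // [1 2 3]
		mu.Unlock()
	}
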
diff --git a/src/sync/cond_test.go b/src/sync/cond_test.go
new file mode 100644
index 000000000..467c80621
--- /dev/null
+++ b/src/sync/cond_test.go
@@ -0,0 +1,255 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+package sync_test
+
+import (
+ . "sync"
+
+ "runtime"
+ "testing"
+)
+
+func TestCondSignal(t *testing.T) {
+ var m Mutex
+ c := NewCond(&m)
+ n := 2
+ running := make(chan bool, n)
+ awake := make(chan bool, n)
+ for i := 0; i < n; i++ {
+ go func() {
+ m.Lock()
+ running <- true
+ c.Wait()
+ awake <- true
+ m.Unlock()
+ }()
+ }
+ for i := 0; i < n; i++ {
+ <-running // Wait for everyone to run.
+ }
+ for n > 0 {
+ select {
+ case <-awake:
+ t.Fatal("goroutine not asleep")
+ default:
+ }
+ m.Lock()
+ c.Signal()
+ m.Unlock()
+ <-awake // Will deadlock if no goroutine wakes up
+ select {
+ case <-awake:
+ t.Fatal("too many goroutines awake")
+ default:
+ }
+ n--
+ }
+ c.Signal()
+}
+
+func TestCondSignalGenerations(t *testing.T) {
+ var m Mutex
+ c := NewCond(&m)
+ n := 100
+ running := make(chan bool, n)
+ awake := make(chan int, n)
+ for i := 0; i < n; i++ {
+ go func(i int) {
+ m.Lock()
+ running <- true
+ c.Wait()
+ awake <- i
+ m.Unlock()
+ }(i)
+ if i > 0 {
+ a := <-awake
+ if a != i-1 {
+ t.Fatalf("wrong goroutine woke up: want %d, got %d", i-1, a)
+ }
+ }
+ <-running
+ m.Lock()
+ c.Signal()
+ m.Unlock()
+ }
+}
+
+func TestCondBroadcast(t *testing.T) {
+ var m Mutex
+ c := NewCond(&m)
+ n := 200
+ running := make(chan int, n)
+ awake := make(chan int, n)
+ exit := false
+ for i := 0; i < n; i++ {
+ go func(g int) {
+ m.Lock()
+ for !exit {
+ running <- g
+ c.Wait()
+ awake <- g
+ }
+ m.Unlock()
+ }(i)
+ }
+ for i := 0; i < n; i++ {
+ for i := 0; i < n; i++ {
+ <-running // Will deadlock unless n are running.
+ }
+ if i == n-1 {
+ m.Lock()
+ exit = true
+ m.Unlock()
+ }
+ select {
+ case <-awake:
+ t.Fatal("goroutine not asleep")
+ default:
+ }
+ m.Lock()
+ c.Broadcast()
+ m.Unlock()
+ seen := make([]bool, n)
+ for i := 0; i < n; i++ {
+ g := <-awake
+ if seen[g] {
+ t.Fatal("goroutine woke up twice")
+ }
+ seen[g] = true
+ }
+ }
+ select {
+ case <-running:
+ t.Fatal("goroutine did not exit")
+ default:
+ }
+ c.Broadcast()
+}
+
+func TestRace(t *testing.T) {
+ x := 0
+ c := NewCond(&Mutex{})
+ done := make(chan bool)
+ go func() {
+ c.L.Lock()
+ x = 1
+ c.Wait()
+ if x != 2 {
+ t.Fatal("want 2")
+ }
+ x = 3
+ c.Signal()
+ c.L.Unlock()
+ done <- true
+ }()
+ go func() {
+ c.L.Lock()
+ for {
+ if x == 1 {
+ x = 2
+ c.Signal()
+ break
+ }
+ c.L.Unlock()
+ runtime.Gosched()
+ c.L.Lock()
+ }
+ c.L.Unlock()
+ done <- true
+ }()
+ go func() {
+ c.L.Lock()
+ for {
+ if x == 2 {
+ c.Wait()
+ if x != 3 {
+ t.Fatal("want 3")
+ }
+ break
+ }
+ if x == 3 {
+ break
+ }
+ c.L.Unlock()
+ runtime.Gosched()
+ c.L.Lock()
+ }
+ c.L.Unlock()
+ done <- true
+ }()
+ <-done
+ <-done
+ <-done
+}
+
+func TestCondCopy(t *testing.T) {
+ defer func() {
+ err := recover()
+ if err == nil || err.(string) != "sync.Cond is copied" {
+ t.Fatalf("got %v, expect sync.Cond is copied", err)
+ }
+ }()
+ c := Cond{L: &Mutex{}}
+ c.Signal()
+ c2 := c
+ c2.Signal()
+}
+
+func BenchmarkCond1(b *testing.B) {
+ benchmarkCond(b, 1)
+}
+
+func BenchmarkCond2(b *testing.B) {
+ benchmarkCond(b, 2)
+}
+
+func BenchmarkCond4(b *testing.B) {
+ benchmarkCond(b, 4)
+}
+
+func BenchmarkCond8(b *testing.B) {
+ benchmarkCond(b, 8)
+}
+
+func BenchmarkCond16(b *testing.B) {
+ benchmarkCond(b, 16)
+}
+
+func BenchmarkCond32(b *testing.B) {
+ benchmarkCond(b, 32)
+}
+
+func benchmarkCond(b *testing.B, waiters int) {
+ c := NewCond(&Mutex{})
+ done := make(chan bool)
+ id := 0
+
+ for routine := 0; routine < waiters+1; routine++ {
+ go func() {
+ for i := 0; i < b.N; i++ {
+ c.L.Lock()
+ if id == -1 {
+ c.L.Unlock()
+ break
+ }
+ id++
+ if id == waiters+1 {
+ id = 0
+ c.Broadcast()
+ } else {
+ c.Wait()
+ }
+ c.L.Unlock()
+ }
+ c.L.Lock()
+ id = -1
+ c.Broadcast()
+ c.L.Unlock()
+ done <- true
+ }()
+ }
+ for routine := 0; routine < waiters+1; routine++ {
+ <-done
+ }
+}
diff --git a/src/sync/example_test.go b/src/sync/example_test.go
new file mode 100644
index 000000000..bdd3af6fe
--- /dev/null
+++ b/src/sync/example_test.go
@@ -0,0 +1,59 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sync_test
+
+import (
+ "fmt"
+ "sync"
+)
+
+type httpPkg struct{}
+
+func (httpPkg) Get(url string) {}
+
+var http httpPkg
+
+// This example fetches several URLs concurrently,
+// using a WaitGroup to block until all the fetches are complete.
+func ExampleWaitGroup() {
+ var wg sync.WaitGroup
+ var urls = []string{
+ "http://www.golang.org/",
+ "http://www.google.com/",
+ "http://www.somestupidname.com/",
+ }
+ for _, url := range urls {
+ // Increment the WaitGroup counter.
+ wg.Add(1)
+ // Launch a goroutine to fetch the URL.
+ go func(url string) {
+ // Decrement the counter when the goroutine completes.
+ defer wg.Done()
+ // Fetch the URL.
+ http.Get(url)
+ }(url)
+ }
+ // Wait for all HTTP fetches to complete.
+ wg.Wait()
+}
+
+func ExampleOnce() {
+ var once sync.Once
+ onceBody := func() {
+ fmt.Println("Only once")
+ }
+ done := make(chan bool)
+ for i := 0; i < 10; i++ {
+ go func() {
+ once.Do(onceBody)
+ done <- true
+ }()
+ }
+ for i := 0; i < 10; i++ {
+ <-done
+ }
+ // Output:
+ // Only once
+}
diff --git a/src/sync/export_test.go b/src/sync/export_test.go
new file mode 100644
index 000000000..fa5983a2d
--- /dev/null
+++ b/src/sync/export_test.go
@@ -0,0 +1,9 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sync
+
+// Export for testing.
+var Runtime_Semacquire = runtime_Semacquire
+var Runtime_Semrelease = runtime_Semrelease
diff --git a/src/sync/mutex.go b/src/sync/mutex.go
new file mode 100644
index 000000000..73b337702
--- /dev/null
+++ b/src/sync/mutex.go
@@ -0,0 +1,109 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package sync provides basic synchronization primitives such as mutual
+// exclusion locks. Other than the Once and WaitGroup types, most are intended
+// for use by low-level library routines. Higher-level synchronization is
+// better done via channels and communication.
+//
+// Values containing the types defined in this package should not be copied.
+package sync
+
+import (
+ "sync/atomic"
+ "unsafe"
+)
+
+// A Mutex is a mutual exclusion lock.
+// Mutexes can be created as part of other structures;
+// the zero value for a Mutex is an unlocked mutex.
+type Mutex struct {
+ state int32
+ sema uint32
+}
+
+// A Locker represents an object that can be locked and unlocked.
+type Locker interface {
+ Lock()
+ Unlock()
+}
+
+const (
+ mutexLocked = 1 << iota // mutex is locked
+ mutexWoken
+ mutexWaiterShift = iota
+)
+
+// Lock locks m.
+// If the lock is already in use, the calling goroutine
+// blocks until the mutex is available.
+func (m *Mutex) Lock() {
+ // Fast path: grab unlocked mutex.
+ if atomic.CompareAndSwapInt32(&m.state, 0, mutexLocked) {
+ if raceenabled {
+ raceAcquire(unsafe.Pointer(m))
+ }
+ return
+ }
+
+ awoke := false
+ for {
+ old := m.state
+ new := old | mutexLocked
+ if old&mutexLocked != 0 {
+ new = old + 1<<mutexWaiterShift
+ }
+ if awoke {
+ // The goroutine has been woken from sleep,
+ // so we need to reset the flag in either case.
+ new &^= mutexWoken
+ }
+ if atomic.CompareAndSwapInt32(&m.state, old, new) {
+ if old&mutexLocked == 0 {
+ break
+ }
+ runtime_Semacquire(&m.sema)
+ awoke = true
+ }
+ }
+
+ if raceenabled {
+ raceAcquire(unsafe.Pointer(m))
+ }
+}
+
+// Unlock unlocks m.
+// It is a run-time error if m is not locked on entry to Unlock.
+//
+// A locked Mutex is not associated with a particular goroutine.
+// It is allowed for one goroutine to lock a Mutex and then
+// arrange for another goroutine to unlock it.
+func (m *Mutex) Unlock() {
+ if raceenabled {
+ _ = m.state
+ raceRelease(unsafe.Pointer(m))
+ }
+
+ // Fast path: drop lock bit.
+ new := atomic.AddInt32(&m.state, -mutexLocked)
+ if (new+mutexLocked)&mutexLocked == 0 {
+ panic("sync: unlock of unlocked mutex")
+ }
+
+ old := new
+ for {
+ // If there are no waiters or a goroutine has already
+ // been woken or grabbed the lock, no need to wake anyone.
+ if old>>mutexWaiterShift == 0 || old&(mutexLocked|mutexWoken) != 0 {
+ return
+ }
+ // Grab the right to wake someone.
+ new = (old - 1<<mutexWaiterShift) | mutexWoken
+ if atomic.CompareAndSwapInt32(&m.state, old, new) {
+ runtime_Semrelease(&m.sema)
+ return
+ }
+ old = m.state
+ }
+}
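
Ordinary Mutex use, as a brief illustrative sketch relying on the zero value being an unlocked mutex (the safeCounter type is invented for the example):

	package main

	import (
		"fmt"
		"sync"
	)

	type safeCounter struct {
		mu sync.Mutex // zero value is an unlocked mutex; no constructor needed
		n  map[string]int
	}

	func (c *safeCounter) inc(key string) {
		c.mu.Lock()
		defer c.mu.Unlock() // released on every return path
		c.n[key]++
	}

	func main() {
		c := safeCounter{n: make(map[string]int)}
		var wg sync.WaitGroup
		for i := 0; i < 100; i++ {
			wg.Add(1)
			go func() {
				defer wg.Done()
				c.inc("hits")
			}()
		}
		wg.Wait()
		fmt.Println(c.n["hits"]) // 100
	}
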
diff --git a/src/sync/mutex_test.go b/src/sync/mutex_test.go
new file mode 100644
index 000000000..151b25c10
--- /dev/null
+++ b/src/sync/mutex_test.go
@@ -0,0 +1,136 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// GOMAXPROCS=10 go test
+
+package sync_test
+
+import (
+ "runtime"
+ . "sync"
+ "testing"
+)
+
+func HammerSemaphore(s *uint32, loops int, cdone chan bool) {
+ for i := 0; i < loops; i++ {
+ Runtime_Semacquire(s)
+ Runtime_Semrelease(s)
+ }
+ cdone <- true
+}
+
+func TestSemaphore(t *testing.T) {
+ s := new(uint32)
+ *s = 1
+ c := make(chan bool)
+ for i := 0; i < 10; i++ {
+ go HammerSemaphore(s, 1000, c)
+ }
+ for i := 0; i < 10; i++ {
+ <-c
+ }
+}
+
+func BenchmarkUncontendedSemaphore(b *testing.B) {
+ s := new(uint32)
+ *s = 1
+ HammerSemaphore(s, b.N, make(chan bool, 2))
+}
+
+func BenchmarkContendedSemaphore(b *testing.B) {
+ b.StopTimer()
+ s := new(uint32)
+ *s = 1
+ c := make(chan bool)
+ defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2))
+ b.StartTimer()
+
+ go HammerSemaphore(s, b.N/2, c)
+ go HammerSemaphore(s, b.N/2, c)
+ <-c
+ <-c
+}
+
+func HammerMutex(m *Mutex, loops int, cdone chan bool) {
+ for i := 0; i < loops; i++ {
+ m.Lock()
+ m.Unlock()
+ }
+ cdone <- true
+}
+
+func TestMutex(t *testing.T) {
+ m := new(Mutex)
+ c := make(chan bool)
+ for i := 0; i < 10; i++ {
+ go HammerMutex(m, 1000, c)
+ }
+ for i := 0; i < 10; i++ {
+ <-c
+ }
+}
+
+func TestMutexPanic(t *testing.T) {
+ defer func() {
+ if recover() == nil {
+ t.Fatalf("unlock of unlocked mutex did not panic")
+ }
+ }()
+
+ var mu Mutex
+ mu.Lock()
+ mu.Unlock()
+ mu.Unlock()
+}
+
+func BenchmarkMutexUncontended(b *testing.B) {
+ type PaddedMutex struct {
+ Mutex
+ pad [128]uint8
+ }
+ b.RunParallel(func(pb *testing.PB) {
+ var mu PaddedMutex
+ for pb.Next() {
+ mu.Lock()
+ mu.Unlock()
+ }
+ })
+}
+
+func benchmarkMutex(b *testing.B, slack, work bool) {
+ var mu Mutex
+ if slack {
+ b.SetParallelism(10)
+ }
+ b.RunParallel(func(pb *testing.PB) {
+ foo := 0
+ for pb.Next() {
+ mu.Lock()
+ mu.Unlock()
+ if work {
+ for i := 0; i < 100; i++ {
+ foo *= 2
+ foo /= 2
+ }
+ }
+ }
+ _ = foo
+ })
+}
+
+func BenchmarkMutex(b *testing.B) {
+ benchmarkMutex(b, false, false)
+}
+
+func BenchmarkMutexSlack(b *testing.B) {
+ benchmarkMutex(b, true, false)
+}
+
+func BenchmarkMutexWork(b *testing.B) {
+ benchmarkMutex(b, false, true)
+}
+
+func BenchmarkMutexWorkSlack(b *testing.B) {
+ benchmarkMutex(b, true, true)
+}
diff --git a/src/sync/once.go b/src/sync/once.go
new file mode 100644
index 000000000..161ae3b3e
--- /dev/null
+++ b/src/sync/once.go
@@ -0,0 +1,43 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sync
+
+import (
+ "sync/atomic"
+)
+
+// Once is an object that will perform exactly one action.
+type Once struct {
+ m Mutex
+ done uint32
+}
+
+// Do calls the function f if and only if Do is being called for the
+// first time for this instance of Once. In other words, given
+// var once Once
+// if once.Do(f) is called multiple times, only the first call will invoke f,
+// even if f has a different value in each invocation. A new instance of
+// Once is required for each function to execute.
+//
+// Do is intended for initialization that must be run exactly once. Since f
+// is niladic, it may be necessary to use a function literal to capture the
+// arguments to a function to be invoked by Do:
+// config.once.Do(func() { config.init(filename) })
+//
+// Because no call to Do returns until the one call to f returns, if f causes
+// Do to be called, it will deadlock.
+//
+func (o *Once) Do(f func()) {
+ if atomic.LoadUint32(&o.done) == 1 {
+ return
+ }
+ // Slow-path.
+ o.m.Lock()
+ defer o.m.Unlock()
+ if o.done == 0 {
+ f()
+ atomic.StoreUint32(&o.done, 1)
+ }
+}
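
The function-literal idiom the Do documentation mentions, sketched briefly (loadConfig and the file name are purely illustrative): because f takes no arguments, whatever it needs is captured by a closure, and the load runs at most once no matter how many callers race.

	package main

	import (
		"fmt"
		"sync"
	)

	var (
		loadOnce sync.Once
		cfg      map[string]string
	)

	// loadConfig stands in for an expensive, run-exactly-once initialization.
	func loadConfig(filename string) map[string]string {
		return map[string]string{"source": filename}
	}

	func config() map[string]string {
		loadOnce.Do(func() { cfg = loadConfig("app.conf") })
		return cfg
	}

	func main() {
		fmt.Println(config()["source"]) // app.conf
		fmt.Println(config()["source"]) // still app.conf; loadConfig ran once
	}
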
diff --git a/src/sync/once_test.go b/src/sync/once_test.go
new file mode 100644
index 000000000..8afda82f3
--- /dev/null
+++ b/src/sync/once_test.go
@@ -0,0 +1,70 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sync_test
+
+import (
+ . "sync"
+ "testing"
+)
+
+type one int
+
+func (o *one) Increment() {
+ *o++
+}
+
+func run(t *testing.T, once *Once, o *one, c chan bool) {
+ once.Do(func() { o.Increment() })
+ if v := *o; v != 1 {
+ t.Errorf("once failed inside run: %d is not 1", v)
+ }
+ c <- true
+}
+
+func TestOnce(t *testing.T) {
+ o := new(one)
+ once := new(Once)
+ c := make(chan bool)
+ const N = 10
+ for i := 0; i < N; i++ {
+ go run(t, once, o, c)
+ }
+ for i := 0; i < N; i++ {
+ <-c
+ }
+ if *o != 1 {
+ t.Errorf("once failed outside run: %d is not 1", *o)
+ }
+}
+
+func TestOncePanic(t *testing.T) {
+ once := new(Once)
+ for i := 0; i < 2; i++ {
+ func() {
+ defer func() {
+ if recover() == nil {
+ t.Fatalf("Once.Do() has not panic'ed")
+ }
+ }()
+ once.Do(func() {
+ panic("failed")
+ })
+ }()
+ }
+ once.Do(func() {})
+ once.Do(func() {
+ t.Fatalf("Once called twice")
+ })
+}
+
+func BenchmarkOnce(b *testing.B) {
+ var once Once
+ f := func() {}
+ b.RunParallel(func(pb *testing.PB) {
+ for pb.Next() {
+ once.Do(f)
+ }
+ })
+}
diff --git a/src/sync/pool.go b/src/sync/pool.go
new file mode 100644
index 000000000..1f08707cd
--- /dev/null
+++ b/src/sync/pool.go
@@ -0,0 +1,223 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sync
+
+import (
+ "runtime"
+ "sync/atomic"
+ "unsafe"
+)
+
+// A Pool is a set of temporary objects that may be individually saved and
+// retrieved.
+//
+// Any item stored in the Pool may be removed automatically at any time without
+// notification. If the Pool holds the only reference when this happens, the
+// item might be deallocated.
+//
+// A Pool is safe for use by multiple goroutines simultaneously.
+//
+// Pool's purpose is to cache allocated but unused items for later reuse,
+// relieving pressure on the garbage collector. That is, it makes it easy to
+// build efficient, thread-safe free lists. However, it is not suitable for all
+// free lists.
+//
+// An appropriate use of a Pool is to manage a group of temporary items
+// silently shared among and potentially reused by concurrent independent
+// clients of a package. Pool provides a way to amortize allocation overhead
+// across many clients.
+//
+// An example of good use of a Pool is in the fmt package, which maintains a
+// dynamically-sized store of temporary output buffers. The store scales under
+// load (when many goroutines are actively printing) and shrinks when
+// quiescent.
+//
+// On the other hand, a free list maintained as part of a short-lived object is
+// not a suitable use for a Pool, since the overhead does not amortize well in
+// that scenario. It is more efficient to have such objects implement their own
+// free list.
+//
+type Pool struct {
+ local unsafe.Pointer // local fixed-size per-P pool, actual type is [P]poolLocal
+ localSize uintptr // size of the local array
+
+ // New optionally specifies a function to generate
+ // a value when Get would otherwise return nil.
+ // It may not be changed concurrently with calls to Get.
+ New func() interface{}
+}
+
+// Local per-P Pool appendix.
+type poolLocal struct {
+ private interface{} // Can be used only by the respective P.
+ shared []interface{} // Can be used by any P.
+ Mutex // Protects shared.
+ pad [128]byte // Prevents false sharing.
+}
+
+// Put adds x to the pool.
+func (p *Pool) Put(x interface{}) {
+ if raceenabled {
+ // Under race detector the Pool degenerates into no-op.
+ // It's conforming, simple and does not introduce excessive
+ // happens-before edges between unrelated goroutines.
+ return
+ }
+ if x == nil {
+ return
+ }
+ l := p.pin()
+ if l.private == nil {
+ l.private = x
+ x = nil
+ }
+ runtime_procUnpin()
+ if x == nil {
+ return
+ }
+ l.Lock()
+ l.shared = append(l.shared, x)
+ l.Unlock()
+}
+
+// Get selects an arbitrary item from the Pool, removes it from the
+// Pool, and returns it to the caller.
+// Get may choose to ignore the pool and treat it as empty.
+// Callers should not assume any relation between values passed to Put and
+// the values returned by Get.
+//
+// If Get would otherwise return nil and p.New is non-nil, Get returns
+// the result of calling p.New.
+func (p *Pool) Get() interface{} {
+ if raceenabled {
+ if p.New != nil {
+ return p.New()
+ }
+ return nil
+ }
+ l := p.pin()
+ x := l.private
+ l.private = nil
+ runtime_procUnpin()
+ if x != nil {
+ return x
+ }
+ l.Lock()
+ last := len(l.shared) - 1
+ if last >= 0 {
+ x = l.shared[last]
+ l.shared = l.shared[:last]
+ }
+ l.Unlock()
+ if x != nil {
+ return x
+ }
+ return p.getSlow()
+}
+
+func (p *Pool) getSlow() (x interface{}) {
+ // See the comment in pin regarding ordering of the loads.
+ size := atomic.LoadUintptr(&p.localSize) // load-acquire
+ local := p.local // load-consume
+ // Try to steal one element from other procs.
+ pid := runtime_procPin()
+ runtime_procUnpin()
+ for i := 0; i < int(size); i++ {
+ l := indexLocal(local, (pid+i+1)%int(size))
+ l.Lock()
+ last := len(l.shared) - 1
+ if last >= 0 {
+ x = l.shared[last]
+ l.shared = l.shared[:last]
+ l.Unlock()
+ break
+ }
+ l.Unlock()
+ }
+
+ if x == nil && p.New != nil {
+ x = p.New()
+ }
+ return x
+}
+
+// pin pins the current goroutine to P, disables preemption, and returns the poolLocal for that P.
+// Caller must call runtime_procUnpin() when done with the pool.
+func (p *Pool) pin() *poolLocal {
+ pid := runtime_procPin()
+	// In pinSlow we store to localSize and then to local; here we load in the opposite order.
+	// Since we've disabled preemption, GC can not happen in between.
+	// Thus here we must observe local at least as large as localSize.
+	// It is fine to observe a newer/larger local (we must observe its zero-initialized-ness).
+ s := atomic.LoadUintptr(&p.localSize) // load-acquire
+ l := p.local // load-consume
+ if uintptr(pid) < s {
+ return indexLocal(l, pid)
+ }
+ return p.pinSlow()
+}
+
+func (p *Pool) pinSlow() *poolLocal {
+ // Retry under the mutex.
+ // Can not lock the mutex while pinned.
+ runtime_procUnpin()
+ allPoolsMu.Lock()
+ defer allPoolsMu.Unlock()
+ pid := runtime_procPin()
+ // poolCleanup won't be called while we are pinned.
+ s := p.localSize
+ l := p.local
+ if uintptr(pid) < s {
+ return indexLocal(l, pid)
+ }
+ if p.local == nil {
+ allPools = append(allPools, p)
+ }
+ // If GOMAXPROCS changes between GCs, we re-allocate the array and lose the old one.
+ size := runtime.GOMAXPROCS(0)
+ local := make([]poolLocal, size)
+ atomic.StorePointer((*unsafe.Pointer)(&p.local), unsafe.Pointer(&local[0])) // store-release
+ atomic.StoreUintptr(&p.localSize, uintptr(size)) // store-release
+ return &local[pid]
+}
+
+func poolCleanup() {
+ // This function is called with the world stopped, at the beginning of a garbage collection.
+ // It must not allocate and probably should not call any runtime functions.
+	// Defensively zero out everything, for two reasons:
+	// 1. To prevent false retention of whole Pools.
+	// 2. If GC happens while a goroutine works with l.shared in Put/Get,
+	//    it will retain the whole Pool, so next cycle's memory consumption would be doubled.
+ for i, p := range allPools {
+ allPools[i] = nil
+ for i := 0; i < int(p.localSize); i++ {
+ l := indexLocal(p.local, i)
+ l.private = nil
+ for j := range l.shared {
+ l.shared[j] = nil
+ }
+ l.shared = nil
+ }
+ }
+ allPools = []*Pool{}
+}
+
+var (
+ allPoolsMu Mutex
+ allPools []*Pool
+)
+
+func init() {
+ runtime_registerPoolCleanup(poolCleanup)
+}
+
+func indexLocal(l unsafe.Pointer, i int) *poolLocal {
+ return &(*[1000000]poolLocal)(l)[i]
+}
+
+// Implemented in runtime.
+func runtime_registerPoolCleanup(cleanup func())
+func runtime_procPin() int
+func runtime_procUnpin()
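
The buffer-reuse pattern the Pool documentation describes, sketched in the spirit of the fmt example it cites (the render function is invented): New supplies a fresh value when the pool is empty, and the caller resets any state left by a previous user.

	package main

	import (
		"bytes"
		"fmt"
		"sync"
	)

	var bufPool = sync.Pool{
		New: func() interface{} { return new(bytes.Buffer) },
	}

	func render(name string) string {
		buf := bufPool.Get().(*bytes.Buffer)
		buf.Reset() // the buffer may still hold a previous user's data
		defer bufPool.Put(buf)
		fmt.Fprintf(buf, "hello, %s", name)
		return buf.String()
	}

	func main() {
		fmt.Println(render("gopher"))
	}
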
diff --git a/src/sync/pool_test.go b/src/sync/pool_test.go
new file mode 100644
index 000000000..cf5c8bd90
--- /dev/null
+++ b/src/sync/pool_test.go
@@ -0,0 +1,151 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Pool is a no-op under the race detector, so none of these tests work.
+// +build !race
+
+package sync_test
+
+import (
+ "runtime"
+ "runtime/debug"
+ . "sync"
+ "sync/atomic"
+ "testing"
+ "time"
+)
+
+func TestPool(t *testing.T) {
+ // disable GC so we can control when it happens.
+ defer debug.SetGCPercent(debug.SetGCPercent(-1))
+ var p Pool
+ if p.Get() != nil {
+ t.Fatal("expected empty")
+ }
+ p.Put("a")
+ p.Put("b")
+ if g := p.Get(); g != "a" {
+ t.Fatalf("got %#v; want a", g)
+ }
+ if g := p.Get(); g != "b" {
+ t.Fatalf("got %#v; want b", g)
+ }
+ if g := p.Get(); g != nil {
+ t.Fatalf("got %#v; want nil", g)
+ }
+
+ p.Put("c")
+ debug.SetGCPercent(100) // to allow following GC to actually run
+ runtime.GC()
+ if g := p.Get(); g != nil {
+ t.Fatalf("got %#v; want nil after GC", g)
+ }
+}
+
+func TestPoolNew(t *testing.T) {
+ // disable GC so we can control when it happens.
+ defer debug.SetGCPercent(debug.SetGCPercent(-1))
+
+ i := 0
+ p := Pool{
+ New: func() interface{} {
+ i++
+ return i
+ },
+ }
+ if v := p.Get(); v != 1 {
+ t.Fatalf("got %v; want 1", v)
+ }
+ if v := p.Get(); v != 2 {
+ t.Fatalf("got %v; want 2", v)
+ }
+ p.Put(42)
+ if v := p.Get(); v != 42 {
+ t.Fatalf("got %v; want 42", v)
+ }
+ if v := p.Get(); v != 3 {
+ t.Fatalf("got %v; want 3", v)
+ }
+}
+
+// Test that Pool does not hold pointers to previously cached
+// resources.
+func TestPoolGC(t *testing.T) {
+ var p Pool
+ var fin uint32
+ const N = 100
+ for i := 0; i < N; i++ {
+ v := new(string)
+ runtime.SetFinalizer(v, func(vv *string) {
+ atomic.AddUint32(&fin, 1)
+ })
+ p.Put(v)
+ }
+ for i := 0; i < N; i++ {
+ p.Get()
+ }
+ for i := 0; i < 5; i++ {
+ runtime.GC()
+ time.Sleep(time.Duration(i*100+10) * time.Millisecond)
+ // 1 pointer can remain on stack or elsewhere
+ if atomic.LoadUint32(&fin) >= N-1 {
+ return
+ }
+ }
+ t.Fatalf("only %v out of %v resources are finalized",
+ atomic.LoadUint32(&fin), N)
+}
+
+func TestPoolStress(t *testing.T) {
+ const P = 10
+ N := int(1e6)
+ if testing.Short() {
+ N /= 100
+ }
+ var p Pool
+ done := make(chan bool)
+ for i := 0; i < P; i++ {
+ go func() {
+ var v interface{} = 0
+ for j := 0; j < N; j++ {
+ if v == nil {
+ v = 0
+ }
+ p.Put(v)
+ v = p.Get()
+ if v != nil && v.(int) != 0 {
+ t.Fatalf("expect 0, got %v", v)
+ }
+ }
+ done <- true
+ }()
+ }
+ for i := 0; i < P; i++ {
+ <-done
+ }
+}
+
+func BenchmarkPool(b *testing.B) {
+ var p Pool
+ b.RunParallel(func(pb *testing.PB) {
+ for pb.Next() {
+ p.Put(1)
+ p.Get()
+ }
+ })
+}
+
+func BenchmarkPoolOverflow(b *testing.B) {
+ var p Pool
+ b.RunParallel(func(pb *testing.PB) {
+ for pb.Next() {
+ for b := 0; b < 100; b++ {
+ p.Put(1)
+ }
+ for b := 0; b < 100; b++ {
+ p.Get()
+ }
+ }
+ })
+}
diff --git a/src/sync/race.go b/src/sync/race.go
new file mode 100644
index 000000000..fd0277dcc
--- /dev/null
+++ b/src/sync/race.go
@@ -0,0 +1,42 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build race
+
+package sync
+
+import (
+ "runtime"
+ "unsafe"
+)
+
+const raceenabled = true
+
+func raceAcquire(addr unsafe.Pointer) {
+ runtime.RaceAcquire(addr)
+}
+
+func raceRelease(addr unsafe.Pointer) {
+ runtime.RaceRelease(addr)
+}
+
+func raceReleaseMerge(addr unsafe.Pointer) {
+ runtime.RaceReleaseMerge(addr)
+}
+
+func raceDisable() {
+ runtime.RaceDisable()
+}
+
+func raceEnable() {
+ runtime.RaceEnable()
+}
+
+func raceRead(addr unsafe.Pointer) {
+ runtime.RaceRead(addr)
+}
+
+func raceWrite(addr unsafe.Pointer) {
+ runtime.RaceWrite(addr)
+}
diff --git a/src/sync/race0.go b/src/sync/race0.go
new file mode 100644
index 000000000..65ada1c5d
--- /dev/null
+++ b/src/sync/race0.go
@@ -0,0 +1,34 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !race
+
+package sync
+
+import (
+ "unsafe"
+)
+
+const raceenabled = false
+
+func raceAcquire(addr unsafe.Pointer) {
+}
+
+func raceRelease(addr unsafe.Pointer) {
+}
+
+func raceReleaseMerge(addr unsafe.Pointer) {
+}
+
+func raceDisable() {
+}
+
+func raceEnable() {
+}
+
+func raceRead(addr unsafe.Pointer) {
+}
+
+func raceWrite(addr unsafe.Pointer) {
+}
diff --git a/src/sync/runtime.go b/src/sync/runtime.go
new file mode 100644
index 000000000..3b866303a
--- /dev/null
+++ b/src/sync/runtime.go
@@ -0,0 +1,40 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sync
+
+import "unsafe"
+
+// defined in package runtime
+
+// Semacquire waits until *s > 0 and then atomically decrements it.
+// It is intended as a simple sleep primitive for use by the synchronization
+// library and should not be used directly.
+func runtime_Semacquire(s *uint32)
+
+// Semrelease atomically increments *s and notifies a waiting goroutine
+// if one is blocked in Semacquire.
+// It is intended as a simple wakeup primitive for use by the synchronization
+// library and should not be used directly.
+func runtime_Semrelease(s *uint32)
+
+// Approximation of syncSema in runtime/sema.go.
+type syncSema struct {
+ lock uintptr
+ head unsafe.Pointer
+ tail unsafe.Pointer
+}
+
+// Syncsemacquire waits for a pairing Syncsemrelease on the same semaphore s.
+func runtime_Syncsemacquire(s *syncSema)
+
+// Syncsemrelease waits for n pairing Syncsemacquire calls on the same semaphore s.
+func runtime_Syncsemrelease(s *syncSema, n uint32)
+
+// Ensure that sync and runtime agree on size of syncSema.
+func runtime_Syncsemcheck(size uintptr)
+func init() {
+ var s syncSema
+ runtime_Syncsemcheck(unsafe.Sizeof(s))
+}
diff --git a/src/sync/runtime_sema_test.go b/src/sync/runtime_sema_test.go
new file mode 100644
index 000000000..5b7dd3df3
--- /dev/null
+++ b/src/sync/runtime_sema_test.go
@@ -0,0 +1,72 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sync_test
+
+import (
+ "runtime"
+ . "sync"
+ "testing"
+)
+
+func BenchmarkSemaUncontended(b *testing.B) {
+ type PaddedSem struct {
+ sem uint32
+ pad [32]uint32
+ }
+ b.RunParallel(func(pb *testing.PB) {
+ sem := new(PaddedSem)
+ for pb.Next() {
+ Runtime_Semrelease(&sem.sem)
+ Runtime_Semacquire(&sem.sem)
+ }
+ })
+}
+
+func benchmarkSema(b *testing.B, block, work bool) {
+ sem := uint32(0)
+ if block {
+ done := make(chan bool)
+ go func() {
+ for p := 0; p < runtime.GOMAXPROCS(0)/2; p++ {
+ Runtime_Semacquire(&sem)
+ }
+ done <- true
+ }()
+ defer func() {
+ <-done
+ }()
+ }
+ b.RunParallel(func(pb *testing.PB) {
+ foo := 0
+ for pb.Next() {
+ Runtime_Semrelease(&sem)
+ if work {
+ for i := 0; i < 100; i++ {
+ foo *= 2
+ foo /= 2
+ }
+ }
+ Runtime_Semacquire(&sem)
+ }
+ _ = foo
+ Runtime_Semrelease(&sem)
+ })
+}
+
+func BenchmarkSemaSyntNonblock(b *testing.B) {
+ benchmarkSema(b, false, false)
+}
+
+func BenchmarkSemaSyntBlock(b *testing.B) {
+ benchmarkSema(b, true, false)
+}
+
+func BenchmarkSemaWorkNonblock(b *testing.B) {
+ benchmarkSema(b, false, true)
+}
+
+func BenchmarkSemaWorkBlock(b *testing.B) {
+ benchmarkSema(b, true, true)
+}
diff --git a/src/sync/rwmutex.go b/src/sync/rwmutex.go
new file mode 100644
index 000000000..0e8a58e5f
--- /dev/null
+++ b/src/sync/rwmutex.go
@@ -0,0 +1,136 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sync
+
+import (
+ "sync/atomic"
+ "unsafe"
+)
+
+// An RWMutex is a reader/writer mutual exclusion lock.
+// The lock can be held by an arbitrary number of readers
+// or a single writer.
+// RWMutexes can be created as part of other
+// structures; the zero value for a RWMutex is
+// an unlocked mutex.
+type RWMutex struct {
+ w Mutex // held if there are pending writers
+ writerSem uint32 // semaphore for writers to wait for completing readers
+ readerSem uint32 // semaphore for readers to wait for completing writers
+ readerCount int32 // number of pending readers
+ readerWait int32 // number of departing readers
+}
+
+const rwmutexMaxReaders = 1 << 30
+
+// RLock locks rw for reading.
+func (rw *RWMutex) RLock() {
+ if raceenabled {
+ _ = rw.w.state
+ raceDisable()
+ }
+ if atomic.AddInt32(&rw.readerCount, 1) < 0 {
+ // A writer is pending, wait for it.
+ runtime_Semacquire(&rw.readerSem)
+ }
+ if raceenabled {
+ raceEnable()
+ raceAcquire(unsafe.Pointer(&rw.readerSem))
+ }
+}
+
+// RUnlock undoes a single RLock call;
+// it does not affect other simultaneous readers.
+// It is a run-time error if rw is not locked for reading
+// on entry to RUnlock.
+func (rw *RWMutex) RUnlock() {
+ if raceenabled {
+ _ = rw.w.state
+ raceReleaseMerge(unsafe.Pointer(&rw.writerSem))
+ raceDisable()
+ }
+ if r := atomic.AddInt32(&rw.readerCount, -1); r < 0 {
+ if r+1 == 0 || r+1 == -rwmutexMaxReaders {
+ raceEnable()
+ panic("sync: RUnlock of unlocked RWMutex")
+ }
+ // A writer is pending.
+ if atomic.AddInt32(&rw.readerWait, -1) == 0 {
+ // The last reader unblocks the writer.
+ runtime_Semrelease(&rw.writerSem)
+ }
+ }
+ if raceenabled {
+ raceEnable()
+ }
+}
+
+// Lock locks rw for writing.
+// If the lock is already locked for reading or writing,
+// Lock blocks until the lock is available.
+// To ensure that the lock eventually becomes available,
+// a blocked Lock call excludes new readers from acquiring
+// the lock.
+func (rw *RWMutex) Lock() {
+ if raceenabled {
+ _ = rw.w.state
+ raceDisable()
+ }
+ // First, resolve competition with other writers.
+ rw.w.Lock()
+ // Announce to readers there is a pending writer.
+ r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders
+ // Wait for active readers.
+ if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 {
+ runtime_Semacquire(&rw.writerSem)
+ }
+ if raceenabled {
+ raceEnable()
+ raceAcquire(unsafe.Pointer(&rw.readerSem))
+ raceAcquire(unsafe.Pointer(&rw.writerSem))
+ }
+}
+
+// Unlock unlocks rw for writing. It is a run-time error if rw is
+// not locked for writing on entry to Unlock.
+//
+// As with Mutexes, a locked RWMutex is not associated with a particular
+// goroutine. One goroutine may RLock (Lock) an RWMutex and then
+// arrange for another goroutine to RUnlock (Unlock) it.
+func (rw *RWMutex) Unlock() {
+ if raceenabled {
+ _ = rw.w.state
+ raceRelease(unsafe.Pointer(&rw.readerSem))
+ raceRelease(unsafe.Pointer(&rw.writerSem))
+ raceDisable()
+ }
+
+ // Announce to readers there is no active writer.
+ r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders)
+ if r >= rwmutexMaxReaders {
+ raceEnable()
+ panic("sync: Unlock of unlocked RWMutex")
+ }
+ // Unblock blocked readers, if any.
+ for i := 0; i < int(r); i++ {
+ runtime_Semrelease(&rw.readerSem)
+ }
+ // Allow other writers to proceed.
+ rw.w.Unlock()
+ if raceenabled {
+ raceEnable()
+ }
+}
+
+// RLocker returns a Locker interface that implements
+// the Lock and Unlock methods by calling rw.RLock and rw.RUnlock.
+func (rw *RWMutex) RLocker() Locker {
+ return (*rlocker)(rw)
+}
+
+type rlocker RWMutex
+
+func (r *rlocker) Lock() { (*RWMutex)(r).RLock() }
+func (r *rlocker) Unlock() { (*RWMutex)(r).RUnlock() }
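Lock announces a pending writer by subtracting rwmutexMaxReaders from readerCount, driving it negative so that later RLock calls block on readerSem, and Unlock adds the constant back and releases exactly the readers that queued up in the meantime. A minimal usage sketch of the read-mostly pattern this lock is designed for, assuming an illustrative cache type that is not part of this patch:

package main

import (
	"fmt"
	"sync"
)

// cache is an illustrative read-mostly map guarded by an RWMutex.
type cache struct {
	mu sync.RWMutex
	m  map[string]string
}

func (c *cache) get(k string) (string, bool) {
	c.mu.RLock() // any number of readers may hold this concurrently
	defer c.mu.RUnlock()
	v, ok := c.m[k]
	return v, ok
}

func (c *cache) set(k, v string) {
	c.mu.Lock() // writers get exclusive access
	defer c.mu.Unlock()
	c.m[k] = v
}

func main() {
	c := &cache{m: make(map[string]string)}
	c.set("go", "gopher")
	if v, ok := c.get("go"); ok {
		fmt.Println(v)
	}
}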
diff --git a/src/sync/rwmutex_test.go b/src/sync/rwmutex_test.go
new file mode 100644
index 000000000..f625bc3a5
--- /dev/null
+++ b/src/sync/rwmutex_test.go
@@ -0,0 +1,254 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// GOMAXPROCS=10 go test
+
+package sync_test
+
+import (
+ "fmt"
+ "runtime"
+ . "sync"
+ "sync/atomic"
+ "testing"
+)
+
+func parallelReader(m *RWMutex, clocked, cunlock, cdone chan bool) {
+ m.RLock()
+ clocked <- true
+ <-cunlock
+ m.RUnlock()
+ cdone <- true
+}
+
+func doTestParallelReaders(numReaders, gomaxprocs int) {
+ runtime.GOMAXPROCS(gomaxprocs)
+ var m RWMutex
+ clocked := make(chan bool)
+ cunlock := make(chan bool)
+ cdone := make(chan bool)
+ for i := 0; i < numReaders; i++ {
+ go parallelReader(&m, clocked, cunlock, cdone)
+ }
+ // Wait for all parallel RLock()s to succeed.
+ for i := 0; i < numReaders; i++ {
+ <-clocked
+ }
+ for i := 0; i < numReaders; i++ {
+ cunlock <- true
+ }
+ // Wait for the goroutines to finish.
+ for i := 0; i < numReaders; i++ {
+ <-cdone
+ }
+}
+
+func TestParallelReaders(t *testing.T) {
+ defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1))
+ doTestParallelReaders(1, 4)
+ doTestParallelReaders(3, 4)
+ doTestParallelReaders(4, 2)
+}
+
+func reader(rwm *RWMutex, num_iterations int, activity *int32, cdone chan bool) {
+ for i := 0; i < num_iterations; i++ {
+ rwm.RLock()
+ n := atomic.AddInt32(activity, 1)
+ if n < 1 || n >= 10000 {
+ panic(fmt.Sprintf("wlock(%d)\n", n))
+ }
+ for i := 0; i < 100; i++ {
+ }
+ atomic.AddInt32(activity, -1)
+ rwm.RUnlock()
+ }
+ cdone <- true
+}
+
+func writer(rwm *RWMutex, num_iterations int, activity *int32, cdone chan bool) {
+ for i := 0; i < num_iterations; i++ {
+ rwm.Lock()
+ n := atomic.AddInt32(activity, 10000)
+ if n != 10000 {
+ panic(fmt.Sprintf("wlock(%d)\n", n))
+ }
+ for i := 0; i < 100; i++ {
+ }
+ atomic.AddInt32(activity, -10000)
+ rwm.Unlock()
+ }
+ cdone <- true
+}
+
+func HammerRWMutex(gomaxprocs, numReaders, num_iterations int) {
+ runtime.GOMAXPROCS(gomaxprocs)
+ // Number of active readers + 10000 * number of active writers.
+ var activity int32
+ var rwm RWMutex
+ cdone := make(chan bool)
+ go writer(&rwm, num_iterations, &activity, cdone)
+ var i int
+ for i = 0; i < numReaders/2; i++ {
+ go reader(&rwm, num_iterations, &activity, cdone)
+ }
+ go writer(&rwm, num_iterations, &activity, cdone)
+ for ; i < numReaders; i++ {
+ go reader(&rwm, num_iterations, &activity, cdone)
+ }
+ // Wait for the 2 writers and all readers to finish.
+ for i := 0; i < 2+numReaders; i++ {
+ <-cdone
+ }
+}
+
+func TestRWMutex(t *testing.T) {
+ defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1))
+ n := 1000
+ if testing.Short() {
+ n = 5
+ }
+ HammerRWMutex(1, 1, n)
+ HammerRWMutex(1, 3, n)
+ HammerRWMutex(1, 10, n)
+ HammerRWMutex(4, 1, n)
+ HammerRWMutex(4, 3, n)
+ HammerRWMutex(4, 10, n)
+ HammerRWMutex(10, 1, n)
+ HammerRWMutex(10, 3, n)
+ HammerRWMutex(10, 10, n)
+ HammerRWMutex(10, 5, n)
+}
+
+func TestRLocker(t *testing.T) {
+ var wl RWMutex
+ var rl Locker
+ wlocked := make(chan bool, 1)
+ rlocked := make(chan bool, 1)
+ rl = wl.RLocker()
+ n := 10
+ go func() {
+ for i := 0; i < n; i++ {
+ rl.Lock()
+ rl.Lock()
+ rlocked <- true
+ wl.Lock()
+ wlocked <- true
+ }
+ }()
+ for i := 0; i < n; i++ {
+ <-rlocked
+ rl.Unlock()
+ select {
+ case <-wlocked:
+ t.Fatal("RLocker() didn't read-lock it")
+ default:
+ }
+ rl.Unlock()
+ <-wlocked
+ select {
+ case <-rlocked:
+ t.Fatal("RLocker() didn't respect the write lock")
+ default:
+ }
+ wl.Unlock()
+ }
+}
+
+func TestUnlockPanic(t *testing.T) {
+ defer func() {
+ if recover() == nil {
+ t.Fatalf("unlock of unlocked RWMutex did not panic")
+ }
+ }()
+ var mu RWMutex
+ mu.Unlock()
+}
+
+func TestUnlockPanic2(t *testing.T) {
+ defer func() {
+ if recover() == nil {
+ t.Fatalf("unlock of unlocked RWMutex did not panic")
+ }
+ }()
+ var mu RWMutex
+ mu.RLock()
+ mu.Unlock()
+}
+
+func TestRUnlockPanic(t *testing.T) {
+ defer func() {
+ if recover() == nil {
+ t.Fatalf("read unlock of unlocked RWMutex did not panic")
+ }
+ }()
+ var mu RWMutex
+ mu.RUnlock()
+}
+
+func TestRUnlockPanic2(t *testing.T) {
+ defer func() {
+ if recover() == nil {
+ t.Fatalf("read unlock of unlocked RWMutex did not panic")
+ }
+ }()
+ var mu RWMutex
+ mu.Lock()
+ mu.RUnlock()
+}
+
+func BenchmarkRWMutexUncontended(b *testing.B) {
+ type PaddedRWMutex struct {
+ RWMutex
+ pad [32]uint32
+ }
+ b.RunParallel(func(pb *testing.PB) {
+ var rwm PaddedRWMutex
+ for pb.Next() {
+ rwm.RLock()
+ rwm.RLock()
+ rwm.RUnlock()
+ rwm.RUnlock()
+ rwm.Lock()
+ rwm.Unlock()
+ }
+ })
+}
+
+func benchmarkRWMutex(b *testing.B, localWork, writeRatio int) {
+ var rwm RWMutex
+ b.RunParallel(func(pb *testing.PB) {
+ foo := 0
+ for pb.Next() {
+ foo++
+ if foo%writeRatio == 0 {
+ rwm.Lock()
+ rwm.Unlock()
+ } else {
+ rwm.RLock()
+ for i := 0; i < localWork; i++ {
+ foo *= 2
+ foo /= 2
+ }
+ rwm.RUnlock()
+ }
+ }
+ _ = foo
+ })
+}
+
+func BenchmarkRWMutexWrite100(b *testing.B) {
+ benchmarkRWMutex(b, 0, 100)
+}
+
+func BenchmarkRWMutexWrite10(b *testing.B) {
+ benchmarkRWMutex(b, 0, 10)
+}
+
+func BenchmarkRWMutexWorkWrite100(b *testing.B) {
+ benchmarkRWMutex(b, 100, 100)
+}
+
+func BenchmarkRWMutexWorkWrite10(b *testing.B) {
+ benchmarkRWMutex(b, 100, 10)
+}
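TestRLocker exercises the Locker view returned by RLocker, which lets code written against sync.Locker take only the read side of the lock. A short sketch, assuming a hypothetical withLock helper that is not part of this patch:

package main

import (
	"fmt"
	"sync"
)

// withLock is a hypothetical helper that only knows about sync.Locker.
func withLock(l sync.Locker, f func()) {
	l.Lock()
	defer l.Unlock()
	f()
}

func main() {
	var rw sync.RWMutex
	data := []int{1, 2, 3}

	// Readers go through rw.RLocker(), so they can run concurrently.
	withLock(rw.RLocker(), func() {
		sum := 0
		for _, v := range data {
			sum += v
		}
		fmt.Println("sum:", sum)
	})

	// Writers use the RWMutex itself for exclusive access.
	withLock(&rw, func() { data = append(data, 4) })
}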
diff --git a/src/sync/waitgroup.go b/src/sync/waitgroup.go
new file mode 100644
index 000000000..92cc57d2c
--- /dev/null
+++ b/src/sync/waitgroup.go
@@ -0,0 +1,137 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sync
+
+import (
+ "sync/atomic"
+ "unsafe"
+)
+
+// A WaitGroup waits for a collection of goroutines to finish.
+// The main goroutine calls Add to set the number of
+// goroutines to wait for. Then each of the goroutines
+// runs and calls Done when finished. At the same time,
+// Wait can be used to block until all goroutines have finished.
+type WaitGroup struct {
+ m Mutex
+ counter int32
+ waiters int32
+ sema *uint32
+}
+
+// WaitGroup creates a new semaphore each time the old semaphore
+// is released. This is to avoid the following race:
+//
+// G1: Add(1)
+// G1: go G2()
+// G1: Wait() // Context switch after Unlock() and before Semacquire().
+// G2: Done() // Release semaphore: sema == 1, waiters == 0. G1 doesn't run yet.
+// G3: Wait() // Finds counter == 0, waiters == 0, doesn't block.
+// G3: Add(1) // Makes counter == 1, waiters == 0.
+// G3: go G4()
+// G3: Wait() // G1 still hasn't run, G3 finds sema == 1, unblocked! Bug.
+
+// Add adds delta, which may be negative, to the WaitGroup counter.
+// If the counter becomes zero, all goroutines blocked on Wait are released.
+// If the counter goes negative, Add panics.
+//
+// Note that calls with a positive delta that occur when the counter is zero
+// must happen before a Wait. Calls with a negative delta, or calls with a
+// positive delta that start when the counter is greater than zero, may happen
+// at any time.
+// Typically this means the calls to Add should execute before the statement
+// creating the goroutine or other event to be waited for.
+// See the WaitGroup example.
+func (wg *WaitGroup) Add(delta int) {
+ if raceenabled {
+ _ = wg.m.state // trigger nil deref early
+ if delta < 0 {
+ // Synchronize decrements with Wait.
+ raceReleaseMerge(unsafe.Pointer(wg))
+ }
+ raceDisable()
+ defer raceEnable()
+ }
+ v := atomic.AddInt32(&wg.counter, int32(delta))
+ if raceenabled {
+ if delta > 0 && v == int32(delta) {
+ // The first increment must be synchronized with Wait.
+ // Need to model this as a read, because there can be
+ // several concurrent wg.counter transitions from 0.
+ raceRead(unsafe.Pointer(&wg.sema))
+ }
+ }
+ if v < 0 {
+ panic("sync: negative WaitGroup counter")
+ }
+ if v > 0 || atomic.LoadInt32(&wg.waiters) == 0 {
+ return
+ }
+ wg.m.Lock()
+ if atomic.LoadInt32(&wg.counter) == 0 {
+ for i := int32(0); i < wg.waiters; i++ {
+ runtime_Semrelease(wg.sema)
+ }
+ wg.waiters = 0
+ wg.sema = nil
+ }
+ wg.m.Unlock()
+}
+
+// Done decrements the WaitGroup counter.
+func (wg *WaitGroup) Done() {
+ wg.Add(-1)
+}
+
+// Wait blocks until the WaitGroup counter is zero.
+func (wg *WaitGroup) Wait() {
+ if raceenabled {
+ _ = wg.m.state // trigger nil deref early
+ raceDisable()
+ }
+ if atomic.LoadInt32(&wg.counter) == 0 {
+ if raceenabled {
+ raceEnable()
+ raceAcquire(unsafe.Pointer(wg))
+ }
+ return
+ }
+ wg.m.Lock()
+ w := atomic.AddInt32(&wg.waiters, 1)
+ // This code is racing with the unlocked path in Add above.
+ // The code above modifies counter and then reads waiters.
+ // We must modify waiters and then read counter (the opposite order)
+ // to avoid missing an Add.
+ if atomic.LoadInt32(&wg.counter) == 0 {
+ atomic.AddInt32(&wg.waiters, -1)
+ if raceenabled {
+ raceEnable()
+ raceAcquire(unsafe.Pointer(wg))
+ raceDisable()
+ }
+ wg.m.Unlock()
+ if raceenabled {
+ raceEnable()
+ }
+ return
+ }
+ if raceenabled && w == 1 {
+ // Wait must be synchronized with the first Add.
+ // Need to model this as a write to race with the read in Add.
+ // As a consequence, can do the write only for the first waiter,
+ // otherwise concurrent Waits will race with each other.
+ raceWrite(unsafe.Pointer(&wg.sema))
+ }
+ if wg.sema == nil {
+ wg.sema = new(uint32)
+ }
+ s := wg.sema
+ wg.m.Unlock()
+ runtime_Semacquire(s)
+ if raceenabled {
+ raceEnable()
+ raceAcquire(unsafe.Pointer(wg))
+ }
+}
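A minimal sketch of the Add/Done/Wait pattern the doc comment above describes, following its rule that Add runs before the goroutine it accounts for is started; the worker bodies are illustrative only:

package main

import (
	"fmt"
	"sync"
)

func main() {
	var wg sync.WaitGroup
	for i := 0; i < 3; i++ {
		wg.Add(1) // Add before starting the goroutine, as required above
		go func(i int) {
			defer wg.Done()
			fmt.Println("worker", i, "finished")
		}(i)
	}
	wg.Wait() // returns only after all three Done calls have happened
}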
diff --git a/src/sync/waitgroup_test.go b/src/sync/waitgroup_test.go
new file mode 100644
index 000000000..4c0a043c0
--- /dev/null
+++ b/src/sync/waitgroup_test.go
@@ -0,0 +1,148 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sync_test
+
+import (
+ . "sync"
+ "sync/atomic"
+ "testing"
+)
+
+func testWaitGroup(t *testing.T, wg1 *WaitGroup, wg2 *WaitGroup) {
+ n := 16
+ wg1.Add(n)
+ wg2.Add(n)
+ exited := make(chan bool, n)
+ for i := 0; i != n; i++ {
+ go func(i int) {
+ wg1.Done()
+ wg2.Wait()
+ exited <- true
+ }(i)
+ }
+ wg1.Wait()
+ for i := 0; i != n; i++ {
+ select {
+ case <-exited:
+ t.Fatal("WaitGroup released group too soon")
+ default:
+ }
+ wg2.Done()
+ }
+ for i := 0; i != n; i++ {
+ <-exited // Will block if barrier fails to unlock someone.
+ }
+}
+
+func TestWaitGroup(t *testing.T) {
+ wg1 := &WaitGroup{}
+ wg2 := &WaitGroup{}
+
+ // Run the same test a few times to ensure barrier is in a proper state.
+ for i := 0; i != 8; i++ {
+ testWaitGroup(t, wg1, wg2)
+ }
+}
+
+func TestWaitGroupMisuse(t *testing.T) {
+ defer func() {
+ err := recover()
+ if err != "sync: negative WaitGroup counter" {
+ t.Fatalf("Unexpected panic: %#v", err)
+ }
+ }()
+ wg := &WaitGroup{}
+ wg.Add(1)
+ wg.Done()
+ wg.Done()
+ t.Fatal("Should panic")
+}
+
+func TestWaitGroupRace(t *testing.T) {
+ // Run this test for about 1ms.
+ for i := 0; i < 1000; i++ {
+ wg := &WaitGroup{}
+ n := new(int32)
+ // spawn goroutine 1
+ wg.Add(1)
+ go func() {
+ atomic.AddInt32(n, 1)
+ wg.Done()
+ }()
+ // spawn goroutine 2
+ wg.Add(1)
+ go func() {
+ atomic.AddInt32(n, 1)
+ wg.Done()
+ }()
+ // Wait for goroutine 1 and 2
+ wg.Wait()
+ if atomic.LoadInt32(n) != 2 {
+ t.Fatal("Spurious wakeup from Wait")
+ }
+ }
+}
+
+func BenchmarkWaitGroupUncontended(b *testing.B) {
+ type PaddedWaitGroup struct {
+ WaitGroup
+ pad [128]uint8
+ }
+ b.RunParallel(func(pb *testing.PB) {
+ var wg PaddedWaitGroup
+ for pb.Next() {
+ wg.Add(1)
+ wg.Done()
+ wg.Wait()
+ }
+ })
+}
+
+func benchmarkWaitGroupAddDone(b *testing.B, localWork int) {
+ var wg WaitGroup
+ b.RunParallel(func(pb *testing.PB) {
+ foo := 0
+ for pb.Next() {
+ wg.Add(1)
+ for i := 0; i < localWork; i++ {
+ foo *= 2
+ foo /= 2
+ }
+ wg.Done()
+ }
+ _ = foo
+ })
+}
+
+func BenchmarkWaitGroupAddDone(b *testing.B) {
+ benchmarkWaitGroupAddDone(b, 0)
+}
+
+func BenchmarkWaitGroupAddDoneWork(b *testing.B) {
+ benchmarkWaitGroupAddDone(b, 100)
+}
+
+func benchmarkWaitGroupWait(b *testing.B, localWork int) {
+ var wg WaitGroup
+ b.RunParallel(func(pb *testing.PB) {
+ foo := 0
+ for pb.Next() {
+ wg.Wait()
+ for i := 0; i < localWork; i++ {
+ foo *= 2
+ foo /= 2
+ }
+ }
+ _ = foo
+ })
+}
+
+func BenchmarkWaitGroupWait(b *testing.B) {
+ benchmarkWaitGroupWait(b, 0)
+}
+
+func BenchmarkWaitGroupWaitWork(b *testing.B) {
+ benchmarkWaitGroupWait(b, 100)
+}
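All of the parallel benchmarks in this change share the same harness: b.RunParallel fans b.N iterations out to worker goroutines, each of which pulls work through testing.PB until the count is exhausted. A stripped-down sketch of that harness applied to a plain Mutex; BenchmarkMutexSketch is illustrative and not part of this patch:

package sync_test

import (
	"sync"
	"testing"
)

// BenchmarkMutexSketch mirrors the structure of the benchmarks above:
// every worker loops on pb.Next() and does one locked operation per
// iteration.
func BenchmarkMutexSketch(b *testing.B) {
	var mu sync.Mutex
	counter := 0
	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			mu.Lock()
			counter++
			mu.Unlock()
		}
	})
	_ = counter
}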