path: root/src/pkg/crypto
author    Dave Cheney <dave@cheney.net>  2014-02-25 09:47:42 -0500
committer Dave Cheney <dave@cheney.net>  2014-02-25 09:47:42 -0500
commit    0e79f07290bb48f4188338b63faeb771a76e647a (patch)
tree      d5f6df7e130b20c5907de69e07dfafa943434fa8 /src/pkg/crypto
parent    52dde77c1b85ecfae27ac82645c1ea5ec80db930 (diff)
download  go-0e79f07290bb48f4188338b63faeb771a76e647a.tar.gz
all: merge NaCl branch (part 1)
See golang.org/s/go13nacl for design overview.

This CL is the mostly mechanical changes from rsc's Go 1.2 based NaCl branch,
specifically 39cb35750369 to 500771b477cf from
https://code.google.com/r/rsc-go13nacl. This CL does not include working NaCl
support, there are probably two or three more large merges to come. CL 15750044
is not included as it involves more invasive changes to the linker which will
need to be merged separately.

The exact change lists included are

15050047: syscall: support for Native Client
15360044: syscall: unzip implementation for Native Client
15370044: syscall: Native Client SRPC implementation
15400047: cmd/dist, cmd/go, go/build, test: support for Native Client
15410048: runtime: support for Native Client
15410049: syscall: file descriptor table for Native Client
15410050: syscall: in-memory file system for Native Client
15440048: all: update +build lines for Native Client port
15540045: cmd/6g, cmd/8g, cmd/gc: support for Native Client
15570045: os: support for Native Client
15680044: crypto/..., hash/crc32, reflect, sync/atomic: support for amd64p32
15690044: net: support for Native Client
15690048: runtime: support for fake time like on Go Playground
15690051: build: disable various tests on Native Client

LGTM=rsc
R=rsc
CC=golang-codereviews
https://codereview.appspot.com/68150047

Committer: Russ Cox <rsc@golang.org>
Diffstat (limited to 'src/pkg/crypto')
-rw-r--r--  src/pkg/crypto/md5/md5block_amd64p32.s    184
-rw-r--r--  src/pkg/crypto/md5/md5block_decl.go          2
-rw-r--r--  src/pkg/crypto/rand/rand_unix.go             2
-rw-r--r--  src/pkg/crypto/rc4/rc4_amd64p32.s          192
-rw-r--r--  src/pkg/crypto/rc4/rc4_asm.go                2
-rw-r--r--  src/pkg/crypto/rc4/rc4_ref.go                2
-rw-r--r--  src/pkg/crypto/sha1/sha1block_amd64p32.s   216
-rw-r--r--  src/pkg/crypto/sha1/sha1block_decl.go        2
-rw-r--r--  src/pkg/crypto/x509/root_unix.go             2
9 files changed, 598 insertions, 6 deletions
diff --git a/src/pkg/crypto/md5/md5block_amd64p32.s b/src/pkg/crypto/md5/md5block_amd64p32.s
new file mode 100644
index 000000000..a78a3f610
--- /dev/null
+++ b/src/pkg/crypto/md5/md5block_amd64p32.s
@@ -0,0 +1,184 @@
+// Original source:
+// http://www.zorinaq.com/papers/md5-amd64.html
+// http://www.zorinaq.com/papers/md5-amd64.tar.bz2
+//
+// Translated from Perl generating GNU assembly into
+// #defines generating 6a assembly by the Go Authors.
+//
+// Restrictions to make code safe for Native Client:
+// replace BP with R11, reloaded before use at return.
+// replace R15 with R11.
+
+#include "../../../cmd/ld/textflag.h"
+
+// MD5 optimized for AMD64.
+//
+// Author: Marc Bevand <bevand_m (at) epita.fr>
+// Licence: I hereby disclaim the copyright on this code and place it
+// in the public domain.
+
+TEXT ·block(SB),NOSPLIT,$0-32
+ MOVL dig+0(FP), R11
+ MOVL p+4(FP), SI
+ MOVL p_len+8(FP), DX
+ SHRQ $6, DX
+ SHLQ $6, DX
+
+ LEAQ (SI)(DX*1), DI
+ MOVL (0*4)(R11), AX
+ MOVL (1*4)(R11), BX
+ MOVL (2*4)(R11), CX
+ MOVL (3*4)(R11), DX
+
+ CMPQ SI, DI
+ JEQ end
+
+loop:
+ MOVL AX, R12
+ MOVL BX, R13
+ MOVL CX, R14
+ MOVL DX, R11
+
+ MOVL (0*4)(SI), R8
+ MOVL DX, R9
+
+#define ROUND1(a, b, c, d, index, const, shift) \
+ XORL c, R9; \
+ LEAL const(a)(R8*1), a; \
+ ANDL b, R9; \
+ XORL d, R9; \
+ MOVL (index*4)(SI), R8; \
+ ADDL R9, a; \
+ ROLL $shift, a; \
+ MOVL c, R9; \
+ ADDL b, a
+
+ ROUND1(AX,BX,CX,DX, 1,0xd76aa478, 7);
+ ROUND1(DX,AX,BX,CX, 2,0xe8c7b756,12);
+ ROUND1(CX,DX,AX,BX, 3,0x242070db,17);
+ ROUND1(BX,CX,DX,AX, 4,0xc1bdceee,22);
+ ROUND1(AX,BX,CX,DX, 5,0xf57c0faf, 7);
+ ROUND1(DX,AX,BX,CX, 6,0x4787c62a,12);
+ ROUND1(CX,DX,AX,BX, 7,0xa8304613,17);
+ ROUND1(BX,CX,DX,AX, 8,0xfd469501,22);
+ ROUND1(AX,BX,CX,DX, 9,0x698098d8, 7);
+ ROUND1(DX,AX,BX,CX,10,0x8b44f7af,12);
+ ROUND1(CX,DX,AX,BX,11,0xffff5bb1,17);
+ ROUND1(BX,CX,DX,AX,12,0x895cd7be,22);
+ ROUND1(AX,BX,CX,DX,13,0x6b901122, 7);
+ ROUND1(DX,AX,BX,CX,14,0xfd987193,12);
+ ROUND1(CX,DX,AX,BX,15,0xa679438e,17);
+ ROUND1(BX,CX,DX,AX, 0,0x49b40821,22);
+
+ MOVL (1*4)(SI), R8
+ MOVL DX, R9
+ MOVL DX, R10
+
+#define ROUND2(a, b, c, d, index, const, shift) \
+ NOTL R9; \
+ LEAL const(a)(R8*1),a; \
+ ANDL b, R10; \
+ ANDL c, R9; \
+ MOVL (index*4)(SI),R8; \
+ ORL R9, R10; \
+ MOVL c, R9; \
+ ADDL R10, a; \
+ MOVL c, R10; \
+ ROLL $shift, a; \
+ ADDL b, a
+
+ ROUND2(AX,BX,CX,DX, 6,0xf61e2562, 5);
+ ROUND2(DX,AX,BX,CX,11,0xc040b340, 9);
+ ROUND2(CX,DX,AX,BX, 0,0x265e5a51,14);
+ ROUND2(BX,CX,DX,AX, 5,0xe9b6c7aa,20);
+ ROUND2(AX,BX,CX,DX,10,0xd62f105d, 5);
+ ROUND2(DX,AX,BX,CX,15, 0x2441453, 9);
+ ROUND2(CX,DX,AX,BX, 4,0xd8a1e681,14);
+ ROUND2(BX,CX,DX,AX, 9,0xe7d3fbc8,20);
+ ROUND2(AX,BX,CX,DX,14,0x21e1cde6, 5);
+ ROUND2(DX,AX,BX,CX, 3,0xc33707d6, 9);
+ ROUND2(CX,DX,AX,BX, 8,0xf4d50d87,14);
+ ROUND2(BX,CX,DX,AX,13,0x455a14ed,20);
+ ROUND2(AX,BX,CX,DX, 2,0xa9e3e905, 5);
+ ROUND2(DX,AX,BX,CX, 7,0xfcefa3f8, 9);
+ ROUND2(CX,DX,AX,BX,12,0x676f02d9,14);
+ ROUND2(BX,CX,DX,AX, 0,0x8d2a4c8a,20);
+
+ MOVL (5*4)(SI), R8
+ MOVL CX, R9
+
+#define ROUND3(a, b, c, d, index, const, shift) \
+ LEAL const(a)(R8*1),a; \
+ MOVL (index*4)(SI),R8; \
+ XORL d, R9; \
+ XORL b, R9; \
+ ADDL R9, a; \
+ ROLL $shift, a; \
+ MOVL b, R9; \
+ ADDL b, a
+
+ ROUND3(AX,BX,CX,DX, 8,0xfffa3942, 4);
+ ROUND3(DX,AX,BX,CX,11,0x8771f681,11);
+ ROUND3(CX,DX,AX,BX,14,0x6d9d6122,16);
+ ROUND3(BX,CX,DX,AX, 1,0xfde5380c,23);
+ ROUND3(AX,BX,CX,DX, 4,0xa4beea44, 4);
+ ROUND3(DX,AX,BX,CX, 7,0x4bdecfa9,11);
+ ROUND3(CX,DX,AX,BX,10,0xf6bb4b60,16);
+ ROUND3(BX,CX,DX,AX,13,0xbebfbc70,23);
+ ROUND3(AX,BX,CX,DX, 0,0x289b7ec6, 4);
+ ROUND3(DX,AX,BX,CX, 3,0xeaa127fa,11);
+ ROUND3(CX,DX,AX,BX, 6,0xd4ef3085,16);
+ ROUND3(BX,CX,DX,AX, 9, 0x4881d05,23);
+ ROUND3(AX,BX,CX,DX,12,0xd9d4d039, 4);
+ ROUND3(DX,AX,BX,CX,15,0xe6db99e5,11);
+ ROUND3(CX,DX,AX,BX, 2,0x1fa27cf8,16);
+ ROUND3(BX,CX,DX,AX, 0,0xc4ac5665,23);
+
+ MOVL (0*4)(SI), R8
+ MOVL $0xffffffff, R9
+ XORL DX, R9
+
+#define ROUND4(a, b, c, d, index, const, shift) \
+ LEAL const(a)(R8*1),a; \
+ ORL b, R9; \
+ XORL c, R9; \
+ ADDL R9, a; \
+ MOVL (index*4)(SI),R8; \
+ MOVL $0xffffffff, R9; \
+ ROLL $shift, a; \
+ XORL c, R9; \
+ ADDL b, a
+
+ ROUND4(AX,BX,CX,DX, 7,0xf4292244, 6);
+ ROUND4(DX,AX,BX,CX,14,0x432aff97,10);
+ ROUND4(CX,DX,AX,BX, 5,0xab9423a7,15);
+ ROUND4(BX,CX,DX,AX,12,0xfc93a039,21);
+ ROUND4(AX,BX,CX,DX, 3,0x655b59c3, 6);
+ ROUND4(DX,AX,BX,CX,10,0x8f0ccc92,10);
+ ROUND4(CX,DX,AX,BX, 1,0xffeff47d,15);
+ ROUND4(BX,CX,DX,AX, 8,0x85845dd1,21);
+ ROUND4(AX,BX,CX,DX,15,0x6fa87e4f, 6);
+ ROUND4(DX,AX,BX,CX, 6,0xfe2ce6e0,10);
+ ROUND4(CX,DX,AX,BX,13,0xa3014314,15);
+ ROUND4(BX,CX,DX,AX, 4,0x4e0811a1,21);
+ ROUND4(AX,BX,CX,DX,11,0xf7537e82, 6);
+ ROUND4(DX,AX,BX,CX, 2,0xbd3af235,10);
+ ROUND4(CX,DX,AX,BX, 9,0x2ad7d2bb,15);
+ ROUND4(BX,CX,DX,AX, 0,0xeb86d391,21);
+
+ ADDL R12, AX
+ ADDL R13, BX
+ ADDL R14, CX
+ ADDL R11, DX
+
+ ADDQ $64, SI
+ CMPQ SI, DI
+ JB loop
+
+end:
+ MOVL dig+0(FP), R11
+ MOVL AX, (0*4)(R11)
+ MOVL BX, (1*4)(R11)
+ MOVL CX, (2*4)(R11)
+ MOVL DX, (3*4)(R11)
+ RET
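
The file above is the amd64p32 translation of md5block_amd64.s: pointers are 32 bits, so dig and p are loaded with MOVL from 4-byte frame offsets, while the length arithmetic still uses 64-bit SHRQ/SHLQ. Each ROUNDn macro performs the standard MD5 step from RFC 1321. The following pure-Go fragment is a minimal sketch of that step, not part of this CL; the constant 0xd76aa478 and shift 7 are taken from the first ROUND1 invocation, and the file name, main function, and zero message word are illustrative assumptions.

// md5step.go: a minimal pure-Go sketch of the step each ROUND1..ROUND4
// macro above performs: a = b + rol(a + f(b,c,d) + m + k, s).
// Not part of this CL; constants follow RFC 1321.
package main

import (
	"fmt"
	"math/bits"
)

func f1(b, c, d uint32) uint32 { return d ^ (b & (c ^ d)) } // ROUND1: (b&c)|(^b&d)
func f2(b, c, d uint32) uint32 { return c ^ (d & (b ^ c)) } // ROUND2: (b&d)|(c&^d)
func f3(b, c, d uint32) uint32 { return b ^ c ^ d }         // ROUND3: b^c^d
func f4(b, c, d uint32) uint32 { return c ^ (b | ^d) }      // ROUND4: c^(b|^d)

// step is one MD5 step; the assembly keeps a..d in AX..DX and m in R8.
func step(f func(b, c, d uint32) uint32, a, b, c, d, m, k uint32, s int) uint32 {
	return b + bits.RotateLeft32(a+f(b, c, d)+m+k, s)
}

func main() {
	// MD5 initial state words, mixed with a zero message word using the
	// first round's constant 0xd76aa478 and shift 7 (see ROUND1 above).
	a, b, c, d := uint32(0x67452301), uint32(0xefcdab89), uint32(0x98badcfe), uint32(0x10325476)
	fmt.Printf("a = %#08x (b, c, d unchanged: %#08x %#08x %#08x)\n",
		step(f1, a, b, c, d, 0, 0xd76aa478, 7), b, c, d)
}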
diff --git a/src/pkg/crypto/md5/md5block_decl.go b/src/pkg/crypto/md5/md5block_decl.go
index c4d6aaaf0..d7956a6d2 100644
--- a/src/pkg/crypto/md5/md5block_decl.go
+++ b/src/pkg/crypto/md5/md5block_decl.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build amd64 386 arm
+// +build amd64 amd64p32 386 arm
package md5
diff --git a/src/pkg/crypto/rand/rand_unix.go b/src/pkg/crypto/rand/rand_unix.go
index 0fbd7eaf5..1e741fda1 100644
--- a/src/pkg/crypto/rand/rand_unix.go
+++ b/src/pkg/crypto/rand/rand_unix.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build darwin dragonfly freebsd linux netbsd openbsd plan9 solaris
+// +build darwin dragonfly freebsd linux nacl netbsd openbsd plan9 solaris
// Unix cryptographically secure pseudorandom number
// generator.
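
Only the build line changes here, because the Unix generator's approach carries over: crypto/rand is served by reading a urandom-style device, which the NaCl port is assumed to expose through its in-memory file system (see the syscall CLs in the commit message). A minimal sketch of that approach, not the package's actual code:

// randdev.go: a minimal sketch of the rand_unix.go approach, serving
// cryptographically secure random bytes from the kernel's urandom device.
// Illustration only; it assumes /dev/urandom is readable on the target.
package main

import (
	"fmt"
	"io"
	"os"
)

func main() {
	f, err := os.Open("/dev/urandom") // the device rand_unix.go reads from
	if err != nil {
		panic(err)
	}
	defer f.Close()

	buf := make([]byte, 16)
	if _, err := io.ReadFull(f, buf); err != nil {
		panic(err)
	}
	fmt.Printf("%x\n", buf)
}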
diff --git a/src/pkg/crypto/rc4/rc4_amd64p32.s b/src/pkg/crypto/rc4/rc4_amd64p32.s
new file mode 100644
index 000000000..27d849507
--- /dev/null
+++ b/src/pkg/crypto/rc4/rc4_amd64p32.s
@@ -0,0 +1,192 @@
+// Original source:
+// http://www.zorinaq.com/papers/rc4-amd64.html
+// http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
+
+#include "../../../cmd/ld/textflag.h"
+
+// Local modifications:
+//
+// Transliterated from GNU to 6a assembly syntax by the Go authors.
+// The comments and spacing are from the original.
+//
+// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
+//
+// The original code accumulated 64 bits of key stream in an integer
+// register and then XOR'ed the key stream into the data 8 bytes at a time.
+// Modified to accumulate 128 bits of key stream into an XMM register
+// and then XOR the key stream into the data 16 bytes at a time.
+// Approximately doubles throughput.
+//
+// Converted to amd64p32.
+//
+// To make safe for Native Client, avoid use of BP, R15,
+// and two-register addressing modes.
+
+// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
+// but makes the code run 2.0x slower on Xeon.
+#define EXTEND(r) MOVBLZX r, r
+
+/*
+** RC4 implementation optimized for AMD64.
+**
+** Author: Marc Bevand <bevand_m (at) epita.fr>
+** Licence: I hereby disclaim the copyright on this code and place it
+** in the public domain.
+**
+** The code has been designed to be easily integrated into openssl:
+** the exported RC4() function can replace the actual implementations
+** openssl already contains. Please note that when linking with openssl,
+** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
+** with -DRC4_INT='unsigned long'.
+**
+** The throughput achieved by this code is about 320 MBytes/sec, on
+** a 1.8 GHz AMD Opteron (rev C0) processor.
+*/
+
+TEXT ·xorKeyStream(SB),NOSPLIT,$0
+ MOVL n+8(FP), BX // rbx = ARG(len)
+ MOVL src+4(FP), SI // in = ARG(in)
+ MOVL dst+0(FP), DI // out = ARG(out)
+ MOVL state+12(FP), R10 // d = ARG(data)
+ MOVL i+16(FP), AX
+ MOVBQZX 0(AX), CX // x = *xp
+ MOVL j+20(FP), AX
+ MOVBQZX 0(AX), DX // y = *yp
+
+ LEAQ (SI)(BX*1), R9 // limit = in+len
+
+l1: CMPQ SI, R9 // cmp in with in+len
+ JGE finished // jump if (in >= in+len)
+
+ INCB CX
+ EXTEND(CX)
+ TESTL $15, CX
+ JZ wordloop
+ LEAL (R10)(CX*4), R12
+
+ MOVBLZX (R12), AX
+
+ ADDB AX, DX // y += tx
+ EXTEND(DX)
+ LEAL (R10)(DX*4), R11
+ MOVBLZX (R11), BX // ty = d[y]
+ MOVB BX, (R12) // d[x] = ty
+ ADDB AX, BX // val = ty+tx
+ EXTEND(BX)
+ LEAL (R10)(BX*4), R13
+ MOVB AX, (R11) // d[y] = tx
+ MOVBLZX (R13), R8 // val = d[val]
+ XORB (SI), R8 // xor 1 byte
+ MOVB R8, (DI)
+ INCQ SI // in++
+ INCQ DI // out++
+ JMP l1
+
+wordloop:
+ SUBQ $16, R9
+ CMPQ SI, R9
+ JGT end
+
+start:
+ ADDQ $16, SI // increment in
+ ADDQ $16, DI // increment out
+
+ // Each KEYROUND generates one byte of key and
+ // inserts it into an XMM register at the given 16-bit index.
+ // The key state array is uint32 words only using the bottom
+ // byte of each word, so the 16-bit OR only copies 8 useful bits.
+ // We accumulate alternating bytes into X0 and X1, and then at
+ // the end we OR X1<<8 into X0 to produce the actual key.
+ //
+ // At the beginning of the loop, CX%16 == 0, so the 16 loads
+ // at state[CX], state[CX+1], ..., state[CX+15] can precompute
+ // (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
+ // without fear of the byte computation CX+15 wrapping around.
+ //
+ // The first round needs R12[0], the second needs R12[1], and so on.
+ // We can avoid memory stalls by starting the load for round n+1
+ // before the end of round n, using the LOAD macro.
+ LEAQ (R10)(CX*4), R12
+
+#define KEYROUND(xmm, load, off, r1, r2, index) \
+ LEAL (R10)(DX*4), R11; \
+ MOVBLZX (R11), R8; \
+ MOVB r1, (R11); \
+ load((off+1), r2); \
+ MOVB R8, (off*4)(R12); \
+ ADDB r1, R8; \
+ EXTEND(R8); \
+ LEAL (R10)(R8*4), R14; \
+ PINSRW $index, (R14), xmm
+
+#define LOAD(off, reg) \
+ MOVBLZX (off*4)(R12), reg; \
+ ADDB reg, DX; \
+ EXTEND(DX)
+
+#define SKIP(off, reg)
+
+ LOAD(0, AX)
+ KEYROUND(X0, LOAD, 0, AX, BX, 0)
+ KEYROUND(X1, LOAD, 1, BX, AX, 0)
+ KEYROUND(X0, LOAD, 2, AX, BX, 1)
+ KEYROUND(X1, LOAD, 3, BX, AX, 1)
+ KEYROUND(X0, LOAD, 4, AX, BX, 2)
+ KEYROUND(X1, LOAD, 5, BX, AX, 2)
+ KEYROUND(X0, LOAD, 6, AX, BX, 3)
+ KEYROUND(X1, LOAD, 7, BX, AX, 3)
+ KEYROUND(X0, LOAD, 8, AX, BX, 4)
+ KEYROUND(X1, LOAD, 9, BX, AX, 4)
+ KEYROUND(X0, LOAD, 10, AX, BX, 5)
+ KEYROUND(X1, LOAD, 11, BX, AX, 5)
+ KEYROUND(X0, LOAD, 12, AX, BX, 6)
+ KEYROUND(X1, LOAD, 13, BX, AX, 6)
+ KEYROUND(X0, LOAD, 14, AX, BX, 7)
+ KEYROUND(X1, SKIP, 15, BX, AX, 7)
+
+ ADDB $16, CX
+
+ PSLLQ $8, X1
+ PXOR X1, X0
+ MOVOU -16(SI), X2
+ PXOR X0, X2
+ MOVOU X2, -16(DI)
+
+ CMPQ SI, R9 // cmp in with in+len-16
+ JLE start // jump if (in <= in+len-16)
+
+end:
+ DECB CX
+ ADDQ $16, R9 // tmp = in+len
+
+ // handle the last bytes, one by one
+l2: CMPQ SI, R9 // cmp in with in+len
+ JGE finished // jump if (in >= in+len)
+
+ INCB CX
+ EXTEND(CX)
+ LEAL (R10)(CX*4), R12
+ MOVBLZX (R12), AX
+
+ ADDB AX, DX // y += tx
+ EXTEND(DX)
+ LEAL (R10)(DX*4), R11
+ MOVBLZX (R11), BX // ty = d[y]
+ MOVB BX, (R12) // d[x] = ty
+ ADDB AX, BX // val = ty+tx
+ EXTEND(BX)
+ LEAL (R10)(BX*4), R13
+ MOVB AX, (R11) // d[y] = tx
+ MOVBLZX (R13), R8 // val = d[val]
+ XORB (SI), R8 // xor 1 byte
+ MOVB R8, (DI)
+ INCQ SI // in++
+ INCQ DI // out++
+ JMP l2
+
+finished:
+ MOVL j+20(FP), BX
+ MOVB DX, 0(BX)
+ MOVL i+16(FP), AX
+ MOVB CX, 0(AX)
+ RET
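
The byte-at-a-time loops above (l1 and l2) implement the standard scalar RC4 key-stream step; KEYROUND unrolls sixteen of those steps into X0 and X1. The state is stored as uint32 words with only the low byte significant, hence the (R10)(reg*4) indexing and the EXTEND truncation. A minimal pure-Go sketch of that scalar step follows; it is not the crypto/rc4 implementation, and the key setup is replaced by an identity permutation purely for illustration.

// rc4step.go: a minimal pure-Go sketch of the scalar RC4 key-stream step
// that the l1/l2 loops above perform one byte at a time. Illustration only.
package main

import "fmt"

type rc4State struct {
	s    [256]uint32 // low byte of each word holds the permutation entry
	i, j uint8
}

// xorKeyStream XORs one byte of key stream into each byte of src.
func (c *rc4State) xorKeyStream(dst, src []byte) {
	i, j := c.i, c.j
	for n, b := range src {
		i++
		tx := c.s[i]
		j += uint8(tx)
		ty := c.s[j]
		c.s[i], c.s[j] = ty, tx              // d[x] = ty; d[y] = tx
		dst[n] = b ^ byte(c.s[uint8(tx+ty)]) // val = d[tx+ty]; xor 1 byte
	}
	c.i, c.j = i, j
}

func main() {
	var c rc4State
	for k := range c.s { // identity permutation: a stand-in for real key setup
		c.s[k] = uint32(k)
	}
	in := []byte("hello")
	out := make([]byte, len(in))
	c.xorKeyStream(out, in)
	fmt.Printf("%x\n", out)
}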
diff --git a/src/pkg/crypto/rc4/rc4_asm.go b/src/pkg/crypto/rc4/rc4_asm.go
index c582a4488..fc71b9a6f 100644
--- a/src/pkg/crypto/rc4/rc4_asm.go
+++ b/src/pkg/crypto/rc4/rc4_asm.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build amd64 arm 386
+// +build amd64 amd64p32 arm 386
package rc4
diff --git a/src/pkg/crypto/rc4/rc4_ref.go b/src/pkg/crypto/rc4/rc4_ref.go
index bdf5e1db2..1ecce1a7f 100644
--- a/src/pkg/crypto/rc4/rc4_ref.go
+++ b/src/pkg/crypto/rc4/rc4_ref.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !amd64,!arm,!386
+// +build !amd64,!amd64p32,!arm,!386
package rc4
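
The rc4_asm.go and rc4_ref.go changes show the pairing this merge must preserve: the declaration file lists every GOARCH that has an assembly body, and the portable fallback negates each of them, so exactly one of the two files compiles for any target; amd64p32 therefore has to be added to both lines. A hypothetical fallback file sketching that shape (file, package, and function names are invented):

// rc4ref_sketch.go: hypothetical illustration of the negated-constraint
// fallback; its +build line is the complement of the asm file's
// "amd64 amd64p32 arm 386". Note the blank line that old-style build
// constraints require before the package clause.

// +build !amd64,!amd64p32,!arm,!386

package sketch

// process stands in for the routine that the assembly file would provide.
func process(p []byte) {
	for i := range p {
		p[i] ^= 0xff
	}
}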
diff --git a/src/pkg/crypto/sha1/sha1block_amd64p32.s b/src/pkg/crypto/sha1/sha1block_amd64p32.s
new file mode 100644
index 000000000..3c589d94f
--- /dev/null
+++ b/src/pkg/crypto/sha1/sha1block_amd64p32.s
@@ -0,0 +1,216 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "../../../cmd/ld/textflag.h"
+
+// SHA1 block routine. See sha1block.go for Go equivalent.
+//
+// There are 80 rounds of 4 types:
+// - rounds 0-15 are type 1 and load data (ROUND1 macro).
+// - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
+// - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
+// - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
+// - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
+//
+// Each round loads or shuffles the data, then computes a per-round
+// function of b, c, d, and then mixes the result into and rotates the
+// five registers a, b, c, d, e holding the intermediate results.
+//
+// The register rotation is implemented by rotating the arguments to
+// the round macros instead of by explicit move instructions.
+//
+// amd64p32 version.
+// To ensure safety for Native Client, avoids use of BP and R15
+// as well as two-register addressing modes.
+
+#define LOAD(index) \
+ MOVL (index*4)(SI), R10; \
+ BSWAPL R10; \
+ MOVL R10, (index*4)(SP)
+
+#define SHUFFLE(index) \
+ MOVL (((index)&0xf)*4)(SP), R10; \
+ XORL (((index-3)&0xf)*4)(SP), R10; \
+ XORL (((index-8)&0xf)*4)(SP), R10; \
+ XORL (((index-14)&0xf)*4)(SP), R10; \
+ ROLL $1, R10; \
+ MOVL R10, (((index)&0xf)*4)(SP)
+
+#define FUNC1(a, b, c, d, e) \
+ MOVL d, R9; \
+ XORL c, R9; \
+ ANDL b, R9; \
+ XORL d, R9
+
+#define FUNC2(a, b, c, d, e) \
+ MOVL b, R9; \
+ XORL c, R9; \
+ XORL d, R9
+
+#define FUNC3(a, b, c, d, e) \
+ MOVL b, R8; \
+ ORL c, R8; \
+ ANDL d, R8; \
+ MOVL b, R9; \
+ ANDL c, R9; \
+ ORL R8, R9
+
+#define FUNC4 FUNC2
+
+#define MIX(a, b, c, d, e, const) \
+ ROLL $30, b; \
+ ADDL R9, e; \
+ MOVL a, R8; \
+ ROLL $5, R8; \
+ LEAL const(e)(R10*1), e; \
+ ADDL R8, e
+
+#define ROUND1(a, b, c, d, e, index) \
+ LOAD(index); \
+ FUNC1(a, b, c, d, e); \
+ MIX(a, b, c, d, e, 0x5A827999)
+
+#define ROUND1x(a, b, c, d, e, index) \
+ SHUFFLE(index); \
+ FUNC1(a, b, c, d, e); \
+ MIX(a, b, c, d, e, 0x5A827999)
+
+#define ROUND2(a, b, c, d, e, index) \
+ SHUFFLE(index); \
+ FUNC2(a, b, c, d, e); \
+ MIX(a, b, c, d, e, 0x6ED9EBA1)
+
+#define ROUND3(a, b, c, d, e, index) \
+ SHUFFLE(index); \
+ FUNC3(a, b, c, d, e); \
+ MIX(a, b, c, d, e, 0x8F1BBCDC)
+
+#define ROUND4(a, b, c, d, e, index) \
+ SHUFFLE(index); \
+ FUNC4(a, b, c, d, e); \
+ MIX(a, b, c, d, e, 0xCA62C1D6)
+
+TEXT ·block(SB),NOSPLIT,$64-32
+ MOVL dig+0(FP), R14
+ MOVL p_base+4(FP), SI
+ MOVL p_len+8(FP), DX
+ SHRQ $6, DX
+ SHLQ $6, DX
+
+ LEAQ (SI)(DX*1), DI
+ MOVL (0*4)(R14), AX
+ MOVL (1*4)(R14), BX
+ MOVL (2*4)(R14), CX
+ MOVL (3*4)(R14), DX
+ MOVL (4*4)(R14), R13
+
+ CMPQ SI, DI
+ JEQ end
+
+loop:
+#define BP R13 /* keep diff from sha1block_amd64.s small */
+ ROUND1(AX, BX, CX, DX, BP, 0)
+ ROUND1(BP, AX, BX, CX, DX, 1)
+ ROUND1(DX, BP, AX, BX, CX, 2)
+ ROUND1(CX, DX, BP, AX, BX, 3)
+ ROUND1(BX, CX, DX, BP, AX, 4)
+ ROUND1(AX, BX, CX, DX, BP, 5)
+ ROUND1(BP, AX, BX, CX, DX, 6)
+ ROUND1(DX, BP, AX, BX, CX, 7)
+ ROUND1(CX, DX, BP, AX, BX, 8)
+ ROUND1(BX, CX, DX, BP, AX, 9)
+ ROUND1(AX, BX, CX, DX, BP, 10)
+ ROUND1(BP, AX, BX, CX, DX, 11)
+ ROUND1(DX, BP, AX, BX, CX, 12)
+ ROUND1(CX, DX, BP, AX, BX, 13)
+ ROUND1(BX, CX, DX, BP, AX, 14)
+ ROUND1(AX, BX, CX, DX, BP, 15)
+
+ ROUND1x(BP, AX, BX, CX, DX, 16)
+ ROUND1x(DX, BP, AX, BX, CX, 17)
+ ROUND1x(CX, DX, BP, AX, BX, 18)
+ ROUND1x(BX, CX, DX, BP, AX, 19)
+
+ ROUND2(AX, BX, CX, DX, BP, 20)
+ ROUND2(BP, AX, BX, CX, DX, 21)
+ ROUND2(DX, BP, AX, BX, CX, 22)
+ ROUND2(CX, DX, BP, AX, BX, 23)
+ ROUND2(BX, CX, DX, BP, AX, 24)
+ ROUND2(AX, BX, CX, DX, BP, 25)
+ ROUND2(BP, AX, BX, CX, DX, 26)
+ ROUND2(DX, BP, AX, BX, CX, 27)
+ ROUND2(CX, DX, BP, AX, BX, 28)
+ ROUND2(BX, CX, DX, BP, AX, 29)
+ ROUND2(AX, BX, CX, DX, BP, 30)
+ ROUND2(BP, AX, BX, CX, DX, 31)
+ ROUND2(DX, BP, AX, BX, CX, 32)
+ ROUND2(CX, DX, BP, AX, BX, 33)
+ ROUND2(BX, CX, DX, BP, AX, 34)
+ ROUND2(AX, BX, CX, DX, BP, 35)
+ ROUND2(BP, AX, BX, CX, DX, 36)
+ ROUND2(DX, BP, AX, BX, CX, 37)
+ ROUND2(CX, DX, BP, AX, BX, 38)
+ ROUND2(BX, CX, DX, BP, AX, 39)
+
+ ROUND3(AX, BX, CX, DX, BP, 40)
+ ROUND3(BP, AX, BX, CX, DX, 41)
+ ROUND3(DX, BP, AX, BX, CX, 42)
+ ROUND3(CX, DX, BP, AX, BX, 43)
+ ROUND3(BX, CX, DX, BP, AX, 44)
+ ROUND3(AX, BX, CX, DX, BP, 45)
+ ROUND3(BP, AX, BX, CX, DX, 46)
+ ROUND3(DX, BP, AX, BX, CX, 47)
+ ROUND3(CX, DX, BP, AX, BX, 48)
+ ROUND3(BX, CX, DX, BP, AX, 49)
+ ROUND3(AX, BX, CX, DX, BP, 50)
+ ROUND3(BP, AX, BX, CX, DX, 51)
+ ROUND3(DX, BP, AX, BX, CX, 52)
+ ROUND3(CX, DX, BP, AX, BX, 53)
+ ROUND3(BX, CX, DX, BP, AX, 54)
+ ROUND3(AX, BX, CX, DX, BP, 55)
+ ROUND3(BP, AX, BX, CX, DX, 56)
+ ROUND3(DX, BP, AX, BX, CX, 57)
+ ROUND3(CX, DX, BP, AX, BX, 58)
+ ROUND3(BX, CX, DX, BP, AX, 59)
+
+ ROUND4(AX, BX, CX, DX, BP, 60)
+ ROUND4(BP, AX, BX, CX, DX, 61)
+ ROUND4(DX, BP, AX, BX, CX, 62)
+ ROUND4(CX, DX, BP, AX, BX, 63)
+ ROUND4(BX, CX, DX, BP, AX, 64)
+ ROUND4(AX, BX, CX, DX, BP, 65)
+ ROUND4(BP, AX, BX, CX, DX, 66)
+ ROUND4(DX, BP, AX, BX, CX, 67)
+ ROUND4(CX, DX, BP, AX, BX, 68)
+ ROUND4(BX, CX, DX, BP, AX, 69)
+ ROUND4(AX, BX, CX, DX, BP, 70)
+ ROUND4(BP, AX, BX, CX, DX, 71)
+ ROUND4(DX, BP, AX, BX, CX, 72)
+ ROUND4(CX, DX, BP, AX, BX, 73)
+ ROUND4(BX, CX, DX, BP, AX, 74)
+ ROUND4(AX, BX, CX, DX, BP, 75)
+ ROUND4(BP, AX, BX, CX, DX, 76)
+ ROUND4(DX, BP, AX, BX, CX, 77)
+ ROUND4(CX, DX, BP, AX, BX, 78)
+ ROUND4(BX, CX, DX, BP, AX, 79)
+#undef BP
+
+ ADDL (0*4)(R14), AX
+ ADDL (1*4)(R14), BX
+ ADDL (2*4)(R14), CX
+ ADDL (3*4)(R14), DX
+ ADDL (4*4)(R14), R13
+
+ MOVL AX, (0*4)(R14)
+ MOVL BX, (1*4)(R14)
+ MOVL CX, (2*4)(R14)
+ MOVL DX, (3*4)(R14)
+ MOVL R13, (4*4)(R14)
+
+ ADDQ $64, SI
+ CMPQ SI, DI
+ JB loop
+
+end:
+ RET
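
Concretely, FUNC1 above computes the SHA-1 Ch function, FUNC2 and FUNC4 compute parity, FUNC3 computes Maj, and MIX folds the result into e and rotates b, with the five-register rotation done by permuting the macro arguments between calls. A minimal pure-Go sketch of one such round follows; it is not the sha1block.go in this tree, and the initial state words and zero message word are only illustrative.

// sha1round.go: a minimal pure-Go sketch of what one FUNCn+MIX pair above
// computes for a single SHA-1 round: e += rol5(a) + f(b,c,d) + w + k, then
// b = rol30(b), with the register rotation done by rotating the arguments.
package main

import (
	"fmt"
	"math/bits"
)

func ch(b, c, d uint32) uint32     { return d ^ (b & (c ^ d)) }       // FUNC1
func parity(b, c, d uint32) uint32 { return b ^ c ^ d }               // FUNC2, FUNC4
func maj(b, c, d uint32) uint32    { return (b & c) | ((b | c) & d) } // FUNC3

// round applies one SHA-1 round and returns the rotated (a, b, c, d, e).
func round(f func(b, c, d uint32) uint32, k, a, b, c, d, e, w uint32) (uint32, uint32, uint32, uint32, uint32) {
	e += bits.RotateLeft32(a, 5) + f(b, c, d) + w + k
	b = bits.RotateLeft32(b, 30)
	return e, a, b, c, d // argument rotation, as in the ROUNDn macro calls
}

func main() {
	// SHA-1 initial state and a zero message word, using the round-1
	// constant 0x5A827999 from the ROUND1 macro above.
	a, b, c, d, e := uint32(0x67452301), uint32(0xEFCDAB89), uint32(0x98BADCFE), uint32(0x10325476), uint32(0xC3D2E1F0)
	a, b, c, d, e = round(ch, 0x5A827999, a, b, c, d, e, 0)
	fmt.Printf("%08x %08x %08x %08x %08x\n", a, b, c, d, e)
}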
diff --git a/src/pkg/crypto/sha1/sha1block_decl.go b/src/pkg/crypto/sha1/sha1block_decl.go
index b2c68f0e8..2331deb3a 100644
--- a/src/pkg/crypto/sha1/sha1block_decl.go
+++ b/src/pkg/crypto/sha1/sha1block_decl.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build amd64 386 arm
+// +build amd64 amd64p32 386
package sha1
diff --git a/src/pkg/crypto/x509/root_unix.go b/src/pkg/crypto/x509/root_unix.go
index a5bd19e82..11ad3c440 100644
--- a/src/pkg/crypto/x509/root_unix.go
+++ b/src/pkg/crypto/x509/root_unix.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build dragonfly freebsd linux openbsd netbsd solaris
+// +build dragonfly freebsd linux nacl netbsd openbsd solaris
package x509