path: root/src/pkg/crypto
author    Dave Cheney <dave@cheney.net>  2014-02-25 09:47:42 -0500
committer Dave Cheney <dave@cheney.net>  2014-02-25 09:47:42 -0500
commit    0e79f07290bb48f4188338b63faeb771a76e647a (patch)
tree      d5f6df7e130b20c5907de69e07dfafa943434fa8 /src/pkg/crypto
parent    52dde77c1b85ecfae27ac82645c1ea5ec80db930 (diff)
download  go-0e79f07290bb48f4188338b63faeb771a76e647a.tar.gz
all: merge NaCl branch (part 1)
See golang.org/s/go13nacl for design overview.

This CL is the mostly mechanical changes from rsc's Go 1.2 based NaCl branch,
specifically 39cb35750369 to 500771b477cf from
https://code.google.com/r/rsc-go13nacl. This CL does not include working NaCl
support, there are probably two or three more large merges to come. CL 15750044
is not included as it involves more invasive changes to the linker which will
need to be merged separately.

The exact change lists included are

15050047: syscall: support for Native Client
15360044: syscall: unzip implementation for Native Client
15370044: syscall: Native Client SRPC implementation
15400047: cmd/dist, cmd/go, go/build, test: support for Native Client
15410048: runtime: support for Native Client
15410049: syscall: file descriptor table for Native Client
15410050: syscall: in-memory file system for Native Client
15440048: all: update +build lines for Native Client port
15540045: cmd/6g, cmd/8g, cmd/gc: support for Native Client
15570045: os: support for Native Client
15680044: crypto/..., hash/crc32, reflect, sync/atomic: support for amd64p32
15690044: net: support for Native Client
15690048: runtime: support for fake time like on Go Playground
15690051: build: disable various tests on Native Client

LGTM=rsc
R=rsc
CC=golang-codereviews
https://codereview.appspot.com/68150047

Committer: Russ Cox <rsc@golang.org>
Diffstat (limited to 'src/pkg/crypto')
-rw-r--r--  src/pkg/crypto/md5/md5block_amd64p32.s    184
-rw-r--r--  src/pkg/crypto/md5/md5block_decl.go          2
-rw-r--r--  src/pkg/crypto/rand/rand_unix.go             2
-rw-r--r--  src/pkg/crypto/rc4/rc4_amd64p32.s          192
-rw-r--r--  src/pkg/crypto/rc4/rc4_asm.go                2
-rw-r--r--  src/pkg/crypto/rc4/rc4_ref.go                2
-rw-r--r--  src/pkg/crypto/sha1/sha1block_amd64p32.s   216
-rw-r--r--  src/pkg/crypto/sha1/sha1block_decl.go        2
-rw-r--r--  src/pkg/crypto/x509/root_unix.go             2
9 files changed, 598 insertions, 6 deletions
diff --git a/src/pkg/crypto/md5/md5block_amd64p32.s b/src/pkg/crypto/md5/md5block_amd64p32.s
new file mode 100644
index 000000000..a78a3f610
--- /dev/null
+++ b/src/pkg/crypto/md5/md5block_amd64p32.s
@@ -0,0 +1,184 @@
+// Original source:
+// http://www.zorinaq.com/papers/md5-amd64.html
+// http://www.zorinaq.com/papers/md5-amd64.tar.bz2
+//
+// Translated from Perl generating GNU assembly into
+// #defines generating 6a assembly by the Go Authors.
+//
+// Restrictions to make code safe for Native Client:
+// replace BP with R11, reloaded before use at return.
+// replace R15 with R11.
+
+#include "../../../cmd/ld/textflag.h"
+
+// MD5 optimized for AMD64.
+//
+// Author: Marc Bevand <bevand_m (at) epita.fr>
+// Licence: I hereby disclaim the copyright on this code and place it
+// in the public domain.
+
+TEXT ·block(SB),NOSPLIT,$0-32
+ MOVL dig+0(FP), R11
+ MOVL p+4(FP), SI
+ MOVL p_len+8(FP), DX
+ SHRQ $6, DX
+ SHLQ $6, DX
+
+ LEAQ (SI)(DX*1), DI
+ MOVL (0*4)(R11), AX
+ MOVL (1*4)(R11), BX
+ MOVL (2*4)(R11), CX
+ MOVL (3*4)(R11), DX
+
+ CMPQ SI, DI
+ JEQ end
+
+loop:
+ MOVL AX, R12
+ MOVL BX, R13
+ MOVL CX, R14
+ MOVL DX, R11
+
+ MOVL (0*4)(SI), R8
+ MOVL DX, R9
+
+#define ROUND1(a, b, c, d, index, const, shift) \
+ XORL c, R9; \
+ LEAL const(a)(R8*1), a; \
+ ANDL b, R9; \
+ XORL d, R9; \
+ MOVL (index*4)(SI), R8; \
+ ADDL R9, a; \
+ ROLL $shift, a; \
+ MOVL c, R9; \
+ ADDL b, a
+
+ ROUND1(AX,BX,CX,DX, 1,0xd76aa478, 7);
+ ROUND1(DX,AX,BX,CX, 2,0xe8c7b756,12);
+ ROUND1(CX,DX,AX,BX, 3,0x242070db,17);
+ ROUND1(BX,CX,DX,AX, 4,0xc1bdceee,22);
+ ROUND1(AX,BX,CX,DX, 5,0xf57c0faf, 7);
+ ROUND1(DX,AX,BX,CX, 6,0x4787c62a,12);
+ ROUND1(CX,DX,AX,BX, 7,0xa8304613,17);
+ ROUND1(BX,CX,DX,AX, 8,0xfd469501,22);
+ ROUND1(AX,BX,CX,DX, 9,0x698098d8, 7);
+ ROUND1(DX,AX,BX,CX,10,0x8b44f7af,12);
+ ROUND1(CX,DX,AX,BX,11,0xffff5bb1,17);
+ ROUND1(BX,CX,DX,AX,12,0x895cd7be,22);
+ ROUND1(AX,BX,CX,DX,13,0x6b901122, 7);
+ ROUND1(DX,AX,BX,CX,14,0xfd987193,12);
+ ROUND1(CX,DX,AX,BX,15,0xa679438e,17);
+ ROUND1(BX,CX,DX,AX, 0,0x49b40821,22);
+
+ MOVL (1*4)(SI), R8
+ MOVL DX, R9
+ MOVL DX, R10
+
+#define ROUND2(a, b, c, d, index, const, shift) \
+ NOTL R9; \
+ LEAL const(a)(R8*1),a; \
+ ANDL b, R10; \
+ ANDL c, R9; \
+ MOVL (index*4)(SI),R8; \
+ ORL R9, R10; \
+ MOVL c, R9; \
+ ADDL R10, a; \
+ MOVL c, R10; \
+ ROLL $shift, a; \
+ ADDL b, a
+
+ ROUND2(AX,BX,CX,DX, 6,0xf61e2562, 5);
+ ROUND2(DX,AX,BX,CX,11,0xc040b340, 9);
+ ROUND2(CX,DX,AX,BX, 0,0x265e5a51,14);
+ ROUND2(BX,CX,DX,AX, 5,0xe9b6c7aa,20);
+ ROUND2(AX,BX,CX,DX,10,0xd62f105d, 5);
+ ROUND2(DX,AX,BX,CX,15, 0x2441453, 9);
+ ROUND2(CX,DX,AX,BX, 4,0xd8a1e681,14);
+ ROUND2(BX,CX,DX,AX, 9,0xe7d3fbc8,20);
+ ROUND2(AX,BX,CX,DX,14,0x21e1cde6, 5);
+ ROUND2(DX,AX,BX,CX, 3,0xc33707d6, 9);
+ ROUND2(CX,DX,AX,BX, 8,0xf4d50d87,14);
+ ROUND2(BX,CX,DX,AX,13,0x455a14ed,20);
+ ROUND2(AX,BX,CX,DX, 2,0xa9e3e905, 5);
+ ROUND2(DX,AX,BX,CX, 7,0xfcefa3f8, 9);
+ ROUND2(CX,DX,AX,BX,12,0x676f02d9,14);
+ ROUND2(BX,CX,DX,AX, 0,0x8d2a4c8a,20);
+
+ MOVL (5*4)(SI), R8
+ MOVL CX, R9
+
+#define ROUND3(a, b, c, d, index, const, shift) \
+ LEAL const(a)(R8*1),a; \
+ MOVL (index*4)(SI),R8; \
+ XORL d, R9; \
+ XORL b, R9; \
+ ADDL R9, a; \
+ ROLL $shift, a; \
+ MOVL b, R9; \
+ ADDL b, a
+
+ ROUND3(AX,BX,CX,DX, 8,0xfffa3942, 4);
+ ROUND3(DX,AX,BX,CX,11,0x8771f681,11);
+ ROUND3(CX,DX,AX,BX,14,0x6d9d6122,16);
+ ROUND3(BX,CX,DX,AX, 1,0xfde5380c,23);
+ ROUND3(AX,BX,CX,DX, 4,0xa4beea44, 4);
+ ROUND3(DX,AX,BX,CX, 7,0x4bdecfa9,11);
+ ROUND3(CX,DX,AX,BX,10,0xf6bb4b60,16);
+ ROUND3(BX,CX,DX,AX,13,0xbebfbc70,23);
+ ROUND3(AX,BX,CX,DX, 0,0x289b7ec6, 4);
+ ROUND3(DX,AX,BX,CX, 3,0xeaa127fa,11);
+ ROUND3(CX,DX,AX,BX, 6,0xd4ef3085,16);
+ ROUND3(BX,CX,DX,AX, 9, 0x4881d05,23);
+ ROUND3(AX,BX,CX,DX,12,0xd9d4d039, 4);
+ ROUND3(DX,AX,BX,CX,15,0xe6db99e5,11);
+ ROUND3(CX,DX,AX,BX, 2,0x1fa27cf8,16);
+ ROUND3(BX,CX,DX,AX, 0,0xc4ac5665,23);
+
+ MOVL (0*4)(SI), R8
+ MOVL $0xffffffff, R9
+ XORL DX, R9
+
+#define ROUND4(a, b, c, d, index, const, shift) \
+ LEAL const(a)(R8*1),a; \
+ ORL b, R9; \
+ XORL c, R9; \
+ ADDL R9, a; \
+ MOVL (index*4)(SI),R8; \
+ MOVL $0xffffffff, R9; \
+ ROLL $shift, a; \
+ XORL c, R9; \
+ ADDL b, a
+
+ ROUND4(AX,BX,CX,DX, 7,0xf4292244, 6);
+ ROUND4(DX,AX,BX,CX,14,0x432aff97,10);
+ ROUND4(CX,DX,AX,BX, 5,0xab9423a7,15);
+ ROUND4(BX,CX,DX,AX,12,0xfc93a039,21);
+ ROUND4(AX,BX,CX,DX, 3,0x655b59c3, 6);
+ ROUND4(DX,AX,BX,CX,10,0x8f0ccc92,10);
+ ROUND4(CX,DX,AX,BX, 1,0xffeff47d,15);
+ ROUND4(BX,CX,DX,AX, 8,0x85845dd1,21);
+ ROUND4(AX,BX,CX,DX,15,0x6fa87e4f, 6);
+ ROUND4(DX,AX,BX,CX, 6,0xfe2ce6e0,10);
+ ROUND4(CX,DX,AX,BX,13,0xa3014314,15);
+ ROUND4(BX,CX,DX,AX, 4,0x4e0811a1,21);
+ ROUND4(AX,BX,CX,DX,11,0xf7537e82, 6);
+ ROUND4(DX,AX,BX,CX, 2,0xbd3af235,10);
+ ROUND4(CX,DX,AX,BX, 9,0x2ad7d2bb,15);
+ ROUND4(BX,CX,DX,AX, 0,0xeb86d391,21);
+
+ ADDL R12, AX
+ ADDL R13, BX
+ ADDL R14, CX
+ ADDL R11, DX
+
+ ADDQ $64, SI
+ CMPQ SI, DI
+ JB loop
+
+end:
+ MOVL dig+0(FP), R11
+ MOVL AX, (0*4)(R11)
+ MOVL BX, (1*4)(R11)
+ MOVL CX, (2*4)(R11)
+ MOVL DX, (3*4)(R11)
+ RET
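
The file above is the amd64p32 translation of md5block_amd64.s: pointers are 32 bits, so dig and p are loaded with MOVL from 4-byte frame offsets, while the length arithmetic still uses 64-bit SHRQ/SHLQ. Each ROUNDn macro performs the standard MD5 step from RFC 1321. The following pure-Go fragment is a minimal sketch of that step, not part of this CL; the constant 0xd76aa478 and shift 7 are taken from the first ROUND1 invocation, and the file name, main function, and zero message word are illustrative assumptions.

// md5step.go: a minimal pure-Go sketch of the step each ROUND1..ROUND4
// macro above performs: a = b + rol(a + f(b,c,d) + m + k, s).
// Not part of this CL; constants follow RFC 1321.
package main

import (
	"fmt"
	"math/bits"
)

func f1(b, c, d uint32) uint32 { return d ^ (b & (c ^ d)) } // ROUND1: (b&c)|(^b&d)
func f2(b, c, d uint32) uint32 { return c ^ (d & (b ^ c)) } // ROUND2: (b&d)|(c&^d)
func f3(b, c, d uint32) uint32 { return b ^ c ^ d }         // ROUND3: b^c^d
func f4(b, c, d uint32) uint32 { return c ^ (b | ^d) }      // ROUND4: c^(b|^d)

// step is one MD5 step; the assembly keeps a..d in AX..DX and m in R8.
func step(f func(b, c, d uint32) uint32, a, b, c, d, m, k uint32, s int) uint32 {
	return b + bits.RotateLeft32(a+f(b, c, d)+m+k, s)
}

func main() {
	// MD5 initial state words, mixed with a zero message word using the
	// first round's constant 0xd76aa478 and shift 7 (see ROUND1 above).
	a, b, c, d := uint32(0x67452301), uint32(0xefcdab89), uint32(0x98badcfe), uint32(0x10325476)
	fmt.Printf("a = %#08x (b, c, d unchanged: %#08x %#08x %#08x)\n",
		step(f1, a, b, c, d, 0, 0xd76aa478, 7), b, c, d)
}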
diff --git a/src/pkg/crypto/md5/md5block_decl.go b/src/pkg/crypto/md5/md5block_decl.go
index c4d6aaaf0..d7956a6d2 100644
--- a/src/pkg/crypto/md5/md5block_decl.go
+++ b/src/pkg/crypto/md5/md5block_decl.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build amd64 386 arm
+// +build amd64 amd64p32 386 arm
package md5
diff --git a/src/pkg/crypto/rand/rand_unix.go b/src/pkg/crypto/rand/rand_unix.go
index 0fbd7eaf5..1e741fda1 100644
--- a/src/pkg/crypto/rand/rand_unix.go
+++ b/src/pkg/crypto/rand/rand_unix.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build darwin dragonfly freebsd linux netbsd openbsd plan9 solaris
+// +build darwin dragonfly freebsd linux nacl netbsd openbsd plan9 solaris
// Unix cryptographically secure pseudorandom number
// generator.
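
Only the build line changes here, because the Unix generator's approach carries over: crypto/rand is served by reading a urandom-style device, which the NaCl port is assumed to expose through its in-memory file system (see the syscall CLs in the commit message). A minimal sketch of that approach, not the package's actual code:

// randdev.go: a minimal sketch of the rand_unix.go approach, serving
// cryptographically secure random bytes from the kernel's urandom device.
// Illustration only; it assumes /dev/urandom is readable on the target.
package main

import (
	"fmt"
	"io"
	"os"
)

func main() {
	f, err := os.Open("/dev/urandom") // the device rand_unix.go reads from
	if err != nil {
		panic(err)
	}
	defer f.Close()

	buf := make([]byte, 16)
	if _, err := io.ReadFull(f, buf); err != nil {
		panic(err)
	}
	fmt.Printf("%x\n", buf)
}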
diff --git a/src/pkg/crypto/rc4/rc4_amd64p32.s b/src/pkg/crypto/rc4/rc4_amd64p32.s
new file mode 100644
index 000000000..27d849507
--- /dev/null
+++ b/src/pkg/crypto/rc4/rc4_amd64p32.s
@@ -0,0 +1,192 @@
+// Original source:
+// http://www.zorinaq.com/papers/rc4-amd64.html
+// http://www.zorinaq.com/papers/rc4-amd64.tar.bz2
+
+#include "../../../cmd/ld/textflag.h"
+
+// Local modifications:
+//
+// Transliterated from GNU to 6a assembly syntax by the Go authors.
+// The comments and spacing are from the original.
+//
+// The new EXTEND macros avoid a bad stall on some systems after 8-bit math.
+//
+// The original code accumulated 64 bits of key stream in an integer
+// register and then XOR'ed the key stream into the data 8 bytes at a time.
+// Modified to accumulate 128 bits of key stream into an XMM register
+// and then XOR the key stream into the data 16 bytes at a time.
+// Approximately doubles throughput.
+//
+// Converted to amd64p32.
+//
+// To make safe for Native Client, avoid use of BP, R15,
+// and two-register addressing modes.
+
+// NOTE: Changing EXTEND to a no-op makes the code run 1.2x faster on Core i5
+// but makes the code run 2.0x slower on Xeon.
+#define EXTEND(r) MOVBLZX r, r
+
+/*
+** RC4 implementation optimized for AMD64.
+**
+** Author: Marc Bevand <bevand_m (at) epita.fr>
+** Licence: I hereby disclaim the copyright on this code and place it
+** in the public domain.
+**
+** The code has been designed to be easily integrated into openssl:
+** the exported RC4() function can replace the actual implementations
+** openssl already contains. Please note that when linking with openssl,
+** it requires that sizeof(RC4_INT) == 8. So openssl must be compiled
+** with -DRC4_INT='unsigned long'.
+**
+** The throughput achieved by this code is about 320 MBytes/sec, on
+** a 1.8 GHz AMD Opteron (rev C0) processor.
+*/
+
+TEXT ·xorKeyStream(SB),NOSPLIT,$0
+ MOVL n+8(FP), BX // rbx = ARG(len)
+ MOVL src+4(FP), SI // in = ARG(in)
+ MOVL dst+0(FP), DI // out = ARG(out)
+ MOVL state+12(FP), R10 // d = ARG(data)
+ MOVL i+16(FP), AX
+ MOVBQZX 0(AX), CX // x = *xp
+ MOVL j+20(FP), AX
+ MOVBQZX 0(AX), DX // y = *yp
+
+ LEAQ (SI)(BX*1), R9 // limit = in+len
+
+l1: CMPQ SI, R9 // cmp in with in+len
+ JGE finished // jump if (in >= in+len)
+
+ INCB CX
+ EXTEND(CX)
+ TESTL $15, CX
+ JZ wordloop
+ LEAL (R10)(CX*4), R12
+
+ MOVBLZX (R12), AX
+
+ ADDB AX, DX // y += tx
+ EXTEND(DX)
+ LEAL (R10)(DX*4), R11
+ MOVBLZX (R11), BX // ty = d[y]
+ MOVB BX, (R12) // d[x] = ty
+ ADDB AX, BX // val = ty+tx
+ EXTEND(BX)
+ LEAL (R10)(BX*4), R13
+ MOVB AX, (R11) // d[y] = tx
+ MOVBLZX (R13), R8 // val = d[val]
+ XORB (SI), R8 // xor 1 byte
+ MOVB R8, (DI)
+ INCQ SI // in++
+ INCQ DI // out++
+ JMP l1
+
+wordloop:
+ SUBQ $16, R9
+ CMPQ SI, R9
+ JGT end
+
+start:
+ ADDQ $16, SI // increment in
+ ADDQ $16, DI // increment out
+
+ // Each KEYROUND generates one byte of key and
+ // inserts it into an XMM register at the given 16-bit index.
+ // The key state array is uint32 words only using the bottom
+ // byte of each word, so the 16-bit OR only copies 8 useful bits.
+ // We accumulate alternating bytes into X0 and X1, and then at
+ // the end we OR X1<<8 into X0 to produce the actual key.
+ //
+ // At the beginning of the loop, CX%16 == 0, so the 16 loads
+ // at state[CX], state[CX+1], ..., state[CX+15] can precompute
+ // (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
+ // without fear of the byte computation CX+15 wrapping around.
+ //
+ // The first round needs R12[0], the second needs R12[1], and so on.
+ // We can avoid memory stalls by starting the load for round n+1
+ // before the end of round n, using the LOAD macro.
+ LEAQ (R10)(CX*4), R12
+
+#define KEYROUND(xmm, load, off, r1, r2, index) \
+ LEAL (R10)(DX*4), R11; \
+ MOVBLZX (R11), R8; \
+ MOVB r1, (R11); \
+ load((off+1), r2); \
+ MOVB R8, (off*4)(R12); \
+ ADDB r1, R8; \
+ EXTEND(R8); \
+ LEAL (R10)(R8*4), R14; \
+ PINSRW $index, (R14), xmm
+
+#define LOAD(off, reg) \
+ MOVBLZX (off*4)(R12), reg; \
+ ADDB reg, DX; \
+ EXTEND(DX)
+
+#define SKIP(off, reg)
+
+ LOAD(0, AX)
+ KEYROUND(X0, LOAD, 0, AX, BX, 0)
+ KEYROUND(X1, LOAD, 1, BX, AX, 0)
+ KEYROUND(X0, LOAD, 2, AX, BX, 1)
+ KEYROUND(X1, LOAD, 3, BX, AX, 1)
+ KEYROUND(X0, LOAD, 4, AX, BX, 2)
+ KEYROUND(X1, LOAD, 5, BX, AX, 2)
+ KEYROUND(X0, LOAD, 6, AX, BX, 3)
+ KEYROUND(X1, LOAD, 7, BX, AX, 3)
+ KEYROUND(X0, LOAD, 8, AX, BX, 4)
+ KEYROUND(X1, LOAD, 9, BX, AX, 4)
+ KEYROUND(X0, LOAD, 10, AX, BX, 5)
+ KEYROUND(X1, LOAD, 11, BX, AX, 5)
+ KEYROUND(X0, LOAD, 12, AX, BX, 6)
+ KEYROUND(X1, LOAD, 13, BX, AX, 6)
+ KEYROUND(X0, LOAD, 14, AX, BX, 7)
+ KEYROUND(X1, SKIP, 15, BX, AX, 7)
+
+ ADDB $16, CX
+
+ PSLLQ $8, X1
+ PXOR X1, X0
+ MOVOU -16(SI), X2
+ PXOR X0, X2
+ MOVOU X2, -16(DI)
+
+ CMPQ SI, R9 // cmp in with in+len-16
+ JLE start // jump if (in <= in+len-16)
+
+end:
+ DECB CX
+ ADDQ $16, R9 // tmp = in+len
+
+ // handle the last bytes, one by one
+l2: CMPQ SI, R9 // cmp in with in+len
+ JGE finished // jump if (in >= in+len)
+
+ INCB CX
+ EXTEND(CX)
+ LEAL (R10)(CX*4), R12
+ MOVBLZX (R12), AX
+
+ ADDB AX, DX // y += tx
+ EXTEND(DX)
+ LEAL (R10)(DX*4), R11
+ MOVBLZX (R11), BX // ty = d[y]
+ MOVB BX, (R12) // d[x] = ty
+ ADDB AX, BX // val = ty+tx
+ EXTEND(BX)
+ LEAL (R10)(BX*4), R13
+ MOVB AX, (R11) // d[y] = tx
+ MOVBLZX (R13), R8 // val = d[val]
+ XORB (SI), R8 // xor 1 byte
+ MOVB R8, (DI)
+ INCQ SI // in++
+ INCQ DI // out++
+ JMP l2
+
+finished:
+ MOVL j+20(FP), BX
+ MOVB DX, 0(BX)
+ MOVL i+16(FP), AX
+ MOVB CX, 0(AX)
+ RET
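
The byte-at-a-time loops above (l1 and l2) implement the standard scalar RC4 key-stream step; KEYROUND unrolls sixteen of those steps into X0 and X1. The state is stored as uint32 words with only the low byte significant, hence the (R10)(reg*4) indexing and the EXTEND truncation. A minimal pure-Go sketch of that scalar step follows; it is not the crypto/rc4 implementation, and the key setup is replaced by an identity permutation purely for illustration.

// rc4step.go: a minimal pure-Go sketch of the scalar RC4 key-stream step
// that the l1/l2 loops above perform one byte at a time. Illustration only.
package main

import "fmt"

type rc4State struct {
	s    [256]uint32 // low byte of each word holds the permutation entry
	i, j uint8
}

// xorKeyStream XORs one byte of key stream into each byte of src.
func (c *rc4State) xorKeyStream(dst, src []byte) {
	i, j := c.i, c.j
	for n, b := range src {
		i++
		tx := c.s[i]
		j += uint8(tx)
		ty := c.s[j]
		c.s[i], c.s[j] = ty, tx              // d[x] = ty; d[y] = tx
		dst[n] = b ^ byte(c.s[uint8(tx+ty)]) // val = d[tx+ty]; xor 1 byte
	}
	c.i, c.j = i, j
}

func main() {
	var c rc4State
	for k := range c.s { // identity permutation: a stand-in for real key setup
		c.s[k] = uint32(k)
	}
	in := []byte("hello")
	out := make([]byte, len(in))
	c.xorKeyStream(out, in)
	fmt.Printf("%x\n", out)
}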
diff --git a/src/pkg/crypto/rc4/rc4_asm.go b/src/pkg/crypto/rc4/rc4_asm.go
index c582a4488..fc71b9a6f 100644
--- a/src/pkg/crypto/rc4/rc4_asm.go
+++ b/src/pkg/crypto/rc4/rc4_asm.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build amd64 arm 386
+// +build amd64 amd64p32 arm 386
package rc4
diff --git a/src/pkg/crypto/rc4/rc4_ref.go b/src/pkg/crypto/rc4/rc4_ref.go
index bdf5e1db2..1ecce1a7f 100644
--- a/src/pkg/crypto/rc4/rc4_ref.go
+++ b/src/pkg/crypto/rc4/rc4_ref.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !amd64,!arm,!386
+// +build !amd64,!amd64p32,!arm,!386
package rc4
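
The rc4_asm.go and rc4_ref.go changes show the pairing this merge must preserve: the declaration file lists every GOARCH that has an assembly body, and the portable fallback negates each of them, so exactly one of the two files compiles for any target; amd64p32 therefore has to be added to both lines. A hypothetical fallback file sketching that shape (file, package, and function names are invented):

// rc4ref_sketch.go: hypothetical illustration of the negated-constraint
// fallback; its +build line is the complement of the asm file's
// "amd64 amd64p32 arm 386". Note the blank line that old-style build
// constraints require before the package clause.

// +build !amd64,!amd64p32,!arm,!386

package sketch

// process stands in for the routine that the assembly file would provide.
func process(p []byte) {
	for i := range p {
		p[i] ^= 0xff
	}
}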
diff --git a/src/pkg/crypto/sha1/sha1block_amd64p32.s b/src/pkg/crypto/sha1/sha1block_amd64p32.s
new file mode 100644
index 000000000..3c589d94f
--- /dev/null
+++ b/src/pkg/crypto/sha1/sha1block_amd64p32.s
@@ -0,0 +1,216 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "../../../cmd/ld/textflag.h"
+
+// SHA1 block routine. See sha1block.go for Go equivalent.
+//
+// There are 80 rounds of 4 types:
+// - rounds 0-15 are type 1 and load data (ROUND1 macro).
+// - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
+// - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
+// - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
+// - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
+//
+// Each round loads or shuffles the data, then computes a per-round
+// function of b, c, d, and then mixes the result into and rotates the
+// five registers a, b, c, d, e holding the intermediate results.
+//
+// The register rotation is implemented by rotating the arguments to
+// the round macros instead of by explicit move instructions.
+//
+// amd64p32 version.
+// To ensure safety for Native Client, avoids use of BP and R15
+// as well as two-register addressing modes.
+
+#define LOAD(index) \
+ MOVL (index*4)(SI), R10; \
+ BSWAPL R10; \
+ MOVL R10, (index*4)(SP)
+
+#define SHUFFLE(index) \
+ MOVL (((index)&0xf)*4)(SP), R10; \
+ XORL (((index-3)&0xf)*4)(SP), R10; \
+ XORL (((index-8)&0xf)*4)(SP), R10; \
+ XORL (((index-14)&0xf)*4)(SP), R10; \
+ ROLL $1, R10; \
+ MOVL R10, (((index)&0xf)*4)(SP)
+
+#define FUNC1(a, b, c, d, e) \
+ MOVL d, R9; \
+ XORL c, R9; \
+ ANDL b, R9; \
+ XORL d, R9
+
+#define FUNC2(a, b, c, d, e) \
+ MOVL b, R9; \
+ XORL c, R9; \
+ XORL d, R9
+
+#define FUNC3(a, b, c, d, e) \
+ MOVL b, R8; \
+ ORL c, R8; \
+ ANDL d, R8; \
+ MOVL b, R9; \
+ ANDL c, R9; \
+ ORL R8, R9
+
+#define FUNC4 FUNC2
+
+#define MIX(a, b, c, d, e, const) \
+ ROLL $30, b; \
+ ADDL R9, e; \
+ MOVL a, R8; \
+ ROLL $5, R8; \
+ LEAL const(e)(R10*1), e; \
+ ADDL R8, e
+
+#define ROUND1(a, b, c, d, e, index) \
+ LOAD(index); \
+ FUNC1(a, b, c, d, e); \
+ MIX(a, b, c, d, e, 0x5A827999)
+
+#define ROUND1x(a, b, c, d, e, index) \
+ SHUFFLE(index); \
+ FUNC1(a, b, c, d, e); \
+ MIX(a, b, c, d, e, 0x5A827999)
+
+#define ROUND2(a, b, c, d, e, index) \
+ SHUFFLE(index); \
+ FUNC2(a, b, c, d, e); \
+ MIX(a, b, c, d, e, 0x6ED9EBA1)
+
+#define ROUND3(a, b, c, d, e, index) \
+ SHUFFLE(index); \
+ FUNC3(a, b, c, d, e); \
+ MIX(a, b, c, d, e, 0x8F1BBCDC)
+
+#define ROUND4(a, b, c, d, e, index) \
+ SHUFFLE(index); \
+ FUNC4(a, b, c, d, e); \
+ MIX(a, b, c, d, e, 0xCA62C1D6)
+
+TEXT ·block(SB),NOSPLIT,$64-32
+ MOVL dig+0(FP), R14
+ MOVL p_base+4(FP), SI
+ MOVL p_len+8(FP), DX
+ SHRQ $6, DX
+ SHLQ $6, DX
+
+ LEAQ (SI)(DX*1), DI
+ MOVL (0*4)(R14), AX
+ MOVL (1*4)(R14), BX
+ MOVL (2*4)(R14), CX
+ MOVL (3*4)(R14), DX
+ MOVL (4*4)(R14), R13
+
+ CMPQ SI, DI
+ JEQ end
+
+loop:
+#define BP R13 /* keep diff from sha1block_amd64.s small */
+ ROUND1(AX, BX, CX, DX, BP, 0)
+ ROUND1(BP, AX, BX, CX, DX, 1)
+ ROUND1(DX, BP, AX, BX, CX, 2)
+ ROUND1(CX, DX, BP, AX, BX, 3)
+ ROUND1(BX, CX, DX, BP, AX, 4)
+ ROUND1(AX, BX, CX, DX, BP, 5)
+ ROUND1(BP, AX, BX, CX, DX, 6)
+ ROUND1(DX, BP, AX, BX, CX, 7)
+ ROUND1(CX, DX, BP, AX, BX, 8)
+ ROUND1(BX, CX, DX, BP, AX, 9)
+ ROUND1(AX, BX, CX, DX, BP, 10)
+ ROUND1(BP, AX, BX, CX, DX, 11)
+ ROUND1(DX, BP, AX, BX, CX, 12)
+ ROUND1(CX, DX, BP, AX, BX, 13)
+ ROUND1(BX, CX, DX, BP, AX, 14)
+ ROUND1(AX, BX, CX, DX, BP, 15)
+
+ ROUND1x(BP, AX, BX, CX, DX, 16)
+ ROUND1x(DX, BP, AX, BX, CX, 17)
+ ROUND1x(CX, DX, BP, AX, BX, 18)
+ ROUND1x(BX, CX, DX, BP, AX, 19)
+
+ ROUND2(AX, BX, CX, DX, BP, 20)
+ ROUND2(BP, AX, BX, CX, DX, 21)
+ ROUND2(DX, BP, AX, BX, CX, 22)
+ ROUND2(CX, DX, BP, AX, BX, 23)
+ ROUND2(BX, CX, DX, BP, AX, 24)
+ ROUND2(AX, BX, CX, DX, BP, 25)
+ ROUND2(BP, AX, BX, CX, DX, 26)
+ ROUND2(DX, BP, AX, BX, CX, 27)
+ ROUND2(CX, DX, BP, AX, BX, 28)
+ ROUND2(BX, CX, DX, BP, AX, 29)
+ ROUND2(AX, BX, CX, DX, BP, 30)
+ ROUND2(BP, AX, BX, CX, DX, 31)
+ ROUND2(DX, BP, AX, BX, CX, 32)
+ ROUND2(CX, DX, BP, AX, BX, 33)
+ ROUND2(BX, CX, DX, BP, AX, 34)
+ ROUND2(AX, BX, CX, DX, BP, 35)
+ ROUND2(BP, AX, BX, CX, DX, 36)
+ ROUND2(DX, BP, AX, BX, CX, 37)
+ ROUND2(CX, DX, BP, AX, BX, 38)
+ ROUND2(BX, CX, DX, BP, AX, 39)
+
+ ROUND3(AX, BX, CX, DX, BP, 40)
+ ROUND3(BP, AX, BX, CX, DX, 41)
+ ROUND3(DX, BP, AX, BX, CX, 42)
+ ROUND3(CX, DX, BP, AX, BX, 43)
+ ROUND3(BX, CX, DX, BP, AX, 44)
+ ROUND3(AX, BX, CX, DX, BP, 45)
+ ROUND3(BP, AX, BX, CX, DX, 46)
+ ROUND3(DX, BP, AX, BX, CX, 47)
+ ROUND3(CX, DX, BP, AX, BX, 48)
+ ROUND3(BX, CX, DX, BP, AX, 49)
+ ROUND3(AX, BX, CX, DX, BP, 50)
+ ROUND3(BP, AX, BX, CX, DX, 51)
+ ROUND3(DX, BP, AX, BX, CX, 52)
+ ROUND3(CX, DX, BP, AX, BX, 53)
+ ROUND3(BX, CX, DX, BP, AX, 54)
+ ROUND3(AX, BX, CX, DX, BP, 55)
+ ROUND3(BP, AX, BX, CX, DX, 56)
+ ROUND3(DX, BP, AX, BX, CX, 57)
+ ROUND3(CX, DX, BP, AX, BX, 58)
+ ROUND3(BX, CX, DX, BP, AX, 59)
+
+ ROUND4(AX, BX, CX, DX, BP, 60)
+ ROUND4(BP, AX, BX, CX, DX, 61)
+ ROUND4(DX, BP, AX, BX, CX, 62)
+ ROUND4(CX, DX, BP, AX, BX, 63)
+ ROUND4(BX, CX, DX, BP, AX, 64)
+ ROUND4(AX, BX, CX, DX, BP, 65)
+ ROUND4(BP, AX, BX, CX, DX, 66)
+ ROUND4(DX, BP, AX, BX, CX, 67)
+ ROUND4(CX, DX, BP, AX, BX, 68)
+ ROUND4(BX, CX, DX, BP, AX, 69)
+ ROUND4(AX, BX, CX, DX, BP, 70)
+ ROUND4(BP, AX, BX, CX, DX, 71)
+ ROUND4(DX, BP, AX, BX, CX, 72)
+ ROUND4(CX, DX, BP, AX, BX, 73)
+ ROUND4(BX, CX, DX, BP, AX, 74)
+ ROUND4(AX, BX, CX, DX, BP, 75)
+ ROUND4(BP, AX, BX, CX, DX, 76)
+ ROUND4(DX, BP, AX, BX, CX, 77)
+ ROUND4(CX, DX, BP, AX, BX, 78)
+ ROUND4(BX, CX, DX, BP, AX, 79)
+#undef BP
+
+ ADDL (0*4)(R14), AX
+ ADDL (1*4)(R14), BX
+ ADDL (2*4)(R14), CX
+ ADDL (3*4)(R14), DX
+ ADDL (4*4)(R14), R13
+
+ MOVL AX, (0*4)(R14)
+ MOVL BX, (1*4)(R14)
+ MOVL CX, (2*4)(R14)
+ MOVL DX, (3*4)(R14)
+ MOVL R13, (4*4)(R14)
+
+ ADDQ $64, SI
+ CMPQ SI, DI
+ JB loop
+
+end:
+ RET
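
Concretely, FUNC1 above computes the SHA-1 Ch function, FUNC2 and FUNC4 compute parity, FUNC3 computes Maj, and MIX folds the result into e and rotates b, with the five-register rotation done by permuting the macro arguments between calls. A minimal pure-Go sketch of one such round follows; it is not the sha1block.go in this tree, and the initial state words and zero message word are only illustrative.

// sha1round.go: a minimal pure-Go sketch of what one FUNCn+MIX pair above
// computes for a single SHA-1 round: e += rol5(a) + f(b,c,d) + w + k, then
// b = rol30(b), with the register rotation done by rotating the arguments.
package main

import (
	"fmt"
	"math/bits"
)

func ch(b, c, d uint32) uint32     { return d ^ (b & (c ^ d)) }       // FUNC1
func parity(b, c, d uint32) uint32 { return b ^ c ^ d }               // FUNC2, FUNC4
func maj(b, c, d uint32) uint32    { return (b & c) | ((b | c) & d) } // FUNC3

// round applies one SHA-1 round and returns the rotated (a, b, c, d, e).
func round(f func(b, c, d uint32) uint32, k, a, b, c, d, e, w uint32) (uint32, uint32, uint32, uint32, uint32) {
	e += bits.RotateLeft32(a, 5) + f(b, c, d) + w + k
	b = bits.RotateLeft32(b, 30)
	return e, a, b, c, d // argument rotation, as in the ROUNDn macro calls
}

func main() {
	// SHA-1 initial state and a zero message word, using the round-1
	// constant 0x5A827999 from the ROUND1 macro above.
	a, b, c, d, e := uint32(0x67452301), uint32(0xEFCDAB89), uint32(0x98BADCFE), uint32(0x10325476), uint32(0xC3D2E1F0)
	a, b, c, d, e = round(ch, 0x5A827999, a, b, c, d, e, 0)
	fmt.Printf("%08x %08x %08x %08x %08x\n", a, b, c, d, e)
}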
diff --git a/src/pkg/crypto/sha1/sha1block_decl.go b/src/pkg/crypto/sha1/sha1block_decl.go
index b2c68f0e8..2331deb3a 100644
--- a/src/pkg/crypto/sha1/sha1block_decl.go
+++ b/src/pkg/crypto/sha1/sha1block_decl.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build amd64 386 arm
+// +build amd64 amd64p32 386
package sha1
diff --git a/src/pkg/crypto/x509/root_unix.go b/src/pkg/crypto/x509/root_unix.go
index a5bd19e82..11ad3c440 100644
--- a/src/pkg/crypto/x509/root_unix.go
+++ b/src/pkg/crypto/x509/root_unix.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build dragonfly freebsd linux openbsd netbsd solaris
+// +build dragonfly freebsd linux nacl netbsd openbsd solaris
package x509