author     Jussi Kivilinna <jussi.kivilinna@iki.fi>  2013-12-13 21:07:41 +0200
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>  2013-12-13 22:40:52 +0200
commit     d2b853246c2ed056a92096d89c3ca057e45c9c92 (patch)
tree       c3267dcc3e2f5b8ba58721c7d76a5a9e98feee25
parent     be2238f68abcc6f2b4e8c38ad9141376ce622a22 (diff)
download   libgcrypt-d2b853246c2ed056a92096d89c3ca057e45c9c92.tar.gz
Convert SHA-1 SSSE3 implementation from mixed asm&C to pure asm
* cipher/Makefile.am: Change 'sha1-ssse3-amd64.c' to 'sha1-ssse3-amd64.S'.
* cipher/sha1-ssse3-amd64.c: Remove.
* cipher/sha1-ssse3-amd64.S: New.
--
The mixed C&asm implementation appears to trigger GCC bugs easily.
Therefore convert the SSSE3 implementation to pure assembly for safety.

Benchmarks also show a small speed improvement:

cpu              C&asm       asm
Intel i5-4570    5.22 c/B    5.09 c/B
Intel i5-2450M   7.24 c/B    7.00 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
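
For reference, the R_F1..R_F4 round macros in the new assembly file implement the same boolean functions and round update as the removed C code; the scalar C sketch below mirrors those definitions (the `rol` helper and `ROUND` name are illustrative, not libgcrypt API):

/* Scalar C sketch of the round update that the R_F1..R_F4 assembly
 * macros implement (definitions mirror the removed C file). */
#include <stdint.h>

static inline uint32_t rol (uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

#define F1(b,c,d)  ((d) ^ ((b) & ((c) ^ (d))))          /* rounds  0-19, K1 = 0x5A827999 */
#define F2(b,c,d)  ((b) ^ (c) ^ (d))                    /* rounds 20-39, K2 = 0x6ED9EBA1 */
#define F3(b,c,d)  (((b) & (c)) | ((d) & ((b) | (c))))  /* rounds 40-59, K3 = 0x8F1BBCDC */
#define F4(b,c,d)  ((b) ^ (c) ^ (d))                    /* rounds 60-79, K4 = 0xCA62C1D6 */

/* One round; `wk` is W[i] + K(i), matching the WK(i) stack slots that
 * the precalc macros fill in the assembly version. */
#define ROUND(a,b,c,d,e,f,wk) \
  do { (e) += rol ((a), 5) + f ((b), (c), (d)) + (wk); (b) = rol ((b), 30); } while (0)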
-rw-r--r--  cipher/Makefile.am         |   2
-rw-r--r--  cipher/sha1-ssse3-amd64.S  | 378
-rw-r--r--  cipher/sha1-ssse3-amd64.c  | 319
3 files changed, 379 insertions(+), 320 deletions(-)
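
The W_PRECALC_00_15_*, W_PRECALC_16_31_* and W_PRECALC_32_79_* macro groups in the new file compute the SHA-1 message schedule four words at a time with SSSE3 and fold the round constant in ahead of time; a scalar equivalent, shown for orientation only (function and array names are hypothetical), is:

/* Scalar sketch of the message expansion that the SSSE3 precalc macros
 * vectorize four words at a time, with the round constant pre-added. */
#include <stdint.h>

static inline uint32_t rol32 (uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

static void
expand_and_add_k (uint32_t w[80], const uint32_t k[4])
{
  int t;

  for (t = 16; t < 80; t++)
    w[t] = rol32 (w[t - 3] ^ w[t - 8] ^ w[t - 14] ^ w[t - 16], 1);

  /* The assembly stores W[t] + K(t) into the WK(t) stack slots, so the
   * round code only needs a single add per round. */
  for (t = 0; t < 80; t++)
    w[t] += k[t / 20];
}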
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 04777729..7d737e23 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -77,7 +77,7 @@ salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
scrypt.c \
seed.c \
serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \
-sha1.c sha1-ssse3-amd64.c \
+sha1.c sha1-ssse3-amd64.S \
sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \
sha512.c sha512-ssse3-amd64.S sha512-armv7-neon.S \
stribog.c \
diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S
new file mode 100644
index 00000000..5165f3f2
--- /dev/null
+++ b/cipher/sha1-ssse3-amd64.S
@@ -0,0 +1,378 @@
+/* sha1-ssse3-amd64.c - Intel SSSE3 accelerated SHA-1 transform function
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * Based on sha1.c:
+ * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Intel SSSE3 accelerated SHA-1 implementation based on white paper:
+ * "Improving the Performance of the Secure Hash Algorithm (SHA-1)"
+ * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
+ */
+
+#ifdef __x86_64__
+#include <config.h>
+
+#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA1)
+
+#ifdef __PIC__
+# define RIP (%rip)
+#else
+# define RIP
+#endif
+
+
+/* Context structure */
+
+#define state_h0 0
+#define state_h1 4
+#define state_h2 8
+#define state_h3 12
+#define state_h4 16
+
+
+/* Constants */
+
+.data
+#define K1 0x5A827999
+#define K2 0x6ED9EBA1
+#define K3 0x8F1BBCDC
+#define K4 0xCA62C1D6
+.align 16
+.LK_XMM:
+.LK1: .long K1, K1, K1, K1
+.LK2: .long K2, K2, K2, K2
+.LK3: .long K3, K3, K3, K3
+.LK4: .long K4, K4, K4, K4
+
+.Lbswap_shufb_ctl:
+ .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
+
+
+/* Register macros */
+
+#define RSTATE %r8
+#define RDATA %r9
+#define ROLDSTACK %r10
+
+#define a %eax
+#define b %ebx
+#define c %ecx
+#define d %edx
+#define e %edi
+
+#define RT0 %esi
+#define RT1 %ebp
+
+#define Wtmp0 %xmm0
+#define Wtmp1 %xmm1
+
+#define W0 %xmm2
+#define W1 %xmm3
+#define W2 %xmm4
+#define W3 %xmm5
+#define W4 %xmm6
+#define W5 %xmm7
+#define W6 %xmm8
+#define W7 %xmm9
+
+#define BSWAP_REG %xmm10
+
+
+/* Round function macros. */
+
+#define WK(i) (((i) & 15) * 4)(%rsp)
+
+#define R_F1(a,b,c,d,e,i) \
+ movl c, RT0; \
+ addl WK(i), e; \
+ xorl d, RT0; \
+ movl a, RT1; \
+ andl b, RT0; \
+ roll $30, b; \
+ xorl d, RT0; \
+ leal (RT0,e), e; \
+ roll $5, RT1; \
+ addl RT1, e;
+
+#define R_F2(a,b,c,d,e,i) \
+ movl c, RT0; \
+ addl WK(i), e; \
+ xorl b, RT0; \
+ roll $30, b; \
+ xorl d, RT0; \
+ movl a, RT1; \
+ leal (RT0,e), e; \
+ roll $5, RT1; \
+ addl RT1, e;
+
+#define R_F3(a,b,c,d,e,i) \
+ movl c, RT0; \
+ movl b, RT1; \
+ xorl b, RT0; \
+ andl c, RT1; \
+ andl d, RT0; \
+ addl RT1, e; \
+ addl WK(i), e; \
+ roll $30, b; \
+ movl a, RT1; \
+ leal (RT0,e), e; \
+ roll $5, RT1; \
+ addl RT1, e;
+
+#define R_F4(a,b,c,d,e,i) R_F2(a,b,c,d,e,i)
+
+#define R(a,b,c,d,e,f,i) \
+ R_##f(a,b,c,d,e,i)
+
+
+/* Input expansion macros. */
+
+#define W_PRECALC_00_15_0(i, W, tmp0) \
+ movdqu (4*(i))(RDATA), tmp0;
+
+#define W_PRECALC_00_15_1(i, W, tmp0) \
+ pshufb BSWAP_REG, tmp0; \
+ movdqa tmp0, W;
+
+#define W_PRECALC_00_15_2(i, W, tmp0) \
+ paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0;
+
+#define W_PRECALC_00_15_3(i, W, tmp0) \
+ movdqa tmp0, WK(i&~3);
+
+#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ movdqa W_m12, W; \
+ palignr $8, W_m16, W; \
+ movdqa W_m04, tmp0; \
+ psrldq $4, tmp0; \
+ pxor W_m08, W;
+
+#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ pxor W_m16, tmp0; \
+ pxor tmp0, W; \
+ movdqa W, tmp1; \
+ movdqa W, tmp0; \
+ pslldq $12, tmp1;
+
+#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ psrld $31, W; \
+ pslld $1, tmp0; \
+ por W, tmp0; \
+ movdqa tmp1, W; \
+ psrld $30, tmp1; \
+ pslld $2, W;
+
+#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ pxor W, tmp0; \
+ pxor tmp1, tmp0; \
+ movdqa tmp0, W; \
+ paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0; \
+ movdqa tmp0, WK((i)&~3);
+
+#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ movdqa W_m04, tmp0; \
+ pxor W_m28, W; \
+ palignr $8, W_m08, tmp0;
+
+#define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ pxor W_m16, W; \
+ pxor tmp0, W; \
+ movdqa W, tmp0;
+
+#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ psrld $30, W; \
+ pslld $2, tmp0; \
+ por W, tmp0;
+
+#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ movdqa tmp0, W; \
+ paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0; \
+ movdqa tmp0, WK((i)&~3);
+
+#define CLEAR_REG(reg) pxor reg, reg;
+
+
+/*
+ * Transform 64 bytes (16 32-bit words) at DATA.
+ *
+ * unsigned int
+ * _gcry_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data)
+ */
+.text
+.globl _gcry_sha1_transform_amd64_ssse3
+.type _gcry_sha1_transform_amd64_ssse3,@function
+.align 16
+_gcry_sha1_transform_amd64_ssse3:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: data (64 bytes)
+ * %rdx: ...
+ */
+
+ movq %rdi, RSTATE;
+ movq %rsi, RDATA;
+ pushq %rbx;
+ pushq %rbp;
+
+ movq %rsp, ROLDSTACK;
+
+ subq $(16*4), %rsp;
+ andq $(~31), %rsp;
+
+ /* Get the values of the chaining variables. */
+ movl state_h0(RSTATE), a;
+ movl state_h1(RSTATE), b;
+ movl state_h2(RSTATE), c;
+ movl state_h3(RSTATE), d;
+ movl state_h4(RSTATE), e;
+
+ movdqa .Lbswap_shufb_ctl RIP, BSWAP_REG;
+
+ /* Precalc 0-15. */
+ W_PRECALC_00_15_0(0, W0, Wtmp0);
+ W_PRECALC_00_15_1(1, W0, Wtmp0);
+ W_PRECALC_00_15_2(2, W0, Wtmp0);
+ W_PRECALC_00_15_3(3, W0, Wtmp0);
+ W_PRECALC_00_15_0(4, W7, Wtmp0);
+ W_PRECALC_00_15_1(5, W7, Wtmp0);
+ W_PRECALC_00_15_2(6, W7, Wtmp0);
+ W_PRECALC_00_15_3(7, W7, Wtmp0);
+ W_PRECALC_00_15_0(8, W6, Wtmp0);
+ W_PRECALC_00_15_1(9, W6, Wtmp0);
+ W_PRECALC_00_15_2(10, W6, Wtmp0);
+ W_PRECALC_00_15_3(11, W6, Wtmp0);
+ W_PRECALC_00_15_0(12, W5, Wtmp0);
+ W_PRECALC_00_15_1(13, W5, Wtmp0);
+ W_PRECALC_00_15_2(14, W5, Wtmp0);
+ W_PRECALC_00_15_3(15, W5, Wtmp0);
+
+ /* Transform 0-15 + Precalc 16-31. */
+ R( a, b, c, d, e, F1, 0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( e, a, b, c, d, F1, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( d, e, a, b, c, F1, 2 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( b, c, d, e, a, F1, 4 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F1, 5 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( e, a, b, c, d, F1, 6 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( c, d, e, a, b, F1, 8 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( b, c, d, e, a, F1, 9 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F1, 10 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( d, e, a, b, c, F1, 12 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ R( c, d, e, a, b, F1, 13 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ R( b, c, d, e, a, F1, 14 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+
+ /* Transform 16-63 + Precalc 32-79. */
+ R( e, a, b, c, d, F1, 16 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( d, e, a, b, c, F1, 17 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( c, d, e, a, b, F1, 18 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( a, b, c, d, e, F2, 20 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( e, a, b, c, d, F2, 21 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( d, e, a, b, c, F2, 22 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( b, c, d, e, a, F2, 24 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( a, b, c, d, e, F2, 25 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( e, a, b, c, d, F2, 26 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( c, d, e, a, b, F2, 28 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( b, c, d, e, a, F2, 29 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( a, b, c, d, e, F2, 30 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( d, e, a, b, c, F2, 32 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( c, d, e, a, b, F2, 33 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( b, c, d, e, a, F2, 34 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( e, a, b, c, d, F2, 36 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( d, e, a, b, c, F2, 37 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( c, d, e, a, b, F2, 38 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( a, b, c, d, e, F3, 40 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( e, a, b, c, d, F3, 41 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( d, e, a, b, c, F3, 42 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( b, c, d, e, a, F3, 44 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( a, b, c, d, e, F3, 45 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( e, a, b, c, d, F3, 46 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( c, d, e, a, b, F3, 48 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( b, c, d, e, a, F3, 49 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( a, b, c, d, e, F3, 50 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( d, e, a, b, c, F3, 52 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( c, d, e, a, b, F3, 53 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( b, c, d, e, a, F3, 54 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( a, b, c, d, e, F3, 55 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( e, a, b, c, d, F3, 56 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( d, e, a, b, c, F3, 57 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( c, d, e, a, b, F3, 58 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( b, c, d, e, a, F3, 59 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( a, b, c, d, e, F4, 60 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( e, a, b, c, d, F4, 61 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+
+ /* Transform 64-79 + Clear XMM registers. */
+ R( b, c, d, e, a, F4, 64 ); CLEAR_REG(BSWAP_REG);
+ R( a, b, c, d, e, F4, 65 ); CLEAR_REG(Wtmp0);
+ R( e, a, b, c, d, F4, 66 ); CLEAR_REG(Wtmp1);
+ R( d, e, a, b, c, F4, 67 ); CLEAR_REG(W0);
+ R( c, d, e, a, b, F4, 68 ); CLEAR_REG(W1);
+ R( b, c, d, e, a, F4, 69 ); CLEAR_REG(W2);
+ R( a, b, c, d, e, F4, 70 ); CLEAR_REG(W3);
+ R( e, a, b, c, d, F4, 71 ); CLEAR_REG(W4);
+ R( d, e, a, b, c, F4, 72 ); CLEAR_REG(W5);
+ R( c, d, e, a, b, F4, 73 ); CLEAR_REG(W6);
+ R( b, c, d, e, a, F4, 74 ); CLEAR_REG(W7);
+ R( a, b, c, d, e, F4, 75 );
+ R( e, a, b, c, d, F4, 76 );
+ R( d, e, a, b, c, F4, 77 );
+ R( c, d, e, a, b, F4, 78 );
+ R( b, c, d, e, a, F4, 79 );
+
+ /* Update the chaining variables. */
+ addl state_h0(RSTATE), a;
+ addl state_h1(RSTATE), b;
+ addl state_h2(RSTATE), c;
+ addl state_h3(RSTATE), d;
+ addl state_h4(RSTATE), e;
+
+ movl a, state_h0(RSTATE);
+ movl b, state_h1(RSTATE);
+ movl c, state_h2(RSTATE);
+ movl d, state_h3(RSTATE);
+ movl e, state_h4(RSTATE);
+
+ movq ROLDSTACK, %rsp;
+
+ popq %rbp;
+ popq %rbx;
+
+ /* burn_stack */
+ movl $(16*4 + 2*8 + 31), %eax;
+
+ ret;
+
+#endif
+#endif
diff --git a/cipher/sha1-ssse3-amd64.c b/cipher/sha1-ssse3-amd64.c
deleted file mode 100644
index 13422352..00000000
--- a/cipher/sha1-ssse3-amd64.c
+++ /dev/null
@@ -1,319 +0,0 @@
-/* sha1-ssse3-amd64.c - Intel SSSE3 accelerated SHA-1 transform function
- * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * Based on sha1.c:
- * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
- *
- * This file is part of Libgcrypt.
- *
- * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as
- * published by the Free Software Foundation; either version 2.1 of
- * the License, or (at your option) any later version.
- *
- * Libgcrypt is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * Intel SSSE3 accelerated SHA-1 implementation based on white paper:
- * "Improving the Performance of the Secure Hash Algorithm (SHA-1)"
- * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
- */
-
-#ifdef __x86_64__
-#include <config.h>
-
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
- defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
- defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA1)
-
-#ifdef HAVE_STDINT_H
-# include <stdint.h> /* uintptr_t */
-#elif defined(HAVE_INTTYPES_H)
-# include <inttypes.h>
-#else
-/* In this case, uintptr_t is provided by config.h. */
-#endif
-
-#include "bithelp.h"
-
-
-/* Helper macro to force alignment to 16 bytes. */
-#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
-# define ATTR_ALIGNED_16 __attribute__ ((aligned (16)))
-#else
-# define ATTR_ALIGNED_16
-#endif
-
-
-typedef struct
-{
- u32 h0,h1,h2,h3,h4;
-} SHA1_STATE;
-
-
-/* Round function macros. */
-#define K1 0x5A827999L
-#define K2 0x6ED9EBA1L
-#define K3 0x8F1BBCDCL
-#define K4 0xCA62C1D6L
-#define F1(x,y,z) ( z ^ ( x & ( y ^ z ) ) )
-#define F2(x,y,z) ( x ^ y ^ z )
-#define F3(x,y,z) ( ( x & y ) | ( z & ( x | y ) ) )
-#define F4(x,y,z) ( x ^ y ^ z )
-#define R(a,b,c,d,e,f,wk) do { e += rol( a, 5 ) \
- + f( b, c, d ) \
- + wk; \
- b = rol( b, 30 ); \
- } while(0)
-
-#define WK(i) (wk[i & 15])
-
-
-static const u32 K_XMM[4][4] ATTR_ALIGNED_16 =
- {
- { K1, K1, K1, K1 },
- { K2, K2, K2, K2 },
- { K3, K3, K3, K3 },
- { K4, K4, K4, K4 },
- };
-static const u32 bswap_shufb_ctl[4] ATTR_ALIGNED_16 =
- { 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f };
-
-
-/*
- * Transform 64 bytes (16 32-bit words) at DATA.
- */
-unsigned int
-_gcry_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data)
-{
- SHA1_STATE *state = ctx;
- register u32 a, b, c, d, e; /* Local copies of the chaining variables. */
- byte wk_unaligned[4*16+15]; /* The array we work on. */
- u32 *wk = (u32 *)(wk_unaligned
- + ((16 - ((uintptr_t)wk_unaligned & 15)) & 15));
-
- /* Get the values of the chaining variables. */
- a = state->h0;
- b = state->h1;
- c = state->h2;
- d = state->h3;
- e = state->h4;
-
-#define Wtmp0 "xmm0"
-#define Wtmp1 "xmm1"
-
-#define W0 "xmm2"
-#define W1 "xmm3"
-#define W2 "xmm4"
-#define W3 "xmm5"
-#define W4 "xmm6"
-#define W5 "xmm7"
-#define W6 "xmm8"
-#define W7 "xmm9"
-
-#define BSWAP_REG "xmm10"
-
- __asm__ volatile ("movdqa %[bswap], %%"BSWAP_REG";\n\t"
- :: [bswap] "m" (bswap_shufb_ctl[0]));
-
-#define W_PRECALC_00_15_0(i, W, tmp0) \
- __asm__ volatile ("movdqu %[data], %%"tmp0";\n\t" \
- ::[data] "m" (*(data+4*(i))));
-
-#define W_PRECALC_00_15_1(i, W, tmp0) \
- __asm__ volatile ("pshufb %%"BSWAP_REG", %%"tmp0";\n\t" \
- "movdqa %%"tmp0", %%"W";\n\t" \
- ::: "cc");
-
-#define W_PRECALC_00_15_2(i, W, tmp0) \
- __asm__ volatile ("paddd %[k_xmm], %%"tmp0";\n\t" \
- ::[k_xmm] "m" (K_XMM[i / 20][0]));
-
-#define W_PRECALC_00_15_3(i, W, tmp0) \
- __asm__ volatile ("movdqa %%"tmp0", %[wk];\n\t" \
- :[wk] "=m" (WK(i&~3)));
-
- /* Precalc 0-15. */
- W_PRECALC_00_15_0(0, W0, Wtmp0);
- W_PRECALC_00_15_1(1, W0, Wtmp0);
- W_PRECALC_00_15_2(2, W0, Wtmp0);
- W_PRECALC_00_15_3(3, W0, Wtmp0);
- W_PRECALC_00_15_0(4, W7, Wtmp0);
- W_PRECALC_00_15_1(5, W7, Wtmp0);
- W_PRECALC_00_15_2(6, W7, Wtmp0);
- W_PRECALC_00_15_3(7, W7, Wtmp0);
- W_PRECALC_00_15_0(8, W6, Wtmp0);
- W_PRECALC_00_15_1(9, W6, Wtmp0);
- W_PRECALC_00_15_2(10, W6, Wtmp0);
- W_PRECALC_00_15_3(11, W6, Wtmp0);
- W_PRECALC_00_15_0(12, W5, Wtmp0);
- W_PRECALC_00_15_1(13, W5, Wtmp0);
- W_PRECALC_00_15_2(14, W5, Wtmp0);
- W_PRECALC_00_15_3(15, W5, Wtmp0);
-
-#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
- __asm__ volatile ("movdqa %%"W_m12", %%"W";\n\t" \
- "palignr $8, %%"W_m16", %%"W";\n\t" \
- "movdqa %%"W_m04", %%"tmp0";\n\t" \
- "psrldq $4, %%"tmp0";\n\t" \
- "pxor %%"W_m08", %%"W";\n\t" \
- :::"cc");
-
-#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
- __asm__ volatile ("pxor %%"W_m16", %%"tmp0";\n\t" \
- "pxor %%"tmp0", %%"W";\n\t" \
- "movdqa %%"W", %%"tmp1";\n\t" \
- "movdqa %%"W", %%"tmp0";\n\t" \
- "pslldq $12, %%"tmp1";\n\t" \
- :::"cc");
-
-#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
- __asm__ volatile ("psrld $31, %%"W";\n\t" \
- "pslld $1, %%"tmp0";\n\t" \
- "por %%"W", %%"tmp0";\n\t" \
- "movdqa %%"tmp1", %%"W";\n\t" \
- "psrld $30, %%"tmp1";\n\t" \
- "pslld $2, %%"W";\n\t" \
- :::"cc");
-
-#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
- __asm__ volatile ("pxor %%"W", %%"tmp0";\n\t" \
- "pxor %%"tmp1", %%"tmp0";\n\t" \
- "movdqa %%"tmp0", %%"W";\n\t" \
- "paddd %[k_xmm], %%"tmp0";\n\t" \
- "movdqa %%"tmp0", %[wk];\n\t" \
- : [wk] "=m" (WK(i&~3)) \
- : [k_xmm] "m" (K_XMM[i / 20][0]));
-
- /* Transform 0-15 + Precalc 16-31. */
- R( a, b, c, d, e, F1, WK( 0) ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
- R( e, a, b, c, d, F1, WK( 1) ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
- R( d, e, a, b, c, F1, WK( 2) ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
- R( c, d, e, a, b, F1, WK( 3) ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
- R( b, c, d, e, a, F1, WK( 4) ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
- R( a, b, c, d, e, F1, WK( 5) ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
- R( e, a, b, c, d, F1, WK( 6) ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
- R( d, e, a, b, c, F1, WK( 7) ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
- R( c, d, e, a, b, F1, WK( 8) ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
- R( b, c, d, e, a, F1, WK( 9) ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
- R( a, b, c, d, e, F1, WK(10) ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
- R( e, a, b, c, d, F1, WK(11) ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
- R( d, e, a, b, c, F1, WK(12) ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
- R( c, d, e, a, b, F1, WK(13) ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
- R( b, c, d, e, a, F1, WK(14) ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
- R( a, b, c, d, e, F1, WK(15) ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
-
-#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
- __asm__ volatile ("movdqa %%"W_m04", %%"tmp0";\n\t" \
- "pxor %%"W_m28", %%"W";\n\t" \
- "palignr $8, %%"W_m08", %%"tmp0";\n\t" \
- :::"cc");
-
-#define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
- __asm__ volatile ("pxor %%"W_m16", %%"W";\n\t" \
- "pxor %%"tmp0", %%"W";\n\t" \
- "movdqa %%"W", %%"tmp0";\n\t" \
- :::"cc");
-
-#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
- __asm__ volatile ("psrld $30, %%"W";\n\t" \
- "pslld $2, %%"tmp0";\n\t" \
- "por %%"W", %%"tmp0";\n\t" \
- :::"cc");
-
-#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
- __asm__ volatile ("movdqa %%"tmp0", %%"W";\n\t" \
- "paddd %[k_xmm], %%"tmp0";\n\t" \
- "movdqa %%"tmp0", %[wk];\n\t" \
- : [wk] "=m" (WK(i&~3)) \
- : [k_xmm] "m" (K_XMM[i / 20][0]));
-
- /* Transform 16-63 + Precalc 32-79. */
- R( e, a, b, c, d, F1, WK(16) ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
- R( d, e, a, b, c, F1, WK(17) ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
- R( c, d, e, a, b, F1, WK(18) ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
- R( b, c, d, e, a, F1, WK(19) ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
- R( a, b, c, d, e, F2, WK(20) ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
- R( e, a, b, c, d, F2, WK(21) ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
- R( d, e, a, b, c, F2, WK(22) ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
- R( c, d, e, a, b, F2, WK(23) ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
- R( b, c, d, e, a, F2, WK(24) ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
- R( a, b, c, d, e, F2, WK(25) ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
- R( e, a, b, c, d, F2, WK(26) ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
- R( d, e, a, b, c, F2, WK(27) ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
- R( c, d, e, a, b, F2, WK(28) ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
- R( b, c, d, e, a, F2, WK(29) ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
- R( a, b, c, d, e, F2, WK(30) ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
- R( e, a, b, c, d, F2, WK(31) ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
- R( d, e, a, b, c, F2, WK(32) ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
- R( c, d, e, a, b, F2, WK(33) ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
- R( b, c, d, e, a, F2, WK(34) ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
- R( a, b, c, d, e, F2, WK(35) ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
- R( e, a, b, c, d, F2, WK(36) ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
- R( d, e, a, b, c, F2, WK(37) ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
- R( c, d, e, a, b, F2, WK(38) ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
- R( b, c, d, e, a, F2, WK(39) ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
- R( a, b, c, d, e, F3, WK(40) ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
- R( e, a, b, c, d, F3, WK(41) ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
- R( d, e, a, b, c, F3, WK(42) ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
- R( c, d, e, a, b, F3, WK(43) ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
- R( b, c, d, e, a, F3, WK(44) ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
- R( a, b, c, d, e, F3, WK(45) ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
- R( e, a, b, c, d, F3, WK(46) ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
- R( d, e, a, b, c, F3, WK(47) ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
- R( c, d, e, a, b, F3, WK(48) ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
- R( b, c, d, e, a, F3, WK(49) ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
- R( a, b, c, d, e, F3, WK(50) ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
- R( e, a, b, c, d, F3, WK(51) ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
- R( d, e, a, b, c, F3, WK(52) ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
- R( c, d, e, a, b, F3, WK(53) ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
- R( b, c, d, e, a, F3, WK(54) ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
- R( a, b, c, d, e, F3, WK(55) ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
- R( e, a, b, c, d, F3, WK(56) ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
- R( d, e, a, b, c, F3, WK(57) ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
- R( c, d, e, a, b, F3, WK(58) ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
- R( b, c, d, e, a, F3, WK(59) ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
- R( a, b, c, d, e, F4, WK(60) ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
- R( e, a, b, c, d, F4, WK(61) ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
- R( d, e, a, b, c, F4, WK(62) ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
- R( c, d, e, a, b, F4, WK(63) ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
-
-#define CLEAR_REG(reg) __asm__ volatile ("pxor %%"reg", %%"reg";\n\t":::"cc");
-
- /* Transform 64-79 + Clear XMM registers. */
- R( b, c, d, e, a, F4, WK(64) ); CLEAR_REG(BSWAP_REG);
- R( a, b, c, d, e, F4, WK(65) ); CLEAR_REG(Wtmp0);
- R( e, a, b, c, d, F4, WK(66) ); CLEAR_REG(Wtmp1);
- R( d, e, a, b, c, F4, WK(67) ); CLEAR_REG(W0);
- R( c, d, e, a, b, F4, WK(68) ); CLEAR_REG(W1);
- R( b, c, d, e, a, F4, WK(69) ); CLEAR_REG(W2);
- R( a, b, c, d, e, F4, WK(70) ); CLEAR_REG(W3);
- R( e, a, b, c, d, F4, WK(71) ); CLEAR_REG(W4);
- R( d, e, a, b, c, F4, WK(72) ); CLEAR_REG(W5);
- R( c, d, e, a, b, F4, WK(73) ); CLEAR_REG(W6);
- R( b, c, d, e, a, F4, WK(74) ); CLEAR_REG(W7);
- R( a, b, c, d, e, F4, WK(75) );
- R( e, a, b, c, d, F4, WK(76) );
- R( d, e, a, b, c, F4, WK(77) );
- R( c, d, e, a, b, F4, WK(78) );
- R( b, c, d, e, a, F4, WK(79) );
-
- /* Update the chaining variables. */
- state->h0 += a;
- state->h1 += b;
- state->h2 += c;
- state->h3 += d;
- state->h4 += e;
-
- return /* burn_stack */ 84+15;
-}
-
-#endif
-#endif
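
For completeness, a hypothetical standalone caller of the new routine could look as follows; the prototype and the burn_stack return value are taken from the patch above, while the context struct name and wrapper function are illustrative and not the actual sha1.c glue:

/* Hypothetical caller sketch for the pure-asm transform (not the real
 * sha1.c integration).  The context layout matches the state_h0..state_h4
 * offsets used by the assembly: five consecutive 32-bit words. */
#include <stdint.h>

typedef struct { uint32_t h0, h1, h2, h3, h4; } sha1_ctx_t;

/* Returns the number of stack bytes the caller should wipe. */
extern unsigned int _gcry_sha1_transform_amd64_ssse3 (void *ctx,
                                                      const unsigned char *data);

static void
hash_one_block (sha1_ctx_t *ctx, const unsigned char block[64])
{
  unsigned int burn = _gcry_sha1_transform_amd64_ssse3 (ctx, block);
  (void)burn;  /* libgcrypt feeds this depth to its stack-burning helper. */
}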