author    Jussi Kivilinna <jussi.kivilinna@iki.fi>    2013-12-13 21:07:41 +0200
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>    2013-12-13 22:40:52 +0200
commit    d2b853246c2ed056a92096d89c3ca057e45c9c92 (patch)
tree      c3267dcc3e2f5b8ba58721c7d76a5a9e98feee25
parent    be2238f68abcc6f2b4e8c38ad9141376ce622a22 (diff)
download  libgcrypt-d2b853246c2ed056a92096d89c3ca057e45c9c92.tar.gz
Convert SHA-1 SSSE3 implementation from mixed asm&C to pure asm
* cipher/Makefile.am: Change 'sha1-ssse3-amd64.c' to
'sha1-ssse3-amd64.S'.
* cipher/sha1-ssse3-amd64.c: Remove.
* cipher/sha1-ssse3-amd64.S: New.
--
The mixed C&asm implementation appears to trigger GCC bugs easily, so
convert the SSSE3 implementation to pure assembly for safety.
Benchmarks also show a small (2-3%) speed improvement:
cpu              C&asm       asm
Intel i5-4570    5.22 c/B    5.09 c/B
Intel i5-2450M   7.24 c/B    7.00 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
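
[Editorial note: both the removed mixed-C/asm file and the new pure-asm file
compute the same standard SHA-1 round; the asm R_F1/R_F2/R_F3 macros in the
diff below mirror the C macros F1-F4 and R from the removed file. A minimal
scalar sketch of that shared round logic, for reference only (not code from
this commit; in libgcrypt the rotate comes from bithelp.h):

    #include <stdint.h>

    /* 32-bit left-rotate, as provided by libgcrypt's bithelp.h. */
    static inline uint32_t rol (uint32_t x, int n)
    { return (x << n) | (x >> (32 - n)); }

    /* Selection functions and constants, as in the removed file.  */
    #define F1(x,y,z)  ( z ^ ( x & ( y ^ z ) ) )          /* rounds  0-19, K1 */
    #define F2(x,y,z)  ( x ^ y ^ z )                      /* rounds 20-39, K2 */
    #define F3(x,y,z)  ( ( x & y ) | ( z & ( x | y ) ) )  /* rounds 40-59, K3 */
    #define F4(x,y,z)  ( x ^ y ^ z )                      /* rounds 60-79, K4 */

    /* One SHA-1 round.  'wk' is W[i] with the round constant K already
       folded in, exactly what the precomputed WK(i) slots hold in both
       implementations.  */
    #define R(a,b,c,d,e,f,wk)  do { e += rol (a, 5) + f (b, c, d) + (wk); \
                                    b = rol (b, 30); } while (0)
]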
-rw-r--r--    cipher/Makefile.am           2
-rw-r--r--    cipher/sha1-ssse3-amd64.S    378
-rw-r--r--    cipher/sha1-ssse3-amd64.c    319
3 files changed, 379 insertions, 320 deletions
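
[Editorial note: the W_PRECALC macros in both versions vectorize SHA-1's
message schedule four words at a time, following the Intel white paper cited
in the file header. A scalar reference sketch of the recurrence they compute,
with the 16-entry ring buffer both versions use for the WK slots (illustration
only, not code from this commit):

    #include <stdint.h>

    static inline uint32_t rol (uint32_t x, int n)
    { return (x << n) | (x >> (32 - n)); }

    /* Scalar form of the schedule the XMM code computes 4 words at a time.
       w[] is the 16-word ring buffer; K is the round constant for round i.
       All four taps are read before w[i & 15] is overwritten.  */
    static uint32_t schedule (uint32_t w[16], int i, uint32_t K)
    {
      uint32_t x = w[(i - 3) & 15] ^ w[(i - 8) & 15]
                 ^ w[(i - 14) & 15] ^ w[(i - 16) & 15];
      w[i & 15] = rol (x, 1);
      return w[i & 15] + K;   /* what the precomputed WK(i) slot holds */
    }
]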
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 04777729..7d737e23 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -77,7 +77,7 @@ salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
 scrypt.c \
 seed.c \
 serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \
-sha1.c sha1-ssse3-amd64.c \
+sha1.c sha1-ssse3-amd64.S \
 sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \
 sha512.c sha512-ssse3-amd64.S sha512-armv7-neon.S \
 stribog.c \
diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S
new file mode 100644
index 00000000..5165f3f2
--- /dev/null
+++ b/cipher/sha1-ssse3-amd64.S
@@ -0,0 +1,378 @@
+/* sha1-ssse3-amd64.c - Intel SSSE3 accelerated SHA-1 transform function
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * Based on sha1.c:
+ *  Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Intel SSSE3 accelerated SHA-1 implementation based on white paper:
+ *  "Improving the Performance of the Secure Hash Algorithm (SHA-1)"
+ *  http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
+ */
+
+#ifdef __x86_64__
+#include <config.h>
+
+#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+    defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA1)
+
+#ifdef __PIC__
+# define RIP (%rip)
+#else
+# define RIP
+#endif
+
+
+/* Context structure */
+
+#define state_h0 0
+#define state_h1 4
+#define state_h2 8
+#define state_h3 12
+#define state_h4 16
+
+
+/* Constants */
+
+.data
+#define K1  0x5A827999
+#define K2  0x6ED9EBA1
+#define K3  0x8F1BBCDC
+#define K4  0xCA62C1D6
+.align 16
+.LK_XMM:
+.LK1:   .long K1, K1, K1, K1
+.LK2:   .long K2, K2, K2, K2
+.LK3:   .long K3, K3, K3, K3
+.LK4:   .long K4, K4, K4, K4
+
+.Lbswap_shufb_ctl:
+        .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
+
+
+/* Register macros */
+
+#define RSTATE %r8
+#define RDATA %r9
+#define ROLDSTACK %r10
+
+#define a %eax
+#define b %ebx
+#define c %ecx
+#define d %edx
+#define e %edi
+
+#define RT0 %esi
+#define RT1 %ebp
+
+#define Wtmp0 %xmm0
+#define Wtmp1 %xmm1
+
+#define W0 %xmm2
+#define W1 %xmm3
+#define W2 %xmm4
+#define W3 %xmm5
+#define W4 %xmm6
+#define W5 %xmm7
+#define W6 %xmm8
+#define W7 %xmm9
+
+#define BSWAP_REG %xmm10
+
+
+/* Round function macros. */
+
+#define WK(i) (((i) & 15) * 4)(%rsp)
+
+#define R_F1(a,b,c,d,e,i) \
+        movl c, RT0; \
+        addl WK(i), e; \
+        xorl d, RT0; \
+        movl a, RT1; \
+        andl b, RT0; \
+        roll $30, b; \
+        xorl d, RT0; \
+        leal (RT0,e), e; \
+        roll $5, RT1; \
+        addl RT1, e;
+
+#define R_F2(a,b,c,d,e,i) \
+        movl c, RT0; \
+        addl WK(i), e; \
+        xorl b, RT0; \
+        roll $30, b; \
+        xorl d, RT0; \
+        movl a, RT1; \
+        leal (RT0,e), e; \
+        roll $5, RT1; \
+        addl RT1, e;
+
+#define R_F3(a,b,c,d,e,i) \
+        movl c, RT0; \
+        movl b, RT1; \
+        xorl b, RT0; \
+        andl c, RT1; \
+        andl d, RT0; \
+        addl RT1, e; \
+        addl WK(i), e; \
+        roll $30, b; \
+        movl a, RT1; \
+        leal (RT0,e), e; \
+        roll $5, RT1; \
+        addl RT1, e;
+
+#define R_F4(a,b,c,d,e,i) R_F2(a,b,c,d,e,i)
+
+#define R(a,b,c,d,e,f,i) \
+        R_##f(a,b,c,d,e,i)
+
+
+/* Input expansion macros. */
+
+#define W_PRECALC_00_15_0(i, W, tmp0) \
+        movdqu (4*(i))(RDATA), tmp0;
+
+#define W_PRECALC_00_15_1(i, W, tmp0) \
+        pshufb BSWAP_REG, tmp0; \
+        movdqa tmp0, W;
+
+#define W_PRECALC_00_15_2(i, W, tmp0) \
+        paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0;
+
+#define W_PRECALC_00_15_3(i, W, tmp0) \
+        movdqa tmp0, WK(i&~3);
+
+#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+        movdqa W_m12, W; \
+        palignr $8, W_m16, W; \
+        movdqa W_m04, tmp0; \
+        psrldq $4, tmp0; \
+        pxor W_m08, W;
+
+#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+        pxor W_m16, tmp0; \
+        pxor tmp0, W; \
+        movdqa W, tmp1; \
+        movdqa W, tmp0; \
+        pslldq $12, tmp1;
+
+#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+        psrld $31, W; \
+        pslld $1, tmp0; \
+        por W, tmp0; \
+        movdqa tmp1, W; \
+        psrld $30, tmp1; \
+        pslld $2, W;
+
+#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+        pxor W, tmp0; \
+        pxor tmp1, tmp0; \
+        movdqa tmp0, W; \
+        paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0; \
+        movdqa tmp0, WK((i)&~3);
+
+#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+        movdqa W_m04, tmp0; \
+        pxor W_m28, W; \
+        palignr $8, W_m08, tmp0;
+
+#define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+        pxor W_m16, W; \
+        pxor tmp0, W; \
+        movdqa W, tmp0;
+
+#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+        psrld $30, W; \
+        pslld $2, tmp0; \
+        por W, tmp0;
+
+#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+        movdqa tmp0, W; \
+        paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0; \
+        movdqa tmp0, WK((i)&~3);
+
+#define CLEAR_REG(reg) pxor reg, reg;
+
+
+/*
+ * Transform 64 bytes (16 32-bit words) at DATA.
+ *
+ * unsigned int
+ * _gcry_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data)
+ */
+.text
+.globl _gcry_sha1_transform_amd64_ssse3
+.type _gcry_sha1_transform_amd64_ssse3,@function
+.align 16
+_gcry_sha1_transform_amd64_ssse3:
+        /* input:
+         *      %rdi: ctx, CTX
+         *      %rsi: data (64 bytes)
+         *      %rdx: ...
+         */
+
+        movq %rdi, RSTATE;
+        movq %rsi, RDATA;
+        pushq %rbx;
+        pushq %rbp;
+
+        movq %rsp, ROLDSTACK;
+
+        subq $(16*4), %rsp;
+        andq $(~31), %rsp;
+
+        /* Get the values of the chaining variables. */
+        movl state_h0(RSTATE), a;
+        movl state_h1(RSTATE), b;
+        movl state_h2(RSTATE), c;
+        movl state_h3(RSTATE), d;
+        movl state_h4(RSTATE), e;
+
+        movdqa .Lbswap_shufb_ctl RIP, BSWAP_REG;
+
+        /* Precalc 0-15. */
+        W_PRECALC_00_15_0(0, W0, Wtmp0);
+        W_PRECALC_00_15_1(1, W0, Wtmp0);
+        W_PRECALC_00_15_2(2, W0, Wtmp0);
+        W_PRECALC_00_15_3(3, W0, Wtmp0);
+        W_PRECALC_00_15_0(4, W7, Wtmp0);
+        W_PRECALC_00_15_1(5, W7, Wtmp0);
+        W_PRECALC_00_15_2(6, W7, Wtmp0);
+        W_PRECALC_00_15_3(7, W7, Wtmp0);
+        W_PRECALC_00_15_0(8, W6, Wtmp0);
+        W_PRECALC_00_15_1(9, W6, Wtmp0);
+        W_PRECALC_00_15_2(10, W6, Wtmp0);
+        W_PRECALC_00_15_3(11, W6, Wtmp0);
+        W_PRECALC_00_15_0(12, W5, Wtmp0);
+        W_PRECALC_00_15_1(13, W5, Wtmp0);
+        W_PRECALC_00_15_2(14, W5, Wtmp0);
+        W_PRECALC_00_15_3(15, W5, Wtmp0);
+
+        /* Transform 0-15 + Precalc 16-31. */
+        R( a, b, c, d, e, F1,  0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+        R( e, a, b, c, d, F1,  1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+        R( d, e, a, b, c, F1,  2 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+        R( c, d, e, a, b, F1,  3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+        R( b, c, d, e, a, F1,  4 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+        R( a, b, c, d, e, F1,  5 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+        R( e, a, b, c, d, F1,  6 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+        R( d, e, a, b, c, F1,  7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+        R( c, d, e, a, b, F1,  8 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+        R( b, c, d, e, a, F1,  9 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+        R( a, b, c, d, e, F1, 10 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+        R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+        R( d, e, a, b, c, F1, 12 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+        R( c, d, e, a, b, F1, 13 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+        R( b, c, d, e, a, F1, 14 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+        R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+
+        /* Transform 16-63 + Precalc 32-79. */
+        R( e, a, b, c, d, F1, 16 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+        R( d, e, a, b, c, F1, 17 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+        R( c, d, e, a, b, F1, 18 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+        R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+        R( a, b, c, d, e, F2, 20 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+        R( e, a, b, c, d, F2, 21 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+        R( d, e, a, b, c, F2, 22 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+        R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+        R( b, c, d, e, a, F2, 24 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+        R( a, b, c, d, e, F2, 25 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+        R( e, a, b, c, d, F2, 26 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+        R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+        R( c, d, e, a, b, F2, 28 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+        R( b, c, d, e, a, F2, 29 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+        R( a, b, c, d, e, F2, 30 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+        R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+        R( d, e, a, b, c, F2, 32 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+        R( c, d, e, a, b, F2, 33 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+        R( b, c, d, e, a, F2, 34 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+        R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+        R( e, a, b, c, d, F2, 36 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+        R( d, e, a, b, c, F2, 37 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+        R( c, d, e, a, b, F2, 38 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+        R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+        R( a, b, c, d, e, F3, 40 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+        R( e, a, b, c, d, F3, 41 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+        R( d, e, a, b, c, F3, 42 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+        R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+        R( b, c, d, e, a, F3, 44 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+        R( a, b, c, d, e, F3, 45 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+        R( e, a, b, c, d, F3, 46 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+        R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+        R( c, d, e, a, b, F3, 48 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+        R( b, c, d, e, a, F3, 49 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+        R( a, b, c, d, e, F3, 50 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+        R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+        R( d, e, a, b, c, F3, 52 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+        R( c, d, e, a, b, F3, 53 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+        R( b, c, d, e, a, F3, 54 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+        R( a, b, c, d, e, F3, 55 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+        R( e, a, b, c, d, F3, 56 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+        R( d, e, a, b, c, F3, 57 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+        R( c, d, e, a, b, F3, 58 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+        R( b, c, d, e, a, F3, 59 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+        R( a, b, c, d, e, F4, 60 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+        R( e, a, b, c, d, F4, 61 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+        R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+        R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+
+        /* Transform 64-79 + Clear XMM registers. */
+        R( b, c, d, e, a, F4, 64 ); CLEAR_REG(BSWAP_REG);
+        R( a, b, c, d, e, F4, 65 ); CLEAR_REG(Wtmp0);
+        R( e, a, b, c, d, F4, 66 ); CLEAR_REG(Wtmp1);
+        R( d, e, a, b, c, F4, 67 ); CLEAR_REG(W0);
+        R( c, d, e, a, b, F4, 68 ); CLEAR_REG(W1);
+        R( b, c, d, e, a, F4, 69 ); CLEAR_REG(W2);
+        R( a, b, c, d, e, F4, 70 ); CLEAR_REG(W3);
+        R( e, a, b, c, d, F4, 71 ); CLEAR_REG(W4);
+        R( d, e, a, b, c, F4, 72 ); CLEAR_REG(W5);
+        R( c, d, e, a, b, F4, 73 ); CLEAR_REG(W6);
+        R( b, c, d, e, a, F4, 74 ); CLEAR_REG(W7);
+        R( a, b, c, d, e, F4, 75 );
+        R( e, a, b, c, d, F4, 76 );
+        R( d, e, a, b, c, F4, 77 );
+        R( c, d, e, a, b, F4, 78 );
+        R( b, c, d, e, a, F4, 79 );
+
+        /* Update the chaining variables. */
+        addl state_h0(RSTATE), a;
+        addl state_h1(RSTATE), b;
+        addl state_h2(RSTATE), c;
+        addl state_h3(RSTATE), d;
+        addl state_h4(RSTATE), e;
+
+        movl a, state_h0(RSTATE);
+        movl b, state_h1(RSTATE);
+        movl c, state_h2(RSTATE);
+        movl d, state_h3(RSTATE);
+        movl e, state_h4(RSTATE);
+
+        movq ROLDSTACK, %rsp;
+
+        popq %rbp;
+        popq %rbx;
+
+        /* burn_stack */
+        movl $(16*4 + 2*8 + 31), %eax;
+
+        ret;
+
+#endif
+#endif
diff --git a/cipher/sha1-ssse3-amd64.c b/cipher/sha1-ssse3-amd64.c
deleted file mode 100644
index 13422352..00000000
--- a/cipher/sha1-ssse3-amd64.c
+++ /dev/null
@@ -1,319 +0,0 @@
-/* sha1-ssse3-amd64.c - Intel SSSE3 accelerated SHA-1 transform function
- * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * Based on sha1.c:
- *  Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
- *
- * This file is part of Libgcrypt.
- *
- * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as
- * published by the Free Software Foundation; either version 2.1 of
- * the License, or (at your option) any later version.
- *
- * Libgcrypt is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-/*
- * Intel SSSE3 accelerated SHA-1 implementation based on white paper:
- *  "Improving the Performance of the Secure Hash Algorithm (SHA-1)"
- *  http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
- */
-
-#ifdef __x86_64__
-#include <config.h>
-
-#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
-    defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
-    defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA1)
-
-#ifdef HAVE_STDINT_H
-# include <stdint.h> /* uintptr_t */
-#elif defined(HAVE_INTTYPES_H)
-# include <inttypes.h>
-#else
-/* In this case, uintptr_t is provided by config.h. */
-#endif
-
-#include "bithelp.h"
-
-
-/* Helper macro to force alignment to 16 bytes. */
-#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
-# define ATTR_ALIGNED_16  __attribute__ ((aligned (16)))
-#else
-# define ATTR_ALIGNED_16
-#endif
-
-
-typedef struct
-{
-  u32 h0,h1,h2,h3,h4;
-} SHA1_STATE;
-
-
-/* Round function macros. */
-#define K1 0x5A827999L
-#define K2 0x6ED9EBA1L
-#define K3 0x8F1BBCDCL
-#define K4 0xCA62C1D6L
-#define F1(x,y,z)   ( z ^ ( x & ( y ^ z ) ) )
-#define F2(x,y,z)   ( x ^ y ^ z )
-#define F3(x,y,z)   ( ( x & y ) | ( z & ( x | y ) ) )
-#define F4(x,y,z)   ( x ^ y ^ z )
-#define R(a,b,c,d,e,f,wk)  do { e += rol( a, 5 )    \
-                                     + f( b, c, d ) \
-                                     + wk;          \
-                                b = rol( b, 30 );   \
-                              } while(0)
-
-#define WK(i) (wk[i & 15])
-
-
-static const u32 K_XMM[4][4] ATTR_ALIGNED_16 =
-  {
-    { K1, K1, K1, K1 },
-    { K2, K2, K2, K2 },
-    { K3, K3, K3, K3 },
-    { K4, K4, K4, K4 },
-  };
-static const u32 bswap_shufb_ctl[4] ATTR_ALIGNED_16 =
-  { 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f };
-
-
-/*
- * Transform 64 bytes (16 32-bit words) at DATA.
- */
-unsigned int
-_gcry_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data)
-{
-  SHA1_STATE *state = ctx;
-  register u32 a, b, c, d, e; /* Local copies of the chaining variables.  */
-  byte wk_unaligned[4*16+15]; /* The array we work on. */
-  u32 *wk = (u32 *)(wk_unaligned
-                    + ((16 - ((uintptr_t)wk_unaligned & 15)) & 15));
-
-  /* Get the values of the chaining variables. */
-  a = state->h0;
-  b = state->h1;
-  c = state->h2;
-  d = state->h3;
-  e = state->h4;
-
-#define Wtmp0 "xmm0"
-#define Wtmp1 "xmm1"
-
-#define W0 "xmm2"
-#define W1 "xmm3"
-#define W2 "xmm4"
-#define W3 "xmm5"
-#define W4 "xmm6"
-#define W5 "xmm7"
-#define W6 "xmm8"
-#define W7 "xmm9"
-
-#define BSWAP_REG "xmm10"
-
-  __asm__ volatile ("movdqa %[bswap], %%"BSWAP_REG";\n\t"
-                    :: [bswap] "m" (bswap_shufb_ctl[0]));
-
-#define W_PRECALC_00_15_0(i, W, tmp0) \
-  __asm__ volatile ("movdqu %[data], %%"tmp0";\n\t" \
-                    ::[data] "m" (*(data+4*(i))));
-
-#define W_PRECALC_00_15_1(i, W, tmp0) \
-  __asm__ volatile ("pshufb %%"BSWAP_REG", %%"tmp0";\n\t" \
-                    "movdqa %%"tmp0", %%"W";\n\t" \
-                    ::: "cc");
-
-#define W_PRECALC_00_15_2(i, W, tmp0) \
-  __asm__ volatile ("paddd %[k_xmm], %%"tmp0";\n\t" \
-                    ::[k_xmm] "m" (K_XMM[i / 20][0]));
-
-#define W_PRECALC_00_15_3(i, W, tmp0) \
-  __asm__ volatile ("movdqa %%"tmp0", %[wk];\n\t" \
-                    :[wk] "=m" (WK(i&~3)));
-
-  /* Precalc 0-15. */
-  W_PRECALC_00_15_0(0, W0, Wtmp0);
-  W_PRECALC_00_15_1(1, W0, Wtmp0);
-  W_PRECALC_00_15_2(2, W0, Wtmp0);
-  W_PRECALC_00_15_3(3, W0, Wtmp0);
-  W_PRECALC_00_15_0(4, W7, Wtmp0);
-  W_PRECALC_00_15_1(5, W7, Wtmp0);
-  W_PRECALC_00_15_2(6, W7, Wtmp0);
-  W_PRECALC_00_15_3(7, W7, Wtmp0);
-  W_PRECALC_00_15_0(8, W6, Wtmp0);
-  W_PRECALC_00_15_1(9, W6, Wtmp0);
-  W_PRECALC_00_15_2(10, W6, Wtmp0);
-  W_PRECALC_00_15_3(11, W6, Wtmp0);
-  W_PRECALC_00_15_0(12, W5, Wtmp0);
-  W_PRECALC_00_15_1(13, W5, Wtmp0);
-  W_PRECALC_00_15_2(14, W5, Wtmp0);
-  W_PRECALC_00_15_3(15, W5, Wtmp0);
-
-#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
-  __asm__ volatile ("movdqa %%"W_m12", %%"W";\n\t" \
-                    "palignr $8, %%"W_m16", %%"W";\n\t" \
-                    "movdqa %%"W_m04", %%"tmp0";\n\t" \
-                    "psrldq $4, %%"tmp0";\n\t" \
-                    "pxor %%"W_m08", %%"W";\n\t" \
-                    :::"cc");
-
-#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
-  __asm__ volatile ("pxor %%"W_m16", %%"tmp0";\n\t" \
-                    "pxor %%"tmp0", %%"W";\n\t" \
-                    "movdqa %%"W", %%"tmp1";\n\t" \
-                    "movdqa %%"W", %%"tmp0";\n\t" \
-                    "pslldq $12, %%"tmp1";\n\t" \
-                    :::"cc");
-
-#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
-  __asm__ volatile ("psrld $31, %%"W";\n\t" \
-                    "pslld $1, %%"tmp0";\n\t" \
-                    "por %%"W", %%"tmp0";\n\t" \
-                    "movdqa %%"tmp1", %%"W";\n\t" \
-                    "psrld $30, %%"tmp1";\n\t" \
-                    "pslld $2, %%"W";\n\t" \
-                    :::"cc");
-
-#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
-  __asm__ volatile ("pxor %%"W", %%"tmp0";\n\t" \
-                    "pxor %%"tmp1", %%"tmp0";\n\t" \
-                    "movdqa %%"tmp0", %%"W";\n\t" \
-                    "paddd %[k_xmm], %%"tmp0";\n\t" \
-                    "movdqa %%"tmp0", %[wk];\n\t" \
-                    : [wk] "=m" (WK(i&~3)) \
-                    : [k_xmm] "m" (K_XMM[i / 20][0]));
-
-  /* Transform 0-15 + Precalc 16-31. */
-  R( a, b, c, d, e, F1, WK( 0) ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
-  R( e, a, b, c, d, F1, WK( 1) ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
-  R( d, e, a, b, c, F1, WK( 2) ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
-  R( c, d, e, a, b, F1, WK( 3) ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
-  R( b, c, d, e, a, F1, WK( 4) ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
-  R( a, b, c, d, e, F1, WK( 5) ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
-  R( e, a, b, c, d, F1, WK( 6) ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
-  R( d, e, a, b, c, F1, WK( 7) ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
-  R( c, d, e, a, b, F1, WK( 8) ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
-  R( b, c, d, e, a, F1, WK( 9) ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
-  R( a, b, c, d, e, F1, WK(10) ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
-  R( e, a, b, c, d, F1, WK(11) ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
-  R( d, e, a, b, c, F1, WK(12) ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
-  R( c, d, e, a, b, F1, WK(13) ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
-  R( b, c, d, e, a, F1, WK(14) ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
-  R( a, b, c, d, e, F1, WK(15) ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
-
-#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
-  __asm__ volatile ("movdqa %%"W_m04", %%"tmp0";\n\t" \
-                    "pxor %%"W_m28", %%"W";\n\t" \
-                    "palignr $8, %%"W_m08", %%"tmp0";\n\t" \
-                    :::"cc");
-
-#define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
-  __asm__ volatile ("pxor %%"W_m16", %%"W";\n\t" \
-                    "pxor %%"tmp0", %%"W";\n\t" \
-                    "movdqa %%"W", %%"tmp0";\n\t" \
-                    :::"cc");
-
-#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
-  __asm__ volatile ("psrld $30, %%"W";\n\t" \
-                    "pslld $2, %%"tmp0";\n\t" \
-                    "por %%"W", %%"tmp0";\n\t" \
-                    :::"cc");
-
-#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
-  __asm__ volatile ("movdqa %%"tmp0", %%"W";\n\t" \
-                    "paddd %[k_xmm], %%"tmp0";\n\t" \
-                    "movdqa %%"tmp0", %[wk];\n\t" \
-                    : [wk] "=m" (WK(i&~3)) \
-                    : [k_xmm] "m" (K_XMM[i / 20][0]));
-
-  /* Transform 16-63 + Precalc 32-79. */
-  R( e, a, b, c, d, F1, WK(16) ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
-  R( d, e, a, b, c, F1, WK(17) ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
-  R( c, d, e, a, b, F1, WK(18) ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
-  R( b, c, d, e, a, F1, WK(19) ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
-  R( a, b, c, d, e, F2, WK(20) ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
-  R( e, a, b, c, d, F2, WK(21) ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
-  R( d, e, a, b, c, F2, WK(22) ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
-  R( c, d, e, a, b, F2, WK(23) ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
-  R( b, c, d, e, a, F2, WK(24) ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
-  R( a, b, c, d, e, F2, WK(25) ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
-  R( e, a, b, c, d, F2, WK(26) ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
-  R( d, e, a, b, c, F2, WK(27) ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
-  R( c, d, e, a, b, F2, WK(28) ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
-  R( b, c, d, e, a, F2, WK(29) ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
-  R( a, b, c, d, e, F2, WK(30) ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
-  R( e, a, b, c, d, F2, WK(31) ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
-  R( d, e, a, b, c, F2, WK(32) ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
-  R( c, d, e, a, b, F2, WK(33) ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
-  R( b, c, d, e, a, F2, WK(34) ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
-  R( a, b, c, d, e, F2, WK(35) ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
-  R( e, a, b, c, d, F2, WK(36) ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
-  R( d, e, a, b, c, F2, WK(37) ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
-  R( c, d, e, a, b, F2, WK(38) ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
-  R( b, c, d, e, a, F2, WK(39) ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
-  R( a, b, c, d, e, F3, WK(40) ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
-  R( e, a, b, c, d, F3, WK(41) ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
-  R( d, e, a, b, c, F3, WK(42) ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
-  R( c, d, e, a, b, F3, WK(43) ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
-  R( b, c, d, e, a, F3, WK(44) ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
-  R( a, b, c, d, e, F3, WK(45) ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
-  R( e, a, b, c, d, F3, WK(46) ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
-  R( d, e, a, b, c, F3, WK(47) ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
-  R( c, d, e, a, b, F3, WK(48) ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
-  R( b, c, d, e, a, F3, WK(49) ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
-  R( a, b, c, d, e, F3, WK(50) ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
-  R( e, a, b, c, d, F3, WK(51) ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
-  R( d, e, a, b, c, F3, WK(52) ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
-  R( c, d, e, a, b, F3, WK(53) ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
-  R( b, c, d, e, a, F3, WK(54) ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
-  R( a, b, c, d, e, F3, WK(55) ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
-  R( e, a, b, c, d, F3, WK(56) ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
-  R( d, e, a, b, c, F3, WK(57) ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
-  R( c, d, e, a, b, F3, WK(58) ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
-  R( b, c, d, e, a, F3, WK(59) ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
-  R( a, b, c, d, e, F4, WK(60) ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
-  R( e, a, b, c, d, F4, WK(61) ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
-  R( d, e, a, b, c, F4, WK(62) ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
-  R( c, d, e, a, b, F4, WK(63) ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
-
-#define CLEAR_REG(reg) __asm__ volatile ("pxor %%"reg", %%"reg";\n\t":::"cc");
-
-  /* Transform 64-79 + Clear XMM registers. */
-  R( b, c, d, e, a, F4, WK(64) ); CLEAR_REG(BSWAP_REG);
-  R( a, b, c, d, e, F4, WK(65) ); CLEAR_REG(Wtmp0);
-  R( e, a, b, c, d, F4, WK(66) ); CLEAR_REG(Wtmp1);
-  R( d, e, a, b, c, F4, WK(67) ); CLEAR_REG(W0);
-  R( c, d, e, a, b, F4, WK(68) ); CLEAR_REG(W1);
-  R( b, c, d, e, a, F4, WK(69) ); CLEAR_REG(W2);
-  R( a, b, c, d, e, F4, WK(70) ); CLEAR_REG(W3);
-  R( e, a, b, c, d, F4, WK(71) ); CLEAR_REG(W4);
-  R( d, e, a, b, c, F4, WK(72) ); CLEAR_REG(W5);
-  R( c, d, e, a, b, F4, WK(73) ); CLEAR_REG(W6);
-  R( b, c, d, e, a, F4, WK(74) ); CLEAR_REG(W7);
-  R( a, b, c, d, e, F4, WK(75) );
-  R( e, a, b, c, d, F4, WK(76) );
-  R( d, e, a, b, c, F4, WK(77) );
-  R( c, d, e, a, b, F4, WK(78) );
-  R( b, c, d, e, a, F4, WK(79) );
-
-  /* Update the chaining variables. */
-  state->h0 += a;
-  state->h1 += b;
-  state->h2 += c;
-  state->h3 += d;
-  state->h4 += e;
-
-  return /* burn_stack */ 84+15;
-}
-
-#endif
-#endif
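
[Editorial note: the exported routine keeps libgcrypt's transform-function
convention of returning the number of stack bytes to wipe (the new asm
returns 16*4 + 2*8 + 31: the wk block, two saved registers, and alignment
slack; the removed C version returned 84+15). A hypothetical caller sketch
under that assumption; the real call site lives in cipher/sha1.c, which this
diff does not show:

    /* Illustrative only.  _gcry_burn_stack() is libgcrypt's helper for
       clearing stack residue; 'state' must point at the h0..h4 words,
       matching the state_h0..state_h4 offsets the asm uses.  */
    unsigned int _gcry_sha1_transform_amd64_ssse3 (void *ctx,
                                                   const unsigned char *data);

    static void
    transform (void *state, const unsigned char *data)
    {
      /* The return value is the burn-stack depth: how many bytes of stack
         may still hold sensitive message/schedule data.  */
      unsigned int nburn = _gcry_sha1_transform_amd64_ssse3 (state, data);
      _gcry_burn_stack (nburn);
    }
]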