diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2014-08-31 13:17:24 +0300 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2014-10-04 15:36:01 +0300 |
commit | de0ccd4dce7ec185a678d78878d4538dd609ca0f (patch) | |
tree | 638f49809b7620692141ab9b87315706e99c7994 /cipher | |
parent | 30bd759f398f45b04d0a783b875f59ce9bd1e51d (diff) | |
download | libgcrypt-de0ccd4dce7ec185a678d78878d4538dd609ca0f.tar.gz |
Add Whirlpool AMD64/SSE2 assembly implementation
* cipher/Makefile.am: Add 'whirlpool-sse2-amd64.S'.
* cipher/whirlpool-sse2-amd64.S: New.
* cipher/whirlpool.c (USE_AMD64_ASM): New.
(whirlpool_tables_s): New.
(rc, C0, C1, C2, C3, C4, C5, C6, C7): Combine these tables into a single
structure and replace the old tables with macros of the same name.
(tab): New structure containing above tables.
[USE_AMD64_ASM] (_gcry_whirlpool_transform_amd64)
(whirlpool_transform): New.
* configure.ac [host=x86_64]: Add 'whirlpool-sse2-amd64.lo'.
--
Benchmark results:
On Intel Core i5-4570 (3.2 GHz):
After:
WHIRLPOOL | 4.82 ns/B 197.8 MiB/s 15.43 c/B
Before:
WHIRLPOOL | 9.10 ns/B 104.8 MiB/s 29.13 c/B
On Intel Core i5-2450M (2.5 GHz):
After:
WHIRLPOOL | 8.43 ns/B 113.1 MiB/s 21.09 c/B
Before:
WHIRLPOOL | 13.45 ns/B 70.92 MiB/s 33.62 c/B
On Intel Core2 T8100 (2.1 GHz):
After:
WHIRLPOOL | 10.22 ns/B 93.30 MiB/s 21.47 c/B
Before:
WHIRLPOOL | 19.87 ns/B 48.00 MiB/s 41.72 c/B
Summary, speed-up of new over old (old ns/B divided by new ns/B):
Intel Core i5-4570: 1.88x
Intel Core i5-2450M: 1.59x
Intel Core2 T8100: 1.94x
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher')
-rw-r--r-- | cipher/Makefile.am | 2 | ||||
-rw-r--r-- | cipher/whirlpool-sse2-amd64.S | 335 | ||||
-rw-r--r-- | cipher/whirlpool.c | 91 |
3 files changed, 391 insertions, 37 deletions
diff --git a/cipher/Makefile.am b/cipher/Makefile.am index c1653566..7f45cbbe 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -87,7 +87,7 @@ sha512.c sha512-ssse3-amd64.S sha512-avx-amd64.S sha512-avx2-bmi2-amd64.S \ sha512-armv7-neon.S \ stribog.c \ tiger.c \ -whirlpool.c \ +whirlpool.c whirlpool-sse2-amd64.S \ twofish.c twofish-amd64.S twofish-arm.S \ rfc2268.c \ camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \ diff --git a/cipher/whirlpool-sse2-amd64.S b/cipher/whirlpool-sse2-amd64.S new file mode 100644 index 00000000..d0bcf2d9 --- /dev/null +++ b/cipher/whirlpool-sse2-amd64.S @@ -0,0 +1,335 @@ +/* whirlpool-sse2-amd64.S - AMD64 assembly implementation of Whirlpool + * + * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifdef __x86_64 +#include <config.h> +#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && defined(USE_WHIRLPOOL) + +#ifdef __PIC__ +# define RIP %rip +#else +# define RIP +#endif + +.text + +/* look-up table offsets on RTAB */ +#define RC (0) +#define C0 (RC + (8 * 10)) +#define C1 (C0 + (8 * 256)) +#define C2 (C1 + (8 * 256)) +#define C3 (C2 + (8 * 256)) +#define C4 (C3 + (8 * 256)) +#define C5 (C4 + (8 * 256)) +#define C6 (C5 + (8 * 256)) +#define C7 (C6 + (8 * 256)) + +/* stack variables */ +#define STACK_DATAP (0) +#define STACK_STATEP (STACK_DATAP + 8) +#define STACK_ROUNDS (STACK_STATEP + 8) +#define STACK_NBLKS (STACK_ROUNDS + 8) +#define STACK_RBP (STACK_NBLKS + 8) +#define STACK_RBX (STACK_RBP + 8) +#define STACK_R12 (STACK_RBX + 8) +#define STACK_R13 (STACK_R12 + 8) +#define STACK_R14 (STACK_R13 + 8) +#define STACK_R15 (STACK_R14 + 8) +#define STACK_MAX (STACK_R15 + 8) + +/* register macros */ +#define RTAB %rbp + +#define RI1 %rax +#define RI2 %rbx +#define RI3 %rcx +#define RI4 %rdx + +#define RI1d %eax +#define RI2d %ebx +#define RI3d %ecx +#define RI4d %edx + +#define RI1bl %al +#define RI2bl %bl +#define RI3bl %cl +#define RI4bl %dl + +#define RI1bh %ah +#define RI2bh %bh +#define RI3bh %ch +#define RI4bh %dh + +#define RB0 %r8 +#define RB1 %r9 +#define RB2 %r10 +#define RB3 %r11 +#define RB4 %r12 +#define RB5 %r13 +#define RB6 %r14 +#define RB7 %r15 + +#define RT0 %rsi +#define RT1 %rdi + +#define RT0d %esi +#define RT1d %edi + +#define XKEY0 %xmm0 +#define XKEY1 %xmm1 +#define XKEY2 %xmm2 +#define XKEY3 %xmm3 +#define XKEY4 %xmm4 +#define XKEY5 %xmm5 +#define XKEY6 %xmm6 +#define XKEY7 %xmm7 + +#define XSTATE0 %xmm8 +#define XSTATE1 %xmm9 +#define XSTATE2 %xmm10 +#define XSTATE3 %xmm11 +#define XSTATE4 %xmm12 +#define XSTATE5 %xmm13 +#define XSTATE6 %xmm14 +#define XSTATE7 %xmm15 + +/*********************************************************************** + * AMD64 assembly implementation of Whirlpool. 
+ * - Using table-lookups + * - Store state in XMM registers + ***********************************************************************/ +#define __do_whirl(op, ri, \ + b0, b1, b2, b3, b4, b5, b6, b7, \ + load_ri, load_arg) \ + movzbl ri ## bl, RT0d; \ + movzbl ri ## bh, RT1d; \ + shrq $16, ri; \ + op ## q C7(RTAB,RT0,8), b7; \ + op ## q C6(RTAB,RT1,8), b6; \ + movzbl ri ## bl, RT0d; \ + movzbl ri ## bh, RT1d; \ + shrq $16, ri; \ + op ## q C5(RTAB,RT0,8), b5; \ + op ## q C4(RTAB,RT1,8), b4; \ + movzbl ri ## bl, RT0d; \ + movzbl ri ## bh, RT1d; \ + shrl $16, ri ## d; \ + op ## q C3(RTAB,RT0,8), b3; \ + op ## q C2(RTAB,RT1,8), b2; \ + movzbl ri ## bl, RT0d; \ + movzbl ri ## bh, RT1d; \ + load_ri( load_arg, ri); \ + op ## q C1(RTAB,RT0,8), b1; \ + op ## q C0(RTAB,RT1,8), b0; + +#define do_whirl(op, ri, rb_add, load_ri, load_arg) \ + __do_whirl(op, ##ri, rb_add, load_ri, load_arg) + +#define dummy(...) /*_*/ + +#define do_movq(src, dst) movq src, dst; + +#define RB_ADD0 RB0, RB1, RB2, RB3, RB4, RB5, RB6, RB7 +#define RB_ADD1 RB1, RB2, RB3, RB4, RB5, RB6, RB7, RB0 +#define RB_ADD2 RB2, RB3, RB4, RB5, RB6, RB7, RB0, RB1 +#define RB_ADD3 RB3, RB4, RB5, RB6, RB7, RB0, RB1, RB2 +#define RB_ADD4 RB4, RB5, RB6, RB7, RB0, RB1, RB2, RB3 +#define RB_ADD5 RB5, RB6, RB7, RB0, RB1, RB2, RB3, RB4 +#define RB_ADD6 RB6, RB7, RB0, RB1, RB2, RB3, RB4, RB5 +#define RB_ADD7 RB7, RB0, RB1, RB2, RB3, RB4, RB5, RB6 + +.align 8 +.globl _gcry_whirlpool_transform_amd64 +.type _gcry_whirlpool_transform_amd64,@function; + +_gcry_whirlpool_transform_amd64: + /* input: + * %rdi: state + * %rsi: inblk + * %rdx: nblks + * %rcx: look-up tables + */ + cmp $0, %rdx; + je .Lskip; + + subq $STACK_MAX, %rsp; + movq %rbp, STACK_RBP(%rsp); + movq %rbx, STACK_RBX(%rsp); + movq %r12, STACK_R12(%rsp); + movq %r13, STACK_R13(%rsp); + movq %r14, STACK_R14(%rsp); + movq %r15, STACK_R15(%rsp); + + movq %rdx, STACK_NBLKS(%rsp); + movq %rdi, STACK_STATEP(%rsp); + movq %rsi, STACK_DATAP(%rsp); + + movq %rcx, RTAB; + + 
jmp .Lfirst_block; + +.align 8 +.Lblock_loop: + movq STACK_DATAP(%rsp), %rsi; + movq RI1, %rdi; + +.Lfirst_block: + /* load data_block */ + movq 0*8(%rsi), RB0; + movq 1*8(%rsi), RB1; + bswapq RB0; + movq 2*8(%rsi), RB2; + bswapq RB1; + movq 3*8(%rsi), RB3; + bswapq RB2; + movq 4*8(%rsi), RB4; + bswapq RB3; + movq 5*8(%rsi), RB5; + bswapq RB4; + movq RB0, XSTATE0; + movq 6*8(%rsi), RB6; + bswapq RB5; + movq RB1, XSTATE1; + movq 7*8(%rsi), RB7; + bswapq RB6; + movq RB2, XSTATE2; + bswapq RB7; + movq RB3, XSTATE3; + movq RB4, XSTATE4; + movq RB5, XSTATE5; + movq RB6, XSTATE6; + movq RB7, XSTATE7; + + /* load key */ + movq 0*8(%rdi), XKEY0; + movq 1*8(%rdi), XKEY1; + movq 2*8(%rdi), XKEY2; + movq 3*8(%rdi), XKEY3; + movq 4*8(%rdi), XKEY4; + movq 5*8(%rdi), XKEY5; + movq 6*8(%rdi), XKEY6; + movq 7*8(%rdi), XKEY7; + + movq XKEY0, RI1; + movq XKEY1, RI2; + movq XKEY2, RI3; + movq XKEY3, RI4; + + /* prepare and store state */ + pxor XKEY0, XSTATE0; + pxor XKEY1, XSTATE1; + pxor XKEY2, XSTATE2; + pxor XKEY3, XSTATE3; + pxor XKEY4, XSTATE4; + pxor XKEY5, XSTATE5; + pxor XKEY6, XSTATE6; + pxor XKEY7, XSTATE7; + + movq XSTATE0, 0*8(%rdi); + movq XSTATE1, 1*8(%rdi); + movq XSTATE2, 2*8(%rdi); + movq XSTATE3, 3*8(%rdi); + movq XSTATE4, 4*8(%rdi); + movq XSTATE5, 5*8(%rdi); + movq XSTATE6, 6*8(%rdi); + movq XSTATE7, 7*8(%rdi); + + addq $64, STACK_DATAP(%rsp); + movl $(0), STACK_ROUNDS(%rsp); +.align 8 +.Lround_loop: + do_whirl(mov, RI1 /*XKEY0*/, RB_ADD0, do_movq, XKEY4); + do_whirl(xor, RI2 /*XKEY1*/, RB_ADD1, do_movq, XKEY5); + do_whirl(xor, RI3 /*XKEY2*/, RB_ADD2, do_movq, XKEY6); + do_whirl(xor, RI4 /*XKEY3*/, RB_ADD3, do_movq, XKEY7); + do_whirl(xor, RI1 /*XKEY0*/, RB_ADD4, do_movq, XSTATE0); + do_whirl(xor, RI2 /*XKEY1*/, RB_ADD5, do_movq, XSTATE1); + do_whirl(xor, RI3 /*XKEY2*/, RB_ADD6, do_movq, XSTATE2); + do_whirl(xor, RI4 /*XKEY3*/, RB_ADD7, do_movq, XSTATE3); + + movl STACK_ROUNDS(%rsp), RT0d; + movq RB1, XKEY1; + addl $1, STACK_ROUNDS(%rsp); + movq RB2, XKEY2; + 
movq RB3, XKEY3; + xorq RC(RTAB,RT0,8), RB0; /* Add round constant */ + movq RB4, XKEY4; + movq RB5, XKEY5; + movq RB0, XKEY0; + movq RB6, XKEY6; + movq RB7, XKEY7; + + do_whirl(xor, RI1 /*XSTATE0*/, RB_ADD0, do_movq, XSTATE4); + do_whirl(xor, RI2 /*XSTATE1*/, RB_ADD1, do_movq, XSTATE5); + do_whirl(xor, RI3 /*XSTATE2*/, RB_ADD2, do_movq, XSTATE6); + do_whirl(xor, RI4 /*XSTATE3*/, RB_ADD3, do_movq, XSTATE7); + + cmpl $10, STACK_ROUNDS(%rsp); + je .Lis_last_round; + + do_whirl(xor, RI1 /*XSTATE4*/, RB_ADD4, do_movq, XKEY0); + do_whirl(xor, RI2 /*XSTATE5*/, RB_ADD5, do_movq, XKEY1); + do_whirl(xor, RI3 /*XSTATE6*/, RB_ADD6, do_movq, XKEY2); + do_whirl(xor, RI4 /*XSTATE7*/, RB_ADD7, do_movq, XKEY3); + movq RB0, XSTATE0; + movq RB1, XSTATE1; + movq RB2, XSTATE2; + movq RB3, XSTATE3; + movq RB4, XSTATE4; + movq RB5, XSTATE5; + movq RB6, XSTATE6; + movq RB7, XSTATE7; + + jmp .Lround_loop; +.align 8 +.Lis_last_round: + do_whirl(xor, RI1 /*XSTATE4*/, RB_ADD4, dummy, _); + movq STACK_STATEP(%rsp), RI1; + do_whirl(xor, RI2 /*XSTATE5*/, RB_ADD5, dummy, _); + do_whirl(xor, RI3 /*XSTATE6*/, RB_ADD6, dummy, _); + do_whirl(xor, RI4 /*XSTATE7*/, RB_ADD7, dummy, _); + + /* store state */ + xorq RB0, 0*8(RI1); + xorq RB1, 1*8(RI1); + xorq RB2, 2*8(RI1); + xorq RB3, 3*8(RI1); + xorq RB4, 4*8(RI1); + xorq RB5, 5*8(RI1); + xorq RB6, 6*8(RI1); + xorq RB7, 7*8(RI1); + + subq $1, STACK_NBLKS(%rsp); + jnz .Lblock_loop; + + movq STACK_RBP(%rsp), %rbp; + movq STACK_RBX(%rsp), %rbx; + movq STACK_R12(%rsp), %r12; + movq STACK_R13(%rsp), %r13; + movq STACK_R14(%rsp), %r14; + movq STACK_R15(%rsp), %r15; + addq $STACK_MAX, %rsp; +.Lskip: + movl $(STACK_MAX + 8), %eax; + ret; +.size _gcry_whirlpool_transform_amd64,.-_gcry_whirlpool_transform_amd64; + +#endif +#endif diff --git a/cipher/whirlpool.c b/cipher/whirlpool.c index ffc6662c..2732f63c 100644 --- a/cipher/whirlpool.c +++ b/cipher/whirlpool.c @@ -40,6 +40,14 @@ #include "bufhelp.h" #include "hash-common.h" +/* USE_AMD64_ASM indicates whether 
to use AMD64 assembly code. */ +#undef USE_AMD64_ASM +#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) +# define USE_AMD64_ASM 1 +#endif + + + /* Size of a whirlpool block (in bytes). */ #define BLOCK_SIZE 64 @@ -89,8 +97,15 @@ typedef struct { + +struct whirlpool_tables_s { + u64 RC[R]; + u64 C[8][256]; +}; + +static const struct whirlpool_tables_s tab = +{ /* Round constants. */ -static const u64 rc[R] = { U64_C (0x1823c6e887b8014f), U64_C (0x36a6d2f5796f9152), @@ -102,13 +117,9 @@ static const u64 rc[R] = U64_C (0xe427418ba77d95d8), U64_C (0xfbee7c66dd17479e), U64_C (0xca2dbf07ad5a8333), - }; - - - + }, /* Main lookup boxes. */ -static const u64 C0[256] = - { + { { U64_C (0x18186018c07830d8), U64_C (0x23238c2305af4626), U64_C (0xc6c63fc67ef991b8), U64_C (0xe8e887e8136fcdfb), U64_C (0x878726874ca113cb), U64_C (0xb8b8dab8a9626d11), @@ -237,10 +248,7 @@ static const u64 C0[256] = U64_C (0x98985a98b4c22d2c), U64_C (0xa4a4aaa4490e55ed), U64_C (0x2828a0285d885075), U64_C (0x5c5c6d5cda31b886), U64_C (0xf8f8c7f8933fed6b), U64_C (0x8686228644a411c2), - }; - -static const u64 C1[256] = - { + }, { U64_C (0xd818186018c07830), U64_C (0x2623238c2305af46), U64_C (0xb8c6c63fc67ef991), U64_C (0xfbe8e887e8136fcd), U64_C (0xcb878726874ca113), U64_C (0x11b8b8dab8a9626d), @@ -369,10 +377,7 @@ static const u64 C1[256] = U64_C (0x2c98985a98b4c22d), U64_C (0xeda4a4aaa4490e55), U64_C (0x752828a0285d8850), U64_C (0x865c5c6d5cda31b8), U64_C (0x6bf8f8c7f8933fed), U64_C (0xc28686228644a411), - }; - -static const u64 C2[256] = - { + }, { U64_C (0x30d818186018c078), U64_C (0x462623238c2305af), U64_C (0x91b8c6c63fc67ef9), U64_C (0xcdfbe8e887e8136f), U64_C (0x13cb878726874ca1), U64_C (0x6d11b8b8dab8a962), @@ -501,10 +506,7 @@ static const u64 C2[256] = U64_C (0x2d2c98985a98b4c2), U64_C (0x55eda4a4aaa4490e), U64_C (0x50752828a0285d88), U64_C (0xb8865c5c6d5cda31), U64_C (0xed6bf8f8c7f8933f), U64_C (0x11c28686228644a4), - }; - -static const u64 C3[256] = - { + }, { U64_C 
(0x7830d818186018c0), U64_C (0xaf462623238c2305), U64_C (0xf991b8c6c63fc67e), U64_C (0x6fcdfbe8e887e813), U64_C (0xa113cb878726874c), U64_C (0x626d11b8b8dab8a9), @@ -633,10 +635,7 @@ static const u64 C3[256] = U64_C (0xc22d2c98985a98b4), U64_C (0x0e55eda4a4aaa449), U64_C (0x8850752828a0285d), U64_C (0x31b8865c5c6d5cda), U64_C (0x3fed6bf8f8c7f893), U64_C (0xa411c28686228644), - }; - -static const u64 C4[256] = - { + }, { U64_C (0xc07830d818186018), U64_C (0x05af462623238c23), U64_C (0x7ef991b8c6c63fc6), U64_C (0x136fcdfbe8e887e8), U64_C (0x4ca113cb87872687), U64_C (0xa9626d11b8b8dab8), @@ -765,10 +764,7 @@ static const u64 C4[256] = U64_C (0xb4c22d2c98985a98), U64_C (0x490e55eda4a4aaa4), U64_C (0x5d8850752828a028), U64_C (0xda31b8865c5c6d5c), U64_C (0x933fed6bf8f8c7f8), U64_C (0x44a411c286862286), - }; - -static const u64 C5[256] = - { + }, { U64_C (0x18c07830d8181860), U64_C (0x2305af462623238c), U64_C (0xc67ef991b8c6c63f), U64_C (0xe8136fcdfbe8e887), U64_C (0x874ca113cb878726), U64_C (0xb8a9626d11b8b8da), @@ -897,10 +893,7 @@ static const u64 C5[256] = U64_C (0x98b4c22d2c98985a), U64_C (0xa4490e55eda4a4aa), U64_C (0x285d8850752828a0), U64_C (0x5cda31b8865c5c6d), U64_C (0xf8933fed6bf8f8c7), U64_C (0x8644a411c2868622), - }; - -static const u64 C6[256] = - { + }, { U64_C (0x6018c07830d81818), U64_C (0x8c2305af46262323), U64_C (0x3fc67ef991b8c6c6), U64_C (0x87e8136fcdfbe8e8), U64_C (0x26874ca113cb8787), U64_C (0xdab8a9626d11b8b8), @@ -1029,10 +1022,7 @@ static const u64 C6[256] = U64_C (0x5a98b4c22d2c9898), U64_C (0xaaa4490e55eda4a4), U64_C (0xa0285d8850752828), U64_C (0x6d5cda31b8865c5c), U64_C (0xc7f8933fed6bf8f8), U64_C (0x228644a411c28686), - }; - -static const u64 C7[256] = - { + }, { U64_C (0x186018c07830d818), U64_C (0x238c2305af462623), U64_C (0xc63fc67ef991b8c6), U64_C (0xe887e8136fcdfbe8), U64_C (0x8726874ca113cb87), U64_C (0xb8dab8a9626d11b8), @@ -1161,7 +1151,18 @@ static const u64 C7[256] = U64_C (0x985a98b4c22d2c98), U64_C (0xa4aaa4490e55eda4), U64_C 
(0x28a0285d88507528), U64_C (0x5c6d5cda31b8865c), U64_C (0xf8c7f8933fed6bf8), U64_C (0x86228644a411c286), - }; + } } +}; +#define C tab.C +#define C0 C[0] +#define C1 C[1] +#define C2 C[2] +#define C3 C[3] +#define C4 C[4] +#define C5 C[5] +#define C6 C[6] +#define C7 C[7] +#define rc tab.RC @@ -1189,6 +1190,22 @@ whirlpool_init (void *ctx, unsigned int flags) } +#ifdef USE_AMD64_ASM + +extern unsigned int +_gcry_whirlpool_transform_amd64(u64 *state, const unsigned char *data, + size_t nblks, const struct whirlpool_tables_s *tables); + +static unsigned int +whirlpool_transform (void *ctx, const unsigned char *data, size_t nblks) +{ + whirlpool_context_t *context = ctx; + + return _gcry_whirlpool_transform_amd64( + context->hash_state, data, nblks, &tab); +} + +#else /* USE_AMD64_ASM */ /* * Transform block. @@ -1308,6 +1325,8 @@ whirlpool_transform ( void *c, const unsigned char *data, size_t nblks ) return burn; } +#endif /* !USE_AMD64_ASM */ + /* Bug compatibility Whirlpool version. */ static void |