diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2015-10-31 21:29:56 +0200 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2015-11-01 21:47:06 +0200 |
commit | 2857cb89c6dc1c02266600bc1fd2967a3cd5cf88 (patch) | |
tree | 1d5ca6d7135264461757cfdd90b051fc98f5f0ed /cipher/keccak.c | |
parent | 07e4839e75a7bca3a6c0a94aecfe75efe61d7ff2 (diff) | |
download | libgcrypt-2857cb89c6dc1c02266600bc1fd2967a3cd5cf88.tar.gz |
Optimize Keccak 64-bit absorb functions
* cipher/keccak.c [USE_64BIT] [__x86_64__] (absorb_lanes64_8)
(absorb_lanes64_4, absorb_lanes64_2, absorb_lanes64_1): New.
* cipher/keccak.c [USE_64BIT] [!__x86_64__] (absorb_lanes64_8)
(absorb_lanes64_4, absorb_lanes64_2, absorb_lanes64_1): New.
[USE_64BIT] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
[USE_64BIT] (keccak_absorb_lanes64): Remove.
[USE_64BIT_SHLD] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
[USE_64BIT_SHLD] (keccak_absorb_lanes64_shld): Remove.
[USE_64BIT_BMI2] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
[USE_64BIT_BMI2] (keccak_absorb_lanes64_bmi2): Remove.
* cipher/keccak_permute_64.h (KECCAK_F1600_ABSORB_FUNC_NAME): New.
--
Optimize 64-bit absorb functions for small speed-up. After this
change, 64-bit BMI2 implementation matches speed of fastest results
from SUPERCOP for Intel Haswell CPUs (long messages).
Benchmark on Intel Haswell @ 3.2 Ghz:
Before:
| nanosecs/byte mebibytes/sec cycles/byte
SHAKE128 | 2.32 ns/B 411.7 MiB/s 7.41 c/B
SHAKE256 | 2.84 ns/B 336.2 MiB/s 9.08 c/B
SHA3-224 | 2.69 ns/B 354.9 MiB/s 8.60 c/B
SHA3-256 | 2.84 ns/B 336.0 MiB/s 9.08 c/B
SHA3-384 | 3.69 ns/B 258.4 MiB/s 11.81 c/B
SHA3-512 | 5.30 ns/B 179.9 MiB/s 16.97 c/B
After:
| nanosecs/byte mebibytes/sec cycles/byte
SHAKE128 | 2.27 ns/B 420.6 MiB/s 7.26 c/B
SHAKE256 | 2.79 ns/B 341.4 MiB/s 8.94 c/B
SHA3-224 | 2.64 ns/B 361.7 MiB/s 8.44 c/B
SHA3-256 | 2.79 ns/B 341.5 MiB/s 8.94 c/B
SHA3-384 | 3.65 ns/B 261.4 MiB/s 11.68 c/B
SHA3-512 | 5.27 ns/B 181.0 MiB/s 16.87 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/keccak.c')
-rw-r--r-- | cipher/keccak.c | 159 |
1 files changed, 93 insertions, 66 deletions
diff --git a/cipher/keccak.c b/cipher/keccak.c index f4f0ef3e..ce578607 100644 --- a/cipher/keccak.c +++ b/cipher/keccak.c @@ -223,38 +223,105 @@ keccak_absorb_lane32bi(u32 *lane, u32 x0, u32 x1) /* Construct generic 64-bit implementation. */ #ifdef USE_64BIT +#if __GNUC__ >= 4 && defined(__x86_64__) + +static inline void absorb_lanes64_8(u64 *dst, const byte *in) +{ + asm ("movdqu 0*16(%[dst]), %%xmm0\n\t" + "movdqu 0*16(%[in]), %%xmm4\n\t" + "movdqu 1*16(%[dst]), %%xmm1\n\t" + "movdqu 1*16(%[in]), %%xmm5\n\t" + "movdqu 2*16(%[dst]), %%xmm2\n\t" + "movdqu 3*16(%[dst]), %%xmm3\n\t" + "pxor %%xmm4, %%xmm0\n\t" + "pxor %%xmm5, %%xmm1\n\t" + "movdqu 2*16(%[in]), %%xmm4\n\t" + "movdqu 3*16(%[in]), %%xmm5\n\t" + "movdqu %%xmm0, 0*16(%[dst])\n\t" + "pxor %%xmm4, %%xmm2\n\t" + "movdqu %%xmm1, 1*16(%[dst])\n\t" + "pxor %%xmm5, %%xmm3\n\t" + "movdqu %%xmm2, 2*16(%[dst])\n\t" + "movdqu %%xmm3, 3*16(%[dst])\n\t" + : + : [dst] "r" (dst), [in] "r" (in) + : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory"); +} + +static inline void absorb_lanes64_4(u64 *dst, const byte *in) +{ + asm ("movdqu 0*16(%[dst]), %%xmm0\n\t" + "movdqu 0*16(%[in]), %%xmm4\n\t" + "movdqu 1*16(%[dst]), %%xmm1\n\t" + "movdqu 1*16(%[in]), %%xmm5\n\t" + "pxor %%xmm4, %%xmm0\n\t" + "pxor %%xmm5, %%xmm1\n\t" + "movdqu %%xmm0, 0*16(%[dst])\n\t" + "movdqu %%xmm1, 1*16(%[dst])\n\t" + : + : [dst] "r" (dst), [in] "r" (in) + : "xmm0", "xmm1", "xmm4", "xmm5", "memory"); +} + +static inline void absorb_lanes64_2(u64 *dst, const byte *in) +{ + asm ("movdqu 0*16(%[dst]), %%xmm0\n\t" + "movdqu 0*16(%[in]), %%xmm4\n\t" + "pxor %%xmm4, %%xmm0\n\t" + "movdqu %%xmm0, 0*16(%[dst])\n\t" + : + : [dst] "r" (dst), [in] "r" (in) + : "xmm0", "xmm4", "memory"); +} + +#else /* __x86_64__ */ + +static inline void absorb_lanes64_8(u64 *dst, const byte *in) +{ + dst[0] ^= buf_get_le64(in + 8 * 0); + dst[1] ^= buf_get_le64(in + 8 * 1); + dst[2] ^= buf_get_le64(in + 8 * 2); + dst[3] ^= buf_get_le64(in + 8 * 3); + dst[4] ^= buf_get_le64(in + 8 * 4); + dst[5] ^= buf_get_le64(in + 8 * 5); + dst[6] ^= buf_get_le64(in + 8 * 6); + dst[7] ^= buf_get_le64(in + 8 * 7); +} + +static inline void absorb_lanes64_4(u64 *dst, const byte *in) +{ + dst[0] ^= buf_get_le64(in + 8 * 0); + dst[1] ^= buf_get_le64(in + 8 * 1); + dst[2] ^= buf_get_le64(in + 8 * 2); + dst[3] ^= buf_get_le64(in + 8 * 3); +} + +static inline void absorb_lanes64_2(u64 *dst, const byte *in) +{ + dst[0] ^= buf_get_le64(in + 8 * 0); + dst[1] ^= buf_get_le64(in + 8 * 1); +} + +#endif /* !__x86_64__ */ + +static inline void absorb_lanes64_1(u64 *dst, const byte *in) +{ + dst[0] ^= buf_get_le64(in + 8 * 0); +} + + # define ANDN64(x, y) (~(x) & (y)) # define ROL64(x, n) (((x) << ((unsigned int)n & 63)) | \ ((x) >> ((64 - (unsigned int)(n)) & 63))) # define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64 +# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64 # include "keccak_permute_64.h" # undef ANDN64 # undef ROL64 # undef KECCAK_F1600_PERMUTE_FUNC_NAME - -static unsigned int -keccak_absorb_lanes64(KECCAK_STATE *hd, int pos, const byte *lanes, - unsigned int nlanes, int blocklanes) -{ - unsigned int burn = 0; - - while (nlanes) - { - hd->u.state64[pos] ^= buf_get_le64(lanes); - lanes += 8; - nlanes--; - - if (++pos == blocklanes) - { - burn = keccak_f1600_state_permute64(hd); - pos = 0; - } - } - - return burn; -} +# undef KECCAK_F1600_ABSORB_FUNC_NAME static const keccak_ops_t keccak_generic64_ops = { @@ -279,33 +346,13 @@ static const keccak_ops_t keccak_generic64_ops = tmp; }) # define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_shld +# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64_shld # include "keccak_permute_64.h" # undef ANDN64 # undef ROL64 # undef KECCAK_F1600_PERMUTE_FUNC_NAME - -static unsigned int -keccak_absorb_lanes64_shld(KECCAK_STATE *hd, int pos, const byte *lanes, - unsigned int nlanes, int blocklanes) -{ - unsigned int burn = 0; - - while (nlanes) - { - hd->u.state64[pos] ^= buf_get_le64(lanes); - lanes += 8; - nlanes--; - - if (++pos == blocklanes) - { - burn = keccak_f1600_state_permute64_shld(hd); - pos = 0; - } - } - - return burn; -} +# undef KECCAK_F1600_ABSORB_FUNC_NAME static const keccak_ops_t keccak_shld_64_ops = { @@ -335,33 +382,13 @@ static const keccak_ops_t keccak_shld_64_ops = tmp; }) # define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_bmi2 +# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64_bmi2 # include "keccak_permute_64.h" # undef ANDN64 # undef ROL64 # undef KECCAK_F1600_PERMUTE_FUNC_NAME - -static unsigned int -keccak_absorb_lanes64_bmi2(KECCAK_STATE *hd, int pos, const byte *lanes, - unsigned int nlanes, int blocklanes) -{ - unsigned int burn = 0; - - while (nlanes) - { - hd->u.state64[pos] ^= buf_get_le64(lanes); - lanes += 8; - nlanes--; - - if (++pos == blocklanes) - { - burn = keccak_f1600_state_permute64_bmi2(hd); - pos = 0; - } - } - - return burn; -} +# undef KECCAK_F1600_ABSORB_FUNC_NAME static const keccak_ops_t keccak_bmi2_64_ops = { |