diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2017-01-04 10:18:36 +0200 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2017-01-06 12:48:20 +0200 |
commit | c59a8ce51ceb9a80169c44ef86a67e95cf8528c3 (patch) | |
tree | 79900afec0b7eaeb7b47d0de95159f11648da4d3 /cipher/twofish.c | |
parent | 232a129b1f915fc54881506e4b07c89cf84932e6 (diff) | |
download | libgcrypt-c59a8ce51ceb9a80169c44ef86a67e95cf8528c3.tar.gz |
Add AVX2/vpgather bulk implementation of Twofish
* cipher/Makefile.am: Add 'twofish-avx2-amd64.S'.
* cipher/twofish-avx2-amd64.S: New.
* cipher/twofish.c (USE_AVX2): New.
(TWOFISH_context) [USE_AVX2]: Add 'use_avx2' member.
(ASM_FUNC_ABI): New.
(twofish_setkey): Add check for AVX2 and fast VPGATHER HW features.
(_gcry_twofish_avx2_ctr_enc, _gcry_twofish_avx2_cbc_dec)
(_gcry_twofish_avx2_cfb_dec, _gcry_twofish_avx2_ocb_enc)
(_gcry_twofish_avx2_ocb_dec, _gcry_twofish_avx2_ocb_auth): New.
(_gcry_twofish_ctr_enc, _gcry_twofish_cbc_dec, _gcry_twofish_cfb_dec)
(_gcry_twofish_ocb_crypt, _gcry_twofish_ocb_auth): Add AVX2 bulk
handling.
(selftest_ctr, selftest_cbc, selftest_cfb): Increase nblocks from
3+X to 16+X.
* configure.ac: Add 'twofish-avx2-amd64.lo'.
* src/g10lib.h (HWF_INTEL_FAST_VPGATHER): New.
* src/hwf-x86.c (detect_x86_gnuc): Add detection for
HWF_INTEL_FAST_VPGATHER.
* src/hwfeatures.c (HWF_INTEL_FAST_VPGATHER): Add
"intel-fast-vpgather" for HWF_INTEL_FAST_VPGATHER.
--
Benchmark on Intel Core i3-6100 (3.7 GHz):
Before:
TWOFISH | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 4.25 ns/B 224.5 MiB/s 15.71 c/B
ECB dec | 4.16 ns/B 229.5 MiB/s 15.38 c/B
CBC enc | 4.53 ns/B 210.4 MiB/s 16.77 c/B
CBC dec | 2.71 ns/B 351.6 MiB/s 10.04 c/B
CFB enc | 4.60 ns/B 207.3 MiB/s 17.02 c/B
CFB dec | 2.70 ns/B 353.5 MiB/s 9.98 c/B
OFB enc | 4.25 ns/B 224.2 MiB/s 15.74 c/B
OFB dec | 4.24 ns/B 225.0 MiB/s 15.68 c/B
CTR enc | 2.72 ns/B 350.6 MiB/s 10.06 c/B
CTR dec | 2.72 ns/B 350.7 MiB/s 10.06 c/B
CCM enc | 7.25 ns/B 131.5 MiB/s 26.83 c/B
CCM dec | 7.25 ns/B 131.5 MiB/s 26.83 c/B
CCM auth | 4.57 ns/B 208.9 MiB/s 16.89 c/B
GCM enc | 3.02 ns/B 315.3 MiB/s 11.19 c/B
GCM dec | 3.02 ns/B 315.6 MiB/s 11.18 c/B
GCM auth | 0.297 ns/B 3208.4 MiB/s 1.10 c/B
OCB enc | 2.73 ns/B 349.7 MiB/s 10.09 c/B
OCB dec | 2.82 ns/B 338.3 MiB/s 10.43 c/B
OCB auth | 2.77 ns/B 343.7 MiB/s 10.27 c/B
After (CBC-dec & CFB-dec & CTR & OCB, ~1.5x faster):
TWOFISH | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 4.25 ns/B 224.2 MiB/s 15.74 c/B
ECB dec | 4.15 ns/B 229.5 MiB/s 15.37 c/B
CBC enc | 4.61 ns/B 206.8 MiB/s 17.06 c/B
CBC dec | 1.75 ns/B 544.0 MiB/s 6.49 c/B
CFB enc | 4.52 ns/B 211.0 MiB/s 16.72 c/B
CFB dec | 1.72 ns/B 554.1 MiB/s 6.37 c/B
OFB enc | 4.27 ns/B 223.3 MiB/s 15.80 c/B
OFB dec | 4.28 ns/B 222.7 MiB/s 15.84 c/B
CTR enc | 1.73 ns/B 549.9 MiB/s 6.42 c/B
CTR dec | 1.75 ns/B 545.1 MiB/s 6.47 c/B
CCM enc | 6.31 ns/B 151.2 MiB/s 23.34 c/B
CCM dec | 6.42 ns/B 148.5 MiB/s 23.76 c/B
CCM auth | 4.56 ns/B 208.9 MiB/s 16.89 c/B
GCM enc | 1.90 ns/B 502.8 MiB/s 7.02 c/B
GCM dec | 2.00 ns/B 477.8 MiB/s 7.38 c/B
GCM auth | 0.300 ns/B 3178.6 MiB/s 1.11 c/B
OCB enc | 1.76 ns/B 542.2 MiB/s 6.51 c/B
OCB dec | 1.76 ns/B 540.7 MiB/s 6.53 c/B
OCB auth | 1.76 ns/B 542.8 MiB/s 6.50 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/twofish.c')
-rw-r--r-- | cipher/twofish.c | 272 |
1 files changed, 268 insertions, 4 deletions
diff --git a/cipher/twofish.c b/cipher/twofish.c index 55f6fb98..942e8d42 100644 --- a/cipher/twofish.c +++ b/cipher/twofish.c @@ -72,6 +72,15 @@ # endif # endif +/* USE_AVX2 indicates whether to compile with AMD64 AVX2 code. */ +#undef USE_AVX2 +#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) +# if defined(ENABLE_AVX2_SUPPORT) +# define USE_AVX2 1 +# endif +#endif + /* Prototype for the self-test function. */ static const char *selftest(void); @@ -82,8 +91,25 @@ static const char *selftest(void); * that k[i] corresponds to what the Twofish paper calls K[i+8]. */ typedef struct { u32 s[4][256], w[8], k[32]; + +#ifdef USE_AVX2 + int use_avx2; +#endif } TWOFISH_context; + +/* Assembly implementations use SystemV ABI, ABI conversion and additional + * stack to store XMM6-XMM15 needed on Win64. */ +#undef ASM_FUNC_ABI +#if defined(USE_AVX2) +# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS +# define ASM_FUNC_ABI __attribute__((sysv_abi)) +# else +# define ASM_FUNC_ABI +# endif +#endif + + /* These two tables are the q0 and q1 permutations, exactly as described in * the Twofish paper. */ @@ -711,12 +737,66 @@ static gcry_err_code_t twofish_setkey (void *context, const byte *key, unsigned int keylen) { TWOFISH_context *ctx = context; - int rc = do_twofish_setkey (ctx, key, keylen); + unsigned int hwfeatures = _gcry_get_hw_features (); + int rc; + + rc = do_twofish_setkey (ctx, key, keylen); + +#ifdef USE_AVX2 + ctx->use_avx2 = 0; + if ((hwfeatures & HWF_INTEL_AVX2) && (hwfeatures & HWF_INTEL_FAST_VPGATHER)) + { + ctx->use_avx2 = 1; + } +#endif + + (void)hwfeatures; + _gcry_burn_stack (23+6*sizeof(void*)); return rc; } +#ifdef USE_AVX2 +/* Assembler implementations of Twofish using AVX2. Process 16 block in + parallel. 
+ */ +extern void _gcry_twofish_avx2_ctr_enc(const TWOFISH_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *ctr) ASM_FUNC_ABI; + +extern void _gcry_twofish_avx2_cbc_dec(const TWOFISH_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv) ASM_FUNC_ABI; + +extern void _gcry_twofish_avx2_cfb_dec(const TWOFISH_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *iv) ASM_FUNC_ABI; + +extern void _gcry_twofish_avx2_ocb_enc(const TWOFISH_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[16]) ASM_FUNC_ABI; + +extern void _gcry_twofish_avx2_ocb_dec(const TWOFISH_context *ctx, + unsigned char *out, + const unsigned char *in, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[16]) ASM_FUNC_ABI; + +extern void _gcry_twofish_avx2_ocb_auth(const TWOFISH_context *ctx, + const unsigned char *abuf, + unsigned char *offset, + unsigned char *checksum, + const u64 Ls[16]) ASM_FUNC_ABI; +#endif + #ifdef USE_AMD64_ASM @@ -1111,6 +1191,31 @@ _gcry_twofish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg, unsigned int burn, burn_stack_depth = 0; int i; +#ifdef USE_AVX2 + if (ctx->use_avx2) + { + int did_use_avx2 = 0; + + /* Process data in 16 block chunks. */ + while (nblocks >= 16) + { + _gcry_twofish_avx2_ctr_enc(ctx, outbuf, inbuf, ctr); + + nblocks -= 16; + outbuf += 16 * TWOFISH_BLOCKSIZE; + inbuf += 16 * TWOFISH_BLOCKSIZE; + did_use_avx2 = 1; + } + + if (did_use_avx2) + { + /* twofish-avx2 assembly code does not use stack */ + if (nblocks == 0) + burn_stack_depth = 0; + } + } +#endif + #ifdef USE_AMD64_ASM { /* Process data in 3 block chunks. 
*/ @@ -1169,6 +1274,31 @@ _gcry_twofish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg, unsigned char savebuf[TWOFISH_BLOCKSIZE]; unsigned int burn, burn_stack_depth = 0; +#ifdef USE_AVX2 + if (ctx->use_avx2) + { + int did_use_avx2 = 0; + + /* Process data in 16 block chunks. */ + while (nblocks >= 16) + { + _gcry_twofish_avx2_cbc_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 16; + outbuf += 16 * TWOFISH_BLOCKSIZE; + inbuf += 16 * TWOFISH_BLOCKSIZE; + did_use_avx2 = 1; + } + + if (did_use_avx2) + { + /* twofish-avx2 assembly code does not use stack */ + if (nblocks == 0) + burn_stack_depth = 0; + } + } +#endif + #ifdef USE_AMD64_ASM { /* Process data in 3 block chunks. */ @@ -1218,6 +1348,31 @@ _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg, const unsigned char *inbuf = inbuf_arg; unsigned int burn, burn_stack_depth = 0; +#ifdef USE_AVX2 + if (ctx->use_avx2) + { + int did_use_avx2 = 0; + + /* Process data in 16 block chunks. */ + while (nblocks >= 16) + { + _gcry_twofish_avx2_cfb_dec(ctx, outbuf, inbuf, iv); + + nblocks -= 16; + outbuf += 16 * TWOFISH_BLOCKSIZE; + inbuf += 16 * TWOFISH_BLOCKSIZE; + did_use_avx2 = 1; + } + + if (did_use_avx2) + { + /* twofish-avx2 assembly code does not use stack */ + if (nblocks == 0) + burn_stack_depth = 0; + } + } +#endif + #ifdef USE_AMD64_ASM { /* Process data in 3 block chunks. */ @@ -1264,6 +1419,62 @@ _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg, unsigned int burn, burn_stack_depth = 0; u64 blkn = c->u_mode.ocb.data_nblocks; +#ifdef USE_AVX2 + if (ctx->use_avx2) + { + int did_use_avx2 = 0; + u64 Ls[16]; + unsigned int n = 16 - (blkn % 16); + u64 *l; + int i; + + if (nblocks >= 16) + { + for (i = 0; i < 16; i += 8) + { + /* Use u64 to store pointers for x32 support (assembly function + * assumes 64-bit pointers). 
*/ + Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; + Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1]; + Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; + Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2]; + Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; + Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1]; + Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; + } + + Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3]; + l = &Ls[(15 + n) % 16]; + + /* Process data in 16 block chunks. */ + while (nblocks >= 16) + { + blkn += 16; + *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16); + + if (encrypt) + _gcry_twofish_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv, + c->u_ctr.ctr, Ls); + else + _gcry_twofish_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv, + c->u_ctr.ctr, Ls); + + nblocks -= 16; + outbuf += 16 * TWOFISH_BLOCKSIZE; + inbuf += 16 * TWOFISH_BLOCKSIZE; + did_use_avx2 = 1; + } + } + + if (did_use_avx2) + { + /* twofish-avx2 assembly code does not use stack */ + if (nblocks == 0) + burn_stack_depth = 0; + } + } +#endif + { /* Use u64 to store pointers for x32 support (assembly function * assumes 64-bit pointers). */ @@ -1321,6 +1532,59 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, unsigned int burn, burn_stack_depth = 0; u64 blkn = c->u_mode.ocb.aad_nblocks; +#ifdef USE_AVX2 + if (ctx->use_avx2) + { + int did_use_avx2 = 0; + u64 Ls[16]; + unsigned int n = 16 - (blkn % 16); + u64 *l; + int i; + + if (nblocks >= 16) + { + for (i = 0; i < 16; i += 8) + { + /* Use u64 to store pointers for x32 support (assembly function + * assumes 64-bit pointers). 
*/ + Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; + Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1]; + Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; + Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2]; + Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; + Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1]; + Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0]; + } + + Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3]; + l = &Ls[(15 + n) % 16]; + + /* Process data in 16 block chunks. */ + while (nblocks >= 16) + { + blkn += 16; + *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16); + + _gcry_twofish_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset, + c->u_mode.ocb.aad_sum, Ls); + + nblocks -= 16; + abuf += 16 * TWOFISH_BLOCKSIZE; + did_use_avx2 = 1; + } + } + + if (did_use_avx2) + { + /* twofish-avx2 assembly code does not use stack */ + if (nblocks == 0) + burn_stack_depth = 0; + } + + /* Use generic code to handle smaller chunks... */ + } +#endif + { /* Use u64 to store pointers for x32 support (assembly function * assumes 64-bit pointers). */ @@ -1367,7 +1631,7 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, static const char * selftest_ctr (void) { - const int nblocks = 3+1; + const int nblocks = 16+1; const int blocksize = TWOFISH_BLOCKSIZE; const int context_size = sizeof(TWOFISH_context); @@ -1381,7 +1645,7 @@ selftest_ctr (void) static const char * selftest_cbc (void) { - const int nblocks = 3+2; + const int nblocks = 16+2; const int blocksize = TWOFISH_BLOCKSIZE; const int context_size = sizeof(TWOFISH_context); @@ -1395,7 +1659,7 @@ selftest_cbc (void) static const char * selftest_cfb (void) { - const int nblocks = 3+2; + const int nblocks = 16+2; const int blocksize = TWOFISH_BLOCKSIZE; const int context_size = sizeof(TWOFISH_context); |