summaryrefslogtreecommitdiff
path: root/cipher/twofish.c
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@iki.fi>2017-01-04 10:18:36 +0200
committerJussi Kivilinna <jussi.kivilinna@iki.fi>2017-01-06 12:48:20 +0200
commitc59a8ce51ceb9a80169c44ef86a67e95cf8528c3 (patch)
tree79900afec0b7eaeb7b47d0de95159f11648da4d3 /cipher/twofish.c
parent232a129b1f915fc54881506e4b07c89cf84932e6 (diff)
downloadlibgcrypt-c59a8ce51ceb9a80169c44ef86a67e95cf8528c3.tar.gz
Add AVX2/vpgather bulk implementation of Twofish
* cipher/Makefile.am: Add 'twofish-avx2-amd64.S'. * cipher/twofish-avx2-amd64.S: New. * cipher/twofish.c (USE_AVX2): New. (TWOFISH_context) [USE_AVX2]: Add 'use_avx2' member. (ASM_FUNC_ABI): New. (twofish_setkey): Add check for AVX2 and fast VPGATHER HW features. (_gcry_twofish_avx2_ctr_enc, _gcry_twofish_avx2_cbc_dec) (_gcry_twofish_avx2_cfb_dec, _gcry_twofish_avx2_ocb_enc) (_gcry_twofish_avx2_ocb_dec, _gcry_twofish_avx2_ocb_auth): New. (_gcry_twofish_ctr_enc, _gcry_twofish_cbc_dec, _gcry_twofish_cfb_dec) (_gcry_twofish_ocb_crypt, _gcry_twofish_ocb_auth): Add AVX2 bulk handling. (selftest_ctr, selftest_cbc, selftest_cfb): Increase nblocks from 3+X to 16+X. * configure.ac: Add 'twofish-avx2-amd64.lo'. * src/g10lib.h (HWF_INTEL_FAST_VPGATHER): New. * src/hwf-x86.c (detect_x86_gnuc): Add detection for HWF_INTEL_FAST_VPGATHER. * src/hwfeatures.c (HWF_INTEL_FAST_VPGATHER): Add "intel-fast-vpgather" for HWF_INTEL_FAST_VPGATHER. -- Benchmark on Intel Core i3-6100 (3.7 Ghz): Before: TWOFISH | nanosecs/byte mebibytes/sec cycles/byte ECB enc | 4.25 ns/B 224.5 MiB/s 15.71 c/B ECB dec | 4.16 ns/B 229.5 MiB/s 15.38 c/B CBC enc | 4.53 ns/B 210.4 MiB/s 16.77 c/B CBC dec | 2.71 ns/B 351.6 MiB/s 10.04 c/B CFB enc | 4.60 ns/B 207.3 MiB/s 17.02 c/B CFB dec | 2.70 ns/B 353.5 MiB/s 9.98 c/B OFB enc | 4.25 ns/B 224.2 MiB/s 15.74 c/B OFB dec | 4.24 ns/B 225.0 MiB/s 15.68 c/B CTR enc | 2.72 ns/B 350.6 MiB/s 10.06 c/B CTR dec | 2.72 ns/B 350.7 MiB/s 10.06 c/B CCM enc | 7.25 ns/B 131.5 MiB/s 26.83 c/B CCM dec | 7.25 ns/B 131.5 MiB/s 26.83 c/B CCM auth | 4.57 ns/B 208.9 MiB/s 16.89 c/B GCM enc | 3.02 ns/B 315.3 MiB/s 11.19 c/B GCM dec | 3.02 ns/B 315.6 MiB/s 11.18 c/B GCM auth | 0.297 ns/B 3208.4 MiB/s 1.10 c/B OCB enc | 2.73 ns/B 349.7 MiB/s 10.09 c/B OCB dec | 2.82 ns/B 338.3 MiB/s 10.43 c/B OCB auth | 2.77 ns/B 343.7 MiB/s 10.27 c/B After (CBC-dec & CFB-dec & CTR & OCB, ~1.5x faster): TWOFISH | nanosecs/byte mebibytes/sec cycles/byte ECB enc | 4.25 ns/B 224.2 MiB/s 15.74 c/B ECB dec | 4.15 ns/B 229.5 MiB/s 15.37 c/B CBC enc | 4.61 ns/B 206.8 MiB/s 17.06 c/B CBC dec | 1.75 ns/B 544.0 MiB/s 6.49 c/B CFB enc | 4.52 ns/B 211.0 MiB/s 16.72 c/B CFB dec | 1.72 ns/B 554.1 MiB/s 6.37 c/B OFB enc | 4.27 ns/B 223.3 MiB/s 15.80 c/B OFB dec | 4.28 ns/B 222.7 MiB/s 15.84 c/B CTR enc | 1.73 ns/B 549.9 MiB/s 6.42 c/B CTR dec | 1.75 ns/B 545.1 MiB/s 6.47 c/B CCM enc | 6.31 ns/B 151.2 MiB/s 23.34 c/B CCM dec | 6.42 ns/B 148.5 MiB/s 23.76 c/B CCM auth | 4.56 ns/B 208.9 MiB/s 16.89 c/B GCM enc | 1.90 ns/B 502.8 MiB/s 7.02 c/B GCM dec | 2.00 ns/B 477.8 MiB/s 7.38 c/B GCM auth | 0.300 ns/B 3178.6 MiB/s 1.11 c/B OCB enc | 1.76 ns/B 542.2 MiB/s 6.51 c/B OCB dec | 1.76 ns/B 540.7 MiB/s 6.53 c/B OCB auth | 1.76 ns/B 542.8 MiB/s 6.50 c/B Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/twofish.c')
-rw-r--r--cipher/twofish.c272
1 files changed, 268 insertions, 4 deletions
diff --git a/cipher/twofish.c b/cipher/twofish.c
index 55f6fb98..942e8d42 100644
--- a/cipher/twofish.c
+++ b/cipher/twofish.c
@@ -72,6 +72,15 @@
# endif
# endif
+/* USE_AVX2 indicates whether to compile with AMD64 AVX2 code. */
+#undef USE_AVX2
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# if defined(ENABLE_AVX2_SUPPORT)
+# define USE_AVX2 1
+# endif
+#endif
+
/* Prototype for the self-test function. */
static const char *selftest(void);
@@ -82,8 +91,25 @@ static const char *selftest(void);
* that k[i] corresponds to what the Twofish paper calls K[i+8]. */
typedef struct {
u32 s[4][256], w[8], k[32];
+
+#ifdef USE_AVX2
+ int use_avx2;
+#endif
} TWOFISH_context;
+
+/* Assembly implementations use SystemV ABI, ABI conversion and additional
+ * stack to store XMM6-XMM15 needed on Win64. */
+#undef ASM_FUNC_ABI
+#if defined(USE_AVX2)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+# else
+# define ASM_FUNC_ABI
+# endif
+#endif
+
+
/* These two tables are the q0 and q1 permutations, exactly as described in
* the Twofish paper. */
@@ -711,12 +737,66 @@ static gcry_err_code_t
twofish_setkey (void *context, const byte *key, unsigned int keylen)
{
TWOFISH_context *ctx = context;
- int rc = do_twofish_setkey (ctx, key, keylen);
+ unsigned int hwfeatures = _gcry_get_hw_features ();
+ int rc;
+
+ rc = do_twofish_setkey (ctx, key, keylen);
+
+#ifdef USE_AVX2
+ ctx->use_avx2 = 0;
+ if ((hwfeatures & HWF_INTEL_AVX2) && (hwfeatures & HWF_INTEL_FAST_VPGATHER))
+ {
+ ctx->use_avx2 = 1;
+ }
+#endif
+
+ (void)hwfeatures;
+
_gcry_burn_stack (23+6*sizeof(void*));
return rc;
}
+#ifdef USE_AVX2
+/* Assembler implementations of Twofish using AVX2. Process 16 block in
+ parallel.
+ */
+extern void _gcry_twofish_avx2_ctr_enc(const TWOFISH_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_twofish_avx2_cbc_dec(const TWOFISH_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_twofish_avx2_cfb_dec(const TWOFISH_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_twofish_avx2_ocb_enc(const TWOFISH_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_twofish_avx2_ocb_dec(const TWOFISH_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_twofish_avx2_ocb_auth(const TWOFISH_context *ctx,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+#endif
+
#ifdef USE_AMD64_ASM
@@ -1111,6 +1191,31 @@ _gcry_twofish_ctr_enc(void *context, unsigned char *ctr, void *outbuf_arg,
unsigned int burn, burn_stack_depth = 0;
int i;
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_twofish_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
+
+ nblocks -= 16;
+ outbuf += 16 * TWOFISH_BLOCKSIZE;
+ inbuf += 16 * TWOFISH_BLOCKSIZE;
+ did_use_avx2 = 1;
+ }
+
+ if (did_use_avx2)
+ {
+ /* twofish-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+ }
+#endif
+
#ifdef USE_AMD64_ASM
{
/* Process data in 3 block chunks. */
@@ -1169,6 +1274,31 @@ _gcry_twofish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
unsigned char savebuf[TWOFISH_BLOCKSIZE];
unsigned int burn, burn_stack_depth = 0;
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_twofish_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * TWOFISH_BLOCKSIZE;
+ inbuf += 16 * TWOFISH_BLOCKSIZE;
+ did_use_avx2 = 1;
+ }
+
+ if (did_use_avx2)
+ {
+ /* twofish-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+ }
+#endif
+
#ifdef USE_AMD64_ASM
{
/* Process data in 3 block chunks. */
@@ -1218,6 +1348,31 @@ _gcry_twofish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
const unsigned char *inbuf = inbuf_arg;
unsigned int burn, burn_stack_depth = 0;
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_twofish_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * TWOFISH_BLOCKSIZE;
+ inbuf += 16 * TWOFISH_BLOCKSIZE;
+ did_use_avx2 = 1;
+ }
+
+ if (did_use_avx2)
+ {
+ /* twofish-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+ }
+#endif
+
#ifdef USE_AMD64_ASM
{
/* Process data in 3 block chunks. */
@@ -1264,6 +1419,62 @@ _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
unsigned int burn, burn_stack_depth = 0;
u64 blkn = c->u_mode.ocb.data_nblocks;
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+ u64 Ls[16];
+ unsigned int n = 16 - (blkn % 16);
+ u64 *l;
+ int i;
+
+ if (nblocks >= 16)
+ {
+ for (i = 0; i < 16; i += 8)
+ {
+ /* Use u64 to store pointers for x32 support (assembly function
+ * assumes 64-bit pointers). */
+ Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ }
+
+ Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ l = &Ls[(15 + n) % 16];
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ blkn += 16;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+ if (encrypt)
+ _gcry_twofish_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+ else
+ _gcry_twofish_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+
+ nblocks -= 16;
+ outbuf += 16 * TWOFISH_BLOCKSIZE;
+ inbuf += 16 * TWOFISH_BLOCKSIZE;
+ did_use_avx2 = 1;
+ }
+ }
+
+ if (did_use_avx2)
+ {
+ /* twofish-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+ }
+#endif
+
{
/* Use u64 to store pointers for x32 support (assembly function
* assumes 64-bit pointers). */
@@ -1321,6 +1532,59 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
unsigned int burn, burn_stack_depth = 0;
u64 blkn = c->u_mode.ocb.aad_nblocks;
+#ifdef USE_AVX2
+ if (ctx->use_avx2)
+ {
+ int did_use_avx2 = 0;
+ u64 Ls[16];
+ unsigned int n = 16 - (blkn % 16);
+ u64 *l;
+ int i;
+
+ if (nblocks >= 16)
+ {
+ for (i = 0; i < 16; i += 8)
+ {
+ /* Use u64 to store pointers for x32 support (assembly function
+ * assumes 64-bit pointers). */
+ Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ }
+
+ Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ l = &Ls[(15 + n) % 16];
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ blkn += 16;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+ _gcry_twofish_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
+
+ nblocks -= 16;
+ abuf += 16 * TWOFISH_BLOCKSIZE;
+ did_use_avx2 = 1;
+ }
+ }
+
+ if (did_use_avx2)
+ {
+ /* twofish-avx2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
{
/* Use u64 to store pointers for x32 support (assembly function
* assumes 64-bit pointers). */
@@ -1367,7 +1631,7 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
static const char *
selftest_ctr (void)
{
- const int nblocks = 3+1;
+ const int nblocks = 16+1;
const int blocksize = TWOFISH_BLOCKSIZE;
const int context_size = sizeof(TWOFISH_context);
@@ -1381,7 +1645,7 @@ selftest_ctr (void)
static const char *
selftest_cbc (void)
{
- const int nblocks = 3+2;
+ const int nblocks = 16+2;
const int blocksize = TWOFISH_BLOCKSIZE;
const int context_size = sizeof(TWOFISH_context);
@@ -1395,7 +1659,7 @@ selftest_cbc (void)
static const char *
selftest_cfb (void)
{
- const int nblocks = 3+2;
+ const int nblocks = 16+2;
const int blocksize = TWOFISH_BLOCKSIZE;
const int context_size = sizeof(TWOFISH_context);