author     Jussi Kivilinna <jussi.kivilinna@iki.fi>   2020-06-11 20:17:17 +0300
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>   2020-06-20 14:48:04 +0300
commit     c9a3f1bb91e63033e3bf3e06bdd6075622626d0d (patch)
tree       cc05e9c1542681b11e8f049a62c93e6c2f85a660 /cipher/sm4.c
parent     81fee26bbbae820a311a3ce3ac55e304655c2acd (diff)
Add SM4 x86-64/AES-NI/AVX implementation
* cipher/Makefile.am: Add 'sm4-aesni-avx-amd64.S'.
* cipher/sm4-aesni-avx-amd64.S: New.
* cipher/sm4.c (USE_AESNI_AVX, ASM_FUNC_ABI): New.
(SM4_context) [USE_AESNI_AVX]: Add 'use_aesni_avx'.
[USE_AESNI_AVX] (_gcry_sm4_aesni_avx_expand_key)
(_gcry_sm4_aesni_avx_crypt_blk1_8, _gcry_sm4_aesni_avx_ctr_enc)
(_gcry_sm4_aesni_avx_cbc_dec, _gcry_sm4_aesni_avx_cfb_dec)
(_gcry_sm4_aesni_avx_ocb_enc, _gcry_sm4_aesni_avx_ocb_dec)
(_gcry_sm4_aesni_avx_ocb_auth, sm4_aesni_avx_crypt_blk1_8): New.
(sm4_expand_key) [USE_AESNI_AVX]: Use AES-NI/AVX key setup.
(sm4_setkey): Enable AES-NI/AVX if supported by HW.
(_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec, _gcry_sm4_cfb_dec)
(_gcry_sm4_ocb_crypt, _gcry_sm4_ocb_auth) [USE_AESNI_AVX]: Add
AES-NI/AVX bulk functions.
* configure.ac: Add 'sm4-aesni-avx-amd64.lo'.
--
This patch adds x86-64/AES-NI/AVX bulk encryption/decryption and key
setup for the SM4 cipher. Bulk functions process eight blocks in parallel.
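
As an illustration (not part of the change itself), the new code path is
reached through the normal libgcrypt cipher API. A minimal sketch, assuming
a libgcrypt build with SM4 enabled; the AES-NI/AVX path is selected
automatically at setkey time, and requests of 8 or more blocks (128+ bytes)
per call hit the 8-way bulk functions:

  /* Illustration only, not from the patch. */
  #include <stdio.h>
  #include <gcrypt.h>

  int
  main (void)
  {
    gcry_cipher_hd_t hd = NULL;
    unsigned char key[16] = { 0 };      /* SM4 always uses a 128-bit key. */
    unsigned char ctr[16] = { 0 };
    unsigned char buf[8 * 16] = { 0 };  /* Eight 16-byte blocks. */
    gcry_error_t err;

    if (!gcry_check_version (GCRYPT_VERSION))
      return 1;
    gcry_control (GCRYCTL_DISABLE_SECMEM, 0);
    gcry_control (GCRYCTL_INITIALIZATION_FINISHED, 0);

    err = gcry_cipher_open (&hd, GCRY_CIPHER_SM4, GCRY_CIPHER_MODE_CTR, 0);
    if (!err)
      err = gcry_cipher_setkey (hd, key, sizeof key);
    if (!err)
      err = gcry_cipher_setctr (hd, ctr, sizeof ctr);
    if (!err)
      err = gcry_cipher_encrypt (hd, buf, sizeof buf, NULL, 0); /* in-place */
    if (err)
      fprintf (stderr, "sm4-ctr: %s\n", gcry_strerror (err));
    if (hd)
      gcry_cipher_close (hd);
    return err ? 1 : 0;
  }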
Benchmark on AMD Ryzen 7 3700X:
Before:
SM4 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
CBC enc | 8.94 ns/B 106.7 MiB/s 38.66 c/B 4325
CBC dec | 4.78 ns/B 199.7 MiB/s 20.42 c/B 4275
CFB enc | 8.95 ns/B 106.5 MiB/s 38.72 c/B 4325
CFB dec | 4.81 ns/B 198.2 MiB/s 20.57 c/B 4275
CTR enc | 4.81 ns/B 198.2 MiB/s 20.69 c/B 4300
CTR dec | 4.80 ns/B 198.8 MiB/s 20.63 c/B 4300
GCM auth | 0.116 ns/B 8232 MiB/s 0.504 c/B 4351
OCB enc | 4.88 ns/B 195.5 MiB/s 20.86 c/B 4275
OCB dec | 4.85 ns/B 196.6 MiB/s 20.86 c/B 4301
OCB auth | 4.80 ns/B 198.9 MiB/s 20.62 c/B 4301
After (~3.0x faster):
SM4 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
CBC enc | 8.98 ns/B 106.2 MiB/s 38.62 c/B 4300
CBC dec | 1.55 ns/B 613.7 MiB/s 6.64 c/B 4275
CFB enc | 8.96 ns/B 106.4 MiB/s 38.52 c/B 4300
CFB dec | 1.54 ns/B 617.4 MiB/s 6.60 c/B 4275
CTR enc | 1.57 ns/B 607.8 MiB/s 6.75 c/B 4300
CTR dec | 1.57 ns/B 608.9 MiB/s 6.74 c/B 4300
OCB enc | 1.58 ns/B 603.8 MiB/s 6.75 c/B 4275
OCB dec | 1.57 ns/B 605.7 MiB/s 6.73 c/B 4275
OCB auth | 1.53 ns/B 624.5 MiB/s 6.57 c/B 4300
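(The tables are in the format of libgcrypt's benchmarking tool, e.g.
'tests/bench-slope cipher sm4'. The GCM auth line measures GHASH rather
than the block cipher, presumably why it is absent from the second run.)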
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/sm4.c')
-rw-r--r--   cipher/sm4.c   240
1 file changed, 240 insertions, 0 deletions
diff --git a/cipher/sm4.c b/cipher/sm4.c
index 621532fa..87bbfd7d 100644
--- a/cipher/sm4.c
+++ b/cipher/sm4.c
@@ -38,12 +38,35 @@
 # define ATTR_ALIGNED_64
 #endif
 
+/* USE_AESNI_AVX indicates whether to compile with Intel AES-NI/AVX code. */
+#undef USE_AESNI_AVX
+#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+#  define USE_AESNI_AVX 1
+# endif
+#endif
+
+/* Assembly implementations use SystemV ABI, ABI conversion and additional
+ * stack to store XMM6-XMM15 needed on Win64. */
+#undef ASM_FUNC_ABI
+#if defined(USE_AESNI_AVX)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+#  define ASM_FUNC_ABI __attribute__((sysv_abi))
+# else
+#  define ASM_FUNC_ABI
+# endif
+#endif
+
 static const char *sm4_selftest (void);
 
 typedef struct
 {
   u32 rkey_enc[32];
   u32 rkey_dec[32];
+#ifdef USE_AESNI_AVX
+  unsigned int use_aesni_avx:1;
+#endif
 } SM4_context;
 
 static const u32 fk[4] =
@@ -110,6 +133,53 @@ static const u32 ck[] =
   0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279
 };
 
+#ifdef USE_AESNI_AVX
+extern void _gcry_sm4_aesni_avx_expand_key(const byte *key, u32 *rk_enc,
+                                           u32 *rk_dec, const u32 *fk,
+                                           const u32 *ck) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_aesni_avx_ctr_enc(const u32 *rk_enc, byte *out,
+                                        const byte *in, byte *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_aesni_avx_cbc_dec(const u32 *rk_dec, byte *out,
+                                        const byte *in, byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_aesni_avx_cfb_dec(const u32 *rk_enc, byte *out,
+                                        const byte *in, byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_aesni_avx_ocb_enc(const u32 *rk_enc,
+                                        unsigned char *out,
+                                        const unsigned char *in,
+                                        unsigned char *offset,
+                                        unsigned char *checksum,
+                                        const u64 Ls[8]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_aesni_avx_ocb_dec(const u32 *rk_dec,
+                                        unsigned char *out,
+                                        const unsigned char *in,
+                                        unsigned char *offset,
+                                        unsigned char *checksum,
+                                        const u64 Ls[8]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_aesni_avx_ocb_auth(const u32 *rk_enc,
+                                         const unsigned char *abuf,
+                                         unsigned char *offset,
+                                         unsigned char *checksum,
+                                         const u64 Ls[8]) ASM_FUNC_ABI;
+
+extern unsigned int
+_gcry_sm4_aesni_avx_crypt_blk1_8(const u32 *rk, byte *out, const byte *in,
+                                 unsigned int num_blks) ASM_FUNC_ABI;
+
+static inline unsigned int
+sm4_aesni_avx_crypt_blk1_8(const u32 *rk, byte *out, const byte *in,
+                           unsigned int num_blks)
+{
+  return _gcry_sm4_aesni_avx_crypt_blk1_8(rk, out, in, num_blks);
+}
+
+#endif /* USE_AESNI_AVX */
+
 static inline void prefetch_sbox_table(void)
 {
   const volatile byte *vtab = (void *)&sbox_table;
@@ -178,6 +248,15 @@ sm4_expand_key (SM4_context *ctx, const byte *key)
   u32 rk[4];
   int i;
 
+#ifdef USE_AESNI_AVX
+  if (ctx->use_aesni_avx)
+    {
+      _gcry_sm4_aesni_avx_expand_key (key, ctx->rkey_enc, ctx->rkey_dec,
+                                      fk, ck);
+      return;
+    }
+#endif
+
   rk[0] = buf_get_be32(key + 4 * 0) ^ fk[0];
   rk[1] = buf_get_be32(key + 4 * 1) ^ fk[1];
   rk[2] = buf_get_be32(key + 4 * 2) ^ fk[2];
@@ -209,8 +288,10 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
   SM4_context *ctx = context;
   static int init = 0;
   static const char *selftest_failed = NULL;
+  unsigned int hwf = _gcry_get_hw_features ();
 
   (void)hd;
+  (void)hwf;
 
   if (!init)
     {
@@ -225,6 +306,10 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
   if (keylen != 16)
     return GPG_ERR_INV_KEYLEN;
 
+#ifdef USE_AESNI_AVX
+  ctx->use_aesni_avx = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX);
+#endif
+
   sm4_expand_key (ctx, key);
   return 0;
 }
@@ -367,6 +452,21 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr,
   const byte *inbuf = inbuf_arg;
   int burn_stack_depth = 0;
 
+#ifdef USE_AESNI_AVX
+  if (ctx->use_aesni_avx)
+    {
+      /* Process data in 8 block chunks. */
+      while (nblocks >= 8)
+        {
+          _gcry_sm4_aesni_avx_ctr_enc(ctx->rkey_enc, outbuf, inbuf, ctr);
+
+          nblocks -= 8;
+          outbuf += 8 * 16;
+          inbuf += 8 * 16;
+        }
+    }
+#endif
+
   /* Process remaining blocks. */
   if (nblocks)
     {
@@ -377,6 +477,12 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr,
 
       if (0)
         ;
+#ifdef USE_AESNI_AVX
+      else if (ctx->use_aesni_avx)
+        {
+          crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
+        }
+#endif
       else
         {
           prefetch_sbox_table ();
@@ -432,6 +538,21 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv,
   const unsigned char *inbuf = inbuf_arg;
   int burn_stack_depth = 0;
 
+#ifdef USE_AESNI_AVX
+  if (ctx->use_aesni_avx)
+    {
+      /* Process data in 8 block chunks. */
+      while (nblocks >= 8)
+        {
+          _gcry_sm4_aesni_avx_cbc_dec(ctx->rkey_dec, outbuf, inbuf, iv);
+
+          nblocks -= 8;
+          outbuf += 8 * 16;
+          inbuf += 8 * 16;
+        }
+    }
+#endif
+
   /* Process remaining blocks. */
   if (nblocks)
     {
@@ -442,6 +563,12 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv,
 
       if (0)
         ;
+#ifdef USE_AESNI_AVX
+      else if (ctx->use_aesni_avx)
+        {
+          crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
+        }
+#endif
       else
         {
           prefetch_sbox_table ();
@@ -490,6 +617,21 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv,
   const unsigned char *inbuf = inbuf_arg;
   int burn_stack_depth = 0;
 
+#ifdef USE_AESNI_AVX
+  if (ctx->use_aesni_avx)
+    {
+      /* Process data in 8 block chunks. */
+      while (nblocks >= 8)
+        {
+          _gcry_sm4_aesni_avx_cfb_dec(ctx->rkey_enc, outbuf, inbuf, iv);
+
+          nblocks -= 8;
+          outbuf += 8 * 16;
+          inbuf += 8 * 16;
+        }
+    }
+#endif
+
   /* Process remaining blocks. */
   if (nblocks)
     {
@@ -500,6 +642,12 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv,
 
      if (0)
        ;
+#ifdef USE_AESNI_AVX
+      else if (ctx->use_aesni_avx)
+        {
+          crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
+        }
+#endif
      else
        {
          prefetch_sbox_table ();
@@ -551,6 +699,48 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
   u64 blkn = c->u_mode.ocb.data_nblocks;
   int burn_stack_depth = 0;
 
+#ifdef USE_AESNI_AVX
+  if (ctx->use_aesni_avx)
+    {
+      u64 Ls[8];
+      unsigned int n = 8 - (blkn % 8);
+      u64 *l;
+
+      if (nblocks >= 8)
+        {
+          /* Use u64 to store pointers for x32 support (assembly function
+           * assumes 64-bit pointers). */
+          Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+          Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+          Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+          Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+          Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+          Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+          Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+          Ls[(7 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+          l = &Ls[(7 + n) % 8];
+
+          /* Process data in 8 block chunks. */
+          while (nblocks >= 8)
+            {
+              blkn += 8;
+              *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 8);
+
+              if (encrypt)
+                _gcry_sm4_aesni_avx_ocb_enc(ctx->rkey_enc, outbuf, inbuf,
+                                            c->u_iv.iv, c->u_ctr.ctr, Ls);
+              else
+                _gcry_sm4_aesni_avx_ocb_dec(ctx->rkey_dec, outbuf, inbuf,
+                                            c->u_iv.iv, c->u_ctr.ctr, Ls);
+
+              nblocks -= 8;
+              outbuf += 8 * 16;
+              inbuf += 8 * 16;
+            }
+        }
+    }
+#endif
+
   if (nblocks)
     {
       unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
@@ -561,6 +751,12 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 
      if (0)
        ;
+#ifdef USE_AESNI_AVX
+      else if (ctx->use_aesni_avx)
+        {
+          crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
+        }
+#endif
      else
        {
          prefetch_sbox_table ();
@@ -625,6 +821,44 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
   const unsigned char *abuf = abuf_arg;
   u64 blkn = c->u_mode.ocb.aad_nblocks;
 
+#ifdef USE_AESNI_AVX
+  if (ctx->use_aesni_avx)
+    {
+      u64 Ls[8];
+      unsigned int n = 8 - (blkn % 8);
+      u64 *l;
+
+      if (nblocks >= 8)
+        {
+          /* Use u64 to store pointers for x32 support (assembly function
+           * assumes 64-bit pointers). */
+          Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+          Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+          Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+          Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+          Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+          Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+          Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+          Ls[(7 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+          l = &Ls[(7 + n) % 8];
+
+          /* Process data in 8 block chunks. */
+          while (nblocks >= 8)
+            {
+              blkn += 8;
+              *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 8);
+
+              _gcry_sm4_aesni_avx_ocb_auth(ctx->rkey_enc, abuf,
+                                           c->u_mode.ocb.aad_offset,
+                                           c->u_mode.ocb.aad_sum, Ls);
+
+              nblocks -= 8;
+              abuf += 8 * 16;
+            }
+        }
+    }
+#endif
+
   if (nblocks)
     {
       unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
@@ -634,6 +868,12 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
 
      if (0)
        ;
+#ifdef USE_AESNI_AVX
+      else if (ctx->use_aesni_avx)
+        {
+          crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
+        }
+#endif
      else
        {
          prefetch_sbox_table ();
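
A note on the Ls[] tables above (explanatory, not from the commit): in OCB,
block number i is whitened with offset L[ntz(i)], where ntz(i) is the number
of trailing zero bits of i. Within any aligned run of eight blocks the first
seven ntz values follow the fixed pattern 0,1,0,2,0,1,0 (hence the constant
L[0]/L[1]/L[0]/L[2]/L[0]/L[1]/L[0] layout, rotated by n when the starting
block count is not a multiple of eight); only the eighth value depends on the
absolute block number, which is why each chunk iteration rewrites the single
slot *l via ocb_get_l(). A standalone sketch of the pattern:

  /* Illustrative only: prints which OCB offset L[ntz(i)] each block uses. */
  #include <stdio.h>

  static unsigned int
  ntz (unsigned long long i)
  {
    unsigned int n = 0;
    while ((i & 1) == 0)
      {
        i >>= 1;
        n++;
      }
    return n;
  }

  int
  main (void)
  {
    unsigned long long blkn;

    for (blkn = 1; blkn <= 16; blkn++)
      printf ("block %2llu -> L[%u]\n", blkn, ntz (blkn));
    /* Blocks 1..8 use L[0] L[1] L[0] L[2] L[0] L[1] L[0] L[3]; blocks
     * 9..16 repeat the same first seven slots, but block 16 -> L[4]. */
    return 0;
  }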