author     Jussi Kivilinna <jussi.kivilinna@iki.fi>    2022-04-24 21:32:58 +0300
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>    2022-04-30 13:01:41 +0300
commit     e239738b4af28b64ab617900fced8a216552e9f1 (patch)
tree       b8e160490c4b94013ef89e7ecdb23f0e91c841f4
parent     5095d60af42d898311d66b10f5204a3418a4a8af (diff)
download   libgcrypt-e239738b4af28b64ab617900fced8a216552e9f1.tar.gz
sm4-aesni-avx2: add generic 1 to 16 block bulk processing function
* cipher/sm4-aesni-avx2-amd64.S: Remove unnecessary vzeroupper at
function entries.
(_gcry_sm4_aesni_avx2_crypt_blk1_16): New.
* cipher/sm4.c (_gcry_sm4_aesni_avx2_crypt_blk1_16)
(sm4_aesni_avx2_crypt_blk1_16): New.
(sm4_get_crypt_blk1_16_fn) [USE_AESNI_AVX2]: Add
'sm4_aesni_avx2_crypt_blk1_16'.
--
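The new _gcry_sm4_aesni_avx2_crypt_blk1_16 entry point lets the bulk code hand
any residual 1..16 blocks to the AVX2 implementation in a single call, while
the C wrapper falls back to the existing 1..8 block AVX code for short inputs.
As a rough illustration only (the typedef and the sm4_bulk_crypt helper below
are invented for this example and are not part of the patch), a caller can
drain an arbitrary number of 16-byte blocks through such a function like this:

  typedef unsigned char byte;

  /* A blk1_16-style bulk function processes 1..16 blocks per call. */
  typedef unsigned int (*crypt_blk1_16_fn_t)(const void *rk, byte *out,
                                             const byte *in,
                                             unsigned int num_blks);

  static void
  sm4_bulk_crypt(crypt_blk1_16_fn_t crypt_blk1_16, const void *rk,
                 byte *out, const byte *in, unsigned long nblocks)
  {
    /* Feed at most 16 blocks per call; the final, possibly short chunk
     * goes through the very same entry point, which is the point of the
     * generic 1..16 block interface. */
    while (nblocks > 0)
      {
        unsigned int curr = nblocks > 16 ? 16 : (unsigned int)nblocks;

        crypt_blk1_16(rk, out, in, curr);
        in += curr * 16;
        out += curr * 16;
        nblocks -= curr;
      }
  }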
Benchmark AMD Ryzen 5800X:

Before:
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        XTS enc |      1.48 ns/B     643.2 MiB/s      7.19 c/B      4850
        XTS dec |      1.48 ns/B     644.3 MiB/s      7.18 c/B      4850

After (1.37x faster):
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        XTS enc |      1.07 ns/B     888.7 MiB/s      5.21 c/B      4850
        XTS dec |      1.07 ns/B     889.4 MiB/s      5.20 c/B      4850

Benchmark on Intel i5-6200U 2.30GHz:

Before:
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        XTS enc |      2.95 ns/B     323.0 MiB/s      8.25 c/B      2792
        XTS dec |      2.95 ns/B     323.0 MiB/s      8.24 c/B      2792

After (1.64x faster):
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        XTS enc |      1.79 ns/B     531.4 MiB/s      5.01 c/B      2791
        XTS dec |      1.79 ns/B     531.6 MiB/s      5.01 c/B      2791
Reviewed-and-tested-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
-rw-r--r--  cipher/sm4-aesni-avx2-amd64.S | 82
-rw-r--r--  cipher/sm4.c                  | 26
2 files changed, 95 insertions, 13 deletions
diff --git a/cipher/sm4-aesni-avx2-amd64.S b/cipher/sm4-aesni-avx2-amd64.S
index effe590b..e09fed8f 100644
--- a/cipher/sm4-aesni-avx2-amd64.S
+++ b/cipher/sm4-aesni-avx2-amd64.S
@@ -1,6 +1,6 @@
 /* sm4-avx2-amd64.S - AVX2 implementation of SM4 cipher
  *
- * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2020, 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -45,11 +45,19 @@
 #define RA1          %ymm9
 #define RA2          %ymm10
 #define RA3          %ymm11
+#define RA0x         %xmm8
+#define RA1x         %xmm9
+#define RA2x         %xmm10
+#define RA3x         %xmm11
 
 #define RB0          %ymm12
 #define RB1          %ymm13
 #define RB2          %ymm14
 #define RB3          %ymm15
+#define RB0x         %xmm12
+#define RB1x         %xmm13
+#define RB2x         %xmm14
+#define RB3x         %xmm15
 
 #define RNOT         %ymm0
 #define RBSWAP       %ymm1
@@ -280,6 +288,66 @@ __sm4_crypt_blk16:
         CFI_ENDPROC();
 ELF(.size __sm4_crypt_blk16,.-__sm4_crypt_blk16;)
 
+.align 8
+.globl _gcry_sm4_aesni_avx2_crypt_blk1_16
+ELF(.type _gcry_sm4_aesni_avx2_crypt_blk1_16,@function;)
+_gcry_sm4_aesni_avx2_crypt_blk1_16:
+        /* input:
+         *      %rdi: round key array, CTX
+         *      %rsi: dst (1..16 blocks)
+         *      %rdx: src (1..16 blocks)
+         *      %rcx: num blocks (1..16)
+         */
+        CFI_STARTPROC();
+
+#define LOAD_INPUT(offset, yreg) \
+        cmpq $(1 + 2 * (offset)), %rcx; \
+        jb .Lblk16_load_input_done; \
+        ja 1f; \
+          vmovdqu (offset) * 32(%rdx), yreg##x; \
+          jmp .Lblk16_load_input_done; \
+        1: \
+          vmovdqu (offset) * 32(%rdx), yreg;
+
+        LOAD_INPUT(0, RA0);
+        LOAD_INPUT(1, RA1);
+        LOAD_INPUT(2, RA2);
+        LOAD_INPUT(3, RA3);
+        LOAD_INPUT(4, RB0);
+        LOAD_INPUT(5, RB1);
+        LOAD_INPUT(6, RB2);
+        LOAD_INPUT(7, RB3);
+#undef LOAD_INPUT
+
+.Lblk16_load_input_done:
+        call __sm4_crypt_blk16;
+
+#define STORE_OUTPUT(yreg, offset) \
+        cmpq $(1 + 2 * (offset)), %rcx; \
+        jb .Lblk16_store_output_done; \
+        ja 1f; \
+          vmovdqu yreg##x, (offset) * 32(%rsi); \
+          jmp .Lblk16_store_output_done; \
+        1: \
+          vmovdqu yreg, (offset) * 32(%rsi);
+
+        STORE_OUTPUT(RA0, 0);
+        STORE_OUTPUT(RA1, 1);
+        STORE_OUTPUT(RA2, 2);
+        STORE_OUTPUT(RA3, 3);
+        STORE_OUTPUT(RB0, 4);
+        STORE_OUTPUT(RB1, 5);
+        STORE_OUTPUT(RB2, 6);
+        STORE_OUTPUT(RB3, 7);
+#undef STORE_OUTPUT
+
+.Lblk16_store_output_done:
+        vzeroall;
+        xorl %eax, %eax;
+        ret_spec_stop;
+        CFI_ENDPROC();
+ELF(.size _gcry_sm4_aesni_avx2_crypt_blk1_16,.-_gcry_sm4_aesni_avx2_crypt_blk1_16;)
+
 #define inc_le128(x, minus_one, tmp) \
         vpcmpeqq minus_one, x, tmp; \
         vpsubq minus_one, x, x; \
@@ -301,8 +369,6 @@ _gcry_sm4_aesni_avx2_ctr_enc:
         movq 8(%rcx), %rax;
         bswapq %rax;
 
-        vzeroupper;
-
         vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
         vpcmpeqd RNOT, RNOT, RNOT;
         vpsrldq $8, RNOT, RNOT;   /* ab: -1:0 ; cd: -1:0 */
@@ -410,8 +476,6 @@ _gcry_sm4_aesni_avx2_cbc_dec:
          */
         CFI_STARTPROC();
 
-        vzeroupper;
-
         vmovdqu (0 * 32)(%rdx), RA0;
         vmovdqu (1 * 32)(%rdx), RA1;
         vmovdqu (2 * 32)(%rdx), RA2;
@@ -463,8 +527,6 @@ _gcry_sm4_aesni_avx2_cfb_dec:
          */
         CFI_STARTPROC();
 
-        vzeroupper;
-
         /* Load input */
         vmovdqu (%rcx), RNOTx;
         vinserti128 $1, (%rdx), RNOT, RA0;
@@ -521,8 +583,6 @@ _gcry_sm4_aesni_avx2_ocb_enc:
          */
         CFI_STARTPROC();
 
-        vzeroupper;
-
         subq $(4 * 8), %rsp;
         CFI_ADJUST_CFA_OFFSET(4 * 8);
 
@@ -635,8 +695,6 @@ _gcry_sm4_aesni_avx2_ocb_dec:
          */
         CFI_STARTPROC();
 
-        vzeroupper;
-
         subq $(4 * 8), %rsp;
         CFI_ADJUST_CFA_OFFSET(4 * 8);
 
@@ -758,8 +816,6 @@ _gcry_sm4_aesni_avx2_ocb_auth:
          */
         CFI_STARTPROC();
 
-        vzeroupper;
-
         subq $(4 * 8), %rsp;
         CFI_ADJUST_CFA_OFFSET(4 * 8);
 
diff --git a/cipher/sm4.c b/cipher/sm4.c
index 73fa23f4..7c7bc1ff 100644
--- a/cipher/sm4.c
+++ b/cipher/sm4.c
@@ -291,6 +291,24 @@ extern void
 _gcry_sm4_aesni_avx2_ocb_auth(const u32 *rk_enc, unsigned char *offset,
                               unsigned char *checksum, const u64 Ls[16]) ASM_FUNC_ABI;
 
+
+extern unsigned int
+_gcry_sm4_aesni_avx2_crypt_blk1_16(const u32 *rk, byte *out, const byte *in,
+                                   unsigned int num_blks) ASM_FUNC_ABI;
+
+static inline unsigned int
+sm4_aesni_avx2_crypt_blk1_16(const void *rk, byte *out, const byte *in,
+                             unsigned int num_blks)
+{
+#ifdef USE_AESNI_AVX
+  /* Use 128-bit register implementation for short input. */
+  if (num_blks <= 8)
+    return _gcry_sm4_aesni_avx_crypt_blk1_8(rk, out, in, num_blks);
+#endif
+
+  return _gcry_sm4_aesni_avx2_crypt_blk1_16(rk, out, in, num_blks);
+}
+
 #endif /* USE_AESNI_AVX2 */
 
 #ifdef USE_GFNI_AVX2
@@ -382,6 +400,7 @@ sm4_aarch64_crypt_blk1_16(const void *rk, byte *out, const byte *in,
     _gcry_sm4_aarch64_crypt_blk1_8(rk, out, in, num_blks);
   return 0;
 }
+
 #endif /* USE_AARCH64_SIMD */
 
 #ifdef USE_ARM_CE
@@ -427,6 +446,7 @@ sm4_armv8_ce_crypt_blk1_16(const void *rk, byte *out, const byte *in,
     _gcry_sm4_armv8_ce_crypt_blk1_8(rk, out, in, num_blks);
   return 0;
 }
+
 #endif /* USE_ARM_CE */
 
 static inline void prefetch_sbox_table(void)
@@ -771,6 +791,12 @@ sm4_get_crypt_blk1_16_fn(SM4_context *ctx)
       return &sm4_gfni_avx2_crypt_blk1_16;
     }
 #endif
+#ifdef USE_AESNI_AVX2
+  else if (ctx->use_aesni_avx2)
+    {
+      return &sm4_aesni_avx2_crypt_blk1_16;
+    }
+#endif
 #ifdef USE_AESNI_AVX
   else if (ctx->use_aesni_avx)
     {
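For readers less familiar with AVX2 assembly, the LOAD_INPUT/STORE_OUTPUT macro
chains above can be modelled in C roughly as follows. This is an explanatory
sketch only: regs[], load_xmm(), load_ymm() and load_1_to_16_blocks() are
stand-ins invented for the example, not libgcrypt code.

  #include <stdio.h>
  #include <string.h>

  /* Stand-ins for the 128-bit and 256-bit vmovdqu loads into register slot i. */
  static unsigned char regs[8][32];

  static void load_xmm(unsigned int i, const unsigned char *src)
  { memcpy(regs[i], src, 16); }

  static void load_ymm(unsigned int i, const unsigned char *src)
  { memcpy(regs[i], src, 32); }

  /* Model of the LOAD_INPUT chain: each ymm slot holds two 16-byte
   * blocks, so slot i gets a full 256-bit load when num_blks is at
   * least 2*i + 2, only its low 128-bit half when num_blks is exactly
   * 2*i + 1, and the loop stops as soon as nothing is left.
   * STORE_OUTPUT mirrors this with the copy direction reversed. */
  static void load_1_to_16_blocks(const unsigned char *in,
                                  unsigned int num_blks)
  {
    unsigned int i;

    for (i = 0; i < 8; i++)            /* slots RA0..RA3, RB0..RB3 */
      {
        if (num_blks < 2 * i + 1)      /* "jb": nothing left, done */
          break;
        if (num_blks == 2 * i + 1)     /* odd tail: one block only */
          {
            load_xmm(i, in + 32 * i);
            break;
          }
        load_ymm(i, in + 32 * i);      /* two blocks: full ymm slot */
      }
  }

  int main(void)
  {
    unsigned char buf[16 * 16] = { 0 };

    load_1_to_16_blocks(buf, 5);       /* RA0, RA1 full; RA2 low half only */
    printf("loaded 5 blocks into 3 register slots\n");
    return 0;
  }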