summaryrefslogtreecommitdiff
path: root/cipher/sm4.c
diff options
context:
space:
mode:
author: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2022-07-21 11:05:38 +0300
committer: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2022-07-21 11:05:38 +0300
commit: eaed633c1662d8a98042ac146c981113f2807b22 (patch)
tree: 5d0977724cbf429c34f2bc52dfe6f2f32406a2c6 /cipher/sm4.c
parent: 2dc2654006746a25f9cb6b24786867f1725ac244 (diff)
download: libgcrypt-eaed633c1662d8a98042ac146c981113f2807b22.tar.gz
sm4: add amd64 GFNI/AVX512 implementation
* cipher/Makefile.am: Add 'sm4-gfni-avx512-amd64.S'. * cipher/sm4-gfni-avx512-amd64.S: New. * cipher/sm4.c (USE_GFNI_AVX512): New. (SM4_context): Add 'use_gfni_avx512' and 'crypt_blk1_16'. (_gcry_sm4_gfni_avx512_expand_key, _gcry_sm4_gfni_avx512_ctr_enc) (_gcry_sm4_gfni_avx512_cbc_dec, _gcry_sm4_gfni_avx512_cfb_dec) (_gcry_sm4_gfni_avx512_ocb_enc, _gcry_sm4_gfni_avx512_ocb_dec) (_gcry_sm4_gfni_avx512_ocb_auth, _gcry_sm4_gfni_avx512_ctr_enc_blk32) (_gcry_sm4_gfni_avx512_cbc_dec_blk32) (_gcry_sm4_gfni_avx512_cfb_dec_blk32) (_gcry_sm4_gfni_avx512_ocb_enc_blk32) (_gcry_sm4_gfni_avx512_ocb_dec_blk32) (_gcry_sm4_gfni_avx512_crypt_blk1_16) (_gcry_sm4_gfni_avx512_crypt_blk32, sm4_gfni_avx512_crypt_blk1_16) (sm4_crypt_blk1_32, sm4_encrypt_blk1_32, sm4_decrypt_blk1_32): New. (sm4_expand_key): Add GFNI/AVX512 code-path. (sm4_setkey): Use GFNI/AVX512 if supported by CPU; set up `ctx->crypt_blk1_16`. (sm4_encrypt, sm4_decrypt, sm4_get_crypt_blk1_16_fn, _gcry_sm4_ctr_enc) (_gcry_sm4_cbc_dec, _gcry_sm4_cfb_dec, _gcry_sm4_ocb_crypt) (_gcry_sm4_ocb_auth) [USE_GFNI_AVX512]: Add GFNI/AVX512 code path. (_gcry_sm4_xts_crypt): Change parallel block size from 16 to 32. * configure.ac: Add 'sm4-gfni-avx512-amd64.lo'.
-- Benchmark on Intel i3-1115G4 (tigerlake): Before: SM4 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz CBC enc | 9.45 ns/B 101.0 MiB/s 38.63 c/B 4089 CBC dec | 0.647 ns/B 1475 MiB/s 2.64 c/B 4089 CFB enc | 9.43 ns/B 101.1 MiB/s 38.57 c/B 4089 CFB dec | 0.648 ns/B 1472 MiB/s 2.65 c/B 4089 CTR enc | 0.661 ns/B 1443 MiB/s 2.70 c/B 4089 CTR dec | 0.661 ns/B 1444 MiB/s 2.70 c/B 4089 XTS enc | 0.767 ns/B 1243 MiB/s 3.14 c/B 4089 XTS dec | 0.772 ns/B 1235 MiB/s 3.16 c/B 4089 OCB enc | 0.671 ns/B 1421 MiB/s 2.74 c/B 4089 OCB dec | 0.676 ns/B 1410 MiB/s 2.77 c/B 4089 OCB auth | 0.668 ns/B 1428 MiB/s 2.73 c/B 4090 After: SM4 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz CBC enc | 7.80 ns/B 122.2 MiB/s 31.91 c/B 4090 CBC dec | 0.293 ns/B 3258 MiB/s 1.20 c/B 4095±3 CFB enc | 7.80 ns/B 122.2 MiB/s 31.90 c/B 4089 CFB dec | 0.294 ns/B 3247 MiB/s 1.20 c/B 4096±3 CTR enc | 0.306 ns/B 3120 MiB/s 1.25 c/B 4098±4 CTR dec | 0.300 ns/B 3182 MiB/s 1.23 c/B 4103±6 XTS enc | 0.431 ns/B 2211 MiB/s 1.77 c/B 4107±9 XTS dec | 0.431 ns/B 2213 MiB/s 1.77 c/B 4102±6 OCB enc | 0.324 ns/B 2946 MiB/s 1.33 c/B 4096±3 OCB dec | 0.326 ns/B 2923 MiB/s 1.34 c/B 4093±2 OCB auth | 0.536 ns/B 1779 MiB/s 2.19 c/B 4089 CBC/CFB enc: 1.20x faster CBC/CFB dec: 2.20x faster CTR: 2.18x faster XTS: 1.78x faster OCB enc/dec: 2.07x faster OCB auth: 1.24x faster Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/sm4.c')
-rw-r--r-- cipher/sm4.c 336
1 files changed, 322 insertions, 14 deletions
diff --git a/cipher/sm4.c b/cipher/sm4.c
index 062a14f4..02c399a9 100644
--- a/cipher/sm4.c
+++ b/cipher/sm4.c
@@ -65,10 +65,20 @@
# endif
#endif
+/* USE_GFNI_AVX512 indicates whether to compile with Intel GFNI/AVX512 code. */
+#undef USE_GFNI_AVX512
+#if defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX512_SUPPORT)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_GFNI_AVX512 1
+# endif
+#endif
+
/* Assembly implementations use SystemV ABI, ABI conversion and additional
* stack to store XMM6-XMM15 needed on Win64. */
#undef ASM_FUNC_ABI
-#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) || defined(USE_GFNI_AVX2)
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) || \
+ defined(USE_GFNI_AVX2) || defined(USE_GFNI_AVX512)
# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
# define ASM_FUNC_ABI __attribute__((sysv_abi))
# else
@@ -125,10 +135,15 @@ static size_t _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
static size_t _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
size_t nblocks);
+typedef unsigned int (*crypt_blk1_16_fn_t) (const void *ctx, byte *out,
+ const byte *in,
+ unsigned int num_blks);
+
typedef struct
{
u32 rkey_enc[32];
u32 rkey_dec[32];
+ crypt_blk1_16_fn_t crypt_blk1_16;
#ifdef USE_AESNI_AVX
unsigned int use_aesni_avx:1;
#endif
@@ -138,6 +153,9 @@ typedef struct
#ifdef USE_GFNI_AVX2
unsigned int use_gfni_avx2:1;
#endif
+#ifdef USE_GFNI_AVX512
+ unsigned int use_gfni_avx512:1;
+#endif
#ifdef USE_AARCH64_SIMD
unsigned int use_aarch64_simd:1;
#endif
@@ -149,10 +167,6 @@ typedef struct
#endif
} SM4_context;
-typedef unsigned int (*crypt_blk1_16_fn_t) (const void *ctx, byte *out,
- const byte *in,
- unsigned int num_blks);
-
static const u32 fk[4] =
{
0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc
@@ -217,6 +231,8 @@ static const u32 ck[] =
0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279
};
+static inline crypt_blk1_16_fn_t sm4_get_crypt_blk1_16_fn(SM4_context *ctx);
+
#ifdef USE_AESNI_AVX
extern void _gcry_sm4_aesni_avx_expand_key(const byte *key, u32 *rk_enc,
u32 *rk_dec, const u32 *fk,
@@ -374,6 +390,86 @@ sm4_gfni_avx2_crypt_blk1_16(const void *rk, byte *out, const byte *in,
#endif /* USE_GFNI_AVX2 */
+#ifdef USE_GFNI_AVX512
+extern void _gcry_sm4_gfni_avx512_expand_key(const byte *key, u32 *rk_enc,
+ u32 *rk_dec, const u32 *fk,
+ const u32 *ck) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_ctr_enc(const u32 *rk_enc, byte *out,
+ const byte *in,
+ byte *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_cbc_dec(const u32 *rk_dec, byte *out,
+ const byte *in,
+ byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_cfb_dec(const u32 *rk_enc, byte *out,
+ const byte *in,
+ byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_ocb_enc(const u32 *rk_enc,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_ocb_dec(const u32 *rk_dec,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_ocb_auth(const u32 *rk_enc,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_ctr_enc_blk32(const u32 *rk_enc, byte *out,
+ const byte *in,
+ byte *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_cbc_dec_blk32(const u32 *rk_enc, byte *out,
+ const byte *in,
+ byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_cfb_dec_blk32(const u32 *rk_enc, byte *out,
+ const byte *in,
+ byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_ocb_enc_blk32(const u32 *rk_enc,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx512_ocb_dec_blk32(const u32 *rk_dec,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern unsigned int
+_gcry_sm4_gfni_avx512_crypt_blk1_16(const u32 *rk, byte *out, const byte *in,
+ unsigned int num_blks) ASM_FUNC_ABI;
+
+extern unsigned int
+_gcry_sm4_gfni_avx512_crypt_blk32(const u32 *rk, byte *out,
+ const byte *in) ASM_FUNC_ABI;
+
+static inline unsigned int
+sm4_gfni_avx512_crypt_blk1_16(const void *rk, byte *out, const byte *in,
+ unsigned int num_blks)
+{
+ return _gcry_sm4_gfni_avx512_crypt_blk1_16(rk, out, in, num_blks);
+}
+
+#endif /* USE_GFNI_AVX512 */
+
#ifdef USE_AARCH64_SIMD
extern void _gcry_sm4_aarch64_crypt(const u32 *rk, byte *out,
const byte *in,
@@ -561,6 +657,15 @@ sm4_expand_key (SM4_context *ctx, const byte *key)
u32 rk[4];
int i;
+#ifdef USE_GFNI_AVX512
+ if (ctx->use_gfni_avx512)
+ {
+ _gcry_sm4_gfni_avx512_expand_key (key, ctx->rkey_enc, ctx->rkey_dec,
+ fk, ck);
+ return;
+ }
+#endif
+
#ifdef USE_GFNI_AVX2
if (ctx->use_gfni_avx2)
{
@@ -645,6 +750,9 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
#ifdef USE_GFNI_AVX2
ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2);
#endif
+#ifdef USE_GFNI_AVX512
+ ctx->use_gfni_avx512 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX512);
+#endif
#ifdef USE_AARCH64_SIMD
ctx->use_aarch64_simd = !!(hwf & HWF_ARM_NEON);
#endif
@@ -670,6 +778,8 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
}
#endif
+ ctx->crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
+
/* Setup bulk encryption routines. */
memset (bulk_ops, 0, sizeof(*bulk_ops));
bulk_ops->cbc_dec = _gcry_sm4_cbc_dec;
@@ -715,6 +825,11 @@ sm4_encrypt (void *context, byte *outbuf, const byte *inbuf)
{
SM4_context *ctx = context;
+#ifdef USE_GFNI_AVX512
+ if (ctx->use_gfni_avx512)
+ return sm4_gfni_avx512_crypt_blk1_16(ctx->rkey_enc, outbuf, inbuf, 1);
+#endif
+
#ifdef USE_GFNI_AVX2
if (ctx->use_gfni_avx2)
return sm4_gfni_avx2_crypt_blk1_16(ctx->rkey_enc, outbuf, inbuf, 1);
@@ -735,6 +850,11 @@ sm4_decrypt (void *context, byte *outbuf, const byte *inbuf)
{
SM4_context *ctx = context;
+#ifdef USE_GFNI_AVX512
+ if (ctx->use_gfni_avx512)
+ return sm4_gfni_avx512_crypt_blk1_16(ctx->rkey_dec, outbuf, inbuf, 1);
+#endif
+
#ifdef USE_GFNI_AVX2
if (ctx->use_gfni_avx2)
return sm4_gfni_avx2_crypt_blk1_16(ctx->rkey_dec, outbuf, inbuf, 1);
@@ -834,6 +954,12 @@ sm4_get_crypt_blk1_16_fn(SM4_context *ctx)
{
if (0)
;
+#ifdef USE_GFNI_AVX512
+ else if (ctx->use_gfni_avx512)
+ {
+ return &sm4_gfni_avx512_crypt_blk1_16;
+ }
+#endif
#ifdef USE_GFNI_AVX2
else if (ctx->use_gfni_avx2)
{
@@ -890,6 +1016,32 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr,
const byte *inbuf = inbuf_arg;
int burn_stack_depth = 0;
+#ifdef USE_GFNI_AVX512
+ if (ctx->use_gfni_avx512)
+ {
+ /* Process data in 32 block chunks. */
+ while (nblocks >= 32)
+ {
+ _gcry_sm4_gfni_avx512_ctr_enc_blk32(ctx->rkey_enc,
+ outbuf, inbuf, ctr);
+
+ nblocks -= 32;
+ outbuf += 32 * 16;
+ inbuf += 32 * 16;
+ }
+
+ /* Process data in 16 block chunks. */
+ if (nblocks >= 16)
+ {
+ _gcry_sm4_gfni_avx512_ctr_enc(ctx->rkey_enc, outbuf, inbuf, ctr);
+
+ nblocks -= 16;
+ outbuf += 16 * 16;
+ inbuf += 16 * 16;
+ }
+ }
+#endif
+
#ifdef USE_GFNI_AVX2
if (ctx->use_gfni_avx2)
{
@@ -982,7 +1134,7 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr,
/* Process remaining blocks. */
if (nblocks)
{
- crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
+ crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16;
byte tmpbuf[16 * 16];
unsigned int tmp_used = 16;
size_t nburn;
@@ -1011,6 +1163,31 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv,
const unsigned char *inbuf = inbuf_arg;
int burn_stack_depth = 0;
+#ifdef USE_GFNI_AVX512
+ if (ctx->use_gfni_avx512)
+ {
+ /* Process data in 32 block chunks. */
+ while (nblocks >= 32)
+ {
+ _gcry_sm4_gfni_avx512_cbc_dec_blk32(ctx->rkey_dec, outbuf, inbuf, iv);
+
+ nblocks -= 32;
+ outbuf += 32 * 16;
+ inbuf += 32 * 16;
+ }
+
+ /* Process data in 16 block chunks. */
+ if (nblocks >= 16)
+ {
+ _gcry_sm4_gfni_avx512_cbc_dec(ctx->rkey_dec, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * 16;
+ inbuf += 16 * 16;
+ }
+ }
+#endif
+
#ifdef USE_GFNI_AVX2
if (ctx->use_gfni_avx2)
{
@@ -1103,7 +1280,7 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv,
/* Process remaining blocks. */
if (nblocks)
{
- crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
+ crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16;
unsigned char tmpbuf[16 * 16];
unsigned int tmp_used = 16;
size_t nburn;
@@ -1132,6 +1309,31 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv,
const unsigned char *inbuf = inbuf_arg;
int burn_stack_depth = 0;
+#ifdef USE_GFNI_AVX512
+ if (ctx->use_gfni_avx512)
+ {
+ /* Process data in 32 block chunks. */
+ while (nblocks >= 32)
+ {
+ _gcry_sm4_gfni_avx512_cfb_dec_blk32(ctx->rkey_enc, outbuf, inbuf, iv);
+
+ nblocks -= 32;
+ outbuf += 32 * 16;
+ inbuf += 32 * 16;
+ }
+
+ /* Process data in 16 block chunks. */
+ if (nblocks >= 16)
+ {
+ _gcry_sm4_gfni_avx512_cfb_dec(ctx->rkey_enc, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * 16;
+ inbuf += 16 * 16;
+ }
+ }
+#endif
+
#ifdef USE_GFNI_AVX2
if (ctx->use_gfni_avx2)
{
@@ -1224,7 +1426,7 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv,
/* Process remaining blocks. */
if (nblocks)
{
- crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
+ crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16;
unsigned char tmpbuf[16 * 16];
unsigned int tmp_used = 16;
size_t nburn;
@@ -1241,6 +1443,52 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv,
_gcry_burn_stack(burn_stack_depth);
}
+static unsigned int
+sm4_crypt_blk1_32 (const SM4_context *ctx, byte *outbuf, const byte *inbuf,
+ unsigned int num_blks, const u32 *rk)
+{
+ unsigned int stack_burn_size = 0;
+ unsigned int nburn;
+
+ gcry_assert (num_blks <= 32);
+
+#ifdef USE_GFNI_AVX512
+ if (num_blks == 32 && ctx->use_gfni_avx512)
+ {
+ return _gcry_sm4_gfni_avx512_crypt_blk32 (rk, outbuf, inbuf);
+ }
+#endif
+
+ do
+ {
+ unsigned int curr_blks = num_blks > 16 ? 16 : num_blks;
+ nburn = ctx->crypt_blk1_16 (rk, outbuf, inbuf, curr_blks);
+ stack_burn_size = nburn > stack_burn_size ? nburn : stack_burn_size;
+ outbuf += curr_blks * 16;
+ inbuf += curr_blks * 16;
+ num_blks -= curr_blks;
+ }
+ while (num_blks > 0);
+
+ return stack_burn_size;
+}
+
+static unsigned int
+sm4_encrypt_blk1_32 (const void *context, byte *out, const byte *in,
+ unsigned int num_blks)
+{
+ const SM4_context *ctx = context;
+ return sm4_crypt_blk1_32 (ctx, out, in, num_blks, ctx->rkey_enc);
+}
+
+static unsigned int
+sm4_decrypt_blk1_32 (const void *context, byte *out, const byte *in,
+ unsigned int num_blks)
+{
+ const SM4_context *ctx = context;
+ return sm4_crypt_blk1_32 (ctx, out, in, num_blks, ctx->rkey_dec);
+}
+
/* Bulk encryption/decryption of complete blocks in XTS mode. */
static void
_gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
@@ -1254,13 +1502,13 @@ _gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
/* Process remaining blocks. */
if (nblocks)
{
- crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
- u32 *rk = encrypt ? ctx->rkey_enc : ctx->rkey_dec;
- unsigned char tmpbuf[16 * 16];
+ unsigned char tmpbuf[32 * 16];
unsigned int tmp_used = 16;
size_t nburn;
- nburn = bulk_xts_crypt_128(rk, crypt_blk1_16, outbuf, inbuf, nblocks,
+ nburn = bulk_xts_crypt_128(ctx, encrypt ? sm4_encrypt_blk1_32
+ : sm4_decrypt_blk1_32,
+ outbuf, inbuf, nblocks,
tweak, tmpbuf, sizeof(tmpbuf) / 16,
&tmp_used);
burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
@@ -1283,6 +1531,39 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
u64 blkn = c->u_mode.ocb.data_nblocks;
int burn_stack_depth = 0;
+#ifdef USE_GFNI_AVX512
+ if (ctx->use_gfni_avx512)
+ {
+ u64 Ls[32];
+ u64 *l;
+
+ if (nblocks >= 32)
+ {
+ l = bulk_ocb_prepare_L_pointers_array_blk32 (c, Ls, blkn);
+
+ /* Process data in 32 block chunks. */
+ while (nblocks >= 32)
+ {
+ blkn += 32;
+ *l = (uintptr_t)(void *)ocb_get_l (c, blkn - blkn % 32);
+
+ if (encrypt)
+ _gcry_sm4_gfni_avx512_ocb_enc_blk32 (ctx->rkey_enc, outbuf,
+ inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+ else
+ _gcry_sm4_gfni_avx512_ocb_dec_blk32 (ctx->rkey_dec, outbuf,
+ inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+
+ nblocks -= 32;
+ outbuf += 32 * 16;
+ inbuf += 32 * 16;
+ }
+ }
+ }
+#endif
+
#ifdef USE_GFNI_AVX2
if (ctx->use_gfni_avx2)
{
@@ -1379,7 +1660,7 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
/* Process remaining blocks. */
if (nblocks)
{
- crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
+ crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16;
u32 *rk = encrypt ? ctx->rkey_enc : ctx->rkey_dec;
unsigned char tmpbuf[16 * 16];
unsigned int tmp_used = 16;
@@ -1410,6 +1691,33 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
u64 blkn = c->u_mode.ocb.aad_nblocks;
int burn_stack_depth = 0;
+#ifdef USE_GFNI_AVX512
+ if (ctx->use_gfni_avx512)
+ {
+ u64 Ls[16];
+ u64 *l;
+
+ if (nblocks >= 16)
+ {
+ l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ blkn += 16;
+ *l = (uintptr_t)(void *)ocb_get_l (c, blkn - blkn % 16);
+
+ _gcry_sm4_gfni_avx512_ocb_auth (ctx->rkey_enc, abuf,
+ c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
+
+ nblocks -= 16;
+ abuf += 16 * 16;
+ }
+ }
+ }
+#endif
+
#ifdef USE_GFNI_AVX2
if (ctx->use_gfni_avx2)
{
@@ -1494,7 +1802,7 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
/* Process remaining blocks. */
if (nblocks)
{
- crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
+ crypt_blk1_16_fn_t crypt_blk1_16 = ctx->crypt_blk1_16;
unsigned char tmpbuf[16 * 16];
unsigned int tmp_used = 16;
size_t nburn;