author    Jussi Kivilinna <jussi.kivilinna@iki.fi>  2022-04-24 21:03:24 +0300
committer Jussi Kivilinna <jussi.kivilinna@iki.fi>  2022-04-30 13:01:41 +0300
commit    5095d60af42d898311d66b10f5204a3418a4a8af (patch)
tree      faad48a3d82f826f1f0038e3621689dea3c242da /cipher/sm4.c
parent    aad3381e93846212c2022dba50e621e4b48f3295 (diff)
Add SM4 x86-64/GFNI/AVX2 implementation
* cipher/Makefile.am: Add 'sm4-gfni-avx2-amd64.S'.
* cipher/sm4-gfni-avx2-amd64.S: New.
* cipher/sm4.c (USE_GFNI_AVX2): New.
(SM4_context): Add 'use_gfni_avx2'.
(crypt_blk1_8_fn_t): Rename to...
(crypt_blk1_16_fn_t): ...this.
(sm4_aesni_avx_crypt_blk1_8): Rename to...
(sm4_aesni_avx_crypt_blk1_16): ...this and add handling for 9 to 16
input blocks.
(_gcry_sm4_gfni_avx_expand_key, _gcry_sm4_gfni_avx2_ctr_enc)
(_gcry_sm4_gfni_avx2_cbc_dec, _gcry_sm4_gfni_avx2_cfb_dec)
(_gcry_sm4_gfni_avx2_ocb_enc, _gcry_sm4_gfni_avx2_ocb_dec)
(_gcry_sm4_gfni_avx2_ocb_auth, _gcry_sm4_gfni_avx2_crypt_blk1_16)
(sm4_gfni_avx2_crypt_blk1_16): New.
(sm4_aarch64_crypt_blk1_8): Rename to...
(sm4_aarch64_crypt_blk1_16): ...this and add handling for 9 to 16
input blocks.
(sm4_armv8_ce_crypt_blk1_8): Rename to...
(sm4_armv8_ce_crypt_blk1_16): ...this and add handling for 9 to 16
input blocks.
(sm4_expand_key): Add GFNI/AVX2 path.
(sm4_setkey): Enable GFNI/AVX2 implementation if HW features are
available; disable AESNI implementations when the GFNI implementation
is enabled.
(sm4_encrypt) [USE_GFNI_AVX2]: New.
(sm4_decrypt) [USE_GFNI_AVX2]: New.
(sm4_get_crypt_blk1_8_fn): Rename to...
(sm4_get_crypt_blk1_16_fn): ...this; update to use *_blk1_16
functions; add GFNI/AVX2 selection.
(_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec, _gcry_sm4_cfb_dec)
(_gcry_sm4_ocb_crypt, _gcry_sm4_ocb_auth): Add GFNI/AVX2 path; widen
generic bulk processing from 8 blocks to 16 blocks.
(_gcry_sm4_xts_crypt): Widen generic bulk processing from 8 blocks to
16 blocks.
--

Benchmark on Intel i3-1115G4 (tigerlake):

Before:
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     10.34 ns/B     92.21 MiB/s     42.29 c/B      4089
        ECB dec |     10.34 ns/B     92.24 MiB/s     42.29 c/B      4090
        CBC enc |     11.06 ns/B     86.26 MiB/s     45.21 c/B      4090
        CBC dec |      1.13 ns/B     844.8 MiB/s      4.62 c/B      4090
        CFB enc |     11.06 ns/B     86.27 MiB/s     45.22 c/B      4090
        CFB dec |      1.13 ns/B     846.0 MiB/s      4.61 c/B      4090
        CTR enc |      1.14 ns/B     834.3 MiB/s      4.67 c/B      4089
        CTR dec |      1.14 ns/B     834.5 MiB/s      4.67 c/B      4089
        XTS enc |      1.93 ns/B     494.1 MiB/s      7.89 c/B      4090
        XTS dec |      1.94 ns/B     492.5 MiB/s      7.92 c/B      4090
        OCB enc |      1.16 ns/B     823.3 MiB/s      4.74 c/B      4090
        OCB dec |      1.16 ns/B     818.8 MiB/s      4.76 c/B      4089
       OCB auth |      1.15 ns/B     831.0 MiB/s      4.69 c/B      4089

After:
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |      8.39 ns/B     113.6 MiB/s     34.33 c/B      4090
        ECB dec |      8.40 ns/B     113.5 MiB/s     34.35 c/B      4090
        CBC enc |      9.45 ns/B     101.0 MiB/s     38.63 c/B      4089
        CBC dec |     0.650 ns/B      1468 MiB/s      2.66 c/B      4090
        CFB enc |      9.44 ns/B     101.1 MiB/s     38.59 c/B      4090
        CFB dec |     0.660 ns/B      1444 MiB/s      2.70 c/B      4090
        CTR enc |     0.664 ns/B      1437 MiB/s      2.71 c/B      4090
        CTR dec |     0.664 ns/B      1437 MiB/s      2.71 c/B      4090
        XTS enc |     0.756 ns/B      1262 MiB/s      3.09 c/B      4090
        XTS dec |     0.757 ns/B      1260 MiB/s      3.10 c/B      4090
        OCB enc |     0.673 ns/B      1417 MiB/s      2.75 c/B      4090
        OCB dec |     0.675 ns/B      1413 MiB/s      2.76 c/B      4090
       OCB auth |     0.672 ns/B      1418 MiB/s      2.75 c/B      4090

ECB: 1.2x faster
CBC-enc / CFB-enc: 1.17x faster
CBC-dec / CFB-dec / CTR / OCB: 1.7x faster
XTS: 2.5x faster

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
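[Editor's note] The "handling for 9 to 16 input blocks" mentioned above is mechanical: each sm4_*_crypt_blk1_16() wrapper feeds at most 8 blocks at a time to the existing 8-block kernel, advances the pointers by 8 blocks of 16 bytes, and falls through for the remainder. A self-contained sketch of that shape follows; the memcpy-based crypt_blk1_8() stand-in is hypothetical, not the libgcrypt assembly primitive.

#include <stdio.h>
#include <string.h>

#define BLKSIZE 16  /* SM4 block size in bytes */

/* Hypothetical stand-in for an 8-block assembly kernel such as
 * _gcry_sm4_aesni_avx_crypt_blk1_8(); it merely copies num_blks blocks. */
static void
crypt_blk1_8 (const unsigned char *rk, unsigned char *out,
              const unsigned char *in, unsigned int num_blks)
{
  (void)rk;
  memcpy (out, in, (size_t)num_blks * BLKSIZE);
}

/* The widening pattern used by the sm4_*_crypt_blk1_16() wrappers:
 * accept 1..16 blocks and split anything above 8 into one full
 * 8-block call plus a 1..8 block tail. */
static void
crypt_blk1_16 (const unsigned char *rk, unsigned char *out,
               const unsigned char *in, unsigned int num_blks)
{
  if (num_blks > 8)
    {
      crypt_blk1_8 (rk, out, in, 8);
      in  += 8 * BLKSIZE;
      out += 8 * BLKSIZE;
      num_blks -= 8;
    }
  crypt_blk1_8 (rk, out, in, num_blks);
}

int
main (void)
{
  unsigned char in[16 * BLKSIZE] = { 0 }, out[16 * BLKSIZE];
  crypt_blk1_16 (NULL, out, in, 13);  /* results in two kernel calls: 8 + 5 */
  printf ("ok: %d\n", memcmp (in, out, 13 * BLKSIZE) == 0);
  return 0;
}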
Diffstat (limited to 'cipher/sm4.c')
-rw-r--r--  cipher/sm4.c | 308
1 file changed, 269 insertions(+), 39 deletions(-)
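[Editor's note] The other pattern worth distilling before the diff is the feature gating in sm4_setkey(): every implementation is enabled from CPU feature bits, then GFNI/AVX2 vetoes the AES-NI paths so the dispatcher never falls back to a slower kernel. A minimal sketch under assumed types follows; 'struct hwfeatures' and its fields are illustrative stand-ins for libgcrypt's HWF_* flags, not its real API.

#include <stdio.h>

/* Hypothetical model of the hardware-feature bits (HWF_INTEL_AESNI,
 * HWF_INTEL_AVX2, HWF_INTEL_GFNI, ...) that libgcrypt detects. */
struct hwfeatures { int aesni, avx, avx2, gfni; };

struct sm4_flags { int use_aesni_avx, use_aesni_avx2, use_gfni_avx2; };

/* Mirror of the precedence set up in sm4_setkey(): detect everything,
 * then disable the AESNI implementations when GFNI is usable. */
static void
sm4_select (const struct hwfeatures *hwf, struct sm4_flags *f)
{
  f->use_aesni_avx  = hwf->aesni && hwf->avx;
  f->use_aesni_avx2 = hwf->aesni && hwf->avx2;
  f->use_gfni_avx2  = hwf->gfni  && hwf->avx2;

  if (f->use_gfni_avx2)
    {
      /* GFNI path wins outright; AES-NI paths are switched off. */
      f->use_aesni_avx  = 0;
      f->use_aesni_avx2 = 0;
    }
}

int
main (void)
{
  struct hwfeatures tigerlake = { 1, 1, 1, 1 };
  struct sm4_flags f;

  sm4_select (&tigerlake, &f);
  printf ("gfni_avx2=%d aesni_avx2=%d aesni_avx=%d\n",
          f.use_gfni_avx2, f.use_aesni_avx2, f.use_aesni_avx);
  return 0;
}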
diff --git a/cipher/sm4.c b/cipher/sm4.c
index 600850e2..73fa23f4 100644
--- a/cipher/sm4.c
+++ b/cipher/sm4.c
@@ -1,7 +1,7 @@
/* sm4.c - SM4 Cipher Algorithm
* Copyright (C) 2020 Alibaba Group.
* Copyright (C) 2020 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
- * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2020-2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -48,7 +48,7 @@
# endif
#endif
-/* USE_AESNI_AVX inidicates whether to compile with Intel AES-NI/AVX2 code. */
+/* USE_AESNI_AVX2 indicates whether to compile with Intel AES-NI/AVX2 code. */
#undef USE_AESNI_AVX2
#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
@@ -57,10 +57,19 @@
# endif
#endif
+/* USE_GFNI_AVX2 indicates whether to compile with Intel GFNI/AVX2 code. */
+#undef USE_GFNI_AVX2
+#if defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_GFNI_AVX2 1
+# endif
+#endif
+
/* Assembly implementations use SystemV ABI, ABI conversion and additional
* stack to store XMM6-XMM15 needed on Win64. */
#undef ASM_FUNC_ABI
-#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) || defined(USE_GFNI_AVX2)
# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
# define ASM_FUNC_ABI __attribute__((sysv_abi))
# else
@@ -116,6 +125,9 @@ typedef struct
#ifdef USE_AESNI_AVX2
unsigned int use_aesni_avx2:1;
#endif
+#ifdef USE_GFNI_AVX2
+ unsigned int use_gfni_avx2:1;
+#endif
#ifdef USE_AARCH64_SIMD
unsigned int use_aarch64_simd:1;
#endif
@@ -124,9 +136,9 @@ typedef struct
#endif
} SM4_context;
-typedef unsigned int (*crypt_blk1_8_fn_t) (const void *ctx, byte *out,
- const byte *in,
- unsigned int num_blks);
+typedef unsigned int (*crypt_blk1_16_fn_t) (const void *ctx, byte *out,
+ const byte *in,
+ unsigned int num_blks);
static const u32 fk[4] =
{
@@ -231,9 +243,17 @@ _gcry_sm4_aesni_avx_crypt_blk1_8(const u32 *rk, byte *out, const byte *in,
unsigned int num_blks) ASM_FUNC_ABI;
static inline unsigned int
-sm4_aesni_avx_crypt_blk1_8(const void *rk, byte *out, const byte *in,
- unsigned int num_blks)
+sm4_aesni_avx_crypt_blk1_16(const void *rk, byte *out, const byte *in,
+ unsigned int num_blks)
{
+ if (num_blks > 8)
+ {
+ _gcry_sm4_aesni_avx_crypt_blk1_8(rk, out, in, 8);
+ in += 8 * 16;
+ out += 8 * 16;
+ num_blks -= 8;
+ }
+
return _gcry_sm4_aesni_avx_crypt_blk1_8(rk, out, in, num_blks);
}
@@ -273,6 +293,56 @@ extern void _gcry_sm4_aesni_avx2_ocb_auth(const u32 *rk_enc,
const u64 Ls[16]) ASM_FUNC_ABI;
#endif /* USE_AESNI_AVX2 */
+#ifdef USE_GFNI_AVX2
+extern void _gcry_sm4_gfni_avx_expand_key(const byte *key, u32 *rk_enc,
+ u32 *rk_dec, const u32 *fk,
+ const u32 *ck) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx2_ctr_enc(const u32 *rk_enc, byte *out,
+ const byte *in,
+ byte *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx2_cbc_dec(const u32 *rk_dec, byte *out,
+ const byte *in,
+ byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx2_cfb_dec(const u32 *rk_enc, byte *out,
+ const byte *in,
+ byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx2_ocb_enc(const u32 *rk_enc,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx2_ocb_dec(const u32 *rk_dec,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx2_ocb_auth(const u32 *rk_enc,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern unsigned int
+_gcry_sm4_gfni_avx2_crypt_blk1_16(const u32 *rk, byte *out, const byte *in,
+ unsigned int num_blks) ASM_FUNC_ABI;
+
+static inline unsigned int
+sm4_gfni_avx2_crypt_blk1_16(const void *rk, byte *out, const byte *in,
+ unsigned int num_blks)
+{
+ return _gcry_sm4_gfni_avx2_crypt_blk1_16(rk, out, in, num_blks);
+}
+
+#endif /* USE_GFNI_AVX2 */
+
#ifdef USE_AARCH64_SIMD
extern void _gcry_sm4_aarch64_crypt(const u32 *rk, byte *out,
const byte *in,
@@ -298,10 +368,18 @@ extern void _gcry_sm4_aarch64_crypt_blk1_8(const u32 *rk, byte *out,
size_t num_blocks);
static inline unsigned int
-sm4_aarch64_crypt_blk1_8(const void *rk, byte *out, const byte *in,
- unsigned int num_blks)
+sm4_aarch64_crypt_blk1_16(const void *rk, byte *out, const byte *in,
+ unsigned int num_blks)
{
- _gcry_sm4_aarch64_crypt_blk1_8(rk, out, in, (size_t)num_blks);
+ if (num_blks > 8)
+ {
+ _gcry_sm4_aarch64_crypt_blk1_8(rk, out, in, 8);
+ in += 8 * 16;
+ out += 8 * 16;
+ num_blks -= 8;
+ }
+
+ _gcry_sm4_aarch64_crypt_blk1_8(rk, out, in, num_blks);
return 0;
}
#endif /* USE_AARCH64_SIMD */
@@ -335,10 +413,18 @@ extern void _gcry_sm4_armv8_ce_crypt_blk1_8(const u32 *rk, byte *out,
size_t num_blocks);
static inline unsigned int
-sm4_armv8_ce_crypt_blk1_8(const void *rk, byte *out, const byte *in,
- unsigned int num_blks)
+sm4_armv8_ce_crypt_blk1_16(const void *rk, byte *out, const byte *in,
+ unsigned int num_blks)
{
- _gcry_sm4_armv8_ce_crypt_blk1_8(rk, out, in, (size_t)num_blks);
+ if (num_blks > 8)
+ {
+ _gcry_sm4_armv8_ce_crypt_blk1_8(rk, out, in, 8);
+ in += 8 * 16;
+ out += 8 * 16;
+ num_blks -= 8;
+ }
+
+ _gcry_sm4_armv8_ce_crypt_blk1_8(rk, out, in, num_blks);
return 0;
}
#endif /* USE_ARM_CE */
@@ -411,6 +497,15 @@ sm4_expand_key (SM4_context *ctx, const byte *key)
u32 rk[4];
int i;
+#ifdef USE_GFNI_AVX2
+  if (ctx->use_gfni_avx2)
+ {
+ _gcry_sm4_gfni_avx_expand_key (key, ctx->rkey_enc, ctx->rkey_dec,
+ fk, ck);
+ return;
+ }
+#endif
+
#ifdef USE_AESNI_AVX
if (ctx->use_aesni_avx)
{
@@ -483,6 +578,9 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
#ifdef USE_AESNI_AVX2
ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2);
#endif
+#ifdef USE_GFNI_AVX2
+ ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2);
+#endif
#ifdef USE_AARCH64_SIMD
ctx->use_aarch64_simd = !!(hwf & HWF_ARM_NEON);
#endif
@@ -490,6 +588,19 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
ctx->use_arm_ce = !!(hwf & HWF_ARM_SM4);
#endif
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ {
+ /* Disable AESNI implementations when GFNI implementation is enabled. */
+#ifdef USE_AESNI_AVX
+ ctx->use_aesni_avx = 0;
+#endif
+#ifdef USE_AESNI_AVX2
+ ctx->use_aesni_avx2 = 0;
+#endif
+ }
+#endif
+
/* Setup bulk encryption routines. */
memset (bulk_ops, 0, sizeof(*bulk_ops));
bulk_ops->cbc_dec = _gcry_sm4_cbc_dec;
@@ -535,9 +646,14 @@ sm4_encrypt (void *context, byte *outbuf, const byte *inbuf)
{
SM4_context *ctx = context;
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ return sm4_gfni_avx2_crypt_blk1_16(ctx->rkey_enc, outbuf, inbuf, 1);
+#endif
+
#ifdef USE_ARM_CE
if (ctx->use_arm_ce)
- return sm4_armv8_ce_crypt_blk1_8(ctx->rkey_enc, outbuf, inbuf, 1);
+ return sm4_armv8_ce_crypt_blk1_16(ctx->rkey_enc, outbuf, inbuf, 1);
#endif
prefetch_sbox_table ();
@@ -550,9 +666,14 @@ sm4_decrypt (void *context, byte *outbuf, const byte *inbuf)
{
SM4_context *ctx = context;
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ return sm4_gfni_avx2_crypt_blk1_16(ctx->rkey_dec, outbuf, inbuf, 1);
+#endif
+
#ifdef USE_ARM_CE
if (ctx->use_arm_ce)
- return sm4_armv8_ce_crypt_blk1_8(ctx->rkey_dec, outbuf, inbuf, 1);
+ return sm4_armv8_ce_crypt_blk1_16(ctx->rkey_dec, outbuf, inbuf, 1);
#endif
prefetch_sbox_table ();
@@ -639,27 +760,33 @@ sm4_crypt_blocks (const void *ctx, byte *out, const byte *in,
return burn_depth;
}
-static inline crypt_blk1_8_fn_t
-sm4_get_crypt_blk1_8_fn(SM4_context *ctx)
+static inline crypt_blk1_16_fn_t
+sm4_get_crypt_blk1_16_fn(SM4_context *ctx)
{
if (0)
;
#ifdef USE_GFNI_AVX2
+ else if (ctx->use_gfni_avx2)
+ {
+ return &sm4_gfni_avx2_crypt_blk1_16;
+ }
+#endif
+#ifdef USE_AESNI_AVX
else if (ctx->use_aesni_avx)
{
- return &sm4_aesni_avx_crypt_blk1_8;
+ return &sm4_aesni_avx_crypt_blk1_16;
}
#endif
#ifdef USE_ARM_CE
else if (ctx->use_arm_ce)
{
- return &sm4_armv8_ce_crypt_blk1_8;
+ return &sm4_armv8_ce_crypt_blk1_16;
}
#endif
#ifdef USE_AARCH64_SIMD
else if (ctx->use_aarch64_simd)
{
- return &sm4_aarch64_crypt_blk1_8;
+ return &sm4_aarch64_crypt_blk1_16;
}
#endif
else
@@ -682,6 +809,21 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr,
const byte *inbuf = inbuf_arg;
int burn_stack_depth = 0;
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ {
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_sm4_gfni_avx2_ctr_enc(ctx->rkey_enc, outbuf, inbuf, ctr);
+
+ nblocks -= 16;
+ outbuf += 16 * 16;
+ inbuf += 16 * 16;
+ }
+ }
+#endif
+
#ifdef USE_AESNI_AVX2
if (ctx->use_aesni_avx2)
{
@@ -749,12 +891,12 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr,
/* Process remaining blocks. */
if (nblocks)
{
- crypt_blk1_8_fn_t crypt_blk1_8 = sm4_get_crypt_blk1_8_fn(ctx);
- byte tmpbuf[16 * 8];
+ crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
+ byte tmpbuf[16 * 16];
unsigned int tmp_used = 16;
size_t nburn;
- nburn = bulk_ctr_enc_128(ctx->rkey_enc, crypt_blk1_8, outbuf, inbuf,
+ nburn = bulk_ctr_enc_128(ctx->rkey_enc, crypt_blk1_16, outbuf, inbuf,
nblocks, ctr, tmpbuf, sizeof(tmpbuf) / 16,
&tmp_used);
burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
@@ -778,6 +920,21 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv,
const unsigned char *inbuf = inbuf_arg;
int burn_stack_depth = 0;
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ {
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_sm4_gfni_avx2_cbc_dec(ctx->rkey_dec, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * 16;
+ inbuf += 16 * 16;
+ }
+ }
+#endif
+
#ifdef USE_AESNI_AVX2
if (ctx->use_aesni_avx2)
{
@@ -845,12 +1002,12 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv,
/* Process remaining blocks. */
if (nblocks)
{
- crypt_blk1_8_fn_t crypt_blk1_8 = sm4_get_crypt_blk1_8_fn(ctx);
- unsigned char tmpbuf[16 * 8];
+ crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
+ unsigned char tmpbuf[16 * 16];
unsigned int tmp_used = 16;
size_t nburn;
- nburn = bulk_cbc_dec_128(ctx->rkey_dec, crypt_blk1_8, outbuf, inbuf,
+ nburn = bulk_cbc_dec_128(ctx->rkey_dec, crypt_blk1_16, outbuf, inbuf,
nblocks, iv, tmpbuf, sizeof(tmpbuf) / 16,
&tmp_used);
burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
@@ -874,6 +1031,21 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv,
const unsigned char *inbuf = inbuf_arg;
int burn_stack_depth = 0;
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ {
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_sm4_gfni_avx2_cfb_dec(ctx->rkey_enc, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * 16;
+ inbuf += 16 * 16;
+ }
+ }
+#endif
+
#ifdef USE_AESNI_AVX2
if (ctx->use_aesni_avx2)
{
@@ -941,12 +1113,12 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv,
/* Process remaining blocks. */
if (nblocks)
{
- crypt_blk1_8_fn_t crypt_blk1_8 = sm4_get_crypt_blk1_8_fn(ctx);
- unsigned char tmpbuf[16 * 8];
+ crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
+ unsigned char tmpbuf[16 * 16];
unsigned int tmp_used = 16;
size_t nburn;
- nburn = bulk_cfb_dec_128(ctx->rkey_enc, crypt_blk1_8, outbuf, inbuf,
+ nburn = bulk_cfb_dec_128(ctx->rkey_enc, crypt_blk1_16, outbuf, inbuf,
nblocks, iv, tmpbuf, sizeof(tmpbuf) / 16,
&tmp_used);
burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
@@ -971,13 +1143,13 @@ _gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
/* Process remaining blocks. */
if (nblocks)
{
- crypt_blk1_8_fn_t crypt_blk1_8 = sm4_get_crypt_blk1_8_fn(ctx);
+ crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
u32 *rk = encrypt ? ctx->rkey_enc : ctx->rkey_dec;
- unsigned char tmpbuf[16 * 8];
+ unsigned char tmpbuf[16 * 16];
unsigned int tmp_used = 16;
size_t nburn;
- nburn = bulk_xts_crypt_128(rk, crypt_blk1_8, outbuf, inbuf, nblocks,
+ nburn = bulk_xts_crypt_128(rk, crypt_blk1_16, outbuf, inbuf, nblocks,
tweak, tmpbuf, sizeof(tmpbuf) / 16,
&tmp_used);
burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
@@ -1000,6 +1172,37 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
u64 blkn = c->u_mode.ocb.data_nblocks;
int burn_stack_depth = 0;
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ {
+ u64 Ls[16];
+ u64 *l;
+
+ if (nblocks >= 16)
+ {
+ l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ blkn += 16;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+ if (encrypt)
+ _gcry_sm4_gfni_avx2_ocb_enc(ctx->rkey_enc, outbuf, inbuf,
+ c->u_iv.iv, c->u_ctr.ctr, Ls);
+ else
+ _gcry_sm4_gfni_avx2_ocb_dec(ctx->rkey_dec, outbuf, inbuf,
+ c->u_iv.iv, c->u_ctr.ctr, Ls);
+
+ nblocks -= 16;
+ outbuf += 16 * 16;
+ inbuf += 16 * 16;
+ }
+ }
+ }
+#endif
+
#ifdef USE_AESNI_AVX2
if (ctx->use_aesni_avx2)
{
@@ -1065,13 +1268,13 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
/* Process remaining blocks. */
if (nblocks)
{
- crypt_blk1_8_fn_t crypt_blk1_8 = sm4_get_crypt_blk1_8_fn(ctx);
+ crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
u32 *rk = encrypt ? ctx->rkey_enc : ctx->rkey_dec;
- unsigned char tmpbuf[16 * 8];
+ unsigned char tmpbuf[16 * 16];
unsigned int tmp_used = 16;
size_t nburn;
- nburn = bulk_ocb_crypt_128 (c, rk, crypt_blk1_8, outbuf, inbuf, nblocks,
+ nburn = bulk_ocb_crypt_128 (c, rk, crypt_blk1_16, outbuf, inbuf, nblocks,
&blkn, encrypt, tmpbuf, sizeof(tmpbuf) / 16,
&tmp_used);
burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
@@ -1096,6 +1299,33 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
u64 blkn = c->u_mode.ocb.aad_nblocks;
int burn_stack_depth = 0;
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ {
+ u64 Ls[16];
+ u64 *l;
+
+ if (nblocks >= 16)
+ {
+ l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ blkn += 16;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+ _gcry_sm4_gfni_avx2_ocb_auth(ctx->rkey_enc, abuf,
+ c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
+
+ nblocks -= 16;
+ abuf += 16 * 16;
+ }
+ }
+ }
+#endif
+
#ifdef USE_AESNI_AVX2
if (ctx->use_aesni_avx2)
{
@@ -1153,12 +1383,12 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
/* Process remaining blocks. */
if (nblocks)
{
- crypt_blk1_8_fn_t crypt_blk1_8 = sm4_get_crypt_blk1_8_fn(ctx);
- unsigned char tmpbuf[16 * 8];
+ crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
+ unsigned char tmpbuf[16 * 16];
unsigned int tmp_used = 16;
size_t nburn;
- nburn = bulk_ocb_auth_128 (c, ctx->rkey_enc, crypt_blk1_8, abuf, nblocks,
+ nburn = bulk_ocb_auth_128 (c, ctx->rkey_enc, crypt_blk1_16, abuf, nblocks,
&blkn, tmpbuf, sizeof(tmpbuf) / 16, &tmp_used);
burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;