summaryrefslogtreecommitdiff
path: root/cipher/chacha20.c
diff options
context:
space:
mode:
author: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2020-12-30 17:46:04 +0200
committer: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2020-12-30 17:46:04 +0200
commit: 6a0bb9ab7f886087d7edb0725c90485086a1c0b4 (patch)
tree: 660cef2ba05c1282425ee8d229ebe7fa57e070bf /cipher/chacha20.c
parent: 1d13794780e3d052cd5ed6f900bf5900cf44b377 (diff)
download: libgcrypt-6a0bb9ab7f886087d7edb0725c90485086a1c0b4.tar.gz
Add s390x/zSeries implementation of ChaCha20
* cipher/Makefile.am: Add 'asm-common-s390x.h' and 'chacha20-s390x.S'.
* cipher/asm-common-s390x.h: New.
* cipher/chacha20-s390x.S: New.
* cipher/chacha20.c (USE_S390X_VX): New.
(CHACHA20_context_t): Change 'use_*' bit-field to unsigned type; Add
'use_s390x'.
(_gcry_chacha20_s390x_vx_blocks8)
(_gcry_chacha20_s390x_vx_blocks4_2_1): New.
(chacha20_do_setkey): Add HW feature detect for s390x/VX.
(chacha20_blocks, do_chacha20_encrypt_stream_tail): Add s390x/VX
code-path.
* configure.ac: Add 'chacha20-s390x.lo'.
--

Patch adds VX vector instruction set accelerated ChaCha20 implementation
for zSeries.

Benchmark on z15 (4504 MHz):

Before:
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      2.62 ns/B     364.0 MiB/s     11.80 c/B
     STREAM dec |      2.62 ns/B     363.8 MiB/s     11.81 c/B

After (~5x faster):
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |     0.505 ns/B      1888 MiB/s      2.28 c/B
     STREAM dec |     0.506 ns/B      1887 MiB/s      2.28 c/B

GnuPG-bug-id: 5201
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/chacha20.c')
-rw-r--r-- cipher/chacha20.c 53
1 files changed, 48 insertions, 5 deletions
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index c5967b6f..7b283080 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -97,6 +97,14 @@
# endif
#endif
+/* USE_S390X_VX indicates whether to enable zSeries code. */
+#undef USE_S390X_VX
+#if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9
+# if defined(HAVE_GCC_INLINE_ASM_S390X_VX)
+# define USE_S390X_VX 1
+# endif /* HAVE_GCC_INLINE_ASM_S390X_VX */
+#endif
+
/* Assembly implementations use SystemV ABI, ABI conversion and additional
* stack to store XMM6-XMM15 needed on Win64. */
#undef ASM_FUNC_ABI
@@ -113,10 +121,11 @@ typedef struct CHACHA20_context_s
u32 input[16];
unsigned char pad[CHACHA20_BLOCK_SIZE];
unsigned int unused; /* bytes in the pad. */
- int use_ssse3:1;
- int use_avx2:1;
- int use_neon:1;
- int use_ppc:1;
+ unsigned int use_ssse3:1;
+ unsigned int use_avx2:1;
+ unsigned int use_neon:1;
+ unsigned int use_ppc:1;
+ unsigned int use_s390x:1;
} CHACHA20_context_t;
@@ -168,10 +177,20 @@ unsigned int _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst,
unsigned int _gcry_chacha20_poly1305_ppc8_blocks4(
u32 *state, byte *dst, const byte *src, size_t nblks,
POLY1305_STATE *st, const byte *poly1305_src);
-#endif
+#endif /* SIZEOF_UNSIGNED_LONG == 8 */
#endif /* USE_PPC_VEC */
+#ifdef USE_S390X_VX
+
+unsigned int _gcry_chacha20_s390x_vx_blocks8(u32 *state, byte *dst,
+ const byte *src, size_t nblks);
+
+unsigned int _gcry_chacha20_s390x_vx_blocks4_2_1(u32 *state, byte *dst,
+ const byte *src, size_t nblks);
+
+#endif /* USE_S390X_VX */
+
#ifdef USE_ARMV7_NEON
unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst,
@@ -311,6 +330,13 @@ chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src,
}
#endif
+#ifdef USE_S390X_VX
+ if (ctx->use_s390x)
+ {
+ return _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, dst, src, nblks);
+ }
+#endif
+
return do_chacha20_blocks (ctx->input, dst, src, nblks);
}
@@ -438,6 +464,9 @@ chacha20_do_setkey (CHACHA20_context_t *ctx,
#ifdef USE_PPC_VEC
ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0;
#endif
+#ifdef USE_S390X_VX
+ ctx->use_s390x = (features & HWF_S390X_VX) != 0;
+#endif
(void)features;
@@ -538,6 +567,20 @@ do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf,
}
#endif
+#ifdef USE_S390X_VX
+ if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 8)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 8;
+ nburn = _gcry_chacha20_s390x_vx_blocks8(ctx->input, outbuf, inbuf,
+ nblocks);
+ burn = nburn > burn ? nburn : burn;
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
if (length >= CHACHA20_BLOCK_SIZE)
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;