diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2020-12-30 17:46:04 +0200 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2020-12-30 17:46:04 +0200 |
commit | 6a0bb9ab7f886087d7edb0725c90485086a1c0b4 (patch) | |
tree | 660cef2ba05c1282425ee8d229ebe7fa57e070bf /cipher/chacha20.c | |
parent | 1d13794780e3d052cd5ed6f900bf5900cf44b377 (diff) | |
download | libgcrypt-6a0bb9ab7f886087d7edb0725c90485086a1c0b4.tar.gz |
Add s390x/zSeries implementation of ChaCha20
* cipher/Makefile.am: Add 'asm-common-s390x.h' and 'chacha20-s390x.S'.
* cipher/asm-common-s390x.h: New.
* cipher/chacha20-s390x.S: New.
* cipher/chacha20.c (USE_S390X_VX): New.
(CHACHA20_context_t): Change 'use_*' bit-field to unsigned type; Add
'use_s390x'.
(_gcry_chacha20_s390x_vx_blocks8)
(_gcry_chacha20_s390x_vx_blocks4_2_1): New.
(chacha20_do_setkey): Add HW feature detect for s390x/VX.
(chacha20_blocks, do_chacha20_encrypt_stream_tail): Add s390x/VX
code-path.
* configure.ac: Add 'chacha20-s390x.lo'.
--
This patch adds a ChaCha20 implementation for zSeries that is
accelerated with the VX vector instruction set.
Benchmark on z15 (4504 MHz):
Before:
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 2.62 ns/B 364.0 MiB/s 11.80 c/B
STREAM dec | 2.62 ns/B 363.8 MiB/s 11.81 c/B
After (~5x faster):
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 0.505 ns/B 1888 MiB/s 2.28 c/B
STREAM dec | 0.506 ns/B 1887 MiB/s 2.28 c/B
GnuPG-bug-id: 5201
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/chacha20.c')
-rw-r--r-- | cipher/chacha20.c | 53 |
1 files changed, 48 insertions, 5 deletions
diff --git a/cipher/chacha20.c b/cipher/chacha20.c index c5967b6f..7b283080 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -97,6 +97,14 @@ # endif #endif +/* USE_S390X_VX indicates whether to enable zSeries code. */ +#undef USE_S390X_VX +#if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9 +# if defined(HAVE_GCC_INLINE_ASM_S390X_VX) +# define USE_S390X_VX 1 +# endif /* USE_S390X_VX */ +#endif + /* Assembly implementations use SystemV ABI, ABI conversion and additional * stack to store XMM6-XMM15 needed on Win64. */ #undef ASM_FUNC_ABI @@ -113,10 +121,11 @@ typedef struct CHACHA20_context_s u32 input[16]; unsigned char pad[CHACHA20_BLOCK_SIZE]; unsigned int unused; /* bytes in the pad. */ - int use_ssse3:1; - int use_avx2:1; - int use_neon:1; - int use_ppc:1; + unsigned int use_ssse3:1; + unsigned int use_avx2:1; + unsigned int use_neon:1; + unsigned int use_ppc:1; + unsigned int use_s390x:1; } CHACHA20_context_t; @@ -168,10 +177,20 @@ unsigned int _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, unsigned int _gcry_chacha20_poly1305_ppc8_blocks4( u32 *state, byte *dst, const byte *src, size_t nblks, POLY1305_STATE *st, const byte *poly1305_src); -#endif +#endif /* SIZEOF_UNSIGNED_LONG == 8 */ #endif /* USE_PPC_VEC */ +#ifdef USE_S390X_VX + +unsigned int _gcry_chacha20_s390x_vx_blocks8(u32 *state, byte *dst, + const byte *src, size_t nblks); + +unsigned int _gcry_chacha20_s390x_vx_blocks4_2_1(u32 *state, byte *dst, + const byte *src, size_t nblks); + +#endif /* USE_S390X_VX */ + #ifdef USE_ARMV7_NEON unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst, @@ -311,6 +330,13 @@ chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src, } #endif +#ifdef USE_S390X_VX + if (ctx->use_s390x) + { + return _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, dst, src, nblks); + } +#endif + return do_chacha20_blocks (ctx->input, dst, src, nblks); } @@ -438,6 +464,9 @@ chacha20_do_setkey (CHACHA20_context_t *ctx, #ifdef USE_PPC_VEC 
ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0; #endif +#ifdef USE_S390X_VX + ctx->use_s390x = (features & HWF_S390X_VX) != 0; +#endif (void)features; @@ -538,6 +567,20 @@ do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf, } #endif +#ifdef USE_S390X_VX + if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 8) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 8; + nburn = _gcry_chacha20_s390x_vx_blocks8(ctx->input, outbuf, inbuf, + nblocks); + burn = nburn > burn ? nburn : burn; + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } +#endif + if (length >= CHACHA20_BLOCK_SIZE) { size_t nblocks = length / CHACHA20_BLOCK_SIZE; |