summaryrefslogtreecommitdiff
path: root/cipher/chacha20.c
diff options
context:
space:
mode:
author: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2020-12-30 17:46:07 +0200
committer: Jussi Kivilinna <jussi.kivilinna@iki.fi> 2020-12-30 17:46:07 +0200
commit: 1f75681cbba895ea2f7ea0637900721f4522e729 (patch)
tree: 19eb7a48b5513f9f5811b1e515a3d4c8e637641c /cipher/chacha20.c
parent: 6a0bb9ab7f886087d7edb0725c90485086a1c0b4 (diff)
download: libgcrypt-cipher-s390x-optimizations.tar.gz
Add s390x/zSeries implementation of Poly1305 (branch: cipher-s390x-optimizations)
* cipher/Makefile.am: Add 'poly1305-s390x.S' and 'asm-poly1305-s390x.h'.
* cipher/asm-poly1305-s390x.h: New
* cipher/chacha20-s390x.S (_gcry_chacha20_poly1305_s390x_vx_blocks8)
(_gcry_chacha20_poly1305_s390x_vx_blocks4_2_1): New, stitched
chacha20-poly1305 implementation.
* cipher/chacha20.c (USE_S390X_VX_POLY1305): New.
(_gcry_chacha20_poly1305_s390x_vx_blocks8)
(_gcry_chacha20_poly1305_s390x_vx_blocks4_2_1): New prototypes.
(_gcry_chacha20_poly1305_encrypt, _gcry_chacha20_poly1305_decrypt): Add
s390x/VX stitched chacha20-poly1305 code-path.
* cipher/poly1305-s390x.S: New.
* cipher/poly1305.c (USE_S390X_ASM, HAVE_ASM_POLY1305_BLOCKS): New.
[USE_S390X_ASM] (_gcry_poly1305_s390x_blocks1, poly1305_blocks): New.
* configure.ac (gcry_cv_gcc_inline_asm_s390x): Check for 'risbgn' and
'algrk' instructions.
* tests/basic.c (_check_poly1305_cipher): Add large chacha20-poly1305
test vector.
--

Patch adds Poly1305 and stitched ChaCha20-Poly1305 implementation for
zSeries. Stitched implementation interleaves ChaCha20 and Poly1305
processing for higher instruction level parallelism and better
utilization of execution units.

Benchmark on z15 (4504 Mhz):

Before:
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
 POLY1305 enc   |      1.16 ns/B     823.2 MiB/s      5.22 c/B
 POLY1305 dec   |      1.16 ns/B     823.2 MiB/s      5.22 c/B
 POLY1305 auth  |     0.736 ns/B      1295 MiB/s      3.32 c/B

After (chacha20-poly1305 ~71% faster, poly1305 ~29% faster):
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
 POLY1305 enc   |     0.677 ns/B      1409 MiB/s      3.05 c/B
 POLY1305 dec   |     0.655 ns/B      1456 MiB/s      2.95 c/B
 POLY1305 auth  |     0.569 ns/B      1675 MiB/s      2.56 c/B

GnuPG-bug-id: 5202
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/chacha20.c')
-rw-r--r-- cipher/chacha20.c | 126
1 files changed, 126 insertions, 0 deletions
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index 7b283080..497594a0 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -189,6 +189,18 @@ unsigned int _gcry_chacha20_s390x_vx_blocks8(u32 *state, byte *dst,
unsigned int _gcry_chacha20_s390x_vx_blocks4_2_1(u32 *state, byte *dst,
const byte *src, size_t nblks);
+#undef USE_S390X_VX_POLY1305 /* re-defined just below, only when unsigned long is 8 bytes */
+#if SIZEOF_UNSIGNED_LONG == 8
+#define USE_S390X_VX_POLY1305 1 /* gates the stitched s390x/VX ChaCha20-Poly1305 code paths in this file */
+unsigned int _gcry_chacha20_poly1305_s390x_vx_blocks8( /* callers in this file pass nblks as a multiple of 8 */
+ u32 *state, byte *dst, const byte *src, size_t nblks,
+ POLY1305_STATE *st, const byte *poly1305_src); /* callers fold the return value into their `burn` maximum */
+
+unsigned int _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1( /* callers in this file pass any nblks >= 1 */
+ u32 *state, byte *dst, const byte *src, size_t nblks,
+ POLY1305_STATE *st, const byte *poly1305_src); /* presumably the bytes fed to Poly1305 -- confirm in chacha20-s390x.S */
+#endif /* SIZEOF_UNSIGNED_LONG == 8 */
+
#endif /* USE_S390X_VX */
#ifdef USE_ARMV7_NEON
@@ -759,6 +771,48 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
inbuf += 4 * CHACHA20_BLOCK_SIZE;
}
#endif
+#ifdef USE_S390X_VX_POLY1305 /* s390x/VX: encrypt one leading run of blocks with the plain ChaCha20 routines */
+ else if (ctx->use_s390x && length >= 2 * CHACHA20_BLOCK_SIZE * 8) /* requires 16 blocks remaining, processes 8 */
+ {
+ nburn = _gcry_chacha20_s390x_vx_blocks8(ctx->input, outbuf, inbuf, 8);
+ burn = nburn > burn ? nburn : burn; /* keep the largest value returned so far */
+
+ authptr = outbuf; /* start of the ciphertext just written; consumed by the Poly1305 code that follows */
+ length -= 8 * CHACHA20_BLOCK_SIZE;
+ outbuf += 8 * CHACHA20_BLOCK_SIZE;
+ inbuf += 8 * CHACHA20_BLOCK_SIZE;
+ }
+ else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 4) /* at least 4 blocks remaining: process 4 */
+ {
+ nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 4);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf; /* same bookkeeping as the 8-block branch */
+ length -= 4 * CHACHA20_BLOCK_SIZE;
+ outbuf += 4 * CHACHA20_BLOCK_SIZE;
+ inbuf += 4 * CHACHA20_BLOCK_SIZE;
+ }
+ else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 2) /* at least 2 blocks remaining: process 2 */
+ {
+ nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 2);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 2 * CHACHA20_BLOCK_SIZE;
+ outbuf += 2 * CHACHA20_BLOCK_SIZE;
+ inbuf += 2 * CHACHA20_BLOCK_SIZE;
+ }
+ else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE) /* at least 1 block remaining: process 1 */
+ {
+ nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 1);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 1 * CHACHA20_BLOCK_SIZE;
+ outbuf += 1 * CHACHA20_BLOCK_SIZE;
+ inbuf += 1 * CHACHA20_BLOCK_SIZE;
+ }
+#endif
if (authptr)
{
@@ -862,6 +916,44 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
}
#endif
+#ifdef USE_S390X_VX_POLY1305 /* s390x/VX stitched path: encrypt at outbuf while Poly1305 input is taken from authptr */
+ if (ctx->use_s390x)
+ {
+ if (length >= 8 * CHACHA20_BLOCK_SIZE &&
+ authoffset >= 8 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 8; /* hand the blocks8 routine a multiple of 8 blocks */
+
+ nburn = _gcry_chacha20_poly1305_s390x_vx_blocks8(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, authptr);
+ burn = nburn > burn ? nburn : burn; /* fold into the running maximum; `burn` must not be overwritten by the call */
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ authptr += nblocks * CHACHA20_BLOCK_SIZE; /* authptr advances by the same amount, authoffset is unchanged */
+ }
+
+ if (length >= CHACHA20_BLOCK_SIZE &&
+ authoffset >= CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE; /* every remaining whole block */
+
+ nburn = _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, authptr);
+ burn = nburn > burn ? nburn : burn; /* same max-tracking as in the blocks8 branch */
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ authptr += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+ }
+#endif
+
if (authoffset > 0)
{
_gcry_poly1305_update (&c->u_mode.poly1305.ctx, authptr, authoffset);
@@ -1026,6 +1118,40 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
}
#endif
+#ifdef USE_S390X_VX_POLY1305 /* s390x/VX stitched path for decryption: inbuf is passed both as cipher input and as poly1305_src */
+ if (ctx->use_s390x)
+ {
+ if (length >= 8 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 8; /* hand the blocks8 routine a multiple of 8 blocks */
+
+ nburn = _gcry_chacha20_poly1305_s390x_vx_blocks8(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, inbuf);
+ burn = nburn > burn ? nburn : burn; /* keep the largest value returned so far */
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+
+ if (length >= CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE; /* every remaining whole block */
+
+ nburn = _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, inbuf);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE; /* any partial block is left for the loop that follows */
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+ }
+#endif
+
while (length)
{
size_t currlen = length;