summaryrefslogtreecommitdiff
path: root/cipher/chacha20.c
diff options
context:
space:
mode:
author: Jussi Kivilinna <jussi.kivilinna@iki.fi>, 2019-09-07 01:48:31 +0300
committer: Jussi Kivilinna <jussi.kivilinna@iki.fi>, 2019-09-15 22:52:01 +0300
commit: 557702f0d53a7ad1cf2ce0333c9df799a8abad59 (patch)
tree: 64a09b3cf307f5a59af53bea39cb791540cbe58c /cipher/chacha20.c
parent: 0564757b934d24c7fef10df8594099985fbbc0ac (diff)
download: libgcrypt-557702f0d53a7ad1cf2ce0333c9df799a8abad59.tar.gz
Add PowerPC vector implementation of ChaCha20
* cipher/Makefile.am: Add 'chacha20-ppc.c'.
* cipher/chacha20-ppc.c: New.
* cipher/chacha20.c (USE_PPC_VEC, _gcry_chacha20_ppc8_blocks4)
(_gcry_chacha20_ppc8_blocks1, USE_PPC_VEC_POLY1305)
(_gcry_chacha20_poly1305_ppc8_blocks4): New.
(CHACHA20_context_t): Add 'use_ppc'.
(chacha20_blocks, chacha20_keysetup)
(do_chacha20_encrypt_stream_tail): Add USE_PPC_VEC code.
(_gcry_chacha20_poly1305_encrypt, _gcry_chacha20_poly1305_decrypt): Add
USE_PPC_VEC_POLY1305 code.
* configure.ac: Add 'chacha20-ppc.lo'.
* src/g10lib.h (HWF_PPC_ARCH_2_07): New.
* src/hwf-ppc.c (PPC_FEATURE2_ARCH_2_07): New.
(ppc_features): Add HWF_PPC_ARCH_2_07.
* src/hwfeatures.c (hwflist): Add 'ppc-arch_2_07'.
--

This patch adds 1-way, 2-way and 4-way ChaCha20 vector implementations and
4-way stitched ChaCha20+Poly1305 implementation for PowerPC.

Benchmark on POWER8 (ppc64le, ~3.8Ghz):

Before:
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      2.60 ns/B     366.2 MiB/s      9.90 c/B
     STREAM dec |      2.61 ns/B     366.1 MiB/s      9.90 c/B
   POLY1305 enc |      3.11 ns/B     307.1 MiB/s     11.80 c/B
   POLY1305 dec |      3.11 ns/B     307.0 MiB/s     11.80 c/B
  POLY1305 auth |     0.502 ns/B      1900 MiB/s      1.91 c/B

After (~4x faster):
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |     0.619 ns/B      1540 MiB/s      2.35 c/B
     STREAM dec |     0.619 ns/B      1541 MiB/s      2.35 c/B
   POLY1305 enc |     0.785 ns/B      1215 MiB/s      2.98 c/B
   POLY1305 dec |     0.769 ns/B      1240 MiB/s      2.92 c/B
  POLY1305 auth |     0.502 ns/B      1901 MiB/s      1.91 c/B

Benchmark on POWER9 (ppc64le, ~3.8Ghz):

Before:
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |      2.27 ns/B     419.9 MiB/s      8.63 c/B
     STREAM dec |      2.27 ns/B     419.8 MiB/s      8.63 c/B
   POLY1305 enc |      2.73 ns/B     349.1 MiB/s     10.38 c/B
   POLY1305 dec |      2.73 ns/B     349.3 MiB/s     10.37 c/B
  POLY1305 auth |     0.459 ns/B      2076 MiB/s      1.75 c/B

After (chacha20 ~3x faster, chacha20+poly1305 ~2.5x faster):
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |     0.690 ns/B      1381 MiB/s      2.62 c/B
     STREAM dec |     0.690 ns/B      1382 MiB/s      2.62 c/B
   POLY1305 enc |      1.09 ns/B     878.2 MiB/s      4.13 c/B
   POLY1305 dec |      1.07 ns/B     887.8 MiB/s      4.08 c/B
  POLY1305 auth |     0.459 ns/B      2076 MiB/s      1.75 c/B

GnuPG-bug-id: 4460
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/chacha20.c')
-rw-r--r-- cipher/chacha20.c | 105
1 files changed, 105 insertions, 0 deletions
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index 48fff625..b34d8d19 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -85,6 +85,18 @@
# endif
#endif
+/* USE_PPC_VEC indicates whether to enable PowerPC vector
+ * accelerated code. */
+#undef USE_PPC_VEC
+#ifdef ENABLE_PPC_CRYPTO_SUPPORT
+# if defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
+ defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC)
+# if __GNUC__ >= 4
+# define USE_PPC_VEC 1
+# endif
+# endif
+#endif
+
/* Assembly implementations use SystemV ABI, ABI conversion and additional
* stack to store XMM6-XMM15 needed on Win64. */
#undef ASM_FUNC_ABI
@@ -104,6 +116,7 @@ typedef struct CHACHA20_context_s
int use_ssse3:1;
int use_avx2:1;
int use_neon:1;
+ int use_ppc:1;
} CHACHA20_context_t;
@@ -139,6 +152,26 @@ unsigned int _gcry_chacha20_poly1305_amd64_avx2_blocks8(
#endif /* USE_AVX2 */
+#ifdef USE_PPC_VEC
+
+unsigned int _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst,
+ const byte *src,
+ size_t nblks);
+
+unsigned int _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst,
+ const byte *src,
+ size_t nblks);
+
+#undef USE_PPC_VEC_POLY1305
+#if SIZEOF_UNSIGNED_LONG == 8
+#define USE_PPC_VEC_POLY1305 1
+unsigned int _gcry_chacha20_poly1305_ppc8_blocks4(
+ u32 *state, byte *dst, const byte *src, size_t nblks,
+ POLY1305_STATE *st, const byte *poly1305_src);
+#endif
+
+#endif /* USE_PPC_VEC */
+
#ifdef USE_ARMV7_NEON
unsigned int _gcry_chacha20_armv7_neon_blocks4(u32 *state, byte *dst,
@@ -267,6 +300,13 @@ chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src,
}
#endif
+#ifdef USE_PPC_VEC
+ if (ctx->use_ppc)
+ {
+ return _gcry_chacha20_ppc8_blocks1(ctx->input, dst, src, nblks);
+ }
+#endif
+
return do_chacha20_blocks (ctx->input, dst, src, nblks);
}
@@ -391,6 +431,9 @@ chacha20_do_setkey (CHACHA20_context_t *ctx,
#ifdef USE_AARCH64_SIMD
ctx->use_neon = (features & HWF_ARM_NEON) != 0;
#endif
+#ifdef USE_PPC_VEC
+ ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0;
+#endif
(void)features;
@@ -478,6 +521,19 @@ do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf,
}
#endif
+#ifdef USE_PPC_VEC
+ if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 4;
+ nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, nblocks);
+ burn = nburn > burn ? nburn : burn;
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
if (length >= CHACHA20_BLOCK_SIZE)
{
size_t nblocks = length / CHACHA20_BLOCK_SIZE;
@@ -632,6 +688,18 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
inbuf += 1 * CHACHA20_BLOCK_SIZE;
}
#endif
+#ifdef USE_PPC_VEC_POLY1305
+ else if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4)
+ {
+ nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 4 * CHACHA20_BLOCK_SIZE;
+ outbuf += 4 * CHACHA20_BLOCK_SIZE;
+ inbuf += 4 * CHACHA20_BLOCK_SIZE;
+ }
+#endif
if (authptr)
{
@@ -695,6 +763,26 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
}
#endif
+#ifdef USE_PPC_VEC_POLY1305
+ if (ctx->use_ppc &&
+ length >= 4 * CHACHA20_BLOCK_SIZE &&
+ authoffset >= 4 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 4;
+
+ nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, authptr);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ authptr += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
if (authoffset > 0)
{
_gcry_poly1305_update (&c->u_mode.poly1305.ctx, authptr, authoffset);
@@ -825,6 +913,23 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
}
#endif
+#ifdef USE_PPC_VEC_POLY1305
+ if (ctx->use_ppc && length >= 4 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 4;
+
+ nburn = _gcry_chacha20_poly1305_ppc8_blocks4(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, inbuf);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+#endif
+
while (length)
{
size_t currlen = length;