diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2023-02-26 14:18:42 +0200 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2023-02-26 14:59:16 +0200 |
commit | f3d1d4a8c9f0df107a57e2cd3699253766d6e45a (patch) | |
tree | 1906eedf02055ced6bd0d7ba78db46b049cba41a | |
parent | 100063cf4e1ca3350f05a343d8fa0ccf305debb1 (diff) | |
download | libgcrypt-f3d1d4a8c9f0df107a57e2cd3699253766d6e45a.tar.gz |
chacha20-ppc: use target and optimize attributes for P8 and P9
* cipher/chacha20-ppc.c (_gcry_chacha20_ppc8_blocks1): Rename to...
(chacha20_ppc_blocks1): ...this; Add 'always inline' attribute.
(_gcry_chacha20_ppc8_blocks4): Rename to...
(chacha20_ppc_blocks4): ...this; Add 'always inline' attribute.
(_gcry_chacha20_poly1305_ppc8_blocks4): Rename to...
(chacha20_poly1305_ppc_blocks4): ...this; Add 'always inline'
attribute.
(FUNC_ATTR_OPT_O2, FUNC_ATTR_TARGET_P8, FUNC_ATTR_TARGET_P9): New.
(_gcry_chacha20_ppc8_blocks1, _gcry_chacha20_ppc8_blocks4)
(_gcry_chacha20_poly1305_ppc8_blocks4): New.
(_gcry_chacha20_ppc9_blocks1, _gcry_chacha20_ppc9_blocks4)
(_gcry_chacha20_poly1305_ppc9_blocks4): New.
* cipher/chacha20.c (CHACHA20_context_t): Add 'use_p9'.
(_gcry_chacha20_ppc9_blocks1, _gcry_chacha20_ppc9_blocks4)
(_gcry_chacha20_poly1305_ppc9_blocks4): New.
(chacha20_do_setkey): Set 'use_p9' if HW has HWF_PPC_ARCH_3_00.
(chacha20_blocks, do_chacha20_encrypt_stream_tail)
(_gcry_chacha20_poly1305_encrypt)
(_gcry_chacha20_poly1305_decrypt) [USE_PPC_VEC]: Add 'use_p9' paths.
--
This change makes sure that chacha20-ppc gets compiled
with the proper optimization level and the right target setting.
Benchmark on POWER9:
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 1.11 ns/B 856.0 MiB/s 2.56 c/B
STREAM dec | 1.11 ns/B 856.0 MiB/s 2.56 c/B
POLY1305 enc | 1.57 ns/B 606.2 MiB/s 3.62 c/B
POLY1305 dec | 1.56 ns/B 610.4 MiB/s 3.59 c/B
POLY1305 auth | 0.876 ns/B 1089 MiB/s 2.02 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
-rw-r--r-- | cipher/chacha20-ppc.c | 118 | ||||
-rw-r--r-- | cipher/chacha20.c | 55 |
2 files changed, 154 insertions, 19 deletions
diff --git a/cipher/chacha20-ppc.c b/cipher/chacha20-ppc.c index 4a21b837..3fe7bc8c 100644 --- a/cipher/chacha20-ppc.c +++ b/cipher/chacha20-ppc.c @@ -136,9 +136,8 @@ vec_add_ctr_u64(vector4x_u32 v, vector4x_u32 a) #define ADD_U64(v,a) \ (v = vec_add_ctr_u64(v, a)) -unsigned int ASM_FUNC_ATTR -_gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src, - size_t nblks) +static unsigned int ASM_FUNC_ATTR_INLINE +chacha20_ppc_blocks1(u32 *state, byte *dst, const byte *src, size_t nblks) { vector4x_u32 counter_1 = { 1, 0, 0, 0 }; vector4x_u32 rotate_16 = { 16, 16, 16, 16 }; @@ -283,9 +282,8 @@ _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src, PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \ ROTATE(b1, rotate_7); ROTATE(b2, rotate_7); -unsigned int ASM_FUNC_ATTR -_gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src, - size_t nblks) +static unsigned int ASM_FUNC_ATTR_INLINE +chacha20_ppc_blocks4(u32 *state, byte *dst, const byte *src, size_t nblks) { vector4x_u32 counters_0123 = { 0, 1, 2, 3 }; vector4x_u32 counter_4 = { 4, 0, 0, 0 }; @@ -470,10 +468,10 @@ _gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src, MUL_MOD_1305_64_PART2(h2, h1, h0, r1, r0, r1_mult5); \ } while (0) -unsigned int ASM_FUNC_ATTR -_gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src, - size_t nblks, POLY1305_STATE *st, - const byte *poly1305_src) +static unsigned int ASM_FUNC_ATTR_INLINE +chacha20_poly1305_ppc_blocks4(u32 *state, byte *dst, const byte *src, + size_t nblks, POLY1305_STATE *st, + const byte *poly1305_src) { vector4x_u32 counters_0123 = { 0, 1, 2, 3 }; vector4x_u32 counter_4 = { 4, 0, 0, 0 }; @@ -641,6 +639,106 @@ _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src, return 0; } +#else + +static unsigned int ASM_FUNC_ATTR_INLINE +chacha20_poly1305_ppc_blocks4(u32 *state, byte *dst, const byte *src, + size_t nblks, POLY1305_STATE *st, + const byte *poly1305_src) +{ +} + #endif /* 
SIZEOF_UNSIGNED_LONG == 8 */ + +#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE +# define FUNC_ATTR_OPT_O2 __attribute__((optimize("-O2"))) +#else +# define FUNC_ATTR_OPT_O2 +#endif + +#ifdef HAVE_GCC_ATTRIBUTE_PPC_TARGET +# define FUNC_ATTR_TARGET_P8 __attribute__((target("cpu=power8"))) +# define FUNC_ATTR_TARGET_P9 __attribute__((target("cpu=power9"))) +#else +# define FUNC_ATTR_TARGET_P8 +# define FUNC_ATTR_TARGET_P9 +#endif + + +/* Functions targetting POWER8. */ +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P8 FUNC_ATTR_OPT_O2 +_gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src, + size_t nblks) +{ + return chacha20_ppc_blocks1(state, dst, src, nblks); +} + +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P8 FUNC_ATTR_OPT_O2 +_gcry_chacha20_ppc8_blocks4(u32 *state, byte *dst, const byte *src, + size_t nblks) +{ + return chacha20_ppc_blocks4(state, dst, src, nblks); +} + +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P8 FUNC_ATTR_OPT_O2 +_gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src, + size_t nblks, POLY1305_STATE *st, + const byte *poly1305_src) +{ + return chacha20_poly1305_ppc_blocks4(state, dst, src, nblks, st, + poly1305_src); +} + +#ifdef HAVE_GCC_ATTRIBUTE_PPC_TARGET +/* Functions targetting POWER9. 
*/ +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2 +_gcry_chacha20_ppc9_blocks1(u32 *state, byte *dst, const byte *src, + size_t nblks) +{ + return chacha20_ppc_blocks1(state, dst, src, nblks); +} + +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2 +_gcry_chacha20_ppc9_blocks4(u32 *state, byte *dst, const byte *src, + size_t nblks) +{ + return chacha20_ppc_blocks4(state, dst, src, nblks); +} + +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2 +_gcry_chacha20_poly1305_ppc9_blocks4(u32 *state, byte *dst, const byte *src, + size_t nblks, POLY1305_STATE *st, + const byte *poly1305_src) +{ + return chacha20_poly1305_ppc_blocks4(state, dst, src, nblks, st, + poly1305_src); +} +#else +/* Compiler does not support target attribute, use same functions for POWER9 + * as for POWER8. */ +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2 +_gcry_chacha20_ppc9_blocks1(u32 *state, byte *dst, const byte *src, + size_t nblks) +{ + return _gcry_chacha20_ppc8_blocks1(state, dst, src, nblks); +} + +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2 +_gcry_chacha20_ppc9_blocks4(u32 *state, byte *dst, const byte *src, + size_t nblks) +{ + return _gcry_chacha20_ppc8_blocks4(state, dst, src, nblks); +} + +unsigned int ASM_FUNC_ATTR FUNC_ATTR_TARGET_P9 FUNC_ATTR_OPT_O2 +_gcry_chacha20_poly1305_ppc9_blocks4(u32 *state, byte *dst, const byte *src, + size_t nblks, POLY1305_STATE *st, + const byte *poly1305_src) +{ + return _gcry_chacha20_poly1305_ppc8_blocks4(state, dst, src, nblks, st, + poly1305_src); +} +#endif /* HAVE_GCC_ATTRIBUTE_PPC_TARGET */ + #endif /* ENABLE_PPC_CRYPTO_SUPPORT */ diff --git a/cipher/chacha20.c b/cipher/chacha20.c index a7e0dd63..d979d263 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -134,6 +134,7 @@ typedef struct CHACHA20_context_s unsigned int use_avx512:1; unsigned int use_neon:1; unsigned int use_ppc:1; + unsigned int use_p9:1; unsigned int use_p10:1; unsigned int use_s390x:1; } 
CHACHA20_context_t; @@ -195,12 +196,24 @@ unsigned int _gcry_chacha20_ppc8_blocks1(u32 *state, byte *dst, const byte *src, size_t nblks); +unsigned int _gcry_chacha20_ppc9_blocks4(u32 *state, byte *dst, + const byte *src, + size_t nblks); + +unsigned int _gcry_chacha20_ppc9_blocks1(u32 *state, byte *dst, + const byte *src, + size_t nblks); + #undef USE_PPC_VEC_POLY1305 #if SIZEOF_UNSIGNED_LONG == 8 #define USE_PPC_VEC_POLY1305 1 unsigned int _gcry_chacha20_poly1305_ppc8_blocks4( u32 *state, byte *dst, const byte *src, size_t nblks, POLY1305_STATE *st, const byte *poly1305_src); + +unsigned int _gcry_chacha20_poly1305_ppc9_blocks4( + u32 *state, byte *dst, const byte *src, size_t nblks, + POLY1305_STATE *st, const byte *poly1305_src); #endif /* SIZEOF_UNSIGNED_LONG == 8 */ #endif /* USE_PPC_VEC */ @@ -369,7 +382,10 @@ chacha20_blocks (CHACHA20_context_t *ctx, byte *dst, const byte *src, #ifdef USE_PPC_VEC if (ctx->use_ppc) { - return _gcry_chacha20_ppc8_blocks1(ctx->input, dst, src, nblks); + if (ctx->use_p9) + return _gcry_chacha20_ppc9_blocks1(ctx->input, dst, src, nblks); + else + return _gcry_chacha20_ppc8_blocks1(ctx->input, dst, src, nblks); } #endif @@ -509,6 +525,7 @@ chacha20_do_setkey (CHACHA20_context_t *ctx, #endif #ifdef USE_PPC_VEC ctx->use_ppc = (features & HWF_PPC_ARCH_2_07) != 0; + ctx->use_p9 = (features & HWF_PPC_ARCH_3_00) != 0; # ifndef WORDS_BIGENDIAN ctx->use_p10 = (features & HWF_PPC_ARCH_3_10) != 0; # ifdef ENABLE_FORCE_SOFT_HWFEATURES @@ -626,18 +643,25 @@ do_chacha20_encrypt_stream_tail (CHACHA20_context_t *ctx, byte *outbuf, { size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; + if (0) + {} #ifndef WORDS_BIGENDIAN /* * A workaround to skip counter overflow. This is rare. 
*/ - if (ctx->use_p10 && nblocks >= 8 - && ((u64)ctx->input[12] + nblocks) <= 0xffffffffU) + else if (ctx->use_p10 && nblocks >= 8 + && ((u64)ctx->input[12] + nblocks) <= 0xffffffffU) { size_t len = nblocks * CHACHA20_BLOCK_SIZE; nburn = _gcry_chacha20_p10le_8x(ctx->input, outbuf, inbuf, len); } - else #endif + else if (ctx->use_p9) + { + nburn = _gcry_chacha20_ppc9_blocks4(ctx->input, outbuf, inbuf, + nblocks); + } + else { nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, nblocks); @@ -844,7 +868,10 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf, } else if (ctx->use_ppc && length >= CHACHA20_BLOCK_SIZE * 4) { - nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4); + if (ctx->use_p9) + nburn = _gcry_chacha20_ppc9_blocks4(ctx->input, outbuf, inbuf, 4); + else + nburn = _gcry_chacha20_ppc8_blocks4(ctx->input, outbuf, inbuf, 4); burn = nburn > burn ? nburn : burn; authptr = outbuf; @@ -986,7 +1013,12 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf, size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; - nburn = _gcry_chacha20_poly1305_ppc8_blocks4( + if (ctx->use_p9) + nburn = _gcry_chacha20_poly1305_ppc9_blocks4( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, authptr); + else + nburn = _gcry_chacha20_poly1305_ppc8_blocks4( ctx->input, outbuf, inbuf, nblocks, &c->u_mode.poly1305.ctx.state, authptr); burn = nburn > burn ? 
nburn : burn; @@ -1212,9 +1244,14 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf, size_t nblocks = length / CHACHA20_BLOCK_SIZE; nblocks -= nblocks % 4; - nburn = _gcry_chacha20_poly1305_ppc8_blocks4( - ctx->input, outbuf, inbuf, nblocks, - &c->u_mode.poly1305.ctx.state, inbuf); + if (ctx->use_p9) + nburn = _gcry_chacha20_poly1305_ppc9_blocks4( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, inbuf); + else + nburn = _gcry_chacha20_poly1305_ppc8_blocks4( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, inbuf); burn = nburn > burn ? nburn : burn; length -= nblocks * CHACHA20_BLOCK_SIZE; |