From 84f2e2d0b51b7b3e75d96d8188ae6a8d8174542b Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna
Date: Sun, 26 Feb 2023 12:20:59 +0200
Subject: aes-ppc: add CTR32LE bulk acceleration

* cipher/rijndael-ppc-functions.h (CTR32LE_ENC_FUNC): New.
* cipher/rijndael-ppc.c (_gcry_aes_ppc8_ctr32le_enc): New.
* cipher/rijndael-ppc9le.c (_gcry_aes_ppc9le_ctr32le_enc): New.
* cipher/rijndael.c (_gcry_aes_ppc8_ctr32le_enc)
(_gcry_aes_ppc9le_ctr32le_enc): New.
(do_setkey): Setup _gcry_aes_ppc8_ctr32le_enc for POWER8 and
_gcry_aes_ppc9le_ctr32le_enc for POWER9.
--

Benchmark on POWER9:

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
    GCM-SIV enc |      1.42 ns/B     672.2 MiB/s      3.26 c/B

After:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte
    GCM-SIV enc |     0.725 ns/B      1316 MiB/s      1.67 c/B

Signed-off-by: Jussi Kivilinna
---
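For reference: "CTR32LE" is the counter mode used by AES-GCM-SIV
(RFC 8452). Bytes 0..3 of the counter block hold a 32-bit
little-endian counter that increments once per block and wraps
modulo 2^32 without carrying into the remaining twelve bytes. A
minimal scalar sketch of these semantics follows; it is illustrative
only, and aes_encrypt_block() is a hypothetical placeholder for a
single AES block encryption, not a libgcrypt function:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical single-block AES primitive; any AES implementation
   with this shape will do.  */
void aes_encrypt_block (const void *key, uint8_t dst[16],
                        const uint8_t src[16]);

static void
ctr32le_enc_ref (const void *key, uint8_t ctr[16],
                 uint8_t *out, const uint8_t *in, size_t nblocks)
{
  uint8_t keystream[16];
  uint32_t c;
  size_t i;

  for (; nblocks; nblocks--)
    {
      /* Keystream block is AES(K, counter block).  */
      aes_encrypt_block (key, keystream, ctr);

      /* Increment only the 32-bit little-endian counter in bytes
         0..3; it wraps mod 2^32 with no carry into bytes 4..15.  */
      c = (uint32_t)ctr[0] | ((uint32_t)ctr[1] << 8)
          | ((uint32_t)ctr[2] << 16) | ((uint32_t)ctr[3] << 24);
      c++;
      ctr[0] = (uint8_t)c;
      ctr[1] = (uint8_t)(c >> 8);
      ctr[2] = (uint8_t)(c >> 16);
      ctr[3] = (uint8_t)(c >> 24);

      for (i = 0; i < 16; i++)
        out[i] = in[i] ^ keystream[i];
      in += 16;
      out += 16;
    }
}

The vectorized CTR32LE_ENC_FUNC in the patch below computes the same
keystream four or eight blocks per iteration, keeping the counter in
a vector register and adding the precomputed constants one..eight so
the POWER AES pipelines stay filled between round-key loads.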
 cipher/rijndael-ppc-functions.h | 245 ++++++++++++++++++++++++++++++++++++++++
 cipher/rijndael-ppc.c           |   1 +
 cipher/rijndael-ppc9le.c        |   1 +
 cipher/rijndael.c               |  11 ++
 4 files changed, 258 insertions(+)

(limited to 'cipher')

diff --git a/cipher/rijndael-ppc-functions.h b/cipher/rijndael-ppc-functions.h
index 8a05d3c9..79eca7a2 100644
--- a/cipher/rijndael-ppc-functions.h
+++ b/cipher/rijndael-ppc-functions.h
@@ -2292,3 +2292,248 @@ void XTS_CRYPT_FUNC (void *context, unsigned char *tweak_arg,
 
 #undef GEN_TWEAK
 }
+
+
+void CTR32LE_ENC_FUNC(void *context, unsigned char *ctr_arg, void *outbuf_arg,
+                      const void *inbuf_arg, size_t nblocks)
+{
+#ifndef WORDS_BIGENDIAN
+  static const vec_u32 vec_u32_one = { 1, 0, 0, 0 };
+#else
+  static const vec_u32 vec_u32_one = { 0, 0, 0, 1 };
+#endif
+  const block bige_const = asm_load_be_const();
+  RIJNDAEL_context *ctx = context;
+  const u128_t *rk = (u128_t *)&ctx->keyschenc;
+  const u128_t *in = (const u128_t *)inbuf_arg;
+  u128_t *out = (u128_t *)outbuf_arg;
+  int rounds = ctx->rounds;
+  ROUND_KEY_VARIABLES;
+  block rkeylast_orig;
+  block b;
+  vec_u32 ctr, one;
+
+  ctr = (vec_u32)vec_reve (VEC_LOAD_BE (ctr_arg, 0, bige_const));
+  one = vec_u32_one;
+
+  PRELOAD_ROUND_KEYS (rounds);
+  rkeylast_orig = rkeylast;
+
+#define VEC_ADD_CTRLE32(ctrv_u32, addv_u32) \
+  vec_reve((block)((ctrv_u32) + (addv_u32)))
+
+  if (nblocks >= 4)
+    {
+      block in0, in1, in2, in3, in4, in5, in6, in7;
+      block b0, b1, b2, b3, b4, b5, b6, b7;
+      vec_u32 two, three, four, five, six, seven, eight;
+      block rkey;
+
+      two = one + one;
+      three = two + one;
+      four = two + two;
+      five = three + two;
+      six = three + three;
+      seven = four + three;
+      eight = four + four;
+
+      for (; nblocks >= 8; nblocks -= 8)
+        {
+          b1 = VEC_ADD_CTRLE32 (ctr, one);
+          b2 = VEC_ADD_CTRLE32 (ctr, two);
+          b3 = VEC_ADD_CTRLE32 (ctr, three);
+          b4 = VEC_ADD_CTRLE32 (ctr, four);
+          b5 = VEC_ADD_CTRLE32 (ctr, five);
+          b6 = VEC_ADD_CTRLE32 (ctr, six);
+          b7 = VEC_ADD_CTRLE32 (ctr, seven);
+          b0 = asm_xor (rkey0, vec_reve((block)ctr));
+          rkey = ALIGNED_LOAD (rk, 1);
+          ctr = ctr + eight;
+          b1 = asm_xor (rkey0, b1);
+          b2 = asm_xor (rkey0, b2);
+          b3 = asm_xor (rkey0, b3);
+          b0 = asm_cipher_be (b0, rkey);
+          b1 = asm_cipher_be (b1, rkey);
+          b2 = asm_cipher_be (b2, rkey);
+          b3 = asm_cipher_be (b3, rkey);
+          b4 = asm_xor (rkey0, b4);
+          b5 = asm_xor (rkey0, b5);
+          b6 = asm_xor (rkey0, b6);
+          b7 = asm_xor (rkey0, b7);
+          b4 = asm_cipher_be (b4, rkey);
+          b5 = asm_cipher_be (b5, rkey);
+          b6 = asm_cipher_be (b6, rkey);
+          b7 = asm_cipher_be (b7, rkey);
+
+#define DO_ROUND(r) \
+              rkey = ALIGNED_LOAD (rk, r); \
+              b0 = asm_cipher_be (b0, rkey); \
+              b1 = asm_cipher_be (b1, rkey); \
+              b2 = asm_cipher_be (b2, rkey); \
+              b3 = asm_cipher_be (b3, rkey); \
+              b4 = asm_cipher_be (b4, rkey); \
+              b5 = asm_cipher_be (b5, rkey); \
+              b6 = asm_cipher_be (b6, rkey); \
+              b7 = asm_cipher_be (b7, rkey);
+
+          in0 = VEC_LOAD_BE_NOSWAP (in, 0);
+          DO_ROUND(2);
+          in1 = VEC_LOAD_BE_NOSWAP (in, 1);
+          DO_ROUND(3);
+          in2 = VEC_LOAD_BE_NOSWAP (in, 2);
+          DO_ROUND(4);
+          in3 = VEC_LOAD_BE_NOSWAP (in, 3);
+          DO_ROUND(5);
+          in4 = VEC_LOAD_BE_NOSWAP (in, 4);
+          DO_ROUND(6);
+          in5 = VEC_LOAD_BE_NOSWAP (in, 5);
+          DO_ROUND(7);
+          in6 = VEC_LOAD_BE_NOSWAP (in, 6);
+          DO_ROUND(8);
+          in7 = VEC_LOAD_BE_NOSWAP (in, 7);
+          in += 8;
+          DO_ROUND(9);
+
+          if (rounds >= 12)
+            {
+              DO_ROUND(10);
+              DO_ROUND(11);
+              if (rounds > 12)
+                {
+                  DO_ROUND(12);
+                  DO_ROUND(13);
+                }
+            }
+
+#undef DO_ROUND
+
+          in0 = VEC_BE_SWAP (in0, bige_const);
+          in1 = VEC_BE_SWAP (in1, bige_const);
+          in2 = VEC_BE_SWAP (in2, bige_const);
+          in3 = VEC_BE_SWAP (in3, bige_const);
+          in4 = VEC_BE_SWAP (in4, bige_const);
+          in5 = VEC_BE_SWAP (in5, bige_const);
+          in6 = VEC_BE_SWAP (in6, bige_const);
+          in7 = VEC_BE_SWAP (in7, bige_const);
+
+          in0 = asm_xor (rkeylast, in0);
+          in1 = asm_xor (rkeylast, in1);
+          in2 = asm_xor (rkeylast, in2);
+          in3 = asm_xor (rkeylast, in3);
+          b0 = asm_cipherlast_be (b0, in0);
+          b1 = asm_cipherlast_be (b1, in1);
+          in4 = asm_xor (rkeylast, in4);
+          in5 = asm_xor (rkeylast, in5);
+          b2 = asm_cipherlast_be (b2, in2);
+          b3 = asm_cipherlast_be (b3, in3);
+          in6 = asm_xor (rkeylast, in6);
+          in7 = asm_xor (rkeylast, in7);
+          b4 = asm_cipherlast_be (b4, in4);
+          b5 = asm_cipherlast_be (b5, in5);
+          b6 = asm_cipherlast_be (b6, in6);
+          b7 = asm_cipherlast_be (b7, in7);
+
+          b0 = VEC_BE_SWAP (b0, bige_const);
+          b1 = VEC_BE_SWAP (b1, bige_const);
+          b2 = VEC_BE_SWAP (b2, bige_const);
+          b3 = VEC_BE_SWAP (b3, bige_const);
+          b4 = VEC_BE_SWAP (b4, bige_const);
+          b5 = VEC_BE_SWAP (b5, bige_const);
+          b6 = VEC_BE_SWAP (b6, bige_const);
+          b7 = VEC_BE_SWAP (b7, bige_const);
+          VEC_STORE_BE_NOSWAP (out, 0, b0);
+          VEC_STORE_BE_NOSWAP (out, 1, b1);
+          VEC_STORE_BE_NOSWAP (out, 2, b2);
+          VEC_STORE_BE_NOSWAP (out, 3, b3);
+          VEC_STORE_BE_NOSWAP (out, 4, b4);
+          VEC_STORE_BE_NOSWAP (out, 5, b5);
+          VEC_STORE_BE_NOSWAP (out, 6, b6);
+          VEC_STORE_BE_NOSWAP (out, 7, b7);
+          out += 8;
+        }
+
+      if (nblocks >= 4)
+        {
+          b1 = VEC_ADD_CTRLE32 (ctr, one);
+          b2 = VEC_ADD_CTRLE32 (ctr, two);
+          b3 = VEC_ADD_CTRLE32 (ctr, three);
+          b0 = asm_xor (rkey0, vec_reve((block)ctr));
+          ctr = ctr + four;
+          b1 = asm_xor (rkey0, b1);
+          b2 = asm_xor (rkey0, b2);
+          b3 = asm_xor (rkey0, b3);
+
+#define DO_ROUND(r) \
+              rkey = ALIGNED_LOAD (rk, r); \
+              b0 = asm_cipher_be (b0, rkey); \
+              b1 = asm_cipher_be (b1, rkey); \
+              b2 = asm_cipher_be (b2, rkey); \
+              b3 = asm_cipher_be (b3, rkey);
+
+          DO_ROUND(1);
+          DO_ROUND(2);
+          DO_ROUND(3);
+          DO_ROUND(4);
+          DO_ROUND(5);
+          DO_ROUND(6);
+          DO_ROUND(7);
+          DO_ROUND(8);
+
+          in0 = VEC_LOAD_BE (in, 0, bige_const);
+          in1 = VEC_LOAD_BE (in, 1, bige_const);
+          in2 = VEC_LOAD_BE (in, 2, bige_const);
+          in3 = VEC_LOAD_BE (in, 3, bige_const);
+
+          DO_ROUND(9);
+          if (rounds >= 12)
+            {
+              DO_ROUND(10);
+              DO_ROUND(11);
+              if (rounds > 12)
+                {
+                  DO_ROUND(12);
+                  DO_ROUND(13);
+                }
+            }
+
+#undef DO_ROUND
+
+          in0 = asm_xor (rkeylast, in0);
+          in1 = asm_xor (rkeylast, in1);
+          in2 = asm_xor (rkeylast, in2);
+          in3 = asm_xor (rkeylast, in3);
+
+          b0 = asm_cipherlast_be (b0, in0);
+          b1 = asm_cipherlast_be (b1, in1);
+          b2 = asm_cipherlast_be (b2, in2);
+          b3 = asm_cipherlast_be (b3, in3);
+
+          VEC_STORE_BE (out, 0, b0, bige_const);
+          VEC_STORE_BE (out, 1, b1, bige_const);
+          VEC_STORE_BE (out, 2, b2, bige_const);
+          VEC_STORE_BE (out, 3, b3, bige_const);
+
+          in += 4;
+          out += 4;
+          nblocks -= 4;
+        }
+    }
+
+  for (; nblocks; nblocks--)
+    {
+      b = vec_reve((block)ctr);
+      ctr = ctr + one;
+      rkeylast = rkeylast_orig ^ VEC_LOAD_BE (in, 0, bige_const);
+
+      AES_ENCRYPT (b, rounds);
+
+      VEC_STORE_BE (out, 0, b, bige_const);
+
+      out++;
+      in++;
+    }
+
+#undef VEC_ADD_CTRLE32
+
+  VEC_STORE_BE (ctr_arg, 0, vec_reve((block)ctr), bige_const);
+}
diff --git a/cipher/rijndael-ppc.c b/cipher/rijndael-ppc.c
index 53c4f126..d16fbb40 100644
--- a/cipher/rijndael-ppc.c
+++ b/cipher/rijndael-ppc.c
@@ -198,6 +198,7 @@ _gcry_aes_ppc8_prepare_decryption (RIJNDAEL_context *ctx)
 #define OCB_CRYPT_FUNC _gcry_aes_ppc8_ocb_crypt
 #define OCB_AUTH_FUNC _gcry_aes_ppc8_ocb_auth
 #define XTS_CRYPT_FUNC _gcry_aes_ppc8_xts_crypt
+#define CTR32LE_ENC_FUNC _gcry_aes_ppc8_ctr32le_enc
 
 #include <rijndael-ppc-functions.h>
 
diff --git a/cipher/rijndael-ppc9le.c b/cipher/rijndael-ppc9le.c
index 9ce9c224..f7055290 100644
--- a/cipher/rijndael-ppc9le.c
+++ b/cipher/rijndael-ppc9le.c
@@ -97,6 +97,7 @@ asm_store_be_noswap(block vec, unsigned long offset, void *ptr)
 #define OCB_CRYPT_FUNC _gcry_aes_ppc9le_ocb_crypt
 #define OCB_AUTH_FUNC _gcry_aes_ppc9le_ocb_auth
 #define XTS_CRYPT_FUNC _gcry_aes_ppc9le_xts_crypt
+#define CTR32LE_ENC_FUNC _gcry_aes_ppc9le_ctr32le_enc
 
 #include <rijndael-ppc-functions.h>
 
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 071d4a16..b49a0642 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -280,6 +280,10 @@ extern void _gcry_aes_ppc8_xts_crypt (void *context, unsigned char *tweak,
                                       void *outbuf_arg,
                                       const void *inbuf_arg,
                                       size_t nblocks, int encrypt);
+
+extern void _gcry_aes_ppc8_ctr32le_enc (void *context, unsigned char *ctr,
+                                        void *outbuf_arg, const void *inbuf_arg,
+                                        size_t nblocks);
 #endif /*USE_PPC_CRYPTO*/
 
 #ifdef USE_PPC_CRYPTO_WITH_PPC9LE
@@ -322,6 +326,11 @@ extern void _gcry_aes_ppc9le_xts_crypt (void *context, unsigned char *tweak,
                                         const void *inbuf_arg,
                                         size_t nblocks, int encrypt);
 
+extern void _gcry_aes_ppc9le_ctr32le_enc (void *context, unsigned char *ctr,
+                                          void *outbuf_arg,
+                                          const void *inbuf_arg,
+                                          size_t nblocks);
+
 extern size_t _gcry_aes_p10le_gcm_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
                                          const void *inbuf_arg,
                                          size_t nblocks, int encrypt);
@@ -633,6 +642,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
       bulk_ops->ocb_crypt = _gcry_aes_ppc9le_ocb_crypt;
       bulk_ops->ocb_auth = _gcry_aes_ppc9le_ocb_auth;
       bulk_ops->xts_crypt = _gcry_aes_ppc9le_xts_crypt;
+      bulk_ops->ctr32le_enc = _gcry_aes_ppc9le_ctr32le_enc;
       if (hwfeatures & HWF_PPC_ARCH_3_10) /* for P10 */
         bulk_ops->gcm_crypt = _gcry_aes_p10le_gcm_crypt;
 # ifdef ENABLE_FORCE_SOFT_HWFEATURES
@@ -663,6 +673,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
       bulk_ops->ocb_crypt = _gcry_aes_ppc8_ocb_crypt;
       bulk_ops->ocb_auth = _gcry_aes_ppc8_ocb_auth;
       bulk_ops->xts_crypt = _gcry_aes_ppc8_xts_crypt;
+      bulk_ops->ctr32le_enc = _gcry_aes_ppc8_ctr32le_enc;
     }
 #endif
 #ifdef USE_S390X_CRYPTO
--
cgit v1.2.1