diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2019-09-19 22:25:17 +0300 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2019-09-22 19:52:36 +0300 |
commit | 96b91e164160dfbd913aefe258f472d386f5b642 (patch) | |
tree | 439fdd0cfb5a8ee1a1692bfb0bb56cf8a26cb3be /cipher/chacha20-ppc.c | |
parent | 664370ea02df883d16db1ffdd9ada023335b0f63 (diff) | |
download | libgcrypt-96b91e164160dfbd913aefe258f472d386f5b642.tar.gz |
Small tweak for PowerPC Chacha20-Poly1305 round loop
* cipher/chacha20-ppc.c (_gcry_chacha20_poly1305_ppc8_blocks4): Use
inner/outer round loop structure instead of two separate loops for
stitched and non-stitched parts.
--
Benchmark on POWER8 ~3.8 GHz:
Before:
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 0.619 ns/B 1541 MiB/s 2.35 c/B
STREAM dec | 0.619 ns/B 1541 MiB/s 2.35 c/B
POLY1305 enc | 0.784 ns/B 1216 MiB/s 2.98 c/B
POLY1305 dec | 0.770 ns/B 1239 MiB/s 2.93 c/B
POLY1305 auth | 0.502 ns/B 1898 MiB/s 1.91 c/B
After (~2% faster):
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
POLY1305 enc | 0.765 ns/B 1247 MiB/s 2.91 c/B
POLY1305 dec | 0.749 ns/B 1273 MiB/s 2.85 c/B
Benchmark on POWER9 ~3.8 GHz:
Before:
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
STREAM enc | 0.687 ns/B 1389 MiB/s 2.61 c/B
STREAM dec | 0.692 ns/B 1379 MiB/s 2.63 c/B
POLY1305 enc | 1.08 ns/B 880.9 MiB/s 4.11 c/B
POLY1305 dec | 1.07 ns/B 888.0 MiB/s 4.08 c/B
POLY1305 auth | 0.459 ns/B 2078 MiB/s 1.74 c/B
After (~5% faster):
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
POLY1305 enc | 1.03 ns/B 929.2 MiB/s 3.90 c/B
POLY1305 dec | 1.02 ns/B 936.6 MiB/s 3.87 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/chacha20-ppc.c')
-rw-r--r-- | cipher/chacha20-ppc.c | 29 |
1 file changed, 15 insertions, 14 deletions
diff --git a/cipher/chacha20-ppc.c b/cipher/chacha20-ppc.c
index 17e2f090..985f2fcd 100644
--- a/cipher/chacha20-ppc.c
+++ b/cipher/chacha20-ppc.c
@@ -469,7 +469,7 @@ _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
   u64 m0, m1, m2;
   u64 x0_lo, x0_hi, x1_lo, x1_hi;
   u64 t0_lo, t0_hi, t1_lo, t1_hi;
-  int i;
+  unsigned int i, o;
 
   /* load poly1305 state */
   m2 = 1;
@@ -515,19 +515,21 @@ _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
       v12 += counters_0123;
       v13 -= vec_cmplt(v12, counters_0123);
 
-      for (i = 0; i < 16; i += 2)
-        {
-          POLY1305_BLOCK_PART1((i + 0) * 16);
-          QUARTERROUND2(v0, v4, v8, v12, v1, v5, v9, v13)
-          POLY1305_BLOCK_PART2();
-          QUARTERROUND2(v2, v6, v10, v14, v3, v7, v11, v15)
-          POLY1305_BLOCK_PART1((i + 1) * 16);
-          QUARTERROUND2(v0, v5, v10, v15, v1, v6, v11, v12)
-          POLY1305_BLOCK_PART2();
-          QUARTERROUND2(v2, v7, v8, v13, v3, v4, v9, v14)
-        }
-      for (; i < 20; i += 2)
+      for (o = 20; o; o -= 10)
         {
+          for (i = 8; i; i -= 2)
+            {
+              POLY1305_BLOCK_PART1(0 * 16);
+              QUARTERROUND2(v0, v4, v8, v12, v1, v5, v9, v13)
+              POLY1305_BLOCK_PART2();
+              QUARTERROUND2(v2, v6, v10, v14, v3, v7, v11, v15)
+              POLY1305_BLOCK_PART1(1 * 16);
+              poly1305_src += 2 * 16;
+              QUARTERROUND2(v0, v5, v10, v15, v1, v6, v11, v12)
+              POLY1305_BLOCK_PART2();
+              QUARTERROUND2(v2, v7, v8, v13, v3, v4, v9, v14)
+            }
+
           QUARTERROUND2(v0, v4, v8, v12, v1, v5, v9, v13)
           QUARTERROUND2(v2, v6, v10, v14, v3, v7, v11, v15)
           QUARTERROUND2(v0, v5, v10, v15, v1, v6, v11, v12)
@@ -601,7 +603,6 @@ _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
 
       src += 4*64;
       dst += 4*64;
-      poly1305_src += 16*16;
 
       nblks -= 4;
     }