summary | refs | log | tree | commit | diff
path: root/cipher/chacha20-ppc.c
diff options
context:
space:
mode:
author Jussi Kivilinna <jussi.kivilinna@iki.fi> 2019-09-19 22:25:17 +0300
committer Jussi Kivilinna <jussi.kivilinna@iki.fi> 2019-09-22 19:52:36 +0300
commit 96b91e164160dfbd913aefe258f472d386f5b642 (patch)
tree 439fdd0cfb5a8ee1a1692bfb0bb56cf8a26cb3be /cipher/chacha20-ppc.c
parent 664370ea02df883d16db1ffdd9ada023335b0f63 (diff)
download libgcrypt-96b91e164160dfbd913aefe258f472d386f5b642.tar.gz
Small tweak for PowerPC Chacha20-Poly1305 round loop
* cipher/chacha20-ppc.c (_gcry_chacha20_poly1305_ppc8_blocks4): Use
inner/outer round loop structure instead of two separate loops for
stitched and non-stitched parts.
--

Benchmark on POWER8 ~3.8GHz:

Before:
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |     0.619 ns/B      1541 MiB/s      2.35 c/B
     STREAM dec |     0.619 ns/B      1541 MiB/s      2.35 c/B
   POLY1305 enc |     0.784 ns/B      1216 MiB/s      2.98 c/B
   POLY1305 dec |     0.770 ns/B      1239 MiB/s      2.93 c/B
  POLY1305 auth |     0.502 ns/B      1898 MiB/s      1.91 c/B

After (~2% faster):
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
   POLY1305 enc |     0.765 ns/B      1247 MiB/s      2.91 c/B
   POLY1305 dec |     0.749 ns/B      1273 MiB/s      2.85 c/B

Benchmark on POWER9 ~3.8GHz:

Before:
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
     STREAM enc |     0.687 ns/B      1389 MiB/s      2.61 c/B
     STREAM dec |     0.692 ns/B      1379 MiB/s      2.63 c/B
   POLY1305 enc |      1.08 ns/B     880.9 MiB/s      4.11 c/B
   POLY1305 dec |      1.07 ns/B     888.0 MiB/s      4.08 c/B
  POLY1305 auth |     0.459 ns/B      2078 MiB/s      1.74 c/B

After (~5% faster):
 CHACHA20       |  nanosecs/byte   mebibytes/sec   cycles/byte
   POLY1305 enc |      1.03 ns/B     929.2 MiB/s      3.90 c/B
   POLY1305 dec |      1.02 ns/B     936.6 MiB/s      3.87 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/chacha20-ppc.c')
-rw-r--r-- cipher/chacha20-ppc.c 29
1 files changed, 15 insertions, 14 deletions
diff --git a/cipher/chacha20-ppc.c b/cipher/chacha20-ppc.c
index 17e2f090..985f2fcd 100644
--- a/cipher/chacha20-ppc.c
+++ b/cipher/chacha20-ppc.c
@@ -469,7 +469,7 @@ _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
u64 m0, m1, m2;
u64 x0_lo, x0_hi, x1_lo, x1_hi;
u64 t0_lo, t0_hi, t1_lo, t1_hi;
- int i;
+ unsigned int i, o;
/* load poly1305 state */
m2 = 1;
@@ -515,19 +515,21 @@ _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
v12 += counters_0123;
v13 -= vec_cmplt(v12, counters_0123);
- for (i = 0; i < 16; i += 2)
- {
- POLY1305_BLOCK_PART1((i + 0) * 16);
- QUARTERROUND2(v0, v4, v8, v12, v1, v5, v9, v13)
- POLY1305_BLOCK_PART2();
- QUARTERROUND2(v2, v6, v10, v14, v3, v7, v11, v15)
- POLY1305_BLOCK_PART1((i + 1) * 16);
- QUARTERROUND2(v0, v5, v10, v15, v1, v6, v11, v12)
- POLY1305_BLOCK_PART2();
- QUARTERROUND2(v2, v7, v8, v13, v3, v4, v9, v14)
- }
- for (; i < 20; i += 2)
+ for (o = 20; o; o -= 10)
{
+ for (i = 8; i; i -= 2)
+ {
+ POLY1305_BLOCK_PART1(0 * 16);
+ QUARTERROUND2(v0, v4, v8, v12, v1, v5, v9, v13)
+ POLY1305_BLOCK_PART2();
+ QUARTERROUND2(v2, v6, v10, v14, v3, v7, v11, v15)
+ POLY1305_BLOCK_PART1(1 * 16);
+ poly1305_src += 2 * 16;
+ QUARTERROUND2(v0, v5, v10, v15, v1, v6, v11, v12)
+ POLY1305_BLOCK_PART2();
+ QUARTERROUND2(v2, v7, v8, v13, v3, v4, v9, v14)
+ }
+
QUARTERROUND2(v0, v4, v8, v12, v1, v5, v9, v13)
QUARTERROUND2(v2, v6, v10, v14, v3, v7, v11, v15)
QUARTERROUND2(v0, v5, v10, v15, v1, v6, v11, v12)
@@ -601,7 +603,6 @@ _gcry_chacha20_poly1305_ppc8_blocks4(u32 *state, byte *dst, const byte *src,
src += 4*64;
dst += 4*64;
- poly1305_src += 16*16;
nblks -= 4;
}