summary refs log tree commit diff
diff options
context:
space:
mode:
author Niels Möller <nisse@lysator.liu.se> 2020-12-01 18:07:21 +0100
committer Niels Möller <nisse@lysator.liu.se> 2020-12-01 18:07:21 +0100
commit 1d2cb8ba6fb85fc0c2653f8a9972c17c6990e23b (patch)
tree 8dcc7f647b7f56d1e9ec629712ca9970662f2907
parent 377bddbc533f9363234737e0526b66def9fc63a8 (diff)
download nettle-1d2cb8ba6fb85fc0c2653f8a9972c17c6990e23b.tar.gz
ppc: Optimize chacha_4core main loop
* powerpc64/p7/chacha-4core.asm (QR): Instruction level interleaving in the main loop, written by Torbjörn Granlund.
-rw-r--r-- ChangeLog 5
-rw-r--r-- powerpc64/p7/chacha-4core.asm 51
2 files changed, 47 insertions, 9 deletions
diff --git a/ChangeLog b/ChangeLog
index d3507d99..b0e9e199 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2020-12-01 Niels Möller <nisse@lysator.liu.se>
+
+ * powerpc64/p7/chacha-4core.asm (QR): Instruction level
+ interleaving in the main loop, written by Torbjörn Granlund.
+
2020-11-30 Niels Möller <nisse@lysator.liu.se>
* m4-utils.m4 (m4_unquote): New macro, copied from GMP's
diff --git a/powerpc64/p7/chacha-4core.asm b/powerpc64/p7/chacha-4core.asm
index ce1e9a67..0cd5c877 100644
--- a/powerpc64/p7/chacha-4core.asm
+++ b/powerpc64/p7/chacha-4core.asm
@@ -56,18 +56,57 @@ define(`T3', `v23')
C Main loop for round
define(`QR',`
vadduwm $1, $1, $2
+ vadduwm $5, $5, $6
vxor $4, $4, $1
+ vxor $8, $8, $5
vrlw $4, $4, ROT16
+ vrlw $8, $8, ROT16
+ vadduwm $9, $9, $10
+ vadduwm $13, $13, $14
+ vxor $12, $12, $9
+ vxor $16, $16, $13
+ vrlw $12, $12, ROT16
+ vrlw $16, $16, ROT16
+
vadduwm $3, $3, $4
+ vadduwm $7, $7, $8
vxor $2, $2, $3
+ vxor $6, $6, $7
vrlw $2, $2, ROT12
+ vrlw $6, $6, ROT12
+ vadduwm $11, $11, $12
+ vadduwm $15, $15, $16
+ vxor $10, $10, $11
+ vxor $14, $14, $15
+ vrlw $10, $10, ROT12
+ vrlw $14, $14, ROT12
+
vadduwm $1, $1, $2
+ vadduwm $5, $5, $6
vxor $4, $4, $1
+ vxor $8, $8, $5
vrlw $4, $4, ROT8
+ vrlw $8, $8, ROT8
+ vadduwm $9, $9, $10
+ vadduwm $13, $13, $14
+ vxor $12, $12, $9
+ vxor $16, $16, $13
+ vrlw $12, $12, ROT8
+ vrlw $16, $16, ROT8
+
vadduwm $3, $3, $4
+ vadduwm $7, $7, $8
vxor $2, $2, $3
+ vxor $6, $6, $7
vrlw $2, $2, ROT7
- ')
+ vrlw $6, $6, ROT7
+ vadduwm $11, $11, $12
+ vadduwm $15, $15, $16
+ vxor $10, $10, $11
+ vxor $14, $14, $15
+ vrlw $10, $10, ROT7
+ vrlw $14, $14, ROT7
+')
define(`TRANSPOSE',`
vmrghw T0, $1, $3 C A0 A2 B0 B2
@@ -143,14 +182,8 @@ C Load state and splat
srdi ROUNDS, ROUNDS, 1
mtctr ROUNDS
.Loop:
- QR(v0, v4, v8, v12)
- QR(v1, v5, v9, v13)
- QR(v2, v6, v10, v14)
- QR(v3, v7, v11, v15)
- QR(v0, v5, v10, v15)
- QR(v1, v6, v11, v12)
- QR(v2, v7, v8, v13)
- QR(v3, v4, v9, v14)
+ QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+ QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
bdnz .Loop
C Add in saved original words, including counters, before