author     Niels Möller <nisse@lysator.liu.se>    2020-12-08 20:10:15 +0100
committer  Niels Möller <nisse@lysator.liu.se>    2020-12-08 20:10:15 +0100
commit     9bdadc1317ffe0bf0ac5bfa7842275c6386ae7f6 (patch)
tree       788f7142e7bce2159695ca922c0c7b48daefaa6c /powerpc64/p7
parent     94376efa6942215b34b507ed6db921bb8e362f15 (diff)
parent     c4ac0953c151dfca6716332247f6aa716ed17b62 (diff)
download   nettle-9bdadc1317ffe0bf0ac5bfa7842275c6386ae7f6.tar.gz

Merge branch 'ppc-chacha-4core'
Diffstat (limited to 'powerpc64/p7')
-rw-r--r--  powerpc64/p7/chacha-4core.asm  278
1 file changed, 278 insertions, 0 deletions
diff --git a/powerpc64/p7/chacha-4core.asm b/powerpc64/p7/chacha-4core.asm
new file mode 100644
index 00000000..b2330247
--- /dev/null
+++ b/powerpc64/p7/chacha-4core.asm
@@ -0,0 +1,278 @@
+C powerpc64/p7/chacha-4core.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+define(`SP', `r1')
+define(`TOCP', `r2')
+
+C Arguments
+define(`DST', `r3')
+define(`SRC', `r4')
+define(`ROUNDS', `r5')
+
+C Working state in v0,...,v15
+
+define(`ROT16', v16)
+define(`ROT12', v17)
+define(`ROT8', v18)
+define(`ROT7', v19)
+
+C During the loop, used to save the original values of the last 4 words
+C of each block. Also used as temporaries for the transpose.
+define(`T0', `v20')
+define(`T1', `v21')
+define(`T2', `v22')
+define(`T3', `v23')
+
+C QR: one full ChaCha round, i.e. four quarter rounds in parallel; each
+C group of four arguments is one (a, b, c, d) quarter-round group.
+define(`QR',`
+ vadduwm $1, $1, $2
+ vadduwm $5, $5, $6
+ vxor $4, $4, $1
+ vxor $8, $8, $5
+ vrlw $4, $4, ROT16
+ vrlw $8, $8, ROT16
+ vadduwm $9, $9, $10
+ vadduwm $13, $13, $14
+ vxor $12, $12, $9
+ vxor $16, $16, $13
+ vrlw $12, $12, ROT16
+ vrlw $16, $16, ROT16
+
+ vadduwm $3, $3, $4
+ vadduwm $7, $7, $8
+ vxor $2, $2, $3
+ vxor $6, $6, $7
+ vrlw $2, $2, ROT12
+ vrlw $6, $6, ROT12
+ vadduwm $11, $11, $12
+ vadduwm $15, $15, $16
+ vxor $10, $10, $11
+ vxor $14, $14, $15
+ vrlw $10, $10, ROT12
+ vrlw $14, $14, ROT12
+
+ vadduwm $1, $1, $2
+ vadduwm $5, $5, $6
+ vxor $4, $4, $1
+ vxor $8, $8, $5
+ vrlw $4, $4, ROT8
+ vrlw $8, $8, ROT8
+ vadduwm $9, $9, $10
+ vadduwm $13, $13, $14
+ vxor $12, $12, $9
+ vxor $16, $16, $13
+ vrlw $12, $12, ROT8
+ vrlw $16, $16, ROT8
+
+ vadduwm $3, $3, $4
+ vadduwm $7, $7, $8
+ vxor $2, $2, $3
+ vxor $6, $6, $7
+ vrlw $2, $2, ROT7
+ vrlw $6, $6, ROT7
+ vadduwm $11, $11, $12
+ vadduwm $15, $15, $16
+ vxor $10, $10, $11
+ vxor $14, $14, $15
+ vrlw $10, $10, ROT7
+ vrlw $14, $14, ROT7
+')
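For reference, QR above is the standard ChaCha quarter round, applied to four independent blocks at once (one block per vector lane) and to four (a, b, c, d) groups per invocation. A minimal scalar C sketch of a single quarter round (ROTL32 and qr are illustrative names, not Nettle code):

    #include <stdint.h>

    #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

    /* One ChaCha quarter round on four state words. */
    static void
    qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
    {
      *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
      *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
      *a += *b; *d ^= *a; *d = ROTL32(*d, 8);
      *c += *d; *b ^= *c; *b = ROTL32(*b, 7);
    }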
+
+C Transpose the 4x4 matrix of words whose rows are $1, $2, $3, $4.
+define(`TRANSPOSE',`
+ vmrghw T0, $1, $3 C A0 A2 B0 B2
+ vmrghw T1, $2, $4 C A1 A3 B1 B3
+ vmrglw T2, $1, $3 C C0 C2 D0 D2
+ vmrglw T3, $2, $4 C C1 C3 D1 D3
+
+ vmrghw $1, T0, T1 C A0 A1 A2 A3
+ vmrglw $2, T0, T1 C B0 B1 B2 B3
+ vmrghw $3, T2, T3 C C0 C1 C2 C3
+ vmrglw $4, T2, T3 C D0 D1 D2 D3
+')
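As used in this file, TRANSPOSE takes four vectors that each hold one word position from all four blocks and returns four vectors that each hold four consecutive words of one block, i.e. a 4x4 word transpose. A plain C sketch of the same data movement (transpose4x4 is an illustrative helper, not Nettle code):

    #include <stdint.h>

    /* Transpose a 4x4 matrix of 32-bit words in place. */
    static void
    transpose4x4(uint32_t m[4][4])
    {
      for (int i = 0; i < 4; i++)
        for (int j = i + 1; j < 4; j++)
          {
            uint32_t t = m[i][j];
            m[i][j] = m[j][i];
            m[j][i] = t;
          }
    }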
+
+ C _chacha_4core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_chacha_4core)
+
+ vspltisw T2, 1 C Apply counter carries
+
+.Lshared_entry:
+
+ li r6, 0x10 C set up some...
+ li r7, 0x20 C ...useful...
+ li r8, 0x30 C ...offsets
+
+ C Save callee-save registers. Use the "protected zone" (at most
+ C 288 bytes) below the stack pointer, accessed via r10.
+ addi r10, SP, -0x40
+ stvx v20, 0, r10
+ stvx v21, r6, r10
+ stvx v22, r7, r10
+ stvx v23, r8, r10
+
+ vspltisw ROT16, -16 C -16 works in place of 16: vrlw uses only the low 5 bits
+ vspltisw ROT12, 12
+ vspltisw ROT8, 8
+ vspltisw ROT7, 7
+
+C Load state and splat
+ lxvw4x VSR(v0), 0, SRC C "expa ..."
+ lxvw4x VSR(v4), r6, SRC C key
+ lxvw4x VSR(v8), r7, SRC C key
+ lxvw4x VSR(v12), r8, SRC C cnt and nonce
+
+ vspltw v1, v0, 1
+ vspltw v2, v0, 2
+ vspltw v3, v0, 3
+ vspltw v0, v0, 0
+ vspltw v5, v4, 1
+ vspltw v6, v4, 2
+ vspltw v7, v4, 3
+ vspltw v4, v4, 0
+ vspltw v9, v8, 1
+ vspltw v10, v8, 2
+ vspltw v11, v8, 3
+ vspltw v8, v8, 0
+ vspltw v13, v12, 1
+ vspltw v14, v12, 2
+ vspltw v15, v12, 3
+ vspltw v12, v12, 0
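After the splats, each of v0-v15 holds one state word replicated into all four lanes, so lane j of every vector belongs to block j; only the counter offsets added next make the four lanes differ, and the transpose at the end regroups the lanes into contiguous blocks. A rough C picture of the layout (splat_state and lanes are illustrative names):

    #include <stdint.h>

    /* Lane j of vector i corresponds to word i of block j.  Right after
       the splats, all four lanes of every vector hold the same value. */
    static void
    splat_state(uint32_t lanes[16][4], const uint32_t src[16])
    {
      for (int i = 0; i < 16; i++)
        for (int j = 0; j < 4; j++)
          lanes[i][j] = src[i];
    }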
+
+ ld r9, .Lcnts@got(r2)
+ lxvw4x VSR(T0), 0, r9 C increments
+ vaddcuw T1, v12, T0 C compute carry-out
+ vadduwm v12, v12, T0 C low adds
+ vand T1, T1, T2 C discard carries for 32-bit counter variant
+ vadduwm v13, v13, T1 C apply carries
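The increments 0,1,2,3 from .Lcnts give block j a counter that is j larger than the input counter; vaddcuw computes the carry out of each 32-bit addition. With T2 = 1 (_nettle_chacha_4core) the carry is folded into word 13, treating words 12-13 as a 64-bit counter; with T2 = 0 (_nettle_chacha_4core32) it is masked off for a pure 32-bit counter. A hedged C sketch of the per-lane arithmetic (add_block_offset is an illustrative name):

    #include <stdint.h>

    /* Word 12 of lane j gets +j; for the 64-bit counter variant a carry
       out of word 12 propagates into word 13. */
    static void
    add_block_offset(uint32_t *w12, uint32_t *w13, uint32_t j, int counter_64)
    {
      uint32_t old = *w12;
      *w12 = old + j;
      if (counter_64 && *w12 < old)   /* carry out of the low counter word */
        (*w13)++;
    }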
+
+ C Save the original values of the last four words (v12-v15) of each block.
+ vor T0, v12, v12
+ vor T1, v13, v13
+ vor T2, v14, v14
+ vor T3, v15, v15
+
+ srdi ROUNDS, ROUNDS, 1 C Two QR calls (one double round) per iteration
+ mtctr ROUNDS
+.Loop:
+ QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) C column rounds
+ QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) C diagonal rounds
+ bdnz .Loop
+
+ C Add in saved original words, including counters, before
+ C transpose.
+ vadduwm v12, v12, T0
+ vadduwm v13, v13, T1
+ vadduwm v14, v14, T2
+ vadduwm v15, v15, T3
+
+ TRANSPOSE(v0, v1, v2, v3)
+ TRANSPOSE(v4, v5, v6, v7)
+ TRANSPOSE(v8, v9, v10, v11)
+ TRANSPOSE(v12, v13, v14, v15)
+
+ lxvw4x VSR(T0), 0, SRC
+ lxvw4x VSR(T1), r6, SRC
+ lxvw4x VSR(T2), r7, SRC
+
+ vadduwm v0, v0, T0
+ vadduwm v1, v1, T0
+ vadduwm v2, v2, T0
+ vadduwm v3, v3, T0
+
+ vadduwm v4, v4, T1
+ vadduwm v5, v5, T1
+ vadduwm v6, v6, T1
+ vadduwm v7, v7, T1
+
+ vadduwm v8, v8, T2
+ vadduwm v9, v9, T2
+ vadduwm v10, v10, T2
+ vadduwm v11, v11, T2
+
+IF_BE(`
+ C Output always stored in little-endian byte order.
+ C Can reuse T0 and T1 to construct permutation mask.
+ li r9, 0
+ lvsl T0, r9, r9 C 00 01 02 03 ... 0c 0d 0e 0f
+ vspltisb T1, 0x03 C 03 03 03 03 ... 03 03 03 03
+ vxor T1, T1, T0 C 03 02 01 00 ... 0f 0e 0d 0c
+
+ forloop(i, 0, 15, `
+ vperm m4_unquote(v`'i), m4_unquote(v`'i), m4_unquote(v`'i), T1
+ ')
+')
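ChaCha output is defined as little-endian bytes regardless of host byte order, so on big-endian targets every 32-bit word is byte-reversed before being stored; the vperm mask built above does that for all 16 vectors. Per word, the effect matches this C sketch (bswap32 written out explicitly, not Nettle's macro):

    #include <stdint.h>

    /* Reverse the byte order of one 32-bit word. */
    static uint32_t
    bswap32(uint32_t x)
    {
      return (x >> 24) | ((x >> 8) & 0xff00)
           | ((x & 0xff00) << 8) | (x << 24);
    }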
+
+ stxvw4x VSR(v0), 0, DST
+ stxvw4x VSR(v4), r6, DST
+ stxvw4x VSR(v8), r7, DST
+ stxvw4x VSR(v12), r8, DST
+
+ addi DST, DST, 64
+
+ stxvw4x VSR(v1), 0, DST
+ stxvw4x VSR(v5), r6, DST
+ stxvw4x VSR(v9), r7, DST
+ stxvw4x VSR(v13), r8, DST
+
+ addi DST, DST, 64
+
+ stxvw4x VSR(v2), 0, DST
+ stxvw4x VSR(v6), r6, DST
+ stxvw4x VSR(v10), r7, DST
+ stxvw4x VSR(v14), r8, DST
+
+ addi DST, DST, 64
+
+ stxvw4x VSR(v3), 0, DST
+ stxvw4x VSR(v7), r6, DST
+ stxvw4x VSR(v11), r7, DST
+ stxvw4x VSR(v15), r8, DST
+
+ C Restore callee-save registers
+ lvx v20, 0, r10
+ lvx v21, r6, r10
+ lvx v22, r7, r10
+ lvx v23, r8, r10
+
+ blr
+EPILOGUE(_nettle_chacha_4core)
+
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_chacha_4core32)
+ vspltisw T2, 0 C Ignore counter carries
+ b .Lshared_entry
+EPILOGUE(_nettle_chacha_4core32)
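For orientation, a hedged sketch of how the two entry points above might be driven from C: src points at one 16-word ChaCha state, dst receives four consecutive 64-byte blocks, and the caller advances the counter by 4 between calls. The wrapper below is illustrative only, not Nettle's actual chacha_crypt loop:

    #include <stdint.h>
    #include <string.h>

    /* Entry points defined above; rounds = 20 gives standard ChaCha20. */
    void _nettle_chacha_4core(uint32_t *dst, const uint32_t *src, unsigned rounds);
    void _nettle_chacha_4core32(uint32_t *dst, const uint32_t *src, unsigned rounds);

    /* Illustrative: produce n * 256 bytes of keystream from state[16],
       treating words 12-13 as a 64-bit block counter. */
    static void
    keystream_4blocks(uint32_t state[16], uint8_t *out, size_t n)
    {
      uint32_t x[64];   /* 4 blocks of 16 words each */
      for (size_t i = 0; i < n; i++)
        {
          _nettle_chacha_4core(x, state, 20);
          memcpy(out + 256 * i, x, sizeof(x));
          if ((state[12] += 4) < 4)   /* carry into the high counter word */
            state[13]++;
        }
    }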
+
+ .section .rodata
+ ALIGN(16)
+.Lcnts: .long 0,1,2,3 C increments