author		Niels Möller <nisse@lysator.liu.se>	2020-11-23 17:25:19 +0100
committer	Niels Möller <nisse@lysator.liu.se>	2020-11-23 17:25:19 +0100
commit		da40585672c6c113d5d9c2eefc73d21828bfe759
tree		ade9d2626034918fa9a931291fe9a2978cad58e6 /powerpc64
parent		c4e859750a1580491f227287400b99e1d1d82319
ppc: Implement _chacha_2core.
* powerpc64/p7/chacha-2core.asm: New file.
Diffstat (limited to 'powerpc64')
-rw-r--r--	powerpc64/p7/chacha-2core.asm	249
1 file changed, 249 insertions, 0 deletions
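For reference, _chacha_2core computes two ChaCha blocks per call: one from
src as given, and one with the block counter incremented by one. Below is a
minimal C sketch of these semantics (ROTL32, QR, chacha_core and
chacha_2core_ref are illustrative names, not Nettle's internal API):

    #include <stdint.h>

    #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

    /* The ChaCha quarter round, with the same 16/12/8/7 rotate counts
       that appear in ROT16/ROT12/ROT8/ROT7 below. */
    #define QR(a, b, c, d) do {                 \
        a += b; d ^= a; d = ROTL32(d, 16);      \
        c += d; b ^= c; b = ROTL32(b, 12);      \
        a += b; d ^= a; d = ROTL32(d, 8);       \
        c += d; b ^= c; b = ROTL32(b, 7);       \
      } while (0)

    /* One ChaCha core: permute the 16-word state, then add the input. */
    static void
    chacha_core (uint32_t *dst, const uint32_t *src, unsigned rounds)
    {
      uint32_t x[16];
      unsigned i;

      for (i = 0; i < 16; i++)
        x[i] = src[i];

      for (i = 0; i < rounds; i += 2)
        {
          /* Column round */
          QR(x[0], x[4], x[8],  x[12]);
          QR(x[1], x[5], x[9],  x[13]);
          QR(x[2], x[6], x[10], x[14]);
          QR(x[3], x[7], x[11], x[15]);
          /* Diagonal round */
          QR(x[0], x[5], x[10], x[15]);
          QR(x[1], x[6], x[11], x[12]);
          QR(x[2], x[7], x[8],  x[13]);
          QR(x[3], x[4], x[9],  x[14]);
        }
      for (i = 0; i < 16; i++)
        dst[i] = x[i] + src[i];
    }

    /* Two blocks at once; the second uses the 64-bit counter in
       src[12], src[13] incremented by one.  (_chacha_2core32 instead
       lets the 32-bit counter in src[12] wrap, with no carry.)  */
    static void
    chacha_2core_ref (uint32_t *dst, const uint32_t *src, unsigned rounds)
    {
      uint32_t s2[16];
      unsigned i;

      for (i = 0; i < 16; i++)
        s2[i] = src[i];
      if (++s2[12] == 0)
        s2[13]++;

      chacha_core (dst, src, rounds);
      chacha_core (dst + 16, s2, rounds);
    }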
diff --git a/powerpc64/p7/chacha-2core.asm b/powerpc64/p7/chacha-2core.asm
new file mode 100644
index 00000000..725d72af
--- /dev/null
+++ b/powerpc64/p7/chacha-2core.asm
@@ -0,0 +1,249 @@
+C powerpc64/p7/chacha-2core.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `r3')
+define(`SRC', `r4')
+define(`ROUNDS', `r5')
+
+C State, even elements in X, odd elements in Y
+define(`X0', `v0')
+define(`X1', `v1')
+define(`X2', `v2')
+define(`X3', `v3')
+define(`Y0', `v4')
+define(`Y1', `v5')
+define(`Y2', `v6')
+define(`Y3', `v7')
+
+define(`ROT16', `v8')
+define(`ROT12', `v9')
+define(`ROT8', `v10')
+define(`ROT7', `v11')
+
+C Original input state
+define(`S0', `v12')
+define(`S1', `v13')
+define(`S2', `v14')
+define(`S3', `v15')
+define(`S3p1', `v16')
+
+define(`T0', `v17')
+
+ .text
+ C _chacha_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_chacha_2core)
+
+	li r8, 0x30	C offset for X3
+ vspltisw X1, 1 C {1,1,...,1}
+ vspltisw X0, 0 C {0,0,...,0}
+ vsldoi X1, X1, X0, 12 C {1,0,...,0}
+
+ lxvw4x VSR(X3), r8, SRC
+
+ vsubuwm X0, X0, X1 C {-1,0,...,0}
+ vcmpequw Y3, X3, X0
+ vand Y3, Y3, X1 C Counter carry out
+ vsldoi Y3, Y3, Y3, 12
+ vor Y3, Y3, X1
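+C Y3 = {1, carry, 0, 0}; adding it to X3 at .Lshared_entry forms the
+C second block's row 3, with the 64-bit counter incremented by one.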
+
+.Lshared_entry:
+ vadduwm Y3, Y3, X3
+
+ li r6, 0x10 C set up some...
+ li r7, 0x20 C ...useful...
+ lxvw4x VSR(X0), 0, SRC
+ lxvw4x VSR(X1), r6, SRC
+ lxvw4x VSR(X2), r7, SRC
+
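+C vor with identical sources is a register copy; S0-S3 and S3p1 keep
+C the original input state for the final additions after the rounds.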
+ vor S0, X0, X0
+ vor S1, X1, X1
+ vor S2, X2, X2
+ vor S3, X3, X3
+ vor S3p1, Y3, Y3
+
+ vmrgow Y0, X0, X0 C 1 1 3 3
+ vmrgew X0, X0, X0 C 0 0 2 2
+ vmrgow Y1, X1, X1 C 5 5 7 7
+ vmrgew X1, X1, X1 C 4 4 6 6
+ vmrgow Y2, X2, X2 C 9 9 11 11
+ vmrgew X2, X2, X2 C 8 8 10 10
+ vmrgow Y3, X3, S3p1 C 13 13 15 15
+ vmrgew X3, X3, S3p1 C 12 12 14 14
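+C After the merges, lanes 0,2 of each register serve block A and
+C lanes 1,3 block B.  The blocks differ only in row 3, where S3p1
+C supplies B's incremented counter.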
+
+C vspltisw's immediate is a signed 5-bit field, so 16 is not
+C encodable; -16 works because vrlw uses only the low 5 bits of
+C each rotate count (-16 = 16 mod 32).
+	vspltisw ROT16, -16
+	vspltisw ROT12, 12
+	vspltisw ROT8, 8
+	vspltisw ROT7, 7
+
+ srdi ROUNDS, ROUNDS, 1
+ mtctr ROUNDS
+.Loop:
+C Register layout (A is first block, B is second block)
+C
+C X0: A0 B0 A2 B2 Y0: A1 B1 A3 B3
+C X1: A4 B4 A6 B6 Y1: A5 B5 A7 B7
+C X2: A8 B8 A10 B10 Y2: A9 B9 A11 B11
+C X3: A12 B12 A14 B14 Y3: A13 B13 A15 B15
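+C Column round, on both blocks in parallel:
+C   a += b; d ^= a; d <<<= 16;  c += d; b ^= c; b <<<= 12;
+C   a += b; d ^= a; d <<<= 8;   c += d; b ^= c; b <<<= 7.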
+ vadduwm X0, X0, X1
+ vadduwm Y0, Y0, Y1
+ vxor X3, X3, X0
+ vxor Y3, Y3, Y0
+ vrlw X3, X3, ROT16
+ vrlw Y3, Y3, ROT16
+
+ vadduwm X2, X2, X3
+ vadduwm Y2, Y2, Y3
+ vxor X1, X1, X2
+ vxor Y1, Y1, Y2
+ vrlw X1, X1, ROT12
+ vrlw Y1, Y1, ROT12
+
+ vadduwm X0, X0, X1
+ vadduwm Y0, Y0, Y1
+ vxor X3, X3, X0
+ vxor Y3, Y3, Y0
+ vrlw X3, X3, ROT8
+ vrlw Y3, Y3, ROT8
+
+ vadduwm X2, X2, X3
+ vadduwm Y2, Y2, Y3
+ vxor X1, X1, X2
+ vxor Y1, Y1, Y2
+ vrlw X1, X1, ROT7
+ vrlw Y1, Y1, ROT7
+
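+C Swap the doubleword halves of X1, X2, Y2 and Y3, lining the
+C diagonals up in columns so the diagonal round below can reuse the
+C same instruction pattern.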
+ vsldoi X1, X1, X1, 8
+ vsldoi X2, X2, X2, 8
+ vsldoi Y2, Y2, Y2, 8
+ vsldoi Y3, Y3, Y3, 8
+
+C Register layout:
+C X0: A0 B0 A2 B2 Y0: A1 B1 A3 B3
+C Y1: A5 B5 A7 B7	X1: A6 B6 A4 B4		(X1 swapped)
+C X2: A10 B10 A8 B8	Y2: A11 B11 A9 B9	(X2, Y2 swapped)
+C Y3: A15 B15 A13 B13	X3: A12 B12 A14 B14	(Y3 swapped)
+
+ vadduwm X0, X0, Y1
+ vadduwm Y0, Y0, X1
+ vxor Y3, Y3, X0
+ vxor X3, X3, Y0
+ vrlw Y3, Y3, ROT16
+ vrlw X3, X3, ROT16
+
+ vadduwm X2, X2, Y3
+ vadduwm Y2, Y2, X3
+ vxor Y1, Y1, X2
+ vxor X1, X1, Y2
+ vrlw Y1, Y1, ROT12
+ vrlw X1, X1, ROT12
+
+ vadduwm X0, X0, Y1
+ vadduwm Y0, Y0, X1
+ vxor Y3, Y3, X0
+ vxor X3, X3, Y0
+ vrlw Y3, Y3, ROT8
+ vrlw X3, X3, ROT8
+
+ vadduwm X2, X2, Y3
+ vadduwm Y2, Y2, X3
+ vxor Y1, Y1, X2
+ vxor X1, X1, Y2
+ vrlw Y1, Y1, ROT7
+ vrlw X1, X1, ROT7
+
+ vsldoi X1, X1, X1, 8
+ vsldoi X2, X2, X2, 8
+ vsldoi Y2, Y2, Y2, 8
+ vsldoi Y3, Y3, Y3, 8
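+C The same half-register swaps restore the layout from the top of
+C the loop.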
+
+ bdnz .Loop
+
+ vmrgew T0, X0, Y0
+ vmrgow Y0, X0, Y0
+
+ vmrgew X0, X1, Y1
+ vmrgow Y1, X1, Y1
+
+ vmrgew X1, X2, Y2
+ vmrgow Y2, X2, Y2
+
+ vmrgew X2, X3, Y3
+ vmrgow Y3, X3, Y3
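+C De-interleave: T0, X0, X1, X2 now hold rows 0-3 of block A, and
+C Y0, Y1, Y2, Y3 rows 0-3 of block B.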
+
+ vadduwm T0, T0, S0
+ vadduwm Y0, Y0, S0
+ vadduwm X0, X0, S1
+ vadduwm Y1, Y1, S1
+ vadduwm X1, X1, S2
+ vadduwm Y2, Y2, S2
+ vadduwm X2, X2, S3
+ vadduwm Y3, Y3, S3p1
+
+ stxvw4x VSR(T0), 0, DST
+ stxvw4x VSR(X0), r6, DST
+ stxvw4x VSR(X1), r7, DST
+ stxvw4x VSR(X2), r8, DST
+
+ addi DST, DST, 64
+
+ stxvw4x VSR(Y0), 0, DST
+ stxvw4x VSR(Y1), r6, DST
+ stxvw4x VSR(Y2), r7, DST
+ stxvw4x VSR(Y3), r8, DST
+ blr
+EPILOGUE(_nettle_chacha_2core)
+
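+C Entry point for the 32-bit counter variant: src[12] wraps modulo
+C 2^32, so no carry into src[13] is computed.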
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_chacha_2core32)
+	li r8, 0x30	C offset for X3
+ vspltisw Y3, 1 C {1,1,...,1}
+ vspltisw X0, 0 C {0,0,...,0}
+ vsldoi Y3, Y3, X0, 12 C {1,0,...,0}
+ lxvw4x VSR(X3), r8, SRC
+ b .Lshared_entry
+EPILOGUE(_nettle_chacha_2core32)
+
+divert(-1)
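+C gdb macro for dumping the working state; VSX registers vs32-vs39
+C overlay vector registers v0-v7 (X0-X3, Y0-Y3).  divert(-1) keeps
+C this section out of the assembled output.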
+define core2state
+p/x $vs32.v4_int32
+p/x $vs33.v4_int32
+p/x $vs34.v4_int32
+p/x $vs35.v4_int32
+p/x $vs36.v4_int32
+p/x $vs37.v4_int32
+p/x $vs38.v4_int32
+p/x $vs39.v4_int32
+end