diff options
author | Niels Möller <nisse@lysator.liu.se> | 2020-07-06 10:57:25 +0200 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2020-07-06 23:14:59 +0200 |
commit | 2ac58a1ce729a6cfe1d3703f4deb6da8862909e9 (patch) | |
tree | 9bc6d4b0ed52835a75d7b6372e88ae0793a9c44a /arm | |
parent | 8e3e05b1eb48f8e6f49d1e88a6b7c78cb7307a00 (diff) | |
download | nettle-2ac58a1ce729a6cfe1d3703f4deb6da8862909e9.tar.gz |
Two-way interleaving of salsa20 on Neon
Diffstat (limited to 'arm')
-rw-r--r-- | arm/neon/salsa20-2core.asm | 206 |
1 files changed, 206 insertions, 0 deletions
diff --git a/arm/neon/salsa20-2core.asm b/arm/neon/salsa20-2core.asm
new file mode 100644
index 00000000..cdb6133a
--- /dev/null
+++ b/arm/neon/salsa20-2core.asm
@@ -0,0 +1,206 @@
+C arm/neon/salsa20-2core.asm
+
+ifelse(<
+   Copyright (C) 2020 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program. If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+        .file "salsa20-2core.asm"
+        .fpu neon
+
+define(<DST>, <r0>)
+define(<SRC>, <r1>)
+define(<ROUNDS>, <r2>)
+
+C State, even elements in X, odd elements in Y
+define(<X0>, <q0>)
+define(<X1>, <q1>)
+define(<X2>, <q2>)
+define(<X3>, <q3>)
+define(<Y0>, <q8>)
+define(<Y1>, <q9>)
+define(<Y2>, <q10>)
+define(<Y3>, <q11>)
+define(<T0>, <q12>)
+define(<T1>, <q13>)
+define(<T2>, <q14>)
+define(<T3>, <q15>)
+
+        .text
+        .align 4
+.Lcount1:
+        .int 1,0,0,0            C Counter increment; applied with vadd.i64 to words 8-11
+C Writes two 64-byte blocks to dst: one from the 16 words at src, one from the same words with the counter incremented.
+        C _salsa20_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+PROLOGUE(_nettle_salsa20_2core)
+        vldm SRC, {X0,X1,X2,X3}         C Load the 16 input words (0-3, 4-7, 8-11, 12-15)
+        adr r12, .Lcount1               C r12 is read again after the loop
+
+        vmov Y3, X0                     C Y registers start as a copy of the input ...
+        vld1.64 {Y1}, [r12]
+        vmov Y0, X1
+        vadd.i64 Y1, Y1, X2             C Increment counter (... except words 8-11, which get .Lcount1 added)
+        vmov Y2, X3
+
+        vtrn.32 X0, Y3                  C X0:  0  0  2  2  Y3:  1  1  3  3
+        vtrn.32 X1, Y0                  C X1:  4  4  6  6  Y0:  5  5  7  7
+        vtrn.32 X2, Y1                  C X2:  8  8 10 10  Y1:  9  9 11 11
+        vtrn.32 X3, Y2                  C X3: 12 12 14 14  Y2: 13 13 15 15
+
+        C Swap, to get
+        C  X0:  0 10  Y0:  5 15
+        C  X1:  4 14  Y1:  9  3
+        C  X2:  8  2  Y2: 13  7
+        C  X3: 12  6  Y3:  1 11
+        vswp D1REG(X0), D1REG(X2)
+        vswp D1REG(X1), D1REG(X3)
+        vswp D1REG(Y0), D1REG(Y2)
+        vswp D1REG(Y1), D1REG(Y3)
+
+.Loop:
+C Register layout (A is first block, B is second block)
+C
+C X0:  A0  B0 A10 B10  Y0:  A5  B5 A15 B15
+C X1:  A4  B4 A14 B14  Y1:  A9  B9  A3  B3
+C X2:  A8  B8  A2  B2  Y2: A13 B13  A7  B7
+C X3: A12 B12  A6  B6  Y3:  A1  B1 A11 B11
+
+        vadd.i32 T0, X0, X3             C X chain uses T0/T1, Y chain uses T2/T3; the two are interleaved
+        vshl.i32 T1, T0, #7             C vshl + vsri pair: T1 = T0 rotated left by 7
+        vadd.i32 T2, Y0, Y3
+        vsri.u32 T1, T0, #25
+        vshl.i32 T3, T2, #7
+        veor X1, X1, T1
+        vsri.u32 T3, T2, #25
+        vadd.i32 T0, X1, X0
+        veor Y1, Y1, T3
+        vshl.i32 T1, T0, #9             C Rotate left by 9
+        vadd.i32 T2, Y1, Y0
+        vsri.u32 T1, T0, #23
+        vshl.i32 T3, T2, #9
+        veor X2, X2, T1
+        vsri.u32 T3, T2, #23
+        vadd.i32 T0, X2, X1
+        veor Y2, Y2, T3
+        vshl.i32 T1, T0, #13            C Rotate left by 13
+        vadd.i32 T2, Y2, Y1
+        vsri.u32 T1, T0, #19
+        vshl.i32 T3, T2, #13
+        veor X3, X3, T1
+        vsri.u32 T3, T2, #19
+        vadd.i32 T0, X3, X2
+        veor Y3, Y3, T3
+        vshl.i32 T1, T0, #18            C Rotate left by 18
+        vadd.i32 T2, Y3, Y2
+        vext.32 Y1, Y1, Y1, #2          C Swap the 64-bit halves of Y1, for the layout below
+        vsri.u32 T1, T0, #14
+        vshl.i32 T3, T2, #18
+        vext.32 Y2, Y2, Y2, #2
+        veor X0, X0, T1
+        vsri.u32 T3, T2, #14
+        vext.32 X3, X3, X3, #2
+        veor Y0, Y0, T3
+
+C Register layout:
+C X0:  A0  B0 A10 B10  Y0:  A5  B5 A15 B15
+C Y1:  A3  B3  A9  B9  X1:  A4  B4 A14 B14 (Y1 swapped)
+C X2:  A2  B2  A8  B8  Y2:  A7  B7 A13 B13 (X2, Y2 swapped)
+C Y3:  A1  B1 A11 B11  X3:  A6  B6 A12 B12 (X3 swapped)
+
+        vadd.i32 T0, X0, Y1
+        vext.32 X2, X2, X2, #2          C Last of the half swaps listed above
+        vshl.i32 T1, T0, #7
+        vadd.i32 T2, Y0, X1
+        vsri.u32 T1, T0, #25
+        vshl.i32 T3, T2, #7
+        veor Y3, Y3, T1
+        vsri.u32 T3, T2, #25
+        vadd.i32 T0, Y3, X0
+        veor X3, X3, T3
+        vshl.i32 T1, T0, #9
+        vadd.i32 T2, X3, Y0
+        vsri.u32 T1, T0, #23
+        vshl.i32 T3, T2, #9
+        veor X2, X2, T1
+        vsri.u32 T3, T2, #23
+        vadd.i32 T0, X2, Y3
+        veor Y2, Y2, T3
+        vshl.i32 T1, T0, #13
+        vadd.i32 T2, Y2, X3
+        vsri.u32 T1, T0, #19
+        vshl.i32 T3, T2, #13
+        veor Y1, Y1, T1
+        vsri.u32 T3, T2, #19
+        vadd.i32 T0, Y1, X2
+        veor X1, X1, T3
+        vext.32 X2, X2, X2, #2          C This and the three vext below restore the layout expected at .Loop
+        vshl.i32 T1, T0, #18
+        vadd.i32 T2, X1, Y2
+        vext.32 Y1, Y1, Y1, #2
+        vsri.u32 T1, T0, #14
+        subs ROUNDS, ROUNDS, #2         C ROUNDS -= 2 per pass; the flags are tested by bhi below
+        vshl.i32 T3, T2, #18
+        vext.32 X3, X3, X3, #2
+        veor X0, X0, T1
+        vsri.u32 T3, T2, #14
+        vext.32 Y2, Y2, Y2, #2
+        veor Y0, Y0, T3
+
+        bhi .Loop                       C Repeat while the subtraction left ROUNDS above zero (unsigned)
+
+C Inverse swaps and transpositions
+
+        vswp D1REG(X0), D1REG(X2)
+        vswp D1REG(X1), D1REG(X3)
+        vswp D1REG(Y0), D1REG(Y2)
+        vswp D1REG(Y1), D1REG(Y3)
+
+        vldm SRC, {T0,T1,T2,T3}         C Reload the original input words
+
+        vtrn.32 X0, Y3                  C Afterwards X0-X3 hold block A; Y3, Y0, Y1, Y2 hold block B
+        vtrn.32 X1, Y0
+        vtrn.32 X2, Y1
+        vtrn.32 X3, Y2
+
+C Add in the original context
+        vadd.i32 X0, X0, T0
+        vadd.i32 X1, X1, T1
+        vadd.i32 X2, X2, T2
+        vadd.i32 X3, X3, T3
+
+        vstmia DST!, {X0,X1,X2,X3}      C Store block A; DST is advanced past it
+        vld1.64 {X0}, [r12]             C X0 is free after the store; reuse it for .Lcount1
+        vadd.i32 T0, T0, Y3
+        vadd.i64 T2, T2, X0             C Block B started from the incremented counter
+        vadd.i32 T1, T1, Y0
+        vadd.i32 T2, T2, Y1
+        vadd.i32 T3, T3, Y2
+
+        vstm DST, {T0,T1,T2,T3}         C Store block B directly after block A
+        bx lr
+EPILOGUE(_nettle_salsa20_2core) |