author		Michael Weiser <michael.weiser@gmx.de>	2020-12-25 17:13:52 +0100
committer	Niels Möller <nisse@lysator.liu.se>	2021-01-13 13:22:28 +0100
commit		62dc4ce42fbebd7cad1f431dc6cd92bb66bf6242 (patch)
tree		bfb711651fdd40821b76c486eb9bf524a180f421 /arm/neon
parent		49cb4039be99d2d49f5c97edd22fc47976c34651 (diff)
download	nettle-62dc4ce42fbebd7cad1f431dc6cd92bb66bf6242.tar.gz
arm: Unify neon asm for big- and little-endian modes
Switch arm neon assembler routines to endianness-agnostic loads and stores where possible, to avoid modifications to the rest of the code. This means switching to vld1.32 for loading consecutive 32-bit words in host endianness, and to vst1.8 for storing back to memory in little-endian order as required by the caller. Where necessary, r3 holds the precalculated offset into the source vector for the secondary load operations.

vstm is kept for little-endian platforms because it is faster than vst1 on most ARM implementations. vst1.x (at least on the Allwinner A20's Cortex-A7 implementation) seems to interfere with itself on subsequent calls, slowing it down further. We therefore reschedule some instructions to issue stores as soon as results become available, so that other calculations or loads sit between consecutive vst1.x instructions. This reliably saves two cycles per block on salsa20 and chacha that would otherwise be incurred.

vld1.x does not seem to suffer from this, or at least not to the point where two consecutive vld1.x run slower than an equivalent vldm; rescheduling them similarly did not improve performance beyond that of vldm.

Signed-off-by: Michael Weiser <michael.weiser@gmx.de>
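The pattern applied to each routine can be summarized as follows. This is a minimal sketch rather than code taken verbatim from the patch; it assumes the DST/SRC/SRCp32 and X0..X3 defines and the IF_LE/IF_BE m4 conditionals used in the files below:

	C Endianness-agnostic load: vld1.32 reads 32-bit words in host order,
	C so the round computations need no IF_BE fixups.
	add	SRCp32, SRC, #32		C precalculated offset for the second load
	vld1.32	{X0,X1}, [SRC]
	vld1.32	{X2,X3}, [SRCp32]

	C ... rounds and final additions ...

	C Store: the caller expects little-endian output.  vst1.8 stores bytes
	C in memory order, so it is correct on big-endian; vstm stays on
	C little-endian, where it is faster.
IF_BE(`	vst1.8	{X0,X1}, [DST]!')
	C (arithmetic scheduled here separates consecutive vst1.8 instructions)
IF_BE(`	vst1.8	{X2,X3}, [DST]')
IF_LE(`	vstm	DST, {X0,X1,X2,X3}')

On little-endian builds only the vstm path runs and is unchanged; the vst1.8 path runs only on big-endian builds, where the interleaved arithmetic keeps consecutive vst1.8 instructions from stalling each other.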
Diffstat (limited to 'arm/neon')
-rw-r--r--	arm/neon/chacha-3core.asm	36
-rw-r--r--	arm/neon/chacha-core-internal.asm	47
-rw-r--r--	arm/neon/salsa20-2core.asm	28
-rw-r--r--	arm/neon/salsa20-core-internal.asm	59
4 files changed, 82 insertions, 88 deletions
diff --git a/arm/neon/chacha-3core.asm b/arm/neon/chacha-3core.asm
index bd1cf63c..c29c62a5 100644
--- a/arm/neon/chacha-3core.asm
+++ b/arm/neon/chacha-3core.asm
@@ -36,6 +36,7 @@ ifelse(`
define(`DST', `r0')
define(`SRC', `r1')
define(`ROUNDS', `r2')
+define(`SRCp32', `r3')
C State, X, Y and Z representing consecutive blocks
define(`X0', `q0')
@@ -64,10 +65,13 @@ define(`T3', `q7')
C _chacha_3core(uint32_t *dst, const uint32_t *src, unsigned rounds)
PROLOGUE(_nettle_chacha_3core)
- vldm SRC, {X0,X1,X2,X3}
+ C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words
+ add SRCp32, SRC, #32
+ vld1.32 {X0,X1}, [SRC]
+ vld1.32 {X2,X3}, [SRCp32]
vpush {q4,q5,q6,q7}
adr r12, .Lcount1
- vld1.64 {Z3}, [r12]
+ vld1.32 {Z3}, [r12]
vadd.i64 Y3, X3, Z3 C Increment 64-bit counter
vadd.i64 Z3, Y3, Z3
@@ -213,33 +217,49 @@ PROLOGUE(_nettle_chacha_3core)
vadd.i32 Y3, Y3, T2
vadd.i32 Z3, Z3, T3
- vldm SRC, {T0,T1,T2,T3}
+ vld1.32 {T0,T1}, [SRC]
vadd.i32 X0, X0, T0
vadd.i32 X1, X1, T1
+
+ C vst1.8 because caller expects results little-endian
+ C interleave loads, calculations and stores to save cycles on stores
+ C use vstm when little-endian for some additional speedup
+IF_BE(` vst1.8 {X0,X1}, [DST]!')
+
+ vld1.32 {T2,T3}, [SRCp32]
vadd.i32 X2, X2, T2
vadd.i32 X3, X3, T3
- vstmia DST!, {X0,X1,X2,X3}
+IF_BE(` vst1.8 {X2,X3}, [DST]!')
+IF_LE(` vstmia DST!, {X0,X1,X2,X3}')
vadd.i32 Y0, Y0, T0
vadd.i32 Y1, Y1, T1
+IF_BE(` vst1.8 {Y0,Y1}, [DST]!')
+
vadd.i32 Y2, Y2, T2
- vstmia DST!, {Y0,Y1,Y2,Y3}
+IF_BE(` vst1.8 {Y2,Y3}, [DST]!')
+IF_LE(` vstmia DST!, {Y0,Y1,Y2,Y3}')
vadd.i32 Z0, Z0, T0
vadd.i32 Z1, Z1, T1
+IF_BE(` vst1.8 {Z0,Z1}, [DST]!')
+
vadd.i32 Z2, Z2, T2
vpop {q4,q5,q6,q7}
- vstm DST, {Z0,Z1,Z2,Z3}
+IF_BE(` vst1.8 {Z2,Z3}, [DST]')
+IF_LE(` vstm DST, {Z0,Z1,Z2,Z3}')
bx lr
EPILOGUE(_nettle_chacha_3core)
PROLOGUE(_nettle_chacha_3core32)
- vldm SRC, {X0,X1,X2,X3}
+ add SRCp32, SRC, #32
+ vld1.32 {X0,X1}, [SRC]
+ vld1.32 {X2,X3}, [SRCp32]
vpush {q4,q5,q6,q7}
adr r12, .Lcount1
- vld1.64 {Z3}, [r12]
+ vld1.32 {Z3}, [r12]
vadd.i32 Y3, X3, Z3 C Increment 32-bit counter
vadd.i32 Z3, Y3, Z3
diff --git a/arm/neon/chacha-core-internal.asm b/arm/neon/chacha-core-internal.asm
index b0a775bd..5095be6a 100644
--- a/arm/neon/chacha-core-internal.asm
+++ b/arm/neon/chacha-core-internal.asm
@@ -83,7 +83,9 @@ define(`QROUND', `
C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
PROLOGUE(_nettle_chacha_core)
- vldm SRC, {X0,X1,X2,X3}
+ C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words
+ vld1.32 {X0,X1}, [SRC]! C SRC changed!
+ vld1.32 {X2,X3}, [SRC]
vmov S0, X0
vmov S1, X1
@@ -96,15 +98,6 @@ PROLOGUE(_nettle_chacha_core)
C 8 9 10 11 X2
C 12 13 14 15 X3
- C Input rows big-endian:
- C 1 0 3 2 X0
- C 5 4 7 6 X1
- C 9 8 11 10 X2
- C 13 12 15 14 X3
- C even and odd columns switched because
- C vldm loads consecutive doublewords and
- C switches words inside them to make them BE
-
.Loop:
QROUND(X0, X1, X2, X3)
@@ -113,44 +106,32 @@ PROLOGUE(_nettle_chacha_core)
C 5 6 7 4 >>> 3
C 10 11 8 9 >>> 2
C 15 12 13 14 >>> 1
-
- C In big-endian rotate rows, to get
- C 1 0 3 2
- C 6 5 4 7 >>> 1
- C 11 10 9 8 >>> 2
- C 12 15 14 13 >>> 3
- C different number of elements needs to be
- C extracted on BE because of different column order
-IF_LE(` vext.32 X1, X1, X1, #1')
-IF_BE(` vext.32 X1, X1, X1, #3')
+ vext.32 X1, X1, X1, #1
vext.32 X2, X2, X2, #2
-IF_LE(` vext.32 X3, X3, X3, #3')
-IF_BE(` vext.32 X3, X3, X3, #1')
+ vext.32 X3, X3, X3, #3
QROUND(X0, X1, X2, X3)
subs ROUNDS, ROUNDS, #2
C Inverse rotation
-IF_LE(` vext.32 X1, X1, X1, #3')
-IF_BE(` vext.32 X1, X1, X1, #1')
+ vext.32 X1, X1, X1, #3
vext.32 X2, X2, X2, #2
-IF_LE(` vext.32 X3, X3, X3, #1')
-IF_BE(` vext.32 X3, X3, X3, #3')
+ vext.32 X3, X3, X3, #1
bhi .Loop
vadd.u32 X0, X0, S0
vadd.u32 X1, X1, S1
+
+ C vst1.8 because caller expects results little-endian
+ C use vstm when little-endian for some additional speedup
+IF_BE(` vst1.8 {X0,X1}, [DST]!')
+
vadd.u32 X2, X2, S2
vadd.u32 X3, X3, S3
- C caller expects result little-endian
-IF_BE(` vrev32.u8 X0, X0
- vrev32.u8 X1, X1
- vrev32.u8 X2, X2
- vrev32.u8 X3, X3')
-
- vstm DST, {X0,X1,X2,X3}
+IF_BE(` vst1.8 {X2,X3}, [DST]')
+IF_LE(` vstm DST, {X0,X1,X2,X3}')
bx lr
EPILOGUE(_nettle_chacha_core)
diff --git a/arm/neon/salsa20-2core.asm b/arm/neon/salsa20-2core.asm
index b3fe7e94..4d9da79b 100644
--- a/arm/neon/salsa20-2core.asm
+++ b/arm/neon/salsa20-2core.asm
@@ -36,6 +36,7 @@ ifelse(`
define(`DST', `r0')
define(`SRC', `r1')
define(`ROUNDS', `r2')
+define(`SRCp32', `r3')
C State, even elements in X, odd elements in Y
define(`X0', `q0')
@@ -58,11 +59,14 @@ define(`T3', `q15')
C _salsa20_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
PROLOGUE(_nettle_salsa20_2core)
- vldm SRC, {X0,X1,X2,X3}
+ C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words
+ add SRCp32, SRC, #32
+ vld1.32 {X0,X1}, [SRC]
+ vld1.32 {X2,X3}, [SRCp32]
adr r12, .Lcount1
vmov Y3, X0
- vld1.64 {Y1}, [r12]
+ vld1.32 {Y1}, [r12]
vmov Y0, X1
vadd.i64 Y1, Y1, X2 C Increment counter
vmov Y2, X3
@@ -180,7 +184,8 @@ C Inverse swaps and transpositions
vswp D1REG(Y0), D1REG(Y2)
vswp D1REG(Y1), D1REG(Y3)
- vldm SRC, {T0,T1,T2,T3}
+ vld1.32 {T0,T1}, [SRC]
+ vld1.32 {T2,T3}, [SRCp32]
vtrn.32 X0, Y3
vtrn.32 X1, Y0
@@ -190,17 +195,26 @@ C Inverse swaps and transpositions
C Add in the original context
vadd.i32 X0, X0, T0
vadd.i32 X1, X1, T1
+
+C vst1.8 because caller expects results little-endian
+C interleave loads, calculations and stores to save cycles on stores
+C use vstm when little-endian for some additional speedup
+IF_BE(` vst1.8 {X0,X1}, [DST]!')
+
vadd.i32 X2, X2, T2
vadd.i32 X3, X3, T3
+IF_BE(` vst1.8 {X2,X3}, [DST]!')
+IF_LE(` vstmia DST!, {X0,X1,X2,X3}')
- vstmia DST!, {X0,X1,X2,X3}
- vld1.64 {X0}, [r12]
+ vld1.32 {X0}, [r12]
vadd.i32 T0, T0, Y3
vadd.i64 T2, T2, X0
vadd.i32 T1, T1, Y0
+IF_BE(` vst1.8 {T0,T1}, [DST]!')
+
vadd.i32 T2, T2, Y1
vadd.i32 T3, T3, Y2
-
- vstm DST, {T0,T1,T2,T3}
+IF_BE(` vst1.8 {T2,T3}, [DST]')
+IF_LE(` vstm DST, {T0,T1,T2,T3}')
bx lr
EPILOGUE(_nettle_salsa20_2core)
diff --git a/arm/neon/salsa20-core-internal.asm b/arm/neon/salsa20-core-internal.asm
index d59d7b80..c5785da4 100644
--- a/arm/neon/salsa20-core-internal.asm
+++ b/arm/neon/salsa20-core-internal.asm
@@ -36,6 +36,7 @@ ifelse(`
define(`DST', `r0')
define(`SRC', `r1')
define(`ROUNDS', `r2')
+define(`SRCp32', `r3')
define(`X0', `q0')
define(`X1', `q1')
@@ -86,7 +87,10 @@ define(`QROUND', `
C _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
PROLOGUE(_nettle_salsa20_core)
- vldm SRC, {X0,X1,X2,X3}
+ C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words
+ add SRCp32, SRC, #32
+ vld1.32 {X0,X1}, [SRC]
+ vld1.32 {X2,X3}, [SRCp32]
C Input rows little-endian:
C 0 1 2 3 X0
@@ -99,23 +103,10 @@ PROLOGUE(_nettle_salsa20_core)
C 8 13 2 7
C 12 1 6 11
- C Input rows big-endian:
- C 1 0 3 2 X0
- C 5 4 7 6 X1
- C 9 8 11 10 X2
- C 13 12 15 14 X3
- C even and odd columns switched because
- C vldm loads consecutive doublewords and
- C switches words inside them to make them BE
- C Permuted to:
- C 5 0 15 10
- C 9 4 3 14
- C 13 8 7 2
- C 1 12 11 6
-
C FIXME: Construct in some other way?
adr r12, .Lmasks
- vldm r12, {M0101, M0110, M0011}
+ vld1.32 {M0101, M0110}, [r12]!
+ vld1.32 {M0011}, [r12]
vmov S1, X1
vmov S2, X2
@@ -160,29 +151,17 @@ PROLOGUE(_nettle_salsa20_core)
C 3 4 9 14 >>> 1
C 2 7 8 13 >>> 2
C 1 6 11 12 >>> 3
-
- C In big-endian rotate rows, to get
- C 5 0 15 10
- C 4 3 14 9 >>> 3
- C 7 2 13 8 >>> 2
- C 6 1 12 11 >>> 1
- C different number of elements needs to be
- C extracted on BE because of different column order
-IF_LE(` vext.32 X1, X1, X1, #3')
-IF_BE(` vext.32 X1, X1, X1, #1')
+ vext.32 X1, X1, X1, #3
vext.32 X2, X2, X2, #2
-IF_LE(` vext.32 X3, X3, X3, #1')
-IF_BE(` vext.32 X3, X3, X3, #3')
+ vext.32 X3, X3, X3, #1
QROUND(X0, X3, X2, X1)
subs ROUNDS, ROUNDS, #2
C Inverse rotation
-IF_LE(` vext.32 X1, X1, X1, #1')
-IF_BE(` vext.32 X1, X1, X1, #3')
+ vext.32 X1, X1, X1, #1
vext.32 X2, X2, X2, #2
-IF_LE(` vext.32 X3, X3, X3, #3')
-IF_BE(` vext.32 X3, X3, X3, #1')
+ vext.32 X3, X3, X3, #3
bhi .Loop
@@ -202,19 +181,19 @@ IF_BE(` vext.32 X3, X3, X3, #1')
vbit X2, X3, M0101
vbit X3, T1, M0101
- vld1.64 {T0}, [SRC]
+ vld1.32 {T0}, [SRC]
vadd.u32 X0, X0, T0
vadd.u32 X1, X1, S1
+
+ C vst1.8 because caller expects results little-endian
+ C use vstm when little-endian for some additional speedup
+IF_BE(` vst1.8 {X0,X1}, [DST]!')
+
vadd.u32 X2, X2, S2
vadd.u32 X3, X3, S3
- C caller expects result little-endian
-IF_BE(` vrev32.u8 X0, X0
- vrev32.u8 X1, X1
- vrev32.u8 X2, X2
- vrev32.u8 X3, X3')
-
- vstm DST, {X0,X1,X2,X3}
+IF_BE(` vst1.8 {X2,X3}, [DST]')
+IF_LE(` vstm DST, {X0,X1,X2,X3}')
bx lr
EPILOGUE(_nettle_salsa20_core)