diff options
author | Michael Weiser <michael.weiser@gmx.de> | 2020-12-25 17:13:52 +0100 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2021-01-13 13:22:28 +0100 |
commit | 62dc4ce42fbebd7cad1f431dc6cd92bb66bf6242 (patch) | |
tree | bfb711651fdd40821b76c486eb9bf524a180f421 /arm/neon | |
parent | 49cb4039be99d2d49f5c97edd22fc47976c34651 (diff) | |
download | nettle-62dc4ce42fbebd7cad1f431dc6cd92bb66bf6242.tar.gz |
arm: Unify neon asm for big- and little-endian modes
Switch arm neon assembler routines to endianness-agnostic loads and
stores where possible to avoid modifications to the rest of the code.
This involves switching to vld1.32 for loading consecutive 32-bit words
in host endianness as well as vst1.8 for storing back to memory in
little-endian order as required by the caller. Where necessary, r3 is
used to store the precalculated offset into the source vector for the
secondary load operations. vstm is kept for little-endian platforms
because it is faster than vst1 on most ARM implementations.
vst1.x (at least on the Allwinner A20 Cortex-A7 implementation) seems to
interfere with itself on subsequent calls, slowing it down further. So we
reschedule some instructions to do stores as soon as results become
available to have some other calculations or loads before the next
vst1.x. This reliably saves two additional cycles per block on salsa20
and chacha which would otherwise be incurred.
vld1.x does not seem to suffer from this or at least not to a level
where two consecutive vld1.x run slower than an equivalent vldm.
Rescheduling them similarly did not improve performance beyond that of
vldm.
Signed-off-by: Michael Weiser <michael.weiser@gmx.de>
Diffstat (limited to 'arm/neon')
-rw-r--r-- | arm/neon/chacha-3core.asm | 36 | ||||
-rw-r--r-- | arm/neon/chacha-core-internal.asm | 47 | ||||
-rw-r--r-- | arm/neon/salsa20-2core.asm | 28 | ||||
-rw-r--r-- | arm/neon/salsa20-core-internal.asm | 59 |
4 files changed, 82 insertions, 88 deletions
diff --git a/arm/neon/chacha-3core.asm b/arm/neon/chacha-3core.asm index bd1cf63c..c29c62a5 100644 --- a/arm/neon/chacha-3core.asm +++ b/arm/neon/chacha-3core.asm @@ -36,6 +36,7 @@ ifelse(` define(`DST', `r0') define(`SRC', `r1') define(`ROUNDS', `r2') +define(`SRCp32', `r3') C State, X, Y and Z representing consecutive blocks define(`X0', `q0') @@ -64,10 +65,13 @@ define(`T3', `q7') C _chacha_3core(uint32_t *dst, const uint32_t *src, unsigned rounds) PROLOGUE(_nettle_chacha_3core) - vldm SRC, {X0,X1,X2,X3} + C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words + add SRCp32, SRC, #32 + vld1.32 {X0,X1}, [SRC] + vld1.32 {X2,X3}, [SRCp32] vpush {q4,q5,q6,q7} adr r12, .Lcount1 - vld1.64 {Z3}, [r12] + vld1.32 {Z3}, [r12] vadd.i64 Y3, X3, Z3 C Increment 64-bit counter vadd.i64 Z3, Y3, Z3 @@ -213,33 +217,49 @@ PROLOGUE(_nettle_chacha_3core) vadd.i32 Y3, Y3, T2 vadd.i32 Z3, Z3, T3 - vldm SRC, {T0,T1,T2,T3} + vld1.32 {T0,T1}, [SRC] vadd.i32 X0, X0, T0 vadd.i32 X1, X1, T1 + + C vst1.8 because caller expects results little-endian + C interleave loads, calculations and stores to save cycles on stores + C use vstm when little-endian for some additional speedup +IF_BE(` vst1.8 {X0,X1}, [DST]!') + + vld1.32 {T2,T3}, [SRCp32] vadd.i32 X2, X2, T2 vadd.i32 X3, X3, T3 - vstmia DST!, {X0,X1,X2,X3} +IF_BE(` vst1.8 {X2,X3}, [DST]!') +IF_LE(` vstmia DST!, {X0,X1,X2,X3}') vadd.i32 Y0, Y0, T0 vadd.i32 Y1, Y1, T1 +IF_BE(` vst1.8 {Y0,Y1}, [DST]!') + vadd.i32 Y2, Y2, T2 - vstmia DST!, {Y0,Y1,Y2,Y3} +IF_BE(` vst1.8 {Y2,Y3}, [DST]!') +IF_LE(` vstmia DST!, {Y0,Y1,Y2,Y3}') vadd.i32 Z0, Z0, T0 vadd.i32 Z1, Z1, T1 +IF_BE(` vst1.8 {Z0,Z1}, [DST]!') + vadd.i32 Z2, Z2, T2 vpop {q4,q5,q6,q7} - vstm DST, {Z0,Z1,Z2,Z3} +IF_BE(` vst1.8 {Z2,Z3}, [DST]') +IF_LE(` vstm DST, {Z0,Z1,Z2,Z3}') bx lr EPILOGUE(_nettle_chacha_3core) PROLOGUE(_nettle_chacha_3core32) - vldm SRC, {X0,X1,X2,X3} + add SRCp32, SRC, #32 + vld1.32 {X0,X1}, [SRC] + vld1.32 {X2,X3}, [SRCp32] vpush {q4,q5,q6,q7} adr 
r12, .Lcount1 - vld1.64 {Z3}, [r12] + vld1.32 {Z3}, [r12] vadd.i32 Y3, X3, Z3 C Increment 32-bit counter vadd.i32 Z3, Y3, Z3 diff --git a/arm/neon/chacha-core-internal.asm b/arm/neon/chacha-core-internal.asm index b0a775bd..5095be6a 100644 --- a/arm/neon/chacha-core-internal.asm +++ b/arm/neon/chacha-core-internal.asm @@ -83,7 +83,9 @@ define(`QROUND', ` C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds) PROLOGUE(_nettle_chacha_core) - vldm SRC, {X0,X1,X2,X3} + C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words + vld1.32 {X0,X1}, [SRC]! C SRC changed! + vld1.32 {X2,X3}, [SRC] vmov S0, X0 vmov S1, X1 @@ -96,15 +98,6 @@ PROLOGUE(_nettle_chacha_core) C 8 9 10 11 X2 C 12 13 14 15 X3 - C Input rows big-endian: - C 1 0 3 2 X0 - C 5 4 7 6 X1 - C 9 8 11 10 X2 - C 13 12 15 14 X3 - C even and odd columns switched because - C vldm loads consecutive doublewords and - C switches words inside them to make them BE - .Loop: QROUND(X0, X1, X2, X3) @@ -113,44 +106,32 @@ PROLOGUE(_nettle_chacha_core) C 5 6 7 4 >>> 3 C 10 11 8 9 >>> 2 C 15 12 13 14 >>> 1 - - C In big-endian rotate rows, to get - C 1 0 3 2 - C 6 5 4 7 >>> 1 - C 11 10 9 8 >>> 2 - C 12 15 14 13 >>> 3 - C different number of elements needs to be - C extracted on BE because of different column order -IF_LE(` vext.32 X1, X1, X1, #1') -IF_BE(` vext.32 X1, X1, X1, #3') + vext.32 X1, X1, X1, #1 vext.32 X2, X2, X2, #2 -IF_LE(` vext.32 X3, X3, X3, #3') -IF_BE(` vext.32 X3, X3, X3, #1') + vext.32 X3, X3, X3, #3 QROUND(X0, X1, X2, X3) subs ROUNDS, ROUNDS, #2 C Inverse rotation -IF_LE(` vext.32 X1, X1, X1, #3') -IF_BE(` vext.32 X1, X1, X1, #1') + vext.32 X1, X1, X1, #3 vext.32 X2, X2, X2, #2 -IF_LE(` vext.32 X3, X3, X3, #1') -IF_BE(` vext.32 X3, X3, X3, #3') + vext.32 X3, X3, X3, #1 bhi .Loop vadd.u32 X0, X0, S0 vadd.u32 X1, X1, S1 + + C vst1.8 because caller expects results little-endian + C use vstm when little-endian for some additional speedup +IF_BE(` vst1.8 {X0,X1}, [DST]!') + 
vadd.u32 X2, X2, S2 vadd.u32 X3, X3, S3 - C caller expects result little-endian -IF_BE(` vrev32.u8 X0, X0 - vrev32.u8 X1, X1 - vrev32.u8 X2, X2 - vrev32.u8 X3, X3') - - vstm DST, {X0,X1,X2,X3} +IF_BE(` vst1.8 {X2,X3}, [DST]') +IF_LE(` vstm DST, {X0,X1,X2,X3}') bx lr EPILOGUE(_nettle_chacha_core) diff --git a/arm/neon/salsa20-2core.asm b/arm/neon/salsa20-2core.asm index b3fe7e94..4d9da79b 100644 --- a/arm/neon/salsa20-2core.asm +++ b/arm/neon/salsa20-2core.asm @@ -36,6 +36,7 @@ ifelse(` define(`DST', `r0') define(`SRC', `r1') define(`ROUNDS', `r2') +define(`SRCp32', `r3') C State, even elements in X, odd elements in Y define(`X0', `q0') @@ -58,11 +59,14 @@ define(`T3', `q15') C _salsa20_2core(uint32_t *dst, const uint32_t *src, unsigned rounds) PROLOGUE(_nettle_salsa20_2core) - vldm SRC, {X0,X1,X2,X3} + C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words + add SRCp32, SRC, #32 + vld1.32 {X0,X1}, [SRC] + vld1.32 {X2,X3}, [SRCp32] adr r12, .Lcount1 vmov Y3, X0 - vld1.64 {Y1}, [r12] + vld1.32 {Y1}, [r12] vmov Y0, X1 vadd.i64 Y1, Y1, X2 C Increment counter vmov Y2, X3 @@ -180,7 +184,8 @@ C Inverse swaps and transpositions vswp D1REG(Y0), D1REG(Y2) vswp D1REG(Y1), D1REG(Y3) - vldm SRC, {T0,T1,T2,T3} + vld1.32 {T0,T1}, [SRC] + vld1.32 {T2,T3}, [SRCp32] vtrn.32 X0, Y3 vtrn.32 X1, Y0 @@ -190,17 +195,26 @@ C Inverse swaps and transpositions C Add in the original context vadd.i32 X0, X0, T0 vadd.i32 X1, X1, T1 + +C vst1.8 because caller expects results little-endian +C interleave loads, calculations and stores to save cycles on stores +C use vstm when little-endian for some additional speedup +IF_BE(` vst1.8 {X0,X1}, [DST]!') + vadd.i32 X2, X2, T2 vadd.i32 X3, X3, T3 +IF_BE(` vst1.8 {X2,X3}, [DST]!') +IF_LE(` vstmia DST!, {X0,X1,X2,X3}') - vstmia DST!, {X0,X1,X2,X3} - vld1.64 {X0}, [r12] + vld1.32 {X0}, [r12] vadd.i32 T0, T0, Y3 vadd.i64 T2, T2, X0 vadd.i32 T1, T1, Y0 +IF_BE(` vst1.8 {T0,T1}, [DST]!') + vadd.i32 T2, T2, Y1 vadd.i32 T3, T3, Y2 - - vstm 
DST, {T0,T1,T2,T3} +IF_BE(` vst1.8 {T2,T3}, [DST]') +IF_LE(` vstm DST, {T0,T1,T2,T3}') bx lr EPILOGUE(_nettle_salsa20_2core) diff --git a/arm/neon/salsa20-core-internal.asm b/arm/neon/salsa20-core-internal.asm index d59d7b80..c5785da4 100644 --- a/arm/neon/salsa20-core-internal.asm +++ b/arm/neon/salsa20-core-internal.asm @@ -36,6 +36,7 @@ ifelse(` define(`DST', `r0') define(`SRC', `r1') define(`ROUNDS', `r2') +define(`SRCp32', `r3') define(`X0', `q0') define(`X1', `q1') @@ -86,7 +87,10 @@ define(`QROUND', ` C _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds) PROLOGUE(_nettle_salsa20_core) - vldm SRC, {X0,X1,X2,X3} + C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words + add SRCp32, SRC, #32 + vld1.32 {X0,X1}, [SRC] + vld1.32 {X2,X3}, [SRCp32] C Input rows little-endian: C 0 1 2 3 X0 @@ -99,23 +103,10 @@ PROLOGUE(_nettle_salsa20_core) C 8 13 2 7 C 12 1 6 11 - C Input rows big-endian: - C 1 0 3 2 X0 - C 5 4 7 6 X1 - C 9 8 11 10 X2 - C 13 12 15 14 X3 - C even and odd columns switched because - C vldm loads consecutive doublewords and - C switches words inside them to make them BE - C Permuted to: - C 5 0 15 10 - C 9 4 3 14 - C 13 8 7 2 - C 1 12 11 6 - C FIXME: Construct in some other way? adr r12, .Lmasks - vldm r12, {M0101, M0110, M0011} + vld1.32 {M0101, M0110}, [r12]! 
+ vld1.32 {M0011}, [r12] vmov S1, X1 vmov S2, X2 @@ -160,29 +151,17 @@ PROLOGUE(_nettle_salsa20_core) C 3 4 9 14 >>> 1 C 2 7 8 13 >>> 2 C 1 6 11 12 >>> 3 - - C In big-endian rotate rows, to get - C 5 0 15 10 - C 4 3 14 9 >>> 3 - C 7 2 13 8 >>> 2 - C 6 1 12 11 >>> 1 - C different number of elements needs to be - C extracted on BE because of different column order -IF_LE(` vext.32 X1, X1, X1, #3') -IF_BE(` vext.32 X1, X1, X1, #1') + vext.32 X1, X1, X1, #3 vext.32 X2, X2, X2, #2 -IF_LE(` vext.32 X3, X3, X3, #1') -IF_BE(` vext.32 X3, X3, X3, #3') + vext.32 X3, X3, X3, #1 QROUND(X0, X3, X2, X1) subs ROUNDS, ROUNDS, #2 C Inverse rotation -IF_LE(` vext.32 X1, X1, X1, #1') -IF_BE(` vext.32 X1, X1, X1, #3') + vext.32 X1, X1, X1, #1 vext.32 X2, X2, X2, #2 -IF_LE(` vext.32 X3, X3, X3, #3') -IF_BE(` vext.32 X3, X3, X3, #1') + vext.32 X3, X3, X3, #3 bhi .Loop @@ -202,19 +181,19 @@ IF_BE(` vext.32 X3, X3, X3, #1') vbit X2, X3, M0101 vbit X3, T1, M0101 - vld1.64 {T0}, [SRC] + vld1.32 {T0}, [SRC] vadd.u32 X0, X0, T0 vadd.u32 X1, X1, S1 + + C vst1.8 because caller expects results little-endian + C use vstm when little-endian for some additional speedup +IF_BE(` vst1.8 {X0,X1}, [DST]!') + vadd.u32 X2, X2, S2 vadd.u32 X3, X3, S3 - C caller expects result little-endian -IF_BE(` vrev32.u8 X0, X0 - vrev32.u8 X1, X1 - vrev32.u8 X2, X2 - vrev32.u8 X3, X3') - - vstm DST, {X0,X1,X2,X3} +IF_BE(` vst1.8 {X2,X3}, [DST]') +IF_LE(` vstm DST, {X0,X1,X2,X3}') bx lr EPILOGUE(_nettle_salsa20_core) |