summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Niels Möller <nisse@lysator.liu.se> 2021-01-25 20:53:27 +0100
committer Niels Möller <nisse@lysator.liu.se> 2021-01-25 20:53:27 +0100
commit d02f776eb2541cb0b80f53ace9292092586557e2 (patch)
tree d63e81badba01352fd63991ef5f945938aa76dd5
parent ddf99ac4c5de3710c7bfac81eac60d55e59a35c2 (diff)
download nettle-d02f776eb2541cb0b80f53ace9292092586557e2.tar.gz
Delete the ARM Neon code doing a single block salsa20.
-rw-r--r-- ChangeLog 13
-rw-r--r-- arm/fat/salsa20-core-internal-2.asm 37
-rw-r--r-- arm/neon/salsa20-core-internal.asm 206
-rw-r--r-- fat-arm.c 10
4 files changed, 13 insertions, 253 deletions
diff --git a/ChangeLog b/ChangeLog
index e11c5c80..6afcdd44 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,16 @@
+2021-01-25 Niels Möller <nisse@lysator.liu.se>
+
+ * arm/neon/salsa20-core-internal.asm: Deleted file. This ARM Neon
+ implementation reportedly gave a speedup of 45% on Cortex A9,
+ compared to the C implementation, when it was added back in 2013.
+ That appears to no longer be the case with more recent processors
+ and compilers. And it's even significantly slower than the C
+ implementation on some platforms, including the Raspberry Pi 4.
+ With the introduction of salsa20-2core.asm, performance of this
+ function is also less important.
+ * arm/fat/salsa20-core-internal-2.asm: Deleted file.
+ * fat-arm.c: Delete fat setup for _nettle_salsa20_core.
+
2021-01-20 Niels Möller <nisse@lysator.liu.se>
* ecc-ecdsa-verify.c (ecc_ecdsa_verify): Fix corner case with
diff --git a/arm/fat/salsa20-core-internal-2.asm b/arm/fat/salsa20-core-internal-2.asm
deleted file mode 100644
index f88afd86..00000000
--- a/arm/fat/salsa20-core-internal-2.asm
+++ /dev/null
@@ -1,37 +0,0 @@
-C arm/fat/salsa20-core-internal-2.asm
-
-
-ifelse(`
- Copyright (C) 2015 Niels Möller
-
- This file is part of GNU Nettle.
-
- GNU Nettle is free software: you can redistribute it and/or
- modify it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
- or
-
- * the GNU General Public License as published by the Free
- Software Foundation; either version 2 of the License, or (at your
- option) any later version.
-
- or both in parallel, as here.
-
- GNU Nettle is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received copies of the GNU General Public License and
- the GNU Lesser General Public License along with this program. If
- not, see http://www.gnu.org/licenses/.
-')
-
-dnl PROLOGUE(_nettle_salsa20_core) picked up by configure
-
-define(`fat_transform', `$1_neon')
-include_src(`arm/neon/salsa20-core-internal.asm')
diff --git a/arm/neon/salsa20-core-internal.asm b/arm/neon/salsa20-core-internal.asm
deleted file mode 100644
index c5785da4..00000000
--- a/arm/neon/salsa20-core-internal.asm
+++ /dev/null
@@ -1,206 +0,0 @@
-C arm/neon/salsa20-core-internal.asm
-
-ifelse(`
- Copyright (C) 2013 Niels Möller
-
- This file is part of GNU Nettle.
-
- GNU Nettle is free software: you can redistribute it and/or
- modify it under the terms of either:
-
- * the GNU Lesser General Public License as published by the Free
- Software Foundation; either version 3 of the License, or (at your
- option) any later version.
-
- or
-
- * the GNU General Public License as published by the Free
- Software Foundation; either version 2 of the License, or (at your
- option) any later version.
-
- or both in parallel, as here.
-
- GNU Nettle is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received copies of the GNU General Public License and
- the GNU Lesser General Public License along with this program. If
- not, see http://www.gnu.org/licenses/.
-')
-
- .file "salsa20-core-internal.asm"
- .fpu neon
-
-define(`DST', `r0')
-define(`SRC', `r1')
-define(`ROUNDS', `r2')
-define(`SRCp32', `r3')
-
-define(`X0', `q0')
-define(`X1', `q1')
-define(`X2', `q2')
-define(`X3', `q3')
-define(`T0', `q8')
-define(`T1', `q9')
-define(`M0101', `q10')
-define(`M0110', `q11')
-define(`M0011', `q12')
-define(`S1', `q13')
-define(`S2', `q14')
-define(`S3', `q15')
-
-define(`QROUND', `
- vadd.i32 T0, $1, $4
- vshl.i32 T1, T0, #7
- vshr.u32 T0, T0, #25
- veor $2, $2, T0
- veor $2, $2, T1
-
- vadd.i32 T0, $1, $2
- vshl.i32 T1, T0, #9
- vshr.u32 T0, T0, #23
- veor $3, $3, T0
- veor $3, $3, T1
-
- vadd.i32 T0, $2, $3
- vshl.i32 T1, T0, #13
- vshr.u32 T0, T0, #19
- veor $4, $4, T0
- veor $4, $4, T1
-
- vadd.i32 T0, $3, $4
- vshl.i32 T1, T0, #18
- vshr.u32 T0, T0, #14
- veor $1, $1, T0
- veor $1, $1, T1
-')
-
- .text
- .align 4
-.Lmasks:
- .int 0,-1, 0,-1
- .int 0,-1,-1, 0
- .int 0, 0,-1,-1
-
- C _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
-
-PROLOGUE(_nettle_salsa20_core)
- C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words
- add SRCp32, SRC, #32
- vld1.32 {X0,X1}, [SRC]
- vld1.32 {X2,X3}, [SRCp32]
-
- C Input rows little-endian:
- C 0 1 2 3 X0
- C 4 5 6 7 X1
- C 8 9 10 11 X2
- C 12 13 14 15 X3
- C Permuted to:
- C 0 5 10 15
- C 4 9 14 3
- C 8 13 2 7
- C 12 1 6 11
-
- C FIXME: Construct in some other way?
- adr r12, .Lmasks
- vld1.32 {M0101, M0110}, [r12]!
- vld1.32 {M0011}, [r12]
-
- vmov S1, X1
- vmov S2, X2
- vmov S3, X3
-
- C Swaps in columns 1, 3:
- C 0 5 2 7 X0 ^
- C 4 1 6 3 T0 v
- C 8 13 10 15 T1 ^
- C 12 9 14 11 X3 v
- C same in big endian just with transposed rows
- vmov T0, X1
- vmov T1, X2
- vbit T0, X0, M0101
- vbit X0, X1, M0101
- vbit T1, X3, M0101
- vbit X3, X2, M0101
-
- C Swaps in column 1, 2:
- C 0 5 2 7 X0
- C 4 9 14 3 X1 ^
- C 8 13 10 15 T1 |
- C 12 1 6 11 X3 v
- vmov X1, T0
- vbit X1, X3, M0110
- vbit X3, T0, M0110
-
- C Swaps in columm 2,3:
- C 0 5 10 15 X0 ^
- C 4 9 14 3 X1 |
- C 8 13 2 7 X2 v
- C 12 1 6 11 X3
- vmov X2, T1
- vbit X2, X0, M0011
- vbit X0, T1, M0011
-
-.Loop:
- QROUND(X0, X1, X2, X3)
-
- C In little-endian rotate rows, to get
- C 0 5 10 15
- C 3 4 9 14 >>> 1
- C 2 7 8 13 >>> 2
- C 1 6 11 12 >>> 3
- vext.32 X1, X1, X1, #3
- vext.32 X2, X2, X2, #2
- vext.32 X3, X3, X3, #1
-
- QROUND(X0, X3, X2, X1)
-
- subs ROUNDS, ROUNDS, #2
- C Inverse rotation
- vext.32 X1, X1, X1, #1
- vext.32 X2, X2, X2, #2
- vext.32 X3, X3, X3, #3
-
- bhi .Loop
-
- C Inverse swaps
- vmov T1, X2
- vbit T1, X0, M0011
- vbit X0, X2, M0011
-
- vmov T0, X1
- vbit T0, X3, M0110
- vbit X3, X1, M0110
-
- vmov X1, T0
- vmov X2, T1
- vbit X1, X0, M0101
- vbit X0, T0, M0101
- vbit X2, X3, M0101
- vbit X3, T1, M0101
-
- vld1.32 {T0}, [SRC]
- vadd.u32 X0, X0, T0
- vadd.u32 X1, X1, S1
-
- C vst1.8 because caller expects results little-endian
- C use vstm when little-endian for some additional speedup
-IF_BE(` vst1.8 {X0,X1}, [DST]!')
-
- vadd.u32 X2, X2, S2
- vadd.u32 X3, X3, S3
-
-IF_BE(` vst1.8 {X2,X3}, [DST]')
-IF_LE(` vstm DST, {X0,X1,X2,X3}')
- bx lr
-EPILOGUE(_nettle_salsa20_core)
-
-divert(-1)
-define salsastate
-p/x $q0.u32
-p/x $q1.u32
-p/x $q2.u32
-p/x $q3.u32
-end
diff --git a/fat-arm.c b/fat-arm.c
index edc7de1c..5812cf74 100644
--- a/fat-arm.c
+++ b/fat-arm.c
@@ -145,10 +145,6 @@ DECLARE_FAT_FUNC(_nettle_aes_decrypt, aes_crypt_internal_func)
DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, arm)
DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, armv6)
-DECLARE_FAT_FUNC(_nettle_salsa20_core, salsa20_core_func)
-DECLARE_FAT_FUNC_VAR(salsa20_core, salsa20_core_func, c)
-DECLARE_FAT_FUNC_VAR(salsa20_core, salsa20_core_func, neon)
-
DECLARE_FAT_FUNC(_nettle_salsa20_crypt, salsa20_crypt_func)
DECLARE_FAT_FUNC_VAR(salsa20_crypt, salsa20_crypt_func, 1core)
DECLARE_FAT_FUNC_VAR(salsa20_crypt, salsa20_crypt_func, 2core)
@@ -225,7 +221,6 @@ fat_init (void)
{
if (verbose)
fprintf (stderr, "libnettle: enabling neon code.\n");
- _nettle_salsa20_core_vec = _nettle_salsa20_core_neon;
_nettle_salsa20_crypt_vec = _nettle_salsa20_crypt_2core;
_nettle_sha512_compress_vec = _nettle_sha512_compress_neon;
nettle_sha3_permute_vec = _nettle_sha3_permute_neon;
@@ -239,7 +234,6 @@ fat_init (void)
{
if (verbose)
fprintf (stderr, "libnettle: not enabling neon code.\n");
- _nettle_salsa20_core_vec = _nettle_salsa20_core_c;
_nettle_salsa20_crypt_vec = _nettle_salsa20_crypt_1core;
_nettle_sha512_compress_vec = _nettle_sha512_compress_c;
nettle_sha3_permute_vec = _nettle_sha3_permute_c;
@@ -265,10 +259,6 @@ DEFINE_FAT_FUNC(_nettle_aes_decrypt, void,
const uint8_t *src),
(rounds, keys, T, length, dst, src))
-DEFINE_FAT_FUNC(_nettle_salsa20_core, void,
- (uint32_t *dst, const uint32_t *src, unsigned rounds),
- (dst, src, rounds))
-
DEFINE_FAT_FUNC(_nettle_salsa20_crypt, void,
(struct salsa20_ctx *ctx, unsigned rounds,
size_t length, uint8_t *dst,