diff options
author | Niels Möller <nisse@lysator.liu.se> | 2021-01-25 20:53:27 +0100 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2021-01-25 20:53:27 +0100 |
commit | d02f776eb2541cb0b80f53ace9292092586557e2 (patch) | |
tree | d63e81badba01352fd63991ef5f945938aa76dd5 | |
parent | ddf99ac4c5de3710c7bfac81eac60d55e59a35c2 (diff) | |
download | nettle-d02f776eb2541cb0b80f53ace9292092586557e2.tar.gz |
Delete the ARM Neon code doing a single block salsa20.
-rw-r--r-- | ChangeLog | 13 | ||||
-rw-r--r-- | arm/fat/salsa20-core-internal-2.asm | 37 | ||||
-rw-r--r-- | arm/neon/salsa20-core-internal.asm | 206 | ||||
-rw-r--r-- | fat-arm.c | 10 |
4 files changed, 13 insertions, 253 deletions
@@ -1,3 +1,16 @@ +2021-01-25 Niels Möller <nisse@lysator.liu.se> + + * arm/neon/salsa20-core-internal.asm: Deleted file. This ARM Neon + implementation reportedly gave a speedup of 45% on Cortex A9, + compared to the C implementation, when it was added back in 2013. + That appears to no longer be the case with more recent processors + and compilers. And it's even significantly slower than the C + implementation on some platforms, including the Raspberry Pi 4. + With the introduction of salsa20-2core.asm, performance of this + function is also less important. + * arm/fat/salsa20-core-internal-2.asm: Deleted file. + * fat-arm.c: Delete fat setup for _nettle_salsa20_core. + 2021-01-20 Niels Möller <nisse@lysator.liu.se> * ecc-ecdsa-verify.c (ecc_ecdsa_verify): Fix corner case with diff --git a/arm/fat/salsa20-core-internal-2.asm b/arm/fat/salsa20-core-internal-2.asm deleted file mode 100644 index f88afd86..00000000 --- a/arm/fat/salsa20-core-internal-2.asm +++ /dev/null @@ -1,37 +0,0 @@ -C arm/fat/salsa20-core-internal-2.asm - - -ifelse(` - Copyright (C) 2015 Niels Möller - - This file is part of GNU Nettle. - - GNU Nettle is free software: you can redistribute it and/or - modify it under the terms of either: - - * the GNU Lesser General Public License as published by the Free - Software Foundation; either version 3 of the License, or (at your - option) any later version. - - or - - * the GNU General Public License as published by the Free - Software Foundation; either version 2 of the License, or (at your - option) any later version. - - or both in parallel, as here. - - GNU Nettle is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received copies of the GNU General Public License and - the GNU Lesser General Public License along with this program. If - not, see http://www.gnu.org/licenses/. -') - -dnl PROLOGUE(_nettle_salsa20_core) picked up by configure - -define(`fat_transform', `$1_neon') -include_src(`arm/neon/salsa20-core-internal.asm') diff --git a/arm/neon/salsa20-core-internal.asm b/arm/neon/salsa20-core-internal.asm deleted file mode 100644 index c5785da4..00000000 --- a/arm/neon/salsa20-core-internal.asm +++ /dev/null @@ -1,206 +0,0 @@ -C arm/neon/salsa20-core-internal.asm - -ifelse(` - Copyright (C) 2013 Niels Möller - - This file is part of GNU Nettle. - - GNU Nettle is free software: you can redistribute it and/or - modify it under the terms of either: - - * the GNU Lesser General Public License as published by the Free - Software Foundation; either version 3 of the License, or (at your - option) any later version. - - or - - * the GNU General Public License as published by the Free - Software Foundation; either version 2 of the License, or (at your - option) any later version. - - or both in parallel, as here. - - GNU Nettle is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received copies of the GNU General Public License and - the GNU Lesser General Public License along with this program. If - not, see http://www.gnu.org/licenses/. -') - - .file "salsa20-core-internal.asm" - .fpu neon - -define(`DST', `r0') -define(`SRC', `r1') -define(`ROUNDS', `r2') -define(`SRCp32', `r3') - -define(`X0', `q0') -define(`X1', `q1') -define(`X2', `q2') -define(`X3', `q3') -define(`T0', `q8') -define(`T1', `q9') -define(`M0101', `q10') -define(`M0110', `q11') -define(`M0011', `q12') -define(`S1', `q13') -define(`S2', `q14') -define(`S3', `q15') - -define(`QROUND', ` - vadd.i32 T0, $1, $4 - vshl.i32 T1, T0, #7 - vshr.u32 T0, T0, #25 - veor $2, $2, T0 - veor $2, $2, T1 - - vadd.i32 T0, $1, $2 - vshl.i32 T1, T0, #9 - vshr.u32 T0, T0, #23 - veor $3, $3, T0 - veor $3, $3, T1 - - vadd.i32 T0, $2, $3 - vshl.i32 T1, T0, #13 - vshr.u32 T0, T0, #19 - veor $4, $4, T0 - veor $4, $4, T1 - - vadd.i32 T0, $3, $4 - vshl.i32 T1, T0, #18 - vshr.u32 T0, T0, #14 - veor $1, $1, T0 - veor $1, $1, T1 -') - - .text - .align 4 -.Lmasks: - .int 0,-1, 0,-1 - .int 0,-1,-1, 0 - .int 0, 0,-1,-1 - - C _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds) - -PROLOGUE(_nettle_salsa20_core) - C loads using vld1.32 to be endianness-neutral wrt consecutive 32-bit words - add SRCp32, SRC, #32 - vld1.32 {X0,X1}, [SRC] - vld1.32 {X2,X3}, [SRCp32] - - C Input rows little-endian: - C 0 1 2 3 X0 - C 4 5 6 7 X1 - C 8 9 10 11 X2 - C 12 13 14 15 X3 - C Permuted to: - C 0 5 10 15 - C 4 9 14 3 - C 8 13 2 7 - C 12 1 6 11 - - C FIXME: Construct in some other way? - adr r12, .Lmasks - vld1.32 {M0101, M0110}, [r12]! - vld1.32 {M0011}, [r12] - - vmov S1, X1 - vmov S2, X2 - vmov S3, X3 - - C Swaps in columns 1, 3: - C 0 5 2 7 X0 ^ - C 4 1 6 3 T0 v - C 8 13 10 15 T1 ^ - C 12 9 14 11 X3 v - C same in big endian just with transposed rows - vmov T0, X1 - vmov T1, X2 - vbit T0, X0, M0101 - vbit X0, X1, M0101 - vbit T1, X3, M0101 - vbit X3, X2, M0101 - - C Swaps in column 1, 2: - C 0 5 2 7 X0 - C 4 9 14 3 X1 ^ - C 8 13 10 15 T1 | - C 12 1 6 11 X3 v - vmov X1, T0 - vbit X1, X3, M0110 - vbit X3, T0, M0110 - - C Swaps in columm 2,3: - C 0 5 10 15 X0 ^ - C 4 9 14 3 X1 | - C 8 13 2 7 X2 v - C 12 1 6 11 X3 - vmov X2, T1 - vbit X2, X0, M0011 - vbit X0, T1, M0011 - -.Loop: - QROUND(X0, X1, X2, X3) - - C In little-endian rotate rows, to get - C 0 5 10 15 - C 3 4 9 14 >>> 1 - C 2 7 8 13 >>> 2 - C 1 6 11 12 >>> 3 - vext.32 X1, X1, X1, #3 - vext.32 X2, X2, X2, #2 - vext.32 X3, X3, X3, #1 - - QROUND(X0, X3, X2, X1) - - subs ROUNDS, ROUNDS, #2 - C Inverse rotation - vext.32 X1, X1, X1, #1 - vext.32 X2, X2, X2, #2 - vext.32 X3, X3, X3, #3 - - bhi .Loop - - C Inverse swaps - vmov T1, X2 - vbit T1, X0, M0011 - vbit X0, X2, M0011 - - vmov T0, X1 - vbit T0, X3, M0110 - vbit X3, X1, M0110 - - vmov X1, T0 - vmov X2, T1 - vbit X1, X0, M0101 - vbit X0, T0, M0101 - vbit X2, X3, M0101 - vbit X3, T1, M0101 - - vld1.32 {T0}, [SRC] - vadd.u32 X0, X0, T0 - vadd.u32 X1, X1, S1 - - C vst1.8 because caller expects results little-endian - C use vstm when little-endian for some additional speedup -IF_BE(` vst1.8 {X0,X1}, [DST]!') - - vadd.u32 X2, X2, S2 - vadd.u32 X3, X3, S3 - -IF_BE(` vst1.8 {X2,X3}, [DST]') -IF_LE(` vstm DST, {X0,X1,X2,X3}') - bx lr -EPILOGUE(_nettle_salsa20_core) - -divert(-1) -define salsastate -p/x $q0.u32 -p/x $q1.u32 -p/x $q2.u32 -p/x $q3.u32 -end @@ -145,10 +145,6 @@ DECLARE_FAT_FUNC(_nettle_aes_decrypt, aes_crypt_internal_func) DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, arm) DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, armv6) -DECLARE_FAT_FUNC(_nettle_salsa20_core, salsa20_core_func) -DECLARE_FAT_FUNC_VAR(salsa20_core, salsa20_core_func, c) -DECLARE_FAT_FUNC_VAR(salsa20_core, salsa20_core_func, neon) - DECLARE_FAT_FUNC(_nettle_salsa20_crypt, salsa20_crypt_func) DECLARE_FAT_FUNC_VAR(salsa20_crypt, salsa20_crypt_func, 1core) DECLARE_FAT_FUNC_VAR(salsa20_crypt, salsa20_crypt_func, 2core) @@ -225,7 +221,6 @@ fat_init (void) { if (verbose) fprintf (stderr, "libnettle: enabling neon code.\n"); - _nettle_salsa20_core_vec = _nettle_salsa20_core_neon; _nettle_salsa20_crypt_vec = _nettle_salsa20_crypt_2core; _nettle_sha512_compress_vec = _nettle_sha512_compress_neon; nettle_sha3_permute_vec = _nettle_sha3_permute_neon; @@ -239,7 +234,6 @@ fat_init (void) { if (verbose) fprintf (stderr, "libnettle: not enabling neon code.\n"); - _nettle_salsa20_core_vec = _nettle_salsa20_core_c; _nettle_salsa20_crypt_vec = _nettle_salsa20_crypt_1core; _nettle_sha512_compress_vec = _nettle_sha512_compress_c; nettle_sha3_permute_vec = _nettle_sha3_permute_c; @@ -265,10 +259,6 @@ DEFINE_FAT_FUNC(_nettle_aes_decrypt, void, const uint8_t *src), (rounds, keys, T, length, dst, src)) -DEFINE_FAT_FUNC(_nettle_salsa20_core, void, - (uint32_t *dst, const uint32_t *src, unsigned rounds), - (dst, src, rounds)) - DEFINE_FAT_FUNC(_nettle_salsa20_crypt, void, (struct salsa20_ctx *ctx, unsigned rounds, size_t length, uint8_t *dst, |