author    Niels Möller <nisse@lysator.liu.se>  2021-03-04 09:41:17 +0100
committer Niels Möller <nisse@lysator.liu.se>  2021-03-04 09:41:17 +0100
commit    fe7ae87d1b837e82f7c7968b068bca7d853a4cec (patch)
tree      e13584719cf48a88c5c687c1aeb13b1a97528872
parent    c9d9c66b8ed111ab9ebd39440ec3f8d8d91734bd (diff)
parent    a471ae85f768b2b496b2e2e4272ba76fa74d5785 (diff)
Merge branch 'arm64'
-rw-r--r--  ChangeLog                  |  23
-rw-r--r--  Makefile.in                |   1
-rw-r--r--  arm64/README               |  45
-rw-r--r--  arm64/crypto/gcm-hash.asm  | 338
-rw-r--r--  arm64/machine.m4           |   0
-rw-r--r--  configure.ac               |  29
6 files changed, 436 insertions, 0 deletions
diff --git a/ChangeLog b/ChangeLog
index fd138d82..71cf6c96 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,26 @@
+2021-03-04 Niels Möller <nisse@lysator.liu.se>
+
+ Merged initial arm64 code.
+
+2021-02-03 Niels Möller <nisse@lysator.liu.se>
+
+ * arm64/crypto/gcm-hash.asm: Renamed directory, moved file,...
+ * arm64/v8/gcm-hash.asm: ... old name.
+
+2021-02-02 Niels Möller <nisse@lysator.liu.se>
+
+ * arm64/v8/gcm-hash.asm: Add ".arch armv8-a+crypto" directive.
+ Supported by both GNU as and clang (the latter at least from
+ version 3.9.1).
+ * configure.ac: Don't add -march=armv8-a+crypto to CFLAGS.
+
+2021-01-31 Niels Möller <nisse@lysator.liu.se>
+
+ * arm64/v8/gcm-hash.asm: New file, contributed by Maamoun TK and
+ Michael Weiser.
+ * arm64/README: New file. Document endianness issues, contributed
+ by Michael Weiser.
+
2021-02-17 Niels Möller <nisse@lysator.liu.se>
* Released Nettle-3.7.1.
diff --git a/Makefile.in b/Makefile.in
index db02f5c0..2274d8be 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -616,6 +616,7 @@ distdir: $(DISTFILES)
set -e; for d in sparc32 sparc64 x86 \
x86_64 x86_64/aesni x86_64/sha_ni x86_64/fat \
arm arm/neon arm/v6 arm/fat \
+ arm64 arm64/crypto \
powerpc64 powerpc64/p7 powerpc64/p8 powerpc64/fat ; do \
mkdir "$(distdir)/$$d" ; \
find "$(srcdir)/$$d" -maxdepth 1 '(' -name '*.asm' -o -name '*.m4' -o -name README ')' \
diff --git a/arm64/README b/arm64/README
new file mode 100644
index 00000000..139a3cc1
--- /dev/null
+++ b/arm64/README
@@ -0,0 +1,45 @@
+Endianness
+
+Similar to arm, aarch64 can run with little-endian or big-endian memory
+accesses. Endianness is handled exclusively on load and store operations.
+Register layout and operation behaviour are identical in both modes.
+
+When writing SIMD code, endianness interaction with vector loads and stores may
+exhibit seemingly unintuitive behaviour, particularly when mixing normal and
+vector load/store operations.
+
+See https://llvm.org/docs/BigEndianNEON.html for a good overview, particularly
+of the pitfalls of using ldr/str vs. ld1/st1.
+
+For example, ld1 {v1.2d,v2.2d},[x0] will load v1 and v2 with doubleword
+vector elements from consecutive memory locations: v1.d[0] is read from
+x0+0, v1.d[1] from x0+8 (bytes), v2.d[0] from x0+16 and v2.d[1] from x0+24.
+This is the same in LE and BE mode because the structure of the vector is
+prescribed by the load operation. Endianness is applied to the individual
+doublewords, but the order in which they are loaded from memory and placed
+into d[0] and d[1] does not change.
+
+Another way is to explicitly load a vector of bytes using ld1 {v1.16b,
+v2.16b},[x0]. This will load x0+0 into v1.b[0], x0+1 into v1.b[1] and so
+forth. This load (or store) is endianness-neutral and behaves identically in
+LE and BE mode.
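+
+As an illustrative sketch only (not part of this port), the same semantics
+can be observed through the arm_neon.h intrinsics that map onto ld1:
+
+  #include <arm_neon.h>
+  #include <stdint.h>
+  #include <stdio.h>
+
+  int main(void)
+  {
+    uint8_t buf[16];
+    unsigned i;
+    for (i = 0; i < 16; i++)
+      buf[i] = i;
+
+    /* vld1q_u64 ~ ld1 {v.2d}: lane order is fixed, but each doubleword
+       is assembled according to memory endianness. Prints
+       0706050403020100 on LE, 0001020304050607 on BE. */
+    uint64x2_t d = vld1q_u64((const uint64_t *) buf);
+    printf("d[0] = %016llx\n", (unsigned long long) vgetq_lane_u64(d, 0));
+
+    /* vld1q_u8 ~ ld1 {v.16b}: endianness-neutral, b[0] always receives
+       buf[0]. Prints 00 in both modes. */
+    uint8x16_t b = vld1q_u8(buf);
+    printf("b[0] = %02x\n", vgetq_lane_u8(b, 0));
+    return 0;
+  }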
+
+Care must however be taken when switching views onto the registers: d[0] is
+mapped onto b[0] through b[7], with b[0] the least significant byte of d[0]
+and b[7] the most significant. This layout is also the same in both memory
+endianness modes. ld1 {v1.8b}, however, will always load a vector of bytes
+with eight elements as consecutive bytes from memory into b[0] through b[7].
+When accessed through d[0] this will only appear as the expected
+doubleword-sized number if it was indeed stored little-endian in memory.
+Something similar happens when loading a vector of doublewords (ld1
+{v1.2d},[x0]) and then accessing its individual bytes: they will only be at
+the expected indices if the doublewords were indeed stored in the current
+memory endianness. It is therefore most intuitive to use the vector element
+width that matches the data being loaded or stored, so that the necessary
+endianness correction is applied.
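+
+Continuing the sketch above, the fixed b[0]-is-LSB register mapping can be
+seen by reinterpreting the byte vector as doublewords (again illustrative):
+
+  /* Prints 0706050403020100 in both LE and BE mode, because the
+     b[0]..b[7] -> LSB..MSB register mapping does not depend on the
+     memory endianness mode. */
+  uint64x2_t r = vreinterpretq_u64_u8(b);
+  printf("r[0] = %016llx\n", (unsigned long long) vgetq_lane_u64(r, 0));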
+
+Finally, ldr/str are not vector operations. When used to load a 128-bit
+quadword, they apply endianness to the whole quadword. Particular care must
+therefore be taken if the loaded data is then to be regarded as elements of,
+e.g., a doubleword vector: indices may appear reversed on big-endian systems
+(because they are).
diff --git a/arm64/crypto/gcm-hash.asm b/arm64/crypto/gcm-hash.asm
new file mode 100644
index 00000000..b77b08d6
--- /dev/null
+++ b/arm64/crypto/gcm-hash.asm
@@ -0,0 +1,338 @@
+C arm64/crypto/gcm-hash.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Mamone Tarsha
+ Copyright (C) 2021 Michael Weiser
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+.file "gcm-hash.asm"
+.arch armv8-a+crypto
+
+.text
+
+C gcm_set_key() stores the H value in the middle element of the table
+define(`H_Idx', `128')
+
+C common register usage:
+define(`POLY', `v6')
+define(`T', `v7')
+define(`F', `v16')
+define(`F1', `v17')
+define(`R', `v18')
+define(`R1', `v19')
+
+C common macros:
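+C PMUL multiplies a 128-bit block by a precomputed key power (stored as
+C the M and L halves in the table), accumulating the four pmull/pmull2
+C partial products into F and R. REDUCTION then folds F and R modulo the
+C GHASH polynomial (the 0xC2...00 doubleword in POLY) into a single
+C 128-bit result.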
+.macro PMUL in, param1, param2
+ pmull F.1q,\param2\().1d,\in\().1d
+ pmull2 F1.1q,\param2\().2d,\in\().2d
+ pmull R.1q,\param1\().1d,\in\().1d
+ pmull2 R1.1q,\param1\().2d,\in\().2d
+ eor F.16b,F.16b,F1.16b
+ eor R.16b,R.16b,R1.16b
+.endm
+
+.macro REDUCTION out
+ pmull T.1q,F.1d,POLY.1d
+ eor R.16b,R.16b,T.16b
+ ext R.16b,R.16b,R.16b,#8
+ eor \out\().16b,F.16b,R.16b
+.endm
+
+ C void gcm_init_key (union gcm_block *table)
+
+C This function populates the gcm table with the following layout
+C *******************************************************************************
+C | H1M = (H1 div x⁶⁴)||((H1 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴ |
+C | H1L = (H1 mod x⁶⁴)||(((H1 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H1 div x⁶⁴) |
+C | |
+C | H2M = (H2 div x⁶⁴)||((H2 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴ |
+C | H2L = (H2 mod x⁶⁴)||(((H2 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H2 div x⁶⁴) |
+C | |
+C | H3M = (H3 div x⁶⁴)||((H3 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴ |
+C | H3L = (H3 mod x⁶⁴)||(((H3 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H3 div x⁶⁴) |
+C | |
+C | H4M = (H4 div x⁶⁴)||((H4 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴ |
+C | H4L = (H4 mod x⁶⁴)||(((H4 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H4 div x⁶⁴) |
+C *******************************************************************************
+
+C gcm_init_key register usage:
+define(`TABLE', `x0')
+
+define(`EMSB', `v0')
+define(`B', `v1')
+define(`H', `v2')
+define(`H2', `v3')
+define(`H3', `v4')
+define(`H4', `v5')
+define(`Hp', `v20')
+define(`Hl', `v21')
+define(`Hm', `v22')
+define(`H1M', `v23')
+define(`H1L', `v24')
+define(`H2M', `v25')
+define(`H2L', `v26')
+define(`H3M', `v27')
+define(`H3L', `v28')
+define(`H4M', `v29')
+define(`H4L', `v30')
+
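+C PMUL_PARAM derives the M and L halves used by PMUL from a power of H,
+C pre-folding the high doubleword through POLY so that the products
+C computed by PMUL are cheaper to reduce.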
+.macro PMUL_PARAM in, param1, param2
+ pmull2 Hp.1q,\in\().2d,POLY.2d
+ eor Hm.16b,\in\().16b,Hp.16b
+ ext \param1\().16b,Hm.16b,\in\().16b,#8
+ ext \param2\().16b,\in\().16b,Hm.16b,#8
+ ext \param1\().16b,\param1\().16b,\param1\().16b,#8
+.endm
+
+PROLOGUE(_nettle_gcm_init_key)
+ add x1,TABLE,#16*H_Idx
+ ld1 {H.2d},[x1]
+
+ C We treat data as big-endian doublewords for processing. Since there is no
+ C endianness-neutral MSB-first load operation we need to restore our desired
+ C byte order on little-endian systems. The same holds true for DATA below,
+ C but not for our own internal precalculated TABLE (see below).
+IF_LE(`
+ rev64 H.16b,H.16b
+')
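+ C Shift H left by one bit, carrying between the two doublewords, and
+ C fold in the reduction polynomial if the shifted-out MSB was set; this
+ C is the pre-multiplied form expected by the pmull-based multiplication
+ C below.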
+ dup EMSB.16b,H.b[7]
+ mov x1,#0xC200000000000000
+ mov x2,#1
+ mov POLY.d[0],x1
+ mov POLY.d[1],x2
+ sshr EMSB.16b,EMSB.16b,#7
+ and EMSB.16b,EMSB.16b,POLY.16b
+ ushr B.2d,H.2d,#63
+ and B.16b,B.16b,POLY.16b
+ ext B.16b,B.16b,B.16b,#8
+ shl H.2d,H.2d,#1
+ orr H.16b,H.16b,B.16b
+ eor H.16b,H.16b,EMSB.16b
+
+ dup POLY.2d,POLY.d[0]
+
+ C --- calculate H^2 = H*H ---
+
+ PMUL_PARAM H,H1M,H1L
+
+ PMUL H,H1M,H1L
+
+ REDUCTION H2
+
+ PMUL_PARAM H2,H2M,H2L
+
+ C we store to the table as doubleword-vectors in current memory endianness
+ C because it's our own strictly internal data structure and what gcm_hash
+ C can most naturally use
+ st1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE],#64
+
+ C --- calculate H^3 = H^1*H^2 ---
+
+ PMUL H2,H1M,H1L
+
+ REDUCTION H3
+
+ PMUL_PARAM H3,H3M,H3L
+
+ C --- calculate H^4 = H^2*H^2 ---
+
+ PMUL H2,H2M,H2L
+
+ REDUCTION H4
+
+ PMUL_PARAM H4,H4M,H4L
+
+ st1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[TABLE]
+
+ ret
+EPILOGUE(_nettle_gcm_init_key)
+
+C gcm_hash register usage:
+define(`TABLE', `x0')
+define(`X', `x1')
+define(`LENGTH', `x2')
+define(`DATA', `x3')
+
+define(`D', `v0')
+define(`C0', `v1')
+define(`C0D', `d1')
+define(`C1', `v2')
+define(`C2', `v3')
+define(`C3', `v4')
+define(`R2', `v20')
+define(`F2', `v21')
+define(`R3', `v22')
+define(`F3', `v23')
+define(`H1M', `v24')
+define(`H1L', `v25')
+define(`H2M', `v26')
+define(`H2L', `v27')
+define(`H3M', `v28')
+define(`H3L', `v29')
+define(`H4M', `v30')
+define(`H4L', `v31')
+
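+C PMUL_SUM is PMUL with the partial products xored into the running F
+C and R accumulators instead of overwriting them, so that several blocks
+C can share a single REDUCTION.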
+.macro PMUL_SUM in, param1, param2
+ pmull F2.1q,\param2\().1d,\in\().1d
+ pmull2 F3.1q,\param2\().2d,\in\().2d
+ pmull R2.1q,\param1\().1d,\in\().1d
+ pmull2 R3.1q,\param1\().2d,\in\().2d
+ eor F2.16b,F2.16b,F3.16b
+ eor R2.16b,R2.16b,R3.16b
+ eor F.16b,F.16b,F2.16b
+ eor R.16b,R.16b,R2.16b
+.endm
+
+ C void gcm_hash (const struct gcm_key *key, union gcm_block *x,
+ C size_t length, const uint8_t *data)
+
+PROLOGUE(_nettle_gcm_hash)
+ mov x4,#0xC200000000000000
+ mov POLY.d[0],x4
+
+ ld1 {D.2d},[X]
+IF_LE(`
+ rev64 D.16b,D.16b
+')
+
+ ands x4,LENGTH,#-64
+ b.eq L2x
+
+ add x5,TABLE,#64
+ ld1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE]
+ ld1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[x5]
+
+L4x_loop:
+ ld1 {C0.2d,C1.2d,C2.2d,C3.2d},[DATA],#64
+IF_LE(`
+ rev64 C0.16b,C0.16b
+ rev64 C1.16b,C1.16b
+ rev64 C2.16b,C2.16b
+ rev64 C3.16b,C3.16b
+')
+
+ eor C0.16b,C0.16b,D.16b
+
+ PMUL C1,H3M,H3L
+ PMUL_SUM C2,H2M,H2L
+ PMUL_SUM C3,H1M,H1L
+ PMUL_SUM C0,H4M,H4L
+
+ REDUCTION D
+
+ subs x4,x4,#64
+ b.ne L4x_loop
+
+ and LENGTH,LENGTH,#63
+
+L2x:
+ tst LENGTH,#-32
+ b.eq L1x
+
+ ld1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE]
+
+ ld1 {C0.2d,C1.2d},[DATA],#32
+IF_LE(`
+ rev64 C0.16b,C0.16b
+ rev64 C1.16b,C1.16b
+')
+
+ eor C0.16b,C0.16b,D.16b
+
+ PMUL C1,H1M,H1L
+ PMUL_SUM C0,H2M,H2L
+
+ REDUCTION D
+
+ and LENGTH,LENGTH,#31
+
+L1x:
+ tst LENGTH,#-16
+ b.eq Lmod
+
+ ld1 {H1M.2d,H1L.2d},[TABLE]
+
+ ld1 {C0.2d},[DATA],#16
+IF_LE(`
+ rev64 C0.16b,C0.16b
+')
+
+ eor C0.16b,C0.16b,D.16b
+
+ PMUL C0,H1M,H1L
+
+ REDUCTION D
+
+Lmod:
+ tst LENGTH,#15
+ b.eq Ldone
+
+ ld1 {H1M.2d,H1L.2d},[TABLE]
+
+ tbz LENGTH,3,Lmod_8
+ ldr C0D,[DATA],#8
+IF_LE(`
+ rev64 C0.16b,C0.16b
+')
+ mov x7,#0
+ mov C0.d[1],x7
+Lmod_8:
+ tst LENGTH,#7
+ b.eq Lmod_8_done
+ mov x6,#0
+ mov x5,#64
+ and x4,LENGTH,#7
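+ C Gather the remaining 1-7 bytes MSB-first into x6, matching the
+ C big-endian doubleword convention used throughout.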
+Lmod_8_loop:
+ mov x7,#0
+ ldrb w7,[DATA],#1
+ sub x5,x5,#8
+ lsl x7,x7,x5
+ orr x6,x6,x7
+ subs x4,x4,#1
+ b.ne Lmod_8_loop
+ tbz LENGTH,3,Lmod_8_load
+ mov C0.d[1],x6
+ b Lmod_8_done
+Lmod_8_load:
+ mov x7,#0
+ mov C0.d[0],x6
+ mov C0.d[1],x7
+Lmod_8_done:
+ eor C0.16b,C0.16b,D.16b
+
+ PMUL C0,H1M,H1L
+
+ REDUCTION D
+
+Ldone:
+IF_LE(`
+ rev64 D.16b,D.16b
+')
+ st1 {D.2d},[X]
+ ret
+EPILOGUE(_nettle_gcm_hash)
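
The routines above sit beneath Nettle's public GCM interface and are never
called directly. A minimal caller sketch against the documented nettle/gcm.h
API (illustrative only, not part of this commit):

  #include <stdio.h>
  #include <nettle/gcm.h>

  int main(void)
  {
    static const uint8_t key[16] = {0};          /* demo key */
    static const uint8_t iv[GCM_IV_SIZE] = {0};  /* demo nonce */
    const uint8_t msg[] = "hello, gcm";
    uint8_t ctext[sizeof(msg)], digest[GCM_DIGEST_SIZE];
    struct gcm_aes128_ctx ctx;

    gcm_aes128_set_key(&ctx, key);
    gcm_aes128_set_iv(&ctx, GCM_IV_SIZE, iv);
    gcm_aes128_encrypt(&ctx, sizeof(msg), ctext, msg);
    /* gcm_hash (the assembly above, on capable aarch64 CPUs) runs
       under the hood to authenticate the data. */
    gcm_aes128_digest(&ctx, GCM_DIGEST_SIZE, digest);
    printf("tag[0] = %02x\n", digest[0]);
    return 0;
  }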
diff --git a/arm64/machine.m4 b/arm64/machine.m4
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/arm64/machine.m4
diff --git a/configure.ac b/configure.ac
index 93ba4fba..6080a06a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -81,6 +81,10 @@ AC_ARG_ENABLE(arm-neon,
AC_HELP_STRING([--enable-arm-neon], [Enable ARM Neon assembly. (default=auto)]),,
[enable_arm_neon=auto])
+AC_ARG_ENABLE(arm64-crypto,
+ AC_HELP_STRING([--enable-arm64-crypto], [Enable Arm64 crypto extension. (default=no)]),,
+ [enable_arm64_crypto=no])
+
AC_ARG_ENABLE(x86-aesni,
AC_HELP_STRING([--enable-x86-aesni], [Enable x86_64 aes instructions. (default=no)]),,
[enable_x86_aesni=no])
@@ -344,6 +348,17 @@ case "$host_cpu" in
ABI=64
])
;;
+ aarch64*)
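+    # This deliberately fails to compile when __aarch64__ is defined,
+    # so the failure branch selects the 64-bit ABI.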
+ AC_TRY_COMPILE([
+#if defined(__aarch64__)
+#error 64-bit arm
+#endif
+ ], [], [
+ ABI=32
+ ], [
+ ABI=64
+ ])
+ ;;
esac
if test "x$ABI" != xstandard ; then
@@ -459,6 +474,20 @@ if test "x$enable_assembler" = xyes ; then
fi
fi
;;
+ aarch64*)
+ if test "$ABI" = 64 ; then
+ asm_path=arm64
+ if test "$enable_arm64_crypto" = yes ; then
+ asm_path="arm64/crypto $asm_path"
+ fi
+ else
+ # As far as I understand, Neon instructions are unlikely to be
+ # missing. It may be omitted "only for implementations
+ # targeting specialized markets", to quote the Armv8 reference
+ # manual.
+ asm_path="arm/neon arm/v6 arm"
+ fi
+ ;;
*powerpc64*)
if test "$ABI" = 64 ; then
GMP_ASM_POWERPC_R_REGISTERS
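
With these hooks in place, the crypto-extension code must be opted into at
configure time, for example:

  ./configure --enable-arm64-crypto
  make && make check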