author     Niels Möller <nisse@lysator.liu.se>  2021-03-04 09:41:17 +0100
committer  Niels Möller <nisse@lysator.liu.se>  2021-03-04 09:41:17 +0100
commit     fe7ae87d1b837e82f7c7968b068bca7d853a4cec (patch)
tree       e13584719cf48a88c5c687c1aeb13b1a97528872
parent     c9d9c66b8ed111ab9ebd39440ec3f8d8d91734bd (diff)
parent     a471ae85f768b2b496b2e2e4272ba76fa74d5785 (diff)
download   nettle-fe7ae87d1b837e82f7c7968b068bca7d853a4cec.tar.gz
Merge branch 'arm64'
-rw-r--r--   ChangeLog                   23
-rw-r--r--   Makefile.in                  1
-rw-r--r--   arm64/README                45
-rw-r--r--   arm64/crypto/gcm-hash.asm  338
-rw-r--r--   arm64/machine.m4             0
-rw-r--r--   configure.ac                29

6 files changed, 436 insertions, 0 deletions
diff --git a/ChangeLog b/ChangeLog
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,26 @@
+2021-03-04  Niels Möller  <nisse@lysator.liu.se>
+
+	Merged initial arm64 code.
+
+2021-02-03  Niels Möller  <nisse@lysator.liu.se>
+
+	* arm64/crypto/gcm-hash.asm: Renamed directory, moved file,...
+	* arm64/v8/gcm-hash.asm: ... old name.
+
+2021-02-02  Niels Möller  <nisse@lysator.liu.se>
+
+	* arm64/v8/gcm-hash.asm: Add ".arch armv8-a+crypto" directive.
+	Supported by both GNU as and clang (the latter at least from
+	version 3.9.1).
+	* configure.ac: Don't add -march=armv8-a+crypto to CFLAGS.
+
+2021-01-31  Niels Möller  <nisse@lysator.liu.se>
+
+	* arm64/v8/gcm-hash.asm: New file, contributed by Maamoun TK and
+	Michael Weiser.
+	* arm64/README: New file. Document endianness issues, contributed
+	by Michael Weiser.
+
 2021-02-17  Niels Möller  <nisse@lysator.liu.se>
 
 	* Released Nettle-3.7.1.
diff --git a/Makefile.in b/Makefile.in
index db02f5c0..2274d8be 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -616,6 +616,7 @@ distdir: $(DISTFILES)
 	set -e; for d in sparc32 sparc64 x86 \
 		    x86_64 x86_64/aesni x86_64/sha_ni x86_64/fat \
 		    arm arm/neon arm/v6 arm/fat \
+		    arm64 arm64/crypto \
 		    powerpc64 powerpc64/p7 powerpc64/p8 powerpc64/fat ; do \
 	  mkdir "$(distdir)/$$d" ; \
 	  find "$(srcdir)/$$d" -maxdepth 1 '(' -name '*.asm' -o -name '*.m4' -o -name README ')' \
diff --git a/arm64/README b/arm64/README
new file mode 100644
index 00000000..139a3cc1
--- /dev/null
+++ b/arm64/README
@@ -0,0 +1,45 @@
+Endianness
+
+Similar to arm, aarch64 can run with little-endian or big-endian memory
+accesses.  Endianness is handled exclusively on load and store operations.
+Register layout and operation behaviour are identical in both modes.
+
+When writing SIMD code, endianness interaction with vector loads and stores
+may exhibit seemingly unintuitive behaviour, particularly when mixing normal
+and vector load/store operations.
+
+See https://llvm.org/docs/BigEndianNEON.html for a good overview, particularly
+of the pitfalls of using ldr/str vs. ld1/st1.
+
+For example, ld1 {v1.2d,v2.2d},[x0] will load v1 and v2 with elements of a
+one-dimensional vector from consecutive memory locations: v1.d[0] will be
+read from x0+0, v1.d[1] from x0+8 (bytes), v2.d[0] from x0+16 and v2.d[1]
+from x0+24.  This is the same in LE and BE mode, because it is the structure
+of the vector prescribed by the load operation.  Endianness is applied to the
+individual doublewords, but the order in which they are loaded from memory
+and put into d[0] and d[1] does not change.
+
+Another way is to explicitly load a vector of bytes using ld1
+{v1.16b,v2.16b},[x0].  This loads x0+0 into v1.b[0], x0+1 into v1.b[1] and
+so forth.  This load (or store) is endianness-neutral and behaves identically
+in LE and BE mode.
+
+Care must however be taken when switching views onto the registers: d[0] is
+mapped onto b[0] through b[7], where b[0] is the least significant byte of
+d[0] and b[7] the most significant.  This layout is also the same in both
+memory endianness modes.  ld1 {v1.8b}, however, will always load a vector of
+bytes with eight elements as consecutive bytes from memory into b[0] through
+b[7].  When accessed through d[0], this will only appear as the expected
+doubleword-sized number if it was indeed stored little-endian in memory.
+Something similar happens when loading a vector of doublewords (ld1
+{v1.2d},[x0]) and then accessing individual bytes of it: bytes will only be
+at the expected indices if the doublewords are stored in the current memory
+endianness.  Therefore it is most intuitive to use the vector element width
+matching the data being loaded or stored, so that the necessary endianness
+correction is applied.
+
+Finally, ldr/str are not vector operations.  When used to load a 128-bit
+quadword, they apply endianness to the whole quadword.  Therefore particular
+care must be taken if the loaded data is then to be regarded as elements of,
+e.g., a doubleword vector: indices may appear reversed on big-endian systems
+(because they are).
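To make the ld1 element-width rule above concrete, here is a small C sketch
(not part of the patch) using the ACLE NEON intrinsics that correspond to the
instructions discussed: vld1q_u64 is the intrinsic form of ld1 {v.2d},[x0]
and vld1q_u8 of ld1 {v.16b},[x0].

  #include <arm_neon.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
    /* 16 bytes with recognizable values: 00 01 02 ... 0f */
    union { uint8_t b[16]; uint64_t d[2]; } buf;
    for (int i = 0; i < 16; i++)
      buf.b[i] = (uint8_t) i;

    /* Doubleword view, like ld1 {v.2d}: each 64-bit lane is read in
       memory endianness, so the lane value differs between LE and BE. */
    uint64x2_t d = vld1q_u64(buf.d);
    printf("d lane 0: %016llx\n",
           (unsigned long long) vgetq_lane_u64(d, 0));

    /* Byte view, like ld1 {v.16b}: endianness-neutral, lane 0 holds
       buf.b[0] on both LE and BE hosts. */
    uint8x16_t b = vld1q_u8(buf.b);
    printf("b lane 0: %02x\n", vgetq_lane_u8(b, 0));

    return 0;
  }

On a little-endian host the doubleword view prints 0706050403020100; on a
big-endian host it prints 0001020304050607.  The byte view prints 00 in both
cases, which is the register-level mapping the README describes.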
diff --git a/arm64/crypto/gcm-hash.asm b/arm64/crypto/gcm-hash.asm
new file mode 100644
index 00000000..b77b08d6
--- /dev/null
+++ b/arm64/crypto/gcm-hash.asm
@@ -0,0 +1,338 @@
+C arm64/crypto/gcm-hash.asm
+
+ifelse(`
+   Copyright (C) 2020 Niels Möller and Mamone Tarsha
+   Copyright (C) 2021 Michael Weiser
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+.file "gcm-hash.asm"
+.arch armv8-a+crypto
+
+.text
+
+C gcm_set_key() assigns the H value to the middle element of the table
+define(`H_Idx', `128')
+
+C common register usage:
+define(`POLY', `v6')
+define(`T', `v7')
+define(`F', `v16')
+define(`F1', `v17')
+define(`R', `v18')
+define(`R1', `v19')
+
+C common macros:
+.macro PMUL in, param1, param2
+    pmull    F.1q,\param2\().1d,\in\().1d
+    pmull2   F1.1q,\param2\().2d,\in\().2d
+    pmull    R.1q,\param1\().1d,\in\().1d
+    pmull2   R1.1q,\param1\().2d,\in\().2d
+    eor      F.16b,F.16b,F1.16b
+    eor      R.16b,R.16b,R1.16b
+.endm
+
+.macro REDUCTION out
+    pmull    T.1q,F.1d,POLY.1d
+    eor      R.16b,R.16b,T.16b
+    ext      R.16b,R.16b,R.16b,#8
+    eor      \out\().16b,F.16b,R.16b
+.endm
+
+C void gcm_init_key (union gcm_block *table)
+
+C This function populates the gcm table with the following layout
+C *******************************************************************************
+C | H1M = (H1 div x⁶⁴)||((H1 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴               |
+C | H1L = (H1 mod x⁶⁴)||(((H1 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H1 div x⁶⁴)  |
+C |                                                                             |
+C | H2M = (H2 div x⁶⁴)||((H2 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴               |
+C | H2L = (H2 mod x⁶⁴)||(((H2 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H2 div x⁶⁴)  |
+C |                                                                             |
+C | H3M = (H3 div x⁶⁴)||((H3 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴               |
+C | H3L = (H3 mod x⁶⁴)||(((H3 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H3 div x⁶⁴)  |
+C |                                                                             |
+C | H4M = (H4 div x⁶⁴)||((H4 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴               |
+C | H4L = (H4 mod x⁶⁴)||(((H4 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H4 div x⁶⁴)  |
+C *******************************************************************************
+
+C gcm_init_key register usage:
+define(`TABLE', `x0')
+
+define(`EMSB', `v0')
+define(`B', `v1')
+define(`H', `v2')
+define(`H2', `v3')
+define(`H3', `v4')
+define(`H4', `v5')
+define(`Hp', `v20')
+define(`Hl', `v21')
+define(`Hm', `v22')
+define(`H1M', `v23')
+define(`H1L', `v24')
+define(`H2M', `v25')
+define(`H2L', `v26')
+define(`H3M', `v27')
+define(`H3L', `v28')
+define(`H4M', `v29')
+define(`H4L', `v30')
+
+.macro PMUL_PARAM in, param1, param2
+    pmull2   Hp.1q,\in\().2d,POLY.2d
+    eor      Hm.16b,\in\().16b,Hp.16b
+    ext      \param1\().16b,Hm.16b,\in\().16b,#8
+    ext      \param2\().16b,\in\().16b,Hm.16b,#8
+    ext      \param1\().16b,\param1\().16b,\param1\().16b,#8
+.endm
+
+PROLOGUE(_nettle_gcm_init_key)
+    add      x1,TABLE,#16*H_Idx
+    ld1      {H.2d},[x1]
+
+    C we treat data as big-endian doublewords for processing.  Since there
+    C is no endianness-neutral MSB-first load operation we need to restore
+    C our desired byte order on little-endian systems.  The same holds true
+    C for DATA below, but not for our own internal precalculated TABLE (see
+    C below).
+IF_LE(`
+    rev64    H.16b,H.16b
')
+    dup      EMSB.16b,H.b[7]
+    mov      x1,#0xC200000000000000
+    mov      x2,#1
+    mov      POLY.d[0],x1
+    mov      POLY.d[1],x2
+    sshr     EMSB.16b,EMSB.16b,#7
+    and      EMSB.16b,EMSB.16b,POLY.16b
+    ushr     B.2d,H.2d,#63
+    and      B.16b,B.16b,POLY.16b
+    ext      B.16b,B.16b,B.16b,#8
+    shl      H.2d,H.2d,#1
+    orr      H.16b,H.16b,B.16b
+    eor      H.16b,H.16b,EMSB.16b
+
+    dup      POLY.2d,POLY.d[0]
+
+    C --- calculate H^2 = H*H ---
+
+    PMUL_PARAM H,H1M,H1L
+
+    PMUL H,H1M,H1L
+
+    REDUCTION H2
+
+    PMUL_PARAM H2,H2M,H2L
+
+    C we store to the table as doubleword-vectors in current memory
+    C endianness, because it's our own strictly internal data structure and
+    C what gcm_hash can most naturally use
+    st1      {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE],#64
+
+    C --- calculate H^3 = H^1*H^2 ---
+
+    PMUL H2,H1M,H1L
+
+    REDUCTION H3
+
+    PMUL_PARAM H3,H3M,H3L
+
+    C --- calculate H^4 = H^2*H^2 ---
+
+    PMUL H2,H2M,H2L
+
+    REDUCTION H4
+
+    PMUL_PARAM H4,H4M,H4L
+
+    st1      {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[TABLE]
+
+    ret
+EPILOGUE(_nettle_gcm_init_key)
+
+C gcm_hash register usage:
+define(`TABLE', `x0')
+define(`X', `x1')
+define(`LENGTH', `x2')
+define(`DATA', `x3')
+
+define(`D', `v0')
+define(`C0', `v1')
+define(`C0D', `d1')
+define(`C1', `v2')
+define(`C2', `v3')
+define(`C3', `v4')
+define(`R2', `v20')
+define(`F2', `v21')
+define(`R3', `v22')
+define(`F3', `v23')
+define(`H1M', `v24')
+define(`H1L', `v25')
+define(`H2M', `v26')
+define(`H2L', `v27')
+define(`H3M', `v28')
+define(`H3L', `v29')
+define(`H4M', `v30')
+define(`H4L', `v31')
+
+.macro PMUL_SUM in, param1, param2
+    pmull    F2.1q,\param2\().1d,\in\().1d
+    pmull2   F3.1q,\param2\().2d,\in\().2d
+    pmull    R2.1q,\param1\().1d,\in\().1d
+    pmull2   R3.1q,\param1\().2d,\in\().2d
+    eor      F2.16b,F2.16b,F3.16b
+    eor      R2.16b,R2.16b,R3.16b
+    eor      F.16b,F.16b,F2.16b
+    eor      R.16b,R.16b,R2.16b
+.endm
+
+C void gcm_hash (const struct gcm_key *key, union gcm_block *x,
+C                size_t length, const uint8_t *data)
+
+PROLOGUE(_nettle_gcm_hash)
+    mov      x4,#0xC200000000000000
+    mov      POLY.d[0],x4
+
+    ld1      {D.2d},[X]
+IF_LE(`
+    rev64    D.16b,D.16b
')
+
+    ands     x4,LENGTH,#-64
+    b.eq     L2x
+
+    add      x5,TABLE,#64
+    ld1      {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE]
+    ld1      {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[x5]
+
+L4x_loop:
+    ld1      {C0.2d,C1.2d,C2.2d,C3.2d},[DATA],#64
+IF_LE(`
+    rev64    C0.16b,C0.16b
+    rev64    C1.16b,C1.16b
+    rev64    C2.16b,C2.16b
+    rev64    C3.16b,C3.16b
')
+
+    eor      C0.16b,C0.16b,D.16b
+
+    PMUL C1,H3M,H3L
+    PMUL_SUM C2,H2M,H2L
+    PMUL_SUM C3,H1M,H1L
+    PMUL_SUM C0,H4M,H4L
+
+    REDUCTION D
+
+    subs     x4,x4,#64
+    b.ne     L4x_loop
+
+    and      LENGTH,LENGTH,#63
+
+L2x:
+    tst      LENGTH,#-32
+    b.eq     L1x
+
+    ld1      {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE]
+
+    ld1      {C0.2d,C1.2d},[DATA],#32
+IF_LE(`
+    rev64    C0.16b,C0.16b
+    rev64    C1.16b,C1.16b
')
+
+    eor      C0.16b,C0.16b,D.16b
+
+    PMUL C1,H1M,H1L
+    PMUL_SUM C0,H2M,H2L
+
+    REDUCTION D
+
+    and      LENGTH,LENGTH,#31
+
+L1x:
+    tst      LENGTH,#-16
+    b.eq     Lmod
+
+    ld1      {H1M.2d,H1L.2d},[TABLE]
+
+    ld1      {C0.2d},[DATA],#16
+IF_LE(`
+    rev64    C0.16b,C0.16b
')
+
+    eor      C0.16b,C0.16b,D.16b
+
+    PMUL C0,H1M,H1L
+
+    REDUCTION D
+
+Lmod:
+    tst      LENGTH,#15
+    b.eq     Ldone
+
+    ld1      {H1M.2d,H1L.2d},[TABLE]
+
+    tbz      LENGTH,3,Lmod_8
+    ldr      C0D,[DATA],#8
+IF_LE(`
+    rev64    C0.16b,C0.16b
')
+    mov      x7,#0
+    mov      C0.d[1],x7
+Lmod_8:
+    tst      LENGTH,#7
+    b.eq     Lmod_8_done
+    mov      x6,#0
+    mov      x5,#64
+    and      x4,LENGTH,#7
+Lmod_8_loop:
+    mov      x7,#0
+    ldrb     w7,[DATA],#1
+    sub      x5,x5,#8
+    lsl      x7,x7,x5
+    orr      x6,x6,x7
+    subs     x4,x4,#1
+    b.ne     Lmod_8_loop
+    tbz      LENGTH,3,Lmod_8_load
+    mov      C0.d[1],x6
+    b        Lmod_8_done
+Lmod_8_load:
+    mov      x7,#0
+    mov      C0.d[0],x6
+    mov      C0.d[1],x7
+Lmod_8_done:
+    eor      C0.16b,C0.16b,D.16b
+
+    PMUL C0,H1M,H1L
+
+    REDUCTION D
+
+Ldone:
+IF_LE(`
+    rev64    D.16b,D.16b
')
+    st1      {D.2d},[X]
+    ret
+EPILOGUE(_nettle_gcm_hash)
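The L4x_loop above performs one REDUCTION per four blocks by multiplying with
descending precomputed powers of H, relying on the identity
D' = (D ⊕ C0)·H⁴ ⊕ C1·H³ ⊕ C2·H² ⊕ C3·H, which equals four sequential
D = (D ⊕ Ci)·H steps.  The following self-contained C sketch (not from the
patch; the gf128 type and function names are illustrative) checks that
identity with a textbook bit-at-a-time GF(2¹²⁸) multiply in the style of NIST
SP 800-38D.  It models the plain GHASH math only; the assembly additionally
pre-shifts H at key setup and folds the reduction, which this sketch does not
mirror.

  #include <stdint.h>
  #include <stdio.h>

  /* One GHASH block as two big-endian halves: hi = bytes 0-7, lo = 8-15. */
  typedef struct { uint64_t hi, lo; } gf128;

  static gf128 gf128_add(gf128 a, gf128 b)
  {
    gf128 r = { a.hi ^ b.hi, a.lo ^ b.lo };
    return r;
  }

  /* Textbook GHASH multiply: bit 0 of x is the most significant bit of its
     first byte, and multiplying by the indeterminate is a right shift with
     reduction by x^128 + x^7 + x^2 + x + 1. */
  static gf128 gf128_mul(gf128 x, gf128 h)
  {
    gf128 z = { 0, 0 }, v = h;
    for (int i = 0; i < 128; i++)
      {
        uint64_t xi = (i < 64 ? x.hi >> (63 - i) : x.lo >> (127 - i)) & 1;
        if (xi)
          z = gf128_add(z, v);
        uint64_t carry = v.lo & 1;
        v.lo = (v.lo >> 1) | (v.hi << 63);
        v.hi >>= 1;
        if (carry)
          v.hi ^= 0xE100000000000000ULL; /* 11100001 || 0^56 */
      }
    return z;
  }

  int main(void)
  {
    gf128 h = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
    gf128 d = { 0x1111111111111111ULL, 0x2222222222222222ULL };
    gf128 c[4] = { { 1, 2 }, { 3, 4 }, { 5, 6 }, { 7, 8 } };

    /* Sequential definition: D = (D + C_i) * H, four times. */
    gf128 seq = d;
    for (int i = 0; i < 4; i++)
      seq = gf128_mul(gf128_add(seq, c[i]), h);

    /* Aggregated form used by the assembly: one reduction per 4 blocks. */
    gf128 h2 = gf128_mul(h, h);
    gf128 h3 = gf128_mul(h2, h);
    gf128 h4 = gf128_mul(h2, h2);
    gf128 agg = gf128_mul(gf128_add(d, c[0]), h4);
    agg = gf128_add(agg, gf128_mul(c[1], h3));
    agg = gf128_add(agg, gf128_mul(c[2], h2));
    agg = gf128_add(agg, gf128_mul(c[3], h));

    printf("sequential: %016llx%016llx\n",
           (unsigned long long) seq.hi, (unsigned long long) seq.lo);
    printf("aggregated: %016llx%016llx\n",
           (unsigned long long) agg.hi, (unsigned long long) agg.lo);
    return 0;
  }

Both lines print the same value, which is the point: aggregation trades three
reductions for three extra precomputed key powers, exactly the H1..H4 M/L
pairs that gcm_init_key stores.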
diff --git a/arm64/machine.m4 b/arm64/machine.m4
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/arm64/machine.m4
diff --git a/configure.ac b/configure.ac
index 93ba4fba..6080a06a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -81,6 +81,10 @@ AC_ARG_ENABLE(arm-neon,
   AC_HELP_STRING([--enable-arm-neon], [Enable ARM Neon assembly. (default=auto)]),,
   [enable_arm_neon=auto])
 
+AC_ARG_ENABLE(arm64-crypto,
+  AC_HELP_STRING([--enable-arm64-crypto], [Enable Arm64 crypto extension. (default=no)]),,
+  [enable_arm64_crypto=no])
+
 AC_ARG_ENABLE(x86-aesni,
   AC_HELP_STRING([--enable-x86-aesni], [Enable x86_64 aes instructions. (default=no)]),,
   [enable_x86_aesni=no])
@@ -344,6 +348,17 @@ case "$host_cpu" in
 	ABI=64
       ])
     ;;
+  aarch64*)
+    AC_TRY_COMPILE([
+#if defined(__aarch64__)
+#error 64-bit arm
+#endif
+    ], [], [
+      ABI=32
+    ], [
+      ABI=64
+    ])
+    ;;
 esac
 
 if test "x$ABI" != xstandard ; then
@@ -459,6 +474,20 @@ if test "x$enable_assembler" = xyes ; then
 	fi
       fi
       ;;
+    aarch64*)
+      if test "$ABI" = 64 ; then
+	asm_path=arm64
+	if test "$enable_arm64_crypto" = yes ; then
+	  asm_path="arm64/crypto $asm_path"
+	fi
+      else
+	# As far as I understand, Neon instructions are unlikely to be
+	# missing.  It may be omitted "only for implementations
+	# targeting specialized markets", to quote the Armv8 reference
+	# manual.
+	asm_path="arm/neon arm/v6 arm"
+      fi
+      ;;
     *powerpc64*)
      if test "$ABI" = 64 ; then
 	GMP_ASM_POWERPC_R_REGISTERS
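With these configure.ac changes, 64-bit aarch64 builds pick up the arm64
assembly automatically, while the crypto-extension code stays opt-in: passing
--enable-arm64-crypto to ./configure prepends arm64/crypto to asm_path, so
gcm-hash.asm is found ahead of the generic implementation.  Defaulting to
"no" mirrors the existing x86-aesni option, since the crypto extension is
optional in Armv8-A.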