diff options
author | Niels Möller <nisse@lysator.liu.se> | 2021-03-22 19:08:14 +0100 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2021-03-22 19:08:14 +0100 |
commit | a3e38b1d36d189834deaa626111faa93bee95ca9 (patch) | |
tree | b0d6e3040fb1c533ec82995cfd0e5a96943a3987 | |
parent | 944881d7c7f321c6e4078f271e7e7be9b32aee07 (diff) | |
parent | 1585f6acd92508aef2988c362db598c2e35f56dd (diff) | |
download | nettle-a3e38b1d36d189834deaa626111faa93bee95ca9.tar.gz |
Merge arm64 fat support into master.
-rw-r--r-- | Makefile.in | 2 | ||||
-rw-r--r-- | arm64/README | 46 | ||||
-rw-r--r-- | arm64/crypto/gcm-hash.asm | 220 | ||||
-rw-r--r-- | arm64/fat/gcm-hash.asm | 38 | ||||
-rw-r--r-- | configure.ac | 12 | ||||
-rw-r--r-- | fat-arm64.c | 156 |
6 files changed, 361 insertions, 113 deletions
diff --git a/Makefile.in b/Makefile.in index 2274d8be..0ace35f7 100644 --- a/Makefile.in +++ b/Makefile.in @@ -616,7 +616,7 @@ distdir: $(DISTFILES) set -e; for d in sparc32 sparc64 x86 \ x86_64 x86_64/aesni x86_64/sha_ni x86_64/fat \ arm arm/neon arm/v6 arm/fat \ - arm64 arm64/crypto \ + arm64 arm64/crypto arm64/fat \ powerpc64 powerpc64/p7 powerpc64/p8 powerpc64/fat ; do \ mkdir "$(distdir)/$$d" ; \ find "$(srcdir)/$$d" -maxdepth 1 '(' -name '*.asm' -o -name '*.m4' -o -name README ')' \ diff --git a/arm64/README b/arm64/README index 139a3cc1..d2745d57 100644 --- a/arm64/README +++ b/arm64/README @@ -1,3 +1,42 @@ +General-purpose Registers[1] + +There are thirty-one, 64-bit, general-purpose (integer) registers visible to +the A64 instruction set; these are labeled r0-r30. In a 64-bit context these +registers are normally referred to using the names x0-x30; in a 32-bit context +the registers are specified by using w0-w30. Additionally, a stack-pointer +register, SP, can be used with a restricted number of instructions. + +The first eight registers, r0-r7, are used to pass argument values into +a subroutine and to return result values from a function. + +Software developers creating platform-independent code are advised to avoid +using r18 if at all possible. Most compilers provide a mechanism to prevent +specific registers from being used for general allocation; portable hand-coded +assembler should avoid it entirely. It should not be assumed that treating the +register as callee-saved will be sufficient to satisfy the requirements of the +platform. Virtualization code must, of course, treat the register as they would +any other resource provided to the virtual machine. + +A subroutine invocation must preserve the contents of the registers r19-r29 +and SP. All 64 bits of each value stored in r19-r29 must be preserved, even +when using the ILP32 data model. 
+ +SIMD and Floating-Point Registers[1] + +Unlike in AArch32, in AArch64 the 128-bit and 64-bit views of a SIMD and +Floating-Point register do not overlap multiple registers in a narrower view, +so q1, d1 and s1 all refer to the same entry in the register bank. + +The first eight registers, v0-v7, are used to pass argument values into +a subroutine and to return result values from a function. They may also +be used to hold intermediate values within a routine (but, in general, +only between subroutine calls). + +Registers v8-v15 must be preserved by a callee across subroutine calls; +the remaining registers (v0-v7, v16-v31) do not need to be preserved +(or should be preserved by the caller). Additionally, only the bottom 64 bits +of each value stored in v8-v15 need to be preserved. + Endianness Similar to arm, aarch64 can run with little-endian or big-endian memory @@ -8,8 +47,8 @@ When writing SIMD code, endianness interaction with vector loads and stores may exhibit seemingly unintuitive behaviour, particularly when mixing normal and vector load/store operations. -See https://llvm.org/docs/BigEndianNEON.html for a good overview, particularly -into the pitfalls of using ldr/str vs. ld1/st1. +See [2] for a good overview, particularly into the pitfalls of using +ldr/str vs. ld1/st1. For example, ld1 {v1.2d,v2.2d},[x0] will load v1 and v2 with elements of a one-dimensional vector from consecutive memory locations. So v1.d[0] will be @@ -43,3 +82,6 @@ quadword, they will apply endianness to the whole quadword. Therefore particular care must be taken if the loaded data is then to be regarded as elements of e.g. a doubleword vector. Indicies may appear reversed on big-endian systems (because they are). 
+ +[1] https://github.com/ARM-software/abi-aa/releases/download/2020Q4/aapcs64.pdf +[2] https://llvm.org/docs/BigEndianNEON.html diff --git a/arm64/crypto/gcm-hash.asm b/arm64/crypto/gcm-hash.asm index b77b08d6..3e4c98d8 100644 --- a/arm64/crypto/gcm-hash.asm +++ b/arm64/crypto/gcm-hash.asm @@ -1,4 +1,4 @@ -C arm/v8/gcm-hash.asm +C arm64/crypto/gcm-hash.asm ifelse(` Copyright (C) 2020 Niels Möller and Mamone Tarsha @@ -38,30 +38,42 @@ ifelse(` C gcm_set_key() assigns H value in the middle element of the table define(`H_Idx', `128') -C common register usage: +C common SIMD register usage: define(`POLY', `v6') +C temporary register that assists the reduction procedure define(`T', `v7') +C permanent register that holds the 16-byte result of pmull define(`F', `v16') +C permanent register that holds the 16-byte result of pmull2, +C its value is accumulated on 'F' register immediately define(`F1', `v17') +C permanent register that holds the 16-byte result of pmull define(`R', `v18') +C permanent register that holds the 16-byte result of pmull2, +C its value is accumulated on 'R' register immediately define(`R1', `v19') C common macros: -.macro PMUL in, param1, param2 - pmull F.1q,\param2\().1d,\in\().1d - pmull2 F1.1q,\param2\().2d,\in\().2d - pmull R.1q,\param1\().1d,\in\().1d - pmull2 R1.1q,\param1\().2d,\in\().2d +C long multiply of six 64-bit polynomials and sum +C R = (in.l × param1.l) + (in.h × param1.h) +C F = (in.l × param2.l) + (in.h × param2.h) +C PMUL(in, param1, param2) +define(`PMUL', m4_assert_numargs(3)` + pmull F.1q,$3.1d,$1.1d + pmull2 F1.1q,$3.2d,$1.2d + pmull R.1q,$2.1d,$1.1d + pmull2 R1.1q,$2.2d,$1.2d eor F.16b,F.16b,F1.16b eor R.16b,R.16b,R1.16b -.endm - -.macro REDUCTION out +') +C Reduce 'R' and 'F' values to 128-bit output +C REDUCTION(out) +define(`REDUCTION', m4_assert_numargs(1)` pmull T.1q,F.1d,POLY.1d eor R.16b,R.16b,T.16b ext R.16b,R.16b,R.16b,#8 - eor \out\().16b,F.16b,R.16b -.endm + eor $1.16b,F.16b,R.16b +') C void gcm_init_key (union 
gcm_block *table) @@ -101,13 +113,14 @@ define(`H3L', `v28') define(`H4M', `v29') define(`H4L', `v30') -.macro PMUL_PARAM in, param1, param2 - pmull2 Hp.1q,\in\().2d,POLY.2d - eor Hm.16b,\in\().16b,Hp.16b - ext \param1\().16b,Hm.16b,\in\().16b,#8 - ext \param2\().16b,\in\().16b,Hm.16b,#8 - ext \param1\().16b,\param1\().16b,\param1\().16b,#8 -.endm +C PMUL_PARAM(in, param1, param2): derive from in the param1 and param2 operands that are later passed to PMUL; uses Hp and Hm as scratch +define(`PMUL_PARAM', m4_assert_numargs(3)` + pmull2 Hp.1q,$1.2d,POLY.2d + eor Hm.16b,$1.16b,Hp.16b + ext $2.16b,Hm.16b,$1.16b,#8 + ext $3.16b,$1.16b,Hm.16b,#8 + ext $2.16b,$2.16b,$2.16b,#8 +') PROLOGUE(_nettle_gcm_init_key) add x1,TABLE,#16*H_Idx @@ -120,6 +133,8 @@ PROLOGUE(_nettle_gcm_init_key) IF_LE(` rev64 H.16b,H.16b ') + C --- calculate H = H × x mod R(X); R(X) = (x¹²⁸+x¹²⁷+x¹²⁶+x¹²¹+1) --- + dup EMSB.16b,H.b[7] mov x1,#0xC200000000000000 mov x2,#1 @@ -136,36 +151,36 @@ IF_LE(` dup POLY.2d,POLY.d[0] - C --- calculate H^2 = H*H --- + C --- calculate H^2 = H × H --- - PMUL_PARAM H,H1M,H1L + PMUL_PARAM(H,H1M,H1L) - PMUL H,H1M,H1L + PMUL(H,H1M,H1L) - REDUCTION H2 + REDUCTION(H2) - PMUL_PARAM H2,H2M,H2L + PMUL_PARAM(H2,H2M,H2L) C we store to the table as doubleword-vectors in current memory endianness C because it's our own strictly internal data structure and what gcm_hash C can most naturally use st1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE],#64 - C --- calculate H^3 = H^1*H^2 --- + C --- calculate H^3 = H^1 × H^2 --- - PMUL H2,H1M,H1L + PMUL(H2,H1M,H1L) - REDUCTION H3 + REDUCTION(H3) - PMUL_PARAM H3,H3M,H3L + PMUL_PARAM(H3,H3M,H3L) - C --- calculate H^4 = H^2*H^2 --- + C --- calculate H^4 = H^2 × H^2 --- - PMUL H2,H2M,H2L + PMUL(H2,H2M,H2L) - REDUCTION H4 + REDUCTION(H4) - PMUL_PARAM H4,H4M,H4L + PMUL_PARAM(H4,H4M,H4L) st1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[TABLE] @@ -180,7 +195,6 @@ define(`DATA', `x3') define(`D', `v0') define(`C0', `v1') -define(`C0D', `d1') define(`C1', `v2') define(`C2', `v3') define(`C3', `v4') @@ -197,16 +211,52 @@ define(`H3L', `v29') define(`H4M', `v30') 
define(`H4L', `v31') -.macro PMUL_SUM in, param1, param2 - pmull F2.1q,\param2\().1d,\in\().1d - pmull2 F3.1q,\param2\().2d,\in\().2d - pmull R2.1q,\param1\().1d,\in\().1d - pmull2 R3.1q,\param1\().2d,\in\().2d +C PMUL_SUM(in, param1, param2): same products as PMUL, but xor-accumulated into the current R and F (scratch: F2, F3, R2, R3) +define(`PMUL_SUM', m4_assert_numargs(3)` + pmull F2.1q,$3.1d,$1.1d + pmull2 F3.1q,$3.2d,$1.2d + pmull R2.1q,$2.1d,$1.1d + pmull2 R3.1q,$2.2d,$1.2d eor F2.16b,F2.16b,F3.16b eor R2.16b,R2.16b,R3.16b eor F.16b,F.16b,F2.16b eor R.16b,R.16b,R2.16b -.endm +') + +C Load the final partial block into SIMD register (byte count is the low four bits of LENGTH, read from DATA), +C stored in little-endian order for each 64-bit part; bytes beyond the data are zero +C LOAD_REV_PARTIAL_BLOCK(out): out must be a vN register name (the ldr operand is rewritten to dN); clobbers x4-x7; labels are fixed names, so expand only once per file +define(`LOAD_REV_PARTIAL_BLOCK', m4_assert_numargs(1)` + tbz LENGTH,3,Lless_8_bytes + ldr `d'substr($1,1,len($1)),[DATA],#8 +IF_LE(` + rev64 $1.16b,$1.16b +') + mov x7,#0 + mov $1.d[1],x7 + tst LENGTH,#7 + b.eq Lload_done +Lless_8_bytes: + mov x6,#0 + mov x5,#64 + and x4,LENGTH,#7 +Lload_byte_loop: + mov x7,#0 + ldrb w7,[DATA],#1 + sub x5,x5,#8 + lsl x7,x7,x5 + orr x6,x6,x7 + subs x4,x4,#1 + b.ne Lload_byte_loop + tbz LENGTH,3,Lstore_hi_dw + mov $1.d[1],x6 + b Lload_done +Lstore_hi_dw: + mov x7,#0 + mov $1.d[0],x6 + mov $1.d[1],x7 +Lload_done: +') C void gcm_hash (const struct gcm_key *key, union gcm_block *x, C size_t length, const uint8_t *data) @@ -221,13 +271,13 @@ IF_LE(` ') ands x4,LENGTH,#-64 - b.eq L2x + b.eq L1_block add x5,TABLE,#64 ld1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE] ld1 {H3M.2d,H3L.2d,H4M.2d,H4L.2d},[x5] -L4x_loop: +L4_blocks_loop: ld1 {C0.2d,C1.2d,C2.2d,C3.2d},[DATA],#64 IF_LE(` rev64 C0.16b,C0.16b @@ -238,45 +288,25 @@ IF_LE(` eor C0.16b,C0.16b,D.16b - PMUL C1,H3M,H3L - PMUL_SUM C2,H2M,H2L - PMUL_SUM C3,H1M,H1L - PMUL_SUM C0,H4M,H4L + PMUL(C1,H3M,H3L) + PMUL_SUM(C2,H2M,H2L) + PMUL_SUM(C3,H1M,H1L) + PMUL_SUM(C0,H4M,H4L) - REDUCTION D + REDUCTION(D) subs x4,x4,#64 - b.ne L4x_loop + b.ne L4_blocks_loop and LENGTH,LENGTH,#63 -L2x: - tst LENGTH,#-32 - b.eq L1x - - ld1 {H1M.2d,H1L.2d,H2M.2d,H2L.2d},[TABLE] - - ld1 
{C0.2d,C1.2d},[DATA],#32 -IF_LE(` - rev64 C0.16b,C0.16b - rev64 C1.16b,C1.16b -') - - eor C0.16b,C0.16b,D.16b - - PMUL C1,H1M,H1L - PMUL_SUM C0,H2M,H2L - - REDUCTION D - - and LENGTH,LENGTH,#31 - -L1x: - tst LENGTH,#-16 - b.eq Lmod +L1_block: + ands x4,LENGTH,#-16 + b.eq Lpartial ld1 {H1M.2d,H1L.2d},[TABLE] +L1_block_loop: ld1 {C0.2d},[DATA],#16 IF_LE(` rev64 C0.16b,C0.16b @@ -284,52 +314,28 @@ IF_LE(` eor C0.16b,C0.16b,D.16b - PMUL C0,H1M,H1L + PMUL(C0,H1M,H1L) + + REDUCTION(D) - REDUCTION D + subs x4,x4,#16 + b.ne L1_block_loop -Lmod: +Lpartial: tst LENGTH,#15 - b.eq Ldone + b.eq Lghash_done ld1 {H1M.2d,H1L.2d},[TABLE] - tbz LENGTH,3,Lmod_8 - ldr C0D,[DATA],#8 -IF_LE(` - rev64 C0.16b,C0.16b -') - mov x7,#0 - mov C0.d[1],x7 -Lmod_8: - tst LENGTH,#7 - b.eq Lmod_8_done - mov x6,#0 - mov x5,#64 - and x4,LENGTH,#7 -Lmod_8_loop: - mov x7,#0 - ldrb w7,[DATA],#1 - sub x5,x5,#8 - lsl x7,x7,x5 - orr x6,x6,x7 - subs x4,x4,#1 - b.ne Lmod_8_loop - tbz LENGTH,3,Lmod_8_load - mov C0.d[1],x6 - b Lmod_8_done -Lmod_8_load: - mov x7,#0 - mov C0.d[0],x6 - mov C0.d[1],x7 -Lmod_8_done: + LOAD_REV_PARTIAL_BLOCK(C0) + eor C0.16b,C0.16b,D.16b - PMUL C0,H1M,H1L + PMUL(C0,H1M,H1L) - REDUCTION D + REDUCTION(D) -Ldone: +Lghash_done: IF_LE(` rev64 D.16b,D.16b ') diff --git a/arm64/fat/gcm-hash.asm b/arm64/fat/gcm-hash.asm new file mode 100644 index 00000000..5ef171b5 --- /dev/null +++ b/arm64/fat/gcm-hash.asm @@ -0,0 +1,38 @@ +C arm64/fat/gcm-hash.asm + +ifelse(` + Copyright (C) 2021 Mamone Tarsha + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. 
+ + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +dnl picked up by configure +dnl PROLOGUE(_nettle_fat_gcm_init_key) +dnl PROLOGUE(_nettle_fat_gcm_hash) + +define(`fat_transform', `$1_arm64') +include_src(`arm64/crypto/gcm-hash.asm') diff --git a/configure.ac b/configure.ac index 6080a06a..026ae99d 100644 --- a/configure.ac +++ b/configure.ac @@ -476,9 +476,15 @@ if test "x$enable_assembler" = xyes ; then ;; aarch64*) if test "$ABI" = 64 ; then - asm_path=arm64 - if test "$enable_arm64_crypto" = yes ; then - asm_path="arm64/crypto $asm_path" + asm_path=arm64 + if test "x$enable_fat" = xyes ; then + asm_path="arm64/fat $asm_path" + OPT_NETTLE_SOURCES="fat-arm64.c $OPT_NETTLE_SOURCES" + FAT_TEST_LIST="none pmull" + else + if test "$enable_arm64_crypto" = yes ; then + asm_path="arm64/crypto $asm_path" + fi fi else # As far as I understand, Neon instructions are unlikely to be diff --git a/fat-arm64.c b/fat-arm64.c new file mode 100644 index 00000000..9f81951f --- /dev/null +++ b/fat-arm64.c @@ -0,0 +1,156 @@ +/* fat-arm64.c + + Copyright (C) 2021 Mamone Tarsha + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. 
+ + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +*/ + +#define _GNU_SOURCE + +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#if defined(__linux__) && defined(__GLIBC__) && defined(__GLIBC_PREREQ) +# if __GLIBC_PREREQ(2, 16) +# define USE_GETAUXVAL 1 +# include <asm/hwcap.h> +# include <sys/auxv.h> +# endif +#endif + +#include "nettle-types.h" + +#include "gcm.h" +#include "gcm-internal.h" +#include "fat-setup.h" + +/* Fallback values, used only when the headers above did not define them; taken from arch/arm64/include/uapi/asm/hwcap.h in the Linux kernel */ +#ifndef HWCAP_ASIMD +#define HWCAP_ASIMD (1 << 1) +#endif +#ifndef HWCAP_PMULL +#define HWCAP_PMULL (1 << 4) +#endif +/* CPU capabilities consulted by fat_init. */ +struct arm64_features +{ + int have_pmull; /* non-zero selects the arm64 GCM routines */ +}; +/* True when the slen-byte string s equals the llen-byte literal. */ +#define MATCH(s, slen, literal, llen) \ + ((slen) == (llen) && memcmp ((s), (literal), llen) == 0) +/* Set features->have_pmull. If secure_getenv (ENV_OVERRIDE) returns a string, it is read as a comma-separated list and only an item equal to pmull enables the feature; otherwise, when USE_GETAUXVAL is set, the feature is enabled iff getauxval (AT_HWCAP) reports both HWCAP_ASIMD and HWCAP_PMULL. */ +static void +get_arm64_features (struct arm64_features *features) +{ + const char *s; + features->have_pmull = 0; + + s = secure_getenv (ENV_OVERRIDE); + if (s) + for (;;) + { + const char *sep = strchr (s, ','); + size_t length = sep ? 
(size_t) (sep - s) : strlen(s); + /* an item counts only if it matches the whole word */ + if (MATCH (s, length, "pmull", 5)) + features->have_pmull = 1; + if (!sep) + break; + s = sep + 1; + } + else + { +#if USE_GETAUXVAL + unsigned long hwcap = getauxval(AT_HWCAP); + features->have_pmull + = ((hwcap & (HWCAP_ASIMD | HWCAP_PMULL)) == (HWCAP_ASIMD | HWCAP_PMULL)); +#endif /* without getauxval, have_pmull keeps the 0 assigned above */ + } +} +/* Declarations for the two GCM entry points and their c and arm64 variants. NOTE(review): DECLARE_FAT_FUNC and DECLARE_FAT_FUNC_VAR are defined outside this view, presumably in fat-setup.h - confirm. */ +#if GCM_TABLE_BITS == 8 +DECLARE_FAT_FUNC(_nettle_gcm_init_key, gcm_init_key_func) +DECLARE_FAT_FUNC_VAR(gcm_init_key, gcm_init_key_func, c) +DECLARE_FAT_FUNC_VAR(gcm_init_key, gcm_init_key_func, arm64) + +DECLARE_FAT_FUNC(_nettle_gcm_hash, gcm_hash_func) +DECLARE_FAT_FUNC_VAR(gcm_hash, gcm_hash_func, c) +DECLARE_FAT_FUNC_VAR(gcm_hash, gcm_hash_func, arm64) +#endif /* GCM_TABLE_BITS == 8 */ +/* Points _nettle_gcm_init_key_vec and _nettle_gcm_hash_vec at the arm64 or the C routines according to get_arm64_features, and prints diagnostics to stderr when getenv (ENV_VERBOSE) is non-NULL. NOTE(review): CONSTRUCTOR is defined outside this view; presumably it makes this run when the library is loaded - confirm. */ +static void CONSTRUCTOR +fat_init (void) +{ + struct arm64_features features; + int verbose; + + get_arm64_features (&features); + + verbose = getenv (ENV_VERBOSE) != NULL; + if (verbose) + fprintf (stderr, "libnettle: cpu features: %s\n", + features.have_pmull ? "polynomial multiply long instructions (PMULL/PMULL2)" : ""); + + if (features.have_pmull) + { + if (verbose) + fprintf (stderr, "libnettle: enabling hardware-accelerated polynomial multiply code.\n"); +#if GCM_TABLE_BITS == 8 + /* Switch _nettle_gcm_init_key_vec and _nettle_gcm_hash_vec together: + the gcm_key table filled by 
_nettle_gcm_init_key_c() + is incompatible with _nettle_gcm_hash_arm64(), so both pointers + must select the same implementation. */ + _nettle_gcm_init_key_vec = _nettle_gcm_init_key_arm64; + _nettle_gcm_hash_vec = _nettle_gcm_hash_arm64; +#endif /* GCM_TABLE_BITS == 8 */ + } + else + { +#if GCM_TABLE_BITS == 8 + _nettle_gcm_init_key_vec = _nettle_gcm_init_key_c; + _nettle_gcm_hash_vec = _nettle_gcm_hash_c; +#endif /* GCM_TABLE_BITS == 8 */ + } +} +/* Definitions of the two GCM entry points. NOTE(review): DEFINE_FAT_FUNC is defined outside this view; presumably each definition forwards through the matching _vec pointer set in fat_init - confirm. */ +#if GCM_TABLE_BITS == 8 +DEFINE_FAT_FUNC(_nettle_gcm_init_key, void, + (union nettle_block16 *table), + (table)) + +DEFINE_FAT_FUNC(_nettle_gcm_hash, void, + (const struct gcm_key *key, union nettle_block16 *x, + size_t length, const uint8_t *data), + (key, x, length, data)) +#endif /* GCM_TABLE_BITS == 8 */ |