author     Jussi Kivilinna <jussi.kivilinna@iki.fi>  2022-07-21 11:37:08 +0300
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>  2022-07-25 16:11:09 +0300
commit     909daa700e4b45d75469df298ee564b8fc2f4b72 (patch)
tree       979c0ac2b984f0499b4566a5dd26aaffbe27658a
parent     beaad75f4655e5316ce24f75ef172c231fd47fc1 (diff)
download   libgcrypt-909daa700e4b45d75469df298ee564b8fc2f4b72.tar.gz
blake2: add AVX512 accelerated implementations
* cipher/Makefile.am: Add 'blake2b-amd64-avx512.S' and
'blake2s-amd64-avx512.S'.
* cipher/blake2.c (USE_AVX512): New.
(ASM_FUNC_ABI): Setup attribute if USE_AVX2 or USE_AVX512 enabled in
addition to USE_AVX.
(BLAKE2B_CONTEXT_S, BLAKE2S_CONTEXT_S): Add 'use_avx512'.
(_gcry_blake2b_transform_amd64_avx512)
(_gcry_blake2s_transform_amd64_avx512): New.
(blake2b_transform, blake2s_transform) [USE_AVX512]: Add AVX512 path.
(blake2b_init_ctx, blake2s_init_ctx) [USE_AVX512]: Use AVX512 if HW
feature available.
* cipher/blake2b-amd64-avx512.S: New.
* cipher/blake2s-amd64-avx512.S: New.
* configure.ac: Add 'blake2b-amd64-avx512.lo' and
'blake2s-amd64-avx512.lo'.
--
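In outline, the new code slots into the existing runtime-dispatch scheme: context
initialization latches a per-context 'use_avx512' flag from the hardware-feature
word, and the transform dispatcher tests the newest extension first. A standalone
C sketch of that pattern follows; the feature-bit values and the helper function
are illustrative stand-ins, not the actual libgcrypt internals:

/* Standalone sketch of the dispatch pattern this patch extends;
 * names mirror cipher/blake2.c, but the bit values and the helper
 * are hypothetical. */
#include <stdio.h>

#define HWF_INTEL_AVX2   (1u << 0)  /* placeholder feature bits */
#define HWF_INTEL_AVX512 (1u << 1)

typedef struct
{
  unsigned int use_avx2:1;
  unsigned int use_avx512:1;  /* new in this patch */
} ctx_sketch;

static const char *pick_transform (const ctx_sketch *c)
{
  /* Newest extension is tested first, as in blake2b_transform(). */
  if (c->use_avx512)
    return "_gcry_blake2b_transform_amd64_avx512";
  else if (c->use_avx2)
    return "_gcry_blake2b_transform_amd64_avx2";
  return "blake2b_transform_generic";
}

int main (void)
{
  ctx_sketch c = { 0 };
  unsigned int features = HWF_INTEL_AVX2 | HWF_INTEL_AVX512;

  /* Init-time latch, as in blake2b_init_ctx(). */
  c.use_avx2 = !!(features & HWF_INTEL_AVX2);
  c.use_avx512 = !!(features & HWF_INTEL_AVX512);

  printf ("%s\n", pick_transform (&c));
  return 0;
}

The flag is computed once at init so the per-block dispatch is a single
bitfield test rather than a repeated CPUID query.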
Benchmark on Intel Core i3-1115G4 (tigerlake):
Before (AVX/AVX2 implementations):
| nanosecs/byte mebibytes/sec cycles/byte auto Mhz
BLAKE2B_512 | 0.841 ns/B 1134 MiB/s 3.44 c/B 4089
BLAKE2S_256 | 1.29 ns/B 741.2 MiB/s 5.26 c/B 4089
After (blake2b ~19% faster, blake2s ~26% faster):
| nanosecs/byte mebibytes/sec cycles/byte auto Mhz
BLAKE2B_512 | 0.705 ns/B 1353 MiB/s 2.88 c/B 4088
BLAKE2S_256 | 1.02 ns/B 933.3 MiB/s 4.18 c/B 4088
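(Speedups follow from the cycles/byte columns: 3.44/2.88 ≈ 1.19 for BLAKE2b
and 5.26/4.18 ≈ 1.26 for BLAKE2s.)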
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
-rw-r--r--  cipher/Makefile.am            |   3
-rw-r--r--  cipher/blake2.c               |  49
-rw-r--r--  cipher/blake2b-amd64-avx512.S | 312
-rw-r--r--  cipher/blake2s-amd64-avx512.S | 261
-rw-r--r--  configure.ac                  |   2
5 files changed, 622 insertions(+), 5 deletions(-)
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index c33d0754..477e856c 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -148,7 +148,8 @@ EXTRA_libcipher_la_SOURCES = \
         camellia-vaes-avx2-amd64.S camellia-aesni-avx2-amd64.S \
         camellia-arm.S camellia-aarch64.S \
         blake2.c \
-        blake2b-amd64-avx2.S blake2s-amd64-avx.S
+        blake2b-amd64-avx2.S blake2b-amd64-avx512.S \
+        blake2s-amd64-avx.S blake2s-amd64-avx512.S
 
 gost28147.lo: gost-sb.h
 gost-sb.h: gost-s-box$(EXEEXT_FOR_BUILD)
diff --git a/cipher/blake2.c b/cipher/blake2.c
index d7f9a7e4..45f74a56 100644
--- a/cipher/blake2.c
+++ b/cipher/blake2.c
@@ -46,11 +46,20 @@
 # define USE_AVX2 1
 #endif
 
+/* USE_AVX512 indicates whether to compile with Intel AVX512 code. */
+#undef USE_AVX512
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX512 1
+#endif
+
 /* AMD64 assembly implementations use SystemV ABI, ABI conversion and additional
  * stack to store XMM6-XMM15 needed on Win64. */
 #undef ASM_FUNC_ABI
 #undef ASM_EXTRA_STACK
-#if defined(USE_AVX2) && defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
+#if (defined(USE_AVX) || defined(USE_AVX2) || defined(USE_AVX512)) \
+    && defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
 # define ASM_FUNC_ABI __attribute__((sysv_abi))
 # define ASM_EXTRA_STACK (10 * 16)
 #else
@@ -98,6 +107,9 @@ typedef struct BLAKE2B_CONTEXT_S
 #ifdef USE_AVX2
   unsigned int use_avx2:1;
 #endif
+#ifdef USE_AVX512
+  unsigned int use_avx512:1;
+#endif
 } BLAKE2B_CONTEXT;
 
 typedef struct
@@ -132,6 +144,9 @@ typedef struct BLAKE2S_CONTEXT_S
 #ifdef USE_AVX
   unsigned int use_avx:1;
 #endif
+#ifdef USE_AVX512
+  unsigned int use_avx512:1;
+#endif
 } BLAKE2S_CONTEXT;
 
 typedef unsigned int (*blake2_transform_t)(void *S, const void *inblk,
@@ -346,6 +361,12 @@ unsigned int _gcry_blake2b_transform_amd64_avx2(BLAKE2B_STATE *S,
                                                 size_t nblks) ASM_FUNC_ABI;
 #endif
 
+#ifdef USE_AVX512
+unsigned int _gcry_blake2b_transform_amd64_avx512(BLAKE2B_STATE *S,
+                                                  const void *inblks,
+                                                  size_t nblks) ASM_FUNC_ABI;
+#endif
+
 static unsigned int blake2b_transform(void *ctx, const void *inblks,
                                       size_t nblks)
 {
@@ -354,8 +375,12 @@ static unsigned int blake2b_transform(void *ctx, const void *inblks,
 
   if (0)
     {}
+#ifdef USE_AVX512
+  else if (c->use_avx512)
+    nburn = _gcry_blake2b_transform_amd64_avx512(&c->state, inblks, nblks);
+#endif
 #ifdef USE_AVX2
-  if (c->use_avx2)
+  else if (c->use_avx2)
     nburn = _gcry_blake2b_transform_amd64_avx2(&c->state, inblks, nblks);
 #endif
   else
@@ -468,6 +493,9 @@ static gcry_err_code_t blake2b_init_ctx(void *ctx, unsigned int flags,
 #ifdef USE_AVX2
   c->use_avx2 = !!(features & HWF_INTEL_AVX2);
 #endif
+#ifdef USE_AVX512
+  c->use_avx512 = !!(features & HWF_INTEL_AVX512);
+#endif
 
   c->outlen = dbits / 8;
   c->buflen = 0;
@@ -670,6 +698,12 @@ unsigned int _gcry_blake2s_transform_amd64_avx(BLAKE2S_STATE *S,
                                                size_t nblks) ASM_FUNC_ABI;
 #endif
 
+#ifdef USE_AVX512
+unsigned int _gcry_blake2s_transform_amd64_avx512(BLAKE2S_STATE *S,
+                                                  const void *inblks,
+                                                  size_t nblks) ASM_FUNC_ABI;
+#endif
+
 static unsigned int blake2s_transform(void *ctx, const void *inblks,
                                       size_t nblks)
 {
@@ -677,9 +711,13 @@ static unsigned int blake2s_transform(void *ctx, const void *inblks,
   unsigned int nburn;
 
   if (0)
-    {}
+    { }
+#ifdef USE_AVX512
+  else if (c->use_avx512)
+    nburn = _gcry_blake2s_transform_amd64_avx512(&c->state, inblks, nblks);
+#endif
 #ifdef USE_AVX
-  if (c->use_avx)
+  else if (c->use_avx)
     nburn = _gcry_blake2s_transform_amd64_avx(&c->state, inblks, nblks);
 #endif
   else
@@ -792,6 +830,9 @@ static gcry_err_code_t blake2s_init_ctx(void *ctx, unsigned int flags,
 #ifdef USE_AVX
   c->use_avx = !!(features & HWF_INTEL_AVX);
 #endif
+#ifdef USE_AVX512
+  c->use_avx512 = !!(features & HWF_INTEL_AVX512);
+#endif
 
   c->outlen = dbits / 8;
   c->buflen = 0;
diff --git a/cipher/blake2b-amd64-avx512.S b/cipher/blake2b-amd64-avx512.S
new file mode 100644
index 00000000..db53474d
--- /dev/null
+++ b/cipher/blake2b-amd64-avx512.S
@@ -0,0 +1,312 @@
+/* blake2b-amd64-avx512.S - AVX512 implementation of BLAKE2b
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* The code is based on public-domain/CC0 BLAKE2 reference implementation
+ * by Samuel Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/sse
+ * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+.text
+
+/* register macros */
+#define RSTATE  %rdi
+#define RINBLKS %rsi
+#define RNBLKS  %rdx
+#define RIV     %rcx
+
+/* state structure */
+#define STATE_H 0
+#define STATE_T (STATE_H + 8 * 8)
+#define STATE_F (STATE_T + 2 * 8)
+
+/* vector registers */
+#define ROW1  %ymm0
+#define ROW2  %ymm1
+#define ROW3  %ymm2
+#define ROW4  %ymm3
+#define TMP1  %ymm4
+#define TMP1x %xmm4
+
+#define MA1 %ymm5
+#define MA2 %ymm6
+#define MA3 %ymm7
+#define MA4 %ymm8
+#define MA1x %xmm5
+#define MA2x %xmm6
+#define MA3x %xmm7
+#define MA4x %xmm8
+
+#define MB1 %ymm9
+#define MB2 %ymm10
+#define MB3 %ymm11
+#define MB4 %ymm12
+#define MB1x %xmm9
+#define MB2x %xmm10
+#define MB3x %xmm11
+#define MB4x %xmm12
+
+/**********************************************************************
+  blake2b/AVX512
+ **********************************************************************/
+
+#define GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, gather_masks) \
+        vmovdqa gather_masks + (4*4) * 0 rRIP, m2x; \
+        vmovdqa gather_masks + (4*4) * 1 rRIP, m3x; \
+        vmovdqa gather_masks + (4*4) * 2 rRIP, m4x; \
+        vmovdqa gather_masks + (4*4) * 3 rRIP, TMP1x; \
+        vpgatherdq (RINBLKS, m2x), m1 {%k1}; \
+        vpgatherdq (RINBLKS, m3x), m2 {%k2}; \
+        vpgatherdq (RINBLKS, m4x), m3 {%k3}; \
+        vpgatherdq (RINBLKS, TMP1x), m4 {%k4}
+
+#define GEN_GMASK(s0, s1, s2, s3, s4, s5, s6, s7, \
+                  s8, s9, s10, s11, s12, s13, s14, s15) \
+        .long (s0)*8, (s2)*8, (s4)*8, (s6)*8, \
+              (s1)*8, (s3)*8, (s5)*8, (s7)*8, \
+              (s8)*8, (s10)*8, (s12)*8, (s14)*8, \
+              (s9)*8, (s11)*8, (s13)*8, (s15)*8
+
+#define RESET_KMASKS() \
+        kmovw %k0, %k1; \
+        kmovw %k0, %k2; \
+        kmovw %k0, %k3; \
+        kmovw %k0, %k4
+
+#define LOAD_MSG_0(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask0); \
+        RESET_KMASKS()
+#define LOAD_MSG_1(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask1); \
+        RESET_KMASKS()
+#define LOAD_MSG_2(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask2); \
+        RESET_KMASKS()
+#define LOAD_MSG_3(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask3); \
+        RESET_KMASKS()
+#define LOAD_MSG_4(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask4); \
+        RESET_KMASKS()
+#define LOAD_MSG_5(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask5); \
+        RESET_KMASKS()
+#define LOAD_MSG_6(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask6); \
+        RESET_KMASKS()
+#define LOAD_MSG_7(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask7); \
+        RESET_KMASKS()
+#define LOAD_MSG_8(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask8); \
+        RESET_KMASKS()
+#define LOAD_MSG_9(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask9); \
+        RESET_KMASKS()
+#define LOAD_MSG_10(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask0); \
+        RESET_KMASKS()
+#define LOAD_MSG_11(m1, m2, m3, m4, m1x, m2x, m3x, m4x) \
+        GATHER_MSG(m1, m2, m3, m4, m1x, m2x, m3x, m4x, .Lgmask1);
+
+#define LOAD_MSG(r, m1, m2, m3, m4) \
+        LOAD_MSG_##r(m1, m2, m3, m4, m1##x, m2##x, m3##x, m4##x)
+
+#define ROR_32(in, out) vpshufd $0xb1, in, out
+
+#define ROR_24(in, out) vprorq $24, in, out
+
+#define ROR_16(in, out) vprorq $16, in, out
+
+#define ROR_63(in, out) vprorq $63, in, out
+
+#define G(r1, r2, r3, r4, m, ROR_A, ROR_B) \
+        vpaddq m, r1, r1; \
+        vpaddq r2, r1, r1; \
+        vpxor r1, r4, r4; \
+        ROR_A(r4, r4); \
+        vpaddq r4, r3, r3; \
+        vpxor r3, r2, r2; \
+        ROR_B(r2, r2)
+
+#define G1(r1, r2, r3, r4, m) \
+        G(r1, r2, r3, r4, m, ROR_32, ROR_24)
+
+#define G2(r1, r2, r3, r4, m) \
+        G(r1, r2, r3, r4, m, ROR_16, ROR_63)
+
+#define MM_SHUFFLE(z,y,x,w) \
+        (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+#define DIAGONALIZE(r1, r2, r3, r4) \
+        vpermq $MM_SHUFFLE(0,3,2,1), r2, r2; \
+        vpermq $MM_SHUFFLE(1,0,3,2), r3, r3; \
+        vpermq $MM_SHUFFLE(2,1,0,3), r4, r4
+
+#define UNDIAGONALIZE(r1, r2, r3, r4) \
+        vpermq $MM_SHUFFLE(2,1,0,3), r2, r2; \
+        vpermq $MM_SHUFFLE(1,0,3,2), r3, r3; \
+        vpermq $MM_SHUFFLE(0,3,2,1), r4, r4
+
+#define ROUND(r, m1, m2, m3, m4) \
+        G1(ROW1, ROW2, ROW3, ROW4, m1); \
+        G2(ROW1, ROW2, ROW3, ROW4, m2); \
+        DIAGONALIZE(ROW1, ROW2, ROW3, ROW4); \
+        G1(ROW1, ROW2, ROW3, ROW4, m3); \
+        G2(ROW1, ROW2, ROW3, ROW4, m4); \
+        UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4)
+
+ELF(.type blake2b_data,@object;)
+blake2b_data:
+.align 32
+.Liv:
+        .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
+        .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
+        .quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f
+        .quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
+.Lgmask0:
+        GEN_GMASK(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+.Lgmask1:
+        GEN_GMASK(14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3)
+.Lgmask2:
+        GEN_GMASK(11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4)
+.Lgmask3:
+        GEN_GMASK(7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8)
+.Lgmask4:
+        GEN_GMASK(9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13)
+.Lgmask5:
+        GEN_GMASK(2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9)
+.Lgmask6:
+        GEN_GMASK(12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11)
+.Lgmask7:
+        GEN_GMASK(13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10)
+.Lgmask8:
+        GEN_GMASK(6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5)
+.Lgmask9:
+        GEN_GMASK(10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0)
+
+.align 64
+.globl _gcry_blake2b_transform_amd64_avx512
+ELF(.type _gcry_blake2b_transform_amd64_avx512,@function;)
+
+_gcry_blake2b_transform_amd64_avx512:
+        /* input:
+         *      %rdi: state
+         *      %rsi: blks
+         *      %rdx: num_blks
+         */
+        CFI_STARTPROC();
+
+        movl $0xf, %eax;
+        kmovw %eax, %k0;
+        xorl %eax, %eax;
+        RESET_KMASKS();
+
+        addq $128, (STATE_T + 0)(RSTATE);
+        adcq $0, (STATE_T + 8)(RSTATE);
+
+        vmovdqa .Liv+(0 * 8) rRIP, ROW3;
+        vmovdqa .Liv+(4 * 8) rRIP, ROW4;
+
+        vmovdqu (STATE_H + 0 * 8)(RSTATE), ROW1;
+        vmovdqu (STATE_H + 4 * 8)(RSTATE), ROW2;
+
+        vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+        LOAD_MSG(0, MA1, MA2, MA3, MA4);
+        LOAD_MSG(1, MB1, MB2, MB3, MB4);
+        jmp .Loop;
+
+.align 64, 0xcc
+.Loop:
+        ROUND(0, MA1, MA2, MA3, MA4);
+        LOAD_MSG(2, MA1, MA2, MA3, MA4);
+        ROUND(1, MB1, MB2, MB3, MB4);
+        LOAD_MSG(3, MB1, MB2, MB3, MB4);
+        ROUND(2, MA1, MA2, MA3, MA4);
+        LOAD_MSG(4, MA1, MA2, MA3, MA4);
+        ROUND(3, MB1, MB2, MB3, MB4);
+        LOAD_MSG(5, MB1, MB2, MB3, MB4);
+        ROUND(4, MA1, MA2, MA3, MA4);
+        LOAD_MSG(6, MA1, MA2, MA3, MA4);
+        ROUND(5, MB1, MB2, MB3, MB4);
+        LOAD_MSG(7, MB1, MB2, MB3, MB4);
+        ROUND(6, MA1, MA2, MA3, MA4);
+        LOAD_MSG(8, MA1, MA2, MA3, MA4);
+        ROUND(7, MB1, MB2, MB3, MB4);
+        LOAD_MSG(9, MB1, MB2, MB3, MB4);
+        ROUND(8, MA1, MA2, MA3, MA4);
+        LOAD_MSG(10, MA1, MA2, MA3, MA4);
+        ROUND(9, MB1, MB2, MB3, MB4);
+        LOAD_MSG(11, MB1, MB2, MB3, MB4);
+        sub $1, RNBLKS;
+        jz .Loop_end;
+        RESET_KMASKS();
+
+        lea 128(RINBLKS), RINBLKS;
+        addq $128, (STATE_T + 0)(RSTATE);
+        adcq $0, (STATE_T + 8)(RSTATE);
+
+        ROUND(10, MA1, MA2, MA3, MA4);
+        LOAD_MSG(0, MA1, MA2, MA3, MA4);
+        ROUND(11, MB1, MB2, MB3, MB4);
+        LOAD_MSG(1, MB1, MB2, MB3, MB4);
+
+        vpternlogq $0x96, (STATE_H + 0 * 8)(RSTATE), ROW3, ROW1;
+        vpternlogq $0x96, (STATE_H + 4 * 8)(RSTATE), ROW4, ROW2;
+
+        vmovdqa .Liv+(0 * 8) rRIP, ROW3;
+        vmovdqa .Liv+(4 * 8) rRIP, ROW4;
+
+        vmovdqu ROW1, (STATE_H + 0 * 8)(RSTATE);
+        vmovdqu ROW2, (STATE_H + 4 * 8)(RSTATE);
+
+        vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+        jmp .Loop;
+
+.align 64, 0xcc
+.Loop_end:
+        ROUND(10, MA1, MA2, MA3, MA4);
+        ROUND(11, MB1, MB2, MB3, MB4);
+
+        vpternlogq $0x96, (STATE_H + 0 * 8)(RSTATE), ROW3, ROW1;
+        vpternlogq $0x96, (STATE_H + 4 * 8)(RSTATE), ROW4, ROW2;
+
+        vmovdqu ROW1, (STATE_H + 0 * 8)(RSTATE);
+        vmovdqu ROW2, (STATE_H + 4 * 8)(RSTATE);
+
+        kxorw %k0, %k0, %k0;
+        vzeroall;
+        RESET_KMASKS();
+        ret_spec_stop;
+        CFI_ENDPROC();
+ELF(.size _gcry_blake2b_transform_amd64_avx512,
+    .-_gcry_blake2b_transform_amd64_avx512;)
+
+#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
+#endif /*__x86_64*/
diff --git a/cipher/blake2s-amd64-avx512.S b/cipher/blake2s-amd64-avx512.S
new file mode 100644
index 00000000..4457ca99
--- /dev/null
+++ b/cipher/blake2s-amd64-avx512.S
@@ -0,0 +1,261 @@
+/* blake2s-amd64-avx512.S - AVX512 implementation of BLAKE2s
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* The code is based on public-domain/CC0 BLAKE2 reference implementation
+ * by Samuel Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/sse
+ * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+.text
+
+/* register macros */
+#define RSTATE  %rdi
+#define RINBLKS %rsi
+#define RNBLKS  %rdx
+#define RIV     %rcx
+
+/* state structure */
+#define STATE_H 0
+#define STATE_T (STATE_H + 8 * 4)
+#define STATE_F (STATE_T + 2 * 4)
+
+/* vector registers */
+#define ROW1  %xmm0
+#define ROW2  %xmm1
+#define ROW3  %xmm2
+#define ROW4  %xmm3
+#define TMP1  %xmm4
+#define TMP1x %xmm4
+
+#define MA1 %xmm5
+#define MA2 %xmm6
+#define MA3 %xmm7
+#define MA4 %xmm8
+
+#define MB1 %xmm9
+#define MB2 %xmm10
+#define MB3 %xmm11
+#define MB4 %xmm12
+
+/**********************************************************************
+  blake2s/AVX512
+ **********************************************************************/
+
+/* On Intel tigerlake, the vmovd+vpinsrd approach is faster than vpgatherdd. */
+#define GATHER_MSG(m1, m2, m3, m4, \
+                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
+                   s9, s10, s11, s12, s13, s14, s15) \
+        vmovd (s0)*4(RINBLKS), m1; \
+        vmovd (s1)*4(RINBLKS), m2; \
+        vmovd (s8)*4(RINBLKS), m3; \
+        vmovd (s9)*4(RINBLKS), m4; \
+        vpinsrd $1, (s2)*4(RINBLKS), m1, m1; \
+        vpinsrd $1, (s3)*4(RINBLKS), m2, m2; \
+        vpinsrd $1, (s10)*4(RINBLKS), m3, m3; \
+        vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \
+        vpinsrd $2, (s4)*4(RINBLKS), m1, m1; \
+        vpinsrd $2, (s5)*4(RINBLKS), m2, m2; \
+        vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \
+        vpinsrd $2, (s13)*4(RINBLKS), m4, m4; \
+        vpinsrd $3, (s6)*4(RINBLKS), m1, m1; \
+        vpinsrd $3, (s7)*4(RINBLKS), m2, m2; \
+        vpinsrd $3, (s14)*4(RINBLKS), m3, m3; \
+        vpinsrd $3, (s15)*4(RINBLKS), m4, m4;
+
+#define LOAD_MSG_0(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+#define LOAD_MSG_1(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3)
+#define LOAD_MSG_2(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4)
+#define LOAD_MSG_3(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8)
+#define LOAD_MSG_4(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13)
+#define LOAD_MSG_5(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9)
+#define LOAD_MSG_6(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11)
+#define LOAD_MSG_7(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10)
+#define LOAD_MSG_8(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5)
+#define LOAD_MSG_9(m1, m2, m3, m4) \
+        GATHER_MSG(m1, m2, m3, m4, \
+                   10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0)
+
+#define LOAD_MSG(r, m1, m2, m3, m4) LOAD_MSG_##r(m1, m2, m3, m4)
+
+#define ROR_16(in, out) vprord $16, in, out;
+
+#define ROR_8(in, out)  vprord $8, in, out;
+
+#define ROR_12(in, out) vprord $12, in, out;
+
+#define ROR_7(in, out)  vprord $7, in, out;
+
+#define G(r1, r2, r3, r4, m, ROR_A, ROR_B) \
+        vpaddd m, r1, r1; \
+        vpaddd r2, r1, r1; \
+        vpxor r1, r4, r4; \
+        ROR_A(r4, r4); \
+        vpaddd r4, r3, r3; \
+        vpxor r3, r2, r2; \
+        ROR_B(r2, r2);
+
+#define G1(r1, r2, r3, r4, m) \
+        G(r1, r2, r3, r4, m, ROR_16, ROR_12);
+
+#define G2(r1, r2, r3, r4, m) \
+        G(r1, r2, r3, r4, m, ROR_8, ROR_7);
+
+#define MM_SHUFFLE(z,y,x,w) \
+        (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+#define DIAGONALIZE(r1, r2, r3, r4) \
+        vpshufd $MM_SHUFFLE(0,3,2,1), r2, r2; \
+        vpshufd $MM_SHUFFLE(1,0,3,2), r3, r3; \
+        vpshufd $MM_SHUFFLE(2,1,0,3), r4, r4;
+
+#define UNDIAGONALIZE(r1, r2, r3, r4) \
+        vpshufd $MM_SHUFFLE(2,1,0,3), r2, r2; \
+        vpshufd $MM_SHUFFLE(1,0,3,2), r3, r3; \
+        vpshufd $MM_SHUFFLE(0,3,2,1), r4, r4;
+
+#define ROUND(r, m1, m2, m3, m4) \
+        G1(ROW1, ROW2, ROW3, ROW4, m1); \
+        G2(ROW1, ROW2, ROW3, ROW4, m2); \
+        DIAGONALIZE(ROW1, ROW2, ROW3, ROW4); \
+        G1(ROW1, ROW2, ROW3, ROW4, m3); \
+        G2(ROW1, ROW2, ROW3, ROW4, m4); \
+        UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4);
+
+ELF(.type blake2s_data,@object;)
+blake2s_data:
+.align 16
+.Liv:
+        .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+        .long 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+
+.align 64
+.globl _gcry_blake2s_transform_amd64_avx512
+ELF(.type _gcry_blake2s_transform_amd64_avx512,@function;)
+
+_gcry_blake2s_transform_amd64_avx512:
+        /* input:
+         *      %rdi: state
+         *      %rsi: blks
+         *      %rdx: num_blks
+         */
+        CFI_STARTPROC();
+
+        addq $64, (STATE_T + 0)(RSTATE);
+
+        vmovdqa .Liv+(0 * 4) rRIP, ROW3;
+        vmovdqa .Liv+(4 * 4) rRIP, ROW4;
+
+        vmovdqu (STATE_H + 0 * 4)(RSTATE), ROW1;
+        vmovdqu (STATE_H + 4 * 4)(RSTATE), ROW2;
+
+        vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+        LOAD_MSG(0, MA1, MA2, MA3, MA4);
+        LOAD_MSG(1, MB1, MB2, MB3, MB4);
+        jmp .Loop;
+
+.align 64, 0xcc
+.Loop:
+        ROUND(0, MA1, MA2, MA3, MA4);
+        LOAD_MSG(2, MA1, MA2, MA3, MA4);
+        ROUND(1, MB1, MB2, MB3, MB4);
+        LOAD_MSG(3, MB1, MB2, MB3, MB4);
+        ROUND(2, MA1, MA2, MA3, MA4);
+        LOAD_MSG(4, MA1, MA2, MA3, MA4);
+        ROUND(3, MB1, MB2, MB3, MB4);
+        LOAD_MSG(5, MB1, MB2, MB3, MB4);
+        ROUND(4, MA1, MA2, MA3, MA4);
+        LOAD_MSG(6, MA1, MA2, MA3, MA4);
+        ROUND(5, MB1, MB2, MB3, MB4);
+        LOAD_MSG(7, MB1, MB2, MB3, MB4);
+        ROUND(6, MA1, MA2, MA3, MA4);
+        LOAD_MSG(8, MA1, MA2, MA3, MA4);
+        ROUND(7, MB1, MB2, MB3, MB4);
+        LOAD_MSG(9, MB1, MB2, MB3, MB4);
+        sub $1, RNBLKS;
+        jz .Loop_end;
+
+        lea 64(RINBLKS), RINBLKS;
+        addq $64, (STATE_T + 0)(RSTATE);
+
+        ROUND(8, MA1, MA2, MA3, MA4);
+        LOAD_MSG(0, MA1, MA2, MA3, MA4);
+        ROUND(9, MB1, MB2, MB3, MB4);
+        LOAD_MSG(1, MB1, MB2, MB3, MB4);
+
+        vpternlogq $0x96, (STATE_H + 0 * 4)(RSTATE), ROW3, ROW1;
+        vpternlogq $0x96, (STATE_H + 4 * 4)(RSTATE), ROW4, ROW2;
+
+        vmovdqa .Liv+(0 * 4) rRIP, ROW3;
+        vmovdqa .Liv+(4 * 4) rRIP, ROW4;
+
+        vmovdqu ROW1, (STATE_H + 0 * 4)(RSTATE);
+        vmovdqu ROW2, (STATE_H + 4 * 4)(RSTATE);
+
+        vpxor (STATE_T)(RSTATE), ROW4, ROW4;
+
+        jmp .Loop;
+
+.align 64, 0xcc
+.Loop_end:
+        ROUND(8, MA1, MA2, MA3, MA4);
+        ROUND(9, MB1, MB2, MB3, MB4);
+
+        vpternlogq $0x96, (STATE_H + 0 * 4)(RSTATE), ROW3, ROW1;
+        vpternlogq $0x96, (STATE_H + 4 * 4)(RSTATE), ROW4, ROW2;
+
+        vmovdqu ROW1, (STATE_H + 0 * 4)(RSTATE);
+        vmovdqu ROW2, (STATE_H + 4 * 4)(RSTATE);
+
+        xorl %eax, %eax;
+        vzeroall;
+        ret_spec_stop;
+        CFI_ENDPROC();
+ELF(.size _gcry_blake2s_transform_amd64_avx512,
+    .-_gcry_blake2s_transform_amd64_avx512;)
+
+#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
+#endif /*__x86_64*/
diff --git a/configure.ac b/configure.ac
index 27159888..4921d73c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3191,7 +3191,9 @@ if test "$found" = "1" ; then
       x86_64-*-*)
          # Build with the assembly implementation
          GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2b-amd64-avx2.lo"
+         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2b-amd64-avx512.lo"
          GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2s-amd64-avx.lo"
+         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS blake2s-amd64-avx512.lo"
       ;;
    esac
 fi
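
Callers pick up the new code transparently through the normal message-digest
API; no interface changes are involved. A minimal usage sketch (assumes
libgcrypt >= 1.8 for the BLAKE2 algorithm IDs; build with -lgcrypt):

/* Hash a buffer with BLAKE2b-512; libgcrypt selects the fastest
 * transform at runtime (after this patch, the AVX512 one on capable
 * CPUs) via the HWF checks shown above. */
#include <stdio.h>
#include <gcrypt.h>

int main (void)
{
  const char msg[] = "abc";
  unsigned char digest[64];  /* BLAKE2b-512 emits 64 bytes */
  unsigned int i, dlen;

  if (!gcry_check_version (GCRYPT_VERSION))
    return 1;

  dlen = gcry_md_get_algo_dlen (GCRY_MD_BLAKE2B_512);
  gcry_md_hash_buffer (GCRY_MD_BLAKE2B_512, digest, msg, sizeof (msg) - 1);

  for (i = 0; i < dlen; i++)
    printf ("%02x", digest[i]);
  putchar ('\n');
  return 0;
}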