-rw-r--r-- | LICENSES                     |   1
-rw-r--r-- | cipher/Makefile.am           |   3
-rw-r--r-- | cipher/keccak-amd64-avx512.S | 583
-rw-r--r-- | cipher/keccak.c              |  83
-rw-r--r-- | configure.ac                 |   2
5 files changed, 670 insertions, 2 deletions
@@ -139,6 +139,7 @@ with any binary distributions derived from the GNU C Library.
   For files:
   - cipher/cipher-gcm-ppc.c
+  - cipher/keccak-amd64-avx512.S
 
   #+begin_quote
   Copyright (c) 2006, CRYPTOGAMS by <appro@openssl.org>
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 3d95a794..c33d0754 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -134,7 +134,8 @@ EXTRA_libcipher_la_SOURCES = \
 	sha512-armv7-neon.S sha512-arm.S \
 	sha512-ppc.c sha512-ssse3-i386.c \
 	sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S sm3-armv8-aarch64-ce.S \
-	keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
+	keccak.c keccak_permute_32.h keccak_permute_64.h \
+	keccak-armv7-neon.S keccak-amd64-avx512.S \
 	stribog.c \
 	tiger.c \
 	whirlpool.c whirlpool-sse2-amd64.S \
diff --git a/cipher/keccak-amd64-avx512.S b/cipher/keccak-amd64-avx512.S
new file mode 100644
index 00000000..f44e0285
--- /dev/null
+++ b/cipher/keccak-amd64-avx512.S
@@ -0,0 +1,583 @@
+/* keccak-amd64-avx512.S - x86-64 AVX512 implementation of Keccak
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * ---
+ *
+ * Core function `KeccakF1600_ce` based on ARMv8-CE KeccakF1600 implementation
+ * by Andy Polyakov from CRYPTOGAMS distribution `arm/keccak1600-armv8.pl`.
+ * `KeccakF1600_ce` was ported to x86-64 AVX512 and converted to use GCC
+ * preprocessed assembly and fitted with new absorb function optimized for
+ * x86-64. SHA3-256 performance on Intel tigerlake, 5.72 cpB.
+ *
+ * Original copyright license follows:
+ *
+ *  Copyright (c) 2006, CRYPTOGAMS by <appro@openssl.org>
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  * Redistributions of source code must retain copyright notices,
+ *    this list of conditions and the following disclaimer.
+ *
+ *  * Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ *  * Neither the name of the CRYPTOGAMS nor the names of its
+ *    copyright holder and contributors may be used to endorse or
+ *    promote products derived from this software without specific
+ *    prior written permission.
+ *
+ *  ALTERNATIVELY, provided that this notice is retained in full, this
+ *  product may be distributed under the terms of the GNU General Public
+ *  License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+ *  those given above.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+.text
+
+/* Register macros. */
+#define A_0_0 %xmm31
+#define A_0_1 %xmm30
+#define A_0_2 %xmm29
+#define A_0_3 %xmm28
+#define A_0_4 %xmm27
+#define A_1_0 %xmm26
+#define A_1_1 %xmm25
+#define A_1_2 %xmm24
+#define A_1_3 %xmm23
+#define A_1_4 %xmm22
+#define A_2_0 %xmm21
+#define A_2_1 %xmm20
+#define A_2_2 %xmm19
+#define A_2_3 %xmm18
+#define A_2_4 %xmm17
+#define A_3_0 %xmm16
+#define A_3_1 %xmm15
+#define A_3_2 %xmm14
+#define A_3_3 %xmm13
+#define A_3_4 %xmm12
+#define A_4_0 %xmm11
+#define A_4_1 %xmm10
+#define A_4_2 %xmm9
+#define A_4_3 %xmm8
+#define A_4_4 %xmm7
+
+#define C_0 %xmm6
+#define C_1 %xmm5
+#define C_2 %xmm4
+#define C_3 %xmm3
+#define C_4 %xmm2
+#define C_5 %xmm1
+#define C_6 %xmm0
+
+#define D_0 C_4
+#define D_1 C_5
+#define D_2 C_6
+#define D_3 C_2
+#define D_4 C_3
+
+/* Helper macros for ARMv8-CE to x86-64/AVX512 conversion. */
+#define eor3_d(dst_s1, s2, s3) \
+        vpternlogq $0x96, s3, s2, dst_s1;
+
+#define eor3(dst, s1, s2, s3) \
+        vmovdqa s1, dst; \
+        eor3_d(dst, s2, s3);
+
+#define rax1_c(dst, s1, s2_rol1) \
+        vprolq $1, s2_rol1, dst; \
+        vpxor s1, dst, dst;
+
+#define rax1_t(dst_s1, s2_rol1, tmp) \
+        vprolq $1, s2_rol1, tmp; \
+        vpxor tmp, dst_s1, dst_s1;
+
+#define rax1_s(dst_s1, s2_rol1) \
+        vprolq $1, s2_rol1, s2_rol1; \
+        vpxor s2_rol1, dst_s1, dst_s1;
+
+#define xar(dst, s1, s2, rol) \
+        vpxorq s2, s1, dst; \
+        vprolq $(rol), dst, dst;
+
+#define xar_x(dst, s1, s2, rol) \
+        vpxor s2, s1, dst; \
+        vprolq $(rol), dst, dst;
+
+#define bcax_d(dst_s1, s2, s3) \
+        vpternlogq $0xb4, s3, s2, dst_s1;
+
+#define bcax(dst, s1, s2, s3) \
+        vmovdqa64 s1, dst; \
+        bcax_d(dst, s2, s3);
+
+#define bcax_x(dst, s1, s2, s3) \
+        vmovdqa s1, dst; \
+        bcax_d(dst, s2, s3);
+
+#define eor(dst, s1, s2) \
+        vpxorq s2, s1, dst;
+
+/* Misc helper macros. */
+#define clear_avx512_4regs(a, b, c, d) \
+        eor(a, a, a); vmovdqa64 a, b; vmovdqa64 a, c; vmovdqa64 a, d;
+
+#define clear_regs() \
+        vzeroall; /* xmm0-xmm15 */ \
+        clear_avx512_4regs(%xmm16, %xmm17, %xmm18, %xmm19); \
+        clear_avx512_4regs(%xmm20, %xmm21, %xmm22, %xmm23); \
+        clear_avx512_4regs(%xmm24, %xmm25, %xmm26, %xmm27); \
+        clear_avx512_4regs(%xmm28, %xmm29, %xmm30, %xmm31);
+
+ELF(.type KeccakF1600_ce,@function)
+.align 64, 0xcc
+KeccakF1600_ce:
+.Loop_ce:
+        CFI_STARTPROC()
+
+        ////////////////////////////////////////////////// Theta
+        eor3(   C_0, A_4_0, A_3_0, A_2_0)
+        eor3(   C_1, A_4_1, A_3_1, A_2_1)
+        eor3(   C_3, A_4_3, A_3_3, A_2_3)
+        eor3(   C_2, A_4_2, A_3_2, A_2_2)
+        eor3(   C_4, A_4_4, A_3_4, A_2_4)
+        eor3_d( C_0, A_1_0, A_0_0)
+        eor3_d( C_1, A_1_1, A_0_1)
+        eor3_d( C_3, A_1_3, A_0_3)
+        eor3_d( C_2, A_1_2, A_0_2)
+        eor3_d( C_4, A_1_4, A_0_4)
+
+        rax1_c( C_5, C_0, C_2)          // D[1]
+        rax1_t( C_2, C_4, C_6)          // D[3]
+        rax1_c( C_6, C_1, C_3)          // D[2]
+        rax1_s( C_3, C_0)               // D[4]
+        rax1_s( C_4, C_1)               // D[0]
+
+        ////////////////////////////////////////////////// Theta+Rho+Pi
+        xar(    C_0, A_0_1, D_1, 1)     // C[0]=A[2][0]
+
+        xar(    A_0_1, A_1_1, D_1, 44)
+        xar(    A_1_1, A_1_4, D_4, 20)
+        xar(    A_1_4, A_4_2, D_2, 61)
+        xar(    A_4_2, A_2_4, D_4, 39)
+        xar(    A_2_4, A_4_0, D_0, 18)
+
+        xar(    C_1, A_0_2, D_2, 62)    // C[1]=A[4][0]
+
+        xar(    A_0_2, A_2_2, D_2, 43)
+        xar(    A_2_2, A_2_3, D_3, 25)
+        xar(    A_2_3, A_3_4, D_4, 8)
+        xar_x(  A_3_4, A_4_3, D_3, 56)
+        xar(    A_4_3, A_3_0, D_0, 41)
+
+        xar(    A_3_0, A_0_4, D_4, 27)
+
+        xar_x(  D_4, A_4_4, D_4, 14)    // D[4]=A[0][4]
+        xar_x(  A_4_4, A_4_1, D_1, 2)
+        xar(    A_1_3, A_1_3, D_3, 55)  // A[1][3]=A[4][1]
+        xar(    A_0_4, A_3_1, D_1, 45)  // A[0][4]=A[1][3]
+        xar(    A_3_1, A_1_0, D_0, 36)
+
+        xar(    A_1_0, A_0_3, D_3, 28)
+
+        eor(    A_0_0, A_0_0, D_0)
+
+        xar_x(  D_3, A_3_3, D_3, 21)    // D[3]=A[0][3]
+        xar(    A_0_3, A_3_2, D_2, 15)  // A[0][3]=A[3][3]
+        xar(    D_1, A_2_1, D_1, 10)    // D[1]=A[3][2]
+        xar(    D_2, A_1_2, D_2, 6)     // D[2]=A[2][1]
+        xar(    D_0, A_2_0, D_0, 3)     // D[0]=A[1][2]
+
+        ////////////////////////////////////////////////// Chi+Iota
+        bcax_x( A_4_0, C_1, A_4_2, A_1_3)       // A[1][3]=A[4][1]
+        bcax(   A_4_1, A_1_3, A_4_3, A_4_2)     // A[1][3]=A[4][1]
+        bcax_d( A_4_2, A_4_4, A_4_3)
+        bcax_d( A_4_3, C_1, A_4_4)
+        bcax_d( A_4_4, A_1_3, C_1)              // A[1][3]=A[4][1]
+
+        bcax_x( A_3_2, D_1, A_3_4, A_0_3)       // A[0][3]=A[3][3]
+        bcax(   A_3_3, A_0_3, A_3_0, A_3_4)     // A[0][3]=A[3][3]
+        bcax_d( A_3_4, A_3_1, A_3_0)
+        bcax_d( A_3_0, D_1, A_3_1)
+        bcax_d( A_3_1, A_0_3, D_1)              // A[0][3]=A[3][3]
+
+        bcax(   A_2_0, C_0, A_2_2, D_2)
+        bcax(   A_2_1, D_2, A_2_3, A_2_2)
+        bcax_d( A_2_2, A_2_4, A_2_3)
+        bcax_d( A_2_3, C_0, A_2_4)
+        bcax_d( A_2_4, D_2, C_0)
+
+        bcax(   A_1_2, D_0, A_1_4, A_0_4)       // A[0][4]=A[1][3]
+        bcax(   A_1_3, A_0_4, A_1_0, A_1_4)     // A[0][4]=A[1][3]
+        bcax_d( A_1_4, A_1_1, A_1_0)
+        bcax_d( A_1_0, D_0, A_1_1)
+        bcax_d( A_1_1, A_0_4, D_0)              // A[0][4]=A[1][3]
+
+        bcax(   A_0_3, D_3, A_0_0, D_4)
+        bcax(   A_0_4, D_4, A_0_1, A_0_0)
+        bcax_d( A_0_0, A_0_2, A_0_1)
+        bcax_d( A_0_1, D_3, A_0_2)
+        bcax_d( A_0_2, D_4, D_3)
+        eor(    A_0_0, A_0_0, (%r10))
+
+        cmpq %r10, %r11
+        je .Lend_ce
+
+        addq $8, %r10
+        jmp .Loop_ce
+
+.align 64, 0xcc
+.Lend_ce:
+        ret_spec_stop
+        CFI_ENDPROC()
+ELF(.size KeccakF1600_ce,.-KeccakF1600_ce)
+
+.globl _gcry_keccak_f1600_state_permute64_avx512
+ELF(.type _gcry_keccak_f1600_state_permute64_avx512,@function)
+.align 64, 0xcc
+_gcry_keccak_f1600_state_permute64_avx512:
+        /* input:
+         *      %rdi: state
+         *      %rsi: round constants
+         */
+        CFI_STARTPROC()
+
+        leaq 12*8(%rdi), %rax
+        leaq (24-1)*8(%rsi), %r11
+
+        vmovdqu64 0*8(%rdi), A_0_0
+        vmovdqu64 1*8(%rdi), A_0_1
+        vmovdqu64 2*8(%rdi), A_0_2
+        vmovdqu64 3*8(%rdi), A_0_3
+        vmovdqu64 4*8(%rdi), A_0_4
+        vmovdqu64 5*8(%rdi), A_1_0
+        vmovdqu64 6*8(%rdi), A_1_1
+        vmovdqu64 7*8(%rdi), A_1_2
+        vmovdqu64 8*8(%rdi), A_1_3
+        vmovdqu64 9*8(%rdi), A_1_4
+        vmovdqu64 10*8(%rdi), A_2_0
+        vmovdqu64 11*8(%rdi), A_2_1
+        vmovdqu64 0*8(%rax), A_2_2
+        vmovdqu64 1*8(%rax), A_2_3
+        vmovdqu64 2*8(%rax), A_2_4
+        vmovdqu64 3*8(%rax), A_3_0
+        vmovdqu 4*8(%rax), A_3_1
+        vmovdqu 5*8(%rax), A_3_2
+        vmovdqu 6*8(%rax), A_3_3
+        vmovdqu 7*8(%rax), A_3_4
+        vmovdqu 8*8(%rax), A_4_0
+        vmovdqu 9*8(%rax), A_4_1
+        vmovdqu 10*8(%rax), A_4_2
+        vmovdqu 11*8(%rax), A_4_3
+        vmovq 12*8(%rax), A_4_4
+
+        movq %rsi, %r10
+        call KeccakF1600_ce
+
+        vpunpcklqdq A_0_1, A_0_0, A_0_0
+        vpunpcklqdq A_0_3, A_0_2, A_0_2
+        vpunpcklqdq A_1_0, A_0_4, A_0_4
+        vpunpcklqdq A_1_2, A_1_1, A_1_1
+        vpunpcklqdq A_1_4, A_1_3, A_1_3
+        vpunpcklqdq A_2_1, A_2_0, A_2_0
+        vpunpcklqdq A_2_3, A_2_2, A_2_2
+        vpunpcklqdq A_3_0, A_2_4, A_2_4
+        vpunpcklqdq A_3_2, A_3_1, A_3_1
+        vpunpcklqdq A_3_4, A_3_3, A_3_3
+        vpunpcklqdq A_4_1, A_4_0, A_4_0
+        vpunpcklqdq A_4_3, A_4_2, A_4_2
+        vmovdqu64 A_0_0, 0*8(%rdi)
+        vmovdqu64 A_0_2, 2*8(%rdi)
+        vmovdqu64 A_0_4, 4*8(%rdi)
+        vmovdqu64 A_1_1, 6*8(%rdi)
+        vmovdqu64 A_1_3, 8*8(%rdi)
+        vmovdqu64 A_2_0, 10*8(%rdi)
+        vmovdqu64 A_2_2, 0*8(%rax)
+        vmovdqu64 A_2_4, 2*8(%rax)
+        vmovdqu A_3_1, 4*8(%rax)
+        vmovdqu A_3_3, 6*8(%rax)
+        vmovdqu A_4_0, 8*8(%rax)
+        vmovdqu A_4_2, 10*8(%rax)
+        vmovq A_4_4, 12*8(%rax)
+
+        xorl %eax, %eax
+
+        clear_regs()
+        ret_spec_stop
+        CFI_ENDPROC()
+ELF(.size _gcry_keccak_f1600_state_permute64_avx512,
+    .-_gcry_keccak_f1600_state_permute64_avx512)
+
+.globl _gcry_keccak_absorb_blocks_avx512
+ELF(.type _gcry_keccak_absorb_blocks_avx512,@function)
+.align 64, 0xcc
+_gcry_keccak_absorb_blocks_avx512:
+        /* input:
+         *      %rdi: state
+         *      %rsi: round constants
+         *      %rdx: lanes
+         *      %rcx: nlanes
+         *      %r8 : blocklanes
+         *      %r9 : lanes output pointer
+         */
+        CFI_STARTPROC()
+
+        leaq 12*8(%rdi), %rax
+        leaq (24-1)*8(%rsi), %r11
+
+        vmovdqu64 0*8(%rdi), A_0_0
+        vmovdqu64 1*8(%rdi), A_0_1
+        vmovdqu64 2*8(%rdi), A_0_2
+        vmovdqu64 3*8(%rdi), A_0_3
+        vmovdqu64 4*8(%rdi), A_0_4
+        vmovdqu64 5*8(%rdi), A_1_0
+        vmovdqu64 6*8(%rdi), A_1_1
+        vmovdqu64 7*8(%rdi), A_1_2
+        vmovdqu64 8*8(%rdi), A_1_3
+        vmovdqu64 9*8(%rdi), A_1_4
+        vmovdqu64 10*8(%rdi), A_2_0
+        vmovdqu64 11*8(%rdi), A_2_1
+        vmovdqu64 0*8(%rax), A_2_2
+        vmovdqu64 1*8(%rax), A_2_3
+        vmovdqu64 2*8(%rax), A_2_4
+        vmovdqu64 3*8(%rax), A_3_0
+        vmovdqu 4*8(%rax), A_3_1
+        vmovdqu 5*8(%rax), A_3_2
+        vmovdqu 6*8(%rax), A_3_3
+        vmovdqu 7*8(%rax), A_3_4
+        vmovdqu 8*8(%rax), A_4_0
+        vmovdqu 9*8(%rax), A_4_1
+        vmovdqu 10*8(%rax), A_4_2
+        vmovdqu 11*8(%rax), A_4_3
+        vmovq 12*8(%rax), A_4_4
+
+        cmpq $(104 >> 3), %r8
+        jb .Loop_absorb_72_ce
+        je .Loop_absorb_104_ce
+        cmpq $(144 >> 3), %r8
+        jb .Loop_absorb_136_ce
+        je .Loop_absorb_144_ce
+        jmp .Loop_absorb_168_ce
+
+.align 64, 0xcc
+.Loop_absorb_168_ce:
+        subq %r8, %rcx          // len - bsz
+        jb .Labsorbed_ce
+
+        vpxorq 0*8(%rdx), A_0_0, A_0_0
+        vpxorq 1*8(%rdx), A_0_1, A_0_1
+        vpxorq 2*8(%rdx), A_0_2, A_0_2
+        vpxorq 3*8(%rdx), A_0_3, A_0_3
+        vpxorq 4*8(%rdx), A_0_4, A_0_4
+        vpxorq 5*8(%rdx), A_1_0, A_1_0
+        vpxorq 6*8(%rdx), A_1_1, A_1_1
+        vpxorq 7*8(%rdx), A_1_2, A_1_2
+        vpxorq 8*8(%rdx), A_1_3, A_1_3
+        vpxorq 9*8(%rdx), A_1_4, A_1_4
+        vpxorq 10*8(%rdx), A_2_0, A_2_0
+        vpxorq 11*8(%rdx), A_2_1, A_2_1
+        vpxorq 12*8(%rdx), A_2_2, A_2_2
+        vpxorq 13*8(%rdx), A_2_3, A_2_3
+        vpxorq 14*8(%rdx), A_2_4, A_2_4
+        vpxorq 15*8(%rdx), A_3_0, A_3_0
+        vpxor 16*8(%rdx), A_3_1, A_3_1
+        vpxor 17*8(%rdx), A_3_2, A_3_2
+        vpxor 18*8(%rdx), A_3_3, A_3_3
+        vpxor 19*8(%rdx), A_3_4, A_3_4
+        vmovq 20*8(%rdx), C_0
+        leaq 21*8(%rdx), %rdx
+        vpxorq C_0, A_4_0, A_4_0
+
+        movq %rsi, %r10
+        call KeccakF1600_ce
+
+        jmp .Loop_absorb_168_ce
+
+.align 64, 0xcc
+.Loop_absorb_144_ce:
+        subq %r8, %rcx          // len - bsz
+        jb .Labsorbed_ce
+
+        vpxorq 0*8(%rdx), A_0_0, A_0_0
+        vpxorq 1*8(%rdx), A_0_1, A_0_1
+        vpxorq 2*8(%rdx), A_0_2, A_0_2
+        vpxorq 3*8(%rdx), A_0_3, A_0_3
+        vpxorq 4*8(%rdx), A_0_4, A_0_4
+        vpxorq 5*8(%rdx), A_1_0, A_1_0
+        vpxorq 6*8(%rdx), A_1_1, A_1_1
+        vpxorq 7*8(%rdx), A_1_2, A_1_2
+        vpxorq 8*8(%rdx), A_1_3, A_1_3
+        vpxorq 9*8(%rdx), A_1_4, A_1_4
+        vpxorq 10*8(%rdx), A_2_0, A_2_0
+        vpxorq 11*8(%rdx), A_2_1, A_2_1
+        vpxorq 12*8(%rdx), A_2_2, A_2_2
+        vpxorq 13*8(%rdx), A_2_3, A_2_3
+        vpxorq 14*8(%rdx), A_2_4, A_2_4
+        vpxorq 15*8(%rdx), A_3_0, A_3_0
+        vpxor 16*8(%rdx), A_3_1, A_3_1
+        vmovq 17*8(%rdx), C_0
+        leaq 18*8(%rdx), %rdx
+        vpxor C_0, A_3_2, A_3_2
+
+        movq %rsi, %r10
+        call KeccakF1600_ce
+
+        jmp .Loop_absorb_144_ce
+
+.align 64, 0xcc
+.Loop_absorb_136_ce:
+        subq %r8, %rcx          // len - bsz
+        jb .Labsorbed_ce
+
+        vpxorq 0*8(%rdx), A_0_0, A_0_0
+        vpxorq 1*8(%rdx), A_0_1, A_0_1
+        vpxorq 2*8(%rdx), A_0_2, A_0_2
+        vpxorq 3*8(%rdx), A_0_3, A_0_3
+        vpxorq 4*8(%rdx), A_0_4, A_0_4
+        vpxorq 5*8(%rdx), A_1_0, A_1_0
+        vpxorq 6*8(%rdx), A_1_1, A_1_1
+        vpxorq 7*8(%rdx), A_1_2, A_1_2
+        vpxorq 8*8(%rdx), A_1_3, A_1_3
+        vpxorq 9*8(%rdx), A_1_4, A_1_4
+        vpxorq 10*8(%rdx), A_2_0, A_2_0
+        vpxorq 11*8(%rdx), A_2_1, A_2_1
+        vpxorq 12*8(%rdx), A_2_2, A_2_2
+        vpxorq 13*8(%rdx), A_2_3, A_2_3
+        vpxorq 14*8(%rdx), A_2_4, A_2_4
+        vpxorq 15*8(%rdx), A_3_0, A_3_0
+        vmovq 16*8(%rdx), C_0
+        leaq 17*8(%rdx), %rdx
+        vpxor C_0, A_3_1, A_3_1
+
+        movq %rsi, %r10
+        call KeccakF1600_ce
+
+        jmp .Loop_absorb_136_ce
+
+.align 64, 0xcc
+.Loop_absorb_104_ce:
+        subq %r8, %rcx          // len - bsz
+        jb .Labsorbed_ce
+
+        vpxorq 0*8(%rdx), A_0_0, A_0_0
+        vpxorq 1*8(%rdx), A_0_1, A_0_1
+        vpxorq 2*8(%rdx), A_0_2, A_0_2
+        vpxorq 3*8(%rdx), A_0_3, A_0_3
+        vpxorq 4*8(%rdx), A_0_4, A_0_4
+        vpxorq 5*8(%rdx), A_1_0, A_1_0
+        vpxorq 6*8(%rdx), A_1_1, A_1_1
+        vpxorq 7*8(%rdx), A_1_2, A_1_2
+        vpxorq 8*8(%rdx), A_1_3, A_1_3
+        vpxorq 9*8(%rdx), A_1_4, A_1_4
+        vpxorq 10*8(%rdx), A_2_0, A_2_0
+        vpxorq 11*8(%rdx), A_2_1, A_2_1
+        vmovq 12*8(%rdx), C_0
+        leaq 13*8(%rdx), %rdx
+        vpxorq C_0, A_2_2, A_2_2
+
+        movq %rsi, %r10
+        call KeccakF1600_ce
+
+        jmp .Loop_absorb_104_ce
+
+.align 64, 0xcc
+.Loop_absorb_72_ce:
+        subq %r8, %rcx          // len - bsz
+        jb .Labsorbed_ce
+
+        vpxorq 0*8(%rdx), A_0_0, A_0_0
+        vpxorq 1*8(%rdx), A_0_1, A_0_1
+        vpxorq 2*8(%rdx), A_0_2, A_0_2
+        vpxorq 3*8(%rdx), A_0_3, A_0_3
+        vpxorq 4*8(%rdx), A_0_4, A_0_4
+        vpxorq 5*8(%rdx), A_1_0, A_1_0
+        vpxorq 6*8(%rdx), A_1_1, A_1_1
+        vpxorq 7*8(%rdx), A_1_2, A_1_2
+        vmovq 8*8(%rdx), C_0
+        leaq 9*8(%rdx), %rdx
+        vpxorq C_0, A_1_3, A_1_3
+
+        movq %rsi, %r10
+        call KeccakF1600_ce
+
+        jmp .Loop_absorb_72_ce
+
+.align 64, 0xcc
+.Labsorbed_ce:
+        vpunpcklqdq A_0_1, A_0_0, A_0_0
+        vpunpcklqdq A_0_3, A_0_2, A_0_2
+        vpunpcklqdq A_1_0, A_0_4, A_0_4
+        vpunpcklqdq A_1_2, A_1_1, A_1_1
+        vpunpcklqdq A_1_4, A_1_3, A_1_3
+        vpunpcklqdq A_2_1, A_2_0, A_2_0
+        vpunpcklqdq A_2_3, A_2_2, A_2_2
+        vpunpcklqdq A_3_0, A_2_4, A_2_4
+        vpunpcklqdq A_3_2, A_3_1, A_3_1
+        vpunpcklqdq A_3_4, A_3_3, A_3_3
+        vpunpcklqdq A_4_1, A_4_0, A_4_0
+        vpunpcklqdq A_4_3, A_4_2, A_4_2
+        vmovdqu64 A_0_0, 0*8(%rdi)
+        vmovdqu64 A_0_2, 2*8(%rdi)
+        vmovdqu64 A_0_4, 4*8(%rdi)
+        vmovdqu64 A_1_1, 6*8(%rdi)
+        vmovdqu64 A_1_3, 8*8(%rdi)
+        vmovdqu64 A_2_0, 10*8(%rdi)
+        vmovdqu64 A_2_2, 0*8(%rax)
+        vmovdqu64 A_2_4, 2*8(%rax)
+        vmovdqu A_3_1, 4*8(%rax)
+        vmovdqu A_3_3, 6*8(%rax)
+        vmovdqu A_4_0, 8*8(%rax)
+        vmovdqu A_4_2, 10*8(%rax)
+        vmovq A_4_4, 12*8(%rax)
+
+        leaq (%r8, %rcx), %rax  // return value
+        movq %rdx, (%r9)        // return buffer pointer
+
+        clear_regs()
+        ret_spec_stop
+        CFI_ENDPROC()
+ELF(.size _gcry_keccak_absorb_blocks_avx512,
+    .-_gcry_keccak_absorb_blocks_avx512)
+
+#endif /* HAVE_GCC_INLINE_ASM_AVX512 */
+#endif /* __x86_64 */
diff --git a/cipher/keccak.c b/cipher/keccak.c
index f3502022..e7e42473 100644
--- a/cipher/keccak.c
+++ b/cipher/keccak.c
@@ -62,6 +62,16 @@
 #endif
 
 
+/* USE_64BIT_AVX512 indicates whether to compile with Intel AVX512 code. */
+#undef USE_64BIT_AVX512
+#if defined(USE_64BIT) && defined(__x86_64__) && \
+    defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_64BIT_AVX512 1
+#endif
+
+
 /* USE_64BIT_ARM_NEON indicates whether to enable 64-bit ARM/NEON assembly
  * code. */
 #undef USE_64BIT_ARM_NEON
@@ -81,6 +91,16 @@
 #endif /* USE_S390X_CRYPTO */
 
 
+/* x86-64 vector register assembly implementations use SystemV ABI, ABI
+ * conversion needed on Win64 through function attribute. */
+#undef ASM_FUNC_ABI
+#if defined(USE_64BIT_AVX512) && defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+#else
+# define ASM_FUNC_ABI
+#endif
+
+
 #if defined(USE_64BIT) || defined(USE_64BIT_ARM_NEON)
 # define NEED_COMMON64 1
 #endif
@@ -428,6 +448,65 @@ static const keccak_ops_t keccak_bmi2_64_ops =
 #endif /* USE_64BIT_BMI2 */
 
 
+/* 64-bit Intel AVX512 implementation. */
+#ifdef USE_64BIT_AVX512
+
+extern ASM_FUNC_ABI unsigned int
+_gcry_keccak_f1600_state_permute64_avx512(u64 *state, const u64 *rconst);
+
+extern ASM_FUNC_ABI unsigned int
+_gcry_keccak_absorb_blocks_avx512(u64 *state, const u64 *rconst,
+                                  const byte *lanes, size_t nlanes,
+                                  size_t blocklanes, const byte **new_lanes);
+
+static unsigned int
+keccak_f1600_state_permute64_avx512(KECCAK_STATE *hd)
+{
+  return _gcry_keccak_f1600_state_permute64_avx512 (
+                hd->u.state64, _gcry_keccak_round_consts_64bit);
+}
+
+static unsigned int
+keccak_absorb_lanes64_avx512(KECCAK_STATE *hd, int pos, const byte *lanes,
+                             unsigned int nlanes, int blocklanes)
+{
+  while (nlanes)
+    {
+      if (pos == 0 && blocklanes > 0 && nlanes >= (unsigned int)blocklanes)
+        {
+          nlanes = _gcry_keccak_absorb_blocks_avx512 (
+                        hd->u.state64, _gcry_keccak_round_consts_64bit,
+                        lanes, nlanes, blocklanes, &lanes);
+        }
+
+      while (nlanes)
+        {
+          hd->u.state64[pos] ^= buf_get_le64 (lanes);
+          lanes += 8;
+          nlanes--;
+
+          if (++pos == blocklanes)
+            {
+              keccak_f1600_state_permute64_avx512 (hd);
+              pos = 0;
+              break;
+            }
+        }
+    }
+
+  return 0;
+}
+
+static const keccak_ops_t keccak_avx512_64_ops =
+{
+  .permute = keccak_f1600_state_permute64_avx512,
+  .absorb = keccak_absorb_lanes64_avx512,
+  .extract = keccak_extract64,
+};
+
+#endif /* USE_64BIT_AVX512 */
+
+
 /* 64-bit ARMv7/NEON implementation. */
 #ifdef USE_64BIT_ARM_NEON
 
@@ -894,6 +973,10 @@ keccak_init (int algo, void *context, unsigned int flags)
   /* Select optimized implementation based in hw features. */
   if (0) {}
+#ifdef USE_64BIT_AVX512
+  else if (features & HWF_INTEL_AVX512)
+    ctx->ops = &keccak_avx512_64_ops;
+#endif
 #ifdef USE_64BIT_ARM_NEON
   else if (features & HWF_ARM_NEON)
     ctx->ops = &keccak_armv7_neon_64_ops;
diff --git a/configure.ac b/configure.ac
index 34ec058e..27159888 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3153,7 +3153,7 @@ if test "$found" = "1" ; then
       case "${host}" in
          x86_64-*-*)
             # Build with the assembly implementation
-            :
+            GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS keccak-amd64-avx512.lo"
          ;;
       esac
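
Note on the VPTERNLOGQ immediates used by the eor3/bcax helper macros above: the instruction evaluates an arbitrary three-input boolean function whose truth table is the 8-bit immediate, indexed by (dest_bit << 2) | (src2_bit << 1) | src3_bit. The following standalone C check (an illustration, not part of the patch) confirms that 0x96 is the three-way XOR used for the Theta column parity and that 0xb4 is a ^ (b & ~c), the bit-clear-and-XOR that replaces the ARMv8 BCAX instruction in the Chi step:

    #include <assert.h>
    #include <stdint.h>

    /* Model of VPTERNLOGQ: for each bit position, look up imm8 at the
     * 3-bit index formed from (dest, src2, src3). */
    static uint64_t
    ternlog (uint8_t imm, uint64_t a, uint64_t b, uint64_t c)
    {
      uint64_t r = 0;
      for (int i = 0; i < 64; i++)
        {
          unsigned idx = (((a >> i) & 1) << 2) | (((b >> i) & 1) << 1)
                         | ((c >> i) & 1);
          r |= (uint64_t)((imm >> idx) & 1) << i;
        }
      return r;
    }

    int
    main (void)
    {
      uint64_t a = 0x0123456789abcdefULL;
      uint64_t b = 0xfedcba9876543210ULL;
      uint64_t c = 0x0f1e2d3c4b5a6978ULL;
      /* 0x96: three-way XOR, as used by eor3/eor3_d (Theta parity). */
      assert (ternlog (0x96, a, b, c) == (a ^ b ^ c));
      /* 0xb4: a ^ (b & ~c), as used by bcax/bcax_d (Chi). */
      assert (ternlog (0xb4, a, b, c) == (a ^ (b & ~c)));
      return 0;
    }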
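
Note on the absorb dispatch: the five unrolled loops (72, 104, 136, 144 and 168 bytes per block, i.e. 9, 13, 17, 18 and 21 lanes) cover the Keccak-p[1600] rates of the SHA3/SHAKE variants, where rate = 200 - 2 * capacity_bytes. A small illustrative mapping follows; the helper name is hypothetical and not a libgcrypt API:

    #include <stdio.h>

    /* Hypothetical helper: rate in 64-bit lanes for capacity 2*mdlen bytes. */
    static int
    sha3_blocklanes (int mdlen)
    {
      return (200 - 2 * mdlen) / 8;
    }

    int
    main (void)
    {
      /* Matches the 72/104/136/144/168-byte paths in the absorb routine. */
      printf ("SHA3-512: %d lanes\n", sha3_blocklanes (64));  /*  9 ->  72 bytes */
      printf ("SHA3-384: %d lanes\n", sha3_blocklanes (48));  /* 13 -> 104 bytes */
      printf ("SHA3-256: %d lanes\n", sha3_blocklanes (32));  /* 17 -> 136 bytes */
      printf ("SHA3-224: %d lanes\n", sha3_blocklanes (28));  /* 18 -> 144 bytes */
      printf ("SHAKE128: %d lanes\n", sha3_blocklanes (16));  /* 21 -> 168 bytes */
      /* SHAKE256 shares the 136-byte rate with SHA3-256. */
      return 0;
    }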
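
Note on the calling contract relied upon by keccak_absorb_lanes64_avx512: _gcry_keccak_absorb_blocks_avx512 absorbs as many whole blocks as fit, stores the advanced input pointer through new_lanes, and returns the number of leftover lanes (the assembly computes this as blocklanes plus the underflowed remainder via `leaq (%r8, %rcx), %rax`). A hedged scalar sketch of that contract, with a placeholder permutation, purely for illustration and not the library code:

    #include <stddef.h>
    #include <stdint.h>

    typedef uint64_t u64;
    typedef unsigned char byte;

    /* Placeholder standing in for KeccakF1600; not a real API here. */
    static void
    keccak_permute (u64 state[25])
    {
      (void)state;
    }

    /* Illustrative scalar model: XOR whole blocks into the state, permute
     * after each block, return the leftover lane count and advance
     * *new_lanes past the consumed input. */
    static size_t
    absorb_blocks_model (u64 state[25], const byte *lanes, size_t nlanes,
                         size_t blocklanes, const byte **new_lanes)
    {
      while (nlanes >= blocklanes)
        {
          for (size_t i = 0; i < blocklanes; i++, lanes += 8)
            {
              u64 t = 0;
              for (int b = 0; b < 8; b++)   /* little-endian lane load */
                t |= (u64)lanes[b] << (8 * b);
              state[i] ^= t;
            }
          keccak_permute (state);
          nlanes -= blocklanes;
        }
      *new_lanes = lanes;
      return nlanes;   /* leftover lanes, always < blocklanes */
    }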