author     Jussi Kivilinna <jussi.kivilinna@iki.fi>   2022-07-21 11:14:07 +0300
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>   2022-07-25 16:11:09 +0300
commit     beaad75f4655e5316ce24f75ef172c231fd47fc1 (patch)
tree       9d61130f5670af0999601055c9a436c09142a0a5 /cipher
parent     dca0bd133dd08ec88e0b4c454cfc26c9093572a9 (diff)
download   libgcrypt-beaad75f4655e5316ce24f75ef172c231fd47fc1.tar.gz
sha3: Add x86-64 AVX512 accelerated implementation
* LICENSES: Add 'cipher/keccak-amd64-avx512.S'.
* configure.ac: Add 'keccak-amd64-avx512.lo'.
* cipher/Makefile.am: Add 'keccak-amd64-avx512.S'.
* cipher/keccak-amd64-avx512.S: New.
* cipher/keccak.c (USE_64BIT_AVX512, ASM_FUNC_ABI): New.
[USE_64BIT_AVX512] (_gcry_keccak_f1600_state_permute64_avx512)
(_gcry_keccak_absorb_blocks_avx512, keccak_f1600_state_permute64_avx512)
(keccak_absorb_lanes64_avx512, keccak_avx512_64_ops): New.
(keccak_init) [USE_64BIT_AVX512]: Enable x86-64 AVX512 implementation
if supported by HW features.
--

Benchmark on Intel Core i3-1115G4 (tigerlake):

Before (BMI2 instructions):
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 SHA3-224       |      1.77 ns/B     540.3 MiB/s      7.22 c/B      4088
 SHA3-256       |      1.86 ns/B     514.0 MiB/s      7.59 c/B      4089
 SHA3-384       |      2.43 ns/B     393.1 MiB/s      9.92 c/B      4089
 SHA3-512       |      3.49 ns/B     273.2 MiB/s     14.27 c/B      4088
 SHAKE128       |      1.52 ns/B     629.1 MiB/s      6.20 c/B      4089
 SHAKE256       |      1.86 ns/B     511.6 MiB/s      7.62 c/B      4089

After (~33% faster):
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 SHA3-224       |      1.32 ns/B     721.8 MiB/s      5.40 c/B      4089
 SHA3-256       |      1.40 ns/B     681.7 MiB/s      5.72 c/B      4089
 SHA3-384       |      1.83 ns/B     522.5 MiB/s      7.46 c/B      4089
 SHA3-512       |      2.63 ns/B     362.1 MiB/s     10.77 c/B      4088
 SHAKE128       |      1.13 ns/B     840.4 MiB/s      4.64 c/B      4089
 SHAKE256       |      1.40 ns/B     682.1 MiB/s      5.72 c/B      4089

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
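As a quick sanity check on the table above: cycles/byte is simply nanosecs/byte multiplied by the reported clock in GHz, and the "~33% faster" headline is the ratio of the before/after cycle counts. A minimal sketch in C, with the SHA3-256 figures copied from the rows above (illustrative only, not part of the patch):

#include <stdio.h>

int main (void)
{
  /* SHA3-256 rows from the benchmark above. */
  double before_ns_per_byte = 1.86;   /* BMI2 path */
  double after_ns_per_byte  = 1.40;   /* AVX512 path */
  double ghz = 4.089;                 /* "auto Mhz" column / 1000 */

  double before_cpb = before_ns_per_byte * ghz;  /* ~7.6 c/B (table: 7.59) */
  double after_cpb  = after_ns_per_byte  * ghz;  /* ~5.7 c/B (table: 5.72) */

  printf ("before %.2f c/B, after %.2f c/B, ~%.0f%% faster\n",
          before_cpb, after_cpb, (before_cpb / after_cpb - 1.0) * 100.0);
  return 0;
}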
Diffstat (limited to 'cipher')
-rw-r--r--   cipher/Makefile.am             |   3
-rw-r--r--   cipher/keccak-amd64-avx512.S   | 583
-rw-r--r--   cipher/keccak.c                |  83
3 files changed, 668 insertions(+), 1 deletion(-)
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 3d95a794..c33d0754 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -134,7 +134,8 @@ EXTRA_libcipher_la_SOURCES = \
sha512-armv7-neon.S sha512-arm.S \
sha512-ppc.c sha512-ssse3-i386.c \
sm3.c sm3-avx-bmi2-amd64.S sm3-aarch64.S sm3-armv8-aarch64-ce.S \
- keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
+ keccak.c keccak_permute_32.h keccak_permute_64.h \
+ keccak-armv7-neon.S keccak-amd64-avx512.S \
stribog.c \
tiger.c \
whirlpool.c whirlpool-sse2-amd64.S \
diff --git a/cipher/keccak-amd64-avx512.S b/cipher/keccak-amd64-avx512.S
new file mode 100644
index 00000000..f44e0285
--- /dev/null
+++ b/cipher/keccak-amd64-avx512.S
@@ -0,0 +1,583 @@
+/* keccak-amd64-avx512.S - x86-64 AVX512 implementation of Keccak
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * ---
+ *
+ * Core function `KeccakF1600_ce` based on ARMv8-CE KeccakF1600 implementation
+ * by Andy Polyakov from CRYPTOGAMS distribution `arm/keccak1600-armv8.pl`.
+ * `KeccakF1600_ce` was ported to x86-64 AVX512 and converted to use GCC
+ * preprocessed assembly and fitted with new absorb function optimized for
+ * x86-64. SHA3-256 performance on Intel tigerlake, 5.72 cpB.
+ *
+ * Original copyright license follows:
+ *
+ * Copyright (c) 2006, CRYPTOGAMS by <appro@openssl.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain copyright notices,
+ * this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * * Neither the name of the CRYPTOGAMS nor the names of its
+ * copyright holder and contributors may be used to endorse or
+ * promote products derived from this software without specific
+ * prior written permission.
+ *
+ * ALTERNATIVELY, provided that this notice is retained in full, this
+ * product may be distributed under the terms of the GNU General Public
+ * License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+ * those given above.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#include "asm-common-amd64.h"
+
+.text
+
+/* Register macros. */
+#define A_0_0 %xmm31
+#define A_0_1 %xmm30
+#define A_0_2 %xmm29
+#define A_0_3 %xmm28
+#define A_0_4 %xmm27
+#define A_1_0 %xmm26
+#define A_1_1 %xmm25
+#define A_1_2 %xmm24
+#define A_1_3 %xmm23
+#define A_1_4 %xmm22
+#define A_2_0 %xmm21
+#define A_2_1 %xmm20
+#define A_2_2 %xmm19
+#define A_2_3 %xmm18
+#define A_2_4 %xmm17
+#define A_3_0 %xmm16
+#define A_3_1 %xmm15
+#define A_3_2 %xmm14
+#define A_3_3 %xmm13
+#define A_3_4 %xmm12
+#define A_4_0 %xmm11
+#define A_4_1 %xmm10
+#define A_4_2 %xmm9
+#define A_4_3 %xmm8
+#define A_4_4 %xmm7
+
+#define C_0 %xmm6
+#define C_1 %xmm5
+#define C_2 %xmm4
+#define C_3 %xmm3
+#define C_4 %xmm2
+#define C_5 %xmm1
+#define C_6 %xmm0
+
+#define D_0 C_4
+#define D_1 C_5
+#define D_2 C_6
+#define D_3 C_2
+#define D_4 C_3
+
+/* Helper macros for ARMv8-CE to x86-64/AVX512 conversion. */
+#define eor3_d(dst_s1, s2, s3) \
+ vpternlogq $0x96, s3, s2, dst_s1;
+
+#define eor3(dst, s1, s2, s3) \
+ vmovdqa s1, dst; \
+ eor3_d(dst, s2, s3);
+
+#define rax1_c(dst, s1, s2_rol1) \
+ vprolq $1, s2_rol1, dst; \
+ vpxor s1, dst, dst;
+
+#define rax1_t(dst_s1, s2_rol1, tmp) \
+ vprolq $1, s2_rol1, tmp; \
+ vpxor tmp, dst_s1, dst_s1;
+
+#define rax1_s(dst_s1, s2_rol1) \
+ vprolq $1, s2_rol1, s2_rol1; \
+ vpxor s2_rol1, dst_s1, dst_s1;
+
+#define xar(dst, s1, s2, rol) \
+ vpxorq s2, s1, dst; \
+ vprolq $(rol), dst, dst;
+
+#define xar_x(dst, s1, s2, rol) \
+ vpxor s2, s1, dst; \
+ vprolq $(rol), dst, dst;
+
+#define bcax_d(dst_s1, s2, s3) \
+ vpternlogq $0xb4, s3, s2, dst_s1;
+
+#define bcax(dst, s1, s2, s3) \
+ vmovdqa64 s1, dst; \
+ bcax_d(dst, s2, s3);
+
+#define bcax_x(dst, s1, s2, s3) \
+ vmovdqa s1, dst; \
+ bcax_d(dst, s2, s3);
+
+#define eor(dst, s1, s2) \
+ vpxorq s2, s1, dst;
+
+/* Misc helper macros. */
+#define clear_avx512_4regs(a, b, c, d) \
+ eor(a, a, a); vmovdqa64 a, b; vmovdqa64 a, c; vmovdqa64 a, d;
+
+#define clear_regs() \
+ vzeroall; /* xmm0-xmm15 */ \
+ clear_avx512_4regs(%xmm16, %xmm17, %xmm18, %xmm19); \
+ clear_avx512_4regs(%xmm20, %xmm21, %xmm22, %xmm23); \
+ clear_avx512_4regs(%xmm24, %xmm25, %xmm26, %xmm27); \
+ clear_avx512_4regs(%xmm28, %xmm29, %xmm30, %xmm31);
+
+ELF(.type KeccakF1600_ce,@function)
+.align 64, 0xcc
+KeccakF1600_ce:
+.Loop_ce:
+ CFI_STARTPROC()
+
+ ////////////////////////////////////////////////// Theta
+ eor3( C_0, A_4_0, A_3_0, A_2_0)
+ eor3( C_1, A_4_1, A_3_1, A_2_1)
+ eor3( C_3, A_4_3, A_3_3, A_2_3)
+ eor3( C_2, A_4_2, A_3_2, A_2_2)
+ eor3( C_4, A_4_4, A_3_4, A_2_4)
+ eor3_d( C_0, A_1_0, A_0_0)
+ eor3_d( C_1, A_1_1, A_0_1)
+ eor3_d( C_3, A_1_3, A_0_3)
+ eor3_d( C_2, A_1_2, A_0_2)
+ eor3_d( C_4, A_1_4, A_0_4)
+
+ rax1_c( C_5, C_0, C_2) // D[1]
+ rax1_t( C_2, C_4, C_6) // D[3]
+ rax1_c( C_6, C_1, C_3) // D[2]
+ rax1_s( C_3, C_0) // D[4]
+ rax1_s( C_4, C_1) // D[0]
+
+ ////////////////////////////////////////////////// Theta+Rho+Pi
+ xar( C_0, A_0_1, D_1, 1) // C[0]=A[2][0]
+
+ xar( A_0_1, A_1_1, D_1, 44)
+ xar( A_1_1, A_1_4, D_4, 20)
+ xar( A_1_4, A_4_2, D_2, 61)
+ xar( A_4_2, A_2_4, D_4, 39)
+ xar( A_2_4, A_4_0, D_0, 18)
+
+ xar( C_1, A_0_2, D_2, 62) // C[1]=A[4][0]
+
+ xar( A_0_2, A_2_2, D_2, 43)
+ xar( A_2_2, A_2_3, D_3, 25)
+ xar( A_2_3, A_3_4, D_4, 8)
+ xar_x( A_3_4, A_4_3, D_3, 56)
+ xar( A_4_3, A_3_0, D_0, 41)
+
+ xar( A_3_0, A_0_4, D_4, 27)
+
+ xar_x( D_4, A_4_4, D_4, 14) // D[4]=A[0][4]
+ xar_x( A_4_4, A_4_1, D_1, 2)
+ xar( A_1_3, A_1_3, D_3, 55) // A[1][3]=A[4][1]
+ xar( A_0_4, A_3_1, D_1, 45) // A[0][4]=A[1][3]
+ xar( A_3_1, A_1_0, D_0, 36)
+
+ xar( A_1_0, A_0_3, D_3, 28)
+
+ eor( A_0_0, A_0_0, D_0)
+
+ xar_x( D_3, A_3_3, D_3, 21) // D[3]=A[0][3]
+ xar( A_0_3, A_3_2, D_2, 15) // A[0][3]=A[3][3]
+ xar( D_1, A_2_1, D_1, 10) // D[1]=A[3][2]
+ xar( D_2, A_1_2, D_2, 6) // D[2]=A[2][1]
+ xar( D_0, A_2_0, D_0, 3) // D[0]=A[1][2]
+
+ ////////////////////////////////////////////////// Chi+Iota
+ bcax_x( A_4_0, C_1, A_4_2, A_1_3) // A[1][3]=A[4][1]
+ bcax( A_4_1, A_1_3, A_4_3, A_4_2) // A[1][3]=A[4][1]
+ bcax_d( A_4_2, A_4_4, A_4_3)
+ bcax_d( A_4_3, C_1, A_4_4)
+ bcax_d( A_4_4, A_1_3, C_1) // A[1][3]=A[4][1]
+
+ bcax_x( A_3_2, D_1, A_3_4, A_0_3) // A[0][3]=A[3][3]
+ bcax( A_3_3, A_0_3, A_3_0, A_3_4) // A[0][3]=A[3][3]
+ bcax_d( A_3_4, A_3_1, A_3_0)
+ bcax_d( A_3_0, D_1, A_3_1)
+ bcax_d( A_3_1, A_0_3, D_1) // A[0][3]=A[3][3]
+
+ bcax( A_2_0, C_0, A_2_2, D_2)
+ bcax( A_2_1, D_2, A_2_3, A_2_2)
+ bcax_d( A_2_2, A_2_4, A_2_3)
+ bcax_d( A_2_3, C_0, A_2_4)
+ bcax_d( A_2_4, D_2, C_0)
+
+ bcax( A_1_2, D_0, A_1_4, A_0_4) // A[0][4]=A[1][3]
+ bcax( A_1_3, A_0_4, A_1_0, A_1_4) // A[0][4]=A[1][3]
+ bcax_d( A_1_4, A_1_1, A_1_0)
+ bcax_d( A_1_0, D_0, A_1_1)
+ bcax_d( A_1_1, A_0_4, D_0) // A[0][4]=A[1][3]
+
+ bcax( A_0_3, D_3, A_0_0, D_4)
+ bcax( A_0_4, D_4, A_0_1, A_0_0)
+ bcax_d( A_0_0, A_0_2, A_0_1)
+ bcax_d( A_0_1, D_3, A_0_2)
+ bcax_d( A_0_2, D_4, D_3)
+ eor( A_0_0, A_0_0, (%r10))
+
+ cmpq %r10, %r11
+ je .Lend_ce
+
+ addq $8, %r10
+ jmp .Loop_ce
+
+.align 64, 0xcc
+.Lend_ce:
+ ret_spec_stop
+ CFI_ENDPROC()
+ELF(.size KeccakF1600_ce,.-KeccakF1600_ce)
+
+.globl _gcry_keccak_f1600_state_permute64_avx512
+ELF(.type _gcry_keccak_f1600_state_permute64_avx512,@function)
+.align 64, 0xcc
+_gcry_keccak_f1600_state_permute64_avx512:
+ /* input:
+ * %rdi: state
+ * %rsi: round constants
+ */
+ CFI_STARTPROC()
+
+ leaq 12*8(%rdi), %rax
+ leaq (24-1)*8(%rsi), %r11
+
+ vmovdqu64 0*8(%rdi), A_0_0
+ vmovdqu64 1*8(%rdi), A_0_1
+ vmovdqu64 2*8(%rdi), A_0_2
+ vmovdqu64 3*8(%rdi), A_0_3
+ vmovdqu64 4*8(%rdi), A_0_4
+ vmovdqu64 5*8(%rdi), A_1_0
+ vmovdqu64 6*8(%rdi), A_1_1
+ vmovdqu64 7*8(%rdi), A_1_2
+ vmovdqu64 8*8(%rdi), A_1_3
+ vmovdqu64 9*8(%rdi), A_1_4
+ vmovdqu64 10*8(%rdi), A_2_0
+ vmovdqu64 11*8(%rdi), A_2_1
+ vmovdqu64 0*8(%rax), A_2_2
+ vmovdqu64 1*8(%rax), A_2_3
+ vmovdqu64 2*8(%rax), A_2_4
+ vmovdqu64 3*8(%rax), A_3_0
+ vmovdqu 4*8(%rax), A_3_1
+ vmovdqu 5*8(%rax), A_3_2
+ vmovdqu 6*8(%rax), A_3_3
+ vmovdqu 7*8(%rax), A_3_4
+ vmovdqu 8*8(%rax), A_4_0
+ vmovdqu 9*8(%rax), A_4_1
+ vmovdqu 10*8(%rax), A_4_2
+ vmovdqu 11*8(%rax), A_4_3
+ vmovq 12*8(%rax), A_4_4
+
+ movq %rsi, %r10
+ call KeccakF1600_ce
+
+ vpunpcklqdq A_0_1, A_0_0, A_0_0
+ vpunpcklqdq A_0_3, A_0_2, A_0_2
+ vpunpcklqdq A_1_0, A_0_4, A_0_4
+ vpunpcklqdq A_1_2, A_1_1, A_1_1
+ vpunpcklqdq A_1_4, A_1_3, A_1_3
+ vpunpcklqdq A_2_1, A_2_0, A_2_0
+ vpunpcklqdq A_2_3, A_2_2, A_2_2
+ vpunpcklqdq A_3_0, A_2_4, A_2_4
+ vpunpcklqdq A_3_2, A_3_1, A_3_1
+ vpunpcklqdq A_3_4, A_3_3, A_3_3
+ vpunpcklqdq A_4_1, A_4_0, A_4_0
+ vpunpcklqdq A_4_3, A_4_2, A_4_2
+ vmovdqu64 A_0_0, 0*8(%rdi)
+ vmovdqu64 A_0_2, 2*8(%rdi)
+ vmovdqu64 A_0_4, 4*8(%rdi)
+ vmovdqu64 A_1_1, 6*8(%rdi)
+ vmovdqu64 A_1_3, 8*8(%rdi)
+ vmovdqu64 A_2_0, 10*8(%rdi)
+ vmovdqu64 A_2_2, 0*8(%rax)
+ vmovdqu64 A_2_4, 2*8(%rax)
+ vmovdqu A_3_1, 4*8(%rax)
+ vmovdqu A_3_3, 6*8(%rax)
+ vmovdqu A_4_0, 8*8(%rax)
+ vmovdqu A_4_2, 10*8(%rax)
+ vmovq A_4_4, 12*8(%rax)
+
+ xorl %eax, %eax
+
+ clear_regs()
+ ret_spec_stop
+ CFI_ENDPROC()
+ELF(.size _gcry_keccak_f1600_state_permute64_avx512,
+ .-_gcry_keccak_f1600_state_permute64_avx512)
+
+.globl _gcry_keccak_absorb_blocks_avx512
+ELF(.type _gcry_keccak_absorb_blocks_avx512,@function)
+.align 64, 0xcc
+_gcry_keccak_absorb_blocks_avx512:
+ /* input:
+ * %rdi: state
+ * %rsi: round constants
+ * %rdx: lanes
+ * %rcx: nlanes
+ * %r8 : blocklanes
+ * %r9 : lanes output pointer
+ */
+ CFI_STARTPROC()
+
+ leaq 12*8(%rdi), %rax
+ leaq (24-1)*8(%rsi), %r11
+
+ vmovdqu64 0*8(%rdi), A_0_0
+ vmovdqu64 1*8(%rdi), A_0_1
+ vmovdqu64 2*8(%rdi), A_0_2
+ vmovdqu64 3*8(%rdi), A_0_3
+ vmovdqu64 4*8(%rdi), A_0_4
+ vmovdqu64 5*8(%rdi), A_1_0
+ vmovdqu64 6*8(%rdi), A_1_1
+ vmovdqu64 7*8(%rdi), A_1_2
+ vmovdqu64 8*8(%rdi), A_1_3
+ vmovdqu64 9*8(%rdi), A_1_4
+ vmovdqu64 10*8(%rdi), A_2_0
+ vmovdqu64 11*8(%rdi), A_2_1
+ vmovdqu64 0*8(%rax), A_2_2
+ vmovdqu64 1*8(%rax), A_2_3
+ vmovdqu64 2*8(%rax), A_2_4
+ vmovdqu64 3*8(%rax), A_3_0
+ vmovdqu 4*8(%rax), A_3_1
+ vmovdqu 5*8(%rax), A_3_2
+ vmovdqu 6*8(%rax), A_3_3
+ vmovdqu 7*8(%rax), A_3_4
+ vmovdqu 8*8(%rax), A_4_0
+ vmovdqu 9*8(%rax), A_4_1
+ vmovdqu 10*8(%rax), A_4_2
+ vmovdqu 11*8(%rax), A_4_3
+ vmovq 12*8(%rax), A_4_4
+
+ cmpq $(104 >> 3), %r8
+ jb .Loop_absorb_72_ce
+ je .Loop_absorb_104_ce
+ cmpq $(144 >> 3), %r8
+ jb .Loop_absorb_136_ce
+ je .Loop_absorb_144_ce
+ jmp .Loop_absorb_168_ce
+
+.align 64, 0xcc
+.Loop_absorb_168_ce:
+ subq %r8, %rcx // len - bsz
+ jb .Labsorbed_ce
+
+ vpxorq 0*8(%rdx), A_0_0, A_0_0
+ vpxorq 1*8(%rdx), A_0_1, A_0_1
+ vpxorq 2*8(%rdx), A_0_2, A_0_2
+ vpxorq 3*8(%rdx), A_0_3, A_0_3
+ vpxorq 4*8(%rdx), A_0_4, A_0_4
+ vpxorq 5*8(%rdx), A_1_0, A_1_0
+ vpxorq 6*8(%rdx), A_1_1, A_1_1
+ vpxorq 7*8(%rdx), A_1_2, A_1_2
+ vpxorq 8*8(%rdx), A_1_3, A_1_3
+ vpxorq 9*8(%rdx), A_1_4, A_1_4
+ vpxorq 10*8(%rdx), A_2_0, A_2_0
+ vpxorq 11*8(%rdx), A_2_1, A_2_1
+ vpxorq 12*8(%rdx), A_2_2, A_2_2
+ vpxorq 13*8(%rdx), A_2_3, A_2_3
+ vpxorq 14*8(%rdx), A_2_4, A_2_4
+ vpxorq 15*8(%rdx), A_3_0, A_3_0
+ vpxor 16*8(%rdx), A_3_1, A_3_1
+ vpxor 17*8(%rdx), A_3_2, A_3_2
+ vpxor 18*8(%rdx), A_3_3, A_3_3
+ vpxor 19*8(%rdx), A_3_4, A_3_4
+ vmovq 20*8(%rdx), C_0
+ leaq 21*8(%rdx), %rdx
+ vpxorq C_0, A_4_0, A_4_0
+
+ movq %rsi, %r10
+ call KeccakF1600_ce
+
+ jmp .Loop_absorb_168_ce
+
+.align 64, 0xcc
+.Loop_absorb_144_ce:
+ subq %r8, %rcx // len - bsz
+ jb .Labsorbed_ce
+
+ vpxorq 0*8(%rdx), A_0_0, A_0_0
+ vpxorq 1*8(%rdx), A_0_1, A_0_1
+ vpxorq 2*8(%rdx), A_0_2, A_0_2
+ vpxorq 3*8(%rdx), A_0_3, A_0_3
+ vpxorq 4*8(%rdx), A_0_4, A_0_4
+ vpxorq 5*8(%rdx), A_1_0, A_1_0
+ vpxorq 6*8(%rdx), A_1_1, A_1_1
+ vpxorq 7*8(%rdx), A_1_2, A_1_2
+ vpxorq 8*8(%rdx), A_1_3, A_1_3
+ vpxorq 9*8(%rdx), A_1_4, A_1_4
+ vpxorq 10*8(%rdx), A_2_0, A_2_0
+ vpxorq 11*8(%rdx), A_2_1, A_2_1
+ vpxorq 12*8(%rdx), A_2_2, A_2_2
+ vpxorq 13*8(%rdx), A_2_3, A_2_3
+ vpxorq 14*8(%rdx), A_2_4, A_2_4
+ vpxorq 15*8(%rdx), A_3_0, A_3_0
+ vpxor 16*8(%rdx), A_3_1, A_3_1
+ vmovq 17*8(%rdx), C_0
+ leaq 18*8(%rdx), %rdx
+ vpxor C_0, A_3_2, A_3_2
+
+ movq %rsi, %r10
+ call KeccakF1600_ce
+
+ jmp .Loop_absorb_144_ce
+
+.align 64, 0xcc
+.Loop_absorb_136_ce:
+ subq %r8, %rcx // len - bsz
+ jb .Labsorbed_ce
+
+ vpxorq 0*8(%rdx), A_0_0, A_0_0
+ vpxorq 1*8(%rdx), A_0_1, A_0_1
+ vpxorq 2*8(%rdx), A_0_2, A_0_2
+ vpxorq 3*8(%rdx), A_0_3, A_0_3
+ vpxorq 4*8(%rdx), A_0_4, A_0_4
+ vpxorq 5*8(%rdx), A_1_0, A_1_0
+ vpxorq 6*8(%rdx), A_1_1, A_1_1
+ vpxorq 7*8(%rdx), A_1_2, A_1_2
+ vpxorq 8*8(%rdx), A_1_3, A_1_3
+ vpxorq 9*8(%rdx), A_1_4, A_1_4
+ vpxorq 10*8(%rdx), A_2_0, A_2_0
+ vpxorq 11*8(%rdx), A_2_1, A_2_1
+ vpxorq 12*8(%rdx), A_2_2, A_2_2
+ vpxorq 13*8(%rdx), A_2_3, A_2_3
+ vpxorq 14*8(%rdx), A_2_4, A_2_4
+ vpxorq 15*8(%rdx), A_3_0, A_3_0
+ vmovq 16*8(%rdx), C_0
+ leaq 17*8(%rdx), %rdx
+ vpxor C_0, A_3_1, A_3_1
+
+ movq %rsi, %r10
+ call KeccakF1600_ce
+
+ jmp .Loop_absorb_136_ce
+
+.align 64, 0xcc
+.Loop_absorb_104_ce:
+ subq %r8, %rcx // len - bsz
+ jb .Labsorbed_ce
+
+ vpxorq 0*8(%rdx), A_0_0, A_0_0
+ vpxorq 1*8(%rdx), A_0_1, A_0_1
+ vpxorq 2*8(%rdx), A_0_2, A_0_2
+ vpxorq 3*8(%rdx), A_0_3, A_0_3
+ vpxorq 4*8(%rdx), A_0_4, A_0_4
+ vpxorq 5*8(%rdx), A_1_0, A_1_0
+ vpxorq 6*8(%rdx), A_1_1, A_1_1
+ vpxorq 7*8(%rdx), A_1_2, A_1_2
+ vpxorq 8*8(%rdx), A_1_3, A_1_3
+ vpxorq 9*8(%rdx), A_1_4, A_1_4
+ vpxorq 10*8(%rdx), A_2_0, A_2_0
+ vpxorq 11*8(%rdx), A_2_1, A_2_1
+ vmovq 12*8(%rdx), C_0
+ leaq 13*8(%rdx), %rdx
+ vpxorq C_0, A_2_2, A_2_2
+
+ movq %rsi, %r10
+ call KeccakF1600_ce
+
+ jmp .Loop_absorb_104_ce
+
+.align 64, 0xcc
+.Loop_absorb_72_ce:
+ subq %r8, %rcx // len - bsz
+ jb .Labsorbed_ce
+
+ vpxorq 0*8(%rdx), A_0_0, A_0_0
+ vpxorq 1*8(%rdx), A_0_1, A_0_1
+ vpxorq 2*8(%rdx), A_0_2, A_0_2
+ vpxorq 3*8(%rdx), A_0_3, A_0_3
+ vpxorq 4*8(%rdx), A_0_4, A_0_4
+ vpxorq 5*8(%rdx), A_1_0, A_1_0
+ vpxorq 6*8(%rdx), A_1_1, A_1_1
+ vpxorq 7*8(%rdx), A_1_2, A_1_2
+ vmovq 8*8(%rdx), C_0
+ leaq 9*8(%rdx), %rdx
+ vpxorq C_0, A_1_3, A_1_3
+
+ movq %rsi, %r10
+ call KeccakF1600_ce
+
+ jmp .Loop_absorb_72_ce
+
+.align 64, 0xcc
+.Labsorbed_ce:
+ vpunpcklqdq A_0_1, A_0_0, A_0_0
+ vpunpcklqdq A_0_3, A_0_2, A_0_2
+ vpunpcklqdq A_1_0, A_0_4, A_0_4
+ vpunpcklqdq A_1_2, A_1_1, A_1_1
+ vpunpcklqdq A_1_4, A_1_3, A_1_3
+ vpunpcklqdq A_2_1, A_2_0, A_2_0
+ vpunpcklqdq A_2_3, A_2_2, A_2_2
+ vpunpcklqdq A_3_0, A_2_4, A_2_4
+ vpunpcklqdq A_3_2, A_3_1, A_3_1
+ vpunpcklqdq A_3_4, A_3_3, A_3_3
+ vpunpcklqdq A_4_1, A_4_0, A_4_0
+ vpunpcklqdq A_4_3, A_4_2, A_4_2
+ vmovdqu64 A_0_0, 0*8(%rdi)
+ vmovdqu64 A_0_2, 2*8(%rdi)
+ vmovdqu64 A_0_4, 4*8(%rdi)
+ vmovdqu64 A_1_1, 6*8(%rdi)
+ vmovdqu64 A_1_3, 8*8(%rdi)
+ vmovdqu64 A_2_0, 10*8(%rdi)
+ vmovdqu64 A_2_2, 0*8(%rax)
+ vmovdqu64 A_2_4, 2*8(%rax)
+ vmovdqu A_3_1, 4*8(%rax)
+ vmovdqu A_3_3, 6*8(%rax)
+ vmovdqu A_4_0, 8*8(%rax)
+ vmovdqu A_4_2, 10*8(%rax)
+ vmovq A_4_4, 12*8(%rax)
+
+ leaq (%r8, %rcx), %rax // return value
+ movq %rdx, (%r9) // return buffer pointer
+
+ clear_regs()
+ ret_spec_stop
+ CFI_ENDPROC()
+ELF(.size _gcry_keccak_absorb_blocks_avx512,
+ .-_gcry_keccak_absorb_blocks_avx512)
+
+#endif /* HAVE_GCC_INLINE_ASM_AVX512 */
+#endif /* __x86_64 */
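A note on the helper macros near the top of keccak-amd64-avx512.S: the ARMv8-CE three-operand primitives are emulated with VPTERNLOGQ and VPROLQ, where immediate 0x96 is the three-input XOR used by theta (eor3), immediate 0xb4 computes a ^ (b & ~c) for chi (bcax/bcax_d), and xar is an XOR followed by a rotate for rho. A scalar C model of those truth tables, for illustration only (the function names below are stand-ins, not code from the patch):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t rol64 (uint64_t x, unsigned r)
{ return (x << (r & 63)) | (x >> ((64 - r) & 63)); }

static uint64_t eor3 (uint64_t a, uint64_t b, uint64_t c)   /* vpternlogq $0x96 */
{ return a ^ b ^ c; }

static uint64_t bcax (uint64_t a, uint64_t b, uint64_t c)   /* vpternlogq $0xb4 */
{ return a ^ (b & ~c); }

static uint64_t xar (uint64_t a, uint64_t b, unsigned r)    /* vpxorq + vprolq */
{ return rol64 (a ^ b, r); }

int main (void)
{
  unsigned i;

  /* VPTERNLOGQ looks up imm8 bit ((dst << 2) | (src1 << 1) | src2) for each
   * bit position; verify that 0x96 and 0xb4 encode the formulas above. */
  for (i = 0; i < 8; i++)
    {
      unsigned d = (i >> 2) & 1, s1 = (i >> 1) & 1, s2 = i & 1;
      assert (((0x96 >> i) & 1) == (d ^ s1 ^ s2));           /* eor3 */
      assert (((0xb4 >> i) & 1) == (d ^ (s1 & (s2 ^ 1))));   /* bcax */
    }

  printf ("eor3(1,2,4)     = %llu\n", (unsigned long long)eor3 (1, 2, 4));
  printf ("bcax(0,~0,0xf0) = 0x%llx\n", (unsigned long long)bcax (0, ~0ULL, 0xf0ULL));
  printf ("xar(1,2,1)      = %llu\n", (unsigned long long)xar (1, 2, 1));
  return 0;
}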
diff --git a/cipher/keccak.c b/cipher/keccak.c
index f3502022..e7e42473 100644
--- a/cipher/keccak.c
+++ b/cipher/keccak.c
@@ -62,6 +62,16 @@
#endif
+/* USE_64BIT_AVX512 indicates whether to compile with Intel AVX512 code. */
+#undef USE_64BIT_AVX512
+#if defined(USE_64BIT) && defined(__x86_64__) && \
+ defined(HAVE_GCC_INLINE_ASM_AVX512) && \
+ (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_64BIT_AVX512 1
+#endif
+
+
/* USE_64BIT_ARM_NEON indicates whether to enable 64-bit ARM/NEON assembly
* code. */
#undef USE_64BIT_ARM_NEON
@@ -81,6 +91,16 @@
#endif /* USE_S390X_CRYPTO */
+/* x86-64 vector register assembly implementations use SystemV ABI, ABI
+ * conversion needed on Win64 through function attribute. */
+#undef ASM_FUNC_ABI
+#if defined(USE_64BIT_AVX512) && defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)
+# define ASM_FUNC_ABI __attribute__((sysv_abi))
+#else
+# define ASM_FUNC_ABI
+#endif
+
+
#if defined(USE_64BIT) || defined(USE_64BIT_ARM_NEON)
# define NEED_COMMON64 1
#endif
@@ -428,6 +448,65 @@ static const keccak_ops_t keccak_bmi2_64_ops =
#endif /* USE_64BIT_BMI2 */
+/* 64-bit Intel AVX512 implementation. */
+#ifdef USE_64BIT_AVX512
+
+extern ASM_FUNC_ABI unsigned int
+_gcry_keccak_f1600_state_permute64_avx512(u64 *state, const u64 *rconst);
+
+extern ASM_FUNC_ABI unsigned int
+_gcry_keccak_absorb_blocks_avx512(u64 *state, const u64 *rconst,
+ const byte *lanes, size_t nlanes,
+ size_t blocklanes, const byte **new_lanes);
+
+static unsigned int
+keccak_f1600_state_permute64_avx512(KECCAK_STATE *hd)
+{
+ return _gcry_keccak_f1600_state_permute64_avx512 (
+ hd->u.state64, _gcry_keccak_round_consts_64bit);
+}
+
+static unsigned int
+keccak_absorb_lanes64_avx512(KECCAK_STATE *hd, int pos, const byte *lanes,
+ unsigned int nlanes, int blocklanes)
+{
+ while (nlanes)
+ {
+ if (pos == 0 && blocklanes > 0 && nlanes >= (unsigned int)blocklanes)
+ {
+ nlanes = _gcry_keccak_absorb_blocks_avx512 (
+ hd->u.state64, _gcry_keccak_round_consts_64bit,
+ lanes, nlanes, blocklanes, &lanes);
+ }
+
+ while (nlanes)
+ {
+ hd->u.state64[pos] ^= buf_get_le64 (lanes);
+ lanes += 8;
+ nlanes--;
+
+ if (++pos == blocklanes)
+ {
+ keccak_f1600_state_permute64_avx512 (hd);
+ pos = 0;
+ break;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static const keccak_ops_t keccak_avx512_64_ops =
+{
+ .permute = keccak_f1600_state_permute64_avx512,
+ .absorb = keccak_absorb_lanes64_avx512,
+ .extract = keccak_extract64,
+};
+
+#endif /* USE_64BIT_AVX512 */
+
+
/* 64-bit ARMv7/NEON implementation. */
#ifdef USE_64BIT_ARM_NEON
@@ -894,6 +973,10 @@ keccak_init (int algo, void *context, unsigned int flags)
/* Select optimized implementation based in hw features. */
if (0) {}
+#ifdef USE_64BIT_AVX512
+ else if (features & HWF_INTEL_AVX512)
+ ctx->ops = &keccak_avx512_64_ops;
+#endif
#ifdef USE_64BIT_ARM_NEON
else if (features & HWF_ARM_NEON)
ctx->ops = &keccak_armv7_neon_64_ops;
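One more note on the new absorb path in keccak.c: keccak_absorb_lanes64_avx512 hands full blocks to _gcry_keccak_absorb_blocks_avx512, which XORs whole blocks into the state, permutes after each one, returns the number of leftover lanes and advances the lane pointer; the remaining partial block is then absorbed lane by lane in C. A rough pure-C model of that contract, for illustration only (the stub permutation and helpers below are hypothetical, not the routines the patch dispatches to):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef uint64_t u64;
typedef unsigned char byte;

/* Stand-in for the real Keccak-f[1600] permutation; the patch calls the
 * AVX512 assembly here.  A no-op is enough to show the calling contract. */
static void permute64_stub (u64 state[25])
{
  (void)state;
}

static u64 get_le64 (const byte *p)
{
  u64 v;
  memcpy (&v, p, 8);  /* assumes a little-endian host, for brevity */
  return v;
}

/* Rough model of the _gcry_keccak_absorb_blocks_avx512 contract: XOR whole
 * blocks of lanes into the state, permute after each block, then report how
 * many lanes are left and where the leftover data starts. */
static size_t
absorb_blocks_model (u64 state[25], const byte *lanes, size_t nlanes,
                     size_t blocklanes, const byte **new_lanes)
{
  while (nlanes >= blocklanes)
    {
      size_t i;

      for (i = 0; i < blocklanes; i++)
        state[i] ^= get_le64 (lanes + 8 * i);
      permute64_stub (state);

      lanes += 8 * blocklanes;
      nlanes -= blocklanes;
    }

  *new_lanes = lanes;          /* caller continues from here */
  return nlanes;               /* tail lanes absorbed lane by lane in C */
}

int main (void)
{
  u64 state[25] = { 0 };
  byte data[136 + 24] = { 0 };     /* one SHA3-256 block plus a 3-lane tail */
  const byte *tail;
  size_t left = absorb_blocks_model (state, data, sizeof(data) / 8,
                                     136 / 8, &tail);
  return (left == 3 && tail == data + 136) ? 0 : 1;
}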