/* keccak-amd64-avx512.S - x86-64 AVX512 implementation of Keccak
 *
 * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <https://www.gnu.org/licenses/>.
 *
 * ---
 *
 * Core function `KeccakF1600_ce` based on ARMv8-CE KeccakF1600 implementation
 * by Andy Polyakov from CRYPTOGAMS distribution `arm/keccak1600-armv8.pl`.
 * `KeccakF1600_ce` was ported to x86-64 AVX512 and converted to use GCC
 * preprocessed assembly and fitted with new absorb function optimized for
 * x86-64. SHA3-256 performance on Intel tigerlake, 5.72 cpB.
 *
 * Original copyright license follows:
 *
 *  Copyright (c) 2006, CRYPTOGAMS by <appro@openssl.org>
 *  All rights reserved.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *      * Redistributions of source code must retain copyright notices,
 *        this list of conditions and the following disclaimer.
 *
 *      * Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 *      * Neither the name of the CRYPTOGAMS nor the names of its
 *        copyright holder and contributors may be used to endorse or
 *        promote products derived from this software without specific
 *        prior written permission.
 *
 *  ALTERNATIVELY, provided that this notice is retained in full, this
 *  product may be distributed under the terms of the GNU General Public
 *  License (GPL), in which case the provisions of the GPL apply INSTEAD OF
 *  those given above.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef __x86_64
#include <config.h>
#if defined(HAVE_GCC_INLINE_ASM_AVX512) && \
   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))

#include "asm-common-amd64.h"

.text

/* Register macros. */
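/* The 25 Keccak-f[1600] state lanes A[y][x] live in the low 64 bits of
 * %xmm7..%xmm31 (the A_y_x defines below); C_0..C_6, with the D_* aliases
 * onto them, are temporaries for the round function. */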
#define A_0_0 %xmm31
#define A_0_1 %xmm30
#define A_0_2 %xmm29
#define A_0_3 %xmm28
#define A_0_4 %xmm27
#define A_1_0 %xmm26
#define A_1_1 %xmm25
#define A_1_2 %xmm24
#define A_1_3 %xmm23
#define A_1_4 %xmm22
#define A_2_0 %xmm21
#define A_2_1 %xmm20
#define A_2_2 %xmm19
#define A_2_3 %xmm18
#define A_2_4 %xmm17
#define A_3_0 %xmm16
#define A_3_1 %xmm15
#define A_3_2 %xmm14
#define A_3_3 %xmm13
#define A_3_4 %xmm12
#define A_4_0 %xmm11
#define A_4_1 %xmm10
#define A_4_2 %xmm9
#define A_4_3 %xmm8
#define A_4_4 %xmm7

#define C_0 %xmm6
#define C_1 %xmm5
#define C_2 %xmm4
#define C_3 %xmm3
#define C_4 %xmm2
#define C_5 %xmm1
#define C_6 %xmm0

#define D_0 C_4
#define D_1 C_5
#define D_2 C_6
#define D_3 C_2
#define D_4 C_3

/* Helper macros for ARMv8-CE to x86-64/AVX512 conversion. */

#define eor3_d(dst_s1, s2, s3) \
        vpternlogq $0x96, s3, s2, dst_s1;

#define eor3(dst, s1, s2, s3) \
        vmovdqa s1, dst; \
        eor3_d(dst, s2, s3);

#define rax1_c(dst, s1, s2_rol1) \
        vprolq $1, s2_rol1, dst; \
        vpxor s1, dst, dst;

#define rax1_t(dst_s1, s2_rol1, tmp) \
        vprolq $1, s2_rol1, tmp; \
        vpxor tmp, dst_s1, dst_s1;

#define rax1_s(dst_s1, s2_rol1) \
        vprolq $1, s2_rol1, s2_rol1; \
        vpxor s2_rol1, dst_s1, dst_s1;

#define xar(dst, s1, s2, rol) \
        vpxorq s2, s1, dst; \
        vprolq $(rol), dst, dst;

#define xar_x(dst, s1, s2, rol) \
        vpxor s2, s1, dst; \
        vprolq $(rol), dst, dst;

#define bcax_d(dst_s1, s2, s3) \
        vpternlogq $0xb4, s3, s2, dst_s1;

#define bcax(dst, s1, s2, s3) \
        vmovdqa64 s1, dst; \
        bcax_d(dst, s2, s3);

#define bcax_x(dst, s1, s2, s3) \
        vmovdqa s1, dst; \
        bcax_d(dst, s2, s3);

#define eor(dst, s1, s2) \
        vpxorq s2, s1, dst;

/* Misc helper macros. */

#define clear_avx512_4regs(a, b, c, d) \
        eor(a, a, a); eor(b, b, b); eor(c, c, c); eor(d, d, d);

#define clear_regs() \
        vzeroall; /* xmm0-xmm15 */ \
        clear_avx512_4regs(%ymm16, %ymm17, %ymm18, %ymm19); \
        clear_avx512_4regs(%ymm20, %ymm21, %ymm22, %ymm23); \
        clear_avx512_4regs(%ymm24, %ymm25, %ymm26, %ymm27); \
        clear_avx512_4regs(%ymm28, %ymm29, %ymm30, %ymm31);

ELF(.type KeccakF1600_ce,@function)
.align 64, 0xcc
KeccakF1600_ce:
.Loop_ce:
        CFI_STARTPROC()

        ////////////////////////////////////////////////// Theta
        eor3( C_0, A_4_0, A_3_0, A_2_0)
        eor3( C_1, A_4_1, A_3_1, A_2_1)
        eor3( C_3, A_4_3, A_3_3, A_2_3)
        eor3( C_2, A_4_2, A_3_2, A_2_2)
        eor3( C_4, A_4_4, A_3_4, A_2_4)
        eor3_d( C_0, A_1_0, A_0_0)
        eor3_d( C_1, A_1_1, A_0_1)
        eor3_d( C_3, A_1_3, A_0_3)
        eor3_d( C_2, A_1_2, A_0_2)
        eor3_d( C_4, A_1_4, A_0_4)

        rax1_c( C_5, C_0, C_2)                  // D[1]
        rax1_t( C_2, C_4, C_6)                  // D[3]
        rax1_c( C_6, C_1, C_3)                  // D[2]
        rax1_s( C_3, C_0)                       // D[4]
        rax1_s( C_4, C_1)                       // D[0]

        ////////////////////////////////////////////////// Theta+Rho+Pi
        xar( C_0, A_0_1, D_1, 1)                // C[0]=A[2][0]

        xar( A_0_1, A_1_1, D_1, 44)
        xar( A_1_1, A_1_4, D_4, 20)
        xar( A_1_4, A_4_2, D_2, 61)
        xar( A_4_2, A_2_4, D_4, 39)
        xar( A_2_4, A_4_0, D_0, 18)

        xar( C_1, A_0_2, D_2, 62)               // C[1]=A[4][0]

        xar( A_0_2, A_2_2, D_2, 43)
        xar( A_2_2, A_2_3, D_3, 25)
        xar( A_2_3, A_3_4, D_4, 8)
        xar_x( A_3_4, A_4_3, D_3, 56)
        xar( A_4_3, A_3_0, D_0, 41)

        xar( A_3_0, A_0_4, D_4, 27)

        xar_x( D_4, A_4_4, D_4, 14)             // D[4]=A[0][4]
        xar_x( A_4_4, A_4_1, D_1, 2)
        xar( A_1_3, A_1_3, D_3, 55)             // A[1][3]=A[4][1]
        xar( A_0_4, A_3_1, D_1, 45)             // A[0][4]=A[1][3]
        xar( A_3_1, A_1_0, D_0, 36)

        xar( A_1_0, A_0_3, D_3, 28)

        eor( A_0_0, A_0_0, D_0)

        xar_x( D_3, A_3_3, D_3, 21)             // D[3]=A[0][3]
        xar( A_0_3, A_3_2, D_2, 15)             // A[0][3]=A[3][3]
        xar( D_1, A_2_1, D_1, 10)               // D[1]=A[3][2]
        xar( D_2, A_1_2, D_2, 6)                // D[2]=A[2][1]
        xar( D_0, A_2_0, D_0, 3)                // D[0]=A[1][2]
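        /* Chi below is computed with vpternlogq: the bcax*() macros evaluate
         * dst = s1 ^ (s2 & ~s3) (truth table 0xb4), matching ARMv8 BCAX.
         * Iota is the final eor of the round constant at (%r10) into A[0][0];
         * %r10 advances by 8 per round until it reaches %r11. */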
        ////////////////////////////////////////////////// Chi+Iota
        bcax_x( A_4_0, C_1, A_4_2, A_1_3)       // A[1][3]=A[4][1]
        bcax( A_4_1, A_1_3, A_4_3, A_4_2)       // A[1][3]=A[4][1]
        bcax_d( A_4_2, A_4_4, A_4_3)
        bcax_d( A_4_3, C_1, A_4_4)
        bcax_d( A_4_4, A_1_3, C_1)              // A[1][3]=A[4][1]

        bcax_x( A_3_2, D_1, A_3_4, A_0_3)       // A[0][3]=A[3][3]
        bcax( A_3_3, A_0_3, A_3_0, A_3_4)       // A[0][3]=A[3][3]
        bcax_d( A_3_4, A_3_1, A_3_0)
        bcax_d( A_3_0, D_1, A_3_1)
        bcax_d( A_3_1, A_0_3, D_1)              // A[0][3]=A[3][3]

        bcax( A_2_0, C_0, A_2_2, D_2)
        bcax( A_2_1, D_2, A_2_3, A_2_2)
        bcax_d( A_2_2, A_2_4, A_2_3)
        bcax_d( A_2_3, C_0, A_2_4)
        bcax_d( A_2_4, D_2, C_0)

        bcax( A_1_2, D_0, A_1_4, A_0_4)         // A[0][4]=A[1][3]
        bcax( A_1_3, A_0_4, A_1_0, A_1_4)       // A[0][4]=A[1][3]
        bcax_d( A_1_4, A_1_1, A_1_0)
        bcax_d( A_1_0, D_0, A_1_1)
        bcax_d( A_1_1, A_0_4, D_0)              // A[0][4]=A[1][3]

        bcax( A_0_3, D_3, A_0_0, D_4)
        bcax( A_0_4, D_4, A_0_1, A_0_0)
        bcax_d( A_0_0, A_0_2, A_0_1)
        bcax_d( A_0_1, D_3, A_0_2)
        bcax_d( A_0_2, D_4, D_3)

        eor( A_0_0, A_0_0, (%r10))

        cmpq %r10, %r11
        je .Lend_ce

        addq $8, %r10
        jmp .Loop_ce

.align 64, 0xcc
.Lend_ce:
        ret_spec_stop
        CFI_ENDPROC()
ELF(.size KeccakF1600_ce,.-KeccakF1600_ce)

.globl _gcry_keccak_f1600_state_permute64_avx512
ELF(.type _gcry_keccak_f1600_state_permute64_avx512,@function)
.align 64, 0xcc
_gcry_keccak_f1600_state_permute64_avx512:
        /* input:
         *      %rdi: state
         *      %rsi: round constants
         */
        CFI_STARTPROC()
        spec_stop_avx512;

        leaq 12*8(%rdi), %rax
        leaq (24-1)*8(%rsi), %r11

        vmovdqu64 0*8(%rdi), A_0_0
        vmovdqu64 1*8(%rdi), A_0_1
        vmovdqu64 2*8(%rdi), A_0_2
        vmovdqu64 3*8(%rdi), A_0_3
        vmovdqu64 4*8(%rdi), A_0_4
        vmovdqu64 5*8(%rdi), A_1_0
        vmovdqu64 6*8(%rdi), A_1_1
        vmovdqu64 7*8(%rdi), A_1_2
        vmovdqu64 8*8(%rdi), A_1_3
        vmovdqu64 9*8(%rdi), A_1_4
        vmovdqu64 10*8(%rdi), A_2_0
        vmovdqu64 11*8(%rdi), A_2_1
        vmovdqu64 0*8(%rax), A_2_2
        vmovdqu64 1*8(%rax), A_2_3
        vmovdqu64 2*8(%rax), A_2_4
        vmovdqu64 3*8(%rax), A_3_0
        vmovdqu 4*8(%rax), A_3_1
        vmovdqu 5*8(%rax), A_3_2
        vmovdqu 6*8(%rax), A_3_3
        vmovdqu 7*8(%rax), A_3_4
        vmovdqu 8*8(%rax), A_4_0
        vmovdqu 9*8(%rax), A_4_1
        vmovdqu 10*8(%rax), A_4_2
        vmovdqu 11*8(%rax), A_4_3
        vmovq 12*8(%rax), A_4_4

        movq %rsi, %r10
        call KeccakF1600_ce

        vpunpcklqdq A_0_1, A_0_0, A_0_0
        vpunpcklqdq A_0_3, A_0_2, A_0_2
        vpunpcklqdq A_1_0, A_0_4, A_0_4
        vpunpcklqdq A_1_2, A_1_1, A_1_1
        vpunpcklqdq A_1_4, A_1_3, A_1_3
        vpunpcklqdq A_2_1, A_2_0, A_2_0
        vpunpcklqdq A_2_3, A_2_2, A_2_2
        vpunpcklqdq A_3_0, A_2_4, A_2_4
        vpunpcklqdq A_3_2, A_3_1, A_3_1
        vpunpcklqdq A_3_4, A_3_3, A_3_3
        vpunpcklqdq A_4_1, A_4_0, A_4_0
        vpunpcklqdq A_4_3, A_4_2, A_4_2

        vmovdqu64 A_0_0, 0*8(%rdi)
        vmovdqu64 A_0_2, 2*8(%rdi)
        vmovdqu64 A_0_4, 4*8(%rdi)
        vmovdqu64 A_1_1, 6*8(%rdi)
        vmovdqu64 A_1_3, 8*8(%rdi)
        vmovdqu64 A_2_0, 10*8(%rdi)
        vmovdqu64 A_2_2, 0*8(%rax)
        vmovdqu64 A_2_4, 2*8(%rax)
        vmovdqu A_3_1, 4*8(%rax)
        vmovdqu A_3_3, 6*8(%rax)
        vmovdqu A_4_0, 8*8(%rax)
        vmovdqu A_4_2, 10*8(%rax)
        vmovq A_4_4, 12*8(%rax)

        xorl %eax, %eax

        clear_regs()
        ret_spec_stop
        CFI_ENDPROC()
ELF(.size _gcry_keccak_f1600_state_permute64_avx512, .-_gcry_keccak_f1600_state_permute64_avx512)

.globl _gcry_keccak_absorb_blocks_avx512
ELF(.type _gcry_keccak_absorb_blocks_avx512,@function)
.align 64, 0xcc
_gcry_keccak_absorb_blocks_avx512:
        /* input:
         *      %rdi: state
         *      %rsi: round constants
         *      %rdx: lanes
         *      %rcx: nlanes
         *      %r8 : blocklanes
         *      %r9 : lanes output pointer
         */
        CFI_STARTPROC()
        spec_stop_avx512;

        leaq 12*8(%rdi), %rax
        leaq (24-1)*8(%rsi), %r11

        vmovdqu64 0*8(%rdi), A_0_0
        vmovdqu64 1*8(%rdi), A_0_1
        vmovdqu64 2*8(%rdi), A_0_2
        vmovdqu64 3*8(%rdi), A_0_3
        vmovdqu64 4*8(%rdi), A_0_4
        vmovdqu64 5*8(%rdi), A_1_0
        vmovdqu64 6*8(%rdi), A_1_1
        vmovdqu64 7*8(%rdi), A_1_2
        vmovdqu64 8*8(%rdi), A_1_3
        vmovdqu64 9*8(%rdi), A_1_4
        vmovdqu64 10*8(%rdi), A_2_0
        vmovdqu64 11*8(%rdi), A_2_1
        vmovdqu64 0*8(%rax), A_2_2
        vmovdqu64 1*8(%rax), A_2_3
        vmovdqu64 2*8(%rax), A_2_4
        vmovdqu64 3*8(%rax), A_3_0
        vmovdqu 4*8(%rax), A_3_1
        vmovdqu 5*8(%rax), A_3_2
        vmovdqu 6*8(%rax), A_3_3
        vmovdqu 7*8(%rax), A_3_4
        vmovdqu 8*8(%rax), A_4_0
        vmovdqu 9*8(%rax), A_4_1
        vmovdqu 10*8(%rax), A_4_2
        vmovdqu 11*8(%rax), A_4_3
        vmovq 12*8(%rax), A_4_4

        cmpq $(104 >> 3), %r8
        jb .Loop_absorb_72_ce
        je .Loop_absorb_104_ce
        cmpq $(144 >> 3), %r8
        jb .Loop_absorb_136_ce
        je .Loop_absorb_144_ce
        jmp .Loop_absorb_168_ce

.align 64, 0xcc
.Loop_absorb_168_ce:
        subq %r8, %rcx // len - bsz
        jb .Labsorbed_ce

        vpxorq 0*8(%rdx), A_0_0, A_0_0
        vpxorq 1*8(%rdx), A_0_1, A_0_1
        vpxorq 2*8(%rdx), A_0_2, A_0_2
        vpxorq 3*8(%rdx), A_0_3, A_0_3
        vpxorq 4*8(%rdx), A_0_4, A_0_4
        vpxorq 5*8(%rdx), A_1_0, A_1_0
        vpxorq 6*8(%rdx), A_1_1, A_1_1
        vpxorq 7*8(%rdx), A_1_2, A_1_2
        vpxorq 8*8(%rdx), A_1_3, A_1_3
        vpxorq 9*8(%rdx), A_1_4, A_1_4
        vpxorq 10*8(%rdx), A_2_0, A_2_0
        vpxorq 11*8(%rdx), A_2_1, A_2_1
        vpxorq 12*8(%rdx), A_2_2, A_2_2
        vpxorq 13*8(%rdx), A_2_3, A_2_3
        vpxorq 14*8(%rdx), A_2_4, A_2_4
        vpxorq 15*8(%rdx), A_3_0, A_3_0
        vpxor 16*8(%rdx), A_3_1, A_3_1
        vpxor 17*8(%rdx), A_3_2, A_3_2
        vpxor 18*8(%rdx), A_3_3, A_3_3
        vpxor 19*8(%rdx), A_3_4, A_3_4
        vmovq 20*8(%rdx), C_0
        leaq 21*8(%rdx), %rdx
        vpxorq C_0, A_4_0, A_4_0

        movq %rsi, %r10
        call KeccakF1600_ce

        jmp .Loop_absorb_168_ce

.align 64, 0xcc
.Loop_absorb_144_ce:
        subq %r8, %rcx // len - bsz
        jb .Labsorbed_ce

        vpxorq 0*8(%rdx), A_0_0, A_0_0
        vpxorq 1*8(%rdx), A_0_1, A_0_1
        vpxorq 2*8(%rdx), A_0_2, A_0_2
        vpxorq 3*8(%rdx), A_0_3, A_0_3
        vpxorq 4*8(%rdx), A_0_4, A_0_4
        vpxorq 5*8(%rdx), A_1_0, A_1_0
        vpxorq 6*8(%rdx), A_1_1, A_1_1
        vpxorq 7*8(%rdx), A_1_2, A_1_2
        vpxorq 8*8(%rdx), A_1_3, A_1_3
        vpxorq 9*8(%rdx), A_1_4, A_1_4
        vpxorq 10*8(%rdx), A_2_0, A_2_0
        vpxorq 11*8(%rdx), A_2_1, A_2_1
        vpxorq 12*8(%rdx), A_2_2, A_2_2
        vpxorq 13*8(%rdx), A_2_3, A_2_3
        vpxorq 14*8(%rdx), A_2_4, A_2_4
        vpxorq 15*8(%rdx), A_3_0, A_3_0
        vpxor 16*8(%rdx), A_3_1, A_3_1
        vmovq 17*8(%rdx), C_0
        leaq 18*8(%rdx), %rdx
        vpxor C_0, A_3_2, A_3_2

        movq %rsi, %r10
        call KeccakF1600_ce

        jmp .Loop_absorb_144_ce

.align 64, 0xcc
.Loop_absorb_136_ce:
        subq %r8, %rcx // len - bsz
        jb .Labsorbed_ce

        vpxorq 0*8(%rdx), A_0_0, A_0_0
        vpxorq 1*8(%rdx), A_0_1, A_0_1
        vpxorq 2*8(%rdx), A_0_2, A_0_2
        vpxorq 3*8(%rdx), A_0_3, A_0_3
        vpxorq 4*8(%rdx), A_0_4, A_0_4
        vpxorq 5*8(%rdx), A_1_0, A_1_0
        vpxorq 6*8(%rdx), A_1_1, A_1_1
        vpxorq 7*8(%rdx), A_1_2, A_1_2
        vpxorq 8*8(%rdx), A_1_3, A_1_3
        vpxorq 9*8(%rdx), A_1_4, A_1_4
        vpxorq 10*8(%rdx), A_2_0, A_2_0
        vpxorq 11*8(%rdx), A_2_1, A_2_1
        vpxorq 12*8(%rdx), A_2_2, A_2_2
        vpxorq 13*8(%rdx), A_2_3, A_2_3
        vpxorq 14*8(%rdx), A_2_4, A_2_4
        vpxorq 15*8(%rdx), A_3_0, A_3_0
        vmovq 16*8(%rdx), C_0
        leaq 17*8(%rdx), %rdx
        vpxor C_0, A_3_1, A_3_1

        movq %rsi, %r10
        call KeccakF1600_ce

        jmp .Loop_absorb_136_ce

.align 64, 0xcc
.Loop_absorb_104_ce:
        subq %r8, %rcx // len - bsz
        jb .Labsorbed_ce

        vpxorq 0*8(%rdx), A_0_0, A_0_0
        vpxorq 1*8(%rdx), A_0_1, A_0_1
        vpxorq 2*8(%rdx), A_0_2, A_0_2
        vpxorq 3*8(%rdx), A_0_3, A_0_3
        vpxorq 4*8(%rdx), A_0_4, A_0_4
        vpxorq 5*8(%rdx), A_1_0, A_1_0
        vpxorq 6*8(%rdx), A_1_1, A_1_1
        vpxorq 7*8(%rdx), A_1_2, A_1_2
        vpxorq 8*8(%rdx), A_1_3, A_1_3
        vpxorq 9*8(%rdx), A_1_4, A_1_4
        vpxorq 10*8(%rdx), A_2_0, A_2_0
        vpxorq 11*8(%rdx), A_2_1, A_2_1
        vmovq 12*8(%rdx), C_0
        leaq 13*8(%rdx), %rdx
        vpxorq C_0, A_2_2, A_2_2

        movq %rsi, %r10
        call KeccakF1600_ce

        jmp .Loop_absorb_104_ce

.align 64, 0xcc
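/* The absorb loops above and below differ only in how many lanes they xor in
 * per block: 21, 18, 17, 13 and 9 lanes, i.e. the 168, 144, 136, 104 and 72
 * byte rates used by SHAKE128, SHA3-224, SHA3-256/SHAKE256, SHA3-384 and
 * SHA3-512. Each xors one full block from the input at %rdx into the state
 * registers (the final lane is loaded with vmovq and xored separately),
 * advances %rdx past the block and then runs the permutation. */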
.Loop_absorb_72_ce:
        subq %r8, %rcx // len - bsz
        jb .Labsorbed_ce

        vpxorq 0*8(%rdx), A_0_0, A_0_0
        vpxorq 1*8(%rdx), A_0_1, A_0_1
        vpxorq 2*8(%rdx), A_0_2, A_0_2
        vpxorq 3*8(%rdx), A_0_3, A_0_3
        vpxorq 4*8(%rdx), A_0_4, A_0_4
        vpxorq 5*8(%rdx), A_1_0, A_1_0
        vpxorq 6*8(%rdx), A_1_1, A_1_1
        vpxorq 7*8(%rdx), A_1_2, A_1_2
        vmovq 8*8(%rdx), C_0
        leaq 9*8(%rdx), %rdx
        vpxorq C_0, A_1_3, A_1_3

        movq %rsi, %r10
        call KeccakF1600_ce

        jmp .Loop_absorb_72_ce

.align 64, 0xcc
.Labsorbed_ce:
        vpunpcklqdq A_0_1, A_0_0, A_0_0
        vpunpcklqdq A_0_3, A_0_2, A_0_2
        vpunpcklqdq A_1_0, A_0_4, A_0_4
        vpunpcklqdq A_1_2, A_1_1, A_1_1
        vpunpcklqdq A_1_4, A_1_3, A_1_3
        vpunpcklqdq A_2_1, A_2_0, A_2_0
        vpunpcklqdq A_2_3, A_2_2, A_2_2
        vpunpcklqdq A_3_0, A_2_4, A_2_4
        vpunpcklqdq A_3_2, A_3_1, A_3_1
        vpunpcklqdq A_3_4, A_3_3, A_3_3
        vpunpcklqdq A_4_1, A_4_0, A_4_0
        vpunpcklqdq A_4_3, A_4_2, A_4_2

        vmovdqu64 A_0_0, 0*8(%rdi)
        vmovdqu64 A_0_2, 2*8(%rdi)
        vmovdqu64 A_0_4, 4*8(%rdi)
        vmovdqu64 A_1_1, 6*8(%rdi)
        vmovdqu64 A_1_3, 8*8(%rdi)
        vmovdqu64 A_2_0, 10*8(%rdi)
        vmovdqu64 A_2_2, 0*8(%rax)
        vmovdqu64 A_2_4, 2*8(%rax)
        vmovdqu A_3_1, 4*8(%rax)
        vmovdqu A_3_3, 6*8(%rax)
        vmovdqu A_4_0, 8*8(%rax)
        vmovdqu A_4_2, 10*8(%rax)
        vmovq A_4_4, 12*8(%rax)

        leaq (%r8, %rcx), %rax // return value
        movq %rdx, (%r9) // return buffer pointer

        clear_regs()
        ret_spec_stop
        CFI_ENDPROC()
ELF(.size _gcry_keccak_absorb_blocks_avx512, .-_gcry_keccak_absorb_blocks_avx512)

#endif /* HAVE_GCC_INLINE_ASM_AVX512 */
#endif /* __x86_64 */
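/*
 * For reference, a sketch of how the two entry points are presumably declared
 * on the C side (in keccak.c). The exact prototypes are an assumption; only
 * the register arguments documented above and the values returned in %rax are
 * taken from this file.
 *
 *   // state:  25 x u64 Keccak-f[1600] state (%rdi)
 *   // rconst: 24 round constants (%rsi)
 *   // returns 0
 *   extern unsigned int _gcry_keccak_f1600_state_permute64_avx512(
 *                           u64 *state, const u64 *rconst);
 *
 *   // lanes/nlanes: input lanes and their count (%rdx/%rcx)
 *   // blocklanes:   lanes per block, i.e. rate/8 (%r8)
 *   // new_lanes:    receives the advanced input pointer (%r9)
 *   // returns the number of lanes left unabsorbed
 *   extern u64 _gcry_keccak_absorb_blocks_avx512(
 *                  u64 *state, const u64 *rconst, const byte *lanes,
 *                  u64 nlanes, u64 blocklanes, const byte **new_lanes);
 */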