diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2023-01-16 19:24:33 +0200 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2023-01-17 18:30:23 +0200 |
commit | 7de2fb66e065a97f121bd16ab37efba32983a6bd (patch) | |
tree | bc3bce8cdb71c4946e5b12326dd9a379c8863671 /cipher/keccak-amd64-avx512.S | |
parent | 5e1a04f77933a8295df69d818e9effc076dc68cd (diff) | |
download | libgcrypt-7de2fb66e065a97f121bd16ab37efba32983a6bd.tar.gz |
avx512: tweak zmm16-zmm31 register clearing
* cipher/asm-common-amd64.h (spec_stop_avx512): Clear ymm16
before and after vpopcntb.
* cipher/camellia-gfni-avx512-amd64.S (clear_zmm16_zmm31): Clear
YMM16-YMM31 registers instead of XMM16-XMM31.
* cipher/chacha20-amd64-avx512.S (clear_zmm16_zmm31): Likewise.
* cipher/keccak-amd64-avx512.S (clear_regs): Likewise.
(clear_avx512_4regs): Clear all 4 registers with XOR.
* cipher/cipher-gcm-intel-pclmul.c (_gcry_ghash_intel_pclmul)
(_gcry_polyval_intel_pclmul): Clear YMM16-YMM19 registers instead of
ZMM16-ZMM19.
* cipher/poly1305-amd64-avx512.S (POLY1305_BLOCKS): Clear YMM16-YMM31
registers after vector processing instead of XMM16-XMM31.
* cipher/sha512-avx512-amd64.S
(_gcry_sha512_transform_amd64_avx512): Likewise.
--
Clear the zmm16-zmm31 registers with a 256-bit XOR instead of a 128-bit
one, as this is better for AMD Zen4. Also clear the ymm16 register
after vpopcntb in the AVX512 spec-stop so that we do not leave behind
any zmm register state that might end up unnecessarily using CPU
resources.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/keccak-amd64-avx512.S')
-rw-r--r-- | cipher/keccak-amd64-avx512.S | 10 |
1 files changed, 5 insertions, 5 deletions
```diff
diff --git a/cipher/keccak-amd64-avx512.S b/cipher/keccak-amd64-avx512.S
index 58b4150f..b1fc7b64 100644
--- a/cipher/keccak-amd64-avx512.S
+++ b/cipher/keccak-amd64-avx512.S
@@ -160,14 +160,14 @@
 
 /* Misc helper macros. */
 #define clear_avx512_4regs(a, b, c, d) \
-	eor(a, a, a); vmovdqa64 a, b; vmovdqa64 a, c; vmovdqa64 a, d;
+	eor(a, a, a); eor(b, b, b); eor(c, c, c); eor(d, d, d);
 
 #define clear_regs() \
 	vzeroall; /* xmm0-xmm15 */ \
-	clear_avx512_4regs(%xmm16, %xmm17, %xmm18, %xmm19); \
-	clear_avx512_4regs(%xmm20, %xmm21, %xmm22, %xmm23); \
-	clear_avx512_4regs(%xmm24, %xmm25, %xmm26, %xmm27); \
-	clear_avx512_4regs(%xmm28, %xmm29, %xmm30, %xmm31);
+	clear_avx512_4regs(%ymm16, %ymm17, %ymm18, %ymm19); \
+	clear_avx512_4regs(%ymm20, %ymm21, %ymm22, %ymm23); \
+	clear_avx512_4regs(%ymm24, %ymm25, %ymm26, %ymm27); \
+	clear_avx512_4regs(%ymm28, %ymm29, %ymm30, %ymm31);
 
 ELF(.type KeccakF1600_ce,@function)
 .align 64, 0xcc
```