diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2023-02-19 17:57:16 +0200 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2023-02-22 21:12:58 +0200 |
commit | 87ae2a660d59751ddd7da40da05cfaee73f35ea7 (patch) | |
tree | 2d76eaef0aeb66f61b7e1491a04acbd09fc55d1e /cipher/camellia-aesni-avx2-amd64.h | |
parent | 926cc22058a39c7a931e14590eab6fd7a78ba455 (diff) | |
download | libgcrypt-87ae2a660d59751ddd7da40da05cfaee73f35ea7.tar.gz |
camellia: add CTR-mode byte addition for AVX/AVX2/AVX512 impl.
* cipher/camellia-aesni-avx-amd64.S
(_gcry_camellia_aesni_avx_ctr_enc): Add byte addition fast-path.
* cipher/camellia-aesni-avx2-amd64.h (ctr_enc): Likewise.
* cipher/camellia-gfni-avx512-amd64.S
(_gcry_camellia_gfni_avx512_ctr_enc): Likewise.
* cipher/camellia-glue.c (CAMELLIA_context): Add 'use_avx2'.
(camellia_setkey, _gcry_camellia_ctr_enc, _gcry_camellia_cbc_dec)
(_gcry_camellia_cfb_dec, _gcry_camellia_ocb_crypt)
(_gcry_camellia_ocb_auth) [USE_AESNI_AVX2]: Use 'use_avx2' to check
if any of the AVX2 implementations is enabled.
--
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/camellia-aesni-avx2-amd64.h')
-rw-r--r-- | cipher/camellia-aesni-avx2-amd64.h | 83 |
1 files changed, 79 insertions, 4 deletions
diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h index c92a0559..7d451c09 100644 --- a/cipher/camellia-aesni-avx2-amd64.h +++ b/cipher/camellia-aesni-avx2-amd64.h @@ -805,6 +805,36 @@ ELF(.type FUNC_NAME(_constants),@object;) .Lbswap128_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +/* CTR byte addition constants */ +.align 32 +.Lbige_addb_0_1: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 +.Lbige_addb_2_3: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 +.Lbige_addb_4_5: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 +.Lbige_addb_6_7: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 +.Lbige_addb_8_9: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9 +.Lbige_addb_10_11: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11 +.Lbige_addb_12_13: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13 +.Lbige_addb_14_15: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15 +.Lbige_addb_16_16: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 + #ifdef CAMELLIA_GFNI_BUILD /* Pre-filters and post-filters bit-matrixes for Camellia sboxes s1, s2, s3 @@ -1151,9 +1181,6 @@ FUNC_NAME(ctr_enc): movq %rsp, %rbp; CFI_DEF_CFA_REGISTER(%rbp); - movq 8(%rcx), %r11; - bswapq %r11; - cmpl $128, key_bitlength(CTX); movl $32, %r8d; movl $24, %eax; @@ -1163,6 +1190,12 @@ FUNC_NAME(ctr_enc): andq $~63, %rsp; movq %rsp, %rax; + cmpb $(0x100 - 32), 15(%rcx); + jbe .Lctr_byteadd; + + movq 8(%rcx), %r11; + bswapq %r11; + vpcmpeqd %ymm15, %ymm15, %ymm15; vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */ @@ -1275,7 +1308,7 @@ FUNC_NAME(ctr_enc): vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; vmovdqu %xmm13, (%rcx); -.align 4 +.align 8 .Lload_ctr_done: /* inpack32_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; @@ -1325,6 +1358,48 @@ FUNC_NAME(ctr_enc): leave; CFI_LEAVE(); ret_spec_stop; + +.align 8 +.Lctr_byteadd_full_ctr_carry: + movq 8(%rcx), %r11; + movq (%rcx), %r10; + bswapq %r11; + bswapq %r10; + addq $32, %r11; + adcq $0, %r10; + bswapq %r11; + bswapq %r10; + movq %r11, 8(%rcx); + movq %r10, (%rcx); + jmp .Lctr_byteadd_ymm; +.align 8 +.Lctr_byteadd: + vbroadcasti128 (%rcx), %ymm8; + je .Lctr_byteadd_full_ctr_carry; + addb $32, 15(%rcx); +.Lctr_byteadd_ymm: + vpaddb .Lbige_addb_16_16 rRIP, %ymm8, %ymm0; + vpaddb .Lbige_addb_0_1 rRIP, %ymm8, %ymm15; + vpaddb .Lbige_addb_2_3 rRIP, %ymm8, %ymm14; + vmovdqu %ymm15, 15 * 32(%rax); + vpaddb .Lbige_addb_4_5 rRIP, %ymm8, %ymm13; + vmovdqu %ymm14, 14 * 32(%rax); + vpaddb .Lbige_addb_6_7 rRIP, %ymm8, %ymm12; + vmovdqu %ymm13, 13 * 32(%rax); + vpaddb .Lbige_addb_8_9 rRIP, %ymm8, %ymm11; + vpaddb .Lbige_addb_10_11 rRIP, %ymm8, %ymm10; + vpaddb .Lbige_addb_12_13 rRIP, %ymm8, %ymm9; + vpaddb .Lbige_addb_14_15 rRIP, %ymm8, %ymm8; + vpaddb .Lbige_addb_0_1 rRIP, %ymm0, %ymm7; + vpaddb .Lbige_addb_2_3 rRIP, %ymm0, %ymm6; + vpaddb .Lbige_addb_4_5 rRIP, %ymm0, %ymm5; + vpaddb .Lbige_addb_6_7 rRIP, %ymm0, %ymm4; + vpaddb .Lbige_addb_8_9 rRIP, %ymm0, %ymm3; + vpaddb .Lbige_addb_10_11 rRIP, %ymm0, %ymm2; + vpaddb .Lbige_addb_12_13 rRIP, %ymm0, %ymm1; + vpaddb .Lbige_addb_14_15 rRIP, %ymm0, %ymm0; + + jmp .Lload_ctr_done; CFI_ENDPROC(); ELF(.size FUNC_NAME(ctr_enc),.-FUNC_NAME(ctr_enc);) |