summaryrefslogtreecommitdiff
path: root/cipher/camellia-aesni-avx-amd64.S
diff options
context:
space:
mode:
author: Jussi Kivilinna <jussi.kivilinna@iki.fi>, 2023-02-19 17:57:16 +0200
committer: Jussi Kivilinna <jussi.kivilinna@iki.fi>, 2023-02-22 21:12:58 +0200
commit: 87ae2a660d59751ddd7da40da05cfaee73f35ea7 (patch)
tree: 2d76eaef0aeb66f61b7e1491a04acbd09fc55d1e /cipher/camellia-aesni-avx-amd64.S
parent: 926cc22058a39c7a931e14590eab6fd7a78ba455 (diff)
download: libgcrypt-87ae2a660d59751ddd7da40da05cfaee73f35ea7.tar.gz
camellia: add CTR-mode byte addition for AVX/AVX2/AVX512 impl.
* cipher/camellia-aesni-avx-amd64.S (_gcry_camellia_aesni_avx_ctr_enc): Add byte addition fast-path.
* cipher/camellia-aesni-avx2-amd64.h (ctr_enc): Likewise.
* cipher/camellia-gfni-avx512-amd64.S (_gcry_camellia_gfni_avx512_ctr_enc): Likewise.
* cipher/camellia-glue.c (CAMELLIA_context): Add 'use_avx2'.
(camellia_setkey, _gcry_camellia_ctr_enc, _gcry_camellia_cbc_dec)
(_gcry_camellia_cfb_dec, _gcry_camellia_ocb_crypt)
(_gcry_camellia_ocb_auth) [USE_AESNI_AVX2]: Use 'use_avx2' to check if any of the AVX2 implementations is enabled.
--
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/camellia-aesni-avx-amd64.S')
-rw-r--r-- cipher/camellia-aesni-avx-amd64.S | 78
1 files changed, 78 insertions, 0 deletions
diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S
index 93c96791..5ec33b9b 100644
--- a/cipher/camellia-aesni-avx-amd64.S
+++ b/cipher/camellia-aesni-avx-amd64.S
@@ -761,6 +761,38 @@ _camellia_aesni_avx_data:
.Ltranspose_8x8_shuf:
.byte 0, 1, 4, 5, 2, 3, 6, 7, 8+0, 8+1, 8+4, 8+5, 8+2, 8+3, 8+6, 8+7
+/* CTR byte addition constants: each .Lbige_addb_N is a 16-byte vector that is all zero except for the value N in its last byte (index 15), i.e. N written as a 128-bit big-endian integer. */
+.Lbige_addb_1:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+.Lbige_addb_3:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+.Lbige_addb_5:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+.Lbige_addb_7:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+.Lbige_addb_9:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+.Lbige_addb_11:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+.Lbige_addb_13:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+.Lbige_addb_15:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+
.align 4
/* 4-bit mask */
.L0f0f0f0f:
@@ -930,6 +962,9 @@ _gcry_camellia_aesni_avx_ctr_enc:
andq $~31, %rsp;
movq %rsp, %rax;
+ cmpb $(0x100 - 16), 15(%rcx); /* compare byte 15 of the 16-byte block at (%rcx) with 0xf0; the flags are consumed again by the 'je' in .Lctr_byteadd */
+ jbe .Lctr_byteadd; /* byte 15 <= 0xf0 (unsigned): adding 0..15 to it cannot wrap, so take the byte-addition path */
+
vmovdqa .Lbswap128_mask rRIP, %xmm14;
/* load IV and byteswap */
@@ -978,6 +1013,8 @@ _gcry_camellia_aesni_avx_ctr_enc:
vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; /* le => be */
vmovdqu %xmm13, (%rcx);
+.align 8
+.Lload_ctr_done:
/* inpack16_pre: */
vmovq (key_table)(CTX), %xmm15;
vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15;
@@ -1026,6 +1063,47 @@ _gcry_camellia_aesni_avx_ctr_enc:
leave;
CFI_LEAVE();
ret_spec_stop;
+
+.align 8
+.Lctr_byteadd_full_ctr_carry: /* reached from the 'je' in .Lctr_byteadd: byte 15 of (%rcx) is 0xf0, so adding 16 carries out of that byte */
+ movq 8(%rcx), %r11; /* r11 = bytes 8..15 of the block at (%rcx) */
+ movq (%rcx), %r10; /* r10 = bytes 0..7 of the block at (%rcx) */
+ bswapq %r11; /* byte-reverse both halves so the big-endian memory image becomes native integers */
+ bswapq %r10;
+ addq $16, %r11; /* low 64 bits += 16 */
+ adcq $0, %r10; /* propagate the carry into the high 64 bits */
+ bswapq %r11; /* byte-reverse back to the in-memory order */
+ bswapq %r10;
+ movq %r11, 8(%rcx); /* write the advanced 128-bit value back to (%rcx) */
+ movq %r10, (%rcx);
+ jmp .Lctr_byteadd_xmm; /* xmm15 still holds the block as loaded before this update */
+.align 8
+.Lctr_byteadd:
+ vmovdqu (%rcx), %xmm15; /* xmm15 = current 16-byte block at (%rcx), the base value */
+ je .Lctr_byteadd_full_ctr_carry; /* ZF still comes from the 'cmpb' ahead of 'jbe .Lctr_byteadd' (vmovdqu does not touch flags): byte 15 == 0xf0 */
+ addb $16, 15(%rcx); /* byte 15 < 0xf0 here, so the stored block advances by 16 without a carry */
+.Lctr_byteadd_xmm:
+ vmovdqa %xmm15, %xmm0; /* xmm0 = copy of the base value */
+ vpaddb .Lbige_addb_1 rRIP, %xmm15, %xmm14; /* xmm14 = base + 1 (byte-wise add; the constant is non-zero only in byte 15) */
+ vmovdqu %xmm15, 15 * 16(%rax); /* store base + 0 at offset 15*16 from %rax (the aligned stack pointer) */
+ vpaddb .Lbige_addb_2 rRIP, %xmm15, %xmm13; /* xmm13 = base + 2 */
+ vmovdqu %xmm14, 14 * 16(%rax); /* store base + 1 at offset 14*16 */
+ vpaddb .Lbige_addb_3 rRIP, %xmm15, %xmm12; /* xmm12 = base + 3 */
+ vmovdqu %xmm13, 13 * 16(%rax); /* store base + 2 at offset 13*16 */
+ vpaddb .Lbige_addb_4 rRIP, %xmm15, %xmm11; /* xmm11 = base + 4 */
+ vpaddb .Lbige_addb_5 rRIP, %xmm15, %xmm10; /* xmm10 = base + 5 */
+ vpaddb .Lbige_addb_6 rRIP, %xmm15, %xmm9; /* xmm9 = base + 6 */
+ vpaddb .Lbige_addb_7 rRIP, %xmm15, %xmm8; /* xmm8 = base + 7 */
+ vpaddb .Lbige_addb_8 rRIP, %xmm0, %xmm7; /* xmm7 = base + 8 (from here on the copy in xmm0 is the source) */
+ vpaddb .Lbige_addb_9 rRIP, %xmm0, %xmm6; /* xmm6 = base + 9 */
+ vpaddb .Lbige_addb_10 rRIP, %xmm0, %xmm5; /* xmm5 = base + 10 */
+ vpaddb .Lbige_addb_11 rRIP, %xmm0, %xmm4; /* xmm4 = base + 11 */
+ vpaddb .Lbige_addb_12 rRIP, %xmm0, %xmm3; /* xmm3 = base + 12 */
+ vpaddb .Lbige_addb_13 rRIP, %xmm0, %xmm2; /* xmm2 = base + 13 */
+ vpaddb .Lbige_addb_14 rRIP, %xmm0, %xmm1; /* xmm1 = base + 14 */
+ vpaddb .Lbige_addb_15 rRIP, %xmm0, %xmm0; /* xmm0 = base + 15; overwrites the copy, so this must be the last add */
+
+ jmp .Lload_ctr_done; /* rejoin the common path at the 'inpack16_pre' step */
CFI_ENDPROC();
ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;)