author     Jussi Kivilinna <jussi.kivilinna@iki.fi>  2023-02-19 18:39:36 +0200
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>  2023-02-22 20:27:56 +0200
commit     978b02fca682c9ecb71e30cdeeb6922fc8331f6e (patch)
tree       58e53a100647fe286ecb50994a66f156cffcece4 /cipher/sm4-gfni-avx512-amd64.S
parent     8f7f5a9fc63968304bacedbc2f22b9f7188bbd53 (diff)
sm4: add CTR-mode byte addition for AVX/AVX2/AVX512 implementations
* cipher/sm4-aesni-avx-amd64.S (_gcry_sm4_aesni_avx_ctr_enc): Add byte
addition fast-path.
* cipher/sm4-aesni-avx2-amd64.S (_gcry_sm4_aesni_avx2_ctr_enc): Likewise.
* cipher/sm4-gfni-avx2-amd64.S (_gcry_sm4_gfni_avx2_ctr_enc): Likewise.
* cipher/sm4-gfni-avx512-amd64.S (_gcry_sm4_gfni_avx512_ctr_enc)
(_gcry_sm4_gfni_avx512_ctr_enc_blk32): Likewise.
--
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
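The fast path hinges on a single check of the counter's least-significant (big-endian) byte: if adding the batch size cannot wrap past that byte for any block in the batch, the per-block counters can be produced with plain byte-wise additions (vpaddb with the .Lbige_addb_* constants) instead of full 128-bit big-endian increments with byte swaps. Below is a minimal scalar C sketch of that dispatch for the 16-block path; the function names and the C rendering are illustrative only, not libgcrypt code.

#include <stdint.h>
#include <string.h>

#define NBLKS 16   /* the blk32 variant uses 32 and checks against 0x100 - 32 */

/* Generic big-endian 128-bit add; stands in for the bswap/add/adc
 * sequence at .Lctr_byteadd_full_ctr_carry16. */
static void be128_add (uint8_t ctr[16], unsigned int n)
{
  unsigned int carry = n;
  int i;

  for (i = 15; i >= 0 && carry; i--)
    {
      carry += ctr[i];
      ctr[i] = (uint8_t)carry;
      carry >>= 8;
    }
}

/* Sketch of the new dispatch: 'ctr' is the caller's big-endian counter
 * (updated in place), 'blk_ctrs' receives the NBLKS per-block counters.
 * Returns 0 when the generic bswap-based path must be used instead. */
static int ctr_byteadd_fastpath (uint8_t ctr[16], uint8_t blk_ctrs[NBLKS][16])
{
  uint8_t base[16];
  int i;

  /* cmpb $(0x100 - 16), 15(%rcx); jbe .Lctr_byteadd16 */
  if (ctr[15] > 0x100 - NBLKS)
    return 0;

  memcpy (base, ctr, 16);          /* vbroadcasti128 (%rcx), RB3 */

  if (ctr[15] == 0x100 - NBLKS)
    be128_add (ctr, NBLKS);        /* adding NBLKS carries out of the low byte */
  else
    ctr[15] += NBLKS;              /* addb $16, 15(%rcx) */

  /* vpaddb with the .Lbige_addb_* constants: within the batch the low
   * byte never wraps (base[15] + NBLKS - 1 <= 0xff), so adding i to the
   * last byte alone is enough. */
  for (i = 0; i < NBLKS; i++)
    {
      memcpy (blk_ctrs[i], base, 16);
      blk_ctrs[i][15] = (uint8_t)(base[15] + i);
    }

  return 1;
}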
Diffstat (limited to 'cipher/sm4-gfni-avx512-amd64.S')
-rw-r--r--  cipher/sm4-gfni-avx512-amd64.S  103
1 file changed, 102 insertions(+), 1 deletion(-)
diff --git a/cipher/sm4-gfni-avx512-amd64.S b/cipher/sm4-gfni-avx512-amd64.S
index b095f85d..91f6e80b 100644
--- a/cipher/sm4-gfni-avx512-amd64.S
+++ b/cipher/sm4-gfni-avx512-amd64.S
@@ -1,6 +1,6 @@
/* sm4-gfni-avx512-amd64.S - GFNI/AVX512 implementation of SM4 cipher
*
- * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2022-2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -146,6 +146,35 @@ SECTION_RODATA
.quad 2, 0
.quad 3, 0
+/* CTR byte addition constants */
+.align 64
+.Lbige_addb_0_1:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
+.Lbige_addb_2_3:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3
+.Lbige_addb_4_5:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5
+.Lbige_addb_6_7:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7
+.Lbige_addb_8_9:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9
+.Lbige_addb_10_11:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11
+.Lbige_addb_12_13:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13
+.Lbige_addb_14_15:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15
+.Lbige_addb_16:
+ .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16
+
.text
.align 16
@@ -627,6 +656,9 @@ _gcry_sm4_gfni_avx512_ctr_enc:
CFI_STARTPROC();
spec_stop_avx512;
+ cmpb $(0x100 - 16), 15(%rcx);
+ jbe .Lctr_byteadd16;
+
vbroadcasti128 .Lbswap128_mask rRIP, RTMP0;
vmovdqa .Lcounter0123_lo rRIP, RTMP1;
vbroadcasti128 .Lcounter2222_lo rRIP, RTMP2;
@@ -695,6 +727,8 @@ _gcry_sm4_gfni_avx512_ctr_enc:
vpshufb RTMP0, RB2, RB2;
vpshufb RTMP0, RB3, RB3;
+.align 16
+.Lload_ctr_done16:
call __sm4_gfni_crypt_blk16;
vpxor (0 * 32)(%rdx), RA0, RA0;
@@ -719,6 +753,36 @@ _gcry_sm4_gfni_avx512_ctr_enc:
kxorq %k1, %k1, %k1;
ret_spec_stop;
+
+.align 16
+.Lctr_byteadd_full_ctr_carry16:
+ movq 8(%rcx), %r11;
+ movq (%rcx), %r10;
+ bswapq %r11;
+ bswapq %r10;
+ addq $16, %r11;
+ adcq $0, %r10;
+ bswapq %r11;
+ bswapq %r10;
+ movq %r11, 8(%rcx);
+ movq %r10, (%rcx);
+ jmp .Lctr_byteadd_ymm16;
+.align 16
+.Lctr_byteadd16:
+ vbroadcasti128 (%rcx), RB3;
+ je .Lctr_byteadd_full_ctr_carry16;
+ addb $16, 15(%rcx);
+.Lctr_byteadd_ymm16:
+ vpaddb .Lbige_addb_0_1 rRIP, RB3, RA0;
+ vpaddb .Lbige_addb_2_3 rRIP, RB3, RA1;
+ vpaddb .Lbige_addb_4_5 rRIP, RB3, RA2;
+ vpaddb .Lbige_addb_6_7 rRIP, RB3, RA3;
+ vpaddb .Lbige_addb_8_9 rRIP, RB3, RB0;
+ vpaddb .Lbige_addb_10_11 rRIP, RB3, RB1;
+ vpaddb .Lbige_addb_12_13 rRIP, RB3, RB2;
+ vpaddb .Lbige_addb_14_15 rRIP, RB3, RB3;
+
+ jmp .Lload_ctr_done16;
CFI_ENDPROC();
ELF(.size _gcry_sm4_gfni_avx512_ctr_enc,.-_gcry_sm4_gfni_avx512_ctr_enc;)
@@ -1304,6 +1368,9 @@ _gcry_sm4_gfni_avx512_ctr_enc_blk32:
CFI_STARTPROC();
spec_stop_avx512;
+ cmpb $(0x100 - 32), 15(%rcx);
+ jbe .Lctr_byteadd32;
+
vbroadcasti64x2 .Lbswap128_mask rRIP, RTMP0z;
vmovdqa32 .Lcounter0123_lo rRIP, RTMP1z;
vbroadcasti64x2 .Lcounter4444_lo rRIP, RTMP2z;
@@ -1372,6 +1439,8 @@ _gcry_sm4_gfni_avx512_ctr_enc_blk32:
vpshufb RTMP0z, RB2z, RB2z;
vpshufb RTMP0z, RB3z, RB3z;
+.align 16
+.Lload_ctr_done32:
call __sm4_gfni_crypt_blk32;
vpxord (0 * 64)(%rdx), RA0z, RA0z;
@@ -1396,6 +1465,38 @@ _gcry_sm4_gfni_avx512_ctr_enc_blk32:
kxorq %k1, %k1, %k1;
ret_spec_stop;
+
+.align 16
+.Lctr_byteadd_full_ctr_carry32:
+ movq 8(%rcx), %r11;
+ movq (%rcx), %r10;
+ bswapq %r11;
+ bswapq %r10;
+ addq $32, %r11;
+ adcq $0, %r10;
+ bswapq %r11;
+ bswapq %r10;
+ movq %r11, 8(%rcx);
+ movq %r10, (%rcx);
+ jmp .Lctr_byteadd_zmm32;
+.align 16
+.Lctr_byteadd32:
+ vbroadcasti64x2 (%rcx), RA3z;
+ je .Lctr_byteadd_full_ctr_carry32;
+ addb $32, 15(%rcx);
+.Lctr_byteadd_zmm32:
+ vbroadcasti64x2 .Lbige_addb_16 rRIP, RB3z;
+ vpaddb RB3z, RA3z, RB3z;
+ vpaddb .Lbige_addb_0_1 rRIP, RA3z, RA0z;
+ vpaddb .Lbige_addb_4_5 rRIP, RA3z, RA1z;
+ vpaddb .Lbige_addb_8_9 rRIP, RA3z, RA2z;
+ vpaddb .Lbige_addb_12_13 rRIP, RA3z, RA3z;
+ vpaddb .Lbige_addb_0_1 rRIP, RB3z, RB0z;
+ vpaddb .Lbige_addb_4_5 rRIP, RB3z, RB1z;
+ vpaddb .Lbige_addb_8_9 rRIP, RB3z, RB2z;
+ vpaddb .Lbige_addb_12_13 rRIP, RB3z, RB3z;
+
+ jmp .Lload_ctr_done32;
CFI_ENDPROC();
ELF(.size _gcry_sm4_gfni_avx512_ctr_enc_blk32,.-_gcry_sm4_gfni_avx512_ctr_enc_blk32;)
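In the 32-block variant the same +0..+15 byte-addition constants are reused for both halves of the batch: the code first derives a second base value, base + 16, by adding the broadcast .Lbige_addb_16 constant, then applies the +0..+3/+4..+7/... constants to base and base + 16 alike. A hypothetical scalar rendering, assuming the caller has already verified that the low counter byte is at most 0x100 - 32:

#include <stdint.h>
#include <string.h>

/* Scalar sketch of the blk32 counter generation (illustrative only):
 * build base+16 once, then reuse the +0..+15 byte additions for both
 * halves, mirroring the vpaddb sequence at .Lctr_byteadd_zmm32. */
static void blk32_counters (const uint8_t base[16], uint8_t out[32][16])
{
  uint8_t base_hi[16];
  int i;

  memcpy (base_hi, base, 16);
  base_hi[15] = base[15] + 16;  /* .Lbige_addb_16; no wrap since base[15] <= 0x100 - 32 */

  for (i = 0; i < 16; i++)
    {
      memcpy (out[i], base, 16);
      out[i][15] = base[15] + i;             /* blocks 0..15  */
      memcpy (out[16 + i], base_hi, 16);
      out[16 + i][15] = base_hi[15] + i;     /* blocks 16..31 */
    }
}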