diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2023-02-19 18:39:36 +0200 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2023-02-22 20:27:56 +0200 |
commit | 978b02fca682c9ecb71e30cdeeb6922fc8331f6e (patch) | |
tree | 58e53a100647fe286ecb50994a66f156cffcece4 /cipher | |
parent | 8f7f5a9fc63968304bacedbc2f22b9f7188bbd53 (diff) | |
download | libgcrypt-978b02fca682c9ecb71e30cdeeb6922fc8331f6e.tar.gz |
sm4: add CTR-mode byte addition for AVX/AVX2/AVX512 implementations
* cipher/sm4-aesni-avx-amd64.S
(_gcry_sm4_aesni_avx_ctr_enc): Add byte addition fast-path.
* cipher/sm4-aesni-avx2-amd64.S
(_gcry_sm4_aesni_avx2_ctr_enc): Likewise.
* cipher/sm4-gfni-avx2-amd64.S
(_gcry_sm4_gfni_avx2_ctr_enc): Likewise.
* cipher/sm4-gfni-avx512-amd64.S
(_gcry_sm4_gfni_avx512_ctr_enc)
(_gcry_sm4_gfni_avx512_ctr_enc_blk32): Likewise.
--
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher')
-rw-r--r-- | cipher/sm4-aesni-avx-amd64.S | 68 | ||||
-rw-r--r-- | cipher/sm4-aesni-avx2-amd64.S | 65 | ||||
-rw-r--r-- | cipher/sm4-gfni-avx2-amd64.S | 65 | ||||
-rw-r--r-- | cipher/sm4-gfni-avx512-amd64.S | 103 |
4 files changed, 295 insertions(+), 6 deletions(-)
diff --git a/cipher/sm4-aesni-avx-amd64.S b/cipher/sm4-aesni-avx-amd64.S index c09b205d..ca9be44a 100644 --- a/cipher/sm4-aesni-avx-amd64.S +++ b/cipher/sm4-aesni-avx-amd64.S @@ -1,6 +1,6 @@ /* sm4-avx-aesni-amd64.S - AES-NI/AVX implementation of SM4 cipher * - * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * Copyright (C) 2020,2023 Jussi Kivilinna <jussi.kivilinna@iki.fi> * * This file is part of Libgcrypt. * @@ -150,6 +150,38 @@ _sm4_aesni_avx_consts: .Lbswap32_mask: .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 +/* CTR byte addition constants */ +.Lbige_addb_1: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 +.Lbige_addb_2: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 +.Lbige_addb_3: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 +.Lbige_addb_4: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 +.Lbige_addb_5: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 +.Lbige_addb_6: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 +.Lbige_addb_7: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 +.Lbige_addb_8: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 +.Lbige_addb_9: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9 +.Lbige_addb_10: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10 +.Lbige_addb_11: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11 +.Lbige_addb_12: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12 +.Lbige_addb_13: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13 +.Lbige_addb_14: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14 +.Lbige_addb_15: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15 + .align 4 /* 4-bit mask */ .L0f0f0f0f: @@ -529,6 +561,9 @@ _gcry_sm4_aesni_avx_ctr_enc: */ CFI_STARTPROC(); + cmpb $(0x100 - 8), 15(%rcx); + jbe .Lctr_byteadd; + /* load IV and byteswap */ vmovdqu (%rcx), RA0; @@ -565,6 +600,8 @@ _gcry_sm4_aesni_avx_ctr_enc: /* store new IV */ vmovdqu RTMP1, (%rcx); +.align 8 +.Lload_ctr_done: call 
__sm4_crypt_blk8; vpxor (0 * 16)(%rdx), RA0, RA0; @@ -588,6 +625,35 @@ _gcry_sm4_aesni_avx_ctr_enc: vzeroall; ret_spec_stop; + .align 8 + +.Lctr_byteadd_full_ctr_carry: + movq 8(%rcx), %r11; + movq (%rcx), %r10; + bswapq %r11; + bswapq %r10; + addq $8, %r11; + adcq $0, %r10; + bswapq %r11; + bswapq %r10; + movq %r11, 8(%rcx); + movq %r10, (%rcx); + jmp .Lctr_byteadd_xmm; +.align 8 +.Lctr_byteadd: + vmovdqu (%rcx), RA0; + je .Lctr_byteadd_full_ctr_carry; + addb $8, 15(%rcx); +.Lctr_byteadd_xmm: + vpaddb .Lbige_addb_1 rRIP, RA0, RA1; + vpaddb .Lbige_addb_2 rRIP, RA0, RA2; + vpaddb .Lbige_addb_3 rRIP, RA0, RA3; + vpaddb .Lbige_addb_4 rRIP, RA0, RB0; + vpaddb .Lbige_addb_5 rRIP, RA0, RB1; + vpaddb .Lbige_addb_6 rRIP, RA0, RB2; + vpaddb .Lbige_addb_7 rRIP, RA0, RB3; + + jmp .Lload_ctr_done; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_ctr_enc,.-_gcry_sm4_aesni_avx_ctr_enc;) diff --git a/cipher/sm4-aesni-avx2-amd64.S b/cipher/sm4-aesni-avx2-amd64.S index acd37cff..03f979fa 100644 --- a/cipher/sm4-aesni-avx2-amd64.S +++ b/cipher/sm4-aesni-avx2-amd64.S @@ -1,6 +1,6 @@ /* sm4-avx2-amd64.S - AVX2 implementation of SM4 cipher * - * Copyright (C) 2020, 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * Copyright (C) 2020, 2022-2023 Jussi Kivilinna <jussi.kivilinna@iki.fi> * * This file is part of Libgcrypt. 
* @@ -171,6 +171,33 @@ _sm4_aesni_avx2_consts: .Lbswap32_mask: .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 +/* CTR byte addition constants */ +.align 32 +.Lbige_addb_0_1: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 +.Lbige_addb_2_3: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 +.Lbige_addb_4_5: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 +.Lbige_addb_6_7: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 +.Lbige_addb_8_9: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9 +.Lbige_addb_10_11: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11 +.Lbige_addb_12_13: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13 +.Lbige_addb_14_15: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15 + .align 4 /* 4-bit mask */ .L0f0f0f0f: @@ -371,6 +398,9 @@ _gcry_sm4_aesni_avx2_ctr_enc: */ CFI_STARTPROC(); + cmpb $(0x100 - 16), 15(%rcx); + jbe .Lctr_byteadd; + movq 8(%rcx), %rax; bswapq %rax; @@ -438,11 +468,12 @@ _gcry_sm4_aesni_avx2_ctr_enc: vextracti128 $1, RTMP0, RTMP0x; vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */ -.align 4 .Lctr_carry_done: /* store new IV */ vmovdqu RTMP0x, (%rcx); +.align 8 +.Lload_ctr_done: call __sm4_crypt_blk16; vpxor (0 * 32)(%rdx), RA0, RA0; @@ -466,6 +497,36 @@ _gcry_sm4_aesni_avx2_ctr_enc: vzeroall; ret_spec_stop; + +.align 8 +.Lctr_byteadd_full_ctr_carry: + movq 8(%rcx), %r11; + movq (%rcx), %r10; + bswapq %r11; + bswapq %r10; + addq $16, %r11; + adcq $0, %r10; + bswapq %r11; + bswapq %r10; + movq %r11, 8(%rcx); + movq %r10, (%rcx); + jmp 
.Lctr_byteadd_ymm; +.align 8 +.Lctr_byteadd: + vbroadcasti128 (%rcx), RB3; + je .Lctr_byteadd_full_ctr_carry; + addb $16, 15(%rcx); +.Lctr_byteadd_ymm: + vpaddb .Lbige_addb_0_1 rRIP, RB3, RA0; + vpaddb .Lbige_addb_2_3 rRIP, RB3, RA1; + vpaddb .Lbige_addb_4_5 rRIP, RB3, RA2; + vpaddb .Lbige_addb_6_7 rRIP, RB3, RA3; + vpaddb .Lbige_addb_8_9 rRIP, RB3, RB0; + vpaddb .Lbige_addb_10_11 rRIP, RB3, RB1; + vpaddb .Lbige_addb_12_13 rRIP, RB3, RB2; + vpaddb .Lbige_addb_14_15 rRIP, RB3, RB3; + + jmp .Lload_ctr_done; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_ctr_enc,.-_gcry_sm4_aesni_avx2_ctr_enc;) diff --git a/cipher/sm4-gfni-avx2-amd64.S b/cipher/sm4-gfni-avx2-amd64.S index 2fbaffd5..464da399 100644 --- a/cipher/sm4-gfni-avx2-amd64.S +++ b/cipher/sm4-gfni-avx2-amd64.S @@ -1,6 +1,6 @@ /* sm4-gfni-avx2-amd64.S - GFNI/AVX2 implementation of SM4 cipher * - * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * Copyright (C) 2022-2023 Jussi Kivilinna <jussi.kivilinna@iki.fi> * * This file is part of Libgcrypt. 
* @@ -136,6 +136,33 @@ _sm4_gfni_avx2_consts: .Lbswap32_mask: .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 +/* CTR byte addition constants */ +.align 32 +.Lbige_addb_0_1: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 +.Lbige_addb_2_3: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 +.Lbige_addb_4_5: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 +.Lbige_addb_6_7: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 +.Lbige_addb_8_9: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9 +.Lbige_addb_10_11: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11 +.Lbige_addb_12_13: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13 +.Lbige_addb_14_15: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15 + .text .align 16 @@ -658,6 +685,9 @@ _gcry_sm4_gfni_avx2_ctr_enc: */ CFI_STARTPROC(); + cmpb $(0x100 - 16), 15(%rcx); + jbe .Lctr_byteadd; + movq 8(%rcx), %rax; bswapq %rax; @@ -725,11 +755,12 @@ _gcry_sm4_gfni_avx2_ctr_enc: vextracti128 $1, RTMP0, RTMP0x; vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */ -.align 4 .Lctr_carry_done: /* store new IV */ vmovdqu RTMP0x, (%rcx); +.align 8 +.Lload_ctr_done: call __sm4_gfni_crypt_blk16; vpxor (0 * 32)(%rdx), RA0, RA0; @@ -753,6 +784,36 @@ _gcry_sm4_gfni_avx2_ctr_enc: vzeroall; ret_spec_stop; + +.align 8 +.Lctr_byteadd_full_ctr_carry: + movq 8(%rcx), %r11; + movq (%rcx), %r10; + bswapq %r11; + bswapq %r10; + addq $16, %r11; + adcq $0, %r10; + bswapq %r11; + bswapq %r10; + movq %r11, 8(%rcx); + movq %r10, (%rcx); + jmp .Lctr_byteadd_ymm; +.align 8 
+.Lctr_byteadd: + vbroadcasti128 (%rcx), RB3; + je .Lctr_byteadd_full_ctr_carry; + addb $16, 15(%rcx); +.Lctr_byteadd_ymm: + vpaddb .Lbige_addb_0_1 rRIP, RB3, RA0; + vpaddb .Lbige_addb_2_3 rRIP, RB3, RA1; + vpaddb .Lbige_addb_4_5 rRIP, RB3, RA2; + vpaddb .Lbige_addb_6_7 rRIP, RB3, RA3; + vpaddb .Lbige_addb_8_9 rRIP, RB3, RB0; + vpaddb .Lbige_addb_10_11 rRIP, RB3, RB1; + vpaddb .Lbige_addb_12_13 rRIP, RB3, RB2; + vpaddb .Lbige_addb_14_15 rRIP, RB3, RB3; + + jmp .Lload_ctr_done; CFI_ENDPROC(); ELF(.size _gcry_sm4_gfni_avx2_ctr_enc,.-_gcry_sm4_gfni_avx2_ctr_enc;) diff --git a/cipher/sm4-gfni-avx512-amd64.S b/cipher/sm4-gfni-avx512-amd64.S index b095f85d..91f6e80b 100644 --- a/cipher/sm4-gfni-avx512-amd64.S +++ b/cipher/sm4-gfni-avx512-amd64.S @@ -1,6 +1,6 @@ /* sm4-gfni-avx512-amd64.S - GFNI/AVX512 implementation of SM4 cipher * - * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * Copyright (C) 2022-2023 Jussi Kivilinna <jussi.kivilinna@iki.fi> * * This file is part of Libgcrypt. 
* @@ -146,6 +146,35 @@ SECTION_RODATA .quad 2, 0 .quad 3, 0 +/* CTR byte addition constants */ +.align 64 +.Lbige_addb_0_1: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 +.Lbige_addb_2_3: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 +.Lbige_addb_4_5: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 +.Lbige_addb_6_7: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 +.Lbige_addb_8_9: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9 +.Lbige_addb_10_11: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11 +.Lbige_addb_12_13: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13 +.Lbige_addb_14_15: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15 +.Lbige_addb_16: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 + .text .align 16 @@ -627,6 +656,9 @@ _gcry_sm4_gfni_avx512_ctr_enc: CFI_STARTPROC(); spec_stop_avx512; + cmpb $(0x100 - 16), 15(%rcx); + jbe .Lctr_byteadd16; + vbroadcasti128 .Lbswap128_mask rRIP, RTMP0; vmovdqa .Lcounter0123_lo rRIP, RTMP1; vbroadcasti128 .Lcounter2222_lo rRIP, RTMP2; @@ -695,6 +727,8 @@ _gcry_sm4_gfni_avx512_ctr_enc: vpshufb RTMP0, RB2, RB2; vpshufb RTMP0, RB3, RB3; +.align 16 +.Lload_ctr_done16: call __sm4_gfni_crypt_blk16; vpxor (0 * 32)(%rdx), RA0, RA0; @@ -719,6 +753,36 @@ _gcry_sm4_gfni_avx512_ctr_enc: kxorq %k1, %k1, %k1; ret_spec_stop; + +.align 16 +.Lctr_byteadd_full_ctr_carry16: + movq 8(%rcx), %r11; + movq (%rcx), %r10; + bswapq %r11; + bswapq %r10; + addq $16, %r11; + adcq $0, %r10; + bswapq %r11; + bswapq %r10; + movq %r11, 8(%rcx); + movq 
%r10, (%rcx); + jmp .Lctr_byteadd_ymm16; +.align 16 +.Lctr_byteadd16: + vbroadcasti128 (%rcx), RB3; + je .Lctr_byteadd_full_ctr_carry16; + addb $16, 15(%rcx); +.Lctr_byteadd_ymm16: + vpaddb .Lbige_addb_0_1 rRIP, RB3, RA0; + vpaddb .Lbige_addb_2_3 rRIP, RB3, RA1; + vpaddb .Lbige_addb_4_5 rRIP, RB3, RA2; + vpaddb .Lbige_addb_6_7 rRIP, RB3, RA3; + vpaddb .Lbige_addb_8_9 rRIP, RB3, RB0; + vpaddb .Lbige_addb_10_11 rRIP, RB3, RB1; + vpaddb .Lbige_addb_12_13 rRIP, RB3, RB2; + vpaddb .Lbige_addb_14_15 rRIP, RB3, RB3; + + jmp .Lload_ctr_done16; CFI_ENDPROC(); ELF(.size _gcry_sm4_gfni_avx512_ctr_enc,.-_gcry_sm4_gfni_avx512_ctr_enc;) @@ -1304,6 +1368,9 @@ _gcry_sm4_gfni_avx512_ctr_enc_blk32: CFI_STARTPROC(); spec_stop_avx512; + cmpb $(0x100 - 32), 15(%rcx); + jbe .Lctr_byteadd32; + vbroadcasti64x2 .Lbswap128_mask rRIP, RTMP0z; vmovdqa32 .Lcounter0123_lo rRIP, RTMP1z; vbroadcasti64x2 .Lcounter4444_lo rRIP, RTMP2z; @@ -1372,6 +1439,8 @@ _gcry_sm4_gfni_avx512_ctr_enc_blk32: vpshufb RTMP0z, RB2z, RB2z; vpshufb RTMP0z, RB3z, RB3z; +.align 16 +.Lload_ctr_done32: call __sm4_gfni_crypt_blk32; vpxord (0 * 64)(%rdx), RA0z, RA0z; @@ -1396,6 +1465,38 @@ _gcry_sm4_gfni_avx512_ctr_enc_blk32: kxorq %k1, %k1, %k1; ret_spec_stop; + +.align 16 +.Lctr_byteadd_full_ctr_carry32: + movq 8(%rcx), %r11; + movq (%rcx), %r10; + bswapq %r11; + bswapq %r10; + addq $32, %r11; + adcq $0, %r10; + bswapq %r11; + bswapq %r10; + movq %r11, 8(%rcx); + movq %r10, (%rcx); + jmp .Lctr_byteadd_zmm32; +.align 16 +.Lctr_byteadd32: + vbroadcasti64x2 (%rcx), RA3z; + je .Lctr_byteadd_full_ctr_carry32; + addb $32, 15(%rcx); +.Lctr_byteadd_zmm32: + vbroadcasti64x2 .Lbige_addb_16 rRIP, RB3z; + vpaddb RB3z, RA3z, RB3z; + vpaddb .Lbige_addb_0_1 rRIP, RA3z, RA0z; + vpaddb .Lbige_addb_4_5 rRIP, RA3z, RA1z; + vpaddb .Lbige_addb_8_9 rRIP, RA3z, RA2z; + vpaddb .Lbige_addb_12_13 rRIP, RA3z, RA3z; + vpaddb .Lbige_addb_0_1 rRIP, RB3z, RB0z; + vpaddb .Lbige_addb_4_5 rRIP, RB3z, RB1z; + vpaddb .Lbige_addb_8_9 rRIP, RB3z, RB2z; + 
vpaddb .Lbige_addb_12_13 rRIP, RB3z, RB3z; + + jmp .Lload_ctr_done32; CFI_ENDPROC(); ELF(.size _gcry_sm4_gfni_avx512_ctr_enc_blk32,.-_gcry_sm4_gfni_avx512_ctr_enc_blk32;) |