diff options
author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2023-02-19 18:39:36 +0200 |
---|---|---|
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2023-02-22 20:27:56 +0200 |
commit | 978b02fca682c9ecb71e30cdeeb6922fc8331f6e (patch) | |
tree | 58e53a100647fe286ecb50994a66f156cffcece4 /cipher/sm4-aesni-avx2-amd64.S | |
parent | 8f7f5a9fc63968304bacedbc2f22b9f7188bbd53 (diff) | |
download | libgcrypt-978b02fca682c9ecb71e30cdeeb6922fc8331f6e.tar.gz |
sm4: add CTR-mode byte addition for AVX/AVX2/AVX512 implementations
* cipher/sm4-aesni-avx-amd64.S
(_gcry_sm4_aesni_avx_ctr_enc): Add byte addition fast-path.
* cipher/sm4-aesni-avx2-amd64.S
(_gcry_sm4_aesni_avx2_ctr_enc): Likewise.
* cipher/sm4-gfni-avx2-amd64.S
(_gcry_sm4_gfni_avx2_ctr_enc): Likewise.
* cipher/sm4-gfni-avx512-amd64.S
(_gcry_sm4_gfni_avx512_ctr_enc)
(_gcry_sm4_gfni_avx512_ctr_enc_blk32): Likewise.
--
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/sm4-aesni-avx2-amd64.S')
-rw-r--r-- | cipher/sm4-aesni-avx2-amd64.S | 65 |
1 file changed, 63 insertions, 2 deletions
diff --git a/cipher/sm4-aesni-avx2-amd64.S b/cipher/sm4-aesni-avx2-amd64.S index acd37cff..03f979fa 100644 --- a/cipher/sm4-aesni-avx2-amd64.S +++ b/cipher/sm4-aesni-avx2-amd64.S @@ -1,6 +1,6 @@ /* sm4-avx2-amd64.S - AVX2 implementation of SM4 cipher * - * Copyright (C) 2020, 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * Copyright (C) 2020, 2022-2023 Jussi Kivilinna <jussi.kivilinna@iki.fi> * * This file is part of Libgcrypt. * @@ -171,6 +171,33 @@ _sm4_aesni_avx2_consts: .Lbswap32_mask: .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 +/* CTR byte addition constants */ +.align 32 +.Lbige_addb_0_1: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 +.Lbige_addb_2_3: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3 +.Lbige_addb_4_5: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 +.Lbige_addb_6_7: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7 +.Lbige_addb_8_9: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9 +.Lbige_addb_10_11: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11 +.Lbige_addb_12_13: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13 +.Lbige_addb_14_15: + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15 + .align 4 /* 4-bit mask */ .L0f0f0f0f: @@ -371,6 +398,9 @@ _gcry_sm4_aesni_avx2_ctr_enc: */ CFI_STARTPROC(); + cmpb $(0x100 - 16), 15(%rcx); + jbe .Lctr_byteadd; + movq 8(%rcx), %rax; bswapq %rax; @@ -438,11 +468,12 @@ _gcry_sm4_aesni_avx2_ctr_enc: vextracti128 $1, RTMP0, RTMP0x; vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */ -.align 4 .Lctr_carry_done: /* store 
new IV */ vmovdqu RTMP0x, (%rcx); +.align 8 +.Lload_ctr_done: call __sm4_crypt_blk16; vpxor (0 * 32)(%rdx), RA0, RA0; @@ -466,6 +497,36 @@ _gcry_sm4_aesni_avx2_ctr_enc: vzeroall; ret_spec_stop; + +.align 8 +.Lctr_byteadd_full_ctr_carry: + movq 8(%rcx), %r11; + movq (%rcx), %r10; + bswapq %r11; + bswapq %r10; + addq $16, %r11; + adcq $0, %r10; + bswapq %r11; + bswapq %r10; + movq %r11, 8(%rcx); + movq %r10, (%rcx); + jmp .Lctr_byteadd_ymm; +.align 8 +.Lctr_byteadd: + vbroadcasti128 (%rcx), RB3; + je .Lctr_byteadd_full_ctr_carry; + addb $16, 15(%rcx); +.Lctr_byteadd_ymm: + vpaddb .Lbige_addb_0_1 rRIP, RB3, RA0; + vpaddb .Lbige_addb_2_3 rRIP, RB3, RA1; + vpaddb .Lbige_addb_4_5 rRIP, RB3, RA2; + vpaddb .Lbige_addb_6_7 rRIP, RB3, RA3; + vpaddb .Lbige_addb_8_9 rRIP, RB3, RB0; + vpaddb .Lbige_addb_10_11 rRIP, RB3, RB1; + vpaddb .Lbige_addb_12_13 rRIP, RB3, RB2; + vpaddb .Lbige_addb_14_15 rRIP, RB3, RB3; + + jmp .Lload_ctr_done; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_ctr_enc,.-_gcry_sm4_aesni_avx2_ctr_enc;) |