author | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2022-03-09 20:15:52 +0200
committer | Jussi Kivilinna <jussi.kivilinna@iki.fi> | 2022-03-09 20:29:59 +0200
commit | d820d27a3bce6365523fbcb6ec607b23dd4ca4e2 (patch)
tree | 06e9a3f121b2d94fc8dcfac2a548780cd9090294 /cipher/rijndael-vaes-avx2-amd64.S
parent | e6f360019369fff42411b4cca976cc8ebe09281d (diff)
download | libgcrypt-d820d27a3bce6365523fbcb6ec607b23dd4ca4e2.tar.gz
rijndael-vaes-avx2: perform checksumming inline
* cipher/rijndael-vaes-avx2-amd64.S
(_gcry_vaes_avx2_ocb_checksum): Remove.
(_gcry_vaes_avx2_ocb_crypt_amd64): Add inline checksumming.
--
The VAES/AVX2 OCB encryption implementation had the same performance drop with
large buffers as the AES-NI/OCB implementation; see
e924ce456d5728a81c148de4a6eb23373cb70ca0 for details. This patch changes
VAES/AVX2 OCB to perform checksumming inline with encryption and decryption
instead of using a 2-pass approach. Inline checksumming also gives a nice
~6% speed boost.
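To make the idea concrete, here is a minimal C sketch of the two strategies;
the helper names (ocb_checksum_2pass, ocb_crypt_inline_checksum, encrypt_block)
are hypothetical and this only illustrates the data flow, not the libgcrypt code:

    #include <stddef.h>
    #include <stdint.h>

    /* 2-pass: walk the whole plaintext once just for the checksum, then
     * again (inside the cipher) for encryption.  With large buffers the
     * second pass re-reads data that has already left the cache. */
    static void
    ocb_checksum_2pass (uint8_t checksum[16], const uint8_t *plain, size_t nblks)
    {
      for (size_t i = 0; i < nblks; i++)
        for (size_t j = 0; j < 16; j++)
          checksum[j] ^= plain[i * 16 + j];
    }

    /* Inline: XOR each plaintext block into the checksum while it is being
     * processed for encryption, so every block is touched only once.
     * encrypt_block stands in for the OCB block transform (offsets omitted). */
    static void
    ocb_crypt_inline_checksum (uint8_t checksum[16], uint8_t *out,
                               const uint8_t *plain, size_t nblks,
                               void (*encrypt_block) (uint8_t *dst, const uint8_t *src))
    {
      for (size_t i = 0; i < nblks; i++)
        {
          for (size_t j = 0; j < 16; j++)
            checksum[j] ^= plain[i * 16 + j];   /* checksumming inline */
          encrypt_block (out + i * 16, plain + i * 16);
        }
    }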
Benchmark on Intel Core i3-1115G4:

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        OCB enc |     0.044 ns/B     21569 MiB/s     0.181 c/B      4089
        OCB dec |     0.045 ns/B     21298 MiB/s     0.183 c/B      4089

After:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        OCB enc |     0.042 ns/B     22922 MiB/s     0.170 c/B      4089
        OCB dec |     0.042 ns/B     22676 MiB/s     0.172 c/B      4089
GnuPG-bug-id: T5875
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Diffstat (limited to 'cipher/rijndael-vaes-avx2-amd64.S')
-rw-r--r-- | cipher/rijndael-vaes-avx2-amd64.S | 424
1 file changed, 187 insertions, 237 deletions
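In the patched assembly below, the checksum is accumulated in two 256-bit
registers (%ymm13/%ymm14) while blocks are being en-/decrypted, and is folded
into the 16-byte checksum block only once, at .Ldone_ocb. A rough C/AVX2
intrinsics sketch of that final fold (hypothetical function name; assumes an
AVX2-enabled compiler):

    #include <immintrin.h>

    /* Fold two 256-bit checksum accumulators down to 128 bits and XOR the
     * result into the checksum block, mirroring the .Ldone_ocb sequence:
     * ymm14 ^= ymm13; split into 128-bit halves; XOR the halves and the
     * old checksum together; store the result. */
    static void
    ocb_checksum_fold (unsigned char checksum[16], __m256i acc0, __m256i acc1)
    {
      __m256i sum256 = _mm256_xor_si256 (acc0, acc1);
      __m128i hi     = _mm256_extracti128_si256 (sum256, 1);
      __m128i lo     = _mm256_castsi256_si128 (sum256);
      __m128i chk    = _mm_loadu_si128 ((const __m128i *) checksum);
      chk = _mm_xor_si128 (chk, _mm_xor_si128 (hi, lo));
      _mm_storeu_si128 ((__m128i *) checksum, chk);
    }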
diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S
index f94b58db..e36e82a0 100644
--- a/cipher/rijndael-vaes-avx2-amd64.S
+++ b/cipher/rijndael-vaes-avx2-amd64.S
@@ -1394,151 +1394,6 @@ ELF(.size _gcry_vaes_avx2_ctr32le_enc_amd64,.-_gcry_vaes_avx2_ctr32le_enc_amd64)
 /**********************************************************************
   OCB-mode encryption/decryption
  **********************************************************************/
-ELF(.type _gcry_vaes_avx2_ocb_checksum,@function)
-_gcry_vaes_avx2_ocb_checksum:
-        /* input:
-         *      %rax: offset pointer
-         *      %r10: plaintext pointer
-         *      %r11: nblocks
-         */
-        CFI_STARTPROC();
-
-        vpxor %xmm0, %xmm0, %xmm0;
-        cmpq $4, %r11;
-        jb .Locb_checksum_blk1;
-        vpxor %xmm1, %xmm1, %xmm1;
-        vpxor %xmm2, %xmm2, %xmm2;
-        vpxor %xmm3, %xmm3, %xmm3;
-        cmpq $16, %r11;
-        jb .Locb_checksum_blk4;
-        vpxor %xmm4, %xmm4, %xmm4;
-        vpxor %xmm5, %xmm5, %xmm5;
-        vpxor %xmm6, %xmm6, %xmm6;
-        vpxor %xmm7, %xmm7, %xmm7;
-        cmpq $32, %r11;
-        jb .Locb_checksum_blk16;
-        vpxor %xmm8, %xmm8, %xmm8;
-        vpxor %xmm9, %xmm9, %xmm9;
-        vpxor %xmm10, %xmm10, %xmm10;
-        vpxor %xmm11, %xmm11, %xmm11;
-        vpxor %xmm12, %xmm12, %xmm12;
-        vpxor %xmm13, %xmm13, %xmm13;
-        vpxor %xmm14, %xmm14, %xmm14;
-        vpxor %xmm15, %xmm15, %xmm15;
-
-.align 8
-.Locb_checksum_blk32:
-        cmpq $32, %r11;
-        jb .Locb_checksum_blk32_done;
-
-        leaq -32(%r11), %r11;
-
-        vpxor (0 * 16)(%r10), %ymm0, %ymm0;
-        vpxor (2 * 16)(%r10), %ymm1, %ymm1;
-        vpxor (4 * 16)(%r10), %ymm2, %ymm2;
-        vpxor (6 * 16)(%r10), %ymm3, %ymm3;
-        vpxor (8 * 16)(%r10), %ymm4, %ymm4;
-        vpxor (10 * 16)(%r10), %ymm5, %ymm5;
-        vpxor (12 * 16)(%r10), %ymm6, %ymm6;
-        vpxor (14 * 16)(%r10), %ymm7, %ymm7;
-        vpxor (16 * 16)(%r10), %ymm8, %ymm8;
-        vpxor (18 * 16)(%r10), %ymm9, %ymm9;
-        vpxor (20 * 16)(%r10), %ymm10, %ymm10;
-        vpxor (22 * 16)(%r10), %ymm11, %ymm11;
-        vpxor (24 * 16)(%r10), %ymm12, %ymm12;
-        vpxor (26 * 16)(%r10), %ymm13, %ymm13;
-        vpxor (28 * 16)(%r10), %ymm14, %ymm14;
-        vpxor (30 * 16)(%r10), %ymm15, %ymm15;
-        leaq (32 * 16)(%r10), %r10;
-
-        jmp .Locb_checksum_blk32;
-
-.align 8
-.Locb_checksum_blk32_done:
-        vpxor %ymm8, %ymm0, %ymm0;
-        vpxor %ymm9, %ymm1, %ymm1;
-        vpxor %ymm10, %ymm2, %ymm2;
-        vpxor %ymm11, %ymm3, %ymm3;
-        vpxor %ymm12, %ymm4, %ymm4;
-        vpxor %ymm13, %ymm5, %ymm5;
-        vpxor %ymm14, %ymm6, %ymm6;
-        vpxor %ymm15, %ymm7, %ymm7;
-
-.align 8
-.Locb_checksum_blk16:
-        cmpq $16, %r11;
-        jb .Locb_checksum_blk16_done;
-
-        leaq -16(%r11), %r11;
-
-        vpxor (0 * 16)(%r10), %ymm0, %ymm0;
-        vpxor (2 * 16)(%r10), %ymm1, %ymm1;
-        vpxor (4 * 16)(%r10), %ymm2, %ymm2;
-        vpxor (6 * 16)(%r10), %ymm3, %ymm3;
-        vpxor (8 * 16)(%r10), %ymm4, %ymm4;
-        vpxor (10 * 16)(%r10), %ymm5, %ymm5;
-        vpxor (12 * 16)(%r10), %ymm6, %ymm6;
-        vpxor (14 * 16)(%r10), %ymm7, %ymm7;
-        leaq (16 * 16)(%r10), %r10;
-
-        jmp .Locb_checksum_blk16;
-
-.align 8
-.Locb_checksum_blk16_done:
-        vpxor %ymm4, %ymm0, %ymm0;
-        vpxor %ymm5, %ymm1, %ymm1;
-        vpxor %ymm6, %ymm2, %ymm2;
-        vpxor %ymm7, %ymm3, %ymm3;
-        vextracti128 $1, %ymm0, %xmm4;
-        vextracti128 $1, %ymm1, %xmm5;
-        vextracti128 $1, %ymm2, %xmm6;
-        vextracti128 $1, %ymm3, %xmm7;
-        vpxor %xmm4, %xmm0, %xmm0;
-        vpxor %xmm5, %xmm1, %xmm1;
-        vpxor %xmm6, %xmm2, %xmm2;
-        vpxor %xmm7, %xmm3, %xmm3;
-
-.align 8
-.Locb_checksum_blk4:
-        cmpq $4, %r11;
-        jb .Locb_checksum_blk4_done;
-
-        leaq -4(%r11), %r11;
-
-        vpxor (0 * 16)(%r10), %xmm0, %xmm0;
-        vpxor (1 * 16)(%r10), %xmm1, %xmm1;
-        vpxor (2 * 16)(%r10), %xmm2, %xmm2;
-        vpxor (3 * 16)(%r10), %xmm3, %xmm3;
-        leaq (4 * 16)(%r10), %r10;
-
-        jmp .Locb_checksum_blk4;
-
-.align 8
-.Locb_checksum_blk4_done:
-        vpxor %xmm1, %xmm0, %xmm0;
-        vpxor %xmm3, %xmm2, %xmm2;
-        vpxor %xmm2, %xmm0, %xmm0;
-
-.align 8
-.Locb_checksum_blk1:
-        cmpq $1, %r11;
-        jb .Locb_checksum_done;
-
-        leaq -1(%r11), %r11;
-
-        vpxor (%r10), %xmm0, %xmm0;
-        leaq 16(%r10), %r10;
-
-        jmp .Locb_checksum_blk1;
-
-.align 8
-.Locb_checksum_done:
-        vpxor (%rax), %xmm0, %xmm0;
-        vmovdqu %xmm0, (%rax);
-        ret_spec_stop;
-        CFI_ENDPROC();
-ELF(.size _gcry_vaes_avx2_ocb_checksum,.-_gcry_vaes_avx2_ocb_checksum)
-
 ELF(.type _gcry_vaes_avx2_ocb_crypt_amd64,@function)
 .globl _gcry_vaes_avx2_ocb_crypt_amd64
 _gcry_vaes_avx2_ocb_crypt_amd64:
@@ -1556,8 +1411,12 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
          */
         CFI_STARTPROC();
 
-#define STACK_REGS_POS (16 * 16 + 4 * 16)
-#define STACK_ALLOC (STACK_REGS_POS + 6 * 8)
+#define STACK_REGS_POS (16 * 16 + 4 * 16 + 2 * 16)
+#define STACK_ALLOC (STACK_REGS_POS + 5 * 8)
+#define OFFSET_PTR_Q 16(%rbp)
+#define CHECKSUM_PTR_Q 24(%rbp)
+#define L_ARRAY_PTR_L 32(%rbp)
+#define ENCRYPT_FLAG_L 40(%rbp)
 
         pushq %rbp;
         CFI_PUSH(%rbp);
@@ -1575,37 +1434,20 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
         CFI_REG_ON_STACK(r14, STACK_REGS_POS + 2 * 8);
         movq %r15, (STACK_REGS_POS + 3 * 8)(%rsp);
         CFI_REG_ON_STACK(r15, STACK_REGS_POS + 3 * 8);
+        movq %rbx, (STACK_REGS_POS + 4 * 8)(%rsp);
+        CFI_REG_ON_STACK(rbx, STACK_REGS_POS + 4 * 8);
 
-        movl 40(%rbp), %r15d; /* encrypt-flag. */
-        movq 16(%rbp), %r14; /* offset ptr. */
-
-        /* Handle encryption checksumming. */
-        testl %r15d, %r15d;
-        jz .Locb_dec_checksum_prepare;
-        movq 24(%rbp), %rax; /* checksum ptr. */
-        movq %rcx, %r10;
-        movq %r8, %r11;
-        call _gcry_vaes_avx2_ocb_checksum;
-        jmp .Locb_enc_checksum_done;
-.Locb_dec_checksum_prepare:
-        /* Store plaintext address and number of blocks for decryption
-         * checksumming. */
-        movq %rdx, (STACK_REGS_POS + 4 * 8)(%rsp);
-        movq %r8, (STACK_REGS_POS + 5 * 8)(%rsp);
-.Locb_enc_checksum_done:
+        movl ENCRYPT_FLAG_L, %r15d; /* encrypt-flag. */
+        movq OFFSET_PTR_Q, %r14; /* offset ptr. */
+        movq CHECKSUM_PTR_Q, %rbx; /* checksum ptr. */
+        leal (, %r9d, 4), %eax;
 
         vmovdqu (%r14), %xmm15; /* Load offset. */
-        movq 32(%rbp), %r14; /* L-array ptr. */
+        movq L_ARRAY_PTR_L, %r14; /* L-array ptr. */
         vmovdqa (0 * 16)(%rdi), %xmm0; /* first key */
-        movl $(10 * 16), %eax;
-        cmpl $12, %r9d;
-        jb .Llast_key_ptr;
-        movl $(12 * 16), %eax;
-        je .Llast_key_ptr;
-        movl $(14 * 16), %eax;
-  .align 8
-  .Llast_key_ptr:
-        vpxor (%rdi, %rax), %xmm0, %xmm0; /* first key ^ last key */
+        vpxor %xmm14, %xmm14, %xmm14;
+        vpxor %xmm13, %xmm13, %xmm13;
+        vpxor (%rdi, %rax, 4), %xmm0, %xmm0; /* first key ^ last key */
         vpxor (0 * 16)(%rdi), %xmm15, %xmm15; /* offset ^ first key */
         vmovdqa %xmm0, (14 * 16)(%rsp);
         vmovdqa %xmm0, (15 * 16)(%rsp);
@@ -1678,16 +1520,24 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
         vinserti128 $1, %xmm10, %ymm9, %ymm7;
         vinserti128 $1, %xmm15, %ymm11, %ymm8;
 
-        vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
-        vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
-        vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
-        vpxor (6 * 16)(%rcx), %ymm8, %ymm3;
-        leaq (8 * 16)(%rcx), %rcx;
-
-        vmovdqa (14 * 16)(%rsp), %ymm9;
-
         testl %r15d, %r15d;
         jz .Locb_unaligned_blk8_dec;
+          vmovdqu (0 * 16)(%rcx), %ymm0;
+          vmovdqu (2 * 16)(%rcx), %ymm1;
+          vmovdqu (4 * 16)(%rcx), %ymm2;
+          vmovdqu (6 * 16)(%rcx), %ymm3;
+          leaq (8 * 16)(%rcx), %rcx;
+          vpxor %ymm0, %ymm14, %ymm14;
+          vpxor %ymm1, %ymm13, %ymm13;
+          vpxor %ymm2, %ymm14, %ymm14;
+          vpxor %ymm3, %ymm13, %ymm13;
+          vpxor %ymm5, %ymm0, %ymm0;
+          vpxor %ymm6, %ymm1, %ymm1;
+          vpxor %ymm7, %ymm2, %ymm2;
+          vpxor %ymm8, %ymm3, %ymm3;
+
+          vmovdqa (14 * 16)(%rsp), %ymm9;
+
           /* AES rounds */
           vbroadcasti128 (1 * 16)(%rdi), %ymm4;
           VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
@@ -1739,6 +1589,14 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 
 .align 8
 .Locb_unaligned_blk8_dec:
+        vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
+        vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
+        vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
+        vpxor (6 * 16)(%rcx), %ymm8, %ymm3;
+        leaq (8 * 16)(%rcx), %rcx;
+
+        vmovdqa (14 * 16)(%rsp), %ymm9;
+
         /* AES rounds */
         vbroadcasti128 (1 * 16)(%rdi), %ymm4;
         VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
@@ -1780,6 +1638,10 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
         vaesdeclast %ymm6, %ymm1, %ymm1;
         vaesdeclast %ymm7, %ymm2, %ymm2;
         vaesdeclast %ymm4, %ymm3, %ymm3;
+        vpxor %ymm0, %ymm14, %ymm14;
+        vpxor %ymm1, %ymm13, %ymm13;
+        vpxor %ymm2, %ymm14, %ymm14;
+        vpxor %ymm3, %ymm13, %ymm13;
         vmovdqu %ymm0, (0 * 16)(%rdx);
         vmovdqu %ymm1, (2 * 16)(%rdx);
         vmovdqu %ymm2, (4 * 16)(%rdx);
@@ -1817,12 +1679,16 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
         vpxor (%r14, %rax), %xmm7, %xmm15;
         vinserti128 $1, %xmm15, %ymm7, %ymm6;
 
-        vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
-        vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
-        leaq (4 * 16)(%rcx), %rcx;
-
         testl %r15d, %r15d;
         jz .Locb_unaligned_blk4_dec;
+          vmovdqu (0 * 16)(%rcx), %ymm0;
+          vmovdqu (2 * 16)(%rcx), %ymm1;
+          leaq (4 * 16)(%rcx), %rcx;
+          vpxor %ymm0, %ymm14, %ymm14;
+          vpxor %ymm1, %ymm13, %ymm13;
+          vpxor %ymm5, %ymm0, %ymm0;
+          vpxor %ymm6, %ymm1, %ymm1;
+
           /* AES rounds */
           vbroadcasti128 (1 * 16)(%rdi), %ymm4;
           VAESENC2(%ymm4, %ymm0, %ymm1);
@@ -1869,6 +1735,10 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 
 .align 8
 .Locb_unaligned_blk4_dec:
+        vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
+        vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
+        leaq (4 * 16)(%rcx), %rcx;
+
         /* AES rounds */
         vbroadcasti128 (1 * 16)(%rdi), %ymm4;
         VAESDEC2(%ymm4, %ymm0, %ymm1);
@@ -1907,6 +1777,8 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
         vpxor %ymm6, %ymm8, %ymm6;
         vaesdeclast %ymm5, %ymm0, %ymm0;
         vaesdeclast %ymm6, %ymm1, %ymm1;
+        vpxor %ymm0, %ymm14, %ymm14;
+        vpxor %ymm1, %ymm13, %ymm13;
         vmovdqu %ymm0, (0 * 16)(%rdx);
         vmovdqu %ymm1, (2 * 16)(%rdx);
         leaq (4 * 16)(%rdx), %rdx;
@@ -1924,11 +1796,14 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
         tzcntl %esi, %r11d;
         shll $4, %r11d;
         vpxor (%r14, %r11), %xmm15, %xmm15;
-        vpxor (%rcx), %xmm15, %xmm0;
-        leaq 16(%rcx), %rcx;
 
         testl %r15d, %r15d;
         jz .Locb_unaligned_blk1_dec;
+          vmovdqu (%rcx), %xmm0;
+          vpxor %ymm0, %ymm14, %ymm14;
+          vpxor %xmm15, %xmm0, %xmm0;
+          leaq 16(%rcx), %rcx;
+
           /* AES rounds. */
           vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
           vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
@@ -1958,6 +1833,9 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 
 .align 8
 .Locb_unaligned_blk1_dec:
+        vpxor (%rcx), %xmm15, %xmm0;
+        leaq 16(%rcx), %rcx;
+
         /* AES rounds. */
         vaesdec (1 * 16)(%rdi), %xmm0, %xmm0;
         vaesdec (2 * 16)(%rdi), %xmm0, %xmm0;
@@ -1980,6 +1858,7 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 .Locb_unaligned_blk1_dec_last:
         vpxor (14 * 16)(%rsp), %xmm15, %xmm1;
         vaesdeclast %xmm1, %xmm0, %xmm0;
+        vpxor %ymm0, %ymm14, %ymm14;
         vmovdqu %xmm0, (%rdx);
         leaq 16(%rdx), %rdx;
 
@@ -2021,6 +1900,8 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
         vmovdqu (1 * 16)(%r14), %xmm1;
         vmovdqu (2 * 16)(%r14), %xmm2;
         vmovdqu (3 * 16)(%r14), %xmm3;
+        vpxor %ymm13, %ymm14, %ymm14;
+        vmovdqa %ymm14, (20 * 16)(%rsp);
         vpxor %xmm0, %xmm1, %xmm4; /* L[0] ^ L[1] */
         vpxor %xmm0, %xmm2, %xmm5; /* L[0] ^ L[2] */
         vpxor %xmm0, %xmm3, %xmm6; /* L[0] ^ L[3] */
@@ -2069,26 +1950,40 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
         vpxor (%r14, %rax), %xmm13, %xmm14; /* offset ^ first key ^ L[3] ^ L[ntz{nblk+16}] */
         vinserti128 $1, %xmm14, %ymm13, %ymm14;
 
-        vpxor (10 * 16)(%rsp), %ymm15, %ymm13;
-        vpxor (14 * 16)(%rcx), %ymm14, %ymm7;
-
-        vpxor (0 * 16)(%rcx), %ymm8, %ymm0;
-        vpxor (2 * 16)(%rcx), %ymm9, %ymm1;
-        vpxor (4 * 16)(%rcx), %ymm10, %ymm2;
-        vpxor (6 * 16)(%rcx), %ymm11, %ymm3;
-        vpxor (8 * 16)(%rcx), %ymm12, %ymm4;
-        vpxor (10 * 16)(%rcx), %ymm13, %ymm5;
-        vmovdqa %ymm13, (16 * 16)(%rsp);
-        vpxor (12 * 16)(%rsp), %ymm15, %ymm13;
-        vpxor (12 * 16)(%rcx), %ymm13, %ymm6;
-        vmovdqa %ymm13, (18 * 16)(%rsp);
-
-        leaq (16 * 16)(%rcx), %rcx;
-
-        vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
-
         testl %r15d, %r15d;
         jz .Locb_aligned_blk16_dec;
+          vmovdqu (0 * 16)(%rcx), %ymm0;
+          vmovdqu (2 * 16)(%rcx), %ymm1;
+          vmovdqu (4 * 16)(%rcx), %ymm2;
+          vmovdqu (6 * 16)(%rcx), %ymm3;
+          vpxor (8 * 16)(%rcx), %ymm0, %ymm4;
+          vpxor (10 * 16)(%rcx), %ymm1, %ymm5;
+          vpxor (12 * 16)(%rcx), %ymm2, %ymm6;
+          vpxor (14 * 16)(%rcx), %ymm3, %ymm7;
+          vpxor %ymm4, %ymm5, %ymm5;
+          vpxor %ymm6, %ymm7, %ymm7;
+          vpxor %ymm5, %ymm7, %ymm7;
+          vpxor (20 * 16)(%rsp), %ymm7, %ymm7;
+          vmovdqa %ymm7, (20 * 16)(%rsp);
+
+          vpxor (10 * 16)(%rsp), %ymm15, %ymm13;
+          vpxor (14 * 16)(%rcx), %ymm14, %ymm7;
+
+          vpxor %ymm8, %ymm0, %ymm0;
+          vpxor %ymm9, %ymm1, %ymm1;
+          vpxor %ymm10, %ymm2, %ymm2;
+          vpxor %ymm11, %ymm3, %ymm3;
+          vpxor (8 * 16)(%rcx), %ymm12, %ymm4;
+          vpxor (10 * 16)(%rcx), %ymm13, %ymm5;
+          vmovdqa %ymm13, (16 * 16)(%rsp);
+          vpxor (12 * 16)(%rsp), %ymm15, %ymm13;
+          vpxor (12 * 16)(%rcx), %ymm13, %ymm6;
+          vmovdqa %ymm13, (18 * 16)(%rsp);
+
+          leaq (16 * 16)(%rcx), %rcx;
+
+          vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
+
           /* AES rounds */
           vbroadcasti128 (1 * 16)(%rdi), %ymm13;
           VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
@@ -2153,6 +2048,24 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 
 .align 8
 .Locb_aligned_blk16_dec:
+        vpxor (10 * 16)(%rsp), %ymm15, %ymm13;
+        vpxor (14 * 16)(%rcx), %ymm14, %ymm7;
+
+        vpxor (0 * 16)(%rcx), %ymm8, %ymm0;
+        vpxor (2 * 16)(%rcx), %ymm9, %ymm1;
+        vpxor (4 * 16)(%rcx), %ymm10, %ymm2;
+        vpxor (6 * 16)(%rcx), %ymm11, %ymm3;
+        vpxor (8 * 16)(%rcx), %ymm12, %ymm4;
+        vpxor (10 * 16)(%rcx), %ymm13, %ymm5;
+        vmovdqa %ymm13, (16 * 16)(%rsp);
+        vpxor (12 * 16)(%rsp), %ymm15, %ymm13;
+        vpxor (12 * 16)(%rcx), %ymm13, %ymm6;
+        vmovdqa %ymm13, (18 * 16)(%rsp);
+
+        leaq (16 * 16)(%rcx), %rcx;
+
+        vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
+
         /* AES rounds */
         vbroadcasti128 (1 * 16)(%rdi), %ymm13;
         VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
@@ -2207,12 +2120,22 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
         vmovdqu %ymm1, (2 * 16)(%rdx);
         vmovdqu %ymm2, (4 * 16)(%rdx);
         vmovdqu %ymm3, (6 * 16)(%rdx);
+        vpxor %ymm1, %ymm0, %ymm0;
+        vpxor %ymm3, %ymm2, %ymm2;
         vmovdqu %ymm4, (8 * 16)(%rdx);
         vmovdqu %ymm5, (10 * 16)(%rdx);
        vmovdqu %ymm6, (12 * 16)(%rdx);
         vmovdqu %ymm7, (14 * 16)(%rdx);
+        vpxor %ymm5, %ymm4, %ymm4;
+        vpxor %ymm7, %ymm6, %ymm6;
         leaq (16 * 16)(%rdx), %rdx;
+        vpxor %ymm4, %ymm0, %ymm0;
+        vpxor %ymm6, %ymm2, %ymm2;
+        vpxor %ymm2, %ymm0, %ymm0;
+        vpxor (20 * 16)(%rsp), %ymm0, %ymm0;
+        vmovdqa %ymm0, (20 * 16)(%rsp);
+
         jmp .Locb_aligned_blk16;
 
         /* Aligned: Process trailing eight blocks. */
@@ -2235,18 +2158,28 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
         vpxor (%r14, %rax), %xmm13, %xmm14; /* offset ^ first key ^ L[2] ^ L[ntz{nblk+8}] */
         vinserti128 $1, %xmm14, %ymm13, %ymm14;
 
-        vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
-        vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
-        vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
-        vpxor (6 * 16)(%rcx), %ymm14, %ymm3;
-        leaq (8 * 16)(%rcx), %rcx;
-
-        vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
-
-        vmovdqa (14 * 16)(%rsp), %ymm8;
-
        testl %r15d, %r15d;
         jz .Locb_aligned_blk8_dec;
+          vmovdqu (0 * 16)(%rcx), %ymm0;
+          vmovdqu (2 * 16)(%rcx), %ymm1;
+          vmovdqu (4 * 16)(%rcx), %ymm2;
+          vmovdqu (6 * 16)(%rcx), %ymm3;
+          vpxor %ymm2, %ymm0, %ymm10;
+          vpxor %ymm3, %ymm1, %ymm11;
+          vpxor %ymm11, %ymm10, %ymm10;
+          vpxor (20 * 16)(%rsp), %ymm10, %ymm10;
+          vmovdqa %ymm10, (20 * 16)(%rsp);
+
+          vpxor %ymm5, %ymm0, %ymm0;
+          vpxor %ymm6, %ymm1, %ymm1;
+          vpxor %ymm7, %ymm2, %ymm2;
+          vpxor %ymm14, %ymm3, %ymm3;
+          leaq (8 * 16)(%rcx), %rcx;
+
+          vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
+
+          vmovdqa (14 * 16)(%rsp), %ymm8;
+
          /* AES rounds */
          vbroadcasti128 (1 * 16)(%rdi), %ymm4;
          VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
@@ -2298,6 +2231,16 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 
 .align 8
 .Locb_aligned_blk8_dec:
+        vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
+        vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
+        vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
+        vpxor (6 * 16)(%rcx), %ymm14, %ymm3;
+        leaq (8 * 16)(%rcx), %rcx;
+
+        vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
+
+        vmovdqa (14 * 16)(%rsp), %ymm8;
+
         /* AES rounds */
         vbroadcasti128 (1 * 16)(%rdi), %ymm4;
         VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
@@ -2346,19 +2289,28 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
         vmovdqu %ymm3, (6 * 16)(%rdx);
         leaq (8 * 16)(%rdx), %rdx;
 
+        vpxor %ymm1, %ymm0, %ymm0;
+        vpxor %ymm3, %ymm2, %ymm2;
+        vpxor %ymm2, %ymm0, %ymm0;
+        vpxor (20 * 16)(%rsp), %ymm0, %ymm0;
+        vmovdqa %ymm0, (20 * 16)(%rsp);
+
 .align 8
 .Locb_aligned_done:
+        vmovdqa (20 * 16)(%rsp), %ymm14;
+        vpxor %xmm13, %xmm13, %xmm13;
+
         /* Burn stack. */
-        vpxor %ymm0, %ymm0, %ymm0;
-        vmovdqa %ymm0, (0 * 16)(%rsp);
-        vmovdqa %ymm0, (2 * 16)(%rsp);
-        vmovdqa %ymm0, (4 * 16)(%rsp);
-        vmovdqa %ymm0, (6 * 16)(%rsp);
-        vmovdqa %ymm0, (8 * 16)(%rsp);
-        vmovdqa %ymm0, (10 * 16)(%rsp);
-        vmovdqa %ymm0, (12 * 16)(%rsp);
-        vmovdqa %ymm0, (16 * 16)(%rsp);
-        vmovdqa %ymm0, (18 * 16)(%rsp);
+        vmovdqa %ymm13, (0 * 16)(%rsp);
+        vmovdqa %ymm13, (2 * 16)(%rsp);
+        vmovdqa %ymm13, (4 * 16)(%rsp);
+        vmovdqa %ymm13, (6 * 16)(%rsp);
+        vmovdqa %ymm13, (8 * 16)(%rsp);
+        vmovdqa %ymm13, (10 * 16)(%rsp);
+        vmovdqa %ymm13, (12 * 16)(%rsp);
+        vmovdqa %ymm13, (16 * 16)(%rsp);
+        vmovdqa %ymm13, (18 * 16)(%rsp);
+        vmovdqa %ymm13, (20 * 16)(%rsp);
 
         /* Handle tailing 1…7 blocks in nblk-unaligned loop. */
         movq %r8, %r10;
@@ -2367,20 +2319,16 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
 
 .align 8
 .Ldone_ocb:
-        movq 16(%rbp), %r14; /* offset ptr. */
+        vpxor %ymm13, %ymm14, %ymm14;
+        vextracti128 $1, %ymm14, %xmm13;
+        vpxor (%rbx), %xmm14, %xmm14;
+        vpxor %xmm13, %xmm14, %xmm14;
+        vmovdqu %xmm14, (%rbx);
+
+        movq OFFSET_PTR_Q, %r14; /* offset ptr. */
         vpxor (0 * 16)(%rdi), %xmm15, %xmm15; /* offset ^ first key ^ first key */
         vmovdqu %xmm15, (%r14); /* Store offset. */
 
-        /* Handle decryption checksumming. */
-
-        testl %r15d, %r15d;
-        jnz .Locb_dec_checksum_done;
-        movq 24(%rbp), %rax; /* checksum ptr. */
-        movq (STACK_REGS_POS + 4 * 8)(%rsp), %r10;
-        movq (STACK_REGS_POS + 5 * 8)(%rsp), %r11;
-        call _gcry_vaes_avx2_ocb_checksum;
-.Locb_dec_checksum_done:
-
         /* Burn stack. */
         vpxor %ymm0, %ymm0, %ymm0;
         vmovdqa %ymm0, (14 * 16)(%rsp);
@@ -2395,6 +2343,8 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
         CFI_RESTORE(%r14);
         movq (STACK_REGS_POS + 3 * 8)(%rsp), %r15;
         CFI_RESTORE(%r15);
+        movq (STACK_REGS_POS + 4 * 8)(%rsp), %rbx;
+        CFI_RESTORE(%rbx);
 
         leave;
         CFI_LEAVE();