author     Jussi Kivilinna <jussi.kivilinna@iki.fi>  2022-03-09 20:15:52 +0200
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>  2022-03-09 20:29:59 +0200
commit     d820d27a3bce6365523fbcb6ec607b23dd4ca4e2 (patch)
tree       06e9a3f121b2d94fc8dcfac2a548780cd9090294 /cipher/rijndael-vaes-avx2-amd64.S
parent     e6f360019369fff42411b4cca976cc8ebe09281d (diff)
download   libgcrypt-d820d27a3bce6365523fbcb6ec607b23dd4ca4e2.tar.gz
rijndael-vaes-avx2: perform checksumming inline
* cipher/rijndael-vaes-avx2-amd64.S (_gcry_vaes_avx2_ocb_checksum): Remove.
(_gcry_vaes_avx2_ocb_crypt_amd64): Add inline checksumming.
--

The VAES/AVX2/OCB encryption implementation had the same performance drop
with large buffers as the AES-NI/OCB implementation; see
e924ce456d5728a81c148de4a6eb23373cb70ca0 for details.

This patch changes VAES/AVX2/OCB to perform checksumming inline with
encryption and decryption instead of using a 2-pass approach. Inline
checksumming also gives a nice ~6% speed boost.

Benchmark on Intel Core i3-1115G4:

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        OCB enc |     0.044 ns/B     21569 MiB/s     0.181 c/B      4089
        OCB dec |     0.045 ns/B     21298 MiB/s     0.183 c/B      4089

After:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        OCB enc |     0.042 ns/B     22922 MiB/s     0.170 c/B      4089
        OCB dec |     0.042 ns/B     22676 MiB/s     0.172 c/B      4089

GnuPG-bug-id: T5875
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
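For context, the OCB checksum is simply the XOR of all plaintext blocks. The
following minimal C sketch shows the difference between the old 2-pass
approach and the inlined one that this patch implements in assembly; it is an
illustration only, not libgcrypt code, and the names checksum_2pass,
ocb_encrypt_inline and encrypt_block are hypothetical.

#include <stddef.h>
#include <stdint.h>

#define BLOCKSIZE 16

/* Old 2-pass idea: a separate walk over the plaintext just for the
 * checksum, which streams large buffers through the cache twice. */
static void
checksum_2pass (uint8_t checksum[BLOCKSIZE],
                const uint8_t *plaintext, size_t nblocks)
{
  for (size_t i = 0; i < nblocks; i++)
    for (size_t j = 0; j < BLOCKSIZE; j++)
      checksum[j] ^= plaintext[i * BLOCKSIZE + j];
}

/* Inline idea (what the patch does with YMM accumulators): fold the
 * checksum XOR into the per-block processing loop so each plaintext
 * block is touched only once. */
static void
ocb_encrypt_inline (uint8_t checksum[BLOCKSIZE],
                    uint8_t *dst, const uint8_t *src, size_t nblocks,
                    void (*encrypt_block) (uint8_t *, const uint8_t *))
{
  for (size_t i = 0; i < nblocks; i++)
    {
      const uint8_t *blk = src + i * BLOCKSIZE;
      for (size_t j = 0; j < BLOCKSIZE; j++)
        checksum[j] ^= blk[j];              /* checksum folded into main loop */
      encrypt_block (dst + i * BLOCKSIZE, blk);
    }
}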
Diffstat (limited to 'cipher/rijndael-vaes-avx2-amd64.S')
-rw-r--r--  cipher/rijndael-vaes-avx2-amd64.S  | 424
1 file changed, 187 insertions(+), 237 deletions(-)
diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S
index f94b58db..e36e82a0 100644
--- a/cipher/rijndael-vaes-avx2-amd64.S
+++ b/cipher/rijndael-vaes-avx2-amd64.S
@@ -1394,151 +1394,6 @@ ELF(.size _gcry_vaes_avx2_ctr32le_enc_amd64,.-_gcry_vaes_avx2_ctr32le_enc_amd64)
/**********************************************************************
OCB-mode encryption/decryption
**********************************************************************/
-ELF(.type _gcry_vaes_avx2_ocb_checksum,@function)
-_gcry_vaes_avx2_ocb_checksum:
- /* input:
- * %rax: offset pointer
- * %r10: plaintext pointer
- * %r11: nblocks
- */
- CFI_STARTPROC();
-
- vpxor %xmm0, %xmm0, %xmm0;
- cmpq $4, %r11;
- jb .Locb_checksum_blk1;
- vpxor %xmm1, %xmm1, %xmm1;
- vpxor %xmm2, %xmm2, %xmm2;
- vpxor %xmm3, %xmm3, %xmm3;
- cmpq $16, %r11;
- jb .Locb_checksum_blk4;
- vpxor %xmm4, %xmm4, %xmm4;
- vpxor %xmm5, %xmm5, %xmm5;
- vpxor %xmm6, %xmm6, %xmm6;
- vpxor %xmm7, %xmm7, %xmm7;
- cmpq $32, %r11;
- jb .Locb_checksum_blk16;
- vpxor %xmm8, %xmm8, %xmm8;
- vpxor %xmm9, %xmm9, %xmm9;
- vpxor %xmm10, %xmm10, %xmm10;
- vpxor %xmm11, %xmm11, %xmm11;
- vpxor %xmm12, %xmm12, %xmm12;
- vpxor %xmm13, %xmm13, %xmm13;
- vpxor %xmm14, %xmm14, %xmm14;
- vpxor %xmm15, %xmm15, %xmm15;
-
-.align 8
-.Locb_checksum_blk32:
- cmpq $32, %r11;
- jb .Locb_checksum_blk32_done;
-
- leaq -32(%r11), %r11;
-
- vpxor (0 * 16)(%r10), %ymm0, %ymm0;
- vpxor (2 * 16)(%r10), %ymm1, %ymm1;
- vpxor (4 * 16)(%r10), %ymm2, %ymm2;
- vpxor (6 * 16)(%r10), %ymm3, %ymm3;
- vpxor (8 * 16)(%r10), %ymm4, %ymm4;
- vpxor (10 * 16)(%r10), %ymm5, %ymm5;
- vpxor (12 * 16)(%r10), %ymm6, %ymm6;
- vpxor (14 * 16)(%r10), %ymm7, %ymm7;
- vpxor (16 * 16)(%r10), %ymm8, %ymm8;
- vpxor (18 * 16)(%r10), %ymm9, %ymm9;
- vpxor (20 * 16)(%r10), %ymm10, %ymm10;
- vpxor (22 * 16)(%r10), %ymm11, %ymm11;
- vpxor (24 * 16)(%r10), %ymm12, %ymm12;
- vpxor (26 * 16)(%r10), %ymm13, %ymm13;
- vpxor (28 * 16)(%r10), %ymm14, %ymm14;
- vpxor (30 * 16)(%r10), %ymm15, %ymm15;
- leaq (32 * 16)(%r10), %r10;
-
- jmp .Locb_checksum_blk32;
-
-.align 8
-.Locb_checksum_blk32_done:
- vpxor %ymm8, %ymm0, %ymm0;
- vpxor %ymm9, %ymm1, %ymm1;
- vpxor %ymm10, %ymm2, %ymm2;
- vpxor %ymm11, %ymm3, %ymm3;
- vpxor %ymm12, %ymm4, %ymm4;
- vpxor %ymm13, %ymm5, %ymm5;
- vpxor %ymm14, %ymm6, %ymm6;
- vpxor %ymm15, %ymm7, %ymm7;
-
-.align 8
-.Locb_checksum_blk16:
- cmpq $16, %r11;
- jb .Locb_checksum_blk16_done;
-
- leaq -16(%r11), %r11;
-
- vpxor (0 * 16)(%r10), %ymm0, %ymm0;
- vpxor (2 * 16)(%r10), %ymm1, %ymm1;
- vpxor (4 * 16)(%r10), %ymm2, %ymm2;
- vpxor (6 * 16)(%r10), %ymm3, %ymm3;
- vpxor (8 * 16)(%r10), %ymm4, %ymm4;
- vpxor (10 * 16)(%r10), %ymm5, %ymm5;
- vpxor (12 * 16)(%r10), %ymm6, %ymm6;
- vpxor (14 * 16)(%r10), %ymm7, %ymm7;
- leaq (16 * 16)(%r10), %r10;
-
- jmp .Locb_checksum_blk16;
-
-.align 8
-.Locb_checksum_blk16_done:
- vpxor %ymm4, %ymm0, %ymm0;
- vpxor %ymm5, %ymm1, %ymm1;
- vpxor %ymm6, %ymm2, %ymm2;
- vpxor %ymm7, %ymm3, %ymm3;
- vextracti128 $1, %ymm0, %xmm4;
- vextracti128 $1, %ymm1, %xmm5;
- vextracti128 $1, %ymm2, %xmm6;
- vextracti128 $1, %ymm3, %xmm7;
- vpxor %xmm4, %xmm0, %xmm0;
- vpxor %xmm5, %xmm1, %xmm1;
- vpxor %xmm6, %xmm2, %xmm2;
- vpxor %xmm7, %xmm3, %xmm3;
-
-.align 8
-.Locb_checksum_blk4:
- cmpq $4, %r11;
- jb .Locb_checksum_blk4_done;
-
- leaq -4(%r11), %r11;
-
- vpxor (0 * 16)(%r10), %xmm0, %xmm0;
- vpxor (1 * 16)(%r10), %xmm1, %xmm1;
- vpxor (2 * 16)(%r10), %xmm2, %xmm2;
- vpxor (3 * 16)(%r10), %xmm3, %xmm3;
- leaq (4 * 16)(%r10), %r10;
-
- jmp .Locb_checksum_blk4;
-
-.align 8
-.Locb_checksum_blk4_done:
- vpxor %xmm1, %xmm0, %xmm0;
- vpxor %xmm3, %xmm2, %xmm2;
- vpxor %xmm2, %xmm0, %xmm0;
-
-.align 8
-.Locb_checksum_blk1:
- cmpq $1, %r11;
- jb .Locb_checksum_done;
-
- leaq -1(%r11), %r11;
-
- vpxor (%r10), %xmm0, %xmm0;
- leaq 16(%r10), %r10;
-
- jmp .Locb_checksum_blk1;
-
-.align 8
-.Locb_checksum_done:
- vpxor (%rax), %xmm0, %xmm0;
- vmovdqu %xmm0, (%rax);
- ret_spec_stop;
- CFI_ENDPROC();
-ELF(.size _gcry_vaes_avx2_ocb_checksum,.-_gcry_vaes_avx2_ocb_checksum)
-
ELF(.type _gcry_vaes_avx2_ocb_crypt_amd64,@function)
.globl _gcry_vaes_avx2_ocb_crypt_amd64
_gcry_vaes_avx2_ocb_crypt_amd64:
@@ -1556,8 +1411,12 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
*/
CFI_STARTPROC();
-#define STACK_REGS_POS (16 * 16 + 4 * 16)
-#define STACK_ALLOC (STACK_REGS_POS + 6 * 8)
+#define STACK_REGS_POS (16 * 16 + 4 * 16 + 2 * 16)
+#define STACK_ALLOC (STACK_REGS_POS + 5 * 8)
+#define OFFSET_PTR_Q 16(%rbp)
+#define CHECKSUM_PTR_Q 24(%rbp)
+#define L_ARRAY_PTR_L 32(%rbp)
+#define ENCRYPT_FLAG_L 40(%rbp)
pushq %rbp;
CFI_PUSH(%rbp);
@@ -1575,37 +1434,20 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
CFI_REG_ON_STACK(r14, STACK_REGS_POS + 2 * 8);
movq %r15, (STACK_REGS_POS + 3 * 8)(%rsp);
CFI_REG_ON_STACK(r15, STACK_REGS_POS + 3 * 8);
+ movq %rbx, (STACK_REGS_POS + 4 * 8)(%rsp);
+ CFI_REG_ON_STACK(rbx, STACK_REGS_POS + 4 * 8);
- movl 40(%rbp), %r15d; /* encrypt-flag. */
- movq 16(%rbp), %r14; /* offset ptr. */
-
- /* Handle encryption checksumming. */
- testl %r15d, %r15d;
- jz .Locb_dec_checksum_prepare;
- movq 24(%rbp), %rax; /* checksum ptr. */
- movq %rcx, %r10;
- movq %r8, %r11;
- call _gcry_vaes_avx2_ocb_checksum;
- jmp .Locb_enc_checksum_done;
-.Locb_dec_checksum_prepare:
- /* Store plaintext address and number of blocks for decryption
- * checksumming. */
- movq %rdx, (STACK_REGS_POS + 4 * 8)(%rsp);
- movq %r8, (STACK_REGS_POS + 5 * 8)(%rsp);
-.Locb_enc_checksum_done:
+ movl ENCRYPT_FLAG_L, %r15d; /* encrypt-flag. */
+ movq OFFSET_PTR_Q, %r14; /* offset ptr. */
+ movq CHECKSUM_PTR_Q, %rbx; /* checksum ptr. */
+ leal (, %r9d, 4), %eax;
vmovdqu (%r14), %xmm15; /* Load offset. */
- movq 32(%rbp), %r14; /* L-array ptr. */
+ movq L_ARRAY_PTR_L, %r14; /* L-array ptr. */
vmovdqa (0 * 16)(%rdi), %xmm0; /* first key */
- movl $(10 * 16), %eax;
- cmpl $12, %r9d;
- jb .Llast_key_ptr;
- movl $(12 * 16), %eax;
- je .Llast_key_ptr;
- movl $(14 * 16), %eax;
- .align 8
- .Llast_key_ptr:
- vpxor (%rdi, %rax), %xmm0, %xmm0; /* first key ^ last key */
+ vpxor %xmm14, %xmm14, %xmm14;
+ vpxor %xmm13, %xmm13, %xmm13;
+ vpxor (%rdi, %rax, 4), %xmm0, %xmm0; /* first key ^ last key */
vpxor (0 * 16)(%rdi), %xmm15, %xmm15; /* offset ^ first key */
vmovdqa %xmm0, (14 * 16)(%rsp);
vmovdqa %xmm0, (15 * 16)(%rsp);
@@ -1678,16 +1520,24 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
vinserti128 $1, %xmm10, %ymm9, %ymm7;
vinserti128 $1, %xmm15, %ymm11, %ymm8;
- vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
- vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
- vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
- vpxor (6 * 16)(%rcx), %ymm8, %ymm3;
- leaq (8 * 16)(%rcx), %rcx;
-
- vmovdqa (14 * 16)(%rsp), %ymm9;
-
testl %r15d, %r15d;
jz .Locb_unaligned_blk8_dec;
+ vmovdqu (0 * 16)(%rcx), %ymm0;
+ vmovdqu (2 * 16)(%rcx), %ymm1;
+ vmovdqu (4 * 16)(%rcx), %ymm2;
+ vmovdqu (6 * 16)(%rcx), %ymm3;
+ leaq (8 * 16)(%rcx), %rcx;
+ vpxor %ymm0, %ymm14, %ymm14;
+ vpxor %ymm1, %ymm13, %ymm13;
+ vpxor %ymm2, %ymm14, %ymm14;
+ vpxor %ymm3, %ymm13, %ymm13;
+ vpxor %ymm5, %ymm0, %ymm0;
+ vpxor %ymm6, %ymm1, %ymm1;
+ vpxor %ymm7, %ymm2, %ymm2;
+ vpxor %ymm8, %ymm3, %ymm3;
+
+ vmovdqa (14 * 16)(%rsp), %ymm9;
+
/* AES rounds */
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
@@ -1739,6 +1589,14 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
.align 8
.Locb_unaligned_blk8_dec:
+ vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
+ vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
+ vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
+ vpxor (6 * 16)(%rcx), %ymm8, %ymm3;
+ leaq (8 * 16)(%rcx), %rcx;
+
+ vmovdqa (14 * 16)(%rsp), %ymm9;
+
/* AES rounds */
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
@@ -1780,6 +1638,10 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
vaesdeclast %ymm6, %ymm1, %ymm1;
vaesdeclast %ymm7, %ymm2, %ymm2;
vaesdeclast %ymm4, %ymm3, %ymm3;
+ vpxor %ymm0, %ymm14, %ymm14;
+ vpxor %ymm1, %ymm13, %ymm13;
+ vpxor %ymm2, %ymm14, %ymm14;
+ vpxor %ymm3, %ymm13, %ymm13;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
@@ -1817,12 +1679,16 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
vpxor (%r14, %rax), %xmm7, %xmm15;
vinserti128 $1, %xmm15, %ymm7, %ymm6;
- vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
- vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
- leaq (4 * 16)(%rcx), %rcx;
-
testl %r15d, %r15d;
jz .Locb_unaligned_blk4_dec;
+ vmovdqu (0 * 16)(%rcx), %ymm0;
+ vmovdqu (2 * 16)(%rcx), %ymm1;
+ leaq (4 * 16)(%rcx), %rcx;
+ vpxor %ymm0, %ymm14, %ymm14;
+ vpxor %ymm1, %ymm13, %ymm13;
+ vpxor %ymm5, %ymm0, %ymm0;
+ vpxor %ymm6, %ymm1, %ymm1;
+
/* AES rounds */
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESENC2(%ymm4, %ymm0, %ymm1);
@@ -1869,6 +1735,10 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
.align 8
.Locb_unaligned_blk4_dec:
+ vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
+ vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
+ leaq (4 * 16)(%rcx), %rcx;
+
/* AES rounds */
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESDEC2(%ymm4, %ymm0, %ymm1);
@@ -1907,6 +1777,8 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
vpxor %ymm6, %ymm8, %ymm6;
vaesdeclast %ymm5, %ymm0, %ymm0;
vaesdeclast %ymm6, %ymm1, %ymm1;
+ vpxor %ymm0, %ymm14, %ymm14;
+ vpxor %ymm1, %ymm13, %ymm13;
vmovdqu %ymm0, (0 * 16)(%rdx);
vmovdqu %ymm1, (2 * 16)(%rdx);
leaq (4 * 16)(%rdx), %rdx;
@@ -1924,11 +1796,14 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
tzcntl %esi, %r11d;
shll $4, %r11d;
vpxor (%r14, %r11), %xmm15, %xmm15;
- vpxor (%rcx), %xmm15, %xmm0;
- leaq 16(%rcx), %rcx;
testl %r15d, %r15d;
jz .Locb_unaligned_blk1_dec;
+ vmovdqu (%rcx), %xmm0;
+ vpxor %ymm0, %ymm14, %ymm14;
+ vpxor %xmm15, %xmm0, %xmm0;
+ leaq 16(%rcx), %rcx;
+
/* AES rounds. */
vaesenc (1 * 16)(%rdi), %xmm0, %xmm0;
vaesenc (2 * 16)(%rdi), %xmm0, %xmm0;
@@ -1958,6 +1833,9 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
.align 8
.Locb_unaligned_blk1_dec:
+ vpxor (%rcx), %xmm15, %xmm0;
+ leaq 16(%rcx), %rcx;
+
/* AES rounds. */
vaesdec (1 * 16)(%rdi), %xmm0, %xmm0;
vaesdec (2 * 16)(%rdi), %xmm0, %xmm0;
@@ -1980,6 +1858,7 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
.Locb_unaligned_blk1_dec_last:
vpxor (14 * 16)(%rsp), %xmm15, %xmm1;
vaesdeclast %xmm1, %xmm0, %xmm0;
+ vpxor %ymm0, %ymm14, %ymm14;
vmovdqu %xmm0, (%rdx);
leaq 16(%rdx), %rdx;
@@ -2021,6 +1900,8 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
vmovdqu (1 * 16)(%r14), %xmm1;
vmovdqu (2 * 16)(%r14), %xmm2;
vmovdqu (3 * 16)(%r14), %xmm3;
+ vpxor %ymm13, %ymm14, %ymm14;
+ vmovdqa %ymm14, (20 * 16)(%rsp);
vpxor %xmm0, %xmm1, %xmm4; /* L[0] ^ L[1] */
vpxor %xmm0, %xmm2, %xmm5; /* L[0] ^ L[2] */
vpxor %xmm0, %xmm3, %xmm6; /* L[0] ^ L[3] */
@@ -2069,26 +1950,40 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
vpxor (%r14, %rax), %xmm13, %xmm14; /* offset ^ first key ^ L[3] ^ L[ntz{nblk+16}] */
vinserti128 $1, %xmm14, %ymm13, %ymm14;
- vpxor (10 * 16)(%rsp), %ymm15, %ymm13;
- vpxor (14 * 16)(%rcx), %ymm14, %ymm7;
-
- vpxor (0 * 16)(%rcx), %ymm8, %ymm0;
- vpxor (2 * 16)(%rcx), %ymm9, %ymm1;
- vpxor (4 * 16)(%rcx), %ymm10, %ymm2;
- vpxor (6 * 16)(%rcx), %ymm11, %ymm3;
- vpxor (8 * 16)(%rcx), %ymm12, %ymm4;
- vpxor (10 * 16)(%rcx), %ymm13, %ymm5;
- vmovdqa %ymm13, (16 * 16)(%rsp);
- vpxor (12 * 16)(%rsp), %ymm15, %ymm13;
- vpxor (12 * 16)(%rcx), %ymm13, %ymm6;
- vmovdqa %ymm13, (18 * 16)(%rsp);
-
- leaq (16 * 16)(%rcx), %rcx;
-
- vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
-
testl %r15d, %r15d;
jz .Locb_aligned_blk16_dec;
+ vmovdqu (0 * 16)(%rcx), %ymm0;
+ vmovdqu (2 * 16)(%rcx), %ymm1;
+ vmovdqu (4 * 16)(%rcx), %ymm2;
+ vmovdqu (6 * 16)(%rcx), %ymm3;
+ vpxor (8 * 16)(%rcx), %ymm0, %ymm4;
+ vpxor (10 * 16)(%rcx), %ymm1, %ymm5;
+ vpxor (12 * 16)(%rcx), %ymm2, %ymm6;
+ vpxor (14 * 16)(%rcx), %ymm3, %ymm7;
+ vpxor %ymm4, %ymm5, %ymm5;
+ vpxor %ymm6, %ymm7, %ymm7;
+ vpxor %ymm5, %ymm7, %ymm7;
+ vpxor (20 * 16)(%rsp), %ymm7, %ymm7;
+ vmovdqa %ymm7, (20 * 16)(%rsp);
+
+ vpxor (10 * 16)(%rsp), %ymm15, %ymm13;
+ vpxor (14 * 16)(%rcx), %ymm14, %ymm7;
+
+ vpxor %ymm8, %ymm0, %ymm0;
+ vpxor %ymm9, %ymm1, %ymm1;
+ vpxor %ymm10, %ymm2, %ymm2;
+ vpxor %ymm11, %ymm3, %ymm3;
+ vpxor (8 * 16)(%rcx), %ymm12, %ymm4;
+ vpxor (10 * 16)(%rcx), %ymm13, %ymm5;
+ vmovdqa %ymm13, (16 * 16)(%rsp);
+ vpxor (12 * 16)(%rsp), %ymm15, %ymm13;
+ vpxor (12 * 16)(%rcx), %ymm13, %ymm6;
+ vmovdqa %ymm13, (18 * 16)(%rsp);
+
+ leaq (16 * 16)(%rcx), %rcx;
+
+ vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
+
/* AES rounds */
vbroadcasti128 (1 * 16)(%rdi), %ymm13;
VAESENC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
@@ -2153,6 +2048,24 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
.align 8
.Locb_aligned_blk16_dec:
+ vpxor (10 * 16)(%rsp), %ymm15, %ymm13;
+ vpxor (14 * 16)(%rcx), %ymm14, %ymm7;
+
+ vpxor (0 * 16)(%rcx), %ymm8, %ymm0;
+ vpxor (2 * 16)(%rcx), %ymm9, %ymm1;
+ vpxor (4 * 16)(%rcx), %ymm10, %ymm2;
+ vpxor (6 * 16)(%rcx), %ymm11, %ymm3;
+ vpxor (8 * 16)(%rcx), %ymm12, %ymm4;
+ vpxor (10 * 16)(%rcx), %ymm13, %ymm5;
+ vmovdqa %ymm13, (16 * 16)(%rsp);
+ vpxor (12 * 16)(%rsp), %ymm15, %ymm13;
+ vpxor (12 * 16)(%rcx), %ymm13, %ymm6;
+ vmovdqa %ymm13, (18 * 16)(%rsp);
+
+ leaq (16 * 16)(%rcx), %rcx;
+
+ vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
+
/* AES rounds */
vbroadcasti128 (1 * 16)(%rdi), %ymm13;
VAESDEC8(%ymm13, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7);
@@ -2207,12 +2120,22 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
vmovdqu %ymm1, (2 * 16)(%rdx);
vmovdqu %ymm2, (4 * 16)(%rdx);
vmovdqu %ymm3, (6 * 16)(%rdx);
+ vpxor %ymm1, %ymm0, %ymm0;
+ vpxor %ymm3, %ymm2, %ymm2;
vmovdqu %ymm4, (8 * 16)(%rdx);
vmovdqu %ymm5, (10 * 16)(%rdx);
vmovdqu %ymm6, (12 * 16)(%rdx);
vmovdqu %ymm7, (14 * 16)(%rdx);
+ vpxor %ymm5, %ymm4, %ymm4;
+ vpxor %ymm7, %ymm6, %ymm6;
leaq (16 * 16)(%rdx), %rdx;
+ vpxor %ymm4, %ymm0, %ymm0;
+ vpxor %ymm6, %ymm2, %ymm2;
+ vpxor %ymm2, %ymm0, %ymm0;
+ vpxor (20 * 16)(%rsp), %ymm0, %ymm0;
+ vmovdqa %ymm0, (20 * 16)(%rsp);
+
jmp .Locb_aligned_blk16;
/* Aligned: Process trailing eight blocks. */
@@ -2235,18 +2158,28 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
vpxor (%r14, %rax), %xmm13, %xmm14; /* offset ^ first key ^ L[2] ^ L[ntz{nblk+8}] */
vinserti128 $1, %xmm14, %ymm13, %ymm14;
- vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
- vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
- vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
- vpxor (6 * 16)(%rcx), %ymm14, %ymm3;
- leaq (8 * 16)(%rcx), %rcx;
-
- vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
-
- vmovdqa (14 * 16)(%rsp), %ymm8;
-
testl %r15d, %r15d;
jz .Locb_aligned_blk8_dec;
+ vmovdqu (0 * 16)(%rcx), %ymm0;
+ vmovdqu (2 * 16)(%rcx), %ymm1;
+ vmovdqu (4 * 16)(%rcx), %ymm2;
+ vmovdqu (6 * 16)(%rcx), %ymm3;
+ vpxor %ymm2, %ymm0, %ymm10;
+ vpxor %ymm3, %ymm1, %ymm11;
+ vpxor %ymm11, %ymm10, %ymm10;
+ vpxor (20 * 16)(%rsp), %ymm10, %ymm10;
+ vmovdqa %ymm10, (20 * 16)(%rsp);
+
+ vpxor %ymm5, %ymm0, %ymm0;
+ vpxor %ymm6, %ymm1, %ymm1;
+ vpxor %ymm7, %ymm2, %ymm2;
+ vpxor %ymm14, %ymm3, %ymm3;
+ leaq (8 * 16)(%rcx), %rcx;
+
+ vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
+
+ vmovdqa (14 * 16)(%rsp), %ymm8;
+
/* AES rounds */
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESENC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
@@ -2298,6 +2231,16 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
.align 8
.Locb_aligned_blk8_dec:
+ vpxor (0 * 16)(%rcx), %ymm5, %ymm0;
+ vpxor (2 * 16)(%rcx), %ymm6, %ymm1;
+ vpxor (4 * 16)(%rcx), %ymm7, %ymm2;
+ vpxor (6 * 16)(%rcx), %ymm14, %ymm3;
+ leaq (8 * 16)(%rcx), %rcx;
+
+ vperm2i128 $0x11, %ymm14, %ymm14, %ymm15;
+
+ vmovdqa (14 * 16)(%rsp), %ymm8;
+
/* AES rounds */
vbroadcasti128 (1 * 16)(%rdi), %ymm4;
VAESDEC4(%ymm4, %ymm0, %ymm1, %ymm2, %ymm3);
@@ -2346,19 +2289,28 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
vmovdqu %ymm3, (6 * 16)(%rdx);
leaq (8 * 16)(%rdx), %rdx;
+ vpxor %ymm1, %ymm0, %ymm0;
+ vpxor %ymm3, %ymm2, %ymm2;
+ vpxor %ymm2, %ymm0, %ymm0;
+ vpxor (20 * 16)(%rsp), %ymm0, %ymm0;
+ vmovdqa %ymm0, (20 * 16)(%rsp);
+
.align 8
.Locb_aligned_done:
+ vmovdqa (20 * 16)(%rsp), %ymm14;
+ vpxor %xmm13, %xmm13, %xmm13;
+
/* Burn stack. */
- vpxor %ymm0, %ymm0, %ymm0;
- vmovdqa %ymm0, (0 * 16)(%rsp);
- vmovdqa %ymm0, (2 * 16)(%rsp);
- vmovdqa %ymm0, (4 * 16)(%rsp);
- vmovdqa %ymm0, (6 * 16)(%rsp);
- vmovdqa %ymm0, (8 * 16)(%rsp);
- vmovdqa %ymm0, (10 * 16)(%rsp);
- vmovdqa %ymm0, (12 * 16)(%rsp);
- vmovdqa %ymm0, (16 * 16)(%rsp);
- vmovdqa %ymm0, (18 * 16)(%rsp);
+ vmovdqa %ymm13, (0 * 16)(%rsp);
+ vmovdqa %ymm13, (2 * 16)(%rsp);
+ vmovdqa %ymm13, (4 * 16)(%rsp);
+ vmovdqa %ymm13, (6 * 16)(%rsp);
+ vmovdqa %ymm13, (8 * 16)(%rsp);
+ vmovdqa %ymm13, (10 * 16)(%rsp);
+ vmovdqa %ymm13, (12 * 16)(%rsp);
+ vmovdqa %ymm13, (16 * 16)(%rsp);
+ vmovdqa %ymm13, (18 * 16)(%rsp);
+ vmovdqa %ymm13, (20 * 16)(%rsp);
/* Handle tailing 1…7 blocks in nblk-unaligned loop. */
movq %r8, %r10;
@@ -2367,20 +2319,16 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
.align 8
.Ldone_ocb:
- movq 16(%rbp), %r14; /* offset ptr. */
+ vpxor %ymm13, %ymm14, %ymm14;
+ vextracti128 $1, %ymm14, %xmm13;
+ vpxor (%rbx), %xmm14, %xmm14;
+ vpxor %xmm13, %xmm14, %xmm14;
+ vmovdqu %xmm14, (%rbx);
+
+ movq OFFSET_PTR_Q, %r14; /* offset ptr. */
vpxor (0 * 16)(%rdi), %xmm15, %xmm15; /* offset ^ first key ^ first key */
vmovdqu %xmm15, (%r14); /* Store offset. */
- /* Handle decryption checksumming. */
-
- testl %r15d, %r15d;
- jnz .Locb_dec_checksum_done;
- movq 24(%rbp), %rax; /* checksum ptr. */
- movq (STACK_REGS_POS + 4 * 8)(%rsp), %r10;
- movq (STACK_REGS_POS + 5 * 8)(%rsp), %r11;
- call _gcry_vaes_avx2_ocb_checksum;
-.Locb_dec_checksum_done:
-
/* Burn stack. */
vpxor %ymm0, %ymm0, %ymm0;
vmovdqa %ymm0, (14 * 16)(%rsp);
@@ -2395,6 +2343,8 @@ _gcry_vaes_avx2_ocb_crypt_amd64:
CFI_RESTORE(%r14);
movq (STACK_REGS_POS + 3 * 8)(%rsp), %r15;
CFI_RESTORE(%r15);
+ movq (STACK_REGS_POS + 4 * 8)(%rsp), %rbx;
+ CFI_RESTORE(%rbx);
leave;
CFI_LEAVE();