author     Jussi Kivilinna <jussi.kivilinna@iki.fi>   2023-02-19 17:10:46 +0200
committer  Jussi Kivilinna <jussi.kivilinna@iki.fi>   2023-02-22 20:27:56 +0200
commit     8f7f5a9fc63968304bacedbc2f22b9f7188bbd53 (patch)
tree       77b27deeb2c6b7a9f49e20737d83869f4dda9077 /cipher
parent     caf402e9b41fff6cf39b914b088ea1e5f8fd3bd1 (diff)
aes-vaes-avx2: improve case when only CTR needs carry handling

* cipher/rijndael-vaes-avx2-amd64.S (_gcry_vaes_avx2_ctr_enc_amd64): Add
handling for the case when only main counter needs carry handling but
generated vector counters do not.
--
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
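
[Editorial note] In C terms, the dispatch this patch adds boils down to the
sketch below (illustrative only; names such as ctr_advance_sketch are not from
libgcrypt). The assembly tests the flags left by "addb $nblks, 15(%rsi)": a
carry with a zero result byte means only the stored counter wrapped its last
byte, while the nblks per-block counters derived from the old value still fit
in that byte and can keep using the cheap byte-wise vpaddb path.

    /* Illustrative C sketch (not libgcrypt code) of the three cases the
     * patch distinguishes when advancing the big-endian CTR value by
     * 'nblks' blocks (16, 8 or 4 per loop in the assembly). */
    #include <stdint.h>

    enum ctr_vec_path { VEC_BYTE_ADD, VEC_FULL_CARRY };

    /* Full 128-bit add with carry propagation on the big-endian counter;
     * stands in for what handle_ctr_128bit_add does with %r10/%r11. */
    static void
    ctr_add_128bit (unsigned char ctr[16], uint64_t add)
    {
      uint64_t carry = add;
      int i;

      for (i = 15; i >= 0 && carry; i--)
        {
          carry += ctr[i];
          ctr[i] = (unsigned char)carry;
          carry >>= 8;
        }
    }

    /* Equivalent of "addb $nblks, 15(%rsi); jc ...; jz ..." plus the
     * resulting choice of vector-counter generation path. */
    static enum ctr_vec_path
    ctr_advance_sketch (unsigned char ctr[16], unsigned int nblks)
    {
      unsigned int sum = ctr[15] + nblks;

      if (sum <= 0xff)
        {
          ctr[15] = (unsigned char)sum; /* no carry anywhere: byte add suffices */
          return VEC_BYTE_ADD;          /* vpaddb with .Lbige_addb_N constants  */
        }

      ctr_add_128bit (ctr, nblks);      /* stored counter needs the 128-bit add */

      if (sum == 0x100)
        return VEC_BYTE_ADD;   /* new case: only the stored counter wrapped;
                                  the nblks counters derived from the old value
                                  end at 0xff, so the byte-wise path still works */

      return VEC_FULL_CARRY;   /* some generated counters wrap too; fall back
                                  to per-pair little-endian adds (add2_le128) */
    }
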
Diffstat (limited to 'cipher')
-rw-r--r--  cipher/rijndael-vaes-avx2-amd64.S | 76
1 file changed, 41 insertions(+), 35 deletions(-)
diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S
index aceccb96..10213bfb 100644
--- a/cipher/rijndael-vaes-avx2-amd64.S
+++ b/cipher/rijndael-vaes-avx2-amd64.S
@@ -738,6 +738,16 @@ _gcry_vaes_avx2_ctr_enc_amd64:
vpslldq $8, tmp2, tmp2; \
vpsubq tmp2, x, x;
+#define handle_ctr_128bit_add(nblks) \
+ addq $(nblks), %r10; \
+ adcq $0, %r11; \
+ bswapq %r10; \
+ bswapq %r11; \
+ movq %r10, 8(%rsi); \
+ movq %r11, 0(%rsi); \
+ bswapq %r10; \
+ bswapq %r11;
+
/* Process 16 blocks per loop. */
.align 8
.Lctr_enc_blk16:
@@ -753,6 +763,9 @@ _gcry_vaes_avx2_ctr_enc_amd64:
addb $16, 15(%rsi);
jc .Lctr_enc_blk16_handle_carry;
+ leaq 16(%r10), %r10;
+
+ .Lctr_enc_blk16_byte_bige_add:
/* Increment counters. */
vpaddb .Lbige_addb_0 rRIP, %ymm7, %ymm0;
vpaddb .Lbige_addb_2 rRIP, %ymm7, %ymm1;
@@ -762,7 +775,6 @@ _gcry_vaes_avx2_ctr_enc_amd64:
vpaddb .Lbige_addb_10 rRIP, %ymm7, %ymm5;
vpaddb .Lbige_addb_12 rRIP, %ymm7, %ymm6;
vpaddb .Lbige_addb_14 rRIP, %ymm7, %ymm7;
- leaq 16(%r10), %r10;
.Lctr_enc_blk16_rounds:
/* AES rounds */
@@ -830,21 +842,20 @@ _gcry_vaes_avx2_ctr_enc_amd64:
jmp .Lctr_enc_blk16;
.align 8
+ .Lctr_enc_blk16_handle_only_ctr_carry:
+ handle_ctr_128bit_add(16);
+ jmp .Lctr_enc_blk16_byte_bige_add;
+
+ .align 8
.Lctr_enc_blk16_handle_carry:
+ jz .Lctr_enc_blk16_handle_only_ctr_carry;
/* Increment counters (handle carry). */
vpshufb %xmm13, %xmm7, %xmm1; /* be => le */
vmovdqa %xmm1, %xmm0;
inc_le128(%xmm1, %xmm15, %xmm5);
vinserti128 $1, %xmm1, %ymm0, %ymm7; /* ctr: +1:+0 */
vpshufb %ymm13, %ymm7, %ymm0;
- addq $16, %r10;
- adcq $0, %r11;
- bswapq %r10;
- bswapq %r11;
- movq %r10, 8(%rsi);
- movq %r11, 0(%rsi);
- bswapq %r10;
- bswapq %r11;
+ handle_ctr_128bit_add(16);
add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +3:+2 */
vpshufb %ymm13, %ymm7, %ymm1;
add2_le128(%ymm7, %ymm15, %ymm14, %ymm9, %ymm10); /* ctr: +5:+4 */
@@ -877,12 +888,14 @@ _gcry_vaes_avx2_ctr_enc_amd64:
addb $8, 15(%rsi);
jc .Lctr_enc_blk8_handle_carry;
+ leaq 8(%r10), %r10;
+
+ .Lctr_enc_blk8_byte_bige_add:
/* Increment counters. */
vpaddb .Lbige_addb_0 rRIP, %ymm3, %ymm0;
vpaddb .Lbige_addb_2 rRIP, %ymm3, %ymm1;
vpaddb .Lbige_addb_4 rRIP, %ymm3, %ymm2;
vpaddb .Lbige_addb_6 rRIP, %ymm3, %ymm3;
- leaq 8(%r10), %r10;
.Lctr_enc_blk8_rounds:
/* AES rounds */
@@ -938,21 +951,20 @@ _gcry_vaes_avx2_ctr_enc_amd64:
jmp .Lctr_enc_blk4;
.align 8
+ .Lctr_enc_blk8_handle_only_ctr_carry:
+ handle_ctr_128bit_add(8);
+ jmp .Lctr_enc_blk8_byte_bige_add;
+
+ .align 8
.Lctr_enc_blk8_handle_carry:
+ jz .Lctr_enc_blk8_handle_only_ctr_carry;
/* Increment counters (handle carry). */
vpshufb %xmm13, %xmm3, %xmm1; /* be => le */
vmovdqa %xmm1, %xmm0;
inc_le128(%xmm1, %xmm15, %xmm5);
vinserti128 $1, %xmm1, %ymm0, %ymm3; /* ctr: +1:+0 */
vpshufb %ymm13, %ymm3, %ymm0;
- addq $8, %r10;
- adcq $0, %r11;
- bswapq %r10;
- bswapq %r11;
- movq %r10, 8(%rsi);
- movq %r11, 0(%rsi);
- bswapq %r10;
- bswapq %r11;
+ handle_ctr_128bit_add(8);
add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +3:+2 */
vpshufb %ymm13, %ymm3, %ymm1;
add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +5:+4 */
@@ -977,10 +989,12 @@ _gcry_vaes_avx2_ctr_enc_amd64:
addb $4, 15(%rsi);
jc .Lctr_enc_blk4_handle_carry;
+ leaq 4(%r10), %r10;
+
+ .Lctr_enc_blk4_byte_bige_add:
/* Increment counters. */
vpaddb .Lbige_addb_0 rRIP, %ymm3, %ymm0;
vpaddb .Lbige_addb_2 rRIP, %ymm3, %ymm1;
- leaq 4(%r10), %r10;
.Lctr_enc_blk4_rounds:
/* AES rounds */
@@ -1030,21 +1044,20 @@ _gcry_vaes_avx2_ctr_enc_amd64:
jmp .Lctr_enc_blk1;
.align 8
+ .Lctr_enc_blk4_handle_only_ctr_carry:
+ handle_ctr_128bit_add(4);
+ jmp .Lctr_enc_blk4_byte_bige_add;
+
+ .align 8
.Lctr_enc_blk4_handle_carry:
+ jz .Lctr_enc_blk4_handle_only_ctr_carry;
/* Increment counters (handle carry). */
vpshufb %xmm13, %xmm3, %xmm1; /* be => le */
vmovdqa %xmm1, %xmm0;
inc_le128(%xmm1, %xmm15, %xmm5);
vinserti128 $1, %xmm1, %ymm0, %ymm3; /* ctr: +1:+0 */
vpshufb %ymm13, %ymm3, %ymm0;
- addq $4, %r10;
- adcq $0, %r11;
- bswapq %r10;
- bswapq %r11;
- movq %r10, 8(%rsi);
- movq %r11, 0(%rsi);
- bswapq %r10;
- bswapq %r11;
+ handle_ctr_128bit_add(4);
add2_le128(%ymm3, %ymm15, %ymm14, %ymm5, %ymm6); /* ctr: +3:+2 */
vpshufb %ymm13, %ymm3, %ymm1;
@@ -1060,14 +1073,7 @@ _gcry_vaes_avx2_ctr_enc_amd64:
/* Load and increament counter. */
vmovdqu (%rsi), %xmm0;
- addq $1, %r10;
- adcq $0, %r11;
- bswapq %r10;
- bswapq %r11;
- movq %r10, 8(%rsi);
- movq %r11, 0(%rsi);
- bswapq %r10;
- bswapq %r11;
+ handle_ctr_128bit_add(1);
/* AES rounds. */
vpxor (0 * 16)(%rdi), %xmm0, %xmm0;
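
[Editorial note] For reference, a minimal C sketch of what the new
handle_ctr_128bit_add macro does, assuming (as the surrounding code maintains)
that %r10/%r11 cache the low/high 64-bit halves of the counter in native order
while the 16 bytes at (%rsi) hold it big-endian:

    #include <stdint.h>

    /* Sketch of handle_ctr_128bit_add(nblks): add nblks to the cached
     * counter halves with carry, write the result back to memory in
     * big-endian order, and leave the cached halves in native order for
     * the next iteration. */
    static void
    handle_ctr_128bit_add_sketch (uint64_t *lo, uint64_t *hi,
                                  unsigned char ctr_be[16], uint64_t nblks)
    {
      uint64_t new_lo = *lo + nblks;          /* addq $(nblks), %r10 */
      uint64_t new_hi = *hi + (new_lo < *lo); /* adcq $0, %r11       */
      int i;

      /* bswapq + movq to 8(%rsi)/0(%rsi): store the counter big-endian. */
      for (i = 0; i < 8; i++)
        {
          ctr_be[15 - i] = (unsigned char)(new_lo >> (8 * i));
          ctr_be[7 - i]  = (unsigned char)(new_hi >> (8 * i));
        }

      /* Final bswapq pair: the registers keep the native-order values. */
      *lo = new_lo;
      *hi = new_hi;
    }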