summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJerryDevis <hisec1@huawei.com>2023-03-11 18:35:23 +0800
committerTomas Mraz <tomas@openssl.org>2023-05-09 16:21:04 +0200
commit507356598bc1e17a08f9737df50e1a19525efa88 (patch)
tree04cdf6d8d2ea8ec7904a2316efdaacbd126c5ee5
parentca9ef8ebf5908a6115990967df648d8f29e66f42 (diff)
downloadopenssl-new-507356598bc1e17a08f9737df50e1a19525efa88.tar.gz
aes-gcm-armv8_64 asm support big-endian
Reviewed-by: Tom Cosgrove <tom.cosgrove@arm.com> Reviewed-by: Tomas Mraz <tomas@openssl.org> (Merged from https://github.com/openssl/openssl/pull/20489) (cherry picked from commit 32344a74b7ee2693a5bfda361c40ec60ab5be624)
-rwxr-xr-xcrypto/modes/asm/aes-gcm-armv8_64.pl787
1 files changed, 581 insertions, 206 deletions
diff --git a/crypto/modes/asm/aes-gcm-armv8_64.pl b/crypto/modes/asm/aes-gcm-armv8_64.pl
index ff5809ec22..b5e7691427 100755
--- a/crypto/modes/asm/aes-gcm-armv8_64.pl
+++ b/crypto/modes/asm/aes-gcm-armv8_64.pl
@@ -217,6 +217,7 @@ my $mod_constant="v8";
my $mod_t="v31";
my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9)=map("v$_.16b",(18..27));
+my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s)=map("v$_.4s",(18..27));
my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q)=map("q$_",(18..27));
my $rk2q1="v20.1q";
my $rk3q1="v21.1q";
@@ -269,28 +270,36 @@ aes_gcm_enc_128_kernel:
stp d14, d15, [sp, #96]
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
+#ifdef __AARCH64EB__
+ rev $ctr96_b64x, $ctr96_b64x
+ rev $ctr96_t32x, $ctr96_t32x
+#endif
ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
-
+#ifdef __AARCH64EB__
+ ror $rk10_l, $rk10_l, #32
+ ror $rk10_h, $rk10_h, #32
+#endif
ld1 {$acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
- ldr $rk9q, [$cc, #144] @ load rk9
+ ld1 {$rk0s}, [$cc], #16 @ load rk0
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
lsr $rctr32x, $ctr96_t32x, #32
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
+#ifndef __AARCH64EB__
ext $h4b, $h4b, $h4b, #8
-
+#endif
fmov $ctr1d, $ctr96_b64x @ CTR block 1
rev $rctr32w, $rctr32w @ rev_ctr32
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
- ldr $rk0q, [$cc, #0] @ load rk0
+ ld1 {$rk1s}, [$cc], #16 @ load rk1
rev $ctr32w, $rctr32w @ CTR block 1
add $rctr32w, $rctr32w, #1 @ CTR block 1
@@ -310,45 +319,50 @@ aes_gcm_enc_128_kernel:
rev $ctr32w, $rctr32w @ CTR block 3
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
- ldr $rk1q, [$cc, #16] @ load rk1
+ ld1 {$rk2s}, [$cc], #16 @ load rk2
add $rctr32w, $rctr32w, #1 @ CTR block 3
fmov $ctr3.d[1], $ctr32x @ CTR block 3
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
+#ifndef __AARCH64EB__
ext $h3b, $h3b, $h3b, #8
-
+#endif
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
- ldr $rk2q, [$cc, #32] @ load rk2
+ ld1 {$rk3s}, [$cc], #16 @ load rk3
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
+#ifndef __AARCH64EB__
ext $h1b, $h1b, $h1b, #8
+#endif
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
- ldr $rk8q, [$cc, #128] @ load rk8
+ ld1 {$rk4s}, [$cc], #16 @ load rk4
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
- ldr $rk3q, [$cc, #48] @ load rk3
+ ld1 {$rk5s}, [$cc], #16 @ load rk5
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
- ldr $rk6q, [$cc, #96] @ load rk6
+ ld1 {$rk6s}, [$cc], #16 @ load rk6
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
- ldr $rk7q, [$cc, #112] @ load rk7
+ ld1 {$rk7s}, [$cc], #16 @ load rk7
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
- ldr $rk5q, [$cc, #80] @ load rk5
+ ld1 {$rk8s}, [$cc], #16 @ load rk8
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
+#ifndef __AARCH64EB__
ext $h2b, $h2b, $h2b, #8
+#endif
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
@@ -360,7 +374,7 @@ aes_gcm_enc_128_kernel:
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
- ldr $rk4q, [$cc, #64] @ load rk4
+ ld1 {$rk9s}, [$cc], #16 @ load rk9
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
@@ -422,13 +436,25 @@ aes_gcm_enc_128_kernel:
b.ge .L128_enc_tail @ handle tail
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l0, $input_l0
+ rev $input_h0, $input_h0
+#endif
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l2, $input_l2
+ rev $input_h2, $input_h2
+#endif
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l1, $input_l1
+ rev $input_h1, $input_h1
+#endif
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l3, $input_l3
+ rev $input_h3, $input_h3
+#endif
eor $input_l0, $input_l0, $rk10_l @ AES block 0 - round 10 low
eor $input_h0, $input_h0, $rk10_h @ AES block 0 - round 10 high
@@ -493,6 +519,10 @@ aes_gcm_enc_128_kernel:
.L128_enc_main_loop: @ main loop start
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
+#ifdef __AARCH64EB__
+ rev $input_l3, $input_l3
+ rev $input_h3, $input_h3
+#endif
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
@@ -521,7 +551,10 @@ aes_gcm_enc_128_kernel:
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l0, $input_l0
+ rev $input_h0, $input_h0
+#endif
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
rev $ctr32w, $rctr32w @ CTR block 4k+8
@@ -591,13 +624,19 @@ aes_gcm_enc_128_kernel:
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l1, $input_l1
+ rev $input_h1, $input_h1
+#endif
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l2, $input_l2
+ rev $input_h2, $input_h2
+#endif
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
@@ -854,7 +893,10 @@ aes_gcm_enc_128_kernel:
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l0, $input_l0
+ rev $input_h0, $input_h0
+#endif
cmp $main_end_input_ptr, #48
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
@@ -892,7 +934,10 @@ aes_gcm_enc_128_kernel:
st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
-
+#ifdef __AARCH64EB__
+ rev $input_l0, $input_l0
+ rev $input_h0, $input_h0
+#endif
rev64 $res0b, $res1b @ GHASH final-3 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
@@ -921,7 +966,10 @@ aes_gcm_enc_128_kernel:
rev64 $res0b, $res1b @ GHASH final-2 block
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
-
+#ifdef __AARCH64EB__
+ rev $input_l0, $input_l0
+ rev $input_h0, $input_h0
+#endif
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_l0, $input_l0, $rk10_l @ AES final-1 block - round 10 low
@@ -955,7 +1003,10 @@ aes_gcm_enc_128_kernel:
rev64 $res0b, $res1b @ GHASH final-1 block
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
-
+#ifdef __AARCH64EB__
+ rev $input_l0, $input_l0
+ rev $input_h0, $input_h0
+#endif
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_h0, $input_h0, $rk10_h @ AES final block - round 10 high
@@ -1018,9 +1069,11 @@ aes_gcm_enc_128_kernel:
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
-
+#ifndef __AARCH64EB__
rev $ctr32w, $rctr32w
-
+#else
+ mov $ctr32w, $rctr32w
+#endif
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
@@ -1105,20 +1158,29 @@ aes_gcm_dec_128_kernel:
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
-
+#ifdef __AARCH64EB__
+ rev $ctr96_b64x, $ctr96_b64x
+ rev $ctr96_t32x, $ctr96_t32x
+#endif
+ ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
+#ifdef __AARCH64EB__
+ ror $rk10_h, $rk10_h, 32
+ ror $rk10_l, $rk10_l, 32
+#endif
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
- ldr $rk0q, [$cc, #0] @ load rk0
+ ld1 {$rk0s}, [$cc], #16 @ load rk0
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
+#ifndef __AARCH64EB__
ext $h2b, $h2b, $h2b, #8
-
+#endif
lsr $rctr32x, $ctr96_t32x, #32
fmov $ctr2d, $ctr96_b64x @ CTR block 2
- ldr $rk1q, [$cc, #16] @ load rk1
+ ld1 {$rk1s}, [$cc], #16 @ load rk1
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
rev $rctr32w, $rctr32w @ rev_ctr32
@@ -1129,7 +1191,7 @@ aes_gcm_dec_128_kernel:
rev $ctr32w, $rctr32w @ CTR block 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
- ldr $rk2q, [$cc, #32] @ load rk2
+ ld1 {$rk2s}, [$cc], #16 @ load rk2
add $rctr32w, $rctr32w, #1 @ CTR block 1
fmov $ctr1.d[1], $ctr32x @ CTR block 1
@@ -1150,23 +1212,22 @@ aes_gcm_dec_128_kernel:
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
- ldr $rk3q, [$cc, #48] @ load rk3
+ ld1 {$rk3s}, [$cc], #16 @ load rk3
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
- ldr $rk6q, [$cc, #96] @ load rk6
+ ld1 {$rk4s}, [$cc], #16 @ load rk4
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
- ldr $rk7q, [$cc, #112] @ load rk7
+ ld1 {$rk5s}, [$cc], #16 @ load rk5
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
- ldr $rk4q, [$cc, #64] @ load rk4
+ ld1 {$rk6s}, [$cc], #16 @ load rk6
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
- ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
ld1 { $acc_lb}, [$current_tag]
@@ -1174,14 +1235,14 @@ aes_gcm_dec_128_kernel:
rev64 $acc_lb, $acc_lb
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
- ldr $rk5q, [$cc, #80] @ load rk5
+ ld1 {$rk7s}, [$cc], #16 @ load rk7
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
- ldr $rk9q, [$cc, #144] @ load rk9
+ ld1 {$rk8s}, [$cc], #16 @ load rk8
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
@@ -1189,10 +1250,11 @@ aes_gcm_dec_128_kernel:
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
+#ifndef __AARCH64EB__
ext $h3b, $h3b, $h3b, #8
-
+#endif
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
- ldr $rk8q, [$cc, #128] @ load rk8
+ ld1 {$rk9s}, [$cc], #16 @ load rk9
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
@@ -1204,8 +1266,9 @@ aes_gcm_dec_128_kernel:
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
+#ifndef __AARCH64EB__
ext $h1b, $h1b, $h1b, #8
-
+#endif
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
@@ -1218,7 +1281,9 @@ aes_gcm_dec_128_kernel:
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
+#ifndef __AARCH64EB__
ext $h4b, $h4b, $h4b, #8
+#endif
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
@@ -1252,12 +1317,10 @@ aes_gcm_dec_128_kernel:
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
b.ge .L128_dec_tail @ handle tail
- ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
-
- ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
+ ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0 - load ciphertext; AES block 1 - load ciphertext
eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
- ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
+ ld1 {$res2b}, [$input_ptr], #16 @ AES block 2 - load ciphertext
eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
rev64 $res0b, $res0b @ GHASH block 0
@@ -1265,10 +1328,9 @@ aes_gcm_dec_128_kernel:
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
add $rctr32w, $rctr32w, #1 @ CTR block 4
- ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
+ ld1 {$res3b}, [$input_ptr], #16 @ AES block 3 - load ciphertext
rev64 $res1b, $res1b @ GHASH block 1
- add $input_ptr, $input_ptr, #64 @ AES input_ptr update
mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
@@ -1283,7 +1345,9 @@ aes_gcm_dec_128_kernel:
fmov $ctr0.d[1], $ctr32x @ CTR block 4
rev $ctr32w, $rctr32w @ CTR block 5
eor $output_l1, $output_l1, $rk10_l @ AES block 1 - round 10 low
-
+#ifdef __AARCH64EB__
+ rev $output_l1, $output_l1
+#endif
fmov $ctr1d, $ctr96_b64x @ CTR block 5
add $rctr32w, $rctr32w, #1 @ CTR block 5
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
@@ -1295,10 +1359,19 @@ aes_gcm_dec_128_kernel:
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
eor $output_h1, $output_h1, $rk10_h @ AES block 1 - round 10 high
+#ifdef __AARCH64EB__
+ rev $output_h1, $output_h1
+#endif
eor $output_l0, $output_l0, $rk10_l @ AES block 0 - round 10 low
+#ifdef __AARCH64EB__
+ rev $output_l0, $output_l0
+#endif
eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
eor $output_h0, $output_h0, $rk10_h @ AES block 0 - round 10 high
+#ifdef __AARCH64EB__
+ rev $output_h0, $output_h0
+#endif
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
@@ -1358,9 +1431,14 @@ aes_gcm_dec_128_kernel:
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
-
+#ifdef __AARCH64EB__
+ rev $output_l3, $output_l3
+#endif
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
+#ifdef __AARCH64EB__
+ rev $output_h2, $output_h2
+#endif
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
@@ -1391,13 +1469,17 @@ aes_gcm_dec_128_kernel:
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
-
+#ifdef __AARCH64EB__
+ rev $output_h3, $output_h3
+#endif
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
-
+#ifdef __AARCH64EB__
+ rev $output_l2, $output_l2
+#endif
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
movi $mod_constant.8b, #0xc2
@@ -1414,7 +1496,7 @@ aes_gcm_dec_128_kernel:
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
- ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
+ ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+3 - load ciphertext
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
@@ -1435,7 +1517,7 @@ aes_gcm_dec_128_kernel:
rev $ctr32w, $rctr32w @ CTR block 4k+8
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
- ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
+ ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
@@ -1450,17 +1532,16 @@ aes_gcm_dec_128_kernel:
eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
- ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
+ ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
- ldr $res3q, [$input_ptr, #48] @ AES block 4k+3 - load ciphertext
+ ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
- add $input_ptr, $input_ptr, #64 @ AES input_ptr update
rev64 $res1b, $res1b @ GHASH block 4k+5
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
@@ -1482,11 +1563,15 @@ aes_gcm_dec_128_kernel:
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
-
+#ifdef __AARCH64EB__
+ rev $output_h0, $output_h0
+#endif
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
-
+#ifdef __AARCH64EB__
+ rev $output_l0, $output_l0
+#endif
eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
@@ -1503,9 +1588,15 @@ aes_gcm_dec_128_kernel:
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
eor $output_h1, $output_h1, $rk10_h @ AES block 4k+5 - round 10 high
+#ifdef __AARCH64EB__
+ rev $output_h1, $output_h1
+#endif
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
eor $output_l1, $output_l1, $rk10_l @ AES block 4k+5 - round 10 low
+#ifdef __AARCH64EB__
+ rev $output_l1, $output_l1
+#endif
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
@@ -1598,9 +1689,14 @@ aes_gcm_dec_128_kernel:
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
-
+#ifdef __AARCH64EB__
+ rev $output_l3, $output_l3
+#endif
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
+#ifdef __AARCH64EB__
+ rev $output_l2, $output_l2
+#endif
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
@@ -1654,7 +1750,9 @@ aes_gcm_dec_128_kernel:
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
-
+#ifdef __AARCH64EB__
+ rev $output_h3, $output_h3
+#endif
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
@@ -1667,7 +1765,9 @@ aes_gcm_dec_128_kernel:
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
-
+#ifdef __AARCH64EB__
+ rev $output_h2, $output_h2
+#endif
aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
@@ -1691,9 +1791,14 @@ aes_gcm_dec_128_kernel:
cmp $main_end_input_ptr, #48
eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
-
+#ifdef __AARCH64EB__
+ rev $output_h0, $output_h0
+#endif
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
+#ifdef __AARCH64EB__
+ rev $output_l0, $output_l0
+#endif
b.gt .L128_dec_blocks_more_than_3
mov $ctr3b, $ctr2b
@@ -1737,9 +1842,14 @@ aes_gcm_dec_128_kernel:
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $output_h0, $output_h0, $rk10_h @ AES final-2 block - round 10 high
-
+#ifdef __AARCH64EB__
+ rev $output_h0, $output_h0
+#endif
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
eor $output_l0, $output_l0, $rk10_l @ AES final-2 block - round 10 low
+#ifdef __AARCH64EB__
+ rev $output_l0, $output_l0
+#endif
.L128_dec_blocks_more_than_2: @ blocks left > 2
rev64 $res0b, $res1b @ GHASH final-2 block
@@ -1765,12 +1875,18 @@ aes_gcm_dec_128_kernel:
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
eor $output_l0, $output_l0, $rk10_l @ AES final-1 block - round 10 low
+#ifdef __AARCH64EB__
+ rev $output_l0, $output_l0
+#endif
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
eor $output_h0, $output_h0, $rk10_h @ AES final-1 block - round 10 high
+#ifdef __AARCH64EB__
+ rev $output_h0, $output_h0
+#endif
.L128_dec_blocks_more_than_1: @ blocks left > 1
rev64 $res0b, $res1b @ GHASH final-1 block
@@ -1801,8 +1917,13 @@ aes_gcm_dec_128_kernel:
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
eor $output_h0, $output_h0, $rk10_h @ AES final block - round 10 high
-
+#ifdef __AARCH64EB__
+ rev $output_h0, $output_h0
+#endif
eor $output_l0, $output_l0, $rk10_l @ AES final block - round 10 low
+#ifdef __AARCH64EB__
+ rev $output_l0, $output_l0
+#endif
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
.L128_dec_blocks_less_than_1: @ blocks left <= 1
@@ -1848,7 +1969,11 @@ aes_gcm_dec_128_kernel:
bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
and $output_l0, $output_l0, $ctr32x
+#ifndef __AARCH64EB__
rev $ctr32w, $rctr32w
+#else
+ mov $ctr32w, $rctr32w
+#endif
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
movi $mod_constant.8b, #0xc2
@@ -1957,6 +2082,7 @@ my $mod_t="v31";
my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11)=map("v$_.16b",(18..29));
my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q)=map("q$_",(18..29));
+my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s)=map("v$_.4s",(18..29));
my $rk2q1="v20.1q";
my $rk3q1="v21.1q";
my $rk4v="v22";
@@ -1988,18 +2114,26 @@ aes_gcm_enc_192_kernel:
stp d14, d15, [sp, #96]
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
+#ifdef __AARCH64EB__
+ rev $ctr96_b64x, $ctr96_b64x
+ rev $ctr96_t32x, $ctr96_t32x
+#endif
+ ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
+#ifdef __AARCH64EB__
+ ror $rk12_l, $rk12_l, #32
+ ror $rk12_h, $rk12_h, #32
+#endif
+ ld1 {$rk0s}, [$cc], #16 @ load rk0
- ldr $rk5q, [$cc, #80] @ load rk5
-
- ldr $rk4q, [$cc, #64] @ load rk4
+ ld1 {$rk1s}, [$cc], #16 @ load rk1
- ldr $rk8q, [$cc, #128] @ load rk8
+ ld1 {$rk2s}, [$cc], #16 @ load rk2
lsr $rctr32x, $ctr96_t32x, #32
- ldr $rk6q, [$cc, #96] @ load rk6
+ ld1 {$rk3s}, [$cc], #16 @ load rk3
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
- ldr $rk7q, [$cc, #112] @ load rk7
+ ld1 {$rk4s}, [$cc], #16 @ load rk4
rev $rctr32w, $rctr32w @ rev_ctr32
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
@@ -2023,15 +2157,13 @@ aes_gcm_enc_192_kernel:
rev $ctr32w, $rctr32w @ CTR block 3
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
- ldr $rk0q, [$cc, #0] @ load rk0
+ ld1 {$rk5s}, [$cc], #16 @ load rk5
fmov $ctr3.d[1], $ctr32x @ CTR block 3
- ldr $rk3q, [$cc, #48] @ load rk3
-
- ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
+ ld1 {$rk6s}, [$cc], #16 @ load rk6
- ldr $rk1q, [$cc, #16] @ load rk1
+ ld1 {$rk7s}, [$cc], #16 @ load rk7
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
ld1 { $acc_lb}, [$current_tag]
@@ -2039,29 +2171,32 @@ aes_gcm_enc_192_kernel:
rev64 $acc_lb, $acc_lb
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
- ldr $rk11q, [$cc, #176] @ load rk11
+ ld1 {$rk8s}, [$cc], #16 @ load rk8
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
+#ifndef __AARCH64EB__
ext $h4b, $h4b, $h4b, #8
-
+#endif
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
- ldr $rk2q, [$cc, #32] @ load rk2
+ ld1 {$rk9s}, [$cc], #16 @ load rk9
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
- ldr $rk10q, [$cc, #160] @ load rk10
+ ld1 {$rk10s}, [$cc], #16 @ load rk10
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
+#ifndef __AARCH64EB__
ext $h1b, $h1b, $h1b, #8
-
+#endif
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
- ldr $rk9q, [$cc, #144] @ load rk9
+ ld1 {$rk11s}, [$cc], #16 @ load rk11
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
+#ifndef __AARCH64EB__
ext $h3b, $h3b, $h3b, #8
-
+#endif
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
@@ -2100,8 +2235,9 @@ aes_gcm_enc_192_kernel:
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
+#ifndef __AARCH64EB__
ext $h2b, $h2b, $h2b, #8
-
+#endif
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
@@ -2163,13 +2299,26 @@ aes_gcm_enc_192_kernel:
rev $ctr32w, $rctr32w @ CTR block 4
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l0, $input_l0
+ rev $input_h0, $input_h0
+#endif
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l2, $input_l2
+ rev $input_h2, $input_h2
+#endif
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l3, $input_l3
+ rev $input_h3, $input_h3
+#endif
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
+#ifdef __AARCH64EB__
+ rev $input_l1, $input_l1
+ rev $input_h1, $input_h1
+#endif
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
@@ -2239,7 +2388,10 @@ aes_gcm_enc_192_kernel:
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l1, $input_l1
+ rev $input_h1, $input_h1
+#endif
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
@@ -2250,10 +2402,16 @@ aes_gcm_enc_192_kernel:
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l2, $input_l2
+ rev $input_h2, $input_h2
+#endif
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l3, $input_l3
+ rev $input_h3, $input_h3
+#endif
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
eor $res0b, $res0b, $acc_lb @ PRE 1
@@ -2330,7 +2488,10 @@ aes_gcm_enc_192_kernel:
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l0, $input_l0
+ rev $input_h0, $input_h0
+#endif
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
@@ -2627,7 +2788,10 @@ aes_gcm_enc_192_kernel:
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l0, $input_l0
+ rev $input_h0, $input_h0
+#endif
eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
@@ -2664,7 +2828,10 @@ aes_gcm_enc_192_kernel:
st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
-
+#ifdef __AARCH64EB__
+ rev $input_l0, $input_l0
+ rev $input_h0, $input_h0
+#endif
rev64 $res0b, $res1b @ GHASH final-3 block
eor $input_l0, $input_l0, $rk12_l @ AES final-2 block - round 12 low
@@ -2695,7 +2862,10 @@ aes_gcm_enc_192_kernel:
rev64 $res0b, $res1b @ GHASH final-2 block
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
-
+#ifdef __AARCH64EB__
+ rev $input_l0, $input_l0
+ rev $input_h0, $input_h0
+#endif
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_h0, $input_h0, $rk12_h @ AES final-1 block - round 12 high
@@ -2726,7 +2896,10 @@ aes_gcm_enc_192_kernel:
st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
-
+#ifdef __AARCH64EB__
+ rev $input_l0, $input_l0
+ rev $input_h0, $input_h0
+#endif
rev64 $res0b, $res1b @ GHASH final-1 block
eor $input_l0, $input_l0, $rk12_l @ AES final block - round 12 low
@@ -2758,7 +2931,11 @@ aes_gcm_enc_192_kernel:
.L192_enc_blocks_less_than_1: @ blocks left <= 1
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
+#ifndef __AARCH64EB__
rev $ctr32w, $rctr32w
+#else
+ mov $ctr32w, $rctr32w
+#endif
and $bit_length, $bit_length, #127 @ bit_length %= 128
sub $bit_length, $bit_length, #128 @ bit_length -= 128
@@ -2875,14 +3052,22 @@ aes_gcm_dec_192_kernel:
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
-
+#ifdef __AARCH64EB__
+ rev $ctr96_b64x, $ctr96_b64x
+ rev $ctr96_t32x, $ctr96_t32x
+#endif
+ ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
+#ifdef __AARCH64EB__
+ ror $rk12_l, $rk12_l, #32
+ ror $rk12_h, $rk12_h, #32
+#endif
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
- ldr $rk0q, [$cc, #0] @ load rk0
+ ld1 {$rk0s}, [$cc], #16 @ load rk0
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
- ldr $rk2q, [$cc, #32] @ load rk2
+ ld1 {$rk1s}, [$cc], #16 @ load rk1
lsr $rctr32x, $ctr96_t32x, #32
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
@@ -2892,14 +3077,14 @@ aes_gcm_dec_192_kernel:
fmov $ctr1d, $ctr96_b64x @ CTR block 1
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
- ldr $rk1q, [$cc, #16] @ load rk1
+ ld1 {$rk2s}, [$cc], #16 @ load rk2
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
rev $ctr32w, $rctr32w @ CTR block 1
add $rctr32w, $rctr32w, #1 @ CTR block 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
- ldr $rk3q, [$cc, #48] @ load rk3
+ ld1 {$rk3s}, [$cc], #16 @ load rk3
fmov $ctr1.d[1], $ctr32x @ CTR block 1
rev $ctr32w, $rctr32w @ CTR block 2
@@ -2916,43 +3101,46 @@ aes_gcm_dec_192_kernel:
fmov $ctr3.d[1], $ctr32x @ CTR block 3
- ldr $rk8q, [$cc, #128] @ load rk8
+ ld1 {$rk4s}, [$cc], #16 @ load rk4
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
- ldr $rk11q, [$cc, #176] @ load rk11
+ ld1 {$rk5s}, [$cc], #16 @ load rk5
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
+#ifndef __AARCH64EB__
ext $h4b, $h4b, $h4b, #8
-
+#endif
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
+#ifndef __AARCH64EB__
ext $h2b, $h2b, $h2b, #8
-
+#endif
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
+#ifndef __AARCH64EB__
ext $h3b, $h3b, $h3b, #8
-
+#endif
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
- ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
+#ifndef __AARCH64EB__
ext $h1b, $h1b, $h1b, #8
-
+#endif
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
- ldr $rk10q, [$cc, #160] @ load rk10
+ ld1 {$rk6s}, [$cc], #16 @ load rk6
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
- ldr $rk9q, [$cc, #144] @ load rk9
+ ld1 {$rk7s}, [$cc], #16 @ load rk7
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
- ldr $rk7q, [$cc, #112] @ load rk7
+ ld1 {$rk8s}, [$cc], #16 @ load rk8
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
- ldr $rk4q, [$cc, #64] @ load rk4
+ ld1 {$rk9s}, [$cc], #16 @ load rk9
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
ld1 { $acc_lb}, [$current_tag]
@@ -2966,7 +3154,7 @@ aes_gcm_dec_192_kernel:
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
- ldr $rk5q, [$cc, #80] @ load rk5
+ ld1 {$rk10s}, [$cc], #16 @ load rk10
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
@@ -2977,7 +3165,7 @@ aes_gcm_dec_192_kernel:
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
- ldr $rk6q, [$cc, #96] @ load rk6
+ ld1 {$rk11s}, [$cc], #16 @ load rk11
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
@@ -3041,17 +3229,13 @@ aes_gcm_dec_192_kernel:
aese $ctr0b, $rk11 @ AES block 0 - round 11
b.ge .L192_dec_tail @ handle tail
- ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
-
- ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
+ ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0,1 - load ciphertext
eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
rev $ctr32w, $rctr32w @ CTR block 4
- ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
-
- ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
+ ld1 {$res2b, $res3b}, [$input_ptr], #32 @ AES block 2,3 - load ciphertext
mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
@@ -3063,27 +3247,35 @@ aes_gcm_dec_192_kernel:
mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
rev64 $res0b, $res0b @ GHASH block 0
- add $input_ptr, $input_ptr, #64 @ AES input_ptr update
fmov $ctr0d, $ctr96_b64x @ CTR block 4
rev64 $res1b, $res1b @ GHASH block 1
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
eor $output_l1, $output_l1, $rk12_l @ AES block 1 - round 12 low
+#ifdef __AARCH64EB__
+ rev $output_l1, $output_l1
+#endif
fmov $ctr0.d[1], $ctr32x @ CTR block 4
rev $ctr32w, $rctr32w @ CTR block 5
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
fmov $ctr1d, $ctr96_b64x @ CTR block 5
eor $output_h1, $output_h1, $rk12_h @ AES block 1 - round 12 high
-
+#ifdef __AARCH64EB__
+ rev $output_h1, $output_h1
+#endif
add $rctr32w, $rctr32w, #1 @ CTR block 5
fmov $ctr1.d[1], $ctr32x @ CTR block 5
eor $output_l0, $output_l0, $rk12_l @ AES block 0 - round 12 low
-
+#ifdef __AARCH64EB__
+ rev $output_l0, $output_l0
+#endif
rev $ctr32w, $rctr32w @ CTR block 6
eor $output_h0, $output_h0, $rk12_h @ AES block 0 - round 12 high
-
+#ifdef __AARCH64EB__
+ rev $output_h0, $output_h0
+#endif
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
@@ -3138,7 +3330,9 @@ aes_gcm_dec_192_kernel:
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
-
+#ifdef __AARCH64EB__
+ rev $output_h2, $output_h2
+#endif
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
@@ -3152,7 +3346,9 @@ aes_gcm_dec_192_kernel:
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
-
+#ifdef __AARCH64EB__
+ rev $output_l2, $output_l2
+#endif
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
@@ -3230,15 +3426,17 @@ aes_gcm_dec_192_kernel:
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
- ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
+ ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
- ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
+ ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext
eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
-
+#ifdef __AARCH64EB__
+ rev $output_l3, $output_l3
+#endif
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
@@ -3249,10 +3447,10 @@ aes_gcm_dec_192_kernel:
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
- ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
+ ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext
aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
- ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
+ ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+7 - load ciphertext
rev $ctr32w, $rctr32w @ CTR block 4k+8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
@@ -3261,11 +3459,13 @@ aes_gcm_dec_192_kernel:
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
- add $input_ptr, $input_ptr, #64 @ AES input_ptr update
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
+#ifdef __AARCH64EB__
+ rev $output_h3, $output_h3
+#endif
eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
@@ -3295,18 +3495,28 @@ aes_gcm_dec_192_kernel:
rev $ctr32w, $rctr32w @ CTR block 4k+9
eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
+#ifdef __AARCH64EB__
+ rev $output_l0, $output_l0
+#endif
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
eor $output_l1, $output_l1, $rk12_l @ AES block 4k+5 - round 12 low
-
+#ifdef __AARCH64EB__
+ rev $output_l1, $output_l1
+#endif
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
rev $ctr32w, $rctr32w @ CTR block 4k+10
eor $output_h1, $output_h1, $rk12_h @ AES block 4k+5 - round 12 high
-
+#ifdef __AARCH64EB__
+ rev $output_h1, $output_h1
+#endif
eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
+#ifdef __AARCH64EB__
+ rev $output_h0, $output_h0
+#endif
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
@@ -3355,17 +3565,28 @@ aes_gcm_dec_192_kernel:
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
+#ifdef __AARCH64EB__
+ rev $output_h3, $output_h3
+#endif
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
-
+#ifdef __AARCH64EB__
+ rev $output_l2, $output_l2
+#endif
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
+#ifdef __AARCH64EB__
+ rev $output_h2, $output_h2
+#endif
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
+#ifdef __AARCH64EB__
+ rev $output_l3, $output_l3
+#endif
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
rev64 $res3b, $res3b @ GHASH block 4k+3
@@ -3517,8 +3738,13 @@ aes_gcm_dec_192_kernel:
cmp $main_end_input_ptr, #48
eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
-
+#ifdef __AARCH64EB__
+ rev $output_h0, $output_h0
+#endif
eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
+#ifdef __AARCH64EB__
+ rev $output_l0, $output_l0
+#endif
b.gt .L192_dec_blocks_more_than_3
movi $acc_l.8b, #0
@@ -3562,10 +3788,16 @@ aes_gcm_dec_192_kernel:
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
eor $output_l0, $output_l0, $rk12_l @ AES final-2 block - round 12 low
+#ifdef __AARCH64EB__
+ rev $output_l0, $output_l0
+#endif
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
eor $output_h0, $output_h0, $rk12_h @ AES final-2 block - round 12 high
+#ifdef __AARCH64EB__
+ rev $output_h0, $output_h0
+#endif
.L192_dec_blocks_more_than_2: @ blocks left > 2
rev64 $res0b, $res1b @ GHASH final-2 block
@@ -3595,8 +3827,13 @@ aes_gcm_dec_192_kernel:
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
eor $output_h0, $output_h0, $rk12_h @ AES final-1 block - round 12 high
-
+#ifdef __AARCH64EB__
+ rev $output_h0, $output_h0
+#endif
eor $output_l0, $output_l0, $rk12_l @ AES final-1 block - round 12 low
+#ifdef __AARCH64EB__
+ rev $output_l0, $output_l0
+#endif
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
.L192_dec_blocks_more_than_1: @ blocks left > 1
@@ -3627,9 +3864,13 @@ aes_gcm_dec_192_kernel:
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
eor $output_h0, $output_h0, $rk12_h @ AES final block - round 12 high
-
+#ifdef __AARCH64EB__
+ rev $output_h0, $output_h0
+#endif
eor $output_l0, $output_l0, $rk12_l @ AES final block - round 12 low
-
+#ifdef __AARCH64EB__
+ rev $output_l0, $output_l0
+#endif
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
.L192_dec_blocks_less_than_1: @ blocks left <= 1
@@ -3656,8 +3897,11 @@ aes_gcm_dec_192_kernel:
orr $output_l0, $output_l0, $end_input_ptr
mov $ctr0.d[1], $ctr96_b64x
-
+#ifndef __AARCH64EB__
rev $ctr32w, $rctr32w
+#else
+ mov $ctr32w, $rctr32w
+#endif
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
str $ctr32w, [$counter, #12] @ store the updated counter
@@ -3782,6 +4026,7 @@ my $mod_constant="v8";
my $mod_t="v7";
my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map("v$_.16b",(18..31));
+my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s,$rk12s,$rk13s)=map("v$_.4s",(18..31));
my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map("q$_",(18..31));
my $rk2q1="v20.1q";
my $rk3q1="v21.1q";
@@ -3817,14 +4062,22 @@ aes_gcm_enc_256_kernel:
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
-
+#ifdef __AARCH64EB__
+ rev $ctr96_b64x, $ctr96_b64x
+ rev $ctr96_t32x, $ctr96_t32x
+#endif
+ ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
+#ifdef __AARCH64EB__
+ ror $rk14_l, $rk14_l, #32
+ ror $rk14_h, $rk14_h, #32
+#endif
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
- ldr $rk0q, [$cc, #0] @ load rk0
+ ld1 {$rk0s}, [$cc], #16 @ load rk0
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
- ldr $rk7q, [$cc, #112] @ load rk7
+ ld1 {$rk1s}, [$cc], #16 @ load rk1
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
lsr $rctr32x, $ctr96_t32x, #32
@@ -3843,14 +4096,14 @@ aes_gcm_enc_256_kernel:
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
add $rctr32w, $rctr32w, #1 @ CTR block 1
- ldr $rk1q, [$cc, #16] @ load rk1
+ ld1 {$rk2s}, [$cc], #16 @ load rk2
fmov $ctr1.d[1], $ctr32x @ CTR block 1
rev $ctr32w, $rctr32w @ CTR block 2
add $rctr32w, $rctr32w, #1 @ CTR block 2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
- ldr $rk2q, [$cc, #32] @ load rk2
+ ld1 {$rk3s}, [$cc], #16 @ load rk3
fmov $ctr2.d[1], $ctr32x @ CTR block 2
rev $ctr32w, $rctr32w @ CTR block 3
@@ -3861,46 +4114,48 @@ aes_gcm_enc_256_kernel:
fmov $ctr3.d[1], $ctr32x @ CTR block 3
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
- ldr $rk3q, [$cc, #48] @ load rk3
+ ld1 {$rk4s}, [$cc], #16 @ load rk4
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
- ldr $rk6q, [$cc, #96] @ load rk6
+ ld1 {$rk5s}, [$cc], #16 @ load rk5
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
- ldr $rk5q, [$cc, #80] @ load rk5
+ ld1 {$rk6s}, [$cc], #16 @ load rk6
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
+#ifndef __AARCH64EB__
ext $h3b, $h3b, $h3b, #8
-
+#endif
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
- ldr $rk13q, [$cc, #208] @ load rk13
+ ld1 {$rk7s}, [$cc], #16 @ load rk7
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
- ldr $rk4q, [$cc, #64] @ load rk4
+ ld1 {$rk8s}, [$cc], #16 @ load rk8
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
+#ifndef __AARCH64EB__
ext $h2b, $h2b, $h2b, #8
-
+#endif
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
- ldr $rk12q, [$cc, #192] @ load rk12
+ ld1 {$rk9s}, [$cc], #16 @ load rk9
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
+#ifndef __AARCH64EB__
ext $h4b, $h4b, $h4b, #8
-
+#endif
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
- ldr $rk11q, [$cc, #176] @ load rk11
+ ld1 {$rk10s}, [$cc], #16 @ load rk10
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
- ldr $rk8q, [$cc, #128] @ load rk8
+ ld1 {$rk11s}, [$cc], #16 @ load rk11
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
add $rctr32w, $rctr32w, #1 @ CTR block 3
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
- ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
ld1 { $acc_lb}, [$current_tag]
@@ -3927,14 +4182,15 @@ aes_gcm_enc_256_kernel:
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
- ldr $rk9q, [$cc, #144] @ load rk9
+ ld1 {$rk12s}, [$cc], #16 @ load rk12
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
+#ifndef __AARCH64EB__
ext $h1b, $h1b, $h1b, #8
-
+#endif
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
- ldr $rk10q, [$cc, #160] @ load rk10
+ ld1 {$rk13s}, [$cc], #16 @ load rk13
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
@@ -3999,13 +4255,26 @@ aes_gcm_enc_256_kernel:
b.ge .L256_enc_tail @ handle tail
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l1, $input_l1
+ rev $input_h1, $input_h1
+#endif
rev $ctr32w, $rctr32w @ CTR block 4
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l0, $input_l0
+ rev $input_h0, $input_h0
+#endif
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l3, $input_l3
+ rev $input_h3, $input_h3
+#endif
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
+#ifdef __AARCH64EB__
+ rev $input_l2, $input_l2
+ rev $input_h2, $input_h2
+#endif
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
eor $input_l1, $input_l1, $rk14_l @ AES block 1 - round 14 low
@@ -4083,10 +4352,16 @@ aes_gcm_enc_256_kernel:
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+7 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l3, $input_l3
+ rev $input_h3, $input_h3
+#endif
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l2, $input_l2
+ rev $input_h2, $input_h2
+#endif
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
eor $res0b, $res0b, $acc_lb @ PRE 1
@@ -4172,7 +4447,10 @@ aes_gcm_enc_256_kernel:
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l1, $input_l1
+ rev $input_h1, $input_h1
+#endif
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
@@ -4202,7 +4480,10 @@ aes_gcm_enc_256_kernel:
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l0, $input_l0
+ rev $input_h0, $input_h0
+#endif
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
@@ -4497,7 +4778,10 @@ aes_gcm_enc_256_kernel:
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
-
+#ifdef __AARCH64EB__
+ rev $input_l0, $input_l0
+ rev $input_h0, $input_h0
+#endif
eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
@@ -4532,7 +4816,10 @@ aes_gcm_enc_256_kernel:
st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
-
+#ifdef __AARCH64EB__
+ rev $input_l0, $input_l0
+ rev $input_h0, $input_h0
+#endif
rev64 $res0b, $res1b @ GHASH final-3 block
eor $input_l0, $input_l0, $rk14_l @ AES final-2 block - round 14 low
@@ -4561,7 +4848,10 @@ aes_gcm_enc_256_kernel:
st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
-
+#ifdef __AARCH64EB__
+ rev $input_l0, $input_l0
+ rev $input_h0, $input_h0
+#endif
rev64 $res0b, $res1b @ GHASH final-2 block
eor $input_l0, $input_l0, $rk14_l @ AES final-1 block - round 14 low
@@ -4597,7 +4887,10 @@ aes_gcm_enc_256_kernel:
rev64 $res0b, $res1b @ GHASH final-1 block
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
-
+#ifdef __AARCH64EB__
+ rev $input_l0, $input_l0
+ rev $input_h0, $input_h0
+#endif
eor $res0b, $res0b, $t0.16b @ feed in partial tag
movi $t0.8b, #0 @ suppress further partial tag feed in
@@ -4658,7 +4951,11 @@ aes_gcm_enc_256_kernel:
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
mov $t0d, $res0.d[1] @ GHASH final block - mid
+#ifndef __AARCH64EB__
rev $ctr32w, $rctr32w
+#else
+ mov $ctr32w, $rctr32w
+#endif
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
@@ -4749,21 +5046,29 @@ aes_gcm_dec_256_kernel:
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
-
- ldr $rk8q, [$cc, #128] @ load rk8
+#ifdef __AARCH64EB__
+ rev $ctr96_b64x, $ctr96_b64x
+ rev $ctr96_t32x, $ctr96_t32x
+#endif
+ ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
+#ifdef __AARCH64EB__
+ ror $rk14_h, $rk14_h, #32
+ ror $rk14_l, $rk14_l, #32
+#endif
+ ld1 {$rk0s}, [$cc], #16 @ load rk0
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
- ldr $rk7q, [$cc, #112] @ load rk7
+ ld1 {$rk1s}, [$cc], #16 @ load rk1
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
- ldr $rk6q, [$cc, #96] @ load rk6
+ ld1 {$rk2s}, [$cc], #16 @ load rk2
lsr $rctr32x, $ctr96_t32x, #32
- ldr $rk5q, [$cc, #80] @ load rk5
+ ld1 {$rk3s}, [$cc], #16 @ load rk3
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
- ldr $rk3q, [$cc, #48] @ load rk3
+ ld1 {$rk4s}, [$cc], #16 @ load rk4
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
rev $rctr32w, $rctr32w @ rev_ctr32
@@ -4788,34 +5093,39 @@ aes_gcm_dec_256_kernel:
rev $ctr32w, $rctr32w @ CTR block 3
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
- ldr $rk0q, [$cc, #0] @ load rk0
+ ld1 {$rk5s}, [$cc], #16 @ load rk5
fmov $ctr3.d[1], $ctr32x @ CTR block 3
add $rctr32w, $rctr32w, #1 @ CTR block 3
- ldr $rk4q, [$cc, #64] @ load rk4
+ ld1 {$rk6s}, [$cc], #16 @ load rk6
- ldr $rk13q, [$cc, #208] @ load rk13
+ ld1 {$rk7s}, [$cc], #16 @ load rk7
- ldr $rk1q, [$cc, #16] @ load rk1
+ ld1 {$rk8s}, [$cc], #16 @ load rk8
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
+#ifndef __AARCH64EB__
ext $h3b, $h3b, $h3b, #8
+#endif
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
+#ifndef __AARCH64EB__
ext $h4b, $h4b, $h4b, #8
+#endif
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
+#ifndef __AARCH64EB__
ext $h2b, $h2b, $h2b, #8
+#endif
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
- ldr $rk2q, [$cc, #32] @ load rk2
+ ld1 {$rk9s}, [$cc], #16 @ load rk9
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
- ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
ld1 { $acc_lb}, [$current_tag]
@@ -4823,17 +5133,18 @@ aes_gcm_dec_256_kernel:
rev64 $acc_lb, $acc_lb
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
- ldr $rk9q, [$cc, #144] @ load rk9
+ ld1 {$rk10s}, [$cc], #16 @ load rk10
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
- ldr $rk12q, [$cc, #192] @ load rk12
+ ld1 {$rk11s}, [$cc], #16 @ load rk11
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
+#ifndef __AARCH64EB__
ext $h1b, $h1b, $h1b, #8
-
+#endif
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
- ldr $rk10q, [$cc, #160] @ load rk10
+ ld1 {$rk12s}, [$cc], #16 @ load rk12
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
@@ -4889,7 +5200,7 @@ aes_gcm_dec_256_kernel:
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
- ldr $rk11q, [$cc, #176] @ load rk11
+ ld1 {$rk13s}, [$cc], #16 @ load rk13
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
@@ -4939,9 +5250,7 @@ aes_gcm_dec_256_kernel:
aese $ctr0b, $rk13 @ AES block 0 - round 13
b.ge .L256_dec_tail @ handle tail
- ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
-
- ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
+ ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0,1 - load ciphertext
rev $ctr32w, $rctr32w @ CTR block 4
@@ -4949,7 +5258,7 @@ aes_gcm_dec_256_kernel:
eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
rev64 $res1b, $res1b @ GHASH block 1
- ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
+ ld1 {$res2b}, [$input_ptr], #16 @ AES block 2 - load ciphertext
mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
@@ -4969,22 +5278,32 @@ aes_gcm_dec_256_kernel:
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
eor $output_h0, $output_h0, $rk14_h @ AES block 0 - round 14 high
-
+#ifdef __AARCH64EB__
+ rev $output_h0, $output_h0
+#endif
eor $output_l0, $output_l0, $rk14_l @ AES block 0 - round 14 low
+#ifdef __AARCH64EB__
+ rev $output_l0, $output_l0
+#endif
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
fmov $ctr1d, $ctr96_b64x @ CTR block 5
- ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
- add $input_ptr, $input_ptr, #64 @ AES input_ptr update
+ ld1 {$res3b}, [$input_ptr], #16 @ AES block 3 - load ciphertext
fmov $ctr1.d[1], $ctr32x @ CTR block 5
rev $ctr32w, $rctr32w @ CTR block 6
add $rctr32w, $rctr32w, #1 @ CTR block 6
eor $output_l1, $output_l1, $rk14_l @ AES block 1 - round 14 low
+#ifdef __AARCH64EB__
+ rev $output_l1, $output_l1
+#endif
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
eor $output_h1, $output_h1, $rk14_h @ AES block 1 - round 14 high
+#ifdef __AARCH64EB__
+ rev $output_h1, $output_h1
+#endif
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
@@ -5027,7 +5346,9 @@ aes_gcm_dec_256_kernel:
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
-
+#ifdef __AARCH64EB__
+ rev $output_h2, $output_h2
+#endif
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
@@ -5036,7 +5357,9 @@ aes_gcm_dec_256_kernel:
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
-
+#ifdef __AARCH64EB__
+ rev $output_l2, $output_l2
+#endif
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
@@ -5049,9 +5372,14 @@ aes_gcm_dec_256_kernel:
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
-
+#ifdef __AARCH64EB__
+ rev $output_l3, $output_l3
+#endif
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
+#ifdef __AARCH64EB__
+ rev $output_h3, $output_h3
+#endif
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
@@ -5145,7 +5473,7 @@ aes_gcm_dec_256_kernel:
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
- ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
+ ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
@@ -5154,7 +5482,7 @@ aes_gcm_dec_256_kernel:
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
- ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
+ ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
@@ -5166,10 +5494,10 @@ aes_gcm_dec_256_kernel:
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
- ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
+ ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
- ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
+ ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+7 - load ciphertext
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
@@ -5178,7 +5506,6 @@ aes_gcm_dec_256_kernel:
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
- add $input_ptr, $input_ptr, #64 @ AES input_ptr update
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
@@ -5198,8 +5525,13 @@ aes_gcm_dec_256_kernel:
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
+#ifdef __AARCH64EB__
+ rev $output_l0, $output_l0
+#endif
eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
-
+#ifdef __AARCH64EB__
+ rev $output_h0, $output_h0
+#endif
mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
@@ -5219,9 +5551,15 @@ aes_gcm_dec_256_kernel:
rev64 $res1b, $res1b @ GHASH block 4k+5
eor $output_h1, $output_h1, $rk14_h @ AES block 4k+5 - round 14 high
+#ifdef __AARCH64EB__
+ rev $output_h1, $output_h1
+#endif
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
eor $output_l1, $output_l1, $rk14_l @ AES block 4k+5 - round 14 low
+#ifdef __AARCH64EB__
+ rev $output_l1, $output_l1
+#endif
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
rev64 $res0b, $res0b @ GHASH block 4k+4
@@ -5385,10 +5723,14 @@ aes_gcm_dec_256_kernel:
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
-
+#ifdef __AARCH64EB__
+ rev $output_h2, $output_h2
+#endif
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
-
+#ifdef __AARCH64EB__
+ rev $output_l3, $output_l3
+#endif
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
@@ -5397,11 +5739,17 @@ aes_gcm_dec_256_kernel:
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
+#ifdef __AARCH64EB__
+ rev $output_l2, $output_l2
+#endif
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
+#ifdef __AARCH64EB__
+ rev $output_h3, $output_h3
+#endif
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
@@ -5438,8 +5786,14 @@ aes_gcm_dec_256_kernel:
cmp $main_end_input_ptr, #48
eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
+#ifdef __AARCH64EB__
+ rev $output_l0, $output_l0
+#endif
eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
+#ifdef __AARCH64EB__
+ rev $output_h0, $output_h0
+#endif
b.gt .L256_dec_blocks_more_than_3
sub $rctr32w, $rctr32w, #1
@@ -5487,9 +5841,15 @@ aes_gcm_dec_256_kernel:
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
eor $output_l0, $output_l0, $rk14_l @ AES final-2 block - round 14 low
+#ifdef __AARCH64EB__
+ rev $output_l0, $output_l0
+#endif
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
eor $output_h0, $output_h0, $rk14_h @ AES final-2 block - round 14 high
+#ifdef __AARCH64EB__
+ rev $output_h0, $output_h0
+#endif
.L256_dec_blocks_more_than_2: @ blocks left > 2
rev64 $res0b, $res1b @ GHASH final-2 block
@@ -5517,9 +5877,15 @@ aes_gcm_dec_256_kernel:
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
eor $output_l0, $output_l0, $rk14_l @ AES final-1 block - round 14 low
+#ifdef __AARCH64EB__
+ rev $output_l0, $output_l0
+#endif
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
eor $output_h0, $output_h0, $rk14_h @ AES final-1 block - round 14 high
+#ifdef __AARCH64EB__
+ rev $output_h0, $output_h0
+#endif
.L256_dec_blocks_more_than_1: @ blocks left > 1
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
@@ -5547,13 +5913,18 @@ aes_gcm_dec_256_kernel:
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
eor $output_l0, $output_l0, $rk14_l @ AES final block - round 14 low
-
+#ifdef __AARCH64EB__
+ rev $output_l0, $output_l0
+#endif
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
eor $output_h0, $output_h0, $rk14_h @ AES final block - round 14 high
+#ifdef __AARCH64EB__
+ rev $output_h0, $output_h0
+#endif
.L256_dec_blocks_less_than_1: @ blocks left <= 1
and $bit_length, $bit_length, #127 @ bit_length %= 128
@@ -5579,7 +5950,11 @@ aes_gcm_dec_256_kernel:
mov $ctr0.d[1], $ctr96_b64x
bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
+#ifndef __AARCH64EB__
rev $ctr32w, $rctr32w
+#else
+ mov $ctr32w, $rctr32w
+#endif
bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
@@ -5720,7 +6095,7 @@ if ($flavour =~ /64/) { ######## 64-bit code
if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
print " it $2\n";
}
-
+ s/__AARCH64E([BL])__/__ARME$1__/go;
print $_,"\n";
}
}