From 507356598bc1e17a08f9737df50e1a19525efa88 Mon Sep 17 00:00:00 2001 From: JerryDevis Date: Sat, 11 Mar 2023 18:35:23 +0800 Subject: aes-gcm-armv8_64 asm support big-endian Reviewed-by: Tom Cosgrove Reviewed-by: Tomas Mraz (Merged from https://github.com/openssl/openssl/pull/20489) (cherry picked from commit 32344a74b7ee2693a5bfda361c40ec60ab5be624) --- crypto/modes/asm/aes-gcm-armv8_64.pl | 787 ++++++++++++++++++++++++++--------- 1 file changed, 581 insertions(+), 206 deletions(-) diff --git a/crypto/modes/asm/aes-gcm-armv8_64.pl b/crypto/modes/asm/aes-gcm-armv8_64.pl index ff5809ec22..b5e7691427 100755 --- a/crypto/modes/asm/aes-gcm-armv8_64.pl +++ b/crypto/modes/asm/aes-gcm-armv8_64.pl @@ -217,6 +217,7 @@ my $mod_constant="v8"; my $mod_t="v31"; my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9)=map("v$_.16b",(18..27)); +my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s)=map("v$_.4s",(18..27)); my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q)=map("q$_",(18..27)); my $rk2q1="v20.1q"; my $rk3q1="v21.1q"; @@ -269,28 +270,36 @@ aes_gcm_enc_128_kernel: stp d14, d15, [sp, #96] ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev $ctr96_b64x, $ctr96_b64x + rev $ctr96_t32x, $ctr96_t32x +#endif ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10 - +#ifdef __AARCH64EB__ + ror $rk10_l, $rk10_l, #32 + ror $rk10_h, $rk10_h, #32 +#endif ld1 {$acc_lb}, [$current_tag] ext $acc_lb, $acc_lb, $acc_lb, #8 rev64 $acc_lb, $acc_lb lsr $main_end_input_ptr, $bit_length, #3 @ byte_len mov $len, $main_end_input_ptr - ldr $rk9q, [$cc, #144] @ load rk9 + ld1 {$rk0s}, [$cc], #16 @ load rk0 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 lsr $rctr32x, $ctr96_t32x, #32 ldr $h4q, [$current_tag, #112] @ load h4l | h4h +#ifndef __AARCH64EB__ ext $h4b, $h4b, $h4b, #8 - +#endif fmov $ctr1d, $ctr96_b64x @ CTR block 1 rev $rctr32w, $rctr32w @ rev_ctr32 
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w - ldr $rk0q, [$cc, #0] @ load rk0 + ld1 {$rk1s}, [$cc], #16 @ load rk1 rev $ctr32w, $rctr32w @ CTR block 1 add $rctr32w, $rctr32w, #1 @ CTR block 1 @@ -310,45 +319,50 @@ aes_gcm_enc_128_kernel: rev $ctr32w, $rctr32w @ CTR block 3 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 - ldr $rk1q, [$cc, #16] @ load rk1 + ld1 {$rk2s}, [$cc], #16 @ load rk2 add $rctr32w, $rctr32w, #1 @ CTR block 3 fmov $ctr3.d[1], $ctr32x @ CTR block 3 ldr $h3q, [$current_tag, #80] @ load h3l | h3h +#ifndef __AARCH64EB__ ext $h3b, $h3b, $h3b, #8 - +#endif aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 - ldr $rk2q, [$cc, #32] @ load rk2 + ld1 {$rk3s}, [$cc], #16 @ load rk3 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 ldr $h1q, [$current_tag, #32] @ load h1l | h1h +#ifndef __AARCH64EB__ ext $h1b, $h1b, $h1b, #8 +#endif aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 - ldr $rk8q, [$cc, #128] @ load rk8 + ld1 {$rk4s}, [$cc], #16 @ load rk4 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 - ldr $rk3q, [$cc, #48] @ load rk3 + ld1 {$rk5s}, [$cc], #16 @ load rk5 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 - ldr $rk6q, [$cc, #96] @ load rk6 + ld1 {$rk6s}, [$cc], #16 @ load rk6 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 - ldr $rk7q, [$cc, #112] @ load rk7 + ld1 {$rk7s}, [$cc], #16 @ load rk7 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 - ldr $rk5q, [$cc, #80] @ load rk5 + ld1 {$rk8s}, [$cc], #16 @ load rk8 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 ldr $h2q, [$current_tag, #64] @ load h2l | h2h +#ifndef __AARCH64EB__ ext $h2b, 
$h2b, $h2b, #8 +#endif aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 @@ -360,7 +374,7 @@ aes_gcm_enc_128_kernel: aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 - ldr $rk4q, [$cc, #64] @ load rk4 + ld1 {$rk9s}, [$cc], #16 @ load rk9 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 @@ -422,13 +436,25 @@ aes_gcm_enc_128_kernel: b.ge .L128_enc_tail @ handle tail ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l2, $input_l2 + rev $input_h2, $input_h2 +#endif ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l1, $input_l1 + rev $input_h1, $input_h1 +#endif ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l3, $input_l3 + rev $input_h3, $input_h3 +#endif eor $input_l0, $input_l0, $rk10_l @ AES block 0 - round 10 low eor $input_h0, $input_h0, $rk10_h @ AES block 0 - round 10 high @@ -493,6 +519,10 @@ aes_gcm_enc_128_kernel: .L128_enc_main_loop: @ main loop start ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l3, $input_l3 + rev $input_h3, $input_h3 +#endif rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) @@ -521,7 +551,10 @@ aes_gcm_enc_128_kernel: pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - 
round 1 rev $ctr32w, $rctr32w @ CTR block 4k+8 @@ -591,13 +624,19 @@ aes_gcm_enc_128_kernel: aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l1, $input_l1 + rev $input_h1, $input_h1 +#endif aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l2, $input_l2 + rev $input_h2, $input_h2 +#endif pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low @@ -854,7 +893,10 @@ aes_gcm_enc_128_kernel: sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif cmp $main_end_input_ptr, #48 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag @@ -892,7 +934,10 @@ aes_gcm_enc_128_kernel: st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high - +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif rev64 $res0b, $res1b @ GHASH final-3 block eor $res0b, $res0b, $t0.16b @ feed in partial tag @@ -921,7 +966,10 @@ aes_gcm_enc_128_kernel: rev64 $res0b, $res1b @ GHASH final-2 block ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high - +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif eor $res0b, $res0b, $t0.16b @ feed in partial tag eor $input_l0, $input_l0, $rk10_l @ AES final-1 block - round 10 low @@ -955,7 
+1003,10 @@ aes_gcm_enc_128_kernel: rev64 $res0b, $res1b @ GHASH final-1 block ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high - +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif eor $res0b, $res0b, $t0.16b @ feed in partial tag eor $input_h0, $input_h0, $rk10_h @ AES final block - round 10 high @@ -1018,9 +1069,11 @@ aes_gcm_enc_128_kernel: ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid - +#ifndef __AARCH64EB__ rev $ctr32w, $rctr32w - +#else + mov $ctr32w, $rctr32w +#endif pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid @@ -1105,20 +1158,29 @@ aes_gcm_dec_128_kernel: lsr $main_end_input_ptr, $bit_length, #3 @ byte_len mov $len, $main_end_input_ptr ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 - +#ifdef __AARCH64EB__ + rev $ctr96_b64x, $ctr96_b64x + rev $ctr96_t32x, $ctr96_t32x +#endif + ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10 +#ifdef __AARCH64EB__ + ror $rk10_h, $rk10_h, 32 + ror $rk10_l, $rk10_l, 32 +#endif sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 - ldr $rk0q, [$cc, #0] @ load rk0 + ld1 {$rk0s}, [$cc], #16 @ load rk0 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible ldr $h2q, [$current_tag, #64] @ load h2l | h2h +#ifndef __AARCH64EB__ ext $h2b, $h2b, $h2b, #8 - +#endif lsr $rctr32x, $ctr96_t32x, #32 fmov $ctr2d, $ctr96_b64x @ CTR block 2 - ldr $rk1q, [$cc, #16] @ load rk1 + ld1 {$rk1s}, [$cc], #16 @ load rk1 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w rev $rctr32w, $rctr32w @ rev_ctr32 @@ -1129,7 +1191,7 @@ aes_gcm_dec_128_kernel: rev $ctr32w, 
$rctr32w @ CTR block 1 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 - ldr $rk2q, [$cc, #32] @ load rk2 + ld1 {$rk2s}, [$cc], #16 @ load rk2 add $rctr32w, $rctr32w, #1 @ CTR block 1 fmov $ctr1.d[1], $ctr32x @ CTR block 1 @@ -1150,23 +1212,22 @@ aes_gcm_dec_128_kernel: add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 - ldr $rk3q, [$cc, #48] @ load rk3 + ld1 {$rk3s}, [$cc], #16 @ load rk3 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 - ldr $rk6q, [$cc, #96] @ load rk6 + ld1 {$rk4s}, [$cc], #16 @ load rk4 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 - ldr $rk7q, [$cc, #112] @ load rk7 + ld1 {$rk5s}, [$cc], #16 @ load rk5 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 - ldr $rk4q, [$cc, #64] @ load rk4 + ld1 {$rk6s}, [$cc], #16 @ load rk6 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 - ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 ld1 { $acc_lb}, [$current_tag] @@ -1174,14 +1235,14 @@ aes_gcm_dec_128_kernel: rev64 $acc_lb, $acc_lb aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 - ldr $rk5q, [$cc, #80] @ load rk5 + ld1 {$rk7s}, [$cc], #16 @ load rk7 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 - ldr $rk9q, [$cc, #144] @ load rk9 + ld1 {$rk8s}, [$cc], #16 @ load rk8 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 @@ -1189,10 +1250,11 @@ aes_gcm_dec_128_kernel: aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 ldr $h3q, [$current_tag, #80] @ load h3l | h3h +#ifndef __AARCH64EB__ ext $h3b, $h3b, $h3b, #8 
- +#endif aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 - ldr $rk8q, [$cc, #128] @ load rk8 + ld1 {$rk9s}, [$cc], #16 @ load rk9 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 @@ -1204,8 +1266,9 @@ aes_gcm_dec_128_kernel: aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 ldr $h1q, [$current_tag, #32] @ load h1l | h1h +#ifndef __AARCH64EB__ ext $h1b, $h1b, $h1b, #8 - +#endif aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 @@ -1218,7 +1281,9 @@ aes_gcm_dec_128_kernel: trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h ldr $h4q, [$current_tag, #112] @ load h4l | h4h +#ifndef __AARCH64EB__ ext $h4b, $h4b, $h4b, #8 +#endif trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l add $main_end_input_ptr, $main_end_input_ptr, $input_ptr @@ -1252,12 +1317,10 @@ aes_gcm_dec_128_kernel: eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k b.ge .L128_dec_tail @ handle tail - ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext - - ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext + ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0 - load ciphertext; AES block 1 - load ciphertext eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result - ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext + ld1 {$res2b}, [$input_ptr], #16 @ AES block 2 - load ciphertext eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result rev64 $res0b, $res0b @ GHASH block 0 @@ -1265,10 +1328,9 @@ aes_gcm_dec_128_kernel: orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 add $rctr32w, $rctr32w, #1 @ CTR block 4 - ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext + ld1 {$res3b}, [$input_ptr], #16 @ AES block 3 - load ciphertext rev64 $res1b, $res1b @ GHASH block 1 - add $input_ptr, $input_ptr, #64 @ AES input_ptr update mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high @@ -1283,7 +1345,9 @@ 
aes_gcm_dec_128_kernel: fmov $ctr0.d[1], $ctr32x @ CTR block 4 rev $ctr32w, $rctr32w @ CTR block 5 eor $output_l1, $output_l1, $rk10_l @ AES block 1 - round 10 low - +#ifdef __AARCH64EB__ + rev $output_l1, $output_l1 +#endif fmov $ctr1d, $ctr96_b64x @ CTR block 5 add $rctr32w, $rctr32w, #1 @ CTR block 5 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 @@ -1295,10 +1359,19 @@ aes_gcm_dec_128_kernel: orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 eor $output_h1, $output_h1, $rk10_h @ AES block 1 - round 10 high +#ifdef __AARCH64EB__ + rev $output_h1, $output_h1 +#endif eor $output_l0, $output_l0, $rk10_l @ AES block 0 - round 10 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result eor $output_h0, $output_h0, $rk10_h @ AES block 0 - round 10 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result @@ -1358,9 +1431,14 @@ aes_gcm_dec_128_kernel: aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low - +#ifdef __AARCH64EB__ + rev $output_l3, $output_l3 +#endif pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high +#ifdef __AARCH64EB__ + rev $output_h2, $output_h2 +#endif mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 @@ -1391,13 +1469,17 @@ aes_gcm_dec_128_kernel: pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high - +#ifdef __AARCH64EB__ + rev $output_h3, $output_h3 +#endif aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 
4k+5 - round 5 eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low - +#ifdef __AARCH64EB__ + rev $output_l2, $output_l2 +#endif aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 movi $mod_constant.8b, #0xc2 @@ -1414,7 +1496,7 @@ aes_gcm_dec_128_kernel: pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high - ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext + ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+3 - load ciphertext aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 @@ -1435,7 +1517,7 @@ aes_gcm_dec_128_kernel: rev $ctr32w, $rctr32w @ CTR block 4k+8 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid - ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext + ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment aese $ctr0b, $rk9 @ AES block 4k+4 - round 9 @@ -1450,17 +1532,16 @@ aes_gcm_dec_128_kernel: eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 - ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext + ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 - ldr $res3q, [$input_ptr, #48] @ AES block 4k+3 - load ciphertext + ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 - add $input_ptr, $input_ptr, #64 @ AES input_ptr update rev64 $res1b, $res1b @ GHASH block 4k+5 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid @@ -1482,11 +1563,15 @@ aes_gcm_dec_128_kernel: aese 
$ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high - +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low - +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 @@ -1503,9 +1588,15 @@ aes_gcm_dec_128_kernel: add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 eor $output_h1, $output_h1, $rk10_h @ AES block 4k+5 - round 10 high +#ifdef __AARCH64EB__ + rev $output_h1, $output_h1 +#endif stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result eor $output_l1, $output_l1, $rk10_l @ AES block 4k+5 - round 10 low +#ifdef __AARCH64EB__ + rev $output_l1, $output_l1 +#endif stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 @@ -1598,9 +1689,14 @@ aes_gcm_dec_128_kernel: aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low - +#ifdef __AARCH64EB__ + rev $output_l3, $output_l3 +#endif pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low +#ifdef __AARCH64EB__ + rev $output_l2, $output_l2 +#endif eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 @@ -1654,7 +1750,9 @@ aes_gcm_dec_128_kernel: pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high - +#ifdef __AARCH64EB__ + rev $output_h3, $output_h3 +#endif aese $ctr2b, $rk7 \n 
aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment @@ -1667,7 +1765,9 @@ aes_gcm_dec_128_kernel: aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high - +#ifdef __AARCH64EB__ + rev $output_h2, $output_h2 +#endif aese $ctr0b, $rk9 @ AES block 4k+4 - round 9 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result @@ -1691,9 +1791,14 @@ aes_gcm_dec_128_kernel: cmp $main_end_input_ptr, #48 eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high - +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif b.gt .L128_dec_blocks_more_than_3 mov $ctr3b, $ctr2b @@ -1737,9 +1842,14 @@ aes_gcm_dec_128_kernel: movi $t0.8b, #0 @ suppress further partial tag feed in eor $output_h0, $output_h0, $rk10_h @ AES final-2 block - round 10 high - +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid eor $output_l0, $output_l0, $rk10_l @ AES final-2 block - round 10 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif .L128_dec_blocks_more_than_2: @ blocks left > 2 rev64 $res0b, $res1b @ GHASH final-2 block @@ -1765,12 +1875,18 @@ aes_gcm_dec_128_kernel: pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid eor $output_l0, $output_l0, $rk10_l @ AES final-1 block - round 10 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid eor $output_h0, $output_h0, $rk10_h @ AES final-1 block - round 10 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif 
.L128_dec_blocks_more_than_1: @ blocks left > 1 rev64 $res0b, $res1b @ GHASH final-1 block @@ -1801,8 +1917,13 @@ aes_gcm_dec_128_kernel: eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high eor $output_h0, $output_h0, $rk10_h @ AES final block - round 10 high - +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif eor $output_l0, $output_l0, $rk10_l @ AES final block - round 10 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid .L128_dec_blocks_less_than_1: @ blocks left <= 1 @@ -1848,7 +1969,11 @@ aes_gcm_dec_128_kernel: bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes and $output_l0, $output_l0, $ctr32x +#ifndef __AARCH64EB__ rev $ctr32w, $rctr32w +#else + mov $ctr32w, $rctr32w +#endif eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid movi $mod_constant.8b, #0xc2 @@ -1957,6 +2082,7 @@ my $mod_t="v31"; my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11)=map("v$_.16b",(18..29)); my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q)=map("q$_",(18..29)); +my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s)=map("v$_.4s",(18..29)); my $rk2q1="v20.1q"; my $rk3q1="v21.1q"; my $rk4v="v22"; @@ -1988,18 +2114,26 @@ aes_gcm_enc_192_kernel: stp d14, d15, [sp, #96] ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 +#ifdef __AARCH64EB__ + rev $ctr96_b64x, $ctr96_b64x + rev $ctr96_t32x, $ctr96_t32x +#endif + ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12 +#ifdef __AARCH64EB__ + ror $rk12_l, $rk12_l, #32 + ror $rk12_h, $rk12_h, #32 +#endif + ld1 {$rk0s}, [$cc], #16 @ load rk0 - ldr $rk5q, [$cc, #80] @ load rk5 - - ldr $rk4q, [$cc, #64] @ load rk4 + ld1 {$rk1s}, [$cc], #16 @ load rk1 - ldr $rk8q, [$cc, #128] @ load rk8 + ld1 {$rk2s}, [$cc], #16 @ load rk2 lsr $rctr32x, $ctr96_t32x, #32 - ldr $rk6q, [$cc, #96] @ load rk6 + ld1 {$rk3s}, [$cc], #16 @ load rk3 orr $ctr96_t32w, $ctr96_t32w, 
$ctr96_t32w - ldr $rk7q, [$cc, #112] @ load rk7 + ld1 {$rk4s}, [$cc], #16 @ load rk4 rev $rctr32w, $rctr32w @ rev_ctr32 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 @@ -2023,15 +2157,13 @@ aes_gcm_enc_192_kernel: rev $ctr32w, $rctr32w @ CTR block 3 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 - ldr $rk0q, [$cc, #0] @ load rk0 + ld1 {$rk5s}, [$cc], #16 @ load rk5 fmov $ctr3.d[1], $ctr32x @ CTR block 3 - ldr $rk3q, [$cc, #48] @ load rk3 - - ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12 + ld1 {$rk6s}, [$cc], #16 @ load rk6 - ldr $rk1q, [$cc, #16] @ load rk1 + ld1 {$rk7s}, [$cc], #16 @ load rk7 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 ld1 { $acc_lb}, [$current_tag] @@ -2039,29 +2171,32 @@ aes_gcm_enc_192_kernel: rev64 $acc_lb, $acc_lb aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 - ldr $rk11q, [$cc, #176] @ load rk11 + ld1 {$rk8s}, [$cc], #16 @ load rk8 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ldr $h4q, [$current_tag, #112] @ load h4l | h4h +#ifndef __AARCH64EB__ ext $h4b, $h4b, $h4b, #8 - +#endif aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 - ldr $rk2q, [$cc, #32] @ load rk2 + ld1 {$rk9s}, [$cc], #16 @ load rk9 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 - ldr $rk10q, [$cc, #160] @ load rk10 + ld1 {$rk10s}, [$cc], #16 @ load rk10 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 ldr $h1q, [$current_tag, #32] @ load h1l | h1h +#ifndef __AARCH64EB__ ext $h1b, $h1b, $h1b, #8 - +#endif aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 - ldr $rk9q, [$cc, #144] @ load rk9 + ld1 {$rk11s}, [$cc], #16 @ load rk11 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 ldr $h3q, [$current_tag, #80] @ load h3l | h3h +#ifndef __AARCH64EB__ ext $h3b, $h3b, $h3b, #8 - +#endif aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 @@ 
-2100,8 +2235,9 @@ aes_gcm_enc_192_kernel: aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 ldr $h2q, [$current_tag, #64] @ load h2l | h2h +#ifndef __AARCH64EB__ ext $h2b, $h2b, $h2b, #8 - +#endif aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 @@ -2163,13 +2299,26 @@ aes_gcm_enc_192_kernel: rev $ctr32w, $rctr32w @ CTR block 4 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l2, $input_l2 + rev $input_h2, $input_h2 +#endif ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l3, $input_l3 + rev $input_h3, $input_h3 +#endif ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l1, $input_l1 + rev $input_h1, $input_h1 +#endif add $input_ptr, $input_ptr, #64 @ AES input_ptr update cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks @@ -2239,7 +2388,10 @@ aes_gcm_enc_192_kernel: aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l1, $input_l1 + rev $input_h1, $input_h1 +#endif ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) @@ -2250,10 +2402,16 @@ aes_gcm_enc_192_kernel: pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l2, $input_l2 + rev $input_h2, $input_h2 
+#endif aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l3, $input_l3 + rev $input_h3, $input_h3 +#endif pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low eor $res0b, $res0b, $acc_lb @ PRE 1 @@ -2330,7 +2488,10 @@ aes_gcm_enc_192_kernel: aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low @@ -2627,7 +2788,10 @@ aes_gcm_enc_192_kernel: sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high @@ -2664,7 +2828,10 @@ aes_gcm_enc_192_kernel: st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high - +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif rev64 $res0b, $res1b @ GHASH final-3 block eor $input_l0, $input_l0, $rk12_l @ AES final-2 block - round 12 low @@ -2695,7 +2862,10 @@ aes_gcm_enc_192_kernel: rev64 $res0b, $res1b @ GHASH final-2 block ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high - +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif eor $res0b, $res0b, $t0.16b @ feed in partial tag eor $input_h0, $input_h0, $rk12_h @ AES final-1 block - round 12 high @@ -2726,7 +2896,10 @@ 
aes_gcm_enc_192_kernel: st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high - +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif rev64 $res0b, $res1b @ GHASH final-1 block eor $input_l0, $input_l0, $rk12_l @ AES final block - round 12 low @@ -2758,7 +2931,11 @@ aes_gcm_enc_192_kernel: .L192_enc_blocks_less_than_1: @ blocks left <= 1 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored +#ifndef __AARCH64EB__ rev $ctr32w, $rctr32w +#else + mov $ctr32w, $rctr32w +#endif and $bit_length, $bit_length, #127 @ bit_length %= 128 sub $bit_length, $bit_length, #128 @ bit_length -= 128 @@ -2875,14 +3052,22 @@ aes_gcm_dec_192_kernel: add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 - +#ifdef __AARCH64EB__ + rev $ctr96_b64x, $ctr96_b64x + rev $ctr96_t32x, $ctr96_t32x +#endif + ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12 +#ifdef __AARCH64EB__ + ror $rk12_l, $rk12_l, #32 + ror $rk12_h, $rk12_h, #32 +#endif ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible - ldr $rk0q, [$cc, #0] @ load rk0 + ld1 {$rk0s}, [$cc], #16 @ load rk0 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len mov $len, $main_end_input_ptr - ldr $rk2q, [$cc, #32] @ load rk2 + ld1 {$rk1s}, [$cc], #16 @ load rk1 lsr $rctr32x, $ctr96_t32x, #32 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w @@ -2892,14 +3077,14 @@ aes_gcm_dec_192_kernel: fmov $ctr1d, $ctr96_b64x @ CTR block 1 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 - ldr $rk1q, [$cc, #16] @ load rk1 + ld1 {$rk2s}, [$cc], #16 @ load rk2 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 rev $ctr32w, $rctr32w @ CTR block 1 add $rctr32w, $rctr32w, #1 @ CTR block 1 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR 
block 1 - ldr $rk3q, [$cc, #48] @ load rk3 + ld1 {$rk3s}, [$cc], #16 @ load rk3 fmov $ctr1.d[1], $ctr32x @ CTR block 1 rev $ctr32w, $rctr32w @ CTR block 2 @@ -2916,43 +3101,46 @@ aes_gcm_dec_192_kernel: fmov $ctr3.d[1], $ctr32x @ CTR block 3 - ldr $rk8q, [$cc, #128] @ load rk8 + ld1 {$rk4s}, [$cc], #16 @ load rk4 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 - ldr $rk11q, [$cc, #176] @ load rk11 + ld1 {$rk5s}, [$cc], #16 @ load rk5 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ldr $h4q, [$current_tag, #112] @ load h4l | h4h +#ifndef __AARCH64EB__ ext $h4b, $h4b, $h4b, #8 - +#endif aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 ldr $h2q, [$current_tag, #64] @ load h2l | h2h +#ifndef __AARCH64EB__ ext $h2b, $h2b, $h2b, #8 - +#endif aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 ldr $h3q, [$current_tag, #80] @ load h3l | h3h +#ifndef __AARCH64EB__ ext $h3b, $h3b, $h3b, #8 - +#endif aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 - ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 ldr $h1q, [$current_tag, #32] @ load h1l | h1h +#ifndef __AARCH64EB__ ext $h1b, $h1b, $h1b, #8 - +#endif aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 - ldr $rk10q, [$cc, #160] @ load rk10 + ld1 {$rk6s}, [$cc], #16 @ load rk6 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 - ldr $rk9q, [$cc, #144] @ load rk9 + ld1 {$rk7s}, [$cc], #16 @ load rk7 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 - ldr $rk7q, [$cc, #112] @ load rk7 + ld1 {$rk8s}, [$cc], #16 @ load rk8 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 - ldr $rk4q, [$cc, #64] @ load rk4 + ld1 {$rk9s}, [$cc], #16 @ load rk9 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 ld1 { $acc_lb}, [$current_tag] @@ -2966,7 +3154,7 @@ 
aes_gcm_dec_192_kernel: trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 - ldr $rk5q, [$cc, #80] @ load rk5 + ld1 {$rk10s}, [$cc], #16 @ load rk10 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l @@ -2977,7 +3165,7 @@ aes_gcm_dec_192_kernel: trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 - ldr $rk6q, [$cc, #96] @ load rk6 + ld1 {$rk11s}, [$cc], #16 @ load rk11 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 @@ -3041,17 +3229,13 @@ aes_gcm_dec_192_kernel: aese $ctr0b, $rk11 @ AES block 0 - round 11 b.ge .L192_dec_tail @ handle tail - ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext - - ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext + ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0,1 - load ciphertext eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result rev $ctr32w, $rctr32w @ CTR block 4 - ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext - - ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext + ld1 {$res2b, $res3b}, [$input_ptr], #32 @ AES block 2,3 - load ciphertext mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low @@ -3063,27 +3247,35 @@ aes_gcm_dec_192_kernel: mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high rev64 $res0b, $res0b @ GHASH block 0 - add $input_ptr, $input_ptr, #64 @ AES input_ptr update fmov $ctr0d, $ctr96_b64x @ CTR block 4 rev64 $res1b, $res1b @ GHASH block 1 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks eor $output_l1, $output_l1, $rk12_l @ AES block 1 - round 12 low +#ifdef __AARCH64EB__ + rev $output_l1, $output_l1 +#endif fmov $ctr0.d[1], $ctr32x @ CTR block 4 rev $ctr32w, $rctr32w @ CTR block 5 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 fmov $ctr1d, $ctr96_b64x @ CTR block 5 eor $output_h1, $output_h1, 
$rk12_h @ AES block 1 - round 12 high - +#ifdef __AARCH64EB__ + rev $output_h1, $output_h1 +#endif add $rctr32w, $rctr32w, #1 @ CTR block 5 fmov $ctr1.d[1], $ctr32x @ CTR block 5 eor $output_l0, $output_l0, $rk12_l @ AES block 0 - round 12 low - +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif rev $ctr32w, $rctr32w @ CTR block 6 eor $output_h0, $output_h0, $rk12_h @ AES block 0 - round 12 high - +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 @@ -3138,7 +3330,9 @@ aes_gcm_dec_192_kernel: aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high - +#ifdef __AARCH64EB__ + rev $output_h2, $output_h2 +#endif aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid @@ -3152,7 +3346,9 @@ aes_gcm_dec_192_kernel: pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low - +#ifdef __AARCH64EB__ + rev $output_l2, $output_l2 +#endif aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 @@ -3230,15 +3426,17 @@ aes_gcm_dec_192_kernel: aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 - ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext + ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid - ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext + ld1 
{$res1b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low - +#ifdef __AARCH64EB__ + rev $output_l3, $output_l3 +#endif aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment @@ -3249,10 +3447,10 @@ aes_gcm_dec_192_kernel: eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 - ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext + ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext aese $ctr1b, $rk11 @ AES block 4k+5 - round 11 - ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext + ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+7 - load ciphertext rev $ctr32w, $rctr32w @ CTR block 4k+8 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 @@ -3261,11 +3459,13 @@ aes_gcm_dec_192_kernel: aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid - add $input_ptr, $input_ptr, #64 @ AES input_ptr update cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high +#ifdef __AARCH64EB__ + rev $output_h3, $output_h3 +#endif eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 @@ -3295,18 +3495,28 @@ aes_gcm_dec_192_kernel: rev $ctr32w, $rctr32w @ CTR block 4k+9 eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 eor $output_l1, $output_l1, $rk12_l @ AES block 4k+5 - round 
12 low - +#ifdef __AARCH64EB__ + rev $output_l1, $output_l1 +#endif fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 rev $ctr32w, $rctr32w @ CTR block 4k+10 eor $output_h1, $output_h1, $rk12_h @ AES block 4k+5 - round 12 high - +#ifdef __AARCH64EB__ + rev $output_h1, $output_h1 +#endif eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low @@ -3355,17 +3565,28 @@ aes_gcm_dec_192_kernel: pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high +#ifdef __AARCH64EB__ + rev $output_h3, $output_h3 +#endif fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low - +#ifdef __AARCH64EB__ + rev $output_l2, $output_l2 +#endif pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high +#ifdef __AARCH64EB__ + rev $output_h2, $output_h2 +#endif eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low +#ifdef __AARCH64EB__ + rev $output_l3, $output_l3 +#endif stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result rev64 $res3b, $res3b @ GHASH block 4k+3 @@ -3517,8 +3738,13 @@ aes_gcm_dec_192_kernel: cmp $main_end_input_ptr, #48 eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high - +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif b.gt .L192_dec_blocks_more_than_3 movi $acc_l.8b, #0 @@ -3562,10 +3788,16 @@ aes_gcm_dec_192_kernel: pmull2 
$acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high eor $output_l0, $output_l0, $rk12_l @ AES final-2 block - round 12 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif movi $t0.8b, #0 @ suppress further partial tag feed in pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid eor $output_h0, $output_h0, $rk12_h @ AES final-2 block - round 12 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif .L192_dec_blocks_more_than_2: @ blocks left > 2 rev64 $res0b, $res1b @ GHASH final-2 block @@ -3595,8 +3827,13 @@ aes_gcm_dec_192_kernel: eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high eor $output_h0, $output_h0, $rk12_h @ AES final-1 block - round 12 high - +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif eor $output_l0, $output_l0, $rk12_l @ AES final-1 block - round 12 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid .L192_dec_blocks_more_than_1: @ blocks left > 1 @@ -3627,9 +3864,13 @@ aes_gcm_dec_192_kernel: movi $t0.8b, #0 @ suppress further partial tag feed in eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low eor $output_h0, $output_h0, $rk12_h @ AES final block - round 12 high - +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif eor $output_l0, $output_l0, $rk12_l @ AES final block - round 12 low - +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid .L192_dec_blocks_less_than_1: @ blocks left <= 1 @@ -3656,8 +3897,11 @@ aes_gcm_dec_192_kernel: orr $output_l0, $output_l0, $end_input_ptr mov $ctr0.d[1], $ctr96_b64x - +#ifndef __AARCH64EB__ rev $ctr32w, $rctr32w +#else + mov $ctr32w, $rctr32w +#endif and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits str $ctr32w, [$counter, #12] @ store the updated counter @@ -3782,6 +4026,7 @@ my $mod_constant="v8"; my $mod_t="v7"; my 
($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map("v$_.16b",(18..31)); +my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s,$rk12s,$rk13s)=map("v$_.4s",(18..31)); my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map("q$_",(18..31)); my $rk2q1="v20.1q"; my $rk3q1="v21.1q"; @@ -3817,14 +4062,22 @@ aes_gcm_enc_256_kernel: lsr $main_end_input_ptr, $bit_length, #3 @ byte_len mov $len, $main_end_input_ptr ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 - +#ifdef __AARCH64EB__ + rev $ctr96_b64x, $ctr96_b64x + rev $ctr96_t32x, $ctr96_t32x +#endif + ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14 +#ifdef __AARCH64EB__ + ror $rk14_l, $rk14_l, #32 + ror $rk14_h, $rk14_h, #32 +#endif ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 - ldr $rk0q, [$cc, #0] @ load rk0 + ld1 {$rk0s}, [$cc], #16 @ load rk0 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) - ldr $rk7q, [$cc, #112] @ load rk7 + ld1 {$rk1s}, [$cc], #16 @ load rk1 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr lsr $rctr32x, $ctr96_t32x, #32 @@ -3843,14 +4096,14 @@ aes_gcm_enc_256_kernel: orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 add $rctr32w, $rctr32w, #1 @ CTR block 1 - ldr $rk1q, [$cc, #16] @ load rk1 + ld1 {$rk2s}, [$cc], #16 @ load rk2 fmov $ctr1.d[1], $ctr32x @ CTR block 1 rev $ctr32w, $rctr32w @ CTR block 2 add $rctr32w, $rctr32w, #1 @ CTR block 2 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 - ldr $rk2q, [$cc, #32] @ load rk2 + ld1 {$rk3s}, [$cc], #16 @ load rk3 fmov $ctr2.d[1], $ctr32x @ CTR block 2 rev $ctr32w, $rctr32w @ CTR block 3 @@ -3861,46 +4114,48 @@ aes_gcm_enc_256_kernel: fmov $ctr3.d[1], $ctr32x @ CTR block 3 aese 
$ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 - ldr $rk3q, [$cc, #48] @ load rk3 + ld1 {$rk4s}, [$cc], #16 @ load rk4 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 - ldr $rk6q, [$cc, #96] @ load rk6 + ld1 {$rk5s}, [$cc], #16 @ load rk5 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 - ldr $rk5q, [$cc, #80] @ load rk5 + ld1 {$rk6s}, [$cc], #16 @ load rk6 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 ldr $h3q, [$current_tag, #80] @ load h3l | h3h +#ifndef __AARCH64EB__ ext $h3b, $h3b, $h3b, #8 - +#endif aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 - ldr $rk13q, [$cc, #208] @ load rk13 + ld1 {$rk7s}, [$cc], #16 @ load rk7 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 - ldr $rk4q, [$cc, #64] @ load rk4 + ld1 {$rk8s}, [$cc], #16 @ load rk8 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 ldr $h2q, [$current_tag, #64] @ load h2l | h2h +#ifndef __AARCH64EB__ ext $h2b, $h2b, $h2b, #8 - +#endif aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 - ldr $rk12q, [$cc, #192] @ load rk12 + ld1 {$rk9s}, [$cc], #16 @ load rk9 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 ldr $h4q, [$current_tag, #112] @ load h4l | h4h +#ifndef __AARCH64EB__ ext $h4b, $h4b, $h4b, #8 - +#endif aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 - ldr $rk11q, [$cc, #176] @ load rk11 + ld1 {$rk10s}, [$cc], #16 @ load rk10 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 - ldr $rk8q, [$cc, #128] @ load rk8 + ld1 {$rk11s}, [$cc], #16 @ load rk11 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 add $rctr32w, $rctr32w, #1 @ CTR block 3 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 - ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 ld1 { $acc_lb}, [$current_tag] @@ -3927,14 +4182,15 @@ aes_gcm_enc_256_kernel: trn2 
$h34k.2d, $h3.2d, $h4.2d @ h4l | h3l aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 - ldr $rk9q, [$cc, #144] @ load rk9 + ld1 {$rk12s}, [$cc], #16 @ load rk12 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 ldr $h1q, [$current_tag, #32] @ load h1l | h1h +#ifndef __AARCH64EB__ ext $h1b, $h1b, $h1b, #8 - +#endif aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 - ldr $rk10q, [$cc, #160] @ load rk10 + ld1 {$rk13s}, [$cc], #16 @ load rk13 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h @@ -3999,13 +4255,26 @@ aes_gcm_enc_256_kernel: b.ge .L256_enc_tail @ handle tail ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l1, $input_l1 + rev $input_h1, $input_h1 +#endif rev $ctr32w, $rctr32w @ CTR block 4 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l3, $input_l3 + rev $input_h3, $input_h3 +#endif ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext +#ifdef __AARCH64EB__ + rev $input_l2, $input_l2 + rev $input_h2, $input_h2 +#endif add $input_ptr, $input_ptr, #64 @ AES input_ptr update eor $input_l1, $input_l1, $rk14_l @ AES block 1 - round 14 low @@ -4083,10 +4352,16 @@ aes_gcm_enc_256_kernel: aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+7 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l3, $input_l3 + rev $input_h3, $input_h3 +#endif aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l2, $input_l2 + rev $input_h2, $input_h2 +#endif aese 
$ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 eor $res0b, $res0b, $acc_lb @ PRE 1 @@ -4172,7 +4447,10 @@ aes_gcm_enc_256_kernel: aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l1, $input_l1 + rev $input_h1, $input_h1 +#endif aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid @@ -4202,7 +4480,10 @@ aes_gcm_enc_256_kernel: aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 shl $mod_constantd, $mod_constantd, #56 @ mod_constant @@ -4497,7 +4778,10 @@ aes_gcm_enc_256_kernel: ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext - +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high @@ -4532,7 +4816,10 @@ aes_gcm_enc_256_kernel: st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high - +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif rev64 $res0b, $res1b @ GHASH final-3 block eor $input_l0, $input_l0, $rk14_l @ AES final-2 block - round 14 low @@ -4561,7 +4848,10 @@ aes_gcm_enc_256_kernel: st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - 
load input low & high - +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif rev64 $res0b, $res1b @ GHASH final-2 block eor $input_l0, $input_l0, $rk14_l @ AES final-1 block - round 14 low @@ -4597,7 +4887,10 @@ aes_gcm_enc_256_kernel: rev64 $res0b, $res1b @ GHASH final-1 block ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high - +#ifdef __AARCH64EB__ + rev $input_l0, $input_l0 + rev $input_h0, $input_h0 +#endif eor $res0b, $res0b, $t0.16b @ feed in partial tag movi $t0.8b, #0 @ suppress further partial tag feed in @@ -4658,7 +4951,11 @@ aes_gcm_enc_256_kernel: pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high mov $t0d, $res0.d[1] @ GHASH final block - mid +#ifndef __AARCH64EB__ rev $ctr32w, $rctr32w +#else + mov $ctr32w, $rctr32w +#endif pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low @@ -4749,21 +5046,29 @@ aes_gcm_dec_256_kernel: lsr $main_end_input_ptr, $bit_length, #3 @ byte_len mov $len, $main_end_input_ptr ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 - - ldr $rk8q, [$cc, #128] @ load rk8 +#ifdef __AARCH64EB__ + rev $ctr96_b64x, $ctr96_b64x + rev $ctr96_t32x, $ctr96_t32x +#endif + ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14 +#ifdef __AARCH64EB__ + ror $rk14_h, $rk14_h, #32 + ror $rk14_l, $rk14_l, #32 +#endif + ld1 {$rk0s}, [$cc], #16 @ load rk0 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 - ldr $rk7q, [$cc, #112] @ load rk7 + ld1 {$rk1s}, [$cc], #16 @ load rk1 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr - ldr $rk6q, [$cc, #96] @ load rk6 + ld1 {$rk2s}, [$cc], #16 @ load rk2 lsr $rctr32x, $ctr96_t32x, #32 - ldr $rk5q, [$cc, #80] @ load rk5 + ld1 {$rk3s}, [$cc], #16 @ load rk3 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w - ldr $rk3q, [$cc, #48] @ load rk3 + ld1 
{$rk4s}, [$cc], #16 @ load rk4 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr rev $rctr32w, $rctr32w @ rev_ctr32 @@ -4788,34 +5093,39 @@ aes_gcm_dec_256_kernel: rev $ctr32w, $rctr32w @ CTR block 3 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 - ldr $rk0q, [$cc, #0] @ load rk0 + ld1 {$rk5s}, [$cc], #16 @ load rk5 fmov $ctr3.d[1], $ctr32x @ CTR block 3 add $rctr32w, $rctr32w, #1 @ CTR block 3 - ldr $rk4q, [$cc, #64] @ load rk4 + ld1 {$rk6s}, [$cc], #16 @ load rk6 - ldr $rk13q, [$cc, #208] @ load rk13 + ld1 {$rk7s}, [$cc], #16 @ load rk7 - ldr $rk1q, [$cc, #16] @ load rk1 + ld1 {$rk8s}, [$cc], #16 @ load rk8 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 ldr $h3q, [$current_tag, #80] @ load h3l | h3h +#ifndef __AARCH64EB__ ext $h3b, $h3b, $h3b, #8 +#endif aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 ldr $h4q, [$current_tag, #112] @ load h4l | h4h +#ifndef __AARCH64EB__ ext $h4b, $h4b, $h4b, #8 +#endif aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ldr $h2q, [$current_tag, #64] @ load h2l | h2h +#ifndef __AARCH64EB__ ext $h2b, $h2b, $h2b, #8 +#endif aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 - ldr $rk2q, [$cc, #32] @ load rk2 + ld1 {$rk9s}, [$cc], #16 @ load rk9 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 - ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 ld1 { $acc_lb}, [$current_tag] @@ -4823,17 +5133,18 @@ aes_gcm_dec_256_kernel: rev64 $acc_lb, $acc_lb aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 - ldr $rk9q, [$cc, #144] @ load rk9 + ld1 {$rk10s}, [$cc], #16 @ load rk10 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 - ldr $rk12q, [$cc, #192] @ load rk12 + ld1 {$rk11s}, [$cc], #16 @ load rk11 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 ldr $h1q, [$current_tag, #32] @ load h1l | h1h +#ifndef __AARCH64EB__ ext $h1b, $h1b, 
$h1b, #8 - +#endif aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 - ldr $rk10q, [$cc, #160] @ load rk10 + ld1 {$rk12s}, [$cc], #16 @ load rk12 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 @@ -4889,7 +5200,7 @@ aes_gcm_dec_256_kernel: aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 - ldr $rk11q, [$cc, #176] @ load rk11 + ld1 {$rk13s}, [$cc], #16 @ load rk13 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 @@ -4939,9 +5250,7 @@ aes_gcm_dec_256_kernel: aese $ctr0b, $rk13 @ AES block 0 - round 13 b.ge .L256_dec_tail @ handle tail - ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext - - ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext + ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0,1 - load ciphertext rev $ctr32w, $rctr32w @ CTR block 4 @@ -4949,7 +5258,7 @@ aes_gcm_dec_256_kernel: eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result rev64 $res1b, $res1b @ GHASH block 1 - ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext + ld1 {$res2b}, [$input_ptr], #16 @ AES block 2 - load ciphertext mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high @@ -4969,22 +5278,32 @@ aes_gcm_dec_256_kernel: orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high eor $output_h0, $output_h0, $rk14_h @ AES block 0 - round 14 high - +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif eor $output_l0, $output_l0, $rk14_l @ AES block 0 - round 14 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result fmov $ctr1d, $ctr96_b64x @ CTR block 5 - ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext - add $input_ptr, $input_ptr, #64 @ AES input_ptr update + ld1 {$res3b}, [$input_ptr], #16 @ AES block 3 - load ciphertext fmov $ctr1.d[1], $ctr32x @ CTR block 5 rev $ctr32w, $rctr32w 
@ CTR block 6 add $rctr32w, $rctr32w, #1 @ CTR block 6 eor $output_l1, $output_l1, $rk14_l @ AES block 1 - round 14 low +#ifdef __AARCH64EB__ + rev $output_l1, $output_l1 +#endif orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 eor $output_h1, $output_h1, $rk14_h @ AES block 1 - round 14 high +#ifdef __AARCH64EB__ + rev $output_h1, $output_h1 +#endif stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result @@ -5027,7 +5346,9 @@ aes_gcm_dec_256_kernel: aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high - +#ifdef __AARCH64EB__ + rev $output_h2, $output_h2 +#endif aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid @@ -5036,7 +5357,9 @@ aes_gcm_dec_256_kernel: aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low - +#ifdef __AARCH64EB__ + rev $output_l2, $output_l2 +#endif aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result @@ -5049,9 +5372,14 @@ aes_gcm_dec_256_kernel: pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low - +#ifdef __AARCH64EB__ + rev $output_l3, $output_l3 +#endif pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high +#ifdef __AARCH64EB__ + rev $output_h3, $output_h3 +#endif eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 @@ -5145,7 +5473,7 @@ aes_gcm_dec_256_kernel: eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 - ldr $res0q, [$input_ptr, #0] @ AES block 
4k+4 - load ciphertext + ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext aese $ctr0b, $rk13 @ AES block 4k+4 - round 13 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment @@ -5154,7 +5482,7 @@ aes_gcm_dec_256_kernel: eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 - ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext + ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result @@ -5166,10 +5494,10 @@ aes_gcm_dec_256_kernel: eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 - ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext + ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12 - ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext + ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+7 - load ciphertext aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high @@ -5178,7 +5506,6 @@ aes_gcm_dec_256_kernel: eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid aese $ctr1b, $rk13 @ AES block 4k+5 - round 13 - add $input_ptr, $input_ptr, #64 @ AES input_ptr update mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12 @@ -5198,8 +5525,13 @@ aes_gcm_dec_256_kernel: add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high - +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif mov $output_h1, $ctr1.d[1] @ AES block 
4k+5 - mov high eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low @@ -5219,9 +5551,15 @@ aes_gcm_dec_256_kernel: rev64 $res1b, $res1b @ GHASH block 4k+5 eor $output_h1, $output_h1, $rk14_h @ AES block 4k+5 - round 14 high +#ifdef __AARCH64EB__ + rev $output_h1, $output_h1 +#endif stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result eor $output_l1, $output_l1, $rk14_l @ AES block 4k+5 - round 14 low +#ifdef __AARCH64EB__ + rev $output_l1, $output_l1 +#endif stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result rev64 $res0b, $res0b @ GHASH block 4k+4 @@ -5385,10 +5723,14 @@ aes_gcm_dec_256_kernel: aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high - +#ifdef __AARCH64EB__ + rev $output_h2, $output_h2 +#endif aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low - +#ifdef __AARCH64EB__ + rev $output_l3, $output_l3 +#endif aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid @@ -5397,11 +5739,17 @@ aes_gcm_dec_256_kernel: aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11 eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low +#ifdef __AARCH64EB__ + rev $output_l2, $output_l2 +#endif aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high +#ifdef __AARCH64EB__ + rev $output_h3, $output_h3 +#endif aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result @@ -5438,8 +5786,14 @@ aes_gcm_dec_256_kernel: cmp 
$main_end_input_ptr, #48 eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif b.gt .L256_dec_blocks_more_than_3 sub $rctr32w, $rctr32w, #1 @@ -5487,9 +5841,15 @@ aes_gcm_dec_256_kernel: pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid eor $output_l0, $output_l0, $rk14_l @ AES final-2 block - round 14 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low eor $output_h0, $output_h0, $rk14_h @ AES final-2 block - round 14 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif .L256_dec_blocks_more_than_2: @ blocks left > 2 rev64 $res0b, $res1b @ GHASH final-2 block @@ -5517,9 +5877,15 @@ aes_gcm_dec_256_kernel: eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high eor $output_l0, $output_l0, $rk14_l @ AES final-1 block - round 14 low +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid eor $output_h0, $output_h0, $rk14_h @ AES final-1 block - round 14 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif .L256_dec_blocks_more_than_1: @ blocks left > 1 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result @@ -5547,13 +5913,18 @@ aes_gcm_dec_256_kernel: pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid eor $output_l0, $output_l0, $rk14_l @ AES final block - round 14 low - +#ifdef __AARCH64EB__ + rev $output_l0, $output_l0 +#endif eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid eor $output_h0, $output_h0, $rk14_h @ AES final block - round 14 high +#ifdef __AARCH64EB__ + rev $output_h0, $output_h0 +#endif .L256_dec_blocks_less_than_1: @ 
blocks left <= 1 and $bit_length, $bit_length, #127 @ bit_length %= 128 @@ -5579,7 +5950,11 @@ aes_gcm_dec_256_kernel: mov $ctr0.d[1], $ctr96_b64x bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes +#ifndef __AARCH64EB__ rev $ctr32w, $rctr32w +#else + mov $ctr32w, $rctr32w +#endif bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes @@ -5720,7 +6095,7 @@ if ($flavour =~ /64/) { ######## 64-bit code if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) { print " it $2\n"; } - + s/__AARCH64E([BL])__/__ARME$1__/go; print $_,"\n"; } } -- cgit v1.2.1