Optimize AES-XTS mode in OpenSSL for aarch64

AES-XTS mode can be optimized by interleaving the cipher operations of several blocks and by loop unrolling. Interleaving needs one ideal unrolling factor; here we adopt the same factor as aes-cbc, which is described as below: If blocks number > 5, select 5 blocks as one iteration, and on every loop decrease the blocks number by 5. If left blocks < 5, treat them as tail blocks. The detailed implementation has a little adjustment for squeezing code space. In this way, for small sizes such as 16 bytes, the performance is similar to before, but for big sizes such as 16k bytes, the performance improves a lot, even reaching a 2x uplift; for some arches such as A57, the improvement even reaches more than a 2x uplift. We collected performance data on many different micro-archs such as thunderx2, ampere-emag, a72, a75, a57, a53 and N1, all of which reach 0.5-2x uplift. The following table lists the encryption performance data on aarch64, taking a72, a75, a57, a53 and N1 as examples. Performance values are in cycles per byte and are presented as a before/after comparison. 
List them as below: A72: Before optimization After optimization Improve evp-aes-128-xts@16 8.899913518 5.949087263 49.60% evp-aes-128-xts@64 4.525512668 3.389141845 33.53% evp-aes-128-xts@256 3.502906908 1.633573479 114.43% evp-aes-128-xts@1024 3.174210419 1.155952639 174.60% evp-aes-128-xts@8192 3.053019303 1.028134888 196.95% evp-aes-128-xts@16384 3.025292462 1.02021169 196.54% evp-aes-256-xts@16 9.971105023 6.754233758 47.63% evp-aes-256-xts@64 4.931479093 3.786527393 30.24% evp-aes-256-xts@256 3.746788153 1.943975947 92.74% evp-aes-256-xts@1024 3.401743802 1.477394648 130.25% evp-aes-256-xts@8192 3.278769327 1.32950421 146.62% evp-aes-256-xts@16384 3.27093296 1.325276257 146.81% A75: Before optimization After optimization Improve evp-aes-128-xts@16 8.397965173 5.126839098 63.80% evp-aes-128-xts@64 4.176860631 2.59817764 60.76% evp-aes-128-xts@256 3.069126585 1.284561028 138.92% evp-aes-128-xts@1024 2.805962699 0.932754655 200.83% evp-aes-128-xts@8192 2.725820131 0.829820397 228.48% evp-aes-128-xts@16384 2.71521905 0.823251591 229.82% evp-aes-256-xts@16 11.24790935 7.383914448 52.33% evp-aes-256-xts@64 5.294128847 3.048641998 73.66% evp-aes-256-xts@256 3.861649617 1.570359905 145.91% evp-aes-256-xts@1024 3.537646797 1.200493533 194.68% evp-aes-256-xts@8192 3.435353012 1.085345319 216.52% evp-aes-256-xts@16384 3.437952563 1.097963822 213.12% A57: Before optimization After optimization Improve evp-aes-128-xts@16 10.57455446 7.165438012 47.58% evp-aes-128-xts@64 5.418185447 3.721241202 45.60% evp-aes-128-xts@256 3.855184592 1.747145379 120.66% evp-aes-128-xts@1024 3.477199757 1.253049735 177.50% evp-aes-128-xts@8192 3.36768104 1.091943159 208.41% evp-aes-128-xts@16384 3.360373443 1.088942789 208.59% evp-aes-256-xts@16 12.54559459 8.745489036 43.45% evp-aes-256-xts@64 6.542808937 4.326387568 51.23% evp-aes-256-xts@256 4.62668822 2.119908754 118.25% evp-aes-256-xts@1024 4.161716505 1.557335554 167.23% evp-aes-256-xts@8192 4.032462227 1.377749511 192.68% 
evp-aes-256-xts@16384 4.023293877 1.371558933 193.34% A53: Before optimization After optimization Improve evp-aes-128-xts@16 18.07842135 13.96980808 29.40% evp-aes-128-xts@64 7.933818397 6.07159276 30.70% evp-aes-128-xts@256 5.264604704 2.611155744 101.60% evp-aes-128-xts@1024 4.606660117 1.722713454 167.40% evp-aes-128-xts@8192 4.405160115 1.454379201 202.90% evp-aes-128-xts@16384 4.401592028 1.442279392 205.20% evp-aes-256-xts@16 20.07084054 16.00803726 25.40% evp-aes-256-xts@64 9.192647294 6.883876732 33.50% evp-aes-256-xts@256 6.336143161 3.108140452 103.90% evp-aes-256-xts@1024 5.62502952 2.097960651 168.10% evp-aes-256-xts@8192 5.412085608 1.807294191 199.50% evp-aes-256-xts@16384 5.403062591 1.790135764 201.80% N1: Before optimization After optimization Improve evp-aes-128-xts@16 6.48147613 4.209415473 53.98% evp-aes-128-xts@64 2.847744115 1.950757468 45.98% evp-aes-128-xts@256 2.085711968 1.061903238 96.41% evp-aes-128-xts@1024 1.842014669 0.798486302 130.69% evp-aes-128-xts@8192 1.760449052 0.713853939 146.61% evp-aes-128-xts@16384 1.760763546 0.707702009 148.80% evp-aes-256-xts@16 7.264142817 5.265970454 37.94% evp-aes-256-xts@64 3.251356212 2.41176323 34.81% evp-aes-256-xts@256 2.380488469 1.342095742 77.37% evp-aes-256-xts@1024 2.08853022 1.041718215 100.49% evp-aes-256-xts@8192 2.027432668 0.944571334 114.64% evp-aes-256-xts@16384 2.00740782 0.941991415 113.10% Add more XTS test cases to cover the cipher stealing mode and cases of different number of blocks. CustomizedGitHooks: yes Change-Id: I93ee31b2575e1413764e27b599af62994deb4c96 Reviewed-by: Paul Dale <paul.dale@oracle.com> Reviewed-by: Tomas Mraz <tmraz@fedoraproject.org> (Merged from https://github.com/openssl/openssl/pull/11399)
author: XiaokangQian <xiaokang.qian@arm.com> 2020-03-13 03:27:34 +0000
committer: Tomas Mraz <tmraz@fedoraproject.org> 2020-11-12 11:09:22 +0100
commit: 9ce8e0d17e608de4f85f7543c52b146e3c6a2291 (patch)
tree: 6b7472519cee7aa202123ed90c9b982be41457a5 /crypto/aes
parent: c87a7f31a3db97376d764583ad5ee4a76db2cbef (diff)
download: openssl-new-9ce8e0d17e608de4f85f7543c52b146e3c6a2291.tar.gz
1 files changed, 1426 insertions, 0 deletions
diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl
index d084885049..ee2e29823a 100755
--- a/crypto/aes/asm/aesv8-armx.pl
+++ b/crypto/aes/asm/aesv8-armx.pl
@@ -2131,6 +2131,1432 @@ $code.=<<___;
 .size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
 ___
 }}}
+# Performance in cycles per byte.
+# Processed with AES-XTS different key size.
+# It shows the value before and after optimization as below:
+# (before/after):
+#
+#		AES-128-XTS		AES-256-XTS
+# Cortex-A57	3.36/1.09		4.02/1.37
+# Cortex-A72	3.03/1.02		3.28/1.33
+
+# Optimization is implemented by loop unrolling and interleaving.
+# Commonly, we choose the unrolling factor as 5, if the input
+# data size smaller than 5 blocks, but not smaller than 3 blocks,
+# choose 3 as the unrolling factor.
+# If the input data size dsize >= 5*16 bytes, then take 5 blocks
+# as one iteration, every loop the left size lsize -= 5*16.
+# If lsize < 5*16 bytes, treat them as the tail. Note: a remainder of
+# exactly 4*16 bytes is handled specially: it is folded back into the
+# 5*16 bytes loop to improve the efficiency.
+# There is one special case: if the original input data size dsize
+# = 16 bytes, we will treat it separately to improve the
+# performance: one independent code block without LR, FP load and
+# store.
+# Encryption will process the (length -tailcnt) bytes as mentioned
+# previously, then encrypt the composite block as the second-to-last
+# cipher block.
+# Decryption will process the (length -tailcnt -1) bytes as mentioned
+# previously, then decrypt the second-to-last cipher block to get the
+# last plain block (tail), and decrypt the composite block as the
+# second-to-last plain text block.
+
+{{{
+# Register aliases for ${prefix}_xts_encrypt.
+#
+# Several aliases deliberately name two views of the same register, so
+# they must never be live at the same time:
+#   $ivp (x5)     / $rounds0 (w5)
+#   $xoffset (x6) / $rounds  (w6)
+#   $tmpmx (x11)  / $tmpmw   (w11)
+#   $constnumx (x19) / $constnum (w19)
+#   $midnumx (x22)   / $midnum   (w22)
+
+# Function arguments, in order: x0..x5.
+my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
+my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
+# Scratch pointers/bytes used by the ciphertext-stealing tail.
+my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
+my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
+my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
+# The q6 slot is intentionally skipped with undef: $tmp2 is declared
+# exactly once, further below, as q9.  Declaring it here as well made
+# Perl report that the later "my" masks an earlier declaration in the
+# same scope, and q6 is not a usable home for it anyway because $iv0
+# below is placed in register number 6.
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,undef,$rndlast)=map("q$_",(0..7));
+# The five tweaks ("ivs") of one 5-block iteration, plus 64-bit views of
+# their low (dN) and high (vN.d[1]) halves for moving to/from x registers.
+my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
+my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
+my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
+
+# Holds the composite block during ciphertext stealing.
+my ($tmpin)=("v26.16b");
+my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
+
+# q7	last round key
+# q10-q15, q7	Last 7 round keys
+# q8-q9	preloaded round keys except last 7 keys for big size
+# q20, q21, q8-q9	preloaded round keys except last 7 keys for only 16 byte
+
+
+# $tmp2 shares q9 with a preloaded round key: it is only written after
+# the last use of that key in a pass, and the key is re-loaded afterwards.
+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
+
+my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
+my ($dat4,$in4,$tmp4);
+if ($flavour =~ /64/) {
+    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
+}
+
+# ${prefix}_xts_encrypt: AES-XTS encryption, emitted for 64-bit flavours only.
+#
+# Register arguments (see the aliases declared at the top of this scope):
+#   x0 = $inp   input pointer        x3 = $key1  key schedule for the data
+#   x1 = $out   output pointer       x4 = $key2  key schedule for the tweak
+#   x2 = $len   length in bytes      x5 = $ivp   pointer to the 16-byte iv
+# The round count is read from byte offset 240 of each key schedule.
+$code.=<<___	if ($flavour =~ /64/);
+.globl	${prefix}_xts_encrypt
+.type	${prefix}_xts_encrypt,%function
+.align	5
+${prefix}_xts_encrypt:
+___
+# Fast path for len == 16: a single block, no stack frame, straight-line
+# rounds.  Every other length branches to .Lxts_enc_big_size.
+#
+# Round keys are loaded word-wise (vld1.32) everywhere, including the
+# first two key2 round keys, which used to be loaded byte-wise.  Both
+# forms are identical on little-endian, but only the word-wise form
+# matches the remaining key-schedule loads on big-endian.
+$code.=<<___	if ($flavour =~ /64/);
+	cmp	$len,#16
+	// Any length other than exactly 16 bytes takes the general path.
+	b.ne	.Lxts_enc_big_size
+	// Encrypt the iv with key2, as the first XEX iv.
+	ldr	$rounds,[$key2,#240]
+	vld1.32	{$dat},[$key2],#16
+	vld1.8	{$iv0},[$ivp]
+	sub	$rounds,$rounds,#2
+	vld1.32	{$dat1},[$key2],#16
+
+.Loop_enc_iv_enc:
+	aese	$iv0,$dat
+	aesmc	$iv0,$iv0
+	vld1.32	{$dat},[$key2],#16
+	subs	$rounds,$rounds,#2
+	aese	$iv0,$dat1
+	aesmc	$iv0,$iv0
+	vld1.32	{$dat1},[$key2],#16
+	b.gt	.Loop_enc_iv_enc
+
+	aese	$iv0,$dat
+	aesmc	$iv0,$iv0
+	vld1.32	{$dat},[$key2]
+	aese	$iv0,$dat1
+	veor	$iv0,$iv0,$dat
+
+	// Whiten the single plaintext block with the tweak.
+	vld1.8	{$dat0},[$inp]
+	veor	$dat0,$iv0,$dat0
+
+	ldr	$rounds,[$key1,#240]
+	vld1.32	{q20-q21},[$key1],#32		// load key schedule...
+
+	aese	$dat0,q20
+	aesmc	$dat0,$dat0
+	vld1.32	{q8-q9},[$key1],#32		// load key schedule...
+	aese	$dat0,q21
+	aesmc	$dat0,$dat0
+	subs	$rounds,$rounds,#10		// if rounds==10, jump to aes-128-xts processing
+	b.eq	.Lxts_128_enc
+	// More than 10 rounds: run the extra rounds two at a time.
+.Lxts_enc_round_loop:
+	aese	$dat0,q8
+	aesmc	$dat0,$dat0
+	vld1.32	{q8},[$key1],#16		// load key schedule...
+	aese	$dat0,q9
+	aesmc	$dat0,$dat0
+	vld1.32	{q9},[$key1],#16		// load key schedule...
+	subs	$rounds,$rounds,#2		// bias
+	b.gt	.Lxts_enc_round_loop
+	// Common final eight rounds.
+.Lxts_128_enc:
+	vld1.32	{q10-q11},[$key1],#32		// load key schedule...
+	aese	$dat0,q8
+	aesmc	$dat0,$dat0
+	aese	$dat0,q9
+	aesmc	$dat0,$dat0
+	vld1.32	{q12-q13},[$key1],#32		// load key schedule...
+	aese	$dat0,q10
+	aesmc	$dat0,$dat0
+	aese	$dat0,q11
+	aesmc	$dat0,$dat0
+	vld1.32	{q14-q15},[$key1],#32		// load key schedule...
+	aese	$dat0,q12
+	aesmc	$dat0,$dat0
+	aese	$dat0,q13
+	aesmc	$dat0,$dat0
+	vld1.32	{$rndlast},[$key1]
+	aese	$dat0,q14
+	aesmc	$dat0,$dat0
+	aese	$dat0,q15
+	veor	$dat0,$dat0,$rndlast
+	// Remove the tweak again and store the ciphertext block.
+	veor	$dat0,$dat0,$iv0
+	vst1.8	{$dat0},[$out]
+	// No frame was built on this path, so skip the register restore.
+	b	.Lxts_enc_final_abort
+
+.align	4
+.Lxts_enc_big_size:
+___
+# General path (len != 16): build the frame, derive the first tweaks and
+# preload the round keys that stay resident for the whole call.
+$code.=<<___	if ($flavour =~ /64/);
+	// Build a 64-byte frame holding the callee-saved general and
+	// vector registers that this path overwrites.
+	stp	$constnumx,$tmpinp,[sp,#-64]!
+	stp	$tailcnt,$midnumx,[sp,#48]
+	stp	$ivd10,$ivd20,[sp,#32]
+	stp	$ivd30,$ivd40,[sp,#16]
+
+	// tailcnt store the tail value of length%16.
+	and	$tailcnt,$len,#0xf
+	and	$len,$len,#-16
+	subs	$len,$len,#16
+	mov	$step,#16
+	// Fewer than 16 bytes: nothing to do, just unwind the frame.
+	b.lo	.Lxts_abort
+	// Exactly one full block: step becomes 0 so the first load below
+	// does not advance the input pointer.
+	csel	$step,xzr,$step,eq
+
+	// Firstly, encrypt the iv with key2, as the first iv of XEX.
+	ldr	$rounds,[$key2,#240]
+	vld1.32	{$dat},[$key2],#16
+	vld1.8	{$iv0},[$ivp]
+	sub	$rounds,$rounds,#2
+	vld1.32	{$dat1},[$key2],#16
+
+.Loop_iv_enc:
+	aese	$iv0,$dat
+	aesmc	$iv0,$iv0
+	vld1.32	{$dat},[$key2],#16
+	subs	$rounds,$rounds,#2
+	aese	$iv0,$dat1
+	aesmc	$iv0,$iv0
+	vld1.32	{$dat1},[$key2],#16
+	b.gt	.Loop_iv_enc
+
+	aese	$iv0,$dat
+	aesmc	$iv0,$iv0
+	vld1.32	{$dat},[$key2]
+	aese	$iv0,$dat1
+	veor	$iv0,$iv0,$dat
+
+	// The iv for second block
+	// $ivl- iv(low), $ivh - iv(high)
+	// the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
+	//
+	// Each update doubles the 128-bit tweak held in the low/high pair:
+	// the pair is shifted left by one bit and, when the top bit of the
+	// high half was set, 0x87 is folded into the low half.
+	fmov	$ivl,$ivd00
+	fmov	$ivh,$ivd01
+	mov	$constnum,#0x87
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr#31
+	eor	$ivl,$tmpmx,$ivl,lsl#1
+	fmov	$ivd10,$ivl
+	fmov	$ivd11,$ivh
+
+	ldr	$rounds0,[$key1,#240]		// load rounds number
+	vld1.8	{$dat},[$inp],$step
+
+	vld1.32	{q8-q9},[$key1]			// load key schedule...
+	sub	$rounds0,$rounds0,#6
+	// The iv pointer is no longer needed; its register is the 64-bit
+	// view of the rounds0 register, so this is key1 + 16*(rounds-6).
+	add	$key_,$key1,$ivp,lsl#4		// pointer to last 7 round keys
+	sub	$rounds0,$rounds0,#2
+	vld1.32	{q10-q11},[$key_],#32
+	vld1.32	{q12-q13},[$key_],#32
+	vld1.32	{q14-q15},[$key_],#32
+	vld1.32	{$rndlast},[$key_]
+
+	// Point at the third round key; the first two are already loaded.
+	add	$key_,$key1,#32
+	mov	$rounds,$rounds0
+
+	// Encryption
+	// Dispatch on the number of full blocks: one or two go to the
+	// inner tail, three or four to the outer tail, five or more to
+	// the 5-way interleaved loop.
+.Lxts_enc:
+	vld1.8	{$dat2},[$inp],#16
+	subs	$len,$len,#32			// bias
+	add	$rounds,$rounds0,#2
+	// Seed the registers the inner tail expects, in case we branch.
+	vorr	$in1,$dat,$dat
+	vorr	$dat1,$dat,$dat
+	vorr	$in3,$dat,$dat
+	vorr	$in2,$dat2,$dat2
+	vorr	$in4,$dat2,$dat2
+	b.lo	.Lxts_inner_enc_tail
+	veor	$dat,$dat,$iv0			// before encryption, xor with iv
+	veor	$dat2,$dat2,$iv1
+
+	// The iv for third block
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr#31
+	eor	$ivl,$tmpmx,$ivl,lsl#1
+	fmov	$ivd20,$ivl
+	fmov	$ivd21,$ivh
+
+
+	vorr	$dat1,$dat2,$dat2
+	vld1.8	{$dat2},[$inp],#16
+	vorr	$in0,$dat,$dat
+	vorr	$in1,$dat1,$dat1
+	veor	$in2,$dat2,$iv2 		// the third block
+	veor	$dat2,$dat2,$iv2
+	cmp	$len,#32
+	b.lo	.Lxts_outer_enc_tail
+
+	// The iv for fourth block
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr#31
+	eor	$ivl,$tmpmx,$ivl,lsl#1
+	fmov	$ivd30,$ivl
+	fmov	$ivd31,$ivh
+
+	vld1.8	{$dat3},[$inp],#16
+	// The iv for fifth block
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr#31
+	eor	$ivl,$tmpmx,$ivl,lsl#1
+	fmov	$ivd40,$ivl
+	fmov	$ivd41,$ivh
+
+	vld1.8	{$dat4},[$inp],#16
+	veor	$dat3,$dat3,$iv3		// the fourth block
+	veor	$dat4,$dat4,$iv4
+	sub	$len,$len,#32			// bias
+	mov	$rounds,$rounds0
+	b	.Loop5x_xts_enc
+
+.align	4
+	// Main loop: five blocks are encrypted in parallel per pass.
+	// On entry the five data registers already hold plaintext xor
+	// tweak, and the first two round keys are loaded.
+.Loop5x_xts_enc:
+	aese	$dat0,q8
+	aesmc	$dat0,$dat0
+	aese	$dat1,q8
+	aesmc	$dat1,$dat1
+	aese	$dat2,q8
+	aesmc	$dat2,$dat2
+	aese	$dat3,q8
+	aesmc	$dat3,$dat3
+	aese	$dat4,q8
+	aesmc	$dat4,$dat4
+	vld1.32	{q8},[$key_],#16
+	subs	$rounds,$rounds,#2
+	aese	$dat0,q9
+	aesmc	$dat0,$dat0
+	aese	$dat1,q9
+	aesmc	$dat1,$dat1
+	aese	$dat2,q9
+	aesmc	$dat2,$dat2
+	aese	$dat3,q9
+	aesmc	$dat3,$dat3
+	aese	$dat4,q9
+	aesmc	$dat4,$dat4
+	vld1.32	{q9},[$key_],#16
+	b.gt	.Loop5x_xts_enc
+
+	// Remaining rounds use the resident round keys, interleaved with
+	// the bookkeeping for the next pass.
+	aese	$dat0,q8
+	aesmc	$dat0,$dat0
+	aese	$dat1,q8
+	aesmc	$dat1,$dat1
+	aese	$dat2,q8
+	aesmc	$dat2,$dat2
+	aese	$dat3,q8
+	aesmc	$dat3,$dat3
+	aese	$dat4,q8
+	aesmc	$dat4,$dat4
+	// The flags set here are consumed by the csel below and by the
+	// b.hs at the bottom of the pass; nothing in between sets flags.
+	subs	$len,$len,#0x50			// because .Lxts_enc_tail4x
+
+	aese	$dat0,q9
+	aesmc	$dat0,$dat0
+	aese	$dat1,q9
+	aesmc	$dat1,$dat1
+	aese	$dat2,q9
+	aesmc	$dat2,$dat2
+	aese	$dat3,q9
+	aesmc	$dat3,$dat3
+	aese	$dat4,q9
+	aesmc	$dat4,$dat4
+	csel	$xoffset,xzr,$len,gt		// borrow x6, w6, "gt" is not typo
+	mov	$key_,$key1
+
+	aese	$dat0,q10
+	aesmc	$dat0,$dat0
+	aese	$dat1,q10
+	aesmc	$dat1,$dat1
+	aese	$dat2,q10
+	aesmc	$dat2,$dat2
+	aese	$dat3,q10
+	aesmc	$dat3,$dat3
+	aese	$dat4,q10
+	aesmc	$dat4,$dat4
+	add	$inp,$inp,$xoffset		// x0 is adjusted in such way that
+						// at exit from the loop v1.16b-v26.16b
+						// are loaded with last "words"
+	// Zero exactly when this pass was the re-run for a 4-block
+	// remainder; tested by the cbz further down.
+	add	$xoffset,$len,#0x60		// because .Lxts_enc_tail4x
+
+	aese	$dat0,q11
+	aesmc	$dat0,$dat0
+	aese	$dat1,q11
+	aesmc	$dat1,$dat1
+	aese	$dat2,q11
+	aesmc	$dat2,$dat2
+	aese	$dat3,q11
+	aesmc	$dat3,$dat3
+	aese	$dat4,q11
+	aesmc	$dat4,$dat4
+
+	aese	$dat0,q12
+	aesmc	$dat0,$dat0
+	aese	$dat1,q12
+	aesmc	$dat1,$dat1
+	aese	$dat2,q12
+	aesmc	$dat2,$dat2
+	aese	$dat3,q12
+	aesmc	$dat3,$dat3
+	aese	$dat4,q12
+	aesmc	$dat4,$dat4
+
+	aese	$dat0,q13
+	aesmc	$dat0,$dat0
+	aese	$dat1,q13
+	aesmc	$dat1,$dat1
+	aese	$dat2,q13
+	aesmc	$dat2,$dat2
+	aese	$dat3,q13
+	aesmc	$dat3,$dat3
+	aese	$dat4,q13
+	aesmc	$dat4,$dat4
+
+	aese	$dat0,q14
+	aesmc	$dat0,$dat0
+	aese	$dat1,q14
+	aesmc	$dat1,$dat1
+	aese	$dat2,q14
+	aesmc	$dat2,$dat2
+	aese	$dat3,q14
+	aesmc	$dat3,$dat3
+	aese	$dat4,q14
+	aesmc	$dat4,$dat4
+
+	// Last round: the final round key and the output tweak are
+	// pre-combined into one value per block, then the tweak register
+	// is immediately recycled for the next pass.
+	veor	$tmp0,$rndlast,$iv0
+	aese	$dat0,q15
+	// The iv for first block of one iteration
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr#31
+	eor	$ivl,$tmpmx,$ivl,lsl#1
+	fmov	$ivd00,$ivl
+	fmov	$ivd01,$ivh
+	veor	$tmp1,$rndlast,$iv1
+	vld1.8	{$in0},[$inp],#16
+	aese	$dat1,q15
+	// The iv for second block
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr#31
+	eor	$ivl,$tmpmx,$ivl,lsl#1
+	fmov	$ivd10,$ivl
+	fmov	$ivd11,$ivh
+	veor	$tmp2,$rndlast,$iv2
+	vld1.8	{$in1},[$inp],#16
+	aese	$dat2,q15
+	// The iv for third block
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr#31
+	eor	$ivl,$tmpmx,$ivl,lsl#1
+	fmov	$ivd20,$ivl
+	fmov	$ivd21,$ivh
+	veor	$tmp3,$rndlast,$iv3
+	vld1.8	{$in2},[$inp],#16
+	aese	$dat3,q15
+	// The iv for fourth block
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr#31
+	eor	$ivl,$tmpmx,$ivl,lsl#1
+	fmov	$ivd30,$ivl
+	fmov	$ivd31,$ivh
+	veor	$tmp4,$rndlast,$iv4
+	vld1.8	{$in3},[$inp],#16
+	aese	$dat4,q15
+
+	// The iv for fifth block
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr #31
+	eor	$ivl,$tmpmx,$ivl,lsl #1
+	fmov	$ivd40,$ivl
+	fmov	$ivd41,$ivh
+
+	vld1.8	{$in4},[$inp],#16
+	cbz	$xoffset,.Lxts_enc_tail4x
+	vld1.32 {q8},[$key_],#16		// re-pre-load rndkey[0]
+	// Finish the five outputs and, in the same breath, whiten the
+	// five inputs of the next pass.
+	veor	$tmp0,$tmp0,$dat0
+	veor	$dat0,$in0,$iv0
+	veor	$tmp1,$tmp1,$dat1
+	veor	$dat1,$in1,$iv1
+	veor	$tmp2,$tmp2,$dat2
+	veor	$dat2,$in2,$iv2
+	veor	$tmp3,$tmp3,$dat3
+	veor	$dat3,$in3,$iv3
+	veor	$tmp4,$tmp4,$dat4
+	vst1.8	{$tmp0},[$out],#16
+	veor	$dat4,$in4,$iv4
+	vst1.8	{$tmp1},[$out],#16
+	mov	$rounds,$rounds0
+	vst1.8	{$tmp2},[$out],#16
+	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
+	vst1.8	{$tmp3},[$out],#16
+	vst1.8	{$tmp4},[$out],#16
+	// Another full group of five was available: go round again.
+	b.hs	.Loop5x_xts_enc
+
+
+	// If left 4 blocks, borrow the five block's processing.
+	// The tweaks are shifted down by one slot so that lanes 1 to 4
+	// carry the four real blocks; lane 0 is a duplicate whose result
+	// is never stored (see .Lxts_enc_tail4x).
+	cmn	$len,#0x10
+	b.ne	.Loop5x_enc_after
+	vorr	$iv4,$iv3,$iv3
+	vorr	$iv3,$iv2,$iv2
+	vorr	$iv2,$iv1,$iv1
+	vorr	$iv1,$iv0,$iv0
+	fmov	$ivl,$ivd40
+	fmov	$ivh,$ivd41
+	veor	$dat0,$iv0,$in0
+	veor	$dat1,$iv1,$in1
+	veor	$dat2,$in2,$iv2
+	veor	$dat3,$in3,$iv3
+	veor	$dat4,$in4,$iv4
+	b.eq	.Loop5x_xts_enc
+
+	// Zero to three full blocks remain.
+.Loop5x_enc_after:
+	add	$len,$len,#0x50
+	cbz	$len,.Lxts_enc_done
+
+	add	$rounds,$rounds0,#2
+	subs	$len,$len,#0x30
+	b.lo	.Lxts_inner_enc_tail
+
+	// Exactly three blocks remain: they are the last three loaded.
+	veor	$dat0,$iv0,$in2
+	veor	$dat1,$iv1,$in3
+	veor	$dat2,$in4,$iv2
+	b	.Lxts_outer_enc_tail
+
+.align	4
+	// Exit of the 5-way loop after it was re-run for a remainder of
+	// four blocks: only lanes 1 to 4 are finished and stored, the
+	// result of lane 0 is discarded.
+.Lxts_enc_tail4x:
+	add	$inp,$inp,#16
+	veor	$tmp1,$dat1,$tmp1
+	vst1.8	{$tmp1},[$out],#16
+	veor	$tmp2,$dat2,$tmp2
+	vst1.8	{$tmp2},[$out],#16
+	veor	$tmp3,$dat3,$tmp3
+	veor	$tmp4,$dat4,$tmp4
+	vst1.8	{$tmp3-$tmp4},[$out],#32
+
+	b	.Lxts_enc_done
+.align	4
+	// Three-block tail: encrypts three whitened blocks in parallel.
+	// Entered with the round counter two higher than in the 5-way
+	// loop, because two of the resident round keys are not applied
+	// in the straight-line part below.
+.Lxts_outer_enc_tail:
+	aese	$dat0,q8
+	aesmc	$dat0,$dat0
+	aese	$dat1,q8
+	aesmc	$dat1,$dat1
+	aese	$dat2,q8
+	aesmc	$dat2,$dat2
+	vld1.32	{q8},[$key_],#16
+	subs	$rounds,$rounds,#2
+	aese	$dat0,q9
+	aesmc	$dat0,$dat0
+	aese	$dat1,q9
+	aesmc	$dat1,$dat1
+	aese	$dat2,q9
+	aesmc	$dat2,$dat2
+	vld1.32	{q9},[$key_],#16
+	b.gt	.Lxts_outer_enc_tail
+
+	aese	$dat0,q8
+	aesmc	$dat0,$dat0
+	aese	$dat1,q8
+	aesmc	$dat1,$dat1
+	aese	$dat2,q8
+	aesmc	$dat2,$dat2
+	veor	$tmp0,$iv0,$rndlast
+	subs	$len,$len,#0x30
+	// The iv for first block
+	// (derived from the third tweak, which the last of these three
+	// blocks is using)
+	fmov	$ivl,$ivd20
+	fmov	$ivh,$ivd21
+	//mov	$constnum,#0x87
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr#31
+	eor	$ivl,$tmpmx,$ivl,lsl#1
+	fmov	$ivd00,$ivl
+	fmov	$ivd01,$ivh
+	veor	$tmp1,$iv1,$rndlast
+	csel	$xoffset,$len,$xoffset,lo       // x6, w6, is zero at this point
+	aese	$dat0,q9
+	aesmc	$dat0,$dat0
+	aese	$dat1,q9
+	aesmc	$dat1,$dat1
+	aese	$dat2,q9
+	aesmc	$dat2,$dat2
+	veor	$tmp2,$iv2,$rndlast
+
+	add	$xoffset,$xoffset,#0x20
+	add	$inp,$inp,$xoffset
+	mov	$key_,$key1
+
+	aese	$dat0,q12
+	aesmc	$dat0,$dat0
+	aese	$dat1,q12
+	aesmc	$dat1,$dat1
+	aese	$dat2,q12
+	aesmc	$dat2,$dat2
+	aese	$dat0,q13
+	aesmc	$dat0,$dat0
+	aese	$dat1,q13
+	aesmc	$dat1,$dat1
+	aese	$dat2,q13
+	aesmc	$dat2,$dat2
+	aese	$dat0,q14
+	aesmc	$dat0,$dat0
+	aese	$dat1,q14
+	aesmc	$dat1,$dat1
+	aese	$dat2,q14
+	aesmc	$dat2,$dat2
+	aese	$dat0,q15
+	aese	$dat1,q15
+	aese	$dat2,q15
+	vld1.8	{$in2},[$inp],#16
+	add	$rounds,$rounds0,#2
+	vld1.32	{q8},[$key_],#16                // re-pre-load rndkey[0]
+	veor	$tmp0,$tmp0,$dat0
+	veor	$tmp1,$tmp1,$dat1
+	veor	$dat2,$dat2,$tmp2
+	vld1.32	{q9},[$key_],#16                // re-pre-load rndkey[1]
+	vst1.8	{$tmp0},[$out],#16
+	vst1.8	{$tmp1},[$out],#16
+	vst1.8	{$dat2},[$out],#16
+	// len == -0x30 here means no full block is left.
+	cmn	$len,#0x30
+	b.eq	.Lxts_enc_done
+	// Otherwise fall through into the inner tail with the registers
+	// it expects as input.
+.Lxts_encxor_one:
+	vorr	$in3,$in1,$in1
+	vorr	$in4,$in2,$in2
+	nop
+
+	// One- or two-block tail.  Two lanes are always computed; with a
+	// single block only the second lane is stored (.Lxts_enc_one).
+.Lxts_inner_enc_tail:
+	// len == -0x10 selects the two-block case; otherwise the single
+	// block in the second lane uses the first tweak.
+	cmn	$len,#0x10
+	veor	$dat1,$in3,$iv0
+	veor	$dat2,$in4,$iv1
+	b.eq	.Lxts_enc_tail_loop
+	veor	$dat2,$in4,$iv0
+.Lxts_enc_tail_loop:
+	aese	$dat1,q8
+	aesmc	$dat1,$dat1
+	aese	$dat2,q8
+	aesmc	$dat2,$dat2
+	vld1.32	{q8},[$key_],#16
+	subs	$rounds,$rounds,#2
+	aese	$dat1,q9
+	aesmc	$dat1,$dat1
+	aese	$dat2,q9
+	aesmc	$dat2,$dat2
+	vld1.32	{q9},[$key_],#16
+	b.gt	.Lxts_enc_tail_loop
+
+	aese	$dat1,q8
+	aesmc	$dat1,$dat1
+	aese	$dat2,q8
+	aesmc	$dat2,$dat2
+	aese	$dat1,q9
+	aesmc	$dat1,$dat1
+	aese	$dat2,q9
+	aesmc	$dat2,$dat2
+	aese	$dat1,q12
+	aesmc	$dat1,$dat1
+	aese	$dat2,q12
+	aesmc	$dat2,$dat2
+	// Flags for the b.eq below; nothing in between sets flags.
+	cmn	$len,#0x20
+	aese	$dat1,q13
+	aesmc	$dat1,$dat1
+	aese	$dat2,q13
+	aesmc	$dat2,$dat2
+	veor	$tmp1,$iv0,$rndlast
+	aese	$dat1,q14
+	aesmc	$dat1,$dat1
+	aese	$dat2,q14
+	aesmc	$dat2,$dat2
+	veor	$tmp2,$iv1,$rndlast
+	aese	$dat1,q15
+	aese	$dat2,q15
+	b.eq	.Lxts_enc_one
+	// Two blocks: store both, then leave the tweak that follows the
+	// second one in the first tweak register for ciphertext stealing.
+	veor	$tmp1,$tmp1,$dat1
+	vst1.8	{$tmp1},[$out],#16
+	veor	$tmp2,$tmp2,$dat2
+	vorr	$iv0,$iv1,$iv1
+	vst1.8	{$tmp2},[$out],#16
+	fmov	$ivl,$ivd10
+	fmov	$ivh,$ivd11
+	mov	$constnum,#0x87
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr #31
+	eor	$ivl,$tmpmx,$ivl,lsl #1
+	fmov	$ivd00,$ivl
+	fmov	$ivd01,$ivh
+	b	.Lxts_enc_done
+
+	// One block: store the second lane, then advance the first tweak
+	// by one step for ciphertext stealing.
+.Lxts_enc_one:
+	veor	$tmp1,$tmp1,$dat2
+	vorr	$iv0,$iv0,$iv0
+	vst1.8	{$tmp1},[$out],#16
+	fmov	$ivl,$ivd00
+	fmov	$ivh,$ivd01
+	mov	$constnum,#0x87
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr #31
+	eor	$ivl,$tmpmx,$ivl,lsl #1
+	fmov	$ivd00,$ivl
+	fmov	$ivd01,$ivh
+	b	.Lxts_enc_done
+.align	5
+	// All full blocks are written.  If the length was not a multiple
+	// of 16, finish with ciphertext stealing; the first tweak register
+	// holds the tweak for the composite block at this point.
+.Lxts_enc_done:
+	// Process the tail block with cipher stealing.
+	tst	$tailcnt,#0xf
+	b.eq	.Lxts_abort
+
+	mov	$tmpinp,$inp
+	mov	$tmpoutp,$out
+	sub	$out,$out,#16
+	// Walk the tail bytes from last to first: the leading bytes of
+	// the last full ciphertext block become the short final block,
+	// and the plaintext tail bytes take their place, forming the
+	// composite block in the output buffer.
+.Lcomposite_enc_loop:
+	subs	$tailcnt,$tailcnt,#1
+	ldrb	$l2outp,[$out,$tailcnt]
+	ldrb	$loutp,[$tmpinp,$tailcnt]
+	strb	$l2outp,[$tmpoutp,$tailcnt]
+	strb	$loutp,[$out,$tailcnt]
+	b.gt	.Lcomposite_enc_loop
+.Lxts_enc_load_done:
+	vld1.8	{$tmpin},[$out]
+	veor	$tmpin,$tmpin,$iv0
+
+	// Encrypt the composite block to get the second-to-last encrypted text block
+	ldr	$rounds,[$key1,#240]		// load rounds number
+	// Round keys are loaded word-wise, like every other key load in
+	// this loop, so that the layout also matches on big-endian.
+	vld1.32	{$dat},[$key1],#16		// load key schedule...
+	sub	$rounds,$rounds,#2
+	vld1.32	{$dat1},[$key1],#16		// load key schedule...
+.Loop_final_enc:
+	aese	$tmpin,$dat0
+	aesmc	$tmpin,$tmpin
+	vld1.32	{$dat0},[$key1],#16
+	subs	$rounds,$rounds,#2
+	aese	$tmpin,$dat1
+	aesmc	$tmpin,$tmpin
+	vld1.32	{$dat1},[$key1],#16
+	b.gt	.Loop_final_enc
+
+	aese	$tmpin,$dat0
+	aesmc	$tmpin,$tmpin
+	vld1.32	{$dat0},[$key1]
+	aese	$tmpin,$dat1
+	veor	$tmpin,$tmpin,$dat0
+	veor	$tmpin,$tmpin,$iv0
+	vst1.8	{$tmpin},[$out]
+
+	// Common exit of the general path: restore the saved registers
+	// and release the 64-byte frame.
+.Lxts_abort:
+	ldp	$tailcnt,$midnumx,[sp,#48]
+	ldp	$ivd10,$ivd20,[sp,#32]
+	ldp	$ivd30,$ivd40,[sp,#16]
+	ldp	$constnumx,$tmpinp,[sp],#64
+	// Exit of the 16-byte fast path, which never built a frame.
+.Lxts_enc_final_abort:
+	ret
+.size	${prefix}_xts_encrypt,.-${prefix}_xts_encrypt
+___
+
+}}}
+{{{
+my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
+my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
+my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
+my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
+my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
+my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
+my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
+my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
+
+my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
+
+# q7	last round key
+# q10-q15, q7	Last 7 round keys
+# q8-q9	preloaded round keys except last 7 keys for big size
+# q20, q21, q8-q9	preloaded round keys except last 7 keys for only 16 byte
+
+{
+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
+
+my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
+my ($dat4,$in4,$tmp4);
+if ($flavour =~ /64/) {
+    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
+}
+
+$code.=<<___	if ($flavour =~ /64/);
+.globl	${prefix}_xts_decrypt
+.type	${prefix}_xts_decrypt,%function
+.align	5
+${prefix}_xts_decrypt:
+___
+$code.=<<___	if ($flavour =~ /64/);
+	cmp	$len,#16
+	// Original input data size bigger than 16, jump to big size processing.
+	b.ne	.Lxts_dec_big_size
+	// Encrypt the iv with key2, as the first XEX iv.
+	ldr	$rounds,[$key2,#240]
+	vld1.8	{$dat},[$key2],#16
+	vld1.8	{$iv0},[$ivp]
+	sub	$rounds,$rounds,#2
+	vld1.8	{$dat1},[$key2],#16
+
+.Loop_dec_small_iv_enc:
+	aese	$iv0,$dat
+	aesmc	$iv0,$iv0
+	vld1.32	{$dat},[$key2],#16
+	subs	$rounds,$rounds,#2
+	aese	$iv0,$dat1
+	aesmc	$iv0,$iv0
+	vld1.32	{$dat1},[$key2],#16
+	b.gt	.Loop_dec_small_iv_enc
+
+	aese	$iv0,$dat
+	aesmc	$iv0,$iv0
+	vld1.32	{$dat},[$key2]
+	aese	$iv0,$dat1
+	veor	$iv0,$iv0,$dat
+
+	vld1.8	{$dat0},[$inp]
+	veor	$dat0,$iv0,$dat0
+
+	ldr	$rounds,[$key1,#240]
+	vld1.32	{q20-q21},[$key1],#32			// load key schedule...
+
+	aesd	$dat0,q20
+	aesimc	$dat0,$dat0
+	vld1.32	{q8-q9},[$key1],#32			// load key schedule...
+	aesd	$dat0,q21
+	aesimc	$dat0,$dat0
+	subs	$rounds,$rounds,#10			// bias
+	b.eq	.Lxts_128_dec
+.Lxts_dec_round_loop:
+	aesd	$dat0,q8
+	aesimc	$dat0,$dat0
+	vld1.32	{q8},[$key1],#16			// load key schedule...
+	aesd	$dat0,q9
+	aesimc	$dat0,$dat0
+	vld1.32	{q9},[$key1],#16			// load key schedule...
+	subs	$rounds,$rounds,#2			// bias
+	b.gt	.Lxts_dec_round_loop
+.Lxts_128_dec:
+	vld1.32	{q10-q11},[$key1],#32			// load key schedule...
+	aesd	$dat0,q8
+	aesimc	$dat0,$dat0
+	aesd	$dat0,q9
+	aesimc	$dat0,$dat0
+	vld1.32	{q12-q13},[$key1],#32			// load key schedule...
+	aesd	$dat0,q10
+	aesimc	$dat0,$dat0
+	aesd	$dat0,q11
+	aesimc	$dat0,$dat0
+	vld1.32	{q14-q15},[$key1],#32			// load key schedule...
+	aesd	$dat0,q12
+	aesimc	$dat0,$dat0
+	aesd	$dat0,q13
+	aesimc	$dat0,$dat0
+	vld1.32	{$rndlast},[$key1]
+	aesd	$dat0,q14
+	aesimc	$dat0,$dat0
+	aesd	$dat0,q15
+	veor	$dat0,$dat0,$rndlast
+	veor	$dat0,$iv0,$dat0
+	vst1.8	{$dat0},[$out]
+	b	.Lxts_dec_final_abort
+.Lxts_dec_big_size:
+___
+$code.=<<___	if ($flavour =~ /64/);
+	stp	$constnumx,$tmpinp,[sp,#-64]!
+	stp	$tailcnt,$midnumx,[sp,#48]
+	stp	$ivd10,$ivd20,[sp,#32]
+	stp	$ivd30,$ivd40,[sp,#16]
+
+	and	$tailcnt,$len,#0xf
+	and	$len,$len,#-16
+	subs	$len,$len,#16
+	mov	$step,#16
+	b.lo	.Lxts_dec_abort
+
+	// Encrypt the iv with key2, as the first XEX iv
+	ldr	$rounds,[$key2,#240]
+	vld1.8	{$dat},[$key2],#16
+	vld1.8	{$iv0},[$ivp]
+	sub	$rounds,$rounds,#2
+	vld1.8	{$dat1},[$key2],#16
+
+.Loop_dec_iv_enc:
+	aese	$iv0,$dat
+	aesmc	$iv0,$iv0
+	vld1.32	{$dat},[$key2],#16
+	subs	$rounds,$rounds,#2
+	aese	$iv0,$dat1
+	aesmc	$iv0,$iv0
+	vld1.32	{$dat1},[$key2],#16
+	b.gt	.Loop_dec_iv_enc
+
+	aese	$iv0,$dat
+	aesmc	$iv0,$iv0
+	vld1.32	{$dat},[$key2]
+	aese	$iv0,$dat1
+	veor	$iv0,$iv0,$dat
+
+	// The iv for second block
+	// $ivl- iv(low), $ivh - iv(high)
+	// the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
+	fmov	$ivl,$ivd00
+	fmov	$ivh,$ivd01
+	mov	$constnum,#0x87
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr #31
+	eor	$ivl,$tmpmx,$ivl,lsl #1
+	fmov	$ivd10,$ivl
+	fmov	$ivd11,$ivh
+
+	ldr	$rounds0,[$key1,#240]		// load rounds number
+
+	// The iv for third block
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr #31
+	eor	$ivl,$tmpmx,$ivl,lsl #1
+	fmov	$ivd20,$ivl
+	fmov	$ivd21,$ivh
+
+	vld1.32	{q8-q9},[$key1]			// load key schedule...
+	sub	$rounds0,$rounds0,#6
+	add	$key_,$key1,$ivp,lsl#4		// pointer to last 7 round keys
+	sub	$rounds0,$rounds0,#2
+	vld1.32	{q10-q11},[$key_],#32		// load key schedule...
+	vld1.32	{q12-q13},[$key_],#32
+	vld1.32	{q14-q15},[$key_],#32
+	vld1.32	{$rndlast},[$key_]
+
+	// The iv for fourth block
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr #31
+	eor	$ivl,$tmpmx,$ivl,lsl #1
+	fmov	$ivd30,$ivl
+	fmov	$ivd31,$ivh
+
+	add	$key_,$key1,#32
+	mov	$rounds,$rounds0
+	b	.Lxts_dec
+
+	// Decryption
+.align	5
+.Lxts_dec:
+	tst	$tailcnt,#0xf
+	b.eq	.Lxts_dec_begin
+	subs	$len,$len,#16
+	csel	$step,xzr,$step,eq
+	vld1.8	{$dat},[$inp],#16
+	b.lo	.Lxts_done
+	sub	$inp,$inp,#16
+.Lxts_dec_begin:
+	vld1.8	{$dat},[$inp],$step
+	subs	$len,$len,#32			// bias
+	add	$rounds,$rounds0,#2
+	vorr	$in1,$dat,$dat
+	vorr	$dat1,$dat,$dat
+	vorr	$in3,$dat,$dat
+	vld1.8	{$dat2},[$inp],#16
+	vorr	$in2,$dat2,$dat2
+	vorr	$in4,$dat2,$dat2
+	b.lo	.Lxts_inner_dec_tail
+	veor	$dat,$dat,$iv0			// before decryt, xor with iv
+	veor	$dat2,$dat2,$iv1
+
+	vorr	$dat1,$dat2,$dat2
+	vld1.8	{$dat2},[$inp],#16
+	vorr	$in0,$dat,$dat
+	vorr	$in1,$dat1,$dat1
+	veor	$in2,$dat2,$iv2			// third block xox with third iv
+	veor	$dat2,$dat2,$iv2
+	cmp	$len,#32
+	b.lo	.Lxts_outer_dec_tail
+
+	vld1.8	{$dat3},[$inp],#16
+
+	// The iv for fifth block
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr #31
+	eor	$ivl,$tmpmx,$ivl,lsl #1
+	fmov	$ivd40,$ivl
+	fmov	$ivd41,$ivh
+
+	vld1.8	{$dat4},[$inp],#16
+	veor	$dat3,$dat3,$iv3		// the fourth block
+	veor	$dat4,$dat4,$iv4
+	sub $len,$len,#32			// bias
+	mov	$rounds,$rounds0
+	b	.Loop5x_xts_dec
+
+.align	4
+.Loop5x_xts_dec:
+	aesd	$dat0,q8
+	aesimc	$dat0,$dat0
+	aesd	$dat1,q8
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q8
+	aesimc	$dat2,$dat2
+	aesd	$dat3,q8
+	aesimc	$dat3,$dat3
+	aesd	$dat4,q8
+	aesimc	$dat4,$dat4
+	vld1.32	{q8},[$key_],#16		// load key schedule...
+	subs	$rounds,$rounds,#2
+	aesd	$dat0,q9
+	aesimc	$dat0,$dat0
+	aesd	$dat1,q9
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q9
+	aesimc	$dat2,$dat2
+	aesd	$dat3,q9
+	aesimc	$dat3,$dat3
+	aesd	$dat4,q9
+	aesimc	$dat4,$dat4
+	vld1.32	{q9},[$key_],#16		// load key schedule...
+	b.gt	.Loop5x_xts_dec
+
+	aesd	$dat0,q8
+	aesimc	$dat0,$dat0
+	aesd	$dat1,q8
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q8
+	aesimc	$dat2,$dat2
+	aesd	$dat3,q8
+	aesimc	$dat3,$dat3
+	aesd	$dat4,q8
+	aesimc	$dat4,$dat4
+	subs	$len,$len,#0x50			// because .Lxts_dec_tail4x
+
+	aesd	$dat0,q9
+	aesimc	$dat0,$dat
+	aesd	$dat1,q9
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q9
+	aesimc	$dat2,$dat2
+	aesd	$dat3,q9
+	aesimc	$dat3,$dat3
+	aesd	$dat4,q9
+	aesimc	$dat4,$dat4
+	csel	$xoffset,xzr,$len,gt		// borrow x6, w6, "gt" is not typo
+	mov	$key_,$key1
+
+	aesd	$dat0,q10
+	aesimc	$dat0,$dat0
+	aesd	$dat1,q10
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q10
+	aesimc	$dat2,$dat2
+	aesd	$dat3,q10
+	aesimc	$dat3,$dat3
+	aesd	$dat4,q10
+	aesimc	$dat4,$dat4
+	add	$inp,$inp,$xoffset		// x0 is adjusted in such way that
+						// at exit from the loop v1.16b-v26.16b
+						// are loaded with last "words"
+	add	$xoffset,$len,#0x60		// because .Lxts_dec_tail4x
+
+	aesd	$dat0,q11
+	aesimc	$dat0,$dat0
+	aesd	$dat1,q11
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q11
+	aesimc	$dat2,$dat2
+	aesd	$dat3,q11
+	aesimc	$dat3,$dat3
+	aesd	$dat4,q11
+	aesimc	$dat4,$dat4
+
+	aesd	$dat0,q12
+	aesimc	$dat0,$dat0
+	aesd	$dat1,q12
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q12
+	aesimc	$dat2,$dat2
+	aesd	$dat3,q12
+	aesimc	$dat3,$dat3
+	aesd	$dat4,q12
+	aesimc	$dat4,$dat4
+
+	aesd	$dat0,q13
+	aesimc	$dat0,$dat0
+	aesd	$dat1,q13
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q13
+	aesimc	$dat2,$dat2
+	aesd	$dat3,q13
+	aesimc	$dat3,$dat3
+	aesd	$dat4,q13
+	aesimc	$dat4,$dat4
+
+	aesd	$dat0,q14
+	aesimc	$dat0,$dat0
+	aesd	$dat1,q14
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q14
+	aesimc	$dat2,$dat2
+	aesd	$dat3,q14
+	aesimc	$dat3,$dat3
+	aesd	$dat4,q14
+	aesimc	$dat4,$dat4
+
+	veor	$tmp0,$rndlast,$iv0
+	aesd	$dat0,q15
+	// The iv for first block of next iteration.
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr #31
+	eor	$ivl,$tmpmx,$ivl,lsl #1
+	fmov	$ivd00,$ivl
+	fmov	$ivd01,$ivh
+	veor	$tmp1,$rndlast,$iv1
+	vld1.8	{$in0},[$inp],#16
+	aesd	$dat1,q15
+	// The iv for second block
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr #31
+	eor	$ivl,$tmpmx,$ivl,lsl #1
+	fmov	$ivd10,$ivl
+	fmov	$ivd11,$ivh
+	veor	$tmp2,$rndlast,$iv2
+	vld1.8	{$in1},[$inp],#16
+	aesd	$dat2,q15
+	// The iv for third block
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr #31
+	eor	$ivl,$tmpmx,$ivl,lsl #1
+	fmov	$ivd20,$ivl
+	fmov	$ivd21,$ivh
+	veor	$tmp3,$rndlast,$iv3
+	vld1.8	{$in2},[$inp],#16
+	aesd	$dat3,q15
+	// The iv for fourth block
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr #31
+	eor	$ivl,$tmpmx,$ivl,lsl #1
+	fmov	$ivd30,$ivl
+	fmov	$ivd31,$ivh
+	veor	$tmp4,$rndlast,$iv4
+	vld1.8	{$in3},[$inp],#16
+	aesd	$dat4,q15
+
+	// The iv for fifth block
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr #31
+	eor	$ivl,$tmpmx,$ivl,lsl #1
+	fmov	$ivd40,$ivl
+	fmov	$ivd41,$ivh
+
+	vld1.8	{$in4},[$inp],#16
+	cbz	$xoffset,.Lxts_dec_tail4x
+	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
+	veor	$tmp0,$tmp0,$dat0
+	veor	$dat0,$in0,$iv0
+	veor	$tmp1,$tmp1,$dat1
+	veor	$dat1,$in1,$iv1
+	veor	$tmp2,$tmp2,$dat2
+	veor	$dat2,$in2,$iv2
+	veor	$tmp3,$tmp3,$dat3
+	veor	$dat3,$in3,$iv3
+	veor	$tmp4,$tmp4,$dat4
+	vst1.8	{$tmp0},[$out],#16
+	veor	$dat4,$in4,$iv4
+	vst1.8	{$tmp1},[$out],#16
+	mov	$rounds,$rounds0
+	vst1.8	{$tmp2},[$out],#16
+	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
+	vst1.8	{$tmp3},[$out],#16
+	vst1.8	{$tmp4},[$out],#16
+	b.hs	.Loop5x_xts_dec
+
+	cmn	$len,#0x10
+	b.ne	.Loop5x_dec_after
+	// If x2 ($len) equals -0x10, there are 4 blocks left.
+	// After this special handling, reuse the five-block processing path.
+	// It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3.
+	vorr	$iv4,$iv3,$iv3
+	vorr	$iv3,$iv2,$iv2
+	vorr	$iv2,$iv1,$iv1
+	vorr	$iv1,$iv0,$iv0
+	fmov	$ivl,$ivd40
+	fmov	$ivh,$ivd41
+	veor	$dat0,$iv0,$in0
+	veor	$dat1,$iv1,$in1
+	veor	$dat2,$in2,$iv2
+	veor	$dat3,$in3,$iv3
+	veor	$dat4,$in4,$iv4
+	b.eq	.Loop5x_xts_dec
+
+.Loop5x_dec_after:
+	add	$len,$len,#0x50
+	cbz	$len,.Lxts_done
+
+	add	$rounds,$rounds0,#2
+	subs	$len,$len,#0x30
+	b.lo	.Lxts_inner_dec_tail
+
+	veor	$dat0,$iv0,$in2
+	veor	$dat1,$iv1,$in3
+	veor	$dat2,$in4,$iv2
+	b	.Lxts_outer_dec_tail
+
+.align	4
+.Lxts_dec_tail4x:
+	add	$inp,$inp,#16
+	vld1.32	{$dat0},[$inp],#16
+	veor	$tmp1,$dat1,$tmp0
+	vst1.8	{$tmp1},[$out],#16
+	veor	$tmp2,$dat2,$tmp2
+	vst1.8	{$tmp2},[$out],#16
+	veor	$tmp3,$dat3,$tmp3
+	veor	$tmp4,$dat4,$tmp4
+	vst1.8	{$tmp3-$tmp4},[$out],#32
+
+	b	.Lxts_done
+.align	4
+.Lxts_outer_dec_tail:
+	aesd	$dat0,q8
+	aesimc	$dat0,$dat0
+	aesd	$dat1,q8
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q8
+	aesimc	$dat2,$dat2
+	vld1.32	{q8},[$key_],#16
+	subs	$rounds,$rounds,#2
+	aesd	$dat0,q9
+	aesimc	$dat0,$dat0
+	aesd	$dat1,q9
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q9
+	aesimc	$dat2,$dat2
+	vld1.32	{q9},[$key_],#16
+	b.gt	.Lxts_outer_dec_tail
+
+	aesd	$dat0,q8
+	aesimc	$dat0,$dat0
+	aesd	$dat1,q8
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q8
+	aesimc	$dat2,$dat2
+	veor	$tmp0,$iv0,$rndlast
+	subs	$len,$len,#0x30
+	// The iv for first block
+	fmov	$ivl,$ivd20
+	fmov	$ivh,$ivd21
+	mov	$constnum,#0x87
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr #31
+	eor	$ivl,$tmpmx,$ivl,lsl #1
+	fmov	$ivd00,$ivl
+	fmov	$ivd01,$ivh
+	veor	$tmp1,$iv1,$rndlast
+	csel	$xoffset,$len,$xoffset,lo	// x6, w6, is zero at this point
+	aesd	$dat0,q9
+	aesimc	$dat0,$dat0
+	aesd	$dat1,q9
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q9
+	aesimc	$dat2,$dat2
+	veor	$tmp2,$iv2,$rndlast
+	// The iv for second block
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr #31
+	eor	$ivl,$tmpmx,$ivl,lsl #1
+	fmov	$ivd10,$ivl
+	fmov	$ivd11,$ivh
+
+	add	$xoffset,$xoffset,#0x20
+	add	$inp,$inp,$xoffset		// $inp is adjusted to the last data
+
+	mov	$key_,$key1
+
+	// The iv for third block
+	extr	$midnumx,$ivh,$ivh,#32
+	extr	$ivh,$ivh,$ivl,#63
+	and	$tmpmw,$constnum,$midnum,asr #31
+	eor	$ivl,$tmpmx,$ivl,lsl #1
+	fmov	$ivd20,$ivl
+	fmov	$ivd21,$ivh
+
+	aesd	$dat0,q12
+	aesimc	$dat0,$dat0
+	aesd	$dat1,q12
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q12
+	aesimc	$dat2,$dat2
+	aesd	$dat0,q13
+	aesimc	$dat0,$dat0
+	aesd	$dat1,q13
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q13
+	aesimc	$dat2,$dat2
+	aesd	$dat0,q14
+	aesimc	$dat0,$dat0
+	aesd	$dat1,q14
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q14
+	aesimc	$dat2,$dat2
+	vld1.8	{$in2},[$inp],#16
+	aesd	$dat0,q15
+	aesd	$dat1,q15
+	aesd	$dat2,q15
+	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
+	add	$rounds,$rounds0,#2
+	veor	$tmp0,$tmp0,$dat0
+	veor	$tmp1,$tmp1,$dat1
+	veor	$dat2,$dat2,$tmp2
+	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
+	vst1.8	{$tmp0},[$out],#16
+	vst1.8	{$tmp1},[$out],#16
+	vst1.8	{$dat2},[$out],#16
+
+	cmn	$len,#0x30
+	add	$len,$len,#0x30
+	b.eq	.Lxts_done
+	sub	$len,$len,#0x30
+	vorr	$in3,$in1,$in1
+	vorr	$in4,$in2,$in2
+	nop
+
+.Lxts_inner_dec_tail:
+	// $len == -0x10 means two blocks left.
+	cmn	$len,#0x10
+	veor	$dat1,$in3,$iv0
+	veor	$dat2,$in4,$iv1
+	b.eq	.Lxts_dec_tail_loop
+	veor	$dat2,$in4,$iv0
+.Lxts_dec_tail_loop:
+	aesd	$dat1,q8
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q8
+	aesimc	$dat2,$dat2
+	vld1.32	{q8},[$key_],#16
+	subs	$rounds,$rounds,#2
+	aesd	$dat1,q9
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q9
+	aesimc	$dat2,$dat2
+	vld1.32	{q9},[$key_],#16
+	b.gt	.Lxts_dec_tail_loop
+
+	aesd	$dat1,q8
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q8
+	aesimc	$dat2,$dat2
+	aesd	$dat1,q9
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q9
+	aesimc	$dat2,$dat2
+	aesd	$dat1,q12
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q12
+	aesimc	$dat2,$dat2
+	cmn	$len,#0x20
+	aesd	$dat1,q13
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q13
+	aesimc	$dat2,$dat2
+	veor	$tmp1,$iv0,$rndlast
+	aesd	$dat1,q14
+	aesimc	$dat1,$dat1
+	aesd	$dat2,q14
+	aesimc	$dat2,$dat2
+	veor	$tmp2,$iv1,$rndlast
+	aesd	$dat1,q15
+	aesd	$dat2,q15
+	b.eq	.Lxts_dec_one
+	veor	$tmp1,$tmp1,$dat1
+	veor	$tmp2,$tmp2,$dat2
+	vorr	$iv0,$iv2,$iv2
+	vorr	$iv1,$iv3,$iv3
+	vst1.8	{$tmp1},[$out],#16
+	vst1.8	{$tmp2},[$out],#16
+	add	$len,$len,#16
+	b	.Lxts_done
+
+.Lxts_dec_one:
+	veor	$tmp1,$tmp1,$dat2
+	vorr	$iv0,$iv1,$iv1
+	vorr	$iv1,$iv2,$iv2
+	vst1.8	{$tmp1},[$out],#16
+	add	$len,$len,#32
+
+.Lxts_done:
+	tst	$tailcnt,#0xf
+	b.eq	.Lxts_dec_abort
+	// Processing the last two blocks with cipher stealing.
+	mov	x7,x3
+	cbnz	x2,.Lxts_dec_1st_done
+	vld1.32	{$dat0},[$inp],#16
+
+	// Decrypt the second-to-last block to get the last plain text block
+.Lxts_dec_1st_done:
+	eor	$tmpin,$dat0,$iv1
+	ldr	$rounds,[$key1,#240]
+	vld1.32	{$dat0},[$key1],#16
+	sub	$rounds,$rounds,#2
+	vld1.32	{$dat1},[$key1],#16
+.Loop_final_2nd_dec:
+	aesd	$tmpin,$dat0
+	aesimc	$tmpin,$tmpin
+	vld1.32	{$dat0},[$key1],#16		// load key schedule...
+	subs	$rounds,$rounds,#2
+	aesd	$tmpin,$dat1
+	aesimc	$tmpin,$tmpin
+	vld1.32	{$dat1},[$key1],#16		// load key schedule...
+	b.gt	.Loop_final_2nd_dec
+
+	aesd	$tmpin,$dat0
+	aesimc	$tmpin,$tmpin
+	vld1.32	{$dat0},[$key1]
+	aesd	$tmpin,$dat1
+	veor	$tmpin,$tmpin,$dat0
+	veor	$tmpin,$tmpin,$iv1
+	vst1.8	{$tmpin},[$out]
+
+	mov	$tmpinp,$inp
+	add	$tmpoutp,$out,#16
+
+	// Merge the trailing tailcnt bytes (the partial, non-16-byte-aligned block) into the
+	// second-to-last plain text block to build the last encrypted block.
+.composite_dec_loop:
+	subs	$tailcnt,$tailcnt,#1
+	ldrb	$l2outp,[$out,$tailcnt]
+	ldrb	$loutp,[$tmpinp,$tailcnt]
+	strb	$l2outp,[$tmpoutp,$tailcnt]
+	strb	$loutp,[$out,$tailcnt]
+	b.gt	.composite_dec_loop
+.Lxts_dec_load_done:
+	vld1.8	{$tmpin},[$out]
+	veor	$tmpin,$tmpin,$iv0
+
+	// Decrypt the composite block to get the second-to-last plain text block
+	ldr	$rounds,[$key_,#240]
+	vld1.8	{$dat},[$key_],#16
+	sub	$rounds,$rounds,#2
+	vld1.8	{$dat1},[$key_],#16
+.Loop_final_dec:
+	aesd	$tmpin,$dat0
+	aesimc	$tmpin,$tmpin
+	vld1.32	{$dat0},[$key_],#16		// load key schedule...
+	subs	$rounds,$rounds,#2
+	aesd	$tmpin,$dat1
+	aesimc	$tmpin,$tmpin
+	vld1.32	{$dat1},[$key_],#16		// load key schedule...
+	b.gt	.Loop_final_dec
+
+	aesd	$tmpin,$dat0
+	aesimc	$tmpin,$tmpin
+	vld1.32	{$dat0},[$key_]
+	aesd	$tmpin,$dat1
+	veor	$tmpin,$tmpin,$dat0
+	veor	$tmpin,$tmpin,$iv0
+	vst1.8	{$tmpin},[$out]
+
+.Lxts_dec_abort:
+	ldp	$tailcnt,$midnumx,[sp,#48]
+	ldp	$ivd10,$ivd20,[sp,#32]
+	ldp	$ivd30,$ivd40,[sp,#16]
+	ldp	$constnumx,$tmpinp,[sp],#64
+
+.Lxts_dec_final_abort:
+	ret
+.size	${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
+___
+}
+}}}
 $code.=<<___;
 #endif
 ___
author	XiaokangQian <xiaokang.qian@arm.com>	2020-03-13 03:27:34 +0000
committer	Tomas Mraz <tmraz@fedoraproject.org>	2020-11-12 11:09:22 +0100
commit	9ce8e0d17e608de4f85f7543c52b146e3c6a2291 (patch)
tree	6b7472519cee7aa202123ed90c9b982be41457a5 /crypto/aes
parent	c87a7f31a3db97376d764583ad5ee4a76db2cbef (diff)
download	openssl-new-9ce8e0d17e608de4f85f7543c52b146e3c6a2291.tar.gz