summary refs log tree commit diff
path: root/armv7
diff options
context:
space:
mode:
author	Niels Möller <nisse@lysator.liu.se>	2013-02-19 13:09:55 +0100
committer	Niels Möller <nisse@lysator.liu.se>	2013-02-19 13:09:55 +0100
commit	993ae2d6c4653e3ffb836ff18d2b65316446372f (patch)
tree	6fa01178f68b793f167fadc6d9876eda165c2a2b /armv7
parent	39c037437d8807e2660baabe766fa391fa762a52 (diff)
download	nettle-993ae2d6c4653e3ffb836ff18d2b65316446372f.tar.gz
Optimized ARM memxor.
Diffstat (limited to 'armv7')
-rw-r--r--	armv7/memxor.asm	106
1 file changed, 72 insertions, 34 deletions
diff --git a/armv7/memxor.asm b/armv7/memxor.asm
index 94b8f532..929fffcd 100644
--- a/armv7/memxor.asm
+++ b/armv7/memxor.asm
@@ -40,12 +40,13 @@ define(<TNC>, <r7>)
.arm
C memxor(uint8_t *dst, const uint8_t *src, size_t n)
- .align 2
+ .align 4
PROLOGUE(memxor)
cmp N, #0
beq .Lmemxor_ret
- push {r4, r5, r6, r7}
+ C FIXME: Delay push until we know how many registers we need.
+ push {r4,r5,r6,r7,r8,r10,r11,r14} C lr is the link register
cmp N, #7
bcs .Lmemxor_large
@@ -60,7 +61,7 @@ PROLOGUE(memxor)
bne .Lmemxor_bytes
.Lmemxor_done:
- pop {r4,r5,r6,r7}
+ pop {r4,r5,r6,r7,r8,r10,r11,r14}
.Lmemxor_ret:
bx lr
@@ -140,16 +141,42 @@ PROLOGUE(memxor)
subs N, #8
bcc .Lmemxor_same_end
+ ldmia SRC!, {r3, r4, r5}
+ C Keep address for loads in r14
+ mov r14, DST
+ ldmia r14!, {r6, r7, r8}
+ subs N, #12
+ eor r10, r3, r6
+ eor r11, r4, r7
+ eor r12, r5, r8
+ bcc .Lmemxor_same_final_store
+ subs N, #12
+ ldmia r14!, {r6, r7, r8}
+ bcc .Lmemxor_same_wind_down
+
+ C 7 cycles per iteration, 0.58 cycles/byte
+ C Loopmixer could perhaps get it down to 6 cycles.
.Lmemxor_same_loop:
- C 8 cycles per iteration, 0.67 cycles/byte
+ C r10-r12 contains values to be stored at DST
+ C r6-r8 contains values read from r14, in advance
ldmia SRC!, {r3, r4, r5}
- ldmia DST, {r6, r7, r12}
subs N, #12
- eor r3, r6
- eor r4, r7
- eor r5, r12
- stmia DST!, {r3, r4, r5}
+ stmia DST!, {r10, r11, r12}
+ eor r10, r3, r6
+ eor r11, r4, r7
+ eor r12, r5, r8
+ ldmia r14!, {r6, r7, r8}
bcs .Lmemxor_same_loop
+
+.Lmemxor_same_wind_down:
+ C Wind down code
+ ldmia SRC!, {r3, r4, r5}
+ stmia DST!, {r10, r11, r12}
+ eor r10, r3, r6
+ eor r11, r4, r7
+ eor r12, r5, r8
+.Lmemxor_same_final_store:
+ stmia DST!, {r10, r11, r12}
.Lmemxor_same_end:
C We have 0-11 bytes left to do, and N holds number of bytes -12.
@@ -312,40 +339,51 @@ PROLOGUE(memxor3)
bne .Lmemxor3_au ;
C a, b and dst all have the same alignment.
- sub AP, #4
- sub BP, #4
- sub DST, #4
- tst N, #4
- it ne
- subne N, #4
- bne .Lmemxor3_aligned_word_loop
-
- ldr r4, [AP], #-4
- ldr r5, [BP], #-4
- eor r4, r5
- str r4, [DST], #-4
subs N, #8
bcc .Lmemxor3_aligned_word_end
+
+ C This loop runs at 7 cycles per iteration, but it seems to
+ C have a strange alignment requirement. For this speed, the
+ C loop started at offset 0x2ac in the object file, and all
+ C other offsets made it slower.
.Lmemxor3_aligned_word_loop:
- ldr r4, [AP, #-4]
- ldr r5, [AP], #-8
- ldr r6, [BP, #-4]
- ldr r7, [BP], #-8
+ ldmdb AP!, {r4,r5,r6}
+ ldmdb BP!, {r7,r8,r10}
+ subs N, #12
+ eor r4, r7
+ eor r5, r8
+ eor r6, r10
+ stmdb DST!, {r4, r5,r6}
+ bcs .Lmemxor3_aligned_word_loop
+.Lmemxor3_aligned_word_end:
+ C We have 0-11 bytes left to do, and N holds number of bytes -12.
+ adds N, #4
+ bcc .Lmemxor3_aligned_lt_8
+ C Do 8 bytes more, leftover is in N
+ ldmdb AP!, {r4, r5}
+ ldmdb BP!, {r6, r7}
eor r4, r6
eor r5, r7
- subs N, #8
- str r4, [DST, #-4]
- str r5, [DST], #-8
+ stmdb DST!, {r4,r5}
+ beq .Lmemxor3_done
+ b .Lmemxor3_bytes
- bcs .Lmemxor3_aligned_word_loop
-.Lmemxor3_aligned_word_end:
- adds N, #8
+.Lmemxor3_aligned_lt_8:
+ adds N, #4
+ bcc .Lmemxor3_aligned_lt_4
+
+ ldr r4, [AP,#-4]!
+ ldr r5, [BP,#-4]!
+ eor r4, r5
+ str r4, [DST,#-4]!
+ beq .Lmemxor3_done
+ b .Lmemxor3_bytes
+
+.Lmemxor3_aligned_lt_4:
+ adds N, #4
beq .Lmemxor3_done
- add AP, #4
- add BP, #4
- add DST, #4
b .Lmemxor3_bytes
.Lmemxor3_uu: