summary refs log tree commit diff
path: root/armv7
diff options
context:
space:
mode:
author	Niels Möller <nisse@lysator.liu.se>	2013-02-19 13:09:55 +0100
committer	Niels Möller <nisse@lysator.liu.se>	2013-02-19 13:09:55 +0100
commit	993ae2d6c4653e3ffb836ff18d2b65316446372f (patch)
tree	6fa01178f68b793f167fadc6d9876eda165c2a2b /armv7
parent	39c037437d8807e2660baabe766fa391fa762a52 (diff)
download	nettle-993ae2d6c4653e3ffb836ff18d2b65316446372f.tar.gz
Optimized ARM memxor.
Diffstat (limited to 'armv7')
-rw-r--r--	armv7/memxor.asm	106
1 file changed, 72 insertions, 34 deletions
diff --git a/armv7/memxor.asm b/armv7/memxor.asm
index 94b8f532..929fffcd 100644
--- a/armv7/memxor.asm
+++ b/armv7/memxor.asm
@@ -40,12 +40,13 @@ define(<TNC>, <r7>)
.arm
C memxor(uint8_t *dst, const uint8_t *src, size_t n)
- .align 2
+ .align 4
PROLOGUE(memxor)
cmp N, #0
beq .Lmemxor_ret
- push {r4, r5, r6, r7}
+ C FIXME: Delay push until we know how many registers we need.
+ push {r4,r5,r6,r7,r8,r10,r11,r14} C lr is the link register
cmp N, #7
bcs .Lmemxor_large
@@ -60,7 +61,7 @@ PROLOGUE(memxor)
bne .Lmemxor_bytes
.Lmemxor_done:
- pop {r4,r5,r6,r7}
+ pop {r4,r5,r6,r7,r8,r10,r11,r14}
.Lmemxor_ret:
bx lr
@@ -140,16 +141,42 @@ PROLOGUE(memxor)
subs N, #8
bcc .Lmemxor_same_end
+ ldmia SRC!, {r3, r4, r5}
+ C Keep address for loads in r14
+ mov r14, DST
+ ldmia r14!, {r6, r7, r8}
+ subs N, #12
+ eor r10, r3, r6
+ eor r11, r4, r7
+ eor r12, r5, r8
+ bcc .Lmemxor_same_final_store
+ subs N, #12
+ ldmia r14!, {r6, r7, r8}
+ bcc .Lmemxor_same_wind_down
+
+ C 7 cycles per iteration, 0.58 cycles/byte
+ C Loopmixer could perhaps get it down to 6 cycles.
.Lmemxor_same_loop:
- C 8 cycles per iteration, 0.67 cycles/byte
+ C r10-r12 contains values to be stored at DST
+ C r6-r8 contains values read from r14, in advance
ldmia SRC!, {r3, r4, r5}
- ldmia DST, {r6, r7, r12}
subs N, #12
- eor r3, r6
- eor r4, r7
- eor r5, r12
- stmia DST!, {r3, r4, r5}
+ stmia DST!, {r10, r11, r12}
+ eor r10, r3, r6
+ eor r11, r4, r7
+ eor r12, r5, r8
+ ldmia r14!, {r6, r7, r8}
bcs .Lmemxor_same_loop
+
+.Lmemxor_same_wind_down:
+ C Wind down code
+ ldmia SRC!, {r3, r4, r5}
+ stmia DST!, {r10, r11, r12}
+ eor r10, r3, r6
+ eor r11, r4, r7
+ eor r12, r5, r8
+.Lmemxor_same_final_store:
+ stmia DST!, {r10, r11, r12}
.Lmemxor_same_end:
C We have 0-11 bytes left to do, and N holds number of bytes -12.
@@ -312,40 +339,51 @@ PROLOGUE(memxor3)
bne .Lmemxor3_au ;
C a, b and dst all have the same alignment.
- sub AP, #4
- sub BP, #4
- sub DST, #4
- tst N, #4
- it ne
- subne N, #4
- bne .Lmemxor3_aligned_word_loop
-
- ldr r4, [AP], #-4
- ldr r5, [BP], #-4
- eor r4, r5
- str r4, [DST], #-4
subs N, #8
bcc .Lmemxor3_aligned_word_end
+
+ C This loop runs at 7 cycles per iteration, but it seems to
+ C have a strange alignment requirement. For this speed, the
+ C loop started at offset 0x2ac in the object file, and all
+ C other offsets made it slower.
.Lmemxor3_aligned_word_loop:
- ldr r4, [AP, #-4]
- ldr r5, [AP], #-8
- ldr r6, [BP, #-4]
- ldr r7, [BP], #-8
+ ldmdb AP!, {r4,r5,r6}
+ ldmdb BP!, {r7,r8,r10}
+ subs N, #12
+ eor r4, r7
+ eor r5, r8
+ eor r6, r10
+ stmdb DST!, {r4, r5,r6}
+ bcs .Lmemxor3_aligned_word_loop
+.Lmemxor3_aligned_word_end:
+ C We have 0-11 bytes left to do, and N holds number of bytes -12.
+ adds N, #4
+ bcc .Lmemxor3_aligned_lt_8
+ C Do 8 bytes more, leftover is in N
+ ldmdb AP!, {r4, r5}
+ ldmdb BP!, {r6, r7}
eor r4, r6
eor r5, r7
- subs N, #8
- str r4, [DST, #-4]
- str r5, [DST], #-8
+ stmdb DST!, {r4,r5}
+ beq .Lmemxor3_done
+ b .Lmemxor3_bytes
- bcs .Lmemxor3_aligned_word_loop
-.Lmemxor3_aligned_word_end:
- adds N, #8
+.Lmemxor3_aligned_lt_8:
+ adds N, #4
+ bcc .Lmemxor3_aligned_lt_4
+
+ ldr r4, [AP,#-4]!
+ ldr r5, [BP,#-4]!
+ eor r4, r5
+ str r4, [DST,#-4]!
+ beq .Lmemxor3_done
+ b .Lmemxor3_bytes
+
+.Lmemxor3_aligned_lt_4:
+ adds N, #4
beq .Lmemxor3_done
- add AP, #4
- add BP, #4
- add DST, #4
b .Lmemxor3_bytes
.Lmemxor3_uu: