summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Niels Möller <nisse@lysator.liu.se> 2013-02-12 15:57:37 +0100
committer: Niels Möller <nisse@lysator.liu.se> 2013-02-12 15:57:37 +0100
commit: 39c037437d8807e2660baabe766fa391fa762a52 (patch)
tree: ed84e1ad38ab7ec340ea9495bccd2fa3d7d3e95e
parent: 610677e4e008365bd6be1d0149c2f0470debf769 (diff)
download: nettle-39c037437d8807e2660baabe766fa391fa762a52.tar.gz
armv7: Optimized aligned case of memxor, using 3-way unrolling.
-rw-r--r--ChangeLog5
-rw-r--r--armv7/memxor.asm63
2 files changed, 45 insertions, 23 deletions
diff --git a/ChangeLog b/ChangeLog
index 90f03b7d..e1a0d6f5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2013-02-12 Niels Möller <nisse@lysator.liu.se>
+
+ * armv7/memxor.asm (memxor): Optimized aligned case, using 3-way
+ unrolling.
+
2013-02-06 Niels Möller <nisse@lysator.liu.se>
* armv7/memxor.asm (memxor, memxor3): Optimized aligned case, now
diff --git a/armv7/memxor.asm b/armv7/memxor.asm
index 52d4bf46..94b8f532 100644
--- a/armv7/memxor.asm
+++ b/armv7/memxor.asm
@@ -18,6 +18,12 @@ C along with the nettle library; see the file COPYING.LIB. If not, write to
C the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
C MA 02111-1301, USA.
+C Possible speedups:
+C
+C The ldm instruction can load two registers per cycle,
+C if the address is two-word aligned. Or three registers in two
+C cycles, regardless of alignment.
+
C Register usage:
define(<DST>, <r0>)
@@ -131,38 +137,49 @@ PROLOGUE(memxor)
b .Lmemxor_bytes
.Lmemxor_same:
- tst N, #4
- it ne
- subne N, #4
- bne .Lmemxor_same_loop
-
- ldr r3, [SRC], #+4
- ldr r4, [DST]
- eor r3, r4
- str r3, [DST], #+4
-
subs N, #8
bcc .Lmemxor_same_end
.Lmemxor_same_loop:
- C 6 cycles per iteration, 0.75 cycles/byte
- ldr r4, [SRC, #+4]
- ldr r3, [SRC], #+8
- ldr r6, [DST, #+4]
- ldr r5, [DST]
-
- eor r4, r6
- eor r3, r5
- subs N, #8
-
- str r4, [DST, #+4]
- str r3, [DST], #+8
+ C 8 cycles per iteration, 0.67 cycles/byte
+ ldmia SRC!, {r3, r4, r5}
+ ldmia DST, {r6, r7, r12}
+ subs N, #12
+ eor r3, r6
+ eor r4, r7
+ eor r5, r12
+ stmia DST!, {r3, r4, r5}
bcs .Lmemxor_same_loop
.Lmemxor_same_end:
- adds N, #8
+ C We have 0-11 bytes left to do, and N holds the number of remaining bytes minus 12.
+ adds N, #4
+ bcc .Lmemxor_same_lt_8
+ C Do 8 bytes more, leftover is in N
+ ldmia SRC!, {r3, r4}
+ ldmia DST, {r6, r7}
+ eor r3, r6
+ eor r4, r7
+ stmia DST!, {r3, r4}
+ beq .Lmemxor_done
+ b .Lmemxor_bytes
+
+.Lmemxor_same_lt_8:
+ adds N, #4
+ bcc .Lmemxor_same_lt_4
+
+ ldr r3, [SRC], #+4
+ ldr r4, [DST]
+ eor r3, r4
+ str r3, [DST], #+4
beq .Lmemxor_done
b .Lmemxor_bytes
+
+.Lmemxor_same_lt_4:
+ adds N, #4
+ beq .Lmemxor_done
+ b .Lmemxor_bytes
+
EPILOGUE(memxor)
define(<DST>, <r0>)