diff options
author | Niels Möller <nisse@lysator.liu.se> | 2013-02-19 13:09:55 +0100 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2013-02-19 13:09:55 +0100 |
commit | 993ae2d6c4653e3ffb836ff18d2b65316446372f (patch) | |
tree | 6fa01178f68b793f167fadc6d9876eda165c2a2b /armv7 | |
parent | 39c037437d8807e2660baabe766fa391fa762a52 (diff) | |
download | nettle-993ae2d6c4653e3ffb836ff18d2b65316446372f.tar.gz |
Optimized ARM memxor.
Diffstat (limited to 'armv7')
-rw-r--r-- | armv7/memxor.asm | 106 |
1 file changed, 72 insertions(+), 34 deletions(-)
diff --git a/armv7/memxor.asm b/armv7/memxor.asm index 94b8f532..929fffcd 100644 --- a/armv7/memxor.asm +++ b/armv7/memxor.asm @@ -40,12 +40,13 @@ define(<TNC>, <r7>) .arm C memxor(uint8_t *dst, const uint8_t *src, size_t n) - .align 2 + .align 4 PROLOGUE(memxor) cmp N, #0 beq .Lmemxor_ret - push {r4, r5, r6, r7} + C FIXME: Delay push until we know how many registers we need. + push {r4,r5,r6,r7,r8,r10,r11,r14} C lr is the link register cmp N, #7 bcs .Lmemxor_large @@ -60,7 +61,7 @@ PROLOGUE(memxor) bne .Lmemxor_bytes .Lmemxor_done: - pop {r4,r5,r6,r7} + pop {r4,r5,r6,r7,r8,r10,r11,r14} .Lmemxor_ret: bx lr @@ -140,16 +141,42 @@ PROLOGUE(memxor) subs N, #8 bcc .Lmemxor_same_end + ldmia SRC!, {r3, r4, r5} + C Keep address for loads in r14 + mov r14, DST + ldmia r14!, {r6, r7, r8} + subs N, #12 + eor r10, r3, r6 + eor r11, r4, r7 + eor r12, r5, r8 + bcc .Lmemxor_same_final_store + subs N, #12 + ldmia r14!, {r6, r7, r8} + bcc .Lmemxor_same_wind_down + + C 7 cycles per iteration, 0.58 cycles/byte + C Loopmixer could perhaps get it down to 6 cycles. .Lmemxor_same_loop: - C 8 cycles per iteration, 0.67 cycles/byte + C r10-r12 contains values to be stored at DST + C r6-r8 contains values read from r14, in advance ldmia SRC!, {r3, r4, r5} - ldmia DST, {r6, r7, r12} subs N, #12 - eor r3, r6 - eor r4, r7 - eor r5, r12 - stmia DST!, {r3, r4, r5} + stmia DST!, {r10, r11, r12} + eor r10, r3, r6 + eor r11, r4, r7 + eor r12, r5, r8 + ldmia r14!, {r6, r7, r8} bcs .Lmemxor_same_loop + +.Lmemxor_same_wind_down: + C Wind down code + ldmia SRC!, {r3, r4, r5} + stmia DST!, {r10, r11, r12} + eor r10, r3, r6 + eor r11, r4, r7 + eor r12, r5, r8 +.Lmemxor_same_final_store: + stmia DST!, {r10, r11, r12} .Lmemxor_same_end: C We have 0-11 bytes left to do, and N holds number of bytes -12. @@ -312,40 +339,51 @@ PROLOGUE(memxor3) bne .Lmemxor3_au ; C a, b and dst all have the same alignment. 
- sub AP, #4 - sub BP, #4 - sub DST, #4 - tst N, #4 - it ne - subne N, #4 - bne .Lmemxor3_aligned_word_loop - - ldr r4, [AP], #-4 - ldr r5, [BP], #-4 - eor r4, r5 - str r4, [DST], #-4 subs N, #8 bcc .Lmemxor3_aligned_word_end + + C This loop runs at 7 cycles per iteration, but it seems to + C have a strange alignment requirement. For this speed, the + C loop started at offset 0x2ac in the object file, and all + C other offsets made it slower. .Lmemxor3_aligned_word_loop: - ldr r4, [AP, #-4] - ldr r5, [AP], #-8 - ldr r6, [BP, #-4] - ldr r7, [BP], #-8 + ldmdb AP!, {r4,r5,r6} + ldmdb BP!, {r7,r8,r10} + subs N, #12 + eor r4, r7 + eor r5, r8 + eor r6, r10 + stmdb DST!, {r4, r5,r6} + bcs .Lmemxor3_aligned_word_loop +.Lmemxor3_aligned_word_end: + C We have 0-11 bytes left to do, and N holds number of bytes -12. + adds N, #4 + bcc .Lmemxor3_aligned_lt_8 + C Do 8 bytes more, leftover is in N + ldmdb AP!, {r4, r5} + ldmdb BP!, {r6, r7} eor r4, r6 eor r5, r7 - subs N, #8 - str r4, [DST, #-4] - str r5, [DST], #-8 + stmdb DST!, {r4,r5} + beq .Lmemxor3_done + b .Lmemxor3_bytes - bcs .Lmemxor3_aligned_word_loop -.Lmemxor3_aligned_word_end: - adds N, #8 +.Lmemxor3_aligned_lt_8: + adds N, #4 + bcc .Lmemxor3_aligned_lt_4 + + ldr r4, [AP,#-4]! + ldr r5, [BP,#-4]! + eor r4, r5 + str r4, [DST,#-4]! + beq .Lmemxor3_done + b .Lmemxor3_bytes + +.Lmemxor3_aligned_lt_4: + adds N, #4 beq .Lmemxor3_done - add AP, #4 - add BP, #4 - add DST, #4 b .Lmemxor3_bytes .Lmemxor3_uu: |