-rw-r--r--   ChangeLog                         8
-rw-r--r--   arm/aes-decrypt-internal.asm     85
-rw-r--r--   arm/aes-encrypt-internal.asm     87
-rw-r--r--   arm/v6/aes-decrypt-internal.asm  54
-rw-r--r--   arm/v6/aes-encrypt-internal.asm  54
5 files changed, 174 insertions, 114 deletions
diff --git a/ChangeLog b/ChangeLog
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2013-05-22  Niels Möller  <nisse@lysator.liu.se>
+
+	* arm/v6/aes-encrypt-internal.asm: Adapted to new interface.
+	Unfortunately, 4% slowdown on Cortex-A9, for unknown reason.
+	* arm/v6/aes-decrypt-internal.asm: Likewise.
+	* arm/aes-encrypt-internal.asm: Adapted to new interface.
+	* arm/aes-decrypt-internal.asm: Likewise.
+
 2013-05-21  Niels Möller  <nisse@lysator.liu.se>
 
 	* sparc32/aes-encrypt-internal.asm: Adapted to new interface.
diff --git a/arm/aes-decrypt-internal.asm b/arm/aes-decrypt-internal.asm
index 37abf1ec..94717872 100644
--- a/arm/aes-decrypt-internal.asm
+++ b/arm/aes-decrypt-internal.asm
@@ -19,26 +19,32 @@ C MA 02111-1301, USA.
 
 include_src(<arm/aes.m4>)
 
-C	define(<CTX>, <r0>)
-define(<TABLE>, <r1>)
-define(<LENGTH>, <r2>)
-define(<DST>, <r3>)
-define(<SRC>, <r12>)
-
+define(<PARAM_ROUNDS>, <r0>)
+define(<PARAM_KEYS>, <r1>)
+define(<TABLE>, <r2>)
+define(<PARAM_LENGTH>, <r3>)
+C On stack: DST, SRC
+
 define(<W0>, <r4>)
 define(<W1>, <r5>)
 define(<W2>, <r6>)
 define(<W3>, <r7>)
 define(<T0>, <r8>)
-define(<KEY>, <r10>)
-define(<ROUND>, <r11>)
+define(<COUNT>, <r10>)
+define(<KEY>, <r11>)
 
-define(<X0>, <r2>)	C Overlaps LENGTH, SRC, DST
+define(<MASK>, <r0>)	C Overlaps inputs, except TABLE
+define(<X0>, <r1>)
 define(<X1>, <r3>)
 define(<X2>, <r12>)
 define(<X3>, <r14>)	C lr
 
-define(<MASK>, <r0>)	C Overlaps CTX input
-define(<CTX>, <[sp]>)
+define(<FRAME_ROUNDS>, <[sp]>)
+define(<FRAME_KEYS>, <[sp, #+4]>)
+define(<FRAME_LENGTH>, <[sp, #+8]>)
+C 8 saved registers
+define(<FRAME_DST>, <[sp, #+44]>)
+define(<FRAME_SRC>, <[sp, #+48]>)
 
 define(<AES_DECRYPT_ROUND>, <
@@ -103,29 +109,30 @@ define(<AES_DECRYPT_ROUND>, <
 
 	.file "aes-decrypt-internal.asm"
 
-	C _aes_decrypt(struct aes_context *ctx,
+	C _aes_decrypt(unsigned rounds, const uint32_t *keys,
 	C	       const struct aes_table *T,
 	C	       size_t length, uint8_t *dst,
 	C	       uint8_t *src)
 	.text
 	ALIGN(4)
 PROLOGUE(_nettle_aes_decrypt)
-	teq	LENGTH, #0
+	teq	PARAM_LENGTH, #0
 	beq	.Lend
-	ldr	SRC, [sp]
 
-	push	{r0, r4,r5,r6,r7,r8,r10,r11,lr}
+	push	{r0,r1,r3, r4,r5,r6,r7,r8,r10,r11,lr}
 
 	mov	MASK, #0x3fc
 	ALIGN(16)
 .Lblock_loop:
-	ldr	KEY, CTX
-	ldr	ROUND, [KEY, #+AES_NROUNDS]
-	AES_LOAD(SRC,KEY,W0)
-	AES_LOAD(SRC,KEY,W1)
-	AES_LOAD(SRC,KEY,W2)
-	AES_LOAD(SRC,KEY,W3)
-
-	push	{LENGTH, DST, SRC}
+	ldr	X0, FRAME_SRC	C Use X0 as SRC pointer
+	ldm	sp, {COUNT, KEY}
+
+	AES_LOAD(X0,KEY,W0)
+	AES_LOAD(X0,KEY,W1)
+	AES_LOAD(X0,KEY,W2)
+	AES_LOAD(X0,KEY,W3)
+
+	str	X0, FRAME_SRC
+
 	add	TABLE, TABLE, #AES_TABLE0
 	b	.Lentry
@@ -135,31 +142,35 @@ PROLOGUE(_nettle_aes_decrypt)
 	AES_DECRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
 
 .Lentry:
-	subs	ROUND, ROUND,#2
+	subs	COUNT, COUNT,#2
 
 	C Transform W -> X
 	AES_DECRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
 
 	bne	.Lround_loop
 
-	lsr	ROUND, MASK, #2	C Put the needed mask in the unused ROUND register
+	lsr	COUNT, MASK, #2	C Put the needed mask in the unused COUNT register
 	sub	TABLE, TABLE, #AES_TABLE0
 	C Final round
-	AES_FINAL_ROUND_V5(X0, X3, X2, X1, KEY, W0, ROUND)
-	AES_FINAL_ROUND_V5(X1, X0, X3, X2, KEY, W1, ROUND)
-	AES_FINAL_ROUND_V5(X2, X1, X0, X3, KEY, W2, ROUND)
-	AES_FINAL_ROUND_V5(X3, X2, X1, X0, KEY, W3, ROUND)
+	AES_FINAL_ROUND_V5(X0, X3, X2, X1, KEY, W0, COUNT)
+	AES_FINAL_ROUND_V5(X1, X0, X3, X2, KEY, W1, COUNT)
+	AES_FINAL_ROUND_V5(X2, X1, X0, X3, KEY, W2, COUNT)
+	AES_FINAL_ROUND_V5(X3, X2, X1, X0, KEY, W3, COUNT)
 
-	pop	{LENGTH, DST, SRC}
-
-	AES_STORE(DST,W0)
-	AES_STORE(DST,W1)
-	AES_STORE(DST,W2)
-	AES_STORE(DST,W3)
+	ldr	X0, FRAME_DST
+	ldr	X1, FRAME_LENGTH
+
+	AES_STORE(X0,W0)
+	AES_STORE(X0,W1)
+	AES_STORE(X0,W2)
+	AES_STORE(X0,W3)
+
+	subs	X1, X1, #16
+	str	X0, FRAME_DST
+	str	X1, FRAME_LENGTH
 
-	subs	LENGTH, LENGTH, #16
 	bhi	.Lblock_loop
 
-	add	sp, sp, #4	C Drop saved r0
+	add	sp, sp, #12	C Drop saved r0, r1, r3
 	pop	{r4,r5,r6,r7,r8,r10,r11,pc}
 
 .Lend:
diff --git a/arm/aes-encrypt-internal.asm b/arm/aes-encrypt-internal.asm
index eb2f1489..0d396185 100644
--- a/arm/aes-encrypt-internal.asm
+++ b/arm/aes-encrypt-internal.asm
@@ -19,32 +19,38 @@ C MA 02111-1301, USA.
 
 include_src(<arm/aes.m4>)
 
-C Benchmarked at at 725, 930, 990 cycles/block on cortex A9,
+C Benchmarked at at 725, 815, 990 cycles/block on cortex A9,
 C for 128, 192 and 256 bit key sizes.
 
 C Possible improvements: More efficient load and store with
 C aligned accesses. Better scheduling.
 
-C	define(<CTX>, <r0>)
-define(<TABLE>, <r1>)
-define(<LENGTH>, <r2>)
-define(<DST>, <r3>)
-define(<SRC>, <r12>)
-
+define(<PARAM_ROUNDS>, <r0>)
+define(<PARAM_KEYS>, <r1>)
+define(<TABLE>, <r2>)
+define(<PARAM_LENGTH>, <r3>)
+C On stack: DST, SRC
+
 define(<W0>, <r4>)
 define(<W1>, <r5>)
 define(<W2>, <r6>)
 define(<W3>, <r7>)
 define(<T0>, <r8>)
-define(<KEY>, <r10>)
-define(<ROUND>, <r11>)
+define(<COUNT>, <r10>)
+define(<KEY>, <r11>)
 
-define(<X0>, <r2>)	C Overlaps LENGTH, SRC, DST
+define(<MASK>, <r0>)	C Overlaps inputs, except TABLE
+define(<X0>, <r1>)
 define(<X1>, <r3>)
 define(<X2>, <r12>)
 define(<X3>, <r14>)	C lr
 
-define(<MASK>, <r0>)	C Overlaps CTX input
-define(<CTX>, <[sp]>)
+define(<FRAME_ROUNDS>, <[sp]>)
+define(<FRAME_KEYS>, <[sp, #+4]>)
+define(<FRAME_LENGTH>, <[sp, #+8]>)
+C 8 saved registers
+define(<FRAME_DST>, <[sp, #+44]>)
+define(<FRAME_SRC>, <[sp, #+48]>)
 
 C AES_ENCRYPT_ROUND(x0,x1,x2,x3,w0,w1,w2,w3,key)
@@ -112,29 +118,30 @@ define(<AES_ENCRYPT_ROUND>, <
 
 	.file "aes-encrypt-internal.asm"
 
-	C _aes_encrypt(struct aes_context *ctx,
+	C _aes_encrypt(unsigned rounds, const uint32_t *keys,
 	C	       const struct aes_table *T,
 	C	       size_t length, uint8_t *dst,
 	C	       uint8_t *src)
 	.text
 	ALIGN(4)
 PROLOGUE(_nettle_aes_encrypt)
-	teq	LENGTH, #0
+	teq	PARAM_LENGTH, #0
 	beq	.Lend
-	ldr	SRC, [sp]
 
-	push	{r0, r4,r5,r6,r7,r8,r10,r11,lr}
+	push	{r0,r1,r3, r4,r5,r6,r7,r8,r10,r11,lr}
 
 	mov	MASK, #0x3fc
 	ALIGN(16)
 .Lblock_loop:
-	ldr	KEY, CTX
-	ldr	ROUND, [KEY, #+AES_NROUNDS]
-	AES_LOAD(SRC,KEY,W0)
-	AES_LOAD(SRC,KEY,W1)
-	AES_LOAD(SRC,KEY,W2)
-	AES_LOAD(SRC,KEY,W3)
-
-	push	{LENGTH, DST, SRC}
+	ldr	X0, FRAME_SRC	C Use X0 as SRC pointer
+	ldm	sp, {COUNT, KEY}
+
+	AES_LOAD(X0,KEY,W0)
+	AES_LOAD(X0,KEY,W1)
+	AES_LOAD(X0,KEY,W2)
+	AES_LOAD(X0,KEY,W3)
+
+	str	X0, FRAME_SRC
+
 	add	TABLE, TABLE, #AES_TABLE0
 	b	.Lentry
@@ -144,31 +151,35 @@ PROLOGUE(_nettle_aes_encrypt)
 	AES_ENCRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
 
 .Lentry:
-	subs	ROUND, ROUND,#2
+	subs	COUNT, COUNT,#2
 
 	C Transform W -> X
 	AES_ENCRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
 
 	bne	.Lround_loop
 
-	lsr	ROUND, MASK, #2	C Put the needed mask in the unused ROUND register
+	lsr	COUNT, MASK, #2	C Put the needed mask in the unused COUNT register
 	sub	TABLE, TABLE, #AES_TABLE0
 	C Final round
-	AES_FINAL_ROUND_V5(X0, X1, X2, X3, KEY, W0, ROUND)
-	AES_FINAL_ROUND_V5(X1, X2, X3, X0, KEY, W1, ROUND)
-	AES_FINAL_ROUND_V5(X2, X3, X0, X1, KEY, W2, ROUND)
-	AES_FINAL_ROUND_V5(X3, X0, X1, X2, KEY, W3, ROUND)
+	AES_FINAL_ROUND_V5(X0, X1, X2, X3, KEY, W0, COUNT)
+	AES_FINAL_ROUND_V5(X1, X2, X3, X0, KEY, W1, COUNT)
+	AES_FINAL_ROUND_V5(X2, X3, X0, X1, KEY, W2, COUNT)
+	AES_FINAL_ROUND_V5(X3, X0, X1, X2, KEY, W3, COUNT)
 
-	pop	{LENGTH, DST, SRC}
-
-	AES_STORE(DST,W0)
-	AES_STORE(DST,W1)
-	AES_STORE(DST,W2)
-	AES_STORE(DST,W3)
+	ldr	X0, FRAME_DST
+	ldr	X1, FRAME_LENGTH
+
+	AES_STORE(X0,W0)
+	AES_STORE(X0,W1)
+	AES_STORE(X0,W2)
+	AES_STORE(X0,W3)
+
+	subs	X1, X1, #16
+	str	X0, FRAME_DST
+	str	X1, FRAME_LENGTH
 
-	subs	LENGTH, LENGTH, #16
 	bhi	.Lblock_loop
 
-	add	sp, sp, #4	C Drop saved r0
+	add	sp, sp, #12	C Drop saved r0, r1, r3
 	pop	{r4,r5,r6,r7,r8,r10,r11,pc}
 
 .Lend:
diff --git a/arm/v6/aes-decrypt-internal.asm b/arm/v6/aes-decrypt-internal.asm
index f550506d..f9f0b7ad 100644
--- a/arm/v6/aes-decrypt-internal.asm
+++ b/arm/v6/aes-decrypt-internal.asm
@@ -19,25 +19,33 @@ C MA 02111-1301, USA.
 
 include_src(<arm/aes.m4>)
 
-define(<CTX>, <r0>)
-define(<TABLE>, <r1>)
-define(<LENGTH>, <r2>)
-define(<DST>, <r3>)
-define(<SRC>, <r12>)
+define(<PARAM_ROUNDS>, <r0>)
+define(<PARAM_KEYS>, <r1>)
+define(<TABLE>, <r2>)
+define(<LENGTH>, <r3>)
+C On stack: DST, SRC
 
 define(<W0>, <r4>)
 define(<W1>, <r5>)
 define(<W2>, <r6>)
 define(<W3>, <r7>)
 define(<T0>, <r8>)
-define(<KEY>, <r10>)
-define(<ROUND>, <r11>)
+define(<COUNT>, <r10>)
+define(<KEY>, <r11>)
 
-define(<X0>, <r2>)	C Overlaps LENGTH, SRC, DST
-define(<X1>, <r3>)
+define(<X0>, <r0>)	C Overlaps PARAM_ROUNDS and PARAM_KEYS
+define(<X1>, <r1>)
 define(<X2>, <r12>)
 define(<X3>, <r14>)	C lr
 
+define(<FRAME_ROUNDS>, <[sp]>)
+define(<FRAME_KEYS>, <[sp, #+4]>)
+C 8 saved registers
+define(<FRAME_DST>, <[sp, #+40]>)
+define(<FRAME_SRC>, <[sp, #+44]>)
+
+define(<SRC>, <%r12>)	C Overlap registers used in inner loop.
+define(<DST>, <COUNT>)
 
 C AES_DECRYPT_ROUND(x0,x1,x2,x3,w0,w1,w2,w3,key)
 define(<AES_DECRYPT_ROUND>, <
@@ -102,7 +110,7 @@ define(<AES_DECRYPT_ROUND>, <
 
 	.file "aes-decrypt-internal.asm"
 
-	C _aes_decrypt(struct aes_context *ctx,
+	C _aes_decrypt(unsigned rounds, const uint32_t *keys,
 	C	       const struct aes_table *T,
 	C	       size_t length, uint8_t *dst,
 	C	       uint8_t *src)
@@ -111,22 +119,23 @@ define(<AES_DECRYPT_ROUND>, <
 PROLOGUE(_nettle_aes_decrypt)
 	teq	LENGTH, #0
 	beq	.Lend
-	ldr	SRC, [sp]
-	push	{r4,r5,r6,r7,r8,r10,r11,lr}
-	nop	C For some mysterious reason, taking out this nop
-	C	slows this function down by 10(!) % on Cortex-A9.
+	ldr	SRC, [sp, #+4]
+
+	push	{r0,r1, r4,r5,r6,r7,r8,r10,r11,lr}
+
 	ALIGN(16)
 .Lblock_loop:
-	mov	KEY, CTX
+	ldm	sp, {COUNT, KEY}
+
+	add	TABLE, TABLE, #AES_TABLE0
+
 	AES_LOAD(SRC,KEY,W0)
 	AES_LOAD(SRC,KEY,W1)
 	AES_LOAD(SRC,KEY,W2)
 	AES_LOAD(SRC,KEY,W3)
 
-	push	{LENGTH, DST, SRC}
-	ldr	ROUND, [CTX, #+AES_NROUNDS]
-	add	TABLE, TABLE, #AES_TABLE0
+	str	SRC, FRAME_SRC
 
 	b	.Lentry
 
 	ALIGN(16)
@@ -135,29 +144,34 @@ PROLOGUE(_nettle_aes_decrypt)
 	AES_DECRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
 
 .Lentry:
-	subs	ROUND, ROUND,#2
+	subs	COUNT, COUNT,#2
 
 	C Transform W -> X
 	AES_DECRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
 
 	bne	.Lround_loop
 
 	sub	TABLE, TABLE, #AES_TABLE0
+
+	C Final round
+	ldr	DST, FRAME_DST
+
 	AES_FINAL_ROUND_V6(X0, X3, X2, X1, KEY, W0)
 	AES_FINAL_ROUND_V6(X1, X0, X3, X2, KEY, W1)
 	AES_FINAL_ROUND_V6(X2, X1, X0, X3, KEY, W2)
 	AES_FINAL_ROUND_V6(X3, X2, X1, X0, KEY, W3)
 
-	pop	{LENGTH, DST, SRC}
+	ldr	SRC, FRAME_SRC
 
 	AES_STORE(DST,W0)
 	AES_STORE(DST,W1)
 	AES_STORE(DST,W2)
 	AES_STORE(DST,W3)
+	str	DST, FRAME_DST
 
 	subs	LENGTH, LENGTH, #16
 	bhi	.Lblock_loop
 
+	add	sp, sp, #8	C Drop saved r0, r1
 	pop	{r4,r5,r6,r7,r8,r10,r11,pc}
 
 .Lend:
diff --git a/arm/v6/aes-encrypt-internal.asm b/arm/v6/aes-encrypt-internal.asm
index 3cf13072..3c817de1 100644
--- a/arm/v6/aes-encrypt-internal.asm
+++ b/arm/v6/aes-encrypt-internal.asm
@@ -19,31 +19,39 @@ C MA 02111-1301, USA.
 
 include_src(<arm/aes.m4>)
 
-C Benchmarked at at 680, 818, 929 cycles/block on cortex A9,
+C Benchmarked at at 706, 870, 963 cycles/block on cortex A9,
 C for 128, 192 and 256 bit key sizes.
 
 C Possible improvements: More efficient load and store with
 C aligned accesses. Better scheduling.
 
-define(<CTX>, <r0>)
-define(<TABLE>, <r1>)
-define(<LENGTH>, <r2>)
-define(<DST>, <r3>)
-define(<SRC>, <r12>)
+define(<PARAM_ROUNDS>, <r0>)
+define(<PARAM_KEYS>, <r1>)
+define(<TABLE>, <r2>)
+define(<LENGTH>, <r3>)
+C On stack: DST, SRC
 
 define(<W0>, <r4>)
 define(<W1>, <r5>)
 define(<W2>, <r6>)
 define(<W3>, <r7>)
 define(<T0>, <r8>)
-define(<KEY>, <r10>)
-define(<ROUND>, <r11>)
+define(<COUNT>, <r10>)
+define(<KEY>, <r11>)
 
-define(<X0>, <r2>)	C Overlaps LENGTH, SRC, DST
-define(<X1>, <r3>)
+define(<X0>, <r0>)	C Overlaps PARAM_ROUNDS and PARAM_KEYS
+define(<X1>, <r1>)
 define(<X2>, <r12>)
 define(<X3>, <r14>)	C lr
 
+define(<FRAME_ROUNDS>, <[sp]>)
+define(<FRAME_KEYS>, <[sp, #+4]>)
+C 8 saved registers
+define(<FRAME_DST>, <[sp, #+40]>)
+define(<FRAME_SRC>, <[sp, #+44]>)
+
+define(<SRC>, <%r12>)	C Overlap registers used in inner loop.
+define(<DST>, <COUNT>)
 
 C 53 instr.
 C It's tempting to use eor with rotation, but that's slower.
@@ -110,7 +118,7 @@ define(<AES_ENCRYPT_ROUND>, <
 
 	.file "aes-encrypt-internal.asm"
 
-	C _aes_encrypt(struct aes_context *ctx,
+	C _aes_encrypt(unsigned rounds, const uint32_t *keys,
 	C	       const struct aes_table *T,
 	C	       size_t length, uint8_t *dst,
 	C	       uint8_t *src)
@@ -119,20 +127,23 @@ define(<AES_ENCRYPT_ROUND>, <
 PROLOGUE(_nettle_aes_encrypt)
 	teq	LENGTH, #0
 	beq	.Lend
-	ldr	SRC, [sp]
-	push	{r4,r5,r6,r7,r8,r10,r11,lr}
+	ldr	SRC, [sp, #+4]
+
+	push	{r0,r1, r4,r5,r6,r7,r8,r10,r11,lr}
+
 	ALIGN(16)
 .Lblock_loop:
-	mov	KEY, CTX
+	ldm	sp, {COUNT, KEY}
+
+	add	TABLE, TABLE, #AES_TABLE0
+
 	AES_LOAD(SRC,KEY,W0)
 	AES_LOAD(SRC,KEY,W1)
 	AES_LOAD(SRC,KEY,W2)
 	AES_LOAD(SRC,KEY,W3)
 
-	push	{LENGTH, DST, SRC}
-	ldr	ROUND, [CTX, #+AES_NROUNDS]
-	add	TABLE, TABLE, #AES_TABLE0
+	str	SRC, FRAME_SRC
 
 	b	.Lentry
 
 	ALIGN(16)
@@ -141,29 +152,34 @@ PROLOGUE(_nettle_aes_encrypt)
 	AES_ENCRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
 
 .Lentry:
-	subs	ROUND, ROUND,#2
+	subs	COUNT, COUNT,#2
 
 	C Transform W -> X
 	AES_ENCRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
 
 	bne	.Lround_loop
 
 	sub	TABLE, TABLE, #AES_TABLE0
+
+	C Final round
+	ldr	DST, FRAME_DST
+
 	AES_FINAL_ROUND_V6(X0, X1, X2, X3, KEY, W0)
 	AES_FINAL_ROUND_V6(X1, X2, X3, X0, KEY, W1)
 	AES_FINAL_ROUND_V6(X2, X3, X0, X1, KEY, W2)
 	AES_FINAL_ROUND_V6(X3, X0, X1, X2, KEY, W3)
 
-	pop	{LENGTH, DST, SRC}
+	ldr	SRC, FRAME_SRC
 
 	AES_STORE(DST,W0)
 	AES_STORE(DST,W1)
 	AES_STORE(DST,W2)
 	AES_STORE(DST,W3)
+	str	DST, FRAME_DST
 
 	subs	LENGTH, LENGTH, #16
 	bhi	.Lblock_loop
 
+	add	sp, sp, #8	C Drop saved r0, r1
 	pop	{r4,r5,r6,r7,r8,r10,r11,pc}
 
 .Lend:
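Note on the new interface: as the updated comments above show, the internal AES routines now take the round count and the expanded key as explicit arguments (r0 = rounds, r1 = keys, r2 = table pointer, r3 = length, with dst and src on the stack) instead of a struct aes_context pointer. Below is a minimal C-level sketch of that prototype as documented in the .asm comments; the context structure and the calling function are illustrative only and not part of this commit:

    #include <stddef.h>
    #include <stdint.h>

    struct aes_table;   /* round lookup tables, defined elsewhere */

    /* Prototype as described in the comments of the .asm files above. */
    void
    _nettle_aes_encrypt(unsigned rounds, const uint32_t *keys,
                        const struct aes_table *T,
                        size_t length, uint8_t *dst,
                        uint8_t *src);

    /* Hypothetical caller, for illustration only: the field names and
       the aes_ctx layout here are not taken from this commit. */
    struct my_aes_ctx {
      unsigned rounds;              /* 10, 12 or 14 */
      uint32_t keys[4 * (14 + 1)];  /* expanded subkeys */
    };

    void
    encrypt_blocks(const struct my_aes_ctx *ctx, const struct aes_table *table,
                   size_t length, uint8_t *dst, uint8_t *src)
    {
      /* length must be a whole number of 16-byte blocks; the assembly
         loops once per block until FRAME_LENGTH reaches zero. */
      _nettle_aes_encrypt(ctx->rounds, ctx->keys, table, length, dst, src);
    }

Since the assembly only reads the rounds argument (which drives the subs COUNT, COUNT, #2 loop) and the subkey array, the same routine serves 128-, 192- and 256-bit key schedules, which is presumably why the context pointer was dropped from the interface.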