-rw-r--r--  ChangeLog                          8
-rw-r--r--  arm/aes-decrypt-internal.asm      85
-rw-r--r--  arm/aes-encrypt-internal.asm      87
-rw-r--r--  arm/v6/aes-decrypt-internal.asm   54
-rw-r--r--  arm/v6/aes-encrypt-internal.asm   54
5 files changed, 174 insertions, 114 deletions
diff --git a/ChangeLog b/ChangeLog
index 7d3a3454..399eeee9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2013-05-22 Niels Möller <nisse@lysator.liu.se>
+
+ * arm/v6/aes-encrypt-internal.asm: Adapted to new interface.
+ Unfortunately, 4% slowdown on Cortex-A9, for unknown reason.
+ * arm/v6/aes-decrypt-internal.asm: Likewise.
+ * arm/aes-encrypt-internal.asm: Adapted to new interface.
+ * arm/aes-decrypt-internal.asm: Likewise.
+
2013-05-21 Niels Möller <nisse@lysator.liu.se>
* sparc32/aes-encrypt-internal.asm: Adapted to new interface.
diff --git a/arm/aes-decrypt-internal.asm b/arm/aes-decrypt-internal.asm
index 37abf1ec..94717872 100644
--- a/arm/aes-decrypt-internal.asm
+++ b/arm/aes-decrypt-internal.asm
@@ -19,26 +19,32 @@ C MA 02111-1301, USA.
include_src(<arm/aes.m4>)
-C define(<CTX>, <r0>)
-define(<TABLE>, <r1>)
-define(<LENGTH>, <r2>)
-define(<DST>, <r3>)
-define(<SRC>, <r12>)
-
+define(<PARAM_ROUNDS>, <r0>)
+define(<PARAM_KEYS>, <r1>)
+define(<TABLE>, <r2>)
+define(<PARAM_LENGTH>, <r3>)
+C On stack: DST, SRC
+
define(<W0>, <r4>)
define(<W1>, <r5>)
define(<W2>, <r6>)
define(<W3>, <r7>)
define(<T0>, <r8>)
-define(<KEY>, <r10>)
-define(<ROUND>, <r11>)
+define(<COUNT>, <r10>)
+define(<KEY>, <r11>)
-define(<X0>, <r2>) C Overlaps LENGTH, SRC, DST
+define(<MASK>, <r0>) C Overlaps inputs, except TABLE
+define(<X0>, <r1>)
define(<X1>, <r3>)
define(<X2>, <r12>)
define(<X3>, <r14>) C lr
-define(<MASK>, <r0>) C Overlaps CTX input
-define(<CTX>, <[sp]>)
+
+define(<FRAME_ROUNDS>, <[sp]>)
+define(<FRAME_KEYS>, <[sp, #+4]>)
+define(<FRAME_LENGTH>, <[sp, #+8]>)
+C 8 saved registers
+define(<FRAME_DST>, <[sp, #+44]>)
+define(<FRAME_SRC>, <[sp, #+48]>)
define(<AES_DECRYPT_ROUND>, <
@@ -103,29 +109,30 @@ define(<AES_DECRYPT_ROUND>, <
.file "aes-decrypt-internal.asm"
- C _aes_decrypt(struct aes_context *ctx,
+ C _aes_decrypt(unsigned rounds, const uint32_t *keys,
C const struct aes_table *T,
C size_t length, uint8_t *dst,
C uint8_t *src)
.text
ALIGN(4)
PROLOGUE(_nettle_aes_decrypt)
- teq LENGTH, #0
+ teq PARAM_LENGTH, #0
beq .Lend
- ldr SRC, [sp]
- push {r0, r4,r5,r6,r7,r8,r10,r11,lr}
+ push {r0,r1,r3, r4,r5,r6,r7,r8,r10,r11,lr}
mov MASK, #0x3fc
ALIGN(16)
.Lblock_loop:
- ldr KEY, CTX
- ldr ROUND, [KEY, #+AES_NROUNDS]
- AES_LOAD(SRC,KEY,W0)
- AES_LOAD(SRC,KEY,W1)
- AES_LOAD(SRC,KEY,W2)
- AES_LOAD(SRC,KEY,W3)
-
- push {LENGTH, DST, SRC}
+ ldr X0, FRAME_SRC C Use X0 as SRC pointer
+ ldm sp, {COUNT, KEY}
+
+ AES_LOAD(X0,KEY,W0)
+ AES_LOAD(X0,KEY,W1)
+ AES_LOAD(X0,KEY,W2)
+ AES_LOAD(X0,KEY,W3)
+
+ str X0, FRAME_SRC
+
add TABLE, TABLE, #AES_TABLE0
b .Lentry
@@ -135,31 +142,35 @@ PROLOGUE(_nettle_aes_decrypt)
AES_DECRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
.Lentry:
- subs ROUND, ROUND,#2
+ subs COUNT, COUNT,#2
C Transform W -> X
AES_DECRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
bne .Lround_loop
- lsr ROUND, MASK, #2 C Put the needed mask in the unused ROUND register
+ lsr COUNT, MASK, #2 C Put the needed mask in the unused COUNT register
sub TABLE, TABLE, #AES_TABLE0
C Final round
- AES_FINAL_ROUND_V5(X0, X3, X2, X1, KEY, W0, ROUND)
- AES_FINAL_ROUND_V5(X1, X0, X3, X2, KEY, W1, ROUND)
- AES_FINAL_ROUND_V5(X2, X1, X0, X3, KEY, W2, ROUND)
- AES_FINAL_ROUND_V5(X3, X2, X1, X0, KEY, W3, ROUND)
+ AES_FINAL_ROUND_V5(X0, X3, X2, X1, KEY, W0, COUNT)
+ AES_FINAL_ROUND_V5(X1, X0, X3, X2, KEY, W1, COUNT)
+ AES_FINAL_ROUND_V5(X2, X1, X0, X3, KEY, W2, COUNT)
+ AES_FINAL_ROUND_V5(X3, X2, X1, X0, KEY, W3, COUNT)
- pop {LENGTH, DST, SRC}
-
- AES_STORE(DST,W0)
- AES_STORE(DST,W1)
- AES_STORE(DST,W2)
- AES_STORE(DST,W3)
+ ldr X0, FRAME_DST
+ ldr X1, FRAME_LENGTH
+
+ AES_STORE(X0,W0)
+ AES_STORE(X0,W1)
+ AES_STORE(X0,W2)
+ AES_STORE(X0,W3)
+
+ subs X1, X1, #16
+ str X0, FRAME_DST
+ str X1, FRAME_LENGTH
- subs LENGTH, LENGTH, #16
bhi .Lblock_loop
- add sp, sp, #4 C Drop saved r0
+ add sp, sp, #12 C Drop saved r0, r1, r3
pop {r4,r5,r6,r7,r8,r10,r11,pc}
.Lend:
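The FRAME_* offsets in this file follow directly from the single push in the prologue: r0, r1 and r3 are stored first, then the eight callee-saved registers r4-r8, r10, r11 and lr, after which the caller's two stack arguments sit 44 and 48 bytes above the new stack pointer. A hypothetical C mirror of that layout (struct and field names are illustrative only):

    #include <stdint.h>

    /* Stack as seen immediately after
       push {r0,r1,r3, r4,r5,r6,r7,r8,r10,r11,lr}  (11 words, 44 bytes). */
    struct frame {
        uint32_t rounds;     /* [sp]       saved r0,  FRAME_ROUNDS  */
        uint32_t keys;       /* [sp, #4]   saved r1,  FRAME_KEYS    */
        uint32_t length;     /* [sp, #8]   saved r3,  FRAME_LENGTH  */
        uint32_t saved[8];   /* [sp, #12]  r4-r8, r10, r11, lr      */
        uint32_t dst;        /* [sp, #44]  5th argument, FRAME_DST  */
        uint32_t src;        /* [sp, #48]  6th argument, FRAME_SRC  */
    };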
diff --git a/arm/aes-encrypt-internal.asm b/arm/aes-encrypt-internal.asm
index eb2f1489..0d396185 100644
--- a/arm/aes-encrypt-internal.asm
+++ b/arm/aes-encrypt-internal.asm
@@ -19,32 +19,38 @@ C MA 02111-1301, USA.
include_src(<arm/aes.m4>)
-C Benchmarked at 725, 930, 990 cycles/block on cortex A9,
+C Benchmarked at 725, 815, 990 cycles/block on cortex A9,
C for 128, 192 and 256 bit key sizes.
C Possible improvements: More efficient load and store with
C aligned accesses. Better scheduling.
-C define(<CTX>, <r0>)
-define(<TABLE>, <r1>)
-define(<LENGTH>, <r2>)
-define(<DST>, <r3>)
-define(<SRC>, <r12>)
-
+define(<PARAM_ROUNDS>, <r0>)
+define(<PARAM_KEYS>, <r1>)
+define(<TABLE>, <r2>)
+define(<PARAM_LENGTH>, <r3>)
+C On stack: DST, SRC
+
define(<W0>, <r4>)
define(<W1>, <r5>)
define(<W2>, <r6>)
define(<W3>, <r7>)
define(<T0>, <r8>)
-define(<KEY>, <r10>)
-define(<ROUND>, <r11>)
+define(<COUNT>, <r10>)
+define(<KEY>, <r11>)
-define(<X0>, <r2>) C Overlaps LENGTH, SRC, DST
+define(<MASK>, <r0>) C Overlaps inputs, except TABLE
+define(<X0>, <r1>)
define(<X1>, <r3>)
define(<X2>, <r12>)
define(<X3>, <r14>) C lr
-define(<MASK>, <r0>) C Overlaps CTX input
-define(<CTX>, <[sp]>)
+
+define(<FRAME_ROUNDS>, <[sp]>)
+define(<FRAME_KEYS>, <[sp, #+4]>)
+define(<FRAME_LENGTH>, <[sp, #+8]>)
+C 8 saved registers
+define(<FRAME_DST>, <[sp, #+44]>)
+define(<FRAME_SRC>, <[sp, #+48]>)
C AES_ENCRYPT_ROUND(x0,x1,x2,x3,w0,w1,w2,w3,key)
@@ -112,29 +118,30 @@ define(<AES_ENCRYPT_ROUND>, <
.file "aes-encrypt-internal.asm"
- C _aes_encrypt(struct aes_context *ctx,
+ C _aes_encrypt(unsigned rounds, const uint32_t *keys,
C const struct aes_table *T,
C size_t length, uint8_t *dst,
C uint8_t *src)
.text
ALIGN(4)
PROLOGUE(_nettle_aes_encrypt)
- teq LENGTH, #0
+ teq PARAM_LENGTH, #0
beq .Lend
- ldr SRC, [sp]
- push {r0, r4,r5,r6,r7,r8,r10,r11,lr}
+ push {r0,r1,r3, r4,r5,r6,r7,r8,r10,r11,lr}
mov MASK, #0x3fc
ALIGN(16)
.Lblock_loop:
- ldr KEY, CTX
- ldr ROUND, [KEY, #+AES_NROUNDS]
- AES_LOAD(SRC,KEY,W0)
- AES_LOAD(SRC,KEY,W1)
- AES_LOAD(SRC,KEY,W2)
- AES_LOAD(SRC,KEY,W3)
-
- push {LENGTH, DST, SRC}
+ ldr X0, FRAME_SRC C Use X0 as SRC pointer
+ ldm sp, {COUNT, KEY}
+
+ AES_LOAD(X0,KEY,W0)
+ AES_LOAD(X0,KEY,W1)
+ AES_LOAD(X0,KEY,W2)
+ AES_LOAD(X0,KEY,W3)
+
+ str X0, FRAME_SRC
+
add TABLE, TABLE, #AES_TABLE0
b .Lentry
@@ -144,31 +151,35 @@ PROLOGUE(_nettle_aes_encrypt)
AES_ENCRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
.Lentry:
- subs ROUND, ROUND,#2
+ subs COUNT, COUNT,#2
C Transform W -> X
AES_ENCRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
bne .Lround_loop
- lsr ROUND, MASK, #2 C Put the needed mask in the unused ROUND register
+ lsr COUNT, MASK, #2 C Put the needed mask in the unused COUNT register
sub TABLE, TABLE, #AES_TABLE0
C Final round
- AES_FINAL_ROUND_V5(X0, X1, X2, X3, KEY, W0, ROUND)
- AES_FINAL_ROUND_V5(X1, X2, X3, X0, KEY, W1, ROUND)
- AES_FINAL_ROUND_V5(X2, X3, X0, X1, KEY, W2, ROUND)
- AES_FINAL_ROUND_V5(X3, X0, X1, X2, KEY, W3, ROUND)
+ AES_FINAL_ROUND_V5(X0, X1, X2, X3, KEY, W0, COUNT)
+ AES_FINAL_ROUND_V5(X1, X2, X3, X0, KEY, W1, COUNT)
+ AES_FINAL_ROUND_V5(X2, X3, X0, X1, KEY, W2, COUNT)
+ AES_FINAL_ROUND_V5(X3, X0, X1, X2, KEY, W3, COUNT)
- pop {LENGTH, DST, SRC}
-
- AES_STORE(DST,W0)
- AES_STORE(DST,W1)
- AES_STORE(DST,W2)
- AES_STORE(DST,W3)
+ ldr X0, FRAME_DST
+ ldr X1, FRAME_LENGTH
+
+ AES_STORE(X0,W0)
+ AES_STORE(X0,W1)
+ AES_STORE(X0,W2)
+ AES_STORE(X0,W3)
+
+ subs X1, X1, #16
+ str X0, FRAME_DST
+ str X1, FRAME_LENGTH
- subs LENGTH, LENGTH, #16
bhi .Lblock_loop
- add sp, sp, #4 C Drop saved r0
+ add sp, sp, #12 C Drop saved r0, r1, r3
pop {r4,r5,r6,r7,r8,r10,r11,pc}
.Lend:
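The MASK register set up with mov MASK, #0x3fc in both of these files is presumably the usual pre-scaled byte mask: 0x3fc is 0xff shifted left by two, so a masked extract yields a byte index already multiplied by four, ready to address a table of 32-bit entries. The final round needs the plain byte mask again, which is why the code recovers 0xff with lsr COUNT, MASK, #2 rather than loading a fresh constant. A self-contained C illustration of the arithmetic (the exact use inside the arm/aes.m4 macros is assumed, not shown in this diff):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t w = 0x11223344;
        uint32_t mask = 0x3fc;                  /* mov MASK, #0x3fc (0xff << 2) */

        /* Byte 1 of w, pre-scaled by 4: shift right by 8 - 2, then mask. */
        uint32_t scaled = (w >> 6) & mask;      /* 0x33 * 4 == 0xcc */

        /* Plain byte mask for the final round, recovered from MASK. */
        uint32_t byte_mask = mask >> 2;         /* lsr COUNT, MASK, #2 -> 0xff */

        printf("%#x %#x\n", scaled, byte_mask); /* prints 0xcc 0xff */
        return 0;
    }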
diff --git a/arm/v6/aes-decrypt-internal.asm b/arm/v6/aes-decrypt-internal.asm
index f550506d..f9f0b7ad 100644
--- a/arm/v6/aes-decrypt-internal.asm
+++ b/arm/v6/aes-decrypt-internal.asm
@@ -19,25 +19,33 @@ C MA 02111-1301, USA.
include_src(<arm/aes.m4>)
-define(<CTX>, <r0>)
-define(<TABLE>, <r1>)
-define(<LENGTH>, <r2>)
-define(<DST>, <r3>)
-define(<SRC>, <r12>)
+define(<PARAM_ROUNDS>, <r0>)
+define(<PARAM_KEYS>, <r1>)
+define(<TABLE>, <r2>)
+define(<LENGTH>, <r3>)
+C On stack: DST, SRC
define(<W0>, <r4>)
define(<W1>, <r5>)
define(<W2>, <r6>)
define(<W3>, <r7>)
define(<T0>, <r8>)
-define(<KEY>, <r10>)
-define(<ROUND>, <r11>)
+define(<COUNT>, <r10>)
+define(<KEY>, <r11>)
-define(<X0>, <r2>) C Overlaps LENGTH, SRC, DST
-define(<X1>, <r3>)
+define(<X0>, <r0>) C Overlaps PARAM_ROUNDS and PARAM_KEYS
+define(<X1>, <r1>)
define(<X2>, <r12>)
define(<X3>, <r14>) C lr
+define(<FRAME_ROUNDS>, <[sp]>)
+define(<FRAME_KEYS>, <[sp, #+4]>)
+C 8 saved registers
+define(<FRAME_DST>, <[sp, #+40]>)
+define(<FRAME_SRC>, <[sp, #+44]>)
+
+define(<SRC>, <r12>) C Overlap registers used in inner loop.
+define(<DST>, <COUNT>)
C AES_DECRYPT_ROUND(x0,x1,x2,x3,w0,w1,w2,w3,key)
define(<AES_DECRYPT_ROUND>, <
@@ -102,7 +110,7 @@ define(<AES_DECRYPT_ROUND>, <
.file "aes-decrypt-internal.asm"
- C _aes_decrypt(struct aes_context *ctx,
+ C _aes_decrypt(unsigned rounds, const uint32_t *keys,
C const struct aes_table *T,
C size_t length, uint8_t *dst,
C uint8_t *src)
@@ -111,22 +119,23 @@ define(<AES_DECRYPT_ROUND>, <
PROLOGUE(_nettle_aes_decrypt)
teq LENGTH, #0
beq .Lend
- ldr SRC, [sp]
- push {r4,r5,r6,r7,r8,r10,r11,lr}
- nop C For some mysterious reason, taking out this nop
- C slows this function down by 10(!) % on Cortex-A9.
+ ldr SRC, [sp, #+4]
+
+ push {r0,r1, r4,r5,r6,r7,r8,r10,r11,lr}
+
ALIGN(16)
.Lblock_loop:
- mov KEY, CTX
+ ldm sp, {COUNT, KEY}
+
+ add TABLE, TABLE, #AES_TABLE0
+
AES_LOAD(SRC,KEY,W0)
AES_LOAD(SRC,KEY,W1)
AES_LOAD(SRC,KEY,W2)
AES_LOAD(SRC,KEY,W3)
- push {LENGTH, DST, SRC}
- ldr ROUND, [CTX, #+AES_NROUNDS]
- add TABLE, TABLE, #AES_TABLE0
+ str SRC, FRAME_SRC
b .Lentry
ALIGN(16)
@@ -135,29 +144,34 @@ PROLOGUE(_nettle_aes_decrypt)
AES_DECRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
.Lentry:
- subs ROUND, ROUND,#2
+ subs COUNT, COUNT,#2
C Transform W -> X
AES_DECRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
bne .Lround_loop
sub TABLE, TABLE, #AES_TABLE0
+
C Final round
+ ldr DST, FRAME_DST
+
AES_FINAL_ROUND_V6(X0, X3, X2, X1, KEY, W0)
AES_FINAL_ROUND_V6(X1, X0, X3, X2, KEY, W1)
AES_FINAL_ROUND_V6(X2, X1, X0, X3, KEY, W2)
AES_FINAL_ROUND_V6(X3, X2, X1, X0, KEY, W3)
- pop {LENGTH, DST, SRC}
+ ldr SRC, FRAME_SRC
AES_STORE(DST,W0)
AES_STORE(DST,W1)
AES_STORE(DST,W2)
AES_STORE(DST,W3)
+ str DST, FRAME_DST
subs LENGTH, LENGTH, #16
bhi .Lblock_loop
+ add sp, sp, #8 C Drop saved r0, r1
pop {r4,r5,r6,r7,r8,r10,r11,pc}
.Lend:
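In the v6 variant, LENGTH keeps its register (r3) across the whole function, but DST shares a register with COUNT and SRC with X2, so dst and src are spilled to FRAME_DST/FRAME_SRC, while the round count and key pointer are refetched at the top of every block with a single ldm sp, {COUNT, KEY} (they sit in the two lowest stack slots, and the round loop consumes both). A rough C rendering of that bookkeeping, where load4, run_rounds and final_store4 are placeholder names for the AES_LOAD, round-loop and AES_FINAL_ROUND_V6/AES_STORE sequences rather than functions in this code:

    #include <stdint.h>
    #include <stddef.h>

    void load4(uint32_t w[4], const uint8_t **src, const uint32_t **key);
    void run_rounds(uint32_t w[4], unsigned count, const uint32_t **key);
    void final_store4(uint8_t **dst, const uint32_t w[4], const uint32_t *key);

    void v6_block_loop(unsigned rounds, const uint32_t *keys,
                       size_t length, uint8_t *dst, const uint8_t *src)
    {
        while (length > 0) {                 /* teq/beq guard, then bhi         */
            unsigned count = rounds;         /* ldm sp, {COUNT, KEY}            */
            const uint32_t *key = keys;
            uint32_t w[4];

            load4(w, &src, &key);            /* AES_LOAD x4; str SRC, FRAME_SRC */
            run_rounds(w, count, &key);      /* round loop: COUNT -> 0          */
            final_store4(&dst, w, key);      /* ldr DST, FRAME_DST; store;
                                                str DST, FRAME_DST              */
            length -= 16;                    /* subs LENGTH, LENGTH, #16        */
        }
    }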
diff --git a/arm/v6/aes-encrypt-internal.asm b/arm/v6/aes-encrypt-internal.asm
index 3cf13072..3c817de1 100644
--- a/arm/v6/aes-encrypt-internal.asm
+++ b/arm/v6/aes-encrypt-internal.asm
@@ -19,31 +19,39 @@ C MA 02111-1301, USA.
include_src(<arm/aes.m4>)
-C Benchmarked at 680, 818, 929 cycles/block on cortex A9,
+C Benchmarked at 706, 870, 963 cycles/block on cortex A9,
C for 128, 192 and 256 bit key sizes.
C Possible improvements: More efficient load and store with
C aligned accesses. Better scheduling.
-define(<CTX>, <r0>)
-define(<TABLE>, <r1>)
-define(<LENGTH>, <r2>)
-define(<DST>, <r3>)
-define(<SRC>, <r12>)
+define(<PARAM_ROUNDS>, <r0>)
+define(<PARAM_KEYS>, <r1>)
+define(<TABLE>, <r2>)
+define(<LENGTH>, <r3>)
+C On stack: DST, SRC
define(<W0>, <r4>)
define(<W1>, <r5>)
define(<W2>, <r6>)
define(<W3>, <r7>)
define(<T0>, <r8>)
-define(<KEY>, <r10>)
-define(<ROUND>, <r11>)
+define(<COUNT>, <r10>)
+define(<KEY>, <r11>)
-define(<X0>, <r2>) C Overlaps LENGTH, SRC, DST
-define(<X1>, <r3>)
+define(<X0>, <r0>) C Overlaps PARAM_ROUNDS and PARAM_KEYS
+define(<X1>, <r1>)
define(<X2>, <r12>)
define(<X3>, <r14>) C lr
+define(<FRAME_ROUNDS>, <[sp]>)
+define(<FRAME_KEYS>, <[sp, #+4]>)
+C 8 saved registers
+define(<FRAME_DST>, <[sp, #+40]>)
+define(<FRAME_SRC>, <[sp, #+44]>)
+
+define(<SRC>, <r12>) C Overlap registers used in inner loop.
+define(<DST>, <COUNT>)
C 53 instr.
C It's tempting to use eor with rotation, but that's slower.
@@ -110,7 +118,7 @@ define(<AES_ENCRYPT_ROUND>, <
.file "aes-encrypt-internal.asm"
- C _aes_encrypt(struct aes_context *ctx,
+ C _aes_encrypt(unsigned rounds, const uint32_t *keys,
C const struct aes_table *T,
C size_t length, uint8_t *dst,
C uint8_t *src)
@@ -119,20 +127,23 @@ define(<AES_ENCRYPT_ROUND>, <
PROLOGUE(_nettle_aes_encrypt)
teq LENGTH, #0
beq .Lend
- ldr SRC, [sp]
- push {r4,r5,r6,r7,r8,r10,r11,lr}
+ ldr SRC, [sp, #+4]
+
+ push {r0,r1, r4,r5,r6,r7,r8,r10,r11,lr}
+
ALIGN(16)
.Lblock_loop:
- mov KEY, CTX
+ ldm sp, {COUNT, KEY}
+
+ add TABLE, TABLE, #AES_TABLE0
+
AES_LOAD(SRC,KEY,W0)
AES_LOAD(SRC,KEY,W1)
AES_LOAD(SRC,KEY,W2)
AES_LOAD(SRC,KEY,W3)
- push {LENGTH, DST, SRC}
- ldr ROUND, [CTX, #+AES_NROUNDS]
- add TABLE, TABLE, #AES_TABLE0
+ str SRC, FRAME_SRC
b .Lentry
ALIGN(16)
@@ -141,29 +152,34 @@ PROLOGUE(_nettle_aes_encrypt)
AES_ENCRYPT_ROUND(X0, X1, X2, X3, W0, W1, W2, W3, KEY)
.Lentry:
- subs ROUND, ROUND,#2
+ subs COUNT, COUNT,#2
C Transform W -> X
AES_ENCRYPT_ROUND(W0, W1, W2, W3, X0, X1, X2, X3, KEY)
bne .Lround_loop
sub TABLE, TABLE, #AES_TABLE0
+
C Final round
+ ldr DST, FRAME_DST
+
AES_FINAL_ROUND_V6(X0, X1, X2, X3, KEY, W0)
AES_FINAL_ROUND_V6(X1, X2, X3, X0, KEY, W1)
AES_FINAL_ROUND_V6(X2, X3, X0, X1, KEY, W2)
AES_FINAL_ROUND_V6(X3, X0, X1, X2, KEY, W3)
- pop {LENGTH, DST, SRC}
+ ldr SRC, FRAME_SRC
AES_STORE(DST,W0)
AES_STORE(DST,W1)
AES_STORE(DST,W2)
AES_STORE(DST,W3)
+ str DST, FRAME_DST
subs LENGTH, LENGTH, #16
bhi .Lblock_loop
+ add sp, sp, #8 C Drop saved r0, r1
pop {r4,r5,r6,r7,r8,r10,r11,pc}
.Lend:
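On the C side, a caller now has to pull the round count and subkey array out of whatever context structure it uses and pass them explicitly, together with the table. A hypothetical wrapper illustrating the mapping (the context layout, field names and table symbol below are placeholders, not taken from this diff):

    #include <stdint.h>
    #include <stddef.h>

    struct aes_table;                                       /* opaque here */
    extern const struct aes_table example_encrypt_table;    /* placeholder */

    void _nettle_aes_encrypt(unsigned rounds, const uint32_t *keys,
                             const struct aes_table *T,
                             size_t length, uint8_t *dst, uint8_t *src);

    struct example_aes_ctx {                 /* placeholder layout          */
        unsigned nrounds;
        uint32_t keys[60];                   /* room for 15 round keys      */
    };

    void
    example_aes_encrypt(const struct example_aes_ctx *ctx,
                        size_t length, uint8_t *dst, uint8_t *src)
    {
        _nettle_aes_encrypt(ctx->nrounds, ctx->keys, &example_encrypt_table,
                            length, dst, src);
    }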