summary | refs | log | tree | commit | diff
path: root/arm
diff options
context:
space:
mode:
author: Niels Möller <nisse@lysator.liu.se> 2013-05-16 11:09:30 +0200
committer: Niels Möller <nisse@lysator.liu.se> 2013-05-16 11:09:30 +0200
commit: 23cd4cf7aea5b4c13135dd38b6e88423a045836a (patch)
tree: 751f4bf72dc17e347dcdb7ddb93253129a73ee03 /arm
parent: 8136e192d9da6d501a9d19b81e63c15b8c9729b9 (diff)
download: nettle-23cd4cf7aea5b4c13135dd38b6e88423a045836a.tar.gz
arm/v6: AES micro-optimization.
Diffstat (limited to 'arm')
-rw-r--r-- arm/aes.m4 5
-rw-r--r-- arm/v6/aes-decrypt-internal.asm 6
-rw-r--r-- arm/v6/aes-encrypt-internal.asm 2
3 files changed, 4 insertions, 9 deletions
diff --git a/arm/aes.m4 b/arm/aes.m4
index 408e35e4..d1fdc761 100644
--- a/arm/aes.m4
+++ b/arm/aes.m4
@@ -33,9 +33,8 @@ define(<AES_FINAL_ROUND>, <
uxtb T0, $3, ror #16
ldrb T0, [TABLE, T0]
eor $6, $6, T0, lsl #16
- uxtb T0, $4, ror #24
- ldrb T0, [TABLE, T0]
+ ldrb T0, [TABLE, $4, lsr #24]
eor $6, $6, T0, lsl #24
ldr T0, [$5], #+4
- eor $6, T0
+ eor $6, $6, T0
>)
diff --git a/arm/v6/aes-decrypt-internal.asm b/arm/v6/aes-decrypt-internal.asm
index 0c8cfc5e..e9b6e570 100644
--- a/arm/v6/aes-decrypt-internal.asm
+++ b/arm/v6/aes-decrypt-internal.asm
@@ -19,10 +19,6 @@ C MA 02111-1301, USA.
include_src(<arm/aes.m4>)
-C Benchmarked at at 785, 914, 1051 cycles/block on cortex A9,
-C for 128, 192 and 256 bit key sizes. Unclear why it is slower
-C than _aes_encrypt.
-
define(<CTX>, <r0>)
define(<TABLE>, <r1>)
define(<LENGTH>, <r2>)
@@ -119,7 +115,7 @@ PROLOGUE(_nettle_aes_decrypt)
push {r4,r5,r6,r7,r8,r10,r11,lr}
nop C For some mysterious reason, taking out this nop
- C slows this function down on Cortex-A9.
+ C slows this function down by 10(!) % on Cortex-A9.
ALIGN(16)
.Lblock_loop:
mov KEY, CTX
diff --git a/arm/v6/aes-encrypt-internal.asm b/arm/v6/aes-encrypt-internal.asm
index 69556a35..6887b899 100644
--- a/arm/v6/aes-encrypt-internal.asm
+++ b/arm/v6/aes-encrypt-internal.asm
@@ -19,7 +19,7 @@ C MA 02111-1301, USA.
include_src(<arm/aes.m4>)
-C Benchmarked at at 693, 824, 950 cycles/block on cortex A9,
+C Benchmarked at at 680, 818, 929 cycles/block on cortex A9,
C for 128, 192 and 256 bit key sizes.
C Possible improvements: More efficient load and store with