author     Niels Möller <nisse@lysator.liu.se>  2018-01-03 19:00:19 +0100
committer  Niels Möller <nisse@lysator.liu.se>  2018-01-03 19:00:19 +0100
commit     db9b8594e4caa5459483359567fd077025a0cb65 (patch)
tree       2e4fddb5c778512516502bbafc70d95a8f358659 /x86_64
parent     0a6790905f30b3171bd5734138fe66b5f2e290a6 (diff)
Unroll x86_64 aesni loops.
Diffstat (limited to 'x86_64')
-rw-r--r--  x86_64/aesni/aes-decrypt-internal.asm  106
-rw-r--r--  x86_64/aesni/aes-encrypt-internal.asm  106
2 files changed, 140 insertions(+), 72 deletions(-)
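
The change replaces the data-dependent round loop with a fully unrolled round
sequence: all subkeys are loaded into xmm registers once, before the block
loop, so no key material is reloaded per block. The following is a minimal C
intrinsics sketch of the resulting decrypt structure for AES-128 (10 rounds);
it is an illustration with hypothetical names, not Nettle's source, and it
assumes keys points at subkeys already prepared for the equivalent inverse
cipher (the form the aesdec instruction expects):

  #include <wmmintrin.h>   /* AES-NI intrinsics; compile with -maes */
  #include <stddef.h>

  /* Illustrative only: decrypt nblocks 16-byte blocks with AES-128.
     The 11 subkeys are loaded once, mirroring the KEY0..KEY9/KEYLAST
     registers in the diff; the block loop then touches memory only
     for the data itself. */
  static void
  aes128_decrypt_blocks (const unsigned char *keys, size_t nblocks,
                         unsigned char *dst, const unsigned char *src)
  {
    __m128i k0 = _mm_loadu_si128 ((const __m128i *) keys);  /* movups (KEYS), KEY0 */
    __m128i k1 = _mm_loadu_si128 ((const __m128i *) (keys + 16));
    __m128i k2 = _mm_loadu_si128 ((const __m128i *) (keys + 32));
    __m128i k3 = _mm_loadu_si128 ((const __m128i *) (keys + 48));
    __m128i k4 = _mm_loadu_si128 ((const __m128i *) (keys + 64));
    __m128i k5 = _mm_loadu_si128 ((const __m128i *) (keys + 80));
    __m128i k6 = _mm_loadu_si128 ((const __m128i *) (keys + 96));
    __m128i k7 = _mm_loadu_si128 ((const __m128i *) (keys + 112));
    __m128i k8 = _mm_loadu_si128 ((const __m128i *) (keys + 128));
    __m128i k9 = _mm_loadu_si128 ((const __m128i *) (keys + 144));
    __m128i klast = _mm_loadu_si128 ((const __m128i *) (keys + 160));

    for (; nblocks > 0; nblocks--, src += 16, dst += 16)
      {
        __m128i b = _mm_loadu_si128 ((const __m128i *) src);
        b = _mm_xor_si128 (b, k0);           /* initial whitening */
        b = _mm_aesdec_si128 (b, k1);        /* nine full rounds, unrolled */
        b = _mm_aesdec_si128 (b, k2);
        b = _mm_aesdec_si128 (b, k3);
        b = _mm_aesdec_si128 (b, k4);
        b = _mm_aesdec_si128 (b, k5);
        b = _mm_aesdec_si128 (b, k6);
        b = _mm_aesdec_si128 (b, k7);
        b = _mm_aesdec_si128 (b, k8);
        b = _mm_aesdec_si128 (b, k9);
        b = _mm_aesdeclast_si128 (b, klast); /* final round */
        _mm_storeu_si128 ((__m128i *) dst, b);
      }
  }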
diff --git a/x86_64/aesni/aes-decrypt-internal.asm b/x86_64/aesni/aes-decrypt-internal.asm
index 412e8d31..3d6d6e30 100644
--- a/x86_64/aesni/aes-decrypt-internal.asm
+++ b/x86_64/aesni/aes-decrypt-internal.asm
@@ -2,7 +2,7 @@ C x86_64/aesni/aes-decrypt-internal.asm
ifelse(<
- Copyright (C) 2015 Niels Möller
+ Copyright (C) 2015, 2018 Niels Möller
This file is part of GNU Nettle.
@@ -39,15 +39,22 @@ define(<LENGTH>,<%rcx>)
define(<DST>, <%r8>)
define(<SRC>, <%r9>)
-C Round counter
-define(<CNT>, <%rdx>)
-C Subkey pointer
-define(<KEY>, <%rax>)
-
-dnl aesdec %xmm1, %xmm0
-define(<AESDEC>, <.byte 0x66, 0x0f, 0x38, 0xde, 0xc1>)
-dnl aesdeclast %xmm1, %xmm0
-define(<AESDECLAST>, <.byte 0x66, 0x0f, 0x38, 0xdf, 0xc1>)
+define(<KEY0>, <%xmm0>)
+define(<KEY1>, <%xmm1>)
+define(<KEY2>, <%xmm2>)
+define(<KEY3>, <%xmm3>)
+define(<KEY4>, <%xmm4>)
+define(<KEY5>, <%xmm5>)
+define(<KEY6>, <%xmm6>)
+define(<KEY7>, <%xmm7>)
+define(<KEY8>, <%xmm8>)
+define(<KEY9>, <%xmm9>)
+define(<KEY10>, <%xmm10>)
+define(<KEY11>, <%xmm11>)
+define(<KEY12>, <%xmm12>)
+define(<KEY13>, <%xmm13>)
+define(<KEYLAST>, <%xmm14>)
+define(<BLOCK>, <%xmm15>)
.file "aes-decrypt-internal.asm"
@@ -58,43 +65,70 @@ define(<AESDECLAST>, <.byte 0x66, 0x0f, 0x38, 0xdf, 0xc1>)
.text
ALIGN(16)
PROLOGUE(_nettle_aes_decrypt)
- W64_ENTRY(6, 2)
+ W64_ENTRY(6, 16)
shr $4, LENGTH
test LENGTH, LENGTH
jz .Lend
- decl XREG(ROUNDS)
+ movups (KEYS), KEY0
+ movups 16(KEYS), KEY1
+ movups 32(KEYS), KEY2
+ movups 48(KEYS), KEY3
+ movups 64(KEYS), KEY4
+ movups 80(KEYS), KEY5
+ movups 96(KEYS), KEY6
+ movups 112(KEYS), KEY7
+ movups 128(KEYS), KEY8
+ movups 144(KEYS), KEY9
+ lea 160(KEYS), KEYS
+ sub $10, XREG(ROUNDS) C Also clears high half
+ je .Lkey_last
+
+ movups (KEYS), KEY10
+ movups 16(KEYS), KEY11
+ lea (KEYS, ROUNDS, 8), KEYS
+ lea (KEYS, ROUNDS, 8), KEYS
+
+ cmpl $2, XREG(ROUNDS)
+ je .Lkey_last
+ movups -32(KEYS), KEY12
+ movups -16(KEYS), KEY13
+
+.Lkey_last:
+ movups (KEYS), KEYLAST
.Lblock_loop:
- mov ROUNDS, CNT
- mov KEYS, KEY
- movups (SRC), %xmm0
- C FIXME: Better alignment of subkeys, so we can use movaps.
- movups (KEY), %xmm1
- pxor %xmm1, %xmm0
-
- C FIXME: Could use some unrolling. Also all subkeys fit in
- C registers, so they could be loaded once (on W64 we would
- C need to save and restore some xmm registers, though).
-
-.Lround_loop:
- add $16, KEY
-
- movups (KEY), %xmm1
- AESDEC C %xmm1, %xmm0
- decl XREG(CNT)
- jnz .Lround_loop
-
- movups 16(KEY), %xmm1
- AESDECLAST C %xmm1, %xmm0
-
- movups %xmm0, (DST)
+ movups (SRC), BLOCK
+ pxor KEY0, BLOCK
+ aesdec KEY1, BLOCK
+ aesdec KEY2, BLOCK
+ aesdec KEY3, BLOCK
+ aesdec KEY4, BLOCK
+ aesdec KEY5, BLOCK
+ aesdec KEY6, BLOCK
+ aesdec KEY7, BLOCK
+ aesdec KEY8, BLOCK
+ aesdec KEY9, BLOCK
+ testl XREG(ROUNDS), XREG(ROUNDS)
+ je .Lblock_end
+ aesdec KEY10, BLOCK
+ aesdec KEY11, BLOCK
+ cmpl $2, XREG(ROUNDS)
+ je .Lblock_end
+
+ aesdec KEY12, BLOCK
+ aesdec KEY13, BLOCK
+
+.Lblock_end:
+ aesdeclast KEYLAST, BLOCK
+
+ movups BLOCK, (DST)
add $16, SRC
add $16, DST
dec LENGTH
jnz .Lblock_loop
.Lend:
- W64_EXIT(6, 2)
+ W64_EXIT(6, 16)
ret
EPILOGUE(_nettle_aes_decrypt)
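
A note on the key-pointer arithmetic above: after the first ten subkeys are
loaded, XREG(ROUNDS) holds rounds - 10 (0 for AES-128, 2 for AES-192, 4 for
AES-256). Since lea scales an index by at most 8, the two consecutive
lea (KEYS, ROUNDS, 8), KEYS instructions together advance KEYS by 16 bytes
per remaining round, leaving it pointing at the last subkey, so KEY12 and
KEY13 sit at offsets -32 and -16. The same count drives the branches inside
the block loop. A C sketch of that dispatch (hypothetical helper, not
Nettle's code):

  #include <wmmintrin.h>

  /* Illustrative helper mirroring the .Lblock_end branches: apply the
     rounds beyond the ten common to all key sizes.  rounds_left is
     rounds - 10: 0 (AES-128), 2 (AES-192) or 4 (AES-256). */
  static __m128i
  aes_decrypt_tail (__m128i b, unsigned rounds_left,
                    __m128i k10, __m128i k11,
                    __m128i k12, __m128i k13, __m128i klast)
  {
    if (rounds_left != 0)        /* testl XREG(ROUNDS), ...; je .Lblock_end */
      {
        b = _mm_aesdec_si128 (b, k10);
        b = _mm_aesdec_si128 (b, k11);
        if (rounds_left != 2)    /* cmpl $2, ...; je .Lblock_end */
          {
            b = _mm_aesdec_si128 (b, k12);
            b = _mm_aesdec_si128 (b, k13);
          }
      }
    return _mm_aesdeclast_si128 (b, klast);
  }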
diff --git a/x86_64/aesni/aes-encrypt-internal.asm b/x86_64/aesni/aes-encrypt-internal.asm
index 07f17b25..99caf1f8 100644
--- a/x86_64/aesni/aes-encrypt-internal.asm
+++ b/x86_64/aesni/aes-encrypt-internal.asm
@@ -2,7 +2,7 @@ C x86_64/aesni/aes-encrypt-internal.asm
ifelse(<
- Copyright (C) 2015 Niels Möller
+ Copyright (C) 2015, 2018 Niels Möller
This file is part of GNU Nettle.
@@ -39,16 +39,23 @@ define(<LENGTH>,<%rcx>)
define(<DST>, <%r8>)
define(<SRC>, <%r9>)
-C Round counter
-define(<CNT>, <%rdx>)
-C Subkey pointer
-define(<KEY>, <%rax>)
+define(<KEY0>, <%xmm0>)
+define(<KEY1>, <%xmm1>)
+define(<KEY2>, <%xmm2>)
+define(<KEY3>, <%xmm3>)
+define(<KEY4>, <%xmm4>)
+define(<KEY5>, <%xmm5>)
+define(<KEY6>, <%xmm6>)
+define(<KEY7>, <%xmm7>)
+define(<KEY8>, <%xmm8>)
+define(<KEY9>, <%xmm9>)
+define(<KEY10>, <%xmm10>)
+define(<KEY11>, <%xmm11>)
+define(<KEY12>, <%xmm12>)
+define(<KEY13>, <%xmm13>)
+define(<KEYLAST>, <%xmm14>)
+define(<BLOCK>, <%xmm15>)
-dnl aesenc %xmm1, %xmm0
-define(<AESENC>, <.byte 0x66, 0x0f, 0x38, 0xdc, 0xc1>)
-dnl aesenclast %xmm1, %xmm0
-define(<AESENCLAST>, <.byte 0x66, 0x0f, 0x38, 0xdd, 0xc1>)
-
.file "aes-encrypt-internal.asm"
C _aes_encrypt(unsigned rounds, const uint32_t *keys,
@@ -58,43 +65,70 @@ define(<AESENCLAST>, <.byte 0x66, 0x0f, 0x38, 0xdd, 0xc1>)
.text
ALIGN(16)
PROLOGUE(_nettle_aes_encrypt)
- W64_ENTRY(6, 2)
+ W64_ENTRY(6, 16)
shr $4, LENGTH
test LENGTH, LENGTH
jz .Lend
- decl XREG(ROUNDS)
+ movups (KEYS), KEY0
+ movups 16(KEYS), KEY1
+ movups 32(KEYS), KEY2
+ movups 48(KEYS), KEY3
+ movups 64(KEYS), KEY4
+ movups 80(KEYS), KEY5
+ movups 96(KEYS), KEY6
+ movups 112(KEYS), KEY7
+ movups 128(KEYS), KEY8
+ movups 144(KEYS), KEY9
+ lea 160(KEYS), KEYS
+ sub $10, XREG(ROUNDS) C Also clears high half
+ je .Lkey_last
+
+ movups (KEYS), KEY10
+ movups 16(KEYS), KEY11
+ lea (KEYS, ROUNDS, 8), KEYS
+ lea (KEYS, ROUNDS, 8), KEYS
+
+ cmpl $2, XREG(ROUNDS)
+ je .Lkey_last
+ movups -32(KEYS), KEY12
+ movups -16(KEYS), KEY13
+
+.Lkey_last:
+ movups (KEYS), KEYLAST
.Lblock_loop:
- mov ROUNDS, CNT
- mov KEYS, KEY
- movups (SRC), %xmm0
- C FIXME: Better alignment of subkeys, so we can use movaps.
- movups (KEY), %xmm1
- pxor %xmm1, %xmm0
-
- C FIXME: Could use some unrolling. Also all subkeys fit in
- C registers, so they could be loaded once (on W64 we would
- C need to save and restore some xmm registers, though).
-
-.Lround_loop:
- add $16, KEY
-
- movups (KEY), %xmm1
- AESENC C %xmm1, %xmm0
- decl XREG(CNT)
- jnz .Lround_loop
-
- movups 16(KEY), %xmm1
- AESENCLAST C %xmm1, %xmm0
-
- movups %xmm0, (DST)
+ movups (SRC), BLOCK
+ pxor KEY0, BLOCK
+ aesenc KEY1, BLOCK
+ aesenc KEY2, BLOCK
+ aesenc KEY3, BLOCK
+ aesenc KEY4, BLOCK
+ aesenc KEY5, BLOCK
+ aesenc KEY6, BLOCK
+ aesenc KEY7, BLOCK
+ aesenc KEY8, BLOCK
+ aesenc KEY9, BLOCK
+ testl XREG(ROUNDS), XREG(ROUNDS)
+ je .Lblock_end
+ aesenc KEY10, BLOCK
+ aesenc KEY11, BLOCK
+ cmpl $2, XREG(ROUNDS)
+ je .Lblock_end
+
+ aesenc KEY12, BLOCK
+ aesenc KEY13, BLOCK
+
+.Lblock_end:
+ aesenclast KEYLAST, BLOCK
+
+ movups BLOCK, (DST)
add $16, SRC
add $16, DST
dec LENGTH
jnz .Lblock_loop
.Lend:
- W64_EXIT(6, 2)
+ W64_EXIT(6, 16)
ret
EPILOGUE(_nettle_aes_encrypt)
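
The encrypt side is the mirror image of the decrypt change, with
aesenc/aesenclast in place of aesdec/aesdeclast. Note also the
W64_ENTRY(6, 2) -> W64_ENTRY(6, 16) updates in both files: the second
argument grows because the code now uses xmm0-xmm15, and on Win64 the
registers xmm6 and up are callee-saved and must be preserved across the
call, which is the save/restore cost the removed FIXME comment anticipated.
For completeness, the encrypt analogue of the earlier sketch (again a
hypothetical illustration, AES-128 only):

  #include <wmmintrin.h>

  /* Illustrative encrypt-side analogue of the decrypt sketch: one
     AES-128 block through preloaded subkeys k[0..10]. */
  static __m128i
  aes128_encrypt_block (const __m128i k[11], __m128i b)
  {
    b = _mm_xor_si128 (b, k[0]);
    b = _mm_aesenc_si128 (b, k[1]);
    b = _mm_aesenc_si128 (b, k[2]);
    b = _mm_aesenc_si128 (b, k[3]);
    b = _mm_aesenc_si128 (b, k[4]);
    b = _mm_aesenc_si128 (b, k[5]);
    b = _mm_aesenc_si128 (b, k[6]);
    b = _mm_aesenc_si128 (b, k[7]);
    b = _mm_aesenc_si128 (b, k[8]);
    b = _mm_aesenc_si128 (b, k[9]);
    return _mm_aesenclast_si128 (b, k[10]);
  }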