diff options
author     Niels Möller <nisse@lysator.liu.se>   2018-01-03 19:00:19 +0100
committer  Niels Möller <nisse@lysator.liu.se>   2018-01-03 19:00:19 +0100
commit     db9b8594e4caa5459483359567fd077025a0cb65 (patch)
tree       2e4fddb5c778512516502bbafc70d95a8f358659 /x86_64
parent     0a6790905f30b3171bd5734138fe66b5f2e290a6 (diff)
download   nettle-db9b8594e4caa5459483359567fd077025a0cb65.tar.gz
Unroll x86_64 aesni loops.
Diffstat (limited to 'x86_64')
-rw-r--r--  x86_64/aesni/aes-decrypt-internal.asm | 106
-rw-r--r--  x86_64/aesni/aes-encrypt-internal.asm | 106
2 files changed, 140 insertions(+), 72 deletions(-)
diff --git a/x86_64/aesni/aes-decrypt-internal.asm b/x86_64/aesni/aes-decrypt-internal.asm index 412e8d31..3d6d6e30 100644 --- a/x86_64/aesni/aes-decrypt-internal.asm +++ b/x86_64/aesni/aes-decrypt-internal.asm @@ -2,7 +2,7 @@ C x86_64/aesni/aes-decrypt-internal.asm ifelse(< - Copyright (C) 2015 Niels Möller + Copyright (C) 2015, 2018 Niels Möller This file is part of GNU Nettle. @@ -39,15 +39,22 @@ define(<LENGTH>,<%rcx>) define(<DST>, <%r8>) define(<SRC>, <%r9>) -C Round counter -define(<CNT>, <%rdx>) -C Subkey pointer -define(<KEY>, <%rax>) - -dnl aesdec %xmm1, %xmm0 -define(<AESDEC>, <.byte 0x66, 0x0f, 0x38, 0xde, 0xc1>) -dnl aesdeclast %xmm1, %xmm0 -define(<AESDECLAST>, <.byte 0x66, 0x0f, 0x38, 0xdf, 0xc1>) +define(<KEY0>, <%xmm0>) +define(<KEY1>, <%xmm1>) +define(<KEY2>, <%xmm2>) +define(<KEY3>, <%xmm3>) +define(<KEY4>, <%xmm4>) +define(<KEY5>, <%xmm5>) +define(<KEY6>, <%xmm6>) +define(<KEY7>, <%xmm7>) +define(<KEY8>, <%xmm8>) +define(<KEY9>, <%xmm9>) +define(<KEY10>, <%xmm10>) +define(<KEY11>, <%xmm11>) +define(<KEY12>, <%xmm12>) +define(<KEY13>, <%xmm13>) +define(<KEYLAST>, <%xmm14>) +define(<BLOCK>, <%xmm15>) .file "aes-decrypt-internal.asm" @@ -58,43 +65,70 @@ define(<AESDECLAST>, <.byte 0x66, 0x0f, 0x38, 0xdf, 0xc1>) .text ALIGN(16) PROLOGUE(_nettle_aes_decrypt) - W64_ENTRY(6, 2) + W64_ENTRY(6, 16) shr $4, LENGTH test LENGTH, LENGTH jz .Lend - decl XREG(ROUNDS) + movups (KEYS), KEY0 + movups 16(KEYS), KEY1 + movups 32(KEYS), KEY2 + movups 48(KEYS), KEY3 + movups 64(KEYS), KEY4 + movups 80(KEYS), KEY5 + movups 96(KEYS), KEY6 + movups 112(KEYS), KEY7 + movups 128(KEYS), KEY8 + movups 144(KEYS), KEY9 + lea 160(KEYS), KEYS + sub $10, XREG(ROUNDS) C Also clears high half + je .Lkey_last + + movups (KEYS), KEY10 + movups 16(KEYS), KEY11 + lea (KEYS, ROUNDS, 8), KEYS + lea (KEYS, ROUNDS, 8), KEYS + + cmpl $2, XREG(ROUNDS) + je .Lkey_last + movups -32(KEYS), KEY12 + movups -16(KEYS), KEY13 + +.Lkey_last: + movups (KEYS), KEYLAST .Lblock_loop: - mov ROUNDS, CNT - 
mov KEYS, KEY - movups (SRC), %xmm0 - C FIXME: Better alignment of subkeys, so we can use movaps. - movups (KEY), %xmm1 - pxor %xmm1, %xmm0 - - C FIXME: Could use some unrolling. Also all subkeys fit in - C registers, so they could be loaded once (on W64 we would - C need to save and restore some xmm registers, though). - -.Lround_loop: - add $16, KEY - - movups (KEY), %xmm1 - AESDEC C %xmm1, %xmm0 - decl XREG(CNT) - jnz .Lround_loop - - movups 16(KEY), %xmm1 - AESDECLAST C %xmm1, %xmm0 - - movups %xmm0, (DST) + movups (SRC), BLOCK + pxor KEY0, BLOCK + aesdec KEY1, BLOCK + aesdec KEY2, BLOCK + aesdec KEY3, BLOCK + aesdec KEY4, BLOCK + aesdec KEY5, BLOCK + aesdec KEY6, BLOCK + aesdec KEY7, BLOCK + aesdec KEY8, BLOCK + aesdec KEY9, BLOCK + testl XREG(ROUNDS), XREG(ROUNDS) + je .Lblock_end + aesdec KEY10, BLOCK + aesdec KEY11, BLOCK + cmpl $2, XREG(ROUNDS) + je .Lblock_end + + aesdec KEY12, BLOCK + aesdec KEY13, BLOCK + +.Lblock_end: + aesdeclast KEYLAST, BLOCK + + movups BLOCK, (DST) add $16, SRC add $16, DST dec LENGTH jnz .Lblock_loop .Lend: - W64_EXIT(6, 2) + W64_EXIT(6, 16) ret EPILOGUE(_nettle_aes_decrypt) diff --git a/x86_64/aesni/aes-encrypt-internal.asm b/x86_64/aesni/aes-encrypt-internal.asm index 07f17b25..99caf1f8 100644 --- a/x86_64/aesni/aes-encrypt-internal.asm +++ b/x86_64/aesni/aes-encrypt-internal.asm @@ -2,7 +2,7 @@ C x86_64/aesni/aes-encrypt-internal.asm ifelse(< - Copyright (C) 2015 Niels Möller + Copyright (C) 2015, 2018 Niels Möller This file is part of GNU Nettle. 
@@ -39,16 +39,23 @@ define(<LENGTH>,<%rcx>) define(<DST>, <%r8>) define(<SRC>, <%r9>) -C Round counter -define(<CNT>, <%rdx>) -C Subkey pointer -define(<KEY>, <%rax>) +define(<KEY0>, <%xmm0>) +define(<KEY1>, <%xmm1>) +define(<KEY2>, <%xmm2>) +define(<KEY3>, <%xmm3>) +define(<KEY4>, <%xmm4>) +define(<KEY5>, <%xmm5>) +define(<KEY6>, <%xmm6>) +define(<KEY7>, <%xmm7>) +define(<KEY8>, <%xmm8>) +define(<KEY9>, <%xmm9>) +define(<KEY10>, <%xmm10>) +define(<KEY11>, <%xmm11>) +define(<KEY12>, <%xmm12>) +define(<KEY13>, <%xmm13>) +define(<KEYLAST>, <%xmm14>) +define(<BLOCK>, <%xmm15>) -dnl aesenc %xmm1, %xmm0 -define(<AESENC>, <.byte 0x66, 0x0f, 0x38, 0xdc, 0xc1>) -dnl aesenclast %xmm1, %xmm0 -define(<AESENCLAST>, <.byte 0x66, 0x0f, 0x38, 0xdd, 0xc1>) - .file "aes-encrypt-internal.asm" C _aes_encrypt(unsigned rounds, const uint32_t *keys, @@ -58,43 +65,70 @@ define(<AESENCLAST>, <.byte 0x66, 0x0f, 0x38, 0xdd, 0xc1>) .text ALIGN(16) PROLOGUE(_nettle_aes_encrypt) - W64_ENTRY(6, 2) + W64_ENTRY(6, 16) shr $4, LENGTH test LENGTH, LENGTH jz .Lend - decl XREG(ROUNDS) + movups (KEYS), KEY0 + movups 16(KEYS), KEY1 + movups 32(KEYS), KEY2 + movups 48(KEYS), KEY3 + movups 64(KEYS), KEY4 + movups 80(KEYS), KEY5 + movups 96(KEYS), KEY6 + movups 112(KEYS), KEY7 + movups 128(KEYS), KEY8 + movups 144(KEYS), KEY9 + lea 160(KEYS), KEYS + sub $10, XREG(ROUNDS) C Also clears high half + je .Lkey_last + + movups (KEYS), KEY10 + movups 16(KEYS), KEY11 + lea (KEYS, ROUNDS, 8), KEYS + lea (KEYS, ROUNDS, 8), KEYS + + cmpl $2, XREG(ROUNDS) + je .Lkey_last + movups -32(KEYS), KEY12 + movups -16(KEYS), KEY13 + +.Lkey_last: + movups (KEYS), KEYLAST .Lblock_loop: - mov ROUNDS, CNT - mov KEYS, KEY - movups (SRC), %xmm0 - C FIXME: Better alignment of subkeys, so we can use movaps. - movups (KEY), %xmm1 - pxor %xmm1, %xmm0 - - C FIXME: Could use some unrolling. Also all subkeys fit in - C registers, so they could be loaded once (on W64 we would - C need to save and restore some xmm registers, though). 
- -.Lround_loop: - add $16, KEY - - movups (KEY), %xmm1 - AESENC C %xmm1, %xmm0 - decl XREG(CNT) - jnz .Lround_loop - - movups 16(KEY), %xmm1 - AESENCLAST C %xmm1, %xmm0 - - movups %xmm0, (DST) + movups (SRC), BLOCK + pxor KEY0, BLOCK + aesenc KEY1, BLOCK + aesenc KEY2, BLOCK + aesenc KEY3, BLOCK + aesenc KEY4, BLOCK + aesenc KEY5, BLOCK + aesenc KEY6, BLOCK + aesenc KEY7, BLOCK + aesenc KEY8, BLOCK + aesenc KEY9, BLOCK + testl XREG(ROUNDS), XREG(ROUNDS) + je .Lblock_end + aesenc KEY10, BLOCK + aesenc KEY11, BLOCK + cmpl $2, XREG(ROUNDS) + je .Lblock_end + + aesenc KEY12, BLOCK + aesenc KEY13, BLOCK + +.Lblock_end: + aesenclast KEYLAST, BLOCK + + movups BLOCK, (DST) add $16, SRC add $16, DST dec LENGTH jnz .Lblock_loop .Lend: - W64_EXIT(6, 2) + W64_EXIT(6, 16) ret EPILOGUE(_nettle_aes_encrypt) |