diff options
author | Niels Möller <nisse@lysator.liu.se> | 2021-08-10 22:23:14 +0200 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2021-08-10 22:23:14 +0200 |
commit | 463553ae61f1844e62d26107accacba21b1ddef1 (patch) | |
tree | 863db63e9dce9cfcbfb50aec660d992fdab72460 | |
parent | c7391e5cdb8a0afc05186d484bc9f752b8f0c074 (diff) | |
download | nettle-x86_64-aes-refactor.tar.gz |
x86_64: New 2-way aesni loop also for aes256 (branch: x86_64-aes-refactor)
-rw-r--r-- | ChangeLog | 2 | ||||
-rw-r--r-- | x86_64/aesni/aes256-decrypt.asm | 76 | ||||
-rw-r--r-- | x86_64/aesni/aes256-encrypt.asm | 75 |
3 files changed, 122 insertions, 31 deletions
@@ -4,7 +4,7 @@ * x86_64/aesni/aes128-decrypt.asm: Likewise. * x86_64/aesni/aes192-encrypt.asm: Likewise. * x86_64/aesni/aes192-decrypt.asm: Likewise. - * x86_64/aesni/aes256-encrypt.asm: New file, but 1-way loop. + * x86_64/aesni/aes256-encrypt.asm: Likewise. * x86_64/aesni/aes256-decrypt.asm: Likewise. * x86_64/aesni/aes-encrypt-internal.asm: Deleted. * x86_64/aesni/aes-decrypt-internal.asm: Deleted. diff --git a/x86_64/aesni/aes256-decrypt.asm b/x86_64/aesni/aes256-decrypt.asm index 122f1db6..0fc5ad2a 100644 --- a/x86_64/aesni/aes256-decrypt.asm +++ b/x86_64/aesni/aes256-decrypt.asm @@ -36,22 +36,22 @@ define(`LENGTH',`%rsi') define(`DST', `%rdx') define(`SRC', `%rcx') -define(`KEY0', `%xmm0') +define(`KEY0_7', `%xmm0') define(`KEY1', `%xmm1') define(`KEY2', `%xmm2') define(`KEY3', `%xmm3') define(`KEY4', `%xmm4') define(`KEY5', `%xmm5') define(`KEY6', `%xmm6') -define(`KEY7', `%xmm7') -define(`KEY8', `%xmm8') -define(`KEY9', `%xmm9') -define(`KEY10', `%xmm10') -define(`KEY11', `%xmm11') -define(`KEY12', `%xmm12') -define(`KEY13', `%xmm13') -define(`KEY14', `%xmm14') -define(`X', `%xmm15') +define(`KEY8', `%xmm7') +define(`KEY9', `%xmm8') +define(`KEY10', `%xmm9') +define(`KEY11', `%xmm10') +define(`KEY12', `%xmm11') +define(`KEY13', `%xmm12') +define(`KEY14', `%xmm13') +define(`X', `%xmm14') +define(`Y', `%xmm15') .file "aes256-decrypt.asm" @@ -67,14 +67,13 @@ PROLOGUE(nettle_aes256_decrypt) test LENGTH, LENGTH jz .Lend - movups (CTX), KEY0 + movups (CTX), KEY0_7 movups 16(CTX), KEY1 movups 32(CTX), KEY2 movups 48(CTX), KEY3 movups 64(CTX), KEY4 movups 80(CTX), KEY5 movups 96(CTX), KEY6 - movups 112(CTX), KEY7 movups 128(CTX), KEY8 movups 144(CTX), KEY9 movups 160(CTX), KEY10 @@ -83,16 +82,20 @@ PROLOGUE(nettle_aes256_decrypt) movups 208(CTX), KEY13 movups 224(CTX), KEY14 -.Lblock_loop: + shr LENGTH + jnc .Lblock_loop + movups (SRC), X - pxor KEY0, X + pxor KEY0_7, X + movups 112(CTX), KEY0_7 aesdec KEY1, X aesdec KEY2, X aesdec KEY3, X aesdec KEY4, X aesdec 
KEY5, X aesdec KEY6, X - aesdec KEY7, X + aesdec KEY0_7, X + movups (CTX), KEY0_7 aesdec KEY8, X aesdec KEY9, X aesdec KEY10, X @@ -104,6 +107,49 @@ PROLOGUE(nettle_aes256_decrypt) movups X, (DST) add $16, SRC add $16, DST + test LENGTH, LENGTH + jz .Lend + +.Lblock_loop: + movups (SRC), X + movups 16(SRC), Y + pxor KEY0_7, X + pxor KEY0_7, Y + movups 112(CTX), KEY0_7 + aesdec KEY1, X + aesdec KEY1, Y + aesdec KEY2, X + aesdec KEY2, Y + aesdec KEY3, X + aesdec KEY3, Y + aesdec KEY4, X + aesdec KEY4, Y + aesdec KEY5, X + aesdec KEY5, Y + aesdec KEY6, X + aesdec KEY6, Y + aesdec KEY0_7, X + aesdec KEY0_7, Y + movups (CTX), KEY0_7 + aesdec KEY8, X + aesdec KEY8, Y + aesdec KEY9, X + aesdec KEY9, Y + aesdec KEY10, X + aesdec KEY10, Y + aesdec KEY11, X + aesdec KEY11, Y + aesdec KEY12, X + aesdec KEY12, Y + aesdec KEY13, X + aesdec KEY13, Y + aesdeclast KEY14, X + aesdeclast KEY14, Y + + movups X, (DST) + movups Y, 16(DST) + add $32, SRC + add $32, DST dec LENGTH jnz .Lblock_loop diff --git a/x86_64/aesni/aes256-encrypt.asm b/x86_64/aesni/aes256-encrypt.asm index b261a237..57cfc4d2 100644 --- a/x86_64/aesni/aes256-encrypt.asm +++ b/x86_64/aesni/aes256-encrypt.asm @@ -36,22 +36,22 @@ define(`LENGTH',`%rsi') define(`DST', `%rdx') define(`SRC', `%rcx') -define(`KEY0', `%xmm0') +define(`KEY0_7', `%xmm0') define(`KEY1', `%xmm1') define(`KEY2', `%xmm2') define(`KEY3', `%xmm3') define(`KEY4', `%xmm4') define(`KEY5', `%xmm5') define(`KEY6', `%xmm6') -define(`KEY7', `%xmm7') -define(`KEY8', `%xmm8') -define(`KEY9', `%xmm9') -define(`KEY10', `%xmm10') -define(`KEY11', `%xmm11') -define(`KEY12', `%xmm12') -define(`KEY13', `%xmm13') -define(`KEY14', `%xmm14') -define(`X', `%xmm15') +define(`KEY8', `%xmm7') +define(`KEY9', `%xmm8') +define(`KEY10', `%xmm9') +define(`KEY11', `%xmm10') +define(`KEY12', `%xmm11') +define(`KEY13', `%xmm12') +define(`KEY14', `%xmm13') +define(`X', `%xmm14') +define(`Y', `%xmm15') .file "aes256-encrypt.asm" @@ -67,14 +67,13 @@ 
PROLOGUE(nettle_aes256_encrypt) test LENGTH, LENGTH jz .Lend - movups (CTX), KEY0 + movups (CTX), KEY0_7 movups 16(CTX), KEY1 movups 32(CTX), KEY2 movups 48(CTX), KEY3 movups 64(CTX), KEY4 movups 80(CTX), KEY5 movups 96(CTX), KEY6 - movups 112(CTX), KEY7 movups 128(CTX), KEY8 movups 144(CTX), KEY9 movups 160(CTX), KEY10 @@ -82,17 +81,20 @@ PROLOGUE(nettle_aes256_encrypt) movups 192(CTX), KEY12 movups 208(CTX), KEY13 movups 224(CTX), KEY14 + shr LENGTH + jnc .Lblock_loop -.Lblock_loop: movups (SRC), X - pxor KEY0, X + pxor KEY0_7, X + movups 112(CTX), KEY0_7 aesenc KEY1, X aesenc KEY2, X aesenc KEY3, X aesenc KEY4, X aesenc KEY5, X aesenc KEY6, X - aesenc KEY7, X + aesenc KEY0_7, X + movups (CTX), KEY0_7 aesenc KEY8, X aesenc KEY9, X aesenc KEY10, X @@ -104,6 +106,49 @@ PROLOGUE(nettle_aes256_encrypt) movups X, (DST) add $16, SRC add $16, DST + test LENGTH, LENGTH + jz .Lend + +.Lblock_loop: + movups (SRC), X + movups 16(SRC), Y + pxor KEY0_7, X + pxor KEY0_7, Y + movups 112(CTX), KEY0_7 + aesenc KEY1, X + aesenc KEY1, Y + aesenc KEY2, X + aesenc KEY2, Y + aesenc KEY3, X + aesenc KEY3, Y + aesenc KEY4, X + aesenc KEY4, Y + aesenc KEY5, X + aesenc KEY5, Y + aesenc KEY6, X + aesenc KEY6, Y + aesenc KEY0_7, X + aesenc KEY0_7, Y + movups (CTX), KEY0_7 + aesenc KEY8, X + aesenc KEY8, Y + aesenc KEY9, X + aesenc KEY9, Y + aesenc KEY10, X + aesenc KEY10, Y + aesenc KEY11, X + aesenc KEY11, Y + aesenc KEY12, X + aesenc KEY12, Y + aesenc KEY13, X + aesenc KEY13, Y + aesenclast KEY14, X + aesenclast KEY14, Y + + movups X, (DST) + movups Y, 16(DST) + add $32, SRC + add $32, DST dec LENGTH jnz .Lblock_loop |