author    | Niels Möller <nisse@lysator.liu.se> | 2020-07-10 20:53:09 +0200
committer | Niels Möller <nisse@lysator.liu.se> | 2020-07-10 20:53:09 +0200
commit    | e951e4ddd7b66c5adc3d5daf48c9de149965cd0d (patch)
tree      | 5798c681e73c659e7bf6ae740d32ba60e79c53cc /x86_64
parent    | eb143cc5685f537b88c10e6a0bd1677970ac9bd6 (diff)
download  | nettle-e951e4ddd7b66c5adc3d5daf48c9de149965cd0d.tar.gz
x86_64: Replace salsa20_crypt assembly with salsa20_2core (x86_64-salsa20-2core)
Diffstat (limited to 'x86_64')
-rw-r--r-- | x86_64/salsa20-2core.asm | 318
-rw-r--r-- | x86_64/salsa20-crypt.asm | 247
2 files changed, 318 insertions, 247 deletions
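
Editor's note: the deleted file's "Possible improvements" comment below lists "do two blocks (or more) at a time in parallel"; the new `_salsa20_2core` implements exactly that. Both files compute Salsa20's double round, whose rotate counts 7, 9, 13 and 18 appear throughout the diff as `pslld`/`psrld` pairs (e.g. `pslld $7` with `psrld $25`). As a reading aid, here is a plain-C sketch of one double round on the 4x4 state; it is not part of the commit, just the reference algorithm the assembly implements.

```c
#include <stdint.h>

#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

/* One Salsa20 quarter round. Each line corresponds to one
   paddd + pslld/psrld + pxor group in the assembly below. */
static void
qround(uint32_t *x0, uint32_t *x1, uint32_t *x2, uint32_t *x3)
{
  *x1 ^= ROTL32(*x0 + *x3, 7);
  *x2 ^= ROTL32(*x1 + *x0, 9);
  *x3 ^= ROTL32(*x2 + *x1, 13);
  *x0 ^= ROTL32(*x3 + *x2, 18);
}

/* One double round (column round, then row round) on the state,
   stored as 16 words in row-major order. The old code runs this
   10 times per block; the new code's rounds/2 loop runs it once
   per iteration, on two blocks at a time. */
static void
double_round(uint32_t x[16])
{
  qround(&x[0],  &x[4],  &x[8],  &x[12]);  /* column round */
  qround(&x[5],  &x[9],  &x[13], &x[1]);
  qround(&x[10], &x[14], &x[2],  &x[6]);
  qround(&x[15], &x[3],  &x[7],  &x[11]);
  qround(&x[0],  &x[1],  &x[2],  &x[3]);   /* row round */
  qround(&x[5],  &x[6],  &x[7],  &x[4]);
  qround(&x[10], &x[11], &x[8],  &x[9]);
  qround(&x[15], &x[12], &x[13], &x[14]);
}
```

In the new file, block A lives in the even and block B in the odd 32-bit lanes of each xmm register (see the "Register layout" comments), so every `paddd`/`pxor` in the loop advances both blocks, hiding the data-dependency latency that the old single-block code suffered from.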
diff --git a/x86_64/salsa20-2core.asm b/x86_64/salsa20-2core.asm
new file mode 100644
index 00000000..36f7438d
--- /dev/null
+++ b/x86_64/salsa20-2core.asm
@@ -0,0 +1,318 @@
+C x86_64/salsa20-2core.asm
+
+ifelse(<
+   Copyright (C) 2012, 2020 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+define(<DST>, <%rdi>)
+define(<SRC>, <%rsi>)
+define(<COUNT>, <%rdx>)
+
+C State, even elements in X, odd elements in Y
+define(<X0>, <%xmm0>)
+define(<X1>, <%xmm1>)
+define(<X2>, <%xmm2>)
+define(<X3>, <%xmm3>)
+define(<Y0>, <%xmm4>)
+define(<Y1>, <%xmm5>)
+define(<Y2>, <%xmm6>)
+define(<Y3>, <%xmm7>)
+
+define(<T0>, <%xmm8>)
+define(<T1>, <%xmm9>)
+define(<T2>, <%xmm10>)
+define(<T3>, <%xmm11>)
+
+define(<M0011>, <%xmm12>)
+
+include_src(<x86_64/salsa20.m4>)
+
+	.text
+	ALIGN(16)
+	C _salsa20_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+PROLOGUE(_nettle_salsa20_2core)
+	W64_ENTRY(3, 13)
+
+	movups	(SRC), T0	C [0, 1, 2, 3]
+	movups	16(SRC), T1	C [4, 5, 6, 7]
+	movups	32(SRC), T2	C [8, 9, 10, 11]
+	movups	48(SRC), T3	C [12, 13, 14, 15]
+
+	pshufd	$0xa0, T0, X0	C X0: [0,0,2,2]
+	pshufd	$0xf5, T0, Y3	C Y3: [1,1,3,3]
+	pshufd	$0xa0, T1, X1	C X1: [4,4,6,6]
+	pshufd	$0xf5, T1, Y0	C Y0: [5,5,7,7]
+	pshufd	$0xa0, T2, X2	C X2: [8,8,10,10]
+	pshufd	$0xf5, T2, Y1	C Y1: [9,9,11,11]
+	pshufd	$0xa0, T3, X3	C [12,12,14,14]
+	pshufd	$0xf5, T3, Y2	C [13,13,15,15]
+
+	C Complicated counter increment. Could be done with
+	C mov $1, %eax; movd %eax, TMP; paddq T2, TMP
+	C earlier, but then it gets more complicated to construct X2 and Y1.
+
+	mov	$1, %eax
+	movd	%eax, T0	C [1,0,0,0]
+	pshufd	$0x51, T0, T0	C [0,1,0,0]
+	pxor	T1, T1
+	paddd	T0, X2
+	pcmpeqd	X2, T1
+	pand	T0, T1
+	paddd	T1, Y1
+
+	C Load mask registers
+	mov	$-1, %eax
+	movd	%eax, M0011
+	pshufd	$0x09, M0011, M0011	C 01 01 00 00
+
+	C Swap, to get
+	C X0:  0 10	Y0:  5 15
+	C X1:  4 14	Y1:  9  3
+	C X2:  8  2	Y2: 13  7
+	C X3: 12  6	Y3:  1 11
+	SWAP(X0, X2, M0011)
+	SWAP(X1, X3, M0011)
+	SWAP(Y0, Y2, M0011)
+	SWAP(Y1, Y3, M0011)
+
+	shrl	$1, XREG(COUNT)
+
+	ALIGN(16)
+
+.Loop:
+C Register layout (A is first block, B is second block)
+C
+C X0: A0  B0  A10 B10	Y0: A5  B5  A15 B15
+C X1: A4  B4  A14 B14	Y1: A9  B9  A3  B3
+C X2: A8  B8  A2  B2	Y2: A13 B13 A7  B7
+C X3: A12 B12 A6  B6	Y3: A1  B1  A11 B11
+
+	movaps	X0, T0
+	paddd	X3, T0
+	movaps	T0, T1
+	movaps	Y0, T2
+	pslld	$7, T0
+	paddd	Y3, T2
+	psrld	$25, T1
+	movaps	T2, T3
+	pxor	T0, X1
+	pslld	$7, T2
+	pxor	T1, X1
+	psrld	$25, T3
+
+	movaps	X0, T0
+	pxor	T2, Y1
+	paddd	X1, T0
+	pxor	T3, Y1
+	movaps	T0, T1
+	movaps	Y0, T2
+	pslld	$9, T0
+	paddd	Y1, T2
+	psrld	$23, T1
+	movaps	T2, T3
+	pxor	T0, X2
+	pslld	$9, T2
+	pxor	T1, X2
+	psrld	$23, T3
+
+	movaps	X1, T0
+	pxor	T2, Y2
+	paddd	X2, T0
+	pxor	T3, Y2
+	movaps	T0, T1
+	movaps	Y1, T2
+	pslld	$13, T0
+	paddd	Y2, T2
+	psrld	$19, T1
+	movaps	T2, T3
+	pxor	T0, X3
+	pslld	$13, T2
+	pxor	T1, X3
+	psrld	$19, T3
+
+	movaps	X2, T0
+	pxor	T2, Y3
+	paddd	X3, T0
+	pxor	T3, Y3
+	movaps	T0, T1
+	movaps	Y2, T2
+	pslld	$18, T0
+	paddd	Y3, T2
+	psrld	$14, T1
+	movaps	T2, T3
+	pxor	T0, X0
+	pslld	$18, T2
+	pxor	T1, X0
+	psrld	$14, T3
+	pxor	T2, Y0
+	pxor	T3, Y0
+
+C Register layout:
+C X0: A0  B0  A10 B10	Y0: A5  B5  A15 B15
+C Y1: A3  B3  A9  B9	X1: A4  B4  A14 B14	(Y1 swapped)
+C X2: A2  B2  A8  B8	Y2: A7  B7  A13 B13	(X2, Y2 swapped)
+C Y3: A1  B1  A11 B11	X3: A6  B6  A12 B12	(X3 swapped)
+
+	pshufd	$0x4e, Y1, Y1	C 10 11 00 01
+	pshufd	$0x4e, X2, X2
+	pshufd	$0x4e, Y2, Y2
+	pshufd	$0x4e, X3, X3
+
+	movaps	X0, T0
+	paddd	Y1, T0
+	movaps	T0, T1
+	movaps	Y0, T2
+	pslld	$7, T0
+	paddd	X1, T2
+	psrld	$25, T1
+	movaps	T2, T3
+	pxor	T0, Y3
+	pslld	$7, T2
+	pxor	T1, Y3
+	psrld	$25, T3
+
+	movaps	Y3, T0
+	pxor	T2, X3
+	paddd	X0, T0
+	pxor	T3, X3
+	movaps	T0, T1
+	movaps	X3, T2
+	pslld	$9, T0
+	paddd	Y0, T2
+	psrld	$23, T1
+	movaps	T2, T3
+	pxor	T0, X2
+	pslld	$9, T2
+	pxor	T1, X2
+	psrld	$23, T3
+
+	movaps	X2, T0
+	pxor	T2, Y2
+	paddd	Y3, T0
+	pxor	T3, Y2
+	movaps	T0, T1
+	movaps	Y2, T2
+	pslld	$13, T0
+	paddd	X3, T2
+	psrld	$19, T1
+	movaps	T2, T3
+	pxor	T0, Y1
+	pslld	$13, T2
+	pxor	T1, Y1
+	psrld	$19, T3
+
+	movaps	Y1, T0
+	pxor	T2, X1
+	paddd	X2, T0
+	pxor	T3, X1
+	movaps	T0, T1
+	movaps	X1, T2
+	pslld	$18, T0
+	paddd	Y2, T2
+	psrld	$14, T1
+	movaps	T2, T3
+	pxor	T0, X0
+	pslld	$18, T2
+	pxor	T1, X0
+	psrld	$14, T3
+	pxor	T2, Y0
+	pxor	T3, Y0
+
+	pshufd	$0x4e, Y1, Y1	C 10 11 00 01
+	pshufd	$0x4e, X2, X2
+	pshufd	$0x4e, Y2, Y2
+	pshufd	$0x4e, X3, X3
+
+	decl	XREG(COUNT)
+	jnz	.Loop
+
+	SWAP(X0, X2, M0011)
+	SWAP(X1, X3, M0011)
+	SWAP(Y0, Y2, M0011)
+	SWAP(Y1, Y3, M0011)
+
+	movaps	X0, T0
+	punpckldq	Y3, X0	C [A0, A1, B0, B1]
+	punpckhdq	Y3, T0	C [A2, A3, B2, B3]
+	movaps	X0, Y3
+	punpcklqdq	T0, X0	C [A0, A1, A2, A3]
+	punpckhqdq	T0, Y3	C [B0, B1, B2, B3]
+
+	movups	(SRC), T0
+	paddd	T0, X0
+	paddd	T0, Y3
+
+	movaps	X1, T1
+	punpckldq	Y0, X1	C [A4, A5, B4, B5]
+	punpckhdq	Y0, T1	C [A6, A7, B6, B7]
+	movaps	X1, Y0
+	punpcklqdq	T1, X1	C [A4, A5, A6, A7]
+	punpckhqdq	T1, Y0	C [B4, B5, B6, B7]
+
+	movups	16(SRC), T1
+	paddd	T1, X1
+	paddd	T1, Y0
+
+	movaps	X2, T2
+	punpckldq	Y1, X2	C [A8, A9, B8, B9]
+	punpckhdq	Y1, T2	C [A10, A11, B10, B11]
+	movaps	X2, Y1
+	punpcklqdq	T2, X2	C [A8, A9, A10, A11]
+	punpckhqdq	T2, Y1	C [B8, B9, B10, B11]
+
+	movups	32(SRC), T2
+	paddd	T2, X2
+	mov	$1, %eax
+	movd	%eax, M0011
+	paddq	M0011, T2
+	paddd	T2, Y1
+
+	movaps	X3, T3
+	punpckldq	Y2, X3	C [A12, A13, B12, B13]
+	punpckhdq	Y2, T3	C [A14, A15, B14, B15]
+	movaps	X3, Y2
+	punpcklqdq	T3, X3	C [A12, A13, A14, A15]
+	punpckhqdq	T3, Y2	C [B12, B13, B14, B15]
+
+	movups	48(SRC), T3
+	paddd	T3, X3
+	paddd	T3, Y2
+
+	movups	X0, (DST)
+	movups	X1, 16(DST)
+	movups	X2, 32(DST)
+	movups	X3, 48(DST)
+	movups	Y3, 64(DST)
+	movups	Y0, 80(DST)
+	movups	Y1, 96(DST)
+	movups	Y2, 112(DST)
+
+	W64_EXIT(3, 13)
+	ret
+EPILOGUE(_nettle_salsa20_2core)
diff --git a/x86_64/salsa20-crypt.asm b/x86_64/salsa20-crypt.asm
deleted file mode 100644
index cc1d58ca..00000000
--- a/x86_64/salsa20-crypt.asm
+++ /dev/null
@@ -1,247 +0,0 @@
-C x86_64/salsa20-crypt.asm
-
-ifelse(<
-   Copyright (C) 2012 Niels Möller
-
-   This file is part of GNU Nettle.
-
-   GNU Nettle is free software: you can redistribute it and/or
-   modify it under the terms of either:
-
-     * the GNU Lesser General Public License as published by the Free
-       Software Foundation; either version 3 of the License, or (at your
-       option) any later version.
-
-   or
-
-     * the GNU General Public License as published by the Free
-       Software Foundation; either version 2 of the License, or (at your
-       option) any later version.
-
-   or both in parallel, as here.
-
-   GNU Nettle is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received copies of the GNU General Public License and
-   the GNU Lesser General Public License along with this program.  If
-   not, see http://www.gnu.org/licenses/.
->)
-
-define(<CTX>, <%rdi>)
-define(<LENGTH>, <%rsi>)
-define(<DST>, <%rdx>)
-define(<SRC>, <%rcx>)
-define(<T64>, <%r8>)
-define(<POS>, <%r9>)
-define(<X0>, <%xmm0>)
-define(<X1>, <%xmm1>)
-define(<X2>, <%xmm2>)
-define(<X3>, <%xmm3>)
-define(<T0>, <%xmm4>)
-define(<T1>, <%xmm5>)
-define(<M0101>, <%xmm6>)
-define(<M0110>, <%xmm7>)
-define(<M0011>, <%xmm8>)
-define(<COUNT>, <%rax>)
-
-include_src(<x86_64/salsa20.m4>)
-
-C Possible improvements:
-C
-C Do two blocks (or more) at a time in parallel, to avoid limitations
-C due to data dependencies.
-C
-C Avoid redoing the permutation of the input for each block (all but
-C the two counter words are constant). Could also keep the input in
-C registers.
-
-	.file "salsa20-crypt.asm"
-
-	C salsa20_crypt(struct salsa20_ctx *ctx, size_t length,
-	C               uint8_t *dst, const uint8_t *src)
-	.text
-	ALIGN(16)
-PROLOGUE(nettle_salsa20_crypt)
-	W64_ENTRY(4, 9)
-
-	test	LENGTH, LENGTH
-	jz	.Lend
-
-	C Load mask registers
-	mov	$-1, XREG(COUNT)
-	movd	XREG(COUNT), M0101
-	pshufd	$0x09, M0101, M0011	C 01 01 00 00
-	pshufd	$0x41, M0101, M0110	C 01 00 00 01
-	pshufd	$0x22, M0101, M0101	C 01 00 01 00
-
-.Lblock_loop:
-	movups	(CTX), X0
-	movups	16(CTX), X1
-	movups	32(CTX), X2
-	movups	48(CTX), X3
-
-	C On input, each xmm register is one row. We start with
-	C
-	C	 0  1  2  3	C K K K
-	C	 4  5  6  7	K C I I
-	C	 8  9 10 11	B B C K
-	C	12 13 14 15	K K K C
-	C
-	C Diagrams are in little-endian order, with least significant word to
-	C the left. We rotate the columns, to get instead
-	C
-	C	 0  5 10 15	C C C C
-	C	 4  9 14  3	K B K K
-	C	 8 13  2  7	B K K I
-	C	12  1  6 11	K K I K
-	C
-	C The original rows are now diagonals.
-	SWAP(X0, X1, M0101)
-	SWAP(X2, X3, M0101)
-	SWAP(X1, X3, M0110)
-	SWAP(X0, X2, M0011)
-
-	movl	$10, XREG(COUNT)
-	ALIGN(16)
-.Loop:
-	QROUND(X0, X1, X2, X3)
-	C For the row operations, we first rotate the rows, to get
-	C
-	C	0  5 10 15
-	C	3  4  9 14
-	C	2  7  8 13
-	C	1  6 11 12
-	C
-	C Now the original rows are turned into columns. (This is the
-	C SIMD hack described in djb's papers.)
-
-	pshufd	$0x93, X1, X1	C	11 00 01 10 (least sign. left)
-	pshufd	$0x4e, X2, X2	C	10 11 00 01
-	pshufd	$0x39, X3, X3	C	01 10 11 00
-
-	QROUND(X0, X3, X2, X1)
-
-	C Inverse rotation of the rows
-	pshufd	$0x39, X1, X1	C	01 10 11 00
-	pshufd	$0x4e, X2, X2	C	10 11 00 01
-	pshufd	$0x93, X3, X3	C	11 00 01 10
-
-	decl	XREG(COUNT)
-	jnz	.Loop
-
-	SWAP(X0, X2, M0011)
-	SWAP(X1, X3, M0110)
-	SWAP(X0, X1, M0101)
-	SWAP(X2, X3, M0101)
-
-	movups	(CTX), T0
-	movups	16(CTX), T1
-	paddd	T0, X0
-	paddd	T1, X1
-	movups	32(CTX), T0
-	movups	48(CTX), T1
-	paddd	T0, X2
-	paddd	T1, X3
-
-	C Increment block counter
-	incq	32(CTX)
-
-	cmp	$64, LENGTH
-	jc	.Lfinal_xor
-
-	movups	48(SRC), T1
-	pxor	T1, X3
-	movups	X3, 48(DST)
-.Lxor3:
-	movups	32(SRC), T0
-	pxor	T0, X2
-	movups	X2, 32(DST)
-.Lxor2:
-	movups	16(SRC), T1
-	pxor	T1, X1
-	movups	X1, 16(DST)
-.Lxor1:
-	movups	(SRC), T0
-	pxor	T0, X0
-	movups	X0, (DST)
-
-	lea	64(SRC), SRC
-	lea	64(DST), DST
-	sub	$64, LENGTH
-	ja	.Lblock_loop
-.Lend:
-	W64_EXIT(4, 9)
-	ret
-
-.Lfinal_xor:
-	cmp	$32, LENGTH
-	jz	.Lxor2
-	jc	.Llt32
-	cmp	$48, LENGTH
-	jz	.Lxor3
-	jc	.Llt48
-	movaps	X3, T0
-	call	.Lpartial
-	jmp	.Lxor3
-.Llt48:
-	movaps	X2, T0
-	call	.Lpartial
-	jmp	.Lxor2
-.Llt32:
-	cmp	$16, LENGTH
-	jz	.Lxor1
-	jc	.Llt16
-	movaps	X1, T0
-	call	.Lpartial
-	jmp	.Lxor1
-.Llt16:
-	movaps	X0, T0
-	call	.Lpartial
-	jmp	.Lend
-
-.Lpartial:
-	mov	LENGTH, POS
-	and	$-16, POS
-	test	$8, LENGTH
-	jz	.Llt8
-	C This "movd" instruction should assemble to
-	C 66 49 0f 7e e0	movq	%xmm4,%r8
-	C Apparently, assemblers treat movd and movq (with the
-	C arguments we use) in the same way, except for osx, which
-	C barfs at movq.
-	movd	T0, T64
-	xor	(SRC, POS), T64
-	mov	T64, (DST, POS)
-	lea	8(POS), POS
-	pshufd	$0xee, T0, T0	C 10 11 10 11
-.Llt8:
-	C And this is also really a movq.
-	movd	T0, T64
-	test	$4, LENGTH
-	jz	.Llt4
-	mov	XREG(T64), XREG(COUNT)
-	xor	(SRC, POS), XREG(COUNT)
-	mov	XREG(COUNT), (DST, POS)
-	lea	4(POS), POS
-	shr	$32, T64
-.Llt4:
-	test	$2, LENGTH
-	jz	.Llt2
-	mov	WREG(T64), WREG(COUNT)
-	xor	(SRC, POS), WREG(COUNT)
-	mov	WREG(COUNT), (DST, POS)
-	lea	2(POS), POS
-	shr	$16, XREG(T64)
-.Llt2:
-	test	$1, LENGTH
-	jz	.Lret
-	xor	(SRC, POS), LREG(T64)
-	mov	LREG(T64), (DST, POS)
-
-.Lret:
-	ret
-
-EPILOGUE(nettle_salsa20_crypt)
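
Editor's note on the "complicated counter increment" in the new file: block B must use block A's 64-bit block counter plus one. `paddd` with the `[0,1,0,0]` mask adds 1 to B's copy of state word 8; `pcmpeqd` against zero yields all-ones in any lane that wrapped; `pand` with the same mask reduces that to a single +1, which the final `paddd` applies to B's copy of word 9. A scalar sketch of the same logic (the helper name is hypothetical):

```c
#include <stdint.h>

/* Block B's counter = block A's counter + 1, with the 64-bit
   counter stored as state words 8 (low) and 9 (high). */
static void
second_block_counter(const uint32_t src[16],
                     uint32_t *b_lo, uint32_t *b_hi)
{
  *b_lo = src[8] + 1;            /* paddd with mask [0,1,0,0] */
  *b_hi = src[9] + (*b_lo == 0); /* pcmpeqd + pand: carry on wrap */
}
```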
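The `SWAP` macro comes from the included `x86_64/salsa20.m4` and its definition is not part of this diff, but the layout comments pin down its behavior: it exchanges the 32-bit lanes of two registers wherever the mask lanes are all-ones. A C model of that assumed semantics, the classic masked xor-swap:

```c
#include <stdint.h>

/* Assumed semantics of SWAP(x, y, m): swap lanes where m is all-ones. */
static void
swap_lanes(uint32_t x[4], uint32_t y[4], const uint32_t m[4])
{
  for (int i = 0; i < 4; i++)
    {
      uint32_t t = (x[i] ^ y[i]) & m[i]; /* 0 or the lane difference */
      x[i] ^= t;
      y[i] ^= t;
    }
}
```

For example, `SWAP(X0, X2, M0011)` with mask `[0,0,~0,~0]` turns `X0 = [0,0,2,2]` and `X2 = [8,8,10,10]` into `[0,0,10,10]` and `[8,8,2,2]`, matching the "Swap, to get" comment in the new file.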
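Finally, note what the new code does *not* do: unlike the deleted `nettle_salsa20_crypt`, `_salsa20_2core` only writes 128 bytes of key stream to DST, leaving the XOR with plaintext and the counter bookkeeping to a C caller. A hedged sketch of such a wrapper; everything here except the `_nettle_salsa20_2core` prototype is an assumption (including a little-endian host), not the commit's actual C code:

```c
#include <stddef.h>
#include <stdint.h>

/* From the new assembly: two consecutive 64-byte key stream blocks,
   for the state in src and for that state with counter + 1. */
void _nettle_salsa20_2core(uint32_t *dst, const uint32_t *src,
                           unsigned rounds);

void
salsa20_crypt_sketch(uint32_t ctx[16], unsigned rounds,
                     size_t length, uint8_t *dst, const uint8_t *src)
{
  while (length > 0)
    {
      uint32_t stream[32];
      size_t n = length < 128 ? length : 128;
      unsigned blocks = 1 + (n > 64); /* key stream blocks consumed */

      _nettle_salsa20_2core(stream, ctx, rounds);

      /* Advance the counter (state words 8 low, 9 high). */
      ctx[8] += blocks;
      if (ctx[8] < blocks) /* wrapped: carry into the high word */
        ctx[9]++;

      for (size_t i = 0; i < n; i++)
        dst[i] = src[i] ^ ((const uint8_t *) stream)[i];
      dst += n;
      src += n;
      length -= n;
    }
}
```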