author     Niels Möller <nisse@lysator.liu.se>  2012-04-18 21:27:58 +0200
committer  Niels Möller <nisse@lysator.liu.se>  2012-04-18 21:27:58 +0200
commit     06fe7d833371ee38e61507c35f7b901482a7e2f0 (patch)
tree       ae174ac5ddfd758456ee25be7f8fc1d8482c66b1
parent     95c8eb724ca6f2508c9ea8b4b6763e365f25007d (diff)
download   nettle-06fe7d833371ee38e61507c35f7b901482a7e2f0.tar.gz
x86_64 implementation of salsa20.
-rw-r--r--   ChangeLog                    4
-rw-r--r--   configure.ac                 2
-rw-r--r--   x86_64/salsa20-crypt.asm   270
3 files changed, 275 insertions(+), 1 deletion(-)
diff --git a/ChangeLog b/ChangeLog
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2012-04-18  Niels Möller  <nisse@lysator.liu.se>
+
+	* x86_64/salsa20-crypt.asm: New file.
+
 2012-04-17  Niels Möller  <nisse@lysator.liu.se>
 
 	* testsuite/salsa20-test.c (test_salsa20_stream): Check that
diff --git a/configure.ac b/configure.ac
index 26339693..6bf2b8bd 100644
--- a/configure.ac
+++ b/configure.ac
@@ -233,7 +233,7 @@ if test "x$enable_assembler" = xyes ; then
     found=no
     for tmp_f in aes-encrypt-internal.asm aes-decrypt-internal.asm \
 		 arcfour-crypt.asm camellia-crypt-internal.asm \
-		 md5-compress.asm memxor.asm \
+		 md5-compress.asm memxor.asm salsa20-crypt.asm \
 		 serpent-encrypt.asm serpent-decrypt.asm \
 		 sha1-compress.asm machine.m4; do
 #      echo "Looking for $srcdir/$asm_path/$tmp_f"
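For reference while reading the new file below: a plain-C sketch of the
Salsa20 double round that its QROUND macro and pshufd row rotations compute
on whole SSE2 registers. The helper names qround and salsa20_double_round
are illustrative only, not part of the commit; the assembly's main loop
runs ten such double rounds (Salsa20/20).

    #include <stdint.h>

    #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

    /* Same add-rotate-xor chain as the QROUND macro, one 32-bit word
       at a time; the assembly applies it to four words in parallel,
       with rotation counts 7, 9, 13, 18. */
    static void
    qround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
    {
      *b ^= ROTL32(*a + *d, 7);
      *c ^= ROTL32(*b + *a, 9);
      *d ^= ROTL32(*c + *b, 13);
      *a ^= ROTL32(*d + *c, 18);
    }

    /* One double round on the 4x4 word matrix: a quarter-round down
       each column, starting at its diagonal element, then one along
       each row. The SWAP/pshufd shuffling in the assembly exists
       precisely so both passes can use the same register layout. */
    static void
    salsa20_double_round(uint32_t x[16])
    {
      unsigned i;
      for (i = 0; i < 4; i++)       /* column round */
        qround(&x[5*i], &x[(5*i + 4) % 16],
               &x[(5*i + 8) % 16], &x[(5*i + 12) % 16]);
      for (i = 0; i < 4; i++)       /* row round */
        qround(&x[4*i + i % 4], &x[4*i + (i+1) % 4],
               &x[4*i + (i+2) % 4], &x[4*i + (i+3) % 4]);
    }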
diff --git a/x86_64/salsa20-crypt.asm b/x86_64/salsa20-crypt.asm
new file mode 100644
index 00000000..799d5744
--- /dev/null
+++ b/x86_64/salsa20-crypt.asm
@@ -0,0 +1,270 @@
+C nettle, low-level cryptographics library
+C
+C Copyright (C) 2012 Niels Möller
+C
+C The nettle library is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at your
+C option) any later version.
+C
+C The nettle library is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+C or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+C License for more details.
+C
+C You should have received a copy of the GNU Lesser General Public License
+C along with the nettle library; see the file COPYING.LIB. If not, write to
+C the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+C MA 02111-1307, USA.
+
+define(<CTX>,    <%rdi>)
+define(<LENGTH>, <%rsi>)
+define(<DST>,    <%rdx>)
+define(<SRC>,    <%rcx>)
+define(<T64>,    <%r8>)
+define(<POS>,    <%r9>)
+define(<X0>,     <%xmm0>)
+define(<X1>,     <%xmm1>)
+define(<X2>,     <%xmm2>)
+define(<X3>,     <%xmm3>)
+define(<T0>,     <%xmm4>)
+define(<T1>,     <%xmm5>)
+define(<M0101>,  <%xmm6>)
+define(<M0110>,  <%xmm7>)
+define(<M0011>,  <%xmm8>)
+define(<COUNT>,  <%rax>)
+
+C Possible improvements:
+C
+C Do two blocks (or more) at a time in parallel, to avoid limitations
+C due to data dependencies.
+C
+C Avoid redoing the permutation of the input for each block (all but
+C the two counter words are constant). Could also keep the input in
+C registers.
+
+C QROUND(x0, x1, x2, x3)
+define(<QROUND>, <
+	movaps	$4, T0		C 0
+	paddd	$1, T0		C 1
+	movaps	T0, T1		C 2
+	pslld	<$>7, T0	C 2
+	psrld	<$>25, T1	C 3
+	pxor	T0, $2		C 3
+	pxor	T1, $2		C 4
+
+	movaps	$1, T0		C 0
+	paddd	$2, T0		C 5
+	movaps	T0, T1		C 6
+	pslld	<$>9, T0	C 6
+	psrld	<$>23, T1	C 7
+	pxor	T0, $3		C 7
+	pxor	T1, $3		C 8
+
+	movaps	$2, T0		C 0
+	paddd	$3, T0		C 9
+	movaps	T0, T1		C 10
+	pslld	<$>13, T0	C 10
+	psrld	<$>19, T1	C 11
+	pxor	T0, $4		C 11
+	pxor	T1, $4		C 12
+
+	movaps	$3, T0		C 0
+	paddd	$4, T0		C 13
+	movaps	T0, T1		C 14
+	pslld	<$>18, T0	C 14
+	psrld	<$>14, T1	C 15
+	pxor	T0, $1		C 15
+	pxor	T1, $1		C 16
+>)
+
+C SWAP(x0, x1, mask)
+C Swaps bits in x0 and x1, with bits selected by the mask
+define(<SWAP>, <
+	movaps	$1, T0
+	pxor	$2, $1
+	pand	$3, $1
+	pxor	$1, $2
+	pxor	T0, $1
+>)
+
+	.file "salsa20.asm"
+
+	C salsa20_crypt(struct salsa20_ctx *ctx, unsigned length,
+	C		uint8_t *dst, const uint8_t *src)
+	.text
+	ALIGN(4)
+PROLOGUE(nettle_salsa20_crypt)
+	W64_ENTRY(4, 9)
+
+	test	LENGTH, LENGTH
+	jz	.Lend
+
+	C Load mask registers
+	mov	$-1, XREG(COUNT)
+	movd	XREG(COUNT), M0101
+	pshufd	$0x09, M0101, M0011	C 01 01 00 00
+	pshufd	$0x41, M0101, M0110	C 01 00 00 01
+	pshufd	$0x22, M0101, M0101	C 01 00 01 00
+
+.Lblock_loop:
+	movups	(CTX), X0
+	movups	16(CTX), X1
+	movups	32(CTX), X2
+	movups	48(CTX), X3
+
+	C On input, each xmm register is one row. We start with
+	C
+	C	 0  1  2  3
+	C	 4  5  6  7
+	C	 8  9 10 11
+	C	12 13 14 15
+	C
+	C Diagrams are in little-endian order, with least significant
+	C word to the left. We rotate the columns, to get instead
+	C
+	C	 0  5 10 15
+	C	 4  9 14  3
+	C	 8 13  2  7
+	C	12  1  6 11
+	C
+	C The original rows are now diagonals.
+	SWAP(X0, X1, M0101)
+	SWAP(X2, X3, M0101)
+	SWAP(X1, X3, M0110)
+	SWAP(X0, X2, M0011)
+
+	movl	$10, XREG(COUNT)
+	ALIGN(4)
+.Loop:
+	QROUND(X0, X1, X2, X3)
+	C For the row operations, we first rotate the rows, to get
+	C
+	C	0  5 10 15
+	C	3  4  9 14
+	C	2  7  8 13
+	C	1  6 11 12
+	C
+	C Now the original rows are turned into columns. (This SIMD
+	C hack is described in djb's papers.)
+
+	pshufd	$0x93, X1, X1	C	11 00 01 10 (least sign. left)
+	pshufd	$0x4e, X2, X2	C	10 11 00 01
+	pshufd	$0x39, X3, X3	C	01 10 11 00
+
+	QROUND(X0, X3, X2, X1)
+
+	C Inverse rotation of the rows
+	pshufd	$0x39, X1, X1	C	01 10 11 00
+	pshufd	$0x4e, X2, X2	C	10 11 00 01
+	pshufd	$0x93, X3, X3	C	11 00 01 10
+
+	decl	XREG(COUNT)
+	jnz	.Loop
+
+	SWAP(X0, X2, M0011)
+	SWAP(X1, X3, M0110)
+	SWAP(X0, X1, M0101)
+	SWAP(X2, X3, M0101)
+
+	movups	(CTX), T0
+	movups	16(CTX), T1
+	paddd	T0, X0
+	paddd	T1, X1
+	movups	32(CTX), T0
+	movups	48(CTX), T1
+	paddd	T0, X2
+	paddd	T1, X3
+
+	C Increment block counter
+	incq	32(CTX)
+
+	cmp	$64, LENGTH
+	jc	.Lfinal_xor
+
+	movups	48(SRC), T1
+	pxor	T1, X3
+	movups	X3, 48(DST)
+.Lxor3:
+	movups	32(SRC), T0
+	pxor	T0, X2
+	movups	X2, 32(DST)
+.Lxor2:
+	movups	16(SRC), T1
+	pxor	T1, X1
+	movups	X1, 16(DST)
+.Lxor1:
+	movups	(SRC), T0
+	pxor	T0, X0
+	movups	X0, (DST)
+
+	lea	64(SRC), SRC
+	lea	64(DST), DST
+	sub	$64, LENGTH
+	ja	.Lblock_loop
+.Lend:
+	W64_EXIT(4, 9)
+	ret
+
+.Lfinal_xor:
+	cmp	$32, LENGTH
+	jz	.Lxor2
+	jc	.Llt32
+	cmp	$48, LENGTH
+	jz	.Lxor3
+	jc	.Llt48
+	movaps	X3, T0
+	call	.Lpartial
+	jmp	.Lxor3
+.Llt48:
+	movaps	X2, T0
+	call	.Lpartial
+	jmp	.Lxor2
+.Llt32:
+	cmp	$16, LENGTH
+	jz	.Lxor1
+	jc	.Llt16
+	movaps	X1, T0
+	call	.Lpartial
+	jmp	.Lxor1
+.Llt16:
+	movaps	X0, T0
+	call	.Lpartial
+	jmp	.Lend
+
+.Lpartial:
+	mov	LENGTH, POS
+	and	$-16, POS
+	test	$8, LENGTH
+	jz	.Llt8
+	movq	T0, T64
+	xor	(SRC, POS), T64
+	mov	T64, (DST, POS)
+	lea	8(POS), POS
+	pshufd	$0xee, T0, T0	C 10 11 10 11
+.Llt8:
+	movq	T0, T64
+	test	$4, LENGTH
+	jz	.Llt4
+	mov	XREG(T64), XREG(COUNT)
+	xor	(SRC, POS), XREG(COUNT)
+	mov	XREG(COUNT), (DST, POS)
+	lea	4(POS), POS
+	shr	$32, T64
+.Llt4:
+	test	$2, LENGTH
+	jz	.Llt2
+	mov	WREG(T64), WREG(COUNT)
+	xor	(SRC, POS), WREG(COUNT)
+	mov	WREG(COUNT), (DST, POS)
+	lea	2(POS), POS
+	shr	$16, XREG(T64)
+.Llt2:
+	test	$1, LENGTH
+	jz	.Lpartial_done
+	xor	(SRC, POS), LREG(T64)
+	mov	LREG(T64), (DST, POS)
+.Lpartial_done:
+	ret
+
+EPILOGUE(nettle_salsa20_crypt)
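The SWAP macro above exchanges the register lanes selected by a mask using
the xor-swap identity; it takes five instructions because x86 logic ops are
two-operand and both inputs must survive outside the masked lanes. A minimal
scalar sketch of the same trick, with illustrative names not taken from the
commit:

    #include <assert.h>
    #include <stdint.h>

    /* Exchange the bits of *a and *b selected by mask, using only
       xor/and -- what the movaps/pxor/pand sequence in SWAP does on
       whole SSE2 registers. */
    static void
    masked_swap(uint32_t *a, uint32_t *b, uint32_t mask)
    {
      uint32_t t = (*a ^ *b) & mask; /* differing bits, within the mask */
      *a ^= t;                       /* a takes b's bits where mask is set */
      *b ^= t;                       /* b takes a's bits where mask is set */
    }

    int
    main(void)
    {
      uint32_t a = 0x11223344, b = 0xaabbccdd;
      masked_swap(&a, &b, 0x0000ffff);
      assert(a == 0x1122ccdd && b == 0xaabb3344);
      return 0;
    }

With the masks M0101, M0110 and M0011 selecting alternating 32-bit lanes,
pairs of such swaps rotate each column of the 4x4 state, which is how the
rows are moved onto the diagonals before the rounds and restored afterwards.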
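And a hedged usage sketch of the entry point this file implements,
nettle_salsa20_crypt, through nettle's public interface. The prototypes
follow <nettle/salsa20.h> as of this period (exact integer types have varied
between nettle releases), and the 70-byte length is chosen to exercise both
the full-block path and the .Lpartial tail handling:

    #include <stdio.h>
    #include <string.h>
    #include <nettle/salsa20.h>

    int
    main(void)
    {
      struct salsa20_ctx ctx;
      uint8_t key[SALSA20_KEY_SIZE] = { 1 };  /* demo key, rest zero */
      uint8_t iv[SALSA20_IV_SIZE] = { 2 };    /* demo nonce */
      uint8_t msg[70] = "one full 64-byte block plus a partial tail";
      uint8_t enc[70], dec[70];

      salsa20_set_key(&ctx, sizeof(key), key);
      salsa20_set_iv(&ctx, iv);               /* also resets the block counter */
      salsa20_crypt(&ctx, sizeof(msg), enc, msg);

      /* Stream cipher: decryption is the same operation, restarted
         at the same key/iv position. */
      salsa20_set_iv(&ctx, iv);
      salsa20_crypt(&ctx, sizeof(dec), dec, enc);

      printf("%s\n", memcmp(msg, dec, sizeof(msg)) == 0
                       ? "round trip ok" : "mismatch");
      return 0;
    }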